Code example #1
def run_populate(rconfig):
    """Populate xml directory in the target directory with limit files from the
    source file list or the source directory."""

    output_name = DOCUMENT_PROCESSING_IO[POPULATE]['out']
    dataset = DataSet(POPULATE, output_name, rconfig)
    fspecs = FileSpecificationList(rconfig.filelist, dataset.files_processed,
                                   rconfig.limit)
    print "[--populate] adding %d files to %s" % (len(fspecs), dataset)
    count = 0
    for fspec in fspecs:
        count += 1
        src_file = fspec.source
        dst_file = os.path.join(rconfig.corpus, 'data', output_name,
                                dataset.version_id, 'files', fspec.target)
        # allow for compressed files, while being handed the name without extension
        if not os.path.exists(src_file):
            src_file += ".gz"
            dst_file += ".gz"
        if rconfig.verbose:
            print "[--populate] %04d %s" % (count, dst_file)
        ensure_path(os.path.dirname(dst_file))
        _copy_file(src_file, dst_file)
        compress(dst_file)
        _update_state_files_processed(dataset, count)
    return count % STEP, [dataset]
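
All of the examples on this page revolve around a small helper named ensure_path whose definition is not shown. Judging from how it is called (always with a directory path, often via os.path.dirname), it most likely creates the directory, including any missing parent directories, and does nothing if the directory already exists. A minimal sketch under that assumption:

import os

def ensure_path(path):
    """Create the directory path, including any missing parent directories,
    and do nothing if it already exists. This is an assumed reconstruction,
    not the project's actual source."""
    if not os.path.exists(path):
        os.makedirs(path)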
Code example #2
def _cleanup_cinfo(cinfo_file, cinfo_file_sorted):
    run_command("gzip %s" % cinfo_file)
    run_command("gzip %s" % cinfo_file_sorted)
    info_dir = os.path.dirname(cinfo_file) + os.sep + 'info'
    ensure_path(info_dir)
    run_command("mv %s.gz %s" % (cinfo_file, info_dir))
    run_command("mv %s.gz %s" % (cinfo_file_sorted, info_dir))
Code example #3
def _create_directories(self):
    """Create subdirectory structure in self.location."""
    print "[--init] creating directory structure in %s" % self.location
    ensure_path(self.conf_path)
    for subdir in config.DATA_DIRS:
        subdir_path = self.data_path + os.sep + subdir
        ensure_path(subdir_path)
Code example #4
def _prepare_io(stage, fspec, input_dataset, output_dataset, rconfig, count):
    """Generate the file paths for the datasets and make sure the path to the file exists for
    the output dataset."""
    filename = fspec.target
    _print_file_progress(stage, filename, count, rconfig)
    file_id = filename[1:] if filename.startswith(os.sep) else filename
    file_in = os.path.join(input_dataset.path, 'files', file_id)
    file_out = os.path.join(output_dataset.path, 'files', file_id)
    ensure_path(os.path.dirname(file_out))
    return file_in, file_out
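
For context, _prepare_io is the kind of helper a processing stage would call once per file before doing its real work. The caller below is hypothetical (run_stage and process_file are made-up names) and only illustrates how the returned paths are meant to be used:

def run_stage(stage, fspecs, input_dataset, output_dataset, rconfig):
    count = 0
    for fspec in fspecs:
        count += 1
        file_in, file_out = _prepare_io(stage, fspec, input_dataset,
                                        output_dataset, rconfig, count)
        # read from file_in in the input dataset and write the processed
        # result to file_out in the output dataset
        process_file(file_in, file_out)
    return count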
Code example #5
def run(self):
    """Run the trainer by finding the input data and building a model from it. Also
    writes files with information on configuration settings, features, gold standard
    term annotations and other things required to reproduce the model."""
    if os.path.exists(self.train_dir):
        exit("WARNING: Classifier model %s already exists" % self.train_dir)
    ensure_path(self.train_dir)
    ensure_path(self.info_dir)
    self._find_datasets()
    self._create_info_files()
    self._create_mallet_file()
Code example #6
def merge_mallet_files(target_dir, mallet_files):
    t1 = time.time()
    target_file = os.path.join(target_dir, 'train.mallet')
    info_file = os.path.join(target_dir, 'train.mallet.info')
    print "\nMerging"
    for f in mallet_files:
        print '  ', f
    print "Target mallet file\n  ", target_file
    merge_command = "cat %s > %s" % (' '.join(mallet_files), target_file)
    print "\n$", merge_command, "\n"
    ensure_path(target_dir)
    os.system(merge_command)
    write_info(info_file, mallet_files, t1)
Code example #7
def run_itrainer(corpus,
                 filelist,
                 model,
                 features,
                 annotation_file,
                 phr_feats_file=None,
                 verbose=False):

    mallet_file = os.path.join(model, 'itrain.mallet')
    # note: the phr_feats_file argument is overwritten here, so any value
    # passed in by the caller is effectively ignored
    phr_feats_file = os.path.join(model, 'keyfeats.ta.dat')
    ensure_path(model)
    _itrainer_create_info_file(corpus, model, filelist, features,
                               annotation_file)
    _itrainer_create_dat_file(phr_feats_file, corpus, filelist)
    _itrainer_create_mallet_file(annotation_file, phr_feats_file, mallet_file)
    patent_invention_train(mallet_file)
Code example #8
def run_iclassifier(corpus,
                    filelist,
                    model,
                    classification,
                    label_file='iclassify.MaxEnt.label',
                    verbose=False):
    """Run the invention classifier on the corpus using the model specified and
    create a classification."""
    print
    print '[run_iclassifier] corpus =', corpus
    print '[run_iclassifier] files  =', filelist
    print '[run_iclassifier] model  =', model
    print '[run_iclassifier] class  =', classification
    t1 = time.time()
    ensure_path(classification)
    create_info_files(corpus, model, filelist, classification)
    # create classification/iclassify.mallet from given files in the corpus
    invention.create_mallet_classify_file(corpus,
                                          filelist,
                                          classification,
                                          "invention",
                                          "1",
                                          verbose=True)
    t2 = time.time()
    # create result files in the classification
    invention.patent_invention_classify(None,
                                        train_dir=model,
                                        test_dir=classification)
    t3 = time.time()
    # creates the label file from the classifier output
    print "[run_iclassifier] creating the .label file"
    command = "cat %s/%s | egrep -v '^name' | egrep '\|.*\|' | python %s > %s/%s" \
              % (classification, 'iclassify.MaxEnt.out', 'invention_top_scores.py',
                 classification, label_file)
    print '   $', command
    subprocess.call(command, shell=True)
    t4 = time.time()
    process_label_file(corpus, classification, label_file, verbose)
    create_processing_time_file(classification, t1, t2, t3, t4)
    print
Code example #9
File: repository.py  Project: techknowledgist/test
def copy_and_compress(source, target_dir, target_file):
    """Copy source to target file, making sure there is a directory. Compress
    the new file."""
    ensure_path(target_dir)
    shutil.copyfile(source, target_file)
    compress(target_file)
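
compress, used in code examples #1 and #9, is another helper without its source on this page. Since the populate code in example #1 expects compressed files to carry a .gz extension, compress presumably gzips the file in place. A minimal sketch under that assumption:

import os

def compress(fname):
    """Gzip fname in place, producing fname.gz. Assumed reconstruction of
    the compress helper used in the examples above."""
    if os.path.exists(fname):
        os.system("gzip -f %s" % fname)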
Code example #10
        # NOTE: this example is truncated by the source page; the enclosing
        # function definition and its "for fname in ..." loop are not shown.
        # open_input_file is presumably a helper that transparently opens
        # plain or gzipped files.
        print fname
        fh_terms = open_input_file(fname)
        count = 0
        for line in fh_terms:
            count += 1
            # the hard cap at 100,000 lines means the progress message below
            # (every 500,000 lines) can never be reached
            if count > 100000: break
            if count % 500000 == 0: print '  ', count
            fields = line.split("\t")
            term = fields[0]
            term_count = int(fields[2])
            terms[term] = terms.get(term, 0) + term_count
    return terms


if __name__ == '__main__':

    target_dir = sys.argv[1]
    result_files = []
    for exp in sys.argv[2:]:
        files = glob.glob(exp)
        result_files.extend(files)

    ensure_path(target_dir)
    infofile = target_dir + '/merged_term_frequencies.info.txt'
    fh_info = codecs.open(infofile, 'w', encoding='utf-8')
    fh_info.write("git commit = %s\n\n" % get_git_commit())
    for fname in result_files:
        fh_info.write(fname + u"\n")

    merge_result_files(target_dir, result_files)
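
The __main__ block takes a target directory followed by one or more glob expressions, which the script expands itself with glob.glob. A hypothetical invocation (the script name and paths are made up for illustration):

$ python merge_term_frequencies.py merged-terms 'runs/exp-*/term_counts.txt'

Quoting the patterns keeps the shell from expanding them, so glob.glob sees the expressions exactly as written.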