Example #1
def write_html(self):
    ensure_path(self.html_dir)
    self.write_index_file()
    for term in self.terms:
        term_file = os.path.join(self.html_dir, "%05d.html" % term.id)
        term_fh = codecs.open(term_file, 'w', encoding='utf-8')
        term.generate_html(fh=term_fh)
        term_fh.close()
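Every example on this page calls ensure_path before writing into a directory, but the helper itself is not shown. A minimal sketch of what it presumably does, namely create the directory tree if it is missing:

import os

def ensure_path(path):
    # create the directory tree if it is not already there
    # (sketch only; the actual helper may log or handle errors differently)
    if not os.path.exists(path):
        os.makedirs(path)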
Example #2
def _cleanup_cinfo(cinfo_file, cinfo_file_sorted):
    run_command("gzip %s" % cinfo_file)
    run_command("gzip %s" % cinfo_file_sorted)
    info_dir = os.path.dirname(cinfo_file) + os.sep + 'info'
    ensure_path(info_dir)
    run_command("mv %s.gz %s" % (cinfo_file, info_dir))
    run_command("mv %s.gz %s" % (cinfo_file_sorted, info_dir))
Example #3
def _create_directories(self):
    """Create subdirectory structure in target_path."""
    print "[--init] creating directory structure in %s" % (
        self.target_path)
    ensure_path(self.conf_path)
    for subdir in config.PROCESSING_AREAS:
        subdir_path = self.data_path + os.sep + subdir
        ensure_path(subdir_path)
Example #4
def prepare_io(filename, input_dataset, output_dataset):
    """Generate the file paths for the datasets and make sure the path to the file exists for
    the output dataset. May need to add a version that deals with multiple output datasets."""
    file_id = filename[1:] if filename.startswith(os.sep) else filename
    file_in = os.path.join(input_dataset.path, 'files', file_id)
    file_out = os.path.join(output_dataset.path, 'files', file_id)
    ensure_path(os.path.dirname(file_out))
    return file_in, file_out
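A hypothetical usage of prepare_io, with stand-in dataset objects that carry only the path attribute the function actually reads (the directory names are made up):

class Dataset(object):
    # minimal stand-in for the real dataset class, which is not shown here
    def __init__(self, path):
        self.path = path

file_in, file_out = prepare_io('/US1234.xml',
                               Dataset('data/d1_txt/01'),
                               Dataset('data/d2_tag/01'))
# file_in  == 'data/d1_txt/01/files/US1234.xml'
# file_out == 'data/d2_tag/01/files/US1234.xml', with its directory
# created by ensure_path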
Example #5
def run_classifier(self, t1, corpus):
    ensure_path(self.output)
    self._create_mallet_file(corpus=corpus)
    self._run_classifier()
    self._calculate_scores()
    self._create_info_files(t1)
    for fname in (self.results_file, self.mallet_file, self.scores_s1):
        print "[Classifier.run_classifier] Compressing", fname
        compress(fname)
Example #6
def __init__(self, corpus, index_name):
    self.corpus = corpus
    self.index_name = index_name
    self.idx_dir = os.path.join(corpus, 'data', 'o1_index', index_name)
    ensure_path(self.idx_dir)
    self.db_info = InfoDatabase(self.idx_dir, 'db-info.sqlite')
    self.db_years = YearsDatabase(self.idx_dir, 'db-years.sqlite')
    self.db_summary = SummaryDatabase(self.idx_dir, 'db-terms.sqlite')
    self.db_terms = {}
    self.pp()
Example #7
def run(self):
    """Run the trainer by finding the input data and building a model from it. Also
    writes files with information on configuration settings, features, gold standard
    term annotations and other things required to reproduce the model."""
    if os.path.exists(self.train_dir):
        sys.exit("WARNING: Classifier model %s already exists" % self.train_dir)
    ensure_path(self.train_dir)
    ensure_path(self.info_dir)
    self._find_datasets()
    self._create_info_files()
    self._create_mallet_file()
Example #8
def run(self):
    if os.path.exists(self.info_file_general):
        sys.exit("WARNING: already have classifier results in %s" %
                 self.batch)
    ensure_path(self.batch)
    self._find_datasets()
    self._create_mallet_file()
    self._run_classifier()
    self._calculate_scores()
    self._run_eval()
    self._create_info_files()
    compress(self.results_file, self.mallet_file, self.scores_s1)
Example #9
def merge_mallet_files(target_dir, mallet_files):
    t1 = time.time()
    target_file = os.path.join(target_dir, 'train.mallet')
    info_file = os.path.join(target_dir, 'train.mallet.info')
    print "\nMerging"
    for f in mallet_files:
        print '  ', f
    print "Target mallet file\n  ", target_file
    merge_command = "cat %s > %s" % (' '.join(mallet_files), target_file)
    print "\n$", merge_command, "\n"
    ensure_path(target_dir)
    os.system(merge_command)
    write_info(info_file, mallet_files, t1)
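The write_info called here takes the merged files and a start time, so it differs from the function in Example #10 below; a sketch consistent with this call site (assumed, not taken from the source):

import time

def write_info(info_file, mallet_files, t1):
    # record which files went into the merge and how long it took;
    # inferred from the call site, the real function may write more
    with open(info_file, 'w') as fh:
        fh.write("elapsed time  =  %d seconds\n\n" % (time.time() - t1))
        for mallet_file in mallet_files:
            fh.write(mallet_file + "\n")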
Example #10
def write_info(rconfig, dirname, filelist):
    """Generate a file with general information and copy the file list to the
    annotation directory."""
    print "Writing general info..."
    ensure_path(dirname)
    with open(os.path.join(dirname, 'annotate.info.general.txt'), 'w') as fh:
        fh.write("$ python %s\n\n" % ' '.join(sys.argv))
        fh.write("corpus            =  %s\n" % os.path.abspath(rconfig.corpus))
        fh.write("file_list         =  %s\n" % os.path.abspath(filelist))
        fh.write("config_file       =  %s\n" % \
                     os.path.basename(rconfig.pipeline_config_file))
        fh.write("git_commit        =  %s" % get_git_commit())
    print "Copying %s..." % (filelist)
    shutil.copyfile(filelist,
                    os.path.join(dirname, 'annotate.info.filelist.txt'))
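get_git_commit shows up in several of these info writers. Assuming it just asks git for the hash of the current commit, a sketch:

import subprocess

def get_git_commit():
    # return the hash of the currently checked-out commit (sketch)
    return subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip()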
Example #11
def _create_info_files(self):
    if os.path.exists(self.info_file_general):
        sys.exit("WARNING: already ran indexer for batch %s" % self.batch)
    print "[Collector] initializing data/o1_index/%s directory" % self.batch
    ensure_path(self.batch_dir)
    with open(self.info_file_general, 'w') as fh:
        fh.write("$ python %s\n\n" % ' '.join(sys.argv))
        fh.write("batch        =  %s\n" % self.batch)
        fh.write("file_list    =  %s\n" % self.file_list)
        fh.write("config_file  =  %s\n" %
                 os.path.basename(rconfig.pipeline_config_file))
        fh.write("git_commit   =  %s\n" % get_git_commit())
    shutil.copyfile(self.rconfig.pipeline_config_file,
                    self.info_file_config)
    shutil.copyfile(self.file_list, self.info_file_filelist)
Example #12
def initialize_on_disk(self):
    """All that is guaranteed to exist is a directory like data/patents/en/d1_txt,
    but the substructure below it is not there. Create the substructure and
    initial versions of all needed files in the configuration and state
    directories."""
    for subdir in ('config', 'state', 'files'):
        ensure_path(os.path.join(self.path, subdir))
    create_file(os.path.join(self.path, 'state', 'processed.txt'), "0\n")
    create_file(os.path.join(self.path, 'state', 'processing-history.txt'))
    trace, head = self.split_pipeline()
    trace_str = pipeline_component_as_string(trace)
    head_str = pipeline_component_as_string([head])
    create_file(os.path.join(self.path, 'config', 'pipeline-head.txt'),
                head_str)
    create_file(os.path.join(self.path, 'config', 'pipeline-trace.txt'),
                trace_str)
    self.files_processed = 0
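create_file is another helper that is not displayed. The two call patterns above, with and without initial content, suggest something like:

def create_file(path, content=None):
    # create an empty file, or one holding content if it is given (sketch)
    fh = open(path, 'w')
    if content is not None:
        fh.write(content)
    fh.close()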
Example #13
def run_itrainer(corpus,
                 filelist,
                 model,
                 features,
                 annotation_file,
                 phr_feats_file=None,
                 verbose=False):

    mallet_file = os.path.join(model, 'itrain.mallet')
    if phr_feats_file is None:
        phr_feats_file = os.path.join(model, 'keyfeats.ta.dat')
    ensure_path(model)
    _itrainer_create_info_file(corpus, model, filelist, features,
                               annotation_file)
    _itrainer_create_dat_file(phr_feats_file, corpus, filelist)
    _itrainer_create_mallet_file(annotation_file, phr_feats_file, mallet_file)
    patent_invention_train(mallet_file)
Example #14
def _create_info_files(self):
    if os.path.exists(self.info_file_general):
        sys.exit("WARNING: already have matcher results in %s" %
                 self.output)
    print "[Matcher] initializing data/o2_matcher/%s directory" % self.output
    ensure_path(self.output_dir)
    with open(self.info_file_general, 'w') as fh:
        fh.write("$ python %s\n\n" % ' '.join(sys.argv))
        fh.write("output       =  %s\n" % self.output)
        fh.write("file_list    =  %s\n" % self.file_list)
        #fh.write("config_file  =  %s\n" % os.path.basename(rconfig.pipeline_config_file))
        fh.write("config_file  =  %s\n" % rconfig.pipeline_config_file)
        fh.write("git_commit   =  %s\n" % get_git_commit())
    if self.rconfig.pipeline_config_file is not None:
        shutil.copyfile(self.rconfig.pipeline_config_file,
                        self.info_file_config)
    shutil.copyfile(self.file_list, self.info_file_filelist)
Example #15
def run_populate(rconfig, limit, verbose=False):
    """Populate xml directory in the target directory with limit files from the
    source file list or the source directory."""

    output_name = DOCUMENT_PROCESSING_IO[POPULATE]['out']
    dataset = DataSet(POPULATE, output_name, rconfig)

    # initialize the data set if it does not exist; this is not contingent
    # on anything because --populate is the first step
    if not dataset.exists():
        dataset.initialize_on_disk()
        dataset.load_from_disk()

    fspecs = get_lines(rconfig.filenames, dataset.files_processed, limit)
    print "[--populate] adding %d files to %s" % (len(fspecs), dataset)
    count = 0
    for fspec in fspecs:
        count += 1
        src_file = fspec.source
        dst_file = os.path.join(rconfig.target_path, 'data', output_name,
                                dataset.version_id, 'files', fspec.target)
        # allow for compressed files, while being handed the name without
        # extension
        if not os.path.exists(src_file):
            src_file += ".gz"
            dst_file += ".gz"
        if verbose:
            print "[--populate] %04d %s" % (count, dst_file)
        ensure_path(os.path.dirname(dst_file))
        try:
            shutil.copyfile(src_file, dst_file)
        except IOError:
            print "                  WARNING: source file does not exist, not copying"
            print "                  %s" % src_file
        # at some point there seemed to be an issue with compressing Chinese
        # files, so compression was made language-dependent; there is now no
        # difference between the branches for the population phase
        if rconfig.language == 'en': compress(dst_file)
        elif rconfig.language == 'cn': compress(dst_file)
        # TODO: does this mean that you miss some if total_count % STEP != 0
        if count % STEP == 0:
            dataset.update_processed_count(STEP)

    return (count % STEP, [dataset])
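get_lines is not shown either; it evidently returns objects with source and target attributes and skips the files already processed. A speculative sketch, assuming a file list with one or two whitespace-separated paths per line:

from collections import namedtuple

FileSpec = namedtuple('FileSpec', ['source', 'target'])

def get_lines(filenames, start, limit):
    # speculative: read up to limit specs, skipping the first start lines;
    # the target path defaults to the source path when only one is given
    specs = []
    for line in open(filenames):
        fields = line.split()
        if fields:
            specs.append(FileSpec(fields[0], fields[-1]))
    return specs[start:start + limit]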
Example #16
def run_iclassifier(corpus,
                    filelist,
                    model,
                    classification,
                    label_file='iclassify.MaxEnt.label',
                    verbose=False):
    """Run the invention classifier on the corpus using the model specified and
    create a classification."""
    print
    print '[run_iclassifier] corpus =', corpus
    print '[run_iclassifier] files  =', filelist
    print '[run_iclassifier] model  =', model
    print '[run_iclassifier] class  =', classification
    t1 = time.time()
    ensure_path(classification)
    create_info_files(corpus, model, filelist, classification)
    # create classification/iclassify.mallet from given files in the corpus
    invention.create_mallet_classify_file(corpus,
                                          filelist,
                                          classification,
                                          "invention",
                                          "1",
                                          verbose=True)
    t2 = time.time()
    # create result files in the classification
    invention.patent_invention_classify(None,
                                        train_dir=model,
                                        test_dir=classification)
    t3 = time.time()
    # creates the label file from the classifier output
    print "[run_iclassifier] creating the .label file"
    command = "cat %s/%s | egrep -v '^name' | egrep '\|.*\|' | python %s > %s/%s" \
              % (classification, 'iclassify.MaxEnt.out', 'invention_top_scores.py',
                 classification, label_file)
    print '   $', command
    subprocess.call(command, shell=True)
    t4 = time.time()
    process_label_file(corpus, classification, label_file, verbose)
    create_processing_time_file(classification, t1, t2, t3, t4)
    print
Example #17
def copy_and_compress(source, target_dir, target_file):
    """Copy source to target file, making sure there is a directory. Compress
    the new file."""
    ensure_path(target_dir)
    shutil.copyfile(source, target_file)
    compress(target_file)
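compress is called with a single file name in some examples and with several in Example #8, so it presumably accepts a variable number of arguments; a sketch that gzips each file in place:

import subprocess

def compress(*fnames):
    # gzip each file in place (sketch; -f overwrites an existing .gz)
    for fname in fnames:
        subprocess.call(['gzip', '-f', fname])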
Example #18
        print fname
        fh_terms = open_input_file(fname)
        count = 0
        for line in fh_terms:
            count += 1
            # only look at the first 100,000 lines of each file
            if count > 100000: break
            if count % 500000 == 0: print '  ', count
            fields = line.split("\t")
            term = fields[0]
            term_count = int(fields[2])
            terms[term] = terms.get(term, 0) + term_count
    return terms


if __name__ == '__main__':

    target_dir = sys.argv[1]
    result_files = []
    for exp in sys.argv[2:]:
        files = glob.glob(exp)
        result_files.extend(files)

    ensure_path(target_dir)
    infofile = target_dir + '/merged_term_frequencies.info.txt'
    fh_info = codecs.open(infofile, 'w', encoding='utf-8')
    fh_info.write("git commit = %s\n\n" % get_git_commit())
    for fname in result_files:
        fh_info.write(fname + u"\n")
    fh_info.close()

    merge_result_files(target_dir, result_files)
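Since the __main__ block takes a target directory followed by one or more glob expressions, an invocation would look something like this (the script name is hypothetical):

$ python merge_term_frequencies.py workspace/merged 'data/*/terms.txt'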