def write_html(self):
    ensure_path(self.html_dir)
    self.write_index_file()
    for term in self.terms:
        term_file = os.path.join(self.html_dir, "%05d.html" % term.id)
        term_fh = codecs.open(term_file, 'w', encoding='utf-8')
        term.generate_html(fh=term_fh)
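# Every snippet in this section calls ensure_path(), which is not shown here.
# A minimal sketch of what it presumably does (an assumption, not the actual
# helper): create the directory, including parents, if it is missing.
def ensure_path(path):
    # os.makedirs raises OSError if the directory already exists, so guard first
    if not os.path.exists(path):
        os.makedirs(path)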
def _cleanup_cinfo(cinfo_file, cinfo_file_sorted):
    run_command("gzip %s" % cinfo_file)
    run_command("gzip %s" % cinfo_file_sorted)
    info_dir = os.path.dirname(cinfo_file) + os.sep + 'info'
    ensure_path(info_dir)
    run_command("mv %s.gz %s" % (cinfo_file, info_dir))
    run_command("mv %s.gz %s" % (cinfo_file_sorted, info_dir))
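# run_command() is also used without being shown. A plausible minimal version,
# assuming it is a thin wrapper that hands the string to a shell (an assumption
# about the real helper):
def run_command(command):
    # run the command in a shell and wait for it to finish
    subprocess.call(command, shell=True)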
def _create_directories(self):
    """Create subdirectory structure in target_path."""
    print "[--init] creating directory structure in %s" % (self.target_path)
    ensure_path(self.conf_path)
    for subdir in config.PROCESSING_AREAS:
        subdir_path = self.data_path + os.sep + subdir
        ensure_path(subdir_path)
def prepare_io(filename, input_dataset, output_dataset):
    """Generate the file paths for the datasets and make sure the path to the
    file exists for the output dataset. May need to add a version that deals
    with multiple output datasets."""
    file_id = filename[1:] if filename.startswith(os.sep) else filename
    file_in = os.path.join(input_dataset.path, 'files', file_id)
    file_out = os.path.join(output_dataset.path, 'files', file_id)
    ensure_path(os.path.dirname(file_out))
    return file_in, file_out
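# A minimal usage sketch for prepare_io(), assuming two dataset objects with a
# .path attribute as above (the dataset names and run_tagger() are hypothetical):
#
#     for filename in filelist:
#         file_in, file_out = prepare_io(filename, txt_dataset, tag_dataset)
#         run_tagger(file_in, file_out)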
def run_classifier(self, t1, corpus):
    ensure_path(self.output)
    self._create_mallet_file(corpus=corpus)
    self._run_classifier()
    self._calculate_scores()
    self._create_info_files(t1)
    for fname in (self.results_file, self.mallet_file, self.scores_s1):
        print "[Classifier.run_classifier] Compressing", fname
        compress(fname)
def __init__(self, corpus, index_name):
    self.corpus = corpus
    self.index_name = index_name
    self.idx_dir = os.path.join(corpus, 'data', 'o1_index', index_name)
    ensure_path(self.idx_dir)
    self.db_info = InfoDatabase(self.idx_dir, 'db-info.sqlite')
    self.db_years = YearsDatabase(self.idx_dir, 'db-years.sqlite')
    self.db_summary = SummaryDatabase(self.idx_dir, 'db-terms.sqlite')
    self.db_terms = {}
    self.pp()
def run(self):
    """Run the trainer by finding the input data and building a model from it.
    Also writes files with information on configuration settings, features,
    gold standard term annotations and other things required to reproduce the
    model."""
    if os.path.exists(self.train_dir):
        exit("WARNING: Classifier model %s already exists" % self.train_dir)
    ensure_path(self.train_dir)
    ensure_path(self.info_dir)
    self._find_datasets()
    self._create_info_files()
    self._create_mallet_file()
def run(self):
    if os.path.exists(self.info_file_general):
        sys.exit("WARNING: already have classifier results in %s" % self.batch)
    ensure_path(self.batch)
    self._find_datasets()
    self._create_mallet_file()
    self._run_classifier()
    self._calculate_scores()
    self._run_eval()
    self._create_info_files()
    compress(self.results_file, self.mallet_file, self.scores_s1)
def merge_mallet_files(target_dir, mallet_files):
    t1 = time.time()
    target_file = os.path.join(target_dir, 'train.mallet')
    info_file = os.path.join(target_dir, 'train.mallet.info')
    print "\nMerging"
    for f in mallet_files:
        print ' ', f
    print "Target mallet file\n ", target_file
    merge_command = "cat %s > %s" % (' '.join(mallet_files), target_file)
    print "\n$", merge_command, "\n"
    ensure_path(target_dir)
    os.system(merge_command)
    write_info(info_file, mallet_files, t1)
def write_info(rconfig, dirname, filelist):
    """Generate a file with general information and copy the file list to the
    annotation directory."""
    print "Writing general info..."
    ensure_path(dirname)
    with open(os.path.join(dirname, 'annotate.info.general.txt'), 'w') as fh:
        fh.write("$ python %s\n\n" % ' '.join(sys.argv))
        fh.write("corpus = %s\n" % os.path.abspath(rconfig.corpus))
        fh.write("file_list = %s\n" % os.path.abspath(filelist))
        fh.write("config_file = %s\n" %
                 os.path.basename(rconfig.pipeline_config_file))
        fh.write("git_commit = %s" % get_git_commit())
    print "Copying %s..." % (filelist)
    shutil.copyfile(filelist,
                    os.path.join(dirname, 'annotate.info.filelist.txt'))
def _create_info_files(self):
    if os.path.exists(self.info_file_general):
        sys.exit("WARNING: already ran indexer for batch %s" % self.batch)
    print "[Collector] initializing data/o1_index/%s directory" % self.batch
    ensure_path(self.batch_dir)
    with open(self.info_file_general, 'w') as fh:
        fh.write("$ python %s\n\n" % ' '.join(sys.argv))
        fh.write("batch = %s\n" % self.batch)
        fh.write("file_list = %s\n" % self.file_list)
        fh.write("config_file = %s\n" %
                 os.path.basename(self.rconfig.pipeline_config_file))
        fh.write("git_commit = %s\n" % get_git_commit())
    shutil.copyfile(self.rconfig.pipeline_config_file, self.info_file_config)
    shutil.copyfile(self.file_list, self.info_file_filelist)
def initialize_on_disk(self):
    """All that is guaranteed to exist is a directory like
    data/patents/en/d1_txt, but the substructure is not there. Create the
    substructure and initial versions of all needed files in the configuration
    and state directories."""
    for subdir in ('config', 'state', 'files'):
        ensure_path(os.path.join(self.path, subdir))
    create_file(os.path.join(self.path, 'state', 'processed.txt'), "0\n")
    create_file(os.path.join(self.path, 'state', 'processing-history.txt'))
    trace, head = self.split_pipeline()
    trace_str = pipeline_component_as_string(trace)
    head_str = pipeline_component_as_string([head])
    create_file(os.path.join(self.path, 'config', 'pipeline-head.txt'), head_str)
    create_file(os.path.join(self.path, 'config', 'pipeline-trace.txt'), trace_str)
    self.files_processed = 0
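# After initialize_on_disk() the dataset directory looks as follows (derived
# from the ensure_path() and create_file() calls above):
#
#     <dataset path>/
#         config/pipeline-head.txt       head of the pipeline as a string
#         config/pipeline-trace.txt      preceding pipeline components
#         state/processed.txt            "0\n"
#         state/processing-history.txt   (empty)
#         files/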
def run_itrainer(corpus, filelist, model, features, annotation_file,
                 phr_feats_file=None, verbose=False):
    mallet_file = os.path.join(model, 'itrain.mallet')
    # note that the phr_feats_file argument is always overridden here
    phr_feats_file = os.path.join(model, 'keyfeats.ta.dat')
    ensure_path(model)
    _itrainer_create_info_file(corpus, model, filelist, features, annotation_file)
    _itrainer_create_dat_file(phr_feats_file, corpus, filelist)
    _itrainer_create_mallet_file(annotation_file, phr_feats_file, mallet_file)
    patent_invention_train(mallet_file)
def _create_info_files(self):
    if os.path.exists(self.info_file_general):
        sys.exit("WARNING: already have matcher results in %s" % self.output)
    print "[Matcher] initializing data/o2_matcher/%s directory" % self.output
    ensure_path(self.output_dir)
    with open(self.info_file_general, 'w') as fh:
        fh.write("$ python %s\n\n" % ' '.join(sys.argv))
        fh.write("output = %s\n" % self.output)
        fh.write("file_list = %s\n" % self.file_list)
        #fh.write("config_file = %s\n" % os.path.basename(rconfig.pipeline_config_file))
        fh.write("config_file = %s\n" % self.rconfig.pipeline_config_file)
        fh.write("git_commit = %s\n" % get_git_commit())
    if self.rconfig.pipeline_config_file is not None:
        shutil.copyfile(self.rconfig.pipeline_config_file, self.info_file_config)
    shutil.copyfile(self.file_list, self.info_file_filelist)
def run_populate(rconfig, limit, verbose=False):
    """Populate xml directory in the target directory with limit files from
    the source file list or the source directory."""
    output_name = DOCUMENT_PROCESSING_IO[POPULATE]['out']
    dataset = DataSet(POPULATE, output_name, rconfig)
    # initialize data set if it does not exist, this is not contingent on
    # anything because --populate is the first step
    if not dataset.exists():
        dataset.initialize_on_disk()
        dataset.load_from_disk()
    fspecs = get_lines(rconfig.filenames, dataset.files_processed, limit)
    print "[--populate] adding %d files to %s" % (len(fspecs), dataset)
    count = 0
    for fspec in fspecs:
        count += 1
        src_file = fspec.source
        dst_file = os.path.join(rconfig.target_path, 'data', output_name,
                                dataset.version_id, 'files', fspec.target)
        # allow for compressed files, while being handed the name without
        # extension
        if not os.path.exists(src_file):
            src_file += ".gz"
            dst_file += ".gz"
        if verbose:
            print "[--populate] %04d %s" % (count, dst_file)
        ensure_path(os.path.dirname(dst_file))
        try:
            shutil.copyfile(src_file, dst_file)
        except IOError:
            print " WARNING: source file does not exist, not copying"
            print " %s" % src_file
        # at some point there seemed to be an issue with compressing for Chinese,
        # so added this to do language dependent compressing, there is now no
        # difference for the population phase
        if rconfig.language == 'en':
            compress(dst_file)
        elif rconfig.language == 'cn':
            compress(dst_file)
        # TODO: does this mean that you miss some if total_count % STEP != 0
        if count % STEP == 0:
            dataset.update_processed_count(STEP)
    return (count % STEP, [dataset])
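# The destination path assembled above means a populated corpus looks roughly
# like this (derived from the os.path.join() call; the names in angle brackets
# are placeholders):
#
#     <target_path>/data/<output_name>/<version_id>/files/<fspec.target>[.gz]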
def run_iclassifier(corpus, filelist, model, classification,
                    label_file='iclassify.MaxEnt.label', verbose=False):
    """Run the invention classifier on the corpus using the model specified
    and create a classification."""
    print
    print '[run_iclassifier] corpus =', corpus
    print '[run_iclassifier] files =', filelist
    print '[run_iclassifier] model =', model
    print '[run_iclassifier] class =', classification
    t1 = time.time()
    ensure_path(classification)
    create_info_files(corpus, model, filelist, classification)
    # create classification/iclassify.mallet from given files in the corpus
    invention.create_mallet_classify_file(corpus, filelist, classification,
                                          "invention", "1", verbose=True)
    t2 = time.time()
    # create result files in the classification
    invention.patent_invention_classify(None, train_dir=model,
                                        test_dir=classification)
    t3 = time.time()
    # creates the label file from the classifier output
    print "[run_iclassifier] creating the .label file"
    command = "cat %s/%s | egrep -v '^name' | egrep '\|.*\|' | python %s > %s/%s" \
              % (classification, 'iclassify.MaxEnt.out',
                 'invention_top_scores.py', classification, label_file)
    print ' $', command
    subprocess.call(command, shell=True)
    t4 = time.time()
    process_label_file(corpus, classification, label_file, verbose)
    create_processing_time_file(classification, t1, t2, t3, t4)
    print
def copy_and_compress(source, target_dir, target_file):
    """Copy source to target file, making sure the target directory exists.
    Compress the new file."""
    ensure_path(target_dir)
    shutil.copyfile(source, target_file)
    compress(target_file)
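# compress() is used throughout these snippets, both with a single file and
# with several, but is not shown. A plausible minimal version, assuming it
# simply gzips each file in place (an assumption about the real helper, which
# may behave differently):
def compress(*fnames):
    for fname in fnames:
        run_command("gzip -f %s" % fname)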
    print fname
    fh_terms = open_input_file(fname)
    count = 0
    for line in fh_terms:
        count += 1
        # NB: this cap means the progress message below is never reached
        if count > 100000:
            break
        if count % 500000 == 0:
            print ' ', count
        fields = line.split("\t")
        term = fields[0]
        term_count = int(fields[2])
        terms[term] = terms.get(term, 0) + term_count
    return terms


if __name__ == '__main__':

    target_dir = sys.argv[1]
    result_files = []
    for exp in sys.argv[2:]:
        files = glob.glob(exp)
        result_files.extend(files)
    ensure_path(target_dir)
    infofile = target_dir + '/merged_term_frequencies.info.txt'
    fh_info = codecs.open(infofile, 'w', encoding='utf-8')
    fh_info.write("git commit = %s\n\n" % get_git_commit())
    for fname in result_files:
        fh_info.write(fname + u"\n")
    merge_result_files(target_dir, result_files)
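# Hypothetical invocation of this merging script (the script name is an
# assumption; argv[1] is the target directory and the remaining arguments are
# glob expressions that select the term frequency files to merge):
#
#     $ python merge_term_frequencies.py merged-terms "results/*/terms-*.txt"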