def run_populate(rconfig): """Populate xml directory in the target directory with limit files from the source file list or the source directory.""" output_name = DOCUMENT_PROCESSING_IO[POPULATE]['out'] dataset = DataSet(POPULATE, output_name, rconfig) fspecs = FileSpecificationList(rconfig.filelist, dataset.files_processed, rconfig.limit) print "[--populate] adding %d files to %s" % (len(fspecs), dataset) count = 0 for fspec in fspecs: count += 1 src_file = fspec.source dst_file = os.path.join(rconfig.corpus, 'data', output_name, dataset.version_id, 'files', fspec.target) # allow for compressed files, while being handed the name without extension if not os.path.exists(src_file): src_file += ".gz" dst_file += ".gz" if rconfig.verbose: print "[--populate] %04d %s" % (count, dst_file) ensure_path(os.path.dirname(dst_file)) _copy_file(src_file, dst_file) compress(dst_file) _update_state_files_processed(dataset, count) return count % STEP, [dataset]
def _cleanup_cinfo(cinfo_file, cinfo_file_sorted):
    """Gzip the two cinfo files and move the resulting archives into the
    info subdirectory that lives next to them."""
    info_dir = os.path.dirname(cinfo_file) + os.sep + 'info'
    ensure_path(info_dir)
    for fname in (cinfo_file, cinfo_file_sorted):
        run_command("gzip %s" % fname)
        run_command("mv %s.gz %s" % (fname, info_dir))
def _create_directories(self): """Create subdirectory structure in self.location.""" print "[--init] creating directory structure in %s" % self.location ensure_path(self.conf_path) for subdir in config.DATA_DIRS: subdir_path = self.data_path + os.sep + subdir ensure_path(subdir_path)
def _prepare_io(stage, fspec, input_dataset, output_dataset, rconfig, count):
    """Generate the file paths for the datasets and make sure the path to
    the file exists for the output dataset."""
    fname = fspec.target
    _print_file_progress(stage, fname, count, rconfig)
    # strip a single leading separator so the name joins as a relative path
    file_id = fname
    if file_id.startswith(os.sep):
        file_id = file_id[1:]
    file_in = os.path.join(input_dataset.path, 'files', file_id)
    file_out = os.path.join(output_dataset.path, 'files', file_id)
    ensure_path(os.path.dirname(file_out))
    return file_in, file_out
def run(self):
    """Run the trainer by finding the input data and building a model from
    it. Also writes files with information on configuration settings,
    features, gold standard term annotations and other things required to
    reproduce the model."""
    # refuse to clobber an existing model
    if os.path.exists(self.train_dir):
        exit("WARNING: Classifier model %s already exists" % self.train_dir)
    for path in (self.train_dir, self.info_dir):
        ensure_path(path)
    self._find_datasets()
    self._create_info_files()
    self._create_mallet_file()
def merge_mallet_files(target_dir, mallet_files): t1 = time.time() target_file = os.path.join(target_dir, 'train.mallet') info_file = os.path.join(target_dir, 'train.mallet.info') print "\nMerging" for f in mallet_files: print ' ', f print "Target mallet file\n ", target_file merge_command = "cat %s > %s" % (' '.join(mallet_files), target_file) print "\n$", merge_command, "\n" ensure_path(target_dir) os.system(merge_command) write_info(info_file, mallet_files, t1)
def run_itrainer(corpus, filelist, model, features, annotation_file,
                 phr_feats_file=None, verbose=False):
    """Train the invention classifier.

    Creates the info file, the dat file and the mallet training file in the
    model directory, then hands the mallet file to patent_invention_train().

    corpus          -- corpus directory the training data come from
    filelist        -- list of files in the corpus to use
    model           -- output directory for the model and its support files
    features        -- features used, recorded in the info file
    annotation_file -- gold standard term annotations
    phr_feats_file  -- phrase features file; defaults to
                       <model>/keyfeats.ta.dat when not given
    verbose         -- accepted for interface compatibility; not used here

    Fix: the phr_feats_file argument used to be unconditionally overwritten
    with the default path, which made the parameter dead; the default is now
    only applied when the caller passes None."""
    mallet_file = os.path.join(model, 'itrain.mallet')
    if phr_feats_file is None:
        phr_feats_file = os.path.join(model, 'keyfeats.ta.dat')
    ensure_path(model)
    _itrainer_create_info_file(corpus, model, filelist, features, annotation_file)
    _itrainer_create_dat_file(phr_feats_file, corpus, filelist)
    _itrainer_create_mallet_file(annotation_file, phr_feats_file, mallet_file)
    patent_invention_train(mallet_file)
def run_iclassifier(corpus, filelist, model, classification, label_file='iclassify.MaxEnt.label', verbose=False): """Run the invention classifier on the corpus using the model specified and create a classification.""" print print '[run_iclassifier] corpus =', corpus print '[run_iclassifier] files =', filelist print '[run_iclassifier] model =', model print '[run_iclassifier] class =', classification t1 = time.time() ensure_path(classification) create_info_files(corpus, model, filelist, classification) # create classification/iclassify.mallet from given files in the corpus invention.create_mallet_classify_file(corpus, filelist, classification, "invention", "1", verbose=True) t2 = time.time() # create result files in the classification invention.patent_invention_classify(None, train_dir=model, test_dir=classification) t3 = time.time() # creates the label file from the classifier output print "[run_iclassifier] creating the .label file" command = "cat %s/%s | egrep -v '^name' | egrep '\|.*\|' | python %s > %s/%s" \ % (classification, 'iclassify.MaxEnt.out', 'invention_top_scores.py', classification, label_file) print ' $', command subprocess.call(command, shell=True) t4 = time.time() process_label_file(corpus, classification, label_file, verbose) create_processing_time_file(classification, t1, t2, t3, t4) print
def copy_and_compress(source, target_dir, target_file):
    """Copy source to target file, making sure there is a directory.
    Compress the new file."""
    # the directory must exist before copyfile can write into it
    ensure_path(target_dir)
    shutil.copyfile(source, target_file)
    compress(target_file)
    # NOTE(review): this is the tail of a function whose def line falls
    # outside this chunk; `fname` and `terms` are bound earlier in it.
    # It tallies per-term counts from a tab-separated frequency file.
    print fname
    fh_terms = open_input_file(fname)
    count = 0
    for line in fh_terms:
        count += 1
        # NOTE(review): the loop stops after 100,000 lines, so the 500,000
        # progress marker below can never fire -- looks like a debugging
        # cap; confirm whether it is intended.
        if count > 100000:
            break
        if count % 500000 == 0:
            print ' ', count
        # columns: term <tab> ? <tab> count (only fields 0 and 2 are used)
        fields = line.split("\t")
        term = fields[0]
        term_count = int(fields[2])
        terms[term] = terms.get(term, 0) + term_count
    return terms


if __name__ == '__main__':
    # usage: <target_dir> <glob> [<glob> ...]
    # expand all glob patterns into the list of result files to merge
    target_dir = sys.argv[1]
    result_files = []
    for exp in sys.argv[2:]:
        files = glob.glob(exp)
        result_files.extend(files)
    ensure_path(target_dir)
    # record provenance: git commit plus the input file names
    infofile = target_dir + '/merged_term_frequencies.info.txt'
    fh_info = codecs.open(infofile, 'w', encoding='utf-8')
    fh_info.write("git commit = %s\n\n" % get_git_commit())
    for fname in result_files:
        fh_info.write(fname + u"\n")
    merge_result_files(target_dir, result_files)