def create_info_file(classification, language, condense_results):
    """Write the file iclassify.info inside the CLASSIFICATION directory.

    The file records the command line of this run, the three arguments and
    the value returned by get_git_commit()."""
    info_path = os.path.join(classification, 'iclassify.info')
    with open(info_path, 'w') as fh:
        # Lines are assembled first and written in one call.
        lines = [
            "$ python %s\n\n" % ' '.join(sys.argv),
            "classification = %s\n" % classification,
            "language = %s\n" % language,
            "condense_results = %s\n" % condense_results,
            "git_commit = %s\n" % get_git_commit(),
        ]
        fh.writelines(lines)
def create_info_file(filename, batch, infile, outfile):
    """Write an info file to FILENAME.

    The file records the command line of this run, the batch, the source and
    summary file names, and the value returned by get_git_commit()."""
    with open(filename, 'w') as fh:
        emit = fh.write
        emit("$ python %s\n\n" % ' '.join(sys.argv))
        emit("batch = %s\n" % batch)
        emit("source file = %s\n" % infile)
        emit("summary file = %s\n" % outfile)
        emit("git_commit = %s\n" % get_git_commit())
def _update_info_files(self):
    """Write files with information on the build.

    Appends a record (command line, index name, dataset, git commit and
    timestamp) to index.info.general.txt in self.idx_dir."""
    info_path = os.path.join(self.idx_dir, 'index.info.general.txt')
    # The file is opened in append mode, so each call adds one more record.
    # The with-statement guarantees the handle is flushed and closed, which
    # the previous open() without close() did not.
    with open(info_path, 'a') as fh:
        fh.write("$ python %s\n\n" % ' '.join(sys.argv))
        fh.write("index_name = %s\n" % self.index_name)
        fh.write("dataset = %s\n" % self.dataset)
        fh.write("git_commit = %s\n" % get_git_commit())
        fh.write("timestamp = %s\n\n" % time.strftime('%Y%m%d-%H%M%S'))
def write_info(info_file, mallet_files, t1):
    """Write INFO_FILE with the command line, a timestamp, the seconds
    elapsed since T1 (a time.time() value), the git commit and one
    'source = ...' line for each element of MALLET_FILES."""
    # Use a context manager so the file is closed (and its buffer flushed)
    # even when one of the writes raises.
    with open(info_file, 'w') as fh:
        fh.write("$ python %s\n\n" % ' '.join(sys.argv))
        fh.write("timestamp = %s\n" % time.strftime("%Y%m%d:%H%M%S"))
        fh.write("seconds elapsed = %d\n" % int(time.time() - t1))
        fh.write("git_commit = %s\n\n" % get_git_commit())
        for mallet_file in mallet_files:
            fh.write("source = %s\n" % mallet_file)
def _write_info(source_file, out_file1, out_file2, t1, t2):
    """Write an info file for each of the two output files.

    For OUT_FILE1 and OUT_FILE2 a file with the same name plus the '.info'
    extension is written. It contains the command line, the absolute paths of
    the source and target files, a timestamp, the whole seconds between T1
    and T2, and the git commit."""
    for fname in (out_file1, out_file2):
        # One handle per iteration; the with-statement closes it before the
        # next file is opened instead of leaking it.
        with open(fname + '.info', 'w') as fh:
            fh.write("$ python %s\n\n" % ' '.join(sys.argv))
            fh.write("source file = %s\n" % os.path.abspath(source_file))
            fh.write("target file = %s\n" % os.path.abspath(fname))
            fh.write("timestamp = %s\n" % time.strftime("%Y%m%d:%H%M%S"))
            fh.write("seconds elapsed = %d\n" % int(t2 - t1))
            fh.write("git_commit = %s" % get_git_commit())
def _write_info(source_file, target_file, feats_file, feats, t1):
    """Write TARGET_FILE + '.info'.

    The file contains the command line, the absolute paths of the source and
    target files, the features file, the sorted keys of FEATS joined by
    spaces, a timestamp, the processing time since T1 (a time.time() value)
    and the git commit."""
    # The context manager makes sure the handle is closed; the original code
    # opened the file and never closed it.
    with open(target_file + '.info', 'w') as fh:
        fh.write("$ python %s\n\n" % ' '.join(sys.argv))
        fh.write("source file = %s\n" % os.path.abspath(source_file))
        fh.write("target file = %s\n" % os.path.abspath(target_file))
        fh.write("features file = %s\n" % feats_file)
        fh.write("features = %s\n" % ' '.join(sorted(feats.keys())))
        fh.write("timestamp = %s\n" % time.strftime("%Y%m%d:%H%M%S"))
        # %d truncates the float number of seconds to an integer.
        fh.write("processing time = %ds\n" % (time.time() - t1))
        fh.write("git_commit = %s" % get_git_commit())
def _itrainer_create_info_file(corpus, model, filelist, features, annotation):
    """Create the info files in the MODEL directory.

    Writes itrain.info.general with the command line, the arguments and the
    git commit, then copies ANNOTATION to itrain.info.annotations and FILELIST
    to itrain.info.files."""
    general_info = os.path.join(model, 'itrain.info.general')
    with open(general_info, 'w') as fh:
        lines = [
            "$ python %s\n\n" % ' '.join(sys.argv),
            "corpus = %s\n" % corpus,
            "file_list = %s\n" % filelist,
            "model = %s\n" % model,
            "features = %s\n" % features,
            # The key is spelled "anotation" in existing info files; it is
            # kept as is so the output does not change.
            "anotation = %s\n" % annotation,
            "git_commit = %s\n" % get_git_commit(),
        ]
        fh.writelines(lines)
    copies = (
        (annotation, 'itrain.info.annotations'),
        (filelist, 'itrain.info.files'),
    )
    for source, target_name in copies:
        shutil.copyfile(source, os.path.join(model, target_name))
def create_info_files(corpus, model, filelist, classification):
    """Create the info files in the CLASSIFICATION directory.

    Writes iclassify.info.general with the command line, the arguments and
    the git commit, then copies FILELIST to iclassify.info.files."""
    general_info = os.path.join(classification, 'iclassify.info.general')
    files_info = os.path.join(classification, 'iclassify.info.files')
    with open(general_info, 'w') as fh:
        emit = fh.write
        emit("$ python %s\n\n" % ' '.join(sys.argv))
        emit("corpus = %s\n" % corpus)
        emit("file_list = %s\n" % filelist)
        emit("model = %s\n" % model)
        emit("classification = %s\n" % classification)
        emit("git_commit = %s\n" % get_git_commit())
    shutil.copyfile(filelist, files_info)
def _write_info(source_file, target_file, info_string, threshold, t1, t2):
    """Write an info file for TARGET_FILE.

    A trailing '.gz' is stripped from TARGET_FILE before '.info' is appended,
    and the stripped name is also the one recorded as the target file. The
    file contains the command line, the absolute source and target paths, the
    threshold, a timestamp, the whole seconds between T1 and T2, the git
    commit and finally INFO_STRING."""
    if target_file.endswith('.gz'):
        target_file = target_file[:-3]
    # The with-statement closes the file; the original left the handle open.
    with open(target_file + '.info', 'w') as fh:
        fh.write("$ python %s\n\n" % ' '.join(sys.argv))
        fh.write("source file = %s\n" % os.path.abspath(source_file))
        fh.write("target file = %s\n" % os.path.abspath(target_file))
        fh.write("threshold = %d\n" % threshold)
        fh.write("timestamp = %s\n" % time.strftime("%Y%m%d:%H%M%S"))
        fh.write("seconds elapsed = %d\n" % int(t2 - t1))
        fh.write("git_commit = %s" % get_git_commit())
        fh.write("\n\n" + info_string + "\n")
def _generate_settings(self):
    """Set self.command to the command line of this run and self.settings to
    a list of 'name = value' lines (timestamp, language, data source, source
    file, paths, shuffle setting and git commit)."""
    invocation = ' '.join(sys.argv)
    self.command = "$ python %s\n\n" % invocation
    # Each pair holds a line template and the value to fill in.
    pairs = [
        ("timestamp = %s\n", time.strftime("%x %X")),
        ("language = %s\n", self.language),
        ("datasource = %s\n", self.datasource),
        ("source_file = %s\n", self.source_file),
        ("source_path = %s\n", self.source_path),
        ("target_path = %s\n", self.target_path),
        ("shuffle = %s\n", str(self.shuffle_file)),
        ("git_commit = %s\n", get_git_commit()),
    ]
    self.settings = [template % value for template, value in pairs]
def add_info_file(corpus_dir, extra_files, added):
    """Append information to CORPUS/config/additions.txt.

    The file name is taken from corpus.FNAME_INFO_ADDITIONS. The appended
    record holds the command line, a timestamp, EXTRA_FILES, ADDED and the
    git commit. The file is made writable before the update and is set to
    read-only again afterwards."""
    info_file = os.path.join(corpus_dir, 'config', corpus.FNAME_INFO_ADDITIONS)
    make_writable(info_file)
    # The with-statement closes the handle even when a write fails, which the
    # explicit close() at the end of the original code did not guarantee.
    with open(info_file, 'a') as fh:
        fh.write("$ %s\n\n" % ' '.join(sys.argv))
        fh.write("timestamp = %s\n" % time.strftime("%x %X"))
        fh.write("file_list = %s\n" % extra_files)
        fh.write("files_added = %s\n" % added)
        fh.write("git_commit = %s\n\n\n" % get_git_commit())
    read_only(info_file)
def _create_info_general_file(self):
    """Write self.info_file_general with the command line, the model, the
    xval setting, the file list, the annotation file and count, the base name
    of the pipeline configuration file, the features and the git commit."""
    config_name = os.path.basename(rconfig.pipeline_config_file)
    with open(self.info_file_general, 'w') as fh:
        lines = [
            "$ python %s\n\n" % ' '.join(sys.argv),
            "model = %s\n" % self.model,
            "xval = %s\n" % self.xval,
            "file_list = %s\n" % self.file_list,
            "annotation_file = %s\n" % self.annotation_file,
            "annotation_count = %s\n" % self.annotation_count,
            "config_file = %s\n" % config_name,
            "features = %s\n" % self.features,
            # The last line is written without a trailing newline.
            "git_commit = %s" % get_git_commit(),
        ]
        fh.writelines(lines)
def _create_info_general_file(self):
    """Write self.info_file_general with the command line, the absolute paths
    of the model, corpus, file list, annotation file and pipeline
    configuration file, the features returned by get_features(), the
    annotation count, a timestamp and the git commit."""
    absolute = os.path.abspath
    with open(self.info_file_general, 'w') as fh:
        emit = fh.write
        emit("$ python %s\n\n" % ' '.join(sys.argv))
        emit("model = %s\n" % absolute(self.model))
        emit("corpus = %s\n" % absolute(self.corpus))
        emit("features = %s\n" % ' '.join(get_features()))
        emit("file_list = %s\n" % absolute(self.file_list))
        emit("annotation_file = %s\n" % absolute(self.annotation_file))
        emit("annotation_count = %s\n" % self.annotation_count)
        emit("config_file = %s\n" % absolute(rconfig.pipeline_config_file))
        emit("timestamp = %s\n" % time.strftime("%Y%m%d:%H%M%S"))
        emit("git_commit = %s\n" % get_git_commit())
def _create_info_files(self):
    """Create the info files for the batch.

    Prints a progress message, writes self.info_file_general (command line,
    batch, file list, model, features, configuration file base name and git
    commit), and copies the pipeline configuration file and the file list to
    self.info_file_config and self.info_file_filelist."""
    print("[--classify] initializing %s directory" % self.batch)
    with open(self.info_file_general, 'w') as fh:
        # NOTE(review): this line reads the name `rconfig` from the enclosing
        # scope while the copy below uses `self.rconfig` -- confirm that both
        # refer to the same configuration object.
        config_name = os.path.basename(rconfig.pipeline_config_file)
        lines = [
            "$ python %s\n\n" % ' '.join(sys.argv),
            "batch = %s\n" % self.batch,
            "file_list = %s\n" % self.file_list,
            "model = %s\n" % self.model,
            "features = %s\n" % ' '.join(self.features),
            "config_file = %s\n" % config_name,
            "git_commit = %s" % get_git_commit(),
        ]
        fh.writelines(lines)
    shutil.copyfile(self.rconfig.pipeline_config_file, self.info_file_config)
    shutil.copyfile(self.file_list, self.info_file_filelist)
def update_state(self, limit, t1):
    """Update the content of state/processed.txt and
    state/processing-history.txt.

    The number of files processed is handed to create_file() for
    state/processed.txt. One tab-separated line (stage name, LIMIT,
    timestamp, git commit and the seconds elapsed since T1) is appended to
    state/processing-history.txt."""
    # TODO: should not just print the files processed in the history, but also
    # the range of files.
    time_elapsed = time.time() - t1
    processed = "%d\n" % self.files_processed
    create_file(os.path.join(self.path, 'state', 'processed.txt'), processed)
    history_file = os.path.join(self.path, 'state', 'processing-history.txt')
    # Append under a context manager so the history line is flushed and the
    # handle closed; the original never closed the file.
    with open(history_file, 'a') as fh:
        fh.write("%s\t%d\t%s\t%s\t%s\n"
                 % (self.stage_name, limit,
                    time.strftime("%Y:%m:%d-%H:%M:%S"),
                    get_git_commit(), time_elapsed))
def write_info(rconfig, dirname, filelist):
    """Generate a file with general information and copy the file list to
    the annotation directory.

    DIRNAME is created with ensure_path(). The general information goes to
    annotate.info.general.txt and FILELIST is copied to
    annotate.info.filelist.txt, both inside DIRNAME."""
    print("Writing general info...")
    ensure_path(dirname)
    general_info = os.path.join(dirname, 'annotate.info.general.txt')
    with open(general_info, 'w') as fh:
        emit = fh.write
        emit("$ python %s\n\n" % ' '.join(sys.argv))
        emit("corpus = %s\n" % os.path.abspath(rconfig.corpus))
        emit("file_list = %s\n" % os.path.abspath(filelist))
        emit("config_file = %s\n"
             % os.path.basename(rconfig.pipeline_config_file))
        emit("git_commit = %s" % get_git_commit())
    print("Copying %s..." % (filelist))
    filelist_copy = os.path.join(dirname, 'annotate.info.filelist.txt')
    shutil.copyfile(filelist, filelist_copy)
def _create_info_files(self):
    """Create the info files for the batch.

    Exits with a warning when self.info_file_general already exists.
    Otherwise creates self.batch_dir, writes the general info file (command
    line, batch, file list, configuration file base name and git commit) and
    copies the pipeline configuration file and the file list."""
    general_info = self.info_file_general
    if os.path.exists(general_info):
        sys.exit("WARNING: already ran indexer for batch %s" % self.batch)
    print("[Collector] initializing data/o1_index/%s directory" % self.batch)
    ensure_path(self.batch_dir)
    with open(general_info, 'w') as fh:
        # NOTE(review): this line reads the name `rconfig` from the enclosing
        # scope while the copy below uses `self.rconfig` -- confirm that both
        # refer to the same configuration object.
        config_name = os.path.basename(rconfig.pipeline_config_file)
        lines = [
            "$ python %s\n\n" % ' '.join(sys.argv),
            "batch = %s\n" % self.batch,
            "file_list = %s\n" % self.file_list,
            "config_file = %s\n" % config_name,
            "git_commit = %s\n" % get_git_commit(),
        ]
        fh.writelines(lines)
    shutil.copyfile(self.rconfig.pipeline_config_file, self.info_file_config)
    shutil.copyfile(self.file_list, self.info_file_filelist)
def _create_info_files(self, t1):
    """Create the info files for the output.

    Writes self.info_file_general with the command line, the absolute output
    path, the file list, the model, the features, the configuration file, a
    timestamp, the processing time since T1 (a time.time() value) and the git
    commit. The pipeline configuration file is copied only when it is not
    None; the file list is always copied."""
    with open(self.info_file_general, 'w') as fh:
        emit = fh.write
        emit("$ python %s\n\n" % ' '.join(sys.argv))
        emit("output = %s\n" % os.path.abspath(self.output))
        emit("file_list = %s\n" % self.file_list)
        emit("model = %s\n" % self.model)
        emit("features = %s\n" % ' '.join(self.features))
        emit("config_file = %s\n" % rconfig.pipeline_config_file)
        emit("timestamp = %s\n" % time.strftime("%Y%m%d:%H%M%S"))
        emit("processing time = %ds\n" % int(time.time() - t1))
        emit("git_commit = %s" % get_git_commit())
    # NOTE(review): the source formatting does not show whether the file list
    # copy was nested under the None check; it is treated as unconditional
    # here -- confirm.
    config_file = self.rconfig.pipeline_config_file
    if config_file is not None:
        shutil.copyfile(config_file, self.info_file_config)
    shutil.copyfile(self.file_list, self.info_file_filelist)
def _create_info_files(self):
    """Create the info files for the matcher output.

    Exits with a warning when self.info_file_general already exists.
    Otherwise creates self.output_dir and writes the general info file
    (command line, output, file list, configuration file and git commit).
    The pipeline configuration file is copied only when it is not None; the
    file list is always copied."""
    general_info = self.info_file_general
    if os.path.exists(general_info):
        sys.exit("WARNING: already have matcher results in %s" % self.output)
    print("[Matcher] initializing data/o2_matcher/%s directory" % self.output)
    ensure_path(self.output_dir)
    with open(general_info, 'w') as fh:
        lines = [
            "$ python %s\n\n" % ' '.join(sys.argv),
            "output = %s\n" % self.output,
            "file_list = %s\n" % self.file_list,
            # The configuration file is recorded as given, not as a base name.
            "config_file = %s\n" % rconfig.pipeline_config_file,
            "git_commit = %s\n" % get_git_commit(),
        ]
        fh.writelines(lines)
    # NOTE(review): the source formatting does not show whether the file list
    # copy was nested under the None check; it is treated as unconditional
    # here -- confirm.
    config_file = self.rconfig.pipeline_config_file
    if config_file is not None:
        shutil.copyfile(config_file, self.info_file_config)
    shutil.copyfile(self.file_list, self.info_file_filelist)
def _write_info(mallet_file, model_file, mtrainer, out_file, stderr_file, t1):
    """Write MODEL_FILE + '.info' with a report on the training run.

    The report contains the command line, the absolute paths of the mallet
    and model files, the trainer settings, a timestamp, the time elapsed
    since T1 (a time.time() value), the git commit, the commands saved on
    MTRAINER, and the contents of OUT_FILE and STDERR_FILE with a space
    prefixed to each line."""
    # All three files are opened with context managers; the original code
    # never closed the info file nor the two files it read from.
    with open(model_file + '.info', 'w') as fh:
        fh.write("$ python %s\n\n" % ' '.join(sys.argv))
        fh.write("mallet file = %s\n" % os.path.abspath(mallet_file))
        fh.write("model file = %s\n" % os.path.abspath(model_file))
        fh.write("trainer settings = %s\n" % mtrainer.settings())
        fh.write("timestamp = %s\n" % time.strftime("%Y%m%d:%H%M%S"))
        # %d truncates the float number of seconds to an integer.
        fh.write("time elapsed = %ds\n" % (time.time() - t1))
        fh.write("git_commit = %s\n\n" % get_git_commit())
        fh.write("$ %s\n\n" % mtrainer.saved_create_vectors_command)
        fh.write("$ %s\n" % mtrainer.saved_create_model_command)
        fh.write("\nContents of .out file:\n\n")
        with open(out_file) as out_fh:
            for line in out_fh:
                fh.write(" %s" % line)
        fh.write("\nContents of .stderr file:\n\n")
        with open(stderr_file) as err_fh:
            for line in err_fh:
                # Form feeds and carriage returns in the stderr output are
                # turned into line breaks followed by the same prefix.
                line = line.replace("\f", "\n ")
                line = line.replace("\r", "\n ")
                fh.write(" %s" % line)
        for cmd in mtrainer.saved_create_cinfo_commands:
            fh.write("\n$ %s\n" % cmd)
print fname fh_terms = open_input_file(fname) count = 0 for line in fh_terms: count += 1 if count > 100000: break if count % 500000 == 0: print ' ', count fields = line.split("\t") term = fields[0] term_count = int(fields[2]) terms[term] = terms.get(term, 0) + term_count return terms if __name__ == '__main__': target_dir = sys.argv[1] result_files = [] for exp in sys.argv[2:]: files = glob.glob(exp) result_files.extend(files) ensure_path(target_dir) infofile = target_dir + '/merged_term_frequencies.info.txt' fh_info = codecs.open(infofile, 'w', encoding='utf-8') fh_info.write("git commit = %s\n\n" % get_git_commit()) for fname in result_files: fh_info.write(fname + u"\n") merge_result_files(target_dir, result_files)