def run_xml2txt(rconfig, options): """Takes the xml file and produces a txt file with a simplified document structure, keeping date, title, abstract, summary, description_rest, first_claim and other_claims. Does this by calling the document structure parser in onto mode if the document source is LexisNexis and uses a simple parser defined in xml2txt if the source is WoS.""" input_dataset, output_dataset = _get_datasets(XML2TXT, rconfig) count = 0 doc_parser = _make_parser(rconfig.language) workspace = os.path.join(rconfig.corpus, 'data', 'workspace') fspecs = FileSpecificationList(rconfig.filelist, output_dataset.files_processed, rconfig.limit) for fspec in fspecs: count += 1 file_in, file_out = _prepare_io(XML2TXT, fspec, input_dataset, output_dataset, rconfig, count) uncompress(file_in) try: xml2txt.xml2txt(doc_parser, rconfig.datasource, file_in, file_out, workspace) except Exception: # just write an empty file that can be consumed downstream fh = codecs.open(file_out, 'w') fh.close() print "[--xml2txt] WARNING: error on", file_in compress(file_in, file_out) _update_state_files_processed(output_dataset, count) return count % STEP, [output_dataset]
def run_populate(rconfig):
    """Populate the xml directory in the target directory with limit files
    taken from the source file list or the source directory.

    Each file is copied to its target location under the 'files' directory
    of the dataset version and then handed to compress().

    Returns a tuple of count % STEP (count being the number of files handled)
    and a list that holds the populated dataset."""

    output_name = DOCUMENT_PROCESSING_IO[POPULATE]['out']
    dataset = DataSet(POPULATE, output_name, rconfig)
    fspecs = FileSpecificationList(rconfig.filelist,
                                   dataset.files_processed,
                                   rconfig.limit)
    print("[--populate] adding %d files to %s" % (len(fspecs), dataset))
    # count keeps its initial value when there are no files to add
    count = 0
    for count, fspec in enumerate(fspecs, 1):
        src_file = fspec.source
        dst_file = os.path.join(rconfig.corpus, 'data', output_name,
                                dataset.version_id, 'files', fspec.target)
        # the source may be compressed while we were handed the name without
        # the extension, in that case use the .gz name on both sides
        if not os.path.exists(src_file):
            src_file = src_file + ".gz"
            dst_file = dst_file + ".gz"
        if rconfig.verbose:
            print("[--populate] %04d %s" % (count, dst_file))
        ensure_path(os.path.dirname(dst_file))
        _copy_file(src_file, dst_file)
        compress(dst_file)
        _update_state_files_processed(dataset, count)
    return count % STEP, [dataset]
def mallet_train_classifier(self):
    """Run the training commands stored on the mallet configuration, one after
    the other, and compress the files that are listed at the end.

    The prune command is only included when prune_p is set on the
    configuration."""
    config = self.mallet_config
    commands = [config.cmd_csv2vectors_train,
                config.cmd_train_classifier,
                config.cmd_classifier2info,
                config.cmd_cinfo_sorted]
    if config.prune_p:
        commands.append(config.cmd_prune)
    for command in commands:
        print("[mallet_train_classifier]")
        run_command(command)
    compress(config.cinfo_file,
             config.cinfo_sorted_file,
             config.train_vectors_file,
             config.train_vectors_out_file,
             config.train_mallet_file)
def run_seg2tag(rconfig, options):
    """Run the Chinese tagger on the seg files of the input dataset.

    Each input file is uncompressed, tagged, and then compressed again
    together with the output file. The options argument is not used in this
    function.

    Returns a tuple of count % STEP (count being the number of files handled)
    and a list that holds the output dataset."""

    input_dataset, output_dataset = _get_datasets(SEG2TAG, rconfig)
    tagger = cn_seg2tag.Tagger()
    fspecs = FileSpecificationList(rconfig.filelist,
                                   output_dataset.files_processed,
                                   rconfig.limit)
    # count keeps its initial value when there are no files to process
    count = 0
    for count, fspec in enumerate(fspecs, 1):
        file_in, file_out = _prepare_io(SEG2TAG, fspec, input_dataset,
                                        output_dataset, rconfig, count)
        uncompress(file_in)
        tagger.tag(file_in, file_out)
        compress(file_in, file_out)
        _update_state_files_processed(output_dataset, count)
    return count % STEP, [output_dataset]
def run_txt2seg(rconfig, options):
    """Run the Chinese segmenter on the txt files of the input dataset.

    Each input file is uncompressed, segmented, and then compressed again
    together with the output file. The options argument is not used in this
    function.

    Returns a tuple of count % STEP (count being the number of files handled)
    and a list that holds the output dataset."""

    input_dataset, output_dataset = _get_datasets(TXT2SEG, rconfig)
    segmenter = cn_txt2seg.Segmenter()
    fspecs = FileSpecificationList(rconfig.filelist,
                                   output_dataset.files_processed,
                                   rconfig.limit)
    # count keeps its initial value when there are no files to process
    count = 0
    for count, fspec in enumerate(fspecs, 1):
        file_in, file_out = _prepare_io(TXT2SEG, fspec, input_dataset,
                                        output_dataset, rconfig, count)
        uncompress(file_in)
        segmenter.process(file_in, file_out)
        compress(file_in, file_out)
        _update_state_files_processed(output_dataset, count)
    return count % STEP, [output_dataset]
def copy_and_compress(source, target_dir, target_file):
    """Make sure target_dir exists, copy source to target_file and compress
    the copy.

    The function does not check how target_dir and target_file relate;
    presumably target_dir is the directory that holds target_file, which is
    for the caller to ensure."""
    # the directory has to be in place before the copy can be written
    ensure_path(target_dir)
    shutil.copyfile(source, target_file)
    compress(target_file)
def compress_files(self):
    """Compress the test mallet file and the classifier output file that are
    named on the mallet configuration."""
    config = self.mallet_config
    compress(config.test_mallet_file,
             config.classifier_out_file)