Exemple #1
0
def run_xml2txt(rconfig, options):
    """Takes the xml file and produces a txt file with a simplified document
    structure, keeping date, title, abstract, summary, description_rest,
    first_claim and other_claims. Does this by calling the document structure
    parser in onto mode if the document source is LexisNexis and uses a simple
    parser defined in xml2txt if the source is WoS."""

    input_dataset, output_dataset = _get_datasets(XML2TXT, rconfig)
    count = 0
    doc_parser = _make_parser(rconfig.language)
    workspace = os.path.join(rconfig.corpus, 'data', 'workspace')
    fspecs = FileSpecificationList(rconfig.filelist,
                                   output_dataset.files_processed,
                                   rconfig.limit)
    for fspec in fspecs:
        count += 1
        file_in, file_out = _prepare_io(XML2TXT, fspec, input_dataset,
                                        output_dataset, rconfig, count)
        uncompress(file_in)
        try:
            xml2txt.xml2txt(doc_parser, rconfig.datasource, file_in, file_out,
                            workspace)
        except Exception:
            # just write an empty file that can be consumed downstream
            fh = codecs.open(file_out, 'w')
            fh.close()
            print "[--xml2txt] WARNING: error on", file_in
        compress(file_in, file_out)
        _update_state_files_processed(output_dataset, count)
    return count % STEP, [output_dataset]
Exemple #2
0
def run_populate(rconfig):
    """Populate xml directory in the target directory with limit files from the
    source file list or the source directory."""

    output_name = DOCUMENT_PROCESSING_IO[POPULATE]['out']
    dataset = DataSet(POPULATE, output_name, rconfig)
    fspecs = FileSpecificationList(rconfig.filelist, dataset.files_processed,
                                   rconfig.limit)
    print "[--populate] adding %d files to %s" % (len(fspecs), dataset)
    count = 0
    for fspec in fspecs:
        count += 1
        src_file = fspec.source
        dst_file = os.path.join(rconfig.corpus, 'data', output_name,
                                dataset.version_id, 'files', fspec.target)
        # allow for compressed files, while being handed the name without extension
        if not os.path.exists(src_file):
            src_file += ".gz"
            dst_file += ".gz"
        if rconfig.verbose:
            print "[--populate] %04d %s" % (count, dst_file)
        ensure_path(os.path.dirname(dst_file))
        _copy_file(src_file, dst_file)
        compress(dst_file)
        _update_state_files_processed(dataset, count)
    return count % STEP, [dataset]
    def mallet_train_classifier(self):

        commands = [self.mallet_config.cmd_csv2vectors_train,
                    self.mallet_config.cmd_train_classifier,
                    self.mallet_config.cmd_classifier2info,
                    self.mallet_config.cmd_cinfo_sorted]
        if self.mallet_config.prune_p:
            commands.append(self.mallet_config.cmd_prune)
        for cmd in commands:
            print "[mallet_train_classifier]"
            run_command(cmd)
        compress(self.mallet_config.cinfo_file,
                 self.mallet_config.cinfo_sorted_file,
                 self.mallet_config.train_vectors_file,
                 self.mallet_config.train_vectors_out_file,
                 self.mallet_config.train_mallet_file)
Exemple #4
0
def run_seg2tag(rconfig, options):
    """Run the Chinese tagger on the seg files of the corpus.

    Each input file is passed to uncompress(), tagged with a single
    cn_seg2tag.Tagger instance that is shared by all files, and then input
    and output are passed to compress(). The processed-files state of the
    output dataset is updated after every file.

    The options argument is not used in this function.

    Returns a pair of count % STEP, where count is the number of files handled
    in this run, and a list holding the output dataset."""

    input_dataset, output_dataset = _get_datasets(SEG2TAG, rconfig)
    tagger = cn_seg2tag.Tagger()
    fspecs = FileSpecificationList(
        rconfig.filelist, output_dataset.files_processed, rconfig.limit)

    # count keeps its initial value when there is nothing to iterate over
    count = 0
    for count, fspec in enumerate(fspecs, 1):
        file_in, file_out = _prepare_io(
            SEG2TAG, fspec, input_dataset, output_dataset, rconfig, count)
        uncompress(file_in)
        tagger.tag(file_in, file_out)
        compress(file_in, file_out)
        _update_state_files_processed(output_dataset, count)

    return count % STEP, [output_dataset]
Exemple #5
0
def run_txt2seg(rconfig, options):
    """Run the Chinese segmenter on the txt files of the corpus.

    Each input file is passed to uncompress(), processed with a single
    cn_txt2seg.Segmenter instance that is shared by all files, and then input
    and output are passed to compress(). The processed-files state of the
    output dataset is updated after every file.

    The options argument is not used in this function.

    Returns a pair of count % STEP, where count is the number of files handled
    in this run, and a list holding the output dataset."""

    input_dataset, output_dataset = _get_datasets(TXT2SEG, rconfig)
    segmenter = cn_txt2seg.Segmenter()
    fspecs = FileSpecificationList(
        rconfig.filelist, output_dataset.files_processed, rconfig.limit)

    # count keeps its initial value when there is nothing to iterate over
    count = 0
    for count, fspec in enumerate(fspecs, 1):
        file_in, file_out = _prepare_io(
            TXT2SEG, fspec, input_dataset, output_dataset, rconfig, count)
        uncompress(file_in)
        segmenter.process(file_in, file_out)
        compress(file_in, file_out)
        _update_state_files_processed(output_dataset, count)

    return count % STEP, [output_dataset]
Exemple #6
0
def copy_and_compress(source, target_dir, target_file):
    """Copy the contents of source to target_file and compress the copy.

    ensure_path() is called on target_dir before copying. Note that
    target_file is used as given and is not joined with target_dir here, so
    the caller is presumably expected to pass a path inside target_dir."""
    # set up the directory first
    ensure_path(target_dir)
    # copyfile copies the file contents only
    shutil.copyfile(source, target_file)
    compress(target_file)
 def compress_files(self):
     """Hand the test mallet file and the classifier output file named on
     self.mallet_config to compress()."""
     cfg = self.mallet_config
     compress(cfg.test_mallet_file, cfg.classifier_out_file)