def run_txt2seg(rconfig, limit, options, verbose):
    """Takes txt files and runs the Chinese segmenter on them.

    Processes at most limit files from the txt input dataset, segmenting
    each one and writing the result to the seg output dataset. The options
    argument is accepted for interface uniformity with the other run_*
    steps but is not used here. Returns a pair of the number of files
    processed since the last recorded bookkeeping step and a singleton
    list with the output dataset."""

    input_dataset = find_input_dataset(TXT2SEG, rconfig)
    output_dataset = find_output_dataset(TXT2SEG, rconfig)
    print_datasets(TXT2SEG, input_dataset, output_dataset)
    check_file_counts(input_dataset, output_dataset, limit)

    count = 0
    # create the segmenter once and reuse it for every file
    segmenter = sdp.Segmenter()
    swrapper = cn_txt2seg.SegmenterWrapper(segmenter)

    fspecs = get_lines(rconfig.filenames, output_dataset.files_processed,
                       limit)
    for fspec in fspecs:
        count += 1
        filename = fspec.target
        print_file_progress(TXT2SEG, rconfig.corpus, count, filename, verbose)
        file_in, file_out = prepare_io(filename, input_dataset, output_dataset)
        uncompress(file_in)
        swrapper.process(file_in, file_out)
        compress(file_in, file_out)
        # record progress in chunks of STEP files
        if count % STEP == 0:
            output_dataset.update_processed_count(STEP)

    # remainder not yet reflected in the dataset's processed count
    return (count % STEP, [output_dataset])
def run_txt2tag(rconfig, limit, options, verbose):
    """Run the part-of-speech tagger over the txt files of the corpus.

    Processes at most limit files from the txt input dataset, writing the
    tagged versions to the tag output dataset. Returns a pair of the
    number of files processed since the last bookkeeping step and a
    singleton list with the output dataset."""

    input_dataset = find_input_dataset(TXT2TAG, rconfig)
    output_dataset = find_output_dataset(TXT2TAG, rconfig)
    print_datasets(TXT2TAG, input_dataset, output_dataset)
    check_file_counts(input_dataset, output_dataset, limit)

    tagger = txt2tag.get_tagger(rconfig.language)
    specs = get_lines(rconfig.filenames, output_dataset.files_processed,
                      limit)

    count = 0
    for spec in specs:
        count += 1
        fname = spec.target
        print_file_progress(TXT2TAG, rconfig.corpus, count, fname, verbose)
        infile, outfile = prepare_io(fname, input_dataset, output_dataset)
        uncompress(infile)
        txt2tag.tag(infile, outfile, tagger)
        # compressing only applies to en here; it will become relevant for
        # cn once there is a segmenter/tagger that uses only one step
        if rconfig.language == 'en':
            compress(infile, outfile)
        if count % STEP == 0:
            output_dataset.update_processed_count(STEP)

    return (count % STEP, [output_dataset])
 def run_classifier(self, t1, corpus):
     """Run the classifier over corpus: build the mallet file, run the
     classifier, calculate the scores and write the info files, then
     compress the large result files. t1 is handed through to
     _create_info_files (presumably a start time -- TODO confirm)."""
     ensure_path(self.output)
     self._create_mallet_file(corpus=corpus)
     self._run_classifier()
     self._calculate_scores()
     self._create_info_files(t1)
     # compress the bulky intermediate and result files to save space
     for fname in (self.results_file, self.mallet_file, self.scores_s1):
         print "[Classifier.run_classifier] Compressing", fname
         compress(fname)
 def run(self):
     """Run the full classifier batch: find the datasets, build the mallet
     file, classify, score, evaluate and write the info files, then
     compress the large result files."""
     # refuse to clobber results from an earlier run in the same batch
     if os.path.exists(self.info_file_general):
         sys.exit("WARNING: already have classifier results in %s" %
                  self.batch)
     ensure_path(self.batch)
     self._find_datasets()
     self._create_mallet_file()
     self._run_classifier()
     self._calculate_scores()
     self._run_eval()
     self._create_info_files()
     # compress the bulky result files to save disk space
     compress(self.results_file, self.mallet_file, self.scores_s1)
    def mallet_train_classifier(self):

        commands = [self.mallet_config.cmd_csv2vectors_train,
                    self.mallet_config.cmd_train_classifier,
                    self.mallet_config.cmd_classifier2info,
                    self.mallet_config.cmd_cinfo_sorted]
        if self.mallet_config.prune_p:
            commands.append(self.mallet_config.cmd_prune)
        for cmd in commands:
            print "[mallet_train_classifier]"
            run_command(cmd)
        compress(self.mallet_config.cinfo_file,
                 self.mallet_config.cinfo_sorted_file,
                 self.mallet_config.train_vectors_file,
                 self.mallet_config.train_vectors_out_file,
                 self.mallet_config.train_mallet_file)
def run_populate(rconfig, limit, verbose=False):
    """Populate xml directory in the target directory with limit files from the
    source file list or the source directory."""

    output_name = DOCUMENT_PROCESSING_IO[POPULATE]['out']
    dataset = DataSet(POPULATE, output_name, rconfig)

    # initialize data set if it does not exist, this is not contingent on
    # anything because --populate is the first step
    if not dataset.exists():
        dataset.initialize_on_disk()
        dataset.load_from_disk()

    fspecs = get_lines(rconfig.filenames, dataset.files_processed, limit)
    print "[--populate] adding %d files to %s" % (len(fspecs), dataset)
    count = 0
    for fspec in fspecs:
        count += 1
        src_file = fspec.source
        dst_file = os.path.join(rconfig.target_path, 'data', output_name,
                                dataset.version_id, 'files', fspec.target)
        # allow for compressed files, while being handed the name without
        # extension
        if not os.path.exists(src_file):
            src_file += ".gz"
            dst_file += ".gz"
        if verbose:
            print "[--populate] %04d %s" % (count, dst_file)
        ensure_path(os.path.dirname(dst_file))
        try:
            shutil.copyfile(src_file, dst_file)
        except IOError:
            print "                  WARNING: source file does not exist, not copying"
            print "                  %s" % src_file
        # at some point there seemed to be an issue with compressing for Chinese,
        # so added this to do language dependent compressing, there is now no
        # difference for the population phase
        if rconfig.language == 'en': compress(dst_file)
        elif rconfig.language == 'cn': compress(dst_file)
        # TODO: does this mean that you miss some if total_count % STEP != 0
        if count % STEP == 0:
            dataset.update_processed_count(STEP)

    return (count % STEP, [dataset])
def run_xml2txt(rconfig, limit, options, verbose=False):
    """Takes the xml file and produces a txt file with a simplified document
    structure, keeping date, title, abstract, summary, description_rest,
    first_claim and other_claims. Does this by calling the document structure
    parser in onto mode if the document source is LEXISNEXIS and uses a simple
    parser defined in xml2txt if the source is WOS.

    Processes at most limit files; when the parser raises, an empty output
    file is written so downstream steps still have something to consume.
    Returns (files processed since last bookkeeping step, [output_dataset])."""

    input_dataset = find_input_dataset(XML2TXT, rconfig)
    output_dataset = find_output_dataset(XML2TXT, rconfig)
    print_datasets(XML2TXT, input_dataset, output_dataset)
    check_file_counts(input_dataset, output_dataset, limit)

    count = 0
    doc_parser = make_parser(rconfig.language)
    workspace = os.path.join(rconfig.target_path, 'data', 'workspace')

    fspecs = get_lines(rconfig.filenames, output_dataset.files_processed,
                       limit)
    for fspec in fspecs:
        count += 1
        filename = fspec.target
        print_file_progress(XML2TXT, rconfig.corpus, count, filename, verbose)
        file_in, file_out = prepare_io(filename, input_dataset, output_dataset)
        uncompress(file_in)
        try:
            xml2txt.xml2txt(doc_parser, rconfig.datasource, file_in, file_out,
                            workspace)
        except Exception:
            # just write an empty file that can be consumed downstream
            fh = codecs.open(file_out, 'w')
            fh.close()
            print "[--xml2txt] WARNING: error on", file_in
        # compress for both en and cn; the two branches used to differ but
        # we now do compress the cn output of the document parser as well
        if rconfig.language in ('en', 'cn'):
            compress(file_in, file_out)
        if count % STEP == 0:
            output_dataset.update_processed_count(STEP)

    return (count % STEP, [output_dataset])
def run_seg2tag(rconfig, limit, options, verbose):
    """Run the Chinese tagger over previously segmented (seg) files.

    At most limit files are taken from the seg input dataset; each one is
    uncompressed, tagged into the tag output dataset and compressed again.
    Returns a pair of the number of files processed since the last
    bookkeeping step and a singleton list with the output dataset."""

    input_dataset = find_input_dataset(SEG2TAG, rconfig)
    output_dataset = find_output_dataset(SEG2TAG, rconfig)
    print_datasets(SEG2TAG, input_dataset, output_dataset)
    check_file_counts(input_dataset, output_dataset, limit)

    tagger = txt2tag.get_tagger(rconfig.language)
    specs = get_lines(rconfig.filenames, output_dataset.files_processed,
                      limit)

    count = 0
    for spec in specs:
        count += 1
        fname = spec.target
        print_file_progress(SEG2TAG, rconfig.corpus, count, fname, verbose)
        infile, outfile = prepare_io(fname, input_dataset, output_dataset)
        uncompress(infile)
        cn_seg2tag.tag(infile, outfile, tagger)
        compress(infile, outfile)
        if count % STEP == 0:
            output_dataset.update_processed_count(STEP)

    return (count % STEP, [output_dataset])
 def compress_files(self):
     """Compress the mallet test file and the classifier output file."""
     compress(self.mallet_config.test_mallet_file,
              self.mallet_config.classifier_out_file)
def copy_and_compress(source, target_dir, target_file):
    """Copy source into target_dir as target_file and compress the copy.

    The target directory is created first in case it does not yet exist."""
    # make sure the destination directory is there before copying
    ensure_path(target_dir)
    # copy the file contents, then gzip the fresh copy
    shutil.copyfile(source, target_file)
    compress(target_file)