Example #1
0
 def process(self, title, text):
     import Corpus
     self.count += 1
     title = title.replace(' ', '_').encode('utf8')
     text = text.encode('utf8')
     if self.name_set.__contains__(title):
         self.writer.write(Corpus.Document(str(self.id), '<title>%s</title>\n%s' % (title, text)))
         print self.count, self.id, title
         self.id += 1
Example #2
0
def do_batch_apply(trec_path, model_dir, pattern_path, out_path, lib_dir):
    get_classpath(lib_dir)
    check_java_compile(lib_dir)
    pattern_set = set(
        map(lambda line: line.split()[0],
            open(pattern_path).readlines()))
    base_tag_trec_path = '%s.basetag' % trec_path
    command = [
        'java', '-Xms13G', '-Xmx13G', '-classpath', class_path,
        stanford_tag_program, '--batch-trec', trec_path, base_tag_trec_path
    ]
    print ' '.join(command)
    subprocess.call(command)

    t = time.time()
    reader = Corpus.TRECReader()
    reader.open(base_tag_trec_path)
    doc = reader.next()
    indecies = [0]
    ids = []
    all_tagged_text = None
    while doc:
        tagged_text = TaggedText()
        tagged_text.get_from_string('\n'.join(
            filter(lambda line: not line.startswith('<'),
                   doc.text.split('\n'))))
        if all_tagged_text:
            all_tagged_text += tagged_text
        else:
            all_tagged_text = tagged_text
        indecies.append(len(all_tagged_text))
        tagged_text = apply_tag(trec_path, tagged_text, model_dir, pattern_set)
        ids.append(doc.ID)
        doc = reader.next()
    reader.close()
    os.remove(base_tag_trec_path)

    #tagged_text = apply_tag(trec_path, all_tagged_text, model_dir, pattern_set)
    print len(tagged_text)
    writer = Corpus.TRECWriter(out_path)
    for i in xrange(len(ids)):
        doc = Corpus.Document(
            ids[i], tagged_text[indecies[i]:indecies[i + 1]].__str__())
        writer.write(doc)
    writer.close()
    global prune_t, label_t
    print time.time() - t, prune_t, label_t