def process(self, title, text):
    """Count one page and write it out if its title is in ``self.name_set``.

    Every call increments ``self.count``. The title is normalized by
    replacing spaces with underscores and encoding it as UTF-8; only when
    that normalized title is a member of ``self.name_set`` is a
    ``Corpus.Document`` written through ``self.writer``, a progress line
    printed, and ``self.id`` advanced.

    Args:
        title: Page title (text); spaces are turned into underscores.
        text: Page body (text); UTF-8 encoded before being written.
    """
    self.count += 1
    title = title.replace(' ', '_').encode('utf8')
    if title not in self.name_set:
        return

    # Deferred until a page is actually kept, so skipped pages pay for
    # neither the module lookup nor encoding a body that is thrown away.
    import Corpus
    text = text.encode('utf8')

    self.writer.write(
        Corpus.Document(str(self.id),
                        '<title>%s</title>\n%s' % (title, text)))
    # Single pre-formatted string: prints "count id title" identically
    # whether print is a statement or a function.
    print('%s %s %s' % (self.count, self.id, title))
    self.id += 1
def do_batch_apply(trec_path, model_dir, pattern_path, out_path, lib_dir):
    """Tag every document of a TREC file and write the results to out_path.

    Steps:
      1. Run the Java program named by the module-level
         ``stanford_tag_program`` in ``--batch-trec`` mode over
         ``trec_path``, producing the intermediate file
         ``<trec_path>.basetag``.
      2. Read that file document by document, drop every line that starts
         with ``<``, parse the rest into a ``TaggedText`` and pass it
         through ``apply_tag``.
      3. Remove the intermediate file and write one ``Corpus.Document``
         per input document, under its original ID, to ``out_path``.

    Each document's tagged result is kept next to its ID, so every output
    document is built from its own text. Zero input documents yield an
    empty output file.

    Args:
        trec_path: Input TREC file; also forwarded to ``apply_tag``.
        model_dir: Forwarded unchanged to ``apply_tag``.
        pattern_path: Text file whose first whitespace-separated field on
            each non-blank line becomes a member of the pattern set.
        out_path: Destination TREC file.
        lib_dir: Forwarded to ``get_classpath`` and ``check_java_compile``.
    """
    # NOTE(review): presumably get_classpath() is what populates the
    # module-level ``class_path`` used below -- confirm.
    get_classpath(lib_dir)
    check_java_compile(lib_dir)

    with open(pattern_path) as pattern_file:
        # Blank lines are skipped; indexing [0] on them would raise.
        pattern_set = set(
            line.split()[0] for line in pattern_file if line.strip())

    base_tag_trec_path = '%s.basetag' % trec_path
    command = [
        'java', '-Xms13G', '-Xmx13G', '-classpath', class_path,
        stanford_tag_program, '--batch-trec', trec_path, base_tag_trec_path
    ]
    print(' '.join(command))
    # NOTE(review): the exit status is not checked; a failed Java run
    # surfaces only when the intermediate file is opened below.
    subprocess.call(command)

    t = time.time()
    tagged_docs = []  # (document ID, tagged result) in input order
    reader = Corpus.TRECReader()
    reader.open(base_tag_trec_path)
    try:
        doc = reader.next()
        while doc:
            tagged_text = TaggedText()
            tagged_text.get_from_string('\n'.join(
                line for line in doc.text.split('\n')
                if not line.startswith('<')))
            tagged_docs.append(
                (doc.ID,
                 apply_tag(trec_path, tagged_text, model_dir, pattern_set)))
            doc = reader.next()
    finally:
        # Release the reader and drop the intermediate file even when
        # parsing or tagging fails part-way through.
        reader.close()
        os.remove(base_tag_trec_path)

    print(sum(len(tagged) for _, tagged in tagged_docs))

    writer = Corpus.TRECWriter(out_path)
    try:
        for doc_id, tagged in tagged_docs:
            # Rendered through a full-range slice to keep the
            # slice-then-stringify path; whether slicing changes the
            # result type is not visible here.
            writer.write(
                Corpus.Document(doc_id, str(tagged[0:len(tagged)])))
    finally:
        writer.close()

    # prune_t / label_t are module-level values, only read here.
    print('%s %s %s' % (time.time() - t, prune_t, label_t))