def do_count_length(in_trec, out_path):
    """Write one "title<TAB>length" line per wiki article to `out_path`.

    Reads TREC-format documents from `in_trec`, skips redirect stubs, and
    records the length of the RAW wiki markup (len(doc.text)), not of the
    converted plain text.

    :param in_trec: path to the input TREC corpus file
    :param out_path: path of the UTF-8 tab-separated output file
    """
    import Corpus
    reader = Corpus.TRECReader()
    reader.open(in_trec)
    doc = reader.next()
    # NOTE: removed dead locals (count / entry_per_file / json_list /
    # start_time) that were copy-pasted from do_batch and never used here.
    with codecs.open(out_path, encoding='utf8', mode='w') as writer:
        while doc:
            # Length of the raw markup, measured before any conversion.
            length = len(doc.text)
            if '#redirect' in doc.text.lower():
                # Redirect stubs carry no content of their own; skip them.
                doc = reader.next()
                continue
            plain = Wiki2Plain(get_main_section(doc.text))
            text = plain.text
            body_start_pos = text.find('\n')
            # The first plain-text line is treated as the article title;
            # articles whose text has no newline (or starts with one) are
            # skipped, matching the original behavior.
            if body_start_pos > 0:
                title = text[:body_start_pos]
                writer.write(u'%s\t%d\n' % (title, length))
                writer.flush()
            doc = reader.next()
    reader.close()
def do_match(infobox_path, text_path, out_path): import Corpus import time print 'loading......' infobox = load_infobox(infobox_path) reader = Corpus.TRECReader() reader.open(text_path) writer = Corpus.TRECWriter(out_path) matcher = InfoBoxMatcher() t0 = time.time() count = 0 doc = reader.next() while doc: text = doc.text lines = text.split('\n') newlines = lines[:3] title_line = lines[1] title_begin_index = title_line.find('>') title_end_index = title_line.find('<', title_begin_index + 1) title = '' if title_begin_index >= 0 and title_end_index >= 0: title = title_line[title_begin_index + 1:title_end_index].strip() if infobox.has_key(title): tagged_text = matcher.match(infobox[title], lines[3:]) doc.text = '\n'.join(lines[:3]) + '\n' doc.text += tagged_text writer.write(doc) doc = reader.next() count += 1 if count % 100 == 0: print count, time.time() - t0 writer.close()
def do_batch(in_trec, out_dir):
    """Convert a TREC wiki corpus into numbered JSON batch files.

    Each output file `<k>.json` holds up to `entry_per_file` entries of the
    form {'id', 'title', 'body'}. Articles whose title has a namespace
    prefix matching `invalid_title_pattern` are skipped.

    NOTE(review): this module defines a second, unrelated do_batch(in_trec,
    out_trec) later in the file; whichever is defined last wins at import
    time — confirm which one callers expect.

    :param in_trec: input TREC corpus path
    :param out_dir: directory receiving the numbered .json files
    """
    import Corpus

    def _dump(entries, path):
        # One batch -> one pretty-printed, non-ASCII-preserving JSON file.
        print('writing', path)
        with codecs.open(path, encoding='utf-8', mode='w') as writer:
            json.dump(entries, writer, indent=2, ensure_ascii=False)

    reader = Corpus.TRECReader()
    reader.open(in_trec)
    doc = reader.next()
    count = 1
    entry_per_file = 10000
    json_list = []
    start_time = time.time()
    while doc:
        plain = Wiki2Plain(get_main_section(doc.text))
        text = plain.text
        body_start_pos = text.find('\n')
        if body_start_pos > 0:
            title = text[:body_start_pos]
            body = text[body_start_pos:]
            # Keep the article unless its "Namespace:" prefix matches the
            # invalid-title pattern.
            if not title.count(':') or not re.match(invalid_title_pattern,
                                                    title.split(':')[0]):
                json_list.append({
                    'id': str(count),
                    'title': title.strip(),
                    'body': body.strip()
                })
            if count % entry_per_file == 0:
                _dump(json_list,
                      os.path.join(out_dir,
                                   str(count / entry_per_file) + '.json'))
                json_list = []
            print(count, title, time.time() - start_time)
            count += 1
        doc = reader.next()
    # BUG FIX: the original dropped the trailing partial batch (up to
    # entry_per_file - 1 entries) when the article count was not an exact
    # multiple of entry_per_file. Flush it to the next file number.
    if json_list:
        _dump(json_list,
              os.path.join(out_dir,
                           str(count / entry_per_file + 1) + '.json'))
    reader.close()
def do_filter(sample_url_path, corpus_path, sample_corpus_path): import Corpus name_set = set( map(lambda line: line.strip().split()[0].split('/')[-1], open(sample_url_path).readlines())) trec_reader = Corpus.TRECReader() trec_reader.open(corpus_path) trec_writer = Corpus.TRECWriter(sample_corpus_path) doc = trec_reader.next() start_title_tag = '<title>' start_title_tag_len = len(start_title_tag) end_title_tag = '</title>' count = 0 while doc: text = doc.text start = text.find(start_title_tag) end = text.find(end_title_tag) title = '' if start >= 0 and end >= 0: title = text[start + start_title_tag_len:end] if name_set.__contains__(title): trec_writer.write(doc) count += 1 if count % 1000 == 0: print count doc = trec_reader.next() trec_reader.close() trec_writer.close()
def do_convert_mallet(match_path, mallet_path, tag_path, num): import Corpus reader = Corpus.TRECReader() reader.open(match_path) doc = reader.next() converter_type = 'token' converter = get_converter(converter_type) converter.open(mallet_path) tag_set = set(map(lambda s: s.strip(), open(tag_path).readlines())) num = int(num) doc_count = 0 t0 = time.time() total_count = 0 while doc: tagged_text = TaggedText() tagged_text.get_from_string(doc.text) convert_mallet(tagged_text, converter, tag_set) doc = reader.next() doc_count += 1 if doc_count % 10 == 0: print doc_count, time.time() - t0 if doc_count > num: break converter.close() reader.close()
def do_batch_apply(trec_path, model_dir, pattern_path, out_path, lib_dir):
    # Run the Stanford tagger over a whole TREC file via a Java subprocess,
    # then apply the trained tagging model per document and write the
    # re-assembled tagged documents to out_path.
    #
    # NOTE(review): several module-level names are used here (class_path,
    # stanford_tag_program, Corpus, TaggedText, apply_tag, prune_t,
    # label_t) — their contracts are not visible in this chunk.
    get_classpath(lib_dir)
    check_java_compile(lib_dir)
    pattern_set = set(
        map(lambda line: line.split()[0], open(pattern_path).readlines()))
    # The Java pass writes a base-tagged copy of the corpus next to the input.
    base_tag_trec_path = '%s.basetag' % trec_path
    command = [
        'java', '-Xms13G', '-Xmx13G', '-classpath', class_path,
        stanford_tag_program, '--batch-trec', trec_path, base_tag_trec_path
    ]
    print ' '.join(command)
    subprocess.call(command)
    t = time.time()
    reader = Corpus.TRECReader()
    reader.open(base_tag_trec_path)
    doc = reader.next()
    # indecies[i]..indecies[i+1] delimits document i inside the CONCATENATED
    # tagged text (all_tagged_text). (sic: "indecies" kept as-is.)
    indecies = [0]
    ids = []
    all_tagged_text = None
    while doc:
        tagged_text = TaggedText()
        # Drop TREC markup lines (those starting with '<') before parsing.
        tagged_text.get_from_string('\n'.join(
            filter(lambda line: not line.startswith('<'),
                   doc.text.split('\n'))))
        if all_tagged_text:
            all_tagged_text += tagged_text
        else:
            all_tagged_text = tagged_text
        indecies.append(len(all_tagged_text))
        # NOTE(review): apply_tag is called per document here, but after the
        # loop only the LAST document's tagged_text survives, while the
        # slice indices below refer to the concatenation. This looks
        # inconsistent — the commented-out line below suggests the authors
        # previously applied the model to all_tagged_text in one shot.
        # Confirm which variant is intended before touching this.
        tagged_text = apply_tag(trec_path, tagged_text, model_dir,
                                pattern_set)
        ids.append(doc.ID)
        doc = reader.next()
    reader.close()
    os.remove(base_tag_trec_path)
    #tagged_text = apply_tag(trec_path, all_tagged_text, model_dir, pattern_set)
    print len(tagged_text)
    writer = Corpus.TRECWriter(out_path)
    for i in xrange(len(ids)):
        # Re-slice the tagged stream back into per-document chunks.
        doc = Corpus.Document(
            ids[i], tagged_text[indecies[i]:indecies[i + 1]].__str__())
        writer.write(doc)
    writer.close()
    # prune_t / label_t are module-level timers, presumably updated inside
    # apply_tag — TODO confirm.
    global prune_t, label_t
    print time.time() - t, prune_t, label_t
def do_batch(in_trec, out_trec): import Corpus reader = Corpus.TRECReader() reader.open(in_trec) writer = Corpus.TRECWriter(out_trec) doc = reader.next() count = 1 while doc: plain = Wiki2Plain(doc.text) text = plain.text pos = text.find('\n') if pos > 0: text = '<title>%s</title>%s' % (text[:pos], text[pos:]) doc.text = text writer.write(doc) doc = reader.next() if count % 1000 == 0: print count count += 1 reader.close() writer.close()
def do_stat(match_path): import Corpus counts = {} conflicts = set() reader = Corpus.TRECReader() reader.open(match_path) doc = reader.next() doc_count = 0 t0 = time.time() total_count = 0 while doc: for token in doc.text.split(): pos = token.find('/') if pos > 0: tag_string = token[pos + 1:] if tag_string.startswith('[') and tag_string.endswith(']'): conflict_set = set() for tag_token in tag_string[1:-1].split(','): if tag_token.startswith('wiki:'): conflict_set.add(tag_token) total_count += 1 if counts.has_key(tag_token): counts[tag_token] += 1 else: counts[tag_token] = 1 if len(conflict_set) > 1: conflicts.add(' '.join(list(conflict_set))) doc = reader.next() doc_count += 1 if doc_count % 1000 == 0: print doc_count, time.time() - t0, total_count, len(counts), len( conflicts) count_array = map(lambda tag_count: (tag_count[1], tag_count[0]), counts.items()) count_array.sort(reverse=True) for count, tag in count_array: print count, tag for conflict in conflicts: print conflict