def _title_from_doc_lines(lines):
    """Return the title found on the second line of ``lines``, or None.

    The title is the stripped text between the first ``>`` and the next
    ``<`` on ``lines[1]``. None is returned when there is no second line
    or when either delimiter is missing.
    """
    if len(lines) < 2:
        return None
    title_line = lines[1]
    begin = title_line.find('>')
    end = title_line.find('<', begin + 1)
    if begin < 0 or end < 0:
        return None
    return title_line[begin + 1:end].strip()


def do_match(infobox_path, text_path, out_path):
    """Tag documents that have an infobox entry and write them to a TREC file.

    Documents are read from ``text_path``. For each one the title is taken
    from its second line (text between the first ``>`` and the following
    ``<``). When that title is a key of the object returned by
    ``load_infobox(infobox_path)``, the lines after the first three are
    passed to ``InfoBoxMatcher.match`` together with the infobox entry, and
    the document is written to ``out_path`` as its first three lines
    followed by the matcher output. Documents without a usable or known
    title are not written.

    The number of documents read and the elapsed seconds are printed every
    100 documents.
    """
    import Corpus
    import time

    print('loading......')
    infobox = load_infobox(infobox_path)
    reader = Corpus.TRECReader()
    reader.open(text_path)
    writer = None
    try:
        writer = Corpus.TRECWriter(out_path)
        matcher = InfoBoxMatcher()

        t0 = time.time()
        count = 0
        doc = reader.next()
        while doc:
            lines = doc.text.split('\n')
            title = _title_from_doc_lines(lines)
            # NOTE(review): assumes the infobox object supports ``in`` the
            # same way it supported has_key() (true for a dict) - confirm
            # against load_infobox.
            if title is not None and title in infobox:
                tagged_text = matcher.match(infobox[title], lines[3:])
                doc.text = '\n'.join(lines[:3]) + '\n' + tagged_text
                writer.write(doc)
            doc = reader.next()
            count += 1
            if count % 100 == 0:
                # Single-argument form: same output as the two-item print
                # statement, and valid with or without print_function.
                print('%s %s' % (count, time.time() - t0))
    finally:
        # Release both handles even when matching fails part-way through.
        if writer is not None:
            writer.close()
        reader.close()
def do_filter(sample_url_path, corpus_path, sample_corpus_path):
    """Copy the documents whose title is named in a URL list.

    ``sample_url_path`` is read line by line; the first
    whitespace-separated field of each non-blank line is split on ``/`` and
    its last component is kept as a wanted name. Every document of the TREC
    file ``corpus_path`` whose text between ``<title>`` and the following
    ``</title>`` equals one of those names is written to
    ``sample_corpus_path``. Documents without a title element are never
    copied.

    The number of copied documents is printed every 1000 copies.
    """
    import Corpus

    name_set = set()
    with open(sample_url_path) as url_file:
        for line in url_file:
            fields = line.split()
            if fields:  # blank lines carry no URL
                name_set.add(fields[0].split('/')[-1])

    start_title_tag = '<title>'
    end_title_tag = '</title>'
    start_title_tag_len = len(start_title_tag)

    trec_reader = Corpus.TRECReader()
    trec_reader.open(corpus_path)
    trec_writer = None
    try:
        trec_writer = Corpus.TRECWriter(sample_corpus_path)
        count = 0
        doc = trec_reader.next()
        while doc:
            text = doc.text
            # None (not '') marks "no title" so that an empty name in
            # name_set cannot match documents that have no title at all.
            title = None
            start = text.find(start_title_tag)
            if start >= 0:
                title_begin = start + start_title_tag_len
                # Look for the closing tag only after the opening one.
                end = text.find(end_title_tag, title_begin)
                if end >= 0:
                    title = text[title_begin:end]
            if title is not None and title in name_set:
                trec_writer.write(doc)
                count += 1
                if count % 1000 == 0:
                    print(count)
            doc = trec_reader.next()
    finally:
        # Close both handles even if reading or writing fails mid-way.
        trec_reader.close()
        if trec_writer is not None:
            trec_writer.close()
# Example #3
# 0
def do_batch_apply(trec_path, model_dir, pattern_path, out_path, lib_dir):
    """Base-tag a TREC file with the Java tagger, then apply tags per document.

    Steps, in order:

    1. ``get_classpath(lib_dir)`` and ``check_java_compile(lib_dir)`` are
       called, and the first whitespace-separated field of every non-blank
       line of ``pattern_path`` is collected into a set.
    2. The Java program named by the module-level ``stanford_tag_program``
       is run with ``--batch-trec`` to turn ``trec_path`` into
       ``<trec_path>.basetag``.
    3. Each document of that intermediate file has the lines starting with
       ``<`` removed, is loaded into a ``TaggedText`` and passed to
       ``apply_tag``.
    4. The intermediate file is deleted and one ``Corpus.Document`` per
       input document (same ID, string form of its tagged text) is written
       to ``out_path``.

    Finally the elapsed seconds and the module-level ``prune_t`` and
    ``label_t`` values are printed.

    NOTE(review): ``class_path`` is read as a module-level name and is
    presumably set by ``get_classpath`` - confirm.
    """
    get_classpath(lib_dir)
    check_java_compile(lib_dir)

    pattern_set = set()
    with open(pattern_path) as pattern_file:
        for line in pattern_file:
            fields = line.split()
            if fields:  # skip blank lines instead of raising IndexError
                pattern_set.add(fields[0])

    base_tag_trec_path = '%s.basetag' % trec_path
    command = [
        'java', '-Xms13G', '-Xmx13G', '-classpath', class_path,
        stanford_tag_program, '--batch-trec', trec_path, base_tag_trec_path
    ]
    print(' '.join(command))
    subprocess.call(command)

    t = time.time()
    ids = []
    # One apply_tag() result per document. The previous code kept only the
    # last result and sliced it with offsets into the concatenation of all
    # documents, so the written documents did not match their IDs.
    tagged_docs = []
    reader = Corpus.TRECReader()
    reader.open(base_tag_trec_path)
    try:
        doc = reader.next()
        while doc:
            tagged_text = TaggedText()
            tagged_text.get_from_string('\n'.join(
                [line for line in doc.text.split('\n')
                 if not line.startswith('<')]))
            tagged_docs.append(
                apply_tag(trec_path, tagged_text, model_dir, pattern_set))
            ids.append(doc.ID)
            doc = reader.next()
    finally:
        reader.close()
    os.remove(base_tag_trec_path)

    # Total tagged length over all documents (0 for an empty corpus).
    print(sum([len(tagged) for tagged in tagged_docs]))

    writer = Corpus.TRECWriter(out_path)
    try:
        for doc_id, tagged in zip(ids, tagged_docs):
            # NOTE(review): the original serialised a slice of the
            # apply_tag() result through an explicit __str__() call; the
            # full-range slice keeps that same path. Presumably
            # str(tagged) is equivalent - confirm against TaggedText
            # before simplifying.
            writer.write(
                Corpus.Document(doc_id, tagged[0:len(tagged)].__str__()))
    finally:
        writer.close()

    # prune_t and label_t are only read here, so no ``global`` declaration
    # is required.
    print('%s %s %s' % (time.time() - t, prune_t, label_t))
# Example #4
# 0
def do_batch(in_trec, out_trec):
    """Run every document of a TREC file through Wiki2Plain and re-write it.

    Each document read from ``in_trec`` has its text replaced by the
    ``text`` attribute of ``Wiki2Plain(doc.text)``. When that text contains
    a newline that is not its very first character, everything before the
    first newline is wrapped in ``<title>...</title>``. The document is
    then written to ``out_trec``.

    The number of processed documents is printed every 1000 documents.
    """
    import Corpus
    source = Corpus.TRECReader()
    source.open(in_trec)
    sink = Corpus.TRECWriter(out_trec)
    current = source.next()
    processed = 0
    while current:
        body = Wiki2Plain(current.text).text
        # head is non-empty and a newline was found <=> find('\n') > 0
        head, newline, rest = body.partition('\n')
        if newline and head:
            body = '<title>%s</title>%s' % (head, newline + rest)
        current.text = body
        sink.write(current)
        current = source.next()
        processed += 1
        if processed % 1000 == 0:
            print(processed)
    source.close()
    sink.close()
# Example #5
# 0
 def __init__(self, names, out_path):
     """Store the wanted names, open the TREC writer and reset the counters.

     names: iterable of names; kept as a set in ``self.name_set``.
     out_path: path passed to ``Corpus.TRECWriter``.
     """
     import Corpus
     # Counters: ``id`` starts at 1, ``count`` at 0.
     self.id, self.count = 1, 0
     self.name_set = set(names)
     self.writer = Corpus.TRECWriter(out_path)