Example #1
def do_count_length(in_trec, out_path):
    # Stream a TREC-packaged Wikipedia dump and record the title and raw
    # markup length of every non-redirect article.
    import codecs

    import Corpus
    reader = Corpus.TRECReader()
    reader.open(in_trec)
    doc = reader.next()
    with codecs.open(out_path, encoding='utf8', mode='w') as writer:
        while doc:
            length = len(doc.text)
            # Skip redirect stubs; they have no article body.
            if '#redirect' in doc.text.lower():
                doc = reader.next()
                continue
            # Strip wiki markup, keeping only the lead section.
            plain = Wiki2Plain(get_main_section(doc.text))
            text = plain.text

            # The first line of the plain text is the article title.
            body_start_pos = text.find('\n')
            if body_start_pos > 0:
                title = text[:body_start_pos]
                writer.write(u'%s\t%d\n' % (title, length))
                writer.flush()

            doc = reader.next()
    reader.close()
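A sketch of reading the output back, assuming the tab-separated title/length format written above; the path is a placeholder:

import codecs

def top_lengths(tsv_path, n=10):
    # Return the n longest articles as (length, title) pairs.
    rows = []
    with codecs.open(tsv_path, encoding='utf8') as f:
        for line in f:
            title, length = line.rstrip('\n').rsplit('\t', 1)
            rows.append((int(length), title))
    return sorted(rows, reverse=True)[:n]

# top_lengths('lengths.tsv')   # hypothetical output path from do_count_length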
def do_match(infobox_path, text_path, out_path):
    # Tag each article whose title has an infobox entry and write the
    # tagged documents to a new TREC file.
    import Corpus
    import time

    print('loading...')
    infobox = load_infobox(infobox_path)
    reader = Corpus.TRECReader()
    reader.open(text_path)
    writer = Corpus.TRECWriter(out_path)
    matcher = InfoBoxMatcher()

    t0 = time.time()
    count = 0
    doc = reader.next()
    while doc:
        text = doc.text
        lines = text.split('\n')

        # The title sits on the second line, wrapped in a tag.
        title_line = lines[1]
        title_begin_index = title_line.find('>')
        title_end_index = title_line.find('<', title_begin_index + 1)
        title = ''
        if title_begin_index >= 0 and title_end_index >= 0:
            title = title_line[title_begin_index + 1:title_end_index].strip()
            if title in infobox:
                # Keep the three header lines and append the tagged body.
                tagged_text = matcher.match(infobox[title], lines[3:])
                doc.text = '\n'.join(lines[:3]) + '\n'
                doc.text += tagged_text
                writer.write(doc)
        doc = reader.next()
        count += 1
        if count % 100 == 0:
            print(count, time.time() - t0)
    writer.close()
    reader.close()
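do_match keeps the first three lines of each document as a header and reads the title from the second line, between the first '>' and the next '<'. A record satisfying that layout (the tag names here are assumptions, not taken from the Corpus module):

sample_doc_text = '\n'.join([
    '<DOC>',                             # line 0: header, kept verbatim
    '<title>Alan Turing</title>',        # line 1: title parsed from here
    '<text>',                            # line 2: last header line
    'Alan Turing was a mathematician.',  # lines 3+: body passed to the matcher
])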
Example #3
def do_batch(in_trec, out_dir):
    # Convert a TREC-packaged Wikipedia dump into sharded JSON files of
    # {id, title, body} records, entry_per_file articles per shard.
    import codecs
    import json
    import os
    import re
    import time

    import Corpus
    reader = Corpus.TRECReader()
    reader.open(in_trec)
    doc = reader.next()
    count = 1
    entry_per_file = 10000
    json_list = []
    start_time = time.time()
    while doc:
        plain = Wiki2Plain(get_main_section(doc.text))
        text = plain.text

        body_start_pos = text.find('\n')
        if body_start_pos > 0:
            title = text[:body_start_pos]
            body = text[body_start_pos:]
            # Skip pages whose namespace prefix matches invalid_title_pattern.
            if not title.count(':') or not re.match(invalid_title_pattern, title.split(':')[0]):
                json_list.append({'id': str(count), 'title': title.strip(), 'body': body.strip()})
                if count % entry_per_file == 0:
                    out_path = os.path.join(out_dir, str(count // entry_per_file) + '.json')
                    print('writing', out_path)
                    with codecs.open(out_path, encoding='utf-8', mode='w') as writer:
                        json.dump(json_list, writer, indent=2, ensure_ascii=False)
                        json_list = []
                    print(count, title, time.time() - start_time)
                count += 1
        doc = reader.next()
    # Flush any records left over after the last full shard; the original
    # loop silently dropped them.
    if json_list:
        out_path = os.path.join(out_dir, str(count // entry_per_file + 1) + '.json')
        with codecs.open(out_path, encoding='utf-8', mode='w') as writer:
            json.dump(json_list, writer, indent=2, ensure_ascii=False)
    reader.close()
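Each shard is a JSON array of flat records. A sketch of loading the shards back; the directory name is a placeholder:

import codecs
import json
import os

def load_shards(shard_dir):
    # Yield every {id, title, body} record from the numbered shards,
    # in numeric shard order ('2.json' before '10.json').
    names = [n for n in os.listdir(shard_dir) if n.endswith('.json')]
    for name in sorted(names, key=lambda n: int(n.split('.')[0])):
        with codecs.open(os.path.join(shard_dir, name), encoding='utf-8') as f:
            for record in json.load(f):
                yield record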
def do_filter(sample_url_path, corpus_path, sample_corpus_path):
    # Copy only the documents whose title appears in the sample URL list.
    import Corpus
    # Each line's first field is a URL; its last path segment is the name.
    name_set = set(
        map(lambda line: line.strip().split()[0].split('/')[-1],
            open(sample_url_path).readlines()))
    trec_reader = Corpus.TRECReader()
    trec_reader.open(corpus_path)
    trec_writer = Corpus.TRECWriter(sample_corpus_path)
    doc = trec_reader.next()
    start_title_tag = '<title>'
    start_title_tag_len = len(start_title_tag)
    end_title_tag = '</title>'
    count = 0
    while doc:
        text = doc.text
        start = text.find(start_title_tag)
        end = text.find(end_title_tag)
        title = ''
        if start >= 0 and end >= 0:
            title = text[start + start_title_tag_len:end]
        if title in name_set:
            trec_writer.write(doc)
            count += 1
            if count % 1000 == 0:
                print(count)
        doc = trec_reader.next()
    trec_reader.close()
    trec_writer.close()
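The sample URL file is parsed very simply: the first whitespace-separated field of each line is a URL, and its last path segment becomes the name. A minimal check of that parsing, with an invented record:

line = 'https://en.wikipedia.org/wiki/Alan_Turing 42'   # illustrative record
name = line.strip().split()[0].split('/')[-1]
assert name == 'Alan_Turing'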
Example #5
def do_convert_mallet(match_path, mallet_path, tag_path, num):
    # Convert up to `num` tagged documents into MALLET training format.
    import time

    import Corpus
    reader = Corpus.TRECReader()
    reader.open(match_path)
    doc = reader.next()
    converter_type = 'token'
    converter = get_converter(converter_type)
    converter.open(mallet_path)
    # The tag file lists one tag name per line.
    tag_set = set(map(lambda s: s.strip(), open(tag_path).readlines()))
    num = int(num)

    doc_count = 0
    t0 = time.time()
    while doc:
        tagged_text = TaggedText()
        tagged_text.get_from_string(doc.text)
        convert_mallet(tagged_text, converter, tag_set)
        doc = reader.next()
        doc_count += 1
        if doc_count % 10 == 0:
            print(doc_count, time.time() - t0)
        if doc_count > num:
            break
    converter.close()
    reader.close()
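A call sketch, assuming the helpers (get_converter, TaggedText, convert_mallet) come from the surrounding module; the paths and tag names below are illustrative only:

# Write a hypothetical tag list, one tag per line, as tag_set expects.
with open('tags.txt', 'w') as f:              # placeholder path
    f.write('wiki:person\nwiki:location\n')
# do_convert_mallet('matched.trec', 'train.mallet', 'tags.txt', 500)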
Example #6
def do_batch_apply(trec_path, model_dir, pattern_path, out_path, lib_dir):
    # Run the Stanford tagger over a whole TREC file, then apply the
    # trained models and write the re-tagged documents.
    get_classpath(lib_dir)
    check_java_compile(lib_dir)
    # Each line's first field is a pattern name.
    pattern_set = set(
        map(lambda line: line.split()[0],
            open(pattern_path).readlines()))
    base_tag_trec_path = '%s.basetag' % trec_path
    command = [
        'java', '-Xms13G', '-Xmx13G', '-classpath', class_path,
        stanford_tag_program, '--batch-trec', trec_path, base_tag_trec_path
    ]
    print(' '.join(command))
    subprocess.call(command)

    t = time.time()
    reader = Corpus.TRECReader()
    reader.open(base_tag_trec_path)
    doc = reader.next()
    indices = [0]
    ids = []
    all_tagged_text = None
    while doc:
        tagged_text = TaggedText()
        tagged_text.get_from_string('\n'.join(
            filter(lambda line: not line.startswith('<'),
                   doc.text.split('\n'))))
        # Concatenate all documents and remember each boundary so the
        # tagged output can be split back per document.
        if all_tagged_text:
            all_tagged_text += tagged_text
        else:
            all_tagged_text = tagged_text
        indices.append(len(all_tagged_text))
        ids.append(doc.ID)
        doc = reader.next()
    reader.close()
    os.remove(base_tag_trec_path)

    # Tag the concatenated text in one pass, then slice it back per document.
    tagged_text = apply_tag(trec_path, all_tagged_text, model_dir, pattern_set)
    print(len(tagged_text))
    writer = Corpus.TRECWriter(out_path)
    for i in range(len(ids)):
        doc = Corpus.Document(
            ids[i], str(tagged_text[indices[i]:indices[i + 1]]))
        writer.write(doc)
    writer.close()
    global prune_t, label_t
    print(time.time() - t, prune_t, label_t)
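For reference, the subprocess call at the top of this function corresponds to the shell invocation below; class_path and stanford_tag_program are module-level values not shown here:

# java -Xms13G -Xmx13G -classpath <class_path> <stanford_tag_program> \
#     --batch-trec <trec_path> <trec_path>.basetag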
Example #7
def do_batch(in_trec, out_trec):
    # Convert wiki markup to plain text and wrap the first line (the
    # article title) in a <title> tag before rewriting the TREC file.
    import Corpus
    reader = Corpus.TRECReader()
    reader.open(in_trec)
    writer = Corpus.TRECWriter(out_trec)
    doc = reader.next()
    count = 1
    while doc:
        plain = Wiki2Plain(doc.text)
        text = plain.text
        pos = text.find('\n')
        if pos > 0:
            text = '<title>%s</title>%s' % (text[:pos], text[pos:])
        doc.text = text
        writer.write(doc)
        doc = reader.next()
        if count % 1000 == 0:
            print(count)
        count += 1
    reader.close()
    writer.close()
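The rewrite is purely textual: the first line of the plain text becomes a <title> element. A minimal before/after illustration with invented content:

before = 'Alan Turing\nAlan Turing was a mathematician.'
pos = before.find('\n')
after = '<title>%s</title>%s' % (before[:pos], before[pos:])
assert after == '<title>Alan Turing</title>\nAlan Turing was a mathematician.'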
def do_stat(match_path):
    # Count wiki:* tag occurrences in a matched TREC file and collect the
    # sets of tags that appear together on a single token.
    import time

    import Corpus
    counts = {}
    conflicts = set()
    reader = Corpus.TRECReader()
    reader.open(match_path)
    doc = reader.next()
    doc_count = 0
    t0 = time.time()
    total_count = 0
    while doc:
        for token in doc.text.split():
            # Tokens look like word/[tag,...]; split off the tag string.
            pos = token.find('/')
            if pos > 0:
                tag_string = token[pos + 1:]
                if tag_string.startswith('[') and tag_string.endswith(']'):
                    conflict_set = set()
                    for tag_token in tag_string[1:-1].split(','):
                        if tag_token.startswith('wiki:'):
                            conflict_set.add(tag_token)
                            total_count += 1
                            counts[tag_token] = counts.get(tag_token, 0) + 1
                    if len(conflict_set) > 1:
                        conflicts.add(' '.join(list(conflict_set)))
        doc = reader.next()
        doc_count += 1
        if doc_count % 1000 == 0:
            print(doc_count, time.time() - t0, total_count, len(counts),
                  len(conflicts))
    # Report tags from most to least frequent, then the conflicting sets.
    count_array = sorted(
        ((count, tag) for tag, count in counts.items()), reverse=True)
    for count, tag in count_array:
        print(count, tag)
    for conflict in conflicts:
        print(conflict)
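A minimal check of the token format do_stat parses; the tag names are illustrative:

token = 'Turing/[wiki:person,wiki:scientist]'   # illustrative tagged token
pos = token.find('/')
tag_string = token[pos + 1:]
assert tag_string.startswith('[') and tag_string.endswith(']')
assert tag_string[1:-1].split(',') == ['wiki:person', 'wiki:scientist']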