Example #1
import os

# `args` is an argparse namespace; `Vocab`, `qd_xml_iterator`,
# `xml_field_maping`, and `qd_xml_to_prep` are module-level helpers from the
# source repository.
def xml_train_test_prep(field='body', relevance='TACM'):
    train_file = os.path.join(args.data_dir, 'qd.xml.seg.train')
    test_file = os.path.join(args.data_dir, 'qd.xml.seg.test')
    max_vocab_size = args.max_vocab_size
    train_word_file = os.path.join(args.data_dir, 'train.pointwise')
    test_word_file = os.path.join(args.data_dir, 'test.pointwise')
    train_prep_file = os.path.join(args.data_dir, 'train.prep.pointwise')
    test_prep_file = os.path.join(args.data_dir, 'test.prep.pointwise')
    vocab_file = os.path.join(args.data_dir, 'vocab')
    field_in_xml = xml_field_maping(field)
    print('build vocab ...')
    vocab = Vocab(max_size=max_vocab_size)
    for i, qd in enumerate(qd_xml_iterator(train_file)):
        # Older ElementTree-based variant of this loop body, kept for reference:
        #   query = qd.find('./query').text
        #   words = query.split(' ')
        #   for doc in qd.findall('./doc/{}'.format(field_in_xml)):
        #       words.extend(doc.text.split(' '))
        if i % 10000 == 0:
            # Progress in units of 10,000 ('w' abbreviates the Chinese 万).
            print('{}w'.format(i // 10000))
        query = qd['query']
        words = query.split(' ')
        for doc in qd['doc']:
            words.extend(doc[field_in_xml].split(' '))
        for w in words:
            vocab.add(w)
    vocab.build()
    vocab.save_to_file(vocab_file)
    # Note: word_file is computed but unused; only the prep file is written.
    for from_file, word_file, prep_file in [
            (train_file, train_word_file, train_prep_file),
            (test_file, test_word_file, test_prep_file)]:
        qd_xml_to_prep(from_file,
                       prep_file,
                       vocab,
                       field_in_xml=field_in_xml,
                       relevance=relevance)
Example #2
import os

# `args` is the same argparse namespace; `Vocab`, `get_query_doc_ids`,
# `load_from_query_file`, `load_from_html_cascade`, and `filter_samples` are
# module-level helpers from the source repository.
def preprocess():
    binary = args.binary_html
    data_dir = args.data_dir
    max_vocab_size = args.max_vocab_size
    docs_dir = os.path.join(data_dir, 'docs')
    query_filepath = os.path.join(data_dir, 'query')
    train_filepath = os.path.join(data_dir, 'train.pointwise')
    test_filepath = os.path.join(data_dir, 'test.pointwise')
    vocab = Vocab(max_size=max_vocab_size)
    train_query_ids, train_doc_ids = get_query_doc_ids(train_filepath)
    test_query_ids, test_doc_ids = get_query_doc_ids(test_filepath)
    query_ids = train_query_ids | test_query_ids
    doc_ids = train_doc_ids | test_doc_ids
    print('total queries: {}, total docs: {}'.format(len(query_ids),
                                                     len(doc_ids)))
    query_dict = load_from_query_file(query_filepath)
    doc_dict = {}
    for qid in sorted(train_query_ids):
        for term in query_dict[qid].split():
            vocab.add(term)
    # Progress is reported in units of 10,000 docs ('w' abbreviates 万).
    for count, docid in enumerate(sorted(train_doc_ids), 1):
        if count % 10000 == 0:
            print('processed {}w docs'.format(count // 10000))
        doc_body = load_from_html_cascade(
            os.path.join(docs_dir, docid + '.html'), binary=binary)['body']
        doc_dict[docid] = doc_body
        for term in doc_body:
            vocab.add(term)
    vocab.build()
    vocab.save_to_file(os.path.join(data_dir, 'vocab'))
    empty_qid, empty_docid = set(), set()
    with open(os.path.join(data_dir, 'query.prep'), 'w') as fp:
        for qid in sorted(query_ids):
            qt = query_dict[qid].split()
            if len(qt) == 0:
                empty_qid.add(qid)
                continue
            fp.write('{}\t{}\n'.format(
                qid, ' '.join(map(str, vocab.encode(qt)))))
    with open(os.path.join(data_dir, 'docs.prep'), 'w') as fp:
        for docid in sorted(doc_ids):
            if docid in doc_dict:
                doc_body = doc_dict[docid]
            else:
                # Docs seen only in the test split were not cached above;
                # load them lazily here.
                doc_body = load_from_html_cascade(
                    os.path.join(docs_dir, docid + '.html'),
                    binary=binary)['body']
            if len(doc_body) == 0:
                empty_docid.add(docid)
                continue
            fp.write('{}\t{}\n'.format(
                docid, ' '.join(map(str, vocab.encode(doc_body)))))
    print('found {} empty queries and {} empty docs'.format(
        len(empty_qid), len(empty_docid)))
    # Insert '.prep' before the extension:
    # 'train.pointwise' -> 'train.prep.pointwise'.
    filter_samples(train_filepath,
                   '{}.prep.{}'.format(*train_filepath.rsplit('.', 1)),
                   empty_qid, empty_docid)
    filter_samples(test_filepath,
                   '{}.prep.{}'.format(*test_filepath.rsplit('.', 1)),
                   empty_qid, empty_docid)