def xml_train_test_prep(field='body', relevance='TACM'):
    '''
    Build a vocabulary from the segmented train XML and convert the train/test
    XML files into preprocessed pointwise files.
    '''
    train_file = os.path.join(args.data_dir, 'qd.xml.seg.train')
    test_file = os.path.join(args.data_dir, 'qd.xml.seg.test')
    max_vocab_size = args.max_vocab_size
    train_word_file = os.path.join(args.data_dir, 'train.pointwise')
    test_word_file = os.path.join(args.data_dir, 'test.pointwise')
    train_prep_file = os.path.join(args.data_dir, 'train.prep.pointwise')
    test_prep_file = os.path.join(args.data_dir, 'test.prep.pointwise')
    vocab_file = os.path.join(args.data_dir, 'vocab')
    field_in_xml = xml_field_maping(field)
    print('build vocab ...')
    vocab = Vocab(max_size=max_vocab_size)
    for i, qd in enumerate(qd_xml_iterator(train_file)):
        # Alternative access if qd were an ElementTree element instead of a dict:
        # query = qd.find('./query').text
        # words = query.split(' ')
        # for doc in qd.findall('./doc/{}'.format(field_in_xml)):
        #     words.extend(doc.text.split(' '))
        if i % 10000 == 0:
            print('{}w'.format(i // 10000))  # progress in units of 10k query-doc groups
        query = qd['query']
        words = query.split(' ')
        for doc in qd['doc']:
            words.extend(doc[field_in_xml].split(' '))
        for w in words:
            vocab.add(w)
    vocab.build()
    vocab.save_to_file(vocab_file)
    # word_file is currently unused: qd_xml_to_prep reads directly from the XML.
    for from_file, word_file, prep_file in \
        [(train_file, train_word_file, train_prep_file),
         (test_file, test_word_file, test_prep_file)]:
        qd_xml_to_prep(from_file, prep_file, vocab, field_in_xml=field_in_xml,
                       relevance=relevance)
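
# Usage sketch (assumptions, not confirmed by this file): qd_xml_iterator is assumed
# to yield dict-like records, roughly
#   {'query': 'w1 w2 ...', 'doc': [{field_in_xml: '...'}, ...]},
# which is what the dict accesses above rely on, and xml_field_maping is assumed to
# map a field name such as 'body' to the matching key/tag in the XML. A typical call:
#   xml_train_test_prep(field='body', relevance='TACM')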
def preprocess():
    '''
    Build a vocabulary from train queries and docs, encode all queries and docs
    into id files (query.prep, docs.prep), and filter out samples whose query or
    doc is empty.
    '''
    binary = args.binary_html
    data_dir = args.data_dir
    max_vocab_size = args.max_vocab_size
    docs_dir = os.path.join(data_dir, 'docs')
    query_filepath = os.path.join(data_dir, 'query')
    train_filepath = os.path.join(data_dir, 'train.pointwise')
    test_filepath = os.path.join(data_dir, 'test.pointwise')
    vocab = Vocab(max_size=max_vocab_size)
    train_query_ids, train_doc_ids = get_query_doc_ids(train_filepath)
    test_query_ids, test_doc_ids = get_query_doc_ids(test_filepath)
    query_ids = train_query_ids | test_query_ids
    doc_ids = train_doc_ids | test_doc_ids
    print('total query: {}, total doc: {}'.format(len(query_ids), len(doc_ids)))
    query_dict = load_from_query_file(query_filepath)
    doc_dict = {}
    # The vocabulary is built from the train split only.
    for qid in sorted(train_query_ids):
        for term in query_dict[qid].split():
            vocab.add(term)
    count = 0
    for docid in sorted(train_doc_ids):
        count += 1
        if count % 10000 == 0:
            print('processed {}w docs'.format(count // 10000))  # progress in units of 10k docs
        doc_body = load_from_html_cascade(
            os.path.join(docs_dir, docid + '.html'), binary=binary)['body']
        doc_dict[docid] = doc_body
        for term in doc_body:
            vocab.add(term)
    vocab.build()
    vocab.save_to_file(os.path.join(data_dir, 'vocab'))
    empty_qid, empty_docid = set(), set()
    # Encode every query (train and test) into term ids; record empty queries.
    with open(os.path.join(data_dir, 'query.prep'), 'w') as fp:
        for qid in sorted(query_ids):
            qt = query_dict[qid].split()
            if len(qt) == 0:
                empty_qid.add(qid)
                continue
            fp.write('{}\t{}\n'.format(qid, ' '.join(map(str, vocab.encode(qt)))))
    # Encode every doc into term ids, loading test docs on demand; record empty docs.
    with open(os.path.join(data_dir, 'docs.prep'), 'w') as fp:
        for docid in sorted(doc_ids):
            if docid in doc_dict:
                doc_body = doc_dict[docid]
            else:
                doc_body = load_from_html_cascade(
                    os.path.join(docs_dir, docid + '.html'), binary=binary)['body']
            if len(doc_body) == 0:
                empty_docid.add(docid)
                continue
            fp.write('{}\t{}\n'.format(docid, ' '.join(map(str, vocab.encode(doc_body)))))
    print('{} empty queries, {} empty docs'.format(len(empty_qid), len(empty_docid)))
    # Drop train/test samples that reference an empty query or doc.
    filter_samples(train_filepath, '{}.prep.{}'.format(*train_filepath.rsplit('.', 1)),
                   empty_qid, empty_docid)
    filter_samples(test_filepath, '{}.prep.{}'.format(*test_filepath.rsplit('.', 1)),
                   empty_qid, empty_docid)
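
# Minimal driver sketch. This is an assumption: the real entry point presumably builds
# the module-level `args` via argparse elsewhere in the repo. The flag names below
# mirror the attributes read above (data_dir, max_vocab_size, binary_html); the
# defaults are illustrative only.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='preprocess queries/docs into id files')
    parser.add_argument('--data_dir', type=str, required=True)
    parser.add_argument('--max_vocab_size', type=int, default=100000)
    parser.add_argument('--binary_html', action='store_true')
    args = parser.parse_args()
    preprocess()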