def filter_query():
    data_dir = args.data_dir
    min_query_freq = args.min_query_freq
    query2freq = load_from_query_file(os.path.join(data_dir, 'query_freq'))
    qid2query = load_from_query_file(os.path.join(data_dir, 'query_all'))
    save_query_file([(k, v) for k, v in qid2query.items() if int(query2freq[v]) >= min_query_freq],
                    os.path.join(data_dir, 'query'))
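# The helpers used above (load_from_query_file, save_query_file) are defined elsewhere in the
# repo. Judging only from how they are called here (a dict loaded from a file, and (key, value)
# pairs written back), they are assumed to handle tab-separated two-column files. The sketches
# below are illustrative assumptions, not the repo's actual implementations.
def _load_from_query_file_sketch(filepath):
    # assumed line format: <key>\t<value>
    result = {}
    with open(filepath, 'r') as fp:
        for line in fp:
            key, value = line.rstrip('\n').split('\t', 1)
            result[key] = value
    return result

def _save_query_file_sketch(pairs, filepath):
    # pairs: iterable of (key, value) tuples
    with open(filepath, 'w') as fp:
        for key, value in pairs:
            fp.write('{}\t{}\n'.format(key, value))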
def shuqi_bing_redirect():
    MARK = b'\t-----\t'
    data_dir = args.data_dir
    shuqi_bing_web_dir = args.shuqi_bing_web_dir
    docid_to_url = load_from_query_file(os.path.join(data_dir, 'docid_to_url'))
    print('#all url: {}'.format(len(docid_to_url)))
    url_to_docid = {v: k for k, v in docid_to_url.items()}
    count = 0
    wrong_url_count = 0
    with open(os.path.join(shuqi_bing_web_dir, 'allweb.txt'), 'r') as fp:
        for l in fp:
            l = l.strip()
            url, ind = l.split('\t')
            if url not in url_to_docid:
                wrong_url_count += 1
                continue
            old_path = os.path.join(shuqi_bing_web_dir, 'web{}.txt'.format(ind))
            if not os.path.exists(old_path):
                continue
            count += 1
            if count % 100000 == 0:
                print('count: {}w'.format(count // 10000))
            new_ind = url_to_docid[url]
            # copy the raw html (everything after the MARK separator) to docs/<docid>.html
            with open(os.path.join(data_dir, 'docs', new_ind + '.html'), 'wb') as nh:
                try:
                    with open(old_path, 'rb') as oh:
                        h = oh.read()
                except:
                    print('read error: {}'.format(old_path))
                    raise
                nh.write(h[h.find(MARK) + len(MARK):])
    print('#downloaded url: {}, #wrong url: {}'.format(count, wrong_url_count))
def generate_train_test():
    data_dir = args.data_dir
    query_filepath = os.path.join(data_dir, 'query')
    judge_filepath = os.path.join(data_dir, 'judgement')
    run_filepath = os.path.join(data_dir, 'run')
    # split train and test dataset based on queries rather than qid
    query_dict = load_from_query_file(query_filepath)
    unique_queries = np.unique(list(query_dict.values()))
    np.random.shuffle(unique_queries)
    train_size = int(len(unique_queries) * args.train_test_ratio)
    test_size = len(unique_queries) - train_size
    if train_size <= 0 or test_size <= 0:
        raise Exception('train test dataset size is incorrect')
    print('#unique queries: {}, train size: {}, test size: {}'.format(
        len(unique_queries), train_size, test_size))
    train_queries = set(unique_queries[:train_size])
    test_queries = set(unique_queries[train_size:])
    train_qids = set(q for q in query_dict if query_dict[q] in train_queries)
    test_qids = set(q for q in query_dict if query_dict[q] in test_queries)
    miss_docs = set()
    have_docs = set()
    train_samples = []
    test_samples = []
    qd_judge = load_judge_file(judge_filepath)
    for q in qd_judge:
        for d in qd_judge[q]:
            if qd_judge[q][d] is None:  # skip documents without judgement
                continue
            if not os.path.exists(os.path.join(data_dir, 'docs', d + '.html')):
                miss_docs.add(d)
                continue
            have_docs.add(d)
            if q in train_qids:
                train_samples.append((q, d, qd_judge[q][d]))
            elif q in test_qids and not os.path.exists(run_filepath):
                # only take judged test samples when there is no run file
                test_samples.append((q, d, qd_judge[q][d]))
    if os.path.exists(run_filepath):
        # when a run file exists, build the test set from its ranked documents instead
        run_result = load_run_file(run_filepath)
        for q, _, d, rank, score, _ in run_result:
            if qd_judge[q][d] is None:  # skip documents without judgement
                continue
            if not os.path.exists(os.path.join(data_dir, 'docs', d + '.html')):
                miss_docs.add(d)
                continue
            have_docs.add(d)
            if q in test_qids:
                test_samples.append((q, d, qd_judge[q][d]))
    print('have {} docs, miss {} docs'.format(len(have_docs), len(miss_docs)))
    save_train_test_file(train_samples, os.path.join(data_dir, 'train.pointwise'))
    save_train_test_file(test_samples, os.path.join(data_dir, 'test.pointwise'))
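# generate_train_test() above depends on load_judge_file and load_run_file from elsewhere in the
# repo. Based on how their results are consumed (nested qid -> docid -> label access where missing
# entries behave as None, and a 6-field unpacking of each run record), the inputs are assumed to
# be a "qid\tdocid\tlabel" judgement file and a TREC-style run file ("qid Q0 docid rank score tag").
# The sketches below only illustrate that assumption.
def _load_judge_file_sketch(filepath):
    qd_judge = defaultdict(lambda: defaultdict(lambda: None))
    with open(filepath, 'r') as fp:
        for line in fp:
            qid, docid, label = line.rstrip('\n').split('\t')
            qd_judge[qid][docid] = int(label)
    return qd_judge

def _load_run_file_sketch(filepath):
    # yields (qid, 'Q0', docid, rank, score, tag) tuples
    with open(filepath, 'r') as fp:
        for line in fp:
            qid, q0, docid, rank, score, tag = line.split()
            yield qid, q0, docid, int(rank), float(score), tag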
def filter_judgement():
    filtered_ext = ['.pdf', '.ppt', '.pptx', '.doc', '.docx', '.txt']
    filtered_ext = tuple(filtered_ext + [ext.upper() for ext in filtered_ext])
    allowed_ext = tuple(['html', 'htm', 'com', 'cn', 'asp', 'shtml', 'php'])
    data_dir = args.data_dir
    docid_to_url = load_from_query_file(os.path.join(data_dir, 'docid_to_url'))
    qd_judge = load_judge_file(os.path.join(data_dir, 'judgement_rel'))
    qd_judge_new = defaultdict(lambda: defaultdict(lambda: None))
    count = 0
    for q in qd_judge:
        for d in qd_judge[q]:
            if docid_to_url[d].endswith(filtered_ext):
                count += 1
                continue
            qd_judge_new[q][d] = qd_judge[q][d]
    print('#non-html url: {}'.format(count))
    save_judge_file(qd_judge_new, os.path.join(data_dir, 'judgement'))
def preprocess():
    binary = args.binary_html
    data_dir = args.data_dir
    max_vocab_size = args.max_vocab_size
    docs_dir = os.path.join(data_dir, 'docs')
    query_filepath = os.path.join(data_dir, 'query')
    train_filepath = os.path.join(data_dir, 'train.pointwise')
    test_filepath = os.path.join(data_dir, 'test.pointwise')
    vocab = Vocab(max_size=max_vocab_size)
    train_query_ids, train_doc_ids = get_query_doc_ids(train_filepath)
    test_query_ids, test_doc_ids = get_query_doc_ids(test_filepath)
    query_ids = train_query_ids | test_query_ids
    doc_ids = train_doc_ids | test_doc_ids
    print('total query: {}, total doc: {}'.format(len(query_ids), len(doc_ids)))
    query_dict = load_from_query_file(query_filepath)
    doc_dict = {}
    # build the vocabulary from training queries and training documents only
    for qid in sorted(train_query_ids):
        for term in query_dict[qid].split():
            vocab.add(term)
    count = 0
    for docid in sorted(train_doc_ids):
        count += 1
        if count % 10000 == 0:
            print('processed {}w docs'.format(count // 10000))
        doc_body = load_from_html_cascade(os.path.join(docs_dir, docid + '.html'), binary=binary)['body']
        doc_dict[docid] = doc_body
        for term in doc_body:
            vocab.add(term)
    vocab.build()
    vocab.save_to_file(os.path.join(data_dir, 'vocab'))
    empty_qid, empty_docid = set(), set()
    # encode all queries (train and test) into term ids
    with open(os.path.join(data_dir, 'query.prep'), 'w') as fp:
        for qid in sorted(query_ids):
            qt = query_dict[qid].split()
            if len(qt) == 0:
                empty_qid.add(qid)
                continue
            fp.write('{}\t{}\n'.format(qid, ' '.join(map(str, vocab.encode(qt)))))
    # encode all documents (train and test) into term ids
    with open(os.path.join(data_dir, 'docs.prep'), 'w') as fp:
        for docid in sorted(doc_ids):
            if docid in doc_dict:
                doc_body = doc_dict[docid]
            else:
                doc_body = load_from_html_cascade(os.path.join(docs_dir, docid + '.html'), binary=binary)['body']
            if len(doc_body) == 0:
                empty_docid.add(docid)
                continue
            fp.write('{}\t{}\n'.format(docid, ' '.join(map(str, vocab.encode(doc_body)))))
    print('have {} empty query, have {} empty doc'.format(len(empty_qid), len(empty_docid)))
    # drop samples that refer to empty queries or empty documents
    filter_samples(train_filepath, '{}.prep.{}'.format(*train_filepath.rsplit('.', 1)), empty_qid, empty_docid)
    filter_samples(test_filepath, '{}.prep.{}'.format(*test_filepath.rsplit('.', 1)), empty_qid, empty_docid)
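# All of the functions above read their configuration from a module-level `args` object. A minimal
# command-line entry point consistent with the attributes they access might look like the sketch
# below; the flag names, defaults, and the `--action` dispatch are assumptions for illustration,
# not necessarily how the repo actually wires things up.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='data preparation pipeline')
    parser.add_argument('--action', required=True,
                        choices=['filter_query', 'shuqi_bing_redirect', 'filter_judgement',
                                 'generate_train_test', 'preprocess'])
    parser.add_argument('--data_dir', type=str, required=True)
    parser.add_argument('--shuqi_bing_web_dir', type=str, default=None)
    parser.add_argument('--min_query_freq', type=int, default=1)
    parser.add_argument('--train_test_ratio', type=float, default=0.8)
    parser.add_argument('--max_vocab_size', type=int, default=50000)
    parser.add_argument('--binary_html', action='store_true')
    args = parser.parse_args()
    # dispatch to the selected pipeline step
    globals()[args.action]()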