def process_washington_post(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            obj = json.loads(line)
            obj['kicker'] = filter_kicker(obj)
            if obj['kicker'] is False:
                continue
            obj['body'] = extract_body([obj['contents']])
            # to lower case
            obj['title'] = str(obj['title']).lower()
            obj['body'] = str(obj['body']).lower()
            # stemming
            w_list = cfg.word_cut(obj['body'])
            for i in range(len(w_list)):
                if w_list[i].isalpha():
                    w_list[i] = stemmer.stem(w_list[i])
            obj['body'] = ' '.join(w_list)
            w_list = cfg.word_cut(obj['title'])
            for i in range(len(w_list)):
                if w_list[i].isalpha():
                    w_list[i] = stemmer.stem(w_list[i])
            obj['title'] = ' '.join(w_list)
            del obj['contents']
            obj['title_body'] = (str(obj['title']) + ' ' + str(obj['body'])).lower()
            obj['title_author_date'] = (str(obj['title']) + ' ' + str(obj['author'])
                                        + ' ' + str(obj['published_date'])).lower()
            doc = json.dumps(obj)
            # insert data
            res = es.index(index=INDEX_NAME, id=obj['id'], body=doc)
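# Illustrative sketch only: filter_kicker is not shown in this section and is
# assumed (from its use above) to return the article's kicker string, or False
# when the kicker is on a blacklist, so that blacklisted articles are skipped
# before indexing. The name filter_kicker_sketch and the blacklist contents
# below are hypothetical.
def filter_kicker_sketch(obj, blacklist=frozenset({'opinion', 'letters to the editor'})):
    for item in obj.get('contents', []):
        if isinstance(item, dict) and item.get('type') == 'kicker':
            kicker = str(item.get('content', '')).strip()
            return False if kicker.lower() in blacklist else kicker
    return ''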
def words_index_single(line, filter_kicker):
    obj = json.loads(line)
    doc_id = obj['id']
    contents = obj['contents']
    doc = ""
    for li in contents:
        if type(li).__name__ == 'dict':
            if 'type' in li and li['type'] == 'kicker':
                # skip filter kickers
                if li['content'] in filter_kicker.keys():
                    return ()
            if 'subtype' in li and li['subtype'] == 'paragraph':
                paragraph = li['content'].strip()
                # Replace <.*?> with ""
                paragraph = re.sub(r'<.*?>', '', paragraph)
                doc += ' ' + paragraph
    doc = doc.strip()
    w_list = cfg.word_cut(doc)
    w_list = set(w_list)
    res = []
    for w in w_list:
        ds = set()
        ds.add(doc_id)
        res.append((w, ds))
    return res
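# Illustrative driver (an assumption, not shown in the original code): merging
# the per-document (word, {doc_id}) pairs produced by words_index_single into
# one inverted index. merge_word_postings and washington_post_jsonl are
# hypothetical names; filter_kicker here is the kicker-blacklist dict the
# function expects.
def merge_word_postings(pairs):
    inverted = {}
    for word, doc_ids in pairs:
        inverted.setdefault(word, set()).update(doc_ids)
    return inverted

# postings = []
# with open(washington_post_jsonl, 'r', encoding='utf-8') as fin:
#     for line in fin:
#         postings.extend(words_index_single(line, filter_kicker))
# words_index = merge_word_postings(postings)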
def calc_score(line, words_df, query, avgdl, flag=False):
    k1 = 1.5
    b = 0.75
    obj = line
    if not flag:
        obj = json.loads(line)
    body = extract_body([obj['contents']])
    doc_id = obj['id']
    w_list = cfg.word_cut(body)
    # calc tf for the doc
    tf = {}
    for w in w_list:
        if w in tf:
            tf[w] += 1
        else:
            tf[w] = 1
    # calc bm25 for the doc
    score = 0.0
    for w in query:
        tfi = 0
        if w in tf:
            tfi = tf[w]
        dfi = 1e-7
        if w in words_df.value:
            dfi = words_df.value[w]
        dl = len(w_list)
        N = cfg.DOCUMENT_COUNT
        score += np.log(N / dfi) * ((k1 + 1) * tfi) / (k1 * ((1 - b) + b * dl / avgdl) + tfi)
    return (score, doc_id)
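# Sketch of the per-term BM25 weight that calc_score accumulates, factored out
# so the formula is explicit and easy to check on toy numbers. This helper is
# an illustration, not part of the original pipeline; words_df is assumed to be
# a Spark broadcast of {word: document frequency}, which is why calc_score
# reads words_df.value.
import numpy as np

def bm25_term_weight(tf, df, doc_len, avgdl, n_docs, k1=1.5, b=0.75):
    # idf = log(N / df); unseen terms fall back to df = 1e-7 as in calc_score
    idf = np.log(n_docs / max(df, 1e-7))
    # saturated term frequency, normalised by document length
    return idf * ((k1 + 1) * tf) / (k1 * ((1 - b) + b * doc_len / avgdl) + tf)

# e.g. tf=3, df=1000, doc_len=400, avgdl=550, n_docs=595037  ->  roughly 11.4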
def split_body(args=None):
    body, max_length = args
    max_length = int(max_length)
    w_list = cfg.word_cut(body)
    if len(w_list) <= max_length - 2:
        return body
    # keep the head and the tail of a long document
    head_len = int((max_length - 2) / 2)
    tail_len = int(max_length - 2 - head_len)
    return ' '.join(w_list[:head_len]) + ' ' + ' '.join(w_list[-tail_len:])
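# Usage sketch for split_body (toy values, assuming cfg.word_cut splits on
# whitespace): with max_length = 12, ten tokens are kept, presumably leaving
# room for the two special tokens BERT adds, five from the head and five from
# the tail of the document.
# toy_body = ' '.join('tok%d' % i for i in range(100))
# split_body([toy_body, 12])   # -> 'tok0 ... tok4 tok95 ... tok99'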
def process_wiki(filepath):
    # load case
    case_mp = {}
    with open(path_mp['DataPath'] + path_mp['entities'], 'r', encoding='utf-8') as f:
        li = []
        mp = {}
        topic_id = ''
        for line in f:
            topic_id_tmp = re.search(r'<num>.*?</num>', line)
            if topic_id_tmp is not None:
                if len(li) > 0:
                    case_mp[topic_id] = li
                    li = []
                topic_id = topic_id_tmp
                topic_id = topic_id.group(0)[5 + 9:-7]
            doc_id = re.search(r'<docid>.*?</docid>', line)
            if doc_id is not None:
                doc_id = doc_id.group(0)[7:-8]
                li.append(doc_id)
            entity_id = re.search(r'<id>.*?</id>', line)
            if entity_id is not None:
                entity_id = entity_id.group(0)[5:-6]
                mp['id'] = entity_id
            mention = re.search(r'<mention>.*?</mention>', line)
            if mention is not None:
                mention = mention.group(0)[9:-10]
                mp['mention'] = mention.lower()
            link = re.search(r'<link>.*?</link>', line)
            if link is not None:
                link = link.group(0)[6:-7]
                mp['link'] = link.lower()
                li.append(mp)
                mp = {}
        if len(li) != 0:
            case_mp[topic_id] = li
            li = []
    # find entity wiki page
    for topic_id in case_mp:
        for entity in case_mp[topic_id][1:]:
            dsl = {"size": 100, 'query': {'match': {'inlink': entity['link']}}}
            res = es.search(index=SEARCH_NAME, body=dsl)
            print(entity['id'], len(res['hits']['hits']))
            for ri in res['hits']['hits']:
                obj = ri['_source']
                obj['inlink'] = entity['link']
                # stemming
                w_list = cfg.word_cut(obj['body'])
                for i in range(len(w_list)):
                    if w_list[i].isalpha():
                        w_list[i] = stemmer.stem(w_list[i])
                obj['body'] = ' '.join(w_list)
                doc = json.dumps(obj)
                # insert data
                res = es.index(index=INDEX_NAME, body=doc)
def process(obj):
    obj['body'] = extract_body([obj['contents']])
    # to lower case
    obj['title'] = str(obj['title']).lower()
    obj['body'] = str(obj['body']).lower()
    # stemming
    w_list = cfg.word_cut(obj['body'])
    for i in range(len(w_list)):
        if w_list[i].isalpha():
            w_list[i] = stemmer.stem(w_list[i])
    obj['body'] = ' '.join(w_list)
    w_list = cfg.word_cut(obj['title'])
    for i in range(len(w_list)):
        if w_list[i].isalpha():
            w_list[i] = stemmer.stem(w_list[i])
    obj['title'] = ' '.join(w_list)
    del obj['contents']
    obj['title_body'] = (str(obj['title']) + ' ' + str(obj['body'])).lower()
    obj['title_author_date'] = (str(obj['title']) + ' ' + str(obj['author'])
                                + ' ' + str(obj['published_date'])).lower()
    return obj
def tfidf_index_single(line, filter_kicker, words_mp, num):
    obj = json.loads(line)
    doc_id = obj['id']
    contents = obj['contents']
    doc = ""
    for li in contents:
        if type(li).__name__ == 'dict':
            if 'type' in li and li['type'] == 'kicker':
                # skip filter kickers
                if li['content'] in filter_kicker.keys():
                    return ()
            if 'subtype' in li and li['subtype'] == 'paragraph':
                paragraph = li['content'].strip()
                # Replace <.*?> with ""
                paragraph = re.sub(r'<.*?>', '', paragraph)
                doc += ' ' + paragraph
    doc = doc.strip()
    w_list = cfg.word_cut(doc)
    num = int(num)
    # calculate term frequency for each word in the str
    tf = {}
    for w in w_list:
        if w in tf:
            tf[w] += 1
        else:
            tf[w] = 1
    # calculate idf and tf-idf for each word
    tfidf_val = {}
    for w in w_list:
        # word not in vocabulary
        if w not in words_mp:
            continue
        idf = np.log(cfg.DOCUMENT_COUNT * 1.0 / len(words_mp[w]))
        tfidf_val[w] = tf[w] * 1.0 * idf
    # sort by tf-idf, combine top inverted file line number list
    tfidf_val = sorted(tfidf_val.items(), key=lambda d: d[1], reverse=True)
    res = set()
    for i in range(min(num, len(tfidf_val))):
        w = tfidf_val[i][0]
        res = res | set(words_mp[w])
    return doc_id + ' ' + ' '.join(res)
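# Sketch (assumption, not from the original code) of how the words_mp argument
# of tfidf_index_single might be loaded. load_words_index is a hypothetical
# name; the 'word posting posting ...' line layout assumed here matches how
# gen_sample below reads words_index.txt.
def load_words_index(path):
    words_mp = {}
    with open(path, 'r', encoding='utf-8') as fin:
        for line in fin:
            parts = line.rstrip('\n').split(' ')
            if parts[0]:
                words_mp[parts[0]] = parts[1:]
    return words_mp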
def gen_sample(args=None):
    max_length = args[0]
    max_length = int(max_length)
    # read all the docs, load as json
    WashingtonPost = {}
    with open(path_mp['DataPath'] + path_mp['WashingtonPost'], 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            obj = json.loads(line)
            doc_id = obj['id']
            WashingtonPost[doc_id] = obj
    print('WashingtonPost dataset loaded.')
    # read topics idx
    topics_mp = {}
    with open(cfg.OUTPUT + 'topics_index.txt', 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            li = line[:-1].split(' ')
            topics_mp[li[0]] = set(li[1:])
    print('Topics idx loaded.')
    # read tfidf_mp
    tfidf_mp = {}
    with open(cfg.OUTPUT + 'tfidf_index.txt', 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            li = line[:-1].split(' ')
            tfidf_mp[li[0]] = li[1:]
    tfidf_list = list(tfidf_mp.keys())
    print('TFIDF idx loaded.')
    # read words_mp
    words_index = {}
    with open(cfg.OUTPUT + 'words_index.txt', 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            li = line[:-1].split(' ')
            words_index[li[0]] = set(li[1:])
    print('words idx loaded.')
    with open(cfg.OUTPUT + 'Dataset_BertCls.txt', 'w', encoding='utf-8') as out:
        for cur_id in tqdm(tfidf_list):
            obj = WashingtonPost[cur_id]
            contents = obj['contents']
            title = obj['title']
            author = obj['author']
            date = obj['published_date']
            body = ""
            topic_name = ""
            for li in contents:
                if type(li).__name__ == 'dict':
                    if 'type' in li and li['type'] == 'kicker' and topic_name == "":
                        topic_name = li['content'].strip()
                    if 'subtype' in li and li['subtype'] == 'paragraph':
                        paragraph = li['content'].strip()
                        # Replace <.*?> with ""
                        paragraph = re.sub(r'<.*?>', '', paragraph)
                        body += ' ' + paragraph
            # Recall by tf-idf
            body = body.strip()
            res_tfidf = set()
            for w in tfidf_mp[cur_id]:
                res_tfidf = res_tfidf | words_index[w]
            res_tfidf = list(res_tfidf)
            # Recall by topics
            res_topic = []
            if topic_name in topics_mp:
                res_topic = list(topics_mp[topic_name])
            # Combine recall results
            similar_doc = {}
            # Filter key for the query document itself (title#author#date)
            cur_key = ''
            if title is not None:
                cur_key += title
            if author is not None:
                cur_key += '#' + author
            if date is not None:
                cur_key += '#' + str(date)
            similar_doc[cur_key] = 1
            res_mask = {}
            res_mask[0] = set()
            res_mask[1] = set()
            res_mask[2] = set()
            res_mask[3] = set()
            res_tfidf_mp = {}  # marks tf-idf recalls; helps decide which docs get the top grade (mask 3)
            for li in res_tfidf:
                # Filter by kicker
                if li in tfidf_mp and filter_doc(WashingtonPost[li], date, similar_doc):
                    res_mask[2].add(li)
                    res_tfidf_mp[li] = 1
            for li in res_topic:
                # Filter by kicker
                if li in tfidf_mp and filter_doc(WashingtonPost[li], date, similar_doc):
                    if li in res_tfidf_mp:
                        res_mask[3].add(li)
                    else:
                        res_mask[1].add(li)
            # randomly sample 100 candidate label-0 documents
            zero = np.random.randint(0, len(tfidf_mp), size=[100])
            for li in zero:
                doc_id = tfidf_list[li]
                if filter_doc(WashingtonPost[doc_id], date, similar_doc):
                    res_mask[0].add(doc_id)
            # split from body
            sen1 = split_body([body, max_length])
            # Sampling and generating examples: one per mask 0-3 (relevance grades 0/2/4/8)
            for label in res_mask.keys():
                res_mask[label] = list(res_mask[label])
                if len(res_mask[label]) <= 0:
                    continue
                idx = random.randint(0, len(res_mask[label]) - 1)
                doc_id = res_mask[label][idx]
                doc_body = extract_body([WashingtonPost[doc_id]['contents']])
                sen2 = split_body([doc_body, max_length])
                out.write(str(label) + '\t' + sen1 + '\t' + sen2 + '\n')
            # class label 4 (grade 16): the middle of the body itself
            w_list = cfg.word_cut(body)
            st = (len(w_list) - max_length + 2) // 2
            ed = st + max_length - 2
            sen2 = ' '.join(w_list[st:ed])
            out.write(str(4) + '\t' + sen1 + '\t' + sen2 + '\n')
def test_entity_ranking():
    # stop words
    stop_words = {}
    with open('../elastic/stopwords.txt', 'r', encoding='utf-8') as f:
        for w in f:
            w = w[:-1]
            stop_words[w] = 1
    print('stop words loaded.')
    # test case: topic_id, list:[docid, entity_id]
    case_mp = {}
    with open(path_mp['DataPath'] + path_mp['entities19'], 'r', encoding='utf-8') as f:
        li = []
        mp = {}
        topic_id = ''
        for line in f:
            topic_id_tmp = re.search(r'<num>.*?</num>', line)
            if topic_id_tmp is not None:
                if len(li) > 0:
                    case_mp[topic_id] = li
                    li = []
                topic_id = topic_id_tmp
                topic_id = topic_id.group(0)[5 + 9:-7]
            doc_id = re.search(r'<docid>.*?</docid>', line)
            if doc_id is not None:
                doc_id = doc_id.group(0)[7:-8]
                li.append(doc_id)
            entity_id = re.search(r'<id>.*?</id>', line)
            if entity_id is not None:
                entity_id = entity_id.group(0)[5:-6]
                mp['id'] = entity_id
            mention = re.search(r'<mention>.*?</mention>', line)
            if mention is not None:
                mention = mention.group(0)[9:-10]
                mp['mention'] = mention.lower()
            link = re.search(r'<link>.*?</link>', line)
            if link is not None:
                link = link.group(0)[6:-7]
                mp['link'] = link.lower()
                li.append(mp)
                mp = {}
        if len(li) != 0:
            case_mp[topic_id] = li
            li = []
    print('test case loaded.')
    with open('eresult_7191.test', 'w', encoding='utf-8') as f:
        # with open('/home/trec7/lianxiaoying/trec_eval.9.0/test/eresult.test', 'w', encoding='utf-8') as f:
        for topic_id in case_mp.keys():
            li = case_mp[topic_id]
            doc_id = li[0]
            out_doc_id = {'97b489e2-0a38-11e5-9e39-0db921c47b93': 1}
            doc = ''
            if doc_id not in out_doc_id:
                dsl = {'query': {'match': {'id': doc_id}}}
                res = es.search(index=INDEX_NAME, body=dsl)
                # print(res)
                doc = res['hits']['hits'][0]['_source']
            else:
                with open(doc_id + '.txt', 'r', encoding='utf-8') as rin:
                    for line in rin:
                        doc = json.loads(line)
                        doc = process(doc)
            tmp1 = cfg.word_cut(doc['title_body'])
            tmp = []
            for w in tmp1:
                if w not in stop_words:
                    tmp.append(w)
            qr = ' '.join(tmp)
            # qr = doc['title_body']
            dsl = {
                "size": 1000,
                "timeout": "1m",
                "query": {
                    'bool': {
                        'must': {
                            'match': {
                                'body': {
                                    'query': qr,
                                    'boost': 1
                                }
                            }
                        }
                    }
                }
            }
            res = es.search(index=WIKI_INDEX, body=dsl, request_timeout=30)
            res = res['hits']['hits']
            inlink_to_rank = {}
            rank = 1
            for ri in res:
                inlink = ri['_source']['inlink']
                if inlink not in inlink_to_rank:
                    inlink_to_rank[inlink] = rank
                    rank += 1
            cnt = 1
            for entity in li[1:]:
                print(entity['id'])
                out = []
                out.append(topic_id)
                out.append('Q0')
                out.append(entity['id'])
                out.append(str(cnt))
                sc = 0
                if entity['link'] in inlink_to_rank:
                    sc = 1000 - inlink_to_rank[entity['link']]
                out.append(str(sc))
                out.append('ICTNET_estem')
                ans = "\t".join(out) + "\n"
                f.write(ans)
                cnt += 1
MAX_IDF = 0.6
stop_words = {}
with open('../elastic/stopwords.txt', 'r', encoding='utf-8') as f:
    for w in f:
        w = w[:-1]
        stop_words[w] = 1
print('stop words loaded.')
idf = {}
N = 0
with open('/home/trec7/lianxiaoying/data/vector_corpus.txt', 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        w_list = cfg.word_cut(line[:-1])
        tf = {}
        for w in w_list:
            w = w.strip()
            if w not in stop_words and len(w) > 2:
                if w in tf:
                    tf[w] += 1
                else:
                    tf[w] = 1
        for w in tf.keys():
            # appear in one doc more than MIN_FREQ
            if tf[w] >= MIN_FREQ:
                if w not in idf:
                    idf[w] = 1
                else:
                    idf[w] += 1
def calc_doc_length(line):
    obj = json.loads(line)
    body = extract_body([obj['contents']])
    w_list = cfg.word_cut(body)
    return len(w_list)
def gen_res(args=None):
    SparkContext.getOrCreate().stop()
    conf = SparkConf().setMaster("local[*]").setAppName("bm25") \
        .set("spark.executor.memory", "10g") \
        .set("spark.driver.maxResultSize", "10g") \
        .set("spark.cores.max", 10) \
        .set("spark.executor.cores", 10) \
        .set("spark.default.parallelism", 20)
    sc = SparkContext(conf=conf)
    # stop words
    stop_words = {}
    with open('../elastic/stopwords.txt', 'r', encoding='utf-8') as f:
        for w in f:
            w = w[:-1]
            stop_words[w] = 1
    print('stop words loaded.')
    # words df
    words_df = sc.textFile(cfg.OUTPUT + 'words_index.txt') \
        .filter(lambda line: line != '') \
        .map(lambda line: (str(line.split(' ')[0]).lower(), len(line.split(' ')[1:]))) \
        .collectAsMap()
    words_df = sc.broadcast(words_df)
    print('words_df loaded.')
    # avgdl
    avgdl = sc.textFile(path_mp['DataPath'] + path_mp['WashingtonPost']) \
        .map(lambda line: calc_doc_length(line)).sum()
    avgdl = avgdl * 1.0 / 595037
    print('avgdl loaded.')
    # WashingtonPost
    WashingtonPost = sc.textFile(path_mp['DataPath'] + path_mp['WashingtonPost']) \
        .map(lambda line: return_doc(line)).collectAsMap()
    print('WashingtonPost loaded.')
    # test case
    case_mp = {}
    with open(path_mp['DataPath'] + path_mp['topics'], 'r', encoding='utf-8') as f:
        li = []
        for line in f:
            topic_id = re.search(r'<num>.*?</num>', line)
            if topic_id is not None:
                topic_id = topic_id.group(0)[5 + 9:-7]
                li.append(topic_id)
            doc_id = re.search(r'<docid>.*?</docid>', line)
            if doc_id is not None:
                doc_id = doc_id.group(0)[7:-8]
                li.append(doc_id)
            if len(li) == 2:
                case_mp[li[1]] = li[0]
                li = []
    print('test case loaded.')
    # filter and generate result
    with open('/home/trec7/lianxiaoying/trec_eval.9.0/test/bresult.test', 'w', encoding='utf-8') as f:
        for cur_id in case_mp.keys():
            topic_id = case_mp[cur_id]
            print('now processing:', topic_id)
            obj = WashingtonPost[cur_id]
            body = extract_body([obj['contents']])
            # query: drop stop words, truncate overly long queries to head + tail
            tmp1 = cfg.word_cut(str(obj['title'] + ' ' + body).lower())
            tmp = []
            for w in tmp1:
                if w not in stop_words:
                    tmp.append(w)
            query = tmp
            if len(tmp) > 768:
                query = tmp[:512] + tmp[-256:]
            res = bm25(sc, query, words_df, avgdl)
            # filter
            title = obj['title']
            author = obj['author']
            date = obj['published_date']
            similar_doc = {}
            cur_key = ''
            if title is not None:
                cur_key += title
            if author is not None:
                cur_key += '#' + author
            if date is not None:
                cur_key += '#' + str(date)
            similar_doc[cur_key] = 1
            for score, doc_id in res:
                doc = WashingtonPost[doc_id]
                if filter_doc(doc, date, similar_doc):
                    out = []
                    out.append(topic_id)
                    out.append('Q0')
                    out.append(doc_id)
                    out.append(str(0))
                    out.append(str(score))
                    out.append('ICTNET')
                    f.write("\t".join(out) + "\n")
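# Note: the six tab-separated columns written above (and in test_entity_ranking),
#   topic_id  Q0  doc_or_entity_id  rank  score  run_tag
# follow the standard TREC run-file format that trec_eval expects as input.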