def normalized_entity_distance(entity, context):
    """
    Computes the pointwise mutual information (PMI) between an entity and a
    context word, based on their sentence co-occurrence counts in Elasticsearch.
    :param entity: candidate entity name
    :param context: context word to measure association with
    :return: PMI score (0 if either term never occurs)
    """
    cn = context
    entity = entity.lower()

    # Total number of sentences in the index
    query = {}
    res = es.search(index="twosent_tud", doc_type="twosentnorules", body=query)
    NN = res['hits']['total']

    # Sentences containing the entity
    query = {"query": {"match": {"content.chapter.sentpositive": {"query": entity, "operator": "and"}}}}
    res = es.search(index="twosent_tud", doc_type="twosentnorules", body=query)
    total_a = res['hits']['total']

    # Sentences containing the context word
    query = {"query": {"match": {"content.chapter.sentpositive": {"query": cn, "operator": "and"}}}}
    res = es.search(index="twosent_tud", doc_type="twosentnorules", body=query)
    total_b = res['hits']['total']

    # Sentences containing both the entity and the context word
    query_text = entity + ' ' + cn
    query = {"query": {"match": {"content.chapter.sentpositive": {"query": query_text, "operator": "and"}}}}
    res = es.search(index="twosent_tud", doc_type="twosentnorules", body=query)
    total_ab = res['hits']['total']

    pmi = 0
    if total_a and total_b and total_ab:
        total_ab = total_ab / NN
        total_a = total_a / NN
        total_b = total_b / NN
        pmi = math.log(total_ab / (total_a * total_b), 2)
    return pmi
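# Hedged example (not from the original source): the PMI score computed by
# normalized_entity_distance, reproduced on made-up corpus counts standing in for
# the Elasticsearch hit totals queried above.
import math

def _pmi_example():
    NN = 100000     # hypothetical total number of sentences in the index
    total_a = 120   # hypothetical hits for the entity alone
    total_b = 4500  # hypothetical hits for the context word alone
    total_ab = 60   # hypothetical hits for entity and context word together
    # PMI = log2( P(entity, context) / (P(entity) * P(context)) )
    return math.log((total_ab / NN) / ((total_a / NN) * (total_b / NN)), 2)  # ~3.47 for these counts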
def query(unit_name):
    response = es.search(
        index='units',
        body={"query": {
            "term": {
                "unit_name.keyword": unit_name
            }
        }})
    return response['hits']['total']['value'] > 0
def search_content_exact(keyword, page, limit):
    """
    Exact search: term query against the raw (keyword) field.
    :return: raw Elasticsearch response
    """
    body = {
        "query": {"term": {"query.keyword": keyword}},
        "from": (int(page) - 1) * int(limit),  # pagination
        "size": int(limit),
    }
    ret_content = es.search(index="hot_words", doc_type="doc", body=body)
    return ret_content
def extract_similar_sentences(es_id):
    """
    Function for finding similar sentences given the code of a sentence
    (everything is stored in elasticsearch)
    """
    query = {"query": {"match": {"_id": {"query": es_id, "operator": "and"}}}}
    similar_sentence = ''
    res = es.search(index="devtwosentnew", body=query, size=5)
    if res['hits']['hits']:
        for doc in res['hits']['hits']:
            similar_sentence = doc['_source']['content.chapter.sentpositive']
    return similar_sentence
def callback(unit_name):
    response = es.search(
        index='units',
        body={"query": {
            "term": {
                "unit_name.keyword": unit_name
            }
        }})
    if response['hits']['total']['value'] == 0:
        return
    data = response['hits']['hits'][0]['_source']
    area = f"{data['province']}/{data['city']}/{data['district']}"
    # result is a module-level dict collecting lookups keyed by unit name
    result[unit_name] = [area, data['unit_type'], data['industry']]
def get(self):
    parser = reqparse.RequestParser()
    parser.add_argument('limit', type=int, help='limit must be an integer', default=100)
    parser.add_argument('q', type=str, help='please provide a valid query')
    args = parser.parse_args()
    s = es.search(index="whatweb", q=args.q, size=args.limit)
    if s['hits']['hits']:
        hits = []
        for hit in s['hits']['hits']:
            hits.append(hit['_source'])
        return {'count': len(hits), 'data': hits}
    else:
        return []
def get(self):
    parser = reqparse.RequestParser()
    parser.add_argument('limit', type=int, help='limit must be an integer', default=1)
    parser.add_argument('domain', type=str, help='please provide a valid domain name')
    args = parser.parse_args()
    s = es.search(index="subdomains", q='domain:{}'.format(args.domain), size=args.limit)
    if s['hits']['hits']:
        hits = []
        for hit in s['hits']['hits']:
            hits.append(hit['_source'])
        return {'count': len(hits), 'data': hits}
    else:
        return []
def elasticsearch_like(like_string):
    request_data = {
        'query': {
            "more_like_this": {
                "fields": ['title', 'company_name'],
                'like_text': like_string,
                "min_doc_freq": 1,
                "min_term_freq": 1
            }
        }
    }
    response = es.search(index='vacancies', body=request_data)
    ids = [int(x['_id']) for x in response['hits']['hits']]
    print(ids)
    return ids
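# Hedged usage sketch (assumption, not in the original source): fetch the IDs of vacancies
# whose title or company name resembles a free-text description; the query string is illustrative.
similar_vacancy_ids = elasticsearch_like('senior backend engineer fintech')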
def query_by_keyword():
    query_body = {
        "query": {
            "match": {
                "title": "中国"  # news titles containing the keyword "中国" (China)
            }
        },
        "size": 10000
    }
    tb = time.time()
    results = es.search(index=ES_INDEX_NAME, doc_type=ES_INDEX_DOC_TYPE, body=query_body)["hits"]["hits"]
    ff = open("search_results.txt", "w", encoding="UTF-8")
    for item in results:
        data = item["_source"]
        ff.write("%s|.|%s|.|%s|.|%s\n" % (data["url"], data["date"], data["title"], data["content"]))
    ff.close()
    tb1 = time.time()
    print("ES search data time (seconds): ", tb1 - tb)
def find(self, size=100):
    return es.search(index="id",
                     doc_type=self.class_name,
                     size=size,
                     sort={'created_at': 'desc'})['hits']['hits']
def sentence_extraction(model_name: str, training_cycle: int, list_of_seeds: list) -> None:
    """
    Extracts from the corpus all sentences that include at least one of the given seeds
    (in list_of_seeds). In addition, it excludes sentences that contain any of the entities
    from a test set, when provided.
    :param model_name: name of the model being trained (used as a file prefix)
    :type model_name: str
    :param training_cycle: index of the current bootstrapping cycle
    :type training_cycle: int
    :param list_of_seeds: list of seed entity names
    :type list_of_seeds: list
    :returns: Creates and saves files for seeds and sentences
    :rtype: None
    """
    print('Started initial training data extraction')

    testing = False
    test_entities = []
    if testing:
        # We get the entity names which have been used in the testing set to exclude them
        # from the training sentences
        path = ROOTPATH + '/data/demo-test.txt'
        with open(path, 'r') as file:
            for row in file.readlines():
                test_entities.append(row.strip())
        test_entities = [e.lower() for e in test_entities]
        test_entities = list(set(test_entities))

    # List of seed names (copied so the caller's list is not mutated)
    seed_entities = list(list_of_seeds)
    if training_cycle != 0:
        # In later cycles, extend the seeds with the entities filtered in the previous cycle
        path = ROOTPATH + '/processing_files/' + model_name + '_filtered_entities_majority_' + str(training_cycle - 1) + '.txt'
        with open(path, 'r') as file:
            for row in file.readlines():
                seed_entities.append(row.strip())

    seed_entities = [e.lower() for e in seed_entities]
    seed_entities = list(set(seed_entities))
    print('Extracting sentences for', len(seed_entities), 'seed terms')

    paragraph = []
    # Using the seeds, extract the sentences from the publications text in the Elasticsearch index
    for entity in seed_entities:
        entity_name = re.sub(r'\([^)]*\)', '', entity)
        print('.', end='')
        query = {"query": {"match": {"content.chapter.sentpositive": {
            "query": "\"" + entity_name + "\"",  # alex: use quotation marks to only query full matches
            "operator": "and"
        }}}}
        res = es.search(index="twosent", body=query, size=1000)

        # Clean up the sentences and, if they don't contain the names of the test set,
        # add them to the training data
        for doc in res['hits']['hits']:
            sentence = doc["_source"]["content.chapter.sentpositive"]
            words = nltk.word_tokenize(sentence)
            lengths = [len(x) for x in words]
            average = sum(lengths) / len(lengths)
            if average < 3:
                continue
            sentence = sentence.replace("@ BULLET", "")
            sentence = sentence.replace("@BULLET", "")
            sentence = sentence.replace(", ", " , ")
            sentence = sentence.replace('(', '')
            sentence = sentence.replace(')', '')
            sentence = sentence.replace('[', '')
            sentence = sentence.replace(']', '')
            sentence = sentence.replace(',', ' ,')
            sentence = sentence.replace('?', ' ?')
            sentence = sentence.replace('..', '.')
            if any(e in words for e in test_entities):
                continue
            else:
                paragraph.append(sentence)

    paragraph = list(set(paragraph))

    # Store sentences and seeds
    path = ROOTPATH + '/processing_files/' + model_name + '_sentences_' + str(training_cycle) + '.txt'
    f = open(path, 'w', encoding='utf-8')
    for item in paragraph:
        f.write('%s\n' % item)
    f.close()

    path = ROOTPATH + '/processing_files/' + model_name + '_seeds_' + str(training_cycle) + '.txt'
    f = open(path, 'w', encoding='utf-8')  # We could use mongodb instead
    for item in seed_entities:
        f.write('%s\n' % item)
    f.close()

    print('Process finished with', len(seed_entities), 'seeds and', len(paragraph),
          'sentences added for training in cycle number', str(training_cycle))
    sys.stdout.flush()
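# Hedged usage sketch (assumption, not in the original source): bootstrapping the first
# training cycle with a handful of seed names. The model name and seed strings are illustrative only.
example_seeds = ['clueweb09', 'imagenet', 'trec-qa']
sentence_extraction(model_name='dataset', training_cycle=0, list_of_seeds=example_seeds)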
publications = ['arxiv']
# "WWW", "ICSE", "VLDB", "JCDL", "TREC", "SIGIR", "ICWSM", "ECDL", "ESWC", "TPDL",
# "PLoS Biology", "Breast Cancer Research", "BMC Evolutionary Biology", "BMC Genomics",
# "BMC Biotechnology",
# "BMC Neuroscience", "Genome Biology", "PLoS Genetics", "Breast Cancer Research : BCR",
# "Genome Biology and Evolution", "Breast Cancer Research and Treatment"

model_name = 'DATA'

for publication in publications:
    res = es.search(
        index="smartpub",
        body={"query": {
            "match": {
                "journal": {
                    "query": publication
                }
            }
        }},
        size=20)
    total_docs = res['hits']['total']
    _query = {"query": {"match": {"journal": {"query": publication}}}}
    # res = es.search(index="ir_full", doc_type="publications",
    #                 body=query, size=10000)
    # print(len(res['hits']['hits']))
    # for doc in res['hits']['hits']:
def normalized_pub_distance(extracted_entities, context):
    """
    :param extracted_entities: candidate entity names
    :param context: context words to measure association with
    :return filtered_entities: entities kept by the PMI filter, plus the last PMI score
    """
    filtered_entities = []
    context_words = context

    # context words for dataset
    # context_words = ['dataset', 'corpus', 'collection', 'repository', 'benchmark', 'website']
    # context words for method
    # context_words = ['method', 'model', 'algorithm', 'approach', 'technique']
    # context words for proteins
    # context_words = ['protein', 'receptor']

    extracted_entities = [x.lower() for x in extracted_entities]
    extracted_entities = list(set(extracted_entities))
    pmi = 0

    for cn in context_words:
        for entity in extracted_entities:
            # Entities that literally contain a context word are kept directly
            if any(x in entity.lower() for x in context_words):
                filtered_entities.append(entity)

            # Total number of sentences in the index
            query = {}
            res = es.search(index="twosent", body=query)
            NN = res['hits']['total']['value']

            # Sentences containing the entity (quoted for a full-phrase match)
            query = {"query": {"match": {
                "content.chapter.sentpositive": {
                    "query": "\"" + entity + "\"",
                    "operator": "and"
                }
            }}}
            res = es.search(index="twosent", body=query)
            total_a = res['hits']['total']['value']

            # Sentences containing the context word
            query = {"query": {"match": {
                "content.chapter.sentpositive": {
                    "query": cn,
                    "operator": "and"
                }
            }}}
            res = es.search(index="twosent", body=query)
            total_b = res['hits']['total']['value']

            # Sentences containing both the entity and the context word
            query_text = "\"" + entity + "\"" + ' ' + cn
            query = {"query": {"match": {
                "content.chapter.sentpositive": {
                    "query": query_text,
                    "operator": "and"
                }
            }}}
            res = es.search(index="twosent", body=query)
            total_ab = res['hits']['total']['value']

            pmi = 0
            if total_a and total_b and total_ab:
                total_ab = total_ab / NN
                total_a = total_a / NN
                total_b = total_b / NN
                pmi = math.log(total_ab / (total_a * total_b), 2)
            if pmi >= 2:
                filtered_entities.append(entity)

    return filtered_entities, pmi
def ne_extraction_conferences(model_name, training_cycle, sentence_expansion):
    print('Started extraction for the', model_name, 'model, in cycle number', training_cycle)

    if sentence_expansion:
        path_to_model = ROOTPATH + '/crf_trained_files/' + model_name + '_TSE_model_' + str(training_cycle) + '.ser.gz'
    else:
        path_to_model = ROOTPATH + '/crf_trained_files/' + model_name + '_TE_model_' + str(training_cycle) + '.ser.gz'

    # use the trained Stanford NER model to extract entities from the publications
    ner_tagger = StanfordNERTagger(path_to_model, STANFORD_NER_PATH)
    result = []

    for conference in evaluation_conferences:
        query = {"query": {"match": {"journal": conference}}}
        # Maximum size of 2100 to ensure the total number of evaluation publications
        # from 11 conferences is around 11k
        res = es.search(index="ir_full", doc_type="publications", body=query, size=2100)
        print(f'Extracting entities for {len(res["hits"]["hits"])} {conference} conference papers')
        sys.stdout.flush()

        counter = 0
        for doc in res['hits']['hits']:
            counter += 1
            if counter % 20 == 0:
                print(f'Tagged {counter}/' + str(len(res['hits']['hits'])), 'full texts for ' + conference)

            sentence = doc["_source"]["content"]
            sentence = sentence.replace("@ BULLET", "")
            sentence = sentence.replace("@BULLET", "")
            sentence = sentence.replace(", ", " , ")
            sentence = sentence.replace('(', '')
            sentence = sentence.replace(')', '')
            sentence = sentence.replace('[', '')
            sentence = sentence.replace(']', '')
            sentence = sentence.replace(',', ' ,')
            sentence = sentence.replace('?', ' ?')
            sentence = sentence.replace('..', '.')
            sentence = re.sub(r"(\.)([A-Z])", r"\1 \2", sentence)

            tagged = ner_tagger.tag(sentence.split())
            for jj, (a, b) in enumerate(tagged):
                tag = model_name.upper()
                if b == tag:
                    a = a.translate(str.maketrans('', '', string.punctuation))
                    try:
                        # If the next token carries the same tag, also keep the bigram
                        if tagged[jj + 1][1] == tag:
                            temp = tagged[jj + 1][0].translate(str.maketrans('', '', string.punctuation))
                            bigram = a + ' ' + temp
                            result.append(bigram)
                    except IndexError:
                        result.append(a)
                        continue
                    result.append(a)
            print('.', end='')
            sys.stdout.flush()

    result = list(set(result))
    result = [w.replace('"', '') for w in result]
    filtered_words = [word for word in set(result) if word not in stopwords.words('english')]
    print('Total of', len(filtered_words), 'filtered entities added')
    sys.stdout.flush()

    f1 = open(ROOTPATH + '/processing_files/' + model_name + '_extracted_entities_' + str(training_cycle) + '.txt',
              'w', encoding='utf-8')
    for item in filtered_words:
        f1.write(item + '\n')
    f1.close()
def ne_extraction(model_name, training_cycle, sentence_expansion):
    print('started extraction for the', model_name, 'model, in cycle number', training_cycle)

    if sentence_expansion:
        path_to_model = ROOTPATH + '/crf_trained_files/' + model_name + '_TSE_model_' + str(training_cycle) + '.ser.gz'
    else:
        path_to_model = ROOTPATH + '/crf_trained_files/' + model_name + '_TE_model_' + str(training_cycle) + '.ser.gz'

    # use the trained Stanford NER model to extract entities from the publications
    ner_tagger = StanfordNERTagger(path_to_model, STANFORD_NER_PATH)

    # Fetch a random sample of abstracts
    query = {
        "query": {
            "function_score": {
                "functions": [{
                    "random_score": {
                        "seed": str(int(round(time.time() * 1000)))
                    }
                }]
            }
        }
    }
    res = es.search(index="ir", body=query, size=10000)
    hits = res['hits']['hits']
    total = len(hits)
    print(total)
    sys.stdout.flush()

    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    def word_index_to_sentence(word_index):
        # Maps a word index in the current batch back to the sentence it came from
        for sentence_index, start_index in enumerate(sentences_index):
            if start_index > word_index:
                return sentences[sentence_index - 1]
        return sentences[-1]

    result = {}
    # Alex: Batch Stanford NER, since this increases the tagging speed substantially (9 hrs -> 18 min)
    for counter in range(0, total, 50):
        print(f'Tagged {counter}/' + str(total), 'full texts')
        sentence = " ".join(doc["_source"]["abstract"] for doc in hits[counter:min(counter + 50, total)])
        sentence = sentence.replace("@ BULLET", "")
        sentence = sentence.replace("@BULLET", "")
        sentence = sentence.replace(", ", " , ")
        sentence = sentence.replace('(', '')
        sentence = sentence.replace(')', '')
        sentence = sentence.replace('[', '')
        sentence = sentence.replace(']', '')
        sentence = sentence.replace(',', ' ,')
        sentence = sentence.replace('?', ' ?')
        sentence = sentence.replace('..', '.')
        sentence = re.sub(r"(\.)([A-Z])", r"\1 \2", sentence)

        sentences = sent_detector.tokenize(sentence)
        sentences_index = []
        words = []
        for sentence in sentences:
            sentences_index.append(len(words))
            words.extend(sentence.split())

        # Tag the whole batch at once so word indices line up with word_index_to_sentence
        tagged = ner_tagger.tag(words)
        for index, (word, tag) in enumerate(tagged):
            classification_tag = model_name.upper()
            if tag == classification_tag:
                word = word.translate(str.maketrans('', '', string.punctuation))
                if word not in result:
                    result[word] = word_index_to_sentence(index)
                if index + 1 < len(tagged) and tagged[index + 1][1] == classification_tag:
                    temp = tagged[index + 1][0].translate(str.maketrans('', '', string.punctuation))
                    bigram = word + ' ' + temp
                    if bigram not in result:
                        result[bigram] = word_index_to_sentence(index)
        # print('.', end='')
        # sys.stdout.flush()

    result = [(k, v) for k, v in result.items()]
    result = [(k.replace('"', '').strip(), v) for k, v in result]
    result = [(k, v) for k, v in result if len(k) > 0]
    filtered_words = [(word, sentence) for word, sentence in result if word not in stopwords.words('english')]
    print('Total of', len(filtered_words), 'filtered entities added')
    sys.stdout.flush()

    with open(ROOTPATH + '/processing_files/' + model_name + '_extracted_entities_' + str(training_cycle) + '.txt',
              'w', encoding='utf-8') as f1:
        for word, sentence in filtered_words:
            f1.write(word + '\n')

    # Alex: also record sentences for context (can later be used for BERT clustering)
    with open(ROOTPATH + '/processing_files/' + model_name + '_extracted_entities_sentences_' + str(training_cycle) + '.txt',
              'w', encoding='utf-8') as f1:
        for word, sentence in filtered_words:
            f1.write(word + '\t' + sentence.replace('\n', '') + '\n')
def search_content_indistinct(keyword, page, limit):
    if len(keyword) > 1:
        if re.match(r"^[a-zA-Z0-9]+$", keyword):
            # fuzzy query: "fuzziness" is the allowed edit distance (similarity),
            # "prefix_length" is the number of leading characters that must match exactly
            body = {
                "query": {
                    "fuzzy": {
                        "query": {
                            "value": keyword,
                            "fuzziness": 1,  # "fuzziness": "AUTO"
                            "prefix_length": 2
                        }
                    }
                },
                "from": (int(page) - 1) * int(limit),
                "size": int(limit),  # ES returns 10 hits by default
            }
        else:
            body = {
                "query": {
                    "bool": {
                        "must": {
                            "multi_match": {
                                "query": keyword,
                                "fields": ["query^2", "source"],  # ^boost after a field name sets its weight, default 1.0
                                "fuzziness": "AUTO",
                                # "operator": "and",  # require all analyzed terms to match in one item,
                                #                     # equivalent to "minimum_should_match": "100%"
                            }
                        },
                        # "filter": {
                        #     "bool": {
                        #         "must": [
                        #             {"term": {"deleted": "0"}},
                        #             {"term": {"status": "published"}}
                        #         ],
                        #     }
                        # }
                    }
                },
                "from": (int(page) - 1) * int(limit),
                "size": int(limit),
                # sorting
                # "sort": {
                #     "update_time": {
                #         "order": "desc"
                #     }
                # },
                # highlighting
                "highlight": {
                    "fields": {
                        "query": {}
                    }
                }
            }
    else:
        # single-character search
        body = {
            "query": {
                "wildcard": {
                    "query": "*" + keyword + "*",
                }
            },
            "from": (int(page) - 1) * int(limit),
            "size": int(limit),
        }
    ret_content = es.search(index="hot_words", doc_type="doc", body=body)
    ret_content = _highlight(ret_content)
    return ret_content
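# Hedged usage sketch (assumption, not in the original source): paginated fuzzy lookup against
# the hot_words index; the misspelled keyword is illustrative. Pages are 1-based, matching the
# from/size arithmetic above, and this assumes _highlight keeps the standard hits structure.
ret = search_content_indistinct(keyword='elasticsearh', page=1, limit=20)
for hit in ret['hits']['hits']:
    print(hit['_source'].get('query'), hit.get('highlight', {}))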