def normalized_entity_distance(entity, context):
    """

    :param extracted_entities:
    :param context:
    :return filtered_entities:
    """
    filtered_entities = []
    cn = context
    entity = entity.lower()

    # Total number of indexed sentences (match-all count)
    query = {"query": {"match_all": {}}}
    res = es.search(index="twosent_tud", doc_type="twosentnorules", body=query)
    NN = res['hits']['total']
    
    query = {"query":
        {"match": {
            "content.chapter.sentpositive": {
                "query": entity,
                "operator": "and"
            }
        }
        }
    }
    res = es.search(index="twosent_tud", doc_type="twosentnorules", body=query)
    total_a = res['hits']['total']
    query = {"query":
        {"match": {
            "content.chapter.sentpositive": {
                "query": cn,
                "operator": "and"
            }
        }
        }
    }
    res = es.search(index="twosent_tud", doc_type="twosentnorules", body=query)
    total_b = res['hits']['total']
    query_text = entity + ' ' + cn
    query = {"query":
        {"match": {
            "content.chapter.sentpositive": {
                "query": query_text,
                "operator": "and"
            }
        }
        }
    }
    res = es.search(index="twosent_tud", doc_type="twosentnorules", body=query)
    total_ab = res['hits']['total']
    pmi = 0
    if total_a and total_b and total_ab:
        total_ab = total_ab / NN
        total_a = total_a / NN
        total_b = total_b / NN
        pmi = total_ab / (total_a * total_b)
        pmi = math.log(pmi, 2)
    return pmi
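The function above relies on module-level globals (es, math). The helper below is not part of the original code; it is a minimal sketch that only restates the PMI formula used above, with a worked example, assuming the three counts have already been retrieved.

import math

def pmi_from_counts(count_a, count_b, count_ab, total):
    """Pointwise mutual information from raw co-occurrence counts.

    Mirrors the computation above: PMI = log2( P(a,b) / (P(a) * P(b)) ),
    where each probability is a count normalized by the total number of sentences.
    Returns 0 when any count is zero, as normalized_entity_distance does.
    """
    if not (count_a and count_b and count_ab):
        return 0
    p_a, p_b, p_ab = count_a / total, count_b / total, count_ab / total
    return math.log(p_ab / (p_a * p_b), 2)

# e.g. an entity seen in 40 sentences, a context word in 500, both together in 25,
# out of 100,000 indexed sentences:
# pmi_from_counts(40, 500, 25, 100000)  ->  ~6.97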
Example #2
def query(unit_name):
    response = es.search(
        index='units',
        body={"query": {
            "term": {
                "unit_name.keyword": unit_name
            }
        }})
    return response['hits']['total']['value'] > 0
Example #3
def search_content_exact(keyword, page, limit):
    """
    精确搜索
    :return:
    """
    body = {"query": {"term": {"query.keyword": keyword}}}

    ret_content = es.search(index="hot_words", doc_type="doc", body=body)

    return ret_content
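search_content_exact accepts page and limit but never applies them. If paging is wanted, the same from/size pattern used in search_content_indistinct further below can be added; a minimal sketch, assuming the same module-level es client (the function name is illustrative, not from the original module):

def search_content_exact_paged(keyword, page, limit):
    # term query on the raw keyword field, paged with from/size
    body = {
        "query": {"term": {"query.keyword": keyword}},
        "from": (int(page) - 1) * int(limit),
        "size": int(limit),
    }
    return es.search(index="hot_words", doc_type="doc", body=body)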
def extract_similar_sentences(es_id):
    """
    Function for finding similar sentences given the code of a sentence (everything is stored in elasticsearch)
    """
    query = {"query": {"match": {"_id": {"query": es_id, "operator": "and"}}}}
    similar_sentence = ''
    res = es.search(index="devtwosentnew", body=query, size=5)
    if len(res) > 1:
        for doc in res['hits']['hits']:
            similar_sentence = doc['_source']['content.chapter.sentpositive']

    return similar_sentence
Example #5
def callback(unit_name):
    response = es.search(
        index='units',
        body={"query": {
            "term": {
                "unit_name.keyword": unit_name
            }
        }})
    if response['hits']['total']['value'] == 0:
        return
    data = response['hits']['hits'][0]['_source']
    area = f"{data['province']}/{data['city']}/{data['district']}"
    result[unit_name] = [area, data['unit_type'], data['industry']]
Example #6
 def get(self):
     parser = reqparse.RequestParser()
     parser.add_argument('limit', type=int, help='limit must be an int', default=100)
     parser.add_argument('q', type=str, help='please provide a valid query')
     args = parser.parse_args()
     s = es.search(index="whatweb", q=args.q, size=args.limit)
     if s['hits']['hits']:
         hits = []
         for hit in s['hits']['hits']:
             hits.append(hit['_source'])
         return {'count': len(hits), 'data': hits}
     else:
         return []
Example #7
 def get(self):
     parser = reqparse.RequestParser()
     parser.add_argument('limit', type=int, help='limit must be an int', default=1)
     parser.add_argument('domain', type=str, help='please provide a valid domain name')
     args = parser.parse_args()
     s = es.search(index="subdomains", q='domain:{}'.format(args.domain), size=args.limit)
     if s['hits']['hits']:
         hits = []
         for hit in s['hits']['hits']:
             hits.append(hit['_source'])
         return {'count': len(hits), 'data': hits}
     else:
         return []
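Both get handlers above read like methods of Flask-RESTful resources (reqparse comes from flask_restful). Purely as an illustration, here is how the first one might be wired into an app; the class name, route, and client setup are assumptions, not taken from the original project:

from flask import Flask
from flask_restful import Api, Resource, reqparse
from elasticsearch import Elasticsearch

app = Flask(__name__)
api = Api(app)
es = Elasticsearch()

class WhatwebSearch(Resource):  # hypothetical class wrapping the first get() above
    def get(self):
        parser = reqparse.RequestParser()
        parser.add_argument('limit', type=int, help='limit must be an int', default=100)
        parser.add_argument('q', type=str, help='please provide a valid query')
        args = parser.parse_args()
        s = es.search(index="whatweb", q=args.q, size=args.limit)
        hits = [hit['_source'] for hit in s['hits']['hits']]
        return {'count': len(hits), 'data': hits} if hits else []

api.add_resource(WhatwebSearch, '/whatweb')  # route is an assumption

if __name__ == '__main__':
    app.run()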
Example #8
def elasticsearch_like(like_string):
    request_data = {
        'query': {
            "more_like_this": {
                "fields": ['title', 'company_name'],
                'like_text': like_string,
                "min_doc_freq": 1,
                "min_term_freq": 1
            }
        }
    }
    response = es.search(index='vacancies', body=request_data)
    ids = [int(hit['_id']) for hit in response['hits']['hits']]
    print(ids)
    return ids
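A small caveat: like_text is the parameter name from older Elasticsearch releases; recent versions of the more_like_this query take like instead. A hedged variant for a newer cluster, assuming the same module-level es client and index as above:

def elasticsearch_like_v2(like_string):
    # same query as above, but with the newer "like" parameter of more_like_this
    request_data = {
        "query": {
            "more_like_this": {
                "fields": ["title", "company_name"],
                "like": like_string,
                "min_doc_freq": 1,
                "min_term_freq": 1
            }
        }
    }
    response = es.search(index='vacancies', body=request_data)
    return [int(hit['_id']) for hit in response['hits']['hits']]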
Example #9
def query_by_keyword():
    query_body = {
        "query": {
            "match": {
                "title": "中国"
            }  # 新闻标题中包含中国关键词
        },
        "size": 10000
    }

    tb = time.time()
    results = es.search(index=ES_INDEX_NAME,
                        doc_type=ES_INDEX_DOC_TYPE,
                        body=query_body)["hits"]["hits"]

    ff = open("search_results.txt", "w", encoding="UTF-8")
    for item in results:
        data = item["_source"]
        ff.write("%s|.|%s|.|%s|.|%s\n" %
                 (data["url"], data["date"], data["title"], data["content"]))
    ff.close()
    tb1 = time.time()
    print("ES search data time (seconds): ", tb1 - tb)
 def find(self, size=100):
     return es.search(index="id",
                      doc_type=self.class_name,
                      size=size,
                      sort={'created_at': 'desc'})['hits']['hits']
Example #11
def sentence_extraction(model_name: str, training_cycle: int, list_of_seeds: list) -> None:
    """
    Extracts from the corpus all sentences that include at least one of the given seeds (in list_of_seeds).
    In addition, it excludes sentences that have any of the entities from a test set, when provided.
    :param model_name:
    :type model_name:
    :param training_cycle:
    :type training_cycle:
    :param list_of_seeds: text list of seed entities
    :type list_of_seeds: str
    :returns: Creates and saves files for seeds and sentences
    :rtype: None
    """
    print('Started initial training data extraction')

    testing = False
    test_entities = []
    if testing:
        # We get the entity names which have been used in the testing set to exclude them from the
        # training sentences
        test_entities = []
        path = ROOTPATH + '/data/demo-test.txt'
        with open(path, 'r') as file:
            for row in file.readlines():
                test_entities.append(row.strip())
        test_entities = [e.lower() for e in test_entities]
        test_entities = list(set(test_entities))

    # Start from the provided seed names; in later cycles, also add the filtered
    # entities produced by the previous cycle
    seed_entities = list(list_of_seeds)

    if training_cycle > 0:
        path = ROOTPATH + '/processing_files/' + model_name + '_filtered_entities_majority_' + str(training_cycle - 1) + '.txt'
        with open(path, 'r') as file:
            for row in file.readlines():
                seed_entities.append(row.strip())

    seed_entities = [e.lower() for e in seed_entities]
    seed_entities = list(set(seed_entities))

    print('Extracting sentences for', len(seed_entities), 'seed terms')
    paragraph = []

    # Using the seeds, extract the sentences from the publications text in Elasticsearch index
    for entity in seed_entities:
        entity_name = re.sub(r'\([^)]*\)', '', entity)
        print('.', end='')
        query = {"query":
                    {"match":
                        {"content.chapter.sentpositive":
                            {"query": "\"" + entity_name + "\"",  # alex: use quotation marks to only query full matches
                             "operator": "and"
                             }
                         }
                     }
                 }

        res = es.search(index="twosent",
                        body=query, size=1000)

        # clean up the sentences and if they don't contain the names of the test set then add them as
        # the training data
        for doc in res['hits']['hits']:
            sentence = doc["_source"]["content.chapter.sentpositive"]
            words = nltk.word_tokenize(sentence)
            lengths = [len(x) for x in words]
            average = sum(lengths) / len(lengths)
            if average < 3:
                continue
            sentence = sentence.replace("@ BULLET", "")
            sentence = sentence.replace("@BULLET", "")
            sentence = sentence.replace(", ", " , ")
            sentence = sentence.replace('(', '')
            sentence = sentence.replace(')', '')
            sentence = sentence.replace('[', '')
            sentence = sentence.replace(']', '')
            sentence = sentence.replace(',', ' ,')
            sentence = sentence.replace('?', ' ?')
            sentence = sentence.replace('..', '.')

            if any(e in words for e in test_entities):
                continue
            else:
                paragraph.append(sentence)

    paragraph = list(set(paragraph))

    # Store sentences and seeds
    path = ROOTPATH + '/processing_files/' + model_name + '_sentences_' + str(training_cycle) + '.txt'
    f = open(path, 'w', encoding='utf-8')
    for item in paragraph:
        f.write('%s\n' % item)
    f.close()

    path = ROOTPATH + '/processing_files/' + model_name + '_seeds_' + str(training_cycle) + '.txt'
    f = open(path, 'w', encoding='utf-8')   # We could use mongodb instead
    for item in seed_entities:
        f.write('%s\n' % item)
    f.close()

    print('Process finished with', len(seed_entities), 'seeds and',
          len(paragraph), 'sentences added for training in cycle number', str(training_cycle))
    sys.stdout.flush()
publications = ['arxiv']
# "WWW", "ICSE", "VLDB", "JCDL", "TREC", "SIGIR", "ICWSM", "ECDL", "ESWC", "TPDL",
# "PLoS Biology", "Breast Cancer Research", "BMC Evolutionary Biology", "BMC Genomics",
# "BMC Biotechnology",
# "BMC Neuroscience", "Genome Biology", "PLoS Genetics", "Breast Cancer Research : BCR",
# "Genome Biology and Evolution", "Breast Cancer Research and Treatment"

model_name = 'DATA'

# Count how many publications of each journal/conference are indexed in "smartpub"
for publication in publications:

    res = es.search(
        index="smartpub",
        body={"query": {
            "match": {
                "journal": {
                    "query": publication
                }
            }
        }},
        size=20)
    total_docs = res['hits']['total']

    _query = {"query": {"match": {"journal": {"query": publication}}}}

    #     res = es.search(index="ir_full", doc_type="publications",
    #                     body=query, size=10000)

    #     print(len(res['hits']['hits']))

    #     for doc in res['hits']['hits']:
def normalized_pub_distance(extracted_entities, context):
    """

    :param extracted_entities:
    :param context:
    :return filtered_entities:
    """
    filtered_entities = []
    context_words = context

    # context words for dataset
    # context_words = ['dataset', 'corpus', 'collection', 'repository', 'benchmark', 'website']

    # context words for method
    # context_words = ['method', 'model', 'algorithm', 'approach','technique']

    # context words for proteins
    # context_words = ['protein', 'receptor']

    extracted_entities = [x.lower() for x in extracted_entities]
    extracted_entities = list(set(extracted_entities))
    for cn in context_words:
        for entity in extracted_entities:
            # entities whose name already contains a context word are kept directly
            if any(x in entity for x in context_words):
                filtered_entities.append(entity)

            # Total number of indexed sentences (match-all count)
            query = {"query": {"match_all": {}}}
            res = es.search(index="twosent", body=query)
            NN = res['hits']['total']['value']

            # Number of sentences containing the entity (quoted to require a full match)
            query = {"query": {
                "match": {
                    "content.chapter.sentpositive": {
                        "query": "\"" + entity + "\"",
                        "operator": "and"
                    }
                }
            }}
            res = es.search(index="twosent", body=query)
            total_a = res['hits']['total']['value']

            # Number of sentences containing the context word
            query = {"query": {
                "match": {
                    "content.chapter.sentpositive": {
                        "query": cn,
                        "operator": "and"
                    }
                }
            }}
            res = es.search(index="twosent", body=query)
            total_b = res['hits']['total']['value']

            # Number of sentences containing both
            query_text = "\"" + entity + "\"" + ' ' + cn
            query = {"query": {
                "match": {
                    "content.chapter.sentpositive": {
                        "query": query_text,
                        "operator": "and"
                    }
                }
            }}
            res = es.search(index="twosent", body=query)
            total_ab = res['hits']['total']['value']
            pmi = 0
            if total_a and total_b and total_ab:
                total_ab = total_ab / NN
                total_a = total_a / NN
                total_b = total_b / NN
                pmi = total_ab / (total_a * total_b)
                pmi = math.log(pmi, 2)
                if pmi >= 2:
                    filtered_entities.append(entity)
    # de-duplicate: an entity may have been appended for several context words
    return list(set(filtered_entities)), pmi
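For illustration only, a hypothetical call matching the commented-out context-word lists above (the candidate entity names are made up):

# hypothetical invocation; entity names are illustrative only
dataset_context_words = ['dataset', 'corpus', 'collection', 'repository', 'benchmark', 'website']
candidates = ['ClueWeb09 corpus', 'random forest', 'TREC collection']
filtered, last_pmi = normalized_pub_distance(candidates, dataset_context_words)
# entities whose name contains a context word, or whose PMI with one is >= 2, are kept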
def ne_extraction_conferences(model_name, training_cycle, sentence_expansion):
    print('Started extraction for the', model_name, 'model, in cycle number',
          training_cycle)

    if sentence_expansion:
        path_to_model = ROOTPATH + '/crf_trained_files/' + model_name + '_TSE_model_' + str(
            training_cycle) + '.ser.gz'
    else:
        path_to_model = ROOTPATH + '/crf_trained_files/' + model_name + '_TE_model_' + str(
            training_cycle) + '.ser.gz'

    # use the trained Stanford NER model to extract entities from the publications
    ner_tagger = StanfordNERTagger(path_to_model, STANFORD_NER_PATH)

    result = []

    for conference in evaluation_conferences:
        query = {"query": {"match": {"journal": conference}}}

        # Maximum size of 2100 to ensure total number of evaluation publications from 11 conferences is around 11k
        res = es.search(index="ir_full",
                        doc_type="publications",
                        body=query,
                        size=2100)

        print(
            f'Extracting entities for {len(res["hits"]["hits"])} {conference} conference papers'
        )

        sys.stdout.flush()

        counter = 0
        for doc in res['hits']['hits']:
            counter += 1
            if counter % 20 == 0:
                print(f"Tagged {counter}/{len(res['hits']['hits'])} full texts for {conference}")
            sentence = doc["_source"]["content"]
            sentence = sentence.replace("@ BULLET", "")
            sentence = sentence.replace("@BULLET", "")
            sentence = sentence.replace(", ", " , ")
            sentence = sentence.replace('(', '')
            sentence = sentence.replace(')', '')
            sentence = sentence.replace('[', '')
            sentence = sentence.replace(']', '')
            sentence = sentence.replace(',', ' ,')
            sentence = sentence.replace('?', ' ?')
            sentence = sentence.replace('..', '.')
            sentence = re.sub(r"(\.)([A-Z])", r"\1 \2", sentence)

            tagged = ner_tagger.tag(sentence.split())

            for jj, (a, b) in enumerate(tagged):
                tag = model_name.upper()
                if b == tag:
                    a = a.translate(str.maketrans('', '', string.punctuation))
                    try:
                        # if the next token carries the same tag, also record the bigram
                        if tagged[jj + 1][1] == tag:
                            temp = tagged[jj + 1][0].translate(
                                str.maketrans('', '', string.punctuation))
                            bigram = a + ' ' + temp
                            result.append(bigram)
                    except IndexError:
                        pass
                    result.append(a)
            print('.', end='')
            sys.stdout.flush()

    result = list(set(result))
    result = [w.replace('"', '') for w in result]
    filtered_words = [
        word for word in set(result) if word not in stopwords.words('english')
    ]
    print('Total of', len(filtered_words), 'filtered entities added')
    sys.stdout.flush()
    f1 = open(ROOTPATH + '/processing_files/' + model_name +
              '_extracted_entities_' + str(training_cycle) + '.txt',
              'w',
              encoding='utf-8')
    for item in filtered_words:
        f1.write(item + '\n')
    f1.close()
def ne_extraction(model_name, training_cycle, sentence_expansion):
    print('started extraction for the', model_name, 'model, in cycle number',
          training_cycle)

    if sentence_expansion:
        path_to_model = ROOTPATH + '/crf_trained_files/' + model_name + '_TSE_model_' + str(
            training_cycle) + '.ser.gz'
    else:
        path_to_model = ROOTPATH + '/crf_trained_files/' + model_name + '_TE_model_' + str(
            training_cycle) + '.ser.gz'

    # use the trained Stanford NER model to extract entities from the publications
    ner_tagger = StanfordNERTagger(path_to_model, STANFORD_NER_PATH)

    query = {
        "query": {
            "function_score": {
                "functions": [{
                    "random_score": {
                        "seed": str(int(round(time.time() * 1000)))
                    }
                }]
            }
        }
    }

    res = es.search(index="ir", body=query, size=10000)

    hits = res['hits']['hits']
    total = len(hits)
    print(total)
    sys.stdout.flush()
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    def word_index_to_sentence(word_index):
        for sentence_index, start_index in enumerate(sentences_index):
            if start_index > word_index:
                return sentences[sentence_index - 1]
        return sentences[-1]

    result = {}

    # Alex: Batch Stanford NER, since this increases the tagging speed substantially (9 hrs -> 18 min)
    for counter in range(0, total, 50):
        print(f'Tagged {counter}/{total} full texts')
        sentence = " ".join(doc["_source"]["abstract"]
                            for doc in hits[counter:min(counter + 50, total)])
        sentence = sentence.replace("@ BULLET", "")
        sentence = sentence.replace("@BULLET", "")
        sentence = sentence.replace(", ", " , ")
        sentence = sentence.replace('(', '')
        sentence = sentence.replace(')', '')
        sentence = sentence.replace('[', '')
        sentence = sentence.replace(']', '')
        sentence = sentence.replace(',', ' ,')
        sentence = sentence.replace('?', ' ?')
        sentence = sentence.replace('..', '.')
        sentence = re.sub(r"(\.)([A-Z])", r"\1 \2", sentence)

        sentences = sent_detector.tokenize(sentence)
        sentences_index = []
        words = []
        for sentence in sentences:
            sentences_index.append(len(words))
            words.extend(sentence.split())

        # tag the full token list of the batch (not just the last sentence)
        tagged = ner_tagger.tag(words)
        for index, (word, tag) in enumerate(tagged):
            classification_tag = model_name.upper()
            if tag == classification_tag:
                word = word.translate(str.maketrans('', '',
                                                    string.punctuation))
                if word not in result:
                    result[word] = word_index_to_sentence(index)

                if index + 1 < len(tagged) and tagged[
                        index + 1][1] == classification_tag:
                    temp = tagged[index + 1][0].translate(
                        str.maketrans('', '', string.punctuation))
                    bigram = word + ' ' + temp
                    if bigram not in result:
                        result[bigram] = word_index_to_sentence(index)
        #print('.', end='')
        #sys.stdout.flush()

    result = [(k, v) for k, v in result.items()]
    result = [(k.replace('"', '').strip(), v) for k, v in result]
    result = [(k, v) for k, v in result if len(k) > 0]
    filtered_words = [(word, sentence) for word, sentence in result
                      if word not in stopwords.words('english')]
    print('Total of', len(filtered_words), 'filtered entities added')
    sys.stdout.flush()
    with open(ROOTPATH + '/processing_files/' + model_name +
              '_extracted_entities_' + str(training_cycle) + '.txt',
              'w',
              encoding='utf-8') as f1:
        for word, sentence in filtered_words:
            f1.write(word + '\n')

    # Alex: also record sentences for context (can later be used for BERT clustering)
    with open(ROOTPATH + '/processing_files/' + model_name +
              '_extracted_entities_sentences_' + str(training_cycle) + '.txt',
              'w',
              encoding='utf-8') as f1:
        for word, sentence in filtered_words:
            f1.write(word + '\t' + sentence.replace('\n', '') + '\n')
Example #16
def search_content_indistinct(keyword, page, limit):
    if len(keyword) > 1:
        if re.match(r"^[a-zA-Z0-9]+$", keyword):
            # fuzzy query: "fuzziness" is the maximum edit distance; "prefix_length" is the number of initial characters that must match exactly
            body = {
                "query": {
                    "fuzzy": {
                        "query": {
                            "value": keyword,
                            "fuzziness": 1,  # "fuzziness": "AUTO"
                            "prefix_length": 2
                        }
                    }
                },
                "from": (int(page) - 1) * int(limit),
                "size": int(limit),  # ES默认显示10条数据
            }
        else:
            body = {
                "query": {
                    "bool": {
                        "must": {
                            "multi_match": {
                                "query": keyword,
                                "fields":
                                ["query^2",
                                 "source"],  # 在字段末尾添加 ^boost, 代表权重值,默认为1.0
                                "fuzziness": "AUTO",
                                # "operator":"and",  # 多个切词结果在一个item, 等价于 "minimum_should_match":"100%"
                            }
                        },
                        # "filter": {
                        #     "bool":{
                        #         "must": [
                        #             {"term": {"deleted": "0"}},
                        #             {"term": {"status": "published"}}
                        #         ],
                        #     }
                        # }
                    }
                },
                "from": (int(page) - 1) * int(limit),
                "size": int(limit),
                # sorting
                # "sort": {
                #     "update_time": {
                #         "order": "desc"
                #     }
                # },
                # highlighting
                "highlight": {
                    "fields": {
                        "query": {}
                    }
                }
            }

    else:
        # single-character search: fall back to a wildcard query
        body = {
            "query": {
                "wildcard": {
                    "query": "*" + keyword + "*",
                }
            },
            "from": (int(page) - 1) * int(limit),
            "size": int(limit),
        }

    ret_content = es.search(index="hot_words", doc_type="doc", body=body)
    ret_content = _highlight(ret_content)
    return ret_content
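_highlight is referenced above but not defined in this snippet. The sketch below is only a guess at its intent, folding Elasticsearch highlight fragments back into each hit's _source; it is not the original helper.

def _highlight(ret_content):
    """Hypothetical helper: copy Elasticsearch highlight fragments into each hit's _source."""
    for hit in ret_content.get('hits', {}).get('hits', []):
        for field, fragments in hit.get('highlight', {}).items():
            # replace the stored field value with the highlighted fragment(s)
            hit['_source'][field] = ' ... '.join(fragments)
    return ret_content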