Example #1
from elasticsearch import Elasticsearch

# Vividict (an auto-vivifying nested dict) and test_similarity (query/caption
# similarity scoring) are assumed to be imported from elsewhere in this project.


class Searcher:
    """Searches papers in an Elasticsearch database."""
    def __init__(self, index_name='paperdb', doc_type='papers', host='10.1.114.114', port=9200):
        """Initialize a search engine

        Args:
            index_name: name of the index to search
            doc_type: name of the doc type under that index
            host: hostname of the Elasticsearch server
            port: port number of the Elasticsearch server

        """
        self.es = Elasticsearch([{'host': host, 'port': port}])
        self.index = index_name
        self.doc_type = doc_type

    def generate_dsl(self, search_info):
        """Generate DSL given query and search settings

        Args:
            search_info: a dict including a query and other settings.
                Note that 'query_type' must be consistent with the structure of 'match'!
        Example:
            {
                'query_type': 'integrated_search',
                'query': 'attention network',
                'match': {
                    'title': True,
                    'abstract': True,
                    'paperContent': True,
                    'videoContent': True,
                },
                'filter': {
                    'yearfrom': 1000,
                    'yearbefore': 3000,
                },
                'sort': 'year',
                'is_filter': True,
                'is_rescore': True,
                'is_cited': False
            }
            or
            {
                'query_type': 'advanced_search',
                'match': {
                    'title': 'attention',
                    'abstract': 'attention',
                    'paperContent': 'attention',
                    'videoContent': None,
                },
                'filter': {
                    'yearfrom': 1000,
                    'yearbefore': 3000,
                },
                'sort': 'relevance',
                'is_filter': False,
                'is_rescore': True,
                'is_cited': False
            }
        Return:
            dsl: a DSL dict translated from search_info
        """

        # check search_info
        if 'integrated' in search_info['query_type']:
            assert 'query' in search_info, "Integrated search must have a query!"
            assert isinstance(search_info['match']['title'], bool), \
                "Integrated search expects bool values in 'match'!"
        else:
            assert isinstance(search_info['match']['title'], (str, type(None))), \
                "Advanced search expects a string or None in 'match'!"

        if search_info['is_cited'] is False:
            dsl = Vividict()
            dsl['query']['bool']['must'] = []
            dsl['query']['bool']['should'] = []
            dsl['rescore'] = []

            if 'integrated' in search_info['query_type']:
                match = self.get_integrated_match(search_info['query'], search_info['match'])
                dsl['query']['bool']['should'] = match
                if search_info['is_filter'] is True:
                    filter = self.get_filter_query(search_info['query'])
                    dsl['query']['bool']['must'].append(filter)
                if search_info['is_rescore'] is True:
                    rescore = self.get_rescore_query(match)
                    dsl['rescore'] = rescore

            else:  # 'advanced_search'
                match = self.get_advanced_match(search_info['match'])
                dsl['query']['bool']['must'] = match
                if search_info['is_rescore'] is True:
                    rescore = self.get_rescore_query(match)
                    dsl['rescore'] = rescore

            year_range = Vividict()
            year_range['range']['year']['gte'] = search_info['filter'].get('yearfrom', 1000)
            year_range['range']['year']['lte'] = search_info['filter'].get('yearbefore', 3000)
            dsl['query']['bool']['must'].append(year_range)

        else:  # cited-function_score
            dsl = Vividict()
            dsl['query']['function_score']['query']['bool']['must'] = []
            dsl['query']['function_score']['query']['bool']['should'] = []
            dsl['query']['function_score']['field_value_factor'] = []
            dsl['rescore'] = []

            if 'integrated' in search_info['query_type']:
                match = self.get_integrated_match(search_info['query'], search_info['match'])
                dsl['query']['function_score']['query']['bool']['should'] = match
                cited = self.get_function_factor()
                dsl['query']['function_score']['field_value_factor'] = cited
                if search_info['is_filter'] is True:
                    filter = self.get_filter_query(search_info['query'])
                    dsl['query']['function_score']['query']['bool']['must'].append(filter)
                if search_info['is_rescore'] is True:
                    rescore = self.get_rescore_query(match)
                    dsl['rescore'] = rescore

            else:  # 'advanced_search'
                match = self.get_advanced_match(search_info['match'])
                dsl['query']['function_score']['query']['bool']['must'] = match
                if search_info['is_rescore'] is True:
                    rescore = self.get_rescore_query(match)
                    dsl['rescore'] = rescore

            year_range = Vividict()
            year_range['range']['year']['gte'] = search_info['filter'].get('yearfrom', 1000)
            year_range['range']['year']['lte'] = search_info['filter'].get('yearbefore', 3000)
            dsl['query']['function_score']['query']['bool']['must'].append(year_range)

        if search_info['sort'] == 'year':
            dsl['sort']['year'] = 'desc'
        elif search_info['sort'] == 'cited':
            dsl['sort']['cited'] = 'asc'

        return dsl

    def get_integrated_match(self, query, match):
        """get match of intergrated search

        Args:
            query: query string from user
            match: A dict contained title, abstract...

        Return:
            res: A list of match
        """
        res = []

        if match['title'] or match['abstract']:
            tmp = Vividict()
            tmp['multi_match']['query'] = query

            fields = []
            if match['title']:
                fields.append('title^3')

            if match['abstract']:
                fields.append('abstract^2')

            tmp['multi_match']['fields'] = fields
            res.append(tmp)

        if match['paperContent']:
            nest = self.get_nested_query_paperContent(query)
            res.append(nest)

        if match['videoContent']:
            nest = self.get_nested_query_videoContent(query)
            res.append(nest)

        return res

    def get_advanced_match(self, match):
        """get match of advanced search

        Args:
            match: A dict contained title, abstract, paper_content...

        Return:
            res: A list of match
        """
        res = []
        if match['title']:
            _match = {'match': {'title': match['title']}}
            res.append(_match)

        if match['abstract']:
            _match = {'match': {'abstract': match['abstract']}}
            res.append(_match)

        if match['paperContent']:
            nest = self.get_nested_query_paperContent(match['paperContent'])
            res.append(nest)

        if match['videoContent']:
            nest = self.get_nested_query_videoContent(match['videoContent'])
            res.append(nest)

        return res

    def get_nested_query_paperContent(self, query):
        """Build a nested query over the paperContent fields."""
        nest = Vividict()
        nest['nested']['path'] = 'paperContent'
        nest['nested']['score_mode'] = 'max'

        tmp = Vividict()
        fields = ['paperContent.text', 'paperContent.subtitles^2', 'paperContent.subtexts']
        tmp['multi_match']['fields'] = fields
        tmp['multi_match']['query'] = query
        nest['nested']['query']['bool']['must'] = tmp

        return nest

    def get_nested_query_videoContent(self, query):
        """Build a nested query over the videoContent field."""
        nest = Vividict()
        nest['nested']['path'] = 'videoContent'
        nest['nested']['score_mode'] = 'max'

        tmp = Vividict()
        tmp['match']['videoContent.textEnglish'] = query
        nest['nested']['query']['bool']['must'] = tmp

        return nest

    def get_function_factor(self):
        """Build a field_value_factor clause that boosts papers by citation count."""
        cited = Vividict()
        cited['field'] = 'cited'
        cited['modifier'] = 'log1p'
        cited['factor'] = 0.5
        cited['missing'] = 0

        return cited

    def get_filter_query(self, query):
        """Build a terms filter on the abstract field from the query words."""
        filter = Vividict()
        tag_list = query.split()
        filter['terms']['abstract'] = tag_list

        return filter

    def get_rescore_query(self, match):
        """Build a rescore clause that re-weights the top hits with the first match clause."""
        rescore = Vividict()
        rescore['window_size'] = 100
        rescore['query']['rescore_query'] = match[0]
        rescore['query']['query_weight'] = 1.5
        rescore['query']['rescore_query_weight'] = 0.5

        return rescore

    def search_paper_by_name(self, search_info, only_top_k=True, k=100):
        """Search papers with the given search settings

        Args:
            search_info: the same dict as in self.generate_dsl()
            only_top_k: if True, stop scrolling once k results are collected
            k: maximum number of results to collect when only_top_k is True

        Return:
            res_list: a list of paper information dicts
            paper_id: a list of the corresponding paper ids
            num: total number of matching papers
        """
        dsl = self.generate_dsl(search_info)
        result = self.es.search(index=self.index, doc_type=self.doc_type, body=dsl, scroll="5m", size=10)
        sid = result['_scroll_id']
        scroll_size = result['hits']['total']  # an integer in Elasticsearch 6.x and earlier (a dict in 7+)
        res_list, paper_id, num = [], [], scroll_size
        while scroll_size > 0:
            result = self.es.scroll(scroll_id=sid, scroll="5m")
            sid = result['_scroll_id']
            scroll_size = len(result["hits"]["hits"])
            paper, p_id, _ = self.get_paper_info(result)
            res_list += paper
            paper_id += p_id

            if only_top_k and len(res_list) >= k:
                break

        return res_list, paper_id, num

    def get_video_pos_by_paper_id(self, search_info, paper_id, threshold=0.6):
        """
        Args:
            search_info: the same as that in self.generate_dsl()
            paper_id: A string, given by es

        Return:
            a sorted video captions' list according to similarity between
            captions and query
        """
        
        assert isinstance(paper_id, str), "paper_id must be a string; this method takes a single id!"

        paper = self.es.get_source(index=self.index, doc_type=self.doc_type, id=paper_id)

        return self.get_video_pos_by_paper(search_info=search_info,
                                           paper=paper,
                                           threshold=threshold)

    def get_video_pos_by_paper(self, search_info, paper, threshold=0.6):
        """
        Args:
            paper: a dict containing title, abstract, ...

        Return:
            a sorted video captions' list according to similarity between
            captions and query
        """

        assert isinstance(paper, dict), "paper must be a dict; this method takes a single paper!"

        if 'integrated' in search_info['query_type']:
            query = search_info['query']
        else:
            query = search_info['match']['videoContent']

        assert (query is not None)

        if 'videoContent' not in paper:
            return [None]

        pos = self.get_video_pos(query=query,
                                 videoContent=paper['videoContent'],
                                 threshold=threshold)
        return pos

    @staticmethod
    def get_paper_info(res):
        """Return raw paper info given es search result
        Args:
            res: A dict of result from es.search

        Return:
            paper_list: a list of dicts, each storing the information of a paper
            paper_id: a list of the corresponding document ids
            num: total number of hits reported by Elasticsearch
        """
        paper_list = []
        paper_id = []
        hits = res['hits']['hits']
        num = res['hits']['total']
        for hit in hits:
            paper_list.append(hit['_source'])
            paper_id.append(hit['_id'])
        return paper_list, paper_id, num

    @staticmethod
    def remove_text_embedding(papers):
        """Remove textEmbedding in videoContent
        Args:
            papers: A list of paper
        """
        for paper in papers:
            if 'videoContent' in paper:
                for v in paper['videoContent']:
                    if 'textEmbedding' in v:
                        v.pop('textEmbedding')

    @staticmethod
    def get_video_pos(query, videoContent, threshold=0.6):
        """Return a list of video captions related to user's query

        Args:
            query: english query text
            videoContent: a list of video caption information
            threshold: captions whose similarity score is > threshold are returned

        Return:
            res_list: a sorted video captions' list according to similarity between
                    captions and query
        """
        emd_list = [v.pop('textEmbedding') for v in videoContent]
        sim_list = test_similarity(query, emd_list)
        if sim_list == '__ERROR__':
            return sim_list

        res_list = []
        for s, v in zip(sim_list, videoContent):
            v['score'] = s
            if v['score'] > threshold:
                res_list.append(v)
            elif query in v['textEnglish']:
                res_list.append(v)
        return res_list
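

# --- Usage sketch (not part of the original snippet) ---
# A minimal, hedged example of driving the Searcher class above. It assumes an
# Elasticsearch node is reachable at the default host/port and that the
# 'paperdb' index with doc type 'papers' exists; the search_info dict follows
# the integrated-search format documented in Searcher.generate_dsl().
if __name__ == '__main__':
    searcher = Searcher()
    search_info = {
        'query_type': 'integrated_search',
        'query': 'attention network',
        'match': {
            'title': True,
            'abstract': True,
            'paperContent': True,
            'videoContent': True,
        },
        'filter': {
            'yearfrom': 2015,      # example values; any year range works
            'yearbefore': 2020,
        },
        'sort': 'relevance',
        'is_filter': False,
        'is_rescore': True,
        'is_cited': False,
    }
    papers, paper_ids, total = searcher.search_paper_by_name(search_info, only_top_k=True, k=20)
    print('matched %d papers, collected %d' % (total, len(papers)))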
Example #2
import datetime
import uuid

import pandas as pd

# NOTE: this example is assembled from truncated fragments. The Elasticsearch
# clients (es, esObj) and the variables stData, result_pending, org, rootOrg,
# df, count and sorted_term_vectors_dict are set up in parts of the original
# code that were cut off.

# The loop header for the block below is missing in the original; it presumably
# iterates while result_pending still holds scroll pages to process:
while result_pending:
    curr_obj = result_pending.pop()
    scroll_id = curr_obj.get("_scroll_id")
    for hit in curr_obj["hits"]["hits"]:
        hitSource = hit["_source"]
        for key, val in hitSource.items():
            if key == "locale":
                continue
            # collect every non-locale value under the document's locale
            terms = stData.setdefault(hitSource["locale"], set())
            if isinstance(val, list):
                terms.update(item.strip() for item in val)
            else:
                terms.add(val.strip())
    response = esObj.scroll(scroll_id=scroll_id, scroll="5s")
    if response["hits"]["hits"]:
        result_pending.append(response)

for key, val in stData.items():
    for item in val:
        doc = {
            "dateInserted": datetime.datetime.now().strftime("%Y%m%dT%H%M%S+0000"),
            "id": str(uuid.uuid4()),
            "org": org,
            "rootOrg": rootOrg,
            "searchTerm": item,
            "searchTermAnalysed": item.lower(),
            "isSuggested": False,
            "locale": key,
        }
        # NOTE: the code that uses `doc` (presumably an index call) was cut off
        # in the original snippet.

# NOTE: a separate, truncated snippet starts here; the leading arguments of
# this es.search call (index, body, etc.) were cut off in the original.
sorted_term_vectors = es.search(scroll='1m',
                                filter_path=[
                                    'hits.hits._source.sorted_term',
                                    'hits.hits._source.pid', '_scroll_id'
                                ])
scroll_id = sorted_term_vectors['_scroll_id']
sorted_term_vectors = sorted_term_vectors['hits']['hits']
for x in sorted_term_vectors:
    if x['_source']['sorted_term']:
        sorted_term_vectors_dict[x['_source']['pid']] = " ".join(
            x['_source']['sorted_term'])
    else:
        sorted_term_vectors_dict[x['_source']['pid']] = None

for i in range(len(df) // 10000):
    print("sorted term : ", i * 10000)
    sorted_term_vectors = es.scroll(scroll_id=scroll_id,
                                    scroll='1m')['hits']['hits']
    for x in sorted_term_vectors:
        if x['_source']['sorted_term']:
            sorted_term_vectors_dict[x['_source']['pid']] = " ".join(
                x['_source']['sorted_term'])
        else:
            sorted_term_vectors_dict[x['_source']['pid']] = None
    del sorted_term_vectors

print('merge dfs')
df_temp = pd.DataFrame.from_dict(sorted_term_vectors_dict, orient='index')
df_temp = df_temp.reset_index()
df_temp.columns = ['pid', 'term_vector']
df = df.merge(df_temp, on=['pid'])
del df_temp


def get_termvectors(_id):
    # NOTE: only the tail of this helper survives in the original snippet; the
    # function header is reconstructed from the commented-out call at the bottom,
    # and the code that builds `term_dict` is missing.
    sorted_terms = sorted(term_dict.items())
    sorted_terms = [tup[1] for tup in sorted_terms]
    return sorted_terms


if __name__ == '__main__':
    count_list = [x for x in range(0, count, 10000)]
    count_list.append(count)

    results = list()
    results.append(es.search(index='prd_review', size=10000, scroll='1m'))
    scroll_id = results[0]['_scroll_id']
    results = results[0]['hits']['hits']

    for _ in range(count // 10000):
        results.extend(es.scroll(scroll_id=scroll_id, scroll='1m')['hits']['hits'])

    results = [result['_source'] for result in results]

    data = []
    for result in results:
        data.append({})
        data[-1]['m_id'] = result['message_id']
        data[-1]['score'] = result['prd_satisfact']
        data[-1]['cus_grade'] = result['cus_grade']
        data[-1]['best_flag'] = result['best_flag']
        data[-1]['text'] = result['title']

    df = pd.DataFrame(data)
    df.m_id = df.m_id.astype('str')
    # df['pos'] = df.id.apply(lambda _id: get_termvectors(_id))
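

# --- Sketch: the same pagination with elasticsearch.helpers.scan (not from the
# original snippet) ---
# The manual search/scroll loops above can also be expressed with the scan()
# helper from elasticsearch-py, which manages scroll ids and clean-up itself.
# This sketch assumes the same 'prd_review' index as in Example #2; the helper
# name iter_reviews is ours.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan


def iter_reviews(es, index='prd_review'):
    """Yield the _source of every document in the given index."""
    for hit in scan(es, index=index, query={"query": {"match_all": {}}}, size=10000):
        yield hit['_source']

# usage:
# es = Elasticsearch()
# data = [{'m_id': r['message_id'], 'score': r['prd_satisfact']} for r in iter_reviews(es)]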