コード例 #1
0
 def get_facts_structure(self):
     """Map every fact name to the set of document paths it appears under.

     Opens a scan/scroll search against the ``texta`` mapping of
     ``self._index``, restricted to facts whose ``facts.doc_type`` equals
     the lower-cased ``self._type``, and pages through the results until a
     page comes back with no hits.

     Returns:
         dict: ``{fact: set(doc_path, ...)}`` built from the ``facts.fact``
         and ``facts.doc_path`` fields of each hit's ``_source``.
     """
     scroll_url = '{0}/_search/scroll?scroll=1m'.format(self.es_url)
     search_url = '{0}/{1}/{2}/_search?search_type=scan&scroll=1m&size=100'.format(
         self.es_url, self._index, 'texta')
     payload = json.dumps(
         {"query": {"term": {"facts.doc_type": self._type.lower()}}})

     # The opening request supplies the first scroll id and the total hit
     # count; the hits themselves are read from the scroll pages below.
     page = ES_Manager.plain_post(search_url, data=payload)
     scroll_id = page['_scroll_id']
     remaining = page['hits']['total']
     progress = Progress(remaining)

     processed = 0
     paths_by_fact = {}
     while remaining > 0:
         page = ES_Manager.plain_post(scroll_url, data=scroll_id)
         hits = page['hits']['hits']
         # From here on ``remaining`` is the size of the latest page, so
         # the loop stops on the first empty page.
         remaining = len(hits)
         scroll_id = page['_scroll_id']
         for hit in hits:
             processed += 1
             progress.update(processed)
             facts = hit['_source']['facts']
             fact = facts['fact']
             doc_path = facts['doc_path']
             paths_by_fact.setdefault(fact, set()).add(doc_path)
     progress.done()
     return paths_by_fact
コード例 #2
0
    def link_all(self):
        """Add a ``<doc_path>.<fact>`` link to every document that has a fact.

        After rebuilding ``self.facts_structure`` through
        ``_build_facts_structure``, scrolls over all ``texta`` facts whose
        ``facts.doc_type`` equals the lower-cased ``self._type``. For each
        hit it records the fact in ``self.facts_structure``, fetches the
        links already stored for the fact's ``doc_id`` and, when the new
        link is absent, appends it and writes the list back with
        ``update_texta_link_by_id``. Documents for which
        ``get_texta_link_facts_by_id`` returns ``None`` are left untouched.

        Progress and any shard failure / timeout reported by a scroll page
        are printed to stdout. Nothing is returned.
        """
        self._build_facts_structure()
        print('- Total of unique facts.fact: {0}'.format(
            len(self.facts_structure)))
        print('Linking ... ')

        search_url = '{0}/{1}/{2}/_search?search_type=scan&scroll=1m&size=100'.format(
            self.es_url, self._index, 'texta')
        payload = json.dumps(
            {"query": {"term": {"facts.doc_type": self._type.lower()}}})

        page = ES_Manager.plain_post(search_url, data=payload)
        scroll_id = page['_scroll_id']
        remaining = page['hits']['total']
        progress = Progress(remaining)
        processed = 0
        scroll_url = '{0}/_search/scroll?scroll=1m'.format(self.es_url)

        while remaining > 0:
            page = ES_Manager.plain_post(scroll_url, data=scroll_id)
            hits = page['hits']['hits']
            # ``remaining`` becomes the size of the latest page, so the
            # loop ends on the first empty page.
            remaining = len(hits)
            scroll_id = page['_scroll_id']

            for hit in hits:
                processed += 1
                progress.update(processed)

                facts = hit['_source']['facts']
                fact = facts['fact']
                doc_path = facts['doc_path']
                self.facts_structure.setdefault(fact, set()).add(doc_path)

                fact_link = u'{0}.{1}'.format(doc_path, fact)
                doc_id = facts['doc_id']
                links = self.get_texta_link_facts_by_id(doc_id)
                # Skip documents with no link list and links already stored.
                if links is None or fact_link in links:
                    continue
                links.append(fact_link)
                self.update_texta_link_by_id(
                    doc_id, {'texta_link': {'facts': links}})

            # Report (but do not abort on) a page where no shard succeeded
            # or the request timed out.
            shards = page['_shards']
            all_shards_failed = shards['total'] > 0 and shards['successful'] == 0
            if all_shards_failed or page['timed_out']:
                template = 'Elasticsearch: *** Shards: {0} *** Timeout: {1} *** Took: {2}'
                print(template.format(page['_shards'], page['timed_out'],
                                      page['took']))
        progress.done()
コード例 #3
0
 def update_texta_link_by_id(self, doc_id, texta_link):
     """Send a partial update for one document and return the reply.

     Args:
         doc_id: Identifier placed in the ``_update`` URL.
         texta_link: JSON-serialisable value sent as the ``doc`` member of
             the request body.

     Returns:
         Whatever ``ES_Manager.plain_post`` returns for the request.
     """
     request_url = '{0}/{1}/{2}/{3}/_update'.format(
         self.es_url, self._index, self._type, doc_id)
     payload = json.dumps({'doc': texta_link})
     return ES_Manager.plain_post(request_url, data=payload)
コード例 #4
0
 def _get_fact_hits(self):
     """Yield every hit of a scan/scroll search over the facts mapping.

     Starts a scroll on ``self._index`` / ``self.TEXTA`` with an empty
     ``bool`` query, then keeps requesting scroll pages until one arrives
     with no hits. Each page is passed to ``self._check_es_error`` before
     its hits are yielded one at a time.
     """
     scroll_url = '{0}/_search/scroll?scroll=1m'.format(self.es_url)
     search_url = '{0}/{1}/{2}/_search?search_type=scan&scroll=1m&size=100'.format(
         self.es_url, self._index, self.TEXTA)
     payload = json.dumps(
         {u'query': {u'bool': {u'should': [], u'must': []}}})

     page = ES_Manager.plain_post(search_url, data=payload)
     scroll_id = page['_scroll_id']
     # Initially the reported total; afterwards the size of the last page.
     pending = page['hits']['total']
     while pending > 0:
         page = ES_Manager.plain_post(scroll_url, data=scroll_id)
         scroll_id = page['_scroll_id']
         pending = len(page['hits']['hits'])
         self._check_es_error(page)
         for hit in page['hits']['hits']:
             yield hit
コード例 #5
0
 def _get_total_facts(self):
     """Return the number of documents stored under the facts mapping.

     Posts to the ``_count`` endpoint of ``self._index`` / ``self.TEXTA``
     and returns the ``count`` field of the reply.

     The server address is taken from ``self.es_url`` so the count is read
     from the same Elasticsearch instance the other requests target. When
     the attribute is missing or empty, the previous default
     ``http://localhost:9200`` is used.
     """
     # Previously hard-coded; keep the old value only as a fallback.
     es_url = getattr(self, 'es_url', None) or 'http://localhost:9200'
     request_url = '{0}/{1}/{2}/_count'.format(es_url, self._index,
                                               self.TEXTA)
     response = ES_Manager.plain_post(request_url)
     return response['count']
コード例 #6
0
 def _get_total_facts(self):
     """Return the ``count`` reported for the facts mapping.

     Posts to ``<es_url>/<index>/<TEXTA>/_count`` with no request body and
     returns the ``count`` field of the reply.
     """
     url_parts = (self.es_url, self._index, self.TEXTA)
     count_url = '{0}/{1}/{2}/_count'.format(*url_parts)
     return ES_Manager.plain_post(count_url)['count']