def get_facts_structure(self):
    """Map every fact name to the set of document paths it was found under.

    Opens a scan/scroll search on ``<es_url>/<index>/texta/_search``
    restricted to entries whose ``facts.doc_type`` equals the lower-cased
    ``self._type``, then walks the scroll pages until an empty page is
    returned.

    Returns:
        dict: ``{facts.fact: set(facts.doc_path, ...)}`` built from every
        hit that was scrolled over.
    """
    search_url = '{0}/{1}/{2}/_search?search_type=scan&scroll=1m&size=100'.format(
        self.es_url, self._index, 'texta')
    payload = json.dumps(
        {"query": {"term": {"facts.doc_type": self._type.lower()}}})

    page = ES_Manager.plain_post(search_url, data=payload)
    scroll_id = page['_scroll_id']
    remaining = page['hits']['total']

    progress = Progress(remaining)
    scroll_url = '{0}/_search/scroll?scroll=1m'.format(self.es_url)
    seen = 0
    structure = {}

    # The opening request only supplies the scroll id and the total; the
    # hits themselves are read from the follow-up scroll requests. The loop
    # ends once a scroll page comes back with no hits.
    while remaining > 0:
        page = ES_Manager.plain_post(scroll_url, data=scroll_id)
        hits = page['hits']['hits']
        remaining = len(hits)
        scroll_id = page['_scroll_id']

        for hit in hits:
            seen += 1
            progress.update(seen)
            facts = hit['_source']['facts']
            structure.setdefault(facts['fact'], set()).add(facts['doc_path'])

    progress.done()
    return structure
def link_all(self):
    """Append a ``<doc_path>.<fact>`` link to every document that owns a fact.

    Steps, as performed by the code below:

    1. Calls ``self._build_facts_structure()`` (defined outside this block;
       presumably it populates ``self.facts_structure`` -- confirm).
    2. Opens a scan/scroll search on ``<es_url>/<index>/texta/_search``
       for entries whose ``facts.doc_type`` equals the lower-cased
       ``self._type``.
    3. For each hit, records the fact/doc_path pair in
       ``self.facts_structure``, fetches the document's current link list
       with ``self.get_texta_link_facts_by_id`` and, when the list exists
       and does not yet contain the link, appends it and writes it back
       with ``self.update_texta_link_by_id``.
    4. After each scroll page, prints a diagnostic line if no shard
       succeeded or the request timed out.

    Progress and diagnostics are written to stdout. Returns ``None``.
    """
    self._build_facts_structure()

    # A parenthesised single-argument print emits the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('- Total of unique facts.fact: {0}'.format(
        len(self.facts_structure)))
    print('Linking ... ')

    search_url_base = '{0}/{1}/{2}/_search?search_type=scan&scroll=1m&size=100'
    search_url = search_url_base.format(self.es_url, self._index, 'texta')
    query = json.dumps(
        {"query": {"term": {"facts.doc_type": self._type.lower()}}})

    response = ES_Manager.plain_post(search_url, data=query)
    scroll_id = response['_scroll_id']
    total = response['hits']['total']

    n_count = 0
    prog = Progress(total)
    # Loop-invariant, so built once rather than on every scroll request.
    scroll_url = '{0}/_search/scroll?scroll=1m'.format(self.es_url)

    # NOTE(review): the nesting below was reconstructed from a source whose
    # indentation had been flattened. It assumes (a) the update request is
    # sent only when a new link was appended and (b) the shard/timeout check
    # runs once per scroll page -- confirm against the original layout.
    while total > 0:
        response = ES_Manager.plain_post(scroll_url, data=scroll_id)
        hits = response['hits']['hits']
        total = len(hits)  # an empty page ends the scroll
        scroll_id = response['_scroll_id']

        for hit in hits:
            n_count += 1
            prog.update(n_count)

            facts = hit['_source']['facts']
            fact = facts['fact']
            doc_path = facts['doc_path']
            self.facts_structure.setdefault(fact, set()).add(doc_path)

            fact_link = u'{0}.{1}'.format(doc_path, fact)
            doc_id = facts['doc_id']
            links = self.get_texta_link_facts_by_id(doc_id)
            # A None result means there is no link list to extend for this
            # document; such documents are skipped.
            if links is not None and fact_link not in links:
                links.append(fact_link)
                self.update_texta_link_by_id(
                    doc_id, {'texta_link': {'facts': links}})

        # Check errors in the database request
        shards = response['_shards']
        if (shards['total'] > 0 and shards['successful'] == 0) or response['timed_out']:
            msg_base = 'Elasticsearch: *** Shards: {0} *** Timeout: {1} *** Took: {2}'
            print(msg_base.format(shards, response['timed_out'], response['took']))

    prog.done()
def update_texta_link_by_id(self, doc_id, texta_link):
    """Post a partial-document update for ``doc_id``.

    Args:
        doc_id: identifier placed in the ``.../<doc_id>/_update`` URL.
        texta_link: JSON-serialisable value sent as the ``doc`` member of
            the request body.

    Returns:
        Whatever ``ES_Manager.plain_post`` returns for the request.
    """
    endpoint = '{0}/{1}/{2}/{3}/_update'.format(
        self.es_url, self._index, self._type, doc_id)
    body = json.dumps({'doc': texta_link})
    return ES_Manager.plain_post(endpoint, data=body)
def _get_fact_hits(self):
    """Yield, one at a time, every hit of a scan/scroll over the TEXTA mapping.

    The search is opened on ``<es_url>/<index>/<self.TEXTA>/_search`` with a
    ``bool`` query whose ``should`` and ``must`` lists are both empty. Each
    scroll page is passed to ``self._check_es_error`` before its hits are
    yielded. Iteration stops after the first page that contains no hits.
    """
    es_url = self.es_url
    scroll_url = '{0}/_search/scroll?scroll=1m'.format(es_url)
    search_url = '{0}/{1}/{2}/_search?search_type=scan&scroll=1m&size=100'.format(
        es_url, self._index, self.TEXTA)

    query_body = json.dumps({u'query': {u'bool': {u'should': [], u'must': []}}})
    page = ES_Manager.plain_post(search_url, data=query_body)
    scroll_id = page['_scroll_id']
    pending = page['hits']['total']

    # Hits are read from the scroll requests only; the opening response
    # contributes the scroll id and the total.
    while pending > 0:
        page = ES_Manager.plain_post(scroll_url, data=scroll_id)
        scroll_id = page['_scroll_id']
        hits = page['hits']['hits']
        pending = len(hits)
        self._check_es_error(page)
        for hit in hits:
            yield hit
def _get_total_facts(self):
    """Return the ``count`` reported by the ``_count`` endpoint.

    The request goes to ``<es_url>/<index>/<self.TEXTA>/_count``. The host
    is taken from ``self.es_url`` -- the same base URL the scroll requests
    for this index and mapping use -- rather than a hard-coded
    ``http://localhost:9200``, so the count is read from the configured
    server.

    Returns:
        The value stored under ``'count'`` in the response.
    """
    request_url = '{0}/{1}/{2}/_count'.format(
        self.es_url, self._index, self.TEXTA)
    response = ES_Manager.plain_post(request_url)
    return response['count']
def _get_total_facts(self):
    """Ask the ``_count`` endpoint how many entries the TEXTA mapping holds.

    Posts to ``<es_url>/<index>/<self.TEXTA>/_count`` with no request body
    and returns the ``'count'`` member of the response.
    """
    count_url = '{0}/{1}/{2}/_count'.format(self.es_url, self._index, self.TEXTA)
    return ES_Manager.plain_post(count_url)['count']