def get_facts_structure(self):
    """Collect, for every fact name, the document paths it was seen with.

    Starts a scan/scroll search on the 'texta' mapping of ``self._index``,
    restricted to facts whose ``facts.doc_type`` equals the lower-cased
    ``self._type``, and walks the scroll until a page returns no hits.

    Returns:
        dict mapping each ``facts.fact`` value to the ``set`` of
        ``facts.doc_path`` values found together with it.
    """
    first_page_url = '{0}/{1}/{2}/_search?search_type=scan&scroll=1m&size=100'.format(
        self.es_url, self._index, 'texta')
    payload = json.dumps(
        {"query": {"term": {"facts.doc_type": self._type.lower()}}})
    scroll_url = '{0}/_search/scroll?scroll=1m'.format(self.es_url)

    page = ES_Manager.plain_post(first_page_url, data=payload)
    scroll_id = page['_scroll_id']
    remaining = page['hits']['total']

    progress = Progress(remaining)
    seen = 0
    paths_by_fact = {}
    # The first response only reports the total; hits arrive on the
    # follow-up scroll requests, which stop once a page is empty.
    while remaining > 0:
        page = ES_Manager.plain_post(scroll_url, data=scroll_id)
        hits = page['hits']['hits']
        remaining = len(hits)
        scroll_id = page['_scroll_id']
        for hit in hits:
            seen += 1
            progress.update(seen)
            fact_name = hit['_source']['facts']['fact']
            doc_path = hit['_source']['facts']['doc_path']
            paths_by_fact.setdefault(fact_name, set()).add(doc_path)
    progress.done()
    return paths_by_fact
def link_all(self):
    """Add a ``'<doc_path>.<fact>'`` link to every document that has facts.

    Builds ``self.facts_structure`` first, then scrolls over all facts in
    the 'texta' mapping whose ``facts.doc_type`` equals the lower-cased
    ``self._type``. For each fact the referenced document's
    ``texta_link.facts`` list is fetched; when the link is not in it yet,
    the link is appended and the document is updated.

    Progress and Elasticsearch shard/timeout problems are reported on
    standard output.
    """
    self._build_facts_structure()
    print('- Total of unique facts.fact: {0}'.format(
        len(self.facts_structure.keys())))
    print('Linking ... ')

    first_page_url = '{0}/{1}/{2}/_search?search_type=scan&scroll=1m&size=100'.format(
        self.es_url, self._index, 'texta')
    payload = json.dumps(
        {"query": {"term": {"facts.doc_type": self._type.lower()}}})
    page = ES_Manager.plain_post(first_page_url, data=payload)
    scroll_id = page['_scroll_id']
    remaining = page['hits']['total']

    processed = 0
    progress = Progress(remaining)
    while remaining > 0:
        page = ES_Manager.plain_post(
            '{0}/_search/scroll?scroll=1m'.format(self.es_url), data=scroll_id)
        remaining = len(page['hits']['hits'])
        scroll_id = page['_scroll_id']

        for hit in page['hits']['hits']:
            processed += 1
            progress.update(processed)

            fact = hit['_source']['facts']['fact']
            doc_path = hit['_source']['facts']['doc_path']
            if fact not in self.facts_structure:
                self.facts_structure[fact] = set()
            self.facts_structure[fact].add(doc_path)

            fact_link = u'{0}.{1}'.format(doc_path, fact)
            doc_id = hit['_source']['facts']['doc_id']
            known_links = self.get_texta_link_facts_by_id(doc_id)
            # None means the document could not be read; nothing to link.
            if known_links is None:
                continue
            # Only write back when the link is actually new.
            if fact_link in known_links:
                continue
            known_links.append(fact_link)
            self.update_texta_link_by_id(
                doc_id, {'texta_link': {'facts': known_links}})

        # Check errors in the database request
        # NOTE(review): performed once per scroll page - confirm placement.
        shards = page['_shards']
        no_shard_succeeded = shards['total'] > 0 and shards['successful'] == 0
        if no_shard_succeeded or page['timed_out']:
            report = 'Elasticsearch: *** Shards: {0} *** Timeout: {1} *** Took: {2}'.format(
                page['_shards'], page['timed_out'], page['took'])
            print(report)
    progress.done()
def _get_fact_hits(self):
    """Yield, one by one, every hit stored under the TEXTA mapping.

    Opens a scan/scroll search on ``self._index`` / ``self.TEXTA`` with an
    empty bool query and keeps scrolling until a page comes back without
    hits. Each scroll response is handed to ``self._check_es_error``
    before its hits are yielded.
    """
    base = self.es_url
    scroll_url = '{0}/_search/scroll?scroll=1m'.format(base)
    search_url = '{0}/{1}/{2}/_search?search_type=scan&scroll=1m&size=100'.format(
        base, self._index, self.TEXTA)

    body = json.dumps({u'query': {u'bool': {u'should': [], u'must': []}}})
    page = ES_Manager.plain_post(search_url, data=body)
    scroll_id = page['_scroll_id']
    pending = page['hits']['total']

    while pending > 0:
        page = ES_Manager.plain_post(scroll_url, data=scroll_id)
        scroll_id = page['_scroll_id']
        pending = len(page['hits']['hits'])
        self._check_es_error(page)
        for hit in page['hits']['hits']:
            yield hit
def update_texta_link_by_id(self, doc_id, texta_link):
    """Send a partial update for one document and return the response.

    Args:
        doc_id: id of the document inside ``self._index`` / ``self._type``.
        texta_link: JSON-serialisable value sent as the ``doc`` part of
            the ``_update`` request.

    Returns:
        Whatever ``ES_Manager.plain_post`` returns for the request.
    """
    request_url = '{0}/{1}/{2}/{3}/_update'.format(
        self.es_url, self._index, self._type, doc_id)
    payload = json.dumps({'doc': texta_link})
    return ES_Manager.plain_post(request_url, data=payload)
def check_if_has_texta_mapping(self):
    """Verify that the index contains the TEXTA mapping.

    Raises:
        CheckCritical: if ``self.TEXTA`` is not among the mappings
            reported for ``self._index``.
    """
    index_info = ES_Manager.plain_get(
        '{0}/{1}'.format(self.es_url, self._index))
    available = index_info[self._index]['mappings']
    if self.TEXTA in available:
        self.maybe_print('OK', 'Mapping [{0}] is present'.format(self.TEXTA))
        return
    raise CheckCritical('Mapping [{0}] was not found'.format(self.TEXTA))
def check_index_present(self):
    """Verify that ``self._index`` is listed by the ``_aliases`` endpoint.

    Raises:
        CheckCritical: if the index name is not among the returned keys.
    """
    aliases = ES_Manager.plain_get('{0}/_aliases'.format(self.es_url))
    if self._index in aliases.keys():
        self.maybe_print('OK', 'Index {0} is present'.format(self._index))
        return
    raise CheckCritical('Index {0} was not found'.format(self._index))
def get_texta_link_facts_by_id(self, doc_id):
    """Fetch the ``texta_link.facts`` list of a single document.

    Args:
        doc_id: id of the document inside ``self._index`` / ``self._type``.

    Returns:
        ``None`` when the document is not found or the response lacks an
        expected key; an empty list when the document exists but the
        response carries no ``fields``; otherwise the value of the
        ``texta_link.facts`` field.
    """
    request_url = '{0}/{1}/{2}/{3}?fields=texta_link.facts'.format(
        self.es_url, self._index, self._type, doc_id)
    response = ES_Manager.plain_get(request_url)
    try:
        if not response['found']:
            return None
        if 'fields' not in response:
            return []
        return response['fields']['texta_link.facts']
    except KeyError:
        return None
def main():
    """Command-line entry point.

    Expects ``sys.argv`` to be ``script port command [arguments]`` and
    talks to ``http://localhost:<port>``. Supported commands:

    * ``--indexes``: print the names returned by ``_aliases``.
    * ``--maps index_name``: print the mapping names of the index.
    * ``--check index_name``: run ``FactsCheck`` and print its summary.
    * ``--link index_name map_name``: run ``FactsLink.link_all``.

    Any exception is reported on standard output; the usage help is
    printed whenever no command ran to completion.
    """
    args = sys.argv
    script_name = args[0]
    # NOTE(review): '--link' takes two arguments but is registered with 1;
    # confirm how print_help interprets the middle element.
    commands = [
        ['--indexes', 0, 'python {0} port --indexes'.format(script_name)],
        # Was labelled '--check' although the usage text is for '--maps'.
        ['--maps', 1, 'python {0} port --maps index_name'.format(script_name)],
        ['--check', 1, 'python {0} port --check index_name'.format(script_name)],
        ['--link', 1,
         'python {0} port --link index_name map_name'.format(script_name)],
    ]
    try:
        # int() formats identically to long() and also exists on Python 3.
        port = int(args[1])
        c = args[2]
        es_url = 'http://localhost:{0}'.format(port)

        if c == '--indexes':
            request_url = '{0}/_aliases'.format(es_url)
            response = ES_Manager.plain_get(request_url)
            for k in response.keys():
                print(k)
            return

        if c == '--maps':
            _index = u'{0}'.format(args[3])
            request_url = '{0}/{1}'.format(es_url, _index)
            # plain_get is used as an already-parsed mapping by every other
            # caller; the former extra .json() call made this branch fail.
            response = ES_Manager.plain_get(request_url)
            for k in response[_index]['mappings'].keys():
                print(k)
            return

        if c == '--check':
            _index = u'{0}'.format(args[3])
            print('Checking... URL: {0}/{1} \n'.format(es_url, _index))
            start_time = time.time()
            check = FactsCheck(es_url, _index)
            check.check_all()
            check.summary()
            end_time = time.time()
            print('\n... total time: {0:2.2f} [min]'.format(
                (end_time - start_time) / 60.0))
            return

        if c == '--link':
            _index = u'{0}'.format(args[3])
            _type = u'{0}'.format(args[4])
            if _type == u'texta':
                raise Exception('Mapping link cant be texta!')
            print('Linking... URL: {0}/{1} - mapping: {2} \n'.format(
                es_url, _index, _type))
            start_time = time.time()
            link = FactsLink(es_url, _index, _type)
            link.link_all()
            end_time = time.time()
            print('\n... total time: {0:2.2f} [min]'.format(
                (end_time - start_time) / 60.0))
            return

    except Exception as e:
        # Top-level boundary of the script: report and fall through to help.
        print('--- Error: {0} \n'.format(e))
    print_help(commands)
def _get_total_facts(self):
    """Return the document count of the TEXTA mapping in ``self._index``.

    The request goes to the Elasticsearch instance configured in
    ``self.es_url`` (previously a hard-coded ``http://localhost:9200``,
    which ignored the port the checker was started with).

    Returns:
        The ``count`` value of the ``_count`` response.
    """
    request_url = '{0}/{1}/{2}/_count'.format(
        self.es_url, self._index, self.TEXTA)
    response = ES_Manager.plain_post(request_url)
    return response['count']
def _check_element(self, _id, fact):
    """Validate one fact record against the document it points to.

    Args:
        _id: identifier of the fact; only used in messages.
        fact: mapping providing ``doc_type``, ``fact``, ``doc_path``,
            ``doc_id`` and ``spans`` (a JSON-encoded list).

    Raises:
        CheckError: the fact name is empty or contains a dot, the
            referenced document does not exist, ``spans`` is not a JSON
            list, ``doc_path`` cannot be resolved inside the document, or
            no entry of the document's ``texta_link.facts`` contains
            ``doc_path``.

    Warnings are recorded through ``self._set_warning`` for a fact name
    longer than 100 characters, for empty spans, and for a span end that
    exceeds the length of the referenced field.
    """
    doc_type = fact['doc_type']
    fact_name = fact['fact']
    doc_path = fact['doc_path']
    doc_id = fact['doc_id']
    spans = fact['spans']

    # Check fact name size
    if len(fact_name) == 0:
        raise CheckError('Fact _id:{0} has empty fact_name'.format(_id))

    # Check fact name with dots
    if '.' in fact_name:
        raise CheckError(
            'Fact _id:{0} contains dot (.) - {1}'.format(_id, fact_name))

    # Check fact name max size (warning)
    if len(fact_name) > 100:
        self._set_warning('Fact _id:{0} has long fact_name'.format(_id))

    # Check doc_id and recover document; use the configured server rather
    # than a hard-coded localhost:9200.
    request_url = '{0}/{1}/{2}/{3}'.format(
        self.es_url, self._index, doc_type, doc_id)
    response = ES_Manager.plain_get(request_url)
    if not response['found']:
        raise CheckError(
            'Fact _id:{0} has an invalid document [doc_id:{1}]'.format(
                _id, doc_id))

    # Explicit validation instead of assert, which is stripped under -O.
    invalid_spans_msg = 'Fact _id:{0} has invalid spans field '.format(_id)
    try:
        spans = json.loads(spans)
    except (TypeError, ValueError):
        raise CheckError(invalid_spans_msg)
    if not isinstance(spans, list):
        raise CheckError(invalid_spans_msg)

    _source = response['_source']

    # Check spans
    if len(spans) == 0:
        self._set_warning('Fact _id:{0} has empty spans'.format(_id))

    # Check doc_path: walk the dotted path down into the document.
    doc = _source
    try:
        for part in doc_path.split('.'):
            doc = doc[part]
    except (KeyError, TypeError):
        # TypeError: an intermediate node is not a mapping.
        raise CheckError(
            'Fact _id:{0} has invalid doc_path [doc_path:{1}]'.format(
                _id, doc_path))

    # Check fact link
    is_linked = False
    if 'texta_link' in _source and 'facts' in _source['texta_link']:
        is_linked = any(
            doc_path in fact_link
            for fact_link in _source['texta_link']['facts'])
    if not is_linked:
        raise CheckError(
            'Fact _id:{0} is not linked with document [doc_id:{1}]'.format(
                _id, doc_id))

    # Check spanned content; empty spans were already reported above and
    # max() of an empty sequence would raise ValueError.
    if spans:
        len_field = len(doc) + 1
        max_span = max(s[1] for s in spans)
        if max_span > len_field:
            self._set_warning(
                'Fact _id:{0} has likely a wrong span'.format(_id))
def check_version(self):
    """Report the Elasticsearch version found at ``self.es_url``.

    Reads ``version.number`` from the root endpoint response and passes it
    to ``self.maybe_print`` with status ``'OK'``.
    """
    server_info = ES_Manager.plain_get('{0}'.format(self.es_url))
    version_number = server_info['version']['number']
    self.maybe_print('OK', 'ES version {0}'.format(version_number))
#es_url = 'http://127.0.0.1:9200'


def transform_document(doc_dict):
    """Return a copy of *doc_dict* with text and lemmas nested together.

    The top-level ``'content'`` and ``'lemmas'`` entries are replaced by a
    single ``'content'`` entry of the form
    ``{'text': <content>, 'lemmas': <lemmas>}``. The argument itself is
    left untouched (the previous version aliased and mutated it).

    Raises:
        KeyError: if ``'content'`` or ``'lemmas'`` is missing.
    """
    new_doc_dict = doc_dict.copy()
    content = new_doc_dict.pop('content')
    lemmas = new_doc_dict.pop('lemmas')
    new_doc_dict['content'] = {'text': content, 'lemmas': lemmas}
    return new_doc_dict


# Recreate the index with empty definitions for the document mapping and
# for 'facts'.
ES_Manager.plain_delete(es_url + '/' + index)
ES_Manager.plain_put(
    es_url + '/' + index,
    data=json.dumps({'mappings': {mapping: {}, 'facts': {}}}))

# Read the rows of data.csv straight out of the zip archive.
with ZipFile('data.zip', 'r') as zip_file:
    with zip_file.open('data.csv') as fin:
        reader = csv.DictReader(fin)
        data = []
        counter = 0
        for row in reader:
            counter += 1
            doc = transform_document(row)
def _get_total_facts(self):
    """Return the document count of the TEXTA mapping in ``self._index``.

    Returns:
        The ``count`` value of the ``_count`` response obtained from
        ``self.es_url``.
    """
    count_url = '/'.join(
        ['{0}', '{1}', '{2}', '_count']).format(
            self.es_url, self._index, self.TEXTA)
    return ES_Manager.plain_post(count_url)['count']
def transform_document(doc_dict): new_doc_dict = doc_dict content = doc_dict['content'] lemmas = doc_dict['lemmas'] del new_doc_dict['content'] del new_doc_dict['lemmas'] new_doc_dict['content'] = {'text': content, 'lemmas': lemmas} return new_doc_dict ES_Manager.plain_delete(es_url + '/' + index) ES_Manager.plain_put(es_url + '/' + index, data=json.dumps({'mappings': { mapping: {}, 'facts': {} }})) with ZipFile('data.zip', 'r') as zip_file: with zip_file.open('data.csv') as fin: reader = csv.DictReader(fin) data = [] counter = 0 for row in reader: