Beispiel #1
0
 def get_facts_structure(self):
     """Map every fact name to the set of document paths it appears in.

     Runs a scan/scroll search over the 'texta' type of ``self._index``,
     restricted to facts whose ``facts.doc_type`` equals the lower-cased
     ``self._type``, and walks all result pages while reporting progress.

     Returns:
         dict: fact name -> set of ``doc_path`` values seen for that fact.
     """
     scan_url = '{0}/{1}/{2}/_search?search_type=scan&scroll=1m&size=100'.format(
         self.es_url, self._index, 'texta')
     payload = json.dumps(
         {"query": {"term": {"facts.doc_type": self._type.lower()}}})
     page = ES_Manager.plain_post(scan_url, data=payload)
     scroll_id = page['_scroll_id']
     # The first response only reports the total; hits arrive via scrolling.
     remaining = page['hits']['total']
     progress = Progress(remaining)
     seen = 0
     paths_by_fact = {}
     scroll_url = '{0}/_search/scroll?scroll=1m'.format(self.es_url)
     # An empty page ends the loop.
     while remaining > 0:
         page = ES_Manager.plain_post(scroll_url, data=scroll_id)
         hits = page['hits']['hits']
         remaining = len(hits)
         scroll_id = page['_scroll_id']
         for hit in hits:
             seen += 1
             progress.update(seen)
             facts = hit['_source']['facts']
             fact_name = facts['fact']
             path = facts['doc_path']
             paths_by_fact.setdefault(fact_name, set()).add(path)
     progress.done()
     return paths_by_fact
Beispiel #2
0
    def link_all(self):
        """Attach a ``<doc_path>.<fact>`` link to every document that has facts.

        After building ``self.facts_structure``, scrolls through all 'texta'
        facts whose ``facts.doc_type`` equals the lower-cased ``self._type``.
        For each fact, the referenced document's ``texta_link.facts`` list is
        fetched and, when the link is not yet present, the list is extended
        and written back. Pages where every shard failed or the request timed
        out are reported on stdout.
        """
        self._build_facts_structure()
        print('- Total of unique facts.fact: {0}'.format(
            len(self.facts_structure.keys())))
        print('Linking ... ')

        scan_url = '{0}/{1}/{2}/_search?search_type=scan&scroll=1m&size=100'.format(
            self.es_url, self._index, 'texta')
        payload = json.dumps(
            {"query": {"term": {"facts.doc_type": self._type.lower()}}})
        page = ES_Manager.plain_post(scan_url, data=payload)
        scroll_id = page['_scroll_id']
        remaining = page['hits']['total']
        progress = Progress(remaining)
        processed = 0

        # An empty page ends the loop.
        while remaining > 0:
            page = ES_Manager.plain_post(
                '{0}/_search/scroll?scroll=1m'.format(self.es_url),
                data=scroll_id)
            hits = page['hits']['hits']
            remaining = len(hits)
            scroll_id = page['_scroll_id']

            for hit in hits:
                processed += 1
                progress.update(processed)

                facts = hit['_source']['facts']
                fact_name = facts['fact']
                path = facts['doc_path']
                self.facts_structure.setdefault(fact_name, set()).add(path)

                link = u'{0}.{1}'.format(path, fact_name)
                target_id = facts['doc_id']
                existing = self.get_texta_link_facts_by_id(target_id)
                # None means the document lookup failed; an already present
                # link needs no write.
                if existing is None or link in existing:
                    continue
                existing.append(link)
                self.update_texta_link_by_id(
                    target_id, {'texta_link': {'facts': existing}})

            # Check errors in the database request
            shards = page['_shards']
            all_shards_failed = (shards['total'] > 0
                                 and shards['successful'] == 0)
            if all_shards_failed or page['timed_out']:
                print('Elasticsearch: *** Shards: {0} *** Timeout: {1} *** Took: {2}'.format(
                    shards, page['timed_out'], page['took']))
        progress.done()
Beispiel #3
0
 def _get_fact_hits(self):
     """Yield every hit stored under the ``self.TEXTA`` type of the index.

     Uses a scan/scroll search with an empty bool query; each scrolled page
     is passed to ``self._check_es_error`` before its hits are yielded.
     """
     scan_url = '{0}/{1}/{2}/_search?search_type=scan&scroll=1m&size=100'.format(
         self.es_url, self._index, self.TEXTA)
     next_page_url = '{0}/_search/scroll?scroll=1m'.format(self.es_url)
     body = json.dumps({u'query': {u'bool': {u'should': [], u'must': []}}})
     page = ES_Manager.plain_post(scan_url, data=body)
     scroll_id = page['_scroll_id']
     pending = page['hits']['total']
     # An empty page ends the loop.
     while pending > 0:
         page = ES_Manager.plain_post(next_page_url, data=scroll_id)
         scroll_id = page['_scroll_id']
         hits = page['hits']['hits']
         pending = len(hits)
         self._check_es_error(page)
         for hit in hits:
             yield hit
Beispiel #4
0
 def update_texta_link_by_id(self, doc_id, texta_link):
     """POST a partial update for one document and return the response.

     Args:
         doc_id: id of the document under ``self._index`` / ``self._type``.
         texta_link: JSON-serialisable value sent as the ``doc`` part of
             the ``_update`` request.
     """
     update_url = '{0}/{1}/{2}/{3}/_update'.format(
         self.es_url, self._index, self._type, doc_id)
     payload = json.dumps({'doc': texta_link})
     return ES_Manager.plain_post(update_url, data=payload)
Beispiel #5
0
    def check_if_has_texta_mapping(self):
        """Verify that the index defines the ``self.TEXTA`` mapping.

        Raises:
            CheckCritical: if the mapping is absent from the index.
        """
        index_info = ES_Manager.plain_get(
            '{0}/{1}'.format(self.es_url, self._index))
        if self.TEXTA in index_info[self._index]['mappings']:
            self.maybe_print('OK',
                             'Mapping [{0}] is present'.format(self.TEXTA))
            return
        raise CheckCritical('Mapping [{0}] was not found'.format(self.TEXTA))
Beispiel #6
0
    def check_index_present(self):
        """Verify that ``self._index`` is among the keys of ``/_aliases``.

        Raises:
            CheckCritical: if the index name is not listed.
        """
        aliases = ES_Manager.plain_get('{0}/_aliases'.format(self.es_url))
        if self._index in aliases.keys():
            self.maybe_print('OK',
                             'Index {0} is present'.format(self._index))
            return
        raise CheckCritical('Index {0} was not found'.format(self._index))
Beispiel #7
0
 def get_texta_link_facts_by_id(self, doc_id):
     """Fetch the ``texta_link.facts`` field of one document.

     Returns:
         The stored list when the response carries a ``fields`` entry, an
         empty list when the document is found without that entry, and
         ``None`` when the document is not found or an expected key is
         missing from the response.
     """
     lookup_url = '{0}/{1}/{2}/{3}?fields=texta_link.facts'.format(
         self.es_url, self._index, self._type, doc_id)
     reply = ES_Manager.plain_get(lookup_url)
     try:
         links = [] if reply['found'] else None
         # Checked independently of 'found', as in the original flow.
         if 'fields' in reply:
             links = reply['fields']['texta_link.facts']
     except KeyError:
         links = None
     return links
Beispiel #8
0
def main():
    """Command-line entry point: ``python <script> <port> <command> [args]``.

    Commands, read from ``sys.argv``:
      --indexes                     print the index names from ``/_aliases``
      --maps index_name             print the mapping names of an index
      --check index_name            run ``FactsCheck`` and print its summary
      --link index_name map_name    run ``FactsLink`` (map_name must not be
                                    'texta')

    Elasticsearch is always addressed at ``http://localhost:<port>``. Any
    exception (bad port, missing arguments, request failure) is printed,
    after which the help text is shown. An unknown command also falls
    through to the help text.
    """
    args = sys.argv
    script_name = args[0]
    # Each entry is [flag, <number consumed by print_help>, usage example].
    # NOTE(review): the meaning of the middle element is defined by
    # print_help, which is not visible here; the values are kept unchanged.
    commands = [
        ['--indexes', 0, 'python {0} port --indexes'.format(script_name)],
        # Was labelled '--check' although the usage line documents --maps.
        ['--maps', 1, 'python {0} port --maps index_name'.format(script_name)],
        ['--check', 1,
         'python {0} port --check index_name'.format(script_name)],
        ['--link', 1,
         'python {0} port --link index_name map_name'.format(script_name)],
    ]
    try:
        # int() yields the same formatted port as long() did.
        port = int(args[1])
        c = args[2]
        es_url = 'http://localhost:{0}'.format(port)

        if c == '--indexes':
            request_url = '{0}/_aliases'.format(es_url)
            response = ES_Manager.plain_get(request_url)
            for k in response.keys():
                print(k)
            return

        if c == '--maps':
            _index = u'{0}'.format(args[3])
            request_url = '{0}/{1}'.format(es_url, _index)
            # plain_get is used as an already-decoded dict everywhere else
            # (e.g. the --indexes branch), so no extra .json() call here.
            response = ES_Manager.plain_get(request_url)
            for k in response[_index]['mappings'].keys():
                print(k)
            return

        if c == '--check':
            _index = u'{0}'.format(args[3])
            print('Checking... URL: {0}/{1} \n'.format(es_url, _index))
            start_time = time.time()
            check = FactsCheck(es_url, _index)
            check.check_all()
            check.summary()
            end_time = time.time()
            print('\n... total time: {0:2.2f} [min]'.format(
                (end_time - start_time) / 60.0))
            return

        if c == '--link':
            _index = u'{0}'.format(args[3])
            _type = u'{0}'.format(args[4])
            # 'texta' holds the facts themselves, so it cannot be a target.
            if _type == u'texta':
                raise Exception('Mapping link cant be texta!')
            print('Linking... URL: {0}/{1} - mapping: {2} \n'.format(
                es_url, _index, _type))
            start_time = time.time()
            link = FactsLink(es_url, _index, _type)
            link.link_all()
            end_time = time.time()
            print('\n... total time: {0:2.2f} [min]'.format(
                (end_time - start_time) / 60.0))
            return

    except Exception as e:
        # Top-level boundary: report the problem, then show the usage.
        print('--- Error: {0} \n'.format(e))
    print_help(commands)
Beispiel #9
0
 def _get_total_facts(self):
     """Return the number of documents stored under the ``self.TEXTA`` type.

     The count is requested from the instance's ``es_url`` so that a
     non-default host/port is honoured. The previous hard-coded
     ``http://localhost:9200`` is kept only as a fallback for instances
     that do not define ``es_url``.
     """
     es_url = getattr(self, 'es_url', 'http://localhost:9200')
     request_url = '{0}/{1}/{2}/_count'.format(es_url, self._index,
                                               self.TEXTA)
     response = ES_Manager.plain_post(request_url)
     return response['count']
Beispiel #10
0
    def _check_element(self, _id, fact):
        """Validate one fact record against the document it refers to.

        Args:
            _id: identifier of the fact; used only in messages.
            fact: mapping with the keys 'doc_type', 'fact', 'doc_path',
                'doc_id' and 'spans'. 'spans' must be a JSON-encoded list.

        Raises:
            CheckError: if the fact name is empty or contains a dot, the
                document cannot be found, 'spans' is not a JSON list,
                'doc_path' does not resolve inside the document, or the
                document carries no 'texta_link.facts' entry containing
                'doc_path'.

        Over-long fact names, empty spans and spans reaching past the end of
        the field are recorded through ``self._set_warning`` only.
        """
        doc_type = fact['doc_type']
        fact_name = fact['fact']
        doc_path = fact['doc_path']
        doc_id = fact['doc_id']
        spans = fact['spans']

        # Check fact name size
        if not fact_name:
            raise CheckError('Fact _id:{0} has empty fact_name'.format(_id))

        # Check fact name with dots
        if '.' in fact_name:
            raise CheckError('Fact _id:{0} contains dot (.) - {1}'.format(
                _id, fact_name))

        # Check fact name max size (warning)
        if len(fact_name) > 100:
            self._set_warning('Fact _id:{0} has long fact_name'.format(_id))

        # Check doc_id and recover document. Use the configured server; the
        # former hard-coded address is kept only as a fallback for instances
        # that do not define es_url.
        es_url = getattr(self, 'es_url', 'http://localhost:9200')
        request_url = '{0}/{1}/{2}/{3}'.format(es_url, self._index, doc_type,
                                               doc_id)
        response = ES_Manager.plain_get(request_url)
        if not response['found']:
            raise CheckError(
                'Fact _id:{0} has an invalid document [doc_id:{1}]'.format(
                    _id, doc_id))

        # Explicit type check instead of assert, which is stripped under -O.
        invalid_spans_msg = 'Fact _id:{0} has invalid spans field '.format(_id)
        try:
            spans = json.loads(spans)
        except (TypeError, ValueError):
            raise CheckError(invalid_spans_msg)
        if not isinstance(spans, list):
            raise CheckError(invalid_spans_msg)

        _source = response['_source']

        # Check spans
        if len(spans) == 0:
            self._set_warning('Fact _id:{0} has empty spans'.format(_id))

        # Check doc_path. TypeError covers a path that walks through a
        # non-mapping value (e.g. a string) before its last component.
        doc = _source
        try:
            for part in doc_path.split('.'):
                doc = doc[part]
        except (KeyError, TypeError):
            raise CheckError(
                'Fact _id:{0} has invalid doc_path [doc_path:{1}]'.format(
                    _id, doc_path))

        # Check fact link
        if 'texta_link' not in _source or 'facts' not in _source['texta_link']:
            is_linked = False
        else:
            is_linked = any(doc_path in fact_link
                            for fact_link in _source['texta_link']['facts'])
        if not is_linked:
            raise CheckError(
                'Fact _id:{0} is not linked with document [doc_id:{1}]'.format(
                    _id, doc_id))

        # Check spanned content. Empty spans were already reported above;
        # max() of an empty sequence would raise ValueError.
        if spans:
            len_field = len(doc) + 1
            max_span = max(s[1] for s in spans)
            if max_span > len_field:
                self._set_warning(
                    'Fact _id:{0} has likely a wrong span'.format(_id))
Beispiel #11
0
 def check_version(self):
     """Report the version number returned by the server's root URL."""
     info = ES_Manager.plain_get('{0}'.format(self.es_url))
     version_number = info['version']['number']
     self.maybe_print('OK', 'ES version {0}'.format(version_number))
Beispiel #12
0
#es_url = 'http://127.0.0.1:9200'

def transform_document(doc_dict):
    """Return a copy of *doc_dict* with text and lemmas nested under 'content'.

    The top-level 'content' and 'lemmas' values are replaced by a single
    'content' entry of the form ``{'text': <content>, 'lemmas': <lemmas>}``.
    All other keys are carried over unchanged. The input mapping itself is
    left untouched.

    Raises:
        KeyError: if 'content' or 'lemmas' is missing.
    """
    # Work on a shallow copy so the caller's row is not modified.
    new_doc_dict = doc_dict.copy()
    content = new_doc_dict.pop('content')
    lemmas = new_doc_dict.pop('lemmas')
    new_doc_dict['content'] = {'text': content, 'lemmas': lemmas}
    return new_doc_dict

# Reset the target index: remove it, then create it again with two empty
# mappings (the one named by `mapping`, plus 'facts').
# NOTE(review): `es_url`, `index` and `mapping` are not defined in the visible
# part of this script (the only `es_url` assignment above is commented out);
# confirm where they come from.
ES_Manager.plain_delete(es_url+'/'+index)
ES_Manager.plain_put(es_url+'/'+index, data=json.dumps({'mappings':{mapping:{},'facts':{}}}))

# Read the rows of data.csv straight out of data.zip.
with ZipFile('data.zip','r') as zip_file:
    with zip_file.open('data.csv') as fin:
        reader = csv.DictReader(fin)

        # NOTE(review): `data` is not used in the visible part of the loop;
        # presumably a batch buffer filled further down - confirm.
        data = []

        # Number of rows read so far.
        counter = 0

        for row in reader:
            counter += 1
            
            # Nest the row's 'content' and 'lemmas' under a single 'content' key.
            doc = transform_document(row)
            
Beispiel #13
0
 def _get_total_facts(self):
     """Return the ``count`` reported for the ``self.TEXTA`` type of the index."""
     count_url = '{0}/{1}/{2}/_count'.format(
         self.es_url, self._index, self.TEXTA)
     return ES_Manager.plain_post(count_url)['count']
Beispiel #14
0

def transform_document(doc_dict):
    """Return a copy of *doc_dict* with text and lemmas nested under 'content'.

    The top-level 'content' and 'lemmas' values are replaced by a single
    'content' entry of the form ``{'text': <content>, 'lemmas': <lemmas>}``.
    All other keys are carried over unchanged. The input mapping itself is
    left untouched.

    Raises:
        KeyError: if 'content' or 'lemmas' is missing.
    """
    # Work on a shallow copy so the caller's row is not modified.
    new_doc_dict = doc_dict.copy()
    content = new_doc_dict.pop('content')
    lemmas = new_doc_dict.pop('lemmas')

    new_doc_dict['content'] = {'text': content, 'lemmas': lemmas}

    return new_doc_dict


# Reset the target index: remove it, then create it again with two empty
# mappings (the one named by `mapping`, plus 'facts').
# NOTE(review): `es_url`, `index` and `mapping` are not defined in the visible
# part of this script; confirm where they come from.
ES_Manager.plain_delete(es_url + '/' + index)
ES_Manager.plain_put(es_url + '/' + index,
                     data=json.dumps({'mappings': {
                         mapping: {},
                         'facts': {}
                     }}))

with ZipFile('data.zip', 'r') as zip_file:
    with zip_file.open('data.csv') as fin:
        reader = csv.DictReader(fin)

        data = []

        counter = 0

        for row in reader: