class ElasticSearchBackend(BaseBackend):
    
    def __init__(self, es_url='http://localhost:9200/', batch_size=10, **kwargs):
        """
        Do what is necessary to create/open the index.
        """
        self.batch_size = batch_size
        self.batch_count = 0
        self.es_url = es_url
        self.fast = kwargs.get('fast', False)
        if kwargs.get('noisy', False):
            from logging import getLogger, StreamHandler, DEBUG
            import sys
            logger = getLogger('pyelasticsearch')
            logger.setLevel(DEBUG)
            logger.addHandler(StreamHandler(sys.stdout))
            
        self.es = ElasticSearch(self.es_url)
        try:
            self.es.count('*')
        except ConnectionError:
            print "Error connecting to ElasticSearch server!"
            raise
        self.urls = defaultdict(set) #track urls to be deleted before committing new content
        self.batches = defaultdict(list) #site: [list of docs]
    
    def create_index(self, name):
        name = name.lower()
        try:
            self.es.create_index(name)
            self.update_mapping(name)
        except Exception, e:
            print e
            return
Example no. 2
def analyze_post(token, text):
    response = {
        'post_now': False,
        'hours_to_wait': 1,
        'total_score': 0,
        'time_score': 0,
        'text_score': 0,
        'hint': "Building index",
    }

    try:
        data = Newsfeed.filter_only_posts_by_people(token)

    except Exception, e:
        es = ElasticSearch('http://localhost:9200/')

        try:
            es.create_index(token.lower())
            Newsfeed.newsfeed(token, [], 0, None, 1)

            t = threading.Thread(target=Newsfeed.newsfeed, args=(token, [], 0, None, 1500))
            t.setDaemon(True)
            t.start()

        except Exception, e:
            print e.message
Example no. 3
def update_process_datetime(doc_id, timestamp):
  ''' Updates the last_update_date for the document id passed into function.
    The document id in will be the name of another index in the cluster.
  '''
  connection_string = 'http://localhost:9200'
  process_index = 'openfdametadata'
  _type = 'last_run'
  _map = {}
  _map[_type] = {}
  _map[_type]['properties'] = {}
  _map[_type]['properties']['last_update_date'] = {}
  _map[_type]['properties']['last_update_date']['type'] = 'date'
  _map[_type]['properties']['last_update_date']['format'] = 'dateOptionalTime'

  es = ElasticSearch(connection_string)
  try:
    es.create_index(process_index)
    logging.info('Creating index %s', process_index)
  except exceptions.IndexAlreadyExistsError as e:
    logging.info('%s already exists', process_index)

  try:
    es.put_mapping(process_index, doc_type=_type, mapping=_map)
    logging.info('Successfully created mapping')
  except:
    logging.fatal('Could not create the mapping')

  new_doc = {}
  new_doc['last_update_date'] = timestamp
  es.index(process_index,
           doc_type=_type,
           id=doc_id,
           doc=new_doc,
           overwrite_existing=True)
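As a quick usage sketch (both argument values below are made up for illustration, not taken from the code above), the function is called with the name of the index whose run is being recorded and a timestamp:

# Hypothetical call: record when the "drugevent" index was last refreshed.
update_process_datetime('drugevent', '2015-06-01T00:00:00')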
Example no. 4
def main():
    """
    Method to kick things off
    """

    # Setup workers
    pool = Pool(processes=CPU_COUNT)

    # Prepare URLs
    urls = []
    for url in CRAWL_URLS:
        urls.append(str(BASE_URL + url))

    if USE_ES:
        # Create connection
        es = ElasticSearch(ES_URL)

        try:
            # Delete the existing index
            es.delete_index(ES_INDEX)
        except:
            # In case the index does not exist
            pass

        # Create the index to use
        es.create_index(ES_INDEX)

    else:
        # Setup the database tables, connect
        init_db()

    # Scrape and store async
    pool.map(scrape, urls)
Example no. 5
def IndexData(request):
    es = ElasticSearch(settings.ELASTIC_SEARCH)
    for file in fileHolder:
        index = file['segment_name'].lower()
        rawfiles = file['rawfiles']
        data_for_es = file['dataFrames']
        try:
            es.delete_index(index.replace(" ", ""))
        except:
            pass
    es.create_index(index.replace(" ", ""))

    ## Loop over the dataframe and index its records into Elasticsearch
    docs = json.loads(data_for_es.to_json(orient='records'))
    es.bulk((es.index_op(doc) for doc in docs),
            index=index.replace(" ", ""),
            doc_type=index)

    ##Create segment template
    file_names = []
    for file in rawfiles:
        file_names.append(file.name)

    segment = Segments(name=index,
                       files_added=",".join(file_names),
                       es_index=index.replace(" ", ""))
    segment.save()

    segment = Segments.objects.get(name=index)

    return render(request, 'analyse.html', {'segment': segment})
Example no. 6
def update_process_datetime(doc_id, timestamp):
    ''' Updates the last_update_date for the document id passed into function.
    The document id in will be the name of another index in the cluster.
  '''
    connection_string = 'http://localhost:9200'
    process_index = 'openfdametadata'
    _type = 'last_run'
    _map = {}
    _map[_type] = {}
    _map[_type]['properties'] = {}
    _map[_type]['properties']['last_update_date'] = {}
    _map[_type]['properties']['last_update_date']['type'] = 'date'
    _map[_type]['properties']['last_update_date'][
        'format'] = 'dateOptionalTime'

    es = ElasticSearch(connection_string)
    try:
        es.create_index(process_index)
        logging.info('Creating index %s', process_index)
    except exceptions.IndexAlreadyExistsError as e:
        logging.info('%s already exists', process_index)

    try:
        es.put_mapping(process_index, doc_type=_type, mapping=_map)
        logging.info('Successfully created mapping')
    except:
        logging.fatal('Could not create the mapping')

    new_doc = {}
    new_doc['last_update_date'] = timestamp
    es.index(process_index,
             doc_type=_type,
             id=doc_id,
             doc=new_doc,
             overwrite_existing=True)
Example no. 7
def init_schema():
    """Should be called at application startup. Makes sure the mappings and
    index exist."""
    es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)
    try:
        es.create_index(settings.ELASTIC_SEARCH_INDEX)
    except IndexAlreadyExistsError:
        pass

    #   Does not replace if exact mapping already exists
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                   {'reg_tree': {
                       'properties': NODE_SEARCH_SCHEMA
                   }})
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'layer',
                   {'layer': {
                       'properties': LAYER_SCHEMA
                   }})
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'notice',
                   {'notice': {
                       'properties': LAYER_SCHEMA
                   }})
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'diff',
                   {'diff': {
                       'properties': DIFF_SCHEMA
                   }})
Example no. 8
def cli(index_name, delete_index, mapping_file, settings_file, doc_type,
        import_file, delimiter, tab, host, docs_per_chunk, bytes_per_chunk,
        parallel, quiet):
    """
    Bulk import a delimited file into a target Elasticsearch instance. Common
    delimited files include things like CSV and TSV.

    \b
    Load a CSV file:
      csv2es --index-name potatoes --doc-type potato --import-file potatoes.csv
    \b
    For a TSV file, note the tab delimiter option
      csv2es --index-name tomatoes --doc-type tomato \
             --import-file tomatoes.tsv --tab
    \b
    For a nifty pipe-delimited file (delimiters must be one character):
      csv2es --index-name pipes --doc-type pipe --import-file pipes.psv \
             --delimiter '|'

    """

    echo('Using host: ' + host, quiet)
    es = ElasticSearch(host)

    if delete_index:
        try:
            es.delete_index(index_name)
            echo('Deleted: ' + index_name, quiet)
        except ElasticHttpNotFoundError:
            echo('Index ' + index_name + ' not found, nothing to delete',
                 quiet)

    try:
        if settings_file:
            echo('Applying mapping from: ' + settings_file, quiet)
            with open(settings_file) as f:
                settings = json.loads(f.read())
            es.create_index(index_name, settings)
        else:
            es.create_index(index_name)
        echo('Created new index: ' + index_name, quiet)
    except ElasticHttpError as e:
        if e.error['type'] == 'index_already_exists_exception':
            echo('Index ' + index_name + ' already exists', quiet)
        else:
            raise

    echo('Using document type: ' + doc_type, quiet)
    if mapping_file:
        echo('Applying mapping from: ' + mapping_file, quiet)
        with open(mapping_file) as f:
            mapping = json.loads(f.read())
        es.put_mapping(index_name, doc_type, mapping)

    target_delimiter = sanitize_delimiter(delimiter, tab)
    documents = documents_from_file(es, import_file, target_delimiter, quiet)
    perform_bulk_index(host, index_name, doc_type, documents, docs_per_chunk,
                       bytes_per_chunk, parallel)
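documents_from_file and perform_bulk_index are csv2es helpers that are not shown here. A minimal sketch of what the document generator could look like, assuming it yields one bulk index operation per CSV row via es.index_op (the real helper may differ):

import csv

def documents_from_file_sketch(es, import_file, delimiter, quiet):
    # Read the delimited file and yield one bulk index op per row.
    with open(import_file) as f:
        for row in csv.DictReader(f, delimiter=delimiter):
            yield es.index_op(row)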
Example no. 9
def cli(index_name, delete_index, mapping_file, settings_file, doc_type, host,
        docs_per_chunk, bytes_per_chunk, parallel, quiet, parser, config_file,
        user, passwd):

    with open(config_file, "rb") as f:
        con = json.loads(f.read())
    host = con['es_config']['host']
    echo('Using host: ' + host, quiet)
    es = ElasticSearch(host)

    if con['db']['type'] == "oracle":
        db = import_module('cx_Oracle')
        collection = db.connect(user, passwd, con['db']['con_str'])
    else:
        db = import_module('MySQLdb')
        collection = db.connect(con['db']['con_str'][0],
                                user,
                                passwd,
                                con['db']['con_str'][1],
                                charset=con['db']['con_str'][2])

    if delete_index:  # delete the existing index
        try:
            stamp = 0
            es.delete_index(index_name)
            echo('Deleted: ' + index_name, quiet)
        except ElasticHttpNotFoundError:
            echo('Index ' + index_name + ' not found, nothing to delete',
                 quiet)

    try:
        if settings_file:
            with open(settings_file, 'r') as f:
                settings_json = json.loads(f.read())
            es.create_index(index_name, settings=settings_json)
        else:
            es.create_index(index_name)
        echo('Created new index: ' + index_name, quiet)
    except Exception:
        echo('Index ' + index_name + ' already exists', quiet)

    echo('Using document type: ' + doc_type, quiet)

    es.put_mapping(index_name, doc_type, con['mapping'])

    parser_fun = None
    if parser is not None:
        # load the parser function
        parser_fun = import_module(PARSER_PATH + '.' + parser)

    documents = documents_from_file(es, collection, quiet, parser_fun, con)

    perform_bulk_index(host, index_name, doc_type, documents, docs_per_chunk,
                       bytes_per_chunk, parallel)
    print "end:" + time.strftime(
        ISOTIMEFORMAT, time.localtime()) + '/n all records import complete.'
Example no. 10
def import_json_into_es(types, inputfolder, logger):
    """
    imports entities from the *name.json.bz2* files (one entity per line) into the local Elasticsearch
    :param types: json string like {'person': 'http://www.wikidata.org/entity/Q5', 'name': 'Wikidata-URI'}
    :param inputfolder:
    :param logger:
    :return:
    """

    es = ElasticSearch(config.ELASTICSEARCH_URL)

    try:
        es.delete_index('wikidata')
        es.create_index('wikidata')
        logger.info('rebuild index [wikidata]')
    except:
        logger.warning("can't delete wikidata index")


    # convert type dictionary
    wd_types = dict()
    for key in types.keys():
        value = int(types[key].split('/')[-1][1:])
        wd_types[value] = {'type': key,
                           'filename': path.join(inputfolder, '{}.json.bz2'.format(key))}


    # import each given type
    for key in wd_types:
        logger.info(wd_types[key])

        done = 0
        items = []

        for line in BZ2File(wd_types[key]['filename'],'rb'):
            line = line.strip()
            item = loads(line)
            item['uri'] = 'http://wikidata.org/wiki/' + item['id']

            items.append(item)
            done += 1

            if ( done % 5000 == 0 ):
                es.bulk_index('wikidata', wd_types[key]['type'], items, id_field='id')
                items = []

            # if done % len(wd_types) / 10 == 0: # log 10% steps
            #     logger.info('imported {}: {:,d} ({:,d})'.format(wd_types[key]['type'],done, 100*len(wd_types)/done ))

            if done % 10000 == 0:
                logger.info('imported {}: {}'.format(wd_types[key]['type'],format(done, ',d')))

        if len(items) > 0:
            es.bulk_index('wikidata', wd_types[key]['type'], items, id_field='id')
        logger.info('imported {}: {}'.format(wd_types[key]['type'],format(done, ',d')))
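Following the docstring's format for the types argument, a call might look like this (the folder path and logger name are placeholders):

import logging

# Hypothetical invocation: the folder is expected to contain person.json.bz2.
types = {'person': 'http://www.wikidata.org/entity/Q5'}
import_json_into_es(types, '/data/wikidata-dumps', logging.getLogger('wikidata-import'))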
Example no. 11
def feed(index='monolith', type='downloads', es_port=9200):
    client = ElasticSearch('http://0.0.0.0:%d/' % es_port)
    platforms = ['Mac OS X', 'Windows 8', 'Ubuntu']

    # indexing a year of data (2012)
    first_day = datetime.datetime(2012, 1, 1)
    last_day = datetime.datetime(2012, 12, 31)
    day_range = last_day - first_day

    for month in range(1, 13):
        name = 'time_2012-%.2d' % month
        try:
            client.delete_index(name)
        except Exception:
            pass
        client.create_index(name,
                            settings={
                                'number_of_shards': 1,
                                'number_of_replicas': 0,
                                'analysis': {
                                    'analyzer': {
                                        'default': {
                                            'type': 'custom',
                                            'tokenizer': 'keyword'
                                        }
                                    }
                                },
                                'store': {
                                    'compress': {
                                        'stored': 'true'
                                    }
                                },
                            })

    # indexing 100 apps
    for add_on in range(100):
        docs = defaultdict(list)
        for delta in range(day_range.days):
            date = first_day + datetime.timedelta(days=delta)
            data = {
                'date': date,
                'os': random.choice(platforms),
                'downloads_count': random.randint(1000, 1500),
                'users_count': random.randint(10000, 15000),
                'add_on': add_on + 1
            }
            docs[date.month].append(data)
        for month, values in docs.items():
            client.bulk_index('time_2012-%.2d' % month, type, values)
            sys.stdout.write('.')
            sys.stdout.flush()

    client.optimize('time_*', max_num_segments=1, wait_for_merge=True)
    client.flush()
    sys.stdout.write('\nDone!\n')
Example no. 12
    def setUp(self):
        es_connection = ElasticSearch('http://localhost:9200')
        try:
            es_connection.delete_index('unit_tests')
        except:
            pass
        es_connection.create_index('unit_tests')

        class TestModel(SearchModel):
            index_name = 'unit_tests'

        self.model = TestModel
Example no. 13
def index_data(data_source, index_name, doc_type):
    es = ElasticSearch(urls='http://localhost', port=9200)
    try:
        es.delete_index(index_name)
    except:
        pass
    es.create_index(index_name)
    try:
        es.bulk_index(index_name, doc_type, data_source)
    except:
        print("Error! Skipping Document...!")
        pass
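A hypothetical call, assuming data_source is a list of dicts (one per document):

docs = [{'id': 1, 'title': 'first doc'}, {'id': 2, 'title': 'second doc'}]
index_data(docs, 'my_index', 'my_doc_type')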
Example no. 14
def cli(index_name, delete_index, mapping_file, doc_type, import_file,
        delimiter, tab, host, docs_per_chunk, bytes_per_chunk, parallel, quiet,
        document_id_in_file):
    """
    Bulk import a delimited file into a target Elasticsearch instance. Common
    delimited files include things like CSV and TSV.

    \b
    Load a CSV file:
      csv2es --index-name potatoes --doc-type potato --import-file potatoes.csv
    \b
    For a TSV file, note the tab delimiter option
      csv2es --index-name tomatoes --doc-type tomato --import-file tomatoes.tsv --tab
    \b
    For a nifty pipe-delimited file (delimiters must be one character):
      csv2es --index-name pipes --doc-type pipe --import-file pipes.psv --delimiter '|'

    """

    echo('Using host: ' + host, quiet)
    es = ElasticSearch(host)

    if delete_index:
        try:
            es.delete_index(index_name)
            echo('Deleted: ' + index_name, quiet)
        except ElasticHttpNotFoundError:
            echo('Index ' + index_name + ' not found, nothing to delete',
                 quiet)

    try:
        es.create_index(index_name)
        echo('Created new index: ' + index_name, quiet)
    except IndexAlreadyExistsError:
        echo('Index ' + index_name + ' already exists', quiet)
    except ElasticHttpError as exception:
        echo(
            'Error creating index %s. ElasticHttpError [%s]' %
            (index_name, exception.error), quiet)

    echo('Using document type: ' + doc_type, quiet)
    if mapping_file:
        echo('Applying mapping from: ' + mapping_file, quiet)
        with open(mapping_file) as f:
            mapping = json.loads(f.read())
        es.put_mapping(index_name, doc_type, mapping)

    target_delimiter = sanitize_delimiter(delimiter, tab)
    documents = documents_from_file(es, import_file, target_delimiter, quiet,
                                    document_id_in_file)
    perform_bulk_index(host, index_name, doc_type, documents, docs_per_chunk,
                       bytes_per_chunk, parallel)
Example no. 15
def es_indexer():
    es=ElasticSearch('http://localhost:9200/')
    if es:
        # Delete index /sentiment_analysis if it already exists
        try:
            es.delete_index("sentiment_analysis")
            print "Deleted index sentiment_analysis if it already existed."
        except:
            raise 'ElasticHttpNotFoundError'
        finally:
            print "Creating index sentiment_analysis ...."
            es.create_index("sentiment_analysis",{
                                    'settings': {
                                        'index': {
                                            'store': {
                                                'type': "default"
                                            },
                                            'number_of_shards': 1,
                                            'number_of_replicas': 1
                                        },
                                        'analysis': {
                                            'analyzer': {
                                                'default_english': {
                                                    'type': 'english'
                                                }
                                            }
                                        }
                                    },
                                    "mappings": {
                                        "document": {
                                            "properties": {
                                                "text": {
                                                    "type": "string",
                                                    "store": True,
                                                    "index": "analyzed",
                                                    "term_vector": "with_positions_offsets_payloads",
                                                    "analyzer": "default_english"
                                                },
                                                "sentiment": {
                                                    "type": "string",
                                                    "store": True,
                                                    "index": "analyzed",
                                                    "analyzer": "default_english"
                                                }
                                            }
                                        }
                                    }
                                })
            print "Created index 'sentiment_analysis' with type 'document' and an analyzed field 'text'."
    else:
        print "ElasticSearch is not running or the default cluster is down."
Example no. 16
class Indexer(object):
  
  def __init__(self, input):
    self.input = input
    self.es = ElasticSearch()
    self.index_name = "psim"
    self.doc_type = 'book'
    
  def delete_index(self):
    # Delete index if already found one
    try:
      self.es.delete_index(index = self.index_name)
    except Exception:
      pass
  
  def create_index(self):
    self.es.create_index(index=self.index_name, settings = self.get_index_settings())
    
  def get_index_settings(self):
    settings = {
                        "mappings": {
                           "book": {
                             "_all" : {"enabled" : "false"},       
                             "properties": {
                                "codes": {"type": "string",
                                         "term_vector": "yes",
                                         "store": "true"},
                                "pid" : {"type" : "string"},
                                "embedding": {"type": "float",
                                              "store": "true"},
                                "magnitude": {"type": "float", "store": "true"}
                             }     
                           }
                        }
               }
    return settings
  
  def documents(self):
    with open(self.input) as input_file:
      for line in input_file:
        json_doc = json.loads(line)
        yield self.es.index_op(json_doc, doc_type=self.doc_type)
    
  def index(self):
    self.delete_index()
    self.create_index()
    for chunk in bulk_chunks(self.documents(), docs_per_chunk=1000):
      self.es.bulk(chunk, index = self.index_name, doc_type = self.doc_type)
    self.es.refresh(self.index_name)
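A hypothetical usage, assuming books.jsonl contains one JSON document per line:

indexer = Indexer('books.jsonl')
indexer.index()  # drops any old "psim" index, recreates it, bulk-loads the file and refreshes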
Example no. 17
def index_data(data_path, chunksize, index_name, doc_type):
    f = open(data_path)
    csvfile = pd.read_csv(f, iterator=True, chunksize=chunksize)
    es = ElasticSearch(urls='http://localhost', port=9200)
    try:
        es.delete_index(index_name)
    except:
        pass
    es.create_index(index_name)
    for i, df in enumerate(csvfile):
        records = df.where(pd.notnull(df), None).T.to_dict()
        records_list = [records[i] for i in records]
        try:
            es.bulk_index(index_name, doc_type, records_list)
        except:
            print("Error! Skipping chunk...!")
            pass
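A hypothetical call, streaming a large CSV into Elasticsearch 5000 rows at a time:

index_data('events.csv', 5000, 'events', 'event')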
Example no. 18
 def test_cluster_size_3(self):
     cluster = self._make_one(size=3)
     cluster.start()
     self.assertEqual(len(cluster), 3)
     self.assertEqual(len(cluster.hosts), 3)
     self.assertEqual(len(os.listdir(cluster.working_path)), 3)
     self.assertEqual(len(cluster.urls), 3)
     client = ElasticSearch(cluster.urls, max_retries=2)
     self.assertEqual(client.health()['number_of_nodes'], 3)
     # test if routing works and data is actually distributed across nodes
     client.create_index('test_shards', settings={
         'number_of_shards': 1,
         'number_of_replicas': 2,
     })
     client.index('test_shards', 'spam', {'eggs': 'bacon'})
     client.refresh('test_shards')
     shard_info = client.status()['indices']['test_shards']['shards']['0']
     nodes = set([s['routing']['node'] for s in shard_info])
     self.assertTrue(len(nodes) > 1)
Example no. 19
def feed(index='monolith', type='downloads', es_port=9200):
    client = ElasticSearch('http://0.0.0.0:%d/' % es_port)
    platforms = ['Mac OS X', 'Windows 8', 'Ubuntu']

    # indexing a year of data (2012)
    first_day = datetime.datetime(2012, 1, 1)
    last_day = datetime.datetime(2012, 12, 31)
    day_range = last_day - first_day

    for month in range(1, 13):
        name = 'time_2012-%.2d' % month
        try:
            client.delete_index(name)
        except Exception:
            pass
        client.create_index(name, settings={
            'number_of_shards': 1,
            'number_of_replicas': 0,
            'analysis': {'analyzer': {'default': {
                'type': 'custom', 'tokenizer': 'keyword'
            }}},
            'store': {'compress': {'stored': 'true'}},
        })

    # indexing 100 apps
    for add_on in range(100):
        docs = defaultdict(list)
        for delta in range(day_range.days):
            date = first_day + datetime.timedelta(days=delta)
            data = {'date': date,
                    'os': random.choice(platforms),
                    'downloads_count': random.randint(1000, 1500),
                    'users_count': random.randint(10000, 15000),
                    'add_on': add_on + 1}
            docs[date.month].append(data)
        for month, values in docs.items():
            client.bulk_index('time_2012-%.2d' % month, type, values)
            sys.stdout.write('.')
            sys.stdout.flush()

    client.optimize('time_*', max_num_segments=1, wait_for_merge=True)
    client.flush()
    sys.stdout.write('\nDone!\n')
Example no. 20
def init_schema():
    """Should be called at application startup. Makes sure the mappings and
    index exist."""
    es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)
    try:
        es.create_index(settings.ELASTIC_SEARCH_INDEX)
    except IndexAlreadyExistsError:
        pass

    #   Does not replace if exact mapping already exists
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'reg_tree', {
        'reg_tree': {'properties': NODE_SEARCH_SCHEMA}
    })
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'layer', {
        'layer': {'properties': LAYER_SCHEMA}
    })
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'notice', {
        'notice': {'properties': LAYER_SCHEMA}
    })
    es.put_mapping(settings.ELASTIC_SEARCH_INDEX, 'diff', {
        'diff': {'properties': DIFF_SCHEMA}
    })
Example no. 21
class ElasticSearchBackend(BaseBackend):
    def __init__(self,
                 es_url='http://localhost:9200/',
                 batch_size=10,
                 **kwargs):
        """
        Do what is necessary to create/open the index.
        """
        self.batch_size = batch_size
        self.batch_count = 0
        self.es_url = es_url
        self.fast = kwargs.get('fast', False)
        if kwargs.get('noisy', False):
            from logging import getLogger, StreamHandler, DEBUG
            import sys
            logger = getLogger('pyelasticsearch')
            logger.setLevel(DEBUG)
            logger.addHandler(StreamHandler(sys.stdout))

        self.es = ElasticSearch(self.es_url)
        try:
            self.es.count('*')
        except ConnectionError:
            print "Error connecting to ElasticSearch server!"
            raise
        self.urls = defaultdict(
            set)  #track urls to be deleted before committing new content
        self.batches = defaultdict(list)  #site: [list of docs]

    def create_index(self, name):
        name = name.lower()
        try:
            self.es.create_index(name)
            self.update_mapping(name)
        except Exception, e:
            print e
            return
Example no. 22
class LBRest():

    def __init__(self, base=None, idx_exp_url=None):
        self.base = base
        self.idx_exp_url = idx_exp_url
        if self.idx_exp_url is not None:
            http, space, address, _index, _type = self.idx_exp_url.split('/')
            self.es = ElasticSearch('/'.join([http, space, address]))

    def get_bases(self):
        """ Get all bases which has to index registries
        """
        bases = [ ]
        params = """{
            "select": [
                "name",
                "idx_exp_time",
                "idx_exp_url"
            ],
            "literal": "idx_exp is true",
            "limit": null
        }"""
        req = requests.get(config.REST_URL, params={'$$':params})
        try:
            req.raise_for_status()
            response = req.json()
            bases = response["results"]
        except:
            logger.error("""
                Erro ao tentar recuperar bases. url: %s. Reposta: %s
            """ % (config.REST_URL, req._content))
        return bases

    def get_passed_registries(self):
        """
        Reads the indexing log base
        """
        # Create the log base if it does not exist
        self.create_log_base()
        registries = [ ]
        params = {'$$':"""{
            "select":["id_doc_orig", "dt_last_up_orig"],
            "literal": "nm_base = '%s'",
            "limit": null
            }""" % self.base }
        url = config.REST_URL + '/log_lbindex/doc'
        req = requests.get(url, params=params)
        try:
            req.raise_for_status()
            response = req.json()
            registries = response["results"]
        except:
            logger.error("""
                Erro ao recuperar registros da base %s'. Resposta: %s
            """ % ('log_lbindex', req._content))


        resp = {} 
        for reg in registries:
            resp[reg['id_doc_orig']] = reg['dt_last_up_orig']
        return resp
        #return {reg['id_doc_orig']: reg['dt_last_up_orig'] for reg in registries}
        
    def get_registries(self):
        """Função que lista todos os registros a serem indexados"""
        registries = [ ]
        if config.FORCE_INDEX:
            params = {'$$':'{"select":["id_doc", "dt_last_up"], "limit": %d}'}
        else:
            params = {'$$':'{"select":["id_doc", "dt_last_up"],"literal":"dt_idx is null", "limit": %d}'}

        params.update(result_count='false')
        params['$$'] = params['$$'] % config.DEFAULT_LIMIT

        url = config.REST_URL + '/' + self.base + '/doc'
        req = requests.get(url, params=params)
        try:
            req.raise_for_status()
            response = req.json()
            registries = response["results"]
        except:
            logger.error("""
                Erro ao recuperar registros da base %s'. Resposta: %s
            """ % (self.base, req._content))





        # Erro ao recuperar registros da base docs_pro'. Resposta: {"status": 500, 
        # "request": {"path": "/api/docs_pro/doc", "client_addr": "10.72.246.21", 
        #         "user_agent": "python-requests/2.3.0 CPython/2.6.6 Linux/2.6.32-504.el6.x86_64", 
        #         "method": "GET"}, "error_message": "SearchError: (OperationalError) could not 
        # connect to server: No route to host\n\tIs the server running on host \"10.72.247.144\" 
        # and accepting\n\tTCP/IP connections on port 5432?\n None None", "type": "Exception"}





        passed = self.get_passed_registries()
        _registries = [ ]
        for reg in registries:
            if reg['_metadata']['id_doc'] in passed:
                dt_last_up = passed[reg['_metadata']['id_doc']]
                if dt_last_up != reg['_metadata']['dt_last_up']:
                    _registries.append(reg)
            else:
                _registries.append(reg)

        return _registries

    def get_full_reg(self, id, dt_last_up):
        logger.info('Recuperando registro %s da base %s ...' % (str(id), self.base))
        response = None
        url = config.REST_URL + '/' + self.base + '/doc/' + str(id) + '/full'
        req = requests.get(url)
        try:
            req.raise_for_status()
            response = req.json()
        except:
            error_msg = """
                Erro ao recuperar registro %s na base %s'. Resposta: %s
            """ % (str(id), self.base, req._content)
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
        return response

    def index_member(self, registry, id, dt_last_up):
        logger.info('Indexando registro %s da base %s na url %s ...' % (str(id), self.base, self.idx_exp_url))
        try:

            http, space, address, _index, _type = self.idx_exp_url.split('/')
            self.es.index(_index, _type, registry, id=id)
            return True

        except Exception as e:
            error_msg = """
                Erro ao indexar registro %s da base %s na url %s'. Mensagem de erro: %s
            """ % (str(id), self.base, self.idx_exp_url, str(e))
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
            return False

    def update_dt_index(self, id, dt_last_up):
        logger.info('Alterando data de indexacao do registro %s da base %s ...' % (str(id), self.base))
        params = {'value': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S')}
        url = config.REST_URL + '/' + self.base + '/doc/' + str(id) + '/_metadata/dt_idx'
        req = requests.put(url, params=params)
        try:
            req.raise_for_status()
            return True
        except:
            error_msg = """
                Erro ao alterar data de indexacao do registro %s na base %s'. Resposta: %s
            """ % (str(id), self.base, req._content)
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
        return False

    def write_error(self, id_doc, dt_last_up, error_msg):
        """ Write errors to LightBase
        """
        error = {
            'nm_base': self.base,
            'id_doc_orig': id_doc,
            'error_msg': error_msg,
            'dt_error': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S'),
            'dt_last_up_orig': dt_last_up
        }
        url = config.REST_URL + '/log_lbindex/doc'
        data = {'value': json.dumps(error)}
        req = requests.post(url, data=data)
        try:
            req.raise_for_status()
        except:
            logger.error("""
                Erro ao tentar escrever erro no Lightbase. Reposta: %s
            """ % req._content)

    def get_errors(self):
        """ Get all bases which has to index registries
        """
        errors = [ ]
        params = """{
            "literal": "base = '%s'",
            "limit": 250
        }""" % (self.base)
        url = config.REST_URL + '/_index_error'
        req = requests.get(url, params={'$$':params})
        try:
            req.raise_for_status()
            response = req.json()
            errors = response["results"]
        except:
            logger.error("""
                Erro ao tentar recuperar erros de indice. url: %s. Reposta: %s
            """ % (url, req._content))
        return errors

    def create_index(self):
        """
        Creates the index with the default mapping options.
        Updates the index if it has already been created.
        """
        settings = {
            "settings": {
                # "number_of_shards": "5",
                # "number_of_replicas": "1",
                "analysis.analyzer.default.filter.0": "lowercase",
                "analysis.analyzer.default.filter.1": "asciifolding",
                "analysis.analyzer.default.tokenizer": "standard",
                "analysis.analyzer.default.type": "custom",
                "analysis.filter.pt_stemmer.type": "stemmer",
                "analysis.filter.pt_stemmer.name": "portuguese"
            },
            "mappings": {
                "document": {
                    "_timestamp": {
                        "enabled": "true"
                    }
                }
            }
        }

        http, space, address, _index, _type = self.idx_exp_url.split('/')
        try:
            result = self.es.create_index(
                index=_index,
                settings=settings
            )
        except IndexAlreadyExistsError as e:
            logger.info("O índice já existe. Tentando atualizar o mapping...")
            self.es.close_index(index=_index)
            result = self.es.update_settings(
                index=_index,
                settings=settings
            )
            logger.info("Mapping atualizado com sucesso. Abrindo o índice...")
            self.es.open_index(index=_index)
            logger.info("Índice reaberto com sucesso!")

    def delete_index(self, registry):
        id = registry['id_doc']
        try:
            http, space, address, _index, _type = self.idx_exp_url.split('/')
            self.es.delete(_index, _type, id=id)
            return True

        except ElasticHttpNotFoundError as e:
            return True

        except Exception as e:
            error_msg = """
                Erro ao deletar indice %s da base %s na url %s'. Mensagem de erro: %s
            """ % (str(id), self.base, self.idx_exp_url, str(e))
            logger.error(error_msg)
            return False

    def delete_error(self, registry):
        url = config.REST_URL + """/_index_error?$$={"literal":"base = '%s' and id_doc = %d"}"""
        url = url % (registry['base'], registry['id_doc'])
        logger.info('Deletando registro de erro de indice na url %s' % url)
        req = requests.delete(url)
        try:
            req.raise_for_status()
            return True
        except:
            error_msg = """
                Erro ao deletar erro de indice. Resposta: %s
            """ % (req._content)
            logger.error(error_msg)
        return False

    @staticmethod
    def create_log_base():
        """
        Creates the index log base if it does not exist
        """
        log_base = model.LogBase()
        response = log_base.get_base()
        if not response:
            # Create the base since it does not exist
            logger.info("Criando base de log do índice...")
            result = log_base.create_base()
            if result is None:
                logger.error("Erro na criação da base de log: \n%s", response.text)
                return False
            else:
                logger.info("Base de log criada com sucesso!")

        return True
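A usage sketch with made-up base and URL values. The idx_exp_url must have the form http://host:port/<index>/<type>, since the class splits it on '/' to recover the index and doc type:

client = LBRest(base='docs_pro', idx_exp_url='http://localhost:9200/docs_pro/document')
client.create_index()
for registry in client.get_registries():
    id_doc = registry['_metadata']['id_doc']
    dt_last_up = registry['_metadata']['dt_last_up']
    full = client.get_full_reg(id_doc, dt_last_up)
    if full and client.index_member(full, id_doc, dt_last_up):
        client.update_dt_index(id_doc, dt_last_up)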
Example no. 23
index_settings = {
    "number_of_shards": 3,
    "number_of_replicas": 1,
    "mappings": {
        ELASTICSEARCH_DOC: {
            "properties": {
                "location": {
                    "type": "geo_shape",
                    "tree": "quadtree",
                    "precision": "1m"
                }
            }
        },
    }
}
es.create_index(ELASTICSEARCH_INDEX, settings=index_settings)

for filename in FILES:
    print "Processing %s" % filename

    sf = shapefile.Reader(filename)

    shapes = sf.shapes()
    for i, shape in enumerate(shapes, start=1):
        points = [(p[0], p[1]) for p in shape.points]

        data = {
            'filename': filename,
            'location': {
                'type': 'polygon',
                'coordinates': [points]
Example no. 24
def cleanJson(json_data):
    data = json.loads(json_data)
    del data['_updated']
    del data['_created']
    del data['_links']
    del data['_id']
    return data

#es = ElasticSearch('http://*****:*****@recast-791793413.us-east-1.bonsai.io'
ELASTIC_SEARCH_URL = '127.0.0.1:9200'
es = ElasticSearch(ELASTIC_SEARCH_URL)

try:
    es.create_index('recast')
except IndexAlreadyExistsError, e:
    pass


r = requests.get(ELASTIC_SEARCH_URL)
i=1
while r.status_code == 200:
    url = 'http://recast-rest-api.herokuapp.com/analysis/{}'.format(i)
    r = requests.get(url)
    if not r.status_code == 200:
        break

    data = cleanJson(r.content)
    es.index('recast', 'analysis', json.dumps(data))
    i = i+1
Example no. 25
            'adUrl': {'type': 'string'},
            'adType': {'type': 'string'},
            'adSize': {'type': 'string'},
            'dateCreated': {'type': 'date', 'format' : 'YYYY-MM-dd HH:mm:ss'},
            'websiteId': {'type': 'integer'},
            'website': {'type': 'string', 'analyzer': 'simple'},
            'category': {'type': 'string'},
            'subCategory': {'type': 'string'}
        }
    }
}


es.health(wait_for_status='yellow')
es.delete_index('write-ads')
es.create_index('write-ads', settings={'mappings': ad_mapping})

dateYMD = args["date"]
prepareDataFromDB(dateYMD)

dir = DATA_FILES_JSON + '/' + dateYMD
for filename in os.listdir(dir):
    if filename.endswith('.json'):
        with open(dir + '/' + filename) as open_file:
            json_docs = json.load(open_file)
            es.bulk((es.index_op(doc) for doc in json_docs),
                index='write-ads',
                doc_type='ad')

es.refresh("write-ads")
Example no. 26
class IbbdElasticSearch:
    """
    Elasticsearch operations.
    Docs: http://pyelasticsearch.readthedocs.io/en/latest/
    """
    es = None
    config = {}

    mapping_is_set = False  # whether the ES mapping has already been set

    def __init__(self, config):
        """
        ES initialization.
        Config parameters:
        host: ES connection string
        indexName: name of the index
        deleteIndex: whether to delete an existing index; defaults to false (keep it)
        settings: index settings; see the ES docs for the available options
        settingsFile: index settings as a JSON file; see the ES docs for the available options
        mappings: mappings configuration; see the ES docs for the available options
        mappingsFile: mappings configuration as a JSON file; see the ES docs for the available options
        idField: id field; some data already contains an id field

        Note: at most one of settings and settingsFile may be given,
        and likewise at most one of mappings and mappingsFile.
        """
        self.es = ElasticSearch(config['host'])

        if 'docType' not in config:
            config['docType'] = config['indexName']
        self.config = config

        if 'deleteIndex' in config and config['deleteIndex']:
            try:
                self.es.delete_index(config['indexName'])

                print('delete index ' + config['indexName'] + ' success!')
            except ElasticHttpNotFoundError:  # if it did not exist, just print a notice
                print('Index ' + config['indexName'] \
                                + ' not found, nothing to delete!')
            except:
                raise Exception('Index ' + config['indexName'] + ' delete error!')

        try:
            if 'settings' in config:
                self.es.create_index(config['indexName'],
                                     settings=config['settings'])
            elif 'settingsFile' in config:
                with open(config['settingsFile'], 'r') as f:
                    config['settings'] = json.loads(f.read())
                self.es.create_index(config['indexName'],
                                     settings=config['settings'])
            else:
                self.es.create_index(config['indexName'])

            print('create index ' + config['indexName'] + ' success!')
        except Exception:
            raise Exception("create index " + config['indexName'] + ' error!')

    def _putMapping(self, row):
        """
        Sets the ES mapping.
        A default configuration could be generated from row.
        """
        try:
            if 'mappingsFile' in self.config:
                with open(self.config['mappingsFile'], 'r') as f:
                    self.config['mappings'] = json.loads(f.read())

            if 'mappings' in self.config:
                self.es.put_mapping(self.config['indexName'],
                                    self.config['docType'],
                                    self.config['mappings'])
            print("put mapping " + self.config['indexName'] + ' success!')
        except Exception:
            raise Exception("put mapping " + self.config['indexName'] + ' error!')

    def read(self):
        pass

    def batchRead(self):
        pass

    def write(self, row):
        """
        Write a single record
        """
        return self.batchWrite([row])

    def batchWrite(self, rows):
        """
        Write multiple records
        """
        if not self.mapping_is_set:   # set the mapping first
            self.mapping_is_set = True
            self._putMapping(rows[0])

        docs = ()
        if 'idField' in self.config:
            docs = (self.es.index_op(doc, id=doc.pop(self.config['idField'])) \
                    for doc in rows)
        else:
            docs = (self.es.index_op(doc) for doc in rows)

        self.es.bulk(docs,
                     index=self.config['indexName'],
                     doc_type=self.config['docType'])

        return True
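A hypothetical configuration and write, using the keys described in the __init__ docstring:

config = {
    'host': 'http://localhost:9200/',
    'indexName': 'ibbd_demo',
    'deleteIndex': True,
    'idField': 'id',
}
writer = IbbdElasticSearch(config)
writer.batchWrite([
    {'id': 1, 'title': 'first record'},
    {'id': 2, 'title': 'second record'},
])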
Example no. 27
csv_filename='robinhood-daily-rets.csv'
# size of the bulk
chunksize=5000

# parse csv with pandas
csvfile=pd.read_csv(csv_filename)

# init ElasticSearch
es = ElasticSearch('http://104.236.201.91:9200/')

# init index
try :
    es.delete_index("robinhood")
except :
    pass

es.create_index("robinhood")

# start bulk indexing
print("now indexing %s..."%(csv_filename))

records=csvfile.where(pd.notnull(csvfile), None).T.to_dict()
list_records=[records[it] for it in records]
try :
    es.bulk_index("robinhood","myPortfolio",list_records)
except :
    print("error!, skipping a date")
    pass

print("done in %.3fs"%(time()-t0))
Example no. 28
IGNORED_GENRES = ("9", "15", "19"
                  )  # We only care about stations that play music.

import settings

es = ElasticSearch(settings.ES_URL)
INDEX_NAME = settings.ES_INDEX

try:
    es.delete_index(INDEX_NAME)
except ElasticHttpNotFoundError:
    pass

try:
    es.create_index(INDEX_NAME)
except IndexAlreadyExistsError:
    pass

headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36'
}

failures = 0
pk = 0
while failures < 200:
    pk += 1
    r = requests.get("http://www.iheart.com/a/live/station/%d/" % pk,
                     headers=headers)
Example no. 29
class SearchIndex(object):
    def __init__(self, model):
        self.es = ElasticSearch()
        self.model = model

    def put_mapping(self, index, doc_type):
        mapping = {
            doc_type: {
                "properties": {
                    "location": {
                        "type": "geo_point"
                    },
                }
            }
        }
        self.es.put_mapping(index=index, doc_type=doc_type, mapping=mapping)

    def bulk_items(self, index, doc_type):
        for m in self.model.objects.all():
            self.es.bulk([
                self.es.index_op({
                    "pk": m.pk,
                    "name": m.name,
                    "rating": m.rating,
                    "address": m.address,
                    "description": m.description,
                    "location": {
                        "lon": m.longitude,
                        "lat": m.latitude
                    }
                }),
                ],
                doc_type=doc_type,
                index=index)

    def search(self, index, question, longitude, latitude, size=10):
        #self.es.delete_index(index)
        try:
            self.es.create_index(index)
            self.put_mapping(index, "place")
            self.bulk_items(index, "place")
        except IndexAlreadyExistsError:
            pass

        query = {
            "query": {
                "function_score": {
                    "query": {
                        "bool": {
                            "should": [
                                {"match": {"name": question}},
                                {"match": {"_all": {
                                    "query": question,
                                    "operator": "or",
                                    "fuzziness": "auto",
                                    "zero_terms_query": "all"
                                    }}}
                                ]
                            }
                        },
                    "functions": [
                        {"exp": {"rating": {"origin": 5, "scale": 1, "offset": 0.1}}},
                    ]
                    }
                }
            }

        if longitude and longitude is not None:
            query['query']['function_score']['functions'] = [
                {'gauss': {
                    "location": {"origin": {"lat": latitude, "lon": longitude}, "offset": "550m", "scale": "1km"}
                    }},
                {'gauss': {
                    "location": {"origin": {"lat": latitude, "lon": longitude}, "offset": "500m", "scale": "2km"}
                    }},
            ]

        results = self.es.search(query, index=index, size=size)

        self.es.refresh()

        return results
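A hypothetical usage, assuming Place is a Django-style model with the name, rating, address, description, longitude and latitude fields that bulk_items() expects:

search_index = SearchIndex(Place)
results = search_index.search('places', 'coffee', longitude=-73.98, latitude=40.73)
print(results['hits']['total'])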
Example no. 31
class ElasticSearch(object):
    conn = None
    url = settings.ELASTICSEARCH_URL
    index_name = settings.ELASTICSEARCH_INDEX_NAME
    stdout = None
    stderr = None

    def __init__(self, index_name=None, stdout=None, stderr=None):
        self.conn = PyElasticSearch()
        if index_name:
            self.index_name = index_name
        if stdout:
            self.stdout = stdout
        if stderr:
            self.stderr = stderr

    def create_index(self, delete=True):
        if delete:
            try:
                self.conn.delete_index(self.index_name)
            except ElasticHttpNotFoundError as e:
                pass
        mappings = dict(
            (k, v) for k, v in get_elasticsearch_properties().items())
        self.conn.create_index(self.index_name,
                               settings={'mappings': mappings})

    def index_activity_by_id(self, activity_id):
        activity = HistoricalActivity.objects.get(pk=activity_id)
        return self.index_activity(activity)

    def delete_activity_by_id(self, activity_id):
        activity = HistoricalActivity.objects.get(pk=activity_id)
        return self.delete_activity(activity)

    def index_activity(self, activity):
        for doc_type in DOC_TYPES_ACTIVITY:
            docs = self.get_activity_documents(activity, doc_type=doc_type)
            if len(docs) > 0:
                try:
                    self.conn.bulk((self.conn.index_op(
                        doc, id=doc.pop('id'), parent=doc.pop('_parent', None))
                                    for doc in docs),
                                   index=self.index_name,
                                   doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    def index_investor(self, investor):
        for doc_type in DOC_TYPES_INVESTOR:
            docs = self.get_investor_documents(investor, doc_type=doc_type)
            if len(docs) > 0:
                try:
                    self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id'))
                                    for doc in docs),
                                   index=self.index_name,
                                   doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    def index_activity_documents(self, activity_identifiers=[]):
        activity_identifiers = activity_identifiers or HistoricalActivity.objects.filter(
            fk_status__in=(
                HistoricalActivity.STATUS_ACTIVE,
                HistoricalActivity.STATUS_PENDING,
                HistoricalActivity.STATUS_OVERWRITTEN,
                HistoricalActivity.STATUS_DELETED)).distinct().values_list(
                    'activity_identifier', flat=True).distinct()

        for doc_type in DOC_TYPES_ACTIVITY:
            docs = []
            # Collect documents
            self.stdout and self.stdout.write(
                'Collect %ss for %i deals...' %
                (doc_type, len(activity_identifiers)))
            for activity_identifier in activity_identifiers:
                for activity in self.get_activity_versions(
                        activity_identifier):
                    docs.extend(
                        self.get_activity_documents(activity,
                                                    doc_type=doc_type))
            # Bulk index documents
            self.stdout and self.stdout.write('Index %i %ss...' %
                                              (len(docs), doc_type))
            if len(docs) > 0:
                paginator = Paginator(docs, 1000)
                for page in paginator.page_range:
                    try:
                        self.conn.bulk(
                            (self.conn.index_op(doc,
                                                id=doc.pop('id'),
                                                parent=doc.pop(
                                                    '_parent', None))
                             for doc in paginator.page(page)),
                            index=self.index_name,
                            doc_type=doc_type)
                    except BulkError as e:
                        for error in e.errors:
                            msg = '%s: %s on ID %s' % (
                                error['index']['error']['type'],
                                error['index']['error']['reason'],
                                error['index']['_id'])
                            if 'caused_by' in error['index']['error']:
                                msg += ' (%s: %s)' % (error['index']['error']
                                                      ['caused_by']['type'],
                                                      error['index']['error']
                                                      ['caused_by']['reason'])
                            self.stderr and self.stderr.write(msg)
                    self.conn.refresh()

    def index_investor_documents(self):
        investors = Investor.objects.public().order_by(
            'investor_identifier', '-id').distinct('investor_identifier')

        for doc_type in DOC_TYPES_INVESTOR:
            docs = []
            # Collect documents
            self.stdout and self.stdout.write(
                'Collect %ss for %i investors...' %
                (doc_type, investors.count()))
            for investor in investors:
                docs.extend(
                    self.get_investor_documents(investor, doc_type=doc_type))
            # Bulk index documents
            self.stdout and self.stdout.write('Index %i %ss...' %
                                              (len(docs), doc_type))
            if len(docs) > 0:
                try:
                    self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id'))
                                    for doc in docs),
                                   index=self.index_name,
                                   doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    #def index_activity_by_version(self, activity_identifier):
    #    for doc_type in get_elasticsearch_properties().keys():
    #        docs = self.get_documents_for_activity_version(activity_identifier, doc_type=doc_type)
    #        if len(docs) > 0:
    #            try:
    #                self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id')) for doc in docs),
    #                    index=self.index_name,
    #                    doc_type=doc_type)
    #            except BulkError as e:
    #                for error in e.errors:
    #                    stderr and stderr.write('%s: %s (caused by %s: %s, ID: %s)' % (
    #                            error['index']['error']['type'],
    #                            error['index']['error']['reason'],
    #                            error['index']['error']['caused_by']['type'],
    #                            error['index']['error']['caused_by']['reason'],
    #                            error['index']['_id']
    #                          ))

    def get_activity_versions(self, activity_identifier):
        versions = []
        # get the newest non-pending, readable historic version:
        try:
            newest = HistoricalActivity.objects.filter(
                activity_identifier=activity_identifier,
                fk_status__in=(
                    HistoricalActivity.STATUS_ACTIVE,
                    HistoricalActivity.STATUS_OVERWRITTEN,
                    HistoricalActivity.STATUS_DELETED)).distinct().latest()
            if newest and not newest.fk_status_id == HistoricalActivity.STATUS_DELETED:
                versions.append(newest)
        except HistoricalActivity.DoesNotExist:
            newest = None

        # get newer pendings
        pendings = HistoricalActivity.objects.filter(
            activity_identifier=activity_identifier,
            fk_status_id=HistoricalActivity.STATUS_PENDING).distinct()
        if newest:
            pendings = pendings.filter(history_date__gt=newest.history_date)
        versions.extend(pendings)

        return versions

    def get_activity_documents(self, activity, doc_type='deal'):
        docs = []
        deal_attrs = {
            'id': activity.id,
            'activity_identifier': activity.activity_identifier,
            'historical_activity_id': activity.id,
            'status': activity.fk_status_id,
        }

        # Todo: Is there a nice way to prevent this extra Activity query?
        # e.g. if we save is_public/deal_scope as ActivityAttributes
        public_activity = Activity.objects.filter(
            activity_identifier=activity.activity_identifier).order_by(
                '-id').first()
        if public_activity:
            deal_attrs.update({
                'is_public':
                public_activity.is_public,
                'deal_scope':
                public_activity.deal_scope,
                'deal_size':
                public_activity.deal_size,
                'current_negotiation_status':
                public_activity.negotiation_status,
                'top_investors':
                public_activity.top_investors,
                'fully_updated_date':
                public_activity.fully_updated_date,
            })
        else:
            # Fixme: This should not happen
            self.stderr and self.stderr.write(
                _('Missing activity for historical activity %i (Activity identifier: #%i)'
                  % (activity.id, activity.activity_identifier)))
        #except Activity.MultipleObjectsReturned:
        #    # Fixme: This should not happen
        #    self.stderr and self.stderr.write(_('Too much activities for historical activity %i (Activity identifier: #%i)' % (
        #        activity.id,
        #        activity.activity_identifier
        #    )))

        for a in activity.attributes.select_related('fk_group__name').order_by(
                'fk_group__name'):
            # do not include the django object id
            if a.name == 'id':
                continue
            attribute = None
            attribute_key = '%s_attr' % a.name
            if attribute_key in get_elasticsearch_properties(
            )['deal']['properties'].keys():
                attribute = {
                    'value': a.value,
                    'value2': a.value2,
                    'date': a.date,
                    'is_current': a.is_current,
                }
            value = a.value

            # Area field?
            if a.name and 'area' in a.name and a.polygon is not None:
                # Get polygon
                #value = json.loads(a.polygon.json)
                # Apparently this is case sensitive: MultiPolygon as provided by the GeoJSON does not work
                #value['type'] = 'multipolygon'
                value = a.polygon.json or ''
            # do not include empty values
            if value is None or value == '':
                continue

            # Doc types: location, data_source or contract
            group_match = a.fk_group and a.fk_group.name or ''
            group_match = re.match(
                '(?P<doc_type>location|data_source|contract)_(?P<count>\d+)',
                group_match)
            if group_match:
                dt, count = group_match.groupdict()['doc_type'], int(
                    group_match.groupdict()['count'])
                if doc_type == dt:
                    while len(docs) < count:
                        docs.append({
                            '_parent': activity.activity_identifier,
                            'id': a.id,  #'%i_%i' % (a.id, count),
                        })
                    docs[count - 1][a.name] = [
                        value,
                    ]
                # Set doc type counter within deal doc type (for location/data_source/contract)
                elif doc_type == 'deal':
                    # Set counter
                    key = '%s_count' % dt
                    if key not in deal_attrs.keys():
                        deal_attrs[key] = count
                    elif deal_attrs[key] < count:
                        deal_attrs[key] = count

                    # Create list with correct length to ensure formset values have the same index
                    if not a.name in deal_attrs:
                        deal_attrs[a.name] = [''] * count
                        if attribute:
                            deal_attrs[attribute_key] = [''] * count
                    else:
                        while len(deal_attrs[a.name]) < count:
                            deal_attrs[a.name].append('')
                            if attribute:
                                deal_attrs[attribute_key].append('')
                    deal_attrs[a.name][count - 1] = value
                    if attribute:
                        deal_attrs['%s_attr' % a.name][count - 1] = attribute

            # Doc type: deal and not formset
            elif doc_type == 'deal':
                if a.name in deal_attrs:
                    deal_attrs[a.name].append(value)
                    if '%s_attr' % a.name in get_elasticsearch_properties(
                    )['deal']['properties'].keys():
                        deal_attrs['%s_attr' % a.name].append(attribute)
                else:
                    deal_attrs[a.name] = [
                        value,
                    ]
                    if '%s_attr' % a.name in get_elasticsearch_properties(
                    )['deal']['properties'].keys():
                        deal_attrs['%s_attr' % a.name] = [
                            attribute,
                        ]

        if doc_type == 'deal':
            # Additionally save operational company attributes
            oc = Investor.objects.filter(
                investoractivityinvolvement__fk_activity__activity_identifier=
                activity.activity_identifier)
            if oc.count() > 0:
                oc = oc.first()
                for field in Investor._meta.fields:
                    if isinstance(field, ForeignKey):
                        deal_attrs['operational_company_%s' %
                                   field.name] = getattr(
                                       oc, '%s_id' % field.name)
                    else:
                        deal_attrs['operational_company_%s' %
                                   field.name] = getattr(oc, field.name)
            else:
                pass
                #self.stderr and self.stderr.write("Missing operational company for deal #%i" % activity.activity_identifier)

        # Create single document for each location
        # FIXME: Saving single deals for each location might be deprecated since we have doc_type location now?
        spatial_names = list(get_spatial_properties())
        for i in range(deal_attrs.get('location_count', 0)):
            doc = deal_attrs.copy()
            for name in spatial_names:
                if not name in doc:
                    continue
                if len(deal_attrs[name]) > i:
                    doc[name] = deal_attrs[name][i]
                else:
                    doc[name] = ''
            # Set unique ID for location (deals can have multiple locations)
            doc['id'] = '%s_%i' % (doc['id'], i)
            point_lat = doc.get('point_lat', None)
            point_lon = doc.get('point_lon', None)
            if point_lat and point_lon:
                # Parse values
                try:
                    parsed_lat, parsed_lon = float(point_lat), float(point_lon)
                    doc['geo_point'] = '%s,%s' % (parsed_lat, parsed_lon)
                except ValueError:
                    doc['geo_point'] = '0,0'
            else:
                doc['point_lat'] = '0'
                doc['point_lon'] = '0'
                doc['geo_point'] = '0,0'
            # FIXME: we dont really need 'point_lat' and 'point_lon' here,
            # so we should pop them from doc when adding 'geo_point'
            docs.append(doc)

        # Update docs with export values
        for doc in docs:
            doc.update(self.get_export_properties(doc, doc_type=doc_type))

        return docs

    def get_export_properties(self, doc, doc_type='deal'):
        if doc_type == 'investor':
            return ExportInvestorForm.export(doc)
        elif doc_type == 'involvement':
            return InvestorVentureInvolvementForm.export(doc)
        else:
            properties = {
                'deal_scope_export':
                doc.get('deal_scope', ''),
                'is_public_export':
                doc.get('is_public', False) and str(_('Yes')) or str(_('No')),
                'deal_size_export':
                doc.get('deal_size', ''),
                'current_negotiation_status_export':
                doc.get('current_negotiation_status', ''),
                'top_investors_export':
                doc.get('top_investors', ''),
                'fully_updated_date_export':
                doc.get('fully_updated_date', ''),
            }
            # Doc types: deal, location, contract and data_source
            for form in ChangeDealView.FORMS:
                formset_name = hasattr(form, "form") and form.Meta.name or None
                form = formset_name and form.form or form
                properties.update(form.export(doc, formset=formset_name))
            properties.update(
                ExportInvestorForm.export(doc, prefix='operational_company_'))
            return properties

    def get_investor_documents(self, investor, doc_type='investor'):
        docs = []
        # Doc types: involvement and investor
        if doc_type == 'involvement':
            ivis = InvestorVentureInvolvement.objects.filter(
                Q(fk_venture=investor) | Q(fk_investor=investor))
            for ivi in ivis:
                doc = {}
                for field in ivi._meta.local_fields:
                    if isinstance(field, ForeignKey):
                        doc[field.name] = getattr(ivi, '%s_id' % field.name)
                    else:
                        doc[field.name] = getattr(ivi, field.name)
                docs.append(doc)
        elif doc_type == 'investor':
            doc = {}
            for field in investor._meta.local_fields:
                if isinstance(field, ForeignKey):
                    doc[field.name] = getattr(investor, '%s_id' % field.name)
                else:
                    doc[field.name] = getattr(investor, field.name)
            docs.append(doc)

        # Update docs with export values
        for doc in docs:
            doc.update(self.get_export_properties(doc, doc_type=doc_type))

        return docs

    def refresh_index(self):
        self.conn.refresh(self.index_name)

    def search(self, elasticsearch_query, doc_type='deal', sort=[]):
        """ Executes paginated queries until all results have been retrieved. 
            @return: The full list of hits. """
        start = 0
        size = 10000  # 10000 is the default elasticsearch max_window_size (pagination is cheap, so more is not necessarily better)
        raw_result_list = []

        done = False
        while not done:
            query = {
                'query': elasticsearch_query,
                'from': start,
                'size': size,
            }
            if sort:
                query['sort'] = sort
            query_result = self.conn.search(query,
                                            index=self.index_name,
                                            doc_type=doc_type)
            raw_result_list.extend(query_result['hits']['hits'])
            results_total = query_result['hits']['total']

            if len(raw_result_list) >= results_total:
                done = True
            else:
                start = len(raw_result_list)

        print('\nElasticsearch returned %i documents from a total of %i \n\n' %
              (len(raw_result_list), query_result['hits']['total']))
        return raw_result_list

    def delete_activity(self, activity):
        for doc_type in DOC_TYPES_ACTIVITY:
            try:
                if doc_type == 'deal':
                    self.conn.delete(id=activity.activity_identifier,
                                     index=self.index_name,
                                     doc_type=doc_type)
                else:
                    self.conn.delete_by_query(query={
                        "parent_id": {
                            "type": "deal",
                            "id": str(activity.activity_identifier),
                        }
                    },
                                              index=self.index_name,
                                              doc_type=doc_type)
            except ElasticHttpNotFoundError as e:
                pass

    def get_deals_by_activity_identifier(self,
                                         activity_identifier,
                                         doc_type='deal'):
        return self.search({
            "constant_score": {
                "filter": {
                    "term": {
                        "activity_identifier": activity_identifier
                    }
                }
            }
        })
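
The search() helper above pages through hits with from/size until the reported total has been fetched. A minimal, standalone sketch of the same pattern with plain pyelasticsearch follows; the index name 'landmatrix' and the example query are illustrative assumptions, not taken from the project:

from pyelasticsearch import ElasticSearch

conn = ElasticSearch('http://localhost:9200/')

def fetch_all(query, index='landmatrix', doc_type='deal', page_size=1000):
    """Collect every hit for the query by paging with from/size."""
    hits = []
    while True:
        body = {'query': query, 'from': len(hits), 'size': page_size}
        result = conn.search(body, index=index, doc_type=doc_type)
        page = result['hits']['hits']
        hits.extend(page)
        if not page or len(hits) >= result['hits']['total']:
            return hits

# e.g. all deal documents for a single activity identifier:
# fetch_all({'constant_score': {'filter': {'term': {'activity_identifier': 1}}}})
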
Example n. 32
from pyelasticsearch import ElasticSearch
import simplejson,sys

s=ElasticSearch("http://localhost:9200")

if "init" in sys.argv :
	try :
		s.delete_index("flights");
	except Exception, e:
		print e
	try :
		s.create_index("flights")
	except Exception, e:
		print e
	else :	
		print "Created flights"


	s.put_mapping("flights","flight",simplejson.loads('{"flight":{"properties":{"datum":{"type":"string","index":"not_analyzed","omit_norms":true,"index_options":"docs"},"type": { "type": "string", "index" : "not_analyzed" }, "duration":{"type":"double"},"end":{"properties":{"alt":{"type":"integer"},"dist":{"type":"float"},"speed":{"type":"integer"},"time":{"type":"date","format":"dateOptionalTime"},"town":{"type":"string","analyzer":"keyword"},"country":{"type":"string","analyzer":"keyword"}}},"flight":{"type":"string","store":true,"analyzer":"keyword"},"hex":{"type":"string","store":true,"analyzer":"keyword"},"id":{"type":"string","store":true},"radar":{"type":"string","store":true,"analyzer":"keyword"},"reg":{"type":"string","store":true,"analyzer":"keyword"},"route":{"properties":{"coordinates":{"type":"double"},"type":{"type":"string"}}},"start":{"properties":{"alt":{"type":"integer"},"dist":{"type":"float"},"speed":{"type":"integer"},"time":{"type":"date","format":"dateOptionalTime"},"town":{"type":"string","analyzer":"keyword"},"country":{"type":"string","analyzer":"keyword"}}}}}}'))



def md(a) :
    a["datum"]=a["starttime"][:10]
    return a
    
    
def makets(a) :
    for f in ("starttime","endtime") :
        a[f]=maket(a[f])
    return a
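
A hypothetical follow-up, not part of the original script: index a single flight document against the 'flights' mapping created above. The field values are invented, and makets() is skipped because its maket() helper is not shown here.

doc = {
    "flight": "LH123",
    "hex": "3c6444",
    "reg": "D-AIZZ",
    "starttime": "2014-05-01T10:00:00",
    "endtime": "2014-05-01T11:30:00",
}
doc = md(doc)                      # derive the 'datum' day key from starttime
s.index("flights", "flight", doc)  # pyelasticsearch: index(index, doc_type, doc)
s.refresh("flights")
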
Example n. 33
class ESWrapper(BaseDB):
    def __init__(self, index_name, host='http://localhost', port=9200):
        self.eserver = ElasticSearch(urls=host,
                                     port=port,
                                     timeout=60,
                                     max_retries=3)
        self._base_query = {
            "query": {
                "bool": {
                    "must": {
                        "match": {
                            "name.raw": ""
                        }
                    }
                }
            }
        }
        self._geo_filter = {
            "geo_distance": {
                "distance": "20km",
                "coordinates": {}
            }
        }
        self._index = index_name
        self._doctype = "places"

    def query(self, qkey, qtype="exact"):
        """
        qtype values are exact, relaxed or geo_distance
        """
        from copy import deepcopy
        q = deepcopy(self._base_query)  # deep copy: dict.copy() is shallow and the nested query dicts would otherwise be mutated across calls
        if qtype == "exact":
            q["query"]["bool"]["must"]["match"]["name.raw"] = qkey
        elif qtype == "relaxed":
            q["query"]["bool"]["must"]["match"]["name"] = qkey
            q["query"]["bool"]["must"]["match"].pop("name.raw")
        elif qtype == "geo_distance":
            q = {
                "query": {
                    "bool": {
                        "must": {
                            "match_all": {}
                        }
                    },
                    "filter": {
                        "geo_distance": {
                            "distance": "20km",
                            "coordinates": qkey
                        }
                    }
                }
            }

        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def near_geo(self, geo_point):
        q = {
            "query": {
                "bool": {
                    "must": {
                        "match_all": {}
                    }
                },
                "filter": self._geo_filter
            }
        }
        q["query"]["bool"]["geo_distance"]["coordinates"] = geo_point
        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)

        self.eserver.create_index(index='geonames', settings=settings)
        for chunk in bulk_chunks(self._opLoader(datacsv, confDir),
                                 docs_per_chunk=1000):
            self.eserver.bulk(chunk, index='geonames', doc_type='places')
            print "..",

        self.eserver.refresh('geonames')

    def _opLoader(self, datacsv, confDir):
        with DataReader(datacsv, os.path.join(confDir,
                                              'geonames.conf')) as reader:
            cnt = 0
            for row in reader:
                row['coordinates'] = [
                    float(row['longitude']),
                    float(row['latitude'])
                ]
                del (row['latitude'])
                del (row['longitude'])
                row['alternatenames'] = row['alternatenames'].split(",")
                cnt += 1
                #if cnt > 100:
                #break
                yield self.eserver.index_op(row,
                                            index="geonames",
                                            doc_type="places")
# open csv file
f = open(raw_data_path+csv_filename) # read csv

# parse csv with pandas
csvfile=pd.read_csv(f, iterator=True, chunksize=chunksize) 

# init ElasticSearch
es = ElasticSearch('http://localhost:9200/')

# init index
try :
    es.delete_index("weiboscope")
except :
    pass

es.create_index("weiboscope")

# start bulk indexing 
print ("now indexing %s..."%(csv_filename))

for i,df in enumerate(csvfile): 
    print (i)
    records=df.where(pd.notnull(df), None).T.to_dict()
    list_records=[records[it] for it in records]
    try :
        es.bulk_index("weiboscope","tweet",list_records)
    except :
        print ("error!, skiping some tweets sorry")
        pass

print( "done in %.3fs"%(time()-t0))
Example n. 36
File: index.py Project: Yenlo/ddld
                            "type": "string",
                            "index": "analyzed"
                            },
                        "untouched": {
                            "type": "string",
                            "index": "not_analyzed"
                            }
                        }
                   },
                "topics": {
                    "type": "multi_field",
                    "fields": {
                        "topics": {
                            "type": "string",
                            "index": "analyzed"
                            },
                        "untouched": {
                            "type": "string",
                            "index": "not_analyzed"
                            }
                        }
                   }
                }
            }
        }

    es.create_index(index, {"mappings": mapping})
    # es.put_mapping(index, doc_type, mapping)

    es.bulk_index(index, doc_type, get_docs(fname), 'persistent_id')
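
A hedged sketch of why the multi_field mapping above is useful: full-text matching on the analyzed field combined with exact filtering on the not_analyzed 'untouched' sub-field. It reuses the es connection, index and doc_type names from the snippet; the query values are invented and the filtered query is the pre-2.x Elasticsearch idiom this code targets.

query = {
    'query': {
        'filtered': {
            'query': {'match': {'topics': 'history'}},
            'filter': {'term': {'topics.untouched': 'Dutch history'}},
        }
    }
}
results = es.search(query, index=index, doc_type=doc_type)
print(results['hits']['total'])
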
Example n. 37
input = len(sys.argv)
if input < 2:
	usage()
	sys.exit(1)
else:
	qname = sys.argv[1]

from pyelasticsearch import ElasticSearch
es = ElasticSearch(elasticsearch)

try:
	s = es.status('oplog')
except:
	print "Creating index: oplog"
	try:
		s = es.create_index('oplog')
		print "sleeping for 5 to ensure index exists"
		time.sleep(5)
	except:
		print "ERROR: index creation failed!"
		sys.exit()

print "Creating queue: %s" % qname
try:
	es.put_mapping('oplog',qname,{"properties" : { "from" : {"type" : "string", "null_value" : "na"}, "sent" : {"type" : "string", "null_value" : "na"}, "submitted" : {"type" : "date"}, "subject" : {"type" : "string", "null_value" : "na"}, "message" : {"type" : "string", "null_value" : "na"} }})
	print "Created queue with mapping:"
	print es.get_mapping('oplog',qname)
except:
	print "ERROR: queue creation failed!"
Example n. 38
db = SQLAlchemy(app)
cache = Cache(app, config={'CACHE_TYPE': 'simple'})

app.config['MAIL_SERVER'] = 'smtp.gmail.com'
app.config['MAIL_PORT'] = 587
app.config['MAIL_USE_TLS'] = True
app.config['MAIL_USERNAME'] = '******'
app.config['MAIL_PASSWORD'] = '******'
app.config['MAIL_DEFAULT_SENDER'] = ('Sender name', 'sender email')
mail = Mail(app)

app.secret_key = 'some_random_key'

es = ElasticSearch('http://localhost:9200/')
try:
    es.create_index('catalog')
except IndexAlreadyExistsError, e:
    pass

app.config.update(CELERY_BROKER_URL='redis://localhost:6379',
                  CELERY_RESULT_BACKEND='redis://localhost:6379')


def make_celery(app):
    celery = Celery(app.import_name, broker=app.config['CELERY_BROKER_URL'])
    celery.conf.update(app.config)
    TaskBase = celery.Task

    class ContextTask(TaskBase):
        abstract = True
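
A hedged sketch of how the 'catalog' index configured above might be used from a request handler; the 'product' doc type and the document fields are assumptions:

product = {'name': 'Espresso machine', 'category': 'kitchen', 'price': 119.0}
es.index('catalog', 'product', product, id=1)
es.refresh('catalog')
hits = es.search({'query': {'match': {'name': 'espresso'}}}, index='catalog')
print(hits['hits']['total'])
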
Example n. 39
import time

IGNORED_GENRES = ("9", "15", "19")  # We only care about stations that play music.

import settings

es = ElasticSearch(settings.ES_URL)
INDEX_NAME = settings.ES_INDEX

try:
	es.delete_index(INDEX_NAME)
except ElasticHttpNotFoundError:
	pass

try:
	es.create_index(INDEX_NAME)
except IndexAlreadyExistsError:
	pass

headers = {
	'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36'
}

failures = 0
pk = 0
while failures < 200:
	pk += 1
	r = requests.get("http://www.iheart.com/a/live/station/%d/" % pk, headers=headers)

	if r.status_code != 200:
		if r.status_code > 500:
Example n. 40
                    'analyzer': 'mmseg',
                    'boost': 0.7,
                    'term_vector': 'with_positions_offsets'
                },
                'categories': {
                    'type': 'nested',
                    'properties': {
                        'url': {
                            'type': 'string',
                            'index': 'not_analyzed'
                        },
                        'name': {
                            'type': 'string',
                            'index': 'not_analyzed'
                        },
                    }
                }
            }
        }
    }
}

es = ElasticSearch(HOST)
try:
    es.delete_index(INDEX)
except ElasticHttpNotFoundError:
    # No index found
    pass

es.create_index(INDEX, settings=index_settings)
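
A hedged sketch of indexing and querying against the nested 'categories' mapping above; the 'article' doc type, the 'title' field and all values are assumptions, since the top of the mapping is not shown:

doc = {
    'title': 'Example article',
    'categories': [{'name': 'news', 'url': '/c/news'}],
}
es.index(INDEX, 'article', doc, id=1)
es.refresh(INDEX)
query = {
    'query': {
        'nested': {
            'path': 'categories',
            'query': {'term': {'categories.name': 'news'}},
        }
    }
}
print(es.search(query, index=INDEX)['hits']['total'])
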
Example n. 41
class ElasticSearchProvider(SearchProvider):
    def __init__(self, config, db=None, authnz_wrapper=None, io_loop=None):
        self.debug = False
        self.config = config
        if db is not None:
            self.db = db
        self.syncES = ElasticSearch(
            '%(ELASTIC_SEARCH_PROTOCOL)s://%(ELASTIC_SEARCH_HOST)s:%(ELASTIC_SEARCH_PORT)s' % config
        )
        self.asyncES = ESConnection(
            host=config.get('ELASTIC_SEARCH_HOST'),
            port=config.get('ELASTIC_SEARCH_PORT'),
            io_loop=io_loop,
            protocol=config.get('ELASTIC_SEARCH_PROTOCOL'),
        )
        self.index = config.get('ELASTIC_SEARCH_INDEX')
        self.max_retries = config.get('ELASTIC_SEARCH_MAX_RETRIES')

    def activate_debug(self):
        self.debug = True

    def connect_to_db(self):
        from sqlalchemy import create_engine
        from sqlalchemy.orm import scoped_session, sessionmaker
        conn_string = self.config.get('SQLALCHEMY_CONNECTION_STRING')
        engine = create_engine(
            conn_string,
            convert_unicode=True,
            pool_size=1,
            max_overflow=0,
            echo=self.debug
        )
        maker = sessionmaker(bind=engine, autoflush=True)
        self.db = scoped_session(maker)

    def _assemble_inner_query(self, domain=None, page_filter=None):
        if page_filter and domain:
            page_prefix = '%s/%s' % (domain.url, page_filter)
        else:
            page_prefix = None

        if page_prefix:
            return {
                'prefix': {
                    'page_url': page_prefix
                }
            }
        else:
            return {
                'match_all': {}
            }

    def _assemble_outer_query(self, inner_query, filter_terms):
        return {
            'filtered': {
                'query': inner_query,
                'filter': {
                    'and': [{
                        'term': filter_term
                    } for filter_term in filter_terms]
                }
            }
        }

    def _assemble_filter_terms(self, key_id=None, domain=None):
        filter_terms = []

        if key_id:
            filter_terms.append({'keys.id': key_id})

        if domain:
            filter_terms.append({'domain_id': domain.id})

        return filter_terms

    def gen_doc(self, review):
        return {
            'keys': [{'id': violation.key_id} for violation in review.violations],
            'uuid': str(review.uuid),
            'completed_date': review.completed_date,
            'violation_count': review.violation_count,
            'page_id': review.page_id,
            'page_uuid': str(review.page.uuid),
            'page_url': review.page.url,
            'page_last_review_date': review.page.last_review_date,
            'domain_id': review.domain_id,
            'domain_name': review.domain.name,
        }

    def index_review(self, review):
        for attempt in range(self.max_retries):
            try:
                self.syncES.send_request(
                    method='POST',
                    path_components=[self.index, 'review', review.page_id],
                    body=dumps(self.gen_doc(review)),
                    encode_body=False
                )
                break
            except (Timeout, ConnectionError, ElasticHttpError, InvalidJsonResponseError) as e:
                values = review.id, review.page_id, str(e)
                logging.error('Could not index review (review_id:{0}, page_id:{1}): {2}'.format(*values))
                time.sleep(1)
                if attempt >= self.max_retries - 1:
                    raise

    def index_reviews(self, reviewed_pages, reviews_count, batch_size):
        action = {'index': {'_type': 'review'}}

        for i in range(0, reviews_count, batch_size):
            body_bits = []

            for page in reviewed_pages[i:i + batch_size]:
                doc = self.gen_doc(page.last_review)

                action['index']['_id'] = doc['page_id']

                body_bits.append(dumps(action))
                body_bits.append(dumps(doc))

            # Yes, that trailing newline IS necessary
            body = '\n'.join(body_bits) + '\n'

            self.syncES.send_request(
                method='POST',
                path_components=[self.index, '_bulk'],
                body=body,
                encode_body=False
            )

        logging.info('Done!')

    @return_future
    def get_by_violation_key_name(self, key_id, current_page=1, page_size=10, domain=None, page_filter=None, callback=None):
        def treat_response(response):
            if response.error is None:
                try:
                    hits = loads(response.body).get('hits', {'hits': []})

                    reviews_data = []
                    for hit in hits['hits']:
                        completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date'])
                        reviews_data.append({
                            'uuid': hit['_source']['uuid'],
                            'page': {
                                'uuid': hit['_source']['page_uuid'],
                                'url': hit['_source']['page_url'],
                                'completedAt': completedAt
                            },
                            'domain': hit['_source']['domain_name']
                        })

                    reviews_count = hits.get('total', 0)

                    callback({
                        'reviews': reviews_data,
                        'reviewsCount': reviews_count
                    })
                except Exception as e:
                    reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message)
                    logging.error(reason)
                    callback({'error': {'status_code': 500, 'reason': reason}})
            else:
                reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body)
                logging.error(reason)
                callback({'error': {'status_code': 500, 'reason': reason}})

        inner_query = self._assemble_inner_query(domain, page_filter)
        filter_terms = self._assemble_filter_terms(key_id, domain)

        query = self._assemble_outer_query(inner_query, filter_terms)

        sort_ = [{
            'completed_date': {
                'order': 'desc'
            }
        }, {
            'violation_count': {
                'order': 'desc'
            }
        }]

        source = {'query': query, 'sort': sort_}

        self.asyncES.search(
            callback=treat_response,
            index=self.index,
            type='review',
            source=source,
            page=current_page,
            size=page_size,
        )

    @return_future
    def get_domain_active_reviews(self, domain, current_page=1, page_size=10, page_filter=None, callback=None):
        def treat_response(response):
            if response.error is None:
                try:
                    hits = loads(response.body).get('hits', {'hits': []})

                    pages = []
                    for hit in hits['hits']:
                        completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date'])
                        pages.append({
                            'url': hit['_source']['page_url'],
                            'uuid': hit['_source']['page_uuid'],
                            'violationCount': len(hit['_source']['keys']),
                            'completedAt': completedAt,
                            'reviewId': hit['_source']['uuid']
                        })

                    reviews_count = hits.get('total', 0)

                    callback({
                        'reviewsCount': reviews_count,
                        'pages': pages
                    })
                except Exception as e:
                    reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message)
                    logging.error(reason)
                    callback({'error': {'status_code': 500, 'reason': reason}})
            else:
                reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body)
                logging.error(reason)
                callback({'error': {'status_code': 500, 'reason': reason}})

        inner_query = self._assemble_inner_query(domain=domain, page_filter=page_filter)
        filter_terms = self._assemble_filter_terms(domain=domain)

        query = self._assemble_outer_query(inner_query, filter_terms)

        sort_ = [{
            'violation_count': {
                'order': 'desc'
            }
        }, {
            'completed_date': {
                'order': 'desc'
            }
        }]

        source = {'query': query, 'sort': sort_}

        self.asyncES.search(
            callback=treat_response,
            index=self.index,
            type='review',
            source=source,
            page=current_page,
            size=page_size,
        )

    def refresh(self):
        try:
            self.syncES.refresh(index=self.index)
        except Exception as e:
            logging.error('Could not refresh index (%s)' % e)

    def get_index_settings(cls):
        return {
            'index': {
                'number_of_shards': 4
            }
        }

    def get_index_mapping(cls):
        return {
            'review': {
                'properties': {
                    'keys': {
                        'properties': {
                            'id': {
                                'type': 'integer'
                            }
                        }
                    },
                    'uuid': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'completed_date': {
                        'type': 'integer'
                    },
                    'violation_count': {
                        'type': 'float'
                    },
                    'page_id': {
                        'type': 'integer'
                    },
                    'page_uuid': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'page_url': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'page_last_review_date': {
                        'type': 'integer'
                    },
                    'domain_id': {
                        'type': 'integer'
                    },
                    'domain_name': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    }
                }
            }
        }

    def setup_index(self):
        try:
            settings = self.get_index_settings()
            self.syncES.create_index(index=self.index, settings=settings)
            mapping = self.get_index_mapping()
            self.syncES.put_mapping(index=self.index, doc_type='review', mapping=mapping)
            logging.info('Index %s created.' % self.index)
        except Exception as e:
            raise e

    def delete_index(self):
        try:
            self.syncES.delete_index(index=self.index)
            logging.info('Index %s deleted.' % self.index)
        except Exception as e:
            raise e

    def _get_max_page_id_from_index(self, must_have_domain_name=False):
        if must_have_domain_name:
            inner_query = {
                'constant_score': {
                    'filter': {
                        'not': {
                            'missing': {
                                'field': 'domain_name'
                            }
                        }
                    }
                }
            }
        else:
            inner_query = {
                'match_all': {}
            }

        query = {
            'query': inner_query,
            'sort': [{
                'page_id': {
                    'order': 'desc'
                }
            }]
        }

        results = self.syncES.search(query, index=self.index, doc_type='review')
        if results['hits']['total'] > 0:
            return results['hits']['hits'][0]['_id'] or 0
        return 0

    def index_all_reviews(self, keys=None, batch_size=200, replace=False):
        logging.info('Querying database...')
        self.connect_to_db()

        if keys is not None:
            keys = [k.id for k in self.db.query(Key.id).filter(Key.name.in_(keys)).all()]

        try:
            max_page_id = self._get_max_page_id_from_index(must_have_domain_name=True)
        except Exception:
            logging.error('Could not retrieve max page_id! Use with --replace (with caution)')
            return

        def apply_filters(query):
            if keys is not None:
                query = query \
                    .filter(Violation.review_id == Page.last_review_id) \
                    .filter(Violation.key_id.in_(keys))

            if not replace:
                query = query.filter(Page.id > max_page_id)

            return query.filter(Page.last_review_id != None)

        reviews_count = apply_filters(self.db.query(func.count(Page.id))).scalar()

        query = self.db.query(Page).options(joinedload('last_review'))
        reviewed_pages = apply_filters(query).order_by(Page.id.asc())

        logging.info('Indexing %d reviews...' % reviews_count)

        self.index_reviews(reviewed_pages, reviews_count, batch_size)

    @classmethod
    def new_instance(cls, config):
        return ElasticSearchProvider(config)

    @classmethod
    def main(cls):
        import sys

        parser = cls.argparser()
        args = parser.parse_args()

        config = {}
        host = None
        port = None
        index = None
        es = None

        levels = ['ERROR', 'WARNING', 'INFO', 'DEBUG']
        log_level = levels[args.verbose]
        logging.basicConfig(level=getattr(logging, log_level), format='%(levelname)s - %(message)s')

        if not (args.create or args.recreate or args.delete or args.keys or args.all_keys):
            parser.print_help()
            sys.exit(1)

        if args.conf:
            from derpconf.config import ConfigurationError
            from holmes.config import Config
            try:
                config = Config().load(args.conf[0])
                host = config['ELASTIC_SEARCH_HOST']
                port = config['ELASTIC_SEARCH_PORT']
                index = config['ELASTIC_SEARCH_INDEX']
            except ConfigurationError:
                logging.error('Could not load config! Use --conf conf_file')
                sys.exit(1)
            except KeyError:
                logging.error('Could not parse config! Check it\'s contents')
                sys.exit(1)

        if args.server:
            try:
                host, port = args.server[0].split(':')
                config['ELASTIC_SEARCH_HOST'] = host
                config['ELASTIC_SEARCH_PORT'] = port
            except Exception:
                logging.error('Could not parse server host and port! Use --server host:port')
                sys.exit(1)

        if args.index:
            index = args.index[0]
            config['ELASTIC_SEARCH_INDEX'] = index

        from pyelasticsearch.exceptions import IndexAlreadyExistsError, ElasticHttpNotFoundError, InvalidJsonResponseError
        from requests.exceptions import ConnectionError
        try:

            if args.create or args.recreate or args.delete:
                if host is None or port is None:
                    logging.error('Need either a host and port or a config file to perform such operation!')
                    sys.exit(1)
                if index is None:
                    logging.error('Need either an index name or a config file to perform such operation!')
                    sys.exit(1)
                else:
                    es = cls.new_instance(config)
                    if args.recreate or args.delete:
                        try:
                            es.delete_index()
                        except ElasticHttpNotFoundError:
                            pass
                        except InvalidJsonResponseError as e:
                            logging.error('Invalid response! Reason: %s' % e)
                            sys.exit(1)
                    if args.create or args.recreate:
                        es.setup_index()

            if args.keys or args.all_keys:
                if config is None:
                    logging.error('Need a config file to perform such operation! Use --conf conf_file')
                else:
                    batch_size = args.batch_size[0] if args.batch_size else 200
                    es = cls.new_instance(config) if not es else es
                    try:
                        if args.verbose > 2:
                            es.activate_debug()
                        if args.keys:
                            es.index_all_reviews(args.keys, replace=args.replace, batch_size=batch_size)
                        elif args.all_keys:
                            es.index_all_reviews(replace=args.replace, batch_size=batch_size)
                    except InvalidJsonResponseError as e:
                        logging.error('Invalid response! Reason: %s' % e)
                        sys.exit(1)

        except IndexAlreadyExistsError:
            logging.error('Index %s already exists! Use --recreate (with caution) to recreate' % index)
        except ConnectionError:
            logging.error('Could not connect to server at %s:%s' % (host, port))
        except KeyError:
            logging.error('Could not get host nor port! Use either -conf or --server')
            sys.exit(1)
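
A hedged sketch of driving the provider programmatically rather than through main(); the config values are placeholders, index_all_reviews() additionally needs SQLALCHEMY_CONNECTION_STRING, and the async methods assume a running Tornado IOLoop:

config = {
    'ELASTIC_SEARCH_PROTOCOL': 'http',
    'ELASTIC_SEARCH_HOST': 'localhost',
    'ELASTIC_SEARCH_PORT': 9200,
    'ELASTIC_SEARCH_INDEX': 'holmes',
    'ELASTIC_SEARCH_MAX_RETRIES': 3,
}
provider = ElasticSearchProvider(config)
provider.setup_index()      # create the index, its settings and the 'review' mapping
# provider.index_all_reviews(batch_size=200, replace=True)
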
Example n. 42
class ESWrapper(BaseDB):
    def __init__(self,
                 index_name,
                 doc_type,
                 host='http://localhost',
                 port=9200):
        self.eserver = ElasticSearch(urls=host,
                                     port=port,
                                     timeout=60,
                                     max_retries=3)
        #self._base_query = {"query": {"bool": {"must": {"match": {}}}}}
        #self._base_query = {"query": {"bool": {}}}
        self._geo_filter = {"distance": "20km", "coordinates": {}}
        self._population_filter = {'population': {'gte': 5000}}
        self._index = index_name
        self._doctype = doc_type

    def getByid(self, geonameId):
        maincondition = {"match": {"id": geonameId}}
        q = {"query": {"bool": {"must": maincondition}}}
        return self.eserver.search(
            q, index=self._index,
            doc_type=self._doctype)['hits']['hits'][0]['_source']

    def _query(self,
               qkey,
               qtype="exact",
               analyzer=None,
               min_popln=None,
               size=10,
               **kwargs):
        """
        qtype values are exact, relaxed or geo_distance
        Always limit results to 10
        """
        q = {"query": {"bool": {}}}
        query_name = kwargs.pop('query_name', 'must')
        query_name = "should"
        if query_name == "should":
            q["query"]["bool"]["minimum_number_should_match"] = 1

        maincondition = {}
        if qtype == "exact":
            maincondition = [{
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "alternatenames": {
                        "value": qkey
                    }
                }
            }]
            #maincondition["match"] = {"name.raw": {"query": qkey}}
            if analyzer:
                maincondition["match"]["name.raw"]["analyzer"] = analyzer

        elif qtype == "relaxed":
            maincondition["match"] = {"alternatenames": {"query": qkey}}
            if analyzer:
                maincondition["match"]["alternatenames"]["analyzer"] = analyzer

            #q["query"]["bool"][query_name]["match"].pop("name.raw", "")
        elif qtype == "combined":
            maincondition = [{
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": qkey,
                            "fields":
                            ["name.raw", "asciiname", "alternatenames"]
                        }
                    },
                    "filter": {
                        "bool": {
                            "should": [{
                                "range": {
                                    "population": {
                                        "gte": 5000
                                    }
                                }
                            }, {
                                "terms": {
                                    "featureCode": [
                                        "pcla", "pcli", "cont", "rgn", "admd",
                                        "adm1", "adm2"
                                    ]
                                }
                            }]
                        }
                    }
                }
            }, {
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "alternatenames": {
                        "value": qkey[1:]
                    }
                }
            }, {
                "match": {
                    "alternatenames": {
                        "query": qkey,
                        'fuzziness': kwargs.pop("fuzzy", 0),
                        "max_expansions": kwargs.pop("max_expansion", 5),
                        "prefix_length": kwargs.pop("prefix_length", 1)
                    }
                }
            }]

        if maincondition:
            q["query"]["bool"][query_name] = maincondition

            if min_popln:
                filter_cond = [{"range": {"population": {"gte": min_popln}}}]
            else:
                filter_cond = []

            if kwargs:
                #filter_cond = [{"range": {"population": {"gte": min_popln}}}]
                filter_cond += [{
                    "term": {
                        key: val
                    }
                } for key, val in kwargs.viewitems()]
                # print(filter_cond)
                q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}
            elif min_popln:
                filter_cond = [{
                    "range": {
                        "population": {
                            "gte": min_popln
                        }
                    }
                }, {
                    "terms": {
                        "featureCode": ["ppla", "pplx"]
                    }
                }]

                q["query"]["bool"]["filter"] = {
                    "bool": {
                        "should": filter_cond
                    }
                }

        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def query(self, qkey, min_popln=None, **kwargs):
        #res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']['hits']
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        #max_score = sum([r['_score'] for r in res])
        max_score = res['max_score']  #sum([r['_score'] for r in res])
        #for t in res:
        gps = []
        if max_score == 0.0:
            ## a max_score of 0.0 means Elasticsearch found no real match and only
            ## returned filler, very low-scoring hits, so discard them
            res['hits'] = []

        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            #t['_source']['_score'] = t[1] / max_score
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                gps = [pt]
                break

            gps.append(pt)

        if len(gps) == 1:
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))

        return gps

    def near_geo(self, geo_point, min_popln=5000, **kwargs):
        q2 = {
            "query": {
                "bool": {
                    "must": {
                        "match_all": {}
                    },
                    "filter": [{
                        "geo_distance": {
                            "distance": "30km",
                            "coordinates": geo_point
                        }
                    }, {
                        "terms": {
                            "featureCode": ["pcli", "ppl", "ppla2", "adm3"]
                        }
                    }]
                }
            },
            "sort": {
                "population": "desc"
            }
        }

        res = self.eserver.search(q2,
                                  index=self._index,
                                  doc_type=self._doctype,
                                  **kwargs)['hits']['hits'][0]['_source']
        res['confidence'] = 1.0
        return [GeoPoint(**res)]

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)
            settings['mappings'][self._doctype] = settings['mappings'].pop(
                'places')

        try:
            self.eserver.create_index(index=self._index, settings=settings)
        except:
            self.eserver.delete_index(self._index)
            self.eserver.create_index(index=self._index, settings=settings)

        for chunk in bulk_chunks(self._opLoader(datacsv, confDir),
                                 docs_per_chunk=1000):
            self.eserver.bulk(chunk, index=self._index, doc_type=self._doctype)
            print "..",

        self.eserver.refresh(self._index)

    def _opLoader(self, datacsv, confDir):
        with DataReader(datacsv, os.path.join(confDir,
                                              'geonames.conf')) as reader:
            cnt = 0
            for row in reader:
                try:
                    row['coordinates'] = [
                        float(row['longitude']),
                        float(row['latitude'])
                    ]
                    try:
                        row['population'] = int(row["population"])
                    except:
                        row['population'] = -1

                    try:
                        row['elevation'] = int(row['elevation'])
                    except:
                        row['elevation'] = -1

                    del (row['latitude'])
                    del (row['longitude'])
                    #print row['name']
                    row['alternatenames'] = row['alternatenames'].split(",")
                    cnt += 1
                    yield self.eserver.index_op(row,
                                                index=self._index,
                                                doc_type=self._doctype)
                except:
                    print json.dumps(row)
                    continue
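
A hedged usage sketch for the gazetteer wrapper above; the CSV path is a placeholder, the index is assumed to be loaded, and GeoPoint, DataReader and the tokenizer come from the surrounding project:

gazetteer = ESWrapper(index_name='geonames', doc_type='places')
# gazetteer.create('allCountries.txt', confDir='../data/')   # one-off bulk load
for place in gazetteer.query('new delhi', min_popln=5000):
    print('%s: %.3f' % (place.name, place._score))
nearest = gazetteer.near_geo([77.2, 28.6])                   # [lon, lat]
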
Example n. 43
class ESWrapper(BaseDB):
    def __init__(self,
                 index_name,
                 doc_type,
                 host='http://localhost',
                 port=9200):
        self.eserver = ElasticSearch(urls=host,
                                     port=port,
                                     timeout=60,
                                     max_retries=3)
        #self._base_query = {"query": {"bool": {"must": {"match": {}}}}}
        #self._base_query = {"query": {"bool": {}}}
        self._geo_filter = {"distance": "20km", "coordinates": {}}
        self._population_filter = {'population': {'gte': 5000}}
        self._index = index_name
        self._doctype = doc_type

    def getByid(self, geonameId):
        maincondition = {"match": {"id": geonameId}}
        q = {"query": {"bool": {"must": maincondition}}}
        return self.eserver.search(
            q, index=self._index,
            doc_type=self._doctype)['hits']['hits'][0]['_source']

    def _query(self, qkey, **kwargs):
        q = {"query": {"bool": {}}}
        query_name = "should"
        q["query"]["bool"]["minimum_number_should_match"] = 1
        kwargs.pop("qtype", "")

        placetokens = [
            l.strip() for l in tokenizer.split(qkey)
            if l and l not in STOP_WORDS and l[-1] != '.'
        ]
        if placetokens:
            reduced_placename = u" ".join(placetokens[0:])
            if len(placetokens[0]) < 3 and len(
                    placetokens) > 1 and 3.0 / len(placetokens) >= .5:
                reduced_placename = u" ".join(placetokens[1:])
        else:
            reduced_placename = qkey

        # print "qkey", qkey, "reduced", reduced_placename
        maincondition = [
            {
                "bool": {
                    "must": [{
                        "multi_match": {
                            "query":
                            qkey,
                            "fields":
                            ["name.raw^5", "asciiname^5", "alternatenames"],
                            "operator":
                            "and"
                        }
                    }, {
                        "terms": {
                            "featureClass": ["a", "p"]
                        }
                    }],
                }
            },
            {
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            },
            {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            },
            {
                "term": {
                    "normalized_asciiname": {
                        "value": qkey
                    }
                }
            },
            # {"term": {"alternatenames": {"value": qkey[1:]}}},
            {
                "term": {
                    "alternatenames": {
                        "value": qkey
                    }
                }
            },
            # {"multi_match": {"query": reduced_placename if 'fuzzy' in kwargs else unicode(unidecode(reduced_placename)),
            {
                "multi_match": {
                    "query":
                    reduced_placename if 'fuzzy' in kwargs else unicode(
                        unidecode(reduced_placename)),
                    'fuzziness':
                    kwargs.pop("fuzzy", 0),
                    "max_expansions":
                    kwargs.pop("max_expansion", 10),
                    "prefix_length":
                    kwargs.pop("prefix_length", 1),
                    'operator':
                    kwargs.pop("operator", "and"),
                    "fields": [
                        "name^3", "asciiname^3", "alternatenames",
                        "normalized_asciiname^3"
                    ]
                }
            }
        ]

        q["query"]["bool"][query_name] = maincondition

        if kwargs:
            filter_cond = []
            if 'min_popln' in kwargs:
                popln = kwargs.pop("min_popln")
                if popln is not None:
                    filter_cond.append(
                        {"range": {
                            "population": {
                                "gte": popln
                            }
                        }})

            for key, val in kwargs.viewitems():
                if not isinstance(val, basestring):
                    val = list([(v) for v in val])
                    filter_cond.append({"terms": {key: val}})
                else:
                    filter_cond.append({"term": {key: (val)}})

            q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}

        q['from'] = 0
        q['size'] = 50
        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def query(self, qkey, min_popln=None, **kwargs):
        #res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']['hits']
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        #max_score = sum([r['_score'] for r in res])
        max_score = res['max_score']  #sum([r['_score'] for r in res])
        #for t in res:
        # print(max_score)
        gps = []
        if max_score == 0.0:
            ## no results were obtained by elasticsearch instead it returned a random/very
            ## low scoring one
            res['hits'] = []

        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            #t['_source']['_score'] = t[1] / max_score
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                gps = [pt]
                break

            gps.append(pt)

        if len(gps) == 1:
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))

        return gps

    def _oldquery(self,
                  qkey,
                  qtype="exact",
                  analyzer=None,
                  min_popln=None,
                  size=10,
                  **kwargs):
        """
        qtype values are exact, relaxed or geo_distance
        Always limit results to 10
        """
        q = {"query": {"bool": {}}}
        query_name = kwargs.pop('query_name', 'must')
        query_name = "should"
        if query_name == "should":
            q["query"]["bool"]["minimum_number_should_match"] = 1

        maincondition = {}
        if qtype == "exact":
            maincondition = [{
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "alternatenames": {
                        "value": qkey
                    }
                }
            }]
            if analyzer:
                maincondition["match"]["name.raw"]["analyzer"] = analyzer

        elif qtype == "relaxed":
            maincondition["match"] = {"alternatenames": {"query": qkey}}
            if analyzer:
                maincondition["match"]["alternatenames"]["analyzer"] = analyzer

            #q["query"]["bool"][query_name]["match"].pop("name.raw", "")
        elif qtype == "combined":
            maincondition = [{
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": qkey,
                            "fields":
                            ["name.raw", "asciiname", "alternatenames"]
                        }
                    },
                    "filter": {
                        "bool": {
                            "should": [{
                                "range": {
                                    "population": {
                                        "gte": 5000
                                    }
                                }
                            }, {
                                "terms": {
                                    "featureCode": [
                                        "pcla", "pcli", "cont", "rgn", "admd",
                                        "adm1", "adm2"
                                    ]
                                }
                            }]
                        }
                    }
                }
            }, {
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "alternatenames": {
                        "value": qkey[1:]
                    }
                }
            }, {
                "match": {
                    "alternatenames": {
                        "query": qkey,
                        'fuzziness': kwargs.pop("fuzzy", 0),
                        "max_expansions": kwargs.pop("max_expansion", 5),
                        "prefix_length": kwargs.pop("prefix_length", 1)
                    }
                }
            }]

        if maincondition:
            q["query"]["bool"][query_name] = maincondition

            if min_popln:
                filter_cond = [{"range": {"population": {"gte": min_popln}}}]
            else:
                filter_cond = []

            if kwargs:
                #filter_cond = [{"range": {"population": {"gte": min_popln}}}]
                filter_cond += [{
                    "term": {
                        key: val
                    }
                } for key, val in kwargs.viewitems()]
                # print(filter_cond)
                q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}
            elif min_popln:
                filter_cond = [{
                    "range": {
                        "population": {
                            "gte": min_popln
                        }
                    }
                }, {
                    "terms": {
                        "featureCode": ["ppla", "pplx"]
                    }
                }]

                q["query"]["bool"]["filter"] = {
                    "bool": {
                        "should": filter_cond
                    }
                }

        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def oldquery(self, qkey, min_popln=None, **kwargs):
        #res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']['hits']
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        #max_score = sum([r['_score'] for r in res])
        max_score = res['max_score']  #sum([r['_score'] for r in res])
        #for t in res:
        gps = []
        if max_score == 0.0:
            ## no results were obtained by elasticsearch instead it returned a random/very
            ## low scoring one
            res['hits'] = []

        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            #t['_source']['_score'] = t[1] / max_score
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                gps = [pt]
                break

            gps.append(pt)

        if len(gps) == 1:
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))

        return gps

    def near_geo(self, geo_point, min_popln=5000, **kwargs):
        q2 = {
            "query": {
                "bool": {
                    "must": {
                        "match_all": {}
                    },
                    "filter": [
                        {
                            "geo_distance": {
                                "distance": "30km",
                                "coordinates": geo_point
                            }
                        },
                        {
                            "terms":
                            # {"featureCode":
                            #  ["pcli", "ppl", "ppla2", "adm3"]}
                            {
                                "featureClass": ["a", "h", "l", "t", "p", "v"]
                            }
                        }
                    ]
                }
            },
            "sort": {
                "population": "desc"
            }
        }
        if kwargs:
            for key in kwargs:
                q2['query']['bool']['filter'].append(
                    {"term": {
                        key: kwargs[key]
                    }})

        res = self.eserver.search(
            q2, index=self._index,
            doc_type=self._doctype)['hits']['hits'][0]['_source']
        res['confidence'] = 1.0
        return [GeoPoint(**res)]

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)
            settings['mappings'][self._doctype] = settings['mappings'].pop(
                'places')

        try:
            self.eserver.create_index(index=self._index, settings=settings)
        except:
            self.eserver.delete_index(self._index)
            self.eserver.create_index(index=self._index, settings=settings)

        for chunk in bulk_chunks(self._opLoader(datacsv, confDir),
                                 docs_per_chunk=1000):
            self.eserver.bulk(chunk, index=self._index, doc_type=self._doctype)
            print "..",

        self.eserver.refresh(self._index)

    def _opLoader(self, datacsv, confDir):
        ere = re.compile("[^\sa-zA-Z0-9]")
        with DataReader(datacsv, os.path.join(confDir,
                                              'geonames.conf')) as reader:
            cnt = 0
            for row in reader:
                try:
                    row['coordinates'] = [
                        float(row['longitude']),
                        float(row['latitude'])
                    ]
                    try:
                        row['population'] = int(row["population"])
                    except:
                        row['population'] = -1

                    try:
                        row['elevation'] = int(row['elevation'])
                    except:
                        row['elevation'] = -1

                    del (row['latitude'])
                    del (row['longitude'])
                    #print row['name']
                    row['alternatenames'] = row['alternatenames'].lower(
                    ).split(",")
                    row['normalized_asciiname'] = (re.sub(
                        r'\s+', r' ', ere.sub("", row['asciiname']))).strip()
                    cnt += 1
                    yield self.eserver.index_op(row,
                                                index=self._index,
                                                doc_type=self._doctype)
                except:
                    print json.dumps(row)
                    continue

    def remove_dynamic_stopwords(self, term):
        # cc = {}
        # ttl = 0
        words = [w for t in term.split("-") for w in t.split() if len(w) > 1]

        if len(words) == 1:
            return term

        stopword_removed = ""
        for word in words:
            try:
                t = self.eserver.count(word)['count']
                if t >= 20000:
                    continue
            except:
                pass

            stopword_removed += (word + " ")
            # else:
            #     print(term, "stopword ", word)

        return stopword_removed.strip()
Esempio n. 44
0
def import_json_into_es(types, inputfolder, logger):
    """
    imports entitied from the *name.json.bz2* files (one entity per line) into local elasticsearch
    :param types: json string like {'person': 'http://www.wikidata.org/entity/Q5', 'name': 'Wikidata-URI'}
    :param inputfolder:
    :param logger:
    :return:
    """

    es = ElasticSearch(config.ELASTICSEARCH_URL)

    try:
        es.delete_index('wikidata')
        es.create_index('wikidata')
        logger.info('rebuild index [wikidata]')
    except:
        logger.warning('cant delete wikidata index')

    # convert type dictionary
    wd_types = dict()
    for key in types.keys():
        value = int(types[key].split('/')[-1][1:])
        wd_types[value] = {
            'type': key,
            'filename': path.join(inputfolder, '{}.json.bz2'.format(key))
        }

    # import each given type
    for key in wd_types:
        logger.info(wd_types[key])

        done = 0
        items = []

        for line in BZ2File(wd_types[key]['filename'], 'rb'):
            line = line.strip()
            item = loads(line)
            item['uri'] = 'http://wikidata.org/wiki/' + item['id']

            items.append(item)
            done += 1

            if (done % 5000 == 0):
                es.bulk_index('wikidata',
                              wd_types[key]['type'],
                              items,
                              id_field='id')
                items = []

            # if done % len(wd_types) / 10 == 0: # log 10% steps
            #     logger.info('imported {}: {:,d} ({:,d})'.format(wd_types[key]['type'],done, 100*len(wd_types)/done ))

            if done % 10000 == 0:
                logger.info('imported {}: {}'.format(wd_types[key]['type'],
                                                     format(done, ',d')))

        if len(items) > 0:
            es.bulk_index('wikidata',
                          wd_types[key]['type'],
                          items,
                          id_field='id')
        logger.info('imported {}: {}'.format(wd_types[key]['type'],
                                             format(done, ',d')))
Esempio n. 45
0
class LBRest():

    def __init__(self, base=None, idx_exp_url=None, 
                 txt_mapping=None, cfg_idx=None):
        """Serve para cosumir o LBG e o ES."""

        self.base = base
        self.idx_exp_url = idx_exp_url
        if self.idx_exp_url is not None:
            self.idx_exp_host = idx_exp_url.split('/')[2]
            self.idx_exp_index = idx_exp_url.split('/')[3]
            self.idx_exp_type = idx_exp_url.split('/')[4]
            self.es = ElasticSearch("http://" + self.idx_exp_host)
        self.txt_mapping = txt_mapping
        self.cfg_idx = cfg_idx
        self.con_refsd = False

    def get_index(self, bases_list):
        """Obter a a configuração de indexação p/ as bases."""

        bases_indexes = []
        for base in bases_list:
            idx_exp_url = base['metadata']['idx_exp_url']
            nm_idx = idx_exp_url.split('/')[3]
            url_txt_idx = config.REST_URL + "/_txt_idx/" + nm_idx
            req = None
            try:
                req = requests.get(url_txt_idx)
                req.raise_for_status()
                idx_resp = req.json()
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 404:

                    # NOTE: Para os casos onde não há configuração de 
                    # indexação setada na rota "_txt_idx"! By Questor
                    idx_resp = None
                else:
                    fail_content = None
                    if req is not None:
                        fail_content = req._content
                    else:
                        fail_content = str(e)
                    logger.error("Falha HTTP ao tentar obter configuração de "\
                    "índice textual! URL: %s. FALHA: %s" % 
                    (config.REST_URL, fail_content))
                    return []
            except Exception as e:
                fail_content = None
                if req is not None:
                    fail_content = req._content
                else:
                    fail_content = str(e)
                logger.error("Erro ao tentar obter a configuração de índice "\
                "textual! URL: %s. FALHA: %s" % 
                (config.REST_URL, fail_content))
                return []
            bases_indexes.append({"base": base, "index": idx_resp})
        return bases_indexes

    def get_bases(self):
        """Get all bases which has to index registries."""

        # NOTE: A construção logo abaixo tá meio tosca. O objetivo é
        # checar se na estrutura de dados da table "lb_base" já está 
        # o atributo (campo struct) e o campo "txt_mapping". Se não 
        # tiver, tenta obter a base com todos os campos. Trata-se de 
        # um "workaround" sendo o correto que a estrutura de dados 
        # na table "lb_base" esteja atualizada! By Questor
        bases = [ ]
        req = None
        try:
            params = """{
                "select": [
                    "name",
                    "idx_exp_time",
                    "idx_exp_url",
                    "txt_mapping"
                ],
                "literal": "idx_exp is true",
                "limit": null
            }"""
            req = requests.get(config.REST_URL, params={'$$':params})
            if config.FORCE_INDEX == True:
                data = [ ]
                results = dict({
                    u'metadata' : {
                        u'idx_exp_url'  : u''+config.ES_URL+'',
                        u'name'         : u''+config.NM_BASE+'',
                        u'idx_exp_time' : u''+config.TIME_IDX+''
                    }
                })
                data.append(results)
                bases = data
            else:
                req.raise_for_status()
                response = req.json()
                bases = response["results"]
        except Exception as e:
            bases = [ ]
            req = None
            try:
                params = """{
                    "literal": "idx_exp is true",
                    "limit": null
                }"""
                req = requests.get(config.REST_URL, params={'$$':params})
                req.raise_for_status()
                response = req.json()
                bases = response["results"]
            except Exception as e:
                # NOTE: A variável de instância "self.con_refsd" 
                # serve p/ evitar que o aviso mais abaixo seja 
                # exibido repetidamente detonando o log! By Questor
                if self.con_refsd:
                    return bases

                # NOTE: Estou usando '"Connection refused" in str(e)' 
                # pq "raise_for_status()" mais acima não retorna uma 
                # exceção do tipo "requests.exceptions.HTTPError" de 
                # forma q possamos usar o código em "status_code" 
                # tratar erro de forma mais específica! By Questor
                if "Connection refused" in str(e) and not self.con_refsd:
                    logger.error('Erro ao obter a lista bases para '\
                    'indexação. URL: %s. FALHA: Servidor indisponivel! '\
                    'HTTPCode: 502 (Connection refused)!' % (config.REST_URL))
                    self.con_refsd = True
                    return bases
                self.con_refsd = False
                fail_content = None
                if req is not None:
                    fail_content = req._content
                else:
                    fail_content = str(e)
                logger.error(
                    ("Erro ao obter a lista bases para indexação. "
                        "URL: %s. FALHA: %s") % (
                        config.REST_URL, 
                        fail_content))
        return bases

    def get_passed_registries(self):
        """Retorna registros da base de log erros de indexação. 
        Apenas "id_doc_orig" e "dt_last_up_orig".
        """

        # NOTE: Cria base de log se não existir! By Questor
        self.create_log_base()

        registries = [ ]
        params = {'$$':"""{
            "select":["id_doc_orig", "dt_last_up_orig"],
            "literal": "nm_base = '%s'",
            "limit": null
            }""" % self.base}
        url = config.REST_URL + '/log_lbindex/doc'

        req = None
        try:
            req = requests.get(url, params=params)
            req.raise_for_status()
            response = req.json()
            registries = response["results"]
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            logger.error("""
                1 Erro ao recuperar registros da base %s'. FALHA: %s
            """ % ('log_lbindex', fail_content))

        resp = {}
        for reg in registries:
            resp[reg['id_doc_orig']] = reg['dt_last_up_orig']
        return resp

    def get_registries(self):
        """Retorna registros à serem indexados que sob certos critérios não 
        tenham falhado no passado.
        """

        # NOTE: Obtêm registros da base de log de erros! Registros 
        # q tenham falhado no passado! By Questor
        registries = [ ]
        if config.FORCE_INDEX:
            params = {'$$':'{"select":["id_doc", "dt_last_up"], "limit": %d}'}
        else:
            params = {
                '$$':'{"select":["id_doc", "dt_last_up"], \
                "literal":"dt_idx is null", "limit": %d}'
            }

        params.update(result_count='false')
        params['$$'] = params['$$'] % config.DEFAULT_LIMIT

        url = config.REST_URL + '/' + self.base + '/doc'

        req = None
        try:
            req = requests.get(url, params=params)
            req.raise_for_status()
            response = req.json()
            registries = response["results"]
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            logger.error("""
                Erro ao recuperar registros da base %s'. FALHA: %s
            """ % (self.base, fail_content))

        '''
        TODO: Essa lógica poderia ser mais eficiente... A 
        princípio vejo duas soluções...
        1 - Guardar em cache (mais complicada);
        2 - Trazer apenas os registros (id_doc) envolvidos 
        no processo de indexação atual.
        By Questor
        '''

        '''
        TODO: Esse método "self.get_passed_registries()" deveria 
        ser chamado sempre? Mesmo quando a operação é "create"? 
        Checar melhor... By Questor
        '''

        # NOTE: Obtêm registros da base de log de erros! Registros 
        # q tenham falhado no passado! By Questor
        passed = self.get_passed_registries()

        _registries = [ ]
        for reg in registries:
            if reg['_metadata']['id_doc'] in passed:
                '''
                NOTE: O objetivo aqui é checar se o registro 
                está no log de erros (registros que tentou-se 
                indexar no passado) e se estiver ignora-os a 
                não ser que a data de "update" do registro 
                registrado na base de logs seja diferente da 
                data atual do registro, nesses casos o LBIndex 
                vai tentar novamente!
                By Questor
                '''

                '''
                NOTE: No dict "passed" consta apenas o valor 
                do campo "dt_last_up_orig" da base "log_lbindex"! 
                By Questor
                '''
                dt_last_up = passed[reg['_metadata']['id_doc']]

                if dt_last_up != reg['_metadata']['dt_last_up']:
                    _registries.append(reg)
            else:
                _registries.append(reg)

        return _registries

    def get_full_reg(self, id, dt_last_up):
        """Obtêm o registro doc mais textos extraídos dos arquivos anexos se 
        houverem.
        """

        # TODO: Registrar essa ação no log toda "santa vez"? By Questor
        logger.info('Recuperando registro %s da base %s ...' % 
            (str(id), self.base))

        response = None
        url = config.REST_URL + '/' + self.base + '/doc/' + str(id) + '/full'

        req = None
        try:
            req = requests.get(url)
            req.raise_for_status()
            response = req.json()
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            error_msg = """
                Erro ao recuperar registro %s na base %s'. FALHA: %s
            """ % (str(id), self.base, fail_content)

            # TODO: Pq duas chamadas as logs? By Questor
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
        return response

    def es_create_mapping(self):
        """Cria um mapping p/ uma base se houver configuração p/ isso."""

        response_0 = None
        response_0_json = None
        index_url = None
        try:
            index_url = ("http://" + self.idx_exp_host + "/" + 
                self.idx_exp_index + "/" + self.idx_exp_type)
            response_0 = requests.get(index_url + "/_mapping")
            response_0.raise_for_status()
            response_0_json = response_0.json()
        except requests.exceptions.HTTPError as e:

            # NOTE: Normalmente entrará nesse bloco de código 
            # quando o índice não existe! By Questor
            self.es_create_index()
        except requests.exceptions.RequestException as e:
            raise Exception("Problem in the mapping provider! " + str(e))
        except Exception as e:
            raise Exception("Mapping operation. Program error! " + str(e))

        if (response_0.status_code == 200 and not response_0_json and 
                (self.txt_mapping is not None and self.txt_mapping)):
            response_1 = None
            try:
                response_1 = self.es.put_mapping(
                    index=self.idx_exp_index,
                    doc_type=self.idx_exp_type,
                    mapping=self.txt_mapping)

                if (response_1 is None or
                        response_1.get("acknowledged", None) is None or
                        response_1.get("acknowledged", None) != True):
                    raise Exception("Retorno inesperado do servidor \
                        ao criar mapping! " + 
                        str(response_1))
            except Exception as e:
                raise Exception("Mapping creation error! " + str(e))

    def es_create_index(self):
        """Criar um índice p/ a base com as configurações setadas, não havendo 
        criar um índice genérico.
        """

        response_0 = None
        try:
            cfg_idx_holder = None

            # NOTE: Se não houver configuração de indexação "setada" 
            # o sistema vai criar uma padrão! By Questor
            if self.cfg_idx is not None and self.cfg_idx:
                cfg_idx_holder = self.cfg_idx
            else:
                cfg_idx_holder = {
                        "settings":{
                            "analysis":{
                                "analyzer":{
                                    "default":{
                                        "tokenizer":"standard",
                                        "filter":[
                                            "lowercase",
                                            "asciifolding"
                                        ]
                                    }
                                }
                            }
                        }
                    }

            response_0 = self.es.create_index(index=self.idx_exp_index,
                                              settings=cfg_idx_holder)

            if (response_0 is None or
                response_0.get("acknowledged", None) is None or
                response_0.get("acknowledged", None) != True):
                raise Exception("Retorno inesperado do servidor \
                    ao criar index! " + 
                    str(response_0))

            self.es_create_mapping()
        except IndexAlreadyExistsError as e:
            self.es_create_mapping()
        except Exception as e:
            raise Exception("Index creation error! " + str(e))

    def index_member(self, registry, id, dt_last_up):
        """Criar o índice textual para cada registro."""

        logger.info(
            'Indexando registro %s da base %s na url %s ...' % (
                str(id), 
                self.base, self.idx_exp_url))

        try:

            # NOTE: Trata e cria os mappings e index textuais! 
            # By Questor
            self.es_create_mapping()
            self.es.index(self.idx_exp_index, self.idx_exp_type, 
                          registry, id=id)
            return True
        except Exception as e:
            error_msg = ("Erro ao indexar registro %s da base %s na url %s'. "
                "Mensagem de erro: %s") % (
                str(id), 
                self.base, self.idx_exp_url, str(e))
            logger.error(error_msg)

            # TODO: Pq dois logs? By Questor
            self.write_error(id, dt_last_up, error_msg)
            return False

    def update_dt_index(self, id, dt_last_up):
        """Atualizar a data de atualização da indexação textual do registro."""

        logger.info('Alterando data de indexacao do '\
            'registro %s da base %s ...' % (str(id), self.base))
        params = {'value': datetime.datetime.now().\
            strftime('%d/%m/%Y %H:%M:%S')}
        url = (config.REST_URL + '/' + self.base + '/doc/' + str(id) + 
            '/_metadata/dt_idx')

        req = None
        try:
            req = requests.put(url, params=params)
            req.raise_for_status()
            return True
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            error_msg = 'Erro ao alterar data de indexacao do registro %s na '\
                'base %s. FALHA: %s' % (str(id), self.base, fail_content)
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
        return False

    def write_error(self, id_doc, dt_last_up, error_msg):
        """Write errors to LightBase."""

        error = {
            'nm_base': self.base,
            'id_doc_orig': id_doc,
            'error_msg': error_msg,
            'dt_error': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S'),
            'dt_last_up_orig': dt_last_up
        }
        url = config.REST_URL + '/log_lbindex/doc'
        data = {'value': json.dumps(error)}
        req = None
        try:
            req = requests.post(url, data=data)
            req.raise_for_status()
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            logger.error("""
                0 Erro ao tentar escrever erro no Lightbase. FALHA: %s
            """ % fail_content)

    def get_errors(self):
        """Get all bases which has to index registries."""

        errors = [ ]
        params = """{
            "literal": "base = '%s'",
            "limit": 250
        }""" % (self.base)
        url = config.REST_URL + '/_index_error'

        req = None
        try:
            req = requests.get(url, params={'$$':params})
            req.raise_for_status()
            response = req.json()
            errors = response["results"]
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            logger.error("""
                Erro ao tentar recuperar erros de indice. URL: %s. FALHA: %s
            """ % (url, fail_content))
        return errors

    # TODO: Esse método serve para criar/atualizar p/ uma 
    # indexação (index) padrão! No momento está "desvirtuado", 
    # pois basta apagar o índice p/ q ele seja recriado com a 
    # indexação setada na rota "_txt_idx"! Creio que esse 
    # método não faz muito sentido aqui. Sugiro remover! 
    # By Questor
    def create_index(self):
        """Cria índice com as opções de mapeamento padrão
        Atualiza o índice se já estiver criado.
        """

        settings = {
            "settings":{
                "analysis":{
                    "analyzer":{
                        "default":{
                            "tokenizer":"standard",
                            "filter":[
                                "lowercase",
                                "asciifolding"
                            ]
                        }
                    }
                }
            }
        }

        http, space, address, _index, _type = self.idx_exp_url.split('/')

        try:
            result = self.es.create_index(
                index=_index,
                settings=settings
            )
        except IndexAlreadyExistsError as e:
            logger.info("O índice já existe. Tentando atualizar o mapping...")
            self.es.close_index(index=_index)
            result = self.es.update_settings(
                index=_index,
                settings=settings
            )
            logger.info("Mapping atualizado com sucesso. Abrindo o índice...")
            self.es.open_index(index=_index)
            logger.info("Índice reaberto com sucesso!")

    def delete_index(self, registry):
        """Deletar registros no index."""

        id = registry['id_doc']
        try:
            http, space, address, _index, _type = self.idx_exp_url.split('/')
            self.es.delete(_index, _type, id=id)
            return True

        except ElasticHttpNotFoundError as e:
            return True

        except Exception as e:
            error_msg = 'Erro ao deletar indice %s da base %s na url %s. '\
                'Mensagem de erro: %s' % \
                (str(id), self.base, self.idx_exp_url, str(e))
            logger.error(error_msg)
            return False

    def delete_error(self, registry):
        """Deletar registro de erros na rota '_index_error'."""

        url = (config.REST_URL + 
            """/_index_error?$$={"literal":"base = '%s' and id_doc = %d"}""")
        url = url % (registry['base'], registry['id_doc'])
        logger.info('Deletando registro de erro de indice na url %s' % url)

        req = None
        try:
            req = requests.delete(url)
            req.raise_for_status()
            return True
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            error_msg = """
                Erro ao deletar erro de indice. FALHA: %s
            """ % (fail_content)
            logger.error(error_msg)
        return False

    @staticmethod
    def create_log_base():
        """Cria base de log do LBIndex caso não exista."""

        log_base = model.LogBase()
        response = log_base.get_base()
        if not response:

            # NOTE: Cria a base já que ela não existe!
            logger.info("Criando base de log do índice...")
            result = log_base.create_base()
            if result is None:
                logger.error("Erro na criação da base de log: \n%s", 
                             response.text)
                return False
            else:
                logger.info("Base de log criada com sucesso!")
        return True
Esempio n. 46
0
class ElasticConnector(Connector):
    """
    Class for connectors that are operate with elasticsearch database
  """
    MAX_SIZE = 1000

    def __init__(self, database, host='http://localhost:9200/'):
        self.client = ElasticSearch(host)
        self.index = database
        self.create_index()

    def query_to_id(self, query):
        """
      Returns id representation of a specified query
      This is a temporary method as a replacement of elasticsearch query search
    """
        return "_".join(str(k) + "_" + str(v)
                        for k, v in query.items()).replace("/", "_")

    def create_index(self):
        """
      Creates specified index or catches an exception if it has already been created
    """
        try:
            self.client.create_index(self.index)
        except Exception as e:
            pass

    def set_dynamic_mapping(self, collection):
        """
      Sets dynamic mapping for a specified document type
    """
        self.client.put_mapping(self.index, collection, {'dynamic': True})

    def save_block(self, block):
        """
      Saves operation info in a database
    """
        super().save_block(block)
        collection = block.get_collection()
        dictionary = block.to_dict()
        query = block.get_query()
        self.update_by_query(collection, query, block)

    def update_by_query(self, collection, query, document):
        """
      Sets dynamic mapping for a specified collection,
      then creates a new id for a document depending on query for it.
      Saves a new object in a database as a new one
    """
        try:
            self.set_dynamic_mapping(collection)
            document_id = document.get_id()
            document_body = document.to_dict()
            if "_id" in document_body.keys():
                del document_body['_id']
            self.client.index(self.index,
                              collection,
                              document_body,
                              id=self.query_to_id(query))
        except Exception as e:
            print(e)
            pass

    def find_last_block(self):
        """
      Finds last block index as a value field of a document 
      in a status collection with specified id
    """
        try:
            document = self.client.get(self.index, 'status',
                                       'height_all_tsx')['_source']
            return document['value']
        except ElasticHttpNotFoundError as e:
            return 0

    def update_last_block(self, last_block):
        """
      Updates last block index as a value field of a document 
      in a status collection with specified id
    """
        self.client.index(self.index,
                          'status', {'value': last_block},
                          id='height_all_tsx')

    def save_instance(self, instance):
        """
      Saves account or comment object
    """
        self.update_by_query(instance.get_collection(), instance.get_query(),
                             instance)

    def get_instances_to_update(self, collection):
        """
      Finds and returns all dictionaries with objects that should be updated
    """
        hits = self.client.search("need_update:true",
                                  index=self.index,
                                  doc_type=collection,
                                  size=self.MAX_SIZE)['hits']['hits']
        return [{**hit['_source'], **{"_id": hit["_id"]}} for hit in hits]

    def update_instances(self, collection, instances):
        """
      Resets need_update flag for all instances in a list by their ids in _id field
    """
        for instance in instances:
            self.client.update(self.index,
                               collection,
                               instance["_id"],
                               doc={'need_update': False})