Example #1
def delete(config, tree_names, all, force):
    """Delete indices and their catalog entries.
    
    Only indices whose format version matches the copy of DXR this runs
    under are deleted.
    
    """
    es = ElasticSearch(config.es_hosts)
    if all:
        echo('Deleting catalog...')
        es.delete_index(config.es_catalog_index)
        # TODO: Delete tree indices as well.
    else:
        for tree_name in tree_names:
            frozen_id = '%s/%s' % (FORMAT, tree_name)
            try:
                frozen = es.get(config.es_catalog_index, TREE, frozen_id)
            except ElasticHttpNotFoundError:
                raise ClickException('No tree "%s" in catalog.' % tree_name)
            # Delete the index first. That way, if that fails, we can still
            # try again; we won't have lost the catalog entry. Refresh is
            # infrequent enough that we wouldn't avoid a race around a
            # catalogued but deleted instance the other way around.
            try:
                es.delete_index(frozen['_source']['es_alias'])
            except ElasticHttpNotFoundError:
                # It's already gone. Fine. Just remove the catalog entry.
                pass
            es.delete(config.es_catalog_index, TREE, frozen_id)
Example #2
def main():
    """
    Method to kick things off
    """

    # Setup workers
    pool = Pool(processes=CPU_COUNT)

    # Prepare URLs
    urls = []
    for url in CRAWL_URLS:
        urls.append(str(BASE_URL + url))

    if USE_ES:
        # Create connection
        es = ElasticSearch(ES_URL)

        try:
            # Delete the existing index
            es.delete_index(ES_INDEX)
        except:
            # In case the index does not exist
            pass

        # Create the index to use
        es.create_index(ES_INDEX)

    else:
        # Setup the database tables, connect
        init_db()

    # Scrape and store async
    pool.map(scrape, urls)
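
main() relies on module-level configuration that the snippet does not show. A minimal sketch of what those names might look like; all values below are illustrative, not taken from the original project:

from multiprocessing import Pool, cpu_count
from pyelasticsearch import ElasticSearch

CPU_COUNT = cpu_count()           # worker processes for the Pool
BASE_URL = 'http://example.com'   # hypothetical site to crawl
CRAWL_URLS = ['/page-1', '/page-2']
USE_ES = True                     # index results in Elasticsearch instead of a database
ES_URL = 'http://localhost:9200/'
ES_INDEX = 'crawl'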
Example #3
def IndexData(request):
    es = ElasticSearch(settings.ELASTIC_SEARCH)
    for file in fileHolder:
        index = file['segment_name'].lower()
        rawfiles = file['rawfiles']
        data_for_es = file['dataFrames']
        try:
            es.delete_index(index.replace(" ", ""))
        except:
            pass
    es.create_index(index.replace(" ", ""))

    ## Loop over the dataframe and index each record into Elasticsearch
    docs = json.loads(data_for_es.to_json(orient='records'))
    es.bulk((es.index_op(doc) for doc in docs),
            index=index.replace(" ", ""),
            doc_type=index)

    ##Create segment template
    file_names = []
    for file in rawfiles:
        file_names.append(file.name)

    segment = Segments(name=index,
                       files_added=",".join(file_names),
                       es_index=index.replace(" ", ""))
    segment.save()

    segment = Segments.objects.get(name=index)

    return render(request, 'analyse.html', {'segment': segment})
Example #4
def cli(index_name, delete_index, mapping_file, settings_file, doc_type,
        import_file, delimiter, tab, host, docs_per_chunk, bytes_per_chunk,
        parallel, quiet):
    """
    Bulk import a delimited file into a target Elasticsearch instance. Common
    delimited files include things like CSV and TSV.

    \b
    Load a CSV file:
      csv2es --index-name potatoes --doc-type potato --import-file potatoes.csv
    \b
    For a TSV file, note the tab delimiter option
      csv2es --index-name tomatoes --doc-type tomato \
             --import-file tomatoes.tsv --tab
    \b
    For a nifty pipe-delimited file (delimiters must be one character):
      csv2es --index-name pipes --doc-type pipe --import-file pipes.psv \
             --delimiter '|'

    """

    echo('Using host: ' + host, quiet)
    es = ElasticSearch(host)

    if delete_index:
        try:
            es.delete_index(index_name)
            echo('Deleted: ' + index_name, quiet)
        except ElasticHttpNotFoundError:
            echo('Index ' + index_name + ' not found, nothing to delete',
                 quiet)

    try:
        if settings_file:
            echo('Applying settings from: ' + settings_file, quiet)
            with open(settings_file) as f:
                settings = json.loads(f.read())
            es.create_index(index_name, settings)
        else:
            es.create_index(index_name)
        echo('Created new index: ' + index_name, quiet)
    except ElasticHttpError as e:
        if e.error['type'] == 'index_already_exists_exception':
            echo('Index ' + index_name + ' already exists', quiet)
        else:
            raise

    echo('Using document type: ' + doc_type, quiet)
    if mapping_file:
        echo('Applying mapping from: ' + mapping_file, quiet)
        with open(mapping_file) as f:
            mapping = json.loads(f.read())
        es.put_mapping(index_name, doc_type, mapping)

    target_delimiter = sanitize_delimiter(delimiter, tab)
    documents = documents_from_file(es, import_file, target_delimiter, quiet)
    perform_bulk_index(host, index_name, doc_type, documents, docs_per_chunk,
                       bytes_per_chunk, parallel)
Example #5
def cli(index_name, delete_index, mapping_file, settings_file, doc_type, host,
        docs_per_chunk, bytes_per_chunk, parallel, quiet, parser, config_file,
        user, passwd):

    with open(config_file, "rb") as f:
        con = json.loads(f.read())
    host = con['es_config']['host']
    echo('Using host: ' + host, quiet)
    es = ElasticSearch(host)

    if con['db']['type'] == "oracle":
        db = import_module('cx_Oracle')
        collection = db.connect(user, passwd, con['db']['con_str'])
    else:
        db = import_module('MySQLdb')
        collection = db.connect(con['db']['con_str'][0],
                                user,
                                passwd,
                                con['db']['con_str'][1],
                                charset=con['db']['con_str'][2])

    if delete_index:  # delete the index
        try:
            stamp = 0
            es.delete_index(index_name)
            echo('Deleted: ' + index_name, quiet)
        except ElasticHttpNotFoundError:
            echo('Index ' + index_name + ' not found, nothing to delete',
                 quiet)

    try:
        if settings_file:
            with open(settings_file, 'r') as f:
                settings_json = json.loads(f.read())
            es.create_index(index_name, settings=settings_json)
        else:
            es.create_index(index_name)
        echo('Created new index: ' + index_name, quiet)
    except Exception:
        echo('Index ' + index_name + ' already exists', quiet)

    echo('Using document type: ' + doc_type, quiet)

    es.put_mapping(index_name, doc_type, con['mapping'])

    parser_fun = None
    if parser is not None:
        # load the parser function
        parser_fun = import_module(PARSER_PATH + '.' + parser)

    documents = documents_from_file(es, collection, quiet, parser_fun, con)

    perform_bulk_index(host, index_name, doc_type, documents, docs_per_chunk,
                       bytes_per_chunk, parallel)
    print "end:" + time.strftime(
        ISOTIMEFORMAT, time.localtime()) + '\n all records import complete.'
Example #6
class ElasticSearchTestCase(unittest.TestCase):
    def setUp(self):
        self.conn = ElasticSearch('http://localhost:9200/')

    def tearDown(self):
        self.conn.delete_index("test-index")

    def assertResultContains(self, result, expected):
        for (key, value) in expected.items():
            self.assertEquals(value, result[key])
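
    # A hypothetical test method built on the helpers above (not part of the
    # original example); the document fields and id value are illustrative.
    def test_index_and_get_document(self):
        self.conn.index('test-index', 'test-type', {'name': 'Joe Tester'}, id=1)
        result = self.conn.get('test-index', 'test-type', 1)
        self.assertResultContains(result['_source'], {'name': 'Joe Tester'})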
Example #8
def import_json_into_es(types, inputfolder, logger):
    """
    imports entities from the *name.json.bz2* files (one entity per line) into the local Elasticsearch
    :param types: json string like {'person': 'http://www.wikidata.org/entity/Q5', 'name': 'Wikidata-URI'}
    :param inputfolder:
    :param logger:
    :return:
    """

    es = ElasticSearch(config.ELASTICSEARCH_URL)

    try:
        es.delete_index('wikidata')
        es.create_index('wikidata')
        logger.info('rebuild index [wikidata]')
    except:
        logger.warning("can't delete wikidata index")


    # convert type dictionary
    wd_types = dict()
    for key in types.keys():
        value = int(types[key].split('/')[-1][1:])
        wd_types[value] = {'type': key,
                           'filename': path.join(inputfolder, '{}.json.bz2'.format(key))}


    # import each given type
    for key in wd_types:
        logger.info(wd_types[key])

        done = 0
        items = []

        for line in BZ2File(wd_types[key]['filename'],'rb'):
            line = line.strip()
            item = loads(line)
            item['uri'] = 'http://wikidata.org/wiki/' + item['id']

            items.append(item)
            done += 1

            if ( done % 5000 == 0 ):
                es.bulk_index('wikidata', wd_types[key]['type'], items, id_field='id')
                items = []

            # if done % len(wd_types) / 10 == 0: # log 10% steps
            #     logger.info('imported {}: {:,d} ({:,d})'.format(wd_types[key]['type'],done, 100*len(wd_types)/done ))

            if done % 10000 == 0:
                logger.info('imported {}: {}'.format(wd_types[key]['type'],format(done, ',d')))

        if len(items) > 0:
            es.bulk_index('wikidata', wd_types[key]['type'], items, id_field='id')
        logger.info('imported {}: {}'.format(wd_types[key]['type'],format(done, ',d')))
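
A minimal invocation sketch for import_json_into_es; the types mapping follows the format given in the docstring, while the folder path and logger name are illustrative:

import logging

logger = logging.getLogger('wikidata-import')
types = {'person': 'http://www.wikidata.org/entity/Q5'}
import_json_into_es(types, '/data/wikidata', logger)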
Example #9
def feed(index='monolith', type='downloads', es_port=9200):
    client = ElasticSearch('http://0.0.0.0:%d/' % es_port)
    platforms = ['Mac OS X', 'Windows 8', 'Ubuntu']

    # indexing a year of data (2012)
    first_day = datetime.datetime(2012, 1, 1)
    last_day = datetime.datetime(2012, 12, 31)
    day_range = last_day - first_day

    for month in range(1, 13):
        name = 'time_2012-%.2d' % month
        try:
            client.delete_index(name)
        except Exception:
            pass
        client.create_index(name,
                            settings={
                                'number_of_shards': 1,
                                'number_of_replicas': 0,
                                'analysis': {
                                    'analyzer': {
                                        'default': {
                                            'type': 'custom',
                                            'tokenizer': 'keyword'
                                        }
                                    }
                                },
                                'store': {
                                    'compress': {
                                        'stored': 'true'
                                    }
                                },
                            })

    # indexing 100 apps
    for add_on in range(100):
        docs = defaultdict(list)
        for delta in range(day_range.days):
            date = first_day + datetime.timedelta(days=delta)
            data = {
                'date': date,
                'os': random.choice(platforms),
                'downloads_count': random.randint(1000, 1500),
                'users_count': random.randint(10000, 15000),
                'add_on': add_on + 1
            }
            docs[date.month].append(data)
        for month, values in docs.items():
            client.bulk_index('time_2012-%.2d' % month, type, values)
            sys.stdout.write('.')
            sys.stdout.flush()

    client.optimize('time_*', max_num_segments=1, wait_for_merge=True)
    client.flush()
    sys.stdout.write('\nDone!\n')
Example #10
def index_data(data_source, index_name, doc_type):
    es = ElasticSearch(urls='http://localhost', port=9200)
    try:
        es.delete_index(index_name)
    except:
        pass
    es.create_index(index_name)
    try:
        es.bulk_index(index_name, doc_type, data_source)
    except:
        print("Error! Skipping Document...!")
        pass
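
An illustrative call, assuming data_source is the list of documents that bulk_index receives; the index and doc-type names are hypothetical:

docs = [{'title': 'first document'}, {'title': 'second document'}]
index_data(docs, 'demo_index', 'demo_doc')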
Example #11
    def setUp(self):
        es_connection = ElasticSearch('http://localhost:9200')
        try:
            es_connection.delete_index('unit_tests')
        except:
            pass
        es_connection.create_index('unit_tests')

        class TestModel(SearchModel):
            index_name = 'unit_tests'

        self.model = TestModel
Example #12
def cli(index_name, delete_index, mapping_file, doc_type, import_file,
        delimiter, tab, host, docs_per_chunk, bytes_per_chunk, parallel, quiet,
        document_id_in_file):
    """
    Bulk import a delimited file into a target Elasticsearch instance. Common
    delimited files include things like CSV and TSV.

    \b
    Load a CSV file:
      csv2es --index-name potatoes --doc-type potato --import-file potatoes.csv
    \b
    For a TSV file, note the tab delimiter option
      csv2es --index-name tomatoes --doc-type tomato --import-file tomatoes.tsv --tab
    \b
    For a nifty pipe-delimited file (delimiters must be one character):
      csv2es --index-name pipes --doc-type pipe --import-file pipes.psv --delimiter '|'

    """

    echo('Using host: ' + host, quiet)
    es = ElasticSearch(host)

    if delete_index:
        try:
            es.delete_index(index_name)
            echo('Deleted: ' + index_name, quiet)
        except ElasticHttpNotFoundError:
            echo('Index ' + index_name + ' not found, nothing to delete',
                 quiet)

    try:
        es.create_index(index_name)
        echo('Created new index: ' + index_name, quiet)
    except IndexAlreadyExistsError:
        echo('Index ' + index_name + ' already exists', quiet)
    except ElasticHttpError as exception:
        echo(
            'Error creating index %s. ElasticHttpError [%s]' %
            (index_name, exception.error), quiet)

    echo('Using document type: ' + doc_type, quiet)
    if mapping_file:
        echo('Applying mapping from: ' + mapping_file, quiet)
        with open(mapping_file) as f:
            mapping = json.loads(f.read())
        es.put_mapping(index_name, doc_type, mapping)

    target_delimiter = sanitize_delimiter(delimiter, tab)
    documents = documents_from_file(es, import_file, target_delimiter, quiet,
                                    document_id_in_file)
    perform_bulk_index(host, index_name, doc_type, documents, docs_per_chunk,
                       bytes_per_chunk, parallel)
Example #13
def es_indexer():
    es=ElasticSearch('http://localhost:9200/')
    if es:
        # Delete index /sentiment_analysis if it already exists
        try:
            es.delete_index("sentiment_analysis")
            print "Deleted index sentiment_analysis if it already existed."
        except:
            raise 'ElasticHttpNotFoundError'
        finally:
            print "Creating index sentiment_analysis ...."
            es.create_index("sentiment_analysis",{
                                    'settings': {
                                        'index': {
                                            'store': {
                                                'type': "default"
                                            },
                                            'number_of_shards': 1,
                                            'number_of_replicas': 1
                                        },
                                        'analysis': {
                                            'analyzer': {
                                                'default_english': {
                                                    'type': 'english'
                                                }
                                            }
                                        }
                                    },
                                    "mappings": {
                                        "document": {
                                            "properties": {
                                                "text": {
                                                    "type": "string",
                                                    "store": True,
                                                    "index": "analyzed",
                                                    "term_vector": "with_positions_offsets_payloads",
                                                    "analyzer": "default_english"
                                                },
                                                "sentiment": {
                                                    "type": "string",
                                                    "store": True,
                                                    "index": "analyzed",
                                                    "analyzer": "default_english"
                                                }
                                            }
                                        }
                                    }
                                })
            print "Created index 'sentiment_analysis' with type 'document' and an analyzed field 'text'."
    else:
        print "ElasticSearch is not running or the default cluster is down."
Example #14
class Indexer(object):
  
  def __init__(self, input):
    self.input = input
    self.es = ElasticSearch()
    self.index_name = "psim"
    self.doc_type = 'book'
    
  def delete_index(self):
    # Delete index if already found one
    try:
      self.es.delete_index(index = self.index_name)
    except Exception:
      pass
  
  def create_index(self):
    self.es.create_index(index=self.index_name, settings = self.get_index_settings())
    
  def get_index_settings(self):
    settings = {
        "mappings": {
            "book": {
                "_all": {"enabled": "false"},
                "properties": {
                    "codes": {"type": "string",
                              "term_vector": "yes",
                              "store": "true"},
                    "pid": {"type": "string"},
                    "embedding": {"type": "float",
                                  "store": "true"},
                    "magnitude": {"type": "float", "store": "true"}
                }
            }
        }
    }
    return settings
  
  def documents(self):
    with open(self.input) as input_file:
      for line in input_file:
        json_doc = json.loads(line)
        yield self.es.index_op(json_doc, doc_type=self.doc_type)
    
  def index(self):
    self.delete_index()
    self.create_index()
    for chunk in bulk_chunks(self.documents(), docs_per_chunk=1000):
      self.es.bulk(chunk, index = self.index_name, doc_type = self.doc_type)
    self.es.refresh(self.index_name)
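
A hypothetical driver for the Indexer above; it assumes a books.json file with one JSON document per line, which is what documents() expects:

if __name__ == '__main__':
  indexer = Indexer('books.json')
  indexer.index()  # drops any old "psim" index, recreates it, and bulk-loads the file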
Example #15
def index_data(data_path, chunksize, index_name, doc_type):
    f = open(data_path)
    csvfile = pd.read_csv(f, iterator=True, chunksize=chunksize)
    es = ElasticSearch(urls='http://localhost', port=9200)
    try:
        es.delete_index(index_name)
    except:
        pass
    es.create_index(index_name)
    for i, df in enumerate(csvfile):
        records = df.where(pd.notnull(df), None).T.to_dict()
        records_list = [records[i] for i in records]
        try:
            es.bulk_index(index_name, doc_type, records_list)
        except:
            print("Error! Skipping chunk...!")
            pass
Example #16
def feed(index='monolith', type='downloads', es_port=9200):
    client = ElasticSearch('http://0.0.0.0:%d/' % es_port)
    platforms = ['Mac OS X', 'Windows 8', 'Ubuntu']

    # indexing a year of data (2012)
    first_day = datetime.datetime(2012, 1, 1)
    last_day = datetime.datetime(2012, 12, 31)
    day_range = last_day - first_day

    for month in range(1, 13):
        name = 'time_2012-%.2d' % month
        try:
            client.delete_index(name)
        except Exception:
            pass
        client.create_index(name, settings={
            'number_of_shards': 1,
            'number_of_replicas': 0,
            'analysis': {'analyzer': {'default': {
                'type': 'custom', 'tokenizer': 'keyword'
            }}},
            'store': {'compress': {'stored': 'true'}},
        })

    # indexing 100 apps
    for add_on in range(100):
        docs = defaultdict(list)
        for delta in range(day_range.days):
            date = first_day + datetime.timedelta(days=delta)
            data = {'date': date,
                    'os': random.choice(platforms),
                    'downloads_count': random.randint(1000, 1500),
                    'users_count': random.randint(10000, 15000),
                    'add_on': add_on + 1}
            docs[date.month].append(data)
        for month, values in docs.items():
            client.bulk_index('time_2012-%.2d' % month, type, values)
            sys.stdout.write('.')
            sys.stdout.flush()

    client.optimize('time_*', max_num_segments=1, wait_for_merge=True)
    client.flush()
    sys.stdout.write('\nDone!\n')
Example #17
            'adId': {'type': 'integer'},
            'adUrl': {'type': 'string'},
            'adType': {'type': 'string'},
            'adSize': {'type': 'string'},
            'dateCreated': {'type': 'date', 'format' : 'YYYY-MM-dd HH:mm:ss'},
            'websiteId': {'type': 'integer'},
            'website': {'type': 'string', 'analyzer': 'simple'},
            'category': {'type': 'string'},
            'subCategory': {'type': 'string'}
        }
    }
}


es.health(wait_for_status='yellow')
es.delete_index('write-ads')
es.create_index('write-ads', settings={'mappings': ad_mapping})

dateYMD = args["date"]
prepareDataFromDB(dateYMD)

dir = DATA_FILES_JSON + '/' + dateYMD
for filename in os.listdir(dir):
    if filename.endswith('.json'):
        with open(dir + '/' + filename) as open_file:
            json_docs = json.load(open_file)
            es.bulk((es.index_op(doc) for doc in json_docs),
                index='write-ads',
                doc_type='ad')

es.refresh("write-ads")
Example #18
class IbbdElasticSearch:
    """
    Elasticsearch helper.
    Docs: http://pyelasticsearch.readthedocs.io/en/latest/
    """
    es = None
    config = {}

    mapping_is_set = False  # whether the ES mapping has already been set

    def __init__(self, config):
        """
        Initialize the ES connection.
        Config parameters:
        host: ES connection string
        indexName: name of the index
        deleteIndex: whether to delete an existing index; defaults to false (do not delete)
        settings: index settings; see the ES documentation for the available options
        settingsFile: index settings as a JSON file; see the ES documentation for the available options
        mappings: mapping configuration; see the ES documentation for the available options
        mappingsFile: mapping configuration as a JSON file; see the ES documentation for the available options
        idField: ID field; some records carry their own ID

        Note: at most one of settings and settingsFile may be given,
        and at most one of mappings and mappingsFile.
        """
        self.es = ElasticSearch(config['host'])

        if 'docType' not in config:
            config['docType'] = config['indexName']
        self.config = config

        if 'deleteIndex' in config and config['deleteIndex']:
            try:
                self.es.delete_index(config['indexName'])

                print('delete index ' + config['indexName'] + ' success!')
            except ElasticHttpNotFoundError:  # if it did not exist, just print a notice
                print('Index ' + config['indexName']
                      + ' not found, nothing to delete!')
            except:
                raise Exception('Index ' + config['indexName'] + ' delete error!')

        try:
            if 'settings' in config:
                self.es.create_index(config['indexName'],
                                     settings=config['settings'])
            elif 'settingsFile' in config:
                with open(config['settingsFile'], 'r') as f:
                    config['settings'] = json.loads(f.read())
                self.es.create_index(config['indexName'],
                                     settings=config['settings'])
            else:
                self.es.create_index(config['indexName'])

            print('create index ' + config['indexName'] + ' success!')
        except Exception:
            raise Exception("create index " + config['indexName'] + ' error!')

    def _putMapping(self, row):
        """
        Set the ES mapping.
        A default configuration can be generated from row.
        """
        try:
            if 'mappingsFile' in self.config:
                with open(self.config['mappingsFile'], 'r') as f:
                    self.config['mappings'] = json.loads(f.read())

            if 'mappings' in self.config:
                self.es.put_mapping(self.config['indexName'],
                                    self.config['docType'],
                                    self.config['mappings'])
            print("put mapping " + self.config['indexName'] + ' success!')
        except Exception:
            raise Exception("put mapping " + self.config['indexName'] + ' error!')

    def read(self):
        pass

    def batchRead(self):
        pass

    def write(self, row):
        """
        Write a single record.
        """
        return self.batchWrite([row])

    def batchWrite(self, rows):
        """
        Write multiple records.
        """
        if not self.mapping_is_set:   # set the mapping once, from the first row
            self.mapping_is_set = True
            self._putMapping(rows[0])

        docs = ()
        if 'idField' in self.config:
            docs = (self.es.index_op(doc, id=doc.pop(self.config['idField'])) \
                    for doc in rows)
        else:
            docs = (self.es.index_op(doc) for doc in rows)

        self.es.bulk(docs,
                     index=self.config['indexName'],
                     doc_type=self.config['docType'])

        return True
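
A minimal usage sketch for the class above, based on the config keys documented in its __init__ docstring; the host, index name, and rows are illustrative:

config = {
    'host': 'http://localhost:9200/',
    'indexName': 'ibbd_demo',
    'deleteIndex': True,
    'idField': 'id',
}
writer = IbbdElasticSearch(config)
writer.batchWrite([
    {'id': 1, 'title': 'first row'},
    {'id': 2, 'title': 'second row'},
])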
Example #19
def drop_elastic(settings):
    try:
        es = ElasticSearch(settings['ELASTICSEARCH_URL'])
        es.delete_index(settings['ELASTICSEARCH_INDEX'])
    except:
        pass
Example #20
es = ElasticSearch(ELASTICSEARCH_HOST, timeout=120)


FILES = [
    "nofly/shapefile/us_national_parks",
    "nofly/shapefile/us_military",
    "nofly/shapefile/5_mile_airport",
]

try:
    es.delete_all(ELASTICSEARCH_DOC, ELASTICSEARCH_INDEX)
except ElasticHttpError:
    pass

try:
    es.delete_index(ELASTICSEARCH_INDEX)
except ElasticHttpError:
    pass

index_settings = {
    "number_of_shards": 3,
    "number_of_replicas": 1,
    "mappings": {
        ELASTICSEARCH_DOC: {
            "properties": {
                "location": {
                    "type": "geo_shape",
                    "tree": "quadtree",
                    "precision": "1m"
                }
            }
Example #21
class ESWrapper(BaseDB):
    def __init__(self,
                 index_name,
                 doc_type,
                 host='http://localhost',
                 port=9200):
        self.eserver = ElasticSearch(urls=host,
                                     port=port,
                                     timeout=60,
                                     max_retries=3)
        #self._base_query = {"query": {"bool": {"must": {"match": {}}}}}
        #self._base_query = {"query": {"bool": {}}}
        self._geo_filter = {"distance": "20km", "coordinates": {}}
        self._population_filter = {'population': {'gte': 5000}}
        self._index = index_name
        self._doctype = doc_type

    def getByid(self, geonameId):
        maincondition = {"match": {"id": geonameId}}
        q = {"query": {"bool": {"must": maincondition}}}
        return self.eserver.search(
            q, index=self._index,
            doc_type=self._doctype)['hits']['hits'][0]['_source']

    def _query(self,
               qkey,
               qtype="exact",
               analyzer=None,
               min_popln=None,
               size=10,
               **kwargs):
        """
        qtype values are exact, relaxed or geo_distance
        Always limit results to 10
        """
        q = {"query": {"bool": {}}}
        query_name = kwargs.pop('query_name', 'must')
        query_name = "should"
        if query_name == "should":
            q["query"]["bool"]["minimum_number_should_match"] = 1

        maincondition = {}
        if qtype == "exact":
            maincondition = [{
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "alternatenames": {
                        "value": qkey
                    }
                }
            }]
            #maincondition["match"] = {"name.raw": {"query": qkey}}
            if analyzer:
                maincondition["match"]["name.raw"]["analyzer"] = analyzer

        elif qtype == "relaxed":
            maincondition["match"] = {"alternatenames": {"query": qkey}}
            if analyzer:
                maincondition["match"]["alternatenames"]["analyzer"] = analyzer

            #q["query"]["bool"][query_name]["match"].pop("name.raw", "")
        elif qtype == "combined":
            maincondition = [{
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": qkey,
                            "fields":
                            ["name.raw", "asciiname", "alternatenames"]
                        }
                    },
                    "filter": {
                        "bool": {
                            "should": [{
                                "range": {
                                    "population": {
                                        "gte": 5000
                                    }
                                }
                            }, {
                                "terms": {
                                    "featureCode": [
                                        "pcla", "pcli", "cont", "rgn", "admd",
                                        "adm1", "adm2"
                                    ]
                                }
                            }]
                        }
                    }
                }
            }, {
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "alternatenames": {
                        "value": qkey[1:]
                    }
                }
            }, {
                "match": {
                    "alternatenames": {
                        "query": qkey,
                        'fuzziness': kwargs.pop("fuzzy", 0),
                        "max_expansions": kwargs.pop("max_expansion", 5),
                        "prefix_length": kwargs.pop("prefix_length", 1)
                    }
                }
            }]

        if maincondition:
            q["query"]["bool"][query_name] = maincondition

            if min_popln:
                filter_cond = [{"range": {"population": {"gte": min_popln}}}]
            else:
                filter_cond = []

            if kwargs:
                #filter_cond = [{"range": {"population": {"gte": min_popln}}}]
                filter_cond += [{
                    "term": {
                        key: val
                    }
                } for key, val in kwargs.viewitems()]
                # print(filter_cond)
                q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}
            elif min_popln:
                filter_cond = [{
                    "range": {
                        "population": {
                            "gte": min_popln
                        }
                    }
                }, {
                    "terms": {
                        "featureCode": ["ppla", "pplx"]
                    }
                }]

                q["query"]["bool"]["filter"] = {
                    "bool": {
                        "should": filter_cond
                    }
                }

        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def query(self, qkey, min_popln=None, **kwargs):
        #res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']['hits']
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        #max_score = sum([r['_score'] for r in res])
        max_score = res['max_score']  #sum([r['_score'] for r in res])
        #for t in res:
        gps = []
        if max_score == 0.0:
            ## no results were obtained by elasticsearch instead it returned a random/very
            ## low scoring one
            res['hits'] = []

        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            #t['_source']['_score'] = t[1] / max_score
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                gps = [pt]
                break

            gps.append(pt)

        if len(gps) == 1:
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))

        return gps

    def near_geo(self, geo_point, min_popln=5000, **kwargs):
        q2 = {
            "query": {
                "bool": {
                    "must": {
                        "match_all": {}
                    },
                    "filter": [{
                        "geo_distance": {
                            "distance": "30km",
                            "coordinates": geo_point
                        }
                    }, {
                        "terms": {
                            "featureCode": ["pcli", "ppl", "ppla2", "adm3"]
                        }
                    }]
                }
            },
            "sort": {
                "population": "desc"
            }
        }

        res = self.eserver.search(q2,
                                  index=self._index,
                                  doc_type=self._doctype,
                                  **kwargs)['hits']['hits'][0]['_source']
        res['confidence'] = 1.0
        return [GeoPoint(**res)]

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)
            settings['mappings'][self._doctype] = settings['mappings'].pop(
                'places')

        try:
            self.eserver.create_index(index=self._index, settings=settings)
        except:
            self.eserver.delete_index(self._index)
            self.eserver.create_index(index=self._index, settings=settings)

        for chunk in bulk_chunks(self._opLoader(datacsv, confDir),
                                 docs_per_chunk=1000):
            self.eserver.bulk(chunk, index=self._index, doc_type=self._doctype)
            print "..",

        self.eserver.refresh(self._index)

    def _opLoader(self, datacsv, confDir):
        with DataReader(datacsv, os.path.join(confDir,
                                              'geonames.conf')) as reader:
            cnt = 0
            for row in reader:
                try:
                    row['coordinates'] = [
                        float(row['longitude']),
                        float(row['latitude'])
                    ]
                    try:
                        row['population'] = int(row["population"])
                    except:
                        row['population'] = -1

                    try:
                        row['elevation'] = int(row['elevation'])
                    except:
                        row['elevation'] = -1

                    del (row['latitude'])
                    del (row['longitude'])
                    #print row['name']
                    row['alternatenames'] = row['alternatenames'].split(",")
                    cnt += 1
                    yield self.eserver.index_op(row,
                                                index=self._index,
                                                doc_type=self._doctype)
                except:
                    print json.dumps(row)
                    continue
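
A hypothetical query sketch for ESWrapper, assuming a geonames-style index already built with create(); the index and doc-type names are illustrative:

gazetteer = ESWrapper(index_name='geonames', doc_type='places')
points = gazetteer.query('Boston', qtype='relaxed', min_popln=5000)
for pt in points:
    print pt.name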
Example #22
from pyelasticsearch import ElasticSearch
import simplejson,sys

s=ElasticSearch("http://localhost:9200")

if "init" in sys.argv :
	try :
		s.delete_index("flights");
	except Exception, e:
		print e
	try :
		s.create_index("flights")
	except Exception, e:
		print e
	else :	
		print "Created flights"


	s.put_mapping("flights","flight",simplejson.loads('{"flight":{"properties":{"datum":{"type":"string","index":"not_analyzed","omit_norms":true,"index_options":"docs"},"type": { "type": "string", "index" : "not_analyzed" }, "duration":{"type":"double"},"end":{"properties":{"alt":{"type":"integer"},"dist":{"type":"float"},"speed":{"type":"integer"},"time":{"type":"date","format":"dateOptionalTime"},"town":{"type":"string","analyzer":"keyword"},"country":{"type":"string","analyzer":"keyword"}}},"flight":{"type":"string","store":true,"analyzer":"keyword"},"hex":{"type":"string","store":true,"analyzer":"keyword"},"id":{"type":"string","store":true},"radar":{"type":"string","store":true,"analyzer":"keyword"},"reg":{"type":"string","store":true,"analyzer":"keyword"},"route":{"properties":{"coordinates":{"type":"double"},"type":{"type":"string"}}},"start":{"properties":{"alt":{"type":"integer"},"dist":{"type":"float"},"speed":{"type":"integer"},"time":{"type":"date","format":"dateOptionalTime"},"town":{"type":"string","analyzer":"keyword"},"country":{"type":"string","analyzer":"keyword"}}}}}}'))



def md(a) :
    a["datum"]=a["starttime"][:10]
    return a
    
    
def makets(a) :
    for f in ("starttime","endtime") :
        a[f]=maket(a[f])
    return a
Example #24
def delete_all():
    es = ElasticSearch(CONTEXT["datahub-store"])
    es.delete_index(CONTEXT["datahub-index"])
Example #25
from pyelasticsearch import ElasticSearch
from pyelasticsearch.exceptions import ElasticHttpNotFoundError, IndexAlreadyExistsError
import requests
import time

IGNORED_GENRES = ("9", "15", "19"
                  )  # We only care about stations that play music.

import settings

es = ElasticSearch(settings.ES_URL)
INDEX_NAME = settings.ES_INDEX

try:
    es.delete_index(INDEX_NAME)
except ElasticHttpNotFoundError:
    pass

try:
    es.create_index(INDEX_NAME)
except IndexAlreadyExistsError:
    pass

headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36'
}

failures = 0
pk = 0
while failures < 200:
Example #26
def documents_from_mails(mails):
    """Build document from mail"""
    for mail in mails:
        if 'Date' in mail.headers:  # Some mails seem broken.
            yield {
                '@source': 'stuff://',
                '@type': 'mailadmin',
                '@tags': [mail.headers['From']],
                '@fields': mail.headers,
                '@timestamp': parse_date(mail.headers['Date']),
                '@source_host': 'localhost',
                '@source_path': 'mail/admin ',
                '@message': mail.body,
                'id': mail.headers['Message-Id']
            }

if __name__ == '__main__':
    # Instantiate it with an url
    es = ElasticSearch(sys.argv[1])
    # Kibana need this kind of name
    NAME = 'logstash-2013.06.13'
    try:
        es.delete_index(NAME)
    except ElasticHttpNotFoundError:
        pass  # Nobody cares
    emails = mbox(sys.argv[2])
    for n, docs in enumerate(bulk_iterate(documents_from_mails(emails), 100)):
        es.bulk_index(NAME, 'mailadmin', docs)
        print(n)
    print es.refresh(NAME)
Example #27
                    'analyzer': 'mmseg',
                    'boost': 0.7,
                    'term_vector': 'with_positions_offsets'
                },
                'categories': {
                    'type': 'nested',
                    'properties': {
                        'url': {
                            'type': 'string',
                            'index': 'not_analyzed'
                        },
                        'name': {
                            'type': 'string',
                            'index': 'not_analyzed'
                        },
                    }
                }
            }
        }
    }
}

es = ElasticSearch(HOST)
try:
    es.delete_index(INDEX)
except ElasticHttpNotFoundError:
    # No index found
    pass

es.create_index(INDEX, settings=index_settings)
Example #28
    def getPets(self, breed, page):
        query = breed['ename']
        if '宠物' not in query:  # '宠物' means "pet"
            query = '宠物' + query  # prefix the query so the search targets pet data
        url = self.pet_site_url % (query, page) 
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        result = response.read().decode('gbk')
        if result is not None:
            data = json.loads(result)['data'][0]['disp_data']
            if data:
                return data

es = ElasticSearch('http://localhost:9200/')
es.delete_index('pet')
spider = Spider()
breeds = spider.getPetBreeds()
p = Pinyin()
for breed in breeds:
    flg = 1
    page = 1
    pet_list = []
    while(flg):
        pets = spider.getPets(breed, (page - 1) * spider.limit)
        if not pets:
            flg = 0
        else:
            page = page + 1
            for pet in pets:
                pet_obj = {}
Example #29
class ElasticSearchProvider(SearchProvider):
    def __init__(self, config, db=None, authnz_wrapper=None, io_loop=None):
        self.debug = False
        self.config = config
        if db is not None:
            self.db = db
        self.syncES = ElasticSearch(
            '%(ELASTIC_SEARCH_PROTOCOL)s://%(ELASTIC_SEARCH_HOST)s:%(ELASTIC_SEARCH_PORT)s' % config
        )
        self.asyncES = ESConnection(
            host=config.get('ELASTIC_SEARCH_HOST'),
            port=config.get('ELASTIC_SEARCH_PORT'),
            io_loop=io_loop,
            protocol=config.get('ELASTIC_SEARCH_PROTOCOL'),
        )
        self.index = config.get('ELASTIC_SEARCH_INDEX')
        self.max_retries = config.get('ELASTIC_SEARCH_MAX_RETRIES')

    def activate_debug(self):
        self.debug = True

    def connect_to_db(self):
        from sqlalchemy import create_engine
        from sqlalchemy.orm import scoped_session, sessionmaker
        conn_string = self.config.get('SQLALCHEMY_CONNECTION_STRING')
        engine = create_engine(
            conn_string,
            convert_unicode=True,
            pool_size=1,
            max_overflow=0,
            echo=self.debug
        )
        maker = sessionmaker(bind=engine, autoflush=True)
        self.db = scoped_session(maker)

    def _assemble_inner_query(self, domain=None, page_filter=None):
        if page_filter and domain:
            page_prefix = '%s/%s' % (domain.url, page_filter)
        else:
            page_prefix = None

        if page_prefix:
            return {
                'prefix': {
                    'page_url': page_prefix
                }
            }
        else:
            return {
                'match_all': {}
            }

    def _assemble_outer_query(self, inner_query, filter_terms):
        return {
            'filtered': {
                'query': inner_query,
                'filter': {
                    'and': [{
                        'term': filter_term
                    } for filter_term in filter_terms]
                }
            }
        }

    def _assemble_filter_terms(self, key_id=None, domain=None):
        filter_terms = []

        if key_id:
            filter_terms.append({'keys.id': key_id})

        if domain:
            filter_terms.append({'domain_id': domain.id})

        return filter_terms

    def gen_doc(self, review):
        return {
            'keys': [{'id': violation.key_id} for violation in review.violations],
            'uuid': str(review.uuid),
            'completed_date': review.completed_date,
            'violation_count': review.violation_count,
            'page_id': review.page_id,
            'page_uuid': str(review.page.uuid),
            'page_url': review.page.url,
            'page_last_review_date': review.page.last_review_date,
            'domain_id': review.domain_id,
            'domain_name': review.domain.name,
        }

    def index_review(self, review):
        for attempt in range(self.max_retries):
            try:
                self.syncES.send_request(
                    method='POST',
                    path_components=[self.index, 'review', review.page_id],
                    body=dumps(self.gen_doc(review)),
                    encode_body=False
                )
                break
            except (Timeout, ConnectionError, ElasticHttpError, InvalidJsonResponseError) as e:
                values = review.id, review.page_id, str(e)
                logging.error('Could not index review (review_id:{0}, page_id:{1}): {2}'.format(*values))
                time.sleep(1)
                if attempt >= self.max_retries - 1:
                    raise
            else:
                raise

    def index_reviews(self, reviewed_pages, reviews_count, batch_size):
        action = {'index': {'_type': 'review'}}

        for i in range(0, reviews_count, batch_size):
            body_bits = []

            for page in reviewed_pages[i:i + batch_size]:
                doc = self.gen_doc(page.last_review)

                action['index']['_id'] = doc['page_id']

                body_bits.append(dumps(action))
                body_bits.append(dumps(doc))

            # Yes, that trailing newline IS necessary
            body = '\n'.join(body_bits) + '\n'

            self.syncES.send_request(
                method='POST',
                path_components=[self.index, '_bulk'],
                body=body,
                encode_body=False
            )

        logging.info('Done!')

    @return_future
    def get_by_violation_key_name(self, key_id, current_page=1, page_size=10, domain=None, page_filter=None, callback=None):
        def treat_response(response):
            if response.error is None:
                try:
                    hits = loads(response.body).get('hits', {'hits': []})

                    reviews_data = []
                    for hit in hits['hits']:
                        completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date'])
                        reviews_data.append({
                            'uuid': hit['_source']['uuid'],
                            'page': {
                                'uuid': hit['_source']['page_uuid'],
                                'url': hit['_source']['page_url'],
                                'completedAt': completedAt
                            },
                            'domain': hit['_source']['domain_name']
                        })

                    reviews_count = hits.get('total', 0)

                    callback({
                        'reviews': reviews_data,
                        'reviewsCount': reviews_count
                    })
                except Exception as e:
                    reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message)
                    logging.error(reason)
                    callback({'error': {'status_code': 500, 'reason': reason}})
            else:
                reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body)
                logging.error(reason)
                callback({'error': {'status_code': 500, 'reason': reason}})

        inner_query = self._assemble_inner_query(domain, page_filter)
        filter_terms = self._assemble_filter_terms(key_id, domain)

        query = self._assemble_outer_query(inner_query, filter_terms)

        sort_ = [{
            'completed_date': {
                'order': 'desc'
            }
        }, {
            'violation_count': {
                'order': 'desc'
            }
        }]

        source = {'query': query, 'sort': sort_}

        self.asyncES.search(
            callback=treat_response,
            index=self.index,
            type='review',
            source=source,
            page=current_page,
            size=page_size,
        )

    @return_future
    def get_domain_active_reviews(self, domain, current_page=1, page_size=10, page_filter=None, callback=None):
        def treat_response(response):
            if response.error is None:
                try:
                    hits = loads(response.body).get('hits', {'hits': []})

                    pages = []
                    for hit in hits['hits']:
                        completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date'])
                        pages.append({
                            'url': hit['_source']['page_url'],
                            'uuid': hit['_source']['page_uuid'],
                            'violationCount': len(hit['_source']['keys']),
                            'completedAt': completedAt,
                            'reviewId': hit['_source']['uuid']
                        })

                    reviews_count = hits.get('total', 0)

                    callback({
                        'reviewsCount': reviews_count,
                        'pages': pages
                    })
                except Exception as e:
                    reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message)
                    logging.error(reason)
                    callback({'error': {'status_code': 500, 'reason': reason}})
            else:
                reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body)
                logging.error(reason)
                callback({'error': {'status_code': 500, 'reason': reason}})

        inner_query = self._assemble_inner_query(domain=domain, page_filter=page_filter)
        filter_terms = self._assemble_filter_terms(domain=domain)

        query = self._assemble_outer_query(inner_query, filter_terms)

        sort_ = [{
            'violation_count': {
                'order': 'desc'
            }
        }, {
            'completed_date': {
                'order': 'desc'
            }
        }]

        source = {'query': query, 'sort': sort_}

        self.asyncES.search(
            callback=treat_response,
            index=self.index,
            type='review',
            source=source,
            page=current_page,
            size=page_size,
        )

    def refresh(self):
        try:
            self.syncES.refresh(index=self.index)
        except Exception as e:
            logging.error('Could not refresh index (%s)' % e)

    def get_index_settings(cls):
        return {
            'index': {
                'number_of_shards': 4
            }
        }

    def get_index_mapping(cls):
        return {
            'review': {
                'properties': {
                    'keys': {
                        'properties': {
                            'id': {
                                'type': 'integer'
                            }
                        }
                    },
                    'uuid': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'completed_date': {
                        'type': 'integer'
                    },
                    'violation_count': {
                        'type': 'float'
                    },
                    'page_id': {
                        'type': 'integer'
                    },
                    'page_uuid': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'page_url': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'page_last_review_date': {
                        'type': 'integer'
                    },
                    'domain_id': {
                        'type': 'integer'
                    },
                    'domain_name': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    }
                }
            }
        }

    def setup_index(self):
        try:
            settings = self.get_index_settings()
            self.syncES.create_index(index=self.index, settings=settings)
            mapping = self.get_index_mapping()
            self.syncES.put_mapping(index=self.index, doc_type='review', mapping=mapping)
            logging.info('Index %s created.' % self.index)
        except Exception as e:
            raise e

    def delete_index(self):
        try:
            self.syncES.delete_index(index=self.index)
            logging.info('Index %s deleted.' % self.index)
        except Exception as e:
            raise e

    def _get_max_page_id_from_index(self, must_have_domain_name=False):
        if must_have_domain_name:
            inner_query = {
                'constant_score': {
                    'filter': {
                        'not': {
                            'missing': {
                                'field': 'domain_name'
                            }
                        }
                    }
                }
            }
        else:
            inner_query = {
                'match_all': {}
            }

        query = {
            'query': inner_query,
            'sort': [{
                'page_id': {
                    'order': 'desc'
                }
            }]
        }

        results = self.syncES.search(query, index=self.index, doc_type='review')
        if results['hits']['total'] > 0:
            return results['hits']['hits'][0]['_id'] or 0
        return 0

    def index_all_reviews(self, keys=None, batch_size=200, replace=False):
        logging.info('Querying database...')
        self.connect_to_db()

        if keys is not None:
            keys = [k.id for k in self.db.query(Key.id).filter(Key.name.in_(keys)).all()]

        try:
            max_page_id = self._get_max_page_id_from_index(must_have_domain_name=True)
        except Exception:
            logging.error('Could not retrieve max page_id! Use with --replace (with caution)')
            return

        def apply_filters(query):
            if keys is not None:
                query = query \
                    .filter(Violation.review_id == Page.last_review_id) \
                    .filter(Violation.key_id.in_(keys))

            if not replace:
                query = query.filter(Page.id > max_page_id)

            return query.filter(Page.last_review_id != None)

        reviews_count = apply_filters(self.db.query(func.count(Page))).scalar()

        query = self.db.query(Page).options(joinedload('last_review'))
        reviewed_pages = apply_filters(query).order_by(Page.id.asc())

        logging.info('Indexing %d reviews...' % reviews_count)

        self.index_reviews(reviewed_pages, reviews_count, batch_size)

    @classmethod
    def new_instance(cls, config):
        return ElasticSearchProvider(config)

    @classmethod
    def main(cls):
        import sys

        parser = cls.argparser()
        args = parser.parse_args()

        config = {}
        host = None
        port = None
        index = None
        es = None

        levels = ['ERROR', 'WARNING', 'INFO', 'DEBUG']
        log_level = levels[args.verbose]
        logging.basicConfig(level=getattr(logging, log_level), format='%(levelname)s - %(message)s')

        if not (args.create or args.recreate or args.delete or args.keys or args.all_keys):
            parser.print_help()
            sys.exit(1)

        if args.conf:
            from derpconf.config import ConfigurationError
            from holmes.config import Config
            try:
                config = Config().load(args.conf[0])
                host = config['ELASTIC_SEARCH_HOST']
                port = config['ELASTIC_SEARCH_PORT']
                index = config['ELASTIC_SEARCH_INDEX']
            except ConfigurationError:
                logging.error('Could not load config! Use --conf conf_file')
                sys.exit(1)
            except KeyError:
                logging.error('Could not parse config! Check its contents')
                sys.exit(1)

        if args.server:
            try:
                host, port = args.server[0].split(':')
                config['ELASTIC_SEARCH_HOST'] = host
                config['ELASTIC_SEARCH_PORT'] = port
            except Exception:
                logging.error('Could not parse server host and port! Use --server host:port')
                sys.exit(1)

        if args.index:
            index = args.index[0]
            config['ELASTIC_SEARCH_INDEX'] = index

        from pyelasticsearch.exceptions import IndexAlreadyExistsError, ElasticHttpNotFoundError, InvalidJsonResponseError
        from requests.exceptions import ConnectionError
        try:

            if args.create or args.recreate or args.delete:
                if host is None or port is None:
                    logging.error('Need either a host and port or a config file to perform such an operation!')
                    sys.exit(1)
                if index is None:
                    logging.error('Need either an index name or a config file to perform such an operation!')
                    sys.exit(1)
                else:
                    es = cls.new_instance(config)
                    if args.recreate or args.delete:
                        try:
                            es.delete_index()
                        except ElasticHttpNotFoundError:
                            pass
                        except InvalidJsonResponseError as e:
                            logging.error('Invalid response! Reason: %s' % e)
                            sys.exit(1)
                    if args.create or args.recreate:
                        es.setup_index()

            if args.keys or args.all_keys:
                if config is None:
                    logging.error('Need a config file to perform such an operation! Use --conf conf_file')
                else:
                    batch_size = args.batch_size[0] if args.batch_size else 200
                    es = cls.new_instance(config) if not es else es
                    try:
                        if args.verbose > 2:
                            es.activate_debug()
                        if args.keys:
                            es.index_all_reviews(args.keys, replace=args.replace, batch_size=batch_size)
                        elif args.all_keys:
                            es.index_all_reviews(replace=args.replace, batch_size=batch_size)
                    except InvalidJsonResponseError as e:
                        logging.error('Invalid response! Reason: %s' % e)
                        sys.exit(1)

        except IndexAlreadyExistsError:
            logging.error('Index %s already exists! Use --recreate (with caution) to recreate' % index)
        except ConnectionError:
            logging.error('Could not connect to server at %s:%s' % (host, port))
        except KeyError:
            logging.error('Could not get host or port! Use either --conf or --server')
            sys.exit(1)
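
For reference, a minimal sketch of driving the provider above programmatically rather than through its command line; the config keys mirror the ones main() reads, and the host and index values here are placeholders, not taken from the original project.

# Hypothetical programmatic use of ElasticSearchProvider (values are placeholders)
config = {
    'ELASTIC_SEARCH_HOST': 'localhost',
    'ELASTIC_SEARCH_PORT': 9200,
    'ELASTIC_SEARCH_INDEX': 'holmes-reviews',
}
provider = ElasticSearchProvider.new_instance(config)
provider.setup_index()                                     # create the index and its 'review' mapping
provider.index_all_reviews(replace=True, batch_size=200)   # reindex all reviewed pages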
Example #31
0
import sys, os
from pyelasticsearch import ElasticSearch, ElasticHttpNotFoundError
from flask import *
sys.path.insert(0, '..')
from app import app
from flask.ext.sqlalchemy import SQLAlchemy
from apps.glyph.models import *

app.testing = True
client = app.test_client()
ctx = app.test_request_context()
ctx.push()

es = ElasticSearch('http://localhost:9200/')
try:
    es.delete_index('cdpp')
except ElasticHttpNotFoundError:
    # we can safely ignore this, because it might be an initial run
    pass
res = db.session.query(Sign).all()
for r in res:
    d = r.__dict__
    d.pop('_sa_instance_state', None)
# bulk-index the cleaned signs
es.bulk_index('cdpp', 'sign', [r.__dict__ for r in res], id_field='id')



tablets = db.session.query(Tablet).all()
repr = []
for result in tablets:
Example #32
0
from pyelasticsearch import ElasticSearch
from pyelasticsearch.exceptions import ElasticHttpNotFoundError, IndexAlreadyExistsError
import requests
import time

IGNORED_GENRES = ("9", "15", "19")  # We only care about stations that play music.

import settings

es = ElasticSearch(settings.ES_URL)
INDEX_NAME = settings.ES_INDEX

try:
	es.delete_index(INDEX_NAME)
except ElasticHttpNotFoundError:
	pass

try:
	es.create_index(INDEX_NAME)
except IndexAlreadyExistsError:
	pass

headers = {
	'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36'
}

failures = 0
pk = 0
while failures < 200:
	pk += 1
	r = requests.get("http://www.iheart.com/a/live/station/%d/" % pk, headers=headers)

Example #33
0
from time import time
import pandas as pd
from pyelasticsearch import ElasticSearch

t0=time()
csv_filename='robinhood-daily-rets.csv'
# size of the bulk
chunksize=5000

# parse csv with pandas
csvfile=pd.read_csv(csv_filename)

# init ElasticSearch
es = ElasticSearch('http://104.236.201.91:9200/')

# init index
try :
    es.delete_index("robinhood")
except :
    pass

es.create_index("robinhood")

# start bulk indexing
print("now indexing %s..."%(csv_filename))

records=csvfile.where(pd.notnull(csvfile), None).T.to_dict()
list_records=[records[it] for it in records]
try :
    es.bulk_index("robinhood","myPortfolio",list_records)
except :
    print("error!, skipping a date")
    pass
Example #34
0
class ElasticSearch(object):
    conn = None
    url = settings.ELASTICSEARCH_URL
    index_name = settings.ELASTICSEARCH_INDEX_NAME
    stdout = None
    stderr = None

    def __init__(self, index_name=None, stdout=None, stderr=None):
        self.conn = PyElasticSearch()
        if index_name:
            self.index_name = index_name
        if stdout:
            self.stdout = stdout
        if stderr:
            self.stderr = stderr

    def create_index(self, delete=True):
        if delete:
            try:
                self.conn.delete_index(self.index_name)
            except ElasticHttpNotFoundError as e:
                pass
        mappings = dict(
            (k, v) for k, v in get_elasticsearch_properties().items())
        self.conn.create_index(self.index_name,
                               settings={'mappings': mappings})

    def index_activity_by_id(self, activity_id):
        activity = HistoricalActivity.objects.get(pk=activity_id)
        return self.index_activity(activity)

    def delete_activity_by_id(self, activity_id):
        activity = HistoricalActivity.objects.get(pk=activity_id)
        return self.delete_activity(activity)

    def index_activity(self, activity):
        for doc_type in DOC_TYPES_ACTIVITY:
            docs = self.get_activity_documents(activity, doc_type=doc_type)
            if len(docs) > 0:
                try:
                    self.conn.bulk((self.conn.index_op(
                        doc, id=doc.pop('id'), parent=doc.pop('_parent', None))
                                    for doc in docs),
                                   index=self.index_name,
                                   doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    def index_investor(self, investor):
        for doc_type in DOC_TYPES_INVESTOR:
            docs = self.get_investor_documents(investor, doc_type=doc_type)
            if len(docs) > 0:
                try:
                    self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id'))
                                    for doc in docs),
                                   index=self.index_name,
                                   doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    def index_activity_documents(self, activity_identifiers=[]):
        activity_identifiers = activity_identifiers or HistoricalActivity.objects.filter(
            fk_status__in=(
                HistoricalActivity.STATUS_ACTIVE,
                HistoricalActivity.STATUS_PENDING,
                HistoricalActivity.STATUS_OVERWRITTEN,
                HistoricalActivity.STATUS_DELETED)).distinct().values_list(
                    'activity_identifier', flat=True).distinct()

        for doc_type in DOC_TYPES_ACTIVITY:
            docs = []
            # Collect documents
            self.stdout and self.stdout.write(
                'Collect %ss for %i deals...' %
                (doc_type, len(activity_identifiers)))
            for activity_identifier in activity_identifiers:
                for activity in self.get_activity_versions(
                        activity_identifier):
                    docs.extend(
                        self.get_activity_documents(activity,
                                                    doc_type=doc_type))
            # Bulk index documents
            self.stdout and self.stdout.write('Index %i %ss...' %
                                              (len(docs), doc_type))
            if len(docs) > 0:
                paginator = Paginator(docs, 1000)
                for page in paginator.page_range:
                    try:
                        self.conn.bulk(
                            (self.conn.index_op(doc,
                                                id=doc.pop('id'),
                                                parent=doc.pop(
                                                    '_parent', None))
                             for doc in paginator.page(page)),
                            index=self.index_name,
                            doc_type=doc_type)
                    except BulkError as e:
                        for error in e.errors:
                            msg = '%s: %s on ID %s' % (
                                error['index']['error']['type'],
                                error['index']['error']['reason'],
                                error['index']['_id'])
                            if 'caused_by' in error['index']['error']:
                                msg += ' (%s: %s)' % (error['index']['error']
                                                      ['caused_by']['type'],
                                                      error['index']['error']
                                                      ['caused_by']['reason'])
                            self.stderr and self.stderr.write(msg)
                    self.conn.refresh()

    def index_investor_documents(self):
        investors = Investor.objects.public().order_by(
            'investor_identifier', '-id').distinct('investor_identifier')

        for doc_type in DOC_TYPES_INVESTOR:
            docs = []
            # Collect documents
            self.stdout and self.stdout.write(
                'Collect %ss for %i investors...' %
                (doc_type, investors.count()))
            for investor in investors:
                docs.extend(
                    self.get_investor_documents(investor, doc_type=doc_type))
            # Bulk index documents
            self.stdout and self.stdout.write('Index %i %ss...' %
                                              (len(docs), doc_type))
            if len(docs) > 0:
                try:
                    self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id'))
                                    for doc in docs),
                                   index=self.index_name,
                                   doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    #def index_activity_by_version(self, activity_identifier):
    #    for doc_type in get_elasticsearch_properties().keys():
    #        docs = self.get_documents_for_activity_version(activity_identifier, doc_type=doc_type)
    #        if len(docs) > 0:
    #            try:
    #                self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id')) for doc in docs),
    #                    index=self.index_name,
    #                    doc_type=doc_type)
    #            except BulkError as e:
    #                for error in e.errors:
    #                    stderr and stderr.write('%s: %s (caused by %s: %s, ID: %s)' % (
    #                            error['index']['error']['type'],
    #                            error['index']['error']['reason'],
    #                            error['index']['error']['caused_by']['type'],
    #                            error['index']['error']['caused_by']['reason'],
    #                            error['index']['_id']
    #                          ))

    def get_activity_versions(self, activity_identifier):
        versions = []
        # get the newest non-pending, readable historic version:
        try:
            newest = HistoricalActivity.objects.filter(
                activity_identifier=activity_identifier,
                fk_status__in=(
                    HistoricalActivity.STATUS_ACTIVE,
                    HistoricalActivity.STATUS_OVERWRITTEN,
                    HistoricalActivity.STATUS_DELETED)).distinct().latest()
            if newest and not newest.fk_status_id == HistoricalActivity.STATUS_DELETED:
                versions.append(newest)
        except HistoricalActivity.DoesNotExist:
            newest = None

        # get newer pendings
        pendings = HistoricalActivity.objects.filter(
            activity_identifier=activity_identifier,
            fk_status_id=HistoricalActivity.STATUS_PENDING).distinct()
        if newest:
            pendings = pendings.filter(history_date__gt=newest.history_date)
        versions.extend(pendings)

        return versions

    def get_activity_documents(self, activity, doc_type='deal'):
        docs = []
        deal_attrs = {
            'id': activity.id,
            'activity_identifier': activity.activity_identifier,
            'historical_activity_id': activity.id,
            'status': activity.fk_status_id,
        }

        # Todo: Is there a nice way to prevent this extra Activity query?
        # e.g. if we save is_public/deal_scope as ActivityAttributes
        public_activity = Activity.objects.filter(
            activity_identifier=activity.activity_identifier).order_by(
                '-id').first()
        if public_activity:
            deal_attrs.update({
                'is_public':
                public_activity.is_public,
                'deal_scope':
                public_activity.deal_scope,
                'deal_size':
                public_activity.deal_size,
                'current_negotiation_status':
                public_activity.negotiation_status,
                'top_investors':
                public_activity.top_investors,
                'fully_updated_date':
                public_activity.fully_updated_date,
            })
        else:
            # Fixme: This should not happen
            self.stderr and self.stderr.write(
                _('Missing activity for historical activity %i (Activity identifier: #%i)'
                  % (activity.id, activity.activity_identifier)))
        #except Activity.MultipleObjectsReturned:
        #    # Fixme: This should not happen
        #    self.stderr and self.stderr.write(_('Too much activities for historical activity %i (Activity identifier: #%i)' % (
        #        activity.id,
        #        activity.activity_identifier
        #    )))

        for a in activity.attributes.select_related('fk_group__name').order_by(
                'fk_group__name'):
            # do not include the django object id
            if a.name == 'id':
                continue
            attribute = None
            attribute_key = '%s_attr' % a.name
            if attribute_key in get_elasticsearch_properties(
            )['deal']['properties'].keys():
                attribute = {
                    'value': a.value,
                    'value2': a.value2,
                    'date': a.date,
                    'is_current': a.is_current,
                }
            value = a.value

            # Area field?
            if a.name and 'area' in a.name and a.polygon is not None:
                # Get polygon
                #value = json.loads(a.polygon.json)
                # Apparently this is case sensitive: MultiPolygon as provided by the GeoJSON does not work
                #value['type'] = 'multipolygon'
                value = a.polygon.json or ''
            # do not include empty values
            if value is None or value == '':
                continue

            # Doc types: location, data_source or contract
            group_match = a.fk_group and a.fk_group.name or ''
            group_match = re.match(
                '(?P<doc_type>location|data_source|contract)_(?P<count>\d+)',
                group_match)
            if group_match:
                dt, count = group_match.groupdict()['doc_type'], int(
                    group_match.groupdict()['count'])
                if doc_type == dt:
                    while len(docs) < count:
                        docs.append({
                            '_parent': activity.activity_identifier,
                            'id': a.id,  #'%i_%i' % (a.id, count),
                        })
                    docs[count - 1][a.name] = [
                        value,
                    ]
                # Set doc type counter within deal doc type (for location/data_source/contract)
                elif doc_type == 'deal':
                    # Set counter
                    key = '%s_count' % dt
                    if key not in deal_attrs.keys():
                        deal_attrs[key] = count
                    elif deal_attrs[key] < count:
                        deal_attrs[key] = count

                    # Create list with correct length to ensure formset values have the same index
                    if not a.name in deal_attrs:
                        deal_attrs[a.name] = [''] * count
                        if attribute:
                            deal_attrs[attribute_key] = [''] * count
                    else:
                        while len(deal_attrs[a.name]) < count:
                            deal_attrs[a.name].append('')
                            if attribute:
                                deal_attrs[attribute_key].append('')
                    deal_attrs[a.name][count - 1] = value
                    if attribute:
                        deal_attrs['%s_attr' % a.name][count - 1] = attribute

            # Doc type: deal and not formset
            elif doc_type == 'deal':
                if a.name in deal_attrs:
                    deal_attrs[a.name].append(value)
                    if '%s_attr' % a.name in get_elasticsearch_properties(
                    )['deal']['properties'].keys():
                        deal_attrs['%s_attr' % a.name].append(attribute)
                else:
                    deal_attrs[a.name] = [
                        value,
                    ]
                    if '%s_attr' % a.name in get_elasticsearch_properties(
                    )['deal']['properties'].keys():
                        deal_attrs['%s_attr' % a.name] = [
                            attribute,
                        ]

        if doc_type == 'deal':
            # Additionally save operational company attributes
            oc = Investor.objects.filter(
                investoractivityinvolvement__fk_activity__activity_identifier=
                activity.activity_identifier)
            if oc.count() > 0:
                oc = oc.first()
                for field in Investor._meta.fields:
                    if isinstance(field, ForeignKey):
                        deal_attrs['operational_company_%s' %
                                   field.name] = getattr(
                                       oc, '%s_id' % field.name)
                    else:
                        deal_attrs['operational_company_%s' %
                                   field.name] = getattr(oc, field.name)
            else:
                pass
                #self.stderr and self.stderr.write("Missing operational company for deal #%i" % activity.activity_identifier)

        # Create single document for each location
        # FIXME: Saving single deals for each location might be deprecated since we have doc_type location now?
        spatial_names = list(get_spatial_properties())
        for i in range(deal_attrs.get('location_count', 0)):
            doc = deal_attrs.copy()
            for name in spatial_names:
                if not name in doc:
                    continue
                if len(deal_attrs[name]) > i:
                    doc[name] = deal_attrs[name][i]
                else:
                    doc[name] = ''
            # Set unique ID for location (deals can have multiple locations)
            doc['id'] = '%s_%i' % (doc['id'], i)
            point_lat = doc.get('point_lat', None)
            point_lon = doc.get('point_lon', None)
            if point_lat and point_lon:
                # Parse values
                try:
                    parsed_lat, parsed_lon = float(point_lat), float(point_lon)
                    doc['geo_point'] = '%s,%s' % (point_lat, point_lon)
                except ValueError:
                    doc['geo_point'] = '0,0'
            else:
                doc['point_lat'] = '0'
                doc['point_lon'] = '0'
                doc['geo_point'] = '0,0'
            # FIXME: we dont really need 'point_lat' and 'point_lon' here,
            # so we should pop them from doc when adding 'geo_point'
            docs.append(doc)

        # Update docs with export values
        for doc in docs:
            doc.update(self.get_export_properties(doc, doc_type=doc_type))

        return docs

    def get_export_properties(self, doc, doc_type='deal'):
        if doc_type == 'investor':
            return ExportInvestorForm.export(doc)
        elif doc_type == 'involvement':
            return InvestorVentureInvolvementForm.export(doc)
        else:
            properties = {
                'deal_scope_export':
                doc.get('deal_scope', ''),
                'is_public_export':
                doc.get('is_public', False) and str(_('Yes')) or str(_('No')),
                'deal_size_export':
                doc.get('deal_size', ''),
                'current_negotiation_status_export':
                doc.get('current_negotiation_status', ''),
                'top_investors_export':
                doc.get('top_investors', ''),
                'fully_updated_date_export':
                doc.get('fully_updated_date', ''),
            }
            # Doc types: deal, location, contract and data_source
            for form in ChangeDealView.FORMS:
                formset_name = hasattr(form, "form") and form.Meta.name or None
                form = formset_name and form.form or form
                properties.update(form.export(doc, formset=formset_name))
            properties.update(
                ExportInvestorForm.export(doc, prefix='operational_company_'))
            return properties

    def get_investor_documents(self, investor, doc_type='investor'):
        docs = []
        # Doc types: involvement and investor
        if doc_type == 'involvement':
            ivis = InvestorVentureInvolvement.objects.filter(
                Q(fk_venture=investor) | Q(fk_investor=investor))
            for ivi in ivis:
                doc = {}
                for field in ivi._meta.local_fields:
                    if isinstance(field, ForeignKey):
                        doc[field.name] = getattr(ivi, '%s_id' % field.name)
                    else:
                        doc[field.name] = getattr(ivi, field.name)
                docs.append(doc)
        elif doc_type == 'investor':
            doc = {}
            for field in investor._meta.local_fields:
                if isinstance(field, ForeignKey):
                    doc[field.name] = getattr(investor, '%s_id' % field.name)
                else:
                    doc[field.name] = getattr(investor, field.name)
            docs.append(doc)

        # Update docs with export values
        for doc in docs:
            doc.update(self.get_export_properties(doc, doc_type=doc_type))

        return docs

    def refresh_index(self):
        self.conn.refresh(self.index_name)

    def search(self, elasticsearch_query, doc_type='deal', sort=[]):
        """ Executes paginated queries until all results have been retrieved. 
            @return: The full list of hits. """
        start = 0
        size = 10000  # 10000 is Elasticsearch's default index.max_result_window (pagination is cheap, so more is not necessarily better)
        raw_result_list = []

        done = False
        while not done:
            query = {
                'query': elasticsearch_query,
                'from': start,
                'size': size,
            }
            if sort:
                query['sort'] = sort
            query_result = self.conn.search(query,
                                            index=self.index_name,
                                            doc_type=doc_type)
            raw_result_list.extend(query_result['hits']['hits'])
            results_total = query_result['hits']['total']

            if len(raw_result_list) >= results_total:
                done = True
            else:
                start = len(raw_result_list)

        print('\nElasticsearch returned %i documents from a total of %i \n\n' %
              (len(raw_result_list), query_result['hits']['total']))
        return raw_result_list

    def delete_activity(self, activity):
        for doc_type in DOC_TYPES_ACTIVITY:
            try:
                if doc_type == 'deal':
                    self.conn.delete(id=activity.activity_identifier,
                                     index=self.index_name,
                                     doc_type=doc_type)
                else:
                    self.conn.delete_by_query(query={
                        "parent_id": {
                            "type": "deal",
                            "id": str(activity.activity_identifier),
                        }
                    },
                                              index=self.index_name,
                                              doc_type=doc_type)
            except ElasticHttpNotFoundError as e:
                pass

    def get_deals_by_activity_identifier(self,
                                         activity_identifier,
                                         doc_type='deal'):
        return self.search({
            "constant_score": {
                "filter": {
                    "term": {
                        "activity_identifier": activity_identifier
                    }
                }
            }
        })
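
The search() helper above pages through results manually with from/size until the reported total is reached. Here is a standalone sketch of the same loop; the index name, doc type and query are assumptions, not values from this project.

from pyelasticsearch import ElasticSearch

es = ElasticSearch('http://localhost:9200/')
hits, start, size = [], 0, 10000
while True:
    page = es.search({'query': {'match_all': {}}, 'from': start, 'size': size},
                     index='deals', doc_type='deal')
    hits.extend(page['hits']['hits'])
    # stop once every reported hit has been collected
    if len(hits) >= page['hits']['total']:
        break
    start = len(hits)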
Example #35
0
es = ElasticSearch(ELASTICSEARCH_HOST, timeout=120)

FILES = [
    "nofly/shapefile/us_national_parks",
    "nofly/shapefile/us_military",
    "nofly/shapefile/5_mile_airport",
]

try:
    es.delete_all(ELASTICSEARCH_DOC, ELASTICSEARCH_INDEX)
except ElasticHttpError:
    pass

try:
    es.delete_index(ELASTICSEARCH_INDEX)
except ElasticHttpError:
    pass

index_settings = {
    "number_of_shards": 3,
    "number_of_replicas": 1,
    "mappings": {
        ELASTICSEARCH_DOC: {
            "properties": {
                "location": {
                    "type": "geo_shape",
                    "tree": "quadtree",
                    "precision": "1m"
                }
            }
Example #36
0
class ESWrapper(BaseDB):
    def __init__(self,
                 index_name,
                 doc_type,
                 host='http://localhost',
                 port=9200):
        self.eserver = ElasticSearch(urls=host,
                                     port=port,
                                     timeout=60,
                                     max_retries=3)
        #self._base_query = {"query": {"bool": {"must": {"match": {}}}}}
        #self._base_query = {"query": {"bool": {}}}
        self._geo_filter = {"distance": "20km", "coordinates": {}}
        self._population_filter = {'population': {'gte': 5000}}
        self._index = index_name
        self._doctype = doc_type

    def getByid(self, geonameId):
        maincondition = {"match": {"id": geonameId}}
        q = {"query": {"bool": {"must": maincondition}}}
        return self.eserver.search(
            q, index=self._index,
            doc_type=self._doctype)['hits']['hits'][0]['_source']

    def _query(self, qkey, **kwargs):
        q = {"query": {"bool": {}}}
        query_name = "should"
        q["query"]["bool"]["minimum_number_should_match"] = 1
        kwargs.pop("qtype", "")

        placetokens = [
            l.strip() for l in tokenizer.split(qkey)
            if l and l not in STOP_WORDS and l[-1] != '.'
        ]
        if placetokens:
            reduced_placename = u" ".join(placetokens[0:])
            if len(placetokens[0]) < 3 and len(
                    placetokens) > 1 and 3.0 / len(placetokens) >= .5:
                reduced_placename = u" ".join(placetokens[1:])
        else:
            reduced_placename = qkey

        # print "qkey", qkey, "reduced", reduced_placename
        maincondition = [
            {
                "bool": {
                    "must": [{
                        "multi_match": {
                            "query":
                            qkey,
                            "fields":
                            ["name.raw^5", "asciiname^5", "alternatenames"],
                            "operator":
                            "and"
                        }
                    }, {
                        "terms": {
                            "featureClass": ["a", "p"]
                        }
                    }],
                }
            },
            {
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            },
            {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            },
            {
                "term": {
                    "normalized_asciiname": {
                        "value": qkey
                    }
                }
            },
            # {"term": {"alternatenames": {"value": qkey[1:]}}},
            {
                "term": {
                    "alternatenames": {
                        "value": qkey
                    }
                }
            },
            # {"multi_match": {"query": reduced_placename if 'fuzzy' in kwargs else unicode(unidecode(reduced_placename)),
            {
                "multi_match": {
                    "query":
                    reduced_placename if 'fuzzy' in kwargs else unicode(
                        unidecode(reduced_placename)),
                    'fuzziness':
                    kwargs.pop("fuzzy", 0),
                    "max_expansions":
                    kwargs.pop("max_expansion", 10),
                    "prefix_length":
                    kwargs.pop("prefix_length", 1),
                    'operator':
                    kwargs.pop("operator", "and"),
                    "fields": [
                        "name^3", "asciiname^3", "alternatenames",
                        "normalized_asciiname^3"
                    ]
                }
            }
        ]

        q["query"]["bool"][query_name] = maincondition

        if kwargs:
            filter_cond = []
            if 'min_popln' in kwargs:
                popln = kwargs.pop("min_popln")
                if popln is not None:
                    filter_cond.append(
                        {"range": {
                            "population": {
                                "gte": popln
                            }
                        }})

            for key, val in kwargs.viewitems():
                if not isinstance(val, basestring):
                    val = list([(v) for v in val])
                    filter_cond.append({"terms": {key: val}})
                else:
                    filter_cond.append({"term": {key: (val)}})

            q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}

        q['from'] = 0
        q['size'] = 50
        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def query(self, qkey, min_popln=None, **kwargs):
        #res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']['hits']
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        #max_score = sum([r['_score'] for r in res])
        max_score = res['max_score']  #sum([r['_score'] for r in res])
        #for t in res:
        # print(max_score)
        gps = []
        if max_score == 0.0:
            ## no results were obtained by elasticsearch instead it returned a random/very
            ## low scoring one
            res['hits'] = []

        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            #t['_source']['_score'] = t[1] / max_score
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                gps = [pt]
                break

            gps.append(pt)

        if len(gps) == 1:
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))

        return gps

    def _oldquery(self,
                  qkey,
                  qtype="exact",
                  analyzer=None,
                  min_popln=None,
                  size=10,
                  **kwargs):
        """
        qtype values are exact, relaxed or geo_distance
        Always limit results to 10
        """
        q = {"query": {"bool": {}}}
        query_name = kwargs.pop('query_name', 'must')
        query_name = "should"
        if query_name == "should":
            q["query"]["bool"]["minimum_number_should_match"] = 1

        maincondition = {}
        if qtype == "exact":
            maincondition = [{
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "alternatenames": {
                        "value": qkey
                    }
                }
            }]
            if analyzer:
                maincondition["match"]["name.raw"]["analyzer"] = analyzer

        elif qtype == "relaxed":
            maincondition["match"] = {"alternatenames": {"query": qkey}}
            if analyzer:
                maincondition["match"]["alternatenames"]["analyzer"] = analyzer

            #q["query"]["bool"][query_name]["match"].pop("name.raw", "")
        elif qtype == "combined":
            maincondition = [{
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": qkey,
                            "fields":
                            ["name.raw", "asciiname", "alternatenames"]
                        }
                    },
                    "filter": {
                        "bool": {
                            "should": [{
                                "range": {
                                    "population": {
                                        "gte": 5000
                                    }
                                }
                            }, {
                                "terms": {
                                    "featureCode": [
                                        "pcla", "pcli", "cont", "rgn", "admd",
                                        "adm1", "adm2"
                                    ]
                                }
                            }]
                        }
                    }
                }
            }, {
                "term": {
                    "name.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "asciiname.raw": {
                        "value": qkey
                    }
                }
            }, {
                "term": {
                    "alternatenames": {
                        "value": qkey[1:]
                    }
                }
            }, {
                "match": {
                    "alternatenames": {
                        "query": qkey,
                        'fuzziness': kwargs.pop("fuzzy", 0),
                        "max_expansions": kwargs.pop("max_expansion", 5),
                        "prefix_length": kwargs.pop("prefix_length", 1)
                    }
                }
            }]

        if maincondition:
            q["query"]["bool"][query_name] = maincondition

            if min_popln:
                filter_cond = [{"range": {"population": {"gte": min_popln}}}]
            else:
                filter_cond = []

            if kwargs:
                #filter_cond = [{"range": {"population": {"gte": min_popln}}}]
                filter_cond += [{
                    "term": {
                        key: val
                    }
                } for key, val in kwargs.viewitems()]
                # print(filter_cond)
                q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}
            elif min_popln:
                filter_cond = [{
                    "range": {
                        "population": {
                            "gte": min_popln
                        }
                    }
                }, {
                    "terms": {
                        "featureCode": ["ppla", "pplx"]
                    }
                }]

                q["query"]["bool"]["filter"] = {
                    "bool": {
                        "should": filter_cond
                    }
                }

        return self.eserver.search(q,
                                   index=self._index,
                                   doc_type=self._doctype)

    def oldquery(self, qkey, min_popln=None, **kwargs):
        #res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']['hits']
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        #max_score = sum([r['_score'] for r in res])
        max_score = res['max_score']  #sum([r['_score'] for r in res])
        #for t in res:
        gps = []
        if max_score == 0.0:
            ## no results were obtained by elasticsearch instead it returned a random/very
            ## low scoring one
            res['hits'] = []

        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            #t['_source']['_score'] = t[1] / max_score
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                gps = [pt]
                break

            gps.append(pt)

        if len(gps) == 1:
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))

        return gps

    def near_geo(self, geo_point, min_popln=5000, **kwargs):
        q2 = {
            "query": {
                "bool": {
                    "must": {
                        "match_all": {}
                    },
                    "filter": [
                        {
                            "geo_distance": {
                                "distance": "30km",
                                "coordinates": geo_point
                            }
                        },
                        {
                            "terms":
                            # {"featureCode":
                            #  ["pcli", "ppl", "ppla2", "adm3"]}
                            {
                                "featureClass": ["a", "h", "l", "t", "p", "v"]
                            }
                        }
                    ]
                }
            },
            "sort": {
                "population": "desc"
            }
        }
        if kwargs:
            for key in kwargs:
                q2['query']['bool']['filter'].append(
                    {"term": {
                        key: kwargs[key]
                    }})

        res = self.eserver.search(
            q2, index=self._index,
            doc_type=self._doctype)['hits']['hits'][0]['_source']
        res['confidence'] = 1.0
        return [GeoPoint(**res)]

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)
            settings['mappings'][self._doctype] = settings['mappings'].pop(
                'places')

        try:
            self.eserver.create_index(index=self._index, settings=settings)
        except:
            self.eserver.delete_index(self._index)
            self.eserver.create_index(index=self._index, settings=settings)

        for chunk in bulk_chunks(self._opLoader(datacsv, confDir),
                                 docs_per_chunk=1000):
            self.eserver.bulk(chunk, index=self._index, doc_type=self._doctype)
            print "..",

        self.eserver.refresh(self._index)

    def _opLoader(self, datacsv, confDir):
        ere = re.compile("[^\sa-zA-Z0-9]")
        with DataReader(datacsv, os.path.join(confDir,
                                              'geonames.conf')) as reader:
            cnt = 0
            for row in reader:
                try:
                    row['coordinates'] = [
                        float(row['longitude']),
                        float(row['latitude'])
                    ]
                    try:
                        row['population'] = int(row["population"])
                    except:
                        row['population'] = -1

                    try:
                        row['elevation'] = int(row['elevation'])
                    except:
                        row['elevation'] = -1

                    del (row['latitude'])
                    del (row['longitude'])
                    #print row['name']
                    row['alternatenames'] = row['alternatenames'].lower(
                    ).split(",")
                    row['normalized_asciiname'] = (re.sub(
                        r'\s+', r' ', ere.sub("", row['asciiname']))).strip()
                    cnt += 1
                    yield self.eserver.index_op(row,
                                                index=self._index,
                                                doc_type=self._doctype)
                except:
                    print json.dumps(row)
                    continue

    def remove_dynamic_stopwords(self, term):
        # cc = {}
        # ttl = 0
        words = [w for t in term.split("-") for w in t.split() if len(w) > 1]

        if len(words) == 1:
            return term

        stopword_removed = ""
        for word in words:
            try:
                t = self.eserver.count(word)['count']
                if t >= 20000:
                    continue
            except:
                pass

            stopword_removed += (word + " ")
            # else:
            #     print(term, "stopword ", word)

        return stopword_removed.strip()
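
A short usage sketch for the wrapper above; the index name, doc type and coordinates are assumptions, and the results are the GeoPoint objects built in query() and near_geo().

# Hypothetical usage of ESWrapper (index name, doc type and coordinates are assumptions)
gazetteer = ESWrapper('geonames', 'places', host='http://localhost', port=9200)

# top-scoring candidates for a place name, scores normalised against the best hit
candidates = gazetteer.query('San Francisco', min_popln=5000)
best = candidates[0] if candidates else None

# closest populated places around a [longitude, latitude] pair, as stored in 'coordinates'
nearby = gazetteer.near_geo([-122.4194, 37.7749])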
Example #37
0
def import_json_into_es(types, inputfolder, logger):
    """
    imports entities from the *name.json.bz2* files (one entity per line) into local elasticsearch
    :param types: json string like {'person': 'http://www.wikidata.org/entity/Q5', 'name': 'Wikidata-URI'}
    :param inputfolder:
    :param logger:
    :return:
    """

    es = ElasticSearch(config.ELASTICSEARCH_URL)

    try:
        es.delete_index('wikidata')
        es.create_index('wikidata')
        logger.info('rebuild index [wikidata]')
    except:
        logger.warning("can't delete wikidata index")

    # convert type dictionary
    wd_types = dict()
    for key in types.keys():
        value = int(types[key].split('/')[-1][1:])
        wd_types[value] = {
            'type': key,
            'filename': path.join(inputfolder, '{}.json.bz2'.format(key))
        }

    # import each given type
    for key in wd_types:
        logger.info(wd_types[key])

        done = 0
        items = []

        for line in BZ2File(wd_types[key]['filename'], 'rb'):
            line = line.strip()
            item = loads(line)
            item['uri'] = 'http://wikidata.org/wiki/' + item['id']

            items.append(item)
            done += 1

            if (done % 5000 == 0):
                es.bulk_index('wikidata',
                              wd_types[key]['type'],
                              items,
                              id_field='id')
                items = []

            # if done % len(wd_types) / 10 == 0: # log 10% steps
            #     logger.info('imported {}: {:,d} ({:,d})'.format(wd_types[key]['type'],done, 100*len(wd_types)/done ))

            if done % 10000 == 0:
                logger.info('imported {}: {}'.format(wd_types[key]['type'],
                                                     format(done, ',d')))

        if len(items) > 0:
            es.bulk_index('wikidata',
                          wd_types[key]['type'],
                          items,
                          id_field='id')
        logger.info('imported {}: {}'.format(wd_types[key]['type'],
                                             format(done, ',d')))
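
A hypothetical invocation of the importer above; the input folder and the second entity type are illustrative only, while the types mapping follows the format described in the docstring.

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('wikidata-import')

# expects person.json.bz2 and film.json.bz2 inside the input folder
import_json_into_es(
    {'person': 'http://www.wikidata.org/entity/Q5',
     'film': 'http://www.wikidata.org/entity/Q11424'},
    '/data/wikidata-dumps',
    logger)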
Example #38
0
import pandas as pd
from pyelasticsearch import ElasticSearch

# size of the bulk
chunksize=5000

# open csv file
f = open(raw_data_path+csv_filename) # read csv

# parse csv with pandas
csvfile=pd.read_csv(f, iterator=True, chunksize=chunksize) 

# init ElasticSearch
es = ElasticSearch('http://localhost:9200/')

# init index
try :
    es.delete_index("weiboscope")
except :
    pass

es.create_index("weiboscope")

# start bulk indexing 
print ("now indexing %s..."%(csv_filename))

for i,df in enumerate(csvfile): 
    print (i)
    records=df.where(pd.notnull(df), None).T.to_dict()
    list_records=[records[it] for it in records]
    try :
        es.bulk_index("weiboscope","tweet",list_records)
    except :
Example #39
0
class TestClient(unittest.TestCase):

    def setUp(self):
        super(TestClient, self).setUp()
        docs = []

        self.es_host = os.environ.get('ES_HOST', 'http://*****:*****')

    @mock.patch('monolith.client.util.iterweeks')
    def test_datetime_ranges(self, _mock):
        "Test datetime ranges get converted to dates."
        client = self._make_one()
        start = datetime.datetime(2012, 1, 1, 12, 34, 56)
        end = datetime.datetime(2012, 1, 31, 12, 34, 56)
        list(client('downloads_count', start, end, interval='week'))
        self.assertEqual(_mock.call_args[0][0], datetime.date(2012, 1, 1))
        assert not isinstance(_mock.call_args[0][0], datetime.datetime)
        self.assertEqual(_mock.call_args[0][1], datetime.date(2012, 1, 31))
        assert not isinstance(_mock.call_args[0][1], datetime.datetime)

    def test_date_order(self):
        # Ensure fill doesn't change date ordering.
        client = self._make_one()
        prev_date = datetime.date(2000, 1, 1)

        # Addon 1 doesn't have downloads for every month and the client will
        # fill zeroes for the missing dates.
        hits = list(client('downloads_count', START, '2012-05-01',
                           interval='month', add_on='1'))
        for hit in hits:
            d = hit['date']
            assert prev_date < d
            prev_date = d