Example #1
 def __del__(self):
     try:
         self.bulk_result = helpers.bulk(self.eS, self.actions, stats_only=False)
         # parallel_bulk() returns a lazy generator; without iterating it,
         # the following call does not actually send anything.
         helpers.parallel_bulk(self.eS, self.actions)
     except Exception:
         logging.error("ERROR in final elasticsearch BULK...")
         #self.bulk_result = helpers.bulk(self.eS, self.actions, stats_only = False)
         raise SubmitterError()

     if hasattr(self, 'bulk_result'):
         logging.info("result of final elasticsearch bulk: " + str(self.bulk_result))

     return
Example #2
    def index(self, queryset: QuerySet):
        self._init_index()

        for success, info in parallel_bulk(self.es_connection,
                                           generate_es_query(queryset)):
            if not success:
                raise RuntimeError(f"Error indexando query a ES: {info}")
Example #3
    def build_custom_dict(self):
        df = pd.read_excel(os.path.join(MEDIA_ROOT, 'dict.xlsx'))
        number_of_words = len(df)

        index = es.Index(ES_INDEX_CUSTOM_DICTIONARY_WORD, using=ES_CLIENT)
        index.delete(ignore=404)
        print("Creating index")
        CustomDictionaryWord.init()

        failed, success = 0, 0
        batch_size = 1000
        for ok, result in parallel_bulk(ES_CLIENT,
                                        self.word_generator(df),
                                        index=ES_INDEX_CUSTOM_DICTIONARY_WORD,
                                        chunk_size=batch_size,
                                        raise_on_error=False,
                                        thread_count=6):
            if ok:
                success += 1
            else:
                failed += 1
                action, result = result.popitem()
                print("!!!", action, result)

            if failed > 3:
                raise Exception("Too many failed!!")
            if (success + failed) % batch_size == 0:
                print(f'{success+failed}/{number_of_words} processed')
Example #4
    def es_index(self, articles):
        def annotatearticle(r):
            if r['_id'] in self.variantreferences:
                r['variants'] = list(self.variantreferences[r['_id']])
            if r['_id'] in self.genereferences:
                r['genes'] = list(self.genereferences[r['_id']])
                r['geneids'] = list(self.gids[r['_id']])
                r_ = [self.uniprot[gid]
                      for gid in self.gids[r['_id']]
                      if gid in self.uniprot
                      ]
                r__ = list()
                for i in r_:
                    for j in i:
                        r__.append(j)
                r['uniprotids'] = r__
            _id = str(r['_id'])

        def preparearticles():
            for r in articles:
                if 'references' in r:
                    del r['references']
                annotatearticle(r)
                yield r
        for ok, result in parallel_bulk(
                self.es, preparearticles(),
                thread_count=14, queue_size=1400,
                index=self.index, chunk_size=140
        ):
            if not ok:
                action, result = result.popitem()
                doc_id = '/%s/commits/%s' % (self.index, result['_id'])
                print('Failed to %s document %s: %r' % (action, doc_id, result))
Example #5
    def send_elastic(self):
        success = 0
        failed = 0
        qs = Document.objects.filter(id__gt=self.from_id)
        if self.to_id:
            qs = qs.filter(id__lte=self.to_id)
        qs = qs.order_by('id')
        # import datetime
        # qs = qs.filter(datetime__gte=datetime.date(2019, 1, 1), datetime__lte=datetime.date(2019, 3, 1)).order_by('id')
        # qs = qs.filter(source__corpus__name="gos").order_by('id')
        print("Start build")
        number_of_documents = qs.count()
        for ok, result in parallel_bulk(self.client, self.document_generator(qs), index=ES_INDEX_DOCUMENT,
                                        chunk_size=self.batch_size, raise_on_error=False, thread_count=6):
            if ok:
                success += 1
            else:
                failed += 1
                action, result = result.popitem()
                print("!!!", action, result)

            if failed > self.batch_size / 100:
                raise Exception("Too many failed!!")
            if (success + failed) % self.batch_size == 0:
                print(f'{success+failed}/{number_of_documents} processed')
Example #6
def push2es_parallel(config):
    es = recreate_index(config)
    source_path = config.get('main', 'source_path')
    _index = config.get('main', 'index')
    _type = config.get('main', 'type')
    parser = config.get('xml', 'parser')
    id_field = config.get('main', 'id_field', fallback=None)

    #assumes data is a list of dictionaries
    def genereate_actions(data):
        for item in data:
            action = {
                '_op_type': 'index',
                '_index': _index,
                '_type': _type,
                '_source': item
            }
            if '_id' in item:
                action['_id'] = item['_id']
                del item['_id']
            yield action

    data = parseXML(source_path, parser=parser, id_field=id_field)

    # parallel bulk
    for success, info in parallel_bulk(es,
                                       genereate_actions(data),
                                       thread_count=4):
        if not success: print('Doc failed', info)
Example #7
  def insert(self, file_name, index_name):

    total = 0
    actions = []
    not_inserted = 0
    chunk_size = 5000

    self.es.indices.delete(index=index_name, ignore=[400, 404])
    self.logger.info("deleted es index " + index_name)
    self.es.indices.create(index=index_name, body=self.index_settings)
    self.logger.info("created es index " + index_name)
    #self.logger.info('before chunk')
    for chunk in pd.read_table(file_name, sep='\t', header=None, na_filter=False,
                               chunksize=chunk_size, low_memory=False,
                               names=['fact_id', 'subject', 'predicate', 'object', 'number']):
        #self.logger.info('next chunk')
        if chunk.shape[1] != 5:
            raise Exception(file_name + ' does not contain 5 columns')
        actions = np.vectorize(self.add_action)(chunk['subject'], chunk['predicate'], chunk['object'], index_name)

        #self.logger.info('before bulk')
        for success, info in helpers.parallel_bulk(client=self.es, actions=actions , thread_count=5):
            if not success:
                print("Insert failed: ", info)
        #self.logger.info('after bulk')
        #del actions[0:len(actions)]
        total = total + chunk_size
        if total % 1000000 == 0:
            self.logger.info("sent a total of " + str(total) + " messages to es")
        #insert the last rows
    self.logger.info("total inserted : " + str(total) + " total not inserted : " + str(not_inserted))
Example #8
def get_locations(**kwargs):
    from elasticsearch.helpers import parallel_bulk
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT_LOCATION
    from .util import locations_generator

    import datetime

    failed = 0
    success = 0

    for ok, result in parallel_bulk(ES_CLIENT,
                                    (doc.to_dict()
                                     for doc in locations_generator(**kwargs)),
                                    index=ES_INDEX_DOCUMENT_LOCATION,
                                    chunk_size=10000,
                                    raise_on_error=True,
                                    thread_count=4):
        if (failed + success) % 10000 == 0:
            print(f"!!!{failed + success} processed", datetime.datetime.now())
        if failed > 5:
            raise Exception("Too many failed ES!!!")
        if not ok:
            failed += 1
        else:
            success += 1

    return 'Done'
Example #9
    def bulk(
        self,
        index,
        docs,
        chunk_size=None,
        max_chunk_bytes=None,
        queue_size=None,
        thread_count=None,
        refresh=False,
    ):
        """Bulk index, update, delete docs to Elasticsearch."""
        chunk_size = chunk_size or ELASTICSEARCH_CHUNK_SIZE
        max_chunk_bytes = max_chunk_bytes or ELASTICSEARCH_MAX_CHUNK_BYTES
        thread_count = thread_count or ELASTICSEARCH_THREAD_COUNT
        queue_size = queue_size or ELASTICSEARCH_QUEUE_SIZE

        for _ in parallel_bulk(
            self.__es,
            docs,
            index=index,
            thread_count=thread_count,
            chunk_size=chunk_size,
            max_chunk_bytes=max_chunk_bytes,
            queue_size=queue_size,
            refresh=refresh,
        ):
            pass
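The wrapper above only drains the generator; a standalone sketch of the same pattern, assuming a locally reachable cluster (index name and documents are illustrative):

from collections import deque

from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

es = Elasticsearch()  # assumption: cluster reachable on localhost:9200
docs = ({"_op_type": "index", "_index": "demo-index", "_id": i, "value": i} for i in range(1000))
# parallel_bulk() is lazy; draining it with deque(..., maxlen=0) discards the
# per-item results, which is equivalent to the "for _ in ...: pass" loop above.
deque(parallel_bulk(es, docs, thread_count=4, chunk_size=500), maxlen=0)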
Example #10
def remove_duplicates(index):
    has_duplicates = True

    while has_duplicates:

        # query for duplicates
        buckets = es_client.search(
            index=index, body=query,
            request_timeout=120)['aggregations']['duplicated_hash']['buckets']

        ids_to_delete = []

        if len(buckets) > 0:
            for bucket in buckets:
                documents = bucket["documents"]["hits"]["hits"]
                # skip first
                for doc in documents[1:]:
                    remove_doc = {
                        '_op_type': 'delete',
                        '_index': index,
                        '_type': '_doc',
                        '_id': doc["_id"]
                    }
                    ids_to_delete.append(remove_doc)

            deletes = helpers.parallel_bulk(es_client, ids_to_delete)
            try:
                for item in deletes:
                    pass
            except:
                pass
            print(len(ids_to_delete), "duplicates were removed.")
        else:
            has_duplicates = False
            print("No duplicates found.")
Example #11
    def bulk_index(self, index_name='restaurants', data=[]):
        print('updating index ...')
        docs = []
        for elem in data:
            docs.append({
                '_op_type': 'index',
                '_index': index_name,
                '_id': elem['id'],
                'body': {
                    'cuisine': elem['cuisine']
                }
            })
        # print(docs[:2])
        # bulk(self.es,iter(docs),request_timeout=30)
        for success, info in parallel_bulk(self.es,
                                           iter(docs),
                                           request_timeout=30):
            if not success:
                print('A document failed:', info)
            else:
                print('success')

        if data == []:
            print('no data provided')
        else:
            print('done')
Example #12
    def _perform_index_sync(self, sql_table_cls, es_doc_cls, id_logger):
        es_doc = es_doc_cls()

        elasticsearch_conn = connections.get_connection()

        sync_timestamp = current_server_timestamp()

        pending_insertions = self._compute_dirty_documents(
            sql_table_cls, es_doc.doc_type)

        bulk_op = self._synchronisation_op(es_doc, pending_insertions)

        self._logging(logging.INFO, 'Performing synchronization.')

        for ok, info in parallel_bulk(elasticsearch_conn, bulk_op):
            obj_id = info['index']['_id'] \
                if 'index' in info else info['update']['_id']

            if ok:
                # Mark the task as handled so we don't re-process it next time
                self._logging(logging.INFO,
                              'Document %s has been synced successfully.'
                              % obj_id)

                sql_table_cls.update_last_sync(obj_id, sync_timestamp)
            else:
                id_logger(obj_id, logging.ERROR,
                          'Error while syncing document %s index.' % obj_id)

        # Refresh indices to speed up subsequent searches
        elasticsearch_dsl.Index(es_doc.index).refresh()
Example #13
    def load(self, client, index_name, chunk_size, threads, timeout):

        # Open and store mapping
        with open(self.get_mapping_file(index_name), 'r') as mapping:
            # Create index with mapping. If it already exists, ignore this
            client.indices.create(index=index_name,
                                  ignore=400,
                                  body=mapping.read())

        # check if it exists some document in index from this file
        es_query = Search(using=client,
                          index=index_name).filter('term',
                                                   path=self.basename)[:0]
        result = es_query.execute()
        if result.hits.total != 0:
            raise IndexNotEmptyError(
                'There are {0} documents from this file in the index'.format(
                    result.hits.total))

        # Send docs to elasticsearch
        for success, info in parallel_bulk(client,
                                           self.make_docs(),
                                           thread_count=threads,
                                           chunk_size=chunk_size,
                                           request_timeout=timeout,
                                           index=index_name,
                                           doc_type='doc',
                                           raise_on_exception=False):
            if not success:
                print('Doc failed', info)
Example #14
    def bulk(self, docs, index = "", doc_type = "", op_type = 'index'):
        '''
        bulk sample:
        {"_op_type": "index", "_index": "test", "_type": "type1", "_id": "1", "_source": {"field1": "value1", "field2": "value2"}}
        {"_op_type": "delete", "_index": "test", "_type": "type1", "_id": "2"}
        '''
        index_ = self.index if index == "" else index
        doc_type_ = self.doc_type if doc_type == "" else doc_type
 
        allow_op = ['index', 'delete']
        if op_type not in allow_op:
            raise exceptions.RequestError(400, '{"msg":"op_type is not allowed, you can use index or delete"}')

        actions = []
        for doc in docs:
            action = {}
            action["_index"] = index_
            action["_type"] = doc_type_
            action["_id"] = doc["_id"]
            if op_type == 'index':
                del doc["_id"]
                action["_source"] = doc
            action["_op_type"] = op_type
            actions.append(action)

        return helpers.parallel_bulk(self.es, actions)
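The docstring's action format, used directly with helpers.parallel_bulk(); note that the method above returns the generator unconsumed, so nothing is sent until the caller iterates it (cluster and index names below are illustrative):

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()  # assumption: local cluster
actions = [
    {"_op_type": "index", "_index": "test", "_type": "type1", "_id": "1",
     "_source": {"field1": "value1", "field2": "value2"}},
    {"_op_type": "delete", "_index": "test", "_type": "type1", "_id": "2"},  # "_type" applies to ES 6.x and earlier
]
# Iterating the generator is what actually sends the requests.
for ok, info in helpers.parallel_bulk(es, actions):
    if not ok:
        print("bulk item failed:", info)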
Example #15
 async def load_bulk(self, items_chunk):
     payload = self.create_bulk(items_chunk)
     try:
         # parallel_bulk() is a lazy generator of (ok, info) tuples; it has to be
         # iterated for the actions to actually be sent.
         for ok, info in parallel_bulk(
                 self.client, payload, chunk_size=1000):
             if not ok:
                 print('A document failed:', info)
     except Exception:
         pass
Example #16
File: eisp.py Project: cceh/eisp
    def main(self) -> None:
        try:
            instance.config = ConfigParser()
            instance.config.read_dict(defaultconfig())
            logger().info('Started eisp with pid %s', getpid())

            for i in [i for i in argv if i.startswith('--')]:
                try:
                    mod('eisp.param.{}'.format(i[2:])).__dict__[i[2:]](argv)
                except:
                    exit('Invalid parameter or argument to {}'.format(i[2:]))

            conf = dotdict(instance.config['data'])
            connections.create_connection(hosts=[conf.host])
            delete_index(conf.index_name)
            create_index(conf.elastic_mapping, conf.index_name)

            for ok, info in helpers.parallel_bulk(connections.get_connection(),
                                                  actions=index_pdfs(
                                                      conf.index_name,
                                                      conf.root),
                                                  request_timeout=60,
                                                  chunk_size=100,
                                                  thread_count=8,
                                                  queue_size=8):
                if not ok:
                    print(info)

        except KeyboardInterrupt:
            print('\N{bomb}')
        except Exception as exception:
            logger().exception(exception)
        except SystemExit as exception:
            logger().critical(str(exception))
Example #17
def insert_multiple_docs(client, data_generator, para=False):
    if not para:
        # serial helper when parallel indexing was not requested
        bulk(client, data_generator)
    else:
        for success, info in parallel_bulk(client, data_generator):
            if not success:
                print("A document failed:", info)
Example #18
    def populate(self, index_name, corpus, thread=4, chunk=500):
        start_time = time.time()
        connection = sqlite3.connect(corpus, check_same_thread=False)
        cursor = connection.cursor()
        number_of_docs = 0
        for row in cursor.execute('SELECT * FROM documents'):
            number_of_docs += 1

        cursor.close()

        if not (self.client.indices.exists(index_name)):
            print("This index does not exist")
        else:

            print("Creating an index...")
            #create_index(self.client, index_name)

            print("Indexing documents...")
            progress = tqdm.tqdm(unit="docs", total=number_of_docs)
            successes = 0
            for ok, action in parallel_bulk(client=self.client,
                                            index=index_name,
                                            actions=self.generate_docs(corpus),
                                            thread_count=thread,
                                            chunk_size=chunk):
                progress.update(1)
                successes += ok
            print("Indexed %d/%d documents" % (successes, number_of_docs))
            print("--- %s seconds ---" % (time.time() - start_time))
Example #19
 def bulk_data(self, index, doc_type, is_parallel=True, batch_chunk_size=5000, threads_counts=8):
     '''
     Bulk-insert data.
     :param index: index to insert the data into
     :param doc_type: document type of the index
     :param batch_chunk_size: chunk size per batch, only used by the non-parallel path
     :param is_parallel: whether to insert in parallel; defaults to parallel
     :param threads_counts: number of threads, only effective when inserting in parallel
     :return:
     '''
     if is_parallel is None or is_parallel == True:
         gen_action = self._gen_parallel_data(index, doc_type)
         print("Inserting data in parallel...")
         start = time.time()
         for success, info in helpers.parallel_bulk(client=self.es, actions=gen_action, thread_count=threads_counts, chunk_size=1000):
             if not success:
                 print("Insert failed: ", info)
         print("Data inserted successfully... ", time.time()-start)
     elif is_parallel == False:
         gen_action = self._gen_data(index, doc_type, batch_chunk_size)
         try:
             print("Inserting data...")
             t3 = time.time()
             helpers.bulk(client=self.es, actions=gen_action, chunk_size=500)
             print("Insert succeeded....", time.time() - t3)
         except Exception as e:
             print(e, "insert failed!")
     else:
         raise ValueError("is_parallel should be True or False")
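For reference, the two helpers used in the branches above have different return shapes; a minimal sketch (cluster and index are illustrative):

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()  # assumption: local cluster
actions = [{"_index": "demo", "_id": i, "value": i} for i in range(10)]

# helpers.bulk() is eager and returns a (success_count, errors) tuple...
success_count, errors = helpers.bulk(es, actions, chunk_size=500)

# ...while helpers.parallel_bulk() is a lazy generator of (ok, info) tuples,
# which is why the parallel branch above iterates it item by item.
for ok, info in helpers.parallel_bulk(es, actions, thread_count=8, chunk_size=1000):
    if not ok:
        print("Insert failed: ", info)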
Example #20
    def _perform_index_sync(self, sql_table_cls, es_doc_cls, id_logger):
        es_doc = es_doc_cls()

        elasticsearch_conn = connections.get_connection()

        sync_timestamp = current_server_timestamp()

        pending_insertions = self._compute_dirty_documents(
            sql_table_cls, es_doc.doc_type)

        bulk_op = self._synchronisation_op(es_doc, pending_insertions)

        self._logging(logging.INFO, 'Performing synchronization.')

        for ok, info in parallel_bulk(elasticsearch_conn, bulk_op):
            obj_id = info['index']['_id'] \
                if 'index' in info else info['update']['_id']

            if ok:
                # Mark the task as handled so we don't re-process it next time
                self._logging(
                    logging.INFO,
                    'Document %s has been synced successfully.' % obj_id)

                sql_table_cls.update_last_sync(obj_id, sync_timestamp)
            else:
                id_logger(obj_id, logging.ERROR,
                          'Error while syncing document %s index.' % obj_id)

        # Refresh indices to speed up subsequent searches
        elasticsearch_dsl.Index(es_doc.index).refresh()
Example #21
    def run(self, distribution):
        fields = distribution.field_set.all()
        fields = {field.title: field.identifier for field in fields}
        df = self.init_df(distribution, fields)

        # Apply the processing and indexing operation to each column
        result = [
            process_column(df[col], self.index_name) for col in df.columns
        ]

        if not result:  # Distribution with no loaded series
            return

        # List flatten: if the result is several lists, merge them into one
        actions = reduce(lambda x, y: x + y, result) if isinstance(
            result[0], list) else result

        self.add_catalog_keyword(actions, distribution)
        for success, info in parallel_bulk(self.elastic, actions):
            if not success:
                logger.warning(strings.BULK_REQUEST_ERROR, info)

        remove_duplicated_fields(distribution)
        for field in distribution.field_set.exclude(title='indice_tiempo'):
            field.enhanced_meta.update_or_create(key=meta_keys.AVAILABLE,
                                                 value='true')

        # Compute additional metadata for each series
        df.apply(update_enhanced_meta,
                 args=(distribution.dataset.catalog.identifier,
                       distribution.identifier))
Example #22
def parallel_insert(client, executors, filename):
    start = time.time()
    reader = jsonlines.open(filename, 'r')
    futures = [executors.submit(json_to_doc, obj) for obj in reader]
    kwargs = {
        'total': len(futures),
        'unit': 'parsed',
        'unit_scale': True,
        'leave': True
    }
    for f in tqdm(as_completed(futures), **kwargs):
        pass
    results = [f.result() for f in futures]
    print("Json preprocessing done in {:.5f}s".format(start - time.time()))

    t = time.time()
    slicing = 5000
    for i in range(slicing, len(results) + slicing, slicing):
        docs = results[i - slicing:i]
        for r in parallel_bulk(client,
                               Post.bulk_dicts(docs),
                               thread_count=4,
                               chunk_size=400):
            pass
        print('{} insertion done in {:.5f}s'.format(i, time.time() - t))
        t = time.time()
    print('Total elapsed time: {:.5f}s'.format(time.time() - start))
Example #23
    def _buildES(self, cat, feat, reprList):
        """
        Build the category's elasticsearch model using corpus
        """
        if not self.reprDict.has_key(cat):
            self.reprDict[cat] = {}
        self.reprDict[cat][feat] = reprList
        lowerCat = cat.lower() + feat
        os.system(
            ut.rp('elastic/init_entity_search.sh ') + lowerCat + ' ' +
            lowerCat)
        actionList = []
        uniqReprList = list(set(reprList))
        for each in uniqReprList:
            action = {
                "_index": lowerCat,
                "_type": lowerCat,
                "_source": {
                    "name": each
                }
            }
            actionList.append(action)

        for success, info in helpers.parallel_bulk(es_client,
                                                   actionList,
                                                   chunk_size=200,
                                                   thread_count=12):
            print success, info

        self.save()
Example #24
def _upload_to_es(payload_file, my_uuid, timestamp, es, my_node, my_pod, index_retries):
    documents = {
            "total": 0,
            "existent": 0,
            "failed": 0
    }

    def doc_stream():
        for scribed in transcribe(payload_file, 'stockpile'):
            doc = json.loads(scribed)
            es_index = "%s-metadata" % doc["module"]
            doc["uuid"] = my_uuid
            _id = hashlib.sha256(str(doc).encode()).hexdigest()
            # This information changes depending on the node and pod where stockpile-wrapper is executed
            # Don't include it in the _id calculation to avoid indexing several times documents not
            # specific to a node
            doc["node_name"] = my_node
            doc["pod_name"] = my_pod
            doc["timestamp"] = timestamp
            documents["total"] += 1
            yield {"_index": es_index,
                   "_source": doc,
                   "_id": _id,
                   "_op_type": "create"}

    failed_docs = []
    for r in range(index_retries):
        documents["failed"] = 0
        documents["existent"] = 0
        try:
            for ok, resp in parallel_bulk(es, doc_stream()):
                pass
        # Catch indexing exception
        except BulkIndexError as err:
            exception = err
            # An exception can refer to multiple documents
            for failed_doc in err.errors:
                # Document already exists in ES
                if failed_doc["create"]["status"] == 409:
                    documents["existent"] += 1
                    continue
                documents["failed"] += 1
                es_index = "%s-metadata" % failed_doc["create"]["data"]["module"]
                doc = {"_index": es_index,
                       "_source": failed_doc["create"]["data"],
                       "_id": failed_doc["create"]["_id"],
                       "_op_type": "create"}
                failed_docs.append(doc)
        except Exception as err:
            print("Unknown indexing error: %s" % err)
            return
        if not documents["failed"]:
            break
    if documents["total"] > documents["failed"] + documents["existent"]:
        print("%d documents successfully indexed" % (documents["total"] - documents["failed"] - documents["existent"]))
    if documents["failed"] > 0:
        print("%d documents couldn't be indexed" % documents["failed"])
        print("Indexing exception found %s" % exception)
    if documents["existent"] > 0:
        print("%d documents already exist in ES" % documents["existent"])
Example #25
def json2es(json_data, es: Elasticsearch, index_name):
    """
    将json数据写入到ES
    :param json_data: 包含json数据的迭代器或json数据
    :param es: ES客户端
    :param index_name: 写入的索引名
    """
    actions = []
    count = 0
    if helper.is_json(json_data):
        action = es_helper.get_action(json_data, index_name)
        actions.append(action)
    elif isinstance(json_data, Iterator):
        for line in json_data:
            if helper.is_json(line):
                action = es_helper.get_action(line, index_name)
                actions.append(action)
            else:
                print(str(line) + " is not json data")
    for success, info in helpers.parallel_bulk(es,
                                               actions,
                                               thread_count=1,
                                               chunk_size=4000,
                                               max_chunk_bytes=100 * 1024 *
                                               1024 * 2):
        if not success:
            print('Doc failed', info)
        else:
            count = count + 1

    print("total insert " + str(count) + " event to " + index_name)
Example #26
    def storeTweetsWithTag(self, tweets, query, event=""):
        tweets_not_created = []

        to_update = (
            {
                '_op_type': 'update',
                '_type': 'tweets',
                '_index': self.index,
                '_id': tweet["id"],
                'script': {
                    'lang': "painless",
                    "inline": "ctx._source.tags.contains(params.query) ? (ctx.op = \"none\") : ctx._source.tags.add(params.query)",
                    "params": {
                        "query": query,
                        "event": event
                    }
                },
                'upsert': tweet
            }
            for tweet in self.format_tweets(tweets, query, event) if "entities" in tweet)

        errors = []
        for res, item in helpers.parallel_bulk(self.es,to_update,chunk_size=chunk_size, thread_count=thread_count, raise_on_error=False):
            if not res:
                errors.append(item)
        return errors
Example #27
    def run(self, corpus, index_name="fact_corpus", document_class=Fact, **kwargs):
        connections.create_connection(hosts=["localhost"])
        document_class.init()

        documents = (
            document_class(meta={"id": id}, fact=doc["fact"]).to_dict(True)
            for id, doc in corpus.items()
        )

        logger.info(f"Building corpus index for {index_name}")

        # RayExecutor().run(documents, self.save_data, {})

        for success, info in tqdm(
            parallel_bulk(
                connections.get_connection(),
                documents,
                thread_count=kwargs.pop("batch_size", multiprocessing.cpu_count()),
                chunk_size=100000,
                max_chunk_bytes=2 * 1024 ** 3,
            )
        ):
            if not success:
                logger.error(f"A document failed: {info} ")

        logger.success("Elastic index successfully built")

        return index_name
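A self-contained sketch of the same pattern with a minimal elasticsearch_dsl document class (the Fact mapping, hosts, and corpus below are assumptions; the real document_class is defined elsewhere in the project):

from elasticsearch.helpers import parallel_bulk
from elasticsearch_dsl import Document, Text, connections

class Fact(Document):
    # Assumed minimal mapping; the project's real Fact class lives elsewhere.
    fact = Text()

    class Index:
        name = "fact_corpus"

connections.create_connection(hosts=["localhost"])
Fact.init()  # create the index and mapping if they do not exist yet

corpus = {"1": {"fact": "water boils at 100 C"}, "2": {"fact": "the sky is blue"}}
documents = (
    Fact(meta={"id": id}, fact=doc["fact"]).to_dict(True)  # True -> include _index/_id meta
    for id, doc in corpus.items()
)
for success, info in parallel_bulk(connections.get_connection(), documents, chunk_size=100):
    if not success:
        print("A document failed:", info)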
Example #28
    def to_es(self, df, index, doc_type=None, use_index=False, thread_count=2, chunk_size=1000, request_timeout=60,
              success_threshold=0.9):
        '''
        :param df: pandas DataFrame data
        :param index: full name of the es index
        :param doc_type: full name of the es document type
        :param use_index: use the DataFrame index as each record's _id
        :param thread_count: number of threads sending data to es
        :param chunk_size: number of docs in one chunk sent to es
        :param request_timeout:
        :param success_threshold: minimum acceptable ratio of successfully written records
        :return: number of records written into es successfully
        '''
        if self.es7:
            doc_type = '_doc'
        if not doc_type:
            doc_type = index + '_type'
        gen = helpers.parallel_bulk(self.es, (self.rec_to_actions(df, index, doc_type=doc_type, use_index=use_index, chunk_size=chunk_size)),
                                    thread_count=thread_count,
                                    chunk_size=chunk_size, raise_on_error=True, request_timeout=request_timeout)

        success_num = np.sum([res[0] for res in gen])
        rec_num = len(df)
        fail_num = rec_num - success_num

        if (success_num / rec_num) < success_threshold:
            raise Exception('%d records write failed' % fail_num)

        return success_num
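A standalone sketch of the success-threshold check described in the docstring (the DataFrame, index name, and plain generator stand in for the class's rec_to_actions()):

import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()  # assumption: local cluster
df = pd.DataFrame({"name": ["a", "b", "c"], "score": [1.5, 2.0, 0.7]})

def df_actions(frame, index_name, use_index=False):
    for idx, row in frame.iterrows():
        action = {"_index": index_name, "_source": row.to_dict()}
        if use_index:
            action["_id"] = idx  # reuse the DataFrame index as the document _id
        yield action

gen = helpers.parallel_bulk(es, df_actions(df, "demo-df", use_index=True),
                            thread_count=2, chunk_size=1000, request_timeout=60)
success_num = np.sum([res[0] for res in gen])
if (success_num / len(df)) < 0.9:
    raise Exception('%d records write failed' % (len(df) - success_num))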
Example #29
 def WriteES(self, Index_name, Tag_name, IndexData, PreResult):
     from elasticsearch import helpers
     """
     ES数据写入库连接封装类,输入数据分别是Index_name是ES数据库中的表名,tag_name为需要打标签的名字,IndexData为需要更新的索引ID编号
     形式如下['J3213225318122300014', 'J3205075218122300001']
     PreResult为预测的二维数组更新内容,没有则新建[[0.93480456],[0.9358239 ],[0.8241926 ],[0.9171963 ]]
     """
     actions = []
     num = len(IndexData)
     for line in range(num):
         # res = str(PreResult[line][0])
         res = round(PreResult[line][0], 3)
         action = {
             '_op_type': 'update',
             "_index": Index_name,
             "_type": "_doc",
             "_id": IndexData[line],
             "doc": {
                 Tag_name: res,
             }
         }
         actions.append(action)
     ess = helpers.parallel_bulk(self.connes, actions, self.thread,
                                 self.chunk_size)
     for ok, response in ess:
         if not ok:
             print(response)
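A standalone sketch of the same partial-update pattern (the index name and tag field are assumptions; the IDs and scores mirror the docstring examples):

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()  # assumption: local cluster
index_data = ['J3213225318122300014', 'J3205075218122300001']
pre_result = [[0.93480456], [0.9358239]]

actions = (
    {"_op_type": "update", "_index": "my-index", "_type": "_doc", "_id": _id,
     "doc": {"risk_score": round(score[0], 3)}}  # "doc" performs a partial update
    for _id, score in zip(index_data, pre_result)
)
for ok, response in helpers.parallel_bulk(es, actions, thread_count=4, chunk_size=500):
    if not ok:
        print(response)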
Example #30
def build_index(CLIENT, VCLAIMS, INDEX_FILE, INDEX_NAME, KEYS):
    vclaims_count = VCLAIMS.shape[0]
    clear_index(CLIENT, INDEX_NAME)

    with open(INDEX_FILE) as index_file:
        source = index_file.read()
        CLIENT.indices.create(index=INDEX_NAME, body=source)

    lib.logger.info(f"Embedding vclaims.")
    actions = []
    for i, vclaim in tqdm(VCLAIMS.iterrows(), total=vclaims_count):
        if not CLIENT.exists(index=INDEX_NAME, id=i):
            body = vclaim.loc[KEYS[:-1]].replace(np.nan, "").to_dict()
            body["vector"] = lib.embedd(vclaim['vclaim'])
            actions.append({
                '_op_type': 'create',
                '_index': INDEX_NAME,
                '_id': i + 1,
                '_source': body
            })
    lib.logger.info(
        f"Adding {vclaims_count} entries to '{INDEX_NAME}' with fieldnames: {KEYS}"
    )

    for entry in tqdm(helpers.parallel_bulk(client=CLIENT, actions=actions),
                      total=vclaims_count):
        pass
Example #31
def start_bulk_indexing(es_instance, dataset, dataset_name, doc_type):
    for success, info in helpers.parallel_bulk(
        es_instance, generate_actions(dataset, dataset_name, doc_type), thread_count=4, chunk_size=1000
    ):
        if not success:
            print("A document failed to index: {}".format(info))
            logger.info("A document failed to index: {}".format(info))
Example #32
def run_and_index(directory, metadata={}):
    if not metadata:
        with open(f"{directory}/metadata.json") as reader:
            metadata = json.load(reader)
    # setup patches index
    data_host = environment["elasticsearch_url"]
    data_index = "fyp-patches"
    data_es = Elasticsearch([
        {
            "host": data_host,
            "port": 443,
            "use_ssl": True,
            "timeout": 60,
            "max_retries": 10,
            "retry_on_timeout": True
        },
    ])
    # create index if doesn't exist
    mapping = {"mappings": {"properties": {"location": {"type": "geo_shape"}}}}
    data_es.indices.create(index=data_index, ignore=400, body=mapping)

    deque(helpers.parallel_bulk(client=data_es,
                                actions=get_data(directory, metadata,
                                                 data_index),
                                chunk_size=500),
          maxlen=0)
Example #33
    def bulk(self, docs, index="", doc_type="", op_type='index'):
        '''
        bulk sample:
        {"_op_type": "index", "_index": "test", "_type": "type1", "_id": "1", "_source": {"field1": "value1", "field2": "value2"}}
        {"_op_type": "delete", "_index": "test", "_type": "type1", "_id": "2"}
        '''
        index_ = self.index if index == "" else index
        doc_type_ = self.doc_type if doc_type == "" else doc_type

        allow_op = ['index', 'delete']
        if op_type not in allow_op:
            raise exceptions.RequestError(
                400,
                '{"msg":"op_type is not allowed, you can use index or delete"}'
            )

        actions = []
        for doc in docs:
            action = {}
            action["_index"] = index_
            action["_type"] = doc_type_
            action["_id"] = doc["_id"]
            if op_type == 'index':
                del doc["_id"]
                action["_source"] = doc
            action["_op_type"] = op_type
            actions.append(action)

        return helpers.parallel_bulk(self.es, actions)
Example #34
  def run(self):
    logger.debug("Starting thread '{}'".format(self.name))

    while True:
      cursor = None
      actions = []
      for entry in self.jrnl:
        action, cursor = self._journal_entry_to_action(entry)
        actions.append(action)

      if len(actions) > 0:
        for success, info in helpers.parallel_bulk(self.es, actions, thread_count=2, index=self.index):
          if not success:
            logger.error('Failed: {}'.format(info[0]))
            return False

        self.es.indices.refresh()
        count = self.es.count(index=self.index)
        logger.debug("Items counted on index '{}': {}".format(self.index, count['count']))

        cursorpath = Path("/var/cache/geeft_systemd.cursor")
        with open(cursorpath, 'w') as cursorfile:
          cursorfile.write(cursor)
      time.sleep(1)

    logger.debug("Exiting thread '{}'".format(self.name))
Example #35
 def test_chunk_sent_from_different_threads(self, _process_bulk_chunk):
     actions = ({"x": i} for i in range(100))
     results = list(
         helpers.parallel_bulk(
             Elasticsearch(), actions, thread_count=10, chunk_size=2
         )
     )
     self.assertTrue(len(set([r[1] for r in results])) > 1)
Example #36
def reindex(config, **kwargs):
    """
    Recreate the Search Index.
    """
    client = config.registry["elasticsearch.client"]
    db = Session(bind=config.registry["sqlalchemy.engine"])

    # We use a randomly named index so that we can do a zero downtime reindex.
    # Essentially we'll use a randomly named index which we will use until all
    # of the data has been reindexed, at which point we'll point an alias at
    # our randomly named index, and then delete the old randomly named index.

    # Create the new index and associate all of our doc types with it.
    index_base = config.registry["elasticsearch.index"]
    random_token = binascii.hexlify(os.urandom(5)).decode("ascii")
    new_index_name = "{}-{}".format(index_base, random_token)
    doc_types = config.registry.get("search.doc_types", set())
    new_index = get_index(
        new_index_name,
        doc_types,
        using=client,
        shards=config.registry.get("elasticsearch.shards", 1),
        replicas=config.registry.get("elasticsearch.replicas", 1),
    )
    new_index.create()

    # From this point on, if any error occurs, we want to be able to delete our
    # in progress index.
    try:
        db.execute("SET statement_timeout = '600s'")

        for _ in parallel_bulk(client, _project_docs(db)):
            pass
    except:
        new_index.delete()
        raise
    finally:
        db.rollback()
        db.close()

    # Now that we've finished indexing all of our data, we'll point the alias
    # at our new randomly named index and delete the old index.
    if client.indices.exists_alias(name=index_base):
        to_delete = set()
        actions = []
        for name in client.indices.get_alias(name=index_base):
            to_delete.add(name)
            actions.append({"remove": {"index": name, "alias": index_base}})
        actions.append({"add": {"index": new_index_name, "alias": index_base}})
        client.indices.update_aliases({"actions": actions})
        client.indices.delete(",".join(to_delete))
    else:
        client.indices.put_alias(name=index_base, index=new_index_name)
Example #37
def create_es_indices_bulk_parallel(es, data_list, thread_cnt):
    '''
    For creating elasticsearch indices in bulk (uses parallel threading, thus faster than es_indices_bulk)
    :param es:
    :param data_list:
    :param thread_cnt:
    :return:
    '''
    for success, info in helpers.parallel_bulk(es, data_list, thread_count=thread_cnt, chunk_size=1000000,
                                               request_timeout=30):
        if not success:
            print('A document failed:', info)
Example #38
    def flush_cache(self):
        if len(self.cache) == 0:
            return True
        retry = 2
        for i in range(retry):
            try:
                to_upload = helpers.parallel_bulk(
                    self.es, self.cache_insertable_iterable())
                counter = 0
                num_items = len(self.cache)
                for item in to_upload:
                    self.logger.debug(
                        "{} of {} Elastic objects uploaded".format(
                            counter, num_items))
                    counter = counter + 1
                output = "Pushed {} items to Elasticsearch to index {}".format(
                    num_items, self.index)
                output += " and browbeat UUID {}".format(str(browbeat_uuid))
                self.logger.info(output)
                self.cache = deque()
                self.last_upload = datetime.datetime.utcnow()
                return True
            except Exception as Err:
                self.logger.error(
                    "Error pushing data to Elasticsearch, going to retry"
                    " in 10 seconds")
                self.logger.error("Exception: {}".format(Err))
                time.sleep(10)
                if i == (retry - 1):
                    self.logger.error(
                        "Pushing Data to Elasticsearch failed in spite of retry,"
                        " dumping JSON for {} cached items".format(
                            len(
                                self.cache)))
                    for item in self.cache:
                        filename = item['test_name'] + '-' + item['identifier']
                        filename += '-elastic' + '.' + 'json'
                        elastic_file = os.path.join(item['result_dir'],
                                                    filename)

                        with open(elastic_file, 'w') as result_file:
                            json.dump(item['result'],
                                      result_file,
                                      indent=4,
                                      sort_keys=True)

                            self.logger.info(
                                "Saved Elasticsearch consumable result JSON to {}". format(
                                    elastic_file))
                    self.cache = deque()
                    self.last_upload = datetime.datetime.utcnow()
                    return False
Example #39
    def bulk(self,actions):
        if len(actions) > 0:
            self.tempData.extend(actions)

            #If more than # MB, store on elasticsearch and clear list
            if sys.getsizeof(self.tempData) > 1000000:
                print 'UPLOADING ########### The SIZE IS:  ' + str(sys.getsizeof(self.tempData))
                #try:
                for success, info in helpers.parallel_bulk(self.es, self.tempData, thread_count=4):
                    if not success: print('Doc failed', info)
                #except:
                    #print 'Error while uploading : ', actions
                print 'uploaded!'
                self.tempData = []
Example #40
 def PushMessage(self, es):
   try:
     #r = requests.post('%s/_bulk?' % args.elasticserver, data=data, timeout=args.timeout)
     #helpers.parallel_bulk(es, data, chunk_size=5)
     for success, info in helpers.parallel_bulk(es, self.send_data, chunk_size=1500):
       #print '\n', info, success
       print info, success
       if not success:
         print('A document failed:', info)
     self.data = {}
     self.send_data = []
     loggerIndex.info('Bulk API request to Elasticsearch returned with code ' )
   except Exception, e:
     loggerIndex.error('Failed to send to Elasticsearch: %s' % e)
Example #41
 def _create_class_docs(self, routing, binary_id, classes):
     bulk_actions = []
     for objc_class in classes:
         bulk_actions.append({
             "_index": self.index,
             "_type": "class",
             "_parent": binary_id,
             "_routing": routing,
             "_source": dict(objc_class)
         })
     
     for success, info in helpers.parallel_bulk(self.es, bulk_actions):
         if not success:
             raise Exception("A class document failed: %s" % info)
Example #42
 def _create_request_docs(self, routing, analysis_id, requests):
     bulk_actions = []
     for request in requests:
         bulk_actions.append({
             "_index": self.index,
             "_type": "network_request",
             "_parent": analysis_id,
             "_routing": routing,
             "_source": dict(request)
         })
     
     for success, info in helpers.parallel_bulk(self.es, bulk_actions):
         if not success:
             raise Exception( "A network_request document failed: %s" % info)
Example #43
 def _create_file_docs(self, routing, analysis_id, files):
     bulk_actions = []
     for a_file in files:
         bulk_actions.append({
             "_index": self.index,
             "_type": "file_access",
             "_parent": analysis_id,
             "_routing": routing,
             "_source": dict(a_file)
         })
     
     for success, info in helpers.parallel_bulk(self.es, bulk_actions):
         if not success:
             raise Exception("A file_access document failed: %s" % info)
Example #44
    def _geocomplete_index_batch(self, elasticsearch_conn, to_index):
        log_msg = 'Indexing documents.'
        self._logging(logging.INFO, log_msg)

        for ok, info in parallel_bulk(elasticsearch_conn, to_index):
            if not ok:
                doc_id = info['create']['_id']
                doc_type = info['create']['_type']
                doc_index = info['create']['_index']

                logging_level = logging.ERROR
                err_msg = "Couldn't index document: '%s', of type: %s, " \
                          "under index: %s." % (doc_id, doc_type, doc_index)

                self._logging(logging_level, err_msg)
Example #45
 def do_commit(rrr):
     print ('COMMITTING BATCH...',vectors_model_name,len(rrr))
     
     if mc_config.LOW_LEVEL:
         ii = parallel_bulk(es,
                            rrr,
                            thread_count = 1,
                            chunk_size = 500,
                            max_chunk_bytes = 100 * 1024 * 1024, #100MB
                            )
     else:
         ii = nes.parallel_bulk(rrr)
     
     for is_success,res in ii:
         #print ('COMMITTED_VECTORS',vectors_model_name,is_success,res)
         pass
     
     rrr[:] = []
     print ('COMMITTED')
Example #46
    def do_commit(rrr):
        print ('COMMITTING BATCH...',len(rrr))

        #print ('SERVER_RESULTS_SAMPLE', rrr[5:])
        #raw_input_enter()
        
        from elasticsearch.helpers import parallel_bulk, scan

        ii = parallel_bulk(es,
                           rrr,
                           thread_count = 1,
                           chunk_size = 500,
                           max_chunk_bytes = 100 * 1024 * 1024, #100MB
                           )

        for is_success,res in ii:
            #print ('COMMITTED',is_success,res)
            pass

        rrr[:] = []
        print ('COMMITTED')
Example #47
def reindex_project(self, request, project_name):
    r = redis.StrictRedis.from_url(request.registry.settings["celery.scheduler_url"])
    try:
        with SearchLock(r, timeout=15, blocking_timeout=1):
            client = request.registry["elasticsearch.client"]
            doc_types = request.registry.get("search.doc_types", set())
            index_name = request.registry["elasticsearch.index"]
            get_index(
                index_name,
                doc_types,
                using=client,
                shards=request.registry.get("elasticsearch.shards", 1),
                replicas=request.registry.get("elasticsearch.replicas", 0),
            )

            for _ in parallel_bulk(
                client, _project_docs(request.db, project_name), index=index_name
            ):
                pass
    except redis.exceptions.LockError as exc:
        raise self.retry(countdown=60, exc=exc)
Example #48
def update_thread(thread, es=None):
    """Put into the index new child posts"""

    if not es:
        es = Elasticsearch()
    currents_posts = fetch_hn_data(thread)['kids']
    query = {
        "_source": False,
        'query': {
            'term': {"parent": thread}
        }
    }
    older_posts_gen = helpers.scan(es, query)
    old_posts_ids = {int(item['_id']) for item in older_posts_gen}
    new_posts_ids = set(currents_posts) - old_posts_ids
    if new_posts_ids:
        print("There are {} new posts!".format(len(new_posts_ids)))
        actions = [format_data_for_action(r)
            for r in new_posts_ids
            if format_data_for_action(r)]
        list(helpers.parallel_bulk(es, actions))
Example #49
import csv
from collections import deque
import elasticsearch
from elasticsearch import helpers

def readMovies():
    csvfile = open('ml-latest-small/movies.csv', 'r')

    reader = csv.DictReader( csvfile )
    for line in reader:
        # Build one document per row of movies.csv
        movie = {}
        movie['movie_id'] = int(line['movieId'])
        movie['title'] = line['title']
        movie['genres'] = line['genres']
        yield movie


es = elasticsearch.Elasticsearch()

es.indices.delete(index="movies",ignore=404)
deque(helpers.parallel_bulk(es,readMovies(),index="movies",doc_type="movie"), maxlen=0)
es.indices.refresh()
Example #50
 def parallel_bulk(self,thread_count=4):
     if self.client and (len(self.actions) > 0):
         # parallel_bulk() is lazy, so drain the generator to actually send the actions.
         for ok, info in helpers.parallel_bulk(self.client, self.actions, thread_count=thread_count):
             if not ok:
                 print('Doc failed', info)
     else:
         raise _EsError('commit:_Es have no client or actions = 0')
Example #51
def reindex(self, request):
    """
    Recreate the Search Index.
    """
    r = redis.StrictRedis.from_url(request.registry.settings["celery.scheduler_url"])
    try:
        with SearchLock(r, timeout=30 * 60, blocking_timeout=30):
            p = urllib.parse.urlparse(request.registry.settings["elasticsearch.url"])
            client = elasticsearch.Elasticsearch(
                [urllib.parse.urlunparse(p[:2] + ("",) * 4)],
                verify_certs=True,
                ca_certs=certifi.where(),
                timeout=30,
                retry_on_timeout=True,
                serializer=serializer.serializer,
            )
            number_of_replicas = request.registry.get("elasticsearch.replicas", 0)
            refresh_interval = request.registry.get("elasticsearch.interval", "1s")

            # We use a randomly named index so that we can do a zero downtime reindex.
            # Essentially we'll use a randomly named index which we will use until all
            # of the data has been reindexed, at which point we'll point an alias at
            # our randomly named index, and then delete the old randomly named index.

            # Create the new index and associate all of our doc types with it.
            index_base = request.registry["elasticsearch.index"]
            random_token = binascii.hexlify(os.urandom(5)).decode("ascii")
            new_index_name = "{}-{}".format(index_base, random_token)
            doc_types = request.registry.get("search.doc_types", set())
            shards = request.registry.get("elasticsearch.shards", 1)

            # Create the new index with zero replicas and index refreshes disabled
            # while we are bulk indexing.
            new_index = get_index(
                new_index_name,
                doc_types,
                using=client,
                shards=shards,
                replicas=0,
                interval="-1",
            )
            new_index.create(wait_for_active_shards=shards)

            # From this point on, if any error occurs, we want to be able to delete our
            # in progress index.
            try:
                request.db.execute("SET statement_timeout = '600s'")

                for _ in parallel_bulk(
                    client, _project_docs(request.db), index=new_index_name
                ):
                    pass
            except:  # noqa
                new_index.delete()
                raise
            finally:
                request.db.rollback()
                request.db.close()

            # Now that we've finished indexing all of our data we can update the
            # replicas and refresh intervals.
            client.indices.put_settings(
                index=new_index_name,
                body={
                    "index": {
                        "number_of_replicas": number_of_replicas,
                        "refresh_interval": refresh_interval,
                    }
                },
            )

            # Point the alias at our new randomly named index and delete the old index.
            if client.indices.exists_alias(name=index_base):
                to_delete = set()
                actions = []
                for name in client.indices.get_alias(name=index_base):
                    to_delete.add(name)
                    actions.append({"remove": {"index": name, "alias": index_base}})
                actions.append({"add": {"index": new_index_name, "alias": index_base}})
                client.indices.update_aliases({"actions": actions})
                client.indices.delete(",".join(to_delete))
            else:
                client.indices.put_alias(name=index_base, index=new_index_name)
    except redis.exceptions.LockError as exc:
        raise self.retry(countdown=60, exc=exc)
Example #52
    def es_bulk_indexing_of_model(self, model, force_reindexing=False):
        """Perform a bulk action on documents of a given model. Use the ``objects_per_batch`` property to index.

        See http://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.bulk
        and http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.parallel_bulk

        .. attention::
            + Currently only implemented with "index" and "update" !
            + Currently only working with ``AbstractESDjangoIndexable``.

        :param model: the model to index
        :type model: class
        :param force_reindexing: force all documents to be returned
        :type force_reindexing: bool
        :return: the number of documents indexed
        :rtype: int
        """

        if not self.connected_to_es:
            return

        if not self.index_exists:
            raise NeedIndex()

        # better safe than sorry
        if model.__name__ == 'FakeChapter':
            self.logger.warn('Cannot index FakeChapter model. Please index its parent model.')
            return 0

        documents_formatter = partial(es_document_mapper, force_reindexing, self.index)
        objects_per_batch = getattr(model, 'objects_per_batch', 100)
        indexed_counter = 0
        if model.__name__ == 'PublishedContent':
            generate = model.get_es_indexable(force_reindexing)
            while True:
                with transaction.atomic():
                    try:
                        # fetch a batch
                        objects = next(generate)
                    except StopIteration:
                        break
                    if not objects:
                        break
                    if hasattr(objects[0], 'parent_model'):
                        model_to_update = objects[0].parent_model
                        pks = [o.parent_id for o in objects]
                    else:
                        model_to_update = model
                        pks = [o.pk for o in objects]

                    formatted_documents = list(map(documents_formatter, objects))

                    for _, hit in parallel_bulk(
                        self.es,
                        formatted_documents,
                        chunk_size=objects_per_batch,
                        request_timeout=30
                    ):
                        action = list(hit.keys())[0]
                        self.logger.info('{} {} with id {}'.format(action, hit[action]['_type'], hit[action]['_id']))

                    # mark all these objects as indexed at once
                    model_to_update.objects.filter(pk__in=pks) \
                                           .update(es_already_indexed=True, es_flagged=False)
                    indexed_counter += len(objects)
            return indexed_counter
        else:
            then = time.time()
            prev_obj_per_sec = False
            last_pk = 0
            object_source = model.get_es_indexable(force_reindexing)

            while True:
                with transaction.atomic():
                    # fetch a batch
                    objects = list(object_source.filter(pk__gt=last_pk)[:objects_per_batch])
                    if not objects:
                        break

                    formatted_documents = list(map(documents_formatter, objects))

                    for _, hit in parallel_bulk(
                        self.es,
                        formatted_documents,
                        chunk_size=objects_per_batch,
                        request_timeout=30
                    ):
                        if self.logger.getEffectiveLevel() <= logging.INFO:
                            action = list(hit.keys())[0]
                            self.logger.info('{} {} with id {}'.format(
                                action, hit[action]['_type'], hit[action]['_id']))

                    # mark all these objects as indexed at once
                    model.objects.filter(pk__in=[o.pk for o in objects]) \
                                 .update(es_already_indexed=True, es_flagged=False)
                    indexed_counter += len(objects)

                    # basic estimation of indexed objects per second
                    now = time.time()
                    last_batch_duration = int(now - then) or 1
                    then = now
                    obj_per_sec = round(float(objects_per_batch) / last_batch_duration, 2)
                    if force_reindexing:
                        print('    {} so far ({} obj/s, batch size: {})'.format(
                              indexed_counter, obj_per_sec, objects_per_batch))

                    if prev_obj_per_sec is False:
                        prev_obj_per_sec = obj_per_sec
                    else:
                        ratio = obj_per_sec / prev_obj_per_sec
                        # if we processed this batch 10% slower/faster than the previous one,
                        # shrink/increase batch size
                        if abs(1 - ratio) > 0.1:
                            objects_per_batch = int(objects_per_batch * ratio)
                            if force_reindexing:
                                print('     {}x, new batch size: {}'.format(round(ratio, 2), objects_per_batch))
                        prev_obj_per_sec = obj_per_sec

                    # fetch next batch
                    last_pk = objects[-1].pk

            return indexed_counter
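
For reference, parallel_bulk lazily yields one (ok, info) tuple per document, and the loops above unpack it as "_, hit", ignoring the success flag. A minimal, self-contained sketch of consuming it while also counting failures could look like the following; the client, index name, mapping type and document generator are illustrative assumptions, not part of the method above.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

# Hypothetical client and documents, for illustration only.
es = Elasticsearch()
documents = (
    {'_op_type': 'index', '_index': 'demo', '_type': 'doc', '_id': i, '_source': {'value': i}}
    for i in range(1000)
)

failed = 0
for ok, info in parallel_bulk(es, documents, chunk_size=100, request_timeout=30):
    action = list(info.keys())[0]  # 'index', 'update', ...
    if not ok:
        failed += 1
        print('failed to {} document {}'.format(action, info[action].get('_id')))
print('{} documents failed'.format(failed))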
Ejemplo n.º 53
0
    def forceBulk(self):
        if len(self.tempData) > 0:
            print 'UPLOADING ############# The SIZE IS:  ' + str(sys.getsizeof(self.tempData))
            for success, info in helpers.parallel_bulk(self.es, self.tempData, thread_count=4):
                if not success:
                    print('Doc failed', info)
            self.tempData = []
def lambda_handler(event, context):

    host = 'your_elasticsearchservice_endpoint'
    region = "your_elasticsearchservice_cluster_region"
    awsauth = AWS4Auth(os.environ['AWS_ACCESS_KEY_ID'], os.environ['AWS_SECRET_ACCESS_KEY'], region, 'es', session_token=os.environ['AWS_SESSION_TOKEN'])
    
    mappings={"mappings" : {
                 "cloudtrail":{
             		"properties" : {
             			"userIdentity" : {
                    			"type":"object",
                    			"properties":{
                            		"arn" : { "type" : "string", "index" : "not_analyzed" },
                            		"accountId": { "type" : "string", "index" : "not_analyzed" },
                            		"invokedBy": { "type" : "string", "index" : "not_analyzed" },
                            		"userName": { "type" : "string", "index" : "not_analyzed" }
                            	}
                        	},
                    	"eventSource": { "type" : "string", "index" : "not_analyzed" },
                    	"awsRegion": { "type" : "string", "index" : "not_analyzed" },
                    	"userAgent": { "type" : "string", "index" : "not_analyzed" }
                    	
                    	},
                    "dynamic_templates":[
                    	{"resourceIdentifiers":{
                    		"match":"*Id",
                    		"match_mapping_type":"string",
                    		"mapping":{
                    			"type":"string",
                    			"index":"not_analyzed"
                    			}
                    		}
                    	},
                    	{"resourceIdentifiersLower":{
                    		"match":"*id",
                    		"match_mapping_type":"string",
                    		"mapping":{
                    			"type":"string",
                    			"index":"not_analyzed"
                    			}
                    		}
                    	},
                    	{"resourceIdentifiersUpper":{
                    		"match":"*ID",
                    		"match_mapping_type":"string",
                    		"mapping":{
                    			"type":"string",
                    			"index":"not_analyzed"
                    			}
                    		}
                    	}
                    ]
            	}
       	}
     }

    index = 'cloudtrail-' + datetime.strftime(datetime.utcnow(),'%Y-%m-%d')

    es = Elasticsearch(
        hosts=[{'host': host, 'port': 443}],
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection
    )
    
    if not es.indices.exists(index):
        es.indices.create(index=index,body=mappings)

    s3 = boto3.client('s3')

    bucket_name=event['Records'][0]['s3']['bucket']['name']
    key_name=event['Records'][0]['s3']['object']['key']

    if not "CloudTrail-Digest" in key_name:
        obj = s3.get_object(Bucket=bucket_name,Key=key_name)
        compressed = StringIO(obj['Body'].read())
        decompressed = gzip.GzipFile(fileobj=compressed,mode='rb')
        raw_event = json.loads(decompressed.read())

        bulk_json = []

        for record in raw_event['Records']:
            entry={"_op_type":"index","_index":index,"_type":"cloudtrail","_id":record['eventID'],"_source":record}
            bulk_json.append(entry)

        for success,info in helpers.parallel_bulk(es,bulk_json,thread_count=4):
            if not success:
                print('Failed to index document: ',info)
    else:
        print("File is a digest. Skipping ",key_name)
Ejemplo n.º 55
0
   sniff_on_connection_fail=True,
   sniffer_timeout=60
   )

def dump_json(dir):
    s = {}
    for dir2 in os.listdir(dir):
        print dir2
        if dir2 !="_temporary":
            for file in os.listdir(os.path.join(dir,dir2)):
                file_name = os.path.join(dir,dir2,file)
                print file_name
                f = open(file_name, 'r')
                for line in f:
                    line = str(line.rstrip())
                    # print s.format(line)
                    # if ("""[{"target": "a phone number"}]""" in line) or ("""[{"target": "an email"}]""" in line):
                    #     print line
                    # else:
                    s = {'_op_type': 'create', '_type': 'transaction', '_source': line}
                    yield s
                    # print line
#dump_json(args.directory)
# parallel_bulk() is a lazy generator of (ok, info) tuples (unlike bulk(), which returns
# a (success_count, errors) tuple), so it must be iterated for any request to be sent.
# The actions yielded by dump_json() carry no '_index', so index='venmo2018' is passed
# through to the bulk API as the default index.
success = 0
for ok, info in parallel_bulk(es, dump_json(args.directory), index='venmo2018',
                              thread_count=2, raise_on_error=False):
    if ok:
        success += 1
    else:
        # with raise_on_error=False, failures are reported here instead of raising
        print('Doc failed: %s' % info)
print('Performed %d actions' % success)
#count = 0
#for item in dump_json(args.directory):
#    print count
#    print item
#    count += 1
Ejemplo n.º 56
0
    reader = csv.DictReader( csvfile )

    titleLookup = {}

    for movie in reader:
            titleLookup[movie['movieId']] = movie['title']

    return titleLookup

def readTags():
    csvfile = open('ml-latest-small/tags.csv', 'r')

    titleLookup = readMovies()

    reader = csv.DictReader( csvfile )
    for line in reader:
        tag = {}
        tag['user_id'] = int(line['userId'])
        tag['movie_id'] = int(line['movieId'])
        tag['title'] = titleLookup[line['movieId']]
        tag['tag'] = line['tag']
        tag['timestamp'] = int(line['timestamp'])
        yield tag


es = elasticsearch.Elasticsearch()

es.indices.delete(index="tags",ignore=404)
deque(helpers.parallel_bulk(es,readTags(),index="tags",doc_type="tag"), maxlen=0)
es.indices.refresh()
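
The deque(..., maxlen=0) call above is simply a memory-free way to drain the parallel_bulk generator so the requests are actually sent. An equivalent explicit loop that also reports failed actions might look like this sketch (same es client, index and readTags() generator as above; the error message is an assumption):

# Equivalent to deque(..., maxlen=0), but surfaces failed actions.
for ok, info in helpers.parallel_bulk(es, readTags(), index="tags", doc_type="tag"):
    if not ok:
        print("Failed to index tag: %s" % (info,))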
Ejemplo n.º 57
0
                'title': {
                    'type': 'string',
                    'analyzer': 'spanish',
                },
                'narr': {
                    'type': 'string',
                    'analyzer': 'spanish',
                },
            }
        }
    }
})


def actions(docs):
    for doc in docs:
        body = {child.tag: child.text for child in doc.iterchildren()}
        yield {
            '_op_type': 'create',
            '_index': DB_INDEX,
            '_type': DOC_TYPE,
            '_id': body['docid'],
            '_source': body,
        }

for filename in sorted(os.listdir('efe95')):
    with open('efe95/{}'.format(filename), encoding='iso-8859-1') as data:
        docs = lxml.html.fragments_fromstring(data.read())
    for success, info in es_helpers.parallel_bulk(es, actions(docs)):
        pass
Ejemplo n.º 58
0
    def test_all_chunks_sent(self, _process_bulk_chunk):
        actions = ({'x': i} for i in range(100))
        list(helpers.parallel_bulk(Elasticsearch(), actions, chunk_size=2))

        self.assertEquals(50, _process_bulk_chunk.call_count)
Ejemplo n.º 59
0
                user["all_years"] = list(set(user["all_years"]))
                yield user
                num_users += 1
                if num_users % 10000 == 0:
                    print("Indexed %s users" % (num_users))
                user.clear()
                user["userId"] = int(row["userId"])
                user["liked"] = []
                user["disliked"] = []
                user["indifferent"] = []
                user["all_rated"] = []
                user["all_years"] = []
                user["liked_years"] = []
            user["all_rated"].append(title)
            if "year" in movies[row["movieId"]]:
                user["all_years"].append(movies[row["movieId"]]["year"])
                if rating >= 4.0:
                    user["liked_years"].append(movies[row["movieId"]]["year"])
            user["liked"].append(title) if rating >= 4.0 else (
            user["indifferent"].append(title) if rating > 2.0 else user["disliked"].append(title))
        yield user

index = "movie_lens_users"
doc_type = "user"
es.indices.delete(index=index, ignore=404)
es.indices.create(index=index, body=open(mapping_file,"r").read(), ignore=404)
print("Indexing users...")
deque(helpers.parallel_bulk(es, read_users(ratings_file, read_movies(movies_file)), index = index, doc_type = doc_type), maxlen = 0)
print("Indexing Complete")
es.indices.refresh()
    # print json.dumps(mapping)
    # if put error, see
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-put-mapping.html#merging-conflicts
    try:
        client.indices.put_mapping(
            index=db, doc_type=table_name, body=json.dumps(mapping), update_all_types=True)
    except Exception, e:
        print(
            "put mapping failed, maybe error u'illegal_argument_exception', u'mapper [xxxx] cannot be changed from type [long] to [string]'")

    try:
        # get bulk actions
        actions = get_actions(cursor=cursor, table_name=table_name, index=db)

        failed_count = 0
        responses = helpers.parallel_bulk(
            client=client, actions=actions, thread_count=thread_count, chunk_size=chunk_size)
        for success, msg in responses:
            status = msg['create']['status']
            if not success or status != 201:
                failed_count += 1
        if failed_count > 0:
            print("----------------> failed records %i" % failed_count)
        else:
            print("migrate success")
        migrate_fail_count += failed_count
    except ConnectionError, e:
        print e
        sys.exit(1)
    except Exception, e:
        print "migrate table: %s error" % table_name, type(e)
        # print e