def main():

    es = connections.get_connection()

    dry = '--dry' in sys.argv
    if not dry:
        utils.add_file_logger(logger, __file__)
    preprints = Preprint.objects.filter(primary_file__isnull=False).select_related('primary_file', 'provider')
    total_preprints = preprints.count()
    logger.info('Collecting data on {} preprints...'.format(total_preprints))

    batch_to_update = []
    for i, preprint in enumerate(preprints, 1):
        preprint_id = preprint._id
        provider_id = preprint.provider._id
        file_id = preprint.primary_file._id
        page_counters = (
            PageCounter.objects
            .filter(
                _id__startswith='download:{preprint_id}:{file_id}:'.format(
                    preprint_id=preprint_id,
                    file_id=file_id
                )
            ).values_list('_id', 'date')
        )
        for page_counter in page_counters:
            page_counter__id, date_dict = page_counter
            version_num = page_counter__id.split(':')[-1]
            for date, totals in date_dict.items():
                timestamp = datetime.datetime.strptime(date, '%Y/%m/%d').replace(tzinfo=pytz.utc)
                batch_to_update.append({
                    '_index': 'osf_preprintdownload_{}'.format(timestamp.strftime(settings.ELASTICSEARCH_METRICS_DATE_FORMAT)),
                    '_source': {
                        'count': totals['total'],
                        'path': '/{}'.format(file_id),
                        'preprint_id': preprint_id,
                        'provider_id': provider_id,
                        'timestamp': timestamp,
                        'user_id': None,  # Pagecounter never tracked this
                        'version': int(version_num) + 1
                    },
                    '_type': 'doc'
                })

                if len(batch_to_update) >= MAX_BATCH_SIZE:
                    logger.info('Bulk-indexing data from {} PageCounter records'.format(len(batch_to_update)))
                    if not dry:
                        bulk(es, batch_to_update, max_retries=3, chunk_size=CHUNK_SIZE, request_timeout=REQUEST_TIMEOUT)
                    batch_to_update = []
                    # Allow elasticsearch to catch up
                    print('{}/{} preprints completed ({:.2f}%)'.format(i, total_preprints, i / total_preprints * 100))
                    sleep(THROTTLE_PERIOD)

    # Index final batch
    if len(batch_to_update):
        logger.info('Bulk-indexing data from {} PageCounter records'.format(len(batch_to_update)))
        if not dry:
            bulk(es, batch_to_update, max_retries=3, chunk_size=CHUNK_SIZE, request_timeout=REQUEST_TIMEOUT)

    logger.info('This will migrate {} Pagecounter entries to Elasticsearch'.format(len(batch_to_update)))
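The script above also depends on module-level imports and tuning constants that the snippet does not show (the Preprint, PageCounter, and utils imports come from the host application). A minimal sketch of that setup, with all values being assumptions:

# Hypothetical module-level setup assumed by the script above; values are illustrative only.
import datetime
import logging
import sys
from time import sleep

import pytz
from django.conf import settings
from elasticsearch.helpers import bulk
from elasticsearch_dsl.connections import connections

logger = logging.getLogger(__name__)

MAX_BATCH_SIZE = 10000   # flush the action buffer once it grows to this many actions
CHUNK_SIZE = 1000        # actions per request sent by helpers.bulk
REQUEST_TIMEOUT = 60     # seconds before a single bulk request times out
THROTTLE_PERIOD = 1      # seconds to sleep between batches so Elasticsearch can catch up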
Example #2
    def insert(self):
        global config
        sys.stderr.flush()

        js_arr = []
        i = 0
        for entry in self.json:
            js = entry
            mid = js['mid']
            if self.dtype == 'mbox_source':
                del js['mid']
            js_arr.append({
                '_op_type': 'index',
                '_consistency': self.wc,
                '_index': dbname,
                '_type': self.dtype,
                '_id': mid,
                'doc': js,
                '_source': js
            })
        if not args.dry:
            try:
                helpers.bulk(self.xes, js_arr)
            except Exception as err:
                print("%s: Warning: Could not bulk insert: %s into %s" % (self.id,err,self.dtype))
Example #3
File: esindex.py Project: 01-/luigi
    def run(self):
        """
        Run task, namely:

        * purge existing index, if requested (`purge_existing_index`),
        * create the index, if missing,
        * apply mappings, if given,
        * set refresh interval to -1 (disable) for performance reasons,
        * bulk index in batches of size `chunk_size` (2000),
        * set refresh interval to 1s,
        * refresh Elasticsearch,
        * create entry in marker index.
        """
        if self.purge_existing_index:
            self.delete_index()
        self.create_index()
        es = self._init_connection()
        if self.mapping:
            es.indices.put_mapping(index=self.index, doc_type=self.doc_type,
                                   body=self.mapping)
        es.indices.put_settings({"index": {"refresh_interval": "-1"}},
                                index=self.index)

        bulk(es, self._docs(), chunk_size=self.chunk_size,
             raise_on_error=self.raise_on_error)

        es.indices.put_settings({"index": {"refresh_interval": "1s"}},
                                index=self.index)
        es.indices.refresh()
        self.output().touch()
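The bulk call above consumes self._docs(); in luigi's CopyToIndex pattern a concrete task usually supplies the documents by overriding docs(). A hedged sketch of such a subclass (task name, index name and documents are assumptions):

# Hypothetical subclass of luigi.contrib.esindex.CopyToIndex; only docs() is required here.
from luigi.contrib.esindex import CopyToIndex

class IndexUsers(CopyToIndex):
    host = 'localhost'
    port = 9200
    index = 'users'
    doc_type = 'user'
    purge_existing_index = True

    def docs(self):
        # each yielded dict becomes one document in the bulk request
        yield {'_id': 1, 'name': 'alice'}
        yield {'_id': 2, 'name': 'bob'}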
def sync_events_elasticsearch():
    # Sync update and inserts
    index_count = redis_store.scard('event_index')
    index_event_data = ({'_type': 'event',
                         '_index': 'events',
                         '_id': event_[0],
                         'name': event_[1],
                         'description': event_[2] or None,
                         'searchable_location_name': event_[3] or None,
                         'organizer_name': event_[4] or None,
                         'organizer_description': event_[5] or None}
                        for event_ in EventIterator(index_count, 'event_index'))
    try:
        helpers.bulk(es_store, index_event_data)
    except Exception as e:
        print(e)

    # sync both soft and hard deletes
    del_count = redis_store.scard('event_delete')
    del_event_data = ({'_type': 'event',
                       '_index': 'events',
                       '_id': event_[0],
                       'name': event_[1],
                       'description': event_[2] or None,
                       'searchable_location_name': event_[3] or None,
                       'organizer_name': event_[4] or None,
                       'organizer_description': event_[5] or None}
                      for event_ in EventIterator(del_count, 'event_delete'))
    try:
        helpers.bulk(es_store, del_event_data)
    except Exception as e:
        print(e)
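EventIterator is not shown in this snippet; a minimal sketch of what it might look like, assuming it pops up to count JSON-encoded event tuples from the given redis set via the same redis_store client:

# Hypothetical EventIterator: yields event tuples popped from a redis set.
import json

class EventIterator:
    def __init__(self, count, redis_key):
        self.count = count
        self.redis_key = redis_key

    def __iter__(self):
        for _ in range(self.count):
            raw = redis_store.spop(self.redis_key)
            if raw is None:
                break
            # each member is assumed to be a JSON-encoded list:
            # [id, name, description, location_name, organizer_name, organizer_description]
            yield json.loads(raw)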
Example #5
def cli(host, index_name, doc_type, import_file, mapping_file,
        id_field_idx, delete_index, quiet):
    """
    Bulk import a delimited file into a target Elasticsearch instance.
    Common delimited files include things like CSV.

    Load a CSV file:
    data2es --index-name myindex --doc-type mydoc --import-file test.csv
    """

    echo('Using host: %s' % host, quiet)
    es = Elasticsearch(hosts=[host])

    if es.indices.exists(index_name):
        echo('Index %s already exists' % index_name, False)
        if delete_index:
            es.indices.delete(index=index_name)
            echo('Deleted: %s' % index_name, quiet)
            es.indices.create(index=index_name)
            echo('Created new index: %s' % index_name, quiet)
    else:
        es.indices.create(index=index_name)
        echo('Created new index: %s' % index_name, quiet)

    echo('Using document type: %s' % doc_type, quiet)
    if mapping_file:
        echo('Applying mapping from: %s' % mapping_file, quiet)
        with open(mapping_file) as f:
            mapping = json.loads(f.read())
        es.indices.put_mapping(doc_type, mapping, [index_name,])

    action_g = docs_from_file(import_file, index_name, doc_type,
                              id_field_idx, quiet)
    helpers.bulk(es, action_g())
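docs_from_file is referenced but not shown; a plausible sketch, assuming it returns a callable that yields one index action per row of the delimited file:

# Hypothetical docs_from_file: builds a generator of bulk index actions from a CSV file.
import csv

def docs_from_file(import_file, index_name, doc_type, id_field_idx, quiet):
    def actions():
        with open(import_file, newline='') as f:
            for row in csv.DictReader(f):
                action = {
                    '_index': index_name,
                    '_type': doc_type,
                    '_source': row,
                }
                if id_field_idx is not None:
                    # use the value of the configured column as the document _id
                    action['_id'] = list(row.values())[int(id_field_idx)]
                yield action
    return actions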
Example #6
def capture(pcap_files, node, chunk, trace):
    try:
        es = None
        if node != None:
            es = Elasticsearch(node)

        print "Loading packet capture file(s)"
        for pcap_file in pcap_files:
            print pcap_file
            stats = os.stat(pcap_file)
            file_date_utc = datetime.utcfromtimestamp(stats.st_ctime)
            capture = pyshark.FileCapture(pcap_file)

            # If no Elasticsearch node specified, dump to stdout
            if node == None:
                dump_packets(capture, file_date_utc)
            else:
                helpers.bulk(
                    es, index_packets(capture, pcap_file, file_date_utc), chunk_size=chunk, raise_on_error=False
                )

    except Exception as e:
        print "error: ", e
        if trace == True:
            traceback.print_exc(file=sys.stdout)
Example #7
def insert_headers(headers):
    elastic_host = "localhost"
    elastic_index = 'indextest'
    elastic_index_settings = {
      "settings": {
        "number_of_shards": 5,
        "number_of_replicas": 0,
      }
    }
  
    elastic_mapping = {
      "nntp": {
        "properties": {
          "Subject": { 
            "type": "string",
            "index": "not_analyzed"
          }
        }
      }
    }

    print "inserting " + str(len(headers)) + " into ES"  
  
    es = Elasticsearch([elastic_host], sniff_on_start=True)
    es.indices.create(index=elastic_index, body=elastic_index_settings, ignore=400)  
    es.indices.put_mapping(index=elastic_index, doc_type="nntp", body=elastic_mapping)
  
    if len(headers) > 0:
      helpers.bulk(es, headers)
  
    return True
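The headers list is handed straight to helpers.bulk, so each element is expected to already be a complete action. A hypothetical element, based on the mapping defined above:

# Hypothetical shape of one element of `headers` (an assumption, not part of the original code).
example_header = {
    "_index": "indextest",
    "_type": "nntp",
    "_source": {"Subject": "Re: example thread"},
}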
Example #8
 def commit(self):
     """Process list of dict (yes) and push them to DB """
     self.total_objs += len(self.nlist)
     count = 0
     full_body = ""
     items = []
     for evt_array in self.nlist:
         for entry in evt_array['events']:
             items.append({"index" : {}})
             entry['whitelisted'] = "false"
             entry['comments'] = "import:"+str(datetime.datetime.now())
             # go utf-8 ?
             for x in entry.keys():
                 if isinstance(entry[x], basestring):
                     entry[x] = unicode(entry[x], errors='replace')
             items.append(entry)
             count += 1
     mapfunc = partial(json.dumps, ensure_ascii=False)
     try:
         full_body = "\n".join(map(mapfunc,items)) + "\n"
     except:
         print "Unexpected error:", sys.exc_info()[0]
         print "Unable to json.dumps : "
         pprint.pprint(items)
     bulk(self.es, items, index=self.cfg["elastic"]["index"], doc_type="events", raise_on_error=True)
     self.total_commits += count
     logging.debug("Written "+str(self.total_commits)+" events")
     print "Written "+str(self.total_commits)+" events"
     del self.nlist[0:len(self.nlist)]
Example #9
def bulk_insert(es, records=[]):    
    from elasticsearch import helpers 
    if records == None or len(records) == 0:
        record = { "_index": "chunzhi", "_type": "study", "_id": "111", "_source": 
                   { "request": "request", "response": "response"}
                 }
        records = []
        records.append(record)
    
    records.clear()
    record=[]
    import time
    for page in search_data(es):
        for r in page:
            s=r["_source"]["started_at"]
            s = time.strftime('%Y-%m-%dT%H:%M:%S.123Z',time.localtime(1466561471))
            r["_source"]["started_at"] = s
            r["_source"]["started_year"] = s[0:4]
            r["_source"]["started_month"] =s[5:7]
            r["_source"]["started_day"] = s[8:10]
            r["_source"]["started_hour"] = s[11:13]
            r["_source"]["started_minute"] = s[14:16]
            r["_source"]["started_second"] = s[17:19]            
            record={"_index": "kongtest-2016.06", "_type":"log", "_source": r["_source"]}
            records.append(record)
        
    helpers.bulk(es, records)
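search_data(es) is not defined in this snippet; a hedged sketch that pages through a source index with the scroll API and yields one page of hits at a time (index name and page size are assumptions):

# Hypothetical search_data generator: yields pages of hits via the scroll API.
def search_data(es, index="chunzhi", page_size=1000):
    resp = es.search(index=index, body={"query": {"match_all": {}}},
                     scroll="2m", size=page_size)
    scroll_id = resp["_scroll_id"]
    hits = resp["hits"]["hits"]
    while hits:
        yield hits
        resp = es.scroll(scroll_id=scroll_id, scroll="2m")
        scroll_id = resp["_scroll_id"]
        hits = resp["hits"]["hits"]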
Example #10
def restoreIndex(hosts, fileName):
    es = Elasticsearch(hosts)
    fileName = fileName
    fd = open(fileName, 'r')
    data = (json.loads(line) for line in fd.readlines())
    helpers.bulk(es, data)
    return
Example #11
    def index_many_crashes(
        self, number, processed_crash=None, raw_crash=None, loop_field=None
    ):
        processed_crash = processed_crash or {}
        raw_crash = raw_crash or {}

        actions = []
        for i in range(number):
            crash_id = str(uuid.UUID(int=random.getrandbits(128)))

            if loop_field is not None:
                processed_copy = processed_crash.copy()
                processed_copy[loop_field] = processed_crash[loop_field] % i
            else:
                processed_copy = processed_crash

            doc = {
                'crash_id': crash_id,
                'processed_crash': processed_copy,
                'raw_crash': raw_crash,
            }
            action = {
                '_index': self.config.elasticsearch.elasticsearch_index,
                '_type': self.config.elasticsearch.elasticsearch_doctype,
                '_id': crash_id,
                '_source': doc,
            }
            actions.append(action)

        bulk(
            client=self.connection,
            actions=actions,
        )
        self.refresh_index()
    def flush(self):
        """ Flushes the buffer into ES
        :return: None
        """
        if self._timer is not None and self._timer.is_alive():
            self._timer.cancel()
        self._timer = None

        if self._buffer:
            try:
                with self._buffer_lock:
                    logs_buffer = self._buffer
                    self._buffer = []
                actions = (
                    {
                        '_index': self._index_name_func.__func__(self.es_index_name),
                        '_type': self.es_doc_type,
                        '_source': log_record
                    }
                    for log_record in logs_buffer
                )
                eshelpers.bulk(
                    client=self.__get_es_client(),
                    actions=actions,
                    stats_only=True
                )
            except Exception as exception:
                if self.raise_on_indexing_exceptions:
                    raise exception
Example #13
    def send_messages(self, message_batch):
        if not self._init_es():
            return
        start_time = time.monotonic()
        try:
            actions = []
            for msg in message_batch:
                message = json.loads(msg.decode("utf8"))
                timestamp = message.get("timestamp")
                if "__REALTIME_TIMESTAMP" in message:
                    timestamp = datetime.datetime.utcfromtimestamp(message["__REALTIME_TIMESTAMP"])
                else:
                    timestamp = datetime.datetime.utcnow()

                message["timestamp"] = timestamp
                index_name = "{}-{}".format(self.index_name, datetime.datetime.date(timestamp))
                if index_name not in self.indices:
                    self.create_index_and_mappings(index_name)

                actions.append({
                    "_index": index_name,
                    "_type": "journal_msg",
                    "_source": message,
                })
            if actions:
                helpers.bulk(self.es, actions)
                self.log.debug("Sent %d log events to ES, took: %.2fs",
                               len(message_batch), time.monotonic() - start_time)
        except Exception as ex:  # pylint: disable=broad-except
            self.log.warning("Problem sending logs to ES: %r", ex)
            return False
        return True
Example #14
    def add_bulk(self, obj_list):
        # Group all objects by their type
        type_set = {}
        for obj in obj_list:
            # Object must be a descendant of Indexed and be a django model
            if not self.object_can_be_indexed(obj):
                continue

            # Get object type
            obj_type = obj.indexed_get_content_type()

            # If type is currently not in set, add it
            if obj_type not in type_set:
                type_set[obj_type] = []

            # Add object to set
            type_set[obj_type].append(obj.indexed_build_document())

        # Loop through each type and bulk add them
        for type_name, type_objects in type_set.items():
            # Get list of actions
            actions = []
            for obj in type_objects:
                action = {
                    '_index': self.es_index,
                    '_type': type_name,
                    '_id': obj['id'],
                }
                action.update(obj)
                actions.append(action)

            bulk(self.es, actions)
    def es_insert(cls, mock=False):

        """
        Insert documents.

        Args:
            mock (bool): If true, generate mock data.
        """

        if not mock:
            actions = cls.es_stream_docs()

        else:
            actions = cls.es_stream_mock_docs()

        # Clear the index.
        cls.es_reset()

        # Batch-insert the documents.
        bulk(
            client=config.es,
            actions=actions,
            raise_on_exception=False,
            raise_on_error=False,
            doc_type=cls.es_index,
            index=cls.es_index
        )

        # Commit the index.
        config.es.indices.flush(cls.es_index)
Example #16
def delete_source(source_id):
    """Delete all documents from a particular source."""
    q = {'query': {'term': {'source_id': source_id}}, '_source': False}

    def deletes():
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_RECORD]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_parent': res.get('_parent'),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_DOCUMENT]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    try:
        bulk(get_es(), deletes(), stats_only=True, chunk_size=2000,
             request_timeout=60.0)
    except Exception:
        log.debug("Failed to clear documents: %r", source_id)
Example #17
def bulk_insert(es):
	fintput = open('/home/liangzx/parse_result');
	single_jsstr = '';
	actions = [];
	# format = '%Y-%m-%d %X';
	format = '%Y.%m.%d';
	current_time = time.strftime(format,time.localtime(time.time()))
	while 1:
		line = fintput.readline();
		if not line:
			break
		if line == '\n':
			es_object = json.loads(single_jsstr);
			action = {
				"_index":"php_tracer",
				"_type":es_object['doc_type'],
				"_source":es_object['content']
			}
			actions.append(action);
			single_jsstr = '';
		else:
			single_jsstr += line;
	helpers.bulk(es,actions)
	del actions[0:len(actions)]
	fintput.close();
	foutput = open('/home/liangzx/parse_result','w');
	foutput.write('');
	foutput.close();
Example #18
    def import_from_iterable(self, iterable, field_to_hash='text', batch_size=500):
        """Load data into Elasticsearch from iterable.

        iterable: generally a list of dicts, but possibly a list of strings
            This is your data.  Your dictionary structure defines the schema
            of the elasticsearch index.
        field_to_hash: string identifier of field to hash for content ID.  For
            list of dicts, a valid key value in the dictionary is required. For
            list of strings, a dictionary with one key, "text" is created and
            used.
        """
        if field_to_hash:
            self.hash_field = field_to_hash
            batch = []
            for item in iterable:
                if isinstance(item, basestring):
                    item = {field_to_hash: item}
                id = hash(item[field_to_hash])
                action = {'_op_type': 'update',
                          '_index': self.index,
                          '_type': self.doc_type,
                          '_id': id,
                          'doc': item,
                          'doc_as_upsert': "true",
                          }
                batch.append(action)
                if len(batch) >= batch_size:
                    helpers.bulk(client=self.instance, actions=batch, index=self.index)
                    batch = []
            if batch:
                helpers.bulk(client=self.instance, actions=batch, index=self.index)
            self.instance.indices.refresh(self.index)
        else:
            raise ValueError("A field_to_hash is required for import_from_iterable")
Example #19
    def handle(self, **options):
        self._initialize(**options)

        if (options['rebuild'] and
            not options['dry_run'] and
            self.es.indices.exists(self.INDEX_NAME)):

            self.es.indices.delete(index=self.INDEX_NAME)

        if (not options['dry_run'] and
            not self.es.indices.exists(self.INDEX_NAME)):

            self.es.indices.create(index=self.INDEX_NAME)

        if self.is_local_tm:
            self._set_latest_indexed_revision(**options)

        if isinstance(self.parser, FileParser):
            helpers.bulk(self.es, self._parse_translations(**options))
            return

        # If we are parsing from DB.
        tp_qs = TranslationProject.objects.all()

        if options['disabled_projects']:
            tp_qs = tp_qs.exclude(project__disabled=True)

        for tp in tp_qs:
            self.parser.tp_pk = tp.pk
            helpers.bulk(self.es, self._parse_translations(**options))
Example #20
 def _index_alias_multiple_indexes_bulk(self, documents=None, actions=None,
                                        versions=None):
     """A bulk operation failed by trying to access an alias that has
        multiple indexes. To remedy this we will need to iterate on all
        indexes within the alias and retry the bulk operation individually.
     """
     indexes = self.engine.indices.get_alias(index=self.alias_name)
     for index_name in indexes:
         try:
             if documents:
                 result = helpers.bulk(
                     client=self.engine,
                     index=index_name,
                     doc_type=self.document_type,
                     chunk_size=self.index_chunk_size,
                     actions=self._prepare_actions(documents,
                                                   versions))
             if actions:
                 result = helpers.bulk(
                     client=self.engine,
                     index=index_name,
                     doc_type=self.document_type,
                     chunk_size=self.index_chunk_size,
                     actions=actions)
             LOG.debug("Indexing result: %s", result)
         except Exception as e:
             # Log the error and continue to the next index.
             format_msg = {
                 'doc': self.document_type,
                 'msg': str(e)
             }
             LOG.error("Failed Indexing %(doc)s: %(msg)s" % format_msg)
def write_to_elasticsearch(conn, data, xen_index):
    conn.indices.create(index=xen_index, body=PS_REVIEWERS_MAPPING,
                        ignore=400)

    columns = data.columns.values.tolist()

    uniq_id = 0
    bulk_doc = []

    for row in data.itertuples():
        uniq_id = uniq_id + 1
        doc = to_dict(row, columns)

        header = {
            "_index": xen_index,
            "_type": "patchserie",
            "_id": uniq_id,
            "_source": doc
        }

        bulk_doc.append(header)
        if uniq_id % 5000 == 0:
            helpers.bulk(conn, bulk_doc)
            bulk_doc = []

    helpers.bulk(conn, bulk_doc)
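The to_dict helper used above is not shown; a plausible sketch (an assumption) that pairs each itertuples row with the DataFrame's column names:

# Hypothetical to_dict helper: row is a namedtuple from DataFrame.itertuples(),
# where row[0] is the index and the remaining positions line up with `columns`.
def to_dict(row, columns):
    return {column: value for column, value in zip(columns, row[1:])}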
Example #22
    def clear(self, schema=None, source_id=None):
        filter_ = {'bool': {'must': []}}
        if schema is not None:
            filter_['bool']['must'].append({
                'term': {'$schema': schema}
            })
        if source_id is not None:
            filter_['bool']['must'].append({
                'term': {'$sources': source_id}
            })
        q = {'filtered': {'query': {'match_all': {}}, 'filter': filter_}}
        q = {'query': q, 'fields': []}

        log.info('Deleting existing entries matching index criteria')

        def gen_deletes():
            for res in scan(self.config.elastic_client, query=q,
                            index=self.config.elastic_index):
                yield {
                    '_op_type': 'delete',
                    '_index': self.config.elastic_index,
                    '_type': res.get('_type'),
                    '_id': res.get('_id')
                }

        bulk(self.config.elastic_client, gen_deletes(),
             stats_only=True, chunk_size=self.chunk,
             request_timeout=60.0)
Example #23
    def execute(self):
        """
        Index data of specified queryset
        """
        start_time = time.time()

        for qs, progress in self.batch_qs():

            elapsed = time.time() - start_time

            total_left = (1 / (progress + 0.001)) * elapsed - elapsed

            progress_msg = \
                'PART: %s %.3f : duration: %.2f left: %.2f' % (
                    self.part, progress, elapsed, total_left
                )

            log.info(progress_msg)

            helpers.bulk(
                self.client,
                (self.convert(obj).to_dict(include_meta=True) for obj in qs),
                raise_on_error=True,
            )

        if settings.TESTING and self.index:
            idx = es.Index(self.index)
            # refresh index, make sure its ready for queries
            idx.refresh()
Example #24
 def do_bulk_index(self, bulk_reqs):
     try:
         helpers.bulk(self.es, bulk_reqs)
     except Exception as e:
         raise CuckooReportError(
             "Failed to save results in ElasticSearch for " "task #%d: %s" % (self.task["id"], e)
         )
def process_provider_into_es(fname, es, conn):
    status = False
    with open(fname, 'r') as infile:
        actions = []
        try:
            for doc in ijson.items(infile, "item"):
                if doc['type'] == 'INDIVIDUAL':
                    action = {
                        "_index": "data",
                        "_type": "provider",
                        "_source": doc
                        }
                else:
                    action = {
                        "_index": "data",
                        "_type": "facility",
                        "_source": doc
                    }
                actions.append(action)
                if len(actions) > 0 and len(actions) % 50 == 0:
                    helpers.bulk(es, actions)
                    status = True
                    actions = []
        except (KeyboardInterrupt, SystemExit):
            conn.rollback()
            raise
        except (UnicodeDecodeError, ValueError, ijson.JSONError) as ex:
            print "{0}\n".format(str(ex))
    return status
Example #26
def index_document(document_id):
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    es.index(index=es_index, doc_type=TYPE_DOCUMENT, body=data,
             id=document.id)
    clear_children(document)

    try:
        if document.type == Document.TYPE_TEXT:
            bulk(es, generate_pages(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)

        if document.type == Document.TYPE_TABULAR:
            bulk(es, generate_records(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
Example #27
def delete_source(source_id):
    q = {'query': {'term': {'source_id': source_id}}}

    def deletes():
            q['_source'] = ['id', 'document_id']
            for res in scan(es, query=q, index=es_index,
                            doc_type=[TYPE_RECORD]):
                yield {
                    '_op_type': 'delete',
                    '_index': es_index,
                    '_parent': res.get('_source', {}).get('document_id'),
                    '_type': res.get('_type'),
                    '_id': res.get('_id')
                }

            q['_source'] = ['id']
            for res in scan(es, query=q, index=es_index,
                            doc_type=[TYPE_DOCUMENT]):
                yield {
                    '_op_type': 'delete',
                    '_index': es_index,
                    '_type': res.get('_type'),
                    '_id': res.get('_id')
                }

    try:
        bulk(es, deletes(), stats_only=True, chunk_size=2000,
             request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
Example #28
def save_to_elasticsearch(pItemId, pItemName, ls_top_sim):
    bulk_data = []

    INDEX_NAME = "recsys_tfidf_result"
    es = elasticsearch.Elasticsearch([{"host": "10.220.83.22", "port": 9206}])

    # es.indices.delete(INDEX_NAME)
    # es.indices.create(INDEX_NAME)

    ls_rel_p = []
    for rel_prod in ls_top_sim:
        rel_prod_id = rel_prod[0]
        rel_prod_name = rel_prod[1]
        rel_score = rel_prod[2]
        rel_item = {}
        rel_item["similar_score"] = rel_score
        rel_item["similar_product_id"] = rel_prod_id
        rel_item["similar_product_name"] = rel_prod_name
        ls_rel_p.append(rel_item)
    op_dict = {
        "_type": "tfidf_result_type",
        "_index": INDEX_NAME,
        "_product_id": pItemId,
        "_product_name": pItemName,
        "_ls_similar_products": ls_rel_p,
        # "_rating": row[2]
    }
    bulk_data.append(op_dict)
    print bulk_data
    # bulk index the data
    print ("bulk indexing...")
    # res = es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
    helpers.bulk(es, bulk_data)
Example #29
    def bulk_add_data(self, _index, _doc_type, dict_data=None):
        """
        批量添加数据
        :param _index: 索引名
        :param _doc_type: 类型名
        :param dict_data: 数据列表
        :return:
        """

        if not dict_data:
            dict_data = []
        # j = 0
        actions = []
        for value in dict_data:
            action = {
                "_index": _index,
                "_type": _doc_type,
                # "_id": j + 1,
                "_source": {
                    "timestamp": datetime.now()}
            }
            action["_source"].update(value)
            actions.append(action)
            # j += 1


            if (len(actions) == 500000):
                helpers.bulk(self.es, actions)
                del actions[0:len(actions)]

        if (len(actions) > 0):
            helpers.bulk(self.es, actions)
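A brief usage sketch for the method above, assuming an instance of the surrounding class is available as client (index name, type and documents are illustrative):

# Hypothetical usage: every dict in dict_data becomes one document with a timestamp added.
client.bulk_add_data(
    "metrics-index",
    "doc",
    dict_data=[{"host": "web-1", "cpu": 0.42}, {"host": "web-2", "cpu": 0.17}],
)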
Example #30
def show_details():
    banking_list = driver.find_elements_by_xpath(
        '//table[@class="data yen_nyushutsukin_001"]/tbody/tr')

    actions = []

    for banking in banking_list:
        detail = banking.find_elements_by_tag_name('td')

        date = datetime.datetime.strptime(detail[0].text.replace('\n', ''),
                                          '%Y年%m月%d日')
        # Convert to JST
        date = date + datetime.timedelta(hours=-9)

        payment = in_or_out_payment(detail[1].text, detail[2].text)
        remark = detail[3].text

        if is_salary(remark, date.day):
            # Treat salary as the 1st of the following month
            date = (date + datetime.timedelta(days=30)).replace(day=1)

        doc = {'@timestamp': date, 'payment': payment, 'remark': remark}
        actions.append({'_index': 'mufg', '_type': 'mufg', '_source': doc})

    if len(actions) > 0:
        helpers.bulk(es, actions)

    try:
        driver.find_element_by_xpath('//a/img[@alt="新しい明細"]').click()
    except NoSuchElementException:
        return
    else:
        # No exception means there is a next page, so move on to it
        show_details()
Example #31
        actions.append({
            "_index": "pttpokemongo",
            "_op_type": "index",
            "_source": article
        })

    f.close()

    return actions


if __name__ == "__main__":
    es = Elasticsearch(hosts='localhost', port=9200)

    actions = load_data_and_convert_to_actions()

    # print(actions[0])

    print("original cases:", len(actions))

    success = 0

    for index in range(len(actions)):
        try:
            helpers.bulk(es, actions[index:index + 1])
            success += 1
        except Exception as e:
            print(str(e)[0:500])
            continue

    print("success cases:", success)
Example #32
                "zipcode": str(line['zipcode']),
                "geo": i18n['geo']
            })
        except:
            bulk_batch.append({
                "id": n,
                "district_code": int(line['district_code']),
                "amphoe_code": int(line['amphoe_code']),
                "province_code": int(line['province_code']),
                "district": {
                    "th": line['district']
                },
                "amphoe": {
                    "th": line['amphoe']
                },
                "province": {
                    "th": line['province']
                },
                "zipcode": str(line['zipcode'])
            })
        if (n + 1) % size == 0:
            print('Batch:', n + 1)
            try:
                helpers.bulk(es, genBulk(bulk_batch))
                bulk_batch = []
            except Exception as e:
                print(str(e))
    print('End:', n + 1)
    helpers.bulk(es, genBulk(bulk_batch))
    print('Total', len(data))
Example #33
def index():

	try:
		ES_CLIENT.indices.delete("news")
	except Exception as e:
		print(e)

	ES_CLIENT.indices.create("news", ES_MAPPINGS)

	items = []
	total_indexed, total_failed = 0, 0

	for folder in CLEANDIR.iterdir():

		if folder.name == '.gitignore': continue

		for i, file in enumerate(sorted(folder.iterdir())):

			if file.name == ".gitignore": continue

			print("Processing:", file.name)
			with open(file, "r") as _file:
				items.extend(json.loads(_file.read()))

			if i > 0 and i % CHUNKS[folder.name] == 0:

				print("Indexing", len(items))

				for item in items:
					if 'sentiment' not in item['_source']:
						print("Fault", file.name)

				indexed, failed = helpers.bulk(ES_CLIENT,
											   items,
											   stats_only=True,
											   raise_on_error=False)

				print("Indexed:", indexed)
				print("Failed:", failed)

				total_indexed += indexed
				total_failed += failed

				items = []

	print("Final Indexing", len(items))
	if len(items) != 0:

		indexed, failed = helpers.bulk(ES_CLIENT,
									   items,
									   stats_only=True,
									   raise_on_error=False)

		print("Final Indexed:", indexed)
		print("Final Failed:", failed)

		total_indexed += indexed
		total_failed += failed

	print("Total Indexed:", total_indexed)
	print("Total Failed:", total_failed)
                embeddings = model.encode(questions[start_idx:end_idx],
                                          show_progress_bar=False)
                bulk_data = []
                for qid, question, embedding in zip(
                        qids[start_idx:end_idx], questions[start_idx:end_idx],
                        embeddings):
                    bulk_data.append({
                        "_index": 'quora',
                        "_id": qid,
                        "_source": {
                            "question": question,
                            "question_vector": embedding
                        }
                    })

                helpers.bulk(es, bulk_data)
                pbar.update(chunk_size)

    except:
        print("During index an exception occured. Continue\n\n")

#Interactive search queries
while True:
    inp_question = input("Please enter a question: ")

    encode_start_time = time.time()
    question_embedding = model.encode(inp_question)
    encode_end_time = time.time()

    #Lexical search
    bm25 = es.search(index="quora",
Example #35
                #     print(fprecs[fp])

                docs = toESDocs(fprecs, fp_index)

                # for doc in docs:
                #     print(doc)

                if UPDATE_ELASTICSEARCH:
                    es.update(index=docindex,
                              doc_type="document",
                              id=docid,
                              body=encodedData,
                              _source=False,
                              refresh="false")

                    bulk(es, docs)

            except Exception as e:
                timestampPrint("ERROR: {}".format(e))
        else:
            bads[badcount] = j_content
            badcount += 1
        overallcount += 1

        # update average
        ipend = time.time()
        thistime = ipend - ipstart
        peripaverage = (
            (overallcount * peripaverage) + thistime) / (overallcount + 1)
        if overallcount % 5 == 0:
            print >> sys.stderr, "Reading fingerprints and rdns, did: " + str(overallcount) + \
Example #36
    print('Reading files ...')
    # Reads all the documents in a directory tree and generates an index operation for each
    ldocs = []
    for f in lfiles:
        if nfiles <= 0:
            break
        nfiles -= 1
        ftxt = codecs.open(f, "r", encoding='iso-8859-1')

        text = ''
        for line in ftxt:
            text += line
        # Insert operation for a document with fields' path' and 'text'
        ldocs.append({'_op_type': 'index', '_index': index, '_type': 'document', 'path': f, 'text': text})

    # Working with ElasticSearch
    client = Elasticsearch()
    try:
        # Drop index if it exists
        ind = Index(index, using=client)
        ind.delete()
    except NotFoundError:
        pass
    # then create it
    ind.settings(number_of_shards=1)
    ind.create()

    # Bulk execution of elasticsearch operations (faster than executing all one by one)
    print('Indexing ...')
    bulk(client, ldocs)
Example #37
def main():
    logFileName = '{}.log'.format(os.path.splitext(sys.argv[0])[0])
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[logging.FileHandler(logFileName),
                  logging.StreamHandler()])
    auth = get_config_params('config.ini')
    args = parse_args()
    view = "psra_{province}_uhs".format(**{'province': args.province.lower()})
    limit = 10000
    offset = 0

    # create index
    es = Elasticsearch([auth.get('es', 'es_endpoint')],
                       http_auth=(auth.get('es',
                                           'es_un'), auth.get('es', 'es_pw')))
    if es.indices.exists(view):
        es.indices.delete(view)

    # id_field = 'AssetID'
    settings = {
        'settings': {
            'number_of_shards': 1,
            'number_of_replicas': 0
        },
        'mappings': {
            'properties': {
                'coordinates': {
                    'type': 'geo_point'
                },
                'geometry': {
                    'type': 'geo_shape'
                }
            }
        }
    }
    es.indices.create(index=view, body=settings, request_timeout=90)

    while True:
        sqlquerystring = 'SELECT *, ST_AsGeoJSON(geom) \
            FROM results_psra_{province}.{view} \
            ORDER BY {view}."geom" \
            LIMIT {limit} \
            OFFSET {offset}'.format(
            **{
                'province': args.province.lower(),
                'view': view,
                'limit': limit,
                'offset': offset
            })
        offset += limit
        connection = None
        try:
            # Connect to the PostGIS database
            connection = psycopg2.connect(
                user=auth.get('rds', 'postgres_un'),
                password=auth.get('rds', 'postgres_pw'),
                host=auth.get('rds', 'postgres_host'),
                port=auth.get('rds', 'postgres_port'),
                database=auth.get('rds', 'postgres_db'))
            # Query the entire view with the geometries in geojson format
            cur = connection.cursor()
            cur.execute(sqlquerystring)
            rows = cur.fetchall()
            if rows:
                columns = [name[0] for name in cur.description]
                geomIndex = columns.index('st_asgeojson')
                feature_collection = {
                    'type': 'FeatureCollection',
                    'features': []
                }
                # Format table into a geojson format for ES/Kibana consumption
                for row in rows:
                    coordinates = json.loads(row[geomIndex])['coordinates']
                    feature = {
                        'type': 'Feature',
                        'geometry': json.loads(row[geomIndex]),
                        'coordinates': coordinates,
                        'properties': {},
                    }
                    for index, column in enumerate(columns):
                        if column != "st_asgeojson":
                            value = row[index]
                            feature['properties'][column] = value

                    feature_collection['features'].append(feature)
                geojsonobject = json.dumps(feature_collection,
                                           indent=2,
                                           default=decimal_default)
                d = json.loads(geojsonobject)
                helpers.bulk(es, gendata(d, view), raise_on_error=False)

            else:
                if (connection):
                    connection.close()
                return

        except (Exception, psycopg2.Error) as error:
            logging.error(error)
def bulk_indexing():
    BlogPostIndex.init()
    es = Elasticsearch()
    bulk(client=es,
         actions=(b.indexing()
                  for b in models.BlogPost.objects.all().iterator()))
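BlogPostIndex and the model's indexing() method are not shown; a hedged sketch in the elasticsearch-dsl style (class, field and index names are assumptions, and the exact base class differs between elasticsearch-dsl versions):

# Hypothetical elasticsearch-dsl document class behind BlogPostIndex.init().
from elasticsearch_dsl import Date, Document, Text

class BlogPostIndex(Document):
    title = Text()
    body = Text()
    published = Date()

    class Index:
        name = 'blogpost-index'

# On the Django model, indexing() would typically build one bulk action per post:
#
#     def indexing(self):
#         doc = BlogPostIndex(meta={'id': self.pk}, title=self.title,
#                             body=self.body, published=self.published)
#         return doc.to_dict(include_meta=True)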
Example #39
from elasticsearch import helpers, Elasticsearch
from celery import group
from time import sleep
import sys
import json

import bipolar  # task module providing sqli_check

sqli_file = sys.argv[1]

urls = []
with open(sqli_file) as f:
    url_data = f.readlines()
    for data in url_data:
        urls.append(data.strip())

my_group = group([bipolar.sqli_check.s(url) for url in urls])
group_results = my_group.apply_async()
print(group_results)
while not group_results.ready():
    print('waiting for jobs to complete')
    sleep(10)
group_results = group_results.get()

output = []
for results in group_results:
    if results is not None:
        for i in results:
            output.append(json.dumps(i))

print(output)

es = Elasticsearch(timeout=999999)
helpers.bulk(es, output, index='fnsqli', doc_type="doc")
 def _bulk_operate_realize(self, to_be_done_list):
     helpers.bulk(self.middleware_opration_python_instance, to_be_done_list)
        # end of event group
        if thisEntityId != lastEntityId:
            if events:
                yield get_action(events)
            events = []
            lastEntityId = thisEntityId
        events.append(doc["_source"])
        numDocsProcessed += 1
        if numDocsProcessed % 10000 == 0:
            elapsedSecs = int(time.time() - start)
            dps = numDocsProcessed / max(1, elapsedSecs)
            print numDocsProcessed, "docs per second=", dps

    # load last event group too
    if events:
        yield get_action(events)

    print "Processed", numDocsProcessed, "docs"


start = time.time()
helpers.bulk(es,
             generate_actions(),
             index=args.entityIndexName,
             doc_type=args.entityDocType,
             chunk_size=args.actionsPerBulk)

elapsed = (time.time() - start)
print "elapsed time=", elapsed
             "_type": "doc",
             "_source": {
                 "id": lst_line[0],
                 "judge_name": lst_line[1],
                 "judge_status": lst_line[2],
                 "person_cnt": lst_line[3],
                 "company_cnt": lst_line[4],
                 "court": lst_line[5],
                 "courtlevel": lst_line[6],
                 "docid": lst_line[7],
                 "casereason": lst_line[8],
                 "casetype": lst_line[9],
                 "doctype": lst_line[10],
                 "trialprocedure": lst_line[11],
                 "judgeyear": lst_line[12],
                 "judgemonth": lst_line[13],
                 "judgetime": lst_line[14],
                 "lawyers": lst_line[15],
                 "partners": lst_line[16],
                 "persons": lst_line[17],
                 "companies": lst_line[18]
             }
         } for lst_line in lst)
         try:
             helpers.bulk(es, action)
         except:
              print('Something went wrong')
          print('Uploaded {} judge records'.format(str(flag)))
         lst = []
 file_open.close()
 print('Finished uploading {}'.format(name))
Example #43
 def bulk(self, actions, **kwargs):
     return bulk(client=self._get_connection(), actions=actions, **kwargs)
Example #44
from elasticsearch import Elasticsearch, helpers
import sys, json

es = Elasticsearch()


def load_json(filename):
    if filename.endswith('.json'):
        with open(filename, 'r') as open_file:
            yield json.load(open_file)


helpers.bulk(es, load_json(sys.argv[1]), index='sfn-tag-details')
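As written, load_json yields the whole parsed file as a single action, so it only suits files containing one JSON object. A hedged variant that also handles a top-level JSON array, yielding one action per element:

# Hypothetical variant of load_json that accepts either a single object or an array.
def load_json_items(filename):
    if filename.endswith('.json'):
        with open(filename, 'r') as open_file:
            data = json.load(open_file)
        if isinstance(data, list):
            for item in data:
                yield item   # one bulk action per array element
        else:
            yield data       # a single document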
def data_client(client):
    """Connects to client and stores some test index data in elasticsearch
    """
    create_index(client)
    bulk(client, get_index_data(), raise_on_error=True, refresh=True)
    yield client
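create_index and get_index_data come from the surrounding test suite and are not shown; a minimal sketch of a get_index_data generator that yields ready-made actions (index name and documents are assumptions):

# Hypothetical get_index_data: yields complete bulk actions for the test index.
def get_index_data():
    for i in range(3):
        yield {
            '_index': 'test-index',
            '_id': i,
            '_source': {'title': 'document %d' % i},
        }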
Example #46
    def write_documents(self,
                        documents: Union[List[dict], List[Document]],
                        index: Optional[str] = None):
        """
        Indexes documents for later queries in Elasticsearch.

        When using explicit document IDs, any existing document with the same ID gets updated.

        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                          Optionally: Include meta data via {"text": "<the-actual-text>",
                          "meta":{"name": "<some-document-name>, "author": "somebody", ...}}
                          It can be used for filtering and is accessible in the responses of the Finder.
                          Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
                          should be changed to what you have set for self.text_field and self.name_field.
        :param index: Elasticsearch index where the documents should be indexed. If not supplied, self.index will be used.
        :return: None
        """

        if index and not self.client.indices.exists(index=index):
            self._create_document_index(index)

        if index is None:
            index = self.index

        # Make sure we comply to Document class format
        documents_objects = [
            Document.from_dict(d, field_map=self._create_document_field_map())
            if isinstance(d, dict) else d for d in documents
        ]

        documents_to_index = []
        for doc in documents_objects:

            _doc = {
                "_op_type":
                "index" if self.update_existing_documents else "create",
                "_index": index,
                **doc.to_dict(field_map=self._create_document_field_map())
            }  # type: Dict[str, Any]

            # cast embedding type as ES cannot deal with np.array
            if _doc[self.embedding_field] is not None:
                if type(_doc[self.embedding_field]) == np.ndarray:
                    _doc[self.embedding_field] = _doc[
                        self.embedding_field].tolist()

            # rename id for elastic
            _doc["_id"] = str(_doc.pop("id"))

            # don't index query score and empty fields
            _ = _doc.pop("score", None)
            _ = _doc.pop("probability", None)
            _doc = {k: v for k, v in _doc.items() if v is not None}

            # In order to have a flat structure in elastic + similar behaviour to the other DocumentStores,
            # we "unnest" all value within "meta"
            if "meta" in _doc.keys():
                for k, v in _doc["meta"].items():
                    _doc[k] = v
                _doc.pop("meta")
            documents_to_index.append(_doc)
        bulk(self.client,
             documents_to_index,
             request_timeout=300,
             refresh=self.refresh_type)
Example #47
        rtv += flattenChildren(child_key, child.get('children', []))
        if 'children' in child:
            del child['children']
        rtv.append(child)
    return rtv


def processDocs():
    data = parse_file()
    rtv = []
    for d in data:
        _id = unicode(uuid.uuid4())
        rtv += flattenChildren(_id, d.get('children', []))
        if 'children' in d:
            del d['children']
        d['rkw'] = _id
        d['keyword'] += ' ' + 'root'
        rtv.append(d)
    return rtv


if __name__ == '__main__':
    data = processDocs()
    print json.dumps(data)
    es.delete_by_query(index=config.index_name,
                       body={"query": {
                           "match_all": {}
                       }})
    res = helpers.bulk(es, data, index=config.index_name, doc_type='doc')
#    pprint(res)
Example #48
        # print type(doc_sanitized['oncology_sub_indication'])
        # print doc_sanitized['oncology_sub_indication'][0]

        # normalize investigators to authors
        if 'inventors' in doc_sanitized.keys():
            doc_sanitized['authors'] = doc_sanitized['inventors']
            del doc_sanitized['inventors']

        action = {
            "_index": "kols_patents_new",
            "_type": "patents_new",
            "_id": objectId,
            "_source": doc_sanitized
        }
        actions.append(action)

        # print doc_sanitized
        # break
        # ret_val = es.index(index="kols_congresses_new",doc_type="congresses_new",ignore=400,body=doc_sanitized,request_timeout=60)
        # if ret_val['created'] != True:
        #             print ret_val['created']
        # print ret_val
    except Exception, e:
        print "error...", e
        break
        # print x['_id']
        # print ret_val

if (len(actions) > 0):
    helpers.bulk(es, actions, chunk_size=50, request_timeout=50)
 def bulk(self, actions, **kwargs):
     return bulk(client=self.connection, actions=actions, **kwargs)
                # if row count equals or exceeds max rows
                if args.max_rows > 0 and row_count >= args.max_rows:
                    # break out of reading loop
                    break

                # if row count is modulus
                # of the flush count value
                if row_count % args.flush_rows == 0:

                    # flush accumulated
                    # rows to target file
                    out_file.flush()

                    if args.out_elastic_search == 'Y' and len(es_actions) > 0:
                        helpers.bulk(es, es_actions)
                        es_actions.clear()

                    # ending time hack
                    end_time = time()
                    # compute records/second
                    seconds = end_time - bgn_time
                    if seconds > 0:
                        rcds_per_second = row_count / seconds
                    else:
                        rcds_per_second = 0
                    # output progress message
                    message = "Processed: {:,} rows in {:,.0f} seconds @ {:,.0f} records/second".format(
                        row_count, seconds, rcds_per_second)
                    print(message)
Example #51
 def index(self):
     action_generator = self.get_document_create_bulk_op
     return es_helpers.bulk(self.es, action_generator())
                    while temp_i < temp_max_i:
                        temp_i += 1
                        t_actions, t_wiki_pages_id_index = procs.pop(0).get()
                        actions += t_actions
                        wiki_pages_id_index.update(t_wiki_pages_id_index)
                temp_max_i = raw_temp_max_i

        for proc in procs:
            t_actions, t_wiki_pages_id_index = proc.get()
            actions += t_actions
            wiki_pages_id_index.update(t_wiki_pages_id_index)

        procs = []

        temp_time = time.time()
        print("bulk index batch")
        temp_success, temp_fail = elastic_helper.bulk(es, actions)
        print("Time taken: ",
              time.strftime("%H:%M:%S", time.gmtime(time.time() - temp_time)))
    es.indices.refresh(index="wiki-pages-index")
    json_filename = "wiki_pages_id_index.json"
    dict_path = os.path.join(output_index_path, json_filename)
    temp_time = time.time()
    print("saving wiki id index as json file")
    helper.save_dict_json(wiki_pages_id_index, dict_path)
    print("Time taken: ",
          time.strftime("%H:%M:%S", time.gmtime(time.time() - temp_time)))

    print("-----Total time taken: ",
          time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)))
Example #53
            # outputs epoch seconds, ES can read without any weird configs
            created_at_epoch = int(
                dateutil.parser.parse(tweet["created_at"]).timestamp())

            # tweet schema
            esTweet = {
                "created_at": created_at_epoch,
                "urls": expandedUrls,
                "text": text,
                "favorite_count": tweet["favorite_count"],
                "_id": tweet["id_str"],
                "user-screen_name": tweet["user"]["screen_name"],
                "quote_count": tweet["quote_count"],
                "reply_count": tweet["reply_count"],
                "retweet_count": tweet["retweet_count"],
                "lang": tweet["lang"],
                "s3_file_path": s3Object["Key"],
            }

            # append tweet to list to be stored
            tweets.append(esTweet)

        # needed to set the index to store the tweet
        year_month = (str(s3Object["LastModified"].year) + "-" +
                      str(s3Object["LastModified"].month))
        es_index = "tweets-" + year_month

        # stores all tweets in list to Elasticsearch
        helpers.bulk(es, tweets, index=es_index)
        lastKey = s3Object["Key"]
Example #54
def main():
	print("Connecting to ES...")
	es = Elasticsearch(hosts=[{"host":'elasticsearch'}])
	if not es.ping():
		raise ValueError("Connection failed")
	else:
		print('Connected to ES')

	print("Connecting to MySQL...")
	conn= pymysql.connect(host='conceptlights_db_1',user='******',password='******',db='dboe_1',charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
	if conn.open:
		print('Connected to MySQL')
	else:
		print('Connection to MySQL failed')

	if es.indices.exists(index='dboe'):
		print('dboe index exists, deleting...')
		if es.indices.delete(index='dboe'):
			print('dboe index deleted, will reindex now.')

	body = {
				"settings" : {
					"number_of_shards": 1,
					"number_of_replicas": 0
				},
				"mappings": {
					"dboe-type": {
							"properties": {
								"location" : {
									"type" : "geo_point"
						}
					}
				}
			}}

	es.indices.create( index='dboe', ignore=400, body=body )

	global db_cur
	db_cur = conn.cursor()
	actions = []

	rootPath = './data'
	pattern = '*1_qdb-TEI-02.xml' #WIP: Test only with entries starting with 'm,n,o,p,q' for the moment
	listplace_path = './data/helper_tables/listPlace-id.xml'
	fragebogen_concepts_path = './data/frage-fragebogen-full-tgd01.xml'

	q_regex = r"^(\d+)(\w+)"
	# q_head_regex = r"pc> (.*)<"



	with open(listplace_path, "r", encoding="utf-8") as listplace_file, \
	open(fragebogen_concepts_path, "r", encoding="utf-8") as fragebogen_concepts_file:
		listplace_soup = BeautifulSoup(listplace_file, 'xml')
		fragebogen_concepts_soup = BeautifulSoup(fragebogen_concepts_file,'xml')

		stop_words = get_stop_words('de')

		#Walk data dir extracting the different entries
		for root, dirs, files in os.walk(rootPath):
			for filename in fnmatch.filter(files, pattern):
				print(os.path.join(root, filename))
				soup = BeautifulSoup(open(os.path.join(root, filename), "r", encoding="utf-8"), 'xml')
				for entry in soup.find_all("entry"):
					entry_obj= {}

					questionnaire = entry.findAll(
										"ref", {"type": "fragebogenNummer"})
					if len(questionnaire) > 0:
						entry_obj['source_question_title'] = questionnaire[0].string
						match = re.match(q_regex, entry_obj['source_question_title'])
						if match:
							entry_obj['questionnaire_number'] = match.group(1)
							entry_obj['question'] = match.group(2)
							
							questionnaire_label = fragebogen_concepts_soup.find("label", text="Fragebogen " + entry_obj['questionnaire_number'])
							if questionnaire_label:
								questionnaire_head = questionnaire_label.parent					
								entry_obj['questionnaire_label'] = questionnaire_head.contents[4]

								questionnaire = questionnaire_head.parent
								question = questionnaire.find('item', {"n" : entry_obj['question']})
								if question: 
									if question.label:
										entry_obj['question_label'] = question.label.string
									#label ?
									concepts = question.find_all('seg', attrs={"xml:id":True})
									if len(concepts) > 0:
										# print('Question {} relates to the following concepts:'.format(item.get('n')))
										concepts_set = set()
										for concept in concepts:
											# print(concept.string)
											if concept.string is not None and concept.string not in stop_words and "." not in concept.string and len(concept.string) > 1:
												concepts_set.add(concept.string)
										entry_obj['question_concepts'] =  list(concepts_set)
								else:
									continue
							else:
								print('Questionnaire ' + entry_obj['questionnaire_number'] + ' could not be found')
					else:
						continue
					
					if not entry.form or not entry.form.orth or not entry.form.orth.string:
						continue
					entry_obj['main_lemma'] = str(entry.form.orth.string)

					entry_obj['id'] = entry['xml:id']
					#part of speech
					if entry.gramGrp and entry.gramGrp.pos:
						entry_obj['pos'] = str(entry.gramGrp.pos.string)
					
					if entry.sense:
						entry_obj['sense'] = entry.sense.text.replace('\n', '')

					if entry.note:
						entry_obj['note'] = entry.note.text.replace('\n', '')

					source = entry.findAll(
										"ref", {"type": "quelle"})
					if len(source) > 0:
						entry_obj['source'] = source[0].string

					revised_source = entry.findAll(
										"ref", {"type": "quelleBearbeitet"})
					if len(revised_source) > 0:
						entry_obj['revised_source'] = revised_source[0].text


					usg = entry.find('usg')
					if not usg:
						continue
					else:
						list_place = usg.find("listPlace", recursive=False)
						if not list_place:
							continue
						else:
							geo_dict = process_listplace_node(listplace_soup, list_place)
							entry_obj.update(geo_dict)		
					
					actions.append({
							'_index': 'dboe',
							'_type': 'dboe-type',
							'_source': entry_obj})

					if len(actions) > 50:
						bulk(es, actions)
						actions = []

		# flush the final partial batch so the last few entries are not lost
		if actions:
			bulk(es, actions)

		print('Done')

	conn.close()
	exit(0)
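
# Added sketch (not part of the original example): the geo_point mapping created above makes
# the 'location' field usable in geo queries. A minimal illustration against the populated
# 'dboe' index; the coordinates and distance are arbitrary assumptions:
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts=[{"host": "elasticsearch"}])
geo_query = {
    "query": {
        "bool": {
            "filter": {
                "geo_distance": {
                    "distance": "50km",
                    "location": {"lat": 48.21, "lon": 16.37}
                }
            }
        }
    }
}
hits = es.search(index="dboe", body=geo_query)["hits"]["hits"]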
Example #55
    def bulk_add_documents(self, listOfDocs):
        bulk(self.es, listOfDocs, raise_on_error=False, refresh='wait_for')
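        # note: refresh='wait_for' above makes bulk() block until the next index refresh,
        # so the documents are searchable as soon as the call returns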
Example #56
def update_index_for_model(model,
                           batch_size=256,
                           batches_per_commit=10,
                           firstpk=0):
    """
    More efficient update of the search index for large models such as
    Paper

    :param batch_size: the number of instances to retrieve for each query
    :param batches_per_commit: the number of batches after which we
                    should commit to the search engine
    :param firstpk: the instance to start with.
    """
    using_backends = haystack.connection_router.for_write()
    if len(using_backends) != 1:
        raise ValueError("Don't know what search index to use")
    engine = haystack.connections[using_backends[0]]
    backend = engine.get_backend()
    index = engine.get_unified_index().get_index(model)

    qs = model.objects.order_by('pk')
    lastpk_object = list(model.objects.order_by('-pk')[:1])

    if not lastpk_object:  # No object in the model
        return

    lastpk = lastpk_object[0].pk

    batch_number = 0

    # rate reporting
    indexed = 0
    starttime = datetime.utcnow()

    while firstpk < lastpk:
        batch_number += 1

        prepped_docs = []
        for obj in qs.filter(pk__gt=firstpk)[:batch_size]:
            firstpk = obj.pk

            try:
                prepped_data = index.full_prepare(obj)
                final_data = {}

                # Convert the data to make sure it's happy.
                for key, value in list(prepped_data.items()):
                    final_data[key] = backend._from_python(value)
                final_data['_id'] = final_data[ID]

                prepped_docs.append(final_data)
            except SkipDocument:
                continue

        documents_sent = False
        while not documents_sent:
            try:
                bulk(backend.conn,
                     prepped_docs,
                     index=backend.index_name,
                     doc_type='modelresult')
                documents_sent = True
            except ConnectionTimeout as e:
                logger.warning(e)
                logger.info('retrying')
                sleep(30)

        indexed += len(prepped_docs)
        if batch_number % batches_per_commit == 0:
            backend.conn.indices.refresh(index=backend.index_name)

        if indexed >= 5000:
            curtime = datetime.utcnow()
            rate = int(indexed / (curtime - starttime).total_seconds())
            logger.info("%d obj/s, %d / %d" % (rate, firstpk, lastpk))
            starttime = curtime
            indexed = 0
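
# Added usage sketch (not part of the original snippet): "Paper" is the model named in the
# docstring above; the import path below is a hypothetical assumption for illustration only.
from papers.models import Paper  # hypothetical import path

update_index_for_model(Paper, batch_size=512, batches_per_commit=20, firstpk=0)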
Example #57
def single_bulk_to_es(bulk, config):
    bulk = bulk_builder(bulk, config)
    helpers.bulk(config['es_conn'], bulk, chunk_size=config['bulk_size'])
Example #58
    result = WordCloud(max_words=50).generate(string)
    result.to_file(text_query + ".png")


if __name__ == "__main__":
    es = ES
    es.indices.delete("final", ignore=[400, 404])  # don't error out if the index doesn't exist yet
    if not es.indices.exists("final"):
        print("No index found.")
        es.indices.create("final")
        for folder in os.listdir(FOLDER):
            print("Now indexing folder", folder)
            folder = FOLDER + folder + '/'
            num = (len(os.listdir(folder)) // BULK_SIZE) + 1
            for i in range(num):
                print("Bulk", i, "of", num)
                i *= BULK_SIZE
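                # stats_only=True: helpers.bulk returns (successes, failures) counts instead of a list of per-item errors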
                bulk(es,
                     json_to_bulk(folder, 'final', i, BULK_SIZE),
                     stats_only=True)
        print("\nTest index is built.")

    query = text_to_query("theory date = '2001:2003'")

    print(query, end='\n\n')
    results = es.search(index="final", body=query)['hits']['hits']

    print("Found", len(results), "results")

    word_cloud("theory date = '2001:2003'", results)
Example #59
    def execute_query(self, querystring=""):
        """
        Not a test.
        This method is doing the heavy lifting for the tests in this class: create and fill the
        index with our courses so we can run our queries and check our facet counts.
        It also executes the query and returns the result from the API.
        """
        # Create the subject category page. This is necessary to link the subjects we
        # defined above with the "subjects" filter
        # As it is the only page we create, we expect it to have the path "0001"
        CategoryFactory(page_reverse_id="subjects", should_publish=True)

        # Index these 4 courses in Elasticsearch
        indices_client = IndicesClient(client=ES_CLIENT)
        # Delete any existing indices so we get a clean slate
        indices_client.delete(index="_all")
        # Create an index we'll use to test the ES features
        indices_client.create(index="test_courses")
        indices_client.close(index="test_courses")
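        # analysis settings are static, so the index has to be closed before they can be applied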
        indices_client.put_settings(body=ANALYSIS_SETTINGS, index="test_courses")
        indices_client.open(index="test_courses")

        # Use the default courses mapping from the Indexer
        indices_client.put_mapping(
            body=CoursesIndexer.mapping, doc_type="course", index="test_courses"
        )
        # Add the sorting script
        ES_CLIENT.put_script(id="score", body=CoursesIndexer.scripts["score"])
        ES_CLIENT.put_script(
            id="state_field", body=CoursesIndexer.scripts["state_field"]
        )

        # Actually insert our courses in the index
        actions = [
            {
                "_id": course["id"],
                "_index": "test_courses",
                "_op_type": "create",
                "_type": "course",
                "absolute_url": {"en": "url"},
                "cover_image": {"en": "image"},
                "title": {"en": "title"},
                **course,
                "course_runs": [
                    {
                        "languages": course_run["languages"],
                        "start": arrow.utcnow().datetime,
                        "end": arrow.utcnow().datetime,
                        "enrollment_start": arrow.utcnow().datetime,
                        "enrollment_end": arrow.utcnow().datetime,
                    }
                    for course_run in course["course_runs"]
                ],
            }
            for course in COURSES
        ]
        bulk(actions=actions, chunk_size=500, client=ES_CLIENT)
        indices_client.refresh()

        response = self.client.get(f"/api/v1.0/courses/?{querystring:s}")
        self.assertEqual(response.status_code, 200)

        return json.loads(response.content)
Example #60
    def store(self, report):
        sample_ids = {}
        sample_list = []

        for filename in report:
            report[filename]['filename'] = filename
            try:
                sample_id = report[filename]['SHA256']
            except KeyError:
                sample_id = uuid4()
            # Store metadata with the sample, not the report
            sample = {'filename': filename, 'tags': []}
            for field in METADATA_FIELDS:
                if field in report[filename]:
                    if len(report[filename][field]) != 0:
                        sample[field] = report[filename][field]
                    del report[filename][field]

            # If there are Cuckoo results in the report, some
            # cleanup is needed for the report
            if 'Cuckoo Sandbox' in report[filename].keys():
                cuckoo_report = report[filename]['Cuckoo Sandbox']
                cuckoo_doc = {
                    'target': cuckoo_report.get('target'),
                    'summary': cuckoo_report.get('behavior',
                                                 {}).get('summary'),
                    'info': cuckoo_report.get('info')
                }
                signatures = cuckoo_report.get('signatures')
                if signatures:
                    cuckoo_doc['signatures'] = process_cuckoo_signatures(
                        signatures)

                dropped = cuckoo_report.get('dropped')
                if dropped:
                    cuckoo_doc['dropped'] = dropped

                procmemory = cuckoo_report.get('procmemory')
                if procmemory:
                    cuckoo_doc['procmemory'] = procmemory

                # TODO: add the API calls to the Cuckoo Report document
                # for process in cuckoo_report.get('behavior', {}).get('processes', []):
                #     process_pid = process['pid']
                #     cuckoo_doc['calls'] = {}
                #     cuckoo_doc['calls'][process_pid] = []
                #     for call in process['calls']:
                #         cuckoo_doc['calls'][process_pid].append(call)

                report[filename]['Cuckoo Sandbox'] = cuckoo_doc

            # Store report; let ES autogenerate the ID so we can save it with the sample
            try:
                report_result = self.es.index(index=self.index,
                                              doc_type=self.doc_type,
                                              body=report[filename],
                                              parent=sample_id,
                                              pipeline='dedot')
            except (TransportError, UnicodeEncodeError) as e:
                # If fail, index empty doc instead
                print('Failed to index that report!\n{}'.format(e))
                report_body_fail = {
                    'ERROR':
                    'Failed to index the full report in Elasticsearch',
                    'Scan Time': report[filename]['Scan Time']
                }
                report_result = self.es.index(index=self.index,
                                              doc_type=self.doc_type,
                                              body=report_body_fail,
                                              parent=sample_id,
                                              pipeline='dedot')

            report_id = report_result.get('_id')
            sample['report_id'] = report_id
            sample_ids[sample_id] = report_id

            sample_list.append({
                '_op_type': 'create',
                '_index': self.index,
                '_type': 'sample',
                '_id': sample_id,
                '_source': sample,
                'pipeline': 'dedot'
            })

        result = helpers.bulk(self.es, sample_list, raise_on_error=False)

        creation_errors = result[1]
        if not creation_errors:
            return sample_ids

        # Some samples already exist; update them to ref the new reports
        updates_list = []
        for err in creation_errors:
            if err['create']['status'] == 409:
                sid = err['create']['_id']
                rid = sample_ids[sid]
                updates_list.append({
                    '_op_type': 'update',
                    '_index': self.index,
                    '_type': 'sample',
                    '_id': sid,
                    'doc': {
                        'report_id': rid
                    },
                    'pipeline': 'dedot'
                })

        result = helpers.bulk(self.es, updates_list, raise_on_error=False)
        return sample_ids
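
# Added note (not part of the original snippet): with raise_on_error=False and the default
# stats_only=False, helpers.bulk returns a (success_count, errors) tuple, which is why the
# code above inspects result[1] for 409 "create" conflicts. A minimal illustration with an
# assumed client, index name and document id:
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()
actions = [{'_op_type': 'create', '_index': 'samples', '_type': 'sample',
            '_id': 'abc123', '_source': {'filename': 'sample.bin'}}]
ok, errors = helpers.bulk(es, actions, raise_on_error=False)
for err in errors:
    if err['create']['status'] == 409:
        pass  # the document already exists; fall back to an update, as store() does above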