Python bulk_chunks Exemples, pyelasticsearch.bulk_chunks Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : related.py Projet : akatsoulas/airmozilla

def index(all=False, flush_first=False, since=datetime.timedelta(minutes=10)):
    """Send scheduled-or-processing events to Elasticsearch in bulk.

    :param all: when true, every scheduled-or-processing event is indexed;
        otherwise only those whose ``modified`` is within ``since`` of now.
    :param flush_first: when true, ``flush(es)`` and ``create(es)`` are
        called before anything is indexed.
    :param since: look-back window applied to ``modified`` when ``all`` is
        false.
    """
    es = get_connection()

    if flush_first:
        flush(es)
        create(es)

    if all:
        events = Event.objects.scheduled_or_processing()
    else:
        cutoff = timezone.now() - since
        events = Event.objects.scheduled_or_processing().filter(
            modified__gte=cutoff)

    # Split the documents into several smaller bulk requests.
    target_index = get_index()
    chunks = pyelasticsearch.bulk_chunks(
        documents(events, es),
        docs_per_chunk=500,
        bytes_per_chunk=10000,
    )
    for chunk in chunks:
        es.bulk(chunk, doc_type=doc_type, index=target_index)

    es.refresh(target_index)

Exemple #2

0

Afficher le fichier

Fichier : autoindex.py Projet : appressoas/ievv_opensource

    def index_items(self, index_documents):
        """
        Send the given documents to Elasticsearch through bulk requests.

        Consecutive documents sharing a ``doc_type`` are grouped, each group
        is split with ``bulk_chunks`` using ``self.bulk_index_docs_per_chunk``
        and ``self.bulk_index_bytes_per_chunk``, and every chunk is sent for
        the index ``self.name``. The connection is refreshed afterwards when
        the ``IEVV_ELASTICSEARCH_AUTOREFRESH_AFTER_INDEXING`` setting is true.

        Parameters:
            index_documents: An iterable of :class:`.AbstractDocument`.
        """
        connection = search.Connection.get_instance()

        def _doc_type_of(index_document):
            return index_document.doc_type

        # NOTE: We should be able to let AbstractDocument.get_index_op_kwargs()
        #       include the doc_type, and avoid the groupby(), but that randomly
        #       raises an exception complaining about missing type. Hence one
        #       series of bulk requests per doc_type, with the doc_type passed
        #       explicitly to bulk().
        grouped = itertools.groupby(index_documents, key=_doc_type_of)
        for doc_type, same_type_documents in grouped:
            operations = self._iterate_index_operations(same_type_documents)
            chunks = bulk_chunks(
                operations,
                docs_per_chunk=self.bulk_index_docs_per_chunk,
                bytes_per_chunk=self.bulk_index_bytes_per_chunk)
            for chunk in chunks:
                connection.elasticsearch.bulk(
                    chunk, index=self.name, doc_type=doc_type)

        autorefresh = getattr(
            settings, 'IEVV_ELASTICSEARCH_AUTOREFRESH_AFTER_INDEXING', False)
        if autorefresh:
            connection.refresh()

Exemple #3

0

Afficher le fichier

def perform_bulk_index(host, index_name, doc_type, doc_fetch, docs_per_chunk,
                       bytes_per_chunk, parallel):
    """Split the fetched documents into chunks and upload them in parallel.

    ``doc_fetch()`` supplies the documents, ``bulk_chunks`` splits them by
    ``docs_per_chunk`` / ``bytes_per_chunk``, and each chunk is handed to
    ``local_bulk`` through a ``Parallel`` runner with ``parallel`` jobs.
    """
    runner = Parallel(n_jobs=parallel)
    chunks = bulk_chunks(doc_fetch(),
                         docs_per_chunk=docs_per_chunk,
                         bytes_per_chunk=bytes_per_chunk)
    runner(
        delayed(local_bulk)(host, index_name, doc_type, chunk)
        for chunk in chunks)

Exemple #4

0

Afficher le fichier

Fichier : utils_tests.py Projet : pyelasticsearch/pyelasticsearch

 def test_bytes_first_too_big(self):
     """
     A first item that exceeds the byte limit by itself must come out as
     its own chunk rather than being preceded by an empty one.
     """
     oversized_first = ['chimpanzees', 'hi', 'ho']
     expected = [['chimpanzees'], ['hi', 'ho']]
     produced = bulk_chunks(oversized_first, bytes_per_chunk=6)
     self.assertEqual(list(produced), expected)

Exemple #5

0

Afficher le fichier

Fichier : utils_tests.py Projet : yy1117/pyelasticsearch

 def test_bytes_first_too_big(self):
     """
     A first item that exceeds the byte limit by itself must come out as
     its own chunk rather than being preceded by an empty one.
     """
     oversized_first = ['chimpanzees', 'hi', 'ho']
     expected = [['chimpanzees'], ['hi', 'ho']]
     produced = bulk_chunks(oversized_first, bytes_per_chunk=6)
     eq_(list(produced), expected)

Exemple #6

0

Afficher le fichier

Fichier : utils_tests.py Projet : pyelasticsearch/pyelasticsearch

    def test_bytes(self):
        """
        Check that chunks are split on the byte limit.

        A chunk is closed before a document that would take it past the
        limit.
        """
        growing_actions = ['o', 'hi', 'good', 'chimpanzees']
        expected = [['o', 'hi'], ['good'], ['chimpanzees']]
        produced = bulk_chunks(growing_actions, bytes_per_chunk=5)
        self.assertEqual(list(produced), expected)

Exemple #7

0

Afficher le fichier

Fichier : utils_tests.py Projet : yy1117/pyelasticsearch

    def test_bytes(self):
        """
        Check that chunks are split on the byte limit.

        A chunk is closed before a document that would take it past the
        limit.
        """
        growing_actions = ['o', 'hi', 'good', 'chimpanzees']
        expected = [['o', 'hi'], ['good'], ['chimpanzees']]
        produced = bulk_chunks(growing_actions, bytes_per_chunk=5)
        eq_(list(produced), expected)

Exemple #8

0

Afficher le fichier

Fichier : dbManager.py Projet : sauravcsvt/geocoding

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)

        self.eserver.create_index(index='geonames', settings=settings)
        for chunk in bulk_chunks(self._opLoader(datacsv, confDir),
                                 docs_per_chunk=1000):
            self.eserver.bulk(chunk, index='geonames', doc_type='places')
            print "..",

        self.eserver.refresh('geonames')

Exemple #9

0

Afficher le fichier

Fichier : search.py Projet : ianmackinnon/mango

def build_org(es, orm, Org, Orgalias):
    """Bulk-index one document per ``Org`` row returned by ``orm.query``.

    Each row is converted with ``org_doc`` and indexed under its ``org_id``.
    ``Orgalias`` is accepted but not used here.
    """
    log.warning("Bulk adding org : start")

    def _org_operations():
        for organisation in orm.query(Org):
            yield es.index_op(org_doc(organisation), id=organisation.org_id)

    chunks = pyelasticsearch.bulk_chunks(
        _org_operations(), docs_per_chunk=500, bytes_per_chunk=10000)
    for chunk in chunks:
        es.bulk(chunk, doc_type=es_doc_type, index=es_index)

    log.warning("Bulk adding org : end")

Exemple #10

0

Afficher le fichier

Fichier : token_prices.py Projet : cyber-drop/ethereum_analytical_db

    def _insert_multiple_docs(self, docs, index_name):
        """
        Bulk-index documents in chunks of at most 1000

        Every chunk is first passed to ``self._construct_bulk_insert_ops``
        and then sent with ``self.client.bulk_index``.

        Parameters
        ----------
        docs: list
            List of dictionaries with new data
        index_name: str
            Name of the index that receives the documents
        """
        chunks = bulk_chunks(docs, docs_per_chunk=1000)
        for batch in chunks:
            self._construct_bulk_insert_ops(batch)
            self.client.bulk_index(index=index_name, docs=batch)

Exemple #11

0

Afficher le fichier

Fichier : sync.py Projet : muke5hy/py-mysql-es

	def run(self):
		"""Send binlog actions to Elasticsearch, marking the binlog per request.

		With ``bulk_size`` below 2 each action is sent on its own; otherwise
		actions are grouped with ``bulk_chunks``. A keyboard interrupt ends
		the run quietly; any other exception is logged, e-mailed and
		re-raised.
		"""
		try:
			if self.bulk_size < 2:
				batches = ([action] for action in self.proc_binlog())
			else:
				batches = bulk_chunks(self.proc_binlog(), docs_per_chunk=self.bulk_size)
			for batch in batches:
				self.es.bulk(batch)
				self.mark_binlog()
		except KeyboardInterrupt:
			pass
		except Exception:
			import traceback
			details = traceback.format_exc()
			logging.error(details)
			self.send_email(msg=details)
			raise

Exemple #12

0

Afficher le fichier

Fichier : csv2es.py Projet : haizaar/csv2es

def perform_bulk_index(host, index_name, doc_type, doc_fetch, docs_per_chunk, bytes_per_chunk, parallel):
    """
    Split documents into chunks and bulk-upload them to Elasticsearch.

    :param host: the target Elasticsearch host
    :param index_name: the target index name
    :param doc_type: the target document type
    :param doc_fetch: a callable returning the documents to upload
    :param docs_per_chunk: the number of documents per uploaded chunk
    :param bytes_per_chunk: the max bytes per uploaded chunk
    :param parallel: the number of bulk uploads to run at the same time
    """
    runner = Parallel(n_jobs=parallel)
    chunks = bulk_chunks(doc_fetch(),
                         docs_per_chunk=docs_per_chunk,
                         bytes_per_chunk=bytes_per_chunk)
    jobs = (delayed(local_bulk)(host, index_name, doc_type, chunk)
            for chunk in chunks)
    runner(jobs)

Exemple #13

0

Afficher le fichier

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)
            settings['mappings'][self._doctype] = settings['mappings'].pop(
                'places')

        try:
            self.eserver.create_index(index=self._index, settings=settings)
        except:
            self.eserver.delete_index(self._index)
            self.eserver.create_index(index=self._index, settings=settings)

        for chunk in bulk_chunks(self._opLoader(datacsv, confDir),
                                 docs_per_chunk=1000):
            self.eserver.bulk(chunk, index=self._index, doc_type=self._doctype)
            print "..",

        self.eserver.refresh(self._index)

Exemple #14

0

Afficher le fichier

def index(all=False, flush_first=False, since=datetime.timedelta(minutes=10)):
    """Bulk-index scheduled-or-processing events into Elasticsearch.

    With ``flush_first`` the index is flushed and created again beforehand.
    Unless ``all`` is true, only events modified within ``since`` of the
    current time are sent. The index is refreshed once all chunks are sent.
    """
    es = get_connection()

    if flush_first:
        flush(es)
        create(es)

    if all:
        events = Event.objects.scheduled_or_processing()
    else:
        now = timezone.now()
        candidates = Event.objects.scheduled_or_processing()
        events = candidates.filter(modified__gte=now - since)

    # Several small bulk requests instead of one large one.
    index_name = get_index()
    operations = documents(events, es)
    for chunk in pyelasticsearch.bulk_chunks(operations,
                                             docs_per_chunk=500,
                                             bytes_per_chunk=10000):
        es.bulk(chunk, doc_type=doc_type, index=index_name)

    es.refresh(index_name)

Exemple #15

0

Afficher le fichier

    def _save_internal_transactions(self, blocks_traces):
        """
        Save specified transactions to the database in multiple chunks

        Save only those which are attached to an ethereum transaction

        Parameters
        ----------
        blocks_traces : list
            List of transactions to save
        """
        docs = [
            self._preprocess_internal_transaction(transaction)
            for transaction in blocks_traces if transaction["transactionHash"]
        ]
        if docs:
            for chunk in bulk_chunks(docs, None, BYTES_PER_CHUNK):
                self.client.bulk_index(
                    docs=chunk,
                    index=self.indices["internal_transaction"],
                    doc_type="itx",
                    id_field="hash",
                    refresh=True)

Exemple #16

0

Afficher le fichier

def get_level_1_quotes_and_trades(ticker: str, seconds: int):
    """Watch level 1 quotes and trades for ``ticker`` for ``seconds`` seconds.

    A quote connection with a verbose listener is opened, all update fields
    are selected and ``ticker`` is watched while the function sleeps for
    ``seconds``. Whatever ``documents()`` returns is then bulk-sent to the
    ``beginning`` index with doc type ``feed``.

    NOTE(review): no bulk operations are built from the received messages
    yet, so nothing is actually sent to Elasticsearch. Confirm what should be
    indexed and have ``documents()`` yield those operations.
    """
    def documents():
        """Watch the ticker, then return the (currently empty) operations."""
        import time

        quote_listener = iq.VerboseQuoteListener("Level 1 Listener")
        # One connection only: the original built a second, identical
        # QuoteConn and dropped the first one unused.
        quote_conn = iq.QuoteConn(name="pyiqfeed-Example-lvl1")
        quote_conn.add_listener(quote_listener)
        with iq.ConnConnector([quote_conn]):
            all_fields = sorted(iq.QuoteConn.quote_msg_map.keys())
            quote_conn.select_update_fieldnames(all_fields)
            quote_conn.watch(ticker)
            time.sleep(seconds)

        # Return an iterable: falling off the end returned None, which made
        # bulk_chunks() raise TypeError as soon as it was iterated.
        return []

    for chunk in bulk_chunks(documents(),
                             docs_per_chunk=500,
                             bytes_per_chunk=10000):
        # We specify a default index and doc type here so we don't
        # have to repeat them in every operation:
        es.bulk(chunk, doc_type='feed', index='beginning')

Exemple #17

0

Afficher le fichier

Fichier : build.py Projet : bitslab/dxr

def index_file(tree, tree_indexers, path, es, index):
    """Index a single file into ES, and build a static HTML representation of it.

    For the moment, we execute plugins in series, figuring that we have plenty
    of files to keep our processors busy in most trees that take very long. I'm
    a little afraid of the cost of passing potentially large TreesToIndex to
    worker processes. That goes at 52MB/s on my OS X laptop, measuring by the
    size of the pickled object and including the pickling and unpickling time.

    NOTE(review): nothing in this function writes HTML; it only sends
    documents to ES. Confirm whether the summary line is out of date.

    :arg tree: Object providing ``source_encoding`` and ``source_folder``
    :arg tree_indexers: Iterable of objects whose
        ``file_to_index(rel_path, contents)`` returns the per-file indexer
    :arg path: Absolute path to the file to index
    :arg es: Client used to build (``index_op``) and send (``bulk``) the
        documents
    :arg index: The ES index name

    """
    try:
        contents = unicode_contents(path, tree.source_encoding)
    except IOError as exc:
        if exc.errno == ENOENT and islink(path):
            # It's just a bad symlink (or a symlink that was swiped out
            # from under us--whatever)
            return
        else:
            raise

    # Just like index_folders, if the path is not in UTF-8, then elasticsearch
    # will not accept the path, so just move on.
    # NOTE(review): no check or early return follows this comment here, so the
    # path is used as-is -- confirm whether a guard is missing.
    rel_path = relpath(path, tree.source_folder)
    is_text = isinstance(contents, unicode)
    is_link = islink(path)
    # Index by line if the contents are text and the path is not a symlink.
    index_by_line = is_text and not is_link
    if index_by_line:
        # The per-line accumulators below exist only when indexing by line;
        # every later use is guarded by the same flag.
        lines = split_content_lines(contents)
        num_lines = len(lines)
        needles_by_line = [{} for _ in xrange(num_lines)]
        annotations_by_line = [[] for _ in xrange(num_lines)]
        refses, regionses = [], []
    needles = {}
    linkses = []

    # Collect what every interested indexer has to say about this file.
    for tree_indexer in tree_indexers:
        file_to_index = tree_indexer.file_to_index(rel_path, contents)
        if file_to_index.is_interesting():
            # Per-file stuff:
            append_update(needles, file_to_index.needles())
            if not is_link:
                linkses.append(file_to_index.links())

            # Per-line stuff:
            if index_by_line:
                refses.append(file_to_index.refs())
                regionses.append(file_to_index.regions())
                append_update_by_line(needles_by_line,
                                      file_to_index.needles_by_line())
                append_by_line(annotations_by_line,
                               file_to_index.annotations_by_line())

    def docs():
        """Yield documents for bulk indexing.

        The first document describes the file itself; when indexing by line,
        one document per line follows.

        Big Warning: docs also clears the contents of all elements of
        needles_by_line because they will no longer be used.
        """
        # Index a doc of type 'file' so we can build folder listings.
        # At the moment, we send to ES in the same worker that does the
        # indexing. We could interpose an external queueing system, but I'm
        # willing to potentially sacrifice a little speed here for the easy
        # management of self-throttling.
        file_info = stat(path)
        folder_name, file_name = split(rel_path)
        # Hard-code the keys that are hard-coded in the browse()
        # controller. Merge with the pluggable ones from needles:
        doc = dict(# Some non-array fields:
                    folder=folder_name,
                    name=file_name,
                    size=file_info.st_size,
                    is_folder=False,

                    # And these, which all get mashed into arrays:
                    **needles)
        links = dictify_links(chain.from_iterable(linkses))
        if links:
            doc['links'] = links
        # The file document names its doc type explicitly; the line documents
        # below rely on the default passed to es.bulk().
        yield es.index_op(doc, doc_type=FILE)

        # Index all the lines.
        if index_by_line:
            for total, annotations_for_this_line, tags in izip(
                    needles_by_line,
                    annotations_by_line,
                    es_lines(finished_tags(lines,
                                           chain.from_iterable(refses),
                                           chain.from_iterable(regionses)))):
                # Duplicate the file-wide needles into this line:
                total.update(needles)

                # We bucket tags into refs and regions for ES because later at
                # request time we want to be able to merge them individually
                # with those from skimmers.
                refs_and_regions = bucket(tags, lambda index_obj: "regions" if
                                          isinstance(index_obj['payload'], basestring) else
                                          "refs")
                if 'refs' in refs_and_regions:
                    total['refs'] = refs_and_regions['refs']
                if 'regions' in refs_and_regions:
                    total['regions'] = refs_and_regions['regions']
                if annotations_for_this_line:
                    total['annotations'] = annotations_for_this_line
                yield es.index_op(total)

                # Because needles_by_line holds a reference, total is not
                # garbage collected. Since we won't use it again, we can clear
                # the contents, saving substantial memory on long files.
                total.clear()

    # Indexing a 277K-line file all in one request makes ES time out (>60s),
    # so we chunk it up. 300 docs is optimal according to the benchmarks in
    # https://bugzilla.mozilla.org/show_bug.cgi?id=1122685. So large docs like
    # images don't make our chunk sizes ridiculous, there's a size ceiling as
    # well: 10000 is based on the 300 and an average of 31 chars per line.
    for chunk in bulk_chunks(docs(), docs_per_chunk=300, bytes_per_chunk=10000):
        es.bulk(chunk, index=index, doc_type=LINE)

Exemple #18

0

Afficher le fichier

Fichier : dbhist.py Projet : ajmal017/whitehorse

def bg_get_hist_mult(symbols,
                     interval,
                     maxdatapoints,
                     datadirection=0,
                     requestid='',
                     datapointspersend='',
                     intervaltype='',
                     loop=False):
    """Request interval bars per symbol over a local socket and index them.

    For every symbol an ``HIX`` request is written to the socket at
    ``localhost:9100`` and the reply is read line by line until the
    ``!ENDMSG!`` marker (or until the peer closes the connection). A bar that
    is not yet recorded in the in-memory key table and whose timestamp lies in
    the past is appended to the symbol's ``_hist.csv`` file under ``logs`` and
    sent to Elasticsearch (index ``beginning``, doc type ``feed``): as an
    update when a matching ``Feed`` document exists, otherwise as a new
    document. Every parsed bar is also stored in a DataFrame.

    :param symbols: iterable of ticker symbols; each is upper-cased.
    :param interval: bar interval; sent in the request and used as the
        ``frequency`` of stored feeds.
    :param maxdatapoints: number of bars to request; also caps how many stored
        feeds are loaded to seed the key table (converted with ``int``).
    :param datadirection: passed through unchanged in the request.
    :param requestid: passed through unchanged in the request.
        NOTE(review): the parser expects the timestamp in the first field,
        which per the protocol notes only holds when no request id is sent.
    :param datapointspersend: passed through unchanged in the request.
    :param intervaltype: passed through unchanged in the request.
    :param loop: when true, keep cycling through ``symbols`` forever.
    :returns: the DataFrame of bars received for the last symbol processed, or
        ``None`` when ``symbols`` is empty or an error occurred (the error is
        logged).
    """
    # The IP address or hostname of your reader
    READER_HOSTNAME = 'localhost'
    # The TCP port specified in Speedway Connect
    READER_PORT = 9100

    def _bar_key(instrument_id, moment):
        # One key per instrument, interval and minute.
        return "%s|%s|%s|%s|%s|%s|%s" % (
            instrument_id, interval, moment.year, moment.month, moment.day,
            moment.hour, moment.minute)

    data = None
    s = fs = None
    try:
        # Open a socket connection to the reader
        s = socket.create_connection((READER_HOSTNAME, READER_PORT))
        # Make a file pointer from the socket, so we can read lines
        fs = s.makefile()
        # Bars that are already stored, keyed by _bar_key().
        tickerdict = dict()

        while 1:
            for symbol in symbols:
                symbol = symbol.upper()
                instrument_list = Instrument.search().filter(
                    'term', **{
                        'sym.raw': symbol
                    }).execute()
                if instrument_list and len(instrument_list) > 0:
                    instrument = instrument_list[0]
                else:
                    instrument = Instrument()
                    instrument.sym = symbol
                    instrument.save()

                feed_list = Feed.search().filter(
                    'term', frequency=interval).filter(
                        'term', instrument_id=instrument.id).sort('-date')
                feed_list = feed_list[:int(maxdatapoints)]
                for position, feed in enumerate(feed_list, 1):
                    quote = {
                        'Date': feed.date,
                        'Open': feed.open,
                        'High': feed.high,
                        'Low': feed.low,
                        'Close': feed.close,
                        'Volume': feed.volume
                    }
                    mykey = _bar_key(instrument.id, feed.date)
                    # The first stored bar (the newest, given the '-date'
                    # sort) is left out, so it is treated as unseen and
                    # written again when the reply contains it.
                    if position > 1:
                        tickerdict[mykey] = quote

                cmd = "HIX,%s,%s,%s,%s,%s,%s,%s\r\n" % (
                    symbol, interval, maxdatapoints, datadirection, requestid,
                    datapointspersend, intervaltype)
                # NOTE(review): a text string is sent as-is; that only works
                # where str is a byte string (Python 2).
                s.sendall(cmd)

                data = pd.DataFrame({},
                                    columns=[
                                        'Date', 'Open', 'High', 'Low', 'Close',
                                        'Volume', 'TotalVolume'
                                    ]).set_index('Date')

                def documents():
                    """Yield one bulk operation per new bar read from ``fs``."""
                    while 1:
                        try:
                            line = fs.readline()
                            if not line:
                                # readline() returns an empty string at EOF;
                                # without this the loop would spin forever
                                # once the peer closes the connection.
                                break

                            # Reply fields, in order (no request id):
                            #   Time Stamp        CCYY-MM-DD HH:MM:SS
                            #   High              Decimal
                            #   Low               Decimal
                            #   Open              Decimal
                            #   Close             Decimal
                            #   Total Volume      Integer
                            #   Period Volume     Integer
                            #   Number of Trades  Integer (zero unless a
                            #                     tick interval request)
                            # Example for "HIX,GOOG,60,10":
                            #   2013-08-12 13:44:00,886.0680,886.0680,
                            #   886.0680,886.0680,1010550,200,0,
                            fields = line.strip().split(',')
                            if fields[0] == '!ENDMSG!':
                                break

                            date = fields[0]
                            high = float(fields[1])
                            low = float(fields[2])
                            open_price = float(fields[3])
                            close_price = float(fields[4])
                            total_volume = float(fields[5])
                            volume = float(fields[6])
                            # Number of trades: not stored, but still read so
                            # that lines too short to hold it are rejected as
                            # before.
                            fields[7]

                            if not date:
                                continue

                            date = dateutil.parser.parse(date)
                            quote = {
                                'Date': date,
                                'Open': open_price,
                                'High': high,
                                'Low': low,
                                'Close': close_price,
                                'Volume': volume,
                                'TotalVolume': total_volume,
                            }
                            frequency = interval
                            feed = {
                                'instrument_id': instrument.id,
                                'frequency': frequency,
                                'date': date,
                                'open': quote['Open'],
                                'high': quote['High'],
                                'low': quote['Low'],
                                'close': quote['Close'],
                                'volume': quote['Volume']
                            }
                            mykey = _bar_key(instrument.id, date)

                            # Only bars not seen before and not dated in the
                            # future are logged and indexed.
                            if (mykey not in tickerdict
                                    and date < datetime.now()):
                                # The with block closes the file by itself.
                                with open('logs\\' + symbol + '_hist.csv',
                                          'a') as outfile:
                                    log = "%s,%s,%s,%s,%s,%s,%s\r\n" % (
                                        date, symbol,
                                        str(quote['Open']),
                                        str(quote['High']),
                                        str(quote['Low']),
                                        str(quote['Close']),
                                        str(quote['Volume']))
                                    outfile.write(log)

                                bar_list = Feed.search().filter(
                                    'term', date=date).filter(
                                        'term',
                                        instrument_id=instrument.id).filter(
                                            'term', frequency=frequency)
                                if bar_list and bar_list.count() > 0:
                                    # Already stored: update that document.
                                    tickerdict[mykey] = quote
                                    mydoc = bar_list.execute()[0]._id
                                    yield es.update_op(
                                        doc=feed,
                                        id=mydoc,
                                        index='beginning',
                                        doc_type='feed',
                                        doc_as_upsert=True)
                                else:
                                    tickerdict[mykey] = quote
                                    yield es.index_op(feed)

                            data.loc[date] = [
                                open_price, high, low, close_price,
                                volume, total_volume
                            ]
                        except Exception:
                            # Best effort: a line that cannot be parsed or
                            # stored is logged and the next one is read.
                            logging.error("get_btcfeed", exc_info=True)

                for chunk in bulk_chunks(documents(),
                                         docs_per_chunk=500,
                                         bytes_per_chunk=10000):
                    # We specify a default index and doc type here so we don't
                    # have to repeat them in every operation:
                    es.bulk(chunk, doc_type='feed', index='beginning')
            if not loop:
                break
        return data
    except Exception:
        # Callers still get None on failure, but the cause is no longer
        # discarded silently.
        logging.error("bg_get_hist_mult", exc_info=True)
    finally:
        # Release the connection on every exit path.
        for handle in (fs, s):
            if handle is not None:
                handle.close()

Exemple #19

0

Afficher le fichier

Fichier : geonames_elasticsearch.py Projet : ahalterman/mordecai

                    "coordinates" : coords,  # 4, 5
                    "feature_class" : row[6],
                    "feature_code" : row[7],
                    "country_code2" : row[8],
                    "country_code3" : country_code3,
                    "cc2" : row[9],
                    "admin1_code" : row[10],
                    "admin2_code" : row[11],
                    "admin3_code" : row[12],
                    "admin4_code" : row[13],
                    "population" : row[14],
                    "elevation" : row[15],
                    "dem" : row[16],
                    "timzeone" :  row[17],
                    "modification_date" : "2014-01-01"
                   }
            yield es.index_op(doc, index='geonames', doc_type='geoname')
        except:
            count += 1

    print 'Exception count:', count


chunk_count = 0
for chunk in bulk_chunks(documents(reader, es), docs_per_chunk=500):
    es.bulk(chunk)
    chunk_count += 1
    print 'Chunk count:', chunk_count

es.refresh('geonames')

Exemple #20

0

Afficher le fichier

Fichier : geonames_elasticsearch.py Projet : wanyixue/mordecai

                "coordinates": coords,  # 4, 5
                "feature_class": row[6],
                "feature_code": row[7],
                "country_code2": row[8],
                "country_code3": country_code3,
                "cc2": row[9],
                "admin1_code": row[10],
                "admin2_code": row[11],
                "admin3_code": row[12],
                "admin4_code": row[13],
                "population": row[14],
                "elevation": row[15],
                "dem": row[16],
                "timzeone": row[17],
                "modification_date": "2014-01-01"
            }
            yield es.index_op(doc, index='geonames', doc_type='geoname')
        except:
            count += 1

    print 'Exception count:', count


chunk_count = 0
for chunk in bulk_chunks(documents(reader, es), docs_per_chunk=500):
    es.bulk(chunk)
    chunk_count += 1
    print 'Chunk count:', chunk_count

es.refresh('geonames')

Exemple #21

0

Afficher le fichier

Fichier : build.py Projet : vck/dxr

def index_file(tree, tree_indexers, path, es, index):
    """Index a single file into ES, and build a static HTML representation of it.

    For the moment, we execute plugins in series, figuring that we have plenty
    of files to keep our processors busy in most trees that take very long. I'm
    a little afraid of the cost of passing potentially large TreesToIndex to
    worker processes. That goes at 52MB/s on my OS X laptop, measuring by the
    size of the pickled object and including the pickling and unpickling time.

    One document of type ``FILE`` is emitted for the file itself, followed
    (for text files that are not symlinks) by one document per line; all of
    them are sent to ES in bulk requests.

    :arg tree: Object whose ``source_encoding`` and ``source_folder``
        attributes are read here
    :arg tree_indexers: Iterable of indexers, each providing
        ``file_to_index(rel_path, contents)``
    :arg path: Bytestring absolute path to the file to index
    :arg es: ES client used to build (``index_op``) and send (``bulk``) the
        documents
    :arg index: The ES index name

    """
    try:
        contents = unicode_contents(path, tree.source_encoding)
    except IOError as exc:
        if exc.errno == ENOENT and islink(path):
            # It's just a bad symlink (or a symlink that was swiped out
            # from under us--whatever)
            return
        else:
            raise

    # Just like index_folders, if the path is not in UTF-8, then elasticsearch
    # will not accept the path, so just move on.
    # NOTE(review): no skip/return for non-UTF-8 paths is visible at this
    # point -- confirm whether the comment above is still accurate.
    rel_path = relpath(path, tree.source_folder)
    is_text = isinstance(contents, unicode)
    is_link = islink(path)
    # Index by line if the contents are text and the path is not a symlink.
    index_by_line = is_text and not is_link
    if index_by_line:
        lines = split_content_lines(contents)
        num_lines = len(lines)
        # One accumulator per line; these names exist only when
        # index_by_line is true and are only touched under that same guard.
        needles_by_line = [{} for _ in xrange(num_lines)]
        annotations_by_line = [[] for _ in xrange(num_lines)]
        refses, regionses = [], []
    # File-wide accumulators, filled by every interested indexer below.
    needles = {}
    linkses = []

    for tree_indexer in tree_indexers:
        file_to_index = tree_indexer.file_to_index(rel_path, contents)
        if file_to_index.is_interesting():
            # Per-file stuff:
            append_update(needles, file_to_index.needles())
            if not is_link:
                linkses.append(file_to_index.links())

            # Per-line stuff:
            if index_by_line:
                refses.append(file_to_index.refs())
                regionses.append(file_to_index.regions())
                append_update_by_line(needles_by_line,
                                      file_to_index.needles_by_line())
                append_by_line(annotations_by_line,
                               file_to_index.annotations_by_line())

    def docs():
        """Yield documents for bulk indexing.

        The first operation yielded is the file-level document (explicitly
        typed ``FILE``); the per-line documents follow when the file is
        indexed by line.

        Big Warning: docs also clears the contents of all elements of
        needles_by_line because they will no longer be used.
        """
        # Index a doc of type 'file' so we can build folder listings.
        # At the moment, we send to ES in the same worker that does the
        # indexing. We could interpose an external queueing system, but I'm
        # willing to potentially sacrifice a little speed here for the easy
        # management of self-throttling.
        file_info = stat(path)
        folder_name, file_name = split(rel_path)
        # Hard-code the keys that are hard-coded in the browse()
        # controller. Merge with the pluggable ones from needles:
        doc = dict(  # Some non-array fields:
            folder=unicode_for_display(folder_name),
            name=unicode_for_display(file_name),
            size=file_info.st_size,
            is_folder=False,

            # And these, which all get mashed into arrays:
            **needles)
        links = dictify_links(chain.from_iterable(linkses))
        if links:
            doc['links'] = links
        yield es.index_op(doc, doc_type=FILE)

        # Index all the lines.
        if index_by_line:
            # Walk the per-line needles, per-line annotations and per-line
            # tags in lockstep; ``total`` is the needles dict of the current
            # line and doubles as the document that gets indexed.
            for total, annotations_for_this_line, tags in izip(
                    needles_by_line, annotations_by_line,
                    es_lines(
                        finished_tags(lines, chain.from_iterable(refses),
                                      chain.from_iterable(regionses)))):
                # Duplicate the file-wide needles into this line:
                total.update(needles)

                # We bucket tags into refs and regions for ES because later at
                # request time we want to be able to merge them individually
                # with those from skimmers.
                refs_and_regions = bucket(
                    tags, lambda index_obj: "regions" if isinstance(
                        index_obj['payload'], basestring) else "refs")
                if 'refs' in refs_and_regions:
                    total['refs'] = refs_and_regions['refs']
                if 'regions' in refs_and_regions:
                    total['regions'] = refs_and_regions['regions']
                if annotations_for_this_line:
                    total['annotations'] = annotations_for_this_line
                # No doc_type here: the default passed to es.bulk() below
                # (LINE) applies.
                yield es.index_op(total)

                # Because needles_by_line holds a reference, total is not
                # garbage collected. Since we won't use it again, we can clear
                # the contents, saving substantial memory on long files.
                total.clear()

    # Indexing a 277K-line file all in one request makes ES time out (>60s),
    # so we chunk it up. 300 docs is optimal according to the benchmarks in
    # https://bugzilla.mozilla.org/show_bug.cgi?id=1122685. So large docs like
    # images don't make our chunk sizes ridiculous, there's a size ceiling as
    # well: 10000 is based on the 300 and an average of 31 chars per line.
    for chunk in bulk_chunks(docs(), docs_per_chunk=300,
                             bytes_per_chunk=10000):
        es.bulk(chunk, index=index, doc_type=LINE)

Exemple #22

0

Afficher le fichier

Fichier : utils_tests.py Projet : pyelasticsearch/pyelasticsearch

 def test_over(self):
     """Action iterators spanning more than one chunk are split correctly."""
     produced = list(bulk_chunks(self.str_xrange(7), docs_per_chunk=3))
     expected = [['0', '1', '2'], ['3', '4', '5'], ['6']]
     self.assertEqual(produced, expected)

Exemple #23

0

Afficher le fichier

Fichier : utils_tests.py Projet : yy1117/pyelasticsearch

 def test_on(self):
     """Action iterators ending exactly on a chunk boundary are handled."""
     produced = list(bulk_chunks(self.str_xrange(4), docs_per_chunk=2))
     expected = [['0', '1'], ['2', '3']]
     eq_(produced, expected)

Exemple #24

0

Afficher le fichier

Fichier : utils_tests.py Projet : yy1117/pyelasticsearch

 def test_under(self):
     """Action iterators shorter than one chunk yield a single short chunk."""
     # str_xrange(1) produces just '0'.
     produced = list(bulk_chunks(self.str_xrange(1), docs_per_chunk=2))
     expected = [['0']]
     eq_(produced, expected)

Exemple #25

0

Afficher le fichier

Fichier : para-to-es.py Projet : yubowen-ph/negative-examples-for-para

    Rthandler.setFormatter(formatter)
    logging.getLogger().addHandler(Rthandler)


def get_para_5m_raw_data():
    """Load the tab-separated sentence pairs stored at ``PATH``.

    Every line contributes two records built from its first two
    tab-separated fields, lower-cased: the first field with type
    ``'origin'`` and the second with type ``'para'``.

    Returns:
        A list of ``{'content': ..., 'type': ...}`` dicts, two per input
        line, in file order.

    Raises:
        IndexError: if a line contains no tab character.
    """
    examples = []
    # The context manager closes the file deterministically, and iterating
    # the handle avoids holding every raw line in memory at once.
    with io.open(PATH, 'r', encoding='utf-8') as corpus:
        for line in corpus:
            fields = line.split("\t")
            examples.append({'content': fields[0].lower(), 'type': 'origin'})
            examples.append({'content': fields[1].lower(), 'type': 'para'})
    return examples


def document(sentences):
    """Yield one ``es.index_op`` operation per sentence record.

    Only the ``'content'`` and ``'type'`` keys of each record are copied
    into the indexed document.
    """
    for record in sentences:
        yield es.index_op({'content': record['content'],
                           'type': record['type']})


if __name__ == '__main__':
    # init_log()
    sentences = get_para_5m_raw_data()
    # NOTE(review): doc_num must already be initialised at module level
    # before this loop runs -- confirm.
    for chunk in bulk_chunks(document(sentences),
                             docs_per_chunk=1000,
                             bytes_per_chunk=100000):
        es.bulk(chunk, doc_type='sentence', index='para-nmt-50m')
        # A chunk is capped by byte size as well as by document count, and
        # the last chunk is usually partial, so count what was really sent
        # instead of assuming 1000 documents per request.
        doc_num += len(chunk)
        print("indexed" + str(doc_num) + "docs")
        logging.info("indexed" + str(doc_num) + "docs")

Exemple #26

0

Afficher le fichier

Fichier : utils_tests.py Projet : pyelasticsearch/pyelasticsearch

 def test_under(self):
     """Action iterators shorter than one chunk yield a single short chunk."""
     # str_xrange(1) produces just '0'.
     produced = list(bulk_chunks(self.str_xrange(1), docs_per_chunk=2))
     expected = [['0']]
     self.assertEqual(produced, expected)

Exemple #27

0

Afficher le fichier

def get_historical_bar_to_db(ticker: str, bar_len: int, bar_unit: str,
                             num_bars: int):
    """Fetch historical interval bars for ``ticker`` and bulk-index them.

    Looks up the ``Instrument`` whose ``sym.raw`` matches the upper-cased
    ticker (creating and saving one when none is found), requests up to
    ``num_bars`` bars from the history connection and sends one bulk
    operation per bar to the ``beginning`` index as ``feed`` documents: an
    upsert when a bar with the same date, instrument and frequency is
    already indexed, a plain index operation otherwise.

    Args:
        ticker: Symbol to request bars for.
        bar_len: Interval length; also stored as the ``frequency`` field.
        bar_unit: Interval type passed to ``request_bars``.
        num_bars: Maximum number of bars to request.

    ``iq.NoDataError`` and ``iq.UnauthorizedError`` are caught and reported
    on stdout rather than propagated.
    """
    # Build the history connection and its listener exactly once.
    hist_conn = iq.HistoryConn(name="pyiqfeed-Example-historical-bars")
    hist_listener = iq.VerboseIQFeedListener("History Bar Listener")
    hist_conn.add_listener(hist_listener)

    Instrument.init()
    Feed.init()
    symbol = ticker.upper()
    instrument_list = Instrument.search().filter('term', **{
        'sym.raw': symbol
    }).execute()
    if instrument_list and len(instrument_list) > 0:
        instrument = instrument_list[0]
    else:
        instrument = Instrument()
        instrument.sym = symbol
        instrument.save()

    def documents():
        """Yield one bulk operation (update or index) per received bar."""
        frequency = bar_len
        with iq.ConnConnector([hist_conn]):
            # look at conn.py for request_bars, request_bars_for_days and
            # request_bars_in_period for other ways to specify time periods etc
            try:
                bars = hist_conn.request_bars(ticker=ticker,
                                              interval_len=bar_len,
                                              interval_type=bar_unit,
                                              max_bars=num_bars)
                for bar in bars:
                    day = parse(str(bar[0]))
                    # Keep only the digits of the time field.
                    # NOTE(review): assumes bar[1] renders as a count of
                    # microseconds since midnight -- confirm against the
                    # feed's bar layout.
                    timestamp = int(re.sub(r'\D', '', str(bar[1])))
                    seconds_of_day = timestamp / 1000000
                    minute = int(seconds_of_day % 3600 / 60)
                    hour = int(seconds_of_day / 3600)
                    second = int(seconds_of_day - hour * 3600 - minute * 60)
                    date = datetime(day.year, day.month, day.day, hour,
                                    minute, second)
                    feed = {
                        'instrument_id': instrument.id,
                        'frequency': frequency,
                        'date': date,
                        'high': float(bar[2]),
                        'low': float(bar[3]),
                        'open': float(bar[4]),
                        'close': float(bar[5]),
                        'volume': float(bar[6])
                    }
                    print(ticker, date, timestamp, feed)
                    bar_list = Feed.search().filter('term', date=date).filter(
                        'term', instrument_id=instrument.id).filter(
                            'term', frequency=frequency)
                    if bar_list and bar_list.count() > 0:
                        # Already indexed: update the existing document.
                        mydoc = bar_list.execute()[0]._id
                        yield es.update_op(doc=feed,
                                           id=mydoc,
                                           index='beginning',
                                           doc_type='feed',
                                           doc_as_upsert=True)
                    else:
                        # New bar: index and doc type come from the
                        # defaults given to es.bulk() below.
                        yield es.index_op(feed)
                print(len(bars))
                print("Last Bar Received")

            except (iq.NoDataError, iq.UnauthorizedError) as err:
                print("No data returned because {0}".format(err))

    for chunk in bulk_chunks(documents(),
                             docs_per_chunk=500,
                             bytes_per_chunk=10000):
        # We specify a default index and doc type here so we don't
        # have to repeat them in every operation:
        es.bulk(chunk, doc_type='feed', index='beginning')

Exemple #28

0

Afficher le fichier

Fichier : utils_tests.py Projet : yy1117/pyelasticsearch

 def test_none(self):
     """Empty action iterators produce no chunks at all."""
     produced = list(bulk_chunks(self.str_xrange(0), docs_per_chunk=2))
     expected = []
     eq_(produced, expected)

Exemple #29

0

Afficher le fichier

def index_all_sections(skip=0, merged=False):
    """Bulk-index everything returned by ``get_all_documents``.

    ``skip`` and ``merged`` are forwarded unchanged to
    ``get_all_documents``; the result is sent to ``TEST_INDEX_NAME`` with
    doc type ``'a'`` in chunks limited to 15E6 bytes.
    """
    documents = get_all_documents(skip=skip, merged=merged)
    for batch in bulk_chunks(documents, bytes_per_chunk=15E6):
        es.bulk(batch, doc_type='a', index=TEST_INDEX_NAME)

Exemple #30

0

Afficher le fichier

Fichier : utils_tests.py Projet : pyelasticsearch/pyelasticsearch

 def test_on(self):
     """Action iterators ending exactly on a chunk boundary are handled."""
     produced = list(bulk_chunks(self.str_xrange(4), docs_per_chunk=2))
     expected = [['0', '1'], ['2', '3']]
     self.assertEqual(produced, expected)

Exemple #31

0

Afficher le fichier

Fichier : downloader.py Projet : st421/congressional-record

    def __init__(self,start,**kwargs):
        """
        Invoke a Downloader object to get data from
        the Record. It will check to see if the necessary
        files are already downloaded and use those instead of
        querying FDSys. Downloaders are the endpoint for raw data.

        Required arguments:

        start : In form 'YYYY-MM-DD.' This is the day/start day you want.

        Optional arguments:

        parse : Defaults to True. This tells the downloader whether you just want
                the raw files, or if you also want it to extract data from the HTML.
                (Default means yes, give me the data.)
                In the 'yield' and 'noparse' modes the mode itself decides
                this value and any 'parse' argument is overridden.

        end : Same form as start. This is the end date.

        outpath : Output path RELATIVE TO the present working directory. Defaults
                  to 'output' and works fine when you run it from the repo's root
                  directory.

        do_mode : Specify what kind of data you want from the parser.
                  If do_mode is not set (or is not one of the values below),
                  the downloader will do absolutely zilch.
                  do_mode can take the following values:

                  json : write json files in a /json directory for that
                         day of the Record.

                  es : Specify the URL and index of an ElasticSearch cluster with
                       arguments es_url and index, and it will pass each file to
                       that cluster for indexing. WARNING: This doesn't handle any
                       mappings, and it doesn't check to see if records are already
                       there, so it will overwrite old files in the same index
                       WITHOUT versioning.

                       also specify:
                       es_url : ElasticSearch cluster url
                       index  : ElasticSearch cluster index

                  yield : For each day of the Record the user specifies,
                          the downloader acts like a generator, yielding that day's
                          "crfile" dictionary.

                  noparse : call bulkdownload with parse=False.
        """
        self.status = 'idle'
        logging.debug('Downloader object ready with params:')
        # Values are not necessarily strings (e.g. parse=False), so format
        # each pair instead of str.join()-ing it.
        logging.debug(','.join('{0}={1}'.format(key, value)
                               for key, value in kwargs.items()))
        # A missing do_mode is documented as "do nothing", so look it up
        # without raising KeyError. All keyword arguments (outpath included)
        # are forwarded to bulkdownload() as they are.
        do_mode = kwargs.get('do_mode')
        if do_mode == 'es':
            es = ElasticSearch(kwargs['es_url'])
            for chunk in bulk_chunks((es.index_op(crfile.crdoc,id=crfile.crdoc.pop('id')) for crfile
                                        in self.bulkdownload(start,**kwargs)),
                                        docs_per_chunk=100):
                es.bulk(chunk,index=kwargs['index'],doc_type='crdoc')
        elif do_mode == 'json':
            # outpath called so often to make it easy to follow
            # the idea that we're traversing a directory tree
            for crfile in self.bulkdownload(start,**kwargs):
                filename = os.path.split(crfile.filepath)[-1].split('.')[0] + '.json'
                outpath = os.path.split(crfile.filepath)[0]
                outpath = os.path.split(outpath)[0]
                if 'json' not in os.listdir(outpath):
                    os.mkdir(os.path.join(outpath,'json'))
                outpath = os.path.join(outpath,'json',filename)
                with open(outpath,'w') as out_json:
                    json.dump(crfile.crdoc,out_json)
        elif do_mode == 'yield':
            # Merge the forced value into the kwargs so a caller-supplied
            # 'parse' cannot trigger a duplicate-keyword TypeError.
            self.yielded = self.bulkdownload(start,**dict(kwargs,parse=True))
        elif do_mode == 'noparse':
            self.bulkdownload(start,**dict(kwargs,parse=False))
        # Any other (or missing) do_mode: nothing to do.

Exemple #32

0

Afficher le fichier

Fichier : utils_tests.py Projet : yy1117/pyelasticsearch

 def test_over(self):
     """Action iterators spanning more than one chunk are split correctly."""
     produced = list(bulk_chunks(self.str_xrange(7), docs_per_chunk=3))
     expected = [['0', '1', '2'], ['3', '4', '5'], ['6']]
     eq_(produced, expected)

Exemple #33

0

Afficher le fichier

Fichier : downloader.py Projet : nclarkjudd/congressionalrecord2

    def __init__(self,start,**kwargs):
        """
        Invoke a Downloader object to get data from
        the Record. It will check to see if the necessary
        files are already downloaded and use those instead of
        querying FDSys. Downloaders are the endpoint for raw data.

        Required arguments:

        start : In form 'YYYY-MM-DD.' This is the day/start day you want.

        Optional arguments:

        parse : Defaults to True. This tells the downloader whether you just want
                the raw files, or if you also want it to extract data from the HTML.
                (Default means yes, give me the data.)
                In the 'yield' and 'noparse' modes the mode itself decides
                this value and any 'parse' argument is overridden.

        end : Same form as start. This is the end date.

        outpath : Output path RELATIVE TO the present working directory. Defaults
                  to 'output' and works fine when you run it from the repo's root
                  directory.

        do_mode : Specify what kind of data you want from the parser.
                  If do_mode is not set (or is not one of the values below),
                  the downloader will do absolutely zilch.
                  do_mode can take the following values:

                  json : write json files in a /json directory for that
                         day of the Record.

                  es : Specify the URL and index of an ElasticSearch cluster with
                       arguments es_url and index, and it will pass each file to
                       that cluster for indexing. WARNING: This doesn't handle any
                       mappings, and it doesn't check to see if records are already
                       there, so it will overwrite old files in the same index
                       WITHOUT versioning.

                       also specify:
                       es_url : ElasticSearch cluster url
                       index  : ElasticSearch cluster index

                  yield : For each day of the Record the user specifies,
                          the downloader acts like a generator, yielding that day's
                          "crfile" dictionary.

                  noparse : call bulkdownload with parse=False.
        """

        self.status = 'idle'
        logging.debug('Downloader object ready with params:')
        # Values are not necessarily strings (e.g. parse=False), so format
        # each pair instead of str.join()-ing it.
        logging.debug(','.join('{0}={1}'.format(key, value)
                               for key, value in kwargs.items()))
        # A missing do_mode is documented as "do nothing", so look it up
        # without raising KeyError. All keyword arguments (outpath included)
        # are forwarded to bulkdownload() as they are.
        do_mode = kwargs.get('do_mode')
        if do_mode == 'es':
            es = ElasticSearch(kwargs['es_url'])
            for chunk in bulk_chunks((es.index_op(crfile.crdoc,id=crfile.crdoc.pop('id')) for crfile
                                        in self.bulkdownload(start,**kwargs)),
                                        docs_per_chunk=100):
                es.bulk(chunk,index=kwargs['index'],doc_type='crdoc')
        elif do_mode == 'json':
            # outpath called so often to make it easy to follow
            # the idea that we're traversing a directory tree
            for crfile in self.bulkdownload(start,**kwargs):
                filename = os.path.split(crfile.filepath)[-1].split('.')[0] + '.json'
                outpath = os.path.split(crfile.filepath)[0]
                outpath = os.path.split(outpath)[0]
                if 'json' not in os.listdir(outpath):
                    os.mkdir(os.path.join(outpath,'json'))
                outpath = os.path.join(outpath,'json',filename)
                with open(outpath,'w') as out_json:
                    json.dump(crfile.crdoc,out_json)
        elif do_mode == 'yield':
            # Merge the forced value into the kwargs so a caller-supplied
            # 'parse' cannot trigger a duplicate-keyword TypeError.
            self.yielded = self.bulkdownload(start,**dict(kwargs,parse=True))
        elif do_mode == 'noparse':
            self.bulkdownload(start,**dict(kwargs,parse=False))
        # Any other (or missing) do_mode: nothing to do.

Exemple #34

0

Afficher le fichier

Fichier : oracle_river.py Projet : zzwgit/oracle_river

def Load_Oracle_to_Elasticsearch(cursor, SQL, es, Cle_primaire, DocType, Index, MySQLCnx, NomBase, NomSchema, NomTable):
    """
    Run an SQL query, turn each result row into a document and bulk-index
    the documents into Elasticsearch.

    Rows are fetched in batches of 500000; progress and timings are printed
    to stdout. Once every batch has been indexed, the time at which the run
    started is recorded through ``Insertion_DTM_Exec`` and the cursor is
    closed.

    :param cursor: Oracle cursor
    :param SQL: SQL query to send to Oracle
    :param es: Elasticsearch connection object
    :param Cle_primaire: column whose value is used as the document id in Elasticsearch
    :param DocType: document type in Elasticsearch
    :param Index: name of the Elasticsearch index
    :param MySQLCnx: connection object for the MySQL administration database
    :param NomBase: name of the queried database
    :param NomSchema: name of the queried schema
    :param NomTable: name of the queried table
    """

    # date/time at which the Oracle query started
    req_time = datetime.now()

    debut_requete = datetime.now()
    print(">>> Execution de la requète : %s" % (debut_requete))
    # run the query
    cursor.execute(SQL)
    fin_requete = datetime.now()
    print("    -> durée du traitement : %s" % (fin_requete - debut_requete))

    # column headers of the result set
    colums = [x[0] for x in cursor.description]

    # First position of every column name, computed once instead of calling
    # list.index() for each column of each row. Keeping the FIRST position
    # means a duplicated column name still resolves to its first occurrence.
    positions = {}
    for position, name in enumerate(colums):
        positions.setdefault(name, position)

    morceau = 1
    while True:

        debut_fetch = datetime.now()
        print(">>> Début du fetch : %s" % (debut_fetch))
        results = cursor.fetchmany(500000)
        fin_fetch = datetime.now()
        print("    -> durée du traitement : %s" % (fin_fetch - debut_fetch))

        debut_morceau = datetime.now()
        print(">>> Traitement du morceau %d " % (morceau))

        if not results:
            break
        morceau += 1

        # Build the documents lazily, one per row, so a whole batch of dicts
        # is never held in memory alongside the fetched rows.
        container = (
            es.index_op(doc, id=doc[Cle_primaire])
            for doc in (
                {name: row[position] for name, position in positions.items()}
                for row in results
            )
        )

        # bulk insert into elasticsearch; chunks are capped at 10000 docs / 10MB
        for chunk in bulk_chunks(container, docs_per_chunk= 10000, bytes_per_chunk= 10 * 1024 * 1024):
            es.bulk(chunk,doc_type=DocType, index=Index)

        fin_morceau = datetime.now()
        print("    -> durée du traitement : %s" % (fin_morceau - debut_morceau))

    # record the date of this run in MySQL
    Insertion_DTM_Exec(MySQLCnx, NomBase, NomSchema, NomTable, req_time)

    # close the oracle cursor
    cursor.close()

Exemple #35

0

Afficher le fichier

Fichier : bulk_index_docs.py Projet : dapurv5/es-scripts

 def index(self):
   """Recreate the index, bulk-load all documents, then refresh it."""
   self.delete_index()
   self.create_index()
   # Send the documents in requests of at most 1000 operations each.
   batches = bulk_chunks(self.documents(), docs_per_chunk=1000)
   for batch in batches:
     self.es.bulk(batch, index=self.index_name, doc_type=self.doc_type)
   self.es.refresh(self.index_name)

Exemple #36

0

Afficher le fichier

Fichier : utils_tests.py Projet : pyelasticsearch/pyelasticsearch

 def test_none(self):
     """Empty action iterators produce no chunks at all."""
     produced = list(bulk_chunks(self.str_xrange(0), docs_per_chunk=2))
     expected = []
     self.assertEqual(produced, expected)