def index(all=False, flush_first=False, since=datetime.timedelta(minutes=10)):
    """Bulk-index scheduled-or-processing events into Elasticsearch.

    :param all: when true, every scheduled-or-processing event is indexed;
        otherwise only those whose ``modified`` falls within ``since`` of now.
    :param flush_first: when true, ``flush(es)`` and ``create(es)`` are called
        before anything is indexed.
    :param since: look-back window applied when ``all`` is false.
    """
    es = get_connection()

    if flush_first:
        flush(es)
        create(es)

    events = Event.objects.scheduled_or_processing()
    if not all:
        cutoff = timezone.now() - since
        events = events.filter(modified__gte=cutoff)

    target_index = get_index()
    # bulk_chunks() splits the operations into smaller requests for speed.
    chunks = pyelasticsearch.bulk_chunks(
        documents(events, es),
        docs_per_chunk=500,
        bytes_per_chunk=10000,
    )
    for chunk in chunks:
        es.bulk(chunk, doc_type=doc_type, index=target_index)

    es.refresh(target_index)
def index_items(self, index_documents):
    """
    Bulk-index the given index_documents.

    The documents are walked in order. Each run of consecutive documents
    sharing a ``doc_type`` is split by ``bulk_chunks`` (bounded by
    ``self.bulk_index_docs_per_chunk`` and
    ``self.bulk_index_bytes_per_chunk``), and every chunk is sent with the
    ``bulk`` method of the shared connection's elasticsearch client.
    Afterwards the connection is refreshed when the
    ``IEVV_ELASTICSEARCH_AUTOREFRESH_AFTER_INDEXING`` setting is truthy.

    Parameters:
        index_documents: An iterable of :class:`.AbstractDocument`.
    """
    connection = search.Connection.get_instance()
    runs_by_doc_type = itertools.groupby(
        index_documents, key=lambda document: document.doc_type)

    # NOTE: We should be able to let AbstractDocument.get_index_op_kwargs()
    # include the doc_type and drop the groupby() (sending each chunk with
    # only ``index=self.name``), but that randomly raises an exception
    # complaining about missing type.
    for doc_type, documents_in_run in runs_by_doc_type:
        chunks = bulk_chunks(
            self._iterate_index_operations(documents_in_run),
            docs_per_chunk=self.bulk_index_docs_per_chunk,
            bytes_per_chunk=self.bulk_index_bytes_per_chunk)
        for chunk in chunks:
            connection.elasticsearch.bulk(
                chunk, index=self.name, doc_type=doc_type)

    autorefresh = getattr(
        settings, 'IEVV_ELASTICSEARCH_AUTOREFRESH_AFTER_INDEXING', False)
    if autorefresh:
        connection.refresh()
def perform_bulk_index(host, index_name, doc_type, doc_fetch, docs_per_chunk,
                       bytes_per_chunk, parallel):
    """Upload the fetched documents in chunks, running uploads in parallel.

    :param host: passed through to ``local_bulk`` for every chunk
    :param index_name: passed through to ``local_bulk`` for every chunk
    :param doc_type: passed through to ``local_bulk`` for every chunk
    :param doc_fetch: callable invoked once, without arguments, to obtain
        the documents
    :param docs_per_chunk: document limit handed to ``bulk_chunks``
    :param bytes_per_chunk: byte limit handed to ``bulk_chunks``
    :param parallel: ``n_jobs`` value for ``Parallel``
    """
    runner = Parallel(n_jobs=parallel)
    chunks = bulk_chunks(doc_fetch(),
                         docs_per_chunk=docs_per_chunk,
                         bytes_per_chunk=bytes_per_chunk)
    jobs = (delayed(local_bulk)(host, index_name, doc_type, chunk)
            for chunk in chunks)
    runner(jobs)
def test_bytes_first_too_big(self):
    """
    A first item that exceeds the byte limit by itself must come out as
    its own chunk, with no empty chunk emitted ahead of it.
    """
    expected = [['chimpanzees'], ['hi', 'ho']]
    produced = list(bulk_chunks(['chimpanzees', 'hi', 'ho'],
                                bytes_per_chunk=6))
    self.assertEqual(produced, expected)
def test_bytes_first_too_big(self):
    """
    A first item that exceeds the byte limit by itself must come out as
    its own chunk, with no empty chunk emitted ahead of it.
    """
    expected = [['chimpanzees'], ['hi', 'ho']]
    produced = list(bulk_chunks(['chimpanzees', 'hi', 'ho'],
                                bytes_per_chunk=6))
    eq_(produced, expected)
def test_bytes(self):
    """
    Check chunking driven by the byte limit.

    The last document of a chunk is not allowed to overshoot the limit.
    """
    expected = [['o', 'hi'], ['good'], ['chimpanzees']]
    produced = list(bulk_chunks(['o', 'hi', 'good', 'chimpanzees'],
                                bytes_per_chunk=5))
    self.assertEqual(produced, expected)
def test_bytes(self):
    """
    Check chunking driven by the byte limit.

    The last document of a chunk is not allowed to overshoot the limit.
    """
    expected = [['o', 'hi'], ['good'], ['chimpanzees']]
    produced = list(bulk_chunks(['o', 'hi', 'good', 'chimpanzees'],
                                bytes_per_chunk=5))
    eq_(produced, expected)
def create(self, datacsv, confDir="../data/"): with open(os.path.join(confDir, "es_settings.json")) as jf: settings = json.load(jf) self.eserver.create_index(index='geonames', settings=settings) for chunk in bulk_chunks(self._opLoader(datacsv, confDir), docs_per_chunk=1000): self.eserver.bulk(chunk, index='geonames', doc_type='places') print "..", self.eserver.refresh('geonames')
def build_org(es, orm, Org, Orgalias):
    """Bulk-index one document per ``Org`` row returned by ``orm.query``.

    Each row becomes an index operation built from ``org_doc(org)`` with the
    row's ``org_id`` as document id. ``Orgalias`` is accepted but not used
    here.
    """
    log.warning("Bulk adding org : start")

    operations = (
        es.index_op(org_doc(org), id=org.org_id)
        for org in orm.query(Org)
    )
    chunks = pyelasticsearch.bulk_chunks(
        operations, docs_per_chunk=500, bytes_per_chunk=10000)
    for chunk in chunks:
        es.bulk(chunk, doc_type=es_doc_type, index=es_index)

    log.warning("Bulk adding org : end")
def _insert_multiple_docs(self, docs, index_name):
    """
    Index several documents at once, at most 1000 per request

    Every chunk is first handed to ``self._construct_bulk_insert_ops`` and
    then sent with ``self.client.bulk_index``.

    Parameters
    ----------
    docs: list
        List of dictionaries with new data
    index_name: str
        Name of the index that contains inserted documents
    """
    batches = bulk_chunks(docs, docs_per_chunk=1000)
    for batch in batches:
        self._construct_bulk_insert_ops(batch)
        self.client.bulk_index(index=index_name, docs=batch)
def run(self):
    """Send binlog actions to Elasticsearch, marking the binlog per request.

    With ``self.bulk_size`` below 2 every action goes out in its own bulk
    request; otherwise actions are grouped into chunks of ``bulk_size``
    documents. A KeyboardInterrupt ends the run silently. Any other
    exception is logged, mailed through ``self.send_email`` and re-raised.
    """
    try:
        if self.bulk_size < 2:
            batches = ([action] for action in self.proc_binlog())
        else:
            batches = bulk_chunks(self.proc_binlog(),
                                  docs_per_chunk=self.bulk_size)
        for batch in batches:
            self.es.bulk(batch)
            self.mark_binlog()
    except KeyboardInterrupt:
        pass
    except Exception:
        import traceback
        details = traceback.format_exc()
        logging.error(details)
        self.send_email(msg=details)
        raise
def perform_bulk_index(host, index_name, doc_type, doc_fetch, docs_per_chunk,
                       bytes_per_chunk, parallel):
    """
    Split the fetched documents into chunks and bulk-upload them to
    Elasticsearch, several uploads at a time.

    :param host: the target Elasticsearch host
    :param index_name: the target index name
    :param doc_type: the target document type
    :param doc_fetch: a function to call to fetch documents
    :param docs_per_chunk: the number of documents per chunk to upload
    :param bytes_per_chunk: the max bytes per chunk to upload
    :param parallel: the number of bulk uploads to do at the same time
    """
    runner = Parallel(n_jobs=parallel)
    chunks = bulk_chunks(doc_fetch(),
                         docs_per_chunk=docs_per_chunk,
                         bytes_per_chunk=bytes_per_chunk)
    jobs = (delayed(local_bulk)(host, index_name, doc_type, chunk)
            for chunk in chunks)
    runner(jobs)
def create(self, datacsv, confDir="../data/"): with open(os.path.join(confDir, "es_settings.json")) as jf: settings = json.load(jf) settings['mappings'][self._doctype] = settings['mappings'].pop( 'places') try: self.eserver.create_index(index=self._index, settings=settings) except: self.eserver.delete_index(self._index) self.eserver.create_index(index=self._index, settings=settings) for chunk in bulk_chunks(self._opLoader(datacsv, confDir), docs_per_chunk=1000): self.eserver.bulk(chunk, index=self._index, doc_type=self._doctype) print "..", self.eserver.refresh(self._index)
def index(all=False, flush_first=False, since=datetime.timedelta(minutes=10)):
    """Bulk-index scheduled-or-processing events into Elasticsearch.

    :param all: when true, every scheduled-or-processing event is indexed;
        otherwise only those whose ``modified`` falls within ``since`` of now.
    :param flush_first: when true, ``flush(es)`` and ``create(es)`` are called
        before anything is indexed.
    :param since: look-back window applied when ``all`` is false.
    """
    es = get_connection()

    if flush_first:
        flush(es)
        create(es)

    events = Event.objects.scheduled_or_processing()
    if not all:
        oldest_allowed = timezone.now() - since
        events = events.filter(modified__gte=oldest_allowed)

    index_name = get_index()
    # bulk_chunks() splits the operations into smaller requests for speed.
    operations = documents(events, es)
    for batch in pyelasticsearch.bulk_chunks(operations,
                                             docs_per_chunk=500,
                                             bytes_per_chunk=10000):
        es.bulk(batch, doc_type=doc_type, index=index_name)

    es.refresh(index_name)
def _save_internal_transactions(self, blocks_traces):
    """
    Store the given transactions in the database, chunk by chunk

    Transactions with a falsy "transactionHash" are left out. The rest are
    preprocessed and sent with ``bulk_index`` in chunks limited by
    BYTES_PER_CHUNK.

    Parameters
    ----------
    blocks_traces : list
        List of transactions to save
    """
    docs = []
    for transaction in blocks_traces:
        if transaction["transactionHash"]:
            docs.append(self._preprocess_internal_transaction(transaction))

    if not docs:
        return

    for batch in bulk_chunks(docs, None, BYTES_PER_CHUNK):
        self.client.bulk_index(
            docs=batch,
            index=self.indices["internal_transaction"],
            doc_type="itx",
            id_field="hash",
            refresh=True)
def get_level_1_quotes_and_trades(ticker: str, seconds: int):
    """Watch level 1 updates for ``ticker`` for ``seconds`` seconds.

    The inner ``documents()`` opens a quote connection with a verbose
    listener, selects every update field, watches ``ticker`` and sleeps for
    ``seconds``. Its result is then fed to ``bulk_chunks`` and each chunk is
    sent with ``es.bulk`` (doc type 'feed', index 'beginning').

    NOTE(review): ``documents()`` contains no ``yield``, so it returns None
    rather than an iterable of bulk operations -- confirm what should be
    indexed here.
    """
    def documents():
        """Get level 1 quotes and trades for ticker for seconds seconds."""
        # NOTE(review): quote_conn is created twice; only the second
        # instance gets the listener and is connected below.
        quote_conn = iq.QuoteConn(name="pyiqfeed-Example-lvl1")
        quote_listener = iq.VerboseQuoteListener("Level 1 Listener")
        quote_conn = iq.QuoteConn(name="pyiqfeed-Example-lvl1")
        quote_conn.add_listener(quote_listener)
        with iq.ConnConnector([quote_conn]) as connector:
            # Subscribe to every field name known to quote_msg_map.
            all_fields = sorted(list(iq.QuoteConn.quote_msg_map.keys()))
            quote_conn.select_update_fieldnames(all_fields)
            quote_conn.watch(ticker)
            import time
            # Keep the connection open while updates arrive.
            time.sleep(seconds)
            #quote_conn.unwatch(ticker)
            #quote_conn.remove_listener(quote_listener)

    for chunk in bulk_chunks(documents(), docs_per_chunk=500, bytes_per_chunk=10000):
        # We specify a default index and doc type here so we don't
        # have to repeat them in every operation:
        es.bulk(chunk, doc_type='feed', index='beginning')
def index_file(tree, tree_indexers, path, es, index):
    """Index a single file into ES.

    One document of type FILE is emitted for the file itself. When the
    contents are text and the path is not a symlink, one more document is
    emitted per line. Everything is sent with ``es.bulk`` in chunks.

    For the moment, we execute plugins in series, figuring that we have plenty
    of files to keep our processors busy in most trees that take very long. I'm
    a little afraid of the cost of passing potentially large TreesToIndex to
    worker processes. That goes at 52MB/s on my OS X laptop, measuring by the
    size of the pickled object and including the pickling and unpickling time.

    :arg tree: Supplies ``source_encoding`` and ``source_folder`` for the file
    :arg tree_indexers: Iterable of objects whose
        ``file_to_index(rel_path, contents)`` returns the per-plugin indexer
        for this file
    :arg path: Absolute path to the file to index
    :arg es: ES client used to build (``index_op``) and send (``bulk``) the
        documents
    :arg index: The ES index name

    Returns None without indexing anything when ``path`` is a symlink whose
    target does not exist.

    """
    try:
        contents = unicode_contents(path, tree.source_encoding)
    except IOError as exc:
        if exc.errno == ENOENT and islink(path):
            # It's just a bad symlink (or a symlink that was swiped out
            # from under us--whatever)
            return
        else:
            raise

    # Just like index_folders, if the path is not in UTF-8, then elasticsearch
    # will not accept the path, so just move on.
    rel_path = relpath(path, tree.source_folder)
    is_text = isinstance(contents, unicode)
    is_link = islink(path)
    # Index by line if the contents are text and the path is not a symlink.
    index_by_line = is_text and not is_link
    if index_by_line:
        lines = split_content_lines(contents)
        num_lines = len(lines)
        # One accumulator per line, filled in by the plugins below:
        needles_by_line = [{} for _ in xrange(num_lines)]
        annotations_by_line = [[] for _ in xrange(num_lines)]
        refses, regionses = [], []
    needles = {}
    linkses = []

    for tree_indexer in tree_indexers:
        file_to_index = tree_indexer.file_to_index(rel_path, contents)
        if file_to_index.is_interesting():
            # Per-file stuff:
            append_update(needles, file_to_index.needles())
            if not is_link:
                linkses.append(file_to_index.links())

            # Per-line stuff:
            if index_by_line:
                refses.append(file_to_index.refs())
                regionses.append(file_to_index.regions())
                append_update_by_line(needles_by_line,
                                      file_to_index.needles_by_line())
                append_by_line(annotations_by_line,
                               file_to_index.annotations_by_line())

    def docs():
        """Yield documents for bulk indexing.

        Big Warning: docs also clears the contents of all elements of
        needles_by_line because they will no longer be used.
        """
        # Index a doc of type 'file' so we can build folder listings.
        # At the moment, we send to ES in the same worker that does the
        # indexing. We could interpose an external queueing system, but I'm
        # willing to potentially sacrifice a little speed here for the easy
        # management of self-throttling.
        file_info = stat(path)
        folder_name, file_name = split(rel_path)
        # Hard-code the keys that are hard-coded in the browse()
        # controller. Merge with the pluggable ones from needles:
        doc = dict(# Some non-array fields:
                   folder=folder_name,
                   name=file_name,
                   size=file_info.st_size,
                   is_folder=False,

                   # And these, which all get mashed into arrays:
                   **needles)
        links = dictify_links(chain.from_iterable(linkses))
        if links:
            doc['links'] = links
        # The file-level doc overrides the default LINE doc type passed to
        # es.bulk() at the bottom of index_file.
        yield es.index_op(doc, doc_type=FILE)

        # Index all the lines.
        if index_by_line:
            for total, annotations_for_this_line, tags in izip(
                    needles_by_line,
                    annotations_by_line,
                    es_lines(finished_tags(lines,
                                           chain.from_iterable(refses),
                                           chain.from_iterable(regionses)))):
                # Duplicate the file-wide needles into this line:
                total.update(needles)

                # We bucket tags into refs and regions for ES because later at
                # request time we want to be able to merge them individually
                # with those from skimmers.
                refs_and_regions = bucket(tags, lambda index_obj: "regions" if
                    isinstance(index_obj['payload'], basestring) else "refs")
                if 'refs' in refs_and_regions:
                    total['refs'] = refs_and_regions['refs']
                if 'regions' in refs_and_regions:
                    total['regions'] = refs_and_regions['regions']
                if annotations_for_this_line:
                    total['annotations'] = annotations_for_this_line
                yield es.index_op(total)

                # Because needles_by_line holds a reference, total is not
                # garbage collected. Since we won't use it again, we can clear
                # the contents, saving substantial memory on long files.
                total.clear()

    # Indexing a 277K-line file all in one request makes ES time out (>60s),
    # so we chunk it up. 300 docs is optimal according to the benchmarks in
    # https://bugzilla.mozilla.org/show_bug.cgi?id=1122685. So large docs like
    # images don't make our chunk sizes ridiculous, there's a size ceiling as
    # well: 10000 is based on the 300 and an average of 31 chars per line.
    for chunk in bulk_chunks(docs(), docs_per_chunk=300, bytes_per_chunk=10000):
        es.bulk(chunk, index=index, doc_type=LINE)
def bg_get_hist_mult(symbols, interval, maxdatapoints, datadirection=0, requestid='', datapointspersend='', intervaltype='', loop=False):
    """Request historical bars for each symbol over a socket and index them.

    For every symbol the function:

    * upper-cases it and looks up the matching ``Instrument`` (creating and
      saving one when none is found);
    * loads up to ``maxdatapoints`` stored ``Feed`` rows for ``interval``,
      newest first, and remembers all but the first of them in
      ``tickerdict`` so they are not written again;
    * sends a ``HIX`` command built from the arguments over a TCP connection
      to localhost:9100 and reads the reply line by line until
      ``!ENDMSG!``;
    * for each received bar that is not in ``tickerdict`` and is dated
      before now, appends a CSV line to ``logs\\<symbol>_hist.csv``, yields
      an update or index operation for Elasticsearch and stores the bar in
      the ``data`` DataFrame.

    The yielded operations are sent with ``es.bulk`` in chunks. Unless
    ``loop`` is true, a single pass over ``symbols`` is made.

    Returns the ``data`` DataFrame built for the last symbol processed, or
    None when any exception escapes to the outer handler (it is swallowed).

    NOTE(review): this is Python 2 code (``dict.has_key``, ``str`` passed to
    ``sendall``). The nesting below was reconstructed from a
    whitespace-mangled copy -- confirm against the original file.
    """
    try:
        # The IP address or hostname of your reader
        READER_HOSTNAME = 'localhost'
        # The TCP port specified in Speedway Connect
        READER_PORT = 9100
        # Define the size of the buffer that is used to receive data.
        # NOTE(review): BUFFER_SIZE is never read in this function.
        BUFFER_SIZE = 1024
        # Open a socket connection to the reader
        # NOTE(review): the socket is never closed in this function.
        s = socket.create_connection((READER_HOSTNAME, READER_PORT))
        # Set the socket to non-blocking
        #s.setblocking(0)
        # Make a file pointer from the socket, so we can read lines
        fs = s.makefile()
        # Bars already known, keyed by instrument/interval/date-to-the-minute.
        tickerdict = dict()
        while 1:
            for symbol in symbols:
                symbol = symbol.upper()
                pass  #print 'Getting ', symbol
                instrument_list = Instrument.search().filter(
                    'term', **{
                        'sym.raw': symbol
                    }).execute()
                if instrument_list and len(instrument_list) > 0:
                    instrument = instrument_list[0]
                    pass  #print instrument.id, symbol
                else:
                    instrument = Instrument()
                    instrument.sym = symbol
                    instrument.save()
                # NOTE(review): json_normalize is imported but not used here.
                from pandas.io.json import json_normalize
                feed_list = Feed.search().filter(
                    'term', frequency=interval).filter(
                        'term', instrument_id=instrument.id).sort('-date')
                feed_list = feed_list[:int(maxdatapoints)]
                res = []
                index = 0
                for feed in feed_list:
                    index += 1
                    quote = {
                        'Date': feed.date,
                        'Open': feed.open,
                        'High': feed.high,
                        'Low': feed.low,
                        'Close': feed.close,
                        'Volume': feed.volume
                    }
                    mykey = "%s|%s|%s|%s|%s|%s|%s" % (
                        instrument.id, interval, feed.date.year,
                        feed.date.month, feed.date.day, feed.date.hour,
                        feed.date.minute)
                    # The first (newest) stored bar is left out of
                    # tickerdict, so it will be written again if received.
                    if index > 1:
                        tickerdict[mykey] = quote
                cmd = "HIX,%s,%s,%s,%s,%s,%s,%s\r\n" % (
                    symbol, interval, maxdatapoints, datadirection, requestid,
                    datapointspersend, intervaltype)
                s.sendall(cmd)
                # Rebuilt per symbol; only the last symbol's frame is returned.
                data = pd.DataFrame({}, columns=[
                    'Date', 'Open', 'High', 'Low', 'Close', 'Volume',
                    'TotalVolume'
                ]).set_index('Date')

                def documents():
                    # Generator of bulk operations for the reply to the
                    # command just sent; ends when '!ENDMSG!' is read.
                    i = 0
                    while 1:
                        i += 1
                        try:
                            line = fs.readline()
                            # If data was received, pass #print it
                            if (len(line)):
                                pass  #pass #print line
                            fields = line.strip().split(',')
                            '''
                            Format Notes
                            Request ID Text This field will only exist if the request specified a RequestID. If not specified in the request, the first field in each message will be the Timestamp.
                            Time Stamp CCYY-MM-DD HH:MM:SS Example: 2008-09-01 16:00:01
                            High Decimal Example: 146.2587
                            Low Decimal Example: 145.2587
                            Open Decimal Example: 146.2587
                            Close Decimal Example: 145.2587
                            Total Volume Integer Example: 1285001
                            Period Volume Integer Example: 1285
                            Number of Trades Integer Example: 10000 - Will be zero for all requests other than tick interval requests
                            Example data:
                            Request: HIX,GOOG,60,10<CR><LF>
                            2013-08-12 13:44:00,886.0680,886.0680,886.0680,886.0680,1010550,200,0,<CR><LF>
                            '''
                            if fields[0] == '!ENDMSG!':
                                #s.close()
                                #time.sleep(1)
                                pass  #print 'Done',symbol
                                break
                                #return data
                            else:
                                pass  #pass #print line
                                date = fields[0]
                                high = float(fields[1])
                                low = float(fields[2])
                                open_price = float(fields[3])
                                close_price = float(fields[4])
                                total_volume = float(fields[5])
                                volume = float(fields[6])
                                trades = fields[7]
                                if date:
                                    date = dateutil.parser.parse(date)
                                    #date=eastern.localize(date,is_dst=True)
                                    pass  #pass #print date
                                    quote = {
                                        'Date': date,
                                        'Open': open_price,
                                        'High': high,
                                        'Low': low,
                                        'Close': close_price,
                                        'Volume': volume,
                                        'TotalVolume': total_volume,
                                        #'wap':WAP,
                                    }
                                    frequency = interval
                                    feed = {
                                        'instrument_id': instrument.id,
                                        'frequency': frequency,
                                        'date': date,
                                        'open': quote['Open'],
                                        'high': quote['High'],
                                        'low': quote['Low'],
                                        'close': quote['Close'],
                                        'volume': quote['Volume']
                                    }
                                    mykey = "%s|%s|%s|%s|%s|%s|%s" % (
                                        instrument.id, interval, date.year,
                                        date.month, date.day, date.hour,
                                        date.minute)
                                    if not tickerdict.has_key(mykey):
                                        # Only bars dated before now are
                                        # logged and indexed.
                                        if date < datetime.now():
                                            with open(
                                                    'logs\\' + symbol +
                                                    '_hist.csv',
                                                    'a') as outfile:
                                                log = "%s,%s,%s,%s,%s,%s,%s\r\n" % (
                                                    date, symbol,
                                                    str(quote['Open']),
                                                    str(quote['High']),
                                                    str(quote['Low']),
                                                    str(quote['Close']),
                                                    str(quote['Volume']))
                                                pass  #print 'logging ',symbol
                                                outfile.write(log)
                                                # Redundant: the with block
                                                # closes the file as well.
                                                outfile.close()
                                            bar_list = Feed.search(
                                            ).filter(
                                                'term', date=date
                                            ).filter(
                                                'term',
                                                instrument_id=instrument.id
                                            ).filter('term',
                                                     frequency=frequency)
                                            if bar_list and bar_list.count(
                                            ) > 0:
                                                # A stored bar exists for
                                                # this date: update it.
                                                tickerdict[mykey] = quote
                                                pass  #print 'update', symbol
                                                mydoc = bar_list.execute(
                                                )[0]._id
                                                yield es.update_op(
                                                    doc=feed,
                                                    id=mydoc,
                                                    index='beginning',
                                                    doc_type='feed',
                                                    doc_as_upsert=True)
                                            else:
                                                tickerdict[mykey] = quote
                                                pass  #print 'insert', symbol
                                                yield es.index_op(feed)
                                            #saveQuote(symbol, instrument, interval, quote)
                                            #self.saveQuote(dbcontract, quote)
                                            data.loc[date] = [
                                                open_price, high, low,
                                                close_price, volume,
                                                total_volume
                                            ]
                                            pass  #pass #print date,high,low,open,close,volume,total_volume,trades
                        except Exception as e:
                            # NOTE(review): the error is logged and the read
                            # loop continues; a line that keeps failing to
                            # parse (e.g. an empty read) never ends the loop.
                            logging.error("get_btcfeed", exc_info=True)

                for chunk in bulk_chunks(documents(),
                                         docs_per_chunk=500,
                                         bytes_per_chunk=10000):
                    # We specify a default index and doc type here so we don't
                    # have to repeat them in every operation:
                    es.bulk(chunk, doc_type='feed', index='beginning')
            if not loop:
                break
        return data
    except Exception as e:
        # NOTE(review): every error is swallowed here, so the caller gets
        # None with no indication of what failed.
        pass  #print e
"coordinates" : coords, # 4, 5 "feature_class" : row[6], "feature_code" : row[7], "country_code2" : row[8], "country_code3" : country_code3, "cc2" : row[9], "admin1_code" : row[10], "admin2_code" : row[11], "admin3_code" : row[12], "admin4_code" : row[13], "population" : row[14], "elevation" : row[15], "dem" : row[16], "timzeone" : row[17], "modification_date" : "2014-01-01" } yield es.index_op(doc, index='geonames', doc_type='geoname') except: count += 1 print 'Exception count:', count chunk_count = 0 for chunk in bulk_chunks(documents(reader, es), docs_per_chunk=500): es.bulk(chunk) chunk_count += 1 print 'Chunk count:', chunk_count es.refresh('geonames')
"coordinates": coords, # 4, 5 "feature_class": row[6], "feature_code": row[7], "country_code2": row[8], "country_code3": country_code3, "cc2": row[9], "admin1_code": row[10], "admin2_code": row[11], "admin3_code": row[12], "admin4_code": row[13], "population": row[14], "elevation": row[15], "dem": row[16], "timzeone": row[17], "modification_date": "2014-01-01" } yield es.index_op(doc, index='geonames', doc_type='geoname') except: count += 1 print 'Exception count:', count chunk_count = 0 for chunk in bulk_chunks(documents(reader, es), docs_per_chunk=500): es.bulk(chunk) chunk_count += 1 print 'Chunk count:', chunk_count es.refresh('geonames')
def index_file(tree, tree_indexers, path, es, index):
    """Index a single file into ES.

    One document of type FILE is emitted for the file itself. When the
    contents are text and the path is not a symlink, one more document is
    emitted per line. Everything is sent with ``es.bulk`` in chunks.

    For the moment, we execute plugins in series, figuring that we have plenty
    of files to keep our processors busy in most trees that take very long. I'm
    a little afraid of the cost of passing potentially large TreesToIndex to
    worker processes. That goes at 52MB/s on my OS X laptop, measuring by the
    size of the pickled object and including the pickling and unpickling time.

    :arg tree: Supplies ``source_encoding`` and ``source_folder`` for the file
    :arg tree_indexers: Iterable of objects whose
        ``file_to_index(rel_path, contents)`` returns the per-plugin indexer
        for this file
    :arg path: Bytestring absolute path to the file to index
    :arg es: ES client used to build (``index_op``) and send (``bulk``) the
        documents
    :arg index: The ES index name

    Returns None without indexing anything when ``path`` is a symlink whose
    target does not exist.

    """
    try:
        contents = unicode_contents(path, tree.source_encoding)
    except IOError as exc:
        if exc.errno == ENOENT and islink(path):
            # It's just a bad symlink (or a symlink that was swiped out
            # from under us--whatever)
            return
        else:
            raise

    # Just like index_folders, if the path is not in UTF-8, then elasticsearch
    # will not accept the path, so just move on.
    rel_path = relpath(path, tree.source_folder)
    is_text = isinstance(contents, unicode)
    is_link = islink(path)
    # Index by line if the contents are text and the path is not a symlink.
    index_by_line = is_text and not is_link
    if index_by_line:
        lines = split_content_lines(contents)
        num_lines = len(lines)
        # One accumulator per line, filled in by the plugins below:
        needles_by_line = [{} for _ in xrange(num_lines)]
        annotations_by_line = [[] for _ in xrange(num_lines)]
        refses, regionses = [], []
    needles = {}
    linkses = []

    for tree_indexer in tree_indexers:
        file_to_index = tree_indexer.file_to_index(rel_path, contents)
        if file_to_index.is_interesting():
            # Per-file stuff:
            append_update(needles, file_to_index.needles())
            if not is_link:
                linkses.append(file_to_index.links())

            # Per-line stuff:
            if index_by_line:
                refses.append(file_to_index.refs())
                regionses.append(file_to_index.regions())
                append_update_by_line(needles_by_line,
                                      file_to_index.needles_by_line())
                append_by_line(annotations_by_line,
                               file_to_index.annotations_by_line())

    def docs():
        """Yield documents for bulk indexing.

        Big Warning: docs also clears the contents of all elements of
        needles_by_line because they will no longer be used.
        """
        # Index a doc of type 'file' so we can build folder listings.
        # At the moment, we send to ES in the same worker that does the
        # indexing. We could interpose an external queueing system, but I'm
        # willing to potentially sacrifice a little speed here for the easy
        # management of self-throttling.
        file_info = stat(path)
        folder_name, file_name = split(rel_path)
        # Hard-code the keys that are hard-coded in the browse()
        # controller. Merge with the pluggable ones from needles:
        doc = dict(
            # Some non-array fields:
            folder=unicode_for_display(folder_name),
            name=unicode_for_display(file_name),
            size=file_info.st_size,
            is_folder=False,

            # And these, which all get mashed into arrays:
            **needles)
        links = dictify_links(chain.from_iterable(linkses))
        if links:
            doc['links'] = links
        # The file-level doc overrides the default LINE doc type passed to
        # es.bulk() at the bottom of index_file.
        yield es.index_op(doc, doc_type=FILE)

        # Index all the lines.
        if index_by_line:
            for total, annotations_for_this_line, tags in izip(
                    needles_by_line, annotations_by_line,
                    es_lines(
                        finished_tags(lines, chain.from_iterable(refses),
                                      chain.from_iterable(regionses)))):
                # Duplicate the file-wide needles into this line:
                total.update(needles)

                # We bucket tags into refs and regions for ES because later at
                # request time we want to be able to merge them individually
                # with those from skimmers.
                refs_and_regions = bucket(
                    tags, lambda index_obj: "regions" if isinstance(
                        index_obj['payload'], basestring) else "refs")
                if 'refs' in refs_and_regions:
                    total['refs'] = refs_and_regions['refs']
                if 'regions' in refs_and_regions:
                    total['regions'] = refs_and_regions['regions']
                if annotations_for_this_line:
                    total['annotations'] = annotations_for_this_line
                yield es.index_op(total)

                # Because needles_by_line holds a reference, total is not
                # garbage collected. Since we won't use it again, we can clear
                # the contents, saving substantial memory on long files.
                total.clear()

    # Indexing a 277K-line file all in one request makes ES time out (>60s),
    # so we chunk it up. 300 docs is optimal according to the benchmarks in
    # https://bugzilla.mozilla.org/show_bug.cgi?id=1122685. So large docs like
    # images don't make our chunk sizes ridiculous, there's a size ceiling as
    # well: 10000 is based on the 300 and an average of 31 chars per line.
    for chunk in bulk_chunks(docs(), docs_per_chunk=300, bytes_per_chunk=10000):
        es.bulk(chunk, index=index, doc_type=LINE)
def test_over(self):
    """An action iterator longer than one chunk is split into several."""
    expected = [['0', '1', '2'], ['3', '4', '5'], ['6']]
    produced = list(bulk_chunks(self.str_xrange(7), docs_per_chunk=3))
    self.assertEqual(produced, expected)
def test_on(self):
    """An action iterator ending exactly on a chunk boundary works."""
    expected = [['0', '1'], ['2', '3']]
    produced = list(bulk_chunks(self.str_xrange(4), docs_per_chunk=2))
    eq_(produced, expected)
def test_under(self):
    """An action iterator shorter than one chunk yields a single chunk."""
    single_action = self.str_xrange(1)  # just 0
    produced = list(bulk_chunks(single_action, docs_per_chunk=2))
    eq_(produced, [['0']])
Rthandler.setFormatter(formatter) logging.getLogger().addHandler(Rthandler) def get_para_5m_raw_data(): examples = [] lines = io.open(PATH, 'r', encoding='utf-8').readlines() for i in lines: s1 = i.split("\t")[0].lower() s2 = i.split("\t")[1].lower() examples.append({'content': s1, 'type': 'origin'}) examples.append({'content': s2, 'type': 'para'}) return examples def document(sentences): for s in sentences: dic = {'content': s['content'], 'type': s['type']} yield es.index_op(dic) if __name__ == '__main__': # init_log() sentences = get_para_5m_raw_data() for chunk in bulk_chunks(document(sentences), docs_per_chunk=1000, bytes_per_chunk=100000): es.bulk(chunk, doc_type='sentence', index='para-nmt-50m') doc_num += 1000 print("indexed" + str(doc_num) + "docs") logging.info("indexed" + str(doc_num) + "docs")
def test_under(self):
    """An action iterator shorter than one chunk yields a single chunk."""
    single_action = self.str_xrange(1)  # just 0
    produced = list(bulk_chunks(single_action, docs_per_chunk=2))
    self.assertEqual(produced, [['0']])
def get_historical_bar_to_db(ticker: str, bar_len: int, bar_unit: str,
                             num_bars: int):
    """Fetch historical interval bars for ``ticker`` and bulk-index them.

    The matching ``Instrument`` is looked up by upper-cased symbol and
    created when missing. Up to ``num_bars`` bars of ``bar_len``
    ``bar_unit`` are requested from the history connection. Each bar becomes
    an update operation when a ``Feed`` document already exists for the same
    date, instrument and frequency, and an index operation otherwise. The
    operations are sent with ``es.bulk`` in chunks (doc type 'feed', index
    'beginning').

    :param ticker: symbol to request bars for
    :param bar_len: interval length, also stored as the feed ``frequency``
    :param bar_unit: interval type passed to ``request_bars``
    :param num_bars: maximum number of bars to request
    """
    # Set the connection up once; a second, identical setup would only
    # discard the first connection object.
    hist_conn = iq.HistoryConn(name="pyiqfeed-Example-historical-bars")
    hist_listener = iq.VerboseIQFeedListener("History Bar Listener")
    hist_conn.add_listener(hist_listener)

    Instrument.init()
    Feed.init()

    symbol = ticker.upper()
    instrument_list = Instrument.search().filter('term', **{
        'sym.raw': symbol
    }).execute()
    if instrument_list and len(instrument_list) > 0:
        instrument = instrument_list[0]
    else:
        instrument = Instrument()
        instrument.sym = symbol
        instrument.save()

    def documents():
        """Yield one bulk operation per received bar."""
        with iq.ConnConnector([hist_conn]) as connector:
            # look at conn.py for request_bars, request_bars_for_days and
            # request_bars_in_period for other ways to specify time periods etc
            try:
                bars = hist_conn.request_bars(ticker=ticker,
                                              interval_len=bar_len,
                                              interval_type=bar_unit,
                                              max_bars=num_bars)
                for bar in bars:
                    date = parse(str(bar[0]))
                    # bar[1] is reduced to its digits and treated as a count
                    # of microseconds, which is split into h/m/s below.
                    timestamp = int(re.sub(r'\D', '', str(bar[1])))
                    hour, remainder = divmod(timestamp // 1000000, 3600)
                    minute, sec = divmod(remainder, 60)
                    date = datetime(date.year, date.month, date.day, hour,
                                    minute, sec)
                    frequency = bar_len
                    feed = {
                        'instrument_id': instrument.id,
                        'frequency': frequency,
                        'date': date,
                        'high': float(bar[2]),
                        'low': float(bar[3]),
                        'open': float(bar[4]),
                        'close': float(bar[5]),
                        'volume': float(bar[6])
                    }
                    print(ticker, date, timestamp, feed)
                    bar_list = Feed.search().filter('term', date=date).filter(
                        'term', instrument_id=instrument.id).filter(
                            'term', frequency=frequency)
                    if bar_list and bar_list.count() > 0:
                        # A document for this bar exists: update it in place.
                        mydoc = bar_list.execute()[0]._id
                        yield es.update_op(doc=feed,
                                           id=mydoc,
                                           index='beginning',
                                           doc_type='feed',
                                           doc_as_upsert=True)
                    else:
                        yield es.index_op(feed)
                print(len(bars))
                print("Last Bar Received")
            except (iq.NoDataError, iq.UnauthorizedError) as err:
                print("No data returned because {0}".format(err))

    for chunk in bulk_chunks(documents(),
                             docs_per_chunk=500,
                             bytes_per_chunk=10000):
        # We specify a default index and doc type here so we don't
        # have to repeat them in every operation:
        es.bulk(chunk, doc_type='feed', index='beginning')
def test_none(self):
    """An empty action iterator produces no chunks at all."""
    no_actions = self.str_xrange(0)
    produced = list(bulk_chunks(no_actions, docs_per_chunk=2))
    eq_(produced, [])
def index_all_sections(skip=0, merged=False):
    """Bulk-index everything ``get_all_documents`` yields into the test index.

    ``skip`` and ``merged`` are forwarded unchanged to ``get_all_documents``.
    Chunks are limited by size only (15E6 bytes) and sent as doc type 'a'.
    """
    operations = get_all_documents(skip=skip, merged=merged)
    for batch in bulk_chunks(operations, bytes_per_chunk=15E6):
        es.bulk(batch, doc_type='a', index=TEST_INDEX_NAME)
def test_on(self):
    """An action iterator ending exactly on a chunk boundary works."""
    expected = [['0', '1'], ['2', '3']]
    produced = list(bulk_chunks(self.str_xrange(4), docs_per_chunk=2))
    self.assertEqual(produced, expected)
def __init__(self,start,**kwargs):
    """
    Invoke a Downloader object to get data from
    the Record. It will check to see if the necessary
    files are already downloaded and use those instead of
    querying FDSys. Downloaders are the endpoint for raw data.

    Required arguments:

        start : In form 'YYYY-MM-DD.' This is the day/start day you want.

    Optional arguments:

        parse : Defaults to True. This tells the downloader whether you just want
                the raw files, or if you also want it to extract data from the HTML.
                (Default means yes, give me the data.)

        end : Same form as start. This is the end date.

        outpath : Output path RELATIVE TO the present working directory. Defaults
                  to 'output' and works fine when you run it from the repo's root
                  directory.

        do_mode : Specify what kind of data you want from the parser.
                  If do_mode is not set (or is not one of the values below),
                  the downloader will do absolutely zilch.
                  do_mode can take the following values:

                  json : write json files in a /json directory for that
                         day of the Record.

                  es : Specify the URL and index of an ElasticSearch cluster with
                       arguments es_url and index, and it will pass each file to
                       that cluster for indexing. WARNING: This doesn't handle any
                       mappings, and it doesn't check to see if records are already
                       there, so it will overwrite old files in the same index
                       WITHOUT versioning.

                       also specify:
                       es_url : ElasticSearch cluster url
                       index  : ElasticSearch cluster index

                  yield : For each day of the Record the user specifies,
                          the downloader acts like a generator, yielding that day's
                          "crfile" dictionary. The generator is stored on
                          ``self.yielded``.

                  noparse : Download only, calling bulkdownload with
                            parse=False.
    """
    self.status = 'idle'
    logging.debug('Downloader object ready with params:')
    # Values are not always strings (e.g. parse=True), so format each pair
    # instead of str.join-ing it, which would raise TypeError.
    logging.debug(','.join('%s=%s' % (key, value)
                           for key, value in kwargs.items()))

    # A missing do_mode means "do nothing", as documented above, rather
    # than a KeyError.
    do_mode = kwargs.get('do_mode')

    if do_mode == 'es':
        es = ElasticSearch(kwargs['es_url'])
        # The 'id' key is popped out of each crdoc and used as document id.
        operations = (es.index_op(crfile.crdoc, id=crfile.crdoc.pop('id'))
                      for crfile in self.bulkdownload(start, **kwargs))
        for chunk in bulk_chunks(operations, docs_per_chunk=100):
            es.bulk(chunk, index=kwargs['index'], doc_type='crdoc')
    elif do_mode == 'json':
        # outpath called so often to make it easy to follow
        # the idea that we're traversing a directory tree
        for crfile in self.bulkdownload(start, **kwargs):
            filename = os.path.split(crfile.filepath)[-1].split('.')[0] + '.json'
            outpath = os.path.split(crfile.filepath)[0]
            outpath = os.path.split(outpath)[0]
            if 'json' not in os.listdir(outpath):
                os.mkdir(os.path.join(outpath, 'json'))
            outpath = os.path.join(outpath, 'json', filename)
            with open(outpath, 'w') as out_json:
                json.dump(crfile.crdoc, out_json)
    elif do_mode == 'yield':
        self.yielded = self.bulkdownload(start, parse=True, **kwargs)
    elif do_mode == 'noparse':
        self.bulkdownload(start, parse=False, **kwargs)
def test_over(self):
    """An action iterator longer than one chunk is split into several."""
    expected = [['0', '1', '2'], ['3', '4', '5'], ['6']]
    produced = list(bulk_chunks(self.str_xrange(7), docs_per_chunk=3))
    eq_(produced, expected)
def __init__(self, start, **kwargs):
    """
    Invoke a Downloader object to get data from the Record.

    It will check to see if the necessary files are already downloaded
    and use those instead of querying FDSys. Downloaders are the
    endpoint for raw data.

    Required arguments:
        start : In form 'YYYY-MM-DD'. This is the day/start day you want.

    Optional arguments:
        parse : Defaults to True. This tells the downloader whether you
                just want the raw files, or if you also want it to
                extract data from the HTML. (Default means yes, give me
                the data.) The 'yield' and 'noparse' modes override it.
        end : Same form as start. This is the end date.
        outpath : Output path RELATIVE TO the present working directory.
                  Defaults to 'output' and works fine when you run it
                  from the repo's root directory.
        do_mode : Specify what kind of data you want from the parser.
                  If do_mode is not set (or is not one of the values
                  below), the downloader will do absolutely zilch.
                  do_mode can take the following values:
            json : write json files in a /json directory for that day
                   of the Record.
            es : Specify the URL and index of an ElasticSearch cluster
                 with arguments es_url and index, and it will pass each
                 file to that cluster for indexing.
                 WARNING: This doesn't handle any mappings, and it
                 doesn't check to see if records are already there, so
                 it will overwrite old files in the same index WITHOUT
                 versioning.
                 also specify:
                     es_url : ElasticSearch cluster url
                     index : ElasticSearch cluster index
            yield : For each day of the Record the user specifies, the
                    downloader acts like a generator, yielding that
                    day's "crfile" dictionary. The result of
                    bulkdownload() is stored on self.yielded.
            noparse : call bulkdownload() with parse=False.
    """
    self.status = 'idle'
    logging.debug('Downloader object ready with params:')
    # %-formatting instead of '='.join(): option values are not always
    # strings (e.g. parse=True) and str.join() raises TypeError on those.
    logging.debug(','.join('%s=%s' % (key, value)
                           for key, value in kwargs.items()))

    # .get() so that a missing do_mode is a no-op, as documented, rather
    # than a KeyError.
    do_mode = kwargs.get('do_mode')

    if do_mode == 'es':
        es = ElasticSearch(kwargs['es_url'])
        # crfile.crdoc is bound as the first argument before pop() runs,
        # so the document is sent without its 'id' key and that value
        # becomes the document id.
        index_ops = (
            es.index_op(crfile.crdoc, id=crfile.crdoc.pop('id'))
            for crfile in self.bulkdownload(start, **kwargs)
        )
        for chunk in bulk_chunks(index_ops, docs_per_chunk=100):
            es.bulk(chunk, index=kwargs['index'], doc_type='crdoc')

    elif do_mode == 'json':
        for crfile in self.bulkdownload(start, **kwargs):
            source_dir, source_name = os.path.split(crfile.filepath)
            filename = source_name.split('.')[0] + '.json'
            # The json directory is a sibling of the directory that
            # holds the source file.
            parent_dir = os.path.split(source_dir)[0]
            json_dir = os.path.join(parent_dir, 'json')
            if not os.path.isdir(json_dir):
                os.mkdir(json_dir)
            with open(os.path.join(json_dir, filename), 'w') as out_json:
                json.dump(crfile.crdoc, out_json)

    elif do_mode == 'yield':
        # dict(kwargs, parse=...) overrides a caller-supplied 'parse'
        # instead of failing with "multiple values for keyword argument".
        self.yielded = self.bulkdownload(start, **dict(kwargs, parse=True))

    elif do_mode == 'noparse':
        # NOTE(review): the return value is discarded; if bulkdownload()
        # is a generator function this call alone performs no work --
        # confirm against its definition.
        self.bulkdownload(start, **dict(kwargs, parse=False))
def Load_Oracle_to_Elasticsearch(cursor, SQL, es, Cle_primaire, DocType, Index,
                                 MySQLCnx, NomBase, NomSchema, NomTable):
    """
    Run an SQL query, turn each row into a dict and bulk-index it into
    Elasticsearch.

    Rows are fetched in batches of 500000. Each row becomes one document
    keyed by column name, and documents are sent to ``es.bulk`` in chunks
    of at most 10000 documents / 10 MB. Once every batch has been indexed,
    the time at which the run started is passed to ``Insertion_DTM_Exec``.
    The cursor is closed on exit, whether or not the load succeeded.

    :param cursor: Oracle cursor
    :param SQL: SQL query to send to Oracle
    :param es: Elasticsearch connection object
    :param Cle_primaire: name of the column whose value is used as the
        document id in Elasticsearch
    :param DocType: Elasticsearch document type
    :param Index: name of the Elasticsearch index
    :param MySQLCnx: connection object for the MySQL administration database
    :param NomBase: name of the queried database
    :param NomSchema: name of the queried schema
    :param NomTable: name of the queried table
    """
    # Taken before the query runs; this is the value recorded in MySQL as
    # the last execution time.
    req_time = datetime.now()
    debut_requete = datetime.now()
    # Single-argument print(...) behaves the same as the print statement
    # under Python 2 and also parses under Python 3.
    print(">>> Execution de la requète : %s" % (debut_requete))

    try:
        cursor.execute(SQL)
        fin_requete = datetime.now()
        print(" -> durée du traitement : %s" % (fin_requete - debut_requete))

        # Column headers, in result order.
        colums = [x[0] for x in cursor.description]
        # Map each column name to the position of its FIRST occurrence,
        # computed once instead of calling colums.index() twice per column
        # for every row. First-occurrence matches what index() returned
        # when a query yields duplicate column names.
        first_position = {}
        for position, name in enumerate(colums):
            first_position.setdefault(name, position)
        column_positions = list(first_position.items())

        morceau = 1
        while True:
            debut_fetch = datetime.now()
            print(">>> Début du fetch : %s" % (debut_fetch))
            results = cursor.fetchmany(500000)
            fin_fetch = datetime.now()
            print(" -> durée du traitement : %s" % (fin_fetch - debut_fetch))

            debut_morceau = datetime.now()
            print(">>> Traitement du morceau %d " % (morceau))
            if not results:
                break
            morceau += 1

            # Build the documents lazily so the whole batch is never held
            # in memory a second time as a list of dicts.
            documents = (
                {name: row[position] for name, position in column_positions}
                for row in results
            )
            container = (es.index_op(doc, id=doc[Cle_primaire])
                         for doc in documents)
            # Bulk insert; each request is capped at 10 MB.
            for chunk in bulk_chunks(container,
                                     docs_per_chunk=10000,
                                     bytes_per_chunk=10 * 1024 * 1024):
                es.bulk(chunk, doc_type=DocType, index=Index)

            fin_morceau = datetime.now()
            print(" -> durée du traitement : %s" % (fin_morceau - debut_morceau))

        # Record the date of this execution in MySQL (only reached when
        # every batch was indexed without error).
        Insertion_DTM_Exec(MySQLCnx, NomBase, NomSchema, NomTable, req_time)
    finally:
        # Release the Oracle cursor even if the query or indexing failed.
        cursor.close()
def index(self):
    """Drop and recreate the index, bulk-load all documents, then refresh it."""
    self.delete_index()
    self.create_index()

    # Documents are sent in requests of at most 1000 documents each.
    batches = bulk_chunks(self.documents(), docs_per_chunk=1000)
    for batch in batches:
        self.es.bulk(batch, index=self.index_name, doc_type=self.doc_type)

    self.es.refresh(self.index_name)
def test_none(self):
    """Check that an empty action iterator produces no chunks at all."""
    no_actions = self.str_xrange(0)
    produced = list(bulk_chunks(no_actions, docs_per_chunk=2))
    self.assertEqual(produced, [])