def main(fname_in, fname_out, size, frac):
    """Write a downsampled, sanitized copy of a TAQ file, then zip it.

    Streams raw (unprocessed) chunks from *fname_in*, downsamples by
    *frac*, sanitizes per symbol root, and writes at most *size* rows to
    *fname_out*.  The header line is then rewritten in place so its row
    count matches what was actually written, and the result is compressed
    to *fname_out* + '.zip'.
    """
    taq_in = marketflow.TAQ2Chunks(fname_in, do_process_chunk=False)
    downsampled = tp.Downsample(taq_in, frac)
    sanitized = tp.Sanitizer(tp.SplitChunks(downsampled, 'Symbol_root'))

    writ_len = 0
    with open(fname_out, 'wb') as ofile:
        # Write the original header as a placeholder; it is patched below
        # once the real row count is known.
        ofile.write(taq_in.first_line)
        for chunk in sanitized:
            if len(chunk) + writ_len > size:
                break
            ofile.write(chunk)
            writ_len += len(chunk)

        # Rebuild the header as "<date>:    <count>", padded with spaces to
        # the original header length (-2 accounts for the b'\r\n').
        # Build the padding directly as bytes instead of the old
        # str(...).encode() round-trip.
        line_len = len(taq_in.first_line)
        datestr, numlines = taq_in.first_line.split(b':')
        first_line = datestr + b':' + b' ' * 4 + str(writ_len).encode()
        first_line += b' ' * (line_len - len(first_line) - 2) + b'\r\n'
        ofile.seek(0)
        ofile.write(first_line)

    basename = path.basename(fname_out)
    with ZipFile(fname_out + '.zip', 'w') as zf:
        zf.write(fname_out, basename, ZIP_DEFLATED)
def main(fname_in, fname_out, size, frac):
    """Produce a small, sanitized TAQ test file (plus a zipped copy).

    Raw chunks from *fname_in* are downsampled by *frac*, recombined per
    symbol root, sanitized, and accumulated in memory until *size* rows
    are collected; a corrected header is written followed by the chunks
    ordered by symbol root, and the output is zipped.
    """
    raw_chunks = marketflow.TAQ2Chunks(fname_in, do_process_chunk=False)
    thinned = tp.Downsample(raw_chunks, frac)
    # We should downsample enough that things will fit in memory!
    per_symbol = tp.SplitChunks(thinned, 'Symbol_root')
    cleaned = tp.Sanitizer(tp.JoinedChunks(per_symbol, 'Symbol_root'))

    # Assemble our chunks in memory until the size budget is reached -
    # keeping everything resident makes quick n' easy testing possible.
    collected = []
    total = 0
    for piece in cleaned:
        if total + len(piece) > size:
            break
        collected.append(piece)
        total += len(piece)

    # Compute a correct header for this derived file: the real row count,
    # space-padded out to the original header's length (CRLF-terminated).
    header_len = len(raw_chunks.first_line)
    datestr, _numlines = raw_chunks.first_line.split(b':')
    header = datestr + b':' + b' ' * 4 + str(total).encode()
    header = header + b' ' * (header_len - len(header) - 2) + b'\r\n'

    with open(fname_out, 'wb') as ofile:
        ofile.write(header)
        for piece in sorted(collected, key=lambda c: c[0]['Symbol_root']):
            ofile.write(piece)

    with ZipFile(fname_out + '.zip', 'w') as zf:
        zf.write(fname_out, path.basename(fname_out), ZIP_DEFLATED)
def test_row_values(fname, numlines=5):
    """Check processed chunk values against the same rows decoded by hand
    from the raw bytes (timestamps, bid/ask prices and sizes)."""
    sample = marketflow.TAQ2Chunks(sample_data_dir + fname,
                                   chunksize=chunksize)
    chunk = next(sample)
    # The first chunk holds a full chunksize of rows unless the file is
    # shorter (previous code asserted chunksize unconditionally, which
    # contradicted its own comment and the debug print of numlines).
    assert len(chunk) == min(sample.chunksize, sample.numlines)

    # Read the same rows twice: once as raw bytes, once processed.
    chunk_unprocessed_gen = marketflow.TAQ2Chunks(sample_data_dir + fname,
                                                  chunksize=numlines,
                                                  do_process_chunk=False)
    chunk_processed_gen = marketflow.TAQ2Chunks(sample_data_dir + fname,
                                                chunksize=numlines,
                                                do_process_chunk=True)
    chunk = next(chunk_unprocessed_gen)
    chunk_proc = next(chunk_processed_gen)
    month = chunk_unprocessed_gen.month
    day = chunk_unprocessed_gen.day
    year = chunk_unprocessed_gen.year

    for i in range(chunk.shape[0]):
        entry = chunk[i]
        # The raw msec field is SSmmm: two digits of seconds, then
        # milliseconds.
        msec = int(entry['msec'][2:5])
        # BUG FIX: IANA zone names use underscores. gettz('America/New York')
        # returns None, which silently produced the wrong timezone.
        date_object = arrow.Arrow(year, month, day,
                                  hour=int(entry['hour']),
                                  minute=int(entry['minute']),
                                  second=int(entry['msec'][0:2]),
                                  tzinfo=gettz('America/New_York'))
        unix_time = date_object.timestamp + msec / 1000
        assert unix_time == chunk_proc[i]['Time']

        # Raw prices are 7 integer digits followed by 4 decimal digits.
        bid_price = (int(entry['Bid_Price'][0:7])
                     + int(entry['Bid_Price'][7:11]) / 10000)
        bid_size = int(entry['Bid_Size'])
        ask_price = (int(entry['Ask_Price'][0:7])
                     + int(entry['Ask_Price'][7:11]) / 10000)
        ask_size = int(entry['Ask_Size'])

        # Processed columns 7-10 are bid price/size and ask price/size.
        assert bid_price == chunk_proc[i][7]
        assert bid_size == chunk_proc[i][8]
        assert ask_price == chunk_proc[i][9]
        assert ask_size == chunk_proc[i][10]
def test_ini_row_value():
    '''Test values read explicitly from test_taq.ini'''
    reader = marketflow.TAQ2Chunks(
        sample_data_dir + config['taq-data']['std-test-file'],
        chunksize=chunksize)
    first_row = next(reader)[0]
    expected = config['std-test-row-values']

    # Floating-point timestamp.
    assert first_row['Time'] == float(expected['time'])
    # Integer-valued time components.
    for field in ('hour', 'minute', 'msec'):
        assert first_row[field] == int(expected[field])
    # Byte-string columns; the ini file stores them as ASCII text.
    for ini_key, column in (('exchange', 'Exchange'),
                            ('symbol_root', 'Symbol_root')):
        assert first_row[column] == expected[ini_key].encode('ascii')
def test_h5_files(fname, tmpdir):
    # XXX Update to be appropriate conversion to HDF5
    # NOTE(review): as visible here this is a stub - it only constructs the
    # reader and makes no assertions; tmpdir is unused until the HDF5
    # conversion is actually written. Confirm the body does not continue
    # beyond this view.
    sample = marketflow.TAQ2Chunks(sample_data_dir + fname)