def split_bson(input_bson_filename, output_bson_filename_1, output_bson_filename_2, n, number_random_example):
    data = bson.decode_file_iter(open(input_bson_filename, 'rb'))
    random_items = random.sample(range(n), number_random_example)
    random_items.sort()
    print(random_items[0])
    r_idx = 0
    print(n)
    # First pass: write the sampled documents. BSON.encode() returns bytes,
    # so the output file must be opened in binary mode.
    with open(output_bson_filename_1, 'wb') as output:
        for c, d in tqdm(enumerate(data), total=n):
            if c != random_items[r_idx]:
                continue
            else:
                # print("pick random item: {}".format(c))
                output.write(BSON.encode(d))
                r_idx = r_idx + 1
                if r_idx == number_random_example:
                    break
    # Second pass: write every document that was not sampled.
    r_idx = 0
    data = bson.decode_file_iter(open(input_bson_filename, 'rb'))
    with open(output_bson_filename_2, 'wb') as output:
        for c, d in tqdm(enumerate(data), total=n):
            if r_idx < number_random_example and c == random_items[r_idx]:
                r_idx = r_idx + 1
                continue
            else:
                output.write(BSON.encode(d))
    print("Finished splitting BSON with {} sampled records".format(r_idx))
def large_social_networks_twitter():
    # count = 0
    degree_thrd = 3
    index = []
    G = nx.Graph()
    # First pass: add one node per user.
    with open('/mnt/wzhan139/cross media data/Twitter/twitter_followees.bson', "rb") as f:
        data = bson.decode_file_iter(f, bson.CodecOptions(unicode_decode_error_handler="ignore"))
        count = 0
        for c, d in enumerate(data):
            print("Reading node " + str(c))
            index.append(d['user_name'])
            G.add_node(d['user_name'])
            count += 1
    # Second pass: add edges to followees that are themselves nodes.
    with open('/mnt/wzhan139/cross media data/Twitter/twitter_followees.bson', "rb") as f:
        data = bson.decode_file_iter(f, bson.CodecOptions(unicode_decode_error_handler="ignore"))
        for c, d in enumerate(data):
            print("Constructing graph in node " + str(c))
            for j in range(len(d['followees'])):
                if G.has_node(d['followees'][j]['screen_name']):
                    G.add_edge(d['user_name'], d['followees'][j]['screen_name'])
    # Third pass: add edges to followers that are themselves nodes.
    with open('/mnt/wzhan139/cross media data/Twitter/twitter_followers.bson', "rb") as f:
        data = bson.decode_file_iter(f, bson.CodecOptions(unicode_decode_error_handler="ignore"))
        for c, d in enumerate(data):
            print("Constructing graph in node " + str(c))
            for i in range(len(d['followers'])):
                if G.has_node(d['followers'][i]['screen_name']):
                    G.add_edge(d['user_name'], d['followers'][i]['screen_name'])
    G2 = nx.convert_node_labels_to_integers(G, label_attribute='old_label')
    num_node = nx.adjacency_matrix(G2).shape[0]
    sparsity = G2.number_of_edges() / num_node ** 2
    print("no threshold graph sparsity is " + str(sparsity))
    print(nx.info(G2))
    nx.write_gpickle(G2, "twitter.nothred.gpickle")
    # Drop low-degree nodes, then add self-loops via an identity adjacency matrix.
    remove_node = []
    for n, d in G2.nodes(data=True):
        if G2.degree(n) < degree_thrd:
            remove_node.append(n)
    G2.remove_nodes_from(np.asarray(remove_node))
    G2 = nx.convert_node_labels_to_integers(G2)
    num_node = nx.adjacency_matrix(G2).shape[0]
    G3 = nx.from_scipy_sparse_matrix(sp.dia_matrix((np.ones(num_node), 0), shape=nx.adjacency_matrix(G2).shape))
    G4 = nx.compose(G2, G3)
    nx.write_gpickle(G4, "twitter.gpickle")
    nx.write_adjlist(G4, "twitter_adj")
    nx.write_edgelist(G4, "twitter_edgelist")
    sparsity = G4.number_of_edges() / num_node ** 2
    print("sparsity is " + str(sparsity))
    print(nx.info(G4))
def load_train_data(path, cutoff, sample_file=0):
    """
    :param path: path of the input dataset
    :param cutoff: maximum number of products to read into memory
    :param sample_file: 1 - use the sample file instead of the full dump
    :return: lists of all category_ids, ids, images (binary), weights (1/n_imgs)
    """
    NCORE = cpu_count()
    all_categories = mp.Manager().list()
    all_ids = mp.Manager().list()
    all_imgs = mp.Manager().list()
    all_weights = mp.Manager().list()
    q = mp.Queue(maxsize=NCORE)
    iolock = mp.Lock()
    _, _, _, _, _, id2index = read_category(path)
    pool = mp.Pool(NCORE, initializer=process,
                   initargs=(q, iolock, all_ids, all_categories, all_imgs, all_weights, id2index))

    # process the file
    if sample_file == 0:
        data = bson.decode_file_iter(open(path + '/train.bson', 'rb'))
    if sample_file == 1:
        data = bson.decode_file_iter(open(path + '/train_example.bson', 'rb'))
    it = 0
    for c, d in enumerate(data):
        if it >= cutoff:
            break
        q.put(d)  # blocks until q below its max size
        it = it + 1

    # tell workers we're done
    for _ in range(NCORE):
        q.put(None)
    pool.close()
    pool.join()

    # convert the shared proxies back to plain lists
    all_categories = list(all_categories)
    all_ids = list(all_ids)
    all_imgs = list(all_imgs)
    all_weights = list(all_weights)
    return all_categories, all_ids, all_imgs, all_weights
def last_oplog_timestamp(self):
    oplog = None
    try:
        if os.path.isfile(self.lockfile):
            # First look for the .lock file from a previous run
            self.logger.info('Getting last bson-timestamp from previous oplog history file: "%s"' % self.lockfile)
            oplog = self.getLastTimestampSaved()
            self.last_dump_ts = oplog
        elif os.path.isfile(self.origin_dump_oplog):
            self.logger.info('Getting last bson-timestamp from oplog file: "%s"' % self.origin_dump_oplog)
            oplog = open(self.origin_dump_oplog)
            for change in bson.decode_file_iter(oplog):
                self.last_dump_ts = change['ts']
            oplog.close()
        else:
            raise Exception, 'Could not find an oplog.bson or history file: "%s"!' % self.origin_dump_oplog, None
    except Exception, e:
        self.logger.fatal('Failed to restore mongodump to destination! Error: "%s"' % e)
        if oplog:
            oplog.close()
        exit(1)
def TestBatchGenerator(batch_size=1):
    categories = categorydict()
    count = 0
    batchX = []
    for sample in bson.decode_file_iter(open(path.join(dataroot, testfile), 'rb')):
        imgs = sample['imgs']
        c = sample['_id']
        for i in range(len(imgs)):
            im = imgs[i]['picture']
            im = imread(io.BytesIO(im))
            im = cv2.resize(im, None, fx=0.5, fy=0.5, interpolation=cv2.INTER_AREA)
            batchX.append(im)
        batchY = c
        count = count + 1
        # print(count)
        if count < batch_size:
            pass
        else:
            yield np.asarray(batchX), np.asarray(batchY)
            count = 0
            batchX = []
def getTotalTrainImageCount():
    count = 0
    for sample in bson.decode_file_iter(open(path.join(dataroot, trainfile), 'rb')):
        imgs = sample['imgs']
        count = count + len(imgs)
    return count
def analyze(recording_file, focus=None, num_top=None, new_aspects=None, filter_stmt=None): # Set up reporter reporter = Reporter(filter_stmt=filter_stmt) if new_aspects: for new_aspect in new_aspects: reporter.add_aspect_from_eval(*new_aspect) plugins = load_plugins() for aspect_class in plugins['aspects']: reporter.add_aspect(aspect_class) # Stream samples to reporter for sample in decode_file_iter(recording_file): reporter.add_sample(sample['t'], sample['o']) echo('== Summary ==') for stat, val in sorted(reporter.report.get_summary().items()): echo(' %s = %s' % (stat, val)) echo() reporter.report.print_top(focus=focus, num_top=num_top)
def merge_bson_unique(output, inputs, uniquefield):
    uniquefieldset = set()
    duplicatecount = 0
    invalidcount = 0
    totalcount = 0

    # configure logging
    logger = logging.getLogger(__name__)
    logger.info('Creating your output file : %s', output)
    with open(output, 'wb') as outputbson:
        for bsonfile in inputs:
            logger.info('Opening input file : %s', bsonfile)
            with open(bsonfile, 'rb') as bsonfile_handle:
                iterator = decode_file_iter(bsonfile_handle)
                for singleobject in iterator:
                    totalcount += 1  # count objects, not input files
                    if uniquefield not in singleobject:
                        invalidcount += 1
                        continue  # skip objects that lack the unique field
                    if singleobject[uniquefield] not in uniquefieldset:
                        outputbson.write(BSON.encode(singleobject))
                        uniquefieldset.add(singleobject[uniquefield])
                    else:
                        duplicatecount += 1
            logging.info('Finished merging input file : %s', bsonfile)
    logger.info('Finished merging all input files to path : %s', output)
    logger.info('Duplicates: %s, Total: %s, Invalid: %s', duplicatecount, totalcount, invalidcount)
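A minimal, hedged example of invoking the merge helper above; the file names and the `id` unique field are placeholders, not part of the original source:

```python
import logging

logging.basicConfig(level=logging.INFO)

# Merge two hypothetical tweet dumps, de-duplicating on the tweet "id" field.
merge_bson_unique('merged.bson', ['dump_2016_01.bson', 'dump_2016_02.bson'], uniquefield='id')
```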
def _get_data(self, file_path): """ This method imports the dummy dataset if it wasn't already loaded. """ data_set = [] print("Reading BSON file...") data = bson.decode_file_iter(open(file_path, 'rb')) print("Starting processing...") for c, d in enumerate(data): # Store what product belongs to what category_id = d['category_id'] for e, pic in enumerate(d['imgs']): picture = imread(io.BytesIO(pic['picture'])) # Store image with its label data_set.append((category_id, picture.reshape(1, -1))) if c % 10 == 0 and c > 0: print("Iteration {}".format(c)) # Convert image data to Pandas DataFrame pd_data = pd.DataFrame.from_records(data_set) pd_data.rename(columns={0: 'category_id', 1: 'image'}, inplace=True) return pd_data
def __iter__(self):
    for item in tqdm(decode_file_iter(open(self.fn, 'rb'))):
        sentences = mongodoc2sentences(item, '../data/cases')
        if sentences:
            for sentence in sentences:
                yield sentence.split()
        self.count += 1
def main(args): if os.path.exists(args.save_train_bson): raise FileExistsError(args.save_train_bson) if os.path.exists(args.save_val_bson): raise FileExistsError(args.save_val_bson) logging.info('aggregating id of products...') product_ids = list() with open(args.input_bson, 'rb') as reader: data = bson.decode_file_iter(reader) for x in tqdm(data, unit='products', total=args.num_products): product_ids.append(x.get('_id')) logging.info('shuffle train and val ids...') num_val = int(len(product_ids) * args.val_ratio) random.seed(args.random_seed) random.shuffle(product_ids) val_product_ids = set(random.sample(product_ids, num_val)) train_product_ids = set(product_ids) - val_product_ids logging.info('writing {} products for validation: {}'.format(len(val_product_ids), args.save_val_bson)) encode_dict_list(products=products_iter(args.input_bson, val_product_ids), output_bson_path=args.save_val_bson, total=len(val_product_ids)) logging.info('writing {} products for training: {}'.format(len(train_product_ids), args.save_train_bson)) encode_dict_list(products=products_iter(args.input_bson, train_product_ids), output_bson_path=args.save_train_bson, total=len(train_product_ids))
def products_iter(input_bson_path, product_ids):
    # Relies on the module-level `args` namespace for the progress-bar total.
    with open(input_bson_path, 'rb') as reader:
        data = bson.decode_file_iter(reader)
        for i, prod in tqdm(enumerate(data), unit='products', total=args.num_products, disable=True):
            prod_id = prod.get('_id')
            if prod_id in product_ids:
                yield prod
def apply_bson_delta(left, patch, outfile):
    """Apply patch to bson file and output results file"""
    if isinstance(left, io.IOBase):
        leftf = left
    else:
        leftf = open(left, 'rb')
    if isinstance(patch, io.IOBase):
        deltaf = patch
    else:
        deltaf = open(patch, 'rb')
    # Read the patch through the file handle so both path and file-object inputs work.
    patchdata = bson.decode(deltaf.read())
    patched = {'a': {}, 'c': {}, 'd': {}}
    for r in patchdata['records']:
        if 'patch' in r.keys():
            o = r['patch']
        else:
            o = r['obj']
        patched[r['mode']][r['uniqkey']] = o
    for o in bson.decode_file_iter(leftf):
        if o[patchdata['uniqkey']] in patched['d']:
            continue
        elif o[patchdata['uniqkey']] in patched['c']:
            o = patched['c'][o[patchdata['uniqkey']]]
            outfile.write(bson.BSON.encode(o))
        else:
            outfile.write(bson.BSON.encode(o))
    for o in patched['a'].values():
        outfile.write(bson.BSON.encode(o))
def test_generator(bson_file, batch_size):
    product_cnt = 0
    pic_cnt = 0
    num_products = NUM_TEST_PRODUCTS
    num_pics = NUM_TEST_PICS
    prods = []
    pics = []
    with open(bson_file, 'rb') as bf:
        data = bson.decode_file_iter(bf)
        for prod in data:
            product_id = prod['_id']
            for picidx, pic in enumerate(prod['imgs']):
                picture = load_img_array(io.BytesIO(pic['picture']))
                pics.append(picture)
                prods.append(product_id)
                pic_cnt += 1
            # guarantee pics in the same prod are in the same batch
            if 0 <= (pic_cnt % batch_size) < 4 or pic_cnt == num_pics:
                yield np.array(prods), np.array(pics)
                prods = []
                pics = []
            product_cnt += 1
            if product_cnt % 1000 == 0 or product_cnt == num_products:
                print("converted {} products {} images".format(product_cnt, pic_cnt))
def run(): for dataset_file_path, dataset_folder_path in zip( (TRAIN_FILE_PATH, TEST_FILE_PATH), (TRAIN_FOLDER_PATH, TEST_FOLDER_PATH)): print("Processing {} ...".format(dataset_file_path)) with open(dataset_file_path, "rb") as dataset_file_object: data_generator = bson.decode_file_iter(dataset_file_object) for data in data_generator: category_id = data.get("category_id", "dummy") category_folder_path = os.path.join(dataset_folder_path, str(category_id)) os.makedirs(category_folder_path, exist_ok=True) product_id = data["_id"] for picture_id, picture_dict in enumerate(data["imgs"]): picture_content = picture_dict["picture"] picture_file_path = os.path.join( category_folder_path, "{}_{}.jpg".format(product_id, picture_id)) with open(picture_file_path, "wb") as picture_file_object: picture_file_object.write(picture_content) print("All done!")
def random_sample_bson(input_bson_filename, output_bson_filename, n=100, number_random_example=10):
    data = bson.decode_file_iter(open(input_bson_filename, 'rb'))
    random_items = random.sample(range(n), number_random_example)
    random_items.sort()
    r_idx = 0
    # BSON.encode() returns bytes, so the output file must be opened in binary mode.
    with open(output_bson_filename, 'wb') as output:
        for c, d in tqdm(enumerate(data), total=n):
            if c != random_items[r_idx]:
                continue
            else:
                # print("pick random item: {}".format(c))
                output.write(BSON.encode(d))
                r_idx = r_idx + 1
                if r_idx >= number_random_example:
                    break
    print("Finished sampling {} records".format(r_idx))
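For reference, a hedged usage sketch of the sampler above; the file names are placeholders and `n` is assumed to be no larger than the number of documents in the input file:

```python
# Draw 10 random products from the first 100 documents of a (hypothetical) train.bson
# and write them to a smaller BSON file for quick experiments.
random_sample_bson('train.bson', 'train_sample.bson', n=100, number_random_example=10)
```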
def run_train_boson_to_image(): bson_file = '/media/ssd/data/kaggle/cdiscount/__download__/train.bson' num_products = 7069896 # 7069896 for train and 1768182 for test out_dir = CDISCOUNT_DIR + '/train' os.makedirs(out_dir, exist_ok=True) categories = pd.read_csv(CDISCOUNT_DIR + '/category_names.csv', index_col='category_id') for category in categories.index: os.makedirs(out_dir + '/' + str(category), exist_ok=True) with open(bson_file, 'rb') as fbson: data = bson.decode_file_iter(fbson) #num_products = len(list(data)) #print ('num_products=%d'%num_products) #exit(0) for n, d in enumerate(data): print('%08d/%08d' % (n, num_products)) category = d['category_id'] _id = d['_id'] for i, pic in enumerate(d['imgs']): img_file = out_dir + '/' + str( category) + '/' + '%s-%d.jpg' % (str(_id), i) #print(img_file) with open(img_file, 'wb') as f: f.write(pic['picture'])
def build_distance_matrix(filename):
    # read measurements
    measurements = []
    # count = 300
    with open(filename, 'rb') as measurement_file:  # decode_file_iter expects a binary-mode file
        doc_iter = bson.decode_file_iter(measurement_file)
        for document in doc_iter:
            # count -= 1
            measurements.append(document)
            # if count == 0:
            #     break

    # compute distance matrix
    distance_matrix = []
    for i in range(0, len(measurements)):
        distance_matrix.append([])
    for i in range(0, len(measurements) - 1):
        distance_matrix[i].append(100.0)
        for j in range(i + 1, len(measurements)):
            distance = compute_distance(measurements[i], measurements[j])
            distance_matrix[i].append(distance)
            distance_matrix[j].append(distance)
    distance_matrix[len(distance_matrix) - 1].append(100.0)
    return measurements, distance_matrix
def parallel_map(func): with Pool() as pool: with path.open("rb") as file: return [ res for res in pool.imap( func, bson.decode_file_iter(file), chunksize=10000) ]
def get_single_item(idx): data_bson = bson.decode_file_iter(open(train_example_bson, 'rb')) prod_id = [] cat_id = [] img_arr = [] index = 0 for c, d in enumerate(data_bson): product_id = d['_id'] category_id = d['category_id'] # This won't be in Test data # prod_to_category[product_id] = category_id for e, pic in enumerate(d['imgs']): # array of image # picture = imread(io.BytesIO(pic['picture'])) # bytes of image picture = pic['picture'] if idx == index: prod_id.append(product_id) cat_id.append(category_id) img_arr.append(picture) index += 1 return (prod_id[0], cat_id[0], img_arr[0])
def test_backports(self): doc = BSON.encode({"tuple": (1, 2)}) exp = {"tuple": [1, 2]} options = CodecOptions(uuid_representation=ALL_UUID_REPRESENTATIONS[0], tz_aware=False, document_class=dict) self.assertEqual( {"tuple": [1, 2]}, BSON.encode( {"tuple": (1, 2)}, codec_options=options, uuid_subtype=ALL_UUID_REPRESENTATIONS[1]).decode()) self.assertEqual(exp, doc.decode( as_class=SON, tz_aware=True, uuid_subtype=ALL_UUID_REPRESENTATIONS[1], codec_options=options)) self.assertEqual([exp], list(decode_iter( doc, as_class=SON, tz_aware=True, uuid_subtype=ALL_UUID_REPRESENTATIONS[1], codec_options=options))) self.assertEqual([exp], list(decode_file_iter( StringIO(doc), as_class=SON, tz_aware=True, uuid_subtype=ALL_UUID_REPRESENTATIONS[1], codec_options=options))) self.assertEqual([exp], decode_all( doc, SON, True, ALL_UUID_REPRESENTATIONS[1], True, options))
def get_data(NCORE, bsonfile): """ given number of cores, and bsonfile location, returns: dataframe1: _id, category_id dataframe2: _id, list of images """ q = mp.Queue(maxsize=NCORE) iolock = mp.Lock() manager = mp.Manager() prod_to_category = manager.dict() prod_to_images = manager.dict() pool = mp.Pool(NCORE, initializer=process, initargs=(q, iolock, prod_to_category, prod_to_images)) data = bson.decode_file_iter(open(bsonfile, 'rb')) for c, d in enumerate(data): q.put(d) # blocks until q below its max size # tell workers we're done for _ in range(NCORE): q.put(None) pool.close() pool.join() prod_to_category = dict(prod_to_category) prod_to_images = dict(prod_to_images) prod_to_category = pd.DataFrame(list(prod_to_category.items()), columns=['_id', 'category_id']) prod_to_images = pd.DataFrame(list(prod_to_images.items()), columns=['_id', 'images']) return prod_to_category, prod_to_images
def run(self): logging.info("Resolving oplog for host %s:%s to max timestamp: %s" % (self.host, self.port, self.max_end_ts)) try: if self.dump_gzip: tailed_oplog_fh = GzipFile(self.tailed_oplog_file) mongodump_oplog_fh = GzipFile(self.mongodump_oplog_file, 'a+') else: tailed_oplog_fh = open(self.tailed_oplog_file) mongodump_oplog_fh = open(self.mongodump_oplog_file, 'a+') for change in decode_file_iter(tailed_oplog_fh): if 'ts' in change: ts = change['ts'] if ts > self.mongodump_oplog_last_ts or self.mongodump_oplog_last_ts is None: if ts < self.max_end_ts: mongodump_oplog_fh.write(BSON.encode(change)) self.changes += 1 self.last_ts = ts elif ts > self.max_end_ts: break tailed_oplog_fh.close() mongodump_oplog_fh.flush() mongodump_oplog_fh.close() except Exception, e: logging.fatal("Resolving of oplogs failed! Error: %s" % e) raise e
def run_make_train_summary(): bson_file = '/media/ssd/data/kaggle/cdiscount/__download__/train.bson' num_products = 7069896 # 7069896 for train and 1768182 for test out_dir = CDISCOUNT_DIR id = [] num_imgs = [] category_id = [] with open(bson_file, 'rb') as fbson: data = bson.decode_file_iter(fbson) #num_products = len(list(data)) #print ('num_products=%d'%num_products) #exit(0) for n, d in enumerate(data): print('\r%08d/%08d' % (n, num_products), flush=True, end='') category_id.append(d['category_id']) id.append(d['_id']) num_imgs.append(len(d['imgs'])) print('') #by product id df = pd.DataFrame({ '_id': id, 'num_imgs': num_imgs, 'category_id': category_id }) df.to_csv( '/media/ssd/data/kaggle/cdiscount/__temp__/train_by_product_id.csv', index=False) t = df['num_imgs'].sum() #check :12371293 print(t) #split by id -------------------------------------- id_random = list(id) random.shuffle(id_random) #make train, valid num_train = int(0.8 * (num_products)) num_valid = num_products - num_train #by id file1 = CDISCOUNT_DIR + '/split/' + 'train_id_v0_%d' % (num_train) file2 = CDISCOUNT_DIR + '/split/' + 'valid_id_v0_%d' % (num_valid) id1 = id_random[0:num_train] id2 = id_random[num_train:] write_list_to_file(id1, file1) write_list_to_file(id2, file2) #summary ------------------------------------ g = (df.groupby('category_id').agg({ '_id': 'count', 'num_imgs': 'sum' }).reset_index()) g.to_csv('/media/ssd/data/kaggle/cdiscount/__temp__/train_g.csv', index=False)
def transfer_data():  # A generator used to yield batches of labels and images
    # transfer all images to features
    data = bson.decode_file_iter(open(PIC_PATH, 'rb'))  # bson.decode_file_iter returns a generator, so it has no len()

    # full list of classes
    df_categories = pd.read_csv(LABEL_PATH, index_col='category_id')
    category_classes = df_categories.index.values
    category_classes = category_classes.reshape(category_classes.shape[0], 1)

    # using just binarizer without encoder to convert all unique category_ids to have a column for each class
    lb = preprocessing.LabelBinarizer()
    lb.fit(df_categories.index.values)

    # Size of pictures is defined here instead of reading the size of the first picture
    n = 500  # Batch size
    pix_x = 180
    pix_y = 180
    rgb = 3
    X_ids = np.zeros((n, 1)).astype(int)
    Y = np.zeros((n, 1)).astype(int)  # category_id for each row
    X_images = np.zeros((n, pix_x, pix_y, rgb))  # m images are 180 by 180 by 3
    i = 0

    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.7  # Restrict the growth of memory use, or memory will be used up
    images = tf.placeholder(tf.float32, [n, pix_x, pix_y, rgb], name="images")
    op = tf.image.resize_images(images, [MODEL_SIZE, MODEL_SIZE], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
    batch_num = 0
    with tf.Session(config=config) as sess:
        for c, d in enumerate(data):
            for e, pic in enumerate(d['imgs']):
                if i == 0:
                    Y = np.zeros((n, 1)).astype(int)  # category_id for each row
                    X_images = np.zeros((n, pix_x, pix_y, rgb))  # m images are 180 by 180 by 3
                picture = imread(io.BytesIO(pic['picture']))  # All images should be added.
                Y[i] = d['category_id']
                X_images[i] = picture
                i += 1
                if i == n:
                    batch_num += 1
                    i = 0
                    Y_flat = lb.transform(Y)
                    X_flat = sess.run(op, feed_dict={images: X_images})
                    Y = np.zeros((n, 1)).astype(int)  # category_id for each row
                    X_images = np.zeros((n, pix_x, pix_y, rgb))  # m images are 180 by 180 by 3
                    yield X_flat, Y_flat
def from_bson(cls, x):
    if six.PY2:
        # Hack for python 2: may work in py3 too, but it's definitely not the standard way!
        reader = bson.decode_file_iter(six.BytesIO(x))
        event_dict = next(reader)
    else:
        event_dict = bson.BSON.decode(x)
    return cls(**event_dict)
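To make the two decode paths above concrete, here is a small round-trip sketch; the payload and field names are made up, only the pymongo `bson` calls themselves are taken from the snippet:

```python
import io
import bson

payload = bson.BSON.encode({"type": "click", "ts": 14})

# Streaming path: decode_file_iter works on any binary file-like object.
event_from_stream = next(bson.decode_file_iter(io.BytesIO(payload)))

# Direct path: decode the single document in one call.
event_direct = bson.BSON(payload).decode()

assert event_from_stream == event_direct
```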
def __iter__(self): with open(self._filename, 'rb') as f: i = 1 for tweet in decode_file_iter(f): if self._limit and i > self._limit: raise StopIteration if all(func(tweet) for func in self._filter_functions): i += 1 yield tweet
def test_invalid_decodes(self): # Invalid object size (not enough bytes in document for even # an object size of first object. # NOTE: decode_all and decode_iter don't care, not sure if they should? self.assertRaises(InvalidBSON, list, decode_file_iter(StringIO(b"\x1B"))) # An object size that's too small to even include the object size, # but is correctly encoded, along with a correct EOO (and no data). data = b"\x01\x00\x00\x00\x00" self.assertRaises(InvalidBSON, decode_all, data) self.assertRaises(InvalidBSON, list, decode_iter(data)) self.assertRaises(InvalidBSON, list, decode_file_iter(StringIO(data))) # One object, but with object size listed smaller than it is in the # data. data = (b"\x1A\x00\x00\x00\x0E\x74\x65\x73\x74" b"\x00\x0C\x00\x00\x00\x68\x65\x6C\x6C" b"\x6f\x20\x77\x6F\x72\x6C\x64\x00\x00" b"\x05\x00\x00\x00\x00") self.assertRaises(InvalidBSON, decode_all, data) self.assertRaises(InvalidBSON, list, decode_iter(data)) self.assertRaises(InvalidBSON, list, decode_file_iter(StringIO(data))) # One object, missing the EOO at the end. data = (b"\x1B\x00\x00\x00\x0E\x74\x65\x73\x74" b"\x00\x0C\x00\x00\x00\x68\x65\x6C\x6C" b"\x6f\x20\x77\x6F\x72\x6C\x64\x00\x00" b"\x05\x00\x00\x00") self.assertRaises(InvalidBSON, decode_all, data) self.assertRaises(InvalidBSON, list, decode_iter(data)) self.assertRaises(InvalidBSON, list, decode_file_iter(StringIO(data))) # One object, sized correctly, with a spot for an EOO, but the EOO # isn't 0x00. data = (b"\x1B\x00\x00\x00\x0E\x74\x65\x73\x74" b"\x00\x0C\x00\x00\x00\x68\x65\x6C\x6C" b"\x6f\x20\x77\x6F\x72\x6C\x64\x00\x00" b"\x05\x00\x00\x00\xFF") self.assertRaises(InvalidBSON, decode_all, data) self.assertRaises(InvalidBSON, list, decode_iter(data)) self.assertRaises(InvalidBSON, list, decode_file_iter(StringIO(data)))
def test_decode_file_iter(self): expected, bson_data = self._generate_multidocument_bson_stream() fileobj = tempfile.TemporaryFile() fileobj.write(bson_data) fileobj.seek(0) for expected_doc, decoded_doc in zip( expected, decode_file_iter(fileobj, self.codecopts)): self.assertEqual(expected_doc, decoded_doc) fileobj.close()
def test_date_filter_bson_date_filter_until(self): self.setUp() args = date_filter_bson.parse_args(['-i', os.path.dirname(os.path.abspath(__file__))+'/../test/test.bson', '-d2', '2016-01-18 00:00:00', '-o', os.path.dirname(os.path.abspath(__file__))+'/../test/output.bson']) date_filter_bson.date_filter(args.output, args.input, args.dateone, args.datetwo) count = 0 with open(os.path.dirname(os.path.abspath(__file__))+'/../test/output.bson', 'rb') as bsonfile_handle: iterator = decode_file_iter(bsonfile_handle) for line in iterator: count+=1 self.assertEqual(count, 21) #remove output self.tearDown self.tearDown()
def filter_records(infile, year, month, day, tz):
    """
    Takes in a file handle pointing at a BSON file, and a year, month, day,
    timezone. Returns only those tweets which were sent on that date in that
    timezone.
    """
    it = decode_file_iter(infile)
    try:
        for rec in it:
            d = tweet_date(rec).astimezone(tz)
            if d.year == year and d.month == month and d.day == day:
                yield rec
    except Exception as e:
        print(e)
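A brief, hypothetical example of driving the generator above; the file name is a placeholder and `pytz` is assumed for constructing the timezone object:

```python
import pytz

# Yield only tweets sent on 2016-01-18, US Eastern time.
with open('tweets.bson', 'rb') as infile:
    for tweet in filter_records(infile, 2016, 1, 18, pytz.timezone('US/Eastern')):
        print(tweet.get('id'))
```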
def merge_bson(output, inputs): #configure logging logger = logging.getLogger(__name__) logger.info('Creating your output file : %s', output) with open(output, 'wb') as outputbson: for bsonfile in inputs: logger.info('Opening input file : %s', bsonfile) with open(bsonfile, 'rb') as bsonfile_handle: iterator = decode_file_iter(bsonfile_handle) for line in iterator: outputbson.write(BSON.encode(line)) logger.info('Finished merging input file : %s', bsonfile) logger.info('Finished merging all input files to path : %s', output)
def load(self): try: oplog = self.open() logging.debug("Reading oplog file %s" % self.oplog_file) for change in decode_file_iter(oplog, CodecOptions(unicode_decode_error_handler="ignore")): if 'ts' in change: self._last_ts = change['ts'] if self._first_ts is None and self._last_ts is not None: self._first_ts = self._last_ts self._count += 1 oplog.close() except Exception, e: logging.fatal("Error reading oplog file %s! Error: %s" % (self.oplog_file, e)) raise OperationError(e)
def get_iterator(self): tweet_parser = TweetParser() bson_handle = open(self.filepath, 'rb') for count, tweet in enumerate(bson.decode_file_iter(bson_handle)): if self.limit < count+1 and self.limit != 0: bson_handle.close() return elif tweet_parser.tweet_passes_filter(self.filter, tweet) \ and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet): if self.should_strip: yield tweet_parser.strip_tweet(self.keep_fields, tweet) else: yield tweet bson_handle.close()
def convert_bson(self): message = 'Converting BSON "{}" to language shelf #{}'.format(self.dataset, self.process_id) bson_file = ProgressFile(self.bson_file, 'rb', message=message) shelf_name = 'languages-' + self.process_id + '.shelf' languages = shelve.open(self.path + shelf_name, writeback=True) # Read every BSON object as an iterator to save memory. for raw_json in bson.decode_file_iter(bson_file): repository = raw_json['full_name'].encode('utf-8') language = raw_json['language'].encode('utf-8') if raw_json['language'] is not None else '' languages[repository] = language languages.close() bson_file.close() self.cleanup(shelf_name)
def split_images(source, depth):
    categories = get_categories(depth)
    with open(source, 'rb') as data:
        for entry in bson.decode_file_iter(data):
            product_id = entry['_id']
            category = categories[int(entry['category_id'])]
            target = os.path.join('data', 'categories' + str(depth), category)
            if not os.path.exists(target):
                print(target)
                os.makedirs(target)
            for e, pic in enumerate(entry['imgs']):
                picture = imread(io.BytesIO(pic['picture']))
                picture_file = os.path.join(target, str(product_id) + '_' + str(e) + '.jpg')
                imsave(picture_file, picture)
def read(self): if os.path.isfile(self.oplog_file): try: logging.debug("Reading oplog file %s" % self.oplog_file) if self.dump_gzip: oplog = GzipFile(self.oplog_file) else: oplog = open(self.oplog_file) for change in decode_file_iter(oplog): if 'ts' in change: self._last_ts = change['ts'] if self._first_ts is None and self._last_ts is not None: self._first_ts = self._last_ts self._count += 1 oplog.close() except Exception, e: logging.fatal("Error reading oplog file %s! Error: %s" % (self.oplog_file, e)) raise e
def test_basic_decode(self): self.assertEqual({"test": u("hello world")}, BSON(b"\x1B\x00\x00\x00\x0E\x74\x65\x73\x74\x00\x0C" b"\x00\x00\x00\x68\x65\x6C\x6C\x6F\x20\x77\x6F" b"\x72\x6C\x64\x00\x00").decode()) self.assertEqual([{"test": u("hello world")}, {}], decode_all(b"\x1B\x00\x00\x00\x0E\x74\x65\x73\x74" b"\x00\x0C\x00\x00\x00\x68\x65\x6C\x6C" b"\x6f\x20\x77\x6F\x72\x6C\x64\x00\x00" b"\x05\x00\x00\x00\x00")) self.assertEqual([{"test": u("hello world")}, {}], list(decode_iter( b"\x1B\x00\x00\x00\x0E\x74\x65\x73\x74" b"\x00\x0C\x00\x00\x00\x68\x65\x6C\x6C" b"\x6f\x20\x77\x6F\x72\x6C\x64\x00\x00" b"\x05\x00\x00\x00\x00"))) self.assertEqual([{"test": u("hello world")}, {}], list(decode_file_iter(StringIO( b"\x1B\x00\x00\x00\x0E\x74\x65\x73\x74" b"\x00\x0C\x00\x00\x00\x68\x65\x6C\x6C" b"\x6f\x20\x77\x6F\x72\x6C\x64\x00\x00" b"\x05\x00\x00\x00\x00"))))
def convert_bson(self): output = open(self.path + self.dataset + '.json', 'wb') message = 'Converting BSON "{}" and filtering fields'.format(self.dataset) bson_file = ProgressFile(self.bson_file, 'rb', message=message) Shelf.merge_shelves() if os.path.isfile('languages.shelf'): if self.path != "" and not os.path.isfile(self.path + 'languages.shelf'): print("#{}. Copying languages shelf to local directory...".format(MPI.COMM_WORLD.rank)) shutil.copy('languages.shelf', self.path) languages = shelve.open(self.path + 'languages.shelf', writeback=True) else: languages = {} # Read every BSON object as an iterator to save memory. for raw_json in bson.decode_file_iter(bson_file): if not self.is_latin(raw_json['body']): continue preprocessed_json = {} repository = str(re.search(r"repos/([^/]+/[^/]+)(/|$)", raw_json['url']).group(1)) raw_json['language'] = '' if repository in languages: raw_json['language'] = languages[repository] for item in self.keep_fields: preprocessed_json[item] = raw_json[item] json.dump(preprocessed_json, output) output.write('\n') output.close() bson_file.close() # Don't move the file for now, since the commit comments only need to # be on the worker nodes if we're running under MPI self.cleanup()
def run(self): try: self.oplogs['backup'] = Oplog(self.mongodump_oplog['file'], self.do_gzip(), 'a+', self.flush_docs, self.flush_secs) self.oplogs['tailed'] = Oplog(self.tailed_oplog['file'], self.do_gzip()) logging.info("Resolving oplog for %s to max ts: %s" % (self.uri, self.max_end_ts)) self.state.set('running', True) self.state.set('first_ts', self.mongodump_oplog['first_ts']) if not self.state.get('first_ts'): self.state.set('first_ts', self.tailed_oplog['first_ts']) for change in decode_file_iter(self.oplogs['tailed'], CodecOptions(unicode_decode_error_handler="ignore")): self.last_ts = change['ts'] if not self.mongodump_oplog['last_ts'] or self.last_ts > self.mongodump_oplog['last_ts']: if self.last_ts < self.max_end_ts: self.oplogs['backup'].add(change) self.changes += 1 elif self.last_ts > self.max_end_ts: break self.state.set('count', self.mongodump_oplog['count'] + self.changes) self.state.set('last_ts', self.last_ts) self.state.set('running', False) self.exit_code = 0 except Exception, e: raise Error("Resolving of oplogs failed! Error: %s" % e)
def parse_train_example():
    # train_example_file = open(DATA_DIR + 'train_example.bson', 'rb')
    # data = bson.loads(train_example_file.read())
    train_example = bson.decode_file_iter(open(DATA_DIR + 'train_example.bson', 'rb'))
    data = []
    for key, value in enumerate(train_example):
        product_id = value['_id']
        category_id = value['category_id']  # This won't be in Test data
        # prod_to_category[product_id] = category_id
        pics = []
        for e, pic in enumerate(value['imgs']):
            picture = imread(io.BytesIO(pic['picture']))
            # do something with the picture, etc
            # plt.imshow(picture)
            # plt.title(category_id)
            # plt.show()
            pics.append(picture)
        data.append((product_id, category_id, pics))
    return data
#!/usr/bin/python3
# coding: utf-8
import bson  # bson comes bundled with `pip install pymongo`; don't `pip install bson` directly (that installs a third-party package missing many of these functions)
##################################################################
## Write a BSON file
post1 = {"author": "Mike", "text": "Another post!", "tags": ["bulk", "insert"], "date": 14}
post2 = {"author": "Jenny", "text": "Another post!", "tags": ["bulk", "insert"], "date": 14}
f = open('tmp.bson', 'wb')
f.write(bson.BSON.encode(post1))
f.write(bson.BSON.encode(post2))
f.close()
##################################################################
## Read the BSON file back with plain Python; BSON is the binary counterpart of JSON
items = list(bson.decode_file_iter(open('./tmp.bson', 'rb'))); print(len(items))  # 2
item = items[0]; print(type(items), type(item))  # <class 'list'> <class 'dict'>
print(item.keys())  # dict_keys(['author', 'text', 'tags', 'date'])
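Because `decode_file_iter` returns a generator, a large dump does not have to be materialized with `list()` as above; a streaming variant of the same read (same `tmp.bson` file) might look like this:

```python
# Stream the documents one at a time instead of loading them all into memory.
with open('tmp.bson', 'rb') as f:
    for doc in bson.decode_file_iter(f):
        print(doc['author'], doc['date'])
```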
validation_data = val_gen, validation_steps = num_val_images // batch_size + 1, callbacks=[early_stopping, model_checkpoint]) print(history.history.keys()) # logging logging.info('N. epochs == '+str(len(history.history['val_acc']))) logging.info('Val accuracy == '+str(max(history.history['val_acc']))) ## Predict on Test-set print(">>> Predicting on test-set ... ") submission_df = pd.read_csv("data/sample_submission.csv") print(submission_df.head()) test_datagen = ImageDataGenerator() data = bson.decode_file_iter(open(test_bson_path, "rb")) with tqdm(total=num_test_products) as pbar: for c, d in enumerate(data): product_id = d["_id"] num_imgs = len(d["imgs"]) batch_x = np.zeros((num_imgs, 180, 180, 3), dtype=K.floatx()) for i in range(num_imgs): bson_img = d["imgs"][i]["picture"] # Load and preprocess the image. img = load_img(io.BytesIO(bson_img), target_size=(180, 180)) x = img_to_array(img) x = preprocess_image(x) # = test_datagen.random_transform(x) # = test_datagen.standardize(x) # Add the image to the batch. batch_x[i] = x
#!/usr/bin/env python import bson import bson.json_util from urllib import urlopen BUS='http://localhost:8000/test' print "Connecting to %s and receiving objects..." % BUS param = { 'heartbeat': 10, 'queue': { 'SYSTEM_ALERT': { 'seq': -1 } } } ack = bson.BSON(urlopen(BUS + '/open', bson.BSON.encode(param)).read()).decode() print bson.json_util.dumps(ack, indent=2) for msg in bson.decode_file_iter(urlopen(BUS + '/stream/' + str(ack['sid']))): print bson.json_util.dumps(msg, indent=2)
def build_load_files(self): import bson def yn(s): """ Return 'Y' if int(s) is True, or 'N' otherwise. Return an empty string if s is None. """ if s: if int(s): return "Y" else: return "N" return "" files = [x for x in sorted(self.get_file_list(self.arguments.path, self.file_filter)) if os.path.basename(x).lower() in BuilderClass.expected_files] if self._widget: self._widget.progressSet.emit(len(BuilderClass.expected_files), "") self._widget.progressUpdate.emit(0) self._corpus_id = 0 for i, filepath in enumerate(files): filename = os.path.basename(filepath) if filename == "wordforms.bson": max_cache = 20000 self.table(self.corpus_table)._max_cache = max_cache self._widget.progressSet.emit(4520596 // max_cache, "Loading {}".format(filename)) self._widget.progressUpdate.emit(0) else: self._widget.labelSet.emit("Loading {}".format(filename)) with open(filepath, "rb") as input_file: for entry in bson.decode_file_iter(input_file): self._entry = entry if filename == "sources.bson": self._source_id = len(self._source_dict) + 1 self._source_dict[str(entry["key"])] = self._source_id d = { self.source_id: self._source_id, self.source_label: entry.get("title", ""), self.source_year: entry.get("year", ""), self.source_author: entry.get("author", ""), self.source_key: entry.get("key", ""), self.source_note: entry.get("note", "")} self.table(self.source_table).add(d) elif filename == "roots.bson": self._root_id = len(self._root_dict) + 1 self._root_dict[str(entry["_id"])] = self._root_id d = {self.root_id: self._root_id, self.root_radicals: entry.get("radicals", ""), self.root_type: entry.get("type", ""), self.root_variant: entry.get("variant", 0), self.root_alternatives: entry.get("alternatives", "")} self.table(self.root_table).add(d) elif filename == "lexemes.bson": # Fix some spelling mistakes in the key names: for x, correct in [("achaic", "archaic"), ("archaic ", "archaic"), ("adverbial ", "adverbial"), ("instransitive", "intransitive")]: if x in entry.keys(): entry[correct] = entry[x] self._lemma_id = len(self._lemma_dict) + 1 self._lemma_dict[str(entry["_id"])] = self._lemma_id # get root id if possible, and also root radicals: root_id = None root = entry.get("root", "") if root: root_id = str(root.get("_id")) root_radicals = root.get("radicals", "") root_link = self._root_dict.get(root_id, 0) # look up headword: headword = None headword_dict = entry.get("headword", "") if headword_dict: headword = headword_dict.get("lemma") # fix 'verbalnoun': verbal_noun = entry.get("verbalnoun", "") if verbal_noun == "verbalnoun" or verbal_noun == "1": verbal_noun = "N" d = { self.lemma_id: self._lemma_id, self.lemma_label: entry.get("lemma", ""), self.lemma_adjectival: yn(entry.get("adjectival")), self.lemma_adverbial: yn(entry.get("adverbial")), self.lemma_alternatives: ";".join(entry.get("alternatives", [])), self.lemma_apertiumparadigm: entry.get("apertium_paradigm", ""), self.lemma_archaic: yn(entry.get("archaic")), self.lemma_created: entry.get("created", ""), self.lemma_derived_form: entry.get("derived_form", 0), self.lemma_ditransitive: yn(entry.get("ditransitive")), self.lemma_features: entry.get("features"), self.lemma_feedback: entry.get("feedback", ''), self.lemma_form: entry.get("form", ''), self.lemma_frequency: entry.get("frequency", 0), self.lemma_gender: entry.get("gender", ""), self.lemma_gloss: entry.get("gloss", ""), self.lemma_headword: headword, self.lemma_hypothetical: yn(entry.get("hypothetical")), self.lemma_intransitive: yn(entry.get("intransitive")), self.lemma_modified: 
entry.get("modified", ""), self.lemma_notduplicate: yn(entry.get("not_duplicate")), self.lemma_number: entry.get("number", ""), self.lemma_onomastictype: entry.get("onomastic_type", ""), self.lemma_participle: yn(entry.get("participle")), self.lemma_pending: yn(entry.get("pending")), self.lemma_pos: entry.get("pos",''), self.lemma_radicals: root_radicals, self.lemma_root_id: root_link, self.lemma_transcript: entry.get("phonetic", ""), self.lemma_verbalnoun: entry.get("verbalnoun")} self.table(self.lemma_table).add(d) elif filename == "wordforms.bson": self._corpus_id += 1 # try to get source id at all costs: source_id = None source_list = entry.get("sources") if source_list: try: source_id = self._source_dict[source_list[0]] except KeyError: for x in self._source_dict: if self._source_dict[x] == source_list[0]: source_id = self._source_dict[x] break else: source_id = 0 # collapse the dictionaries behind subject, # ind_obj, and dir_obj: subj_dict = entry.get("subject") l = [] if subj_dict: l = [subj_dict["person"], subj_dict["number"]] if "gender" in subj_dict: l.append(subj_dict["gender"]) subj = "_".join(l) ind_obj_dict = entry.get("ind_obj") l = [] if ind_obj_dict: l = [ind_obj_dict["person"], ind_obj_dict["number"]] if "gender" in ind_obj_dict: l.append(ind_obj_dict["gender"]) ind_obj = "_".join(l) dir_obj_dict = entry.get("dir_obj") l = [] if dir_obj_dict: l = [dir_obj_dict["person"], dir_obj_dict["number"]] if "gender" in dir_obj_dict: l.append(dir_obj_dict["gender"]) dir_obj = "_".join(l) d = {self.corpus_id: self._corpus_id, self.corpus_adverbial: yn(entry.get("adverbial")), self.corpus_alternatives: ";".join(entry.get("alternatives", [])), self.corpus_archaic: yn(entry.get("archaic")), self.corpus_aspect: entry.get("aspect", ""), self.corpus_created: entry.get("created", ""), self.corpus_dir_obj: dir_obj, self.corpus_form: entry.get("form", ""), self.corpus_full: entry.get("full", ""), self.corpus_gender: entry.get("gender", ""), self.corpus_generated: yn(entry.get("generated")), self.corpus_gloss: entry.get("gloss", ""), self.corpus_hypothetical: yn(entry.get("hypothetical")), self.corpus_ind_obj: ind_obj, self.corpus_lemma_id: self._lemma_dict.get(str(entry.get("lexeme_id"))), self.corpus_modified: entry.get("modified", ""), self.corpus_number: entry.get("number", ""), self.corpus_pattern: entry.get("pattern", ""), self.corpus_transcript: entry.get("phonetic", ""), self.corpus_plural_form: entry.get("plural_form", ""), self.corpus_polarity: entry.get("polarity", ""), self.corpus_possessor: entry.get("possessor", ""), self.corpus_source_id: source_id, self.corpus_subject: subj, self.corpus_word: entry.get("surface_form", "")} self.table(self.corpus_table).add(d) phon = entry.get("phonetic") if self._widget and not self._corpus_id % max_cache: self._widget.progressUpdate.emit(self._corpus_id // max_cache) self.commit_data()
def test_backport_codec_options_uuid(self): if not should_test_uuid: raise SkipTest("No uuid module") # Generated by the Java driver from_java = b('bAAAAAdfaWQAUCBQxkVm+XdxJ9tOBW5ld2d1aWQAEAAAAAMIQkfACFu' 'Z/0RustLOU/G6Am5ld2d1aWRzdHJpbmcAJQAAAGZmOTk1YjA4LWMwND' 'ctNDIwOC1iYWYxLTUzY2VkMmIyNmU0NAAAbAAAAAdfaWQAUCBQxkVm+' 'XdxJ9tPBW5ld2d1aWQAEAAAAANgS/xhRXXv8kfIec+dYdyCAm5ld2d1' 'aWRzdHJpbmcAJQAAAGYyZWY3NTQ1LTYxZmMtNGI2MC04MmRjLTYxOWR' 'jZjc5Yzg0NwAAbAAAAAdfaWQAUCBQxkVm+XdxJ9tQBW5ld2d1aWQAEA' 'AAAAPqREIbhZPUJOSdHCJIgaqNAm5ld2d1aWRzdHJpbmcAJQAAADI0Z' 'DQ5Mzg1LTFiNDItNDRlYS04ZGFhLTgxNDgyMjFjOWRlNAAAbAAAAAdf' 'aWQAUCBQxkVm+XdxJ9tRBW5ld2d1aWQAEAAAAANjQBn/aQuNfRyfNyx' '29COkAm5ld2d1aWRzdHJpbmcAJQAAADdkOGQwYjY5LWZmMTktNDA2My' '1hNDIzLWY0NzYyYzM3OWYxYwAAbAAAAAdfaWQAUCBQxkVm+XdxJ9tSB' 'W5ld2d1aWQAEAAAAAMtSv/Et1cAQUFHUYevqxaLAm5ld2d1aWRzdHJp' 'bmcAJQAAADQxMDA1N2I3LWM0ZmYtNGEyZC04YjE2LWFiYWY4NzUxNDc' '0MQAA') data = base64.b64decode(from_java) # Test decode_all. docs = bson.decode_all( data, dict, True, JAVA_LEGACY, True, CodecOptions(SON, False, STANDARD)) for d in docs: self.assertNotEqual(d['newguid'], uuid.UUID(d['newguidstring'])) encoded = [bson.BSON.encode( doc, uuid_subtype=JAVA_LEGACY, codec_options=CodecOptions(uuid_representation=STANDARD)) for doc in docs] # Test decode. docs2 = [e.decode( uuid_subtype=JAVA_LEGACY, as_class=dict, tz_aware=True, codec_options=CodecOptions(SON, False, STANDARD)) for e in encoded] for d in docs2: self.assertNotEqual(d['newguid'], uuid.UUID(d['newguidstring'])) # Test encode. for i in range(len(docs)): self.assertEqual(docs2[i]['newguid'], docs[i]['newguid']) self.assertEqual(uuid.UUID(docs2[i]['newguidstring']), uuid.UUID(docs[i]['newguidstring'])) # Test decode_iter docs = bson.decode_iter( data, dict, True, JAVA_LEGACY, True, CodecOptions(SON, False, STANDARD)) for d in docs: self.assertNotEqual(d['newguid'], uuid.UUID(d['newguidstring'])) # Test decode_file_iter docs = bson.decode_file_iter( StringIO(data), dict, True, JAVA_LEGACY, True, CodecOptions(SON, False, STANDARD)) for d in docs: self.assertNotEqual(d['newguid'], uuid.UUID(d['newguidstring']))
## Read the database dump
import bson
import sys
from whoosh.index import create_in, open_dir
from whoosh.fields import TEXT, ID, STORED, KEYWORD, NUMERIC, Schema
from whoosh.qparser import QueryParser
from jieba.analyse import ChineseAnalyzer
import jieba
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
analyzer = ChineseAnalyzer()
##################################################################
## Inspect the BSON data
# items = list(bson.decode_file_iter(open('./tmp.bson', 'rb'))); print(len(items))  # 14128; read the BSON file
items = list(bson.decode_file_iter(open('./tmp_news/sina.bson', 'rb'))); print(len(items))  # 140639; read the BSON file
print(items[0].keys())  # dict_keys(['_id', 'news_id', 'news_url', 'news_from', 'news_time2', 'news_channel', 'news_title', 'news_source', 'news_time', 'news_body', 'news_keywords', 'news_show', 'news_total'])
print(items[0]['news_id'])  # fxzczfc6652525
print(items[0]['news_keywords'])  # ['陕西', '公厕爆炸']
print(items[0]['news_show'])  # 0
print(items[0]['news_time'])  # 2017年05月17日23:48
print(items[0]['news_time2'])  # 1495036111; Unix timestamp
print(set([len(item['news_title']) for item in items if item.get('news_title', 0) != 0]))  # {6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 36, 37}; these are all str lengths
print(set([len(item['news_body']) for item in items]))  # {0, 1, 2, 14, 15, 16, ..., 353, 982, 478, 1004}; some bodies are split into as many as 1004 segments
print(set([len([''.join(item['news_body'])]) for item in items]))  # {1}; much tidier after joining
print(set([str(item.keys()) for item in items]))  # many documents are missing fields
# {"dict_keys(['_id', 'news_id', 'news_url', 'news_from', 'news_channel', 'news_title', 'news_source', 'news_time', 'news_body', 'news_keywords', 'flag'])",
#  "dict_keys(['_id', 'news_id', 'news_url', 'news_from', 'news_channel', 'news_title', 'news_source', 'news_time', 'news_body', 'news_keywords', 'flag', 'news_show', 'news_total'])",
#  "dict_keys(['_id', 'news_id', 'news_url', 'news_from', 'news_channel', 'news_title', 'news_source', 'news_time', 'news_body', 'flag', 'news_show', 'news_total'])",
#  "dict_keys(['_id', 'news_id', 'news_url', 'news_from', 'news_channel', 'news_title', 'news_time', 'news_body', 'news_keywords', 'flag', 'news_show', 'news_total'])",
while True: if index >= len(success_bucket_floors): break if failure_rate < success_bucket_floors[index]: break index += 1 domain_map[fields[0]] = success_bucket_floors[index - 1] #print "adding", fields[0], " ", success_bucket_floors[index-1] #read measurements error_map = {} with open(sys.argv[1]) as measurement_file: iter = bson.decode_file_iter(measurement_file) for document in iter: #get bucket map using error code error_message = document["measurement_error_message"] error_fields = error_message.split() error_code = int(error_fields[0].replace("[","").replace("]","")) if error_code not in error_map: bucket_map = {} for bucket_floor in success_bucket_floors: bucket_map[bucket_floor] = 0 error_map[error_code] = bucket_map bucket_map = error_map[error_code]
parser.add_argument("-n", "--num", dest="num", type=int, default=5, help="number of bson items")
parser.add_argument("--pprint", dest="pprint", action="store_true", default=False, help="prettified print")
parser.add_argument("-o", "--output", dest="output_file", type=str, help="output bson file")
parser.add_argument("input_file", type=str, help="input bson file")
args = parser.parse_args()
assert args.num > 0
assert args.input_file

if args.output_file:
    out_file = open(args.output_file, "wb")
else:
    out_file = None

with open(args.input_file, "rb") as in_file:
    for i, entry in enumerate(bson.decode_file_iter(in_file)):
        if i >= args.num:
            break
        if out_file is None:
            if args.pprint:
                pprint.pprint(entry)
            else:
                print(entry)
        else:
            out_file.write(bson.BSON.encode(entry))

if out_file:
    out_file.close()
data_path = Path('data')
base_path = data_path / 'test'
base_path.mkdir(exist_ok=True)

n_cores = 12
prod_to_category = mp.Manager().dict()  # shared dict expected to be filled by the `process` workers
q = mp.Queue(maxsize=n_cores)
iolock = mp.Lock()
pool = mp.Pool(n_cores, initializer=process, initargs=(q, iolock))

# process the file
data = bson.decode_file_iter(open(str(data_path / 'test.bson'), 'rb'))
for c, d in enumerate(data):
    q.put(d)

# tell workers we're done
for _ in range(n_cores):
    q.put(None)
pool.close()
pool.join()

# convert back to a normal dictionary
prod_to_category = dict(prod_to_category)
prod_to_category = pd.DataFrame.from_dict(prod_to_category, orient='index')