Example #1
def split_bson(input_bson_filename, output_bson_filename_1,
               output_bson_filename_2, n, number_random_example):

    data = bson.decode_file_iter(open(input_bson_filename, 'rb'))

    random_items = random.sample(range(n), number_random_example)
    random_items.sort()
    print(random_items[0])
    r_idx = 0
    print(n)
    with open(output_bson_filename_1, 'wb') as output:
        for c, d in tqdm(enumerate(data), total=n):
            if c != random_items[r_idx]:
                continue
            else:
                # print("pick random item: {}".format(c))
                # insert your code here.
                output.write(BSON.encode(d))
                r_idx = r_idx + 1
                if r_idx == number_random_example:
                    break
    r_idx = 0
    data = bson.decode_file_iter(open(input_bson_filename, 'rb'))
    with open(output_bson_filename_2, 'wb') as output:
        for c, d in tqdm(enumerate(data), total=n):
            if r_idx < number_random_example and c == random_items[r_idx]:
                r_idx = r_idx + 1
                continue
            else:

                output.write(BSON.encode(d))
    print("Finish convert tfrecords with {} records".format(r_idx))
Example #2
def large_social_networks_twitter():
    # count = 0

    degree_thrd = 3


    index = []
    G = nx.Graph()
    with open('/mnt/wzhan139/cross media data/Twitter/twitter_followees.bson', "rb") as f:
        data = bson.decode_file_iter(f, bson.CodecOptions(unicode_decode_error_handler="ignore"))
        count = 0
        for c, d in enumerate(data):
            print("Reading node "+ str(c))
            index.append(d['user_name'])
            G.add_node(d['user_name'])
            count += 1
    with open('/mnt/wzhan139/cross media data/Twitter/twitter_followees.bson', "rb") as f:
        data = bson.decode_file_iter(f, bson.CodecOptions(unicode_decode_error_handler="ignore"))
        for c, d in enumerate(data):
            print("Constructing graph in node " + str(c))
            for j in range(len(d['followees'])):
                if G.has_node(d['followees'][j]['screen_name']):
                    G.add_edge(d['user_name'], d['followees'][j]['screen_name'])
    with open('/mnt/wzhan139/cross media data/Twitter/twitter_followers.bson', "rb") as f:
        data = bson.decode_file_iter(f, bson.CodecOptions(unicode_decode_error_handler="ignore"))
        for c, d in enumerate(data):
            print("Constructing graph in node " + str(c))
            for i in range(len(d['followers'])):
                if G.has_node(d['followers'][i]['screen_name']):
                    G.add_edge(d['user_name'], d['followers'][i]['screen_name'])


    G2 = nx.convert_node_labels_to_integers(G,label_attribute='old_label')
    num_node = nx.adjacency_matrix(G2).shape[0]
    sparsity = G2.number_of_edges() / num_node ** 2
    print("no thredshold graph sparsity is " + str(sparsity))
    print(nx.info(G2))
    nx.write_gpickle(G2, "twitter.nothred.gpickle")

    remove_node=[]
    for n, d in G2.nodes(data=True):
        if G2.degree(n)<degree_thrd:
            remove_node.append(n)
    G2.remove_nodes_from(np.asarray(remove_node))
    G2 = nx.convert_node_labels_to_integers(G2)

    num_node=nx.adjacency_matrix(G2).shape[0]
    G3 = nx.from_scipy_sparse_matrix(sp.dia_matrix((np.ones(num_node), 0), shape=nx.adjacency_matrix(G2).shape))
    G4=nx.compose(G2,G3)
    nx.write_gpickle(G4, "twitter.gpickle")
    nx.write_adjlist(G4, "twitter_adj")
    nx.write_edgelist(G4, "twitter_edgelist")

    sparsity = G4.number_of_edges()/num_node**2
    print("sparsity is "+ str(sparsity))
    print(nx.info(G4))
def load_train_data(path, cutoff, sample_file=0):
    """
    :param path: path of input dataset
    :param cutoff: how many lines you gonna read into memory
    :param sample_file: 1 - using sample file
    :return:
        list of all category_ids, ids, images(binary), weights(1/n_imgs)
    """
    NCORE = cpu_count()
    all_categories = mp.Manager().list()
    all_ids = mp.Manager().list()
    all_imgs = mp.Manager().list()
    all_weights = mp.Manager().list()

    q = mp.Queue(maxsize=NCORE)
    iolock = mp.Lock()

    _, _, _, _, _, id2index = read_category(path)
    pool = mp.Pool(NCORE,
                   initializer=process,
                   initargs=(q, iolock, all_ids, all_categories, all_imgs,
                             all_weights, id2index))

    # process the file
    if sample_file == 0:
        data = bson.decode_file_iter(open(path + '/train.bson', 'rb'))
    if sample_file == 1:
        data = bson.decode_file_iter(open(path + '/train_example.bson', 'rb'))

    it = 0
    for c, d in enumerate(data):
        if it >= cutoff:
            break
        q.put(d)  # blocks until q below its max size
        it = it + 1

    # tell workers we're done
    for _ in range(NCORE):
        q.put(None)
    pool.close()
    pool.join()

    # convert back to normal dictionary
    all_categories = list(all_categories)
    all_ids = list(all_ids)
    all_imgs = list(all_imgs)
    all_weights = list(all_weights)
    return all_categories, all_ids, all_imgs, all_weights
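
The `process` initializer passed to mp.Pool above is not part of this listing. A minimal sketch of what such a worker could look like, assuming each queued document follows the Cdiscount layout (`_id`, `category_id`, `imgs`) and that the worker loops on the queue until it receives the None sentinel:

def process(q, iolock, all_ids, all_categories, all_imgs, all_weights, id2index):
    # Runs inside each pool worker: drain the shared queue until the sentinel arrives.
    while True:
        d = q.get()
        if d is None:
            break
        n_imgs = len(d['imgs'])
        for pic in d['imgs']:
            all_ids.append(d['_id'])
            all_categories.append(id2index[d['category_id']])
            all_imgs.append(pic['picture'])   # raw JPEG bytes
            all_weights.append(1.0 / n_imgs)  # weight = 1/n_imgs, per the docstring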
Example #4
 def last_oplog_timestamp(self):
     oplog = None
     try:
         if os.path.isfile(
                 self.lockfile):  # I'm first looking for the .lock file
             self.logger.info(
                 'Getting last bson-timestamp from previous oplog history file: "%s"'
                 % self.lockfile)
             oplog = self.getLastTimestampSaved()
             self.last_dump_ts = oplog
         elif os.path.isfile(self.origin_dump_oplog):
             self.logger.info(
                 'Getting last bson-timestamp from oplog file: "%s"' %
                 self.origin_dump_oplog)
             oplog = open(self.origin_dump_oplog)
             for change in bson.decode_file_iter(oplog):
                 self.last_dump_ts = change['ts']
             oplog.close()
         else:
             raise Exception, 'Could not find an oplog.bson or history file: "%s"!' % self.origin_dump_oplog, None
     except Exception, e:
         self.logger.fatal(
             'Failed to restore mongodump to destination! Error: "%s"' % e)
         if oplog:
             oplog.close()
         exit(1)
Example #5
def TestBatchGenerator(batch_size=1):
    categories = categorydict()
    count = 0
    batchX = []

    for sample in bson.decode_file_iter(
            open(path.join(dataroot, testfile), 'rb')):
        imgs = sample['imgs']
        c = sample['_id']
        for i in range(len(imgs)):
            im = imgs[i]['picture']
            im = imread(io.BytesIO(im))
            im = cv2.resize(im,
                            None,
                            fx=0.5,
                            fy=0.5,
                            interpolation=cv2.INTER_AREA)
            batchX.append(im)

        batchY = c
        count = count + 1
        #print(count)
        if count < batch_size:
            pass
        else:
            yield np.asarray(batchX), np.asarray(batchY)
            count = 0
            batchX = []
Example #6
def getTotalTrainImageCount():
    count = 0
    for sample in bson.decode_file_iter(
            open(path.join(dataroot, trainfile), 'rb')):
        imgs = sample['imgs']
        count = count + len(imgs)
    return count
Example #7
def analyze(recording_file,
            focus=None,
            num_top=None,
            new_aspects=None,
            filter_stmt=None):
    # Set up reporter
    reporter = Reporter(filter_stmt=filter_stmt)

    if new_aspects:
        for new_aspect in new_aspects:
            reporter.add_aspect_from_eval(*new_aspect)

    plugins = load_plugins()
    for aspect_class in plugins['aspects']:
        reporter.add_aspect(aspect_class)

    # Stream samples to reporter
    for sample in decode_file_iter(recording_file):
        reporter.add_sample(sample['t'], sample['o'])

    echo('== Summary ==')
    for stat, val in sorted(reporter.report.get_summary().items()):
        echo('  %s = %s' % (stat, val))
    echo()

    reporter.report.print_top(focus=focus, num_top=num_top)
Example #8
def merge_bson_unique(output, inputs, uniquefield):
    uniquefieldset = set()
    duplicatecount = 0
    invalidcount = 0
    totalcount = 0

    #configure logging
    logger = logging.getLogger(__name__)
    logger.info('Creating your output file : %s', output)
    with open(output, 'wb') as outputbson:
        for bsonfile in inputs:
            totalcount += 1
            logger.info('Opening input file : %s', bsonfile)
            with open(bsonfile, 'rb') as bsonfile_handle:
                iterator = decode_file_iter(bsonfile_handle)
                for singleobject in iterator:
                    if uniquefield not in singleobject:
                        invalidcount += 1
                        continue
                    if singleobject[uniquefield] not in uniquefieldset:
                        outputbson.write(BSON.encode(singleobject))
                        uniquefieldset.add(singleobject[uniquefield])
                    else:
                        duplicatecount += 1
            logging.info('Finished merging input file : %s', bsonfile)
    logger.info('Finished merging all input files to path : %s', output)
    logger.info('Duplicates: %s, Total: %s, Invalid: %s', duplicatecount,
                totalcount, invalidcount)
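
A hedged usage sketch (file names are placeholders); enabling basic logging makes the INFO messages visible:

import logging
logging.basicConfig(level=logging.INFO)

# Deduplicate two BSON dumps on their '_id' field into one output file.
merge_bson_unique('merged.bson', ['part1.bson', 'part2.bson'], '_id')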
Example #9
    def _get_data(self, file_path):
        """
        This method imports the dummy dataset if it wasn't already loaded.
        """
        data_set = []

        print("Reading BSON file...")
        data = bson.decode_file_iter(open(file_path, 'rb'))

        print("Starting processing...")
        for c, d in enumerate(data):
            # Store what product belongs to what
            category_id = d['category_id']
            for e, pic in enumerate(d['imgs']):
                picture = imread(io.BytesIO(pic['picture']))
                # Store image with its label
                data_set.append((category_id, picture.reshape(1, -1)))
            if c % 10 == 0 and c > 0:
                print("Iteration {}".format(c))

        # Convert image data to Pandas DataFrame
        pd_data = pd.DataFrame.from_records(data_set)
        pd_data.rename(columns={0: 'category_id', 1: 'image'}, inplace=True)

        return pd_data
Example #10
 def __iter__(self):
     for item in tqdm(decode_file_iter(open(self.fn, 'rb'))):
         sentences = mongodoc2sentences(item, '../data/cases')
         if sentences:
             for sentence in sentences:
                 yield sentence.split()
             self.count+=1
def main(args):
    if os.path.exists(args.save_train_bson):
        raise FileExistsError(args.save_train_bson)
    if os.path.exists(args.save_val_bson):
        raise FileExistsError(args.save_val_bson)

    logging.info('aggregating id of products...')
    product_ids = list()
    with open(args.input_bson, 'rb') as reader:
        data = bson.decode_file_iter(reader)
        for x in tqdm(data, unit='products', total=args.num_products):
            product_ids.append(x.get('_id'))

    logging.info('shuffle train and val ids...')
    num_val = int(len(product_ids) * args.val_ratio)
    random.seed(args.random_seed)
    random.shuffle(product_ids)
    val_product_ids = set(random.sample(product_ids, num_val))
    train_product_ids = set(product_ids) - val_product_ids

    logging.info('writing {} products for validation: {}'.format(len(val_product_ids), args.save_val_bson))
    encode_dict_list(products=products_iter(args.input_bson, val_product_ids),
                     output_bson_path=args.save_val_bson,
                     total=len(val_product_ids))

    logging.info('writing {} products for training: {}'.format(len(train_product_ids), args.save_train_bson))
    encode_dict_list(products=products_iter(args.input_bson, train_product_ids),
                     output_bson_path=args.save_train_bson,
                     total=len(train_product_ids))
def products_iter(input_bson_path, product_ids):
    with open(input_bson_path, 'rb') as reader:
        data = bson.decode_file_iter(reader)
        for i, prod in tqdm(enumerate(data), unit='products', total=args.num_products, disable=True):
            prod_id = prod.get('_id')
            if prod_id in product_ids:
                yield prod
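
`encode_dict_list` is referenced above but not shown in this listing. A minimal sketch under the assumption that it simply re-encodes each product dict into the output BSON file:

def encode_dict_list(products, output_bson_path, total=None):
    # Stream the selected products into a new BSON dump.
    with open(output_bson_path, 'wb') as writer:
        for prod in tqdm(products, unit='products', total=total):
            writer.write(bson.BSON.encode(prod))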
Example #14
def apply_bson_delta(left, patch, outfile):
    """Apply patch to bson file and output results file"""
    if isinstance(left, io.IOBase):
        leftf = left
    else:
        leftf = open(left, 'rb')

    if isinstance(patch, io.IOBase):
        deltaf = patch
    else:
        deltaf = open(patch, 'rb')

    patchdata = bson.decode(deltaf.read())
    patched = {'a': {}, 'c': {}, 'd': {}}
    for r in patchdata['records']:
        if 'patch' in r.keys():
            o = r['patch']
        else:
            o = r['obj']
        patched[r['mode']][r['uniqkey']] = o
    for o in bson.decode_file_iter(leftf):
        if o[patchdata['uniqkey']] in patched['d']:
            continue
        elif o[patchdata['uniqkey']] in patched['c']:
            o = patched['c'][o[patchdata['uniqkey']]]
            outfile.write(bson.BSON.encode(o))
        else:
            outfile.write(bson.BSON.encode(o))
    for o in patched['a'].values():
        outfile.write(bson.BSON.encode(o))
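
From the field accesses above, the patch document this function expects appears to have roughly the following shape (a hedged reconstruction, not a documented format):

patch_doc = {
    'uniqkey': '_id',          # name of the field used to match records
    'records': [
        {'mode': 'a', 'uniqkey': 3, 'obj':   {'_id': 3, 'name': 'added'}},
        {'mode': 'c', 'uniqkey': 1, 'patch': {'_id': 1, 'name': 'changed'}},
        {'mode': 'd', 'uniqkey': 2, 'obj':   {'_id': 2}},
    ],
}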
Example #15
def test_generator(bson_file, batch_size):
    product_cnt = 0
    pic_cnt = 0
    num_products = NUM_TEST_PRODUCTS
    num_pics = NUM_TEST_PICS
    prods = []
    pics = []

    with open(bson_file, 'rb') as bf:
        data = bson.decode_file_iter(bf)

        for prod in data:
            product_id = prod['_id']

            for picidx, pic in enumerate(prod['imgs']):
                picture = load_img_array(io.BytesIO(pic['picture']))
                pics.append(picture)
                prods.append(product_id)
                pic_cnt += 1

            # guarantee pics in the same prod are in the same batch
            if 0 <= (pic_cnt % batch_size) < 4 or pic_cnt == num_pics:
                yield np.array(prods), np.array(pics)
                prods = []
                pics = []

            product_cnt += 1
            if product_cnt % 1000 == 0 or product_cnt == num_products:
                print("converted {} products {} images".format(
                    product_cnt, pic_cnt))
Example #16
def run():
    for dataset_file_path, dataset_folder_path in zip(
        (TRAIN_FILE_PATH, TEST_FILE_PATH),
        (TRAIN_FOLDER_PATH, TEST_FOLDER_PATH)):
        print("Processing {} ...".format(dataset_file_path))

        with open(dataset_file_path, "rb") as dataset_file_object:
            data_generator = bson.decode_file_iter(dataset_file_object)

            for data in data_generator:
                category_id = data.get("category_id", "dummy")
                category_folder_path = os.path.join(dataset_folder_path,
                                                    str(category_id))
                os.makedirs(category_folder_path, exist_ok=True)

                product_id = data["_id"]
                for picture_id, picture_dict in enumerate(data["imgs"]):
                    picture_content = picture_dict["picture"]
                    picture_file_path = os.path.join(
                        category_folder_path,
                        "{}_{}.jpg".format(product_id, picture_id))
                    with open(picture_file_path, "wb") as picture_file_object:
                        picture_file_object.write(picture_content)

    print("All done!")
Example #17
def random_sample_bson(input_bson_filename,
                       output_bson_filename,
                       n=100,
                       number_random_example=10):

    data = bson.decode_file_iter(open(input_bson_filename, 'rb'))

    random_items = random.sample(range(n), number_random_example)
    random_items.sort()

    r_idx = 0

    with open(output_bson_filename, 'wb') as output:
        for c, d in tqdm(enumerate(data), total=n):
            if c != random_items[r_idx]:
                continue
            else:
                # print("pick random item: {}".format(c))
                # insert your code here.
                output.write(BSON.encode(d))
                r_idx = r_idx + 1
                if r_idx >= number_random_example:
                    break

    print("Finish convert tfrecords with {} records".format(r_idx))
Example #18
def run_train_boson_to_image():

    bson_file = '/media/ssd/data/kaggle/cdiscount/__download__/train.bson'
    num_products = 7069896  # 7069896 for train and 1768182 for test
    out_dir = CDISCOUNT_DIR + '/train'

    os.makedirs(out_dir, exist_ok=True)
    categories = pd.read_csv(CDISCOUNT_DIR + '/category_names.csv',
                             index_col='category_id')
    for category in categories.index:
        os.makedirs(out_dir + '/' + str(category), exist_ok=True)

    with open(bson_file, 'rb') as fbson:
        data = bson.decode_file_iter(fbson)
        #num_products = len(list(data))
        #print ('num_products=%d'%num_products)
        #exit(0)

        for n, d in enumerate(data):
            print('%08d/%08d' % (n, num_products))

            category = d['category_id']
            _id = d['_id']
            for i, pic in enumerate(d['imgs']):
                img_file = out_dir + '/' + str(
                    category) + '/' + '%s-%d.jpg' % (str(_id), i)
                #print(img_file)

                with open(img_file, 'wb') as f:
                    f.write(pic['picture'])
Example #19
def build_distance_matrix(filename):
    #read measurements
    measurements = []
    #count = 300
    with open(filename, 'rb') as measurement_file:
        iter = bson.decode_file_iter(measurement_file)
        for document in iter:
            #count -= 1
            measurements.append(document)

            #if count == 0:
            #    break

    #compute distance matrix
    distance_matrix = []
    for i in range(0, len(measurements)):
        distance_matrix.append([]);

    for i in range(0, len(measurements) - 1):
        distance_matrix[i].append(100.0)
        for j in range(i + 1, len(measurements)):
            distance = compute_distance(measurements[i], measurements[j])
            distance_matrix[i].append(distance)
            distance_matrix[j].append(distance)

    distance_matrix[len(distance_matrix) - 1].append(100.0)

    return measurements, distance_matrix
 def parallel_map(func):
     with Pool() as pool:
         with path.open("rb") as file:
             return [
                 res for res in pool.imap(
                     func, bson.decode_file_iter(file), chunksize=10000)
             ]
Example #21
def get_single_item(idx):
    data_bson = bson.decode_file_iter(open(train_example_bson, 'rb'))
    prod_id = []
    cat_id = []
    img_arr = []
    index = 0

    for c, d in enumerate(data_bson):
        product_id = d['_id']
        category_id = d['category_id']  # This won't be in Test data
        #     prod_to_category[product_id] = category_id
        for e, pic in enumerate(d['imgs']):
            # array of image
            # picture = imread(io.BytesIO(pic['picture']))

            # bytes of image
            picture = pic['picture']

            if idx == index:
                prod_id.append(product_id)
                cat_id.append(category_id)
                img_arr.append(picture)

            index += 1

    return (prod_id[0], cat_id[0], img_arr[0])
    def test_backports(self):
        doc = BSON.encode({"tuple": (1, 2)})
        exp = {"tuple": [1, 2]}
        options = CodecOptions(uuid_representation=ALL_UUID_REPRESENTATIONS[0],
                               tz_aware=False, document_class=dict)

        self.assertEqual(
            {"tuple": [1, 2]},
            BSON.encode(
                {"tuple": (1, 2)}, codec_options=options,
                uuid_subtype=ALL_UUID_REPRESENTATIONS[1]).decode())
        self.assertEqual(exp, doc.decode(
            as_class=SON,
            tz_aware=True,
            uuid_subtype=ALL_UUID_REPRESENTATIONS[1],
            codec_options=options))
        self.assertEqual([exp], list(decode_iter(
            doc,
            as_class=SON,
            tz_aware=True,
            uuid_subtype=ALL_UUID_REPRESENTATIONS[1],
            codec_options=options)))
        self.assertEqual([exp], list(decode_file_iter(
            StringIO(doc),
            as_class=SON,
            tz_aware=True,
            uuid_subtype=ALL_UUID_REPRESENTATIONS[1],
            codec_options=options)))
        self.assertEqual([exp], decode_all(
            doc, SON, True, ALL_UUID_REPRESENTATIONS[1], True, options))
Example #23
def get_data(NCORE, bsonfile):
    """
    given number of cores, and bsonfile location,
    returns:
    dataframe1: _id, category_id
    dataframe2: _id, list of images
    """
    q = mp.Queue(maxsize=NCORE)
    iolock = mp.Lock()
    manager = mp.Manager()
    prod_to_category = manager.dict()
    prod_to_images = manager.dict()
    pool = mp.Pool(NCORE,
                   initializer=process,
                   initargs=(q, iolock, prod_to_category, prod_to_images))

    data = bson.decode_file_iter(open(bsonfile, 'rb'))
    for c, d in enumerate(data):
        q.put(d)  # blocks until q below its max size

    # tell workers we're done
    for _ in range(NCORE):
        q.put(None)
    pool.close()
    pool.join()

    prod_to_category = dict(prod_to_category)
    prod_to_images = dict(prod_to_images)

    prod_to_category = pd.DataFrame(list(prod_to_category.items()),
                                    columns=['_id', 'category_id'])
    prod_to_images = pd.DataFrame(list(prod_to_images.items()),
                                  columns=['_id', 'images'])

    return prod_to_category, prod_to_images
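
A hedged usage example for get_data, assuming a matching `process` worker is defined and the file path is a placeholder:

prod_to_category, prod_to_images = get_data(NCORE=4, bsonfile='train_example.bson')
print(prod_to_category.head())
print(prod_to_images.head())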
    def run(self):
        logging.info("Resolving oplog for host %s:%s to max timestamp: %s" % (self.host, self.port, self.max_end_ts))

        try:
            if self.dump_gzip:
                tailed_oplog_fh = GzipFile(self.tailed_oplog_file)
                mongodump_oplog_fh = GzipFile(self.mongodump_oplog_file, 'a+')
            else:
                tailed_oplog_fh = open(self.tailed_oplog_file)
                mongodump_oplog_fh = open(self.mongodump_oplog_file, 'a+')

            for change in decode_file_iter(tailed_oplog_fh):
                if 'ts' in change:
                    ts = change['ts']
                    if ts > self.mongodump_oplog_last_ts or self.mongodump_oplog_last_ts is None:
                        if ts < self.max_end_ts:
                            mongodump_oplog_fh.write(BSON.encode(change))
                            self.changes += 1
                            self.last_ts = ts
                        elif ts > self.max_end_ts:
                            break
            tailed_oplog_fh.close()
            mongodump_oplog_fh.flush()
            mongodump_oplog_fh.close()
        except Exception, e:
            logging.fatal("Resolving of oplogs failed! Error: %s" % e)
            raise e
Example #25
def run_make_train_summary():

    bson_file = '/media/ssd/data/kaggle/cdiscount/__download__/train.bson'
    num_products = 7069896  # 7069896 for train and 1768182 for test
    out_dir = CDISCOUNT_DIR

    id = []
    num_imgs = []
    category_id = []

    with open(bson_file, 'rb') as fbson:
        data = bson.decode_file_iter(fbson)
        #num_products = len(list(data))
        #print ('num_products=%d'%num_products)
        #exit(0)

        for n, d in enumerate(data):
            print('\r%08d/%08d' % (n, num_products), flush=True, end='')

            category_id.append(d['category_id'])
            id.append(d['_id'])
            num_imgs.append(len(d['imgs']))
        print('')

    #by product id
    df = pd.DataFrame({
        '_id': id,
        'num_imgs': num_imgs,
        'category_id': category_id
    })
    df.to_csv(
        '/media/ssd/data/kaggle/cdiscount/__temp__/train_by_product_id.csv',
        index=False)
    t = df['num_imgs'].sum()  #check :12371293
    print(t)

    #split by id --------------------------------------
    id_random = list(id)
    random.shuffle(id_random)

    #make train, valid
    num_train = int(0.8 * (num_products))
    num_valid = num_products - num_train

    #by id
    file1 = CDISCOUNT_DIR + '/split/' + 'train_id_v0_%d' % (num_train)
    file2 = CDISCOUNT_DIR + '/split/' + 'valid_id_v0_%d' % (num_valid)
    id1 = id_random[0:num_train]
    id2 = id_random[num_train:]
    write_list_to_file(id1, file1)
    write_list_to_file(id2, file2)

    #summary ------------------------------------
    g = (df.groupby('category_id').agg({
        '_id': 'count',
        'num_imgs': 'sum'
    }).reset_index())
    g.to_csv('/media/ssd/data/kaggle/cdiscount/__temp__/train_g.csv',
             index=False)
Example #26
def transfer_data(
):  # A generator that yields batches of labels and images
    #transfer all images to features
    data = bson.decode_file_iter(open(
        PIC_PATH, 'rb'))  # bson.decode_file_iter is a generator
    # full list of classes
    # Note: bson.decode_file_iter returns a generator, so it has no len().
    df_categories = pd.read_csv(LABEL_PATH, index_col='category_id')
    category_classes = df_categories.index.values
    category_classes = category_classes.reshape(category_classes.shape[0], 1)

    # using just a LabelBinarizer (no separate encoder) so every unique category_id gets its own column
    lb = preprocessing.LabelBinarizer()
    lb.fit(df_categories.index.values)

    # Picture size is defined here instead of being read from the first picture
    n = 500  # Batch size
    pix_x = 180
    pix_y = 180
    rgb = 3

    X_ids = np.zeros((n, 1)).astype(int)
    Y = np.zeros((n, 1)).astype(int)  # category_id for each row
    X_images = np.zeros((n, pix_x, pix_y, rgb))  # m images are 180 by 180 by 3
    i = 0

    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.7  # Restrict the growth of memory use, or memory will be used up
    images = tf.placeholder(tf.float32, [n, pix_x, pix_y, rgb], name="images")
    op = tf.image.resize_images(images, [MODEL_SIZE, MODEL_SIZE],
                                method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)

    batch_num = 0

    with tf.Session(config=config) as sess:
        for c, d in enumerate(data):
            for e, pic in enumerate(d['imgs']):
                if i == 0:
                    Y = np.zeros(
                        (n, 1)).astype(int)  # category_id for each row
                    X_images = np.zeros(
                        (n, pix_x, pix_y, rgb))  # m images are 180 by 180 by 3
                picture = imread(io.BytesIO(
                    pic['picture']))  # All images should be added.
                Y[i] = d['category_id']
                X_images[i] = picture
                i += 1
                if i == n:
                    batch_num += 1
                    i = 0
                    Y_flat = lb.transform(Y)
                    X_flat = sess.run(op, feed_dict={images: X_images})
                    Y = np.zeros(
                        (n, 1)).astype(int)  # category_id for each row
                    X_images = np.zeros(
                        (n, pix_x, pix_y, rgb))  # m images are 180 by 180 by 3
                    yield X_flat, Y_flat
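
A short usage sketch, assuming PIC_PATH, LABEL_PATH and MODEL_SIZE are configured; transfer_data() is a generator, so batches are pulled with next():

gen = transfer_data()
X_batch, Y_batch = next(gen)  # X_batch: resized images, Y_batch: one-hot labels
print(X_batch.shape, Y_batch.shape)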
Example #27
 def from_bson(cls, x):
     if six.PY2:
         # Hack for python 2: may work in py3 too, but it's definitely not the standard way!
         reader = bson.decode_file_iter(six.BytesIO(x))
         event_dict = next(reader)
     else:
         event_dict = bson.BSON.decode(x)
     return cls(**event_dict)
 def __iter__(self):
     with open(self._filename, 'rb') as f:
         i = 1
         for tweet in decode_file_iter(f):
             if self._limit and i > self._limit:
                 return
             if all(func(tweet) for func in self._filter_functions):
                 i += 1
                 yield tweet
    def test_invalid_decodes(self):
        # Invalid object size (not enough bytes in document for even
        # an object size of first object.
        # NOTE: decode_all and decode_iter don't care, not sure if they should?
        self.assertRaises(InvalidBSON, list,
                          decode_file_iter(StringIO(b"\x1B")))

        # An object size that's too small to even include the object size,
        # but is correctly encoded, along with a correct EOO (and no data).
        data = b"\x01\x00\x00\x00\x00"
        self.assertRaises(InvalidBSON, decode_all, data)
        self.assertRaises(InvalidBSON, list, decode_iter(data))
        self.assertRaises(InvalidBSON, list, decode_file_iter(StringIO(data)))

        # One object, but with object size listed smaller than it is in the
        # data.
        data = (b"\x1A\x00\x00\x00\x0E\x74\x65\x73\x74"
                b"\x00\x0C\x00\x00\x00\x68\x65\x6C\x6C"
                b"\x6f\x20\x77\x6F\x72\x6C\x64\x00\x00"
                b"\x05\x00\x00\x00\x00")
        self.assertRaises(InvalidBSON, decode_all, data)
        self.assertRaises(InvalidBSON, list, decode_iter(data))
        self.assertRaises(InvalidBSON, list, decode_file_iter(StringIO(data)))

        # One object, missing the EOO at the end.
        data = (b"\x1B\x00\x00\x00\x0E\x74\x65\x73\x74"
                b"\x00\x0C\x00\x00\x00\x68\x65\x6C\x6C"
                b"\x6f\x20\x77\x6F\x72\x6C\x64\x00\x00"
                b"\x05\x00\x00\x00")
        self.assertRaises(InvalidBSON, decode_all, data)
        self.assertRaises(InvalidBSON, list, decode_iter(data))
        self.assertRaises(InvalidBSON, list, decode_file_iter(StringIO(data)))

        # One object, sized correctly, with a spot for an EOO, but the EOO
        # isn't 0x00.
        data = (b"\x1B\x00\x00\x00\x0E\x74\x65\x73\x74"
                b"\x00\x0C\x00\x00\x00\x68\x65\x6C\x6C"
                b"\x6f\x20\x77\x6F\x72\x6C\x64\x00\x00"
                b"\x05\x00\x00\x00\xFF")
        self.assertRaises(InvalidBSON, decode_all, data)
        self.assertRaises(InvalidBSON, list, decode_iter(data))
        self.assertRaises(InvalidBSON, list, decode_file_iter(StringIO(data)))
    def test_decode_file_iter(self):
        expected, bson_data = self._generate_multidocument_bson_stream()
        fileobj = tempfile.TemporaryFile()
        fileobj.write(bson_data)
        fileobj.seek(0)

        for expected_doc, decoded_doc in zip(
                expected, decode_file_iter(fileobj, self.codecopts)):
            self.assertEqual(expected_doc, decoded_doc)

        fileobj.close()
 def test_date_filter_bson_date_filter_until(self):
     self.setUp()
     args = date_filter_bson.parse_args(['-i', os.path.dirname(os.path.abspath(__file__))+'/../test/test.bson', '-d2', '2016-01-18 00:00:00', '-o', os.path.dirname(os.path.abspath(__file__))+'/../test/output.bson'])
     date_filter_bson.date_filter(args.output, args.input, args.dateone, args.datetwo)
     count = 0
     with open(os.path.dirname(os.path.abspath(__file__))+'/../test/output.bson', 'rb') as bsonfile_handle:
             iterator = decode_file_iter(bsonfile_handle)
             for line in iterator:
                 count+=1
     self.assertEqual(count, 21)
     #remove output self.tearDown
     self.tearDown()
Example #32
def filter_records(infile, year, month, day, tz):
    """
    Takes in a file handle pointing at a BSON file, and a year, month, day, timezone.
    Returns only those tweets which were sent on that date in that timezone.
    """
    it = decode_file_iter(infile)
    try:
        for rec in it:
            d = tweet_date(rec).astimezone(tz)
            if d.year == year and d.month == month and d.day == day:
                yield rec
    except Exception as e:
        print e
Example #33
def merge_bson(output, inputs):
    #configure logging
    logger = logging.getLogger(__name__)
    logger.info('Creating your output file : %s', output)
    with open(output, 'wb') as outputbson:
        for bsonfile in inputs:
            logger.info('Opening input file : %s', bsonfile)
            with open(bsonfile, 'rb') as bsonfile_handle:
                iterator = decode_file_iter(bsonfile_handle)
                for line in iterator:
                    outputbson.write(BSON.encode(line))
    logger.info('Finished merging input file : %s', bsonfile)
    logger.info('Finished merging all input files to path : %s', output)
 def load(self):
     try:
         oplog = self.open()
         logging.debug("Reading oplog file %s" % self.oplog_file)
         for change in decode_file_iter(oplog, CodecOptions(unicode_decode_error_handler="ignore")):
             if 'ts' in change:
                 self._last_ts = change['ts']
             if self._first_ts is None and self._last_ts is not None:
                 self._first_ts = self._last_ts
             self._count += 1
         oplog.close()
     except Exception, e:
         logging.fatal("Error reading oplog file %s! Error: %s" % (self.oplog_file, e))
         raise OperationError(e)
	def get_iterator(self):
		tweet_parser = TweetParser()
		bson_handle = open(self.filepath, 'rb')
		for count, tweet in enumerate(bson.decode_file_iter(bson_handle)):
			if self.limit < count+1 and self.limit != 0:
				bson_handle.close()
				return
			elif tweet_parser.tweet_passes_filter(self.filter, tweet) \
			and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
				if self.should_strip:
					yield tweet_parser.strip_tweet(self.keep_fields, tweet) 
				else: 
					yield tweet
		bson_handle.close()
    def convert_bson(self):
        message = 'Converting BSON "{}" to language shelf #{}'.format(self.dataset, self.process_id)
        bson_file = ProgressFile(self.bson_file, 'rb', message=message)
        shelf_name = 'languages-' + self.process_id + '.shelf'
        languages = shelve.open(self.path + shelf_name, writeback=True)

        # Read every BSON object as an iterator to save memory.
        for raw_json in bson.decode_file_iter(bson_file):
            repository = raw_json['full_name'].encode('utf-8')
            language = raw_json['language'].encode('utf-8') if raw_json['language'] is not None else ''
            languages[repository] = language

        languages.close()
        bson_file.close()
        self.cleanup(shelf_name)
def split_images(source, depth):
    categories = get_categories(depth)
    with open(source, 'rb') as data:
        for entry in bson.decode_file_iter(data):
            product_id = entry['_id']
            category = categories[int(entry['category_id'])]
            target = os.path.join('data', 'categories' + str(depth), category)

            if not os.path.exists(target):
                print target
                os.makedirs(target)

            for e, pic in enumerate(entry['imgs']):
                picture = imread(io.BytesIO(pic['picture']))
                picture_file = os.path.join(target, str(product_id) + '_' + str(e) + '.jpg')
                imsave(picture_file, picture)
    def read(self):
        if os.path.isfile(self.oplog_file):
            try:
                logging.debug("Reading oplog file %s" % self.oplog_file)

                if self.dump_gzip:
                    oplog = GzipFile(self.oplog_file)
                else:
                    oplog = open(self.oplog_file)

                for change in decode_file_iter(oplog):
                    if 'ts' in change:
                        self._last_ts = change['ts']
                    if self._first_ts is None and self._last_ts is not None:
                        self._first_ts = self._last_ts
                    self._count += 1
                oplog.close()
            except Exception, e:
                logging.fatal("Error reading oplog file %s! Error: %s" % (self.oplog_file, e))
                raise e
 def test_basic_decode(self):
     self.assertEqual({"test": u("hello world")},
                      BSON(b"\x1B\x00\x00\x00\x0E\x74\x65\x73\x74\x00\x0C"
                           b"\x00\x00\x00\x68\x65\x6C\x6C\x6F\x20\x77\x6F"
                           b"\x72\x6C\x64\x00\x00").decode())
     self.assertEqual([{"test": u("hello world")}, {}],
                      decode_all(b"\x1B\x00\x00\x00\x0E\x74\x65\x73\x74"
                                 b"\x00\x0C\x00\x00\x00\x68\x65\x6C\x6C"
                                 b"\x6f\x20\x77\x6F\x72\x6C\x64\x00\x00"
                                 b"\x05\x00\x00\x00\x00"))
     self.assertEqual([{"test": u("hello world")}, {}],
                      list(decode_iter(
                         b"\x1B\x00\x00\x00\x0E\x74\x65\x73\x74"
                         b"\x00\x0C\x00\x00\x00\x68\x65\x6C\x6C"
                         b"\x6f\x20\x77\x6F\x72\x6C\x64\x00\x00"
                         b"\x05\x00\x00\x00\x00")))
     self.assertEqual([{"test": u("hello world")}, {}],
                      list(decode_file_iter(StringIO(
                         b"\x1B\x00\x00\x00\x0E\x74\x65\x73\x74"
                         b"\x00\x0C\x00\x00\x00\x68\x65\x6C\x6C"
                         b"\x6f\x20\x77\x6F\x72\x6C\x64\x00\x00"
                         b"\x05\x00\x00\x00\x00"))))
    def convert_bson(self):
        output = open(self.path + self.dataset + '.json', 'wb')
        message = 'Converting BSON "{}" and filtering fields'.format(self.dataset)
        bson_file = ProgressFile(self.bson_file, 'rb', message=message)
        
        Shelf.merge_shelves()
        
        if os.path.isfile('languages.shelf'):
            if self.path != "" and not os.path.isfile(self.path + 'languages.shelf'):
                print("#{}. Copying languages shelf to local directory...".format(MPI.COMM_WORLD.rank))
                shutil.copy('languages.shelf', self.path)
            languages = shelve.open(self.path + 'languages.shelf', writeback=True)
        else:
            languages = {}
        
        # Read every BSON object as an iterator to save memory.
        for raw_json in bson.decode_file_iter(bson_file):
            if not self.is_latin(raw_json['body']):
                continue

            preprocessed_json = {}
            repository = str(re.search(r"repos/([^/]+/[^/]+)(/|$)", raw_json['url']).group(1))
            raw_json['language'] = ''
            if repository in languages:
                raw_json['language'] = languages[repository]
            for item in self.keep_fields:
                preprocessed_json[item] = raw_json[item]
           
            json.dump(preprocessed_json, output)
            output.write('\n')

        output.close()
        bson_file.close()
        # Don't move the file for now, since the commit comments only need to 
        # be on the worker nodes if we're running under MPI
        self.cleanup()
    def run(self):
        try:
            self.oplogs['backup'] = Oplog(self.mongodump_oplog['file'], self.do_gzip(), 'a+', self.flush_docs, self.flush_secs)
            self.oplogs['tailed'] = Oplog(self.tailed_oplog['file'], self.do_gzip())
            logging.info("Resolving oplog for %s to max ts: %s" % (self.uri, self.max_end_ts))
            self.state.set('running', True)
            self.state.set('first_ts', self.mongodump_oplog['first_ts'])
            if not self.state.get('first_ts'):
                self.state.set('first_ts', self.tailed_oplog['first_ts'])
            for change in decode_file_iter(self.oplogs['tailed'], CodecOptions(unicode_decode_error_handler="ignore")):
                self.last_ts = change['ts']
                if not self.mongodump_oplog['last_ts'] or self.last_ts > self.mongodump_oplog['last_ts']:
                    if self.last_ts < self.max_end_ts:
                        self.oplogs['backup'].add(change)
                        self.changes += 1
                    elif self.last_ts > self.max_end_ts:
                        break

            self.state.set('count', self.mongodump_oplog['count'] + self.changes)
            self.state.set('last_ts', self.last_ts)
            self.state.set('running', False)
            self.exit_code = 0
        except Exception, e:
            raise Error("Resolving of oplogs failed! Error: %s" % e)
def parse_train_example():

    # train_example_file = open(DATA_DIR + 'train_example.bson', 'rb')
    # data = bson.loads(train_example_file.read())
    #
    train_example = bson.decode_file_iter(open(DATA_DIR + 'train_example.bson', 'rb'))
    # data = bson.decode_document(open(DATA_DIR + 'train_example.bson', 'rb'))

    data = []
    for key, value in enumerate(train_example):
        product_id = value['_id']
        category_id = value['category_id']  # This won't be in Test data
        # prod_to_category[product_id] = category_id
        pics = []
        for e, pic in enumerate(value['imgs']):
            picture = imread(io.BytesIO(pic['picture']))
            # do something with the picture, etc
            # plt.imshow(picture)
            # plt.title(category_id)
            # plt.show()
            pics.append(picture)
        data.append((product_id, category_id, pics))

    return data
Example #43
#!/usr/bin/python3
# coding: utf-8
import bson  # comes bundled with `pip install pymongo`; do not `pip install bson` directly (that installs a third-party package missing many of these functions)
##################################################################
## Write a BSON file
post1 = {"author": "Mike", "text": "Another post!", "tags": ["bulk", "insert"], "date": 14}
post2 = {"author": "Jenny", "text": "Another post!", "tags": ["bulk", "insert"], "date": 14}
f = open('tmp.bson', 'wb')
f.write(bson.BSON.encode(post1))
f.write(bson.BSON.encode(post2))
f.close()
##################################################################
## Read the BSON file directly with Python; BSON is the binary form of JSON
items = list(bson.decode_file_iter(open('./tmp.bson', 'rb'))); print(len(items))  # 2
item = items[0]; print(type(items), type(item))  # <class 'list'> <class 'dict'>
print(item.keys())  # dict_keys(['author', 'text', 'tags', 'date'])
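
For large dumps, materializing everything with list() defeats the point of the iterator; a streaming variant of the same read, assuming the tmp.bson written above:

with open('tmp.bson', 'rb') as f:
    for doc in bson.decode_file_iter(f):  # yields one dict per document
        print(doc['author'], doc['date'])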
Example #44
                    validation_data = val_gen,
                    validation_steps = num_val_images // batch_size + 1,
                    callbacks=[early_stopping, model_checkpoint])

print(history.history.keys())

# logging
logging.info('N. epochs == '+str(len(history.history['val_acc'])))
logging.info('Val accuracy == '+str(max(history.history['val_acc'])))

## Predict on Test-set
print(">>> Predicting on test-set ... ")
submission_df = pd.read_csv("data/sample_submission.csv")
print(submission_df.head())
test_datagen = ImageDataGenerator()
data = bson.decode_file_iter(open(test_bson_path, "rb"))
with tqdm(total=num_test_products) as pbar:
    for c, d in enumerate(data):
        product_id = d["_id"]
        num_imgs = len(d["imgs"])
        batch_x = np.zeros((num_imgs, 180, 180, 3), dtype=K.floatx())
        for i in range(num_imgs):
            bson_img = d["imgs"][i]["picture"]
            # Load and preprocess the image.
            img = load_img(io.BytesIO(bson_img), target_size=(180, 180))
            x = img_to_array(img)
            x = preprocess_image(x)
            # = test_datagen.random_transform(x)
            # = test_datagen.standardize(x)
            # Add the image to the batch.
            batch_x[i] = x
Example #45
#!/usr/bin/env python

import bson
import bson.json_util
from urllib import urlopen

BUS='http://localhost:8000/test'

print "Connecting to %s and receiving objects..." % BUS

param = {
    'heartbeat': 10,
    'queue': {
        'SYSTEM_ALERT': {
            'seq': -1
        }
    }
}

ack = bson.BSON(urlopen(BUS + '/open', bson.BSON.encode(param)).read()).decode()
print bson.json_util.dumps(ack, indent=2)

for msg in bson.decode_file_iter(urlopen(BUS + '/stream/' + str(ack['sid']))):
    print bson.json_util.dumps(msg, indent=2)

Example #46
    def build_load_files(self):
        import bson

        def yn(s):
            """
            Return 'Y' if int(s) is True, or 'N' otherwise. Return an empty 
            string if s is None.
            """
            if s:
                if int(s):
                    return "Y"
                else:
                    return "N"
            return ""
        
        files = [x for x in sorted(self.get_file_list(self.arguments.path, self.file_filter)) if os.path.basename(x).lower() in BuilderClass.expected_files]
        if self._widget:
            self._widget.progressSet.emit(len(BuilderClass.expected_files), "")
            self._widget.progressUpdate.emit(0)
    
        self._corpus_id = 0

        for i, filepath in enumerate(files):
            filename = os.path.basename(filepath)
            
            if filename == "wordforms.bson":
                max_cache = 20000
                self.table(self.corpus_table)._max_cache = max_cache
                self._widget.progressSet.emit(4520596 // max_cache, "Loading {}".format(filename))
                self._widget.progressUpdate.emit(0)
            else:
                self._widget.labelSet.emit("Loading {}".format(filename))
            
            with open(filepath, "rb") as input_file:
                for entry in bson.decode_file_iter(input_file):
                    self._entry = entry
                    if filename == "sources.bson":
                        self._source_id = len(self._source_dict) + 1
                        self._source_dict[str(entry["key"])] = self._source_id
                        d = {
                            self.source_id: self._source_id,
                            self.source_label: entry.get("title", ""),
                            self.source_year: entry.get("year", ""),
                            self.source_author: entry.get("author", ""),
                            self.source_key: entry.get("key", ""),
                            self.source_note: entry.get("note", "")}
                        self.table(self.source_table).add(d)
                    
                    elif filename == "roots.bson":
                        self._root_id = len(self._root_dict) + 1
                        self._root_dict[str(entry["_id"])] = self._root_id
                        d = {self.root_id: self._root_id,
                             self.root_radicals: entry.get("radicals", ""),
                             self.root_type: entry.get("type", ""),
                             self.root_variant: entry.get("variant", 0),
                             self.root_alternatives: entry.get("alternatives", "")}
                        self.table(self.root_table).add(d)
                    
                    elif filename == "lexemes.bson":
                        # Fix some spelling mistakes in the key names:
                        for x, correct in [("achaic", "archaic"), 
                                           ("archaic ", "archaic"),
                                           ("adverbial ", "adverbial"),
                                           ("instransitive", "intransitive")]:
                            if x in entry.keys():
                                entry[correct] = entry[x]
                        self._lemma_id = len(self._lemma_dict) + 1
                        self._lemma_dict[str(entry["_id"])] = self._lemma_id

                        # get root id if possible, and also root radicals:
                        root_id = None
                        root_radicals = ""
                        root = entry.get("root", "")
                        if root:
                            root_id = str(root.get("_id"))
                            root_radicals = root.get("radicals", "")
                        root_link = self._root_dict.get(root_id, 0)

                        # look up headword:
                        headword = None
                        headword_dict = entry.get("headword", "")
                        if headword_dict:
                            headword = headword_dict.get("lemma")

                        # fix 'verbalnoun':
                        verbal_noun = entry.get("verbalnoun", "")
                        if verbal_noun == "verbalnoun" or verbal_noun == "1":
                            verbal_noun = "N"

                        d = {
                            self.lemma_id: self._lemma_id,
                            self.lemma_label: entry.get("lemma", ""),
                            self.lemma_adjectival: yn(entry.get("adjectival")),
                            self.lemma_adverbial: yn(entry.get("adverbial")),
                            self.lemma_alternatives: ";".join(entry.get("alternatives", [])),
                            self.lemma_apertiumparadigm: entry.get("apertium_paradigm", ""),
                            self.lemma_archaic: yn(entry.get("archaic")),
                            self.lemma_created: entry.get("created", ""),
                            self.lemma_derived_form: entry.get("derived_form", 0),
                            self.lemma_ditransitive: yn(entry.get("ditransitive")),
                            self.lemma_features: entry.get("features"),
                            self.lemma_feedback: entry.get("feedback", ''),
                            self.lemma_form: entry.get("form", ''),
                            self.lemma_frequency: entry.get("frequency", 0),
                            self.lemma_gender: entry.get("gender", ""),
                            self.lemma_gloss: entry.get("gloss", ""),
                            self.lemma_headword: headword,
                            self.lemma_hypothetical: yn(entry.get("hypothetical")),
                            self.lemma_intransitive: yn(entry.get("intransitive")),
                            self.lemma_modified: entry.get("modified", ""),
                            self.lemma_notduplicate: yn(entry.get("not_duplicate")),
                            self.lemma_number: entry.get("number", ""),
                            self.lemma_onomastictype: entry.get("onomastic_type", ""),
                            self.lemma_participle: yn(entry.get("participle")),
                            self.lemma_pending: yn(entry.get("pending")),
                            self.lemma_pos: entry.get("pos",''),
                            self.lemma_radicals: root_radicals,
                            self.lemma_root_id: root_link,
                            self.lemma_transcript: entry.get("phonetic", ""),
                            self.lemma_verbalnoun: entry.get("verbalnoun")}
                        self.table(self.lemma_table).add(d)
                    
                    elif filename == "wordforms.bson":
                        self._corpus_id += 1

                        # try to get source id at all costs:
                        source_id = None
                        source_list = entry.get("sources")
                        if source_list:
                            try:
                                source_id = self._source_dict[source_list[0]]
                            except KeyError:
                                for x in self._source_dict:
                                    if self._source_dict[x] == source_list[0]:
                                        source_id = self._source_dict[x]
                                        break
                                else:
                                    source_id = 0
                        
                        # collapse the dictionaries behind subject, 
                        # ind_obj, and dir_obj:
                        subj_dict = entry.get("subject")
                        l = []
                        if subj_dict:
                            l = [subj_dict["person"], subj_dict["number"]]
                            if "gender" in subj_dict:
                                l.append(subj_dict["gender"])
                        subj = "_".join(l)

                        ind_obj_dict = entry.get("ind_obj")
                        l = []
                        if ind_obj_dict:
                            l = [ind_obj_dict["person"], ind_obj_dict["number"]]
                            if "gender" in ind_obj_dict:
                                l.append(ind_obj_dict["gender"])
                        ind_obj = "_".join(l)

                        dir_obj_dict = entry.get("dir_obj")
                        l = []
                        if dir_obj_dict:
                            l = [dir_obj_dict["person"], dir_obj_dict["number"]]
                            if "gender" in dir_obj_dict:
                                l.append(dir_obj_dict["gender"])
                        dir_obj = "_".join(l)

                        d = {self.corpus_id: self._corpus_id, 
                            self.corpus_adverbial: yn(entry.get("adverbial")),
                            self.corpus_alternatives: ";".join(entry.get("alternatives", [])),
                            self.corpus_archaic: yn(entry.get("archaic")),
                            self.corpus_aspect: entry.get("aspect", ""),
                            self.corpus_created: entry.get("created", ""),
                            self.corpus_dir_obj: dir_obj,
                            self.corpus_form: entry.get("form", ""),
                            self.corpus_full: entry.get("full", ""),
                            self.corpus_gender: entry.get("gender", ""),
                            self.corpus_generated: yn(entry.get("generated")),
                            self.corpus_gloss: entry.get("gloss", ""),
                            self.corpus_hypothetical: yn(entry.get("hypothetical")),
                            self.corpus_ind_obj: ind_obj,
                            self.corpus_lemma_id: self._lemma_dict.get(str(entry.get("lexeme_id"))),
                            self.corpus_modified: entry.get("modified", ""),
                            self.corpus_number: entry.get("number", ""),
                            self.corpus_pattern: entry.get("pattern", ""),
                            self.corpus_transcript: entry.get("phonetic", ""),
                            self.corpus_plural_form: entry.get("plural_form", ""),
                            self.corpus_polarity: entry.get("polarity", ""),
                            self.corpus_possessor: entry.get("possessor", ""),
                            self.corpus_source_id: source_id,
                            self.corpus_subject: subj,
                            self.corpus_word: entry.get("surface_form", "")}
                        self.table(self.corpus_table).add(d)
                        phon = entry.get("phonetic")

                        if self._widget and not self._corpus_id % max_cache:
                            self._widget.progressUpdate.emit(self._corpus_id // max_cache)
                self.commit_data()    
    def test_backport_codec_options_uuid(self):
        if not should_test_uuid:
            raise SkipTest("No uuid module")

        # Generated by the Java driver
        from_java = b('bAAAAAdfaWQAUCBQxkVm+XdxJ9tOBW5ld2d1aWQAEAAAAAMIQkfACFu'
                      'Z/0RustLOU/G6Am5ld2d1aWRzdHJpbmcAJQAAAGZmOTk1YjA4LWMwND'
                      'ctNDIwOC1iYWYxLTUzY2VkMmIyNmU0NAAAbAAAAAdfaWQAUCBQxkVm+'
                      'XdxJ9tPBW5ld2d1aWQAEAAAAANgS/xhRXXv8kfIec+dYdyCAm5ld2d1'
                      'aWRzdHJpbmcAJQAAAGYyZWY3NTQ1LTYxZmMtNGI2MC04MmRjLTYxOWR'
                      'jZjc5Yzg0NwAAbAAAAAdfaWQAUCBQxkVm+XdxJ9tQBW5ld2d1aWQAEA'
                      'AAAAPqREIbhZPUJOSdHCJIgaqNAm5ld2d1aWRzdHJpbmcAJQAAADI0Z'
                      'DQ5Mzg1LTFiNDItNDRlYS04ZGFhLTgxNDgyMjFjOWRlNAAAbAAAAAdf'
                      'aWQAUCBQxkVm+XdxJ9tRBW5ld2d1aWQAEAAAAANjQBn/aQuNfRyfNyx'
                      '29COkAm5ld2d1aWRzdHJpbmcAJQAAADdkOGQwYjY5LWZmMTktNDA2My'
                      '1hNDIzLWY0NzYyYzM3OWYxYwAAbAAAAAdfaWQAUCBQxkVm+XdxJ9tSB'
                      'W5ld2d1aWQAEAAAAAMtSv/Et1cAQUFHUYevqxaLAm5ld2d1aWRzdHJp'
                      'bmcAJQAAADQxMDA1N2I3LWM0ZmYtNGEyZC04YjE2LWFiYWY4NzUxNDc'
                      '0MQAA')

        data = base64.b64decode(from_java)

        # Test decode_all.
        docs = bson.decode_all(
            data, dict, True, JAVA_LEGACY, True,
            CodecOptions(SON, False, STANDARD))
        for d in docs:
            self.assertNotEqual(d['newguid'], uuid.UUID(d['newguidstring']))

        encoded = [bson.BSON.encode(
            doc, uuid_subtype=JAVA_LEGACY,
            codec_options=CodecOptions(uuid_representation=STANDARD))
                   for doc in docs]

        # Test decode.
        docs2 = [e.decode(
            uuid_subtype=JAVA_LEGACY, as_class=dict, tz_aware=True,
            codec_options=CodecOptions(SON, False, STANDARD))
                 for e in encoded]
        for d in docs2:
            self.assertNotEqual(d['newguid'], uuid.UUID(d['newguidstring']))

        # Test encode.
        for i in range(len(docs)):
            self.assertEqual(docs2[i]['newguid'], docs[i]['newguid'])
            self.assertEqual(uuid.UUID(docs2[i]['newguidstring']),
                             uuid.UUID(docs[i]['newguidstring']))

        # Test decode_iter
        docs = bson.decode_iter(
            data, dict, True, JAVA_LEGACY,
            True, CodecOptions(SON, False, STANDARD))
        for d in docs:
            self.assertNotEqual(d['newguid'], uuid.UUID(d['newguidstring']))

        # Test decode_file_iter
        docs = bson.decode_file_iter(
            StringIO(data), dict, True, JAVA_LEGACY,
            True, CodecOptions(SON, False, STANDARD))
        for d in docs:
            self.assertNotEqual(d['newguid'], uuid.UUID(d['newguidstring']))
## Read the database dump
import bson
import sys
from whoosh.index import create_in, open_dir
from whoosh.fields import TEXT, ID, STORED, KEYWORD, NUMERIC, Schema
from whoosh.qparser import QueryParser
from jieba.analyse import ChineseAnalyzer
import jieba
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
analyzer = ChineseAnalyzer()
##################################################################
## Analyze the BSON data
# items = list(bson.decode_file_iter(open('./tmp.bson', 'rb'))); print(len(items))  # 14128; read the BSON file
items = list(bson.decode_file_iter(open('./tmp_news/sina.bson', 'rb'))); print(len(items))  # 140639; read the BSON file
print(items[0].keys())  # dict_keys(['_id', 'news_id', 'news_url', 'news_from', 'news_time2', 'news_channel', 'news_title', 'news_source', 'news_time', 'news_body', 'news_keywords', 'news_show', 'news_total'])
print(items[0]['news_id'])  # fxzczfc6652525
print(items[0]['news_keywords'])  # ['陕西', '公厕爆炸']
print(items[0]['news_show'])  # 0
print(items[0]['news_time'])  # 2017年05月17日23:48
print(items[0]['news_time2'])  # 1495036111; Unix timestamp
print(set([len(item['news_title']) for item in items if item.get('news_title', 0) != 0]))
# {6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 36, 37}; these are all str lengths
print(set([len(item['news_body']) for item in items])) # {0, 1, 2, 14, 15, 16, ..., 353, 982, 478, 1004}; some bodies are split into as many as 1004 segments...
print(set([len([''.join(item['news_body'])]) for item in items])) # {1}; much cleaner after joining
print(set([str(item.keys()) for item in items]))  # lots of missing values
# {"dict_keys(['_id', 'news_id', 'news_url', 'news_from', 'news_channel', 'news_title', 'news_source', 'news_time', 'news_body', 'news_keywords', 'flag'])",
#  "dict_keys(['_id', 'news_id', 'news_url', 'news_from', 'news_channel', 'news_title', 'news_source', 'news_time', 'news_body', 'news_keywords', 'flag', 'news_show', 'news_total'])",
#  "dict_keys(['_id', 'news_id', 'news_url', 'news_from', 'news_channel', 'news_title', 'news_source', 'news_time', 'news_body', 'flag', 'news_show', 'news_total'])",
#  "dict_keys(['_id', 'news_id', 'news_url', 'news_from', 'news_channel', 'news_title', 'news_time', 'news_body', 'news_keywords', 'flag', 'news_show', 'news_total'])",
            while True:
                if index >= len(success_bucket_floors):
                    break
                
                if failure_rate < success_bucket_floors[index]:
                    break

                index += 1

            domain_map[fields[0]] = success_bucket_floors[index - 1]
            #print "adding", fields[0], " ", success_bucket_floors[index-1]

    #read measurements
    error_map = {}
    with open(sys.argv[1]) as measurement_file:
        iter = bson.decode_file_iter(measurement_file)
        for document in iter:
            #get bucket map using error code
            error_message = document["measurement_error_message"]
            error_fields = error_message.split()
            error_code = int(error_fields[0].replace("[","").replace("]",""))

            if error_code not in error_map:
                bucket_map = {}
                for bucket_floor in success_bucket_floors:
                    bucket_map[bucket_floor] = 0

                error_map[error_code] = bucket_map 

            bucket_map = error_map[error_code]
Example #50
    parser.add_argument("-n", "--num", dest="num", type=int, default=5, help="number of bson items")
    parser.add_argument("--pprint", dest="pprint", action="store_true", default=False, help="prettified print")
    parser.add_argument("-o", "--output", dest="output_file", type=str, help="output bson file")
    parser.add_argument("input_file", type=str, help="input bson file")

    args = parser.parse_args()

    assert args.num > 0
    assert args.input_file

    if args.output_file:
        out_file = open(args.output_file, "wb")
    else:
        out_file = None

    with open(args.input_file, "rb") as in_file:
        for i, entry in enumerate(bson.decode_file_iter(in_file)):
            if i >= args.num:
                break

            if out_file is None:
                if args.pprint:
                    pprint.pprint(entry)
                else:
                    print entry
            else:
                out_file.write(bson.BSON.encode(entry))

    if out_file:
        out_file.close()
data_path = Path('data')
base_path = data_path / 'test'
base_path.mkdir(exist_ok=True)


n_cores = 12
prods = mp.Manager().dict()

q = mp.Queue(maxsize=n_cores)
iolock = mp.Lock()
pool = mp.Pool(n_cores, initializer=process, initargs=(q, iolock))


# process the file

data = bson.decode_file_iter(open(str(data_path / 'test.bson'), 'rb'))
for c, d in enumerate(data):
    q.put(d)

# tell workers we're done

for _ in range(n_cores):
    q.put(None)

pool.close()
pool.join()

# convert back to normal dictionary
prod_to_category = dict(prods)

prod_to_category = pd.DataFrame.from_dict(prod_to_category, orient='index')