Example 1
def _from_carray(path, format_categories=None, format_codes=None, format_values=None):
    meta = json.load(open(os.path.join(path, 'meta'), 'r'))

    if meta['type'] == 'category':
        if format_categories in ['npz', 'npy']:
            filename = os.path.join(path, 'categories.%s' % format_categories)
            with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)):
                categories_values = numpy.load(filename, mmap_mode='r+')  # TODO npz not memmap?
                if format_categories == 'npz':
                    categories_values = categories_values['arr_0']
        elif format_categories == 'pickle':
            filename = os.path.join(path, 'categories.pickle')
            with log.timedlogger("reading [%s] %s" % (meta['name'], filename)):
                categories_values = pickle.load(open(filename, 'rb'))
        elif format_categories == 'bcolz':
            rootdir = os.path.join(path, 'categories.bcolz')
            with log.timedlogger("reading [%s] %s" % (meta['name'], rootdir)):
                categories_values = FakeCarrayAsNumpyArray(rootdir=rootdir, mode='r')
                # categories_values = bcolz.carray(rootdir=rootdir, mode='r')[:]
        else:
            raise NotImplementedError("uh oh %s" % (meta['type'],))

        if format_codes == 'bcolz':
            rootdir = os.path.join(path, 'codes.bcolz')
            with log.timedlogger("reading [%s] %s" % (meta['name'], rootdir)):
                codes_values = bcolz.open(rootdir=rootdir, mode='r')[:]  # , categories=categories_values)
                # codes_values = FakeCarrayAsNumpyArray(rootdir=rootdir, mode='r') # , categories=categories_values)
        elif format_codes == 'npy':
            filename = os.path.join(path, 'codes.npy')
            with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)):
                codes_values = numpy.load(filename, mmap_mode='r+')
        else:
            raise Exception("unknown format_codes type %s" % (format_codes,))

        with log.timedlogger("FastCat construction"):
            s = FastCat(codes_values, categories_values)
    else:
        if format_values == 'bcolz':
            rootdir = os.path.join(path, 'values.bcolz')
            with log.timedlogger("reading [%s] %s" % (meta['name'], rootdir)):
                # values = FakeCarrayAsNumpyArray(rootdir=rootdir, mode='r')
                s = bcolz.open(rootdir=rootdir, mode='r')[:]
        elif format_values == 'npy':
            filename = os.path.join(path, 'values.npy')
            with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)):
                s = numpy.load(filename, mmap_mode='r+')
        elif format_values == 'pickle':
            filename = os.path.join(path, 'values.pickle')
            with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)):
                s = pickle.load(open(filename, 'rb'))
        # with log.timedlogger("FastSeries construction"):
        #     index = pandas.Index(numpy.arange(len(values)), copy=False)
        #     values = SingleBlockManager(values, index, fastpath=True)
        #     s = pandas.Series(data=values, fastpath=True, copy=False, dtype=meta['type'])
        # s = values # [:]
    # logging.warning('Constructing categorical for %s' % meta['name'])
    # s = pandas.Categorical.from_codes(codes_values, categories_values, name=meta['name'])
    if isinstance(meta['name'], list):
        meta['name'] = tuple(meta['name'])
    return meta, s  # codes_values, categories_values
Example 2
 def __init__(self, root_dir):
     self._root_dir = root_dir
     import bcolz
     import os
     import pickle
     self._daily_table = bcolz.open(os.path.join(root_dir, LocalDataSource.DAILY))
     self._instruments = {d['order_book_id']: Instrument(d)
                          for d in pickle.load(open(os.path.join(root_dir, LocalDataSource.INSTRUMENTS), 'rb'))}
     self._dividend = bcolz.open(os.path.join(root_dir, LocalDataSource.DIVIDEND))
     self._yield_curve = bcolz.open(os.path.join(root_dir, LocalDataSource.YIELD_CURVE))
     self._trading_dates = pd.Index(pd.Timestamp(str(d)) for d in
                                    bcolz.open(os.path.join(root_dir, LocalDataSource.TRADING_DATES)))
Example 3
    def _get_ctable(self, asset):
        sid = int(asset)

        if isinstance(asset, Future):
            if self._future_minute_reader.sid_path_func is not None:
                path = self._future_minute_reader.sid_path_func(
                    self._future_minute_reader.rootdir, sid
                )
            else:
                path = "{0}/{1}.bcolz".format(
                    self._future_minute_reader.rootdir, sid)
        elif isinstance(asset, Equity):
            if self._equity_minute_reader.sid_path_func is not None:
                path = self._equity_minute_reader.sid_path_func(
                    self._equity_minute_reader.rootdir, sid
                )
            else:
                path = "{0}/{1}.bcolz".format(
                    self._equity_minute_reader.rootdir, sid)

        else:
            # TODO: Figure out if assets should be allowed if neither, and
            # why this code path is being hit.
            if self._equity_minute_reader.sid_path_func is not None:
                path = self._equity_minute_reader.sid_path_func(
                    self._equity_minute_reader.rootdir, sid
                )
            else:
                path = "{0}/{1}.bcolz".format(
                    self._equity_minute_reader.rootdir, sid)

        return bcolz.open(path, mode='r')
Example 4
def load(db, query=None):

    t0 = time.time()
    conn = sqlite3.connect(db)
    cur = conn.cursor()

    gt_cols = get_gt_cols(cur)
    samples = get_samples(cur)
    bcpath = get_bcolz_dir(db)

    carrays = {}
    n = 0
    for gtc in gt_cols:
        if not gtc in query: continue
        carrays[gtc] = []
        for s in samples:
            if not s in query and not fix_sample_name(s) in query:
                # need to add anyway as place-holder
                carrays[gtc].append(None)
                continue
            path = "%s/%s/%s" % (bcpath, s, gtc)
            if os.path.exists(path):
                carrays[gtc].append(bcolz.open(path, mode="r"))
                n += 1
    if os.environ.get("GEMINI_DEBUG") == "TRUE":
        sys.stderr.write("it took %.2f seconds to load %d arrays\n"
                         % (time.time() - t0, n))
    return carrays
Example 5
def load_array(fname, opt_fallback=None):
  if not os.path.isdir(fname):
    arr = opt_fallback()
    if hasattr(arr, 'values'): arr = arr.values
    save_array(fname, arr)
    return arr
  return bcolz.open(fname)[:]
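Example 5 falls back to a save_array helper that is not shown in the snippet. A minimal sketch of such a helper, assuming plain bcolz with the target directory used as the carray rootdir (the function name and layout are assumptions, not taken from the original project):

import bcolz

def save_array(fname, arr):
    # Hypothetical counterpart to Example 5's load_array: persist a
    # numpy array (or array-like) as an on-disk bcolz carray at fname.
    c = bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()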
Example 6
def load(db, query=None):

    t0 = time.time()
    conn, metadata = database.get_session_metadata(db)

    gt_cols = get_gt_cols(metadata)
    samples = get_samples(metadata)
    bcpath = get_bcolz_dir(db)

    carrays = {}
    n = 0
    for gtc in gt_cols:
        if not gtc in query: continue
        carrays[gtc] = []
        for s in samples:
            if not s in query and not fix_sample_name(s) in query:
                # need to add anyway as place-holder
                carrays[gtc].append(None)
                continue
            path = "%s/%s/%s" % (bcpath, s, gtc)
            if os.path.exists(path):
                carrays[gtc].append(bcolz.open(path, mode="r"))
                n += 1
    if os.environ.get("GEMINI_DEBUG") == "TRUE":
        sys.stderr.write("it took %.2f seconds to load %d arrays\n" \
            % (time.time() - t0, n))
    return carrays
Example 7
def animate(i):
    b = bcolz.open('db')
    yar = [j[0] for j in b[-10:]]
    xar = np.arange(len(yar))
    ax1.clear()
    ax1.plot(xar, yar)
    print(yar)
Example 8
    def __iter__(self):

        # obtain ctable
        if isinstance(self.source, string_types):
            import bcolz

            ctbl = bcolz.open(self.source, mode="r")
        else:
            # assume bcolz ctable
            ctbl = self.source

        # obtain header
        if self.outcols is None:
            header = tuple(ctbl.names)
        else:
            header = tuple(self.outcols)
            assert all(h in ctbl.names for h in header), "invalid outcols"
        yield header

        # obtain iterator
        if self.expression is None:
            it = ctbl.iter(outcols=self.outcols, skip=self.skip, limit=self.limit)
        else:
            it = ctbl.where(self.expression, outcols=self.outcols, skip=self.skip, limit=self.limit)

        for row in it:
            yield row
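Example 8 switches between plain and filtered iteration over a ctable. A small standalone sketch of those same two calls on a throwaway in-memory table (the column names and data below are made up for illustration):

import bcolz

ctbl = bcolz.ctable([[1, 2, 3], [10.0, 20.0, 30.0]], names=['a', 'b'])
for row in ctbl.iter(outcols=['a', 'b'], skip=0, limit=2):
    print(row)   # plain iteration over selected columns
for row in ctbl.where('a > 1', outcols=['a']):
    print(row)   # filtered iteration via a numexpr expression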
Example 9
    def truncate(self, date):
        """Truncate data beyond this date in all ctables."""
        truncate_slice_end = self.data_len_for_day(date)

        glob_path = os.path.join(self._rootdir, "*", "*", "*.bcolz")
        sid_paths = sorted(glob(glob_path))

        for sid_path in sid_paths:
            file_name = os.path.basename(sid_path)

            try:
                table = bcolz.open(rootdir=sid_path)
            except IOError:
                continue
            if table.len <= truncate_slice_end:
                logger.info("{0} not past truncate date={1}.", file_name, date)
                continue

            logger.info(
                "Truncating {0} at end_date={1}", file_name, date.date()
            )

            table.resize(truncate_slice_end)

        # Update end session in metadata.
        metadata = BcolzMinuteBarMetadata.read(self._rootdir)
        metadata.end_session = date
        metadata.write(self._rootdir)
Example 10
 def test00b(self):
     """Testing `carray` reshape (large shape)"""
     a = np.arange(16000).reshape((20, 20, 40))
     b = bcolz.arange(16000, rootdir=self.rootdir).reshape((20, 20, 40))
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     # print "b->", `b`
     assert_array_equal(a, b, "Arrays are not equal")
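Example 10 and the similar tests that follow all share one pattern: build a disk-backed container with a rootdir, then reopen it with bcolz.open when self.open is set. A standalone sketch of that round trip (the /tmp path is just a placeholder):

import numpy as np
import bcolz

rootdir = '/tmp/demo.bcolz'                      # placeholder location
a = np.arange(100).reshape((10, 10))
c = bcolz.carray(a, rootdir=rootdir, mode='w')   # persist to disk
c.flush()
b = bcolz.open(rootdir=rootdir, mode='r')        # reopen read-only
assert np.array_equal(a, b[:])                   # [:] materializes a numpy array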
Example 11
 def test00a(self):
     """Testing `carray` reshape"""
     a = np.arange(16).reshape((2, 2, 4))
     b = bcolz.arange(16, rootdir=self.rootdir).reshape((2, 2, 4))
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     # print "b->", `b`
     assert_array_equal(a, b, "Arrays are not equal")
Example 12
 def test01b(self):
     """Testing `zeros` constructor (II)"""
     a = np.zeros(2, dtype='(2,4)i4')
     b = bcolz.zeros(2, dtype='(2,4)i4', rootdir=self.rootdir)
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     # print "b->", `b`
     assert_array_equal(a, b, "Arrays are not equal")
Example 13
    def _get_ctable(self, asset):
        sid = int(asset)
        if self._sid_path_func is not None:
            path = self._sid_path_func(self.rootdir, sid)
        else:
            path = "{0}/{1}.bcolz".format(self.rootdir, sid)

        return bcolz.open(path, mode='r')
Example 14
 def test02(self):
     """Testing `ones` constructor"""
     a = np.ones((2, 2), dtype='(4,)i4')
     b = bcolz.ones((2, 2), dtype='(4,)i4', rootdir=self.rootdir)
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     # print "b->", `b`
     assert_array_equal(a, b, "Arrays are not equal")
Example 15
 def test03a(self):
     """Testing `fill` constructor (scalar default)"""
     a = np.ones((2, 200), dtype='(4,)i4') * 3
     b = bcolz.fill((2, 200), 3, dtype='(4,)i4', rootdir=self.rootdir)
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     # print "b->", `b`
     assert_array_equal(a, b, "Arrays are not equal")
Example 16
 def test00b(self):
     """Testing `__getitem()__` method with only a start (slice)"""
     a = np.ones((27, 2700), dtype="i4") * 3
     b = bcolz.fill((27, 2700), 3, dtype="i4", rootdir=self.rootdir)
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     sl = slice(1)
     self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
     assert_array_equal(a[sl], b[sl], "Arrays are not equal")
Example 17
def query_maps(table_name, attr_name, key_to_int=False):
    """查询bcolz表中属性值"""
    rootdir = bcolz_table_path(table_name)
    ct = bcolz.open(rootdir)
    d = ct.attrs[attr_name]
    if key_to_int:
        return {int(k): v for k, v in d.items()}
    else:
        return d
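Example 17 only reads the mapping back from the ctable's attrs. A hedged sketch of how such a mapping might be written in the first place (write_maps and the append-mode choice are assumptions, not part of the original code; bcolz_table_path is the helper used in Example 17):

import bcolz

def write_maps(table_name, attr_name, mapping):
    # Hypothetical counterpart to query_maps: store a JSON-serializable
    # dict on the ctable's attrs so it can be queried later.
    rootdir = bcolz_table_path(table_name)
    ct = bcolz.open(rootdir, mode='a')
    ct.attrs[attr_name] = mapping
    ct.flush()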
Example 18
 def test00a(self):
     """Testing `__getitem()__` method with only a start (scalar)"""
     a = np.ones((2, 3), dtype="i4") * 3
     b = bcolz.fill((2, 3), 3, dtype="i4", rootdir=self.rootdir)
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     sl = 1
     # print "b[sl]->", `b[sl]`
     self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
     assert_array_equal(a[sl], b[sl], "Arrays are not equal")
Example 19
 def test02(self):
     """Testing `__getitem()__` method with a start, stop, step"""
     a = np.ones((10, 2), dtype="i4") * 3
     b = bcolz.fill((10, 2), 3, dtype="i4", rootdir=self.rootdir)
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     sl = slice(1, 9, 2)
     # print "b[sl]->", `b[sl]`
     self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
     assert_array_equal(a[sl], b[sl], "Arrays are not equal")
Example 20
 def test03c(self):
     """Testing `__getitem()__` method with several slices (III)"""
     a = np.arange(120 * 1000).reshape((5 * 1000, 4, 3, 2))
     b = bcolz.carray(a, rootdir=self.rootdir)
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     sl = (slice(None, None, 3), slice(1, 3, 2), slice(1, 4, 2))
     # print "b[sl]->", `b[sl]`
     self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
     assert_array_equal(a[sl], b[sl], "Arrays are not equal")
Example 21
def from_dict_of_blocks(rootdir, mode='r'):
    """ deprecated """
    meta = json.load(open(os.path.join(rootdir, 'meta')))
    d = dict()
    for i, k in enumerate(meta['keys']):
        filename = os.path.join(rootdir, str(i))
        with log.timedlogger('reading {} ({})'.format(filename, k)):
            d[k] = bcolz.open(filename, mode=mode)
            print('... d[{}].shape = {}'.format(k, d[k].shape))
    return d
Example 22
 def test04c(self):
     """Testing `__getitem()__` method with shape reduction (III)"""
     a = np.arange(6000).reshape((50, 40, 3))
     b = bcolz.carray(a, rootdir=self.rootdir)
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     sl = (1, slice(1, 4, 2), 2)
     # print "b[sl]->", `b[sl]`
     self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
     assert_array_equal(a[sl], b[sl], "Arrays are not equal")
Example 23
 def test05c(self):
     """Testing `__getitem()__` method with fancy indexing (III)"""
     a = np.arange(2000).reshape((50, 40))
     b = bcolz.carray(a, rootdir=self.rootdir)
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     sl = (slice(None), [0, 2])
     # print "b[sl]->", `b[sl]`
     self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
     assert_array_equal(a[sl], b[sl], "Arrays are not equal")
Example 24
 def test04(self):
     """Testing `fill` constructor with open and resize (array default)"""
     a = np.ones((3,200), dtype='(4,)i4')*3
     b = bcolz.fill((2,200), [3,3,3,3], dtype='(4,)i4', rootdir=self.rootdir)
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     c = np.ones((1,200), dtype='(4,)i4')*3
     b.append(c)
     #print "b->", `b`, len(b), b[1]
     assert_array_equal(a, b, "Arrays are not equal")
Example 25
 def test00b(self):
     """Testing `__setitem()__` method with only a start (vector)"""
     a = np.ones((200, 300), dtype="i4") * 3
     b = bcolz.fill((200, 300), 3, dtype="i4", rootdir=self.rootdir)
     sl = slice(1)
     a[sl, :] = range(300)
     b[sl] = range(300)
     if self.open:
         b.flush()
         b = bcolz.open(rootdir=self.rootdir)
     # print "b[sl]->", `b[sl]`
     assert_array_equal(a[sl], b[sl], "Arrays are not equal")
Example 26
 def test02b(self):
     """Testing `__setitem()__` method with start,stop,step (scalar)"""
     a = np.ones((10, 2), dtype="i4") * 3
     b = bcolz.fill((10, 2), 3, dtype="i4", rootdir=self.rootdir)
     sl = slice(1, 8, 3)
     a[sl, :] = range(2)
     b[sl] = range(2)
     if self.open:
         b.flush()
         b = bcolz.open(rootdir=self.rootdir)
     # print "b[sl]->", `b[sl]`, `b`
     assert_array_equal(a[sl], b[sl], "Arrays are not equal")
Example 27
 def test05(self):
     """Testing `fill` constructor with open and resize (nchunks>1)"""
     a = np.ones((3,2000), dtype='(4,)i4')*3
     b = bcolz.fill((2,2000), [3,3,3,3], dtype='(4,)i4', rootdir=self.rootdir)
     if self.open:
         b = bcolz.open(rootdir=self.rootdir)
     c = np.ones((1,2000), dtype='(4,)i4')*3
     b.append(c)
     #print "b->", `b`
     # We need to use the b[:] here to overcome a problem with the
     # assert_array_equal() function
     assert_array_equal(a, b[:], "Arrays are not equal")
Example 28
 def test01a(self):
     """Testing `__setitem()__` method with start,stop (scalar)"""
     a = np.ones((500, 200), dtype="i4") * 3
     b = bcolz.fill((500, 200), 3, dtype="i4", rootdir=self.rootdir,
                    cparams=bcolz.cparams())
     sl = slice(100, 400)
     a[sl, :] = 0
     b[sl] = 0
     if self.open:
         b.flush()
         b = bcolz.open(rootdir=self.rootdir)
     # print "b[sl]->", `b[sl]`
     assert_array_equal(a[sl], b[sl], "Arrays are not equal")
Example 29
 def test03d(self):
     """Testing `__setitem()__` method with several slices (IV)"""
     a = np.arange(120).reshape((5, 4, 3, 2))
     b = bcolz.carray(a, rootdir=self.rootdir)
     sl = (slice(1, 3), slice(1, 3, 1), slice(1, None, 2), slice(1))
     # print "before->", `b[sl]`
     a[sl] = 2
     b[sl] = 2
     if self.open:
         b.flush()
         b = bcolz.open(rootdir=self.rootdir)
     # print "after->", `b[sl]`
     assert_array_equal(a[:], b[:], "Arrays are not equal")
Example 30
 def test04c(self):
     """Testing `__setitem()__` method with shape reduction (III)"""
     a = np.arange(24).reshape((4, 3, 2))
     b = bcolz.carray(a, rootdir=self.rootdir)
     sl = (1, 2, slice(None, None, None))
     # print "before->", `b[sl]`
     a[sl] = 2
     b[sl] = 2
     if self.open:
         b.flush()
         b = bcolz.open(rootdir=self.rootdir)
     # print "after->", `b[sl]`
     assert_array_equal(a[sl], b[sl], "Arrays are not equal")
Example 31
def load_array(fname):
    print("Loading image dataset from the location " + str(fname) + ".")
    return bcolz.open(fname)[:]

# In[4]:


a = "THIS is a go(%*#od day!===thanks $%@%*(don't}}} know... ====================================="
b = divide_string(a)
print(len(b))
print(b)


# In[5]:



vectors = bz.open(r"C:\Users\mul02\Desktop\Course\LIGN 167\Final Project\glove\27B.100.dat")[:]
words = pickle.load(open(r"C:\Users\mul02\Desktop\Course\LIGN 167\Final Project\glove\27B.100_words.pkl", 'rb'))
word2idx = pickle.load(open(r"C:\Users\mul02\Desktop\Course\LIGN 167\Final Project\glove\27B.100_idx.pkl", 'rb'))

glove = {w: vectors[word2idx[w]] for w in words}


# In[6]:


# sents is a list of string
def remove_infrequent_words(sents):
    word_counts = {}
    divide_sentence = []
    #divide each sentence first
    for s in sents:
Example 33
def glove(name, address):
    vectors = bcolz.open(address + name +'.300.dat')[:]
    words = pickle.load(open(address + name +'.300_words.pkl', 'rb'))
    word2idx = pickle.load(open(address + name +'.300_idx.pkl', 'rb'))
    glove = {w: vectors[word2idx[w]] for w in words}
    return glove
Example 34
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--name', type=str, help='saved and resumed file name')
    parser.add_argument('--resume', action='store_true', help='resumed flag')
    parser.add_argument('--test',
                        dest='test_only',
                        default=False,
                        action='store_true')
    parser.add_argument('--detector',
                        default='2019-03-16_10:28:52{}.pth',
                        help='the name of the detector')
    parser.add_argument('--gpu', default='3', help='the chosen gpu id')
    args = parser.parse_args()

    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    cudnn.benchmark = True

    ########################################## ARGUMENT SETTING	 ########################################
    if args.test_only:
        args.resume = True
    if args.resume and not args.name:
        raise ValueError('Resuming requires file name!')
    name = args.name if args.name else datetime.now().strftime(
        "%Y-%m-%d_%H:%M:%S")
    if args.resume:
        target_name = name
        logs = torch.load(target_name)
        # hacky way to tell the VQA classes that they should use the vocab without passing more params around
        data.preloaded_vocab = logs['vocab']
    else:
        target_name = os.path.join('logs', '{}'.format(name))
    if not args.test_only:
        print('will save to {}'.format(target_name))

    ######################################### DATASET PREPARATION #######################################
    if config.train_set == 'train':
        train_loader = data.get_loader(train=True)
        val_loader = data.get_loader(val=True)
    elif args.test_only:
        val_loader = data.get_loader(test=True)
    else:
        train_loader = data.get_loader(train=True, val=True)
        val_loader = data.get_loader(test=True)
    ########################################## MODEL PREPARATION ########################################
    embedding = bcolz.open(config.glove_path_filtered)[:]
    net = model.RelAtt(embedding)
    net = nn.DataParallel(net).cuda()

    optimizer = optim.Adam([p for p in net.parameters() if p.requires_grad],
                           lr=config.initial_lr,
                           weight_decay=1e-8)

    # optimizer = optim.RMSprop(
    # [p for p in net.parameters() if p.requires_grad],
    # lr=config.initial_lr,
    # momentum=0.20,
    # weight_decay=1e-8
    # )
    scheduler = lr_scheduler.ExponentialLR(optimizer, 0.5**(1 / 50000))
    #########################################
    #######################################
    acc_val_best = 0.0
    start_epoch = 0
    if args.resume:
        net.load_state_dict(logs['model_state'])
        optimizer.load_state_dict(logs['optim_state'])
        scheduler.load_state_dict(logs['scheduler_state'])
        start_epoch = logs['epoch']
        acc_val_best = logs['acc_val_best']

    tracker = utils.Tracker()
    r = np.zeros(3)
    for i in range(start_epoch, config.epochs):
        if not args.test_only:
            run(net,
                train_loader,
                optimizer,
                scheduler,
                tracker,
                train=True,
                prefix='train',
                epoch=i)
        if not (config.train_set == 'train+val'
                and i in range(config.epochs - 5)):
            r = run(net,
                    val_loader,
                    optimizer,
                    scheduler,
                    tracker,
                    train=False,
                    prefix='val',
                    epoch=i,
                    has_answers=(config.train_set == 'train'))

        if not args.test_only:
            results = {
                'epoch': i,
                'acc_val_best': acc_val_best,
                'name': name,
                'model_state': net.state_dict(),
                'optim_state': optimizer.state_dict(),
                'scheduler_state': scheduler.state_dict(),
                'eval': {
                    'answers': r[0],
                    'accuracies': r[1],
                    'idx': r[2]
                },
                'vocab': val_loader.dataset.vocab,
            }
            if config.train_set == 'train' and r[1].mean() > acc_val_best:
                acc_val_best = r[1].mean()
                torch.save(results, target_name + '.pth')
            if config.train_set == 'train+val':
                torch.save(results, target_name + '{}.pth')
                if i in range(config.epochs - 5, config.epochs):
                    saved_for_test(val_loader, r, i)

        else:
            saved_for_test(val_loader, r)
            break
Example 35
 def __init__(self, f):
     self._dates = bcolz.open(f, 'r')
     self._index = self._dates.attrs['line_map']
Example 36
def get_data(table):
    try:
        with bcolz.open(os.path.join(BUNDLE, table), 'r') as ctable:
            return ctable.todataframe()
    except FileNotFoundError:
        pass
Example 37
 def __init__(self, f):
     self._table = bcolz.open(f, 'r')
     self._index = self._table.attrs['line_map']
Example 38
 def _load_bcolz_data(self):
     bc = bcolz.open(rootdir=self.bcolz_fname, mode='r')
     self.df = bc.todataframe()
with open(f'{glove_path}/glove.6B.50d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(float)
        vectors.append(vect)
    
vectors = bcolz.carray(vectors[1:].reshape((400001, 50)), rootdir=f'{glove_path}/6B.50.dat', mode='w')
vectors.flush()
pickle.dump(words, open(f'{glove_path}/6B.50_words.pkl', 'wb'))
pickle.dump(word2idx, open(f'{glove_path}/6B.50_idx.pkl', 'wb'))

vectors = bcolz.open(f'{glove_path}/6B.50.dat')[:]
words = pickle.load(open(f'{glove_path}/6B.50_words.pkl', 'rb'))
word2idx = pickle.load(open(f'{glove_path}/6B.50_idx.pkl', 'rb'))

glove = {w: vectors[word2idx[w]] for w in words}
print(glove['the'])

df = pd.read_csv("DA_labeled_belc_2019.csv")
t = [str(s).split() for s in df['text'].values.tolist()]

words = {}
for sentence in t:
    for word in sentence:
        words[word] = word

vocabulary = []
Example 40
import matplotlib.ticker as ticker
from utils.time_utils import timeSince
import pickle
import bcolz
import numpy as np
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from preprocess import clean_text
plt.switch_backend('agg')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# =============================================================================
# EMBEDDING
# =============================================================================

vectors = bcolz.open(f'Embedding/6B.300d.dat')[:]
words = pickle.load(open(f'Embedding/6B.300_words.pkl', 'rb'))
word2idx = pickle.load(open(f'Embedding/6B.300_idx.pkl', 'rb'))

glove = {w: vectors[word2idx[w]] for w in words}


def get_weights_matrix(target_vocab):
    matrix_len = len(target_vocab)
    weights_matrix = np.zeros((matrix_len, 300))
    words_found = 0

    for i, word in enumerate(target_vocab):
        try: 
            weights_matrix[i] = glove[word]
            words_found += 1
Example 41
def load_glove(glove_path):
    vectors = bcolz.open(f'{glove_path}/42B.300.dat')[:]
    words = pickle.load(open(f'{glove_path}/42B.300_words.pkl', 'rb'))
    word2idx = pickle.load(open(f'{glove_path}/42B.300_idx.pkl', 'rb'))
    glove = {w: vectors[word2idx[w]] for w in words}
    return glove, word2idx, vectors
Example 42
def load_array(fname): return bcolz.open(fname)[:]

def get_classes(path):
Example 43
 def __init__(self, f):
     dates = bcolz.open(f, 'r')
     self._index = dates.attrs['line_map']
     self._dates = [int(d) for d in dates]
def main():
    args, log = setup()

    if not args.data_augment:
        train = flatten_xml(args.train_folder, 'train')
        valid = flatten_xml(args.valid_folder, 'valid')
        log.info('xml data flattened.')
    else:
        train, valid = load_augmented_data()
        log.info('xml augmented data loaded.')

    ## tokenize & annotate
    with Pool(args.threads, initializer=init) as p:
        annotate_ = partial(annotate)
        train = list(p.map(annotate_, train, chunksize=args.batch_size))
        valid = list(p.imap(annotate_, valid, chunksize=args.batch_size))

    initial_len = len(train)
    train = list(filter(lambda x: x[-1] is not None, train))
    log.info('drop {} inconsistent samples.'.format(initial_len - len(train)))
    log.info('tokens generated')

    full = train + valid
    t1 = [row[1] for row in full]
    t2 = [row[5] for row in full]

    # build vocabulary
    vocab, counter = build_vocab(t1, t2)

    counter_tag = collections.Counter(w for row in full for w in row[3])
    vocab_tag = sorted(counter_tag, key=counter_tag.get, reverse=True)
    counter_ent = collections.Counter(w for row in full for w in row[4])
    vocab_ent = sorted(counter_ent, key=counter_ent.get, reverse=True)
    w2id = {w: i for i, w in enumerate(vocab)}
    id2w = {i: w for i, w in enumerate(vocab)}
    tag2id = {w: i for i, w in enumerate(vocab_tag)}
    ent2id = {w: i for i, w in enumerate(vocab_ent)}
    log.info('Vocabulary size: {}'.format(len(vocab)))
    log.info('Found {} POS tags.'.format(len(vocab_tag)))
    log.info('Found {} entity tags: {}'.format(len(vocab_ent), vocab_ent))

    """
    pair_id     = row[0]
    t1_tokens   = row[1]
    t1_features = row[2] 
    t1_tags     = row[3] 
    t1_ents     = row[4] 
    t2_tokens   = row[5]
    t1          = row[6] # original t1 text
    t2          = row[7] # original t2 text
    label       = row[8] # string label Y/N
    """

    to_id_ = partial(to_id, w2id=w2id, tag2id=tag2id, ent2id=ent2id)
    train = list(map(to_id_, train))
    valid = list(map(to_id_, valid))
    log.info('converted to ids.')

    # loading glove
    glove_dir = os.path.dirname(args.wv_file)
    vectors_path  = os.path.join(glove_dir, 'glove.840B.300d.dat')
    words_path    = os.path.join(glove_dir, 'glove.840B.300d_words.pkl')
    word2idx_path = os.path.join(glove_dir, 'glove.840B.300d_idx.pkl')
    if not os.path.exists(words_path):
        build_glove(args.wv_file)
        log.info('glove built.')

    vectors  = bcolz.open(vectors_path)[:]
    words    = pickle.load(open(words_path, 'rb'))
    word2idx = pickle.load(open(word2idx_path, 'rb'))
    glove    = {w: vectors[word2idx[w]] for w in words}
    log.info('glove loaded.')

    vocab_size = len(vocab)
    embeddings = np.zeros((vocab_size, args.wv_dim))
    embed_counts = np.zeros(vocab_size)
    embed_counts[:4] = 1  # PAD, SOS, EOS, UNK

    words_found = 0
    for i, word in enumerate(w2id):
        if word in ["<PAD>", "<SOS>", "<EOS>", "<UNK>"]:
            continue
        try:
            embeddings[i] = glove[word]
            words_found += 1
        except KeyError:
            embeddings[i] = np.random.normal(scale=0.6, size=(args.wv_dim, ))

        embed_counts[i] += 1

    embeddings /= embed_counts.reshape((-1, 1))
    log.info('got embedding matrix.')
    log.info('{0} words not found.'.format(vocab_size - words_found))

    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ent': vocab_ent,
        'embedding': embeddings.tolist(),
        'word2id': w2id,
        'id2word': id2w,
        'wv_cased': args.wv_cased,
    }
    with open('data/coliee_meta_full_ko-de-en.msgpack', 'wb') as f:
        pickle.dump(meta, f)

    result = {
        'train': train,
        'valid': valid
    }
    with open('data/coliee_data_full_ko-de-en.msgpack', 'wb') as f:
        pickle.dump(result, f)

    log.info('saved to disk.')
Example 45
def read_bcolz(fname):
    import bcolz
    """Load the bcolz array from memory (all at once)
    """
    return bcolz.open(fname)[:]
Example 46
 def readFiles(self):
     self.vectors = bcolz.open(self.preTrainemb_path+self.name+'_wiki.dat')[:]
     self.words = pickle.load(open(self.preTrainemb_path+self.name+'_words_wiki.pkl', 'rb'))
     self.word2index = pickle.load(open(self.preTrainemb_path+self.name+'_w2i_wiki.pkl', 'rb'))
     self.index2word = pickle.load(open(self.preTrainemb_path+self.name+'_i2w_wiki.pkl', 'rb'))
     self.embedding_matrix = pickle.load(open(self.preTrainemb_path+self.name+'_embMat_wiki.pkl', 'rb') )
Example 47
        vect = np.array(line[1:]).astype(float)
        vectors.append(vect)

vectors = bcolz.carray(vectors[1:].reshape((400000, 300)),
                       rootdir='glove.6B/6B.300.dat',
                       mode='w')
vectors.flush()
pickle.dump(words, open('glove.6B/6B.300_words.pkl', 'wb'))
pickle.dump(word2idx, open('glove.6B/6B.300_idx.pkl', 'wb'))

with open('data/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

print('Loading vocab...')

vectors = bcolz.open('glove.6B/6B.300.dat')[:]
words = pickle.load(open('glove.6B/6B.300_words.pkl', 'rb'))
word2idx = pickle.load(open('glove.6B/6B.300_idx.pkl', 'rb'))

print('glove is loaded...')

glove = {w: vectors[word2idx[w]] for w in words}
matrix_len = len(vocab)
weights_matrix = np.zeros((matrix_len, 300))
words_found = 0

for i, word in enumerate(vocab.idx2word):
    try:
        weights_matrix[i] = glove[word]
        words_found += 1
    except KeyError:
Example 48
def main():
    """
    Training and validation.
    """

    global best_bleu4, epochs_since_improvement, checkpoint, start_epoch, fine_tune_encoder, data_name, word_map, glove_path, emb_dim, rev_word_map

    # Read word map
    word_map_file = os.path.join(data_folder, 'WORDMAP_' + data_name + '.json')
    with open(word_map_file, 'r') as j:
        word_map = json.load(j)

    rev_word_map = {v: k for k, v in word_map.items()}
    #get glove
    vectors = bcolz.open(f'{glove_path}/6B.300.dat')[:]
    words = pickle.load(open(f'{glove_path}/6B.300_words.pkl', 'rb'))
    word2idx = pickle.load(open(f'{glove_path}/6B.300_idx.pkl', 'rb'))

    glove = {w: vectors[word2idx[w]] for w in words}
    matrix_len = len(word_map)
    weights_matrix = np.zeros((matrix_len, emb_dim))
    words_found = 0

    for i, word in enumerate(word_map.keys()):
        try:
            weights_matrix[i] = glove[word]
            words_found += 1
        except KeyError:
            weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))
#     weights_matrix = np.float64(weights_matrix)
#     weights_matrix = torch.from_numpy(weights_matrix)
#     pretrained_embedding = weights_matrix.to(dtype=torch.float)
#     print(pretrained_embedding.dtype)
#     if device.type == 'cpu' :
#         pretrained_embedding =  torch.FloatTensor(weights_matrix)
#     else:
#         pretrained_embedding =  torch.cuda.FloatTensor(weights_matrix)
    pretrained_embedding = torch.FloatTensor(weights_matrix)

    # Initialize / load checkpoint
    if checkpoint is None:
        decoder = DecoderWithAttention(attention_dim=attention_dim,
                                       embed_dim=emb_dim,
                                       decoder_dim=decoder_dim,
                                       vocab_size=len(word_map),
                                       dropout=dropout)
        decoder.load_pretrained_embeddings(
            pretrained_embedding
        )  # pretrained_embeddings should be of dimensions (len(word_map), emb_dim)
        decoder.fine_tune_embeddings(True)  # or False
        decoder_optimizer = torch.optim.Adam(params=filter(
            lambda p: p.requires_grad, decoder.parameters()),
                                             lr=decoder_lr)
        encoder = Encoder()
        encoder.fine_tune(fine_tune_encoder)
        encoder_optimizer = torch.optim.Adam(
            params=filter(lambda p: p.requires_grad, encoder.parameters()),
            lr=encoder_lr) if fine_tune_encoder else None

    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        best_bleu4 = checkpoint['bleu-4']
        decoder = checkpoint['decoder']
        decoder_optimizer = checkpoint['decoder_optimizer']
        encoder = checkpoint['encoder']
        encoder_optimizer = checkpoint['encoder_optimizer']
        if fine_tune_encoder is True and encoder_optimizer is None:
            encoder.fine_tune(fine_tune_encoder)
            encoder_optimizer = torch.optim.Adam(params=filter(
                lambda p: p.requires_grad, encoder.parameters()),
                                                 lr=encoder_lr)

    # Move to GPU, if available
    decoder = decoder.to(device)
    encoder = encoder.to(device)

    # Loss function
    criterion = nn.CrossEntropyLoss().to(device)

    # Custom dataloaders
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_loader = torch.utils.data.DataLoader(CaptionDataset(
        data_folder,
        data_name,
        'TRAIN',
        transform=transforms.Compose([normalize])),
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(CaptionDataset(
        data_folder,
        data_name,
        'VAL',
        transform=transforms.Compose([normalize])),
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=workers,
                                             pin_memory=True)

    # Epochs
    for epoch in range(start_epoch, epochs):

        # Decay learning rate if there is no improvement for 8 consecutive epochs, and terminate training after 20
        if epochs_since_improvement == 20:
            break
        if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
            adjust_learning_rate(decoder_optimizer, 0.8)
            if fine_tune_encoder:
                adjust_learning_rate(encoder_optimizer, 0.8)

        # One epoch's training


#         train(train_loader=train_loader,
#               encoder=encoder,
#               decoder=decoder,
#               criterion=criterion,
#               encoder_optimizer=encoder_optimizer,
#               decoder_optimizer=decoder_optimizer,
#               epoch=epoch)

# One epoch's validation
        recent_bleu4 = validate(val_loader=val_loader,
                                encoder=encoder,
                                decoder=decoder,
                                criterion=criterion)

        # Check if there was an improvement
        is_best = recent_bleu4 > best_bleu4
        best_bleu4 = max(recent_bleu4, best_bleu4)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" %
                  (epochs_since_improvement, ))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(data_name, epoch, epochs_since_improvement, encoder,
                        decoder, encoder_optimizer, decoder_optimizer,
                        recent_bleu4, is_best)
Example 49
def load_array(data_folder, fname):
    fname = os.path.join(data_folder, fname)
    print("Loading from {0} ...".format(fname))
    return bcolz.open(fname)[:]
Example 50
def get_collection_timestamp(config, path):
    import bcolz
    _, meta_full_path = get_paths(config.root_path, path)
    meta_data = bcolz.open(meta_full_path)[:][0]
    return meta_data['created']
Example 51
def load_array(fname):
    " load np matrix or array"
    return bcolz.open(fname)[:]
Example 52
    def read_time_series_cache_from_disk(self,
                                         fname,
                                         engine='hdf5',
                                         start_date=None,
                                         finish_date=None,
                                         db_server=constants.db_server,
                                         db_port=constants.db_port,
                                         username=constants.db_username,
                                         password=constants.db_password):
        """Reads time series cache from disk in either HDF5 or bcolz

        Parameters
        ----------
        fname : str (or list)
            file to be read from
        engine : str (optional)
            'hdf5' - reads HDF5 files (default)
            'arctic' - reads from Arctic/MongoDB database
            'bcolz' - reads from bcolz file (not fully implemented)
            'parquet' - reads from Parquet
            'redis' - reads from a Redis cache
        start_date : str/datetime (optional)
            Start date
        finish_date : str/datetime (optional)
            Finish date
        db_server : str
            IP address of MongoDB (default '127.0.0.1')

        Returns
        -------
        DataFrame
        """

        logger = LoggerManager.getLogger(__name__)

        data_frame_list = []

        if not (isinstance(fname, list)):
            if '*' in fname:
                fname = glob.glob(fname)
            else:
                fname = [fname]

        for fname_single in fname:
            logger.debug("Reading " + fname_single + "..")

            if engine == 'parquet' and '.gzip' not in fname_single and '.parquet' not in fname_single:
                fname_single = fname_single + '.parquet'

            if (engine == 'bcolz'):
                try:
                    name = self.get_bcolz_filename(fname_single)
                    zlens = bcolz.open(rootdir=name)
                    data_frame = zlens.todataframe()

                    data_frame.index = pandas.DatetimeIndex(data_frame['DTS_'])
                    data_frame.index.name = 'Date'
                    del data_frame['DTS_']

                    # convert invalid characters (which Bcolz can't deal with) to more readable characters for pandas
                    data_frame.columns = self.find_replace_chars(
                        data_frame.columns, _replace_chars, _invalid_chars)
                    data_frame.columns = [x[2:] for x in data_frame.columns]
                except:
                    data_frame = None

            elif (engine == 'redis'):
                fname_single = os.path.basename(fname_single).replace('.', '_')

                msg = None

                try:
                    # for pyarrow
                    context = pa.default_serialization_context()

                    r = redis.StrictRedis(host=db_server, port=db_port, db=0)

                    # is there a compressed key stored?
                    k = r.keys('comp_*_' + fname_single)

                    # if so, then it means that we have stored it as a compressed object
                    # if we have more than 1 element, take the last (which will be the latest added)
                    if (len(k) >= 1):
                        k = k[-1].decode('utf-8')

                        comp = r.get(k)

                        siz = int(k.split('_')[1])
                        dec = pa.decompress(comp,
                                            codec='lz4',
                                            decompressed_size=siz)

                        msg = context.deserialize(dec)
                    else:
                        msg = r.get(fname_single)

                        # print(fname_single)
                        if msg is not None:
                            msg = context.deserialize(msg)
                            # logger.warning("Key " + fname_single + " not in Redis cache?")

                except Exception as e:
                    logger.info("Cache not existent for " + fname_single +
                                " in Redis: " + str(e))

                if msg is None:
                    data_frame = None
                else:
                    logger.info('Load Redis cache: ' + fname_single)

                    data_frame = msg  # pandas.read_msgpack(msg)

            elif (engine == 'arctic'):
                socketTimeoutMS = 2 * 1000

                import pymongo
                from arctic import Arctic

                fname_single = os.path.basename(fname_single).replace('.', '_')

                logger.info('Load Arctic/MongoDB library: ' + fname_single)

                if username is not None and password is not None:
                    c = pymongo.MongoClient(
                        host="mongodb://" + username + ":" + password + "@" +
                        str(db_server) + ":" + str(db_port),
                        connect=False
                    )  # , username=username, password=password)
                else:
                    c = pymongo.MongoClient(host="mongodb://" +
                                            str(db_server) + ":" +
                                            str(db_port),
                                            connect=False)

                store = Arctic(c,
                               socketTimeoutMS=socketTimeoutMS,
                               serverSelectionTimeoutMS=socketTimeoutMS)

                # Access the library
                try:
                    library = store[fname_single]

                    if start_date is None and finish_date is None:
                        item = library.read(fname_single)

                    else:
                        from arctic.date import DateRange
                        item = library.read(
                            fname_single,
                            date_range=DateRange(
                                start_date.replace(tzinfo=None),
                                finish_date.replace(tzinfo=None)))

                    c.close()

                    logger.info('Read ' + fname_single)

                    data_frame = item.data

                except Exception as e:
                    logger.warning('Library may not exist or another error: ' +
                                   fname_single + ' & message is ' + str(e))
                    data_frame = None

            elif self.path_exists(self.get_h5_filename(fname_single)):
                store = pandas.HDFStore(self.get_h5_filename(fname_single))
                data_frame = store.select("data")

                if ('intraday' in fname_single):
                    data_frame = data_frame.astype('float32')

                store.close()

            elif self.path_exists(fname_single) and '.csv' in fname_single:
                data_frame = pandas.read_csv(fname_single, index_col=0)

                data_frame.index = pandas.to_datetime(data_frame.index)

            elif self.path_exists(fname_single):
                data_frame = self.read_parquet(fname_single)
                # data_frame = pandas.read_parquet(fname_single)

            data_frame_list.append(data_frame)

        if len(data_frame_list) == 1:
            return data_frame_list[0]

        return data_frame_list
Example 53
        state_change_label = torch.from_numpy(state_change_label).view(
            -1).long()

        loss_state_change_label = nn.CrossEntropyLoss(
            self.state_label_weights)(self.state_change_label_logits,
                                      state_change_label)

        return loss_state_change_label * coefficient

    def mse_loss(self, target_preds, coefficient):
        target_preds = torch.from_numpy(target_preds)
        return nn.functional.mse_loss(self.state_change_label_logits,
                                      target_preds) * coefficient


vectors = bcolz.open('data/6B.100.dat')[:]
words = pickle.load(open('data/6B.100_words.pkl', 'rb'))
word2idx = pickle.load(open('data/6B.100_idx.pkl', 'rb'))

glove = {w: vectors[word2idx[w]] for w in words}

with open("data/train_samples.pkl", "rb") as fp:
    train_samples = pickle.load(fp)

with open("data/test_samples.pkl", "rb") as fp:
    test_samples = pickle.load(fp)

with open("data/dev_samples.pkl", "rb") as fp:
    dev_samples = pickle.load(fp)

with open("data/unlabeled_samples.pkl", "rb") as fp:
        vectors.append(vect)

#Construct pickle files
vectors = bcolz.carray(vectors[1:].reshape((400000, 50)),
                       rootdir=f'/Users/nilslager/Desktop/gitit.50.dat',
                       mode='w')
vectors.flush()
pickle.dump(
    words, open(f'/Users/nilslager/Desktop/Projekt1/bibliotekN_words.pkl',
                'wb'))
pickle.dump(
    word2idx,
    open(f'/Users/nilslager/Desktop/Projekt1/bibliotekN_index.pkl', 'wb'))

#Create vector space
vectors = bcolz.open(f'/Users/nilslager/Desktop/6B.50.dat')[:]
words = pickle.load(
    open(f'/Users/nilslager/Desktop/Projekt1/bibliotekN_words.pkl', 'rb'))
word2idx = pickle.load(
    open(f'/Users/nilslager/Desktop/Projekt1/bibliotekN_index.pkl', 'rb'))
glove = {w: vectors[word2idx[w]] for w in words}

#Test
print(glove["the"])

#Model
matrix_len = len(glove)
weights_matrix = np.zeros((matrix_len, 50))
words_found = 0

#Glove -> Weights matrix
Example 55
 def __init__(self,fp):
     conn = sql.connect(fp + '//' + 'data.sqlite')
     self.c = conn.cursor()
     self.tb = bcolz.open(fp + '//' + 'data_d')
Example 56
def load_array(f):
    return bcolz.open(f)
Example 57
def load_array(fname):
    return bcolz.open(fname)[:]
Example 58
 def __init__(self, f):
     self._dates = pd.Index(pd.Timestamp(str(d)) for d in bcolz.open(f, 'r'))
Example 59
    def read_time_series_cache_from_disk(self,
                                         fname,
                                         engine='hdf5',
                                         start_date=None,
                                         finish_date=None,
                                         db_server='127.0.0.1'):
        """
        read_time_series_cache_from_disk - Reads time series cache from disk in either HDF5 or bcolz

        Parameters
        ----------
        fname : str
            file to be read from

        Returns
        -------
        DataFrame
        """

        if (engine == 'bcolz'):
            try:
                name = self.get_bcolz_filename(fname)
                zlens = bcolz.open(rootdir=name)
                data_frame = zlens.todataframe()

                data_frame.index = pandas.DatetimeIndex(data_frame['DTS_'])
                data_frame.index.name = 'Date'
                del data_frame['DTS_']

                # convert invalid characters (which Bcolz can't deal with) to more readable characters for pandas
                data_frame.columns = self.find_replace_chars(
                    data_frame.columns, _replace_chars, _invalid_chars)
                data_frame.columns = [x[2:] for x in data_frame.columns]

                return data_frame
            except:
                return None
        elif (engine == 'arctic'):
            socketTimeoutMS = 2 * 1000

            import pymongo
            from arctic import Arctic

            fname = os.path.basename(fname).replace('.', '_')

            self.logger.info('Load MongoDB library: ' + fname)

            c = pymongo.MongoClient(db_server, connect=False)

            store = Arctic(c,
                           socketTimeoutMS=socketTimeoutMS,
                           serverSelectionTimeoutMS=socketTimeoutMS)

            # Access the library
            library = store[fname]

            if start_date is None and finish_date is None:
                item = library.read(fname)
            else:
                from arctic.date import DateRange
                item = library.read(fname,
                                    date_range=DateRange(
                                        start_date, finish_date))

            c.close()

            self.logger.info('Read ' + fname)

            return item.data
        elif os.path.isfile(self.get_h5_filename(fname)):
            store = pandas.HDFStore(self.get_h5_filename(fname))
            data_frame = store.select("data")

            if ('intraday' in fname):
                data_frame = data_frame.astype('float32')

            store.close()

            return data_frame

        return None
def test_load_bcolz_embeddings():
    bcolz_embeddings_path = '/home/peng/Workspace/data/embeddings/bcolz_vectors/bcolz_embeddings.dat'
    word2idx_path = '/home/peng/Workspace/data/embeddings/word2idx.pkl'
    vectors = bcolz.open(f'{bcolz_embeddings_path}', mode='r')
    word2idx = pickle.load(open(word2idx_path, 'rb'))
    print(vectors[word2idx['house']])