def _from_carray(path, format_categories=None, format_codes=None, format_values=None):
    meta = json.load(open(os.path.join(path, 'meta'), 'r'))
    if meta['type'] == 'category':
        if format_categories in ['npz', 'npy']:
            filename = os.path.join(path, 'categories.%s' % format_categories)
            with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)):
                categories_values = numpy.load(filename, mmap_mode='r+')  # TODO npz not memmap?
                if format_categories == 'npz':
                    categories_values = categories_values['arr_0']
        elif format_categories == 'pickle':
            filename = os.path.join(path, 'categories.pickle')
            with log.timedlogger("reading [%s] %s" % (meta['name'], filename)):
                categories_values = pickle.load(open(filename, 'rb'))
        elif format_categories == 'bcolz':
            rootdir = os.path.join(path, 'categories.bcolz')
            with log.timedlogger("reading [%s] %s" % (meta['name'], rootdir)):
                categories_values = FakeCarrayAsNumpyArray(rootdir=rootdir, mode='r')
                # categories_values = bcolz.carray(rootdir=rootdir, mode='r')[:]
        else:
            raise NotImplementedError("uh oh %s" % (meta['type'],))

        if format_codes == 'bcolz':
            rootdir = os.path.join(path, 'codes.bcolz')
            with log.timedlogger("reading [%s] %s" % (meta['name'], rootdir)):
                codes_values = bcolz.open(rootdir=rootdir, mode='r')[:]  # , categories=categories_values)
                # codes_values = FakeCarrayAsNumpyArray(rootdir=rootdir, mode='r')  # , categories=categories_values)
        elif format_codes == 'npy':
            filename = os.path.join(path, 'codes.npy')
            with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)):
                codes_values = numpy.load(filename, mmap_mode='r+')
        else:
            raise Exception("unknown format_codes type %s" % (format_codes,))

        with log.timedlogger("FastCat construction"):
            s = FastCat(codes_values, categories_values)
    else:
        if format_values == 'bcolz':
            rootdir = os.path.join(path, 'values.bcolz')
            with log.timedlogger("reading [%s] %s" % (meta['name'], rootdir)):
                # values = FakeCarrayAsNumpyArray(rootdir=rootdir, mode='r')
                s = bcolz.open(rootdir=rootdir, mode='r')[:]
        elif format_values == 'npy':
            filename = os.path.join(path, 'values.npy')
            with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)):
                s = numpy.load(filename, mmap_mode='r+')
        elif format_values == 'pickle':
            filename = os.path.join(path, 'values.pickle')
            with log.timedlogger("reading [%s] %s with mmap_mode" % (meta['name'], filename)):
                s = pickle.load(open(filename, 'rb'))
        # with log.timedlogger("FastSeries construction"):
        #     index = pandas.Index(numpy.arange(len(values)), copy=False)
        #     values = SingleBlockManager(values, index, fastpath=True)
        #     s = pandas.Series(data=values, fastpath=True, copy=False, dtype=meta['type'])
        #     s = values  # [:]
    # logging.warning('Constructing categorical for %s' % meta['name'])
    # s = pandas.Categorical.from_codes(codes_values, categories_values, name=meta['name'])
    if isinstance(meta['name'], list):
        meta['name'] = tuple(meta['name'])
    return meta, s  # codes_values, categories_values
def __init__(self, root_dir):
    self._root_dir = root_dir
    import bcolz
    import os
    import pickle
    self._daily_table = bcolz.open(os.path.join(root_dir, LocalDataSource.DAILY))
    self._instruments = {d['order_book_id']: Instrument(d)
                         for d in pickle.load(open(os.path.join(root_dir, LocalDataSource.INSTRUMENTS), 'rb'))}
    self._dividend = bcolz.open(os.path.join(root_dir, LocalDataSource.DIVIDEND))
    self._yield_curve = bcolz.open(os.path.join(root_dir, LocalDataSource.YIELD_CURVE))
    self._trading_dates = pd.Index(pd.Timestamp(str(d))
                                   for d in bcolz.open(os.path.join(root_dir, LocalDataSource.TRADING_DATES)))
def _get_ctable(self, asset):
    sid = int(asset)

    if isinstance(asset, Future):
        if self._future_minute_reader.sid_path_func is not None:
            path = self._future_minute_reader.sid_path_func(
                self._future_minute_reader.rootdir, sid
            )
        else:
            path = "{0}/{1}.bcolz".format(
                self._future_minute_reader.rootdir, sid)
    elif isinstance(asset, Equity):
        if self._equity_minute_reader.sid_path_func is not None:
            path = self._equity_minute_reader.sid_path_func(
                self._equity_minute_reader.rootdir, sid
            )
        else:
            path = "{0}/{1}.bcolz".format(
                self._equity_minute_reader.rootdir, sid)
    else:
        # TODO: Figure out if assets should be allowed if neither, and
        # why this code path is being hit.
        if self._equity_minute_reader.sid_path_func is not None:
            path = self._equity_minute_reader.sid_path_func(
                self._equity_minute_reader.rootdir, sid
            )
        else:
            path = "{0}/{1}.bcolz".format(
                self._equity_minute_reader.rootdir, sid)

    return bcolz.open(path, mode='r')
def load(db, query=None):
    t0 = time.time()
    conn = sqlite3.connect(db)
    cur = conn.cursor()
    gt_cols = get_gt_cols(cur)
    samples = get_samples(cur)
    bcpath = get_bcolz_dir(db)

    carrays = {}
    n = 0
    for gtc in gt_cols:
        if not gtc in query:
            continue
        carrays[gtc] = []
        for s in samples:
            if not s in query and not fix_sample_name(s) in query:
                # need to add anyway as place-holder
                carrays[gtc].append(None)
                continue
            path = "%s/%s/%s" % (bcpath, s, gtc)
            if os.path.exists(path):
                carrays[gtc].append(bcolz.open(path, mode="r"))
                n += 1

    if os.environ.get("GEMINI_DEBUG") == "TRUE":
        print >>sys.stderr, "it took %.2f seconds to load %d arrays" \
            % (time.time() - t0, n)
    return carrays
def load_array(fname, opt_fallback=None):
    if not os.path.isdir(fname):
        arr = opt_fallback()
        if hasattr(arr, 'values'):
            arr = arr.values
        save_array(fname, arr)
        return arr
    return bcolz.open(fname)[:]
def load(db, query=None):
    t0 = time.time()
    conn, metadata = database.get_session_metadata(db)
    gt_cols = get_gt_cols(metadata)
    samples = get_samples(metadata)
    bcpath = get_bcolz_dir(db)

    carrays = {}
    n = 0
    for gtc in gt_cols:
        if not gtc in query:
            continue
        carrays[gtc] = []
        for s in samples:
            if not s in query and not fix_sample_name(s) in query:
                # need to add anyway as place-holder
                carrays[gtc].append(None)
                continue
            path = "%s/%s/%s" % (bcpath, s, gtc)
            if os.path.exists(path):
                carrays[gtc].append(bcolz.open(path, mode="r"))
                n += 1

    if os.environ.get("GEMINI_DEBUG") == "TRUE":
        sys.stderr.write("it took %.2f seconds to load %d arrays\n"
                         % (time.time() - t0, n))
    return carrays
def animate(i):
    b = bcolz.open('db')
    yar = [j[0] for j in b[-10:]]
    xar = np.arange(len(yar))
    ax1.clear()
    ax1.plot(xar, yar)
    print(yar)
def __iter__(self):
    # obtain ctable
    if isinstance(self.source, string_types):
        import bcolz
        ctbl = bcolz.open(self.source, mode="r")
    else:
        # assume bcolz ctable
        ctbl = self.source

    # obtain header
    if self.outcols is None:
        header = tuple(ctbl.names)
    else:
        header = tuple(self.outcols)
        assert all(h in ctbl.names for h in header), "invalid outcols"
    yield header

    # obtain iterator
    if self.expression is None:
        it = ctbl.iter(outcols=self.outcols, skip=self.skip,
                       limit=self.limit)
    else:
        it = ctbl.where(self.expression, outcols=self.outcols,
                        skip=self.skip, limit=self.limit)

    for row in it:
        yield row
def truncate(self, date):
    """Truncate data beyond this date in all ctables."""
    truncate_slice_end = self.data_len_for_day(date)

    glob_path = os.path.join(self._rootdir, "*", "*", "*.bcolz")
    sid_paths = sorted(glob(glob_path))

    for sid_path in sid_paths:
        file_name = os.path.basename(sid_path)

        try:
            table = bcolz.open(rootdir=sid_path)
        except IOError:
            continue
        if table.len <= truncate_slice_end:
            logger.info("{0} not past truncate date={1}.", file_name, date)
            continue

        logger.info(
            "Truncating {0} at end_date={1}", file_name, date.date()
        )

        table.resize(truncate_slice_end)

    # Update end session in metadata.
    metadata = BcolzMinuteBarMetadata.read(self._rootdir)
    metadata.end_session = date
    metadata.write(self._rootdir)
def test00b(self):
    """Testing `carray` reshape (large shape)"""
    a = np.arange(16000).reshape((20, 20, 40))
    b = bcolz.arange(16000, rootdir=self.rootdir).reshape((20, 20, 40))
    if self.open:
        b = bcolz.open(rootdir=self.rootdir)
    # print "b->", `b`
    assert_array_equal(a, b, "Arrays are not equal")

def test00a(self):
    """Testing `carray` reshape"""
    a = np.arange(16).reshape((2, 2, 4))
    b = bcolz.arange(16, rootdir=self.rootdir).reshape((2, 2, 4))
    if self.open:
        b = bcolz.open(rootdir=self.rootdir)
    # print "b->", `b`
    assert_array_equal(a, b, "Arrays are not equal")

def test01b(self):
    """Testing `zeros` constructor (II)"""
    a = np.zeros(2, dtype='(2,4)i4')
    b = bcolz.zeros(2, dtype='(2,4)i4', rootdir=self.rootdir)
    if self.open:
        b = bcolz.open(rootdir=self.rootdir)
    # print "b->", `b`
    assert_array_equal(a, b, "Arrays are not equal")
def _get_ctable(self, asset):
    sid = int(asset)
    if self._sid_path_func is not None:
        path = self._sid_path_func(self.rootdir, sid)
    else:
        path = "{0}/{1}.bcolz".format(self.rootdir, sid)
    return bcolz.open(path, mode='r')
def test02(self):
    """Testing `ones` constructor"""
    a = np.ones((2, 2), dtype='(4,)i4')
    b = bcolz.ones((2, 2), dtype='(4,)i4', rootdir=self.rootdir)
    if self.open:
        b = bcolz.open(rootdir=self.rootdir)
    # print "b->", `b`
    assert_array_equal(a, b, "Arrays are not equal")

def test03a(self):
    """Testing `fill` constructor (scalar default)"""
    a = np.ones((2, 200), dtype='(4,)i4') * 3
    b = bcolz.fill((2, 200), 3, dtype='(4,)i4', rootdir=self.rootdir)
    if self.open:
        b = bcolz.open(rootdir=self.rootdir)
    # print "b->", `b`
    assert_array_equal(a, b, "Arrays are not equal")

def test00b(self):
    """Testing `__getitem()__` method with only a start (slice)"""
    a = np.ones((27, 2700), dtype="i4") * 3
    b = bcolz.fill((27, 2700), 3, dtype="i4", rootdir=self.rootdir)
    if self.open:
        b = bcolz.open(rootdir=self.rootdir)
    sl = slice(1)
    self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
    assert_array_equal(a[sl], b[sl], "Arrays are not equal")
def query_maps(table_name, attr_name, key_to_int=False):
    """Query an attribute map stored on a bcolz table."""
    rootdir = bcolz_table_path(table_name)
    ct = bcolz.open(rootdir)
    d = ct.attrs[attr_name]
    if key_to_int:
        return {int(k): v for k, v in d.items()}
    else:
        return d
def test00a(self):
    """Testing `__getitem()__` method with only a start (scalar)"""
    a = np.ones((2, 3), dtype="i4") * 3
    b = bcolz.fill((2, 3), 3, dtype="i4", rootdir=self.rootdir)
    if self.open:
        b = bcolz.open(rootdir=self.rootdir)
    sl = 1
    # print "b[sl]->", `b[sl]`
    self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
    assert_array_equal(a[sl], b[sl], "Arrays are not equal")

def test02(self):
    """Testing `__getitem()__` method with a start, stop, step"""
    a = np.ones((10, 2), dtype="i4") * 3
    b = bcolz.fill((10, 2), 3, dtype="i4", rootdir=self.rootdir)
    if self.open:
        b = bcolz.open(rootdir=self.rootdir)
    sl = slice(1, 9, 2)
    # print "b[sl]->", `b[sl]`
    self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
    assert_array_equal(a[sl], b[sl], "Arrays are not equal")

def test03c(self):
    """Testing `__getitem()__` method with several slices (III)"""
    a = np.arange(120 * 1000).reshape((5 * 1000, 4, 3, 2))
    b = bcolz.carray(a, rootdir=self.rootdir)
    if self.open:
        b = bcolz.open(rootdir=self.rootdir)
    sl = (slice(None, None, 3), slice(1, 3, 2), slice(1, 4, 2))
    # print "b[sl]->", `b[sl]`
    self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
    assert_array_equal(a[sl], b[sl], "Arrays are not equal")
def from_dict_of_blocks(rootdir, mode='r'):
    """ deprecated """
    meta = json.load(open(os.path.join(rootdir, 'meta')))
    d = dict()
    for i, k in enumerate(meta['keys']):
        filename = os.path.join(rootdir, str(i))
        with log.timedlogger('reading {} ({})'.format(filename, k)):
            d[k] = bcolz.open(filename, mode=mode)
            print('... d[{}].shape = {}'.format(k, d[k].shape))
    return d
def test04c(self):
    """Testing `__getitem()__` method with shape reduction (III)"""
    a = np.arange(6000).reshape((50, 40, 3))
    b = bcolz.carray(a, rootdir=self.rootdir)
    if self.open:
        b = bcolz.open(rootdir=self.rootdir)
    sl = (1, slice(1, 4, 2), 2)
    # print "b[sl]->", `b[sl]`
    self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
    assert_array_equal(a[sl], b[sl], "Arrays are not equal")

def test05c(self):
    """Testing `__getitem()__` method with fancy indexing (III)"""
    a = np.arange(2000).reshape((50, 40))
    b = bcolz.carray(a, rootdir=self.rootdir)
    if self.open:
        b = bcolz.open(rootdir=self.rootdir)
    sl = (slice(None), [0, 2])
    # print "b[sl]->", `b[sl]`
    self.assertTrue(a[sl].shape == b[sl].shape, "Shape is not equal")
    assert_array_equal(a[sl], b[sl], "Arrays are not equal")
def test04(self):
    """Testing `fill` constructor with open and resize (array default)"""
    a = np.ones((3, 200), dtype='(4,)i4') * 3
    b = bcolz.fill((2, 200), [3, 3, 3, 3], dtype='(4,)i4', rootdir=self.rootdir)
    if self.open:
        b = bcolz.open(rootdir=self.rootdir)
    c = np.ones((1, 200), dtype='(4,)i4') * 3
    b.append(c)
    # print "b->", `b`, len(b), b[1]
    assert_array_equal(a, b, "Arrays are not equal")
def test00b(self):
    """Testing `__setitem()__` method with only a start (vector)"""
    a = np.ones((200, 300), dtype="i4") * 3
    b = bcolz.fill((200, 300), 3, dtype="i4", rootdir=self.rootdir)
    sl = slice(1)
    a[sl, :] = range(300)
    b[sl] = range(300)
    if self.open:
        b.flush()
        b = bcolz.open(rootdir=self.rootdir)
    # print "b[sl]->", `b[sl]`
    assert_array_equal(a[sl], b[sl], "Arrays are not equal")

def test02b(self):
    """Testing `__setitem()__` method with start,stop,step (scalar)"""
    a = np.ones((10, 2), dtype="i4") * 3
    b = bcolz.fill((10, 2), 3, dtype="i4", rootdir=self.rootdir)
    sl = slice(1, 8, 3)
    a[sl, :] = range(2)
    b[sl] = range(2)
    if self.open:
        b.flush()
        b = bcolz.open(rootdir=self.rootdir)
    # print "b[sl]->", `b[sl]`, `b`
    assert_array_equal(a[sl], b[sl], "Arrays are not equal")
def test05(self):
    """Testing `fill` constructor with open and resize (nchunks>1)"""
    a = np.ones((3, 2000), dtype='(4,)i4') * 3
    b = bcolz.fill((2, 2000), [3, 3, 3, 3], dtype='(4,)i4', rootdir=self.rootdir)
    if self.open:
        b = bcolz.open(rootdir=self.rootdir)
    c = np.ones((1, 2000), dtype='(4,)i4') * 3
    b.append(c)
    # print "b->", `b`
    # We need to use the b[:] here to overcome a problem with the
    # assert_array_equal() function
    assert_array_equal(a, b[:], "Arrays are not equal")
def test01a(self):
    """Testing `__setitem()__` method with start,stop (scalar)"""
    a = np.ones((500, 200), dtype="i4") * 3
    b = bcolz.fill((500, 200), 3, dtype="i4", rootdir=self.rootdir,
                   cparams=bcolz.cparams())
    sl = slice(100, 400)
    a[sl, :] = 0
    b[sl] = 0
    if self.open:
        b.flush()
        b = bcolz.open(rootdir=self.rootdir)
    # print "b[sl]->", `b[sl]`
    assert_array_equal(a[sl], b[sl], "Arrays are not equal")
def test03d(self):
    """Testing `__setitem()__` method with several slices (IV)"""
    a = np.arange(120).reshape((5, 4, 3, 2))
    b = bcolz.carray(a, rootdir=self.rootdir)
    sl = (slice(1, 3), slice(1, 3, 1), slice(1, None, 2), slice(1))
    # print "before->", `b[sl]`
    a[sl] = 2
    b[sl] = 2
    if self.open:
        b.flush()
        b = bcolz.open(rootdir=self.rootdir)
    # print "after->", `b[sl]`
    assert_array_equal(a[:], b[:], "Arrays are not equal")

def test04c(self):
    """Testing `__setitem()__` method with shape reduction (III)"""
    a = np.arange(24).reshape((4, 3, 2))
    b = bcolz.carray(a, rootdir=self.rootdir)
    sl = (1, 2, slice(None, None, None))
    # print "before->", `b[sl]`
    a[sl] = 2
    b[sl] = 2
    if self.open:
        b.flush()
        b = bcolz.open(rootdir=self.rootdir)
    # print "after->", `b[sl]`
    assert_array_equal(a[sl], b[sl], "Arrays are not equal")
def load_array(fname):
    print("Loading image dataset from the location " + str(fname) + ".")
    return bcolz.open(fname)[:]
# In[4]:

a = "THIS is a go(%*#od day!===thanks $%@%*(don't}}} know... ====================================="
b = divide_string(a)
print(len(b))
print(b)

# In[5]:

vectors = bz.open(r"C:\Users\mul02\Desktop\Course\LIGN 167\Final Project\glove\27B.100.dat")[:]
words = pickle.load(open(r"C:\Users\mul02\Desktop\Course\LIGN 167\Final Project\glove\27B.100_words.pkl", 'rb'))
word2idx = pickle.load(open(r"C:\Users\mul02\Desktop\Course\LIGN 167\Final Project\glove\27B.100_idx.pkl", 'rb'))
glove = {w: vectors[word2idx[w]] for w in words}

# In[6]:

# sents is a list of strings
def remove_infrequent_words(sents):
    word_counts = {}
    divide_sentence = []
    # divide each sentence first
    for s in sents:
def glove(name, address):
    vectors = bcolz.open(address + name + '.300.dat')[:]
    words = pickle.load(open(address + name + '.300_words.pkl', 'rb'))
    word2idx = pickle.load(open(address + name + '.300_idx.pkl', 'rb'))
    glove = {w: vectors[word2idx[w]] for w in words}
    return glove
def main(): parser = argparse.ArgumentParser() parser.add_argument('--name', type=str, help='saved and resumed file name') parser.add_argument('--resume', action='store_true', help='resumed flag') parser.add_argument('--test', dest='test_only', default=False, action='store_true') parser.add_argument('--detctor', default='2019-03-16_10:28:52{}.pth', help='the name of detector') parser.add_argument('--gpu', default='3', help='the chosen gpu id') args = parser.parse_args() os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu cudnn.benchmark = True ########################################## ARGUMENT SETTING ######################################## if args.test_only: args.resume = True if args.resume and not args.name: raise ValueError('Resuming requires file name!') name = args.name if args.name else datetime.now().strftime( "%Y-%m-%d_%H:%M:%S") if args.resume: target_name = name logs = torch.load(target_name) # hacky way to tell the VQA classes that they should use the vocab without passing more params around data.preloaded_vocab = logs['vocab'] else: target_name = os.path.join('logs', '{}'.format(name)) if not args.test_only: print('will save to {}'.format(target_name)) ######################################### DATASET PREPARATION ####################################### if config.train_set == 'train': train_loader = data.get_loader(train=True) val_loader = data.get_loader(val=True) elif args.test_only: val_loader = data.get_loader(test=True) else: train_loader = data.get_loader(train=True, val=True) val_loader = data.get_loader(test=True) ########################################## MODEL PREPARATION ######################################## embedding = bcolz.open(config.glove_path_filtered)[:] net = model.RelAtt(embedding) net = nn.DataParallel(net).cuda() optimizer = optim.Adam([p for p in net.parameters() if p.requires_grad], lr=config.initial_lr, weight_decay=1e-8) # optimizer = optim.RMSprop( # [p for p in net.parameters() if p.requires_grad], # lr=config.initial_lr, # momentum=0.20, # weight_decay=1e-8 # ) scheduler = lr_scheduler.ExponentialLR(optimizer, 0.5**(1 / 50000)) ######################################### ####################################### acc_val_best = 0.0 start_epoch = 0 if args.resume: net.load_state_dict(logs['model_state']) optimizer.load_state_dict(logs['optim_state']) scheduler.load_state_dict(logs['scheduler_state']) start_epoch = logs['epoch'] acc_val_best = logs['acc_val_best'] tracker = utils.Tracker() r = np.zeros(3) for i in range(start_epoch, config.epochs): if not args.test_only: run(net, train_loader, optimizer, scheduler, tracker, train=True, prefix='train', epoch=i) if not (config.train_set == 'train+val' and i in range(config.epochs - 5)): r = run(net, val_loader, optimizer, scheduler, tracker, train=False, prefix='val', epoch=i, has_answers=(config.train_set == 'train')) if not args.test_only: results = { 'epoch': i, 'acc_val_best': acc_val_best, 'name': name, 'model_state': net.state_dict(), 'optim_state': optimizer.state_dict(), 'scheduler_state': scheduler.state_dict(), 'eval': { 'answers': r[0], 'accuracies': r[1], 'idx': r[2] }, 'vocab': val_loader.dataset.vocab, } if config.train_set == 'train' and r[1].mean() > acc_val_best: acc_val_best = r[1].mean() torch.save(results, target_name + '.pth') if config.train_set == 'train+val': torch.save(results, target_name + '{}.pth') if i in range(config.epochs - 5, config.epochs): saved_for_test(val_loader, r, i) else: saved_for_test(val_loader, r) break
def __init__(self, f):
    self._dates = bcolz.open(f, 'r')
    self._index = self._dates.attrs['line_map']
def get_data(table):
    try:
        with bcolz.open(os.path.join(BUNDLE, table), 'r') as ctable:
            return ctable.todataframe()
    except FileNotFoundError:
        pass
def __init__(self, f):
    self._table = bcolz.open(f, 'r')
    self._index = self._table.attrs['line_map']
def _load_bcolz_data(self):
    bc = bcolz.open(rootdir=self.bcolz_fname, mode='r')
    self.df = bc.todataframe()
with open(f'{glove_path}/glove.6B.50d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(float)
        vectors.append(vect)

vectors = bcolz.carray(vectors[1:].reshape((400001, 50)),
                       rootdir=f'{glove_path}/6B.50.dat', mode='w')
vectors.flush()
pickle.dump(words, open(f'{glove_path}/6B.50_words.pkl', 'wb'))
pickle.dump(word2idx, open(f'{glove_path}/6B.50_idx.pkl', 'wb'))

vectors = bcolz.open(f'{glove_path}/6B.50.dat')[:]
words = pickle.load(open(f'{glove_path}/6B.50_words.pkl', 'rb'))
word2idx = pickle.load(open(f'{glove_path}/6B.50_idx.pkl', 'rb'))
glove = {w: vectors[word2idx[w]] for w in words}
print(glove['the'])

df = pd.read_csv("DA_labeled_belc_2019.csv")
t = [str(s).split() for s in df['text'].values.tolist()]
words = {}
for sentence in t:
    for word in sentence:
        words[word] = word
vocabulary = []
import matplotlib.ticker as ticker
from utils.time_utils import timeSince
import pickle
import bcolz
import numpy as np
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from preprocess import clean_text

plt.switch_backend('agg')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# =============================================================================
# EMBEDDING
# =============================================================================
vectors = bcolz.open(f'Embedding/6B.300d.dat')[:]
words = pickle.load(open(f'Embedding/6B.300_words.pkl', 'rb'))
word2idx = pickle.load(open(f'Embedding/6B.300_idx.pkl', 'rb'))
glove = {w: vectors[word2idx[w]] for w in words}


def get_weights_matrix(target_vocab):
    matrix_len = len(target_vocab)
    weights_matrix = np.zeros((matrix_len, 300))
    words_found = 0
    for i, word in enumerate(target_vocab):
        try:
            weights_matrix[i] = glove[word]
            words_found += 1
def load_glove(glove_path):
    vectors = bcolz.open(f'{glove_path}/42B.300.dat')[:]
    words = pickle.load(open(f'{glove_path}/42B.300_words.pkl', 'rb'))
    word2idx = pickle.load(open(f'{glove_path}/42B.300_idx.pkl', 'rb'))
    glove = {w: vectors[word2idx[w]] for w in words}
    return glove, word2idx, vectors
def load_array(fname):
    return bcolz.open(fname)[:]


def get_classes(path):
def __init__(self, f):
    dates = bcolz.open(f, 'r')
    self._index = dates.attrs['line_map']
    self._dates = [int(d) for d in dates]
def main(): args, log = setup() if not args.data_augment: train = flatten_xml(args.train_folder, 'train') valid = flatten_xml(args.valid_folder, 'valid') log.info('xml data flattened.') else: train, valid = load_augmented_data() log.info('xml augmented data loaded.') ## tokenize & annotate with Pool(args.threads, initializer=init) as p: annotate_ = partial(annotate) train = list(p.map(annotate_, train, chunksize=args.batch_size)) valid = list(p.imap(annotate_, valid, chunksize=args.batch_size)) initial_len = len(train) train = list(filter(lambda x: x[-1] is not None, train)) log.info('drop {} inconsistent samples.'.format(initial_len - len(train))) log.info('tokens generated') full = train + valid t1 = [row[1] for row in full] t2 = [row[5] for row in full] # build vocabulary vocab, counter = build_vocab(t1, t2) counter_tag = collections.Counter(w for row in full for w in row[3]) vocab_tag = sorted(counter_tag, key=counter_tag.get, reverse=True) counter_ent = collections.Counter(w for row in full for w in row[4]) vocab_ent = sorted(counter_ent, key=counter_ent.get, reverse=True) w2id = {w: i for i, w in enumerate(vocab)} id2w = {i: w for i, w in enumerate(vocab)} tag2id = {w: i for i, w in enumerate(vocab_tag)} ent2id = {w: i for i, w in enumerate(vocab_ent)} log.info('Vocabulary size: {}'.format(len(vocab))) log.info('Found {} POS tags.'.format(len(vocab_tag))) log.info('Found {} entity tags: {}'.format(len(vocab_ent), vocab_ent)) """ pair_id = row[0] t1_tokens = row[1] t1_features = row[2] t1_tags = row[3] t1_ents = row[4] t2_tokens = row[5] t1 = row[6] # original t1 text t2 = row[7] # original t2 text label = row[8] # string label Y/N """ to_id_ = partial(to_id, w2id=w2id, tag2id=tag2id, ent2id=ent2id) train = list(map(to_id_, train)) valid = list(map(to_id_, valid)) log.info('converted to ids.') # loading glove glove_dir = os.path.dirname(args.wv_file) vectors_path = os.path.join(glove_dir, 'glove.840B.300d.dat') words_path = os.path.join(glove_dir, 'glove.840B.300d_words.pkl') word2idx_path = os.path.join(glove_dir, 'glove.840B.300d_idx.pkl') if not os.path.exists(words_path): build_glove(args.wv_file) log.info('glove built.') vectors = bcolz.open(vectors_path)[:] words = pickle.load(open(words_path, 'rb')) word2idx = pickle.load(open(word2idx_path, 'rb')) glove = {w: vectors[word2idx[w]] for w in words} log.info('glove loaded.') vocab_size = len(vocab) embeddings = np.zeros((vocab_size, args.wv_dim)) embed_counts = np.zeros(vocab_size) embed_counts[:4] = 1 # PAD, SOS, EOS, UNK words_found = 0 for i, word in enumerate(w2id): if word in ["<PAD>", "<SOS>", "<EOS>", "<UNK>"]: continue try: embeddings[i] = glove[word] words_found += 1 except KeyError: embeddings[i] = np.random.normal(scale=0.6, size=(args.wv_dim, )) embed_counts[i] += 1 embeddings /= embed_counts.reshape((-1, 1)) log.info('got embedding matrix.') log.info('{0} words not found.'.format(vocab_size - words_found)) meta = { 'vocab': vocab, 'vocab_tag': vocab_tag, 'vocab_ent': vocab_ent, 'embedding': embeddings.tolist(), 'word2id': w2id, 'id2word': id2w, 'wv_cased': args.wv_cased, } with open('data/coliee_meta_full_ko-de-en.msgpack', 'wb') as f: pickle.dump(meta, f) result = { 'train': train, 'valid': valid } with open('data/coliee_data_full_ko-de-en.msgpack', 'wb') as f: pickle.dump(result, f) log.info('saved to disk.')
def read_bcolz(fname):
    """Load the bcolz array from disk into memory (all at once)."""
    import bcolz
    return bcolz.open(fname)[:]
def readFiles(self):
    self.vectors = bcolz.open(self.preTrainemb_path + self.name + '_wiki.dat')[:]
    self.words = pickle.load(open(self.preTrainemb_path + self.name + '_words_wiki.pkl', 'rb'))
    self.word2index = pickle.load(open(self.preTrainemb_path + self.name + '_w2i_wiki.pkl', 'rb'))
    self.index2word = pickle.load(open(self.preTrainemb_path + self.name + '_i2w_wiki.pkl', 'rb'))
    self.embedding_matrix = pickle.load(open(self.preTrainemb_path + self.name + '_embMat_wiki.pkl', 'rb'))
        vect = np.array(line[1:]).astype(float)
        vectors.append(vect)

vectors = bcolz.carray(vectors[1:].reshape((400000, 300)),
                       rootdir='glove.6B/6B.300.dat', mode='w')
vectors.flush()
pickle.dump(words, open('glove.6B/6B.300_words.pkl', 'wb'))
pickle.dump(word2idx, open('glove.6B/6B.300_idx.pkl', 'wb'))

with open('data/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)
print('Loading vocab...')

vectors = bcolz.open('glove.6B/6B.300.dat')[:]
words = pickle.load(open('glove.6B/6B.300_words.pkl', 'rb'))
word2idx = pickle.load(open('glove.6B/6B.300_idx.pkl', 'rb'))
print('glove is loaded...')
glove = {w: vectors[word2idx[w]] for w in words}

matrix_len = len(vocab)
weights_matrix = np.zeros((matrix_len, 300))
words_found = 0

for i, word in enumerate(vocab.idx2word):
    try:
        weights_matrix[i] = glove[word]
        words_found += 1
    except KeyError:
def main(): """ Training and validation. """ global best_bleu4, epochs_since_improvement, checkpoint, start_epoch, fine_tune_encoder, data_name, word_map, glove_path, emb_dim, rev_word_map # Read word map word_map_file = os.path.join(data_folder, 'WORDMAP_' + data_name + '.json') with open(word_map_file, 'r') as j: word_map = json.load(j) rev_word_map = {v: k for k, v in word_map.items()} #get glove vectors = bcolz.open(f'{glove_path}/6B.300.dat')[:] words = pickle.load(open(f'{glove_path}/6B.300_words.pkl', 'rb')) word2idx = pickle.load(open(f'{glove_path}/6B.300_idx.pkl', 'rb')) glove = {w: vectors[word2idx[w]] for w in words} matrix_len = len(word_map) weights_matrix = np.zeros((matrix_len, emb_dim)) words_found = 0 for i, word in enumerate(word_map.keys()): try: weights_matrix[i] = glove[word] words_found += 1 except KeyError: weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, )) # weights_matrix = np.float64(weights_matrix) # weights_matrix = torch.from_numpy(weights_matrix) # pretrained_embedding = weights_matrix.to(dtype=torch.float) # print(pretrained_embedding.dtype) # if device.type == 'cpu' : # pretrained_embedding = torch.FloatTensor(weights_matrix) # else: # pretrained_embedding = torch.cuda.FloatTensor(weights_matrix) pretrained_embedding = torch.FloatTensor(weights_matrix) # Initialize / load checkpoint if checkpoint is None: decoder = DecoderWithAttention(attention_dim=attention_dim, embed_dim=emb_dim, decoder_dim=decoder_dim, vocab_size=len(word_map), dropout=dropout) decoder.load_pretrained_embeddings( pretrained_embedding ) # pretrained_embeddings should be of dimensions (len(word_map), emb_dim) decoder.fine_tune_embeddings(True) # or False decoder_optimizer = torch.optim.Adam(params=filter( lambda p: p.requires_grad, decoder.parameters()), lr=decoder_lr) encoder = Encoder() encoder.fine_tune(fine_tune_encoder) encoder_optimizer = torch.optim.Adam( params=filter(lambda p: p.requires_grad, encoder.parameters()), lr=encoder_lr) if fine_tune_encoder else None else: checkpoint = torch.load(checkpoint) start_epoch = checkpoint['epoch'] + 1 epochs_since_improvement = checkpoint['epochs_since_improvement'] best_bleu4 = checkpoint['bleu-4'] decoder = checkpoint['decoder'] decoder_optimizer = checkpoint['decoder_optimizer'] encoder = checkpoint['encoder'] encoder_optimizer = checkpoint['encoder_optimizer'] if fine_tune_encoder is True and encoder_optimizer is None: encoder.fine_tune(fine_tune_encoder) encoder_optimizer = torch.optim.Adam(params=filter( lambda p: p.requires_grad, encoder.parameters()), lr=encoder_lr) # Move to GPU, if available decoder = decoder.to(device) encoder = encoder.to(device) # Loss function criterion = nn.CrossEntropyLoss().to(device) # Custom dataloaders normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_loader = torch.utils.data.DataLoader(CaptionDataset( data_folder, data_name, 'TRAIN', transform=transforms.Compose([normalize])), batch_size=batch_size, shuffle=True, num_workers=workers, pin_memory=True) val_loader = torch.utils.data.DataLoader(CaptionDataset( data_folder, data_name, 'VAL', transform=transforms.Compose([normalize])), batch_size=batch_size, shuffle=True, num_workers=workers, pin_memory=True) # Epochs for epoch in range(start_epoch, epochs): # Decay learning rate if there is no improvement for 8 consecutive epochs, and terminate training after 20 if epochs_since_improvement == 20: break if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0: 
adjust_learning_rate(decoder_optimizer, 0.8) if fine_tune_encoder: adjust_learning_rate(encoder_optimizer, 0.8) # One epoch's training # train(train_loader=train_loader, # encoder=encoder, # decoder=decoder, # criterion=criterion, # encoder_optimizer=encoder_optimizer, # decoder_optimizer=decoder_optimizer, # epoch=epoch) # One epoch's validation recent_bleu4 = validate(val_loader=val_loader, encoder=encoder, decoder=decoder, criterion=criterion) # Check if there was an improvement is_best = recent_bleu4 > best_bleu4 best_bleu4 = max(recent_bleu4, best_bleu4) if not is_best: epochs_since_improvement += 1 print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement, )) else: epochs_since_improvement = 0 # Save checkpoint save_checkpoint(data_name, epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer, decoder_optimizer, recent_bleu4, is_best)
def load_array(data_folder, fname):
    fname = os.path.join(data_folder, fname)
    print("Loading from {0} ...".format(fname))
    return bcolz.open(fname)[:]
def get_collection_timestamp(config, path):
    import bcolz
    _, meta_full_path = get_paths(config.root_path, path)
    meta_data = bcolz.open(meta_full_path)[:][0]
    return meta_data['created']
def load_array(fname):
    "load np matrix or array"
    return bcolz.open(fname)[:]
def read_time_series_cache_from_disk(self, fname, engine='hdf5', start_date=None, finish_date=None, db_server=constants.db_server, db_port=constants.db_port, username=constants.db_username, password=constants.db_password): """Reads time series cache from disk in either HDF5 or bcolz Parameters ---------- fname : str (or list) file to be read from engine : str (optional) 'hd5' - reads HDF5 files (default) 'arctic' - reads from Arctic/MongoDB database 'bcolz' - reads from bcolz file (not fully implemented) 'parquet' - reads from Parquet start_date : str/datetime (optional) Start date finish_date : str/datetime (optional) Finish data db_server : str IP address of MongdDB (default '127.0.0.1') Returns ------- DataFrame """ logger = LoggerManager.getLogger(__name__) data_frame_list = [] if not (isinstance(fname, list)): if '*' in fname: fname = glob.glob(fname) else: fname = [fname] for fname_single in fname: logger.debug("Reading " + fname_single + "..") if engine == 'parquet' and '.gzip' not in fname_single and '.parquet' not in fname_single: fname_single = fname_single + '.parquet' if (engine == 'bcolz'): try: name = self.get_bcolz_filename(fname_single) zlens = bcolz.open(rootdir=name) data_frame = zlens.todataframe() data_frame.index = pandas.DatetimeIndex(data_frame['DTS_']) data_frame.index.name = 'Date' del data_frame['DTS_'] # convert invalid characters (which Bcolz can't deal with) to more readable characters for pandas data_frame.columns = self.find_replace_chars( data_frame.columns, _replace_chars, _invalid_chars) data_frame.columns = [x[2:] for x in data_frame.columns] except: data_frame = None elif (engine == 'redis'): fname_single = os.path.basename(fname_single).replace('.', '_') msg = None try: # for pyarrow context = pa.default_serialization_context() r = redis.StrictRedis(host=db_server, port=db_port, db=0) # is there a compressed key stored?) 
k = r.keys('comp_*_' + fname_single) # if so, then it means that we have stored it as a compressed object # if have more than 1 element, take the last (which will be the latest to be added) if (len(k) >= 1): k = k[-1].decode('utf-8') comp = r.get(k) siz = int(k.split('_')[1]) dec = pa.decompress(comp, codec='lz4', decompressed_size=siz) msg = context.deserialize(dec) else: msg = r.get(fname_single) # print(fname_single) if msg is not None: msg = context.deserialize(msg) # logger.warning("Key " + fname_single + " not in Redis cache?") except Exception as e: logger.info("Cache not existent for " + fname_single + " in Redis: " + str(e)) if msg is None: data_frame = None else: logger.info('Load Redis cache: ' + fname_single) data_frame = msg # pandas.read_msgpack(msg) elif (engine == 'arctic'): socketTimeoutMS = 2 * 1000 import pymongo from arctic import Arctic fname_single = os.path.basename(fname_single).replace('.', '_') logger.info('Load Arctic/MongoDB library: ' + fname_single) if username is not None and password is not None: c = pymongo.MongoClient( host="mongodb://" + username + ":" + password + "@" + str(db_server) + ":" + str(db_port), connect=False ) # , username=username, password=password) else: c = pymongo.MongoClient(host="mongodb://" + str(db_server) + ":" + str(db_port), connect=False) store = Arctic(c, socketTimeoutMS=socketTimeoutMS, serverSelectionTimeoutMS=socketTimeoutMS) # Access the library try: library = store[fname_single] if start_date is None and finish_date is None: item = library.read(fname_single) else: from arctic.date import DateRange item = library.read( fname_single, date_range=DateRange( start_date.replace(tzinfo=None), finish_date.replace(tzinfo=None))) c.close() logger.info('Read ' + fname_single) data_frame = item.data except Exception as e: logger.warning('Library may not exist or another error: ' + fname_single + ' & message is ' + str(e)) data_frame = None elif self.path_exists(self.get_h5_filename(fname_single)): store = pandas.HDFStore(self.get_h5_filename(fname_single)) data_frame = store.select("data") if ('intraday' in fname_single): data_frame = data_frame.astype('float32') store.close() elif self.path_exists(fname_single) and '.csv' in fname_single: data_frame = pandas.read_csv(fname_single, index_col=0) data_frame.index = pd.to_datetime(data_frame.index) elif self.path_exists(fname_single): data_frame = self.read_parquet(fname_single) # data_frame = pandas.read_parquet(fname_single) data_frame_list.append(data_frame) if len(data_frame_list) == 1: return data_frame_list[0] return data_frame_list
        state_change_label = torch.from_numpy(state_change_label).view(-1).long()
        loss_state_change_label = nn.CrossEntropyLoss(self.state_label_weights)(
            self.state_change_label_logits, state_change_label)
        return loss_state_change_label * coefficient

    def mse_loss(self, target_preds, coefficient):
        target_preds = torch.from_numpy(target_preds)
        return nn.functional.mse_loss(self.state_change_label_logits,
                                      target_preds) * coefficient


vectors = bcolz.open('data/6B.100.dat')[:]
words = pickle.load(open('data/6B.100_words.pkl', 'rb'))
word2idx = pickle.load(open('data/6B.100_idx.pkl', 'rb'))
glove = {w: vectors[word2idx[w]] for w in words}

with open("data/train_samples.pkl", "rb") as fp:
    train_samples = pickle.load(fp)

with open("data/test_samples.pkl", "rb") as fp:
    test_samples = pickle.load(fp)

with open("data/dev_samples.pkl", "rb") as fp:
    dev_samples = pickle.load(fp)

with open("data/unlabeled_samples.pkl", "rb") as fp:
    vectors.append(vect)

# Construct pickle files
vectors = bcolz.carray(vectors[1:].reshape((400000, 50)),
                       rootdir=f'/Users/nilslager/Desktop/gitit.50.dat', mode='w')
vectors.flush()
pickle.dump(
    words,
    open(f'/Users/nilslager/Desktop/Projekt1/bibliotekN_words.pkl', 'wb'))
pickle.dump(
    word2idx,
    open(f'/Users/nilslager/Desktop/Projekt1/bibliotekN_index.pkl', 'wb'))

# Create vector space
vectors = bcolz.open(f'/Users/nilslager/Desktop/6B.50.dat')[:]
words = pickle.load(
    open(f'/Users/nilslager/Desktop/Projekt1/bibliotekN_words.pkl', 'rb'))
word2idx = pickle.load(
    open(f'/Users/nilslager/Desktop/Projekt1/bibliotekN_index.pkl', 'rb'))
glove = {w: vectors[word2idx[w]] for w in words}

# Test
print(glove["the"])

# Model
matrix_len = len(glove)
weights_matrix = np.zeros((matrix_len, 50))
words_found = 0

# Glove -> Weights matrix
def __init__(self, fp):
    conn = sql.connect(fp + '//' + 'data.sqlite')
    self.c = conn.cursor()
    self.tb = bcolz.open(fp + '//' + 'data_d')
def load_array(f):
    # returns the on-disk bcolz carray itself; slicing with [:] would copy it into an in-memory numpy array
    return bcolz.open(f)
def load_array(fname):
    return bcolz.open(fname)[:]
def __init__(self, f):
    self._dates = pd.Index(pd.Timestamp(str(d)) for d in bcolz.open(f, 'r'))
def read_time_series_cache_from_disk(self, fname, engine='hdf5',
                                     start_date=None, finish_date=None,
                                     db_server='127.0.0.1'):
    """
    read_time_series_cache_from_disk - Reads time series cache from disk in either HDF5 or bcolz

    Parameters
    ----------
    fname : str
        file to be read from

    Returns
    -------
    DataFrame
    """

    if (engine == 'bcolz'):
        try:
            name = self.get_bcolz_filename(fname)
            zlens = bcolz.open(rootdir=name)
            data_frame = zlens.todataframe()

            data_frame.index = pandas.DatetimeIndex(data_frame['DTS_'])
            data_frame.index.name = 'Date'
            del data_frame['DTS_']

            # convert invalid characters (which Bcolz can't deal with) to more readable characters for pandas
            data_frame.columns = self.find_replace_chars(
                data_frame.columns, _replace_chars, _invalid_chars)
            data_frame.columns = [x[2:] for x in data_frame.columns]

            return data_frame
        except:
            return None

    elif (engine == 'arctic'):
        socketTimeoutMS = 2 * 1000

        import pymongo
        from arctic import Arctic

        fname = os.path.basename(fname).replace('.', '_')
        self.logger.info('Load MongoDB library: ' + fname)

        c = pymongo.MongoClient(db_server, connect=False)
        store = Arctic(c, socketTimeoutMS=socketTimeoutMS,
                       serverSelectionTimeoutMS=socketTimeoutMS)

        # Access the library
        library = store[fname]

        if start_date is None and finish_date is None:
            item = library.read(fname)
        else:
            from arctic.date import DateRange
            item = library.read(fname, date_range=DateRange(
                start_date, finish_date))

        c.close()

        self.logger.info('Read ' + fname)

        return item.data

    elif os.path.isfile(self.get_h5_filename(fname)):
        store = pandas.HDFStore(self.get_h5_filename(fname))
        data_frame = store.select("data")

        if ('intraday' in fname):
            data_frame = data_frame.astype('float32')

        store.close()

        return data_frame

    return None
def test_load_bcolz_embeddings():
    bcolz_embeddings_path = '/home/peng/Workspace/data/embeddings/bcolz_vectors/bcolz_embeddings.dat'
    word2idx_path = '/home/peng/Workspace/data/embeddings/word2idx.pkl'
    vectors = bcolz.open(f'{bcolz_embeddings_path}', mode='r')
    word2idx = pickle.load(open(word2idx_path, 'rb'))
    print(vectors[word2idx['house']])