def eod_position_pd_run(pricing_environment, valuation_date):
    r = utils.get_redis_conn(redis_ip)
    position_result = r.get(EOD_BASIC_POSITIONS)
    position = pd.read_msgpack(position_result)
    risk_result = r.get(EOD_BASIC_RISKS_ + pricing_environment.lower())
    risk = pd.read_msgpack(risk_result)
    cash_flow_result = r.get(EOD_BASIC_CASH_FLOW)
    cash_flow = pd.read_msgpack(cash_flow_result)
    listed_option_positions_result = r.get(
        EOD_BASIC_LISTED_OPTION_POSITION_ + pricing_environment.lower())
    listed_option_positions = pd.read_msgpack(listed_option_positions_result)
    headers = utils.login(data_resource_ip, login_body)
    rpt = []
    if not position.empty:
        rpt = eod_position_report_pd.eod_position_report(
            position, risk, cash_flow, listed_option_positions,
            pricing_environment, data_resource_ip, headers, valuation_date)
    position_result = JSONEncoder().encode(rpt)
    r.set(EOD_CUSTOM_POSITION_ + pricing_environment.lower(),
          str(position_result))
def msgpack_assertMeta(filename, frames=None, redo=False):
    '''Asserts that the .meta file for a given .msg file exists and returns
    the data in the .meta file once it exists'''
    meta_out_file = filename.replace(".msg", ".meta")
    print(meta_out_file)
    meta_frames = None
    if os.path.exists(meta_out_file) and not redo:
        # Need to check for latin encodings due to weird pandas default
        try:
            meta_frames = pd.read_msgpack(meta_out_file)
        except UnicodeDecodeError:
            meta_frames = pd.read_msgpack(meta_out_file, encoding='latin-1')
    if meta_frames is None:
        if frames is None:
            print("Bulk reading .msg for metaData assertion. "
                  "Be patient, reading in slices not supported.")
            print(filename)
            # Need to check for latin encodings due to weird pandas default
            try:
                frames = pd.read_msgpack(filename)
            except UnicodeDecodeError:
                frames = pd.read_msgpack(filename, encoding='latin-1')
        meta_frames = {"NumValues": frames["NumValues"]}
    if not os.path.exists(meta_out_file) or redo:
        pd.to_msgpack(meta_out_file, meta_frames)
    return meta_frames
def recommend_by_shop(self, start_date, end_date):
    start_date = DateStrToDate(start_date)
    end_date = DateStrToDate(end_date, hour=23, minute=59, seconds=59)
    case_data = pd.read_msgpack(red.get('case_data'))
    payment_logs = pd.read_msgpack(red.get('payment_logs'))
    shop_reflect = json.loads(red.get('shop_data').decode())
    result_list = []
    data = case_data[case_data['apply_date'] <= end_date]
    payment_limit = payment_logs[(payment_logs['date'] >= start_date)
                                 & (payment_logs['date'] < end_date)]
    data_execute = Data_Execute()
    date_index = "%s/%s" % (start_date.strftime('%Y-%m-%d'),
                            end_date.strftime('%m-%d'))
    for shop_id in shop_reflect:
        shop_data = data_execute.recommend_by_shop(
            table=data[data['shop_id'] == int(shop_id)],
            payment_logs=payment_limit[payment_limit['shop_id'] ==
                                       int(shop_id)],
            date_index=date_index,
            data_name=shop_reflect[str(shop_id)],
            start_date=start_date)
        result_list.append(shop_data)
    return result_list
def learn(basepath, features_file, labels_file):
    # Load the data
    print 'Loading data...'
    features_data = pd.read_msgpack(load_data(basepath, features_file)['data'])
    labels_data = pd.read_msgpack(load_data(basepath, labels_file)['data'])
    df = pd.concat([features_data, labels_data], axis=1)

    # Process features
    samples, labels = preprocess.process_data(df)

    # How many samples are we going to leave out for the test set?
    nb_test = int(len(labels) * 0.2)
    split = len(labels) - nb_test

    # Prepare training and test sets
    X_train = np.array(samples[:split])
    y_train = labels[:split]
    X_test = np.array(samples[split + 1:])
    y_test = labels[split + 1:]
    print len(X_train), 'train sequences'
    print len(X_test), 'test sequences'

    # How many classes?
    num_classes = np.max(labels) + 1
    print num_classes, 'classes'

    # Train model
    train_and_save(X_train, X_test, y_train, y_test, num_classes, basepath)
def concat_data():
    # Column names in the raw TWSE files are Chinese:
    # 日期 = date, 開盤指數 = open, 最高指數 = high, 最低指數 = low,
    # 收盤指數 = close, 時間 = timestamp, 累積委託買進筆數 = cumulative buy
    # orders, 累積委託賣出筆數 = cumulative sell orders,
    # 累積成交金額 = cumulative traded value.
    df_ohlc = pd.read_msgpack(f'{DATASET_PATH}/twse_ohlc.msgpack').rename(
        columns={
            '日期': 'date',
            '開盤指數': 'open',
            '最高指數': 'high',
            '最低指數': 'low',
            '收盤指數': 'close'
        }).reset_index(drop=True)
    df_ob = pd.read_msgpack(f'{DATASET_PATH}/twse_orderbook.msgpack')
    df_ob_open = df_ob[df_ob['時間'].map(
        lambda x: x.time() == dt.time(9, 0, 0))].copy()
    df_ob_close = df_ob[df_ob['時間'].map(
        lambda x: x.time() == dt.time(13, 30, 0))].copy()
    df_ob_open['date'] = df_ob_open['時間'].map(lambda x: x.date())
    df_ob_open = df_ob_open.rename(columns={
        '累積委託買進筆數': 'order_buy',
        '累積委託賣出筆數': 'order_sell'
    })[['date', 'order_buy', 'order_sell']]
    df_ob_close['date'] = df_ob_close['時間'].map(lambda x: x.date())
    df_ob_close = df_ob_close.rename(columns={'累積成交金額': 'volume'})[[
        'date', 'volume'
    ]]
    df_twse = df_ohlc.set_index('date').join(
        df_ob_open.set_index('date')).join(
            df_ob_close.set_index('date')).dropna()
    print('data concat done.')
    return df_twse
def load(self, fn):
    self.xs_not_decomposed = pd.read_msgpack(fn + '.X.msg')
    self.xs_not_decomposed = self.xs_not_decomposed.to_dict()
    self.ys_not_decomposed = pd.read_msgpack(fn + '.Y.msg')
    self.idx = self.ys_not_decomposed.index.max()
    self.ys_not_decomposed = self.ys_not_decomposed.to_dict()
    self.start()
def main():
    with TLOG('read data'):
        df = pd.read_msgpack('data/z6_ts_events.msgpack')
        df_discount = pd.read_msgpack('data/z6_ts_discount.msgpack')
        ids = np.load('data/z6_ts_user_id_merchant_id.npy')
    feature = UserMerchantFeature(df, df_discount=df_discount, keys=ids)
    df = feature.process()
    df.to_msgpack('data/z6_ts_feature_user_merchant.msgpack')
def Files_Read_Qo_Q190(link, listEvents=False, umbral=0.5, MinDays=15):
    # Read observed streamflow
    try:
        Qo = pd.read_msgpack('/Users/nicolas/LambdaExp/BaseData/USGS/' +
                             link + '.msg')
    except:
        # Read information from the links
        LinksData = pd.read_msgpack('LinkData.msg')
        USGS_id = LinksData.index[LinksData['Link'] == int(link)].values[0]
        # Read from the web.
        print('Warning: reading from the web...')
        Qo = db.WEB_Get_USGS(USGS_id, '2008-01-01', '2018-12-30')
        Qo.to_msgpack('/Users/nicolas/LambdaExp/BaseData/USGS/' + link +
                      '.msg')
        print('Message: Streamflow saved as a msgpack as link:' + link)
    Qo = Qo.resample('H').mean()
    # Read simulated streamflow
    Qs = pd.read_msgpack('/Users/nicolas/LambdaExp/BaseData/HLM190/' + link +
                         '.msg')
    Qs = Qs.resample('H').mean()
    # Find events on the shared time index
    shared = Qo.index.intersection(Qs.index)
    Qs = Qs[shared]
    Qo = Qo[shared]
    pos1, pos2 = ser.Runoff_FindEvents(Qo, Qs, umbral=umbral)
    # Estimate the median annual maximum streamflow and the baseflow
    QmaxA = np.median(Qo.resample('A').max())
    Qbase = Qo.resample('A').apply(
        lambda x: np.percentile(x[np.isfinite(x)], 50)).mean()
    # Select only the good events
    pos1V2 = []
    pos2V2 = []
    for i, j in zip(pos1, pos2):
        M = Qo[i:j].max()
        if M > QmaxA * umbral:
            # Check the amount of NaNs.
            NaNPercent = Qo[i:j][np.isnan(Qo[i:j])].size / Qo[i:j].size
            if NaNPercent < 0.2:
                # Trim the time between the start of the event and the peak
                while Qo[i] < Qbase * 2.5:
                    i = i + pd.Timedelta('5h')
                Td = j - i
                if Td.days < MinDays:
                    pos1V2.append(i)
                    pos2V2.append(j)
    # List the events
    if listEvents:
        c = 0
        for p1, p2 in zip(pos1V2, pos2V2):
            qp = '%.2f ' % np.nanmax(Qo[p1:p2])
            print(c, qp, p1)
            c += 1
    # Update the Evento class
    Evento.Qobs = Qo
    Evento.Qsim = Qs
    Evento.pos1 = pos1V2
    Evento.pos2 = pos2V2
def load(self, only_get=None):
    train_data = pandas.read_msgpack('cache/dataset/train.msg')
    merge_data = pandas.read_msgpack('cache/dataset/merge.msg')
    valid_data = pandas.read_msgpack('cache/dataset/valid.msg')
    if only_get is not None:
        return (train_data.head(only_get), merge_data.head(only_get),
                valid_data.head(only_get))
    else:
        return train_data, merge_data, valid_data
def _load(self, as_df):
    with gzip.open('rf_cache/dataset/train.msg.gz', 'r') as f:
        if as_df:
            td = pandas.read_msgpack(f)
        else:
            td = pandas.read_msgpack(f).values
    folds = np.load('rf_cache/folds.npz')['data']
    return td, folds
def read_events(self):
    split = self.split.name
    with TLOG('read events'):
        df_events = pd.read_msgpack(f'data/z6_ts_{split}_events.msgpack')
        df_discount = pd.read_msgpack(f'data/z6_ts_{split}_discount.msgpack')
        df = with_discount(df_events, df_discount)
    with TLOG('build index events'):
        events = IndexedEvents(df, key_column=self.key_column)
    return events
def main():
    with TLOG('read data'):
        df = pd.read_msgpack('data/z6_ts_events.msgpack')
        df_discount = pd.read_msgpack('data/z6_ts_discount.msgpack')
        coupon_ids = np.load('data/z6_ts_coupon_id.npy')
    coupon_feature = CouponFeature(df, df_discount=df_discount,
                                   keys=coupon_ids)
    df = coupon_feature.process()
    df.to_msgpack('data/z6_ts_feature_coupon.msgpack')
def read_input_to_pandas(self, columnList=[], indexCol="Sample"):
    if self.isGzipped:
        with gzip.open(self.filePath) as path:
            df = pd.read_msgpack(path)
    else:
        df = pd.read_msgpack(self.filePath)
    df = df.reset_index()
    if len(columnList) > 0:
        df = df[columnList]
    return df
def get_human_22_fake_genome():
    from mbf_genomics.testing import MockGenome
    import gzip

    genes = pd.read_msgpack(
        gzip.GzipFile(get_sample_data(Path("mbf_align/hs_22_genes.msgpack.gz")))
    ).reset_index()
    tr = pd.read_msgpack(
        gzip.GzipFile(
            get_sample_data(Path("mbf_align/hs_22_transcripts.msgpack.gz")))
    ).reset_index()
    return MockGenome(df_genes=genes, df_transcripts=tr,
                      chr_lengths={"22": 50_818_468})
def get_translational_efficiency(project_id):
    from main import get_db
    rdb = get_db()
    rp = rdb.get("{}_rpkm_rp".format(project_id))
    rna = rdb.get("{}_rpkm_rna".format(project_id))
    list_of_samples = []
    if rp is None or rna is None:
        return render_template(
            "translational_efficiency.html",
            samples=list_of_samples,
            error="No data for project: {}".format(project_id))
    rp_df = pd.read_msgpack(rp)
    rna_df = pd.read_msgpack(rna)
    samples = list(rp_df.columns)
    samples.remove('gene_name')
    list_of_samples = samples
    if request.method == "GET":
        return render_template("translational_efficiency.html",
                               samples=list_of_samples)
    selected_samples = request.form.getlist('selected_samples')
    if not selected_samples:
        return render_template("translational_efficiency.html",
                               samples=list_of_samples,
                               error="No samples selected")
    apply_filter = request.form.get('apply_filter') == "True"
    min_y = int(request.form.get('min_y', -100))
    max_y = int(request.form.get('max_y', 100))
    plot_series = []
    for sample in selected_samples:
        gene_names = rp_df['gene_name'].tolist()
        rp = rp_df[sample].astype(float).tolist()
        rna = rna_df[sample].astype(float).tolist()
        df = pd.DataFrame(columns=['gene_name', 'x', 'y'])
        df['gene_name'] = gene_names
        df['rpkm_rna'] = rna
        df['rpkm_rp'] = rp
        df['log2(rp)'] = np.log2(df['rpkm_rp'])
        df['log2(rna)'] = np.log2(df['rpkm_rna'])
        df['x'] = df['log2(rna)']
        df['y'] = df['log2(rna)'] / df['log2(rp)']
        if apply_filter:
            df = df.loc[df['y'] >= min_y]
            df = df.loc[df['y'] <= max_y]
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.dropna()
        series = {
            'name': sample,
            'data': df.to_dict('records')
        }
        plot_series.append(series)
    return render_template("translational_efficiency.html",
                           samples=list_of_samples,
                           selected_samples=selected_samples,
                           apply_filter=apply_filter,
                           min_y=min_y,
                           max_y=max_y,
                           plot_series=plot_series)
def FeedToRedis(r=r, source_dir="intermediate-results/", mode="batch"):
    if mode not in ["batch", "recent"]:
        print "[Error] the mode is not correct!"
        exit()
    result_filename = "mirror-news-ann-distance-20.result"
    msg_filename = "news-id-tfidf50-topic-category.msg"
    if mode == "recent":
        result_filename = "recent-" + result_filename
        msg_filename = "recent-" + msg_filename
    if os.path.exists(source_dir + result_filename):
        f = open(source_dir + result_filename, 'r')
    else:
        print("[Warning] Cannot find the latest list of related news. "
              "Use the fallback list now. Please run daily_batch.sh to get "
              "the latest related news")
        f = open('fallback/fallback.result', 'r')
    if os.path.exists(source_dir + msg_filename):
        df = pd.read_msgpack(source_dir + msg_filename)
    else:
        print("[Warning] Cannot find the latest metadata of related news. "
              "Use the fallback metadata now. Please run daily_batch.sh to "
              "get the latest metadata")
        df = pd.read_msgpack('fallback/fallback.msg')
    print("Loading the KNN list...")
    news_dict = dict()
    for line in f:
        news_id, knn_raw = line.replace("\n", "").split("\t")
        knn_list = json.loads(knn_raw)
        r_news = []
        for (r_id, _) in knn_list:
            r_dict = get_facets(df, r_id)
            r_news.append(r_dict)
        n_dict = get_facets(df, news_id)
        n_dict['knn_list'] = knn_list
        n_dict['related_news'] = r_news
        news_dict["related-news-v2-" + news_id] = json.dumps(n_dict)
    # If you see the error "MISCONF Redis is configured to save RDB
    # snapshots", try this on redis-cli:
    #     config set stop-writes-on-bgsave-error no
    print "Total: " + str(len(news_dict))
    print "Feed all to Redis..."
    r.mset(news_dict)
    print "Done!"
def histogram(wild_card='*thrown', key='energy', bins=10):
    paths = glob(wild_card)
    # The first file only fixes the bin edges; counts are accumulated over
    # all paths (including the first one) in the loop below.
    df = pd.read_msgpack(paths[0])
    df.dropna(inplace=True)
    counts, bins = np.histogram(df[key], bins=bins)
    total_bincounts = np.zeros(len(bins) - 1)
    for path in paths:
        df = pd.read_msgpack(path)
        df.dropna(inplace=True)
        bincounts = np.histogram(df[key], bins=bins)[0]
        total_bincounts += bincounts
    return total_bincounts, bins
def load(file_name, mode=None):
    if mode == 'HD5':
        df = pd.read_hdf(file_name, 'dataframe')
    elif mode == 'msgpack':
        df = pd.read_msgpack(file_name)
    elif mode == 'parquet':
        df = pd.read_parquet(file_name, engine='pyarrow',
                             use_pandas_metadata=True)
    elif mode == 'pickle.gzip':
        df = pd.read_pickle(file_name, compression='gzip')
    elif mode == 'feather':
        df = read_feather(file_name)
    else:
        raise ValueError('Unsupported mode: {}'.format(mode))
    return df
def msgpack_revert(self, val, cls=dict, checker=None, **kw):
    if cls == pandas_df_type.__name__ or cls == pandas_df_type:
        import pandas
        if isinstance(val, requests.Response):
            return pandas.read_msgpack(val.content, **keep(kw, []))
        else:
            return pandas.read_msgpack(io.BytesIO(val), **keep(kw, []))
    if isinstance(val, requests.Response):
        val = val.content
    import msgpack
    val = msgpack.loads(val, **keep(kw, []))
    val = self.schema_revert(val, cls, checker=checker)
    return val
def Statistic_index(self, statistic_date, statistic_date_end, compare_date,
                    compare_date_end, shop_get):
    statistic_date = DateStrToDate(statistic_date)
    statistic_end_date = DateStrToDate(statistic_date_end, hour=23,
                                       minute=59, seconds=59)
    compare_date = DateStrToDate(compare_date)
    compare_end_date = DateStrToDate(compare_date_end, hour=23, minute=59,
                                     seconds=59)
    if not shop_get or int(shop_get) == 0:
        shop_ids = [i for i in range(2, 24)]
    else:
        shop_ids = [int(shop_get)]
    case_data = pd.read_msgpack(red.get('case_data'))
    payment_logs = pd.read_msgpack(red.get('payment_logs'))
    data_execute = Data_Execute()
    statistic_data = case_data[
        (case_data['apply_date'] >= statistic_date)
        & (case_data['apply_date'] < statistic_end_date)
        & (case_data['shop_id'].isin(shop_ids))]
    statistic_compare = case_data[
        (case_data['apply_date'] >= compare_date)
        & (case_data['apply_date'] < compare_end_date)
        & (case_data['shop_id'].isin(shop_ids))]
    payment_data = payment_logs[(payment_logs['date'] >= statistic_date)
                                & (payment_logs['date'] < statistic_end_date)
                                & (payment_logs['shop_id'].isin(shop_ids))]
    payment_compare = payment_logs[
        (payment_logs['date'] >= compare_date)
        & (payment_logs['date'] < compare_end_date)
        & (payment_logs['shop_id'].isin(shop_ids))]
    statistic = data_execute.index_stactic(
        statistic_data, 1, statistic_date.strftime('%Y-%m-%d'),
        index=True, payment_logs=payment_data)
    compare = data_execute.index_stactic(
        statistic_compare, 2, compare_date.strftime('%Y-%m-%d'),
        index=True, payment_logs=payment_compare)
    result_list = [statistic, compare]
    return result_list
def main(cls, split):
    df_raw_offline = pd.read_msgpack('data/z1_raw_offline.msgpack')
    df_raw_test = pd.read_msgpack('data/z1_raw_test.msgpack')
    user_id_index = np.unique(
        np.concatenate([
            df_raw_offline['user_id'].unique(),
            df_raw_test['user_id'].unique()
        ]))
    events = cls(user_id_index)
    LOG.info('events feed_offline')
    events.feed_offline(df_raw_offline)
    df_online_coupon = pd.read_msgpack('data/z1_raw_online_coupon.msgpack')
    LOG.info('events feed_online_coupon')
    events.feed_online_coupon(df_online_coupon)
    df_online_click = pd.read_msgpack('data/z1_raw_online_click.msgpack')
    LOG.info('events feed_online_click')
    events.feed_online_click(df_online_click)
    LOG.info('events feed_test')
    events.feed_test(df_raw_test)
    LOG.info('events to_frame')
    df = events.to_frame(split)
    df.to_msgpack(f'data/z6_ts_{split.name}_events.msgpack')
    LOG.info('build_discount_table')
    df_discount = cls.build_discount_table(df)
    df_discount.to_msgpack(f'data/z6_ts_{split.name}_discount.msgpack')
    df_offline_events = df[df['event_type'].isin([
        'offline_receive_coupon',
        'offline_buy_with_coupon',
        'offline_buy_without_coupon',
    ])]
    LOG.info('build_index_of user_id')
    user_id_index = cls.build_index_of(df_offline_events, 'user_id')
    np.save(f'data/z6_ts_{split.name}_user_id.npy', user_id_index)
    for key in ['merchant_id', 'coupon_id']:
        LOG.info('build_index_of {}', key)
        arr = cls.build_index_of(df_offline_events, key)
        np.save('data/z6_ts_{}_{}.npy'.format(split.name, key), arr)
    LOG.info('build_index_of user_id_merchant_id')
    arr = cls.build_index_of(df_offline_events, ['user_id', 'merchant_id'])
    np.save(f'data/z6_ts_{split.name}_user_id_merchant_id.npy', arr)
    LOG.info('build_index_of user_id_coupon_id')
    arr = cls.build_index_of(df_offline_events, ['user_id', 'coupon_id'])
    np.save(f'data/z6_ts_{split.name}_user_id_coupon_id.npy', arr)
def histogram2d(wild_card='*thrown', x='energy', y='number_photons', bins=10):
    paths = glob(wild_card)
    # The first file only fixes the bin edges; counts are accumulated over
    # all paths (including the first) in the loop below. Note that only the
    # x-edges are kept and reused as `bins` for both axes.
    df = pd.read_msgpack(paths[0])
    df.dropna(inplace=True)
    _, bins, _ = np.histogram2d(x=df[x], y=df[y], bins=bins)
    total_counts = 0
    for path in paths:
        df = pd.read_msgpack(path)
        df.dropna(inplace=True)
        bincounts = np.histogram2d(x=df[x], y=df[y], bins=bins)[0]
        total_counts += bincounts
    return total_counts, bins
def _getStore(f, storeType):
    '''Helper Function - Gets the HDFStore or frames for the file and
    storeType'''
    store = None
    frames = None
    if storeType == "hdf5":
        store = pd.HDFStore(f)
    elif storeType == "msgpack":
        print("Bulk reading .msg. Be patient, reading in slices not "
              "supported.")
        sys.stdout.flush()
        # Need to check for latin encodings due to weird pandas default
        try:
            frames = pd.read_msgpack(f)
        except UnicodeDecodeError:
            frames = pd.read_msgpack(f, encoding='latin-1')
    return store, frames
def read_dataset():
    with TLOG('read dataframes'):
        df_test_full = pd.read_msgpack(
            f'data/z6_ts_{SPLIT.name}_merged_test.msgpack')
        df_train_full = pd.read_msgpack(
            f'data/z6_ts_{SPLIT.name}_merged_train.msgpack')
    # columns
    features = list(
        df_test_full.columns.difference([
            'user_id',
            'merchant_id',
            'coupon_id',
            'discount_name',
            'date',
            'label',
        ]))
    features = GOOD_FEATURES  # overrides the full column set above
    # features = list(
    #     set(itertools.chain(*FEATULE_LEVELS.values()))
    #     - set(BAD_FEATURES)
    # )
    print(pretty(features))
    test_submit_cols = ['user_id', 'coupon_id', 'date']
    if TEST_HAS_LABEL:
        test_submit_cols += ['label']
    train_submit_cols = ['user_id', 'coupon_id', 'date', 'label']
    # test submit
    df_test = df_test_full[features]
    df_submit = format_date(df_test_full.loc[df_test.index, test_submit_cols])
    LOG.info('df_test {}', df_test.shape)
    LOG.info('df_submit {}', df_submit.shape)
    # split train validate
    mask = np.random.rand(len(df_train_full)) < 0.05
    df_validate = df_train_full.loc[mask, features + ['label']]
    df_validate_submit = format_date(df_train_full.loc[mask,
                                                       train_submit_cols])
    df_train = df_train_full.loc[~mask, features + ['label']]
    df_train_submit = format_date(df_train_full.loc[~mask,
                                                    train_submit_cols])
    LOG.info('df_train {}', df_train.shape)
    LOG.info('df_train_submit {}', df_train_submit.shape)
    LOG.info('df_validate {}', df_validate.shape)
    LOG.info('df_validate_submit {}', df_validate_submit.shape)
    df_train_x, df_train_y = split_feature_label(df_train)
    df_validate_x, df_validate_y = split_feature_label(df_validate)
    ret = (df_train_x, df_train_y, df_validate_x, df_validate_y, df_test,
           df_submit, df_validate_submit, df_train_submit)
    return [x.copy() for x in ret]
def read_all(self, freq, **kwargs):
    """
    Read the entire timeseries record for all matching timeseries
    instances.

    :param freq: Timeseries data frequency.
    :type freq: string
    :param kwargs: Attributes to match against timeseries instances
                   (e.g. source, measurand).
    :type kwargs: kwargs
    :returns: pandas.DataFrame -- Timeseries data.
    """
    url = self.__attach_kwargs_to_url(
        self.server + '/read_all/{0}.{1}'.format(freq, self.format),
        kwargs
    )
    if self.format == 'msgpack':
        return pd.read_msgpack(urlopen(url))
    elif self.format == 'json':
        return pd.read_json(urlopen(url))
    else:
        raise NotImplementedError(
            'Unsupported format: {0}'.format(self.format))
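# A hypothetical call to the read_all method above, assuming `ts_client`
# is an instance of this client class constructed with a server URL and
# format='msgpack'; the source/measurand filters echo the attributes the
# docstring names as examples.
df_all = ts_client.read_all('daily', source='gauge_7', measurand='flow')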
def _load_chunks(self):
    for chunk in pandas.read_msgpack(self.buf, iterator=True):
        for col in self.schema.cols:
            if isinstance(col, big_dt):
                # converting big_dt column
                chunk[col.name] = chunk[col.name].map(
                    datetime.datetime.fromtimestamp, na_action='ignore')
        yield chunk
def testrun(gname):
    method = 'tepitope'  # 'iedbmhc1' 'netmhciipan'
    path = 'test'
    gfile = os.path.join(genomespath, '%s.gb' % gname)
    df = sequtils.genbank2Dataframe(gfile, cds=True)
    # names = list(df.locus_tag[:1])
    names = ['VP24']
    alleles1 = [
        "HLA-A*02:02", "HLA-A*11:01", "HLA-A*32:07", "HLA-B*15:17",
        "HLA-B*51:01", "HLA-C*04:01", "HLA-E*01:03"
    ]
    alleles2 = [
        "HLA-DRB1*0101", "HLA-DRB1*0305", "HLA-DRB1*0812", "HLA-DRB1*1196",
        "HLA-DRB1*1346", "HLA-DRB1*1455", "HLA-DRB1*1457", "HLA-DRB1*1612",
        "HLA-DRB4*0107", "HLA-DRB5*0203"
    ]
    P = base.getPredictor(method)
    P.iedbmethod = 'IEDB_recommended'  # 'netmhcpan'
    P.predictProteins(df, length=11, alleles=alleles2, names=names,
                      save=True, path=path)
    f = os.path.join('test', names[0] + '.mpk')
    df = pd.read_msgpack(f)
    P.data = df
    # b = P.get_binders(data=df)
    # print b[:20]
    base.getScoreDistributions(method, path)
    return
def read_python2_hdf5_dataframe(h5_filepath, key):
    h5_filepath = os.path.realpath(h5_filepath)
    msgpack_filepath = h5_filepath + '.' + key.replace('/', '_') + '.msgpack'
    filepath_time = os.path.getmtime(h5_filepath)
    if (not os.path.exists(msgpack_filepath)
            or filepath_time > os.path.getmtime(msgpack_filepath)):
        logging.info("msgpack file {} doesn't exist, creating".format(
            msgpack_filepath))
        convert_python2_hdf5_to_msgpack(h5_filepath, key, msgpack_filepath)
    else:
        logging.info('msgpack file {} exists'.format(msgpack_filepath))
    data = pd.read_msgpack(msgpack_filepath)
    # Fix column names and string columns that are bytes
    data.columns = data.columns.astype(str)
    for col in data:
        try:
            newcol = data[col].str.decode('utf-8')
        except AttributeError:
            continue
        if not newcol.isnull().any():
            data[col] = newcol
    return data
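# A minimal sketch of what the convert_python2_hdf5_to_msgpack helper
# referenced above might do (its real implementation is not shown here):
# read the python2-written HDF5 table with pandas and re-serialize it as
# msgpack so it can be reloaded without python2 pickle issues.
def convert_python2_hdf5_to_msgpack(h5_filepath, key, msgpack_filepath):
    data = pd.read_hdf(h5_filepath, key)
    data.to_msgpack(msgpack_filepath)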
def getScoreDistributions(method, path):
    """Get global score distributions and save quantile values for each
    allele. Assumes all the files in path represent related proteins"""
    files = glob.glob(os.path.join(path, '*.mpk'))
    results = []
    P = getPredictor(method)
    key = P.scorekey
    # if method == 'iedbmhc1':
    #     P.data = pd.read_msgpack(files[0])
    #     key = P.getScoreKey()
    print key
    for f in files[:200]:
        df = pd.read_msgpack(f)
        # df = df.dropna()
        x = df.pivot_table(index='peptide', columns='allele', values=key)
        # print x[:5]
        results.append(x)
    result = pd.concat(results)
    percs = np.arange(0.01, 1, 0.01)
    bins = result.quantile(percs)
    # reverse if the best values are lower
    if P.operator == '<':
        bins.index = pd.Series(bins.index).apply(lambda x: 1 - x)
    outfile = os.path.join(path, 'quantiles.csv')
    print outfile
    bins.to_csv(outfile, float_format='%.3f')
    df = pd.read_csv(outfile, index_col=0)
    print df.ix[0.96]
    return
def load_pandas(file_name='review.json', use_cache=True):
    cache_path = os.path.join(CACHE_PATH, 'load_pandas.msgpack')
    if use_cache and os.path.exists(cache_path):
        print(f'Loading from {cache_path}')
        ratings, user_counts, active_users = pd.read_msgpack(cache_path)
        print(f'Loaded from {cache_path}')
    else:
        line_count = len(
            open(os.path.join(EXCEL_PATH, file_name),
                 encoding='utf8').readlines())
        user_ids, business_ids, stars, dates, text = [], [], [], [], []
        with open(os.path.join(EXCEL_PATH, file_name), encoding='utf8') as f:
            for line in tqdm(f, total=line_count):
                blob = json.loads(line)
                user_ids += [blob["user_id"]]
                business_ids += [blob["business_id"]]
                stars += [blob["stars"]]
                dates += [blob["date"]]
                text += [blob["text"]]
        ratings = pd.DataFrame({
            "user_id": user_ids,
            "business_id": business_ids,
            "rating": stars,
            "text": text,
            "date": dates
        })
        user_counts = ratings["user_id"].value_counts()
        active_users = user_counts.loc[user_counts >= 5].index.tolist()
        pd.to_msgpack(cache_path, (ratings, user_counts, active_users))
        print(f'Dumping to {cache_path}')
    return ratings, user_counts, active_users
def read_df_from_redis(self, redisConn, key):
    try:
        return pd.read_msgpack(redisConn.get(key))
    except:
        if DEBUG:
            traceback.print_exc()
        return -1
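# A hedged sketch of the complementary write side, assuming a plain
# redis-py connection: serialize the frame with to_msgpack and store the
# bytes under the key, so read_df_from_redis above can round-trip it
# (this mirrors the redis.set(key, df.to_msgpack(compress='zlib')) pattern
# noted in get_val further below).
def write_df_to_redis(redisConn, key, df):
    redisConn.set(key, df.to_msgpack(compress='zlib'))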
def test_mse(neighborhood_size=5, filtertype="collaborative filtering"):
    """Tests the mse of predictions based on a given number of neighborhood
    sizes.

    neighborhood_size -- the sizes of neighborhoods between the number and 1
        (so 5 tests for neighborhoods of length 1, 2, 3, 4, 5)
    filtertype -- the type of similarity you want to test the mse of
    """
    # init variables
    all_df = helpers.json_to_df()
    df = helpers.split_data(all_df)
    ut = helpers.create_utility_matrix(df[0])
    if filtertype == "collaborative filtering":
        print("Creating needed variables...")
        sim = helpers.similarity_matrix_cosine(ut)
    elif filtertype == "content based":
        print("Creating needed variables...")
        cats = helpers.json_to_df_categories()
        fancy_cats = helpers.extract_genres(cats)
        ut_cats = helpers.pivot_genres(fancy_cats)
        sim = helpers.create_similarity_matrix_categories(ut_cats)
    elif filtertype == "spacy":
        print("Creating needed variables...")
        sim = pd.read_msgpack("spacy_similarity.msgpack")
    else:
        print("Please enter a valid filtertype")
        return
    print("Starting calculations...")
    mses = {}
    # test the mse based on the length of the neighborhood
    for i in range(1, neighborhood_size + 1):
        predictions = helpers.predict_ratings(sim, ut, df[1], i).dropna()
        amount = len(predictions)
        mses[i] = helpers.mse(predictions)
    return mses, amount
def loadProject(self, filename=None, asksave=False):
    """Open project file"""
    w = True
    if asksave:
        w = self.closeProject()
    if w is None:
        return
    if filename is None:
        filename = filedialog.askopenfilename(
            defaultextension='.dexpl',
            initialdir=os.getcwd(),
            filetypes=[("project", "*.dexpl"), ("All files", "*.*")],
            parent=self.main)
    if not filename:
        return
    if os.path.isfile(filename):
        # pb = self.progressDialog()
        # t = threading.Thread()
        # t.__init__(target=pd.read_msgpack, args=(filename))
        # t.start()
        data = pd.read_msgpack(filename)
        self.newProject(data)
        self.filename = filename
        self.main.title('%s - DataExplore' % filename)
        self.projopen = True
    return
def read_output(path, layer_dims):
    """
    Input: layer_dims, length-3 list of dimensions of the output of the
    bottleneck layer.
    Returns ndarray of shape (num_batches, *layer_dims).
    """
    data = pd.read_msgpack(path)
    batch_size = data.index.size
    dims = [batch_size] + layer_dims
    return data.as_matrix().reshape(dims)
def _load(self):
    df = pandas.read_msgpack(self.buf)
    for col in self.schema.cols:
        if isinstance(col, big_dt):
            # converting big_dt column
            df[col.name] = df[col.name].map(
                datetime.datetime.fromtimestamp, na_action='ignore')
    return df
def testLoad(self):
    """Test re-loading predictions"""
    infile = os.path.join(self.testdir, 'ZEBOVgp1.mpk')
    pred = pd.read_msgpack(infile)
    P = base.getPredictor('iedbmhc1')
    P.data = pred
    return
def data():
    # Render the cached dataframe as a styled HTML table
    try:
        data = pd.read_msgpack(rdb.get("data"))
        return render_template(
            'index.html',
            data=data.to_html(index=False, justify='center',
                              classes="table table-striped"))
    except Exception as e:
        return render_template(
            'index.html',
            data="<p>no data found</p><br/>Exception: " + str(e) +
                 "<p></br></br>run /timereport-fetch in slack first</p></br>")
def load(self, filename, filetype=None):
    """Load file; if no filetype given assume it's msgpack format"""
    if filetype == '.pickle':
        self.df = pd.read_pickle(filename)
    else:
        self.df = pd.read_msgpack(filename)
    return
def load_msgpack(self, filename):
    """Load a msgpack file"""
    size = round((os.path.getsize(filename) / 1.0485e6), 2)  # size in MB
    print(size)
    df = pd.read_msgpack(filename)
    name = os.path.splitext(os.path.basename(filename))[0]
    self.load_dataframe(df, name)
    return
def get_val(self, key):
    # All data used by mined is a pandas DataFrame, except for TICKERS
    # data, which is stored as a list.
    if 'TICKERS' in key:
        data = self.redis_client.lrange(key, 0, -1)
        data = list(map(lambda x: x.decode('utf-8'), data))
    else:
        # How to fetch DataFrame-shaped data from Redis.
        data = pd.read_msgpack(self.redis_client.get(key))
        # Note: the matching way to store a df in Redis is:
        # redis.set(key, df.to_msgpack(compress='zlib'))
    return data
def msg_io(self, name, func, **kwargs):
    """Read data from msgpack. If not available, calculate and store."""
    cd = self.cache_dir()
    msgpath = os.path.join(cd, name + MSGTLD)
    if os.path.isfile(msgpath):
        data = pd.read_msgpack(msgpath)
    else:
        ensure_dir(cd)
        data = func(**kwargs)
        data.to_msgpack(msgpath)
    return data
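# A hypothetical use of the msg_io cache-or-compute helper above: `obj` is
# assumed to be an instance of the class defining msg_io, and
# `compute_daily_means` an illustrative expensive function returning a
# DataFrame. The first call computes and caches; later calls read the
# msgpack file back instead of recomputing.
daily = obj.msg_io('daily_means', compute_daily_means, year=2018)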
def loadmsgpack(self, filename):
    """Load a msgpack file"""
    df = pd.read_msgpack(filename)
    name = os.path.splitext(os.path.basename(filename))[0]
    if hasattr(self, 'sheets'):
        self.addSheet(sheetname=name, df=df)
    else:
        data = {name: df}
        self.newProject(data)
    return
def read_msgpack(path_or_buf, encoding='utf-8', iterator=False):
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)
    port_frame = pd.read_msgpack(path_or_buf, encoding, iterator)
    ray_frame = from_pandas(port_frame, get_npartitions())
    return ray_frame
def loadmsgpack(self, filename):
    """Load a msgpack file"""
    size = round((os.path.getsize(filename) / 1.0485e6), 2)  # size in MB
    print(size)
    df = pd.read_msgpack(filename)
    name = os.path.splitext(os.path.basename(filename))[0]
    if hasattr(self, 'sheets'):
        self.addSheet(sheetname=name, df=df)
    else:
        data = {name: {'table': df}}
        self.newProject(data)
    return
def __get_list(self, list_name, kwargs):
    url = self.__attach_kwargs_to_url(
        self.server + '/list/{0}.{1}'.format(list_name, self.format),
        kwargs
    )
    if self.format == 'msgpack':
        return pd.read_msgpack(urlopen(url)).values.tolist()
    elif self.format == 'json':
        return pd.read_json(urlopen(url)).values.tolist()
    else:
        raise NotImplementedError(
            'Unsupported format: {0}'.format(self.format))
def pull_df(self, md5):
    """Wrapper for the Workbench get_dataframe method

    Args:
        md5: pull the dataframe identified by this md5

    Returns:
        The uncompressed/unserialized dataframe
    """
    try:
        _packed_df = self.workbench.get_dataframe(md5)
        _df = pd.read_msgpack(lz4.loads(_packed_df))
        return _df
    except zerorpc.exceptions.RemoteError as e:
        return repr_to_str_decorator.r_to_s(self._data_not_found)(e)
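# A hedged sketch of the matching store side, assuming the same legacy lz4
# module (exposing loads/dumps) used by pull_df above; `store_dataframe`
# is a hypothetical counterpart to the Workbench get_dataframe call, not a
# documented API.
def push_df(self, md5, df):
    _packed_df = lz4.dumps(df.to_msgpack())
    self.workbench.store_dataframe(md5, _packed_df)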
def initialize_mission_control(memory):
    # Get subscription sockets
    subscription_sender, subscription_receiver = subscription_sockets()
    # Initialize subscriptions
    initialize_subscriptions(subscription_sender)
    # Initialize Feed Handler
    while True:
        message = subscription_receiver.recv()
        ticker = message.split('_', 1)[0]
        message = pd.read_msgpack(message.split('_', 1)[1])
        print message
def getPredictions(path, tag, method='tepitope', q=0.96):
    """Get predictions from the file system"""
    q = round(q, 2)
    filename = os.path.join(path, tag + '.mpk')
    if not os.path.exists(filename):
        return
    df = pd.read_msgpack(filename)
    pred = base.getPredictor(name=method, data=df)
    pred.allelecutoffs = getCutoffs(path, method, q)
    return pred
def openProject(self, filename=None):
    """Open project file"""
    if filename is None:
        filename = filedialog.askopenfilename(
            defaultextension='.dexpl',
            initialdir=os.getcwd(),
            filetypes=[("project", "*.dexpl"), ("All files", "*.*")],
            parent=self.main)
    if not filename:
        return
    if os.path.isfile(filename):
        data = pd.read_msgpack(filename)
        self.newProject(data)
        self.filename = filename
    return
def read(site_loc, sheet=0, verbose=True):
    "Read saved excel sheet into dataframe"
    fn = join("data", site_loc, "all_{}.msg".format(sheet))
    exfile = glob(join("data", site_loc, "*.xlsx"))[0]
    xl_workbook = xlrd.open_workbook(exfile)
    sheet_names = xl_workbook.sheet_names()
    del xl_workbook
    df = pd.read_msgpack(fn)
    if verbose:
        print("{} => {}".format(sheet_names, sheet_names[sheet]))
        with open(join("data", site_loc, "description.txt")) as f:
            print(f.read())
        print("Nulls: {} / {}".format(df["2"].isnull().sum(), len(df)))
    return df
def param_table(e=None, query_str=QSTR, debug=False, rho_limits=None,
                use_cache=True):
    cached_table = files['params_cache']
    if path.isfile(cached_table) and use_cache:
        return pd.read_msgpack(cached_table)
    if e is None:
        if debug:
            e = test_events()
        else:
            e = pip2015events()
    if rho_limits is None:
        rho_limits = rholimits
    data = e.summary(col='paper', split_date=pd.datetime(2014, 7, 1))
    del e
    gc.collect()
    data = sf.apply_rho_intervals(data, rho_limits)
    if len(query_str) < 1:
        return data
    return data.query(query_str)
def param_table(e=None, cond=cond, debug=False, rho_limits=None,
                use_cache=True, split_date=pd.datetime(2014, 7, 1), **kws):
    cached_table = files['params_cache']
    if path.isfile(cached_table) and use_cache:
        return pd.read_msgpack(cached_table)
    if e is None:
        if debug:
            e = test_events(**kws)
        else:
            e = events(**kws)
    if rho_limits is None:
        rho_limits = RHO_LIMITS
    data = e.summary(col='paper', split_date=split_date)
    del e
    gc.collect()
    data = apply_rho_intervals(data, rho_limits)
    if cond is None:
        return data
    return data.where(cond)
def read_frame(fname, ftype=None, return_ftype=False):
    if ftype is None:
        # Sniff the file type with libmagic and map it to a reader.
        with magic.Magic() as m:
            ftype = m.id_filename(fname)
        if ftype[:4] == 'data':
            ftype = 'msgpack'
        elif ftype.find('ASCII') != -1 or ftype.find('Image') != -1:
            ftype = 'csv'
        elif ftype[:4] == '8086':
            ftype = 'pickle'
        else:
            ftype = None
    if ftype == 'msgpack':
        df = pd.read_msgpack(fname)
    elif ftype == 'csv':
        df = pd.read_csv(fname, header=0, parse_dates=[0], index_col=0)
    elif ftype == 'pickle':
        df = pd.read_pickle(fname)
    if ftype is not None:
        return (df, ftype) if return_ftype else df
    raise Exception('File type not recognized for {}'.format(fname))
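# Illustrative calls to read_frame above; 'events.msg' is a hypothetical
# file name. With return_ftype=True the sniffed format comes back alongside
# the frame, which is useful when the caller wants to rewrite the file in
# the same format later.
df = read_frame('events.msg')
df, ftype = read_frame('events.msg', return_ftype=True)  # ftype == 'msgpack'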
def getPredictions(label, genome, tag, q=0.96):
    """Get predictions from the file system"""
    q = round(q, 2)
    path = os.path.join(datapath, label)
    print path
    genomename = os.path.splitext(genome)[0]
    preds = OrderedDict()
    cutoffs = {}
    bcell = None
    for m in methods:
        rpath = os.path.join(path, '%s/%s' % (genomename, m))
        filename = os.path.join(rpath, tag + '.mpk')
        if not os.path.exists(filename):
            continue
        df = pd.read_msgpack(filename)
        pred = base.getPredictor(name=m, data=df)
        if m == 'bcell':
            bcell = pred
            continue
        cutoffs[m] = pred.allelecutoffs = analysis.getCutoffs(rpath, m, q)
        preds[m] = pred
    return preds, bcell, cutoffs
def getAllBinders(path, method='tepitope', n=3, cutoff=0.95,
                  promiscuous=True):
    """Get all promiscuous binders from a set of proteins in path"""
    print 'getting binders..'
    binders = []
    m = method
    if m == 'bcell':
        return  # not applicable
    l = 9
    P = base.getPredictor(m)
    files = glob.glob(os.path.join(path, '*.mpk'))
    # get allele-specific cutoffs
    P.allelecutoffs = getCutoffs(path, method, cutoff, overwrite=True)
    for f in files:
        df = pd.read_msgpack(f)
        if promiscuous:
            b = P.getPromiscuousBinders(data=df, n=n)
        else:
            b = P.getBinders(data=df)
        # print b[:5]
        binders.append(b)
    result = pd.concat(binders)
    result['start'] = result.pos
    result['end'] = result.pos + result.peptide.str.len()
    return result
def read(self, identifier, freq, **kwargs):
    """
    Read the entire timeseries record for the requested timeseries
    instance.

    :param identifier: Identifier of the timeseries.
    :type identifier: string
    :param freq: Timeseries data frequency.
    :type freq: string
    :param kwargs: Attributes to match against timeseries instances
                   (e.g. source, measurand).
    :type kwargs: kwargs
    :returns: pandas.DataFrame -- Timeseries data.
    """
    url = self.__attach_kwargs_to_url(
        self.server + '/{0}/{1}.{2}'.format(identifier, freq, self.format),
        kwargs
    )
    if self.format == 'msgpack':
        return pd.read_msgpack(urlopen(url))
    elif self.format == 'json':
        return pd.read_json(urlopen(url))
    else:
        raise NotImplementedError(
            'Unsupported format: {0}'.format(self.format))
def time_packers_read_pack(self):
    pd.read_msgpack(self.f)
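# A minimal asv-style setup sketch assumed to accompany the benchmark
# above (the real harness is not shown): it serializes a frame to msgpack
# once so time_packers_read_pack only measures the read.
def setup(self):
    self.f = '__test__.msg'
    self.df = pd.DataFrame(np.random.randn(100000, 2))
    self.df.to_msgpack(self.f)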