# Convert the list of comma-separated transactions into a list of lists
def extractAsLists(lst):
    res = []
    for el in lst:
        sub = el.split(',')
        res.append(sub)
    return res

print(extractAsLists(dataset))
dataset = extractAsLists(dataset)

# One-hot encode the transactions (te is an mlxtend TransactionEncoder)
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
# df.to_csv(dir_path + 'AprioriDatasampledata.csv', encoding='utf-8', index=False)

from mlxtend.frequent_patterns import apriori

frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
print(frequent_itemsets)

# result = frequent_itemsets[(frequent_itemsets['length'] == 3) &
#                            (frequent_itemsets['support'] >= 0.1)]
# print(frequent_itemsets)
# result.to_csv(dir_path + 'Aprioriresult.csv', encoding='utf-8', index=False)
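# The frequent itemsets mined above can also be turned into association rules.
# A minimal sketch using mlxtend's association_rules; the confidence threshold
# of 0.6 is an illustrative choice, not a value from the original code.
from mlxtend.frequent_patterns import association_rules

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head())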
def alpha_contribution(self, neutralize='Raw', W_mat="Equal", stock_pool='Astock'):
    """
    Attribute the alpha factor with the Barra risk model.
    Strictly speaking, the risk-factor returns should be re-estimated for each stock pool,
    but for now the whole-market risk-factor returns are used.
    """
    type_list = ['COUNTRY', 'STYLE', 'INDUSTRY']
    stock_return = Barra().get_stock_return().T
    risk_return = Barra().get_factor_return(type_list=type_list)

    for i_date in range(len(self.change_date_series) - 1):

        # date
        ####################################################################################################
        date = self.change_date_series[i_date]
        bg_date = Date().get_trade_date_offset(date, 1)
        ed_date = self.change_date_series[i_date + 1]

        # data
        ####################################################################################################
        stock_return_period = stock_return.loc[bg_date:ed_date, :]
        stock_return_period = stock_return_period.T.dropna().T
        stock_return_period = pd.DataFrame(stock_return_period.sum(skipna=True))
        stock_return_period.columns = ['Pct']

        risk_return_period = risk_return.loc[bg_date:ed_date, :]
        risk_return_period = pd.DataFrame(risk_return_period.sum(skipna=True))
        risk_return_period.columns = [date]

        exposure_date = Barra().get_factor_exposure_date(date, type_list)
        fmp = self.get_fmp(neutralize, W_mat, stock_pool)
        fmp_date = pd.DataFrame(fmp[date])
        fmp_date.columns = ['FmpWeight']
        fmp_date = fmp_date.dropna()

        code_list = list(set(exposure_date.index) & set(fmp_date.index) & set(stock_return_period.index))
        code_list.sort()

        exposure_date = exposure_date.loc[code_list, :]
        fmp_date = fmp_date.loc[code_list, :]
        stock_return_period = stock_return_period.loc[code_list, :]

        if len(fmp_date) > self.min_stock_num:

            # risk-factor return multiplied by the alpha portfolio's exposure to each risk factor
            ####################################################################################################
            fmp_exposure = np.dot(fmp_date.T, exposure_date)
            fmp_exposure = pd.DataFrame(fmp_exposure, index=[date], columns=exposure_date.columns)
            fmp_risk_factor = fmp_exposure.mul(risk_return_period.T)
            fmp_alpha_factor = np.dot(fmp_date.T, stock_return_period)

            col = list(fmp_risk_factor.columns)
            fmp_risk_factor.loc[date, 'Res_Alpha'] = fmp_alpha_factor[0][0] - fmp_risk_factor.sum().sum()
            fmp_risk_factor.loc[date, 'Industry'] = fmp_risk_factor[self.industry_factor_name].sum().sum()
            fmp_risk_factor.loc[date, 'Style'] = fmp_risk_factor[self.style_factor_name].sum().sum()
            fmp_risk_factor.loc[date, 'Raw_Alpha'] = fmp_alpha_factor[0][0]

            col.insert(0, 'Res_Alpha')
            col.insert(0, 'Industry')
            col.insert(0, 'Style')
            col.insert(0, 'Raw_Alpha')
            fmp_risk_factor = fmp_risk_factor[col]
            print("Contribution for %s %s %s %s %s" %
                  (self.alpha_factor_name, neutralize, W_mat, date, stock_pool))

            # concat
            ####################################################################################################
            if i_date == 0:
                fmp_risk_factor_all = fmp_risk_factor
                fmp_exposure_all = fmp_exposure
            else:
                fmp_risk_factor_all = pd.concat([fmp_risk_factor_all, fmp_risk_factor], axis=0)
                fmp_exposure_all = pd.concat([fmp_exposure_all, fmp_exposure], axis=0)

    # summary
    ####################################################################################################
    sub_path = os.path.join(self.path, 'summary')
    fmp_risk_summary = pd.DataFrame()
    fmp_risk_summary['Contribution'] = fmp_risk_factor_all.mean() * self.annual_number
    fmp_risk_summary['IR'] = fmp_risk_factor_all.mean() / fmp_risk_factor_all.std() * np.sqrt(self.annual_number)

    risk_return_mean = pd.DataFrame(risk_return.mean()) * 250
    risk_return_mean.columns = ['Factor Return']
    exposure_mean = pd.DataFrame(fmp_exposure_all.mean())
    exposure_mean.columns = ['Avg Exposure']
    exposure = pd.concat([risk_return_mean, exposure_mean], axis=1)

    # write excel
    ####################################################################################################
    filename = os.path.join(sub_path, '%s_%s_%s_%s_Summary.xlsx' %
                            (self.alpha_factor_name, neutralize, W_mat, stock_pool))
    sheet_name = "Contribution"
    we = WriteExcel(filename)
    ws = we.add_worksheet(sheet_name)

    num_format_pd = pd.DataFrame([], columns=fmp_risk_summary.columns, index=['format'])
    num_format_pd.loc['format', :] = '0.00'
    we.write_pandas(fmp_risk_summary, ws, begin_row_number=0, begin_col_number=1,
                    num_format_pd=num_format_pd, color="blue", fillna=True)

    num_format_pd = pd.DataFrame([], columns=exposure.columns, index=['format'])
    num_format_pd.loc['format', :] = '0.00'
    num_format_pd.loc['format', 'Avg Exposure'] = '0.00%'
    we.write_pandas(exposure, ws, begin_row_number=4,
                    begin_col_number=2 + len(fmp_risk_summary.columns),
                    num_format_pd=num_format_pd, color="blue", fillna=True)
    we.close()

    # write csv
    ####################################################################################################
    sub_path = os.path.join(self.path, 'fmp_risk_factor')
    filename = os.path.join(sub_path, '%s_%s_%s_%s_RiskContributionFMP.csv' %
                            (self.alpha_factor_name, neutralize, W_mat, stock_pool))
    fmp_risk_factor_all.to_csv(filename)

    sub_path = os.path.join(self.path, 'fmp_exposure')
    filename = os.path.join(sub_path, '%s_%s_%s_%s_RiskExposureFMP.csv' %
                            (self.alpha_factor_name, neutralize, W_mat, stock_pool))
    fmp_exposure_all.to_csv(filename)
def get_data_date(self, date, stock_pool='Astock'):

    # alpha data on date
    ####################################################################################################
    alpha_date_list = list(self.alpha_data.columns)
    alpha_date_list = list(filter(lambda x: x <= date, alpha_date_list))
    alpha_date = pd.DataFrame(self.alpha_data[max(alpha_date_list)])
    alpha_date.columns = [self.alpha_factor_name]

    # industry data on date
    ####################################################################################################
    risk_factor_name = []
    type_list = ['INDUSTRY']
    barra_industry_date = Barra().get_factor_exposure_date(date=date, type_list=type_list)
    industry_columns = barra_industry_date.columns
    risk_factor_name.extend(industry_columns)
    self.industry_factor_name = industry_columns
    self.risk_factor_name = risk_factor_name

    # style data on date
    ####################################################################################################
    type_list = ['STYLE']
    barra_style_date = Barra().get_factor_exposure_date(date=date, type_list=type_list)
    barra_style_date = FactorPreProcess().standardization(barra_style_date)
    style_columns = barra_style_date.columns
    risk_factor_name.extend(style_columns)
    self.style_factor_name = style_columns
    self.risk_factor_name = risk_factor_name

    # free-float market value on date
    ####################################################################################################
    free_mv_date = pd.DataFrame(self.free_mv_data[date])
    free_mv_date.columns = ['FreeMv']

    # ipo date
    ####################################################################################################
    ipo_date = Stock().get_ipo_date()
    ipo_date_new = Date().get_trade_date_offset(date, -120)
    ipo_date = ipo_date[ipo_date['IPO_DATE'] < ipo_date_new]
    ipo_date = ipo_date[ipo_date['DELIST_DATE'] > date]

    # trade status on date
    ####################################################################################################
    trade_status_date = pd.DataFrame(self.trade_status[date])
    trade_status_date.columns = ['TradeStatus']

    code_trade = pd.concat([ipo_date, trade_status_date], axis=1)
    code_trade = code_trade.dropna()
    code_trade = code_trade[code_trade['TradeStatus'] == 0.0]

    if stock_pool != 'Astock':
        from quant.stock.index import Index
        index_weight = Index().get_weight(stock_pool, date)
        if index_weight is not None:
            code_trade = pd.concat([code_trade, index_weight], axis=1)
            code_trade = code_trade.dropna()
        else:
            code_trade = pd.DataFrame([])

    all_data = pd.concat([alpha_date, barra_industry_date, barra_style_date,
                          free_mv_date, code_trade], axis=1)
    all_data = all_data.dropna()

    alpha_date = pd.DataFrame(all_data[self.alpha_factor_name])
    alpha_date = FactorPreProcess().remove_extreme_value_mad(alpha_date)
    alpha_date = FactorPreProcess().standardization(alpha_date)

    barra_industry_date = pd.DataFrame(all_data[self.industry_factor_name])
    columns = barra_industry_date.columns[barra_industry_date.sum() > 10.0]
    barra_industry_date = barra_industry_date[columns]

    barra_style_date = pd.DataFrame(all_data[self.style_factor_name])
    barra_style_date = FactorPreProcess().standardization(barra_style_date)

    free_mv_date = pd.DataFrame(all_data['FreeMv'])
    code_trade = pd.DataFrame(all_data['TradeStatus'])

    return alpha_date, barra_industry_date, barra_style_date, free_mv_date, code_trade
df['ADX'] = ta.ADX(np.array(df['high']), np.array(df['low']),
                   np.array(df['close']), timeperiod=n)
df['Return'] = np.log(df['Open'] / df['Open'].shift(1))
print(df.head())
df = df.dropna()

ss = StandardScaler()
unsup.fit(np.reshape(ss.fit_transform(df[:split]), (-1, df.shape[1])))
regime = unsup.predict(np.reshape(ss.fit_transform(df[split:]),
                                  (-1, df.shape[1])))

Regimes = pd.DataFrame(regime, columns=['Regime'], index=df[split:].index) \
            .join(df[split:], how='inner') \
            .assign(market_cu_return=df[split:].Return.cumsum()) \
            .reset_index(drop=False) \
            .rename(columns={'index': 'Date'})

order = [0, 1, 2, 3]
fig = sns.FacetGrid(data=Regimes, hue='Regime', hue_order=order, aspect=2, size=4)
fig.map(plt.scatter, 'Date', 'market_cu_return', s=4).add_legend()
plt.show()

for i in order:
    print('Mean for regime %i:' % i, unsup.means_[i][0])
    print('Co-variance for regime %i:' % i, (unsup.covariances_[i]))

ss1 = StandardScaler()
columns = Regimes.columns.drop(['Regime', 'Date'])
Regimes[columns] = ss1.fit_transform(Regimes[columns])
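# The fitted model `unsup` is not defined in this excerpt. Since four regimes are
# plotted and means_/covariances_ are inspected afterwards, it is presumably an
# unsupervised Gaussian mixture. A minimal sketch of how it might be constructed,
# assuming scikit-learn's GaussianMixture; all hyperparameters here are illustrative
# guesses, not values from the original.
from sklearn.mixture import GaussianMixture

unsup = GaussianMixture(n_components=4,          # matches the four regimes above
                        covariance_type='spherical',
                        n_init=100,
                        random_state=42)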
def input_fn():
    return tf.data.Dataset.from_tensor_slices(
        dict(pd.DataFrame(d, columns=shap_dataset.columns))).batch(1)
def get_vcf_annotations(df, sample_name, split_columns='', drop_hom_ref=True):
    '''
    This function adds the following annotations for each variant:
    multiallele, phase, a1, a2, GT1, GT2, vartype1, vartype2, zygosity,
    and parsed FORMAT values, see below for additional information.

    Parameters
    --------------
    sample_name: str, required
        sample column header id, e.g. NA12878

    split_columns: dict, optional
        key: FORMAT id, value: number of fields expected,
        e.g. {'AD': 2} indicates Allelic Depth should be split into 2 columns.

    drop_hom_ref: bool, optional
        specifies whether to drop all homozygous reference variants from the dataframe.
        FALSE REQUIRES LARGE MEMORY FOOTPRINT

    Output
    --------------
    This function adds the following annotations to each variant:

    multiallele: {0,1} 0=biallelic, 1=multiallelic
    phase: {'/', '|'} /=unphased, |=phased
    a1: DNA base representation of allele1 call, e.g. A
    a2: DNA base representation of allele2 call, e.g. A
    GT1: numeric representation of allele1 call, e.g. 0
    GT2: numeric representation of allele2 call, e.g. 1
    vartype1: {snp, mnp, ins, del, indel or SV} variant type of first allele
    vartype2: {snp, mnp, ins, del, indel or SV} variant type of second allele
    zygosity: {het-ref, hom-ref, alt-ref, het-miss, hom-miss}
    FORMAT values: any values associated with the genotype calls are added as
        additional columns; split_columns are further split by ',' into individual columns
    '''

    df['multiallele'] = df.ALT.str.count(',')
    multidf = df[df['multiallele'] > 0]

    while len(df) + len(multidf) > 0:

        df = df[~df.index.isin(multidf.index)]
        # print(len(multidf), 'multidf rows')

        if len(multidf) > 0:
            multidf = get_multiallelic_bases(multidf, sample_name, single_sample_vcf=False)

        # print('single alleles', len(df))
        df = get_biallelic_bases(df, sample_name)

        if len(multidf) > 0:
            df = df.append(multidf)

        df = zygosity_fast(df)
        df['vartype1'] = list(map(vartype_map, df[['REF', 'a1']].values))
        df['vartype2'] = list(map(vartype_map, df[['REF', 'a2']].values))

        df.set_index(['CHROM', 'POS', 'REF', 'ALT', 'sample_ids'], inplace=True)
        # df.sortlevel(level=['CHROM','POS','REF','ALT','sample_ids'], inplace=True)  # sorting biallelic and multiallelic variants

        # print('before parse_single_genotype_data', len(df))
        # df = df.join(parse_single_genotype_data(df, sample_name, split_cols=split_columns), how='left')

        df['GT'] = df['sample_genotypes']
        del df[sample_name]
        if 'FORMAT' in df.columns:
            del df['FORMAT']

        df.reset_index(level=4, inplace=True, drop=True)
        df.set_index('GT', inplace=True, drop=False, append=True)
        # print(df)
        return df

    return pd.DataFrame()
def norm_df(df):
    return pd.DataFrame(scaler.transform(df), columns=df.columns, index=df.index)
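# norm_df relies on a `scaler` object fitted elsewhere. A minimal sketch of the
# assumed setup, using scikit-learn's StandardScaler fitted once on a training
# frame; the name `train_df` is an assumption, not from the original.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(train_df)            # train_df is assumed to exist upstream
train_norm = norm_df(train_df)  # column names and index are preserved by norm_df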
def main(args):
    # Figure out the datatype we will use; this will determine whether we run on
    # CPU or on GPU. Run on GPU by adding the command-line flag --use_gpu
    dtype = torch.FloatTensor
    if args.use_gpu:
        dtype = torch.cuda.FloatTensor

    # Set up a transform to use for validation data at test-time. For validation
    # images we will simply resize so the smaller edge has 224 pixels, then take
    # a 224 x 224 center crop. We will then construct an ImageFolder Dataset object
    # for the validation data, and a DataLoader for the validation set.
    test_transform = T.Compose([
        T.Scale(224),
        T.CenterCrop(224),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

    test_dset = MultiLabelImageFolderTest(args.test_dir, transform=test_transform)
    test_loader = DataLoader(test_dset,
                             batch_size=args.batch_size,
                             num_workers=args.num_workers)

    def transform_target_to_1_0_vect(target):
        vect = np.zeros((17,))
        vect[target] = 1
        return vect

    # Now that we have set up the data, it's time to set up the model.
    # For this example we will finetune a densenet-169 model which has been
    # pretrained on ImageNet. We will first reinitialize the last layer of the
    # model, and train only the last layer for a few epochs. We will then finetune
    # the entire model on our dataset for a few more epochs.

    # First load the pretrained densenet-169 model; this will download the model
    # weights from the web the first time you run it.
    # model = torchvision.models.densenet169(pretrained=True)
    encoder = EncoderCNN(dtype, model_type='densenet')
    encoder.load_state_dict(torch.load(args.cnn_load_path))
    encoder.type(dtype)
    encoder.eval()

    # decoder = DecoderRNN(args.label_embed_size, args.lstm_hidden_size, encoder.output_size, 17, args.combined_hidden_size)
    decoder = DecoderCaptionRNN(args.label_embed_size, args.lstm_hidden_size, encoder.output_size, 17)
    decoder.load_state_dict(torch.load(args.rnn_load_path))
    decoder.type(dtype)
    decoder.eval()

    # Reinitialize the last layer of the model. Each pretrained model has a
    # slightly different structure, but from the densenet class definition
    # we see that the final fully-connected layer is stored in model.classifier:
    # https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py#L111
    num_classes = 17
    classes = find_classes(args.label_list_file)

    y_pred = np.zeros((len(test_dset), 17))
    filenames_list = []
    predictions = []
    count = 0
    for x, filenames in test_loader:
        print_progress(count, len(test_dset), 'Running example')
        x_var = Variable(x.type(dtype), volatile=True)
        preds = decoder.sample(encoder(x_var))
        for i in range(preds.size(0)):
            pred = preds[i].data.cpu().numpy().tolist()
            if 17 in pred:
                ind = pred.index(17)
                pred = pred[:ind]
            predictions.append(' '.join([classes[j] for j in pred]))
        filenames_list += filenames
        count += x.size(0)

    subm = pd.DataFrame()
    subm['image_name'] = filenames_list
    subm['tags'] = predictions
    subm.to_csv(args.sub_file, index=False)
import pandas as pd
import numpy as np

print("create Series Object.")
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)
print()
print()

print("Create DataFrame Object.")
dates = pd.date_range('20130101', periods=6)
print(dates)
print()
print()

df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)
print()
print()

df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo'
})
print(df2)
print()
print()
import pytest
import pandas as pd
from pandas.util.testing import assert_frame_equal

import corna.inputs.maven_parser as fp

label_df = pd.DataFrame({
    'Name': [1, 2, 3, 4, 5],
    'Formula': [1, 2, 3, 4, 5],
    'Sample': [1, 2, 3, 4, 5],
    'Label': [
        'C13_0_N15_0', 'C13_1_N15_0', 'C13_0_N15_1', 'C13_2_N15_1', 'N15_1_C13_2'
    ]
})

incomplete_maven_df = pd.DataFrame({
    'Metabolite Name': [1, 2, 3, 4, 5],
    'Formula': [1, 2, 3, 4, 5],
    'sample_1': [1, 2, 3, 4, 5],
    'sample_2': [1, 2, 3, 4, 5],
    'Label': [
        'C13_0_N15_0', 'C13_1_N15_0', 'C13_0_N15_1', 'C13_2_N15_1', 'N15_1_C13_2'
    ]
})

label_df_maven_format = pd.DataFrame({
    'Name': [1, 2, 3, 4, 5],
    'Formula': [1, 2, 3, 4, 5],
    'Sample': [1, 2, 3, 4, 5],
                 newshape=(simulations, hidden_dim))  # Reshape back to [2*counter, 20*hidden_dim]

x = np.concatenate((potentials, samples), axis=0)
y = np.append(['potentials' for x in range(simulations)],
              ['samples' for x in range(simulations)])

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

tsne = TSNE(n_components=2, verbose=1, perplexity=40.0, n_iter=300, n_jobs=-1)
tsne_results = tsne.fit_transform(X=x)

feat_cols = ['point' + str(i) for i in range(x.shape[1])]
df = pd.DataFrame(x, columns=feat_cols)
df['y'] = y
df['tsne-one'] = tsne_results[:, 0]
df['tsne-two'] = tsne_results[:, 1]

plt.figure(dpi=600, figsize=(16, 10))
ax = sns.scatterplot(x='tsne-one', y='tsne-two',
                     hue='y',
                     palette=['dodgerblue', 'red'],
                     data=df,
                     legend="full",
                     alpha=1,
                     s=30)
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles[1:], labels=labels[1:])
# Read the data file with Spark and create a DataFrame
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
import pandas as pd

spark = SparkSession.builder.config(conf=SparkConf()).getOrCreate()
df = spark.read.csv("/BigData/trip_data_1.csv", header=True, inferSchema=True)
df = df.repartition(1)

a_1 = df.filter(df["passenger_count"] == 1).count()
a_2 = df.filter(df["passenger_count"] == 2).count()
a_3 = df.filter(df["passenger_count"] == 3).count()
a_4 = df.filter(df["passenger_count"] == 4).count()

Count = [{'count_type': '1 passenger', 'count': a_1},
         {'count_type': '2 passengers', 'count': a_2},
         {'count_type': '3 passengers', 'count': a_3},
         {'count_type': '4 passengers', 'count': a_4}]
count_pd = pd.DataFrame(Count)
count_pd.to_csv("Count_pie.csv")
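# The four separate filter/count passes above can be collapsed into a single
# aggregation over the same DataFrame; a minimal sketch using PySpark's groupBy
# (same df as above, output shown for illustration only).
counts = (df.filter(df["passenger_count"].isin([1, 2, 3, 4]))
            .groupBy("passenger_count")
            .count())
counts.show()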
U_whole = U.copy()

# In[68]:

U_whole.nodes()

# Descriptive Statistics

# In[33]:

# number of nodes
list_nodes = pd.DataFrame(list(U_whole.nodes()))
# list_nodes.to_csv('list_edges.csv')
len(list_nodes)

# In[34]:

# number of edges
list_edges = pd.DataFrame(list(U_whole.edges()))
list_edges.to_csv('list_edges.csv')
# len(list_edges)
list_edges

# In[35]:
# Vaccine
vaccines = ['Moderna', 'Pfizer', 'JohnsonJohnson']
vaccine = {}
vaccine['Patient'] = personStats['Person']
vaccine['Vaccinated'] = [random.randrange(0, 2) for _ in range(number_records)]
vaccine['VaccineReceived'] = [
    random.choice(vaccines) if vaccine['Vaccinated'][i] == 1 else 'Not Applicable'
    for i in range(number_records)
]

doseNumber = []
for i in range(number_records):
    if vaccine['Vaccinated'][i] == 1:
        if 'Moderna' in vaccine['VaccineReceived'][i] or 'Pfizer' in vaccine['VaccineReceived'][i]:
            doseNumber.append(random.randrange(1, 3))
        else:
            doseNumber.append(1)
    else:
        doseNumber.append(-1)
vaccine['DoseNumber'] = doseNumber

vaccine['Symptomatic'] = [
    random.randrange(0, 2) if vaccine['Vaccinated'][i] == 1 else -1
    for i in range(number_records)
]

# Adding data to csv file
merge = {**country, **person, **countryStats, **personStats, **vaccine}
complete_df = pd.DataFrame(merge)
complete_df.to_csv(file_name, index=False)
def get_multiallelic_bases(df_orig, sample_col, single_sample_vcf=True):
    '''
    This function parses multiallele variants into DNA base representations.
    It currently does not support haploid chromosomes.
    '''
    haploid_chromosomes = ['X', 'chrX', 'Y', 'chrY', 'M', 'chrM']
    df = df_orig.copy()

    def get_phase(line, sample_id):
        '''
        Returns phase from genotype
        '''
        genotype = str(line[sample_id])
        if "|" in genotype:
            return "|"
        if "/" in genotype:
            return "/"
        else:
            return '-'

    def _get_allele(line, gt_col):
        '''
        Returns allele base call from multi-allelic variants
        '''
        alleles = [line['REF']]
        alleles.extend(list(line['ALT'].split(",")))
        a1 = "."
        try:
            a1 = alleles[int(line[gt_col])]  # returns missing if gt_int_call is "."
        except:
            a1 = "."
        return a1

    def get_GT_multisample_vcf(line, sample_col, gt_index):
        return line[sample_col].split(':')[0].split(line['phase'])[int(gt_index)]

    def get_GT_multisample_vcf_haploid(line, sample_col, gt_index):
        return str(line[sample_col]).split(':')[0]

    # if single_sample_vcf:
    #     df['phase'] = df[sample_col].str[1]
    #     df = df[df['phase'] != ':']  # removing haploid variants
    #
    #     df['GT1'] = df[sample_col].str[0]
    #     df = df[df['GT1'] != '.']  # removing variants with missing calls
    #     df['GT1'] = df['GT1'].astype(int)
    #
    #     df['GT2'] = df[sample_col].str[2]
    #     df = df[df['GT2'] != '.']  # removing variants with missing calls
    #     df['GT2'] = df['GT2'].astype(int)

    if not single_sample_vcf:
        df['phase'] = df.apply(get_phase, args=[sample_col], axis=1)  # get phase

        haploid_df = df[df.phase == "-"]  # likely occurs at sex chromosome sites
        haploid_df = haploid_df[haploid_df['CHROM'].isin(haploid_chromosomes)]
        if len(haploid_df) > 0:
            haploid_df['GT1'] = df.apply(get_GT_multisample_vcf_haploid, args=[sample_col, 0], axis=1)
            haploid_df = haploid_df[(haploid_df['GT1'] != '.') & (haploid_df['GT1'] != np.NaN)]
            haploid_df['GT1'] = haploid_df['GT1'].astype(int)
            haploid_df['GT2'] = 0
            haploid_df['a1'] = haploid_df.apply(_get_allele, args=['GT1'], axis=1)
            haploid_df['a2'] = haploid_df.apply(_get_allele, args=['GT2'], axis=1)

        df = df[df.phase != "-"]
        if len(df) > 0:
            df['GT1'] = df.apply(get_GT_multisample_vcf, args=[sample_col, 0], axis=1)
            df = df[(df['GT1'] != '.') & (df['GT1'] != np.NaN)]
            df['GT1'] = df['GT1'].astype(int)

            df['GT2'] = df.apply(get_GT_multisample_vcf, args=[sample_col, 1], axis=1)
            df = df[(df['GT2'] != '.') & (df['GT2'] != np.NaN)]
            df['GT2'] = df['GT2'].astype(int)

            df['a1'] = df.apply(_get_allele, args=['GT1'], axis=1)
            df['a2'] = df.apply(_get_allele, args=['GT2'], axis=1)

        # if len(df_multi) > 0:
        #     df = df.append(df_multi)
        # df['a1'] = df.apply(_get_allele, args=['GT1'], axis=1)
        # df['a2'] = df.apply(_get_allele, args=['GT2'], axis=1)

    if len(df) > 0:
        if len(haploid_df) > 0:
            df = df.append(haploid_df)  # adding haploid variants to dataframe
            return df
        else:
            return df
    if len(haploid_df) > 0:
        return haploid_df
    else:
        return pd.DataFrame()
cur_path = os.path.join(output_dir, s)
if not os.path.exists(cur_path):
    os.makedirs(cur_path)

print('Extracting acoustic feature...', flush=True)
tr_x = Parallel(n_jobs=paras.n_jobs)(
    delayed(extract_feature)(str(file), feature=paras.feature_type, dim=paras.feature_dim,
                             cmvn=paras.apply_cmvn, delta=paras.apply_delta,
                             delta_delta=paras.apply_delta_delta,
                             save_feature=os.path.join(cur_path, str(file).split('/')[-1].replace('.flac', '')))
    for file in tqdm(todo))

# sort by length
sorted_y = [
    '_'.join([str(i) for i in tr_y[idx]]) for idx in reversed(np.argsort(tr_x))
]
sorted_todo = [
    os.path.join(s, str(todo[idx]).split('/')[-1].replace('.flac', '.npy'))
    for idx in reversed(np.argsort(tr_x))
]

# Dump label
df = pd.DataFrame(data={
    'file_path': [fp for fp in sorted_todo],
    'length': list(reversed(sorted(tr_x))),
    'label': sorted_y
})
df.to_csv(os.path.join(output_dir, s + '.csv'))

# train
with open(os.path.join(output_dir, "mapping.pkl"), "wb") as fp:
    pickle.dump(encode_table, fp)

print('All done, saved at', output_dir, 'exit.')
def process_variant_annotations(df_vars_split_cols_sample_id_drop_hom_ref):
    '''
    This function stacks a pandas vcf dataframe and adds annotations
    '''
    df_vars, split_columns, sample_id, drop_hom_ref = df_vars_split_cols_sample_id_drop_hom_ref

    df_groups = df_vars.groupby('FORMAT')
    parsed_df = []
    for format, df_format in df_groups:  # iterate through different FORMAT types

        df_format = df_format[df_format['ALT'] != '.']  # dropping missing ALT alleles
        df_format = df_format[sample_id]  # only consider sample columns
        df_format = df_format.replace(to_replace='.', value=np.NaN)  # replacing missing calls with None
        df_format = pd.DataFrame(df_format.stack(), columns=['sample_genotypes'])  # stacks sample calls and drops none calls

        if len(df_format) < 1:  # occurs when all calls are empty
            continue

        # SAVE QUALITY INFORMATION SEPARATELY TO AVOID ANNOTATION PROCESSING OF IDENTICAL
        # GENOTYPE CALLS (DIFFERENT QUALITY DOESN'T MATTER)
        if format.count(':') > 0:
            df_qual = pd.DataFrame(list(df_format['sample_genotypes'].str.split(':')),
                                   index=df_format.index)  # qual df, set aside for later joining
            # print(df_format.head(), format.split(':'))
            df_qual.columns = format.split(':')  # setting quality column names
            df_qual.index.names = ['CHROM', 'POS', 'REF', 'ALT', 'sample_ids']  # index names for joining with df_format later
            df_format['sample_genotypes'] = df_qual[format.split(':')[0]]  # keeping just the GT calls
            del df_qual['GT']  # removing from df_qual to avoid joining problems with df_format after add_annotations

        # DROPPING MISSING CALLS
        df_format = df_format[(df_format['sample_genotypes'] != './.') &
                              (df_format['sample_genotypes'] != '.|.') &
                              (df_format['sample_genotypes'] != '.')]

        # SETTING INDICES
        df_format.index.names = ['CHROM', 'POS', 'REF', 'ALT', 'sample_ids']  # setting index names
        df_format.reset_index(inplace=True)

        # ONLY NEED TO PASS UNIQUE GENOTYPE CALLS TO get_vcf_annotations, THEN BROADCAST BACK TO df_format
        df_annotations = df_format.drop_duplicates(subset=['CHROM', 'POS', 'REF', 'ALT', 'sample_genotypes'])
        df_annotations['FORMAT'] = format.split(':')[0]  # setting format id
        df_annotations.set_index(['CHROM', 'POS', 'REF', 'ALT', 'sample_genotypes'], drop=False, inplace=True)
        df_annotations = get_vcf_annotations(df_annotations, 'sample_genotypes',
                                             split_columns=split_columns)  # getting annotations

        # SETTING INDICES AGAIN
        if len(df_annotations) < 1:
            continue  # continue if no variants within this FORMAT category
        df_format.set_index(['CHROM', 'POS', 'REF', 'ALT', 'sample_genotypes'], drop=True, inplace=True)
        df_annotations.index.names = ['CHROM', 'POS', 'REF', 'ALT', 'sample_genotypes']
        df_format = df_format.join(df_annotations)
        # df_format.set_index('sample_ids', drop=True, inplace=True, append=True)
        df_format['FORMAT'] = format
        df_format.reset_index(level=4, inplace=True, drop=False)

        if drop_hom_ref:
            hom_ref_counts = get_hom_ref_counts(df_format)
            hom_ref_counts.name = 'hom_ref_counts'
            df_format = df_format[df_format['zygosity'] != 'hom-ref']  # dropping all homozygous reference variants
            df_format = df_format.join(hom_ref_counts)
            df_format['hom_ref_counts'].fillna(value=0, inplace=True)

        del df_format['sample_genotypes']
        df_format.set_index('sample_ids', inplace=True, append=True, drop=True)

        # JOINING QUAL INFO BACK TO DF
        if format.count(':') > 0 and len(df_qual) > 0:
            df_format = df_format.join(df_qual, how='left')

        # SPLITTING GENOTYPE QUALITY COLUMNS
        if split_columns != '':
            for col in split_columns:
                split_col_names = [col + '_' + str(n) for n in range(0, split_columns[col])]
                df_format = df_format.join(
                    pd.DataFrame(list(df_format[col].str.split(',').str[:len(split_col_names)]),
                                 index=df_format.index, columns=split_col_names))
                del df_format[col]

        parsed_df.append(df_format)

    if len(parsed_df) > 0:
        df_annot = pd.concat(parsed_df)
        return df_annot
    else:
        print('No Annotations generated, please check for excessive missing values')
        return df_vars
Q_ice.interpolate(mp.Q_ice)
Q_mixed.interpolate(mp.Q_mixed)
Q_latent.interpolate(mp.Q_latent)
Q_s.interpolate(mp.S_flux_bc)
melt.interpolate(mp.wb)
Tb.interpolate(mp.Tb)
Sb.interpolate(mp.Sb)
full_pressure.interpolate(mp.P_full)

##########

# Plotting top boundary.
shelf_boundary_points = get_top_boundary(cavity_length=L, cavity_height=H2, water_depth=water_depth)
top_boundary_mp = pd.DataFrame()


def top_boundary_to_csv(boundary_points, df, t_str):
    df['Qice_t_' + t_str] = Q_ice.at(boundary_points)
    df['Qmixed_t_' + t_str] = Q_mixed.at(boundary_points)
    df['Qlat_t_' + t_str] = Q_latent.at(boundary_points)
    df['Qsalt_t_' + t_str] = Q_s.at(boundary_points)
    df['Melt_t' + t_str] = melt.at(boundary_points)
    df['Tb_t_' + t_str] = Tb.at(boundary_points)
    df['P_t_' + t_str] = full_pressure.at(boundary_points)
    df['Sal_t_' + t_str] = sal.at(boundary_points)
    df['Temp_t_' + t_str] = temp.at(boundary_points)
    df["integrated_melt_t_ " + t_str] = assemble(melt * ds(4))

    if mesh.comm.rank == 0:
# Create income code column for classifying income for a STRATIFIED split
df['income'] = np.ceil(df['median_income'] / 1.5)
df['income'].where(df['income'] < 5, 5.0, inplace=True)

# Sklearn split object over this category to ensure representative sampling
split = sk.model_selection.StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Perform stratified split
for train_i, test_i in split.split(df, df['income']):
    df_train = df.loc[train_i]
    df_test = df.loc[test_i]

original_split_percent = df['income'].value_counts() / len(df) * 100
train_split_percent = df_train['income'].value_counts() / len(df_train) * 100
compare = pd.DataFrame([original_split_percent, train_split_percent]).transpose()
compare.columns = ['original', 'split']
compare['diff'] = compare['original'] - compare['split']

# Drop this stratification category
df_train.drop(["income"], axis=1, inplace=True)
df_test.drop(["income"], axis=1, inplace=True)

logger.info(f"Test: {len(df_test)} records")
logger.info(f"Train: {len(df_train)} records")

y_tr = df_train["median_house_value"].copy()
X_tr = df_train.drop("median_house_value", axis=1)  # drop labels for training set
y_test = df_test["median_house_value"].copy()
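# For a single split like this, scikit-learn's train_test_split can achieve the
# same representative sampling in one call; a minimal sketch stratifying on the
# same income category (names follow the block above).
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2, random_state=42,
                                     stratify=df['income'])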
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X, Y)
random_forest.score(X, Y)

# *References*:<br>
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html <br>
# https://stats.stackexchange.com/questions/260460/optimization-of-a-random-forest-model<br>
# https://en.wikipedia.org/wiki/Random_forest <br>
# https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm

# ## Final Submission

# In[ ]:

final_test_RF = final_test[cols]
Y_pred_RF = random_forest.predict(final_test_RF)

# In[ ]:

submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": Y_pred_RF
})
submission.to_csv('titanic.csv', index=False)

# **Final References:** <br>
# *Editing Markdowns*: https://medium.com/ibm-data-science-experience/markdown-for-jupyter-notebooks-cheatsheet-386c05aeebed<br>
# *Matplotlib color library:* https://matplotlib.org/examples/color/named_colors.html

# In[ ]:
def download_all_regions() -> pd.DataFrame:
    def nuts_query(nuts_level):
        q = Query.all_regions(nuts=nuts_level)
        return q

    def lau_query(lau_level):
        q = Query.all_regions(lau=lau_level)
        return q

    qb_all = Query.all_regions()
    qe = QueryExecutioner()
    print("start")
    all_regions = qe.run_query(qb_all)
    print("all")
    r_nuts1 = qe.run_query(nuts_query(1))
    print("nuts1")
    r_nuts2 = qe.run_query(nuts_query(2))
    print("nuts2")
    r_nuts3 = qe.run_query(nuts_query(3))
    print("nuts3")
    r_lau1 = qe.run_query(lau_query(1))
    print("lau")
    # currently no distinction between different LAU levels on the datenguide side
    # r_lau2 = qe.run_query(lau_query(2))

    levels = {
        "nuts1": r_nuts1,
        "nuts2": r_nuts2,
        "nuts3": r_nuts3,
        "lau": r_lau1,
        # 'lau2': r_lau2
    }

    def isAnscestor(region_id, candidate):
        return region_id.startswith(candidate) and candidate != region_id

    def parent(region_id, region_details):
        desc = region_details.assign(
            ansc=lambda df: df.index.map(lambda i: isAnscestor(region_id, i))).query("ansc")
        max_lev = desc.level.max()  # noqa: F841
        parent_frame = desc.query("level == @max_lev")
        if not parent_frame.empty:
            return parent_frame.iloc[0, :].name
        else:
            return None

    if all_regions is None:
        raise RuntimeError("Was not able to download all regions")
    for k in levels:
        if levels[k] is None:
            raise RuntimeError(f"Was not able to download {k} regions")

    all_regions_df = pd.concat([
        pd.DataFrame(page["data"]["allRegions"]["regions"])
        for page in cast(List[ExecutionResults], all_regions)[0].query_results
    ]).set_index("id")

    level_df = pd.concat(
        pd.concat([
            pd.DataFrame(page["data"]["allRegions"]["regions"])
            for page in cast(List[ExecutionResults], levels[k])[0].query_results
        ]).assign(level=k) for k in levels)

    all_rg_parents = all_regions_df.join(
        level_df.set_index("id").loc[:, "level"]).assign(
            parent=lambda df: df.index.map(
                partial(
                    parent,
                    region_details=all_regions_df.assign(level=lambda df: df.index.map(len)),
                )))

    all_rg_parents.loc[all_rg_parents.level == "nuts1", "parent"] = "DG"
    return all_rg_parents
brand = list(data['brand'])

naver_data = pd.read_excel(root + '네이버쇼핑_프론트_백팩.xlsx')
print(naver_data.head())
len(naver_data)

match = []
for one_title in naver_data['title']:
    imsi = []
    for one_brand in brand:
        if one_brand in one_title:
            imsi.append(one_title)
            imsi.append(one_brand)
        else:
            pass
    match.append(imsi)
print(len(match))

df = pd.DataFrame(match)
df_imsi = df.loc[:, 0:1]
df_imsi.columns = ['title_02', 'brand']

concat_end = pd.concat([naver_data, df_imsi], axis=1)
concat_end.drop('title_02', 1)
# df_all = pd.merge(naver_data, df_imsi)
print(len(concat_end))

dd = concat_end.drop('title_02', 1)
dd
def cutting_main(img_list):
    file_path = './crop/'  # path to save cropped images
    if not os.path.isdir(file_path):
        os.makedirs(file_path)

    img_path = './resize_img/'  # path to load images from
    if not os.path.isdir(img_path):
        os.makedirs(img_path)

    csv_save_path = './csv_save/'  # csv files holding the crop coordinates
    if not os.path.isdir(csv_save_path):
        os.makedirs(csv_save_path)

    bground_list = os.listdir(img_path)   # original image directory
    txt_list = os.listdir(csv_save_path)  # csv file directory
    count = len(bground_list)             # number of images in the image directory
    print('Page:' + str(count))

    line_c = []   # csv rows
    main = []     # list holding the coordinates
    og_main = []

    for j in range(0, count):  # one iteration per page
        C = []
        point = []
        del C[:]
        main = []
        del main[:]

        x = natsort.natsorted(bground_list)
        y = natsort.natsorted(txt_list)

        # ============== clean up the file name =============== #
        str_path = str(x[j])
        str_path = str_path.replace('.jpg', '')
        str_path = str_path.replace('mask', '')
        str_path = str_path.replace('resize', '')
        for i in range(0, len(links)):
            str_path = str_path.replace(links[i], '')
        for i in range(0, len(l)):
            str_path = str_path.replace(l[i], '')
        str_path = re.sub('_', ' ', str_path)
        str_path = re.sub(' ', '', str_path)
        # ============== clean up the file name =============== #

        img = Image.open(img_path + x[j])  # open the image
        csv_f = open(csv_save_path + y[j], 'r', encoding='UTF8')
        line_c = csv.reader(csv_f)
        for lines in line_c:
            C.append(lines)
        print('Number of characters: ' + str(len(C)))
        if len(C) < 5:
            continue

        appendP = point.append
        appendMain = main.append
        append_OG = og_main.append
        for k in range(0, len(C)):
            appendP(C[k])
            real = [int(point[k][0]), int(point[k][1]), int(point[k][2]), int(point[k][3])]
            real2 = [int(point[k][0]), int(point[k][1]), int(point[k][2]), int(point[k][3])]
            appendMain(real)
            append_OG(real2)

        # ==================================================================================== #
        # sort for vertical reading from right to left
        main = sorted(main, key=itemgetter(0))  # sort the array, reverse=True
        main = sorting_arry(main)               # sorting
        main = sorted(main, key=itemgetter(0, 1))
        main = sorted(main, key=itemgetter(0), reverse=True)
        main = same_check(main, og_main)        # check the sorted result and fix it up
        cutting_img_num = 0
        # ==================================================================================== #

        # crop according to the sorted array
        king_name = "ㅇ"
        for k in range(len(C)):
            arry1 = int(main[k][0])
            arry2 = int(main[k][1])
            arry3 = int(main[k][2])
            arry4 = int(main[k][3])

            # ======== create directories ======== #
            king_path = file_path + str(img_list[j][0])
            book_path = file_path + str(img_list[j][0]) + '/' + str(img_list[j][1]) + '/'
            dir_path = file_path + str(img_list[j][0]) + '/' + str(img_list[j][1]) + '/' + str(str_path)  # save path
            if not os.path.isdir(king_path):
                os.mkdir(king_path + '/')
            if not os.path.isdir(book_path):
                os.mkdir(book_path + '/')
            if not os.path.isdir(dir_path):
                os.mkdir(dir_path + '/')

            # ======== build the pandas record ======== #
            replaceA = txt_list[j].replace('.csv', '')
            location = (arry1, arry2)
            a = abs(arry1 - arry3)  # width, for the pandas record
            b = abs(arry2 - arry4)  # height, for the pandas record

            # ======== crop and save the image ======== #
            if arry1 < 0:
                arry1 = 0
            s_name = int(cutting_img_num)
            area = (arry1, arry2, arry3, arry4)
            cropped_img = img.crop(area)  # crop the image
            img_save_path = dir_path + '/' + str(s_name) + '.jpg'
            cropped_img.save(img_save_path)  # save the image
            cutting_img_num = cutting_img_num + 1

            get_size = os.path.getsize(img_save_path)
            volume_kb = '%0.2f' % (get_size / 1024)

            if str(img_list[j][0]) != king_name:
                m2 = []
                m2.append([img_save_path, replaceA, location, a, b, volume_kb + 'KB'])
                if not os.path.isdir('./' + str(img_list[j][0])):
                    os.mkdir('./' + str(img_list[j][0]) + '/')
                # columns: save path, file name, position of the character in the original image,
                # width, height, file size
                df2 = pd.DataFrame(m2, columns=['save_path', 'filename', 'Location', 'w', 'h', 'Volume(KB)'])
                df2.to_csv('./' + str(img_list[j][0]) + '/' + str(img_list[j][0]) + '_data.csv',
                           encoding='euc-kr')  # save csv file (one per king)
                king_name = str(img_list[j][0])
("period[D]", PeriodDtype(freq="D")), (IntervalDtype(), IntervalDtype()), ], ) def test__get_dtype(input_param, result): assert com._get_dtype(input_param) == result @pytest.mark.parametrize( "input_param,expected_error_message", [ (None, "Cannot deduce dtype from null object"), (1, "data type not understood"), (1.2, "data type not understood"), ("random string", 'data type "random string" not understood'), (pd.DataFrame([1, 2]), "data type not understood"), ], ) def test__get_dtype_fails(input_param, expected_error_message): # python objects with pytest.raises(TypeError, match=expected_error_message): com._get_dtype(input_param) @pytest.mark.parametrize( "input_param,result", [ (int, np.dtype(int).type), ("int32", np.int32), (float, np.dtype(float).type), ("float64", np.float64),
def cal_fmp(self, neutralize='Raw', W_mat="Equal", stock_pool='Astock'):
    """
    min  h'Wh
    s.t. h'*a = 1.0
         h'*B = 0.0

    :param W_mat:
        W_mat = 'Equal'          W has a diagonal of ones
        W_mat = 'FreeMvSqrt'     W has the square root of free-float market value on the diagonal
        W_mat = 'BarraStockCov'  W is the Barra-estimated stock covariance matrix
    :param neutralize:
        neutralize = 'Raw'  no constraints on the risk-factor exposures
        neutralize = 'Res'  constrain the risk-factor exposures; see the parameter file for details
    :param stock_pool: multi-factor stock pool
    Additional constraints can also be added when computing the FMP.
    """
    params_file = r'E:\3_Data\5_stock_data\3_alpha_model\fmp\input_file\neutral_list.xlsx'
    params = pd.read_excel(params_file)

    for i_date in range(len(self.change_date_series) - 1):

        # read alpha data and concat multi_factor list
        ####################################################################################################
        date = self.change_date_series[i_date]
        data = self.get_data_date(date, stock_pool)
        alpha_date, industry_dummy_date, barra_style_date, free_mv_date, code_trade = data
        code_list = list(alpha_date.index)

        # W matrix
        ####################################################################################################
        if W_mat == 'BarraStockCov':
            stock_cov = Barra().get_stock_covariance(date)
            alpha_date = alpha_date.loc[code_list, :]
            stock_cov = stock_cov.loc[code_list, code_list]
            alpha_date = FactorPreProcess().remove_extreme_value_mad(alpha_date)
            alpha_date = FactorPreProcess().standardization(alpha_date)
        elif W_mat == 'FreeMvSqrt':
            free_mv_date = free_mv_date.dropna()
            free_mv_date['FreeMv2'] = free_mv_date['FreeMv'].map(lambda x: 1 / (x ** (1 / 2)))
            free_mv_date = pd.DataFrame(free_mv_date['FreeMv2'])
        else:
            pass

        ####################################################################################################
        if len(alpha_date) > self.min_stock_num:

            if W_mat == 'Equal':
                P = np.diag(np.ones(shape=(1, len(alpha_date)))[0])
            elif W_mat == 'FreeMvSqrt':
                P = np.diag(np.column_stack(free_mv_date.values)[0])
            elif W_mat == 'BarraStockCov':
                P = stock_cov.values
            else:
                P = np.diag(np.ones(shape=(1, len(alpha_date)))[0])

            Q = np.zeros(shape=(P.shape[0], 1))
            A = np.column_stack(alpha_date.values)
            A_add = np.ones(shape=(1, P.shape[0]))
            A = np.row_stack((A, A_add))
            b = np.array([[1.0], [0.0]])

            if neutralize == 'Res':
                params = params[params.name == self.alpha_factor_name]
                params = params[params.market == stock_pool]
                params.index = ['index']

                if params.loc['index', 'Industry'] == 1.0:
                    A_add = industry_dummy_date.T.values
                    A = np.row_stack((A, A_add))
                    b_add = np.row_stack((np.zeros(shape=(len(industry_dummy_date.columns), 1))))
                    b = np.row_stack((b, b_add))

                params_style = params.loc[:, self.style_factor_name].T
                params_style = params_style[params_style == 1.0]
                params_style = params_style.dropna()

                if len(params_style) > 0:
                    barra_style_date = barra_style_date[params_style.index]
                    A_add = barra_style_date.T.values
                    A = np.row_stack((A, A_add))
                    b_add = np.row_stack((np.zeros(shape=(len(barra_style_date.columns), 1))))
                    b = np.row_stack((b, b_add))

            print(A.shape)

            try:
                P = matrix(P)
                Q = matrix(Q)
                A = matrix(A)
                b = matrix(b)
                result = sol.qp(P, q=Q, A=A, b=b)
                fmp_raw_alpha = pd.DataFrame(np.array(result['x'][0:]),
                                             columns=[date], index=code_list).T
                print("Cal FMP %s %s %s %s " %
                      (date, stock_pool, neutralize, self.alpha_factor_name))
                concat_data = pd.concat([fmp_raw_alpha.T, alpha_date], axis=1)
                concat_data = concat_data.dropna()
                print(concat_data.corr().values[0][0])
            except Exception as e:
                fmp_raw_alpha = pd.DataFrame([], columns=[date], index=code_list).T
                print("QP FMP is InCorrect %s %s %s %s " %
                      (date, stock_pool, neutralize, self.alpha_factor_name))
        else:
            fmp_raw_alpha = pd.DataFrame([], columns=[date], index=code_list).T
            print("The Length of Data is Zero %s %s %s %s " %
                  (date, stock_pool, neutralize, self.alpha_factor_name))

        # concat
        ####################################################################################################
        if i_date == 0:
            fmp_raw_alpha_all = fmp_raw_alpha
        else:
            fmp_raw_alpha_all = pd.concat([fmp_raw_alpha_all, fmp_raw_alpha], axis=0)

    # write data
    ####################################################################################################
    sub_path = os.path.join(self.path, 'fmp')
    file = os.path.join(sub_path, '%s_%s_%s_%s.csv' %
                        (self.alpha_factor_name, neutralize, W_mat, stock_pool))
    fmp_raw_alpha_all = fmp_raw_alpha_all.T
    fmp_raw_alpha_all.to_csv(file)
nodenames = [
    'ASBNVACYO3Y.csv', 'ATLNGAMAO4Y.csv', 'CHCGILDTO6Y.csv', 'CHCGILWUO7Y.csv',
    'MIAUFLWSO0Y.csv', 'MIAUFLWSO3P-NE70191.csv', 'NYCMNYZRO1Y.csv', 'WASHDC12O1Y.csv'
]
node_options = [dict(label=x.split('.')[0], value=x) for x in nodenames]
dimensions = ["signal_type"]

collist = [
    'ChanOchOprAve', 'ChanOchLBCAve', 'ChanOchChromaticDispersionAve', 'BerPreFecAve',
    'BerPostFecAve', 'PhaseCorrectionAve', 'Qave', 'PmdAve', 'SoPmdAve', 'ChanOchOptAve'
]  # ,'performance_metrics.ts']

global df
df = pd.DataFrame()
df = pd.read_feather(os.path.join(DATA_DIR, "pivoted_ochctp.feather"))
print(df.shape)

netcoolDF = pd.read_feather(r"D:\experiments\data\netcool_with_devices.feather")
col_options = [dict(label=x, value=x.lower()) for x in ["Ave", "Min", "Max"]]

devnames = set(df['performance_metrics.module'].tolist())
netcoolDF = netcoolDF[netcoolDF['device'].str.strip().isin(devnames)]
devnames = netcoolDF['device'].str.strip().tolist()
print(netcoolDF.shape)
# devnames = set(netcoolDF['device'].tolist())
dev_options = [dict(label=x.split('.')[0], value=x) for x in devnames]

app = dash.Dash(
    __name__,
    external_stylesheets=["https://codepen.io/chriddyp/pen/bWLwgP.css"])
def cal_real_portfolio(self, neutralize='Raw', W_mat="Equal", stock_pool='Astock'):

    for i_date in range(len(self.change_date_series) - 1):

        lamb = 1000.0

        # date
        ####################################################################################################
        date = self.change_date_series[i_date]
        data = self.get_data_date(date, stock_pool)
        alpha_date, industry_dummy_date, barra_style_date, free_mv_date, code_trade = data

        fmp = self.get_fmp(neutralize, W_mat, stock_pool)
        fmp_date = pd.DataFrame(fmp[date])
        fmp_date.columns = ['FmpWeight']
        fmp_date = fmp_date.dropna()
        code_list = list(fmp_date.index)

        # Barra().cal_stock_covariance(date)
        stock_covriance = Barra().get_stock_covariance(date)
        stock_covriance = stock_covriance.loc[code_list, code_list].values
        stock_covriance = np.zeros(shape=(len(code_list), len(code_list)))

        alpha_signal = np.dot(stock_covriance, fmp_date)
        alpha_signal = fmp_date.values * 2

        P = stock_covriance * lamb
        Q = -np.row_stack(alpha_signal)

        from quant.stock.index import Index
        index_weight = Index().get_weight(index_code=stock_pool, date=date)
        index_weight = index_weight.loc[code_list, :]
        index_weight = index_weight.fillna(0.0)
        index_weight['Max'] = 0.03
        index_weight['Min'] = -index_weight['WEIGHT']

        G_positive = np.diag(np.ones(shape=(len(index_weight))))
        G_negative = -np.diag(np.ones(shape=(len(index_weight))))
        G = np.row_stack((G_positive, G_negative))
        h_positive = np.row_stack(index_weight['Max'].values)
        h_negative = np.row_stack(index_weight['Min'].values)
        h = np.row_stack((h_positive, h_negative))
        A = np.ones(shape=(1, len(index_weight)))
        b = np.array([[0.0]])

        try:
            P = matrix(P)
            Q = matrix(Q)
            G = matrix(G)
            h = matrix(h)
            A = matrix(A)
            b = matrix(b)
            result = sol.qp(P, q=Q, G=G, h=h, A=A, b=b)
            result = sol.qp(P, q=Q)
            stock_weight_active = pd.DataFrame(np.array(result['x'][0:]),
                                               columns=['Active'], index=code_list).T
            weight = pd.concat([index_weight, stock_weight_active.T, fmp_date], axis=1)
            weight['PortWeight'] = weight['WEIGHT'] + weight['Active']
            weight['ImplyWeight'] = weight['WEIGHT'] + weight['FmpWeight']
            print((weight['WEIGHT'] - weight['PortWeight']).abs().sum())
            print(weight['Active'].sum())
            print("Cal Portfolio %s %s %s %s " %
                  (date, stock_pool, neutralize, self.alpha_factor_name))
        except Exception as e:
            stock_weight = pd.DataFrame([], columns=[date], index=code_list).T
            index_weight = pd.concat([index_weight, stock_weight.T], axis=1)
            print("QP Portfolio is InCorrect %s %s %s %s " %
                  (date, stock_pool, neutralize, self.alpha_factor_name))
def get_biallelic_bases(df, sample_col, single_sample_vcf=True):
    '''
    This function returns the base call for each biallelic base,
    10X faster than previous iterations
    '''
    haploid_chromosomes = ['X', 'chrX', 'Y', 'chrY', 'M', 'chrM']

    def get_phase(line, sample_id):
        '''
        Returns phase from genotype
        '''
        genotype = str(line[sample_id])
        if "|" in genotype:
            return "|"
        if "/" in genotype:
            return "/"
        else:
            return '-'

    def _get_allele(line, gt_col):
        '''
        Returns allelic base, handles multi-allelic variants
        '''
        alleles = [line['REF']]
        alleles.extend(line['ALT'].split(","))
        a1 = "."
        try:
            a1 = alleles[int(line[gt_col])]  # returns missing if gt_int_call is "."
        except:
            a1 = "."
        return a1

    def get_GT_multisample_vcf(line, sample_col, gt_index):
        return int(line[sample_col].split(line['phase'])[int(gt_index)])

    def get_GT_multisample_vcf_haploid(line, sample_col, gt_index):
        return str(line[sample_col]).split(':')[0]

    if single_sample_vcf:
        df['GT_len'] = df[sample_col].str.split(':').str[0].str.len()
        haploid_df = df[df['GT_len'] <= 1]
        haploid_df['phase'] = '-'
        haploid_df = haploid_df[haploid_df['CHROM'].isin(haploid_chromosomes)]
        df = df[df['GT_len'] > 1]
        df['phase'] = df[sample_col].str[1]
        df = df[(df['phase'] != '-')]
        del df['GT_len']
        del haploid_df['GT_len']

        if len(haploid_df) > 0:
            haploid_df['GT1'] = haploid_df[sample_col].str[0]
            haploid_df = haploid_df[(haploid_df['GT1'] != '.') & (haploid_df['GT1'] != np.NaN)]
            haploid_df['GT2'] = 0

        if len(df) > 0:
            df['GT1'] = df[sample_col].str[0]
            df = df[(df['GT1'] != '.') & (df['GT1'] != np.NaN)]
            df['GT1'] = df['GT1'].astype(int)

            df['GT2'] = df[sample_col].str[2]
            df = df[(df['GT2'] != '.') & (df['GT2'] != np.NaN)]
            df['GT2'] = df['GT2'].astype(int)

    if not single_sample_vcf:
        # 16th December 2014: not sure this is needed now that get_multiallelic_bases is a separate function
        df['GT_len'] = df[sample_col].str.split(':').str[0].str.len()
        haploid_df = df[df['GT_len'] <= 1]
        haploid_df['phase'] = '-'
        haploid_df = haploid_df[haploid_df['CHROM'].isin(haploid_chromosomes)]
        df = df[~df.index.isin(haploid_df.index)]
        df['phase'] = df[sample_col].str[1]
        df = df[(df['phase'] != '-')]
        del df['GT_len']
        del haploid_df['GT_len']

        if len(df) > 0:
            df['GT1'] = df.apply(get_GT_multisample_vcf, args=[sample_col, 0], axis=1)
            df = df[(df['GT1'] != '.') & (df['GT1'] != np.NaN)]
            df['GT2'] = df.apply(get_GT_multisample_vcf, args=[sample_col, 1], axis=1)
            df = df[(df['GT2'] != '.') & (df['GT2'] != np.NaN)]

        if len(haploid_df) > 0:
            haploid_df['GT1'] = df.apply(get_GT_multisample_vcf_haploid, args=[sample_col, 0], axis=1)
            haploid_df = haploid_df[(haploid_df['GT1'] != '.') & (haploid_df['GT1'] != np.NaN)]
            haploid_df['GT1'] = haploid_df['GT1'].astype(int)
            haploid_df['GT2'] = 0

        if len(df) > 0:
            if len(haploid_df) > 0:
                df = df.append(haploid_df)
            else:
                pass
        else:
            df = haploid_df

    # FAST PROCESS SIMPLE ALLELE GENOTYPES
    df_simple = df
    if len(df_simple) > 0:
        df_gt1_ref = df_simple[df_simple.GT1.astype(int) == 0][['REF']]  # get a1 ref alleles
        df_gt1_ref.columns = ['a1']
        df_gt2_ref = df_simple[df_simple.GT2.astype(int) == 0][['REF']]  # get a2 ref alleles
        df_gt2_ref.columns = ['a2']

        df_gt1_alt = df_simple[df_simple.GT1.astype(int) == 1][['ALT']]  # get a1 alt alleles
        df_gt1_alt.columns = ['a1']
        df_gt2_alt = df_simple[df_simple.GT2.astype(int) == 1][['ALT']]  # get a2 alt alleles
        df_gt2_alt.columns = ['a2']

        gt1_alleles = pd.concat([df_gt1_ref, df_gt1_alt])  # merging GT1 allele bases into a single df
        # del gt1_alleles[0]
        gt2_alleles = pd.concat([df_gt2_ref, df_gt2_alt])  # merging GT2 allele bases into a single df
        # del gt2_alleles[0]
        gt1_2_allele_df = gt1_alleles.join(gt2_alleles)  # joining the GT1 and GT2 simple allele bases

        # print(len(df))
        return df.join(gt1_2_allele_df)
    else:
        return pd.DataFrame()
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

columns = "ip op1".split()
headers = ["ip", "op1"]

test_data = pd.read_csv("testdata.csv")
test_data.columns = headers
print(test_data.describe())

df1 = pd.DataFrame(test_data, columns=columns)
y = df1.op1

train_x, test_x, train_y, test_y = train_test_split(test_data["ip"], test_data["op1"],
                                                    train_size=0.1)

# A single feature must be reshaped into a 2-D (n_samples, 1) array for sklearn
train_x = train_x.values.reshape(-1, 1)
test_x = test_x.values.reshape(-1, 1)

print("Train_y Shape :: ", train_y.shape)
print("Test_x Shape :: ", test_x.shape)
print("Test_y Shape :: ", test_y.shape)

clf = RandomForestClassifier()
clf.fit(train_x, train_y)

predictions = clf.predict(test_x)
print("Train Accuracy :: ", accuracy_score(train_y, clf.predict(train_x)))
print("Test Accuracy :: ", accuracy_score(test_y, predictions))
print(" Confusion matrix ", confusion_matrix(test_y, predictions))
print("Trained model:", clf)
    dtrain = xgb.DMatrix(X_train, y_train)
    dval = xgb.DMatrix(X_val, y_val)
    watchlist = [(dtrain, 'train'), (dval, 'val')]

    bst = xgb.train(params, dtrain, num_boost_round,
                    evals=watchlist,
                    early_stopping_rounds=100,
                    verbose_eval=True)

    y_train_hat = bst.predict(dtrain)
    plotAuc(y_train, y_train_hat, xlabel='train')
    y_val_hat = bst.predict(dval)
    plotAuc(y_val, y_val_hat, xlabel='validation')

    bst.save_model('xgb-{}.model'.format(getNextVer(r"xgb-(\d).model")))
    return bst


# ######### train
train, test, features, target = load_data()

X_train, X_val, y_train, y_val = train_test_split(train[features], train[target],
                                                  test_size=0.05, random_state=10)

bst = getModel(X_train, y_train, X_val, y_val)

dtest = xgb.DMatrix(test[features])
yhat = bst.predict(dtest)

output = pd.DataFrame({'id': test.id, 'flag': yhat})
output = output[['id', 'flag']]
output.to_csv('output-{}.txt'.format(getNextVer(r'output-(\d).txt')),
              index=False, columns=None, header=False, sep='\t')
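# Since bst is a plain xgboost Booster, per-feature importances can be inspected
# right after training; a small follow-up sketch (the choice of importance_type
# is illustrative, not from the original code).
importance = bst.get_score(importance_type='gain')
for name, gain in sorted(importance.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(name, round(gain, 4))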