# Convert the list of transactions into a list of lists
def extractAsLists(lst):
    res = []
    for el in lst:
        sub = el.split(',')
        res.append(sub)

    return res

dataset = extractAsLists(dataset)
print(dataset)
    



import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
# df.to_csv(dir_path+'AprioriDatasampledata.csv',encoding='utf-8',index=False)


from mlxtend.frequent_patterns import apriori

frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
print(frequent_itemsets) 


# result = frequent_itemsets[ (frequent_itemsets['length'] == 3) &
#                    (frequent_itemsets['support'] >= 0.1) ]
# print(frequent_itemsets)
# result.to_csv(dir_path+'Aprioriresult.csv',encoding='utf-8',index=False)
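# A possible next step (a sketch, not part of the original script): turn the
# frequent itemsets into association rules with mlxtend's association_rules.
# The metric and the 0.7 threshold below are illustrative choices.
from mlxtend.frequent_patterns import association_rules

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
print(rules[['antecedents', 'consequents', 'support', 'confidence']])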
Example #2
    def alpha_contribution(self,
                           neutralize='Raw',
                           W_mat="Equal",
                           stock_pool='Astock'):
        """
        利用Barra风险模型对alpha因子进行归因
        严格的,对于不同股票池的风险因子的因子收益率应该重新计算
        但是这里暂时用的还是全市场的风险因子收益率
        """

        type_list = ['COUNTRY', 'STYLE', 'INDUSTRY']
        stock_return = Barra().get_stock_return().T
        risk_return = Barra().get_factor_return(type_list=type_list)

        for i_date in range(len(self.change_date_series) - 1):

            # date
            ####################################################################################################
            date = self.change_date_series[i_date]
            bg_date = Date().get_trade_date_offset(date, 1)
            ed_date = self.change_date_series[i_date + 1]

            # data
            ####################################################################################################
            stock_return_period = stock_return.loc[bg_date:ed_date, :]
            stock_return_period = stock_return_period.T.dropna().T
            stock_return_period = pd.DataFrame(
                stock_return_period.sum(skipna=True))
            stock_return_period.columns = ['Pct']

            risk_return_period = risk_return.loc[bg_date:ed_date, :]
            risk_return_period = pd.DataFrame(
                risk_return_period.sum(skipna=True))
            risk_return_period.columns = [date]

            exposure_date = Barra().get_factor_exposure_date(date, type_list)

            fmp = self.get_fmp(neutralize, W_mat, stock_pool)
            fmp_date = pd.DataFrame(fmp[date])
            fmp_date.columns = ['FmpWeight']
            fmp_date = fmp_date.dropna()

            code_list = list(
                set(exposure_date.index) & set(fmp_date.index)
                & set(stock_return_period.index))
            code_list.sort()

            exposure_date = exposure_date.loc[code_list, :]
            fmp_date = fmp_date.loc[code_list, :]
            stock_return_period = stock_return_period.loc[code_list, :]

            if len(fmp_date) > self.min_stock_num:

                # risk-factor contribution: the FMP's exposure on each risk factor times that factor's return
                ####################################################################################################
                fmp_exposure = np.dot(fmp_date.T, exposure_date)
                fmp_exposure = pd.DataFrame(fmp_exposure,
                                            index=[date],
                                            columns=exposure_date.columns)

                fmp_risk_factor = fmp_exposure.mul(risk_return_period.T)
                fmp_alpha_factor = np.dot(fmp_date.T, stock_return_period)

                col = list(fmp_risk_factor.columns)

                fmp_risk_factor.loc[date, 'Res_Alpha'] = fmp_alpha_factor[0][
                    0] - fmp_risk_factor.sum().sum()
                fmp_risk_factor.loc[date, 'Industry'] = fmp_risk_factor[
                    self.industry_factor_name].sum().sum()
                fmp_risk_factor.loc[date, 'Style'] = fmp_risk_factor[
                    self.style_factor_name].sum().sum()
                fmp_risk_factor.loc[date, 'Raw_Alpha'] = fmp_alpha_factor[0][0]

                col.insert(0, 'Res_Alpha')
                col.insert(0, 'Industry')
                col.insert(0, 'Style')
                col.insert(0, 'Raw_Alpha')
                fmp_risk_factor = fmp_risk_factor[col]
                print("Contribution for %s %s %s %s %s" %
                      (self.alpha_factor_name, neutralize, W_mat, date,
                       stock_pool))

                # 4 concat
                ####################################################################################################
                if i_date == 0:
                    fmp_risk_factor_all = fmp_risk_factor
                    fmp_exposure_all = fmp_exposure
                else:
                    fmp_risk_factor_all = pd.concat(
                        [fmp_risk_factor_all, fmp_risk_factor], axis=0)
                    fmp_exposure_all = pd.concat(
                        [fmp_exposure_all, fmp_exposure], axis=0)

        # summary
        ####################################################################################################
        sub_path = os.path.join(self.path, 'summary')
        fmp_risk_summary = pd.DataFrame()
        fmp_risk_summary['Contribution'] = fmp_risk_factor_all.mean(
        ) * self.annual_number
        fmp_risk_summary['IR'] = fmp_risk_factor_all.mean(
        ) / fmp_risk_factor_all.std() * np.sqrt(self.annual_number)

        risk_return_mean = pd.DataFrame(risk_return.mean()) * 250
        risk_return_mean.columns = ['Factor Return']

        exposure_mean = pd.DataFrame(fmp_exposure_all.mean())
        exposure_mean.columns = ['Avg Exposure']

        exposure = pd.concat([risk_return_mean, exposure_mean], axis=1)

        # write excel
        ####################################################################################################
        filename = os.path.join(
            sub_path, '%s_%s_%s_%s_Summary.xlsx' %
            (self.alpha_factor_name, neutralize, W_mat, stock_pool))
        sheet_name = "Contribution"

        we = WriteExcel(filename)
        ws = we.add_worksheet(sheet_name)

        num_format_pd = pd.DataFrame([],
                                     columns=fmp_risk_summary.columns,
                                     index=['format'])
        num_format_pd.loc['format', :] = '0.00'
        we.write_pandas(fmp_risk_summary,
                        ws,
                        begin_row_number=0,
                        begin_col_number=1,
                        num_format_pd=num_format_pd,
                        color="blue",
                        fillna=True)

        num_format_pd = pd.DataFrame([],
                                     columns=exposure.columns,
                                     index=['format'])
        num_format_pd.loc['format', :] = '0.00'
        num_format_pd.loc['format', 'Avg Exposure'] = '0.00%'
        we.write_pandas(exposure,
                        ws,
                        begin_row_number=4,
                        begin_col_number=2 + len(fmp_risk_summary.columns),
                        num_format_pd=num_format_pd,
                        color="blue",
                        fillna=True)
        we.close()

        # Write Csv
        ####################################################################################################
        sub_path = os.path.join(self.path, 'fmp_risk_factor')
        filename = os.path.join(
            sub_path, '%s_%s_%s_%s_RiskContributionFMP.csv' %
            (self.alpha_factor_name, neutralize, W_mat, stock_pool))
        fmp_risk_factor_all.to_csv(filename)
        sub_path = os.path.join(self.path, 'fmp_exposure')
        filename = os.path.join(
            sub_path, '%s_%s_%s_%s_RiskExposureFMP.csv' %
            (self.alpha_factor_name, neutralize, W_mat, stock_pool))
        fmp_exposure_all.to_csv(filename)
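# A minimal sketch (not part of the original class) of the attribution identity
# used above: the FMP's contribution on each risk factor is its exposure times
# that factor's return, and the residual alpha is the total FMP return minus
# the sum of the risk-factor contributions. All names and numbers below are
# illustrative.
import numpy as np
import pandas as pd

weights = pd.Series([0.5, -0.2, 0.7], index=['s1', 's2', 's3'])       # FMP weights
exposure = pd.DataFrame([[1.0, 0.2], [1.0, -0.1], [1.0, 0.5]],
                        index=['s1', 's2', 's3'], columns=['COUNTRY', 'SIZE'])
factor_ret = pd.Series([0.01, 0.02], index=['COUNTRY', 'SIZE'])       # period factor returns
stock_ret = pd.Series([0.03, 0.01, 0.04], index=['s1', 's2', 's3'])   # period stock returns

fmp_exposure = weights @ exposure            # portfolio exposure per risk factor
risk_contrib = fmp_exposure * factor_ret     # contribution per risk factor
raw_alpha = weights @ stock_ret              # total FMP return
res_alpha = raw_alpha - risk_contrib.sum()   # residual (pure alpha) part
print(risk_contrib, res_alpha)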
Example #3
    def get_data_date(self, date, stock_pool='Astock'):

        # alpha data date
        ####################################################################################################
        alpha_date_list = list(self.alpha_data.columns)
        alpha_date_list = list(filter(lambda x: x <= date, alpha_date_list))

        alpha_date = pd.DataFrame(self.alpha_data[max(alpha_date_list)])
        alpha_date.columns = [self.alpha_factor_name]

        # industry data date
        ####################################################################################################
        risk_factor_name = []
        type_list = ['INDUSTRY']
        barra_industry_date = Barra().get_factor_exposure_date(
            date=date, type_list=type_list)
        industry_columns = barra_industry_date.columns
        risk_factor_name.extend(industry_columns)
        self.industry_factor_name = industry_columns
        self.risk_factor_name = risk_factor_name

        # style data date
        ####################################################################################################
        type_list = ['STYLE']
        barra_style_date = Barra().get_factor_exposure_date(
            date=date, type_list=type_list)
        barra_style_date = FactorPreProcess().standardization(barra_style_date)
        style_columns = barra_style_date.columns
        risk_factor_name.extend(style_columns)
        self.style_factor_name = style_columns
        self.risk_factor_name = risk_factor_name

        # free mv date
        ####################################################################################################
        free_mv_date = pd.DataFrame(self.free_mv_data[date])
        free_mv_date.columns = ['FreeMv']

        # ipo date
        ####################################################################################################
        ipo_date = Stock().get_ipo_date()
        ipo_date_new = Date().get_trade_date_offset(date, -120)
        ipo_date = ipo_date[ipo_date['IPO_DATE'] < ipo_date_new]
        ipo_date = ipo_date[ipo_date['DELIST_DATE'] > date]

        # trade status date
        ####################################################################################################
        trade_status_date = pd.DataFrame(self.trade_status[date])
        trade_status_date.columns = ['TradeStatus']

        code_trade = pd.concat([ipo_date, trade_status_date], axis=1)
        code_trade = code_trade.dropna()
        code_trade = code_trade[code_trade['TradeStatus'] == 0.0]

        if stock_pool != 'Astock':
            from quant.stock.index import Index
            index_weight = Index().get_weight(stock_pool, date)
            if index_weight is not None:
                code_trade = pd.concat([code_trade, index_weight], axis=1)
                code_trade = code_trade.dropna()
            else:
                code_trade = pd.DataFrame([])

        all_data = pd.concat([
            alpha_date, barra_industry_date, barra_style_date, free_mv_date,
            code_trade
        ],
                             axis=1)
        all_data = all_data.dropna()

        alpha_date = pd.DataFrame(all_data[self.alpha_factor_name])
        alpha_date = FactorPreProcess().remove_extreme_value_mad(alpha_date)
        alpha_date = FactorPreProcess().standardization(alpha_date)

        barra_industry_date = pd.DataFrame(all_data[self.industry_factor_name])
        columns = barra_industry_date.columns[barra_industry_date.sum() > 10.0]
        barra_industry_date = barra_industry_date[columns]

        barra_style_date = pd.DataFrame(all_data[self.style_factor_name])
        barra_style_date = FactorPreProcess().standardization(barra_style_date)

        free_mv_date = pd.DataFrame(all_data['FreeMv'])
        code_trade = pd.DataFrame(all_data['TradeStatus'])

        return alpha_date, barra_industry_date, barra_style_date, free_mv_date, code_trade
df['ADX'] = ta.ADX(np.array(df['high']), np.array(df['low']),
                   np.array(df['close']), timeperiod=n)
df['Return'] = np.log(df['Open'] / df['Open'].shift(1))

print(df.head())

df = df.dropna()

ss = StandardScaler()

# `split` and `unsup` are not defined in this snippet; the attributes used
# below (predict, means_, covariances_) suggest a Gaussian mixture model,
# so the following definitions are assumed:
split = int(0.8 * len(df))
from sklearn.mixture import GaussianMixture
unsup = GaussianMixture(n_components=4, covariance_type='spherical',
                        n_init=10, random_state=42)

unsup.fit(np.reshape(ss.fit_transform(df[:split]), (-1, df.shape[1])))
regime = unsup.predict(np.reshape(ss.fit_transform(df[split:]),
                                  (-1, df.shape[1])))

Regimes = (pd.DataFrame(regime, columns=['Regime'], index=df[split:].index)
           .join(df[split:], how='inner')
           .assign(market_cu_return=df[split:].Return.cumsum())
           .reset_index(drop=False)
           .rename(columns={'index': 'Date'}))

order = [0,1,2,3]
fig = sns.FacetGrid(data=Regimes, hue='Regime', hue_order=order, aspect=2, height=4)
fig.map(plt.scatter, 'Date', 'market_cu_return', s=4).add_legend()
plt.show()

for i in order:
    print('Mean for regime %i:' % i, unsup.means_[i][0])
    print('Covariance for regime %i:' % i, unsup.covariances_[i])

ss1 = StandardScaler()
columns = Regimes.columns.drop(['Regime', 'Date'])
Regimes[columns] = ss1.fit_transform(Regimes[columns])
Example #5
def input_fn():
    return tf.data.Dataset.from_tensor_slices(
        dict(pd.DataFrame(d, columns=shap_dataset.columns))).batch(1)
def get_vcf_annotations(df, sample_name, split_columns='', drop_hom_ref=True):
        '''
        This function adds the following annotations for each variant:
        multiallele, phase, a1, a2, GT1, GT2, vartype1, vartype2, zygosity,
        and parsed FORMAT values, see below for additional information.
                        
        Parameters
        --------------
        sample_name: str, required 
                    sample column header id, e.g. NA12878
        
        split_columns: dict, optional
                    key:FORMAT id value:#fields expected
                    e.g. {'AD':2} indicates Allelic Depth should be
                    split into 2 columns.
        
        drop_hom_ref: bool, optional
                    specifies whether to drop all homozygous reference
                    variants from dataframe.
                    FALSE REQUIRES LARGE MEMORY FOOTPRINT
        
        Output
        --------------
        This function adds the following annotations to each variant:
        
        multiallele: {0,1} 0=biallele  1=multiallelic
    
        phase: {'/', '|'} /=unphased, |=phased
        
        a1: DNA base representation of allele1 call, e.g. A
        a2: DNA base representation of allele2 call, e.g. A
        
        GT1: numeric representation of allele1 call, e.g. 0
        GT2: numeric representation of allele2 call, e.g. 1
        
        vartype1: {snp, mnp, ins, del, indel or SV} variant type of first allele
        vartype2: {snp, mnp, ins, del, indel or SV} variant type of second allele
        
        zygosity: {het-ref, hom-ref, alt-ref, het-miss, hom-miss}
        
        FORMAT values: any values associated with the genotype calls are 
                        added as additional columns, split_columns are further
                        split by ',' into individual columns
        '''

        df['multiallele'] = df.ALT.str.count(',')
        multidf = df[df['multiallele'] > 0]
            
        if len(df) + len(multidf) > 0:  # the body always returns, so this runs at most once
            
            df = df[~df.index.isin(multidf.index)]
            #print len(multidf), 'multidf rows'
            
            if len(multidf) > 0:
                multidf = get_multiallelic_bases(multidf, sample_name, single_sample_vcf=False)
            
            
            #print 'single alleles', len(df)
            
            df = get_biallelic_bases(df, sample_name)
            
            
            if len(multidf) > 0:
                df = pd.concat([df, multidf])
            
            
            df = zygosity_fast(df)
            
            
            df['vartype1'] = list(map(vartype_map, df[['REF', 'a1']].values))
            df['vartype2'] = list(map(vartype_map, df[['REF', 'a2']].values))
            
            df.set_index(['CHROM', 'POS', 'REF', 'ALT', 'sample_ids'], inplace=True)
            
            #df.sortlevel(level=['CHROM','POS','REF','ALT','sample_ids'],inplace=True)  #sorting biallelic and multiallele variants 
            
            #print 'before parse_single_genotype_data', len(df)
            
            
            #df = df.join( parse_single_genotype_data(df, sample_name, split_cols=split_columns), how='left' )
            df['GT'] = df['sample_genotypes']
            del df[sample_name]
            if 'FORMAT' in df.columns:
                del df['FORMAT']
            
            df.reset_index(level=4, inplace=True, drop=True)
            df.set_index('GT', inplace=True, drop=False,append=True)
            #print df
            return df
    
        return pd.DataFrame()
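# A hypothetical usage sketch (not from the original source): given a VCF
# already parsed into a DataFrame with CHROM/POS/REF/ALT/FORMAT columns and
# one column per sample, annotate the calls for sample NA12878. The reader
# function, file name, and AD split width are illustrative assumptions.
# vcf_df = read_vcf('example.vcf')  # any VCF-to-DataFrame reader
# annotated = get_vcf_annotations(vcf_df, 'NA12878',
#                                 split_columns={'AD': 2},  # split Allelic Depth into AD_0, AD_1
#                                 drop_hom_ref=True)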
Example #7
def norm_df(df):
    return pd.DataFrame(scaler.transform(df),
                        columns=df.columns,
                        index=df.index)
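# A minimal sketch of the assumed context (not from the original snippet):
# norm_df closes over a scaler fitted elsewhere, e.g.:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler().fit(train_df)
# train_norm = norm_df(train_df)
# test_norm = norm_df(test_df)   # reuses the training-set statistics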
def main(args):

  # Figure out the datatype we will use; this will determine whether we run on
  # CPU or on GPU. Run on GPU by adding the command-line flag --use_gpu
  dtype = torch.FloatTensor
  if args.use_gpu:
    dtype = torch.cuda.FloatTensor

  # Set up a transform to use for validation data at test-time. For validation
  # images we will simply resize so the smaller edge has 224 pixels, then take
  # a 224 x 224 center crop. We will then construct an ImageFolder Dataset object
  # for the validation data, and a DataLoader for the validation set.
  test_transform = T.Compose([
    T.Scale(224),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
  ])
  test_dset = MultiLabelImageFolderTest(args.test_dir, transform=test_transform)
  test_loader = DataLoader(test_dset,
                  batch_size=args.batch_size,
                  num_workers=args.num_workers)

  def transform_target_to_1_0_vect(target):
    vect = np.zeros((17,))
    vect[target] = 1
    return vect
  
  # Now that we have set up the data, it's time to set up the model. Instead
  # of finetuning a fresh densenet-169, this script loads a pretrained CNN
  # encoder and an RNN decoder from saved checkpoints.
  #model = torchvision.models.densenet169(pretrained=True)
  encoder = EncoderCNN(dtype, model_type = 'densenet')
  encoder.load_state_dict(torch.load(args.cnn_load_path))
  encoder.type(dtype)
  encoder.eval()
  #decoder = DecoderRNN(args.label_embed_size, args.lstm_hidden_size, encoder.output_size, 17, args.combined_hidden_size)
  decoder = DecoderCaptionRNN(args.label_embed_size, args.lstm_hidden_size, encoder.output_size, 17)
  decoder.load_state_dict(torch.load(args.rnn_load_path))
  decoder.type(dtype)
  decoder.eval()

  # The dataset has 17 classes; the decoder uses index 17 as an
  # end-of-sequence marker (see the sampling loop below).
  num_classes = 17

  classes = find_classes(args.label_list_file)

  y_pred = np.zeros((len(test_dset), 17))
  filenames_list = []
  predictions = []

  count = 0
  for x, filenames in test_loader:
    print_progress(count, len(test_dset), 'Running example')

    x_var = Variable(x.type(dtype), volatile = True)
    preds = decoder.sample(encoder(x_var))

    for i in range(preds.size(0)):
        pred = preds[i].data.cpu().numpy().tolist()
        if 17 in pred:
            ind = pred.index(17)
            pred = pred[:ind]
        predictions.append(' '.join([classes[j] for j in pred]))

    filenames_list += filenames
    count += x.size(0)

  subm = pd.DataFrame()
  subm['image_name'] = filenames_list
  subm['tags'] = predictions
  subm.to_csv(args.sub_file, index=False)
import pandas as pd
import numpy as np

print "create Series Object."
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print s
print
print

print "Create DataFrame Object."
dates = pd.date_range('20130101', periods=6)
print dates
print
print

df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print df
print
print

df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo'
})
print df2
print
print
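# A natural next step in this walkthrough (added as a sketch): inspect the
# mixed column types of df2.
print(df2.dtypes)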
import pytest
import pandas as pd
from pandas.util.testing import assert_frame_equal

import corna.inputs.maven_parser as fp

label_df = pd.DataFrame({
    'Name': [1, 2, 3, 4, 5],
    'Formula': [1, 2, 3, 4, 5],
    'Sample': [1, 2, 3, 4, 5],
    'Label': [
        'C13_0_N15_0', 'C13_1_N15_0', 'C13_0_N15_1', 'C13_2_N15_1',
        'N15_1_C13_2'
    ]
})

incomplete_maven_df = pd.DataFrame({
    'Metabolite Name': [1, 2, 3, 4, 5],
    'Formula': [1, 2, 3, 4, 5],
    'sample_1': [1, 2, 3, 4, 5],
    'sample_2': [1, 2, 3, 4, 5],
    'Label': [
        'C13_0_N15_0', 'C13_1_N15_0', 'C13_0_N15_1', 'C13_2_N15_1',
        'N15_1_C13_2'
    ]
})

label_df_maven_format = pd.DataFrame({
    'Name': [1, 2, 3, 4, 5],
    'Formula': [1, 2, 3, 4, 5],
    'Sample': [1, 2, 3, 4, 5],
    'Label': [
        'C13_0_N15_0', 'C13_1_N15_0', 'C13_0_N15_1', 'C13_2_N15_1',
        'N15_1_C13_2'
    ]
})

# NOTE: the beginning of the next snippet was lost. Judging from the trailing
# fragment and the code below, `samples` (like `potentials`) is reshaped to
# (simulations, hidden_dim); the statement is reconstructed here:
samples = np.reshape(samples, newshape=(simulations, hidden_dim))

# Reshape back to [2*counter, 20*hidden_dim]
x = np.concatenate((potentials, samples), axis=0)
y = np.append(['potentials' for x in range(simulations)],
              ['samples' for x in range(simulations)])

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

tsne = TSNE(n_components=2, verbose=1, perplexity=40.0, n_iter=300, n_jobs=-1)
tsne_results = tsne.fit_transform(X=x)

feat_cols = ['point' + str(i) for i in range(x.shape[1])]
df = pd.DataFrame(x, columns=feat_cols)
df['y'] = y
df['tsne-one'] = tsne_results[:, 0]
df['tsne-two'] = tsne_results[:, 1]

plt.figure(dpi=600, figsize=(16, 10))
ax = sns.scatterplot(x='tsne-one',
                     y='tsne-two',
                     hue='y',
                     palette=['dodgerblue', 'red'],
                     data=df,
                     legend="full",
                     alpha=1,
                     s=30)
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles[1:], labels=labels[1:])
Example #12
# Read the data file with Spark and create a DataFrame
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
import pandas as pd

spark = SparkSession.builder.config(conf=SparkConf()).getOrCreate()

df = spark.read.csv("/BigData/trip_data_1.csv",
                    header=True,
                    inferSchema=True)

df = df.repartition(1)

a_1 = df.filter(df["passenger_count"]==1).count()
a_2 = df.filter(df["passenger_count"]==2).count()
a_3 = df.filter(df["passenger_count"]==3).count()
a_4 = df.filter(df["passenger_count"]==4).count()

Count = [{'count_type': '1人', 'count': a_1},
         {'count_type': '2人', 'count': a_2},
         {'count_type': '3人', 'count': a_3},
         {'count_type': '4人', 'count': a_4}]

count_pd = pd.DataFrame(Count)
count_pd.to_csv("Count_pie.csv")
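# The four separate filter/count passes above scan the data once each; an
# equivalent single-pass alternative (a sketch, not part of the original
# script) groups by passenger_count instead:
counts = (df.filter(df["passenger_count"].isin([1, 2, 3, 4]))
            .groupBy("passenger_count")
            .count())
counts.show()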

U_whole=U.copy()


# In[68]:


U_whole.nodes()


# Descriptive Statistics

# In[33]:


# no of nodes
list_nodes = pd.DataFrame(list(U_whole.nodes()))
#list_nodes.to_csv('list_nodes.csv')
len(list_nodes)


# In[34]:


# no. of edges
list_edges = pd.DataFrame(list(U_whole.edges()))
list_edges.to_csv('list_edges.csv')
#len(list_edges)
list_edges


# In[35]:
Example #14
# Vaccine
vaccines = ['Moderna', 'Pfizer', 'JohnsonJohnson']
vaccine = {}
vaccine['Patient'] = personStats['Person']
vaccine['Vaccinated'] = [random.randrange(0, 2) for _ in range(number_records)]
vaccine['VaccineReceived'] = [
    random.choice(vaccines)
    if vaccine['Vaccinated'][i] == 1 else 'Not Applicable'
    for i in range(number_records)
]
doseNumber = []
for i in range(number_records):
    if vaccine['Vaccinated'][i] == 1:
        if 'Moderna' in vaccine['VaccineReceived'][i] or 'Pfizer' in vaccine[
                'VaccineReceived'][i]:
            doseNumber.append(random.randrange(1, 3))
        else:
            doseNumber.append(1)
    else:
        doseNumber.append(-1)
vaccine['DoseNumber'] = doseNumber
vaccine['Symptomatic'] = [
    random.randrange(0, 2) if vaccine['Vaccinated'][i] == 1 else -1
    for i in range(number_records)
]

# Adding data to csv file
merge = {**country, **person, **countryStats, **personStats, **vaccine}
complete_df = pd.DataFrame(merge)
complete_df.to_csv(file_name, index=False)
def get_multiallelic_bases(df_orig, sample_col, single_sample_vcf=True):
    '''
    This function parses multiallele variants into DNA base representations.
    It currently does not support haploid chromosomes.
    '''
    
    haploid_chromosomes = ['X', 'chrX', 'Y', 'chrY', 'M', 'chrM']

    df = df_orig.copy()
    haploid_df = pd.DataFrame()  # ensure defined for the length checks at the end
    
    def get_phase(line, sample_id):
        '''
        Returns phase from genotype
        '''
        genotype = str(line[sample_id])
        if "|" in genotype:
            return "|"
        if "/" in genotype:
            return "/"
        else:
            return '-'
    
    
    def _get_allele(line, gt_col):
        '''
        Returns allele base call from multi-allelic variants
        '''
        
        alleles = [line['REF']]
        alleles.extend(list( line['ALT'].split(",")) )
        a1 = "."
        try:
            a1 = alleles[int(line[gt_col])]  # returns missing if the GT call is "."
        except (ValueError, IndexError):
            a1 = "."
        return a1



    def get_GT_multisample_vcf(line, sample_col, gt_index):
        return line[sample_col].split(':')[0].split(line['phase'])[int(gt_index)]
    
    
    
    def get_GT_multisample_vcf_haploid(line, sample_col, gt_index):
        return str(line[sample_col]).split(':')[0]

    
#    if single_sample_vcf:
#        df['phase'] = df[sample_col].str[1]
#        df = df[df['phase']!=':']  #removing haploid variants
#        
#        df['GT1'] = df[sample_col].str[0]
#        df = df[df['GT1']!='.']  #removing variants with missing calls
#        df['GT1'] = df['GT1'].astype(int)
#        
#        df['GT2'] = df[sample_col].str[2]
#        df = df[df['GT2']!='.']  #removing variants with missing calls
#        df['GT2'] = df['GT2'].astype(int)


    if not single_sample_vcf:
        df['phase'] = df.apply(get_phase, args=[sample_col], axis=1)  #get phase
        haploid_df = df[df.phase == "-"]  #likely occurs at sex chromosome sites
        haploid_df = haploid_df[haploid_df['CHROM'].isin(haploid_chromosomes)]
        
        if len(haploid_df) > 0:
            haploid_df['GT1'] = haploid_df.apply(get_GT_multisample_vcf_haploid, args=[sample_col, 0], axis=1)
            haploid_df = haploid_df[(haploid_df['GT1'] != '.') & haploid_df['GT1'].notna()]
            haploid_df['GT1'] = haploid_df['GT1'].astype(int)
            haploid_df['GT2'] = 0
            haploid_df['a1'] = haploid_df.apply(_get_allele, args=['GT1'], axis=1)
            haploid_df['a2'] = haploid_df.apply(_get_allele, args=['GT2'], axis=1)
        
        df = df[df.phase != "-"]
        if len(df) > 0:
            
            df['GT1'] = df.apply(get_GT_multisample_vcf, args=[sample_col, 0], axis=1)
            df = df[(df['GT1'] != '.') & df['GT1'].notna()]
            df['GT1'] = df['GT1'].astype(int)
            
            
            df['GT2'] = df.apply(get_GT_multisample_vcf, args=[sample_col, 1], axis=1)
            df = df[(df['GT2'] != '.') & df['GT2'].notna()]
            df['GT2'] = df['GT2'].astype(int)
            df['a1'] = df.apply(_get_allele, args=['GT1'], axis=1)
            df['a2'] = df.apply(_get_allele, args=['GT2'], axis=1)
    
    #if len(df_multi) > 0:
    #    df = df.append(df_multi)
    
        
    #df['a1'] = df.apply(_get_allele, args=['GT1'], axis=1)
    #df['a2'] = df.apply(_get_allele, args=['GT2'], axis=1)


    if len(df) > 0:
        if len(haploid_df) > 0:
            df = pd.concat([df, haploid_df])  # adding haploid variants to the dataframe
            return df
        else:
            return df
    if len(haploid_df) > 0:
        return haploid_df
    else:
        return pd.DataFrame()
    cur_path = os.path.join(output_dir, s)
    if not os.path.exists(cur_path): os.makedirs(cur_path)

    print('Extracting acoustic feature...', flush=True)
    tr_x = Parallel(n_jobs=paras.n_jobs)(delayed(extract_feature)(str(file),feature=paras.feature_type,dim=paras.feature_dim,\
            cmvn=paras.apply_cmvn,delta=paras.apply_delta,delta_delta=paras.apply_delta_delta,save_feature=os.path.join(cur_path,str(file).split('/')[-1].replace('.flac',''))) for file in tqdm(todo))

    # sort by len
    sorted_y = [
        '_'.join([str(i) for i in tr_y[idx]])
        for idx in reversed(np.argsort(tr_x))
    ]
    sorted_todo = [
        os.path.join(s,
                     str(todo[idx]).split('/')[-1].replace('.flac', '.npy'))
        for idx in reversed(np.argsort(tr_x))
    ]
    # Dump label
    df = pd.DataFrame(
        data={
            'file_path': [fp for fp in sorted_todo],
            'length': list(reversed(sorted(tr_x))),
            'label': sorted_y
        })
    df.to_csv(os.path.join(output_dir, s + '.csv'))
    # train
    with open(os.path.join(output_dir, "mapping.pkl"), "wb") as fp:
        pickle.dump(encode_table, fp)

print('All done, saved at', output_dir, 'exit.')
def process_variant_annotations(df_vars_split_cols_sample_id_drop_hom_ref):
    '''
    This function stacks a pandas vcf dataframe and adds annotations
    
    '''
    df_vars, split_columns, sample_id, drop_hom_ref = df_vars_split_cols_sample_id_drop_hom_ref
    df_groups = df_vars.groupby('FORMAT')
    
    parsed_df = []
    for format,df_format in df_groups:  #iterate through different FORMAT types
        
        
        df_format = df_format[df_format['ALT'] != '.']  #dropping missing ALT alleles
        df_format = df_format[sample_id]  #only consider sample columns
        df_format = df_format.replace(to_replace='.', value=np.NaN)  #replacing missing calls with None
        
        
        df_format = pd.DataFrame( df_format.stack(), columns=['sample_genotypes'] )  #stacks sample calls and drops none calls
        
        if len(df_format) <1:  #occurs when all calls are empty
            continue  
        
        #SAVE QUALITY INFORMATION SEPARATELY TO AVOID RE-ANNOTATING IDENTICAL GENOTYPE CALLS (DIFFERENT QUALITY DOESN'T MATTER)
        if format.count(':') > 0:
            df_qual = pd.DataFrame(list(df_format['sample_genotypes'].str.split(':') ), index=df_format.index) #qual df, setting aside for later joining
            #print df_format.head(), format.split(':')
            df_qual.columns = format.split(':')  #setting quality column names
            df_qual.index.names = ['CHROM', 'POS', 'REF', 'ALT', 'sample_ids']  #setting index names for joining with df_format later
            df_format['sample_genotypes'] = df_qual[format.split(':')[0]]  #setting just the GT calls
            del df_qual['GT']  #removing from df_qual to avoid joining problems with df_format after add_annotations
            
        
        
        
        #DROPPING MISSING CALLS
        df_format = df_format[ (df_format['sample_genotypes']!='./.') & (df_format['sample_genotypes']!='.|.') \
                              & (df_format['sample_genotypes']!='.') ]
        
        #SETTING INDICES
        df_format.index.names = ['CHROM', 'POS', 'REF', 'ALT', 'sample_ids']  #setting index names
        df_format.reset_index(inplace=True)
        
        
        #ONLY NEED TO PASS UNIQUE GENOTYPE CALLS DF TO get_vcf_annotations, then broadcast back to df_format
        df_annotations = df_format.drop_duplicates(subset=['CHROM', 'POS', 'REF', 'ALT', 'sample_genotypes'])
        df_annotations['FORMAT'] = format.split(':')[0]  #setting format id
        df_annotations.set_index(['CHROM', 'POS', 'REF', 'ALT', 'sample_genotypes'], drop=False, inplace=True)
        df_annotations = get_vcf_annotations(df_annotations, 'sample_genotypes', split_columns=split_columns)  #getting annotations
        
        
        #SETTING INDICES AGAIN
        if len(df_annotations) < 1: continue  #continue if no variants within this FORMAT category
        df_format.set_index(['CHROM', 'POS', 'REF', 'ALT', 'sample_genotypes'], drop=True, inplace=True)
        df_annotations.index.names = ['CHROM', 'POS', 'REF', 'ALT', 'sample_genotypes']
        df_format = df_format.join(df_annotations)
        
        
        
        #df_format.set_index('sample_ids', drop=True, inplace=True, append=True)
        df_format['FORMAT'] = format
        df_format.reset_index(level=4, inplace=True, drop=False)
        
        
        if drop_hom_ref:
            hom_ref_counts = get_hom_ref_counts(df_format)
            hom_ref_counts.name = 'hom_ref_counts'
            df_format = df_format[df_format['zygosity']!='hom-ref']  #dropping all homozygous reference variants
            df_format = df_format.join(hom_ref_counts)
            df_format['hom_ref_counts'].fillna(value=0, inplace=True)
        
        del df_format['sample_genotypes']
        df_format.set_index('sample_ids', inplace=True, append=True, drop=True)
        
        
        
        ##JOINING QUAL INFO BACK TO DF
        if format.count(':') > 0 and len(df_qual) > 0:
            df_format = df_format.join(df_qual, how='left')
            pass
        
        #SPLITTING GENOTYPE QUALITY COLUMNS
        if split_columns != '':
            for col in split_columns:
                split_col_names = [col + '_' + str(n) for n in range(0, split_columns[col]) ]
                df_format = df_format.join(pd.DataFrame(list(df_format[col].str.split(',').str[:len(split_col_names)]), index=df_format.index, columns=split_col_names))
                del df_format[col]
        
        
        parsed_df.append(df_format)
        
        
    if len(parsed_df) > 0:  
        df_annot = pd.concat(parsed_df)
        return df_annot
    else:
        print('No Annotations generated, please check for excessive missing values')
        return df_vars
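# A hypothetical usage sketch (names assumed, not from the original source):
# the function takes all of its arguments packed into one tuple, which makes
# it convenient to use with multiprocessing.Pool.map:
# args = (vcf_df, {'AD': 2}, ['NA12878'], True)  # df, split_columns, sample ids, drop_hom_ref
# annotated = process_variant_annotations(args)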
Q_ice.interpolate(mp.Q_ice)
Q_mixed.interpolate(mp.Q_mixed)
Q_latent.interpolate(mp.Q_latent)
Q_s.interpolate(mp.S_flux_bc)
melt.interpolate(mp.wb)
Tb.interpolate(mp.Tb)
Sb.interpolate(mp.Sb)
full_pressure.interpolate(mp.P_full)

##########

# Plotting top boundary.
shelf_boundary_points = get_top_boundary(cavity_length=L,
                                         cavity_height=H2,
                                         water_depth=water_depth)
top_boundary_mp = pd.DataFrame()


def top_boundary_to_csv(boundary_points, df, t_str):
    df['Qice_t_' + t_str] = Q_ice.at(boundary_points)
    df['Qmixed_t_' + t_str] = Q_mixed.at(boundary_points)
    df['Qlat_t_' + t_str] = Q_latent.at(boundary_points)
    df['Qsalt_t_' + t_str] = Q_s.at(boundary_points)
    df['Melt_t' + t_str] = melt.at(boundary_points)
    df['Tb_t_' + t_str] = Tb.at(boundary_points)
    df['P_t_' + t_str] = full_pressure.at(boundary_points)
    df['Sal_t_' + t_str] = sal.at(boundary_points)
    df['Temp_t_' + t_str] = temp.at(boundary_points)
    df["integrated_melt_t_ " + t_str] = assemble(melt * ds(4))

    if mesh.comm.rank == 0:
Example #19
# Create income code column for classifying income for a STRATIFIED split
df['income'] = np.ceil(df['median_income'] / 1.5)
df['income'].where(df['income'] < 5, 5.0, inplace=True)

# Sklearn Split object over this category to ensure representative sampling
split = sk.model_selection.StratifiedShuffleSplit(n_splits=1,
                                                  test_size=0.2,
                                                  random_state=42)

# Perform stratified split
for train_i, test_i in split.split(df, df['income']):
    df_train = df.loc[train_i]
    df_test = df.loc[test_i]
original_split_percent = df['income'].value_counts() / len(df) * 100
train_split_percent = df_train['income'].value_counts() / len(df_train) * 100
compare = pd.DataFrame([original_split_percent,
                        train_split_percent]).transpose()
compare.columns = ['original', 'split']
compare['diff'] = compare['original'] - compare['split']

# Drop this stratification category
df_train.drop(["income"], axis=1, inplace=True)
df_test.drop(["income"], axis=1, inplace=True)

logger.info(f"Test: {len(df_test)} records")
logger.info(f"Train: {len(df_train)} records")

y_tr = df_train["median_house_value"].copy()
X_tr = df_train.drop("median_house_value",
                     axis=1)  # drop labels for training set

y_test = df_test["median_house_value"].copy()
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X, Y)
random_forest.score(X, Y)

# *References*:<br>
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html <br>
# https://stats.stackexchange.com/questions/260460/optimization-of-a-random-forest-model<br>
# https://en.wikipedia.org/wiki/Random_forest <br>
# https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm

# ## Final Submission

# In[ ]:

final_test_RF = final_test[cols]
Y_pred_RF = random_forest.predict(final_test_RF)

# In[ ]:

submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": Y_pred_RF
})
submission.to_csv('titanic.csv', index=False)

# **Final References:** <br>
# *Editing Markdowns*: https://medium.com/ibm-data-science-experience/markdown-for-jupyter-notebooks-cheatsheet-386c05aeebed<br>
# *Matplotlib color library:* https://matplotlib.org/examples/color/named_colors.html

# In[ ]:
def download_all_regions() -> pd.DataFrame:
    def nuts_query(nuts_level):
        q = Query.all_regions(nuts=nuts_level)
        return q

    def lau_query(lau_level):
        q = Query.all_regions(lau=lau_level)
        return q

    qb_all = Query.all_regions()

    qe = QueryExecutioner()
    print("start")
    all_regions = qe.run_query(qb_all)
    print("all")
    r_nuts1 = qe.run_query(nuts_query(1))
    print("nuts1")
    r_nuts2 = qe.run_query(nuts_query(2))
    print("nuts2")
    r_nuts3 = qe.run_query(nuts_query(3))
    print("nuts3")
    r_lau1 = qe.run_query(lau_query(1))
    print("lau")
    # currently no distinction between different LAUs
    # on the datenguide side
    # r_lau2 = qe.run_query(lau_query(2))

    levels = {
        "nuts1": r_nuts1,
        "nuts2": r_nuts2,
        "nuts3": r_nuts3,
        "lau": r_lau1,
        # 'lau2':r_lau2
    }

    def isAnscestor(region_id, candidate):
        return region_id.startswith(candidate) and candidate != region_id

    def parent(region_id, region_details):
        desc = region_details.assign(ansc=lambda df: df.index.map(
            lambda i: isAnscestor(region_id, i))).query("ansc")
        max_lev = desc.level.max()  # noqa: F841
        parent_frame = desc.query("level == @max_lev")
        if not parent_frame.empty:
            return parent_frame.iloc[0, :].name
        else:
            return None

    if all_regions is None:
        raise RuntimeError("Was not able to download all regions")

    for k in levels:
        if levels[k] is None:
            raise RuntimeError(f"Was not able to download {k} regions")

    all_regions_df = pd.concat([
        pd.DataFrame(page["data"]["allRegions"]["regions"])
        for page in cast(List[ExecutionResults], all_regions)[0].query_results
    ]).set_index("id")

    level_df = pd.concat(
        pd.concat([
            pd.DataFrame(page["data"]["allRegions"]["regions"]) for page in
            cast(List[ExecutionResults], levels[k])[0].query_results
        ]).assign(level=k) for k in levels)

    all_rg_parents = all_regions_df.join(
        level_df.set_index("id").loc[:, "level"]).assign(
            parent=lambda df: df.index.map(
                partial(
                    parent,
                    region_details=all_regions_df.assign(level=lambda df: df.
                                                         index.map(len)),
                )))
    all_rg_parents.loc[all_rg_parents.level == "nuts1", "parent"] = "DG"

    return all_rg_parents
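# The ancestor test above is purely prefix-based on region ids; a quick
# illustration (made-up ids, not real datenguide data):
# isAnscestor("11000", "11") -> True   ("11" is a proper prefix of "11000")
# isAnscestor("11", "11")    -> False  (a region is not its own ancestor)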
Example #22
brand = list(data['brand'])

naver_data = pd.read_excel(root + '네이버쇼핑_프론트_백팩.xlsx')
print(naver_data.head())
len(naver_data)

match = []

for one_title in naver_data['title']:
    imsi = []
    for one_brand in brand:
        if one_brand in one_title:
            imsi.append(one_title)
            imsi.append(one_brand)
        else:
            pass
    match.append(imsi)

print(len(match))
df = pd.DataFrame(match)
df_imsi = df.loc[:, 0:1]
df_imsi.columns = ['title_02', 'brand']

concat_end = pd.concat([naver_data, df_imsi], axis=1)
#df_all = pd.merge(naver_data, df_imsi)

print(len(concat_end))

dd = concat_end.drop('title_02', axis=1)
dd
def cutting_main(img_list):
    file_path = './crop/' #save_crop_img_path
    if not os.path.isdir(file_path):
        os.makedirs(file_path)
    img_path = './resize_img/' # load_img_path 
    if not os.path.isdir(img_path):
        os.makedirs(img_path)
    csv_save_path = './csv_save/'  # csv files holding the crop coordinates
    if not os.path.isdir(csv_save_path):
        os.makedirs(csv_save_path)

    bground_list = os.listdir(img_path)  # original image paths
    txt_list = os.listdir(csv_save_path)  # coordinate csv paths
    count = len(bground_list)  # number of images in the image directory
    print('Page:' + str(count))
    line_c = []  # csv rows
    main = []  # list holding the box coordinates
    og_main = []

    for j in range(0, count):  # one iteration per page image
        C = []
        point =[]
        del C[:]
        main =[]
        del main[:]
        x = natsort.natsorted(bground_list)
        y = natsort.natsorted(txt_list)

        #============== strip noise from the file name ===============#
        str_path = str(x[j])
        str_path = str_path.replace('.jpg', '')
        str_path = str_path.replace('mask', '')
        str_path = str_path.replace('resize', '')
        for i in range(0, len(links)):
            str_path = str_path.replace(links[i], '')
        for i in range(0, len(l)):
            str_path = str_path.replace(l[i], '')
        str_path = re.sub('_', ' ', str_path)
        str_path = re.sub(' ', '', str_path)
        #============== strip noise from the file name ===============#

        img = Image.open(img_path + x[j])  # open the page image
        
        csv_f = open(csv_save_path+y[j], 'r', encoding='UTF8')
        line_c = csv.reader(csv_f)
        
        for lines in line_c:
            C.append(lines)

        print('Character count: ' + str(len(C)))
        if len(C) < 5:
            continue
        
        appendP = point.append
        appendMain = main.append
        append_OG = og_main.append
        
        for k in range(0,len(C)):
            appendP(C[k])
            real =[int(point[k][0]),int(point[k][1]),int(point[k][2]),int(point[k][3])]
            real2 =[int(point[k][0]),int(point[k][1]),int(point[k][2]),int(point[k][3])] 
            appendMain(real)
            append_OG(real2)
        #====================================================================================#
        # sort for vertical reading order, right to left
        
        main = sorted(main, key=itemgetter(0))  # sort the array
        main = sorting_arry(main)  # custom sorting pass
        main = sorted(main, key=itemgetter(0, 1))
        main = sorted(main, key=itemgetter(0), reverse=True)
        main = same_check(main, og_main)  # check the sorted result and fix misordered boxes
        cutting_img_num = 0

        #====================================================================================#
        # crop each box out of the image
        king_name = "ㅇ"
        for k in range(len(C)):
            arry1 = int(main[k][0])
            arry2 = int(main[k][1])
            arry3 = int(main[k][2])
            arry4 = int(main[k][3])
            #======== create output directories ==========================#
            
            king_path = file_path +str(img_list[j][0])
            book_path = file_path +str(img_list[j][0])+'/'+str(img_list[j][1])+'/'
            dir_path = file_path + str(img_list[j][0]) + '/' + str(img_list[j][1]) + '/' + str(str_path)  # save directory

            if not os.path.isdir(king_path):
                os.mkdir(king_path +'/')  
            if not os.path.isdir(book_path):
                os.mkdir(book_path +'/')  
            if not os.path.isdir(dir_path):
                os.mkdir(dir_path +'/')  

            #======== build the pandas record ==========================#
            replaceA = txt_list[j].replace('.csv', '')
            location = (arry1, arry2)
            a = abs(arry1 - arry3)  # width, stored in the pandas record
            b = abs(arry2 - arry4)  # height, stored in the pandas record
            #======== crop the image ==========================#
            if arry1 < 0:
                arry1 = 0
            s_name = int(cutting_img_num)
            area =(arry1, arry2, arry3, arry4)
            cropped_img = img.crop(area)  # crop the character image

            img_save_path = dir_path+'/'+str(s_name) + '.jpg'

            cropped_img.save(img_save_path)  # save the cropped image
            cutting_img_num = cutting_img_num +1
            get_size = os.path.getsize(img_save_path)
            volume_kb = '%0.2f' % (get_size/1024)

            if str(img_list[j][0]) != king_name:
                m2 =[]
            
            m2.append([img_save_path,replaceA, location,a,b,volume_kb+'KB'])
            if not os.path.isdir('./'+ str(img_list[j][0])):
                os.mkdir('./'+ str(img_list[j][0]) +'/')  
            df2 = pd.DataFrame(m2, columns=['save_path', 'filename', 'Location', 'w', 'h', 'Volume(KB)'])  # save path, file name, character location in the original image, width, height, file size
            df2.to_csv('./' + str(img_list[j][0]) + '/' + str(img_list[j][0]) + '_data.csv', encoding='euc-kr')  # save one csv per king
            king_name = str(img_list[j][0])
Example #24
        ("period[D]", PeriodDtype(freq="D")),
        (IntervalDtype(), IntervalDtype()),
    ],
)
def test__get_dtype(input_param, result):
    assert com._get_dtype(input_param) == result


@pytest.mark.parametrize(
    "input_param,expected_error_message",
    [
        (None, "Cannot deduce dtype from null object"),
        (1, "data type not understood"),
        (1.2, "data type not understood"),
        ("random string", 'data type "random string" not understood'),
        (pd.DataFrame([1, 2]), "data type not understood"),
    ],
)
def test__get_dtype_fails(input_param, expected_error_message):
    # python objects
    with pytest.raises(TypeError, match=expected_error_message):
        com._get_dtype(input_param)


@pytest.mark.parametrize(
    "input_param,result",
    [
        (int, np.dtype(int).type),
        ("int32", np.int32),
        (float, np.dtype(float).type),
        ("float64", np.float64),
Example #25
    def cal_fmp(self, neutralize='Raw', W_mat="Equal", stock_pool='Astock'):
        """
        min h'Wh
        s.t. h'*a = 1.0
             h'*B = 0.0

        :param W_mat
        W_mat = 'Equal' 对角线全为1
        W_mat = 'FreeMvSqrt' 对角线为自由流通市值的平方根
        W_mat = 'BarraStockCov' 对角线为Barra估计的股票协方差矩阵

        :param neutralize
        neutralize = 'Raw' 不限制对风险因子约束
        neutralize = 'Res' 限制对风险因子约束 具体约束见参数文件

        :param stock_pool
        multi_factor pool 股票池

        在计算FMP的时候 还可以加入其他的约束条件
        """

        params_file = r'E:\3_Data\5_stock_data\3_alpha_model\fmp\input_file\neutral_list.xlsx'
        params = pd.read_excel(params_file)

        for i_date in range(len(self.change_date_series) - 1):

            # read alpha data and concat multi_factor list
            ####################################################################################################
            date = self.change_date_series[i_date]
            data = self.get_data_date(date, stock_pool)
            alpha_date, industry_dummy_date, barra_style_date, free_mv_date, code_trade = data

            code_list = list(alpha_date.index)

            # W 矩阵
            ####################################################################################################
            if W_mat == 'BarraStockCov':

                stock_cov = Barra().get_stock_covariance(date)
                alpha_date = alpha_date.loc[code_list, :]
                stock_cov = stock_cov.loc[code_list, code_list]
                alpha_date = FactorPreProcess().remove_extreme_value_mad(
                    alpha_date)
                alpha_date = FactorPreProcess().standardization(alpha_date)

            elif W_mat == 'FreeMvSqrt':
                free_mv_date = free_mv_date.dropna()
                free_mv_date['FreeMv2'] = free_mv_date['FreeMv'].map(
                    lambda x: 1 / (x**(1 / 2)))
                free_mv_date = pd.DataFrame(free_mv_date['FreeMv2'])

            else:
                pass
            ####################################################################################################

            if len(alpha_date) > self.min_stock_num:

                if W_mat == 'Equal':
                    P = np.diag(np.ones(shape=(1, len(alpha_date)))[0])
                elif W_mat == 'FreeMvSqrt':
                    P = np.diag(np.column_stack(free_mv_date.values)[0])
                elif W_mat == 'BarraStockCov':
                    P = stock_cov.values
                else:
                    P = np.diag(np.ones(shape=(1, len(alpha_date)))[0])

                Q = np.zeros(shape=(P.shape[0], 1))

                A = np.column_stack(alpha_date.values)
                A_add = np.ones(shape=(1, P.shape[0]))
                A = np.row_stack((A, A_add))
                b = np.array([[1.0], [0.0]])

                if neutralize == 'Res':

                    params = params[params.name == self.alpha_factor_name]
                    params = params[params.market == stock_pool]
                    params.index = ['index']

                    if params.loc['index', 'Industry'] == 1.0:

                        A_add = industry_dummy_date.T.values
                        A = np.row_stack((A, A_add))
                        b_add = np.row_stack(
                            (np.zeros(shape=(len(industry_dummy_date.columns),
                                             1))))
                        b = np.row_stack((b, b_add))

                    params_style = params.loc[:, self.style_factor_name].T
                    params_style = params_style[params_style == 1.0]
                    params_style = params_style.dropna()

                    if len(params_style) > 0:

                        barra_style_date = barra_style_date[params_style.index]
                        A_add = barra_style_date.T.values
                        A = np.row_stack((A, A_add))
                        b_add = np.row_stack(
                            (np.zeros(shape=(len(barra_style_date.columns),
                                             1))))
                        b = np.row_stack((b, b_add))

                print(A.shape)
                try:
                    P = matrix(P)
                    Q = matrix(Q)
                    A = matrix(A)
                    b = matrix(b)
                    result = sol.qp(P, q=Q, A=A, b=b)
                    fmp_raw_alpha = pd.DataFrame(np.array(result['x'][0:]),
                                                 columns=[date],
                                                 index=code_list).T
                    print(
                        "Cal FMP %s %s %s %s " %
                        (date, stock_pool, neutralize, self.alpha_factor_name))
                    concat_data = pd.concat([fmp_raw_alpha.T, alpha_date],
                                            axis=1)
                    concat_data = concat_data.dropna()
                    print(concat_data.corr().values[0][0])
                except Exception as e:
                    fmp_raw_alpha = pd.DataFrame([],
                                                 columns=[date],
                                                 index=code_list).T
                    print(
                        "QP FMP is InCorrect  %s %s %s %s " %
                        (date, stock_pool, neutralize, self.alpha_factor_name))
            else:
                fmp_raw_alpha = pd.DataFrame([],
                                             columns=[date],
                                             index=code_list).T
                print("The Length of Data is Zero %s %s %s %s " %
                      (date, stock_pool, neutralize, self.alpha_factor_name))

            # concat
            ####################################################################################################
            if i_date == 0:
                fmp_raw_alpha_all = fmp_raw_alpha
            else:
                fmp_raw_alpha_all = pd.concat(
                    [fmp_raw_alpha_all, fmp_raw_alpha], axis=0)

        # write data
        ####################################################################################################
        sub_path = os.path.join(self.path, 'fmp')
        file = os.path.join(
            sub_path, '%s_%s_%s_%s.csv' %
            (self.alpha_factor_name, neutralize, W_mat, stock_pool))
        fmp_raw_alpha_all = fmp_raw_alpha_all.T
        fmp_raw_alpha_all.to_csv(file)
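# A toy sketch (not part of the original class) of the cvxopt call used above:
# minimize h'Ph subject to A h = b. With P = I and the single constraint
# sum(h) = 1, the optimum is the equal-weight vector.
import numpy as np
from cvxopt import matrix
from cvxopt import solvers as sol

n = 4
P = matrix(np.eye(n))
Q = matrix(np.zeros((n, 1)))
A = matrix(np.ones((1, n)))
b = matrix(np.array([[1.0]]))

result = sol.qp(P, q=Q, A=A, b=b)
print(np.array(result['x']).ravel())  # ~[0.25, 0.25, 0.25, 0.25]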
nodenames = [
    'ASBNVACYO3Y.csv', 'ATLNGAMAO4Y.csv', 'CHCGILDTO6Y.csv', 'CHCGILWUO7Y.csv',
    'MIAUFLWSO0Y.csv', 'MIAUFLWSO3P-NE70191.csv', 'NYCMNYZRO1Y.csv',
    'WASHDC12O1Y.csv'
]

node_options = [dict(label=x.split('.')[0], value=x) for x in nodenames]
dimensions = ["signal_type"]
collist = [
    'ChanOchOprAve', 'ChanOchLBCAve', 'ChanOchChromaticDispersionAve',
    'BerPreFecAve', 'BerPostFecAve', 'PhaseCorrectionAve', 'Qave', 'PmdAve',
    'SoPmdAve', 'ChanOchOptAve'
]  #,'performance_metrics.ts']

global df
df = pd.DataFrame()
df = pd.read_feather(os.path.join(DATA_DIR, "pivoted_ochctp.feather"))
print(df.shape)
netcoolDF = pd.read_feather(
    r"D:\experiments\data\netcool_with_devices.feather")

col_options = [dict(label=x, value=x.lower()) for x in ["Ave", "Min", "Max"]]
devnames = set(df['performance_metrics.module'].tolist())
netcoolDF = netcoolDF[netcoolDF['device'].str.strip().isin(devnames)]
devnames = netcoolDF['device'].str.strip().tolist()
print(netcoolDF.shape)
# devnames=set(netcoolDF['device'].tolist())
dev_options = [dict(label=x.split('.')[0], value=x) for x in devnames]
app = dash.Dash(
    __name__,
    external_stylesheets=["https://codepen.io/chriddyp/pen/bWLwgP.css"])
Example #27
    def cal_real_portfolio(self,
                           neutralize='Raw',
                           W_mat="Equal",
                           stock_pool='Astock'):

        for i_date in range(len(self.change_date_series) - 1):

            lamb = 1000.0

            # date
            ####################################################################################################
            date = self.change_date_series[i_date]
            data = self.get_data_date(date, stock_pool)
            alpha_date, industry_dummy_date, barra_style_date, free_mv_date, code_trade = data

            fmp = self.get_fmp(neutralize, W_mat, stock_pool)
            fmp_date = pd.DataFrame(fmp[date])
            fmp_date.columns = ['FmpWeight']
            fmp_date = fmp_date.dropna()

            code_list = list(fmp_date.index)

            # Barra().cal_stock_covariance(date)
            stock_covariance = Barra().get_stock_covariance(date)
            stock_covariance = stock_covariance.loc[code_list, code_list].values
            stock_covariance = np.zeros(shape=(len(code_list), len(code_list)))

            alpha_signal = np.dot(stock_covariance, fmp_date)
            alpha_signal = fmp_date.values * 2

            P = stock_covariance * lamb
            Q = -np.row_stack(alpha_signal)

            from quant.stock.index import Index
            index_weight = Index().get_weight(index_code=stock_pool, date=date)
            index_weight = index_weight.loc[code_list, :]
            index_weight = index_weight.fillna(0.0)
            index_weight['Max'] = 0.03
            index_weight['Min'] = -index_weight['WEIGHT']

            G_positive = np.diag(np.ones(shape=(len(index_weight))))
            G_negative = -np.diag(np.ones(shape=(len(index_weight))))
            G = np.row_stack((G_positive, G_negative))

            h_positive = np.row_stack(index_weight['Max'].values)
            h_negative = np.row_stack(index_weight['Min'].values)
            h = np.row_stack((h_positive, h_negative))

            A = np.ones(shape=(1, len(index_weight)))
            b = np.array([[0.0]])

            try:
                # matrix and sol are assumed to come from cvxopt:
                # from cvxopt import matrix; from cvxopt import solvers as sol
                P = matrix(P)
                Q = matrix(Q)
                G = matrix(G)
                h = matrix(h)
                A = matrix(A)
                b = matrix(b)

                result = sol.qp(P, q=Q, G=G, h=h, A=A, b=b)
                stock_weight_active = pd.DataFrame(np.array(result['x'][0:]),
                                                   columns=['Active'],
                                                   index=code_list).T
                weight = pd.concat(
                    [index_weight, stock_weight_active.T, fmp_date], axis=1)
                weight['PortWeight'] = weight['WEIGHT'] + weight['Active']
                weight['ImplyWeight'] = weight['WEIGHT'] + weight['FmpWeight']
                print((weight['WEIGHT'] - weight['PortWeight']).abs().sum())
                print(weight['Active'].sum())
                print("Cal Portfolio %s %s %s %s " %
                      (date, stock_pool, neutralize, self.alpha_factor_name))
            except Exception as e:
                print(e)
                stock_weight = pd.DataFrame([],
                                            columns=[date],
                                            index=code_list).T
                index_weight = pd.concat([index_weight, stock_weight.T],
                                         axis=1)
                print("QP Portfolio is Incorrect %s %s %s %s " %
                      (date, stock_pool, neutralize, self.alpha_factor_name))
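# A self-contained toy version of the quadratic program solved above, for
# readers unfamiliar with the cvxopt calls: minimize lamb/2 * w'Sw - alpha'w
# subject to per-name box constraints and active weights summing to zero.
# All numbers are made up for illustration; only the problem structure
# mirrors the method.
import numpy as np
from cvxopt import matrix
from cvxopt import solvers as sol

lamb = 1000.0
S = np.diag([0.04, 0.09, 0.01])              # toy covariance matrix
alpha = np.array([[0.02], [0.01], [-0.01]])  # toy alpha signal

P = matrix(lamb * S)
q = matrix(-alpha)
G = matrix(np.vstack((np.eye(3), -np.eye(3))))
h = matrix(np.vstack((np.full((3, 1), 0.03),     # w_i <=  3% per name
                      np.full((3, 1), 0.05))))   # w_i >= -5% per name
A = matrix(np.ones((1, 3)))
b = matrix(np.zeros((1, 1)))

sol.options['show_progress'] = False
result = sol.qp(P, q, G=G, h=h, A=A, b=b)
print(np.array(result['x']).ravel())  # active weights; they sum to zero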
Exemple #28
0
import pandas as pd


def get_biallelic_bases(df, sample_col, single_sample_vcf=True):
    '''
    Returns the base call for each allele of a biallelic variant
    (roughly 10x faster than previous iterations).
    '''
    haploid_chromosomes = ['X', 'chrX', 'Y', 'chrY', 'M', 'chrM']
    
    
    def get_phase(line, sample_id):
        '''
        Returns phase from genotype
        '''
        genotype = str(line[sample_id])
        if "|" in genotype:
            return "|"
        if "/" in genotype:
            return "/"
        else:
            return '-'
    

    def _get_allele(line, gt_col):
        '''
        Returns the allelic base; handles multi-allelic variants
        '''
        alleles = [line['REF']]
        alleles.extend(line['ALT'].split(","))
        try:
            return alleles[int(line[gt_col])]
        except (ValueError, IndexError):
            return "."  # missing if the genotype call is "."


    def get_GT_multisample_vcf(line, sample_col, gt_index):
        # Take the GT subfield first, then split on the phase character.
        return str(line[sample_col]).split(':')[0].split(line['phase'])[int(gt_index)]

    def get_GT_multisample_vcf_haploid(line, sample_col, gt_index):
        return str(line[sample_col]).split(':')[0]


    if single_sample_vcf:
        df['GT_len'] = df[sample_col].str.split(':').str[0].str.len()
        haploid_df = df[df['GT_len'] <= 1].copy()
        haploid_df['phase'] = '-'
        haploid_df = haploid_df[haploid_df['CHROM'].isin(haploid_chromosomes)]

        df = df[df['GT_len'] > 1].copy()
        df['phase'] = df[sample_col].str[1]
        df = df[df['phase'] != '-']

        del df['GT_len']
        del haploid_df['GT_len']

        if len(haploid_df) > 0:
            haploid_df['GT1'] = haploid_df[sample_col].str[0]
            haploid_df = haploid_df[(haploid_df['GT1'] != '.') & haploid_df['GT1'].notna()]
            haploid_df['GT2'] = 0

        if len(df) > 0:
            df['GT1'] = df[sample_col].str[0]
            df = df[(df['GT1'] != '.') & df['GT1'].notna()]
            df['GT1'] = df['GT1'].astype(int)
            df['GT2'] = df[sample_col].str[2]
            df = df[(df['GT2'] != '.') & df['GT2'].notna()]
            df['GT2'] = df['GT2'].astype(int)
        


    if not single_sample_vcf:  # 16th December 2014: may be redundant now that get_multiallelic_bases is a separate function
        df['GT_len'] = df[sample_col].str.split(':').str[0].str.len()
        haploid_df = df[df['GT_len'] <= 1].copy()
        haploid_df['phase'] = '-'
        haploid_df = haploid_df[haploid_df['CHROM'].isin(haploid_chromosomes)]

        df = df[~df.index.isin(haploid_df.index)].copy()
        df['phase'] = df[sample_col].str[1]
        df = df[df['phase'] != '-']

        del df['GT_len']
        del haploid_df['GT_len']

        if len(df) > 0:
            df['GT1'] = df.apply(get_GT_multisample_vcf, args=[sample_col, 0], axis=1)
            df = df[(df['GT1'] != '.') & df['GT1'].notna()]
            df['GT2'] = df.apply(get_GT_multisample_vcf, args=[sample_col, 1], axis=1)
            df = df[(df['GT2'] != '.') & df['GT2'].notna()]

        if len(haploid_df) > 0:
            haploid_df['GT1'] = haploid_df.apply(get_GT_multisample_vcf_haploid, args=[sample_col, 0], axis=1)
            haploid_df = haploid_df[(haploid_df['GT1'] != '.') & haploid_df['GT1'].notna()]
            haploid_df['GT1'] = haploid_df['GT1'].astype(int)
            haploid_df['GT2'] = 0



    if len(df) > 0:
        if len(haploid_df) > 0:
            df = pd.concat([df, haploid_df])
    else:
        df = haploid_df
        


    #FAST PROCESS SIMPLE ALLELE GENOTYPES
    df_simple = df
    if len(df_simple) > 0:
        df_gt1_ref = df_simple[df_simple.GT1.astype(int)==0][['REF']]  #get a1 ref alleles
        df_gt1_ref.columns = ['a1']
        df_gt2_ref = df_simple[df_simple.GT2.astype(int)==0][['REF']]  #get a2 ref alleles
        df_gt2_ref.columns = ['a2']
    
    
        df_gt1_alt = df_simple[df_simple.GT1.astype(int)==1][['ALT']]  #get a1 alt alleles
        df_gt1_alt.columns = ['a1']
        df_gt2_alt = df_simple[df_simple.GT2.astype(int)==1][['ALT']]  #get a2 alt alleles
        df_gt2_alt.columns = ['a2']
    
    
        gt1_alleles = pd.concat([df_gt1_ref, df_gt1_alt])  # merge GT1 allele bases into one df
        gt2_alleles = pd.concat([df_gt2_ref, df_gt2_alt])  # merge GT2 allele bases into one df
        gt1_2_allele_df = gt1_alleles.join(gt2_alleles)  # join the GT1 and GT2 simple allele bases

        return df.join(gt1_2_allele_df)
    
    else:
        return pd.DataFrame()
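# A quick usage sketch with a toy single-sample VCF frame; the column layout
# (CHROM/REF/ALT plus one genotype column) is taken from the code above, and
# the sample name 'NA0001' is made up for illustration.
toy = pd.DataFrame({
    'CHROM': ['1', '1', 'chrX'],
    'REF':   ['A', 'G', 'T'],
    'ALT':   ['T', 'C', 'A'],
    'NA0001': ['0/1:35', '1|1:12', '1:9'],
})
out = get_biallelic_bases(toy, 'NA0001')
print(out[['CHROM', 'GT1', 'GT2', 'a1', 'a2']])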
Exemple #29
0
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

headers = ["ip", "op1"]
test_data = pd.read_csv("testdata.csv")
test_data.columns = headers
print(test_data.describe())
train_x, test_x, train_y, test_y = train_test_split(test_data["ip"],
                                                    test_data["op1"],
                                                    train_size=0.1)
# A single feature must be a 2-D (n_samples, 1) array for scikit-learn.
train_x = train_x.values.reshape(-1, 1)
test_x = test_x.values.reshape(-1, 1)
print("Train_x Shape :: ", train_x.shape)
print("Train_y Shape :: ", train_y.shape)
print("Test_x Shape :: ", test_x.shape)
print("Test_y Shape :: ", test_y.shape)
clf = RandomForestClassifier()
clf.fit(train_x, train_y)
predictions = clf.predict(test_x)
print("Train Accuracy :: ", accuracy_score(train_y, clf.predict(train_x)))
print("Test Accuracy  :: ", accuracy_score(test_y, predictions))
print("Confusion matrix ", confusion_matrix(test_y, predictions))
print("Trained model:", clf)
Exemple #30
0
    # Fragment of a training helper (presumably getModel, called below);
    # params, num_boost_round and the data splits come from the enclosing
    # scope in the original source.
    dtrain = xgb.DMatrix(X_train, y_train)
    dval = xgb.DMatrix(X_val, y_val)

    watchlist = [(dtrain, 'train'), (dval, 'val')]
    bst = xgb.train(params, dtrain, num_boost_round, evals=watchlist,
                    early_stopping_rounds=100, verbose_eval=True)


    y_train_hat = bst.predict(dtrain)

    plotAuc(y_train, y_train_hat, xlabel='train')

    y_val_hat = bst.predict(dval)
    plotAuc(y_val, y_val_hat, xlabel='validation')

    bst.save_model('xgb-{}.model'.format(getNextVer(r"xgb-(\d).model")))

    return bst

######### train

# Module-level driver. load_data, plotAuc and getNextVer are project helpers
# not shown in full in this snippet.
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

train, test, features, target = load_data()
X_train, X_val, y_train, y_val = train_test_split(train[features], train[target], test_size=0.05, random_state=10)
bst = getModel(X_train, y_train, X_val, y_val)

dtest = xgb.DMatrix(test[features])
yhat = bst.predict(dtest)

output = pd.DataFrame({'id': test.id, 'flag': yhat})
output = output[['id', 'flag']]
output.to_csv('output-{}.txt'.format(getNextVer(r'output-(\d).txt')),
              index=False, header=False, sep='\t')
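# The snippet relies on project helpers that are not shown (load_data, plotAuc,
# getNextVer). Purely as an assumption about its intent, getNextVer presumably
# scans the working directory for files matching the given regex and returns
# the next free version number; a minimal sketch under that assumption:
import os
import re

def getNextVer(pattern):
    # Highest captured version number among matching files, plus one.
    versions = [0]
    for name in os.listdir('.'):
        m = re.match(pattern, name)
        if m:
            versions.append(int(m.group(1)))
    return max(versions) + 1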