Example #1
def createDatarankWebCorpus(wordcloud=False,num_threads=15):
    path='/home/arya/PubMed/GEO/Datasets/'
    dpp=pd.read_pickle(path+'DPP.df')[['accession','pmid','cites_pmid']].drop_duplicates()
    d=pd.read_pickle(path+'D.All.df')
    count=pd.read_pickle(path+'DCC.df')[['accession','cpcc']]
    m=pd.read_pickle(path+'M.df')
    pm=pd.read_pickle(path+'PM.df')
    pm.index=pm.pmid
    pm=pm.loc[dpp.cites_pmid.unique()]
    pm=pd.merge(pm,m,left_on='muid',right_on='uid')[['pmid','name']]
    pmg=pm.groupby('pmid')
    CP=pm.pmid.unique()
    
    
    pm=pd.DataFrame( map(lambda p: (p, pmg.get_group(p).name.unique().tolist())  , CP), columns=['pmid','mesh'])
    
    dppm=pd.merge(dpp,pm,left_on='cites_pmid',right_on='pmid')
    G=dppm.groupby('accession')
    A=dppm.accession.unique()
    dm=pd.DataFrame([(a,[m  for v in G.get_group(a).mesh.values for m in v]) for a in A], columns=['accession','mesh'])
    if wordcloud:
#         map(word_cloud ,dm.iterrows())
        from multiprocessing import Pool
        pool=Pool(num_threads)
        pool.map(word_cloud ,dm.iterrows())
    d=pd.merge(dm,d,on='accession')
    d=pd.merge(d, count, on='accession')
    d.to_pickle(path+'D.Web.df')
Example #2
def main():
    """Main function to initialize databases to analysize Yelp data."""
    import random

    # ------------ Save Yelp Data as Pandas DataFrames to pickle ------------
    # Save all Yelp restaurant data in Arizona (Phoenix area)
    #restaurant_data = read_yelp('business',state=['AZ'],open=[True],categories='restaurants')
    #review_data = read_yelp('review',business_id=restaurant_data.business_id.unique())
    restaurant_data = pd.read_pickle('../data/pandas/business.pkl')
    review_data = pd.read_pickle('../data/pandas/review.pkl')
    result = save2pickle(restaurant_data,review_data)
    result = py2mysql(restaurant_data,review_data)

    # Save information for mexican restaurants only
    restaurant_data = restaurant_data[restaurant_data['categories'].map(lambda x: 'mexican' in [cat.lower() for cat in x])]
    review_data = review_data[review_data['business_id'].isin(restaurant_data.business_id.unique())]
    result = save2pickle(restaurant_data,review_data,append_string='_mexican')
    result = py2mysql(restaurant_data,review_data,append_string='_mexican')

    # Segment some data for training
    random.seed(1234)
    trainids = random.sample(restaurant_data.business_id,20)
    restaurant_data = restaurant_data[restaurant_data['business_id'].isin(trainids)]
    review_data = review_data[review_data['business_id'].isin(trainids)]
    result = save2pickle(restaurant_data,review_data,append_string='_mexican_train')

    # Make database of individual sentences from review data
    sentences = process_text.reviews_to_sentences(review_data)
    sentences = process_text.add_training_label(sentences,review_data)
    sentences.to_pickle('../data/pandas/sentences_mexican.pkl')
    result = sentence2mysql(sentences,review_data,append_string='_mexican')
Example #3
def loadFromDb(identificador):

    import sys
    if not sys.version_info[:2] == (3, 4):
        print ('You are a fool, but a farsighted one')
        print ('This code is meant to run on Python 3.4')
        
    import os
    import pandas as pd
    from IPython.display import display    
    
    display ('Checking previously saved data')
    filenameTouchs = 'dbTouchs'
    filenameSounds = 'dbSounds'
    
    if os.path.isfile(filenameTouchs):
        touchsLoad = pd.read_pickle(filenameTouchs)
        display ('Previous touch data loaded')
    else:
        display ('No previous touch data found')
        touchsLoad = pd.DataFrame()

    if os.path.isfile(filenameSounds):
        soundsLoad = pd.read_pickle(filenameSounds)
        display ('Previous sound data loaded')
    else:
        display ('No previous sound data found')
        soundsLoad = pd.DataFrame()
       
    if identificador!=0:
        touchsLoad = touchsLoad[touchsLoad['identificador']==identificador] 
        soundsLoad = soundsLoad[soundsLoad['identificador']==identificador]
        
    return touchsLoad, soundsLoad
Example #4
 def test_parse_rna_seq_metrics(self):
     metrics, hist = cpb.picard.parse_rna_seq_metrics(
         add_root('rna_seq_metrics.txt'))
     metrics2 = pd.read_pickle(add_root('rna_seq_metrics_metrics.pickle'))
     hist2 = pd.read_pickle(add_root('rna_seq_metrics_hist.pickle'))
     assert_series_equal(metrics, metrics2)
     assert_series_equal(hist, hist2)
Example #5
    def get_subjects_list_adults_fct(df_path, df_qc_path, subjects_list):
        '''
        excludes kids and subjects with missing sex or age
        '''
        import pandas as pd
        import numpy as np

        df = pd.read_pickle(df_path)
        df_qc = pd.read_pickle(df_qc_path)
        df = pd.merge(df, df_qc, left_index=True, right_index=True)
        pd.to_pickle(df, 'testdf.pkl')

        df['subject_id'] = df.subject_id_x

        # fixme exclude subjects with mean_FD>.1
        subjects_list_exclude = df[(df.age<18) | (df.mean_FD_Power>.1)].index
        subjects_list_adults = subjects_list

        for exclude_subject in subjects_list_exclude:
            if exclude_subject in subjects_list_adults:
                subjects_list_adults.remove(exclude_subject)

        missing_info = df[(df.age==999) | ((np.logical_or(df.sex=='M', df.sex=='F'))==False)].index
        for missing in missing_info:
            if missing in subjects_list_adults:
                subjects_list_adults.remove(missing)


        # remove subjects from subjects_list_adults for which no entry exists in df
        # (iterate over a copy so removals do not skip elements)
        for subject in list(subjects_list_adults):
            if subject not in df.index:
                subjects_list_adults.remove(subject)

        return subjects_list_adults
Example #6
def plotPowerCLR(recompute=False):
    if recompute:
        mc = pd.read_pickle('{}ROC/{}'.format(utl.outpath, 'MarkovChain'))
        hmm = f(pd.read_pickle('{}ROC/{}'.format(utl.outpath, 'HMM')))
        a = pd.concat([mc, hmm]);
        print a
        a = a[a.index.get_level_values('coverage') != np.inf]
        df = pd.DataFrame(a.groupby(level=range(6)).apply(lambda x: x[x >= x.quantile(Qcoverage[x.name[0]])].mean()))[0]
        # df = pd.DataFrame(a.groupby(level=range(6)).apply(lambda x: x[x >= x.quantile(0.99)].mean()))
        df = getPower(df, groupbyLevels=range(4))
        df.to_pickle(utl.outpath + 'ROC/PowerCLR.df')
    else:
        df = pd.read_pickle(utl.outpath + 'ROC/PowerCLR.df')
        reload(pplt)
    info = pplt.getNameColorMarker(df)
    info.loc[info.index.get_level_values('method') == 'HMM', 'marker'] = '--o'
    info.loc[info.index.get_level_values('method') == 'MarkovChain', 'marker'] = '--s'
    info.loc[info.index.get_level_values('method') == 'HMM', 'color'] = 'r'
    info.loc[info.index.get_level_values('method') == 'MarkovChain', 'color'] = 'darkblue'
    # info.loc[info.index.get_level_values('q')==0.99,'color']='r'
    # info.loc[info.index.get_level_values('q')==1,'color']='darkblue'
    fig, axes = plt.subplots(2, 3, sharey=True, sharex=True, figsize=(6, 2.5), dpi=dpi);
    pplt.setStyle(lw=1);
    pplt.plotOnePower(df.xs(0.005, level='nu0'), info, axes[0], legendSubplot=0, ylabel='Hard');
    pplt.plotOnePower(df.xs(0.1, level='nu0'), info, axes[1], ylabel='Soft');
    [pplt.annotate('({})'.format(list('ABCDEF')[j]), ax=x, fontsize=7) for j, x in enumerate(axes.reshape(-1))]
    plt.gcf().subplots_adjust(bottom=0.15)
    pplt.savefig('powerCLR', dpi=dpi)
    plt.show()
Example #7
def train_model():
    global df_rest_p1
    global df_rest_p2
    global df_rest_p3

    print "training initial..."
    person = request.form.get('person')
    df=pd.DataFrame()
    if person=='Person1':
        df = pd.read_pickle("data_3/person2_weather")
        df, df_demo=splitTrainingset_for_demo(df,0.8)
        df_rest_p1 =df_demo

    elif person=='Person2':
        df = pd.read_pickle("data_3/person3_weather")
        df, df_demo = splitTrainingset_for_demo(df, 0.8)
        df_rest_p2 = df_demo

    elif person=='Person3':
        df = pd.read_pickle("data_3/person4_weather")
        df, df_demo = splitTrainingset_for_demo(df, 0.8)
        df_rest_p3 = df_demo

    X,y=data_preparation(df)
    json_plot = train_1_sample_batches_predict_next_sample(X,y, person)
    json_data = json.dumps(json_plot)
    return json_data
Example #8
def pickling():
    z=pd.read_pickle('z.pkl')
    y=pd.read_pickle('y.pkl')
    fi=pd.read_pickle('fi.pkl')
    em=pd.read_pickle('em.pkl')
    em_y=pd.read_pickle('em_y.pkl')
    return (z,y,fi,em,em_y)
Example #9
def plotSurf():
    from scipy import interpolate
    a=pd.read_pickle(utl.outpath+'real/real.maxLikelihoods.df')
    idx=(a.s.abs()*a.h.abs()*(a.alt-a.null)).sort_values().index[-1]
    R=pd.DataFrame(pd.read_pickle(utl.outpath+'real/real.replicates.df').loc[idx]).T
    SH=dta.getSH()
    ARGS=[(R,)+sh for sh in SH]
    likelihoods=pd.concat(map(mkv.computeLikelihoodReal,ARGS),axis=1);likelihoods.columns.names=['s','h']


    fig = plt.figure()
    ax = fig.gca(projection='3d')
    df=pd.concat([pd.Series(z[1].loc[z[0]].values,index=z[1].loc[z[0]].index,name=z[0]) for z in b.groupby(level=0)],axis=1)
    Z=df.values
    # Z[Z==Z.min()]=-1e3
    X=np.tile(df.index.values[:,None],Z.shape[1])
    Y=np.tile(df.columns.values[:,None],Z.shape[0]).T

    nn = 401;
    xi = np.linspace(-1.0, 2.0, 10);
    yi = np.linspace(-0.5, 0.5, nn);

    f = interpolate.interp2d(X,Y,Z,kind='cubic')
    zi = f(xi, yi)
    [xi, yi] = np.meshgrid(xi, yi);
    # surf = ax.plot_surface(X, Y, Z, cmap=mpl.cm.autumn)
    surf = ax.plot_surface(xi, yi, zi, cmap=mpl.cm.autumn)
    fig.colorbar(surf, shrink=0.5, aspect=5)

    # surf(xi, yi, zi, 'LineStyle', 'none', 'FaceColor', 'interp')
    plt.show()
Example #10
def helpfulModelingPipelineRFC():
   print "Loading pickles..."
   #comments_discussion_df=pd.read_pickle('comments_discussion.p')
   X=pd.read_pickle('X.p')
   y_actual=pd.read_pickle('y_actual.p')

   X_train, X_test, y_actual_train, y_actual_test = train_test_split(X, y_actual, test_size=0.15, random_state=0)
   print y_actual_train.head()

   #pca = PCA(n_components=1)
   
   #use only SelectKBest to select features
   selection = SelectKBest(f_classif,k=15)

   X_features = selection.fit(X_train.iloc[:,0:len(X.columns)-2], y_actual_train).transform(X_train.iloc[:,0:len(X_train.columns)-2])

   rfc = RandomForestClassifier(criterion='entropy')

   # Do a grid search over k and the random-forest hyper-parameters:
   pipeline = Pipeline([('feature_selection', selection), ('rfc', rfc)])

   param_grid = dict(feature_selection__k=[11,13,14,15,16],
                     rfc__n_estimators=[950,1000,1050],
                     rfc__max_depth = [13,14,15,16],
                     rfc__min_samples_split = [4,5,6,7],
                     rfc__min_samples_leaf = [1,2,3])

   grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring='precision', cv=20 ,verbose=10,n_jobs=15)
   grid_search.fit(X_train.iloc[:,0:len(X_train.columns)-2], y_actual_train['is_helpful'].values)

   print(grid_search.best_estimator_)
   #print "All columns:"+str(X.columns)
   #print "Just the selected columns:"+str(X.columns[pipeline.named_steps['selection'].get_support()])
   pickle.dump(grid_search.best_estimator_, open( "rfc_best_estimator.p", "wb" ) )
Example #11
def svd_training(params):
    """
    Train Surprise SVD using the given hyper-parameters
    """
    logger.debug("Start training...")
    train_data = pd.read_pickle(path=os.path.join(params['datastore'], params['train_datapath']))
    validation_data = pd.read_pickle(path=os.path.join(params['datastore'], params['validation_datapath']))

    svd_params = {p: params[p] for p in ['random_state', 'n_epochs', 'verbose', 'biased', 'n_factors', 'init_mean',
                                         'init_std_dev', 'lr_all', 'reg_all', 'lr_bu', 'lr_bi', 'lr_pu', 'lr_qi',
                                         'reg_bu', 'reg_bi', 'reg_pu', 'reg_qi']}
    svd = surprise.SVD(**svd_params)

    train_set = surprise.Dataset.load_from_df(train_data, reader=surprise.Reader(params['surprise_reader'])) \
        .build_full_trainset()
    svd.fit(train_set)

    logger.debug("Evaluating...")

    metrics_dict = {}
    rating_metrics = params['rating_metrics']
    if len(rating_metrics) > 0:
        predictions = compute_rating_predictions(svd, validation_data, usercol=params['usercol'],
                                                 itemcol=params['itemcol'])
        for metric in rating_metrics:
            result = getattr(evaluation, metric)(validation_data, predictions)
            logger.debug("%s = %g", metric, result)
            if metric == params['primary_metric']:
                metrics_dict['default'] = result
            else:
                metrics_dict[metric] = result

    ranking_metrics = params['ranking_metrics']
    if len(ranking_metrics) > 0:
        all_predictions = compute_ranking_predictions(svd, train_data, usercol=params['usercol'],
                                                      itemcol=params['itemcol'],
                                                      recommend_seen=params['recommend_seen'])
        k = params['k']
        for metric in ranking_metrics:
            result = getattr(evaluation, metric)(validation_data, all_predictions, col_prediction='prediction', k=k)
            logger.debug("%s@%d = %g", metric, k, result)
            if metric == params['primary_metric']:
                metrics_dict['default'] = result
            else:
                metrics_dict[metric] = result

    if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
        raise ValueError("No metrics were specified.")

    # Report the metrics
    nni.report_final_result(metrics_dict)

    # Save the metrics in a JSON file
    output_dir = os.environ.get('NNI_OUTPUT_DIR')
    with open(os.path.join(output_dir, 'metrics.json'), 'w') as fp:
        temp_dict = metrics_dict.copy()
        temp_dict[params['primary_metric']] = temp_dict.pop('default')
        json.dump(temp_dict, fp)

    return svd
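A minimal usage sketch of the params dict that svd_training() expects, inferred only from the keys read in the function above; every path, column name, and hyper-parameter value below is a placeholder, not a setting from the original experiment:

# Hypothetical params for svd_training(); all values are illustrative placeholders.
example_params = {
    'datastore': './data', 'train_datapath': 'train.pkl', 'validation_datapath': 'validation.pkl',
    'random_state': 0, 'n_epochs': 20, 'verbose': True, 'biased': True, 'n_factors': 100,
    'init_mean': 0.0, 'init_std_dev': 0.1, 'lr_all': 0.005, 'reg_all': 0.02,
    'lr_bu': None, 'lr_bi': None, 'lr_pu': None, 'lr_qi': None,
    'reg_bu': None, 'reg_bi': None, 'reg_pu': None, 'reg_qi': None,
    'surprise_reader': 'ml-100k', 'usercol': 'userID', 'itemcol': 'itemID',
    'rating_metrics': ['rmse'], 'ranking_metrics': [], 'primary_metric': 'rmse',
    'recommend_seen': False, 'k': 10,
}
# svd = svd_training(example_params)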
Example #12
 def __init__(self, psr, loadvec=True):
     self.log = logging.getLogger('Source')
     self.log.debug('Initializing source.')
     
     self.psr = psr
     self.npsrs = 1
     self.path = paths.vectors + 'srcVec_' + psr
                  
     # If necessary, rebuild catalogue; otherwise, just load catalogue.
     self.log.debug('Looking for catalogue.')
     try:
         f = open(paths.textfromATNF, 'r')
         pd.read_pickle(paths.psrcat)
     except IOError:
         self.log.warning('No PSR catalogue found.')
         self.build_catalogue()
         f = open(paths.textfromATNF, 'r')
     finally:
         self.log.debug('Checking catalogue.')
         f_text = f.read()
         f.close()
     
     if self.psr not in f_text:
         self.log.debug('PSR not in catalogue.')
         self.build_catalogue()
     
     self.log.debug('Reading catalogue.')
     psrcat = pd.read_pickle(paths.psrcat)
     self.param = psrcat.ix[self.psr]
     
     if loadvec: self.loadVectors()
Example #13
def helpfulModelingPipelineGBC():
   #load the pickles
   print "Loading pickle..."
   X=pd.read_pickle('X.p')
   y_actual=pd.read_pickle('y_actual.p')

   print "X head without the body and the comment_id:"
   print X.iloc[:,0:len(X.columns)-2].head()
   print "y_actual:"
   print y_actual['is_helpful'].values

   X_train, X_test, y_actual_train, y_actual_test = train_test_split(X, y_actual['is_helpful'].values, test_size=0.15, random_state=0)
   
   selection = SelectKBest(f_classif,k=15)

   X_features = selection.fit_transform(X_train.iloc[:,0:len(X.columns)-2], y_actual_train)

   gbc = GradientBoostingClassifier(n_estimators=200)

   print np.unique(X_train.iloc[:,5:6])

   #Create a pipeline of feature selection and gradient boosting classifier
   pipeline = Pipeline([('feature_selection',selection),('gbc',gbc)])

   param_grid = dict(feature_selection__k=[9,10,11,12,14],
                     gbc__n_estimators = [450,500,550],
                     gbc__max_depth = [33,35,40],
                     gbc__min_samples_split = [1,2,3],
                     gbc__min_samples_leaf = [2,3,4])

   grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring='recall',cv=15,verbose=10,n_jobs=15)
   grid_search.fit(X_train.iloc[:,0:len(X_train.columns)-2], y_actual_train)
   print(grid_search.best_estimator_)
   print "Just the selected columns:"+str(X.iloc[:,0:len(X.columns)-2].columns[pipeline.named_steps['feature_selection'].get_support()])
   pickle.dump(grid_search.best_estimator_, open( "gbc_best_estimator.p", "wb" ) )
Example #14
def train_model():
    """
    Train a scikit-learn model on top of the imagefeatures
    """
    print "Loading data into memory..."
    train = pd.read_pickle(file_path('train-features'))
    test  = pd.read_pickle(file_path('test-features'))
Example #15
def sample_ratings_large():
    df_titles = pd.read_pickle('df_titles_condensed.obj')
    df_ratings = pd.read_pickle('df_ratings_condensed.obj')
    df_titles.dropna(how='any', subset=['plot', 'storyline', 'genre', 'years'], inplace=True)
    valid_ids = set(df_titles['movie_id'])
    df_ratings = df_ratings[df_ratings['movie_id'].isin(valid_ids)]

    def condense(df_titles, df_ratings, title_ratings, user_ratings=20):
        valid_ids = set(df_titles['movie_id'])
        df_ratings = df_ratings[df_ratings['movie_id'].isin(valid_ids)]
        old_shape = (0, 0)
        titles_to_keep = 0
        while old_shape != df_ratings.shape:
            print(df_titles.shape)
            old_shape = df_ratings.shape
            agg = df_ratings.groupby('movie_id').count()
            titles_to_keep = set(agg[agg['user_id'] > title_ratings].index)

            agg = df_ratings.groupby('user_id').count()
            users_to_keep = set(agg[agg['movie_id'] > user_ratings].index)

            df_ratings = df_ratings[df_ratings['movie_id'].isin(titles_to_keep)]
            df_ratings = df_ratings[df_ratings['user_id'].isin(users_to_keep)]
            df_titles = df_titles[df_titles['movie_id'].isin(titles_to_keep)]

        print('%d/%d: found %d titles with %d ratings' %
              (user_ratings, title_ratings, len(titles_to_keep),
               df_ratings.shape[0]))

        df_ratings.to_pickle('df_ratings_condensed_2.obj')
        df_titles.to_pickle('df_titles_condensed_2.obj')

    pdb.set_trace()
Example #16
 def read(param):
     """
     Data is sorted first by Chrom and then by POS in addGlobalPos. It is important to keep them sorted together.
     """
     try:
         meta=pd.read_pickle(param['dspath']+param['dsname']+'.meta.df')
         snp=pd.read_pickle(param['dspath']+param['dsname']+'.snp.df')
     except:
         if param['Region']=='Peru' and param['dsname']=='all':
             meta= Data.readPeruAll()
         elif param['Region']=='Peru' and param['dsname']=='winzeler':
             meta= Data.readPeruFiltered()
         elif param['Region']=='Sudan':
             meta= Data.readSudan()    
         else:
             print >> sys.stderr, 'Bad Parameter: ',param
             exit()
         meta= Data.removeNonPolymorphicandTriAllele(meta, param)
         meta = Data.correctCall(meta, param)
         meta= Data.computeRC(meta, param)
         meta.ix[:,'hetero']= meta[param['names']].apply(lambda x: ((x=='0/1')|(x=='1/0')).sum(),axis=1)
         meta=pd.concat([meta, meta[param['names']].apply(lambda x: x.value_counts(),axis=1).fillna(0)],axis=1)
         meta['0/1']+=meta['1/0'];meta.drop(['1/0'],axis=1,inplace=True)
         calls=meta[param['names']]
         # major is always zero in heterozygotes in the other getsnp function; 1/0 is possible,
         # for example line 7 mdio08 in the xlsx
         snp=pd.concat([pd.DataFrame(calls.applymap(lambda x: x.split('/')[0]).values, columns=calls.columns+'maj'),
                        pd.DataFrame(calls.applymap(lambda x: x.split('/')[1]).values, columns=calls.columns+'min')],
                       axis=1).astype(int).T.sort_index()
         snp.columns=calls.index.values
         from popgen.Plasmodium.Run import runHW 
         meta=runHW(param,meta)
         meta.to_pickle(param['dspath']+param['dsname']+'.meta.df')
         snp.to_pickle(param['dspath']+param['dsname']+'.snp.df')
     return snp,meta
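The docstring above depends on the meta table staying sorted by Chrom and then POS (established in addGlobalPos, which is not part of this snippet); a tiny sketch of that invariant, assuming the column names CHROM and POS:

# Hypothetical illustration of the sort invariant noted in the docstring above;
# the column names are assumptions, and addGlobalPos itself is not shown here.
import pandas as pd

def sort_by_chrom_pos(meta):
    # Keep chromosome and position sorted together so downstream positional logic stays valid.
    return meta.sort_values(['CHROM', 'POS'], kind='mergesort').reset_index(drop=True)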
Example #17
def transform_data():
    """
    Passes the downloaded data through the indico imagefeatures API
    """
    train = pd.read_pickle(file_path('train'))
    test  = pd.read_pickle(file_path('test'))

    # limit to the first 10000 training examples 
    train = train[:10000]
    
    train.name, test.name = 'train', 'test'

    for df in (train, test):
        imagefeatures = []

        i = 0
        batch_size = 50
        n = len(df.data)/batch_size

        print "Fetching %s imagefeatures..." % (df.name)
        for df_batch in batch(df.data, batch_size):
            print "\t%d/%d" % (i, n)
            imagefeatures.extend(batch_image_features(df_batch)) 
            i += 1

        df['features'] = imagefeatures

        df.to_pickle(file_path("cifar10-%s-features.pkl" % df.name))
Example #18
def my_form_post(answer=None):
    
    filename='/home/seonhoon/Desktop/workspace/ImageQA/data/dict.pkl'
    
    with open(filename, 'rb') as fp:
        idx2word, word2idx, idx2answer, answer2idx = cPickle.load(fp)

    text = request.form['text']
    print text
    
    question=text.split()
    
    q_idx=[]
    for i in range(len(question)):
        q_idx.append(word2idx[question[i]])
    q_idx=np.array(q_idx)
    
    print q_idx

    # running Caffe and TensorFlow simultaneously does not seem easy,
    # so the precomputed CNN features are loaded from a pickle instead
    x_img = np.array([pd.read_pickle('/home/seonhoon/Desktop/workspace/ImageQA_Web/cnn.pkl')['cnn_feature'][0].tolist()])
 


    x , x_mask = prepare_data([q_idx], config.steps)

            
    y = test_sample(x, x_mask, x_img)
    
    print idx2answer[y[0]]
    
    params = {'answer' : idx2answer[y[0]], 'text' : text}
    
    return render_template('iqa.html', **params) 
Example #19
def runHMM(h, stepS=0.05, eps=1e-1,CD=None,E=None,save=True,verbose=1):
    if CD is None:  CD = pd.read_pickle(utl.outpath + 'real/CDEidx.df').iloc[:]
    if E is None:   E = pd.read_pickle(utl.outpath + 'real/Emissions.df')
    likes_null = getNullLikelihoods(CD,E)
    likes_thn = mkv.computeLikelihoodReal((CD, E, -stepS, h))

    likes_thp = mkv.computeLikelihoodReal((CD[likes_null > likes_thn], E, stepS, h));
    neg = likes_thn[likes_null <= likes_thn];
    zero = likes_null.loc[(likes_null.loc[likes_thp.index] >= likes_thp).replace({False: None}).dropna().index];
    pos = likes_thp.loc[(likes_null.loc[likes_thp.index] < likes_thp).replace({False: None}).dropna().index];
    if verbose>0:
        print 'N={}\t Null={} ({:.0f}\%)\t Pos={}\t Neg={}'.format(CD.shape[0], zero.size,
                                                               zero.size / float(CD.shape[0]) * 100,
                                                               pos.size, neg.size);
    sys.stdout.flush()

    dfz = pd.DataFrame(zero.values, index=zero.index, columns=['alt']);
    dfz['s'] = 0
    dfn = findML(neg, -stepS, CD.loc[neg.index], E, h, eps, stepS)
    dfp = findML(pos, stepS, CD.loc[pos.index], E, h, eps,stepS)

    df = pd.concat([dfp, dfz, dfn])
    df = pd.concat([df, likes_null], axis=1)
    df.columns = pd.MultiIndex.from_product([[h], df.columns], names=['h', 'stat'])
    if save:
        path = utl.outpath + 'real/HMM/'
        utl.mkdir(path)
        df.to_pickle(path + 'h{:E}.df'.format(h))
    return df
Example #20
def get_specific_meta():
    t = time.time()
    meta_df = pd.read_pickle('/home/max/Documents/project4/data/metadata_drop.pkl')
    t = print_time_elapsed(t, ' read meta')
    categories = ['Baby', 'Beauty', 'Books', 'Cell_Phones_and_Accessories', 'Clothing_Shoes_and_Jewelry', 'Home_and_Kitchen', 'Movies_and_TV', 'Automotive', 'CDs_and_Vinyl', 'Toys_and_Games', 'Video_Games']
    df_list = []
    for category in categories:
        print 'starting ' + category
        cut_df = pd.DataFrame()
        df = pd.read_pickle('/home/max/Documents/project4/data/' + category + '.pkl')
        category_asins = pd.DataFrame(pd.Series(np.unique(df['asin'])), columns = ['asin'])
        t = print_time_elapsed(t, ' read ' + category)
        print category_asins
        print df
        cut_df = pd.merge(category_asins, meta_df, on='asin', how='inner')
        print cut_df
        t = print_time_elapsed(t, ' merge meta ' + category)
        cut_df = pd.merge(df, cut_df, on='asin', how='inner')
        print cut_df
        t = print_time_elapsed(t, ' merge ' + category)
        cut_df['category'] = category
        cut_df = cut_df.drop_duplicates('reviewText')
        df_list.append(cut_df)
    print '   STARTING CONCAT  '
    big_df = pd.concat(df_list, ignore_index=True)
    t = print_time_elapsed(t, ' concat ')
    big_df = big_df.set_index('asin')
    big_df = big_df.sort_index()
    print 'starting write'
    big_df.to_pickle('/home/max/Documents/project4/data/all_merged.pkl')
    t = print_time_elapsed(t, ' write')
    return big_df
Example #21
def createCADD():
    " less 1000G_phase3_inclAnno.tsv.gz | cut -f1,2  > coord.hg19.tsv"
    ' bedtools intersect -sorted -a Kyrgyz.hg19.tsv -wb -b ../CADD/1000G_phase3_inclAnno.tsv > CADD.hg19.tsv '
    cad=pd.read_csv(kutl.path+'data/CADD.hg19.tsv',sep='\t',header=None).iloc[:,3:].rename(columns={3:'CHROM',4:'POS'}).sort_values(['CHROM','POS']).set_index('CHROM')
    coor=pd.read_pickle(kutl.path+'data/map.df').dropna().apply(lambda x: x.astype(int)).set_index(19,append=True)[38].rename('POShg38')
    pd.read_pickle(kutl.path+'data/map.df').isnull().sum()
    cad.iloc[:10000].groupby(level=0).apply(lambda x: pd.merge(coor.loc[str(x.name)].sort_index().reset_index(),x,left_on=19,right_on='POS').iloc[:,2:] )
Example #22
def plotWealthProcess():
    startDate, endDate = date(2005,1, 1), date(2013, 12, 31)
    n_rvs = (5, 10)
    hist_periods = (20, 30 ,40 ,50 , 60 ,70 ,80)
    n_scenario = 200
    alphas = ("0.5", "0.55", "0.6", "0.65", "0.7", "0.75", 
              "0.8", "0.85", "0.9", "0.95")
    
    for n_rv in n_rvs:
        bh_wealthProcess = pd.read_pickle(os.path.join(ExpResultsDir, 
                                "buyhold_wealthprocess_n%s.pkl"%(n_rv)))
        
        for alpha in alphas:
            for hdx, hist_period in enumerate(hist_periods):
                paramDir = os.path.join(ExpResultsDir, 
                            "n%s_h%s_s%s_a%s"%(n_rv, hist_period, 
                                               n_scenario, alpha))
                
                expDirs = glob.glob(os.path.join(paramDir, 
                                "fixedSymbolSPPortfolio_20050103-20131231_*"))
                for rdx, expDir in enumerate(expDirs):
                    t = time.time()
                    runTime = expDir[expDir.rfind('_')+1:]
                    
                    wealthPkl = os.path.join(expDir, 'wealthProcess.pkl')
                    depositPkl = os.path.join(expDir, 'depositProcess.pkl')
                    if not os.path.exists(wealthPkl) or not os.path.exists(depositPkl):
                        continue
                    wealth =  pd.read_pickle(wealthPkl)
                    deposit = pd.read_pickle(depositPkl)
                    #combine
                    wealth['deposit'] = deposit
                    tWealth = wealth.sum(axis=1)
Example #23
def main():
    from_str = args.cf.split('/')[-1].split('.')[0]

    df_file = args.pref + from_str + '_' + str(args.yr) \
        + '_feat_matrix_unnormed.pkl'
    if args.split == None:
        df_raw = pd.read_pickle(df_file)
    else:
        # combine data matrix and score pickles
        split_files = glob.glob(args.pref + 'split_dm/*' + args.split + '*unnorm*.pkl')
        sort_files = natsort.natsorted(split_files)
        split_scores = glob.glob(args.pref + 'split_dm/*' + args.split + '*scores*.pkl')
        sort_scores = natsort.natsorted(split_scores)
        for ndx, pf in enumerate(sort_files):
            if ndx == 0:
                df_raw = pd.read_pickle(pf)
                scores = pd.read_pickle(sort_scores[ndx])
            else:
                df_raw = df_raw.append(pd.read_pickle(pf))
                scores = scores.append(pd.read_pickle(sort_scores[ndx]))
        # combine score pickles
        df_file_out = df_file.replace('_feat_matrix_unnormed', '_scores')
        scores.to_pickle(df_file_out)

    # cut columns of all zeros
    df_trim = df_raw[df_raw.columns[(df_raw != 0).any()]]

    # normalize on a per candidate basis
    df_trim_norm = df_trim.div(df_trim.sum(axis=1), axis=0)

    df_file_out = df_file.replace('unnormed', 'trim_normed')
    df_trim_norm.to_pickle(df_file_out)
Example #24
def predict_live(batterattrfile, battermodelfile, batterdatafile,
                 pitcherattrfile, pitchermodelfile, pitcherdatafile,
                 predictionfile,
                 na_treatment='zero'):
  # Apply model, save results
  batterattrs = read_attrs(batterattrfile)[1:]
  batter_model = pickle.load(open(battermodelfile, 'r'))
  batter_data = pd.read_pickle(batterdatafile)

  pitcherattrs = read_attrs(pitcherattrfile)[1:]
  pitcher_model = pickle.load(open(pitchermodelfile, 'r'))
  pitcher_data = pd.read_pickle(pitcherdatafile)

  if na_treatment == 'zero':
    usable_batter_data = batter_data[batterattrs].fillna(0)
    usable_pitcher_data = pitcher_data[pitcherattrs].fillna(0)
  elif na_treatment == 'drop':
    usable_batter_data = batter_data[batterattrs].dropna()
    usable_pitcher_data = pitcher_data[pitcherattrs].dropna()

  batter_data['prediction'] = pd.Series(batter_model.predict(usable_batter_data), index=usable_batter_data.index)
  pitcher_data['prediction'] = pd.Series(pitcher_model.predict(usable_pitcher_data), index=usable_pitcher_data.index)

  keep_cols = ['fullname', 'player_id', 'Position', 'Team', 'Salary', 'prediction']

  batter_output = batter_data[keep_cols]
  pitcher_output = pitcher_data[keep_cols]

  pd.concat([batter_output, pitcher_output]).to_pickle(predictionfile)
Example #25
def cargarPickle (filename):

    """
        This function loads touchs and sounds in pickle format. It wraps other functions; review the overlap.
    """

    import pandas as pd
    import os
    from IPython.display import display

    import sys
    if not sys.version_info[:2] == (3, 4):
        print ('You are a fool, but a farsighted one')
        print ('This code is meant to run on Python 3.4')

    if os.path.isfile('./Guardados/'+filename+'.touch'):
        touchs = pd.read_pickle ('./Guardados/'+filename+'.touch')
    else:
        display ('Error: the requested touch data was not found')
        return

    if os.path.isfile('./Guardados/'+filename+'.sounds'):
        sounds = pd.read_pickle ('./Guardados/'+filename+'.sounds')
    else:
        display ('Error: the requested sound data was not found')
        return

    return touchs, sounds
Example #26
def get_pkl_files(root_dir,pkl_dir,poem_file,vec_file,vectorizer_file):
    import pickle
    import pandas as pd
    df_poems = pd.read_pickle(root_dir + pkl_dir + '/' + poem_file)
    df_vecs =   pd.read_pickle(root_dir + pkl_dir + '/' + vec_file)
    vectorizer = pickle.load( open( root_dir + pkl_dir + '/' + vectorizer_file, "rb" ) )
    return df_poems, df_vecs, vectorizer
Example #27
def fetch_data(district=None, from_pickle=False, pickle_filename=None, unit_col='student_id', time_col='grade_level'):
    if from_pickle and pickle_filename is not None:
        print("Reading pickle file.")
        data = pd.read_pickle(pickle_filename + '.pkl')
        if os.path.isfile(pickle_filename + '_cats' + '.pkl'):
            feature_categories = pd.read_pickle(pickle_filename + '_cats' + '.pkl')
        else:
            feature_categories = None
    else:
        # Retrieve time-invariant features, time-variant features, and outcome labels.
        cohorts, features_constant, features_by_time, feature_categories, labels = extract_data(district)
        features_by_time = features_by_time.drop(['cohort', 'academic_year'], 1)

        # Extract features.
        features = extract_features(features_constant, # time-invariant features
                                    features_by_time, # time-variant features
                                    unit_col=unit_col, # instance identifier column
                                    time_col=time_col, # time unit column
                                    )

        # Extract outcome labels.
        labels = labels[['student_id', 'outcome_label']]
        labels = labels.dropna()

        # Extract instance-level data. Each instance has an identifier, one or more features, and a label.
        data = extract_instances(features, labels, unit_col='student_id')

        if pickle_filename is not None:
            data.to_pickle(pickle_filename + '.pkl')
            if feature_categories is not None:
                feature_categories.to_pickle(pickle_filename + '_cats.pkl')

    return data, feature_categories
Example #28
def prep_for_modeling():
    all = pd.read_pickle("./this_is_the_set_i_built_the_models_on_.pkl")
    with open("./exog_rf_.txt", "r") as f:
        temp = f.read()
        exog = temp.splitlines()
    all = bucketize(all, exog)
    with open("./exog_rf__.txt", "r") as f:
        temp = f.read()
        exog = temp.splitlines()
    train = all[all["train"] == 1]
    test = all[all["test"] == 1]
    validate = all[all["validate"] == 1]
    endog = get_variables("./expenses13_.txt")
    office = endog[0]
    outpatient = endog[1]
    er = endog[2]
    inpatient = endog[3]
    w = get_variables("weights13_.txt")
    w = w[0]
    insurance = pd.read_pickle("../data/insurance_current_.pkl")
    return (
        all,
        train,
        test,
        validate,
        endog,
        office,
        outpatient,
        inpatient,
        er,
        w,
        exog,
        insurance)
Example #29
def plot(y,MAP=True,fontsize=30,figsize=(18, 10), dpi=80):
    plt.figure(figsize=figsize, dpi=dpi)
    mpl.rc('font', **{'family': 'serif', 'serif': ['Times'], 'size':fontsize})
    
    m =y.mean( axis=1)
    print m
    title=["MRR","MAP"][MAP]
    labels = ['GEO', 'Jaccard', 'R', 'RI', 'Pref1','Pref5','RIP1' ,'RIP5' ]
    x = range(len(labels))
    error=y.std(axis=1)/2
    pref=[pd.read_pickle(path+'prefFB{}.pkl'.format(i)) for i in [1,5]]
    print pref
    prefm=map(lambda x: x[('AP','MRR')[not MAP]]['mean'] , pref)
    prefs=map(lambda x: x[('AP','MRR')[not MAP]]['std'] /2, pref)
    print prefm
    m=np.append(m,prefm)
    error=np.append(error,prefs)
    
    rip=[pd.read_pickle(path+'RIP{}.pkl'.format(i)) for i in [1,5]]
    prefm=map(lambda x: x[('AP','MRR')[not MAP]]['mean'] , rip)
    prefs=map(lambda x: x[('AP','MRR')[not MAP]]['std'] /2, rip)
    m=np.append(m,prefm)
    error=np.append(error,prefs)
    
    plt.errorbar(x,m, yerr=error, fmt='ok',linewidth=2, markersize=15)
    plt.xlim([-1,len(x)])
    plt.ylim([-0.01,max(m)+max(error)+0.05])
    plt.grid()
    plt.xticks(x, labels)
    plt.title(title)
    plt.savefig(path+title+'.png')
    plt.show()
Example #30
def concatDataFrames():
	score_df = pd.read_pickle('../../dataset/score_df_tst.pickle')
	tf_idf_df = pd.read_pickle('../../dataset/score_df_tfidf_tst.pickle')
	lsa_df = pd.read_pickle('../../dataset/score_df_lsa_cvect_tst.pickle')
	# Read additional features from the result of feature_engineering
	# and append to score_df before saving it.
	# Read from file
	preprocessed_path = '../../dataset/features_t.csv'
	features_df = None
	should_add_features = False
	if os.path.isfile(preprocessed_path):
		print("Found Preprocessed DataFrame... Begin appending features to score matrix")
		features_df = pd.read_csv(preprocessed_path, index_col=0)
		feature_cols = list(features_df.columns.values)
		features_np_arr = np.array(features_df)

		should_add_features = True
	else:
		print("Not Found Preprocessed DataFrame")
		return None

	if should_add_features:
		features_df = pd.DataFrame(features_np_arr, index=score_df.index, columns=feature_cols)

		result = pd.concat([score_df, tf_idf_df, lsa_df, features_df], axis=1, ignore_index=True)

		print result.shape

		result.to_pickle('../../dataset/score_df_final_tst.pickle')
Example #31
                    checkpoint(model, modelpath)

    print("End. Best Iteration {}:  HR = {:.4f}, NDCG = {:.4f}. ".format(
        best_iter, best_hr, best_ndcg))
    if save_model:
        print("The best MLP model is saved to {}".format(modelpath))

    if save_model:
        if not os.path.isfile(resultsdfpath):
            results_df = pd.DataFrame(columns=[
                "modelname", "best_hr", "best_ndcg", "best_iter", "train_time"
            ])
            experiment_df = pd.DataFrame(
                [[modelfname, best_hr, best_ndcg, best_iter, train_time]],
                columns=[
                    "modelname", "best_hr", "best_ndcg", "best_iter",
                    "train_time"
                ])
            results_df = results_df.append(experiment_df, ignore_index=True)
            results_df.to_pickle(resultsdfpath)
        else:
            results_df = pd.read_pickle(resultsdfpath)
            experiment_df = pd.DataFrame(
                [[modelfname, best_hr, best_ndcg, best_iter, train_time]],
                columns=[
                    "modelname", "best_hr", "best_ndcg", "best_iter",
                    "train_time"
                ])
            results_df = results_df.append(experiment_df, ignore_index=True)
            results_df.to_pickle(resultsdfpath)
Example #32
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd
import os

df = pd.read_pickle(
    os.path.join('/Users/amandashay/Documents/corepy', 'data_frame.pickle'))

# Smaller object for easier vis
small_df = df.iloc[49980:50019, :].copy()

# Basic Excel
small_df.to_excel("basic.xlsx")
small_df.to_excel("no_index.xlsx", index=False)
small_df.to_excel("columns.xlsx", columns=["artist", "title", "year"])

# Multiple worksheets
writer = pd.ExcelWriter('multiple_sheets.xlsx', engine='xlsxwriter')
small_df.to_excel(writer, sheet_name="Preview", index=False)
df.to_excel(writer, sheet_name="Complete", index=False)
writer.save()

# Conditional formatting
artist_counts = df['artist'].value_counts()
artist_counts.head()
writer = pd.ExcelWriter('colors.xlsx', engine='xlsxwriter')
artist_counts.to_excel(writer, sheet_name="Artist Counts")
sheet = writer.sheets['Artist Counts']
cells_range = 'B2:B{}'.format(len(artist_counts.index) + 1)  # data starts in row 2, below the header
sheet.conditional_format(cells_range, {'type': '2_color_scale'})  # assumed completion: a simple two-colour scale
Example #33
from __future__ import division
import bilby
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pylab import hist, diag
import scipy.integrate as integrate
from scipy.integrate import simps
from scipy.special import gamma, factorial

# A few simple setup steps
label = 'linear_regression_unknown_noise'
outdir = 'outdir_2_component_2nd_fixed_2'
bilby.utils.check_directory_exists_and_if_not_mkdir(outdir)

fdfs = pd.read_pickle("./Freq_small_df.pkl")
Edata = fdfs["snr"]


#Gauss components
def gauss1(x, f, mu, sigma):
    C1 = 2 / ((2**(1 + (1 / 2))) * sigma * gamma(1 / 2))
    return f * C1 * np.exp(-0.5 * (np.abs((x - mu) / sigma)**2))


def gauss2(x, f, mu, sigma, alpha):
    C2 = alpha / ((2**(1 + (1 / alpha))) * sigma * gamma(1 / alpha))
    return (1 - f) * C2 * np.exp(-0.5 * (np.abs((x - mu) / sigma)**alpha))


def bimodal(x, f, mu1, sigma1, mu2, sigma2, alpha2):
    # assumed completion: the sum of the two Gaussian components defined above
    return gauss1(x, f, mu1, sigma1) + gauss2(x, f, mu2, sigma2, alpha2)
Example #34
        'chain_path':
        map(lambda x: os.path.join(trainset_chain_dir, x),
            trainset_chain_dir_temp)
    })
    testset = pd.DataFrame({
        'image_path':
        map(lambda x: os.path.join(testset_dir, x), testset_dir_temp),
        'chain_path':
        map(lambda x: os.path.join(testset_chain_dir, x),
            testset_chain_dir_temp)
    })

    trainset.to_pickle(trainset_path)
    testset.to_pickle(testset_path)
else:
    trainset = pd.read_pickle(trainset_path)
    testset = pd.read_pickle(testset_path)

#testset.index = range(len(testset))
#testset = testset.ix[np.random.permutation(len(testset))]
is_train = tf.placeholder(tf.bool)

learning_rate = tf.placeholder(tf.float32, [])
ll = tf.placeholder(tf.int32, [batch_size])
net_size_tf = tf.placeholder(tf.int32, [net_size])
images_tf = tf.placeholder(tf.float32, [batch_size, net_size, net_size, 3],
                           name="images")
images_global = tf.placeholder(tf.float32, [batch_size, net_size, net_size, 3],
                               name="images_global")
#reconstruction_global=tf.placeholder( tf.float32, [batch_size, 128, 128, 3], name="rec_global")
fake_length = 5
Example #35
def process_run(config_file, label_function=None, kwargs={}):
    """Process one dataset from a config file using the prescribed label function"""
    global_scale = kwargs.get("global_scale")
    label_functions_pkg = kwargs.get("label_functions_pkg")
    data_filenames = kwargs.get("data_filenames")
    plot_raw = kwargs.get("plot_raw", False)
    cycle_key = kwargs.get("cycle_key", None)

    # Get data folder
    config = iter_utils.load_yaml(config_file)

    # Get the global scale
    global_scale = float(config.get("setup", {}).get("global_scale", 1.0))

    global_scale_inv = 1.0 / global_scale

    # Set up the label functions
    if label_function is None:
        process = config.get("setup", {})
        label_funs_to_get = process.get("label_functions", "default")
    else:
        label_funs_to_get = label_function

    default_fun = getattr(label_functions_pkg, "default")
    default_args = inspect.getfullargspec(default_fun)[0]

    if isinstance(label_funs_to_get, str):
        label_funs_to_get = [label_funs_to_get]

    if isinstance(label_funs_to_get, list):
        methods_to_call = dict()
        for curr_name in label_funs_to_get:
            curr_fun = getattr(label_functions_pkg, curr_name, None)
            if curr_fun is not None:
                args = inspect.getfullargspec(curr_fun)[0]
                methods_to_call[curr_name] = {"function": curr_fun, "args": args}

        if methods_to_call:
            label_function = methods_to_call
        else:
            label_function = {
                "default": {"function": default_fun, "args": default_args}
            }

    else:
        label_function = {"default": {"function": default_fun, "args": default_args}}

    # Get the names of the args specified in the label function
    label_fun_inputs = []
    for label_fun_key in label_function:
        label_fun_inputs.extend(label_function[label_fun_key]["args"])

    label_fun_inputs = list(set(label_fun_inputs))

    # Get file locations
    folder = iter_utils.get_group_folder(config)
    print(folder)
    success_filename = os.path.join(folder, "summary.yaml")

    # Unzip the sweep
    sweep = config["sweep"]
    sweep_vars = []
    sweep_labels = []
    sweep_values = []
    sweep_diffs = []
    sweep_lookup = []
    for param in sweep:
        sweep_vars.append(iter_utils.parse_variable_name(param["variable"]))
        sweep_values.append([])

        if param.get("max", None) is not None:
            if param["num_steps"] > 1:
                sweep_diffs.append(
                    (param["max"] - param["min"]) / (param["num_steps"] - 1)
                )
            else:
                sweep_diffs.append(0.0)

        folder_param = param.get("folder", None)
        if folder_param is not None:
            folder_setup = os.path.join(folder_param, "sweep_values.yaml")
            if os.path.isfile(folder_setup):
                curr_lookup = iter_utils.load_yaml(folder_setup)
                f = {}
                f["names"] = [
                    os.path.join(folder_param, row["name"])
                    for row in curr_lookup["files"]
                ]
                f["values"] = [row["values"] for row in curr_lookup["files"]]

                curr_lookup["files"] = f
                sweep_lookup.append(curr_lookup)
                sweep_labels.append(curr_lookup["variables"])
            else:
                sweep_lookup.append(None)
        else:
            sweep_lookup.append(None)
            sweep_labels.append(
                iter_utils.parse_variable_name(param.get("label", None))
            )

    print(sweep_vars)

    # Get the list of all folders
    run_folders = iter_utils.get_folders(folder)

    # Read in each data file and parse it
    label_vals = {}
    for key in label_function:
        label_vals[key] = []

    num_finger_segs = []

    for curr_folder in run_folders:
        print(curr_folder)
        param_filename = os.path.join(curr_folder, "params.yaml")
        params = iter_utils.load_yaml(param_filename)

        for idx, var in enumerate(sweep_vars):
            val = iter_utils.get_from_dict(params, var)

            if sweep_lookup[idx] is not None:
                try:
                    num_idx = sweep_lookup[idx]["files"]["names"].index(val)
                    val_use = sweep_lookup[idx]["files"]["values"][num_idx]
                except ValueError:
                    val_use = val
            else:
                val_use = val

            sweep_values[idx].append(val_use)

        # Get object position data if needed
        if "objectpose" in label_fun_inputs:
            fields = [
                "timeStamp",
                "objectId",
                "posX",
                "posY",
                "posZ",
                "oriX",
                "oriY",
                "oriZ",
                "oriW",
            ]
            pose_file = os.path.join(curr_folder, data_filenames["objectpose"])
            reader = iter_utils.read_parse_data(pose_file)
            df = reader.make_dataframe(fields)

            for pos in ["posX", "posY", "posZ"]:
                df[pos] = global_scale_inv * df[pos]

            # Get euler angles from the quaternions
            euler = []
            for quaternion in zip(df["oriX"], df["oriY"], df["oriZ"], df["oriW"]):
                # print(quaternion)
                euler.append(p.getEulerFromQuaternion(quaternion))

            euler = np.array(euler)
            euler = np.unwrap(euler, axis=0)
            euler = np.rad2deg(euler)

            df["eulerX"] = euler[:, 0]
            df["eulerY"] = euler[:, 1]
            df["eulerZ"] = -euler[:, 2]

            df_rel = df - df.iloc[0].values.squeeze()

            if plot_raw and cycle_key is not None:
                act_file = os.path.join(curr_folder, data_filenames["actuation"])
                iter_utils.graph_data(
                    df_rel,
                    filename=pose_file,
                    cyc_filename=act_file,
                    cyclic_key=cycle_key,
                )
                # iter_utils.graph_cyclic(df, act_file, cycle_key)

        else:
            df = None

        # Get contact data if needed
        if "contact" in label_fun_inputs:
            filename_contact = os.path.join(curr_folder, data_filenames["contact"])
            if os.path.exists(filename_contact):
                fields = [
                    "timeStamp",
                    "stepCount",
                    "bodyUniqueIdA",
                    "bodyUniqueIdB",
                    "linkIndexA",
                    "linkIndexB",
                ]
                reader = iter_utils.read_parse_data(filename_contact)
                df_contact = reader.make_dataframe(fields)
            else:
                df_contact = None
        else:
            df_contact = None

        # Get actuation data if needed
        if "actuation" in label_fun_inputs:
            filename_actuation = os.path.join(curr_folder, data_filenames["actuation"])
            if os.path.exists(filename_actuation):
                df_actuation = pd.read_pickle(filename_actuation)
                for col in df_actuation.columns.values:
                    if "actuation" in col:
                        df_actuation[col] = pow(global_scale_inv, 2) * df_actuation[col]
            else:
                df_actuation = None
        else:
            df_actuation = None

        # Get the number of finger segments
        calc_file = os.path.join(curr_folder, data_filenames["calculated"])
        calc_params = iter_utils.load_yaml(calc_file)
        num_finger_segs.append(calc_params.get("num_finger_segs", []))

        # package the correct data to give to the label function
        label_fun_send_list = {
            "objectpose": df,
            "contact": df_contact,
            "actuation": df_actuation,
        }

        # Get the labels from the label functions
        for label_fun_key in label_function:
            label_fun_send = dict()
            for key in label_function[label_fun_key]["args"]:
                label_fun_send[key] = label_fun_send_list[key]
            curr_val = label_function[label_fun_key]["function"](**label_fun_send)
            label_vals[label_fun_key].append(curr_val)

        if "save_raw_data" in label_function.keys():
            out = {}
            if df is not None:
                out["objectpose"] = df.to_dict(orient="list")
            if df_contact is not None:
                out["contact"] = df_contact.to_dict(orient="list")
            if df_actuation is not None:
                out["actuation"] = df_actuation.to_dict(orient="list")

            out_file = os.path.join(curr_folder, "raw_data.pkl")
            with open(out_file, "wb") as f:
                pickle.dump(out, f)

    results = dict()
    results["labels"] = label_vals
    results["vars"] = sweep_vars
    results["varlabels"] = sweep_labels
    results["sweep"] = sweep_values
    results["diffs"] = sweep_diffs
    results["num_finger_segs"] = num_finger_segs

    iter_utils.save_yaml(results, success_filename)

    data = flatten_data(results)
    filename, ext = os.path.splitext(success_filename)
    iter_utils.save_yaml(data, filename + "_flattened" + ext)

    return results
Example #36
                        help='Find the n most dissimilar items')
    parser.add_argument('-c', '--conf_info', type=str, default=None,
                        help='tsv file containing ccs and energy info for conformers. \
                        Note that the index of the .tsv file must exactly match the index of the matrix')    
    
    args = parser.parse_args()
    n = args.ndis
    mtrx = args.mtrx

    # If SDS is not already a directory, make it
    directory = 'SDS'
    if not exists(directory):
        os.makedirs(directory)


    df = pd.read_pickle(mtrx)
    SDSdf = SDS(df, n=n)
    narray = np.array([x for x in range(1, n+1)])

    # If comparing conformers, calculate Boltzmann weighted CCS.
    if args.conf_info != None:
        csvdf = pd.read_csv(args.conf_info)
        writedf = conf_to_ccs(SDSdf['matrix index'].values, csvdf)
    else:
        writedf = SDSdf
        writedf['n Dissimilar'] = narray


    writedf.to_csv(f'SDS/SDS_{n}_dissimilar.csv', index=False)
    print((time()-start)/60, 'min')
        
Example #37
    if args.no_sweeps:
        fn = fn[:-7] + '_nosweep.pickle'
    if not os.path.isfile(fn) or args.regenerate:
        patients = ['p1', 'p2', 'p5', 'p6', 'p8', 'p9', 'p11']
        cov_min = 100
        data = collect_data(patients,
                            cov_min=cov_min,
                            no_sweeps=args.no_sweeps)
        try:
            data.to_pickle(fn)
            print('Data saved to file:', os.path.abspath(fn))
        except IOError:
            print('Could not save data to file:', os.path.abspath(fn))

    else:
        data = pd.read_pickle(fn)

    # Make time and entropy bins
    t_bins = np.array([0, 100, 200, 500, 1000, 1500, 2000, 3000], int)
    t_binc = 0.5 * (t_bins[:-1] + t_bins[1:])
    add_binned_column(data, t_bins, 'time')
    data['time_binc'] = t_binc[data['time_bin']]

    # No-entropy sites are many, so the bin 0 comes up twice
    perc = np.linspace(0, 100, 8)
    S_bins = np.percentile(data['S'], perc)[1:]
    S_binc = np.percentile(
        data['S'],
        0.5 * (perc[:-1] + perc[1:]))[1:]  # this makes bin center medians.
    n_alleles = np.array(
        data.loc[:, ['af', 'S_bin']].groupby('S_bin').count()['af'])
Example #38
    cvs = cross_val_score(rfr_here,
                          crossval_X,
                          crossval_y,
                          cv=cv_groups,
                          n_jobs=n_jobs,
                          scoring='mean_absolute_error',
                          fit_params={'sample_weight': crossval_weights})
    msg("Cross validation took %f seconds with %i threads, %i records, %i estimators and %i CV groups"
        % ((time.time() - begin_time), n_jobs, len(crossval_X), n_estimators,
           cv_groups))
    msg("Results: %f, %s" % (np.mean(cvs), str(cvs)))
    return np.mean(cvs)


msg("Hi, reading moves.")
moves_df = read_pickle(sys.argv[1])

moves_file = open(sys.argv[1] + '.info', 'rb')
moves_info = pickle.load(moves_file)
categorical_features = moves_info['categorical_features']

msg("Computing weights")
game_weights = (1. / (moves_df.groupby('gamenum')['halfply'].agg({
    'max': np.max
}).clip(1, 1000)))['max']
moves_df['weight'] = moves_df['gamenum'].map(game_weights)
msg("Done")

#moves_df['abs_moverscore'] = moves_df['moverscore'].abs()

features_to_exclude = [
Example #39
    rel_report_dir = os.path.join(report_str + '_TR_%s' % TR)
    os.chdir(report_base_dir)
    if os.path.isdir(rel_report_dir):
        shutil.rmtree(rel_report_dir)
    os.mkdir(rel_report_dir)
    os.chdir(rel_report_dir)
    os.mkdir('reports')

    for subject_id in subjects_list:
        print(subject_id)
        df_ss_file = os.path.join(ds_dir, subject_id,
                                  'rsfMRI_preprocessing/QC/df', 'TR_%s' % TR,
                                  'qc_values.pkl')
        #fixme
        if os.path.exists(df_ss_file):
            df_ss = pd.read_pickle(df_ss_file)
        else:
            header = [
                'subject_id', 'similarity_epi_struct', 'similarity_struct_MNI',
                'mean_FD_Power', 'n_spikes', 'median_tsnr'
            ]
            data = np.hstack((subject_id, np.repeat(np.nan, len(header) - 1)))
            df_ss = pd.DataFrame([data], columns=header)
            df_ss = df_ss.set_index(df_ss.subject_id)

        # link to report pdf:
        rel_report_dir = os.path.join(report_str + '_TR_%s' % TR)
        subject_reports_dir = os.path.join(rel_report_dir, 'reports')
        report_file = os.path.join(subject_reports_dir, subject_id + '.pdf')
        df_ss['report_file'] = report_file
Example #40
            values = round.next()
            if x == 0:
                image = values[0]
                for i in values[1:]:
                    labels.append(i)
            else:
                image = np.add(image, values[0])
            x += 1
        yield(image, labels)

#get directory of input images and create array of images and store images in the directory to the array
train_dir = "C:/pooled/Train"
#get labels pickle and convert to dataframe then sort by the filename to go along with the images
train_labels_file = "C:/Users/panka/OneDrive/Desktop/Aditya/image data 2018-19/Training_Input_Resized.pkl"

train_labels = pd.read_pickle(train_labels_file)
train_datagen = ImageDataGenerator(rescale=1./255)
train_generator = train_datagen.flow_from_dataframe(dataframe=train_labels,directory=train_dir,target_size=(108,192),x_col='Filename',y_col=['Right Ankle x','Right Knee x','Right Hip x','Left Hip x','Left Knee x','Left Ankle x','Pelvis x','Thorax x','Upper Neck x','Head Top x','Right Wrist x','Right Elbow x','Right Shoulder x','Left Shoulder x','Left Elbow x','Left Wrist x','Right Ankle y','Right Knee y','Right Hip y','Left Hip y','Left Knee y','Left Ankle y','Pelvis y','Thorax y','Upper Neck y','Head Top y','Right Wrist y','Right Elbow y','Right Shoulder y','Left Shoulder y','Left Elbow y','Left Wrist y'],class_mode='other',batch_size=16)
    
#get directory of input images and create array of images and store images in the directory to the array
test_dir = "C:/pooled/Test"
#get labels pickle and convert to dataframe then sort by the filename to go along with the images
test_labels_file = "C:/Users/panka/OneDrive/Desktop/Aditya/image data 2018-19/Testing_Input_Resized.pkl"

test_labels = pd.read_pickle(test_labels_file)
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_dataframe(dataframe=test_labels,directory=test_dir,target_size=(108,192),x_col='Filename',y_col=['Right Ankle x','Right Knee x','Right Hip x','Left Hip x','Left Knee x','Left Ankle x','Pelvis x','Thorax x','Upper Neck x','Head Top x','Right Wrist x','Right Elbow x','Right Shoulder x','Left Shoulder x','Left Elbow x','Left Wrist x','Right Ankle y','Right Knee y','Right Hip y','Left Hip y','Left Knee y','Left Ankle y','Pelvis y','Thorax y','Upper Neck y','Head Top y','Right Wrist y','Right Elbow y','Right Shoulder y','Left Shoulder y','Left Elbow y','Left Wrist y'],class_mode='other',batch_size=16)

#create model
model = Sequential()
Example #41
0
def word2vec_inception():
    return concat_ser_dic(pd.read_pickle("./word2vec"),
                          pd.read_pickle("./inception"))
Example #42
0
df0 = df_pressures(xds_ibtracs)
df0[6000:6010]

#path to your daily mean SST and MLD data
path_sst = r'/media/administrador/SAMSUNG/seasonal_forecast/data/SST/'
path_mld = r'/media/administrador/SAMSUNG/seasonal_forecast/data/CFS/ocnmld/'
path_p = r'/home/administrador/Documentos/seasonal/seasonal_forecast/new/'

**For the calibration period, only the points in the target area with pressure, SST and MLD data are kept.**

df = df_p_sst_mld(df0,path_sst,path_mld)
df_cali = df.drop(df.index[5184:]) #years of the calibration period
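The positional cut above keeps the first 5184 rows as the calibration years; a minimal, more explicit alternative, assuming the DataFrame carries a datetime index (the cut-off date below is hypothetical, not taken from this notebook):

df_cali = df.loc[df.index < '2019-01-01']  # keep only the calibration-period years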

# load data
path_p= r'/home/administrador/Documentos/seasonal/seasonal_forecast/new/'
df = pd.read_pickle(path_p+'df_coordinates_pmin_sst_mld_2019.pkl')
df.tail()

## <font color='royalblue'>**3.2 Predictor grid and data processing**</font> <a name="pg"></a>

<br />


**The historical datasets are interpolated onto a 1/2º grid, which defines the predictor grid in the target area.**
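As a rough illustration of this interpolation step, a minimal sketch assuming the SST/MLD/pressure fields are held as xarray DataArrays with latitude/longitude coordinates (the coordinate names and target-area bounds below are hypothetical, not taken from this notebook):

import numpy as np
import xarray as xr

def to_half_degree_grid(field, lat_bounds, lon_bounds):
    # regular 1/2-degree target grid covering the target area
    target_lat = np.arange(lat_bounds[0], lat_bounds[1] + 0.5, 0.5)
    target_lon = np.arange(lon_bounds[0], lon_bounds[1] + 0.5, 0.5)
    # linear interpolation onto the predictor grid
    return field.interp(latitude=target_lat, longitude=target_lon)

# e.g. sst_half = to_half_degree_grid(sst_da, (-30.0, 0.0), (160.0, 210.0))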


fig_predictor_grid = plot_predictor_grid()

**MLD, SST and pressure data plots:**

plot_sst_mlp_pmin_cali(df)
Example #43
0
import pandas as pd
from ecg_qc import ecg_qc
import math
from tqdm import tqdm

time_window_ml = 4
fs = 1000

df_ecg = pd.read_pickle('dataset_streamlit/df_ecg_103001_selection.pkl')
df_ecg.head()

ecg_data = df_ecg['ecg_signal'][10000:]

ecg_qc_ml = ecg_qc(
    normalized=True,
    model=
    '/home/aura-alexis/github/ecg_qc_viz/env2/lib64/python3.6/site-packages/ecg_qc-1.0b4-py3.6.egg/ecg_qc/ml/models/xgb_norm_{}s.joblib'
    .format(time_window_ml),
    data_encoder=
    '/home/aura-alexis/github/ecg_qc_viz/env2/lib64/python3.6/site-packages/ecg_qc-1.0b4-py3.6.egg/ecg_qc/ml/data_encoder/data_encoder_norm_{}s.joblib'
    .format(time_window_ml))

df_results = df_ecg
df_results['ml'] = ''

for ecg_signal_index in tqdm(
        range(math.floor(ecg_data.shape[0] / (fs * time_window_ml)) + 1)):
    start = ecg_signal_index * fs * time_window_ml
    end = start + fs * time_window_ml
    ml_prediction = ecg_qc_ml.get_signal_quality(ecg_data[start:end].values)
    df_results.iloc[start:end, df_results.columns.get_loc('ml')] = ml_prediction
Example #44
0
def doc2vec_word2vec_inception():
    tmp = concat_ser_dic(pd.read_pickle("./doc2vec"),
                         pd.read_pickle("./word2vec"))
    return concat_ser_dic(tmp, pd.read_pickle("./inception"))
Example #45
0
    def test_pickle_method(self):
        filename = os.path.join(self.tempdir, "df.pkl")
        self.df.to_pickle(filename)
        unpickled = pd.read_pickle(filename)
        assert_frame_equal(self.df, unpickled)
        assert self.df.crs == unpickled.crs
Example #46
0
def doc2vec_word2vec():
    return concat_ser_dic(pd.read_pickle("./doc2vec"),
                          pd.read_pickle("./word2vec"))
import pandas as pd

model = pd.read_pickle('IRIS_Model.bin')

sl = float(input('Enter Sepal_length(4.3 - 8.0) : '))
sw = float(input('Enter Sepal_width(2.0 - 4.4) : '))
pl = float(input('Enter Petal_length(1.0 - 7.0) : '))
pw = float(input('Enter Petal_width(0.1 - 2.5) : '))

result = model.predict([[pl, pw, sl * pw, sl * pl, sw * pl, sw * pw, pl * pw]])
if result == 1:
    result = 'Setosa'
elif result == 0:
    result = 'Virginica'
elif result == 2:
    result = 'Versicolor'
print('According to your information, this flower belongs to the {} species'.format(result))
#print(result)
# Author Shael Minuk
import pickle as pkl
import pandas as pd
sim_data = pd.read_pickle("sim_results.pkl")

sim_data["sub_thick"] = 0
sim_data["sub_perm"] = 0
## map the substrate to its thickness and permittivity, so the column is numerical rather than categorical
sim_data.loc[(sim_data[9] == "Rogers RO3003"), "sub_thick"] = 1.52e-3
sim_data.loc[(sim_data[9] == "Rogers RO3010"), "sub_thick"] = 1.28e-3
sim_data.loc[(sim_data[9] == "Rogers RO3003"), "sub_perm"] = 3
sim_data.loc[(sim_data[9] == "Rogers RO3010"), "sub_perm"] = 10
sim_data = sim_data.drop([0, 1, 6, 9], axis=1)
# one-hot encode the categorical top/mid/bottom column: one column becomes boolean 5_bot, 5_mid, 5_top, etc.
sim_data = pd.get_dummies(sim_data)
# store processed pickled data in model folder (dataframe)
sim_data.to_pickle("processed_data.pkl")
    cityFilter = [
        'livorno',
    ]  # 'nice']
    cityList = list(filter(lambda a: a not in cityFilter, cityList))
    cityList.append('barcellona')

    print(n)
    print(len(cityList))
    print(len(cityList))
    return citiyDic, cityList


inPath = '../data/'

df_librettos = pd.read_pickle(inPath + 'librettos_1.pkl')

filter_pot_city = [
    'casale', 'vittoria', 'desio', 'nola', 'bali', 'mira', 'sora', 'sora',
    'genzano', 'faro'
]

european_dic, european_cities = cityDic()

italian_dic, italian_cities = cityDicItaly()

european_dic = {**european_dic, **italian_dic}

city_names = df_librettos.pot_city_name.tolist()

long = df_librettos.longitude.tolist()
Example #50
0
        v = unit_transform(arg)
        row = [country_a, country_b, item_no, date_list[i], v]
        allrow.append(row)
    return allrow


def remake(df, item_no, value_header):
    '''
    Reshape the table_source df and return a DataFrame.

    df : pd.read_html(table_source)
    item_no : str, product item code, e.g. "020711", "020712", "020714", "020742"
    value_header : str, trade measure, e.g. "ex_qty", "im_qty", "ex_val", "im_val"
    '''
    columns = ['country_a', 'country_b', 'item_no', 'date', value_header]

    records = []
    for row_no in range(2, len(df)):  # data rows start at the third row
        allrow = parser_row_val(df, row_no, item_no)
        for row in allrow:
            records.append(row)

    df2 = pd.DataFrame.from_records(records, columns=columns)
    return df2


if __name__ == "__main__":
    df = pd.read_pickle('./pickle/test.pickle')
    df2 = remake(df, "020712", "im_qty")
    print(df2)
import sys
import time

import numpy as np
import pandas as pd
from math import *
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from pandas import DataFrame
from gensim.models import KeyedVectors
# project-local helper from the Kaggle word2vec tutorial (assumed to be on the path)
from KaggleWord2VecUtility import KaggleWord2VecUtility

filename = sys.argv[1]
dims = int(sys.argv[2])

# Word Vectors
Glove = KeyedVectors.load(filename)

start = time.time()

all = pd.read_pickle('all.pkl')

# Computing tf-idf values.
traindata = []
for i in range( 0, len(all["text"])):
	traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(all["text"][i], True)))

tfv = TfidfVectorizer(strip_accents='unicode',dtype=np.float32)
tfidfmatrix_traindata = tfv.fit_transform(traindata)
featurenames = tfv.get_feature_names()
idf = tfv._tfidf.idf_

# Creating a dictionary with word mapped to its idf value 
print "Creating word-idf dictionary for Training set..."

word_idf_dict = {}
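# Sketch of the natural next step (not shown in the original excerpt):
# pair each feature name with its idf weight.
for word, idf_value in zip(featurenames, idf):
    word_idf_dict[word] = idf_value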
Example #52
0
                                mask=masks[0],
                                drift_model=None, # Already done by fmriprep
                                smoothing_fwhm=5.0,
                                hrf_model='spm + derivative',
                                n_jobs=10,
                                subject_label='{}.{}'.format(ds, subject))

        model.fit(images, 
                  behavior,
                  confounds)

        models.append(model)

    mask = fsl.Info.standard_image('MNI152_T1_2mm_brain_mask.nii.gz')

    confounds = pd.read_pickle(op.join(derivatives, 'all_subjectwise_parameters.pkl'))
    confounds = confounds[['ddm difficulty_effect', 'ddm z_cue_regressor']]
    confounds = confounds.groupby('dataset').transform(lambda x: (x - x.mean())/ x.std())

    confounds['subject_label'] = confounds.apply(lambda row: '{}.{}'.format(row.name[0], row.name[1]), 1)
    confounds['ds'] = confounds.index.get_level_values('dataset').map({'ds-01':0, 'ds-02':1})

    confounds = confounds.reset_index(drop=True)

    model2 = SecondLevelModel(mask)
    model2.fit(models, confounds=confounds)


    glm_dir = op.join(derivatives, 'both', 'modelfitting', 'glm_4', 'shift-{}'.format(shift))

    if not op.exists(glm_dir):
Example #53
0
from settings import FB_ALL, BASELINE_BEFORE, BASELINE_AFTER
import pandas as pd
import seaborn as sns
from pingouin import rm_corr, mixed_anova, pairwise_ttests, rm_anova, plot_paired, friedman, anova, ttest
import pylab as plt
import numpy as np
from mne.stats import fdr_correction

sns.set_context("paper")
sns.set_style("dark")

threshold = 2.125

stats_file = 'baseline_block_stats_1channels_1bands_median_20ths.pkl'
stats_df_all = pd.read_pickle('data/{}'.format(stats_file))
# stats_df = stats_df.loc[stats_df.subj_id!=28]
stats_df_all = stats_df_all.loc[stats_df_all['block_number'].isin(
    [BASELINE_AFTER, BASELINE_BEFORE])]
unique_blocks = list(stats_df_all['block_number'].unique())
stats_df_all = stats_df_all.loc[stats_df_all['threshold_factor'] == threshold]
stats_df_all['baseline'] = stats_df_all['block_number'].apply(
    lambda x: 'After' if x > 10 else 'Before')

fb_types = ['FB0', 'FB250', 'FB500', 'FBMock']
stats_df_all = stats_df_all.loc[stats_df_all['fb_type'].isin(fb_types)]

metric_type = 'n_spindles'
res = mixed_anova(stats_df_all.query('metric_type=="{}"'.format(metric_type)),
                  dv='metric',
                  within='baseline',
                  subject='subj_id',
Example #54
0
    def update_stock_prizes(self, ListOfTickers=None):
        """
		Update stock prizes given in ListOfCompanies using yahoo finance 

		If there is data to update the old file is backuped to .../backup/stockTicker.p so the backup is good for one business day

		"""

        if ListOfTickers is None:
            ListOfTickers = self.ListOfCompanies['Yahoo Ticker']

        print "Start updating stock prizes"
        print "--------------------------------------\n"

        self.UpdateTimeEnd = datetime.datetime.today().date()
        print "Today is ", self.UpdateTimeEnd, "\n"
        notUpdated = []

        for stocklabel in ListOfTickers:

            if os.path.isfile(self.PathData + 'raw/stocks/' + stocklabel +
                              '.p'):
                StockValue = pd.read_pickle(self.PathData + 'raw/stocks/' +
                                            stocklabel + '.p')

                self.UpdateTimeStart = StockValue.tail(
                    5)['Date'].tolist()[0].date()

                #if stock has been updated at the same date already

                if self.UpdateTimeStart == self.UpdateTimeEnd:
                    self.logging(
                        "Stock " + stocklabel +
                        ": UpdateTimeStart is equal to UpdateTimeEnd ")
                    continue
                try:

                    stock_prize = pdr.get_data_yahoo(stocklabel,
                                                     self.UpdateTimeStart,
                                                     self.UpdateTimeEnd)
                    stock_prize.dropna(inplace=True)
                    stock_prize.drop(index=stock_prize.loc[
                        stock_prize['Volume'] == 0.0].index.tolist(),
                                     inplace=True)
                    stock_prize.reset_index(inplace=True)

                    #print stock_prize
                    stock_prize = stock_prize.loc[
                        stock_prize['Date'] >= self.UpdateTimeStart]

                    if len(stock_prize) == 0:
                        self.logging("Stock " + stocklabel +
                                     ": no new data available")
                        continue

                    StockValue = pd.concat([
                        StockValue.loc[
                            StockValue['Date'] < self.UpdateTimeStart],
                        stock_prize
                    ],
                                           ignore_index=True)

                    shutil.copy(
                        self.PathData + 'raw/stocks/' + stocklabel + '.p',
                        self.PathData + 'raw/stocks/backup/' + stocklabel +
                        '.p')

                    #print "number of rows", len(StockValue), " for label", stocklabel
                    StockValue.reset_index(inplace=True, drop=True)

                    StockValue.to_pickle(self.PathData + 'raw/stocks/' +
                                         stocklabel + '.p')
                    print "Stock ", stocklabel, " updated"
                    self.logging("Stock " + stocklabel +
                                 ": successfully updated")

                except RemoteDataError:
                    self.logging("Stock " + stocklabel +
                                 ": No information for ticker found")
                    print "No information for ticker ", stocklabel
                    notUpdated.append(stocklabel)
                    continue

                except SSLError:
                    self.logging("Stock " + stocklabel + ":SSLError")
                    notUpdated.append(stocklabel)
                    continue

                except ConnectionError:
                    self.logging("Stock " + stocklabel + ": ConnectionError")
                    notUpdated.append(stocklabel)
                    continue
                except IndexError:
                    self.logging("Stock " + stocklabel + ": IndexError")
                    notUpdated.append(stocklabel)
                    continue
            else:
                #if file is not available yet get data starting from 01/01/2000
                self.UpdateTimeStart = datetime.datetime(2000, 1, 1).date()
                try:
                    stock_prize = pdr.get_data_yahoo(stocklabel,
                                                     self.UpdateTimeStart,
                                                     self.UpdateTimeEnd)
                    stock_prize.drop(index=stock_prize.loc[
                        stock_prize['Volume'] == 0.0].index.tolist(),
                                     inplace=True)
                    stock_prize.dropna(inplace=True)
                    stock_prize = stock_prize.reset_index()

                    #print stock_prize
                    stock_prize.to_pickle(self.PathData + 'raw/stocks/' +
                                          stocklabel + '.p')

                    print "Stock ", stocklabel, " updated"
                    self.logging("Stock " + stocklabel +
                                 ": successfully updated")

                except RemoteDataError:
                    self.logging("Stock " + stocklabel +
                                 ": No information for ticker found")
                    print "No information for ticker ", stocklabel
                    continue

                except SSLError:
                    self.logging("Stock " + stocklabel + ":SSLError")
                    notUpdated.append(stocklabel)
                    continue

        print "\nFinished updating stock prizes\n\n"

        if len(notUpdated) > 0:
            print "Not updated stocks", notUpdated
            self.logging("Not updated stocks " + str(notUpdated))

            return notUpdated
        else:
            return None
def get_movie_feature():
    df = pd.read_pickle(os.path.join(config.DIR_DATA, 'movie_feature_pub.pkl'))
    return df
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report
from sklearn.svm import OneClassSVM
from sklearn.pipeline import Pipeline

# In[2]:
class dataset:
    pass
sample_data = pd.read_csv(r"D:\KULIAH\Semester 8\Dataset\Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv")
sample_data.to_pickle(r'D:\KULIAH\Semester 8\Dataset\Thursday-15-02-2018_TrafficForML_CICFlowMeter.pkl')

# In[3]:
df = pd.read_pickle(r'D:\KULIAH\Semester 8\Dataset\Thursday-15-02-2018_TrafficForML_CICFlowMeter.pkl')
df = df[['URG Flag Cnt','SYN Flag Cnt','RST Flag Cnt','PSH Flag Cnt','Protocol',
         'Pkt Size Avg','Flow Pkts/s','FIN Flag Cnt','ECE Flag Cnt','ACK Flag Cnt','Dst Port','Label']]
df["Flow Pkts/s"] = pd.to_numeric(df["Flow Pkts/s"], errors='coerce')
df.dropna(inplace=True)
df.info(verbose=True)

# In[5]:
# stratified 80% sample per Label for training; the remainder becomes the test set
dataset.train = (df.groupby('Label')
                   .apply(pd.DataFrame.sample, frac=0.8)
                   .reset_index(level='Label', drop=True))
dataset.test = df.drop(dataset.train.index)
dataset.label = dataset.train.Label.copy()

# In[6]:
dataset.train
def get_ratings():
    df = pd.read_pickle(os.path.join(config.DIR_DATA, 'ratings_pub.pkl'))
    return df
def get_question3_ref():
    df_ref_movie_feature = pd.read_pickle(os.path.join(config.DIR_DATA, 'ref_movie_feature.pkl'))
    return df_ref_movie_feature
Example #59
0
                           ) / df[abbv + ' SA Value'][0] * 100.0

        if main_df.empty:
            main_df = df
        else:
            main_df = main_df.join(df)

    print(main_df.head())

    with open('fiddy_states3.pickle', 'wb') as pickle_out:
        pickle.dump(main_df, pickle_out)


def HPI_Benchmark():
    df = quandl.get("FMAC/HPI_USA", authtoken=api_key)
    df['NSA Value'] = (df['NSA Value'] -
                       df['NSA Value'][0]) / df['NSA Value'][0] * 100.0
    df['SA Value'] = (df['SA Value'] -
                      df['SA Value'][0]) / df['SA Value'][0] * 100.0
    return df


m30 = mortgage_30yr()
HPI_data = pd.read_pickle('fiddy_states3.pickle')
HPI_bench = HPI_Benchmark()

state_HPI_M30 = HPI_data.join(m30)

print(state_HPI_M30.corr()['M30'].describe())
def get_likes():
    df = pd.read_pickle(os.path.join(config.DIR_DATA, 'likes_pub.pkl'))
    return df