Example #1
 def load_from_file(self, filename, append_data=False):
     loaded_data_frame = pandas.load(filename)
     if append_data:
         self.data_frame = pandas.concat(
             [self.data_frame, loaded_data_frame])
     else:
         self.data_frame = pandas.load(filename)
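A note on the API used throughout these examples: pandas.load and DataFrame.save were removed from pandas (around 0.17); the current equivalents are pandas.read_pickle and DataFrame.to_pickle. A minimal sketch of the same method on a modern pandas, assuming self.data_frame holds the existing DataFrame:

 def load_from_file(self, filename, append_data=False):
     # read_pickle replaces the removed pandas.load
     loaded_data_frame = pandas.read_pickle(filename)
     if append_data:
         self.data_frame = pandas.concat(
             [self.data_frame, loaded_data_frame])
     else:
         self.data_frame = loaded_data_frame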
Example #2
def load_data(locking_event, after_trial_type, bins, meth):
    """Load the appropriate dataset given by locking_event and after_trial_type
    
    Returns: suffix, all_event_latencies, binneds
        all_event_latencies is a series with multi-index (
            block, gng, event, ulabel)
    """
    suffix = '_lock_%s_%s' % (locking_event, after_trial_type)
    dfoldeds = pandas.load('dfoldeds' + suffix)
    times_distr = pandas.load('times_distrs' + suffix)

    # Bin each individual sname from each folded
    binneds = dfoldeds['dfolded'].apply(
        lambda dfolded: kkpandas.Binned.from_dict_of_folded(
            dfolded, bins=bins, meth=meth))

    # Convert times_distr to a series keyed by (block, gng, event, ulabel)
    # instead of by (ulabel, group, event)
    all_event_latencies = times_distr.copy()
    all_event_latencies.index = pandas.MultiIndex.from_tuples([
        (idx[1][:2], idx[1][-2:], idx[2], idx[0])
        for idx in all_event_latencies.index],
        names=['block', 'gng', 'event', 'ulabel'],)

    return suffix, all_event_latencies, binneds
Example #3
 def load_from_file(self, filename, append_data=False):
     loaded_data_frame = pandas.load(filename)
     if append_data:
         self.data_frame = pandas.concat([self.data_frame,
                                          loaded_data_frame])
     else:
         self.data_frame = pandas.load(filename)
Example #4
def load_data(locking_event, after_trial_type, bins, meth):
    """Load the appropriate dataset given by locking_event and after_trial_type
    
    Returns: suffix, all_event_latencies, binneds
        all_event_latencies is a series with multi-index (
            block, gng, event, ulabel)
    """
    suffix = '_lock_%s_%s' % (locking_event, after_trial_type)
    dfoldeds = pandas.load('dfoldeds' + suffix)
    times_distr = pandas.load('times_distrs' + suffix)

    # Bin each individual sname from each folded
    binneds = dfoldeds['dfolded'].apply(
        lambda dfolded: kkpandas.Binned.from_dict_of_folded(
            dfolded, bins=bins, meth=meth))

    # Convert times_distr to a series keyed by (block, gng, event, ulabel)
    # instead of by (ulabel, group, event)
    all_event_latencies = times_distr.copy()
    all_event_latencies.index = pandas.MultiIndex.from_tuples(
        [(idx[1][:2], idx[1][-2:], idx[2], idx[0])
         for idx in all_event_latencies.index],
        names=['block', 'gng', 'event', 'ulabel'],
    )

    return suffix, all_event_latencies, binneds
Example #5
def create_data_df(ts,wts):
    ts = pd.load('../Data/ts_data')
    #wts = pd.load('../Data/wts_data')
    wts = pd.load('../Data/ts_humidex')
    prevday_ts = ts.tshift(1,'D')
    prevday_avg = prevday_ts.resample('D',how='mean')
    prevday_avg = prevday_avg.asfreq('15Min',method='pad')
    prevweek_ts = ts.tshift(7,'D')
    #change min_date of all timeseries to the prevweek_ts date
    min_date = max(min(ts.index),min(prevweek_ts.index),min(prevday_ts.index),min(wts.index))
    max_date = max(prevday_ts.index)
    ts = ts[min_date:max_date]
    wts = wts[min_date:max_date]
    wts[wts<-20] = None
    wts = wts.interpolate()
    prevday_ts = prevday_ts[min_date:]
    prevday_avg = prevday_avg[min_date:]
    prevweek_ts = prevweek_ts[min_date:]
    all_covs = pd.DataFrame({'load':ts,'prevday_load':prevday_ts,'prevweek_ts':prevweek_ts,'prevday_avg':prevday_avg,'weather':wts},index=prevday_ts.index)
    hr = pd.Series(ts.index.map(lambda x: x.hour+(float(x.minute)/60)),index=ts.index)
    dod = pd.Series(ts.index.map(lambda x: int('%d%03d'%(x.year,x.dayofyear))),index=ts.index)
    mydf = pd.concat([ts,dod,hr],axis=1)
    mydf.columns=['ts','dod','hr']
    mydf = mydf.pivot(columns='dod',index='hr',values='ts')
    mywdf = pd.concat([wts,dod,hr],axis=1)
    mywdf.columns = ['wts','dod','hr']
    mywdf = mywdf.drop_duplicates(cols=['hr','dod'],take_last=True)
    mywdf = mywdf.pivot(columns='dod',index='hr',values='wts')
    return mydf,mywdf
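Several calls in this example rely on APIs that later pandas releases dropped: pd.load, Series.tshift, and the how= argument to resample. A rough sketch of the opening lines on a current pandas, assuming the same pickle files exist at those paths:

ts = pd.read_pickle('../Data/ts_data')            # replaces pd.load
wts = pd.read_pickle('../Data/ts_humidex')
prevday_ts = ts.shift(1, freq='D')                # replaces ts.tshift(1, 'D')
prevday_avg = prevday_ts.resample('D').mean()     # replaces resample('D', how='mean')
prevday_avg = prevday_avg.asfreq('15Min', method='pad')
prevweek_ts = ts.shift(7, freq='D')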
Example #6
def practice_two():
    frame = pd.read_csv('ch06/ex1.csv')  # read
    frame.save('ch06/frame_pickle')  # write     !!! raises an error here, cannot be stored
    pd.load('ch06/frame_pickle')  # read

    # Using the HDF5 format
    store = pd.HDFStore('mydata.h5')
    store['obj1'] = frame
    store['obj1_col'] = frame['a']

    # Reading a Microsoft Excel file (uses the xlrd and openpyxl packages)
    xls_file = pd.ExcelFile('data.xls')  # 传入文件
    table = xls_file.parse('Sheet1')

    pass
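The comment above records that frame.save(...) raised an error; that method, like pd.load, is gone from current pandas. A hedged rewrite of the same steps with today's API (same file names as in the example):

def practice_two():
    frame = pd.read_csv('ch06/ex1.csv')
    frame.to_pickle('ch06/frame_pickle')           # replaces frame.save
    frame = pd.read_pickle('ch06/frame_pickle')    # replaces pd.load

    # HDF5 format; the context manager closes the store afterwards
    with pd.HDFStore('mydata.h5') as store:
        store['obj1'] = frame
        store['obj1_col'] = frame['a']

    # Excel; pd.read_excel is the short form of ExcelFile(...).parse(...)
    table = pd.read_excel('data.xls', sheet_name='Sheet1')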
Example #7
def main():
    os.chdir("../../")
    
    # Load the DataFrame representing the MovieLens subset
    
    ratings=pd.load('input_data/ratings_train.pda')
    ratings = ratings.dropna()
    nratings = ratings.shape[0]
    
    try:
        
        itemFilter = CFilter_item(ratings) 
        item_cf_ratings = itemFilter.get_cf_rating(ratings)
        
        # Compute average deviation from rating on those ratings
        avgDeviation = np.sum(np.abs((item_cf_ratings - ratings['rating']))) / len(ratings)
        print "Average deviation from rating: %.4f" %(avgDeviation)
        
        # Dump the ratings to disk for easy future access
        fp = open("proc_data/item_item_ratings_2.pda", "wb")
        pkl.dump(item_cf_ratings, fp)
        fp.close()
                
        print "This concludes the main method. Buh-bye!"
        
    except LogicalError as l:
        print "A logical error occurred: %s" %(l)
    except DatasetError as d:
        print "A dataset-related error occurred: %s" %(d)
    except Exception as e:
        print "An exception occurred: " + str(e)
Example #8
def readPkl():
    currDir = os.getcwd()
    files = os.path.join(currDir, 'pkl', 'BasicFeatures', '*.pkl')
    for src in glob.glob(files):
        df = pd.load(src)
        print df.index[:10]
        print df.index[0].strftime('%Y%m%d')
Example #9
def display_progress_sparse():

    progress = pd.load('results/progress_sparse.pkl')

    for algo in ['batch', 'bb', 'lbfgs']:
        for i in range(1):

            x = progress.loc[algo+'_x_dense_'+str(i)]['time']
            y = progress.loc[algo+'_x_dense_'+str(i)]['f-f_min']
            plt.plot(x, y, 'r', linewidth=2, label='x dense')

            x = progress.loc[algo+'_z_dense_'+str(i)]['time']
            y = progress.loc[algo+'_z_dense_'+str(i)]['f-f_min']
            plt.plot(x, y, 'g', linewidth=2, label='z dense')

            x = progress.loc[algo+'_x_sparse_'+str(i)]['time']
            y = progress.loc[algo+'_x_sparse_'+str(i)]['f-f_min']
            plt.plot(x, y, '--r', linewidth=2, label='x_sparse')

            x = progress.loc[algo+'_z_sparse_'+str(i)]['time']
            y = progress.loc[algo+'_z_sparse_'+str(i)]['f-f_min']            
            plt.plot(x, y, '--g', linewidth=2, label='z_sparse')
            #plt.xscale('log')
            plt.yscale('log')
            plt.legend(loc=0)
            plt.ylabel('log10 f - f_min', fontsize=16)
            plt.xlabel('time in seconds', fontsize=16)
            #plt.title(algo+' experiment '+str(i), fontsize=16)
            plt.title(algo, fontsize=16)
            plt.show()
Example #10
def display_progress():

    progress = pd.load('results/progress.pkl')

    for algo in ['batch', 'bb', 'lbfgs']:
        for i in range(1):
            
            data = progress.loc[algo+'_x_'+str(i)]
            x = np.array(data['time'])
            y = np.array(data['f-f_min'])
            x, log_y, alpha = clean_progress(x, y)
            plt.plot(x, log_y, 'r', label=algo+' in x', linewidth=2)
            plt.plot(x, alpha*x + log_y[0], '--r', label='linear fit', linewidth=2)

            data = progress.loc[algo+'_z_'+str(i)]
            x = np.array(data['time'])
            y = np.array(data['f-f_min'])
            x, log_y, alpha = clean_progress(x, y)
            plt.plot(x, log_y, 'g', label=algo+' in z', linewidth=2)
            plt.plot(x, alpha*x + log_y[0], '--g', label='linear fit', linewidth=2)

            plt.legend(loc=0)
            plt.xlabel('time (s)', fontsize=20)
            plt.ylabel('f-f_min', fontsize=20)
            plt.title(algo+' experiment '+str(i), fontsize=20)
            plt.show()
Example #11
 def __init__(self, table):
     """ 
     table : string
         Expect to be a csv file with the first column the name of the item
     """
     ext = os.path.splitext(table)[-1]
     if ext == '.csv':
         self.df = pd.read_csv(table, index_col=0)
     elif ext == '.pickle':
         self.df = pd.load(table)
     else:
         raise Exception('table extension not .csv or .pickle')
     self.cols = self.df.columns
     self.index = self.df.index
     self.df['name'] = self.index
     # this is the list of types that I've found so far that  pandas.read_csv generates
     self.types = [
         dict(float64='Number',
              float='Number',
              int64='Number',
              object='Value',
              bool='bool')[str(t)] for t in self.df.dtypes
     ]
     print 'Loaded CSV file %s: found %d items, %d columns' % (
         table, len(self.df), len(self.cols))
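The comprehension above maps each dtype name through a literal dict, so any dtype outside that list (int32, datetime64[ns], ...) raises a KeyError. A slightly more defensive variant of just that mapping; the fallback label and the _DTYPE_LABELS name are assumptions made for this sketch, not the original behaviour:

     _DTYPE_LABELS = dict(float64='Number', float='Number', int64='Number',
                          object='Value', bool='bool')
     self.types = [_DTYPE_LABELS.get(str(t), 'Value') for t in self.df.dtypes]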
Example #12
def get_annual_total_net_assets(force_db_read=False, df_file='monthlytna.pandas'):
    """ gets average total net assets per fund per year 
    
    NOTE: This is not used in the current iteration of research    
    """
    # caching to make things faster most of the time
    if not force_db_read and os.path.isfile(df_file):
        print "Loading file from cache:",df_file
        return pd.load(df_file)
    #otherwise read from db    
    try:
        print "Getting Total Net Assets from database"
        con=None
        con = lite.connect(config['db_path'])    
        cur = con.cursor()      
        sql = """select crsp_fundno, substr(caldt,1,4) as year, mtna from MONTHLY_TNA where mtna <> '';"""
        cur.execute(sql) 
        data = cur.fetchall()
        print "Done getting Total Net Assets"
        df = pd.DataFrame(list(data),columns=['FundNo','Year','mtna'],dtype=np.float64) 

        df=df.replace({'mtna':-99.0}, value=np.nan)
        df.dropna(subset=['mtna'])

        grouped = df.groupby(['FundNo','Year'])
        tna_means = grouped.mean()
        print "Saving monthly TNA dataframe for next time as",df_file
        tna_means.save(df_file)
        return tna_means
    except lite.Error, e:
        print "Error %s:" % e.args[0]
        sys.exit(1)
Example #13
def main(args):
    df = pd.load(args.df)

    estimators = [('reduce_dim', KernelPCA(kernel='linear')),
                  ('knc', KNeighborsClassifier(warn_on_equidistant=False))]

    clf = Pipeline(estimators)

    params = dict(reduce_dim__n_components=[2, 3, 5, 8, 10, 25, 50, 100])

    grid_search = GridSearchCV(clf, param_grid=params, cv=3)

    best = 0.0
    while True:
        random_seed = int(time.time() * 1000)
        random.seed(random_seed)

        df_l, y_l, df_t = split_df(df, 0.33)

        try:
            grid_search.fit(df_l, y_l)
        except ValueError:
            # Most likely wrong number of classes in cross validation set
            continue

        y_t = integer_labels(df_t)
        c = grid_search.predict(df_t)

        score, c = compare_clusters(c, y_t)

        if score > best:
            best = score
            print('{} {}'.format(random_seed, best))
            with open(args.out, 'a') as fh:
                fh.write('{} {}\n'.format(random_seed, best))
Example #14
def calc_all_fund_r2(fund_returns,
                     asset_returns,
                     force_calc=False,
                     df_file='r2_all_funds.pandas'):
    """ Calculates R2 across all funds and asset class benchmarks, returns dataframe of results """
    # caching to make things faster most of the time
    if not force_calc and os.path.isfile(df_file):
        print "Loading R2 file from cache:", df_file
        return pd.load(df_file)
    #otherwise re-calc
    asset_r2_results = {}
    fund_list = get_fund_list(fund_returns)
    asset_list = asset_classes
    # dataframe to hold our results, index by fund, columns will be asset classes
    r2_df = pd.DataFrame(index=fund_list)
    for asset_series in asset_returns:
        print '***** Calculating R2 for asset class', asset_series.name
        for fund in fund_list:
            #reindex as datetime so we can resample to EOM dates and join with the benchmark
            f = fund_returns.ix[fund]
            f.index = pd.DatetimeIndex(f.index)
            f = f.resample(
                'M', how='prod'
            )  # just to normalize dates, shouldn't change data points
            r2 = calc_r2(f.join(asset_series, how='outer'))
            asset_r2_results[fund] = r2
            print 'R2(%s, fund %s) = %s' % (asset_series.name, fund, r2)

        r2_df[asset_series.name] = pd.Series(asset_r2_results)
        asset_r2_results = {}
    print 'Saving R2 results to', df_file
    r2_df.save(df_file)
    return r2_df
Example #15
    def data(self):
        """ Returns a pandas.DataFrame of data, or None if not available. """

        with transaction.commit_on_success():
            self.refresh()
            if not self.status == Job.COMPLETE:
                raise ValueError("Job not complete, no data available")

            self.reference("data()")

            e = None
            try:
                logger.debug("%s looking for data file: %s" %
                             (str(self), self.datafile()))
                if os.path.exists(self.datafile()):
                    df = pandas.load(self.datafile())
                    logger.debug("%s data loaded %d rows from file: %s" %
                                 (str(self), len(df), self.datafile()))
                else:
                    logger.debug("%s no data, missing data file: %s" %
                                 (str(self), self.datafile()))
                    df = None
            except Exception as e:
                logger.error("Error loading datafile %s for %s" %
                             (self.datafile(), str(self)))
                logger.error("Traceback:\n%s" % e)
            finally:
                self.dereference("data()")

            if e:
                raise e

            return df
Example #16
    def data(self):
        """ Returns a pandas.DataFrame of data, or None if not available. """

        with transaction.commit_on_success():
            self.refresh()
            if not self.status == Job.COMPLETE:
                raise ValueError("Job not complete, no data available")
            
            self.reference("data()")

            e = None
            try:
                logger.debug("%s looking for data file: %s" %
                             (str(self), self.datafile()))
                if os.path.exists(self.datafile()):
                    df = pandas.load(self.datafile())
                    logger.debug("%s data loaded %d rows from file: %s" %
                                 (str(self), len(df), self.datafile()))
                else:
                    logger.debug("%s no data, missing data file: %s" %
                                 (str(self), self.datafile()))
                    df = None
            except Exception as e:
                pass
            finally:
                self.dereference("data()")

            if e:
                raise e
            
            return df
Example #17
def get_annual_total_net_assets(force_db_read=False,
                                df_file='monthlytna.pandas'):
    """ gets average total net assets per fund per year 
    
    NOTE: This is not used in the current iteration of research    
    """
    # caching to make things faster most of the time
    if not force_db_read and os.path.isfile(df_file):
        print "Loading file from cache:", df_file
        return pd.load(df_file)
    #otherwise read from db
    try:
        print "Getting Total Net Assets from database"
        con = None
        con = lite.connect(config['db_path'])
        cur = con.cursor()
        sql = """select crsp_fundno, substr(caldt,1,4) as year, mtna from MONTHLY_TNA where mtna <> '';"""
        cur.execute(sql)
        data = cur.fetchall()
        print "Done getting Total Net Assets"
        df = pd.DataFrame(list(data),
                          columns=['FundNo', 'Year', 'mtna'],
                          dtype=np.float64)

        df = df.replace({'mtna': -99.0}, value=np.nan)
        df.dropna(subset=['mtna'])

        grouped = df.groupby(['FundNo', 'Year'])
        tna_means = grouped.mean()
        print "Saving monthly TNA dataframe for next time as", df_file
        tna_means.save(df_file)
        return tna_means
    except lite.Error, e:
        print "Error %s:" % e.args[0]
        sys.exit(1)
Example #18
def process_wave_height(awac_path):
    awac_path = os.path.normpath(awac_path)
    path = os.path.sep.join(awac_path.split(os.path.sep)[:-1])
    os.chdir(path)
    awac_file_name = awac_path.split(os.path.sep)[-1:][0]
    wave_height_df = pd.load(awac_file_name)
    get_stats_from_df(wave_height_df, "wave_height_decibar", path)
Example #19
def chewsmet_txt(path, title):

    import pandas

    metdf = pandas.load(path + title + '.df')
    # open all file end with .df
    #metdf=pandas.load('/Users/Eating/Documents/ATM_R/python program/20150615_7.df')
    #print metdf['Wdir'][-1]
    #raw_input()
    #print metdf.index

    #raw_input()
    metdfFile = open(path + title + '.txt', 'w')
    #metdfFile=open('/Users/Eating/Documents/ATM_R/python program/20150615_7.txt','w')

    metdfFile.write(
        'Date_Time, Wind Speed, Wind Dir, Temperature, WaterPress, Humidity\n '
    )
    for i in range(0, len(metdf.index)):
        ##print metdf['Wdir'][i]
        #print metdf['Wspd'][i]

        metdfFile.write(
            str(metdf.index[i]) + ', ' + str(metdf['Wspd'][i]) + ', ' +
            str(metdf['Wdir'][i]) + ',' + str(metdf['temp'][i]) + ',' +
            str(metdf['H20Pres'][i]) + ',' + str(metdf['humidity'][i]) + '\n')

    #raw_input()
    metdfFile.close()

    print('done')
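The manual write loop above duplicates what DataFrame.to_csv already does. A compact sketch, assuming the .df file is a pickle and keeping the same column order and header names (the stray space after the original header line is dropped):

def chewsmet_txt(path, title):
    import pandas as pd

    metdf = pd.read_pickle(path + title + '.df')
    out_cols = ['Wspd', 'Wdir', 'temp', 'H20Pres', 'humidity']
    metdf[out_cols].to_csv(
        path + title + '.txt',
        header=['Wind Speed', 'Wind Dir', 'Temperature', 'WaterPress', 'Humidity'],
        index_label='Date_Time')
    print('done')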
Example #20
def readPkl():
    currDir = os.getcwd()
    files = os.path.join(currDir, 'pkl', 'BasicFeatures', '*.pkl')
    for src in glob.glob(files):
        df = pd.load(src)
        print df.index[:10]
        print df.index[0].strftime('%Y%m%d')
Example #21
def load_quote(currency, path=NETFOUNDS_QUOTES_PATH):
    try:
        return pd.load(path + '/{}.dat'.format(currency))
    except FileNotFoundError:
        return None
    except Exception as e:
        raise(e)
Example #22
def load_quote(currency, path=NETFOUNDS_QUOTES_PATH):
    try:
        return pd.load(path + '/{}.dat'.format(currency))
    except FileNotFoundError:
        return None
    except Exception as e:
        raise (e)
Example #23
def get_style_bucket_funds(fund_returns, force_bucket=False, bucket_file='bucketed_style.pandas'):
    """ Creates a dataframe of funds that by style mapped to each asset class"""
    if not force_bucket and os.path.isfile(bucket_file):
        print "Loading bucketed style file from cache:",bucket_file
        return pd.load(bucket_file)
    #otherwise create the bucket file
    print "Bucketing by style"
    
    fund_styles = get_fund_styles()
    
    # apply mapping of our asset classes to styles
    # [[put in metamappings.py]]
    asset_style_results = {}
    fund_list = get_fund_list(fund_returns)
    asset_list = asset_classes
    # dataframe to hold our results, index by fund, columns will be asset classes
    style_df=pd.DataFrame(index=fund_list)
    for asset_class in asset_classes:
        print '***** Looking up styles for asset class',asset_class 
        styles = crsp_style_mapping[asset_class] 
        #loop through on the funds we have styles for
        for fund in list(set(fund_list).intersection(list(fund_styles.index))):
            if fund_styles.ix[fund]['StyleCode'] in styles:
                asset_style_results[fund] = 1
        style_df[asset_class] = pd.Series(asset_style_results)
        asset_style_results = {}    
    
    # take only the matching styles, and drop any rows with no assets highly correlated
    bucketed_df=style_df[style_df.apply(lambda x: x == 1, axis=1)].dropna(how='all')  
    
    # save for next time
    bucketed_df.save(bucket_file)
    return bucketed_df 
Example #24
def main():
    os.chdir("../../")
    
    # Load the DataFrame representing the MovieLens subset
    
    ratings=pd.load('input_data/ratings_train.pda')
    ratings = ratings.dropna()
    nratings = ratings.shape[0]
    
    # Load the CFiltering_item object 
    
    fp = open("proc_data/cfilter_object.pda", "rb")
    cf3 = pkl.load(fp)
    fp.close()
    
    try:
        
        # we need to make sure everything's ok with our code
        # so we will do some tests on dimished MovieLens data
        
        item_cf_ratings = cf3.get_cf_rating(ratings.ix[:10000,:])
    
        # Compute squared loss on those ratings
        
        avgSquaredLoss = np.sum(np.square(item_cf_ratings - ratings.ix[:10000, :]['rating'])) / len(ratings)
        print avgSquaredLoss        # 1.59836284936 on the first 10000 items!
 
        
    except LogicalError as l:
        print "A logical error occurred: %s" %(l)
    except DatasetError as d:
        print "A dataset-related error occurred: %s" %(d)
    except Exception as e:
        print "An exception occurred: " + str(e)
Example #25
def chewsmet_txt(path, title):

    import pandas

    metdf = pandas.load(path + title + '.df')
    # open all file end with .df
    #metdf=pandas.load('/Users/Eating/Documents/ATM_R/python program/20150615_7.df')
    #print metdf['Wdir'][-1]
    #raw_input()
    #print metdf.index

    #raw_input()
    metdfFile = open(path + title + '.txt', 'w')
    #metdfFile=open('/Users/Eating/Documents/ATM_R/python program/20150615_7.txt','w')

    metdfFile.write('Date_Time, Wind Speed, Wind Dir, Temperature, WaterPress, Humidity\n ')
    for i in range(0, len(metdf.index)):
        ##print metdf['Wdir'][i]
        #print metdf['Wspd'][i]

        metdfFile.write(str(metdf.index[i]) + ', ' + str(metdf['Wspd'][i]) + ', ' +
                        str(metdf['Wdir'][i]) + ',' + str(metdf['temp'][i]) + ',' +
                        str(metdf['H20Pres'][i]) + ',' + str(metdf['humidity'][i]) + '\n')

    #raw_input()
    metdfFile.close()

    print('done')
Example #26
    def load_assoc(self, fromdf=None):
        if fromdf is not None:
            print 'loading associations from file %s' % fromdf
            self.df['altassoc'] = pd.load(fromdf)
        else:
            print 'using associations found in sourceinfo'
        associations = self.df.associations if fromdf is None else self.df.altassoc
        probfun = lambda x: x['prob'][0] if not pd.isnull(x) else 0
        self.df['aprob'] = np.array([probfun(assoc) for assoc in associations])
        self.df['acat'] = np.array([
            assoc['cat'][0] if not pd.isnull(assoc) else 'unid'
            for assoc in associations
        ])
        self.df['aname'] = np.array([
            assoc['name'][0] if not pd.isnull(assoc) else 'unid'
            for assoc in associations
        ])
        self.df['aang'] = np.array([
            assoc['ang'][0] if not pd.isnull(assoc) else np.nan
            for assoc in associations
        ])

        self.df['adeltats'] = np.array([
            assoc['deltats'][0] if not pd.isnull(assoc) else np.nan
            for assoc in associations
        ])

        self.df10 = self.df.ix[self.df.ts > 10]
        print 'associated: %d/%d' % (sum(self.df10.aprob > 0.8), len(
            self.df10))
Example #27
def calc_all_fund_r2(fund_returns, asset_returns, force_calc=False, df_file='r2_all_funds.pandas'):
    """ Calculates R2 across all funds and asset class benchmarks, returns dataframe of results """
    # caching to make things faster most of the time
    if not force_calc and os.path.isfile(df_file):
        print "Loading R2 file from cache:",df_file
        return pd.load(df_file)
    #otherwise re-calc 
    asset_r2_results = {}
    fund_list = get_fund_list(fund_returns)
    asset_list = asset_classes
    # dataframe to hold our results, index by fund, columns will be asset classes
    r2_df=pd.DataFrame(index=fund_list)
    for asset_series in asset_returns:
        print '***** Calculating R2 for asset class',asset_series.name 
        for fund in fund_list:
            #reindex as datetime so we can resample to EOM dates and join with the benchmark
            f = fund_returns.ix[fund]
            f.index = pd.DatetimeIndex(f.index)
            f = f.resample('M', how='prod') # just to normalize dates, shouldn't change data points
            r2 = calc_r2(f.join(asset_series,how='outer'))
            asset_r2_results[fund] = r2 
            print 'R2(%s, fund %s) = %s' % (asset_series.name,fund,r2)
            
        r2_df[asset_series.name] = pd.Series(asset_r2_results)
        asset_r2_results = {}
    print 'Saving R2 results to',df_file 
    r2_df.save(df_file)
    return r2_df 
Example #28
 def loadDf(filename):
     '''
     load '.df' files
     ret: DataFrame
     '''
     temp_pickle = pandas.load(filename)
     print temp_pickle
     return temp_pickle
Example #29
 def test_wap_dataframe(self):
     print("TestParseWap")
     try:        
         parse_wap.load(awac_folder_path + 'test_data.wap')
     except WindowsError:
         print("Load wap Files failed")
     wap_dataframe = pd.load('test_data_wap_df')
     self.assertEqual(len(wap_dataframe),wad_records)
Example #30
def load_quote(currency):
    try:
        return pd.load(DUKASCOPY_QUOTES_PATH + '/{}.dat'.format(currency))
    except FileNotFoundError as e:
        print(e)
        return None
    except Exception as e:
        raise (e)
Example #31
def filter_cz_cities(input="geography-first-all-2.pd", output=None):
    data = pd.load(input)

    places = get_maps()["Czech Rep.-city"]
    filtered = data[data["item"].isin(places)]

    filtered.save(output)
    print filtered
Example #32
def filter_europe(input="geography-first-all-2.pd", output=None):
    data = pd.load(input)

    places = get_maps()["Europe-country"]
    filtered = data[data["item"].isin(places)]

    filtered.save(output)
    print filtered
Example #33
def get_all_fund_returns(force_db_read=False, df_file='fund_returns_lite_nona_reindex.pandas'):
    """ Reads returns for all funds from the database (excludes some things we aren't interested in """
    # caching to make things faster most of the time
    if not force_db_read and os.path.isfile(df_file):
        print "Loading file from cache:",df_file
        return pd.load(df_file)
    #otherwise read from db
    print "Reading fund returns from the database"
    try:
        con = lite.connect(config['db_path'])   
        cur = con.cursor()      
        
        # run pragmas (case sensivity on)
        cur.execute(pragmas)
        
        sql= """select mr.crsp_fundno, caldt, mret from FUND_HDR fhdr, MONTHLY_RETURNS mr 
        where fhdr.crsp_fundno=mr.crsp_fundno  
        %s
        ;""" % common_excludes_data_load 
        cur.execute(sql)
        data = cur.fetchall()
        
        print 'Parsing database results (those pesky date parses take a little while)'
 
        df = pd.DataFrame(list(data),columns=['FundNo', 'StrCalDate', 'Return'])
        
        # parse the date column (string slicing is faster than datetime.strptime)
        parsed_dates = df['StrCalDate'].map(lambda x: date(int(str(x)[0:4]),int(str(x)[4:6]),int(str(x)[6:8]) ))
        
        # add back to the dataframe
        df['CalDate'] = pd.Series(parsed_dates,index=df.index) 
        
        # remove unneeded column
        del df['StrCalDate']
        # change -99 values to NaN, then drop them
        print 'Drop the -99.0 values'
        df=df.replace({'Return':-99.0}, value=np.nan)
        df=df.dropna(subset=['Return'])
        # add one to all the returns per our convention
        df['Return']=pd.Series(df['Return'],index=df.index) + 1

        print 'Re-index by FundNo and Date'
        df.index = [df['FundNo'],pd.DatetimeIndex(df['CalDate'])]
        
        # now that these are indexes, no need to keep the column data
        del df['CalDate']
        del df['FundNo']
        
        #save the file for next time
        print "Saving returns dataframe for next time as",df_file 
        df.save(df_file)
        
        return df
        
    except lite.Error, e:
        
        print "Error %s:" % e.args[0]
        sys.exit(1)
Example #34
def Improve_plot(filepath):
    import os
    import pandas as pan
    from matplotlib import pyplot as plt

    os.chdir(filepath)
    df = pan.load('IMPROVE_data_all.pickle')

    return df
Example #35
def load_table():

    # Load the table with the cores
    results_dir = '/d/bip3/ezbc/multicloud/data/python_output/'
    filename = results_dir + 'tables/multicloud_model_params.pickle'

    df = pd.load(filename)

    return df
Example #36
 def load(self, name):
     '''
     Checks for an existing file name and if exists returns the data saved
     '''
     f = os.path.join(self.cache_dir, name)
     if os.access(f, os.F_OK):
         return pd.load(f)
     else:
         return None
Example #37
def load_quote(currency):
    try:
        return pd.load(DUKASCOPY_QUOTES_PATH +
                       '/{}.dat'.format(currency))
    except FileNotFoundError as e:
        print(e)
        return None
    except Exception as e:
        raise(e)
Example #38
def load_table():

    # Load the table with the cores
    results_dir =  '/d/bip3/ezbc/multicloud/data/python_output/'
    filename = results_dir + 'tables/multicloud_model_params.pickle'

    df = pd.load(filename)

    return df
Example #39
def filter_states(input="geography-first-all-2.pd", output=None):
    data = pd.load(input)

    places = reduce(list.__add__, get_continents_country_maps().values(), [])

    filtered = data[data["item"].isin(places)]

    filtered.save(output)
    print filtered
Example #40
 def test_wad_dataframe(self):
     print("TestParseWad")
     try:        
         path = awac_folder_path + 'test_data.wad'
         parse_wad.ParseWad(path)
     except WindowsError:
         print("Load Wad Files failed")        
     wad_dataframe = pd.load('test_data_wad_df')
     self.assertEqual(len(wad_dataframe),number_of_records)
Example #41
def Improve_plot(filepath):
    import os
    import pandas as pan
    from matplotlib import pyplot as plt

    os.chdir(filepath)
    df = pan.load('IMPROVE_data_all.pickle')

    return df
Example #42
def get_r2_bucket_funds(threshold=0.9, force_bucket=False, r2_file='r2_all_funds.pandas', bucket_file='bucketed_r2.pandas'):
    """ Creates a dataframe of funds that are highly correlated (by threshold) to each asset class"""
    if not force_bucket and os.path.isfile(bucket_file):
        print "Loading bucketed R2 file from cache:",bucket_file
        return pd.load(bucket_file)
    #otherwise create the bucket file
    if not os.path.isfile(r2_file): 
        print "R2 file (%s) does not exist. Run calc_all_fund_r2() first" % r2_file 
        return False 
    else:
        print "Bucketing by R2"
    r2_df = pd.load(r2_file)
    # take only the ones over threshold, and drop any rows with no assets highly correlated
    bucketed_df=r2_df[r2_df.apply(lambda x: x > threshold, axis=1)].dropna(how='all')  

    # save for next time
    bucketed_df.save(bucket_file)
    return bucketed_df
Example #43
def display_progress():
    coherence_data = pd.load('results/coherences_dense.pkl')
    rate_data = pd.load('results/rates_dense.pkl')
    for distribution in distributions:
        for i, algorithm in enumerate(algorithms):
            mask = rate_data.loc[distribution].index.get_level_values(0)
            avg_rates = []
            for ratio in measurement_ratios:
                rate = rate_data.loc[distribution].loc[mask==ratio]
                rate.index = rate.index.droplevel(0)
                #print 'average rate for {}, '.format(ratio) + algorithm + ', ' + distribution
                avg_rates.append(np.mean(rate.loc[algorithm]))
            plt.plot(index, avg_rates, colors[i], linewidth=2, markersize=7, label=algorithm)
        plt.legend(loc=0)
        plt.title(distribution, fontsize=16)
        plt.xticks(range(5), [0.01,0.03,0.1,0.3,1])
        plt.xlabel('measurements / dimension', fontsize=16)
        plt.ylabel('log10 rate', fontsize=16)
        plt.show() 
Example #44
 def test_awac_stats(self):
     print("TestAwacStats")
     try:
         path = awac_folder_path + 'test_data.wad'
         parse_wad.ParseWad(path)        
         awac_stats.process_wave_height(awac_folder_path + 'awac_wave_height_df')
     except WindowsError:
         print("Load wap Files failed")
     wave_height_dataframe = pd.load('awac_stats_30min')
     self.assertEqual(len(wave_height_dataframe),number_of_awac_stats)          
Example #45
def get_distance_json(origins=["Paris"], destinations=["Lyon"], key=API_key):
    url = "https://maps.googleapis.com/maps/api/distancematrix/json?"
    origins = "origins=" + "|".join(origins)
    destinations = "destinations=" + "|".join(destinations)
    key = "key=" + key
    response = requests.get(url + "&" + origins + "&" +
                            destinations + "&" + key).text
    json_data = json.loads(response)
    df = pd.load(json_data)
    return df
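pd.load expected a path to a pickle file, so handing it the parsed JSON dictionary above would fail even on the old API. Assuming the standard Distance Matrix response layout (top-level origin_addresses, destination_addresses and nested rows/elements), one way to flatten the response into a DataFrame:

def distance_json_to_df(json_data):
    records = []
    for origin, row in zip(json_data['origin_addresses'], json_data['rows']):
        for dest, elem in zip(json_data['destination_addresses'], row['elements']):
            records.append({'origin': origin,
                            'destination': dest,
                            'distance_m': elem.get('distance', {}).get('value'),
                            'duration_s': elem.get('duration', {}).get('value'),
                            'status': elem.get('status')})
    return pd.DataFrame(records)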
Example #46
    def __init__(self):
        """This reads two csv files into pandas:
                PD_RO: a csv with UIDs, Names, Prices and Modules. It is called RO as it is Read Only. It will never be altered. NOTE: The UIDs are
                       imported to pandas as the index values.
                PD_CL: a csv with UIDs, Names, Contact Details, Status Variables, and Comments. It is called CL as it is a contact list.
                       This csv will be directly modifiable by the program according to user input. Note: the UIDs are imported to pandas as index values."""

        self.PD_RO = pd.read_csv(r"C:\Users\IPAB\Dropbox\MarketingTeamFolder\DataEntry\ReadOnlyFiles\FactoryList\Wave1\FactoryList.csv", index_col = 'UID')
        self.PD_CL = pd.read_csv(r"C:\Users\IPAB\Dropbox\MarketingTeamFolder\DataEntry\ContactList\ContactList_Wave1.csv", index_col = 'UID')
        self.DF = pd.load(r"C:\Users\IPAB\Dropbox\MarketingTeamFolder\DataEntry\ReadOnlyFiles\DataFrames\Wave1\FollowUp1DataFrame")
Example #47
def load(store_cache_file=False, cache_file=None):
    if not store_cache_file and cache_file:
        return pd.load(cache_file)
    else:
        #TODO load your data into a DataFrame
        result = pd.DataFrame()  #replace with your data

        if store_cache_file and cache_file:
            result.to_csv(cache_file)

        return result
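The read and write sides of this cache disagree on format: the cache is read back as a pickle (pd.load) but written as CSV (to_csv). A consistent sketch of the same pattern with the current pickle API, keeping the TODO as a placeholder:

def load(store_cache_file=False, cache_file=None):
    if not store_cache_file and cache_file:
        return pd.read_pickle(cache_file)
    #TODO load your data into a DataFrame
    result = pd.DataFrame()  #replace with your data
    if store_cache_file and cache_file:
        result.to_pickle(cache_file)  # same format the read branch expects
    return result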
Example #48
def load_data(path, smoothstr):
    return (
        pd.load(path + "/expmean" + smoothstr + ".df"),
        pd.load(path + "/ctrlmean" + smoothstr + ".df"),
        pd.load(path + "/expstd" + smoothstr + ".df"),
        pd.load(path + "/ctrlstd" + smoothstr + ".df"),
        pd.load(path + "/expn" + smoothstr + ".df"),
        pd.load(path + "/ctrln" + smoothstr + ".df"),
        pd.load(path + "/df2" + smoothstr + ".df"),
    )
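The seven pd.load calls differ only in the file stem, so the same function can be written as a loop; sketched here with read_pickle and the identical stems and return order:

def load_data(path, smoothstr):
    stems = ('expmean', 'ctrlmean', 'expstd', 'ctrlstd', 'expn', 'ctrln', 'df2')
    return tuple(pd.read_pickle('%s/%s%s.df' % (path, stem, smoothstr))
                 for stem in stems)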
Example #49
def get_r2_bucket_funds(threshold=0.9,
                        force_bucket=False,
                        r2_file='r2_all_funds.pandas',
                        bucket_file='bucketed_r2.pandas'):
    """ Creates a dataframe of funds that are highly correlated (by threshold) to each asset class"""
    if not force_bucket and os.path.isfile(bucket_file):
        print "Loading bucketed R2 file from cache:", bucket_file
        return pd.load(bucket_file)
    #otherwise create the bucket file
    if not os.path.isfile(r2_file):
        print "R2 file (%s) does not exist. Run calc_all_fund_r2() first" % r2_file
        return False
    else:
        print "Bucketing by R2"
    r2_df = pd.load(r2_file)
    # take only the ones over threshold, and drop any rows with no assets highly correlated
    bucketed_df = r2_df[r2_df.apply(lambda x: x > threshold,
                                    axis=1)].dropna(how='all')

    # save for next time
    bucketed_df.save(bucket_file)
    return bucketed_df
Example #50
def main(outfile,
         infile='sources.pickle',
         cuts='(sources.ts>10)*(sources.a<0.25)'):
    assert os.path.exists(infile), 'Input file "%s" not found' % infile
    sources = pd.load(infile)
    print 'Loaded DataTable file %s' % infile
    selected = sources[eval(cuts)]
    print 'applied cuts %s: %d -> %d sources' % (cuts, len(sources),
                                                 len(selected))
    t = MakeCat(selected)
    if outfile is None:
        outfile = '_'.join(os.path.abspath('.').split('/')[-2:]) + '.fits'
        # for example, 'P202_uw10.fits'
    t(outfile)
Example #51
def load_cold_cores():

    # summary of cold clump data
    # http://wiki.cosmos.esa.int/planckpla2015/index.php/Catalogues#Individual_catalogues

    table_dir = '/d/bip3/ezbc/multicloud/data/cold_clumps/'
    df_dir = '/d/bip3/ezbc/multicloud/data/python_output/tables/'
    filename = table_dir + 'HFI_PCCS_GCC_R2.02.fits'

    if 0:
        print('\nAnalyzing table...')
        cc_hdu = fits.open(filename)

        cc_data = cc_hdu[1].data

        # get the region vertices
        regions = load_regions()

        df = dict()
        df['Glon'] = []
        df['Glat'] = []
        df['ra'] = []
        df['dec'] = []
        df['Region'] = []
        df['SNR'] = []
        for i in xrange(len(cc_data)):
            #if myg.point_in_polygon(ra, region_vertices):
            ra = cc_data[i][3]
            dec = cc_data[i][4]
            #if ra < 80 and ra > 40 and dec < 45 and dec > 15:
            region_check = check_region((ra, dec), regions)
            if region_check is not None:
                df['Glon'].append(cc_data.field('GLON')[i])
                df['Glat'].append(cc_data.field('GLAT')[i])
                df['ra'].append(cc_data.field('RA')[i])
                df['dec'].append(cc_data.field('DEC')[i])
                df['SNR'].append(cc_data.field('SNR')[i])
                df['Region'].append(region_check)

        df = pd.DataFrame(df)

        df.save(df_dir + 'multicloud_cold_clumps.pickle')
    else:
        df = pd.load(df_dir + 'multicloud_cold_clumps.pickle')

    print('\nFinished loading...')

    return df
Example #52
def main(args):
    df = pd.load(args.df)

    svc_seed = int(time.time() * 1000)
    estimators = [('reduce_dim', KernelPCA(kernel='linear')),
                  ('svm', LinearSVC(random_state=svc_seed))]

    clf = Pipeline(estimators)

    equiv_sets = None
    params = dict(reduce_dim__n_components=[2, 3, 5, 8, 10, 25, 50, 100],
                  svm__C=[0.25, 0.5, 1, 2, 4, 8, 16])

    if args.twoclass:
        params = dict(reduce_dim__n_components=[
            2, 3, 5, 8, 10, 25, 40, 45, 50, 55, 60, 75, 100
        ])
        equiv_sets = [['external plexiform', 'glomerular cell layer'],
                      ['internal plexiform', 'mitral cell layer']]

    grid_search = GridSearchCV(clf, param_grid=params, cv=3)

    best = 0.0
    while True:
        random_seed = int(time.time() * 1000)
        random.seed(random_seed)

        df_l, _, df_t = split_df(df, 0.33)
        y_l = integer_labels(df_l, equiv_sets)

        try:
            grid_search.fit(df_l, y_l)
        except ValueError:
            # Most likely wrong number of classes in cross validation set
            continue

        y_t = integer_labels(df_t, equiv_sets)
        c = grid_search.predict(df_t)

        score, c = compare_clusters(c, y_t)

        if score > best:
            best = score
            print('{} {} {}'.format(svc_seed, random_seed, best))
            with open(args.out, 'a') as fh:
                fh.write('{} {} {}\n'.format(svc_seed, random_seed, best))
Example #53
    def setup(self, othermodel='../P202_5years/uw700', **kw):
        super(UWsourceComparison,self).setup()
        self.plotfolder = 'comparison_%s' % othermodel.split('/')[-1]

        otherfilename = '../%s/sources.pickle' %othermodel
        self.othermodel=othermodel
        assert os.path.exists(otherfilename), 'File %s not found' % otherfilename
        print 'loading %s' % otherfilename
        odf = pd.load(otherfilename)
        self.odf = odf[odf.ts>10]
        self.df = self.df[self.df.ts>10]
        self.df['pindex_old']=self.odf.pindex
        self.df['ts_old'] = self.odf.ts
        self.df['eflux_old']=self.odf.eflux
        self.df['a_old'] = self.odf.a
        self.df['skydir_old'] = self.odf.skydir
        self.df['sedrec_old'] = self.odf.sedrec
Example #54
def main(args):
    df = pd.load(args.df)
    y = integer_labels(df)

    pca = KernelPCA(None, kernel=args.kernel)
    pca.fit(df)
    X = pca.transform(df)

    nonzero_components = X.shape[1]

    seed = int(time.time() * 1000)

    gmm = GMM(4, n_init=10, random_state=seed)
    gmm.fit(X)
    c = gmm.predict(X)

    score, _ = compare_clusters(c, y)

    best = score

    with open(args.out, 'w') as fh:
        fh.write('{} {} {} {}\n'.format(args.kernel, nonzero_components, seed,
                                        best))

    n_comps = range(
        2, 16) + [int(i) for i in np.linspace(16, nonzero_components, 20)]

    for n in n_comps:
        pca = KernelPCA(n, kernel=args.kernel)
        pca.fit(df)
        X = pca.transform(df)

        for i in range(128):
            seed = int(time.time() * 1000)

            gmm = GMM(4, random_state=seed)
            gmm.fit(X)
            c = gmm.predict(X)

            score, _ = compare_clusters(c, y)
            if score > best:
                best = score
                with open(args.out, 'a') as fh:
                    fh.write('{} {} {} {}\n'.format(args.kernel, n, seed,
                                                    best))
Example #55
def traceproc(aerofilt_dir):
    import hysplit_tools as tools
    import os, sys
    import numpy as np
    import pandas as pan

    startdir = os.getcwd()

    topdir = aerofilt_dir

    os.chdir(topdir)

    data_files = os.listdir(os.getcwd())

    d_mean = []
    d_std = []
    t_mean = []
    t_std = []
    endpos_mean = []
    endpos_std = []
    start_time = []
    station = []

    #run through all location folders
    for f in data_files:
        if os.path.isdir(f):
            os.chdir(f)
            tracefile = f + 'traceback'
            #open traceback file
            trace_df = pan.load(tracefile)

            #create a separate dict of lists for each day and put those into a
            #list called dictlist

            dates = trace_df.index
            keys = trace_df.columns

            by = lambda x: lambda y: getattr(y, x)

            trace_mean = trace_df.groupby([by('month'), by('day')]).mean()
            trace_std = trace_df.groupby([by('month'), by('day')]).std()

    pan.save(df_out, 'Hyproc.pickle')

    os.chdir(startdir)
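Note that df_out is never assigned before pan.save(df_out, 'Hyproc.pickle'), so the final save fails as written. Separately, with a DatetimeIndex the attribute-lambda grouping above is usually written directly against the index; a small sketch of that equivalent:

# assuming trace_df has a DatetimeIndex
grouper = [trace_df.index.month, trace_df.index.day]
trace_mean = trace_df.groupby(grouper).mean()
trace_std = trace_df.groupby(grouper).std()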