Example #1
def plots_workingTrends():

	# holiday = 0 and workday = 0 => weekend
	# let's see if holidays and weekends give the same trends

	# Day trends -- working vs. non-working day
	hours = np.linspace(0,23,24)

	days_average = DataFrame({'Hour': hours})

	# workdays
	mean_vec = []
	for hour in hours:
		mean_vec.append(bike_data[ (bike_data["workingday"] == 1) & (bike_data["time"] == hour) ].mean()['count'])
	days_average = days_average.join(DataFrame({'Working day': mean_vec}))

	# holidays or weekends
	mean_vec = []
	for hour in hours:
		mean_vec.append(bike_data[ (bike_data["workingday"] == 0) & (bike_data["time"] == hour) ].mean()['count'])
	days_average = days_average.join(DataFrame({'Non-working day': mean_vec}))

	days_average.drop('Hour',axis=1).plot(figsize=(12, 6), linewidth=3, fontsize=16)
	plt.xlabel('Hour', fontsize=16)
	plt.ylabel('Average counts', fontsize=16)
	plt.legend(loc='best', fontsize=16)
	plt.show()
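The two per-hour loops above can be collapsed into a single groupby; a minimal sketch, assuming the same bike_data frame with 'workingday', 'time' and 'count' columns:

# Vectorized alternative to the per-hour loops above (sketch)
days_average = (
    bike_data.groupby(['workingday', 'time'])['count']
    .mean()
    .unstack('workingday')
    .rename(columns={1: 'Working day', 0: 'Non-working day'})
)
days_average.plot(figsize=(12, 6), linewidth=3, fontsize=16)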
Example #2
    def test_drop_multiindex_not_lexsorted(self):
        # GH 11640

        # define the lexsorted version
        lexsorted_mi = MultiIndex.from_tuples(
            [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
        lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
        self.assertTrue(lexsorted_df.columns.is_lexsorted())

        # define the non-lexsorted version
        not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'],
                                     data=[[1, 'b1', 'c1', 3],
                                           [1, 'b2', 'c2', 4]])
        not_lexsorted_df = not_lexsorted_df.pivot_table(
            index='a', columns=['b', 'c'], values='d')
        not_lexsorted_df = not_lexsorted_df.reset_index()
        self.assertFalse(not_lexsorted_df.columns.is_lexsorted())

        # compare the results
        tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

        expected = lexsorted_df.drop('a', axis=1)
        with tm.assert_produces_warning(PerformanceWarning):
            result = not_lexsorted_df.drop('a', axis=1)

        tm.assert_frame_equal(result, expected)
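In user code, the PerformanceWarning asserted above is normally avoided by lexsorting the columns before dropping; a minimal sketch, reusing the not_lexsorted_df built in the test:

# Sorting the column MultiIndex makes it lexsorted, so the same drop
# proceeds without a PerformanceWarning (sketch).
sorted_df = not_lexsorted_df.sort_index(axis=1)
result = sorted_df.drop('a', axis=1)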
Example #3
def returns_customer_product_time(pwunsale_tidy, pw_ytdcust, pw_cusattr):
    '''
    Meant to feed into a Pivot requested by Mitch Turner.
    
    Aggregates the same as above but includes time and product data.
    '''
    dat = pwunsale_tidy['Date'].tolist()
    pwunsale_tidy['Month'] = [d.strftime('%B') for d in dat]    
    
    print('Aggregating custom pivot for Mitch.')
    len_unique = lambda x: len(pd.unique(x))
    agg_funcs_returns = {'ExtCost': {'DollarsReturned|avg':np.mean, 'DollarsReturned|sum':np.sum},
                         'CasesReturned': {'CasesReturned|avg':np.mean, 'CasesReturned|sum':np.sum},
                         'Invoice':len_unique }
    
    custom_cols = ['Month','CustomerId','Customer','ProductId','Product']    
    
    customer_returns = DataFrame(pwunsale_tidy.groupby(custom_cols)[['ExtCost','CasesReturned','Invoice']].agg(agg_funcs_returns)).reset_index(drop=False)
    customer_returns.rename(columns={'<lambda>':'Returns|count'}, inplace=True) 
    customer_returns.drop('Customer', inplace=True, axis=1)
    
    print('Merging in YTD sales by Customer')
    customer_returns = customer_returns.merge(pw_ytdcust, on='CustomerId', how='left')
    
    print('Deriving returns as a percent of sales for each Customer.')
    customer_returns['PercentSales'] = np.divide(customer_returns['DollarsReturned|sum'], customer_returns['DollarSales|bycustomer'])
    
    print('Merge in customer attributes.')
    customer_returns = customer_returns.merge(pw_cusattr, on='CustomerId', how='left')
    
    print('Sorting in descending order on Dollars returned.')
    customer_returns.sort_values('DollarsReturned|sum', ascending=False, inplace=True)

    return customer_returns
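The nested-dict aggregation above was deprecated in pandas 0.20 and later removed; a sketch of the equivalent named aggregation (pandas 0.25+), which also makes the '<lambda>' rename unnecessary:

# Named-aggregation equivalent of agg_funcs_returns (sketch)
customer_returns = (
    pwunsale_tidy.groupby(custom_cols)
    .agg(**{'DollarsReturned|avg': ('ExtCost', 'mean'),
            'DollarsReturned|sum': ('ExtCost', 'sum'),
            'CasesReturned|avg': ('CasesReturned', 'mean'),
            'CasesReturned|sum': ('CasesReturned', 'sum'),
            'Returns|count': ('Invoice', 'nunique')})
    .reset_index()
)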
Example #4
    def test_mixed_depth_drop(self):
        arrays = [[  'a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
                  [   '',  'OD',  'OD', 'result1',   'result2',  'result1'],
                  [   '',  'wx',  'wy',        '',          '',         '']]

        tuples = sorted(zip(*arrays))
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4,6),columns = index)

        result = df.drop('a',axis=1)
        expected = df.drop([('a','','')],axis=1)
        assert_frame_equal(expected, result)

        result = df.drop(['top'],axis=1)
        expected = df.drop([('top','OD','wx')], axis=1)
        expected = expected.drop([('top','OD','wy')], axis=1)
        assert_frame_equal(expected, result)

        result = df.drop(('top', 'OD', 'wx'), axis=1)
        expected = df.drop([('top','OD','wx')], axis=1)
        assert_frame_equal(expected, result)

        expected = df.drop([('top','OD','wy')], axis=1)
        expected = df.drop('top', axis=1)

        result = df.drop('result1', level=1, axis=1)
        expected = df.drop([('routine1', 'result1', ''),
                            ('routine2', 'result1', '')], axis=1)
        assert_frame_equal(expected, result)
Example #5
def process_recarray_pandas(data, endog_idx=0, exog_idx=None, dtype=None,
                            index_idx=None):

    data = DataFrame(data, dtype=dtype)
    names = data.columns

    if isinstance(endog_idx, (int, long)):
        endog_name = names[endog_idx]
        endog = data[endog_name]
        if exog_idx is None:
            exog = data.drop([endog_name], axis=1)
        else:
            exog = data.filter(names[exog_idx])
    else:
        endog = data.loc[:, endog_idx]
        endog_name = list(endog.columns)
        if exog_idx is None:
            exog = data.drop(endog_name, axis=1)
        elif isinstance(exog_idx, (int, long)):
            exog = data.filter([names[exog_idx]])
        else:
            exog = data.filter(names[exog_idx])

    if index_idx is not None:  # NOTE: will have to be improved for dates
        endog.index = Index(data.iloc[:, index_idx])
        exog.index = Index(data.iloc[:, index_idx])
        data = data.set_index(names[index_idx])

    exog_name = list(exog.columns)
    dataset = Dataset(data=data, names=list(names), endog=endog, exog=exog,
                      endog_name=endog_name, exog_name=exog_name)
    return dataset
Example #6
def process_recarray_pandas(data, endog_idx=0, exog_idx=None, dtype=None):
    from pandas import DataFrame

    data = DataFrame(data, dtype=dtype)
    names = data.columns

    if isinstance(endog_idx, int):
        endog_name = names[endog_idx]
        endog = data[endog_name]
        if exog_idx is None:
            exog = data.drop([endog_name], axis=1)
        else:
            exog = data.filter(names[exog_idx])
    else:
        endog = data.ix[:, endog_idx]
        endog_name = list(endog.columns)
        if exog_idx is None:
            exog = data.drop(endog_name, axis=1)
        elif isinstance(exog_idx, int):
            exog = data.filter([names[exog_idx]])
        else:
            exog = data.filter(names[exog_idx])

    exog_name = list(exog.columns)
    dataset = Dataset(data=data, names=list(names), endog=endog, exog=exog,
                      endog_name=endog_name, exog_name=exog_name)
    return dataset
Example #7
def datatype_records_to_subset_and_migrate(likechars):
    stmt_for_pkeys = conn_popler_2.execute(
        select(
            from_obj=Maintable,
            columns=[
                column('lter_proj_site'),
                column('samplingprotocol')
            ]).
        where(
            column('samplingprotocol').like(
                '%{}%'.format(likechars))
        )
    )
    data = DataFrame(stmt_for_pkeys.fetchall())
    data.columns = stmt_for_pkeys.keys()

    records_to_get = data['lter_proj_site'].values.tolist()

    stmt_for_records = conn_popler_2.execute(
        select(
            from_table=Rawtable,
        ).
        where(column('lter_proj_site').in_(records_to_get)).
        order_by('sampleid')
    )
    data2 = DataFrame(stmt_for_records.fetchall())
    data2.columns = stmt_for_records.keys()
    data2.drop('individ', axis=1, inplace=True)
Example #8
def clicksDataframe(clicks_data):
    clicks_dataframe = DataFrame(clicks_data, columns=['date', 'cardName', 'position', 'totalClicks', 'uniqueClicks'])
    clicks_dataframe = clicks_dataframe.apply(to_numeric, errors='ignore')
    clicks_dataframe.drop('date', axis=1, inplace=True)
    clicks_dataframe = clicks_dataframe.groupby(['cardName','position']).sum().sort_values(by='uniqueClicks',ascending=0)
    clicks_dataframe.reset_index(inplace=True)

    return clicks_dataframe
Example #9
def scale_features(df: DataFrame):
    spec_features = ['Fare']
    scaler = StandardScaler()
    for sf in spec_features:
        # Series.reshape was removed in newer pandas; reshape the underlying array instead
        scaled = scaler.fit_transform(df[sf].values.reshape(-1, 1))
        df[sf + '_scaled'] = scaled.ravel()
    df.drop(labels=spec_features, axis=1, inplace=True)
    return df
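A hypothetical usage sketch (the toy frame and its values are illustrative only, assuming the imports used by the example above):

# Hypothetical usage: 'Fare' is replaced by a standardized 'Fare_scaled' column.
toy = DataFrame({'Fare': [7.25, 71.2833, 8.05], 'Survived': [0, 1, 1]})
toy = scale_features(toy)
# toy now contains 'Survived' and 'Fare_scaled'; the raw 'Fare' column is dropped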
Example #10
def gonzales(data , k):
    #transform the data numpy array to data frame using the id as index
    points_list = DataFrame(data[:, 1:] , index = data[ : , 0])
    #adding two columns in the points data frame for saving the centers and distance
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")
    #choosing a random point as the first center

    #center0 =     points_list.sample(n=1 , random_state = randint(0,100) , axis=0)
    center0 =     points_list.head(1)
    centers_list = DataFrame(center0.drop(['distance' , 'center'] , axis = 1))
    centers_list['color'] = 'r'
    colors = "bgcmykw"
    #===========================================================================
    # print(centers_list)
    # print("==============Initialization finished===========")
    #===========================================================================
    #loop k times; the extra center appended in the final cycle is dropped after the loop
    for k_cycle in range(1,k+1):
        # variables to track the next center, chosen as the point with the maximum distance to its assigned center
        max_distance = 0 
        next_cluster = np.nan
        #loop on all the points to assign them to their closest center 
        for indexp, p in points_list.iterrows():
            #variables to track the closest center for this point
            min_cluster_distance = math.inf
            closest_cluster = None
            for indexc, center in centers_list.iterrows():
                dis = spatial.distance.euclidean(center.as_matrix(columns=[0 ,1]) , p.as_matrix(columns=[0 ,1]))
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = indexc
            p["distance"] = min_cluster_distance
            p["center"] = closest_cluster               
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = indexp 
            
        centers_list = centers_list.append(points_list.ix[[next_cluster], :distance_column_index   ])
        centers_list.set_value(next_cluster, 'color', colors[k_cycle])
        #=======================================================================
        # print(centers_list)
        # print("==============Cycle finished===========")
        #=======================================================================
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(['color'], axis=1 ,inplace=True)


    #===========================================================================
    # centers_list.plot(kind='scatter', x=0, y=1 , c='r'   )
    # points_list.plot(kind='scatter', x=0, y=1 , c='center' , s= points_list['center'] *2   )
    # plt.show()
    #===========================================================================

    #print(points_list)
    return centers_list.as_matrix(columns=[0 ,1])
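For comparison, the same farthest-point (Gonzalez) heuristic in plain NumPy; a minimal sketch, assuming numpy is imported as np and data carries ids in column 0 and 2-D coordinates in columns 1-2, as above:

def gonzales_numpy(data, k):
    # keep only the coordinates; the row position doubles as the point id here
    pts = data[:, 1:3].astype(float)
    centers = [0]                                # first point as the first center
    dist = np.linalg.norm(pts - pts[0], axis=1)  # distance to the nearest chosen center
    for _ in range(k - 1):
        nxt = int(np.argmax(dist))               # farthest point becomes the next center
        centers.append(nxt)
        dist = np.minimum(dist, np.linalg.norm(pts - pts[nxt], axis=1))
    return pts[centers]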
Example #11
def set_dummy_vars(df: DataFrame):
    df.drop(labels=['Name'], axis=1, inplace=True)
    discrete_features = list(df.dtypes[df.dtypes == 'object'].index)
    discrete_features.append('Pclass')
    dummies = [pd.get_dummies(df[f], prefix=f) for f in discrete_features]
    dummies.insert(0, df)
    df = pd.concat(dummies, axis=1)
    df.drop(labels=discrete_features, axis=1, inplace=True)
    return df
Example #12
def homePageToSubjectPageDataframe(data):
    subject_dataframe = DataFrame(data,columns=['date','page_title','views','uniqueViews'])
    subject_dataframe = subject_dataframe.apply(to_numeric, errors='ignore')
    subject_dataframe.drop('date', axis=1, inplace=True)
    subject_dataframe = subject_dataframe.groupby(['page_title']).sum().sort_values(by='uniqueViews',ascending=0)
    subject_dataframe.reset_index(inplace=True)
    subject_dataframe['subject'] = subject_dataframe['page_title'].apply(lambda title: strip_edx_page_title(title))
    subject_dataframe['totalViews'] = subject_dataframe['uniqueViews'].sum()
    subject_dataframe['Pct'] = (subject_dataframe['uniqueViews'] / subject_dataframe['totalViews'])
    subject_dataframe = subject_dataframe[(subject_dataframe['Pct']>0.0001)]

    return subject_dataframe[['subject','uniqueViews','Pct']]
Example #13
    def generateGraphData(self):
        safePrint('Generating and uploading data files')

        allData = read_table(self.combinedFile, sep='\t', na_filter=False, parse_dates=[0], infer_datetime_format=True)
        xcsList = [xcs for xcs in allData.xcs.unique() if xcs != 'ERROR' and xcs[0:4] != 'TEST' and xcs != '000-00']

        # filter type==DATA and site==wikipedia
        allData = allData[(allData['xcs'].isin(xcsList)) & (allData['site'] == 'wikipedia')]

        # By "iszero+via", e.g.  a,b,aO,bO,..., where 'a' == zero-rated, 'b' == non-zero-rated, and 'O' == Opera
        data = DataFrame(pivot_table(allData, 'count', ['date', 'xcs', 'via', 'iszero'], aggfunc=np.sum))
        data.reset_index(inplace=True)
        data['via'] = data.apply(lambda r: ('a' if r['iszero'][:1] == 'y' else 'b') + r['via'][:1], axis=1)
        data.drop('iszero', axis=1, inplace=True)
        self.createClippedData('RawData:YearDailyViaIsZero', data)
        self.createPeriodData('RawData:WeeklyViaIsZero', data, weekly)
        self.createPeriodData('RawData:MonthlyViaIsZero', data, monthly)

        allowedSubdomains = ['m', 'zero']
        data = allData[(allData.ison == 'y') & (allData.iszero == 'y') & (allData.subdomain.isin(allowedSubdomains))]
        data = DataFrame(pivot_table(data, 'count', ['date', 'xcs', 'subdomain'], aggfunc=np.sum))
        data.reset_index(inplace=True)

        self.createClippedData('RawData:YearDailySubdomains', data)
        self.createPeriodData('RawData:WeeklySubdomains', data, weekly)
        self.createPeriodData('RawData:MonthlySubdomains', data, monthly)

        # create artificial yes/no/opera sums
        opera = allData[(allData.via == 'OPERA') & (allData.iszero == 'y')]
        opera['str'] = 'o'
        yes = allData[allData.iszero == 'y']
        yes['str'] = 'y'
        no = allData[allData.iszero == 'n']
        no['str'] = 'n'
        combined = opera.append(yes).append(no)
        data = DataFrame(pivot_table(combined, 'count', ['date', 'xcs', 'str'], aggfunc=np.sum))
        data.reset_index(inplace=True)

        headerFields = 'date,xcs,iszero,count'  # Override "str" as "iszero"
        self.createClippedData('RawData:YearDailyTotals', data, headerFields)
        self.createPeriodData('RawData:MonthlyTotals', data, monthly, headerFields)

        data = []
        for xcsId in list(allData.xcs.unique()):
            byLang = pivot_table(allData[allData.xcs == xcsId], 'count', ['lang'], aggfunc=np.sum) \
                .order('count', ascending=False)
            top = byLang.head(5)
            vals = list(top.iteritems())
            vals.append(('other', byLang.sum() - top.sum()))
            valsTotal = sum([v[1] for v in vals]) / 100.0
            data.extend(['%s,%s,%.1f' % (l, xcsId, c / valsTotal) for l, c in vals])

        self.saveWikiPage('RawData:LangPercent', data, 'lang,xcs,count')
Example #14
def filter_tags(tag_pickle='results/material_tags.pickle', exclude_tags='results/exclude.csv', n=50):
    exclude_words, duplicate_sets = load_filter_tags(exclude_tags)
    with open(tag_pickle, 'r') as f:
        t = DataFrame(pickle.load(f)['result']).set_index('_id')
    for setn in duplicate_sets:
        t.ix[setn[0]] += sum(map(lambda x: t.ix[x] , setn[1:]))
        for tag in setn[1:]:
            t.drop(tag, inplace=True)
    for tag in exclude_words:
        t.drop(tag, inplace=True)
    t.sort(ascending=False)
    return t[:n].index
Example #15
def data_prep(input_file, bad_samples_file, freq_dict=None):
    '''prepare the ibdhmm file by removing sites that are too close to each other, calculating major and minor alleles.
    If specified, freq_dict should be a json file that contains the frequencies. This is created from freq_parse.py'''
    min_snpD = 10
    tri_allele= 0
    
    output_file = ('.').join(input_file.split('.')[0:-2]) + '_cleaned.txt'
    
    
    #relaxing conditions because we only have 3000 SNPs to begin with
    bad_samples = [sample.strip() for sample in open(bad_samples_file)]                                              
    df = DataFrame(read_csv(input_file, sep = '\t'))
    #remove bad samples
    df.drop(bad_samples, inplace = True, axis =1)
    #remove non-biallelic alleles
    #df.drop(df[df.apply(allele_count, axis = 1) != 2].index, inplace = True)
    
    
    #relaxing conditions because we only have 3000 SNPs to begin with
    '''#remove SNPs that are too close to one another
    df['diff'] = df.groupby('chrom')['pos'].diff()
    df.fillna('first', inplace = True)
    #df.to_csv('test_df.txt', sep = '\t')
    # BUG NOTE MUST FIX THE DAISY CHAIN PROBLEM
    df = df.query('diff > 10 or diff == "first"')
    df.drop('diff', axis = 1, inplace = True)'''
    
    if not freq_dict:
        #calculate the major and minor allele
        major = df.apply(major_find, axis =1 )
        minor = df.apply(minor_find, axis =1 )
        major_prop = df.apply(major_prop_find, axis =1 )
        minor_prop = df.apply(minor_prop_find, axis = 1)
    else:
        snp_dict = json.load(open(freq_dict))
        df['keys'] = df['chrom'].map(str) +':'+ df['pos'].map(str)        
        major = df['keys'].apply(lambda x : snp_dict[x]['major'])
        major_prop = df['keys'].apply(lambda x : snp_dict[x]['major_freq'])
        minor = df['keys'].apply(lambda x : snp_dict[x]['minor'])
        minor_prop = df['keys'].apply(lambda x : snp_dict[x]['minor_freq'])
        
        df.drop('keys', inplace= True, axis = 1)
               
        
        
    #insert the computed allele columns into the dataframe for later use
    df.insert(3, 'minor_prop', minor_prop)
    df.insert(3, 'minor', minor)
    df.insert(3, 'major_prop', major_prop)
    df.insert(3, 'major', major)
    
    df.to_csv(output_file, sep = '\t', index= False)
    return df
Example #16
 def _one_hot_encoding(df: pd.DataFrame, features: list) -> pd.DataFrame:
     """
     help method for one hot encoding
     """
     for feature in features:
         one_hot = pd.get_dummies(df[feature], feature, '_')
         # And the next two statements 'replace' the existing feature by the new binary-valued features
         # First, drop the existing column
         df.drop(feature, axis=1, inplace=True)
         # Next, concatenate the new columns. This assumes no clash of column names.
         df = pd.concat([df, one_hot], axis=1)
     return df
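A hypothetical usage sketch (the toy frame and feature names are illustrative only):

# Hypothetical usage: replace 'Sex' and 'Embarked' with binary indicator columns.
toy = pd.DataFrame({'Sex': ['male', 'female'], 'Embarked': ['S', 'C'], 'Age': [22, 38]})
toy = _one_hot_encoding(toy, ['Sex', 'Embarked'])
# toy now holds 'Age' plus Sex_female, Sex_male, Embarked_C and Embarked_S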
Example #17
def pd_dataframe6():
    obj = DataFrame(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
    print(obj)
    new_obj = obj.drop('c')
    print(new_obj)
    print(obj.drop(['b', 'c']))
    data = DataFrame(np.arange(16).reshape((4, 4)),
                     index=['Ohio', 'Colorado', 'Utah', 'New York'],
                     columns=['one', 'two', 'three', 'four'])
    print(data)
    print(data.drop(['Ohio', 'Colorado']))
    print(data.drop('two', axis=1))
    print(data.drop(['two', 'four'], axis=1))
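In pandas 0.21+ the same drops can be spelled with explicit keywords instead of axis; a minimal sketch reusing the data frame built above:

# Equivalent keyword forms of the axis-based calls above
print(data.drop(columns=['two', 'four']))     # same as data.drop(['two', 'four'], axis=1)
print(data.drop(index=['Ohio', 'Colorado']))  # same as the default axis=0 drop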
Example #18
 def load_velocities(self, unit=None):
     """
     Load Particle Velocities in units of km/s (default set in units class)
     unit: unit conversion from code units
     """
     if unit:
         self.units.set_velocity(unit)
     uvw = self._velocities.value * self.units.velocity_conv
     if self.units.coordinate_system == 'physical':
         a = self._header.ScaleFactor
         uvw *= numpy.sqrt(a)
     uvw = DataFrame(uvw, index=self._particleIDs.value, columns=['u', 'v', 'w'])
     if self._drop_ids is not None:
         uvw.drop(self._drop_ids, inplace=True)
     self[['u', 'v', 'w']] = uvw
Example #19
def dataset_transformation(df: pd.DataFrame, testData: pd.DataFrame):

    #Age Transform
    df['AgeuponOutcome'] =  df['AgeuponOutcome'].apply(calculateAge)
    testData['AgeuponOutcome'] = testData['AgeuponOutcome'].apply(calculateAge)

    #name transform
    df['Name'] = df['Name'].apply(processName)
    testData['Name'] = testData['Name'].apply(processName)

    #df = df.apply(setMissingAge, axis=1)
    #testData = testData.apply(setMissingAge, axis=1)

    #encodeFeature(df,'OutcomeType')

    #Animal Tpye transform
    le = encodeFeature(df,testData,'AnimalType')


    #sex transform
    le =  encodeFeature(df,testData,'SexuponOutcome')
    le =  encodeFeature(df,testData,'SexuponOutcome1')


    #Breed transform
    le = encodeFeature(df,testData,'Breed')
    le = encodeFeature(df, testData, 'Breed1')
    le = encodeFeature(df, testData, 'Breed2')

    #color.
    le = encodeFeature(df,testData,'Color')
    le = encodeFeature(df, testData, 'Color1')
    le = encodeFeature(df, testData, 'Color2')


    #encodeFeature(df,'Breed1')
    #encodeFeature(df,'Breed2')
    #encodeFeature(df,'Breedcount')
    #encodeFeature(df,'Name')

    df =  df.drop(['DateTime'], axis=1)
    testData = testData.drop(['DateTime'], axis=1)

    df = df.drop(['Name'], axis=1)
    testData = testData.drop(['Name'], axis=1)


    return [df,testData]
Example #20
	def get_quote(self, symbols, dataframe = True):
		if isinstance(symbols, list) or isinstance(symbols, set) or isinstance(symbols, tuple):
			symbolList = list(symbols)
		elif isinstance(symbols, str):
			symbolList = symbols.split(',')
		symbols = util.symbols_to_string(symbols)
		url = URL_QUOTATION(symbols)
		retry = True
		while retry:
			try:
				quote  =self.session.get(
						URL_QUOTATION(symbols)
					,	timeout = 0.1
					).text
				retry = False
			except:
				pass
		quoteList = re.findall(r'\"(.*)\"', quote)
		if dataframe:
			for i in range( 0, len(quoteList) ):
				quoteList[i] = quoteList[i].split(',')
		else:
			for i in range( 0, len(quoteList) ):
				quoteList[i] = quoteList[i].split(',')
				quoteList[i].append( symbolList[i] )

		if dataframe:
			df_quote = DataFrame( quoteList, columns = SINA_QUOTE_COLUMNS )
			df_quote = df_quote.drop( 'ms', axis = 1 )
			df_quote["symbol"] = symbolList
			return df_quote
		else:
			return quoteList
Example #21
    def test_v12_compat(self):
        df = DataFrame(
            [
                [1.56808523, 0.65727391, 1.81021139, -0.17251653],
                [-0.2550111, -0.08072427, -0.03202878, -0.17581665],
                [1.51493992, 0.11805825, 1.629455, -1.31506612],
                [-0.02765498, 0.44679743, 0.33192641, -0.27885413],
                [0.05951614, -2.69652057, 1.28163262, 0.34703478],
            ],
            columns=["A", "B", "C", "D"],
            index=pd.date_range("2000-01-03", "2000-01-07"),
        )
        df["date"] = pd.Timestamp("19920106 18:21:32.12")
        df.ix[3, "date"] = pd.Timestamp("20130101")
        df["modified"] = df["date"]
        df.ix[1, "modified"] = pd.NaT

        v12_json = os.path.join(self.dirpath, "tsframe_v012.json")
        df_unser = pd.read_json(v12_json)
        assert_frame_equal(df, df_unser)

        df_iso = df.drop(["modified"], axis=1)
        v12_iso_json = os.path.join(self.dirpath, "tsframe_iso_v012.json")
        df_unser_iso = pd.read_json(v12_iso_json)
        assert_frame_equal(df_iso, df_unser_iso)
Example #22
    def test_two_isolated_steppers_one_gapped(self):
        N = 5
        Y = 25
        # Begin second feature one frame later than the first, so the probe labeling (0, 1) is
        # established and not arbitrary.
        a = DataFrame({'x': np.arange(N), 'y': np.ones(N), 'frame': np.arange(N)})
        a = a.drop(3).reset_index(drop=True)
        b = DataFrame({'x': np.arange(1, N), 'y': Y + np.ones(N - 1), 'frame': np.arange(1, N)})
        f = pd.concat([a, b])
        actual = self.link(f, 5)
        expected = f.copy()
        expected['probe'] = np.concatenate([np.array([0, 0, 0, 2]), np.ones(N - 1)])
        expected.sort(['probe', 'frame'], inplace=True)
        expected.reset_index(drop=True, inplace=True)
        assert_frame_equal(actual, expected)

        # Sort rows by frame (normal use)
        actual = self.link(f.sort('frame'), 5)
        assert_frame_equal(actual, expected)

        # Shuffle rows (crazy!)
        np.random.seed(0)
        f1 = f.reset_index(drop=True)
        f1.reindex(np.random.permutation(f1.index))
        actual = self.link(f1, 5)
        assert_frame_equal(actual, expected)
Example #23
def training(iden, Charg, Temps, use_cache_trainingset, test, verbose):
    ''' Return the prediction function, 
    for a given site iden, history Charg and temperature Temps'''
    if use_cache_trainingset:
        if test:
            X = pickle.load(open(CACHE_DIR+"X_test_"+iden+".p", "rb"))
        else:
            X = pickle.load(open(CACHE_DIR+"X_"+iden+".p", "rb"))
    else:
        X = DataFrame(Charg[iden])
        X = X.dropna(how='any')
        X['dayofweek'] = X.index.dayofweek
        X['Temps'] = Temps[iden].ix[X.index]
        X['fracday'] = X.index.minute/60.+X.index.hour
        X['lastminutes'] = X[iden].ix[X.index-10*Minute()].values
        X['yesterday'] = X[iden].ix[X.index-Day()].values
        X['yesterdaybis'] = X[iden].ix[X.index-Day()-10*Minute()].values
        X['lastweek'] = X[iden].ix[X.index-Week()].values
        X['lastweekbis'] = X[iden].ix[X.index-Week()-10*Minute()].values
        if test:
            pickle.dump(X, open(CACHE_DIR+"X_test_"+iden+".p", "wb" ) )
        else:
            pickle.dump(X, open(CACHE_DIR+"X_test_"+iden+".p", "wb" ) )
    X = X.dropna(how='any')
    y = X[iden]
    X = X.drop(iden, 1)
    scalerX = preprocessing.StandardScaler().fit(X)
    ##############################
    clf = linear_model.SGDRegressor(alpha = 0.000001,n_iter=3000)
    ##############################
    clf.fit(scalerX.transform(X), y)
    if verbose:
        print('Function for '+iden+' computed.')
    return(lambda x :clf.predict(scalerX.transform(x)))
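The lag features above rely on label lookups at shifted timestamps; the same columns can be built with an explicit reindex, a sketch assuming X keeps its DatetimeIndex and the offsets are imported as in the original:

# Explicit-reindex equivalents of the .ix label lookups (sketch)
X['lastminutes'] = X[iden].reindex(X.index - 10 * Minute()).values
X['yesterday'] = X[iden].reindex(X.index - Day()).values
X['lastweek'] = X[iden].reindex(X.index - Week()).values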
Example #24
    def test_two_nearby_steppers_one_gapped(self):
        N = 5
        Y = 2
        # Begin second feature one frame later than the first, so the particle labeling (0, 1) is
        # established and not arbitrary.
        a = DataFrame({'x': np.arange(N), 'y': np.ones(N), 'frame': np.arange(N)})
        b = DataFrame({'x': np.arange(1, N), 'y': Y + np.ones(N - 1), 'frame': np.arange(1, N)})
        a = a.drop(3).reset_index(drop=True)
        f = pd.concat([a, b])
        expected = f.copy().reset_index(drop=True)
        expected['particle'] = np.concatenate([np.array([0, 0, 0, 2]), np.ones(N - 1)])
        pandas_sort(expected, ['particle', 'frame'], inplace=True)
        expected.reset_index(drop=True, inplace=True)
        actual = self.link_df(f, 5)
        assert_frame_equal(actual, expected)
        actual_iter = self.link_df_iter(f, 5, hash_size=(50, 50))
        assert_frame_equal(actual_iter, expected)

        # Sort rows by frame (normal use)
        actual = self.link_df(pandas_sort(f, 'frame'), 5)
        assert_frame_equal(actual, expected)
        actual_iter = self.link_df_iter(pandas_sort(f, 'frame'), 5, hash_size=(50, 50))
        assert_frame_equal(actual_iter, expected)

        # Shuffle rows (crazy!)
        np.random.seed(0)
        f1 = f.reset_index(drop=True)
        f1.reindex(np.random.permutation(f1.index))
        actual = self.link_df(f1, 5)
        assert_frame_equal(actual, expected)
        actual_iter = self.link_df_iter(f1, 5, hash_size=(50, 50))
        assert_frame_equal(actual_iter, expected)
Example #25
def thread_participation_evolution(
        pm_frame, project, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to threads in project with n as thresh.
    Returns DataFrame, index, selection and title for data for use
    by stacked bar-plot and heatmap functions."""
    if not research_only:
        thread_type = 'all threads'
        title = "Participation per thread in {} (threshold = {})".format(
            project, n)
    else:
        thread_type = 'research threads'
        title = "Participation per thread in {}\
                 (threshold = {}, only research-threads)".format(project, n)
    data = pm_frame.loc[project][['basic', thread_type]]
    data = data.dropna()
    all_authors = set().union(*data[thread_type, 'authors'])
    author_thread = DataFrame(columns=all_authors)
    for author in author_thread.columns:
        author_thread[author] = data[thread_type, 'authors'].apply(
            lambda thread, author=author: author in thread)
    author_thread = author_thread.T
    author_thread = author_thread.sort_values(by=data.index.tolist(),
                                              ascending=False)
    author_thread = author_thread.drop(
        "Anonymous") if skip_anon else author_thread
    author_thread.columns.name = "Threads"
    select = author_thread.sum(axis=1) >= n
    return author_thread, data.index, select, title
Example #26
def project_participation_evolution(
        pm_frame, all_authors, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to projects with n as thresh.
    Returns DataFrame, index, selection and title for data for use
    by stacked bar-plot and heatmap functions."""
    if not research_only:
        thread_type = 'all threads'
        data, _ = get_last(pm_frame, thread_type)
        all_authors = list(all_authors)
        title = "Participation per project in Polymath\
                 (threshold = {})".format(n)
    else:
        thread_type = 'research threads'
        data, _ = get_last(pm_frame, thread_type)
        all_authors = set().union(
            *data['research threads', 'authors (accumulated)'])
        title = "Participation per project in Polymath\
                 (threshold = {}, only research-threads)".format(n)
    data.index = data.index.droplevel(1)
    author_project = DataFrame(columns=all_authors)
    for author in author_project.columns:
        author_project[author] = data[
            thread_type, 'authors (accumulated)'].apply(
                lambda project, author=author: author in project)
    author_project = author_project.T
    author_project = author_project.sort_values(by=data.index.tolist(),
                                                ascending=False)
    author_project = author_project.drop(
        "Anonymous") if skip_anon else author_project
    select = author_project.sum(axis=1) >= n
    return author_project, data.index, select, title
Example #27
    def test_two_isolated_steppers_one_gapped(self):
        N = 5
        Y = 25
        # Begin second feature one frame later than the first,
        # so the particle labeling (0, 1) is established and not arbitrary.
        a = DataFrame({'x': np.arange(N), 'y': np.ones(N),
                      'frame': np.arange(N)})
        a = a.drop(3).reset_index(drop=True)
        b = DataFrame({'x': np.arange(1, N), 'y': Y + np.ones(N - 1),
                      'frame': np.arange(1, N)})
        f = pd.concat([a, b])
        expected = f.copy()
        expected['particle'] = np.concatenate([np.array([0, 0, 0, 2]), np.ones(N - 1)])
        pandas_sort(expected, ['particle', 'frame'], inplace=True)
        expected.reset_index(drop=True, inplace=True)
        actual = self.link(f, 5)
        assert_traj_equal(actual, expected)
        # link_df_iter() tests not performed, because hash_size is
        # not knowable from the first frame alone.

        # Sort rows by frame (normal use)
        actual = self.link(pandas_sort(f, 'frame'), 5)
        assert_traj_equal(actual, expected)

        # Shuffle rows (crazy!)
        np.random.seed(0)
        f1 = f.reset_index(drop=True)
        f1.reindex(np.random.permutation(f1.index))
        actual = self.link(f1, 5)
        assert_traj_equal(actual, expected)
Example #28
def test_cythonized_aggers(op_name):
    data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., np.nan, np.nan],
            'B': ['A', 'B'] * 6,
            'C': np.random.randn(12)}
    df = DataFrame(data)
    df.loc[2:10:2, 'C'] = np.nan

    op = lambda x: getattr(x, op_name)()

    # single column
    grouped = df.drop(['B'], axis=1).groupby('A')
    exp = {cat: op(group['C']) for cat, group in grouped}
    exp = DataFrame({'C': exp})
    exp.index.name = 'A'
    result = op(grouped)
    tm.assert_frame_equal(result, exp)

    # multiple columns
    grouped = df.groupby(['A', 'B'])
    expd = {}
    for (cat1, cat2), group in grouped:
        expd.setdefault(cat1, {})[cat2] = op(group['C'])
    exp = DataFrame(expd).T.stack(dropna=False)
    exp.index.names = ['A', 'B']
    exp.name = 'C'

    result = op(grouped)['C']
    if op_name in ['sum', 'prod']:
        tm.assert_series_equal(result, exp)
Example #29
def get_flights_from_route(cur, origin, destination):
    """
    Returns a dataframe for all flights matching origin, destination.
    """

    import time
    
    ### MySQL query
    time0 = time.time()
    cur.execute("SELECT Year, Month, DayofMonth, DayOfWeek, CRSDepTime, UniqueCarrier, ArrDelay FROM flights_100000 WHERE Origin = %s and Dest = %s;", (origin, destination))
    rows = cur.fetchall()
    td = time.time() - time0
    print('Database query took %.2f seconds.' % td)
    
    ### Convert to dataframe
    df = DataFrame(list(rows), columns=['Year', 'Month', 'DayOfMonth', 'DayOfWeek', 'CRSDepTime', 'Carrier', 'ArrDelay'])

    ### Drop columns without delays (cancellations)
    df = df.dropna()
    
    ### Create some auxiliary columns
    df['DayOfYear'] = df.apply( lambda x: datetime.datetime(x['Year'],x['Month'],x['DayOfMonth']).timetuple().tm_yday, axis=1)
    df['Week'] = df['DayOfYear'] / 7 + 1
    df['DepHour'] = df['CRSDepTime']/100

    ### Drop unused columns
    df = df.drop(['DayOfMonth','CRSDepTime'],axis=1).sort_index(axis=1)

    ## df.head()
    
    return df
Example #30
    def load_abundances(self, tracked_species=None):
        """
        Load chemical abundances array.

        There are six abundances tracked for each particle.
        0:H2 1:HII 2:DII 3:HD 4:HeII 5:HeIII
        """
        default_species = ['H2', 'HII', 'DII', 'HD', 'HeII', 'HeIII']
        if tracked_species is None:
            tracked_species = default_species
        abundances = self._ChemicalAbundances.value
        abundances = DataFrame(abundances, index=self._particleIDs.value,
                               columns=tracked_species)
        if self._drop_ids is not None:
            abundances.drop(self._drop_ids, inplace=True)
        self[tracked_species] = abundances