def view_coef(model: LogisticRegression, train_df):
    # coef_ has shape (1, n_features) for a binary problem; ravel to 1-D so
    # each row gets a scalar rather than a length-1 array
    coef_list = list(model.coef_.ravel())
    coef_df = DataFrame({'columns': list(train_df.columns[1:]),
                         'coef': coef_list})
    coef_df['abs_coef'] = abs(coef_df['coef'])
    coef_df.sort_values(by=['abs_coef'], ascending=[False], inplace=True)
    print(coef_df)
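
As a quick illustration, here is a minimal, hypothetical usage sketch for view_coef. It assumes the convention visible above: train_df holds the target in its first column and the features in the rest. All names here are illustrative.

import numpy as np
from pandas import DataFrame
from sklearn.linear_model import LogisticRegression

# Toy frame: first column is the target, remaining columns are features,
# matching the train_df.columns[1:] assumption in view_coef.
rng = np.random.RandomState(0)
train_df = DataFrame({'target': rng.randint(0, 2, 100),
                      'feat_a': rng.randn(100),
                      'feat_b': rng.randn(100)})

model = LogisticRegression().fit(train_df[['feat_a', 'feat_b']],
                                 train_df['target'])
view_coef(model, train_df)  # prints features ranked by |coefficient|
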
def returns_customer_product_time(pwunsale_tidy, pw_ytdcust, pw_cusattr):
    '''
    Meant to feed into a Pivot requested by Mitch Turner.
    
    Aggregates the same as above but includes time and product data.
    '''
    pwunsale_tidy['Month'] = pwunsale_tidy['Date'].dt.strftime('%B')
    
    print('Aggregating custom pivot for Mitch.')
    len_unique = lambda x: len(pd.unique(x))
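    # NOTE: dict-of-dicts aggregation with column renaming was deprecated in
    # pandas 0.20 and removed in 1.0; this spec requires an older pandas.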
    agg_funcs_returns = {'ExtCost': {'DollarsReturned|avg':np.mean, 'DollarsReturned|sum':np.sum},
                         'CasesReturned': {'CasesReturned|avg':np.mean, 'CasesReturned|sum':np.sum},
                         'Invoice':len_unique }
    
    custom_cols = ['Month','CustomerId','Customer','ProductId','Product']    
    
    # 'Invoice' must be in the column selection for its count aggregation to apply
    customer_returns = (pwunsale_tidy.groupby(custom_cols)[['ExtCost', 'CasesReturned', 'Invoice']]
                        .agg(agg_funcs_returns)
                        .reset_index(drop=False))
    customer_returns.rename(columns={'<lambda>':'Returns|count'}, inplace=True) 
    customer_returns.drop('Customer', inplace=True, axis=1)
    
    print('Merging in YTD sales by Customer')
    customer_returns = customer_returns.merge(pw_ytdcust, on='CustomerId', how='left')
    
    print('Deriving returns as a percent of sales for each Customer.')
    customer_returns['PercentSales'] = np.divide(customer_returns['DollarsReturned|sum'], customer_returns['DollarSales|bycustomer'])
    
    print('Merge in customer attributes.')
    customer_returns = customer_returns.merge(pw_cusattr, on='CustomerId', how='left')
    
    print('Sorting in descending order on Dollars returned.')
    customer_returns.sort_values('DollarsReturned|sum', ascending=False, inplace=True)

    return customer_returns
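
A hedged sketch of the inputs this function expects, with hypothetical values; the column names are inferred from the function body (pw_ytdcust must carry DollarSales|bycustomer and pw_cusattr any per-customer attributes, both keyed on CustomerId). As noted above, the dict-of-dicts agg spec requires pandas older than 1.0.

from pandas import DataFrame, Timestamp

pwunsale_tidy = DataFrame({
    'Date': [Timestamp('2017-01-05'), Timestamp('2017-02-10')],
    'CustomerId': [1, 1], 'Customer': ['Acme', 'Acme'],
    'ProductId': [10, 11], 'Product': ['Ale', 'Lager'],
    'ExtCost': [120.0, 80.0], 'CasesReturned': [3, 2],
    'Invoice': ['A1', 'A2']})
pw_ytdcust = DataFrame({'CustomerId': [1], 'DollarSales|bycustomer': [5000.0]})
pw_cusattr = DataFrame({'CustomerId': [1], 'Region': ['West']})

pivot = returns_customer_product_time(pwunsale_tidy, pw_ytdcust, pw_cusattr)
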
Example #3
def compute_importances(data_set_df, user_info_df, label='gender', split_modal=False, n_est=10, max_depth=None):
    print("\t\t\tfilling nan values...")
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    feature_importances = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index, columns=['importance'])
    modalities = data_set_df.index.levels[0]

    def compute(x):
        x_imp = pc.fill_nan_features(x)
        try:
            m = ExtraTreesClassifier(n_estimators=n_est) if max_depth is None \
                else ExtraTreesClassifier(n_estimators=n_est, max_depth=max_depth)
            print("\t\t\tfitting RF model...")
            m.fit(x_imp.T, y_v)

            for order, index in enumerate(x.index):
                feature_importances.loc[index] = m.feature_importances_[order]
                if order % 10000 == 0 and order > 0:
                    print("\t\t\t%s features are done" % order)
        except ValueError:
            # a ValueError during fitting is silently skipped
            pass

    if split_modal:
        for modal in modalities:
            x = df_filtered.loc[modal].dropna(how='all')
            compute(x)
    else:
        x = df_filtered.dropna(how='all')
        compute(x)

    feature_importances.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_importances
Example #4
    def test_sort_datetimes(self):

        # GH 3461, argsort / lexsort differences for a datetime column
        df = DataFrame(
            ["a", "a", "a", "b", "c", "d", "e", "f", "g"], columns=["A"], index=date_range("20130101", periods=9)
        )
        dts = [
            Timestamp(x)
            for x in [
                "2004-02-11",
                "2004-01-21",
                "2004-01-26",
                "2005-09-20",
                "2010-10-04",
                "2009-05-12",
                "2008-11-12",
                "2010-09-28",
                "2010-09-28",
            ]
        ]
        df["B"] = dts[::2] + dts[1::2]
        df["C"] = 2.0
        df["A1"] = 3.0

        df1 = df.sort_values(by="A")
        df2 = df.sort_values(by=["A"])
        assert_frame_equal(df1, df2)

        df1 = df.sort_values(by="B")
        df2 = df.sort_values(by=["B"])
        assert_frame_equal(df1, df2)
Example #5
def compute_mics(data_set_df, user_info_df, label='gender', min_not_nan=-1):
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    feature_mics = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index, columns=['importance'])
    i = 0
    for index, values in df_filtered.iterrows():
        # if len(feature_mics) > 1000:
        #     break
        m = minepy.MINE()
        try:
            if min_not_nan < 0:
                m.compute_score(values, y_v)
                feature_mics.loc[index] = m.mic()
            else:
                nan_removed = values.dropna()
                if len(nan_removed) < min_not_nan:
                    feature_mics.loc[index] = np.nan
                else:
                    m.compute_score(nan_removed, y_v[nan_removed.index.astype(int)])
                    feature_mics.loc[index] = m.mic()
            i += 1
            # print index, feature_mics.loc[index].values[0]
        except ValueError:
            # print "value error occurs during processing %r" % index
            continue
    feature_mics.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_mics
Example #6
def compute_fscore(data_set_df, user_info_df, label='gender', min_not_nan=-1):
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    feature_fs = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index, columns=['importance'])
    i = 0
    for index, values in df_filtered.iterrows():
        try:
            if min_not_nan < 0:
                f_score, p_val = f_classif(values.fillna(values.mean()).values[:, np.newaxis], y_v)
                feature_fs.loc[index] = f_score[0] if np.isfinite(f_score[0]) else np.nan
            else:
                nan_removed = values.dropna()
                if len(nan_removed) < min_not_nan:
                    feature_fs.loc[index] = np.nan
                else:
                    f_score, p_val = f_classif(nan_removed.values[:, np.newaxis], y_v[nan_removed.index.astype(int)])
                    feature_fs.loc[index] = f_score[0] if np.isfinite(f_score[0]) else np.nan
            if i % 10000 == 0 and i > 0:
                print("\t\t\t%s features are done" % i)
            i += 1
            # print index, feature_fs.loc[index].values[0]
        except ValueError:
            # print "value error occurs during processing %r" % index
            continue
    feature_fs.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_fs
Example #7
def show_failures_prob(df, y, y_self, out_path):
    print('show_failures_prob: df=%s,y=%s,y_self=%s,out_path="%s"' %
          (S(df), S(y), S(y_self), out_path))
    name = '%s-%d' % (out_path, len(y))

    y_self_bool = DataFrame(np.floor(y_self['hat'].values * 2.0).astype(int),
                            index=y_self.index, columns=['hat'])

    print('A) y_self_bool=%s,' % C(y_self_bool))
    print('B) y=%s' % y.dtype)

    diff = y_self_bool['hat'] - y
    failures = diff != 0
    print('^' * 80)
    print(type(failures))
    print(failures.describe())
    print(failures[:5])
    failures_df = Series([False] * len(df), index=df.index)
    for idx, val in failures.items():
        failures_df[idx] = val
    df = df[failures_df]
    y_self_df = Series([0.0] * len(df), index=df.index, dtype=float)
    y_self_df_bool = Series([0] * len(df), index=df.index, dtype=int)
    for idx in y_self_df.index:
        y_self_df[idx] = y_self['hat'][idx]
        y_self_df_bool[idx] = y_self_bool['hat'][idx]
    df['probability'] = y_self_df
    df['predicted'] = y_self_df_bool
    columns = list(df.columns[-3:]) + list(df.columns[:-3])
    df2 = DataFrame()
    for col in columns:
        df2[col] = df[col]
    # df2 carries 'probability'/'predicted' here (no 'hat' is set), so sort
    # on the predicted probability
    df2.sort_values('probability', ascending=False, inplace=True)
    df2.to_csv('%s.failures.csv' % name, index_label='job_id')
Example #8
    def setup(self):
        one_count = 200000
        two_count = 1000000

        df1 = DataFrame(
            {'time': np.random.randint(0, one_count // 20, one_count),
             'key': np.random.choice(list(string.ascii_uppercase), one_count),
             'key2': np.random.randint(0, 25, one_count),
             'value1': np.random.randn(one_count)})
        df2 = DataFrame(
            {'time': np.random.randint(0, two_count // 20, two_count),
             'key': np.random.choice(list(string.ascii_uppercase), two_count),
             'key2': np.random.randint(0, 25, two_count),
             'value2': np.random.randn(two_count)})

        df1 = df1.sort_values('time')
        df2 = df2.sort_values('time')

        df1['time32'] = np.int32(df1.time)
        df2['time32'] = np.int32(df2.time)

        self.df1a = df1[['time', 'value1']]
        self.df2a = df2[['time', 'value2']]
        self.df1b = df1[['time', 'key', 'value1']]
        self.df2b = df2[['time', 'key', 'value2']]
        self.df1c = df1[['time', 'key2', 'value1']]
        self.df2c = df2[['time', 'key2', 'value2']]
        self.df1d = df1[['time32', 'value1']]
        self.df2d = df2[['time32', 'value2']]
        self.df1e = df1[['time', 'key', 'key2', 'value1']]
        self.df2e = df2[['time', 'key', 'key2', 'value2']]
Example #9
    def test_sort_index_multicolumn(self):
        import random
        A = np.arange(5).repeat(20)
        B = np.tile(np.arange(5), 20)
        random.shuffle(A)
        random.shuffle(B)
        frame = DataFrame({'A': A, 'B': B,
                           'C': np.random.randn(100)})

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            frame.sort_index(by=['A', 'B'])
        result = frame.sort_values(by=['A', 'B'])
        indexer = np.lexsort((frame['B'], frame['A']))
        expected = frame.take(indexer)
        assert_frame_equal(result, expected)

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            frame.sort_index(by=['A', 'B'], ascending=False)
        result = frame.sort_values(by=['A', 'B'], ascending=False)
        indexer = np.lexsort((frame['B'].rank(ascending=False),
                              frame['A'].rank(ascending=False)))
        expected = frame.take(indexer)
        assert_frame_equal(result, expected)

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            frame.sort_index(by=['B', 'A'])
        result = frame.sort_values(by=['B', 'A'])
        indexer = np.lexsort((frame['A'], frame['B']))
        expected = frame.take(indexer)
        assert_frame_equal(result, expected)
Example #10
    def data_frame(self):
        if self._processed_knockouts is None:
            self._process_knockouts()
        data_frame = DataFrame(self._processed_knockouts)
        data_frame.sort_values("size", inplace=True)
        data_frame.index = range(len(data_frame))
        return data_frame
Example #11
def model_selection_cv(models, x, y, k=5, eval_func=None, random_state=None):
    """ framework for model selection based on stratified
        cross-validation

        Parameters:
        ----------
        * models: {dictionary}, key: model label, value: learner object
        * x: {np.array}, predictor data
        * y: {np.array}, response variable data
        * k: {integer}, the number of folds in cross-validation
        * random_state: {integer}, the random state set for replication
        * eval_func: {function}, returns an evaluation score
    """
    # stratified cross-validation (pre-0.18 sklearn.cross_validation API;
    # the fold object is iterated over directly below)
    cv = StratifiedKFold(y, n_folds=k, shuffle=False, random_state=random_state)
    tot_models = len(models)
    tot_iter = tot_models * k

    pbar = tqdm(total=tot_iter)

    train_reports, test_reports = [], []

    for jj, model_name in enumerate(models):
        model = models[model_name]
        # cross-validation evaluation containers
        train_scores = []
        test_scores = []
        # print( "--- model: {}'s cross-validation test ----".format( model_name ) )
        for ii, (train_idx, test_idx) in enumerate(cv):
            # slice out this fold's train/test data
            x_train, y_train = x[train_idx], y[train_idx]
            x_test, y_test = x[test_idx], y[test_idx]
            # train the model
            model.fit(x_train, y_train)
            # evaluate on the train and test folds
            train_score = eval_func(model, x_train, y_train)
            train_score["model_name"] = model_name
            test_score = eval_func(model, x_test, y_test)
            test_score["model_name"] = model_name

            train_reports.append(train_score)
            test_reports.append(test_score)

            pbar.update()

    pbar.close()

    # convert list of performance records into dataframe
    train_reports = DataFrame(train_reports)
    test_reports = DataFrame(test_reports)

    metrics_names = [feat for feat in train_reports.columns.tolist() if feat != "model_name"]

    train_reports = train_reports.sort_values(by=["model_name"])
    train_reports = train_reports[["model_name"] + metrics_names]
    test_reports = test_reports.sort_values(by=["model_name"])
    test_reports = test_reports[["model_name"] + metrics_names]

    return train_reports, test_reports
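
A hedged usage sketch, assuming the pre-0.18 sklearn.cross_validation StratifiedKFold(y, n_folds=k) API the function is written against; accuracy_eval is an illustrative eval_func that returns a dict of metric name to score, which the loop collects into report rows.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

def accuracy_eval(model, x, y):
    # eval_func contract: return a dict; the loop adds 'model_name' to it
    return {'accuracy': model.score(x, y)}

rng = np.random.RandomState(0)
x = rng.randn(100, 4)
y = rng.randint(0, 2, 100)

models = {'logreg': LogisticRegression(), 'tree': DecisionTreeClassifier()}
train_rep, test_rep = model_selection_cv(models, x, y, k=5,
                                         eval_func=accuracy_eval)
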
Example #12
class SortValues(object):

    params = [True, False]
    param_names = ['ascending']

    def setup(self, ascending):
        self.df = DataFrame(np.random.randn(1000000, 2), columns=list('AB'))

    def time_frame_sort_values(self, ascending):
        self.df.sort_values(by='A', ascending=ascending)
Example #13
class SortIndexByColumns(object):

    def setup(self):
        N = 10000
        K = 10
        self.df = DataFrame({'key1': tm.makeStringIndex(N).values.repeat(K),
                             'key2': tm.makeStringIndex(N).values.repeat(K),
                             'value': np.random.randn(N * K)})

    def time_frame_sort_values_by_columns(self):
        self.df.sort_values(by=['key1', 'key2'])
Example #14
    def test_stable_descending_multicolumn_sort(self):
        nan = np.nan
        df = DataFrame({"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]})
        # test stable mergesort
        expected = DataFrame({"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 2, 9]}, index=[2, 5, 4, 6, 1, 3, 0])
        sorted_df = df.sort_values(["A", "B"], ascending=[0, 1], na_position="first", kind="mergesort")
        assert_frame_equal(sorted_df, expected)

        expected = DataFrame({"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 9, 2]}, index=[2, 5, 4, 6, 1, 0, 3])
        sorted_df = df.sort_values(["A", "B"], ascending=[0, 0], na_position="first", kind="mergesort")
        assert_frame_equal(sorted_df, expected)
Example #15
    def getData(self, params):
        top = int(params['top'])
        regex = re.compile(r'^<.*>(\w+.*)</.>')
        df = createDataframe()
        source = [str(regex.findall(line)).strip('[]')
                  for line in df['source'] if line is not None]
        source = dict(Counter(source))
        tweetSource = DataFrame({'AppSource': list(source.keys()),
                                 'Count': list(source.values())})
        tweetSource = tweetSource[['AppSource', 'Count']]
        tweetSource.sort_values(by='Count', ascending=False, inplace=True)
        return tweetSource[:top]
Example #16
def compute_randomized_lr_score(data_set_df, user_info_df, label='gender'):
    # print "\t\t\tfilling nan values..."
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    x = df_filtered.dropna(how='all')
    x_imp = pc.fill_nan_features(x) if x.isnull().any().any() else x.values

    clf = RandomizedLogisticRegression()
    # print "\t\t\tfitting LR model..."
    clf.fit(x_imp.T, y_v)
    feature_importances = DataFrame(clf.scores_, index=df_filtered.index, columns=['importance'])
    feature_importances.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_importances
Example #17
    def test_timegrouper_get_group(self):
        # GH 6914

        df_original = DataFrame({
            'Buyer': 'Carl Joe Joe Carl Joe Carl'.split(),
            'Quantity': [18, 3, 5, 1, 9, 3],
            'Date': [datetime(2013, 9, 1, 13, 0),
                     datetime(2013, 9, 1, 13, 5),
                     datetime(2013, 10, 1, 20, 0),
                     datetime(2013, 10, 3, 10, 0),
                     datetime(2013, 12, 2, 12, 0),
                     datetime(2013, 9, 2, 14, 0), ]
        })
        df_reordered = df_original.sort_values(by='Quantity')

        # single grouping
        expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]],
                         df_original.iloc[[4]]]
        dt_list = ['2013-09-30', '2013-10-31', '2013-12-31']

        for df in [df_original, df_reordered]:
            grouped = df.groupby(pd.Grouper(freq='M', key='Date'))
            for t, expected in zip(dt_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group(dt)
                assert_frame_equal(result, expected)

        # multiple grouping
        expected_list = [df_original.iloc[[1]], df_original.iloc[[3]],
                         df_original.iloc[[4]]]
        g_list = [('Joe', '2013-09-30'), ('Carl', '2013-10-31'),
                  ('Joe', '2013-12-31')]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(['Buyer', pd.Grouper(freq='M', key='Date')])
            for (b, t), expected in zip(g_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group((b, dt))
                assert_frame_equal(result, expected)

        # with index
        df_original = df_original.set_index('Date')
        df_reordered = df_original.sort_values(by='Quantity')

        expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]],
                         df_original.iloc[[4]]]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(pd.Grouper(freq='M'))
            for t, expected in zip(dt_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group(dt)
                assert_frame_equal(result, expected)
Example #18
    def test_sort_index_duplicates(self):

        # with 9816, these are all translated to .sort_values

        df = DataFrame([lrange(5, 9), lrange(4)],
                       columns=['a', 'a', 'b', 'b'])

        with assertRaisesRegexp(ValueError, 'duplicate'):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                df.sort_index(by='a')
        with assertRaisesRegexp(ValueError, 'duplicate'):
            df.sort_values(by='a')

        with assertRaisesRegexp(ValueError, 'duplicate'):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                df.sort_index(by=['a'])
        with assertRaisesRegexp(ValueError, 'duplicate'):
            df.sort_values(by=['a'])

        with assertRaisesRegexp(ValueError, 'duplicate'):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                # multi-column 'by' is separate codepath
                df.sort_index(by=['a', 'b'])
        with assertRaisesRegexp(ValueError, 'duplicate'):
            # multi-column 'by' is separate codepath
            df.sort_values(by=['a', 'b'])

        # with multi-index
        # GH4370
        df = DataFrame(np.random.randn(4, 2),
                       columns=MultiIndex.from_tuples([('a', 0), ('a', 1)]))
        with assertRaisesRegexp(ValueError, 'levels'):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                df.sort_index(by='a')
        with assertRaisesRegexp(ValueError, 'levels'):
            df.sort_values(by='a')

        # convert tuples to a list of tuples
        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by=[('a', 1)])
        expected = df.sort_values(by=[('a', 1)])

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by=('a', 1))
        result = df.sort_values(by=('a', 1))
        assert_frame_equal(result, expected)
Example #19
def parse_ticker_dataframe(ticker: list) -> DataFrame:
    """
    Parses the given ticker history into an OHLCV DataFrame
    :param ticker: See exchange.get_ticker_history
    :return: DataFrame
    """
    columns = {'C': 'close', 'V': 'volume', 'O': 'open', 'H': 'high', 'L': 'low', 'T': 'date'}
    frame = DataFrame(ticker) \
        .drop('BV', axis=1) \
        .rename(columns=columns)
    frame['date'] = to_datetime(frame['date'], utc=True, infer_datetime_format=True)
    frame.sort_values('date', inplace=True)
    return frame
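
A sketch of the expected input, assuming the Bittrex-style candle layout the rename map implies (one dict per candle; the BV column is dropped). The function itself relies on from pandas import DataFrame, to_datetime at module level.

ticker = [
    {'C': 0.0251, 'V': 1500.0, 'O': 0.0249, 'H': 0.0252,
     'L': 0.0248, 'T': '2017-11-26T08:50:00', 'BV': 37.6},
    {'C': 0.0253, 'V': 900.0, 'O': 0.0251, 'H': 0.0254,
     'L': 0.0250, 'T': '2017-11-26T08:55:00', 'BV': 22.8},
]
frame = parse_ticker_dataframe(ticker)
# frame columns: close, volume, open, high, low, date (sorted by date)
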
Example #20
def show_predicted_prob(df, y_test, out_path):
    print('show_predicted_prob: df=%s,y_test=%s,out_path="%s"' %
          (S(df), S(y_test), out_path))
    name = '%s-%d' % (out_path, len(y_test))
    print('~' * 80)
    df = df.loc[y_test.index, :]
    df['hat'] = y_test
    columns = ['hat'] + [col for col in df.columns if col != 'hat']
    df2 = DataFrame()
    for col in columns:
        df2[col] = df[col]
    df2.sort_values('hat', ascending=False, inplace=True)
    df2.to_csv('%s.predicted.csv' % name, index_label='job_id')
Example #21
    def test_sort_index_different_sortorder(self):
        A = np.arange(20).repeat(5)
        B = np.tile(np.arange(5), 20)

        indexer = np.random.permutation(100)
        A = A.take(indexer)
        B = B.take(indexer)

        df = DataFrame({'A': A, 'B': B,
                        'C': np.random.randn(100)})

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by=['A', 'B'], ascending=[1, 0])
        result = df.sort_values(by=['A', 'B'], ascending=[1, 0])

        ex_indexer = np.lexsort((df.B.max() - df.B, df.A))
        expected = df.take(ex_indexer)
        assert_frame_equal(result, expected)

        # test with multiindex, too
        idf = df.set_index(['A', 'B'])

        result = idf.sort_index(ascending=[1, 0])
        expected = idf.take(ex_indexer)
        assert_frame_equal(result, expected)

        # also, Series!
        result = idf['C'].sort_index(ascending=[1, 0])
        assert_series_equal(result, expected['C'])
Example #22
    def data_frame(self):
        if self._processed_solutions is None:
            self._process_solutions()

        if self._manipulation_type == "reactions":
            data_frame = DataFrame(self._processed_solutions)
        else:
            columns = self._processed_solutions.columns.difference(["reactions", "size"])
            aggregation_functions = {k: self.__aggregation_function.get(k, lambda x: x.values[0]) for k in columns}
            data_frame = self._processed_solutions.groupby(["reactions", "size"], as_index=False) \
                .aggregate(aggregation_functions)
            data_frame = data_frame[self._processed_solutions.columns]

        data_frame.sort_values("size", inplace=True)
        data_frame.index = range(len(data_frame))
        return data_frame
Example #23
    def test_stable_descending_sort(self):
        # GH #6399
        df = DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']],
                       columns=['sort_col', 'order'])
        sorted_df = df.sort_values(by='sort_col', kind='mergesort',
                                   ascending=False)
        assert_frame_equal(df, sorted_df)
Example #24
def thread_participation_evolution(
        pm_frame, project, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to threads in project with n as thresh.
    Returns DataFrame, index, selection and title for data for use
    by stacked bar-plot and heatmap functions."""
    if not research_only:
        thread_type = 'all threads'
        title = "Participation per thread in {} (threshold = {})".format(
            project, n)
    else:
        thread_type = 'research threads'
        title = "Participation per thread in {}\
                 (threshold = {}, only research-threads)".format(project, n)
    data = pm_frame.loc[project][['basic', thread_type]]
    data = data.dropna()
    all_authors = set().union(*data[thread_type, 'authors'])
    author_thread = DataFrame(columns=all_authors)
    for author in author_thread.columns:
        author_thread[author] = data[thread_type, 'authors'].apply(
            lambda thread, author=author: author in thread)
    author_thread = author_thread.T
    author_thread = author_thread.sort_values(by=data.index.tolist(),
                                              ascending=False)
    author_thread = author_thread.drop(
        "Anonymous") if skip_anon else author_thread
    author_thread.columns.name = "Threads"
    select = author_thread.sum(axis=1) >= n
    return author_thread, data.index, select, title
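
A hedged call sketch; pm_frame is the project-level frame built elsewhere in this codebase, and 'Polymath 8' is a hypothetical project key.

author_thread, threads, select, title = thread_participation_evolution(
    pm_frame, 'Polymath 8', n=2)
active = author_thread[select]  # authors participating in at least n threads
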
Example #25
def project_participation_evolution(
        pm_frame, all_authors, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to projects with n as thresh.
    Returns DataFrame, index, selection and title for data for use
    by stacked bar-plot and heatmap functions."""
    if not research_only:
        thread_type = 'all threads'
        data, _ = get_last(pm_frame, thread_type)
        all_authors = list(all_authors)
        title = "Participation per project in Polymath\
                 (threshold = {})".format(n)
    else:
        thread_type = 'research threads'
        data, _ = get_last(pm_frame, thread_type)
        all_authors = set().union(
            *data['research threads', 'authors (accumulated)'])
        title = "Participation per project in Polymath\
                 (threshold = {}, only research-threads)".format(n)
    data.index = data.index.droplevel(1)
    author_project = DataFrame(columns=all_authors)
    for author in author_project.columns:
        author_project[author] = data[
            thread_type, 'authors (accumulated)'].apply(
                lambda project, author=author: author in project)
    author_project = author_project.T
    author_project = author_project.sort_values(by=data.index.tolist(),
                                                ascending=False)
    author_project = author_project.drop(
        "Anonymous") if skip_anon else author_project
    select = author_project.sum(axis=1) >= n
    return author_project, data.index, select, title
Example #26
    def test_astype_categorical_to_other(self):

        value = np.random.RandomState(0).randint(0, 10000, 100)
        df = DataFrame({'value': value})
        labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        df = df.sort_values(by=['value'], ascending=True)
        df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
                                   right=False, labels=cat_labels)

        s = df['value_group']
        expected = s
        tm.assert_series_equal(s.astype('category'), expected)
        tm.assert_series_equal(s.astype(CategoricalDtype()), expected)
        msg = (r"could not convert string to float|"
               r"invalid literal for float\(\)")
        with pytest.raises(ValueError, match=msg):
            s.astype('float64')

        cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
        exp = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
        tm.assert_series_equal(cat.astype('str'), exp)
        s2 = Series(Categorical(['1', '2', '3', '4']))
        exp2 = Series([1, 2, 3, 4]).astype(int)
        tm.assert_series_equal(s2.astype('int'), exp2)

        # object dtype doesn't sort correctly, so just compare that we have
        # the same values
        def cmp(a, b):
            tm.assert_almost_equal(
                np.sort(np.unique(a)), np.sort(np.unique(b)))

        expected = Series(np.array(s.values), name='value_group')
        cmp(s.astype('object'), expected)
        cmp(s.astype(np.object_), expected)

        # array conversion
        tm.assert_almost_equal(np.array(s), np.array(s.values))

        # valid conversion
        for valid in [lambda x: x.astype('category'),
                      lambda x: x.astype(CategoricalDtype()),
                      lambda x: x.astype('object').astype('category'),
                      lambda x: x.astype('object').astype(
                          CategoricalDtype())
                      ]:

            result = valid(s)
            # compare series values
            # internal .categories can't be compared because it is sorted
            tm.assert_series_equal(result, s, check_categorical=False)

        # invalid conversion (these are NOT a dtype)
        msg = (r"invalid type <class 'pandas\.core\.arrays\.categorical\."
               "Categorical'> for astype")
        for invalid in [lambda x: x.astype(Categorical),
                        lambda x: x.astype('object').astype(Categorical)]:
            with pytest.raises(TypeError, match=msg):
                invalid(s)
Example #27
def land_sic_overlap_timeseries(instrument,
                                title="Land-Sea Ice Border Variations"):
    """
        Time Series that shows the percentage variations of the land mask
        border given the expansion of sea ice in VIRS.
    """

    files = data.file_names(instrument_id=data.INSTRUMENT_MAP.get(instrument))
    out = []

    for idx, mat in enumerate(data.mat_generator(files)):

        sic = SIC(files[idx])
        lm = LM(files[idx])

        sic_surface = sic.surface(boolean=False)
        lm_surface = lm.silhoutte()
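
        # NOTE: scipy.stats.itemfreq was removed in SciPy 1.3;
        # np.unique(lm_surface, return_counts=True) is the modern equivalent.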

        silhoutte_freq = itemfreq(lm_surface)
        border = silhoutte_freq[1][1]

        merge = np.add(sic_surface, lm_surface)
        merge_freq = itemfreq(merge)
        intercept = merge_freq[2][1]

        land_ice_overlap = (float(intercept) / border) * 100
        temp = {'timestamp': lm.title, 'intercept': land_ice_overlap}
        out.append(temp)

    index = [elem['timestamp'] for elem in out]
    df = DataFrame(out, index=index)
    sdf = df.sort_values(by='timestamp')
    sdf.plot(title=title)
    plt.show()
Example #28
def plot_stuff():
    pd_list = {}
    compare_tl = []
    compare_tl_head = []
    for csv_files in list_communities():
        for var in csv_files:
            pd_list.update({var.split("/")[-1].split(".")[0]: DataFrame(
                sorted(read_csv(var)[["Name", "G"]].values, key=lambda x: x[0], reverse=True))})

        N = len(pd_list)  # Find number of elements
        stats = np.zeros((N, N))  # Create a 2-D Array to hold the stats
        keys = sorted(pd_list, reverse=True)  # Find data sets (Sort alphabetically, backwards)
        for idx, key in enumerate(keys):  # Populate 2-D array
            for i, val in enumerate(pd_list[key][1].values):
                if not i == idx:  # Ensure self values are set to zero
                    stats[i, idx] = val

        stats = DataFrame(stats, columns=keys, index=keys)
        # stats["Mean"] = stats.median(axis=0)
        # set_trace()
        stats["Mean"] = find_mean(stats)
        stats["Std"] = find_std(stats)
        stats = stats.sort_values(by="Mean", axis=0, ascending=False, inplace=False)
        print(tabulate(stats, showindex=True, headers=stats.columns, tablefmt="fancy_grid"))
        print("\n")
        save_path = os.path.abspath("/".join(var.split("/")[:-2]))
        method = var.split("/")[-2]+".xlsx"
        stats.to_excel(os.path.join(save_path, method))
        compare_tl.append(stats.sort_index(inplace=False)["Mean"].values.tolist())
        compare_tl_head.append(method)
    # set_trace()
    compare_tl = DataFrame(np.array(compare_tl).T, columns=compare_tl_head, index=stats.index.sort_values())
    save_path_2 = os.path.join(os.path.abspath("/".join(var.split("/")[:-3])), os.path.abspath("".join(var.split("/")[-3]))+".xlsx")
    compare_tl.to_excel(save_path_2)
Example #29
    def _search_by_inchi_fuzzy(self, inchi):
        # TODO: use openbabel if available
        matches = difflib.get_close_matches(inchi, self.data_frame.InChI.dropna(), n=5, cutoff=0.8)
        ranks = {match: i for i, match in enumerate(matches)}
        selection = DataFrame(self.data_frame[self.data_frame.InChI.isin(matches)])
        selection['search_rank'] = selection.name.map(ranks)
        return selection.sort_values('search_rank')
Example #30
    def test_numeric_like_ops(self):

        df = DataFrame({'value': np.random.randint(0, 10000, 100)})
        labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        df = df.sort_values(by=['value'], ascending=True)
        df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
                                   right=False, labels=cat_labels)

        # numeric ops should not succeed
        for op in ['__add__', '__sub__', '__mul__', '__truediv__']:
            pytest.raises(TypeError,
                          lambda: getattr(df, op)(df))

        # reduction ops should not succeed (unless specifically defined, e.g.
        # min/max)
        s = df['value_group']
        for op in ['kurt', 'skew', 'var', 'std', 'mean', 'sum', 'median']:
            pytest.raises(TypeError,
                          lambda: getattr(s, op)(numeric_only=False))

        # mad technically works because it always takes the numeric data

        # numpy ops
        s = Series(Categorical([1, 2, 3, 4]))
        pytest.raises(TypeError, lambda: np.sum(s))

        # numeric ops on a Series
        for op in ['__add__', '__sub__', '__mul__', '__truediv__']:
            pytest.raises(TypeError, lambda: getattr(s, op)(2))

        # invalid ufunc
        pytest.raises(TypeError, lambda: np.log(s))