Example #1
0
    def test_frame_values_with_tz(self):
        tz = "US/Central"
        df = DataFrame({"A": date_range('2000', periods=4, tz=tz)})
        result = df.values
        expected = np.array([
            [pd.Timestamp('2000-01-01', tz=tz)],
            [pd.Timestamp('2000-01-02', tz=tz)],
            [pd.Timestamp('2000-01-03', tz=tz)],
            [pd.Timestamp('2000-01-04', tz=tz)],
        ])
        tm.assert_numpy_array_equal(result, expected)

        # two columns, homogenous

        df = df.assign(B=df.A)
        result = df.values
        expected = np.concatenate([expected, expected], axis=1)
        tm.assert_numpy_array_equal(result, expected)

        # three columns, heterogenous
        est = "US/Eastern"
        df = df.assign(C=df.A.dt.tz_convert(est))

        new = np.array([
            [pd.Timestamp('2000-01-01T01:00:00', tz=est)],
            [pd.Timestamp('2000-01-02T01:00:00', tz=est)],
            [pd.Timestamp('2000-01-03T01:00:00', tz=est)],
            [pd.Timestamp('2000-01-04T01:00:00', tz=est)],
        ])
        expected = np.concatenate([expected, new], axis=1)
        result = df.values
        tm.assert_numpy_array_equal(result, expected)
Example #2
0
    def test_assign_bad(self):
        """assign rejects positional arguments and unknown column refs."""
        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

        # positional (non-keyword) arguments are not accepted
        with pytest.raises(TypeError):
            df.assign(lambda x: x.A)
        # df.C does not exist when the keyword value is evaluated
        with pytest.raises(AttributeError):
            df.assign(C=df.A, D=df.A + df.C)
 def test_assign_alphabetical(self):
     # GH 9818
     df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
     result = df.assign(D=df.A + df.B, C=df.A - df.B)
     expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
                          columns=list('ABCD'))
     assert_frame_equal(result, expected)
     result = df.assign(C=df.A - df.B, D=df.A + df.B)
     assert_frame_equal(result, expected)
Example #4
0
    def test_assign_dependent_old_python(self):
        """Before Python 3.6 **kwargs order was undefined, so a column
        created in the same assign call could not be referenced."""
        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

        # 'C' is not yet a column of df when D's value is evaluated
        with pytest.raises(KeyError):
            df.assign(C=lambda df: df.A,
                      D=lambda df: df['A'] + df['C'])
        with pytest.raises(KeyError):
            df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
Example #5
0
    def test_assign_dependent(self):
        df = DataFrame({'A': [1, 2], 'B': [3, 4]})

        result = df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
        expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
                             columns=list('ABCD'))
        assert_frame_equal(result, expected)

        result = df.assign(C=lambda df: df.A,
                           D=lambda df: df['A'] + df['C'])
        expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
                             columns=list('ABCD'))
        assert_frame_equal(result, expected)
Example #6
0
def subtitle_cat(train_df: pd.DataFrame, clue_word: list):
    """Derive a categorical feature from the subtitle (heading) text.

    Category codes (later writes win, in this order):
      0 - heading contains the 'NO_SUBTITLE' marker,
      1 - heading matches one of the clue words,
      2 - everything else.
    """
    out = train_df.assign(heading_cat=np.nan)
    out.loc[out.heading.str.contains(r'NO_SUBTITLE'), 'heading_cat'] = 0
    clue_mask = out.heading.str.contains(util.contains_patt(clue_word))
    out.loc[clue_mask, 'heading_cat'] = 1
    out.loc[out.heading_cat.isna(), 'heading_cat'] = 2
    return out.heading_cat.astype('category')
Example #7
0
    def test_assign_order(self):
        # GH 9818: column insertion order follows **kwargs order on
        # Python >= 3.6, alphabetical order before that.
        df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])

        result = df.assign(D=df.A + df.B, C=df.A - df.B)
        if PY36:
            expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]],
                                 columns=list('ABDC'))
        else:
            expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
                                 columns=list('ABCD'))
        assert_frame_equal(result, expected)

        # alphabetical keyword order gives the same layout either way
        result = df.assign(C=df.A - df.B, D=df.A + df.B)
        expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
                             columns=list('ABCD'))
        assert_frame_equal(result, expected)
Example #8
0
 def test_assign_bad(self):
     """Every illegal assign call must raise the documented exception."""
     df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

     def raises(exc, thunk):
         # run one bad assign call and demand the given exception
         with tm.assertRaises(exc):
             thunk()

     # non-keyword argument is rejected
     raises(TypeError, lambda: df.assign(lambda x: x.A))
     # df.C does not exist when the value expression is evaluated
     raises(AttributeError, lambda: df.assign(C=df.A, D=df.A + df.C))
     # dependent columns unsupported: 'C' unknown inside the callables
     raises(KeyError,
            lambda: df.assign(C=lambda df: df.A,
                              D=lambda df: df['A'] + df['C']))
     raises(KeyError,
            lambda: df.assign(C=df.A, D=lambda x: x['A'] + x['C']))
Example #9
0
def labeling(sentence_df: pd.DataFrame, train_dict: dict):
    """Return a copy of ``sentence_df`` with a boolean ``label`` column.

    For every article id in ``train_dict`` with a non-empty value list,
    the rows of that article are labelled True when their sentence
    contains any of the training values (pattern built by
    ``contains_patt``).  All other rows keep the default False.
    """
    labeled = sentence_df.assign(label=False)
    for article_id, train_values in train_dict.items():
        # BUG FIX: the original tested ``len(train_values) is 0`` -- an
        # identity comparison against an int literal, which emits a
        # SyntaxWarning on CPython >= 3.8 and is not guaranteed to be
        # True.  Plain truthiness is the correct, idiomatic check.
        if not train_values:
            continue

        # compute the per-article row mask once instead of twice
        rows = labeled._id == str(article_id)
        labeled.loc[rows, 'label'] = \
            labeled.loc[rows].sentence.str.contains(contains_patt(train_values))

    return labeled
Example #10
0
class MergeCategoricals(object):
    """Benchmark merging frames whose value columns are object vs. category."""

    def setup(self):
        size = (10000,)
        # two frames sharing the join key 'X', each with one string column
        self.left_object = DataFrame({
            'X': np.random.choice(range(0, 10), size=size),
            'Y': np.random.choice(['one', 'two', 'three'], size=size),
        })
        self.right_object = DataFrame({
            'X': np.random.choice(range(0, 10), size=size),
            'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=size),
        })

        # identical data with the string columns cast to category dtype
        self.left_cat = self.left_object.assign(
            Y=self.left_object['Y'].astype('category'))
        self.right_cat = self.right_object.assign(
            Z=self.right_object['Z'].astype('category'))

    def time_merge_object(self):
        # baseline: merge with plain object-dtype value columns
        merge(self.left_object, self.right_object, on='X')

    def time_merge_cat(self):
        # same merge with categorical value columns
        merge(self.left_cat, self.right_cat, on='X')
Example #11
0
    def test_assign(self):
        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        original = df.copy()
        result = df.assign(C=df.B / df.A)
        expected = df.copy()
        expected['C'] = [4, 2.5, 2]
        assert_frame_equal(result, expected)

        # lambda syntax
        result = df.assign(C=lambda x: x.B / x.A)
        assert_frame_equal(result, expected)

        # original is unmodified
        assert_frame_equal(df, original)

        # Non-Series array-like
        result = df.assign(C=[4, 2.5, 2])
        assert_frame_equal(result, expected)
        # original is unmodified
        assert_frame_equal(df, original)

        result = df.assign(B=df.B / df.A)
        expected = expected.drop('B', axis=1).rename(columns={'C': 'B'})
        assert_frame_equal(result, expected)

        # overwrite
        result = df.assign(A=df.A + df.B)
        expected = df.copy()
        expected['A'] = [5, 7, 9]
        assert_frame_equal(result, expected)

        # lambda
        result = df.assign(A=lambda x: x.A + x.B)
        assert_frame_equal(result, expected)
Example #12
0
 def fix_tickets(
         self, ticket_frame: pd.DataFrame, path_fixes) -> pd.DataFrame:
     """Normalize a ticket frame for analysis.

     Renames the changed-lines column, drops outlier tickets, rewrites
     file-path prefixes via ``self.fix_path_prefixes``, sorts by commit
     date and blanks missing 'Found' values.

     NOTE: the in-place rename mutates the caller's frame.
     """
     ticket_frame.rename(
         columns={'Total changed lines': 'ChangedLines'}, inplace=True)
     # drop pathological tickets with an absurd number of changed lines
     ticket_frame = ticket_frame[ticket_frame.ChangedLines < 100000]
     fix_paths = partial(self.fix_path_prefixes, path_fixes)
     ticket_frame = ticket_frame.assign(
         ChangedFiles=ticket_frame['Changed files'].apply(fix_paths))
     fixed_frame = (ticket_frame
                    .drop('Changed files', axis=1)
                    .sort_values(by='CommitDate')
                    .reset_index(drop=True))
     fixed_frame.fillna(value={'Found': ''}, axis=0, inplace=True)
     return fixed_frame
Example #13
0
def get_subtitle(sentence_df: pd.DataFrame, wiki_dump_data: list):
    """Attach the section subtitle ('heading') to every sentence row.

    For each article id in ``sentence_df``, the matching raw entry is
    looked up in ``wiki_dump_data``, parsed with wikitextparser, and
    each section's heading is propagated onto the sentences belonging
    to that section; remaining gaps are filled by
    ``_complement_subtitle``.
    """
    df = sentence_df.assign(heading='')
    article_frames = []
    for _id in df._id.unique():
        article_df = df.loc[df._id == _id]

        # first dump entry with this article id (IndexError if absent)
        row_article = [entry for entry in wiki_dump_data
                       if entry['index']['_id'] == _id][0]
        parsed = wtp.parse(row_article['source_text'])
        # sections[0] is the lead section, which has no subtitle
        for source in parsed.sections[1:]:
            heading = _search_subtitle(source.string)
            section_text = _clean_source_text(source)
            article_df = _get_subtitle_of_sentence(
                article_df, section_text, heading)

        article_frames.append(_complement_subtitle(article_df))

    # BUG FIX: DataFrame.append was removed in pandas 2.0; collecting
    # the per-article frames and concatenating once is the supported
    # equivalent (and avoids quadratic copying inside the loop).
    if not article_frames:
        return pd.DataFrame()
    return pd.concat(article_frames)
Example #14
0
 def test_assign_multiple(self):
     df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=['A', 'B'])
     result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B)
     expected = DataFrame([[1, 4, 7, 1, 4], [2, 5, 8, 2, 5],
                           [3, 6, 9, 3, 6]], columns=list('ABCDE'))
     assert_frame_equal(result, expected)
Example #15
0
#len(rate_with_go_rmdup_rmna.index)
#10314
# For each unique GO term, run a one-sided Mann-Whitney U test asking
# whether the 'Snake_island_viper_6' rates tend to be greater than the
# 'Black_brow_viper_4' rates for genes annotated with that term.
# NOTE(review): `go_u_pvalue` and `go_u_pvalue_sign` are accumulator
# lists defined earlier in the file (outside this excerpt).
for go_term in rate_with_go_rmdup_rmna['GO Term Accession'].drop_duplicates(keep='first'):
    list1 = rate_with_go_rmdup_rmna[rate_with_go_rmdup_rmna['GO Term Accession'] == go_term]['Snake_island_viper_6']
    list2 = rate_with_go_rmdup_rmna[rate_with_go_rmdup_rmna['GO Term Accession'] == go_term]['Black_brow_viper_4']
#paired    u, pvalue = scipy.stats.wilcoxon(list1,list2)
    u, pvalue = scipy.stats.mannwhitneyu(list1,list2,alternative='greater')
    go_u_pvalue.append([go_term,u,pvalue])
#4620
    # keep only nominally significant terms (p < 0.05; no
    # multiple-testing correction is applied here)
    if pvalue < 0.05:
        go_u_pvalue_sign.append([go_term,u,pvalue])
#7
# Disabled variant (kept as a dead string literal): label each
# significance frame separately before concatenating.
"""
#go_u_pvalue_sign_df['lable'] = Series(['greater']*len(go_u_pvalue_sign_df), index=go_u_pvalue_sign_df.index)
go_u_pvalue_sign_df.columns = ['GO Term Accession','Mann-Whitney U statistic','p-value']
go_u_pvalue_sign_df_label = go_u_pvalue_sign_df.assign(label = Series(['greater']*len(go_u_pvalue_sign_df)))
go_u_pvalue_sign_less_df.columns = ['GO Term Accession','Mann-Whitney U statistic','p-value']
go_u_pvalue_sign_less_df_label = go_u_pvalue_sign_less_df.assign(label = Series(['less']*len(go_u_pvalue_sign_less_df)))
go_u_pvalue_sign_gl_df = pd.concat([go_u_pvalue_sign_df_label,go_u_pvalue_sign_less_df_label],axis=0,ignore_index=True)
#go_u_pvalue_sign_gl_df_sort = go_u_pvalue_sign_gl_df.sort_values(by='GO Term Accession',ascending=True)

"""
# NOTE(review): the next line's result is immediately overwritten by
# the equivalent pd.concat on the following line, so it is redundant
# (and DataFrame.append is removed in pandas >= 2.0).
go_u_pvalue_sign_gl_df = go_u_pvalue_sign_df.append(go_u_pvalue_sign_less_df,ignore_index=True)
go_u_pvalue_sign_gl_df = pd.concat([go_u_pvalue_sign_df,go_u_pvalue_sign_less_df],axis=0,ignore_index=True)
go_u_pvalue_sign_gl_df.columns = ['GO Term Accession','Mann-Whitney U statistic','p-value']
# NOTE(review): DataFrame.assign returns a NEW frame; this result is
# discarded, so no column is ever added -- and 'lable' looks like a
# typo for 'label'.  Confirm whether this should be
# `go_u_pvalue_sign_gl_df = go_u_pvalue_sign_gl_df.assign(...)`.
go_u_pvalue_sign_gl_df.assign(lable = Series(['greater']*len(go_u_pvalue_sign_df)+['less']*len(go_u_pvalue_sign_less_df)))
"""

go_rate_with_sign_p = pd.merge(GO_mean_rate.reset_index().ix[:,["GO Term Accession",species_focus,species_backgroud]],go_u_pvalue_sign_gl_df,on="GO Term Accession",how='inner',sort=True)
mart_go_uniq_ann = pd.read_table('human_mart_export_GOuniq_sort.txt',header = 'infer',skiprows=[0],skip_blank_lines=True)
go_rate_with_sign_p_ann = pd.merge(mart_go_uniq_ann,go_rate_with_sign_p,on="GO Term Accession",how='inner')
Example #16
0
# m.to_csv(open('Data/X.csv', 'w'))
# dataset = dataset1
import traceback




# Keep only the numeric columns and mean-impute their missing values.
# NOTE(review): sklearn's `Imputer` was removed in scikit-learn 0.22;
# modern versions use sklearn.impute.SimpleImputer -- confirm which
# sklearn version this script targets.
dataset1 = dataset.select_dtypes(include=[np.number])
use_field = list(dataset1.columns.values)
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(dataset1)
# transform() returns a bare ndarray, so rebuild the DataFrame with the
# saved numeric column names
dataset1 = imp.transform(dataset1)
dataset1 = DataFrame(dataset1, columns=use_field)
# project-specific feature engineering helpers (defined elsewhere)
dataset1 = concat_free_money(dataset1)
dataset1 = throw_outliers(dataset1)
# re-attach SUBS_ID from the original frame, then left-join the two
# auxiliary feature sources on it.
# NOTE(review): this assumes row alignment with `dataset` survived the
# impute/engineering steps above -- verify throw_outliers keeps rows.
dataset1 = dataset1.assign(SUBS_ID=dataset.SUBS_ID)
dataset1 = pd.merge(dataset1, dataset2, on='SUBS_ID', how='left')
dataset1 = pd.merge(dataset1, read_csv('Data/X2.csv'), on='SUBS_ID', how='left')
# print(use_field)
# collapse duplicate SUBS_ID rows by averaging their features
gr = dataset1.groupby('SUBS_ID')
dataset = gr.mean()
dataset1 = dataset.copy()

dataset1 = dataset1.drop(['AGE_GROUP1', 'AGE_GROUP2'], axis=1)
print(len(dataset1.columns.values))
# dataset1 = dataset1['SUBS_ID']
      # (dataset.columns.values[0]))
# dataset1 = preprocessing.scale(dataset1)

# NOTE(review): TRAIN_PART = 6/5 makes the train split below the first
# len/1.2 ~= 83% of the rows; the constant's name suggests a fraction,
# so verify this value is intentional.
TRAIN_PART = 6/5
train = dataset1[:int(len(dataset1)/TRAIN_PART)]