Code Example #1
 def data_frame(self):
     if self._processed_knockouts is None:
         self._process_knockouts()
     data_frame = DataFrame(self._processed_knockouts)
     data_frame.sort_values("size", inplace=True)
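     # renumbering below is equivalent to data_frame.reset_index(drop=True)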
     data_frame.index = [i for i in range(len(data_frame))]
     return data_frame
Code Example #2
def returns_customer_product_time(pwunsale_tidy, pw_ytdcust, pw_cusattr):
    '''
    Meant to feed into a Pivot requested by Mitch Turner.
    
    Aggregates the same as above but includes time and product data.
    '''
    dat = pwunsale_tidy['Date'].tolist()
    pwunsale_tidy['Month'] = [d.strftime('%B') for d in dat]    
    
    print('Aggregating custom pivot for Mitch.')
    len_unique = lambda x: len(pd.unique(x))
    agg_funcs_returns = {'ExtCost': {'DollarsReturned|avg':np.mean, 'DollarsReturned|sum':np.sum},
                         'CasesReturned': {'CasesReturned|avg':np.mean, 'CasesReturned|sum':np.sum},
                         'Invoice':len_unique }
    
    custom_cols = ['Month','CustomerId','Customer','ProductId','Product']    
    
    customer_returns = DataFrame(pwunsale_tidy.groupby(custom_cols)[['ExtCost','CasesReturned']].agg(agg_funcs_returns)).reset_index(drop=False)
    customer_returns.rename(columns={'<lambda>':'Returns|count'}, inplace=True) 
    customer_returns.drop('Customer', inplace=True, axis=1)
    
    print('Merging in YTD sales by Customer')
    customer_returns = customer_returns.merge(pw_ytdcust, on='CustomerId', how='left')
    
    print('Deriving returns as a percent of sales for each Customer.')
    customer_returns['PercentSales'] = np.divide(customer_returns['DollarsReturned|sum'], customer_returns['DollarSales|bycustomer'])
    
    print('Merge in customer attributes.')
    customer_returns = customer_returns.merge(pw_cusattr, on='CustomerId', how='left')
    
    print('Sorting in descending order on Dollars returned.')
    customer_returns.sort_values('DollarsReturned|sum', ascending=False, inplace=True)

    return customer_returns
Code Example #3
def view_coef(model: LogisticRegression, train_df):
    coef_list = list(model.coef_.T)
    coef_df = DataFrame({'columns': list(train_df.columns[1:]),
                         'coef': coef_list})
    coef_df['abs_coef'] = abs(coef_df['coef'])
    coef_df.sort_values(by=['abs_coef'], ascending=[0], inplace=True)
    print(coef_df)
Code Example #4
File: feature_importance.py  Project: heevery/ohp
def compute_importances(data_set_df, user_info_df, label='gender', split_modal=False, n_est=10, max_depth=None):
    print "\t\t\tfilling nan values..."
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    feature_importances = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index, columns=['importance'])
    modalities = data_set_df.index.levels[0]

    def compute(x):
        x_imp = pc.fill_nan_features(x)
        try:
            m = ExtraTreesClassifier(n_estimators=n_est) if max_depth is None \
                else ExtraTreesClassifier(n_estimators=n_est, max_depth=max_depth)
            print "\t\t\tfitting RF model..."
            m.fit(x_imp.T, y_v)

            # if len(feature_mics) > 1000:
            #     break
            # print m.feature_importances_
            for order, index in enumerate(x.index):
                feature_importances.loc[index] = m.feature_importances_[order]
                if float(order) % 10000 == 0 and order > 0:
                    print "\t\t\t%s features are done" % order
        except ValueError as e:
            # print "value error occurs during processing %r" % index
            pass

    if split_modal is True:
        for modal in modalities:
            x = df_filtered.loc[modal].dropna(how='all')
            compute(x)
    else:
        x = df_filtered.dropna(how='all')
        compute(x)

    feature_importances.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_importances
Code Example #5
File: feature_importance.py  Project: heevery/ohp
def compute_fscore(data_set_df, user_info_df, label='gender', min_not_nan=-1):
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    feature_fs = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index, columns=['importance'])
    i = 0
    for index, values in df_filtered.iterrows():
        try:
            if min_not_nan < 0:
                f_score, p_val = f_classif(values.fillna(values.mean())[:, np.newaxis], y_v)
                feature_fs.loc[index] = f_score if f_score != np.inf and f_score != -np.inf else np.nan
            else:
                nan_removed = values.dropna()
                if len(nan_removed) < min_not_nan:
                    feature_fs.loc[index] = np.nan
                else:
                    f_score, p_val = f_classif(nan_removed[:, np.newaxis], y_v[nan_removed.index.astype(int)])
                    feature_fs.loc[index] = f_score if f_score != np.inf and f_score != -np.inf else np.nan
            if float(i) % 10000 == 0 and i > 0:
                print "\t\t\t%s features are done" % i
            i += 1
            # print index, feature_fs.loc[index].values[0]
        except ValueError:
            # print "value error occurs during processing %r" % index
            continue
    feature_fs.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_fs
Code Example #6
File: join_merge.py  Project: TomAugspurger/pandas
    def setup(self):
        one_count = 200000
        two_count = 1000000

        df1 = DataFrame(
            {'time': np.random.randint(0, one_count / 20, one_count),
             'key': np.random.choice(list(string.ascii_uppercase), one_count),
             'key2': np.random.randint(0, 25, one_count),
             'value1': np.random.randn(one_count)})
        df2 = DataFrame(
            {'time': np.random.randint(0, two_count / 20, two_count),
             'key': np.random.choice(list(string.ascii_uppercase), two_count),
             'key2': np.random.randint(0, 25, two_count),
             'value2': np.random.randn(two_count)})

        df1 = df1.sort_values('time')
        df2 = df2.sort_values('time')

        df1['time32'] = np.int32(df1.time)
        df2['time32'] = np.int32(df2.time)

        self.df1a = df1[['time', 'value1']]
        self.df2a = df2[['time', 'value2']]
        self.df1b = df1[['time', 'key', 'value1']]
        self.df2b = df2[['time', 'key', 'value2']]
        self.df1c = df1[['time', 'key2', 'value1']]
        self.df2c = df2[['time', 'key2', 'value2']]
        self.df1d = df1[['time32', 'value1']]
        self.df2d = df2[['time32', 'value2']]
        self.df1e = df1[['time', 'key', 'key2', 'value1']]
        self.df2e = df2[['time', 'key', 'key2', 'value2']]
Code Example #7
File: feature_importance.py  Project: heevery/ohp
def compute_mics(data_set_df, user_info_df, label='gender', min_not_nan=-1):
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    feature_mics = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index, columns=['importance'])
    i = 0
    for index, values in df_filtered.iterrows():
        # if len(feature_mics) > 1000:
        #     break
        m = minepy.MINE()
        try:
            if min_not_nan < 0:
                m.compute_score(values, y_v)
                feature_mics.loc[index] = m.mic()
            else:
                nan_removed = values.dropna()
                if len(nan_removed) < min_not_nan:
                    feature_mics.loc[index] = np.nan
                else:
                    m.compute_score(nan_removed, y_v[nan_removed.index.astype(int)])
                    feature_mics.loc[index] = m.mic()
            # if len(feature_mics) > 1000:
            #     break
            # if float(i) % 10000 == 0 and i > 0:
            #     print "\t\t\t%s features are done" % i
            i += 1
            # print index, feature_mics.loc[index].values[0]
        except ValueError:
            # print "value error occurs during processing %r" % index
            continue
    feature_mics.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_mics
Code Example #8
File: explore.py  Project: peterwilliams97/prince
def show_failures_prob(df, y, y_self, out_path):
    print('show_failures_prob: df=%s,y=%s,y_self=%s,out_path="%s"' %
          (S(df), S(y), S(y_self), out_path))
    name = '%s-%d' % (out_path, len(y))

    y_self_bool = DataFrame(np.floor(y_self['hat'].values * 2.0).astype(int),
                            index=y_self.index, columns=['hat'])

    print('A) y_self_bool=%s,' % C(y_self_bool))
    print('B) y=%s' % y.dtype)

    diff = y_self_bool['hat'] - y
    failures = diff != 0
    print('^' * 80)
    print(type(failures))
    print(failures.describe())
    print(failures[:5])
    failures_df = Series([False] * len(df), index=df.index)
    for idx, val in failures.iteritems():
        failures_df[idx] = val
    df = df[failures_df]
    y_self_df = Series([0.0] * len(df), index=df.index, dtype=float)
    y_self_df_bool = Series([0] * len(df), index=df.index, dtype=int)
    for idx in y_self_df.index:
        y_self_df[idx] = y_self['hat'][idx]
        y_self_df_bool[idx] = y_self_bool['hat'][idx]
    df['probability'] = y_self_df
    df['predicted'] = y_self_df_bool
    columns = list(df.columns[-3:]) + list(df.columns[:-3])
    df2 = DataFrame()
    for col in columns:
        df2[col] = df[col]
    df2.sort_values('hat', ascending=False, inplace=True)
    df2.to_csv('%s.failures.csv' % name, index_label='job_id')
Code Example #9
File: test_sorting.py  Project: AlexisMignon/pandas
    def test_sort_index_multicolumn(self):
        import random
        A = np.arange(5).repeat(20)
        B = np.tile(np.arange(5), 20)
        random.shuffle(A)
        random.shuffle(B)
        frame = DataFrame({'A': A, 'B': B,
                           'C': np.random.randn(100)})

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            frame.sort_index(by=['A', 'B'])
        result = frame.sort_values(by=['A', 'B'])
        indexer = np.lexsort((frame['B'], frame['A']))
        expected = frame.take(indexer)
        assert_frame_equal(result, expected)

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            frame.sort_index(by=['A', 'B'], ascending=False)
        result = frame.sort_values(by=['A', 'B'], ascending=False)
        indexer = np.lexsort((frame['B'].rank(ascending=False),
                              frame['A'].rank(ascending=False)))
        expected = frame.take(indexer)
        assert_frame_equal(result, expected)

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            frame.sort_index(by=['B', 'A'])
        result = frame.sort_values(by=['B', 'A'])
        indexer = np.lexsort((frame['A'], frame['B']))
        expected = frame.take(indexer)
        assert_frame_equal(result, expected)
Code Example #10
File: test_sorting.py  Project: tserafim/pandas
    def test_sort_datetimes(self):

        # GH 3461, argsort / lexsort differences for a datetime column
        df = DataFrame(
            ["a", "a", "a", "b", "c", "d", "e", "f", "g"], columns=["A"], index=date_range("20130101", periods=9)
        )
        dts = [
            Timestamp(x)
            for x in [
                "2004-02-11",
                "2004-01-21",
                "2004-01-26",
                "2005-09-20",
                "2010-10-04",
                "2009-05-12",
                "2008-11-12",
                "2010-09-28",
                "2010-09-28",
            ]
        ]
        df["B"] = dts[::2] + dts[1::2]
        df["C"] = 2.0
        df["A1"] = 3.0

        df1 = df.sort_values(by="A")
        df2 = df.sort_values(by=["A"])
        assert_frame_equal(df1, df2)

        df1 = df.sort_values(by="B")
        df2 = df.sort_values(by=["B"])
        assert_frame_equal(df1, df2)
Code Example #11
def model_selection_cv( models, x, y, k=5, eval_func=None, random_state=None):
    """ framework for model selection based on stratified
        cross-validation

        Parameters:
        ----------
        * models: {dictionary}, key: model label, value: learner object
        * x: {np.array}, predictor data
        * y: {np.array}, response variable data
        * k: {integer}, the number of folds in cross-validation
        * random_state: {integer}, the random state set for replication
        * eval_func: {function}, returns evaluation score
    """
    # stratified cross_validation
    cv = StratifiedKFold( y, n_folds=k, shuffle=False, random_state=random_state)
    tot_models = len( models.keys() )
    tot_iter = tot_models * k

    pbar = tqdm(total=tot_iter)

    train_reports, test_reports = [], []

    for jj, model_name in enumerate(models):
        model = models[model_name]
        # cross-validation evaluation containers
        train_scores = []
        test_scores = []
        # print( "--- model: {}'s cross-validation test ----".format( model_name ) )
        for ii, (train_idx, test_idx) in enumerate(cv):
            # retrieve data for relevant usage
            x_train, y_train = x[train_idx], y[train_idx]
            x_test, y_test = x[test_idx], y[test_idx]
            # training model
            model.fit( x_train, y_train )
            # evaluation model
            train_score = eval_func( model, x_train, y_train )
            train_score["model_name"] = model_name
            test_score = eval_func( model, x_test, y_test )
            test_score["model_name"] = model_name

            train_reports.append( train_score )
            test_reports.append( test_score )

            pbar.update()

    pbar.close()

    # convert list of performance records into dataframe
    train_reports = DataFrame(train_reports)
    test_reports = DataFrame(test_reports)

    metrics_names = [feat for feat in train_reports.columns.tolist() if feat != "model_name"]

    train_reports = train_reports.sort_values(by=["model_name"])
    train_reports = train_reports[["model_name"] + metrics_names]
    test_reports = test_reports.sort_values(by=["model_name"])
    test_reports = test_reports[["model_name"] + metrics_names]

    return train_reports, test_reports
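
A hedged usage sketch for model_selection_cv; it assumes the older scikit-learn API in which StratifiedKFold(y, n_folds=k, ...) is built from the labels and iterated directly, exactly as the function above does, and eval_func must return a dict of metric values (the model names and data here are invented):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

def simple_eval(model, x, y):
    # any dict of metrics works; "model_name" is added by model_selection_cv
    return {"accuracy": model.score(x, y)}

x = np.random.randn(200, 5)
y = np.random.randint(0, 2, 200)
models = {"logreg": LogisticRegression(), "tree": DecisionTreeClassifier()}
train_rep, test_rep = model_selection_cv(models, x, y, k=5, eval_func=simple_eval)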
Code Example #12
File: frame_methods.py  Project: changhiskhan/pandas
class SortValues(object):

    params = [True, False]
    param_names = ['ascending']

    def setup(self, ascending):
        self.df = DataFrame(np.random.randn(1000000, 2), columns=list('AB'))

    def time_frame_sort_values(self, ascending):
        self.df.sort_values(by='A', ascending=ascending)
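
This is an asv (airspeed velocity) benchmark class: the harness calls setup() for each parameter value and then times the time_* methods. A minimal sketch of driving it by hand outside of asv, assuming numpy and DataFrame are imported as in the benchmark module:

bench = SortValues()
for ascending in bench.params:
    bench.setup(ascending)                    # build the 1,000,000 x 2 random frame
    bench.time_frame_sort_values(ascending)   # the operation asv would time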
Code Example #13
File: frame_methods.py  Project: changhiskhan/pandas
class SortIndexByColumns(object):

    def setup(self):
        N = 10000
        K = 10
        self.df = DataFrame({'key1': tm.makeStringIndex(N).values.repeat(K),
                             'key2': tm.makeStringIndex(N).values.repeat(K),
                             'value': np.random.randn(N * K)})

    def time_frame_sort_values_by_columns(self):
        self.df.sort_values(by=['key1', 'key2'])
Code Example #14
File: test_sorting.py  Project: tserafim/pandas
    def test_stable_descending_multicolumn_sort(self):
        nan = np.nan
        df = DataFrame({"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]})
        # test stable mergesort
        expected = DataFrame({"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 2, 9]}, index=[2, 5, 4, 6, 1, 3, 0])
        sorted_df = df.sort_values(["A", "B"], ascending=[0, 1], na_position="first", kind="mergesort")
        assert_frame_equal(sorted_df, expected)

        expected = DataFrame({"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 9, 2]}, index=[2, 5, 4, 6, 1, 0, 3])
        sorted_df = df.sort_values(["A", "B"], ascending=[0, 0], na_position="first", kind="mergesort")
        assert_frame_equal(sorted_df, expected)
Code Example #15
 def getData(self, params):
      top = int(params['top'])
      regex = re.compile(r'^<.*>(\w+.*)</.>')
      df = createDataframe()
      source = [str(regex.findall(line)).strip('[]') for line in df['source'] if line is not None]
      source = dict(Counter(source))
      appSource = source.keys()
      count = source.values()
      tweetSource = DataFrame({'AppSource': appSource, 'Count':count})
      tweetSource = tweetSource[['AppSource', 'Count']]
      tweetSource.sort_values(by='Count', ascending=False, inplace=True)
      return tweetSource[:top]
Code Example #16
File: feature_importance.py  Project: heevery/ohp
def compute_randomized_lr_score(data_set_df, user_info_df, label='gender'):
    # print "\t\t\tfilling nan values..."
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    x = df_filtered.dropna(how='all')
    x_imp = pc.fill_nan_features(x) if x.isnull().any().any() else x.values

    clf = RandomizedLogisticRegression()
    # print "\t\t\tfitting LR model..."
    clf.fit(x_imp.T, y_v)
    feature_importances = DataFrame(clf.scores_, index=df_filtered.index, columns=['importance'])
    feature_importances.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_importances
Code Example #17
File: test_timegrouper.py  Project: sinhrks/pandas
    def test_timegrouper_get_group(self):
        # GH 6914

        df_original = DataFrame({
            'Buyer': 'Carl Joe Joe Carl Joe Carl'.split(),
            'Quantity': [18, 3, 5, 1, 9, 3],
            'Date': [datetime(2013, 9, 1, 13, 0),
                     datetime(2013, 9, 1, 13, 5),
                     datetime(2013, 10, 1, 20, 0),
                     datetime(2013, 10, 3, 10, 0),
                     datetime(2013, 12, 2, 12, 0),
                     datetime(2013, 9, 2, 14, 0), ]
        })
        df_reordered = df_original.sort_values(by='Quantity')

        # single grouping
        expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]],
                         df_original.iloc[[4]]]
        dt_list = ['2013-09-30', '2013-10-31', '2013-12-31']

        for df in [df_original, df_reordered]:
            grouped = df.groupby(pd.Grouper(freq='M', key='Date'))
            for t, expected in zip(dt_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group(dt)
                assert_frame_equal(result, expected)

        # multiple grouping
        expected_list = [df_original.iloc[[1]], df_original.iloc[[3]],
                         df_original.iloc[[4]]]
        g_list = [('Joe', '2013-09-30'), ('Carl', '2013-10-31'),
                  ('Joe', '2013-12-31')]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(['Buyer', pd.Grouper(freq='M', key='Date')])
            for (b, t), expected in zip(g_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group((b, dt))
                assert_frame_equal(result, expected)

        # with index
        df_original = df_original.set_index('Date')
        df_reordered = df_original.sort_values(by='Quantity')

        expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]],
                         df_original.iloc[[4]]]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(pd.Grouper(freq='M'))
            for t, expected in zip(dt_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group(dt)
                assert_frame_equal(result, expected)
Code Example #18
File: test_sorting.py  Project: AlexisMignon/pandas
    def test_sort_index_duplicates(self):

        # with 9816, these are all translated to .sort_values

        df = DataFrame([lrange(5, 9), lrange(4)],
                       columns=['a', 'a', 'b', 'b'])

        with assertRaisesRegexp(ValueError, 'duplicate'):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                df.sort_index(by='a')
        with assertRaisesRegexp(ValueError, 'duplicate'):
            df.sort_values(by='a')

        with assertRaisesRegexp(ValueError, 'duplicate'):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                df.sort_index(by=['a'])
        with assertRaisesRegexp(ValueError, 'duplicate'):
            df.sort_values(by=['a'])

        with assertRaisesRegexp(ValueError, 'duplicate'):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                # multi-column 'by' is separate codepath
                df.sort_index(by=['a', 'b'])
        with assertRaisesRegexp(ValueError, 'duplicate'):
            # multi-column 'by' is separate codepath
            df.sort_values(by=['a', 'b'])

        # with multi-index
        # GH4370
        df = DataFrame(np.random.randn(4, 2),
                       columns=MultiIndex.from_tuples([('a', 0), ('a', 1)]))
        with assertRaisesRegexp(ValueError, 'levels'):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                df.sort_index(by='a')
        with assertRaisesRegexp(ValueError, 'levels'):
            df.sort_values(by='a')

        # convert tuples to a list of tuples
        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by=[('a', 1)])
        expected = df.sort_values(by=[('a', 1)])

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by=('a', 1))
        result = df.sort_values(by=('a', 1))
        assert_frame_equal(result, expected)
Code Example #19
File: explore.py  Project: peterwilliams97/prince
def show_predicted_prob(df, y_test, out_path):
    print('show_predicted_prob: df=%s,y_test=%s,out_path="%s"' %
          (S(df), S(y_test), out_path))
    name = '%s-%d' % (out_path, len(y_test))
    print('~' * 80)
    df = df.loc[y_test.index, :]
    df['hat'] = y_test
    columns = ['hat'] + [col for col in df.columns if col != 'hat']
    df2 = DataFrame()
    for col in columns:
        df2[col] = df[col]
    df2.sort_values('hat', ascending=False, inplace=True)
    df2.to_csv('%s.predicted.csv' % name, index_label='job_id')
Code Example #20
File: analyze.py  Project: enenn/freqtrade
def parse_ticker_dataframe(ticker: list) -> DataFrame:
    """
    Analyses the trend for the given ticker history
    :param ticker: See exchange.get_ticker_history
    :return: DataFrame
    """
    columns = {'C': 'close', 'V': 'volume', 'O': 'open', 'H': 'high', 'L': 'low', 'T': 'date'}
    frame = DataFrame(ticker) \
        .drop('BV', 1) \
        .rename(columns=columns)
    frame['date'] = to_datetime(frame['date'], utc=True, infer_datetime_format=True)
    frame.sort_values('date', inplace=True)
    return frame
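
The ticker argument is a list of OHLCV dicts keyed by the exchange fields in the columns mapping above ('T', 'O', 'H', 'L', 'C', 'V', plus 'BV', which is dropped). A hedged usage sketch with made-up values:

ticker = [
    {'T': '2017-01-01T00:05:00', 'O': 1.1, 'H': 1.3, 'L': 1.0, 'C': 1.2, 'V': 80.0, 'BV': 90.0},
    {'T': '2017-01-01T00:00:00', 'O': 1.0, 'H': 1.2, 'L': 0.9, 'C': 1.1, 'V': 100.0, 'BV': 105.0},
]
frame = parse_ticker_dataframe(ticker)
print(frame[['date', 'open', 'high', 'low', 'close', 'volume']])  # rows come back sorted by date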
Code Example #21
File: products.py  Project: biosustain/cameo
 def _search_by_inchi_fuzzy(self, inchi):
     # TODO: use openbabel if available
     matches = difflib.get_close_matches(inchi, self.data_frame.InChI.dropna(), n=5, cutoff=.8)
     ranks = {match: i for i, match in enumerate(matches)}
     selection = DataFrame(self.data_frame[self.data_frame.InChI.isin(matches)])
     selection['search_rank'] = selection.name.map(ranks)
     return selection.sort_values('search_rank')
Code Example #22
    def data_frame(self):
        if self._processed_solutions is None:
            self._process_solutions()

        if self._manipulation_type == "reactions":
            data_frame = DataFrame(self._processed_solutions)
        else:
            columns = self._processed_solutions.columns.difference(["reactions", "size"])
            aggregation_functions = {k: self.__aggregation_function.get(k, lambda x: x.values[0]) for k in columns}
            data_frame = self._processed_solutions.groupby(["reactions", "size"], as_index=False) \
                .aggregate(aggregation_functions)
            data_frame = data_frame[self._processed_solutions.columns]

        data_frame.sort_values("size", inplace=True)
        data_frame.index = [i for i in range(len(data_frame))]
        return data_frame
Code Example #23
File: test_sorting.py  Project: AlexisMignon/pandas
    def test_sort_index_different_sortorder(self):
        A = np.arange(20).repeat(5)
        B = np.tile(np.arange(5), 20)

        indexer = np.random.permutation(100)
        A = A.take(indexer)
        B = B.take(indexer)

        df = DataFrame({'A': A, 'B': B,
                        'C': np.random.randn(100)})

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by=['A', 'B'], ascending=[1, 0])
        result = df.sort_values(by=['A', 'B'], ascending=[1, 0])

        ex_indexer = np.lexsort((df.B.max() - df.B, df.A))
        expected = df.take(ex_indexer)
        assert_frame_equal(result, expected)

        # test with multiindex, too
        idf = df.set_index(['A', 'B'])

        result = idf.sort_index(ascending=[1, 0])
        expected = idf.take(ex_indexer)
        assert_frame_equal(result, expected)

        # also, Series!
        result = idf['C'].sort_index(ascending=[1, 0])
        assert_series_equal(result, expected['C'])
Code Example #24
File: overview.py  Project: patrickallo/mathpracmod
def thread_participation_evolution(
        pm_frame, project, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to threads in project with n as thresh.
    Returns DataFrame, index, selection and title for data for use
    by stacked bar-plot and heatmap functions."""
    if not research_only:
        thread_type = 'all threads'
        title = "Participation per thread in {} (threshold = {})".format(
            project, n)
    else:
        thread_type = 'research threads'
        title = "Participation per thread in {}\
                 (threshold = {}, only research-threads)".format(project, n)
    data = pm_frame.loc[project][['basic', thread_type]]
    data = data.dropna()
    all_authors = set().union(*data[thread_type, 'authors'])
    author_thread = DataFrame(columns=all_authors)
    for author in author_thread.columns:
        author_thread[author] = data[thread_type, 'authors'].apply(
            lambda thread, author=author: author in thread)
    author_thread = author_thread.T
    author_thread = author_thread.sort_values(by=data.index.tolist(),
                                              ascending=False)
    author_thread = author_thread.drop(
        "Anonymous") if skip_anon else author_thread
    author_thread.columns.name = "Threads"
    select = author_thread.sum(axis=1) >= n
    return author_thread, data.index, select, title
Code Example #25
File: test_sorting.py  Project: AlexisMignon/pandas
 def test_stable_descending_sort(self):
     # GH #6399
     df = DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']],
                    columns=['sort_col', 'order'])
     sorted_df = df.sort_values(by='sort_col', kind='mergesort',
                                ascending=False)
     assert_frame_equal(df, sorted_df)
Code Example #26
File: test_dtypes.py  Project: forking-repos/pandas
    def test_astype_categorical_to_other(self):

        value = np.random.RandomState(0).randint(0, 10000, 100)
        df = DataFrame({'value': value})
        labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        df = df.sort_values(by=['value'], ascending=True)
        df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
                                   right=False, labels=cat_labels)

        s = df['value_group']
        expected = s
        tm.assert_series_equal(s.astype('category'), expected)
        tm.assert_series_equal(s.astype(CategoricalDtype()), expected)
        msg = (r"could not convert string to float|"
               r"invalid literal for float\(\)")
        with pytest.raises(ValueError, match=msg):
            s.astype('float64')

        cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
        exp = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
        tm.assert_series_equal(cat.astype('str'), exp)
        s2 = Series(Categorical(['1', '2', '3', '4']))
        exp2 = Series([1, 2, 3, 4]).astype(int)
        tm.assert_series_equal(s2.astype('int'), exp2)

        # object don't sort correctly, so just compare that we have the same
        # values
        def cmp(a, b):
            tm.assert_almost_equal(
                np.sort(np.unique(a)), np.sort(np.unique(b)))

        expected = Series(np.array(s.values), name='value_group')
        cmp(s.astype('object'), expected)
        cmp(s.astype(np.object_), expected)

        # array conversion
        tm.assert_almost_equal(np.array(s), np.array(s.values))

        # valid conversion
        for valid in [lambda x: x.astype('category'),
                      lambda x: x.astype(CategoricalDtype()),
                      lambda x: x.astype('object').astype('category'),
                      lambda x: x.astype('object').astype(
                          CategoricalDtype())
                      ]:

            result = valid(s)
            # compare series values
            # internal .categories can't be compared because it is sorted
            tm.assert_series_equal(result, s, check_categorical=False)

        # invalid conversion (these are NOT a dtype)
        msg = (r"invalid type <class 'pandas\.core\.arrays\.categorical\."
               "Categorical'> for astype")
        for invalid in [lambda x: x.astype(Categorical),
                        lambda x: x.astype('object').astype(Categorical)]:
            with pytest.raises(TypeError, match=msg):
                invalid(s)
Code Example #27
File: sic_report.py  Project: lv10/ross_sea
def land_sic_overlap_timeseries(instrument,
                                title="Land-Sea Ice Border Variations"):
    """
        Time Series that shows the percentage variations of the land mask
        border given the expansion of sea ice in VIRS.
    """

    files = data.file_names(instrument_id=data.INSTRUMENT_MAP.get(instrument))
    out = []

    for idx, mat in enumerate(data.mat_generator(files)):

        sic = SIC(files[idx])
        lm = LM(files[idx])

        sic_surface = sic.surface(boolean=False)
        lm_surface = lm.silhoutte()

        silhoutte_freq = itemfreq(lm_surface)
        border = silhoutte_freq[1][1]

        merge = np.add(sic_surface, lm_surface)
        merge_freq = itemfreq(merge)
        intercept = merge_freq[2][1]

        land_ice_overlap = (float(intercept) / border) * 100
        temp = {'timestamp': lm.title, 'intercept': land_ice_overlap}
        out.append(temp)

    index = [elem['timestamp'] for elem in out]
    df = DataFrame(out, index=index)
    sdf = df.sort_values(by='timestamp')
    sdf.plot(title=title)
    plt.show()
Code Example #28
File: plot_results.py  Project: rahlk/Bellwether
def plot_stuff():
    pd_list = {}
    compare_tl = []
    compare_tl_head = []
    for vars in list_communities():
        for var in vars:
            pd_list.update({var.split("/")[-1].split(".")[0]: DataFrame(
                sorted(read_csv(var)[["Name", "G"]].values, key=lambda x: x[0], reverse=True))})

        N = len(pd_list.keys())  # Find number of elements
        stats = np.zeros((N, N))  # Create a 2-D Array to hold the stats
        keys = sorted(pd_list, reverse=True)  # Find data sets (Sort alphabetically, backwards)
        for idx, key in enumerate(keys):  # Populate 2-D array
            for i, val in enumerate(pd_list[key][1].values):
                if not i == idx:  # Ensure self values are set to zero
                    stats[i, idx] = val

        stats = DataFrame(stats, columns=keys, index=keys)
        # stats["Mean"] = stats.median(axis=0)
        # set_trace()
        stats["Mean"] = find_mean(stats)
        stats["Std"] = find_std(stats)
        stats = stats.sort_values(by="Mean", axis=0, ascending=False, inplace=False)
        print(tabulate(stats, showindex=True, headers=stats.columns, tablefmt="fancy_grid"))
        print("\n")
        save_path = os.path.abspath("/".join(var.split("/")[:-2]))
        method = var.split("/")[-2]+".xlsx"
        stats.to_excel(os.path.join(save_path, method))
        compare_tl.append(stats.sort_index(inplace=False)["Mean"].values.tolist())
        compare_tl_head.append(method)
    # set_trace()
    compare_tl = DataFrame(np.array(compare_tl).T, columns=compare_tl_head, index=stats.index.sort_values())
    save_path_2 = os.path.join(os.path.abspath("/".join(var.split("/")[:-3])), os.path.abspath("".join(var.split("/")[-3]))+".xlsx")
    compare_tl.to_excel(save_path_2)
Code Example #29
File: test_operators.py  Project: BranYang/pandas
    def test_numeric_like_ops(self):

        df = DataFrame({'value': np.random.randint(0, 10000, 100)})
        labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        df = df.sort_values(by=['value'], ascending=True)
        df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
                                   right=False, labels=cat_labels)

        # numeric ops should not succeed
        for op in ['__add__', '__sub__', '__mul__', '__truediv__']:
            pytest.raises(TypeError,
                          lambda: getattr(df, op)(df))

        # reduction ops should not succeed (unless specifically defined, e.g.
        # min/max)
        s = df['value_group']
        for op in ['kurt', 'skew', 'var', 'std', 'mean', 'sum', 'median']:
            pytest.raises(TypeError,
                          lambda: getattr(s, op)(numeric_only=False))

        # mad technically works because it takes always the numeric data

        # numpy ops
        s = Series(Categorical([1, 2, 3, 4]))
        pytest.raises(TypeError, lambda: np.sum(s))

        # numeric ops on a Series
        for op in ['__add__', '__sub__', '__mul__', '__truediv__']:
            pytest.raises(TypeError, lambda: getattr(s, op)(2))

        # invalid ufunc
        pytest.raises(TypeError, lambda: np.log(s))
Code Example #30
File: overview.py  Project: patrickallo/mathpracmod
def project_participation_evolution(
        pm_frame, all_authors, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to projects with n as thresh.
    Returns DataFrame, index, selection and title for data for use
    by stacked bar-plot and heatmap functions."""
    if not research_only:
        thread_type = 'all threads'
        data, _ = get_last(pm_frame, thread_type)
        all_authors = list(all_authors)
        title = "Participation per project in Polymath\
                 (threshold = {})".format(n)
    else:
        thread_type = 'research threads'
        data, _ = get_last(pm_frame, thread_type)
        all_authors = set().union(
            *data['research threads', 'authors (accumulated)'])
        title = "Participation per project in Polymath\
                 (threshold = {}, only research-threads)".format(n)
    data.index = data.index.droplevel(1)
    author_project = DataFrame(columns=all_authors)
    for author in author_project.columns:
        author_project[author] = data[
            thread_type, 'authors (accumulated)'].apply(
                lambda project, author=author: author in project)
    author_project = author_project.T
    author_project = author_project.sort_values(by=data.index.tolist(),
                                                ascending=False)
    author_project = author_project.drop(
        "Anonymous") if skip_anon else author_project
    select = author_project.sum(axis=1) >= n
    return author_project, data.index, select, title
Code Example #31
def group_by_booking_date(dataframe: pd.DataFrame):
    key = lambda k: (k.year, k.month, k.day)
    dataframe_sort_creation = dataframe.sort_values(by='creation_date',
                                                    ascending=True)  # new DataFrame so the original DataFrame stays unchanged
    print(dataframe_sort_creation.groupby(dataframe_sort_creation['creation_date'].apply(key)).mean()['amount'])
    print(dataframe.groupby(dataframe['booking_date'].apply(key)))
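
A hedged usage sketch for the function above; the column names are the ones it expects, the values are invented:

import pandas as pd

df = pd.DataFrame({
    'creation_date': pd.to_datetime(['2020-01-01', '2020-01-01', '2020-01-02']),
    'booking_date': pd.to_datetime(['2020-01-03', '2020-01-04', '2020-01-04']),
    'amount': [10.0, 20.0, 30.0],
})
group_by_booking_date(df)  # prints mean amount per creation day, then the groupby object keyed by booking day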
Code Example #32
File: test_multi.py  Project: sughosh360/pandas
    def test_left_join_index_multi_match_multiindex(self):
        left = DataFrame(
            [
                ["X", "Y", "C", "a"],
                ["W", "Y", "C", "e"],
                ["V", "Q", "A", "h"],
                ["V", "R", "D", "i"],
                ["X", "Y", "D", "b"],
                ["X", "Y", "A", "c"],
                ["W", "Q", "B", "f"],
                ["W", "R", "C", "g"],
                ["V", "Y", "C", "j"],
                ["X", "Y", "B", "d"],
            ],
            columns=["cola", "colb", "colc", "tag"],
            index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8],
        )

        right = DataFrame(
            [
                ["W", "R", "C", 0],
                ["W", "Q", "B", 3],
                ["W", "Q", "B", 8],
                ["X", "Y", "A", 1],
                ["X", "Y", "A", 4],
                ["X", "Y", "B", 5],
                ["X", "Y", "C", 6],
                ["X", "Y", "C", 9],
                ["X", "Q", "C", -6],
                ["X", "R", "C", -9],
                ["V", "Y", "C", 7],
                ["V", "R", "D", 2],
                ["V", "R", "D", -1],
                ["V", "Q", "A", -3],
            ],
            columns=["col1", "col2", "col3", "val"],
        ).set_index(["col1", "col2", "col3"])

        result = left.join(right, on=["cola", "colb", "colc"], how="left")

        expected = DataFrame(
            [
                ["X", "Y", "C", "a", 6],
                ["X", "Y", "C", "a", 9],
                ["W", "Y", "C", "e", np.nan],
                ["V", "Q", "A", "h", -3],
                ["V", "R", "D", "i", 2],
                ["V", "R", "D", "i", -1],
                ["X", "Y", "D", "b", np.nan],
                ["X", "Y", "A", "c", 1],
                ["X", "Y", "A", "c", 4],
                ["W", "Q", "B", "f", 3],
                ["W", "Q", "B", "f", 8],
                ["W", "R", "C", "g", 0],
                ["V", "Y", "C", "j", 7],
                ["X", "Y", "B", "d", 5],
            ],
            columns=["cola", "colb", "colc", "tag", "val"],
            index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8],
        )

        tm.assert_frame_equal(result, expected)

        result = left.join(right,
                           on=["cola", "colb", "colc"],
                           how="left",
                           sort=True)

        expected = expected.sort_values(["cola", "colb", "colc"],
                                        kind="mergesort")

        tm.assert_frame_equal(result, expected)
Code Example #33
File: pandas.py  Project: ICB-DCM/parPE
def constructEdataFromDataFrame(
        df: pd.DataFrame,
        model: AmiciModel,
        condition: pd.Series,
        by_id: Optional[bool] = False) -> amici.amici.ExpData:
    """
    Constructs an ExpData instance according to the provided Model
    and DataFrame.

    :param df:
        pd.DataFrame with Observable Names/Ids as columns.
        Standard deviations may be specified by appending '_std' as suffix.

    :param model:
        Model instance.

    :param condition:
        pd.Series with FixedParameter Names/Ids as columns.
        Preequilibration conditions may be specified by appending
        '_preeq' as suffix. Presimulation conditions may be specified by
        appending '_presim' as suffix.

    :param by_id:
        Indicate whether in the arguments, column headers are based on ids or
         names. This should correspond to the way `df` and `condition` was
         created in the first place.

    :return:
        ExpData instance.
    """
    # initialize edata
    edata = amici.ExpData(model.get())

    # timepoints
    df = df.sort_values(by='time', ascending=True)
    edata.setTimepoints(df['time'].values.astype(float))

    # get fixed parameters from condition
    overwrite_preeq = {}
    overwrite_presim = {}
    for par in list(_get_names_or_ids(model, 'FixedParameter', by_id=by_id)):
        if par + '_preeq' in condition.keys() \
                and not math.isnan(condition[par + '_preeq'].astype(float)):
            overwrite_preeq[par] = condition[par + '_preeq'].astype(float)
        if par + '_presim' in condition.keys() \
                and not math.isnan(condition[par + '_presim'].astype(float)):
            overwrite_presim[par] = condition[par + '_presim'].astype(float)

    # fill in fixed parameters
    edata.fixedParameters = condition[_get_names_or_ids(
        model, 'FixedParameter', by_id=by_id)].astype(float).values

    # fill in preequilibration parameters
    if any([
            overwrite_preeq[key] != condition[key]
            for key in overwrite_preeq.keys()
    ]):
        edata.fixedParametersPreequilibration = \
            _get_specialized_fixed_parameters(
                model, condition, overwrite_preeq, by_id=by_id)
    elif len(overwrite_preeq.keys()):
        edata.fixedParametersPreequilibration = copy.deepcopy(
            edata.fixedParameters)

    # fill in presimulation parameters
    if any([
            overwrite_presim[key] != condition[key]
            for key in overwrite_presim.keys()
    ]):
        edata.fixedParametersPresimulation = _get_specialized_fixed_parameters(
            model, condition, overwrite_presim, by_id=by_id)
    elif len(overwrite_presim.keys()):
        edata.fixedParametersPresimulation = copy.deepcopy(
            edata.fixedParameters)

    # fill in presimulation time
    if 't_presim' in condition.keys():
        edata.t_presim = float(condition['t_presim'])

    # fill in data and stds
    for obs_index, obs in enumerate(
            _get_names_or_ids(model, 'Observable', by_id=by_id)):
        if obs in df.keys():
            edata.setObservedData(df[obs].values.astype(float), obs_index)
        if obs + '_std' in df.keys():
            edata.setObservedDataStdDev(df[obs + '_std'].values.astype(float),
                                        obs_index)

    return edata
Code Example #34
class ParamGA(object):
    def __init__(self, QID_trn, X_trn, Y_trn, QID_dev, X_dev, Y_dev, 
                 model_name='RandomForestClassifier', param_funcs=dict(), 
                 param_static=dict(), param_index=dict(),
                 pair_wise=False):
        self.QID_trn, self.X_trn, self.Y_trn = QID_trn, X_trn, Y_trn
        self.QID_dev, self.X_dev, self.Y_dev = QID_dev, X_dev, Y_dev
        self.rank_dev = DataFrame({}, index=self.X_dev.index)
        self.rank_dev['Label'] = self.Y_dev
        self.rank_dev['QID'] = self.QID_dev
        self.counter = itertools.count()
        self.mrrs = dict()
        self.model_name = model_name
        self.param_funcs = param_funcs
        self.param_static = param_static
        self.param_index = param_index
        self.pair_wise = pair_wise
        if self.pair_wise:
            self.pair_ranker = PairWiseRanker(self.QID_trn, self.X_trn, self.Y_trn)
            self.pair_ranker.init_predict(self.QID_dev, self.X_dev)
        else:
            self.models = dict()
        
    def _gen_param(self):
        param = [None] * len(self.param_index)
        for pn, func in self.param_funcs.items():
            param[self.param_index[pn]] = func['gen']()
        return param

    def _evaluate(self, indiv):
        eval_str = self.model_name + '('
        flag = ''
        for pn in self.param_funcs:
            gene = indiv[self.param_index[pn]]
            if type(gene) == str:
                eval_str += flag + pn + '="' + str(gene) + '"'
            else:
                eval_str += flag + pn + '=' + str(gene)
            flag = ', '
        for pn in self.param_static:
            if type(self.param_static[pn]) == str:
                eval_str += flag + pn + '="' + str(self.param_static[pn]) + '"'
            else:
                eval_str += flag + pn + '=' + str(self.param_static[pn])
            flag = ', '
        eval_str += ')'
        model = eval(eval_str)
        model_idx = self.counter.next()
        if self.pair_wise:
            self.pair_ranker.fit(model, model_idx)
            pred = self.pair_ranker.do_predict(model_idx)
        else:
            model.fit(self.X_trn, self.Y_trn)
            self.models[model_idx] = model
            pred = Series(model.predict_proba(self.X_dev)[:, 1], index=self.X_dev.index)
        self.rank_dev['pred'] = pred # pred must be a Series, not an array
        self.rank_dev.sort_values(['QID', 'pred'], inplace=True, ascending=False)
        grp = self.rank_dev.Label.groupby(self.rank_dev.QID)
        mrr = MRR(grp, keep_no_ans=False)
        self.mrrs[model_idx] = mrr
        print '    >', model_idx, np.round(mrr, 4), indiv
        return mrr, 

    def _mut_indiv(self, indiv, indiv_pb):
        for pn, func in self.param_funcs.items():
            if random.random() < indiv_pb:
                indiv[self.param_index[pn]] = func['mut']()

    def run(self, NPOP=30, NGEN=10, CXPB=0.5, MUTPB=0.2):
        self.ga = MyGA(self._gen_param, self._evaluate, self._mut_indiv, CXPB=CXPB, MUTPB=MUTPB)
        self.ga.init_pop(NPOP=NPOP)
        self.ga.iterate(NGEN=NGEN)
Code Example #35
# Analyze data for users with more than 7 page views
times = counts1_.index[7:]
bins = [7, 100, 1000, 50000]
cats = pd.cut(times, bins, right=True, labels=['8~100', '101~1000', '1000以上'])
e = cats.value_counts()
e = DataFrame(e, columns=[u'用户数'])
e.index.name = u'点击次数'

# In[22]:

e[u'用户数'] = np.nan
e.ix[u'8~100', u'用户数'] = a.loc[8:100, :][u'用户数'].sum()
e.ix['101~1000', u'用户数'] = a.loc[101:1000, :][u'用户数'].sum()
e.ix['1000以上', u'用户数'] = a.loc[1001:, :][u'用户数'].sum()
e.sort_values(by=u'用户数', ascending=False, inplace=True)
e.reset_index(inplace=True)

e

# In[23]:

#-----* 3 *----- Analyze the behavior of users who browsed only once

# 读取数据库数据
engine = create_engine(
    'mysql+pymysql://root:@127.0.0.1:3306/jing?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)

# In[24]:
Code Example #36
df = DataFrame(obs)
num = ['min_exec', 'average', 'max_exec']
for c in num:
    df[c] /= df['N']
df.head()

##########################
# Let's compute the gains.

gains = []
for nb in set(df['nb']):
    gain = parallized_gain(df[df.nb == nb])
    gains.append(dict(nb=nb, gain=gain))

dfg = DataFrame(gains)
dfg = dfg.sort_values('nb').reset_index(drop=True).copy()
dfg

##########################################
# Graph.

ax = dfg.set_index('nb').plot()
ax.set_title(
    "Parallelization gain depending\non the number of trees\n(max_depth=6).")

##############################
# That does not answer the question we are looking for,
# as we would like to know the best threshold *th*
# which defines the number of observations for which
# we should parallelize. This number depends on the number
# of trees. A gain > 1 means the parallelization is worthwhile.
Code Example #37
def input_value():
    mode = input("naver or daum ?")
    database = input("Database ? ")
    
    start_year = str(input("Start Year ? "))
    start_month = str(input("Start Month ? "))
    start_day = str(input("Start Day ? "))

    end_year = str(input("End Year ? "))
    end_month = str(input("End Month ? "))
    end_day = str(input("End Day ? "))
    
    reply_num = int(input("Minimum number of replies? "))
    mode_database = mode+ '_' + database
    
    conn = db.make_connect(mode_database)
    news = db.read_by_table(mode + "_articles")
    reply = db.read_by_table(mode + "_replies")
    news_df = DataFrame(news)
    reply_df = DataFrame(reply)
    news_df.sort_values(by = ['article_date'], axis = 0, inplace = True)
    news_df = news_df[news_df.article_date != '-']
    news_df = news_df.dropna()
    
    if(mode == 'naver') : 
        news_df['article_date'] = news_df['article_date'].str.replace('최종수정 ', '')
        news_df['article_date'] = news_df['article_date'].str.replace('.', '-')
        
        start_date = start_year + '-' + start_month + '-' + start_day
        end_date = end_year + '-' + end_month + '-' + end_day
    
    elif(mode == 'daum') : 
        news_df['article_date'] = news_df['article_date'].str.replace('수정 ', '')
        news_df['article_date'] = news_df['article_date'].str.replace('입력 ', '')
        
        start_date = start_year + '.' + start_month + '.' + start_day
        end_date = end_year + '.' + end_month + '.' + end_day
      
    news_df = news_df[news_df.article_date > start_date]
    news_df = news_df[news_df.article_date < end_date]
    
    reply_df['R_Like+Bad'] = reply_df['R_Like'] + reply_df['R_Bad']
    reply_df['reply_date'] = reply_df['reply_date'].apply(lambda e: e[:16])
    
    group_reply_df = reply_df.groupby('Article_ID').size().to_frame('R_count')
    group_reply_df = group_reply_df[group_reply_df['R_count'] > reply_num]
    group_reply_df = group_reply_df.reset_index()
          
    news_df = news_df[news_df['Article_ID'].isin(group_reply_df['Article_ID'])]
        
    temp_reply_df = DataFrame({'Article_ID': reply_df['Article_ID'],'R_Like': reply_df['R_Like'], 'R_Bad': reply_df['R_Bad'], 'R_Like+Bad': reply_df['R_Like+Bad']})
    temp_reply_df = temp_reply_df.groupby('Article_ID').sum()
    
    temp_news_df = DataFrame({'Article_ID': news_df['Article_ID'],'Title': news_df['Title'], 'article_date' : news_df['article_date']})
    
    result_df = pd.merge(temp_news_df, group_reply_df, on = 'Article_ID')
    rank_value = int(input("Top rank? "))
    result_df = pd.merge(result_df, temp_reply_df, on = 'Article_ID')

    # Sort by Like+Bad and print as many rows as the requested rank
    result_df = result_df.sort_values(by=['R_Like+Bad'], ascending=False)
    print('Total number of articles with at least ' + str(reply_num) + ' replies: ' + str(len(result_df)))
    print(result_df[0:rank_value])
    
    return result_df,reply_df
Code Example #38
        covid_pcr.append(result2)
    elif (result1 is not None and result2 is None):
        covid_pcr.append(result1)

print len(covid_pcr)

inputDF['covid_pcr'] = covid_pcr

# threshold = len(inputDF)*0.7

print "before drop " + str(len(inputDF.columns.values))
inputDF = inputDF.dropna(axis=1, how='all')
print "after drop " + str(len(inputDF.columns.values))

inputDF = inputDF.dropna(axis=0, how='any', thresh=40)

threshold = len(inputDF) * 0.7
inputDF = inputDF.dropna(axis=1, how='any', thresh=threshold)

percent_missing = inputDF.isnull().sum() * 100 / len(inputDF)
missing_value_df = DataFrame({
    'column_name': inputDF.columns,
    'percent_missing': percent_missing
})
missing_value_df.sort_values('percent_missing', inplace=True)

inputDF = inputDF.dropna(axis=0,
                         how='any',
                         thresh=len(inputDF.columns.values) * 0.8)

inputDF.to_sql('treated_dataset', con=dbconn.connection, index=False)
Code Example #39
'''
   D  C  B  A
b  2  3  1  0
a  6  7  5  4
'''
print 'Sort a DataFrame by column values'
frame = DataFrame({'b':[4, 7, -3, 2], 'a':[0, 1, 0, 1]})
print frame
'''
   a  b
0  0  4
1  1  7
2  0 -3
3  1  2
'''
print frame.sort_values(by = 'b') # sort by the values in column b
'''
   a  b
2  0 -3
3  1  2
0  0  4
1  1  7
'''
print frame.sort_values(by = ['a', 'b']) # sort by column a first, then column b
'''
   a  b
2  0 -3
0  0  4
3  1  2
1  1  7
'''
Code Example #40
def main(offset=0):
    daily001 = main_session.query(models.DailyPro).filter(
        models.DailyPro.ts_code == '000001.SZ').order_by(
            models.DailyPro.trade_date.desc()).all()
    LAST_MARKET_DATE = daily001[offset].trade_date

    data_frame = DataFrame()
    for i, stock_basic in enumerate(
            main_session.query(models.StockBasicPro).all()):
        try:
            for key in models.StockBasicPro.keys:
                data_frame.loc[i, key] = getattr(stock_basic, key)

            daily = main_session.query(models.DailyPro).filter(
                models.DailyPro.ts_code == stock_basic.ts_code,
                models.DailyPro.trade_date <= LAST_MARKET_DATE).order_by(
                    models.DailyPro.trade_date.desc()).limit(
                        sampling_count).all()
            ma_10 = api.daily_close_ma(daily=daily, step=10)
            ma_20 = api.daily_close_ma(daily=daily, step=20)
            data_frame.loc[i, COL_MA_10] = ma_10[0]
            data_frame.loc[i, COL_MA_20] = ma_20[0]
            data_frame.loc[i, COL_MA_10_SLOPE] = round(
                (ma_10[0] / ma_10[1] - 1) * 100, 2)
            data_frame.loc[i, COL_MA_20_SLOPE] = round(
                (ma_20[0] / ma_20[1] - 1) * 100, 2)
            data_frame.loc[i, COL_LASTPRICE] = daily[0].close
            data_frame.loc[i, COL_INDAY_CHG] = round(
                daily[0].close - daily[0].open, 2)
            cons = main_session.query(models.ConceptPro).join(
                models.ConceptDetailPro,
                models.ConceptPro.code == models.ConceptDetailPro.code).filter(
                    models.ConceptDetailPro.ts_code ==
                    stock_basic.ts_code).all()
            concept_value = ''
            for con in cons:
                concept_value = concept_value + '{c}, '.format(c=con.name)
            data_frame.loc[i, 'concept'] = concept_value

            daily_basic = main_session.query(models.DailyBasicPro).filter(
                models.DailyBasicPro.ts_code == stock_basic.ts_code).first()
            if daily_basic:
                data_frame.loc[i, 'circ_mv'] = '{}亿'.format(
                    round(daily_basic.circ_mv / 10000, 2))

        except Exception as e:
            print('excetion in index:{index} {code} {name}'.format(
                index=i, code=stock_basic.ts_code, name=stock_basic.name))
            continue
        print('##### {i} #####'.format(i=i))

    data_frame = data_frame[(data_frame[COL_MA_10] > data_frame[COL_MA_20])
                            &
                            (data_frame[COL_LASTPRICE] < data_frame[COL_MA_10])
                            & (data_frame[COL_INDAY_CHG] > 0)]
    # data_frame = data_frame.sort_values(by=COL_MAXGAP, ascending=False).reset_index(drop=True)
    # data_frame = data_frame.iloc[:200]

    data_frame = data_frame.sort_values(by=COL_MA_20_SLOPE,
                                        ascending=False).reset_index(drop=True)
    data_frame = data_frame.loc[:, [
        'ts_code', 'name', 'industry', COL_LASTPRICE, 'concept', 'circ_mv'
    ]]

    file_name = '../../logs/{date}@MA_10_20.csv'.format(date=LAST_MARKET_DATE)
    # print(fileName)
    with open(file_name, 'w', encoding='utf8') as file:
        data_frame.to_csv(file)
Code Example #41
type(frame.e)
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
frame.sort_index()
frame.sort_index(axis=1)
frame.sort_index(axis=1, ascending=False)
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
frame.sort_values(by='b')
frame.sort_values(by=['a','b'])

obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
obj.rank(method='first')
obj.rank(ascending=False, method='max') # method is tie-breaking method

frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame
frame.rank(axis='columns')

obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj
obj.index.is_unique
Code Example #42
def extract_preds_from_test_set(gen, model, reset_data=None):
    """
    Run a model on specified number of batches extracted from a generator object.

    :param gen: generator object that returns a batch of (test set samples, test set targets).
    Generator should come from train.gen_seq_scans(..., test_set=True) so that batches are created sequentially.
    :param model: keras.models.Sequential model object
    :param reset_data: pandas.DataFrame with data from which generator came from, use to determine
    when to reset model states. Use only with a stateful model.
    :return: y, preds 2-tuple of arrays, the original target array (y) and the
    model predicted array (preds)
    """
    y = list()
    preds = list()
    idx = list()

    proceed = True
    batch_ctr = 0
    prev_end_idx = None

    while proceed:
        if (batch_ctr % 10 == 0) and (batch_ctr > 0):
            print('batch number {0}, total predictions made = {1}'.format(
                batch_ctr, len(y)))

        # pull the next sequential batch from the generator.
        x_batch, y_batch, idx_batch = next(gen)

        # if using a stateful model and there's a discontinuity between consecutive batches, reset model.
        if (reset_data is not None) and (batch_ctr > 0):
            batch_step = reset_data.pos.iloc[
                idx_batch[0]] - reset_data.pos.iloc[prev_end_idx]
            prev_end_idx = idx_batch[-1]

            if batch_step != 1:
                print('batch discontinuity found. resetting model state.')
                model.reset_states()

        # store end index from current batch.
        else:
            prev_end_idx = idx_batch[-1]

        preds_batch = model.predict(x_batch, batch_size=x_batch.shape[0])

        y += y_batch.ravel().tolist()
        preds += preds_batch.ravel().tolist()
        idx += idx_batch

        # check whether or not any batch indices are repeated. If they are,
        # this means that batches have reset to the beginning of the test set
        # as the entire test set has been covered.
        proceed = len(set(idx)) == len(idx)
        batch_ctr += 1

    # remove duplicate predictions and arrange predictions in genome order.
    preds_df = DataFrame({'y': y, 'preds': preds, 'idx': idx})
    preds_df.drop_duplicates(subset=['idx'], inplace=True)
    preds_df.sort_values(['idx'], inplace=True)
    preds_df.reset_index(drop=True, inplace=True)

    return preds_df
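
A minimal usage sketch for the function above. The generator factory follows the docstring's reference to train.gen_seq_scans; test_df, model and batch_size are assumptions for illustration, not confirmed by the source.

from train import gen_seq_scans  # assumed module/function named in the docstring

gen = gen_seq_scans(test_df, batch_size=64, test_set=True)   # sequential batches (hypothetical arguments)
preds_df = extract_preds_from_test_set(gen, model, reset_data=test_df)

# preds_df is already deduplicated and sorted by the original sample index,
# so it can be scored or written out directly.
print(preds_df[['y', 'preds']].head())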
コード例 #43
ファイル: nepc.py プロジェクト: BillyMitchell97/nepc
    def summary(self, metadata=None, lower=None, upper=None, sort=[]):
        """Summarize the NEPC model.

        Prints the following information:
            - Number of cross sections in the model
            - Number of cross sections matching metadata, if provided

        Returns a stylized Pandas dataframe with headers given by:

        headers = ["cs_id", "specie", "lhsA", "rhsA", "process",
                   "reaction", "threshold", "E_peak", "E_upper",
                   "sigma_max", "lpu", "upu"]

        Parameters
        ----------
        metadata: dict
            see :attr:`.CS.metadata`
        lower : int
            lower bound of model index to include in summary
        upper : int
            upper bound of model index to include in summary
        sort : list[str]
            headers by which the stylized Pandas table is sorted

        Returns
        -------
        cs_df : pandas.io.formats.style.Styler
            A stylized Pandas DataFrame containing the cs_id, specie, lhsA,
            rhsA, process, reaction, threshold, peak and upper electron
            energies (E_peak, E_upper), maximum sigma (sigma_max), and
            lpu/upu for each cross section in the model (or the subset of
            the model matching :obj:`metadata`, if provided)

        """
        summary_list = []

        headers = ["cs_id", "specie", "lhsA", "rhsA", "process",
                   "reaction", "threshold", "E_peak", "E_upper",
                   "sigma_max", "lpu", "upu"]

        max_e_peak = 0
        min_e_peak = 100000
        max_e_upper = 0
        max_peak_sigma = 0
        min_peak_sigma = 1
        max_lpu = 0.000000001
        max_upu = 0.000000001

        print('Number of cross sections in model: {:d}'.format(len(self.cs)))
        if metadata is not None:
            cs_subset = self.subset(metadata=metadata)
            print('Number of cross sections with '
                  'matching metadata: {:d}'.format(len(cs_subset)))
        else:
            cs_subset = self.cs


        for cs in cs_subset:
            csdata = np.array(list(zip(cs.data['e'], cs.data['sigma'])))
            e_peak = csdata[np.argmax(csdata[:, 1]), 0]
            cs_peak_sigma = np.max(csdata[:, 1])
            e_upper = np.max(csdata[csdata[:, 1] != 0.0][:, 0])
            if e_peak > max_e_peak:
                max_e_peak = e_peak
            if e_peak < min_e_peak:
                min_e_peak = e_peak
            if e_upper > max_e_upper:
                max_e_upper = e_upper
            if cs_peak_sigma > max_peak_sigma:
                max_peak_sigma = cs_peak_sigma
            if cs_peak_sigma < min_peak_sigma:
                min_peak_sigma = cs_peak_sigma
            reaction = reaction_latex(cs)
            cs_lpu = cs.metadata["lpu"]
            cs_upu = cs.metadata["upu"]
            if cs_lpu is not None and cs_lpu > max_lpu:
                max_lpu = cs_lpu
            if cs_upu is not None and cs_upu > max_upu:
                max_upu = cs_upu
            summary_list.append([cs.metadata["cs_id"],
                                 cs.metadata["specie"], cs.metadata["lhsA"], cs.metadata["rhsA"],
                                 cs.metadata["process"], reaction,
                                 cs.metadata["units_e"]*cs.metadata["threshold"],
                                 cs.metadata["units_e"]*e_peak,
                                 cs.metadata["units_e"]*e_upper,
                                 cs.metadata["units_sigma"]*cs_peak_sigma,
                                 cs_lpu, cs_upu])

        cs_df = DataFrame(summary_list, columns=headers)
        if sort:
            cs_df = (cs_df.sort_values(by=sort)
                     .reset_index(drop=True))
        if upper is None:
            upper = len(cs_df)
        if lower is None:
            lower = 0
        return (cs_df.loc[lower:upper]
                .style
                .background_gradient(subset=['threshold', 'E_peak', 'E_upper',
                                             'sigma_max', 'lpu', 'upu'],
                                     cmap='plasma')
                .highlight_null('red'))
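
A hypothetical call of the method above; the model instance and the metadata filter value are assumptions for illustration, while the argument names follow the docstring.

styled = model.summary(metadata={'process': 'ionization'},   # 'model' assumed to be a nepc model instance
                       lower=0, upper=20,
                       sort=['E_peak', 'sigma_max'])
styled  # in a notebook this renders the background-gradient styling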
コード例 #44
ファイル: backends.py プロジェクト: oasbrink/ibis
 def assert_frame_equal(cls, left: pd.DataFrame, right: pd.DataFrame,
                        *args: Any, **kwargs: Any) -> None:
     columns = list(set(left.columns) & set(right.columns))
     left = left.sort_values(by=columns)
     right = right.sort_values(by=columns)
     return super().assert_frame_equal(left, right, *args, **kwargs)
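
The sort above makes the comparison order-insensitive. A standalone sketch of the same idea (not part of the ibis test class; indexes are reset here so only values are compared):

import pandas as pd
import pandas.testing as tm

left = pd.DataFrame({'a': [2, 1], 'b': ['y', 'x']})
right = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})

# Sort both frames by their shared columns so row order no longer matters.
columns = list(set(left.columns) & set(right.columns))
left_sorted = left.sort_values(by=columns).reset_index(drop=True)
right_sorted = right.sort_values(by=columns).reset_index(drop=True)

tm.assert_frame_equal(left_sorted, right_sorted)  # passes: same rows, different order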
コード例 #45
from pandas import Series, DataFrame

data = {
    '语文': [66, 95, 98, 90, 80],
    '数学': [65, 76, 86, 88, 90],
    '英语': [30, 98, 88, 77, 90]
}
df = DataFrame(data,
               index=['张飞', '关羽', '刘备', '典韦', '许褚'],
               columns=['语文', '数学', '英语'])
df1 = DataFrame(data,
                index=['张飞', '关羽', '刘备', '典韦', '许褚'],
                columns=['语文', '数学', '英语', '总计'])

df1['总计'] = df1.sum(axis=1)
print('平均分\n', df.mean())
print("============================================")
print('最小成绩\n', df.min())
print("============================================")
print('最大成绩\n', df.max())
print("============================================")
print('方差\n', df.var())
print("============================================")
print('标准差\n', df.std())
print("============================================")
# Print the ranking sorted by total score, descending
print('总成绩排名如下:')
print(df1.sort_values('总计', ascending=False))

コード例 #46
def arrange(filename, savename):
    print('글 정렬 작업 준비중...')
    sys.stdout.flush()
    from pandas import DataFrame, read_csv, concat

    if not ('.csv' in filename): filename += '.csv'

    # Mobile-carrier (통피) IP prefix lists
    SKTip = [
        '203.226', '211.234', '223.32', '223.33', '223.34', '223.35', '223.36',
        '223.37', '223.38', '223.39', '223.40', '223.41', '223.42', '223.43',
        '223.44', '223.45', '223.46', '223.47', '223.48', '223.49', '223.50',
        '223.51', '223.52', '223.53', '223.54', '223.55', '223.56', '223.57',
        '223.58', '223.59', '223.60', '223.61', '223.62', '223.63', '27.160',
        '27.161', '27.162', '27.163', '27.164', '27.165', '27.166', '27.167',
        '27.168', '27.169', '27.170', '27.171', '27.172', '27.173', '27.174',
        '27.175', '27.176', '27.177', '27.178', '27.179', '27.180', '27.181',
        '27.182', '27.183'
    ]
    KTip = [
        '39.7', '110.70', '175.223', '211.246', '118.235', '110.70', '175.252',
        '175.253', '175.254', '175.255'
    ]  #'175.252'
    LGTip = ['61.43', '211.234', '117.111', '211.36', '106.101', '106.102']

    # Load the existing CSV
    data = read_csv(filename, dtype={'IPID': str})
    cdata = DataFrame()

    haspostdata = False
    hascmtdata = False

    # If the initial CSV data is comment data
    if 'Cmt ID' in data:
        cdata = read_csv(filename, dtype={'IPID': str})
        data = DataFrame()
        hascmtdata = True
    else:
        haspostdata = True

    if len(sys.argv) > 4 and (sys.argv[1] == "-a" or sys.argv[1] == "--a"):
        print('2개 이상의 데이터가 발견되었습니다')
        sys.stdout.flush()

        savename = sys.argv[len(sys.argv) - 1]

        for i in range(3, len(sys.argv) - 1):
            print(i - 1, '번째 파일 병합중...')
            sys.stdout.flush()

            fname = sys.argv[i]
            if not ('.csv' in fname): fname += '.csv'
            newd = read_csv(fname, dtype={'IPID': str})

            if 'Cmt ID' in newd:
                cdata = concat([cdata, newd])
                if not hascmtdata: hascmtdata = True
            else:
                data = concat([data, newd])
                if not haspostdata: haspostdata = True

    if not ('.csv' in savename): savename += '.csv'

    if haspostdata:
        gonic = data[data['HasAccount'] == 1]  # posts by registered users
        udong = data[data['HasAccount'] == 0]  # posts by anonymous users

    if hascmtdata:
        cgonic = cdata[cdata['HasAccount'] == 1]  # comments by registered users
        cudong = cdata[cdata['HasAccount'] == 0]  # comments by anonymous users

    # Create the new result data
    res = DataFrame(columns=[
        'Nick', 'IPID', 'Posts', 'Upvotes', 'Downvotes', 'Comments', 'Views',
        'HasAccount'
    ])

    idList = []
    ipList = []
    unickList = []

    # Collection order: registered IDs -> anonymous IPs -> anonymous nicknames
    if haspostdata:
        idList = gonic.IPID.unique().tolist()
        ipList = udong.IPID.unique().tolist()
        unickList = udong.Nickname.unique().tolist()

    if hascmtdata:
        for ipid in cgonic.IPID.unique().tolist():
            if not ipid in idList:
                idList.append(ipid)

        for ipid in cudong.IPID.unique().tolist():
            if not ipid in ipList:
                ipList.append(ipid)

        for unick in cudong.Nickname.unique().tolist():
            if not unick in unickList:
                unickList.append(unick)

    # Load the registered multi-account list
    dup_list_id = []
    try:
        tmp = open('dup_list_id.txt', 'r', encoding='utf-8')
        dup_list_id = tmp.read().split('\n')
        tmp.close()
    except:
        print('dup_list_id.txt 불러오기 실패')

    # Load the anonymous multi-IP list
    dup_list_ip = []
    try:
        tmp = open('dup_list_ip.txt', 'r', encoding='utf-8')
        dup_list_ip = tmp.read().split('\n')
        tmp.close()
    except:
        print('dup_list_ip.txt 불러오기 실패')

    # Load the anonymous multi-nickname list
    dup_list_nick = []
    try:
        tmp = open('dup_list_nick.txt', 'r', encoding='utf-8')
        dup_list_nick = tmp.read().split('\n')
        tmp.close()
    except:
        print('dup_list_nick.txt 불러오기 실패')

    # Remove IDs already covered by the registered multi-account ID list beforehand
    for ml in dup_list_id:
        if not ml == '':  # lines starting with # are comments; empty lines are ignored
            if not ml[0] == '#':
                # ignore the very first value
                rmlist = ml[ml.find('\t') + 1:]
                idList = [e for e in idList if e not in rmlist.split('\t')]
                idList.append(rmlist)

    # Remove IPs already covered by the anonymous multi-IP list beforehand
    for ml in dup_list_ip:
        if not ml == '':  # lines starting with # are comments; empty lines are ignored
            if not ml[0] == '#':
                # ignore the very first value
                rmlist = ml[ml.find('\t') + 1:]
                ipList = [e for e in ipList if e not in rmlist.split('\t')]
                ipList.append(rmlist)

    # Remove nicknames already covered by the anonymous multi-nickname list beforehand
    for ml in dup_list_nick:
        if not ml == '':  # lines starting with # are comments; empty lines are ignored
            if not ml[0] == '#':
                # ignore the very first value
                rmlist = ml[ml.find('\t') + 1:]
                unickList = [
                    e for e in unickList if e not in rmlist.split('\t')
                ]
                unickList.append(rmlist)

    print('고닉 글 집계중...')
    sys.stdout.flush()

    ################### Registered-user post aggregation ###################

    for ids in idList:

        # Search posts
        col = DataFrame()
        if haspostdata:
            col = gonic[gonic['IPID'].isin(ids.split('\t'))]
            col2 = udong[udong['IPID'].isin(ids.split('\t'))]  # include anonymous IPs as well
            col = concat([col, col2])

        if hascmtdata:
            # Search comments
            ccol = cgonic[cgonic['IPID'].isin(ids.split('\t'))]
            ccol2 = cudong[cudong['IPID'].isin(ids.split('\t'))]  # include anonymous IPs as well
            ccol = concat([ccol, ccol2])

        nicks = []
        ids2 = []
        if haspostdata:
            nicks = col.Nickname.unique().tolist()
            ids2 = col.IPID.unique().tolist()

        if hascmtdata:
            # Add nicknames found in the comment search to the existing nickname list
            for cn in ccol.Nickname.unique().tolist():
                if not cn in nicks:
                    nicks.append(cn)

            # Add IDs (IPs) found in the comment search to the existing ID (IP) list
            for cid in ccol.IPID.unique().tolist():
                if not cid in ids2:
                    ids2.append(cid)

        nicks = ' '.join(nicks)
        ids2 = ' '.join(ids2)

        if not haspostdata: counts = None
        else: counts = col.shape[0]  # post count

        if 'Upvotes' in data.columns: upvotes = col.Upvotes.sum()  # upvote count
        else: upvotes = None
        if 'Downvotes' in data.columns: downvotes = col.Downvotes.sum()  # downvote count
        else: downvotes = None
        if 'Views' in data.columns: views = col.Views.sum()  # view count
        else: views = None

        if hascmtdata: comments = len(ccol) + len(ccol2)  # comment count
        else: comments = None

        nd = {
            'Nick': nicks,
            'IPID': ids2,
            'Posts': counts,
            'Upvotes': upvotes,
            'Downvotes': downvotes,
            'Comments': comments,
            'Views': views,
            'HasAccount': 1
        }
        res = res.append(nd, ignore_index=True)

        if haspostdata:
            udong = udong.drop(
                col2.index)  # drop the posts already matched so they are not counted again (removes a registered user's home-IP anonymous posts)
        if hascmtdata: cudong = cudong.drop(ccol2.index)

    print('유동 글 집계중...')
    sys.stdout.flush()

    ################### Anonymous post aggregation ###################

    teltype = 0

    # First group and tally posts whose nickname is 'ㅇㅇ' and whose IP is a mobile-carrier prefix
    for ips in [SKTip, KTip, LGTip]:
        col = DataFrame()
        if haspostdata:
            col = udong[(udong['IPID'].isin(ips))
                        & (udong['Nickname'] == 'ㅇㅇ')]
        if hascmtdata:
            ccol = cudong[(cudong['IPID'].isin(ips))
                          & (cudong['Nickname'] == 'ㅇㅇ')]

        #if col.shape[0] > 0:
        nicks = 'ㅇㅇ'
        if (teltype == 0):
            nicks += '(SK통피)'
        elif (teltype == 1):
            nicks += '(KT통피)'
        elif (teltype == 2):
            nicks += '(U+통피)'
        else:
            nicks += '(기타통피)'
        teltype += 1

        if haspostdata: ids2 = col.IPID.unique().tolist()
        else: ids2 = []

        if hascmtdata:
            # Add carrier IPs found in the comment search to the existing IP list
            for cid in ccol.IPID.unique().tolist():
                if not cid in ids2:
                    ids2.append(cid)

        ids2 = ' '.join(ids2)

        if not haspostdata: counts = None
        else: counts = col.shape[0]  # post count

        if 'Upvotes' in data.columns: upvotes = col.Upvotes.sum()  # upvote count
        else: upvotes = None
        if 'Downvotes' in data.columns: downvotes = col.Downvotes.sum()  # downvote count
        else: downvotes = None
        if 'Views' in data.columns: views = col.Views.sum()  # view count
        else: views = None

        if hascmtdata: comments = len(ccol)
        else: comments = None

        nd = {
            'Nick': nicks,
            'IPID': ids2,
            'Posts': counts,
            'Upvotes': upvotes,
            'Downvotes': downvotes,
            'Comments': comments,
            'Views': views,
            'HasAccount': 0
        }
        res = res.append(nd, ignore_index=True)
        if haspostdata:
            udong = udong.drop(col.index)  # drop the posts already matched so they are not counted again
        if hascmtdata: cudong = cudong.drop(ccol.index)

    # ㅇㅇ(123.45), ㅇㅇ(56.789) -> treated as different users (unless the IPs are grouped together in the list)
    # 파이썬(123.45), 루비(123.45) -> treated as different users (unless the nicknames are grouped together in the list)

    # Collect posts by 'ㅇㅇ' nicknames with differing IPs
    for ips in ipList:
        col = DataFrame()
        if haspostdata:
            col = udong[(udong['IPID'].isin(ips.split('\t')))
                        & (udong['Nickname'] == 'ㅇㅇ')]
        if hascmtdata:
            ccol = cudong[(cudong['IPID'].isin(ips.split('\t')))
                          & (cudong['Nickname'] == 'ㅇㅇ')]
        #if col.shape[0] > 0:
        nicks = 'ㅇㅇ'

        if haspostdata: ids2 = col.IPID.unique().tolist()
        else: ids2 = []

        if hascmtdata:
            # Add carrier IPs found in the comment search to the existing IP list
            for cid in ccol.IPID.unique().tolist():
                if not cid in ids2:
                    ids2.append(cid)

        ids2 = ' '.join(ids2)

        if not haspostdata: counts = None
        else: counts = col.shape[0]  # post count

        if 'Upvotes' in data.columns: upvotes = col.Upvotes.sum()  # upvote count
        else: upvotes = None
        if 'Downvotes' in data.columns: downvotes = col.Downvotes.sum()  # downvote count
        else: downvotes = None
        if 'Views' in data.columns: views = col.Views.sum()  # view count
        else: views = None

        if hascmtdata: comments = len(ccol)
        else: comments = None

        nd = {
            'Nick': nicks,
            'IPID': ids2,
            'Posts': counts,
            'Upvotes': upvotes,
            'Downvotes': downvotes,
            'Comments': comments,
            'Views': views,
            'HasAccount': 0
        }
        res = res.append(nd, ignore_index=True)
        if haspostdata: udong = udong.drop(col.index)
        if hascmtdata: cudong = cudong.drop(ccol.index)

    print('닉유동 글 집계중...')
    sys.stdout.flush()

    # Collect posts by anonymous users whose nickname is not 'ㅇㅇ'
    for nicks in unickList:
        col = DataFrame()
        if haspostdata: col = udong[udong['Nickname'].isin(nicks.split('\t'))]
        if hascmtdata:
            ccol = cudong[cudong['Nickname'].isin(nicks.split('\t'))]
        #if col.shape[0] > 0:
        ##nicks = ' '.join(col.Nickname.unique().tolist())
        ##ids2 = ' '.join(col.IPID.unique().tolist())
        nicks = []
        ids2 = []
        if haspostdata:
            nicks = col.Nickname.unique().tolist()
            ids2 = col.IPID.unique().tolist()

        if hascmtdata:
            # Add nicknames found in the comment search to the existing nickname list
            for cn in ccol.Nickname.unique().tolist():
                if not cn in nicks:
                    nicks.append(cn)

            # Add IDs (IPs) found in the comment search to the existing ID (IP) list
            for cid in ccol.IPID.unique().tolist():
                if not cid in ids2:
                    ids2.append(cid)

        nicks = ' '.join(nicks)
        ids2 = ' '.join(ids2)

        if not haspostdata: counts = None
        else: counts = col.shape[0]  # post count

        if 'Upvotes' in data.columns: upvotes = col.Upvotes.sum()  # upvote count
        else: upvotes = None
        if 'Downvotes' in data.columns: downvotes = col.Downvotes.sum()  # downvote count
        else: downvotes = None
        if 'Views' in data.columns: views = col.Views.sum()  # view count
        else: views = None

        if hascmtdata: comments = len(ccol)
        else: comments = None

        nd = {
            'Nick': nicks,
            'IPID': ids2,
            'Posts': counts,
            'Upvotes': upvotes,
            'Downvotes': downvotes,
            'Comments': comments,
            'Views': views,
            'HasAccount': 0
        }
        if haspostdata: res = res.append(nd, ignore_index=True)

    print('작업 마무리중...')
    sys.stdout.flush()

    # Remove missing values
    res = res.dropna(axis=1)
    res = res.dropna(axis=0)
    #res = res[res['Posts']!=0]
    res = res[res['IPID'] != '']

    # Sort (by post count; by comment count if there are no posts)
    if 'Posts' in res:
        res = res.sort_values(by='Posts', ascending=False)
    elif 'Comments' in res:
        res = res.sort_values(by='Comments', ascending=False)

    # Save
    res.to_csv(savename, encoding='utf-8-sig', index=False)
    print(savename, '로 저장되었습니다.')
    sys.stdout.flush()
コード例 #47
                elif frame.iat[a, 2] < frame.iat[b, 2]:
                    frame.iat[a, 2] -= 2
                    frame.iat[b, 2] += 2
                else:
                    frame.iat[a, 2] += 2
                    frame.iat[b, 2] -= 2
            if frame.iat[a, 3] + frame.iat[b, 3] == 3:
                if frame.iat[a, 2] < frame.iat[b, 2]:
                    frame.iat[a, 2] -= 1
                else:
                    frame.iat[b, 2] -= 1
            if frame.iat[a, 3] + frame.iat[b, 3] == 4:
                frame.iat[a, 2] -= 2
                frame.iat[b, 2] -= 2
        cgn = frame[frame['life'] < 0]
        pgn = frame.sort_values(by='life', ascending=False)
        if len(cgn.index) == 0:
            pass
        else:
            for x, y in zip(cgn.index, pgn.index):
                frame.iat[x, 4] = pgn.iat[y, 4]
                frame.iat[x, 2] = 10
                c += 1
                counter = frame['type'].value_counts()
                counter.name = c
                recorder[c] = counter
            if c == 10:
                d = 1
                recorder.plot()
                plt.show()
コード例 #48
    def test_astype_categorical_to_other(self):

        value = np.random.RandomState(0).randint(0, 10000, 100)
        df = DataFrame({"value": value})
        labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        df = df.sort_values(by=["value"], ascending=True)
        df["value_group"] = pd.cut(df.value,
                                   range(0, 10500, 500),
                                   right=False,
                                   labels=cat_labels)

        s = df["value_group"]
        expected = s
        tm.assert_series_equal(s.astype("category"), expected)
        tm.assert_series_equal(s.astype(CategoricalDtype()), expected)
        msg = r"could not convert string to float|invalid literal for float\(\)"
        with pytest.raises(ValueError, match=msg):
            s.astype("float64")

        cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
        exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
        tm.assert_series_equal(cat.astype("str"), exp)
        s2 = Series(Categorical(["1", "2", "3", "4"]))
        exp2 = Series([1, 2, 3, 4]).astype(int)
        tm.assert_series_equal(s2.astype("int"), exp2)

        # object dtype doesn't sort correctly, so just compare that we have
        # the same values
        def cmp(a, b):
            tm.assert_almost_equal(np.sort(np.unique(a)),
                                   np.sort(np.unique(b)))

        expected = Series(np.array(s.values), name="value_group")
        cmp(s.astype("object"), expected)
        cmp(s.astype(np.object_), expected)

        # array conversion
        tm.assert_almost_equal(np.array(s), np.array(s.values))

        tm.assert_series_equal(s.astype("category"), s)
        tm.assert_series_equal(s.astype(CategoricalDtype()), s)

        roundtrip_expected = s.cat.set_categories(
            s.cat.categories.sort_values()).cat.remove_unused_categories()
        tm.assert_series_equal(
            s.astype("object").astype("category"), roundtrip_expected)
        tm.assert_series_equal(
            s.astype("object").astype(CategoricalDtype()), roundtrip_expected)

        # invalid conversion (these are NOT a dtype)
        msg = ("dtype '<class 'pandas.core.arrays.categorical.Categorical'>' "
               "not understood")

        for invalid in [
                lambda x: x.astype(Categorical),
                lambda x: x.astype("object").astype(Categorical),
        ]:
            with pytest.raises(TypeError, match=msg):
                invalid(s)
コード例 #49
from pandas import DataFrame

data = [
    ["037730", "3R", 1510],
    ["036360", "3SOFT", 1790],
    ["005760", "ACTS", 1185],
]

columns = ["종목코드", "종목명", "현재가"]
df = DataFrame(data=data, columns=columns)
df = df.set_index('종목코드')

# Sort by current price (현재가)
df2 = df.sort_values(by='현재가')
print(df2)

コード例 #50
oldest = titanic_df['Age'].max()
fig.set(xlim=(0,oldest))
fig.add_legend()

# Drop Null values from the Cabin column using dropna
deck = titanic_df['Cabin'].dropna()
print(deck.head())
# Cabin levels are categorized A,B,C,D,E,F,G, so only need first letter
levels = []
for level in deck:
    levels.append(level[0])  # append the first letter
print(levels)

cabin_df = DataFrame(levels)
cabin_df.columns = ['Cabin']
sns.factorplot(x='Cabin',kind='count',data=cabin_df.sort_values(by='Cabin',ascending=1),palette='winter_d') # but produces a T cabin...

# Creating a new table dropping cases containing specific values
cabin_df = cabin_df[cabin_df.Cabin != 'T']
sns.factorplot(x='Cabin',kind='count',data=cabin_df.sort_values(by='Cabin',ascending=1),palette='winter_d') # now without the T cabin


# Factorplot to see how two categorical variables are related
sns.factorplot(x='Embarked',kind='count',data=titanic_df.sort_values(by='Pclass'),hue='Pclass')

# Calculate values in put in a new column (whether alone or with family) based on existing column values
def alone_or_fam(passenger):
    sib, par = passenger
    if sib==0 and par==0:
        with_fam = 'Alone'
    else:
コード例 #51
	#get the contigs that have no SNPs on them
	print('find unreped contigs')
	unreped_contigs = contig_dat[~contig_dat['Contig'].isin(snp_df['Contig'])]

	#print non represented contigs to a file
	print('writing unreped contigs')
	unreped_contigs.to_csv('contigs_with_no_snps.tsv', sep='\t', index=False)

	#get contigs with representatives
	reped_contigs = contig_dat[contig_dat['Contig'].isin(snp_df['Contig'])]

	#build the dictionary of snp locations
	contig_hit_dict = snp_pos_dictonary(reped_contigs, snp_df)

	#locate the gaps in the contigs
	#there are 10,432 regions.
	print('scanning for gaps')
	gaps_in_contigs = find_gaps(contig_hit_dict, reped_contigs)

	gaps_dataframe = DataFrame(gaps_in_contigs, columns = ['Contig', 'leading_position', 'trailing_position'])
	#get size data for the contigs
	gaps_dataframe = gaps_dataframe.merge(contig_dat)
	#reorder columns
	gaps_dataframe = gaps_dataframe[['Contig', 'Size', 'Size_rank', 'leading_position', 'trailing_position']]
	gaps_dataframe.sort_values(['Size_rank','leading_position'], inplace=True)
	gaps_dataframe.to_csv('locations_of_gaps_in_coverage.tsv', sep='\t', index=False)




コード例 #52
 def f1(df: pd.DataFrame) -> pd.DataFrame:
     return df.sort_values("b").head(1)
コード例 #53
def aggregate_aligned_column_sims(
    aggsim: DataFrame,
    tableid_colids: Dict[int, Set[int]],
    align_columns: str = "greedy",
    align_width_norm: str = "jacc",
    align_use_total_width: bool = True,
) -> DataFrame:
    """Aggregate column similarities.

    To create a table similarity graph, the column similarities need to be aggregated.
    This aggregation must be based on several assumptions which influence the accuracy
    and speed.

    First of all, how to align columns. Do you allow multiple columns from one table to
    align with a single column in the other? In that case, choose one of the fast 'max'
    values for the ``align_columns`` parameter, depending on whether the first or
    the second table may match multiple columns in the other.
    
    Otherwise, choose 'greedy'. This calculates a kind of soft-jaccard score.
    In that case, you'll need to decide how to handle columns for which no similarity score
    could be calculated. To ignore those columns, set ``align_use_total_width=False``.
    Otherwise, they will be assumed to be non-matching.
    Also, the alignment score is then normalized. This expresses your view about whether
    you want wide and narrow tables to match. If so, choose 'wide'. If you want the
    tables to have the similar widths, choose 'narrow'. For a middle ground, choose
    'jacc', which will calculate ``score / (cols1 + cols2 - score)``.

    Args:
        aggsim: Column similarities (aggregated match scores)
        tableid_colids: Global column IDs per table ID
        align_columns ({'max1', 'max2', 'greedy'}): Column alignment method. Defaults to 'greedy'.
        align_width_norm ({'wide', 'narrow', 'jacc'}): Table width difference normalisation method. Defaults to 'jacc'.
        align_use_total_width: Whether to use total table width. Defaults to True.

    Returns:
        Table similarities
    """
    assert align_columns in {"max1", "max2", "greedy"}
    assert align_width_norm in {"wide", "narrow", "jacc"}

    def agg(gs, align):
        try:  # Maybe show progress
            if log.getLogger().level <= log.INFO:
                import warnings, tqdm

                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", category=FutureWarning)
                    tqdm.tqdm.pandas(desc="Aggregating column scores")
            return gs.progress_aggregate(align)
        except Exception as e:
            log.debug(f"When trying to show aggregation progress, {e}")
            return gs.agg(align)

    if align_columns == "greedy":
        # Compute soft column alignment jaccard
        aggsim.sort_values(ascending=False, inplace=True)
        total = agg(aggsim.groupby(level=[0, 1]), greedy_align)
        if align_use_total_width:
            # Use total column widths
            table_numcols = pd.Series(
                {ti: len(cis)
                 for ti, cis in tableid_colids.items()})
            j = (pd.DataFrame({
                "total": total
            }).join(table_numcols.rename("n1"),
                    on="ti1").join(table_numcols.rename("n2"), on="ti2"))
        else:
            # Only use number of matched columns
            n1 = aggsim.groupby(level=[0, 1, 2]).count().groupby(
                level=[0, 1]).first()
            n2 = aggsim.groupby(level=[0, 1, 3]).count().groupby(
                level=[0, 1]).first()
            j = pd.DataFrame({"total": total, "n1": n1, "n2": n2})

        #
        if align_width_norm == "jacc":
            return j["total"] / (j["n1"] + j["n2"] - j["total"])
        elif align_width_norm == "wide":
            return j["total"] / j[["n1", "n2"]].max(1)
        elif align_width_norm == "narrow":
            return j["total"] / j[["n1", "n2"]].min(1)

    else:
        level = 2 if align_columns == "max1" else 3
        return aggsim.groupby(level=[0, 1, level]).max().groupby(
            level=[0, 1]).mean()
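
A worked illustration of the three width normalisations described in the docstring above; the numbers are made up, with `total` standing for the greedy alignment score of a 3-column table against a 4-column table.

total, n1, n2 = 2.5, 3, 4   # hypothetical greedy alignment score and column counts

jacc   = total / (n1 + n2 - total)   # 2.5 / 4.5 ≈ 0.556  (soft-jaccard middle ground)
wide   = total / max(n1, n2)         # 2.5 / 4.0 = 0.625
narrow = total / min(n1, n2)         # 2.5 / 3.0 ≈ 0.833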
コード例 #54
    'hits': [],
    '0': [],
    '1': [],
    '2': [],
    '3': [],
    '4': [],
    '5': [],
    '6': [],
    '7': [],
    '8': [],
    '9': []
}
for line in fread.readlines():
    line = line.strip().split("\t")
    d_df['species'].append(d[str(line[0])])
    d_df['hits'].append(int(line[1]))
    id_list = [str(k) for k in range(10)]
    flag = 0
    for i in range(2, len(line), 2):
        id = str(line[i])
        num = int(line[i + 1])
        id_list.remove(id)
        d_df[str(id)].append(num)
        flag += 1
    for j in id_list:
        d_df[j].append(0)
        flag += 1
df = DataFrame(d_df)
sorted = df.sort_values(by="hits", ascending=False)
print(sorted.head(20))
コード例 #55
def show_model_options(model):
    running = True
    while running:
        print(
            '--------------------------------------------------------------------------'
        )
        print(model.estimator_name)
        print(
            '--------------------------------------------------------------------------'
        )
        print('0. Show performance')
        print('1. Show residual plot')
        print('2. Train this model again')
        print('3. Brute force combination of features')
        print('4. Display feature importance')
        print('5. Predict housing price using custom features')
        print('6. Back')
        print(
            '--------------------------------------------------------------------------'
        )

        user_input = input('Enter your choice: ')
        if user_input == '6':
            running = False

        # Show Performance.
        elif user_input == '0' and user_input.isdigit():
            show_model_performance(model)

        # Show Residual Plot.
        elif user_input == '1':
            # Predict prices with data used for training.
            predicted_training_target = model.predict(training_features)
            # Predict prices with new and unseen data.
            predicted_testing_target = model.predict(testing_features)

            # Generate and show residual plot.
            plot_residual(predicted_training_target=predicted_training_target,
                          actual_training_target=training_target,
                          predicted_testing_target=predicted_testing_target,
                          actual_testing_target=testing_target,
                          model_name=model.estimator_name)

        # Train again.
        elif user_input == '2' and user_input.isdigit():
            print('Training {} ...'.format(model.estimator_name))
            model.train_and_evaluate(features=training_features,
                                     target=training_target,
                                     kfold=True)
            print('Training completed!')
            model.save()
            print('Model persisted.')

        elif user_input == '3':
            print('searching for best combination of features...')
            best_mse, best_feature_lists = find_best_features(model)
            print('Best MSE:', best_mse, 'using features:', best_feature_lists)

        elif user_input == '4':
            df = DataFrame()
            df['FEATURE'] = dataset.column_names[0:-1]
            if model.estimator_name in [
                    'Elastic Net',
                    'LARS',
                    'Lasso',
                    'Ridge',
                    'Linear',
            ]:
                df['IMPORTANCE'] = model.estimator.coef_
                print(df.sort_values(by='IMPORTANCE', ascending=False))
            elif model.estimator_name in [
                    'Gradient Boosting', 'Random Forest', 'Extra Trees'
            ]:
                df['IMPORTANCE'] = model.estimator.feature_importances_
                print(df.sort_values(by='IMPORTANCE', ascending=False))
            elif model.estimator_name in ['SVM RBF']:
                print(model.estimator.dual_coef_)

        elif user_input == '5':
            predict_custom(model)
コード例 #56
def prep_data(data: pd.DataFrame) -> pd.DataFrame:
    data['abs_bias'] = np.sqrt(data['bias']**2)
    data = data.sort_values(by='track')
    return data[['track', 'bias', 'abs_bias', 'review_length']]
    def scoring_trend_analysis(self, flag):
        choose = flag
        with open(r'./用户影评相关数据/' + self.filmname + '用户影评相关信息.json',
                  'r',
                  encoding='UTF-8') as f:
            t1 = json.load(f, strict=False)
        if choose == '1':
            self.textBrowser.append("开始生成" + self.filmname +
                                    "的评论推荐度与日期分析柱状图......")
            QApplication.processEvents()
        if choose == '2':
            self.textBrowser.append("开始生成" + self.filmname +
                                    "的评论推荐度与日期分析折线图......")
            QApplication.processEvents()
        if choose == '3':
            self.textBrowser.append("开始生成" + self.filmname +
                                    "的评论推荐度与日期分析河状图......")
            QApplication.processEvents()
        # Extract the rating data
        score, date, val, command_date_list = [], [], [], []
        result = {}
        for each in t1:
            command_date_list.append((each['用户推荐度'], each['用户评论时间']))
        # Count how many of each score there are on each date
        for i in set(list(command_date_list)):
            result[i] = command_date_list.count(i)  # dict type
        info = []
        # Repackage the counted data
        for key in result:
            score = key[0]
            date = key[1]
            val = result[key]
            info.append([score, date, val])
        info_new = DataFrame(info)
        # Convert the packed data into a DataFrame
        info_new.columns = ['score', 'date', 'votes']
        # Sort the df by date in ascending order
        info_new.sort_values('date', inplace=True)
        # Fill in missing entries: each date should have all 5 rating levels; check each in turn and insert a zero row into the new df if it is missing
        mark = 0
        creat_df = pd.DataFrame(columns=['score', 'date',
                                         'votes'])  # create an empty DataFrame
        for i in list(info_new['date']):
            location = info_new[(info_new.date == i)
                                & (info_new.score == "力荐")].index.tolist()
            if location == []:
                creat_df.loc[mark] = ["力荐", i, 0]
                mark += 1
            location = info_new[(info_new.date == i)
                                & (info_new.score == "推荐")].index.tolist()
            if location == []:
                creat_df.loc[mark] = ["推荐", i, 0]
                mark += 1
            location = info_new[(info_new.date == i)
                                & (info_new.score == "还行")].index.tolist()
            if location == []:
                creat_df.loc[mark] = ["还行", i, 0]
                mark += 1
            location = info_new[(info_new.date == i)
                                & (info_new.score == "较差")].index.tolist()
            if location == []:
                creat_df.loc[mark] = ["较差", i, 0]
                mark += 1
            location = info_new[(info_new.date == i)
                                & (info_new.score == "很差")].index.tolist()
            if location == []:
                creat_df.loc[mark] = ["很差", i, 0]
                mark += 1
        info_new = info_new.append(creat_df.drop_duplicates(),
                                   ignore_index=True)
        command_date_list = []
        info_new.sort_values('date',
                             inplace=True)  # sort df by date ascending; makes it easy to find the earliest and latest dates for the later fill-in
        for index, row in info_new.iterrows():
            command_date_list.append([row['date'], row['votes'], row['score']])
        attr, v1, v2, v3, v4, v5 = [], [], [], [], [], []
        attr = list(sorted(set(info_new['date'])))
        for i in attr:
            v1.append(
                int(info_new[(info_new['date'] == i)
                             & (info_new['score'] == "力荐")]['votes']))
            v2.append(
                int(info_new[(info_new['date'] == i)
                             & (info_new['score'] == "推荐")]['votes']))
            v3.append(
                int(info_new[(info_new['date'] == i)
                             & (info_new['score'] == "还行")]['votes']))
            v4.append(
                int(info_new[(info_new['date'] == i)
                             & (info_new['score'] == "较差")]['votes']))
            v5.append(
                int(info_new[(info_new['date'] == i)
                             & (info_new['score'] == "很差")]['votes']))

        # Bar chart
        if choose == '1':
            c = (Bar(
                init_opts=opts.InitOpts(width="665px", height="500px")
            ).add_xaxis(attr).add_yaxis("力荐", v1, stack="stack1").add_yaxis(
                "推荐", v2,
                stack="stack1").add_yaxis("还行", v3, stack="stack1").add_yaxis(
                    "较差", v4, stack="stack1").add_yaxis(
                        "很差", v5,
                        stack="stack1").reversal_axis().set_series_opts(
                            label_opts=opts.LabelOpts(
                                is_show=False)).set_global_opts(
                                    tooltip_opts=opts.TooltipOpts(
                                        is_show=True),
                                    toolbox_opts=opts.ToolboxOpts(
                                        is_show=True,
                                        pos_right="30%",
                                    ),
                                    title_opts=opts.TitleOpts(
                                        title="用户评论推荐度柱状图"),
                                    datazoom_opts=opts.DataZoomOpts(
                                        type_="inside",
                                        range_start=0,
                                        range_end=100),
                                ).render("./爬虫数据关联可视化/" + self.filmname +
                                         "影评可视化数据/bar_reversal_axis.html"))
            QApplication.processEvents()
            self.comment_columnar_pic = (Bar(
                init_opts=opts.InitOpts(width="665px", height="500px")
            ).add_xaxis(attr).add_yaxis("力荐", v1, stack="stack1").add_yaxis(
                "推荐", v2,
                stack="stack1").add_yaxis("还行", v3, stack="stack1").add_yaxis(
                    "较差", v4, stack="stack1").add_yaxis(
                        "很差", v5,
                        stack="stack1").reversal_axis().set_series_opts(
                            label_opts=opts.LabelOpts(
                                is_show=False)).set_global_opts(
                                    tooltip_opts=opts.TooltipOpts(
                                        is_show=True),
                                    toolbox_opts=opts.ToolboxOpts(
                                        is_show=True,
                                        pos_right="30%",
                                    ),
                                    title_opts=opts.TitleOpts(
                                        title="用户评论推荐度柱状图"),
                                    datazoom_opts=opts.DataZoomOpts(
                                        type_="inside",
                                        range_start=0,
                                        range_end=100),
                                ))
            self.saveflag = '4'
            self.textBrowser.append("开始生成" + self.filmname +
                                    "的评论推荐度与日期分析柱状图完成!")
            QApplication.processEvents()
            self.show_scoring_trend_analysis_columnar()
            QApplication.processEvents()

        # Line chart
        if choose == '2':
            polyline = (Line(
                init_opts=opts.InitOpts(width="665px", height="500px")
            ).add_xaxis(attr).add_yaxis("力荐", v1, stack="stack1").add_yaxis(
                "推荐", v2,
                stack="stack1").add_yaxis("还行", v3, stack="stack1").add_yaxis(
                    "较差", v4, stack="stack1").add_yaxis(
                        "很差", v5, stack="stack1").set_global_opts(
                            tooltip_opts=opts.TooltipOpts(is_show=True),
                            toolbox_opts=opts.ToolboxOpts(
                                is_show=True,
                                pos_right="30%",
                            ),
                            title_opts=opts.TitleOpts(title="用户评论推荐度折线图"),
                            datazoom_opts=opts.DataZoomOpts(type_="inside",
                                                            range_start=0,
                                                            range_end=100),
                        ).render("./爬虫数据关联可视化/" + self.filmname +
                                 "影评可视化数据/line_markpoint.html"))
            QApplication.processEvents()
            self.comment_polyline_pic = (Line(
                init_opts=opts.InitOpts(width="665px", height="500px")
            ).add_xaxis(attr).add_yaxis("力荐", v1, stack="stack1").add_yaxis(
                "推荐", v2,
                stack="stack1").add_yaxis("还行", v3, stack="stack1").add_yaxis(
                    "较差", v4, stack="stack1").add_yaxis(
                        "很差", v5, stack="stack1").set_global_opts(
                            tooltip_opts=opts.TooltipOpts(is_show=True),
                            toolbox_opts=opts.ToolboxOpts(
                                is_show=True,
                                pos_right="30%",
                            ),
                            title_opts=opts.TitleOpts(title="用户评论推荐度折线图"),
                            datazoom_opts=opts.DataZoomOpts(type_="inside",
                                                            range_start=0,
                                                            range_end=100),
                        ))
            self.saveflag = '5'
            self.textBrowser.append(self.filmname + "的评论推荐度与日期分析折线图完成!")
            QApplication.processEvents()
            self.show_scoring_trend_analysis_polyline()
            QApplication.processEvents()

        # Theme river chart
        if choose == '3':
            river = (ThemeRiver(
                init_opts=opts.InitOpts(width="665px", height="500px")).add(
                    series_name=['力荐', '推荐', '还行', '较差', '很差'],
                    data=command_date_list,
                    singleaxis_opts=opts.SingleAxisOpts(pos_top="50",
                                                        pos_bottom="50",
                                                        type_="time"),
                ).set_global_opts(
                    tooltip_opts=opts.TooltipOpts(is_show=True,
                                                  trigger="axis",
                                                  axis_pointer_type="line"),
                    toolbox_opts=opts.ToolboxOpts(
                        is_show=True,
                        pos_right="30%",
                    ),
                    title_opts=opts.TitleOpts(title="推荐度河流图"),
                    datazoom_opts=opts.DataZoomOpts(type_="inside",
                                                    range_start=0,
                                                    range_end=100),
                ).render("./爬虫数据关联可视化/" + self.filmname +
                         "影评可视化数据/theme_river.html"))
            QApplication.processEvents()
            self.comment_river_pic = (ThemeRiver(
                init_opts=opts.InitOpts(width="665px", height="500px")).add(
                    series_name=['力荐', '推荐', '还行', '较差', '很差'],
                    data=command_date_list,
                    singleaxis_opts=opts.SingleAxisOpts(pos_top="50",
                                                        pos_bottom="50",
                                                        type_="time"),
                ).set_global_opts(
                    tooltip_opts=opts.TooltipOpts(is_show=True,
                                                  trigger="axis",
                                                  axis_pointer_type="line"),
                    toolbox_opts=opts.ToolboxOpts(
                        is_show=True,
                        pos_right="30%",
                    ),
                    title_opts=opts.TitleOpts(title="推荐度河流图"),
                    datazoom_opts=opts.DataZoomOpts(type_="inside",
                                                    range_start=0,
                                                    range_end=100),
                ))
            self.saveflag = '6'
            self.textBrowser.append(self.filmname + "的评论推荐度与日期分析河状图完成!")
            QApplication.processEvents()
            self.show_scoring_trend_analysis_river()
            QApplication.processEvents()
コード例 #58
    def plus_period(self, dfp: pd.DataFrame, s_date, e_date, pr_cloc: str,
                    sign: str):
        # Get dfp's column names
        list_cl = list(dfp)
        # Remove the names of the columns to be merged
        list_cl.remove(s_date)
        list_cl.remove(e_date)
        list_cl.remove(pr_cloc)
        # Sort by the remaining (non-merged) columns
        dfp.sort_values(list_cl, inplace=True)
        # Reset the index according to the sorted rows
        dfp = dfp.reset_index(drop=True)
        # Copy dfp for processing
        dfpc = dfp.copy()
        # Drop the columns to be merged
        dfpc.drop([s_date, e_date, pr_cloc], axis=1, inplace=True)
        # Mark duplicate rows
        list_dp = dfpc.duplicated()
        # Find the boundaries between duplicate groups
        x = list_dp[list_dp.isin([False])].index
        # No method was found for inserting into an Index, so convert the boundary indices to a list
        list_x = []
        for q in range(len(x)):
            list_x.append(x[q])
        # Mainly used to add the index for the last record
        list_x.append(len(dfp))

        # print(list_x)
        # x.append(int64(len(dfp)))
        yn = []
        # Loop over each segment of duplicate records
        for i in range(len(list_x) - 1):
            # Check whether there are rows that need merging
            if (list_x[i + 1] - list_x[i]) > 1:
                # If the index gap is greater than 1, enter the loop

                for j in range(list_x[i + 1] - list_x[i]):
                    # Collect the values to be merged into a list
                    yn.append(dfp.loc[list_x[i] + j, pr_cloc])
                # Join the list into a string separated by sign.
                y = sign.join(yn)
                # Sort the group by start date
                dfp_d = dfp.loc[list_x[i]:list_x[i + 1] - 1, :].copy()
                dfp_d.sort_values(s_date, inplace=True)
                dfp_d.reset_index(drop=True, inplace=True)
                # Assign the joined string to the group's first row in dfp
                dfp.loc[list_x[i], pr_cloc] = y
                dfp.loc[list_x[i], s_date] = dfp_d.loc[0, s_date]
                dfp.loc[list_x[i],
                        e_date] = dfp_d.loc[list_x[i + 1] - list_x[i] - 1,
                                            e_date]

                # Delete the redundant rows
                for k in range(list_x[i + 1] - list_x[i] - 1):
                    dfp.drop(list_x[i] + 1 + k, axis=0, inplace=True)
                # Clear the record list
                yn = []
        # Reset the index
        dfp = dfp.reset_index(drop=True)

        return dfp
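
A hypothetical usage sketch for plus_period; the column names and data are invented, and `merger` stands for an instance of the class that defines the method.

import pandas as pd

df = pd.DataFrame({
    'item':  ['A', 'A', 'B'],
    'start': ['2020-01', '2020-04', '2020-01'],
    'end':   ['2020-03', '2020-06', '2020-12'],
    'price': ['10', '12', '7'],
})

# Rows identical in every column except start/end/price are merged:
# their price values are joined with '/', and the period is extended.
merged = merger.plus_period(df, s_date='start', e_date='end',
                            pr_cloc='price', sign='/')
print(merged)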
コード例 #59
                  columns=list('dabc'))
print(frame.sort_index())
# print(frame.sort_index(axis=0))  # equivalent as above
print(frame.sort_index(axis=1))
print(frame.sort_index(axis=1, ascending=False))
print()

print("## Sort by value of Series:")
obj = Series([4, 7, -3, 2])
print(obj.sort_values())
print()

print("## Sort by columns of DataFrame:")
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by='b'))
print(frame.sort_values(by=['a', 'b']))
print()

print("## rank(), compute numerical data ranks, start from 1:")
obj = Series([7, -5, 7, 4, 2, 0, 4])
print("obj.rank(axis='index'):")
print(obj.rank(axis='index'))  # default: axis=0, axis='index'
print("""obj.rank(method='first')),
ranks assigned in order they appear in the array:""")
print(obj.rank(method='first'))
print("obj.rank(method='min'):")
print(obj.rank(method='min'))
print("obj.rank(ascending=False, method='max'):")
print(obj.rank(ascending=False, method='max'))
print()
コード例 #60
def run(date, date1):

    stock = StockPool(date1).select_stock()
    df = DataFrame()
    df['cash_net_oper_act'] = FactorsZoo.CashNetOperAct(
        date, stock, label='cash_net_oper_act').get_data()
    df['deductedprofit'] = FactorsZoo.DeductedProfit(
        date, stock, 'deductedprofit').get_data()
    df['dividend'] = FactorsZoo.Dividend(date, stock, 'dividend').get_data()
    df['industry'] = FactorsZoo.Industry(date, stock, 'industry').get_data()
    df['net_inc'] = FactorsZoo.NetInc(date, stock, 'net_inc').get_data()
    df['pct'] = FactorsZoo.Pct(date, stock, 'pct', -6).get_data()
    df['pe'] = FactorsZoo.Pe(date, stock, 'pe').get_data()
    df['size'] = FactorsZoo.Size(date, stock, 'size').get_data()
    df['turn_per'] = FactorsZoo.TurnPer(date, stock, 'turn_per', -6).get_data()
    df['volitality'] = FactorsZoo.Volitality(date, stock, 'volitality',
                                             -6).get_data()
    df['vol_per'] = FactorsZoo.VolPer(date, stock, 'vol_per', -6).get_data()
    df['yoyprofit'] = FactorsZoo.Yoyprofit(date, stock, 'yoyprofit').get_data()
    df['yoytr'] = FactorsZoo.YoyTr(date, stock, 'yoytr').get_data()
    # Neutralize the factors
    factors = df.columns.tolist()
    df['Codes'] = stock
    df = df.dropna()
    stock = df['Codes'].values.tolist()
    df = df[factors]
    fp = FactorProcess()
    df = fp.neutralize_factor(df, factors)
    alpha = fp.get_alpha(stock, date, -6)
    df['alpha'] = alpha
    coef_ = fp.calac_beta(df['alpha'], df[factors], factors)
    df1 = DataFrame()

    df1['cash_net_oper_act'] = FactorsZoo.CashNetOperAct(
        date1, stock, label='cash_net_oper_act').get_data()
    df1['deductedprofit'] = FactorsZoo.DeductedProfit(
        date1, stock, 'deductedprofit').get_data()
    df1['dividend'] = FactorsZoo.Dividend(date1, stock, 'dividend').get_data()
    df1['industry'] = FactorsZoo.Industry(date1, stock, 'industry').get_data()
    df1['net_inc'] = FactorsZoo.NetInc(date1, stock, 'net_inc').get_data()
    df1['pct'] = FactorsZoo.Pct(date1, stock, 'pct', -6).get_data()
    df1['pe'] = FactorsZoo.Pe(date1, stock, 'pe').get_data()
    df1['size'] = FactorsZoo.Size(date1, stock, 'size').get_data()
    df1['turn_per'] = FactorsZoo.TurnPer(date1, stock, 'turn_per',
                                         -6).get_data()
    df1['volitality'] = FactorsZoo.Volitality(date1, stock, 'volitality',
                                              -6).get_data()
    df1['vol_per'] = FactorsZoo.VolPer(date1, stock, 'vol_per', -6).get_data()
    df1['yoyprofit'] = FactorsZoo.Yoyprofit(date1, stock,
                                            'yoyprofit').get_data()
    df1['yoytr'] = FactorsZoo.YoyTr(date1, stock, 'yoytr').get_data()
    factors = df1.columns.tolist()
    df1['Codes'] = stock
    df1 = df1.dropna()
    stock = df1['Codes'].values.tolist()
    df1 = df1[factors]
    df1 = fp.neutralize_factor(df1, factors)
    alpha = fp.forcast_alpha(coef_, df1, factors)

    df1['Codes'] = stock
    df1['alpha'] = alpha
    df1 = df1.sort_values(['alpha'], ascending=False).head(30)
    return df1