Example #1
0
    def prepare(self, train, test, feat='1'):
        """Build feature matrices and label vectors for a train/test split.

        Parameters
        ----------
        train, test : iterables of dict-like elements; each element must be
            accepted by ``self.compute`` and carry a ``'label'`` key (and,
            for ``feat='2'``, a ``'tweet_id'`` key present in ``self.base``).
        feat : ``'1'`` uses only the computed features; ``'2'`` concatenates
            the precomputed ``self.base`` features and zeroes out columns
            that are entirely NaN.

        Returns
        -------
        (X, Y, x, y) : imputed+standardized train features, train labels,
            test features, test labels (labels as ``np.array``).

        Raises
        ------
        ValueError
            If ``feat`` is neither ``'1'`` nor ``'2'`` (the original code
            fell through and crashed later with a confusing NameError).
        """
        def zero_all_nan_columns(mat):
            # A column that is entirely NaN carries no information and
            # cannot be imputed from data; fill it with 0 so that
            # imputer/standardize stay well-defined.
            for col in range(mat.shape[1]):
                if np.all(np.isnan(mat[:, col])):
                    mat[:, col].fill(0)

        if feat == '1':
            X = [self.compute(elem) for elem in train]
            x = [self.compute(elem) for elem in test]
        elif feat == '2':
            X = np.array([self.compute(elem) + self.base[elem['tweet_id']]
                          for elem in train], dtype=float)
            zero_all_nan_columns(X)
            x = np.array([self.compute(elem) + self.base[elem['tweet_id']]
                          for elem in test], dtype=float)
            zero_all_nan_columns(x)
        else:
            raise ValueError("unknown feature set: %r" % (feat,))

        Y = [elem['label'] for elem in train]
        y = [elem['label'] for elem in test]

        # Impute remaining NaNs, then standardize each feature column.
        X = standardize(imputer(X))
        Y = np.array(Y)
        x = standardize(imputer(x))
        y = np.array(y)

        return X, Y, x, y
Example #2
0
def PCC(dataset):
    crossplatform = [
        'dist mean', 'dist var', 'agree mean', 'agree var', 'disagree mean',
        'disagree var', 'discuss mean', 'discuss var', 'unrelated mean',
        'unrelated var'
    ]
    s = Supervise()
    fs, ls = [], []
    for elem in dataset:
        feats = s.compute(elem)
        fs.append(feats)
        ls.append(elem['label'])

    ms = np.array(fs, dtype=float)
    # replace all with 0
    for i in xrange(ms[0, :].size):
        if np.all(np.isnan(ms[:, i])):
            ms[:, i].fill(0)
    ms = imputer(ms)
    ms = standardize(ms)
    ls = np.array(ls)
    # print('mean: ', pearsonr(ms, ls))
    # print('var: ', pearsonr(vs, ls))
    order = []
    for i in xrange(10):
        order.append((crossplatform[i], pearsonr(ms[:, i], ls)[0]))
    order = sorted(order, key=lambda x: abs(x[1]), reverse=True)
    print 'Features with the highest PCC:'
    print order
Example #3
0
def train_test(train_X, train_Y, test_X, test_Y):
    """Train the NN classifier on the train split and score the test split.

    Returns the ``metrics(...)`` result, or None when ``test_X`` is
    entirely NaN (nothing meaningful to evaluate).
    """
    np.random.seed(38)  # fixed seed for reproducible weight initialization
    if np.all(np.isnan(test_X)):
        return
    # NOTE(review): train and test are imputed/standardized independently;
    # if standardize fits per-matrix statistics this scales the two splits
    # inconsistently -- confirm this is intended.
    train_X = standardize(imputer(train_X))
    test_X = standardize(imputer(test_X))

    dropout = 0.5
    epochs = 23
    batch_size = 10
    hidden = 20
    clf = build_model(hidden=hidden, dropout=dropout)
    # Pass epochs/batch_size by keyword: against the Keras 2 signature
    # fit(x, y, batch_size=None, epochs=1, ...) the original positional
    # call silently swapped them (batch_size=23, epochs=10).
    clf.fit(train_X, train_Y, epochs=epochs, batch_size=batch_size,
            verbose=False)
    return metrics(test_Y, clf.predict_classes(test_X))
Example #4
0
    def generate_factor(self):
        """Build ``self.factor``: a standardized, industry-aggregated
        20-day log-return signal over ``self.stocks``.

        Reads per-stock close prices and adjustment factors from
        ``gc.DATABASE_PATH`` and industry membership matrices from
        ``gc.FACTORBASE_PATH``, then stores the standardized result
        restricted to [self.start_date, self.end_date] in ``self.factor``.
        """
        # Per-stock raw close series, one column per stock, date index.
        CLOSE = DataFrame({
            stock: pd.read_csv('%s/StockDailyData/Stock/%s.csv' %
                               (gc.DATABASE_PATH, stock),
                               index_col=[0],
                               parse_dates=[0]).loc[:, 'close']
            for stock in self.stocks
        })
        # Matching adjustment factors (splits/dividends) per stock/date.
        ADJ = DataFrame({
            stock: pd.read_csv('%s/StockDailyData/Stock/%s.csv' %
                               (gc.DATABASE_PATH, stock),
                               index_col=[0],
                               parse_dates=[0]).loc[:, 'adj_factor']
            for stock in self.stocks
        })

        # Carry the last known value over gaps (non-trading days / missing rows).
        CLOSE.fillna(method='ffill', inplace=True)
        ADJ.fillna(method='ffill', inplace=True)

        # Adjusted close, then 20-day log return per stock.
        CLOSE = CLOSE * ADJ
        r = np.log(CLOSE).diff(20)
        # Files starting with '8' are presumably the industry membership
        # matrices in the factor base -- TODO confirm naming convention.
        ind_files = os.listdir('%s/Data' % gc.FACTORBASE_PATH)
        ind_files = list(filter(lambda x: x[0] == '8', ind_files))

        a = DataFrame(0, index=r.index, columns=r.columns)
        for ind_file in ind_files:
            ind_df = pd.read_csv('%s/Data/%s' % (gc.FACTORBASE_PATH, ind_file),
                                 index_col=[0],
                                 parse_dates=[0])
            # Align the membership matrix to the return panel's dates/stocks.
            ind_df = ind_df.loc[r.index, r.columns]
            # Zero entries mean "not in this industry"; turn them into NaN
            # so they drop out of the max/mul below instead of contributing 0.
            ind_df[ind_df == 0] = np.nan
            # (r * ind_df).max(1): per date, the largest return among the
            # industry's members; broadcast back onto every member stock
            # and accumulate across industry files (NaNs treated as 0).
            a = a.add(ind_df.mul((r * ind_df).max(1), axis=0), fill_value=0)
        # Restrict to the configured date window, then standardize.
        a = a.loc[a.index >= self.start_date, :]
        a = a.loc[a.index <= self.end_date, :]
        self.factor = tools.standardize(a)
Example #5
0
def exp1():
    print 'extract features for experiment 1'
    with open('CCMR/CCMR_Twitter_t.txt') as f1, open(
            'CCMR/CCMR_Google_t.txt') as f2:
        twitter = json.load(f1)
        google = json.load(f2)

    # get split of train test
    cv_task = task_split(twitter)
    cv_event = event_split(twitter)

    print 'extract cpcl features'
    X, Y = extract_feats(twitter, google, embed='complete', agree='complete')

    X_fill = imputer(X)
    X_fill = standardize(X_fill)
    scores = []
    for i in xrange(10):
        p = pearsonr(X_fill[:, i], Y)
        print p
        scores.append(abs(p[0]))
    print 'average: ', np.average(scores)

    with open('CLCP/Twitter_CLCP_via_Google.pkl', 'wb') as f:
        pkl.dump(((X, Y), (cv_task, cv_event)), f)
Example #6
0
 def update_factor(self):
     """Regenerate the factor, (optionally) neutralize it, and append any
     newly generated dates to the factor's CSV in the factor base.
     """
     self.generate_factor()
     # Industry neutralization -- currently disabled (original condition
     # kept in the comment below).
     #if 'industry' in self.neutral_list:
     if False:
         industrys = tools.get_industrys('L1', self.stocks)
         tmp = {}
         for k in industrys.keys():
             if len(industrys[k]) > 0:
                 tmp[k] = industrys[k]
         industrys = tmp
         factor = tools.standardize_industry(self.factor, industrys)
     # Market-cap neutralization -- currently disabled as well.
     #if 'market_capitalization' in self.neutral_list:
     if False:
         market_capitalization = DataFrame({
             stock:
             pd.read_csv('%s/StockTradingDerivativeData/Stock/%s.csv' %
                         (gc.DATABASE_PATH, stock),
                         index_col=[0],
                         parse_dates=[0]).loc[:, 'TOTMKTCAP']
             for stock in self.stocks
         })
         market_capitalization = np.log(market_capitalization)
         if self.start_date:
             market_capitalization = market_capitalization.loc[
                 market_capitalization.index >= self.start_date, :]
         if self.end_date:
             market_capitalization = market_capitalization.loc[
                 market_capitalization.index <= self.end_date, :]
         #if 'industry' in self.neutral_list:
         if True:
             market_capitalization = tools.standardize_industry(
                 market_capitalization, industrys)
         # Regress the factor on log market cap and keep the residual.
         beta = (factor * market_capitalization).sum(1) / (
             market_capitalization * market_capitalization).sum(1)
         factor = factor - market_capitalization.mul(beta, axis=0)
     self.factor.fillna(0, inplace=True)
     factor = tools.standardize(self.factor)
     if os.path.exists('%s/Data/%s.csv' %
                       (gc.FACTORBASE_PATH, self.factor_name)):
         factor_old = pd.read_csv('%s/Data/%s.csv' %
                                  (gc.FACTORBASE_PATH, self.factor_name),
                                  index_col=[0])
         # Append only the rows that are newer than what is already on
         # disk.  BUG FIX: the original compared factor.index against its
         # OWN last entry (factor.index > factor.index[-1]), which is
         # always empty, so new dates were never appended; compare against
         # the stored file's last date instead.
         factor = pd.concat(
             [factor_old,
              factor.loc[factor.index > factor_old.index[-1], :]],
             axis=0)
         factor.sort_index(axis=0, inplace=True)
         factor.sort_index(axis=1, inplace=True)
     factor.to_csv('%s/Data/%s.csv' %
                   (gc.FACTORBASE_PATH, self.factor_name))