Example #1
def main(idir, odir, featuredir, N=50, generatetxt=False, selectfeatures=False):
    if generatetxt:
        start = [2017, 1, 1]
        end = [2017, 9, 1]
        xls2txt(idir, odir, start, end)
    if selectfeatures:
        feature_selection(odir, featuredir)
    date = readdate(os.path.join(odir, 'datetime.txt'))
    length = len(date) - 1
    features = {}
    with open(os.path.join(featuredir, 'features.txt'), 'r') as f:
        for line in f:
            features[line.strip()] = readdata(os.path.join(odir, line.strip()))
    r_abnormities = []
    with open(os.path.join(featuredir, 'abnormities.txt'), 'r') as f:
        for line in f:
            r_abnormities.append(line.strip())
    r_abnormities_ts = str2timestamp(r_abnormities)
    abnormities = gaussian_detection(features, phi=1.96)
    f1_scores = {}
    for feature in abnormities:
        abnormity = abnormities[feature]
        f1_scores[feature] = adj_f1(str2timestamp(date[i] for i in abnormity), r_abnormities_ts)
    count = weighted_count(abnormities, f1_scores, length)
    print(get_abnormity(count, date, N))
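The `gaussian_detection` helper is not shown in this example. A minimal z-score sketch that matches how it is called above (a dict of feature series in, a dict of abnormal index lists out, `phi` as the threshold in standard deviations) might look like the following; the behaviour is an assumption, not the original implementation.

import numpy as np

# Hedged sketch: flag indices lying more than phi standard deviations from the mean.
def gaussian_detection(features, phi=1.96):
    abnormities = {}
    for name, values in features.items():
        x = np.asarray(values, dtype=float)
        mu, sigma = x.mean(), x.std()
        if sigma == 0:
            abnormities[name] = []
            continue
        abnormities[name] = [i for i, v in enumerate(x) if abs(v - mu) > phi * sigma]
    return abnormities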
Example #2
def read_dataset(size_training):

    full_train = pd.read_csv("../../input/train.csv")
    selected_train = feature_selection(full_train)
    zero_train = selected_train.loc[selected_train['target'] == 0].values
    one_train = selected_train.loc[selected_train['target'] == 1].values

    zero_features_train = zero_train[:, 2:]
    zero_targets_train = zero_train[:, 1]
    one_features_train = one_train[:, 2:]
    one_targets_train = one_train[:, 1]

    random_training_zero = list(range(zero_targets_train.shape[0]))
    random.shuffle(random_training_zero)
    zero_features_train = zero_features_train[
        random_training_zero[0:size_training], :]
    zero_targets_train = zero_targets_train[
        random_training_zero[0:size_training]]

    features_train = concatenate((zero_features_train, one_features_train),
                                 axis=0).astype(float64)
    targets_train = concatenate((zero_targets_train, one_targets_train),
                                axis=0).astype(int)

    full_test = pd.read_csv("../../input/test.csv")
    test = feature_selection(full_test).values
    features_test = test[:, 1:].astype(float64)

    return features_train, targets_train, features_test
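This snippet appears to rely on star imports for `concatenate` and `float64`; under that assumption, the missing imports would be roughly the following (`feature_selection` itself is a project-local helper).

# Assumed imports for the snippet above (not shown in the source)
import random
import pandas as pd
from numpy import concatenate, float64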
Example #3
def main():
    feature_selection()
    linear_regression_predict()
    polynomial_regression_predict()
    randomforest_predict()
    bp_predict()
    svr_predict()
Example #4
    def feature_selection_topn(self):

        #f = open("C:\\Users\\Administrator\\Desktop\\python note\\craw\\taobaomm\\sj_names.txt")
        module_path = dirname(__file__)
        f = open(join(module_path, 'sj_names.txt'))
        class_list = []
        term_str = []
        try:
            for line in f:
                lt = line.split(',')
                if lt[1] == '全部':  # skip the catch-all "全部" (All) category
                    continue
                class_list.append(lt[0])
                temstr = lt[2].split('(')
                seg_list = jieba.cut_for_search(temstr[0])  # search-engine-mode segmentation
                terlist = ", ".join(seg_list)  # join the segments into a comma-separated string
                try:
                    term_str.append([
                        term.strip() for term in terlist.split(',')
                        if len(term.strip()) > 1
                    ])  # strip whitespace and convert to a list
                except UnicodeEncodeError:
                    print 'err'
        finally:
            f.close()
        print len(term_str)
        term_set_fs = fs.feature_selection(term_str, class_list,
                                           'IG')[:2000]  # keep the 2000 terms with the highest information gain
        self.term_set_dict = dict(zip(term_set_fs,
                                      range(len(term_set_fs))))  # build a term -> index dictionary
        f2 = open('feature_term_result.txt', "w+")
        for i in range(len(term_set_fs)):
            term = term_set_fs[i]
            f2.write(term.encode('utf-8') + '\n')
        f2.close()
Example #5
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print 'Loading dataset, 80% for training, 20% for testing...'
    movie_reviews = load_files(dataset_dir_name)  
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(movie_reviews.data, movie_reviews.target, test_size = 0.2, random_state = 0)
    
    print 'Feature selection...'
    print 'fs method:' + fs_method, 'fs num:' + str(fs_num)
    vectorizer = CountVectorizer(binary = True)   
    word_tokenizer = vectorizer.build_tokenizer()
    doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in doc_str_list_train]
    term_set_fs = feature_selection.feature_selection(doc_terms_list_train, doc_class_list_train, fs_method)[:fs_num]
    
    print 'Building VSM model...'
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary_ = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec= vectorizer.transform(doc_str_list_test)
    
    clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # fit the MultinomialNB classifier
    doc_test_predicted = clf.predict(doc_test_vec)
    
    acc = np.mean(doc_test_predicted == doc_class_list_test)  
    print 'Accuracy: ', acc
    
    return acc
Example #6
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print('Loading dataset, 80% for training, 20% for testing...')
    movie_reviews = load_files(dataset_dir_name)
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(
        movie_reviews.data,
        movie_reviews.target,
        test_size=0.2,
        random_state=0)

    print('Feature selection...')
    print('fs method:' + fs_method, 'fs num:' + str(fs_num))
    vectorizer = CountVectorizer(binary=True)
    word_tokenizer = vectorizer.build_tokenizer()
    doc_terms_list_train = [
        word_tokenizer(doc_str) for doc_str in doc_str_list_train
    ]
    term_set_fs = feature_selection.feature_selection(doc_terms_list_train,
                                                      doc_class_list_train,
                                                      fs_method)[:fs_num]

    print('Building VSM model...')
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary_ = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)

    clf = MultinomialNB().fit(doc_train_vec,
                              doc_class_list_train)  # fit the MultinomialNB classifier
    doc_test_predicted = clf.predict(doc_test_vec)

    acc = np.mean(doc_test_predicted == doc_class_list_test)
    print('Accuracy: ', acc)

    return acc
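The module-level `feature_selection.feature_selection(terms, labels, fs_method)` helper used in several of these examples is not shown. Assuming it simply ranks candidate terms by a relevance score, a minimal sketch, using scikit-learn's chi-square statistic in place of the 'IG'/'CHI'/'MI' variants the real module presumably supports and assuming scikit-learn >= 1.0 for `get_feature_names_out`, could be:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2

# Hedged sketch: rank terms by chi-square score against the class labels (best first).
def feature_selection(doc_terms_list, doc_class_list, fs_method='CHI'):
    # documents are already token lists, so pass them through unchanged
    vectorizer = CountVectorizer(analyzer=lambda tokens: tokens, binary=True)
    X = vectorizer.fit_transform(doc_terms_list)
    scores, _ = chi2(X, doc_class_list)
    terms = np.asarray(vectorizer.get_feature_names_out())
    return list(terms[np.argsort(scores)[::-1]])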
Example #7
def cross_validation(y, x, k_indices, k, lambda_):
    train_idxs = [
        n for (i, idxs) in enumerate(k_indices) for n in idxs if i != k
    ]
    test_idxs = k_indices[k]
    x_train, y_train = x[train_idxs], y[train_idxs]
    x_test, y_test = x[test_idxs], y[test_idxs]
    tx_train = np.c_[np.ones(len(y_train)), x_train]
    x_test = np.c_[np.ones(len(y_test)), x_test]

    x_train, x_test, w, indices = feature_selection(x_train, y_train, x_test,
                                                    y_test, lambda_)
    loss_tr = compute_mse(y_train, x_train, w)
    loss_te = compute_mse(y_test, x_test, w)

    return loss_tr, loss_te, w, indices
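`compute_mse` is assumed here to be the usual half mean squared error of a linear model, which is consistent with how it is called (targets, design matrix, weights):

import numpy as np

# Assumed helper: half mean squared error of the linear model tx @ w against targets y.
def compute_mse(y, tx, w):
    e = y - tx.dot(w)
    return e.dot(e) / (2 * len(e))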
Example #8
    def fit(self, Xs, y, time_ramain):
        #self.tables = copy.deepcopy(Xs)
        Xs, self.y = data_sample(Xs, y)
        self.Xs = copy.deepcopy(Xs)

        X, y, feature_names, cat_feature_map, stampcol = baseline_features(
            Xs, y, self.config)

        features_from_base, self.feature_selection_models = feature_selection(
            X, y, int(len(X.columns) / 5), feature_names, cat_feature_map)

        X, self.cat_dict_counts = cat_value_counts(
            X, list(cat_feature_map.keys()))

        X = pd.concat([X, features_from_base], axis=1)

        #
        #
        # one_hot_feature,models = onehot_feature_selection(X, y, cat_feature_map.keys(), feature_num_everyiter=1)
        #
        # one_hot_feature = pd.DataFrame(one_hot_feature,columns=["one_hot_feature"])
        #
        # print(X.shape)
        #
        # X = pd.concat([X,one_hot_feature],axis=1)
        #
        # print(X.shape)

        #features_from_base,self.feature_selection_models = feature_selection(X, y ,20,feature_names, cat_feature_map)

        #
        #timestamp_features(X, y, features_from_base, cat_feature_map,  self.config,stampcol)

        # X=polyfeatures(X)
        # # model=XGBClassifier()
        # # model.fit(X, y)
        # # print(model.feature_importances_)

        train(X, y, self.config)
Example #9
def load_dataset(split):
    df = pd.read_csv('data/text_emotion.csv')
    df.columns = ['id', 'class', 'author', 'tweet']

    if os.path.exists('data_ml/text_emotion_features.npy'):
        X = np.load('data_ml/text_emotion_features.npy')
    else:
        print('Fix encoding...')
        df = fix_encoding(df)
        print('Split sentences...')
        df = split_tweet_sentences(df)
        print('Tokenize tweets...')
        df = tokenize_tweets(df)
        print('Lemmatize tweets...')
        df = get_lemmas(df)
        lexicon = pd.read_csv('lexicons/Ratings_Warriner_et_al.csv', usecols=[0, 1, 2, 5], index_col=0)
        lexicon.columns = ['word', 'valence', 'arousal']
        path_to_jar = 'stanford_parser/stanford-parser.jar'
        path_to_models_jar = 'stanford_parser/stanford-parser-3.9.1-models.jar'
        valence_shifter = FeatureExtractionContextValenceShifting(path_to_jar, path_to_models_jar, lexicon)
        df = valence_shifter.get_initial_valences(df)
        featured_dataset, vocab = generate_initial_features(df)
        X = featured_dataset['valences'].values.tolist()[:split]
        y = featured_dataset['class'].values.tolist()[:split]
        selected, mask = feature_selection(X, y, vocab)
        for index, row in featured_dataset.iterrows():
            valences = np.array(row.valences[mask])
            featured_dataset.set_value(index=index, col='valences', value=valences)
        X = np.vstack(featured_dataset.valences.values)
        np.save('data_ml/text_emotion_features', X)

    classes = df['class'].values.tolist()
    c = np.unique(classes).tolist()
    d = dict([(y, x) for x, y in enumerate(c)])
    classes = np.array([d[x] for x in classes])

    return X, classes, len(c)
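Note that `DataFrame.set_value`, used above, was deprecated in pandas 0.21 and removed in 1.0; on current pandas the equivalent cell assignment (the 'valences' column already holds list-like values, so its dtype is object) would be:

# pandas >= 1.0 replacement for featured_dataset.set_value(index=index, col='valences', value=valences)
featured_dataset.at[index, 'valences'] = valences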
Example #10
    models = ['Linear', 'Ridge', 'AdaBoost', 'RandomForest', 'SVM']

    for imp_method in ['Mode', 'KNN']:
        for outcome in outcomes:
            print("Generating results for {}".format(outcome))
            features_path = 'features_{}_{}.csv'.format(outcome, imp_method)
            labels_path = 'labels_{}_{}.csv'.format(outcome, imp_method)

            print("Loading dataset...")
            X, Y = get_data(args.data_dir, features_path, labels_path)
            is_classf = Y.dtype == np.int8
            print("Successfully loaded dataset.")

            for fs_method in fs_methods:
                if fs_method:
                    print("Performing feature selection using {}...".format(
                        fs_method))
                    print(X.shape)
                    X_subset = feature_selection(X,
                                                 Y,
                                                 outcome,
                                                 fs_method,
                                                 imp_method,
                                                 args.data_dir,
                                                 verbose=1)
                    print(X_subset.shape)
                """
                for model in models:
                    train(model, X_subset, Y, is_classf, outcome, fs_method, 
                        imp_method, args.data_dir, args.results_dir, verbose=1)
                """
Example #11
def main():
    """Wrapper function which calls all the other functions"""
    rolled_df = deriving_features.create_dataframe_with_features()
    print(rolled_df.columns)
    target = input("Enter the column name of y variable:")
    e = eda(rolled_df, target)
    event = input("Enter the event:")
    er = e.eventRatio(event)
    print(er)
    stat = e.impstat()
    rng = e.range()
    rng = rng.rename('Range')
    print("Size is")
    print(rng.size)
    iq = e.iqr()
    iq = iq.rename('IQR')
    cor = e.corr()
    print(cor)
    ske = e.skew()
    print(ske)
    ske = ske.rename('Skewness')
    kur = e.kurt()
    print(kur)
    kur = kur.rename('Kurtosis')
    [mi, mi1] = e.missinginfo()
    print("missing value is")
    print(pd.Series(mi1[1:]))
    mi1 = mi1.rename('Missing values')
    e.missingplot()
    b = e.bin('woe', 'y', 'yes', 'pdays', 'day')
    print(b)
    try:
        os.remove('D:/Other projects/python modules/Report.xlsx')
        engine = 'xlsxwriter'
        writer = pd.ExcelWriter('Report.xlsx', engine=engine)
        stat1 = pd.DataFrame(stat.T)
        print(stat1)
        stat1.to_excel(writer, startcol=0, startrow=5)
        ws = writer.sheets['Sheet1']
        ws.write_string(1, 4, 'DataDescription')
        rng.to_excel(writer, startcol=9, startrow=5, index=False)
        iq.to_excel(writer, startcol=10, startrow=5, index=False)
        ske.to_excel(writer, startcol=11, startrow=5, index=False)
        kur.to_excel(writer, startcol=12, startrow=5, index=False)
        ws.write_string(5 + rng.size + 2, 5, 'Correlation')
        cor.to_excel(writer, startcol=0, startrow=5 + rng.size + 4)
        #        mi1[1:].to_excel(writer,startcol=rng.size+2,startrow=5+rng.size+4)
        b.to_excel(writer, startcol=12, startrow=5 + rng.size + 4)
        #        ws.write_string(5+rng.size+2,14,'Binning')
        #        misplot.to_excel(writer,startcol=0,startrow=rng.size+rng.size+5+3+5)
        writer.close()
    except:
        engine = 'xlsxwriter'
        writer = pd.ExcelWriter('Report.xlsx', engine=engine)
        stat1 = pd.DataFrame(stat.T)
        print(stat1)
        stat1.to_excel(writer, startcol=0, startrow=5)
        ws = writer.sheets['Sheet1']
        ws.write_string(1, 4, 'DataDescription')
        rng.to_excel(writer, startcol=9, startrow=5, index=False)
        iq.to_excel(writer, startcol=10, startrow=5, index=False)
        ske.to_excel(writer, startcol=11, startrow=5, index=False)
        kur.to_excel(writer, startcol=12, startrow=5, index=False)
        ws.write_string(5 + rng.size + 2, 5, 'Correlation')
        cor.to_excel(writer, startcol=0, startrow=5 + rng.size + 4)
        mi1[1:].to_excel(writer,
                         startcol=rng.size + 2,
                         startrow=5 + rng.size + 4)
        writer.close()
    ft = feature_transformation(rolled_df, target)
    p = True
    while (p):
        degree = input(
            "Enter the degree of the polynomial features you want to derive")
        try:
            degree = int(degree)
            p = False
        except:
            print("You did not enter correct value. Try again")
            p = True

    poly_feature_set = ft.poly_features()
    feature_transformed_df = ft.transformation()
    cols_to_use = poly_feature_set.columns.difference(
        feature_transformed_df.columns)
    final_df = pd.merge(feature_transformed_df,
                        poly_feature_set[cols_to_use],
                        left_index=True,
                        right_index=True,
                        how='outer')
    print(final_df.columns)
    cat = [x for x in final_df.columns
           if final_df[x].dtypes == 'object'].copy()
    label_encod = ft.label_encoding(final_df)
    one_hot = ft.one_hot_encoding(label_encod, cat)
    cols_use = one_hot.columns.difference(final_df.columns)
    final_f = pd.merge(final_df,
                       one_hot[cols_use],
                       left_index=True,
                       right_index=True,
                       how='outer')
    nystroem_rbf_dataframe = ft.kernel_transformation_using_nystroem_rbf(
        final_f, cat)
    cols_needed = nystroem_rbf_dataframe.columns.difference(final_df.columns)
    final_data = pd.merge(final_df,
                          nystroem_rbf_dataframe[cols_needed],
                          left_index=True,
                          right_index=True,
                          how='outer')
    f = 0
    p1 = True
    while (p1):
        try:
            p1 = False
            f = input(
                "Enter 1 if you want to write the dataframe into a csv file else enter 0:"
            )
            if (int(f) == 1):
                path = input("Enter the path where you want to save:")
                final_data.to_csv(path, index=False)
        except:
            p1 = True
            print("You have entered wrong value. Please try again.")

#    ml=machine_learning(final_data,target)
    datecol = [
        x for x in final_data.columns
        if final_data[x].dtypes == 'datetime64[ns]'
    ]
    X1 = [
        x for x in final_data.columns if final_data[x].dtypes != 'object'
        and x not in datecol and x not in target
    ]
    X = [x for x in X1 if x not in cat]
    fs = feature_selection(final_data, target)
    fs.recursive_feature_elimination(X)
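`feature_selection.recursive_feature_elimination` is not shown in this example. A hypothetical sketch built on scikit-learn's RFE, matching the way the class is constructed and called above and assuming a classification target (the `n_features` default is also an assumption), might be:

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Hedged sketch of the feature_selection class used above.
class feature_selection:
    def __init__(self, df, target):
        self.df = df
        self.target = target

    def recursive_feature_elimination(self, columns, n_features=10):
        # keep the n_features columns selected by RFE with a logistic-regression base estimator
        X, y = self.df[columns], self.df[self.target]
        selector = RFE(LogisticRegression(max_iter=1000), n_features_to_select=n_features)
        selector.fit(X, y)
        return [c for c, keep in zip(columns, selector.support_) if keep]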
Example #12
    # pre-training/initialization of the parameters of 1LNN:
    w_h1, w_o, b1, bo, t1, accuracy1[t], F1[t] = NN_pretraining_one(
        trX, trY, teX, teY, K)
    time1[t] = t1 / 10000

    # f_subset = np.arange(19)  # respiration features
    # f_subset = np.arange(19,32,1) # wrist features
    f_subset = [25, 29]  # median of (roll,pitch)
    # f_subset = np.arange(37)  # all the features
    # number of features used in the first stage
    f = 5
    # indicator function whether or not to use z-normalization, set z to 1 when we have 5 features
    z = 1

    # training and testing data in the first stage, trX2 and teX2:
    trX2, teX2 = feature_selection(trX, teX, f_subset, f, z)

    # firm cascade:
    plambda = [0.25]
    t3, accuracy3[t], F3[t], nnz3[t] = cascade_two_stage(
        trX, trY, teX, teY, trX2, teX2, w_h1, w_o, b1, bo, plambda, a)
    time3[t] = t3 / 10000

    # soft cascade:
    beta = [0.0001]
    t2, accuracy2[t], F2[t], nnz2[t] = soft_cascade_LR_1LNN(
        trX, trY, teX, teY, trX2, teX2, beta, K)
    time2[t] = t2 / 10000

    t += 1
Example #13
def text_classifly_twang(vectorizer, doc_str_list_train, doc_str_list_test,
                         doc_class_list_train, doc_class_list_test,
                         doc_terms_list_train, fs_method, fs_num):
    # the dataset directory contains one sub-folder per class; the folder name is the class label and it holds that class's txt files
    # fs_method is the feature selection method
    # fs_num is the number of top-ranked features kept after feature selection
    print('Loading dataset, 80% for training, 20% for testing...')

    # print(doc_class_list_train)  # a list made up of 1s and 0s
    print('Feature selection...')
    print('fs method:' + fs_method, 'fs num:' + str(fs_num))

    # an encoding problem came up at first; it was fixed by setting the decode_error
    # parameter to u'ignore' (the default is u'strict'), see:
    #http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

    # print((word_tokenizer(('i am new student\nyou good'))))#return a list of word_tokenizer
    # print(list(jieba.cut('我是撒谎比\n你是\t杀吧', cut_all = False )))

    # doc_terms_list_train = [word_tokenizer(str(doc_str, encoding = 'utf-8', errors = 'ignore')) for doc_str in doc_str_list_train]

    selectedFeatures = feature_selection.feature_selection(
        doc_terms_list_train, doc_class_list_train, fs_method)

    print(len(selectedFeatures))
    term_set_fs = selectedFeatures[:fs_num]
    # print('-----------',len(term_set_fs))

    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    #print the number of features
    # print(doc_train_vec.shape)
    # print((doc_train_vec))
    #scipy.sparse.csr.csr_matrix
    # print(type(doc_train_vec))
    doc_test_vec = vectorizer.transform(doc_str_list_test)

    # print('Building Naive Beyas model...')
    # clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # fit the MultinomialNB classifier
    # doc_test_predicted = clf.predict(doc_test_vec)

    # print('Building SVM model...')
    # svclf = SVC(kernel = 'linear')#default with 'rbf'
    # svclf.fit(doc_train_vec, doc_class_list_train)
    # doc_test_predicted = svclf.predict(doc_test_vec)

    print('Building KNN model...')
    knnclf = KNeighborsClassifier()
    knnclf.fit(doc_train_vec, doc_class_list_train)
    doc_test_predicted = knnclf.predict(doc_test_vec)

    # RBM cannot be used here: BernoulliRBM has no predict method
    # from sklearn.neural_network import BernoulliRBM
    # RBMclf = BernoulliRBM().fit(doc_train_vec, doc_class_list_train)
    # doc_test_predicted = RBMclf.predict(doc_test_vec)

    # print('Building Multilayer perception classifier model...')
    # mlpclf = MLPClassifier(solver = 'lbfgs', alpha = 1e-5, hidden_layer_sizes = (50,20), random_state = 1)
    # mlpclf.fit(doc_train_vec, doc_class_list_train)
    # doc_test_predicted = mlpclf.predict(doc_test_vec)

    acc = np.mean(doc_test_predicted == doc_class_list_test)
    print('Accuracy:', acc)
    return acc
Example #14
    predictions = model.predict(df)
    result = load_submission()
    result['SalePrice'] = predictions
    if save_csv:
        result.to_csv('../data/processed/test_results.csv', index=False)

    return result


def load_submission(path='../data/raw/sample_submission.csv'):
    return pd.read_csv(path)


def preprocessing(df):
    df = pd.get_dummies(df, drop_first=True)
    return df


def load_model(filename="../models/2-gradient-boosting.sav"):
    return joblib.load(filename)


if __name__ == "__main__":
    os.chdir(os.path.dirname(sys.argv[0]))
    model = load_model()
    test = pd.read_csv('../data/raw/test.csv', index_col=0)
    test = clean(test, to_test=True)
    test = feature_selection(test)
    test = preprocessing(test)
    print(predict_results(model, test))
Example #15
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))


trainX = train.drop(["ID", "y"], axis=1).values
trainY = train['y'].values

model = feature_selection.feature_selection(
    trainX, trainY, chi2, method="SelectKBest", k=150)

# trainX_new = model.transform(trainX)
params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.003 # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'l1'          # or 'mae'
params['sub_feature'] = 0.95      # feature_fraction
params['bagging_fraction'] = 0.85 # sub_row
params['bagging_freq'] = 20
params['num_leaves'] = 512        # num_leaf
params['min_data'] = 500         # min_data_in_leaf
params['min_hessian'] = 0.05     # min_sum_hessian_in_leaf
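The `feature_selection.feature_selection` wrapper called here (a score function, a `method` name and `k`) is not part of scikit-learn; a minimal sketch that would satisfy this call and return a fitted selector, so that `model.transform(trainX)` works as in the commented-out line, could be the following. Note that `chi2` requires non-negative feature values.

from sklearn.feature_selection import SelectKBest

# Hedged sketch: fit and return a univariate selector; only SelectKBest is handled.
def feature_selection(X, y, score_func, method="SelectKBest", k=150):
    if method != "SelectKBest":
        raise ValueError("unsupported method: %s" % method)
    return SelectKBest(score_func=score_func, k=k).fit(X, y)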
Example #16
def run_model_recursive(apt_fname, tr_te_split, res_file_pref, freq,
                        is_feat_select, is_draw):
    ############
    # load data
    ############
    print('========================================' * 2)
    print(apt_fname, res_file_pref)
    print(tr_te_split)

    df = load_data(apt_fname, freq)
    train = df[tr_te_split['trb']:tr_te_split['tre']]
    test = df[tr_te_split['teb']:tr_te_split['tee']]
    print(test)
    print('train/test:', train.shape, test.shape)

    feat = list(train.columns.values)
    feat.remove('energy')
    feat.remove('raw_energy')
    print('features (%d):' % len(feat), feat)
    print('index of energy-1:', feat.index('energy-1'))

    X_train = train[feat].as_matrix()
    y_train = train['energy'].as_matrix()
    X_test = test[feat].as_matrix()
    y_test = test['raw_energy'].as_matrix()

    print('train/test (after converting to matrix):', X_train.shape,
          X_test.shape)

    ####################
    # feature selection
    ####################
    if is_feat_select:
        print('feature selection ...')
        selected = feature_selection(X_train, y_train, 12)
        print(len(selected))
        print('selected features (%d):' % sum(selected),
              [feat[i] for i in range(len(selected)) if selected[i]])
        X_train = X_train[:, selected]
        X_test = X_test[:, selected]
        print('train/test (after feature selection):', X_train.shape,
              X_test.shape)
        res_file_pref += '_feature'

    ########
    # train
    ########
    print('training ...')
    parameters = {
        'n_estimators': (50, 100, 150, 200, 250, 300, 350, 400, 450, 500),
        'max_depth': [1, 2, 3],
        'learning_rate': [0.001, 0.01, 0.1],
        'random_state': [42],
        'loss': ['ls']
    }

    # parameters = {'n_estimators': (50,),
    #               'max_depth': [1],
    #               'learning_rate': [0.001],
    #               'random_state': [42],
    #               'loss': ['ls']}

    clf = GridSearchCV(GradientBoostingRegressor(),
                       param_grid=parameters,
                       cv=TimeSeriesSplit(n_splits=3),
                       scoring='neg_mean_squared_error')
    clf.fit(X_train, y_train)
    print(clf.best_params_)

    #######
    # test
    #######
    print('testing (recursive) ...')
    y_pred = []
    for i in range(len(X_test)):
        # print '-------' * 10
        # print 'i:', i
        # print 'y_pred:', y_pred
        # print 'feat:', X_test[i][18:]
        # print 'range(i):', range(i)
        for j in range(min(i, 49)):
            X_test[i][j + feat.index('energy-1')] = np.log(y_pred[-j - 1] + 1)
        # print 'feat:', X_test[i][18:]
        y_p = clf.predict([X_test[i]])[0]
        y_p = np.exp(y_p) - 1
        # print 'y_p:', y_p, np.log(y_p+1)
        y_pred.append(y_p)

    #############
    # evaluation
    #############
    mse = mean_squared_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print('MSE:', mse)
    print('MAPE:', mape)
    print('save result to file ...')
    pickle.dump({
        'y_test': y_test,
        'y_pred': y_pred
    }, open(res_file_pref + '_mse%.4f_mape%.4f.pkl' % (mse, mape), 'wb'))
    print('saved.')

    if is_draw:
        pyplot.plot(y_test)
        pyplot.plot(y_pred, color='red')
        pyplot.show()
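`mean_absolute_percentage_error` is not available as a scikit-learn metric in older releases; the usual definition, presumably what is intended here, is:

import numpy as np

# Assumed helper: mean absolute percentage error in percent (y_true must not contain zeros).
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100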
Example #17
    #all the work is done here
    def transform(self, X):
        X = pd.Series(X)
        X_tagged = X.apply(self.custom_feat).apply(pd.Series).fillna(0)
        X_tagged['n_tokens'] = X_tagged.apply(sum, axis=1)
        #print("Xxxx ", X_tagged)
        if self.normalize:
            X_tagged = X_tagged.divide(X_tagged['n_tokens'], axis=0).fillna(0)
        #print("X tagged ", X_tagged)
        return X_tagged


if __name__ == '__main__':

    qd = question_detector()
    feature_sel = feature_selection()
    #sys.exit()

    data = pd.read_csv(qd.input_file)
    X_train_1 = data['post_text']
    y_train_1 = data['category']
    print(X_train_1.shape, "\t", y_train_1.shape)

    data = pd.read_csv(qd.input_file_2)
    X_train_2 = data['post_text']
    y_train_2 = data['category']
    print(X_train_2.shape, "\t", y_train_2.shape)

    uniques, count = np.unique(y_train_2, return_counts=True)
    print(dict(zip(uniques, count)))
    sys.exit()
Example #18
import os


def preprocessing_train(df):
    df = pd.get_dummies(df, drop_first=True)
    return df


def train_model(df):
    X_train, y_train = df.drop(columns=['SalePrice']), df[['SalePrice']]
    X_train = preprocessing_train(X_train)
    model = GradientBoostingRegressor(n_estimators=3500,
                                      learning_rate=0.01,
                                      max_depth=4,
                                      max_features='sqrt',
                                      min_samples_leaf=15,
                                      min_samples_split=10,
                                      loss='huber',
                                      random_state=42)
    model.fit(X_train, y_train)
    return model


if __name__ == "__main__":
    os.chdir(os.path.dirname(sys.argv[0]))
    df = pd.read_csv('../data/raw/train.csv', index_col=0)
    df = clean(df)
    df = feature_selection(df)
    model = train_model(df)
    joblib.dump(model, '../models/2-gradient-boosting.sav')
Example #19
def text_classifly_twang(vectorizer, doc_str_list_train, doc_str_list_test,
                         doc_class_list_train, doc_class_list_test,
                         doc_terms_list_train, fs_method, fs_num, cf_method):
    print('Loading dataset, 80% for training, 20% for testing...')
    print('Feature selection...')
    print('fs method:' + fs_method, 'fs num:' + str(fs_num))

    selectedFeatures = feature_selection.feature_selection(doc_terms_list_train, doc_class_list_train, fs_method)
    print('-------select feature_selection')
    numShow = 500
    count = 0

    # print the selected feature terms
    # for i in selectedFeatures[1:numShow]:
    #     print(count, ' \t',i)
    #     count += 1

    print('number of feature terms:')
    print(len(selectedFeatures))
    term_set_fs = selectedFeatures[:fs_num]  # keep the top fs_num feature terms
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))  # term dictionary: key = term, value = index

    # term-frequency matrix
    vectorizer.fixed_vocabulary = True  # fix the vocabulary so only terms in the dictionary are counted
    vectorizer.vocabulary = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec= vectorizer.transform(doc_str_list_test)

    if(cf_method == 'nb'):
        print('Building Naive Bayes model...')
        clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # fit the MultinomialNB classifier
        doc_test_predicted = clf.predict(doc_test_vec)
    elif(cf_method == 'svm'):
        print('Building SVM model...')
        svclf = SVC(kernel = 'linear')#default with 'rbf'
        svclf.fit(doc_train_vec, doc_class_list_train)
        doc_test_predicted = svclf.predict(doc_test_vec)
    elif(cf_method == 'knn'):
        print('Building KNN model...')
        knnclf = KNeighborsClassifier(5)  # k is passed as the argument
        knnclf.fit(doc_train_vec, doc_class_list_train)
        doc_test_predicted = knnclf.predict(doc_test_vec)
    elif(cf_method == 'bp'):
        print('Building Multilayer perception classifier model...')
        mlpclf = MLPClassifier(solver = 'lbfgs', alpha = 1e-5, hidden_layer_sizes = (50,20), random_state = 1)
        mlpclf.fit(doc_train_vec, doc_class_list_train)
        doc_test_predicted = mlpclf.predict(doc_test_vec)

    # RBM cannot be used here: BernoulliRBM has no predict method
    # from sklearn.neural_network import BernoulliRBM
    # RBMclf = BernoulliRBM().fit(doc_train_vec, doc_class_list_train)
    # doc_test_predicted = RBMclf.predict(doc_test_vec)

    # print the accuracy
    acc = np.mean(doc_test_predicted == doc_class_list_test)
    printOption = True
    if (printOption):
        print('Accuracy:', acc)

    f1_micro = metrics.f1_score(doc_class_list_test, doc_test_predicted, average = 'micro')
    f1_macro = metrics.f1_score(doc_class_list_test, doc_test_predicted, average = 'macro')
    # average='macro' ignores class imbalance; the three classes used here have similar sample counts, so the unweighted mean of the per-class F1 scores is acceptable
    if (printOption):
        print('f1_micro:\t', f1_micro, '\tf1_macro:\t', f1_macro)
        print(metrics.f1_score(doc_class_list_test, doc_test_predicted, average = None))
    # only articles from three classes were selected
        print(metrics.classification_report(doc_class_list_test, doc_test_predicted, target_names = ['sport','economy','computer']))
        print(metrics.confusion_matrix(doc_class_list_test, doc_test_predicted))
    else:
        f1_macro = 0
        acc = 0
    return f1_macro, acc
Example #20
    trX1 = trX
    teX1 = teX

    plambda2 = [0.425]
    K = 3
    v_h1, v_o, c1, co, time22, accuracy22, F22 = second_stage_pretraining(
        trX, trY, teX, teY, trX1, teX1, K, w_h1, w_h2, w_o, b1, b2, bo,
        plambda2, a)

    f_subset1 = np.arange(19)  # respiration features
    f_subset2 = np.arange(19, 32, 1)  # wrist features
    f1 = 19
    z1 = 1
    f2 = 13
    z2 = 1
    trX1, teX1 = feature_selection(trX, teX, f_subset1, f1, z1)
    trX2, teX2 = feature_selection(trX, teX, f_subset2, f2, z2)

    # firm cascade:
    plambda = [0.05]
    t3, accuracy3[t], F3[t], nnz = tree_cascade_v1(trX, trY, teX, teY, trX1,
                                                   teX1, trX2, teX2, w_h1,
                                                   w_h2, w_o, b1, b2, bo, v_h1,
                                                   v_o, c1, co, plambda, a)
    time3[t] = t3 / 10000

    # soft cascade:
    beta = [0.001]
    t2, accuracy2[t], F2[t], nnz_soft = tree_soft_cascade_v1(
        trX, trY, teX, teY, trX1, teX1, trX2, teX2, beta, K, K1, K2)
    time2[t] = t2 / 10000
Example #21
def run_model(apt_fname,
              tr_te_split,
              res_file_pref,
              freq,
              is_feat_select,
              is_draw):
    ############
    # load data
    ############
    print('========================================' * 2)
    # print(apt_fname, res_file_pref)
    print(tr_te_split)

    df = load_data(apt_fname, freq)

    train = df[tr_te_split['trb']: tr_te_split['tre']]
    test = df[tr_te_split['teb']: tr_te_split['tee']]
    # print(test)
    # print('train/test:', train.shape, test.shape)

    feat = list(train.columns.values)
    feat.remove('energy')
    feat.remove('raw_energy')
    # print('raw features (%d):' % len(feat), feat)

    X_train = train[feat].as_matrix()
    y_train = train['energy'].as_matrix()
    X_test = test[feat].as_matrix()
    y_test = test['raw_energy'].as_matrix()

    # print('train/test (after converting to matrix):', X_train.shape, X_test.shape)

    ####################
    # feature selection
    ####################
    if is_feat_select:
        print('feature selection ...')
        selected = feature_selection(X_train, y_train, 12)
        print(len(selected))
        print('selected features (%d):' % sum(selected), [feat[i] for i in range(len(selected)) if selected[i]])
        X_train = X_train[:, selected]
        X_test = X_test[:, selected]
        print('train/test (after feature selection):', X_train.shape, X_test.shape)
        res_file_pref += '_feature'

    ########
    # train
    ########
    print('training ...')
    parameters = {'C': (0.001, 0.01, 0.1, 1),
                  'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
                  }

    clf = GridSearchCV(svm.SVR(),
                       param_grid=parameters,
                       cv=TimeSeriesSplit(n_splits=3),
                       scoring='neg_mean_squared_error')
    clf.fit(X_train, y_train)
    print(clf.best_params_)

    #######
    # test
    #######
    print('testing ...')

    y_pred = clf.predict(X_test)

    # y_pred = np.exp(np.cumsum(np.concatenate(([np.log(y_test[0])], y_pred))))
    y_pred = np.exp(y_pred) - 1

    #############
    # evaluation
    #############
    mse = mean_squared_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print('MSE:', mse)
    print('MAPE:', mape)
    print('save result to file ...')
    pickle.dump(
        {'y_test': y_test, 'y_pred': y_pred},
        open(res_file_pref + '_mse%.4f_mape%.4f.pkl' % (mse, mape), 'wb'))
    print('saved.')

    if is_draw:
        pyplot.plot(y_test)
        pyplot.plot(y_pred, color='red')
        pyplot.show()
Example #22
    with open(results_path, 'w') as f: 
        json.dump(scores, f)
        if verbose:
            print("Successfully saved scores.")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Fragile Families Train Script")
    parser.add_argument('model', help="model")
    parser.add_argument('outcome', help="outcome")
    parser.add_argument('-i', dest='imp_method', help="imputation method", default='KNN')
    parser.add_argument('-m', dest='fs_method', help="feature selection method", default='ElasticNet')
    parser.add_argument('-d', dest='data_dir', help='data directory', default='data')
    parser.add_argument('-s', dest='results_dir', help='results directory', default='results')
    args = parser.parse_args()

    features_path = 'features_{}_{}.csv'.format(args.outcome, args.imp_method)
    labels_path = 'labels_{}_{}.csv'.format(args.outcome, args.imp_method)

    print("Loading dataset...")
    X, Y = get_data(args.data_dir, features_path, labels_path)
    is_classf = Y.dtype == np.int8 
    print("Successfully loaded dataset.")
    if args.fs_method:
        print("Performing feature selection using {}...".format(args.fs_method))
        X = feature_selection(X, Y, args.outcome, args.fs_method, args.imp_method, args.data_dir, verbose=1)
    print("X dim: {}".format(X.shape))
    train(args.model, X, Y, is_classf, args.outcome, args.fs_method, 
        args.imp_method, args.data_dir, args.results_dir, verbose=1)    
    
Example #23
def create_dictionary(posfile, negfile, dicfile, use_stopwords, stopwordsfile,
                      use_chi2_select, local_fun, global_fun):
    stopwords = []
    if use_stopwords:
        stopwords = read_keywords_file(stopwordsfile)

    fwrite = open(dicfile, 'w')
    if fwrite is None:
        raise IOError('%s cannot be opened' % (dicfile))

    # read the words from both files
    words = []
    words.extend(read_words(posfile))
    words.extend(read_words(negfile))

    # count word frequencies
    cnts = collections.Counter(words).most_common()

    print('total vocab:%d' % len(cnts))

    # drop stopwords and low-frequency words
    dictionary = {}
    idx = 0
    reverse_dic = []
    for cnt in cnts:
        if cnt[1] >= min_freq and cnt[0] not in stopwords:
            dictionary[cnt[0]] = idx
            reverse_dic.append(cnt[0])
            idx += 1

    print('total vocab after stopword and min_freq filtering: %d' % idx)

    # chi-squared test selects a chi2_rate fraction of the words and computes global_fun
    if not use_chi2_select:  # skip if chi-squared selection is disabled
        return

    posdata, poslabel, posidf = read_data_with_label(posfile, 1, dictionary,
                                                     local_fun, global_fun)
    negdata, neglabel, negidf = read_data_with_label(negfile, -1, dictionary,
                                                     local_fun, global_fun)

    # number of documents
    D = len(posdata) + len(negdata)

    # compute idf
    idf = np.log(D / (posidf + negidf))

    datas = posdata
    labels = poslabel
    datas.extend(negdata)
    labels.extend(neglabel)

    global C
    global kernel
    global gamma

    dim_k, C, kernel, gamma, scores, pvals = feature_selection.feature_selection(
        datas, labels)

    # merge chi2 score, p-value, word, and idf
    chi2info = zip(scores, pvals, reverse_dic, idf)

    chi2info = sorted(chi2info, key=itemgetter(0), reverse=True)

    vocab_size = dim_k
    print('total vocab after chi2:%d' % vocab_size)
    for i in range(vocab_size):
        fwrite.write(
            '%lf\t%lf\t%s\t%lf\n' %
            (chi2info[i][0], chi2info[i][1], chi2info[i][2], chi2info[i][3]))
    fwrite.close()
Example #24
# fe_stats
x_train, x_test = fe_stats(x_train, x_test, genes_features, cells_features)
x_train.head()

# group the drug using kmeans
if runty == 'traineval':
    x_train, x_test = fe_cluster(x_train, x_test, genes_features, cells_features,
                                 n_cluster_g=cfg_fe.n_clusters_g, n_cluster_c=cfg_fe.n_clusters_c, seed=cfg_fe.seed, runty=runty, path=save_path)
elif runty == 'eval':
    x_train, x_test = fe_cluster(x_train, x_test, genes_features, cells_features,
                                 n_cluster_g=cfg_fe.n_clusters_g, n_cluster_c=cfg_fe.n_clusters_c, seed=cfg_fe.seed, runty=runty, path=load_path)


# select feature, VarianceThreshold
x_train, x_test = feature_selection(
    x_train, x_test, feature_select=cfg_fe.feature_select, variancethreshold_for_FS=cfg_fe.variancethreshold_for_FS)


# one-hot encoding
x_train = onehot_encoding(x_train)
x_test = onehot_encoding(x_test)

feature_cols = [c for c in x_train.columns if (str(c)[0:5] != 'kfold' and c not in [
    'sig_id', 'drug_id', 'cp_type', 'cp_time', 'cp_dose'])]
target_cols = [x for x in y_train.columns if x != 'sig_id']


# label smoothing
if cfg_fe.regularization_ls:
    y_train = ls_manual(y_train, ls_rate=cfg_fe.ls_rate)
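The `feature_selection` helper used here is, per its comment, a VarianceThreshold-based filter; a hedged sketch of what it might look like, assuming the g-/c- column-prefix convention of this dataset, is:

import pandas as pd
from sklearn.feature_selection import VarianceThreshold

# Hedged sketch (not the original implementation): drop low-variance gene/cell columns.
def feature_selection(x_train, x_test, feature_select=True, variancethreshold_for_FS=0.8):
    if not feature_select:
        return x_train, x_test
    numeric_cols = [c for c in x_train.columns if str(c).startswith(('g-', 'c-'))]
    mask = VarianceThreshold(variancethreshold_for_FS).fit(
        pd.concat([x_train[numeric_cols], x_test[numeric_cols]], axis=0)).get_support()
    dropped = [c for c, keep in zip(numeric_cols, mask) if not keep]
    return x_train.drop(columns=dropped), x_test.drop(columns=dropped)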
Example #25
from feature_engineering import feature_engineering
from feature_selection import feature_selection
from Models import linear_model,xgb_model
import argparse
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--train_dataset', help='path of the train dataset')
    parser.add_argument('--test_dataset', help='path of the test dataset')
    parser.add_argument('--model', help='model')
    args = parser.parse_args()

    train_dataset = args.train_dataset
    test_dataset = args.test_dataset
    model = args.model
    feature_engineering(train_dataset, test_dataset)
    feature_selection()

    if model == 'linear':
        linear_model()
    elif model == 'xgb':
        xgb_model()
    else:
        linear_model()
        xgb_model()

Example #26
def compute_regression_results(datasets, cities_dict, city, window, setup, baseline,
                               fs_method, fs_feature_num, features, regressor, weights=None):
    """ Compute regression results

    args:
    datasets -- dict containing pandas dataframes for each city and window
    cities_dict -- a dict whose keys are country codes (e.g. UK) and whose values are the lists of cities in that country
    city -- the string name of the city
    window -- the number of aggregated timesteps (hours) (valid values: 6, 12, 24)
    setup -- the regression setup (valid values: 'cross city' (i.e. all to one), 'within city' (i.e. same city))
    baseline -- string indicating whether this experiment is a baseline, by naming the prediction metric, or not (valid values: 'idw', 'mean', 'NULL')
    fs_method -- the feature selection method ('Conly': features with the highest correlation with PM2.5 in all cities (used in the paper),
                                               'Sonly': features with the lowest correlation variance with PM2.5 in all cities,
                                               'S&C': combination of the previous methods,
                                               'None': no feature selection)
    fs_feature_num -- number of best features to keep after feature selection, or 'None'
    features -- list of features for one-step regression (e.g. ['#aqs', 'bow_10k_unigrams_normalized']) or list of lists of features for two-step regression (e.g. [['bow_10k_unigrams_normalized'], ['nearby_ground_truth_pm25']])
    regressor -- an sklearn regressor for one-step regression, or a list of two sklearn regressors in the two-step regression setup
    weights -- the inverse distance weight matrix from all cities (used only in cross-city setups), used to weight each training sample, or None
    """
    
    print(city+' -> '+ str(window)+' in '+setup+' setup with features-> '+str(features) )
    for code in cities_dict:
        if city in cities_dict[code]:
            country_code = code
            country = cities_dict[code]
      
    weights_flag = False
    if weights is not None:
        weights_flag = True
    
    if setup == 'within city':
        dataset = datasets[city+'_'+str(window)]
        train,test = split_dataset_even_odd_months(dataset)
        if weights is not None:
            raise Exception('Need to use cross city setup when using weights')
    elif setup == 'cross city':
        train,test = create_all_vs_one_datasets(datasets,cities_dict,city,window,weights=weights) 
         
    #check if it is a 2 step regression
    if isinstance(features[0],list):
        two_step = True
        first_feature = features[0]#feature selection only for the first feature (currently implemented  to work only for one bag of words feature)
        feature_list = features[0] + features[1]
    else:
        two_step = False
        first_feature = features
        feature_list = features
        
    #check if the experiment is a baseline error computation
    if baseline != 'NULL':
        copy_test = test.copy()
        copy_test = test.dropna()
        if baseline =='idw':
            predictions = copy_test['idw_pm25']
        elif baseline == 'mean':
            copy_test['mean'] = test.pm25.mean()
            predictions = copy_test['mean']
        else:
            raise Exception('invalid baseline parameter')
        copy_test['pm25_cat'] = copy_test.pm25.apply(to_labels)
        return[country_code,city,window,setup,baseline,'NULL','NULL','NULL','NULL','NULL','NULL','NULL','NULL',
                                np.sqrt(sm.mean_squared_error(predictions, copy_test.pm25)),sm.mean_absolute_error(predictions, copy_test.pm25),
                                precision_recall_fscore_support(copy_test.pm25_cat,predictions.apply(to_labels),labels=['good','bad'])[0][1],
                                precision_recall_fscore_support(copy_test.pm25_cat,predictions.apply(to_labels),labels=['good','bad'])[1][1],
                                precision_recall_fscore_support(copy_test.pm25_cat,predictions.apply(to_labels),labels=['good','bad'])[2][1]]             

    
    types = ['BOW','IDW_PM25','Twitter']
    type_mask =[False,False,False]
    representation = 'None'
    for i in feature_list:
        if 'bow' in i:
            type_mask[0] = True
            representation = 'uni_tf' # only this is supported currently
        elif 'idw' in i:
            type_mask[1] = True
        else:
            type_mask[2] = True
    feature_types = '+'.join(list(np.array(types)[type_mask])) #join types if they exist
    feature_details = '+'.join(feature_list)
    if two_step:
        feature_details = feature_details+'_2step'
        regressor_name = [get_regressor_name(regressor[0]),get_regressor_name(regressor[1])]
    else:
        regressor_name = get_regressor_name(regressor)
          
    #feature selection          
    if fs_method != 'NULL':
        if fs_feature_num != 'NULL':
            if len(first_feature) > 1:
                raise Exception('You have to use only one bow feature for feature selection') #(currently implemented  to work only for one bag of words feature)
            _,mask = feature_selection(datasets,country,first_feature[0],'pm25',window,method=fs_method)
            mask = mask[:fs_feature_num]#get the fs_feature_num top features
    else:
        mask = None
        
    if two_step:
        #calculate the training predictions using KFold cross validation to train the second step regression model
        train_dataset,bow_predictions = create_cv_bow_model(regressor[0],
                           features[0],'pm25',train,mask=mask,keep=features[0]+features[1],weights=weights_flag,cv=3)
        #train and test a regressor with first step features
        _,test_predictions =first_step_regression(train,test,regressor[0],features[0],mask=mask
                                                ,keep =features[0]+features[1],weights=weights_flag) 
        
        #train second step regressor with second step features and bow prediction
        model = create_second_step_bow_model(regressor[1],train_dataset,features[1],bow_predictions,weights=weights_flag)
        #use above model to test second step features with test predictions from first step features
        rmse_res,mae_res,test_prediction,precision,recall,fscore = testing(model,features[1],'pm25',test,mask=None,additional_features= test_predictions,
                        keep=features[0]+features[1],classification=True)
        return[country_code,city,window,setup,baseline,weights_flag,fs_method,fs_feature_num,
               feature_types,feature_details,representation,regressor_name[0],regressor_name[1],rmse_res,mae_res,precision[1],recall[1],fscore[1]]

    else:
        #training
        model,train_prediction = training(regressor,features,'pm25',train,mask=mask,weights=weights_flag) # add mask for feature selection
        #testing
        rmse_res,mae_res,test_prediction,precision,recall,fscore = testing(model,features,'pm25',test,mask=mask,classification=True,verbose=False)
        return[country_code,city,window,setup,baseline,weights_flag,fs_method,fs_feature_num,
               feature_types,feature_details,representation,regressor_name,'NULL',rmse_res,mae_res,precision[1],recall[1],fscore[1]]
Example #27
    teY = Y[test_idxs]

    # parameter alpha of the gating function:
    a = 10
    # number of hidden units in 1LNN:
    K1 = 10
    K2 = 20
    # pre-training/initialization of the parameters of 1LNN:
    w_h1, w_h2, w_o, b1, b2, bo, t1, accuracy1[t], F1[t] = NN_pretraining(
        trX, trY, teX, teY, K1, K2)
    time1[t] = t1 / 10000

    f_subset1 = np.arange(37)  # all the features
    f = 37
    z = 0
    trX2, teX2 = feature_selection(trX, teX, f_subset1, f, z)

    # firm cascade:
    plambda1 = [0.43]
    K = 3
    v_h1, v_o, c1, co, t22, a22, f22 = second_stage_pretraining(
        trX, trY, teX, teY, trX2, teX2, K, w_h1, w_h2, w_o, b1, b2, bo,
        plambda1, a)

    f_subset2 = [25, 29]  # median of (roll,pitch)
    f = 5
    z = 1
    trX3, teX3 = feature_selection(trX, teX, f_subset2, f, z)

    plambda2 = [0.26]
    t3, accuracy3[t], F3[t], nnz3_firt, nnz3_second = cascade_three_stage(
Example #28
def main():

    cfg_fe = Config_FeatureEngineer()
    seed_everything(seed_value=cfg_fe.seed)

    data_dir = '/kaggle/input/lish-moa/'
    save_path = './'
    load_path = '/kaggle/input/moatabnetmultimodekfold/'
    runty = 'eval'

    train = pd.read_csv(os.path.join(data_dir, 'train_features.csv'))
    targets_scored = pd.read_csv(
        os.path.join(data_dir, 'train_targets_scored.csv'))
    test = pd.read_csv(os.path.join(data_dir, 'test_features.csv'))
    train_drug = pd.read_csv(os.path.join(data_dir, 'train_drug.csv'))
    submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))

    x_train = train.copy()
    x_test = test.copy()
    y_train = targets_scored.copy()

    genes_features = [column for column in x_train.columns if 'g-' in column]
    cells_features = [column for column in x_train.columns if 'c-' in column]

    # scale the data, like RankGauss
    x_train, x_test = scaling(x_train,
                              x_test,
                              scale=cfg_fe.scale,
                              n_quantiles=cfg_fe.scale_n_quantiles,
                              seed=cfg_fe.seed)

    # decompose data, like PCA
    if runty == 'traineval':
        x_train, x_test = decompo_process(x_train,
                                          x_test,
                                          decompo=cfg_fe.decompo,
                                          genes_variance=cfg_fe.genes_variance,
                                          cells_variance=cfg_fe.cells_variance,
                                          seed=cfg_fe.seed,
                                          pca_drop_orig=cfg_fe.pca_drop_orig,
                                          runty=runty,
                                          path=save_path)
    elif runty == 'eval':
        x_train, x_test = decompo_process(x_train,
                                          x_test,
                                          decompo=cfg_fe.decompo,
                                          genes_variance=cfg_fe.genes_variance,
                                          cells_variance=cfg_fe.cells_variance,
                                          seed=cfg_fe.seed,
                                          pca_drop_orig=cfg_fe.pca_drop_orig,
                                          runty=runty,
                                          path=load_path)

    # select feature, VarianceThreshold
    x_train, x_test = feature_selection(
        x_train,
        x_test,
        feature_select=cfg_fe.feature_select,
        variancethreshold_for_FS=cfg_fe.variancethreshold_for_FS)

    # fe_stats
    x_train, x_test = fe_stats(x_train, x_test, genes_features, cells_features)

    # group the drug using kmeans
    if runty == 'traineval':
        x_train, x_test = fe_cluster(x_train,
                                     x_test,
                                     genes_features,
                                     cells_features,
                                     n_cluster_g=cfg_fe.n_clusters_g,
                                     n_cluster_c=cfg_fe.n_clusters_c,
                                     seed=cfg_fe.seed,
                                     runty=runty,
                                     path=save_path)
    elif runty == 'eval':
        x_train, x_test = fe_cluster(x_train,
                                     x_test,
                                     genes_features,
                                     cells_features,
                                     n_cluster_g=cfg_fe.n_clusters_g,
                                     n_cluster_c=cfg_fe.n_clusters_c,
                                     seed=cfg_fe.seed,
                                     runty=runty,
                                     path=load_path)

    # one-hot encoding
    x_train = onehot_encoding(x_train)
    x_test = onehot_encoding(x_test)

    feature_cols = [
        c for c in x_train.columns
        if (str(c)[0:5] != 'kfold' and c not in
            ['sig_id', 'drug_id', 'cp_type', 'cp_time', 'cp_dose'])
    ]
    target_cols = [x for x in y_train.columns if x != 'sig_id']

    # label smoothing
    if cfg_fe.regularization_ls:
        y_train = ls_manual(y_train, ls_rate=cfg_fe.ls_rate)

    # merge drug_id and labels
    x_train = x_train.merge(y_train, on='sig_id')
    x_train = x_train.merge(train_drug, on='sig_id')

    # remove sig_id
    # x_train, x_test, y_train = remove_ctl(x_train, x_test, y_train)

    # make CVs
    target_cols = [x for x in targets_scored.columns if x != 'sig_id']
    x_train = make_cv_folds(x_train, cfg_fe.seeds, cfg_fe.nfolds,
                            cfg_fe.drug_thresh, target_cols)

    begin_time = datetime.datetime.now()

    if (runty == 'traineval'):
        test_preds_all = train_tabnet(x_train, y_train, x_test, submission,
                                      feature_cols, target_cols, cfg_fe.seeds,
                                      cfg_fe.nfolds, save_path)
        y_train = targets_scored[
            train['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)
        test_pred_final = pred_tabnet(x_train,
                                      y_train,
                                      x_test,
                                      submission,
                                      feature_cols,
                                      target_cols,
                                      cfg_fe.seeds,
                                      cfg_fe.nfolds,
                                      load_path='./',
                                      stacking=False)
    elif (runty == 'eval'):
        y_train = targets_scored[
            train['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)
        test_pred_final = pred_tabnet(x_train,
                                      y_train,
                                      x_test,
                                      submission,
                                      feature_cols,
                                      target_cols,
                                      cfg_fe.seeds,
                                      cfg_fe.nfolds,
                                      load_path,
                                      stacking=False)

    time_diff = datetime.datetime.now() - begin_time
    print(f'Total time is {time_diff}')

    # make submission
    all_feat = [col for col in submission.columns if col not in ["sig_id"]]
    # To obtain the same length of test_preds_all and submission
    # sig_id = test[test["cp_type"] != "ctl_vehicle"].sig_id.reset_index(drop=True)
    sig_id = test.sig_id
    tmp = pd.DataFrame(test_pred_final, columns=all_feat)
    tmp["sig_id"] = sig_id

    submission = pd.merge(test[["sig_id"]], tmp, on="sig_id", how="left")
    submission.fillna(0, inplace=True)
    submission[test["cp_type"] == "ctl_vehicle"] = 0.

    submission.to_csv("submission_tabbet.csv", index=None)