Example 1
def concat_train_test(train: str, test: str):
    """
    parameter: train, test - path of data
    return: len(train),train_test_data_path, train_test_data
    """
    train_data, _ = load_csv(train)
    test_data, _ = load_csv(test)
    train_test_data = pd.concat([train_data, test_data])
    train_test_data_path = data_path + 'train_test_data.csv'
    train_test_data.to_csv(train_test_data_path, index=False, encoding='utf8')
    return len(train_data), train_test_data_path, train_test_data
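All of these examples call a shared `load_csv` helper whose definition is not shown. Judging from call sites such as `data, dic = load_csv(path)` and `dic['Age'][:TR]`, it returns the DataFrame plus a dict of label arrays. A minimal sketch under that assumption (column names are taken from the examples; the real helper may differ):

import pandas as pd

def load_csv(path):
    # Hypothetical reconstruction: return the DataFrame plus a dict of label
    # arrays, matching calls like `train_data, dic = load_csv(path)` above.
    data = pd.read_csv(path, encoding='utf8')
    dic = {lb: data[lb].values for lb in ('Education', 'Age', 'Gender') if lb in data}
    return data, dic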
Example 2
def train(train_data_path, test_data_path):
    """
    return: len(train_data), train_test_data_path
    """
    start = datetime.now()
    print(start, "Start training dm and dbow models!")
    add_number_to_query(train_data_path)
    train_data, _ = load_csv(train_data_path)
    X_train = tokenize_data(train_data)
    tfidf_matrix_dump(X_train, 'tfidf_train.pkl')

    new_train_data_path = fill_train_data(train_data, X_train)
    length, train_test_data_path, train_test_data = concat_train_test(
        new_train_data_path, test_data_path)
    add_number_to_query(train_test_data_path)
    X_train_test = tokenize_data(train_test_data)
    tfidf_matrix_dump(X_train_test, 'tfidf_train_test.pkl')

    train_doc2vec(train_test_data_path, length, 'dbow')
    train_doc2vec(train_test_data_path, length, 'dm')
    end = datetime.now()
    print(
        end, "Training dm and dbow models done! Duration: {}s".format(
            (end - start).seconds))
    return length, train_test_data_path
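`tokenize_data` and `tfidf_matrix_dump` are external helpers as well. The commented-out TfidfVectorizer lines in Example 3, and the pickle.load that replaces them, suggest a shape like the following (a hedged sketch; the original also wires in a custom `Tokenizer`, omitted here):

import codecs
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

def tokenize_data(data):
    # Assumed: fit a sparse TF-IDF matrix over the 'Query' column, mirroring
    # the commented-out TfidfVectorizer call in Example 3.
    tfv = TfidfVectorizer(min_df=3, max_df=0.95, sublinear_tf=True)
    return tfv.fit_transform(data['Query'])

def tfidf_matrix_dump(X, name):
    # Assumed: pickle the matrix under data_path so Example 3 can reload it.
    with codecs.open(data_path + name, 'wb') as f:
        pickle.dump(X, f)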
Example 3
def train_tfidf_stack(csv_path, length):
    """
    csv_path: train_test_data_path
    length: the length of train_data
    """
    print(datetime.now(), 'training tfidf stack start!')
    train_test_data, dic = load_csv(csv_path)
    # tfv = TfidfVectorizer(tokenizer=Tokenizer(len(train_test_data)), min_df=3, max_df=0.95, sublinear_tf=True)
    # X = tfv.fit_transform(train_test_data['Query'])
    with codecs.open(data_path + 'tfidf_train_test.pkl', 'rb') as f:
        X = pickle.load(f)  # load tfidf matrix from tfidf_train_test.pkl
    df_stack = pd.DataFrame(index=range(len(train_test_data)))
    # -----------------------stacking for education/age/gender------------------
    # ['Education', 'Age','Gender']
    for i in ['Education', 'Age', 'Gender']:
        print(i)
        TR = length
        # print(train_test_data.iloc[:TR][i].value_counts())
        # print(train_test_data.iloc[TR:][i].value_counts())
        num_class = len(pd.value_counts(dic[i]))
        n = 5

        X_tr = X[:TR]
        y_tr = dic[i][:TR]
        X_te = X[TR:]
        y_te = dic[i][TR:]

        stack = np.zeros((X_tr.shape[0], num_class))
        stack_te = np.zeros((X_te.shape[0], num_class))
        kf = KFold(n_splits=n)
        for j, (tr, va) in enumerate(kf.split(X_tr, y_tr)):
            print('%s stack:%d/%d' % (str(datetime.now()), j + 1, n))
            # print(train_test_data.iloc[tr][i].value_counts())
            # print(train_test_data.iloc[va][i].value_counts())
            clf = LogisticRegression(C=3,
                                     solver='liblinear',
                                     dual=True,
                                     max_iter=10000)
            clf.fit(X_tr[tr], y_tr[tr])
            y_pred_va = clf.predict_proba(X_tr[va])
            y_pred_te = clf.predict_proba(X_te)
            print('va acc:', myAcc(y_tr[va], y_pred_va))
            print('te acc:', myAcc(y_te, y_pred_te))
            stack[va] += y_pred_va  # out-of-fold predictions for the train rows
            stack_te += y_pred_te  # accumulate test predictions across folds
        stack_te /= n  # average the n fold models over the test split
        stack_all = np.vstack([stack, stack_te])  # train rows first, then test rows
        for k in range(stack_all.shape[1]):
            df_stack['tfidf_{}_{}'.format(i, k)] = stack_all[:, k]
    df_stack.to_csv(data_path + 'tfidf_stack.csv', index=False, encoding='utf8')
    print(datetime.now(), 'training tfidf stack done!')
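`myAcc` is never defined in these snippets. Since it always receives a probability matrix from `predict_proba` (or the Keras `predict` in Example 7), a plausible reading is accuracy after a per-row argmax. This sketch assumes labels are encoded 0..num_class-1 in the classifier's column order:

import numpy as np

def myAcc(y_true, y_pred):
    # Assumed: y_pred is an (n_samples, n_classes) probability matrix.
    return np.mean(np.asarray(y_true) == np.argmax(y_pred, axis=1))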
Example 4
def train(train_test_data_path, length):
    """
    length: length of train data
    """
    df_lr, df_dm, df_dbow = load_model_data()
    data, dic = load_csv(train_test_data_path)
    # seed = 10

    TR = length
    df_sub = pd.DataFrame()
    df_sub['Id'] = data.iloc[TR:]['Id']
    df = pd.concat([df_lr, df_dbow, df_dm], axis=1)
    print("----" * 5 + "Training xgb-ens start" + "----" * 5)
    print(df.columns)
    for lb in ['Education', 'Age', 'Gender']:
        # for lb in ['Gender']:
        print("-----" * 5 + lb + "-----" * 5)
        num_class = len(pd.value_counts(dic[lb][:length]))
        X = df.iloc[:TR]
        y = dic[lb][:TR]
        X_te = df.iloc[TR:]
        y_te = dic[lb][TR:]
        print('{} train value_counts'.format(lb))
        print(pd.value_counts(dic[lb][:length]))
        print('{} test value_counts'.format(lb))
        print(pd.value_counts(dic[lb][length:]))
        esr = 100
        evals = 1
        n_trees = 10

        # eval(lb) resolves a module-level config object named 'Education',
        # 'Age' or 'Gender' that carries the xgboost params for that label.
        lb_2_model = eval(lb)
        params = lb_2_model.params
        params['num_class'] = num_class
        dtrain = xgb.DMatrix(X, y)
        dvalid = xgb.DMatrix(X_te, y_te)
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        bst = xgb.train(params,
                        dtrain,
                        n_trees,
                        evals=watchlist,
                        feval=xgb_acc_score,
                        maximize=True,
                        early_stopping_rounds=esr,
                        verbose_eval=evals)
        df_sub[lb] = np.argmax(bst.predict(dvalid), axis=1)
    df_sub = df_sub[['Age', 'Education', 'Gender', 'Id']]
    results_path = data_path + 'tfidf_dm_dbow_.csv'
    df_sub.to_csv(results_path, index=False, encoding='utf8')
    ensAcc(results_path, test_set_path)
    print("----" * 5 + "Training xgb-ens finished" + "----" * 5)
Example 5
def train_doc2vec(csv_path, length, model_type='dbow'):
    """
    :param csv_path: path of train and test data with .csv format
    :param model_type: 'dbow' or 'dm'
    :param length: the length of train data
    :return: dbow_d2v.model in ./data/
    """
    # Both variants share everything except dm, window and the epoch count.
    common = dict(vector_size=300,
                  negative=5,
                  hs=0,
                  min_count=3,
                  sample=1e-5,
                  workers=8,
                  alpha=0.025,
                  min_alpha=0.025)
    if model_type == 'dm':
        d2v = Doc2Vec(dm=1, window=10, **common)
        epoch = 3
    else:
        d2v = Doc2Vec(dm=0, window=30, **common)
        epoch = 2
    doc_list = DocList(csv_path[:-4] + '_num.txt')
    d2v.build_vocab(doc_list)
    _, dic = load_csv(csv_path)
    print(datetime.now(), model_type + ' model training!')
    for i in range(epoch):
        print(datetime.now(), 'pass: {}/{}'.format(i + 1, epoch))
        doc_list = DocList(csv_path[:-4] + '_num.txt')
        d2v.train(doc_list, total_examples=d2v.corpus_count, epochs=d2v.epochs)
        X_d2v = np.array([d2v.docvecs[i] for i in range(length)])  # d2v.dv in gensim>=4
        for j in ["Education", 'Age', 'Gender']:
            clf = LogisticRegression(C=3,
                                     solver='saga',
                                     dual=False,
                                     max_iter=10000)
            scores = cross_val_score(clf, X_d2v, dic[j][:length], cv=5)
            print(model_type, j, scores, np.mean(scores))
    d2v.save(data_path + model_type + '_d2v.model')
    print(datetime.now(), model_type + ' model save done!')
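Once saved, the model is reloaded exactly as Example 7 does with `Doc2Vec.load`. For unseen text, gensim's `infer_vector` is the standard route; a short usage sketch (assuming whitespace-tokenized input, as produced by `add_number_to_query`):

from gensim.models import Doc2Vec

model = Doc2Vec.load(data_path + 'dbow_d2v.model')
vec = model.infer_vector(['some', 'segmented', 'query'])  # 300-d vector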
Example 6
def add_number_to_query(csv_path):
    """
    :param csv_path: train_data_path
    :return: the total number of train_data.
    """
    print("----" * 5 + "Add number to query: Start" + "----" * 5)
    train_data, _ = load_csv(csv_path)
    with codecs.open(csv_path[:-4] + '_num.txt', 'w', encoding='utf8') as f:
        for i, queries in enumerate(train_data['Query']):
            words = []
            for query in queries.split('\t'):
                words.extend(list(jieba.cut(query)))
            # One document per line, prefixed '_*<row index>' so that DocList
            # can recover the doc tag when iterating the file line by line.
            f.write('_*{} {}\n'.format(i, ' '.join(words)))
    print("----" * 5 + "Add number to query: Done" + "----" * 5)
    return len(train_data)
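Each line of the resulting `_num.txt` carries a positional tag followed by the segmented words, which is exactly what `DocList` (Example 8) parses back out. Illustrative content:

_*0 word1 word2 word3
_*1 word4 word5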
Example 7
def train_dbow_dm_nn(feat: str, length: int):
    """
    :param feat: str ['dbow_d2v'.'dm_d2v']
    :param length: length of train data
    :return: none
    """
    print(datetime.now(), 'training ' + feat + ' stack start!')
    train_data, dic = load_csv(csv_path)  # csv_path is expected at module level
    model = Doc2Vec.load(data_path + feat + '.model')
    doc_vec = np.array([model.docvecs[i] for i in range(len(train_data))])
    df_stack = pd.DataFrame(index=range(len(train_data)))
    TR = length
    n = 5
    X_tr = doc_vec[:TR]
    X_te = doc_vec[TR:]
    for _, lb in enumerate(['Education', 'Age', 'Gender']):
        num_class = len(pd.value_counts(dic[lb]))
        y_tr = dic[lb][:TR]
        y_te = dic[lb][TR:]

        stack = np.zeros((X_tr.shape[0], num_class))
        stack_te = np.zeros((X_te.shape[0], num_class))
        kf = KFold(n_splits=n)
        for k, (tr, va) in enumerate(kf.split(X_tr, y_tr)):
            print('{} stack:{}/{} {}'.format(datetime.now(), k + 1, n, lb))
            nb_classes = num_class
            X_train = X_tr[tr]
            y_train = y_tr[tr].astype(int)  # np.int was removed in NumPy 1.24
            X_test = X_te
            y_test = y_te.astype(int)

            X_train = X_train.astype('float32')
            X_test = X_test.astype('float32')
            Y_train = np_utils.to_categorical(y_train, nb_classes)
            Y_test = np_utils.to_categorical(y_test, nb_classes)

            model = Sequential()
            model.add(Dense(300, input_shape=(X_train.shape[1], )))
            model.add(Dropout(0.1))
            model.add(Activation('tanh'))
            model.add(Dense(nb_classes))
            model.add(Activation('softmax'))

            model.compile(loss='categorical_crossentropy',
                          optimizer='adadelta',
                          metrics=['accuracy'])

            model.fit(X_train,
                      Y_train,
                      shuffle=True,
                      batch_size=128,
                      epochs=35,
                      verbose=2,
                      validation_data=(X_test, Y_test))
            y_pred_va = model.predict(X_tr[va])
            y_pred_te = model.predict(X_te)
            print('va acc:', myAcc(y_tr[va], y_pred_va))
            print('te acc:', myAcc(y_te, y_pred_te))
            stack[va] += y_pred_va
            stack_te += y_pred_te
        stack_te /= n
        stack_all = np.vstack([stack, stack_te])
        for c in range(stack_all.shape[1]):
            df_stack['{}_{}_{}'.format(feat, lb, c)] = stack_all[:, c]
    df_stack.to_csv(data_path + feat + '_stack.csv',
                    encoding='utf8',
                    index=False)
    print(datetime.now(), 'training ' + feat + ' stack done!')
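Example 4's `load_model_data` presumably just reloads the three stack CSVs written by `train_tfidf_stack` and `train_dbow_dm_nn`; a sketch under that assumption:

import pandas as pd

def load_model_data():
    # Assumed: reload the stacked meta-features so Example 4 can concat them.
    df_lr = pd.read_csv(data_path + 'tfidf_stack.csv', encoding='utf8')
    df_dm = pd.read_csv(data_path + 'dm_d2v_stack.csv', encoding='utf8')
    df_dbow = pd.read_csv(data_path + 'dbow_d2v_stack.csv', encoding='utf8')
    return df_lr, df_dm, df_dbow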
Example 8
from collections import namedtuple  # needed by the reconstructed class below

class DocList(object):
    # Reconstructed enclosing class (an assumption): train_doc2vec above
    # builds DocList(path) and iterates it as a stream of tagged documents.
    SentimentDocument = namedtuple('SentimentDocument', 'words tags')

    def __init__(self, f):
        self.f = f

    def __iter__(self):
        for line in codecs.open(self.f, encoding='utf8'):
            words = line.split()
            yield self.SentimentDocument(words[1:], [int(words[0][2:])])


if __name__ == "__main__":
    train_data_path = './data/train_data.csv'
    test_data_path = './data/test_data.csv'
    start = datetime.now()
    print("Start training dm and dbow model!")
    add_number_to_query(train_data_path)
    train_data, _ = load_csv(train_data_path)
    X_train = tokenize_data(train_data)
    tfidf_matrix_dump(X_train, 'tfidf_train.pkl')

    new_train_data_path = fill_train_data(train_data, X_train)
    length, train_test_data_path, train_test_data = concat_train_test(
        new_train_data_path, test_data_path)
    add_number_to_query(train_test_data_path)
    X_train_test = tokenize_data(train_test_data)
    tfidf_matrix_dump(X_train_test, 'tfidf_train_test.pkl')

    train_doc2vec(train_test_data_path, length, 'dbow')
    train_doc2vec(train_test_data_path, length, 'dm')
    end = datetime.now()
    print("Train dm and dbow model down! Duration time: {}s ".format(
        (end - start).seconds))