예제 #1
0
def load_data():
    """Load the origin CSV and return it after id conversion and feature merging.

    The pipeline is: ``read_cut`` on ``config.origin_csv``, then ``data_2id``,
    then ``add_hum_feats`` with ``config.train_featdires``.

    Returns:
        Whatever ``add_hum_feats`` returns for the processed data.
    """
    source = config.origin_csv
    print('load data')
    tokenized = read_cut(source)  # word segmentation
    indexed = data_2id(tokenized)  # convert to ids
    # Generate the extra features and attach them to the data.
    with_feats = add_hum_feats(indexed, config.train_featdires)
    return with_feats
예제 #2
0
def make_w2v(path, cut_char_level):
    """Train or load a Word2Vec model, then build a vocab and embedding matrix.

    If nothing exists at ``path`` a model is trained on the question and
    answer columns of ``'../' + config.origin_csv`` and saved to ``path``;
    otherwise the saved model is loaded.

    Args:
        path: Location of the saved Word2Vec model.
        cut_char_level: When truthy, train on the ``q_cut`` / ``a_cut``
            columns; otherwise on ``q_cut_word`` / ``a_cut_word``.

    Returns:
        A ``(vocab, embed_weights)`` tuple. ``vocab`` maps each word to its
        model index plus one, with ``'<-UNKNOW->'`` appended as the last id.
        ``embed_weights`` has two more rows than the model's weight matrix:
        row 0 is the padding vector (all zeros), rows ``1..N`` are the model
        vectors, and row ``N + 1`` is a mean-centred random unknown vector.
    """
    if os.path.exists(path):
        model = Word2Vec.load(path)
    else:
        # Original author's note (translated): "I removed CutWord on my own."
        # The earlier variant read the CSV directly with pandas instead:
        # data = pd.read_csv('../'+config.origin_csv)
        corpus_df = read_cut('../' + config.origin_csv)
        if cut_char_level:
            question_col, answer_col = 'q_cut', 'a_cut'
        else:
            question_col, answer_col = 'q_cut_word', 'a_cut_word'
        sentences = list(corpus_df[question_col]) + list(corpus_df[answer_col])
        model = Word2Vec(
            sentences,
            size=config.w2v_vec_dim,
            window=5,
            min_count=5,
        )
        model.save(path)

    weights = model.wv.syn0
    n_words = weights.shape[0]
    dim = weights.shape[1]

    # Shift every model index by one so that id 0 stays free for padding.
    vocab = {word: entry.index + 1 for word, entry in model.wv.vocab.items()}
    vocab['<-UNKNOW->'] = len(vocab) + 1

    embed_weights = np.zeros(shape=(n_words + 2, dim))
    embed_weights[1:n_words + 1] = weights
    # Both draws are kept (in this order) so the global RNG state advances
    # exactly as before; the padding draw is multiplied by 0, i.e. all zeros.
    unk_vec = np.random.random(size=dim) * 0.5
    pading_vec = np.random.random(size=dim) * 0
    embed_weights[n_words + 1] = unk_vec - unk_vec.mean()
    embed_weights[0] = pading_vec

    return vocab, embed_weights
예제 #3
0
파일: w2v.py 프로젝트: BarryZM/ChatBots
def save_my_w2v(path):
    """Build an embedding matrix from pre-trained vectors and save it.

    Reads the data at ``path`` with ``read_cut``, extracts its vocabulary
    with ``read_words``, loads vectors from ``data/pre_w2v/sgns.zhihu.word``
    and passes both to ``load_pre_train_embeddings``. The vocabulary is saved
    to ``config.word_embed_vocab`` and the resulting matrix to
    ``config.word_embed_weight`` via ``np.save``.

    Args:
        path: Input passed to ``read_cut``.
    """
    corpus = read_cut(path)
    vocab = read_words(corpus)
    # Per the original comment: the second argument is "top n" vectors to
    # read, and 0 means read all of them.
    vectors, _iw, _wi, _dim = read_vectors('data/pre_w2v/sgns.zhihu.word', 0)

    embedding_matrix = load_pre_train_embeddings(vocab, vectors)
    np.save(config.word_embed_vocab, vocab)
    np.save(config.word_embed_weight, embedding_matrix)
예제 #4
0
def load_data():
    """Load the origin CSV and return training inputs and labels.

    The pipeline is: ``read_cut`` on ``config.origin_csv``, ``data_2id``,
    ``add_hum_feats`` with ``config.train_feats``, and finally
    ``get_X_Y_from_df`` with ``config.data_augment``.

    Returns:
        The ``(x_train, y_train)`` pair produced by ``get_X_Y_from_df``.
    """
    source = config.origin_csv
    print('load data')
    tokenized = read_cut(source)  # word segmentation
    indexed = data_2id(tokenized)  # convert to ids
    # Generate the extra features and attach them to the data.
    featured = add_hum_feats(indexed, config.train_feats)

    x_train, y_train = get_X_Y_from_df(featured, config.data_augment)
    # Debug output: length of the third element of the inputs.
    third_input = x_train[2]
    print(len(third_input))

    return x_train, y_train
예제 #5
0
파일: main.py 프로젝트: zle1992/inspur
def submit_inteface(in_path, out_path, model_name, cv=False):
    """Predict labels for the data at ``in_path`` and write them to ``out_path``.

    Args:
        in_path: Input passed to ``read_cut`` (together with
            ``config.test_data_cut_hdf``).
        out_path: Destination of the tab-separated, header-less label file.
        model_name: Name handed to ``get_model``.
        cv: When truthy, predict with ``make_test_cv_data``; otherwise with
            ``single_predict``.
    """
    frame = read_cut(in_path, config.test_data_cut_hdf)
    frame = data2id(frame)
    # Missing labels are filled with 0 before features are extracted.
    frame.label = frame.label.fillna(0)
    features, targets = get_X_Y_from_df(frame)
    batch = [features, targets]
    predictor = get_model(model_name)

    print('load model and predict')
    if cv:
        raw_pred = predictor.make_test_cv_data(batch)
    else:
        raw_pred = predictor.single_predict(batch)

    scores = np.squeeze(raw_pred)
    # argmax is zero-based; the written labels are shifted to start at 1.
    frame['label'] = np.argmax(scores, axis=1) + 1
    label_only = frame[['label']]
    label_only.to_csv(out_path, index=False, header=None, sep='\t')
예제 #6
0
파일: main.py 프로젝트: zle1992/inspur
def train(cv, model_name):
    """Train the named model, either with cross-validation or a single split.

    Args:
        cv: When truthy, run ``make_train_cv_data`` with 5 folds on the full
            data; otherwise split with ``train_test`` and call
            ``single_train`` on the train/dev parts.
        model_name: Name handed to ``get_model``.
    """
    frame = read_cut(config.origin_csv, config.train_data_cut_hdf)
    frame = data2id(frame)
    # frac=1 keeps every row but shuffles them; the seed is fixed at 18.
    frame = frame.sample(frac=1, random_state=18)

    model = get_model(model_name)

    if not cv:
        train_df, dev_df = train_test(frame)
        x_train, y_train = get_X_Y_from_df(train_df)
        x_dev, y_dev = get_X_Y_from_df(dev_df)
        model.single_train([x_train, y_train, x_dev, y_dev])
        return

    kfolds = 5
    x_train, y_train = get_X_Y_from_df(frame)
    model.make_train_cv_data([x_train, y_train], kfolds)
예제 #7
0
    data = magic1(data)
    feats0_path = 'data/cache/feats/feats0_train.csv'
    feats0 = [
        'q1_freq', 'q2_freq', 'freq_mean', 'freq_cross', 'q1_freq_sq',
        'q2_freq_sq'
    ]
    data[['id'] + feats0].to_csv(feats0_path, index=False)


def merge_feats():
    """Join the two cached training feature tables on their ``id`` column.

    Reads ``data/cache/feats/feats0_train.csv`` and
    ``data/cache/feats/feats1_train.csv`` and merges them on ``id`` with
    pandas' default join, so only ids present in both files are kept.

    Returns:
        The merged DataFrame.
    """
    feats0 = pd.read_csv('data/cache/feats/feats0_train.csv')
    feats1 = pd.read_csv('data/cache/feats/feats1_train.csv')
    return feats0.merge(feats1, on='id')


def merge_test(data):
    """Run the feature pipeline on ``data`` and keep the configured columns.

    Applies ``feats1_gen``, ``extract_features`` and ``magic1`` in that
    order, each step receiving the previous step's result.

    Args:
        data: Input passed to the first pipeline step.

    Returns:
        The pipeline result indexed by ``config.feats``.
    """
    for step in (feats1_gen, extract_features, magic1):
        data = step(data)
    return data[config.feats]


# Script entry point: read the origin CSV and write both feature caches.
if __name__ == '__main__':
    path = config.origin_csv
    data = read_cut(path)  # word segmentation
    # NOTE: the id-conversion step is commented out in this script:
    # data = data_2id(data)
    # Both feature generators receive the same data object from read_cut.
    save_feats0(data)
    save_feats1(data)