Code example #1
    dirs = [
        'nlcow14ax01', 'nlcow14ax02', 'nlcow14ax03', 'nlcow14ax04',
        'nlcow14ax05', 'nlcow14ax06', 'nlcow14ax07'
    ]
    vvb = '/vol/tensusers/fkarsdorp/vvb.tokenized.txt'

    def __iter__(self):
        for directory in CowReader.dirs:
            with codecs.open(os.path.join(CowReader.root, directory,
                                          directory + ".xml"),
                             encoding='utf-8') as infile:
                sentence = []
                for line in infile:
                    if line.startswith('<s'):
                        continue
                    elif line.startswith('</s>'):
                        yield sentence
                        sentence = []
                    else:
                        word, pos, lemma = line.strip().split('\t')
                        if pos not in ('$.', 'punc'):
                            sentence.append(word.lower())
        with codecs.open(CowReader.vvb, encoding='utf-8') as vvb:
            for sentence in vvb:
                yield list(tokenize(sentence, lowercase=True))


sentences = CowReader()
model = Word2Vec(sentences, size=300, window=10, min_count=10, workers=20)
model.save("/vol/tensusers/fkarsdorp/cow-vvb.w2v")
Code example #2
KoreanTokenizedTerms = []

for KoreanTokenizedDocument in KoreanTokenizedDocuments:
    KoreanTokenizedTerms.append([
        term[0] for term in KoreanTokenizedDocument
        if (term[1] in ('Noun', 'Adjective', 'Verb'))
    ])

#print(KoreanTokenizedSentences[:5])

print(KoreanTokenizedTerms)

model = Word2Vec(sentences=KoreanTokenizedTerms,
                 size=64,
                 sg=1,
                 window=10,
                 min_count=1,
                 seed=42,
                 workers=8)

model.save('KoreanWord2Vec.w2v')

print(u"==================================")
print(u"삼성 Similarity Words:")
print(u"==================================")

for word in model.most_similar(positive=[u'삼성'], negative=[], topn=30):
    print("==> " + str(word))

print("\n")
Code example #3
    labelized = []
    for i, v in tqdm(enumerate(tweets)):
        label = '%s_%s' % (label_type, i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized


x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')

x_train_zemberek = labelizeTweets(x_train_zemberek, 'TRAIN')
x_test_zemberek = labelizeTweets(x_test_zemberek, 'TEST')

n_dims = [50, 75, 100, 125, 150, 175, 200, 225, 250, 275, 300]
n_dim = 75
tweet_w2v = Word2Vec(size=n_dim, min_count=3, hs=1, window=7, iter=75, sg=0)
tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])
tweet_w2v.train([x.words for x in tqdm(x_train)],
                total_examples=tweet_w2v.corpus_count,
                epochs=tweet_w2v.iter)

tweet_w2v_zemberek = Word2Vec(size=n_dim,
                              min_count=3,
                              hs=1,
                              window=7,
                              iter=75,
                              sg=0)
tweet_w2v_zemberek.build_vocab([x.words for x in tqdm(x_train_zemberek)])
tweet_w2v_zemberek.train([x.words for x in tqdm(x_train_zemberek)],
                         total_examples=tweet_w2v_zemberek.corpus_count,
                         epochs=tweet_w2v_zemberek.iter)
Code example #4
rg.set_model(model=lfr_params)
g = rg.lfr_model()

graph_path = "./outputs/lfr_synthetic_n1000.gml"
nx.write_gml(g, graph_path)

# Find the embedding of the graph with DeepWalk
temp_adjlist_file = "./temp/graph.adjlist"
embedding_file = "./outputs/output.embedding"
nx.write_edgelist(g, temp_adjlist_file)

dwg = dw.load_edgelist(temp_adjlist_file, undirected=True)
walks = dw.build_deepwalk_corpus(dwg,
                                 num_paths=dw_params['n'],
                                 path_length=dw_params['l'],
                                 alpha=0)
model = Word2Vec(walks,
                 size=dw_params['d'],
                 window=dw_params['w'],
                 min_count=0,
                 sg=1,
                 hs=1,
                 workers=dw_params['workers'])
model.wv.save_word2vec_format(embedding_file)

comdetect = CommunityDetection(embedding_file,
                               graph_path,
                               params={'directed': False})
score = comdetect.evaluate(num_of_communities=kmeans_num_of_communities)
print("Score: {}".format(score))
Code example #5
from nltk.corpus import stopwords

stoplist = set(stopwords.words('english'))

if __name__ == '__main__':
    data = pd.read_csv('codeforces_problems_csv/data.csv')
    X_data = list(data['problem_text'])

    if not os.path.exists('w2v_problem_data.bin'):
        sentences = [line for text in X_data for line in clean(text)]
        #for i in range(len(sentences)):
        #	sentences[i] = get_lemmatized_tokens(' '.join(sentences[i]))

        model = Word2Vec(sentences,
                         workers=4,
                         size=200,
                         min_count=50,
                         window=10,
                         sample=1e-3)
        model.save('w2v_problem_data.bin')

    else:
        model = Word2Vec.load('w2v_problem_data.bin')

    # very common word in dp problems
    print(model.most_similar('ways'))
    print(len(model.wv.vocab.values()))

    X = model[model.wv.vocab]

    # visualize the data
    tsne = TSNE(n_components=2)
Code example #6
File: run.py  Project: abdcelikkanat/expon_emb
from gensim.models.word2vec import Word2Vec
from gensim.test.utils import common_texts  # toy corpus bundled with gensim (assumed source of common_texts)

print("hello world")

model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)  # bare `workers` in the original is a syntax error; 4 is a placeholder

Code example #7
sns.set(style='white', context='notebook', palette='deep')

train = pd.read_csv('nlp_test/Train/Train_DataSet.csv')
label = pd.read_csv('nlp_test/Train/Train_DataSet_Label.csv')
test = pd.read_csv('nlp_test/Test/Test_DataSet.csv')
test_title = test['title']
# merge the texts and their labels into a single dataset
train = pd.merge(train, label, on='id')

train['title'] = train['title'].apply(lambda x: str(x))
train['words'] = train['title'].apply(lambda x: jieba.lcut(x))
# build the features
x = train['words']
y = train['label']

w2v = Word2Vec(size=100, min_count=5, window=5)
w2v.build_vocab(x)
w2v.train(x, total_examples=w2v.corpus_count, epochs=w2v.iter)
#
# # get the vector of a single sentence
#
# def total_vec(words):
#     vec=np.zeros(300).reshape(1,300)
#     for word in words:
#         try:
#             vec+=w2v.wv[word].reshape(1,300)
#         except KeyError:
#             continue
#     return vec
#
# train_vec=np.concatenate(total_vec(words) for words in x)
Code example #8
            for file in files:
                with open(root + '/' + file, 'r') as f:
                    text += f.read()

### each.translate(translator) == remove special characters
### x.lower() == lowercase
### if x.lower() not in stop_words == remove stopwords
clean = [[
    x.lower() for x in each.translate(translator).split()
    if x.lower() not in stop_words
] for each in text.split('.\n')]

print(clean)
print("------------------------------------------------------------")
# window size 20, minimum count 7, skip-gram, 10,000 training iterations
model = Word2Vec(clean, window=20, min_count=7, sg=1, iter=10000)

print(list(model.wv.vocab.keys()))
print("vocab length : %d" % len(model.wv.vocab))

# find words with similar meanings
# print(model.wv.most_similar("good"))

#
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt

X = model.wv[model.wv.vocab]

tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)
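The snippet breaks off right after the t-SNE projection. A minimal plotting sketch of how such a 2-D projection is usually visualized (the labeling loop and figure size are assumptions, not part of the original example):

# Hedged sketch: scatter-plot the 2-D t-SNE projection and label each point with its word.
words = list(model.wv.vocab)  # same key order used to build X above
plt.figure(figsize=(12, 12))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], s=10)
for word, (x, y) in zip(words, X_tsne):
    plt.annotate(word, xy=(x, y), fontsize=8)
plt.show()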
Code example #9
#%%
data = 'total_results.p'
total_results = pickle.load(open(data, "rb"))

#%%
### initialize model and build vocabulary
n_dim = 300
window = 5
downsampling = 0.001
seed = 1
num_workers = os.cpu_count() - 2  ## not sure if this is a good idea
min_count = 30
imf_w2v = Word2Vec(sg=1,
                   seed=seed,
                   workers=num_workers,
                   size=n_dim,
                   min_count=min_count,
                   window=window,
                   sample=downsampling)
## build the vocabulary
imf_w2v.build_vocab(total_results)

#%%
## train w2v model
corpus_count = imf_w2v.corpus_count
overall_start_time = time.time()
for i in range(200):
    start_time = time.time()
    iteration = 10
    print('running', i + 1, '-', (i + 1) * iteration)
    if gensim.__version__[0] == '1':
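The loop breaks off at the version check, which presumably selects between the older train(sentences) call and the newer signature that requires total_examples and epochs. A hedged sketch of such a continuation of the loop body (the branch contents are assumptions, not the author's code):

    # Hedged sketch (assumed, not original): call train() with the signature the installed gensim expects.
    if gensim.__version__[0] == '1':
        imf_w2v.train(total_results)
    else:
        imf_w2v.train(total_results,
                      total_examples=corpus_count,
                      epochs=iteration)
    print('step {} took {:.1f} seconds'.format(i + 1, time.time() - start_time))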
Code example #10
#wiki = WikiCorpus('data/enwiki-20170101-pages-articles-multistream.xml.bz2', lemmatize=False)
#tfidf = TfidfModel(wiki)
# save for persistence

#wiki.save('wiki.corpus')
#tfidf.save('wiki.tfidf.model')
          
# word2vec
sentences = []


class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                yield line.split()


# Read the dump into memory; tokenize each line so Word2Vec receives lists of words.
file_path = "enwiki-latest-pages-articles1.xml"
with open(file_path, "r", buffering=1) as f:
    for line in f:
        sentences.append(line.split())

params = {'size': 200, 'window': 10, 'min_count': 10, 
          'workers': max(1, multiprocessing.cpu_count() - 1), 'sample': 1E-3,}
word2vec = Word2Vec(sentences, **params)
word2vec.save('wiki.word2vec.model')

print(word2vec.wv['configuration'])
Code example #11
import logging
import re
import nltk
from gensim.models.word2vec import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models.keyedvectors import KeyedVectors
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = LineSentence('txt/source.txt')
model = Word2Vec(sentences, min_count=3, size=10, window=300)
word_vectors = model.wv
word_vectors.save('model')
model = KeyedVectors.load('model')
Code example #12
    # load all data sentence

    with open('data/structured/sentence_neg_handled/train/pos.pkl',
              'rb') as fi:
        sentence_train_pos = dill.load(fi)
    with open('data/structured/sentence_neg_handled/train/neg.pkl',
              'rb') as fi:
        sentence_train_neg = dill.load(fi)

    # join the data
    all_data = sentence_train_pos + sentence_train_neg

    size = 300
    window = 7
    min_count = 15
    model_word2vec = Word2Vec(all_data,
                              size=size,
                              window=window,
                              min_count=min_count,
                              workers=4)

    # filename to save (or load)
    fname = "model/model{}_{}_{}.pkl".format(size, window, min_count)

    # check model
    print(model_word2vec.wv['good'])

    # save the file in fname
    model_word2vec.save(fname)
Code example #13
from gensim.models.word2vec import Word2Vec
import pandas as pd
# train word vectors

# load the data
# English question pairs: English question 1, Spanish translation 1, English question 2, Spanish translation 2, match label.
english_spa = pd.read_csv('/home/moon/work/tianchi/data/cikm_english_train_20180516.txt', sep = '\t', header = None)
english_spa.columns = ['eng_qura1', 'spa_qura1', 'eng_qura2', 'spa_qura2', 'label']
# Spanish question 1
english_spa['spa_qura_list_1'] = english_spa['spa_qura1'].apply(lambda x : x.split(' '))
# Spanish question 2
english_spa['spa_qura_list_2'] = english_spa['spa_qura2'].apply(lambda x : x.split(' '))
spa_list = list(english_spa['spa_qura_list_1'])
spa_list.extend(list(english_spa['spa_qura_list_2']))
model = Word2Vec(spa_list, sg=1, size=30,  window=5,  min_count=1,  negative=3, sample=0.001, hs=1, workers=8)
model.save("./w2v.mod")
Code example #14
FILTER_ENGLISH = True
# Name for output w2v model file
OUTPUT_MODEL_FILE = "w2v_yelp_100_alpha_0.025_window_4"
PICKLED_DATA = "/home/alfredo/deep-nlp/data/reviews.pickle."

NUM_PARTITIONS = 2  # Use all data
reviews_texts, _, _, _, _ = get_reviews_data(range(1, NUM_PARTITIONS),
                                             PICKLED_DATA)

# Each review will be considered a sentence
sentences = []
for num, text in enumerate(reviews_texts):
    if num % 10000 == 0:
        print "%d out of %d reviews read" % (num, len(reviews_texts))
    if FILTER_ENGLISH:
        if detect_language(text) == u"english":
            sentences.append(tokenize_text(text))
    else:
        sentences.append(text)

# Build a w2v model
w2v = Word2Vec(sentences=sentences,
               size=100,
               alpha=0.025,
               window=4,
               min_count=2,
               sample=1e-5,
               workers=4,
               negative=10)
w2v.save(OUTPUT_MODEL_FILE)
Code example #15

def labelizeTweets(tweets, label_type):
    labelized = []
    for i, v in tqdm(enumerate(tweets)):
        label = '%s_%s' % (label_type, i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized


x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')

# print(x_train[0])

tweet_w2v = Word2Vec(size=200, min_count=10)  # can change size
tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])
tweet_w2v.train([x.words for x in tqdm(x_train)],
                total_examples=tweet_w2v.corpus_count,
                epochs=tweet_w2v.iter)

vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in x_train])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print('vocab size :', len(tfidf))


def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
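buildWordVector is truncated here. A common completion of this tf-idf-weighted averaging pattern, sketched from the names already in scope (vec, count, tweet_w2v, tfidf) and not necessarily the author's exact code:

def buildWordVector(tokens, size):
    # Hedged reconstruction: average the tf-idf-weighted word vectors of a tweet.
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v.wv[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError:
            # word is missing from the word2vec or tf-idf vocabulary
            continue
    if count != 0:
        vec /= count
    return vec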
Code example #16
        result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
    return result


# gather all tweet data - the train and test text columns are concatenated
all_x = pd.concat([x_train, x_test])
all_x_w2v = labelize_tweets_ug(all_x, 'all')

# apply the word2vec CBOW method (sg=0) to the tweet words;
# window=2 is the maximum distance between the current word and the predicted word,
# size=100 is the dimensionality of the feature vectors
cores = multiprocessing.cpu_count()
model_ug_cbow = Word2Vec(sg=0,
                         size=100,
                         negative=5,
                         window=2,
                         min_count=2,
                         workers=cores,
                         alpha=0.065,
                         min_alpha=0.065)
model_ug_cbow.build_vocab([x.words for x in tqdm(all_x_w2v)])

# train the embeddings
for epoch in range(30):
    model_ug_cbow.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]),
                        total_examples=len(all_x_w2v),
                        epochs=1)
    model_ug_cbow.alpha -= 0.002
    model_ug_cbow.min_alpha = model_ug_cbow.alpha

# then the skip-gram model
model_sg = Word2Vec(sg=1,
Code example #17
all_docs.labels.iloc[1]
all_docs.doc_words[4][:52]
print(all_docs.doc_words[6])
# %%
import multiprocessing
import sys
from gensim.models.word2vec import Word2Vec

workers = multiprocessing.cpu_count()
print('number of cpu: {}'.format(workers))
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise."

# %%
word_model = Word2Vec(all_docs.doc_words,
                      min_count=2,
                      size=300,
                      window=5,
                      workers=workers,
                      iter=100)

# %%
from UtilWordEmbedding import MeanEmbeddingVectorizer


mean_vec_tr = MeanEmbeddingVectorizer(word_model)
doc_vec = mean_vec_tr.transform(all_docs.doc_words)

# %%

word_model.most_similar('submit')

# %%
Code example #18
# -*- coding: utf-8 -*-

import logging
import sys
from gensim.models.word2vec import Word2Vec, LineSentence

logging.basicConfig(level=logging.INFO)

model = Word2Vec(LineSentence(sys.argv[1]), sg=1)
model.save(sys.argv[2])
Code example #19
# data import and cleaning
test_solution = pd.read_csv("test_with_solutions.csv")
data_train = pd.read_csv("train.csv")

corpus = corpus_creation(data_train['Comment'])

# gensim word2vec model
vector_size = 512
window_size = 10

# Create Word2Vec
word2vec = Word2Vec(sentences=corpus,
                    size=vector_size,
                    window=window_size,
                    negative=20,
                    iter=50,
                    seed=1000,
                    workers=multiprocessing.cpu_count())

# Train subset size (0 < size < len(tokenized_corpus))
train_size = 3900
# Test subset size (len(corpus) - train_size)
test_size = 47

# Compute average and max tweet length
avg_length = 0.0
max_length = 0

for comment in corpus:
    if len(comment) > max_length:
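The loop is cut off here. A short sketch of the average/max length computation the comments above describe (an assumption, not the original code):

# Hedged sketch: longest comment and mean comment length over the corpus.
for comment in corpus:
    if len(comment) > max_length:
        max_length = len(comment)
    avg_length += float(len(comment)) / len(corpus)

print('Average comment length: {:.1f}'.format(avg_length))
print('Max comment length: {}'.format(max_length))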
Code example #20
"""
corpus = [preprocessing(x) for x in corpus]
X_train = [preprocessing(x) for x in X_train]
X_test = [preprocessing(x) for x in X_test]
# print(corpus)
# print(X_train)


"""
训练NLP模型
有了这些干净的数据集,我们可以做我们的NLP模型了。

先用最简单的Word2Vec
"""

model = Word2Vec(corpus, size=128, window=5, min_count=5, workers=4)
# print(model['ok'])

"""
用NLP模型表达我们的数据
接着,我们可以用这个坐标,来表示之前干干净净的数据。
但是有个问题。我们的vec是基于每个单词的,怎么办呢?
由于文本本身的量很小,我们可以把所有的单词的vector拿过来取个平均值
"""
# 先拿到全部的vocabulary
vocab = model.wv.vocab


def get_vector(word_list):  # get the vector of an arbitrary text
    # build an all-zeros array
    res = np.zeros([128])
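get_vector is truncated here. The docstring above says a text is represented by the average of its word vectors, so a minimal completion sketch (assuming the 128-dimensional model trained above) could look like this:

# Hedged sketch of the averaging described above (not necessarily the original implementation).
def get_vector(word_list):
    res = np.zeros([128])
    count = 0
    for word in word_list:
        if word in vocab:         # skip words the model has never seen
            res += model.wv[word]
            count += 1
    return res / count if count > 0 else res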
Code example #21
def word2vec_train(combined):
    model = Word2Vec(min_count=n_exposures, window=window_size)
    model.build_vocab(combined)  # input: list
    model.train(combined, total_examples=model.corpus_count, epochs=model.iter)
    model.save('./Word2vec_model.pkl')
    model.wv.save_word2vec_format("./word2vec.model", binary=True)
Code example #22
File: embeding.py  Project: raff7/HRED-Chatbot
from gensim.models.word2vec import Word2Vec
import gzip

f = gzip.open(r'D:\Wikipedia\OpenSubtitles2018.en.gz')
data = f.readlines()
for i in range(len(data)):
    data[i] = data[i][:-1].decode("utf-8")

model = Word2Vec(data, size=300, min_count=5, window=10, workers=7)
model.save("model-subtitles")
print('done')
Code example #23
def main():
    arg_parser = ArgumentParser(description='Script to train Word2Vec.')

    arg_parser.add_argument('-i', '--fasta-file')
    arg_parser.add_argument('-o', '--model-file')
    arg_parser.add_argument('-c', '--corpus-file')
    arg_parser.add_argument('-v', '--word-vectors-file')
    arg_parser.add_argument('-u', '--context-vectors-file')

    arg_parser.add_argument('-n', '--ngram-size', type=int, default=3)
    arg_parser.add_argument('-s', '--vector-size', type=int, default=100)
    arg_parser.add_argument('-w', '--window-size', type=int, default=5)
    arg_parser.add_argument('-t', '--num-threads', type=int, default=3)
    arg_parser.add_argument('-r', '--random-seed', type=int, default=None)
    arg_parser.add_argument('-k', '--num-iterations', type=int, default=5)

    args = arg_parser.parse_args()

    fasta_file = args.fasta_file
    model_file = args.model_file
    corpus_file = args.corpus_file
    word_vectors_file = args.word_vectors_file
    context_vectors_file = args.context_vectors_file

    ngram_size = args.ngram_size
    random_seed = args.random_seed
    vector_size = args.vector_size
    window_size = args.window_size
    num_threads = args.num_threads
    num_iterations = args.num_iterations

    if not any([fasta_file, corpus_file]):
        print('Error: Please specify either a FASTA file or corpus file.')
        arg_parser.print_help()
        return

    if fasta_file and not Path(fasta_file).exists():
        print('FASTA file not found: {}'.format(fasta_file))
        return

    if not corpus_file:
        corpus_file = 'corpus.txt'
    elif not fasta_file and not Path(corpus_file).exists():
        print('Corpus file not found: {}'.format(corpus_file))
        return

    if random_seed:
        print('Random-Seed-Mode: Setting number of threads to 1')

        num_threads = 1
        python_hash_seed = environ.get('PYTHONHASHSEED', None)

        if python_hash_seed is None or python_hash_seed == 'random':
            print('Random-Seed-Mode: Global PYTHONHASHSEED needs to be set')
            return
    else:
        random_seed = 42

    if fasta_file:
        make_corpus(fasta_file, corpus_file, ngram_size)

    if not any([model_file, word_vectors_file, context_vectors_file]):
        return

    model = Word2Vec(
        LineSentence(corpus_file),
        size=vector_size,
        window=window_size,
        min_count=2,
        sg=1,
        # hs=0,
        # negative=5,
        # ns_exponent=0.75,  # requires gensim 3.5
        # cbow_mean=1,
        # sample=0.001,
        iter=num_iterations,
        # alpha=0.025,
        # min_alpha=0.0001,
        # batch_words=10000,
        # null_word=0,
        # trim_rule=None,
        # compute_loss=False,
        # sorted_vocab=1,
        # max_vocab_size=None,
        # max_final_vocab=None,  # requires gensim 3.5
        seed=random_seed,
        workers=num_threads,
        # callbacks=()
    )

    if model_file:
        model.save(model_file)

    if word_vectors_file:
        save_w2v_vectors_file(word_vectors_file, model.wv.vocab,
                              model.wv.vectors)

    if context_vectors_file:
        has_syn1 = hasattr(model, 'syn1')  # hierarchical softmax
        has_syn1neg = hasattr(model, 'syn1neg')  # negative sampling

        if has_syn1 and has_syn1neg:
            context_vectors_file_1 = context_vectors_file + '.hs'
            context_vectors_file_2 = context_vectors_file + '.ns'

            save_w2v_vectors_file(context_vectors_file_1, model.wv.vocab,
                                  model.syn1)
            save_w2v_vectors_file(context_vectors_file_2, model.wv.vocab,
                                  model.syn1neg)
        elif has_syn1:
            save_w2v_vectors_file(context_vectors_file, model.wv.vocab,
                                  model.syn1)
        elif has_syn1neg:
            save_w2v_vectors_file(context_vectors_file, model.wv.vocab,
                                  model.syn1neg)
Code example #24
File: train.py  Project: harry990/nlp-course
def train_model(inpath, outpath):
    model = Word2Vec(LineSentence(inpath), workers=cpu_count())
    model.save(outpath)
Code example #25
        return w2indx, w2vec, train, test
    else:
        print('No data provided...')


print('Loading Data...')
train, test = import_tag(datasets=data_locations)
combined = train.values() + test.values()

print('Tokenising...')
combined = tokenizer(combined)

print('Training a Word2vec model...')
model = Word2Vec(size=vocab_dim,
                 min_count=n_exposures,
                 window=window_size,
                 workers=cpu_count,
                 iter=n_iterations)
model.build_vocab(combined)
model.train(combined)
# model.save('vectorizer.w2v')
print('Transform the Data...')
index_dict, word_vectors, train, test = create_dictionaries(train=train,
                                                            test=test,
                                                            model=model)

print('Setting up Arrays for Keras Embedding Layer...')
n_symbols = len(index_dict) + 1  # adding 1 to account for 0th index
embedding_weights = np.zeros((n_symbols, vocab_dim))
for word, index in index_dict.items():
    embedding_weights[index, :] = word_vectors[word]
Code example #26
def build_dataset(train_data_path, test_data_path):
    '''
    Load and preprocess the data
    :param train_data_path: path to the training set
    :param test_data_path: path to the test set
    :return: training data, test data, and the merged data
    '''
    # 1. Load the data
    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)
    print('train data size {},test data size {}'.format(
        len(train_df), len(test_df)))

    # 2. Drop rows with empty values
    train_df.dropna(subset=['Report'], inplace=True)

    train_df.fillna('', inplace=True)
    test_df.fillna('', inplace=True)

    # 3. Multiprocess batch preprocessing
    train_df = parallelize(train_df, sentences_proc)
    test_df = parallelize(test_df, sentences_proc)

    # 4. Merge the training and test sets
    train_df['merged'] = train_df[['Question', 'Dialogue',
                                   'Report']].apply(lambda x: ' '.join(x),
                                                    axis=1)
    test_df['merged'] = test_df[['Question',
                                 'Dialogue']].apply(lambda x: ' '.join(x),
                                                    axis=1)
    merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
    print('train data size {},test data size {},merged_df data size {}'.format(
        len(train_df), len(test_df), len(merged_df)))

    # 5. Save the processed training and test sets
    train_df = train_df.drop(['merged'], axis=1)
    test_df = test_df.drop(['merged'], axis=1)

    train_df.to_csv(train_seg_path, index=None, header=False)
    test_df.to_csv(test_seg_path, index=None, header=False)

    # 6. Save the merged data
    merged_df.to_csv(merger_seg_path, index=None, header=False)

    # 7. Train the word vectors
    print('start build w2v model')
    wv_model = Word2Vec(LineSentence(merger_seg_path),
                        size=embedding_dim,
                        sg=1,
                        workers=cores,
                        iter=wv_train_epochs,
                        window=5,
                        min_count=5)

    # 8. Separate the data and the labels
    train_df['X'] = train_df[['Question',
                              'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    test_df['X'] = test_df[['Question',
                            'Dialogue']].apply(lambda x: ' '.join(x), axis=1)

    # Split into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        train_df['X'],
        train_df['Report'],
        test_size=0.002,  # 80,000 * 0.002
    )

    X_train.to_csv(train_x_seg_path, index=None, header=False)
    y_train.to_csv(train_y_seg_path, index=None, header=False)
    X_val.to_csv(val_x_seg_path, index=None, header=False)
    y_val.to_csv(val_y_seg_path, index=None, header=False)

    test_df['X'].to_csv(test_x_seg_path, index=None, header=False)

    # 9. Pad with start/end tokens, fill unknown words with oov, pad to a fixed length
    # using the vocab produced by the gensim training
    vocab = wv_model.wv.vocab

    # Process the training X
    # find a suitable maximum length
    train_x_max_len = get_max_len(train_df['X'])
    test_X_max_len = get_max_len(test_df['X'])
    X_max_len = max(train_x_max_len, test_X_max_len)
    train_df['X'] = train_df['X'].apply(
        lambda x: pad_proc(x, X_max_len, vocab))

    # Process the test X
    # find a suitable maximum length
    test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))

    # Process the training Y
    # find a suitable maximum length
    train_y_max_len = get_max_len(train_df['Report'])
    train_df['Y'] = train_df['Report'].apply(
        lambda x: pad_proc(x, train_y_max_len, vocab))

    # 10. Save the padded / oov-processed data and labels
    train_df['X'].to_csv(train_x_pad_path, index=None, header=False)
    train_df['Y'].to_csv(train_y_pad_path, index=None, header=False)
    test_df['X'].to_csv(test_x_pad_path, index=None, header=False)
    #
    # print('train_x_max_len:{} ,train_y_max_len:{}'.format(X_max_len, train_y_max_len))

    # 11. Retrain the word vectors
    # print('start retrain w2v model')
    # wv_model.build_vocab(LineSentence(train_x_pad_path), update=True)
    # wv_model.train(LineSentence(train_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    #
    # print('1/3')
    # wv_model.build_vocab(LineSentence(train_y_pad_path), update=True)
    # wv_model.train(LineSentence(train_y_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    #
    # print('2/3')
    # wv_model.build_vocab(LineSentence(test_x_pad_path), update=True)
    # wv_model.train(LineSentence(test_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)

    # Save the word-vector model
    wv_model.save(save_wv_model_path)
    print('finish retrain w2v model')
    print('final w2v_model has vocabulary of ', len(wv_model.wv.vocab))

    # 12. Update the vocab
    vocab = {word: index for index, word in enumerate(wv_model.wv.index2word)}
    reverse_vocab = {
        index: word
        for index, word in enumerate(wv_model.wv.index2word)
    }

    # Save the dictionaries
    save_dict(vocab_path, vocab)
    save_dict(reverse_vocab_path, reverse_vocab)

    # 13. Save the embedding matrix
    embedding_matrix = wv_model.wv.vectors
    np.save(embedding_matrix_path, embedding_matrix)

    # 14. Convert the dataset: map words to indices, e.g. [<START> 方向机 重 ...] -> [32800, 403, 986, 246, 231
    vocab = Vocab()

    train_ids_x = train_df['X'].apply(
        lambda x: transform_data(x, vocab.word2id))
    train_ids_y = train_df['Y'].apply(
        lambda x: transform_data(x, vocab.word2id))
    test_ids_x = test_df['X'].apply(lambda x: transform_data(x, vocab.word2id))

    # 15. Convert the data to numpy arrays
    # turn the index lists into a matrix: [32800, 403, 986, 246, 231] --> array([[32800,   403,   986 ]]
    train_X = np.array(train_ids_x.tolist())
    train_Y = np.array(train_ids_y.tolist())
    test_X = np.array(test_ids_x.tolist())

    # Save the arrays
    np.save(train_x_path, train_X)
    np.save(train_y_path, train_Y)
    np.save(test_x_path, test_X)
    return train_X, train_Y, test_X
Code example #27
File: word2vec.py  Project: phymucs/emgan
    cleaned_train.close()
    cleaned_test.close()

for category in categories:
    lines = open('data/cleaned/cleaned_'+category+'_train.txt', 'r').readlines()
    cleaned_docs_in_sent = open('data/cleaned/cleaned_'+category+'_train_in_sent.txt', 'w')
    for line in lines:
        sentences = nltk.sent_tokenize(line)
        for s in sentences:
            cleaned_docs_in_sent.write(re.sub('\.+$', '', s.strip()) + '\n')
    cleaned_docs_in_sent.close()

seeds = [1, 123, 888, 1234, 8888]
entertainment_model, ideas_model, world_model, us_model, politics_model, all_model = [], [], [], [], [], []
for i in range(5):
    entertainment_model += [Word2Vec(LineSentence('data/cleaned/cleaned_Entertainment_train_in_sent.txt'), seed=seeds[i], size=300, window=5, min_count=5, workers=4)]
    ideas_model += [Word2Vec(LineSentence('data/cleaned/cleaned_Ideas_train_in_sent.txt'), seed=seeds[i], size=300, window=5, min_count=5, workers=4)]
    world_model += [Word2Vec(LineSentence('data/cleaned/cleaned_World_train_in_sent.txt'), seed=seeds[i], size=300, window=5, min_count=5, workers=4)]
    us_model += [Word2Vec(LineSentence('data/cleaned/cleaned_US_train_in_sent.txt'), seed=seeds[i], size=300, window=5, min_count=5, workers=4)]
    politics_model += [Word2Vec(LineSentence('data/cleaned/cleaned_Politics_train_in_sent.txt'), seed=seeds[i], size=300, window=5, min_count=5, workers=4)]
    # os.system('cat data/cleaned/cleaned_Entertainment_train_in_sent.txt data/cleaned/cleaned_Ideas_train_in_sent.txt data/cleaned/cleaned_World_train_in_sent.txt '
    #     + 'data/cleaned/cleaned_US_train_in_sent.txt data/cleaned/cleaned_Politics_train_in_sent.txt > data/cleaned/cleaned_all_train_in_sent.txt')
    all_model += [Word2Vec(LineSentence('data/cleaned/cleaned_all_train_in_sent.txt'), seed=seeds[i], size=300, window=5, min_count=5, workers=4)]

vocab = list(set(world_model[0].vocab.keys()).union(us_model[0].vocab.keys()).union(politics_model[0].vocab.keys()).union(all_model[0].vocab.keys())
    .union(entertainment_model[0].vocab.keys()).union(ideas_model[0].vocab.keys()))
indices_of_vocab = dict({vocab[i] : i for i in range(len(vocab))})
count_of_vocab = np.zeros(len(vocab), dtype='int32')
for category in categories:
    lines = open('data/cleaned/cleaned_'+category+'_train.txt', 'r').readlines()
    for line in lines:
Code example #28
File: all_test.py  Project: ZBayes/NLP_using
# Shuffle the dataset
index = [i for i in range(len(all_data))]
np.random.shuffle(index)
x = np.array(all_data)[index]
y = y[index]

# Split the dataset
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=20)
print("data preprocessing completed")

# word2vec
imdb_w2v = Word2Vec(size=N_DIM, min_count=MIN_COUNT)
imdb_w2v.build_vocab(all_data)
imdb_w2v.train(all_data, total_examples=len(all_data), epochs=w2v_EPOCH)
print("word2vec completed")

# word2vec post-processing
n_symbols = len(imdb_w2v.wv.vocab.keys()) + 1
embedding_weights = np.zeros((n_symbols, 100))
idx = 1
word2idx_dic = {}
for w in imdb_w2v.wv.vocab.keys():
    embedding_weights[idx, :] = imdb_w2v[w]
    word2idx_dic[w] = idx
    idx = idx + 1
# print(embedding_weights[0, :])
Code example #29
sentences = []
size = 0  # total characters read; used for the 1 GB cap below (missing in the original snippet)
pre = open('sentences.txt', 'w', encoding='utf-8')
for index in range(1, 600):
    path = ('judgments-%s.json' % index)
    data = json.load(open(path, encoding="utf8"))['items']

    for j in data:
        text = j['textContent'].replace("-\n", "").lower()
        text = re.sub(r'<[^>]*>', "", text)
        for sentence in sent_tokenize(text, language='polish'):
            pre.write(re.sub('\s+', ' ', sentence).strip() + "\n")
        size += len(text)
        if size >= 1000000000:
            break
    if size >= 1000000000:
        break
pre.close()
print(size)
sentences = LineSentence('sentences.txt')
bigram = Phraser(Phrases(sentences))
bigram.save("bigram")
print("1")
sentence_stream = [bigram[sentence] for sentence in sentences]
trigram = Phraser(Phrases(sentence_stream))
trigram.save("trigram")
print("2")
model = Word2Vec([trigram[bigram[sentence]] for sentence in sentence_stream],
                 window=5, size=300, sg=0, workers=12, min_count=3)
model.save("model")
print("processed")
Code example #30
review_part.shape
import warnings
warnings.filterwarnings("ignore")
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


def split_sentences(review):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = [clean_text(s) for s in raw_sentences if s]  # clean each sentence
    return sentences


sentences = sum(review_part.apply(split_sentences), [])
print('{} reviews -> {} sentences'.format(len(review_part), len(sentences)))
sentences_list = []
for line in sentences:
    sentences_list.append(nltk.word_tokenize(line))
num_features = 300  # Word vector dimensionality
min_word_count = 40  # Minimum word count
num_workers = 4  # Number of threads to run in parallel
context = 10  # Context window size
model_name = 'Word_Vector.model'
from gensim.models.word2vec import Word2Vec
model = Word2Vec(sentences_list,
                 workers=num_workers,
                 size=num_features,
                 min_count=min_word_count,
                 window=context)
model.init_sims(replace=True)
model.save(os.path.join('..', 'models', model_name))