Ejemplo n.º 1
0
def train_fasttext(corpus_folder='assets/corpus/', target='models/ft/',
                   size=300, window=5, mincount=100):
    """Train a FastText model on a corpus folder and save it under ``target``.

    The saved file name encodes the hyper-parameters:
    ``med_model_dim{size}_win{window}_min{mincount}.bin``.

    Args:
        corpus_folder: Passed to ``CorpusSentences`` to build the sentence
            iterable.
        target: Directory that receives the saved model; created if missing.
        size: Forwarded to ``FastText`` as ``size``.
        window: Forwarded to ``FastText`` as ``window``.
        mincount: Forwarded to ``FastText`` as ``min_count``.
    """
    sentences = CorpusSentences(corpus_folder)

    model = FastText(window=window, min_count=mincount, size=size)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=5)

    # store model
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(target, exist_ok=True)

    # os.path.join keeps the file inside `target` even when the caller omits
    # the trailing separator (plain concatenation would yield "models/ftmed_...").
    model_fn = os.path.join(
        target,
        'med_model_dim{}_win{}_min{}.bin'.format(size, window, mincount))

    model.save(model_fn)

    # test model
    # Smoke-test query; goes through `.wv` because the model-level
    # `most_similar` shortcut is deprecated in gensim.
    model.wv.most_similar('transglutaminase')
Ejemplo n.º 2
0
def build_and_save_word2vec_embeddings_model():
    """Train a FastText model, save it and print neighbours of sample words.

    Despite the function name and the log messages, the model trained here is
    a ``FastText`` instance. Sentences come from
    ``SentencesIterator(tokens_generator)`` and the model is saved to
    ``EMBEDDINGS_WORD2VEC_MODEL_FILE``.

    Returns:
        The trained ``FastText`` model.
    """
    print('Connecting to the database...')
    sentences = SentencesIterator(tokens_generator)
    print('Calculating the embeddings...')
    model = FastText(sentences, size=100, window=10, min_count=3, workers=4)
    print('Saving the model...')
    model.save(EMBEDDINGS_WORD2VEC_MODEL_FILE)
    print('WORD2VEC Model saved. Examples:')
    interesting_words = [
        'ciao', 'salutare', 'motorino', 'simpatia', 'milano', 'roma',
        'sgargapuffoparolainventata',
    ]
    for w in interesting_words:
        print('Words most similar to', w)
        # Query via `.wv`; the model-level `most_similar` is deprecated.
        print([similar for similar, _score in model.wv.most_similar(w)])
    return model
    # NOTE(review): this fragment's enclosing definition is not visible here;
    # as laid out it follows a `return` statement and would be unreachable.
    # Train a skip-gram (sg=1) FastText model on `words_to_embed` using
    # negative sampling (negative=10), 50 iterations and a fixed seed.
    model = FastText(words_to_embed,
                     window=2,
                     negative=10,
                     iter=50,
                     sg=1,
                     workers=4,
                     alpha=0.005,
                     size=300,
                     seed=100)
    # Export the word vectors in word2vec text format (binary=False) and
    # write the vocabulary to the separate `fvocab` file.
    (model.wv.save_word2vec_format("data/training_word_vectors.bin",
                                   fvocab="data/training_word_vocab.txt",
                                   binary=False))

# In[69]:

# Query through `.wv`: the model-level `most_similar` shortcut is deprecated.
model.wv.most_similar("diabetes")

# In[70]:

from sklearn.manifold import TSNE
vocab = list(model.wv.vocab)
# Index the keyed vectors directly; `model[...]` is a deprecated alias for
# `model.wv[...]`.
X = model.wv[vocab]
# Project every vocabulary vector down to two components.
tsne = TSNE(n_components=2)
tsne_df = pd.DataFrame(tsne.fit_transform(X),
                       index=vocab,
                       columns=["comp_1", "comp_2"])

# In[71]:

# Create the figure (13 x 11 inches) for the plot.
fig, ax = plt.subplots()
fig.set_size_inches(13, 11)
from gensim.models import FastText
import pandas as pd

# Load the spreadsheet and take the '內容' column as the raw sentences.
article = pd.read_excel('Cut_Finish_jieba.xlsx')

sentences = article['內容'].tolist()

# Split every sentence on single spaces to obtain the token lists.
# NOTE(review): presumably the text is already word-segmented — confirm.
split_sentences = [i.split(' ') for i in sentences]

print('訓練開始')
# build a FastText model
model = FastText(split_sentences, size=500, window=10, min_count=5, workers=4)
# save model to file
model.save("fastText_stock.model")
# To reload the saved model later:
# model = FastText.load("fastText_stock.model")

# Print the five nearest neighbours of each probe word. Queries go through
# `.wv` because the model-level `most_similar` shortcut is deprecated.
for company in ("台積電", "鴻海", "中華電信", "仁寶", "兆豐金"):
    print(model.wv.most_similar(company, topn=5))
Ejemplo n.º 5
0
import data_manager
from argparse import ArgumentParser
from gensim.models import FastText, Word2Vec
import logging

if __name__ == '__main__':
    # Script entry point: load a corpus, print it and its most frequent
    # words, train a FastText model and print neighbours of a probe word.

    parser = ArgumentParser()
    parser.add_argument('-c', '--corpus', help='Corpus file', required=True)
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    corpus = data_manager.ExampleCorpus(args.corpus)

    # Echo every item yielded by the corpus.
    for line in corpus:
        print(line)

    # Gets most frequent words
    topk = corpus.get_topk_words(topk=100)
    print('Most frequent words:')
    for entry in topk:
        print(entry)

    ft = FastText(size=100, window=5, min_count=3, sentences=corpus, iter=10)

    # Query through `.wv`; the model-level `most_similar` is deprecated.
    for word, similarity in ft.wv.most_similar('felltithio'):
        print(word, similarity)
Ejemplo n.º 6
0
class WordEmbedding():
    """Wrapper around gensim word-embedding models.

    ``embedding_type`` selects the backend: ``"w2v"`` (Word2Vec), ``"ft"``
    (FastText) or ``"glove"`` (loading only; training raises ``ValueError``).
    """

    def __init__(self,
                 embedding_type="w2v",
                 embedding_size=100,
                 ngram=(3, 6),
                 window_size=5,
                 architecture="sg"):
        """Store the hyper-parameters; no model is created yet.

        Args:
            embedding_type: ``"w2v"``, ``"ft"`` or ``"glove"``.
            embedding_size: Vector dimensionality.
            ngram: ``(min_n, max_n)`` character n-gram range, used by the
                FastText backend; ``None`` falls back to ``(3, 6)``.
            window_size: Context window size.
            architecture: ``"sg"`` enables skip-gram; any other value
                disables it.
        """
        self.embedding_type = embedding_type
        self.window = window_size
        self.size = embedding_size
        self.model = None
        self.skip_gram = architecture == "sg"
        if ngram is None:
            ngram = (3, 6)
        self.min_gram = ngram[0]
        self.max_gram = ngram[1]

    def train_embedding(self,
                        sentences,
                        n_iter=100,
                        workers=1,
                        min_count=3,
                        negative_sample=1):
        """Train the model on ``sentences``, creating it on first use.

        On the first call a new model is built and its vocabulary
        initialised; on later calls the vocabulary is extended with
        ``update=True`` before training continues.

        Args:
            sentences: Corpus passed to ``build_vocab`` and ``train``.
            n_iter: Number of training epochs.
            workers: Worker count for a newly created model.
            min_count: Minimum word frequency for a newly created model.
            negative_sample: ``negative`` setting for a newly created model.

        Raises:
            ValueError: For ``"glove"`` (training unsupported) or an unknown
                embedding type.
        """
        if self.embedding_type == "glove":
            raise ValueError("GloVe training not supported use official repo")
        if self.embedding_type not in ("w2v", "ft"):
            raise ValueError("Invalid Embedding Type")

        if self.model is None:
            shared = dict(size=self.size,
                          window=self.window,
                          min_count=min_count,
                          negative=negative_sample,
                          workers=workers,
                          sg=int(self.skip_gram))
            if self.embedding_type == "w2v":
                self.model = Word2Vec(**shared)
            else:
                self.model = FastText(min_n=self.min_gram,
                                      max_n=self.max_gram,
                                      **shared)
            self.model.build_vocab(sentences)
        else:
            self.model.build_vocab(sentences, update=True)

        self.model.train(sentences,
                         epochs=n_iter,
                         total_examples=self.model.corpus_count)

    def retrieve_vector(self, word):
        """Return the vector for ``word``, or a random vector on ``KeyError``."""
        try:
            return self.model.wv[word]
        except KeyError:
            return np.random.random(self.size)

    def find_similar_word(self, word, n=10):
        """Return the ``n`` most similar entries to ``word``; ``[]`` on ``KeyError``."""
        # Trained models expose their vectors as `.wv` (the model-level
        # `most_similar` shortcut is deprecated); objects without `.wv`
        # are queried directly.
        vectors = getattr(self.model, "wv", self.model)
        try:
            return vectors.most_similar(positive=[word], topn=n)
        except KeyError:
            return []

    def save_model(self, file_name):
        """Save the model and bundle the resulting files into a zip archive.

        The model is saved as ``{file_name}.model``; every file matching
        ``{file_name}.model*`` is added to a zip written at ``file_name`` and
        then removed from disk.
        """
        self.model.save("{}.model".format(file_name))
        we_model_files = glob("{}.model*".format(file_name))
        with ZipFile(file_name, "w") as zipf:
            for we_file in we_model_files:
                zipf.write(we_file)
                os.remove(we_file)

    def load_model(self, file_name):
        """Load vectors from ``file_name`` according to ``embedding_type``.

        If ``file_name`` is a zip archive it is extracted to ``/tmp/`` and the
        first member ending in ``.model`` is loaded; otherwise ``file_name``
        itself is used as the path. ``self.size`` is refreshed from the
        loaded vectors.

        NOTE(review): ``save_model`` writes via ``model.save`` while this
        method reads word2vec / fastText formats — confirm the two are meant
        to round-trip.
        """
        try:
            with ZipFile(file_name, "r") as zipf:
                zipf.extractall("/tmp/")
                nl = zipf.namelist()
            fn = [name for name in nl if name.endswith(".model")][0]
            path = "/tmp/" + fn
        except BadZipFile:
            # Not an archive: treat the argument as the model file itself.
            path = file_name

        if self.embedding_type == "w2v":
            self.model = KeyedVectors.load_word2vec_format(path)
        elif self.embedding_type == "ft":
            self.model = FastText.load_fasttext_format(path)
        elif self.embedding_type == "glove":
            # path name: .txt file
            try:
                glove_file = datapath(os.path.abspath(path))
                tmp_file = get_tmpfile("/tmp/g2w2v.txt")
                glove2word2vec(glove_file, tmp_file)
                self.model = KeyedVectors.load_word2vec_format(tmp_file)
            except UnicodeDecodeError:
                # Fall back to loading the path with KeyedVectors.load.
                self.model = KeyedVectors.load(os.path.abspath(path))
        self.size = self.model.wv.vector_size

    def remove_from_vocab(self, word_list):
        """Drop every word in ``word_list`` from the loaded vocabulary.

        For ``"ft"`` the vectors live on ``self.model.wv``; for every other
        type ``self.model`` itself is treated as the keyed-vectors object.
        """
        target = self.model.wv if self.embedding_type == "ft" else self.model
        self._prune_vocab(target, word_list)

    @staticmethod
    def _prune_vocab(kv, word_list):
        """Rebuild ``kv``'s vocab, index and vector tables without ``word_list``."""
        banned = set(word_list)  # O(1) membership instead of a list scan per word
        kv.init_sims()  # ensures vectors_norm is populated before it is read

        kept_words = []
        kept_vocab = {}
        kept_vectors = []
        kept_norms = []
        for i, word in enumerate(kv.index2entity[:len(kv.vocab)]):
            if word in banned:
                continue
            entry = kv.vocab[word]
            # Re-number so indices stay contiguous after the removals.
            entry.index = len(kept_words)
            kept_words.append(word)
            kept_vocab[word] = entry
            kept_vectors.append(kv.vectors[i])
            kept_norms.append(kv.vectors_norm[i])

        kv.vocab = kept_vocab
        kv.vectors = np.array(kept_vectors)
        kv.index2entity = kept_words
        kv.index2word = kept_words
        # Stored as an ndarray like `vectors` (it used to be left as a list).
        kv.vectors_norm = np.array(kept_norms)
Ejemplo n.º 7
0
# Save the FastText model
model.save("FastText.model")

# Close the file deterministically instead of leaking the handle.
with open("data\\val.json", "rb") as val_file:
    val_data = json.load(val_file)

# Tags inferred from the val data (Type 2 problems)
playlist_to_tags = {}

for data in val_data:
    # Only handle entries where just the playlist title is given (Type 2)
    if data['plylst_title'] != "" and data['tags'] == [] and data[
            'songs'] == []:
        title = data['plylst_title']
        chosen = playlist_to_tags[title] = []
        # Split the title on spaces and fetch the 50 words that are jointly
        # closest to all of its parts.
        recommends = model.most_similar(positive=title.split(" "), topn=50)
        # Of the fetched words, keep only those that are known tags.
        for recommend in recommends:
            if recommend[0] not in tags:
                continue
            # Compare against the tags chosen so far and append at most once.
            # The previous loop appended to the list it was iterating over,
            # adding the same recommendation once per dissimilar existing tag.
            # NOTE(review): `begin_tag` is a stored (word, score) tuple while
            # the second argument is a bare word — confirm diff_2gram expects
            # that.
            scores = [diff_2gram(begin_tag, recommend[0])
                      for begin_tag in chosen]
            if all(sim1 <= 0.5 and sim2 <= 0.5 for sim1, sim2 in scores):
                chosen.append(recommend)
Ejemplo n.º 8
0
### Create word2vec model w/ merged vocab
t = time()
new_wv = FastText(size=30, window=5, min_count=1, workers=3, sg=0, hs=1, negative = 10, sample=0.001, alpha=0.1)
new_wv.build_vocab(sentences)
# gensim's train() needs the corpus size (total_examples or total_words);
# build_vocab() recorded it as corpus_count.
total_examples = new_wv.corpus_count
# Disabled: merge in a pre-trained vocabulary before training.
# new_wv.build_vocab([list(pubmed_wv.vocab.keys())], update=True)
# new_wv.intersect_word2vec_format(preTrainedPath, binary=True, lockf=1.0)

### Train for 2 epochs
new_wv.train(sentences, total_examples=total_examples, epochs=2)
print('Time to train the model 2 epochs: {} mins'.format(round((time() - t) / 60, 2)))
print('----------------------------')
# Queries and export go through `.wv`; the model-level shortcuts are deprecated.
for probe in ('treatment', 'female', 'history', 'disease', 'brain'):
    print(new_wv.wv.most_similar(positive=[probe]))
new_wv.wv.save_word2vec_format('mimic-pubmed_2.bin', binary=True)
print('----------------------------')


# Train for 10 epochs
# 8 further epochs on top of the 2 above give 10 in total.
new_wv.train(sentences, total_examples=total_examples, epochs=8)
print('Time to train the model 10 epochs: {} mins'.format(round((time() - t) / 60, 2)))
print('----------------------------')
for probe in ('treatment', 'female', 'history'):
    print(new_wv.wv.most_similar(positive=[probe]))
# print(len(text))
# Tokenise every entry of `x` into a lower-cased token list.
ct = 0
text_vocab = []
# for i,ctext in enumerate(df1["ctext"]):
for text in x:
    # str() lets non-string entries pass through the tokenizer.
    text = str(text).lower()
    articles_tokens = list(word_tokenize(text))
    ct += 1
    print(ct)  # progress counter
    text_vocab.append(articles_tokens)
print(len(x))
print(len(text_vocab))

model_ted = FastText(text_vocab, size=100, window=5, min_count=5, workers=4)

# Neighbour queries and vector lookups go through `.wv`; the model-level
# `most_similar` and `model[...]` shortcuts are deprecated.
for probe in ('china', 'bjp', 'timeswarner'):
    print(model_ted.wv.most_similar(probe))
#model_ted.save('bbc_ft.bin')
print(model_ted.wv['china'])
print(model_ted.wv['bjp'])
# print(model_ted[''])
# articles_tokens=[]
# for text in x:
#     articles_tokens.append([word for word in word_tokenize(str(x).lower().replace("."," "))])
#     print('')
# print(x[0])
# print(articles_tokens[0 ])
from gensim.models import FastText
import pandas as pd

# Load the spreadsheet and take the '內容' column as the raw sentences.
article = pd.read_excel('../article set/All_File/Cut_Finish_jieba.xlsx')

sentences = article['內容'].tolist()

# Split every sentence on single spaces to obtain the token lists.
# NOTE(review): presumably the text is already word-segmented — confirm.
split_sentences = [i.split(' ') for i in sentences]

print('訓練開始')
# build a FastText model
model = FastText(split_sentences, size=500, window=10, min_count=5, workers=4 ,iter=10)
# save model to file
model.save("../Word_Embedding_model/fastText_stock.model")
# load model to python
# model = FastText.load("fastText_stock.model")
print(model.wv['台積電'])


# Print each probe word with its five nearest neighbours. Queries go through
# `.wv` because the model-level `most_similar` shortcut is deprecated.
for company in ("台積電", "鴻海", "中華電信", "仁寶", "兆豐金"):
    print(company, model.wv.most_similar(company, topn=5))
Ejemplo n.º 11
0
class FeatureBuilder:
    """Builds FastText features for names labelled by class.

    Names come from three datasets (company, location, goods). One FastText
    encoder per class is loaded or trained, and ``one_vs_rest_generator``
    turns the train/validation split into arrays for a binary classifier.

    ``feature_encoder`` (the combined encoder used by ``validate_encoder``
    and ``get_encoding``) is only set by ``train_fasttext_encoder``.
    """

    def __init__(self, ordering=None):
        """Create an empty builder.

        Args:
            ordering: Class label for each dataset, in the order the datasets
                are loaded. Defaults to ``['company', 'location', 'goods']``.
        """
        # A fresh list per instance; a list literal as the default would be
        # shared by every instance.
        if ordering is None:
            ordering = ['company', 'location', 'goods']
        self.feature_encoder = None
        self.sizes = []
        self.train = None
        self.validation = None
        self.ordering = ordering
        self.word_mapping = {}
        self.company_feature_encoder = None
        self.location_feature_encoder = None
        self.goods_feature_encoder = None

    def load(self):
        """Load the datasets and build the train/validation split."""
        self.load_data()

    def load_model(self):
        """Load the three per-class encoders, training them if loading fails."""
        try:
            self.company_feature_encoder = FastText.load(
                './models/company_fasttext.model')
            self.location_feature_encoder = FastText.load(
                './models/location_fasttext.model')
            self.goods_feature_encoder = FastText.load(
                './models/goods_fasttext.model')
        except Exception:
            # Best-effort fallback: any load failure triggers retraining, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            print('Existing model does not exist. Training from scratch')
            for class_type in ('company', 'location', 'goods'):
                self.classType_fasttext_train(class_type)

    def load_data(self):
        """Read the datasets, record each name's classes and split the names.

        Fills ``word_mapping`` (name -> list of class labels), appends each
        dataset's row count to ``sizes`` and stores an 80/20 split in
        ``train`` / ``validation``.
        """
        data = []
        datasets = [get_company_data(), get_location_data(), get_items_cat()]
        for idx, dataset in enumerate(datasets):
            print('Is any entry Null?:', dataset.isnull().values.any())
            names = list(dataset['name'].values)
            for name in names:
                self.word_mapping.setdefault(name, []).append(
                    self.ordering[idx])
            self.sizes.append(dataset.shape[0])
            data += names
        self.train, self.validation = train_test_split(data,
                                                       random_state=0,
                                                       test_size=0.2)
        print('Train Test Constructed')

    def classType_fasttext_train(self, classType):
        """Train and save the encoder for one class.

        Args:
            classType: ``'company'``, ``'location'`` or ``'goods'``.

        Raises:
            Exception: If ``classType`` is not one of the allowed values.
                This is now checked before training instead of after the
                model has already been trained and saved.
        """
        encoder_attr = {
            'company': 'company_feature_encoder',
            'location': 'location_feature_encoder',
            'goods': 'goods_feature_encoder',
        }
        if classType not in encoder_attr:
            raise Exception(
                'Allowed arguments are company, location and goods')

        # One sentence per training word of this class; the word is repeated
        # once for every time it is mapped to the class.
        train_sentences = []
        for word in self.train:
            sentence = [word] * self.word_mapping[word].count(classType)
            if sentence:
                train_sentences.append(sentence)

        feature_encoder = FastText(size=50,
                                   window=2,
                                   min_count=1,
                                   min_n=2,
                                   max_n=6)
        feature_encoder.build_vocab(sentences=train_sentences)
        feature_encoder.train(sentences=train_sentences,
                              total_examples=feature_encoder.corpus_count,
                              epochs=1000)
        feature_encoder.save('./models/' + classType + '_fasttext.model')
        setattr(self, encoder_attr[classType], feature_encoder)

    def train_fasttext_encoder(self):
        """Train and save the combined encoder on all training words."""
        # Every training word becomes its own one-word sentence.
        train_sentences = [[word] for word in self.train]

        self.feature_encoder = FastText(size=50,
                                        window=2,
                                        min_count=1,
                                        min_n=2,
                                        max_n=6)
        self.feature_encoder.build_vocab(sentences=train_sentences)
        self.feature_encoder.train(
            sentences=train_sentences,
            total_examples=self.feature_encoder.corpus_count,
            epochs=1000)
        self.feature_encoder.save('./models/fasttext.model')

    def validate_encoder(self):
        """Print two accuracy figures for the combined encoder.

        1. Closest class centre: a word is assigned the class whose label
           vector is nearest (Euclidean distance) to the word's vector.
        2. 15 nearest neighbours: a word is assigned the class with the most
           votes among the classes of its 15 most similar words.
        """
        test_words = self.validation
        vectors = self.feature_encoder.wv

        ## Finding the closest cluster center (Company, Location or Good)
        tp = 0
        for word in test_words:
            encoding = vectors[word]
            distances = [np.linalg.norm(encoding - vectors[order])
                         for order in self.ordering]
            idx = distances.index(min(distances))
            if self.ordering[idx] in self.word_mapping[word]:
                tp += 1

        print('Closest cluster center validation approach accuracy:',
              str(tp / len(test_words)))

        ## Doing the K-nearest analysis
        tp = 0
        order_idx = {order: idx for idx, order in enumerate(self.ordering)}

        for word in test_words:
            nearest_neighbours = vectors.most_similar(word, topn=15)
            # One vote counter per class rather than a hard-coded [0, 0, 0].
            votes = [0] * len(self.ordering)
            for neighbour, _similarity in nearest_neighbours:
                for mapping in self.word_mapping[neighbour]:
                    votes[order_idx[mapping]] += 1

            assigned_idx = votes.index(max(votes))
            if self.ordering[assigned_idx] in self.word_mapping[word]:
                tp += 1

        print('Nearest 15-Neighbour accuracy:', str(tp / len(test_words)))

    def one_vs_rest_generator(self, positive_index=None):
        """Build binary train/test arrays for one class against the rest.

        Args:
            positive_index: Index into ``ordering`` of the positive class.

        Returns:
            ``(X_train, y_train, X_test, y_test)`` as float64 arrays; labels
            are 1 for words mapped to the positive class and 0 otherwise.

        Raises:
            Exception: If the selected class is not company, location or
                goods.
        """
        assert positive_index is not None, "Requires index for the positive class(see ordering)"
        positive_class = self.ordering[positive_index]
        encoders = {
            'company': self.company_feature_encoder,
            'location': self.location_feature_encoder,
            # Was wired to the location encoder by mistake.
            'goods': self.goods_feature_encoder,
        }
        if positive_class not in encoders:
            raise Exception('Marked positive class not in the set {0,1,2}')
        feature_encoder = encoders[positive_class]

        X_train, y_train = self._encode_split(self.train, feature_encoder,
                                              positive_class)
        X_test, y_test = self._encode_split(self.validation, feature_encoder,
                                            positive_class)

        return np.asarray(X_train,dtype=np.float64),np.asarray(y_train,dtype=np.float64),\
               np.asarray(X_test,dtype=np.float64),np.asarray(y_test,dtype=np.float64)

    def _encode_split(self, words, feature_encoder, positive_class):
        """Encode ``words`` and label them; words raising KeyError are skipped."""
        features = []
        labels = []
        for word in words:
            try:
                vector = feature_encoder.wv[word]
                label = 1 if positive_class in self.word_mapping[word] else 0
            except KeyError:
                print(
                    'all ngrams for word %s absent from model. Skipping for %s'
                    % (word, positive_class))
                continue
            # Append both together so features and labels stay aligned.
            features.append(vector)
            labels.append(label)
        return features, labels

    def get_encoding(self, word):
        """Return the combined encoder's vector for ``word``."""
        return self.feature_encoder.wv[word]
def _print_similar_words(vectors, words):
    """Print the 10 most similar entries in ``vectors`` for each word."""
    for every in words:
        print("Most similar words for ", every)
        print(vectors.most_similar(every, topn=10))


def main():
    """Train (or just evaluate) word embeddings as selected by ``args``.

    * If ``args.emb_path`` is set, the vectors at that path are loaded, the
      similarity checks are printed and the program exits.
    * Otherwise, unless ``args.eval_mode`` is set, a word2vec or fastText
      model is trained on every file under ``args.input_dir`` and its
      vectors are written to ``<output_dir>/<nep2vec|nep2ft>/embeddings.vec``.
    * Finally the vectors are reloaded from that file and the similarity
      checks and the vocabulary size are printed.

    Raises:
        ValueError: In training mode, when ``args.embeddings`` is neither
            ``'word2vec'`` nor ``'fasttext'``.
    """
    input_dir = args.input_dir
    output_dir = args.output_dir
    embeddings = args.embeddings
    embed_type = args.embed_type
    similarity_check = args.similarity_check

    if args.emb_path:
        # Eval-only mode on an explicitly given embeddings file
        model_check = KeyedVectors.load_word2vec_format(args.emb_path, binary=False)

        print("Checking word similarity from: ", embeddings)
        _print_similar_words(model_check, similarity_check)

        print("Exiting program")
        sys.exit(0)

    if embeddings == 'word2vec':
        output_dir = os.path.join(output_dir, 'nep2vec')
    else:
        output_dir = os.path.join(output_dir, 'nep2ft')

    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, 'embeddings.vec')

    # Training mode
    if not args.eval_mode:
        # Fail early with a clear message; an unsupported value used to
        # surface as a NameError on `model` only after the corpus was read.
        if embeddings not in ('word2vec', 'fasttext'):
            raise ValueError(
                "Unsupported embeddings type: {0!r}".format(embeddings))

        print("Training {0} model".format(embeddings))
        sents = []
        for root, dirs, files in os.walk(input_dir):
            for f in files:
                input_file = os.path.join(root, f)
                print("Processing {0}".format(input_file))
                # `with` closes each file; the handles used to be leaked.
                with open(input_file, 'r', encoding='utf8') as i_f:
                    for line in i_f:
                        # Lines keep their newline, so a length check never
                        # filtered blank lines; test the token list instead.
                        tokens = line.split()
                        if tokens:
                            sents.append(tokens)

        if embeddings == 'word2vec':
            model = Word2Vec(sents, size=300, sg=embed_type, workers=10)
        else:
            model = FastText(size=300, window=5, min_count=1)
            model.build_vocab(sentences=sents)
            total_examples = model.corpus_count
            model.train(sentences=sents, total_examples=total_examples, epochs=5)

        model.wv.save_word2vec_format(output_file, binary=False)

    # Eval mode: reload the saved vectors
    model = KeyedVectors.load_word2vec_format(output_file, binary=False)

    print("Checking word similarity from: ", embeddings)
    _print_similar_words(model, similarity_check)

    # Print info
    # `model` is a KeyedVectors here, so read `vectors` directly instead of
    # going through the deprecated `.wv` alias.
    print("Length of vocabulary ",model.vectors.shape[0])
Ejemplo n.º 13
0
    cleanedUp = textacy.preprocess.preprocess_text(txt,
            lowercase=True, transliterate=True, no_punct=True, no_contractions=True )
    sentenceAsList = textacy.preprocess.normalize_whitespace(cleanedUp).split(' ')
    filteredSentence = [w for w in sentenceAsList if not w in stopWords]
    return filteredSentence
# Take the 'title' field of every record in `bag1`, run it through
# preProcessText and materialise the result with .compute().
# NOTE(review): `bag1` looks like a dask bag — confirm.
titles = bag1.pluck('title').map(preProcessText).compute()
# titles = list([t.split(' ') for t in titles])
client.close()
# Elapsed time since `startTime` (set outside this excerpt).
# NOTE(review): time.clock() was removed in Python 3.8; switch this and the
# matching `startTime` assignment to time.perf_counter() together.
print(time.clock() - startTime)

#%% Base model
# time.clock() was removed in Python 3.8; perf_counter() is the documented
# replacement for measuring elapsed time.
startTime = time.perf_counter()
# Drop the reference to any previous model before building the new one.
model = None
model = FastText(titles[:100000], min_count=1, workers=4, sg=0)
print(time.perf_counter() - startTime)
# Query through `.wv`; the model-level `most_similar` is deprecated.
model.wv.most_similar(positive=['cognitive'])

#%% experiments
startTime = time.perf_counter()
model2 = None
model2 = FastText(titles, min_count=10, workers=4, sg=1, window=10,size=100) #size=300 for transfer
# r1 = model.most_similar(positive=['cognitive'])
# r2 = model2.most_similar(positive=['cognitive'])
# print(tabulate([[r2[i][0], r1[i][0]] for i, x in enumerate(r1)],
#                headers=['Model2', 'Model1'],
#                tablefmt='orgtbl'))
print(time.perf_counter() - startTime)

#%% TSNE
words = ['cognitive', 'sensemaking', 'comprehension', 'reading',
         'articles', 'perception', 'notetaking', 'annotation', 'foraging',