class WordToVector: mode_dict = {0: "50d", 1: "100d", 2: "200d", 3: "300d"} def __init__(self, train_new_model=False, filepath=None, mode=0, tokenized_dataset=None, vector_size=100, train_epochs=30): self.word_weight_vec = None if not train_new_model: if filepath is not None: self.no_train = True with open( filepath + "glove.6B." + self.mode_dict[mode] + ".txt", "rb") as model_file: self.word_vec_dict = { word_vec_pair.split()[0]: np.array(map(float, word_vec_pair.split()[1:])) for word_vec_pair in model_file } else: self.no_train = False if tokenized_dataset is not None: self.to_train_model = FastText(size=vector_size, window=4, min_count=2) self.to_train_model.build_vocab(sentences=tokenized_dataset) self.to_train_model.train( sentences=tokenized_dataset, total_examples=len(tokenized_dataset), epochs=train_epochs) self.word_vec_dict = dict( zip(self.to_train_model.wv.index2word, self.to_train_model.wv.syn0)) else: print( "You have not given a tokenized_dataset. Please ensure that it is in the format of list of list \ of tokens and also that the parameter you have passed is not None" ) self.dim = len(next(iter(self.word_vec_dict))) def trainWithAdditionalData(self, tokenized_dataset_update): if not self.no_train: self.to_train_model.build_vocab(tokenized_dataset_update, update=True) self.to_train_model.train( tokenized_dataset_update, total_examples=len(tokenized_dataset_update), epochs=self.to_train_model.epochs) self.word_vec_dict = dict( zip(self.to_train_model.wv.index2word, self.to_train_model.wv.syn0)) self.dim = len(next(iter(self.word_vec_dict))) def convertSentenceToVector(self, sentence): if self.no_train: return np.array([ np.mean( [ self.word_vec_dict[ word] # * self.word_weight_vec[word] for word in sentence if word in self.word_vec_dict ] or [np.zeros(self.dim)], axis=0) ]) else: return np.array([ np.mean([ self.to_train_model.wv.get_vector(word) for word in sentence ], axis=0) ]) def getWordVector(self, word): if self.no_train: return self.word_vec_dict[word] else: return self.to_train_model.wv.get_vector(word)
class Model: def __init__(self, name=None, modelType=None): if name and modelType: self.name = name self.modelType = modelType self.model_path = self.getModelPath(name, modelType) def create(self, data_path, modelName='wordEmbedding', modelType='word2vec', model_path=None): ''' Uses Gensim to train a word embedding Model, either fasttext or word2vec are possible. file_path points to a csv file containing articles with the text of newspaper articles in a column called body ''' self.name = modelName self.modelType = modelType if model_path is None: model_path = self.getModelPath(self.name, self.modelType) self.model_path = model_path if self.modelType == 'word2vec': self.word_embedding = Word2Vec(min_count=8, window=5, workers=4, size=300, alpha=0.05, negative=10, sg=1) if self.modelType == 'fasttext': self.word_embedding = FastText(size=300) self.collectionInfo = CollectionInfo(data_path) collection = Collection(data_path) self.word_embedding.build_vocab(collection) self.word_embedding.train( collection, total_examples=self.word_embedding.corpus_count, epochs=self.word_embedding.iter) self.modelInfo = ModelInfo(self.modelType, self.word_embedding) def getModelPath(self, modelName, modelType): return './models/' + modelName + '_' + modelType def evaluate(self): ''' evaluates the semantic concepts a Word2Vec model has learned based on analogies, e.g. sister:brother :: daughter:son, in specific categories (e.g. currencies, verb forms, family, country capitals, etc.) ''' with open('newsAnalysis/questions-words.txt', 'r') as evaluationFile: self.accuracy = self.word_embedding.wv.accuracy(evaluationFile) correctAnalogies = [len(result['correct']) for result in self.accuracy] totalAnalogies = [ len(result['correct'] + result['incorrect']) for result in self.accuracy ] for ind in range(len(self.accuracy)): self.accuracy[ind]['nr_correct'] = correctAnalogies[ind] self.accuracy[ind]['nr_total'] = totalAnalogies[ind] def vectors2Bytes(self): vectors = self.word_embedding.wv.vectors vectors.tofile(self.model_path + '.bytes') def to_tsv(self): self.vectors2tsv() self.vocab2tsv() def vectors2tsv(self): with open(self.model_path + '.tsv', 'w') as f: writer = csv.writer(f, delimiter='\t', lineterminator='\n') writer.writerows(self.word_embedding.wv.vectors) f.close() def vocab2tsv(self): with open(self.model_path + '_metadata.tsv', 'w') as f: vocab = self.word_embedding.wv.vocab.keys() #vocabWithLineSeparator = [word + '\n' for ind,word in enumerate(vocab) if ind<len(vocab)-1] vocabWithLineSeparator = [word + '\n' for word in vocab] f.writelines(vocabWithLineSeparator) f.close() def exists(self, model_path=None): if hasattr(self, 'model_path'): return os.path.exists(self.model_path) elif model_path: return os.path.exists(model_path) else: return False def load(self, modelName=None, modelType=None, model_path=None): if hasattr(self, 'model_path'): model_path = self.model_path elif modelName and modelType: model_path = self.getModelPath(modelName, modelType) input_file = open(model_path + '.pkl', 'rb') self = pickle.load(input_file) self.word_embedding = KeyedVectors.load_word2vec_format( model_path) #, mmap='r') return self def __getstate__(self): return (self.modelType, self.name, self.collectionInfo, self.modelInfo, self.accuracy) def __setstate__(self, state): self.modelType, self.name, self.collectionInfo, self.modelInfo, self.accuracy = state def save(self): output = open(self.model_path + '.pkl', 'wb') pickle.dump(self, output) self.word_embedding.wv.save_word2vec_format(self.model_path) def 
hasWord(self, word): if self.word_embedding.wv.vocab.get(word) == None: return False else: return True def filterNonVocabWords(self, word_list): valid_words = [] oov = [] for word in word_list: if self.hasWord(word): valid_words.append(word) else: oov.append(word) return valid_words, oov def getWordCount(self, word): if self.hasWord(word): return self.word_embedding.wv.vocab.get(word).count else: raise KeyError('ERROR: WORD not in Model') def wordListSimilarity(self, w, listOfWords): ''' return the mean cosine similarity of a word and all words in a list ''' similarities = [ self.word_embedding.wv.similarity(w, word) for word in listOfWords ] return np.mean(similarities) def mapWordOnAxis(self, word, attributes1, attributes2): ''' substract the mean cos distance of a word with all attributes in attributes1 with the mean cosine distance of word with all attributes in attributes2: s(w, A1, A2) = mean[for a1 in A1: cos(w, a1)] - mean[for a2 in A2: cos(w, a2)] ''' return self.wordListSimilarity( word, attributes1) - self.wordListSimilarity(word, attributes2) def keywordMapping(self, listOfWords, attributes1, attributes2): wordAttributeSimTarget1 = [ self.wordListSimilarity(word, attributes1) for word in listOfWords ] wordAttributeSimTarget2 = [ self.wordListSimilarity(word, attributes2) for word in listOfWords ] return np.array(wordAttributeSimTarget1) - np.array( wordAttributeSimTarget2) def plotKeywordMapping(self, values, labels, title='test'): plotter = ImagePlotter(True) plotter.horizontalBarPlot(values, labels, title='Word-Axis Mapping', x_label='attribute association', path=title + '.png') def WEAT(self, targets1, targets2, attributes1, attributes2): wordAttributeSimTarget1 = [ self.mapWordOnAxis(target, attributes1, attributes2) for target in targets1 ] wordAttributeSimTarget2 = [ self.mapWordOnAxis(target, attributes1, attributes2) for target in targets2 ] return np.sum(wordAttributeSimTarget1) - np.sum( wordAttributeSimTarget2) def generate_analogies(self, w1, w2, restrict_vocab=3500): biasObject = BiasWordEmbedding(self.word_embedding) biasObject._identify_direction(w1, w2, [w1, w2], method='single') return biasObject.generate_analogies(restrict_vocab=restrict_vocab, unrestricted=True, n_analogies=10) def visualise(self): self.vocab2tsv() self.vectors2Bytes() projector = Projector() modelName = '_'.join([self.name, self.modelType]) shutil.copy(self.model_path + '.bytes', projector.data_path + '/' + modelName + '.bytes') shutil.copy(self.model_path + '_metadata.tsv', projector.data_path + '/' + modelName + '_metadata.tsv') path = os.path.join(projector.data_path.split('/')[-1], modelName) projector.addModelToConfig(self.name, path + '.bytes', path + '_metadata.tsv', len(self.word_embedding.wv.vocab), self.word_embedding.vector_size) projector.writeConfigFile() projector.run()
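# A standalone sketch of the word-axis mapping that mapWordOnAxis() above implements:
# s(w, A1, A2) = mean_{a1 in A1} cos(w, a1) - mean_{a2 in A2} cos(w, a2).
# It only assumes a trained gensim KeyedVectors object (made self-contained here with a
# tiny Word2Vec model); the toy corpus and attribute words are illustrative placeholders,
# not the author's evaluation sets.
import numpy as np
from gensim.models import Word2Vec

toy_corpus = [["nurse", "she", "her"], ["engineer", "he", "him"],
              ["nurse", "hospital"], ["engineer", "machine"]] * 50
kv = Word2Vec(toy_corpus, size=20, min_count=1, seed=1).wv


def map_word_on_axis(kv, word, attributes1, attributes2):
    sim1 = np.mean([kv.similarity(word, a) for a in attributes1])
    sim2 = np.mean([kv.similarity(word, a) for a in attributes2])
    return sim1 - sim2  # >0: closer to attributes1, <0: closer to attributes2


print(map_word_on_axis(kv, "nurse", ["she", "her"], ["he", "him"]))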
def Updates():
    try:
        print("updating Doc2Vec")
        a = stem.snowball.ArabicStemmer()
        stopwords_list = stopwords.words('arabic')
        df = pd.read_csv('textc-Copy1.csv', encoding='utf-8')
        df["contenu"].fillna("محتوى فارغ", inplace=True)  # "empty content"
        df["article"].fillna("محتوى فارغ", inplace=True)
        y = df['ToF']
        df = df.drop('ToF', axis=1)
        text = []
        for i in range(df.shape[0]):
            x = nltk.tokenize.wordpunct_tokenize(df.contenu[i])
            text1 = [a.stem(word) for word in x]
            text.append(text1)
        titre = [
            a.stem(word) for word in df.article if word not in stopwords_list
        ]
        # doc2vec
        docs = []
        analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
        for i, te in enumerate(text):
            tags = [i]
            docs.append(analyzedDocument(te, tags))
        model = doc2vec.Doc2Vec(docs,
                                vector_size=300,
                                window=8,
                                min_count=1,
                                workers=4,
                                dm=1)
        from gensim.test.utils import get_tmpfile
        fname = get_tmpfile("doc2vec.model")
        model.save(fname)
        model = doc2vec.Doc2Vec.load(fname)

        print("updating fasttext")

        class MyItera(object):
            def __iter__(self):
                for line in Corpus.article:
                    filtered_sentence = []
                    for w in tokenize(line):
                        if w not in stop_words:
                            filtered_sentence.append(w)
                    yield filtered_sentence

        class MyIter(object):
            def __iter__(self):
                for line in Corpus.contenu:
                    filtered_sentence = []
                    for w in tokenize(line):
                        if w not in stop_words:
                            filtered_sentence.append(w)
                    yield filtered_sentence

        model = FastText(size=150, window=3, min_count=1)
        model.build_vocab(sentences=MyIter())
        total_examples = model.corpus_count
        model.train(sentences=MyIter(),
                    total_examples=total_examples,
                    epochs=5)
    except Exception:
        Updates()
def word2vec(inputFile, outputFile, size=60, window=5, min_count=5, epoch=5,
             down_sampling=1e-4):
    '''
    This function triggers the FastText model
    Parameters:
        1. inputFile     => corpus data as input file
        2. outputFile    => output file
        3. size          => embedding dimension size
        4. window        => window size for context (skip-gram)
        5. min_count     => minimum word count to be considered
        6. epoch         => number of times the algorithm will run
        7. down_sampling => value for down-sampling frequent words
    '''
    print('inputFile:' + inputFile)
    print('outputFile:' + outputFile)

    corpus = []
    with open(inputFile, 'r') as fin:
        for blog in fin:
            corpus.append(blog.strip('\n'))
    word_tokenized_corpus = [review.split() for review in corpus]

    try:
        # sg and sample are model hyperparameters, so they are set here rather
        # than being passed to train()
        model = FastText(size=size, window=window, min_count=min_count,
                         seed=0, workers=1, sg=1,
                         sample=down_sampling)  # instantiate the fasttext model
        model.build_vocab(sentences=word_tokenized_corpus)  # build the vocabulary
        model.train(sentences=word_tokenized_corpus,
                    total_examples=len(word_tokenized_corpus),
                    epochs=epoch)

        word_vectors = []
        for w in model.wv.vocab:
            try:
                word_vectors.append(model.wv[w])
            except Exception as err:
                print(str(err) + ": " + w)
                continue

        no_of_words = len(model.wv.vocab)
        dimension = size
        # write the vectors in word2vec text format: a "N dim" header line,
        # then one word and its features per line
        with open(outputFile, 'w') as fout:
            fout.write(str(no_of_words) + ' ' + str(dimension) + '\n')
            for i, w in enumerate(model.wv.vocab):
                fout.write(w)
                for feature in word_vectors[i]:
                    fout.write(' ' + str(feature))
                fout.write('\n')
    except Exception as err:
        print(err)
        return None
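# A minimal usage sketch for the word2vec() helper above. The file names are
# placeholders; corpus.txt is assumed to hold one whitespace-tokenized blog per
# line, as the function expects. Because the output follows the word2vec text
# format (header line "N dim"), gensim's KeyedVectors can read it back.
from gensim.models import KeyedVectors

word2vec('corpus.txt', 'vectors.txt', size=60, window=5, min_count=5, epoch=5)
vectors = KeyedVectors.load_word2vec_format('vectors.txt', binary=False)
print(vectors.most_similar(vectors.index2word[0], topn=5))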
    return [sentence.split(" ") for sentence in sentences][:-1]


lumea_tokens = load_tokens("tokens_lumea.txt")
lumea_token_count = sum([1 for sentence in lumea_tokens for token in sentence])
blog_tokens = load_tokens("tokens_blog.txt")
blog_token_count = sum([1 for sentence in blog_tokens for token in sentence])

print("Start training FT model with blogs.")
model = FastText(blog_tokens, size=300, window=5, min_count=1, workers=4)
model.wv.save("model_ft_blog")

print("Start updating FT model with Lumea corpus.")
model.build_vocab(lumea_tokens, update=True)
model.train(lumea_tokens, total_examples=model.corpus_count, epochs=model.epochs)
model.wv.save("model_ft_expanded")

print("Start training FT model with Lumea corpus only.")
model = FastText(lumea_tokens, size=300, window=5, min_count=1, workers=4)
model.wv.save("model_ft_lumea")

print("Start training W2V model with blogs.")
model = Word2Vec(blog_tokens, size=300, window=5, min_count=1, workers=4)
model.wv.save("model_w2v_blog")

print("Start updating W2V model with Lumea corpus.")
model.build_vocab(lumea_tokens, update=True)
model.train(lumea_tokens, total_examples=model.corpus_count,
    if count != 0:
        vec /= count
    return vec


tokenizer = MosesTokenizer()
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
corpus = [
    tokenizer.tokenize(text) for df in (train, test) for text in df['text']
]

ft_model = FastText(size=embedding_size, window=5, min_count=2, seed=57)
ft_model.build_vocab(sentences=corpus)
ft_model.train(sentences=corpus, total_examples=len(corpus), epochs=10)

labels = list({label for df in (train, test) for label in df['artist']})
X_train = np.array([
    get_sentence_vector(text, embedding_size, tokenizer, ft_model)
    for text in train['text']
])
y_train = np.array([labels.index(label) for label in train['artist']])
X_test = np.array([
    get_sentence_vector(text, embedding_size, tokenizer, ft_model)
    for text in test['text']
])
y_test = np.array([labels.index(label) for label in test['artist']])
X_train, X_valid, y_train, y_valid = train_test_split(X_train,
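# The head of get_sentence_vector() is cut off above; only its final lines
# ("vec /= count; return vec") survive. Below is a minimal sketch of what such a
# mean-pooling helper typically looks like, assuming the same call signature used
# above (text, embedding_size, tokenizer, ft_model). It is an illustrative
# reconstruction, not the author's original function.
def get_sentence_vector_sketch(text, embedding_size, tokenizer, ft_model):
    vec = np.zeros(embedding_size)
    count = 0
    for token in tokenizer.tokenize(text):
        if token in ft_model.wv:  # FastText can also back off to subword n-grams
            vec += ft_model.wv[token]
            count += 1
    if count != 0:
        vec /= count
    return vec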
class WordEmbedding(): def __init__(self, embedding_type="w2v", embedding_size=100, ngram=(3, 6), window_size=5, architecture="sg"): self.embedding_type = embedding_type self.window = window_size self.size = embedding_size self.model = None if architecture == "sg": self.skip_gram = True else: self.skip_gram = False if ngram is None: ngram = (3, 6) self.min_gram = ngram[0] self.max_gram = ngram[1] def train_embedding(self, sentences, n_iter=100, workers=1, min_count=3, negative_sample=1): if self.embedding_type == "w2v": train_corpus = sentences if self.model is None: self.model = Word2Vec(size=self.size, window=self.window, min_count=min_count, negative=negative_sample, workers=workers, sg=int(self.skip_gram)) self.model.build_vocab(train_corpus) # self.model.build_vocab() else: self.model.build_vocab(train_corpus, update=True) elif self.embedding_type == "ft": train_corpus = sentences if self.model is None: self.model = FastText(sg=int(self.skip_gram), size=self.size, window=self.window, min_count=min_count, min_n=self.min_gram, max_n=self.max_gram, workers=workers, negative=negative_sample) self.model.build_vocab(train_corpus) else: self.model.build_vocab(train_corpus, update=True) elif self.embedding_type == "glove": raise ValueError("GloVe training not supported use official repo") else: raise ValueError("Invalid Embedding Type") train_corpus = sentences self.model.train(train_corpus, epochs=n_iter, total_examples=self.model.corpus_count) def retrieve_vector(self, word): try: return self.model.wv[word] except KeyError: return np.random.random(self.size) def find_similar_word(self, word, n=10): try: return self.model.most_similar(positive=[word], topn=n) except KeyError: return [] def save_model(self, file_name): self.model.save("{}.model".format(file_name)) we_model_files = glob("{}.model*".format(file_name)) with ZipFile(file_name, "w") as zipf: for we_file in we_model_files: zipf.write(we_file) os.remove(we_file) def load_model(self, file_name): try: with ZipFile(file_name, "r") as zipf: zipf.extractall("/tmp/") nl = zipf.namelist() fn = [name for name in nl if name.endswith(".model")][0] path = "/tmp/" + fn except BadZipFile: path = file_name if self.embedding_type == "w2v": self.model = KeyedVectors.load_word2vec_format(path) elif self.embedding_type == "ft": self.model = FastText.load_fasttext_format(path) elif self.embedding_type == "glove": """path name: .txt file""" try: glove_file = datapath(os.path.abspath(path)) tmp_file = get_tmpfile("/tmp/g2w2v.txt") glove2word2vec(glove_file, tmp_file) self.model = KeyedVectors.load_word2vec_format(tmp_file) except UnicodeDecodeError: self.model = KeyedVectors.load(os.path.abspath(path)) self.size = self.model.wv.vector_size def remove_from_vocab(self, word_list): new_vectors = [] new_vocab = {} new_index2entity = [] new_vectors_norm = [] if self.embedding_type == "ft": self.model.wv.init_sims() for i in range(len(self.model.wv.vocab)): word = self.model.wv.index2entity[i] vec = self.model.wv.vectors[i] vocab = self.model.wv.vocab[word] vec_norm = self.model.wv.vectors_norm[i] if word not in word_list: vocab.index = len(new_index2entity) new_index2entity.append(word) new_vocab[word] = vocab new_vectors.append(vec) new_vectors_norm.append(vec_norm) self.model.wv.vocab = new_vocab self.model.wv.vectors = np.array(new_vectors) self.model.wv.index2entity = new_index2entity self.model.wv.index2word = new_index2entity self.model.wv.vectors_norm = new_vectors_norm else: self.model.init_sims() for i in range(len(self.model.vocab)): word = 
self.model.index2entity[i] vec = self.model.vectors[i] vocab = self.model.vocab[word] vec_norm = self.model.vectors_norm[i] if word not in word_list: vocab.index = len(new_index2entity) new_index2entity.append(word) new_vocab[word] = vocab new_vectors.append(vec) new_vectors_norm.append(vec_norm) self.model.vocab = new_vocab self.model.vectors = np.array(new_vectors) self.model.index2entity = new_index2entity self.model.index2word = new_index2entity self.model.vectors_norm = new_vectors_norm
# ---------------------------------- Word2Vec -----------------------------------------------
model_w2v = Word2Vec(reviews, size=150, window=10, min_count=2, workers=10,
                     sg=0)  # sg=0 -> CBOW
model_w2v.train(reviews, total_examples=len(reviews), epochs=12)

# ---------------------------------- FastText -----------------------------------------------
model_fasttext = FastText(size=170, window=10, min_count=2, workers=10, sg=0)  # instantiate
model_fasttext.build_vocab(sentences=reviews)
model_fasttext.train(sentences=reviews, total_examples=len(reviews), epochs=12)  # train

word_vectors = model_fasttext.wv
word_vectors_w2v = model_w2v.wv

outF = open("Dataset/txt files/myOutFile1.txt", "w+")
for i in range(len(attributes)):
    # Each attribute is sent to the word2vectfonc function to use w2v and fasttext.
    word2vectfonc(attMatrix[i][0], i)
outF.close()
def main(): # global encode_length, vector_size ## 1. intent 데이터셋 불러오기 config = Configs() okt = Okt() question = preprocess_data(True) joinStr = ' '.join(question) morphs = okt.morphs(joinStr) joinString = ' '.join(morphs) pos1 = okt.pos(joinString) pos2 = ' '.join(list(map(lambda x: '\n' if x[1] in ['Punctuation'] else x[0], pos1))).split('\n') morphs = list(map(lambda x: okt.morphs(x), pos2)) ## 2. 워드 임베딩 print("\n### Fasttext bulid model ###", end="\n") word2vec_model = FastText(size=config.vector_size, window=3, workers=8, min_count= 1) # word2vec_model = FastText(size=config.vector_size, window=2, workers=8, min_count= 1) word2vec_model.build_vocab(morphs) print('\n### Fasttext build complete ###', end="\n") print('\n### Fasttext trian start ###', end="\n") word2vec_model.train(morphs, total_examples= word2vec_model.corpus_count, epochs= word2vec_model.epochs, compute_loss=True, verbose=1) print('\n### Fasttext train complete ###', end="\n") word2vec_model.save(config.fasttext_model_path+"intent_fasttextmodel") print('\n### Fasttext model save ###', end="\n") w2c_index = word2vec_model.wv.index2word # fasttext가 적용된 단어 목록들 print("[DEBUG1-1]############ FastText representation ############", end="\n\n") print(w2c_index, end="\n\n\n") print('\n\n[DEBUG1-1]word_index 단어 개수 >> ', len(w2c_index)) # <class 'list'> ### intentIndex 저장 with open(config.fasttext_model_path+'/intentIndex.pickle', 'wb') as f: pickle.dump(w2c_index, f, pickle.HIGHEST_PROTOCOL) print("_________________________________________________________________________________________________________________\n") # # y_data 생성 y_data = config.df['intent'] y_data = y_data.map(config.intent_mapping) y_data = to_categorical(y_data) # x_data 생성 # encode_length = 15 x_data = [] for q_raw in question: q_raw = okt.morphs(q_raw) # 문장 형태소별로 분리(단어 분리). str > list q_raw = list(map(lambda x: q_raw[x] if x < len(q_raw) else '#', range(config.encode_length))) q_raw = list(map(lambda x: word2vec_model[x] if x in w2c_index else np.zeros(config.vector_size, dtype=float), q_raw)) q_raw = np.array(q_raw) x_data.append(q_raw) x_data = np.array(x_data) # (None, 15, 300) x_data = x_data.reshape(len(config.df), config.encode_length, config.vector_size, 1) print(x_data.shape) ## vector numpy array save # np.save("fasttext_vector.npy", x_data) print("_________________________________________________________________________________________________________________\n") ## 3. 
모델 생성 및 훈련 print("shape >>", x_data.shape, y_data.shape) # (None, 15 ,300, 1) / (None, 5) model = Sequential() model.add(Conv2D(12, kernel_size=(2,2), input_shape=(config.encode_length, config.vector_size, 1), strides=(1,1), padding="valid", activation="relu")) model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1))) model.add(Conv2D(12, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")) model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1))) model.add(Conv2D(12, kernel_size=(4,4), strides=(1,1), padding="valid", activation="relu")) model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1))) model.add(Conv2D(12, kernel_size=(2,2), strides=(1,1), padding="valid", activation="relu")) model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1))) model.add(Conv2D(12, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu")) model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1))) model.add(Conv2D(12, kernel_size=(4,4), strides=(1,1), padding="valid", activation="relu")) model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1))) model.add(Conv2D(12, kernel_size=(2,2), strides=(1,1), padding="valid", activation="relu")) model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1))) model.add(Conv2D(12, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu", data_format='channels_first')) model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1))) model.add(Conv2D(12, kernel_size=(4,4), strides=(1,1), padding="valid", activation="relu")) model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1))) model.add(Conv2D(12, kernel_size=(2,2), strides=(1,1), padding="valid", activation="relu")) model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1))) model.add(Conv2D(12, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu", data_format='channels_first')) model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1))) model.add(Conv2D(12, kernel_size=(4,4), strides=(1,1), padding="valid", activation="relu")) model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1))) model.add(Flatten()) model.add(BatchNormalization()) # model.add(Dropout(1.0)) model.add(Dense(128, activation="relu")) # model.add(Dropout(0.1)) model.add(Dense(5, activation='softmax')) model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) # stop = EarlyStopping(monitor="loss", patience=20, mode="auto") model.summary() model.fit(x_data, y_data, batch_size=64, epochs=500) # model.fit(x_data, y_data, batch_size=64, epochs=500, callbacks=[stop]) print("_________________________________________________________________________________________________________________") loss, acc = model.evaluate(x_data, y_data) print("loss >> ", loss) print("acc >>", acc, end="\n") ## 4. 모델 저장 path = config.intent_model_path file_list = os.listdir(path) new_num = 0 if os.path.exists(path): # 파일 있을경우 for i in file_list: num = int(i.split(".")[0].split("-")[-1]) if new_num <= num: new_num = num + 100 else: pass model_name = "intent_model-"+str(new_num)+".h5" weights_name = "intent_weights-"+str(new_num)+".h5" print("\n\nFile name >>",model_name) model.save(path+model_name) model.save_weights(path+weights_name) else: model.save(path+"intent_model-100.h5") model.save_weights(path+"intent_weights-100.h5") print("\n#### MODEL SAVE ####", end='\n')
    pbar.update(1)

del review_unclean

# FastText Vector
vector_size = 256
window = 5
fasttext_model = 'fasttext.model'

print('Generating FastText Vectors ..')
start = time.time()
# window, min_count and workers are model hyperparameters, so they belong in
# the constructor rather than in train()
model = FastText(size=vector_size, window=window, min_count=1, workers=4)
model.build_vocab(review)
model.train(review, total_examples=model.corpus_count, epochs=model.epochs)
print('FastText Created in {} seconds.'.format(time.time() - start))

model.save(fasttext_model)
print('FastText Model saved at {}'.format(fasttext_model))
del model

model = FastText.load(fasttext_model)
x_vectors = model.wv
del model

# Dataset Partition
# Splitting the review1 and labels in (x_train, y_train) and (x_test, y_test)
# with 90% for training and 10% for testing from all the tweets.
# Maximum number of tokens allowed for each review is set to be 15.
def train(self, epochs=30): """ Train with own Data(s) Support single or multiple corpus or dataframe. Parameters: ----------- model_name(optional): preferred model name epochs : int : total epochs for training Example -------- >>> from ekushey.feature_extraction import BN_FastText #Training Against Sentences >>> ft = BN_FastText(sentences=[['আমার', 'প্রিয়', 'জন্মভূমি'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'] ]) >>> ft.train() #Training Against one Text Corpus >>> ft = BN_FastText(corpus_file="path_to_corpus.txt") >>> ft.train() #Training Against Multiple Corpuses path ->corpus ->1.txt ->2.txt ->3.txt >>> ft = BN_FastText(corpus_path="path/corpus") >>> ft.train(epochs=25) #Training Against a Dataframe Column >>> ft = BN_FastText(df= news_data['text_content']) >>> ft.train(epochs=25) """ if not(self.sentences) and not(self.corpus_file) and not(self.corpus_path) and self.df is None: raise Exception('Data is not given') elif self.sentences: data = self.sentences #print("got sentence") elif self.corpus_file: #print("got sentence") data = PathLineSentences(self.corpus_file) elif self.corpus_path: #print("got sentence") data = PathLineSentences(self.corpus_path) elif self.df is not None: #print("Dataframe got") data = '\n'.join(self.df) data = data.split('\n') data = [sent.split() for sent in data] else: print("Unexpected error occured: Please check your data file again.") cpu_cores = multiprocessing.cpu_count() ft_model = FastText( size=self.size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, negative=self.negative ) print("Working with "+str(self.workers)+" worker threads") ft_model.build_vocab(data, progress_per=10000) print("Vocabulary build Successfully") t=time() ft_model.train(data, total_examples=ft_model.corpus_count, epochs=epochs, report_delay=1) print('Training took : {} mins'.format(round((time() - t) / 60, 2))) ft_model.save(self.model_name) print(ft_model)
        if i % 10000 == 0:
            print(str(i) + " samples")
        yield words


for ep in range(epoch):
    step = 0
    for i in range(0, data_size, batch_size):
        step += 1
        if i == 0 and ep == 0:
            ## Initialize and train a FastText model ###
            fast_model = FastText(size=feature_size,
                                  window=window_context,
                                  min_count=min_word_count,
                                  workers=multiprocessing.cpu_count())
            tokenized_corpus = list(
                norm_doc_tokenizer("../data/full_dataset.txt", i, batch_size))
            fast_model.build_vocab(tokenized_corpus)
            fast_model.train(tokenized_corpus,
                             total_examples=batch_size,
                             epochs=fast_model.epochs)
        else:
            tokenized_corpus = list(
                norm_doc_tokenizer("../data/full_dataset.txt", i, batch_size))
            fast_model.build_vocab(tokenized_corpus, update=True)
            fast_model.train(tokenized_corpus,
                             total_examples=batch_size,
                             epochs=fast_model.epochs)
        print("Epoch", str(ep + 1), ",Step", str(step))

fast_model.save("./output/fasttext")
class TweetModelRunner: def __init__(self, startdate=None, enddate=None, tweettype=None, search_terms=None, remove_search_terms=True, size=None, aws_credentials=None): self.creds = aws_credentials self.e = ESSearch(aws_credentials) self.startdate = startdate self.enddate = enddate self.tweettype = tweettype self.processed_count = 0 self.total_count = 0 self.fasttextModel = None self.d2vmodel = None self.search_terms = search_terms self.size = size self.stopwords = set(stopwords.words('english')) additional_stops = [ 'rt', 'de', 'que', 'en', 'la', 'por', 'un', 'se', 'el', '...', 'amp', "coronavirus", "covid", "19", '&' ] for stop in additional_stops: self.stopwords.add(stop) if remove_search_terms is True and search_terms is not None: for term in self.search_terms.lower().translate( str.maketrans('', '', string.punctuation)).split(): self.stopwords.add(term) p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.MENTION) def _removeNonAscii(self, s): return "".join(i for i in s if ord(i) < 128) def _remove_stops(self, word_text): filtered_text = [w for w in word_text if not w in self.stopwords] return filtered_text def _clean_text(self, the_tweet_text): cleaned_text = p.clean(the_tweet_text).lower().replace("’", "'") words = cleaned_text.split() reformed = [ CONTRACTIONS[word] if word in CONTRACTIONS else word for word in words ] cleaned_text = " ".join(reformed) cleaned_text = cleaned_text.translate( str.maketrans('', '', string.punctuation)) cleaned_text = self._removeNonAscii(cleaned_text) tokenized_text = list(tokenize(cleaned_text)) tokenized_text = self._remove_stops(tokenized_text) return tokenized_text def tweetsIter(self, d2v=False): for tw in self.e.query(self.search_terms, tweettype=self.tweettype, startDateString=self.startdate, endDateString=self.enddate, size=self.size): tokenized_text = self._clean_text(tw["_source"]["text"]) if len(tokenized_text) > 3: if d2v: yield TaggedDocument(tokenized_text, [ str(tw["_source"]["tweet_id"]) + "&" + tw["_source"]["date"] + "&" + str(tw["_source"]["retweets"]) ]) else: yield tokenized_text else: continue def fastText(self): self.total_count = self.e.count(self.search_terms, tweettype=self.tweettype, startDateString=self.startdate, endDateString=self.enddate) print("TOTAL TWEETS MATCHING:" + str(self.total_count)) self.fasttextModel = FastText(size=4, window=3, min_count=1) self.fasttextModel.build_vocab(sentences=self.tweetsIter( search_terms=search_terms)) total_examples = self.fasttextModel.corpus_count self.fasttextModel.train( sentences=self.tweetsIter(search_terms=search_terms), total_examples=total_examples, epochs=5) sstring = self.search_terms + self.startdate.replace('/', '-') fname = open('twitter_models/' + sstring + "fasttext.model", "wb") self.fasttextModel.save(fname) def loadFTModel(self, fileName): self.fasttextModel = FastText.load(fileName) def most_sims_FT(self, word): print(self.fasttextModel.wv.similar_by_word(word)) def doc2vec(self, search_terms=None, save_model=True): from gensim.test.utils import common_texts print(self.tweettype) self.total_count = self.e.count(self.search_terms, tweettype=self.tweettype, startDateString=self.startdate, endDateString=self.enddate) print("TOTAL TWEETS MATCHING:" + str(self.total_count)) self.d2vmodel = Doc2Vec(vector_size=100, window=10, min_count=1, workers=4, epochs=20) self.d2vmodel.build_vocab(self.tweetsIter(d2v=True)) self.d2vmodel.train(self.tweetsIter(d2v=True), total_examples=self.d2vmodel.corpus_count, epochs=self.d2vmodel.epochs) sstring = self.search_terms + 
self.startdate.replace( '/', '-') if self.startdate is not None else self.search_terms sstring = sstring.replace('"', '*') fname = open( 'twitter_network/twitter_created_models/' + sstring + "d2v.model", "wb") print(self.d2vmodel) if save_model: self.d2vmodel.save(fname) def loadd2vModel(self): lstring = self.search_terms + self.startdate.replace( '/', '-') if self.startdate is not None else self.search_terms lstring = sstring = sstring.replace('"', '*') fstring = 'twitter_network/twitter_created_models/' + lstring + "d2v.model" self.d2vmodel = Doc2Vec.load(fstring) print(self.d2vmodel.corpus_count) def jsonclusterd2vModel(self, wfile=None, write_s3=True, write_local=False): from sklearn.cluster import AffinityPropagation from sklearn.cluster import KMeans from sklearn.cluster import MiniBatchKMeans from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA import numpy import json import umap from collections import Counter import random if self.d2vmodel is None: raise ValueError("Please Initialize d2vmodel!") num_clusters = 1 kmeans_model = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=250) self.d2vmodel.init_sims(replace=True) X = kmeans_model.fit(self.d2vmodel.docvecs.doctag_syn0) labels = kmeans_model.labels_.tolist() l = kmeans_model.fit_predict(self.d2vmodel.docvecs.doctag_syn0) pca = PCA(n_components=2).fit(self.d2vmodel.docvecs.doctag_syn0) datapoint = pca.transform(self.d2vmodel.docvecs.doctag_syn0) if wfile: json_d = { "data": [], "centroids": [], "timeline": [], "search_terms": self.search_terms } centroid_labels = [] centroids = kmeans_model.cluster_centers_ for x in range(datapoint.shape[0]): json_d["data"].append({ "c": labels[x], "id": self.d2vmodel.docvecs.index_to_doctag(x).split('&')[0], "l": datapoint[x].tolist(), "d": self.d2vmodel.docvecs.index_to_doctag(x).split('&')[1], "p": self.d2vmodel.docvecs.index_to_doctag(x).split('&')[2], }) for cluster in range(num_clusters): thing = list( filter(lambda x: x["c"] == cluster, json_d["data"])) centroide = (sum(map(lambda x: x["l"][0], thing)) / len(thing), sum(map(lambda x: x["l"][1], thing)) / len(thing)) wcounter = Counter() sample_n = 100 if len(thing) > 100 else len(thing) for choice in range(sample_n): if sample_n < 100: tweetobj = thing[choice] else: tweetobj = (random.choice(thing)) cleaned_text = self._clean_text((self.e.get_doc( tweetobj["id"].split('&')[0])["_source"]["text"])) wcounter.update(cleaned_text) centroid_labels.append(",".join( map(lambda x: x[0], wcounter.most_common(5)))) json_d["centroids"].append( [centroide, centroid_labels[cluster]]) print(centroid_labels) if write_local: pre = "twitter_network/static/twitter_network/data/" json_f = open(pre + wfile, "w") json.dump(json_d, json_f) if write_s3: S3_BUCKET = "socialmedia-models" s3 = S3Client(self.creds, S3_BUCKET) s3.upload_str(json.dumps(json_d), wfile)
from gensim.models import FastText

from configuration import ROOT_PATH, WORD_DiMENSION
from word_embeddings.skipgram_model import read_sentences

if __name__ == '__main__':
    sentences = read_sentences(
        ROOT_PATH + '/data/cornell_movie_dialogs_corpus/movie_lines.txt')
    sg_model = FastText(size=WORD_DiMENSION, window=5, min_count=10, workers=4, sg=1)
    sg_model.build_vocab(sentences)
    sg_model.train(sentences, total_examples=sg_model.corpus_count, epochs=1)
    sg_model.save(ROOT_PATH + '/models/embeddings/gensim_fasttext.models')
# Tokenizes and reads a corpus formatted as a CSV line by line.
class csvIterator(object):
    def __iter__(self):
        path = ('processed messages.csv')
        with open(path) as fin:
            for line in fin:
                yield list(tokenize(line))


# Basic hyperparameters for training an embedding.
model = FT(size=350, window=5, min_count=5)
# Builds a list of all words encountered while reading the corpus.
model.build_vocab(sentences=csvIterator())
# corpus_count is the number of sentences seen while building the vocabulary;
# train() needs it to schedule the learning-rate decay.
total_examples = model.corpus_count
# Trains the model. "Epochs" is the ML term for the number of passes the model
# makes over the training data.
model.train(sentences=csvIterator(), total_examples=total_examples, epochs=5)
# Normalizes the vector lengths. Helpful for similarity comparisons later on.
model.init_sims(replace=True)
print('Time elapsed during training: {:.2f} minutes'.format(
    (time.time() - training_time) / 60))
model.save('Embedding.model')
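# A minimal follow-up sketch: load the model saved above and query it. The word
# "payment" is only a placeholder query term; any token from the corpus (or, thanks
# to FastText's subword vectors, even a misspelling of one) would work.
from gensim.models import FastText

loaded = FastText.load('Embedding.model')
print(loaded.wv.most_similar('payment', topn=5))   # nearest neighbours
print(loaded.wv.similarity('payment', 'invoice'))  # cosine similarity of a pair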
# import libraries
from nltk.tokenize import word_tokenize
from gensim.models import FastText
import pandas as pd

# load data
file = pd.read_csv("tmc_data.csv")
sentences = file["Text"]
corpus = []

# make corpus
for sent in sentences:
    corpus.append(word_tokenize(sent))

model = FastText(corpus, vector_size=100, workers=4, sg=1, window=3)
model.train(corpus, total_examples=len(corpus), epochs=10)
model.save("tmc2007_fasttext")  # binary file

# get info
print("Embedding size : ", 100)
# print("vocab size : ", len(model.wv))

# test
print(model.wv.most_similar("airpor", topn=5))
print(model.wv.most_similar("airpo", topn=5))
print(model.wv.most_similar("airtraffic", topn=5))
print(model.wv.most_similar("craft", topn=5))
print(model.wv.most_similar("acce", topn=5))
class Embed_Vocab(object): def __init__(self, corpus='Avocado', corpus_size=-1, embed_type='word2vec', embed_dim=50, window_size=5, max_iter=10, path_to_corpus='', save_flag=True): self.corpus = corpus self.embed_type = embed_type self.embed_dim = embed_dim self.window_size = window_size self.max_iter = max_iter self.path_to_corpus = path_to_corpus self.save_flag = save_flag self.model = None def train(self): self.gen = Tokenize_Sent(self.path_to_corpus, corpus_size) #Iterator to read files. if self.embed_type == 'fasttext': print('Training fasttext model ...') # self.model = FastText(sentences = self.gen, size = self.embed_dim, iter =self.max_iter, window = self.window_size, min_count = 5, workers = 1, sg = 1) self.model = FastText(size=self.embed_dim, window=self.window_size, min_count=5, workers=4, sg=1) self.model.build_vocab(sentences=self.gen) self.model.train(sentences=self.gen, total_examples=self.gen.size, epochs=self.max_iter) wv = self.model.wv print('Words most similar to \'manager\':') print(wv.most_similar('manager')) else: raise NotImplementedError if self.save_flag: self.save(self.model) def save(self, model): checkpoint_dir = '../../logs/checkpoint_wordEmbed/{}'.format( self.corpus) if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) model.save( os.path.join( checkpoint_dir, '{}.{}d.model'.format(self.embed_type, self.embed_dim))) def load(self): checkpoint_dir = '../../logs/checkpoint_wordEmbed/{}/corpus800k'.format( self.corpus) if not os.path.exists(checkpoint_dir): print('Checkpoint Dir Does not Exist !') else: if self.embed_type == 'fasttext': self.model = FastText.load( os.path.join(checkpoint_dir, 'fasttext.{}d.model'.format(self.embed_dim))) else: raise NotImplementedError def _infer(self): self.load() wv = self.model.wv print('hwty' in wv) print('##ed' in wv) print('##y' in wv) print('##mi' in wv) print('##ne' in wv) print('Vector embedding for \'hello\':') print(wv['hello']) word_list = [ 'thanks', 'dear', 'happy', 'sad', 'cost', 'will', 'engine', 'call', 'mail', 'server', 'bug', 'posted', 'inform', 'done', 'send', 'forward', 'talk', 'update', 'regards', 'best', 'worst', 'http' ] for word in word_list: print('Words most similar to \'{}\':'.format(word)) print(wv.most_similar(word))
from gensim.models import FastText
from gensim.models import KeyedVectors
import csv
import os

recipe_sentences = []  # documents read from the csv files

direction = 'recipe_data/'
recipe_folder = os.listdir(direction)  # list of files (folders) inside the recipe_data folder

for i, folder in enumerate(recipe_folder):
    csv_filepath = os.listdir(direction + folder)  # list of csv files inside that folder
    for j, csv_file in enumerate(csv_filepath):
        fi = open(direction + folder + '/' + csv_file, 'rt', encoding='UTF8')
        rdr = csv.reader(fi)
        for k, row in enumerate(rdr):
            if k == 0:
                continue
            elif k % 2 == 0:
                recipe_sentences.append(row)
        fi.close()

# the window is set very large so the model behaves like item2vec
# (every item in a recipe is in every other item's context)
model_ingredient = FastText(sg=1, window=10 * 1000000, vector_size=100, min_count=3)
model_ingredient.build_vocab(recipe_sentences)
model_ingredient.train(recipe_sentences, epochs=10,
                       total_examples=model_ingredient.corpus_count)

model_ingredient.save("./_model_ingredient")        # save the trained model
model_ingredient.wv.save("./_model_ingredient_wv")  # save the trained model's word vectors

similarity = model_ingredient.wv.most_similar(positive=['소세지'])  # 'sausage'
print(similarity)
def main(): print( '■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ main() ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■') df = pd.read_csv('./project/data/train_intent.csv') print(df.shape) # (3918, 2) print(df.isnull().sum()) # 결측값 확인 question 0 intent 0 # 형태소 추출 및 Word2Vec vector_size = 15 okt = Okt() word2vec_model = FastText(size=vector_size, window=3, min_count=1) question = df['question'] joinStr = ' '.join(question) # list -> str로 형 변환 morphs = okt.morphs(joinStr) # 형태소 추출 -> list morphs = np.array(list(set(morphs))) # set: 중복된 단어를 제거한다. morphs = morphs.reshape( 1, len(morphs)) # FastText가 단어별로 적용되도록 차원 크기 변경. (1, n) # print(morphs) # [['규모' '포시' '하하' ... '음악' '성시경' '공주']] # print(morphs.shape) # (1, 1605) print('FastText build compile') word2vec_model.build_vocab(sentences=morphs) print('FastText train') word2vec_model.train(sentences=morphs, total_examples=word2vec_model.corpus_count, epochs=10) print('FastText complete') w2c_index = word2vec_model.wv.index2word # FastText가 적용된 단어 목록들 # intent 값 분류 intent = df['intent'] # 의도 값 intent = list(set(intent)) # 중복된 단어를 제거한다. print( intent ) # ['명언', '번역', '날씨', '시간', '맛집', '먼지', '달력', '위키', '인물', '뉴스', '음악', '이슈'] # intent_mapping 생성 idx = 0 intent_mapping = {} for i in intent: intent_mapping[i] = idx idx += 1 print( intent_mapping ) # {'달력': 0, '번역': 1, '맛집': 2, '날씨': 3, '음악': 4, '이슈': 5, '뉴스': 6, '인물': 7, '시간': 8, '위키': 9, '먼지': 10, '명언': 11} # y_data 생성 y_data = df['intent'] # 의도값 y_data = y_data.map(intent_mapping) y_data = to_categorical(y_data) # OneHot encoding print(y_data.shape) # (3918, 12) # x_data 생성 encode_length = 10 x_data = [] for q_raw in question: q_raw = okt.morphs(q_raw) # 문장 형태소별로 분리(단어 분리). str > list q_raw = list( map(lambda x: q_raw[x] if x < len(q_raw) else '@', range(encode_length))) # x가 단어의 수보다 작을 경우 단어(q_raw[x]) 그대로 리스트에 삽입하고 아닐 경우 @를 삽입한다. q_raw = list( map( lambda x: word2vec_model[x] if x in w2c_index else np.zeros(vector_size, dtype=float), q_raw)) q_raw = np.array(q_raw) x_data.append(q_raw) x_data = np.array(x_data) print(x_data.shape) # (3918, 10, 15) x_data = x_data.reshape(len(x_data), encode_length * vector_size) print('Keras Start', x_data.shape, y_data.shape) model = Sequential() model.add(Dense(256, input_dim=150, activation='relu')) model.add(BatchNormalization()) model.add(Dense(128, activation='relu')) model.add(Dense(12, activation='softmax')) model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) model.fit(x_data, y_data, batch_size=128, epochs=100) # 입력 데이터 중 불용어 제거 del_josa = [ '이구나', '이네', '이야', '은', '는', '이', '가', '을', '를', '로', '으로', '이야', '야', '냐', '니' ] def tokenize(sentence): word_bag = [] pos = okt.pos(sentence) # 형태소에 품사를 추가한다. for word, tag in pos: # 단어와 품사 if (tag == 'Josa' and word in del_josa) or tag == 'Punctuation': # 불 필요한 조사와 구두점을 제거 continue else: word_bag.append(word) # 단어를 리스트에 추가한다. result = ' '.join(word_bag) return result # 입력 데이터(문장)를 벡터화 한다. (데이터 전처리) def pred(text): q_raw = okt.morphs(text) q_raw = list( map(lambda x: q_raw[x] if x < len(q_raw) else '@', range(encode_length))) q_raw = list( map( lambda x: word2vec_model[x] if x in w2c_index else np.zeros(vector_size, dtype=float), q_raw)) q_raw = np.array(q_raw) print(q_raw) q_raw = q_raw.reshape(1, 150) return q_raw # 작동. 
    while True:
        print('User : '******'')
        speech = tokenize(input())
        print('tokenize : ', speech)
        speech = pred(speech)

        # result
        y_intent = model.predict(speech)
        y_intent = np.argmax(y_intent)
        for result, num in intent_mapping.items():
            if y_intent == num:
                print('Intent : ', result, y_intent)
                break

    print(
        '■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ main() end ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■'
    )
class Embedding: """ Wrapper class for gensim FastText """ # Constants DEFAULT_COS_DISTANCE = 0 def __init__(self, *args, model_base=None, model_filename=None, fast_text=True, **kwargs): """ Initialize object. Set model_filename (path to saved base embedding model) OR model_base(initialized embedding model). If both are specified, model_base is preferred. If none, new model will be generated based on args and kwargs. """ if model_base is not None: self.model = model_base elif model_filename is not None: self.load(path=model_filename) elif fast_text: self.model = FastText(*args, **kwargs) else: self.model = Word2Vec(*args, **kwargs) def save(self, path): """ Save model to path """ self.model.save(path) def load(self, path): """ Load model from path """ self.model = FastText.load(path) def build_vocab(self, *args, **kwargs): """ Gensim build_vocab wrapper """ self.model.build_vocab(*args, **kwargs) def train(self, *args, total_examples=None, epochs=None, verbose=False, **kwargs): """ Gensim train wrapper """ if total_examples is None: total_examples = self.model.corpus_count if epochs is None: epochs = self.model.epochs if verbose: kwargs['callbacks'] = [EmbeddingEpochCallback()] self.model.train(*args, total_examples=total_examples, epochs=epochs, **kwargs) def get_vector_word(self, word, use_norm=False, handle_oov=True): """ Get vector representation of a word """ if handle_oov: try: result = self.model.wv.word_vec(word=word, use_norm=use_norm) except KeyError: result = np.zeros(self.model.wv.vector_size, np.float32) return result return self.model.wv.word_vec(word=word, use_norm=use_norm) def get_vector_sentence(self, sentence, min_n=3, max_n=3, use_norm_ngram_word=False, use_norm_word=False, use_norm_ngram_char=False, handle_oov=True): """ Get vector representation of a sentence """ if min_n > max_n or min_n > len(sentence): min_n = len(sentence) max_n = len(sentence) elif max_n > len(sentence): max_n = len(sentence) ngrams = [] for n in range(min_n, max_n + 1): ngrams += self._get_ngram_word(sentence=sentence, n=n) ngrams_found = 0 result = np.zeros(self.model.wv.vector_size, np.float32) for ngram in ngrams: try: ngram_vector = self._get_vector_ngram_words( ngram_words=ngram, use_norm_word=use_norm_word, use_norm_ngram_char=use_norm_ngram_char, handle_oov=False) if use_norm_ngram_word: ngram_vector = self.normalize_vector(ngram_vector) result += ngram_vector ngrams_found += 1 except KeyError: pass if not handle_oov and ngrams_found == 0: raise KeyError('all word level n-grams are absent from model') else: return result / max(1, ngrams_found) @staticmethod def normalize_vector(vec): vec_length = np.linalg.norm(vec) if vec_length == 0.0: return vec return vec / vec_length @staticmethod def cosine_distance(vec1, vec2): return cosine_similarity([vec1], [vec2])[0][0] @staticmethod def _get_ngram_word(sentence, n): if n > len(sentence): n = len(sentence) elif n <= 0: return [] result = [] for idx_start in range(0, len(sentence) - n + 1): result.append(sentence[idx_start:idx_start + n]) return result def _get_vector_ngram_words(self, ngram_words, use_norm_word=False, use_norm_ngram_char=False, handle_oov=True): words_found = 0 result = np.zeros(self.model.wv.vector_size, np.float32) for word in ngram_words: try: word_vector = self.get_vector_word( word=word, use_norm=use_norm_ngram_char, handle_oov=False) if use_norm_word: word_vector = self.normalize_vector(word_vector) result += word_vector words_found += 1 except KeyError: pass if not handle_oov and words_found == 0: raise KeyError('all 
words are absent from model') else: return result / max(1, words_found)
for com in data.Comment:
    com = re.sub("[^a-zA-Z0-8ğüşıöçİĞÜŞÖÇ]", " ", com)
    com = com.lower()
    com = nlp.word_tokenize(com)
    comment_list.append(com)

#%%
vector_size = 250
window = 5

#%% create the FastText model and save it to disk
fasttext_model = 'fasstext.model'
print("Generating Fasttext Vectors...")
start = time.time()
# window, min_count and workers are model hyperparameters and belong in the
# constructor rather than in train()
model = FastText(size=vector_size, window=window, min_count=1, workers=4)
model.build_vocab(comment_list)
model.train(comment_list,
            total_examples=model.corpus_count,
            epochs=model.epochs)
print("Model created in {} seconds".format(time.time() - start))
model.save(fasttext_model)
del model

#%%
fasttext_model = 'fasstext.model'
model = FastText.load(fasttext_model)

#%% compute the mean vector of the words in each comment
main_mean_array = []
mean_vektor = np.zeros((1, 250))
with tqdm(total=len(comment_list)) as pbar:
#!/usr/bin/env python
# _*_ coding: utf-8 _*_
'''
Created on 2019-05-21 15:23:11
@author: wind
'''
from gensim.models import FastText

# training data: ["you", "are", "who"], ["I", "am", "Chinese"]
sentences = [["你", "是", "谁"], ["我", "是", "中国人"]]

# Method 1 (not recommended by the official docs)
# model = FastText(sentences, size=4, window=3, min_count=1, iter=10, min_n=3, max_n=6, word_ngrams=1)

# Method 2
model = FastText(size=4, window=3, min_count=1, word_ngrams=1)
model.build_vocab(sentences=sentences)
model.train(sentences=sentences, total_examples=len(sentences), epochs=10)

# get a word vector
print(model.wv['你'])
# another way to get a word vector
print(model.wv.word_vec('你'))

# save the model
model.save('./model.bin')
# load the model
model = FastText.load("./model.bin")
# save the word vectors in word2vec text format
model.wv.save_word2vec_format("./wv.txt")
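# A small follow-on sketch of the property that distinguishes FastText from plain
# Word2Vec: vectors are built from character n-grams, so a word that never appeared
# in training still gets an embedding. This assumes the `model` trained above is
# still in scope; the query word is only an illustration.
oov_word = "中国人民"                   # not in the two training sentences
print(oov_word in model.wv.vocab)       # False: not in the vocabulary
print(model.wv[oov_word])               # ...but a subword-based vector still exists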
sentenceList, entityList = train_data_load(traindataset_path)  # training data
print('start', len(sentenceList), path + folder)

if not os.path.exists(path + folder):
    os.makedirs(path + folder)  # create the folder

#────────────────────────────────────────────────────────────────────
# → word embedding
print("\n### Fasttext build model ###", end="\n")
w2vModel = FastText(size=vector_size, window=3, workers=8, min_count=1)
w2vModel.build_vocab(sentenceList)
print('\n### Fasttext build complete ###', end="\n")

print('\n### Fasttext train start ###', end="\n")
w2vModel.train(sentenceList,
               total_examples=w2vModel.corpus_count,
               epochs=w2vModel.epochs,
               compute_loss=True,
               verbose=1)
print('\n### Fasttext train complete ###', end="\n")

w2vModel.save(path + folder + 'fasttext')  # save
print('\n### Fasttext model save ###', end="\n")
# w2vModel = FastText.load('model/entity/통합_1031/fasttext')

# → list of words covered by fasttext
w2vIndex = w2vModel.wv.index2word
print('* number of words:', len(w2vIndex))

#────────────────────────────────────────────────────────────────────
# → assign named-entity indices
print(pubmed_wv.most_similar(positive=['brain'])) print('----------------------------') ''' ### Create word2vec model w/ merged vocab t = time() new_wv = FastText(size=30, window=5, min_count=1, workers=3, sg=0, hs=1, negative = 10, sample=0.001, alpha=0.1) new_wv.build_vocab(sentences) ''' total_examples = new_wv.corpus_count new_wv.build_vocab([list(pubmed_wv.vocab.keys())], update=True) new_wv.intersect_word2vec_format(preTrainedPath, binary=True, lockf=1.0) ''' ### Train for 2 epochs new_wv.train(sentences, epochs=2) # , total_examples=total_examples print('Time to train the model 2 epochs: {} mins'.format(round((time() - t) / 60, 2))) print('----------------------------') print(new_wv.most_similar(positive=['treatment'])) print(new_wv.most_similar(positive=['female'])) print(new_wv.most_similar(positive=['history'])) print(new_wv.most_similar(positive=['disease'])) print(new_wv.most_similar(positive=['brain'])) new_wv.save_word2vec_format('mimic-pubmed_2.bin', binary=True) print('----------------------------') # Train for 10 epochs new_wv.train(sentences, epochs=8) # , total_examples=total_examples print('Time to train the model 10 epochs: {} mins'.format(round((time() - t) / 60, 2))) print('----------------------------')
class Text(Dataset): def __init__(self, file=None, df=None, feature_col='Text', label_col=''): super().__init__(file, df, feature_col, label_col) self.text = self.X self.weights = None self.split_text() # Split into text train and test def split_text(self): self.refresh() self.sentence_train, self.sentence_test, self.y_train, self.y_test = train_test_split( self.text, self.y) def bag_of_words(self, **kwargs): """Transform text corpus into bag of words i.e ['Hi you, how are you', 'I am doing well, thank you!'] -> [[1, 1, 1, 2, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1, 1, 1, 1]] """ self.vectorizer = CountVectorizer(**kwargs) self.vectorizer.fit(self.sentence_train) self.BoW_train = self.vectorizer.transform( self.sentence_train).toarray() self.BoW_test = self.vectorizer.transform(self.sentence_test).toarray() self.X_train = self.BoW_train self.X_test = self.BoW_test self.feature_names = self.vectorizer.get_feature_names() def vectorize(self, num_words=10000): """Transform text corpus to integers in a tokenizer i.e. ["Hi how are you?", "I'm well, how about you"] becomes [[10, 3, 4, 7, 0], [5, 12, 3, 15, 7]] """ self.vectorizer = Tokenizer(num_words) self.vectorizer.fit_on_texts(self.sentence_train) self.tokenized_train = self.vectorizer.texts_to_sequences( self.sentence_train) self.tokenized_test = self.vectorizer.texts_to_sequences( self.sentence_test) self.wtoi = self.vectorizer.word_index self.itow = self.vectorizer.index_word self.pad_and_refresh() def pad_and_refresh(self, max_len=None): if max_len is None: self.tokenized_train = pad_sequences(self.tokenized_train, padding='post') self.tokenized_test = pad_sequences(self.tokenized_test, padding='post') else: self.tokenized_train = pad_sequences(self.tokenized_train, padding='post', max_len=max_len) self.tokenized_test = pad_sequences(self.tokenized_test, padding='post', max_len=max_len) self.X_train = self.tokenized_train self.X_test = self.tokenized_test self.vocab_size = len(self.wtoi) + 1 def create_pretrained_embedding_matrix(self, path, embedding_dim=300): # works after vectorize self.weights = np.zeros((self.vocab_size, embedding_dim)) with open(path) as f: for line in f: word, vector = line.split() if word in self.vectorizer.word_index: idx = self.wtoi(word) self.weights[idx] = np.array( vector, dtype=np.float32)[:embedding_dim] def word_to_index(self, word): #word to index return self.wtoi[word] def index_to_word(self, idx): #index to word return self.itow[idx] def train_fasttext(self, path, sg=1, embedding_dim=300, min_count=2, max_vocab_size=30000, seed=42, epochs=10, workers=4, lowercase=False, full=False): sentences = self.sentence_train.values self.fasttext_model = FastText(sg=sg, size=embedding_dim, min_count=min_count, max_vocab_size=max_vocab_size, seed=seed, workers=workers) tokenized = list(self._gen_sentences(sentences)) print('Building vocabulary for fasttext model...') self.fasttext_model.build_vocab(sentences=tokenized) print('Training fasttext model...') self.fasttext_model.train(sentences=tokenized, total_examples=len(tokenized), epochs=epochs) self.word_vectors = self.fasttext_model.wv counts = Counter({ word: vocab.count for (word, vocab) in self.word_vectors.vocab.items() }) self.wtoi = { t[0]: i + 1 for i, t in enumerate(counts.most_common(max_vocab_size)) } self.itow = {v: k for k, v in self.wtoi.items()} self.tokenized_train = [[self.wtoi.get(word, 0) for word in sentence] for sentence in tokenized] tok_test = list(self._gen_sentences(self.sentence_test.values)) self.tokenized_test = [[self.wtoi.get(word, 0) for 
word in sentence] for sentence in tok_test] self.pad_and_refresh() self.save_fasttext(path) self.create_embedding_matrix(embedding_dim) def create_embedding_matrix(self, embedding_dim): self.weights = np.zeros((self.vocab_size, embedding_dim)) for word, i in self.wtoi.items(): if i >= 10000: continue try: embedding_vector = self.word_vectors[word] # words not found in embedding index will be all-zeros. self.weights[i] = embedding_vector except: pass def save_fasttext(self, path): model_path = os.path.join(path, 'fasttext.model') self.fasttext_model.save(model_path) def _gen_sentences(self, sentences, lowercase=False): for s in sentences: yield (list(tokenize(s, lowercase=lowercase)))
def _create_vocab(self): assert self.split == 'train', "Vocablurary can only be created for training file." with open(self.raw_data_path, 'r') as file: text = file.read() sentences = sent_tokenize(text) occ_register = OrderedCounter() w2i = dict() i2w = dict() special_tokens = ['<exc>', '<pad>', '<eos>'] for st in special_tokens: i2w[len(w2i)] = st w2i[st] = len(w2i) texts = [] unq_words = [] unk_words = [] for i, line in enumerate(sentences): words = word_tokenize(line) occ_register.update(words) texts.append(words) if self.pre_emb: model = KeyedVectors.load_word2vec_format(self.model_path) else: if os.path.exists(self.model_path): model = FastText.load(self.model_path) else: model = FastText(size=self.ft_size, window=self.ft_w, min_count=self.min_occ) model.build_vocab(sentences=texts) model.train(sentences=texts, total_examples=len(texts), epochs=0) model.save(self.model_path) base = np.ones((300, ), dtype=np.float32) emb = [base * (i - 1) for i in range(len(special_tokens))] for w, occ in occ_register.items(): if occ > self.min_occ and w not in special_tokens: i2w[len(w2i)] = w w2i[w] = len(w2i) if self.pre_emb: if w in model.vocab: emb.append(model[w]) else: emb.append(emb[0]) unk_words.append(w) else: emb.append(model[w]) else: unq_words.append(w) assert len(w2i) == len(i2w) == len(emb) emb = np.array(emb) # print(emb.min()) # print(emb.max()) emb = (emb - emb.min()) / (emb.max() - emb.min()) print("Vocablurary of {} keys created, {} words are excluded, {} " "words not in embedding dictionary.".format( len(w2i), len(unq_words), len(unk_words))) vocab = dict(w2i=w2i, i2w=i2w) with io.open(os.path.join(self.gen_dir, self.vocab_file), 'wb') as vocab_file: data = json.dumps(vocab, ensure_ascii=False) vocab_file.write(data.encode('utf8', 'replace')) with open(os.path.join(self.gen_dir, self.emb_file), 'wb') as emb_file: pickle.dump(np.array(emb), emb_file) with open(os.path.join(self.gen_dir, 'cub.unique'), 'wb') as unq_file: pickle.dump(np.array(unq_words), unq_file) with open(os.path.join(self.gen_dir, 'cub.unknown'), 'wb') as unknown_file: pickle.dump(np.array(unk_words), unknown_file) with open(os.path.join(self.gen_dir, 'cub.all'), 'wb') as a_file: pickle.dump(occ_register, a_file) self._load_vocab()
class OneVsRestSGDClassifier(LabelClassifier):
    def __init__(self, f_dim=100, ft_iters=20, update_iters=100,
                 label_dict_path='data/labels.txt'):
        LabelClassifier.__init__(self, label_dict_path)
        self.f_dim = f_dim  # dimension of word feature vector
        self.ft_iters = ft_iters
        self.update_iters = update_iters
        self.ft_model = FastText(min_count=1, size=self.f_dim)
        self.clf = OneVsRestClassifier(
            SGDClassifier(loss='modified_huber',
                          class_weight={0: 0.4, 1: 0.6},
                          penalty='l2',
                          warm_start=False,
                          random_state=1))

    def init_fasttext(self, model_path=None, train_data=None):
        """
        If train_data is provided, train a new fasttext model;
        otherwise, load it from the given path.
        --------
        Parameters:
            model_path: fasttext model prefix
            train_data: a list of tokenized sentences. If not provided, will try
                        to load an existing model from model_path
        """
        if not train_data and model_path and os.path.isfile(model_path):
            # === load existing model ===
            print('loading fasttext model from', model_path)
            self.ft_model = FastText.load(model_path)
        elif train_data:
            # === train fasttext model ===
            # If train_data is not a list of lists, split each sentence
            # into a list of words.
            print('training fasttext model from scratch...')
            train_data = [re.split(',| ', r) if not isinstance(r, list) else r
                          for r in train_data]
            self.ft_model.build_vocab(train_data)
            self.ft_model.train(train_data,
                                total_examples=len(train_data),
                                epochs=self.ft_iters)
            if model_path:
                self.ft_model.save(model_path, separately=[])
        else:
            # === no train data and no model path provided ===
            raise TrainDataException(
                'Error building fasttext model. No data/model provided.')

    def div_norm(self, x):
        norm_value = np.sqrt(np.sum(x**2))  # l2 norm
        if norm_value > 0:
            return x * (1.0 / norm_value)
        else:
            return x

    def sentence_to_vec(self, words):
        """
        Generate a sentence embedding by averaging normalized word embeddings.
        --------
        Parameters:
            words: a list of words or a string representation of a sentence
                   (separated by space or ',')
        Return:
            sentence embedding vector of size f_dim
        """
        if not isinstance(words, list):
            words = re.split(',| ', words)
        vecs = np.zeros((len(words), self.f_dim))
        for i, word in enumerate(words):
            v = self.ft_model.wv.get_vector(word)
            vecs[i] = self.div_norm(v)
        return np.mean(vecs, axis=0)

    def to_vec(self, data):
        """ batch computation of sentence embeddings """
        vec = np.zeros((len(data), self.f_dim))
        for i, sentence in enumerate(data):
            vec[i] = self.sentence_to_vec(sentence)
        return vec

    def train(self, train_data, train_label):
        """
        Offline training of the SGD classifier.
        --------
        Parameters:
            train_data: a list of tokenized sentences. Each sentence is either a
                        string delimited by comma or space, or a list of words.
            train_label: a list of labels. Each label is a string delimited by
                         comma or space.
        Return:
            X: sentence embedding matrix of size len(train_data) x f_dim
            Y: binary label matrix of size len(train_data) x n_classes
        """
        print('training multilabel classifier on %d samples...' % len(train_data))
        Y = np.zeros((len(train_label), len(self.labeldict)))
        for i, labels in enumerate(train_label):
            label_list = re.split(',| ', labels)
            for l in label_list:
                if l:
                    Y[i, self.labeldictR[l]] = 1

        # Add a dummy sample for classes that do not have any samples.
        indices = np.where(np.sum(Y, axis=0) == 0)[0]
        Y_new = np.zeros((len(indices), Y.shape[1]))
        for i, id in enumerate(indices):
            train_data.append([self.labeldict[id]])
            Y_new[i, id] = 1
        Y = np.vstack((Y, Y_new))

        X = self.to_vec(train_data)
        self.clf.fit(X, Y)
        return X, Y

    def train_update(self, train_data, train_label):
        """
        Online training of the SGD classifier.
        --------
        Parameters: see train()
        """
        Y = np.zeros((len(train_label), len(self.labeldict)))
        X = self.to_vec(train_data)
        for i, labels in enumerate(train_label):
            label_list = re.split(',| ', labels)
            for l in label_list:
                if l:
                    Y[i, self.labeldictR[l]] = 1
        for i in range(self.update_iters):
            self.clf.partial_fit(X, Y)
        return X, Y

    def classify(self, string):
        """
        Predict the labels of a tokenized sentence.
        --------
        Parameters:
            string: string delimited by comma or space, or a list of words
        Return:
            labels: a list of predicted labels
        """
        X = self.to_vec([string])
        Y = self.clf.predict(X)
        # print('class probability', self.clf.predict_proba(X))
        labels = [self.labeldict[id] for id in np.nonzero(Y[0])[0]]
        return labels

    def save_clf(self, filename):
        print('writing classification model to', filename, '...')
        with open(filename, 'wb') as f:
            pickle.dump(self.clf, f)

    def load_clf(self, filename):
        print('loading classification model from', filename, '...')
        with open(filename, 'rb') as f:
            self.clf = pickle.load(f)
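# A minimal usage sketch for OneVsRestSGDClassifier. The toy sentences and the
# label names are assumptions; the label names must exist in the label
# dictionary loaded from data/labels.txt, which is not shown in this excerpt.
clf = OneVsRestSGDClassifier(f_dim=100, label_dict_path='data/labels.txt')
clf.init_fasttext(model_path='models/labels.ft',
                  train_data=['the food was great', 'room was small but clean'])
X, Y = clf.train(train_data=['the food was great', 'room was small but clean'],
                 train_label=['food', 'room'])
print(clf.classify('breakfast and food were excellent'))
clf.save_clf('models/onevsrest_sgd.pkl')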
        # Save the particular model
        model_dbow.save("/share/pi/rubin/jiaming/models/{}.model".format(model_name))
        model_dbow.delete_temporary_training_data(keep_doctags_vectors=True,
                                                  keep_inference=True)

    elif args.model == 'fasttext':
        model_name = "fasttext_v{}_a{}_e{}_t{}_w{}_s{}_ns{}".format(
            args.size, args.alpha, args.epochs, args.alg,
            args.window, args.sample, args.ns_exponent)
        if model_name + ".model" in trained_models:
            print(model_name + " already trained. Passing.")
            continue
        print("Training model: " + model_name)

        model = FastText(min_count=10, negative=5, size=args.size, sg=args.alg,
                         alpha=args.alpha, min_alpha=args.alpha, window=args.window,
                         sample=args.sample, ns_exponent=args.ns_exponent, workers=10)
        # Tokenize once and reuse the token lists for both vocabulary building
        # and training (training on the raw strings would treat each sentence
        # as a sequence of characters).
        tokenized_notes = [word_tokenize(line.strip()) for line in note_sentences]
        model.build_vocab(sentences=tokenized_notes)
        for epoch in range(args.epochs):
            # Train one epoch at a time on a shuffled copy of the corpus and
            # manually decay the learning rate after each pass.
            model.train(sentences=utils.shuffle([x for x in tqdm(tokenized_notes)]),
                        total_examples=len(tokenized_notes), epochs=1)
            model.alpha -= 0.002
            model.min_alpha = model.alpha

        # Save the particular model
        model.save("/share/pi/rubin/jiaming/models/{}.model".format(model_name))
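# Sketch of how one of the models saved above could be reloaded for inference.
# The concrete file name and the query word are illustrative only and must
# match a model that was actually trained on the notes corpus.
from gensim.models import FastText

loaded = FastText.load(
    "/share/pi/rubin/jiaming/models/fasttext_v100_a0.025_e10_t1_w5_s0.001_ns0.75.model")
print(loaded.wv.most_similar('patient', topn=5))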
def train_fasttext(corpus):
    model = FastText(size=9, window=2, min_count=1)
    model.build_vocab(sentences=corpus)
    model.train(sentences=corpus, total_examples=len(corpus), epochs=10)
    model.save(r'models\fasttext.model')
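# Example call for train_fasttext(), assuming `corpus` is a list of token
# lists; the toy corpus below is illustrative only.
corpus = [['hello', 'world'],
          ['fasttext', 'builds', 'subword', 'vectors'],
          ['hello', 'again']]
train_fasttext(corpus)
model = FastText.load(r'models\fasttext.model')
print(model.wv.most_similar('hello', topn=3))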
food = ["food"]
rest = ["restaurant"]

# -------------------------- Preprocessing ----------------------------------
reviews = []
for i in reviews_df_com["Review Text"]:
    reviews.append(clean_text(i))
    # top.append(text)  # That is very IMPORTANT !!! :)

# --------------------------- Word2Vec ---------------------------------------
# model = Word2Vec(reviews, size=150, window=10, min_count=2, workers=10)
# model.train(reviews, total_examples=len(reviews), epochs=10)

# ---------------------------- Fasttext ---------------------------------------
model = FastText(size=170, window=10, min_count=2, workers=10)  # instantiate
model.build_vocab(sentences=reviews)
model.train(sentences=reviews, total_examples=len(reviews), epochs=10)  # train
word_vectors = model.wv

word2vectfonc(hotel, 1)
word2vectfonc(staff, 1)
word2vectfonc(loc, 2)
word2vectfonc(room, 3)
word2vectfonc(breakfast, 4)
word2vectfonc(bed, 5)
word2vectfonc(service, 6)
word2vectfonc(bath, 7)
word2vectfonc(view, 8)
word2vectfonc(food, 9)
word2vectfonc(rest, 10)
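# word2vectfonc and the remaining seed lists (hotel, staff, loc, room, ...)
# are defined elsewhere in the project and are not shown here. Independently
# of that helper, the trained vectors can be queried directly, for example to
# inspect the nearest neighbours of one of the seed terms defined above:
print(word_vectors.most_similar(positive=food, topn=10))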