def load(fname, kind='auto', *args, **kwargs):
    ''' Loads a word vectors file '''
    # Infer the file kind from the extension when asked to.
    if kind == 'auto':
        if fname.endswith('.bin'):
            kind = 'bin'
        elif fname.endswith('.txt'):
            kind = 'txt'
        else:
            raise Exception('Could not identify kind')
    # Dispatch to the matching WordVectors constructor.
    if kind == 'bin':
        return WordVectors.from_binary(fname, *args, **kwargs)
    elif kind == 'txt':
        return WordVectors.from_text(fname, *args, **kwargs)
    elif kind == 'mmap':
        return WordVectors.from_mmap(fname, *args, **kwargs)
    else:
        raise Exception('Unknown kind')

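# Minimal usage sketch (not from the original source): how load() might be
# called. The file paths are placeholders, and the WordVectors.from_* call
# signatures are assumed from the dispatcher above.
wv_bin = load("vectors.bin")              # kind inferred from the .bin extension
wv_txt = load("vectors.txt", kind="txt")  # kind forced explicitly
# An unrecognized extension with kind='auto' raises Exception('Could not identify kind').
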
# (fragment: tail of a vocab-building function; the first three lines below
#  finish the loop over an earlier sentence/score file, processed exactly like
#  the duc04/duc05 files that follow)
            words = nltk.word_tokenize(sentence)
            for word in words:
                vocab[str(word).lower()] = 1
    # duc04 data
    with open("../data/sentence.score.duc04.txt", mode="r") as f:
        for line in f:
            sentence, score = line.split("hynguyensplit")
            words = nltk.word_tokenize(sentence)
            for word in words:
                vocab[str(word).lower()] = 1
    # duc05 data
    with open("../data/sentence.score.duc05.txt", mode="r") as f:
        for line in f:
            sentence, score = line.split("hynguyensplit")
            words = nltk.word_tokenize(sentence)
            for word in words:
                vocab[str(word).lower()] = 1
    print("Finish reading vocab size: ", str(len(vocab.keys())))
    return vocab


import pickle

if __name__ == "__main__":
    # Load GloVe vectors stored in word2vec text format.
    w2v = word2vec.Word2Vec.load_word2vec_format(
        "/Users/HyNguyen/Documents/MachineLearning/convae/model/glove.400k.txt",
        binary=False)
    # Keep only the vectors for words in the pickled (lowercased) vocabulary.
    with open("vocab.lower.pickle", mode="rb") as f:
        vocab = pickle.load(f)
    wordvectors = WordVectors.create_wordvectos_from_word2vec_vocab(w2v, vocab)
    wordvectors.save_text_format("../model/glove.filter.txt")

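# Hedged follow-up sketch (not in the original): reloading the filtered GloVe
# file with WordVectors.load_from_text_format, which appears elsewhere in this
# codebase. The second argument mirrors the "CWVector" label used there and is
# only an illustrative name.
glove_filtered = WordVectors.load_from_text_format("../model/glove.filter.txt",
                                                   "GloveFiltered")
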
def main(wordCorpus):
    min_df = 2
    if (wordCorpus == 'twenty-news'):
        groupIndices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                        11, 12, 13, 14, 15, 16, 17, 18, 19]
        orderReductions = ['none', 'svd', 'glove', 'fasttext', 'word2vec',
                           'custom-vectors-fasttext', 'custom-vectors-word2vec']
    elif (wordCorpus == 'acl-imdb'):
        groupIndices = [0, 1]
        orderReductions = ['svd', 'glove', 'fasttext', 'word2vec',
                           'custom-vectors-fasttext', 'custom-vectors-word2vec']
    nClusters = len(groupIndices)
    vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=min_df)
    metrics = {}
    for tokenType in ['stopped']:
        X, indexList = getX(wordCorpus, tokenType, groupIndices)
        sparseX = vectorizer.fit_transform(X)
        corpusVocab = vectorizer.vocabulary_
        metrics[tokenType] = {}
        for orderReduction in orderReductions:
            if (((orderReduction == 'word2vec') or (orderReduction == 'fasttext')
                 or (orderReduction == 'glove')) and (tokenType != 'stopped')):
                continue
            else:
                print(tokenType, orderReduction)
                metrics[tokenType][orderReduction] = {}
                if ((orderReduction != 'svd') and (orderReduction != 'none')):
                    wvObject = WordVectors(wordCorpus=wordCorpus,
                                           wordVecSource=orderReduction,
                                           corpusVocab=corpusVocab,
                                           tokenType=tokenType)
                if (orderReduction == 'none'):
                    denseZ = sparseX
                    denseZ = denseZ.toarray()
                elif (orderReduction == 'svd'):
                    denseZ = svdReduce(sparseX, order=300)
                else:
                    argsForTransform = {'sparseX': sparseX, 'vocab': corpusVocab}
                    denseZ = Transform2WordVectors(wvObject).transform(argsForTransform)
                normalizer = Normalizer(copy=False)
                denseZ = normalizer.fit_transform(denseZ)
                intraClusterMetics, centroidMetrics = computeMetrics(
                    denseZ, indexList, tokenType, orderReduction)
                metrics[tokenType][orderReduction]['intraClusterMetics'] = intraClusterMetics
                metrics[tokenType][orderReduction]['centroidMetrics'] = centroidMetrics
    with open('./results/' + wordCorpus + '.json', 'w') as f:
        f.write(json.dumps(metrics, ensure_ascii=True))

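# Hedged sketch (not the original helper): svdReduce(sparseX, order=300) is not
# defined in this snippet. A minimal version consistent with how it is called
# above could simply wrap sklearn's TruncatedSVD; the name below is illustrative.
from sklearn.decomposition import TruncatedSVD

def svdReduce_sketch(sparseX, order=300):
    # Reduce the sparse tf-idf document-term matrix to `order` latent dimensions.
    return TruncatedSVD(n_components=order).fit_transform(sparseX)
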
from wordvectors import WordVectors
import numpy as np


def read_from_senna():
    # SENNA ships its embeddings and word list as two aligned files:
    # row i of embeddings.txt is the vector for line i of words.lst.
    file_embedding = "/Users/HyNguyen/Documents/Research/Data/senna/embeddings/embeddings.txt"
    file_wordslist = "/Users/HyNguyen/Documents/Research/Data/senna/hash/words.lst"
    with open(file_wordslist, mode="r") as f:
        words = f.readlines()
    with open(file_embedding, mode="r") as f:
        vectors = f.readlines()
    words_embedding = []
    words_index = {}
    for i, (word, vector) in enumerate(zip(words, vectors)):
        word_2 = word[:-1]              # strip the trailing newline
        vector_2 = vector[:-1].split()
        vec_np = np.array(vector_2, dtype=np.float32)
        words_embedding.append(vec_np)
        words_index[word_2] = i
    words_embedding = np.array(words_embedding, dtype=np.float32)
    print("words_embedding.shape", words_embedding.shape)
    print("words_index.length", len(words_index))
    wordvectors = WordVectors(50, words_embedding, words_index)
    wordvectors.save_text_format("../model/cwvector.txt")


if __name__ == "__main__":
    a = WordVectors.load_from_text_format("../model/cwvector.txt", "CWVector")

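# Hedged sketch (not in the original): a sanity check one could add inside
# read_from_senna() to look up a single SENNA vector from the parsed arrays.
# The word "the" is assumed to be present in words.lst.
#
#     idx = words_index["the"]
#     the_vector = words_embedding[idx]   # expected shape: (50,)
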
import xml.etree.ElementTree as ET
import os
import logging

import numpy as np
import nltk
from nltk.corpus import treebank, brown
from gensim.models import word2vec

from wordvectors import WordVectors

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

if __name__ == "__main__":
    # Load Word2Vec from Google
    w2v = word2vec.Word2Vec.load_word2vec_format(
        "/Users/HyNguyen/Documents/Research/Data/GoogleNews-vectors-negative300.bin",
        binary=True)
    # Create object WordVectors
    wordvectors = WordVectors(300, np.empty((0, 300), dtype=float), {})
    # wordvectors = WordVectors.load("model/wordvector.txt")

    # Penn Tree Bank
    treebank_sents = treebank.sents()
    for i in range(len(treebank_sents)):
        senttmp = " ".join(treebank_sents[i])
        words = nltk.word_tokenize(senttmp)
        wordvectors.add_wordvector_from_w2vmodel(w2v, words)
    print("Finish penn tree bank corpus, Wordvector size: ",
          str(wordvectors.embed_matrix.shape[0]))

    # Brown
    brown_sents = brown.sents()

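# Hedged sketch (not the original implementation): the snippet above is
# truncated after brown.sents(), and add_wordvector_from_w2vmodel() is not
# shown. Based on how it is used, it plausibly looks up each token in the
# word2vec model and appends vectors for words not seen before. The word_index
# attribute name below is an assumption; only embed_matrix appears in the code.
def add_wordvector_from_w2vmodel_sketch(wv, w2v, words):
    for word in words:
        if word in w2v and word not in wv.word_index:
            wv.word_index[word] = wv.embed_matrix.shape[0]
            wv.embed_matrix = np.vstack([wv.embed_matrix, w2v[word]])
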
def main():
    start0 = time.time()
    wordCorpus, min_df, model, tokenType, wordVecSource = processArgs()
    logger.info('Running: WordCorpus: {}, Models: {}, TokenType: {}, min_df: {}, wordVecSource: {}'.format(
        wordCorpus, model, tokenType, min_df, wordVecSource))
    X, y, classNames = Tokens(wordCorpus).getTokens(tokenType)
    # This is only to generate a vocabulary with min_df
    vocabularyGenerator = CountVectorizer(analyzer=lambda x: x, min_df=min_df).fit(X)
    corpusVocab = sorted(vocabularyGenerator.vocabulary_, key=vocabularyGenerator.vocabulary_.get)
    logger.info('Total Corpus Size: len(corpusVocab) with frequency > min_df : {}, X.shape: {}, y.shape: {}, # classes: {}'.format(
        len(corpusVocab), X.shape, y.shape, len(classNames)))
    logger.info('Class Names:{}'.format(classNames))
    if (wordVecSource):
        # nWords_in_this_set X wvLength
        wvObject = WordVectors(wordCorpus=wordCorpus, wordVecSource=wordVecSource,
                               corpusVocab=corpusVocab, tokenType=tokenType)
    else:
        wvObject = None
    results = {}
    results['timeForDataFetch'] = time.time() - start0
    logger.info('Time Taken For Data Fetch: {}'.format(results['timeForDataFetch']))
    modelRuns = defineModels(min_df, model, wvObject)
    logger.info('Model Runs:\n{}'.format(modelRuns))
    if (wordCorpus == 'twenty-news'):
        testDataFraction = 0.2
        sss = StratifiedShuffleSplit(n_splits=1, test_size=testDataFraction, random_state=0)
        sss.get_n_splits(X, y)
        for train_index, test_index in sss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
    elif (wordCorpus == 'acl-imdb'):
        X_train, y_train, classNames = Tokens(wordCorpus).getTokens(tokenType, 'train')
        X_test, y_test, classNames = Tokens(wordCorpus).getTokens(tokenType, 'test')
    marker = ('X y vocab: Train => Test:' + str(X_train.shape) + ',' + str(y_train.shape)
              + '=>' + str(X_test.shape) + ',' + str(y_test.shape))
    for name, model in modelRuns:
        results[name] = {}
        results[name][marker] = {}
        logger.info('\n\nCurrent Run: {} => {}'.format(name, marker))
        start = time.time()
        logger.info("Training Begin")
        model.fit(X_train, y_train)
        logger.info("Training End")
        logger.info("Prediction Begin")
        predicted = model.predict(X_test)
        logger.info("Prediction End")
        # nclasses x nclasses matrix.
        # M_ij => truly 'i' but predicted as 'j'. C_ii => TP.
        # class_i => class[groupIndex] => so classNames go top->bottom & left->right.
        # sum row i    - C_ii => predicted as NOT 'i' when it should be 'i' => FN => C_ii/(sum_row_i)    = recall
        # sum column i - C_ii => predicted as 'i' when it should NOT be 'i' => FP => C_ii/(sum_column_i) = precision
        results[name][marker]['model_vocabulary_size'] = len(model.named_steps['vectorizer'].model.vocabulary_)
        results[name][marker]['confusion_matrix'] = confusion_matrix(y_test, predicted)
        results[name][marker]['timeForThisModel_fit_predict'] = time.time() - start
        logger.info('Run:{}, {}, Confusion Matrix:\n{}'.format(
            name, marker, results[name][marker]['confusion_matrix']))
        logger.info('Run:{}, {}, Classification Report:\n{}'.format(
            name, marker, classification_report(y_test, predicted, target_names=classNames)))
        logger.info('Model Vocab Size:{}'.format(results[name][marker]['model_vocabulary_size']))
        logger.info('Time Taken For This Model Run:{}'.format(
            results[name][marker]['timeForThisModel_fit_predict']))
    results['overAllTimeTaken'] = time.time() - start0
    logger.info('Overall Time Taken:{}'.format(results['overAllTimeTaken']))

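# Hedged sketch (not the original): defineModels() is not shown in this snippet.
# Its return value is iterated as (name, model) pairs, and each model exposes
# named_steps['vectorizer'].model.vocabulary_, which suggests an sklearn
# Pipeline whose first step wraps a vectorizer. A minimal stand-in using a plain
# CountVectorizer (no wrapper, so vocabulary_ lives directly on the step) and a
# linear SVM could look like this; every name below is illustrative.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC

def defineModels_sketch(min_df, model, wvObject):
    return [('linearsvc',
             Pipeline([('vectorizer', CountVectorizer(analyzer=lambda x: x, min_df=min_df)),
                       ('classifier', LinearSVC())]))]
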
def main():
    start0 = time.time()
    wordCorpus, min_df, tokenType, orderReduction, listOfClasses = processArgs()
    classList = list(map(int, listOfClasses.split(',')))
    logger.info('Running: WordCorpus: {}, TokenType: {}, min_df: {}, orderReduction: {}, listOfClasses: {}'.format(
        wordCorpus, tokenType, min_df, orderReduction, classList))
    # vectorizers = [('counts', CountVectorizer(analyzer=lambda x: x, min_df=min_df)),
    #                ('tf-idf', TfidfVectorizer(analyzer=lambda x: x, min_df=min_df))]
    vectorizers = [('tf-idf', TfidfVectorizer(analyzer=lambda x: x, min_df=min_df))]
    X, indexList = getX(wordCorpus, tokenType, listOfClasses)
    out0 = [tokenType]
    for trueCluster, startEnd in indexList.items():
        out0.append(trueCluster + ':' + str(startEnd['end'] - startEnd['start']))
    # This is only to generate a vocabulary with min_df
    vocabularyGenerator = CountVectorizer(analyzer=lambda x: x, min_df=min_df).fit(X)
    corpusVocab = vocabularyGenerator.vocabulary_
    logger.info('Total Corpus Size: len(corpusVocab) with frequency > min_df : {}, X.shape: {}, # clusters: {}'.format(
        len(corpusVocab), X.shape, len(classList)))
    if ((orderReduction) and (orderReduction != 'svd')):
        wvObject = WordVectors(wordCorpus=wordCorpus, wordVecSource=orderReduction,
                               corpusVocab=corpusVocab, tokenType=tokenType)
    results = []
    for name, vectorizer in vectorizers:
        logger.info('\n\nVectorizer: {}'.format(name))
        sparseX = vectorizer.fit_transform(X)
        if (not orderReduction):
            denseZ = sparseX
        elif (orderReduction == 'svd'):
            denseZ = svdReduce(sparseX, order=300)
        else:
            argsForTransform = {'sparseX': sparseX, 'vocab': corpusVocab}
            denseZ = Transform2WordVectors(wvObject).transform(argsForTransform)
        nClusters = len(classList)
        normalizer = Normalizer(copy=False)
        denseZ = normalizer.fit_transform(denseZ)
        nRuns = 1
        for run in range(nRuns):
            result = []
            result = result + out0
            result = result + [name, run, orderReduction]
            model = KMeans(n_clusters=nClusters, max_iter=5000, tol=1.0e-8)
            labels = model.fit_predict(denseZ)
            logger.info('\nRun:{}'.format(run))
            for predictedCluster in range(nClusters):
                result.append(str(predictedCluster) + ':' +
                              str(len(set(np.where(labels == predictedCluster)[0]))))
            for trueCluster, startEnd in indexList.items():
                predictedLabels = labels[startEnd['start']:startEnd['end']]
                for predictedCluster in range(nClusters):
                    count = len(set(np.where(predictedLabels == predictedCluster)[0]))
                    result.append(str(predictedCluster) + ':' + str(count))
            minClusterSeparation = getMinClusterSeparation(nClusters, model.cluster_centers_)
            ratio = model.inertia_ / minClusterSeparation
            result = result + [model.inertia_, minClusterSeparation, ratio]
            results.append(result)
    with open('./results.csv', 'wb') as fh1:
        np.savetxt(fh1, results, delimiter=", ", fmt='%s')

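# Hedged sketch (not the original): getMinClusterSeparation() is not defined in
# this snippet. Given how its result is used (inertia / separation), it
# plausibly returns the smallest pairwise distance between the KMeans cluster
# centroids; the name below is illustrative.
from scipy.spatial.distance import pdist

def getMinClusterSeparation_sketch(nClusters, cluster_centers):
    # nClusters is kept only to mirror the original call signature.
    return pdist(cluster_centers).min()
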