Example 1
def load(fname, kind='auto', *args, **kwargs):
    '''
    Load a word-vectors file, inferring the on-disk format from the
    file extension when kind='auto'.
    '''
    if kind == 'auto':
        if fname.endswith('.bin'):
            kind = 'bin'
        elif fname.endswith('.txt'):
            kind = 'txt'
        else:
            raise ValueError('Could not infer kind from file name: %s' % fname)
    if kind == 'bin':
        return WordVectors.from_binary(fname, *args, **kwargs)
    elif kind == 'txt':
        return WordVectors.from_text(fname, *args, **kwargs)
    elif kind == 'mmap':
        return WordVectors.from_mmap(fname, *args, **kwargs)
    else:
        raise ValueError('Unknown kind: %r' % kind)
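A minimal usage sketch, assuming word-vector files with these names exist and that the from_* constructors take the file name as their first argument (neither is shown above):

# The .txt extension is detected, so this dispatches to WordVectors.from_text.
wv = load('vectors.txt')

# A memory-mapped store has no recognized extension, so kind must be explicit.
wv = load('vectors.mmap', kind='mmap')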
Example 2
            words = nltk.word_tokenize(sentence)
            for word in words:
                vocab[str(word).lower()] = 1

    # duc04 data
    with open("../data/sentence.score.duc04.txt", mode="r") as f:
        for line in f:
            sentence, score = line.split("hynguyensplit")
            words = nltk.word_tokenize(sentence)
            for word in words:
                vocab[str(word).lower()] = 1

    # duc05 data
    with open("../data/sentence.score.duc05.txt", mode="r") as f:
        for line in f:
            sentence, score = line.split("hynguyensplit")
            words = nltk.word_tokenize(sentence)
            for word in words:
                vocab[str(word).lower()] = 1

    print("Finish reading vocab size: ", str(len(vocab.keys())))
    return vocab

import pickle
if __name__ == "__main__":

    w2v = word2vec.Word2Vec.load_word2vec_format(
        "/Users/HyNguyen/Documents/MachineLearning/convae/model/glove.400k.txt",
        binary=False)
    with open("vocab.lower.pickle", mode="rb") as f:
        vocab = pickle.load(f)
    wordvectors = WordVectors.create_wordvectos_from_word2vec_vocab(w2v, vocab)
    wordvectors.save_text_format("../model/glove.filter.txt")
Example 3
def main(wordCorpus):
    min_df = 2
    if (wordCorpus == 'twenty-news'):
        groupIndices = list(range(20))
        orderReductions = [
            'none', 'svd', 'glove', 'fasttext', 'word2vec',
            'custom-vectors-fasttext', 'custom-vectors-word2vec'
        ]
    elif (wordCorpus == 'acl-imdb'):
        groupIndices = [0, 1]
        orderReductions = [
            'svd', 'glove', 'fasttext', 'word2vec', 'custom-vectors-fasttext',
            'custom-vectors-word2vec'
        ]
    else:
        raise ValueError('Unknown wordCorpus: ' + wordCorpus)

    nClusters = len(groupIndices)

    vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=min_df)

    metrics = {}
    for tokenType in ['stopped']:
        X, indexList = getX(wordCorpus, tokenType, groupIndices)

        sparseX = vectorizer.fit_transform(X)
        corpusVocab = vectorizer.vocabulary_

        metrics[tokenType] = {}

        for orderReduction in orderReductions:
            if (((orderReduction == 'word2vec') or
                 (orderReduction == 'fasttext') or (orderReduction == 'glove'))
                    and (tokenType != 'stopped')):
                continue
            else:
                print(tokenType, orderReduction)
                metrics[tokenType][orderReduction] = {}

                if ((orderReduction != 'svd') and (orderReduction != 'none')):
                    wvObject = WordVectors(wordCorpus=wordCorpus,
                                           wordVecSource=orderReduction,
                                           corpusVocab=corpusVocab,
                                           tokenType=tokenType)

                if (orderReduction == 'none'):
                    denseZ = sparseX.toarray()
                elif (orderReduction == 'svd'):
                    denseZ = svdReduce(sparseX, order=300)
                else:
                    argsForTransform = {
                        'sparseX': sparseX,
                        'vocab': corpusVocab
                    }
                    denseZ = Transform2WordVectors(wvObject).transform(
                        argsForTransform)

                normalizer = Normalizer(copy=False)
                denseZ = normalizer.fit_transform(denseZ)
                intraClusterMetrics, centroidMetrics = computeMetrics(
                    denseZ, indexList, tokenType, orderReduction)
                metrics[tokenType][orderReduction][
                    'intraClusterMetrics'] = intraClusterMetrics
                metrics[tokenType][orderReduction][
                    'centroidMetrics'] = centroidMetrics

    with open('./results/' + wordCorpus + '.json', 'w') as f:
        f.write(json.dumps(metrics, ensure_ascii=True))
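Note that json.dumps raises TypeError on numpy arrays and scalars, so this dump only succeeds if computeMetrics returns plain Python values. A hedged fallback, assuming the metrics may contain numpy types:

import json
import numpy as np

def to_jsonable(obj):
    # default= hook: called by json.dumps for objects it cannot serialize.
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    raise TypeError('Not JSON serializable: %r' % type(obj))

out = json.dumps(metrics, ensure_ascii=True, default=to_jsonable)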
Example 4
from wordvectors import WordVectors
import numpy as np

def read_from_senna():
    file_embedding = "/Users/HyNguyen/Documents/Research/Data/senna/embeddings/embeddings.txt"
    file_wordslist = "/Users/HyNguyen/Documents/Research/Data/senna/hash/words.lst"
    with open(file_wordslist, mode="r") as f:
        words = f.readlines()
    with open(file_embedding, mode="r") as f:
        vectors = f.readlines()
    words_embedding = []
    words_index = {}
    for i, (word, vector) in enumerate(zip(words, vectors)):
        # rstrip is safer than word[:-1]: the last line may lack a trailing newline.
        word_2 = word.rstrip("\n")
        vector_2 = vector.rstrip("\n").split()
        vec_np = np.array(vector_2, dtype=np.float32)
        words_embedding.append(vec_np)
        words_index[word_2] = i

    words_embedding = np.array(words_embedding, dtype=np.float32)
    print("words_embedding.shape", words_embedding.shape)
    print("words_index.length", len(words_index))
    wordvectors = WordVectors(50, words_embedding, words_index)  # SENNA vectors are 50-d
    wordvectors.save_text_format("../model/cwvector.txt")

if __name__ == "__main__":
    a = WordVectors.load_from_text_format("../model/cwvector.txt", "CWVector")
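One caveat with read_from_senna: zip stops silently at the shorter of the two files, so a mismatch between words.lst and embeddings.txt would go unnoticed. A guard one could add right after the two readlines() calls (an addition, not part of the original):

assert len(words) == len(vectors), (
    "words.lst and embeddings.txt must be line-aligned: "
    "%d words vs %d vectors" % (len(words), len(vectors)))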



Example 5
import xml.etree.ElementTree as ET
import os
import logging

# Imports needed by the script body below (absent from the original snippet).
import nltk
import numpy as np
from nltk.corpus import treebank, brown
from gensim.models import word2vec

from wordvectors import WordVectors

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


if __name__ == "__main__":
    # Load Word2Vec from Google
    w2v = word2vec.Word2Vec.load_word2vec_format("/Users/HyNguyen/Documents/Research/Data/GoogleNews-vectors-negative300.bin",binary=True)

    # Create object WordVectors

    wordvectors = WordVectors(300,np.empty((0,300),dtype=float),{})

    # wordvectors = WordVectors.load("model/wordvector.txt")

    # Penn Tree Bank
    treebank_sents = treebank.sents()
    for i in range(len(treebank_sents)):
        senttmp = " ".join(treebank_sents[i])
        words = nltk.word_tokenize(senttmp)
        wordvectors.add_wordvector_from_w2vmodel(w2v,words)
    print("Finish penn tree bank corpus, Wordvector size: ", str(wordvectors.embed_matrix.shape[0]))



    # Brown (parallel to the Treebank pass above; the original snippet cuts off here)
    brown_sents = brown.sents()
    for sent in brown_sents:
        senttmp = " ".join(sent)
        words = nltk.word_tokenize(senttmp)
        wordvectors.add_wordvector_from_w2vmodel(w2v, words)
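Word2Vec.load_word2vec_format was deprecated in gensim 1.0 and removed in later releases; on a current gensim the equivalent load goes through KeyedVectors (a sketch, same path assumed):

from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format(
    "/Users/HyNguyen/Documents/Research/Data/GoogleNews-vectors-negative300.bin",
    binary=True)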
Example 6
def main():
    start0 = time.time()

    wordCorpus, min_df, model, tokenType, wordVecSource = processArgs()
    logger.info('Running: WordCorpus: {}, Models: {}, TokenType: {}, min_df: {}, wordVecSource: {}'.format(wordCorpus, model, tokenType, min_df, wordVecSource))

    X, y, classNames = Tokens(wordCorpus).getTokens(tokenType)
    vocabularyGenerator = CountVectorizer(analyzer=lambda x: x, min_df=min_df).fit(X) # This is only to generate a vocabulary with min_df
    corpusVocab = sorted(vocabularyGenerator.vocabulary_, key=vocabularyGenerator.vocabulary_.get)
    logger.info('Total Corpus Size: len(corpusVocab) with frequency > min_df : {}, X.shape: {}, y.shape: {}, # classes: {}'.format(len(corpusVocab), X.shape, y.shape, len(classNames)))
    logger.info('Class Names:{}'.format(classNames))
    if (wordVecSource):
        wvObject = WordVectors(wordCorpus=wordCorpus, wordVecSource=wordVecSource,corpusVocab=corpusVocab,tokenType=tokenType) # nWords_in_this_set X wvLength
    else:
        wvObject = None
    results = {}
    results['timeForDataFetch'] = time.time() - start0
    logger.info('Time Taken For Data Fetch: {}'.format(results['timeForDataFetch']))

    modelRuns = defineModels(min_df, model, wvObject)
    logger.info ('Model Runs:\n{}'.format(modelRuns))

    if (wordCorpus == 'twenty-news'):
        testDataFraction = 0.2
        sss = StratifiedShuffleSplit(n_splits=1, test_size=testDataFraction, random_state=0)
        sss.get_n_splits(X, y)
        for train_index, test_index in sss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
    elif (wordCorpus == 'acl-imdb'):
        X_train, y_train, classNames = Tokens(wordCorpus).getTokens(tokenType,'train')
        X_test, y_test, classNames = Tokens(wordCorpus).getTokens(tokenType,'test')

    marker = 'X y vocab: Train => Test:' + str(X_train.shape) + ',' + str(y_train.shape) + '=>' + str(X_test.shape) + ',' + str(y_test.shape)
    for name, model in modelRuns:
        results[name] = {}
        results[name][marker] = {}
        logger.info('\n\nCurrent Run: {} => {}'.format(name, marker))
        start = time.time()
        logger.info("Training Begin")
        model.fit(X_train, y_train)
        logger.info("Training End")
        logger.info("Prediction Begin")
        predicted = model.predict(X_test)
        logger.info("Prediction End")
        # The confusion matrix is an nclasses x nclasses matrix:
        #   M_ij  => truly 'i' but predicted as 'j'; C_ii => true positives (TP).
        #   class_i => class[groupIndex], so classNames run top->bottom and left->right.
        #   (row i sum) - C_ii    => truly 'i' predicted as NOT 'i' => FN;  C_ii / (row i sum)    = recall
        #   (column i sum) - C_ii => predicted 'i' but NOT truly 'i' => FP;  C_ii / (column i sum) = precision
        results[name][marker]['model_vocabulary_size'] = len(model.named_steps['vectorizer'].model.vocabulary_)
        results[name][marker]['confusion_matrix'] = confusion_matrix(y_test, predicted)
        results[name][marker]['timeForThisModel_fit_predict'] = time.time() - start

        logger.info('Run:{}, {}, Confusion Matrix:\n{}'.format(name, marker, results[name][marker]['confusion_matrix']))
        logger.info('Run:{}, {}, Classification Report:\n{}'.format(name, marker, classification_report(y_test, predicted, target_names=classNames)))
        logger.info('Model Vocab Size:{}'.format(results[name][marker]['model_vocabulary_size']))
        logger.info('Time Taken For This Model Run:{}'.format(results[name][marker]['timeForThisModel_fit_predict']))

    results['overAllTimeTaken'] = time.time() - start0
    logger.info('Overall Time Taken:{}'.format(results['overAllTimeTaken']))
Example 7
def main():
    start0 = time.time()

    wordCorpus, min_df, tokenType, orderReduction, listOfClasses = processArgs()
    classList = list(map(int, listOfClasses.split(',')))
    logger.info(
        'Running: WordCorpus: {}, TokenType: {}, min_df: {}, orderReduction: {}, listOfClasses: {}'
        .format(wordCorpus, tokenType, min_df, orderReduction, classList))

    #    vectorizers = [ ('counts', CountVectorizer(analyzer=lambda x: x, min_df=min_df)), ('tf-idf', TfidfVectorizer(analyzer=lambda x: x, min_df=min_df)) ]
    vectorizers = [('tf-idf',
                    TfidfVectorizer(analyzer=lambda x: x, min_df=min_df))]

    X, indexList = getX(wordCorpus, tokenType, listOfClasses)
    out0 = [tokenType]
    for trueCluster, startEnd in indexList.items():
        out0.append(trueCluster + ':' +
                    str(startEnd['end'] - startEnd['start']))

    # Fit only to generate a vocabulary that honors min_df.
    vocabularyGenerator = CountVectorizer(analyzer=lambda x: x, min_df=min_df).fit(X)
    corpusVocab = vocabularyGenerator.vocabulary_
    logger.info(
        'Total Corpus Size: len(corpusVocab) with frequency > min_df : {}, X.shape: {}, # clusters: {}'
        .format(len(corpusVocab), X.shape, len(classList)))
    if ((orderReduction) and (orderReduction != 'svd')):
        wvObject = WordVectors(wordCorpus=wordCorpus,
                               wordVecSource=orderReduction,
                               corpusVocab=corpusVocab,
                               tokenType=tokenType)

    results = []
    for name, vectorizer in vectorizers:
        logger.info('\n\nVectorizer: {}'.format(name))

        sparseX = vectorizer.fit_transform(X)
        if (not orderReduction):
            denseZ = sparseX
        elif (orderReduction == 'svd'):
            denseZ = svdReduce(sparseX, order=300)
        else:
            argsForTransform = {'sparseX': sparseX, 'vocab': corpusVocab}
            denseZ = Transform2WordVectors(wvObject).transform(
                argsForTransform)
        nClusters = len(classList)

        normalizer = Normalizer(copy=False)
        denseZ = normalizer.fit_transform(denseZ)

        nRuns = 1
        for run in range(nRuns):
            result = out0 + [name, run, orderReduction]
            model = KMeans(n_clusters=nClusters, max_iter=5000, tol=1.0e-8)
            labels = model.fit_predict(denseZ)
            logger.info('\nRun:{}'.format(run))
            # Per-cluster sizes as predicted; np.where returns unique indices,
            # so len() alone counts the members.
            for predictedCluster in range(nClusters):
                result.append(
                    str(predictedCluster) + ':' +
                    str(len(np.where(labels == predictedCluster)[0])))

            for trueCluster, startEnd in indexList.items():
                predictedLabels = labels[startEnd['start']:startEnd['end']]
                for predictedCluster in range(nClusters):
                    count = len(np.where(predictedLabels == predictedCluster)[0])
                    result.append(str(predictedCluster) + ':' + str(count))

            minClusterSeparation = getMinClusterSeparation(
                nClusters, model.cluster_centers_)
            ratio = model.inertia_ / minClusterSeparation
            result = result + [model.inertia_, minClusterSeparation, ratio]
            results.append(result)

    # Let np.savetxt manage the file itself; fmt='%s' stringifies the mixed columns.
    np.savetxt('./results.csv', results, delimiter=", ", fmt='%s')
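getMinClusterSeparation is a project helper not shown in this example; given that model.inertia_ / minClusterSeparation is reported as a quality ratio, it presumably returns the smallest distance between any two centroids. A hypothetical reimplementation under that assumption:

from itertools import combinations
import numpy as np

def getMinClusterSeparation(nClusters, centers):
    # Hypothetical: smallest pairwise Euclidean distance between K-Means centroids.
    return min(np.linalg.norm(centers[i] - centers[j])
               for i, j in combinations(range(nClusters), 2))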