Example no. 1
    def load(self, model_name='kyubyong', df_data=None):
        '''
        Available pre-trained models: lwvlib, kyubyong, wikicorpus
        '''
        if model_name == 'lwvlib':
            # http://dl.turkunlp.org/finnish-embeddings/
            # https://turkunlp.org/finnish_nlp.html --> Finnish internet parsebank
            # http://bionlp-www.utu.fi/wv_demo/ --> online interface
            # https://github.com/fginter/wvlib_light/tree/3471a8db66883769c4e5398806876d5be3e3df24 --> library
            import lwvlib
            self.wv = lwvlib.load("finnish_4B_parsebank_skgram.bin")
            #self.wv=lwvlib.load("finnish_s24_skgram.bin") #,10000,500000)
        elif model_name == 'kyubyong':
            #import gensim
            #https://github.com/Kyubyong/wordvectors
            self.wv = gensim.models.Word2Vec.load(
                'kyubyong_fin_word2vec.bin').wv
        elif model_name == 'wikicorpus':
            wiki_corpus = WikiDumpToCorpus(
                'fiwiki-latest-pages-articles.xml.bz2')
            if os.path.exists('wikicorpus_word2vec.bin'):  # load model
                self.wv = gensim.models.Word2Vec.load(
                    'wikicorpus_word2vec.bin').wv
            elif wiki_corpus.check_corpus():  # train model
                # model = gensim.models.Word2Vec(['testi'], size=200, window=10, min_count=5,
                #             workers=multiprocessing.cpu_count(), iter=2000)
                if df_data is not None and not df_data.empty:
                    # sentences = []
                    # for i, row in df_data.iterrows():
                    #     sentence = gensim.utils.simple_preprocess(row['Nimi'])
                    #     if len(sentence) > 0:
                    #         sentences.append(sentence)
                    # model.build_vocab(sentences)

                    def train_func(train_data):
                        # Word2Vec expects an iterable of sentences (token lists).
                        sentences = [gensim.utils.simple_preprocess(train_data)]
                        model = gensim.models.Word2Vec(
                            sentences,
                            size=200,
                            window=10,
                            min_count=5,
                            workers=multiprocessing.cpu_count(),
                            iter=1)
                        model.save("wikicorpus_word2vec.model")
                        model.wv.save_word2vec_format(
                            "wikicorpus_word2vec.bin", binary=True)
                        #data = gensim.utils.simple_preprocess(train_data)
                        #model.train(data, total_examples=len(data), epochs=100)

                    wiki_corpus.open_corpus(train_func)

            elif os.path.exists(
                    'fiwiki-latest-pages-articles.xml.bz2'):  # make corpus
                wiki_corpus.make_corpus()
            else:  # nothing to work with: no trained model, corpus or wiki dump
                print("Error in 'wikicorpus': no model, corpus or dump file found")
Example no. 2
import numpy as np
import lwvlib


def read_pretrained_vectors(vocabulary, vector_model, max_words=1000000):
    print("Loading pretrained word embeddings from " + vector_model)
    model = lwvlib.load(vector_model, max_words, max_words)
    vectors = np.zeros((len(vocabulary), model.vectors.shape[1]), float)
    for word, idx in vocabulary.items():
        if idx == 0 or idx == 1 or idx == 2:
            continue  # mask and unk and end
        midx = model.get(word)
        if midx is not None:
            vectors[idx] = model.vectors[midx]  # TODO: normalize?

    return vectors
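
A call sketch for read_pretrained_vectors above; the reserved indices follow the mask/unk/end convention of the loop, and the vocabulary contents and model file name are only illustrative assumptions.

# Indices 0-2 are reserved (mask, unk, end); real words start at index 3.
vocabulary = {"<MASK>": 0, "<UNK>": 1, "<END>": 2, "talo": 3, "auto": 4}
embedding_matrix = read_pretrained_vectors(
    vocabulary, "finnish_4B_parsebank_skgram.bin", max_words=100000)
print(embedding_matrix.shape)  # (len(vocabulary), model dimensionality)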
Example no. 3
    def read_vocab(self,fname,vsize=10000):
        """ read vocabulary from wvlib model """
        model=lwvlib.load(fname,vsize,vsize)
        

        self.words=model.words
        self.words.insert(0,"<MASK>")
        self.words.insert(1,"<UNK>")
        self.shape=(len(self.words),model.vectors.shape[1])

        self.word_to_dim=dict((w,i) for i,w in enumerate(self.words))

        self.vectors=np.zeros((vsize+2,model.vectors.shape[1]),float)
        
        for i,row in enumerate(model.vectors):
            self.vectors[i+2]=row
        self.vectors[1]=self.vectors[np.random.randint(2,len(self.words))] # take a random vector for unk # TODO: average of something...
Example no. 4
    def read_vocab(self, fname, vsize=10000):
        """ read vocabulary from wvlib model """
        model = lwvlib.load(fname, vsize, vsize)

        self.words = model.words
        self.words.insert(0, "<MASK>")
        self.words.insert(1, "<UNK>")
        self.shape = (len(self.words), model.vectors.shape[1])

        self.word_to_dim = dict((w, i) for i, w in enumerate(self.words))

        self.vectors = np.zeros((vsize + 2, model.vectors.shape[1]), float)

        for i, row in enumerate(model.vectors):
            self.vectors[i + 2] = row
        self.vectors[1] = self.vectors[np.random.randint(
            2, len(self.words)
        )]  # take a random vector for unk # TODO: average of something...
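
A usage sketch for read_vocab; the snippet does not show the owning class, so the Vocab name below is a placeholder assumption.

vocab = Vocab()                                  # hypothetical owning class
vocab.read_vocab("finnish_4B_parsebank_skgram.bin", vsize=10000)
print(vocab.word_to_dim["<MASK>"])               # 0, reserved padding index
print(vocab.word_to_dim["<UNK>"])                # 1, assigned a random vector above
print(vocab.vectors.shape)                       # (vsize + 2, model dimensionality)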
Example no. 5
def main(args):

    print("Reading",args.char_model,"model",file=sys.stderr)
    model=lwvlib.load(args.char_model)

    vocabulary=read_vocab(args.vocab)
    
    # TODO normalize?


    # words: the words themselves
    all_features=[]
    # data: the vector matrix
    data=np.zeros((len(vocabulary),model.vsize),np.float32)
    
    for feature,wcount in vocabulary:
        vector=np.zeros(model.vsize,np.float32)
        norm=0
        postag,feats=feature.split("|",1)
        for feat in feats.split("|"):
            vidx=model.get(postag+"|"+feat)
            if vidx is not None:
                vector=np.add(vector,model.vectors[vidx])
                norm+=1
        if norm==0:
            print("No vector for feature",feature,"with pos tag",postag,file=sys.stderr)
            continue
        vector=np.divide(vector,norm)
        data[len(all_features),:]=vector
        all_features.append(feature)

    # reshape data to len(words)
    data=data[:len(all_features),:]

    print("Original vector model:",len(model.words),model.vsize,file=sys.stderr)
    print("New vector model:",data.shape,file=sys.stderr)
    assert len(all_features)==data.shape[0]
    
    save_txt(all_features,data,args.output)
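
The vocabulary iterated above is expected to hold (feature string, count) pairs whose "POS|feat" components can be looked up in the character/feature model; a schematic entry, with made-up tag values:

# Illustrative only; the actual tag inventory depends on the treebank.
vocabulary = [
    ("NOUN|Case=Gen|Number=Sing", 42),  # looked up as NOUN|Case=Gen and NOUN|Number=Sing
    ("VERB|Mood=Ind|Tense=Past", 17),
]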
Example no. 6
import pickle
import sys

import numpy as np
import lwvlib


def replace_embeddings(host, donor, out):
    """
    Replaces donor embeddings in host, writes to out.

    host: hnmt/bnas Embeddings submodel file name (input)
    donor: word2vec .bin file name
    out: hnmt/bnas Embeddings submodel file name (output)
    """
    with open(host, "rb") as hostf:
        host_config = pickle.load(hostf)
        host_data = pickle.load(hostf)
        host_matrix = host_data[("w", )]
        print("...{} loaded".format(host), file=sys.stderr, flush=True)
    donor_model = lwvlib.load(donor)
    donor_matrix = donor_model.vectors.astype(host_matrix.dtype)
    assert host_matrix.shape[1] == donor_matrix.shape[1], (
        host_matrix.shape[1], donor_matrix.shape[1])
    print("...{} loaded".format(donor), file=sys.stderr, flush=True)
    print("Sum before exchange",
          np.sum(host_data[("w", )]),
          file=sys.stderr,
          flush=True)
    #Replace
    for w, host_dim in host_config["src_encoder"].index.items():
        donor_dim = donor_model.get(w)
        if donor_dim is None:
            print("Cannot map {}".format(w), file=sys.stderr, flush=True)
            continue
        host_matrix[host_dim] = donor_matrix[donor_dim]
    #Done replacing now
    with open(out, "wb") as outf:
        pickle.dump(host_config, outf)
        pickle.dump(host_data, outf)
    print("Sum after exchange",
          np.sum(host_data[("w", )]),
          file=sys.stderr,
          flush=True)
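
A call sketch for replace_embeddings; all three file names are placeholders rather than files referenced by the original code.

replace_embeddings(
    "encoder_embeddings.pkl",             # hnmt/bnas Embeddings submodel (input)
    "finnish_4B_parsebank_skgram.bin",    # word2vec donor vectors
    "encoder_embeddings.replaced.pkl")    # rewritten submodel (output)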
Example no. 7
        random.shuffle(shuffle)
        data_matrix=data_matrix[shuffle]
        class_indices=class_indices[shuffle].astype(np.int32)
        return class_indices, data_matrix

if __name__=="__main__":
    import argparse
    parser = argparse.ArgumentParser(description='Sclassifier')
    parser.add_argument('--embeddings', help='embeddings file')
    parser.add_argument('--edim', type=int, default=200000, help='Max number of word vectors to load')
    parser.add_argument('--tr-data', help='training data')
    parser.add_argument('--max-seq-len', type=int, default=30, help='Maximum sequence length')
    args = parser.parse_args()

    #source of embeddings
    wv_model=lwvlib.load(args.embeddings,args.edim,args.edim)

    d=Data(wv_model)
    with open(args.tr_data) as tr_f:
        class_indices, data_matrix=d.read(tr_f,args)
        class_indices_1hot=keras.utils.to_categorical(class_indices)

    
    inp_seq=Input(shape=(args.max_seq_len,), name="words", dtype='int32')
    inp_embeddings=Embedding(*wv_model.vectors.shape, input_length=args.max_seq_len, mask_zero=False, weights=[wv_model.vectors])
    inp_embeddings.trainable=False
    text_src=inp_embeddings(inp_seq)

    #gru1_out=GRU(100,name="gru1")(text_src)
    cnn1_out=Conv1D(100,2,padding="same")(text_src)
Example no. 8
    """Matrix words x dimensions, returns"""
    vectors = sklearn.preprocessing.normalize(vectors)
    top_n = np.argpartition(-np.abs(vectors), max_count, 0)
    top_per_dim = (top_n.T)[:, :max_count]  #dims x max_count
    for dim in range(top_per_dim.shape[0]):
        print(dim, "   ", end=' ')
        for w in top_per_dim[dim]:
            print(model.words[w], end=' ')
        print()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='ICA postprocessing for w2v')
    parser.add_argument('--w2v-model',
                        required=True,
                        action="store",
                        help='W2V model')
    parser.add_argument('--ica-model',
                        required=True,
                        action="store",
                        help='ICA model')
    parser.add_argument('--max-rank',
                        type=int,
                        default=200000,
                        help='Max rank default %(default)d')

    args = parser.parse_args()
    model = lwvlib.load(args.w2v_model, args.max_rank, args.max_rank)
    ica_matrix = np.load(args.ica_model)
    dimension_extremes(ica_matrix, 15, model)
Example no. 9
    parser.add_argument("--max-rank-mem",
                        type=int,
                        default=None,
                        help='Max vectors in memory')
    parser.add_argument("--max-rank",
                        type=int,
                        default=None,
                        help='Max vectors total')
    parser.add_argument("--interactive",
                        default=False,
                        action="store_true",
                        help='Read interactive')
    args = parser.parse_args()

    models = [
        lwvlib.load(m, max_rank_mem=args.max_rank_mem, max_rank=args.max_rank)
        for m in args.models
    ]
    while True:
        if args.interactive:
            w = input("> ")
        else:
            w = sys.stdin.readline().strip()
        if not w:
            break
        nearest = [m.nearest(w, 200) for m in models]
        if any(n is None for n in nearest):
            print("Out of vocab")
            continue
        for nn in zip(*nearest):
            for sim, neighb in nn:
Example no. 10
import finmeter, random, lwvlib, json, re
from uralicNLP import semfi
from uralicNLP import uralicApi
from nltk.tokenize import RegexpTokenizer
from collections import Counter
import numpy as np
from numpy.random import choice
from markovchain import JsonStorage
from markovchain.text import MarkovText, ReplyMode

wv = lwvlib.load("fin-word2vec-lemma.bin", 10000, 500000)
vowels = ['a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö']
markov = MarkovText.from_file('kalevala_and_others_markov.json')
fallback_markov = MarkovText.from_file('fallback_markov.json')


def count_syllables(verse):
    ''' Count syllables in a verse '''

    tokenizer = RegexpTokenizer(r'\w+')
    n_syllables = 0
    for word in tokenizer.tokenize(verse):

        try:
            n_syllables += len(finmeter.hyphenate(word).split("-"))
        except Exception as e:
            pass
            # print(e)
            # print(verse)
            # print("Error täällä: count_syllables")
Example no. 11
import sys, json, gzip
import lwvlib
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def check_nearest_words(vector, vector_i, wv):
    wv_vectors = wv.vectors
    wv_words = wv.words
    sims = cosine_similarity(np.array(vector).reshape(1, -1), wv_vectors)[0]
    indexes = np.argsort(sims)[::-1]
    for index in indexes[:10]:
        print(wv_words[index])

    print()

    #print(vector)
    #print(wv.nearest(vector))


if __name__ == "__main__":
    wv = lwvlib.load("/home/ginter/w2v/pb34_wf_200_v2.bin", max_rank=400000)

    for line in sys.stdin:
        vectors = json.loads(line)
        for vector_i, vector in enumerate(vectors):
            check_nearest_words(vector, vector_i, wv)
Example no. 12
    ordered = []  #(major,minor)
    sents = {}  #(major,minor) -> Sentiment()
    for line in inp:
        line = line.strip().lower()
        if not line:
            continue
        if line.startswith("## "):
            minor = line.split()[1]
            ordered.append((major, minor))
        elif line.startswith("# "):
            major = line.split()[1]
        else:
            words = set(commaRe.split(line))
            sents[(major, minor)] = Sentiment(major, minor, words)
    return ordered, sents


if __name__ == "__main__":
    wv = lwvlib.load("../w2v/pb34_wf_200_v2_skgram.bin", 1000000, 1000000)
    ordered, sents = get_sents(sys.stdin)
    for s in sents.values():
        s.expand(wv, 100)
    m = None
    for (major, minor) in ordered:
        if major != m:
            print("#", major)
            print()
            m = major
        print(sents[(major, minor)])
        print()
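
The get_sents parser reads a lexicon from stdin in which "# " lines open a major category, "## " lines open a minor category, and the remaining lines list comma-separated seed words; a schematic input with placeholder names:

# majorcat1
## minorcat1
word1, word2, word3
## minorcat2
word4, word5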
Example no. 13
import lwvlib

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Train')
    parser.add_argument('--orig',
                        required=True,
                        help='A (.bin) embedding file before training.')
    parser.add_argument(
        '--trained',
        required=True,
        help=
        'A (.bin) embedding file after training (produced by tools/extract_embeddings).'
    )
    args = parser.parse_args()

    orig_m = lwvlib.load(args.orig)
    trained_m = lwvlib.load(args.trained)

    df = orig_m.vectors - trained_m.vectors
    print(df)
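
A possible follow-up to the diff above (not part of the original script): rank which words moved most during training, assuming both models list their words in the same order, which the element-wise subtraction already presumes.

import numpy as np

drift = np.linalg.norm(df, axis=1)     # per-word L2 norm of the change
for i in np.argsort(-drift)[:10]:      # ten most-changed words
    print(orig_m.words[i], drift[i])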