def __init__(self, sentences=None, size=300, alpha=0.025, window=8, min_count=5,
             sample=0, seed=1, workers=1, min_alpha=0.0001, dm=1, hs=1, negative=0,
             dm_mean=0, train_words=True, train_lbls=True, **kwargs):
    """
    Initialize the model from an iterable of `sentences`. Each sentence is a
    LabeledSentence object that will be used for training.
    The `sentences` iterable can be simply a list of LabeledSentence elements, but for larger corpora,
    consider an iterable that streams the sentences directly from disk/network.
    If you don't supply `sentences`, the model is left uninitialized -- use this if
    you plan to initialize it in some other way.
    `dm` defines the training algorithm. By default (`dm=1`), distributed memory (PV-DM) is used;
    otherwise, distributed bag of words (PV-DBOW) is employed.
    `size` is the dimensionality of the feature vectors.
    `window` is the maximum distance between the current and predicted word within a sentence.
    `alpha` is the initial learning rate (will linearly drop to zero as training progresses).
    `seed` = seed for the random number generator.
    `min_count` = ignore all words with total frequency lower than this.
    `sample` = threshold for configuring which higher-frequency words are randomly downsampled;
            default is 0 (off), useful value is 1e-5.
    `workers` = use this many worker threads to train the model (faster training on multicore machines).
    `hs` = if 1 (default), hierarchical softmax will be used for model training (else set to 0).
    `negative` = if > 0, negative sampling will be used; the int for `negative`
            specifies how many "noise words" should be drawn (usually between 5 and 20).
    `dm_mean` = if 0 (default), use the sum of the context word vectors; if 1, use the mean.
            Only applies when `dm` is used.
    """
    Word2Vec.__init__(self, size=size, alpha=alpha, window=window, min_count=min_count,
                      sample=sample, seed=seed, workers=workers, min_alpha=min_alpha,
                      sg=(1+dm) % 2, hs=hs, negative=negative, cbow_mean=dm_mean, **kwargs)
    self.train_words = train_words
    self.train_lbls = train_lbls
    if sentences is not None:
        self.build_vocab(sentences)
        self.train(sentences)
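
A minimal usage sketch for the constructor above, assuming the same pre-1.0 gensim API (LabeledSentence objects with a `labels` list; newer releases use TaggedDocument and different parameter names):

from gensim.models.doc2vec import Doc2Vec, LabeledSentence

docs = [
    LabeledSentence(words=['the', 'quick', 'brown', 'fox'], labels=['SENT_0']),
    LabeledSentence(words=['jumps', 'over', 'the', 'lazy', 'dog'], labels=['SENT_1']),
]
model = Doc2Vec(docs, size=100, window=8, min_count=1, workers=1)
print(model['SENT_0'])   # in this old API, label vectors are indexed just like words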
Example No. 2
def gene():
    modelpre=Word2Vec.load('corpus/pretrain40.model')
    modelfield=Word2Vec.load('corpus/fieldtrained40.model')
    modelmerged=Word2Vec.load('corpus/mergedtrained40iter1.model')
    xlist=[]
    ylist=[]
    zlist=[]
    labellist=[]
    upperline=0.016
    floor=0.008 #0.01 0.013
    upperlinefield=0.06
    floorfield=0.02
    upperlinepre=0.019
    floorpre=0.018
    with open('corpus/word2pic2.txt') as fp:
        for row in fp:
            word=unicode(row[:-1])
            x=(modelmerged.similarity(word,u"好")+modelmerged.similarity(word,u"快乐")+modelmerged.similarity(word,u"开心"))/3.0-(modelmerged.similarity(word,u"坏")+modelmerged.similarity(word,u"悲伤"))/2.0
            y=(modelfield.similarity(word,u"好")+modelfield.similarity(word,u"快乐")+modelfield.similarity(word,u"开心"))/3.0-(modelfield.similarity(word,u"坏")+modelfield.similarity(word,u"悲伤"))/2.0
            z=(modelpre.similarity(word,u"好")+modelpre.similarity(word,u"快乐")+modelpre.similarity(word,u"开心"))/3.0-(modelpre.similarity(word,u"坏")+modelpre.similarity(word,u"悲伤"))/2.0
            labellist.append(word)
            # xlist.append(x-(upperline+floor)/2.0)
            xlist.append(x-0.016)
            ylist.append(y-(upperlinefield+floorfield)/2.0)
            zlist.append(z-(upperlinepre+floorpre)/2.0)
    # with open('corpus/word2picxyz.txt','w') as fp:
    #     pickle.dump(labellist,xlist,ylist,zlist,fp)
    return labellist,xlist,ylist,zlist
Example No. 3
def dis(vectorsize):
    # print model.similarity("今天","在")
    model=Word2Vec.load('corpus/mergedtrained'+str(vectorsize)+'iter1'+'.model')
    modelfield=Word2Vec.load('corpus/fieldtrained'+str(vectorsize)+'.model')
    print model.similarity(u"分手",u"好")
    print model.similarity(u"分手",u"坏")
    print modelfield.similarity(u"分手",u"好")
    print modelfield.similarity(u"分手",u"坏")
Example No. 4
def load_word2vec(w2v):
    if isinstance(w2v, str):
        print("Loading word vectors from '%s'..." % w2v, flush=True)
        try:
            w2v = Word2Vec.load_word2vec_format(w2v)
        except ValueError:
            w2v = Word2Vec.load_word2vec_format(w2v, binary=True)
    return w2v
Example No. 5
def intersect(vectorsize):
    model=Word2Vec.load('corpus/fieldtrained'+str(vectorsize)+'.model')
    # setwordwindow(vectorsize)
    print 'finish load'
    Word2Vec.intersect_word2vec_format(model,'corpus/initindex'+str(vectorsize),binary=False)
    print 'finish intersect'
    model.save('corpus/merged'+str(vectorsize)+'.model')
    model.save_word2vec_format('corpus/merged'+str(vectorsize), binary=False)
    print 'finish save'
def get_model(model_num, model_names):
    
    
    if model_num < 10:
        model = Word2Vec.load(model_path + model_names)
    elif model_num < 99:
        model = Doc2Vec.load(model_path + model_names)
    else:
        model = Word2Vec.load_word2vec_format(model_path + model_names, binary=True)  # C binary format
    return model
Example No. 7
    def train(self, sentences, total_examples=None, total_words=None,
              epochs=None, start_alpha=None, end_alpha=None,
              word_count=0, queue_factor=2, report_delay=1.0):
        self.neg_labels = []
        if self.negative > 0:
            # precompute negative labels optimization for pure-python training
            self.neg_labels = zeros(self.negative + 1)
            self.neg_labels[0] = 1.

        Word2Vec.train(self, sentences, total_examples=self.corpus_count, epochs=self.iter,
            start_alpha=self.alpha, end_alpha=self.min_alpha)
        self.get_vocab_word_vecs()
Example No. 8
def main():
    # te()
    # teword()
    # intersect(40)
    # setwordwindow(40)
    # Word2Vec.load_word2vec_format('corpus/initindex40',binary=False)
    
    modelpre=Word2Vec.load('corpus/pretrain40.model')
    modelfield=Word2Vec.load('corpus/fieldtrained40.model')
    modelmerged=Word2Vec.load('corpus/mergedtrained40iter1.model')
    print 'finish load'
    classify(modelpre,modelfield,modelmerged,40)
Example No. 9
def teword():
    # model=Word2Vec.load_word2vec_format('vectorseg.bin',binary=False)
    # sim=model.most_similar(positive=[u'好',u'开心'],negative=[u'下雨'],topn=2)
    # print sim
    documents=[u"今天 天气 真是 好 啊",u"明天 就要 下雨 了,伐 开心"]
    model=Word2Vec(documents,size=20,window=5,min_count=1)
    sim=model.most_similar(positive=[u"好"],topn=2)
    # model.save('./tmp/tevec')
    print sim

    model=Word2Vec.load_word2vec_format('vectorseg.bin',binary=False)
    Word2Vec.intersect_word2vec_format(model,'fieldvec.bin',binary=False)
    Word2Vec.train_batch_sg(model, sentences, alpha, work=None)
Example No. 10
 def load_external(self, model_file_name):
     """
     load a word2vec model from the file specified
     :param model_file_name: name of the model file
     :return:
     """
     self.model = Word2Vec.load(model_file_name)
Example No. 11
def main():
    industry = sys.argv[1]
    vocab_file = "../data/" + industry + "/embed_vocab"
    model_file = "../data/" + industry + "/user_model"
    # load vocab list
    with open(vocab_file) as f:
        vocab_list = map(str.strip, f.readlines())
    # load model
    model = Word2Vec.load(model_file)

    # build vocab index dict
    vob_index_dict = {}
    for i, vob in enumerate(vocab_list):
        vob_index_dict[vob] = i

    # calc vocab dist
    logging.info("calucating vocab dist matrix")
    dm = get_vocab_dist_matrix(vocab_list, model)

    # get company domain list dict
    comp_domain_file = "../data/" + industry + "/company_file"
    comp_dict = get_comp_dict(comp_domain_file)
    logging.info("company dict generated : " + str(comp_dict.keys()))

    # delete domain not exist in vocab list
    filter_company_by_vocab(comp_dict, vocab_list)

    # filter company domain by uv : default uv > 100
    filter_action_by_uv(comp_dict, 100)

    # calc dist between two company
    res_file = "../data/" + industry + "/company_dist"
    calc_company_dist(res_file, comp_dict, dm, vob_index_dict)
Example No. 12
    def __init__(self):

        '''
        Training parameters:
        '''

        self.w2v_dim=100
        self.num_feature=400
        self.batch_size=16
        self.num_epoch=30

        # self.w2v_model=Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
        self.w2v_model=Word2Vec.load('./data/word2vec/w2v.model')

        self.index2word_set = set(self.w2v_model.index2word)

        #self.bigram=None
        #self.trigram=None

        self.bigram=Phrases.load('./data/bigram.dat')
        self.trigram=Phrases.load('./data/trigram.dat')

        print('Build model...')

        self.model = Sequential()
        self.model.add(Dropout(0.2,input_shape=(self.num_feature,)))
        self.model.add(Dense(3, input_dim=self.num_feature, init='orthogonal'))
        self.model.add(Activation('softmax'))


        self.model.compile(loss='categorical_crossentropy', optimizer='adam', class_mode="categorical")

        print('Model has been built!')
Example No. 13
def return_data(data_type, embed_dim=50): 
    """Return the data specified by the inputted `data_type`.

    This function is built to allow for easier calls for the data from scripts
    external to this one. 

    Args: 
    ----
        data_type: str
        embed_dim (optional): int

    Return: varied
    """

    if data_type == "word_embedding": 
        embedding_fp = 'data/word_embeddings/glove.6B.{}d.txt'.format(embed_dim)
        wrd_embedding = Word2Vec.load_word2vec_format(embedding_fp, binary=False)
        return wrd_embedding
    elif data_type == "articles": 
        body_fp = 'data/articles/twenty_newsgroups/bodies.pkl'
        headline_fp = 'data/articles/twenty_newsgroups/headlines.pkl'

        with open(body_fp, 'rb') as f: 
            bodies = pickle.load(f)
        with open(headline_fp, 'rb') as f: 
            headlines = pickle.load(f)
        return bodies, headlines
    else: 
        raise Exception('Invalid data type requested!')
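
A hedged usage sketch of the loader above; the relative paths are built inside the function, so the files under data/ are assumed to exist in the working directory:

wrd_embedding = return_data("word_embedding", embed_dim=100)   # GloVe vectors at the path built inside the function
bodies, headlines = return_data("articles")                    # pickled newsgroup bodies and headlines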
Example No. 14
def get_init_data(model_file, ark_file):
    from gensim.models.word2vec import Word2Vec

    model = Word2Vec.load_word2vec_format(model_file, binary=False)
    ark_clusters = get_ark_clusters(ark_file)

    files = [resource_filename('twitter_dm', 'data/identity_dictionaries/identity/'+x) for x in
             resource_listdir('twitter_dm', 'data/identity_dictionaries/identity/')]

    files += [resource_filename('twitter_dm', 'data/identity_dictionaries/non_identity_words/'+x) for x in
             resource_listdir('twitter_dm', 'data/identity_dictionaries/non_identity_words/')]

    all_dictionaries = Dictionaries(list_of_files=files)
    twit_sets = []
    stopwords = get_stopwords()

    tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat(None)

    for v in [10, 100, 1000, 10000,50000]:
        twit_id = set(tw_distant_supervision_identity_dat[
                      (tw_distant_supervision_identity_dat.tot > v)].term.values)
        twit_id = {t for t in twit_id if t not in stopwords and t.replace(" person","") not in stopwords}
        twit_sets.append([twit_id,"twit_identities_"+str(v)])

    twit_sets.append([EXPERT_NON_IDENTITIES,"expert_non"])
    twit_sets.append([stopwords,"stopword"])

    return model, all_dictionaries, ark_clusters, [t[0] for t in twit_sets],[t[1] for t in twit_sets]
Example No. 15
def load_vectors():
    print("loading word2vec vectors...")
    t0 = time()
    model = Word2Vec.load_word2vec_format('/Volumes/Seagate Backup Plus Drive/MacFilesThatICantFit/GoogleNews-vectors-negative300.bin', binary = True)
    loadTime = time() - t0
    print("word2vec vectors loaded in %0.3f seconds" % loadTime)
    print()

    # done "training" the model; we can do the following to trim uneeded memory
    t0 = time()
    print("trimming model memory...")
    model.init_sims(replace=True)
    trimTime = time() - t0
    print("trimmed memory in %0.3f seconds" % trimTime)
    print()

    vec = model['hello']

    print('type of vector')
    print(type(vec))
    print('vector')
    print(vec)

    sys.exit(1)

    return model
Example No. 16
def return_data(data_type, embed_dim=50): 
    """Return the data specified by the inputted `data_type`.

    This function is built to allow for easier calls for the data from scripts
    external to this one. 

    Args: 
    ----
        data_type: str
        embed_dim (optional): int

    Return: varied
    """

    if data_type == "word_embedding": 
        embedding_fp = 'data/word_embeddings/glove.6B.{}d.txt'.format(embed_dim)
        wrd_embedding = Word2Vec.load_word2vec_format(embedding_fp, binary=False)
        return wrd_embedding
    elif data_type == "reviews": 
        reviews_fp = 'work/reviews/amazon/filtered_tokenized_reviews.pkl'
        ratios_fp = 'work/reviews/amazon/filtered_ratios.npy'

        with open(reviews_fp, 'rb') as f: 
            reviews = pickle.load(f)
        ratios = np.load(ratios_fp)
        return reviews, ratios 
    else: 
        raise Exception('Invalid data type requested!')
Example No. 17
    def __init__(self, *args, **kwargs):

        '''
        Computes various measures of central tendency of a document.
        For Z_X scores, the raw word tokens are summed over the partition
        function. For I_X scores, the same statistics are computed over
        the similarity of all word pairs for words with top 10% Z values.
        This will precompute the partition function if it doesn't exist.
        '''
        cfg_embed = kwargs["embedding"]
        cfg_score = kwargs["score"]

        f_w2v = os.path.join(
            cfg_embed["output_data_directory"],
            cfg_embed["w2v_embedding"]["f_db"],
        )

        f_partition_function = os.path.join(
            cfg_embed["output_data_directory"],
            cfg_score["document_log_probability"]["f_partition_function"],
        )
        
        if not os.path.exists(f_partition_function):
            self.create_partition_function(f_w2v, f_partition_function)

        self.Z = self.load_partition_function(f_partition_function)
        self.scores = []

        val = cfg_score["document_log_probability"]["intra_document_cutoff"]
        self.intra_document_cutoff = float(val)

        self.model = Word2Vec.load(f_w2v)
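
A sketch of the nested kwargs layout this constructor appears to expect; the key names come from the lookups above, while every value and the class name are hypothetical:

config = {
    "embedding": {
        "output_data_directory": "data/embeddings",
        "w2v_embedding": {"f_db": "w2v.gensim"},
    },
    "score": {
        "document_log_probability": {
            "f_partition_function": "partition_function.h5",
            "intra_document_cutoff": 0.85,
        },
    },
}
scorer = DocumentScorer(**config)   # hypothetical name for the class defining the __init__ above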
Example No. 18
 def initialize(self):
     sys.stdout.write("Metric initialization\n")
     sys.stdout.write("1 - Word2vec model")
     self.model = Word2Vec.load(model_path)
     sys.stdout.write("...loaded\n")
     sys.stdout.write("2 - Stop words")
     self.stop_words = [line.strip('\n') for line in open(stop_words_path)]
     sys.stdout.write("...loaded\n")
     sys.stdout.write("3 - Word-Averages model: ")
     self.wordAverages = defaultdict()
     for i in self.files_list:
         sys.stdout.write(str(i) + " - ")
         sys.stdout.flush()
         tweetsFile = tweets_path + str(i) + ".csv"
         wAvgsFile = wAvgs_path + str(i) + ".csv"
         tweets = []
         values = []
         with open(tweetsFile, 'r') as f1: 
             tweets = f1.readlines()
             f1.close()
         with open(wAvgsFile, 'r') as f2: 
             reader = csv.reader(f2)
             for r in reader:
                 values.append( np.array([ float(v) for v in r  ]) )
             f2.close()
         for j in range(len(tweets)):   
             self.wordAverages[ tweets[j].strip('\n')  ] = values[j]
     sys.stdout.write("loaded\n")
Example No. 19
    def create_partition_function(self, f_w2v, f_h5):
        print "Building the partition function"
        
        # Load the model from disk
        M = Word2Vec.load(f_w2v)

        words = M.index2word
        ZT = []
        INPUT_ITR = tqdm.tqdm(words)

        # Compute the partition function for each word
        for w in INPUT_ITR:
            UE = self.energy(M.syn0, M[w])
            z  = compute_partition_stats(UE)
            ZT.append(z)

        # Save the partition function to disk
        # (special care needed for h5py unicode strings)
        dt = h5py.special_dtype(vlen=unicode)

        with h5py.File(f_h5,'w') as h5:
                       
            h5.create_dataset("words", (len(words),),
                              dtype=dt,
                              data=[w.encode('utf8') for w in words])

            h5.attrs['vocab_N'] = len(words)
            h5['Z'] = ZT            
Example No. 20
def get_predict_vecs(words):
    n_dim = 300
    imdb_w2v = Word2Vec.load('svm_data/w2v_model/w2v_model.pkl')
    #imdb_w2v.train(words)
    train_vecs = buildWordVector(words, n_dim,imdb_w2v)
    #print train_vecs.shape
    return train_vecs
Example No. 21
def build_word_graph(model_fname, limiar=0.2):
    """
    Constroi um grafo de walavras ponderado pela similaridade entre elas
    de acordo com o modelo.
    :param model_fname: Nome do arquivo com o modelo word2vec como foi salvo
    :return: objeto grafo
    """
    m = Word2Vec.load(model_fname)
    g = Graph()
    freq = g.new_vertex_property("int")
    weight = g.new_edge_property("float")
    i = 0
    vdict = {}
    for w1, w2 in combinations(m.vocab.keys(), 2):
        if w1 == '' or w2 == '':
            continue
        # print(w1,w2)

        v1 = g.add_vertex() if w1 not in vdict else vdict[w1]
        vdict[w1] = v1
        freq[v1] = m.vocab[w1].count
        v2 = g.add_vertex() if w2 not in vdict else vdict[w2]
        vdict[w2] = v2
        freq[v2] = m.vocab[w2].count
        sim = m.similarity(w1, w2)
        if sim > limiar:  # use the threshold parameter instead of a hard-coded value
            e = g.add_edge(v1, v2)
            weight[e] = sim
        if i > 10000:
            break
        i += 1
    g.vertex_properties['freq'] = freq
    g.edge_properties['sim'] = weight
    return g
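
A hedged usage sketch; the model path is hypothetical and the returned object is a graph-tool Graph, as built above:

g = build_word_graph('corpus/mergedtrained40iter1.model', limiar=0.2)
print(g.num_vertices(), g.num_edges())   # vertices carry 'freq', edges carry 'sim'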
Example No. 22
    def __init__(self,*args,**kwargs):
        super(generic_document_score, self).__init__(*args,**kwargs)

        f_w2v = os.path.join(
            kwargs["embedding"]["output_data_directory"],
            kwargs["embedding"]["w2v_embedding"]["f_db"],
        )

        # Load the model from disk
        self.M = Word2Vec.load(f_w2v)
        self.shape = self.M.syn0.shape
        
        # Build the dictionary
        vocab_n = self.shape[0]
        self.word2index = dict(zip(self.M.index2word,range(vocab_n)))
        
        # Set parallel option (currently does nothing)
        self._PARALLEL = kwargs["_PARALLEL"]

        # Load the negative weights
        if "negative_weights" in kwargs:
            neg_W = kwargs["negative_weights"]
            self.neg_W = dict((k, float(v)) for k,v in neg_W.items())
        else:
            self.neg_W = {}
Example No. 23
    def from_word2vec_model(cls, word2vec_model):
        """
        WARNING: `gensim` is required to use this function!

        Load a word2vec vector model.
        :param word2vec_model: path to word2vec model or a fitted word2vec model
        :return: a `Vectors` object
        """
        try:
            import gensim # gensim version hack
            if (int(gensim.__version__.split('.')[0]) < 1):
                from gensim.models.word2vec import Word2Vec as Word2VecLoader
            else:
                from gensim.models import KeyedVectors as Word2VecLoader
        except ImportError as ex:
            logging.error('Gensim is required to use this method!')
            raise ex

        if (isinstance(word2vec_model, str)):
            model = Word2VecLoader.load_word2vec_format(word2vec_model, binary=word2vec_model.endswith('bin'))
        else:
            model = word2vec_model

        vocab = model.vocab.keys()

        vectors = {}

        dims = len(model[next(iter(vocab))])  # vector dimensionality

        dimension_names = ['f%02d' % i for i in range(dims)]
        for word in vocab:
            vectors[word] = zip(dimension_names, model[word])

        return Vectors(vectors)
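
A hedged usage sketch; the path is hypothetical, and `Vectors` is whatever class defines the classmethod above (binary loading is inferred from the .bin suffix):

vectors = Vectors.from_word2vec_model('GoogleNews-vectors-negative300.bin')
# or pass an already-loaded gensim model / KeyedVectors instance:
# vectors = Vectors.from_word2vec_model(preloaded_model)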
Example No. 24
def vectorize(model_file, dictionary_file, corpus_file):
  seterr(all='raise')  # don't ignore numpy errors

  #load model from given file
  model = Word2Vec.load(model_file)
  dictionary = corpora.Dictionary().load(dictionary_file)
  corpus = corpora.MmCorpus(corpus_file)
  tfidf = models.TfidfModel(corpus)
  d = corpora.Dictionary()
  d = d.load(dictionary_file)
  corpus = corpora.MmCorpus(corpus_file)
  tf = models.TfidfModel(corpus)
  vectorize = []
  for doc_no, tdoc in enumerate(tf[corpus]):
    tdoc.sort(key=lambda kv: kv[1], reverse=True)
    if doc_no % 100 == 0:
          logger.info("PROGRESS: vectorizing user #%i of %i" %
              (doc_no, len(corpus)))
    words_per_user = 8
    word_vecs = []
    for wordid, measure in tdoc:
      word = d[wordid]
      if word in model:
        word_vecs.append(model[word])
        print word
      if len(word_vecs)>=words_per_user:
        break

    if len(word_vecs)==words_per_user:
      avg = matutils.unitvec(array(word_vecs).mean(axis=0)).astype(REAL)
      vectorize.append(avg)
      #print [word for word, measure in model.most_similar_from_array(avg, topn=5)]
  
  return vectorize
Example No. 25
 def __init__(self, word2vec_path=""):
     self.sentence = []
     self.tfidf_sparse = []
     self.bi_set = [-1 for i in range(1000000)]
     self.tfidf_model_dict = {}
     if word2vec_path != "":
         self.word2vec_model = Word2Vec.load(word2vec_path)        
Example No. 26
def term_expansion(fpath, terms, knn):
    '''Expand term list by creating list of nearest neighbors in provided embeddings
    representation. This is usually very noisy and there is a fuzzy distinction between
    semantic similarity and "relatedness". Bacteria names, for example, often neighbor
    diseases caused by those organisms.
    '''
    model = Word2Vec.load(fpath)
    model.init_sims()
    nbrs = NearestNeighbors(n_neighbors=knn+1, algorithm='ball_tree', metric='l2')
    nbrs.fit(model.syn0norm)
    
    expansion = []
    for phrase in terms:
        # space replaced with underscore in PMC/PubMed embeddings
        phrase = phrase.replace(" ","_")
        if phrase not in model.vocab:
            continue
        idx = model.vocab[phrase].index
        vec = model.syn0norm[idx]
        _,indices = nbrs.kneighbors(vec)
        neighbors = [model.index2word[j] for j in indices.flatten()]
        neighbors.remove(phrase)
        expansion += neighbors
    
    # transform words back to whitespace separators 
    return map(lambda x:x.replace("_"," "), expansion)
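
A hedged usage sketch of the expansion helper above; the embedding path and the query terms are hypothetical:

expanded = term_expansion('pubmed_w2v.model', ['escherichia coli', 'breast cancer'], knn=5)
print(list(expanded))   # wrap in list() since the function returns map()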
Example No. 27
def wordEmbedding():
    """
    These code is from 
    http://vene.ro/blog/word-movers-distance-in-python.html
    """
    if not os.path.exists("data/embed.dat"):
	    print ("Caching word embeddings in memmapped format...")
	    from gensim.models.word2vec import Word2Vec
	    wv = Word2Vec.load_word2vec_format("/home/medialab/NLP_data/GoogleNews-vectors-negative300.bin.gz", binary = True)
	    fp = numpy.memmap("data/embed.dat", dtype=numpy.double, mode='w+', shape=wv.syn0.shape)
	    fp[:] = wv.syn0[:]
	    with open("data/embed.vocab", "w") as f:
		    for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
			    print >> f, unidecode(w)
			    pass
	    del fp, wv
	

    W = numpy.memmap("data/embed.dat", dtype=numpy.double, mode="r", shape=(3000000, 300))
    with open("data/embed.vocab") as f:
	    vocab_list = map(str.strip, f.readlines())
    
  
    vocab_dict = {w:k for k,w in enumerate(vocab_list)}
    return W, vocab_dict
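
A hedged, illustrative call of the memmap cache above; 'hello' is just an example query word:

W, vocab_dict = wordEmbedding()
vec = W[vocab_dict['hello']]   # 300-dimensional GoogleNews vector for 'hello'
print(vec.shape)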
Example No. 28
def query_word_similarity(model_file, word1, word2):
  seterr(all='raise')  # don't ignore numpy errors

  #load model from given file
  model = Word2Vec.load(model_file + '.model')
  similarity = model.similarity(word1,word2)
  logging.info("similarity of \'%s\' and \'%s\' is %f" % (word1,word2,similarity))
Example No. 29
    def __init__(self, tokenWeights = True, extraFeatures = True, EXTRA_WEIGHTS_LABELS = [
    'bleuScore', 'similarityScore', 'wordMoversDistance', 'crossUnigramsRatio']):
        self.words = {}
        self.words2 = {}  # hypothesis words
        self.wordId = 0
        self.wordId2 = 0  # hypothesis
        self.extraFeatures = {} # for our new features
        self.docId = 0
        self.documents = {}
        self.tokenWeights = tokenWeights
        self.extraFeatures = extraFeatures
        self.EXTRA_WEIGHTS_LABELS = EXTRA_WEIGHTS_LABELS
        #####################
        if not os.path.exists("data/embed.dat"):
            print("Caching word embeddings in memmapped format...")
            #from gensim import models
            from gensim.models.word2vec import Word2Vec
            wv = Word2Vec.load_word2vec_format("data/GoogleNews-vectors-negative300.bin.gz",
                binary=True)
            wv.init_sims(replace=True) # recommended new step?
            fp = np.memmap("data/embed.dat", dtype=np.double, mode='w+', shape=wv.syn0.shape)
            fp[:] = wv.syn0[:]
            with open("data/embed.vocab", "w") as f:
                for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                    f.write(w.encode('utf-8'))
                    f.write('\n'.encode('utf-8'))
                    #print(w, file=f)
                    pass
            del wv

        self.W = np.memmap("data/embed.dat", dtype=np.double, mode="r", shape=(3000000, 300))
        with open("data/embed.vocab") as f:
            self.vocab_list = map(str.strip, f.readlines())

        self.vocab_dict = {w: k for k, w in enumerate(self.vocab_list)}
Example No. 30
    def __init__(self,*args,**kwargs):
        super(affinity_mapping, self).__init__(*args,**kwargs)

         # Load the model from disk
        self.M = Word2Vec.load(kwargs["f_w2v"])       
        self.shape = self.M.syn0.shape
        
        # Set parallel option
        self._PARALLEL = ast.literal_eval(kwargs["_PARALLEL"])

        self.damping = float(kwargs["damping"])
        
        if not os.path.exists(kwargs["f_affinity"]):
            h5 = h5py.File(kwargs["f_affinity"],'w')
            h5.close()
 
        self.h5 = h5py.File(kwargs["f_affinity"],'r+')

        global damping, M

        damping = self.damping
        M = self.M

        self.vocab_n = len(M.index2word)
    
        M.word2index = dict([(w,i) for w,i in
                             zip(M.index2word,range(self.vocab_n))])

        # Increment this as we find more clusters
        self.cluster_n = 0
Example No. 31
'''
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)

#extract a fixed test data set
x_test,y_test = data_helpers.load_data_and_labels(FLAGS.positive_test_file,FLAGS.negative_test_file)

max_document_length = max([len(x.split(" ")) for x in x_text])
print('max_document_length')
print(max_document_length)
'''
word2vec vectorization
'''
# load the model
model_path = './word2vec/word2vec_wx'
model = Word2Vec.load(model_path)

x_tmp = []
out_vocabulary_file = open('/Users/xiamin/Desktop/TextCNN/mail_data/out_of_vocabulary.txt','a')
line_num = 0
for line in x_text:
    line_num += 1
    line = line.decode('utf-8')
    words = line.split(" ")
    vector_line = []
    i=0
    for word in words:
        if(model.wv.__contains__(word)):
            i = i+1
            word_vec = model.wv[word] #numpy array
Example No. 32
    pool = Pool()
    ret = pool.map(f, parms)
    pool.close()
    pool.join()
    return ret


# -- run tokenization in parallel...
# sentences = parallel_run(tokenize_document, enumerate(reviews_texts))
sentences = [tokenize_document(txt) for txt in enumerate(reviews_texts)]

# build a default w2v model...
w2v = Word2Vec(sentences=sentences,
               size=100,
               alpha=0.025,
               window=4,
               min_count=2,
               sample=1e-5,
               workers=4,
               negative=10)


def tokens_to_mean_vec(tokens, w2v):
    '''
    Takes a list of tokens and a Word2Vec models
    and finds the mean word vector of that list.
    '''
    vec = []
    for w in tokens:
        try:
            vec.append(w2v[w])
        except KeyError:
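            continue                      # token not in the word2vec vocabulary; skip it
    return np.mean(vec, axis=0) if vec else np.zeros(w2v.layer1_size)

The two lines above are a hedged completion of the truncated listing, following the docstring's stated intent (skip out-of-vocabulary tokens and average the rest); they assume numpy is imported as np elsewhere in this script and that layer1_size holds the vector dimensionality, as in the older gensim API used here.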
Example No. 33
import mysql.connector

db = mysql.connector.connect(host="localhost",
                             database="nsf",
                             user="******",
                             password="******")

cursor = db.cursor()

cursor.execute("select AwardTitle, AbstractNarration from Award limit 100")

stop = set(stopwords.words('english'))
stop.add(',')
stop.add('.')

result = cursor.fetchall()
proc = []
for r in result:
    proc.append([w.lower()
        for w in sent_tokenize(r[0].replace("<br/>", "\n")) +\
                 sent_tokenize(r[1].replace("<br/>", "\n"))
        if w.lower() not in stop])

print(proc)
exit()

model = Word2Vec(proc, size=100, window=5, min_count=5, workers=4)
model.save("nsf_w2v_model_2")
similar = model.wv.most_similar(['data', 'science'], [])
print(similar)
Example No. 34
# -*- coding:utf-8 -*-

import logging
from gensim.models.word2vec import LineSentence, Word2Vec
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# raw_sentences = ["the quick brown fox jumps over the lazy dogs","yoyoyo you go home now to sleep"]

sentences = LineSentence("../dataset/train_questions_with_evidence.txt")

model = Word2Vec(sentences, min_count=1, iter=1000)
model.train(sentences, total_examples=model.corpus_count, epochs=1000)

model.save("../model/w2v.mod")
model_loaded = Word2Vec.load("../model/w2v.mod")

sim = model_loaded.wv.most_similar(positive=[u'酒精'])
for s in sim:
    print s[0]
Example No. 35
def generate_word2vec_file(x: pd.DataFrame,
                           y: pd.Series,
                           description: str,
                           feature_column: str,
                           timer: Timer = None,
                           feature_size: int = 100,
                           window_context: int = 5,
                           min_word_count: int = 5,
                           sample: float = 0.001,
                           iterations: int = 5,
                           ):
    """
    generate features using word2vec
    :param x:
    :param y:
    :param description:
    :param feature_size:
    :param window_context:
    :param min_word_count:
    :param sample:
    :param iterations:
    :return:
    """
    log.info("generating word2vec")
    log.debug(f'{x.head()}')
    wpt = WordPunctTokenizer()

    if timer:
        timer.start_timer(TOKENIZE_TIME_MIN)
    documents = [wpt.tokenize(review) for review in x.array]
    if timer:
        timer.end_timer(TOKENIZE_TIME_MIN)

    if timer:
        timer.start_timer(VECTORIZE_TIME_MIN)

    # TODO: add configuration for pre-trained or train
    # if x.shape[0] <= 50:
    w2v_model = Word2Vec(documents,
                         size=int(feature_size),
                         window=int(window_context),
                         min_count=int(min_word_count),
                         sample=sample,
                         iter=int(iterations)
                         )
    # else:
    #     log.info("Downloading pre-trained word2vec")
    #     w2v_model = api.load("word2vec-google-news-300")
    if timer:
        timer.end_timer(VECTORIZE_TIME_MIN)


    model_file = f"{MODEL_DIR}/{description}-{len(x)}-{feature_size}.model"
    log.info(f'Writing model file: {model_file}')
    if timer:
        timer.start_timer(MODEL_SAVE_TIME_MIN)
    w2v_model.save(model_file)
    if timer:
        timer.end_timer(MODEL_SAVE_TIME_MIN)

    feature_df = get_feature_df(w2v_model, x)
    return write_to_file(feature_df, y, feature_column, description, include_lda=False)
    for word in words:
        lower_case_word = word.lower()
        temp_sentence.append(lower_case_word)  # keep original formats

    Text_Data.append(temp_sentence)

print('the length of text data is ', len(Text_Data))

# Create CBOW model
# word2vec_model = gensim.models.Word2Vec(Text_Data, min_count = 1, size = 100, window = 2) # size : Dimensionality of the word vectors.
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

corpus = api.load('text8')
print("training word2vec model now...")
word2vec_model = Word2Vec(corpus)
print("word2vec training completed.")

word2vec_features = []

for sentence_index in range(len(Text_Data)):
    sentence = Text_Data[sentence_index]
    sentence_vector = []

    for word_index in range(len(sentence)):
        word = sentence[word_index]
        word_vector = word2vec_model.wv[word]
        sentence_vector.extend(word_vector)

    # for class 'ES'
    if Tags_List[sentence_index] == 'ES':
Example No. 37
import shlex
import re
import gensim
from pyemd import emd
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cosine
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.cross_validation import train_test_split


if not os.path.exists("/home/avinash/PycharmProjects/data_local_model/embed.dat"):
    print("Caching word embeddings in memmapped format...")
    from gensim.models.word2vec import Word2Vec
    wv = Word2Vec.load(
        "/home/caniz/Devel/cognostics/difficulty_measure/Shared/word2vec_models/size600_min50_window5_bigram/size600_min50_window5_bigram")
    fp = np.memmap("/home/avinash/PycharmProjects/data_local_model/embed.dat", dtype=np.double, mode='w+', shape=wv.syn0.shape)
    fp[:] = wv.syn0[:]
    with open("/home/avinash/PycharmProjects/data_local_model/embed.vocab", "w") as f:
        for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
            print(w, file=f)
    del fp, wv

W = np.memmap("/home/avinash/PycharmProjects/data_local_model/embed.dat", dtype=np.double, mode="r", shape=(300000, 300))
with open("/home/avinash/PycharmProjects/data_local_model/embed.vocab") as f:
    vocab_list = map(str.strip, f.readlines())

vocab_dict = {w: k for k, w in enumerate(vocab_list)}


Example No. 38
def get_predict_vecs(words):
    n_dim=500
    imdb_w2v=Word2Vec.load('sentiment_analysis/w2v_model.pkl')
    train_vecs=build_sentence_vector_ave(words,n_dim,imdb_w2v)
    return train_vecs
Example No. 39
def get_glove(W2V_DIM=50):
    glove_file = 'data/glove_data/glove.6B.' + str(W2V_DIM) + 'd.txt'
    glove = Word2Vec.load_word2vec_format(glove_file)
    return glove
Example No. 40
parser.add_argument('--w2v',
                    default='all.norm-sz100-w10-cb0-it1-min100.w2v',
                    nargs='?',
                    help='Path to the word2vec model.')
parser.add_argument('--seed',
                    default=228,
                    type=int,
                    nargs='?',
                    help='Random seed.')
args = vars(parser.parse_args())

RANDOM_SEED = args['seed']
random.seed(RANDOM_SEED)

w2v = Word2Vec.load_word2vec_format(args['w2v'],
                                    binary=True,
                                    unicode_errors='ignore')
w2v.init_sims(replace=True)
print('Using %d word2vec dimensions from "%s".' %
      (w2v.layer1_size, args['w2v']))


def read_subsumptions(filename):
    subsumptions = []

    with codecs.open(filename, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)

        for row in reader:
            subsumptions.append((row[0], row[1]))
Example No. 41
import torch.nn as nn
import matplotlib.pyplot as plt
from gensim.models.word2vec import Word2Vec
import numpy as np
import pandas as pd
import time

from model import ASTNN

TRAINING_SET_SIZE = 30000
VALIDATION_SET_SIZE = 10000
TEST_SET_SIZE = 10000

print('Reading data...')

w2v = Word2Vec.load('./data/c/w2v_128').wv
embeddings = torch.tensor(np.vstack([w2v.vectors, [0] * 128]))

programs = pd.read_pickle('./data/c/id_code_label_ast_(index_tree).pkl')

training_set = programs[:TRAINING_SET_SIZE]
validation_set = programs[TRAINING_SET_SIZE:TRAINING_SET_SIZE +
                          VALIDATION_SET_SIZE]
test_set = programs[TRAINING_SET_SIZE + VALIDATION_SET_SIZE:TRAINING_SET_SIZE +
                    VALIDATION_SET_SIZE + TEST_SET_SIZE]


def get_batch(dataset, i, batch_size):
    return dataset.iloc[i:i + batch_size]

Example No. 42
def train_w2v(c):
    w2v = Word2Vec(c, size=100)
    return w2v
Example No. 43
 def __init__(self):
     self.data = pd.read_csv("./best_film.csv")
     self.annoy = AnnoyIndex(100, 'angular')
     self.annoy.load('annoy_for_films.ann')
     self.model = Word2Vec.load('w2v_for_films')
     self.lem = WordNetLemmatizer()
Example No. 44
#sample(frac=1) -> shuffle data
corpus = pd.concat([train_df.text, test_df.text]).sample(frac=1)
#print(corpus.head())

#put it in Word2Vec
#adjust size and iter; sg=0 uses CBOW, sg=1 uses skip-gram (better for low-frequency words); window: how many neighboring tokens on each side are used for prediction

#train

#model = Word2Vec(corpus, size = 250, iter = 10, sg=0, window=3, min_count=7, max_vocab_size=None, workers=3, min_alpha=0.0001, hs=0, negative=5, batch_words=10000)
model.save('word2vec_min7.model')
#window = 3 word2vec
#window = 2 word2vec_window

#load
model = Word2Vec.load('word2vec_min7.model')


#check the trained model
def most_similar(w2v_model, words, topn=10):
    similar_df = pd.DataFrame()
    for word in words:
        try:
            similar_words = pd.DataFrame(w2v_model.wv.most_similar(word,
                                                                   topn=topn),
                                         columns=[word, 'cos'])
            similar_df = pd.concat([similar_df, similar_words], axis=1)
        except:
            print(word, "not found in Word2Vec model!")
    return similar_df
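
A hedged usage sketch of the checker above; the query words are hypothetical, and words missing from the vocabulary are reported by the function itself:

print(most_similar(model, ['good', 'bad', 'token_not_in_vocab'], topn=5))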
Example No. 45
# First download the pretrained gensim model:
# $ wget http://public.shiroyagi.s3.amazonaws.com/latest-ja-word2vec-gensim-model.zip
# $ unzip latest-ja-word2vec-gensim-model.zip

from gensim.models.word2vec import Word2Vec

model = Word2Vec.load('word2vec.gensim.model')

some_words = ['ブランド', 'バッグ', 'アルマーニ']
for w in some_words:
    print(model.wv.most_similar(w))
Example No. 46
import os
import pickle
import utils
import numpy as np
from gensim.models.word2vec import Word2Vec

model = Word2Vec.load('w2vmodel_run/word2vec_model_cleaned_data')
thermometers = [
    'democrats', 'republicans', 'protestants', 'catholics', 'jews', 'blacks',
    'whites', 'southerners', 'big business', 'labor unions', 'liberals',
    'conservatives', 'military', 'policemen', 'black militants',
    'civil rights leaders', 'chicanos hispanics', 'democratic party',
    'middle class people', 'people on welfare', 'political independents',
    'political parties', 'poor people', 'republican party',
    'women right activist', 'young people', 'asian americans', 'congress',
    'environmentalists', 'anti abortionists', 'federal government',
    'illegal aliens', 'christian fundamentalists', 'radical students',
    'farmers', 'feminists', 'evangelical groups', 'elderly', 'supreme court',
    'women'
]

# Word2Vec size
word2Vec_dimension = 100

# Data directory for cases
data_dir = 'data'
case_dir = os.path.join(data_dir, 'clean_Mar_20')

# sub-directory containing our cases
maj_dir = 'maj'
Example No. 47
def load_w2v_model(modelpath):
  print("Loading model ", modelpath)
  return Word2Vec.load(modelpath)
Example No. 48
 def loadModelfromFile(self, modelFilePath):
     '''
     load a model from disk that already exists;
     training can continue with the loaded model (needs more testing)
     '''
     return Word2Vec.load(modelFilePath)
Example No. 49
    def load_word2vec_format(cls,
                             fname,
                             fvocab=None,
                             binary=False,
                             norm_only=True):
        """
        Load the input-hidden weight matrix from the original C word2vec-tool format.

        Note that the information stored in the file is incomplete (the binary tree is missing),
        so while you can query for word similarity etc., you cannot continue training
        with a model loaded this way.

        `binary` is a boolean indicating whether the data is in binary word2vec format.
        `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
        Word counts are read from `fvocab` filename, if set (this is the file generated
        by `-save-vocab` flag of the original C tool).
        """
        counts = None
        if fvocab is not None:
            logger.info("loading word counts from %s" % (fvocab))
            counts = {}
            with utils.smart_open(fvocab) as fin:
                for line in fin:
                    word, count = utils.to_unicode(line).strip().split()
                    counts[word] = int(count)

        logger.info("loading projection weights from %s" % (fname))
        with utils.smart_open(fname) as fin:
            header = utils.to_unicode(fin.readline())
            vocab_size, layer1_size = map(
                int, header.split())  # throws for invalid file format
            result = Word2Vec(size=layer1_size)
            result.syn0 = zeros((vocab_size, layer1_size), dtype=REAL)
            if binary:
                binary_len = dtype(REAL).itemsize * layer1_size
                for line_no in xrange(vocab_size):
                    # mixed text and binary: read text first, then binary
                    word = []
                    while True:
                        ch = fin.read(1)
                        if ch == b' ':
                            break
                        if ch != b'\n':  # ignore newlines in front of words (some binary files have newline, some don't)
                            word.append(ch)
                    word = utils.to_unicode(b''.join(word))
                    if counts is None:
                        result.vocab[word] = Vocab(index=line_no,
                                                   count=vocab_size - line_no)
                    elif word in counts:
                        result.vocab[word] = Vocab(index=line_no,
                                                   count=counts[word])
                    else:
                        logger.warning("vocabulary file is incomplete")
                        result.vocab[word] = Vocab(index=line_no, count=None)
                    result.index2word.append(word)
                    result.syn0[line_no] = fromstring(fin.read(binary_len),
                                                      dtype=REAL)
            else:
                for line_no, line in enumerate(fin):
                    parts = utils.to_unicode(line).split()
                    if len(parts) != layer1_size + 1:
                        raise ValueError(
                            "invalid vector on line %s (is this really the text format?)"
                            % (line_no))
                    word, weights = parts[0], map(REAL, parts[1:])
                    if counts is None:
                        result.vocab[word] = Vocab(index=line_no,
                                                   count=vocab_size - line_no)
                    elif word in counts:
                        result.vocab[word] = Vocab(index=line_no,
                                                   count=counts[word])
                    else:
                        logger.warning("vocabulary file is incomplete")
                        result.vocab[word] = Vocab(index=line_no, count=None)
                    result.index2word.append(word)
                    result.syn0[line_no] = weights
        logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
        result.init_sims(norm_only)
        return result
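
A hedged usage sketch for the classmethod above, using the pre-1.0 gensim attribute names (syn0, index2word) this code is written against; the path and query word are hypothetical:

model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
print(model.syn0.shape)                     # (vocab_size, layer1_size)
print(model.most_similar('king', topn=3))   # querying works, but further training does not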
Example No. 50
    batch_size = CommonUtilities.get_param_value_int("batch_size", sys.argv, batch_size)
    logging.info('batch_size:{0}'.format(batch_size))

    lstm_hidden_size = 50
    lstm_hidden_size = CommonUtilities.get_param_value_int("lstm_hidden_size", sys.argv, lstm_hidden_size)
    logging.info('lstm_hidden_size:{0}'.format(lstm_hidden_size))

    if cmd == "train":
        logging.info("==========================")
        logging.info("======== TRAINING ========")
        logging.info("==========================")
        embeddings_vec_size = embeddings_size
        if embeddings_model_type == "w2v":
            logging.info("Loading w2v model..")
            if word2vec_load_bin:
                embeddings_model = Word2Vec.load_word2vec_format(embeddings_model_file, binary=True)  # use this for google vectors
            else:
                embeddings_model = Word2Vec.load(embeddings_model_file)
            embeddings_vec_size = embeddings_model.syn0.shape[1]
        elif embeddings_model_type == "rand":
            embeddings_model = None
        else:
            raise Exception("embeddings_model_type=%s is not yet supported!" % embeddings_model_type)

        # train data
        input_data_fileslist_train = [data_tac2014_train, data_tac2015_train, data_tac2014_eval]

        train_data_files = ""  #
        train_data_files = CommonUtilities.get_param_value("train_data_files", sys.argv,
                                                                default=train_data_files)
Example No. 51
from newsParser import parseSNU
from db import loadSNU, connectDB

host = 'localhost'

chunk = 100

if __name__ == '__main__':
    build_hannanum.build()
    hannanum = Hannanum()

    print('Load Parser Done!')

    filedir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'w2v')
    os.chdir(filedir)
    w2v = Word2Vec.load('model.model')
    print('Load Word2Vector Done!')
    _, _, _, _, vecDB, *_ = connectDB(host)

    try:
        begin = vecDB['metadata'].find_one({'type': 'snu'})['idx']
    except:
        begin = 0

    while True:
        wv = []
        li = loadSNU(chunk, begin)
        for i in li:
            wv += parseSNU(i, (hannanum, w2v))
        begin += len(li)
        if wv:
Example No. 52
                    try:
                        cursor.execute(
                            query_insert_word,
                            (morph.parse(word)[0].normal_form, id_article))
                        connection.commit()
                    except Exception:
                        print(file)
                        continue
                    g = g + 1
                    print("Article " + str(n_files) + ": " + str(g) + " из " +
                          str(len(list_words)))

    n_files = n_files + 1
    print("Обработано: " + str(n_files) + " из " + str(len(files)))

model = Word2Vec.load('ruwiki.word2vec.model')
country_list = [item[0] for item in model.most_similar('швеция')]

stop_words = get_stop_words('ru')
cursor.execute(
    "SELECT word,count(*) as c FROM stats GROUP BY word ORDER BY c desc LIMIT 100"
)
for row in cursor:
    if (row[0] not in stop_words) and (re.compile('\d+').match(row[0])
                                       == None):
        if (row[0] in country_list):
            #print (row[0]+": "+ str(row[1]))
            president = model.most_similar(positive=[row[0], 'путин'],
                                           negative=['россия'],
                                           topn=1)
            print(row[0] + ": " + president[0][0])  # most_similar returns (word, score) tuples
Example No. 53
    tokenized_twt = [j for j in tokenized_twt if ( j not in string.punctuation )]
    stop_words = stopwords.words('russian')
    stop_words.extend(['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на'])
    tokenized_twt = [j for j in tokenized_twt if ( j not in stop_words )]
    tokenized_twt = [j.replace("«", "").replace("»", "") for j in tokenized_twt]
    tokens = [stemmer.stem(t) for t in tokenized_twt if not t.startswith('@')]
    #print(tokens)
    tokenized_corpus.append(tokens)

vector_size = 512
window_size = 10

word2vec = Word2Vec(sentences=tokenized_corpus,
                    size=vector_size, 
                    window=window_size, 
                    negative=20,
                    iter=50,
                    seed=1000,
                    workers=4)

model = load_model('model.h5')
stemmer = SnowballStemmer("russian")
cv = nltk.word_tokenize(sys.argv[1:][0])
cv = [j for j in cv if ( j not in string.punctuation )]
stop_words = stopwords.words('russian')
stop_words.extend(['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на'])
cv = [j for j in cv if ( j not in stop_words )]
cv = [j.replace("«", "").replace("»", "") for j in cv]


vecs = []
Example No. 54
    logging.basicConfig(
        format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
        level=logging.INFO)
    logging.info("running %s" % " ".join(sys.argv))
    logging.info("using optimization %s" % FAST_VERSION)

    # check and process cmdline input
    program = os.path.basename(sys.argv[0])
    if len(sys.argv) < 2:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    infile = sys.argv[1]
    from gensim.models.word2vec import Word2Vec  # avoid referencing __main__ in pickle

    seterr(all='raise')  # don't ignore numpy errors

    # model = Word2Vec(LineSentence(infile), size=200, min_count=5, workers=4)
    model = Word2Vec(Text8Corpus(infile), size=200, min_count=5, workers=1)

    if len(sys.argv) > 3:
        outfile = sys.argv[3]
        model.save(outfile + '.model')
        model.save_word2vec_format(outfile + '.model.bin', binary=True)
        model.save_word2vec_format(outfile + '.model.txt', binary=False)

    if len(sys.argv) > 2:
        questions_file = sys.argv[2]
        model.accuracy(sys.argv[2])

    logging.info("finished running %s" % program)
Example No. 55
def build_dataset(train_data_path,
                  test_data_path,
                  save_wv_model_path,
                  testOnly=True,
                  toCSV=True):
    '''
    Load and preprocess the data.
    :param train_data_path: path to the training set
    :param test_data_path: path to the test set
    :return: training data, test data, and the merged data
    '''

    # 1. load the data
    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)
    print('train data size {},test data size {}'.format(
        len(train_df), len(test_df)))

    # 2. handle missing values
    train_df.dropna(subset=['Question', 'Dialogue', 'Report'],
                    how='any',
                    inplace=True)
    test_df.dropna(subset=['Question', 'Dialogue'], how='any', inplace=True)

    # 3. multi-process, batched data preprocessing
    train_df = parallelize(train_df, sentences_proc)
    test_df = parallelize(test_df, sentences_proc)

    # 4. merge the training and test sets
    train_df['merged'] = train_df[['Question', 'Dialogue',
                                   'Report']].apply(lambda x: ' '.join(x),
                                                    axis=1)
    test_df['merged'] = test_df[['Question',
                                 'Dialogue']].apply(lambda x: ' '.join(x),
                                                    axis=1)
    merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
    print('train data size {},test data size {},merged_df data size {}'.format(
        len(train_df), len(test_df), len(merged_df)))

    # 5. save the processed training and test sets
    train_df = train_df.drop(['merged'], axis=1)
    test_df = test_df.drop(['merged'], axis=1)

    if toCSV:
        train_df.to_csv(train_seg_path, index=None, header=True)
        test_df.to_csv(test_seg_path, index=None, header=True)
        # 6. save the merged data
        merged_df.to_csv(merger_seg_path, index=None, header=False)

    if osp.exists(save_wv_model_path):
        wv_model = Word2Vec.load(save_wv_model_path)
    else:
        # 7. train the word vectors
        print('start build w2v model')
        wv_model = Word2Vec(LineSentence(merger_seg_path),
                            size=embedding_dim,
                            negative=5,
                            workers=8,
                            iter=wv_train_epochs,
                            window=3,
                            min_count=5)

    # 8. split the data and labels
    train_df['X'] = train_df[['Question',
                              'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    test_df['X'] = test_df[['Question',
                            'Dialogue']].apply(lambda x: ' '.join(x), axis=1)

    # 9. pad with start/end tokens, map unknown words to OOV, pad to a fixed length
    # use the vocab produced by gensim training
    vocab = wv_model.wv.vocab

    # process the training-set X
    # determine an appropriate maximum length
    train_x_max_len = get_max_len(train_df['X'])
    test_X_max_len = get_max_len(test_df['X'])
    X_max_len = max(train_x_max_len, test_X_max_len)
    train_df['X'] = train_df['X'].apply(
        lambda x: pad_proc(x, X_max_len, vocab))

    # process the test-set X
    # determine an appropriate maximum length
    test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))

    # process the training-set Y
    # determine an appropriate maximum length
    train_y_max_len = get_max_len(train_df['Report'])
    train_df['Y'] = train_df['Report'].apply(
        lambda x: pad_proc(x, train_y_max_len, vocab))

    # 10. save the padded/OOV-processed data and labels
    if toCSV:
        train_df['X'].to_csv(train_x_pad_path, index=None, header=False)
        train_df['Y'].to_csv(train_y_pad_path, index=None, header=False)
        test_df['X'].to_csv(test_x_pad_path, index=None, header=False)

    if testOnly:
        print("No retraining! Test only...")
        return train_df['X'], train_df['Y'], test_df['X'], wv_model
    else:
        # 11. retrain the word vectors
        print('start retrain w2v model')
        wv_model.build_vocab(LineSentence(train_x_pad_path), update=True)
        wv_model.train(LineSentence(train_x_pad_path),
                       epochs=wv_train_epochs,
                       total_examples=wv_model.corpus_count)
        print('1/3')
        wv_model.build_vocab(LineSentence(train_y_pad_path), update=True)
        wv_model.train(LineSentence(train_y_pad_path),
                       epochs=wv_train_epochs,
                       total_examples=wv_model.corpus_count)
        print('2/3')
        wv_model.build_vocab(LineSentence(test_x_pad_path), update=True)
        wv_model.train(LineSentence(test_x_pad_path),
                       epochs=wv_train_epochs,
                       total_examples=wv_model.corpus_count)

        # save the word-vector model
        wv_model.save(save_wv_model_path)
    # or load wv_model
    # wv_model = Word2Vec.load(save_wv_model_path)

    print('finish retrain w2v model')
    print('final w2v_model has vocabulary of ', len(wv_model.wv.vocab))

    return train_df['X'], train_df['Y'], test_df['X'], wv_model
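
A hedged usage sketch; the CSV and model paths are hypothetical, and the helpers referenced inside (parallelize, sentences_proc, pad_proc, the various *_path globals) must already be defined as in the surrounding project:

train_X, train_Y, test_X, wv_model = build_dataset(
    'data/AutoMaster_TrainSet.csv',
    'data/AutoMaster_TestSet.csv',
    'data/wv/word2vec.model',
    testOnly=True,   # skip the retraining branch
    toCSV=False,     # don't write the intermediate CSV files
)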
Example No. 56
def input_transform(string):
    words = jieba.lcut(string)
    words = np.array(words).reshape(1, -1)
    model = Word2Vec.load('lstm_data/Word2vec_model.pkl')
    _, _, combined = create_dictionaries(model, words)
    return combined
Example No. 57
            score.append(1)
        else:
            score.append(0)
        dd.append({'y_test': y_test[i], 'first': label[0], 'second': label[1]})
    acc = sum(score) / len(y_test)
    return acc


data = pd.read_excel('./data/doc_set_final_version3.xlsx')
data['token'] = data.token.apply(lambda x: literal_eval(x))
X_data = data[['token', 'new_small_class']]
target_big = data.new_class.tolist()
target_small = data.new_small_class.tolist()

w2v_model_name = './model/word_embedding/Word2vec1(base_token).model'
word_vectorizer = Word2Vec.load(w2v_model_name)
word_vectorizer.wv.vectors.shape
word_index = word_vectorizer.wv.index2word
EMBEDDING_DIM = word_vectorizer.trainables.layer1_size
word_index
tfidf = TfidfVectorizer(analyzer=lambda x: x, vocabulary=word_index)
tfidf.fit(data['token'])

max_idf = max(tfidf.idf_)
word2weight = defaultdict(lambda: max_idf,
                          [(w, tfidf.idf_[i])
                           for w, i in tfidf.vocabulary_.items()])

train_X, test_X, train_y, test_y = train_test_split(X_data,
                                                    target_big,
                                                    test_size=0.3,
Example No. 58
from gensim.models.word2vec import Word2Vec
from keras import Input, Model
from keras.models import Sequential
from keras.layers import Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout, Activation
from m1 import BOW

maxlen = 100 # fixed length of every sentence (truncate or pad)
batch_size = 64
embedding_dim = 300
epochs = 10

comments = [['same', 'coffee', 'shop', 'my', 'memory', 'of', 'it', 'is'],[]]


# train word vectors ---------------------------------------
w2v_model = Word2Vec(comments,size=embedding_dim, min_count=5, workers=10)

# build the embedding dictionary
bow = BOW(comments, min_count=5, maxlen=maxlen)  # comments is already a plain list
vocab_size = len(bow.word2idx)

embedding_matrix = np.zeros((vocab_size+1,300))
for key, value in bow.word2idx.items():
    if key in w2v_model.wv.vocab: # a trained Word2Vec instance exposes its vocabulary via .wv.vocab
        embedding_matrix[value] = w2v_model.wv[key]
    else:
        embedding_matrix[value] = [0] * embedding_dim

# build the dataset -------------------------------------
X = copy.deepcopy(bow.doc2num[:159571])
# split into training and validation sets 4:1
Example No. 59
from gensim.models.word2vec import Word2Vec

model = Word2Vec.load(
    '/Users/pavel/PycharmProjects/NeuralNetworkWithTenser/src/wordrecognition/savemodal/word2vec_modal'
)
print(model.wv.most_similar(positive=['woman', 'king'], topn=5))
Example No. 60
 def load_model(path):
     w2v = Word2vecEmbedder()
     w2v._model = Word2Vec.load(path)
     logging.info("loaded word2vec model from: " + path)
     w2v.is_fitted = True
     return w2v