Example No. 1
def gene():
    modelpre=Word2Vec.load('corpus/pretrain40.model')
    modelfield=Word2Vec.load('corpus/fieldtrained40.model')
    modelmerged=Word2Vec.load('corpus/mergedtrained40iter1.model')
    xlist=[]
    ylist=[]
    zlist=[]
    labellist=[]
    upperline=0.016
    floor=0.008 #0.01 0.013
    upperlinefield=0.06
    floorfield=0.02
    upperlinepre=0.019
    floorpre=0.018
    with open('corpus/word2pic2.txt') as fp:
        for row in fp:
            word=unicode(row[:-1])
            x=(modelmerged.similarity(word,u"好")+modelmerged.similarity(word,u"快乐")+modelmerged.similarity(word,u"开心"))/3.0-(modelmerged.similarity(word,u"坏")+modelmerged.similarity(word,u"悲伤"))/2.0
            y=(modelfield.similarity(word,u"好")+modelfield.similarity(word,u"快乐")+modelfield.similarity(word,u"开心"))/3.0-(modelfield.similarity(word,u"坏")+modelfield.similarity(word,u"悲伤"))/2.0
            z=(modelpre.similarity(word,u"好")+modelpre.similarity(word,u"快乐")+modelpre.similarity(word,u"开心"))/3.0-(modelpre.similarity(word,u"坏")+modelpre.similarity(word,u"悲伤"))/2.0
            labellist.append(word)
            # xlist.append(x-(upperline+floor)/2.0)
            xlist.append(x-0.016)
            ylist.append(y-(upperlinefield+floorfield)/2.0)
            zlist.append(z-(upperlinepre+floorpre)/2.0)
    # with open('corpus/word2picxyz.txt','w') as fp:
    #     pickle.dump(labellist,xlist,ylist,zlist,fp)
    return labellist,xlist,ylist,zlist
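The positive-minus-negative averaging used above (and again in Examples No. 2 and No. 19) can be factored into a small helper. A minimal sketch, assuming the same Chinese anchor words and one of the model files loaded above:

# -*- coding: utf-8 -*-
from gensim.models.word2vec import Word2Vec

POS = [u"好", u"快乐", u"开心"]   # "good", "happy", "joyful"
NEG = [u"坏", u"悲伤"]            # "bad", "sad"

def polarity(model, word):
    # mean similarity to the positive anchors minus mean similarity to the negative anchors
    pos = sum(model.similarity(word, w) for w in POS) / float(len(POS))
    neg = sum(model.similarity(word, w) for w in NEG) / float(len(NEG))
    return pos - neg

# e.g. polarity(Word2Vec.load('corpus/mergedtrained40iter1.model'), u"开心")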
Example No. 2
def dis(vectorsize):
    # print model.similarity("今天","在")
    model=Word2Vec.load('corpus/mergedtrained'+str(vectorsize)+'iter1'+'.model')
    modelfield=Word2Vec.load('corpus/fieldtrained'+str(vectorsize)+'.model')
    print model.similarity(u"分手",u"好")
    print model.similarity(u"分手",u"坏")
    print modelfield.similarity(u"分手",u"好")
    print modelfield.similarity(u"分手",u"坏")
Example No. 3
def main():
    # te()
    # teword()
    # intersect(40)
    # setwordwindow(40)
    # Word2Vec.load_word2vec_format('corpus/initindex40',binary=False)
    
    modelpre=Word2Vec.load('corpus/pretrain40.model')
    modelfield=Word2Vec.load('corpus/fieldtrained40.model')
    modelmerged=Word2Vec.load('corpus/mergedtrained40iter1.model')
    print 'finish load'
    classify(modelpre,modelfield,modelmerged,40)
Example No. 4
    def create_partition_function(self, f_w2v, f_h5):
        print "Building the partition function"
        
        # Load the model from disk
        M = Word2Vec.load(f_w2v)

        words = M.index2word
        ZT = []
        INPUT_ITR = tqdm.tqdm(words)

        # Compute the partition function for each word
        for w in INPUT_ITR:
            UE = self.energy(M.syn0, M[w])
            z  = compute_partition_stats(UE)
            ZT.append(z)

        # Save the partition function to disk
        # (special care needed for h5py unicode strings)
        dt = h5py.special_dtype(vlen=unicode)

        with h5py.File(f_h5,'w') as h5:
                       
            h5.create_dataset("words", (len(words),),
                              dtype=dt,
                              data=[w.encode('utf8') for w in words])

            h5.attrs['vocab_N'] = len(words)
            h5['Z'] = ZT            
Example No. 5
def main():
    industry = sys.argv[1]
    vocab_file = "../data/" + industry + "/embed_vocab"
    model_file = "../data/" + industry + "/user_model"
    # load vocab list
    with open(vocab_file) as f:
        vocab_list = map(str.strip, f.readlines())
    # load model
    model = Word2Vec.load(model_file)

    # build vocab index dict
    vob_index_dict = {}
    for i, vob in enumerate(vocab_list):
        vob_index_dict[vob] = i

    # calc vocab dist
    logging.info("calucating vocab dist matrix")
    dm = get_vocab_dist_matrix(vocab_list, model)

    # get company domain list dict
    comp_domain_file = "../data/" + industry + "/company_file"
    comp_dict = get_comp_dict(comp_domain_file)
    logging.info("company dict generated : " + str(comp_dict.keys()))

    # delete domain not exist in vocab list
    filter_company_by_vocab(comp_dict, vocab_list)

    # filter company domain by uv : default uv > 100
    filter_action_by_uv(comp_dict, 100)

    # calc dist between two company
    res_file = "../data/" + industry + "/company_dist"
    calc_company_dist(res_file, comp_dict, dm, vob_index_dict)
Example No. 6
    def __init__(self):

        '''
        Training parameters:
        '''

        self.w2v_dim=100
        self.num_feature=400
        self.batch_size=16
        self.num_epoch=30

        # self.w2v_model=Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
        self.w2v_model=Word2Vec.load('./data/word2vec/w2v.model')

        self.index2word_set = set(self.w2v_model.index2word)

        #self.bigram=None
        #self.trigram=None

        self.bigram=Phrases.load('./data/bigram.dat')
        self.trigram=Phrases.load('./data/trigram.dat')

        print('Build model...')

        self.model = Sequential()
        self.model.add(Dropout(0.2,input_shape=(self.num_feature,)))
        self.model.add(Dense(3, input_dim=self.num_feature, init='orthogonal'))
        self.model.add(Activation('softmax'))


        self.model.compile(loss='categorical_crossentropy', optimizer='adam', class_mode="categorical")

        print('Model has been built!')
Example No. 7
 def initialize(self):
     sys.stdout.write("Metric initialization\n")
     sys.stdout.write("1 - Word2vec model")
     self.model = Word2Vec.load(model_path)
     sys.stdout.write("...loaded\n")
     sys.stdout.write("2 - Stop words")
     self.stop_words = [line.strip('\n') for line in open(stop_words_path)]
     sys.stdout.write("...loaded\n")
     sys.stdout.write("3 - Word-Averages model: ")
     self.wordAverages = defaultdict()
     for i in self.files_list:
         sys.stdout.write(str(i) + " - ")
         sys.stdout.flush()
         tweetsFile = tweets_path + str(i) + ".csv"
         wAvgsFile = wAvgs_path + str(i) + ".csv"
         tweets = []
         values = []
         with open(tweetsFile, 'r') as f1: 
             tweets = f1.readlines()
             f1.close()
         with open(wAvgsFile, 'r') as f2: 
             reader = csv.reader(f2)
             for r in reader:
                 values.append( np.array([ float(v) for v in r  ]) )
             f2.close()
         for j in range(len(tweets)):   
             self.wordAverages[ tweets[j].strip('\n')  ] = values[j]
     sys.stdout.write("loaded\n")
Example No. 8
def get_predict_vecs(words):
    n_dim = 300
    imdb_w2v = Word2Vec.load('svm_data/w2v_model/w2v_model.pkl')
    #imdb_w2v.train(words)
    train_vecs = buildWordVector(words, n_dim,imdb_w2v)
    #print train_vecs.shape
    return train_vecs
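buildWordVector is not defined in this snippet; the sketch below shows a typical implementation of that helper (an assumption based on the common average-of-word-vectors pattern, not the original code):

import numpy as np

def buildWordVector(words, size, imdb_w2v):
    # average the vectors of the words that exist in the model's vocabulary
    vec = np.zeros((1, size))
    count = 0
    for word in words:
        try:
            vec += imdb_w2v[word].reshape((1, size))
            count += 1
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec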
Example No. 9
 def load_external(self, model_file_name):
     """
     load a word2vec model from the file specified
     :param model_file_name: name of the model file
     :return:
     """
     self.model = Word2Vec.load(model_file_name)
Example No. 10
    def __init__(self,*args,**kwargs):
        super(affinity_mapping, self).__init__(*args,**kwargs)

         # Load the model from disk
        self.M = Word2Vec.load(kwargs["f_w2v"])       
        self.shape = self.M.syn0.shape
        
        # Set parallel option
        self._PARALLEL = ast.literal_eval(kwargs["_PARALLEL"])

        self.damping = float(kwargs["damping"])
        
        if not os.path.exists(kwargs["f_affinity"]):
            h5 = h5py.File(kwargs["f_affinity"],'w')
            h5.close()
 
        self.h5 = h5py.File(kwargs["f_affinity"],'r+')

        global damping, M

        damping = self.damping
        M = self.M

        self.vocab_n = len(M.index2word)
    
        M.word2index = dict([(w,i) for w,i in
                             zip(M.index2word,range(self.vocab_n))])

        # Increment this as we find more clusters
        self.cluster_n = 0
Example No. 11
def build_word_graph(model_fname, limiar=0.2):
    """
    Constroi um grafo de walavras ponderado pela similaridade entre elas
    de acordo com o modelo.
    :param model_fname: Nome do arquivo com o modelo word2vec como foi salvo
    :return: objeto grafo
    """
    m = Word2Vec.load(model_fname)
    g = Graph()
    freq = g.new_vertex_property("int")
    weight = g.new_edge_property("float")
    i = 0
    vdict = {}
    for w1, w2 in combinations(m.vocab.keys(), 2):
        if w1 == '' or w2 == '':
            continue
        # print(w1,w2)

        v1 = g.add_vertex() if w1 not in vdict else vdict[w1]
        vdict[w1] = v1
        freq[v1] = m.vocab[w1].count
        v2 = g.add_vertex() if w2 not in vdict else vdict[w2]
        vdict[w2] = v2
        freq[v2] = m.vocab[w2].count
        sim = m.similarity(w1, w2)
        if sim > limiar:
            e = g.add_edge(v1, v2)
            weight[e] = sim
        if i > 10000:
            break
        i += 1
    g.vertex_properties['freq'] = freq
    g.edge_properties['sim'] = weight
    return g
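A hypothetical call to build_word_graph, assuming a word2vec model saved at 'pt_wiki.w2v' (the path is a placeholder):

g = build_word_graph('pt_wiki.w2v', limiar=0.2)
print(g.num_vertices(), g.num_edges())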
Example No. 12
 def __init__(self, word2vec_path=""):
     self.sentence = []
     self.tfidf_sparse = []
     self.bi_set = [-1 for i in range(1000000)]
     self.tfidf_model_dict = {}
     if word2vec_path != "":
         self.word2vec_model = Word2Vec.load(word2vec_path)        
Example No. 13
def vectorize(model_file, dictionary_file, corpus_file):
  seterr(all='raise')  # don't ignore numpy errors

  #load model from given file
  model = Word2Vec.load(model_file)
  d = corpora.Dictionary.load(dictionary_file)
  corpus = corpora.MmCorpus(corpus_file)
  tf = models.TfidfModel(corpus)
  vectorize = []
  for doc_no, tdoc in enumerate(tf[corpus]):
    tdoc.sort(key=lambda kv: kv[1], reverse=True)
    if doc_no % 100 == 0:
          logger.info("PROGRESS: vectorizing user #%i of %i" %
              (doc_no, len(corpus)))
    words_per_user = 8
    word_vecs = []
    for wordid, measure in tdoc:
      word = d[wordid]
      if word in model:
        word_vecs.append(model[word])
        print word
      if len(word_vecs)>=words_per_user:
        break

    if len(word_vecs)==words_per_user:
      avg = matutils.unitvec(array(word_vecs).mean(axis=0)).astype(REAL)
      vectorize.append(avg)
      #print [word for word, measure in model.most_similar_from_array(avg, topn=5)]
  
  return vectorize
Example No. 14
def term_expansion(fpath, terms, knn):
    '''Expand term list by creating list of nearest neighbors in provided embeddings
    representation. This is usually very noisy and there is a fuzzy distinction between
    semantic similarity and "relatedness". Bacteria names, for example, often neighbor
    diseases caused by those organisms.
    '''
    model = Word2Vec.load(fpath)
    model.init_sims()
    nbrs = NearestNeighbors(n_neighbors=knn+1, algorithm='ball_tree', metric='l2')
    nbrs.fit(model.syn0norm)
    
    expansion = []
    for phrase in terms:
        # space replaced with underscore in PMC/PubMed embeddings
        phrase = phrase.replace(" ","_")
        if phrase not in model.vocab:
            continue
        idx = model.vocab[phrase].index
        vec = model.syn0norm[idx]
        _,indices = nbrs.kneighbors(vec)
        neighbors = [model.index2word[j] for j in indices.flatten()]
        neighbors.remove(phrase)
        expansion += neighbors
    
    # transform words back to whitespace separators 
    return map(lambda x:x.replace("_"," "), expansion)
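A hypothetical call, assuming PubMed-style embeddings saved at 'pubmed_w2v.model' (both the path and the seed terms are placeholders):

expanded = term_expansion('pubmed_w2v.model', ['escherichia coli', 'cholera'], knn=10)
print(expanded)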
Example No. 15
    def __init__(self,*args,**kwargs):
        super(generic_document_score, self).__init__(*args,**kwargs)

        f_w2v = os.path.join(
            kwargs["embedding"]["output_data_directory"],
            kwargs["embedding"]["w2v_embedding"]["f_db"],
        )

        # Load the model from disk
        self.M = Word2Vec.load(f_w2v)
        self.shape = self.M.syn0.shape
        
        # Build the dictionary
        vocab_n = self.shape[0]
        self.word2index = dict(zip(self.M.index2word,range(vocab_n)))
        
        # Set parallel option (currently does nothing)
        self._PARALLEL = kwargs["_PARALLEL"]

        # Load the negative weights
        if "negative_weights" in kwargs:
            neg_W = kwargs["negative_weights"]
            self.neg_W = dict((k, float(v)) for k,v in neg_W.items())
        else:
            self.neg_W = {}
Example No. 16
def query_word_similarity(model_file, word1, word2):
  seterr(all='raise')  # don't ignore numpy errors

  #load model from given file
  model = Word2Vec.load(model_file + '.model')
  similarity = model.similarity(word1,word2)
  logging.info("similarity of \'%s\' and \'%s\' is %f" % (word1,word2,similarity))
Example No. 17
    def __init__(self, *args, **kwargs):

        '''
        Computes various measures of central tendency of a document.
        For Z_X scores, the raw word tokens are summed over the partition
        function. For I_X scores, the same statistics are computed over
        the similarity of all word pairs for words with top 10% Z values.
        This will precompute the partition function if it doesn't exist.
        '''
        cfg_embed = kwargs["embedding"]
        cfg_score = kwargs["score"]

        f_w2v = os.path.join(
            cfg_embed["output_data_directory"],
            cfg_embed["w2v_embedding"]["f_db"],
        )

        f_partition_function = os.path.join(
            cfg_embed["output_data_directory"],
            cfg_score["document_log_probability"]["f_partition_function"],
        )
        
        if not os.path.exists(f_partition_function):
            self.create_partition_function(f_w2v, f_partition_function)

        self.Z = self.load_partition_function(f_partition_function)
        self.scores = []

        val = cfg_score["document_log_probability"]["intra_document_cutoff"]
        self.intra_document_cutoff = float(val)

        self.model = Word2Vec.load(f_w2v)
Example No. 18
def main():
    parser = argparse.ArgumentParser(
        description='Python Word2Vec Cluster')

    parser.add_argument('model',
                        action='store',
                        help='Name of word2vec binary modelfile.')

    parser.add_argument('-o', '--out',
                        action='store',
                        default='model.pkl',
                        help='Set output filename.')

    parser.add_argument('-k', '--K',
                        action='store',
                        type=int,
                        default=500,
                        help='Num of classes on KMeans.')

    parser.add_argument('-p', '--pre-trained-model',
                        action='store',
                        default=None,
                        help='Use pre-trained KMeans Model.')

    parser.add_argument('-w', '--words-to-pred',
                        action='store',
                        nargs='+',
                        type=str,
                        default=None,
                        help='List of word to predict.')

    args = parser.parse_args()

    model = Word2Vec.load(args.model)

    if not args.pre_trained_model:
        X = make_dataset(model)
        classifier = train(X, args.K)
        joblib.dump(classifier, args.out)
        reduced =  reduce_dems(X)
        plot(classifier, reduced)

    else:
        classifier = joblib.load(args.pre_trained_model)

    if args.words_to_pred:

        X = [model[word] for word in args.words_to_pred if word in model]
        classes = classifier.predict(X)

        result = []
        i = 0
        for word in args.words_to_pred:
            if word in model:
                result.append(str(classes[i]))
                i += 1
            else:
                result.append(str(-1))
        print(' '.join(result))
Example No. 19
def wordclasscification():
    model=Word2Vec.load('corpus/mergedtrained40iter1.model')
    modelfield=Word2Vec.load('corpus/fieldtrained40.model')
    modelpre=Word2Vec.load('corpus/pretrain40.model')
    # wordlist=[u"喝酒",u"竞赛",u"原生",u"警察",u"离婚",u"单身"]
    with open('corpus/wordlabelcorpuslarge.txt') as fp:
        with open('corpus/wordneulabelsepe3','w') as file:
            for i in fp:
                # print i[:-1]
                try:
                    word=unicode(i[:-1])
                    upperline=0.016
                    floor=0.008 #0.01 0.013
                    upperlinefield=0.06
                    floorfield=0.02
                    upperlinepre=0.019
                    floorpre=0.018
                    try:
                        sub=(model.similarity(word,u"好")+model.similarity(word,u"快乐")+model.similarity(word,u"开心"))/3.0-(model.similarity(word,u"坏")+model.similarity(word,u"悲伤"))/2.0
                        if sub>upperline:
                            modellabel=1
                        elif sub<floor:
                            modellabel=-1
                        else:
                            modellabel=0
                        sub=(modelfield.similarity(word,u"好")+modelfield.similarity(word,u"快乐")+modelfield.similarity(word,u"开心"))/3.0-(modelfield.similarity(word,u"坏")+modelfield.similarity(word,u"悲伤"))/2.0
                        if sub>upperlinefield:
                            modelfieldlabel=1
                        elif sub<floorfield:
                            modelfieldlabel=-1
                        else:
                            modelfieldlabel=0
                        sub= (modelpre.similarity(word,u"好")+modelpre.similarity(word,u"快乐")+modelpre.similarity(word,u"开心"))/3.0-(modelpre.similarity(word,u"坏")+modelpre.similarity(word,u"悲伤"))/2.0
                        if sub>upperlinepre:
                            modelprelabel=1
                        elif sub<floorpre:
                            modelprelabel=-1
                        else:
                            modelprelabel=0
                        file.write(i[:-1]+' '+str(modellabel)+' '+str(modelfieldlabel)+' '+str(modelprelabel)+'\n') 
                    except KeyError:
                        print 'no key'
                        continue
                except UnicodeDecodeError:
                    print 'unicode error'
                    continue
Example No. 20
def main():
    # te()
    # teword()
    # intersect(40)
    # setwordwindow(40)
    # Word2Vec.load_word2vec_format('corpus/initindex40',binary=False)
    model=Word2Vec.load('corpus/mergedtrained40iter1.model')

    dis(model)
Example No. 21
def intersect(vectorsize):
    model=Word2Vec.load('corpus/fieldtrained'+str(vectorsize)+'.model')
    # setwordwindow(vectorsize)
    print 'finish load'
    Word2Vec.intersect_word2vec_format(model,'corpus/initindex'+str(vectorsize),binary=False)
    print 'finish intersect'
    model.save('corpus/merged'+str(vectorsize)+'.model')
    model.save_word2vec_format('corpus/merged'+str(vectorsize), binary=False)
    print 'finish save'
Example No. 22
def fieldtrain(vectorsize):
    model=Word2Vec.load('corpus/pretrain'+str(vectorsize)+'.model')
    print 'finish load'
    sentences=LineSentence('corpus/fieldcorpus')
    model.train(sentences)
    print 'finish fieldtrain'
    model.save('corpus/fieldtrained'+str(vectorsize)+'.model')
    model.save_word2vec_format('corpus/fieldtrained'+str(vectorsize), binary=False)
    print 'finish save'
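fieldtrain() continues training from a model produced by a pretraining step that is not shown here. A minimal sketch of that step under the same old gensim API, assuming a hypothetical 'corpus/pretraincorpus' file:

from gensim.models.word2vec import Word2Vec, LineSentence

def pretrain(vectorsize):
    sentences = LineSentence('corpus/pretraincorpus')  # hypothetical corpus path
    model = Word2Vec(sentences, size=vectorsize, min_count=5, workers=4)
    model.save('corpus/pretrain' + str(vectorsize) + '.model')
    model.save_word2vec_format('corpus/pretrain' + str(vectorsize), binary=False)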
Example No. 23
	def __init__(self, num_topics, window_size, dim_size, model_folder='../Data/models'):
		models_file_template = model_folder+"/{model}_{run_id}.{filetype}"
		self._run_id = "K{topics}_W{window}_D{dims}".format(topics=num_topics, window=window_size, dims=dim_size)
		w2v_filename = models_file_template.format(model='w2v', run_id=self._run_id, filetype='gensim')
		gmm_filename = models_file_template.format(model='gmm', run_id=self._run_id, filetype='pkl')
		self._w2v_model = Word2Vec.load(w2v_filename)
		self._gmm_model = joblib.load(gmm_filename)

		self.index2word = self._w2v_model.index2word
Example No. 24
def main2():
    bow = cPickle.load(open('data/bow.pkl'))
    M = 10
    m = Word2Vec.load('data/word2vecmodels/model%d.mm' % (M))
    word_to_vec = np.array([m[bow[i]] for i in xrange(len(bow))])

    # word_to_vec = cPickle.load(open('data/word_to_vec_pkl'))
    # print word_to_vec[0][5]
    # print word_to_vec[0][6]
    np.savetxt('output/word2vec_vectors.10d', word_to_vec, delimiter=' ')
Example No. 25
def get_model(model_num, model_names):
    
    
    if model_num < 10:
        model = Word2Vec.load(model_path + model_names)
    elif model_num < 99:
        model = Doc2Vec.load(model_path + model_names)
    else:
        model = Word2Vec.load_word2vec_format(model_path + model_names, binary=True)  # C text format
    return model
Example No. 26
def retrain(orig_model_name, sentences, corpus_name, iter=10):
    orig_model_path = special_dir / orig_model_name
    model = Word2Vec.load(orig_model_path.as_posix())
    nb_sentences = len(sentences)
    (special_dir / corpus_name).mkdir(exist_ok=True)
    for i in range(1, iter + 1):
        dest_name = "{}_{}_{}".format(orig_model_name, corpus_name, i)
        dest_path = special_dir / corpus_name / dest_name
        model.train(sentences, total_examples=nb_sentences)
        model.save(dest_path.as_posix())
Example No. 27
def filterLists(engList, freList):
    freModel = Word2Vec.load("../models/defFrePunct.model")
    engRetList = []
    freRetList = []
    for engWord, freWord in zip(engList, freList):
        try:
            freModel[freWord.lower()]
            engRetList.append(engWord)
            freRetList.append(freWord)
        except:
            continue
            
    return engRetList, freRetList
Example No. 28
    def __init__(self,train_data,dev_data,test_data):
        self.train_data=train_data
        self.dev_data=dev_data
        self.test_data=test_data

        # Hyper-parameters
        self.learningRate=0.01
        self.trainSize=2000
        self.testSize=1000
        self.totalSize = self.trainSize + self.testSize
        self.maxEpochs=10000
        self.num_processed=-1

        self.w2v_model=Word2Vec.load('./data/word2vec/w2v.model')
Example No. 29
def load_embeddings():
    '''
    Loads the gensim word embedding model.
    '''
    config = simple_config.load("embedding")
    
    from gensim.models.word2vec import Word2Vec

    f_w2v = os.path.join(
        config["output_data_directory"],
        config["w2v_embedding"]["f_db"],
    )

    return Word2Vec.load(f_w2v)
Example No. 30
def getDistRep(words,modelPath,dims):
    '''
    Takes a list of words and returns distributed representation of words
    according to the model provided 
    '''
    # Load model 1
    model = Word2Vec.load(modelPath)
    numWords = len(words)
    retMat = np.zeros((numWords,dims))
    
    for idx, word in enumerate(words):
#         print word, chardet.detect(word)
        retMat[idx] = model[word]
        
    return retMat
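A hypothetical call to getDistRep, assuming a 100-dimensional model saved at 'w2v.model' and that both words are in its vocabulary:

rep = getDistRep([u'good', u'bad'], 'w2v.model', 100)
print rep.shape   # (2, 100)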
Example No. 31
    def word_2_vec(self):
        print '向量化 start'
        '''Format the input data'''
        sentences = []
        sentences_word = []
        with open(self.__input_url, 'r') as f:
            for line in f.readlines():
                line = line.strip().decode('utf-8')  # strip the trailing '\n'
                sentences.append(line)
                line = line.split(' ')
                sentences_word.append(line)
        ''' Build the tf-idf matrix '''
        vectorizer = TfidfVectorizer()
        tf_idf = vectorizer.fit_transform(sentences)
        word = vectorizer.get_feature_names()  # returns the list of words
        tf_idf = tf_idf.toarray()  # returns the array of tf-idf values
        print '句数:%d 词数:%d' % (len(sentences_word), len(word))
        '''Collect the tf-idf value of each word in each sentence'''
        sentences_tf_idf = []
        for si, sv in enumerate(sentences_word):
            for wi, wv in enumerate(sv):
                sentences_tf_idf.append([])
                if wv in word:
                    sentences_tf_idf[si].append(tf_idf[si][word.index(wv)])
                else:
                    sentences_tf_idf[si].append(0.0)
                    # print si + 1, len(sentences_tf_idf[si]), sentences_tf_idf[si]
        '''Train the word-vector model'''
        word2vec_size = 2000
        if not os.path.exists('./source/model'):
            os.mkdir('./source/model')
        if os.path.exists('./source/model/word.model'):
            model = Word2Vec.load('./source/model/word.model')
            print 'model loaded success'
        else:
            model = Word2Vec(sentences_word, size=word2vec_size, min_count=0)
            model.save('./source/model/word.model')
            print 'model saved success to ./source/model/word.model'
        '''Build the per-word vectors and the sentence vectors'''
        sentences_word_vec = []
        sentences_vec = []
        for si, sv in enumerate(sentences_word):
            sum_x = np.array([0.0 for x in range(0, word2vec_size)])
            for wi, wv in enumerate(sv):
                sentences_word_vec.append([])
                temp = []
                if wv in model.wv:
                    for ci, cv in enumerate(model.wv[wv]):
                        temp.append(cv)
                else:
                    temp.append(0.0)
                # sentence vector = sum of (tf_idf * word vector)
                sentences_word_vec[si].append(
                    np.array(temp) * sentences_tf_idf[si][wi])
                sum_x += np.array(temp)
            # print si + 1, len(sentences_word_vec[si]), sum_x
            sentences_vec.append(sum_x)

        print '向量化 done\n'
        '''K-means clustering'''
        print '聚类 start'
        # run KMeans
        clf = KMeans(n_clusters=7)
        s = clf.fit(sentences_vec)
        print s
        print '聚类 done\n'

        # the 7 cluster centers
        # print 'centers', clf.cluster_centers_

        # the cluster each sample belongs to
        # print len(clf.labels_), clf.labels_

        # used to judge whether the number of clusters is suitable; smaller inertia means better clustering; pick the elbow point
        # print clf.inertia_

        # run prediction
        # print clf.predict(sentences_vec)

        # # save the model
        # joblib.dump(clf, 'c:/km.pkl')
        #
        # # load the saved model
        # clf = joblib.load('c:/km.pkl')

        # used to judge whether the number of clusters is suitable; smaller inertia means better clustering; pick the elbow point
        # clfinertia = []
        # for i in range(5, 30):
        #     clf = KMeans(n_clusters=i)
        #     s = clf.fit(sentences_vec)
        #     clfinertia.append(clf.inertia_)
        #     print i, clf.inertia_
        #
        # clfinertia_sum = 0
        # con = 4000000
        # for i in range(0, 23):
        #     # sum_i = math.atan(
        #     #     clfinertia[i] / con * (i + 3) - clfinertia[i + 1] / con * (i + 3)) - math.atan(
        #     #     clfinertia[i + 1] / con * (i + 3) - clfinertia[i + 2] / con * (i + 3))
        #     sum_i = clfinertia[i] - clfinertia[i + 1] - (clfinertia[i + 1] - clfinertia[i + 2])
        #
        #     if clfinertia_sum < sum_i:
        #         clfinertia_sum = sum_i
        #         print i + 6, sum_i
        '''Write the per-class tf-idf values and the per-class files'''
        print '输出分类 start'
        word_class_vec = [[] for x in range(0, len(clf.cluster_centers_))]
        fr = open('./source/thulac_out.txt', 'r')
        line_x = []
        for line in fr.readlines():
            line_x.append(line)
        for i in range(0, len(clf.cluster_centers_)):
            class_filename = './source/classes/class_%d.txt' % i
            if os.path.exists(class_filename):
                os.remove(class_filename)
        for i in range(0, len(clf.labels_)):
            class_i = clf.labels_[i]
            word_class_vec[class_i].append(tf_idf[i])
            class_filename = './source/classes/class_%d.txt' % class_i
            with open(class_filename, 'a') as fw:
                fw.write(str(line_x[i]))
                # print line_x[i]
        print '输出分类 保存到 ./source/classes/'
        fr.close()
        print '输出分类 done\n'
        '''Count keyword frequencies'''
        print '统计词频 start'
        word_n = [[] for x in range(0, len(clf.cluster_centers_))]
        if not os.path.exists('./source/word_n'):
            os.mkdir('./source/word_n')
        for i in range(0, len(clf.cluster_centers_)):
            word_n[i] = [0.0 for x in range(0, len(word))]
            for si, sv in enumerate(word_class_vec[i]):
                for wi, wv in enumerate(sv):
                    word_n[i][wi] += wv
            word_n_filename = './source/word_n/word_n%d.txt' % i
            if os.path.exists(word_n_filename):
                os.remove(word_n_filename)
            for si, sv in enumerate(word_n[i]):
                if int(sv) > 0:
                    with open(word_n_filename, 'a') as fw:
                        fw.write('%s %d\n' % (word[si], int(sv)))
                        # print len(word_n[i])
            print '输出词频 保存到 %s' % word_n_filename
        print '统计词频 done\n'
Example No. 32
    output = pd.DataFrame({'id': df.id, 'sentiment': result})
    output.to_csv(os.path.join('.', 'data', file_name), index=False)
    output.head()
    del df
    del test_data_features


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # load the stop words
    eng_stopwords = set(stopwords.words('english'))
    # eng_stopwords = {}.fromkeys([line.rstrip() for line in open('../stopwords.txt')])
    # load the previously trained Word2Vec model
    model_name = '300features_40minwords_10context.model'
    model = Word2Vec.load(os.path.join('.', 'models', model_name))
    # load the data
    df = load_dataset('labeled_train')
    # convert the raw data into word vectors
    train_data_features = df.review.apply(to_review_vector)
    # train the classifier
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest = forest.fit(train_data_features, df.sentiment)
    print('Train done.')
    # free the memory-heavy variables
    del df
    del train_data_features
    # run prediction
    predict(forest)
    print('Predict done.')
Example No. 33
def load_w2v(dim=100):  # load the word vectors
    if dim == 100:
        return Word2Vec.load('../data/wordvec/model100_20180703')
    return None
Example No. 34
DATA_CONFIGS = 'data_configs.json'
SEQ_CONFIGS = 'seq_configs_bt.json'

# Train label save file name
TRAIN_LABEL_DATA = 'train_label.npy'
TRAIN_LABEL_SMALL = 'train_label_small.npy'
TEST_LABEL_DATA = 'test_label.npy'
TEST_LABEL_SMALL = 'test_label_small.npy'

# pre-trained model load
d2v_model_name = './model_save/embedding_model/Doc2vec_new.model'
w2v_model_name = './model_save/embedding_model/Word2vec1.model'
pre_trained_name = './model_save/embedding_model/trained_word2vec1.model'

doc_vectorizer = Doc2Vec.load(d2v_model_name)
word_vectorizer = Word2Vec.load(w2v_model_name)
pre_trained_w2v = Word2Vec.load(pre_trained_name)

train_X = np.load(open(DATA_IN_PATH + TRAIN_INPUT_DATA, 'rb'))
test_X = np.load(open(DATA_IN_PATH + TEST_INPUT_DATA, 'rb'))

if label_size == 'big':
    train_Y = np.load(open(DATA_IN_PATH + TRAIN_LABEL_DATA, 'rb'))
    train_YS = tf.one_hot(train_Y, 43)
    test_Y = np.load(open(DATA_IN_PATH + TEST_LABEL_DATA, 'rb'))
    test_YS = tf.one_hot(test_Y, 43)
else:
    train_Y = np.load(open(DATA_IN_PATH + TRAIN_LABEL_SMALL, 'rb'))
    train_YS = tf.one_hot(train_Y, 455)
    test_Y = np.load(open(DATA_IN_PATH + TEST_LABEL_SMALL, 'rb'))
    test_YS = tf.one_hot(test_Y, 455)
Example No. 35
    tsne = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    # display scatter plot
    plt.scatter(x_coords, y_coords)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.xlim(x_coords.min() + 0.00005, x_coords.max() + 0.00005)
    plt.ylim(y_coords.min() + 0.00005, y_coords.max() + 0.00005)
    plt.show()

model=Word2Vec.load("ConversationalService/Twitter_Sentiment_model_W2V")
#model=Word2Vec.load("ConversationalService/updatedInsurance_word2vec_v3_18650_tri1")
#display_closestwords_tsnescatterplot(model, 'pay')

#vector building

def ClassifierOnload():
    vec = np.zeros(model.wv.syn0.shape[1]).reshape((1, model.wv.syn0.shape[1]))
    train_vecs_w2v = vec
    for utterence in list(dataset['Query']):
        print(utterence)
        words = Data_Cleaner(str(utterence))
        count = 0
        for word in words:
            try:
                print(word)
Example No. 36
    modified_str = Date2Str(modified)
    created_str = Date2Str(created)

    # Build the article-number list (number_list), the sentences (docs), and the vocabulary (words)
    print('Preprocessing ...')
    docs, number_list, words = model_scdv.Preprocess_Mecab(number, data)

    # Train the Word2Vec model
    path = glob.glob('./model/*.model')
    if (len(path) == 0):

        path = glob.glob('./pretrained_model/*.model')
        print('Loading pre-trained W2V model (' + str(path[0]) + ')...')

        model = Word2Vec.load(path[0])
        print('Updating specified word vectors by Word2Vec...')
        model.train(sentences=docs,
                    total_examples=len(docs),
                    total_words=len(words),
                    word_count=len(model.wv.index2word),
                    epochs=1000)

        model.save("./model/" + dir_name + "_model.model")

    else:
        path = glob.glob('./model/*.model')
        print('Loading W2V model (' + str(path[0]) + ')...')
        model = Word2Vec.load(path[0])

    # Get the needed word vectors from the W2V model
Example No. 37
# for i in train_labels:
#     train_labels_trigrams.append(i)
#     train_labels_trigrams.append(i)
#     train_labels_trigrams.append(i)
# write_new_file(1,2,test_sents)
# print("DONE")
# print get_Ngrams("This sentence is for testing",1,3)
vectorizer =TfidfVectorizer(lowercase=True,max_features=max_features,ngram_range=(1,1),stop_words='english')
X_train = vectorizer.fit_transform(train_sents_bigrams[:num_of_docs])
vocab_w2v = vectorizer.get_feature_names()
X_test = vectorizer.transform(test_sents_bigrams)
print 'obtained a vocab of len: {} from the training + testing set'.format(len(vocab_w2v))
# model = Word2Vec([i.translate(string.maketrans('\n',' ')).split() for i in wv2_train[:num_of_docs]],size=w2v_vect_dim,min_count=1)
# model.save('C:/Users/admin/FYP/modelBigrams')
# print("DONE")
model = Word2Vec.load('modelBigrams')
w2v = dict(zip(model.wv.index2word, model.wv.syn0))
# bow_classify (X_train,train_labels,X_test,test_labels)

if deep_kernel_mode == 'diag':
    # word kernel is a DIAG matrix
    word_kernel = csr_matrix((len(vocab_w2v),len(vocab_w2v)))
    for i,w in enumerate(vocab_w2v):
        print i
        word_vec = w2v.get(w,np.zeros(shape=(w2v_vect_dim,)))
        word_kernel[i,i] = word_vec.dot(word_vec.T)

elif deep_kernel_mode == 'pairwise':
    # word kernel is pairwise similarity
    # word_vect_as_in_vocab = np.zeros(shape=(len(vocab_w2v), w2v_vect_dim))
    word_vect_as_in_vocab = csr_matrix((len(vocab_w2v), int(w2v_vect_dim)))
Example No. 38
#python Word2Vec_AverageVectorsUtilities.py E:\semeval2016-task3-caq\qatarliving\qatarliving_qc_size100_win10_mincnt5_with_sent_repl_iter1.word2vec.bin
if __name__ == '__main__':
    import logging
    from gensim.models.word2vec import Word2Vec
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    #read publications file
    if len(sys.argv) > 1:
        word2vec_file_to_load = sys.argv[1]
        print('Word2vec file:\n\t%s' % word2vec_file_to_load)
    else:
        print('Error: missing input file parameter')
        quit()

    model = Word2Vec.load(word2vec_file_to_load)
    index2word = set(model.index2word)

    word2vec_num_features = len(model.syn0[0])
    print "Feature vectors length:%s" % word2vec_num_features
    print "Model syn0 len=%d" % (len(model.syn0))

    question_body = u'is there any place i can find scented massage oils in qatar?'
    answers = [u'Yes. It is right behind Kahrama in the National area.',\
                    u'whats the name of the shop?',\
                    u'It s called Naseem Al-Nadir. Right next to the Smartlink shop. You ll find the chinese salesgirls at affordable prices there.',\
                    u'dont want girls;want oil',\
                    u'Try Both ;) I am just trying to be helpful. On a serious note - Please go there. you ll find what you are looking for.',\
                    u'you mean oil and filter both',\
                    u'Yes Lawa...you couldn t be more right LOL',\
                    u'What they offer?',\
Example No. 39
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models.word2vec import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
import time

start = time.time()
train_articles_df = pd.read_pickle('train_articles_df.pkl')
train_answers_df = pd.read_pickle('train_answers_df.pkl')
train_articles_df = train_articles_df.drop(['id'], 1)
train_answers_df = train_answers_df.drop(['id'], 1)
w2v_model = Word2Vec.load("w2v2019-02-10-23_19_06.model")

train_df = pd.concat([train_answers_df, train_articles_df], axis=1)
train_df = train_df.sample(frac=1)
# print(train_df.head)

embedding_matrix = np.zeros(
    (len(w2v_model.wv.vocab.items()) + 1, w2v_model.vector_size))
word2idx = {}

vocab_list = [(word, w2v_model.wv[word])
              for word, _ in w2v_model.wv.vocab.items()]
for i, vocab in enumerate(vocab_list):
    word, vec = vocab
    embedding_matrix[i + 1] = vec
    word2idx[word] = i + 1
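The snippet imports Keras layers but stops before wiring them up. A sketch of how the embedding_matrix built above would typically feed an Embedding layer (MAX_SEQ_LEN and the layer sizes are assumed values, not from the original code):

MAX_SEQ_LEN = 200  # assumed value
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=w2v_model.vector_size,
                            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                            input_length=MAX_SEQ_LEN,
                            trainable=False)
clf = Sequential([embedding_layer, GRU(64), Dense(2, activation='softmax')])
clf.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])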
Example No. 40
# -*- coding: utf-8 -*-
# The text file must be UTF-8 without a BOM
from gensim.models.word2vec import Word2Vec
import gensim

model = Word2Vec.load('./word2vecModel/word_embedding.model')
#model = gensim.models.KeyedVectors.load_word2vec_format('C:\\Users\\lanxum\\Desktop\\mymodel.model')  # the 3 files go together: Word60.model   Word60.model.syn0.npy   Word60.model.syn1neg.npy
print("read model successful")

word_list = [
    '教育',
    '不存在的词',
    '的',
    '我',
    '你',
    '他',
    '个',
    '1',
    '完成',
    '吃',
    '苹果',
    '香蕉',
    '词汇',
    '物理',
    '地球',
    '黑死病',
    '瘟疫',
    '',
]
for word in word_list:
    if word in model.wv.index2word:
Example No. 41
    # Shuffle final reviews and labels
    combined_lists = zip(final_reviews, final_labels)
    np.random.shuffle(combined_lists)
    final_reviews[:], final_labels[:] = zip(*combined_lists)

    print "Returning %d funny reviews and a total of %d reviews" % (
        num_funny_reviews, len(final_reviews))

    return (final_reviews, final_labels)


WORD2VEC_MODEL = "w2v_70_parts_100_vector_4_window"
PARTITIONS_TRAINING = range(1, 30)  #15
PARTITIONS_TESTING = range(50, 53)  #22

w2vmodel = Word2Vec.load(WORD2VEC_MODEL)

reviews_train, _, funny_votes_train, _, _ = BaseBowRegressor.get_reviews_data(
    PARTITIONS_TRAINING)
reviews_train, labels_train = give_balanced_classes(reviews_train,
                                                    funny_votes_train)

print "Tokenizing"
NUM_ELEMENTS_TRAIN = None
NUM_ELEMENTS_TEST = None
reviews_tokens_train = [
    language.tokenize_document((i, unicode(txt)))
    for (i, txt) in enumerate(reviews_train[:NUM_ELEMENTS_TRAIN])
]

X_train = tokens_to_word_vectors(reviews_tokens_train, w2vmodel)
Example No. 42
def map_text_list_to_embedding(text_list, label_for_text_list, num_labels,
                               label_to_id):
    """

    Parameters
    ----------
    text_list: list of str
        List of text
    label_for_text_list: list of str
        List of labels, which is the ground truth for each text on the text_list
    num_labels:
        Number of labels
    label_to_id: dict
        Label to integer id mapping

    Returns
    -------
    x: ndarray
        Numpy array of mean word embeddings for each text.
    y: ndarray
        Numpy array of indices representing labels
    missing_words: set
        Set of words not in the Word2Vec model's dictionary.
    """
    model = Word2Vec.load(MODEL_PATH)
    missing_words = set()
    x_list = list()
    y_list = list()

    total_found_in_dict = 0
    total_not_in_dict = 0
    for i, text in enumerate(text_list):
        log.debug("Processing post: [%d]" % (i + 1))
        words_in_text = map_text_to_word_list(text)

        word_v_list = list()
        for w in words_in_text:
            try:
                v = model[w]
            except KeyError:
                missing_words.add(w)
                #log.warning("Skipping %s" % (w))
                total_not_in_dict += 1
                continue

            word_v_list.append(v)
            total_found_in_dict += 1

        if len(word_v_list) == 0:
            # log.warning("Did not find any words in vocabulary.  Skipping the text.")
            continue

        # For now, do not change non-zero element to 1.
        label_id = label_to_id[label_for_text_list[i]]
        label_id = keras.utils.to_categorical(label_id,
                                              num_labels).astype(np.float32)
        label_id = label_id.reshape(1, num_labels)

        # Squish word_id_list
        word_v_np = np.array(word_v_list)
        word_count = word_v_np.shape[0]
        word_v_mean = np.sum(word_v_np, axis=0) / word_count
        word_v_sum = np.sum(word_v_np, axis=0)

        #log.info("word_v_mean.shape")
        #log.info(word_v_mean.shape)

        x_list.append(word_v_mean)
        #        x_list.append(word_v_sum)

        y_list.append(label_id)

    x = np.array(x_list)
    print(x.shape)
    y = np.concatenate(y_list)

    assert x.shape[0] == y.shape[0]

    log.info("Number of words found in dict: %d" % (total_found_in_dict))
    log.info("Number of words not found in dict: %d" % (total_not_in_dict))

    return x, y, missing_words
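A hypothetical call, assuming MODEL_PATH points at a trained gensim model and a two-label setup (the texts and labels here are placeholders):

texts = ["free money now", "see you at lunch"]
labels = ["spam", "ham"]
label_to_id = {"spam": 0, "ham": 1}
x, y, missing = map_text_list_to_embedding(texts, labels, 2, label_to_id)
print(x.shape, y.shape, len(missing))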
Example No. 43
    def load_model(self, fpath):

        embeddings_file = fpath
        return Word2Vec.load(embeddings_file)
Example No. 44
import torch.nn as nn
import matplotlib.pyplot as plt
from gensim.models.word2vec import Word2Vec
import numpy as np
import pandas as pd
import time

from model_compare import ASTNN

TRAINING_SET_SIZE = 30000
VALIDATION_SET_SIZE = 10000
TEST_SET_SIZE = 10000

print('Reading data...')

w2v = Word2Vec.load('./data/c/w2v_128').wv
embeddings = torch.tensor(np.vstack([w2v.vectors, [0] * 128]))

programs = pd.read_pickle('./data/c/id_code_label_ast_(index_tree).pkl')

training_set = programs[:TRAINING_SET_SIZE]
validation_set = programs[TRAINING_SET_SIZE:TRAINING_SET_SIZE +
                          VALIDATION_SET_SIZE]
test_set = programs[TRAINING_SET_SIZE + VALIDATION_SET_SIZE:TRAINING_SET_SIZE +
                    VALIDATION_SET_SIZE + TEST_SET_SIZE]


def get_batch(dataset, i, batch_size):
    return dataset.iloc[i:i + batch_size]
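A hypothetical training loop over the training_set defined above, using get_batch (BATCH_SIZE is an assumed value):

BATCH_SIZE = 64  # assumed value
for i in range(0, len(training_set), BATCH_SIZE):
    batch = get_batch(training_set, i, BATCH_SIZE)
    # each batch is a pandas DataFrame slice; feed it to the ASTNN model here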

Example No. 45
def most_similar():
    model = Word2Vec.load("./result/embedding.model")
    print("용돈과 관련된 키워드 : ", model.most_similar("용돈"))
    print("졍이와 관련된 키워드 : ", model.most_similar("졍이"))
    print("쭈니와 관련된 키워드 : ", model.most_similar("쭈니"))
Example No. 46
from gensim.models.word2vec import Word2Vec
from scipy import spatial
from sklearn.metrics import confusion_matrix

# set up the working directory
root_path = ''
pd.set_option('display.width', 1000)  # set the display width
pd.set_option('display.max_columns', 1000)  # max columns to display
pd.set_option('display.max_rows', 1000)  # max rows to display

# set up the jieba dictionaries
jieba.set_dictionary(os.path.join(root_path, 'corpus/dict.txt.big'))
jieba.load_userdict(os.path.join(root_path, 'corpus/medical_term.txt'))

# load the pre-trained model
med_model = Word2Vec.load(os.path.join(root_path, "model/med_word2vec.model"))
index2word_set = set(med_model.wv.index2word)


# Read a txt file
# input:  file_name: path to the file
# output: lines: list of the file's lines
def loadfile(file_name):
    file_path = os.path.join(root_path, file_name)
    lines = []
    with open(file_path, 'r', encoding='utf-8') as f_stop:
        for line in f_stop:
            lines.append(line.replace('\n', ''))
    return lines

Example No. 47
    if not os.path.exists(vocab_path):

        vocab_model = Word2Vec(size=embedding_size,
                               max_vocab_size=max_vocab_size,
                               min_count=min_word_count,
                               workers=2,
                               seed=2245)

        print('{0}: Building own vocabulary'.format(datetime.datetime.now()))
        desc_generator = basic_desc_generator(train_path)
        vocab_model.build_vocab(desc_generator)
        print('{0}: Saving vocabulary to {1}'.format(datetime.datetime.now(),
                                                     vocab_path))
        vocab_model.save(vocab_path)

    vocab_model = Word2Vec.load(vocab_path)

if use_google_word2vec and __name__ == "__main__":
    ## Google word2vec
    # Load pre-trained embeddings
    assert embedding_size == 300

    #Take the first bunch of words, these are sorted by decreasing count
    #so these will be the most important, and it saves a bunch of space/time
    #Save vocab for future use
    if not os.path.exists(word2vec_model_path):
        print('Loading word2vec embeddings from {0:}'.format(google_word2vec))
        model = KeyedVectors.load_word2vec_format(google_word2vec,
                                                  limit=max_vocab_size,
                                                  binary=True)
        model.init_sims(replace=True)
Example No. 48
 def __init__(self):
     self.model = Word2Vec.load('./Model/ko_en.mdl')
Example No. 49
def raiseError(error):
    return error


if __name__ == '__main__':
    global model

    #----------- Parsing Arguments ---------------
    p = argparse.ArgumentParser()
    p.add_argument("--model", help="Path to the trained model")
    p.add_argument("--binary", help="Specifies the loaded model is binary")
    p.add_argument("--host", help="Host name (default: localhost)")
    p.add_argument("--port", help="Port (default: 5000)")
    p.add_argument("--path", help="Path (default: /word2vec)")
    args = p.parse_args()

    model_path = args.model if args.model else "/home/fox/xavier_corpus/word2vec/sgns-50-tra.model"
    binary = True if args.binary else False
    host = args.host if args.host else "localhost"
    path = args.path if args.path else "/word2vec"
    port = int(args.port) if args.port else 5000
    if not args.model:
        print "Usage: word2vec-api.py --model path/to/the/model [--host host --port 1234]"
    model = w.load(model_path)
    api.add_resource(N_Similarity, path + '/n_similarity')
    api.add_resource(Similarity, path + '/similarity')
    api.add_resource(MostSimilar, path + '/most_similar')
    api.add_resource(Model, path + '/model')
    api.add_resource(ModelWordSet, '/word2vec/model_word_set')
    app.run(host=host, port=port)
Example No. 50
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import gensim.downloader as api
from pprint import pprint

# Download dataset
dataset = api.load("text8")
data = [d for d in dataset]

# Split the data into 2 parts. Part 2 will be used later to update the model
data_part1 = data[:1000]
data_part2 = data[1000:]

# Train Word2Vec model. Defaults result vector size = 100
model = Word2Vec(data_part1, min_count=0, workers=cpu_count())

# Get the word vector for given word
pprint(model['topic'])

pprint(model.most_similar('topic'))

# Save and Load Model
model.save('newmodel')
model = Word2Vec.load('newmodel')

#Update existing Word2Vec Model with the new data
model.build_vocab(data_part2, update=True)
model.train(data_part2, total_examples=model.corpus_count, epochs=model.iter)

pprint(model['topic'])
Example No. 51
        if len(data) > 900:
            pass
        if len(data) <=  maxlen:
            data = data + [fill_0] * (maxlen - len(data))
        else:
            data = data[:maxlen]
    return data

if __name__ == "__main__":
    
    CORPUSPATH = "./data/NVD/corpus/"
    VECTORPATH = "./data/vector/"
    W2VPATH = "w2v_model/wordmodel_min_iter5.model"
    
    print("turn the corpus into vectors...")
    model = Word2Vec.load(W2VPATH)
    for testcase in os.listdir(CORPUSPATH):
        print("\r" + testcase, end='')
        if testcase not in os.listdir(VECTORPATH):  
            folder_path = os.path.join(VECTORPATH, testcase)
            os.mkdir(folder_path)
        for corpusfile in os.listdir(CORPUSPATH + testcase):
            corpus_path = os.path.join(CORPUSPATH, testcase, corpusfile)
            f_corpus = open(corpus_path, 'rb')
            data = pickle.load(f_corpus)
            f_corpus.close()
            data.append(data[0])
            data[0] = generate_corpus(model, data[0])
            vector_path = os.path.join(VECTORPATH, testcase, corpusfile)
            f_vector = open(vector_path, 'wb')
            pickle.dump(data, f_vector)
Example No. 52
        the_sample_index = TestBatchWordIndex[i]
        for j in range(maxSeqLength):
            the_sample_vec.append(model.wv[allVocabList[the_sample_index[j]]])
        TestBatchWordVec.append(the_sample_vec)

    TestBatchWordVec = np.array(TestBatchWordVec)

    TestBatchLabel = np.array(TestBatchLabel)

    return TestBatchSampleIndex, TestBatchWordVec, TestBatchLabel


if __name__ == "__main__":
    print("CASDMN_Model")

    model = Word2Vec.load(corpusWord2Vect)
    Pos_Txt_Index_List = list(np.load(Pos_Txt_Index_List_Path))
    Neg_Txt_Index_List = list(np.load(Neg_Txt_Index_List_Path))

    Train_Set, Valid_Set, Test_Set = getSplitSets()

    tf.reset_default_graph()
    labels = tf.placeholder(tf.float32, [batchSize, numClasses])
    input_text = tf.placeholder(tf.float32, [batchSize, maxSeqLength, wordDim])
    input_emoji = tf.placeholder(tf.float32, [batchSize, wordDim])

    # (Bi-)RNN layer(-s)
    seq_len_ph = []
    for i in range(batchSize):
        seq_len_ph.append(maxSeqLength)
    rnn_outputs, _ = bi_rnn(GRUCell(hiddenSize),
Example No. 53
    # Parameters
    num_features = 300
    min_word_count = 1
    num_workers = 4
    window_size = 6
    subsampling = 1e-3

    # Create the model instance
    model = Word2Vec(token_list,
                     workers=num_workers,
                     size=num_features,
                     min_count=min_word_count,
                     window=window_size,
                     sample=subsampling)

    # Freeze the model and discard the unneeded output weights
    model.init_sims(replace=True)

    # Save the model
    model_name = "vk_comment_model"
    model.save(model_name)

    # Load the model
    model_name = "vk_comment_model"
    model = Word2Vec.load(model_name)

    stemmer = SnowballStemmer('russian')
    print(model.wv.similarity(stemmer.stem("поезд"), stemmer.stem("Пусан")))
    print(model.wv[stemmer.stem("поезд")])
Example No. 54
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
####################################################################
# prepare datasets
import os
import csv
import torch
import numpy as np
from gensim.models.word2vec import Word2Vec

model_path = '/home/cl/jungmin-c/japanese-word2vec-model-builder/output/word2vec.gensim.model'
lang_model = Word2Vec.load(model_path)
directory_train = "/home/cl/jungmin-c/legal-i_corpus/AIL2019/RSC_original2/"
directory_test = "/home/cl/jungmin-c/legal-i_corpus/test_hanrei/"
tag_to_ix = {}
def build_emb(directory):
    data = []
    for item in os.listdir(directory):

        datafile = open(directory + item,'r')
        reader = csv.reader(datafile)
        next(reader)
        sentences = []
        tags = []
        for row in reader:
            sentence = row[0].split()
            sentence = list(filter(lambda x: x in lang_model.wv.vocab, sentence))
Example No. 55
        print("taggedFile Not Saved.")
    # building model
    model = intent_vectorization.build_model(tagged_data)
    '''
    # path to save/load model
    model_path = "Models\intentModelArxiveWord2Vec"
    # checking if the model got saved
    '''
    if intent_vectorization.save_model(model, model_path):
        print("Model Saved")
    else:
        print("Model Not saved")

    '''
    # loading saved model
    model = Word2Vec.load(model_path + ".model")

    # testing
    test_data = "Youtube"
    test_tokenized = [
        token.lemma_ for token in intent_vectorization.nlp(test_data.lower())
        if not token.is_stop and len(token.text) > 2
    ]
    print(test_tokenized)
    v1 = model.wv.most_similar(test_tokenized)
    print("V1_infer", v1)

    # to find most similar doc using tags (returns 10 most similar docs according to cosine similarity)
    similar_doc = model.docvecs.most_similar(positive=[v1])
    print(similar_doc)
Example No. 56
    # ================================================================================
    # After finishing the training, unload useless data from the memory
    # model.init_sims(replace=True)

    # ================================================================================
    # Checkpoint file name for trained W2V model
    model_name = './Models/300features_40minwords_10text'
    # model_name='./Models/300features_50minwords_20text'

    model.save(model_name)


# train_W2V_model_and_save_checkpoint_file(sentences)

# ================================================================================
model = Word2Vec.load('./Models/300features_40minwords_10text')
# model=gensim.models.Word2Vec('./Models/300features_40minwords_10text')
# model=gensim.models.Word2Vec.load('model')

# ================================================================================
sample_words = 'man woman child kitchen'.split()
# print("sample_words",sample_words)
# ['man', 'woman', 'child', 'kitchen']

# ================================================================================
abnormal_word = model.wv.doesnt_match(sample_words)
# print("abnormal_word",abnormal_word)
# kitchen

# ================================================================================
country_names = "france england germany berlin".split()
Example No. 57
evaluation of BalancedBaggingClassifiers trained on top of them.
"""

from w2v_vectorizers import MeanEmbeddingVectorizer
from gensim.models.word2vec import Word2Vec
from sklearn.externals import joblib
from multiprocess import Pool
import pandas as pd
import glob, os

if __name__ == "__main__":
    train_data = pd.read_csv(
        "/data/SO_data/downvoter/wv_train_processed_data.csv")
    val_data = pd.read_csv("/data/SO_data/downvoter/wv_val_processed_data.csv")

    wv_models = [Word2Vec.load(f) for f in glob.glob("./word_models/*.model*")]
    path = "/data/SO_data/downvoter/vectorized_data/"

    def process_model(wv_model):
        size = wv_model.vector_size
        window = wv_model.window

        print("Vectorizing s=%d, w=%d" % (size, window))
        vectorizer = MeanEmbeddingVectorizer(wv_model)

        ext = ".w2v.s%d.w%d.pkl" % (size, window)

        print("Body train set...")
        if not os.path.isfile("".join([path, "train_body", ext])):
            joblib.dump(vectorizer.transform(train_data.body),
                        "".join([path, "train_body", ext]))
Example No. 58
def get_predict_vecs(words):
    n_dim = 300
    imdb_w2v = Word2Vec.load('../svm_data/w2v_model/w2v_model.pkl')
    train_vecs = buildWordVector(words, n_dim, imdb_w2v)
    return train_vecs
Example No. 59
from multiprocessing import Process, Value, Queue, cpu_count
from time import sleep
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.word2vec import Word2Vec
from sklearn.cluster import KMeans
from gensim.models.keyedvectors import KeyedVectors
from math import inf
import torch
import pickle


args = NLP_args(k=30, min=0.0, random=0,min_cls=5,lr=0.0005)

labels_dict=pickle.load(open("labels_dict.pkl", "rb"))

word2vec_for_kmeans_model = Word2Vec.load("word2vec_for_kmeans_model.model")

tfidf_model=pickle.load(open("tfidf_model.pkl", "rb"))

word2vec_for_rnn_model = Word2Vec.load("word2vec_for_rnn_model.model")


rnn_model = RNN(args.word2vec_vec_size_for_rnn, args.hidden_layer, len(labels_dict))
rnn_model.load_state_dict(torch.load('w2v_5_rnn_model.pth'))
rnn_model.eval()

random_forest_model=pickle.load(open("random_forest_model.pkl", "rb"))

global number_of_free_processes
number_of_free_processes = Value('i', cpu_count(), lock=True)
Example No. 60
def word2vec_train():
    model = Word2Vec.load('./vec_model/Word2vec_model.pkl')
    w2indx, w2vec = create_dictionaries(model)
    return w2indx, w2vec