Code example #1
File: wiki.py Project: VV123/NLIDB_gradient
def load_vocab_all( load=True ):

    if load==False:
        vocab_dict = {}
        reverse_vocab_dict = {}
        embedding = glove.Glove()
        vocabs = build_vocab_all()
        vocab_tokens = []
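        # index the vocabulary in both directions and wrap each word as a single-token "sentence" for embedding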
        for i,word in enumerate(vocabs):
            vocab_dict[word]=i
            reverse_vocab_dict[i]=word
            vocab_tokens.append([word])
        np.save('data/vocab_dict_all.npy',vocab_dict)
        np.save('data/reverse_vocab_dict_all.npy',reverse_vocab_dict)
        vocab_emb = embedding.embedding(vocab_tokens, maxlen=1)
        vocab_emb = vocab_emb[:,0] #retrieve embedding
        np.save('data/vocab_emb_all.npy',vocab_emb)
        print('Vocab shape:')
        print(vocab_emb.shape)
    else:
        vocab_emb=np.load('data/vocab_emb_all.npy')
        vocab_dict=np.load('data/vocab_dict_all.npy').item()
        reverse_vocab_dict=np.load('data/reverse_vocab_dict_all.npy').item()
        print('Vocab shape:')
        print(vocab_emb.shape)
    return vocab_dict,reverse_vocab_dict,vocab_emb
Code example #2
    def _do_glove(self, package, cooccurrence_dict, dimensions, alpha, x_max,
                  vocab):
        glove_start = time.time()
        model = glove.Glove(cooccurrence_dict,
                            d=dimensions,
                            alpha=alpha,
                            x_max=x_max)
        glove_time = (time.time() - glove_start)
        log.getLogger().info("glove_time  " + str(glove_time))
        glove_train_start = time.time()
        model.train(batch_size=200, workers=9)
        glove_train_time = (time.time() - glove_train_start)
        log.getLogger().info("glove_train_time  " + str(glove_train_time))
        glove_list = self.output_format(model.W, vocab)
        glove_output_key = str(dimensions) + "d_" + str(x_max) + "_" + str(
            alpha) + "_glove_output"

        if "glove_output_key" in package.any_inputs_dict.keys():
            package.any_inputs_dict[
                "glove_output_key"] = package.any_inputs_dict[
                    "glove_output_key"] + "," + glove_output_key
        else:
            package.any_inputs_dict["glove_output_key"] = glove_output_key

        package.any_analysis_dict[glove_output_key] = glove_list
        package.any_analysis_dict["gl0ve_vocab"] = vocab
Code example #3
    def build_gensim(self, docs, model=None):
        dp = DocumentPreprocessor()
        docs_tokenized = (dp.tokenizer(doc) for doc in docs)

        # Get the word co-occurrence matrix -- needs lots of RAM!!
        cooccur = glove.Corpus()
        cooccur.fit(docs_tokenized, window=10)

        # wiki_generator = lambda: (filter_text(text) for text in wiki)
        # cooccur.fit(wiki_generator(), window=10)

        # and train GloVe model itself, using 10 epochs
        if model is None:
            model = glove.Glove(no_components=600, learning_rate=0.05)
        model.fit(cooccur.matrix, epochs=10)

        doc_vectors = []
        docs_tokenized = (dp.tokenizer(doc) for doc in docs)
        for doc in docs_tokenized:
            doc_vector = np.zeros(len(model.word_vectors[0]), dtype=float)
            if len(doc):
                for word in doc:
                    try:
                        doc_vector += model[word]
                    except:
                        log.debug(
                            'Word: {} doesn\'t appear in model.'.format(word))
            else:
                log.debug('Empty document in data')
            doc_vectors.append(doc_vector)

        return np.array(doc_vectors), model
Code example #4
def _embed_list(ls, g=None, maxlen_p=20, maxlen_q=2):
    if g is None:
        g = glove.Glove()
    for line in ls:
        assert len(line.split('\t')) == 2
    questions = [ nltk.word_tokenize(line.split('\t')[0]) for line in ls]
    cols = [ nltk.word_tokenize(line.split('\t')[1]) for line in ls]
    return g.embedding(questions, maxlen=maxlen_p-1), g.embedding(cols, maxlen=maxlen_q-1)
Code example #5
File: glove_steam_review.py Project: JasSong/NLP
def train_glove(dic_comtx, dimension=100, alpha=0.75, x_max=100.0, epochs=20, batch=200):
    model = glove.Glove(dic_comtx, d=dimension, alpha=alpha, x_max=x_max)
    for epoch in range(epochs):
        err = model.train(batch_size=batch, workers=4)
        print("epoch %d, error %.3f" % (epoch, err), flush=True)

    wordvectors = model.W  # extract the word vectors

    return wordvectors
Code example #6
    def train_glove(self):
        processed_sentence = self.preprocessing(self._filename)
        model = glove.Glove(processed_sentence, d=self._n_dim, alpha=0.75, x_max=100.0)

        for epoch in range(150):
            err = model.train(batch_size=150, workers=3)

        X = model.W
        return model, X
Code example #7
    def train_glove(self):
        processed_sentence = self.preprocessing(self._filename)
        model = glove.Glove(processed_sentence, d=300, alpha=0.75, x_max=100.0)

        for epoch in range(150):
            err = model.train(batch_size=150, workers=3)

        X = model.W
        self.save_embed_file(
            'embed_{}/{}.embd'.format(self._embed_dir, self.class_name), X,
            self._labels)

        return X
Code example #8
    def train_all(self):
        """
        builds the vocab and trains the model
        :return:
        """
        documents = list(self.read_input())
        corpus = glove.Corpus()
        corpus.fit(documents, window=self.window_size)
        self.model = glove.Glove(no_components=self.vector_size, learning_rate=self.learning_rate, alpha=self.alpha)

        self.model.fit(corpus.matrix, epochs=self.iterations, no_threads=self.workers, verbose=True)
        self.dictionary=corpus.dictionary
        self.model.add_dictionary(corpus.dictionary)
Code example #9
def train(word2id, id2word, corpus, win, dim):
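    # pass an existing word->id mapping so the co-occurrence matrix rows align with word2id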
    cooccur = glove.Corpus(dictionary=word2id)
    cooccur.fit(corpus(), window=win)

    logger.info("glove model creating")
    logger.info('Dict size: %s' % len(cooccur.dictionary))
    logger.info('Collocations: %s' % cooccur.matrix.nnz)
    model = glove.Glove(no_components=dim, learning_rate=0.05)
    model.fit(cooccur.matrix, epochs=10, no_threads=5, verbose=True)
    model.add_dictionary(cooccur.dictionary)
    model.word2id = dict(
        (utils.to_unicode(w), id) for w, id in model.dictionary.items())
    model.id2word = gensim.utils.revdict(model.word2id)
    utils.pickle(model, './model/glove.model')
Code example #10
    def train_glove(self):

        cleaned_text = self.text_preprocessing()
        dic = self.count_vectorize(cleaned_text)

        model = glove.Glove(dic, d=self._n_dim, alpha=0.75, x_max=100.0)

        for epoch in range(150):
            err = model.train(batch_size=150, workers=3)
            # print("epoch %d, error %.3f" % (epoch, err), flush=True)

        X = model.W
        # self.save_embed_file('embed_{}/{}.embd'.format(self._embed_dir, self.class_name), X, self._labels)

        return model, X
Code example #11
    def generate_embedding_dictionary(docs_tokens, embedding_dim, iters, window=2, learning_rate=0.05):
        time_start = time()
        corpus_model = glove.Corpus()
        corpus_model.fit(docs_tokens, window=window)
        glove_model = glove.Glove(no_components=embedding_dim, learning_rate=learning_rate)
        glove_model.fit(corpus_model.matrix, epochs=iters, no_threads=4)
        end_time = time()
        glove_model.add_dictionary(corpus_model.dictionary)

        word_to_index = glove_model.dictionary
        index_word = glove_model.inverse_dictionary
        embedding_dictionary = {index_word[i]: vector for i, vector in enumerate(glove_model.word_vectors)}

        # embedding_dictionary["<<UNKNOWN>>"] = np.zeros(embedding_dim)

        return embedding_dictionary, embedding_dim, word_to_index, end_time - time_start
Code example #12
def glove2vec(text_sentence,
              win=10,
              noc=1,
              lr=0.05,
              epochs=10,
              nothr=1,
              verbose=True):
    corpus_model = glove.Corpus()
    corpus_model.fit(text_sentence, window=win)
    word_list = glove.Glove(no_components=noc, learning_rate=lr)
    word_list.fit(corpus_model.matrix,
                  epochs=epochs,
                  no_threads=nothr,
                  verbose=verbose)
    word_list.add_dictionary(corpus_model.dictionary)
    return word_list
Code example #13
def load_data(filepath='imdb/imdb.npz', rawpath='~/data/aclImdb', maxlen=400,
              embedding=None):
    filepath = os.path.expanduser(os.path.join('~/data', filepath))
    datapath = os.path.expanduser('~/data/imdb')
    rawpath = os.path.expanduser(rawpath)

    if os.path.exists(filepath):
        data = np.load(filepath)
        X_train, y_train = data['X_train'], data['y_train']
        X_test, y_test = data['X_test'], data['y_test']
    else:
        g = embedding
        if g is None:
            import glove
            g = glove.Glove()

        _collect_reviews()

        import nltk

        def _embedding(fpath):
            reviews = [nltk.word_tokenize(line) for line in open(fpath, 'r')]
            # maxlen-1 since we add a <bos> symbol to each sentence
            return g.embedding(reviews, maxlen=maxlen-1)

        print('\nGenerating training data')
        X_train_pos = _embedding(os.path.join(datapath, 'train-pos.txt'))
        X_train_neg = _embedding(os.path.join(datapath, 'train-neg.txt'))
        X_train = np.vstack((X_train_pos, X_train_neg))
        y_train = np.append(np.zeros(X_train_pos.shape[0]),
                            np.ones(X_train_neg.shape[0]))
        y_train = np.reshape(y_train, [-1, 1])

        print('\nGenerating testing data')
        X_test_pos = _embedding(os.path.join(datapath, 'test-pos.txt'))
        X_test_neg = _embedding(os.path.join(datapath, 'test-neg.txt'))
        X_test = np.vstack((X_test_pos, X_test_neg))
        y_test = np.append(np.zeros(X_test_pos.shape[0]),
                           np.ones(X_test_neg.shape[0]))
        y_test = np.reshape(y_test, [-1, 1])

        print('\nSaving {}'.format(filepath))
        np.savez(filepath, X_train=X_train, y_train=y_train, X_test=X_test,
                 y_test=y_test)

    return X_train, y_train, X_test, y_test
Code example #14
def main():

    if os.path.exists(FITTED_MODEL_FILENAME):
        glove_model = glove.Glove.load(FITTED_MODEL_FILENAME)
    else:
        matrix, dictionary = load_graph_adj_matrix()
        glove_model = glove.Glove(2)
        glove_model.fit(matrix, epochs=10)
        glove_model.add_dictionary(dictionary=dictionary)
        glove_model.save(FITTED_MODEL_FILENAME)

    graph_positions = {}
    for vertex_idx, vertex_name in glove_model.inverse_dictionary.items():
        vertex_pos = tuple(glove_model.word_vectors[vertex_idx])
        graph_positions[vertex_name] = vertex_pos

    pass
Code example #15
def run_multi(d, size):
    cores = multiprocessing.cpu_count()
    sentiment_doc2vec_amazon_cv(
        base_path='/datasets/amazon-data/csv/nan-removed',
        # base_path='/nfs/amazon/csv/nan-removed',
        dataset_filter=d,
        # stars=[1, 5],
        stars=[1, 2, 3, 4, 5],
        n_cv=1,
        model=glove.Glove(no_components=size, learning_rate=0.05),
        d2v_size=size,
        save_model='/models/gensim/domains',
        # save_model='/nfs/amazon/doc2vec/models',
        output_folder='/models/gensim/domains/results'
        # output_folder='/nfs/amazon/doc2vec/results'
        # n_max_unsupervised=100000
    )
Code example #16
def main():
    wheel_graph = networkx.generators.classic.wheel_graph(10)
    model = glove.Glove(2, learning_rate=0.01, alpha=0.2, max_count=1000)

    adj_matrix = networkx.adjacency_matrix(wheel_graph)
    adj_matrix = adj_matrix.toarray().astype('d')
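    # row-normalize the adjacency matrix so each vertex's weights sum to 1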
    normalized_adj_matrix = scipy.divide(adj_matrix,
                                         adj_matrix.sum(1)[:, scipy.newaxis])
    model.fit(scipy.sparse.coo_matrix(normalized_adj_matrix), epochs=1000)

    vertex_positions = {
        vertex_idx: tuple(model.word_vectors[vertex_idx])
        for vertex_idx in range(wheel_graph.order())
    }

    networkx.drawing.draw(wheel_graph, pos=vertex_positions)
    plt.savefig("asdf.png")
    pass
Code example #17
def embed_data(maxlen_p=maxlen0,maxlen_q=maxlen1,embedding=None,save=False,datapath=None,savepath=None):
	if not datapath:
		datapath = os.path.dirname(path).replace('/data','')

	if not savepath:
		savepath = os.path.dirname(path).replace('/data','')

	filepath ='bc.npz'
	filepath_X = os.path.expanduser(os.path.join(savepath, filepath))
	filepath = 'bc_label.npz'
	filepath_y = os.path.expanduser(os.path.join(savepath, filepath))
	g = embedding
	if g is None:
		g = glove.Glove()
	
	def _embedding(fpath):
		for line in codecs.open(fpath,'r','utf-8-sig'):
			# assert len(line.split('\t')) == 3 or line.startswith('#')
			assert len(line.split('\t')) == 3
		questions = [ nltk.word_tokenize(line.split('\t')[0]) for line in codecs.open(fpath,'r','utf-8-sig') if not line.startswith('#')]
		cols = [ nltk.word_tokenize(line.split('\t')[1]) for line in codecs.open(fpath,'r','utf-8-sig') if not line.startswith('#')]
		labels = [ line.split('\t')[2] for line in codecs.open(fpath,'r','utf-8-sig') if not line.startswith('#')]
		return g.embedding(questions, maxlen=maxlen_p-1), g.embedding(cols, maxlen=maxlen_q-1)


	def _read_label(fpath):
		#labels = [ line.split('\t')[2] for line in codecs.open(fpath,'r','utf-8-sig') if not line.startswith('#')]
		labels = [ int(line.split('\t')[2].strip('\n')) for line in codecs.open(fpath,'r','utf-8-sig')]
		return labels

	print('\nGenerating training/test data')
	X_train_p,X_train_q = _embedding(os.path.join(datapath, 'train_model_const.txt'))
	X_test_p,X_test_q = _embedding(os.path.join(datapath, 'test_model_const.txt'))
	X_dev_p,X_dev_q = _embedding(os.path.join(datapath, 'dev_model_const.txt'))
	X_train_ans = _read_label(os.path.join(datapath, 'train_model_const.txt'))
	X_test_ans = _read_label(os.path.join(datapath, 'test_model_const.txt'))
	X_dev_ans = _read_label(os.path.join(datapath, 'dev_model_const.txt'))

	if save:
		print('\nSaving')
		np.savez(filepath_y, y_train=X_train_ans, y_test=X_test_ans, y_dev=X_dev_ans)
		np.savez(filepath_X, X_train_qu=X_train_p, X_train_col=X_train_q, X_test_qu=X_test_p, X_test_col=X_test_q, X_dev_qu=X_dev_p, X_dev_col=X_dev_q)
		print('\nSaved!')
Code example #18
File: events.py Project: hemavakade/magichour
def glove(windows,
          num_components=16,
          glove_window=10,
          epochs=20,
          verbose=False):
    import glove
    import hdbscan
    import multiprocessing

    ws = [[template_id for template_id in w] for w in windows]
    corpus = glove.Corpus()
    corpus.fit(ws, window=glove_window)
    # TODO: Explore reasonable glove defaults
    glove_model = glove.Glove(no_components=num_components, learning_rate=0.05)
    glove_model.fit(corpus.matrix,
                    epochs=epochs,
                    no_threads=multiprocessing.cpu_count(),
                    verbose=verbose)
    glove_model.add_dictionary(corpus.dictionary)

    labels = []
    vectors = []
    # TODO: Explore how to pull data more nicely from glove
    for key in glove_model.__dict__['dictionary']:
        word_vector_index = glove_model.__dict__['dictionary'][key]
        labels.append(key)
        vectors.append(
            list(glove_model.__dict__['word_vectors'][word_vector_index]))

    # Clustering
    output_events = defaultdict(list)
    for i, val in enumerate(
            hdbscan.HDBSCAN(min_cluster_size=2).fit_predict(vectors)):
        output_events[val].append(labels[i])

    # Create event objects
    events = []
    for item in output_events:
        event = Event(id=str(uuid.uuid4()),
                      template_ids=list(map(int, output_events[item])))
        if len(event.template_ids) > 0:
            events.append(event)
    return events
Code example #19
File: glove_embedding.py Project: ana-dev/sentime
    def __init__(self, docs_tokens, emb_dim, iters, window, learn_rate):
        self.time = 0.

        self.time = time()

        corpus_model = glove.Corpus()
        corpus_model.fit(docs_tokens, window=window)
        glove_model = glove.Glove(no_components=emb_dim, learning_rate=learn_rate)
        glove_model.fit(corpus_model.matrix, epochs=iters, no_threads=4)
        glove_model.add_dictionary(corpus_model.dictionary)

        self.time = time() - self.time

        word_to_index = glove_model.dictionary
        index_word = glove_model.inverse_dictionary
        embedding_dictionary = {index_word[i]: vector for i, vector in enumerate(glove_model.word_vectors)}

        super(EmbeddingModel, self).get_from_data(embedding_dictionary, emb_dim, word_to_index, self)

        self.name = 'glove'
Code example #20
def get_glove(X):
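    # X.T @ X yields a word-word co-occurrence matrix (assuming X is a document-term matrix)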
    print("X_cooc 생성...")
    X_cooc = X.T @ X
    print("X_cooc 생성 완료")
    X_cooc.setdiag(0)
    result = X_cooc.toarray()
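    # convert the dense counts to the dict-of-dicts format expected by glove.Glove below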
    dic = {}
    for idx1, doc in enumerate(result):
        tmpdic = {}
        for idx2, word2 in enumerate(doc):
            if word2 > 0:
                tmpdic[idx2] = word2
        dic[idx1] = tmpdic
    model = glove.Glove(dic, d=200, alpha=0.75, x_max=5.0)
    for epoch in range(150):
        err = model.train(batch_size=100, workers=4, step_size=0.05)
        print("epoch %d, error %.3f" % (epoch, err), flush=True)
    gloveVectors = model.W
    print("glove vectors shape: " + str(gloveVectors.shape))

    return gloveVectors
Code example #21
    def glove_pemb(self):
        """Computes Glove embeddings from co-occurrence matrix
            and returns patient embeddings

        Return
        ------
        list
            pids list
        list
            matrix of patient embeddings
        array
            word embeddings
        """

        corpus = self.__build_corpus()
        coocc_dict = self.__build_cooccur(corpus, window_size=10)
        model = glove.Glove(coocc_dict,
                            alpha=0.75,
                            x_max=10.0,
                            d=ut.n_dim_glove,
                            seed=1234)
        logging.info("\nTraining Glove embeddings...")
        for epoch in range(ut.n_epoch_glove):
            err = model.train(batch_size=ut.batch_size_glove,
                              step_size=ut.learning_rate_glove)
            if epoch % 10 == 0:
                logging.info("epoch %d, error %.3f" % (epoch, err))
        logging.info("epoch %d, error %.3f" % (epoch, err))

        wemb = model.W + model.ContextW  # as suggested in Pennington et al.
        p_emb = []
        pid_list = []
        for pid, term in corpus.items():
            if len(term) != 0:
                pid_list.append(pid)
                p_emb.append(
                    np.mean([wemb[int(t)].tolist() for t in term],
                            axis=0).tolist())

        return pid_list, p_emb, wemb
Code example #22
File: preprocessor.py Project: pegahkmi/GloVex
def glovex_model(filepath,
                 argstring,
                 cooccurrence,
                 dims=100,
                 alpha=0.75,
                 x_max=100,
                 force_overwrite=False,
                 suffix=".glovex"):
    model_path = filepath + argstring
    model_files = glob.glob(model_path + "_epochs*" + suffix)
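    # look for previously saved models named <model_path>_epochs<N><suffix>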
    if not len(model_files) or force_overwrite:
        model = glove.Glove(cooccurrence, d=dims, alpha=alpha, x_max=x_max)
    else:
        highest_epochs = max(
            [int(f.split("epochs")[1].split(".")[0]) for f in model_files])
        logger.info(
            " ** Existing model file found.  Re-run with --overwrite_model if you did not intend to reuse it."
        )
        with open(model_path + "_epochs" + str(highest_epochs) + suffix,
                  "rb") as pro_f:
            model = pickle.load(pro_f)
    return model
Code example #23
def train_glove(sentences=None, nr_feature=None, save_name=None):
    verify_cwd()
    if sentences is None:
        print("preprocessing sentences...")
        sentences = list(
            itertools.islice(word2vec.Text8Corpus('./data/text8'), None))
        print("{} sentences found.".format(len(sentences)))
    if save_name is None:
        save_name = "./data/glove.model"
    if nr_feature is None:
        nr_feature = 200

    corpus = glove.Corpus()
    print("start fiting sentences...")
    corpus.fit(sentences, window=10)
    gl = glove.Glove(no_components=nr_feature, learning_rate=0.05)
    print("start training glove...")
    gl.fit(corpus.matrix,
           epochs=10,
           no_threads=multiprocessing.cpu_count(),
           verbose=True)
    corpus.save("./data/corpus.model")
    gl.save(save_name)
Code example #24
 def __init__(self):
     self.options = {
         '1': self.key,
         '2': self.key,
         '3': self.key,
         '4': self.key,
         '5': self.key,
         '6': self.key,
         '7': self.key,
         '8': self.key,
         '\x80': self.key,
         'R': self.release,
         'H': self.hold,
         'O': self.on
     }
     self.index_array = [
         'P1', 'P2', 'R1', 'R2', 'M1', 'M2', 'I1', 'I2', '3', '1', '2', '4',
         '8', '5', '8', '7'
     ]
     self.hold_flag = False
     self.glove = glove.Glove('11')
     self.on_increment = 0
     self.last_read_byte = '0'
     self.increment = 0
Code example #25
    def __init__(self, parse=PARSE_PATH, split='train'):

        p_length = self.phrase_length
        remove_stop = self.remove_stop
        add_stop = self.add_stop

        parse = os.path.expanduser(parse)
        rawfile = '%s_org.qu' % split
        rawfile = os.path.join(parse, rawfile)

        self.split = split
        self.key_w, self.human_info, self.street, self.city, self.county, self.region, self.rest, self.foodtype, self.rating = read_word_rest(
        )
        self.g = glove.Glove()
        self.embed()

        with open(rawfile, 'r') as f:

            res = []
            res_pair = []

            for i, line in enumerate(f):
                self.qu_pairs = []  # word pairs

                self.count = 0  # number of <f>
                self.count_c = 0  # number of <c>

                words = word_tokenize(line)

                self.w_filter = words  # the line with stop words filtered out later
                self.qu_annot = [''] * len(words)  # final result

                line = ' '.join(line.strip('\n').split(' '))

                self.stop_words = set(stopwords.words('english'))
                for rw in remove_stop:
                    self.stop_words.remove(rw)
                for aw in add_stop:
                    self.stop_words.add(aw)
                for k, w in enumerate(words):
                    if w not in self.stop_words:
                        self.w_filter[k] = w
                    else:
                        self.w_filter[k] = ''
                        if self.qu_annot[k] == '':
                            self.qu_annot[
                                k] = w  #append stop words in final result

                self.special_w_1(line)

                for le in range(p_length - 1, -1, -1):
                    for idx in range(len(self.w_filter) - le):
                        word = self.check_phrase(idx, le)
                        if word != None:
                            word = ' '.join(word)
                            self.find_const_w(word, idx, le, line)

                # find human-knowledge words before removing stop words: 'of' has to be removed, but then 'number of citizens' could no longer be matched
                for le in range(p_length - 1, -1, -1):
                    for idx in range(len(self.w_filter) - le):
                        word = self.check_phrase(idx, le)
                        if word != None:
                            word = ' '.join(word)
                            self.find_human_w(word, idx, le, line)

                # words that exactly match a key word
                for le in range(p_length - 1, -1, -1):
                    for idx in range(len(self.w_filter) - le):
                        word = self.check_phrase(idx, le)
                        if word != None:
                            word = ' '.join(word)
                            self.exact_match(word, idx, le, line)

                self.special_w_2(line)

                if len(self.qu_annot) == len(self.w_filter):
                    for aw_idx, aw in enumerate(self.qu_annot):
                        if aw == '' and '<>' not in self.w_filter[aw_idx]:
                            self.qu_annot[aw_idx] = self.w_filter[aw_idx]
                else:
                    print('-------------wrong length for result--------------')
                    print(self.qu_annot)
                    print(self.w_filter)
                    print(
                        '----------------------------------------------------')

                qu_annot = [
                    item for item in filter(lambda x: x != '', self.qu_annot)
                ]
                qu_annot = ' '.join(qu_annot)
                qu_pairs = ''.join(self.qu_pairs)

                # all key words have been picked up and labeled, but the index numbers in <f+num> and <c+num> are not in order, so reorder them
                qu_pairs, qu_annot = self.reorder(qu_pairs, qu_annot)

                res.append(qu_annot)
                res_pair.append(qu_pairs)

            print('\nSaving questions')
            with open(os.path.join(parse, '%s.qu' % self.split), 'w') as f:
                f.write('\n'.join(res))

            print('\nSaving pairs')
            with open(os.path.join(parse, '%s_sym_pairs.txt' % self.split),
                      'w') as f:
                f.write('\n'.join(res_pair))

        generate_lon(parse, split, 8)
Code example #26
USE_I18N = True

USE_L10N = True

USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.10/howto/static-files/

STATIC_ROOT = os.path.join(BASE_DIR, 'static')
STATIC_URL = '/static/'
STATICFILES_DIRS = (
                os.path.join(BASE_DIR, 'staticfiles'), # if your static files folder is named "staticfiles"
)

glove_instance = glove.Glove()
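# vectors for this model are loaded below from vectorsGloveLight.txt via load_stanford()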

print("Loading tags and images dictionary")
IMAGE_RETRIEVED_TAGS_FILE_PATH = 'tags_images.txt'
IMAGE_RETRIEVED_TAGS = dict()
with open("contentBasedSearch/"+IMAGE_RETRIEVED_TAGS_FILE_PATH, "rb") as f:
    IMAGE_RETRIEVED_TAGS = pickle.load(f)
print("Finished loading dictionary")

print(IMAGE_RETRIEVED_TAGS)

print("Loading Glove vectors")
TERM_SEARCH_ENGINE = glove_instance.load_stanford(filename="vectorsGloveLight.txt")
print("Finished Loading vectors")
Code example #27
def glove_pro(df_raw,
              sentence_id,
              word_id,
              emb_size=128,
              window=50,
              dropna=False,
              n_jobs=16,
              learning_rate=0.05,
              epoch=8,
              return_model=False):
    """
    conda create -y -n TF1.14 python=3.6 
    pip install glove_python
    ------
    test_glove = datalog.head(10000)
    sentence_id = 'user_id'
    word_id = 'industry'

    res = glove_pro(test_glove, sentence_id, word_id, emb_size=32, 
                  window=20, dropna=False, n_jobs=16, 
                  learning_rate=0.05, 
                  epoch=8,return_model=True)
    res.keys()
    res['sentence_emb_df'].info()
    res['model'].most_similar("6", number=10)

    """
    list_col_nm = f'{sentence_id}__{word_id}_list'
    if (n_jobs is None) or (n_jobs <= 0):
        n_jobs = multiprocessing.cpu_count()
    logger.info(f"========== GloVE: {sentence_id} {word_id} ==========")
    df = df_raw[[sentence_id, word_id]].copy()
    if df[sentence_id].isnull().sum() > 0:
        logger.warning("NaNs exist in sentence_id column!!")
    if dropna:
        df = df.dropna(subset=[sentence_id, word_id])
    else:
        df = df.fillna('NULL_zhangqibot')
    df = df.astype(str)
    tmp = df.groupby(sentence_id,
                     as_index=False)[word_id].agg({list_col_nm: list})
    sentences = tmp[list_col_nm].values.tolist()
    all_words_vocabulary = df[word_id].unique().tolist()
    del tmp[list_col_nm]
    gc.collect()

    matrix = glv.Corpus()
    matrix.fit(corpus=sentences, window=window)
    model = glv.Glove(no_components=emb_size,
                      learning_rate=learning_rate,
                      alpha=0.75,
                      max_count=100,
                      max_loss=10.0,
                      random_state=666)
    model.fit(matrix.matrix, epochs=epoch, no_threads=n_jobs, verbose=1)
    model.add_dictionary(matrix.dictionary)
    # get word embedding matrix
    emb_dict = {}
    for word_i in all_words_vocabulary:
        if word_i in model.dictionary:
            emb_dict[word_i] = model.word_vectors[model.dictionary[word_i]]
        else:
            emb_dict[word_i] = np.zeros(emb_size, dtype="float32")
    return {"word_emb_dict": emb_dict}
Code example #28
import glove
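# minimal example: train GloVe on a tiny co-occurrence dict of the form {word_id: {context_id: count}}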

cooccur = {0: {0: 1.0, 2: 3.5}, 1: {2: 0.5}, 2: {0: 3.5, 1: 0.5, 2: 1.2}}

model = glove.Glove(cooccur, vocab_size=3, d=50, alpha=0.75, x_max=100.0)

for epoch in range(25):
    err = model.train(batch_size=200, workers=9)
    print("epoch %d, error %.3f" % (epoch, err), flush=True)
Code example #29
import glove

cooccur = {0: {0: 1.0, 2: 3.5}, 1: {2: 0.5}, 2: {0: 3.5, 1: 0.5, 2: 1.2}}

cooccur_mat = [[1, 2, 3], [2, 1, 2], [2, 1, 3]]

# convert matrix to dict
keys = range(len(cooccur_mat))
lines = []
for i in range(len(cooccur_mat)):
    line = dict(zip(keys, cooccur_mat[i]))
    lines.append(line)

c_c_mat = dict(zip(keys, lines))

model = glove.Glove(c_c_mat, d=50, alpha=0.75, x_max=100.0)

for epoch in range(25):
    err = model.train(step_size=0.05, workers=9, batch_size=50)
    print(err)

print(model.W)
print(model.b)
Code example #30
def run_emb(datadir, level=None):
    outdir = datadir + '/level-' + level
    
    # load vocabulary and behrs (ID_SUBJ:[terms]; ID_SUBJ:Fn:[terms])
    bt_to_idx, idx_to_bt = _load_vocab(outdir, ut.file_names['vocab'])
    behrs, behrs_tf = _load_data(outdir, ut.file_names['behr'])

    terms = []
    for vec in behrs.values():
        terms.extend(vec)

    count = 0
    list_count = {}
    for idx, lab in idx_to_bt.items():
        co = terms.count(str(idx))
        list_count[lab] = co
        if co > 1:
            count += 1
    print("Number of repeated terms: {0} -- Terms with one occurrence: {1}\n".format(count, 
          len(bt_to_idx)-count))

    print('Most frequent terms (TF>20)')
    x = []
    y = []
    for lab, co in list_count.items():
        if co > 20:
            x.append(lab)
            y.append(co)
            print('%s, %d' % (lab, co))
        else:
            x.append('TF<20')
            y.append(co)
    
    # save plot term distribution
    plt.figure(figsize=(30, 20))
    plt.bar(x, y)
    plt.tick_params(axis='x', rotation=90, labelsize=10)
    plt.savefig(os.path.join(outdir, 'term20-distribution.png'))

    plt.figure(figsize=(20, 10))
    plt.bar(range(len(list_count.values())), list(list_count.values()))
    plt.tick_params(axis='x', rotation=90, labelsize=10)
    plt.savefig(os.path.join(outdir, 'term-distribution.png'))

    print('\n')

    # TF-IDF
    print('Computing TF-IDF matrix...')
    doc_list = list(map(lambda x: ' '.join(x), list(behrs.values())))
    id_subj = [id_lab for id_lab in behrs]

    vectorizer = TfidfVectorizer(norm='l2')
    tfidf_mtx = vectorizer.fit_transform(doc_list)

    print('Performing SVD on the TF-IDF matrix...')
    reducer = TruncatedSVD(n_components=ut.n_dim, random_state=123)
    svd_mtx = reducer.fit_transform(tfidf_mtx)

    # save SVD mtx
    with open(os.path.join(outdir, 'svd-mtx.csv'), 'w') as f:
        wr = csv.writer(f)
        for idx, lab in enumerate(id_subj):
            wr.writerow([lab] + list(svd_mtx[idx]))
    print('\n\n')

    # GloVe embeddings
    print('Starting computing GloVe embeddings for {0} epochs'.format(ut.n_epoch))
    corpus = _build_corpus(behrs_tf)
    coocc_dict = build_cooccur(idx_to_bt, corpus, window_size=20)

    model = glove.Glove(coocc_dict, alpha=0.75, x_max=100.0, d=ut.n_dim)
    for epoch in range(ut.n_epoch):
        err = model.train(batch_size=ut.batch_size)
        print("epoch %d, error %.3f" % (epoch, err), flush=True)

    Wemb = model.W + model.ContextW # as suggested in Pennington et al.
    p_emb = []
    id_list = []
    for id_subj, term in corpus.items():
        if len(term)!=0:
            id_list.append(id_subj)
            p_emb.append(np.mean([Wemb[int(t)].tolist() for t in term], 
                                 axis=0).tolist())
    # save subject embeddings
    with open(os.path.join(outdir, 'glove-mtx.csv'), 'w') as f:
        wr = csv.writer(f)
        for id_p, pe in zip(id_list, p_emb):
            wr.writerow([id_p] + list(pe))