Example no. 1
    def train(self, corpus, chunksize=10000, use_temp_files=True):
        """
        train the underlying linear mappings.

        @param corpus is a gensim corpus compatible format
        @param use_temp_files determines whether to use temporary files to store the intermediate representations of
        the corpus to train the next layer. Setting flag True will not greatly affect memory usage, but will temporarily
        require a significant amount of disk space. Using temp files will strongly speed up training, especially as the
        number of layers increases.
        """
        ln.info("Training mSDA with %s layers. ")
        if not use_temp_files:
            ln.warn("Training without temporary files. May take a long time!")
            self.reduction_layer.train(corpus, chunksize=chunksize)
            current_representation = self.reduction_layer[corpus]
            for layer_num, layer in enumerate(self.mda_layers):

                # We feed the corpus through all intermediate layers to get the current representation;
                # that representation is then used to train the next layer.
                # This keeps memory usage independent of the corpus size, but will probably be very slow.

                ln.info("Training layer %s.", layer_num)
                layer.train(current_representation, chunksize=chunksize)
                if layer_num < len(self.mda_layers) - 1:
                    current_representation = layer[current_representation]

        else:
            ln.info("Using temporary files to speed up training.")

            ln.info("Beginning training on %s layers." %
                    (len(self.mda_layers) + 1))
            self.reduction_layer.train(corpus, chunksize=chunksize)

            # serializing intermediate representation
            MmCorpus.serialize(".msda_intermediate.mm",
                               self.reduction_layer[corpus],
                               progress_cnt=chunksize)

            # load corpus to train next layer
            current_representation = MmCorpus(".msda_intermediate.mm")

            for layer_num, layer in enumerate(self.mda_layers):
                layer.train(current_representation, chunksize=chunksize)
                os.remove(".msda_intermediate.mm")
                os.remove(".msda_intermediate.mm.index")

                if layer_num < len(self.mda_layers) - 1:
                    MmCorpus.serialize(".msda_intermediate.mm",
                                       layer[current_representation],
                                       progress_cnt=chunksize)
                    current_representation = MmCorpus(".msda_intermediate.mm")

        ln.info("mSDA finished training.")
Example no. 2
def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
    """\

    """
    wordid_filename = os.path.join(out_dir, 'cables_wordids.pickle')
    bow_filename = os.path.join(out_dir, 'cables_bow.mm')
    tfidf_filename = os.path.join(out_dir, 'cables_tfidf.mm')
    predicate = None  # Could be set to something like pred.origin_filter(pred.origin_germany)
    # 1. Create word dict
    dct = Dictionary()
    dct_handler = DictionaryHandler(dct)
    handler = create_filter(dct_handler)
    handle_source(src, handler, predicate)
    dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    dct.save(wordid_filename)
    # 2. Reiterate through the cables and create the vector space
    corpus_handler = CorpusHandler(out_dir, dct=dct, allow_dict_updates=False)
    handler = create_filter(corpus_handler)
    handle_source(src, handler, predicate)
    # 3. Load corpus
    mm = MmCorpus(bow_filename)
    # 4. Create TF-IDF model
    tfidf = TfidfModel(mm, id2word=dct, normalize=True)
    # 5. Save the TF-IDF-weighted corpus
    MmCorpus.serialize(tfidf_filename, tfidf[mm], progress_cnt=10000)
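For reference, the dictionary → bag-of-words → TF-IDF pipeline that create_corpus drives through its handlers can be written with plain gensim calls. A minimal sketch with illustrative token lists:

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel

tokenized_docs = [["cable", "embassy", "report"], ["cable", "trade", "report"]]  # illustrative only
dct = Dictionary(tokenized_docs)
bow = [dct.doc2bow(doc) for doc in tokenized_docs]
MmCorpus.serialize("cables_bow.mm", bow)               # step 2: vector space on disk

mm = MmCorpus("cables_bow.mm")                         # step 3: stream it back
tfidf = TfidfModel(mm, id2word=dct, normalize=True)    # step 4: TF-IDF model
MmCorpus.serialize("cables_tfidf.mm", tfidf[mm])       # step 5: TF-IDF-weighted corpus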
Example no. 3
def load_model(output_path):
    """
    Load working model

    Loads working model, BoW corpus and initial dataframe with tokens

    Outputs:
    Writes a .txt file of top 7 words per topic for subsequent inspection.
    """

    # load data
    combined_df = pd.read_csv(pkg_resources.resource_filename(
        resource_package, "data/data_processed.csv"),
                              index_col=0)

    corpus = MmCorpus(
        pkg_resources.resource_filename(resource_package,
                                        "data/BoW_corpus.mm"))
    print('Data loaded.')
    # load the mallet model
    ldamallet = gensim.models.wrappers.LdaMallet.load(
        pkg_resources.resource_filename(
            resource_package, 'model/working_ldamallet_model.gensim'))
    print('Model loaded.')
    # write out topics to a text file
    topics = ldamallet.print_topics(num_topics=-1, num_words=7)

    with open(os.path.join(output_path, 'LDA_topics.txt'), 'w') as topic_file:
        for topic in topics:
            topic_file.write(str(topic) + '\n')
    print('Topics written to data folder.')

    return (corpus, ldamallet, combined_df)
Example no. 4
 def setUp(self):
     self.corpus = MmCorpus(datapath('testcorpus.mm'))
     self.class_ = ldamodel.LdaModel
     self.model = self.class_(common_corpus,
                              id2word=common_dictionary,
                              num_topics=2,
                              passes=100)
Example no. 5
def load_models(args):
    """
        Load tfidf model, corpus, and dictionary if specified in arguments.

        input:
            args (arparse object): input arguments

        return loaded tfidf object, corpus, and dictionary if specified
    """
    try:
        tfidf = models.TfidfModel.load(os.path.join(args.tfidf_model_dir_path, "model"))
        corpus = MmCorpus(os.path.join(args.tfidf_model_dir_path, "corpus"))
        mydict = corpora.Dictionary.load(os.path.join(args.tfidf_model_dir_path, "dictionary"))
    except FileNotFoundError:
        print(timestamp(), "Tf-idf model directory path must contain model, corpus, and dictionary.", file=sys.stderr)
        sys.exit(1)
    return tfidf, corpus, mydict
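load_models expects three files named model, corpus and dictionary inside args.tfidf_model_dir_path. A hedged sketch of the matching save step; the helper name is illustrative:

import os
from gensim.corpora import MmCorpus

def save_models(out_dir, tfidf, bow_corpus, mydict):
    # counterpart to load_models(): writes the three files it looks for
    tfidf.save(os.path.join(out_dir, "model"))
    MmCorpus.serialize(os.path.join(out_dir, "corpus"), bow_corpus)
    mydict.save(os.path.join(out_dir, "dictionary"))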
Example no. 6
def corpus_tfidf():
    path = "" 
    corpus = MmCorpus(path + "corpus.mm")
    id2word = Dictionary.load(path + 'corpus.mm.dict')
    
    # TF-IDF the corpus
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    
    tfidf.save("5_topics_tfidf_only.model")
    
    # models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=id2word, passes=2, workers=4) would be a stronger model
    lda_model_tfidf = models.LdaModel(corpus_tfidf, num_topics=5, id2word=id2word)
    print('\nPerplexity: ', lda_model_tfidf.log_perplexity(corpus))  # a measure of how good the model is. lower the better.
    
    for idx, topic in lda_model_tfidf.print_topics(-1):
        print('Topic: {} Word: {}'.format(idx, topic))
        
    lda_model_tfidf.save(path + "5_topics_test.model")
    # note: gensim's LdaModel has no `wv` attribute, so the original call below would raise AttributeError
    # lda_model_tfidf.wv.save(path + "5_topics_test_kv.model")
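A short sketch of reloading the artifacts saved above, using the same file names as in the function:

from gensim import models

tfidf_loaded = models.TfidfModel.load("5_topics_tfidf_only.model")
lda_loaded = models.LdaModel.load("5_topics_test.model")
for idx, topic in lda_loaded.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))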
Example no. 7
def exp_variable_selection(dict_name, corpus_name, N=2, n_noise_term=10, n_epoches=20, \
                           learning_rate=.001, batch_size=30, n_hidden=50):
    """
    Main function for selecting variables and calculating embeddings for selected
    embeddings vectors by using vanilla RBM.
    """
    # load an existing dictionary (or create a new dictionary from scratch)
    # code for creating a new dictionary ...
    ngram_dict = corpora.Dictionary.load(dict_name)
    # select the key ids of some random ngram terms from the loaded dictionary to serve as dictionary noise
    random_terms = list(set(ngram_dict.keys()) - set(PRESERV_TERMS))
    noise_terms = random.sample(random_terms, n_noise_term)
    print("[%s] [Var Select] %d noise terms has been added: %s" % \
         (arrow.now(), len(noise_terms), [ngram_dict[key] for key in noise_terms]), file=sys.stderr)

    # # shrink dictionary to a subset in accordance with PRESERV_TERMS
    # sub_ngram_dict = sub_dictionary(ngram_dict, PRESERV_TERMS, by_key=True)

    # load existing corpus
    corpus = MmCorpus(corpus_name)
    dense_corpus = corpus2dense(corpus, num_terms=len(ngram_dict)).transpose()
    print("[%s] [Var Select] raw corpus has been loaded with size (%d, %d)" % \
         (arrow.now(), dense_corpus.shape[0], dense_corpus.shape[1]), file=sys.stderr)
    # slice the corpus by PRESERV_TERMS (plus the noise terms) and PRESERV_DOCS
    # (remove columns which are not included in PRESERV_TERMS)
    # note: the two index arrays cannot be broadcast together in a single step,
    # e.g. dense_corpus[PRESERV_DOCS, PRESERV_TERMS]
    corpus_slice = dense_corpus[:, PRESERV_TERMS + noise_terms]
    corpus_slice = corpus_slice[PRESERV_DOCS, :]
    print("[%s] [Var Select] corpus has been sliced with size (%d, %d)" % \
         (arrow.now(), corpus_slice.shape[0], corpus_slice.shape[1]), file=sys.stderr)
    # mat2img(np.log(corpus_slice))

    rbm = GBRBM(n_visible=corpus_slice.shape[1], n_hidden=n_hidden, \
                learning_rate=learning_rate, momentum=0.95, err_function='mse', \
                use_tqdm=False, sample_visible=False, sigma=1.)
    rbm.fit(corpus_slice, n_epoches=n_epoches, batch_size=batch_size, \
            shuffle=True, verbose=True)
    embeddings = rbm.transform(corpus_slice).round().astype(int)
    # w, vbias, hbias = rbm.get_weights()
    # mat2img(w)
    return corpus_slice, embeddings
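Note on the dense conversion used above: gensim's corpus2dense returns a (num_terms, num_docs) matrix, hence the transpose before slicing columns. A small self-contained sketch:

from gensim.matutils import corpus2dense

bow = [[(0, 1.0), (2, 3.0)], [(1, 2.0)]]             # two BoW documents over a 3-term vocabulary
dense = corpus2dense(bow, num_terms=3).transpose()   # shape (n_docs, n_terms) = (2, 3)
keep_cols = [0, 2]                                   # e.g. PRESERV_TERMS + noise_terms
corpus_slice = dense[:, keep_cols]                   # keep only the selected term columns
print(corpus_slice.shape)                            # (2, 2)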
Example no. 8
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 10 18:21:00 2019

@author: evefl
"""

from pprint import pprint
from gensim.corpora.mmcorpus import MmCorpus
from gensim.corpora.dictionary import Dictionary
from gensim import models

# set path to wherever you download the files
path = '/data/'

corpus = MmCorpus("corpus.mm")  #MmCorpus('%scorpus.mm' % path) # BOW
id2word = Dictionary.load('corpus.mm.dict')  #'%scorpus.mm.dict' % path)

for doc in corpus[:1]:
    for word in doc[:2000]:
        print(word)
        print(id2word[word[0]])

# TF-IDF the corpus

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
#
#for doc in corpus_tfidf: # preview tfidf scores for first document
#    pprint(doc)
#    break
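To make the commented-out preview more informative, the first document's TF-IDF weights can be mapped back to words through the dictionary (same `corpus_tfidf` and `id2word` as above):

first_doc = next(iter(corpus_tfidf))
for word_id, weight in sorted(first_doc, key=lambda x: -x[1])[:10]:
    print(id2word[word_id], round(weight, 3))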
Example no. 9
            for topic_id in range(num_topics):
                # calculate p(w): p(w) = sum_z(p(z) * p(w|z))
                prob_topic = doc_topics_ist[i][topic_id][1]
                prob_topic_word = topic_word_list[topic_id][word]
                prob_word += prob_topic * prob_topic_word
            prob_doc += math.log(prob_word)  # p(d) = sum(log(p(w)))
        prob_doc_sum += prob_doc
        testset_word_num += doc_word_num
    prep = math.exp(-prob_doc_sum /
                    testset_word_num)  # perplexity = exp(-sum(log(p(d))) / sum(Nd))
    #print("the perplexity of this ldamodel is : %s" % prep)
    return prep
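As a cross-check on the hand-rolled perplexity above, gensim's LdaModel exposes log_perplexity, which returns a per-word likelihood bound; gensim reports the corresponding perplexity as 2**(-bound). A sketch assuming a trained `lda` and a held-out list of BoW documents:

# `lda` is a trained gensim LdaModel, `testset` a list of BoW documents
bound = lda.log_perplexity(testset)   # per-word likelihood bound on the held-out chunk
print("held-out perplexity:", 2 ** (-bound))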


topicnum_perplexity = []
corpus = MmCorpus('./ths_corpora.mm')
testset = []
import random
for i in random.sample(range(corpus.num_docs), corpus.num_docs // 100):
    testset.append(corpus[i])

for topic_num in range(20, 60, 3):
    # lda = models.LdaModel(dict_corpora, num_topics=topic_num, id2word=dict_1, iterations=1000)
    # prep = perplexity(lda, testset, dict_1, len(dict_1.keys()), topic_num)
    # print(topic_num, "success!!!!!!!!!!!!!!!!!!!", prep)
    # topicnum_perplexity.append([topic_num, prep])

    lda_tfidf = models.LdaModel(corpus_tfidf,
                                num_topics=topic_num,
                                id2word=dict_1,
                                iterations=1000)
Example no. 10
 def load(id):
     tokenizer = Dictionary.load(BM25Engine.tokenizer_file.format(id=id))
     corpus = MmCorpus(BM25Engine.corpus_file.format(id=id))
     with open(BM25Engine.doc_idxs_file.format(id=id), "r") as f:
         idxs2id = json.load(f)
     return BM25Engine(tokenizer, corpus, idxs2id)
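A hedged sketch of the save step this loader implies, reusing the same class-level path templates; the attribute names on the engine instance are assumptions based on the constructor arguments:

import json
from gensim.corpora import MmCorpus

def save_engine(engine, id):
    # counterpart to BM25Engine.load(): persist tokenizer, corpus and doc-index map
    engine.tokenizer.save(BM25Engine.tokenizer_file.format(id=id))
    MmCorpus.serialize(BM25Engine.corpus_file.format(id=id), engine.corpus)
    with open(BM25Engine.doc_idxs_file.format(id=id), "w") as f:
        json.dump(engine.idxs2id, f)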
Example no. 11
 def setUp(self):
     self.corpus = MmCorpus(datapath('testcorpus.mm'))
Example no. 12
 def setUp(self):
     self.corpus_small = MmCorpus(datapath('test_corpus_small.mm'))
     self.corpus_ok = MmCorpus(datapath('test_corpus_ok.mm'))
     self.corpus_empty = []
Example no. 13
corpusname = "brown"
corpus = [
    preprocessor.preprocess(" ".join(text), return_bow=True)
    for text in brown.sents()
]
preprocessor.dictionary.filter_extremes(15, 0.1, 30000)
corpus = [
    preprocessor.preprocess(" ".join(text),
                            allow_update=False,
                            return_bow=True) for text in brown.sents()
]

ln.debug("saving/loading corpus")
MmCorpus.serialize("test.mm", corpus)
corpus = MmCorpus("test.mm")

dimensions = 2000
params = [{"num_layers": 5, "noise": 0.7}, {"num_layers": 3, "noise": 0.3}][0]

ln.info("training mSDA with %s dimensions. params: %s" % (dimensions, params))
model = mSDAWrapper.train(corpus, dimensions, dictionary, params)

paramstring = "_".join(["%s-%s" % (k, v) for k, v in params.items()])
savestring = "mSDA_%s_%s_" % (corpusname, paramstring)
model.save(savestring)
msda_wrapper = mSDAWrapper(savestring, preprocessor)


def get_synonyms(word):
    return [synset.lemma_names() for synset in wordnet.synsets(word)]  # nltk synsets expose lemma_names(), not .synonyms
Example no. 14
def search(request):

    if request.method == 'POST':
        global catch
        catch = request.POST['title']
        data = [catch]

        stop_words = stopwords.words('indonesian')
        stop_words2 = stopwords.words('english')
        stop_words.extend(stop_words2)
        stop_words.extend([
            'of', 'in', 'and', 'the', 'for', 'on', 'using', 'based', 'from',
            'with', 'to', 'by', 'as', 'an', 'pengaruh', 'effect', 'analisis',
            'at', 'pre', 'pro', 'analysis', 'berbasis', 'tahun', 'between',
            'kualitas', 'method', 'metode', 'through', 'menggunakan', 'hasil'
        ])

        # Remove numbers and non-letter characters
        data = [re.sub(r" \d+", ' ', sent) for sent in data]
        data = [re.sub(r'[^a-zA-Z]', ' ', sent) for sent in data]

        # Remove new line characters
        data = [re.sub(r'\s+', ' ', sent) for sent in data]

        # Remove distracting single quotes
        data = [re.sub("'", "", sent) for sent in data]

        def sent_to_words(sentences):
            for sentence in sentences:
                yield (gensim.utils.simple_preprocess(str(sentence),
                                                      deacc=True)
                       )  # deacc=True removes punctuations

        coba = sent_to_words(data)
        data_words = list(coba)

        # Build the bigram and trigram models
        bigram = gensim.models.Phrases(
            data_words, min_count=5,
            threshold=100)  # higher threshold fewer phrases.
        trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

        # Faster way to get a sentence clubbed as a trigram/bigram
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        trigram_mod = gensim.models.phrases.Phraser(trigram)

        # Define functions for stopwords, bigrams, trigrams and lemmatization
        # from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
        def remove_stopwords(texts):
            return [[
                word for word in simple_preprocess(str(doc))
                if word not in stop_words  # stop_words already includes stop_words2
            ] for doc in texts]

        def make_bigrams(texts):
            return [bigram_mod[doc] for doc in texts]

        def make_trigrams(texts):
            return [trigram_mod[bigram_mod[doc]] for doc in texts]

        def lemmatization(texts):
            """https://spacy.io/api/annotation"""
            texts_out = []
            for sent in texts:
                doc = nlp(" ".join(sent))
                texts_out.append([token.lemma_ for token in doc])
            return texts_out

        # Remove Stop Words
        data_words_nostops = remove_stopwords(data_words)

        # # Form Bigrams
        data_words_bigrams = make_bigrams(data_words_nostops)

        nlp = spacy.load('en_core_web_sm')

        data_lemmatized = lemmatization(data_words_bigrams)

        # stem each lemmatized token (Sastrawi Indonesian stemmer)
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        for x in range(len(data_lemmatized)):
            for y in range(len(data_lemmatized[x])):
                data_lemmatized[x][y] = stemmer.stem(data_lemmatized[x][y])

        id2wordd = corpora.Dictionary(data_lemmatized)
        # Create Corpus
        texts = data_lemmatized
        # Term Document Frequency
        corpuss = [id2wordd.doc2bow(text) for text in texts]

        id2word = Dictionary.load('papers/id2word_new.dict')
        corpus = MmCorpus('papers/corpus_new.mm')

        # import gensim
        model = gensim.models.ldamodel.LdaModel.load(
            'papers/mallet_18_lda.mdl', mmap='r')
        new_doc_bow = id2word.doc2bow(data_lemmatized[0])
        hasil = model.get_document_topics(new_doc_bow)

        topic = 0
        nilai = -99
        for i, row in (hasil):
            if (row > nilai):
                topic = i
                nilai = row

        keywords = []
        for i, nilai in model.show_topic(topic):
            keywords.append(i)

        # topics = Topics.objects.filter(id_topic=topic).values_list('id_publication', flat=True)

        #load data
        df = pd.read_csv('papers/label18baru.csv')
        with open("papers/lemma_new.txt", "rb") as fp:  #Pickling
            data_lemmatizedd = pickle.load(fp)

        # initialize containers for the results
        hasil_cosine_keseluruhan = []
        hasil_cosine = []

        # select the rows that belong to the predicted topic
        # topic=df
        topik = df.loc[df['Topic1'] == topic]

        ## build the lemma data, corpus and dictionary from the documents within this topic
        res_list = [data_lemmatizedd[i] for i in topik.index]
        # Create Dictionary
        id2worddd = corpora.Dictionary(res_list)

        # Create Corpus
        texts = res_list

        # Term Document Frequency
        corpusss = [id2worddd.doc2bow(text) for text in res_list]

        # compute the cosine similarity of the query title against all titles in the topic

        index_tmpfile = get_tmpfile("index")
        index = Similarity(index_tmpfile,
                           corpusss,
                           num_features=len(id2worddd))

        index = MatrixSimilarity(corpusss, num_features=len(id2worddd))
        sims = index[corpuss]

        sort_index = np.argsort(sims[0])

        reversed_arr = sort_index[::-1]

        hasil = pd.DataFrame(reversed_arr)

        hasilbaru = hasil.iloc[:40, :]

        hasilmantep = hasilbaru.to_numpy()

        idfix = []
        for i in range(0, 40):
            idfix.append(hasilmantep[i][0])

        ngetest = topik.to_numpy()

        id_artikel = []
        for i in idfix:
            id_artikel.append(ngetest[i][9])

        global user_list
        user_list = Papers.objects.filter(
            id_pub__in=id_artikel).order_by('id_pub')

        topic_dict = {
            '0': 'Kimia',
            '1': 'Industri',
            '2': 'Biologi-Tumbuhan',
            '3': 'Biologi-Pangan',
            '4': 'Mikrobiologi',
            '5': 'Studi-Penemuan',
            '6': 'Sosial-Masyarakat-Sejarah',
            '7': 'Habitat Makhluk Hidup',
            '8': 'Elektro-Mesin',
            '9': 'Pendidikan',
            '10': 'Sosial-Pengaruh',
            '11': 'Pertanian',
            '12': 'Data-Citra-Statistik',
            '13': 'Jawa-Indonesia',
            '14': 'Masyarakat',
            '15': 'Biokimia',
            '16': 'Kesehatan',
            '17': 'Kesehatan 2',
        }

        global hasiltopik
        hasiltopik = topic_dict.get(str(topic))

        page = request.GET.get('page', 1)
        paginator = Paginator(user_list, 10)

        try:
            users = paginator.page(page)
        except PageNotAnInteger:
            users = paginator.page(1)
        except EmptyPage:
            users = paginator.page(paginator.num_pages)

        context = {
            'title': 'Halaman Utama',
            'topic': hasiltopik,
            'catch': catch,
            'users': users,
        }

        return render(request, 'papers/index.html', context)

    else:
        page = request.GET.get('page', 1)
        paginator = Paginator(user_list, 10)

        try:
            users = paginator.page(page)
        except PageNotAnInteger:
            users = paginator.page(1)
        except EmptyPage:
            users = paginator.page(paginator.num_pages)

        context = {
            'title': 'Halaman Utama',
            'topic': hasiltopik,
            'catch': catch,
            'users': users,
        }

        return render(request, 'papers/index.html', context)
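The dominant-topic selection and keyword extraction in the view above can be written more compactly. A sketch assuming `model` is the loaded LdaModel and `new_doc_bow` the query's BoW vector, as in the code:

doc_topics = model.get_document_topics(new_doc_bow)
topic, nilai = max(doc_topics, key=lambda pair: pair[1])      # most probable topic
keywords = [word for word, _ in model.show_topic(topic)]      # its top words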
Example no. 15
def rekomendasi(input):
    data = [input]
    id2word = Dictionary.load('pdupt_website/id2word_new.dict')
    corpus = MmCorpus('pdupt_website/corpus_new.mm')
    df = pd.read_csv('pdupt_website/reduksifix.csv')
    with open("pdupt_website/lemma_new.txt", "rb") as fp:  #Pickling
        data_lemmatized = pickle.load(fp)
    stop_words = stopwords.words('indonesian')
    stop_words2 = stopwords.words('english')
    stop_words.extend(stop_words2)
    stop_words.extend([
        'of', 'in', 'and', 'the', 'for', 'on', 'using', 'based', 'from',
        'with', 'to', 'by', 'as', 'an', 'pengaruh', 'effect', 'analisis', 'at',
        'pre', 'pro', 'analysis', 'berbasis', 'tahun', 'between', 'kualitas',
        'method', 'metode', 'through', 'menggunakan', 'hasil'
    ])
    # Remove numbers and non-letter characters
    data = [re.sub(r" \d+", ' ', sent) for sent in data]
    data = [re.sub(r'[^a-zA-Z]', ' ', sent) for sent in data]
    # Remove new line characters
    data = [re.sub(r'\s+', ' ', sent) for sent in data]

    # Remove distracting single quotes
    data = [re.sub("'", "", sent) for sent in data]

    def sent_to_words(sentences):
        for sentence in sentences:
            yield (gensim.utils.simple_preprocess(str(sentence), deacc=True)
                   )  # deacc=True removes punctuations

    data = sent_to_words(data)
    data_words = list(data)
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(
        data_words, min_count=5,
        threshold=100)  # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # Define functions for stopwords, bigrams, trigrams and lemmatization
    # from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    def remove_stopwords(texts):
        return [[
            word for word in simple_preprocess(str(doc))
            if word not in stop_words  # stop_words already includes stop_words2
        ] for doc in texts]

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    def lemmatization(texts):
        """https://spacy.io/api/annotation"""
        texts_out = []
        for sent in texts:
            doc = nlp(" ".join(sent))
            texts_out.append([token.lemma_ for token in doc])
        return texts_out

    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)

    # # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops)

    nlp = spacy.load('en_core_web_sm')

    data_lemmatized_search = lemmatization(data_words_bigrams)

    # stem each lemmatized token (Sastrawi Indonesian stemmer)
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    for x in range(len(data_lemmatized_search)):
        for y in range(len(data_lemmatized_search[x])):
            data_lemmatized_search[x][y] = stemmer.stem(
                data_lemmatized_search[x][y])

    # load the trained LDA model
    model = gensim.models.ldamodel.LdaModel.load(
        'pdupt_website/mallet_18_lda.mdl', mmap='r')
    new_doc_bow = id2word.doc2bow(data_lemmatized_search[0])
    hasil = model.get_document_topics(new_doc_bow)

    topic = 0
    nilai = -99
    for i, row in (hasil):
        if (row > nilai):
            topic = i
            nilai = row

    df_topik = df.loc[df['Topic1'] == topic]
    df_topik = df_topik.astype({"id_judul": int})
    df_topik = df_topik.reset_index(drop=True)

    ## build the lemma data, corpus and dictionary from the documents within this topic
    res_list = [data_lemmatized[int(i) - 1] for i in df_topik.id_judul]
    # Create Dictionary
    id2word_topik = corpora.Dictionary(res_list)

    # Create Corpus
    texts = res_list

    # Term Document Frequency
    corpus_topik = [id2word_topik.doc2bow(text) for text in res_list]

    # build the index for the cosine-similarity computation
    index_tmpfile = get_tmpfile("index")
    index = Similarity(index_tmpfile,
                       corpus_topik,
                       num_features=len(id2word_topik))

    # the query is the BoW of the search lemmas, built with the per-topic dictionary
    query = id2word_topik.doc2bow(data_lemmatized_search[0])
    similarities = index[query]

    sort_index = np.argsort(similarities)

    reversed_arr = sort_index[::-1]

    list_idx = reversed_arr[:10]

    list_id_artikel = list(df_topik[df_topik.index.isin(list_idx)].id_judul)

    return (list_id_artikel, topic + 1)
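The core retrieval step in rekomendasi() is a gensim similarity index queried with a bag-of-words vector. A condensed, self-contained sketch of that pattern (the token lists are illustrative):

from gensim.corpora import Dictionary
from gensim.similarities import MatrixSimilarity

docs = [["padi", "pupuk", "hasil"], ["citra", "statistik", "data"]]   # illustrative tokens
dct = Dictionary(docs)
bow = [dct.doc2bow(d) for d in docs]

index = MatrixSimilarity(bow, num_features=len(dct))
query = dct.doc2bow(["pupuk", "padi"])
sims = index[query]                        # cosine similarity against every document
ranked = sims.argsort()[::-1]              # best matches first
print(ranked, sims[ranked])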
Example no. 16
        self.titles = []
        for title, tokens in itertools.islice(iter_wiki(self.dump_file), self.clip_docs):
            self.titles.append(title)
            yield self.dictionary.doc2bow(tokens)
    
    def __len__(self):
        return self.clip_docs

# create a stream of bag-of-words vectors
wiki_corpus = WikiCorpus(fileLocation+'enwiki-latest-pages-articles.xml.bz2', id2word_wiki)
vector = next(iter(wiki_corpus))
print(vector)  # print the first vector in the stream

MmCorpus.serialize(fileLocation+'wikiModels/wiki_bow.mm', wiki_corpus)

mm_corpus = MmCorpus(fileLocation+'wikiModels/wiki_bow.mm')
print(mm_corpus)

clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, 4000)
lda_model = gensim.models.LdaModel(clipped_corpus, num_topics=10, id2word=id2word_wiki, passes=4)

# store all trained models to disk
lda_model.save(fileLocation+'wikiModels/lda_wiki.model')
#lsi_model.save('./data/lsi_wiki.model')
#tfidf_model.save('./data/tfidf_wiki.model')
id2word_wiki.save(fileLocation+'wikiModels/wiki.dictionary')

loaded_lda_model = gensim.models.LdaModel.load(fileLocation+'wikiModels/lda_wiki.model')

# select the top 50 words for each LDA topic
top_words = [[word for _, word in loaded_lda_model.show_topic(topicno, topn=50)] for topicno in range(lda_model.num_topics)]
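With the saved model and dictionary reloaded, an unseen document can be mapped into the same topic space. A short sketch (the sample text is illustrative):

loaded_dict = gensim.corpora.Dictionary.load(fileLocation + 'wikiModels/wiki.dictionary')
bow = loaded_dict.doc2bow("the history of machine learning".lower().split())
print(loaded_lda_model[bow])   # list of (topic_id, probability) pairs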
Example no. 17
File: lsi.py Project: Sasafrass/IR1
    def __init__(self, docs, topic_number=500):
        # Create a dictionary representation of the documents.
        print('training LSI models with topic number = ' + str(topic_number))
        if (not os.path.isfile('./lsi/lsi_dict.dict')):
            print('creating dict')
            dictionary = Dictionary(docs)
            dictionary.save('./lsi/lsi_dict.dict')
        else:
            print('dict already exists')
            dictionary = Dictionary.load("./lsi/lsi_dict.dict")
        self.dictionary = dictionary

        # Create corpora
        if (not os.path.isfile('./lsi/lsi_corpus.mm')):
            # Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
            print('creating bow corpus')
            dictionary.filter_extremes(no_below=20, no_above=0.5)
            corpus = [dictionary.doc2bow(doc) for doc in docs]
            MmCorpus.serialize("lsi/lsi_corpus.mm", corpus)
        else:
            print('bow corpus already exists')
            corpus = MmCorpus("./lsi/lsi_corpus.mm")

        self.tfidf = models.TfidfModel(corpus)
        if (not os.path.isfile('./lsi/lsi_tf_corpus.mm')):
            print('creating tf corpus')
            tf_corp = self.tfidf[corpus]
            MmCorpus.serialize("lsi/lsi_tf_corpus.mm", tf_corp)
        else:
            print('tf corpus already exists')
            tf_corp = MmCorpus("./lsi/lsi_tf_corpus.mm")

        # Make a index to word dictionary.
        temp = dictionary[0]  # This is only to "load" the dictionary.
        id2word = dictionary.id2token

        #Create the models and vectors
        if (not os.path.isfile('./lsi/lsi_bow_model' + str(topic_number) +
                               '.model')):
            print('creating bow model')
            bow_model = models.LsiModel(corpus=corpus,
                                        num_topics=topic_number,
                                        id2word=id2word)
            bow_model.save('lsi/lsi_bow_model' + str(topic_number) + '.model')
        else:
            print('bow model already exists')
            bow_model = models.LsiModel.load('./lsi/lsi_bow_model' +
                                             str(topic_number) + '.model')
        bow_vector = bow_model[corpus]
        self.bow_model = bow_model

        if (not os.path.isfile('./lsi/lsi_tf_model' + str(topic_number) +
                               '.model')):
            print('creating tfidf model')
            tf_model = models.LsiModel(corpus=tf_corp,
                                       num_topics=topic_number,
                                       id2word=id2word)
            tf_model.save('./lsi/lsi_tf_model' + str(topic_number) + '.model')
        else:
            print('tfidf model already exists')
            tf_model = models.LsiModel.load('./lsi/lsi_tf_model' +
                                            str(topic_number) + '.model')
        tf_vector = tf_model[tf_corp]
        self.tf_model = tf_model

        #Create indices
        if (not os.path.isfile('./lsi/lsi_bow_model' + str(topic_number) +
                               '.index')):
            print('creating bow index')
            bow_index = similarities.MatrixSimilarity(
                bow_vector)  # index corpus in bow LSI space
            bow_index.save('lsi/lsi_bow_model' + str(topic_number) + '.index')
        else:
            print('bow index already exists')
            bow_index = similarities.MatrixSimilarity.load(
                './lsi/lsi_bow_model' + str(topic_number) + '.index')
        self.bow_index = bow_index

        if (not os.path.isfile('./lsi/lsi_tf_model' + str(topic_number) +
                               '.index')):
            print('creating tf index')
            tf_index = similarities.MatrixSimilarity(
                tf_vector)  # index corpus in tf LSI space
            tf_index.save('lsi/lsi_tf_model' + str(topic_number) + '.index')
        else:
            print('tf index already exists')
            tf_index = similarities.MatrixSimilarity.load(
                './lsi/lsi_tf_model' + str(topic_number) + '.index')
        self.tf_index = tf_index
        print('model created!')
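A hedged sketch of how the stored indices might be queried, using only attributes set in __init__; the method name and return format are illustrative:

    def query_tfidf(self, tokens, top_k=10):
        """Rank the indexed documents against a tokenized query in TF-IDF LSI space."""
        bow = self.dictionary.doc2bow(tokens)
        lsi_vec = self.tf_model[self.tfidf[bow]]          # fold the query into LSI space
        sims = self.tf_index[lsi_vec]                     # cosine similarity to every document
        return sorted(enumerate(sims), key=lambda x: -x[1])[:top_k]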
Example no. 18
    # optional argv[3] = keep_words
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        #sys.exit(1)
    input, output = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE

    # build dictionary. only keep 100k most frequent words (out of total ~900k unique tokens)
    enron = EnronCorpus(input, keep_words=keep_words)

    # save dictionary and bag-of-words (term-document frequency matrix)
    enron.dictionary.save_as_text(output + '_wordids.txt')
    MmCorpus.serialize(output + '_bow.mm', enron, progress_cnt=10000)
    del enron

    # initialize corpus reader and word->id mapping
    id2token = Dictionary.load_from_text(output + '_wordids.txt')
    mm = MmCorpus(output + '_bow.mm')

    # build tfidf
    from gensim.models import TfidfModel
    tfidf = TfidfModel(mm, id2word=id2token, normalize=True)

    # save tfidf vectors in matrix market format
    MmCorpus.serialize(output + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Example no. 19
 def setUp(self):
     self.corpus = MmCorpus(datapath('testcorpus.mm'))
     self.model = lsimodel.LsiModel(self.corpus, num_topics=2)
Example no. 20
        mean_jaccard.append(np.mean(jacc_np))
        mean_bleu.append(np.mean(bleu_np))
        mean_cos.append(np.mean(cos_np))
        mean_fscore.append(np.mean(fscore_np))
    return np.max(np.asarray(mean_bleu)), np.max(
        np.asarray(mean_jaccard)), np.max(np.asarray(mean_cos)), np.max(
            np.asarray(mean_fscore))


GH_IDs, SO_IDs, GH_annotation_intersect, GH_annotation_union, SO_annotation_intersect, SO_annotation_union = load_annotations(
)
path = "/home/norberteke/PycharmProjects/Thesis/data/"

dictionary = Dictionary.load(path + 'GH_full_processed_Dictionary.dict')
corpus = MmCorpus(datapath(path + 'corpus_processed_GH_full.mm'))

texts = []
with open(path + 'GH_full_processed_corpus.csv', 'r') as f:
    reader = csv.reader(f)
    texts = list(reader)

terms = []
for (key, value) in dictionary.iteritems():
    terms.append(value)


def write_results_to_file(path, lda_model, max_bleu, max_jaccard, max_cos,
                          max_fscore):
    with open(path, 'a') as f:
        writer = csv.writer(f,
Example no. 21
async def load_corpus():
    if 'corpus' not in model:
        model['corpus'] = MmCorpus(await tasks['corpus'])
    return model['corpus']