Example #1
def store_contents(data_path,
                   save_path,
                   datasource,
                   processOnlyFilesinOriginalQrels,
                   num_workers=None):
    """Preprocess and store a corpus of documents in sqlite.

    Args:
        data_path: Root path to directory (or directory of directories) of files
          containing json encoded documents (must have `id` and `text` fields).
        save_path: Path to output sqlite db.
        datasource: Name of the dataset; used to look up qrels, topic ranges,
          and output file names.
        processOnlyFilesinOriginalQrels: If True, only process documents whose
          ids appear in the original qrels; otherwise process every file under
          `data_path`.
        num_workers: Number of parallel processes to use when reading docs.
    """
    if os.path.isfile(save_path):
        raise RuntimeError('%s already exists! Not overwriting.' % save_path)

    print(save_path)
    print(data_path)
    docIds = []  # list of TREC DocID
    docIdToDocIndex = {}  # key is DocID, value is docIndex
    docIndex = 0

    workers = ProcessPool(num_workers)
    files = []
    if processOnlyFilesinOriginalQrels:
        topicData = TRECTopics(datasource, start_topic[datasource],
                               end_topic[datasource])
        qrelDocList = topicData.qrelDocIdLister(
            qrelAddress[datasource], save_path,
            topic_original_qrels_doc_list_file_name)
        files = []
        for docId in qrelDocList:
            fileid = docId + '.txt'
            files.append(os.path.join(data_path, fileid))
        #files = [f for f in iter_files(data_path) if os.path.splitext(os.path.basename(f))[0] in qrelDocList]
        print "Number of unique documents in the qrels", len(files)

    else:
        files = [f for f in iter_files(data_path)]

    dictionary = Dictionary()
    count = 0

    with tqdm(total=len(files)) as pbar:
        for pairs in workers.imap_unordered(get_contents, files):
            count += len(pairs)
            dictionary.add_documents([
                pairs[0][1].split()
            ])  # pairs[0][0]-->docId, pairs[0][1]-->documentContent
            docIdToDocIndex[pairs[0][0]] = docIndex
            docIds.append(pairs[0][0])
            docIndex = docIndex + 1
            pbar.update()

    print("Number of documents:", docIndex, len(docIds), len(docIdToDocIndex))
    total_documents = len(docIds)
    metadata = {}
    metadata['docIdToDocIndex'] = docIdToDocIndex
    metadata['docIndexToDocId'] = docIds
    # protocol 2 for version compatibility
    pickle.dump(metadata,
                open(save_path + meta_data_file_name[datasource], 'wb'),
                protocol=2)

    # keep only words that appear in at least 20 documents,
    # and keep only the `dictionary_features_number` most frequent tokens
    dictionary.filter_extremes(no_below=20, keep_n=dictionary_features_number)
    dictionary.compactify()
    dictionary.save_as_text(save_path + dictionary_name)

    dictionary = Dictionary.load_from_text(save_path + dictionary_name)
    start_time = time.time()
    corpus_bow_stream = stream_corpus(data_path, dictionary, files)
    MmCorpus.serialize(save_path + corpus_bow_file_name,
                       corpus_bow_stream,
                       progress_cnt=10000)
    corpus_bow = MmCorpus(save_path + corpus_bow_file_name)
    model_tfidf = TfidfModel(corpus_bow, id2word=dictionary, normalize=True)
    model_tfidf.save(save_path + corpus_tfidf_model_file_name)
    corpus_tfidf = model_tfidf[corpus_bow]  # apply model
    MmCorpus.serialize(save_path + corpus_tfidf_file_name,
                       corpus_tfidf,
                       progress_cnt=1000)

    # Load the tf-idf corpus back from disk.
    corpus_tfidf = MmCorpus(save_path + corpus_tfidf_file_name)
    #n_items = len(dictionary)
    #print corpus_tfidf

    # CSR matrix construction phase
    indptr = [0]
    indices = []
    data = []
    # processing took about 9 min 26 s
    with tqdm(total=total_documents) as pbar:
        for doc in corpus_tfidf:
            for (index, values) in doc:
                indices.append(index)
                data.append(values)
            indptr.append(len(indices))
            pbar.update()

    start = time.time()
    sparse_matrix = sp.csr_matrix((data, indices, indptr), dtype=float)
    # saving took about 1 min 21 s
    sp.save_npz(save_path + csr_matrix_file_name[datasource], sparse_matrix)
    print "Finished in:", (time.time() - start)
# In[511]:

dictionary = Dictionary(tokens_lst)
print("Dictionary: ", dictionary)
corpus = []
for i in tokens_lst:
    corpus.append(dictionary.doc2bow(i))
print("Corpus:", corpus)

#file_path_corpus = "/Users/manukarreddy/Desktop/kiruba_python/mkbhd"
lda = LdaModel.load(
    "/Users/manukarreddy/Desktop/BigDataProject/mkbhdtfidf_modelfinal_lda")
#corpus = MmCorpus"))

#mkbhd_file_path_corpus = "/Users/manukarreddy/Desktop/kiruba_python/mkbhd"
mkbhd_corpus = MmCorpus(
    "/Users/manukarreddy/Desktop/BigDataProject/mkbhdmkbhd.mm")

# In[512]:

import random
import numpy


def random_floats(low, high, size):
    return [random.uniform(low, high) for _ in range(size)]


scores_ = random_floats(0.0001, 0.2, 57)

#scores_
from gensim.corpora import MmCorpus
from multiprocessing import Array, Process, Pool, Queue, Manager
import numpy as np
import tqdm
import sys
import gensim
import multiprocessing as mp
import io

from gensim.models import TfidfModel
from gensim.corpora import Dictionary, MmCorpus

tf_idf_model = TfidfModel.load('/mnt/disk/wikipedia/wikipedia.tfidf_model')
dct = Dictionary.load_from_text('/mnt/disk/wikipedia/wikipedia_wordids.txt.bz2')
corpus = MmCorpus('/mnt/disk/wikipedia/wikipedia_bow.mm')


def load_vectors(fname):
    data = {}
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, d = map(int, fin.readline().split())  # header: vocabulary size, vector dimension
        for line in tqdm.tqdm(fin):
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))  # materialize so the vectors survive in Python 3
    return data

w2v_fasttext = load_vectors('wiki-news-300d-1M.vec')
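# load_vectors returns a plain dict of word -> vector, so a quick sanity check
# is a cosine similarity between two words. Sketch only; it assumes both words
# exist in the wiki-news-300d-1M.vec vocabulary.
def cosine_similarity(u, v):
    u, v = np.asarray(u, dtype=float), np.asarray(v, dtype=float)
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))


print(cosine_similarity(w2v_fasttext['king'], w2v_fasttext['queen']))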

import numpy as np
Example #4
#                              gammaln(n.sum(self._lambda, 1)))
#
#        return(score)

if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running %s" % ' '.join(sys.argv))

    import os.path
    program = os.path.basename(sys.argv[0])

    # The number of documents to analyze each iteration
    vocab = WikiCorpus.loadDictionary(
        '/Users/kofola/gensim/results/wiki10_en_wordids.txt')
    corpus = MmCorpus('/Users/kofola/gensim/results/wiki10_en_bow.mm')
    sumcnts = sum(sum(cnt for _, cnt in doc) for doc in corpus)
    logger.info("running LDA on %i documents, %i total tokens" %
                (len(corpus), sumcnts))

    batchsize = 100000
    D = 100000  # total number of docs
    K = 100  # number of topics
    iterations = int(sys.argv[1])

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1, kappa=0 (as passed below)
    olda = OnlineLDA(vocab.values(), K, D, 1. / K, 1. / K, 1., kappa=0.0)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, iterations):
        # maybe select only a subset of corpus here (to simulate their "stochastic" approach)
Example #5
        wiki.save(outp + '_corpus.pkl.bz2')
        dictionary.allow_update = False
    else:
        wiki = WikiCorpus(
            inp, lemmatize=lemmatize
        )  # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20,
                                        no_above=0.1,
                                        keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki,
                           progress_cnt=10000)  # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Example #6
                        words in both sentences.)
'''

# Please modify all the paths to your resources
print('Modify the paths to your corpus in the config.ini file')
input()

config = ConfigParser()
config.read('config.ini')

# TODO: Generalize this step, setting corpus_path to your actual corpus
#Config file must allow wikipedia, Gutenberg, ...
corpus_path = config['WIKI']['en'][1:-1]

dictionary = Dictionary.load_from_text(os.path.relpath(corpus_path+'_wordids.txt.bz2'))
bow_corpus = MmCorpus(os.path.relpath(corpus_path+'_bow.mm'))

try:
    tfidf = TfidfModel.load(corpus_path+'wiki-tfidf.model')
except (IOError, OSError):
    tfidf = TfidfModel(bow_corpus, id2word=dictionary)
    tfidf.save(corpus_path+'wiki-tfidf.model')

#testing sentences
sentence1 = 'pilar pescado en la tarde es fatal'
sentence2 = 'machacar pescado al atardecer es terrible'

#Transforming sentences
sent1 = sentence1.split()
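# The example is cut off here; a hedged sketch of the remaining comparison,
# assuming both sentences should be mapped into the Wikipedia TF-IDF space and
# compared with a cosine (gensim.matutils.cossim works on sparse BOW vectors):
from gensim.matutils import cossim

sent2 = sentence2.split()
vec1 = tfidf[dictionary.doc2bow(sent1)]
vec2 = tfidf[dictionary.doc2bow(sent2)]
print('TF-IDF cosine similarity:', cossim(vec1, vec2))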
Example #7
background_corpus.dictionary.save("my_dict.dict")

MmCorpus.serialize("background_corpus.mm", background_corpus)

from gensim.corpora import WikiCorpus, wikicorpus

articles = "enwiki-latest-pages-articles.xml.bz2" 

wiki_corpus = WikiCorpus(articles)
wiki_corpus.dictionary.save("wiki_dict.dict")

MmCorpus.serialize("wiki_corpus.mm", wiki_corpus) 

bow_corpus = MmCorpus("wiki_corpus.mm") 

dictionary = Dictionary.load("wiki_dict.dict")  

from gensim.models import LsiModel, LogEntropyModel

logent_transformation = LogEntropyModel(wiki_corpus, id2word=dictionary)

tokenize_func = wikicorpus.tokenize  
document = "Some text to be transformed."

bow_document = dictionary.doc2bow(tokenize_func(document))
logent_document = logent_transformation[[bow_document]]
Example #8
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
# %matplotlib inline

from gensim.corpora import Dictionary, MmCorpus
trigram_dictionary = Dictionary.load('./models2/trigram_dict_all.dict')
trigram_bow_corpus = MmCorpus('./models2/trigram_bow_corpus.nm')
# Document to matrix
import numpy as np
from scipy.sparse import csr_matrix
rows = []
cols = []
data = []
Nrow = 1000000  # len(trigram_bow_corpus)
Ncol = len(trigram_dictionary)
for i in range(Nrow):
    line = trigram_bow_corpus[i]
    for indx, freq in line:
        rows.append(i)
        cols.append(indx)
        data.append(freq)
dtm = csr_matrix((data, (rows, cols)), shape=(Nrow, Ncol), dtype=int)
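# LatentDirichletAllocation is imported above but never used; a hedged sketch
# of fitting it on the sparse document-term matrix built here (n_components and
# the other parameters are illustrative choices, not from the original):
sk_lda = LatentDirichletAllocation(n_components=15,
                                   learning_method='online',
                                   random_state=0)
doc_topic = sk_lda.fit_transform(dtm)
print(doc_topic.shape)  # (Nrow, n_components)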
Example #9
        print(" ============ RELATED TITLES =======================")

        related_articles = get_related_articles(doc, 10)
        for article in related_articles:
            print(article)

    argparser = argparse.ArgumentParser()
    argparser.add_argument('--fileName')

    args = argparser.parse_args()
    titles, texts, documents, urls = load_stuff(args.fileName)

    dictionary = corpora.Dictionary.load(
        basename(args.fileName) +
        '.dict')  # store the dictionary, for future reference
    corpus = MmCorpus(basename(args.fileName) + '.mm')
    lsi = models.LsiModel.load(basename(args.fileName) + '.lsi')

    index = similarities.MatrixSimilarity.load(
        basename(args.fileName) +
        '.index')  # transform corpus to LSI space and index it

    do_print_related(
        "TO FIX ITS TOXIC AD PROBLEM, FACEBOOK MUST BREAK ITSELF\nIT IS A sure sign that Facebook’s algorithms have run amok when they allow anyone to target ads to people with an expressed interest in burning Jews. Likewise, when Russians can sow chaos in American elections by purchasing thousands of phony Facebook ads without Facebook realizing it, the automated systems selling those ads may need some oversight."
    )

    do_print_related(
        "ABOUT A WEEK ago, Stanford University researchers (posted online)[https://osf.io/zn79k/] a study on the latest dystopian AI: They'd made a machine learning algorithm that essentially works as gaydar. After training the algorithm with tens of thousands of photographs from a dating site, the algorithm could, for example, guess if a white man in a photograph was gay with 81 percent accuracy. The researchers’ motives? They wanted to protect gay people. "
    )

    do_print_related(
#!/usr/bin/env python

from gensim.models import LdaModel
from gensim.corpora import MmCorpus, Dictionary
import sys, os
import pyLDAvis.gensim

if len(sys.argv) < 2:
    print("usage: {0} [path to model.lda]\n".format(sys.argv[0]))
    sys.exit(1)

path, file = os.path.split(sys.argv[1])
corpusname = file.split(".")[0]

dictionary = Dictionary.load(path + "/" + corpusname + ".dict")
corpus = MmCorpus(path + "/" + corpusname + ".mm")
model = LdaModel.load(sys.argv[1])

##############
# cf. https://pyldavis.readthedocs.org/en/latest/modules/API.html

vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)

pyLDAvis.save_html(vis, path + "/" + corpusname + "_interactive.html")
pyLDAvis.show(vis)
Example #11
)  #  Uses numpy to persist wiki corpus in Matrix Market format. File will be several GBs.

### Generating a large training/background corpus using Wikipedia
from gensim.corpora import WikiCorpus, wikicorpus

articles = "enwiki-latest-pages-articles.xml.bz2"  # available from http://en.wikipedia.org/wiki/Wikipedia:Database_download

wiki_corpus = WikiCorpus(
    articles
)  # This will take many hours! Output is Wikipedia in bucket-of-words (BOW) sparse matrix.
wiki_corpus.dictionary.save("wiki_dict.dict")

MmCorpus.serialize("wiki_corpus.mm", wiki_corpus)  #  File will be several GBs.

### Working with persisted corpus and dictionary
bow_corpus = MmCorpus("wiki_corpus.mm")  # Revive a corpus

dictionary = Dictionary.load("wiki_dict.dict")  # Load a dictionary

### Transformations among vector spaces
from gensim.models import LsiModel, LogEntropyModel

logent_transformation = LogEntropyModel(
    wiki_corpus, id2word=dictionary
)  # Log Entropy weights frequencies of all document features in the corpus

tokenize_func = wikicorpus.tokenize  # The tokenizer used to create the Wikipedia corpus
document = "Some text to be transformed."
bow_document = dictionary.doc2bow(
    tokenize_func(document)
)  # First, tokenize document using the same tokenization as was used on the background corpus, and then convert it to BOW representation using the dictionary created when generating the background corpus.
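# The excerpt stops after building the BOW vector; a hedged continuation that
# mirrors the transformation chain described above: apply the log-entropy
# weighting, then an optional LSI projection with the imported LsiModel.
# num_topics is an illustrative choice, not from the original.
logent_document = logent_transformation[bow_document]

lsi_transformation = LsiModel(logent_transformation[wiki_corpus],
                              id2word=dictionary,
                              num_topics=400)
lsi_document = lsi_transformation[logent_document]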
Example #12
    """
    trigram_reviews_filepath = os.path.join(
        'results', 'trigram_transformed_reviews_all.txt')

    trigram_dictionary_filepath = os.path.join('trigram_dict_all.dict')

    learn_vocab_corpus(trigram_reviews_filepath, trigram_dictionary_filepath)

    # load the finished dictionary from disk
    trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

    trigram_bow_filepath = os.path.join('trigram_bow_corpus_all.mm')

    create_bow(trigram_reviews_filepath, trigram_bow_filepath,
               trigram_dictionary)

    # load the finished bag-of-words corpus from disk
    trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
    """
    find topics
    """

    lda_model_filepath = os.path.join('lda_model_all')

    create_topics(lda_model_filepath, trigram_bow_corpus, trigram_dictionary)

    # load the finished LDA model from disk
    lda = LdaMulticore.load(lda_model_filepath)

    explore_topic(lda, topic_number=0)
Example #13
cleared_docs = [[token for token in document if token in cleared_tokens]
                for document in cleared_docs]

## Save dictionary in serialized form
dictionary = Dictionary(cleared_docs)
dictionary.save('./dictionaries/python_tags.dict')
corpus = [dictionary.doc2bow(document) for document in cleared_docs]
MmCorpus.serialize('./dictionaries/python_tags.mm', corpus)

########################################
## Load Data
########################################
if (os.path.exists("./dictionaries/python_tags.dict")):
    dictionary = Dictionary.load('./dictionaries/python_tags.dict')
    corpus = MmCorpus('./dictionaries/python_tags.mm')
    print("Used dictionary generated")
else:
    print("Please run the preprocessing to generate a dictionary file")

########################################
## Create Model
########################################
print(corpus)
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

########################################
## Applying LSI
########################################
lsi = LsiModel(corpus_tfidf,
Example #14
def LDA_Analysis():
    #http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb

    if 0 == 1:
        with open('data/review_text_all.txt','w') as myfile:
            myfile.write("")
        
        '''
        loop through db and write jobs descriptions
        '''
        
        with open('data/review_text_all.txt','a') as myfile:
            with Job() as db:
                a=0
                max_ = int(db.getNoJobs()[0][0])
                while (a < max_):
                    #print(a)
                    sample_review = db.readJobDetailClean(a)[0][1]
                    if (sample_review != 'Json Error'):
                        myfile.write(str(sample_review)+'\n')
                    a += 1
    
    #unigram_sentences_filepath = os.path.join(intermediate_directory, 'unigram_sentences_all.txt')
    
    if 0 == 1:
    
        with codecs.open('data/unigram_sentences_all.txt', 'w', encoding='utf_8') as f:
            for sentence in lemmatized_sentence_corpus('data/review_text_all.txt'):
                f.write(sentence + '\n')
    
    unigram_sentences = LineSentence('data/unigram_sentences_all.txt')
   
    '''
    for unigram_sentence in it.islice(unigram_sentences, 230, 240):
        print(u' '.join(unigram_sentence))
        print(u'')
    '''
        
    #bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all')
    
    if 0 == 1:

        bigram_model = Phrases(unigram_sentences)  # Phrases expects an iterable of token lists, not a file path
    
        bigram_model.save('data/bigram_model_all')
    
    # load the finished model from disk
    bigram_model = Phrases.load('data/bigram_model_all')
    
    #bigram_sentences_filepath = os.path.join(intermediate_directory, 'bigram_sentences_all.txt')
   
    if 0 == 1:
    
        with codecs.open('data/bigram_sentences_all.txt', 'w', encoding='utf_8') as f:
            
            for unigram_sentence in unigram_sentences:
                
                bigram_sentence = u' '.join(bigram_model[unigram_sentence])
                
                f.write(bigram_sentence + '\n')
            
    bigram_sentences = LineSentence('data/bigram_sentences_all.txt')
            
    '''                    
    for bigram_sentence in it.islice(bigram_sentences, 230, 240):
        print(u' '.join(bigram_sentence))
        print(u'')  
    '''

    #trigram_model_filepath = os.path.join(intermediate_directory, 'trigram_model_all')

    if 0 == 1:
    
        trigram_model = Phrases(bigram_sentences)
    
        trigram_model.save('data/trigram_model_all')
        
    # load the finished model from disk
    trigram_model = Phrases.load('data/trigram_model_all')

    #trigram_sentences_filepath = os.path.join(intermediate_directory, 'trigram_sentences_all.txt')                     

    if 0 == 1:
    
        with codecs.open('data/trigram_sentences_all.txt', 'w', encoding='utf_8') as f:
            
            for bigram_sentence in bigram_sentences:
                
                trigram_sentence = u' '.join(trigram_model[bigram_sentence])
                
                f.write(trigram_sentence + '\n')
                
    trigram_sentences = LineSentence('data/trigram_sentences_all.txt')

    '''
    for trigram_sentence in it.islice(trigram_sentences, 230, 240):
        print(u' '.join(trigram_sentence))
        print(u'')
    '''

    #trigram_reviews_filepath = os.path.join(intermediate_directory, 'trigram_transformed_reviews_all.txt')
    
    if  0 == 1:
      
        import csv
        
        '''
        Variant A: Use Stopwords
        1) download StopWords.csv from MySQL table: KeyWords.
        2) Remove all relevant words by hand ;)
        '''
        with open('data/StopWords.csv', newline='') as csvfile:
          
          stopwords_ = csv.reader(csvfile, delimiter=' ', quotechar='|')
          for words_ in stopwords_:
            #print(words_[0])
            STOP_WORDS.add(words_[0])
    
        #print(STOP_WORDS)
        
        '''
        Variant B: Use Dictionary
        '''
        with open('data/Dictionary.csv', 'r', newline='') as csvfile:
          
          file_ = csv.reader(csvfile, delimiter=',', quotechar='"')
          
          dictionary_ = []

          for row in file_:
              dictionary_.append(row[0])
          
          #with open('file.csv', 'r') as f:
  #reader = csv.reader(f)
  #your_list = list(reader)
    
    
        with codecs.open('data/trigram_transformed_reviews_all.txt', 'w', encoding='utf_8') as f:
            
            for parsed_review in nlp.pipe(line_review('data/review_text_all.txt'), batch_size=10000, n_threads=4):
                
                # lemmatize the text, removing punctuation and whitespace
                unigram_review = [token.lemma_ for token in parsed_review
                                  if not punct_space(token)]
                
                # apply the first-order and second-order phrase models
                bigram_review = bigram_model[unigram_review]
                trigram_review = trigram_model[bigram_review]
                
                # remove any remaining stopwords
                '''
                Variant A:
                '''
                #trigram_review = [term for term in trigram_review
                #                  if term not in STOP_WORDS]#spacy.en.STOPWORDS] !!!!! CHECK THIS !!!!! module 'spacy' has no attribute 'en'
                
                '''
                Variant B:
                '''
                trigram_review = [term for term in trigram_review
                                  if term in dictionary_]#
                
                # write the transformed review as a line in the new file
                trigram_review = u' '.join(trigram_review)
                f.write(trigram_review + '\n')
                
    '''
    print(u'Original:' + u'\n')
    
    for review in it.islice(line_review('review_text_all.txt'), 11, 12):
        print(review)
    
    print(u'----' + u'\n')
    print(u'Transformed:' + u'\n')
    
    with codecs.open('trigram_transformed_reviews_all.txt', encoding='utf_8') as f:
        for review in it.islice(f, 11, 12):
            print(review)
    '''

    #trigram_dictionary_filepath = os.path.join(intermediate_directory, 'trigram_dict_all.dict')

    if 0 == 1:
    
        trigram_reviews = LineSentence('data/trigram_transformed_reviews_all.txt')
    
        # learn the dictionary by iterating over all of the reviews
        trigram_dictionary = Dictionary(trigram_reviews)
        
        # filter tokens that are very rare or too common from
        # the dictionary (filter_extremes) and reassign integer ids (compactify)
        trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)#,keep_n=100000)#,)
        trigram_dictionary.compactify()
    
        trigram_dictionary.save('data/trigram_dict_all.dict')
        
    # load the finished dictionary from disk
    trigram_dictionary = Dictionary.load('data/trigram_dict_all.dict')
    
    #trigram_bow_filepath = os.path.join(intermediate_directory, 'trigram_bow_corpus_all.mm')
    
    if 0 == 1:
    
        # generate bag-of-words representations for
        # all reviews and save them as a matrix
        MmCorpus.serialize('data/trigram_bow_corpus_all.mm', trigram_bow_generator(trigram_dictionary,'data/trigram_transformed_reviews_all.txt'))
        
    # load the finished bag-of-words corpus from disk
    trigram_bow_corpus = MmCorpus('data/trigram_bow_corpus_all.mm')
    
    #lda_model_filepath = os.path.join(intermediate_directory, 'lda_model_all')
    
    if 0 == 1:
    
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            
            # workers => sets the parallelism, and should be
            # set to your number of physical cores minus one
            lda = LdaMulticore(trigram_bow_corpus,
                               num_topics=15,
                               id2word=trigram_dictionary,
                               workers=1)
        
        lda.save('data/lda_model_all')
        
    # load the finished LDA model from disk
    lda = LdaMulticore.load('data/lda_model_all')

    #explore_topic(lda, topic_number=1)

    topic_names = {0:u'Risk Management Bank', 
                   1:u'Big Data Report', 
                   2:u'Automotive SAP', 
                   3:u'Microsoft Java Scrum', 
                   4:u'Medical Consultant', 
                   5:u'Java Engineer', 
                   6:u'Computer Vision Developer', 
                   7:u'Data Analyst', 
                   8:u'BI SAP BW', 
                   9:u'IOT Reporting R', 
                   10:u'Global Project Presentation',
                   11:u'Cloud Engineer IOT', 
                   12:u'Industry 4.0', 
                   13:u'Risk Consulting', 
                   14:u'Machine Learning Data Science'}
    
    #topic_names_filepath = os.path.join(intermediate_directory, 'topic_names.pkl')
    
    with open('data/topic_names.pkl', 'wb') as f:
        pickle.dump(topic_names, f)
    
    #load sameple_review from database
    #sample_review = get_sample_review(10)
    
    #lda_description(bigram_model, trigram_model, trigram_dictionary, lda, topic_names, sample_review)

    #LDAvis_data_filepath = os.path.join(intermediate_directory, 'ldavis_prepared')
    
    if 0 == 1:
        
        #term_ix = np.sort(topic_info.index.unique().values)
    
        LDAvis_prepared = pyLDAvis.gensim_.prepare(lda, trigram_bow_corpus, trigram_dictionary)
    
        with open('data/ldavis_prepared', 'wb') as f:
            pickle.dump(LDAvis_prepared, f)
            
    '''
    export LDA file
    '''
    
    # load the pre-prepared pyLDAvis data from disk
    with open('data/ldavis_prepared', 'rb') as f:
        LDAvis_prepared = pickle.load(f)

    with open('data/DSJobs_LDA.html', 'w') as f:
        pyLDAvis.save_html(LDAvis_prepared, f)            
Example #15
                      help="specify LDA model.")
    args.add_argument("-s",
                      "--save_to_file",
                      type=str,
                      help="speficy file which the HTML will be saved to.")
    args.add_argument("-t",
                      "--use_tfidf",
                      action="store_true",
                      help="use TF-IDF corpus.")
    args.add_argument(
        "--method",
        type=str,
        default="pcoa",
        help="specify a method for MDS by one from 'pcoa', 'mmds', or 'tsne'.")
    return args.parse_args()


if __name__ == "__main__":
    args = parse_arg()
    model = LdaModel.load(args.model[0])
    corpus = MmCorpus(args.corpus[0])
    if args.use_tfidf:
        tfidf = TfidfModel(corpus)
        corpus = tfidf[corpus]
    dictionary = Dictionary.load_from_text(args.dictionary[0])
    vis = pyLDAvis.gensim.prepare(model, corpus, dictionary, mds=args.method)
    if args.save_to_file is not None:
        pyLDAvis.save_html(vis, args.save_to_file)
    else:
        pyLDAvis.show(vis)
Example #16
# approximately 8 hours on a 8GB machine with a dual core processor

wiki_corpus = corpora.wikicorpus.WikiCorpus(wiki_file)

print('Finished making the wikicorpus, saving BOW corpus\n')
corpora.mmcorpus.MmCorpus.serialize('../data/wiki_en_vocab200k', wiki_corpus)
print('Done saving BOW Corpus\n')

# Save the dictionary, you will need it to convert future documents into
# BOW format

#wiki.dictionary.save("../data/wiki_dict.dict")
#print 'Saved dictionary'

print('Creating LogEntropy TF-IDF and regular TF-IDF matrices and models')
BOW_corpus = MmCorpus('../data/wiki_en_vocab200k') #Resurrect BOW corpus

#log_entropy = LogEntropyModel(BOW_corpus)
#log_entropy.save('../models/logEntropy.model') #already provided
log_entropy = LogEntropyModel.load('../models/logEntropy.model')
corpora.mmcorpus.MmCorpus.serialize('../data/log_entropy_matrix',
                                    log_entropy[BOW_corpus])

print('Saved LogEntropy TF-IDF matrix')

#tfidf = TfidfModel(BOW_corpus)
#tfidf.save('../models/tfidf.model') #already provided
tfidf = TfidfModel.load('../models/tfidf.model')
# NOTE: assumed output path; the original reused '../data/log_entropy_matrix' here,
# which would overwrite the log-entropy matrix saved above.
corpora.mmcorpus.MmCorpus.serialize('../data/tfidf_matrix',
                                    tfidf[BOW_corpus])
def SAVE_CORPUS_MM_FORMAT():
    ###save corpus into mm format
    MmCorpus.serialize('bow_corpus.mm', bow_corpus)
    mm = MmCorpus('bow_corpus.mm')
    print(mm[1]) #retrieve doc 1
if __name__ == '__main__':

    # training_file_path = 'E:/2017_Deep_learning/text similarity'
    training_file_path = './keywords/sentidata'

    # Lsi model
    dictionary = Dictionary()
    corpus = sohu_corpus(fname=os.path.join(training_file_path, 'neg_1.txt'),
                         dic=dictionary)

    # save dictionary
    # dictionary.save(os.path.join(training_file_path, '07_11_dictionary.dict'))
    MmCorpus.serialize(os.path.join(training_file_path, '07_11_corpus_12.mm'),
                       corpus)
    # dictionary = Dictionary.load(os.path.join(training_file_path, '07_11_dictionary.dict'))
    corpus_tfidf_mm = MmCorpus(
        os.path.join(training_file_path, '07_11_corpus_12.mm'))

    training_src_data = sogou_corpus_file(
        os.path.join(training_file_path, 'neg_1.txt'))
    training_src = []
    for each_file in training_src_data:
        training_src.append(each_file)

    # convert counts to tfidf
    tfidf = TfidfModel(corpus=corpus_tfidf_mm)

    index = MatrixSimilarity(tfidf[corpus_tfidf_mm])

    sims = index[tfidf[dictionary.doc2bow(['阳台', '打死'])]]
    print('doc2bow:')
    print(dictionary.doc2bow(['阳台']))
# import argparse

from gensim.corpora import Dictionary, MmCorpus
from gensim.models.nmf import Nmf
from gensim.models import TfidfModel

from codebase.utils import TweetRawCorpusStream
from codebase.topic_utilities import export_dtm

if __name__ == "__main__":

    corpora_path = "./corpora/"
    model_path = "./models/"
    num_topics = 50
    model_suffix = "-{}topics".format(num_topics)
    modelTag = "Seventh-and-EighthWeek-Tweets-Rolling"

    nmf = Nmf.load("{}{}{}.model".format(model_path, modelTag, model_suffix))

    fileTag_list = ["Fifth-and-SixthWeek-Tweets-Rolling"]
    for fileTag in fileTag_list:
        tfidf_corpus = MmCorpus('{}{}-tf-idf.mm'.format(corpora_path, fileTag))
        export_dtm(nmf=nmf, corpus=tfidf_corpus,\
            out_path="{}{}{}-dtm.csv".format(model_path, fileTag, model_suffix),\
            stop_at=None)
Example #20
from gensim.models.ldamodel import LdaModel

logger = logging.getLogger(__name__)

if __name__ == '__main__':

    '''
    What           Visualize LDA topic model using pyLDAvis
    Documentation  https://pyldavis.readthedocs.io/en/latest/
    Source         https://github.com/bmabey/pyLDAvis
    Article        https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf
    '''
    data_folder = '/tmp/Data'
    models_names = [ ]  # store.get_model_names(data_folder)

    logger.info(models_names)
    OPTS = { 'R': 100, 'mds': 'tsne', 'sort_topics': False, 'plot_opts': { 'xlab': 'PC1', 'ylab': 'PC2' } }

    for basename in models_names:

        target_folder = os.path.join(data_folder, basename)
        corpus_filename = os.path.join(target_folder, 'corpus.mm')
        dictionary_filename = os.path.join(target_folder, 'corpus.dict.gz')
        model_filename = os.path.join(target_folder, 'gensim_model_{}.gensim.gz'.format(basename))

        lda = LdaModel.load(model_filename)
        dictionary = Dictionary.load(dictionary_filename)
        corpus = MmCorpus(corpus_filename)

        convert_to_pyLDAvis(data_folder, basename, **OPTS)
Example #21
    input, output = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE

    # build dictionary. only keep 200k most frequent words (out of total ~7m unique tokens)
    # takes about 8h on a macbook pro
    wiki = WikiCorpus(input, keep_words=keep_words)

    # save dictionary and bag-of-words
    # another ~8h
    wiki.saveAsText(output)
    del wiki

    # initialize corpus reader and word->id mapping
    from gensim.corpora import MmCorpus
    id2token = WikiCorpus.loadDictionary(output + '_wordids.txt')
    mm = MmCorpus(output + '_bow.mm')

    # build tfidf
    # ~20min
    from gensim.models import TfidfModel
    tfidf = TfidfModel(mm, id2word=id2token, normalize=True)

    # save tfidf vectors in matrix market format
    # ~1.5h; result file is 14GB! bzip2'ed down to 4.5GB
    MmCorpus.saveCorpus(output + '_tfidf.mm', tfidf[mm], progressCnt=10000)

    logging.info("finished running %s" % program)
Example #22
def loadCorpus(corpusPath):
    return MmCorpus(corpusPath)
def main():
    parser = argparse.ArgumentParser(
        description=
        'maps a given document-author-contribution file to a weighted bipartite network of document and author nodes'
    )
    parser.add_argument(
        '--contribs',
        type=argparse.FileType('r'),
        help='path to input contribution MatrixMarket file (.mm/.mm.bz2)',
        required=True)
    parser.add_argument('--bipart-graph',
                        type=argparse.FileType('w'),
                        help='path to output graph (.graph/.graph.bz2) file',
                        required=True)
    parser.add_argument('--top-n-contribs',
                        type=int,
                        help='keep at most N highest contribs per author',
                        required=True)

    args = parser.parse_args()
    input_contribs_path = args.contribs.name
    output_bipart_graph_path = args.bipart_graph.name
    top_n_contribs = args.top_n_contribs

    logger.info('running with:\n{}'.format(
        pformat({
            'input_contribs_path': input_contribs_path,
            'output_bipart_graph_path': output_bipart_graph_path,
            'top_n_contribs': top_n_contribs
        })))

    # load the saved contributions
    contribs = MmCorpus(input_contribs_path)
    num_docs = contribs.num_docs
    num_authors = contribs.num_terms
    logger.info('processing contributions of {} documents, {} authors'.format(
        num_docs, num_authors))

    # build the bipartite affiliation network: documents & authors become nodes, document-author contributions become the corresponding weighted edges
    bipart_graph = nx.Graph()
    doc_nodes = tuple('d' + str(n) for n in range(0, num_docs))
    bipart_graph.add_nodes_from(doc_nodes, bipartite=0)
    auth_nodes = tuple('a' + str(n) for n in range(0, num_authors))
    bipart_graph.add_nodes_from(auth_nodes, bipartite=1)
    bipart_graph.add_weighted_edges_from(get_edges_from_contribs(contribs),
                                         weight='weight')
    log_nwx(bipart_graph)
    logger.info('bipartite? {}'.format(bipartite.is_bipartite(bipart_graph)))
    simplify_graph_nwx(bipart_graph)
    logger.info(
        'actual numbers after simplifying: {} docs, {} authors, {} edges'.
        format(*get_bipartite_node_counts(bipart_graph),
               len(bipart_graph.edges)))

    # report the author node with the highest degree
    max_degree_author = max(bipart_graph.degree(auth_nodes),
                            key=lambda node_deg: node_deg[1])
    logger.info('author {} having max degree of {}'.format(*max_degree_author))

    # update the node variables
    doc_nodes, auth_nodes = get_bipartite_nodes(bipart_graph)

    # prune each author's incident edges down to the K edges with the largest weights
    logger.info('pruning to top {} edges per author'.format(top_n_contribs))
    for auth_node in auth_nodes:
        logger.debug('author {}'.format(auth_node))
        auth_edges = bipart_graph[auth_node]
        auth_edges = tuple((neighbor, weight['weight'])
                           for neighbor, weight in auth_edges.items())
        logger.debug('incident edges \n{}'.format(pformat(auth_edges)))
        num_remove = len(auth_edges) - top_n_contribs
        author_min_edges = nsmallest(num_remove,
                                     auth_edges,
                                     key=lambda edge: edge[1])
        logger.debug('removing edges \n{}'.format(pformat(author_min_edges)))
        bipart_graph.remove_edges_from(
            (auth_node, neighbor) for neighbor, weight in author_min_edges)

    # keep_max_edges = 10000
    # logger.info('pruning to {} highest edges'.format(keep_max_edges))
    # num_edges_to_remove = len(bipart_graph.edges) - keep_max_edges
    # min_edges = nsmallest(num_edges_to_remove, bipart_graph.edges(data='weight'), key=lambda edge: edge[2])
    # bipart_graph.remove_edges_from(min_edges)
    # log_nwx(bipart_graph)

    # report the author node with the highest degree
    max_degree_author = max(bipart_graph.degree(auth_nodes),
                            key=lambda node_deg: node_deg[1])
    logger.info('author {} having max degree of {}'.format(*max_degree_author))

    # remove isolated nodes
    simplify_graph_nwx(bipart_graph)
    log_nwx(bipart_graph)
    logger.info('new number of documents {}, authors {}'.format(
        *get_bipartite_node_counts(bipart_graph)))

    # save the affiliation network
    logger.info('writing graph to {}'.format(output_bipart_graph_path))
    nx.write_gpickle(bipart_graph, output_bipart_graph_path)
Example #24
def lda():
    """ LDA model
    https://radimrehurek.com/gensim/models/ldamodel.html

    num_topics is the number of requested latent topics to be extracted from the
    training corpus.

    id2word is a mapping from word ids (integers) to words (strings). It is used
    to determine the vocabulary size, as well as for debugging and topic
    printing.

    alpha and eta are hyperparameters that affect sparsity of the document-topic
    (theta) and topic-word (lambda) distributions. Both default to a symmetric
    1.0/num_topics prior.

    alpha can be set to an explicit array = prior of your choice. It also
    support special values of ‘asymmetric’ and ‘auto’: the former uses a fixed
    normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
    prior directly from your data.

    eta can be a scalar for a symmetric prior over topic/word distributions, or
    a vector of shape num_words, which can be used to impose (user defined)
    asymmetric priors over the word distribution. It also supports the special
    value ‘auto’, which learns an asymmetric prior over words directly from your
    data. eta can also be a matrix of shape num_topics x num_words, which can be
    used to impose asymmetric priors over the word distribution on a per-topic
    basis (can not be learned from data).

    Calculate and log perplexity estimate from the latest mini-batch every
    eval_every model updates (setting this to 1 slows down training ~2x; default
    is 10 for better performance). Set to None to disable perplexity estimation.

    decay and offset parameters are the same as Kappa and Tau_0 in Hoffman et
    al, respectively.

    minimum_probability controls filtering the topics returned for a document
    (bow).

    random_state can be a np.random.RandomState object or the seed for one

    callbacks a list of metric callbacks to log/visualize evaluation metrics of
    topic model during training

    The model can be updated (trained) with new documents via
    >>> lda.update(other_corpus)

    You can then infer topic distributions on new, unseen documents, with
    >>> doc_lda = lda[doc_bow]

    """

    # load word-id dictionary
    id2word = Dictionary.load('foobar.txtdic')
    # load matrix market format bow vectors
    # mm = MmCorpus('bow.mm')
    # load Tfidf Model in matrix market format
    mm = MmCorpus('tfidf_JD.mm')
    # train LDA model
    lda = LdaModel(
        corpus=mm, id2word=id2word, num_topics=21, distributed=False,
        chunksize=2000, passes=3, update_every=1, alpha='symmetric',
        decay=0.5, offset=1.0, eval_every=10, iterations=50,
        gamma_threshold=0.001, minimum_probability=0.01, random_state=None,
        ns_conf=None, minimum_phi_value=0.01, per_word_topics=False,
        callbacks=None)

    # save LDA model
    lda.save('lda.model')
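
    # Hedged usage sketch, following the docstring above: update the trained
    # model with more documents and infer topics for an unseen one.
    # 'other_corpus.mm' and the sample text are hypothetical placeholders.
    other_corpus = MmCorpus('other_corpus.mm')
    lda.update(other_corpus)

    doc_bow = id2word.doc2bow("some unseen job description text".lower().split())
    doc_lda = lda[doc_bow]
    print(doc_lda)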
                       resultDir=gensim_build.RESULT_DIR,
                       acceptLangs=[language])
    try:
        dml = DmlCorpus.load(config.resultFile('.pkl'))
    except IOError:
        raise IOError(
            "no word-count corpus found at %s; you must first generate it through gensim_build.py"
            % config.resultFile('.pkl'))

    logging.info("loading word id mapping from %s" %
                 config.resultFile('wordids.txt'))
    id2word = DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
    logging.info("loaded %i word ids" % len(id2word))

    if method == 'tfidf':
        corpus = MmCorpus(config.resultFile('bow.mm'))
        model = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
        model.save(config.resultFile('tfidfmodel.pkl'))
    elif method == 'lda':
        corpus = MmCorpus(config.resultFile('bow.mm'))
        model = ldamodel.LdaModel(corpus, id2word=id2word, numTopics=DIM_LDA)
        model.save(config.resultFile('ldamodel%i.pkl' % DIM_LDA))
    elif method == 'lsi' or method == 'lsa':
        # first, transform word counts to tf-idf weights
        corpus = MmCorpus(config.resultFile('bow.mm'))
        tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
        # then find the transformation from tf-idf to latent space
        model = lsimodel.LsiModel(tfidf.apply(corpus),
                                  id2word=id2word,
                                  numTopics=DIM_LSI)
        model.save(config.resultFile('lsimodel%i.pkl' % DIM_LSI))
import warnings
import pickle
import time

from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

trigram_dict_file = 'trigram_dict.dict'
trigram_dictionary = Dictionary.load(trigram_dict_file)

trigram_threads_bow_file = 'trigram_threads_bow_corpus.mm'
trigram_users_bow_file = 'trigram_users_bow_corpus.mm'

trigram_threads_bow_corpus = MmCorpus(trigram_threads_bow_file)
trigram_users_bow_corpus = MmCorpus(trigram_users_bow_file)

lda_threads_model_file = "lda_threads_model"
lda_users_model_file = "lda_users_model"

lda_threads = LdaMulticore.load(lda_threads_model_file)
lda_users = LdaMulticore.load(lda_users_model_file)

LDAvis_threads_file = 'ldavis_threads_prep'
LDAvis_users_file = 'ldavis_users_prep'

t0 = time.time()
LDAvis_threads_prep = pyLDAvis.gensim.prepare(lda_threads, trigram_threads_bow_corpus, trigram_dictionary)

t1 = time.time()
Example #27
    language = sys.argv[1]
    method = sys.argv[2].strip().lower()

    logging.info("loading corpus mappings")
    config = dmlcorpus.DmlConfig('gensim_%s' % language,
                                 resultDir=gensim_build.RESULT_DIR,
                                 acceptLangs=[language])

    logging.info("loading word id mapping from %s" %
                 config.resultFile('wordids.txt'))
    id2word = dmlcorpus.DmlCorpus.loadDictionary(
        config.resultFile('wordids.txt'))
    logging.info("loaded %i word ids" % len(id2word))

    corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl'))
    input = MmCorpus(config.resultFile('corpus_%s.mm' % method))
    assert len(input) == len(
        corpus
    ), "corpus size mismatch (%i vs %i): run ./gensim_genmodel again" % (
        len(input), len(corpus))

    # initialize structure for similarity queries
    if method == 'lsi' or method == 'rp':  # for these methods, use dense vectors
        index = MatrixSimilarity(input,
                                 numBest=MAX_SIMILAR + 1,
                                 numFeatures=input.numTerms)
    else:
        index = SparseMatrixSimilarity(input, numBest=MAX_SIMILAR + 1)

    index.normalize = False  # do not normalize query vectors during similarity queries (the index is already built normalized, so it would be a no-op)
    generateSimilar(
    logging.info("loading corpus mappings")
    try:
        dml = DmlCorpus.load(config.resultFile('.pkl'))
    except IOError:
        raise IOError(
            "no word-count corpus found at %s; you must first generate it through gensim_build.py"
            % config.resultFile('.pkl'))
    config = dml.config

    logging.info("loading word id mapping from %s" %
                 config.resultFile('wordids.txt'))
    id2word = DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
    logging.info("loaded %i word ids" % len(id2word))

    input = MmCorpus(config.resultFile('bow.mm'))

    if method == 'tfidf':
        model = tfidfmodel.TfidfModel.load(modelfname('tfidf'))
    elif method == 'lsi':
        tfidf = tfidfmodel.TfidfModel.load(modelfname('tfidf'))
        input = tfidf[input]  # transform to tfidf
        model = lsimodel.LsiModel.load(modelfname('lsi'))
    elif method == 'lda':
        model = ldamodel.LdaModel.load(modelfname('lda'))
    else:
        raise ValueError('unknown method: %s' % repr(method))

    topics = model[input]  # documents from 'input' will be represented via 'model'
    sims = SparseMatrixSimilarity(
    def __init__(self, filename):
        self.corpus = MmCorpus(filename)
        self.metadata = unpickle(filename + ".metadata.cpickle")
=============================================================================
bi-grammed tokenized article: {}

""".format(docs[1], docs_tokens[1], docs_phrased[1]))

# %% get corpus & dictionary to use for further nlp analysis

# get dictionary and write it to a file
ws_dictionary = Dictionary(docs_tokens)
ws_dictionary.save('.data/ws_dictionary.dict')

# get corpus and write it to a file
ws_corpus = [ws_dictionary.doc2bow(doc) for doc in docs_tokens]
out_f = '.data/ws_corpus.mm'
MmCorpus.serialize(out_f, ws_corpus)
mm = MmCorpus(out_f)  # `mm` document stream now has random access

# send tokenized test to MongoDB
# --+ open monog pipeline
# ----+ params
mongo_host = "10.16.142.91"
mongo_db = "digitalTechs"
mongo_user = "******"
mongo_pass = "******"
# ----+ server
server = SSHTunnelForwarder(mongo_host,
                            ssh_username=mongo_user,
                            ssh_password=mongo_pass,
                            remote_bind_address=('127.0.0.1', 27017))
# ----+ start server
server.start()