Python serial Examples

Programming Language: Python

Namespace/Package Name: lda

Method/Function: serial

Examples at hotexamples.com: 3

Python serial - 3 examples found. These are the top-rated real-world Python examples of lda.serial, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: process.py Project: wsun/abstracts

def serial_topics(abstracts, num):
    ''' Serial computation of topic models for all abstracts.

    Builds a gensim dictionary and bag-of-words corpus from the 'cleantext'
    field of every abstract, writes them to 'abstracts.dict' and
    'abstracts.mm' (paths relative to the working directory), weights the
    corpus with TF-IDF, and obtains one LSA and one LDA model through
    Lsa.serial / Lda.serial.

    Args:
        abstracts: sequence of objects exposing Get(key) and Set(key, value).
            'cleantext' is presumably a list of tokens (it is handed to
            doc2bow) -- verify against the caller.
        num: number of topics handed to both Lsa.serial and Lda.serial.

    Returns:
        None. Each abstract is updated in place with the keys 'lsa' and
        'lda' (defaultdict(float) keyed by v[0] of each model output entry,
        holding v[1]) and 'numtopics' (set to num).
    '''
    # prepare dictionary and corpora for topic modeling
    docs = [abstract.Get('cleantext') for abstract in abstracts]
    dictionary = corpora.Dictionary(docs)
    dictionary.save('abstracts.dict')
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    corpora.MmCorpus.serialize('abstracts.mm', corpus)

    # use gensim tfidf to transform
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # obtain lsa and lda models from the project helpers
    lsaModel = Lsa.serial(corpus_tfidf, dictionary, num)
    ldaModel = Lda.serial(corpus_tfidf, dictionary, num)

    # store lda and lsa representation in all abstracts; corpus holds one
    # entry per abstract, in the same order, so the two can be walked together
    for abstract, bow in zip(abstracts, corpus):
        # weight the document once and feed the same vector to both models
        weighted = tfidf[bow]
        lsaVec = lsaModel[weighted]
        ldaVec = ldaModel[weighted]

        lsaVector = defaultdict(float)
        for entry in lsaVec:
            lsaVector[entry[0]] = entry[1]
        ldaVector = defaultdict(float)
        for entry in ldaVec:
            ldaVector[entry[0]] = entry[1]

        abstract.Set('lsa', lsaVector)
        abstract.Set('lda', ldaVector)
        abstract.Set('numtopics', num)

Example #2

Show file

File: perplexity.py Project: wsun/abstracts

def perplexity(abstracts, nums, trials=3):
    ''' Print the average held-out LDA bound for each candidate topic count.

    For every value in nums the documents are reshuffled `trials` times.
    Each time the first tenth of the shuffled documents is held out, a model
    is obtained from Lda.serial on the remaining documents, and the value of
    ldaModel.bound() on the held-out part is accumulated. The mean over the
    trials is printed as "<num>: <average>".

    Args:
        abstracts: iterable of objects exposing Get('cleantext').
        nums: iterable of topic counts to evaluate. Each value must be
            non-zero because alpha and eta are derived as 50.0/num and
            2.0/num.
        trials: number of random train/test splits per topic count
            (default 3, the value that used to be hard-coded).

    Returns:
        None; results are only printed.

    Raises:
        ValueError: if trials is smaller than 1.

    Note:
        The global `random` generator is reseeded from system entropy, so
        splits differ between runs. With fewer than 10 documents the
        held-out slice is empty.
    '''
    if trials < 1:
        raise ValueError("trials must be at least 1")

    # prepare dictionary for topic modeling (shared by all splits)
    docs = [abstract.Get('cleantext') for abstract in abstracts]
    dictionary = corpora.Dictionary(docs)

    # main loop
    random.seed()
    for num in nums:
        total = 0

        for _ in range(trials):
            # prepare holdout set: shuffle the local list in place, then
            # split off the first tenth as test data
            random.shuffle(docs)
            tenth = len(docs) // 10
            train = docs[tenth:]
            test = docs[:tenth]

            traincorpus = [dictionary.doc2bow(doc) for doc in train]
            testcorpus = [dictionary.doc2bow(doc) for doc in test]

            # NOTE(review): the held-out set gets its own TF-IDF model rather
            # than reusing the one fitted on the training data; kept as-is,
            # confirm this is intended.
            traintfidf = models.TfidfModel(traincorpus)
            testtfidf = models.TfidfModel(testcorpus)

            traincorpus2 = traintfidf[traincorpus]
            testcorpus2 = testtfidf[testcorpus]

            ldaModel = Lda.serial(traincorpus2, dictionary, num,
                                  chunksize=1000, alpha=50.0/num, eta=2.0/num)
            total += ldaModel.bound(testcorpus2)

        avg = total / float(trials)
        # parenthesised single argument prints identically on Python 2 and 3
        print("%d: %f" % (num, avg))

    return

Example #3

Show file

File: cluster.py Project: wsun/abstracts

def process(filename):
    ''' Serial processing of abstracts, for evaluation purposes.

    Steps, in order:
      1. read stop words from 'stopwords.txt' (one per line);
      2. load abstracts through Process.load and fill the word dictionary
         through Process.create_dict;
      3. drop from each abstract's 'cleantext' every word that is not in
         that dictionary;
      4. store bag-of-words ('bow', 'bownum') and bigram ('bigram')
         frequencies on each abstract and run Process.serial_tfidf on both;
      5. build a gensim dictionary/corpus (saved as 'abstracts.dict' and
         'abstracts.mm'), weight it with TF-IDF, obtain LSA and LDA models
         with 10 topics, and store 'lsa', 'lda' and 'numtopics' on each
         abstract.

    Args:
        filename: passed straight to Process.load.

    Returns:
        The list of abstracts populated by Process.load, updated in place.
    '''

    abstracts = []
    dictionary = []

    # load stop words
    stops = set()
    stop_file = 'stopwords.txt'
    # plain 'r' instead of 'rU': the 'U' flag was removed in Python 3.11,
    # and stripping '\r\n' covers files with Windows line endings
    with open(stop_file, 'r') as stopFile:
        for row in stopFile:
            stops.add(row.rstrip('\r\n'))

    # abstracts is filled in place by Process.load
    dictlist = Process.load(filename, abstracts, stops)
    # create dictionary (filled in place; the return value is not used)
    Process.create_dict(dictlist, dictionary)

    # clean text of words not in dictionary; a set built once gives
    # constant-time membership tests instead of scanning the list per word
    vocabulary = frozenset(dictionary)
    for abstract in abstracts:
        abstext = [word for word in abstract.Get('cleantext') if word in vocabulary]
        abstract.Set('cleantext', abstext)

    dictlength = len(dictionary)
    bigramdict = []
    # number of abstracts in which each term / bigram occurs
    termbow = defaultdict(float)
    termbigram = defaultdict(float)
    for abstract in abstracts:
        # create dict of word frequency (bag of words)
        bow = Process.create_bagofwords(abstract, dictionary)
        abstract.Set('bow', bow)
        abstract.Set('bownum', dictlength)
        for ind in bow.keys():
            termbow[ind] += 1.0
        # create dict of bigram frequency
        bigram, bigramdict = Process.create_bigram(abstract, dictionary, bigramdict)
        abstract.Set('bigram', bigram)
        for pair in bigram.keys():
            termbigram[pair] += 1.0
    # create dict of tfidf
    # NOTE(review): len(bigramdict) is passed with the 'bow' call and nothing
    # extra with the 'bigram' call; kept as-is, confirm against
    # Process.serial_tfidf that this is intended.
    Process.serial_tfidf(abstracts, 'bow', termbow, len(bigramdict))
    Process.serial_tfidf(abstracts, 'bigram', termbigram)

    ##### TOPICS
    # prepare dictionary and corpora for topic modeling; a separate name is
    # used so the word list above is not silently replaced
    docs = [abstract.Get('cleantext') for abstract in abstracts]
    topic_dictionary = corpora.Dictionary(docs)
    topic_dictionary.save('abstracts.dict')
    corpus = [topic_dictionary.doc2bow(doc) for doc in docs]
    corpora.MmCorpus.serialize('abstracts.mm', corpus)

    # use gensim tfidf to transform
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # obtain lsa and lda models from the project helpers
    numtopics = 10  # this can be adjusted
    lsaModel = Lsa.serial(corpus_tfidf, topic_dictionary, numtopics)
    ldaModel = Lda.serial(corpus_tfidf, topic_dictionary, numtopics,
                          alpha=50.0/numtopics, eta=2.0/numtopics)

    # store lda and lsa representation in all abstracts; corpus holds one
    # entry per abstract, in the same order
    for abstract, doc_bow in zip(abstracts, corpus):
        # weight the document once and feed the same vector to both models
        weighted = tfidf[doc_bow]
        lsaVec = lsaModel[weighted]
        ldaVec = ldaModel[weighted]

        lsaVector = defaultdict(float)
        for entry in lsaVec:
            lsaVector[entry[0]] = entry[1]
        ldaVector = defaultdict(float)
        for entry in ldaVec:
            ldaVector[entry[0]] = entry[1]

        abstract.Set('lsa', lsaVector)
        abstract.Set('lda', ldaVector)
        abstract.Set('numtopics', numtopics)

    return abstracts