コード例 #1
0
ファイル: perplexity.py プロジェクト: wsun/abstracts
def modified_process(filename):
    ''' Serial processing of abstracts, no topic modeling.

    Reads stop words from 'stopwords.txt' (one per line), hands them to
    Process.load together with filename, builds the dictionary with
    Process.create_dict, and then stores per-abstract bag-of-words and
    bigram data before calling Process.serial_tfidf for both.

    Args:
        filename: forwarded unchanged to Process.load.

    Returns:
        The abstracts list that was passed to Process.load. On each
        abstract this function sets 'cleantext' (filtered to dictionary
        words), 'bow', 'bownum' and 'bigram'.
    '''
    abstracts = []
    dictionary = []

    # load stop words, one per line, with the newline removed
    stops = set()
    stop_file = 'stopwords.txt'
    with open(stop_file, 'rU') as stopFile:
        for row in stopFile:
            stops.add(row.replace('\n', ''))

    # abstracts is presumably filled in place by Process.load -- it is
    # iterated below without being reassigned
    dictlist = Process.load(filename, abstracts, stops)
    # create dictionary (dictionary is likewise expected to be filled in place)
    Process.create_dict(dictlist, dictionary)

    # clean text of words not in dictionary
    # A set is built once so each membership test is a hash lookup instead
    # of a scan over the whole dictionary list.
    known_words = set(dictionary)
    for abstract in abstracts:
        abstext = [word for word in abstract.Get('cleantext')
                   if word in known_words]
        abstract.Set('cleantext', abstext)

    dictlength = len(dictionary)
    bigramdict = []
    # termbow / termbigram count, per key, how many abstracts contain it
    termbow = defaultdict(float)
    termbigram = defaultdict(float)
    for abstract in abstracts:
        # create dict of word frequency (bag of words)
        bow = Process.create_bagofwords(abstract, dictionary)
        abstract.Set('bow', bow)
        abstract.Set('bownum', dictlength)
        for ind in bow.keys():
            termbow[ind] += 1.0
        # create dict of bigram frequency
        bigram, bigramdict = Process.create_bigram(abstract, dictionary, bigramdict)
        abstract.Set('bigram', bigram)
        for pair in bigram.keys():
            termbigram[pair] += 1.0

    # create dict of tfidf
    # NOTE(review): the 'bow' pass receives len(bigramdict) as a fourth
    # argument while the 'bigram' pass relies on serial_tfidf's default --
    # confirm against Process.serial_tfidf that this asymmetry is intended.
    Process.serial_tfidf(abstracts, 'bow', termbow, len(bigramdict))
    Process.serial_tfidf(abstracts, 'bigram', termbigram)

    return abstracts
コード例 #2
0
ファイル: processtest.py プロジェクト: wsun/abstracts
        with open(stop_file, 'rU') as stopFile:
            for row in stopFile.readlines():
                stops.add(row.replace('\n', ''))
        for r in range(1,size):
            comm.send(stops, dest=r)

        print "Timing load time ..."
        ploadstart = MPI.Wtime()
        abstracts, dictlist = Process.master_load(comm, filename)
        ploadend = MPI.Wtime()

        print "Timing dictionary creation time ..."
        pdictstart = MPI.Wtime()
        # Create dictionary
        #print "Creating dictionary ..."
        Process.create_dict(dictlist, dictionary)
        # send dictionary everywhere
        for r in range(1,size):
            comm.send(dictionary, dest=r)
        pdictend = MPI.Wtime()

        print "Timing text cleaning time ..."
        pcleanstart = MPI.Wtime()
        Process.master_cleantext(comm, abstracts)
        pcleanend = MPI.Wtime()

        print "Timing abstract send time ..."
        pabsstart = MPI.Wtime()
        # send abstracts to all slaves
        for r in range(1,size):
            comm.send(abstracts, dest=r)
コード例 #3
0
ファイル: cluster.py プロジェクト: wsun/abstracts
def process(filename):
    ''' Serial processing of abstracts, for evaluation purposes.

    Reads stop words from 'stopwords.txt' (one per line), loads the
    abstracts through Process.load, builds the dictionary, stores
    bag-of-words / bigram data and calls Process.serial_tfidf for both,
    then fits LSA and LDA models on a gensim tf-idf corpus and stores each
    abstract's topic representation.

    Side effects visible here: writes 'abstracts.dict' (gensim dictionary)
    and 'abstracts.mm' (serialized corpus) to the working directory.

    Args:
        filename: forwarded unchanged to Process.load.

    Returns:
        The abstracts list that was passed to Process.load. On each
        abstract this function sets 'cleantext' (filtered to dictionary
        words), 'bow', 'bownum', 'bigram', 'lsa', 'lda' and 'numtopics'.
    '''

    abstracts = []
    dictionary = []

    # load stop words, one per line, with the newline removed
    stops = set()
    stop_file = 'stopwords.txt'
    with open(stop_file, 'rU') as stopFile:
        for row in stopFile:
            stops.add(row.replace('\n', ''))

    # abstracts is presumably filled in place by Process.load -- it is
    # iterated below without being reassigned
    dictlist = Process.load(filename, abstracts, stops)
    # create dictionary (dictionary is likewise expected to be filled in place)
    Process.create_dict(dictlist, dictionary)

    # clean text of words not in dictionary
    # A set is built once so each membership test is a hash lookup instead
    # of a scan over the whole dictionary list.
    known_words = set(dictionary)
    for abstract in abstracts:
        abstext = [word for word in abstract.Get('cleantext')
                   if word in known_words]
        abstract.Set('cleantext', abstext)

    dictlength = len(dictionary)
    bigramdict = []
    # termbow / termbigram count, per key, how many abstracts contain it
    termbow = defaultdict(float)
    termbigram = defaultdict(float)
    for abstract in abstracts:
        # create dict of word frequency (bag of words)
        bow = Process.create_bagofwords(abstract, dictionary)
        abstract.Set('bow', bow)
        abstract.Set('bownum', dictlength)
        for ind in bow.keys():
            termbow[ind] += 1.0
        # create dict of bigram frequency
        bigram, bigramdict = Process.create_bigram(abstract, dictionary, bigramdict)
        abstract.Set('bigram', bigram)
        for pair in bigram.keys():
            termbigram[pair] += 1.0

    # create dict of tfidf
    # NOTE(review): the 'bow' pass receives len(bigramdict) as a fourth
    # argument while the 'bigram' pass relies on serial_tfidf's default --
    # confirm against Process.serial_tfidf that this asymmetry is intended.
    Process.serial_tfidf(abstracts, 'bow', termbow, len(bigramdict))
    Process.serial_tfidf(abstracts, 'bigram', termbigram)

    ##### TOPICS
    # prepare dictionary and corpora for topic modeling
    # The gensim dictionary gets its own name; the word list above is not
    # used past this point.
    docs = [abstract.Get('cleantext') for abstract in abstracts]
    topic_dictionary = corpora.Dictionary(docs)
    topic_dictionary.save('abstracts.dict')
    corpus = [topic_dictionary.doc2bow(doc) for doc in docs]
    corpora.MmCorpus.serialize('abstracts.mm', corpus)

    # use gensim tfidf to transform
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # load lsa and lda models
    numtopics = 10  # this can be adjusted
    lsaModel = Lsa.serial(corpus_tfidf, topic_dictionary, numtopics)
    ldaModel = Lda.serial(corpus_tfidf, topic_dictionary, numtopics,
                          alpha=50.0/numtopics, eta=2.0/numtopics)

    # store lda and lsa representation in all abstracts
    # corpus was built from abstracts in order, so the two line up one to one
    for abstract, doc_bow in zip(abstracts, corpus):
        # transform once and feed the same vector to both models
        doc_tfidf = tfidf[doc_bow]
        lsaVec = lsaModel[doc_tfidf]
        ldaVec = ldaModel[doc_tfidf]
        lsaVector = defaultdict(float)
        ldaVector = defaultdict(float)
        # each entry is indexed as (key, value); presumably (topic id, weight)
        for v in lsaVec:
            lsaVector[v[0]] = v[1]
        for v in ldaVec:
            ldaVector[v[0]] = v[1]
        abstract.Set('lsa', lsaVector)
        abstract.Set('lda', ldaVector)
        abstract.Set('numtopics', numtopics)

    return abstracts