Example #1
def corrBrands(lda, brandListFileName=r".\wordlists\brands.txt"):
    # keep only single-word brand names
    brands = [b for b in bf.getMakes(brandListFileName) if ' ' not in b]
    br_tokens = [gsLib.wordCleanUp(gsLib.textCleanUp(word)) for word in brands]

    # find brands whose cleaned token is not in the LDA dictionary
    bad_brands = []
    for (i, brt) in enumerate(br_tokens):
        try:
            ID = lda.id2word.token2id[brt]
            print brands[i], lda.id2word.dfs[ID]
        except KeyError:
            print 'no ' + brt + ' in dict'
            bad_brands.append(brands[i])

    # keep only the brands the dictionary knows about and look up their IDs
    brands = sorted(set(brands) - set(bad_brands))
    br_tokens = [gsLib.wordCleanUp(gsLib.textCleanUp(word)) for word in brands]
    br_ids = [lda.id2word.token2id[brt] for brt in br_tokens]

    # normalize each topic's word weights into a probability distribution
    topics = lda.state.get_lambda()
    topics = [topic / topic.sum() for topic in topics]

    # pairwise brand similarity: summed product of the brands' weights across topics
    n = len(brands)
    sims = numpy.zeros((n, n))
    for i in xrange(n):
        for j in xrange(n):
            sims[i, j] = sum(t[br_ids[i]] * t[br_ids[j]] for t in topics)

    return (sims, brands)
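
The function returns a raw similarity matrix plus the list of brands that survived the dictionary check; below is a minimal sketch of how the result could be inspected, assuming pandas is available and an already-trained LdaModel named lda is loaded (the DataFrame wrapping is illustrative, not part of the original code).

import pandas

# hypothetical usage: label the similarity matrix with the brand names
sims, brands = corrBrands(lda)
simsDF = pandas.DataFrame(sims, index=brands, columns=brands)

# for one brand, list the five most similar brands (excluding itself)
brand = brands[0]
print(simsDF[brand].drop(brand).sort_values(ascending=False).head(5))
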
Example #2
def pruneWordsList(words, lda):
    '''
    Goes through a list of words, stems them, and checks whether each word is in the LDA dictionary.
    If it is, the word is kept and returned along with its count in the corpus and its ID in the dictionary.
    Possible problem: some models were calibrated without stemming, so a valid word
    can be rejected because its stem is not in the dictionary.
    '''
    words_tokens = [
        gslib.wordCleanUp(gslib.textCleanUp(word)) for word in words
    ]
    good_IDs = []
    good_words = []
    counts = []
    for (i, t) in enumerate(words_tokens):
        try:
            ID = lda.id2word.token2id[t]
            #print words[i] ,lda.id2word.dfs[ID]
            counts.append(lda.id2word.dfs[ID])
            good_IDs.append(ID)
            good_words.append(words[i])
        except KeyError:
            print 'no ' + t + ' in dict'

    df = pandas.DataFrame({'IDs': good_IDs, 'Counts': counts}, index=good_words)
    return df
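
A minimal usage sketch, assuming an already-loaded gensim LdaModel named lda; the input word list here is purely illustrative.

# hypothetical usage: keep only the words the model's dictionary knows about
words = ['honda', 'toyota', 'reliability', 'qwertyzzz']
kept = pruneWordsList(words, lda)

# kept is indexed by the surviving words; 'Counts' holds the document
# frequency of the stemmed token, 'IDs' its position in lda.id2word
print(kept.sort_values('Counts', ascending=False))
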
Example #3
def pruneWordsList(words, lda):
    '''
    Goes through a list of words, stems them, and checks whether each word is in the LDA dictionary.
    If it is, the word is kept and returned along with its count in the corpus and its ID in the dictionary.
    Possible problem: some models were calibrated without stemming, so a valid word
    can be rejected because its stem is not in the dictionary.
    '''
    words_tokens = [gslib.wordCleanUp(gslib.textCleanUp(word)) for word in words]
    good_IDs = []
    good_words = []
    counts = []
    for (i, t) in enumerate(words_tokens):
        try:
            ID = lda.id2word.token2id[t]
            #print words[i] ,lda.id2word.dfs[ID]
            counts.append(lda.id2word.dfs[ID])
            good_IDs.append(ID)
            good_words.append(words[i])
        except KeyError:
            print 'no ' + t + ' in dict'

    df = pandas.DataFrame({'IDs': good_IDs, 'Counts': counts}, index=good_words)
    return df
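
The docstring's caveat (a model calibrated without stemming can reject a valid raw word) could be worked around by checking both the raw word and its cleaned stem against the dictionary; the following is a minimal sketch under that assumption, with tokenInDict being a hypothetical helper name, not part of the original code.

def tokenInDict(word, lda):
    # hypothetical helper: return the dictionary ID of the word if either
    # its raw form or its cleaned/stemmed form is known to the model
    vocab = lda.id2word.token2id
    if word in vocab:
        return vocab[word]
    stem = gslib.wordCleanUp(gslib.textCleanUp(word))
    if stem in vocab:
        return vocab[stem]
    return None
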
import collections

import someBrandFiltering as bf
# gslib (the project's text clean-up helper module) is assumed to be imported as well

allthreads = bf.init()

somethreads = allthreads[0:100]

# count how many posts each author wrote
authPostCount = collections.defaultdict(int)
for t in allthreads:
    for post in t.getPosts():
        author = post.msgAuthor
        authPostCount[author] += 1
len(authPostCount)  #375,569

# collect the cleaned/stemmed form of each author name
authorStems = set()
for author in authPostCount.keys():
    authorStem = gslib.wordCleanUp(gslib.textCleanUp(author))
    authorStems.add(authorStem)
len(authorStems)  #273,418

# list the author names that contain a space
for author in sorted(authPostCount.keys()):
    if author.find(' ') != -1:
        print author

with open('authList.txt', 'w') as authListF:
    for author in sorted(authPostCount.keys()):
        authListF.write(author + '\n')

with open('authStemsList.txt', 'w') as authListF:
    for author in sorted(authorStems):
        authListF.write(author + '\n')
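
The drop from roughly 375k author names to roughly 273k stems means many distinct names collapse onto one stem; below is a small sketch that groups names by stem and prints a few colliding groups, assuming the same gslib helpers (the grouping itself is not part of the original code).

import collections

# group the raw author names by their cleaned/stemmed form
stemToAuthors = collections.defaultdict(list)
for author in authPostCount.keys():
    stemToAuthors[gslib.wordCleanUp(gslib.textCleanUp(author))].append(author)

# show the ten stems that the most distinct author names collapse onto
collisions = [(s, names) for (s, names) in stemToAuthors.items() if len(names) > 1]
for stem, names in sorted(collisions, key=lambda x: -len(x[1]))[:10]:
    print('%s: %d names, e.g. %s' % (stem, len(names), names[:3]))
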
Example #5
def main(
        outdir=r'Z:\ermunds\results\2005 20t unbranded',
        num_passes=2,
        n_repeat=10,
        num_topics=20,
        threadChoseStr='',
        modelTag='2005+',
        time_low_cutoff=time.strptime("1 Jan 2005", "%d %b %Y"),
        time_hi_cutoff=time.strptime("1 Jan 2006", "%d %b %Y"),
):
    '''
    time_low_cutoff, time_hi_cutoff - only posts between these two dates are used
    threadChoseStr - filter thread names by this phrase
    '''
    dTr = bf.notMain(threadChoseStr)

    modelName = modelTag + str(num_topics) + 'topics'
    dirs = gslib.LDAdirs(modelName, outdir)
    with open(dirs.dataFileName, 'a') as file1:
        pickle.dump(dTr, file1)

    ## setup logging to file and console
    logger = logging.getLogger('')
    logger.setLevel(logging.DEBUG)
    fh = logging.FileHandler(dirs.logFileName)
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)-12s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    fh.setFormatter(formatter)
    logger.addHandler(ch)
    logger.addHandler(fh)

    ## get threads, extract post texts and save to single file
    # 7min per 1GB
    logging.log(logging.INFO, "building doc list")
    lineCounter = 0
    with open(dirs.allDocsFileName, 'a') as docDumpFile:
        for Trlist in dTr.values():
            for Tr in Trlist:
                for p in Tr.getPosts():
                    if (p.msgTime > time_low_cutoff) and (p.msgTime <
                                                          time_hi_cutoff):
                        doc = gslib.textCleanUp(
                            p.msgTitle) + gslib.textCleanUp(p.msgText)
                        lineCounter += 1
                        print(doc, file=docDumpFile)
    logging.log(logging.INFO, "total {} docs ".format(lineCounter))

    #build dict 1.5H/GB
    dict1 = gslib.build_dict(dirs)
    dict1.save(dirs.dictFileName)
    #dict1 = gensim.corpora.dictionary.Dictionary().load(dirs.dictFileName)

    #pipe docfile to gensim corpus
    #fixme - corpusAdapter missing a len() property
    corpus = gslib.corpusAdapter(dirs.allDocsFileName, id2word=dict1)
    gensim.corpora.MmCorpus.serialize(fname=dirs.corpusFname,
                                      corpus=corpus,
                                      id2word=dict1)
    mm = gensim.corpora.MmCorpus(dirs.corpusFname)

    ## run the LDA (~2h per update on 2M posts)
    # fit an initial model, then update it n_repeat-1 more times,
    # saving the result to disk after every update

    lda = gensim.models.ldamodel.LdaModel(corpus=mm,
                                          id2word=dict1,
                                          num_topics=num_topics,
                                          update_every=0,
                                          passes=num_passes)
    lda.save(dirs.modelFname + "_0")

    for i in xrange(n_repeat - 1):
        lda.update(mm)
        # save intermediate result
        lda.save(dirs.modelFname + "_" + str(i + 1))
        for t in lda.show_topics(-1):
            logging.info('all topics here ' + t)
    lda.save(dirs.modelFname)

    logger.removeHandler(ch)
    logger.removeHandler(fh)
    return modelName
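
A minimal sketch of how main could be called for a different year, assuming the same module-level imports (bf, gslib, gensim, pickle, logging, time) are in place; the output folder and model tag are illustrative values, not paths from the original project.

import time

# hypothetical call: train a 30-topic model on posts from 2006
modelName = main(
    outdir=r'Z:\ermunds\results\2006 30t unbranded',  # illustrative output folder
    num_passes=2,
    n_repeat=5,
    num_topics=30,
    threadChoseStr='',
    modelTag='2006+',
    time_low_cutoff=time.strptime("1 Jan 2006", "%d %b %Y"),
    time_hi_cutoff=time.strptime("1 Jan 2007", "%d %b %Y"),
)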

allthreads = bf.init()

somethreads = allthreads[0:100]

# count how many posts each author wrote
authPostCount = collections.defaultdict(int)
for t in allthreads:
    for post in t.getPosts():
        author = post.msgAuthor
        authPostCount[author] += 1
len(authPostCount)  #375,569

# collect the cleaned/stemmed form of each author name
authorStems = set()
for author in authPostCount.keys():
    authorStem = gslib.wordCleanUp(gslib.textCleanUp(author))
    authorStems.add(authorStem)
len(authorStems)  #273,418

# list the author names that contain a space
for author in sorted(authPostCount.keys()):
    if author.find(' ') != -1:
        print author


with open('authList.txt', 'w') as authListF:
    for author in sorted(authPostCount.keys()):
        authListF.write(author + '\n')

with open('authStemsList.txt', 'w') as authListF:
    for author in sorted(authorStems):
        authListF.write(author + '\n')
Example #7
def main(outdir=r'Z:\ermunds\results\2005 20t unbranded',
         num_passes=2,
         n_repeat=10,
         num_topics=20,
         threadChoseStr='',
         modelTag='2005+',
         time_low_cutoff=time.strptime("1 Jan 2005", "%d %b %Y"),
         time_hi_cutoff=time.strptime("1 Jan 2006", "%d %b %Y"),
         ):
    '''
    time_low_cutoff, time_hi_cutoff - only posts between these two dates are used
    threadChoseStr - filter thread names by this phrase
    '''
    dTr = bf.notMain(threadChoseStr)
    
    modelName = modelTag + str(num_topics) + 'topics'
    dirs = gslib.LDAdirs(modelName, outdir)
    with open(dirs.dataFileName, 'a') as file1:
        pickle.dump(dTr, file1)

    ## setup logging to file and console
    logger = logging.getLogger('')
    logger.setLevel(logging.DEBUG)
    fh = logging.FileHandler(dirs.logFileName)
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)-12s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    fh.setFormatter(formatter)
    logger.addHandler(ch)
    logger.addHandler(fh)


    ## get threads, extract post texts and save to single file
    # 7min per 1GB
    logging.log(logging.INFO,"building doc list")
    lineCounter=0
    with open(dirs.allDocsFileName,'a') as docDumpFile:
        for Trlist in dTr.values():
            for Tr in Trlist:
                for p in Tr.getPosts():
                    if (p.msgTime>time_low_cutoff) and (p.msgTime<time_hi_cutoff):
                        doc = gslib.textCleanUp(p.msgTitle)+gslib.textCleanUp(p.msgText)
                        lineCounter+=1
                        print(doc,file = docDumpFile)
    logging.log(logging.INFO,"total {} docs ".format(lineCounter))            

    #build dict 1.5H/GB
    dict1 = gslib.build_dict(dirs)
    dict1.save(dirs.dictFileName)
    #dict1 = gensim.corpora.dictionary.Dictionary().load(dirs.dictFileName)

    #pipe docfile to gensim corpus
    #fixme - corpusAdapter missing a len() property
    corpus = gslib.corpusAdapter(dirs.allDocsFileName,id2word=dict1)
    gensim.corpora.MmCorpus.serialize(fname=dirs.corpusFname, corpus=corpus, id2word=dict1)
    mm=gensim.corpora.MmCorpus(dirs.corpusFname)

    ## run the LDA (~2h per update on 2M posts)
    # fit an initial model, then update it n_repeat-1 more times,
    # saving the result to disk after every update

    lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=dict1, num_topics=num_topics, update_every=0, passes=num_passes)
    lda.save(dirs.modelFname+"_0")
    
    for i in xrange(n_repeat - 1):
        lda.update(mm)
        # save intermediate result
        lda.save(dirs.modelFname + "_" + str(i + 1))
        for t in lda.show_topics(-1):
            logging.info('all topics here ' + t)
    lda.save(dirs.modelFname)
    
    logger.removeHandler(ch)
    logger.removeHandler(fh)
    return modelName
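
Since each update pass is saved to disk under modelFname plus a suffix, the following sketch reloads one checkpoint for inspection; it assumes the same gslib.LDAdirs naming scheme used inside main and a gensim version in which show_topics returns formatted strings, as in the training loop above.

import gensim

# hypothetical: reload the model saved after the third update pass
dirs = gslib.LDAdirs(modelName, r'Z:\ermunds\results\2005 20t unbranded')
lda = gensim.models.ldamodel.LdaModel.load(dirs.modelFname + "_3")

# print the top words of every topic in that checkpoint
for topicStr in lda.show_topics(-1):
    print(topicStr)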