def SNAP_generateMmCorpus(self, topic):
    """Save the SNAP corpus for ``topic`` as a ``.mm`` file.

    When ``topic`` is ``'all'`` the method calls itself once for each of
    'bieber', 'cyrus', 'syria' and 'ufo' and then returns. For any other
    value, the corpus from ``self.SNAP_corpusForTopic(topic)`` is written with
    ``MmCorpus.save_corpus`` (together with the ``self.SNAP_id2word()``
    mapping) to ``snap_data/gensim_snap_mmcorpus_<topic>.mm`` in the directory
    that contains this module.

    Always returns ``None``.
    """
    if topic == 'all':
        # Fan out over every built-in topic, one file per topic.
        for singleTopic in ('bieber', 'cyrus', 'syria', 'ufo'):
            self.SNAP_generateMmCorpus(singleTopic)
        return None

    topicCorpus = self.SNAP_corpusForTopic(topic)

    # Output lives in the snap_data folder next to this source file.
    moduleDir = os.path.dirname(os.path.abspath(__file__))
    mmFilename = "gensim_snap_mmcorpus_%s.mm" % topic
    mmPath = os.path.join(moduleDir, 'snap_data', mmFilename)

    wordMap = self.SNAP_id2word()
    MmCorpus.save_corpus(mmPath, topicCorpus, wordMap)
    return None
def saveGensim(self, topic):
    """Build and save the dictionary and ``.mm`` corpus files for a topic.

    Args:
        topic: ``None`` to regenerate the files for 'movie', 'celebrity',
            'syria' and 'ufo' in turn; otherwise a single topic name.
            'movie' is stored under ``movie_reviews`` and 'celebrity' under
            ``bieber``; any other name is used as the directory name as-is.

    Side effects:
        Writes ``gensim_dictionary.txt`` and ``gensim_corpus.mm`` into
        ``james_data/<topic>/`` next to this module, replacing any previous
        copies. The directory is created when it is missing. A scratch file
        ``gensim_corpus.mm.tmp`` is used while building the corpus and is
        removed before returning.

    Returns:
        None.
    """
    if topic is None:
        # No topic given: regenerate the files for every known topic.
        for name in ('movie', 'celebrity', 'syria', 'ufo'):
            self.saveGensim(name)
        return

    # Translate the public topic names into the on-disk directory names.
    if topic == 'movie':
        topic = 'movie_reviews'
    elif topic == 'celebrity':
        topic = 'bieber'

    if topic == 'movie_reviews':
        count = 100  # number of reviews requested per polarity
        posDocs = self.movieReviews('positive', count)
        negDocs = self.movieReviews('negative', count)
    else:
        posDocs = self.getArticlesHelper('positive', topic)
        negDocs = self.getArticlesHelper('negative', topic)

    listOfTokens = []  # token lists that feed the Dictionary
    docs = []  # processed texts, written one per line to the scratch file
    # Positive documents first, then negative, so the order of both lists
    # matches the order the documents were fetched in.
    for group in (posDocs, negDocs):
        for doc in group:
            processed = self.processDocForGensim(doc)
            listOfTokens.append(self.tokensFromText(processed))
            docs.append(processed)

    topicDir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'james_data',
        topic
    )
    dictionaryPath = os.path.join(topicDir, 'gensim_dictionary.txt')
    corpusPath = os.path.join(topicDir, 'gensim_corpus.mm')
    corpusTempPath = corpusPath + '.tmp'

    # The saves below fail if the topic directory is missing, so create it
    # up front (isdir/makedirs keeps this working on old Python versions).
    if not os.path.isdir(topicDir):
        os.makedirs(topicDir)

    # Drop stale outputs so a failed run never leaves old files behind.
    for stalePath in (dictionaryPath, corpusPath, corpusTempPath):
        if os.path.exists(stalePath):
            os.remove(stalePath)

    # save dictionary and corpus
    d = Dictionary(listOfTokens)
    d.save(dictionaryPath)

    try:
        # NOTE(review): documents are joined with '\n'; a processed document
        # that itself contains a newline would span several lines of the
        # scratch file -- confirm processDocForGensim strips newlines.
        with open(corpusTempPath, 'w') as f:
            f.write('\n'.join(docs))
        # NOTE(review): TextCorpus is built without the dictionary saved
        # above, so the term ids in the .mm file presumably come from its own
        # vocabulary -- confirm they line up with gensim_dictionary.txt.
        corpus = TextCorpus(corpusTempPath)
        MmCorpus.save_corpus(corpusPath, corpus)
    finally:
        # The scratch file is only needed while the corpus is being written.
        if os.path.exists(corpusTempPath):
            os.remove(corpusTempPath)