Esempio n. 1
0
 def SNAP_generateMmCorpus(self, topic):
   """Save the SNAP corpus of a topic as a ``.mm`` file.

   The corpus returned by ``SNAP_corpusForTopic`` is handed to
   ``MmCorpus.save_corpus`` together with the mapping returned by
   ``SNAP_id2word``.  The output file is
   ``snap_data/gensim_snap_mmcorpus_<topic>.mm`` in the directory that
   contains this module.

   Args:
     topic: name of the topic to export, or ``'all'`` to export
       'bieber', 'cyrus', 'syria' and 'ufo' one after another.

   Returns:
     None.
   """
   if topic != 'all':
     # Single topic: fetch the corpus first, then the id->word mapping,
     # and write both out in one call.
     documents = self.SNAP_corpusForTopic(topic)
     moduleDir = os.path.dirname(os.path.abspath(__file__))
     fileName = "gensim_snap_mmcorpus_%s.mm" % topic
     destination = os.path.join(moduleDir, 'snap_data', fileName)
     vocabulary = self.SNAP_id2word()
     MmCorpus.save_corpus(destination, documents, vocabulary)
     return

   # 'all' fans out into one recursive call per known topic.
   for name in ('bieber', 'cyrus', 'syria', 'ufo'):
     self.SNAP_generateMmCorpus(name)
Esempio n. 2
0
  def saveGensim(self, topic):
    """Build and save the gensim dictionary and corpus files for a topic.

    The positive and negative documents of ``topic`` are run through
    ``processDocForGensim`` and ``tokensFromText``; the results are written
    to ``james_data/<topic>/`` in the directory that contains this module:

    * ``gensim_dictionary.txt`` -- a ``Dictionary`` built from the token
      lists, written with ``Dictionary.save``.
    * ``gensim_corpus.mm.tmp`` -- the processed documents joined with
      newlines.
    * ``gensim_corpus.mm`` -- a ``TextCorpus`` over the temp file, written
      with ``MmCorpus.save_corpus``.

    Any of these files left over from a previous run is removed first, and
    the topic directory is created when it does not exist yet.

    Args:
      topic: ``None`` to regenerate 'movie', 'celebrity', 'syria' and 'ufo'
        in turn; otherwise a topic name.  'movie' is an alias for
        'movie_reviews' and 'celebrity' is an alias for 'bieber'.

    Returns:
      None.
    """
    if topic is None:
      # generate all
      for name in ('movie', 'celebrity', 'syria', 'ufo'):
        self.saveGensim(name)
      return

    # Translate the short aliases into the names used for data and paths.
    if topic == 'movie':
      topic = 'movie_reviews'
    elif topic == 'celebrity':
      topic = 'bieber'

    if topic == 'movie_reviews':
      count = 100
      posDocs = self.movieReviews('positive', count)
      negDocs = self.movieReviews('negative', count)
    else:
      posDocs = self.getArticlesHelper('positive', topic)
      negDocs = self.getArticlesHelper('negative', topic)

    listOfTokens = []  # one token list per document (dictionary input)
    docs = []  # processed document texts (corpus input)

    # Positive documents are handled before negative ones, so both lists
    # keep that order.
    for group in (posDocs, negDocs):
      for doc in group:
        processed = self.processDocForGensim(doc)
        listOfTokens.append(self.tokensFromText(processed))
        docs.append(processed)

    topicDir = os.path.join(
      os.path.dirname(os.path.abspath(__file__)),
      'james_data',
      topic
    )
    dictionaryPath = os.path.join(topicDir, 'gensim_dictionary.txt')
    corpusPath = os.path.join(topicDir, 'gensim_corpus.mm')
    corpusTempPath = corpusPath + '.tmp'

    # Without the directory every open() below fails before anything is
    # written, so create it on first use.
    if not os.path.isdir(topicDir):
      os.makedirs(topicDir)

    # Drop leftovers from a previous run.
    for stalePath in (dictionaryPath, corpusPath, corpusTempPath):
      if os.path.exists(stalePath):
        os.remove(stalePath)

    # make destination files if they don't exist
    for placeholderPath in (dictionaryPath, corpusPath):
      with open(placeholderPath, 'w') as f:
        f.write(' ')

    # save dictionary and corpus
    d = Dictionary(listOfTokens)
    d.save(dictionaryPath)

    with open(corpusTempPath, 'w') as f:
      f.write('\n'.join(docs))

    # NOTE(review): the corpus is rebuilt from the temp file and ``d`` is not
    # passed to save_corpus, so the term ids in the .mm file presumably come
    # from TextCorpus's own dictionary rather than from ``d`` -- confirm that
    # consumers pair the right dictionary with this corpus.
    corpus = TextCorpus(corpusTempPath)
    MmCorpus.save_corpus(corpusPath, corpus)

    return