Ejemplo n.º 1
0
 def testStoryExists(self):
     """Check storyExists() for a stored story id and for an id never added.

     A fake story is added to a freshly created test database; storyExists()
     must be true for that story's id (passed as a string) and false for an
     id that was never inserted.
     """
     story = self._getFakeStory()
     db = StoryDatabase()
     db.createDatabase(self.TEST_DB_NAME)
     try:
         db.addStory(story)
         story_id = str(story['stories_id'])
         self.assertTrue(db.storyExists(story_id))
         # '43223535' was never added to this database
         self.assertFalse(db.storyExists('43223535'))
     finally:
         # always drop the test database, even when an assertion fails,
         # so a failure here cannot leak state into other tests
         db.deleteDatabase(self.TEST_DB_NAME)
Ejemplo n.º 2
0
 def testAddStory(self):
     """Check that addStory() stores a story once and rejects a duplicate.

     The first addStory() call must return a truthy value and the second
     call with the same story a falsy one. The stored document is then read
     back and its '_id' and 'story_sentences_count' fields are checked.
     """
     story = self._getFakeStory()
     db = StoryDatabase()
     db.createDatabase(self.TEST_DB_NAME)
     try:
         # first insert of the story is expected to succeed
         self.assertTrue(db.addStory(story))
         # inserting the very same story again is expected to be refused
         self.assertFalse(db.addStory(story))
         story_id = str(story['stories_id'])
         saved_story = db.getStory(story_id)
         self.assertEqual(saved_story['_id'], story_id)
         # presumably the fake story holds two sentences -- see _getFakeStory
         self.assertEqual(saved_story['story_sentences_count'], 2)
     finally:
         # always drop the test database, even when an assertion fails
         db.deleteDatabase(self.TEST_DB_NAME)
Ejemplo n.º 3
0
 def testGetMaxStoryId(self):
     """Check getMaxStoryId() on an empty database and after adding stories.

     With only the example views saved, getMaxStoryId() must return 0.
     After adding two stories with ids "1000" and "2000" it must return
     2000.
     """
     story1 = self._getFakeStory()
     story1['stories_id'] = "1000"
     story2 = self._getFakeStory()
     # give the second story its own id (the id was previously assigned to
     # story1 a second time, leaving story2 with the fake story's default id)
     story2['stories_id'] = "2000"
     db = StoryDatabase()
     db.createDatabase(self.TEST_DB_NAME)
     try:
         # save the example views into the underlying database before
         # calling getMaxStoryId()
         db._db.save(mediacloud.examples.getAllExampleViews())
         self.assertEqual(db.getMaxStoryId(), 0)
         db.addStory(story1)
         db.addStory(story2)
         self.assertEqual(db.getMaxStoryId(), 2000)
     finally:
         # always drop the test database, even when an assertion fails
         db.deleteDatabase(self.TEST_DB_NAME)
To Install:
>>> import nltk
>>> nltk.download()
[ select d for Download ]
[ enter "stopwords" as the identifier ]
[ enter "punkt" as the identifier ]
'''

# Read connection settings and API credentials from the client config file.
config = ConfigParser.ConfigParser()
config.read('mc-client.config')

# Open the local 'mediacloud' story database using the [db] host and port.
db = StoryDatabase(
    'mediacloud',
    config.get('db', 'host'),
    config.get('db', 'port'),
)

# Log in to MediaCloud with the [api] credentials and pull recent stories.
mc = MediaCloud(config.get('api', 'user'), config.get('api', 'pass'))
results = mc.recentStories()
print("Fetched %d stories" % len(results))

# Register the readability callback on the pre-save event so that it runs
# for every story handed to addStory() below.
pub.subscribe(
    mediacloud.examples.addFleshKincaidGradeLevelToStory,
    StoryDatabase.EVENT_PRE_STORY_SAVE,
)

# Store each fetched story, counting only the ones addStory() accepted.
saved = 0
for story in results:
    if db.addStory(story):
        saved += 1

print("Saved %d stories" % saved)
# Connect to MediaCloud with the credentials from the [api] config section.
mc = MediaCloud(config.get('api', 'user'), config.get('api', 'pass'))

# Resume from the highest story id already in the database; without this
# seed the fetch would start at the beginning (2005).
max_story_id = articles_db.getMaxStoryId()
results = mc.storiesSince(max_story_id, STORIES_TO_FETCH, fetch_raw_text=True)
log.info(
    "Fetched %s stories (after %s)" % (str(len(results)), str(max_story_id))
)

# Register the callback that adds twitter username occurrences to a story
# on the pre-save event.
pub.subscribe(
    mediacloud.examples.addTwitterReferencesToStory,
    StoryDatabase.EVENT_PRE_STORY_SAVE,
)

# save all the stories in the db
saved = 0
for story in results:
    print 'new story',
    worked = articles_db.addStory(story)
    if worked:
      saved = saved + 1
    else:
      log.warning("  unable to save story "+str(story['stories_id']))
    
    if story['first_raw_download_file']:
        text = story['first_raw_download_file']
    elif story['story_text']:
        text = story['story_text']
    else:
        text = ''
    try:
        if not mediacloud.examples.isEnglish(text):
            print 'i',
            continue