Esempio n. 1
0
 def test_do_work(self):
     """Check that the saver stores fetched content under the URL's file name.

     A stub fetcher returning ``"fooo"`` is handed to ``commons_speech_saver``;
     the test then expects ``bar.html`` to exist in the working directory and
     to contain exactly that text.
     """
     def stub_fetch(url, _):
         # Stand-in for a real download: always yields the same payload.
         return "fooo"

     commons_speech_saver(
         "http://foo.com/foo/bar.html",
         stub_fetch,
         commons_speech_working_dir(self.rootdir),
     )

     saved_path = os.path.join(commons_speech_working_dir(self.rootdir), "bar.html")
     self.assertTrue(os.path.exists(saved_path))
     with open(saved_path) as saved:
         contents = saved.read()
     self.assertEqual(contents, "fooo")
Esempio n. 2
0
def load_commons_speeches(root_dir, writer, num_workers):
    """Run the speech saver over every fed work item and log progress.

    Items produced by ``commons_speech_feeder`` are mapped over a
    ``multiprocessing`` pool running ``commons_speech_saver``.  Each result is
    unpacked as ``(filename, speeches)``; the filename is appended to the
    ``tracker`` file in the working directory once its result has arrived.

    Args:
        root_dir: Passed to ``commons_speech_working_dir`` to obtain the
            working directory.
        writer: Published as the global ``_writer`` in each worker process by
            the pool initializer.
        num_workers: Number of worker processes in the pool.
    """
    log.debug("starting loader")
    working_dir = commons_speech_working_dir(root_dir)
    log.debug(working_dir)
    tracker = os.path.join(working_dir, "tracker")
    # The initializer runs in every worker and copies its (name, value) pairs
    # into that process's module globals, making ``_writer`` available there.
    # NOTE(review): a lambda initializer presumably only works with the "fork"
    # start method, since it cannot be pickled -- confirm before running elsewhere.
    pool = mp.Pool(num_workers, lambda *args: globals().update(dict(args)), {"_writer": writer}.items())
    day_mod = 100
    day_count = 0
    day_time = datetime.now()
    speech_mod = 10000
    speech_count = 0
    speech_time = datetime.now()
    try:
        with open(tracker, "a+") as tracker_file:
            for filename, speeches in pool.imap(commons_speech_saver, commons_speech_feeder(working_dir, tracker)):
                day_count += 1
                # Compare by value: ``is 0`` only worked through CPython's
                # small-int caching.
                if day_count % day_mod == 0:
                    delta = datetime.now() - day_time
                    log.info("%10s Days in %s (%s/s)" % (day_count, delta, _per_second(day_count, delta)))
                    log.info("Latest file %s" % filename)
                # Log once for every multiple of speech_mod crossed by this
                # result.  Floor division keeps the bounds integral on both
                # Python 2 and 3 (assumes ``speeches`` is an int count).
                crossed_before = speech_count // speech_mod
                crossed_after = (speech_count + speeches) // speech_mod
                for milestone in range(crossed_before + 1, crossed_after + 1):
                    sp_count = milestone * speech_mod
                    delta = datetime.now() - speech_time
                    log.info("%10s Speeches in %s (%s/s)" % (sp_count, delta, _per_second(sp_count, delta)))
                    tracker_file.flush()
                speech_count += speeches
                tracker_file.write("%s\n" % filename)
    finally:
        # Always release the worker processes.  On the success path every
        # result has already been consumed, so terminating loses no work.
        pool.terminate()
        pool.join()


def _per_second(count, delta):
    """Return ``count`` divided by the seconds in ``delta`` (0.0 if none elapsed)."""
    # total_seconds() includes days and microseconds, unlike ``.seconds``,
    # which is 0 for any sub-second interval and wraps after 24 hours.
    elapsed = delta.total_seconds()
    if elapsed <= 0:
        return 0.0
    return count / elapsed
Esempio n. 3
0
    def test_feeder(self):
        """Check the feeder output for the fixture index page.

        The stub fetcher serves the stored index fixture; the feeder is then
        expected to produce 1000 URLs in under a second, starting and ending
        with the known first and last debate files.
        """
        def stub_index(url, _):
            # Serve the recorded index page instead of fetching anything.
            return self.read_fixture("hanalytics/fetchers/parlparse-commons-index.html")

        expected_first = u'http://ukparse.kforge.net/parldata/scrapedxml/debates/debates2008-06-30a.xml'
        expected_last = u'http://ukparse.kforge.net/parldata/scrapedxml/debates/debates2011-11-14a.xml'

        began = datetime.now()
        fed = list(commons_speech_feeder(commons_speech_working_dir(self.rootdir), stub_index))

        self.assertEqual(len(fed), 1000)
        self.assertLess(datetime.now() - began, timedelta(seconds=1))
        self.assertEqual(fed[0], expected_first)
        self.assertEqual(fed[-1], expected_last)
Esempio n. 4
0
 def test_create_working_dir(self):
     """Check the working directory path is as expected and exists on disk."""
     expected_dir = "/tmp/hanalytics-test/parlparse/commons"
     actual_dir = commons_speech_working_dir(self.rootdir)
     self.assertEqual(actual_dir, expected_dir)
     # The path must exist on disk once the helper has returned.
     self.assertTrue(os.path.exists(expected_dir))
Esempio n. 5
0
 def setUp(self):
     """Prepare the scratch root, a mock writer and the working directory."""
     scratch_root = "/tmp/hanalytics-test"
     self.rootdir = scratch_root
     self.writer = mock.Mock()
     # Resolved once here so tests can reuse it via ``self.working_dir``.
     self.working_dir = commons_speech_working_dir(scratch_root)