Example #1
0
def _per_second(count, elapsed):
    """Return ``count`` per second of ``elapsed``, or 0.0 if no time has passed."""
    seconds = elapsed.total_seconds()
    return count / seconds if seconds > 0 else 0.0


def load_commons_speeches(root_dir, writer, num_workers, rate):
    """Feed commons speech files through a worker pool and track progress.

    Every item produced by ``commons_speech_feeder`` is handed to
    ``commons_speech_saver`` in a pool of ``num_workers`` processes.  Each
    result is a ``(filename, speeches)`` pair, where ``speeches`` is a count
    that is added to a running total.  The filename is appended to the
    ``tracker`` file inside the working directory, and progress is logged
    every 100 results and every 10000 speeches.

    Args:
        root_dir: Passed to ``commons_speech_working_dir`` to obtain the
            working directory.
        writer: Published to each worker process as the global ``_writer``.
        num_workers: Number of worker processes in the pool.
        rate: Published to each worker process as the global ``rate``.

    Raises:
        Any exception raised while iterating the results is re-raised after
        the pool has been terminated.
    """
    log.debug("starting loader")
    working_dir = commons_speech_working_dir(root_dir)
    log.debug(working_dir)
    tracker = os.path.join(working_dir, "tracker")
    # The initializer runs once in every worker and copies the (name, value)
    # pairs into this module's globals there.
    # NOTE(review): a lambda initializer cannot be pickled, so this presumably
    # relies on the "fork" start method -- confirm for the target platform.
    pool = mp.Pool(num_workers, lambda *args: globals().update(dict(args)), {"_writer":writer, "rate":rate}.items())
    # TODO: move reporting out
    day_mod = 100
    day_count = 0
    day_time = datetime.datetime.now()
    speech_mod = 10000
    speech_count = 0
    speech_time = datetime.datetime.now()
    try:
        with open(tracker, "a+") as tracker_file:
            for filename, speeches in pool.imap(commons_speech_saver, commons_speech_feeder(working_dir, tracker)):
                day_count += 1
                # Compare by value: ``is 0`` only worked by accident of
                # CPython's small-integer caching.
                if day_count % day_mod == 0:
                    delta = datetime.datetime.now() - day_time
                    log.info("%10s Days in %s (%s/s)" % (day_count, delta, _per_second(day_count, delta)))
                    log.info("Latest file %s" % filename)
                # One log line per multiple of speech_mod crossed by this
                # result.  Floor division keeps the bounds integral on both
                # Python 2 and Python 3.
                crossed_before = speech_count // speech_mod
                crossed_after = (speech_count + speeches) // speech_mod
                for milestone in range(crossed_before + 1, crossed_after + 1):
                    sp_count = milestone * speech_mod
                    delta = datetime.datetime.now() - speech_time
                    log.info("%10s Speeches in %s (%s/s)" % (sp_count, delta, _per_second(sp_count, delta)))
                    # Persist the filenames written so far at each milestone.
                    tracker_file.flush()
                speech_count += speeches
                tracker_file.write("%s\n" % filename)
    except BaseException:
        # Do not keep working through the remaining queue after a failure.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # Valid after either close() or terminate(); reaps the workers.
        pool.join()
Example #2
0
 def test_feeder(self):
     """Test source urls are generated correctly and quickly"""
     began = datetime.datetime.now()
     working_dir = commons_speech_working_dir(self.rootdir)
     urls = list(commons_speech_feeder(working_dir))
     self.assertEqual(len(urls), 995)
     # The whole generation must finish within a tenth of a second.
     elapsed = datetime.datetime.now() - began
     time_limit = datetime.timedelta(seconds=0.1)
     self.assertLess(elapsed, time_limit)
     # Spot-check both ends of the generated sequence.
     expected_first = "http://www.hansard-archive.parliament.uk/Parliamentary_Debates_1803_to_1820/S1V0001P0.zip"
     expected_last = "http://www.hansard-archive.parliament.uk/Parliamentary_Debates_1909_to_1981/S5V0199P0.zip"
     self.assertEqual(urls[0], expected_first)
     self.assertEqual(urls[-1], expected_last)
Example #3
0
 def setUp(self):
     """Prepare the fixture: root directory, mock writer and working directory."""
     root = "/tmp/hanalytics-test"
     self.rootdir = root
     self.writer = mock.Mock()
     # test_create_working_dir expects this call to leave the directory on disk.
     self.working_dir = commons_speech_working_dir(root)
Example #4
0
 def test_create_working_dir(self):
     """Check the working directory has the expected path and exists on disk"""
     expected_path = "/tmp/hanalytics-test/hansardarchive"
     actual_path = commons_speech_working_dir(self.rootdir)
     self.assertEqual(actual_path, expected_path)
     self.assertTrue(os.path.exists(actual_path))