def test_do_work(self):
    # The saver is handed a stub fetcher that always returns "fooo"; the test
    # then checks that a file named after the last URL segment ("bar.html")
    # appears in the commons working directory with exactly that content.
    def fetch_file(url, _):
        return "fooo"

    commons_speech_saver(
        "http://foo.com/foo/bar.html",
        fetch_file,
        commons_speech_working_dir(self.rootdir),
    )

    saved_path = os.path.join(
        commons_speech_working_dir(self.rootdir), "bar.html"
    )
    self.assertTrue(os.path.exists(saved_path))
    with open(saved_path) as saved:
        self.assertEqual(saved.read(), "fooo")
def _per_second(count, elapsed):
    """Return ``count`` per second of ``elapsed`` (a timedelta), or 0.0 if no time has passed."""
    seconds = elapsed.total_seconds()
    if seconds <= 0:
        return 0.0
    return count / seconds


def load_commons_speeches(root_dir, writer, num_workers):
    """Run ``commons_speech_saver`` over the feeder's items in a worker pool.

    Every item yielded by ``commons_speech_feeder(working_dir, tracker)`` is
    mapped through ``commons_speech_saver`` in a pool of ``num_workers``
    processes.  Each result is unpacked as ``(filename, speeches)``, where
    ``speeches`` is added to a running total.  Each processed filename is
    appended to the ``tracker`` file inside the working directory, and
    progress is logged every 100 results and every 10000 speeches.

    :param root_dir: passed to ``commons_speech_working_dir`` to obtain the
        working directory.
    :param writer: object published to each worker process as the module
        global ``_writer``.
    :param num_workers: number of processes in the pool.
    :returns: ``None``.
    """
    log.debug("starting loader")
    working_dir = commons_speech_working_dir(root_dir)
    log.debug(working_dir)
    tracker = os.path.join(working_dir, "tracker")

    # The initializer runs once in each worker and installs ``_writer`` as a
    # module global there.  NOTE(review): a lambda initializer cannot be
    # pickled, so this relies on a fork-style start method.
    pool = mp.Pool(num_workers,
                   lambda *args: globals().update(dict(args)),
                   [("_writer", writer)])

    day_mod = 100
    day_count = 0
    day_time = datetime.now()
    speech_mod = 10000
    speech_count = 0
    speech_time = datetime.now()

    finished = False
    try:
        with open(tracker, "a+") as tracker_file:
            results = pool.imap(commons_speech_saver,
                                commons_speech_feeder(working_dir, tracker))
            for filename, speeches in results:
                day_count += 1
                if day_count % day_mod == 0:
                    delta = datetime.now() - day_time
                    log.info("%10s Days in %s (%s/s)"
                             % (day_count, delta, _per_second(day_count, delta)))
                    log.info("Latest file %s" % filename)

                # One log line per multiple of speech_mod crossed by this
                # result.  Floor division keeps the count an int on both
                # Python 2 and Python 3.
                crossed = ((speech_count + speeches) // speech_mod
                           - speech_count // speech_mod)
                for i in range(crossed):
                    sp_count = (speech_count // speech_mod + i + 1) * speech_mod
                    delta = datetime.now() - speech_time
                    log.info("%10s Speeches in %s (%s/s)"
                             % (sp_count, delta, _per_second(sp_count, delta)))

                speech_count += speeches
                tracker_file.write("%s\n" % filename)
                # Flush after the write so the newest entry reaches the file
                # promptly (presumably the tracker is used to resume an
                # interrupted run -- confirm against the feeder).
                tracker_file.flush()
        finished = True
    finally:
        # Never leave worker processes behind: shut down cleanly on success,
        # kill outstanding work on failure, then reap the workers.
        if finished:
            pool.close()
        else:
            pool.terminate()
        pool.join()
def test_feeder(self):
    # Drives the feeder with a stubbed index fetcher that serves a saved
    # index page, then checks how many URLs come back, that producing them
    # took under a second, and what the first and last URLs are.
    def fetch_index(url, _):
        return self.read_fixture("hanalytics/fetchers/parlparse-commons-index.html")

    first_expected = u'http://ukparse.kforge.net/parldata/scrapedxml/debates/debates2008-06-30a.xml'
    last_expected = u'http://ukparse.kforge.net/parldata/scrapedxml/debates/debates2011-11-14a.xml'

    start = datetime.now()
    feeder = commons_speech_feeder(
        commons_speech_working_dir(self.rootdir), fetch_index
    )
    urls = list(feeder)

    self.assertEqual(len(urls), 1000)
    self.assertLess(datetime.now() - start, timedelta(seconds=1))
    self.assertEqual(urls[0], first_expected)
    self.assertEqual(urls[-1], last_expected)
def test_create_working_dir(self):
    # The helper must return the commons directory under the test root and
    # that directory must exist on disk afterwards.
    expected_dir = "/tmp/hanalytics-test/parlparse/commons"

    working_dir = commons_speech_working_dir(self.rootdir)

    self.assertEqual(working_dir, expected_dir)
    self.assertTrue(os.path.exists(expected_dir))
def setUp(self):
    """Prepare the per-test root directory, a mock writer and the working dir."""
    root = "/tmp/hanalytics-test"
    self.rootdir = root
    self.writer = mock.Mock()
    # Derived from the root directory, so it is computed after rootdir is set.
    self.working_dir = commons_speech_working_dir(root)