def run(self, **kwargs):
     """Run a ``CollectArticles`` operation for this year and for the next one.

     Each operation is built with only the year set (month and day are
     ``None``) and ``use_storage=True``, then handed to ``worker.run``.
     ``kwargs`` is accepted but not used.
     """
     current_year = datetime.date.today().year
     for target_year in (current_year, current_year + 1):
         # The channel list is looked up again for every operation.
         channels = get_available_channels()
         job = CollectArticles(channels, target_year, None, None,
                               use_storage=True)
         worker.run(job)
 def run(self, **kwargs):
     """Run a ``CollectArticles`` operation for today's exact date.

     The operation is built with ``use_storage=True`` and
     ``force_collect=True`` and handed to ``worker.run``.
     ``kwargs`` is accepted but not used.
     """
     now = datetime.date.today()
     job = CollectArticles(get_available_channels(),
                           now.year, now.month, now.day,
                           use_storage=True,
                           force_collect=True)
     worker.run(job)
 def test_get_articles_with_queue(self):
     """Hand a Guardian-only ``CollectArticles`` operation to the worker.

     No assertion is made: the test passes when ``worker.run`` returns
     without raising.
     """
     # The runnable object has to be imported explicitly here.
     from brokenpromises.operations import CollectArticles
     from brokenpromises.worker import worker
     guardian_only = ("brokenpromises.channels.guardian", )
     job = CollectArticles(guardian_only, "2014", 1, use_storage=False)
     worker.run(job)
 def run(self, **kwargs):
     """Run a ``CollectArticles`` operation for each of the coming days.

     One operation per day offset in ``range(1, 7)`` (tomorrow through
     today + 6 days) is built with ``use_storage=True`` and handed to
     ``worker.run``. ``kwargs`` is accepted but not used.
     """
     start = datetime.date.today()
     # NOTE(review): range(1, 7) stops at today + 6 days, while the original
     # comment reads "j+7" -- confirm whether day 7 was meant to be included.
     for offset in range(1, 7):  # j+7
         target = start + datetime.timedelta(days=offset)
         job = CollectArticles(get_available_channels(),
                               target.year, target.month, target.day,
                               use_storage=True)
         worker.run(job)
 def run(self, **kwargs):
     """Run a ``CollectArticles`` operation for this month and the next one.

     Each operation is built with the year and month set (day is ``None``)
     and ``use_storage=True``, then handed to ``worker.run``.
     ``kwargs`` is accepted but not used.
     """
     today = datetime.date.today()
     for month in range(0, 2):
         # Work on a zero-based month index so that the year only rolls over
         # when the index reaches 12. The previous formula,
         # ``(today.month + month) / 12``, already added a year for month 12
         # (e.g. December was scheduled as December of the *next* year).
         # ``//`` keeps the year an integer on Python 3 as well.
         month_index = today.month - 1 + month
         date = [
             today.year + month_index // 12,
             month_index % 12 + 1,
             None,
         ]
         collector = CollectArticles(get_available_channels(),
                                     *date,
                                     use_storage=True)
         worker.run(collector)
 def test_get_articles(self):
     collector = CollectArticles(("brokenpromises.channels.guardian", ),
                                 "2014", "1")
     results = collector.run()
     print
     print "results:", len(results)
     assert len(results) > 0
     for result in results:
         assert result.ref_dates, "%s : %s" % (result, result.url)
     assert collector.get_report()
     assert collector.get_report(
     ).collector == "brokenpromises.operations.CollectArticles", collector.get_report(
     ).collector
     assert collector.get_report().meta['count'] == len(results)
     assert collector.get_report().meta['related_articles'] <= len(results)
     assert len(collector.get_report().meta['urls_found']) == len(results)
 def test_get_articles_with_storage(self):
     from brokenpromises import Article
     searched_date = (2014, 1, None)
     collector = CollectArticles(("brokenpromises.channels.nytimes", ),
                                 *searched_date,
                                 use_storage=True)
     # replace storage with custom storage (testing db)
     collector.storage = self.testing_storage
     results = collector.run()
     print
     print "results:", len(results)
     assert len(results) > 0
     for result in results:
         assert result.ref_dates, "%s : %s" % (result, result.url)
     assert collector.get_report()
     assert collector.get_report(
     ).collector == "brokenpromises.operations.CollectArticles"
     assert collector.get_report().meta['count'] == len(results)
     assert collector.get_report().meta['related_articles'] <= len(results)
     assert len(collector.get_report().meta['urls_found']) == len(results)
     assert len(
         self.testing_storage.get_reports(
             name="collector", searched_date=searched_date,
             status="done")) == 1, self.testing_storage.get_reports(
                 searched_date)
     results = collector.run()
     assert len(results) > 0, results
     assert type(results[0]) is Article, type(results[0])
     assert len(
         self.testing_storage.get_reports(searched_date=searched_date)) == 2
     assert len(
         self.testing_storage.get_reports(name="collector",
                                          searched_date=searched_date)) == 2
     assert len(
         self.testing_storage.get_reports(name="collector",
                                          searched_date=searched_date,
                                          status="escaped")) == 1
	def test_get_articles(self):
		collector = CollectArticles(("brokenpromises.channels.guardian",), "2014", "1")
		results   = collector.run()
		print 
		print "results:", len(results)
		assert len(results) > 0
		for result in results:
			assert result.ref_dates, "%s : %s" % (result, result.url)
		assert collector.get_report()
		assert collector.get_report().collector                == "brokenpromises.operations.CollectArticles", collector.get_report().collector
		assert collector.get_report().meta['count']            == len(results)
		assert collector.get_report().meta['related_articles'] <= len(results)
		assert len(collector.get_report().meta['urls_found'])  == len(results)
    def test_retrieve_referenced_dates(self):
        """Check ``CollectArticles.retrieve_referenced_dates`` on many notations.

        Every sample string in ``dates`` is joined into one text separated by
        filler. Each sample must then be found among the extracted references
        with the expected ``(year, month, day)`` value, and no additional
        reference may be extracted.

        Raises:
            Exception: when a sample string is missing from the extracted
                references (the message lists what is still unmatched).
        """
        dates = (
            ("10 October 2013", (2013, 10, 10)),
            ("10 october, 2013", (2013, 10, 10)),
            ("4 by October 2013", (2013, 10, 4)),
            ("10 by October 2013", (2013, 10, 10)),
            ("10 by October, 2013", (2013, 10, 10)),
            ("Jan 2014", (2014, 1, None)),
            ("10 in October 2013", (2013, 10, 10)),
            ("10 in October, 2013", (2013, 10, 10)),
            ("10 of October 2013", (2013, 10, 10)),
            ("10 of October, 2013", (2013, 10, 10)),
            ("10th October 2013", (2013, 10, 10)),
            ("10th by October 2013", (2013, 10, 10)),
            ("10th by October, 2013", (2013, 10, 10)),
            ("10th in october 2013", (2013, 10, 10)),
            ("10th in October, 2013", (2013, 10, 10)),
            ("10th of October 2013", (2013, 10, 10)),
            ("10th of October, 2013", (2013, 10, 10)),
            ("2013-10-10", (2013, 10, 10)),
            ("2013/10/10", (2013, 10, 10)),
            ("August, 2013", (2013, 8, None)),
            ("2013", (2013, None, None)),
            ("November 04, 2013", (2013, 11, 4)),
            ("November 4, 2013", (2013, 11, 4)),
        )

        text = " bla bli 123. Bu \n pouet12 \n 12412 ".join(
            [_[0] for _ in dates])
        refs = CollectArticles.retrieve_referenced_dates(text)
        date_found = [_['extracted_date'] for _ in refs]
        for expected_text, expected_date in dates:
            # An explicit list plus emptiness check replaces the former
            # ``filter(...)[0]`` inside a bare ``except``: that form reported
            # every date as missing wherever ``filter`` returns an iterator,
            # and hid unrelated errors behind the "not found" message.
            matches = [
                candidate for candidate in refs
                if candidate["extracted_date"] == expected_text
            ]
            if not matches:
                raise Exception(
                    "\"%s\" not found in document. Date found:\n%s" %
                    (expected_text, "\n".join(date_found)))
            ref = matches[0]
            assert ref['extracted_date'] in expected_text
            assert ref['date'] == expected_date, "%s != %s" % (
                ref['date'], expected_date)
            # Whatever remains in date_found at the end was extracted in excess.
            date_found.remove(ref['extracted_date'])
        assert len(refs) == len(dates), "%s != %s\nToo much : %s" % (
            len(refs), len(dates), date_found)
	def test_get_articles_with_storage(self):
		from brokenpromises import Article
		searched_date = (2014, 1, None)
		collector = CollectArticles(("brokenpromises.channels.nytimes",), *searched_date, use_storage=True)
		# replace storage with custom storage (testing db)
		collector.storage = self.testing_storage
		results           = collector.run()
		print 
		print "results:", len(results)
		assert len(results) > 0
		for result in results:
			assert result.ref_dates, "%s : %s" % (result, result.url)
		assert collector.get_report()
		assert collector.get_report().collector                == "brokenpromises.operations.CollectArticles"
		assert collector.get_report().meta['count']            == len(results)
		assert collector.get_report().meta['related_articles'] <= len(results)
		assert len(collector.get_report().meta['urls_found'])  == len(results)
		assert len(self.testing_storage.get_reports(name="collector", searched_date=searched_date, status="done")) == 1, self.testing_storage.get_reports(searched_date)
		results = collector.run()
		assert len(results) > 0, results
		assert type(results[0]) is Article, type(results[0])
		assert len(self.testing_storage.get_reports(searched_date=searched_date)) == 2
		assert len(self.testing_storage.get_reports(name="collector", searched_date=searched_date)) == 2
		assert len(self.testing_storage.get_reports(name="collector", searched_date=searched_date, status="escaped")) == 1
	def test_retrieve_referenced_dates(self):
		"""Check ``CollectArticles.retrieve_referenced_dates`` on many notations.

		Every sample string in ``dates`` is joined into one text separated by
		filler. Each sample must then be found among the extracted references
		with the expected ``(year, month, day)`` value, and no additional
		reference may be extracted.

		Raises:
			Exception: when a sample string is missing from the extracted
				references (the message lists what is still unmatched).
		"""
		dates = (
			("10 October 2013"       , (2013, 10, 10)),
			("10 october, 2013"      , (2013, 10, 10)),
			("4 by October 2013"     , (2013, 10, 4)),
			("10 by October 2013"    , (2013, 10, 10)),
			("10 by October, 2013"   , (2013, 10, 10)),
			("Jan 2014"              , (2014, 1, None)),
			("10 in October 2013"    , (2013, 10, 10)),
			("10 in October, 2013"   , (2013, 10, 10)),
			("10 of October 2013"    , (2013, 10, 10)),
			("10 of October, 2013"   , (2013, 10, 10)),
			("10th October 2013"     , (2013, 10, 10)),
			("10th by October 2013"  , (2013, 10, 10)),
			("10th by October, 2013" , (2013, 10, 10)),
			("10th in october 2013"  , (2013, 10, 10)),
			("10th in October, 2013" , (2013, 10, 10)),
			("10th of October 2013"  , (2013, 10, 10)),
			("10th of October, 2013" , (2013, 10, 10)),
			("2013-10-10"            , (2013, 10, 10)),
			("2013/10/10"            , (2013, 10, 10)),
			("August, 2013"          , (2013, 8, None)),
			("2013"                  , (2013, None, None)),
			("November 04, 2013"     , (2013, 11, 4)),
			("November 4, 2013"      , (2013, 11, 4)),
		)

		text  = " bla bli 123. Bu \n pouet12 \n 12412 ".join([_[0] for _ in dates])
		refs  = CollectArticles.retrieve_referenced_dates(text)
		date_found = [_['extracted_date'] for _ in refs]
		for expected_text, expected_date in dates:
			# An explicit list plus emptiness check replaces the former
			# ``filter(...)[0]`` inside a bare ``except``: that form reported
			# every date as missing wherever ``filter`` returns an iterator,
			# and hid unrelated errors behind the "not found" message.
			matches = [candidate for candidate in refs if candidate["extracted_date"] == expected_text]
			if not matches:
				raise Exception("\"%s\" not found in document. Date found:\n%s" % (expected_text, "\n".join(date_found)))
			ref = matches[0]
			assert ref['extracted_date'] in expected_text
			assert ref['date']           == expected_date, "%s != %s" % (ref['date'], expected_date)
			# Whatever remains in date_found at the end was extracted in excess.
			date_found.remove(ref['extracted_date'])
		assert len(refs) == len(dates), "%s != %s\nToo much : %s" % (len(refs), len(dates), date_found)
# Remember to update the README.md file after modifying the options

options, args = oparser.parse_args()
assert len(args) > 0 and len(args) <= 3

if options.output_file:
	sys.stdout = open(options.output_file, 'a')

channels = brokenpromises.channels.get_available_channels()
if options.channels_file:
	with open(options.channels_file) as f:
		channels = [line.replace("\n", "") for line in f.readlines()]
if options.channels_list:
	channels = options.channels_list.split(",")

collector = CollectArticles(channels, *args, use_storage=options.storage, force_collect=options.force_collect)

if options.mongodb_drop:
	collector.storage.get_database().drop_collection("articles")
	collector.storage.get_database().drop_collection("reports")

results = collector.run()

# OUTPUT
print dumps([_.__dict__ for _ in results]).encode('utf-8')
info("%d articles collected." % (len(results)))
exit()

# EOF
# Example #13
# 0
options, args = oparser.parse_args()
assert len(args) > 0 and len(args) <= 3

if options.output_file:
    sys.stdout = open(options.output_file, 'a')

channels = brokenpromises.channels.get_available_channels()
if options.channels_file:
    with open(options.channels_file) as f:
        channels = [line.replace("\n", "") for line in f.readlines()]
if options.channels_list:
    channels = options.channels_list.split(",")

collector = CollectArticles(channels,
                            *args,
                            use_storage=options.storage,
                            force_collect=options.force_collect)

if options.mongodb_drop:
    collector.storage.get_database().drop_collection("articles")
    collector.storage.get_database().drop_collection("reports")

results = collector.run()

# OUTPUT
print dumps([_.__dict__ for _ in results]).encode('utf-8')
info("%d articles collected." % (len(results)))
exit()

# EOF