Example #1
 def retrieve_referenced_dates(cls, text, filters_on_text=None):
     references = []
     # apply the optional filter to the raw text first (e.g. to remove all tags)
     if filters_on_text:
         text = filters_on_text(text)
     # search for dates and append each one to `references`
     for date_obj, date_row, date_position in dateparser.find_dates(text):
         reference = {
             "date": date_obj,
             "extract": cls.get_sentence(text, date_row),
             "extracted_date": date_row
         }
         references.append(reference)
     return references
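Here filters_on_text is expected to be a callable that takes the raw text and returns a cleaned-up version; the comments suggest it is used to strip markup before the date search. A minimal sketch of such a filter, assuming simple regular-expression tag removal (the project's real filters may do more):

import re

def strip_tags(text):
    # naive stand-in for a "remove all tags" filter: replace anything
    # shaped like <...> with a space so sentence boundaries survive
    return re.sub(r"<[^>]+>", " ", text)

cleaned = strip_tags("<p>The report is due in <b>June 2014</b>.</p>")
print(cleaned)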
Example #2
	def retrieve_referenced_dates(cls, text, filters_on_text=None):
		references = []
		# apply the optional filter to the raw text first (e.g. to remove all tags)
		if filters_on_text:
			text = filters_on_text(text)
		# search for dates and append each one to `references`
		for date_obj, date_row, date_position in dateparser.find_dates(text):
			reference = {
				"date"           : date_obj,
				"extract"        : cls.get_sentence(text, date_row),
				"extracted_date" : date_row
			}
			references.append(reference)
		return references
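Both variants above call a cls.get_sentence helper that is not shown. A standalone sketch of what it plausibly does, assuming it returns the sentence of the text that contains the raw date string (the helper's name comes from the examples; this body is only an approximation):

import re

def get_sentence(text, fragment):
    # split on sentence-ending punctuation and return the first sentence
    # that contains the fragment; rough, but enough for short extracts
    for sentence in re.split(r"(?<=[.!?])\s+", text):
        if fragment in sentence:
            return sentence.strip()
    return ""

print(get_sentence("Nothing new yet. The vote is set for 12 May 2014.", "12 May 2014"))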
Example #3
import argparse
import json

import brokenpromises.channels
import dateparser

def get_channel(_id):
	return brokenpromises.channels.Catalogue.CHANNELS[_id]['class']()

if __name__ == "__main__":
	parser = argparse.ArgumentParser(description='Scrape an article body and optionally list the dates it references')
	parser.add_argument('url', type=str, help='url to scrape')
	parser.add_argument('--with-filters', dest='filters', action='store_true', default=False, help='Apply filters to remove unwanted dates')
	parser.add_argument('--dates', dest='dates', action='store_true', default=False, help='Return the dates found in the article')

	args = parser.parse_args()
	url = args.url
	available_channels = brokenpromises.channels.get_available_channels()
	brokenpromises.channels.perform_channels_import(available_channels)
	channel = None
	if "nytimes.com" in url:
		channel = get_channel('nytimes')
	elif "theguardian.com" in url:
		channel = get_channel('guardian')
	if channel:
		if args.dates:
			args.filters = True
		body = channel.scrape_body_article(url, filter_=args.filters)
		if args.dates:
			dates = []
			for date_obj, date_row, date_position in dateparser.find_dates(body):
				dates.append(date_obj)
			print json.dumps(dates)
		else:
			print body.encode('utf-8', 'ignore')

# EOF
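One way to exercise the script above, assuming it is saved as scrape_article.py (a placeholder file name) and that its JSON output parses cleanly; the article URL here is also a placeholder:

import json
import subprocess

# --dates asks the script to print the dates it found as a JSON list
out = subprocess.check_output(
    ["python", "scrape_article.py", "--dates", "http://www.nytimes.com/example-article.html"]
)
print(json.loads(out.decode("utf-8")))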
Example #4
 def retrieve_referenced_dates(cls, text):
     references = []
     for date_obj, date_row, date_position in dateparser.find_dates(text):
         reference = {"date": date_obj, "extract": cls.get_sentence(text, date_row), "extracted_date": date_row}
         references.append(reference)
     return references
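Whatever the variant, the return value is a list of dicts sharing the same three keys. A self-contained illustration of that shape, using a stubbed find_dates in place of the project's dateparser module (the stub and its sample text are assumptions for demonstration only):

import datetime

def find_dates_stub(text):
    # stand-in for dateparser.find_dates: yields (date_obj, raw_string, position)
    raw = "March 2015"
    yield datetime.date(2015, 3, 1), raw, text.index(raw)

text = "A new budget was promised by March 2015."
references = [
    {"date": d, "extract": text, "extracted_date": raw}  # full text stands in for get_sentence
    for d, raw, _pos in find_dates_stub(text)
]
print(references)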