def retrieve_referenced_dates(cls, text, filters_on_text=None):
    """Collect the dates mentioned in ``text`` along with their context.

    Args:
        text: The text to scan for dates.
        filters_on_text: Optional callable; when truthy it is called with
            ``text`` and its return value is scanned instead (the original
            notes describe its purpose as removing all tags).

    Returns:
        A list with one dict per 3-tuple yielded by
        ``dateparser.find_dates``, in the order yielded. Each dict holds:

        * ``"date"`` -- the first element of the tuple,
        * ``"extract"`` -- the result of ``cls.get_sentence`` called with
          the scanned text and the second element of the tuple,
        * ``"extracted_date"`` -- the second element of the tuple.
    """
    scanned = filters_on_text(text) if filters_on_text else text
    # The third element of each hit is not used here.
    return [
        {
            "date": found_date,
            "extract": cls.get_sentence(scanned, raw_match),
            "extracted_date": raw_match,
        }
        for found_date, raw_match, _unused in dateparser.find_dates(scanned)
    ]
def retrieve_referenced_dates(cls, text, filters_on_text=None):
    """Return one reference dict for every date found in ``text``.

    Args:
        text: The text to scan for dates.
        filters_on_text: Optional callable applied to ``text`` before the
            scan when truthy (noted in the original as a way to remove all
            tags); its return value replaces ``text``.

    Returns:
        A list of dicts, one per 3-tuple yielded by
        ``dateparser.find_dates``, each with the keys ``"date"`` (first
        tuple element), ``"extract"`` (value returned by
        ``cls.get_sentence(text, <second element>)``) and
        ``"extracted_date"`` (second tuple element).
    """
    if filters_on_text:
        text = filters_on_text(text)

    def as_reference(hit):
        # Each hit is a 3-tuple; its last element is not needed here.
        parsed, raw, _unused = hit
        return {
            "date": parsed,
            "extract": cls.get_sentence(text, raw),
            "extracted_date": raw,
        }

    return [as_reference(hit) for hit in dateparser.find_dates(text)]
return brokenpromises.channels.Catalogue.CHANNELS[_id]['class']() if __name__ == "__main__": parser = argparse.ArgumentParser(description='') parser.add_argument('url', type=str, help='url to scrap') parser.add_argument('--with-filters', dest='filters', action='store_true', default=False, help='Apply filters to remove unwanted dates') parser.add_argument('--dates', dest='dates', action='store_true', default=False, help='Return the date found in the article') args = parser.parse_args() url = args.url available_channels = brokenpromises.channels.get_available_channels() brokenpromises.channels.perform_channels_import(available_channels) channel = None if "nytimes.com" in url: channel = get_channel('nytimes') elif "theguardian.com" in url: channel = get_channel('guardian') if channel: if args.dates: args.filters = True body = channel.scrape_body_article(url, filter_=args.filters) if args.dates: dates = [] for date_obj, date_row, date_position in dateparser.find_dates(body): dates.append(date_obj) print json.dumps(dates) else: print body.encode('utf-8', 'ignore') # EOF
def retrieve_referenced_dates(cls, text):
    """List the dates found in ``text`` with the context of each one.

    Args:
        text: The text to scan for dates.

    Returns:
        A list with one dict per 3-tuple yielded by
        ``dateparser.find_dates(text)``, in the order yielded. The keys
        are ``"date"`` (first tuple element), ``"extract"`` (result of
        ``cls.get_sentence(text, <second element>)``) and
        ``"extracted_date"`` (second tuple element).
    """
    # The third element of every hit is ignored.
    return [
        {
            "date": parsed,
            "extract": cls.get_sentence(text, raw),
            "extracted_date": raw,
        }
        for parsed, raw, _unused in dateparser.find_dates(text)
    ]