def harvest_front_pages(start, end, title_id, size='small'):
    '''
    Harvest images of front pages of the given title over the specified period.

    start and end are dates in ISO YYYY-MM-DD format, eg: '1857-02-04'.
    Both ends of the period are included.
    title_id identifies the newspaper title; size is passed through to
    get_front_page_image().

    Images are saved in the directory HARVEST_DIR + title_id (created if
    missing) as <date>-<page id>-<size>.jpg. A file that already exists is
    not downloaded again, and nothing is written when no image data comes
    back, so a later run can retry that day.

    >>> harvest_front_pages('1902-01-01','1902-01-01', '34')
    Checking date: 1902-01-01
    Saving: 1902-01-01-905929-small.jpg
    '''
    directory = '%s%s' % (HARVEST_DIR, title_id)
    if not os.path.exists(directory):
        os.makedirs(directory)
    start_date = convert_iso_to_datetime(start)
    end_date = convert_iso_to_datetime(end)
    one_day = datetime.timedelta(days=1)
    this_day = start_date
    # Loop through each day in specified period
    while this_day <= end_date:
        print('Checking date: %s' % this_day.isoformat())
        try:
            page_id = get_front_page_id(this_day, title_id)
        except IssueError:
            # No issue was published on this day; move on to the next one.
            print('No such issue.')
        else:
            filename = '%s/%s-%s-%s.jpg' % (
                directory, this_day.isoformat(), page_id, size)
            if not os.path.exists(filename):
                image = get_front_page_image(None, None, page_id, size=size)
                # Only create the file when there is something to write.
                # Otherwise a failed download would leave an empty file
                # behind and the exists() check above would skip this day
                # on every later run.
                # NOTE(review): presumably get_front_page_image() returns a
                # falsy value when the download fails -- confirm.
                if image:
                    print('Saving: %s' % os.path.basename(filename))
                    with open(filename, 'wb') as f:
                        f.write(image)
        this_day += one_day
        # Pause between days so the remote service is not hammered.
        time.sleep(1)
def harvest_front_pages(start, end, title_id, size='small'):
    '''
    Download the front page image of every issue of a title in a date range.

    start and end are ISO dates (YYYY-MM-DD, eg: '1857-02-04'); the range
    includes both of them. Each day is looked up with get_front_page_id();
    days that raise IssueError are reported and skipped.

    Images go into HARVEST_DIR + title_id, one file per day, named
    <date>-<page id>-<size>.jpg. Existing files are left alone and a file
    is only written when get_front_page_image() returns something truthy.

    >>> harvest_front_pages('1902-01-01','1902-01-01', '34')
    Checking date: 1902-01-01
    Saving: 1902-01-01-905929-small.jpg
    '''
    target_dir = '%s%s' % (HARVEST_DIR, title_id)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    current = convert_iso_to_datetime(start)
    last = convert_iso_to_datetime(end)
    step = datetime.timedelta(days=1)

    # Walk the period one day at a time, both ends included.
    while current <= last:
        stamp = current.isoformat()
        print('Checking date: %s' % stamp)
        try:
            page_id = get_front_page_id(current, title_id)
        except IssueError:
            print('No such issue.')
        else:
            name_parts = (target_dir, stamp, page_id, size)
            image_path = '%s/%s-%s-%s.jpg' % name_parts
            if not os.path.exists(image_path):
                image_data = get_front_page_image(
                    None, None, page_id, size=size)
                if image_data:
                    print('Saving: %s' % os.path.basename(image_path))
                    with open(image_path, 'wb') as image_file:
                        image_file.write(image_data)
        current = current + step
        # One second pause after every day, whether or not anything was saved.
        time.sleep(1)
def harvest_front_pages_text(start, end, title_id):
    '''
    Harvest and concatenate text content of all articles on front page.

    start and end are dates in ISO YYYY-MM-DD format, eg: '1857-02-04'.
    Both ends of the period are included.
    title_id identifies the newspaper title.

    For every day that has an issue, the text of each article found on the
    front page is joined (in the order the scraper returns them, with no
    separator) and saved in the directory HARVEST_DIR + title_id + '/text/'
    (created if missing) as <date>-<page id>.txt. A file that already
    exists is not harvested again.

    >>> harvest_front_pages_text('1902-01-01','1902-01-01', '34')
    Checking date: 1902-01-01
    Saving: 1902-01-01-905929.txt
    '''
    directory = '%s%s/text/' % (HARVEST_DIR, title_id)
    if not os.path.exists(directory):
        os.makedirs(directory)
    start_date = convert_iso_to_datetime(start)
    end_date = convert_iso_to_datetime(end)
    one_day = datetime.timedelta(days=1)
    this_day = start_date
    # Loop through each day in specified period
    while this_day <= end_date:
        print('Checking date: %s' % this_day.isoformat())
        try:
            page_url = get_front_page_url(this_day, title_id)
        except IssueError:
            # No issue was published on this day; move on to the next one.
            print('No such issue.')
        else:
            page_id = get_front_page_id(None, None, page_url)
            filename = '%s%s-%s.txt' % (directory, this_day.isoformat(), page_id)
            if not os.path.exists(filename):
                client = scrape.TroveNewspapersClient()
                client.extract_page_articles(page_url)
                # Join once instead of growing a string with += in a loop.
                page_text = ''.join(
                    article['text'] for article in client.results)
                # The file is opened in binary mode, so hand it bytes.
                # Writing a unicode string containing non-ASCII characters
                # would otherwise fail on the implicit ASCII encode.
                # Byte strings are written unchanged.
                if not isinstance(page_text, bytes):
                    page_text = page_text.encode('utf-8')
                print('Saving: %s' % os.path.basename(filename))
                with open(filename, 'wb') as f:
                    f.write(page_text)
        this_day += one_day
        # Pause between days so the remote service is not hammered.
        time.sleep(1)