def harvest_front_pages(start, end, title_id, size='small'):
    '''
    Harvest images of front pages of the given title over the specified period.
    start and end are dates in ISO YYYY-MM-DD format, eg: '1857-02-04'.
    Both dates are included in the harvest.

    title_id identifies the newspaper title and is used as the name of the
    sub-directory of HARVEST_DIR that the images are saved to.
    size is passed through to get_front_page_image and is also embedded in
    the name of each saved file.

    Files are named <date>-<page_id>-<size>.jpg. Dates without an issue,
    images already on disk, and empty image responses are skipped.

    >>> harvest_front_pages('1902-01-01','1902-01-01', '34')
    Checking date: 1902-01-01
    Saving: 1902-01-01-905929-small.jpg
    
    '''
    directory = '%s%s' % (HARVEST_DIR, title_id)
    if not os.path.exists(directory):
        os.makedirs(directory)
    start_date = convert_iso_to_datetime(start)
    end_date = convert_iso_to_datetime(end)
    one_day = datetime.timedelta(days=1)
    this_day = start_date
    # Loop through each day in specified period
    while this_day <= end_date:
        print('Checking date: %s' % this_day.isoformat())
        try:
            page_id = get_front_page_id(this_day, title_id)
        except IssueError:
            print('No such issue.')
        else:
            filename = '%s/%s-%s-%s.jpg' % (directory, this_day.isoformat(),
                                            page_id, size)
            # An existing file means this page was harvested on an earlier run.
            if not os.path.exists(filename):
                image = get_front_page_image(None, None, page_id, size=size)
                # Only write when image data actually came back. Writing a
                # falsy result would fail or leave an empty file behind,
                # and that file would then stop the page from ever being
                # harvested again because of the exists() check above.
                if image:
                    print('Saving: %s' % os.path.basename(filename))
                    with open(filename, 'wb') as f:
                        f.write(image)
        this_day += one_day
        # Pause for a second between days (presumably to go easy on the
        # remote server -- confirm before removing).
        time.sleep(1)
# Example #2
# 0
def harvest_front_pages(start, end, title_id, size='small'):
    '''
    Download the front page image of a title for every day in a date range.
    start and end are dates in ISO YYYY-MM-DD format, eg: '1857-02-04',
    and both are included in the range.

    Images are written to a directory named after title_id underneath
    HARVEST_DIR, as <date>-<page_id>-<size>.jpg. A day is skipped when it
    has no issue, when its image file already exists, or when no image
    data is returned.

    >>> harvest_front_pages('1902-01-01','1902-01-01', '34')
    Checking date: 1902-01-01
    Saving: 1902-01-01-905929-small.jpg
    
    '''
    target_dir = '%s%s' % (HARVEST_DIR, title_id)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    current = convert_iso_to_datetime(start)
    last = convert_iso_to_datetime(end)
    step = datetime.timedelta(days=1)
    # Walk the period one day at a time, first and last day included.
    while current <= last:
        day_label = current.isoformat()
        print('Checking date: %s' % day_label)
        try:
            page_id = get_front_page_id(current, title_id)
        except IssueError:
            print('No such issue.')
        else:
            target = '%s/%s-%s-%s.jpg' % (target_dir, day_label, page_id, size)
            # The image is only requested when it is not on disk yet.
            image = None
            if not os.path.exists(target):
                image = get_front_page_image(None, None, page_id, size=size)
            if image:
                print('Saving: %s' % os.path.basename(target))
                with open(target, 'wb') as out:
                    out.write(image)
        current += step
        # One second pause after every day that was checked.
        time.sleep(1)
# Example #3
# 0
def harvest_front_pages_text(start, end, title_id):
    '''
    Harvest and concatenate text content of all articles on front page.
    start and end are dates in ISO YYYY-MM-DD format, eg: '1857-02-04'.
    Both dates are included in the harvest.

    For each day that has an issue, the text of every article returned for
    the front page is joined together (no separator) and saved to
    <HARVEST_DIR><title_id>/text/<date>-<page_id>.txt. Text that is not
    already a byte string is encoded as UTF-8 before it is written.
    Days whose file already exists are skipped.

    >>> harvest_front_pages_text('1902-01-01','1902-01-01', '34')
    Checking date: 1902-01-01
    Saving: 1902-01-01-905929.txt
    '''
    directory = '%s%s/text/' % (HARVEST_DIR, title_id)
    if not os.path.exists(directory):
        os.makedirs(directory)
    start_date = convert_iso_to_datetime(start)
    end_date = convert_iso_to_datetime(end)
    one_day = datetime.timedelta(days=1)
    this_day = start_date
    # Loop through each day in specified period
    while this_day <= end_date:
        print('Checking date: %s' % this_day.isoformat())
        try:
            page_url = get_front_page_url(this_day, title_id)
        except IssueError:
            print('No such issue.')
        else:
            page_id = get_front_page_id(None, None, page_url)
            filename = '%s%s-%s.txt' % (directory, this_day.isoformat(),
                                        page_id)
            # An existing file means this page was harvested on an earlier run.
            if not os.path.exists(filename):
                client = scrape.TroveNewspapersClient()
                client.extract_page_articles(page_url)
                page_text = ''.join(article['text']
                                    for article in client.results)
                # The file is opened in binary mode, so text has to be
                # encoded explicitly. Without this, non-ASCII characters
                # abort the write and leave an empty file that blocks any
                # later re-harvest of this page.
                if not isinstance(page_text, bytes):
                    page_text = page_text.encode('utf-8')
                print('Saving: %s' % os.path.basename(filename))
                with open(filename, 'wb') as f:
                    f.write(page_text)
        this_day += one_day
        # Pause for a second between days (presumably to go easy on the
        # remote server -- confirm before removing).
        time.sleep(1)
def harvest_front_pages_text(start, end, title_id):
    '''
    Harvest and concatenate text content of all articles on front page.
    start and end are dates in ISO YYYY-MM-DD format, eg: '1857-02-04'.
    The range includes both the start and the end date.

    Output goes to the text/ sub-directory of the title's directory under
    HARVEST_DIR, one file per front page, named <date>-<page_id>.txt.
    Article texts are joined without a separator. Anything that is not
    already a byte string is written as UTF-8. Pages whose file already
    exists are not fetched again.

    >>> harvest_front_pages_text('1902-01-01','1902-01-01', '34')
    Checking date: 1902-01-01
    Saving: 1902-01-01-905929.txt
    '''
    directory = '%s%s/text/' % (HARVEST_DIR, title_id)
    if not os.path.exists(directory):
        os.makedirs(directory)
    start_date = convert_iso_to_datetime(start)
    end_date = convert_iso_to_datetime(end)
    one_day = datetime.timedelta(days=1)
    this_day = start_date
    # Loop through each day in specified period
    while this_day <= end_date:
        print('Checking date: %s' % this_day.isoformat())
        try:
            page_url = get_front_page_url(this_day, title_id)
        except IssueError:
            print('No such issue.')
        else:
            page_id = get_front_page_id(None, None, page_url)
            filename = '%s%s-%s.txt' % (directory, this_day.isoformat(), page_id)
            if not os.path.exists(filename):
                trove_client = scrape.TroveNewspapersClient()
                trove_client.extract_page_articles(page_url)
                articles = trove_client.results
                page_text = ''.join(article['text'] for article in articles)
                # Binary-mode files need bytes. Encoding here keeps
                # non-ASCII text from failing the write, which would leave
                # an empty file behind and cause this page to be skipped
                # on every later run.
                if not isinstance(page_text, bytes):
                    page_text = page_text.encode('utf-8')
                print('Saving: %s' % os.path.basename(filename))
                with open(filename, 'wb') as f:
                    f.write(page_text)
        this_day += one_day
        # One second pause per day checked (presumably rate limiting for
        # the remote service -- confirm before changing).
        time.sleep(1)