Example 1
def get_issues_by_titles(titles=None):
    '''
    For each title calculate the total number of issues on Trove, and the number of issues for each year.
    
    >>> get_issues_by_titles(['32'])
    The Hobart Town Mercury (Tas. : 1857): 142 issues
    [{'total_issues': 142, 'title_id': u'32', 'title_name': u'The Hobart Town Mercury (Tas. : 1857)', 'issues_by_year': {u'1857': 142}}]
    
    '''
    issue_totals = []
    title_list = json.load(get_url(TITLES_URL))
    for title in title_list:
        if title['id'] in titles:
            title_url = '%s%s' % (TITLE_HOLDINGS_URL, title['id'])
            holdings = json.load(get_url(title_url))
            current_year = holdings[0]['y']
            totals = {}
            total = 0
            for month in holdings:
                if current_year != month['y']:
                    current_year = month['y']
                try:
                    totals[current_year] += int(month['c'])
                except KeyError:
                    totals[current_year] = int(month['c'])
                total += int(month['c'])
            issue_totals.append({
                'title_id': title['id'],
                'title_name': title['name'],
                'total_issues': total,
                'issues_by_year': totals
            })
            print '%s: %s issues' % (title['name'], total)
    return issue_totals
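
These examples rely on a handful of module-level names (TITLES_URL, TITLE_HOLDINGS_URL and a get_url() helper) that are defined elsewhere in the harvester. A minimal sketch of what they are assumed to look like, with placeholder endpoint values, is:

import json
import urllib2

# Placeholder endpoints; the real values live elsewhere in the harvester.
TITLES_URL = 'http://.../titles'
TITLE_HOLDINGS_URL = 'http://.../holdings/'

def get_url(url):
    # Return a file-like response so callers can pass it straight to json.load().
    return urllib2.urlopen(url)

Each holdings record iterated above is assumed to be a dict like {'y': '1857', 'm': '1', 'c': '12'} (year, month, issue count), so get_issues_by_titles(['32']) prints the per-title summary and returns the list shown in the doctest.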
Example 2
def get_issues_by_titles(titles=None):
    '''
    For each title calculate the total number of issues on Trove, and the number of issues for each year.
    
    >>> get_issues_by_titles(['32'])
    The Hobart Town Mercury (Tas. : 1857): 142 issues
    [{'total_issues': 142, 'title_id': u'32', 'title_name': u'The Hobart Town Mercury (Tas. : 1857)', 'issues_by_year': {u'1857': 142}}]
    
    '''
    issue_totals = []
    title_list = json.load(get_url(TITLES_URL))
    for title in title_list:
        if title['id'] in titles:
            title_url = '%s%s' % (TITLE_HOLDINGS_URL, title['id'])
            holdings = json.load(get_url(title_url))
            current_year = holdings[0]['y']
            totals = {}
            total = 0
            for month in holdings:
                if current_year != month['y']:
                    current_year = month['y']
                try:
                    totals[current_year] += int(month['c'])
                except KeyError:
                    totals[current_year] = int(month['c'])
                total += int(month['c'])
            issue_totals.append({'title_id': title['id'], 'title_name': title['name'], 'total_issues': total, 'issues_by_year': totals})
            print '%s: %s issues' % (title['name'], total)
    return issue_totals
Example 3
def get_titles(locate=False):
    '''
    Retrieves a list of current newspaper titles from Trove.
    Retrieves current holdings details about each title.
    Saves details of newspapers with holdings to a list.
    Returns a list of dictionaries with the following fields:
    name, id, state, start_year, start_month, end_year, end_month.
    '''
    title_list = json.load(get_url(TITLES_URL))
    titles = []
    for title in title_list:
        name = title['name']
        print unicode(name).encode('utf-8')
        try:
            place, state = re.search(r'\(([a-zA-Z \.]+, )*?(National|ACT|NSW|NT|Qld|QLD|SA|Tas|TAS|Vic|VIC|WA)\.*?', 
                              name).groups()
        except AttributeError:
            place = None
            state = 'national'
        if locate and place is None and state != 'national':
            locate_title(name)       
        url = '%s%s' % (TITLE_HOLDINGS_URL, title['id'])
        holdings = json.load(get_url(url))
        # Only save titles that have holdings online
        if len(holdings) > 0:
            titles.append({'name': name,
                               'id': title['id'],
                               'state': state,
                               'place': place,
                               'start_year': holdings[0]['y'],
                               'start_month': holdings[0]['m'],
                               'end_year': holdings[-1]['y'],
                               'end_month': holdings[-1]['m'],
                               })
    return titles
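
The place and state come from the bracketed qualifier Trove appends to each title name. With the pattern above, a name with no place component yields place=None, while a name such as 'The Argus (Melbourne, Vic. : 1848 - 1957)' yields place='Melbourne, ' (trailing comma and space included) and state='Vic'. A small standalone check of that behaviour:

import re

PATTERN = r'\(([a-zA-Z \.]+, )*?(National|ACT|NSW|NT|Qld|QLD|SA|Tas|TAS|Vic|VIC|WA)\.*?'

for name in ['The Hobart Town Mercury (Tas. : 1857)',
             'The Argus (Melbourne, Vic. : 1848 - 1957)']:
    match = re.search(PATTERN, name)
    if match:
        place, state = match.groups()
        print '%s -> place=%r, state=%r' % (name, place, state)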
Example 4
def sample_front_pages(size='thumb'):
    '''
    Retrieve a front page image for every title at monthly intervals.
    '''
    titles = json.load(get_url(TITLES_URL))
    for title in titles:
        print 'Processing: %s' % title['name']
        directory = '%ssamples/%s' % (HARVEST_DIR, title['id'])
        if not os.path.exists(directory):
            os.makedirs(directory)
        title_url = '%s%s' % (TITLE_HOLDINGS_URL, title['id'])
        holdings = json.load(get_url(title_url))
        for month in holdings:
            month_url = '%s%s/%s' % (MONTH_ISSUES_URL, month['y'], month['m'])
            issues = json.load(get_url(month_url))
            for issue in issues:
                if issue['t'] == title['id']:
                    first_issue = issue
                    break
            first_issue_id = first_issue['iss']
            first_issue_date = datetime.date(int(month['y']), int(month['m']), int(first_issue['p']))
            print 'Checking date: %s' % first_issue_date.isoformat()
            page_id = get_front_page_id(first_issue_date, title['id'])
            filename = '%s/%s-%s-%s.jpg' % (directory, first_issue_date.isoformat(), page_id, size)
            if not os.path.exists(filename):
                image = get_front_page_image(None, None, page_id, size=size)
                print 'Saving: %s' % filename
                with open(filename, 'wb') as f:
                    f.write(image) 
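
The month listings returned by MONTH_ISSUES_URL are assumed to carry the title id ('t'), the issue id ('iss') and the day of the month ('p'), which is how the inner loop picks out the first issue of each month for the current title. A typical run only needs the image size:

# Harvest a front page for the first issue of every month of every title.
# Thumbnails go to HARVEST_DIR/samples/<title id>/ as <date>-<page id>-thumb.jpg.
sample_front_pages(size='thumb')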
Example 5
def get_title_issues(title, year):
    title_url = '%s%s' % (TITLE_HOLDINGS_URL, title)
    holdings = json.load(get_url(title_url))
    issues = []
    for month in holdings:
        if month['y'] == str(year): 
            month_url = '%s%s/%s' % (MONTH_ISSUES_URL, month['y'], month['m'])
            print month_url
            month_issues = json.load(get_url(month_url))
            for issue in month_issues:
                if issue['t'] == str(title):
                    issue_date = get_issue_date(issue['iss'])
                    issues.append({'id': issue['iss'], 'date': issue_date.isoformat()})
    return issues
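
get_title_issues() walks the title's holdings, keeps only the months in the requested year, and then calls get_issue_date() once per matching issue, so expect one request per issue. Usage is simply:

# All 1857 issues of title 32 (The Hobart Town Mercury in the doctest above).
issues = get_title_issues('32', 1857)
for issue in issues:
    print issue['date'], issue['id']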
Example 6
def get_issue_url(date, title_id):
    '''
    Gets the issue url given a title and date.
    
    >>> get_issue_url(datetime.date(1925,1,1), '35')
    u'http://trove.nla.gov.au/ndp/del/issue/120168'
    
    '''
    if type(date) is datetime.date:
        year, month, day = date.timetuple()[:3]
    else:
        year, month, day = (int(num) for num in date.split('-'))
    data_file = os.path.join(ISSUE_DATA_DIR, '%s-%s.js' % (year, month))
    if os.path.exists(data_file):
        with open(data_file, 'rb') as issue_data:
            issues = json.load(issue_data)
    else:
        url = '%s%s/%02d' % (MONTH_ISSUES_URL, year, month)
        issues = json.load(get_url(url))
        with open(data_file, 'wb') as issue_data:
            json.dump(issues, issue_data)
    issue_id = None
    issue_url = None
    for issue in issues:
        if issue['t'] == title_id and int(issue['p']) == day:
            issue_id = issue['iss']
            break
    if issue_id:
        issue_url = '%s%s' % (ISSUE_URL, issue_id)
    else:
        raise IssueError
    return issue_url
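
IssueError is raised when no issue of the title appeared on the requested date; the class itself is defined elsewhere in the module, presumably as a plain Exception subclass along these lines:

class IssueError(Exception):
    '''Raised when no issue exists for the given title and date.'''
    pass

Callers can then treat a missing issue as a normal condition:

try:
    issue_url = get_issue_url('1925-01-01', '35')
except IssueError:
    issue_url = None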
Example 7
def get_issue_date(issue_id):
    issue_url = '%s%s' % (ISSUE_URL, issue_id)
    response = get_url(issue_url)
    page = BeautifulSoup(response.read())
    issue_date = page.find('div', 'issue').strong.string
    issue_datetime = parse_date(issue_date)
    return issue_datetime
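
parse_date() isn't shown in these examples; since the string scraped from the issue page is a human-readable date heading, a reasonable stand-in is dateutil's parser:

from dateutil import parser as date_parser

def parse_date(date_string):
    # Hypothetical helper: turn the scraped heading into a datetime object.
    return date_parser.parse(date_string)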
Example 8
def get_issue_url(date, title_id):
    '''
    Gets the issue url given a title and date.
    
    >>> get_issue_url(datetime.date(1925,1,1), '35')
    u'http://trove.nla.gov.au/ndp/del/issue/120168'
    
    '''
    if type(date) is datetime.date:
        year, month, day = date.timetuple()[:3]
    else:
        year, month, day = (int(num) for num in date.split('-'))
    data_file = os.path.join(ISSUE_DATA_DIR, '%s-%s.js' % (year, month))
    if os.path.exists(data_file):
        with open(data_file, 'rb') as issue_data:
            issues = json.load(issue_data)
    else:
        url = '%s%s/%02d' % (MONTH_ISSUES_URL, year, month)
        issues = json.load(get_url(url))
        with open(data_file, 'wb') as issue_data:
            json.dump(issues, issue_data)
    issue_id = None
    issue_url = None
    for issue in issues:
        if issue['t'] == title_id and int(issue['p']) == day:
            issue_id = issue['iss']
            break
    if issue_id:
        issue_url = '%s%s' % (ISSUE_URL, issue_id)
    else:
        raise IssueError
    return issue_url
Example 9
def get_issue_url(date, title_id):
    '''
    Gets the issue url given a title and date.
    
    >>> get_issue_url(datetime.date(1925,1,1), '35')
    u'http://trove.nla.gov.au/ndp/del/issue/120168'
    
    '''
    if type(date) is datetime.date:
        year, month, day = date.timetuple()[:3]
    else:
        year, month, day = (int(num) for num in date.split('-'))
    url = '%s%s/%02d' % (MONTH_ISSUES_URL, year, month)
    issues = json.load(get_url(url))
    issue_id = None
    issue_url = None
    for issue in issues:
        if issue['t'] == title_id and int(issue['p']) == day:
            issue_id = issue['iss']
            break
    if issue_id:
        issue_url = '%s%s' % (ISSUE_URL, issue_id)
    else:
        raise IssueError
    return issue_url
Example 10
def get_issue_date(issue_id):
    issue_url = '%s%s' % (ISSUE_URL, issue_id)
    response = get_url(issue_url)
    page = BeautifulSoup(response.read())
    issue_date = page.find('div', 'issue').strong.string
    issue_datetime = parse_date(issue_date)
    return issue_datetime
Example 11
def get_title_issues(title, year):
    title_url = '%s%s' % (TITLE_HOLDINGS_URL, title)
    holdings = json.load(get_url(title_url))
    issues = []
    for month in holdings:
        if month['y'] == str(year):
            month_url = '%s%s/%s' % (MONTH_ISSUES_URL, month['y'], month['m'])
            print month_url
            month_issues = json.load(get_url(month_url))
            for issue in month_issues:
                if issue['t'] == str(title):
                    issue_date = get_issue_date(issue['iss'])
                    issues.append({
                        'id': issue['iss'],
                        'date': issue_date.isoformat()
                    })
    return issues
Example 12
def get_issue_totals_years(title_id):
    url = '%s%s/?encoding=json&key=%s&include=years' % (TROVE_TITLE_URL, title_id, TROVE_KEY)
    print url
    results = json.load(get_url(url))
    issues = {}
    for year in results['newspaper']['year']:
        issues[int(year['date'])] = int(year['issuecount'])
    return issues
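
Unlike the scraping helpers above, this one calls the public Trove API: the include=years response carries a newspaper record with a list of {'date': year, 'issuecount': count} entries, which the loop flattens into a plain dict keyed by year:

# Maps each year to its issue count, e.g. {1857: 142} for a single-year title.
issues_per_year = get_issue_totals_years('32')  # assumes the same title id scheme as above
for year in sorted(issues_per_year):
    print year, issues_per_year[year]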
Example 13
def get_issue_totals_years(title_id):
    url = '%s%s/?encoding=json&key=%s&include=years' % (TROVE_TITLE_URL,
                                                        title_id, TROVE_KEY)
    print url
    results = json.load(get_url(url))
    issues = {}
    for year in results['newspaper']['year']:
        issues[int(year['date'])] = int(year['issuecount'])
    return issues
Example 14
def sample_front_pages(size='thumb'):
    '''
    Retrieve a front page image for every title at monthly intervals.
    '''
    titles = json.load(get_url(TITLES_URL))
    for title in titles:
        print 'Processing: %s' % title['name']
        directory = '%ssamples/%s-%s' % (HARVEST_DIR, title['id'],
                                         title['name'])
        if not os.path.exists(directory):
            os.makedirs(directory)
            title_url = '%s%s' % (TITLE_HOLDINGS_URL, title['id'])
            holdings = json.load(get_url(title_url))
            for month in holdings:
                month_url = '%s%s/%s' % (MONTH_ISSUES_URL, month['y'],
                                         month['m'])
                issues = json.load(get_url(month_url))
                for issue in issues:
                    if issue['t'] == title['id']:
                        first_issue = issue
                        break
                first_issue_id = first_issue['iss']
                first_issue_date = datetime.date(int(month['y']),
                                                 int(month['m']),
                                                 int(first_issue['p']))
                print 'Checking date: %s' % first_issue_date.isoformat()
                page_id = get_front_page_id(first_issue_date, title['id'])
                filename = '%s/%s-%s-%s-%s-p1.jpg' % (
                    directory, first_issue_id, first_issue_date.isoformat(),
                    page_id, size)
                if not os.path.exists(filename):
                    image = get_front_page_image(None,
                                                 None,
                                                 page_id,
                                                 size=size)
                    if image:
                        print 'Saving: %s' % filename
                        with open(filename, 'wb') as f:
                            f.write(image)
Example 15
def get_front_page_url(date, title_id):
    '''
    Gets the url of the front page given a date and a title

    >>> get_front_page_url(datetime.date(1925,1,1), '35')
    'http://trove.nla.gov.au/ndp/del/page/1223077'
    
    >>> get_front_page_url('1925-01-01', '35')
    'http://trove.nla.gov.au/ndp/del/page/1223077'
    
    '''
    issue_url = get_issue_url(date, title_id)
    response = get_url(issue_url)
    return response.geturl()
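
Requesting the issue URL and reading response.geturl() relies on the server redirecting straight to page one of that issue (standard urllib2 behaviour is to follow the redirect and report the final URL), which matches the doctest. Since get_issue_url() can raise IssueError, a cautious caller might write:

try:
    page_url = get_front_page_url('1925-01-01', '35')
except IssueError:
    page_url = None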
Example 16
def get_front_page_image(date, title_id, page_id=None, size='small'):
    '''
    Retrieves jpg of front page.
    Small images are about 300px wide.
    Thumbs are 150px high.  
    '''
    if not page_id:
        page_id = get_front_page_id(date, title_id)
    if size == 'small':
        image_url = '%s%s' % (scrape.IMAGE_PATH, page_id)
    elif size == 'thumb':
        image_url = '%s%s/thumb' % (scrape.IMAGE_PATH, page_id)
    response = get_url(image_url)
    return response.read()
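
scrape.IMAGE_PATH supplies the base image URL; given that, the returned bytes can be written straight to disk:

# Save the small (~300px wide) front page for a given date and title.
image = get_front_page_image(datetime.date(1925, 1, 1), '35', size='small')
with open('frontpage-1925-01-01-35-small.jpg', 'wb') as f:
    f.write(image)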
Example 17
def get_front_page_url(date, title_id):
    '''
    Gets the url of the front page given a date and a title

    >>> get_front_page_url(datetime.date(1925,1,1), '35')
    'http://trove.nla.gov.au/ndp/del/page/1223077'
    
    >>> get_front_page_url('1925-01-01', '35')
    'http://trove.nla.gov.au/ndp/del/page/1223077'
    
    '''
    issue_url = get_issue_url(date, title_id)
    response = get_url(issue_url)
    return response.geturl()
Example 18
def get_titles(locate=False):
    '''
    Retrieves a list of current newspaper titles from Trove.
    Retrieves current holdings details about each title.
    Saves details of newspapers with holdings to a list.
    Returns a list of dictionaries with the following fields:
    name, id, state, start_year, start_month, end_year, end_month.
    '''
    title_list = json.load(get_url(TITLES_URL))
    titles = []
    for title in title_list:
        name = title['name']
        print unicode(name).encode('utf-8')
        try:
            place, state = re.search(
                r'\(([a-zA-Z \.]+, )*?(National|ACT|NSW|NT|Qld|QLD|SA|Tas|TAS|Vic|VIC|WA)\.*?',
                name).groups()
        except AttributeError:
            place = None
            state = 'national'
        if locate and place is None and state != 'national':
            locate_title(name)
        url = '%s%s' % (TITLE_HOLDINGS_URL, title['id'])
        holdings = json.load(get_url(url))
        # Only save titles that have holdings online
        if len(holdings) > 0:
            titles.append({
                'name': name,
                'id': title['id'],
                'state': state,
                'place': place,
                'start_year': holdings[0]['y'],
                'start_month': holdings[0]['m'],
                'end_year': holdings[-1]['y'],
                'end_month': holdings[-1]['m'],
            })
    return titles
Example 19
def get_front_page_image(date, title_id, page_id=None, size='small'):
    '''
    Retrieves jpg of front page.
    Small images are about 300px wide.
    Thumbs are 150px high.  
    '''
    if not page_id:
        page_id = get_front_page_id(date, title_id)
    if size == 'small':
        image_url = '%s%s' % (scrape.IMAGE_PATH, page_id)
    elif size == 'thumb':
        image_url = '%s%s/thumb' % (scrape.IMAGE_PATH, page_id)
    try:
        response = get_url(image_url)
    except HTTPError:
        return None
    else:
        return response.read()
Example 20
def get_front_page_totals():
    categories = {
        'Article': 'article',
        'Advertising': 'advertising',
        'Detailed lists, results, guides': 'lists',
        'Family Notices': 'family',
        'Literature': 'literature'
    }
    output_dir = os.path.join(HARVEST_DIR, 'frontpages')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    #newspapers = []
    titles = []
    titles_file = os.path.join(output_dir, 'titles.js')
    results = json.load(
        get_url('%s?encoding=json&key=%s' % (TROVE_TITLES_URL, TROVE_KEY)))
    for newspaper_result in results['response']['records']['newspaper']:
        titles.append([newspaper_result['id'], newspaper_result['title']])
        with open(titles_file, 'wb') as titles_js:
            titles_js.write('var titles = %s;' % json.dumps(titles))
    for newspaper_result in results['response']['records']['newspaper']:
        id = newspaper_result['id']
        print 'Processing: %s' % newspaper_result['title']
        newspaper_dir = os.path.join(output_dir, id)
        if not os.path.exists(newspaper_dir):
            os.makedirs(newspaper_dir)
        years_file = os.path.join(newspaper_dir, 'year_totals.js')
        if not os.path.exists(years_file):
            issues_years = get_issue_totals_years(id)
            #newspaper['years'] = {}
            start_date = datetime.date(
                *map(int, re.split('[^\d]', newspaper_result['startDate'])))
            end_date = datetime.date(
                *map(int, re.split('[^\d]', newspaper_result['endDate'])))
            #for each year get month summaries
            year_totals = {}
            num_issues_year = {}
            for year in range(start_date.year, end_date.year + 1):
                print 'Year: %s' % year
                year_totals[year] = {}
                num_issues_year[year] = 0
                num_issues_month = {}
                year_dir = os.path.join(newspaper_dir, str(year))
                if not os.path.exists(year_dir):
                    os.makedirs(year_dir)
                '''
                # First we need to get the number of issues per month
                url = '%s%s/?encoding=json&key=%s&include=years&range=%s0101-%s1231' % (TROVE_TITLE_URL, newspaper['id'], TROVE_KEY, year, year)
                results = json.load(get_url(url))
                for year_issues in results['newspaper']['year']:
                    if year_issues['date'] == str(year):
                        issues_months = {}
                        for issue in year_issues['issue']:
                            issue_date = datetime.date(*map(int, re.split('[^\d]', issue['date'])))
                            try:
                                issues_months[issue_date.month] += 1
                            except KeyError:
                                issues_months[issue_date.month] = 1
                '''
                # Then we can get article details per month
                year_file = os.path.join(newspaper_dir, '%s.js' % year)
                if not os.path.exists(year_file):
                    print 'Getting article details...'
                    month_totals = {}
                    for month in range(1, 13):
                        month_totals[month] = {}
                        issue_totals = {}
                        article_list = {}
                        print 'Month: %s' % month
                        month_totals[month] = {}
                        url = '%s&encoding=json&key=%s&q=firstpageseq:1&l-title=%s&l-year=%s&l-month=%02d&reclevel=full&n=100' % (
                            TROVE_API_URL, TROVE_KEY, id, year, month)
                        results = json.load(get_url(url))
                        total = int(
                            results['response']['zone'][0]['records']['total'])
                        if total > 0:
                            articles = results['response']['zone'][0][
                                'records']['article']
                            if total > 100:
                                n = 100
                                s = 0
                                while n == 100:
                                    next_url = '%s&s=%s' % (url, n + s)
                                    print next_url
                                    results = json.load(get_url(next_url))
                                    s = int(results['response']['zone'][0]
                                            ['records']['s'])
                                    n = int(results['response']['zone'][0]
                                            ['records']['n'])
                                    if n > 0:
                                        articles.extend(
                                            results['response']['zone'][0]
                                            ['records']['article'])
                            for article in articles:
                                article_date = datetime.date(*map(
                                    int, re.split('[^\d]', article['date'])))
                                #Calculate totals for the month
                                if article['category'] != 'Other':
                                    cat = categories[article['category']]
                                    try:
                                        year_totals[year][cat]['total'] += 1
                                        year_totals[year][cat][
                                            'words'] += article['wordCount']
                                    except KeyError:
                                        year_totals[year][cat] = {}
                                        year_totals[year][cat]['total'] = 1
                                        year_totals[year][cat][
                                            'words'] = article['wordCount']
                                    try:
                                        month_totals[month][cat]['total'] += 1
                                        month_totals[month][cat][
                                            'words'] += article['wordCount']
                                    except KeyError:
                                        month_totals[month][cat] = {}
                                        month_totals[month][cat]['total'] = 1
                                        month_totals[month][cat][
                                            'words'] = article['wordCount']
                                    # Calculate totals for each issue
                                    try:
                                        issue_totals[
                                            article['date']][cat]['total'] += 1
                                        issue_totals[article['date']][cat][
                                            'words'] += article['wordCount']
                                    except KeyError:
                                        try:
                                            issue_totals[
                                                article['date']][cat] = {}
                                            issue_totals[article['date']][cat][
                                                'total'] = 1
                                            issue_totals[article['date']][cat][
                                                'words'] = article['wordCount']
                                        except KeyError:
                                            issue_totals[article['date']] = {}
                                            issue_totals[
                                                article['date']][cat] = {}
                                            issue_totals[article['date']][cat][
                                                'total'] = 1
                                            issue_totals[article['date']][cat][
                                                'words'] = article['wordCount']
                                    article_details = {
                                        'date': article['date'],
                                        'heading': article['heading'],
                                        'category': article['category'],
                                        'word_count': article['wordCount'],
                                        'url': article['identifier']
                                    }
                                    try:
                                        article_list[article['date']][
                                            'page_url'] = article[
                                                'trovePageUrl']
                                    except KeyError:
                                        article_list[article['date']] = {}
                                        article_list[article['date']][
                                            'page_url'] = article[
                                                'trovePageUrl']
                                    try:
                                        article_list[article['date']][
                                            'articles'].append(article_details)
                                    except KeyError:
                                        article_list[
                                            article['date']]['articles'] = []
                                        article_list[article['date']][
                                            'articles'].append(article_details)
                        for date, details in article_list.items():
                            with open(os.path.join(year_dir, '%s.js' % date),
                                      'wb') as date_js:
                                json.dump(details, date_js)
                        num_issues_month[month] = len(article_list)
                        num_issues_year[year] += len(article_list)
                        month_file = os.path.join(year_dir, '%s.js' % month)
                        with open(month_file, 'wb') as month_js:
                            json.dump(issue_totals, month_js)
                            '''
                            for category in categories.values():
                                total_list = []
                                words_list = []
                                for issue, values in issue_totals.items():
                                    try:
                                        total_list.append(('Date.UTC(%s, %s, %s)' % (issue.year, issue.month, issue.day), values[category]['total']))
                                        words_list.append(('Date.UTC(%s, %s, %s)' % (issue.year, issue.month, issue.day), values[category]['words']))
                                    except KeyError:
                                        total_list.append(('Date.UTC(%s, %s, %s)' % (issue.year, issue.month, issue.day), 0))
                                        words_list.append(('Date.UTC(%s, %s, %s)' % (issue.year, issue.month, issue.day), 0))
                                month_js.write('var %s_totals = %s;\n' % (category, json.dumps(total_list)))
                                month_js.write('var %s_words = %s;\n' % (category, json.dumps(words_list)))
                            month_js.write('var articles = %s;\n' % json.dumps(article_list))
                            '''
                    for month, values in month_totals.items():
                        num_issues = num_issues_month[month]
                        for cat, totals in values.items():
                            total = totals['total']
                            words = totals['words']
                            if total > 0:
                                totals['total'] = float(total) / num_issues
                            if words > 0:
                                totals['words'] = float(words) / num_issues
                    with open(year_file, 'wb') as year_js:
                        json.dump(month_totals, year_js)
                        '''
                        for category in categories.values():
                            total_list = []
                            words_list = []
                            for month, values in month_totals.items():
                                try:
                                    total = values[category]['total']
                                    words = values[category]['words']
                                except KeyError:
                                    total = 0
                                    words = 0
                                num_issues = num_issues_month[month]
                                if total > 0: total = float(total) / num_issues
                                if words > 0: words = float(words) / num_issues
                                total_list.append((month, total))
                                words_list.append((month, words))
                            year_js.write('var %s_totals = %s;\n' % (category, json.dumps(total_list))) 
                            year_js.write('var %s_words = %s;\n' % (category, json.dumps(words_list)))
                            '''
                        #print 'No %s' % category
                    # Then we can get articles by month facets
                    '''
                    print 'Getting totals by month...'
                    newspaper['years'][year]['months'] = {}
                    for category, label in categories.items():
                        #print url
                        url = '%s&encoding=json&key=%s&q=firstpageseq:1&l-title=%s&l-category=%s&l-year=%s&facet=month&n=0' % (TROVE_API_URL, TROVE_KEY, newspaper['id'], quote_plus(category), year)
                        results = json.load(get_url(url))
                        try:
                            months = results['response']['zone'][0]['facets']['facet']['term']
                        except TypeError:
                            months = []
                        for month_result in months:
                            month = int(month_result['search'])
                            count = float(month_result['count'])
                            if count != 0:
                                try:
                                    count = count / issues[month]
                                except KeyError:
                                    count = 0
                            try:
                                newspaper['years'][year]['months'][month][label]['total'] = count
                            except KeyError:
                                try:
                                    newspaper['years'][year]['months'][month][label] = {}
                                    newspaper['years'][year]['months'][month][label]['total'] = count
                                except KeyError:
                                    newspaper['years'][year]['months'][month] = {}
                                    newspaper['years'][year]['months'][month][label] = {}
                                    newspaper['years'][year]['months'][month][label]['total'] = count
                            try:
                                newspaper['years'][year]['months'][month][label]['words'] = month_totals[month][label]['words']
                            except KeyError:
                                newspaper['years'][year]['months'][month][label]['words'] = 0
                    year_file = os.path.join(newspaper_dir, '%s.js' % year)
                    print 'Writing %s' % year_file
                    with open(year_file, 'wb') as year_js:
                        for category in categories.values():
                            try:
                                totals = [(month, values[category]['total']) for month, values in newspaper['years'][year]['months'].items()]
                                print totals
                                year_js.write('var %s_totals = %s;\n' % (category, json.dumps(totals))) 
                                words = [(month, values[category]['words']) for month, values in newspaper['years'][year]['months'].items()]
                                year_js.write('var %s_words = %s;\n' % (category, json.dumps(words)))
                            except KeyError:
                                print 'No %s' % category
                # for each decade get year summaries
                print 'Getting totals by year...'
                start_decade = str(start_date.year)[:3]
                end_decade = str(end_date.year)[:3]
                for decade in range(int(start_decade), int(end_decade)+1):
                    for category, label in categories.items():
                        url = '%s&encoding=json&key=%s&q=firstpageseq:1&l-title=%s&l-category=%s&l-decade=%s&facet=year&n=0' % (TROVE_API_URL, TROVE_KEY, newspaper['id'], quote_plus(category), decade)
                        for num in range(0,10):
                            year = int('%s%s' % (decade, num))
                            try:
                                newspaper['years'][year][label] = {}
                            except KeyError:
                                newspaper['years'][year] = {}
                                newspaper['years'][year][label] = {}
                        results = json.load(get_url(url))
                        try:
                            years = results['response']['zone'][0]['facets']['facet']['term']
                        except TypeError:
                            years = []
                        for year_result in years:
                            year = int(year_result['display'])
                            count = float(year_result['count'])
                            if count != 0:
                                count = count / newspaper['issues'][year]
                            newspaper['years'][year][label]['total'] = count
                            try:
                                newspaper['years'][year][label]['words'] = year_totals[year][label]['words']
                            except KeyError:
                                newspaper['years'][year][label]['words'] = 0
                    print 'Writing %s' % years_file
                    with open(years_file, 'wb') as years_js:
                        for category in categories.values():
                            try:
                                totals = [(year, values[category]['total']) for year, values in newspaper['years'].items()]
                                years_js.write('var %s_totals = %s;\n' % (category, json.dumps(totals)))
                                words = [(year, values[category]['words']) for year, values in newspaper['years'].items()]
                                years_js.write('var %s_words = %s;\n' % (category, json.dumps(words)))
                            except KeyError:
                                print 'No %s' % category
        '''
            print 'Getting totals for this year...'
            for year, values in year_totals.items():
                num_issues = num_issues_year[year]
                for cat, totals in values.items():
                    total = totals['total']
                    words = totals['words']
                    if total > 0: totals['total'] = float(total) / num_issues
                    if words > 0: totals['words'] = float(words) / num_issues
            with open(years_file, 'wb') as years_js:
                json.dump(year_totals, years_js)
            '''
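
The paging logic buried in the middle of this function keeps requesting the next 100 records until the API returns a short page. Factored out on its own, and assuming the same get_url() helper and zone/records response shape used above, the pattern looks roughly like this:

def get_all_articles(url):
    # Trove returns at most 100 records per request, so keep asking for the
    # next block (via the s parameter) until fewer than 100 come back.
    results = json.load(get_url(url))
    records = results['response']['zone'][0]['records']
    total = int(records['total'])
    articles = records['article'] if total > 0 else []
    if total > 100:
        n, s = 100, 0
        while n == 100:
            results = json.load(get_url('%s&s=%s' % (url, n + s)))
            records = results['response']['zone'][0]['records']
            s = int(records['s'])
            n = int(records['n'])
            if n > 0:
                articles.extend(records['article'])
    return articles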
Example 21
def get_front_page_totals():
    categories = {'Article': 'article', 'Advertising': 'advertising', 'Detailed lists, results, guides': 'lists', 'Family Notices': 'family', 'Literature': 'literature'}
    output_dir = os.path.join(HARVEST_DIR, 'frontpages')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)   
    #newspapers = []
    titles = []
    titles_file = os.path.join(output_dir, 'titles.js')
    results = json.load(get_url('%s?encoding=json&key=%s' % (TROVE_TITLES_URL, TROVE_KEY)))
    for newspaper_result in results['response']['records']['newspaper']:
        titles.append([newspaper_result['id'], newspaper_result['title']])
        with open(titles_file, 'wb') as titles_js:
            titles_js.write('var titles = %s;' % json.dumps(titles))
    for newspaper_result in results['response']['records']['newspaper']:
        id = newspaper_result['id']
        print 'Processing: %s' % newspaper_result['title']
        newspaper_dir = os.path.join(output_dir, id)
        if not os.path.exists(newspaper_dir):
            os.makedirs(newspaper_dir)
        years_file = os.path.join(newspaper_dir, 'year_totals.js')
        if not os.path.exists(years_file):
            issues_years = get_issue_totals_years(id)
            #newspaper['years'] = {}
            start_date = datetime.date(*map(int, re.split('[^\d]', newspaper_result['startDate'])))
            end_date = datetime.date(*map(int, re.split('[^\d]', newspaper_result['endDate'])))
            #for each year get month summaries
            year_totals = {}
            num_issues_year = {}
            for year in range(start_date.year, end_date.year+1):
                print 'Year: %s' % year
                year_totals[year] = {}
                num_issues_year[year] = 0
                num_issues_month = {}
                year_dir = os.path.join(newspaper_dir, str(year))
                if not os.path.exists(year_dir):
                    os.makedirs(year_dir)
                '''
                # First we need to get the number of issues per month
                url = '%s%s/?encoding=json&key=%s&include=years&range=%s0101-%s1231' % (TROVE_TITLE_URL, newspaper['id'], TROVE_KEY, year, year)
                results = json.load(get_url(url))
                for year_issues in results['newspaper']['year']:
                    if year_issues['date'] == str(year):
                        issues_months = {}
                        for issue in year_issues['issue']:
                            issue_date = datetime.date(*map(int, re.split('[^\d]', issue['date'])))
                            try:
                                issues_months[issue_date.month] += 1
                            except KeyError:
                                issues_months[issue_date.month] = 1
                '''
                # Then we can get article details per month
                year_file = os.path.join(newspaper_dir, '%s.js' % year)
                if not os.path.exists(year_file):
                    print 'Getting article details...'
                    month_totals = {}
                    for month in range(1, 13):
                        month_totals[month] = {}
                        issue_totals = {}
                        article_list = {}
                        print 'Month: %s' % month
                        month_totals[month] = {}
                        url = '%s&encoding=json&key=%s&q=firstpageseq:1&l-title=%s&l-year=%s&l-month=%02d&reclevel=full&n=100' % (TROVE_API_URL, TROVE_KEY, id, year, month)
                        results = json.load(get_url(url))
                        total = int(results['response']['zone'][0]['records']['total'])
                        if total > 0:
                            articles = results['response']['zone'][0]['records']['article']
                            if total > 100:
                                n = 100
                                s = 0
                                while n == 100:
                                    next_url = '%s&s=%s' % (url, n+s)
                                    print next_url
                                    results = json.load(get_url(next_url))
                                    s = int(results['response']['zone'][0]['records']['s'])
                                    n = int(results['response']['zone'][0]['records']['n'])
                                    if n > 0:
                                        articles.extend(results['response']['zone'][0]['records']['article'])
                            for article in articles:
                                article_date = datetime.date(*map(int, re.split('[^\d]', article['date'])))
                                #Calculate totals for the month
                                if article['category'] != 'Other':
                                    cat = categories[article['category']]
                                    try:
                                        year_totals[year][cat]['total'] += 1
                                        year_totals[year][cat]['words'] += article['wordCount']
                                    except KeyError:
                                        year_totals[year][cat] = {}
                                        year_totals[year][cat]['total'] = 1
                                        year_totals[year][cat]['words'] = article['wordCount']
                                    try:
                                        month_totals[month][cat]['total'] += 1
                                        month_totals[month][cat]['words'] += article['wordCount']
                                    except KeyError:
                                        month_totals[month][cat] = {}
                                        month_totals[month][cat]['total'] = 1
                                        month_totals[month][cat]['words'] = article['wordCount']
                                    # Calculate totals for each issue
                                    try:
                                        issue_totals[article['date']][cat]['total'] += 1
                                        issue_totals[article['date']][cat]['words'] += article['wordCount']
                                    except KeyError:
                                        try:
                                            issue_totals[article['date']][cat] = {}
                                            issue_totals[article['date']][cat]['total'] = 1
                                            issue_totals[article['date']][cat]['words'] = article['wordCount']
                                        except KeyError:
                                            issue_totals[article['date']] = {}
                                            issue_totals[article['date']][cat] = {}
                                            issue_totals[article['date']][cat]['total'] = 1
                                            issue_totals[article['date']][cat]['words'] = article['wordCount']
                                    article_details = {'date': article['date'], 'heading': article['heading'], 'category': article['category'], 'word_count': article['wordCount'], 'url': article['identifier']}
                                    try:
                                        article_list[article['date']]['page_url'] = article['trovePageUrl']
                                    except KeyError:
                                        article_list[article['date']] = {}
                                        article_list[article['date']]['page_url'] = article['trovePageUrl']
                                    try:
                                        article_list[article['date']]['articles'].append(article_details)
                                    except KeyError:
                                        article_list[article['date']]['articles'] = []
                                        article_list[article['date']]['articles'].append(article_details)
                        for date, details in article_list.items():
                            with open(os.path.join(year_dir, '%s.js' % date), 'wb') as date_js:
                                json.dump(details, date_js)                
                        num_issues_month[month] = len(article_list)
                        num_issues_year[year] += len(article_list)
                        month_file = os.path.join(year_dir, '%s.js' % month)
                        with open(month_file, 'wb') as month_js:
                            json.dump(issue_totals, month_js)
                            '''
                            for category in categories.values():
                                total_list = []
                                words_list = []
                                for issue, values in issue_totals.items():
                                    try:
                                        total_list.append(('Date.UTC(%s, %s, %s)' % (issue.year, issue.month, issue.day), values[category]['total']))
                                        words_list.append(('Date.UTC(%s, %s, %s)' % (issue.year, issue.month, issue.day), values[category]['words']))
                                    except KeyError:
                                        total_list.append(('Date.UTC(%s, %s, %s)' % (issue.year, issue.month, issue.day), 0))
                                        words_list.append(('Date.UTC(%s, %s, %s)' % (issue.year, issue.month, issue.day), 0))
                                month_js.write('var %s_totals = %s;\n' % (category, json.dumps(total_list)))
                                month_js.write('var %s_words = %s;\n' % (category, json.dumps(words_list)))
                            month_js.write('var articles = %s;\n' % json.dumps(article_list))
                            '''
                    for month, values in month_totals.items():
                        num_issues = num_issues_month[month]
                        for cat, totals in values.items():
                            total = totals['total']
                            words = totals['words']
                            if total > 0: totals['total'] = float(total) / num_issues
                            if words > 0: totals['words'] = float(words) / num_issues
                    with open(year_file, 'wb') as year_js:
                        json.dump(month_totals, year_js)
                        '''
                        for category in categories.values():
                            total_list = []
                            words_list = []
                            for month, values in month_totals.items():
                                try:
                                    total = values[category]['total']
                                    words = values[category]['words']
                                except KeyError:
                                    total = 0
                                    words = 0
                                num_issues = num_issues_month[month]
                                if total > 0: total = float(total) / num_issues
                                if words > 0: words = float(words) / num_issues
                                total_list.append((month, total))
                                words_list.append((month, words))
                            year_js.write('var %s_totals = %s;\n' % (category, json.dumps(total_list))) 
                            year_js.write('var %s_words = %s;\n' % (category, json.dumps(words_list)))
                            '''
                                #print 'No %s' % category
                    # Then we can get articles by month facets
                    '''
                    print 'Getting totals by month...'
                    newspaper['years'][year]['months'] = {}
                    for category, label in categories.items():
                        #print url
                        url = '%s&encoding=json&key=%s&q=firstpageseq:1&l-title=%s&l-category=%s&l-year=%s&facet=month&n=0' % (TROVE_API_URL, TROVE_KEY, newspaper['id'], quote_plus(category), year)
                        results = json.load(get_url(url))
                        try:
                            months = results['response']['zone'][0]['facets']['facet']['term']
                        except TypeError:
                            months = []
                        for month_result in months:
                            month = int(month_result['search'])
                            count = float(month_result['count'])
                            if count != 0:
                                try:
                                    count = count / issues[month]
                                except KeyError:
                                    count = 0
                            try:
                                newspaper['years'][year]['months'][month][label]['total'] = count
                            except KeyError:
                                try:
                                    newspaper['years'][year]['months'][month][label] = {}
                                    newspaper['years'][year]['months'][month][label]['total'] = count
                                except KeyError:
                                    newspaper['years'][year]['months'][month] = {}
                                    newspaper['years'][year]['months'][month][label] = {}
                                    newspaper['years'][year]['months'][month][label]['total'] = count
                            try:
                                newspaper['years'][year]['months'][month][label]['words'] = month_totals[month][label]['words']
                            except KeyError:
                                newspaper['years'][year]['months'][month][label]['words'] = 0
                    year_file = os.path.join(newspaper_dir, '%s.js' % year)
                    print 'Writing %s' % year_file
                    with open(year_file, 'wb') as year_js:
                        for category in categories.values():
                            try:
                                totals = [(month, values[category]['total']) for month, values in newspaper['years'][year]['months'].items()]
                                print totals
                                year_js.write('var %s_totals = %s;\n' % (category, json.dumps(totals))) 
                                words = [(month, values[category]['words']) for month, values in newspaper['years'][year]['months'].items()]
                                year_js.write('var %s_words = %s;\n' % (category, json.dumps(words)))
                            except KeyError:
                                print 'No %s' % category
                # for each decade get year summaries
                print 'Getting totals by year...'
                start_decade = str(start_date.year)[:3]
                end_decade = str(end_date.year)[:3]
                for decade in range(int(start_decade), int(end_decade)+1):
                    for category, label in categories.items():
                        url = '%s&encoding=json&key=%s&q=firstpageseq:1&l-title=%s&l-category=%s&l-decade=%s&facet=year&n=0' % (TROVE_API_URL, TROVE_KEY, newspaper['id'], quote_plus(category), decade)
                        for num in range(0,10):
                            year = int('%s%s' % (decade, num))
                            try:
                                newspaper['years'][year][label] = {}
                            except KeyError:
                                newspaper['years'][year] = {}
                                newspaper['years'][year][label] = {}
                        results = json.load(get_url(url))
                        try:
                            years = results['response']['zone'][0]['facets']['facet']['term']
                        except TypeError:
                            years = []
                        for year_result in years:
                            year = int(year_result['display'])
                            count = float(year_result['count'])
                            if count != 0:
                                count = count / newspaper['issues'][year]
                            newspaper['years'][year][label]['total'] = count
                            try:
                                newspaper['years'][year][label]['words'] = year_totals[year][label]['words']
                            except KeyError:
                                newspaper['years'][year][label]['words'] = 0
                    print 'Writing %s' % years_file
                    with open(years_file, 'wb') as years_js:
                        for category in categories.values():
                            try:
                                totals = [(year, values[category]['total']) for year, values in newspaper['years'].items()]
                                years_js.write('var %s_totals = %s;\n' % (category, json.dumps(totals)))
                                words = [(year, values[category]['words']) for year, values in newspaper['years'].items()]
                                years_js.write('var %s_words = %s;\n' % (category, json.dumps(words)))
                            except KeyError:
                                print 'No %s' % category
        '''
            print 'Getting totals for this year...'
            for year, values in year_totals.items():
                        num_issues = num_issues_year[year]
                        for cat, totals in values.items():
                            total = totals['total']
                            words = totals['words']
                            if total > 0: totals['total'] = float(total) / num_issues
                            if words > 0: totals['words'] = float(words) / num_issues
            with open(years_file, 'wb') as years_js:
                json.dump(year_totals, years_js)
            '''