# Example #1
# 0
def cycle():
    # for group in range(1,16526, 5):
    for group in range(1,5,5):
        print "----------------",group,"----------------"

        for page in range(group, group+5):

            # wait two seconds between every page request
            time.sleep(2)

            url = "http://eric.ed.gov/?q=education&ft=on&pg=%d" %page
            print url

            # get html for search results page
            html = pq(url=url)
            publications = html(".r_i")

            # process each of the 15 publications on a given search results page
            for i in range(0,15):

                current_pub = publications.eq(i)

                # initialize pub_dict to hold all of the pub's data
                pub_dict = {}

                # get eric_id, title, and pdf link from search results page
                eric_id = current_pub.attr("id")[2:].encode('ascii', 'replace')
                title = current_pub.find('.r_t').text().encode('ascii', 'replace')
                link = current_pub.find('.r_f').find('a').attr("href").encode('ascii', 'replace')

                # get authors and source/year (split on hyphen)
                auth_src_yr = current_pub.find('.r_a').text()
                auth_src_yr_split = auth_src_yr.split(u'\u2013 ')

                # if there is a hyphen, process the portion before the hyphen
                # to get a list of authors (split on semicolons)
                if len(auth_src_yr_split) > 1:
                    u_authors = auth_src_yr_split[0].split('; ')

                    authors = []
                    for u_auth in u_authors:
                        auth = u_auth.encode('ascii', 'replace')
                        auth = auth.strip()
                        authors.append(auth)
                    # process line after the hyphen to get source and year
                    source = auth_src_yr_split[1][:-6].encode('ascii', 'replace')
                    year = auth_src_yr_split[1][-4:].encode('ascii', 'replace')

                # if there is not a hyphen, authors is empty
                # process line to get source and year
                else:
                    authors = []
                    source = auth_src_yr_split[0][:-6].encode('ascii', 'replace')
                    year = auth_src_yr_split[0][-4:].encode('ascii', 'replace')

                # pull the short description from the search results page
                # get rid of weird \r\n formatting, and fix ellipses at the end
                u_short_desc = current_pub.find('.r_d').text()
                u_short_desc_split = u_short_desc.split(u'\r\n')
                u_short_desc = ''
                for item in u_short_desc_split:
                    u_short_desc += ' ' + item

                if u_short_desc[-1:] == u'\u2026':
                    short_desc = u_short_desc[:-1].encode('ascii', 'replace') + '...'
                else:
                    short_desc = u_short_desc.encode('ascii', 'replace')

                # pull the short list of descriptors from the search results page
                descriptors = current_pub.find('.keywords').text()[13:].encode('ascii', 'replace').split(', ')

                # populate the pub_dict
                pub_dict["eric_id"] = eric_id
                pub_dict["title"] = title
                pub_dict["url"] = link
                pub_dict["authors"] = authors
                pub_dict["source"] = source
                pub_dict["year"] = year
                pub_dict["short_desc"] = short_desc
                pub_dict["descriptors"] = descriptors

                # add the publication entry (and all related info) to DB
                update.add_source(source)
                update.add_publication(pub_dict)
# Example #2
# 0
def cycle():

    # map months to integers
    month_dict = {
        'Jan':1, 'January':1,
        'Feb':2, 'February':2,
        'Mar':3, 'March':3,
        'Apr':4, 'April':4,
        'May':5,
        'Jun':6, 'June':6,
        'Jul':7, 'July':7,
        'Aug':8, 'August':8,
        'Sep':9, 'Sept':9, 'September':9,
        'Oct':10, 'October':10,
        'Nov':11, 'November':11,
        'Dec':12, 'December':12
    }

    for group in range(2436,16526, 5):
    # for group in range(1,5,5):
        print "----------------",group,"----------------"

        for page in range(group, group+5):

            url = "http://eric.ed.gov/?q=education&ft=on&pg=%d" %page
            print url

            # get html for search results page
            html = pq(url=url)
            publications = html(".r_i")

            # process each of the 15 publications on a given search results page
            for i in range(0,15):

                # wait two seconds between every page request
                time.sleep(2)

                current_pub = publications.eq(i)

                # "deep url" goes in one more layer, to the detailed pub page
                deep_url = 'http://eric.ed.gov/' + current_pub.find('.r_t'
                    ).find('a').attr("href").encode('ascii', 'replace')
                deep_html = pq(url=deep_url)

                # initialize pub_dict to hold all of the pub's data
                pub_dict = {}

                # get eric_id, title, and pdf link from search results page
                eric_id = current_pub.attr("id")[2:].encode('ascii', 'replace')
                title = current_pub.find('.r_t').text().encode('ascii', 'replace')
                link = current_pub.find('.r_f').find('a').attr("href").encode('ascii', 'replace')

                # get authors and source/year (split on hyphen)
                auth_src_yr = current_pub.find('.r_a').text()
                auth_src_yr_split = auth_src_yr.split(u'\u2013 ')

                # if there is a hyphen, process the portion before the hyphen
                # to get a list of authors (split on semicolons)
                if len(auth_src_yr_split) > 1:
                    u_authors = auth_src_yr_split[0].split('; ')

                    authors = []
                    for u_auth in u_authors:
                        auth = u_auth.encode('ascii', 'replace')
                        auth = auth.strip()
                        authors.append(auth)
                    # process line after the hyphen to get source and year
                    source = auth_src_yr_split[1][:-6].encode('ascii', 'replace')
                    year = auth_src_yr_split[1][-4:].encode('ascii', 'replace')

                # if there is not a hyphen, authors is empty
                # process line to get source and year
                else:
                    authors = []
                    source = auth_src_yr_split[0][:-6].encode('ascii', 'replace')
                    year = auth_src_yr_split[0][-4:].encode('ascii', 'replace')

                try:
                    int(year)
                    print eric_id, year
                except ValueError:
                    print eric_id, "year is not an integer"
                    continue

                # pull the full description from the detailed pub page
                # get rid of weird \r\n formatting
                u_deep_desc = deep_html.find('.abstract').text()
                deep_desc_split = u_deep_desc.split(u'\r\n')
                deep_desc = ''
                for part in deep_desc_split:
                    deep_desc += part

                # pull the full list of descriptors from the detailed pub page
                deep_descriptors_html = str(deep_html.find('.keywords').find('a'))
                pattern = r"<a href=.*?>(.*?)</a>"
                deep_descriptors = re.findall(pattern, deep_descriptors_html)

                # pull citation count from detailed pub page
                deep_citation_html = str(deep_html.find('div#r_colR'))
                pattern = r"<div><strong>Reference Count:</strong> (.*?)</div>"
                deep_citation_count_raw = re.findall(pattern, deep_citation_html)[0]
                
                try:
                    deep_citation_count = int(deep_citation_count_raw)
                except ValueError:
                    deep_citation_count = 0

                # pull volume, issue, page, and month from detailed pub page
                deep_info = deep_html('.r_a').text().encode('ascii', 'replace')
                volume_pattern = r" v(\d*?) "
                issue_pattern = r" n(\d*?) "
                page_pattern = r" p(\d*?)-(\d*?) "
                date_pattern = r".*? ([a-zA-Z]*) \d\d\d\d"

                volume_list = re.findall(volume_pattern, deep_info)
                issue_list = re.findall(issue_pattern, deep_info)
                pages_list = re.findall(page_pattern, deep_info)
                month_str_list = re.findall(date_pattern, deep_info)

                # populate the pub_dict
                if volume_list:
                    deep_volume = volume_list[0]
                    pub_dict["volume"] = deep_volume
                else:
                    pub_dict["volume"] = None

                if issue_list:
                    deep_issue = issue_list[0]
                    pub_dict["issue"] = deep_issue
                else:
                    pub_dict["issue"] = None

                if pages_list:
                    if len(pages_list[0])==2:
                        deep_start_page = pages_list[0][0]
                        deep_end_page = pages_list[0][1]
                        pub_dict["start_page"] = deep_start_page
                        pub_dict["end_page"] = deep_end_page
                    else:
                        deep_start_page = pages_list[0][0]
                        pub_dict["start_page"] = deep_start_page
                        pub_dict["end_page"] = None
                else:
                    pub_dict["start_page"] = None
                    pub_dict["end_page"] = None

                if month_str_list:
                    if month_str_list[0] in month_dict:
                        deep_month = month_dict[month_str_list[0]]
                        pub_dict["month"] = deep_month
                    else:
                        pub_dict["month"] = None

                pub_dict["eric_id"] = eric_id
                pub_dict["title"] = title[:500]
                pub_dict["url"] = link
                pub_dict["authors"] = authors
                pub_dict["source"] = source[:100]
                pub_dict["year"] = year
                pub_dict["full_desc"] = deep_desc[:5000]
                pub_dict["descriptors"] = deep_descriptors
                pub_dict["citation_count"] = deep_citation_count

                # add the publication entry (and all related info) to DB
                update.add_source(source[:100])
                update.add_publication(pub_dict)