Example #1
0
def mini_crawler(url, page_parser_q, pull_info_q, links_visited, limiting_domain, index_list, parsing_default_domain):
    '''
    Crawl the college catalog and add to an index dictionary mapping sets of
    words to the associated course identifier.

    Inputs:
        url: starting url to begin crawling with
        page_parser_q: queue of urls in line to be crawled
        pull_info_q: queue of pages whose information still needs to be pulled
        links_visited: list of visited links
        limiting_domain: domain name
        index_list: dictionary that maps words to course identifiers
        parsing_default_domain: default domain for resolving urls (unused here)
    '''
    if url in links_visited:
        return
    request = util.get_request(url)
    if request is None:
        return
    post_url = util.get_request_url(request)
    if post_url in links_visited:
        return
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, features="html5lib")
    find_links(soup, url, post_url, pull_info_q, links_visited, limiting_domain)
    tag_list = soup.find_all("ul", attrs = {"class": "pagination"})
    current_page = tag_list[0].find_all("li", attrs = {"class": "current"})
    next_page = current_page[0].next_sibling.next_sibling.findChild()
    next_page_href = next_page.get('href')
    next_page_href = util.convert_if_relative_url(post_url, next_page_href)
    page_parser_q.put(next_page_href)
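Note: most of these examples call helpers from a util module that is not shown in this listing. A minimal sketch of what those helpers might look like, assuming util is a thin wrapper around requests and urllib.parse (the real module may differ), is:

# Hypothetical stand-ins for the util helpers used throughout these examples.
import requests
from urllib.parse import urljoin, urldefrag, urlparse

def get_request(url):
    # Return a response object, or None if the request fails.
    try:
        return requests.get(url, timeout=10)
    except requests.RequestException:
        return None

def read_request(request):
    # Return the body of the response as text ("" on failure).
    return request.text if request is not None else ""

def get_request_url(request):
    # Return the final url after any redirects.
    return request.url

def is_absolute_url(url):
    return bool(urlparse(url).netloc)

def convert_if_relative_url(current_url, new_url):
    # Resolve a possibly-relative url against the page it was found on.
    return urljoin(current_url, new_url)

def remove_fragment(url):
    # Strip the "#fragment" portion of a url.
    return urldefrag(url)[0]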
Example #2
0
def get_restaurant_links_chicago():
    # start from searching "Restaurant", "Chicago" from yelp main page
    page_suffix = [i for i in range(0, 231, 10)]
    url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start='
    url_list = []

    for suffix in page_suffix:
        page_url = url + str(suffix)
        url_list.append(page_url)

    links = []

    for url in url_list:

        request = util.get_request(url)
        text = util.read_request(request)

        soup = bs4.BeautifulSoup(text, "html5lib")
        tags = soup.find_all('a', href=True, target="", role="")

        # extract href links to restaurants
        for tag in tags:
            link = tag['href']
            link = util.convert_if_relative_url(url, link)
            link = util.remove_fragment(link)
            # Hardcoded filter
            if link and link[-11:] == "Restaurants":
                if tag.get("name", '') != '':
                    if link not in links:
                        links.append(link)

    return links
Example #3
0
def get_walk_score(zip_code):
    '''
    Gets walk score for single zip code

    Input:
    zip_code (str or int): a US zip code

    Output:
    score (int): Walk score for that zip code. Missing values get -1.
    '''

    url = "https://www.walkscore.com/score/" + str(zip_code)
    req = util.get_request(url)
    if req:
        text = util.read_request(req)
    else:
        score = -1
        text = None
    if text:
        soup = bs4.BeautifulSoup(text, features='lxml')
        span = soup.find('span', attrs={'id': 'score-description-sentence'})
        try:
            score_txt = span.text
            match = re.search(r"(Walk Score of)(\s)(\d+)(\s)", score_txt)
            score = int(match.group(3))
        except AttributeError:
            score = -1
    else:
        score = -1

    return score
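For reference, the regular expression above pulls the score out of sentences like the following (the sample text is hypothetical):

import re

sample = "Downtown has a Walk Score of 87 out of 100."   # hypothetical score sentence
match = re.search(r"(Walk Score of)(\s)(\d+)(\s)", sample)
print(int(match.group(3)))                                # 87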
Example #4
0
def queue_children_sites(starting_url, queue):
    '''Given a url and a queue, adds all children urls
     of the start point to the queue

     Inputs: starting_url -- string that corresponds to a url
     queue -- queue.Queue object

     Outputs: None, queue is modified
     in place to contain all child urls'''

    # turn http into https if not already
    if not starting_url.startswith("https"):
        starting_url = starting_url[:4] + 's' + starting_url[4:]
    request = util.get_request(starting_url)
    assert request is not None
    text = util.read_request(request)
    soup = bs4.BeautifulSoup(text, "html5lib")
    URLs = soup.find_all("a")
    URLs = [URL["href"] for URL in URLs if URL.has_attr("href")]
    children = []
    for URL in URLs:
        if util.is_absolute_url(URL):
            children.append(URL)
        else:
            URL = util.convert_if_relative_url(starting_url, URL)
            children.append(URL)

    children = [
        child for child in children
        # limiting_domain is assumed to be defined at module level
        if util.is_url_ok_to_follow(child, limiting_domain)
    ]
    for child in children:
        queue.put(child)
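util.is_url_ok_to_follow is not shown; a rough standard-library stand-in for the limiting_domain part of that check could be:

from urllib.parse import urlparse

def in_limiting_domain(url, limiting_domain):
    # Hypothetical replacement for the domain check inside util.is_url_ok_to_follow.
    netloc = urlparse(url).netloc.lower()
    return netloc == limiting_domain or netloc.endswith("." + limiting_domain)

# in_limiting_domain("https://www.teenlife.com/search", "teenlife.com") -> True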
Example #5
0
def get_cities():
    '''
    Returns the list of Cook County cities (from get_loc_cook) that appear on
    rentcafe.com's Illinois average-rent page.
    '''
    city_state = get_loc_cook()
    cook_cities = []
    for ele in city_state:
        cook_cities.append(ele[0])

    #print(cook_cities)
    #print(len(cook_cities))

    url = 'https://www.rentcafe.com/sitemaps/us/il/average-rent-market-trends/'
    request = util.get_request(url)
    text = util.read_request(request)

    soup = bs4.BeautifulSoup(text, "html5lib")
    tags = soup.find_all('a', href=True, target="", role="")

    cities = []
    count = 0
    for tag in tags:
        if "title" in tag.attrs:
            city = tag['title']
            if city[0:15] == "Average Rent in":
                #print(city)
                city = city[16:]
                #print(city)
                count += 1
                if city in cook_cities:
                    cities.append(city)

    #print(count)
    #print(len(cities))

    return cities
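The [0:15] / [16:] slices above split each link title on its fixed "Average Rent in" prefix; for example (title string hypothetical):

title = "Average Rent in Evanston"       # hypothetical <a> title attribute
assert title[0:15] == "Average Rent in"
city = title[16:]                        # "Evanston" (index 15 is the space)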
Example #6
0
def mini_crawler(url, page_parser_q, pull_info_q, links_visited,
                 limiting_domain, index_list):
    '''
    Crawl the college catalog and adds to an index list to map set of
    words with associated course identifier.

    Inputs:
        url: starting url to begin crawling with
        page_parser_q: queue of urls in line to be crawled
        pull_info_q: queue of pages whose information still needs to be pulled
        links_visited: list of visited links
        limiting_domain: domain name
        index_list: list of dictionaries for each webpage
    '''
    if url in links_visited:
        return
    request = util.get_request(url)
    if request is None:
        return
    post_url = util.get_request_url(request)
    if post_url in links_visited:
        return
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, features="html5lib")
    find_links(soup, url, post_url, pull_info_q, links_visited,
               limiting_domain)
    tag_list = soup.find_all("ul", attrs={"class": "Pagination"})
    pages = tag_list[0].find_all("li")
    pages = pages[1:]
    for page in pages:
        page_parser_q.put(page.findChild().get('href'))
Example #7
0
def crawler():
    starting_url = "https://www.teenlife.com/search?q=&l=&c=Summer%20Program&p=1"
    limiting_domain = "www.teenlife.com"
    parsing_default_domain = "https://www.teenlife.com/search"

    numpages = 0
    links_visited = []
    index_list = []
    page_parser_q = queue.Queue()
    pull_info_q = queue.Queue()
    page_parser_q.put(starting_url)
    while not page_parser_q.empty():
        link = page_parser_q.get()
        mini_crawler(link, page_parser_q, pull_info_q, links_visited, limiting_domain, index_list, parsing_default_domain)
        numpages += 1
        print(link, "link")

    while not pull_info_q.empty():
        page_link = pull_info_q.get()
        print(page_link, "page_link")
        request = util.get_request(page_link)
        if request is not None:
            html = util.read_request(request)
            soup = bs4.BeautifulSoup(html, features="html5lib")
            make_index(soup, index_list)


    df = pd.DataFrame(index_list)

    return df
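A small driver for this crawler might just persist the resulting DataFrame (output filename hypothetical):

if __name__ == "__main__":
    df = crawler()
    df.to_csv("summer_programs_index.csv", index=False)   # hypothetical output file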
Example #8
0
def crawler():
    # starting_url = "https://www.teenlife.com/search/?q=None&l=None&c=Summer%20Program&p=1"
    starting_url = "https://rusticpathways.com/students/programs?_=1584132668586&page=1"
    limiting_domain = "rusticpathways.com"

    numpages = 0
    links_visited = []
    index_list = []
    page_parser_q = queue.Queue()
    pull_info_q = queue.Queue()
    page_parser_q.put(starting_url)
    while not page_parser_q.empty():
        link = page_parser_q.get()
        mini_crawler(link, page_parser_q, pull_info_q, links_visited,
                     limiting_domain, index_list)
        numpages += 1

    while not pull_info_q.empty():
        page_link = pull_info_q.get()
        # print(page_link)
        request = util.get_request(page_link)
        if request is not None:
            html = util.read_request(request)
            soup = bs4.BeautifulSoup(html, features="html5lib")
            make_index(soup, index_list, page_link)
            # print(index_list)
    df = pd.DataFrame(index_list)
    return df
Example #9
0
def go(housing_links):
    '''
    Main function

    Inputs:
        housing_links (list): a list of links obtained from inputting different
            zipcodes into the search bar of rentcafe.com
    Output:
        d (dict): a dictionary mapping each zipcode to a tuple (mean_price, income)
    '''
    # a dictionary with zipcode as keys, avg rent price as values
    d = {}

    # start from the first zip_code...
    for link in housing_links:
        zip_code = str(link[-5:])
        d[zip_code] = []
        request = util.get_request(link)
        text = util.read_request(request)
        soup = bs4.BeautifulSoup(text, "html5lib")

        # find median income under this zipcode
        li_tags = soup.find_all('li', class_="medium")
        income = np.int64(re.findall(r'\d+(?:,\d+)?', li_tags[2].text)[0].replace(',',''))

        # collect all subpages under this zipcode
        pages_to_crawl = []
        tags = soup.find('ul', class_="pagination")
        if tags is None:
            pages_to_crawl = [link]
        else:
            pages = tags.find_all('a', href=True)
            for a in pages:
                if a['href'] not in pages_to_crawl:
                    pages_to_crawl.append(a['href'])

        for url in pages_to_crawl:
            request = util.get_request(url)
            text = util.read_request(request)
            soup = bs4.BeautifulSoup(text, "html5lib")
            property_tags = soup.find_all('div', class_='item-information')
    
            for item in property_tags:
                d[zip_code].append(find_adj_price(item))
            
        d[zip_code] = (np.mean([x for x in d[zip_code] if x != 0]), income)
        
    return d
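The mean in the last step averages only the non-zero prices collected for a zipcode; in isolation, with made-up prices:

import numpy as np

prices = [0, 1200, 1450, 0, 1800]                  # hypothetical adjusted prices, 0 = missing
mean_price = np.mean([x for x in prices if x != 0])
print(mean_price)                                  # 1483.33...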
Example #10
0
def create_dictionary(num_pages_to_crawl, course_map_filename, starting_url,
                      limiting_domain):
    '''
    Creates the dictionary mapping course id numbers to the words in the
    course titles and descriptions.

    Inputs:
        num_pages_to_crawl: (int) The number of pages to process
                            during the crawl.
        course_map_filename: (string) The name of the JSON file that contains
                             the mapping of course codes to course ids.
        starting_url: (string) The url of the first page that the
                      crawler visits.
        limiting_domain: (string) The limiting domain of the url.

    Outputs:
        The dictionary mapping course id numbers to the words in the course 
        titles and descriptions.
    '''

    with open(course_map_filename) as json_file:
        coursemap = json.load(json_file)

    url_list = []
    url_queue = queue.Queue()
    num_pages = 0
    course_dict = {}
    process_dict = {}

    starting_url = clean_url(starting_url, limiting_domain, parent_url=None)

    if starting_url:
        url_queue.put(starting_url)

    while num_pages < num_pages_to_crawl and not url_queue.empty():
        num_pages += 1
        next_url = url_queue.get()
        if next_url and next_url not in url_list:
            request = util.get_request(next_url)
            if request:
                request_url = util.get_request_url(request)
                if request_url and request_url not in url_list:
                    url_list.append(next_url)
                    if request_url not in url_list:
                        url_list.append(request_url)
                    html_text = util.read_request(request)
                    soup = bs4.BeautifulSoup(html_text, "html5lib")
                    process_dict.update(
                        find_course_info(soup, coursemap, course_dict))
                    if process_dict:
                        course_dict.update(process_dict)
                    href_list = soup.find_all("a", href=True)
                    for h in href_list:
                        h_url = h['href']
                        h_url = clean_url(h_url, limiting_domain, request_url)
                        if h_url:
                            url_queue.put(h_url)

    return course_dict
Example #11
0
def open_page(url):

	# print("opening...", url)
	r = util.get_request(url)

	# print("r:", r)

	if r:
		return r.url, BeautifulSoup(util.read_request(r))

	else:
		return None, None
Example #12
0
def build_search_engine(starting_url, limiting_domain, max_num_pages_to_visit):
    urls = Queue.Queue()
    visited = []
    index = {}

    def search(word):
        rv = []
        matches = []
        words = re.findall(r"[a-zA-Z]\w*", word)
        if len(words) == 0:
            return []
        for url in index.keys():
            for title in index[url].keys():
                for word in words:
                    word = word.lower()
                    if word in title or word in index[url][title]:
                        matches.append((title, url))
        for pair in matches:
            if matches.count(pair) == len(words):
                if pair not in rv:
                    rv.append(pair)
        return rv

    if util.is_url_ok_to_follow(starting_url, limiting_domain):
        urls.put(starting_url)
        while not urls.empty() and len(visited) < max_num_pages_to_visit:
            top_queue = urls.get()
            if top_queue not in visited and util.is_url_ok_to_follow(
                    top_queue, limiting_domain):
                request = util.get_request(top_queue)
                if request is None:
                    visited.append(top_queue)
                    continue
                new_page = util.get_request_url(request)
                if new_page != top_queue:
                    if new_page not in visited:
                        visited.append(new_page)
                        top_queue = new_page
                data = bs4.BeautifulSoup(util.read_request(request))
                visited.append(top_queue)
                index = indexer(index, top_queue, data)
                for link in data.find_all('a'):
                    href = link.get('href')
                    if href is None:
                        continue
                    href = util.remove_fragment(href)
                    if not util.is_absolute_url(href):
                        href = util.convert_if_relative_url(top_queue, href)
                    urls.put(href)
    else:
        return None
    return search
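build_search_engine returns a closure over the index it builds; a hypothetical call looks like:

# Hypothetical usage; the starting url and domain are placeholders.
search = build_search_engine("https://example.edu/catalog/index.html",
                             "example.edu", 100)
if search is not None:
    for title, url in search("linear algebra"):
        print(title, url)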
Example #13
0
def get_soup_from_url(url):
    '''
    Input:
        url - absolute url
    Returns:
        BeautifulSoup object corresponding to url
    '''
    request = util.get_request(url)
    if request is None:
        return None
    text = util.read_request(request)
    soup = bs4.BeautifulSoup(text, "html5lib")

    return soup
Example #14
0
def get_restaurant_links_cook():
    cities = get_cities()

    city_state = get_loc_cook()
    new_city_state = []
    for ele in city_state:
        if ele[0] in cities:
            new_city_state.append(ele)

    page_suffix = [i for i in range(0, 231, 10)]
    #print(city_state)

    url_list = []
    for city, state in new_city_state:
        html = "https://www.yelp.com/search?find_desc=Restaurants&find_loc=" + city.replace(
            " ", "") + "%2C%20" + state
        for suffix in page_suffix:
            html_page = html + "&start=" + str(suffix)
            url_list.append(html_page)
    r'''
    with open(r"c:\Users\35653\Desktop\CS122\project\urls.txt", "w") as write_file:
        write_file.writelines(url_list)

        write_file.close()
    '''

    # Debugging leftover: uncomment to re-crawl a single results page.
    # url_list = [
    #     "https://www.yelp.com/search?find_desc=Restaurants&find_loc=Lyons%2C%20IL&start=190"
    # ]

    links = []
    for url in url_list:
        request = util.get_request(url)
        if request:

            text = util.read_request(request)

            soup = bs4.BeautifulSoup(text, "html5lib")
            tags = soup.find_all('a', href=True, target="", role="")

            # extract href links to restaurants
            for tag in tags:
                link = tag['href']
                link = util.convert_if_relative_url(url, link)
                link = util.remove_fragment(link)
                # Hardcoded filter
                if link and link[-11:] == "Restaurants":
                    if tag.get("name", '') != '':
                        if link not in links:
                            links.append(link + "\n")
    return links
Example #15
0
def creat_model_data():
    request_path = r'data\request.txt'
    vertex_path = r'data\vertex.txt'
    vehicle_path = r'data\vehicle.txt'
    requests = util.read_request(request_path)
    vertexs = util.read_vertex(vertex_path)
    vehicles = util.read_vehicle(vehicle_path)
    time_matrix = util.cal_time(vertexs)
    distance_matrix = time_matrix * 2
    # Build a plain dict of pairwise times; wrap with a multidict later if needed.
    tim = {}
    for i in range(time_matrix.shape[0]):
        for j in range(time_matrix.shape[1]):
            tim[i, j] = time_matrix[i][j]
    print(tim)

    print()
Example #16
0
def make_soup(url):
    '''
    Makes a soup object from the html at the given url

    Inputs:
        url: url of the page to fetch
    Outputs:
        soup - Soup object, or None if the request or read fails
    '''
    req = util.get_request(url)
    if req is None:
        return None
    html = util.read_request(req)
    if html is not None and html != "":
        soup = bs4.BeautifulSoup(html, "html5lib")
        return soup
    return None
Example #17
0
def get_soup_object(url):
    """
    Takes a url, checks for possible redirection,
    returns soup object.

    Inputs:
        url (string)
    
    Returns:
        Soup object, or None if the request fails
    """
    request = util.get_request(url)
    if request is None:
        return None
    html_text = util.read_request(request)
    soup = bs4.BeautifulSoup(html_text, 'html5lib')

    return soup
Example #18
0
def get_soup(url):
    '''
    Returns the soup of the given url.
    Inputs:
        url: str
    Output:
        BeautifulSoup object, or False if the request or read fails
    '''
    time.sleep(0.05)
    url_request = util.get_request(url)
    if not url_request:
        return False
    html = util.read_request(url_request)
    if not html:
        return False
    return bs4.BeautifulSoup(html, "html5lib")
Example #19
0
def get_restaurant_links():
    '''
    Start from searching "Restaurant", "Chicago" on yelp main page,
    and collect all restaurant links from 24 pages

    Input:
        None

    Output:
        links (list): a list of links
    '''

    page_suffix = [i for i in range(0, 231, 10)]
    url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start='
    url_list = []

    for suffix in page_suffix:
        page_url = url + str(suffix)
        url_list.append(page_url)

    links = []
    count = 0

    for url in url_list:
        count += 1
        print(count)

        request = util.get_request(url)
        text = util.read_request(request)

        soup = bs4.BeautifulSoup(text, "html5lib")
        tags = soup.find_all('a', href=True, target="", role="")

        for tag in tags:
            link = tag['href']
            link = util.convert_if_relative_url(url, link)
            link = util.remove_fragment(link)
            if link and link[-11:] == "Restaurants":
                if tag.get("name", '') != '':
                    if link not in links:
                        links.append(link + "\n")
                        print(link)

        i = 5 + random.random() * 5
        time.sleep(i)

    return links
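Since each collected link is appended with a trailing newline, persisting the result is straightforward (output filename hypothetical):

links = get_restaurant_links()
with open("chicago_restaurant_links.txt", "w") as f:
    f.writelines(links)   # each entry already ends with "\n"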
Example #20
0
def get_referees(url):

    base_url = 'http://www.basketball-reference.com/boxscores/'
    comp_url = base_url + url + '.html'

    request = util.get_request(comp_url)

    if request is None:
        return []
    html = util.read_request(request)
    if html is None:
        return []
    soup = bs4.BeautifulSoup(html, "html5lib")
    div_tags = soup.find_all('div')
    good_tags = str(div_tags)
    string = re.findall(r'(?<=Officials:)(.*?)(?=\<br)', good_tags)

    rv = re.findall(r'(?<=.html\"\>)(.*?)(?=<\/a)', string[0])
    return rv
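The two regexes above first grab everything between "Officials:" and the following <br>, then pull the linked names; on a hypothetical markup fragment:

import re

html = 'Officials: <a href="/referees/x.html">Joe Crawford</a>, <a href="/referees/y.html">Tony Brown</a><br>'
officials = re.findall(r'(?<=Officials:)(.*?)(?=\<br)', html)
names = re.findall(r'(?<=.html\"\>)(.*?)(?=<\/a)', officials[0])
print(names)   # ['Joe Crawford', 'Tony Brown']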
Example #21
0
def get_paper_links(number_of_articles, fields):
    '''
    Crawls through Nature search pages, and pulls article links 
    from different fields

    Input:
    number of articles, int number of articles to find 
    fields, list of field str, all major fields in nature.com/subjects
    
    Output:
    paper_links, list of paper urls (str)
    '''
    search_url = ('https://www.nature.com/search?article_type=protocols'
                  '%2Cresearch%2Creviews&subject=')
    suffix = '&page='
    
    search_urls = []
    paper_links = []

    num_articles_per_field = number_of_articles // 8 
    num_pages_to_visit = int(np.ceil(num_articles_per_field / 50))
    num_on_last_page = num_articles_per_field % 50

    for field in fields:
        for i in range(num_pages_to_visit):
            new_url = search_url + field + suffix + str(i + 1)
            search_urls.append(new_url)
        
    for url in search_urls:
        num_to_search = 50
        if int(url[-1]) == num_pages_to_visit:
            num_to_search = num_on_last_page

        new_request = util.get_request(url)
        html = util.read_request(new_request)
        search_soup = bs4.BeautifulSoup(html, features = 'html.parser')
        article_links = search_soup.find_all('h2', 
                        class_ = 'h3 extra-tight-line-height', 
                        itemprop = 'headline')
        article_links = article_links[:num_to_search]
        paper_links.extend([i.find('a')['href'] for i in article_links])
    
    return paper_links
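The page arithmetic above works out as follows for a hypothetical request of 780 articles split across the 8 fields:

import numpy as np

number_of_articles = 780                                         # hypothetical total
num_articles_per_field = number_of_articles // 8                 # 97 per field
num_pages_to_visit = int(np.ceil(num_articles_per_field / 50))   # 2 search pages of 50 results
num_on_last_page = num_articles_per_field % 50                   # 47 articles taken from the last page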
Example #22
0
def analyze_page(url, queue, limiting_domain, course_dict):
    '''
    Queues all urls, then makes a dictionary index of the course codes to
    a list of words in the course description.

    Inputs:
        url: the url of the page to analyze
        queue: the queue that holds the urls
        limiting_domain: a domain with which to stay in when queuing
        course_dict: the index dictionary

    Outputs:
        None
    '''
    request = util.get_request(url)
    if request is None:
        return
    text = util.read_request(request)
    soup = bs4.BeautifulSoup(text, "html5lib")

    queue_urls(url, soup, queue, limiting_domain)
    find_courses(soup, course_dict)
Example #23
0
def translate_general(code):
    '''
    Given an ICD code, scrapes www.icd10data.com and returns a meaningful
    translation as a string

    Input:
        code (int): an ICD code
    Output:
        rv (string): translation of the ICD code
    '''
    url = BASE_GEN + str(code) + '&codebook=icd9volume1'
    ro = util.get_request(url)
    html = util.read_request(ro)
    soup = bs4.BeautifulSoup(html, "html5lib")
    rv = None
    search = soup.find('div').next_sibling.next_sibling.find('div',
        class_='searchPadded')

    if search and search.text:
        rv = search.text

    return rv
Example #24
0
def get_neighbors(node):
    print("      completed")
    neighbors = []
    test_link = []
    soup = None
    response = util.get_request(node)
    if response is None:
        print('No response')
        neighbors = None
    else:
        text = util.read_request(response)
        if text == "":
            print("No text read")
            neighbors = None
        else:
            soup = bs4.BeautifulSoup(text, "html5lib")
            for link in soup.find_all("a"):
                url_raw = link.get("href")
                url_rel = util.remove_fragment(url_raw)
                url = util.convert_if_relative_url(node, url_rel)
                print(url)
                if url != None:
                    neighbors.append(url)
    return neighbors, response, soup
Example #25
0
def mini_crawler(url, page_parser_q, pull_info_q, links_visited, limiting_domain, index_list):
    '''
    Crawl the college catalog and adds to an index list to map set of
    words with associated course identifier.

    Inputs:
        url: starting url to begin crawling with
        page_parser_q: queue of urls in line to be crawled
        pull_info_q: queue of pages whose information still needs to be pulled
        links_visited: list of visited links
        limiting_domain: domain name
        index_list: list of dictionaries for individual programs
    '''
    if url in links_visited:
        return
    request = util.get_request(url)
    if request is None:
        return
    post_url = util.get_request_url(request)
    if post_url in links_visited:
        return
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, features="html5lib")
    find_links(soup, url, post_url, page_parser_q, pull_info_q, links_visited, limiting_domain)
Example #26
0
def scrape_chart_data(html):
    '''
    Takes the url of a neighborhood page on Trulia, fetches it, and
    extracts the data behind the median sale price chart
    using BeautifulSoup.

    Inputs:
        html - url of the neighborhood page
    Returns:
        data_points (list of dicts)
    '''
    request = util.get_request(html)
    doc = util.read_request(request)
    soup = bs4.BeautifulSoup(doc, 'lxml')

    scripts = soup.find_all('script')

    chart = None
    for script in scripts:
        if 'var CHART_DATA' in script.text:
            chart = script.text
            break
    if chart is None:
        return []

    start = 'medianSalesPoints: '
    end = 'salesVolumePoints'
    chart_list = (chart.split(start))[1].split(end)[0]
    chart_list = chart_list.replace('\n', '')
    chart_list = chart_list.replace('null', 'None')

    while chart_list[-1] == ' ':
        chart_list = chart_list[:-1]

    if chart_list[-1] == ',':
        chart_list = chart_list[:-1]

    data_by_bed_num = eval(chart_list)
    data_points = data_by_bed_num[-1]['points']

    return data_points
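The eval call works because the cleaned string is a plain list/dict literal; ast.literal_eval gives the same result without executing arbitrary code. A sketch on a made-up chart fragment:

import ast

# Hypothetical cleaned fragment in the same shape as chart_list above.
chart_list = "[{'x': 1546300800000, 'y': None}, {'x': 1548979200000, 'y': 350000}]"
data_by_bed_num = ast.literal_eval(chart_list)
print(data_by_bed_num[-1]['y'])   # 350000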
Example #27
0
def go(num_pages_to_crawl, course_map_filename, index_filename):
    '''
    Crawl the college catalog and generates a CSV file with an index.

    Inputs:
        num_pages_to_crawl: the number of pages to process during the crawl
        course_map_filename: the name of a JSON file that contains the mapping
          course codes to course identifiers
        index_filename: the name for the CSV of the index.

    Outputs: 
        CSV file of the index.
    '''

    starting_url = "http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/index.html"
    limiting_domain = "classes.cs.uchicago.edu"
    index = {}
    unique_list = [starting_url]
    url_queue = queue.Queue()
    url_queue.put(starting_url)
    counter = 0
    with open(course_map_filename) as data:
        course_dict = json.load(data)

    while counter < num_pages_to_crawl and not url_queue.empty():
        next_url = url_queue.get()
        request_obj = util.get_request(next_url)
        if request_obj is None:
            continue
        url_text = util.read_request(request_obj)
        soup = bs4.BeautifulSoup(url_text, "html5lib")
        next_tag_list = find_tag_list(next_url, soup, \
            request_obj, limiting_domain)
        create_index(soup, course_dict, index)
        add_to_queue(next_tag_list, url_queue, unique_list)
        counter += 1

    create_csv(index, index_filename)
Example #28
0
def nature_crawler(number_of_articles, database_name):
    '''
    Crawls nature.com and their suite of journals to extract authorship
    information for the database 'journals.db'.

    Inputs:
        number_of_articles, int, the number of articles to extract from
         the journals
        database_name, str, path to the sqlite database to populate

    Outputs:
        None, but modifies journals.db
    '''

    conn = sqlite3.connect(database_name)
    c = conn.cursor()
    
    home_domain = "https://www.nature.com"

    search_urls = []

    fields = ['biological-sciences', 'business-and-commerce', 
              'earth-and-environmental-sciences','health-sciences',
              'humanities', 'physical-sciences', 
              'scientific-community-and-society','social-science']

    paper_links = get_paper_links(number_of_articles, fields)


    for i, link in enumerate(paper_links):

        try:
            new_request = util.get_request(home_domain + link)
            html = util.read_request(new_request)
            article_soup = bs4.BeautifulSoup(html, features = 'html.parser')
            authors = article_soup.find_all('meta', {'name':'citation_author'})

            #process paper

            full_title = article_soup.find('title').text.split(' | ')
            paper_title = full_title[0]
            print(paper_title)
            journal = full_title[1]
            num_authors = len(authors)
            year_find = article_soup.find('meta', {'name':'dc.date'})['content']
            year = year_find.split('-')[0]
            num_articles_per_field = number_of_articles // 8
            field_index = int(np.ceil(i // num_articles_per_field))
            field = fields[field_index]
            insert = (paper_title, year, journal, field, num_authors)
        except:
            print('paper extraction failed')
            continue

        try:
            c.execute('INSERT INTO papers(title, year, journal, field, \
                       num_authors) VALUES (?, ?, ?, ?, ?)', insert)
            conn.commit()
            
        except: 
            print('error, insert already in database')
            continue

        fetch = c.execute('SELECT paper_identifier FROM papers WHERE \
                           title = ?', (paper_title,))
        paper_identifier = fetch.fetchone()[0]

        #process author

        for rank, author in enumerate(authors): 
            try:
                name = author['content'].split()
                last_name = name.pop()
                first_name = ' '.join(name)
                gen = gender.get_gender(name[0].strip())
                institution, country = get_institution_name(author, authors)
                insert = (first_name, last_name, institution, gen, country)
            except:
                print('unable to extract')
                continue
            try:
                c.execute('INSERT INTO authors(first_name, last_name, \
                           institution, gender, country) VALUES (?, ?, ?, \
                           ?, ?)', insert)
                conn.commit()
            except:
                print('author already here')
            
            fetch = c.execute('SELECT author_identifier FROM authors WHERE \
                               first_name = ? AND last_name = ?', (first_name,\
                               last_name))
            author_identifier = fetch.fetchone()[0]
            insert = (author_identifier, paper_identifier, rank + 1)
                
            c.execute('INSERT INTO author_key_rank(author_identifier, \
                       paper_identifier, rank) VALUES (?, ?, ?)', insert)
            conn.commit()
                
        print(i)
Example #29
0
def get_info(links):
    '''
    Extract restaurant name, zipcode, price,
    cuisine, number of reviews, and average rating

    Input:
        links (list): a list of links returned by link_crawler.py

    Output:
        info (list): a list of dictionaries of restaurant info
    '''
    info = []

    for url in links:
        print("start at", url)
        restaurant = {}
        request = util.get_request(url)
        if request is not None:
            text = util.read_request(request)
            soup = bs4.BeautifulSoup(text, "html5lib")
            tag = soup.find('script', type='application/ld+json')

            if tag is not None:
                d = json.loads(tag.text)

                restaurant['restaurant_name'] = d['name'].replace('&apos;',"'").\
                        replace('&amp;', "&")
                restaurant['zip_code'] = d['address']['postalCode']

                if 'priceRange' in d:
                    if d['priceRange'] == "Above $61":
                        restaurant['price'] = 70
                    elif d['priceRange'] == "Under $10":
                        restaurant['price'] = 5
                    else:
                        price = re.findall(r'[0-9]+', d['priceRange'])
                        #Average of price upper and lower bounds
                        restaurant['price'] = (float(price[1]) +\
                                    float(price[0])) / 2
                else:
                    restaurant['price'] = None

                restaurant['cuisine'] = d['servesCuisine'].replace(
                    '&amp;', "&")

                restaurant['num_review'] = len(d["review"])

                reviews = d["review"]
                if len(d['review']) != 0:
                    rating = 0
                    for review in reviews:
                        rating += review['reviewRating']['ratingValue']
                    restaurant['rating'] = rating / len(d["review"])
                else:
                    restaurant['rating'] = 0

                info.append(restaurant)

            else:
                print("the link gives an empty tag")
        else:
            print("empty request", url)

    return info
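The priceRange branch above averages the bounds of strings like "$11 - $30"; in isolation (sample string hypothetical):

import re

price_range = "$11 - $30"                          # hypothetical Yelp priceRange value
bounds = re.findall(r'[0-9]+', price_range)        # ['11', '30']
print((float(bounds[1]) + float(bounds[0])) / 2)   # 20.5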
Example #30
0
def identify_text(url, filename):
    '''Takes a url and the name of a file containing a course-to-course-code
    dictionary, and returns a dictionary whose entries have the form:
    {course code: course title + course description}

    Inputs: url -- a string of the url to look for classes on,
    filename -- a file containing the course code dictionary

    Outputs: coursetext -- a dictionary of the form described above.'''

    with open(filename, 'r') as fp:
        course_dict = json.load(fp)
    #establish course_dict as course to course code dictionary

    request = util.get_request(url)
    assert request is not None
    text = util.read_request(request)
    soup = bs4.BeautifulSoup(text, "html5lib")
    divs = soup.find_all("div", class_="courseblock main")
    coursetext = {}
    coursenames = []

    for div in divs:
        desc = []
        desc += (div.find_all("p", class_="courseblocktitle"))
        desc += (div.find_all("p", class_="courseblockdesc"))
        courses = util.find_sequence(div)
        if courses != []:  #courses in sequences fall here

            desc.append([])
            desc.append([])

            for div in courses:
                desc[2] += (div.find_all("p", class_="courseblocktitle"))
                desc[3] += div.find_all("p", class_="courseblockdesc")

            n = 0
            for course in desc[2]:
                #since we are on sequence path, desc[2] has html
                #for all courses in a sequence
                course_name = course.text
                course_name = str(course_name).split()
                course_title = course_name[2:]
                course_name = course_name[0] + ' ' + course_name[1][0:5]
                #course_name is now of form Department Code *space* Course number
                course_code = course_dict[course_name]

                course_text = desc[3][n].text
                main_title = desc[
                    0].text  #main_title is sequence title, as opposed to course title
                main_desc = desc[
                    1].text  #main_desc is sequence description, as opposed to course description

                course_text = course_title + str(course_text).split()
                course_text = course_text + str(main_desc).split() + str(
                    main_title).split()
                coursetext[course_code] = (' '.join(course_text)).lower()
                #Okay so I know the split, join thing seems counter-productive,
                #but it's to prevent splitting every letter. I want the text as a big block.
                #Also, course_text is a long string that is sequence title + desc + course title + desc.
                #Coursetext is a dictionary and course_text is the info in it.
                n += 1

        else:  #courses not in sequences fall here
            course_name = desc[0].text
            course_name = str(course_name).split()
            course_title = course_name[2:]
            course_name = course_name[0] + ' ' + course_name[1][0:5]
            course_code = course_dict[course_name]

            course_text = desc[1].text
            course_text = course_title + str(course_text).split()
            coursetext[course_code] = (' '.join(course_text)).lower()

            #See the sequence branch above for comparison; the two paths are
            #similar but NOT identical in procedure.

    return coursetext
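The course_name slicing in both branches turns a catalog title into a "DEPT NNNNN" key for course_dict; with a hypothetical title string:

title = "CMSC 12200. Computer Science with Applications II. 100 Units."   # hypothetical courseblocktitle text
course_name = str(title).split()
course_title = course_name[2:]                              # remaining words of the title
course_name = course_name[0] + ' ' + course_name[1][0:5]
print(course_name)                                          # "CMSC 12200"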