コード例 #1
0
ファイル: scraper.py プロジェクト: Chen-Han/carpool_scraper
def scrape_page(carpool_page_url):
    """Scrape one carpool listing page and return the posts parsed from it.

    Every element matched by ``is_normal_thread`` is treated as a thread. The
    text of its title link is normalized and handed to the date and location
    extractors. Only when both succeed (and the link carries a URL) is the
    thread's own page fetched to look up a phone number.

    Args:
        carpool_page_url: URL of the listing page, passed to
            ``parse_to_soup``.

    Returns:
        A list of ``Post`` objects, one per thread that yielded a URL, a date
        and a location pair. A thread that raises while being processed is
        reported (stdout for the first two dumps, stderr for the last) and
        skipped; it never aborts the whole page.
    """
    carpool_page = parse_to_soup(carpool_page_url)
    threads = carpool_page.find_all(is_normal_thread)
    post_list = []
    today = datetime.datetime.today()
    for thread in threads:
        try:
            title = thread.find('span', {'class': 'comiis_common'})
            print('Getting thread %s ' % title)
            link = title.find('a', {'onclick': 'atarget(this)'})
            normalized_txt = regex_util.normalize(link.text)
            date = date_extractor.extract_date_info(normalized_txt, today)
            location_pair = location_extractor.extract_location_info(
                normalized_txt)

            post_url = link['href']
            # Decide whether the thread is usable *before* fetching its page:
            # the fetch is a network round trip whose result would be thrown
            # away for threads without a date or location.
            if not (post_url and date and location_pair):
                continue

            post_page = parse_to_soup(post_url)
            phone_num = scrape_phone_num(getMainPost(post_page))
            post_list.append(Post(carpool_date=date,
                                  from_location=location_pair[0],
                                  to_location=location_pair[1],
                                  phone=phone_num,
                                  url=post_url,
                                  original_title=link.text,
                                  scrape_date=today))
        except Exception as e:
            # Best-effort scraping: a malformed thread (e.g. missing title
            # span or link) is logged and skipped so the rest of the page is
            # still processed.
            # Single-argument parenthesized print works on Python 2 and 3.
            print(e)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            print("*** print_tb:")
            traceback.print_tb(exc_traceback, limit=1, file=sys.stdout)
            print("*** print_exception:")
            traceback.print_exception(exc_type, exc_value, exc_traceback,
                                      limit=2, file=sys.stdout)
            print("*** print_exc:")
            traceback.print_exc()
    return post_list
コード例 #2
0
def parse_request(txt, reference_date=None):
    """Extract the date and location information from a request text.

    Args:
        txt: Raw request text; it is normalized with
            ``regex_util.normalize`` before extraction.
        reference_date: Reference point handed to
            ``date_extractor.extract_date_info``. When omitted (or ``None``)
            it is ``datetime.today()`` evaluated at call time.

    Returns:
        A ``RequestInfo`` built from the extracted date and the first two
        items of the extracted location info, or ``None`` when no location
        info could be extracted. The extracted date is passed through as-is;
        it is not validated here.
    """
    # A default of ``datetime.today()`` in the signature would be evaluated
    # only once, at definition time, and go stale in a long-running process.
    if reference_date is None:
        reference_date = datetime.today()

    normalized_txt = regex_util.normalize(txt)
    date = date_extractor.extract_date_info(normalized_txt, reference_date)
    location_info = location_extractor.extract_location_info(normalized_txt)
    if location_info:
        return RequestInfo(date, location_info[0], location_info[1])
    return None