# Shared imports for the crawlers below. `general_func` is assumed to be a
# project-local helper module that provides `url_open(url, from_encoding=...)`.
from bs4 import BeautifulSoup

import general_func


def crawl(website_url, year, month, day):
    """Scrape one day's NBA scoreboard page and return a list of match dicts."""
    url = website_url + "/" + year + month + day
    matchList = []
    print("[Message] Now running Crawler for " + url)
    content_html = general_func.url_open(url, from_encoding='gbk')
    soup_content = BeautifulSoup(content_html, "html.parser")
    score_main = soup_content.find('div', {'id': 'nbaContent'})
    score_selection = score_main.find('div', {'id': 'nbaSSOuter'})
    if score_selection is None:
        return matchList
    # find_all() returns a (possibly empty) list, never None, so iterate directly.
    score_selections = score_selection.find_all('div', {'class': 'GameLine'})
    for selection in score_selections:
        table = selection.find('div', {'class': 'nbaModTopScore'}) \
                         .find('div', {'class': 'nbaTeamsRow'})
        if table is None:
            continue  # skip a malformed game block instead of dropping the rest
        homeTeam = table.find('div', {'class': 'nbaModTopTeamScr nbaModTopTeamHm'})
        awayTeam = table.find('div', {'class': 'nbaModTopTeamScr nbaModTopTeamAw'})
        # The 'win' class is only present on the winning side's score element.
        homeWin = homeTeam.find('h4', {'class': 'nbaModTopTeamNum win'})
        awayWin = awayTeam.find('h4', {'class': 'nbaModTopTeamNum win'})
        # Short team codes (parsed but unused below).
        homeId = homeTeam.find('h5', {'class': 'nbaModTopTeamName'}).text.strip()
        awayId = awayTeam.find('h5', {'class': 'nbaModTopTeamName'}).text.strip()
        homeName = homeTeam.find('img')['title']
        awayName = awayTeam.find('img')['title']
        # Initialize so the dict below is well-defined even if neither side
        # carries the 'win' marker (e.g. a game still in progress).
        winner = None
        label = None
        if homeWin is not None:
            winner = homeName
            label = "1"
        if awayWin is not None:
            winner = awayName
            label = "0"
        matchList.append({
            'team1': homeName,
            'team2': awayName,
            'winner': winner,
            'label': label,
            'date': month + "/" + day + "/" + year,
        })
    return matchList
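# Usage sketch for crawl(): the base URL and date below are illustrative
# assumptions, not values taken from this repo. The function only needs a
# scoreboard root plus a date split into year/month/day strings.
if __name__ == '__main__':
    demo_matches = crawl('http://www.nba.com/scores', '2016', '11', '01')
    for m in demo_matches:
        print(m['date'] + ": " + m['team1'] + " vs " + m['team2']
              + " -> winner: " + str(m['winner']))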
def try_crawl(id):
    """Fetch and parse one CMT Music Awards page; return None on any failure."""
    try:
        if id == '':
            return None
        url = 'http://www.cmt.com/cmt-music-awards/' + id + '.jhtml'
        htmlSource = general_func.url_open(url, from_encoding='utf-8')
        return crawl_detail(htmlSource)
    except Exception:  # narrower than a bare except, which would also swallow KeyboardInterrupt
        return None
def try_crawl(id):
    """Fetch and parse the ACM Awards nominees page; return None on any failure.

    Note: the URL here is fixed, so `id` only gates whether the fetch runs at all.
    """
    try:
        if id == '':
            return None
        url = 'http://www.acmawards.com/nominees'
        htmlSource = general_func.url_open(url, from_encoding='utf-8')
        return crawl_detail(htmlSource)
    except Exception:
        return None
def try_crawl(id):
    """Search Goodreads for `id` and parse the result page; return None on failure."""
    try:
        if id == '':
            return None
        url = 'http://www.goodreads.com/search?q=' + id
        htmlSource = general_func.url_open(url, from_encoding='gbk')
        return crawl_detail(htmlSource)
    except Exception:
        return None
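# The three try_crawl variants above differ only in how the URL is built and
# which encoding is requested. A possible consolidation, as a sketch: the
# factory name `make_try_crawl` is hypothetical, and it assumes the same
# `general_func.url_open` and per-site `crawl_detail` helpers used above.
def make_try_crawl(build_url, encoding):
    """Return a try_crawl-style function for one site."""
    def try_crawl(id):
        try:
            if id == '':
                return None
            html_source = general_func.url_open(build_url(id), from_encoding=encoding)
            return crawl_detail(html_source)
        except Exception:
            return None
    return try_crawl

# For example, the Goodreads variant would become:
# try_crawl_goodreads = make_try_crawl(
#     lambda id: 'http://www.goodreads.com/search?q=' + id, 'gbk')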
def crawl_info(url):
    """Scrape a company profile page into a flat dict of summary fields."""
    content_html = general_func.url_open(url, from_encoding='gbk')
    soup = BeautifulSoup(content_html, "html.parser")
    divs = []
    try:
        divs = soup.find('div', {'class': 'highlightedStats__wrapper--VuLob'}) \
                   .find('ul') \
                   .find_all('li')
    except Exception:
        print(url)  # log the page whose stats block could not be located
    country = ''
    try:
        country = divs[1].find('div').text.strip()
    except Exception:
        print(url)
    # Selector copied verbatim from the page markup, duplicated class included.
    divs = soup.find('div', {'class': 'dataTable__wrapper--2Y2vt dataTable__wrapper--2Y2vt'}) \
               .find('table') \
               .find_all('tr')
    # Each row holds a label cell and a value cell; only the CEO row wraps its
    # value in an extra div.
    ceo = divs[0].find_all('td')[1].find('div').text.strip()
    sector = divs[1].find_all('td')[1].text.strip()
    industry = divs[2].find_all('td')[1].text.strip()
    hqlocation = divs[3].find_all('td')[1].text.strip()
    website = divs[4].find_all('td')[1].text.strip()
    yearsonlist = divs[5].find_all('td')[1].text.strip()
    employees = divs[6].find_all('td')[1].text.strip()
    return {
        'country': country,
        'ceo': ceo,
        'sector': sector,
        'industry': industry,
        'hqlocation': hqlocation,
        'website': website,
        'yearsonlist': yearsonlist,
        'employees': employees,
    }
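# Usage sketch for crawl_info(): the profile URL below is an illustrative
# assumption; any page exposing the same CSS class names should parse the
# same way.
# info = crawl_info('http://fortune.com/fortune500/walmart/')
# print(info['ceo'] + " / " + info['sector'] + " / " + info['employees'])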