Exemple #1
0
def crawl(website_url, year, month, day):
    """Scrape the decided games listed on one day's scoreboard page.

    The page URL is ``website_url + "/" + year + month + day``; all four
    arguments are concatenated as strings. The page is fetched through
    ``general_func.url_open`` (decoded as gbk) and parsed with
    BeautifulSoup's ``html.parser``.

    Returns a list with one dict per game, with these keys:
        team1  -- ``title`` of the home team's <img>
        team2  -- ``title`` of the away team's <img>
        winner -- whichever of team1/team2 carries the "win" marker
        label  -- "1" if the home team won, "0" if the away team won
        date   -- ``month/day/year``

    An empty list is returned when the scoreboard containers are missing.
    Games in which neither side carries the "win" marker are skipped;
    previously they raised UnboundLocalError or inherited the winner of
    the preceding game.
    """
    url = website_url + "/" + year + month + day
    matchList = []
    # Single parenthesised argument keeps this valid on Python 2 and 3.
    print("[Message] Now running Crawler for " + url)
    content_html = general_func.url_open(url, from_encoding='gbk')

    soup_content = BeautifulSoup(content_html, "html.parser")
    score_main = soup_content.find('div', {'id': 'nbaContent'})
    if score_main is None:
        return matchList
    score_selection = score_main.find('div', {'id': 'nbaSSOuter'})
    if score_selection is None:
        return matchList

    date = month + "/" + day + "/" + year

    # find_all always returns a (possibly empty) result set, never None.
    for selection in score_selection.find_all("div", {'class': 'GameLine'}):
        top_score = selection.find("div", {'class': 'nbaModTopScore'})
        table = None
        if top_score is not None:
            table = top_score.find("div", {'class': 'nbaTeamsRow'})
        if table is None:
            # Same as the original: stop at the first game line that has
            # no teams row and return what was collected so far.
            return matchList

        homeTeam = table.find("div",
                              {'class': 'nbaModTopTeamScr nbaModTopTeamHm'})
        awayTeam = table.find("div",
                              {'class': 'nbaModTopTeamScr nbaModTopTeamAw'})

        homeWin = homeTeam.find('h4', {'class': 'nbaModTopTeamNum win'})
        awayWin = awayTeam.find('h4', {'class': 'nbaModTopTeamNum win'})

        homeName = homeTeam.find('img')['title']
        awayName = awayTeam.find('img')['title']

        # The away marker is checked first so that, should both markers
        # ever be present, the away team still wins as it did originally.
        if awayWin is not None:
            winner = awayName
            label = "0"
        elif homeWin is not None:
            winner = homeName
            label = "1"
        else:
            # No winner marked: nothing meaningful to record.
            continue

        matchList.append({
            'team1': homeName,
            'team2': awayName,
            'winner': winner,
            'label': label,
            'date': date,
        })

    return matchList
Exemple #2
0
def try_crawl(id):
    """Best-effort fetch and parse of a CMT Music Awards page.

    ``id`` is inserted into
    ``http://www.cmt.com/cmt-music-awards/<id>.jhtml``; the page is fetched
    as utf-8 through ``general_func.url_open`` and handed to
    ``crawl_detail``.

    Returns whatever ``crawl_detail`` returns, or None when ``id`` is the
    empty string or when fetching/parsing raises.
    """
    if id == '':
        return None

    try:
        url = 'http://www.cmt.com/cmt-music-awards/' + id + '.jhtml'
        htmlSource = general_func.url_open(url, from_encoding='utf-8')
        return crawl_detail(htmlSource)
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except:`` so
        # KeyboardInterrupt and SystemExit still propagate.
        return None
Exemple #3
0
def try_crawl(id):
    """Best-effort fetch and parse of the ACM Awards nominees page.

    The URL is fixed (``http://www.acmawards.com/nominees``); ``id`` is only
    checked for emptiness and does not influence which page is requested.
    The page is fetched as utf-8 through ``general_func.url_open`` and
    handed to ``crawl_detail``.

    Returns whatever ``crawl_detail`` returns, or None when ``id`` is the
    empty string or when fetching/parsing raises.
    """
    if id == '':
        return None

    try:
        url = 'http://www.acmawards.com/nominees'
        htmlSource = general_func.url_open(url, from_encoding='utf-8')
        return crawl_detail(htmlSource)
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except:`` so
        # KeyboardInterrupt and SystemExit still propagate.
        return None
Exemple #4
0
def try_crawl(id):
    """Best-effort fetch and parse of a Goodreads search-results page.

    ``id`` is appended verbatim as the ``q`` parameter of
    ``http://www.goodreads.com/search``; the page is fetched through
    ``general_func.url_open`` (decoded as gbk) and handed to
    ``crawl_detail``.

    Returns whatever ``crawl_detail`` returns, or None when ``id`` is the
    empty string or when fetching/parsing raises.
    """
    if id == '':
        return None

    try:
        # NOTE(review): ``id`` is not URL-escaped here; confirm callers
        # pass an already-encoded query before adding quoting.
        url = 'http://www.goodreads.com/search?q=' + id
        # NOTE(review): the sibling crawlers decode as utf-8; confirm that
        # gbk is really intended for this site.
        htmlSource = general_func.url_open(url, from_encoding='gbk')
        return crawl_detail(htmlSource)
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except:`` so
        # KeyboardInterrupt and SystemExit still propagate.
        return None
Exemple #5
0
def crawl_info(url):
    """Scrape company profile fields from the page at ``url``.

    The page is fetched through ``general_func.url_open`` (decoded as gbk)
    and parsed with BeautifulSoup's ``html.parser``.

    ``country`` is best-effort: it is read from the second <li> of the
    highlighted-stats list and falls back to '' (printing ``url``) when
    that element cannot be found. The remaining fields come from the first
    seven rows of the data table, taking the second <td> of each row; a
    missing table or row is NOT tolerated and raises AttributeError or
    IndexError, exactly as before.

    Returns a dict with the keys country, ceo, sector, industry,
    hqlocation, website, yearsonlist and employees.
    """
    content_html = general_func.url_open(url, from_encoding='gbk')
    soup = BeautifulSoup(content_html, "html.parser")

    # ``except Exception`` rather than a bare ``except:`` so that
    # KeyboardInterrupt and SystemExit are no longer swallowed.
    stat_items = []
    try:
        stat_items = soup.find(
            'div', {'class': 'highlightedStats__wrapper--VuLob'}
        ).find('ul').find_all('li')
    except Exception:
        print(url)

    country = ''
    try:
        country = stat_items[1].find('div').text.strip()
    except Exception:
        print(url)

    rows = soup.find(
        'div',
        {'class': 'dataTable__wrapper--2Y2vt dataTable__wrapper--2Y2vt'}
    ).find('table').find_all('tr')

    info = {
        'country': country,
        # The CEO cell wraps its text in an inner <div>, unlike the others.
        'ceo': rows[0].find_all('td')[1].find('div').text.strip(),
    }

    # Rows 1..6 hold one plain-text value each, in this fixed order.
    row_keys = ('sector', 'industry', 'hqlocation', 'website',
                'yearsonlist', 'employees')
    for row_index, key in enumerate(row_keys, 1):
        info[key] = rows[row_index].find_all('td')[1].text.strip()

    return info