Ejemplo n.º 1
0
def _fetch_pagelet_highlights(pagelet_num, max_days_ago):
    """Fetch the highlights listed on one AJAX page of the site.

    A multipart form is POSTed to ``ROOT_URL``; the JSON answer carries an
    HTML fragment under the ``output`` key, which is parsed for
    ``listing-inner`` entries.

    Args:
        pagelet_num: Page index, sent as the ``current_page`` form field.
        max_days_ago: Age limit handed to ``fetcher_footyroom.is_recent``.

    Returns:
        A list of ``SportyHLHighlight`` objects, one per video link found
        for every entry that passes all filters. The list is empty when the
        server answers ``INVALID TOKEN!``.
    """
    highlights = []

    # Hand-built multipart/form-data body. It must stay byte-identical to
    # what the site expects, so the literal is kept as-is; only the
    # `current_page` value is dynamic.
    payload = "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"query[category]\"\r\n\r\n58,29,72,69,30,65,907,31,419,67,18,417,25,63,82,28,256,902\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"query[count]\"\r\n\r\n15\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"query[order_by]\"\r\n\r\ndate\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"query[order]\"\r\n\r\nDESC\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"query[style]\"\r\n\r\nlisting-classic\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"query[show_excerpt]\"\r\n\r\n0\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"query[cats-tags-condition]\"\r\n\r\nand\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"query[cats-condition]\"\r\n\r\nin\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"query[tags-condition]\"\r\n\r\nin\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"query[featured_image]\"\r\n\r\n0\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"query[ignore_sticky_posts]\"\r\n\r\n1\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"query[disable_duplicate]\"\r\n\r\n0\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"query[paginate]\"\r\n\r\nmore_btn\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"query[pagination-show-label]\"\r\n\r\n0\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"query[columns]\"\r\n\r\n3\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"query[override-listing-settings]\"\r\n\r\n0\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"query[_layout][state]\"\r\n\r\n1|1|1\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"query[_layout][page]\"\r\n\r\n1-col\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"view\"\r\n\r\nPublisher_Classic_Listing_1_Shortcode\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"current_page\"\r\n\r\n" \
              + str(pagelet_num) + "\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"_bs_pagin_token\"\r\n\r\n2670529\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"action\"\r\n\r\npagination_ajax\r\n" \
              "------WebKitFormBoundary7MA4YWxkTrZu0gW--"

    headers = {
        'content-type':
        "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
        'cache-control': "no-cache",
    }

    page = requests.request("POST", ROOT_URL, data=payload, headers=headers)

    # The server answers with this plain-text marker instead of JSON when
    # the pagination token is rejected.
    if page.text == 'INVALID TOKEN!':
        return []

    html = json.loads(page.text)['output'] \
        .replace("\n", "") \
        .replace("\t", "") \
        .replace("\\", "")

    soup = BeautifulSoup(html, 'html.parser')

    # Extract videos
    for vid in soup.find_all(class_='listing-inner'):

        # Extract match name (skip entries without a title anchor instead
        # of crashing on a missing element)
        title = vid.find(class_='title')
        title_anchor = title.find('a') if title is not None else None

        if title_anchor is None:
            continue

        match_name = str(title_anchor.get_text())

        if 'vs' not in match_name:
            # Check that the highlight is for a match
            continue

        # Extract view count - NOT AVAILABLE for this website
        view_count = 0

        # Extract category
        info = vid.find(class_='term-badge')
        category_anchor = info.find('a') if info is not None else None

        if category_anchor is None:
            continue

        category = str(category_anchor.get_text())

        # Extract time since video added
        post_meta = vid.find(class_='post-meta')
        date = post_meta.find('time') if post_meta is not None else None

        if date is None:
            continue

        parsed_date = dateparser.parse(str(date.get_text()))

        # dateparser returns None for text it cannot interpret; this must be
        # checked BEFORE calling .replace(), otherwise the entry crashes the
        # whole fetch with an AttributeError.
        # TODO: handle case where date malformed (special string field)
        if not parsed_date:
            continue

        # The parsed value's hour and minute are overwritten with the
        # current ones.
        now = datetime.now()
        time_since_added_date = parsed_date.replace(
            hour=now.hour, minute=now.minute)
        time_since_added = str(time_since_added_date)

        if not fetcher_footyroom.is_recent(time_since_added_date,
                                           max_days_ago):
            continue

        # Extract image link
        image = vid.find(class_='img-holder')

        if not image:
            continue

        img_link = image.get('data-src')

        # Extract link (the same element carries both image and href)
        page_link = str(image.get('href'))

        if not _is_valid_link(page_link):
            continue

        video_links = _get_video_links(page_link)

        if not video_links:
            continue

        # Add multiple video links
        for video_type, video_link in video_links:
            highlights.append(
                SportyHLHighlight(video_link, match_name, img_link,
                                  view_count, category, time_since_added,
                                  video_type))

    return highlights
Ejemplo n.º 2
0
def _fetch_pagelet_highlights(pagelet_num, max_days_ago):
    """Fetch the highlights listed on one page of the site.

    The listing page at ``ROOT_URL + PAGELET_EXTENSION + str(pagelet_num)``
    is downloaded and every ``vidthumb`` entry is inspected. For each entry
    that passes the filters, its own page is downloaded as well to collect
    the video links, the score and the goal data.

    Args:
        pagelet_num: Page index appended to the listing URL.
        max_days_ago: Age limit handed to ``fetcher_footyroom.is_recent``.

    Returns:
        A list of ``OurMatchHighlight`` objects, one per video link found.
    """
    highlights = []

    page = requests.get(ROOT_URL + PAGELET_EXTENSION + str(pagelet_num))
    soup = BeautifulSoup(page.content, 'html.parser')

    # Extract videos
    for vid in soup.find_all(class_='vidthumb'):
        thumb = vid.find(class_='thumb')

        if not thumb:
            continue

        anchor = thumb.find('a')

        if anchor is None:
            continue

        # Extract match name
        match_name = str(anchor.get('title'))

        if 'vs' not in match_name:
            # Check that the highlight is for a match
            continue

        # Extract view count. The text is either a plain number or a value
        # suffixed with "K" (thousands); it is always converted to an int so
        # callers never receive a raw string. Unparseable text leaves 0.
        video_info = vid.find(class_="count")
        view_count = 0

        if video_info:
            count = video_info.get_text().strip().replace(',', '')

            try:
                if 'K' in count:
                    # round() avoids truncating float artefacts such as
                    # 1.001 * 1000 == 1000.9999999999999
                    view_count = int(round(
                        float(count.replace('K', '')) * 1000))
                else:
                    view_count = int(count)
            except ValueError:
                view_count = 0

        # Extract category
        info = vid.find(class_='flecha')

        if not info:
            continue

        category = str(info.get_text())

        # Extract time since video added
        date = vid.find(class_='time')

        if not date:
            continue

        parsed_date = dateparser.parse(str(date.get_text()))

        # dateparser returns None for text it cannot interpret; this must be
        # checked BEFORE calling .replace(), otherwise the entry crashes the
        # whole fetch with an AttributeError.
        # TODO: handle case where date malformed (special string field)
        if not parsed_date:
            continue

        # The parsed value's hour and minute are overwritten with the
        # current ones.
        now = datetime.now()
        time_since_added_date = parsed_date.replace(
            hour=now.hour, minute=now.minute)
        time_since_added = str(time_since_added_date)

        if not fetcher_footyroom.is_recent(time_since_added_date,
                                           max_days_ago):
            continue

        # Extract image link
        image = thumb.find('img')

        if not image:
            continue

        img_link = str(image.get('src'))

        # Extract link
        page_link = str(anchor.get('href'))

        if not _is_valid_link(page_link):
            continue

        # Get highlight page HTML (kept in separate names so the listing
        # page/soup are not clobbered while iterating over them)
        detail_page = requests.get(page_link)
        detail_soup = BeautifulSoup(detail_page.content, 'html.parser')

        video_links = _get_video_links(detail_soup)

        if not video_links:
            continue

        score = _get_match_score(detail_soup)

        # Goal data is best-effort: any failure while extracting it must not
        # discard the highlight itself.
        try:
            goal_data = fetcher_score_ourmatch.get_goal_data(detail_soup)
        except Exception:
            goal_data = []

        # Add multiple video links
        for video_type, video_link in video_links:
            h = OurMatchHighlight(video_link, match_name, img_link,
                                  view_count, category, time_since_added,
                                  goal_data, video_type)

            if score:
                h.set_score(score[0], score[1])

            highlights.append(h)

    return highlights
Ejemplo n.º 3
0
def _fetch_pagelet_highlights(pagelet_num, max_days_ago):
    """Fetch the "BBC Match of the Day" highlights listed at ``ROOT_URL``.

    Only entries whose title contains "bbc match of the day"
    (case-insensitive) are kept.

    Args:
        pagelet_num: Unused by this implementation; ``ROOT_URL`` is always
            requested as-is.
        max_days_ago: Age limit handed to ``fetcher_footyroom.is_recent``.

    Returns:
        A list of ``HighlightsFootballHighlight`` objects, one per video
        link found.
    """
    highlights = []

    page = requests.get(ROOT_URL)
    soup = BeautifulSoup(page.text, 'html.parser')

    # Extract videos
    for vid in soup.find_all(class_='td_module_1'):

        # The same element provides the title, the thumbnail and the link
        image = vid.find(class_='td-image-wrap')

        if image is None:
            continue

        # Extract match name
        match_name = str(image.get('title'))

        if 'bbc match of the day' not in match_name.lower():
            # Check that the highlight is for a match
            continue

        # Extract view count - not extracted for this website. It has to be
        # defined here: the highlight constructor below receives it, and
        # leaving it unset raised a NameError.
        view_count = 0

        # Extract category
        info = vid.find(class_='td-post-category')

        if not info:
            continue

        category = str(info.get_text())

        # Extract time since video added
        date = vid.find(class_='td-module-date')

        if not date:
            continue

        parsed_date = dateparser.parse(str(date.get_text()))

        # dateparser returns None for text it cannot interpret; this must be
        # checked BEFORE calling .replace(), otherwise the entry crashes the
        # whole fetch with an AttributeError.
        # TODO: handle case where date malformed (special string field)
        if not parsed_date:
            continue

        # The parsed value's hour and minute are overwritten with the
        # current ones.
        now = datetime.now()
        time_since_added_date = parsed_date.replace(
            hour=now.hour, minute=now.minute)
        time_since_added = str(time_since_added_date)

        if not fetcher_footyroom.is_recent(time_since_added_date,
                                           max_days_ago):
            continue

        # Extract image link from the inline CSS of the inner <span>; the
        # link stays empty when the span or its style is missing.
        img_link = ''
        span = image.find("span")
        style = span.get("style") if span is not None else None

        if style:
            search_result = re.search(r"background-image: url\((.*?)\)",
                                      style)

            if search_result:
                img_link = search_result.group(1)

        # Extract link
        page_link = str(image.get("href"))

        if not _is_valid_link(page_link):
            continue

        video_links = _get_video_links(page_link)

        if not video_links:
            continue

        for video_type, video_link in video_links:
            highlights.append(
                HighlightsFootballHighlight(video_link, match_name, img_link,
                                            view_count, category,
                                            time_since_added, video_type))

    return highlights
Ejemplo n.º 4
0
def _fetch_pagelet_highlights(pagelet_num, max_days_ago):
    """Fetch the highlights of one ``td_ajax_block`` page of the site.

    A form is POSTed to ``ROOT_URL``; the JSON answer carries an HTML
    fragment under the ``td_data`` key, which is parsed for ``td_module_1``
    entries.

    Args:
        pagelet_num: Page index; sent as ``td_current_page`` after adding 1.
        max_days_ago: Age limit handed to ``fetcher_footyroom.is_recent``.

    Returns:
        A list of ``HighlightsFootballHighlight`` objects, one per entry
        that passes all filters and yields a video link.
    """
    highlights = []

    page = requests.post(ROOT_URL, data={
        'action': 'td_ajax_block',
        'block_type': 'td_block_3',
        'td_current_page': pagelet_num + 1
    })

    html = json.loads(page.text)['td_data'] \
        .replace("\n", "") \
        .replace("\t", "") \
        .replace("\\", "")

    soup = BeautifulSoup(html, 'html.parser')

    # Extract videos
    for vid in soup.find_all(class_='td_module_1'):

        # The <img> element provides both the title and the thumbnail, so it
        # is looked up once and checked before being dereferenced.
        image = vid.find('img')

        if image is None:
            continue

        # Extract match name
        match_name = str(image.get('title'))

        if 'vs' not in match_name:
            # Check that the highlight is for a match
            continue

        # Extract view count - NOT AVAILABLE for this website
        view_count = 0

        # Extract category
        info = vid.find(class_='td-post-category')

        if not info:
            continue

        category = str(info.get_text())

        # Extract time since video added
        date = vid.find(class_='td-module-date')

        if not date:
            continue

        parsed_date = dateparser.parse(str(date.get_text()))

        # dateparser returns None for text it cannot interpret; this must be
        # checked BEFORE calling .replace(), otherwise the entry crashes the
        # whole fetch with an AttributeError.
        # TODO: handle case where date malformed (special string field)
        if not parsed_date:
            continue

        # The parsed value's hour and minute are overwritten with the
        # current ones.
        now = datetime.now()
        time_since_added_date = parsed_date.replace(
            hour=now.hour, minute=now.minute)
        time_since_added = str(time_since_added_date)

        if not fetcher_footyroom.is_recent(time_since_added_date,
                                           max_days_ago):
            continue

        # Extract image link
        img_link = str(image.get("src"))

        # Extract link
        link_tag = vid.find("a")

        if link_tag is None:
            continue

        link = str(link_tag.get("href"))

        if not _is_valid_link(link):
            continue

        video_link = _get_video_link(link)

        if not video_link:
            continue

        highlights.append(
            HighlightsFootballHighlight(video_link, match_name, img_link,
                                        view_count, category,
                                        time_since_added))

    return highlights
Ejemplo n.º 5
0
def _fetch_pagelet_highlights(pagelet_num, max_days_ago):
    """Fetch the highlights listed on one page of the site.

    The listing page at ``ROOT_URL + PAGELET_EXTENSION + str(pagelet_num)``
    is downloaded through ``PROXY`` and every element with id ``cocog`` is
    inspected.

    Args:
        pagelet_num: Page index appended to the listing URL.
        max_days_ago: Age limit handed to ``fetcher_footyroom.is_recent``.

    Returns:
        A list of ``HoofootHighlight`` objects, one per video link found for
        every entry that passes all filters.
    """
    highlights = []

    page = PROXY.get(ROOT_URL + PAGELET_EXTENSION + str(pagelet_num))
    soup = BeautifulSoup(page.content, 'html.parser')

    # Extract videos
    for vid in soup.find_all(id="cocog"):

        # Extract link
        link_tag = vid.find("a")

        if link_tag is None:
            continue

        link = str(link_tag.get("href"))

        if not _is_valid_link(link):
            continue

        # Extract image link
        image = link_tag.find("img")

        if not image:
            continue

        img_link = str(image.get("src"))

        # Extract match name
        match_name = str(image.get("alt"))

        if ' - ' not in match_name:
            continue

        # Extract view count - NOT AVAILABLE for this website
        view_count = 0

        # Extract category
        info = vid.find(class_="info")

        if not info:
            continue

        info_img = info.find("img")

        if not info_img:
            continue

        category = str(info_img.get("alt"))

        # Extract time since video added
        info_font = info.find("font")

        if not info_font:
            continue

        time_since_added = str(info_font.get_text())
        time_since_added_date = dateparser.parse(time_since_added)

        # If error occur while parsing date, skip
        # TODO: handle case where date malformed (special string field)
        if not time_since_added_date:
            continue

        if not fetcher_footyroom.is_recent(time_since_added_date,
                                           max_days_ago):
            continue

        # Resolve the video links last: every check above only inspects the
        # already-downloaded listing, so entries that are filtered out never
        # trigger this lookup.
        full_link = _form_full_link(link)
        video_links = _get_video_links(full_link)

        if not video_links:
            continue

        for video_type, video_link in video_links:
            highlights.append(
                HoofootHighlight(video_link, match_name, img_link,
                                 view_count, category, time_since_added,
                                 video_type))

    return highlights