Example #1
    def get_episodes(self, show_id, season):
        r = self.session.get(f"{BASE_URL}/show/{show_id}/{season}")
        r.raise_for_status()
        sopa = bso(r.content, "lxml")
        tables = sopa.find_all("tr")
        seasons = [i.text for i in tables[1].find_all("a")]

        if season not in seasons:
            return

        season_subs = []
        episodes = []

        for tr in range(len(tables)):
            data = tables[tr].find_all("td")

            title = self.title_available(data)
            if title:
                episodes.append(title)

            source_var = self.source_separator(data)
            if not source_var:
                continue

            season_subs += list(
                self.scrape_episode_info(source_var, tables, tr))

        return list(self.get_episode_dicts(episodes, season_subs, season))
Example #2
    def download_lastfm_pl(self, playlist_url):
        # Apparently, last fm API doesn't have a playlist endpoint. If you
        # find out that it has, please fix this!
        try:
            r = requests.get(playlist_url, timeout=10)
        except requests.exceptions.RequestException as e:
            logger.error(f"{RED}Playlist download failed: {e}")
            return
        soup = bso(r.content, "html.parser")
        artists = [artist.text for artist in soup.select(ARTISTS_SELECTOR)]
        titles = [title.text for title in soup.select(TITLE_SELECTOR)]

        track_list = []
        if len(artists) == len(titles) and artists:
            track_list = [
                artist + " " + title for artist, title in zip(artists, titles)
            ]

        if not track_list:
            logger.info(f"{OFF}Nothing found")
            return

        pl_title = sanitize_filename(soup.select_one("h1").text)
        pl_directory = os.path.join(self.directory, pl_title)
        logger.info(f"{YELLOW}Downloading playlist: {pl_title} "
                    f"({len(track_list)} tracks)")

        for i in track_list:
            track_id = get_url_info(
                self.search_by_type(i, "track", 1, lucky=True)[0])[1]
            if track_id:
                self.download_from_id(track_id, False, pl_directory)

        if not self.no_m3u_for_playlists:
            make_m3u(pl_directory)
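Note on Example #2: the comment points out that the last.fm API has no playlist endpoint, which is why the playlist page is scraped directly. Below is a minimal, self-contained sketch of that scrape-and-zip pattern; the CSS selectors and the standalone function name are assumptions for illustration, not the constants used by the original module.

import requests
from bs4 import BeautifulSoup as bso

# Assumed selectors; the real ARTISTS_SELECTOR / TITLE_SELECTOR constants are
# defined elsewhere in the original project.
ARTISTS_SELECTOR = "td.chartlist-artist a"
TITLE_SELECTOR = "td.chartlist-name a"

def scrape_tracks(playlist_url):
    r = requests.get(playlist_url, timeout=10)
    r.raise_for_status()
    soup = bso(r.content, "html.parser")
    artists = [a.text.strip() for a in soup.select(ARTISTS_SELECTOR)]
    titles = [t.text.strip() for t in soup.select(TITLE_SELECTOR)]
    # Only pair the two lists when they line up one-to-one.
    if artists and len(artists) == len(titles):
        return [f"{artist} {title}" for artist, title in zip(artists, titles)]
    return []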
Example #3
def random_posts(html):
    soup = bso(html, "lxml")
    posts = list(soup.findAll(attrs={"role": "main"})[0].children)
    result = []
    for i in range(3):
        result.append(str(posts[random.randint(0, len(posts) - 1)]))
    return result
Example #4
    def getFakeAccData(self, response):
        beautifulContent = bso(response.content, "html.parser")

        mailAdress = self.getMailAdress(beautifulContent)
        cookies = self.getCookies(response)

        return mailAdress, cookies
Example #5
def parse_pdf(path):
    """
    Parses a pdf to string. Reads the pdf from the given path
    """
    raw = parser.from_file(path, xmlContent=True)
    soup = bso(raw['content'], 'lxml')
    pages = soup.find_all('div', attrs={'class': 'page'})
    print(pages)
    return pages
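For context, tika's parser.from_file(path, xmlContent=True) returns a dict whose 'content' value is the document rendered as XHTML, with one <div class="page"> per PDF page, which is what the find_all call above collects. A hedged usage sketch that flattens those page divs into plain text (sample.pdf is a hypothetical file name):

from tika import parser  # requires the tika package and a Java runtime
from bs4 import BeautifulSoup as bso

def pdf_pages_to_text(path):
    raw = parser.from_file(path, xmlContent=True)
    soup = bso(raw["content"], "lxml")
    # Each PDF page appears as <div class="page"> in tika's XHTML output.
    return [div.get_text(separator="\n").strip()
            for div in soup.find_all("div", attrs={"class": "page"})]

# Hypothetical usage:
# for number, text in enumerate(pdf_pages_to_text("sample.pdf"), start=1):
#     print(number, text[:80])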
Example #6
    def index_titles(self):
        r = self.session.get(f"{BASE_URL}/series.php?/")
        r.raise_for_status()
        soup = bso(r.content, "html.parser")

        for a in soup.find_all("a"):
            href_url = a.get("href")
            if "show" in href_url:
                yield {"title": a.text, "url": href_url}
Example #7
    def index_titles(self):
        r = self.session.get(BASE)
        r.raise_for_status()
        soup = bso(r.content, "html.parser")
        titles = []
        for a in soup.find_all("a"):
            href_url = a.get("href")
            if href_url and "show" in href_url:
                titles.append({"title": a.text, "url": href_url})
        return titles
Example #8
def get_tickers_list_SP500():
    import urllib
    from urllib import request
    from bs4 import BeautifulSoup as bso

    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    # Get page
    if not os.path.exists("wiki_tickers_page.txt"):
        get_url = request.urlopen(url).read().decode("utf-8")
        with open("wiki_tickers_page.txt", "w") as file:
            file.write(get_url)
    else:
        with open("wiki_tickers_page.txt", "r") as file:
            get_url = file.read()

    # Get Ticker
    bsdoc = bso(get_url, "html.parser")
    table = bsdoc.find("table", {"id": "constituents"})
    # print(table)
    table = table.find("tbody")
    if not os.path.exists("tickers.txt"):
        # print(get_url)
        a_ = table.findAll("a", {"class": "external text"})
        tickers = []
        for a in a_:
            if a.getText() != "reports":
                tickers.append(str(a.getText()))

        with open("tickers.txt", "w") as file:
            file.write(json.dumps(tickers))
            print("tickers file created")
    else:
        with open("tickers.txt", "r") as file:
            tickers = json.loads(file.read())

    if not os.path.exists("tickers_names.txt"):
        names = []
        trs = table.findAll("tr")
        i = 0
        for tr in trs:
            #print(tr)
            tds = tr.findAll("td")
            #ais = bso.findAll(tds, "a")
            for i, td in enumerate(tds):
                #print(i, td)
                if i == 1:
                    name = td.find("a").getText()
                    names.append(name)
        with open("tickers_names.txt", "w") as file:
            file.write(json.dumps(names))
    else:
        with open("tickers_names.txt", "r") as file:
            names = json.loads(file.read())

    return tickers, names
Example #9
    def getInboxLinks(self, response):
        beautifulContent = bso(response.content, "html.parser")
        inboxElements = beautifulContent.find("table")
        trimmedLinks = []

        for mails in inboxElements.find_all(True, recursive=False):
            link = mails.attrs.get('onclick')
            if link:
                extractedLink = re.findall("readmail.html.*'", link)
                trimmedLinks.append(extractedLink[0].strip("'"))
        return trimmedLinks
Example #10
    def getTwitterCode(self, response):
        beautifulContent = bso(response.content, "html.parser")
        codes = beautifulContent.findAll("td",
                                         attrs={
                                             "class": "h1 black",
                                             "dir": "ltr"
                                         })
        for code in codes:
            code = code.getText()
            if code.isnumeric():
                return code
        return -1
Example #11
def get_html_soup_object(url):
    try:
        fp = urllib.request.urlopen(url)
        mybytes = fp.read()
        fp.close()
    except HTTPError as e:
        mybytes = e.read()
    except URLError:
        print("Błąd sieci! Upewnij się czy działa połączenie z internetem.")
        sys.exit(2)

    html = mybytes.decode("utf8")

    return bso(html, "html.parser")
Example #12
    def get_arg_links(moviePag):
        movieSop = bso(moviePag.content, "html.parser")
        movieJson = json.loads(movieSop.text)
        movieTitle = movieJson["title"]
        try:
            for rele in movieJson["releases"]:
                if rele["subtitles"]:
                    for uri in rele["subtitles"]:
                        self.appendSubs(
                            movieTitle,
                            uri["uri"].rsplit("/", 1)[-1],
                            uri["uri"],
                            "argenteam.net",
                        )
        except KeyError:
            pass
Example #13
    def get_subdivx(self):
        subdivx = "{}{}{}".format(subdivx_base, self.query, subdivx_query)
        page = requests.get(subdivx)
        soup = bso(page.content, "html.parser")
        title = soup.find_all(id="menu_titulo_buscador")
        desc = soup.find_all(id="buscador_detalle_sub")
        url = soup.find_all("a", class_="titulo_menu_izq")

        for t, d, u in zip(title, desc, url):
            self.appendSubs(
                t.text.replace("Subtitulos de ", ""),
                d.text,
                u.get("href"),
                "subdivx.com",
            )
        if not title:
            print("Sin resultados en Subdivx")
Example #14
def get_tickers_list_PA():
    import urllib
    from urllib import request
    from bs4 import BeautifulSoup as bso
    from string import ascii_uppercase

    alphabet = [letter for letter in ascii_uppercase]
    url = "https://www.borsaitaliana.it/borsa/azioni/listino-a-z.html?initial="

    for letter in alphabet:
        ticker_page = request.urlopen(url + letter).read()
        table = bso(ticker_page,
                    "html.parser").find("table",
                                        {"class": "m-table -firstlevel"})
        table_body = table.findAll("tr")

        for row in table_body:
            print(row)
        print(table)
Example #15
    def scrape_download_url(self, episode_dict):
        logger.debug("Scrapping download URL")
        r = self.session.get(episode_dict["download_url"])
        r.raise_for_status()

        discriminator = f".{episode_dict['season']}.{episode_dict['episode']}."
        soup = bso(r.content, "lxml")

        for url, selected in zip(soup.select(CSS1), soup.select(CSS2)):
            meta = ".".join(
                selected.get("href").split(discriminator)[-1].split(".")[:-1])
            if meta in episode_dict["download_url"]:

                id_url = url.find_all("a")[0].get("href")
                query = parse.parse_qs(parse.urlparse(id_url).query)
                sub_id = query["id"][0]
                lang_id = query["lang"][0]
                version_ = query["fversion"][0]

                return f"{BASE_URL}/updated/{lang_id}/{sub_id}/{version_}"
Example #16
    def parse(self, response):
        '''Need regex to grab the time from the date and price attributes'''

        # print(response.css('ul.search-main-content__events-list').extract())

        ul_selector = response.css(
            "ul.search-main-content__events-list").extract_first()

        soup = bso(ul_selector, "lxml")
        unordered_list = soup.find("ul")

        print(unordered_list)
        # print(first_child.findChildren())
        list_of_tags = unordered_list.children
        # print(list_of_tags)

        all_events = list()

        # print(list_of_tags)

        for item in list_of_tags:
            event = Event()
            div_container = response.css(
                'div.eds-media-card-content__content__principal')
            # print(div_container)
            event['title'] = div_container.css(
                "a.eds-media-card-content__action-link h3.eds-media-card-content__title.eds-text-color--grey-800.eds-text-bl div.card-text--truncated__three::text"
            ).extract_first()
            event['date'] = div_container.css(
                'div.eds-media-card-content__sub-content div.eds-text-bs--fixed.eds-text-color--grey-600.eds-l-mar-top-1::text'
            ).extract_first()
            event['location'] = div_container.css(
                'div.eds-media-card-content__sub-content div.eds-media-card-content__sub-content-cropped div.eds-text-bs--fixed.eds-text-color--grey-600.eds-l-mar-top-1 div.card-text--truncated__one::text'
            ).extract_first()
            event['price'] = div_container.css(
                'div.eds-media-card-content__sub-content div.eds-media-card-content__sub-content-cropped div.eds-text-bs--fixed.eds-text-color--grey-600.eds-l-mar-top-1::text'
            ).extract_first()

            all_events.append(event)

        return all_events
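The docstring in Example #16 notes that a regex is still needed to pull the time out of the scraped date text and the amount out of the price text. A hedged sketch of what that extraction could look like; the patterns and sample strings are assumptions about Eventbrite's formatting, not taken from the original spider.

import re

TIME_RE = re.compile(r"\b\d{1,2}:\d{2}\s*(?:AM|PM)\b", re.IGNORECASE)
PRICE_RE = re.compile(r"\$\d+(?:\.\d{2})?")

def extract_time(date_text):
    # e.g. "Tomorrow at 7:00 PM" -> "7:00 PM"
    match = TIME_RE.search(date_text or "")
    return match.group(0) if match else None

def extract_price(price_text):
    # e.g. "Starts at $25.00" -> "$25.00"
    match = PRICE_RE.search(price_text or "")
    return match.group(0) if match else None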
Example #17
    def download_lastfm_pl(self, playlist_url):
        # Apparently, last fm API doesn't have a playlist endpoint. If you
        # find out that it has, please fix this!
        r = requests.get(playlist_url)
        soup = bso(r.content, "html.parser")
        artists = [artist.text for artist in soup.select(ARTISTS_SELECTOR)]
        titles = [title.text for title in soup.select(TITLE_SELECTOR)]

        track_list = []
        if len(artists) == len(titles) and artists:
            track_list = [
                artist + " " + title for artist, title in zip(artists, titles)
            ]

        if not track_list:
            print("Nothing found")
            return

        pl_title = sanitize_filename(soup.select_one("h1").text)
        print("Downloading playlist: " + pl_title)
        self.directory = os.path.join(self.directory, pl_title)
        for i in track_list:
            track_url = self.search_by_type(i, "track", 1, lucky=True)[0]
            if track_url:
                self.handle_url(track_url)
Example #18
    def get_argenteam(self):
        argenteam_search = "%s?q=%s" % (api_search, self.query)
        page = requests.get(argenteam_search)
        soup = bso(page.content, "html.parser")
        arg_json = json.loads(soup.text)

        def get_arg_links(moviePag):
            moviePag = requests.get("%s?id=%s" % (api_movie, arg_id))
            movieSop = bso(moviePag.content, "html.parser")
            movieJson = json.loads(movieSop.text)
            movieTitle = movieJson["title"]
            try:
                for rele in movieJson["releases"]:
                    if rele["subtitles"]:
                        for uri in rele["subtitles"]:
                            self.appendSubs(
                                movieTitle,
                                uri["uri"].rsplit("/", 1)[-1],
                                uri["uri"],
                                "argenteam.net",
                            )
            except KeyError:
                pass

        for tipo in arg_json["results"]:
            mov_o_tv = tipo["type"]
            arg_id = tipo["id"]
            try:
                if mov_o_tv == "movie":
                    moviePag = requests.get("{}?id={}".format(api_movie, arg_id))
                    get_arg_links(moviePag)
                else:
                    moviePag = requests.get("{}?id={}".format(api_episode, arg_id))
                    get_arg_links(moviePag)
            except AttributeError:
                print("Sin resultados en Argenteam")
Example #19
def get_content_entries(counter):
    html = get_html()
    if html:
        soup = bso(html, "html.parser")
        bodies = soup.find_all("body")

        titles = soup.find_all("title")
        letters_body_list = []
        for idx, (body, title) in enumerate(zip(bodies, titles)):
            letter_dict = {
                "letter_number": idx,  # ex-letter_order
                "letter_title": title.text,
                "body": body
            }
            letters_body_list.append(letter_dict)

        wasted_letters_numbers = [2, 4, 8, 16]
        without_li_letters_numbers = [6, 7, 10, 11]
        strong_theme_letters = [5, 9, 12, 13, 14, 15, 17]

        for idx, letter in enumerate(letters_body_list):
            # for i in without_li_letters_numbers:
            #     print(letters_body_list[i]["letter_title"])
            # exit(0)
            # continue
            # letter = letters_body_list[10]
            # letter["letter_number"] = 12
            # letter["body"] = letters_body_list[12]["body"]
            # letter["letter_title"] = letters_body_list[12]["letter_title"]
            if letter["letter_number"] in wasted_letters_numbers:
                continue
            else:
                section_name = letter["letter_title"]
                modified_date = get_modified_date(section_name)

                get_all_a_in_text(letter["body"], letter["letter_title"],
                                  modified_date)

                # # text = get_text(letter["body"], 31, -16)
                description = section_name
                lesson_name = pretifier(section_name)
                url = "text_" + str(counter)
                counter += 1
                write_to_db(lesson_name,
                            description,
                            section_name,
                            url,
                            modified_date=modified_date,
                            url_description=str(letter["body"]))

                if letter["letter_number"] == 5:  # виртуальное окружение
                    ol = letter["body"].find('ol')
                    for li in ol.find_all('li'):
                        list_a = li.find_all('a')
                        for a in list_a:
                            description = li.text
                            lesson_name = pretifier(li.text)
                            section_name = letter["letter_title"]
                            url_description = a.text
                            url, type = get_url_and_url_type(a)
                            slug = get_slug_url(
                                lesson_name, section_name
                            )  # maybe description should be passed instead of lesson_name. Check!
                            write_to_db(lesson_name,
                                        description,
                                        section_name,
                                        url,
                                        modified_date=modified_date,
                                        url_description=url_description,
                                        type=type,
                                        slug=slug)

                if letter["letter_number"] in strong_theme_letters:  # weeks 4, 5, 6, 7, 8 (tests), 9
                    strongs = letter["body"].find_all("strong")
                    for strong in strongs:
                        next_ = strong.find_next()
                        if next_.li:
                            li_list = next_.find_all("li")
                            for li in li_list:
                                section_name = strong.text
                                lesson_name = pretifier(li.text)
                                description = li.text
                                url_description = letter["letter_title"]
                                url, type = get_url_and_url_type(li.a)
                                slug = get_slug_url(
                                    lesson_name, section_name
                                )  # maybe description should be passed instead of lesson_name. Check!
                                write_to_db(lesson_name,
                                            description,
                                            section_name,
                                            url,
                                            modified_date=modified_date,
                                            url_description=url_description,
                                            type=type,
                                            slug=slug)
                        elif "проекты" in strong.text.lower():
                            next_next_ = next_.find_next().find_next()
                            section_name = letter["letter_title"]
                            url_description = next_next_.text
                            lesson_name = pretifier(strong.text)
                            description = strong.text
                            url, type = get_url_and_url_type(next_next_)
                            slug = get_slug_url(
                                lesson_name, section_name
                            )  # maybe description should be passed instead of lesson_name. Check!
                            write_to_db(lesson_name,
                                        description,
                                        section_name,
                                        url,
                                        modified_date=modified_date,
                                        url_description=url_description,
                                        type=type,
                                        slug=slug)

                            next_next_next_ = next_next_.find_next()
                            section_name = letter["letter_title"]
                            url_description = next_next_next_.text
                            description = strong.text
                            lesson_name = pretifier(strong.text)
                            url, type = get_url_and_url_type(next_next_next_)
                            slug = get_slug_url(
                                lesson_name, section_name
                            )  # maybe description should be passed instead of lesson_name. Check!
                            write_to_db(lesson_name,
                                        description,
                                        section_name,
                                        url,
                                        modified_date=modified_date,
                                        url_description=url_description,
                                        type=type,
                                        slug=slug)
                        elif "трек" in strong.text.lower(
                        ) or "дополнительно" in strong.text.lower():
                            # continue
                            if "5-й недели" in letter["letter_title"].lower():
                                next_next_ = next_.find_next().find_next(
                                ).find_next()
                            elif "трек" in strong.text.lower(
                            ) or "9 недели" in letter["letter_title"].lower():
                                next_next_ = next_.find_next().find_next()
                            elif "дополнительно" in strong.text.lower():
                                next_next_ = next_.find_next()
                            li_list = next_next_.find_all("li")
                            for li in li_list:
                                section_name = strong.text.replace('"', '')
                                lesson_name = pretifier(li.text)
                                description = li.text
                                url_description = letter["letter_title"]
                                url, type = get_url_and_url_type(li.a)
                                slug = get_slug_url(
                                    lesson_name, section_name
                                )  # maybe description should be passed instead of lesson_name. Check!
                                write_to_db(lesson_name,
                                            description,
                                            section_name,
                                            url,
                                            modified_date=modified_date,
                                            url_description=url_description,
                                            type=type,
                                            slug=slug)

                elif letter["letter_number"] in without_li_letters_numbers:  # Trello, projects, extra assignments
                    lesson_name = pretifier(section_name)
                    description = section_name
                    regex = re.compile(
                        r".*gmail.com.*|.*subscri*.|why did I get this")
                    tables = letter["body"].find_all("table")
                    for td in tables:
                        list_a = td.find_all('a')
                        for a in list_a:
                            match = regex.search(a.text)
                            if not match:
                                url, type = get_url_and_url_type(a)
                                url_description = a.text
                                slug = get_slug_url(
                                    lesson_name, section_name
                                )  # maybe description should be passed instead of lesson_name. Check!
                                write_to_db(lesson_name,
                                            description,
                                            section_name,
                                            url,
                                            modified_date=modified_date,
                                            type=type,
                                            url_description=url_description,
                                            slug=slug)

                else:
                    # weeks: 0 (welcome), 1, 2

                    # text = get_text(letter["body"], 31, -16)
                    # theme_name = section_name
                    # url = "text_" + str(counter)
                    # counter += 1
                    # slug = get_slug_url(theme_name, section_name)
                    # write_to_db(theme_name, section_name, url, modified_date=modified_date, description=str(letter["body"]), slug=slug)

                    # continue

                    li_els = letter["body"].find_all('li')
                    for li in li_els:
                        if li.a:
                            a_els = li.find_all('a')
                            for a in a_els:
                                lesson_name = pretifier(li.text)
                                description = li.text
                                url_description = a.text
                                url, type = get_url_and_url_type(a)
                                slug = get_slug_url(
                                    lesson_name, section_name
                                )  # maybe description should be passed instead of lesson_name. Check!
                                write_to_db(lesson_name,
                                            description,
                                            section_name,
                                            url,
                                            modified_date=modified_date,
                                            url_description=url_description,
                                            type=type,
                                            slug=slug)
    else:
        print('\t[Error] No datafile found')
Example #20
    def __init__(self, data):
        self.engine = bso(data, "lxml")
Example #21
    def updateData(self, data):
        self.engine = bso(data, "lxml")
Example #22
def convert_one(input_file):

    with open(input_file) as infile:
        soup = bso(infile, 'xml')

    newsoup = bso('<annotation></annotation>', 'xml')

    ann = newsoup.annotation

    filename = soup.annotation.filename.string
    folder = soup.annotation.folder.string

    folder_tag = newsoup.new_tag('folder')
    filename_tag = newsoup.new_tag('filename')

    folder_tag.string = "VOC10"
    filename_tag.string = filename

    ann.append(folder_tag)
    ann.append(filename_tag)

    source_tag = newsoup.new_tag('source')
    db_tag = newsoup.new_tag('database')
    db_tag.string = 'The VOC2007 Database'
    src_ann_tag = newsoup.new_tag('annotation')
    src_ann_tag.string = 'PASCAL VOC2007'
    img_tag = newsoup.new_tag('image')
    img_tag.string = 'flickr'

    source_tag.append(db_tag)
    source_tag.append(src_ann_tag)
    source_tag.append(img_tag)

    ann.append(source_tag)

    height = soup.find('nrows')  # height
    width = soup.find('ncols')  # width

    if height is not None and width is not None:
        size_tag = newsoup.new_tag('size')
        width_tag = newsoup.new_tag('width')
        width_tag.string = width.string
        height_tag = newsoup.new_tag('height')
        height_tag.string = height.string
        depth_tag = newsoup.new_tag('depth')
        depth_tag.string = '3'

        size_tag.append(width_tag)
        size_tag.append(height_tag)
        size_tag.append(depth_tag)
        ann.append(size_tag)

    segment_tag = newsoup.new_tag('segmented')
    segment_tag.string = '1'

    ann.append(segment_tag)

    objects = soup.find_all('object')

    for obj in objects:

        deleted = obj.find('deleted')

        if deleted.string == '1':
            continue

        object_tag = newsoup.new_tag('object')

        name = obj.find('name').string

        name_tag = newsoup.new_tag('name')
        name_tag.string = name

        pose_tag = newsoup.new_tag('pose')
        pose_tag.string = 'Unspecified'
        trunc_tag = newsoup.new_tag('truncated')
        trunc_tag.string = '0'
        difficult_tag = newsoup.new_tag('difficult')
        difficult_tag.string = '0'

        bndbox_tag = newsoup.new_tag('bndbox')

        x_list = []
        y_list = []
        for x in obj.find_all('x'):
            x_list.append(int(x.string))

        for y in obj.find_all('y'):
            y_list.append(int(y.string))

        x_max, x_min = max(x_list), min(x_list)
        y_max, y_min = max(y_list), min(y_list)

        xmin_tag = newsoup.new_tag('xmin')
        xmin_tag.string = str(x_min)

        ymin_tag = newsoup.new_tag('ymin')
        ymin_tag.string = str(y_min)

        xmax_tag = newsoup.new_tag('xmax')
        xmax_tag.string = str(x_max)

        ymax_tag = newsoup.new_tag('ymax')
        ymax_tag.string = str(y_max)

        bndbox_tag.append(xmin_tag)
        bndbox_tag.append(ymin_tag)
        bndbox_tag.append(xmax_tag)
        bndbox_tag.append(ymax_tag)

        object_tag.append(name_tag)
        object_tag.append(pose_tag)
        object_tag.append(trunc_tag)
        object_tag.append(difficult_tag)
        object_tag.append(bndbox_tag)

        ann.append(object_tag)

    return newsoup.encode_contents()
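A hedged usage sketch for convert_one above: convert every XML annotation in a source folder and write the VOC-style result next to it. The directory names are assumptions; convert_one returns bytes because it ends with encode_contents().

import os

def convert_folder(src_dir="annotations_src", dst_dir="annotations_voc"):
    os.makedirs(dst_dir, exist_ok=True)
    for name in os.listdir(src_dir):
        if not name.endswith(".xml"):
            continue
        converted = convert_one(os.path.join(src_dir, name))
        with open(os.path.join(dst_dir, name), "wb") as out:
            out.write(converted)  # encode_contents() returns bytes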
Example #23
    final_qr += val[1:i]
    try:
        if len(number) != 0:
            final_qr += str('"' + str(int(number)*qr_size) + '"')
    except:
        print('error in qr code generation')
        show_error = True
final_qr += new_file[-1][1:]

with open('newqrcode.svg', 'a+') as svg:
    svg.truncate(0)
    svg.write(final_qr)

s = ''
with open('newqrcode.svg') as svg:
    our_soup = bso(svg.read(), 'lxml')
    all_child = our_soup.find('svg')
    for child in all_child.findChildren():
        s += str(child)[:-8] + '/>\n'

with open('svg/part2.svg', 'a+') as write_file:
    write_file.truncate(0)
    write_file.write(s)

svg_file_name = input('Enter the svg file name you want to save : ')

with open('omr_svg/' + svg_file_name + '.svg', 'w') as svg_file:
    file = ''
    with open('svg/part1.svg', 'r') as p1:
        file += str(p1.read()) + '\n'
    with open('svg/part2.svg', 'r') as p2:
Example #24
    def get_episodes(self, show_id, season):
        logger.debug("https://www.tusubtitulo.com/show/{}/{}".format(
            show_id, season))
        r2 = self.session.get(
            "https://www.tusubtitulo.com/show/{}/{}".format(show_id, season), )
        r2.raise_for_status()
        sopa = bso(r2.content, "lxml")
        tables = sopa.find_all("tr")
        seasons = [i.text for i in tables[1].find_all("a")]
        if not self.is_season_available(seasons, season):
            logger.debug("Season not found")
            return
        season_subs = []
        episodes = []

        for tr in range(len(tables)):
            data = tables[tr].find_all("td")
            title = self.title_available(data)
            if title:
                episodes.append(title)
            source_var = self.source_separator(data)
            if source_var:
                inc = 1
                while True:
                    try:
                        content = tables[tr + inc].find_all("td")
                        language = content[4].text
                        if "eng" in language.lower():
                            language = "en"
                        elif "esp" in language.lower():
                            language = "es"
                        else:
                            language = None
                        completed = "%" not in content[5].text
                        url = content[6].find_all("a")[0].get("href")
                        sub_id = parse.parse_qs(
                            parse.urlparse(url).query)["id"][0]
                        lang_id = parse.parse_qs(
                            parse.urlparse(url).query)["lang"][0]
                        version_ = parse.parse_qs(
                            parse.urlparse(url).query)["version"][0]
                        download_url = (
                            "https://www.tusubtitulo.com/updated/{}/{}/{}".
                            format(lang_id, sub_id, version_))
                        if language and completed:
                            season_subs.append({
                                "episode_id": sub_id,
                                "metadata": source_var,
                                "download_url": download_url,
                                "language": language,
                            })
                        inc += 1
                    except IndexError:
                        break

        final_list = []
        for i in episodes:
            for t in season_subs:
                if i["episode_id"] == t["episode_id"]:
                    final_list.append({
                        "episode_number": i["episode_number"],
                        "episode_url": i["episode_url"],
                        "metadata": t["metadata"],
                        "download_url": t["download_url"],
                        "language": t["language"],
                    })
        return final_list
Example #25
    def parse(self):
        return bso(self.response.text, "html.parser")
        self.engine = bso(data, "lxml")

    def getSearchItems(self):
        main = self.engine.find("div", attrs={"id": "main"})
        searchitems = main.findChildren(recursive=False)
        refineditems = []
        for item in searchitems:
            if item.name == "div" and "class" not in item.attrs.keys():
                refineditems.append(item)
        return refineditems

    def dictifyItem(self, soupSearchTag):
        titleH3 = soupSearchTag.find("h3", attrs={"class": "zBAuLc"})
        if titleH3 is None:
            return None
        title = titleH3.find("div").string
        date = soupSearchTag.find("span", attrs={"class": "r0bn4c rQMQod"})
        if date is None:
            return None
        date = date.string

        return {"title": title, "date": date}

    def updateData(self, data):
        self.engine = bso(data, "lxml")


#google.com/search?q=covid&num=100&pws=0&start=100

if __name__ == "__main__":
    our_soup = bso(sample_content, "lxml")
    first_child = our_soup.find()
    print(first_child)
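The commented URL above (#google.com/search?q=covid&num=100&pws=0&start=100) hints at how the HTML consumed by this class appears to be obtained: q is the query, num the number of results per page, pws=0 disables personalized results, and start the result offset. A hedged sketch of fetching such a page and building the soup the same way updateData does; the User-Agent value is an assumption, and Google may block or reshape responses for automated clients.

import requests
from bs4 import BeautifulSoup as bso

def fetch_search_html(query, start=0, num=100):
    params = {"q": query, "num": num, "pws": 0, "start": start}
    headers = {"User-Agent": "Mozilla/5.0"}  # assumed header
    r = requests.get("https://www.google.com/search", params=params,
                     headers=headers, timeout=10)
    r.raise_for_status()
    return r.text

# engine = bso(fetch_search_html("covid"), "lxml")  # same parser the class uses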
Example #27
    def post(self):
        json_webhook = request.form.get('payload')
        parsed_json_webhook = json.loads(json_webhook)

        event = parsed_json_webhook['event']
        if event not in ['media.play']:
            return '', 204

        media_type = parsed_json_webhook['Metadata']['type']

        if media_type == 'episode':
            season = parsed_json_webhook['Metadata']['parentIndex']
            episode = parsed_json_webhook['Metadata']['index']
        else:
            season = episode = None

        ids = []
        for item in parsed_json_webhook['Metadata']['Guid']:
            splitted_id = item['id'].split('://')
            if len(splitted_id) == 2:
                ids.append({splitted_id[0]: splitted_id[1]})
        if not ids:
            return '', 404

        if media_type == 'episode':
            try:
                episode_imdb_id = [x['imdb'] for x in ids if 'imdb' in x][0]
                r = requests.get(
                    'https://imdb.com/title/{}'.format(episode_imdb_id),
                    headers={"User-Agent": os.environ["SZ_USER_AGENT"]})
                soup = bso(r.content, "html.parser")
                series_imdb_id = soup.find(
                    'a',
                    {'class': re.compile(r'SeriesParentLink__ParentTextLink')
                     })['href'].split('/')[2]
            except:
                return '', 404
            else:
                sonarrEpisodeId = TableEpisodes.select(TableEpisodes.sonarrEpisodeId) \
                    .join(TableShows, on=(TableEpisodes.sonarrSeriesId == TableShows.sonarrSeriesId)) \
                    .where(TableShows.imdbId == series_imdb_id,
                           TableEpisodes.season == season,
                           TableEpisodes.episode == episode) \
                    .dicts() \
                    .get()

                if sonarrEpisodeId:
                    episode_download_subtitles(
                        no=sonarrEpisodeId['sonarrEpisodeId'],
                        send_progress=True)
        else:
            try:
                movie_imdb_id = [x['imdb'] for x in ids if 'imdb' in x][0]
            except:
                return '', 404
            else:
                radarrId = TableMovies.select(TableMovies.radarrId)\
                    .where(TableMovies.imdbId == movie_imdb_id)\
                    .dicts()\
                    .get()
                if radarrId:
                    movies_download_subtitles(no=radarrId['radarrId'])

        return '', 200
Example #28
def getSoup():
    url = 'http://www.beisbolcubano.cu/'
    html = requests.get(url)
    soup = bso(html.text, 'lxml')

    return soup