Example #1
def getresults(url):
    """Search Getcomics.

    Returns names and urls of posts returned by input url.
    """
    searchlist = []
    try:
        res = url2soup(url).select("div.post-info")
        for d in res:
            if d.h1.a.has_attr('href'):
                size = None
                searchsize = re.search(r'\d+ [KMGT]B', d.p.text, re.M | re.I)
                if searchsize:
                    size = searchsize.group(0)
                result = {
                    "url": d.h1.a.get("href"),
                    "title": d.h1.a.text,
                    "size": size
                }
                searchlist.append(result)
        # print(searchlist)
    except HTTPError as e:
        print(e)
        print("something wrong happened")
    return searchlist
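The examples in this listing call a url2soup helper (sometimes as htmlsoup.url2soup) that is not reproduced here. A minimal sketch of what such a helper could look like, assuming urllib and BeautifulSoup:

# Hypothetical sketch of the url2soup helper used throughout these examples;
# the real project helper may differ (User-Agent handling, parser choice, ...).
import urllib.request
from bs4 import BeautifulSoup

USER_AGENT = "Mozilla/5.0"  # placeholder; the scripts below keep a global user_agent

def url2soup(url):
    """Fetch a page and return its HTML as a BeautifulSoup tree.

    May raise urllib.error.HTTPError, which callers such as getresults() catch.
    """
    req = urllib.request.Request(url, None, {"User-Agent": USER_AGENT})
    with urllib.request.urlopen(req) as resp:
        return BeautifulSoup(resp.read(), "html.parser")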
Example #2
def print_last_week(url, editor):
    """Print time since last weekly pack."""
    soup = htmlsoup.url2soup(url)
    last_post = soup.find_all('article', class_='type-post')[0]
    title = last_post.h1.a.text
    time = last_post.find('time')
    print(f'{editor}\t : {title} :\t{time.text}')
def downComZippy(url):
    soup = htmlsoup.url2soup(url)
    # Other beautiful soup selectors :
    # select("script[type='text/javascript']")
    # select("table[class='folderlogo'] > tr > td")[0]
    # find("div", style=re.compile("margin-left"))
    # find("script", type="text/javascript")
    # find("div", style=re.compile("width: 303px;"))
    # find("script", type="text/javascript")
    downButton = soup.find('a', id="dlbutton").find_next_sibling().text
    try:
        fullURL, fileName = zpshare.getFileUrl(url, downButton)
        print("Downloading from zippyshare into : " + fileName)
        r = requests.get(fullURL, stream=True)
        size = tools.bytes_2_human_readable(int(r.headers['Content-length']))
        print(size)
    except Exception as e:
        print(e)
        print("Can't get download link on zippyshare page")
        return

    # Download from url & trim it a little bit
    with open(fileName, 'wb') as f:
        try:
            for block in r.iter_content(1024):
                f.write(block)
        except KeyboardInterrupt:
            pass
        except IOError:
            print("Error while writing file")
    r.close()
    print('Done\n--')
    return
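downComZippy also relies on tools.bytes_2_human_readable, which is not shown in this listing. A plausible sketch of such a size formatter, purely as an assumption about what the helper does:

# Hypothetical sketch of tools.bytes_2_human_readable as used above;
# the project's real helper may format sizes differently.
def bytes_2_human_readable(num_bytes):
    """Convert a byte count into a readable string, e.g. 1536 -> '1.5 KB'."""
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if num_bytes < 1024 or unit == "TB":
            return f"{num_bytes:.1f} {unit}"
        num_bytes /= 1024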
Example #4
def find_zippy_download_button(zippy_url):
    """Find download button on zippyshare page."""
    try:
        soup = url2soup(zippy_url)
        return soup.find('a', id="dlbutton").find_next_sibling()
    except Exception:
        raise DownloadButtonError("Error on zp page : "
                                  "No download button found")
def test_getFileUrl(self):

    url = "https://www4.zippyshare.com/v/tbiaf4on/file.html"

    soup = htmlsoup.url2soup(url)
    # downButton = soup.select('script[type="text/javascript"]')
    downButton = soup.find('a', id="dlbutton").find_next_sibling().text

    name, out_url = getFileUrl(url, downButton)
    print("--------------------------------------")
    print(name)
    print(out_url)
def getresults(url):
    searchlist = list()
    try:
        soup = htmlsoup.url2soup(url)
        for d in soup.select("div.post-info"):
            if d.h1.a.has_attr('href'):
                size = None
                searchsize = re.search(r'\d+ [KMGT]B', d.p.text, re.M | re.I)
                if searchsize:
                    size = searchsize.group(0)
                searchlist.append((d.h1.a.get("href"), d.h1.a.text, size))
        # print(searchlist)
        return searchlist
    except urllib.error.HTTPError:
        print("something wrong happened")
def findLastWeekly(url):
    soup = htmlsoup.url2soup(url)
    lastPost = soup.find('article', class_='type-post')
    # Check if today's archive is there, and retrieve its url
    print("Latest weekly post: " + lastPost.time['datetime'])
    if today in lastPost.time['datetime']:
        # print ('There is a new one today. Hurrah!')
        pass
    else:
        # print ('Nothing yet. Exiting...')
        # print ('Continue anyway...')
        # quit()
        pass
    postUrl = lastPost.h1.a['href']
    return postUrl
def downCom(url):
    global user_agent
    headers = {'User-Agent': user_agent}
    try:
        req = urllib.request.Request(url, None, headers)
        finalurl = urllib.request.urlopen(req).geturl()
    except urllib.error.HTTPError:
        print("downCom can't get final url")
        raise
    print("Trying " + finalurl)
    zippylink = ''
    downButtons = []
    try:
        soup = htmlsoup.url2soup(finalurl)
        downButtons = soup.select("div.aio-pulse > a")
    except Exception as e:
        print(e)
    for button in downButtons:
        # if 'zippyshare' in str(button).lower() and 'href' in button.a.attrs:
        if 'zippyshare' in (button.get("href") or '') \
                or 'zippyshare' in (button.get('title') or '').lower():
            zippylink = button.get("href")
            print(zippylink)
            try:
                if str(zippylink).startswith(BASE):
                    finalzippy = base64.b64decode(
                        zippylink[len(BASE):]).decode()
                    print("Abracadabra !")
                else:
                    # headers = {'User-Agent': user_agent}
                    req = urllib.request.Request(zippylink, None, headers)
                    finalzippy = urllib.request.urlopen(req).geturl()
            except urllib.error.HTTPError as e:
                print("can't obtain final zippyshare page url")
                print(e)
                raise
            except IOError:
                print("Zippyshare download failed")
                continue
            try:
                print(finalzippy)
                downComZippy(finalzippy)
            except Exception as e:
                print("error in downComZippy")
                print(e)
    # except urllib.error.HTTPError:
    # print("downCom got HTTPError from returnHTML")
    # raise
    return
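The base64 branch in downCom assumes BASE is a redirector prefix whose trailing part is the base64-encoded target URL. The same decoding step, isolated, with a purely hypothetical BASE value:

import base64

# Hypothetical redirector prefix; the real BASE constant is defined elsewhere
# in the project and is not shown in this listing.
BASE = "https://redirector.example/goto/"

def decode_redirect(link):
    """Return the target URL hidden behind a BASE-prefixed redirect link."""
    if link.startswith(BASE):
        return base64.b64decode(link[len(BASE):]).decode()
    return link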
Example #9
def find_last_weekly(url):
    """Find las weekly post."""
    lastPost = url2soup(url).find('article', class_='type-post')
    # Check if today's archive is there, and retrieve its url
    # print(f"Latest weekly post: {lastPost.time['datetime']}")
    # TODO : code to automate, maybe uncomment later
    # if today in lastPost.time['datetime']:
    #     # print ('There is a new one today. Hurrah!')
    #     pass
    # else:
    #     # print ('Nothing yet. Exiting...')
    #     # print ('Continue anyway...')
    #     # quit()
    #     pass
    postTitle = lastPost.h1.a.text
    postUrl = lastPost.h1.a['href']
    return postTitle, postUrl
Example #10
def printOneEditor(url, f, editor):
    soup = htmlsoup.url2soup(url)
    var = soup.select_one('section.post-contents > ul').find_all('strong')

    f.write(editor + '\n')
    f.write("=====================" + '\n')
    for s in var:
        name = s.text.replace(' : ', '').replace('Download', '')\
                    .replace(' | ', '').replace('Read Online', '')
        a = s.find('a')
        try:
            # if a.has_attr('href'):
            if 'href' in a.attrs:
                f.write('[url=' + a.get("href") + ']' + name + '[/url]' + '\n')
        except Exception:
            f.write(name + '\n')
    f.write("=====================" + '\n')
    f.write("" + '\n')
Example #11
def print_multiple_editors(url, f):
    """Write all comics in an Indies weekly pack in file f.

    For Indie week+.
    """
    soup = htmlsoup.url2soup(url).select_one('section.post-contents')

    # List of comics publishers
    publishers = soup.find_all('span', style="color: #3366ff;")
    indies = [p.text for p in publishers]

    # List of comics
    var = soup.find_all('strong')
    for s in var:
        # Highlight publishers
        if s.text in indies:
            f.write(f'\n{s.text}\n=====================\n')
        # bloat
        elif (note in s.text or howto in s.text or consistof in s.text
              or howtodl in s.text or lower in s.text or indieweek in s.text):
            pass
        # more bloats
        elif s.text in bloat:
            pass
        # Comics
        else:
            # make a copy of strong s to remove span
            s_copy = copy.copy(s)
            for _ in s_copy:
                try:
                    s_copy.span.decompose()
                except AttributeError:
                    pass
            name = s_copy.text.replace(' : ', '').replace('| ', '')
            if s.a and s.a.has_attr('href'):
                f.write(f'[url={s.a.get("href")}]{name}[/url]\n')
            else:
                f.write(f'{name}\n')
    f.write('\n')
Example #12
def printMultipleEditors(url, f):
    soup = htmlsoup.url2soup(url).select_one('section.post-contents')

    # List of comics publishers
    publishers = soup.find_all('span', style="color: #3366ff;")
    indies = []
    for p in publishers:
        indies.append(p.text)

    # List of comics
    var = soup.find_all('strong')
    for s in var:
        # Highlight publishers
        if s.text in indies:
            f.write('\n' + s.text + '\n=====================' + '\n')
        # bloat
        elif note in s.text \
                or howto in s.text \
                or consistof in s.text \
                or howtodl in s.text \
                or lower in s.text \
                or indieweek in s.text:
            pass
        # more bloats
        elif s.text in bloat:
            pass
        # Comics
        else:
            name = s.text.replace(' : ', '').replace('Download', '')\
                        .replace(' | ', '').replace('Read Online', '')
            if s.a and s.a.has_attr('href'):
                f.write('[url=' + s.a.get("href") + ']'
                        + name + '[/url]' + '\n')
            else:
                f.write(name + '\n')
    f.write('\n')
Example #13
def print_one_editor(url, f, editor):
    """Write all comics in an editor weekly pack in file f.

    For Marvel, DC, or Image weeklies
    """
    soup = htmlsoup.url2soup(url)
    var = soup.select_one('section.post-contents > ul').find_all('strong')

    f.write(editor + '\n')
    f.write("=====================\n")
    for s in var:  # var is a list of 'strong' tags
        s_copy = copy.copy(s)
        for _ in s_copy:
            if s_copy.span:
                s_copy.span.decompose()
        name = s_copy.text.replace(' : ', '').replace('| ', '')
        a = s.find('a')
        try:
            # if a.has_attr('href'):
            if 'href' in a.attrs:
                f.write(f'[url={a.get("href")}]{name}[/url]\n')
        except Exception:
            f.write(name + '\n')
    f.write("=====================\n")
    f.write("" + '\n')
Example #14
def printLastWeek(url, editor):
    soup = htmlsoup.url2soup(url)
    lastPost = soup.find_all('article', class_='type-post')[0]
    title = lastPost.h1.a.text
    time = lastPost.find('time')
    print(editor + '\t : ' + title + ' :\t' + time.text)
Example #15
def find_buttons(url):
    """Find download buttons in html soup, return list of buttons."""
    return url2soup(url).select("div.aio-pulse > a")
def comicsList(url):
    weeklyUrl = findLastWeekly(url)
    soup = htmlsoup.url2soup(weeklyUrl)
    liste_a = soup.select_one("section.post-contents")\
        .find_all('a', style="color: #ff0000;")
    return htmlsoup.getHrefwithName(liste_a, 'Download')
def findLastWeekly2(url):
    soup = htmlsoup.url2soup(url)
    lastPost = soup.find_all('article', class_='type-post')[0]
    postTitle = lastPost.h1.a.text
    postUrl = lastPost.h1.a['href']
    return postTitle, postUrl
Example #18
def comics_list(url):
    """Get comics in a weekly pack."""
    weeklyUrl = find_last_weekly(url)[1]
    content = url2soup(weeklyUrl).select_one("section.post-contents")
    liste_a = content.find_all('a', style="color: #ff0000;")
    return get_href_with_name(liste_a, 'Download')
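comics_list (and comicsList, via htmlsoup.getHrefwithName) delegates to a get_href_with_name helper that is not reproduced in this listing. A plausible sketch, assuming it filters anchors by their text and returns their targets:

# Hypothetical sketch of get_href_with_name as used by comics_list; the real
# helper lives in the project's htmlsoup module and may return richer data.
def get_href_with_name(anchors, name):
    """Return the href of every anchor whose text contains `name`."""
    return [a.get("href") for a in anchors
            if name in a.text and a.has_attr("href")]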
Example #19
def _find_dl_buttons(url):
    """Find download buttons in a getcomics pages."""
    return url2soup(url).select("div.aio-pulse > a")