    def parse(self, resp):
        hxs = Selector(resp)

        # handle pagination recursively
        base_url = get_base_url(resp)
        for pagination in hxs.css("ul.pager a"):
            txt, url = extract_link(pagination)
            if txt.endswith("Next"):
                yield Request(urljoin_rfc(base_url, url), self.parse)

        mangas = hxs.xpath("//table[@class='listing']/tr/td[1]/a")

        for manga in mangas:
            item = MangaItem()
            item['name'], item['link'] = extract_link(manga)
            yield item
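Every spider snippet in this listing leans on an extract_link helper that is not shown. A minimal sketch of what it plausibly does, assuming it takes a Scrapy Selector wrapping an <a> element and returns the link text and href (the IndexError it can raise on a malformed row is exactly what several of the loops below catch):

def extract_link(sel):
    # Hypothetical helper, inferred from the call sites above and
    # below; the original implementation is not part of this listing.
    text = sel.xpath('string(.)').extract()[0].strip()
    url = sel.xpath('@href').extract()[0]
    return text, url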
    def parse(self, resp):
        hxs = Selector(resp)

        for manga in hxs.css("a.tooltip_manga"):
            item = MangaItem()
            item['name'], item['link'] = extract_link(manga)
            yield item
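MangaItem and MangaChapterItem are likewise not defined in the listing. Judging purely by the fields the spiders fill in, they would be plain Scrapy items along these lines (an assumption, not the original declarations):

import scrapy

class MangaItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()

class MangaChapterItem(MangaItem):
    date = scrapy.Field()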
Example #3
import random
import time

import requests
from bs4 import BeautifulSoup

# config, LOG, get_name, get_lyrics, create_folder and extract_link
# come from the surrounding project.
def crawler():
    counter = 1
    for url_ref in config.FULL_URLS:
        resp = requests.get(url_ref)
        if resp.status_code == 200:
            _, name = get_name(url_ref)
            # Ensure folder exists
            folder_path = create_folder([config.LYRICS_FOLDER, name])
            # Get all links
            parsed_html = BeautifulSoup(resp.content, features='html.parser')
            lyrics_links = parsed_html.select('.listalbum-item a')
            LOG.info(f"Number of {name.upper()} songs: {len(lyrics_links)}")

            lyric_paths = [extract_link(link) for link in lyrics_links]

            for lyric_path in lyric_paths:

                try:
                    writer, song_name = get_name(lyric_path)
                    if name != writer:
                        alt_folder = create_folder(
                            [config.LYRICS_FOLDER, writer])
                        lyrics_file = alt_folder.joinpath(song_name + '.txt')
                        file_found = lyrics_file.is_file()
                    else:
                        lyrics_file = folder_path.joinpath(song_name + '.txt')
                        file_found = lyrics_file.is_file()

                    if not file_found:
                        # url = config.BASE_URL + lyric_path
                        text = get_lyrics(lyric_path).strip()
                        LOG.info(f"Downloading ({counter:03d}) [{writer}]: {song_name}")
                        counter += 1

                        with open(lyrics_file, "w") as f:
                            f.write(text)
                        time.sleep(config.CRAWLER_WAIT +
                                   config.CRAWLER_WAIT * random.random())

                except IndexError:
                    LOG.error(
                        f"Access denied while scraping: {lyric_path}\n"
                        "Try increasing the waiting time.\n"
                        "Stopping the scraping for now. Open the page in your "
                        "browser to unblock access."
                    )
                    return
                except Exception as err:
                    print(f"ERROR: {lyric_path}: {err}")

        else:
            LOG.warning(f"Unable to load: {url_ref}")
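create_folder is another project helper the crawler assumes; since its return value is used with joinpath and is_file, it presumably builds a pathlib.Path and ensures the directory exists. A sketch under that assumption (note that extract_link here takes a BeautifulSoup tag and returns a single path, unlike the Scrapy helper sketched earlier):

from pathlib import Path

def create_folder(parts):
    # Hypothetical: join the segments into a Path, create the
    # directory if needed, and return it for later joinpath calls.
    path = Path(*(str(p) for p in parts))
    path.mkdir(parents=True, exist_ok=True)
    return path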
    def parse(self, resp):
        hxs = Selector(resp)

        for row in hxs.xpath("//table[@id='listing']//tr"):
            item = MangaChapterItem()
            cells = row.xpath("td")
            if not cells:
                continue

            item["name"], item["link"] = extract_link(cells[0].xpath("a"))
            item["date"] = self.parsedate(cells[-1].xpath("text()").extract()[0])
            yield item
    def parse(self, resp):
        hxs = Selector(resp)
        for row in hxs.css("ul.chapterlistfull > li"):
            item = MangaChapterItem()

            try:
                item["name"], item["link"] = extract_link(row.xpath("a")[0])

                dt = row.css("span.date::text")
                item["date"] = self.parsedate(dt.extract()[0])
            except IndexError:
                continue
            yield item
    def parse(self, resp):
        hxs = Selector(resp)
        for row in hxs.css("div.detail_list > ul > li"):
            item = MangaChapterItem()
            cells = row.xpath("span")
            if not cells:
                continue

            try:
                item['name'], item['link'] = extract_link(cells[0].xpath("a"))
                item['date'] = self.parsedate(
                    cells[-1].xpath('text()').extract()[0])
                yield item
            except IndexError:
                pass
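All the chapter spiders defer to a self.parsedate method that is not included in the listing. The ValueError one of the later snippets catches hints at strptime; a hypothetical version with a made-up date format:

from datetime import datetime

def parsedate(self, text):
    # Hypothetical: the real date format is site-specific; strptime
    # raises the ValueError that the listing-table spider catches.
    return datetime.strptime(text.strip(), '%m/%d/%Y').date()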
Example #7
def run(D):
    # L, d, ut and check are assumed to come from the surrounding module.
    for fn in L:
        D[fn] = True
        active = True  # reset the fence state for each file

        with open(d + fn, 'r') as fh:
            data = fh.read()
            sL = data.strip().split('\n')
            for line in sL:

                # toggle on ``` so links inside fenced code blocks are skipped
                if line.strip() == '```':
                    active = not active
                if not active:
                    continue

                link = ut.extract_link(line)
                if link:
                    check(fn, link, D)
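In this snippet ut.extract_link operates on raw markdown lines rather than on selectors, so it is a different helper from the Scrapy one sketched earlier. A minimal regex-based stand-in (assumed, not the original):

import re

_URL_RE = re.compile(r'https?://\S+')

def extract_link(line):
    # Hypothetical: return the first http(s) URL on the line, or
    # None so the caller's truthiness check skips the line.
    m = _URL_RE.search(line)
    return m.group(0) if m else None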
    def parse(self, resp):
        hxs = Selector(resp)

        rows = hxs.xpath("//table[@class='listing']//tr")
        for row in rows:
            item = MangaChapterItem()

            cells = row.xpath("td")
            if not cells:
                continue

            try:
                item['name'], item['link'] = extract_link(cells.xpath("a")[0])

                dt = cells.xpath("text()")[-1]
                item["date"] = self.parsedate(dt.extract())
            except IndexError:
                continue
            except ValueError:
                continue

            yield item
    def parse(self, resp):
        hxs = Selector(resp)
        for manga in hxs.css("ul.series_alpha > li > a"):
            item = MangaItem()
            item["name"], item["link"] = extract_link(manga)
            yield item