Exemple #1
0
class Authors:

    """
    Class responsible for inserting the authors of a manga in the database.

    ``manga_id`` is the value stored next to every author name and
    ``authors`` is the collection of names processed by ``save``.
    """

    def __init__(self, manga_id, authors=None):
        """
        Keep the manga id and the author names that will be saved.

        :param manga_id: value written to the ``manga_id`` column
        :param authors: iterable of author names; ``None`` means "no authors"
        """
        self.log = Logging("weeb_crawler")

        self.manga_id = manga_id
        # A fresh list per instance, never a shared mutable default.
        self.authors = [] if authors is None else authors

    def save(self):
        """
        Insert every author that is not yet stored for this manga.

        For each name a SELECT looks for an existing (name, manga_id) row;
        the INSERT only runs when that lookup comes back empty. The number
        of inserted rows is written to the log at the end.
        """
        new_authors = 0
        database = Database()
        check_query = """SELECT id FROM authors WHERE name=%s AND manga_id=%s"""
        insert_query = """INSERT INTO authors VALUES (NULL, %s, %s)"""
        for author in self.authors:
            result = database.execute(check_query, [author, self.manga_id])
            # Truthiness instead of ``result is ()``: identity comparison with
            # a tuple literal depends on the interpreter reusing one empty
            # tuple object and raises a SyntaxWarning on Python 3.8+.
            if not result:
                database.execute(insert_query, [author, self.manga_id])
                new_authors += 1

        self.log.info("Found %s new author(s)" % new_authors)
Exemple #2
0
class Api:
    """
    Small Flask application exposing crawler information over HTTP.

    Two GET routes are registered: ``/getLog`` (tail of the log file) and
    ``/getStats`` (row counts of the manga, chapter and page tables).
    """

    def __init__(self):
        """
        Read the ``api`` configuration section, build the Flask app and
        register its routes.
        """
        self.cfg = Config()
        cfg = self.cfg.get("api")

        self.log = Logging("Api")
        self.debug = int(cfg["logging"])
        self.host = cfg["host"]
        self.port = int(cfg["port"])
        self.app = self.create_app()
        self.add_routes()

    def create_app(self):
        """
        Create and configure the Flask application.

        :return: the Flask app with CORS enabled and logging switched
                 according to ``self.debug``
        """
        app = Flask(__name__)
        app.debug = self.debug

        # ``Logger.disabled`` is the attribute the logging module honours;
        # the previous ``disable`` spelling only created an unused attribute,
        # so the Flask logger was never silenced.
        app.logger.disabled = not self.debug
        logging.getLogger("werkzeug").disabled = not self.debug

        # CORS registers itself on the app; the returned object is not needed.
        CORS(app)
        app.config['CORS_HEADERS'] = 'Content-Type'

        return app

    def add_routes(self):
        """
        Register the ``/getLog`` and ``/getStats`` routes on ``self.app``.
        """
        @self.app.route("/getLog", methods=["GET"])
        @cross_origin()
        def get_log():
            """
            Return the last ``return_amount`` lines of the log file as JSON.
            """
            log_name = self.log.get_log_name()
            cfg = self.cfg.get("getLog")
            return_amount = int(cfg["return_amount"])
            with open(log_name, "r") as log_file:
                lines = log_file.readlines()

            # ``lines[-0:]`` is the whole list, so a configured amount of 0
            # used to return the complete file; non-positive amounts now
            # yield no lines at all.
            content = lines[-return_amount:] if return_amount > 0 else []
            return json.dumps({
                "name": log_name,
                "amount": return_amount,
                "content": content,
            })

        @self.app.route("/getStats", methods=["GET"])
        @cross_origin()
        def get_stats():
            """
            Return the first row of the counting query as JSON.
            """
            database = Database()
            result = database.execute("""SELECT
                                        (SELECT COUNT(id) FROM manga) AS manga_amount,
                                        (SELECT COUNT(id) FROM chapter) AS chapter_amount,
                                        (SELECT COUNT(id) FROM page) AS page_amount"""
                                      )
            return json.dumps(result[0])

    def run(self):
        """
        Log the listening address and start the Flask server (no reloader).
        """
        self.log.info("Api started in %s:%s" % (self.host, self.port))
        self.app.run(use_reloader=False, host=self.host, port=self.port)
Exemple #3
0
class Chapters:
    """
    Get the chapters of the manga and store them in the database.
    """
    def __init__(self, manga_id, urls=None):
        """
        Keep the manga id and the chapter URLs that will be saved.

        :param manga_id: value written to the ``manga_id`` column
        :param urls: iterable of chapter URLs; ``None`` means "no chapters"
        """
        self.log = Logging("weeb_crawler")

        self.urls = [] if urls is None else urls

        # State of the chapter currently being processed by ``save``.
        self.all_pages = False
        self.manga_id = manga_id

    def save(self):
        """
        Save the chapters in the database and fetch their pages.

        The chapter number is the last ``/``-separated segment of each URL.
        Unknown chapters are inserted; for known chapters the stored
        ``all_pages`` flag is read. Whenever the flag is not set, the pages
        are saved through ``Pages`` and the chapter is then marked with
        ``all_pages=1``. The number of inserted chapters is logged.
        """
        new_chapters = 0
        database = Database()
        check_query = """SELECT id, all_pages FROM chapter WHERE manga_id=%s AND number=%s"""
        insert_query = """INSERT INTO chapter VALUES (NULL, %s, %s, 0, %s)"""
        update_query = """UPDATE chapter SET all_pages=1 WHERE id=%s"""
        for url in self.urls:
            chapter_number = url.split("/")[-1]
            result = database.execute(check_query,
                                      [self.manga_id, chapter_number])

            # Truthiness instead of ``result is ()``, which compares identity
            # with a literal and warns on Python 3.8+.
            if not result:
                database.execute(insert_query,
                                 [chapter_number, url, self.manga_id])
                chapter_id = database.last_inserted_id()
                new_chapters += 1
                # Reset the flag for the new chapter. Without this, a
                # previously seen complete chapter left ``all_pages`` True
                # and the pages of every later new chapter were skipped.
                self.all_pages = False
            else:
                chapter_id = result[0][0]
                self.all_pages = result[0][1] == 1

            if not self.all_pages:
                chapter_pages = Pages(self.manga_id, chapter_id, url)
                chapter_pages.save()
                database.execute(update_query, [chapter_id])

        self.log.info("Found %s new chapter(s)" % new_chapters)
Exemple #4
0
class Titles:

    """
    Class responsible for inserting the manga alternative
    titles in the database
    """

    def __init__(self, manga_id, titles=None):
        """
        Keep the manga id and the alternative titles that will be saved.

        :param manga_id: value written to the ``manga_id`` column
        :param titles: iterable of title strings; ``None`` means "no titles"
        """
        self.log = Logging("weeb_crawler")

        self.manga_id = manga_id
        self.titles = [] if titles is None else titles

    @staticmethod
    def _strip_leading_space(title):
        """
        Return ``title`` without its first character when that is a space.

        Only a single leading space is removed; empty strings are returned
        unchanged instead of raising ``IndexError``.
        """
        return title[1:] if title.startswith(" ") else title

    def save(self):
        """
        Insert the manga alternative titles
        in the database

        Each title is normalised first, then looked up, then inserted when
        no (name, manga_id) row exists. Empty titles and the ``"-"``
        placeholder are ignored. The number of inserted rows is logged.
        """
        new_titles = 0
        database = Database()
        check_query = """SELECT id FROM titles WHERE name=%s AND manga_id=%s"""
        insert_query = """INSERT INTO titles VALUES (NULL, %s, %s)"""
        for raw_title in self.titles:
            # Normalise BEFORE the lookup: the stored value is the stripped
            # one, so checking with the raw title never found the existing
            # row and duplicated it on every run.
            title = self._strip_leading_space(raw_title)
            if not title or title == "-":
                continue

            result = database.execute(check_query, [title, self.manga_id])
            # Truthiness instead of ``result is ()`` (identity comparison
            # with a literal, SyntaxWarning on Python 3.8+).
            if not result:
                database.execute(insert_query, [title, self.manga_id])
                new_titles += 1

        self.log.info("Found %s new alternative title(s)" % new_titles)
Exemple #5
0
class Manga:
    """
    Manga class, one of the main classes of the program.
    Responsible for capturing and saving the main piece
    of the system
    """

    # File that receives a dump of the last requested manga page.
    PAGE_DUMP_PATH = "last-manga-content.html"

    def __init__(self, url):
        """
        Crawl the manga found at ``url`` and store it.

        The page is always requested and the title and MU id are always
        resolved. When no manga row exists for ``url`` the full record
        (description, header info, titles, authors, artists, genders) is
        saved first; in both cases covers and chapters are processed.

        :param url: address of the manga page, also used as lookup key
        """
        self.log = Logging("weeb_crawler")

        conf = Config().get("muID")
        self.title_diff_ratio = float(conf['diff_ratio'])

        self.url = url
        self.id = None

        query = """SELECT id FROM manga WHERE page_url=%s"""
        database = Database()
        result = database.execute(query, [url])

        self.page = self.get_page()
        self.title = self.get_title()
        self.muID = self.get_mu_id()
        # Truthiness instead of ``result is ()``: identity comparison with a
        # tuple literal is implementation dependent and raises a
        # SyntaxWarning on Python 3.8+.
        if not result:
            self.description = self.get_description()
            self.alternative_titles = None
            self.gender_tags = None
            self.authors = None
            self.artists = None
            self.status = None
            self.get_header_info()

            self.save()
            self.save_titles()
            self.save_authors()
            self.save_artists()
            self.save_gender()
            self.get_covers()
            self.get_chapters()
            self.log.info("Added new Manga: %s" % self.title)

        else:
            self.id = result[0][0]
            self.get_covers()
            self.get_chapters()
            self.log.info("Updated Manga: %s" % self.title)

    def get_page(self):
        """
        Requests the manga page and returns a soup

        A copy of the page is written to ``PAGE_DUMP_PATH``.
        """
        req = Request(self.url)
        soup = req.soup()
        # The dump file used to be opened in ``__init__`` and never closed;
        # the context manager guarantees the handle is flushed and released.
        # NOTE(review): on Python 3 ``str(bytes)`` writes the ``b'...'``
        # repr; kept unchanged because the dump format may be relied upon.
        with open(self.PAGE_DUMP_PATH, "w") as dump_file:
            dump_file.write(str(soup.encode("utf-8")))
        return soup

    def get_title(self):
        """
        Get the manga title, return the text of the first h2
        """
        return self.page.findAll("h2")[0].text

    def get_description(self):
        """
        Get the manga description, search for a div with
        a specific class and returns the inner text
        """
        desc_container = self.page.find("div",
                                        {"class": "panel panel-default"})
        desc_body = desc_container.find("div", {"class": "panel-body"})
        return desc_body.text

    def get_header_info(self):
        """
        Get the manga info present on the top of the page

        The ``h4`` header elements are read by position: 0 alternative
        titles, 1 gender tags, 2 authors, 3 artists, 4 status.
        """
        header_content = self.page.findAll(
            "h4", {"class": "media-heading manga-perfil"})

        # The first child of each header is skipped; the second child of the
        # titles header holds the comma separated alternative titles.
        self.alternative_titles = re.split(', ',
                                           header_content[0].contents[1:][0])

        gender_tags_container = header_content[1].findAll("a", {"href": True})
        self.gender_tags = [tag.text for tag in gender_tags_container]

        self.authors = header_content[2].contents[1:]
        self.artists = header_content[3].contents[1:]

        status_tag = header_content[4].find("span")
        self.status = status_tag.text

    def get_mu_id(self):
        """
        Uses MCD api to search for the
        id to be used on MCD and Manga Updates

        Returns the id of the last search result whose title similarity
        exceeds ``self.title_diff_ratio``, or ``None`` when none does.
        """
        mu_id = None
        req = Request('https://mcd.iosphe.re/api/v1/search/')
        results = req.get_json({"Title": self.title})
        for result in results['Results']:
            # result[0] is used as the id and result[1] as the title.
            similarity = SequenceMatcher(None, self.title, result[1]).ratio()
            if similarity > self.title_diff_ratio:
                mu_id = result[0]

        return mu_id

    def get_chapters(self):
        """
        Get all the chapter from the manga main page,
        then save each one
        """
        chapters = []
        chapter_containers = self.page.findAll(
            "div", {"class": "row lancamento-linha"})
        for chapter_container in chapter_containers:
            # The chapter link is the first anchor of the first column.
            link_column = chapter_container.findAll(
                "div", {"class": "col-xs-6 col-md-6"})[0]
            chapters.append(
                link_column.findAll("a", {"href": True})[0]['href'])

        chapter = Chapters(self.id, chapters)
        chapter.save()

    def get_covers(self):
        """
        Call the covers class and its methods

        Nothing happens when no MU id was found for the manga.
        """
        if self.muID is not None:
            covers = Covers(self.id, self.muID)
            covers.get()
            covers.save()

    def save(self):
        """
        Save the manga in the database

        Sets ``self.id`` to the id of the inserted row.
        """
        database = Database()
        query = """INSERT INTO manga VALUES (NULL, %s, %s, %s, %s, %s, %s, %s, %s)"""
        database.execute(query, [
            self.muID, self.url, None, self.title, self.description,
            self.status, 0, None
        ])
        self.id = database.last_inserted_id()

    def save_authors(self):
        """
        Save the manga authors
        """
        authors = Authors(self.id, self.authors)
        authors.save()

    def save_artists(self):
        """
        Save the manga artists
        """
        artists = Artists(self.id, self.artists)
        artists.save()

    def save_titles(self):
        """
        Save the manga alternative titles
        """
        titles = Titles(self.id, self.alternative_titles)
        titles.save()

    def save_gender(self):
        """
        Save the manga genders
        """
        genders = Genders(self.id, self.gender_tags)
        genders.save()