def build_to_sys(site, source):
    """Scrape a batoto series referenced by *source* and mirror it locally.

    Fetches the series page, creates the ``Manga`` row if it does not exist
    yet (downloading the cover image into ``rak/manga/<manga_id>/``), then for
    every chapter not yet stored creates a ``Chapter`` row and downloads all
    of its page images into ``rak/manga/<manga_id>/<chapter_id>/``.

    :param site: scraper object exposing ``get_html``, ``series_info``,
        ``chapter_info`` and the ``netlocs`` list.
    :param source: dict describing the series; reads the keys ``origin``,
        ``name``, ``thumb`` and ``time``.
    """
    try:
        lt = LocalDateTime.now()
        # scrape series info from the origin page
        url = source.get('origin')
        respcontent = site.get_html(url)
        series_info = site.series_info(respcontent)

        # series == manga
        qry = Manga.query
        manga = qry.filter(Manga.slug == utils.slugist(
            "-".join([site.netlocs[4], source.get('name', None)])
        )).first()
        if manga is None:
            with transaction.manager:
                manga = Manga(
                    site.netlocs[4],
                    # default must be a string, not a list
                    series_info.get('name', ''),
                    0,
                    ", ".join(series_info.get('tags', [])),
                    ", ".join(series_info.get('authors', [])),
                    ", ".join(series_info.get('artists', [])),
                    ', '.join(series_info.get('aka', [])),
                    # default must be a list: ",".join(None) raises TypeError
                    ",".join(series_info.get('description', [])),
                    1 if 'ongoing' in series_info.get('status', '').lower()
                    else 2 if 'completed' in series_info.get('status', '').lower()
                    else 0
                )
                manga.origin = source.get('origin', '')
                manga.chapter_updated = lt.from_time_stamp(source.get('time', 'now'))
                # cover file keeps the remote extension, e.g. cover.jpg
                ext = series_info.get('thumb_url', '').lower().rsplit('.', 1)[-1]
                manga.thumb = '.'.join(['cover', ext])
                manga.category = 'ja'
                DBSession.add(manga)
                DBSession.flush()
            # re-query: the transaction manager detached the instance
            manga = qry.filter(Manga.slug == utils.slugist(
                "-".join([site.netlocs[4], source.get('name', None)])
            )).first()

        manga_id, manga_thumb, manga_slug = manga.id, manga.thumb, manga.slug

        ini_path = path.join(
            path.dirname(path.dirname(__file__)),
            '/'.join(['rak', 'manga', manga_id])
        )
        # download the cover image
        r = requests.get(source.get('thumb'))
        path_img = '/'.join([ini_path, manga_thumb])
        print(path_img)
        if not path.exists(ini_path):
            makedirs(ini_path)
        with open(path_img, "wb") as code:
            code.write(r.content)

        chapters_info = series_info.get('chapters', [])
        for i, ch in enumerate(chapters_info):
            print(ch.get('name', ''))
            # batoto slug: drop the "Vol.x Ch.y:" prefix before the colon
            slug_bt = ch.get('name', '')
            if ':' in slug_bt:
                slug_bt = slug_bt.split(':')
                slug_bt.pop(0)
                slug_bt = '-'.join(slug_bt)
            slug_chapter = ' '.join([manga_slug, slug_bt])

            # skip chapters that were already downloaded
            chapter = Chapter.query.filter(
                Chapter.slug == utils.slugist(slug_chapter)).first()
            if chapter is None:
                v = utils.parse_number(ch.get('name', ''), "Vol")
                v = 0 if v is None else v
                c = utils.parse_number(ch.get('name', ''), "Ch")
                c = 0 if c is None else c
                with transaction.manager:
                    chapter = Chapter(slug_bt, c, v)
                    time = lt.human_to_date(ch.get('time', 'now'))
                    ch_manga = Manga.query.get(manga_id)
                    ch_manga.chapter_count += 1
                    chapter.lang = ISOLang.query.filter(ISOLang.iso == 'en').first()
                    chapter.updated = time
                    chapter.manga = ch_manga
                    # s = 1000v + c
                    # chapter.sortorder = (1000*float(v)) + float(c)
                    chapter.sortorder = float(c)
                    chapter.slug = slug_chapter
                    DBSession.add(chapter)
                    DBSession.flush()
                chapter = Chapter.query.filter(
                    Chapter.slug == utils.slugist(slug_chapter)).first()

                # batoto: fetch the chapter page and download its images
                html = site.get_html(ch.get('url'))
                chapter_info = site.chapter_info(html)
                # bound before the try so the cleanup handler can always use it
                ini_chapter = '/'.join([ini_path, chapter.id])
                try:
                    session = FuturesSession(
                        executor=ThreadPoolExecutor(max_workers=10))
                    for n, page in enumerate(chapter_info.get('pages', [])):
                        print(page)
                        r = session.get(page).result()
                        if r.status_code != 200:
                            raise HtmlError('cannot fetch')
                        # pages are stored as 000.ext, 001.ext, ...
                        ext = page.split('/')[-1].rsplit('.', 1)[-1]
                        path_img = '/'.join(
                            [ini_chapter,
                             "{num:03d}.{ext}".format(num=n, ext=ext)])
                        print(path_img)
                        if not path.exists(ini_chapter):
                            makedirs(ini_chapter)
                        with open(path_img, "wb") as code:
                            code.write(r.content)
                # BUG FIX: HtmlError was raised above but never caught, so a
                # failed fetch escaped without removing the half-downloaded
                # chapter; catch it together with ConnectionError.
                except (ConnectionError, HtmlError) as Conn:
                    print(Conn)
                    # roll back: delete the chapter row and its directory
                    chapter = Chapter.query.get(chapter.id)
                    DBSession.delete(chapter)
                    # the directory may not exist yet if no page was written
                    shutil.rmtree(ini_chapter, ignore_errors=True)
    except AttributeError as e:
        # e.message is deprecated (gone in Py3); print the exception itself
        print(e)
    except KeyError as e:
        print(e)
    except ValueError as e:
        print(e)
def build_to_sys(self, site, source):
    """Scrape a mangaeden series referenced by *source* and mirror it locally.

    Fetches the series data from ``source['last_url']``, creates the ``Manga``
    row if missing (downloading its cover into ``rak/manga/<manga_id>/``),
    then creates a ``Chapter`` row for every chapter not yet stored and
    downloads its page images into ``rak/manga/<manga_id>/<chapter_id>/``.

    :param site: scraper object exposing ``get_html``, ``series_info``,
        ``chapter_info`` and the ``netlocs`` list.
    :param source: dict describing the series; reads ``last_url``, ``origin``
        and ``time``.
    """
    try:
        url = source.get('last_url')
        resp_content = site.get_html(url)
        series_info = site.series_info(resp_content)

        # series == manga
        qry = Manga.query
        manga = qry.filter(Manga.slug == utils.slugist("-".join(
            [site.netlocs[4], series_info.get('name', None)]))).first()
        if manga is None:
            with transaction.manager:
                manga = Manga(
                    site.netlocs[4],
                    utils.HTMLUnscaped(series_info.get('name', u'')),
                    0,
                    ", ".join(series_info.get('tags', [])),
                    series_info.get('authors', u''),
                    series_info.get('artists', u''),
                    utils.HTMLUnscaped(series_info.get('aka', u'')),
                    utils.HTMLUnscaped(series_info.get('description', u'')),
                    1 if 'ongoing' in series_info.get('status', '').lower()
                    else 2 if 'completed' in series_info.get('status', '').lower()
                    else 0)
                manga.origin = source.get('origin', '')
                # BUG FIX: the old default 'now' (a str) made fromtimestamp
                # raise TypeError whenever 'time' was missing.
                ts = source.get('time')
                manga.chapter_updated = (datetime.fromtimestamp(ts)
                                         if ts is not None else datetime.now())
                # cover file keeps the remote extension, e.g. cover.jpg
                ext = series_info.get('thumb_url', '').lower().rsplit('.', 1)[-1]
                manga.thumb = '.'.join(['cover', ext])
                manga.category = 'ja'
                DBSession.add(manga)
                DBSession.flush()
            # re-query: the transaction manager detached the instance
            manga = qry.filter(Manga.slug == utils.slugist("-".join(
                [site.netlocs[4], series_info.get('name', None)]))).first()

        manga_id, manga_thumb, manga_slug = manga.id, manga.thumb, manga.slug

        ini_path = path.join(path.dirname(path.dirname(__file__)),
                             '/'.join(['rak', 'manga', manga_id]))
        # download the cover image
        r = requests.get(series_info.get('thumb_url', ''))
        path_img = '/'.join([ini_path, manga_thumb])
        print(path_img)
        if not path.exists(ini_path):
            makedirs(ini_path)
        with open(path_img, "wb") as code:
            code.write(r.content)

        chapters_info = series_info.get('chapters', [])
        for i, ch in enumerate(chapters_info):
            print(ch.get('name', ''))
            # fall back to the numeric order when the chapter has no name
            ch_name = str(ch.get('order', 0)) \
                if ch.get('name', '') is None \
                else utils.HTMLUnscaped(ch.get('name', u''))
            # edenapi slug
            slug_bt = ch_name
            slug_chapter = ' '.join([manga_slug, slug_bt])

            # skip chapters that were already downloaded
            chapter = Chapter.query.filter(
                Chapter.slug == utils.slugist(slug_chapter)).first()
            if chapter is None:
                v = utils.parse_number(ch_name, "Vol")
                v = 0 if v is None else v
                c = ch.get('order', 0)
                with transaction.manager:
                    chapter = Chapter(slug_bt, c, v)
                    # BUG FIX: fromtimestamp(datetime.now()) raised TypeError
                    # whenever 'time' was missing; use now() directly instead.
                    ts_ch = ch.get('time')
                    time = (datetime.fromtimestamp(ts_ch)
                            if ts_ch is not None else datetime.now())
                    ch_manga = Manga.query.get(manga_id)
                    ch_manga.chapter_count += 1
                    chapter.lang = ISOLang.query.filter(
                        ISOLang.iso == 'en').first()
                    chapter.updated = time
                    chapter.manga = ch_manga
                    # s = 1000v + c
                    # chapter.sortorder = (1000*float(v)) + float(c)
                    chapter.sortorder = float(c)
                    chapter.slug = slug_chapter
                    DBSession.add(chapter)
                    DBSession.flush()
                chapter = Chapter.query.filter(
                    Chapter.slug == utils.slugist(slug_chapter)).first()

                # eden API answers JSON
                headers = {
                    'content-type': 'application/json; charset=utf8'
                }
                html = site.get_html(ch.get('url'), headers=headers)
                chapter_info = site.chapter_info(html)
                # bound before the try so the cleanup handler can always use it
                ini_chapter = '/'.join([ini_path, chapter.id])
                try:
                    session = FuturesSession(executor=ThreadPoolExecutor(
                        max_workers=10))
                    for n, page in enumerate(chapter_info.get('pages', [])):
                        print(page)
                        r = session.get(page).result()
                        if r.status_code != 200:
                            # best-effort: skip unfetchable pages
                            continue
                        # pages are stored as 000.ext, 001.ext, ...
                        ext = page.split('/')[-1].rsplit('.', 1)[-1]
                        path_img = '/'.join([
                            ini_chapter,
                            "{num:03d}.{ext}".format(num=n, ext=ext)
                        ])
                        print(path_img)
                        if not path.exists(ini_chapter):
                            makedirs(ini_chapter)
                        with open(path_img, "wb") as code:
                            code.write(r.content)
                except ConnectionError as Conn:
                    print(Conn)
                    # roll back: delete the chapter row and its directory
                    chapter = Chapter.query.get(chapter.id)
                    DBSession.delete(chapter)
                    # the directory may not exist yet if no page was written
                    shutil.rmtree(ini_chapter, ignore_errors=True)
    except AttributeError as e:
        # e.message is deprecated (gone in Py3); print the exception itself
        print(e)
    except KeyError as e:
        print(e)
    except ValueError as e:
        print(e)
def script_to_sys(self, source_url, source_origin, time_str):
    """Scrape one mangaeden API series and mirror it into the local database.

    Intended for the mangaeden API URL (manga id is the first 24 chars), e.g.::

        python scripts/mangascrapper.py \
            -s https://www.mangaeden.com/api/manga/4e70ea1dc092255ef7004d5c/ \
            -o http://www.mangaeden.com/en/en-manga/fairy-tail/ \
            -t "Aug 31, 2015"

    Known ids:
      fairy tail https://www.mangaeden.com/api/manga/4e70ea1dc092255ef7004d5c/
      naruto     http://www.mangaeden.com/api/manga/4e70ea03c092255ef70046f0/
      one piece  http://www.mangaeden.com/api/manga/4e70ea10c092255ef7004aa2/
      bleach     http://www.mangaeden.com/api/manga/4e70e9efc092255ef7004274/
      nanatsu    http://www.mangaeden.com/api/manga/5099a865c092254a2000daf4/

    :param self: Manga API scraper instance.
    :param source_url: API url of the manga to scrape.
    :param source_origin: public url of the manga (stored as origin).
    :param time_str: last chapter release time, either a human-readable
        string (e.g. "Aug 30, 2015") or a numeric timestamp.
    :return: None
    """
    try:
        # https://www.mangaeden.com/api/manga/:id[:24]/
        resp_content = self.get_html(source_url)
        series_info = self.series_info(resp_content)
        # accept either a human string or an already-numeric timestamp
        time_long = self._parse_update_date(time_str) if isinstance(
            time_str, basestring) else long(time_str)

        # series == manga
        qry = Manga.query
        manga = qry.filter(Manga.slug == utils.slugist("-".join(
            [self.netlocs[4], series_info.get('name', None)]))).first()
        if manga is None:
            with transaction.manager:
                manga = Manga(
                    self.netlocs[4],
                    utils.HTMLUnscaped(series_info.get('name', u'')),
                    0,
                    ", ".join(series_info.get('tags', [])),
                    series_info.get('authors', u''),
                    series_info.get('artists', u''),
                    utils.HTMLUnscaped(series_info.get('aka', u'')),
                    utils.HTMLUnscaped(series_info.get('description', u'')),
                    1 if 'ongoing' in series_info.get('status', '').lower()
                    else 2 if 'completed' in series_info.get('status', '').lower()
                    else 0)
                manga.origin = source_origin
                manga.chapter_updated = datetime.fromtimestamp(time_long)
                # cover file keeps the remote extension, e.g. cover.jpg
                ext = series_info.get('thumb_url', '').lower().split('.')[-1]
                manga.thumb = '.'.join(['cover', ext])
                manga.category = 'ja'
                DBSession.add(manga)
                DBSession.flush()
            # re-query: the transaction manager detached the instance
            manga = qry.filter(Manga.slug == utils.slugist("-".join(
                [self.netlocs[4], series_info.get('name', None)]))).first()

        manga_id, manga_thumb, manga_slug = manga.id, manga.thumb, manga.slug

        ini_path = path.join(path.dirname(path.dirname(__file__)),
                             '/'.join(['rak', 'manga', manga_id]))
        # download the cover image
        r = requests.get(series_info.get('thumb_url', ''))
        path_img = '/'.join([ini_path, manga_thumb])
        print(path_img)
        if not path.exists(ini_path):
            makedirs(ini_path)
        with open(path_img, "wb") as code:
            code.write(r.content)

        chapters_info = series_info.get('chapters', [])
        for i, ch in enumerate(chapters_info):
            print(ch.get('name', ''))
            # fall back to the numeric order when the chapter has no name
            ch_name = str(ch.get('order', 0)) \
                if ch.get('name', '') is None \
                else utils.HTMLUnscaped(ch.get('name', u''))
            # edenapi slug
            slug_bt = ch_name
            slug_chapter = ' '.join([manga_slug, slug_bt])

            # skip chapters that were already downloaded
            chapter = Chapter.query.filter(
                Chapter.slug == utils.slugist(slug_chapter)).first()
            if chapter is None:
                v = utils.parse_number(ch_name, "Vol")
                v = 0 if v is None else v
                c = ch.get('order', 0)
                with transaction.manager:
                    chapter = Chapter(slug_bt, c, v)
                    # BUG FIX: fromtimestamp(datetime.now()) raised TypeError
                    # whenever 'time' was missing; use now() directly instead.
                    ts_ch = ch.get('time')
                    time = (datetime.fromtimestamp(ts_ch)
                            if ts_ch is not None else datetime.now())
                    ch_manga = Manga.query.get(manga_id)
                    ch_manga.chapter_count += 1
                    chapter.lang = ISOLang.query.filter(
                        ISOLang.iso == 'en').first()
                    chapter.updated = time
                    chapter.manga = ch_manga
                    # s = 1000v + c
                    # chapter.sortorder = (1000*float(v)) + float(c)
                    chapter.sortorder = float(c)
                    chapter.slug = slug_chapter
                    DBSession.add(chapter)
                    DBSession.flush()
                chapter = Chapter.query.filter(
                    Chapter.slug == utils.slugist(slug_chapter)).first()

                # eden API answers JSON
                headers = {
                    'content-type': 'application/json; charset=utf8'
                }
                html = self.get_html(ch.get('url'), headers=headers)
                chapter_info = self.chapter_info(html)
                # bound before the try so the cleanup handler can always use it
                ini_chapter = '/'.join([ini_path, chapter.id])
                try:
                    session = FuturesSession(executor=ThreadPoolExecutor(
                        max_workers=10))
                    for n, page in enumerate(chapter_info.get('pages', [])):
                        print(page)
                        r = session.get(page).result()
                        if r.status_code != 200:
                            # best-effort: skip unfetchable pages
                            print('continue chapter')
                            continue
                        # pages are stored as 000.ext, 001.ext, ...
                        ext = page.split('/')[-1].rsplit('.', 1)[-1]
                        path_img = '/'.join([
                            ini_chapter,
                            "{num:03d}.{ext}".format(num=n, ext=ext)
                        ])
                        print(path_img)
                        if not path.exists(ini_chapter):
                            makedirs(ini_chapter)
                        with open(path_img, "wb") as code:
                            code.write(r.content)
                except ConnectionError as Conn:
                    print(Conn)
                    # roll back: delete the chapter row and its directory
                    chapter = Chapter.query.get(chapter.id)
                    DBSession.delete(chapter)
                    # the directory may not exist yet if no page was written
                    shutil.rmtree(ini_chapter, ignore_errors=True)
    except AttributeError as e:
        # e.message is deprecated (gone in Py3); print the exception itself
        print(e)
    except KeyError as e:
        print(e)
    except ValueError as e:
        print(e)
def build_to_sys(site, source):
    """Scrape a batoto series referenced by *source*, limited to a sample.

    Same flow as the full batoto builder but only processes the first two
    chapters (``chapters_info[0:2]``) and stores each page under its original
    remote filename. Creates the ``Manga`` row if missing, downloads the
    cover, then creates/downloads each missing chapter.

    :param site: scraper object exposing ``get_html``, ``series_info``,
        ``chapter_info`` and the ``netlocs`` list.
    :param source: dict describing the series; reads the keys ``origin``,
        ``name``, ``thumb`` and ``time``.
    """
    try:
        lt = LocalDateTime.now()
        # scrape series info from the origin page
        url = source.get('origin')
        respcontent = site.get_html(url)
        series_info = site.series_info(respcontent)

        # series == manga
        qry = Manga.query
        manga = qry.filter(Manga.slug == utils.slugist(
            "-".join([site.netlocs[4], source.get('name', None)])
        )).first()
        if manga is None:
            with transaction.manager:
                manga = Manga(
                    site.netlocs[4],
                    # default must be a string, not a list
                    series_info.get('name', ''),
                    0,
                    ", ".join(series_info.get('tags', [])),
                    ", ".join(series_info.get('authors', [])),
                    ", ".join(series_info.get('artists', [])),
                    ', '.join(series_info.get('aka', [])),
                    # default must be a list: ",".join(None) raises TypeError
                    ",".join(series_info.get('description', [])),
                    1 if 'ongoing' in series_info.get('status', '').lower()
                    else 2 if 'completed' in series_info.get('status', '').lower()
                    else 0
                )
                manga.origin = source.get('origin', '')
                manga.chapter_updated = lt.from_time_stamp(source.get('time', 'now'))
                # cover file keeps the remote extension, e.g. cover.jpg
                ext = series_info.get('thumb_url', '').lower().rsplit('.', 1)[-1]
                manga.thumb = '.'.join(['cover', ext])
                manga.category = 'ja'
                DBSession.add(manga)
                DBSession.flush()
            # re-query: the transaction manager detached the instance
            manga = qry.filter(Manga.slug == utils.slugist(
                "-".join([site.netlocs[4], source.get('name', None)])
            )).first()

        manga_id, manga_thumb, manga_slug = manga.id, manga.thumb, manga.slug

        ini_path = path.join(
            path.dirname(path.dirname(__file__)),
            '/'.join(['rak', 'manga', manga_id])
        )
        # download the cover image
        r = requests.get(source.get('thumb'))
        path_img = '/'.join([ini_path, manga_thumb])
        print(path_img)
        if not path.exists(ini_path):
            makedirs(ini_path)
        with open(path_img, "wb") as code:
            code.write(r.content)

        chapters_info = series_info.get('chapters', [])
        # sample run: only the first two chapters
        for i, ch in enumerate(chapters_info[0:2]):
            print(ch.get('name', ''))
            # batoto slug: drop the "Vol.x Ch.y:" prefix before the colon
            slug_bt = ch.get('name', '')
            if ':' in slug_bt:
                slug_bt = slug_bt.split(':')
                slug_bt.pop(0)
                slug_bt = '-'.join(slug_bt)
            slug_chapter = ' '.join([manga_slug, slug_bt])

            # skip chapters that were already downloaded
            chapter = Chapter.query.filter(
                Chapter.slug == utils.slugist(slug_chapter)).first()
            if chapter is None:
                v = utils.parse_number(ch.get('name', ''), "Vol")
                v = 0 if v is None else v
                c = utils.parse_number(ch.get('name', ''), "Ch")
                c = 0 if c is None else c
                with transaction.manager:
                    chapter = Chapter(slug_bt, c, v)
                    time = lt.human_to_date(ch.get('time', 'now'))
                    ch_manga = Manga.query.get(manga_id)
                    ch_manga.chapter_count += 1
                    chapter.lang = ISOLang.query.filter(ISOLang.iso == 'en').first()
                    chapter.updated = time
                    chapter.manga = ch_manga
                    # s = 1000v + c
                    # chapter.sortorder = (1000*float(v)) + float(c)
                    chapter.sortorder = float(c)
                    chapter.slug = slug_chapter
                    DBSession.add(chapter)
                    DBSession.flush()
                chapter = Chapter.query.filter(
                    Chapter.slug == utils.slugist(slug_chapter)).first()

                # batoto: fetch the chapter page and download its images
                html = site.get_html(ch.get('url'))
                chapter_info = site.chapter_info(html)
                # bound before the try so the cleanup handler can always use it
                ini_chapter = '/'.join([ini_path, chapter.id])
                try:
                    session = FuturesSession(
                        executor=ThreadPoolExecutor(max_workers=10))
                    for page in chapter_info.get('pages', []):
                        print(page)
                        r = session.get(page).result()
                        if r.status_code != 200:
                            raise HtmlError('cannot fetch')
                        # keep the page's remote filename
                        path_img = '/'.join([ini_chapter, page.split('/')[-1]])
                        print(path_img)
                        if not path.exists(ini_chapter):
                            makedirs(ini_chapter)
                        with open(path_img, "wb") as code:
                            code.write(r.content)
                # BUG FIX: HtmlError was raised above but never caught, so a
                # failed fetch escaped without removing the half-downloaded
                # chapter; catch it together with ConnectionError.
                except (ConnectionError, HtmlError) as Conn:
                    print(Conn)
                    # roll back: delete the chapter row and its directory
                    chapter = Chapter.query.get(chapter.id)
                    DBSession.delete(chapter)
                    # the directory may not exist yet if no page was written
                    shutil.rmtree(ini_chapter, ignore_errors=True)
    except AttributeError as e:
        # e.message is deprecated (gone in Py3); print the exception itself
        print(e)
    except KeyError as e:
        print(e)
    except ValueError as e:
        print(e)