def get_manga_data(self, initial_data):
    """ Returns manga data from API

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Manga slug is missing in initial data'

    r = self.session_get(self.api_chapters_url.format(initial_data['slug']))

    # One API call returns both the series metadata and its chapter list
    json_data = r.json()
    resp_data = json_data['series']
    chapters = json_data['chapters']

    data = initial_data.copy()
    data.update(dict(
        authors=[],
        scanlators=[],
        genres=[],
        status='ongoing',
        chapters=[],
        # Description/cover are localized: pick the server's configured locale
        synopsis=resp_data['locale'][self.locale]['description'],
        server_id=self.id,
        cover=resp_data['locale'][self.locale]['thumb_url'],
        # resp_data['url'] starts with a '/', strip it before formatting
        url=self.manga_url.format(resp_data['url'][1:]),
    ))

    # Authors and artists are comma-separated strings; merge them,
    # skipping artists already listed as authors
    if resp_data.get('authors'):
        data['authors'] += [t.strip() for t in resp_data['authors'].split(',')]
    if resp_data.get('artist'):
        data['authors'] += [t.strip() for t in resp_data['artist'].split(',') if t.strip() not in data['authors']]
    # Translators are pipe-separated
    if resp_data.get('translator'):
        data['scanlators'] += [t.strip() for t in resp_data['translator'].split('|')]
    if resp_data.get('genres'):
        data['genres'] = resp_data['genres']
    # Append copyright notice (when present) to the synopsis
    if resp_data['locale'][self.locale].get('copyright'):
        data['synopsis'] += '\n\n' + resp_data['locale'][self.locale]['copyright']

    # Chapters
    for chapter in chapters:
        # Prefer 'availability_start' as chapter date, fall back to 'updated'.
        # Dates like '2020-00-00' (with '-00' placeholders) are rejected.
        date = None
        if chapter.get('availability_start'):
            date_string = chapter['availability_start'].split(' ')[0]
            if len(date_string) == 10 and '-00' not in date_string:
                date = convert_date_string(date_string, '%Y-%m-%d')
        if date is None and chapter.get('updated'):
            date_string = chapter['updated'].split(' ')[0]
            if len(date_string) == 10 and '-00' not in date_string:
                date = convert_date_string(date_string, '%Y-%m-%d')

        data['chapters'].append(dict(
            slug=chapter['chapter_id'],
            title=chapter['locale'][self.locale]['name'],
            date=date,
        ))

    return data
def get_manga_chapters_data(self, url): """ Returns manga chapters data by scraping content of manga Mobile HTML page """ # Use a Mobile user agent r = self.session_get(self.chapters_url.format(url), headers={'user-agent': USER_AGENT_MOBILE}) if r is None: return [] mime_type = magic.from_buffer(r.content[:128], mime=True) if r.status_code != 200 or mime_type != 'text/html': return [] soup = BeautifulSoup(r.text, 'html.parser') li_elements = soup.find('ul', id='_episodeList').find_all('li', recursive=False) data = [] for li_element in reversed(li_elements): if li_element.get('data-episode-no') is None: continue date_element = li_element.find('p', class_='date') if date_element.span: date_element.span.decompose() # Small difference here compared to other servers # the slug can't be used to forge chapter URL, we must store the full url url_split = urlsplit(li_element.a.get('href')) data.append( dict( slug=url_split.query, title=li_element.find('p', class_='sub_title').find( 'span', class_='ellipsis').text.strip(), date=convert_date_string(date_element.text.strip(), format='%b %d, %Y'), url='{0}?{1}'.format(url_split.path, url_split.query), )) return data
def get_manga_data(self, initial_data):
    """ Returns manga data by scraping manga HTML page content """
    response = self.session_get(self.manga_url)
    if response is None:
        return None

    if response.status_code != 200 or get_buffer_mime_type(response.content) != 'text/html':
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Single-comic server: author, status and synopsis are fixed
    data = dict(initial_data)
    data.update(
        authors=['Randall Munroe', ],
        scanlators=[],
        genres=[],
        status='ongoing',
        synopsis='A webcomic of romance, sarcasm, math, and language.',
        chapters=[],
        server_id=self.id,
        cover=self.cover_url,
    )

    # Chapters: one archive link per comic, iterated oldest first
    archive_links = soup.find('div', id='middleContainer').find_all('a')
    for link in reversed(archive_links):
        # href looks like '/123/': trim the surrounding slashes
        number = link.get('href')[1:-1]
        data['chapters'].append(dict(
            slug=number,
            date=convert_date_string(link.get('title'), '%Y-%m-%d'),
            title='{0} - {1}'.format(number, link.text.strip()),
        ))

    return data
def get_manga_data(self, initial_data):
    """ Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    if r is None:
        return None

    mime_type = get_buffer_mime_type(r.content)
    if r.status_code != 200 or mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'html.parser')

    data = initial_data.copy()
    data.update(
        dict(
            authors=[],
            scanlators=[],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
            cover=None,
        ))

    data['name'] = soup.find('span', class_='manga-title').text.strip()
    cover_element = soup.find('div', class_='mangaImage2')
    if cover_element:
        # src is protocol-relative ('//...'), prepend the scheme
        data['cover'] = 'https:{0}'.format(cover_element.img.get('src'))

    # Details
    # Walk all descendants of the second 'rightBox': <h4> elements set the
    # current label, subsequent <a> elements are values for that label.
    # NOTE(review): if an <a> precedes the first <h4>, 'label' would be
    # unbound here — presumably the markup guarantees the order; verify.
    for element in soup.find_all('div', class_='rightBox')[1].find_all():
        if element.name == 'h4':
            label = element.text.strip()

            # Labels are bilingual (English/Italian)
            if label.startswith(('Status', 'Stato')):
                # Status value is the text node following the <h4>
                status = element.find_all_next(string=True, limit=2)[1].strip().lower()
                if status in ('ongoing', 'in corso'):
                    data['status'] = 'ongoing'
                elif status in ('completed', 'completato'):
                    data['status'] = 'complete'
                elif status in ('suspended', 'sospeso'):
                    data['status'] = 'suspended'
            continue

        if element.name == 'a':
            if label.startswith(('Author', 'Autore', 'Artist', 'Artista')):
                data['authors'].append(element.text.strip())
            elif label.startswith(('Genres', 'Genere')):
                data['genres'].append(element.text.strip())

    # Synopsis
    synopsis_element = soup.find('h2', id='mangaDescription')
    if synopsis_element:
        data['synopsis'] = synopsis_element.text.strip()

    # Chapters (table rows, newest first on the page)
    elements = soup.find('table').tbody.find_all('tr')
    for element in reversed(elements):
        tds_elements = element.find_all('td')

        data['chapters'].append(
            dict(
                slug=tds_elements[0].a.get('href').split('/')[-3],
                title=tds_elements[0].b.text.strip(),
                date=convert_date_string(tds_elements[3].text.strip(), format='%b %d, %Y'),
            ))

    return data
def get_manga_data(self, initial_data):
    """ Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    if r is None:
        return None

    mime_type = magic.from_buffer(r.content[:128], mime=True)

    if r.url == self.base_url:
        # Manga page doesn't exist, we have been redirected to homepage
        return None
    if r.status_code != 200 or mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'html.parser')

    data = initial_data.copy()
    data.update(
        dict(
            authors=[],
            scanlators=[],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
            cover=None,
        ))

    name = soup.find('div', class_='ttline').h1.text.strip()
    # Site appends ' Manga' / ' Манга' to titles; strip the suffix
    name = name.replace(' Manga', '').replace(' Манга', '')  # cleaning
    data['name'] = name
    data['cover'] = soup.find('a', class_='bookface').img.get('src')

    # Details (labels are localized: en/fr/de/es/it/pt/ru)
    elements = soup.find('ul', class_='message').find_all('li')
    for element in elements:
        label = element.b.text

        if label.startswith(('Author', 'Auteur', 'Autor')):
            data['authors'] = [element.a.text.strip(), ]
        # NOTE(review): 'Genre' appears twice in this tuple — harmless
        # duplicate, second entry is redundant
        elif label.startswith(('Genre', 'Genre', 'Género', 'Genere', 'Gênero')):
            for a_element in element.find_all('a'):
                data['genres'].append(a_element.text)
        elif label.startswith(('Status', 'Statut', 'Estado', 'Stato')):
            value = element.find_all('a')[0].text.strip().lower()
            if value in ('ongoing', 'en cours', 'laufende', 'en curso', 'in corso', 'em tradução'):
                data['status'] = 'ongoing'
            elif value in ('complete', 'complété', 'abgeschlossen', 'completado', 'completato', 'completo'):
                data['status'] = 'complete'

    # Synopsis
    synopsis_element = soup.find('p', itemprop='description')
    if synopsis_element:
        # Drop the leading <b> label so only the description text remains
        synopsis_element.b.extract()
        data['synopsis'] = synopsis_element.text.strip()

    # Chapters (list may be absent when no chapter is published)
    div_element = soup.find('div', class_='chapterbox')
    if div_element:
        li_elements = div_element.find_all('li')
        for li_element in reversed(li_elements):
            slug = li_element.a.get('href').split('/')[-1].replace('.html', '')
            data['chapters'].append(
                dict(
                    slug=slug,
                    title=li_element.a.text.strip(),
                    date=convert_date_string(li_element.span.text.strip(), format='%b %d, %Y'),
                ))

    return data
def get_manga_data(self, initial_data):
    """ Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Manga slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    if r is None:
        return None

    mime_type = magic.from_buffer(r.content[:128], mime=True)
    if r.status_code != 200 or mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'html.parser')

    data = initial_data.copy()
    data.update(
        dict(
            authors=[],
            scanlators=[],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
            cover=None,
        ))

    # Name & cover
    data['name'] = soup.find('h1', class_='SeriesName').text.strip()
    data['cover'] = soup.find('div', class_='leftImage').img.get('src')

    # Details & Synopsis
    # Each row's label lives in either a <b> or a <strong> tag.
    # NOTE(review): if a row has neither, 'label' keeps its previous value
    # (or is unbound on the first row) — presumably the markup always has
    # one of the two; verify against the site.
    elements = soup.find('span', class_='details').find_all('div', class_='row')
    for element in elements:
        div_element = element.div
        if div_element.b:
            label = div_element.b.text.strip()
        elif div_element.strong:
            label = div_element.strong.text.strip()

        if label.startswith('Author'):
            links_elements = div_element.find_all('a')
            for link_element in links_elements:
                data['authors'].append(link_element.text.strip())
        elif label.startswith('Genre'):
            links_elements = div_element.find_all('a')
            for link_element in links_elements:
                data['genres'].append(link_element.text.strip())
        elif label.startswith('Status'):
            value = div_element.find_all('a')[0].text.strip()
            if value.startswith('Complete'):
                data['status'] = 'complete'
            elif value.startswith('Ongoing'):
                data['status'] = 'ongoing'
        elif label.startswith('Description'):
            data['synopsis'] = div_element.div.text.strip()

    # Chapters (slug/date come from the <a> element's own attributes)
    elements = soup.find('div', class_='chapter-list').find_all('a', recursive=False)
    for link_element in reversed(elements):
        data['chapters'].append(
            dict(
                slug=link_element.get('chapter'),
                title=link_element.span.text.strip(),
                date=convert_date_string(
                    link_element.time.get('datestring').strip(),
                    format='%Y%m%d'),
            ))

    return data
def get_manga_data(self, initial_data):
    """ Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Manga slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    if r is None:
        return None

    mime_type = get_buffer_mime_type(r.content)
    if r.status_code != 200 or mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'html.parser')

    data = initial_data.copy()
    data.update(
        dict(
            authors=[],
            # This server is itself the scanlation team
            scanlators=[
                self.name,
            ],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
            cover=None,
        ))

    data['name'] = soup.find_all('h5')[0].text.strip()
    # Cover URL is embedded in an inline CSS 'background-image: url(...)'
    # style; take what's inside the parentheses
    data['cover'] = self.image_url.format(
        soup.find(
            'div',
            class_='media-comic-card').a.get('style').split('(')[-1][:-1])

    # Details
    # Synopsis is the 3rd child node (a bare text node) of the column div
    data['synopsis'] = soup.find('div', class_='col-lg-9').contents[2].strip()

    # Chapters (page lists newest first)
    elements = soup.find('div', class_='list list-row row').find_all(
        'div', class_='list-item')
    for element in reversed(elements):
        a_elements = element.find_all('a')

        # Slug is the last two path segments of the chapter URL
        slug = '/'.join(a_elements[0].get('href').split('/')[-2:])
        title = '#{0} - {1}'.format(element.span.text.strip(),
                                    a_elements[0].text.strip())
        date = a_elements[1].text.strip()

        data['chapters'].append(
            dict(
                slug=slug,
                date=convert_date_string(date),
                title=title,
            ))

    return data
def get_manga_data(self, initial_data):
    """ Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Manga slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    if r is None:
        return None

    mime_type = magic.from_buffer(r.content[:128], mime=True)
    if r.status_code != 200 or mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'html.parser')

    # Adult-content gate: the page shows an alert instead of the content;
    # re-request via POST with adult=true to get the real page
    adult_alert = False
    if soup.find('div', class_='alert'):
        adult_alert = True

        r = self.session_post(self.manga_url.format(initial_data['slug']),
                              data=dict(adult='true'))
        if r is None:
            return None

        soup = BeautifulSoup(r.text, 'html.parser')

    data = initial_data.copy()
    data.update(
        dict(
            authors=[],
            # This server is itself the scanlation team
            scanlators=[
                self.name,
            ],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
            cover=None,
        ))

    data['name'] = soup.find('h1', class_='title').text.strip()
    data['cover'] = soup.find('div', class_='thumbnail').img.get('src')

    # Details: each <b> is a label, the value is the text node that follows
    # it; '[2:]' drops the leading ': ' separator
    for element in soup.find('div', class_='info').find_all('b'):
        label = element.text
        value = list(element.next_siblings)[0][2:]

        if label in ('Author', 'Artist'):
            data['authors'].append(value)
        elif label in (
                'Description',
                'Synopsis',
        ):
            if adult_alert:
                # Keep a trace of the mature-content warning in the synopsis
                data['synopsis'] = '{0}\n\n{1}'.format(
                    'ALERT: This series contains mature contents and is meant to be viewed by an adult audience.',
                    value)
            else:
                data['synopsis'] = value

    # Chapters
    for element in reversed(
            soup.find('div', class_='list').find_all('div', class_='element')):
        a_element = element.find('div', class_='title').a
        title = a_element.text.strip()
        # Slug is the chapter URL with the manga/lang prefix and trailing
        # slash removed
        slug = a_element.get('href').replace(
            f'{self.base_url}/read/{initial_data["slug"]}/{self.lang}/',
            '')[:-1]
        # Date is the text node after the last <a> in meta_r; '[2:]' drops
        # the leading separator
        date = convert_date_string(
            list(
                element.find('div', class_='meta_r').find_all('a')
                [-1].next_siblings)[0][2:], '%Y.%m.%d')

        data['chapters'].append(dict(
            slug=slug,
            date=date,
            title=title,
        ))

    return data
def get_manga_data(self, initial_data):
    """ Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Manga slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    if r is None:
        return None

    mime_type = get_buffer_mime_type(r.content)
    if r.status_code != 200 or mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'lxml')

    data = initial_data.copy()
    data.update(
        dict(
            authors=[],
            scanlators=[],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
            cover=None,
        ))

    # Title is the first <h3>, minus its icon and badge children
    name_element = soup.find_all('h3')[0]
    name_element.i.decompose()
    name_element.small.decompose()
    data['name'] = name_element.text.strip()
    data['cover'] = self.cover_url.format(data['slug'])

    # Details (labels are in Spanish)
    elements = soup.find('div', class_='list-group').find_all(
        'span', class_='list-group-item')
    for element in elements:
        label = element.b.text.strip()

        if label.startswith(('Autor', 'Artist')):
            for a_element in element.find_all('a'):
                value = a_element.text.strip()
                if value not in data['authors']:
                    data['authors'].append(value)
        elif label.startswith('Categorías'):
            for a_element in element.find_all('a'):
                value = a_element.text.strip()
                # BUG FIX: deduplicate against genres (was data['authors'],
                # a copy-paste from the branch above, which let duplicate
                # genres through and dropped genres equal to an author name)
                if value not in data['genres']:
                    data['genres'].append(value)
        elif label.startswith('Estado'):
            value = element.span.text.strip().lower()
            if value in ('complete', 'ongoing'):
                data['status'] = value
        elif label.startswith('Resumen'):
            # Remove the label so only the summary text remains
            element.b.extract()
            data['synopsis'] = element.text.strip()

    # Chapters (table rows, newest first on the page)
    elements = soup.find('div', class_='capitulos-list').find_all('tr')
    for element in reversed(elements):
        td_elements = element.find_all('td')
        a_element = td_elements[0].find('a')
        date_element = td_elements[1]
        # Strip icon and badge so only the date text remains
        date_element.i.extract()
        date_element.span.extract()

        data['chapters'].append(
            dict(
                slug=a_element.get('href').split('/')[-1],
                title=a_element.text.strip(),
                date=convert_date_string(date_element.text.strip(), '%d %b. %Y'),
            ))

    return data
def get_manga_data(self, initial_data):
    """ Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Manga slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    if r is None:
        return None

    mime_type = magic.from_buffer(r.content[:128], mime=True)
    if r.status_code != 200 or mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'lxml')

    data = initial_data.copy()
    data.update(
        dict(
            authors=[],
            scanlators=[],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
            cover=None,
        ))

    data['name'] = soup.find_all('h2', class_='widget-title')[0].text.strip()
    data['cover'] = self.cover_url.format(data['slug'])

    # Details: a <dl> where each <dt> is a label and the following <dd>
    # holds the value (labels are in French)
    elements = soup.find(
        'dl', class_='dl-horizontal').findChildren(recursive=False)
    for element in elements:
        if element.name not in ('dt', 'dd'):
            continue
        if element.name == 'dt':
            # Remember the label; the next <dd> consumes it
            label = element.text
            continue

        if label.startswith('Auteur') or label.startswith('Artist'):
            value = element.text.strip()
            # Authors and artists are merged, deduplicated
            for t in value.split(','):
                t = t.strip()
                if t not in data['authors']:
                    data['authors'].append(t)
        elif label.startswith('Catégories'):
            data['genres'] = [
                a_element.text.strip()
                for a_element in element.find_all('a')
            ]
        elif label.startswith('Statut'):
            value = element.text.strip().lower()
            if value == 'en cours':
                data['status'] = 'ongoing'
            elif value == 'terminé':
                data['status'] = 'complete'

    data['synopsis'] = soup.find('div', class_='well').p.text.strip()
    # Append any warning banner (licensing notice, etc.) to the synopsis
    alert_element = soup.find('div', class_='alert-danger')
    if alert_element:
        data['synopsis'] += '\n\n' + alert_element.text.strip()

    # Chapters (newest first on the page)
    elements = soup.find('ul', class_='chapters').find_all('li', recursive=False)
    for element in reversed(elements):
        h5 = element.h5
        # Skip non-chapter list items
        if not h5:
            continue

        slug = h5.a.get('href').split('/')[-1]
        title = '{0}: {1}'.format(h5.a.text.strip(), h5.em.text.strip())
        date = element.div.div

        data['chapters'].append(
            dict(slug=slug,
                 date=convert_date_string(date.text.strip(), format='%d %b. %Y'),
                 title=title))

    return data
def get_manga_data(self, initial_data):
    """ Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Manga slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    if r is None:
        return None

    mime_type = magic.from_buffer(r.content[:128], mime=True)
    if r.status_code != 200 or mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'html.parser')

    data = initial_data.copy()
    data.update(
        dict(
            authors=[],
            scanlators=[],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
        ))

    # Title element differs between page layouts; try both
    title_element = soup.find('h1', class_='manga-bg__title')
    if title_element is None:
        title_element = soup.find('h1', class_='manga__title')
    data['name'] = title_element.text.strip()
    # Keep a cover already present in initial_data (e.g. from search)
    if data.get('cover') is None:
        data['cover'] = self.cover_url.format(data['slug'])

    # Details (labels are in French)
    elements = soup.find(
        'div', class_='manga-info').find_all(class_='info-list__row')
    for element in elements:
        label = element.strong.text.strip()

        if label.startswith('Auteur') or label.startswith('Artiste'):
            # Authors and artists are merged, deduplicated
            value = element.a.text.strip()
            for t in value.split(','):
                t = t.strip()
                if t not in data['authors']:
                    data['authors'].append(t)
        elif label.startswith('Scantrad'):
            a_element = element.find_all('a')[0]
            # Team names are displayed as '[Name]'; strip the brackets
            data['scanlators'] = [
                a_element.text.replace('[', '').replace(']', '').strip(),
            ]
        elif label.startswith('Genres'):
            a_elements = element.find_all('a')
            data['genres'] = [
                a_element.text.strip() for a_element in a_elements
            ]
        elif label.startswith('Statut'):
            status = element.span.text.strip().lower()
            if status == 'en cours':
                data['status'] = 'ongoing'
            elif status == 'terminé':
                data['status'] = 'complete'

    # Synopsis
    data['synopsis'] = soup.find('div', class_='info-desc__content').text.strip()

    # Chapters (newest first on the page)
    elements = soup.find('div', class_='chapters-list').find_all(
        'div', class_='chapter-item')
    for element in reversed(elements):
        a_element = element.find('div', class_='chapter-item__name').a
        slug = a_element.get('href').split('/')[-1]
        title = a_element.text.strip()
        date = element.find('div', class_='chapter-item__date').text.strip()

        data['chapters'].append(
            dict(
                slug=slug,
                title=title,
                date=convert_date_string(date, format='%d.%m.%Y'),
            ))

    return data
def get_manga_data(self, initial_data):
    """ Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Manga slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    # FIX: guard against a failed request before touching r.status_code —
    # the sibling servers' session_get can return None on error, and the
    # original code would raise AttributeError here instead of returning None
    if r is None:
        return None
    if r.status_code != 200:
        return None

    mime_type = get_buffer_mime_type(r.content)
    if mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'html.parser')

    data = initial_data.copy()
    data.update(
        dict(
            authors=[],
            scanlators=[],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
        ))

    # Name & cover
    data['name'] = soup.find(
        'div', class_='story-info-right').find('h1').text.strip()
    # Keep a cover already present in initial_data (e.g. from search)
    if data.get('cover') is None:
        data['cover'] = soup.find('span', class_='info-image').img.get('src')

    # Details
    tr_elements = soup.find('table', class_='variations-tableInfo').find_all('tr')
    for tr_element in tr_elements:
        td_elements = tr_element.find_all('td')
        label = td_elements[0].text.strip()
        value = td_elements[1].text.strip()

        if label.startswith('Author'):
            # Values are '-'-separated; drop empty fragments
            data['authors'] = [t.strip() for t in value.split('-') if t]
        elif label.startswith('Genres'):
            data['genres'] = [t.strip() for t in value.split('-')]
        elif label.startswith('Status'):
            status = value.lower()
            if status == 'completed':
                data['status'] = 'complete'
            elif status == 'ongoing':
                data['status'] = 'ongoing'

    # Synopsis (drop the <h3> heading inside the description panel)
    div_synopsis = soup.find('div', id='panel-story-info-description')
    div_synopsis.h3.extract()
    data['synopsis'] = div_synopsis.text.strip()

    # Chapters (newest first on the page)
    li_elements = soup.find('ul', class_='row-content-chapter').find_all('li')
    for li_element in reversed(li_elements):
        span_elements = li_element.find_all('span')
        slug = li_element.a.get('href').split('/')[-1]
        title = li_element.a.text.strip()
        # 'title' attr carries date + time; [:-6] drops the trailing time
        # part — presumably ' HH:MM'; confirm against the site markup
        date = span_elements[1].get('title')[:-6]

        data['chapters'].append(
            dict(
                slug=slug,
                title=title,
                date=convert_date_string(date, format='%b %d,%y'),
            ))

    return data
def get_manga_data(self, initial_data):
    """ Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Manga slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    if r is None:
        return None

    mime_type = magic.from_buffer(r.content[:128], mime=True)
    if r.status_code != 200 or mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'lxml')

    data = initial_data.copy()
    data.update(
        dict(
            authors=[],
            scanlators=[],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
        ))

    info_element = soup.find('div', class_='leftContent')

    title_element = info_element.find('span', class_='name')
    data['name'] = title_element.text.strip()

    # Full-size cover URL is carried by the 'data-full' attribute
    cover_element = info_element.find('img', attrs={'data-full': True})
    data['cover'] = cover_element.get('data-full')

    # Details (labels are in Russian)
    elements = info_element.find('div', class_='subject-meta').find_all(
        'p', recursive=False)

    # Second <p> holds the publication status as a bare text node
    status = elements[1].find(text=True, recursive=False).strip()
    if status == 'продолжается':
        data['status'] = 'ongoing'
    elif status == 'завершен':
        data['status'] = 'complete'

    for element in elements[2:]:
        label = element.span.text.strip()

        # Author / scriptwriter / artist all go into authors
        if label.startswith('Автор') or label.startswith(
                'Сценарист') or label.startswith('Художник'):
            value = [
                author.text.strip()
                for author in element.find_all('a', class_='person-link')
            ]
            data['authors'].extend(value)
        elif label.startswith('Переводчик'):
            value = [
                scanlator.text.strip()
                for scanlator in element.find_all('a', class_='person-link')
            ]
            data['scanlators'].extend(value)
        elif label.startswith('Жанр'):
            value = [
                genre.text.strip()
                for genre in element.find_all('a', class_='element-link')
            ]
            data['genres'].extend(value)

    # Synopsis
    data['synopsis'] = info_element.find(
        'div', class_='manga-description').text.strip()

    # Chapters (absent when nothing is published yet)
    chapters_element = info_element.find('div', class_='chapters-link', recursive=False)
    if not chapters_element:
        return data

    for element in reversed(
            chapters_element.table.find_all('tr', recursive=False)):
        a_element = element.find('a')
        # Slug keeps everything after the 2nd '/' of the chapter path
        slug = a_element.get('href').split('/', 2)[2]
        title = a_element.find(text=True, recursive=False).strip()
        date = element.find('td', align='right').text.strip()

        data['chapters'].append(
            dict(
                slug=slug,
                title=title,
                date=convert_date_string(date, format='%d.%m.%Y'),
            ))

    return data
def get_manga_data(self, initial_data):
    """ Returns manga data by scraping manga HTML page content """
    r = self.session_get(self.manga_url.format(initial_data['slug']))
    if r is None:
        return None

    mime_type = magic.from_buffer(r.content[:128], mime=True)
    if r.status_code != 200 or mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'html.parser')

    data = initial_data.copy()
    data.update(
        dict(
            authors=[],
            scanlators=[],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
            cover=None,
        ))

    container_element = soup.find('div', class_='tamanho-bloco-perfil')

    data['name'] = container_element.find('h2').text.strip()
    data['cover'] = container_element.find(
        'img', class_='img-thumbnail').get('src')

    # Details (labels are in Portuguese): each block is '<h4><label>…</label> value</h4>'
    for div_element in container_element.find_all(
            'div', class_='col-md-8 col-xs-12'):
        if not div_element.h4:
            continue

        label = div_element.find('label').text.strip()
        # Remove the label so only the value text remains
        div_element.h4.label.extract()
        value = div_element.text.strip()

        if label.startswith('Gênero'):
            data['genres'] = [genre.strip() for genre in value.split(',')]
        elif label.startswith(('Autor', 'Artista')):
            # Authors and artists are merged, deduplicated
            for author in value.split(','):
                author = author.strip()
                if author not in data['authors']:
                    data['authors'].append(author)
        elif label.startswith('Status'):
            if value == 'Completo':
                data['status'] = 'complete'
            elif value == 'Ativo':
                data['status'] = 'ongoing'

    data['synopsis'] = container_element.find(
        'div', class_='panel-body').text.strip()

    # Chapters (newest first on the page)
    for div_element in reversed(
            container_element.find_all('div', class_='row lancamento-linha')):
        a_element = div_element.div.a
        span_element = div_element.div.find_all('span', recursive=False)[1]

        data['chapters'].append(
            dict(
                title=a_element.text.strip(),
                slug=a_element.get('href').split('/')[-1],
                # Date is displayed as '(dd/mm/YYYY)'; [1:-1] strips the parentheses
                date=convert_date_string(span_element.text.strip()[1:-1],
                                         format='%d/%m/%Y'),
            ))

    return data
def get_manga_data(self, initial_data):
    """ Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Manga slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    if r is None:
        return None

    mime_type = get_buffer_mime_type(r.content)
    if r.status_code != 200 or mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'lxml')

    data = initial_data.copy()
    data.update(
        dict(
            authors=[],
            scanlators=[],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
        ))

    # Title element differs between page layouts; try both
    title_element = soup.find('h1', class_='manga-bg__title')
    if not title_element:
        title_element = soup.find('div', class_='manga-title').h1
    data['name'] = title_element.text.strip()

    cover_element = soup.find('img', class_='manga__cover')
    data['cover'] = cover_element.get('src')

    # Details (labels are in Russian)
    # NOTE: branch order matters — 'Переводчик' (translator) must be tested
    # before 'Перевод' (translation status), since the former starts with
    # the latter
    for info in soup.find_all('div', class_='info-list__row'):
        label = info.strong.text.strip()

        if label.startswith('Автор'):
            value = [author.text.strip() for author in info.find_all('a')]
            data['authors'].extend(value)
        elif label.startswith('Художник'):
            # Artists join the authors list, deduplicated
            value = [
                author.text.strip() for author in info.find_all('a')
                if not author.text.strip() in data['authors']
            ]
            data['authors'].extend(value)
        elif label.startswith('Переводчик'):
            value = [
                scanlator.text.strip() for scanlator in info.find_all('a')
            ]
            data['scanlators'].extend(value)
        elif label.startswith('Перевод'):
            status = info.span.text.strip()
            if status == 'продолжается':
                data['status'] = 'ongoing'
            elif status == 'завершен':
                data['status'] = 'complete'
        elif label.startswith('Жанр'):
            value = [genre.text.strip() for genre in info.find_all('a')]
            data['genres'].extend(value)

    # Synopsis
    synopsis_element = soup.find('div', class_='info-desc__content')
    if synopsis_element:
        data['synopsis'] = synopsis_element.text.strip()

    # Chapters (newest first on the page)
    for element in reversed(soup.find_all('div', class_='chapter-item')):
        a_element = element.find('a')
        if a_element:
            # Keep the path after scheme/host: skip 'https://' then take
            # everything past the 2nd '/'
            slug = a_element.get('href')[8:].split('/', 2)[2]
        else:
            # No link: chapter info is carried by data-* attributes; forge
            # the slug from volume/number and the first team's slug
            teams = json.loads(element.get('data-teams'))
            slug = 'v{}/c{}/{}'.format(element.get('data-volume'),
                                       element.get('data-number'),
                                       teams[0]['slug'])

        # Collapse internal whitespace in the chapter name
        title = ' '.join(
            element.find('div', class_='chapter-item__name').text.split())
        date = element.find('div', class_='chapter-item__date').text.strip()

        data['chapters'].append(
            dict(
                slug=slug,
                title=title,
                date=convert_date_string(date, format='%d.%m.%Y'),
            ))

    return data
def get_manga_data(self, initial_data):
    """ Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Manga slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    if r is None:
        return None

    mime_type = get_buffer_mime_type(r.content)
    if r.status_code != 200 or mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'html.parser')

    data = initial_data.copy()
    data.update(
        dict(
            authors=[],
            # This server is itself the scanlation team
            scanlators=[
                SERVER_NAME,
            ],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
            cover=None,
        ))

    div_info = soup.find('div', class_='mf-info')

    data['name'] = div_info.find('div', class_='titre').text.strip()
    # Poster src is relative; prepend the site base URL
    data['cover'] = '{0}/{1}'.format(
        self.base_url,
        div_info.find('div', class_='poster').img.get('src'))

    # Status label is in French, in the last 'sub-i' block
    status = div_info.find_all(
        'div', class_='sub-i')[-1].span.text.strip().lower()
    if status == 'en cours':
        data['status'] = 'ongoing'
    elif status == 'terminé':
        data['status'] = 'complete'

    data['synopsis'] = div_info.find('div', class_='synopsis').text.strip()

    # Chapters (newest first on the page)
    for div_element in reversed(
            soup.find('div', id='chap-top').find_all('div', class_='chapitre')):
        btns_elements = div_element.find('div', class_='ch-right').find_all('a')
        # Skip rows without both action buttons (chapter not readable here)
        if len(btns_elements) < 2:
            continue

        data['chapters'].append(
            dict(
                slug=btns_elements[0].get('href').split('/')[-1],
                date=convert_date_string(
                    div_element.find('div', class_='chl-date').text),
                title='{0} {1}'.format(
                    div_element.find('span', class_='chl-num').text.strip(),
                    div_element.find('span', class_='chl-titre').text.strip()),
            ))

    return data
def get_manga_data(self, initial_data):
    """
    Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    # session_get can return None on failure; guard before touching the
    # response (consistent with the other servers in this file)
    if r is None:
        return None
    if r.status_code != 200:
        return None

    mime_type = get_buffer_mime_type(r.content)
    if mime_type != 'text/html':
        return None

    data = initial_data.copy()
    data.update(dict(
        authors=[],
        scanlators=[],  # not available
        genres=[],
        status=None,
        synopsis=None,
        chapters=[],
        server_id=self.id,
        cover=self.cover_url.format(data['slug']),
    ))

    soup = BeautifulSoup(r.content, 'lxml')

    data['name'] = soup.find('h1').text.strip()

    # Details: each <li> is "<span>Label</span> value, value, ..."
    for li_element in soup.find('ul', class_='list-group list-group-flush').find_all('li'):
        if li_element.span is None:
            continue

        label = li_element.span.text.strip()
        # Remove the label so li_element.text is the bare value
        li_element.span.extract()

        if label.startswith('Author'):
            data['authors'] = [artist.strip() for artist in li_element.text.split(',')]
        elif label.startswith('Genre'):
            data['genres'] = [genre.strip() for genre in li_element.text.split(',')]
        elif label.startswith('Status'):
            # Only the "(Scan)" status is relevant (the others describe the raw)
            for status in li_element.text.split(','):
                if 'Scan' not in status:
                    continue

                status = status.replace('(Scan)', '').strip().lower()
                if status in ('complete', 'hiatus', 'ongoing', ):
                    data['status'] = status
                elif status in ('cancelled', 'discontinued', ):
                    data['status'] = 'suspended'
                break
        elif label.startswith('Description'):
            data['synopsis'] = li_element.text.strip()

    # Chapters list is embedded as JSON in the page's last <script>,
    # assigned to the `vm.Chapters` JS variable
    chapters = None
    try:
        script = soup.find_all('script')[-1].string
        if script:
            for line in script.split('\n'):
                line = line.strip()
                if not line.startswith('vm.Chapters'):
                    continue

                # Strip the `vm.Chapters = ` prefix and trailing `;`
                chapters = json.loads(line.split('=')[1].strip()[:-1])
                break
    except Exception as e:
        log_error_traceback(e)
        return None

    if chapters is not None:
        for chapter in reversed(chapters):
            slug = chapter['Chapter']

            # Chapter codes encode the number in the middle digits and a
            # decimal part in the last digit (0 means none)
            title = f'{chapter["Type"]} {int(chapter["Chapter"][1:-1])}'
            if chapter['Chapter'][-1] != '0':
                title = f'{title}.{chapter["Chapter"][-1]}'
            if chapter.get('ChapterName'):
                title = f'{title} - {chapter["ChapterName"]}'

            data['chapters'].append(dict(
                slug=slug,
                title=title,
                date=convert_date_string(chapter['Date'], '%Y-%m-%d %H:%M:%S') if chapter.get('Date') else None,
            ))

    return data
def get_manga_data(self, initial_data):
    """
    Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search).
    Returns None on network/HTTP failure or when the response is not HTML.
    """
    assert 'slug' in initial_data, 'Manga slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    if r is None:
        return None

    # Sniff only the first bytes; enough to identify text/html
    mime_type = magic.from_buffer(r.content[:128], mime=True)

    if r.status_code != 200 or mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'html.parser')

    data = initial_data.copy()
    data.update(
        dict(
            authors=[],
            scanlators=[],
            genres=[],
            status=None,
            chapters=[],
            server_id=self.id,
            synopsis=None,
        ))

    card_element = soup.find_all('div', class_='card')[0]

    # Main name: japscan handles several names for mangas (main + alternatives)
    # Name provided by search can be one of the alternatives
    # First word (Manga, Manhwa, ...) must be removed from name
    data['name'] = ' '.join(
        card_element.find('h1').text.strip().split()[1:])
    if data.get('cover') is None:
        data['cover'] = self.cover_url.format(
            card_element.find('img').get('src'))

    # Details: the mobile and desktop layouts wrap the label/value pairs
    # in different containers, detected by the presence of 'd-flex' divs
    if not card_element.find_all('div', class_='d-flex'):
        # mobile version
        elements = card_element.find_all('div', class_='row')[0].find_all('p')
    else:
        # desktop version
        elements = card_element.find_all(
            'div', class_='d-flex')[0].find_all('p', class_='mb-2')

    for element in elements:
        label = element.span.text
        # Remove the label so element.text is the bare value
        element.span.extract()
        value = element.text.strip()

        if label.startswith(('Auteur', 'Artiste')):
            # Authors and artists are merged, de-duplicated
            for t in value.split(','):
                t = t.strip()
                if t not in data['authors']:
                    data['authors'].append(t)
        elif label.startswith('Genre'):
            data['genres'] = [genre.strip() for genre in value.split(',')]
        elif label.startswith('Statut'):
            # Possible values: ongoing, complete
            data[
                'status'] = 'ongoing' if value == 'En Cours' else 'complete'

    # Synopsis
    synopsis_element = card_element.find('p', class_='list-group-item-primary')
    if synopsis_element:
        data['synopsis'] = synopsis_element.text.strip()

    # Chapters (listed newest-first on the page, hence the reversal)
    elements = soup.find('div', id='chapters_list').find_all(
        'div', class_='chapters_list')

    for element in reversed(elements):
        if element.a.span:
            # NOTE: extracting the badge span here also makes the later
            # `element.span` lookup resolve to the date span instead
            span = element.a.span.extract()
            # JapScan sometimes uploads some "spoiler preview" chapters, containing 2 or 3 untranslated pictures taken from a raw.
            # Sometimes they also upload full RAWs/US versions and replace them with a translation as soon as available.
            # Those have a span.badge "SPOILER", "RAW" or "VUS". We exclude these from the chapters list.
            if span.text.strip() in (
                    'RAW',
                    'SPOILER',
                    'VUS',
            ):
                continue

        slug = element.a.get('href').split('/')[3]

        data['chapters'].append(
            dict(
                slug=slug,
                title=element.a.text.strip(),
                date=convert_date_string(element.span.text.strip(),
                                         format='%d %b %Y'),
            ))

    return data
def get_manga_data(self, initial_data):
    """
    Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    if r is None:
        return None

    mime_type = magic.from_buffer(r.content[:128], mime=True)
    if r.status_code != 200 or mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'html.parser')

    data = initial_data.copy()
    data.update({
        'authors': [],
        'scanlators': [],
        'genres': [],
        'status': None,
        'synopsis': None,
        'chapters': [],
        'server_id': self.id,
        'cover': None,
    })

    data['name'] = soup.find('h1').text.strip()

    # Details: each 'item' div pairs a 'header' label with a 'description' value
    for item in soup.find('div', class_='relaxed').find_all('div', class_='item'):
        header = item.find('div', class_='header')
        if not header:
            continue

        label = header.text.strip()
        value_element = item.find('div', class_='description')

        if label == 'Sinópse':
            # The cover image sits inside the synopsis block; pull it out
            # first so its alt text doesn't pollute the synopsis
            data['cover'] = value_element.img.extract().get('src')
            data['synopsis'] = value_element.text.strip()
        elif label in ('Arte', 'Autor'):
            data['authors'].append(value_element.text.strip())
        elif label == 'Gênero':
            data['genres'].extend(
                a.text.strip() for a in value_element.find_all('a'))
        elif label == 'Scantrad':
            data['scanlators'].extend(
                a.text.strip() for a in value_element.find_all('a'))
        elif label == 'Status':
            status = value_element.a.text.strip()
            statuses = {
                'Em publicação': 'ongoing',
                'Completo': 'complete',
                'Cancelado': 'suspended',
                'Pausado': 'hiatus',
            }
            if status in statuses:
                data['status'] = statuses[status]
        elif label == 'Capítulos':
            # First table row is the header; rows are newest-first
            rows = value_element.find_all(
                'div', class_='content')[0].table.tbody.find_all('tr')[1:]
            for row in reversed(rows):
                cells = row.find_all('td')
                data['chapters'].append(dict(
                    slug=cells[0].a.get('href').split('/')[-1],
                    title=cells[0].a.text.strip(),
                    date=convert_date_string(
                        cells[1].text.strip(), format='%d/%m/%Y'),
                ))

    return data