def result2meta(self, result, prev_identifiers=None):
    '''
    Convert the result dict into Calibre metadata.
    Note: Source download plugins do not have access to custom columns.
    '''
    # Avoid a mutable default argument: a shared dict would leak
    # identifiers between calls.
    if prev_identifiers is None:
        prev_identifiers = {}
    title = get_title(result)
    authors = get_author_list(result)
    mi = Metadata(title=title, authors=authors)
    mi.identifiers = update_identifiers(prev_identifiers, result)
    put_publisher(mi, result)
    put_language(mi, result)
    self.put_pubdate(mi, result)
    put_tags(mi, result)
    put_journal(mi, result)
    self.put_series_index(mi, result)
    comments = ""
    if prefs['abstract_to_comment'] and 'abstract' in result:
        comments = "\n\n".join([comments, result['abstract']])
    if prefs['query_to_comment']:
        extra_meta = self.mkComments(result)
        extra_plus = map(lambda x: "crossref:%s" % x, extra_meta)
        extra = "\n".join(extra_plus)
        comments = "\n\n".join([comments, extra])
    mi.comments = comments
    if 'score' in result:
        mi.source_relevance = 100 - result['score']
    else:
        mi.source_relevance = 100
    # self.log.info("set comment to %s" % mi.comments)
    return mi
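# The prev_identifiers=None guard above matters because Python evaluates default
# argument values once, at function definition time, so a dict default is shared
# across calls. A minimal standalone illustration of the pitfall (the `remember`
# helper is hypothetical, not part of the plugin):

def remember(item, seen={}):    # BAD: the dict is created once, at def time
    seen[item] = True
    return seen

print(remember('a'))    # {'a': True}
print(remember('b'))    # {'a': True, 'b': True} -- state leaked from the first call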
def _get_results(self):
    """ Download information from Google Scholar """
    querier = ScholarQuerier(author=self.query_authors[0], count=self.count)
    querier.query(self.query_title, bibtex=True)
    articles = querier.articles
    if self.count > 0:
        articles = articles[:self.count]
    for num, art in enumerate(articles):
        bibtex_string = art.as_bib()
        bib = Bibparser(bibtex_string)
        bib.parse()
        # dict views are not indexable in Python 3, so materialize a list first
        slug = list(bib.records.keys())[0]
        bib_dict = bib.records[slug]
        title = bib_dict.get('title')
        authors = []
        for author in bib_dict.get('author', []):
            # Ignore non-existent given names
            given_name = '%s ' % author.get('given') if 'given' in author else ''
            # Add full stops after abbreviated name parts
            given_name = re.sub(r'(^| +)([A-Z])( +|$)', r'\1\2.\3', given_name)
            authors.append('%s%s' % (given_name, author['family']))
        mi = Metadata(title, authors)
        mi.set_identifier('googlescholar', slug)
        mi.source_relevance = 100 - num
        if 'publisher' in bib_dict:
            mi.publisher = bib_dict['publisher']
        if 'issued' in bib_dict:
            if 'literal' in bib_dict['issued']:
                year = int(bib_dict['issued']['literal'])
                from calibre.utils.date import utc_tz
                # We only have the year, so let's use Jan 1st
                mi.pubdate = datetime.datetime(year, 1, 1, tzinfo=utc_tz)
        self.plugin.clean_downloaded_metadata(mi)
        self._log_metadata(mi)
        self.result_queue.put(mi, True)
        self.log.info(self.result_queue.qsize())
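# The abbreviation regex above is easiest to see on concrete values; a quick
# standalone check (the sample names are made up). A bare capital initial
# followed by a space gains a full stop; full given names pass through unchanged:

import re

print(re.sub(r'(^| +)([A-Z])( +|$)', r'\1\2.\3', 'J '))       # -> 'J. '
print(re.sub(r'(^| +)([A-Z])( +|$)', r'\1\2.\3', 'John '))    # -> 'John ' (unchanged)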
def parse_details(self, root):
    """ Parse the details page and put a Metadata result on the queue. """
    try:
        self.log.info("    Parse details: %r" % self.url)
        self.databazeknih_id = self.parse_databazeknih_id(self.url)
        self.log.info("    Parsed DK identifier: %s" % self.databazeknih_id)
    except:
        self.log.exception("Error parsing DK identifier for url: %r" % self.url)
        self.databazeknih_id = None

    # Parse title
    self.parse_title(root)
    # Parse authors
    self.parse_authors(root)

    if not self.title or not self.authors or not self.databazeknih_id:
        self.log.error("Could not find title/authors/DK id for %r" % self.url)
        self.log.error("DK id: %r Title: %r Authors: %r" %
                       (self.databazeknih_id, self.title, self.authors))
        return

    mi = Metadata(self.title, self.authors)
    mi.set_identifier("databazeknih", self.databazeknih_id)

    # Parse series
    self.parse_series(root, mi)
    # Parse comments
    self.parse_comments(root, mi)
    # Parse publisher
    self.parse_publisher(root, mi)
    # Parse pubdate
    self.parse_pubdate(root, mi)
    # Parse tags
    self.parse_tags(root, mi)
    # Parse rating
    self.parse_rating(root, mi)
    # Parse book ISBN
    self.parse_isbn(self.more_info, mi)
    # Parse language
    self.parse_language(self.more_info, mi)
    # Parse book cover
    self.parse_cover(root, mi)

    mi.source_relevance = self.relevance
    self.log.info(mi)
    self.result_queue.put(mi)
def extract_vol_details(self, vol_url):
    # Here we extract and format the information from the chosen volume:
    # - First and last name to populate author and author sort : vol_auteur_prenom and vol_auteur_nom
    # - The title of the volume                                : vol_title
    # - The series name the volume is part of                  : vol_serie
    # - The sequence number in the series                      : vol_serie_seq       # missing
    # - The editor of this volume                              : vol_editor
    # - The editor's collection of this volume                 : vol_coll
    # - The collection serial code of this volume              : vol_coll_srl
    # - The "dépôt légal" date (the publication date is mostly unknown) : vol_dp_lgl  # date format to be computed
    # - The ISBN number associated with the volume             : vol_isbn
    # - The volume tags                                        : vol_genre
    # - The url pointer to the volume cover image              : vol_cover_index
    # - The comments include various info about the book       : vol_comment_soup
    #   . reference, an url pointer to noosfere
    #   . couverture, an url pointer to noosfere; the cover may be real small, but is accurate to the volume
    #   . first edition information
    #   . series (cycle) name and number
    #   . this volume's editor info
    #   . Résumé (quatrième de couverture, the back-cover blurb)
    #   . Critiques (reviews)
    #   . Sommaire detailing which novels are in the volume when it is an anthology
    #   . Critiques about the series and/or about another volume of the book
    #
    debug = self.dbg_lvl & 2
    self.log.info(self.who, "\nIn extract_vol_details(soup)")
    if debug:
        self.log.info(self.who, "vol_url : ", vol_url)
        self.log.info(self.who, "calling ret_soup(log, dbg_lvl, br, url, rkt=None, who='[__init__]')")
        self.log.info(self.who, "vol_url : ", vol_url, "who : ", self.who)

    rsp = ret_soup(self.log, self.dbg_lvl, self.br, vol_url, who=self.who)
    soup = rsp[0]
    url_vrai = rsp[1].replace("&Tri=3", "")
    # if debug: self.log.info(self.who, soup.prettify())    # useful but too big...

    self.nsfr_id = self.nsfr_id + "$vl$" + url_vrai.replace('?', '&').replace('=', '&').split('&')[2]
    # self.nsfr_id = (self.nfsr_id).strip("$")
    # The one-liner above fails with "'Worker' object has no attribute 'nfsr_id'"
    # because of the nfsr/nsfr typo, hence the two-step form below.
    tmp = self.nsfr_id
    self.nsfr_id = tmp.strip('$')
    if debug:
        self.log.info(self.who, "self.nsfr_id, type() : ", self.nsfr_id, type(self.nsfr_id))

    tmp_lst = []
    vol_info = {}
    vol_title = ""
    vol_auteur = ""
    vol_auteur_prenom = ""
    vol_auteur_nom = ""
    vol_serie = ""
    vol_serie_seq = ""
    vol_editor = ""
    vol_coll = ""
    vol_coll_srl = ""
    vol_dp_lgl = ""
    vol_isbn = ""
    vol_genre = ""
    vol_cover_index = ""
    comment_generic = None
    comment_resume = None
    comment_Critiques = None
    comment_Sommaire = None
    comment_AutresCritique = None
    comment_cover = None
    comment_decoupage_annexe = None

    # add volume address as a reference in the comment
    vol_comment_soup = BS('<div><p>Référence: <a href="' + url_vrai + '">' + url_vrai + '</a></p></div>', "lxml")
    if debug:
        self.log.info(self.who, "vol reference processed")

    if soup.select("span[class='TitreNiourf']"):
        vol_title = soup.select("span[class='TitreNiourf']")[0].text.strip()
        if debug:
            self.log.info(self.who, "vol_title processed : ", vol_title)

    if soup.select("span[class='AuteurNiourf']"):
        vol_auteur = soup.select("span[class='AuteurNiourf']")[0].text.replace("\n", "").strip()
        if debug:
            self.log.info(self.who, "vol_auteur processed : ", vol_auteur)
    for i in range(len(vol_auteur.split())):
        if not vol_auteur.split()[i].isupper():
            vol_auteur_prenom += " " + vol_auteur.split()[i]
        else:
            vol_auteur_nom += " " + vol_auteur.split()[i].title()
    vol_auteur = vol_auteur.title()
    vol_auteur_prenom = vol_auteur_prenom.strip()
    if debug:
        self.log.info(self.who, "vol_auteur_prenom processed : ", vol_auteur_prenom)
    vol_auteur_nom = vol_auteur_nom.strip()
    if debug:
        self.log.info(self.who, "vol_auteur_nom processed : ", vol_auteur_nom)

    if soup.select("a[href*='serie.asp']"):
        if soup.select("a[href*='serie.asp']")[0].find_parent("span", {"class": "ficheNiourf"}):
            vol_serie = soup.select("a[href*='serie.asp']")[0].text
            tmp_vss = [x for x in soup.select("a[href*='serie.asp']")[0].parent.stripped_strings]
            for i in range(len(tmp_vss)):
                if "vol." in tmp_vss[i]:
                    if not vol_serie_seq:
                        vol_serie_seq = tmp_vss[i].replace("vol.", "").strip()
                if "découpage" in tmp_vss[i]:
                    dec_anx_url = "https://www.noosfere.org/livres/" + soup.select("a[href*='serie.asp']")[0]['href']
                    comment_pre_decoupage_annexe = BS(
                        '<div><p> </p><p style="font-weight: 600; font-size: 18px"> Découpage annexe</p><hr style="color:CCC;"/></div>',
                        "lxml")
                    comment_decoupage_annexe = self.get_decoupage_annexe(dec_anx_url)
            if debug:
                self.log.info(self.who, "vol_serie, vol_serie_seq processed : ", vol_serie, ",", vol_serie_seq)

    comment_generic = soup.select("span[class='ficheNiourf']")[0]
    new_div = soup.new_tag('div')
    comment_generic = comment_generic.wrap(new_div)
    if debug:
        self.log.info(self.who, "comment_generic processed")

    if soup.select("a[href*='editeur.asp']"):
        vol_editor = soup.select("a[href*='editeur.asp']")[0].text
        if debug:
            self.log.info(self.who, "vol_editor processed : ", vol_editor)

    if soup.select("a[href*='collection.asp']"):
        vol_coll = soup.select("a[href*='collection.asp']")[0].text
        if debug:
            self.log.info(self.who, "vol_coll : ", vol_coll)

    for i in comment_generic.stripped_strings:
        tmp_lst.append(str(i))
    vol_coll_srl = tmp_lst[len(tmp_lst) - 1]
    if "n°" in vol_coll_srl:
        for k in ["n°", "(", ")"]:
            if k in vol_coll_srl:
                vol_coll_srl = vol_coll_srl.replace(k, "")
        vol_coll_srl = vol_coll_srl.strip()
        vol_coll_srl = vol_coll_srl.split("/")[0]
        if vol_coll_srl[0].isnumeric():
            vol_coll_srl = ("0" * 5 + vol_coll_srl)[-6:]
    else:
        vol_coll_srl = ""
    if debug:
        self.log.info(self.who, "vol_coll_srl processed : ", vol_coll_srl)

    # The publication date is largely ignored in noosfere, but we have the "dépôt légal"
    # date and I use it instead. Note that I 'calculate' the missing day of the month,
    # and sometimes even the missing month.
    ms = ("janvier", "février", "mars", "avril", "mai", "juin",
          "juillet", "août", "septembre", "octobre", "novembre", "décembre")
    for elemnt in soup.select_one("span[class='sousFicheNiourf']").stripped_strings:
        if debug:
            self.log.info(self.who, "elemnt : ", elemnt)
        if not vol_dp_lgl:
            elemn = (elemnt.replace("Dépôt légal :", "").split(','))[0].strip()
            if elemn:
                if elemn.isnumeric() and len(elemn) == 4:
                    # year only: pick day 175, roughly mid-year
                    vol_dp_lgl = datetime.datetime.strptime("175 " + elemn, "%j %Y")
                elif "semestre" in elemn:
                    ele = elemn.split()
                    vol_dp_lgl = datetime.datetime.strptime(
                        ("000" + str((int(ele[0][0]) - 1) * 175 + 97))[-3:] + " " + ele[2], "%j %Y")
                elif "trimestre" in elemn:
                    ele = elemn.split()
                    vol_dp_lgl = datetime.datetime.strptime(
                        ("000" + str((int(ele[0][0]) - 1) * 91 + 47))[-3:] + " " + ele[2], "%j %Y")
                else:
                    for i in range(len(ms)):
                        if ms[i] in elemn:
                            ele = elemn.split()
                            vol_dp_lgl = datetime.datetime.strptime(
                                ("000" + str(10 + 31 * i))[-3:] + " " + ele[1], "%j %Y")
                            break
            if debug:
                self.log.info(self.who, "vol_dp_lgl : ", vol_dp_lgl)
        if "ISBN" in elemnt:
            vol_isbn = elemnt.lower().replace(" ", "").replace('isbn:', '')
            if "néant" in vol_isbn:
                vol_isbn = ""
            if debug:
                self.log.info(self.who, "vol_isbn processed : ", vol_isbn)
        if "Genre" in elemnt:
            vol_genre = elemnt.lstrip("Genre : ")
            if debug:
                self.log.info(self.who, "vol_genre processed : ", vol_genre)

    if soup.select("img[name='couverture']"):
        for elemnt in repr(soup.select("img[name='couverture']")[0]).split('"'):
            if "http" in elemnt:
                if not vol_cover_index:
                    vol_cover_index = elemnt
    if debug:
        self.log.info(self.who, "vol_cover_index processed : ", vol_cover_index)

    # add cover image address as a reference in the comment
    if vol_cover_index:
        comment_cover = BS('<div><p>Couverture: <a href="' + vol_cover_index + '">' +
                           vol_cover_index + '</a></p></div>', "lxml")

    # Select the fields I want... More exist, such as film adaptations or reading advice,
    # but that is not quite consistent across all the books (noosfere is a common database
    # maintained by many people), besides I have enough info like that AND I do NOT want
    # to take away noosfere's business.
    tmp_comm_lst = soup.select("span[class='AuteurNiourf']")
    if debug:
        self.log.info(self.who, tmp_comm_lst)    # useful but too long
    for i in range(len(tmp_comm_lst)):
        if "Quatrième de couverture" in str(tmp_comm_lst[i]):
            comment_resume = tmp_comm_lst[i].find_parents("div", {'class': 'sousbloc'})[0]
            if debug:
                self.log.info(self.who, "comment_resume processed")
        if "Critiques" in str(tmp_comm_lst[i]):
            if not "autres" in str(tmp_comm_lst[i]):
                comment_Critiques = tmp_comm_lst[i].find_parents("div", {'class': 'sousbloc'})[0]
                if debug:
                    self.log.info(self.who, "comment_Critiques processed")
        if "Sommaire" in str(tmp_comm_lst[i]):
            comment_Sommaire = tmp_comm_lst[i].find_parents("div", {'class': 'sousbloc'})[0]
            if debug:
                self.log.info(self.who, "comment_Sommaire processed")
        if "Critiques des autres" in str(tmp_comm_lst[i]):
            comment_AutresCritique = tmp_comm_lst[i].find_parents("div", {'class': 'sousbloc'})[0]
            if comment_AutresCritique.select('a[href*="serie.asp"]') and (
                    "Critique de la série" in comment_AutresCritique.select('a[href*="serie.asp"]')[0].text):
                critic_url = "https://www.noosfere.org/livres/" + \
                    comment_AutresCritique.select('a[href*="serie.asp"]')[0]['href']
                try:
                    more_comment_AutresCritique = self.get_Critique_de_la_serie(critic_url)
                    comment_AutresCritique.append(more_comment_AutresCritique)
                except:
                    self.log.exception("get_Critique_de_la_serie failed for url: ", critic_url)
            if debug:
                self.log.info(self.who, "comment_AutresCritique processed")

    # Group in one big bundle all the fields I think I want... (it is difficult not to include more... :-))
    if comment_cover:
        vol_comment_soup.append(comment_cover)
    if comment_generic:
        vol_comment_soup.append(comment_generic)
    if comment_resume:
        vol_comment_soup.append(comment_resume)
    if comment_Critiques:
        vol_comment_soup.append(comment_Critiques)
    if comment_Sommaire:
        vol_comment_soup.append(comment_Sommaire)
    if comment_AutresCritique:
        vol_comment_soup.append(comment_AutresCritique)
    if comment_decoupage_annexe:
        vol_comment_soup.append(comment_pre_decoupage_annexe)    # this is the title
        vol_comment_soup.append(comment_decoupage_annexe)

    # Make a minimum of "repair" to vol_comment_soup so that it displays correctly (how I
    # like it) in the comments and in my catalogs:
    # - I hate justify when it makes margins "float" around the correct position
    #   (in fact when spaces are used instead of absolute positioning)
    # - I like to have working urls when they exist
    # - I like to find out the next and/or previous books in a series (simulated arrows are links :-))
    for elemnt in vol_comment_soup.select('[align="justify"]'):
        del elemnt['align']

    # Remove all double or triple 'br' to improve presentation.
    # Note: tmp1 and tmp2 must start with a value different from any possible first elemnt.
    # (yes, I am lrp and I am unique :-))
    # Note to self: since comment_generic was appended to vol_comment_soup, there is only
    # one version in memory, so modifying comment_generic here modifies vol_comment_soup too...
    tmp1 = tmp2 = "lrp_the_unique"
    for elemnt in vol_comment_soup.findAll():
        tmp1, tmp2 = tmp2, elemnt
        if tmp1 == tmp2:
            elemnt.extract()

    br = soup.new_tag('br')
    for elemnt in vol_comment_soup.select('.AuteurNiourf'):
        elemnt.insert(0, br)
        elemnt["style"] = "font-weight: 600; font-size: 18px"

    if debug:
        for elemnt in vol_comment_soup.select("a[href*='.asp']"):
            if 'http' not in elemnt.get('href'):
                self.log.info(self.who, "incomplete url before correction: ", elemnt)

    # Rewrite the various relative noosfere links as absolute ones
    for elemnt in vol_comment_soup.select("a[href*='/livres/auteur.asp']"):
        if 'http' not in elemnt.get('href'):
            elemnt["href"] = elemnt["href"].replace("/livres/auteur.asp", "https://www.noosfere.org/livres/auteur.asp")
    for elemnt in vol_comment_soup.select("a[href*='/livres/niourf.asp']"):
        if 'http' not in elemnt.get('href'):
            elemnt["href"] = elemnt["href"].replace("/livres/niourf.asp", "https://www.noosfere.org/livres/niourf.asp")
    for elemnt in vol_comment_soup.select("a[href*='/heberg/']"):
        if 'http' not in elemnt.get('href'):
            elemnt["href"] = elemnt["href"].replace("/heberg/", "https://www.noosfere.org/heberg/")
    for elemnt in vol_comment_soup.select("a[href*='./EditionsLivre.asp']"):
        if 'http' not in elemnt.get('href'):
            elemnt["href"] = elemnt["href"].replace("./EditionsLivre.asp", "https://www.noosfere.org/livres/EditionsLivre.asp")
    for elemnt in vol_comment_soup.select("a[href*='./niourf.asp']"):
        if 'http' not in elemnt.get('href'):
            elemnt["href"] = elemnt["href"].replace("./niourf.asp", "https://www.noosfere.org/livres/niourf.asp")
    for elemnt in vol_comment_soup.select("a[href*='heberg']"):
        if 'http' not in elemnt.get('href'):
            elemnt["href"] = elemnt["href"].replace("../../heberg", "https://www.noosfere.org/heberg")
    for elemnt in vol_comment_soup.select("a[href*='../bd']"):
        if 'http' not in elemnt.get('href'):
            elemnt["href"] = elemnt["href"].replace("../bd", "https://www.noosfere.org/bd")
    for elemnt in vol_comment_soup.select("a[href*='auteur.asp']"):
        if 'http' not in elemnt.get('href'):
            elemnt["href"] = elemnt["href"].replace("auteur.asp", "https://www.noosfere.org/livres/auteur.asp")
    for elemnt in vol_comment_soup.select("a[href*='collection.asp']"):
        if 'http' not in elemnt.get('href'):
            elemnt["href"] = elemnt["href"].replace("collection.asp", "https://www.noosfere.org/livres/collection.asp")
    for elemnt in vol_comment_soup.select("a[href*='critsign.asp']"):
        if 'http' not in elemnt.get('href'):
            elemnt["href"] = elemnt["href"].replace("critsign.asp", "https://www.noosfere.org/livres/critsign.asp")
    for elemnt in vol_comment_soup.select("a[href*='EditionsLivre.asp']"):
        if 'http' not in elemnt.get('href'):
            elemnt["href"] = elemnt["href"].replace("EditionsLivre.asp", "https://www.noosfere.org/livres/EditionsLivre.asp")
    for elemnt in vol_comment_soup.select("a[href*='editeur.asp']"):
        if 'http' not in elemnt.get('href'):
            elemnt["href"] = elemnt["href"].replace("editeur.asp", "https://www.noosfere.org/livres/editeur.asp")
    for elemnt in vol_comment_soup.select("a[href*='editionslivre.asp']"):
        if 'http' not in elemnt.get('href'):
            elemnt["href"] = elemnt["href"].replace("editionslivre.asp", "https://www.noosfere.org/livres/editionslivre.asp")
    for elemnt in vol_comment_soup.select("a[href*='niourf.asp']"):
        if 'http' not in elemnt.get('href'):
            elemnt["href"] = elemnt["href"].replace("niourf.asp", "https://www.noosfere.org/livres/niourf.asp")
    for elemnt in vol_comment_soup.select("a[href*='serie.asp']"):
        if 'http' not in elemnt.get('href'):
            elemnt["href"] = elemnt["href"].replace("serie.asp", "https://www.noosfere.org/livres/serie.asp")

    if debug:
        for elemnt in vol_comment_soup.select("a[href*='.asp']"):
            if 'http' not in elemnt.get('href'):
                self.log.info(self.who, "incomplete url after correction: ", elemnt)

    fg, fd = "<<==", "==>>"    # chr(0x21D0), chr(0x21D2)  # chr(0x27f8), chr(0x27f9)
    for elemnt in vol_comment_soup.select("img[src*='arrow_left']"):
        elemnt.replace_with(fg)
    for elemnt in vol_comment_soup.select("img[src*='arrow_right']"):
        elemnt.replace_with(fd)

    # Depending on the tick box, make a "fat" publisher using separators that have a very
    # low probability of popping up (§ and €). Only set vol_coll_srl if vol_coll exists.
    # The idea is to use search and replace in the bulk metadata edit window.
    if self.extended_publisher:
        if debug:
            self.log.info(self.who, """flag: "Ajoute collection et son numéro d'ordre au champ éditeur" set""")
        if vol_coll:
            if debug:
                self.log.info(self.who, 'add collection')
            vol_editor = vol_editor + ('§') + vol_coll
            if vol_coll_srl:
                if debug:
                    self.log.info(self.who, 'add collection number')
                vol_editor = vol_editor + ('€') + vol_coll_srl

    if vol_serie:
        if vol_serie_seq.isnumeric():
            vol_serie_seq = float(vol_serie_seq)
        else:
            vol_serie_seq = 1.0

    # UTF-8 characters may be serialized in different ways; only xmlcharrefreplace produces
    # xml-compatible strings. Any other non-ascii character with another utf-8 byte
    # representation will make calibre fail with the message:
    # ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters
    # Side note: the page structure is not really sound (I once got html 3 times, div as a
    # sibling of html...), but calibre does not seem to care (nice :-)).
    # It took me ages to find out, almost by chance, that encode('ascii', 'xmlcharrefreplace')
    # helped (I had tried everything that could improve xml compatibility, but I misread the
    # error and assumed an incompatibility with the xml structure).
    vol_comment_soup = vol_comment_soup.encode('ascii', 'xmlcharrefreplace')

    self.log.info(self.who, "+++" * 25)
    self.log.info(self.who, "nsfr_id, type() : ", self.nsfr_id, type(self.nsfr_id))                  # must be <class 'str'>
    self.log.info(self.who, "relevance, type() : ", self.relevance, type(self.relevance))            # must be <class 'float'>
    self.log.info(self.who, "vol_title, type() : ", vol_title, type(vol_title))                      # must be <class 'str'>
    self.log.info(self.who, "vol_auteur, type() : ", vol_auteur, type(vol_auteur))                   # must be <class 'list'> of <class 'str'>
    self.log.info(self.who, "vol_auteur_prenom, type() : ", vol_auteur_prenom, type(vol_auteur_prenom))  # must be <class 'str'>
    self.log.info(self.who, "vol_auteur_nom, type() : ", vol_auteur_nom, type(vol_auteur_nom))       # must be <class 'str'>
    if vol_serie:
        self.log.info(self.who, "vol_serie, type() : ", vol_serie, type(vol_serie))                  # must be <class 'str'>
        self.log.info(self.who, "vol_serie_seq, type() : ", vol_serie_seq, type(vol_serie_seq))      # must be <class 'float'>
    self.log.info(self.who, "vol_editor, type() : ", vol_editor, type(vol_editor))                   # must be <class 'str'>
    self.log.info(self.who, "vol_coll, type() : ", vol_coll, type(vol_coll))                         # must be <class 'str'>
    self.log.info(self.who, "vol_coll_srl, type() : ", vol_coll_srl, type(vol_coll_srl))             # must be <class 'str'>
    self.log.info(self.who, "vol_dp_lgl, type() : ", vol_dp_lgl, type(vol_dp_lgl))                   # must be <class 'datetime.datetime'> ('renderer=isoformat')
    self.log.info(self.who, "vol_isbn, type() : ", vol_isbn, type(vol_isbn))                         # must be <class 'str'>
    self.log.info(self.who, "vol_genre, type() : ", vol_genre, type(vol_genre))                      # must be <class 'list'> of <class 'str'>
    self.log.info(self.who, "vol_cover_index, type() : ", vol_cover_index, type(vol_cover_index))    # must be
    self.log.info(self.who, "type(vol_comment_soup) : ", type(vol_comment_soup))                     # must be byte encoded (starts with b'blablabla...)
    # self.log.info(self.who, "vol_comment_soup :\n", vol_comment_soup)    # maybe a bit long sometimes
    # language must be <class 'str'>

    if vol_cover_index:
        self.plugin.cache_identifier_to_cover_url(self.nsfr_id, vol_cover_index)
    if vol_isbn:
        self.plugin.cache_isbn_to_identifier(vol_isbn, self.nsfr_id)

    mi = Metadata(vol_title, [vol_auteur])
    mi.set_identifier('nsfr_id', self.nsfr_id)
    mi.publisher = vol_editor
    mi.isbn = vol_isbn
    mi.tags = [vol_genre]
    mi.source_relevance = self.relevance
    mi.has_cover = bool(vol_cover_index)
    if vol_dp_lgl:
        mi.pubdate = vol_dp_lgl
    if vol_serie:
        mi.series = vol_serie
        mi.series_index = vol_serie_seq
    mi.language = "fra"
    mi.comments = vol_comment_soup
    if debug:
        self.log.info(self.who, "mi\n", mi, "\n")
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
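# The "dépôt légal" reconstruction above leans on strptime's %j (day-of-year)
# directive to fabricate an approximate date when noosfere only records a year,
# semester, or trimester. A standalone check of that trick, mirroring the
# constants used above (day 175 for a bare year, (n-1)*175+97 for a semester):

import datetime

print(datetime.datetime.strptime("175 1984", "%j %Y"))    # 1984-06-23, roughly mid-year
print(datetime.datetime.strptime("272 1984", "%j %Y"))    # 1984-09-28, i.e. "2e semestre 1984"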
def parse_details(self, root):
    try:
        yes24_id = self.parse_yes24_id(self.url)
    except:
        self.log.exception('Error parsing YES24 id for url: %r' % self.url)
        yes24_id = None

    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception('Error parsing title and series for url: %r' % self.url)
        title = series = series_index = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not yes24_id:
        self.log.error('Could not find title/authors/YES24 id for %r' % self.url)
        self.log.error('YES24: %r Title: %r Authors: %r' % (yes24_id, title, authors))
        return

    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier('yes24', yes24_id)
    self.yes24_id = yes24_id

    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)

    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)
    mi.cover_url = self.cover_url    # This is purely so we can run a test for it!!!

    try:
        mi.publisher = self.parse_publisher(root)
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    try:
        mi.pubdate = self.parse_published_date(root)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    mi.language = 'ko'
    mi.source_relevance = self.relevance

    if self.yes24_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.yes24_id)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, raw, root):
    # Parse the individual metadata fields
    # self.log.info("=====")
    try:
        asin = self.parse_asin(root)
    except:
        self.log.exception('Error parsing asin for url: %r' % self.url)
        asin = None

    if self.testing:
        import tempfile, uuid
        with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4())) + '_',
                                         suffix='.html', delete=False) as f:
            f.write(raw)
        print('Downloaded html for', asin, 'saved in', f.name)

    # Extract the title
    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    # Extract the authors
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not asin:
        self.log.error('Could not find title/authors/asin for %r' % self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title, authors))
        return

    # Build the metadata object mi from title and authors
    mi = Metadata(title, authors)

    # Set the book id
    idtype = '17k'
    mi.set_identifier(idtype, asin)
    self.k17k_id = asin

    # Set the comments (description)
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    # Set the series
    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)

    # Set the tags
    try:
        mi.tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    # Set the last-modified date
    # try:
    #     mi.last_modified = self.parse_last_modified(root)
    # except:
    #     self.log.exception('Error parsing last_modified for url: %r' % self.url)

    # Set the cover
    try:
        self.cover_url = self.parse_cover(root, raw)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)

    mi.source_relevance = self.relevance
    mi.languages = [u'中文']

    if self.k17k_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.k17k_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.k17k_id, self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    try:
        goodreads_id = self.parse_goodreads_id(self.url)
    except:
        self.log.exception("Error parsing goodreads id for url: %r" % self.url)
        goodreads_id = None

    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception("Error parsing title and series for url: %r" % self.url)
        title = series = series_index = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception("Error parsing authors for url: %r" % self.url)
        authors = []

    if not title or not authors or not goodreads_id:
        self.log.error("Could not find title/authors/goodreads id for %r" % self.url)
        self.log.error("Goodreads: %r Title: %r Authors: %r" % (goodreads_id, title, authors))
        return

    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier("goodreads", goodreads_id)
    self.goodreads_id = goodreads_id

    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception("Error parsing ISBN for url: %r" % self.url)

    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception("Error parsing ratings for url: %r" % self.url)

    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception("Error parsing comments for url: %r" % self.url)

    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception("Error parsing cover for url: %r" % self.url)
    mi.has_cover = bool(self.cover_url)

    try:
        tags = self.parse_tags(root)
        if tags:
            mi.tags = tags
    except:
        self.log.exception("Error parsing tags for url: %r" % self.url)

    try:
        mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
    except:
        self.log.exception("Error parsing publisher and date for url: %r" % self.url)

    mi.source_relevance = self.relevance

    if self.goodreads_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.goodreads_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.goodreads_id, self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    try:
        moly_id = self.parse_moly_id(self.url)
        self.log.info('Parsed moly.hu identifier: %s' % moly_id)
    except:
        self.log.exception('Error parsing moly.hu id for url: %r' % self.url)
        moly_id = None

    try:
        title = self.parse_title(root)
        self.log.info('Parsed title: %s' % title)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    try:
        authors = self.parse_authors(root)
        self.log.info('Parsed authors: %s' % authors)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not moly_id:
        self.log.error('Could not find title/authors/moly.hu id for %r' % self.url)
        self.log.error('Moly.hu id: %r Title: %r Authors: %r' % (moly_id, title, authors))
        return

    mi = Metadata(title, authors)
    mi.set_identifier('moly_hu', moly_id)
    self.moly_id = moly_id

    try:
        isbn = self.parse_isbn(root)
        self.log.info('Parsed ISBN: %s' % isbn)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)

    try:
        series_info = self.parse_series(root)
        if series_info is not None:
            mi.series = series_info[0]
            mi.series_index = int(series_info[1])
            self.log.info('Parsed series: %s, series index: %f' % (mi.series, mi.series_index))
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)

    try:
        mi.comments = self.parse_comments(root)
        self.log.info('Parsed comments: %s' % mi.comments)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        self.cover_url = self.parse_covers(root)
        self.log.info('Parsed URL for cover: %r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.moly_id, self.cover_url)
        mi.has_cover = bool(self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)

    try:
        mi.tags = self.parse_tags(root)
        self.log.info('Parsed tags: %s' % mi.tags)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        mi.languages = self.parse_languages(mi.tags)
        self.log.info('Parsed languages: %r' % mi.languages)
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)

    try:
        mi.publisher = self.parse_publisher(root)
        self.log.info('Parsed publisher: %s' % mi.publisher)
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    try:
        mi.pubdate = self.parse_published_date(root)
        self.log.info('Parsed publication date: %s' % mi.pubdate)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    try:
        mi.rating = self.parse_rating(root)
        self.log.info('Parsed rating: %s\n\n' % mi.rating)
    except:
        self.log.exception('Error parsing rating for url: %r\n\n' % self.url)

    mi.source_relevance = self.relevance

    if self.moly_id and self.isbn:
        self.plugin.cache_isbn_to_identifier(self.isbn, self.moly_id)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    try:
        kyobobook_id = self.parse_kyobobook_id(self.url)
    except:
        self.log.exception('Error parsing Kyobobook id for url: %r' % self.url)
        kyobobook_id = None

    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception('Error parsing title and series for url: %r' % self.url)
        title = series = series_index = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not kyobobook_id:
        self.log.error('Could not find title/authors/kyobobook id for %r' % self.url)
        self.log.error('Kyobobook: %r Title: %r Authors: %r' % (kyobobook_id, title, authors))
        return

    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier('kyobobook', kyobobook_id)
    self.kyobobook_id = kyobobook_id

    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)

    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)

    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)

    try:
        tags = self.parse_tags(root)
        if tags:
            mi.tags = tags
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
    except:
        self.log.exception('Error parsing publisher and date for url: %r' % self.url)

    try:
        lang = self._parse_language(root)
        if lang:
            mi.language = lang
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)

    mi.source_relevance = self.relevance

    if self.kyobobook_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.kyobobook_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.kyobobook_id, self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def get_details(self):
    self.log.info("    Worker.get_details:")
    self.log.info("        self:     ", self)
    self.log.info("        self.url: ", self.url)

    # We should not even be here if we are not processing an ebook hit
    if self.url.find("/ebook/") == -1:
        return

    try:
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Beam Ebooks timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    # raw = raw.decode('utf-8', errors='replace')
    raw = raw.decode('iso-8859-1', errors='replace')
    # open('D:\\work\\calibre-dump-book-details.html', 'wb').write(raw)

    if '<title>404 - ' in raw:
        self.log.error('URL malformed: %r' % self.url)
        return

    try:
        # root = fromstring(clean_ascii_chars(raw))
        root = fromstring(raw)
    except:
        msg = 'Failed to parse beam ebooks details page: %r' % self.url
        self.log.exception(msg)
        return

    try:
        self.beam_ebooks_id = self.parse_beam_ebooks_id(self.url)
    except:
        self.log.exception('Error parsing beam ebooks id for url: %r' % self.url)
        self.beam_ebooks_id = None

    try:
        (self.title, self.series_index) = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        self.title = None
        self.series_index = None

    try:
        self.authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        self.authors = None

    mi = Metadata(self.title, self.authors)
    mi.set_identifier('beam-ebooks', self.beam_ebooks_id)
    if self.series_index:
        mi.series_index = float(self.series_index)
        self._determine_perry_rhodan_cycle_name(mi)
    mi.source_relevance = self.relevance

    self.plugin.clean_downloaded_metadata(mi)
    print(mi)
    self.result_queue.put(mi)
def get_details(self):
    self.log.info("    Worker.get_details:")
    self.log.info("        self:     ", self)
    self.log.info("        self.url: ", self.url)

    try:
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
        self.log.info(raw)
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Bookmeta for biblionet timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    if '<title>404 - ' in raw:
        self.log.error('URL malformed: %r' % self.url)
        return

    try:
        # root = fromstring(clean_ascii_chars(raw))
        root = json.loads(raw)
        self.log.info(root)
    except:
        msg = 'Failed to parse book detail page: %r' % self.url
        self.log.exception(msg)
        return

    try:
        self.biblionetid = root['biblionetid']
    except:
        self.log.exception('Error parsing book id for url: %r' % self.url)
        self.biblionetid = None

    try:
        self.title = root['title'].strip()
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        self.title = None
        self.series_index = None

    try:
        self.authors = [root['authors'].strip()]
        self.log.info(self.authors)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        self.authors = None

    try:
        self.cover_url = root['cover_url']
        self.log.info('Parsed URL for cover:%r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.biblionetid, self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    self.has_cover = bool(self.cover_url)

    try:
        self.publisher = root['publisher']
        self.log.info('Parsed publisher:%s' % self.publisher)
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    try:
        self.tags = root['categories'].replace('DDC: ', 'DDC:').replace('-', '').split()[:-1]
        self.log.info('Parsed tags:%s' % self.tags)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        self.pubdate = root['yr_published']
        self.log.info('Parsed publication date:%s' % self.pubdate)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    mi = Metadata(self.title, self.authors)
    mi.set_identifier('biblionet', self.biblionetid)

    if self.series_index:
        try:
            mi.series_index = float(self.series_index)
        except:
            self.log.exception('Error loading series')
    if self.relevance:
        try:
            mi.source_relevance = self.relevance
        except:
            self.log.exception('Error loading relevance')
    if self.cover_url:
        try:
            mi.cover_url = self.cover_url
        except:
            self.log.exception('Error loading cover_url')
    if self.publisher:
        try:
            mi.publisher = self.publisher
        except:
            self.log.exception('Error loading publisher')
    if self.tags:
        try:
            mi.tags = self.tags
        except:
            self.log.exception('Error loading tags')
    if self.pubdate:
        try:
            if self.pubdate not in (self.yr_msg1, self.yr_msg2):
                d = datetime.date(int(self.pubdate), 1, 1)
                mi.pubdate = d
        except:
            self.log.exception('Error loading pubdate')

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def load_details(self, url, timeout):
    def _format_item(str):
        return re.sub('^"(.*)"$', '\\1', unescape(str))

    def _format_list(str):
        return [_.strip() for _ in _format_item(str).split(',')]

    def _find_meta(node, property):
        return [_.get('content') for _ in node if _.get('property') == property][0]

    def _format_date(date_text):
        year = int(date_text[0:4])
        month = int(date_text[4:6])
        day = int(date_text[6:])
        return datetime.datetime(year, month, day, tzinfo=utc_tz)

    try:
        response = self.browser.open(url, timeout=timeout)
        root = lxml.html.fromstring(response.read())

        # Fields taken from the <meta> tags:
        # book id, title, ISBN, image URL, rating
        meta = root.xpath('//meta[starts-with(@property, "og") or starts-with(@property, "books")]')

        # Fields taken from the schema.org JSON:
        # title, authors, description, publisher
        ld_json = root.xpath('//script[@type="application/ld+json"]/text()')
        ld = [json.loads(_) for _ in ld_json]
        book_info = [_ for _ in ld if _['@type'] == 'Book'][0]
    except Exception as e:
        self.log.exception(e)
        return    # without the parsed page there is nothing to work with

    ridibooks_id = re.search('id=([0-9]+)', url).group(1)
    isbn = _find_meta(meta, 'books:isbn')
    cover_url = _find_meta(meta, 'og:image')
    title = _find_meta(meta, 'og:title')

    authors = _format_list(book_info['author']['name'])
    if 'translator' in book_info:    # dict.has_key() is Python 2 only
        authors.extend([_ + u'(역자)' for _ in _format_list(book_info['translator']['name'])])

    mi = Metadata(title, authors)
    mi.set_identifier('ridibooks', ridibooks_id)
    mi.cover_url = cover_url
    mi.has_cover = bool(cover_url)
    mi.publisher = _format_item(book_info['publisher']['name'])
    mi.pubdate = _format_date(book_info['datePublished'])
    mi.comments = _format_item(book_info['description'])
    mi.rating = float(_find_meta(meta, 'books:rating:normalized_value'))

    series = re.search(r'(.*)\s*(\d+)권', title)
    if series:
        mi.series = series.group(1)
        mi.series_index = float(series.group(2))

    mi.language = 'ko'    # calibre expects an ISO 639 language code
    mi.source_relevance = self.relevance

    if ridibooks_id:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, ridibooks_id)
        if cover_url:
            self.plugin.cache_identifier_to_cover_url(ridibooks_id, cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
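# The series regex above splits a numbered Korean volume title. Note that the
# greedy (.*) keeps the trailing space, because \s* is allowed to match empty.
# A quick illustration with a made-up title:

import re

m = re.search(r'(.*)\s*(\d+)권', u'드래곤 라자 3권')
print(repr(m.group(1)), m.group(2))    # '드래곤 라자 ' 3 -- group(1) keeps the trailing space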
def parse_details(self, root):
    try:
        CBDB_id = self.parse_CBDB_id(self.url)
    except:
        self.log.exception('Error parsing CBDB id for url: %r' % self.url)
        CBDB_id = None

    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception('Error parsing title and series for url: %r' % self.url)
        title = series = series_index = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not CBDB_id:
        self.log.error('Could not find title/authors/CBDB id for %r' % self.url)
        self.log.error('CBDB: %r Title: %r Authors: %r' % (CBDB_id, title, authors))
        return

    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    # mi.identifiers['cbdb'] = CBDB_id
    mi.set_identifier('cbdb', CBDB_id)
    # self.log.info(CBDB_id)
    # self.log.info(mi.identifiers.get('cbdb', None))
    self.CBDB_id = CBDB_id

    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)

    # summary
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        self.cover_urls = self.parse_covers(root)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_urls)
    # self.log.info('covers')
    # self.log.info(self.cover_urls)

    try:
        tags = self.parse_tags(root)
        if tags:
            mi.tags = tags
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        mi.publisher, mi.pubdate, isbn = self.parse_editions(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing publisher and date for url: %r' % self.url)

    mi.source_relevance = self.relevance
    mi.language = 'Czech'

    # self.log.info('self.CBDB_id = ' + str(self.CBDB_id))
    if self.CBDB_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.CBDB_id)
        if self.cover_urls:
            self.plugin.cache_identifier_to_cover_url(self.CBDB_id, self.cover_urls)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    try:
        legie_id = self.parse_legie_id(self.url)
    except:
        self.log.exception('Error parsing Legie id for url: %r' % self.url)
        legie_id = None

    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not legie_id:
        self.log.error('Could not find title/authors/Legie id for %r' % self.url)
        self.log.error('Legie: %r Title: %r Authors: %r' % (legie_id, title, authors))
        return

    self.legie_id = legie_id

    rating = comments = series = series_index = None
    try:
        rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)

    try:
        comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        (series, series_index) = self.parse_series(root)
    except:
        self.log.info('Series not found.')

    try:
        tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
        tags = None

    if legie_id:
        editions = self.get_editions()
        if editions:
            num_editions = len(editions)
            self.log.info('Found %d editions' % num_editions)
            # Emit one Metadata result per edition, keyed as legie_id#year
            for edition in editions:
                (year, cover_url, publisher, isbn) = edition
                mi = Metadata(title, authors)
                self.legie_id = "%s#%s" % (legie_id, year)
                mi.set_identifier('legie', self.legie_id)
                mi.source_relevance = self.relevance
                mi.rating = rating
                mi.comments = comments
                mi.series = series
                mi.series_index = series_index
                if cover_url:
                    mi.cover_url = self.cover_url = cover_url
                    self.plugin.cache_identifier_to_cover_url(self.legie_id, self.cover_url)
                if tags:
                    mi.tags = tags
                mi.has_cover = bool(self.cover_url)
                mi.publisher = publisher
                mi.isbn = isbn
                mi.pubdate = self.prepare_date(int(year))
                mi.language = "ces"
                self.result_queue.put(mi)
        else:
            mi = Metadata(title, authors)
            mi.set_identifier('legie', self.legie_id)
            mi.source_relevance = self.relevance
            mi.rating = rating
            mi.comments = comments
            mi.series = series
            mi.series_index = series_index
            try:
                self.cover_url = self.parse_cover(root)
            except:
                self.log.exception('Error parsing cover for url: %r' % self.url)
            if tags:
                mi.tags = tags
            mi.has_cover = bool(self.cover_url)
            # Without edition data there is no publisher/isbn/year to set here;
            # the original referenced undefined names at this point (a NameError).
            mi.language = "ces"
            self.result_queue.put(mi)

    if self.legie_id:
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.legie_id, self.cover_url)
def parse_details(self, root):
    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for query: %r' % self.query)
        title = None
    if not title:
        self.log.error('Could not find title for %r' % self.query)

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for query: %r' % self.query)
        authors = []
    if not authors:
        self.log.error('Could not find authors for %r' % self.query)
        return

    mi = Metadata(title, authors)

    try:
        isbn = self.parse_isbn(root)
        if isbn:
            # match 10 or 13 digits at the start, followed by a space or end of string
            p = re.compile(r'^([0-9]{13}|[0-9]{10})(?= |\Z)')
            if isinstance(isbn, str):
                m = p.match(isbn)
                if m:
                    mi.isbn = m.group()
            else:
                m = p.match(isbn[0])
                if m:
                    mi.isbn = m.group()
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)

    try:
        lang = self.parse_language(root)
        if lang:
            mi.languages = lang
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)

    try:
        lccn = self.parse_lccn(root)
        if lccn:
            if isinstance(lccn, str):
                mi.set_identifier('lccn', lccn)
            else:
                for identifier in lccn:
                    mi.set_identifier('lccn', identifier)
    except:
        self.log.exception('Error parsing LCCN for url: %r' % self.url)

    try:
        ddc = self.parse_ddc(root)
        if ddc:
            if isinstance(ddc, str):
                mi.set_identifier('ddc', ddc)
            else:
                for identifier in ddc:
                    mi.set_identifier('ddc', identifier)
    except:
        self.log.exception('Error parsing DDC for url: %r' % self.url)

    try:
        lcc = self.parse_lcc(root)
        if lcc:
            if isinstance(lcc, str):
                mi.set_identifier('lcc', lcc)
            else:
                for identifier in lcc:
                    mi.set_identifier('lcc', identifier)
    except:
        self.log.exception('Error parsing LCC for url: %r' % self.url)

    mi.source_relevance = self.relevance
    self.result_queue.put(mi)
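# The anchored lookahead pattern above keeps only a leading 10- or 13-digit run
# when it is followed by a space or the end of the string, stripping trailing
# qualifiers. A quick check against a made-up catalogue value:

import re

p = re.compile(r'^([0-9]{13}|[0-9]{10})(?= |\Z)')
print(p.match('9780306406157 (hbk.)').group())    # '9780306406157'
print(p.match('ISBN 0306406152'))                 # None -- the digits must start the string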
def get_details(self):
    '''
    The get_details() function for scraping the website for all information
    '''
    self.log.info("    Worker.get_details:")
    self.log.info("        self:     ", self)
    self.log.info("        self.url: ", self.url)

    # Parse the html code from the website
    try:
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
    # Do some error handling if it fails to read data
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Bookmeta for saxo timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    # Do some error handling if the html code returned 404
    if "<title>404 - " in raw:
        self.log.error('URL malformed: %r' % self.url)
        return

    # Clean the html data a little
    try:
        root = parse(raw)
    except:
        self.log.error("Error cleaning HTML")
        return

    # Get the title of the book
    try:
        title_node = root.xpath('//span[@itemprop="name"]')
        self.title = title_node[0].text
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)

    # Get the author of the book
    try:
        author_node = root.xpath('//span[@class="expandAuthorName"]')
        author_strings = author_node[0].text.split(",")
        # print(author_strings)
        for name in author_strings:
            self.authors.append(name)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        self.authors = None

    # Get the series of the book
    try:
        series_node = root.xpath('//b[contains(text(), "Serie")]/a')
        if len(series_node) > 0:
            self.series = series_node[0].text.split(": ")[0].strip()
            self.series_index = series_node[0].text.split(": ")[-1].strip()
            # print("'%s'" % self.series)
            # print("'%s'" % self.series_index)
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)

    # Some books have ratings, let's use them.
    try:
        self.rating = 0.0
    except:
        self.log.exception('Error parsing rating for url: %r' % self.url)
        self.rating = 0.0

    # Get the ISBN number from the site
    try:
        isbn_node = root.xpath('//div[@class="eBookContainer"]/b/span[@itemprop="identifier"]')
        if len(isbn_node) > 0:
            self.isbn = isbn_node[0].text.replace("ISBN: ", "").strip()
    except:
        self.log.exception('Error parsing isbn for url: %r' % self.url)
        self.isbn = None

    # Get the comments/blurb for the book
    try:
        comment_node = root.xpath('//meta[@name="description"]/@content')
        self.comments = comment_node[0]
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
        self.comments = None

    # Parse the cover url for downloading the cover.
    try:
        cover_node = root.xpath('//div[@class="bookDetailCoverCover"]/img/@src')
        self.cover_url = "https://mofibo.com" + cover_node[0]
        self.log.info('    Parsed URL for cover: %r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.isbn, self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    self.has_cover = bool(self.cover_url)

    # Get the publisher name
    try:
        publisher_node = root.xpath('//div[@class="eBookContainer"]/b/span/a[@itemprop="brand"]')
        if len(publisher_node) > 0:
            self.publisher = publisher_node[0].text
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    # Get the language of the book. Only English and Danish are supported, though.
    try:
        language_node = root.xpath('//b[@class="expanderLanguage"]')
        language = language_node[0].text.strip().replace("Sprog:", "").replace(" ", "")
        language = self.lang_map.get(language, None)
        self.language = language
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)

    # Get the publication date
    try:
        pubdate_node = root.xpath('//div[@class="eBookContainer"]/b[contains(text(),"Udgivet:")]')
        if len(pubdate_node) > 0:
            date_str = pubdate_node[0].text.replace("Udgivet:", "").strip()
            format_str = '%Y-%m-%d'    # the expected date format
            self.pubdate = datetime.datetime.strptime(date_str, format_str)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    # Get the tags
    try:
        tags = []
        tags_node = root.xpath('//span[@itemprop="category"]')
        tags.append(tags_node[0].text.strip())
        self.tags = tags
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    # Set up the metadata
    meta_data = Metadata(self.title, self.authors)
    meta_data.set_identifier('isbn', self.isbn)
    meta_data.set_identifier('mofibo', self.url)

    # Set series
    if self.series:
        try:
            meta_data.series = self.series
            meta_data.series_index = self.series_index
        except:
            self.log.exception('Error loading series')
    # Set ISBN
    if self.isbn:
        try:
            meta_data.isbn = self.isbn
        except:
            self.log.exception('Error loading ISBN')
    # Set relevance
    if self.relevance:
        try:
            meta_data.source_relevance = self.relevance
        except:
            self.log.exception('Error loading relevance')
    # Set cover url
    if self.cover_url:
        try:
            meta_data.cover_url = self.cover_url
        except:
            self.log.exception('Error loading cover_url')
    # Set publisher
    if self.publisher:
        try:
            meta_data.publisher = self.publisher
        except:
            self.log.exception('Error loading publisher')
    # Set language
    if self.language:
        try:
            meta_data.language = self.language
        except:
            self.log.exception('Error loading language')
    # Set comments/blurb
    if self.comments:
        try:
            meta_data.comments = self.comments
        except:
            self.log.exception("Error loading comments")
    # Set publication date
    if self.pubdate:
        try:
            meta_data.pubdate = self.pubdate
        except:
            self.log.exception('Error loading pubdate')
    # Set tags
    if self.tags:
        try:
            meta_data.tags = self.tags
        except:
            self.log.exception('Error loading tags')

    # Hand the finished metadata off to calibre
    self.plugin.clean_downloaded_metadata(meta_data)
    self.result_queue.put(meta_data)
def parse_details(self, root):
    search_data = ''
    isbn = None

    try:
        self.log.info('Parse details:%s' % self.url)
        databazeknih_id = self.parse_databazeknih_id(self.url)
        self.log.info('Parsed DK identifier:%s' % databazeknih_id)
    except:
        self.log.exception('Error parsing databazeknih id for url: %r' % self.url)
        databazeknih_id = None

    try:
        title = self.parse_title(root)
        self.log.info('Parsed title:%s' % title)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    try:
        authors = self.parse_authors(root)
        self.log.info('Parsed authors:%s' % authors)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not databazeknih_id:
        self.log.error('Could not find title/authors/databazeknih id for %r' % self.url)
        self.log.error('DK id: %r Title: %r Authors: %r' % (databazeknih_id, title, authors))
        return

    mi = Metadata(title, authors)
    self.log.info('dbki:%s' % databazeknih_id)
    mi.set_identifier('databazeknih', databazeknih_id)
    self.databazeknih_id = databazeknih_id

    try:
        (mi.series, mi.series_index) = self.parse_series(root)
        self.log.info('Parsed series:%s' % mi.series)
        self.log.info('Parsed series index:%s' % mi.series_index)
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)
        series = None

    try:
        mi.comments = self.parse_comments(root)
        self.log.info('Parsed comments:%s' % mi.comments)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        self.cover_url = self.parse_cover(root)
        self.log.info('Parsed URL for cover:%r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.databazeknih_id, self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)

    try:
        mi.tags = self.parse_tags(root)
        self.log.info('Parsed tags:%s' % mi.tags)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        mi.publisher = self.parse_publisher(root)
        self.log.info('Parsed publisher:%s' % mi.publisher)
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    try:
        mi.pubdate = self.parse_pubdate(root)
        self.log.info('Parsed pubdate:%s' % mi.pubdate)
    except:
        self.log.exception('Error parsing pubdate for url: %r' % self.url)

    try:
        mi.rating = self.parse_rating(root)
        self.log.info('Parsed rating:%s' % mi.rating)
    except:
        self.log.exception('Error parsing rating for url: %r' % self.url)

    mi.source_relevance = self.relevance
    # if series:
    #     mi.series = series

    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)

    # Only cache the ISBN mapping when an ISBN was actually found
    if self.databazeknih_id and self.isbn:
        self.plugin.cache_isbn_to_identifier(self.isbn, self.databazeknih_id)
    # self.plugin.clean_downloaded_metadata(mi)
    # mi.isbn = check_isbn(mi.isbn)
    self.log.info(mi)
    self.result_queue.put(mi)
def _GoodreadsBook_to_Metadata(self, book):
    # type: (_GoodreadsBook) -> Metadata
    """
    :param book: _GoodreadsBook: book
    :return: Metadata: Metadata
    """
    mi = Metadata(book.title, book.authors)
    mi.source_relevance = 0
    mi.set_identifier('goodreads', book.id)

    if self.prefs['NEVER_REPLACE_ISBN'] and mi.get_identifiers().get('isbn'):
        mi.set_identifier('isbn', '')

    if book.asin and not self.prefs['NEVER_REPLACE_AMAZONID']:
        mi.set_identifier('amazon', book.asin)

    if book.isbn and not self.prefs['NEVER_REPLACE_ISBN']:
        try:
            if len(book.isbn) == 10:
                mi.isbn = check_isbn13(_ISBNConvert.convert(book.isbn))
            else:
                mi.isbn = check_isbn13(book.isbn)
        except:
            self.log.error("ISBN CONVERSION ERROR:", book.isbn)
            self.log.exception()

    if book.image_url:
        self.log.info('cache_identifier_to_cover_url:', book.asin, ':', book.image_url)
        self.cache_identifier_to_cover_url(book.id, book.image_url)

    if book.publisher:
        self.log.info('book.publisher is:', book.publisher)
        mi.publisher = book.publisher

    if book.pubdate:
        self.log.info('book.pubdate is:', book.pubdate.strftime('%Y-%m-%d'))
        mi.pubdate = book.pubdate

    if book.comments:
        self.log.info('book.editorial_review is:', book.comments)
        mi.comments = book.comments

    tags = self.prefs['ADD_THESE_TAGS'].split(',')
    tags.extend(book.tags)
    # tag_mappings = JSONConfig('plugins/GenreMappings')['genreMappings']
    # mi.tags = list(set(sorted(filter(lambda x: tag_mappings.get(x, x), tags))))

    if book.series:
        mi.series = book.series
        self.log.info(u'series:', book.series)
        if book.series_index:
            mi.series_index = book.series_index
            self.log.info(u'series_index:', "{0:.2f}".format(book.series_index))
        else:
            mi.series_index = 0

    if book.average_rating:
        mi.rating = book.average_rating

    self.clean_downloaded_metadata(mi)
    return mi
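_ISBNConvert.convert above is a plugin-local helper that is not shown here; presumably it performs the standard ISBN-10 to ISBN-13 conversion, a minimal sketch of which is:

def isbn10_to_isbn13(isbn10):
    # Prefix '978' and keep the first nine digits (the old check digit is dropped).
    core = '978' + isbn10[:9]
    # EAN-13 check digit: weight digits 1,3,1,3,... and round the sum up to a multiple of 10.
    total = sum((1 if i % 2 == 0 else 3) * int(d) for i, d in enumerate(core))
    return core + str((10 - total % 10) % 10)

print(isbn10_to_isbn13('0306406152'))  # -> 9780306406157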
def parse_details(self, raw, root):
    dang_id = parse_dang_id(root, self.log, self.url)
    if not dang_id and root.xpath('//form[@action="/errors/validateCaptcha"]'):
        raise CaptchaError(
            'Amazon returned a CAPTCHA page, probably because you '
            'downloaded too many books. Wait for some time and try again.')
    if self.testing:
        import tempfile, uuid
        with tempfile.NamedTemporaryFile(
                prefix=(dang_id or str(uuid.uuid4())) + '_',
                suffix='.html', delete=False) as f:
            f.write(raw)
        print('Downloaded html for', dang_id, 'saved in', f.name)

    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []
    if not title or not authors or not dang_id:
        self.log.error('Could not find title/authors/dang_id for %r' % self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r' % (dang_id, title, authors))
        return

    mi = Metadata(title, authors)
    mi.set_identifier('dang', dang_id)
    self.dang_id = dang_id

    try:
        mi.comments = self.parse_comments(root, raw)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)
    try:
        mi.tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
    try:
        self.cover_url = self.parse_cover(root, raw)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)

    pd_info = root.xpath(self.pd_info_xpath)
    pd_info_store = root.xpath(self.pd_info_store_xpath)
    pd_desc = root.xpath(self.pd_desc_xpath)
    if pd_info or pd_info_store:
        try:
            isbn = self.parse_isbn(pd_info, pd_info_store, pd_desc)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r' % self.url)
        pd_info = pd_info[0] if pd_info else pd_info_store[0]
        try:
            mi.publisher = self.parse_publisher(pd_info)
        except:
            self.log.exception('Error parsing publisher for url: %r' % self.url)
        try:
            mi.pubdate = self.parse_pubdate(pd_info)
        except:
            self.log.exception('Error parsing publish date for url: %r' % self.url)
    else:
        self.log.warning('Failed to find product description for url: %r' % self.url)

    mi.source_relevance = self.relevance

    if self.dang_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.dang_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.dang_id, self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
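The CAPTCHA check above keys off a single form action; a self-contained illustration of that detection (the page snippet is invented, shaped like Amazon's interstitial):

from lxml.html import fromstring

page = '<html><body><form action="/errors/validateCaptcha"></form></body></html>'
root = fromstring(page)
if root.xpath('//form[@action="/errors/validateCaptcha"]'):
    print('CAPTCHA page detected; back off before retrying.')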
def parse_details(self, root):
    try:
        kyobobook_id = self.parse_kyobobook_id(self.url)
    except:
        self.log.exception('Error parsing Kyobobook id for url: %r' % self.url)
        kyobobook_id = None
    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception('Error parsing title and series for url: %r' % self.url)
        title = series = series_index = None
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []
    if not title or not authors or not kyobobook_id:
        self.log.error('Could not find title/authors/kyobobook id for %r' % self.url)
        self.log.error('Kyobobook: %r Title: %r Authors: %r' % (kyobobook_id, title, authors))
        return

    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier('kyobobook', kyobobook_id)
    self.kyobobook_id = kyobobook_id

    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)
    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)
    try:
        tags = self.parse_tags(root)
        if tags:
            mi.tags = tags
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
    try:
        mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
    except:
        self.log.exception('Error parsing publisher and date for url: %r' % self.url)
    try:
        lang = self._parse_language(root)
        if lang:
            mi.language = lang
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)

    mi.source_relevance = self.relevance

    if self.kyobobook_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.kyobobook_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.kyobobook_id, self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, raw, root):
    try:
        asin = self.parse_asin(root)
    except:
        self.log.exception('Error parsing asin for url: %r' % self.url)
        asin = None
    if self.testing:
        import tempfile, uuid
        with tempfile.NamedTemporaryFile(
                prefix=(asin or str(uuid.uuid4())) + '_',
                suffix='.html', delete=False) as f:
            f.write(raw)
        print('Downloaded html for', asin, 'saved in', f.name)

    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []
    if not title or not authors or not asin:
        self.log.error('Could not find title/authors/asin for %r' % self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title, authors))
        return

    mi = Metadata(title, authors)
    idtype = 'amazon' if self.domain == 'com' else 'amazon_' + self.domain
    mi.set_identifier(idtype, asin)
    self.amazon_id = asin

    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)
    try:
        mi.tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
    try:
        self.cover_url = self.parse_cover(root, raw)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)

    non_hero = CSSSelect('div#bookDetails_container_div div#nonHeroSection')(root)
    if non_hero:
        # New style markup
        try:
            self.parse_new_details(root, mi, non_hero[0])
        except:
            self.log.exception('Failed to parse new-style book details section')
    else:
        pd = root.xpath(self.pd_xpath)
        if pd:
            pd = pd[0]
            try:
                isbn = self.parse_isbn(pd)
                if isbn:
                    self.isbn = mi.isbn = isbn
            except:
                self.log.exception('Error parsing ISBN for url: %r' % self.url)
            try:
                mi.publisher = self.parse_publisher(pd)
            except:
                self.log.exception('Error parsing publisher for url: %r' % self.url)
            try:
                mi.pubdate = self.parse_pubdate(pd)
            except:
                self.log.exception('Error parsing publish date for url: %r' % self.url)
            try:
                lang = self.parse_language(pd)
                if lang:
                    mi.language = lang
            except:
                self.log.exception('Error parsing language for url: %r' % self.url)
        else:
            self.log.warning('Failed to find product description for url: %r' % self.url)

    mi.source_relevance = self.relevance

    if self.amazon_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.amazon_id, self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
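A small demonstration of the identifier-type convention used above: the .com store gets the plain 'amazon' key, every other store a domain-suffixed one. The sample domains are illustrative:

def amazon_idtype(domain):
    # '.com' uses the plain 'amazon' identifier; other stores are suffixed.
    return 'amazon' if domain == 'com' else 'amazon_' + domain

for d in ('com', 'de', 'co.jp'):
    print(d, '->', amazon_idtype(d))  # amazon, amazon_de, amazon_co.jp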
def parse_details(self, root):
    try:
        antik_id = self.parse_antik_id(root)
        self.log.info('Parsed Antikvarium identifier: %s' % antik_id)
    except:
        self.log.exception('Error parsing Antikvarium id for url: %r' % self.url)
        antik_id = None
    try:
        title = self.parse_title(root)
        self.log.info('Parsed title: %s' % title)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None
    try:
        authors = self.parse_authors(root)
        self.log.info('Parsed authors: %s' % authors)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []
    if not title or not authors or not antik_id:
        self.log.error('Could not find title/authors/Antikvarium.hu id for %r' % self.url)
        self.log.error('Antikvarium.hu id: %r Title: %r Authors: %r' % (antik_id, title, authors))
        return

    mi = Metadata(title, authors)
    mi.set_identifier('antik_hu', antik_id)
    self.antik_id = antik_id

    try:
        isbn = self.parse_isbn(root)
        self.log.info('Parsed ISBN: %s' % isbn)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)
    try:
        series = self.parse_series(root)
        self.log.info('Parsed series: %s' % series)
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)
        series = None
    try:
        mi.series_index = self.parse_series_index(root)
        self.log.info('Parsed series index: %s' % mi.series_index)
    except:
        self.log.exception('Error parsing series index for url: %r' % self.url)
        mi.series_index = None
    try:
        mi.comments = self.parse_comments(root)
        self.log.info('Parsed comments: %s' % mi.comments)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        self.cover_url = self.parse_cover(root)
        self.log.info('Parsed URL for cover: %r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.antik_id, self.cover_url)
        mi.has_cover = bool(self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    try:
        mi.publisher = self.parse_publisher(root)
        self.log.info('Parsed publisher: %s' % mi.publisher)
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)
    try:
        mi.tags = self.parse_tags(root)
        self.log.info('Parsed tags: %s' % mi.tags)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
    try:
        mi.pubdate = self.parse_published_date(root)
        self.log.info('Parsed publication date: %s' % mi.pubdate)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)
    try:
        mi.languages = self.parse_languages(root)
        self.log.info('Parsed languages: %r' % mi.languages)
    except:
        self.log.exception('Error parsing languages for url: %r' % self.url)

    mi.source_relevance = self.relevance
    if series:
        mi.series = series

    if self.antik_id and self.isbn:
        self.plugin.cache_isbn_to_identifier(self.isbn, self.antik_id)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    isfdb_id = None
    title = None
    authors = []
    isbn = None
    publisher = None
    pubdate = None

    try:
        isfdb_id = re.search(r'(\d+)$', self.url).group(1)
    except:
        self.log.exception('Error parsing ISFDB ID for url: %r' % self.url)

    detail_nodes = root.xpath('//div[@id="content"]//td[@class="pubheader"]/ul/li')
    if not detail_nodes:
        # No table on records without an image
        detail_nodes = root.xpath('//div[@id="content"]/div/ul/li')

    for detail_node in detail_nodes:
        section = detail_node[0].text_content().strip().rstrip(':')
        try:
            if section == 'Publication':
                title = detail_node[0].tail.strip()
                if not title:
                    # Assume an extra span with a transliterated title tooltip
                    title = detail_node[1].text_content().strip()
            elif section == 'Authors' or section == 'Editors':
                for a in detail_node.xpath('.//a'):
                    author = a.text_content().strip()
                    if section.startswith('Editors'):
                        authors.append(author + ' (Editor)')
                    else:
                        authors.append(author)
            elif section == 'ISBN':
                isbn = detail_node[0].tail.strip('[] \n')
            elif section == 'Publisher':
                publisher = detail_node.xpath('a')[0].text_content().strip()
            elif section == 'Date':
                pubdate = self._convert_date_text(detail_node[0].tail.strip())
        except:
            self.log.exception('Error parsing section %r for url: %r' % (section, self.url))

    if not title or not authors or not isfdb_id:
        self.log.error('Could not find title/authors/ISFDB ID for %r' % self.url)
        self.log.error('ISFDB: %r Title: %r Authors: %r' % (isfdb_id, title, authors))
        return

    mi = Metadata(title, authors)
    mi.set_identifier('isfdb', isfdb_id)
    self.isfdb_id = isfdb_id

    if isbn:
        self.isbn = mi.isbn = isbn
    if publisher:
        mi.publisher = publisher
    if pubdate:
        mi.pubdate = pubdate

    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)
    mi.cover_url = self.cover_url  # This is purely so we can run a test for it!!!

    mi.source_relevance = self.relevance

    if self.isfdb_id and self.isbn:
        self.plugin.cache_isbn_to_identifier(self.isbn, self.isfdb_id)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
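_convert_date_text is defined elsewhere in the plugin and not shown here; a hypothetical version, assuming ISFDB-style 'YYYY-MM-00' strings in which a zero month or day means unknown:

import datetime
from calibre.utils.date import utc_tz

def _convert_date_text(date_text):
    # Hypothetical: clamp unknown (zero) month/day components to 1.
    year, month, day = (int(p) for p in date_text.split('-'))
    return datetime.datetime(year, max(month, 1), max(day, 1), tzinfo=utc_tz)

print(_convert_date_text('1984-05-00'))  # -> 1984-05-01 00:00:00+00:00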
def parse_details(self, root):
    try:
        isbn = self.extract_isbn(self.url)
    except:
        self.log.exception('No ISBN in URL: %r' % self.url)
        isbn = None
    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception('Error parsing title and series for url: %r' % self.url)
        title = series = series_index = None
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []
    if not title or not authors or not isbn:
        self.log.error('Could not find title/authors/Aladin id for %r' % self.url)
        self.log.error('Aladin: %r Title: %r Authors: %r' % (isbn, title, authors))
        return

    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.isbn = isbn
    self.isbn = isbn

    # Prefer the ISBN-13 from the page, if present
    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)
    mi.cover_url = self.cover_url  # This is purely so we can run a test for it!!!
    if mi.has_cover:
        self.log.info('Cover URL: ' + mi.cover_url)
    try:
        mi.publisher = self.parse_publisher(root)
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)
    try:
        mi.pubdate = self.parse_published_date(root)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    mi.language = 'ko'
    mi.source_relevance = self.relevance

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
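extract_isbn is likewise defined elsewhere; a guess at its shape, assuming the Aladin URLs carry a 10- or 13-digit ISBN somewhere in the path or query:

import re

def extract_isbn(url):
    # Hypothetical: return the first ISBN-looking digit run in the URL.
    m = re.search(r'(\d{13}|\d{10})', url)
    return m.group(1) if m else None

print(extract_isbn('https://example.com/book/9780306406157'))  # invented URL shape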
def get_details(self):
    '''
    The get_details() function for scraping the website for all information
    '''
    self.log.info(" Worker.get_details:")
    self.log.info(" self: ", self)
    self.log.info(" self.url: ", self.url)

    # Fetch the html from the website
    try:
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
    # Do some error handling if it fails to read data
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Bookmeta for saxo timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    # Bail out if the returned page is a 404 page
    if b"<title>404 - " in raw:
        self.log.error('URL malformed: %r' % self.url)
        return

    # Parse the html into a tree
    try:
        root = parse(raw)
    except:
        self.log.error("Error cleaning HTML")
        return

    # Get the JSON data embedded in the HTML (some fields are easier to read from JSON).
    # XPath positions are 1-based, so [2] selects the second ld+json block.
    try:
        json_raw = root.xpath('(//script[@type="application/ld+json"])[2]')
        json_root = json.loads(json_raw[0].text.strip())
        # print(json.dumps(json_root, indent=4, sort_keys=True))
    except:
        self.log.error("Error loading JSON data")
        return

    # Get the title of the book
    try:
        self.title = json_root['name']
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)

    # Get the authors of the book
    try:
        author_node = root.xpath('//h2[@class="product-page-heading__autor"]//a')
        for name in author_node:
            self.authors.append(name.text.strip())
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        self.authors = None

    # Some books have ratings, so let's use them
    try:
        self.rating = float(json_root['aggregateRating']['ratingValue'])
    except:
        self.log.exception('Error parsing rating for url: %r' % self.url)
        self.rating = 0.0

    # Get the ISBN number from the site
    try:
        self.isbn = json_root['isbn']
    except:
        self.log.exception('Error parsing isbn for url: %r' % self.url)
        self.isbn = None

    # Get the comments/blurb for the book
    try:
        self.comments = parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
        self.comments = None

    # Parse the cover url for downloading the cover
    try:
        self.cover_url = json_root['image']
        self.log.info(' Parsed URL for cover: %r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.isbn, self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    self.has_cover = bool(self.cover_url)

    # Get the publisher name
    try:
        self.publisher = json_root['publisher']['name']
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    # Get the language of the book. Only English and Danish are supported, though.
    try:
        language = json_root['inLanguage']['name']
        language = self.lang_map.get(language, None)
        self.language = language
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)

    # Get the published date
    try:
        # pubdate_node = root.xpath('(//dl[@class="product-info-list"]//dd)[2]')  # Format dd-mm-yyyy
        pubdate_node = root.xpath('//div[@class="product-page-block__container"]//dd')  # Format dd-mm-yyyy
        date_str = pubdate_node[0].text.strip()
        format_str = '%d-%m-%Y'  # The expected format
        self.pubdate = datetime.datetime.strptime(date_str, format_str)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    # Assemble the metadata
    meta_data = Metadata(self.title, self.authors)
    meta_data.set_identifier('isbn', self.isbn)
    meta_data.set_identifier('saxo', self.url)

    # Set rating
    if self.rating:
        try:
            meta_data.rating = self.rating
        except:
            self.log.exception('Error loading rating')
    # Set ISBN
    if self.isbn:
        try:
            meta_data.isbn = self.isbn
        except:
            self.log.exception('Error loading ISBN')
    # Set relevance
    if self.relevance:
        try:
            meta_data.source_relevance = self.relevance
        except:
            self.log.exception('Error loading relevance')
    # Set cover url
    if self.cover_url:
        try:
            meta_data.cover_url = self.cover_url
        except:
            self.log.exception('Error loading cover_url')
    # Set publisher
    if self.publisher:
        try:
            meta_data.publisher = self.publisher
        except:
            self.log.exception('Error loading publisher')
    # Set language
    if self.language:
        try:
            meta_data.language = self.language
        except:
            self.log.exception('Error loading language')
    # Set comments/blurb
    if self.comments:
        try:
            meta_data.comments = self.comments
        except:
            self.log.exception('Error loading comments')
    # Set pubdate
    if self.pubdate:
        try:
            meta_data.pubdate = self.pubdate
        except:
            self.log.exception('Error loading pubdate')

    # Hand the finished metadata to the plugin
    self.plugin.clean_downloaded_metadata(meta_data)
    self.result_queue.put(meta_data)
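A standalone sketch of the JSON-LD pattern the Saxo worker depends on; the HTML snippet here is invented (real pages carry several ld+json blocks, hence the worker's 1-based [2] index):

import json
from lxml.html import fromstring

html = ('<html><head><script type="application/ld+json">'
        '{"name": "Example Book", "isbn": "9780306406157",'
        ' "publisher": {"name": "Example Forlag"}}'
        '</script></head></html>')
root = fromstring(html)
node = root.xpath('//script[@type="application/ld+json"]')[0]
data = json.loads(node.text.strip())
print(data['name'], data['isbn'], data['publisher']['name'])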