def result2meta(self, result, prev_identifiers=None):
        '''
        Converts the result dict into Calibre metadata.
        Note: Source download plugins do not have access to custom columns.
        '''
        prev_identifiers = prev_identifiers or {}  # avoid a mutable default argument
        title = get_title(result)
        authors = get_author_list(result)
        mi = Metadata(title=title, authors=authors)

        mi.identifiers = update_identifiers(prev_identifiers, result)

        put_publisher(mi, result)
        put_language(mi, result)
        self.put_pubdate(mi, result)
        put_tags(mi, result)
        put_journal(mi, result)
        self.put_series_index(mi, result)

        comments = ""
        if prefs['abstract_to_comment'] and 'abstract' in result:
            comments = "\n\n".join([comments, result['abstract']])

        if prefs['query_to_comment']:
            extra_meta = self.mkComments(result)
            extra_plus = map(lambda x: "crossref:%s" % x, extra_meta)
            extra = "\n".join(extra_plus)
            comments = "\n\n".join([comments, extra])
        mi.comments = comments

        if 'score' in result:
            mi.source_relevance = 100 - result['score']
        else:
            mi.source_relevance = 100
        # self.log.info("set comment to %s"%mi.comments)
        return mi
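A minimal sketch of the identifier-merge step used above. update_identifiers itself is not part of this excerpt; the body below is an assumption based on the CrossRef REST API, whose result objects carry a 'DOI' key:

def update_identifiers(prev_identifiers, result):
    # Keep previously-known identifiers and add/overwrite the DOI
    # reported by the current result, if any.
    ids = dict(prev_identifiers)
    if 'DOI' in result:
        ids['doi'] = result['DOI']
    return ids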
Example n. 2
    def _get_results(self):
        """ Download Information from Google Scholar """
        querier = ScholarQuerier(author=self.query_authors[0],
                                 count=self.count)
        querier.query(self.query_title, bibtex=True)
        articles = querier.articles
        if self.count > 0:
            articles = articles[:self.count]
        for num, art in enumerate(articles):
            bibtex_string = art.as_bib()

            bib = Bibparser(bibtex_string)
            bib.parse()
            slug = next(iter(bib.records))  # dict keys are not indexable on Python 3
            bib_dict = bib.records[slug]

            title = bib_dict.get('title')

            authors = []

            for author in bib_dict.get('author', []):
                # Ignore non-existent given names
                given_name = '%s ' % author.get(
                    'given') if 'given' in author else ''
                # Add full stops after abbreviated name parts
                given_name = re.sub(r'(^| +)([A-Z])( +|$)', r'\1\2.\3',
                                    given_name)

                authors.append('%s%s' % (given_name, author['family']))

            mi = Metadata(title, authors)

            mi.set_identifier('googlescholar', slug)
            mi.source_relevance = 100 - num

            if 'publisher' in bib_dict:
                mi.publisher = bib_dict['publisher']

            if 'issued' in bib_dict:
                if 'literal' in bib_dict['issued']:
                    year = int(bib_dict['issued']['literal'])

                    from calibre.utils.date import utc_tz
                    # We only have the year, so let's use Jan 1st
                    mi.pubdate = datetime.datetime(year, 1, 1, tzinfo=utc_tz)

            self.plugin.clean_downloaded_metadata(mi)
            self._log_metadata(mi)
            self.result_queue.put(mi, True)
            self.log.info(self.result_queue.qsize())
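A quick sanity check of the abbreviation regex used above (hypothetical inputs):

import re
for name in ('J ', 'J K ', 'Jane '):
    print(repr(re.sub(r'(^| +)([A-Z])( +|$)', r'\1\2.\3', name)))
# 'J. '   -> a lone initial gets a full stop
# 'J. K ' -> only the first initial matches, because the separating
#            space is consumed by the first match
# 'Jane ' -> full given names are left alone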
Example n. 3
    def parse_details(self, root):
        """
        """
        try:
            self.log.info("        Parse details: %r" % self.url)
            self.databazeknih_id = self.parse_databazeknih_id(self.url)
            self.log.info("        Parsed DK identifier: %s" %
                          self.databazeknih_id)
        except:
            self.log.exception("Error parsing DK identifier for url: %r" %
                               self.url)
            self.databazeknih_id = None

        # Parse title
        self.parse_title(root)
        # Parse authors
        self.parse_authors(root)
        if not self.title or not self.authors or not self.databazeknih_id:
            self.log.error("Could not find title/authors/DK id for %r" %
                           self.url)
            self.log.error("DK id: %r Title: %r Authors: %r" %
                           (self.databazeknih_id, self.title, self.authors))
            return

        mi = Metadata(self.title, self.authors)
        mi.set_identifier("databazeknih", self.databazeknih_id)

        # Parse series
        self.parse_series(root, mi)
        # Parse comments
        self.parse_comments(root, mi)
        # Parse publisher
        self.parse_publisher(root, mi)
        # Parse pubdate
        self.parse_pubdate(root, mi)
        # Parse tags
        self.parse_tags(root, mi)
        # Parse rating
        self.parse_rating(root, mi)
        # Parse book ISBN
        self.parse_isbn(self.more_info, mi)
        # Parse language
        self.parse_language(self.more_info, mi)
        # Parse book cover
        self.parse_cover(root, mi)

        mi.source_relevance = self.relevance

        self.log.info(mi)
        self.result_queue.put(mi)
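These workers all share the same core pattern: build a Metadata object, attach the source identifier, set source_relevance, and put the result on the shared queue. A minimal skeleton of that pattern (names hypothetical, not any plugin's actual API):

from calibre.ebooks.metadata.book.base import Metadata

def emit_result(result_queue, title, authors, id_type, id_value, relevance):
    # Assemble and enqueue one search hit.
    mi = Metadata(title, authors)
    mi.set_identifier(id_type, id_value)
    mi.source_relevance = relevance  # the order this source returned the hit in
    result_queue.put(mi)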
Example n. 5
    def extract_vol_details(self, vol_url):
        # Here we extract and format the information from the chosen volume:
        # - The first and last name to populate author and author sort      : vol_auteur_prenom and vol_auteur_nom
        # - The title of the volume                                         : vol_title
        # - The series name the volume is part of                           : vol_serie
        # - The sequence number in the series                               : vol_serie_seq                         # missing
        # - The editor (publisher) of this volume                           : vol_editor
        # - The publisher's collection of this volume                       : vol_coll
        # - The collection serial code of this volume                       : vol_coll_srl
        # - The "dépôt légal" date (the publication date is mostly unknown) : vol_dp_lgl                            # date format to be computed
        # - The ISBN associated with the volume                             : vol_isbn
        # - The volume tags                                                 : vol_genre
        # - The URL pointing to the volume cover image                      : vol_cover_index
        # - The comments, which include various info about the book         : vol_comment_soup
        #   . reference, a URL pointing to noosfere
        #   . couverture, a URL pointing to noosfere; the cover may be really small, but it matches the volume
        #   . first edition information
        #   . series (cycle) name and number
        #   . this volume's editor info
        #   . Résumé (quatrième de couverture, the back-cover blurb)
        #   . Critiques
        #   . Sommaire detailing which novels are in the volume when it is an anthology
        #   . Critiques about the series and/or about another volume of the book
        #

        debug = self.dbg_lvl & 2
        self.log.info(self.who, "\nIn extract_vol_details(soup)")
        if debug:
            self.log.info(self.who, "vol_url       : ", vol_url)

        if debug:
            self.log.info(
                self.who,
                "calling ret_soup(log, dbg_lvl, br, url, rkt=None, who='[__init__]')"
            )
            self.log.info(self.who, "vol_url : ", vol_url, "who : ", self.who)
        rsp = ret_soup(self.log, self.dbg_lvl, self.br, vol_url, who=self.who)
        soup = rsp[0]
        url_vrai = rsp[1].replace("&Tri=3", "")
        #        if debug: self.log.info(self.who,soup.prettify())              # useful but too big...

        self.nsfr_id = self.nsfr_id + "$vl$" + url_vrai.replace(
            '?', '&').replace('=', '&').split('&')[2]
        # An earlier attempt, self.nsfr_id = (self.nfsr_id).strip("$"), raised
        # "'Worker' object has no attribute 'nfsr_id'" simply because of the
        # transposed letters in the attribute name (nfsr vs nsfr).
        self.nsfr_id = self.nsfr_id.strip('$')

        if debug:
            self.log.info(self.who, "self.nsfr_id, type() : ", self.nsfr_id,
                          type(self.nsfr_id))

        tmp_lst = []
        vol_info = {}
        vol_title = ""
        vol_auteur = ""
        vol_auteur_prenom = ""
        vol_auteur_nom = ""
        vol_serie = ""
        vol_serie_seq = ""
        vol_editor = ""
        vol_coll = ""
        vol_coll_srl = ""
        vol_dp_lgl = ""
        vol_isbn = ""
        vol_genre = ""
        vol_cover_index = ""
        comment_generic = None
        comment_resume = None
        comment_Critiques = None
        comment_Sommaire = None
        comment_AutresCritique = None
        comment_cover = None
        comment_decoupage_annexe = None

        # add volume address as a reference in the comment
        vol_comment_soup = BS(
            '<div><p>Référence: <a href="' + url_vrai + '">' + url_vrai +
            '</a></p></div>', "lxml")
        if debug: self.log.info(self.who, "vol reference processed")

        if soup.select("span[class='TitreNiourf']"):
            vol_title = soup.select(
                "span[class='TitreNiourf']")[0].text.strip()
        if debug: self.log.info(self.who, "vol_title processed : ", vol_title)

        if soup.select("span[class='AuteurNiourf']"):
            vol_auteur = soup.select(
                "span[class='AuteurNiourf']")[0].text.replace("\n",
                                                              "").strip()
        if debug:
            self.log.info(self.who, "vol_auteur processed : ", vol_auteur)
        for i in range(len(vol_auteur.split())):
            if not vol_auteur.split()[i].isupper():
                vol_auteur_prenom += " " + vol_auteur.split()[i]
            else:
                vol_auteur_nom += " " + vol_auteur.split()[i].title()
        vol_auteur = vol_auteur.title()
        vol_auteur_prenom = vol_auteur_prenom.strip()
        if debug:
            self.log.info(self.who, "vol_auteur_prenom processed : ",
                          vol_auteur_prenom)
        vol_auteur_nom = vol_auteur_nom.strip()
        if debug:
            self.log.info(self.who, "vol_auteur_nom processed : ",
                          vol_auteur_nom)

        if soup.select("a[href*='serie.asp']"):
            if soup.select("a[href*='serie.asp']")[0].find_parent(
                    "span", {"class": "ficheNiourf"}):
                vol_serie = soup.select("a[href*='serie.asp']")[0].text
                tmp_vss = [
                    x for x in soup.select("a[href*='serie.asp']")
                    [0].parent.stripped_strings
                ]
                for i in range(len(tmp_vss)):
                    if "vol." in tmp_vss[i]:
                        if not vol_serie_seq:
                            vol_serie_seq = tmp_vss[i].replace("vol.",
                                                               "").strip()
                    if "découpage" in tmp_vss[i]:
                        dec_anx_url = "https://www.noosfere.org/livres/" + soup.select(
                            "a[href*='serie.asp']")[0]['href']
                        comment_pre_decoupage_annexe = BS(
                            '<div><p> </p><p style="font-weight: 600; font-size: 18px"> Découpage annexe</p><hr style="color:CCC;"/></div>',
                            "lxml")
                        comment_decoupage_annexe = self.get_decoupage_annexe(
                            dec_anx_url)
                if debug:
                    self.log.info(self.who,
                                  "vol_serie, vol_serie_seq processed : ",
                                  vol_serie, ",", vol_serie_seq)

        comment_generic = soup.select("span[class='ficheNiourf']")[0]
        new_div = soup.new_tag('div')
        comment_generic = comment_generic.wrap(new_div)
        if debug: self.log.info(self.who, "comment_generic processed")

        if soup.select("a[href*='editeur.asp']"):
            vol_editor = soup.select("a[href*='editeur.asp']")[0].text
        if debug:
            self.log.info(self.who, "vol_editor processed : ", vol_editor)

        if soup.select("a[href*='collection.asp']"):
            vol_coll = soup.select("a[href*='collection.asp']")[0].text
        if debug: self.log.info(self.who, "vol_coll : ", vol_coll)

        for i in comment_generic.stripped_strings:
            tmp_lst.append(str(i))
        vol_coll_srl = tmp_lst[len(tmp_lst) - 1]
        if "n°" in vol_coll_srl:
            for k in ["n°", "(", ")"]:
                if k in vol_coll_srl:
                    vol_coll_srl = vol_coll_srl.replace(k, "")
            vol_coll_srl = vol_coll_srl.strip()
            vol_coll_srl = vol_coll_srl.split("/")[0]
            if vol_coll_srl[0].isnumeric():
                vol_coll_srl = ("0" * 5 + vol_coll_srl)[-6:]
        else:
            vol_coll_srl = ""
        if debug:
            self.log.info(self.who, "vol_coll_srl processed : ", vol_coll_srl)

        # the publication date is largely absent from noosfere, but we have the "dépôt légal" date, so I use that instead
        # note that I 'calculate' the missing day of the month and sometimes even the missing month
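        # Worked example (hypothetical input): "2ème trimestre 1984" gives day
        # (2 - 1) * 91 + 47 = 138, and strptime("138 1984", "%j %Y") is
        # 1984-05-17; a bare year "1984" maps to day 175, i.e. mid-year.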
        ms = ("janvier", "février", "mars", "avril", "mai", "juin", "juillet",
              "août", "septembre", "octobre", "novembre", "décembre")
        for elemnt in soup.select_one(
                "span[class='sousFicheNiourf']").stripped_strings:
            if debug: self.log.info(self.who, "elemnt : ", elemnt)
            if not vol_dp_lgl:
                elemn = (elemnt.replace("Dépôt légal :",
                                        "").split(','))[0].strip()
                if elemn:
                    if elemn.isnumeric() and len(elemn) == 4:
                        vol_dp_lgl = datetime.datetime.strptime(
                            "175 " + elemn, "%j %Y")
                    elif "semestre" in elemn:
                        ele = elemn.split()
                        vol_dp_lgl = datetime.datetime.strptime(
                            ("000" + str((int(ele[0][0]) - 1) * 175 + 97))[-3:]
                            + " " + ele[2], "%j %Y")
                    elif "trimestre" in elemn:
                        ele = elemn.split()
                        vol_dp_lgl = datetime.datetime.strptime(
                            ("000" + str((int(ele[0][0]) - 1) * 91 + 47))[-3:]
                            + " " + ele[2], "%j %Y")
                    else:
                        for i in range(len(ms)):
                            if ms[i] in elemn:
                                ele = elemn.split()
                                vol_dp_lgl = datetime.datetime.strptime(
                                    ("000" + str(10 + 31 * i))[-3:] + " " +
                                    ele[1], "%j %Y")
                                break
                    if debug:
                        self.log.info(self.who, "vol_dp_lgl : ", vol_dp_lgl)

            if "ISBN" in elemnt:
                vol_isbn = elemnt.lower().replace(" ", "").replace('isbn:', '')
                if "néant" in vol_isbn: vol_isbn = ""
                if debug:
                    self.log.info(self.who, "vol_isbn processed : ", vol_isbn)

            if "Genre" in elemnt:
                vol_genre = elemnt.lstrip("Genre : ")
                if debug:
                    self.log.info(self.who, "vol_genre processed : ",
                                  vol_genre)

        if soup.select("img[name='couverture']"):
            for elemnt in repr(
                    soup.select("img[name='couverture']")[0]).split('"'):
                if "http" in elemnt:
                    if not vol_cover_index:
                        vol_cover_index = elemnt
                        if debug:
                            self.log.info(self.who,
                                          "vol_cover_index processed : ",
                                          vol_cover_index)

        # add cover image address as a reference in the comment
        if vol_cover_index:
            comment_cover = BS(
                '<div><p>Couverture: <a href="' + vol_cover_index + '">' +
                vol_cover_index + '</a></p></div>', "lxml")

    # select the fields I want... More exist, such as film adaptations or reading recommendations,
    # but those are not quite consistent across all the books (noosfere is a database maintained by many people)
    # and besides I have enough info as it is AND I do NOT want to cut into noosfere's business

        tmp_comm_lst = soup.select("span[class='AuteurNiourf']")
        if debug: self.log.info(self.who, tmp_comm_lst)  # useful but too long
        for i in range(len(tmp_comm_lst)):
            if "Quatrième de couverture" in str(tmp_comm_lst[i]):
                comment_resume = tmp_comm_lst[i].find_parents(
                    "div", {'class': 'sousbloc'})[0]
                if debug: self.log.info(self.who, "comment_resume processed")

            if "Critiques" in str(tmp_comm_lst[i]):
                if not "autres" in str(tmp_comm_lst[i]):
                    comment_Critiques = tmp_comm_lst[i].find_parents(
                        "div", {'class': 'sousbloc'})[0]
                    if debug:
                        self.log.info(self.who, "comment_Critiques processed")

            if "Sommaire" in str(tmp_comm_lst[i]):
                comment_Sommaire = tmp_comm_lst[i].find_parents(
                    "div", {'class': 'sousbloc'})[0]
                if debug: self.log.info(self.who, "comment_Sommaire processed")

            if "Critiques des autres" in str(tmp_comm_lst[i]):
                comment_AutresCritique = tmp_comm_lst[i].find_parents(
                    "div", {'class': 'sousbloc'})[0]

                if comment_AutresCritique.select('a[href*="serie.asp"]') and (
                        "Critique de la série" in comment_AutresCritique.
                        select('a[href*="serie.asp"]')[0].text):
                    critic_url = "https://www.noosfere.org/livres/" + comment_AutresCritique.select(
                        'a[href*="serie.asp"]')[0]['href']
                    try:
                        more_comment_AutresCritique = self.get_Critique_de_la_serie(
                            critic_url)
                        comment_AutresCritique.append(
                            more_comment_AutresCritique)
                    except:
                        self.log.exception(
                            "get_Critique_de_la_serie failed for url: ",
                            critic_url)

                if debug:
                    self.log.info(self.who, "comment_AutresCritique processed")

    # group in a big bundle all the fields I think I want... (It is difficult not to include more... :-))

        if comment_cover:
            vol_comment_soup.append(comment_cover)
        if comment_generic:
            vol_comment_soup.append(comment_generic)
        if comment_resume:
            vol_comment_soup.append(comment_resume)
        if comment_Critiques:
            vol_comment_soup.append(comment_Critiques)
        if comment_Sommaire:
            vol_comment_soup.append(comment_Sommaire)
        if comment_AutresCritique:
            vol_comment_soup.append(comment_AutresCritique)
        if comment_decoupage_annexe:
            vol_comment_soup.append(
                comment_pre_decoupage_annexe)  # this is the title
            vol_comment_soup.append(comment_decoupage_annexe)

    #
    # Make a minimum of "repairs" to vol_comment_soup so that it displays correctly (the way I like it) in the comments and in my catalogs
    # - I hate justified text when it makes margins "float" around the correct position (that is, when spaces are used instead of absolute positioning)
    # - I like to have working URLs when they exist
    # - I like to find the next and/or previous books in a series (the simulated arrows are links :-) )

        for elemnt in vol_comment_soup.select('[align="justify"]'):
            del elemnt['align']

    # remove all double or triple 'br' to improve presentation.
    # Note: tmp1 and tmp2 must start out different from any possible first element. (yes, I am lrp and I am unique :-) )
    #
    # yeah, so what: if I modify comment_generic AFTER it has been integrated into vol_comment_soup,
    # there is only one version in memory... so vol_comment_soup is modified as well...
    #

        tmp1 = tmp2 = "lrp_the_unique"
        for elemnt in vol_comment_soup.findAll():
            tmp1, tmp2 = tmp2, elemnt
            if tmp1 == tmp2:
                elemnt.extract()

        for elemnt in vol_comment_soup.select('.AuteurNiourf'):
            # create a fresh 'br' for each element: re-inserting the same tag
            # instance would only move it from its previous position
            elemnt.insert(0, soup.new_tag('br'))
            elemnt["style"] = "font-weight: 600; font-size: 18px"

        if debug:
            for elemnt in vol_comment_soup.select("a[href*='.asp']"):
                if 'http' not in elemnt.get('href'):
                    self.log.info(self.who, "url incomplet avant correction: ",
                                  elemnt)

        for elemnt in vol_comment_soup.select("a[href*='/livres/auteur.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "/livres/auteur.asp",
                    "https://www.noosfere.org/livres/auteur.asp")
        for elemnt in vol_comment_soup.select("a[href*='/livres/niourf.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "/livres/niourf.asp",
                    "https://www.noosfere.org/livres/niourf.asp")
        for elemnt in vol_comment_soup.select("a[href*='/heberg/']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "/heberg/", "https://www.noosfere.org/heberg/")

        for elemnt in vol_comment_soup.select(
                "a[href*='./EditionsLivre.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "./EditionsLivre.asp",
                    "https://www.noosfere.org/livres/EditionsLivre.asp")
        for elemnt in vol_comment_soup.select("a[href*='./niourf.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "./niourf.asp",
                    "https://www.noosfere.org/livres/niourf.asp")
        for elemnt in vol_comment_soup.select("a[href*='heberg']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "../../heberg", "https://www.noosfere.org/heberg")
        for elemnt in vol_comment_soup.select("a[href*='../bd']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "../bd", "https://www.noosfere.org/bd")

        for elemnt in vol_comment_soup.select("a[href*='auteur.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "auteur.asp", "https://www.noosfere.org/livres/auteur.asp")
        for elemnt in vol_comment_soup.select("a[href*='collection.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "collection.asp",
                    "https://www.noosfere.org/livres/collection.asp")
        for elemnt in vol_comment_soup.select("a[href*='critsign.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "critsign.asp",
                    "https://www.noosfere.org/livres/critsign.asp")
        for elemnt in vol_comment_soup.select("a[href*='EditionsLivre.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "EditionsLivre.asp",
                    "https://www.noosfere.org/livres/EditionsLivre.asp")
        for elemnt in vol_comment_soup.select("a[href*='editeur.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "editeur.asp",
                    "https://www.noosfere.org/livres/editeur.asp")
        for elemnt in vol_comment_soup.select("a[href*='editionslivre.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "editionslivre.asp",
                    "https://www.noosfere.org/livres/editionslivre.asp")
        for elemnt in vol_comment_soup.select("a[href*='niourf.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "niourf.asp", "https://www.noosfere.org/livres/niourf.asp")
        for elemnt in vol_comment_soup.select("a[href*='serie.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "serie.asp", "https://www.noosfere.org/livres/serie.asp")

        if debug:
            for elemnt in vol_comment_soup.select("a[href*='.asp']"):
                if 'http' not in elemnt.get('href'):
                    self.log.info(self.who, "url incomplet apres correction: ",
                                  elemnt)

        fg, fd = "<<==", "==>>"  #chr(0x21D0),chr(0x21D2)   #chr(0x27f8),chr(0x27f9)
        for elemnt in vol_comment_soup.select("img[src*='arrow_left']"):
            elemnt.replace_with(fg)
        for elemnt in vol_comment_soup.select("img[src*='arrow_right']"):
            elemnt.replace_with(fd)

        # depending on the tick box, make a "fat" publisher using separators that are very unlikely to occur naturally (§ and €)
        # vol_coll_srl is only appended if vol_coll exists
        # the idea is to allow search-and-replace in the Edit metadata in bulk window.
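        # e.g. (hypothetical values) vol_editor "Denoël", vol_coll "Présence du futur"
        # and vol_coll_srl "000123" yield "Denoël§Présence du futur€000123"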

        if self.extended_publisher:
            if debug:
                self.log.info(
                    self.who,
                    """flag : "Ajoute collection et son numéro d'ordre au champ èditeur" set"""
                )
            if vol_coll:
                if debug: self.log.info(self.who, 'add collection')
                vol_editor = vol_editor + ('§') + vol_coll
                if vol_coll_srl:
                    if debug: self.log.info(self.who, 'add collection number')
                    vol_editor = vol_editor + ('€') + vol_coll_srl

        if vol_serie:
            if vol_serie_seq.isnumeric(): vol_serie_seq = float(vol_serie_seq)
            else: vol_serie_seq = 1.0

        # UTF-8 characters may be serialized in different ways; only xmlcharrefreplace produces XML-compatible strings
        # any other non-ASCII character with a different UTF-8 byte representation makes calibre fail with the message:
        # ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters
        # Side note:
        # I do not get a really clean HTML structure (I once got html 3 times, a div as a sibling of html...), but calibre does not seem to care (nice :-) )
        #
        # It took me ages to find, almost by accident, that encode('ascii', 'xmlcharrefreplace') did the trick...
        # (well, not quite by accident: I tried everything that could improve XML compatibility, but I misread
        # the error and thought it was an incompatibility with the XML structure)
        #
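        # e.g. "Dépôt légal".encode('ascii', 'xmlcharrefreplace') == b'D&#233;p&#244;t l&#233;gal'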
        vol_comment_soup = vol_comment_soup.encode('ascii',
                                                   'xmlcharrefreplace')

        self.log.info(self.who, "+++" * 25)
        self.log.info(self.who,
                      "nsfr_id, type()                : ", self.nsfr_id,
                      type(self.nsfr_id))  # must be <class 'str'>
        self.log.info(self.who,
                      "relevance, type()              : ", self.relevance,
                      type(self.relevance))  # must be <class 'float'>
        self.log.info(self.who, "vol_title, type()              : ", vol_title,
                      type(vol_title))  # must be <class 'str'>
        self.log.info(
            self.who, "vol_auteur, type()             : ", vol_auteur,
            type(vol_auteur))  # must be <class 'list'> of <class 'str'>
        self.log.info(self.who,
                      "vol_auteur_prenom, type()      : ", vol_auteur_prenom,
                      type(vol_auteur_prenom))  # must be <class 'str'>
        self.log.info(self.who,
                      "vol_auteur_nom, type()         : ", vol_auteur_nom,
                      type(vol_auteur_nom))  # must be <class 'str'>
        if vol_serie:
            self.log.info(self.who, "vol_serie, type()              : ",
                          vol_serie, type(vol_serie))  # must be <class 'str'>
            self.log.info(self.who,
                          "vol_serie_seq, type()          : ", vol_serie_seq,
                          type(vol_serie_seq))  # must be <class 'float'>
        self.log.info(self.who, "vol_editor, type()             : ",
                      vol_editor, type(vol_editor))  # must be <class 'str'>
        self.log.info(self.who, "vol_coll, type()               : ", vol_coll,
                      type(vol_coll))  # must be <class 'str'>
        self.log.info(self.who,
                      "vol_coll_srl, type()           : ", vol_coll_srl,
                      type(vol_coll_srl))  # must be <class 'str'>
        self.log.info(
            self.who, "vol_dp_lgl, type()             : ", vol_dp_lgl,
            type(vol_dp_lgl)
        )  # must be <class 'datetime.datetime'> ('renderer=isoformat')
        self.log.info(self.who, "vol_isbn, type()               : ", vol_isbn,
                      type(vol_isbn))  # must be <class 'str'>
        self.log.info(
            self.who, "vol_genre, type()              : ", vol_genre,
            type(vol_genre))  # must be <class 'list'> of <class 'str'>
        self.log.info(self.who, "vol_cover_index, type()        : ",
                      vol_cover_index, type(vol_cover_index))  # must be <class 'str'>
        self.log.info(self.who, "type(vol_comment_soup)         : ",
                      type(vol_comment_soup)
                      )  # must be byte encoded (start with b'blablabla...
        #        self.log.info(self.who,"vol_comment_soup               :\n",vol_comment_soup)                                # Maybe a bit long sometimes
        # language must be <class 'str'>

        if vol_cover_index:
            self.plugin.cache_identifier_to_cover_url(self.nsfr_id,
                                                      vol_cover_index)

        if vol_isbn:
            self.plugin.cache_isbn_to_identifier(vol_isbn, self.nsfr_id)

        mi = Metadata(vol_title, [vol_auteur])
        mi.set_identifier('nsfr_id', self.nsfr_id)
        mi.publisher = vol_editor
        mi.isbn = vol_isbn
        mi.tags = [vol_genre]
        mi.source_relevance = self.relevance
        mi.has_cover = bool(vol_cover_index)
        if vol_dp_lgl:
            mi.pubdate = vol_dp_lgl
        if vol_serie:
            mi.series = vol_serie
            mi.series_index = vol_serie_seq
        mi.language = "fra"

        mi.comments = vol_comment_soup

        if debug: self.log.info(self.who, "mi\n", mi, "\n")
        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
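Example n. 6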
    def parse_details(self, root):
        try:
            yes24_id = self.parse_yes24_id(self.url)
        except:
            self.log.exception('Error parsing YES24 id for url: %r'%self.url)
            yes24_id = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r'%self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        if not title or not authors or not yes24_id:
            self.log.error('Could not find title/authors/YES24 id for %r'%self.url)
            self.log.error('YES24: %r Title: %r Authors: %r'%(yes24_id, title,
                authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        mi.set_identifier('yes24', yes24_id)
        self.yes24_id = yes24_id

        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)
        mi.cover_url = self.cover_url # This is purely so we can run a test for it!!!

        try:
            mi.publisher = self.parse_publisher(root)
        except:
            self.log.exception('Error parsing publisher for url: %r'%self.url)

        try:
            mi.pubdate = self.parse_published_date(root)
        except:
            self.log.exception('Error parsing published date for url: %r'%self.url)

        mi.language = 'ko'

        mi.source_relevance = self.relevance

        if self.yes24_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.yes24_id)

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
Example n. 7
    def parse_details(self, raw, root):
        # parse each metadata field from the page
        #self.log.info("=====")
        try:
            asin = self.parse_asin(root)
        except:
            self.log.exception('Error parsing asin for url: %r' % self.url)
            asin = None
        if self.testing:
            import tempfile, uuid
            with tempfile.NamedTemporaryFile(
                    prefix=(asin or str(uuid.uuid4())) + '_',
                    suffix='.html',
                    delete=False) as f:
                f.write(raw)
            print('Downloaded html for', asin, 'saved in', f.name)
        # extract the title
        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            title = None
        # extract the authors
        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not asin:
            self.log.error('Could not find title/authors/asin for %r' %
                           self.url)
            self.log.error('ASIN: %r Title: %r Authors: %r' %
                           (asin, title, authors))
            return
        # build the Metadata object mi from title and authors
        mi = Metadata(title, authors)
        # set the book id
        idtype = '17k'
        mi.set_identifier(idtype, asin)
        self.k17k_id = asin

        # set the comments (book description)
        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)
        # set the series
        try:
            series, series_index = self.parse_series(root)
            if series:
                mi.series, mi.series_index = series, series_index
            elif self.testing:
                mi.series, mi.series_index = 'Dummy series for testing', 1
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)
        # set the tags
        try:
            mi.tags = self.parse_tags(root)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        # set the last-modified date
        # try:
        #     mi.last_modified = self.parse_last_modified(root)
        # except:
        #     self.log.exception('Error parsing last_modified for url: %r'%self.url)
        # set the cover
        try:
            self.cover_url = self.parse_cover(root, raw)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)

        mi.has_cover = bool(self.cover_url)
        mi.source_relevance = self.relevance
        mi.languages = [
            u'中文',
        ]

        if self.k17k_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.k17k_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(
                    self.k17k_id, self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Example n. 8
#!/usr/bin/env python
Example n. 9
    def parse_details(self, root):
        try:
            goodreads_id = self.parse_goodreads_id(self.url)
        except:
            self.log.exception("Error parsing goodreads id for url: %r" % self.url)
            goodreads_id = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception("Error parsing title and series for url: %r" % self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception("Error parsing authors for url: %r" % self.url)
            authors = []

        if not title or not authors or not goodreads_id:
            self.log.error("Could not find title/authors/goodreads id for %r" % self.url)
            self.log.error("Goodreads: %r Title: %r Authors: %r" % (goodreads_id, title, authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        mi.set_identifier("goodreads", goodreads_id)
        self.goodreads_id = goodreads_id

        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception("Error parsing ISBN for url: %r" % self.url)

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception("Error parsing ratings for url: %r" % self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception("Error parsing comments for url: %r" % self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception("Error parsing cover for url: %r" % self.url)
        mi.has_cover = bool(self.cover_url)

        try:
            tags = self.parse_tags(root)
            if tags:
                mi.tags = tags
        except:
            self.log.exception("Error parsing tags for url: %r" % self.url)

        try:
            mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
        except:
            self.log.exception("Error parsing publisher and date for url: %r" % self.url)

        mi.source_relevance = self.relevance

        if self.goodreads_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.goodreads_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(self.goodreads_id, self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Example n. 10
	def parse_details(self, root):
		try:
			moly_id = self.parse_moly_id(self.url)
			self.log.info('Parsed moly.hu identifier: %s'%moly_id)
		except:
			self.log.exception('Error parsing moly.hu id for url: %r'%self.url)
			moly_id = None

		try:
			title = self.parse_title(root)
			self.log.info('Parsed title: %s'%title)
		except:
			self.log.exception('Error parsing title for url: %r'%self.url)
			title = None
		
		try:
			authors = self.parse_authors(root)
			self.log.info('Parsed authors: %s'%authors)
		except:
			self.log.exception('Error parsing authors for url: %r'%self.url)
			authors = []

		if not title or not authors or not moly_id:
			self.log.error('Could not find title/authors/moly.hu id for %r'%self.url)
			self.log.error('Moly.hu id: %r Title: %r Authors: %r'%(moly_id, title, authors))
			return

		mi = Metadata(title, authors)
		mi.set_identifier('moly_hu', moly_id)
		self.moly_id = moly_id

		try:
			isbn = self.parse_isbn(root)
			self.log.info('Parsed ISBN: %s'%isbn)
			if isbn:
				self.isbn = mi.isbn = isbn
		except:
			self.log.exception('Error parsing ISBN for url: %r'%self.url)
		
		try:
			series_info = self.parse_series(root)
			if series_info is not None:
				mi.series = series_info[0]
				mi.series_index = int(series_info[1])
				self.log.info('Parsed series: %s, series index: %f'%(mi.series,mi.series_index))
		except:
			self.log.exception('Error parsing series for url: %r'%self.url)
			
		try:
			mi.comments = self.parse_comments(root)
			self.log.info('Parsed comments: %s'%mi.comments)
		except:
			self.log.exception('Error parsing comments for url: %r'%self.url)

		try:
			self.cover_url = self.parse_covers(root)
			self.log.info('Parsed URL for cover: %r'%self.cover_url)
			self.plugin.cache_identifier_to_cover_url(self.moly_id, self.cover_url)
			mi.has_cover = bool(self.cover_url)
		except:
			self.log.exception('Error parsing cover for url: %r'%self.url)

		try:
			mi.tags = self.parse_tags(root)
			self.log.info('Parsed tags: %s'%mi.tags)
		except:
			self.log.exception('Error parsing tags for url: %r'%self.url)
			
		try:
			mi.languages = self.parse_languages(mi.tags)
			self.log.info('Parsed languages: %r'%mi.languages)
		except:
			self.log.exception('Error parsing language for url: %r'%self.url)
			
		try:
			mi.publisher = self.parse_publisher(root)
			self.log.info('Parsed publisher: %s'%mi.publisher)
		except:
			self.log.exception('Error parsing publisher for url: %r'%self.url)	
			
		try:
			mi.pubdate = self.parse_published_date(root)
			self.log.info('Parsed publication date: %s'%mi.pubdate)
		except:
			self.log.exception('Error parsing published date for url: %r'%self.url)
			
		try:
			mi.rating = self.parse_rating(root)
			self.log.info('Parsed rating: %s\n\n'%mi.rating)
		except:
			self.log.exception('Error parsing rating for url: %r\n\n'%self.url)


		mi.source_relevance = self.relevance

		if self.moly_id and self.isbn:
			self.plugin.cache_isbn_to_identifier(self.isbn, self.moly_id)

		self.plugin.clean_downloaded_metadata(mi)

		self.result_queue.put(mi)
Example n. 11
    def parse_details(self, root):
        try:
            kyobobook_id = self.parse_kyobobook_id(self.url)
        except:
            self.log.exception('Error parsing Kyobobook id for url: %r'%self.url)
            kyobobook_id = None
        
        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r'%self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        if not title or not authors or not kyobobook_id:
            self.log.error('Could not find title/authors/kyobobook id for %r'%self.url)
            self.log.error('Kyobobook: %r Title: %r Authors: %r'%(kyobobook_id, title,
                authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        mi.set_identifier('kyobobook', kyobobook_id)
        self.kyobobook_id = kyobobook_id

        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r'%self.url)

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)

        try:
            tags = self.parse_tags(root)
            if tags:
                mi.tags = tags
        except:
            self.log.exception('Error parsing tags for url: %r'%self.url)

        try:
            mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
        except:
            self.log.exception('Error parsing publisher and date for url: %r'%self.url)

        try:
            lang = self._parse_language(root)
            if lang:
                mi.language = lang
        except:
            self.log.exception('Error parsing language for url: %r'%self.url)

        mi.source_relevance = self.relevance

        if self.kyobobook_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.kyobobook_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(self.kyobobook_id,
                        self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
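Example n. 12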
    def get_details(self):
        self.log.info("    Worker.get_details:")
        self.log.info("        self:     ", self)
        self.log.info("        self.url: ", self.url)
        
        # We should not even be here if we are not processing an ebook hit
        if self.url.find("/ebook/") == -1:
            return

        try:
            raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                self.log.error('URL malformed: %r' % self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Beam Ebooks timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r' % self.url
                self.log.exception(msg)
            return

        # raw = raw.decode('utf-8', errors='replace')
        raw = raw.decode('iso-8859-1', errors='replace')
        # open('D:\\work\\calibre-dump-book-details.html', 'wb').write(raw)

        if '<title>404 - ' in raw:
            self.log.error('URL malformed: %r' % self.url)
            return

        try:
            # root = fromstring(clean_ascii_chars(raw))
            root = fromstring(raw)
        except:
            msg = 'Failed to parse beam ebooks details page: %r' % self.url
            self.log.exception(msg)
            return

        try:
            self.beam_ebooks_id = self.parse_beam_ebooks_id(self.url)
        except:
            self.log.exception('Error parsing beam ebooks id for url: %r' % self.url)
            self.beam_ebooks_id = None

        try:
            (self.title, self.series_index) = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            self.title = None
            self.series_index = None

        try:
            self.authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            self.authors = None

        mi = Metadata(self.title, self.authors)
        mi.set_identifier('beam-ebooks', self.beam_ebooks_id)

        if self.series_index:
            mi.series_index = float(self.series_index)
        
        self._determine_perry_rhodan_cycle_name(mi)

        mi.source_relevance = self.relevance

        self.plugin.clean_downloaded_metadata(mi)

        print(mi)
        self.result_queue.put(mi)        
Example n. 13
    def get_details(self):
        self.log.info("    Worker.get_details:")
        self.log.info("        self:     ", self)
        self.log.info("        self.url: ", self.url)

        try:
            raw = self.browser.open_novisit(
                self.url, timeout=self.timeout).read().strip()
            self.log.info(raw)
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                self.log.error('URL malformed: %r' % self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Bookmeta for biblionet timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r' % self.url
                self.log.exception(msg)
            return

        if '<title>404 - ' in raw:
            self.log.error('URL malformed: %r' % self.url)
            return

        try:
            # root = fromstring(clean_ascii_chars(raw))
            root = json.loads(raw)
            self.log.info(root)
        except:
            msg = 'Failed to parse book detail page: %r' % self.url
            self.log.exception(msg)
            return

        try:
            self.biblionetid = root['biblionetid']
        except:
            self.log.exception('Error parsing book id for url: %r' % self.url)
            self.biblionetid = None

        try:
            self.title = root['title'].strip()
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            self.title = None
            self.series_index = None

        try:
            self.authors = [root['authors'].strip()]
            self.log.info(self.authors)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            self.authors = None

        try:
            self.cover_url = root['cover_url']
            self.log.info('Parsed URL for cover:%r' % self.cover_url)
            self.plugin.cache_identifier_to_cover_url(self.biblionetid,
                                                      self.cover_url)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
            self.cover_url = None
        # set this outside the handler: self.cover_url may be unset on failure
        self.has_cover = bool(self.cover_url)

        try:
            self.publisher = root['publisher']
            self.log.info('Parsed publisher:%s' % self.publisher)
        except:
            self.log.exception('Error parsing publisher for url: %r' %
                               self.url)

        try:
            self.tags = root['categories'].replace('DDC: ', 'DDC:').replace(
                '-', '').split()[:-1]
            self.log.info('Parsed tags:%s' % self.tags)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
            self.pubdate = root['yr_published']
            self.log.info('Parsed publication date:%s' % self.pubdate)
        except:
            self.log.exception('Error parsing published date for url: %r' %
                               self.url)

        mi = Metadata(self.title, self.authors)
        mi.set_identifier('biblionet', self.biblionetid)

        if self.series_index:
            try:
                mi.series_index = float(self.series_index)
            except:
                self.log.exception('Error loading series')
        if self.relevance:
            try:
                mi.source_relevance = self.relevance
            except:
                self.log.exception('Error loading relevance')
        if self.cover_url:
            try:
                mi.cover_url = self.cover_url
            except:
                self.log.exception('Error loading cover_url')
        if self.publisher:
            try:
                mi.publisher = self.publisher
            except:
                self.log.exception('Error loading publisher')
        if self.tags:
            try:
                mi.tags = self.tags
            except:
                self.log.exception('Error loading tags')
        if self.pubdate:
            try:
                if self.pubdate not in (self.yr_msg1, self.yr_msg2):
                    d = datetime.date(int(self.pubdate), 1, 1)
                    mi.pubdate = d
            except:
                self.log.exception('Error loading pubdate')

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
Example n. 14
    def load_details(self, url, timeout):
        def _format_item(str):
            return re.sub('^"(.*)"$', '\\1', unescape(str))

        def _format_list(str):
            return [_.strip() for _ in _format_item(str).split(',')]

        def _find_meta(node, property):
            return [_.get('content') for _ in node if _.get('property') == property][0]

        def _format_date(date_text):
            year = int(date_text[0:4])
            month = int(date_text[4:6]) 
            day = int(date_text[6:])
            return datetime.datetime(year, month, day, tzinfo=utc_tz)
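            # e.g. _format_date('20161108') -> datetime.datetime(2016, 11, 8, tzinfo=utc_tz)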

        try:
            response = self.browser.open(url, timeout=timeout)
            root = lxml.html.fromstring(response.read())

            # <meta> tag에서 불러오는 항목
            # 책ID, 제목, ISBN, 이미지URL, 평점
            meta = root.xpath('//meta[starts-with(@property, "og") or starts-with(@property, "books")]')

            # schema.org JSON에서 불러오는 항목
            # 제목, 저자, 책소개, 출판사
            ld_json = root.xpath('//script[@type="application/ld+json"]/text()')
            ld = [json.loads(_) for _ in ld_json]
            book_info = [_ for _ in ld if _['@type'] == 'Book'][0]
        except Exception as e:
            self.log.exception(e)

        ridibooks_id = re.search('id=([0-9]+)', url).group(1)
        isbn = _find_meta(meta, 'books:isbn')
        cover_url = _find_meta(meta, 'og:image')

        title = _find_meta(meta, 'og:title')
        authors = _format_list(book_info['author']['name'])
        if 'translator' in book_info:  # has_key() no longer exists on Python 3
            authors.extend([_ + u'(역자)' for _ in _format_list(book_info['translator']['name'])])

        mi = Metadata(title, authors)
        mi.set_identifier('ridibooks', ridibooks_id)

        mi.cover_url = cover_url
        mi.has_cover = bool(cover_url)

        mi.publisher = _format_item(book_info['publisher']['name'])
        mi.pubdate = _format_date(book_info['datePublished'])

        mi.comments = _format_item(book_info['description'])
        mi.rating = float(_find_meta(meta, 'books:rating:normalized_value'))

        series = re.search(r'(.*)\s*(\d+)권', title)
        if series:
            mi.series = series.group(1)
            mi.series_index = float(series.group(2))

        mi.language = 'Korean'
        mi.source_relevance = self.relevance

        if ridibooks_id:
            if isbn:
                self.plugin.cache_isbn_to_identifier(isbn, ridibooks_id)
            if cover_url:
                self.plugin.cache_identifier_to_cover_url(ridibooks_id, cover_url)

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
Example 16
    def parse_details(self, root):
        try:
            CBDB_id = self.parse_CBDB_id(self.url)
        except:
            self.log.exception('Error parsing CBDB id for url: %r' % self.url)
            CBDB_id = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r' %
                               self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not CBDB_id:
            self.log.error('Could not find title/authors/CBDB id for %r' %
                           self.url)
            self.log.error('CBDB: %r Title: %r Authors: %r' %
                           (CBDB_id, title, authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        #mi.identifiers['cbdb'] = CBDB_id
        mi.set_identifier('cbdb', CBDB_id)
        #self.log.info(CBDB_id)
        #self.log.info(mi.identifiers.get('cbdb', None))
        self.CBDB_id = CBDB_id

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r' % self.url)

        # summary
        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            self.cover_urls = self.parse_covers(root)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
        mi.has_cover = bool(self.cover_urls)
        #self.log.info('covers')
        #self.log.info(self.cover_urls)

        try:
            tags = self.parse_tags(root)
            if tags:
                mi.tags = tags
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
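            # parse_editions is expected to return a (publisher, pubdate, isbn) triple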
            mi.publisher, mi.pubdate, isbn = self.parse_editions(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing publisher and date for url: %r' %
                               self.url)

        mi.source_relevance = self.relevance

        mi.language = 'Czech'

        #self.log.info('self.CBDB_id = ' + str(self.CBDB_id ))

        if self.CBDB_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.CBDB_id)

            if self.cover_urls:
                self.plugin.cache_identifier_to_cover_url(
                    self.CBDB_id, self.cover_urls)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Example 17
    def parse_details(self, root):
        try:
            legie_id = self.parse_legie_id(self.url)
        except:
            self.log.exception('Error parsing Legie id for url: %r' % self.url)
            legie_id = None

        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            title = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not legie_id:
            self.log.error('Could not find title/authors/Legie id for %r' %
                           self.url)
            self.log.error('Legie: %r Title: %r Authors: %r' %
                           (legie_id, title, authors))
            return

        self.legie_id = legie_id

        rating = comments = series = series_index = None
        publisher = isbn = year = None  # referenced in the no-editions branch below
        try:
            rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r' % self.url)

        try:
            comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            (series, series_index) = self.parse_series(root)
        except:
            self.log.info('Series not found.')

        try:
            tags = self.parse_tags(root)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)
            tags = None

        if legie_id:
            editions = self.get_editions()
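            # get_editions() yields (year, cover_url, publisher, isbn) tuples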

            if editions:
                num_editions = len(editions)
                self.log.info('Found %d editions' % num_editions)
                for edition in editions:
                    (year, cover_url, publisher, isbn) = edition
                    mi = Metadata(title, authors)
                    self.legie_id = "%s#%s" % (legie_id, year)
                    mi.set_identifier('legie', self.legie_id)
                    mi.source_relevance = self.relevance
                    mi.rating = rating
                    mi.comments = comments
                    mi.series = series
                    mi.series_index = series_index
                    if cover_url:
                        mi.cover_url = self.cover_url = cover_url
                        self.plugin.cache_identifier_to_cover_url(
                            self.legie_id, self.cover_url)
                    if tags:
                        mi.tags = tags
                    mi.has_cover = bool(self.cover_url)
                    mi.publisher = publisher
                    mi.isbn = isbn
                    mi.pubdate = self.prepare_date(int(year))
                    mi.language = "ces"
                    self.result_queue.put(mi)
            else:
                mi = Metadata(title, authors)
                mi.set_identifier('legie', self.legie_id)
                mi.source_relevance = self.relevance
                mi.rating = rating
                mi.comments = comments
                mi.series = series
                mi.series_index = series_index
                try:
                    self.cover_url = self.parse_cover(root)
                except:
                    self.log.exception('Error parsing cover for url: %r' %
                                       self.url)
                if tags:
                    mi.tags = tags
                mi.has_cover = bool(self.cover_url)
                # publisher/isbn/year stay None when no edition data was found
                mi.publisher = publisher
                mi.isbn = isbn
                if year:
                    mi.pubdate = self.prepare_date(int(year))
                mi.language = "ces"
                self.result_queue.put(mi)
                if self.legie_id:
                    if self.cover_url:
                        self.plugin.cache_identifier_to_cover_url(
                            self.legie_id, self.cover_url)
Example 18
    def parse_details(self, root):

        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for query: %r' %
                               self.query)
            title = None

        if not title:
            self.log.error('Could not find title for %r' % self.query)

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for query: %r' %
                               self.query)
            authors = []

        if not authors:
            self.log.error('Could not find authors for %r' % self.query)
            return

        mi = Metadata(title, authors)

        try:
            isbn = self.parse_isbn(root)
            if isbn:
                # match 10 or 13 digits at the start, followed by a space or end of string
                p = re.compile(r'^([0-9]{13}|[0-9]{10})(?= |\Z)')
                if isinstance(isbn, str):
                    m = p.match(isbn)
                    if m:
                        mi.isbn = m.group()
                else:
                    m = p.match(isbn[0])
                    if m:
                        mi.isbn = m.group()
        except:
            self.log.exception('Error parsing ISBN for url: %r' % self.url)

        try:
            lang = self.parse_language(root)
            if lang:
                mi.languages = lang
        except:
            self.log.exception('Error parsing language for url: %r' % self.url)

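        # The identifier fields below may come back as a single string or a list of strings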
        try:
            lccn = self.parse_lccn(root)
            if lccn:
                if isinstance(lccn, str):
                    mi.set_identifier('lccn', lccn)
                else:
                    for identifier in lccn:
                        mi.set_identifier('lccn', identifier)
        except:
            self.log.exception('Error parsing LCCN for url: %r' % self.url)

        try:
            ddc = self.parse_ddc(root)
            if ddc:
                if isinstance(ddc, str):
                    mi.set_identifier('ddc', ddc)
                else:
                    for identifier in ddc:
                        mi.set_identifier('ddc', identifier)
        except:
            self.log.exception('Error parsing DDC for url: %r' % self.url)

        try:
            lcc = self.parse_lcc(root)
            if lcc:
                if isinstance(lcc, str):
                    mi.set_identifier('lcc', lcc)
                else:
                    for identifier in lcc:
                        mi.set_identifier('lcc', identifier)
        except:
            self.log.exception('Error parsing LCC for url: %r' % self.url)

        mi.source_relevance = self.relevance

        self.result_queue.put(mi)
Example 19
    def get_details(self):
        '''
        The get_details() function for stripping the website for all information
        '''
        self.log.info("    Worker.get_details:")
        self.log.info("        self:     ", self)
        self.log.info("        self.url: ", self.url)

        # Parse the html code from the website
        try:
            raw = self.browser.open_novisit(
                self.url, timeout=self.timeout).read().strip()
        # Do some error handling if it fails to read data
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                self.log.error('URL malformed: %r' % self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Bookmeta for saxo timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r' % self.url
                self.log.exception(msg)
            return

        # Do some error handling if the html code returned 404
        if "<title>404 - " == raw:
            self.log.error('URL malformed: %r' % self.url)
            return

        # Clean the html data a little
        try:
            root = parse(raw)
        except:
            self.log.error("Error cleaning HTML")
            return

        # Get the title of the book
        try:
            title_node = root.xpath('//span[@itemprop="name"]')
            self.title = title_node[0].text
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)

        # Get the author of the book
        try:
            author_node = root.xpath('//span[@class="expandAuthorName"]')
            author_strings = author_node[0].text.split(",")
            #print(author_strings)
            for name in author_strings:
                self.authors.append(name.strip())
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            self.authors = None

        # Get the series of the book
        try:
            series_node = root.xpath('//b[contains(text(), "Serie")]/a')
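            # The link text is assumed to look like "<series name>: <index>"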
            if len(series_node) > 0:
                self.series = series_node[0].text.split(": ")[0].strip()
                self.series_index = series_node[0].text.split(": ")[-1].strip()
            #    print("'%s'" % self.series)
            #    print("'%s'" % self.series_index)
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)

        # Rating parsing is not implemented for this site; default to 0.0
        self.rating = 0.0

        # Get the ISBN number from the site
        try:
            isbn_node = root.xpath(
                '//div[@class="eBookContainer"]/b/span[@itemprop="identifier"]'
            )
            if len(isbn_node) > 0:
                self.isbn = isbn_node[0].text.replace("ISBN: ", "").strip()
        except:
            self.log.exception('Error parsing isbn for url: %r' % self.url)
            self.isbn = None

        # Get the comments/blurb for the book
        try:
            comment_node = root.xpath('//meta[@name="description"]/@content')
            self.comments = comment_node[0]
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)
            self.comments = None

        # Parse the cover url for downloading the cover.
        try:
            cover_node = root.xpath(
                '//div[@class="bookDetailCoverCover"]/img/@src')
            self.cover_url = "https://mofibo.com" + cover_node[0]
            self.log.info('    Parsed URL for cover: %r' % self.cover_url)
            self.plugin.cache_identifier_to_cover_url(self.isbn,
                                                      self.cover_url)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
        # Record cover availability whether or not parsing succeeded
        self.has_cover = bool(getattr(self, 'cover_url', None))

        # Get the publisher name
        try:
            publisher_node = root.xpath(
                '//div[@class="eBookContainer"]/b/span/a[@itemprop="brand"]')
            if len(publisher_node) > 0:
                self.publisher = publisher_node[0].text
        except:
            self.log.exception('Error parsing publisher for url: %r' %
                               self.url)

        # Get the language of the book. Only English and Danish are supported, though.
        try:
            language_node = root.xpath('//b[@class="expanderLanguage"]')
            language = language_node[0].text.strip().replace("Sprog:",
                                                             "").replace(
                                                                 " ", "")
            language = self.lang_map.get(language, None)
            self.language = language
        except:
            self.log.exception('Error parsing language for url: %r' % self.url)

        # Get the publisher date
        try:
            pubdate_node = root.xpath(
                '//div[@class="eBookContainer"]/b[contains(text(),"Udgivet:")]'
            )
            if len(pubdate_node) > 0:
                date_str = pubdate_node[0].text.replace("Udgivet:", "").strip()
                format_str = '%Y-%m-%d'  # The format
                self.pubdate = datetime.datetime.strptime(date_str, format_str)
        except:
            self.log.exception('Error parsing published date for url: %r' %
                               self.url)

        # Get the tags
        try:
            tags = []
            tags_node = root.xpath('//span[@itemprop="category"]')
            tags.append(tags_node[0].text.strip())
            self.tags = tags
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        # Setup the metadata
        meta_data = Metadata(self.title, self.authors)
        meta_data.set_identifier('isbn', self.isbn)
        meta_data.set_identifier('mofibo', self.url)

        # Set series
        if self.series:
            try:
                meta_data.series = self.series
                meta_data.series_index = float(self.series_index)
            except:
                self.log.exception('Error loading series')
        # Set ISBN
        if self.isbn:
            try:
                meta_data.isbn = self.isbn
            except:
                self.log.exception('Error loading ISBN')
        # Set relevance
        if self.relevance:
            try:
                meta_data.source_relevance = self.relevance
            except:
                self.log.exception('Error loading relevance')
        # Set cover url
        if self.cover_url:
            try:
                meta_data.cover_url = self.cover_url
            except:
                self.log.exception('Error loading cover_url')
        # Set publisher
        if self.publisher:
            try:
                meta_data.publisher = self.publisher
            except:
                self.log.exception('Error loading publisher')
        # Set language
        if self.language:
            try:
                meta_data.language = self.language
            except:
                self.log.exception('Error loading language')
        # Set comments/blurb
        if self.comments:
            try:
                meta_data.comments = self.comments
            except:
                self.log.exception("Error loading comments")
        # Set publication date
        if self.pubdate:
            try:
                meta_data.pubdate = self.pubdate
            except:
                self.log.exception('Error loading pubdate')
        # Set tags data
        if self.tags:
            try:
                meta_data.tags = self.tags
            except:
                self.log.exception('Error loading tags')

        # Put meta data
        self.plugin.clean_downloaded_metadata(meta_data)
        self.result_queue.put(meta_data)
Example 20
	def parse_details(self, root):
		search_data = ''
		isbn = None
		
		try:
			self.log.info('Parse details:%s'%self.url)
			databazeknih_id = self.parse_databazeknih_id(self.url)
			self.log.info('Parsed DK identifier:%s'%databazeknih_id)
		except:
			self.log.exception('Error parsing databazeknih id for url: %r'%self.url)
			databazeknih_id = None

		try:
			title = self.parse_title(root)
			self.log.info('Parsed title:%s'%title)
		except:
			self.log.exception('Error parsing title for url: %r'%self.url)
			title = None
		
		try:
			authors = self.parse_authors(root)
			self.log.info('Parsed authors:%s'%authors)
		except:
			self.log.exception('Error parsing authors for url: %r'%self.url)
			authors = []

		if not title or not authors or not databazeknih_id:
			self.log.error('Could not find title/authors/databazeknih id for %r'%self.url)
			self.log.error('DK id: %r Title: %r Authors: %r'%(databazeknih_id, title, authors))
			return

		mi = Metadata(title, authors)
		self.log.info('dbki:%s'%databazeknih_id)
		mi.set_identifier('databazeknih', databazeknih_id)
		self.databazeknih_id = databazeknih_id

		try:
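			# parse_series returns a (name, index) pair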
			(mi.series, mi.series_index) = self.parse_series(root)
			self.log.info('Parsed series:%s'%mi.series)
			self.log.info('Parsed series index:%s'%mi.series_index)
		except:
			self.log.exception('Error parsing series for url: %r'%self.url)
			series = None
			
		try:
			mi.comments = self.parse_comments(root)
			self.log.info('Parsed comments:%s'%mi.comments)
		except:
			self.log.exception('Error parsing comments for url: %r'%self.url)

		try:
			self.cover_url = self.parse_cover(root)
			self.log.info('Parsed URL for cover:%r'%self.cover_url)
			self.plugin.cache_identifier_to_cover_url(self.databazeknih_id, self.cover_url)
		except:
			self.log.exception('Error parsing cover for url: %r'%self.url)
		mi.has_cover = bool(self.cover_url)

		try:
			mi.tags = self.parse_tags(root)
			self.log.info('Parsed tags:%s'%mi.tags)
		except:
			self.log.exception('Error parsing tags for url: %r'%self.url)
			
		try:
			mi.publisher = self.parse_publisher(root)
			self.log.info('Parsed publisher:%s'%mi.publisher)
		except:
			self.log.exception('Error parsing publisher for url: %r'%self.url)
			
		try:
			mi.pubdate = self.parse_pubdate(root)
			self.log.info('Parsed pubdate:%s'%mi.pubdate)
		except:
			self.log.exception('Error parsing pubdate for url: %r'%self.url)

			
		try:
			mi.rating = self.parse_rating(root)
			self.log.info('Parsed rating:%s'%mi.rating)
		except:
			self.log.exception('Error parsing rating for url: %r'%self.url)

		mi.source_relevance = self.relevance

#		if series:
#			mi.series = series
		
		try:
			isbn = self.parse_isbn(root)
			if isbn:
				self.isbn = mi.isbn = isbn
		except:
			self.log.exception('Error parsing ISBN for url: %r'%self.url)

		if self.databazeknih_id and isbn:
			self.plugin.cache_isbn_to_identifier(isbn, self.databazeknih_id)
			
#		self.plugin.clean_downloaded_metadata(mi)
#		mi.isbn = check_isbn(mi.isbn)
		self.log.info(mi)
		self.result_queue.put(mi)
Example 21
    def _GoodreadsBook_to_Metadata(self, book):
        # type: (_GoodreadsBook) -> Metadata
        """
        :param book: _GoodreadsBook: book
        :return: Metadata: Metadata
        """
        mi = Metadata(book.title, book.authors)
        mi.source_relevance = 0
        mi.set_identifier('goodreads', book.id)

        if self.prefs['NEVER_REPLACE_ISBN'] and mi.get_identifiers().get(
                'isbn'):
            mi.set_identifier('isbn', '')

        if book.asin and not self.prefs['NEVER_REPLACE_AMAZONID']:
            mi.set_identifier('amazon', book.asin)

        if book.isbn and not self.prefs['NEVER_REPLACE_ISBN']:
            try:
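                # book.isbn is an ISBN-10 here; convert it to ISBN-13 before validating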
                if len(book.isbn) == 10:
                    mi.isbn = check_isbn13(_ISBNConvert.convert(book.isbn))
                else:
                    mi.isbn = check_isbn13(book.isbn)
            except:
                self.log.error("ISBN CONVERSION ERROR:", book.isbn)
                self.log.exception()

        if book.image_url:
            self.log.info('cache_identifier_to_cover_url:', book.asin, ':',
                          book.image_url)
            self.cache_identifier_to_cover_url(book.id, book.image_url)

        if book.publisher:
            self.log.info('book.publisher is:', book.publisher)
            mi.publisher = book.publisher

        if book.pubdate:
            self.log.info('book.pubdate is:',
                          book.pubdate.strftime('%Y-%m-%d'))
            mi.pubdate = book.pubdate

        if book.comments:
            self.log.info('book.editorial_review is:', book.comments)
            mi.comments = book.comments

        tags = self.prefs['ADD_THESE_TAGS'].split(',')
        tags.extend(book.tags)
        # tag_mappings = JSONConfig('plugins/GenreMappings')['genreMappings']
        # mi.tags = list(set(sorted(filter(lambda x: tag_mappings.get(x, x), tags))))

        if book.series:
            mi.series = book.series
            self.log.info(u'series:', book.series)
            if book.series_index:
                mi.series_index = book.series_index
                self.log.info(u'series_index:',
                              "{0:.2f}".format(book.series_index))
            else:
                mi.series_index = 0

        if book.average_rating:
            mi.rating = book.average_rating

        self.clean_downloaded_metadata(mi)

        return mi
Example 22
    def parse_details(self, raw, root):
        dang_id = parse_dang_id(root, self.log, self.url)
        if not dang_id and root.xpath(
                '//form[@action="/errors/validateCaptcha"]'):
            raise CaptchaError(
                'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.'
            )
        if self.testing:
            import tempfile, uuid
            with tempfile.NamedTemporaryFile(
                    prefix=(dang_id or str(uuid.uuid4())) + '_',
                    suffix='.html',
                    delete=False) as f:
                f.write(raw)
            print('Downloaded html for', dang_id, 'saved in', f.name)

        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            title = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not dang_id:
            self.log.error('Could not find title/authors/dang_id for %r' %
                           self.url)
            self.log.error('ASIN: %r Title: %r Authors: %r' %
                           (dang_id, title, authors))
            return

        mi = Metadata(title, authors)
        idtype = 'dang'
        mi.set_identifier(idtype, dang_id)
        self.dang_id = dang_id

        try:
            mi.comments = self.parse_comments(root, raw)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            series, series_index = self.parse_series(root)
            if series:
                mi.series, mi.series_index = series, series_index
            elif self.testing:
                mi.series, mi.series_index = 'Dummy series for testing', 1
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)

        try:
            mi.tags = self.parse_tags(root)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
            self.cover_url = self.parse_cover(root, raw)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
        mi.has_cover = bool(self.cover_url)

        pd_info = root.xpath(self.pd_info_xpath)
        pd_info_store = root.xpath(self.pd_info_store_xpath)
        pd_desc = root.xpath(self.pd_desc_xpath)

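        # Prefer the product info section; fall back to the store-page variant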
        if pd_info or pd_info_store:
            try:
                isbn = self.parse_isbn(pd_info, pd_info_store, pd_desc)
                if isbn:
                    self.isbn = mi.isbn = isbn
            except:
                self.log.exception('Error parsing ISBN for url: %r' % self.url)

            if pd_info:
                pd_info = pd_info[0]
            else:
                pd_info = pd_info_store[0]

            try:
                mi.publisher = self.parse_publisher(pd_info)
            except:
                self.log.exception('Error parsing publisher for url: %r' %
                                   self.url)

            try:
                mi.pubdate = self.parse_pubdate(pd_info)
            except:
                self.log.exception('Error parsing publish date for url: %r' %
                                   self.url)

        else:
            self.log.warning('Failed to find product description for url: %r' %
                             self.url)

        mi.source_relevance = self.relevance

        if self.dang_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.dang_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(
                    self.dang_id, self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
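Example 23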
    def parse_details(self, root):
        try:
            kyobobook_id = self.parse_kyobobook_id(self.url)
        except:
            self.log.exception('Error parsing Kyobobook id for url: %r' %
                               self.url)
            kyobobook_id = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r' %
                               self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not kyobobook_id:
            self.log.error('Could not find title/authors/kyobobook id for %r' %
                           self.url)
            self.log.error('Kyobobook: %r Title: %r Authors: %r' %
                           (kyobobook_id, title, authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        mi.set_identifier('kyobobook', kyobobook_id)
        self.kyobobook_id = kyobobook_id

        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r' % self.url)

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r' % self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
        mi.has_cover = bool(self.cover_url)

        try:
            tags = self.parse_tags(root)
            if tags:
                mi.tags = tags
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
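            # parse_publisher_and_date returns a (publisher, pubdate) pair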
            mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
        except:
            self.log.exception('Error parsing publisher and date for url: %r' %
                               self.url)

        try:
            lang = self._parse_language(root)
            if lang:
                mi.language = lang
        except:
            self.log.exception('Error parsing language for url: %r' % self.url)

        mi.source_relevance = self.relevance

        if self.kyobobook_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn,
                                                     self.kyobobook_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(
                    self.kyobobook_id, self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Example 24
    def parse_details(self, raw, root):
        try:
            asin = self.parse_asin(root)
        except:
            self.log.exception('Error parsing asin for url: %r'%self.url)
            asin = None
        if self.testing:
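            # In test runs, dump the fetched HTML so it can be inspected later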
            import tempfile, uuid
            with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
                    suffix='.html', delete=False) as f:
                f.write(raw)
            print('Downloaded html for', asin, 'saved in', f.name)

        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r'%self.url)
            title = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        if not title or not authors or not asin:
            self.log.error('Could not find title/authors/asin for %r'%self.url)
            self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title,
                authors))
            return

        mi = Metadata(title, authors)
        idtype = 'amazon' if self.domain == 'com' else 'amazon_'+self.domain
        mi.set_identifier(idtype, asin)
        self.amazon_id = asin

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            series, series_index = self.parse_series(root)
            if series:
                mi.series, mi.series_index = series, series_index
            elif self.testing:
                mi.series, mi.series_index = 'Dummy series for testing', 1
        except:
            self.log.exception('Error parsing series for url: %r'%self.url)

        try:
            mi.tags = self.parse_tags(root)
        except:
            self.log.exception('Error parsing tags for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root, raw)
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)

        non_hero = CSSSelect('div#bookDetails_container_div div#nonHeroSection')(root)
        if non_hero:
            # New style markup
            try:
                self.parse_new_details(root, mi, non_hero[0])
            except:
                self.log.exception('Failed to parse new-style book details section')
        else:
            pd = root.xpath(self.pd_xpath)
            if pd:
                pd = pd[0]

                try:
                    isbn = self.parse_isbn(pd)
                    if isbn:
                        self.isbn = mi.isbn = isbn
                except:
                    self.log.exception('Error parsing ISBN for url: %r'%self.url)

                try:
                    mi.publisher = self.parse_publisher(pd)
                except:
                    self.log.exception('Error parsing publisher for url: %r'%self.url)

                try:
                    mi.pubdate = self.parse_pubdate(pd)
                except:
                    self.log.exception('Error parsing publish date for url: %r'%self.url)

                try:
                    lang = self.parse_language(pd)
                    if lang:
                        mi.language = lang
                except:
                    self.log.exception('Error parsing language for url: %r'%self.url)

            else:
                self.log.warning('Failed to find product description for url: %r'%self.url)

        mi.source_relevance = self.relevance

        if self.amazon_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(self.amazon_id,
                        self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Example 25
    def parse_details(self, root):
        try:
            antik_id = self.parse_antik_id(root)
            self.log.info('Parsed Antikvarium identifier: %s' % antik_id)
        except:
            self.log.exception('Error parsing Antikvarium id for url: %r' %
                               self.url)
            antik_id = None

        try:
            title = self.parse_title(root)
            self.log.info('Parsed title: %s' % title)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            title = None

        try:
            authors = self.parse_authors(root)
            self.log.info('Parsed authors: %s' % authors)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not antik_id:
            self.log.error(
                'Could not find title/authors/Antikvarium.hu id for %r' %
                self.url)
            self.log.error('Antikvarium.hu id: %r Title: %r Authors: %r' %
                           (antik_id, title, authors))
            return

        mi = Metadata(title, authors)
        mi.set_identifier('antik_hu', antik_id)
        self.antik_id = antik_id

        try:
            isbn = self.parse_isbn(root)
            self.log.info('Parsed ISBN: %s' % isbn)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r' % self.url)

        try:
            series = self.parse_series(root)
            self.log.info('Parsed series: %s' % series)
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)
            series = None

        try:
            mi.series_index = self.parse_series_index(root)
            self.log.info('Parsed series index: %s' % mi.series_index)
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)
            mi.series_index = None

        try:
            mi.comments = self.parse_comments(root)
            self.log.info('Parsed comments: %s' % mi.comments)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            self.cover_url = self.parse_cover(root)
            self.log.info('Parsed URL for cover: %r' % self.cover_url)
            self.plugin.cache_identifier_to_cover_url(self.antik_id,
                                                      self.cover_url)
            mi.has_cover = bool(self.cover_url)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)

        try:
            mi.publisher = self.parse_publisher(root)
            self.log.info('Parsed publisher: %s' % mi.publisher)
        except:
            self.log.exception('Error parsing publisher for url: %r' %
                               self.url)

        try:
            mi.tags = self.parse_tags(root)
            self.log.info('Parsed tags: %s' % mi.tags)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
            mi.pubdate = self.parse_published_date(root)
            self.log.info('Parsed publication date: %s' % mi.pubdate)
        except:
            self.log.exception('Error parsing published date for url: %r' %
                               self.url)

        try:
            mi.languages = self.parse_languages(root)
            self.log.info('Parsed languages: %r' % mi.languages)
        except:
            self.log.exception('Error parsing languages for url: %r' %
                               self.url)

        mi.source_relevance = self.relevance

        if series:
            mi.series = series

        if self.antik_id and self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.antik_id)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Example 26
	def parse_details(self, root):
		isfdb_id = None
		title = None
		authors = []
		isbn = None
		publisher = None
		pubdate = None
		
		try:
			isfdb_id = re.search(r'(\d+)$', self.url).group(1)
		except:
			self.log.exception('Error parsing ISFDB ID for url: %r' % self.url)
        
		detail_nodes = root.xpath('//div[@id="content"]//td[@class="pubheader"]/ul/li')
		if not detail_nodes:
			detail_nodes = root.xpath('//div[@id="content"]/div/ul/li') # no table (on records with no image)

		for detail_node in detail_nodes:
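			# The first child carries the section label; the value usually follows in its .tail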
			section = detail_node[0].text_content().strip().rstrip(':')
			try:
				if section == 'Publication':
					title = detail_node[0].tail.strip()
					if not title:
						# assume an extra span with a transliterated title tooltip
						title = detail_node[1].text_content().strip()
				elif section == 'Authors' or section == 'Editors':
					for a in detail_node.xpath('.//a'):
						author = a.text_content().strip()
						if section.startswith('Editors'):
							authors.append(author + ' (Editor)')
						else:
							authors.append(author)
				elif section == 'ISBN':
					isbn = detail_node[0].tail.strip('[] \n')
				elif section == 'Publisher':
					publisher = detail_node.xpath('a')[0].text_content().strip()
				elif section == 'Date':
					pubdate = self._convert_date_text(detail_node[0].tail.strip())
			except:
				self.log.exception('Error parsing section %r for url: %r' % (section, self.url) )

		if not title or not authors or not isfdb_id:
			self.log.error('Could not find title/authors/ISFDB ID for %r' % self.url)
			self.log.error('ISFDB: %r Title: %r Authors: %r' % (isfdb_id, title,
				authors))
			return

		mi = Metadata(title, authors)
		mi.set_identifier('isfdb', isfdb_id)
		self.isfdb_id = isfdb_id

		if isbn:
			self.isbn = mi.isbn = isbn
		if publisher:
			mi.publisher = publisher
		if pubdate:
			mi.pubdate = pubdate
			
		try:
			mi.comments = self.parse_comments(root)
		except:
			self.log.exception('Error parsing comments for url: %r'%self.url)

		try:
			self.cover_url = self.parse_cover(root)
		except:
			self.log.exception('Error parsing cover for url: %r'%self.url)
		
		mi.has_cover = bool(self.cover_url)
		mi.cover_url = self.cover_url # This is purely so we can run a test for it!!!

		mi.source_relevance = self.relevance

		if self.isfdb_id:
			if self.isbn:
				self.plugin.cache_isbn_to_identifier(self.isbn, self.isfdb_id)

		self.plugin.clean_downloaded_metadata(mi)
		self.result_queue.put(mi)
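Example 27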
    def parse_details(self, root):
        try:
            isbn = self.extract_isbn(self.url)
        except:
            self.log.exception('No ISBN in URL: %r'%self.url)
            isbn = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r'%self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        if not title or not authors or not isbn:
            self.log.error('Could not find title/authors/Aladin id for %r'%self.url)
            self.log.error('Aladin: %r Title: %r Authors: %r'%(isbn, title, authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        #mi.set_identifier('isbn', isbn)
        mi.isbn = isbn
        self.isbn = isbn

        # Prefer the ISBN-13 parsed from the page over the ISBN taken from the URL
        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)
        mi.cover_url = self.cover_url # This is purely so we can run a test for it!!!
        if mi.has_cover:
            self.log.info('Cover URL: '+mi.cover_url)

        try:
            mi.publisher = self.parse_publisher(root)
        except:
            self.log.exception('Error parsing publisher for url: %r'%self.url)

        try:
            mi.pubdate = self.parse_published_date(root)
        except:
            self.log.exception('Error parsing published date for url: %r'%self.url)

        mi.language = 'ko'

        mi.source_relevance = self.relevance

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
Example 29
    def get_details(self):
        '''
        The get_details() function for stripping the website for all information
        '''
        self.log.info("    Worker.get_details:")
        self.log.info("        self:     ", self)
        self.log.info("        self.url: ", self.url)

        # Parse the html code from the website
        try:
            raw = self.browser.open_novisit(
                self.url, timeout=self.timeout).read().strip()
        # Do some error handling if it fails to read data
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                self.log.error('URL malformed: %r' % self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Bookmeta for saxo timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r' % self.url
                self.log.exception(msg)
            return

        # Do some error handling if the html code returned 404
        if "<title>404 - " == raw:
            self.log.error('URL malformed: %r' % self.url)
            return

        # Clean the html data a little
        try:
            root = parse(raw)
        except:
            self.log.error("Error cleaning HTML")
            return

        # Get the json data within the HTML code (some stuff is easier to get with json)
        try:
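            # The XPath index [2] picks the second ld+json block, which holds the book record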
            json_raw = root.xpath('(//script[@type="application/ld+json"])[2]')
            json_root = json.loads(json_raw[0].text.strip())
            #print(json.dumps(json_root, indent=4, sort_keys=True))
        except:
            self.log.error("Error loading JSON data")
            return

        # Get the title of the book
        try:
            self.title = json_root['name']
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)

        # Get the author of the book
        try:
            author_node = root.xpath(
                '//h2[@class="product-page-heading__autor"]//a')
            for name in author_node:
                self.authors.append(name.text.strip())
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            self.authors = None

        # Some books have ratings, let's use them.
        try:
            self.rating = float(json_root['aggregateRating']['ratingValue'])
        except:
            self.log.exception('Error parsing rating for url: %r' % self.url)
            self.rating = 0.0

        # Get the ISBN number from the site
        try:
            self.isbn = json_root['isbn']
        except:
            self.log.exception('Error parsing isbn for url: %r' % self.url)
            self.isbn = None

        # Get the comments/blurb for the book
        try:
            self.comments = parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)
            self.comments = None

        # Parse the cover url for downloading the cover.
        try:
            self.cover_url = json_root['image']
            self.log.info('    Parsed URL for cover: %r' % self.cover_url)
            self.plugin.cache_identifier_to_cover_url(self.isbn,
                                                      self.cover_url)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
        # Record cover availability whether or not parsing succeeded
        self.has_cover = bool(getattr(self, 'cover_url', None))

        # Get the publisher name
        try:
            self.publisher = json_root['publisher']['name']
        except:
            self.log.exception('Error parsing publisher for url: %r' %
                               self.url)

        # Get the language of the book. Only English and Danish are supported, though.
        try:
            language = json_root['inLanguage']['name']
            language = self.lang_map.get(language, None)
            self.language = language
        except:
            self.log.exception('Error parsing language for url: %r' % self.url)

        # Get the publisher date
        try:
            #pubdate_node = root.xpath('(//dl[@class="product-info-list"]//dd)[2]') # Format dd-mm-yyyy
            pubdate_node = root.xpath(
                '//div[@class="product-page-block__container"]//dd'
            )  # Format dd-mm-yyyy
            date_str = pubdate_node[0].text.strip()
            format_str = '%d-%m-%Y'  # The format
            self.pubdate = datetime.datetime.strptime(date_str, format_str)
        except:
            self.log.exception('Error parsing published date for url: %r' %
                               self.url)

        # Setup the metadata
        meta_data = Metadata(self.title, self.authors)
        meta_data.set_identifier('isbn', self.isbn)
        meta_data.set_identifier('saxo', self.url)

        # Set rating
        if self.rating:
            try:
                meta_data.rating = self.rating
            except:
                self.log.exception('Error loading rating')
        # Set ISBN
        if self.isbn:
            try:
                meta_data.isbn = self.isbn
            except:
                self.log.exception('Error loading ISBN')
        # Set relevance
        if self.relevance:
            try:
                meta_data.source_relevance = self.relevance
            except:
                self.log.exception('Error loading relevance')
        # Set cover url
        if self.cover_url:
            try:
                meta_data.cover_url = self.cover_url
            except:
                self.log.exception('Error loading cover_url')
        # Set publisher
        if self.publisher:
            try:
                meta_data.publisher = self.publisher
            except:
                self.log.exception('Error loading publisher')
        # Set language
        if self.language:
            try:
                meta_data.language = self.language
            except:
                self.log.exception('Error loading language')
        # Set comments/blurb
        if self.comments:
            try:
                meta_data.comments = self.comments
            except:
                self.log.exception("Error loading comments")
        # Set publication date
        if self.pubdate:
            try:
                meta_data.pubdate = self.pubdate
            except:
                self.log.exception('Error loading pubdate')

        # Put meta data
        self.plugin.clean_downloaded_metadata(meta_data)
        self.result_queue.put(meta_data)