Ejemplo n.º 1
0
    def get_biblio(self):
        """Return a biblio dict built from the DOI metadata for ``self.url``.

        The response of ``doi_query.query(self.url)`` is copied field by
        field; values that are ``None``, ``[]`` or ``""`` are skipped.  A few
        keys are renamed on the way (``issued`` -> ``date``, ``page`` ->
        ``pages``, ``container-title`` -> ``journal``, ``issue`` ->
        ``number``) and ``URL`` overrides both ``url`` and ``permalink``.

        The title has runs of whitespace collapsed and is sentence-cased.
        When the response has no title, or only an empty one, the title is
        ``"UNKNOWN"``.

        Returns:
            dict: bibliographic fields; ``permalink``, ``excerpt``,
            ``comment`` and ``title`` are always present.
        """
        # Response keys that are stored under a different biblio key.
        renamed_keys = {
            "page": "pages",
            "container-title": "journal",
            "issue": "number",
        }

        info(f"url = {self.url}")
        json_bib = doi_query.query(self.url)
        info(f"{json_bib=}")
        biblio = {
            "permalink": self.url,
            "excerpt": "",
            "comment": self.comment,
        }
        # list() snapshots the items; kept in case the helpers below mutate
        # json_bib while we iterate -- TODO confirm and drop if they do not.
        for key, value in list(json_bib.items()):
            info(f"{key=} {value=} {type(value)=}")
            if value in (None, [], ""):
                continue  # empty fields are not copied
            if key == "author":
                biblio["author"] = self.get_author(json_bib)
            elif key == "issued":
                biblio["date"] = self.get_date(json_bib)
            elif key == "URL":
                biblio["permalink"] = biblio["url"] = value
            else:
                biblio[renamed_keys.get(key, key)] = value

        # Look the title up in biblio, not json_bib: an empty title is skipped
        # by the loop above, so indexing biblio["title"] would raise KeyError.
        title = biblio.get("title")
        if title:
            biblio["title"] = sentence_case(" ".join(title.split()))
        else:
            biblio["title"] = "UNKNOWN"
        info(f"{biblio=}")
        return biblio
Ejemplo n.º 2
0
    def get_biblio(self):
        """Return a biblio dict built from the arXiv metadata for this entry.

        The response of ``arxiv_query.query(self.identifier)`` is copied
        field by field on top of fixed defaults (``entry_type`` "report",
        ``organization`` "arXiv", the identifier, permalink and comment).
        Values that are ``None``, ``[]`` or ``""`` are skipped; ``published``
        is stored as ``date`` and ``URL`` overrides both ``url`` and
        ``permalink``.

        The title has runs of whitespace collapsed and is sentence-cased.
        When the response has no title, or only an empty one, the title is
        ``"UNKNOWN"``.

        Returns:
            dict: bibliographic fields; ``title`` is always present.
        """
        info(f"url = {self.url}")
        dict_bib = arxiv_query.query(self.identifier)
        info(f"{dict_bib=}")
        biblio = {
            "entry_type": "report",
            "permalink": self.url,
            "excerpt": "",
            "organization": "arXiv",
            "identifier": self.identifier,
            "comment": self.comment,
        }
        # list() snapshots the items; kept in case the helpers below mutate
        # dict_bib while we iterate -- TODO confirm and drop if they do not.
        for key, value in list(dict_bib.items()):
            info(f"{key=} {value=} {type(value)=}")
            if value in (None, [], ""):
                continue  # empty fields are not copied
            if key == "author":
                biblio["author"] = self.get_author(dict_bib)
            elif key == "published":
                biblio["date"] = self.get_date(dict_bib)
            elif key == "URL":
                biblio["permalink"] = biblio["url"] = value
            else:
                biblio[key] = value

        # Look the title up in biblio, not dict_bib: an empty title is skipped
        # by the loop above, so indexing biblio["title"] would raise KeyError.
        title = biblio.get("title")
        if title:
            biblio["title"] = sentence_case(" ".join(title.split()))
        else:
            biblio["title"] = "UNKNOWN"
        info(f"{biblio=}")
        return biblio
Ejemplo n.º 3
0
    def get_biblio(self):
        """Return a biblio dict built from the ISBN metadata for ``self.url``.

        The response of ``isbn_query.query(self.url)`` is copied field by
        field.  Keys starting with ``subject`` are ignored, values that are
        ``None``, ``[]`` or ``""`` are skipped, and a few keys are renamed
        (``year`` -> ``date``, ``pageCount`` -> ``pages``, ``city`` ->
        ``address``).  ``url`` overrides both ``url`` and ``permalink``.

        A non-empty title has a spaced colon (``" : "``) tightened to
        ``": "`` and is sentence-cased, as is a non-empty subtitle.  When the
        response has no title, or only an empty one, the title is
        ``"UNKNOWN"``.

        Returns:
            dict: bibliographic fields; ``permalink``, ``excerpt``,
            ``comment`` and ``title`` are always present.
        """
        import isbn_query

        # Response keys that are stored under a different biblio key.
        renamed_keys = {
            "year": "date",
            "pageCount": "pages",
            "city": "address",
        }

        info(f"url = {self.url}")
        json_bib = isbn_query.query(self.url)
        info(f"json_bib = '{json_bib}'")
        biblio = {
            "permalink": self.url,
            "excerpt": "",
            "comment": self.comment,
        }
        info("### json_bib.items()")
        # list() snapshots the items; kept in case get_author mutates
        # json_bib while we iterate -- TODO confirm and drop if it does not.
        for key, value in list(json_bib.items()):
            info(f"key = '{key}'")
            if key.startswith("subject"):
                continue
            info("key = '%s' value = '%s' type(value) = '%s'\n" %
                 (key, value, type(value)))
            if value in (None, [], ""):
                continue  # empty fields are not copied
            if key == "author":
                biblio["author"] = self.get_author(json_bib)
            elif key == "url":
                biblio["url"] = value
                biblio["permalink"] = value
            else:
                biblio[renamed_keys.get(key, key)] = value

        # Look the title up in biblio, not json_bib: an empty title is skipped
        # by the loop above, so indexing biblio["title"] would raise KeyError.
        title = biblio.get("title")
        if title:
            # The previous pattern replaced ": " with itself (a no-op); the
            # spaced colon is what needs tightening before sentence-casing.
            biblio["title"] = sentence_case(title.replace(" : ", ": "))
            # Only non-empty subtitles reach biblio, so sentence_case is never
            # handed None or "".
            subtitle = biblio.get("subtitle")
            if subtitle:
                biblio["subtitle"] = sentence_case(subtitle)
        else:
            biblio["title"] = "UNKNOWN"
        return biblio
Ejemplo n.º 4
0
    def get_title(self):
        """Return the stripped title for this item, chosen by ``self.type``.

        A "subreddit" uses ``self.url_dict["root"]``; a "post" or "comment"
        uses the sentence-cased ``title`` of the first child in the first
        element of ``self.json``.  Any other type yields "UNKNOWN".
        """
        if self.type == "subreddit":
            title = self.url_dict["root"]
        elif self.type in ["post", "comment"]:
            first_listing = self.json[0]
            first_child = first_listing["data"]["children"][0]
            raw_title = first_child["data"]["title"]
            title = sentence_case(raw_title)
        else:
            title = "UNKNOWN"
        info(f"{title=}")
        return title.strip()
Ejemplo n.º 5
0
    def split_title_org(self):
        """Separate the title by a delimiter and test if latter half is the
        organization (if it has certain words (blog) or is too short)

        The title from ``self.get_title()`` is split on a "strong" delimiter
        (``|``, em dash or guillemets surrounded by whitespace) when one is
        present, otherwise on a "weak" one (``:``, ``;`` or ``-`` followed by
        whitespace).  With at least two parts, the last part is taken as the
        candidate organization and everything before it as the title, then:

        * if the organization from ``self.get_org()`` occurs in the last
          part, the split is kept;
        * if it occurs in the leading part instead, the two are swapped;
        * otherwise they are swapped when the last part is more than half of
          the combined length or the leading part contains one of
          ``ORG_WORDS``.

        Comparisons ignore case and spaces.

        Returns:
            tuple: ``(title, org)``.  When the title cannot be split, the
            values of ``get_title()`` and ``get_org()`` are returned as is;
            otherwise the title is stripped and sentence-cased and the
            organization is stripped.
        """

        ORG_WORDS = ["blog", "lab", "center"]

        title = title_ori = self.get_title()
        info(f"title_ori = '{title_ori}'")
        org = org_ori = self.get_org()
        info(f"org_ori = '{org_ori}'")
        STRONG_DELIMTERS = re.compile(r"\s[\|—«»]\s")
        WEAK_DELIMITERS = re.compile(r"[:;-]\s")
        if STRONG_DELIMTERS.search(title_ori):
            info("STRONG_DELIMTERS")
            parts = STRONG_DELIMTERS.split(title_ori)
        else:
            info("WEAK_DELIMITERS")
            parts = WEAK_DELIMITERS.split(title_ori)
        info(f"parts = '{parts}'")
        if len(parts) >= 2:
            beginning, end = " : ".join(parts[0:-1]), parts[-1]
            title, org = beginning, end
            title_c14n = title.replace(" ", "").lower()
            org_c14n = org.replace(" ", "").lower()
            # Canonicalize the known organization the same way as the parts;
            # with its spaces left in, a multi-word organization could never
            # be found inside the space-stripped strings.
            org_ori_c14n = org_ori.replace(" ", "").lower()
            if org_ori_c14n in org_c14n:
                info("org_ori.lower() in org_c14n.lower(): pass")
                title, org = " ".join(parts[0:-1]), parts[-1]
            elif org_ori_c14n in title_c14n:
                info("org_ori.lower() in title_c14n: switch")
                title, org = parts[-1], " ".join(parts[0:-1])
            else:
                info(f"{beginning=}, {end=}")
                combined_len = len(beginning) + len(end)
                # Both halves can be empty (e.g. a title that is only a
                # delimiter); treat that as "end is not large".
                end_ratio = len(end) / combined_len if combined_len else 0.0
                info(
                    " end_ratio: %d / %d = %.2f"
                    % (len(end), combined_len, end_ratio)
                )
                # if beginning has org_word or end is large (>50%): switch
                # (ORG_WORDS are lowercase, so compare against lowercase text)
                beginning_lower = beginning.lower()
                if end_ratio > 0.5 or any(
                    word in beginning_lower for word in ORG_WORDS
                ):
                    info("ratio and org_word: switch")
                    title = end
                    org = beginning
            title = sentence_case(title.strip())
            org = org.strip()
        return title, org
Ejemplo n.º 6
0
    def get_biblio(self):
        """Return a biblio dict built from the book metadata for ``self.url``.

        The response of ``book_query.query(self.url)`` is copied field by
        field.  Keys starting with ``subject`` are ignored, values that are
        ``None``, ``[]`` or ``''`` are skipped, and a few keys are renamed
        (``year`` -> ``date``, ``pageCount`` -> ``pages``, ``city`` ->
        ``address``).  Only the first element of ``isbn`` is kept and
        ``url`` overrides both ``url`` and ``permalink``.

        A non-empty title has ``' : '`` tightened to ``': '`` and is
        sentence-cased.  When the response has no title, or only an empty
        one, the title is ``'UNKNOWN'``.

        Returns:
            dict: bibliographic fields; ``permalink``, ``excerpt``,
            ``comment`` and ``title`` are always present.
        """
        import book_query

        # Response keys that are stored under a different biblio key.
        renamed_keys = {
            'year': 'date',
            'pageCount': 'pages',
            'city': 'address',
        }

        info("url = %s" % self.url)
        json_bib = book_query.query(self.url)
        info("json_bib = '%s'" %json_bib)
        biblio = {
            'permalink' : self.url,
            'excerpt' : '',
            'comment' : self.comment,
        }
        info("### json_bib.items()")
        for key, value in json_bib.items():
            info("key = '%s'" %key)
            if key.startswith('subject'):
                continue
            info("key = '%s' value = '%s' type(value) = '%s'\n" %(
                key, value, type(value)))
            if value in (None, [], ''):
                continue  # empty fields are not copied
            if key == 'author':
                biblio['author'] = self.get_author(json_bib)
            elif key == 'isbn':
                biblio['isbn'] = value[0]
            elif key == 'url':
                biblio['url'] = value
                biblio['permalink'] = value
            else:
                biblio[renamed_keys.get(key, key)] = value

        # Look the title up in biblio, not json_bib: an empty title is skipped
        # by the loop above, so indexing biblio['title'] would raise KeyError.
        title = biblio.get('title')
        if title:
            biblio['title'] = sentence_case(title.replace(' : ', ': '))
        else:
            biblio['title'] = 'UNKNOWN'
        return biblio
Ejemplo n.º 7
0
 def split_title_org(self):
     '''Separate the title by a delimiter and test if latter half is the
     organization (if it has certain words (blog) or is too short)

     The title from ``self.get_title()`` is split on a "strong" delimiter
     (``|``, em dash or guillemets surrounded by whitespace) when one is
     present, otherwise on a "weak" one (``:``, ``;`` or ``-`` followed by
     whitespace).  With at least two parts, the last part is taken as the
     candidate organization and everything before it as the title, then:

     * if the organization from ``self.get_org()`` occurs in the last part,
       the split is kept;
     * if it occurs in the leading part instead, the two are swapped;
     * otherwise they are swapped when the last part is more than half of
       the combined length or the leading part contains one of ORG_WORDS.

     Comparisons ignore case and spaces.

     Returns a ``(title, org)`` tuple.  When the title cannot be split, the
     values of ``get_title()`` and ``get_org()`` are returned as is;
     otherwise the title is stripped and sentence-cased and the
     organization is stripped.
     '''

     ORG_WORDS = ['blog', 'lab', 'center']

     title = title_ori = self.get_title()
     info("title_ori = '%s'" %(title_ori))
     org = org_ori = self.get_org()
     info("org_ori = '%s'" %(org_ori))
     # Backslashes are doubled so the patterns keep the same value without
     # relying on invalid string escapes such as '\s'.
     STRONG_DELIMTERS = re.compile(u'\\s[\\|—«»]\\s')
     WEAK_DELIMITERS = re.compile(u'[:;-]\\s')
     if STRONG_DELIMTERS.search(title_ori):
         info("STRONG_DELIMTERS")
         parts = STRONG_DELIMTERS.split(title_ori)
     else:
         info("WEAK_DELIMITERS")
         parts = WEAK_DELIMITERS.split(title_ori)
     info("parts = '%s'" %(parts))
     if len(parts) >= 2:
         # Keep every leading part: taking only parts[0] silently dropped
         # the middle segments of titles that split into three or more.
         beginning, end = ' : '.join(parts[0:-1]), parts[-1]
         title, org = beginning, end
         title_c14n = title.replace(' ','').lower()
         org_c14n = org.replace(' ','').lower()
         # Canonicalize the known organization the same way as the parts;
         # with its spaces left in, a multi-word organization could never be
         # found inside the space-stripped strings.
         org_ori_c14n = org_ori.replace(' ','').lower()
         if org_ori_c14n in org_c14n:
             info("org_ori.lower() in org_c14n.lower(): pass")
             title, org = ' '.join(parts[0:-1]), parts[-1]
         elif org_ori_c14n in title_c14n:
             info("org_ori.lower() in title_c14n: switch")
             title, org = parts[-1], ' '.join(parts[0:-1])
         else:
             info("beginning = %s, end = %s" %(beginning, end))
             combined_len = len(beginning) + len(end)
             # Both halves can be empty (e.g. a title that is only a
             # delimiter); treat that as "end is not large".
             if combined_len:
                 end_ratio = float(len(end)) / combined_len
             else:
                 end_ratio = 0.0
             info(" end_ratio: %d / %d = %.2f" %(
                 len(end), combined_len, end_ratio))
             # if beginning has org_word or end is large (>50%): switch
             # (ORG_WORDS are lowercase, so compare against lowercase text)
             beginning_lower = beginning.lower()
             if end_ratio > 0.5 or \
                     any(word in beginning_lower for word in ORG_WORDS):
                 info("ratio and org_word: switch")
                 title = end
                 org = beginning
         title = sentence_case(title.strip())
         org = org.strip()
     return title, org
Ejemplo n.º 8
0
    def get_title(self):
        """Extract the page title from ``self.html_u``.

        The first entry of ``title_regexps`` whose prefix starts
        ``self.url`` supplies the regular expression; the last entry has an
        empty prefix and therefore always matches as the default.  The
        captured text is stripped, XML-unescaped, sentence-cased and passed
        through ``smart_punctuation_to_ascii``.

        Returns the cleaned title, or "UNKNOWN TITLE" when there is no HTML
        or the selected pattern does not match.
        """

        # Prefixes are compared with str.startswith, so they must be literal
        # URL prefixes: a trailing ".*" would be compared character for
        # character and the site-specific patterns would never be selected.
        # The patterns contain no backslashes, so a plain u'' literal has the
        # same value as the former ur'' one and also parses on Python 3.
        title_regexps = (
            ('http://lists.w3.org/', u'<!-- subject="(.*?)" -->'),
            ('http://lists.kde.org/', u"<title>MARC: msg '(.*?)'</title>"),
            ('', u'<title>(.*?)</title>')    # default: make sure last
        )

        # The empty default prefix guarantees the loop breaks with `regexp`
        # bound to a pattern.
        for prefix, regexp in title_regexps:
            if self.url.startswith(prefix):
                break

        title = "UNKNOWN TITLE"
        if self.html_u:
            tmatch = re.search(regexp, self.html_u, re.DOTALL|re.IGNORECASE)
            if tmatch:
                title = tmatch.group(1).strip()
                title = unescape_XML(title)
                title = sentence_case(title)
                title = smart_punctuation_to_ascii(title)
        return title
Ejemplo n.º 9
0
    def get_title(self):
        """Extract the page title from ``self.html_u``.

        The first entry of ``title_regexps`` whose prefix starts
        ``self.url`` supplies the regular expression; the last entry has an
        empty prefix and therefore always matches as the default.  The
        captured text is stripped, XML-unescaped, sentence-cased and passed
        through ``smart_to_markdown``.

        Returns:
            str: the cleaned title, or ``"UNKNOWN TITLE"`` when there is no
            HTML or the selected pattern does not match.
        """

        # Prefixes are compared with str.startswith, so they must be literal
        # URL prefixes: a trailing ".*" would be compared character for
        # character and the site-specific patterns would never be selected.
        title_regexps = (
            ("http://lists.w3.org/", '<!-- subject="(.*?)" -->'),
            ("http://lists.kde.org/", r"<title>MARC: msg '(.*?)'</title>"),
            ("https://www.youtube.com", r'''"title":"(.*?)"'''),
            ("", r"<title[^>]*>([^<]+)</title>"),  # default: make sure last
        )

        # The empty default prefix guarantees the loop breaks with `regexp`
        # bound to a pattern.
        for prefix, regexp in title_regexps:
            if self.url.startswith(prefix):
                info(f"{prefix=}")
                break

        title = "UNKNOWN TITLE"
        if self.html_u:
            tmatch = re.search(regexp, self.html_u, re.DOTALL | re.IGNORECASE)
            if tmatch:
                title = tmatch.group(1).strip()
                title = unescape_XML(title)
                title = sentence_case(title)
                title = smart_to_markdown(title)
        return title
Ejemplo n.º 10
0
    def get_biblio(self):
        """Return a biblio dict built from the DOI metadata for ``self.url``.

        The response of ``doi_query.query(self.url)`` is copied field by
        field; values that are ``None``, ``[]`` or ``''`` are skipped.  A few
        keys are renamed on the way (``issued`` -> ``date``, ``page`` ->
        ``pages``, ``container-title`` -> ``journal``, ``issue`` ->
        ``number``, ``URL`` -> ``url``).

        The title has runs of whitespace collapsed and is sentence-cased.
        When the response has no title, or only an empty one, the title is
        ``'UNKNOWN'``.

        Returns:
            dict: bibliographic fields; ``permalink``, ``excerpt``,
            ``comment`` and ``title`` are always present.
        """
        import doi_query

        # Response keys that are stored under a different biblio key.
        renamed_keys = {
            'page': 'pages',
            'container-title': 'journal',
            'issue': 'number',
            'URL': 'url',
        }

        info("url = %s" % self.url)
        json_bib = doi_query.query(self.url)
        biblio = {
            'permalink' : self.url,
            'excerpt' : '',
            'comment' : self.comment,
        }
        for key, value in json_bib.items():
            info("key = '%s' value = '%s' type(value) = '%s'" %(
                key, value, type(value)))
            if value in (None, [], ''):
                continue  # empty fields are not copied
            if key == 'author':
                biblio['author'] = self.get_author(json_bib)
            elif key == 'issued':
                biblio['date'] = self.get_date(json_bib)
            else:
                biblio[renamed_keys.get(key, key)] = value

        # Look the title up in biblio, not json_bib: an empty title is skipped
        # by the loop above, so indexing biblio['title'] would raise KeyError.
        title = biblio.get('title')
        if title:
            biblio['title'] = sentence_case(' '.join(title.split()))
        else:
            biblio['title'] = 'UNKNOWN'
        info("biblio = %s" % biblio)
        return biblio