def get_biblio(self):
    """Build a bibliography dict from a DOI query of self.url.

    Remaps CSL-JSON keys (issued, page, container-title, issue, URL)
    onto this project's biblio keys; any other non-empty key copies
    through unchanged. Empty values (None, [], "") are skipped.
    """
    info(f"url = {self.url}")
    json_bib = doi_query.query(self.url)
    info(f"{json_bib=}")
    biblio = {
        "permalink": self.url,
        "excerpt": "",
        "comment": self.comment,
    }
    for key, value in json_bib.items():
        info(f"{key=} {value=} {type(value)=}")
        if value in (None, [], ""):
            pass  # skip empty values
        elif key == "author":
            biblio["author"] = self.get_author(json_bib)
        elif key == "issued":
            biblio["date"] = self.get_date(json_bib)
        elif key == "page":
            biblio["pages"] = json_bib["page"]
        elif key == "container-title":
            biblio["journal"] = json_bib["container-title"]
        elif key == "issue":
            biblio["number"] = json_bib["issue"]
        elif key == "URL":
            biblio["permalink"] = biblio["url"] = json_bib["URL"]
        else:
            biblio[key] = json_bib[key]
    # BUG FIX: test membership in biblio, not json_bib -- if json_bib had a
    # "title" whose value was empty it was skipped above, and the old check
    # ('"title" not in json_bib') then raised KeyError on biblio["title"].
    if "title" not in biblio:
        biblio["title"] = "UNKNOWN"
    else:
        # collapse internal whitespace, then normalize casing
        biblio["title"] = sentence_case(" ".join(biblio["title"].split()))
    info(f"{biblio=}")
    return biblio
def get_biblio(self):
    """Build a bibliography dict from an arXiv query of self.identifier.

    arXiv results are typed as reports from the 'arXiv' organization;
    author/published/URL keys are remapped, any other non-empty key
    copies through unchanged. Empty values (None, [], "") are skipped.
    """
    info(f"url = {self.url}")
    dict_bib = arxiv_query.query(self.identifier)
    info(f"{dict_bib=}")
    biblio = {
        "entry_type": "report",
        "permalink": self.url,
        "excerpt": "",
        "organization": "arXiv",
        "identifier": self.identifier,
        "comment": self.comment,
    }
    for key, value in dict_bib.items():
        info(f"{key=} {value=} {type(value)=}")
        if value in (None, [], ""):
            pass  # skip empty values
        elif key == "author":
            biblio["author"] = self.get_author(dict_bib)
        elif key == "published":
            biblio["date"] = self.get_date(dict_bib)
        elif key == "URL":
            biblio["permalink"] = biblio["url"] = dict_bib["URL"]
        else:
            biblio[key] = dict_bib[key]
    # BUG FIX: test membership in biblio, not dict_bib -- an empty title
    # value is skipped above, and the old check then raised KeyError on
    # biblio["title"].
    if "title" not in biblio:
        biblio["title"] = "UNKNOWN"
    else:
        # collapse internal whitespace, then normalize casing
        biblio["title"] = sentence_case(" ".join(biblio["title"].split()))
    info(f"{biblio=}")
    return biblio
def get_biblio(self):
    """Build a bibliography dict from an ISBN query of self.url.

    Remaps query keys (year, isbn, pageCount, publisher, city, url)
    onto this project's biblio keys; 'subject*' keys are skipped; any
    other non-empty key copies through. Title and subtitle are
    sentence-cased.
    """
    import isbn_query

    info(f"url = {self.url}")
    json_bib = isbn_query.query(self.url)
    info(f"json_bib = '{json_bib}'")
    biblio = {
        "permalink": self.url,
        "excerpt": "",
        "comment": self.comment,
    }
    info("### json_bib.items()")
    for key, value in json_bib.items():
        info(f"key = '{key}'")
        if key.startswith("subject"):
            continue
        info("key = '%s' value = '%s' type(value) = '%s'\n" % (key, value, type(value)))
        if value in (None, [], ""):
            pass  # skip empty values
        elif key == "author":
            biblio["author"] = self.get_author(json_bib)
        elif key == "year":
            biblio["date"] = json_bib["year"]
        elif key == "isbn":
            biblio["isbn"] = json_bib["isbn"]
        elif key == "pageCount":
            biblio["pages"] = json_bib["pageCount"]
        elif key == "publisher":
            biblio["publisher"] = json_bib["publisher"]
        elif key == "city":
            biblio["address"] = json_bib["city"]
        elif key == "url":
            biblio["url"] = json_bib["url"]
            biblio["permalink"] = json_bib["url"]
        else:
            biblio[key] = json_bib[key]
    # BUG FIX: test membership in biblio, not json_bib -- an empty title
    # value is skipped above, and the old check then raised KeyError on
    # biblio["title"].
    if "title" in biblio:
        # BUG FIX: was replace(": ", ": "), a no-op; the older twin of this
        # method normalizes " : " to ": ", which is restored here.
        title = biblio["title"].replace(" : ", ": ")
        biblio["title"] = sentence_case(title)
        if "subtitle" in json_bib:
            biblio["subtitle"] = sentence_case(json_bib["subtitle"])
    else:
        biblio["title"] = "UNKNOWN"
    return biblio
def get_title(self):
    """Return a cleaned title for a reddit source.

    Subreddits are titled by their root name; posts and comments use
    the submission title from the cached JSON, sentence-cased. Any
    other type yields "UNKNOWN".
    """
    if self.type == "subreddit":
        result = self.url_dict["root"]
    elif self.type in ("post", "comment"):
        submission = self.json[0]["data"]["children"][0]["data"]
        result = sentence_case(submission["title"])
    else:
        result = "UNKNOWN"
    info(f"title={result!r}")
    return result.strip()
def split_title_org(self):
    """Separate the title by a delimiter and test if latter half is the
    organization (if it has certain words (blog) or is too short)"""
    ORG_WORDS = ["blog", "lab", "center"]
    title = title_ori = self.get_title()
    info(f"title_ori = '{title_ori}'")
    org = org_ori = self.get_org()
    info(f"org_ori = '{org_ori}'")
    # local typo fixed (DELIMTERS -> DELIMITERS); log strings left as-is
    STRONG_DELIMITERS = re.compile(r"\s[\|—«»]\s")
    WEAK_DELIMITERS = re.compile(r"[:;-]\s")
    if STRONG_DELIMITERS.search(title_ori):
        info("STRONG_DELIMTERS")
        parts = STRONG_DELIMITERS.split(title_ori)
    else:
        info("WEAK_DELIMITERS")
        parts = WEAK_DELIMITERS.split(title_ori)
    info(f"parts = '{parts}'")
    if len(parts) >= 2:
        beginning, end = " : ".join(parts[0:-1]), parts[-1]
        title, org = beginning, end
        # canonical forms: spaces stripped, lowercased
        title_c14n = title.replace(" ", "").lower()
        org_c14n = org.replace(" ", "").lower()
        if org_ori.lower() in org_c14n:  # org_c14n is already lowercase
            info("org_ori.lower() in org_c14n.lower(): pass")
            title, org = " ".join(parts[0:-1]), parts[-1]
        elif org_ori.lower() in title_c14n:
            info("org_ori.lower() in title_c14n: switch")
            title, org = parts[-1], " ".join(parts[0:-1])
        else:
            info(f"{beginning=}, {end=}")
            end_ratio = float(len(end)) / len(beginning + end)
            info(
                " end_ratio: %d / %d = %.2f"
                % (len(end), len(beginning + end), end_ratio)
            )
            # if beginning has org_word or end is large (>50%): switch
            # BUG FIX: lowercase `beginning` for the comparison (ORG_WORDS
            # are already lowercase); previously "Blog ..." never matched.
            if end_ratio > 0.5 or any(
                word in beginning.lower() for word in ORG_WORDS
            ):
                info("ratio and org_word: switch")
                title = end
                org = beginning
    title = sentence_case(title.strip())
    org = org.strip()
    return title, org
def get_biblio(self):
    """Build a bibliography dict from a book query of self.url.

    Remaps query keys (year, isbn, pageCount, publisher, city, url)
    onto this project's biblio keys; 'subject*' keys are skipped; any
    other non-empty key copies through unchanged.
    """
    import book_query
    info("url = %s" % self.url)
    json_bib = book_query.query(self.url)
    info("json_bib = '%s'" % json_bib)
    biblio = {
        'permalink': self.url,
        'excerpt': '',
        'comment': self.comment,
    }
    info("### json_bib.items()")
    for key, value in json_bib.items():
        info("key = '%s'" % key)
        if key.startswith('subject'):
            continue
        info("key = '%s' value = '%s' type(value) = '%s'\n" % (
            key, value, type(value)))
        if value in (None, [], ''):
            pass  # skip empty values
        elif key == 'author':
            biblio['author'] = self.get_author(json_bib)
        elif key == 'year':
            biblio['date'] = json_bib['year']
        elif key == 'isbn':
            biblio['isbn'] = json_bib['isbn'][0]
        elif key == 'pageCount':
            biblio['pages'] = json_bib['pageCount']
        elif key == 'publisher':
            biblio['publisher'] = json_bib['publisher']
        elif key == 'city':
            biblio['address'] = json_bib['city']
        elif key == 'url':
            biblio['url'] = json_bib['url']
            biblio['permalink'] = json_bib['url']
        else:
            biblio[key] = json_bib[key]
    # BUG FIX: test membership in biblio, not json_bib -- an empty title
    # value is skipped above, and the old check then raised KeyError on
    # biblio['title'].
    if 'title' not in biblio:
        biblio['title'] = 'UNKNOWN'
    else:
        title = biblio['title'].replace(' : ', ': ')
        biblio['title'] = sentence_case(title)
    return biblio
def split_title_org(self):
    '''Separate the title by a delimiter and test if latter half is the
    organization (if it has certain words (blog) or is too short)'''
    ORG_WORDS = ['blog', 'lab', 'center']
    title = title_ori = self.get_title()
    info("title_ori = '%s'" % (title_ori))
    org = org_ori = self.get_org()
    info("org_ori = '%s'" % (org_ori))
    # BUG FIX: raw strings -- '\s' and '\|' in a plain (u'') literal are
    # invalid escape sequences in Python 3.
    STRONG_DELIMTERS = re.compile(r'\s[\|—«»]\s')
    WEAK_DELIMITERS = re.compile(r'[:;-]\s')
    if STRONG_DELIMTERS.search(title_ori):
        info("STRONG_DELIMTERS")
        parts = STRONG_DELIMTERS.split(title_ori)
    else:
        info("WEAK_DELIMITERS")
        parts = WEAK_DELIMITERS.split(title_ori)
    info("parts = '%s'" % (parts))
    if len(parts) >= 2:
        beginning, end = parts[0], parts[-1]
        title, org = beginning, end
        # canonical forms: spaces stripped, lowercased
        title_c14n = title.replace(' ', '').lower()
        org_c14n = org.replace(' ', '').lower()
        if org_ori.lower() in org_c14n:  # org_c14n is already lowercase
            info("org_ori.lower() in org_c14n.lower(): pass")
            title, org = ' '.join(parts[0:-1]), parts[-1]
        elif org_ori.lower() in title_c14n:
            info("org_ori.lower() in title_c14n: switch")
            title, org = parts[-1], ' '.join(parts[0:-1])
        else:
            info("beginning = %s, end = %s" % (beginning, end))
            end_ratio = float(len(end)) / len(beginning + end)
            info(" end_ratio: %d / %d = %.2f" % (
                len(end), len(beginning + end), end_ratio))
            # if beginning has org_word or end is large (>50%): switch
            # BUG FIX: lowercase `beginning` for the comparison (ORG_WORDS
            # are already lowercase); previously 'Blog ...' never matched.
            if end_ratio > 0.5 or \
                    any(word in beginning.lower() for word in ORG_WORDS):
                info("ratio and org_word: switch")
                title = end
                org = beginning
    title = sentence_case(title.strip())
    org = org.strip()
    return title, org
def get_title(self):
    """Scrape a title from self.html_u using a URL-specific regex.

    The first prefix that self.url starts with wins; the '' entry is
    the generic <title> fallback and must remain last, since every URL
    startswith('').
    """
    title_regexps = (
        ('http://lists.w3.org/.*', '<!-- subject="(.*?)" -->'),
        # BUG FIX: the 'ur"..."' prefix is a SyntaxError in Python 3; raw
        # strings are already unicode there, so plain r"..." is equivalent.
        ('http://lists.kde.org/.*', r"<title>MARC: msg '(.*?)'</title>"),
        ('', r'<title>(.*?)</title>')  # default: make sure last
    )
    for prefix, regexp in title_regexps:
        if self.url.startswith(prefix):
            break
    title = "UNKNOWN TITLE"
    if self.html_u:
        tmatch = re.search(regexp, self.html_u, re.DOTALL | re.IGNORECASE)
        if tmatch:
            title = tmatch.group(1).strip()
            title = unescape_XML(title)
            title = sentence_case(title)
            title = smart_punctuation_to_ascii(title)
    return title
def get_title(self):
    """Scrape a title from self.html_u using a URL-specific regex.

    The first prefix that self.url starts with wins; the "" entry is
    the generic <title> fallback and must remain last, since every URL
    startswith("").
    """
    title_regexps = (
        ("http://lists.w3.org/.*", '<!-- subject="(.*?)" -->'),
        ("http://lists.kde.org/.*", r"<title>MARC: msg '(.*?)'</title>"),
        ("https://www.youtube.com", r'''"title":"(.*?)"'''),
        ("", r"<title[^>]*>([^<]+)</title>"),  # default: make sure last
    )
    regexp = title_regexps[-1][1]  # generic fallback; loop always breaks
    for prefix, pattern in title_regexps:
        if self.url.startswith(prefix):
            info(f"{prefix=}")
            regexp = pattern
            break
    title = "UNKNOWN TITLE"
    if self.html_u:
        found = re.search(regexp, self.html_u, re.DOTALL | re.IGNORECASE)
        if found:
            raw = found.group(1).strip()
            title = smart_to_markdown(sentence_case(unescape_XML(raw)))
    return title
def get_biblio(self):
    """Build a bibliography dict from a DOI query of self.url.

    Remaps CSL-JSON keys (issued, page, container-title, issue, URL)
    onto this project's biblio keys; any other non-empty key copies
    through unchanged. Empty values (None, [], '') are skipped.
    """
    import doi_query
    info("url = %s" % self.url)
    json_bib = doi_query.query(self.url)
    biblio = {
        'permalink': self.url,
        'excerpt': '',
        'comment': self.comment,
    }
    for key, value in json_bib.items():
        info("key = '%s' value = '%s' type(value) = '%s'" % (
            key, value, type(value)))
        if value in (None, [], ''):
            pass  # skip empty values
        elif key == 'author':
            biblio['author'] = self.get_author(json_bib)
        elif key == 'issued':
            biblio['date'] = self.get_date(json_bib)
        elif key == 'page':
            biblio['pages'] = json_bib['page']
        elif key == 'container-title':
            biblio['journal'] = json_bib['container-title']
        elif key == 'issue':
            biblio['number'] = json_bib['issue']
        elif key == 'URL':
            biblio['url'] = json_bib['URL']
        else:
            biblio[key] = json_bib[key]
    # BUG FIX: test membership in biblio, not json_bib -- an empty title
    # value is skipped above, and the old check then raised KeyError on
    # biblio['title'].
    if 'title' not in biblio:
        biblio['title'] = 'UNKNOWN'
    else:
        biblio['title'] = sentence_case(' '.join(
            biblio['title'].split()))
    info("biblio = %s" % biblio)
    return biblio