def process_entry (entry):
    """Return a dict of UpLib metadata drawn from the elements in this entry.

    :param entry: the FeedParser entry, a dictionary of metadata about the entry
    :type entry: dict (feedparser.FeedParserDict)
    :return: a dictionary of UpLib metadata about the entry, or None if the
        entry has no link or appears to be an ad.  When a dict is returned,
        the "original-url" metadata field is guaranteed to be present.
    :rtype: dict or None
    """
    from uplib.webutils import parse_URL

    d = {}
    if "link" not in entry:
        return None
    # prefer the original link over the (possibly feed-proxied) link
    link = entry.get("origlink") or entry.get("link")
    # some elementary ad filtering
    host, port, path = parse_URL(link)
    if host.startswith("ads."):
        return None
    d["original-url"] = link
    # NOTE: membership tests use "in" throughout (FeedParserDict supports it),
    # consistent with the "link" test above, instead of the deprecated has_key()
    if "title" in entry:
        d["title"] = HTMLENTITIES.sub(deescape_html, entry.get("title"))
    if "summary" in entry:
        summary = HTMLENTITIES.sub(deescape_html, entry.get("summary"))
        # collapse all whitespace (including newlines) to single spaces
        summary = re.sub(r"\s", " ", summary)
        # truncate at the first markup tag, if any
        if '<' in summary:
            summary = summary[:summary.index('<')]
        d["abstract"] = summary
        d["summary"] = summary
    author = None
    if "author_detail" in entry and "name" in entry.get("author_detail"):
        author = entry.get("author_detail").get("name")
    elif "author" in entry:
        author = entry.author
    if author:
        # ny times does bylines strangely
        if host.endswith("nytimes.com"):
            if author.startswith("By "):
                author = author[3:]
            # capitalize properly
            author = author.title()
            # lowercase "And"
            author = author.replace(" And ", " and ")
        d["authors"] = author
    # pick the best available parsed timestamp, in preference order
    if "updated_parsed" in entry:
        date = entry.updated_parsed
    elif "published_parsed" in entry:
        date = entry.published_parsed
    elif "created_parsed" in entry:
        date = entry.created_parsed
    else:
        date = None
    if date:
        # month/day/year, matching the date format used elsewhere in this module
        d["date"] = "%s/%s/%s" % (date[1], date[2], date[0])
        d["rss-timestamp"] = str(int(time.mktime(date)))
    d["rss-id"] = entry.get("id") or entry.get("guid") or entry.get("link")
    return d
def rip (self, folder, docid):
    """Augment the metadata of a cached NY Times article with data ripped
    from the saved original HTML.

    Reads ``metadata.txt`` and ``originals/original.html`` under *folder*,
    and rewrites ``metadata.txt`` with title/date/authors/keywords/summary
    extracted from the article's embedded metadata.  A no-op unless the
    document's original-url is on www.nytimes.com.

    :param folder: the document folder to examine
    :param docid: the document's ID (unused here)
    """

    def encodestring(s):
        # nytimes strings have xml char refs, and we want Unicode
        if not s:
            return s
        s = re.sub(r"&#([0-9]+);", lambda m: unichr(int(m.group(1))), s)

        # name2codepoint is a mapping, not a callable; look up the
        # codepoint and convert it to a character (the old code called
        # it as a function, which raised TypeError).  Unknown entity
        # names are left untouched.
        def _named_entity(m):
            cp = htmlentitydefs.name2codepoint.get(m.group(1))
            return unichr(cp) if cp is not None else m.group(0)

        s = re.sub(r"&([a-z]+);", _named_entity, s)
        return s

    mdpath = os.path.join(folder, "metadata.txt")
    originalspath = os.path.join(folder, "originals", "original.html")
    if not (os.path.exists(mdpath) and os.path.exists(originalspath)):
        return
    md = read_metadata(mdpath)
    url = md.get("original-url")
    if not url:
        return
    host, port, path = parse_URL(url)
    if host != "www.nytimes.com":
        return
    # OK, it's from the NY Times
    new_metadata = MetadataGatherer.parse(originalspath)
    if "source" not in md:
        md["source"] = "New York Times"
    # not all articles have metadata...
    if not ((('title' in new_metadata) or ('hdl' in new_metadata)) and
            ('pdate' in new_metadata)):
        note(3, "No metadata in article: %s", new_metadata)
        return
    md["title"] = encodestring(new_metadata.get("hdl") or md.get("title"))
    if "date" not in md:
        # get the date -- pdate is formatted YYYYMMDD
        d = new_metadata.get("pdate")
        md["date"] = "%s/%s/%s" % (d[4:6], d[6:], d[:4])
    if "authors" not in md:
        # get the byline
        d = new_metadata.get("byl")
        if d:
            if d.startswith("By "):
                d = d[3:]
            # capitalize properly
            d = d.title()
            # lowercase "And"
            d = d.replace(" And ", " and ")
            md["authors"] = encodestring(d)
    # merge ripped keywords with any pre-existing ones; guard against a
    # missing ripped "keywords" field (the old code concatenated None here)
    d = new_metadata.get("keywords")
    d0 = md.get("keywords")
    if d and d0:
        d0 += ("," + d)
    elif d:
        d0 = d
    if d0:
        md["keywords"] = encodestring(d0)
    if new_metadata.get("description"):
        md["summary"] = encodestring(new_metadata.get("description"))
    update_metadata(mdpath, md)
def rip(self, folder, docid):
    """Augment the metadata of a cached Washington Post article with data
    ripped from the saved original HTML.

    Reads ``metadata.txt`` and ``originals/original.html`` under *folder*,
    scans the HTML for Post-specific headline/author/content-id/section
    markers, and rewrites ``metadata.txt`` accordingly.  A no-op unless
    the document's original-url is on www.washingtonpost.com.

    :param folder: the document folder to examine
    :param docid: the document's ID (unused here)
    """

    def encodestring(s):
        # WashPost strings have xml char refs, and we want Unicode
        if not s:
            return s
        s = re.sub(r"&#([0-9]+);", lambda m: unichr(int(m.group(1))), s)

        # name2codepoint is a mapping, not a callable; look up the
        # codepoint and convert it to a character (the old code called
        # it as a function, which raised TypeError).  Unknown entity
        # names are left untouched.
        def _named_entity(m):
            cp = htmlentitydefs.name2codepoint.get(m.group(1))
            return unichr(cp) if cp is not None else m.group(0)

        s = re.sub(r"&([a-z]+);", _named_entity, s)
        return s

    def dequote(s):
        # un-escape backslash-escaped single quotes
        return re.sub(r"\\'", "'", s)

    def catclean(s):
        # category names can't contain slashes or commas
        return re.sub(r"[/,]", "_", s)

    mdpath = os.path.join(folder, "metadata.txt")
    originalspath = os.path.join(folder, "originals", "original.html")
    if not (os.path.exists(mdpath) and os.path.exists(originalspath)):
        return
    md = read_metadata(mdpath)
    url = md.get("original-url")
    if not url:
        return
    host, port, path = parse_URL(url)
    if host != "www.washingtonpost.com":
        return
    # OK, it's from the Post
    new_metadata = MetadataGatherer.parse(originalspath)
    # scan the raw HTML for Post-specific metadata markers; close the
    # file explicitly (the old code leaked the handle)
    fp = open(originalspath)
    try:
        for line in fp:
            if line.startswith(_HEADLINE):
                line = line[len(_HEADLINE):].strip("\n")
                t = _TITLEPATTERN.match(line)
                if t:
                    new_metadata["hdl"] = dequote(t.group("title"))
                m = _AUTHORSPATTERN.search(line)
                if m:
                    new_metadata["authors"] = dequote(line[len(m.group(0)):].strip(" ';\n"))
            if line.startswith(_CONTENTID):
                new_metadata["content-id"] = line[len(_CONTENTID):].strip(" ';\n")
            if line.startswith(_SECTION):
                section = line[len(_SECTION):].strip(" ';\n")
                # keep only the part before the first quote, if any
                # (the old code used index("'"), which raised ValueError
                # when no quote was present)
                new_metadata["section"] = section.partition("'")[0]
    finally:
        fp.close()
    if "source" not in md:
        md["source"] = "Washington Post"
    # not all articles have metadata...
    if not ("hdl" in new_metadata):
        note(3, "No metadata in article: %s", new_metadata)
        return
    md["title"] = encodestring(new_metadata.get("hdl") or md.get("title"))
    if "date" not in md:
        # get the date from the URL itself
        d = _URLDATEPATTERN.match(url)
        if d:
            md["date"] = "%s/%s/%s" % (d.group("month"), d.group("day"), d.group("year"))
    if "authors" not in md:
        # get the byline
        d = new_metadata.get("authors")
        if d:
            md["authors"] = encodestring(d)
    # merge ripped keywords (semicolon-separated) with any pre-existing
    # comma-separated ones
    d = new_metadata.get("keywords")
    d0 = md.get("keywords")
    if d and d0:
        d0 = [x.strip() for x in d0.split(",")] + [x.strip() for x in d.split(";")]
    elif d:
        d0 = [x.strip() for x in d.split(";")]
    if d0:
        md["keywords"] = encodestring(",".join(d0))
    if new_metadata.get("description"):
        md["summary"] = encodestring(new_metadata.get("description"))
        md["abstract"] = encodestring(new_metadata.get("description"))
    section = new_metadata.get("section")
    if section:
        c = md.get("categories")
        if c:
            c = [x.strip() for x in c.split(",")]
        else:
            c = []
        c = c + ["article", "Washington Post/%s" % catclean(section)]
        md["categories"] = ",".join(c)
    content_id = new_metadata.get("content-id")
    if content_id:
        md["citation"] = "Washington Post article %s" % content_id
    update_metadata(mdpath, md)