def q_daterange(start, end, now=None):
    """Takes (startdate, enddate) and creates [$date(startdate), $date(enddate)]."""
    # Anchor fuzzy parsing at "now" unless the caller supplied a reference point.
    if now is None:
        now = ("value", datetime.datetime.now())
    now, start, end = require_type("value", now, start, end)
    # Parse both endpoints against the same reference datetime (start first).
    parsed = [fuzzydate.parse_date(endpoint, now) for endpoint in (start, end)]
    return ("value", [("value", endpoint) for endpoint in parsed])
def q_date(s=("value", ""), now=None):
    """Takes s="" and now=datetime.now() and gives fuzzydate.parse_date(s,now)."""
    # Default the reference point to the current moment, wrapped as a value.
    if now is None:
        now = ("value", datetime.datetime.now())
    s, now = require_type("value", s, now)
    parsed = fuzzydate.parse_date(s, now)
    return ("value", parsed)
def strip_date(s):
    """Remove all date/time bits from text."""
    # Cut out the date span first, then look for a time in what remains.
    _, dspan = fuzzydate.parse_date(s)
    if dspan is not None:
        s = s[:dspan[0]] + s[dspan[1]:]
    _, tspan = fuzzydate.parse_time(s)
    if tspan is not None:
        s = s[:tspan[0]] + s[tspan[1]:]
    # Only scrub connective words when something was actually removed.
    if dspan is not None or tspan is not None:
        # TODO: strip leftover "on" "at" etc...
        s = re.sub(r'\b(on|at|published|posted)\b[:]?', '', s, flags=re.IGNORECASE)
    return s
def filter_blob_metadata_dates(db, tags):
    # Normalise date metadata for blobs tagged "event".  Control flow is via
    # exceptions: DeferAction for non-events, ContinueWith to hand the
    # (possibly modified) tags back — presumably to a processing pipeline;
    # TODO confirm against the framework that defines these exceptions.
    if "event" not in tags:
        raise DeferAction()
    else:
        # Make sure "event" appears in the tag list (list_lift/list_drop look
        # like lift-to-list / deduplicate helpers — verify at their definition).
        tags["tag"] = list_drop(list_lift(tags.get("tag", [])) + ["event"])
    # event_date_fields entries are (field, depends, createp):
    #   field   — the date field to normalise,
    #   depends — key whose datetime value anchors fuzzy parsing,
    #   createp — fallback key copied into field when field is absent.
    for field, depends, createp in event_date_fields:
        if depends in tags and type(tags[depends]) is datetime.datetime:
            if field in tags:
                # The field may be stored as a list; take the first entry.
                sdate = tags[field][0] if type(tags[field]) is list else tags[field]
                try:
                    date = fuzzydate.parse_date(sdate, tags[depends])
                except fuzzydate.DateFormatException as x:
                    # Keep the unparsable text, annotated with the error, so
                    # the failure is visible downstream instead of being lost.
                    date = "%s (DateFormatException: %r)" % (sdate, x.args)
                tags[field] = date
            elif createp in tags and type(tags[createp]) is datetime.datetime:
                # No explicit value: fall back to the creation datetime.
                tags[field] = tags[createp]
    raise ContinueWith(db, tags)
def parse_byline(candidate,all,headline_node):
    """Score *candidate* as a possible byline element.

    Returns (authors, score) where authors comes from parse_byline_parts()
    and score accumulates heuristic evidence (dates/times present, likely
    class/id attributes, adjacency to the headline).

    NOTE(review): a second, PEP8-reformatted definition of parse_byline with
    identical behaviour appears later in this file; at import time that later
    definition shadows this one.  One of the two copies should be removed.
    """
    authors = []
    score = 0.0
    txt = util.render_text(candidate)
    # Collapse all runs of whitespace to single spaces.
    txt = u' '.join(txt.split()).strip()
    # Bylines are short; anything this long cannot be one.
    if len(txt) > 200:
        return (authors,score)
    logging.debug("byline: consider <%s> '%s'"%(candidate.tag,txt[:75]))

#    if candidate.tag == 'a':
#        score += eval_author_link(candidate)

    # split up using html structure
    parts = util.iter_text(candidate)

    # pass 1: check for and strip out parts with dates & times
    # TODO: this is a bit ruthless - could lose names if in same block
    parts2 = []
    for txt,el in parts:
        is_pubdate_frag = False
        if pats.pubdate['pubdate_indicator'].search(txt):
            is_pubdate_frag = True
        t,dspan = fuzzydate.parse_date(txt)
        if dspan is not None:
            logging.debug(" +0.1 contains date")
            score += 0.1
            is_pubdate_frag = True
        d,tspan = fuzzydate.parse_time(txt)
        if tspan is not None:
            logging.debug(" +0.1 contains time")
            score += 0.1
            is_pubdate_frag = True
        if not is_pubdate_frag:
            parts2.append((txt,el))

    # pass 2: split up text on likely separators - "and" "in" or any non alphabetic chars...
    # (capturing patterns are included in results)
    split_pat = re.compile(r'((?:\b(?:and|with|in)\b)|(?:[^-_.\w\s]+))',re.IGNORECASE|re.UNICODE)
    parts3 = []
    for txt,el in parts2:
        fragments = split_pat.split(txt)
        for frag in fragments:
            parts3.append((frag.strip(),el))

    # pass three - split out indicatives ("by", "posted by" etc)
    parts4 = []
    for txt,el in parts3:
        for frag in pats.byline['indicative'].split(txt):
            parts4.append((frag,el))

    # clean up
    parts4 = [(txt.strip(),el) for txt,el in parts4]
    parts4 = [(txt,el) for txt,el in parts4 if txt!=u'']

    # now run through classifying and collecting authors
    authors,score = parse_byline_parts(parts4)

    # TEST: likely-looking class or id
    if pats.byline['classes'].search(candidate.get('class','')):
        logging.debug(" +1 likely class")
        score += 1.0
    if pats.byline['classes'].search(candidate.get('id','')):
        logging.debug(" +1 likely id")
        score += 1.0

    # TEST: directly after headline?
    foo = intervening(headline_node,candidate,all)
    if foo is not None:
        # Empty list means no nodes sit between headline and candidate.
        if len(foo) == 0:
            logging.debug(" +0.5 directly after headline")
            score += 0.5
    logging.debug( " total: %.3f" % (score,))
    return (authors, score)
def parse_byline(candidate, all, headline_node):
    """Score *candidate* as a possible byline element.

    Returns (authors, score) where authors comes from parse_byline_parts()
    and score accumulates heuristic evidence (dates/times present, likely
    class/id attributes, adjacency to the headline).

    NOTE(review): this is a PEP8-reformatted duplicate of an earlier,
    behaviourally identical parse_byline definition in this file; being
    later, this one wins at import time.  One of the two copies should be
    removed.
    """
    authors = []
    score = 0.0
    txt = util.render_text(candidate)
    # Collapse all runs of whitespace to single spaces.
    txt = u' '.join(txt.split()).strip()
    # Bylines are short; anything this long cannot be one.
    if len(txt) > 200:
        return (authors, score)
    logging.debug("byline: consider <%s> '%s'" % (candidate.tag, txt[:75]))

#    if candidate.tag == 'a':
#        score += eval_author_link(candidate)

    # split up using html structure
    parts = util.iter_text(candidate)

    # pass 1: check for and strip out parts with dates & times
    # TODO: this is a bit ruthless - could lose names if in same block
    parts2 = []
    for txt, el in parts:
        is_pubdate_frag = False
        if pats.pubdate['pubdate_indicator'].search(txt):
            is_pubdate_frag = True
        t, dspan = fuzzydate.parse_date(txt)
        if dspan is not None:
            logging.debug(" +0.1 contains date")
            score += 0.1
            is_pubdate_frag = True
        d, tspan = fuzzydate.parse_time(txt)
        if tspan is not None:
            logging.debug(" +0.1 contains time")
            score += 0.1
            is_pubdate_frag = True
        if not is_pubdate_frag:
            parts2.append((txt, el))

    # pass 2: split up text on likely separators - "and" "in" or any non alphabetic chars...
    # (capturing patterns are included in results)
    split_pat = re.compile(r'((?:\b(?:and|with|in)\b)|(?:[^-_.\w\s]+))', re.IGNORECASE | re.UNICODE)
    parts3 = []
    for txt, el in parts2:
        fragments = split_pat.split(txt)
        for frag in fragments:
            parts3.append((frag.strip(), el))

    # pass three - split out indicatives ("by", "posted by" etc)
    parts4 = []
    for txt, el in parts3:
        for frag in pats.byline['indicative'].split(txt):
            parts4.append((frag, el))

    # clean up
    parts4 = [(txt.strip(), el) for txt, el in parts4]
    parts4 = [(txt, el) for txt, el in parts4 if txt != u'']

    # now run through classifying and collecting authors
    authors, score = parse_byline_parts(parts4)

    # TEST: likely-looking class or id
    if pats.byline['classes'].search(candidate.get('class', '')):
        logging.debug(" +1 likely class")
        score += 1.0
    if pats.byline['classes'].search(candidate.get('id', '')):
        logging.debug(" +1 likely id")
        score += 1.0

    # TEST: directly after headline?
    foo = intervening(headline_node, candidate, all)
    if foo is not None:
        # Empty list means no nodes sit between headline and candidate.
        if len(foo) == 0:
            logging.debug(" +0.5 directly after headline")
            score += 0.5
    logging.debug(" total: %.3f" % (score, ))
    return (authors, score)
def testSpans(self):
    """parse_date reports the (start, end) character span of the matched date."""
    cases = [
        ('blah blah blah wibble foo, may 25th, 2011 some more crap here', (27, 41)),
        ('wibble 25-01-2011 pibble', (7, 17)),
    ]
    for text, expected in cases:
        _, span = parse_date(text)
        self.assertEqual(span, expected)