Example #1
def q_daterange(start, end, now=None) :
    """Takes (startdate, enddate) and creates [$date(startdate),
    $date(enddate)]."""
    if now is None :
        now = ("value", datetime.datetime.now())
    now, start, end = require_type("value", now, start, end)
    start = fuzzydate.parse_date(start, now)
    end = fuzzydate.parse_date(end, now)
    return ("value", [("value", start), ("value", end)])
Example #2
def q_date(s=("value",""), now=None) :
    """Takes s="" and now=datetime.now() and gives
    fuzzydate.parse_date(s,now)."""
    if now is None :
        now = ("value", datetime.datetime.now())
    s, now = require_type("value", s, now)
    return ("value", fuzzydate.parse_date(s, now))
Example #3
def strip_date(s):
    """ remove all date/time bits from text """
    d,dspan = fuzzydate.parse_date(s)
    if dspan is not None:
        s = s[:dspan[0]] + s[dspan[1]:]

    t,tspan = fuzzydate.parse_time(s)
    if tspan is not None:
        s = s[:tspan[0]] + s[tspan[1]:]

    if tspan is not None or dspan is not None:
        # TODO: strip leftover "on" "at" etc...
        s = re.compile(r'\b(on|at|published|posted)\b[:]?',re.IGNORECASE).sub('',s)

    return s
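
In this variant, parse_date and parse_time return a (parsed, span) pair, where span is a (start, end) character range into the input (see the assertions in Example #6). The splice can be shown in isolation with a hard-coded span standing in for fuzzydate's output:

# Self-contained sketch of the span splice; the span is hard-coded
# where strip_date would get it from fuzzydate.parse_date.
s = 'posted on may 25th, 2011 by Alice'
dspan = (10, 24)                   # covers 'may 25th, 2011'
s = s[:dspan[0]] + s[dspan[1]:]    # -> 'posted on  by Alice'
# the final regex pass then removes the leftover 'posted' and 'on'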
Example #4
def filter_blob_metadata_dates(db, tags) :
    if "event" not in tags :
        raise DeferAction()
    else :
        # normalise the tag list and make sure it includes "event"
        tags["tag"] = list_drop(list_lift(tags.get("tag", [])) + ["event"])
        for field, depends, createp in event_date_fields :
            if depends in tags and type(tags[depends]) is datetime.datetime :
                if field in tags :
                    # parse the field's date text relative to the reference datetime
                    sdate = tags[field][0] if type(tags[field]) is list else tags[field]
                    try :
                        date = fuzzydate.parse_date(sdate, tags[depends])
                    except fuzzydate.DateFormatException as x :
                        date = "%s (DateFormatException: %r)" % (sdate, x.args)
                    tags[field] = date
                elif createp in tags and type(tags[createp]) is datetime.datetime :
                    # no date text given; fall back to the createp tag's datetime
                    tags[field] = tags[createp]

        raise ContinueWith(db, tags)
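
event_date_fields is defined elsewhere; the loop's unpacking implies it is a sequence of (field, depends, createp) triples: the tag whose text gets parsed, the tag supplying the reference datetime, and a fallback tag whose datetime is copied when the field is absent. Purely illustrative entries, with invented tag names:

# Invented example; only the three-tuple shape is implied by the code above.
event_date_fields = [
    ("start-date", "created", "default-start"),
    ("end-date", "created", "default-end"),
]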
Example #5
def parse_byline(candidate,all,headline_node):
    authors = []
    score = 0.0
    txt = util.render_text(candidate)
    txt = u' '.join(txt.split()).strip()
    if len(txt) > 200:
        return (authors,score)

    logging.debug("byline: consider <%s> '%s'"%(candidate.tag,txt[:75]))

#    if candidate.tag == 'a':
#        score += eval_author_link(candidate)

    # split up using html structure
    parts = util.iter_text(candidate)

    # pass 1: check for and strip out parts with dates & times
    # TODO: this is a bit ruthless - could lose names if in same block
    parts2 = []
    for txt,el in parts:
        is_pubdate_frag = False
        if pats.pubdate['pubdate_indicator'].search(txt):
            is_pubdate_frag = True

        t,dspan = fuzzydate.parse_date(txt)
        if dspan is not None:
            logging.debug("  +0.1 contains date")
            score += 0.1
            is_pubdate_frag = True

        d,tspan = fuzzydate.parse_time(txt)
        if tspan is not None:
            logging.debug("  +0.1 contains time")
            score += 0.1
            is_pubdate_frag = True

        if not is_pubdate_frag:
            parts2.append((txt,el))

    # pass 2: split up text on likely separators - "and" "in" or any non alphabetic chars...
    # (capturing patterns are included in results)
    split_pat = re.compile(r'((?:\b(?:and|with|in)\b)|(?:[^-_.\w\s]+))',re.IGNORECASE|re.UNICODE)
    parts3 = []
    for txt,el in parts2:
        fragments = split_pat.split(txt)
        for frag in fragments:
            parts3.append((frag.strip(),el))

    # pass three - split out indicatives ("by", "posted by" etc)
    parts4 = []
    for txt,el in parts3:
        for frag in pats.byline['indicative'].split(txt):
            parts4.append((frag,el))

    # clean up
    parts4 = [(txt.strip(),el) for txt,el in parts4]
    parts4 = [(txt,el) for txt,el in parts4 if txt!=u'']

    # now run through classifying and collecting authors
    authors,score = parse_byline_parts(parts4)

    # TEST: likely-looking class or id
    if pats.byline['classes'].search(candidate.get('class','')):
        logging.debug("  +1 likely class")
        score += 1.0
    if pats.byline['classes'].search(candidate.get('id','')):
        logging.debug("  +1 likely id")
        score += 1.0

    # TEST: directly after headline?
    foo = intervening(headline_node,candidate,all)
    if foo is not None:
        if len(foo) == 0:
            logging.debug("  +0.5 directly after headline")
            score += 0.5

    logging.debug( "  total: %.3f" % (score,))

    return (authors, score)
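
Because the outer group in split_pat captures, re.split keeps the separators in its output, which is what the "capturing patterns are included in results" comment means. A self-contained check:

import re

split_pat = re.compile(r'((?:\b(?:and|with|in)\b)|(?:[^-_.\w\s]+))',
                       re.IGNORECASE | re.UNICODE)
print(split_pat.split(u'Alice Smith and Bob Jones, in London'))
# ['Alice Smith ', 'and', ' Bob Jones', ',', ' ', 'in', ' London']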
Example #6
    def testSpans(self):
        got,span = parse_date('blah blah blah wibble foo, may 25th, 2011 some more crap here')
        self.assertEqual(span,(27,41))
 
        got,span = parse_date('wibble 25-01-2011 pibble')
        self.assertEqual(span,(7,17))
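
The spans are (start, end) offsets into the input string, so slicing with them recovers exactly the matched date text:

s = 'blah blah blah wibble foo, may 25th, 2011 some more crap here'
assert s[27:41] == 'may 25th, 2011'   # the span from the first assertion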