Example #1
import re

import fuzzydate   # project-local fuzzy date/time parsing module

def strip_date(s):
    """ remove all date/time bits from text """
    d, dspan = fuzzydate.parse_date(s)
    if dspan is not None:
        s = s[:dspan[0]] + s[dspan[1]:]

    t, tspan = fuzzydate.parse_time(s)
    if tspan is not None:
        s = s[:tspan[0]] + s[tspan[1]:]

    if tspan is not None or dspan is not None:
        # TODO: strip leftover "on", "at" etc...
        s = re.compile(r'\b(on|at|published|posted)\b[:]?', re.IGNORECASE).sub('', s)

    return s
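
A minimal usage sketch. The (value, (start, end)) return shape of
fuzzydate.parse_date()/parse_time() is inferred from the slicing above
(the span is None when nothing matches); the input string is made up for
illustration:

s = strip_date(u"Published on 12 March 2011 at 14:30 by Fred Bloggs")
# the date, the time and the leftover indicator words are removed,
# leaving roughly "by Fred Bloggs" (modulo extra whitespace)
print(s)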
Example #2
import re
import logging

# util, pats and fuzzydate are project-local modules; intervening() and
# parse_byline_parts() are helpers defined elsewhere in the same module.

def parse_byline(candidate, all_nodes, headline_node):
    authors = []
    score = 0.0
    txt = util.render_text(candidate)
    txt = u' '.join(txt.split()).strip()
    if len(txt) > 200:
        return (authors, score)

    logging.debug("byline: consider <%s> '%s'" % (candidate.tag, txt[:75]))

#    if candidate.tag == 'a':
#        score += eval_author_link(candidate)

    # split up using html structure
    parts = util.iter_text(candidate)

    # pass 1: check for and strip out parts with dates & times
    # TODO: this is a bit ruthless - could lose names if in same block
    parts2 = []
    for txt, el in parts:
        is_pubdate_frag = False
        if pats.pubdate['pubdate_indicator'].search(txt):
            is_pubdate_frag = True

        d, dspan = fuzzydate.parse_date(txt)
        if dspan is not None:
            logging.debug("  +0.1 contains date")
            score += 0.1
            is_pubdate_frag = True

        t, tspan = fuzzydate.parse_time(txt)
        if tspan is not None:
            logging.debug("  +0.1 contains time")
            score += 0.1
            is_pubdate_frag = True

        if not is_pubdate_frag:
            parts2.append((txt, el))

    # pass 2: split up text on likely separators - "and", "with", "in", or any
    # run of chars outside [-_.\w\s] (capturing groups are kept in the results)
    split_pat = re.compile(r'((?:\b(?:and|with|in)\b)|(?:[^-_.\w\s]+))',
                           re.IGNORECASE | re.UNICODE)
    parts3 = []
    for txt, el in parts2:
        fragments = split_pat.split(txt)
        for frag in fragments:
            parts3.append((frag.strip(), el))

    # pass 3: split out indicatives ("by", "posted by" etc)
    parts4 = []
    for txt, el in parts3:
        for frag in pats.byline['indicative'].split(txt):
            parts4.append((frag, el))

    # clean up: strip whitespace and drop empty fragments
    parts4 = [(txt.strip(), el) for txt, el in parts4]
    parts4 = [(txt, el) for txt, el in parts4 if txt != u'']

    # now run through classifying and collecting authors
    # (add onto the running score so the pass-1 date/time points count
    # towards the logged total, rather than being overwritten)
    authors, part_score = parse_byline_parts(parts4)
    score += part_score

    # TEST: likely-looking class or id
    if pats.byline['classes'].search(candidate.get('class', '')):
        logging.debug("  +1 likely class")
        score += 1.0
    if pats.byline['classes'].search(candidate.get('id', '')):
        logging.debug("  +1 likely id")
        score += 1.0

    # TEST: directly after headline?
    between = intervening(headline_node, candidate, all_nodes)
    if between is not None and len(between) == 0:
        logging.debug("  +0.5 directly after headline")
        score += 0.5

    logging.debug("  total: %.3f" % (score,))

    return (authors, score)
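
A sketch of how a caller might rank candidates with this scorer, assuming
the page is parsed with lxml (candidate.tag and candidate.get() match
lxml's element API); find_headline() is a hypothetical stand-in for
whatever headline detection the project actually uses:

import lxml.html

root = lxml.html.fromstring(open('article.html').read())  # hypothetical input
all_nodes = list(root.iter())
headline_node = find_headline(root)   # hypothetical helper, not defined above

# score every element and keep the best-scoring byline candidate
best_authors, best_score = [], 0.0
for cand in all_nodes:
    authors, score = parse_byline(cand, all_nodes, headline_node)
    if score > best_score:
        best_authors, best_score = authors, score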