Example #1
0
def extract_date(txt):
    """Parse a fuzzy date/time out of txt, returning a datetime or None.

    Components missing from the parsed text are filled in from a default
    fuzzydate (first of the month, midnight).  Returns None when txt does
    not yield enough information to build a datetime.
    """
    # TODO: provide default timezone based on guessed country (prob from domain name)
    filler = fuzzydate.fuzzydate( day=1, hour=0, minute=0, second=0, microsecond=0)
    fd = fuzzydate.parse_datetime(txt)
    try:
        return fd.datetime(filler)
    except Exception:
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; only ordinary failures should map to None.
        return None
Example #2
0
def extract_date(txt):
    """Parse a fuzzy date/time out of txt, returning a datetime or None.

    Components missing from the parsed text are filled in from a default
    fuzzydate (first of the month, midnight).  Returns None when txt does
    not yield enough information to build a datetime.
    """
    # TODO: provide default timezone based on guessed country (prob from domain name)
    filler = fuzzydate.fuzzydate(day=1,
                                 hour=0,
                                 minute=0,
                                 second=0,
                                 microsecond=0)
    fd = fuzzydate.parse_datetime(txt)
    try:
        return fd.datetime(filler)
    except Exception:
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; only ordinary failures should map to None.
        return None
Example #3
0
    def testExamplesFromWild(self):
        """Check parse_datetime against real-world example strings.

        Each entry of self.examples_from_the_wild is a pair
        (input_text, expected) where expected is either None or a tuple of
        arguments for datetime.datetime().
        """
        # defaults for components the fuzzy parser leaves unset
        filler = fuzzydate(month=1, day=1, hour=0, minute=0, second=0, microsecond=0)

        for example in self.examples_from_the_wild:
            fuzzy = parse_datetime(example[0])
            got = fuzzy.datetime(filler)
            if example[1] is not None:
                expected = datetime.datetime(*example[1])
            else:
                expected = None

            # FIX: failure message had a stray unbalanced ")" at the end
            self.assertEqual(got, expected,
                             "'%s': expected '%s', got '%s'" % (example[0], expected, got))
Example #4
0
def extract_pubdate(doc, url, headline_linenum):
    """ returns date,linenum """
    logging.debug("extracting pubdate")

    # TODO: try some definitive meta tags first?
    # "DCSext.articleFirstPublished"
    # "DC.date.issued"
    # "last-modified"

    # a date embedded in the url slug wins outright
    for slug_pat in pats.pubdate['url_datefmts']:
        hit = slug_pat.search(url)
        if hit is None:
            continue
        slug_date = datetime.datetime(int(hit.group('year')),
                                      int(hit.group('month')),
                                      int(hit.group('day')))
        logging.debug("  using %s from url" % (slug_date,))
        return slug_date, None

    # collect dates from promising <meta> tags; used below to corroborate
    # in-page candidates (day=1 fills a missing day component)
    meta_dates = set()
    for meta in doc.findall('.//meta'):
        tagname = meta.get('name', meta.get('property', ''))
        if not pats.pubdate['metatags'].search(tagname):
            continue
        logging.debug(" date: consider meta name='%s' content='%s'" % (tagname,meta.get('content','')))
        fuzzy = fuzzydate.parse_datetime(meta.get('content', ''))
        if not fuzzy.empty_date():
            meta_dates.add(fuzzy.date(fuzzydate.fuzzydate(day=1)))

    # scan likely text-bearing elements, scoring every parseable date
    candidates = {}
    for node in util.tags(doc, 'p', 'span', 'div', 'li', 'td', 'th',
                          'h4', 'h5', 'h6', 'font'):
        # collapse all runs of whitespace to single spaces
        txt = u' '.join(unicode(node.text_content()).strip().split())

        # discard anything too short or long
        if not (6 <= len(txt) <= 150):
            continue

        dt = extract_date(txt)
        if dt is None:
            continue
        logging.debug(" date: considering %s '%s'" % (node.tag, txt))

        score = 1

        # TEST: proximity to headline in html
        if headline_linenum > 0 and node.sourceline > 0:
            offset = node.sourceline - headline_linenum
            if -10 < offset < 25:
                logging.debug("  near headline")
                score += 1

        # TEST: likely class or id? both pubdate-ish and byline-ish
        # class/id attributes are good indicators
        for classpat in (pats.pubdate['classes'], pats.byline['classes']):
            if classpat.search(node.get('class', '')):
                logging.debug("  likely class")
                score += 1
            if classpat.search(node.get('id', '')):
                logging.debug("  likely id")
                score += 1

        # TEST: also appears in likely <meta> tags?
        day = dt.date()
        if day in meta_dates:
            logging.debug("  appears in <meta>")
            score += 1

        # TEST: not within likely-looking comment container?
        # walk up the ancestor chain looking for a comment-ish class
        ancestor = node.getparent()
        while ancestor is not None and not pats.pubdate['comment_classes'].search(ancestor.get('class', '')):
            ancestor = ancestor.getparent()
        if ancestor is None:
            logging.debug("  not inside likely comment")
            score += 1

        # TEST: indicative text? ("posted on" , "last updated" etc...)
        if pats.pubdate['pubdate_indicator'].search(txt):
            logging.debug("  text indicative of pubdate")
            score += 1

        # TEST: date appears in url? eg "http://blah.com/blahblah-20100801-blah.html"
        if re.compile("%d[-_/.]?0?%d[-_/.]?0?%d" % (dt.year, dt.month, dt.day)).search(url):
            logging.debug("  full date appears in url")
            score += 2
        elif re.compile("%d[-_/.]?0?%d" % (dt.year, dt.month)).search(url):
            logging.debug("  year and month appear in url")
            score += 1

        # keep only the best-scoring element for each distinct date
        prev = candidates.get(day)
        if prev is None or score > prev['score']:
            candidates[day] = {'datetime': dt, 'score': score, 'node': node}

    if not candidates:
        return None, None

    # highest score wins (first-seen wins ties, matching dict order)
    best = max(candidates.values(), key=lambda c: c['score'])
    return best['datetime'], best['node']
Example #5
0
def extract_pubdate(doc, url, headline_linenum):
    """Find the most likely publication date in a parsed html document.

    Heuristic, score-based:
      1. a date embedded in the url slug is taken as definitive;
      2. otherwise candidate dates are pulled from likely <meta> tags and
         from short text elements in the body, each scored by several weak
         signals (headline proximity, class/id hints, <meta> agreement,
         indicative wording, date-in-url); best score wins.

    doc -- parsed html tree (lxml-style elements, presumably -- verify)
    url -- the article's url, used for slug/date matching
    headline_linenum -- html source line of the headline, or <=0 if unknown

    Returns (datetime, element) for an in-page winner,
            (datetime, None) when the date came from the url,
            (None, None) when no candidate was found.
    """
    # best-scoring candidate seen so far, keyed by calendar date
    candidates = {}

    logging.debug("extracting pubdate")

    # TODO: try some definitive meta tags first?
    # "DCSext.articleFirstPublished"
    # "DC.date.issued"
    # "last-modified"

    # check for date in slug
    for pat in pats.pubdate['url_datefmts']:
        m = pat.search(url)
        if m is not None:
            # a url date is authoritative; no page node, hence None
            d = datetime.datetime(int(m.group('year')), int(m.group('month')),
                                  int(m.group('day')))
            logging.debug("  using %s from url" % (d, ))
            return d, None

    # dates from promising <meta> tags, used below to corroborate in-page
    # candidates (day=1 fills in a missing day component)
    meta_dates = set()
    for meta in doc.findall('.//meta'):
        n = meta.get('name', meta.get('property', ''))
        if pats.pubdate['metatags'].search(n):
            logging.debug(" date: consider meta name='%s' content='%s'" %
                          (n, meta.get('content', '')))
            fuzzy = fuzzydate.parse_datetime(meta.get('content', ''))
            if not fuzzy.empty_date():
                meta_dates.add(fuzzy.date(fuzzydate.fuzzydate(day=1)))


#    if len(meta_dates)==1:
#        # only one likely-looking <meta> entry - lets go with it
#        d = list(meta_dates)[0]
#        logging.debug("  using %s from <meta>" % (d,))
#        return d,None

# start looking through whole page
    for e in util.tags(doc, 'p', 'span', 'div', 'li', 'td', 'th', 'h4', 'h5',
                       'h6', 'font'):
        # collapse all runs of whitespace to single spaces (python 2 unicode)
        txt = unicode(e.text_content()).strip()
        txt = u' '.join(txt.split())

        # discard anything too short or long
        if len(txt) < 6 or len(txt) > 150:
            continue

        score = 1
        dt = extract_date(txt)
        if dt is None:
            continue
        logging.debug(" date: considering %s '%s'" % (e.tag, txt))

        # TEST: proximity to headline in html
        if headline_linenum > 0 and e.sourceline > 0:
            dist = e.sourceline - headline_linenum
            # slightly above or well below the headline both count
            if dist > -10 and dist < 25:
                logging.debug("  near headline")
                score += 1

        # TEST: likely class or id?
        if pats.pubdate['classes'].search(e.get('class', '')):
            logging.debug("  likely class")
            score += 1
        if pats.pubdate['classes'].search(e.get('id', '')):
            logging.debug("  likely id")
            score += 1
        # in byline is also a good indicator
        if pats.byline['classes'].search(e.get('class', '')):
            logging.debug("  likely class")
            score += 1
        if pats.byline['classes'].search(e.get('id', '')):
            logging.debug("  likely id")
            score += 1

        # TEST: also appears in likely <meta> tags?
        if dt.date() in meta_dates:
            logging.debug("  appears in <meta>")
            score += 1

        # TEST: not within likely-looking comment container?
        # walk up the ancestor chain looking for a comment-ish class
        in_comment = False
        foo = e.getparent()
        while foo is not None:
            if pats.pubdate['comment_classes'].search(foo.get('class', '')):
                in_comment = True
                break
            foo = foo.getparent()
        if not in_comment:
            logging.debug("  not inside likely comment")
            score += 1

        # TEST: indicative text? ("posted on" , "last updated" etc...)
        if pats.pubdate['pubdate_indicator'].search(txt):
            logging.debug("  text indicative of pubdate")
            score += 1

        # TEST: date appears in url? eg "http://blah.com/blahblah-20100801-blah.html"
        if re.compile("%d[-_/.]?0?%d[-_/.]?0?%d" %
                      (dt.year, dt.month, dt.day)).search(url):
            logging.debug("  full date appears in url")
            score += 2
        elif re.compile("%d[-_/.]?0?%d" % (dt.year, dt.month)).search(url):
            logging.debug("  year and month appear in url")
            score += 1

        # keep only the highest score seen for each distinct date
        if dt.date(
        ) not in candidates or score > candidates[dt.date()]['score']:
            candidates[dt.date()] = {'datetime': dt, 'score': score, 'node': e}

    if not candidates:
        return None, None

    # highest score first; sorted() is stable, so earlier candidates win ties
    out = sorted(candidates.items(),
                 key=lambda item: item[1]['score'],
                 reverse=True)
    #    print "========="
    #    pprint( out[:5] )
    #    print "========="
    best = out[0][1]
    return best['datetime'], best['node']