Exemple #1
0
def publish_date(soup, source_url=None):
    """
    Find a publish date in the page's meta tags, or failing that in
    `source_url`.

    Every tag in PUBLISH_DATE_TAGS is scanned in three successive
    passes: `dates.parse_iso` first, then `dates.parse_ts`, then
    `dates.parse_any`. A pass goes through all tags before the next
    pass starts, and the first truthy parse result is returned. When no
    tag yields a date and `source_url` is given, the text matched by
    `re_url_date` in the url is handed to `dates.parse_any`. Returns
    None when nothing parses.
    """

    # one entry per pass; wrapped in lambdas so each parser is only
    # looked up on `dates` at the moment it is actually needed.
    strategies = (
        lambda text: dates.parse_iso(text, enforce_tz=False),
        lambda text: dates.parse_ts(text),
        lambda text: dates.parse_any(text, enforce_tz=False),
    )
    for parse in strategies:
        for tag in PUBLISH_DATE_TAGS:
            text = _extract_tag_data(soup, tag)
            if not text:
                continue
            parsed = parse(text)
            if parsed:
                return parsed

    # last resort: a date embedded in the url
    if not source_url:
        return None
    match = re_url_date.search(source_url)
    if not match:
        return None
    parsed = dates.parse_any(match.group(0), enforce_tz=False)
    if parsed:
        return parsed
    return None
Exemple #2
0
def publish_date(soup, source_url=None):
    """
    Find a publish date in the page's meta tags, or failing that in
    `source_url`.

    Every tag in PUBLISH_DATE_TAGS is scanned in three successive
    passes: `dates.parse_iso` first, then `dates.parse_ts`, then
    `dates.parse_any`. A pass goes through all tags before the next
    pass starts, and the first truthy parse result is returned. When no
    tag yields a date and `source_url` is given, the text matched by
    `re_url_date` in the url is handed to `dates.parse_any`. Returns
    None when nothing parses.
    """

    # one entry per pass; wrapped in lambdas so each parser is only
    # looked up on `dates` at the moment it is actually needed.
    passes = (
        lambda raw: dates.parse_iso(raw, enforce_tz=False),
        lambda raw: dates.parse_ts(raw),
        lambda raw: dates.parse_any(raw, enforce_tz=False),
    )
    for attempt in passes:
        for tag in PUBLISH_DATE_TAGS:
            raw = _extract_tag_data(soup, tag)
            if not raw:
                continue
            found = attempt(raw)
            if found:
                return found

    # last resort: a date embedded in the url
    if not source_url:
        return None
    url_match = re_url_date.search(source_url)
    if not url_match:
        return None
    found = dates.parse_any(url_match.group(0), enforce_tz=False)
    if found:
        return found
    return None
Exemple #3
0
def _prepare_date(o, field):
    """
    Prepare a date
    """
    if field not in o:
        return None
    if o[field] is None:
        return None
    if isinstance(o[field], datetime):
        return o[field]
    dt = dates.parse_any(o[field], enforce_tz=True)
    if not dt:
        raise RequestError('{}: "{}" is an invalid date.'.format(
            field, o[field]))
    return dt
Exemple #4
0
def _prepare_date(o, field):
    """
    Prepare a date
    """
    if field not in o:
        return None
    if o[field] is None:
        return None
    if isinstance(o[field], datetime):
        return o[field]
    dt = dates.parse_any(o[field], enforce_tz=True)
    if not dt:
        raise RequestError(
            '{}: "{}" is an invalid date.'
            .format(field, o[field]))
    return dt
Exemple #5
0
 def _parse(self, raw):
     """
     Pre-process a raw message into a dict of normalized fields.

     :param raw: the full message text, parsed with
         `email.message_from_string`.
     :returns: a dict with the keys `datetime`, `from`, `to`,
         `subject` and `body` (the message re-serialized with
         `as_string()`).

     A header that is absent from the message is treated as an empty
     string instead of raising AttributeError. A missing `Received`
     header therefore falls back to `dates.now()`, exactly like a
     `Received` header that has no `;`-separated trailing part.
     """
     msg = email.message_from_string(raw)

     def header(name):
         # Message.__getitem__ returns None for an absent header
         # rather than raising, so substitute an empty string.
         value = msg[name]
         return '' if value is None else value

     # normalize
     clean = {}

     # the text after the last ';' of the Received header is parsed
     # as the message date.
     rec_parts = header('Received').split(';')
     if len(rec_parts) > 1:
         # NOTE(review): parse_any's result is stored as-is, even if
         # it is falsy for unparsable text -- confirm callers cope.
         clean['datetime'] = dates.parse_any(rec_parts[-1].strip())
     else:
         clean['datetime'] = dates.now()

     # drop the angle brackets from the address headers
     clean['from'] = header('from').replace('<', '').replace('>', '')
     clean['to'] = header('to').replace('<', '').replace('>', '').strip()
     clean['subject'] = header('subject').strip()
     clean['body'] = msg.as_string()
     return clean
Exemple #6
0
 def _parse(self, raw):
     """
     Pre-process a raw message into a dict of normalized fields.

     :param raw: the full message text, parsed with
         `email.message_from_string`.
     :returns: a dict with the keys `datetime`, `from`, `to`,
         `subject` and `body` (the message re-serialized with
         `as_string()`).

     A header that is absent from the message is treated as an empty
     string instead of raising AttributeError. A missing `Received`
     header therefore falls back to `dates.now()`, exactly like a
     `Received` header that has no `;`-separated trailing part.
     """
     msg = email.message_from_string(raw)

     def header(name):
         # Message.__getitem__ returns None for an absent header
         # rather than raising, so substitute an empty string.
         value = msg[name]
         return '' if value is None else value

     # normalize
     clean = {}

     # the text after the last ';' of the Received header is parsed
     # as the message date.
     rec_parts = header('Received').split(';')
     if len(rec_parts) > 1:
         # NOTE(review): parse_any's result is stored as-is, even if
         # it is falsy for unparsable text -- confirm callers cope.
         clean['datetime'] = dates.parse_any(rec_parts[-1].strip())
     else:
         clean['datetime'] = dates.now()

     # drop the angle brackets from the address headers
     clean['from'] = header('from').replace('<', '').replace('>', '')
     clean['to'] = header('to').replace('<', '').replace('>', '').strip()
     clean['subject'] = header('subject').strip()
     clean['body'] = msg.as_string()
     return clean