def publish_date(soup, source_url=None):
    """
    Extract publish date from meta / source_url.

    Each tag in ``PUBLISH_DATE_TAGS`` is looked up in ``soup`` and its
    value is run through progressively looser parsers: first
    ``dates.parse_iso``, then ``dates.parse_ts``, then
    ``dates.parse_any``. A stricter parser is tried against *every* tag
    before a looser one is tried against any of them. If no tag yields a
    date and ``source_url`` is given, the part of the url matched by
    ``re_url_date`` is parsed with ``dates.parse_any``.

    :param soup: document object handed to ``_extract_tag_data``.
    :param source_url: optional url used as a last-resort date source.
    :returns: the first truthy parser result, or ``None`` if nothing
        could be parsed.
    """
    # try isodate first, remembering every non-empty tag value so the
    # looser passes below do not have to query the soup again.
    # NOTE(review): assumes _extract_tag_data returns the same value when
    # called repeatedly with the same arguments -- confirm.
    candidates = []
    for tag in PUBLISH_DATE_TAGS:
        ds = _extract_tag_data(soup, tag)
        if not ds:
            continue
        dt = dates.parse_iso(ds, enforce_tz=False)
        if dt:
            return dt
        candidates.append(ds)

    # try a timestamp next, then any date.
    fallback_parsers = (
        (dates.parse_ts, {}),
        (dates.parse_any, {'enforce_tz': False}),
    )
    for parse, kwargs in fallback_parsers:
        for ds in candidates:
            dt = parse(ds, **kwargs)
            if dt:
                return dt

    # fallback on url regex
    if source_url:
        dm = re_url_date.search(source_url)
        if dm:
            dt = dates.parse_any(dm.group(0), enforce_tz=False)
            if dt:
                return dt

    return None
def _prepare_date(o, field):
    """
    Normalize the date stored under ``field`` in ``o``.

    Returns ``None`` when ``field`` is absent from ``o`` or its value is
    ``None``, and returns the value untouched when it is already a
    ``datetime``. Anything else is handed to ``dates.parse_any`` with
    ``enforce_tz=True``.

    Raises ``RequestError`` when the parser returns a falsy result.
    """
    if field not in o:
        return None

    value = o[field]

    # Both "explicitly null" and "already a datetime" pass straight through.
    if value is None or isinstance(value, datetime):
        return value

    parsed = dates.parse_any(value, enforce_tz=True)
    if parsed:
        return parsed

    raise RequestError(
        '{}: "{}" is an invalid date.'.format(field, value))
def _prepare_date(o, field):
    """
    Coerce ``o[field]`` into a date value.

    A missing field and a ``None`` value both yield ``None``; a value
    that is already a ``datetime`` is returned as-is. Every other value
    goes through ``dates.parse_any(..., enforce_tz=True)``, and a falsy
    parse result is reported by raising ``RequestError``.
    """
    raw = o[field] if field in o else None
    if raw is None:
        return None

    if isinstance(raw, datetime):
        return raw

    result = dates.parse_any(raw, enforce_tz=True)
    if not result:
        message = '{}: "{}" is an invalid date.'.format(field, raw)
        raise RequestError(message)
    return result
def _parse(self, raw):
    """
    pre process raw message

    Parses ``raw`` with ``email.message_from_string`` and returns a dict
    with these keys:

    - ``'datetime'``: the text after the last ``;`` of the ``Received``
      header run through ``dates.parse_any``, or ``dates.now()`` when the
      header is missing or contains no ``;``.
    - ``'from'``: the ``From`` header with ``<`` and ``>`` removed.
    - ``'to'``: the ``To`` header with ``<`` and ``>`` removed, stripped.
    - ``'subject'``: the stripped ``Subject`` header.
    - ``'body'``: the whole message as returned by ``msg.as_string()``.

    Headers that are absent from the message are treated as empty
    strings rather than raising ``AttributeError``.
    """
    msg = email.message_from_string(raw)

    # normalize
    clean = {}

    # Indexing a message by a header it does not contain yields None, so
    # default to '' before calling any string method. An empty Received
    # value has no ';' and therefore falls into the now() branch below.
    received = msg['Received'] or ''
    rec_parts = received.split(';')
    if len(rec_parts) > 1:
        clean['datetime'] = dates.parse_any(rec_parts[-1].strip())
    else:
        clean['datetime'] = dates.now()

    sender = msg['from'] or ''
    recipient = msg['to'] or ''
    subject = msg['subject'] or ''

    clean['from'] = sender.replace('<', '').replace('>', '')
    clean['to'] = recipient.replace('<', '').replace('>', '').strip()
    clean['subject'] = subject.strip()
    clean['body'] = msg.as_string()

    return clean