def get_parsed(self, message_set):
    parsed_list = []
    header_list = []
    header_time = []
    headers = False
    local_time = timezone('UTC')
    if not isinstance(message_set, str):
        for message in message_set:
            headers = message.get_all('RECEIVED')  # only handles single email, will revisit
            while headers:
                try:
                    match = re.search('([a-zA-Z]{3},.+.[0-9].+[0-9]{4})',
                                      headers[len(headers) - 1], re.DOTALL)
                    if match:
                        match = match.group().replace('\r\n', '')
                        header_time.append(
                            parsedate_to_datetime(match).astimezone(
                                local_time).strftime("%Y-%m-%d %H:%M:%S"))
                except IndexError:
                    continue
                header_list.append(headers[len(headers) - 1])
                headers.pop(len(headers) - 1)
            parsed_list.append(list(zip(header_list, header_time)))
        return parsed_list
    else:
        self.error = message_set
        return self
def new_func(*args, **kwargs):
    r = func(*args, **kwargs)
    attempts = 5
    while attempts > 0 and r is not None and r.status_code == 429:
        attempts -= 1
        logger.info(
            'Too many req, sleeping for %d secs and retrying another %d times',
            sleeptime, attempts)
        if 'retry-after' in r.headers:
            logger.info('App suggested retrying after "%s"', r.headers['retry-after'])
            try:
                secs = int(r.headers['retry-after'])
            except ValueError:
                # parse HTTP-date, MUST also support RFC 850 and ANSI C asctime
                # (but f that for the moment!)
                # rfc5322 date-time
                secs = rfc5322.parsedate_to_datetime(r.headers['retry-after'])
                secs = secs - datetime.now()
                secs = secs.total_seconds()
            if secs <= 0:
                secs = None
            logger.info('App suggested sleeping for %s secs', str(secs))
        # todo: replace sleeptime with suggested retry-after value?
        # (3600 seems default, we really wanna wait an hour?)
        sleep(sleeptime)
        r = func(*args, **kwargs)
    return r
def sync_installer(repo_url, local_dir: Path):
    logging.info("Start syncing {}".format(repo_url))
    local_dir.mkdir(parents=True, exist_ok=True)
    full_scan = random.random() < 0.1  # Do full version check less frequently

    def remote_list():
        r = requests.get(repo_url, timeout=TIMEOUT_OPTION)
        d = pq(r.content)
        for tr in d('table').find('tr'):
            tds = pq(tr).find('td')
            if len(tds) != 4:
                continue
            fname = tds[0].find('a').text
            md5 = tds[3].text
            yield (fname, md5)

    for filename, md5 in remote_list():
        pkg_url = "/".join([repo_url, filename])
        dst_file = local_dir / filename
        dst_file_wip = local_dir / ('.downloading.' + filename)

        if dst_file.is_file():
            r = requests.head(pkg_url, allow_redirects=True, timeout=TIMEOUT_OPTION)
            len_avail = 'content-length' in r.headers
            if len_avail:
                remote_filesize = int(r.headers['content-length'])
            remote_date = parsedate_to_datetime(r.headers['last-modified'])
            stat = dst_file.stat()
            local_filesize = stat.st_size
            local_mtime = stat.st_mtime
            # Do content verification on ~5% of files (see issue #25)
            if (not len_avail or remote_filesize == local_filesize) and remote_date.timestamp() == local_mtime and \
                    (random.random() < 0.95 or md5_check(dst_file, md5)):
                logging.info("Skipping {}".format(filename))
                # Stop the scanning if the most recent version is present
                if not full_scan:
                    logging.info("Stop the scanning")
                    break
                continue
            logging.info("Removing {}".format(filename))
            dst_file.unlink()

        for retry in range(3):
            logging.info("Downloading {}".format(filename))
            err = ''
            try:
                err = curl_download(pkg_url, dst_file_wip, md5=md5)
                if err is None:
                    dst_file_wip.rename(dst_file)
            except sp.CalledProcessError:
                err = 'CalledProcessError'
            if err is None:
                break
            logging.error("Failed to download {}: {}".format(filename, err))
def inbox(self, actor_id):
    if request.json is None:
        abort(400, "json body is necessary")
    if request.headers.get("Host") != current_app.config["SERVER_NAME"]:
        abort(401, "Incorrect host")
    raw_date = request.headers.get("Date")
    if not raw_date:
        abort(401, "Date not provided")
    time_difference = datetime.now(timezone.utc) - parsedate_to_datetime(raw_date)
    if abs(time_difference.total_seconds()) > 30:
        abort(401, "Invalid date")

    # TODO: implement digest check
    signature_check = signature_is_valid(
        self.manager.external_key_store.get,
        request.headers,
        url_for(".inbox", actor_id=actor_id),
        "POST",
    )
    if not signature_check and not current_app.config.get("DEBUG_INBOX_IGNORE_SIGNATURE"):
        abort(401, "Request signature could not be verified")

    actor = self.manager.find(actor_id)
    if not actor:
        abort(404)
    response = self.manager.handle_activity(actor, request.json)
    return activityjsonify(response)
def get_date_from_email_header(header: str) -> Optional[datetime]:
    """Parse an email header such as Date or Received.

    The format is either just the date or name value pairs followed by ; and the date specification.
    For example: by 2002:a17:90a:77cb:0:0:0:0 with SMTP id e11csp4670216pjs; Mon, 21 Dec 2020 12:11:57 -0800 (PST)

    Args:
        header (str): header value to parse

    Returns:
        Optional[datetime]: parsed datetime
    """
    if not header:
        return None
    try:
        date_part = header.split(';')[-1].strip()
        res = parsedate_to_datetime(date_part)
        if res.tzinfo is None:
            # some headers may contain a non TZ date so we assume utc
            res = res.replace(tzinfo=timezone.utc)
        return res
    except Exception as ex:
        demisto.debug(
            f'Failed parsing date from header value: [{header}]. Err: {ex}. Will ignore and continue.'
        )
    return None
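The docstring above describes the Received-header layout; as a minimal, standalone sketch of the same approach (not the project's helper), the text after the last ';' can be parsed directly and assumed to be UTC when it comes back naive:

# Standalone sketch: parse the date part of a Received header with
# parsedate_to_datetime and assume UTC when the result has no timezone.
from datetime import timezone
from email.utils import parsedate_to_datetime

received = ('by 2002:a17:90a:77cb:0:0:0:0 with SMTP id e11csp4670216pjs; '
            'Mon, 21 Dec 2020 12:11:57 -0800 (PST)')
parsed = parsedate_to_datetime(received.split(';')[-1].strip())
if parsed.tzinfo is None:
    parsed = parsed.replace(tzinfo=timezone.utc)
print(parsed.isoformat())  # 2020-12-21T12:11:57-08:00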
def save_email(message):
    """
    Saves a given email in the database
    :param message: email message
    :return:
    """
    ccs = message.cc
    recipients = message.sent_to
    parsed_date = parsedate_to_datetime(message.date)
    email = MyEmail(
        email_subject=message.subject,
        email_from=message.sent_from[0].get('email'),
        email_body=message.body.get('plain'),
        date=parsed_date
    )
    email.save()
    if ccs:
        for cc in ccs:
            email.copies.create(email_address=cc.get('email'))
    for r in recipients:
        email.recipients.create(email_address=r.get('email'))
def ingest(self, file_path, entity):
    entity.schema = model.get("Email")
    try:
        msg = Message(file_path.as_posix())
    except Exception as exc:
        msg = "Cannot open message file: %s" % exc
        raise ProcessingException(msg) from exc

    self.extract_olefileio_metadata(msg, entity)

    try:
        self.extract_msg_headers(entity, msg.header)
    except Exception:
        log.exception("Cannot parse Outlook-stored headers")

    entity.add("subject", msg.subject)
    entity.add("threadTopic", msg.getStringField("0070"))
    entity.add("encoding", msg.encoding)
    entity.add("bodyText", msg.body)
    entity.add("bodyHtml", msg.htmlBody)
    entity.add("messageId", self.parse_message_ids(msg.message_id))

    if not entity.has("inReplyTo"):
        entity.add("inReplyTo", self.parse_references(msg.references, []))

    try:
        date = parsedate_to_datetime(msg.date).isoformat()
        entity.add("date", date)
    except Exception:
        log.warning("Could not parse date: %s", msg.date)

    # sender name and email
    sender = self.get_identities(msg.sender)
    self.apply_identities(entity, sender, "emitters", "sender")

    # received by
    sender = self.get_identity(msg.getStringField("0040"), msg.getStringField("0076"))
    self.apply_identities(entity, sender, "emitters")

    froms = self.get_identities(msg.getStringField("1046"))
    self.apply_identities(entity, froms, "emitters", "from")

    tos = self.get_identities(msg.to)
    self.apply_identities(entity, tos, "recipients", "to")

    ccs = self.get_identities(msg.cc)
    self.apply_identities(entity, ccs, "recipients", "cc")

    bccs = self.get_identities(msg.bcc)
    self.apply_identities(entity, bccs, "recipients", "bcc")

    self.resolve_message_ids(entity)

    for attachment in msg.attachments:
        if attachment.type != "data":
            continue
        name = stringify(attachment.longFilename)
        name = name or stringify(attachment.shortFilename)
        self.ingest_attachment(entity, name, attachment.type, attachment.data)
def write_table(mboxfile, mailTable):
    """
    Takes a list and extends it with lists of data, which is extracted from mbox messages.

    :param mboxfile: Mbox file name/path
    :param mailTable: A list (of lists)
    :return: An extended list of lists
    """
    mail_box_contents = mailbox.mbox(mboxfile)
    m_pbar = tqdm.tqdm(range(0, len(mail_box_contents)))
    m_pbar.set_description('Extracting mbox messages...')
    count = 0
    update_interval = min(50, len(mail_box_contents))

    for message in mail_box_contents:
        count += 1
        if count % update_interval == 0:
            m_pbar.update(update_interval)

        clean_from = clean_address(message['From'])
        clean_to = clean_addresses(message['To'])
        clean_cc = clean_addresses(message['Cc'])
        try:
            # fully-qualified name; requires "import email.utils"
            clean_date = email.utils.parsedate_to_datetime(message['Date'])
        except:
            clean_date = None

        mailTable.append([
            clean_from,
            clean_to,
            clean_cc,
            clean_date,
            message['Subject'],
            get_body(message)
        ])
def get_date(self: object) -> datetime:
    date_element = self.soup.find('time', {'datetime': True})
    if not date_element:
        raise ScraperError(f'Date missing from page.')
    date_string = date_element['datetime']

    for date_format in self.date_formats:
        try:
            return datetime.strptime(date_string, date_format)\
                .replace(tzinfo=timezone.utc)
        except ValueError:
            pass

    try:
        # RFC-822
        return parsedate_to_datetime(date_string)
    except TypeError:
        pass

    try:
        # ISO-8601
        return datetime.fromisoformat(date_string)
    except ValueError:
        pass

    raise ScraperError(f'Unrecognized date format "{date_string}".')
def _convert_to_timestamp(date):
    # convert HTTP date to POSIX timestamp
    from email.utils import parsedate_to_datetime
    try:
        return parsedate_to_datetime(date).timestamp()
    except TypeError:
        return None
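The conversion in the comment above is a one-liner in the standard library; a small standalone sketch with an assumed example header value:

# Standalone sketch of the same HTTP-date -> POSIX-timestamp conversion.
from email.utils import parsedate_to_datetime

print(parsedate_to_datetime('Wed, 21 Oct 2015 07:28:00 GMT').timestamp())  # 1445412480.0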
def _record_call_status(request, related_cr):
    # Create dict to hold information of interest from call status response
    call_status_info = {}

    # Retrieve information we need to determine success of call
    # CallDuration will return the duration in seconds
    status_entries = ['CallStatus', 'Timestamp', 'AnsweredBy']
    for entry in status_entries:
        call_status_info[entry] = request.GET[entry]

    # Set Timestamp entry to datetime obj instead of RFC 2822
    call_status_info['Timestamp'] = parsedate_to_datetime(
        call_status_info['Timestamp'])

    # Check whether call was completed by a human (success metric)
    if (call_status_info['CallStatus'] == 'completed'
            and call_status_info['AnsweredBy'] == 'human'):
        # Save call duration and set related CallRequest.call_completed=True
        call_status_info['CallDuration'] = request.GET['CallDuration']
        call_status_info['Success'] = True
        related_cr.call_completed = True
        related_cr.save()
    else:
        # Save duration=0, log miss, don't change CallRequest.call_completed
        call_status_info['CallDuration'] = 0
        call_status_info['Success'] = False
        err_msg = 'Call ended with {} status, answered by {}'
        log.error(err_msg.format(
            call_status_info['CallStatus'], call_status_info['AnsweredBy']))

    return call_status_info
async def get_rss_content(url: str) -> list:
    result = []
    async with aiohttp.ClientSession(trust_env=True) as sess:
        logger.debug(f'Start fetching {url}')
        resp = await sess.get(url)
        doc = fromstring(await resp.text())
        desc = doc.find('channel/description')
        if desc:
            desc = desc.text
        for i in doc.iterfind('channel/item'):
            item = {}
            # RFC822
            item['date'] = parsedate_to_datetime(i.findtext('pubDate')).timestamp()
            item['title'] = i.findtext('title')
            item['description'] = i.findtext('description')
            item['link'] = i.findtext('link')
            item['enclosure_url'] = i.find('enclosure').attrib['url']
            result.append(item)
    return result
def _process_message(self, message):
    msg = Message()
    msg.mailbox = self
    if 'subject' in message:
        msg.subject = convert_header_to_unicode(message['subject'])[0:255]
    if 'message-id' in message:
        msg.message_id = message['message-id'][0:255]
    if 'from' in message:
        msg.from_header = convert_header_to_unicode(message['from'])
    if 'to' in message:
        msg.to_header = convert_header_to_unicode(message['to'])
    if 'date' in message:
        sent_time_str = convert_header_to_unicode(message['date'])
        msg.sent_time = parsedate_to_datetime(sent_time_str)
    elif 'Delivered-To' in message:
        msg.to_header = convert_header_to_unicode(message['Delivered-To'])
    msg.save()
    message = self._get_dehydrated_message(message, msg)
    msg.set_body(message.as_string())
    if message['in-reply-to']:
        try:
            msg.in_reply_to = Message.objects.filter(
                message_id=message['in-reply-to']
            )[0]
        except IndexError:
            pass
    msg.save()
    return msg
def parse_http_date(value: str) -> Optional[datetime]:
    """Attempt to parse an HTTP (RFC 5322-compatible) timestamp"""
    try:
        return parsedate_to_datetime(value)
    except (TypeError, ValueError):
        logger.debug(f'Failed to parse timestamp: {value}')
        return None
def downloading_worker(q):
    while True:
        item = q.get()
        if item is None:
            break
        try:
            url, dst_file, working_dir = item
            if dst_file.is_file():
                print("checking", url, flush=True)
                r = requests.head(url, timeout=TIMEOUT_OPTION)
                remote_filesize = int(r.headers['content-length'])
                remote_date = parsedate_to_datetime(r.headers['last-modified'])
                stat = dst_file.stat()
                local_filesize = stat.st_size
                local_mtime = stat.st_mtime
                if remote_filesize == local_filesize and remote_date.timestamp() == local_mtime:
                    print("skipping", dst_file.relative_to(working_dir), flush=True)
                    continue
                dst_file.unlink()
            print("downloading", url, flush=True)
            requests_download(url, dst_file)
        except Exception:
            traceback.print_exc()
            print("Failed to download", url, flush=True)
            if dst_file.is_file():
                dst_file.unlink()
        finally:
            q.task_done()
def _fetch_entries(feed, loop=None):
    url = feed.url
    request_etag = feed.etag
    request_last_modified = feed.last_modified

    response = yield from _request_entries(
        url,
        etag=request_etag,
        last_modified=request_last_modified,
        loop=loop
    )
    if response is None:
        log.info('%s is not modified', feed)
        return None

    response_headers, xml = response
    etag = response_headers.get('etag', '')
    if isinstance(etag, bytes):
        etag = etag.decode('utf-8', 'ignore')
    last_modified = response_headers.get('last-modified')
    if last_modified:
        last_modified = parsedate_to_datetime(last_modified)

    data = feedparser.parse(xml)
    return FeedResponse(etag, last_modified, data.entries)
def new_format(self, navbar: BeautifulSoup, content: BeautifulSoup) -> List[str]:
    """
    Extracts email message information if it uses the new Mailman format

    Args:
        navbar: BeautifulSoup
        content: BeautifulSoup

    Returns:
        List[str]
    """
    sender = content.find(id='from').text.split('via')[0][6:].strip()
    date_str = content.find(id='date').text.split(': ')[1].strip()
    date = parsedate_to_datetime(date_str).isoformat()
    body = content.find(id='body').text.strip()

    nxt, rep_to = None, None
    links = navbar.findAll('a')
    for l in links:
        if 'Next in thread' in str(l):
            nxt = '/'.join(self.email_url.split('/')[:-1]) + '/' + l['href']
            nxt = nxt[1:] if nxt[0] == '/' else nxt
        elif 'reply to' in str(l):
            rep_to = '/'.join(self.email_url.split('/')[:-1]) + '/' + l['href']
            rep_to = rep_to[1:] if rep_to[0] == '/' else rep_to

    return [str(i) for i in [sender, date, body, nxt, rep_to]]
def getResponse_bf(path):
    global endPoint_bf
    url = endPoint_bf + path
    response = urllib.request.urlopen(url, timeout=30)
    res_date = parsedate_to_datetime(response.headers['date'])
    content = json.loads(response.read().decode('utf8'))
    return res_date, content
def PREPROCESS_FUNC(filename, row):
    return {
        "id": int(row["TweetId"]),
        "text": row["TweetText"],
        "created_at": parsedate_to_datetime(row["TweetDate"]),
        "tags": [row["Topic"], row["Sentiment"]]
    }
def asDate(raw):
    if raw:
        try:
            return parsedate_to_datetime(raw)
        except Exception:
            return None
    return None
def parse(cls, api, json):
    status = cls(api)
    setattr(status, '_json', json)
    for k, v in json.items():
        if k == 'user':
            user_model = getattr(api.parser.model_factory, 'user') if api else User
            user = user_model.parse(api, v)
            setattr(status, 'author', user)
            setattr(status, 'user', user)  # DEPRECATED
        elif k == 'created_at':
            setattr(status, k, parsedate_to_datetime(v))
        elif k == 'source':
            if '<' in v:
                # At this point, v should be of the format:
                # <a href="{source_url}" rel="nofollow">{source}</a>
                setattr(status, k, v[v.find('>') + 1:v.rfind('<')])
                start = v.find('"') + 1
                end = v.find('"', start)
                setattr(status, 'source_url', v[start:end])
            else:
                setattr(status, k, v)
                setattr(status, 'source_url', None)
        elif k == 'retweeted_status':
            setattr(status, k, Status.parse(api, v))
        elif k == 'quoted_status':
            setattr(status, k, Status.parse(api, v))
        elif k == 'place':
            if v is not None:
                setattr(status, k, Place.parse(api, v))
            else:
                setattr(status, k, None)
        else:
            setattr(status, k, v)
    return status
def download_list(url):
    headers = None
    cache = Path(config['cache'], hashlib.sha1(url.encode()).hexdigest())
    if cache.is_file():
        last_modified = datetime.utcfromtimestamp(cache.stat().st_mtime)
        headers = {
            'If-modified-since': eut.format_datetime(last_modified),
            'User-Agent': 'Bind adblock zonfile updater v1.0 (https://github.com/Trellmor/bind-adblock)'
        }

    try:
        r = requests.get(url, headers=headers, timeout=config['req_timeout_s'])
        if r.status_code == 200:
            with cache.open('w') as f:
                f.write(r.text)

            if 'last-modified' in r.headers:
                last_modified = eut.parsedate_to_datetime(
                    r.headers['last-modified']).timestamp()
                os.utime(str(cache), times=(last_modified, last_modified))

            return r.text
    except requests.exceptions.RequestException as e:
        print(e)

    if cache.is_file():
        with cache.open() as f:
            return f.read()
def parse_email_date(value: str) -> datetime.datetime:
    """
    Parsing the date described in rfc2822
    1900-1-1 for unparsed, may be naive or with tzinfo
    """
    try:
        return parsedate_to_datetime(value)
    except Exception:  # noqa
        pass
    match = re.search(
        r'(?P<date>\d{1,2}\s+(' + '|'.join(SHORT_MONTH_NAMES) + r')\s+\d{4})\s+' +
        r'(?P<time>\d{1,2}:\d{1,2}(:\d{1,2})?)\s*' +
        r'(?P<zone_sign>[+-])?(?P<zone>\d{4})?',
        value)
    if match:
        group = match.groupdict()
        day, month, year = group['date'].split()
        time_values = group['time'].split(':')
        zone_sign = int('{}1'.format(group.get('zone_sign') or '+'))
        zone = group['zone']
        return datetime.datetime(
            year=int(year),
            month=SHORT_MONTH_NAMES.index(month) + 1,
            day=int(day),
            hour=int(time_values[0]),
            minute=int(time_values[1]),
            second=int(time_values[2]) if len(time_values) > 2 else 0,
            tzinfo=datetime.timezone(
                datetime.timedelta(hours=int(zone[:2]) * zone_sign,
                                   minutes=int(zone[2:]) * zone_sign)) if zone else None,
        )
    else:
        return datetime.datetime(1900, 1, 1)
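The docstring's contract (return 1900-1-1 when the value cannot be parsed) is easy to miss behind the regex fallback; a minimal standalone sketch of that contract, using only the standard library and made-up input values:

# Standalone sketch: try the RFC 2822 parser first, fall back to a fixed sentinel.
import datetime
from email.utils import parsedate_to_datetime


def parse_date_or_default(value: str) -> datetime.datetime:
    try:
        return parsedate_to_datetime(value)
    except Exception:
        return datetime.datetime(1900, 1, 1)  # sentinel for unparsable values


print(parse_date_or_default('Tue, 1 Jul 2003 10:52:37 +0200'))  # aware datetime
print(parse_date_or_default('not a date'))                      # 1900-01-01 00:00:00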
def parse_date(date_string):
    if type(date_string) is datetime:
        return date_string

    date_string = date_string.strip().rstrip("-").strip()

    dt = _parse_datestrings(date_string)
    if dt and type(dt) is datetime:
        return dt

    try:
        dt = parsedate_to_datetime(date_string)
        return dt
    except (TypeError, KeyError, ValueError, AttributeError):
        pass

    date9tuple = _parse_date(date_string)
    if date9tuple:
        try:
            # build a datetime from the first six fields of the 9-tuple
            dt = datetime(*date9tuple[:6])
        except Exception:
            pass

    return dt
def check_and_download(url: str, dst_file: Path, caching=False) -> int:
    try:
        if caching:
            if url in download_cache:
                print(f"Using cached content: {url}", flush=True)
                with dst_file.open('wb') as f:
                    f.write(download_cache[url])
                return 0
            download_cache[url] = bytes()

        start = time.time()
        with requests.get(url, stream=True, timeout=(5, 10)) as r:
            r.raise_for_status()
            if 'last-modified' in r.headers:
                remote_ts = parsedate_to_datetime(
                    r.headers['last-modified']).timestamp()
            else:
                remote_ts = None

            with dst_file.open('wb') as f:
                for chunk in r.iter_content(chunk_size=1024**2):
                    if time.time() - start > DOWNLOAD_TIMEOUT:
                        raise TimeoutError("Download timeout")
                    if not chunk:
                        continue  # filter out keep-alive new chunks
                    f.write(chunk)
                    if caching:
                        download_cache[url] += chunk
            if remote_ts is not None:
                os.utime(dst_file, (remote_ts, remote_ts))
        return 0
    except BaseException as e:
        print(e, flush=True)
        if dst_file.is_file():
            dst_file.unlink()
        if url in download_cache:
            del download_cache[url]
        return 1
def get_news():
    news = []
    try:
        feed = urlopen(
            'https://kozeagroup.wordpress.com/category/backoffice/feed/',
            timeout=3)
    except socket.timeout:
        return news
    tree = ElementTree.parse(feed)
    for item in tree.find('channel').findall('item'):
        date = parsedate_to_datetime(item.find('pubDate').text)
        entry = {
            'title': item.find('title').text,
            'description': item.find('description').text,
            'link': item.find('link').text,
            'isodate': date.strftime('%Y-%m-%d'),
            'date': date.strftime('%d %B %Y')
        }
        image = item.find(
            'media:thumbnail',
            namespaces={'media': 'http://search.yahoo.com/mrss/'})
        if image is not None:
            entry['image'] = image.attrib['url']
        news.append(entry)
    return news
def handle_selection(self, k):
    data = self.parent.parentApp.login_form.emails[self.cursor_line]
    self.parent.parentApp.email_detail_form.form_addr.value = data['from']
    self.parent.parentApp.email_detail_form.subject.value = data['subject']
    self.parent.parentApp.email_detail_form.date.value = parsedate_to_datetime(
        data['date']).strftime('%a, %d %b')
    self.parent.parentApp.email_detail_form.content.value = '\n\n' + data['body']
    self.parent.parentApp.switchForm('EMAIL_DETAIL')
def insertNewsIntoTable(channel):
    '''
    1. fill table with news
    2. convert date into timestamp
    '''
    news = list()
    for index, item in enumerate(channel.entries):
        description = getDescription(item.description)
        try:
            pub_date_stamp = time.mktime(
                parsedate_to_datetime(item.published).timetuple())
        except ValueError as error:
            logg.logging.error("ValueError: " + str(error))

        media_content = checkMediaContent(item)
        image = ''
        if (media_content):
            try:
                response = requests.get(media_content)
                image = psycopg2.Binary(response.content)
            except Exception as error:
                logg.logging.error("Exception: " + str(error))

        row = (html.unescape(item.title), item.link, image, description, pub_date_stamp)
        news.append(row)
    return news
def check(since):
    conn = imaplib.IMAP4_SSL(settings.EMAIL_HOST)
    conn.login(settings.EMAIL_HOST_USER, settings.EMAIL_HOST_PASSWORD)
    conn.select('INBOX')

    # Format today as string
    today_string = date.today().strftime("%d-%b-%Y")

    # Search inbox (IMAP doesn't allow searching by timestamp)
    result, data = conn.search(
        None, f'(SENTSINCE {today_string} FROM {settings.CHRIS_EMAIL})')
    if not data:
        logger.debug('No emails found today.')
        return None

    # Look for emails since since
    for email_id in data[0].split():
        msg_string = conn.fetch(email_id, '(RFC822)')[1][0][1]
        msg = message_from_bytes(msg_string)
        response_time_naive = parsedate_to_datetime(msg['Date'])
        response_time = utc.localize(response_time_naive)
        if response_time > since:
            return response_time
    else:
        logger.debug('No new emails found.')
        return None
def get_member_details(self, id):
    member = {}
    try:
        message = self.service.users().messages().get(
            userId='me', id=id, format='metadata').execute()
        header_data = message["payload"]["headers"]
        correct_subject = False
        for data in header_data:
            if 'subject' == data['name'].lower() and self.subject in data['value']:
                correct_subject = True
        if not correct_subject:
            return ''
        for data in header_data:
            if "Date" == data["name"]:
                date = parsedate_to_datetime(data["value"])
                member["time"] = date.isoformat()
            if "From" == data["name"]:
                print(data["value"])
                email_id = data["value"]
                if '<' in email_id:
                    start = email_id.find('<')
                    end = email_id.find('>')
                    email_id = email_id[start + 1:end]
                member["email"] = email_id
        print(member)
        return member
    except errors.HttpError as error:
        print('An error occurred: %s' % error)
def test_last_modified_header_is_set(self):
    resp = self.app.get(self.changeset_uri, headers=self.headers)
    timestamp = resp.json["timestamp"]
    dt = parsedate_to_datetime(resp.headers["Last-Modified"])
    assert dt.timestamp() == int(timestamp / 1000)
def get(self, request, locale, collection, *args, **kwargs):
    self.collection = self.get_collection(collection).first()
    self.locale = locale
    pages = self.get_queryset()
    updated_pages = list(pages)

    if not pages or len(pages) == 0:
        raise Http404

    last_updated = None
    if 'lastUpdated' in request.GET:
        last_updated = datetime.strptime(request.GET['lastUpdated'],
                                         '%Y-%m-%dT%H:%M:%SZ')

    if 'HTTP_IF_MODIFIED_SINCE' in request.META:
        last_updated = parsedate_to_datetime(
            request.META['HTTP_IF_MODIFIED_SINCE'])

    if last_updated:
        updated_pages = Page.objects.filter(
            last_published_at__gt=last_updated,
            id__in=pages.values_list("id", flat=True))
        if updated_pages.count() == 0:
            return HttpResponse(status=304)  # not modified
        updated_pages = list(updated_pages)

    tar = make_tar(list(pages), updated_pages, self.locale, request,
                   content_serializer=self.content_serializer_class)

    return self.render_to_tar_response(tar)
def fetch_csv_as_string(url):
    res = requests.get(url)
    last_modified = res.headers['Last-Modified']
    jst_datetime = eut.parsedate_to_datetime(last_modified).astimezone(JST)
    # NOTE: Fukushima prefecture's CSV is encoded in Shift-JIS
    res.encoding = 'shift_jis'
    return res.text, jst_datetime
def parseweekmail(el, pl, st):
    '''
    :param el: number of messages in the mailbox
    :param pl: poplib server object
    :param st: start date for parsing weekly reports
    :return: list of sender addresses
    '''
    sender_list = []
    for index in range(el, 0, -1):
        lines = pl.retr(index)[1]
        msg = BytesParser(EmailMessage).parsebytes(b'\r\n'.join(lines))
        # check that the mail is from this week and that the recipient is the weekly-report group
        mail_date = parsedate_to_datetime(msg.get('Date', "")).date()
        mail_receiver = parseaddr(msg.get('To', ""))[1]
        mail_cc = parseaddr(msg.get('Cc', ""))[1]
        if mail_date < st:
            break
        mail_subject = decode_str(msg.get('Subject', ""))
        if (mail_receiver == WEEKLY_GROUP or WEEKLY_GROUP in mail_cc) and not (
                mail_subject.startswith('项目周报') or
                decode_str(mail_subject).split('(')[0].endswith('项目周报') or
                decode_str(mail_subject).split('(')[0].endswith('项目周报')):
            sender_list.append(parseaddr(msg.get('From', ""))[1])
    return sender_list
def parse(cls, value, kwds):
    if not value:
        kwds["defects"].append(errors.HeaderMissingRequiredValue())
        kwds["datetime"] = None
        kwds["decoded"] = ""
        kwds["parse_tree"] = parser.TokenList()
        return
    if isinstance(value, str):
        value = utils.parsedate_to_datetime(value)
    kwds["datetime"] = value
    kwds["decoded"] = utils.format_datetime(kwds["datetime"])
    kwds["parse_tree"] = cls.value_parser(kwds["decoded"])
def gen_filename(mail_dict):
    """
    Generates a filename from a dictionary with keys 'Id', 'Date', 'Folder'
    """
    from email.utils import parsedate_to_datetime
    dateObj = parsedate_to_datetime(mail_dict['Date'])
    fulldate = dateObj.strftime("%Y-%m-%d_%H.%M_utc%z")
    year = dateObj.strftime("%Y")
    Id = mail_dict['Id'].replace('<', '').replace('>', '').replace('%', '').replace('/', '-').replace(' ', '')
    Folder = mail_dict['Folder'].replace(' ', '_')
    return os.path.join(Folder, year, fulldate + "_" + Id + '.eml')
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-url", default=BASE_URL)
    parser.add_argument("--working-dir", default=WORKING_DIR)
    args = parser.parse_args()

    if args.working_dir is None:
        raise Exception("Working Directory is None")

    working_dir = Path(args.working_dir)
    remote_filelist = []
    rs = RemoteSite(args.base_url)
    for url in rs.files:
        dst_file = working_dir / rs.relpath(url)
        remote_filelist.append(dst_file.relative_to(working_dir))

        if dst_file.is_file():
            r = requests.head(url)
            remote_filesize = int(r.headers['content-length'])
            remote_date = parsedate_to_datetime(r.headers['last-modified'])
            stat = dst_file.stat()
            local_filesize = stat.st_size
            local_mtime = stat.st_mtime
            if remote_filesize == local_filesize and remote_date.timestamp() == local_mtime:
                print("Skipping", dst_file.relative_to(working_dir), flush=True)
                continue
            dst_file.unlink()
        else:
            dst_file.parent.mkdir(parents=True, exist_ok=True)

        print("downloading", url, flush=True)
        try:
            curl_download(url, dst_file)
        except Exception:
            print("Failed to download", url, flush=True)
            if dst_file.is_file():
                dst_file.unlink()

    local_filelist = []
    for local_file in working_dir.glob('**/*'):
        if local_file.is_file():
            local_filelist.append(local_file.relative_to(working_dir))

    for old_file in set(local_filelist) - set(remote_filelist):
        print("deleting", old_file, flush=True)
        old_file = working_dir / old_file
        old_file.unlink()
def __init__(self, feed_url):
    self.feed_url = feed_url
    xml = urlopen(feed_url).readall().decode()
    self.root = ET.fromstring(xml)
    self.items = []
    for item in self.root.iter('item'):
        title = item[0].text
        link = item[1].text
        description = item[2].text
        pub_date = parsedate_to_datetime(item[3].text)
        duration_string = item[9].text
        duration = None
        if duration_string:
            duration = int(duration_string)
        feed_item = RssFeedItem(title, link, description, pub_date, duration)
        self.items.append(feed_item)
def get_sig_string(req, cano_req, scope):
    """
    Generate the AWS4 auth string to sign for the request.

    Checks to see if date has been passed as x-amz-date or date, if date
    then parse the date from a date string.

    req      -- Requests PreparedRequest object. This should already include
                an x-amz-date or a date header.
    cano_req -- The Canonical Request, as returned by get_canonical_request()
    """
    if 'x-amz-date' in req.headers:
        amz_date = req.headers['x-amz-date']
    elif 'date' in req.headers:
        amz_date = parsedate_to_datetime(req.headers['date']).strftime(
            '%Y%m%dT%H%M%SZ')
    hsh = hashlib.sha256(cano_req.encode())
    sig_items = ['AWS4-HMAC-SHA256', amz_date, scope, hsh.hexdigest()]
    sig_string = '\n'.join(sig_items)
    return sig_string
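The docstring above describes reformatting an RFC 2822 `date` header into AWS's compact timestamp form; a standalone sketch of just that conversion, with an assumed example header value:

# Standalone sketch: RFC 2822 Date header -> AWS-style %Y%m%dT%H%M%SZ string.
from email.utils import parsedate_to_datetime

date_header = 'Mon, 09 Sep 2011 23:36:00 GMT'
amz_date = parsedate_to_datetime(date_header).strftime('%Y%m%dT%H%M%SZ')
print(amz_date)  # 20110909T233600Z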
def _parse_mail(self, data):
    """ Parse data from email and return a model Email object """
    msg = self._parser.parsestr(data)
    tos = msg.get_all('to', [])
    ccs = msg.get_all('cc', [])
    froms = msg.get_all('from', [])
    name, addr = getaddresses(froms)[0]
    email = {
        'sender': addr,
        'subject': msg['subject'],
        'to': [addr for name, addr in getaddresses(tos)],
        'cc': [addr for name, addr in getaddresses(ccs)],
        'epoch': int(parsedate_to_datetime(msg['Date']).timestamp()),
    }
    self._extract_content(email, msg)
    return email
def sync_installer(repo_url, local_dir: Path):
    logging.info("Start syncing {}".format(repo_url))
    local_dir.mkdir(parents=True, exist_ok=True)

    def remote_list():
        r = requests.get(repo_url)
        d = pq(r.content)
        for tr in d('table').find('tr'):
            tds = pq(tr).find('td')
            if len(tds) != 4:
                continue
            fname = tds[0].find('a').text
            md5 = tds[3].text
            yield (fname, md5)

    for filename, md5 in remote_list():
        pkg_url = "/".join([repo_url, filename])
        dst_file = local_dir / filename

        if dst_file.is_file():
            r = requests.head(pkg_url)
            remote_filesize = int(r.headers['content-length'])
            remote_date = parsedate_to_datetime(r.headers['last-modified'])
            stat = dst_file.stat()
            local_filesize = stat.st_size
            local_mtime = stat.st_mtime
            if remote_filesize == local_filesize and remote_date.timestamp() == local_mtime:
                logging.info("Skipping {}".format(filename))
                continue
            dst_file.unlink()

        for retry in range(3):
            logging.info("Downloading {}".format(filename))
            err = curl_download(pkg_url, dst_file, md5=md5)
            if err is None:
                break
            logging.error("Failed to download {}: {}".format(filename, err))
def datetime_from_rfc1123(string):
    return parsedate_to_datetime(string)
def _get_message_date(self, message):
    return parsedate_to_datetime(message['Date'])
def handle(self, *args, **options):
    current_date = parsedate_to_datetime(options['current_date']) if options['current_date'] else now()
    for i in Interview.objects.filter(planned_date__lte=current_date,
                                      process__state__in=Process.OPEN_STATE_VALUES,
                                      state=Interview.PLANNED):
        i.state = Interview.WAIT_INFORMATION
        i.save()
#!/usr/bin/env python3
import sys
import re
# import subprocess
from email.utils import format_datetime, parsedate_to_datetime

in_headers = True

for line in sys.stdin.readlines():
    if line == "\n":
        in_headers = False
    match = re.match(r'^Date: (.+)', line)
    if not in_headers or not match:
        print(line, end="")
        continue
    date_string = match.group(1)
    # use this if you do not have python 3.3+
    # converted_date = subprocess.Popen(['date', '-d', date_string], stdout=subprocess.PIPE).communicate()[0].strip()
    converted_date = format_datetime(parsedate_to_datetime(date_string).astimezone(tz=None))
    print('Date:', converted_date)
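The conversion step in the script above (parse the Date header, shift it with astimezone(tz=None), re-emit it with format_datetime) can be tried in isolation; a small sketch using an assumed example header value:

# Standalone sketch: re-express an RFC 2822 date in the local timezone.
from email.utils import format_datetime, parsedate_to_datetime

original = 'Tue, 1 Jul 2003 10:52:37 +0200'  # example value, not from a real mailbox
local_dt = parsedate_to_datetime(original).astimezone(tz=None)
print('Date:', format_datetime(local_dt))  # same instant, expressed in local time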
def _parse_ratelimit_header(request):
    now = parsedate_to_datetime(request.headers['Date'])
    reset = datetime.datetime.fromtimestamp(int(request.headers['X-Ratelimit-Reset']),
                                            datetime.timezone.utc)
    return (reset - now).total_seconds()
def request(self, route, *, header_bypass_delay=None, **kwargs):
    bucket = route.bucket
    method = route.method
    url = route.url

    lock = self._locks.get(bucket)
    if lock is None:
        lock = asyncio.Lock(loop=self.loop)
        if bucket is not None:
            self._locks[bucket] = lock

    # header creation
    headers = {
        'User-Agent': self.user_agent,
    }

    if self.token is not None:
        headers['Authorization'] = 'Bot ' + self.token if self.bot_token else self.token

    # some checking if it's a JSON request
    if 'json' in kwargs:
        headers['Content-Type'] = 'application/json'
        kwargs['data'] = utils.to_json(kwargs.pop('json'))

    kwargs['headers'] = headers

    if self._global_lock.locked():
        # wait until the global lock is complete
        yield from self._global_lock

    yield from lock
    with MaybeUnlock(lock) as maybe_lock:
        for tries in range(5):
            r = yield from self.session.request(method, url, **kwargs)
            log.debug(self.REQUEST_LOG.format(method=method, url=url, status=r.status, json=kwargs.get('data')))
            try:
                # even errors have text involved in them so this is safe to call
                data = yield from json_or_text(r)

                # check if we have rate limit header information
                remaining = r.headers.get('X-Ratelimit-Remaining')
                if remaining == '0' and r.status != 429:
                    # we've depleted our current bucket
                    if header_bypass_delay is None:
                        now = parsedate_to_datetime(r.headers['Date'])
                        reset = datetime.datetime.fromtimestamp(int(r.headers['X-Ratelimit-Reset']), datetime.timezone.utc)
                        delta = (reset - now).total_seconds()
                    else:
                        delta = header_bypass_delay

                    fmt = 'A rate limit bucket has been exhausted (bucket: {bucket}, retry: {delta}).'
                    log.info(fmt.format(bucket=bucket, delta=delta))
                    maybe_lock.defer()
                    self.loop.call_later(delta, lock.release)

                # the request was successful so just return the text/json
                if 300 > r.status >= 200:
                    log.debug(self.SUCCESS_LOG.format(method=method, url=url, text=data))
                    return data

                # we are being rate limited
                if r.status == 429:
                    fmt = 'We are being rate limited. Retrying in {:.2} seconds. Handled under the bucket "{}"'

                    # sleep a bit
                    retry_after = data['retry_after'] / 1000.0
                    log.info(fmt.format(retry_after, bucket))

                    # check if it's a global rate limit
                    is_global = data.get('global', False)
                    if is_global:
                        log.info('Global rate limit has been hit. Retrying in {:.2} seconds.'.format(retry_after))
                        # acquire the global lock and block all processing
                        yield from self._global_lock

                    yield from asyncio.sleep(retry_after, loop=self.loop)

                    # release the global lock now that the
                    # global rate limit has passed
                    if is_global:
                        self._global_lock.release()

                    continue

                # we've received a 502, unconditional retry
                if r.status == 502 and tries <= 5:
                    yield from asyncio.sleep(1 + tries * 2, loop=self.loop)
                    continue

                # the usual error cases
                if r.status == 403:
                    raise Forbidden(r, data)
                elif r.status == 404:
                    raise NotFound(r, data)
                else:
                    raise HTTPException(r, data)
            finally:
                # clean-up just in case
                yield from r.release()
def test_parsedate_to_datetime_naive(self):
    self.assertEqual(
        utils.parsedate_to_datetime(self.datestring + ' -0000'),
        self.naive_dt)
def test_parsedate_to_datetime(self):
    self.assertEqual(
        utils.parsedate_to_datetime(self.datestring + self.offsetstring),
        self.aware_dt)
def parsed(raw, uid, time, flags):
    # "email.message_from_bytes" uses "email.policy.compat32" policy
    # and it's by intention, because new policies don't work well
    # with real emails which have no encodings, badly formatted addresses, etc.
    orig = email.message_from_bytes(raw)
    htm, txt, files, headers, errors = parse_mime(orig, uid)
    meta = {'origin_uid': uid, 'files': [], 'errors': errors}
    if htm:
        embeds = {
            f['content-id']: f['url'] for f in files if 'content-id' in f
        }
        htm, extra_meta = html.clean(htm, embeds)
        meta.update(extra_meta)
    elif txt:
        htm = html.from_text(txt)

    meta['preview'] = preview(htm, files)
    meta['files'] = files

    fields = (
        ('From', 1), ('Sender', 1),
        ('Reply-To', 0), ('To', 0), ('CC', 0), ('BCC', 0)
    )
    for n, one in fields:
        v = headers.get(n)
        if not v:
            continue
        v = addresses(v)
        meta[n.lower()] = v[0] if one else v

    subj = headers['Subject']
    meta['subject'] = str(subj).strip() if subj else ''

    refs = orig['references']
    refs = [i.strip().lower() for i in refs.split()] if refs else []
    parent = refs[-1] if refs else None
    in_reply_to = orig['in-reply-to'] and normalize_msgid(orig['in-reply-to'])
    if in_reply_to:
        parent = in_reply_to
        if not refs:
            refs = [in_reply_to]
    meta['parent'] = parent

    mid = orig['message-id']
    if mid is None:
        log.info('UID=%s has no "Message-ID" header', uid)
        mid = '<mailur@noid>'
    else:
        mid = normalize_msgid(mid)
    meta['msgid'] = mid

    arrived = dt.datetime.strptime(time.strip('"'), '%d-%b-%Y %H:%M:%S %z')
    meta['arrived'] = int(arrived.timestamp())

    date = orig['date']
    try:
        date = date and int(parsedate_to_datetime(date).timestamp())
    except Exception as e:
        meta['errors'].append('error on date: val=%r err=%r' % (date, e))
        log.error('UID=%s can\'t parse date: val=%r err=%r', uid, date, e)
        date = None
    meta['date'] = date or meta['arrived']

    msg = new()
    msg.add_header('X-UID', '<%s>' % uid)
    msg.add_header('Message-ID', mid)
    msg.add_header('Subject', meta['subject'])
    msg.add_header('Date', orig['Date'])

    for n, v in headers.items():
        if n in msg:
            continue
        msg.add_header(n, v)

    is_draft = '\\Draft' in flags
    if is_draft:
        draft_id = orig['X-Draft-ID'] or mid
        msg.add_header('X-Draft-ID', draft_id)
        meta['draft_id'] = draft_id
        txt = parse_draft(orig)[0]
    elif orig['X-Draft-ID']:
        msg.add_header('X-Draft-ID', orig['X-Draft-ID'])

    thrid = None
    if not is_draft:
        addrs = [msg['from'] or msg['sender'], msg['to']]
        addrs = (a for a in addrs if a)
        addrs = ','.join(sorted(
            '"%s" <%s>' % (n, a) if n else a
            for n, a in getaddresses(addrs)
        ))
        addrs_n_subj = ' '.join(i for i in (addrs, subj) if i)
        thrid = hashlib.md5(addrs_n_subj.encode()).hexdigest()
        thrid = '<*****@*****.**>' % thrid

    thrid = ' '.join(i for i in (thrid, orig['X-Thread-ID']) if i)
    if thrid:
        meta['thrid'] = thrid
        msg.add_header('X-Thread-ID', thrid)
        refs.insert(0, thrid)

    if refs:
        msg.add_header('In-Reply-To', refs[-1])
        msg.add_header('References', ' '.join(refs))

    msg.make_mixed()
    meta_txt = json.dumps(meta, sort_keys=True, ensure_ascii=False, indent=2)
    msg.attach(binary(meta_txt, 'application/json'))
    body = new()
    body.make_alternative()
    body.attach(binary(htm, 'text/html'))
    if txt:
        body.attach(binary(txt))
    msg.attach(body)

    flags = []
    if meta['errors']:
        flags.append('#err')
    return msg, flags
def format_date(date):
    dt = parsedate_to_datetime(date)
    return {'timestamp': dt.timestamp(), 'date_str': dt.strftime('%Y-%m-%d %H:%M')}