Example #1
def fetch_plugin(old_index, entry):
    lm_map = {plugin["thread_id"]: plugin for plugin in old_index.itervalues()}
    raw = read(entry.url)
    url, name = parse_plugin_zip_url(raw)
    if url is None:
        raise ValueError("Failed to find zip file URL for entry: %s" % repr(entry))
    plugin = lm_map.get(entry.thread_id, None)

    if plugin is not None:
        # Previously downloaded plugin
        lm = datetime(*tuple(map(int, re.split(r"\D", plugin["last_modified"])))[:6])
        request = urllib2.Request(url)
        request.get_method = lambda: "HEAD"
        with closing(urllib2.urlopen(request)) as response:
            info = response.info()
        slm = datetime(*parsedate(info.get("Last-Modified"))[:6])
        if lm >= slm:
            # The previously downloaded plugin zip file is up-to-date
            update_plugin_from_entry(plugin, entry)
            return plugin

    raw, info = read(url, get_info=True)
    slm = datetime(*parsedate(info.get("Last-Modified"))[:6])
    plugin = get_plugin_info(raw)
    plugin["last_modified"] = slm.isoformat()
    plugin["file"] = "staging_%s.zip" % entry.thread_id
    plugin["size"] = len(raw)
    plugin["original_url"] = url
    update_plugin_from_entry(plugin, entry)
    with open(plugin["file"], "wb") as f:
        f.write(raw)
    return plugin
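A minimal standalone sketch of the same HEAD-request freshness check, using Python 3's urllib.request rather than urllib2; the URL and the stored timestamp are assumptions, not part of the original module:

# Hedged sketch only: checks whether a previously saved Last-Modified is still current.
import urllib.request
from datetime import datetime
from email.utils import parsedate

def is_up_to_date(url, stored_last_modified):
    # stored_last_modified: naive datetime saved when the file was last downloaded
    request = urllib.request.Request(url, method="HEAD")
    with urllib.request.urlopen(request) as response:
        header = response.headers.get("Last-Modified")
    parsed = parsedate(header) if header else None
    if parsed is None:
        return False  # no usable header: treat as stale and re-download
    return stored_last_modified >= datetime(*parsed[:6])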
Example #2
def serve_static_content(content):
    """
    This is for serving arbitrary content from sources like in-memory caches
    where we don't want to write the content to disk.

    Headers like Content-Type and Last-Modified must have already been set as needed, e.g.
    cherrypy.response.headers['Content-Type'] = 'text/css'
    so we can properly compare Last-Modified with If-Modified-Since in the request.
    If Last-Modified is not set, we fall back to 200 and return the content.
    """
    if cherrypy.request.headers.get('Pragma') == 'no-cache' or cherrypy.request.headers.get('Cache-Control') == 'no-cache':
        # Hard reload (Command-Shift-R)
        # HTTP response code will be set to 200 upstream
        cherrypy.response.body = content
    elif not cherrypy.request.headers.get('If-Modified-Since'):
        # first request
        # HTTP response code will be set to 200 upstream
        cherrypy.response.body = content
    elif not cherrypy.response.headers.get('Last-Modified'):
        # no way to see whether content is outdated - 200 to be safe
        # HTTP response code will be set to 200 upstream
        cherrypy.response.body = content
    elif parsedate(cherrypy.response.headers.get('Last-Modified')) > parsedate(cherrypy.request.headers.get('If-Modified-Since')):
        # content passed in is newer than what the browser has
        # HTTP response code will be set to 200 upstream
        cherrypy.response.body = content
    else:
        cherrypy.response.status = 304
        # unset unnecessary headers
        if cherrypy.response.headers.get('Last-Modified'):
            del cherrypy.response.headers['Last-Modified']
        if cherrypy.response.headers.get('Content-Type'):
            del cherrypy.response.headers['Content-Type']
        if cherrypy.response.headers.get('Expires'):
            del cherrypy.response.headers['Expires']
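Since email.utils.parsedate returns a 9-field time tuple (or None when parsing fails), the Last-Modified / If-Modified-Since comparison above is an ordinary tuple comparison. A small standalone sketch, with illustrative header values:

# Standalone sketch of the conditional-request comparison (no CherryPy; header values are illustrative).
from email.utils import parsedate

def is_newer(last_modified, if_modified_since):
    lm = parsedate(last_modified)
    ims = parsedate(if_modified_since)
    if lm is None or ims is None:
        return True  # cannot compare, so serve the full response to be safe
    return lm > ims

print(is_newer('Wed, 21 Oct 2015 07:28:00 GMT', 'Tue, 20 Oct 2015 07:28:00 GMT'))  # True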
Example #3
    def _update_response_headers(self, request, response, headers):
        """
        Combine all headers that were set by the different content types
        We are interested in Cache-Control, Last-Modified, Expires
        """
        from django.utils.http import http_date

        # Ideally, for the Cache-Control header, we'd want to do some intelligent
        # combining, but that's hard. Let's just collect and unique them and let
        # the client worry about that.
        cc_headers = set()
        for x in (cc.split(",") for cc in headers.get('Cache-Control', ())):
            cc_headers |= set((s.strip() for s in x))

        if len(cc_headers):
            response['Cache-Control'] = ", ".join(cc_headers)
        else:   # Default value
            response['Cache-Control'] = 'no-cache, must-revalidate'

        # Check all Last-Modified headers, choose the latest one
        lm_list = [parsedate(x) for x in headers.get('Last-Modified', ())]
        if len(lm_list) > 0:
            response['Last-Modified'] = http_date(mktime(max(lm_list)))

        # Check all Expires headers, choose the earliest one
        lm_list = [parsedate(x) for x in headers.get('Expires', ())]
        if len(lm_list) > 0:
            response['Expires'] = http_date(mktime(min(lm_list)))
Example #4
 def wsgi_serve_static(self, path, environ, start_response):
     headers = []
     resource = vfs.internal_resources[path]
     if resource.mtime:
         # unfortunately, this is usually only present when running under python 3.x...
         mtime_formatted = formatdate(resource.mtime)
         etag = self.etag(id(vfs.internal_resources), resource.mtime, path)
         if_modified = environ.get("HTTP_IF_MODIFIED_SINCE")
         if if_modified:
             if parsedate(if_modified) >= parsedate(mtime_formatted):
                 # the resource wasn't modified since last requested
                 return self.wsgi_not_modified(start_response)
         if_none = environ.get("HTTP_IF_NONE_MATCH")
         if if_none and (if_none == "*" or etag in if_none):
             return self.wsgi_not_modified(start_response)
         headers.append(("ETag", etag))
         headers.append(("Last-Modified", formatdate(resource.mtime)))
     if type(resource.data) is bytes:
         headers.append(("Content-Type", resource.mimetype))
         data = resource.data
     else:
         headers.append(("Content-Type", resource.mimetype + "; charset=utf-8"))
         data = resource.data.encode("utf-8")
     start_response("200 OK", headers)
     return [data]
Example #5
def fetch_plugin(old_index, entry):
    lm_map = {plugin['thread_id']:plugin for plugin in old_index.itervalues()}
    raw = read(entry.url)
    url, name = parse_plugin_zip_url(raw)
    plugin = lm_map.get(entry.thread_id, None)

    if plugin is not None:
        # Previously downloaded plugin
        lm = datetime(*tuple(map(int, re.split(r'\D', plugin['last_modified'])))[:6])
        request = urllib2.Request(url)
        request.get_method = lambda : 'HEAD'
        with closing(urllib2.urlopen(request)) as response:
            info = response.info()
        slm = datetime(*parsedate(info.get('Last-Modified'))[:6])
        if lm >= slm:
            # The previously downloaded plugin zip file is up-to-date
            update_plugin_from_entry(plugin, entry)
            return plugin

    raw, info = read(url, get_info=True)
    slm = datetime(*parsedate(info.get('Last-Modified'))[:6])
    plugin = get_plugin_info(raw)
    plugin['last_modified'] = slm.isoformat()
    plugin['file'] = 'staging_%s.zip' % entry.thread_id
    plugin['size'] = len(raw)
    plugin['original_url'] = url
    update_plugin_from_entry(plugin, entry)
    with open(plugin['file'], 'wb') as f:
        f.write(raw)
    return plugin
Example #6
def UrlChecker(job, param, headers):
    if "last-modified" in headers:
        last_update = eut.parsedate(headers["last-modified"])
    else:
        last_update = eut.parsedate(strftime("%a, %d %b %Y %H:%M:%S GMT",
                                             gmtime()))
    db = NoSQL(param["database"]["engine"],
               {"host": param["database"]["host"],
                "port": param["database"]["port"],
                "db": param["database"]["db"]["urlcache"]})
    result_str = db.get(job.identifier)
    # For a URL that has never been cached before or was evicted by LRU
    if result_str is None:
        result = {
            "last-modified": last_update,
            "url": job.url,
        }
        # TODO: shouldn't pickle at this level
        db.set(job.identifier, pk.dumps(result))
        return False, result
    result = pk.loads(result_str)
    # For a URL that is not cached but has the same identifier
    if result["url"] != job.url:
        result["url"] = job.url
        result["last-modified"] = last_update
        db.set(job.identifier, pk.dumps(result))
        return False, result
    cached_date = result["last-modified"]
    hour_diff = (mktime(last_update) - mktime(cached_date))/3600
    if hour_diff >= param["crawlperiod"]:
        result["last-modified"] = last_update
        db.set(job.identifier, pk.dumps(result))
        return False, result
    return True, result
Example #7
 def wsgi_serve_static(self, path: str, environ: Dict[str, Any], start_response: WsgiStartResponseType) -> Iterable[bytes]:
     headers = []
     resource = vfs.internal_resources[path]
     if resource.mtime:
         mtime_formatted = formatdate(resource.mtime)
         etag = self.etag(id(vfs.internal_resources), resource.mtime, path)
         if_modified = environ.get('HTTP_IF_MODIFIED_SINCE')
         if if_modified:
             if parsedate(if_modified) >= parsedate(mtime_formatted):        # type: ignore
                 # the resource wasn't modified since last requested
                 return self.wsgi_not_modified(start_response)
         if_none = environ.get('HTTP_IF_NONE_MATCH')
         if if_none and (if_none == '*' or etag in if_none):
             return self.wsgi_not_modified(start_response)
         headers.append(("ETag", etag))
         headers.append(("Last-Modified", formatdate(resource.mtime)))
     if resource.is_text:
         # text
         headers.append(('Content-Type', resource.mimetype + "; charset=utf-8"))
         data = resource.text.encode("utf-8")
     else:
         # binary
         headers.append(('Content-Type', resource.mimetype))
         data = resource.data
     start_response('200 OK', headers)
     return [data]
Example #8
    def content_loaded(self, url, response):
        if response.status_code > 399 or response.text is None:
            if response.text:
                headers = None
                if response.status_code == 404:
                    text = None
                else:
                    text = response.text.decode('rotunicode')
            else:
                text = 'Empty response.text'
                headers = response.headers

            msg = "Could not load '%s' (%s) - %s!" % (url,
                                                      response.status_code,
                                                      text)
            logging.error(msg)
            if headers is not None:
                logging.warning('Response is from cache: %s' % response.from_cache)
                logging.warning('Headers for "%s": %s' % (url, headers))
            return

        logging.debug('Content for url %s loaded.' % url)

        last_modified = None

        modified = response.headers.get('Last-Modified', None)
        if modified is not None:
            date = eut.parsedate(modified)
            if date is not None:
                last_modified = datetime(*date[:6])

        self.review_dao.last_modified = last_modified

        expires = None

        expires_key = response.headers.get('Expires', None)
        if expires_key:
            date = eut.parsedate(expires_key)
            if date is not None:
                expires = datetime(*date[:6])

        self.review_dao.expires = expires

        self._current = response

        try:
            self._current.html = lxml.html.fromstring(response.text)
        except (lxml.etree.XMLSyntaxError, lxml.etree.ParserError):
            self._current.html = None

        self.run_facters()
        self.wait_for_async_requests()

        self.run_validators()
        self.wait_for_async_requests()

        self.save_review()
Example #9
    def check_statuses(self):
        debug("In check_statuses")
        try:
            updates = reversed(self.twitter.statuses.home_timeline())
        except Exception as e:
            print("Exception while querying twitter:", file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
            return

        nextLastUpdate = self.lastUpdate
        for update in updates:
            try:
                # This part raises lots of exceptions which kill the bot
                # (Unicode errors, etc.)
                # Ignore any exceptions, as a band-aid.
                crt = parsedate(update['created_at'])
                if (crt > nextLastUpdate):
                    text = (htmlentitydecode(
                        update['text'].replace('\n', ' '))
                        .encode('utf8', 'replace'))

                    # Skip updates beginning with @
                    # TODO This would be better if we only ignored messages
                    #   to people who are not on our following list.
                    if not text.startswith(b"@"):
                        msg = "%s %s%s%s %s" %(
                            get_prefix(),
                            IRC_BOLD, update['user']['screen_name'],
                            IRC_BOLD, text.decode('utf8'))
                        self.privmsg_channels(msg)

                    nextLastUpdate = crt
            except Exception as e:
                print("Exception while sending updates:", file=sys.stderr)
                traceback.print_exc(file=sys.stderr)
                pass # don't return as this one is likely to keep happening

        self.lastUpdate = nextLastUpdate
Example #10
    def __init__(self, url, calendar, metadata):
        EventResource.__init__(self, url)
        self.calendar = calendar
        self.metadata = metadata
        self.events = None

        if not self.metadata.has_key("created") and self.metadata.has_key("date"):
            self.metadata["created"] = DateTime(parsedate(self.metadata["date"])[:7])

        if self.metadata.has_key("last-modified") and not isinstance(self.metadata["last-modified"], DateTime):
            self.metadata["last-modified"] = DateTime(parsedate(self.metadata["last-modified"])[:7])
Example #11
    def __call__(self, environ, start_response):
        """Respond to a request when called in the usual WSGI way."""
        if environ['REQUEST_METHOD'] not in ('GET', 'HEAD'):
            return self.method_not_allowed(environ, start_response)
        path_info = environ.get('PATH_INFO', '')
        full_path = self._full_path(path_info)
        """If not under root then return file not found"""
        if not self._is_under_root(full_path):
            return self.not_found(environ, start_response)
        """ if file is a directory then return moved permanently or a directory index file"""
        if path.isdir(full_path):
            if full_path[-1] != '/' or full_path == self.root:
                location = util.request_uri(environ, include_query=False) + '/'
                if environ.get('QUERY_STRING'):
                    location += '?' + environ.get('QUERY_STRING')
                headers = [('Location', location)]
                return self.moved_permanently(environ, start_response, headers)
            else:
                full_path = self._full_path(path_info + self.index_file)

        # Innocent unless proved guilty
        if_gzip = False
        # if Accept-Encoding contains gzip
        if 'gzip' in environ.get('HTTP_ACCEPT_ENCODING', ''):
            # check if gzip version exists
            if path.exists(full_path + '.gz'):
                if_gzip = True
                full_path = full_path + '.gz'
        content_type = self._guess_type(full_path)
        try:
            etag, last_modified = self._conditions(full_path, environ)
            headers = [('Date', formatdate(time.time())),
                       ('Last-Modified', last_modified),
                       ('ETag', etag)]
            if_modified = environ.get('HTTP_IF_MODIFIED_SINCE')
            if if_modified and (parsedate(if_modified)
                                >= parsedate(last_modified)):
                return self.not_modified(environ, start_response, headers)
            if_none = environ.get('HTTP_IF_NONE_MATCH')
            if if_none and (if_none == '*' or etag in if_none):
                return self.not_modified(environ, start_response, headers)
            file_like = self._file_like(full_path)
            headers.append(('Content-Type', content_type))
            if if_gzip:
                headers.append(('Content-Encoding', 'gzip'))
                headers.append(('Vary', 'Accept-Encoding'))
            start_response("200 OK", headers)
            if environ['REQUEST_METHOD'] == 'GET':
                return self._body(full_path, environ, file_like)
            else:
                return ['']
        except (IOError, OSError):
            return self.not_found(environ, start_response)
Example #12
 def lastMessageTime( self ):
     ''' Returns date of the last message in mailbox '''
     lastMsgTime = 0
     if len( self.inbox ) > 0:
         lastMsgTime = calendar.timegm(
                 parsedate(
                     sorted(
                         self.inbox.itervalues(),
                         key=lambda item: ( parsedate( item['Date'] ), item )
                     )[-1]['Date']
                 ) )
     return lastMsgTime
Example #13
    def __call__(self, environ, start_response):
        """Respond to a request when called in the usual WSGI way."""
        if environ['REQUEST_METHOD'] not in ('GET', 'HEAD'):
            headers = [('Allow', 'GET, HEAD')]
            return self.method_not_allowed(environ, start_response, headers)
        path_info = environ.get('PATH_INFO', '')
        full_path = self._full_path(path_info)
        if not self._is_under_root(full_path):
            return self.not_found(environ, start_response)
        if path.isdir(full_path):
            if full_path[-1] != '/' or full_path == self.root:
                location = util.request_uri(environ, include_query=False) + '/'
                if environ.get('QUERY_STRING'):
                    location += '?' + environ.get('QUERY_STRING')
                headers = [('Location', location)]
                return self.moved_permanently(environ, start_response, headers)
            else:
                full_path = self._full_path(path_info + self.index_file)
        prezipped = ('gzip' in environ.get('HTTP_ACCEPT_ENCODING', [])
                     and path.exists(full_path + '.gz'))
        if prezipped:
            full_path += '.gz'
        content_type = self._guess_type(full_path)
        try:
            etag, last_modified = self._conditions(full_path, environ)
            headers = [('Date', rfc822.formatdate(time.time())),
                       ('Last-Modified', last_modified),
                       ('ETag', etag)]
            if_modified = environ.get('HTTP_IF_MODIFIED_SINCE')
            if if_modified and (rfc822.parsedate(if_modified)
                                >= rfc822.parsedate(last_modified)):
                return self.not_modified(environ, start_response, headers)
            if_none = environ.get('HTTP_IF_NONE_MATCH')
            if if_none and (if_none == '*' or etag in if_none):
                return self.not_modified(environ, start_response, headers)
            file_like = self._file_like(full_path)
            headers.append(('Content-Type', content_type))
            if prezipped:
                headers.extend([('Content-Encoding', 'gzip'),
                                ('Vary', 'Accept-Encoding')])
            self._add_headers(headers, path_info, content_type)
            start_response("200 OK", headers)
            if environ['REQUEST_METHOD'] == 'GET':
                return self._body(full_path, environ, file_like)

            else:
                return [b'']
        except (IOError, OSError) as e:
            print(e)
            return self.not_found(environ, start_response)
Example #14
def _tweet_for_template(tweet, https=False):
    """Return the dict needed for tweets.html to render a tweet + replies."""
    data = json.loads(tweet.raw_json)
    parsed_date = parsedate(data['created_at'])
    date = datetime(*parsed_date[0:6])

    # Recursively fetch replies.
    if settings.CC_SHOW_REPLIES:
        # If ever slow, optimize to do fewer queries.
        replies = _get_tweets(limit=0, reply_to=tweet, https=https)
    else:
        replies = None

    if 'from_user' in data:  # For tweets collected using v1 API
        user_data = data
        from_user = data['from_user']
    else:
        user_data = data['user']
        from_user = user_data['screen_name']

    if https:
        img = bleach.clean(user_data['profile_image_url_https'])
    else:
        img = bleach.clean(user_data['profile_image_url'])

    return {'profile_img': img,
            'user': from_user,
            'text': bleach.clean(data['text']),
            'id': tweet.pk,
            'date': date,
            'reply_count': len(replies) if replies else 0,
            'replies': replies,
            'reply_to': tweet.reply_to and tweet.reply_to.pk,
            'hidden': tweet.hidden}
Example #15
    def _callback_fetch_stylesheet(self, response, subreddit):
        if not response:
            logger.error("Failed to fetch css for {}".format(subreddit))
            return

        if response.status_code != 200:
            logger.error("Failed to fetch css for {} (Status {})".format(subreddit, response.status_code))
            return

        text = response.text.encode('utf-8')
        modified_date_tuple = parsedate(response.headers['Last-Modified'])
        modified_date_timestamp = calendar.timegm(modified_date_tuple)

        css_cache_file_path = get_file_path(response.url, rootdir=self.reddit_cache )
        with self.mutex:
            if not os.path.exists(os.path.dirname(css_cache_file_path)):
                os.makedirs(os.path.dirname(css_cache_file_path))
        css_subreddit_path = path.join(self.session_cache, subreddit.lower()) + '.css'

        with open( css_cache_file_path, 'w' ) as f:
            f.write( text )

        utime(css_cache_file_path, (time.time(), modified_date_timestamp))

        os.symlink(os.path.relpath(css_cache_file_path, self.session_cache + '/'), css_subreddit_path );
Example #16
def update_releases(forced=False):
    '''Uses the sources list in DB to search for contracts'''
    sources =  db.session.query(Source).all()
    updated_sources = 0

    for source in sources:
        print source.url
        if re.match("^http", source.url):
            #TODO: With the fixture we are not testing this part which is fairly sensitive
            r = requests.head(source.url)

            # If Last-Modified is not available, we always process
            now = datetime.now()
            source_update = now
            if 'Last-Modified' in r.headers:
                source_update = datetime(*eut.parsedate(r.headers['Last-Modified'])[:6])

            if forced or source_update >= source.last_retrieve :
                load_source(source)
                updated_sources += 1
        else:
            load_source(source)
            updated_sources += 1

    if updated_sources > 0:
        compute_supplier_size()

    #Let's flush the cache
    cache.init_app(app, config={'CACHE_TYPE': 'simple'})
    with app.app_context():
        cache.clear()
Example #17
    def closest(self):
        """
        Use the HTTP Last-Modified header to determine the most recent date.

        If we cannot determine the date, we fail (maybe fall back to some weekly value instead).
        """
        if not hasattr(self, 'sid'):
            raise AttributeError('assumed task has a parameter sid, but it does not')

        url = self.config.get('nrw', 'url%s' % self.sid)

        resp = requests.head(url)
        if resp.status_code != 200:
            raise RuntimeError('%s on %s' % (resp.status_code, url))

        value = resp.headers.get('Last-Modified')
        if value is None:
            raise RuntimeError('HTTP Last-Modified header not found')

        parsed_date = eut.parsedate(value)
        if parsed_date is None:
            raise RuntimeError('could not parse Last-Modified header')

        last_modified_date = datetime.date(*parsed_date[:3])

        return last_modified_date
Example #18
	def parse(self):
		page = Page()
		text = page.download(self.url)
		e = etree.parse(StringIO.StringIO(text))
		rss = e.getroot()
		for item in rss.findall('.//item'):
			ar = {"node":"solo"}
			fields = {"title":"title", "link":"link", "summary":"description", "content":"content","author":"author","pubdate":"pubDate","page_url":"guid"}
			for k2 in fields:
				k = fields[k2]
				node = item.find(k)
				if node is not None:
					text = node.text
					if k == "description":
						text = re.sub(r'</?\w+[^>]*>','',text);
					if k == "pubDate":
						t = datetime.datetime(*eut.parsedate(text)[:6])
						text = t.strftime('%Y-%m-%d %H:%M:%S')
					ar[k2] = text
			if ar.get("author") is None:
				ar["author"] = "cnbeta.com"
			ar["type"] = "1"
			if ar.has_key("page_url"):
				m = md5.new()
				m.update(ar.get("page_url"))
				ar["reference_id"] = m.hexdigest()
			#print ar
			rowid = self.db.addItem(ar)
			print ar.get("title"), rowid
Example #19
def download(url, local, is_gzipped):
    modified_local = -1 
    modified_remote = 0

    if is_gzipped:
        url = url + '.gz'

    # get last-modified remote
    headers = urllib.urlopen(url).info().headers
    for header in headers:
        if 'Last-Modified' in header:
            # str.strip('Last-Modified: ') strips characters, not a prefix, and can eat the leading weekday
            modified_remote = time.mktime(parsedate(header.split(':', 1)[1].strip()))

    # get last-modified local (if any)
    try:             modified_local = os.path.getmtime(local)
    except os.error: modified_local = -1

    # only download if last-modified differs
    if modified_local < modified_remote:
        logger.info("Downloading " + url)

        if is_gzipped:
            urllib.urlretrieve(url, local + ".gz")

            # gunzip
            in_f = gzip.open(local + ".gz", "rb")
            out_f = open(local, 'wb')
            out_f.write(in_f.read())
            in_f.close()
            out_f.close()
        else:
            urllib.urlretrieve(url, local)

    else:
        logger.info("Not downloading " + url + " - already got " + local)
Example #20
    def _check_headers(self, headers, body, status=None):
        # check the response headers and process response body if needed.

        # 1, make sure we have all headers
        header_names = [
            'x-ots-contentmd5', 
            'x-ots-requestid', 
            'x-ots-date', 
            'x-ots-contenttype',
        ]

        if status >= 200 and status < 300:
            for name in header_names:
                if not name in headers:
                    raise OTSClientError('"%s" is missing in response header.' % name)

        # 2, check md5
        if 'x-ots-contentmd5' in headers:
            md5 = base64.b64encode(hashlib.md5(body).digest())
            if md5 != headers['x-ots-contentmd5']:
                raise OTSClientError('MD5 mismatch in response.')

        # 3, check date 
        if 'x-ots-date' in headers:
            # parsedate returns None (rather than raising) when the date is malformed
            server_time = parsedate(headers['x-ots-date'])
            if server_time is None:
                raise OTSClientError('Invalid date format in response.')
        
            # 4, check date range
            server_unix_time = calendar.timegm(server_time)
            now_unix_time = time.time()
            if abs(server_unix_time - now_unix_time) > 15 * 60:
                raise OTSClientError('The difference between date in response and system time is more than 15 minutes.')
Example #21
    def collect_frequency_data(self, message, headers=None):
        """
        Store data about frequency of message submission from sender of this
        message.  'headers', if specified, is a list of header names to store
        along with times, for use as discriminators.
        """
        user = message['From']

        date = message.get('Date')
        if date is not None:
            date = parsedate(date)
        if date is not None:
            date = datetime(*date[:6])
        else:
            date = datetime.now()

        if headers is None:
            headers = {}
        else:
            headers = dict([(name, message[name]) for name in headers])

        times = self._freq_data.get(user)
        if times is None:
            times = _FreqData()
            self._freq_data[user] = times
        times.append((date,headers))
Example #22
def _parsegmtime(timestring):
    """Return a standard time tuple (see time and calendar), for a date/time string."""
    # Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123
    try:
        return time.strptime(timestring, "%a, %d %b %Y %H:%M:%S GMT")   
    except:
        pass

    # Sunday, 06-Nov-94 08:49:37 GMT ; RFC 850, obsoleted by RFC 1036
    try:
        return time.strptime(timestring, "%A %d-%b-%y %H:%M:%S GMT")
    except:
        pass   

    # Sun Nov  6 08:49:37 1994       ; ANSI C's asctime() format  
    try:
        return time.strptime(timestring, "%a %b %d %H:%M:%S %Y")
    except:
        pass

    # Sun Nov  6 08:49:37 1994 +0100      ; ANSI C's asctime() format with timezone
    try:
        return parsedate(timestring)
    except:
        pass

    return None
Example #23
    def trigger(self):
        try:
            print "Processing email from:", self.message['From']
            e = {}
            e['From'] = [t.lower() for t in self.message['From'].strip().split(' ') if t.strip() != '' and t[0] == '<'] if 'From' in self.message else []
            e['From'] = e['From'][0]
            e['To'] =  [t.lower() for t in self.message['To'].strip().split(' ') if t.strip() != ''] if 'To' in self.message else []
            e['Cc'] =  [t.lower() for t in self.message['Cc'].strip().split(' ') if t.strip() != ''] if 'Cc' in self.message else []

            d_ = parsedate(self.message['Date'].strip())
            e['Date'] = time.mktime(d_) if d_ else None

            e['Body'] = self.get_email_body()

            f = features_for_email(e)

            est = lr.predict(f)[0]
            est = int(math.ceil(est))

            if "<*****@*****.**>" in e['To'] or "*****@*****.**" in e['To']:
                if est > 2 and est < 168:
                    print "Sending email for " + str(est) + " hour estimate to " + e['From']
                    send_email(e['From'], est, reply=self.message['Message-ID'])
                else:
                    print "Did not send email for " + str(est) + " hour estimate to " + e['From']
            else:
                print "Did not respond to " + e['From'] + " because the message was not directed to me"
        except:
            print "Error occurred - aborting..."
Example #24
def parse_from_file(email_file):
    ''' return_type: message.Message '''
    with open(email_file) as f:
        e = email.message_from_file(f)
#         print(e["Message-ID"])
        date = datetime.datetime.fromtimestamp(time.mktime(parsedate(e["Date"])))
        print(type(e["Date"]))
        print(e["Date"])
        print(type(date))
        print(date)
#         print(e["From"])
#         print(e["To"])
#         print(e["Subject"])
#         print(e["Mime-Version"])
#         print(e["Content-Type"])
#         print(e["Content-Transfer-Encoding"])
#         print(e["X-From"])
#         print(e["X-To"])
#         print(e["X-cc"])
#         print(e["X-bcc"])
#         print(e["X-Folder"])
#         print(e["X-Origin"])
#         print(e["X-FileName"])
#         print(e.get_payload())
        if e.is_multipart():
            for payload in e.get_payload():
                # if payload.is_multipart(): ...
                print payload.get_payload()
        else:
            print e.get_payload()
            print len(e.get_payload())
Example #25
 def update_headers(self, response):
     date = parsedate(response.headers['date'])
     expires = datetime(*date[:6]) + timedelta(seconds=0)
     return {
         'expires': formatdate(calendar.timegm(expires.timetuple())),
         'cache-control': 'public',
     }
Example #26
    def check_statuses(self):
        debug("In check_statuses")
        try:
            updates = reversed(self.twitter.statuses.home_timeline())
        except Exception as e:
            print("Exception while querying twitter:", file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
            return

        nextLastUpdate = self.lastUpdate
        for update in updates:
            crt = parsedate(update['created_at'])
            if (crt > nextLastUpdate):
                if 'retweeted_status' in update:
                    text = 'RT @{}: {}'.format(
                        update['retweeted_status']['user']['screen_name'],
                        htmlentitydecode(
                            update['retweeted_status']['text'].replace('\n', ' '))
                        .encode('utf8', 'replace')
                    )
                else:
                    text = (htmlentitydecode(
                        update['text'].replace('\n', ' '))
                        .encode('utf8', 'replace'))

                msg = "%s %s%s:%s %s" % (
                    get_prefix(),
                    IRC_BOLD, update['user']['screen_name'],
                    IRC_BOLD, text.decode('utf8'))
                self.privmsg_channels(msg)

                nextLastUpdate = crt

        self.lastUpdate = nextLastUpdate
Example #27
def _extract_msg_data(msg, field):
    def do_multi(data):
        try: __x = filterNone(set(data.split(field_multis[field])))
        except AttributeError: __x = data
        if hasattr(__x, '__iter__'):
            try: __x = [ i.strip() for i in __x ]
            except AttributeError:
                pass
        return __x

    field_name = field_map.get(field, field)
    __data = field_action_map.get(field_name,
                                  lambda msg: msg.get(field_name, None))(msg)
    if __data is None:
        if field not in field_multis:
            return None
        else:
            __data = []

    if field in field_multis:
        __data = do_multi(__data)
    elif field == 'sent':
        __data = datetime(*parsedate(__data)[:6])

    return __data
Example #28
def parse_status(status):
  if 'retweeted_status' in status and _user.official_retweet:
    status = status['retweeted_status']
  msg_dict = {'content': unescape(status['text']), 'id': str(status['id'])}
  if 'user' in status:
    msg_dict['username'] = status['user']['screen_name']
    Db.set_cache(status)
  elif 'sender' in status:
    msg_dict['username'] = status['sender_screen_name']
  else:
    msg_dict['username'] = ''
  if msg_dict['username'] and _user.bold_username:
    msg_dict['username'] = '******' % msg_dict['username']
  username = _user.enabled_user
  username_at = "@" + username
  short_id = None
  if username_at in msg_dict['content']:
    if _user.bold_username:
      msg_dict['content'] = msg_dict['content'].replace(username_at, '*%s*' % username_at)
  if 'user' in status:
    short_id = generate_short_id(status['id'])
  msg_dict['shortid'] = '#' + str(short_id) if short_id is not None else ''
  utc = pytz.utc
  t = parsedate(status['created_at'])[:6]
  t = datetime(*t)
  utc_dt = utc.localize(t)
  tz = pytz.timezone(_user.timezone)
  t = tz.normalize(utc_dt.astimezone(tz))
  msg_dict['time'] = t.strftime(_user.date_format.encode('UTF-8')).decode('UTF-8')
  if 'source' in status:
    source = re.match(r'<a .*>(.*)</a>', status['source'])
    msg_dict['source'] = source.group(1) if source else status['source']
  else:
    msg_dict['source'] = ''
  return Template(unicode(_user.msg_template)).safe_substitute(msg_dict)
Example #29
	def fetch_data(self):
		url = self.get_url()
		self.log.info("Fetching %s" % url)
		response = self.http_session.get(url)
		responsetime = eut.parsedate(response.headers['date'])
		responsesec = calendar.timegm(responsetime)
		self.log.info('response date: %s -> %d (%d)' % (response.headers['date'], responsesec, int(responsesec / 5)))
		doc = etree.fromstring(response.content)
		dishList = doc.xpath('/dsn/dish')
		dishes = {}
		for dish in dishList:
			dish_name, data = self.parse_dish(dish)
			dishes[dish_name] = data
		stationList = doc.xpath('/dsn/station')
		stations = {}
		for station in stationList:
			station_name, data = self.parse_station(station)
			stations[station_name] = data
		timeElem = doc.xpath('/dsn/timestamp')
		result = {
			'stations': stations,
			'dishes': dishes,
			'time': to_int(timeElem[0].text)
		}
		return result
Example #30
    def update_headers(self, resp):
        headers = resp.headers

        if 'expires' in headers:
            return {}

        if 'cache-control' in headers and headers['cache-control'] != 'public':
            return {}

        if resp.status not in self.cacheable_by_default_statuses:
            return {}

        if 'date' not in headers or 'last-modified' not in headers:
            return {}

        date = calendar.timegm(parsedate_tz(headers['date']))
        last_modified = parsedate(headers['last-modified'])
        if date is None or last_modified is None:
            return {}

        now = time.time()
        current_age = max(0, now - date)
        delta = date - calendar.timegm(last_modified)
        freshness_lifetime = max(0, min(delta / 10, 24 * 3600))
        if freshness_lifetime <= current_age:
            return {}

        expires = date + freshness_lifetime
        return {'expires': time.strftime(TIME_FMT, time.gmtime(expires))}
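The final branch above implements the common heuristic of treating a response as fresh for one tenth of its age, capped at 24 hours. A rough worked example with illustrative numbers:

# Worked example of the 10%-of-age freshness heuristic (all numbers are illustrative).
date = 1700000000                  # time the response was generated (Unix seconds)
last_modified = date - 5 * 86400   # Last-Modified five days earlier
delta = date - last_modified       # 432000 seconds
freshness_lifetime = max(0, min(delta / 10, 24 * 3600))
print(freshness_lifetime)          # 43200 seconds, i.e. 12 hours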
Example #31
def main():
    # Part 0. Prepare environment

    log = open(MFCNS_LOGFILE, 'a')
    logfd = log.fileno()
    os.dup2(logfd, STDOUT_FILENO)
    os.dup2(logfd, STDERR_FILENO)
    lprintf('MFCns_handler started')
    atexit.register(cleanup)

    # Part I. Spool dir processing

    mfc_rex = re.compile(MFC_PTRN)

    for filename in os.listdir(MFCNS_SPOOL):
        filename = os.path.join(MFCNS_SPOOL, filename)
        if not os.path.isfile(filename):
            lprintf('%s: not a file found in the spool directory', filename)
            continue

        lprintf('Processing "%s"...', filename)

        fdes = open(filename, 'r', encoding='utf-8')
        message = message_from_file(fdes)

        date = list(parsedate(message['Date']))

        fdes.seek(0, 0)
        content = fdes.readlines()
        fdes.close()

        mfc_in = -1
        for line in content:
            result = mfc_rex.match(line)
            if result == None:
                continue
            mfc_in = int(result.group('ndays'))
            measure = result.group('measr')
            if measure == None:
                pass
            elif measure[0:4] == 'week':
                mfc_in *= 7
            elif measure[0:5] == 'month':
                mfc_in *= 30
        if mfc_in < 0:
            lprintf('%s: doesn\'t look like a MFC notification request',
                    filename)
            continue

        date[3] = date[4] = date[5] = 0
        timestamp = time.mktime(tuple(date))
        timestamp += mfc_in * SECSADAY
        date = time.localtime(timestamp)
        strdate = '%d%02d%02d' % tuple(date[0:3])

        destdir = os.path.join(MFCNS_QUEUE, strdate)
        if not os.path.exists(destdir):
            os.mkdir(destdir)
        if not os.path.isdir(destdir):
            raise IOError(errno.ENOTDIR, 'Not a directory', destdir)

        os.rename(filename, os.path.join(destdir, os.path.basename(filename)))

    # Part II. Queue processing

    timestamp = time.time()
    cdate = time.localtime(timestamp)
    today = int('%d%02d%02d' % tuple(cdate[0:3]))
    mfc_tral_rex = re.compile(MFC_TRAL)
    do_sleep = 0

    for dname in os.listdir(MFCNS_QUEUE):
        fdir = os.path.join(MFCNS_QUEUE, dname)
        if not (os.path.isdir(fdir) and len(dname) == 8
                and int(dname) <= today):
            continue

        for filename in os.listdir(fdir):
            if do_sleep == 1:
                time.sleep(SENDBREAK)
            filename = os.path.join(fdir, filename)
            if not os.path.isfile(filename):
                lprintf('%s: not a file found in the queue directory',
                        filename)
                continue

            lprintf('Processing "%s"...', filename)

            fdes = open(filename, 'r', encoding='utf-8')
            message = message_from_file(fdes)
            to = parseaddr(message['From'])
            subject = message['Subject']
            branch = message.get('X-FreeBSD-CVS-Branch', None)
            if branch == None:
                branch = message['X-SVN-Group']
            fdes.seek(0, 0)
            content = fdes.readlines()
            fdes.close()

            i = 0
            for line in content:
                result = mfc_tral_rex.match(line)
                if result != None:
                    content = content[:i]
                    break
                i += 1

            sendnote(to, subject, branch, content)
            lprintf('MFC notification sent to "%s" <%s>', to)
            os.unlink(filename)
            do_sleep = 1

        if len(os.listdir(fdir)) == 0:
            os.rmdir(fdir)
        else:
            lprintf('%s: directory can\'t be deleted because it is not empty',
                    fdir)
Example #32
def twitter(SeriesHelper):
    try:
        for tweet in tweet_iter:
            # turn the date string into a date object that python can handle
            print(json.loads(json.dumps(tweet)))
            # lines = json.loads(tweet)
            # for line in lines:

            # print (tweet)
            tweet_id = tweet["id_str"]
            location_colored = colored(tweet["user"]["location"], "red")
            location = tweet["user"]["location"]
            # possibly_sensitive = tweet["possibly_sensitive"]
            # print (possibly_sensitive	,json.loads(json.dumps(tweet)))

            # withheld_in_countries = tweet["user"]["withheld_in_countries"]
            # if tweet["place"] != 'None':
            # 		print (tweet["place"])
            # 	place = json.loads(line)
            # 	print (place['country_code'])
            # place = json.dumps(tweet['place'])
            # print (place)
            timestamp = parsedate(tweet["created_at"])
            # now format this nicely into HH:MM:SS format
            timetext = strftime("%Y%m%d%H%M%S", timestamp)
            retweet_count = tweet["retweet_count"]
            # colour our tweet's time, user and text
            time_colored = colored(timetext, color="white", attrs=["bold"])
            user_colored = colored(tweet["user"]["screen_name"], "green")
            user = tweet["user"]["screen_name"]
            followers_count = tweet["user"]["followers_count"]
            lang = tweet["user"]["lang"]
            text = tweet["text"]
            symbols = tweet["entities"]["symbols"]
            # for line in hashtags:
            # 	print (line)
            time_zone = tweet["user"]["time_zone"]
            statuses_count = tweet["user"]["statuses_count"]
            # if 'text' in tweet["entities"]["hashtags"]:
            # 	hashtags = tweet["entities"]["hashtags"]['text']
            # 	print (hashtags)

            # replace each instance of our search terms with a highlighted version
            text_colored = pattern.sub(colored(search_term.upper(), "yellow"),
                                       text)

            # add some indenting to each line and wrap the text nicely
            indent = " " * 0
            text_colored = fill(text_colored,
                                180,
                                initial_indent=indent,
                                subsequent_indent=indent)
            # myclient.write_points(json.dump(tweet,separators=","))
            tweet_json = [{
                "measurement": "tweet",
                "tags": {
                    "lang": lang,
                    "time_zone": time_zone
                },
                "created_at": timestamp,
                "fields": {
                    "id": tweet_id,
                    "followers_count": followers_count,
                    "retweet_count": retweet_count,
                    "text": text,
                    "location": location,
                    "user": user,
                    "statuses_count": statuses_count,
                }
            }]
            # myclient.write_points(tweet_json)

            # tweet_record(id=tweet_id, created_at=timestamp, text=text, tweet='tweet')

            #    user = user,
            # now output our tweet
            # print (symbols)
            print("%s |%s| |%s| @%s |%s| [%s] %s %s" %
                  (time_colored, time_zone, location_colored, user_colored,
                   statuses_count, followers_count, lang, text_colored))
    except InfluxDBClientError as e:
        print("DB_ERROR:", 'Error %s' % e)
Example #33
def fetch_house_committee_meetings(committees, options):
    # Load any existing meetings file so we can recycle any GUIDs.
    existing_meetings = []
    output_file = output_for("house")
    if os.path.exists(output_file):
        existing_meetings = json.load(open(output_file))

    opts = dict(options)
    opts["binary"] = True
    opts["force"] = True

    meetings = []
    seen_meetings = set()

    # Scrape the committee listing page for a list of committees with scrapable events.
    committee_html = utils.download(
        "http://docs.house.gov/Committee/Committees.aspx",
        "committee_schedule/house_overview.html", options)
    for cmte in re.findall(r'<option value="(....)">', committee_html):
        if cmte not in committees:
            logging.error("Invalid committee code: " + cmte)
            continue

        # Download the feed for this committee.
        logging.info("Fetching events for committee " + cmte)
        html = utils.download(
            "http://docs.house.gov/Committee/RSS.ashx?Code=%s" % cmte,
            "committee_schedule/house_%s.xml" % cmte, opts)

        # It's not really valid?
        html = html.replace(
            "&nbsp;", " "
        )  # who likes nbsp's? convert to spaces. but otherwise, entity is not recognized.
        #print html
        # Parse and loop through the meetings listed in the committee feed.
        dom = lxml.etree.fromstring(html)

        # original start to loop
        for mtg in dom.xpath("channel/item"):

            eventurl = unicode(mtg.xpath("string(link)"))
            event_id = re.search(r"EventID=(\d+)$", eventurl)
            if not event_id: continue  # weird empty event showed up
            event_id = event_id.group(1)
            pubDate = datetime.datetime.fromtimestamp(
                mktime(parsedate(mtg.xpath("string(pubDate)"))))
            # skip old records of meetings, some of which just give error pages
            if pubDate < (datetime.datetime.now() -
                          datetime.timedelta(days=60)):
                continue

            # Events can appear in multiple committee feeds if it is a joint meeting.
            if event_id in seen_meetings:
                logging.info("Duplicated multi-committee event: " + event_id)
                continue
            seen_meetings.add(event_id)

            # this loads the xml from the page and sends the xml to parse_house_committee_meeting
            loaded = load_xml_from_page(eventurl, options, existing_meetings,
                                        committees, event_id, meetings)
            # if bad zipfile, load_xml_from_page returns False
            if loaded == False: continue

    print "[house] Found %i meetings." % len(meetings)
    return meetings
Example #34
def extract_date(email):
    date = email.get('Date')
    return parsedate(date)
Example #35
def timestamp_from_http_modtime(str_modtime, str_format=format_epoch):
	t = datetime.datetime(*parsedate(str_modtime)[:6])
	s = str(int((t - datetime.datetime(1970,1,1)).total_seconds()))
	t = datetime.datetime.fromtimestamp(time.mktime(t.timetuple()))
	f = str_format.replace(format_epoch, s)
	return t.strftime(f)
Example #36
def filter_parsedate(val):
    """Attempts to parse a date according to the rules in RFC 2822"""
    return datetime.fromtimestamp(mktime(parsedate(val)))
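A hedged usage sketch for the filter above; the input string is an illustrative RFC 2822 date:

# Usage sketch; mktime interprets the parsed tuple as local time.
from datetime import datetime
from email.utils import parsedate
from time import mktime

sample = 'Wed, 21 Oct 2015 07:28:00 GMT'
parsed = parsedate(sample)    # 9-field time tuple
print(parsed[:6], datetime.fromtimestamp(mktime(parsed)))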
Example #37
import calendar, time
import hashlib, re

url = "http://*****:*****@securityforeveryone.com"
}  #change this email with your registered wowonder email address
req = requests.post(url + "requests.php?f=recover",
                    headers=myheaders,
                    data=recoverdata)
b = eut.parsedate(req.headers["Date"])
respepoch = calendar.timegm(
    time.strptime(
        "{0}-{1}-{2} {3}:{4}:{5}".format(b[0], b[1], b[2], b[3], b[4], b[5]),
        '%Y-%m-%d %H:%M:%S'))

for token in range(111, 1000):
    str2hash = "{0}{1}".format(token, respepoch)
    email_code = hashlib.md5(str2hash.encode()).hexdigest()

    req_reset = requests.get(
        url + "index.php?link1=reset-password&code=1_{0}".format(email_code))
    if len(re.findall("New password", req_reset.text)) == 1:
        print(email_code)
        resetdata = {"password": "******", "id": "1_" + email_code}
        reqtoken = requests.post(url + "requests.php?f=reset_password",
Example #38
def parse_date(s):
    if s is None:
        return datetime.now()
    return datetime(*parsedate(s)[:6])
Example #39
def parser(data):
    items = []
    l = []
    match = re.compile('<item>(.+?)</item>', re.DOTALL).findall(data)
    for item in match:
        thumb = ''
        plot = ''
        title = re.compile('<title>(.+?)</title>', re.DOTALL).findall(item)[0]
        pubDate = re.compile('<pubDate>(.+?)</pubDate>',
                             re.DOTALL).findall(item)[0]
        description = re.compile('<description>(.+?)</description>',
                                 re.DOTALL).findall(item)[0]
        if '<category>' in item:
            category = cleanTitle(
                re.compile('<category>(.+?)</category>',
                           re.DOTALL).findall(item)[-1])
        else:
            category = ''
        if 'img src="' in description:
            thumb = re.compile('img src="(.+?)"',
                               re.DOTALL).findall(description)[0]
        infos = re.compile('&lt;p&gt;(.*?)&lt;/p&gt;',
                           re.DOTALL).findall(description)
        if len(infos) >= 4:
            d = {}

            if infos[1] == '' or infos[1].endswith('...') and len(
                    infos[1]) < len(title):
                plot = title + '\n\n' + infos[2]
            else:
                plot = infos[1].replace('\n', '') + '\n\n' + infos[2]
            link = re.compile('<link>(.+?)</link>', re.DOTALL).findall(item)[0]
            try:
                tmp = link.split('/')[4]
                tmp = tmp.lower()
                if 'Video-Podcast' in link or tmp.endswith(
                        'audio') or tmp.endswith('radio'):
                    continue
            except:
                pass
            documentId = link.split('documentId=')[1]
            if '&' in documentId:
                documentId = documentId.split('&')[0]
            split = infos[2].split('|')
            runtime = 0
            for part in split:
                if 'Min' in part or 'min' in part:
                    runtime = runtimeToInt(part)
                    if runtime:
                        d['duration'] = str(runtime)
                channel = part[1:]  # ugly
            if runtime > 0:

                bcastId = link.split('bcastId=')[1]
                if '&' in bcastId:
                    bcastId = bcastId.split('&')[0]
                #fanart = bcast2thumb.getThumb(bcastId)
                # if fanart:
                #	d['fanart'] = fanart
                # else:
                #	print 'bcastid not in archive '+bcastId
                #	print title
                d['_name'] = title
                d['url'] = link.replace('&amp;', '&')
                #d["epoch"] = int(time.mktime(time.strptime(pubDate, '%D, %d %M %Y %H:%i:%s %O')))#
                d["_epoch"] = int(time.mktime(parsedate(pubDate)))
                d["documentId"] = d['url'].split("documentId=")[-1].split(
                    "&")[0]
                d['_thumb'] = thumb
                d['_plot'] = plot
                d['_channel'] = channel
                d['_type'] = 'video'
                d['mode'] = 'libArdPlay'
                l.append(d)

    return l
Example #40
def parse_date(str_date):
    return datetime(*(parsedate(str_date)[:6]))
Example #41
              access_secret,
              signature_type='query')
stream = TwitterStream(auth=auth, secure=True)

# iterate over tweets matching this filter text
# IMPORTANT! this is not quite the same as a standard twitter search
tweet_iter = stream.statuses.filter(track=search_term)

pattern = re.compile("%s" % search_term, re.IGNORECASE)

for tweet in tweet_iter:
    # check whether this is a valid tweet
    if tweet.get('text'):

        # turn the date string into a date object that python can handle
        timestamp = parsedate(tweet["created_at"])
        # now format this nicely into HH:MM:SS format
        timetext = strftime("%H:%M:%S", timestamp)

        # colour our tweet's time, user and text
        time_colored = colored(timetext, color="white", attrs=["bold"])
        user_colored = colored(tweet["user"]["screen_name"], "green")
        text_colored = tweet["text"]
        # replace each instance of our search terms with a highlighted version
        text_colored = pattern.sub(colored(search_term.upper(), "yellow"),
                                   text_colored)

        # add some indenting to each line and wrap the text nicely
        indent = " " * 11
        text_colored = fill(text_colored,
                            80,
Example #42
def rfc822_parsedate(v):
    from email.utils import parsedate
    return datetime.datetime.fromtimestamp(time.mktime(parsedate(v)))
Example #43
 def parseHttpTime(timeStr):
     return time.mktime(eut.parsedate(timeStr))
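HTTP dates are expressed in GMT, and time.mktime assumes local time, so the result above is shifted by the local UTC offset. A hedged UTC-correct variant (the helper name is an assumption, not part of the original class):

# Sketch of a UTC-based alternative using calendar.timegm.
import calendar
from email.utils import parsedate as _parsedate

def parseHttpTimeUtc(timeStr):
    parsed = _parsedate(timeStr)
    if parsed is None:
        return None
    return calendar.timegm(parsed)  # epoch seconds, treating the tuple as GMT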
Example #44
def timerfc2822(s):
    d = parsedate(s)
    return "{:04}-{:02}-{:02} {:02}:{:02}:{:02}".format(d[0],d[1],d[2],d[3],d[4],d[5])
Example #45
    def _fetch_archive(name, archive_type):
        """Fetches a blueprint archive from S3.

        Args:
            name (str): The name of the blueprint.
            archive_type (str): The type of the archive. Can be 'app' or 'kb'.

        Returns:
            str: The path of the local archive after it is downloaded.

        Raises:
            EnvironmentError: When AWS credentials are not available
        """
        cache_dir = path.get_cached_blueprint_path(name)
        try:
            os.makedirs(cache_dir)
        except (OSError, IOError):
            # dir already exists -- no worries
            pass

        filename = {
            "app": BLUEPRINT_APP_ARCHIVE,
            "kb": BLUEPRINT_KB_ARCHIVE
        }.get(archive_type)

        local_archive = os.path.join(cache_dir, filename)
        remote_url = BLUEPRINT_URL.format(mindmeld_url=BLUEPRINTS_URL,
                                          blueprint=name,
                                          filename=filename)

        res = requests.head(remote_url)
        if res.status_code == 401:
            # authentication error
            msg = (
                "Invalid MindMeld credentials. Cannot download blueprint. Please confirm "
                "they are correct and try again.")
            logger.error(msg)
            raise EnvironmentError(msg)
        if res.status_code != 200:
            # Unknown error
            msg = "Unknown error fetching {} archive from {!r}".format(
                archive_type, remote_url)
            logger.warning(msg)
            raise ValueError("Unknown error fetching archive")
        remote_modified = datetime.datetime(*parsedate(
            res.headers.get("last-modified"))[:6],
                                            tzinfo=tz.tzutc())
        try:
            local_modified = datetime.datetime.fromtimestamp(
                os.path.getmtime(local_archive), tz.tzlocal())
        except (OSError, IOError):
            # File doesn't exist, use minimum possible time
            local_modified = datetime.datetime(datetime.MINYEAR,
                                               1,
                                               1,
                                               tzinfo=tz.tzutc())

        if remote_modified < local_modified:
            logger.info("Using cached %r %s archive", name, archive_type)
        else:
            logger.info("Fetching %s archive from %r", archive_type,
                        remote_url)
            res = requests.get(remote_url, stream=True)
            if res.status_code == 200:
                with open(local_archive, "wb") as file_pointer:
                    res.raw.decode_content = True
                    shutil.copyfileobj(res.raw, file_pointer)
        return local_archive
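The freshness check above boils down to comparing the remote Last-Modified header with the local file's mtime. A stripped-down sketch of that comparison in epoch seconds (remote_url and local_path are placeholder names, not part of the blueprint API):

import calendar
import os
import requests
from email.utils import parsedate

def remote_is_newer(remote_url, local_path):
    res = requests.head(remote_url)
    res.raise_for_status()
    # Compare both sides as epoch seconds, which sidesteps naive/aware datetime mixing.
    remote_ts = calendar.timegm(parsedate(res.headers["Last-Modified"]))
    try:
        local_ts = os.path.getmtime(local_path)
    except OSError:
        return True  # no cached copy yet
    return remote_ts > local_ts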
Beispiel #46
0
def parse_datetime(string):
    if settings.USE_TZ:
        return datetime(*(parsedate(string)[:6]), tzinfo=current_timezone)
    else:
        return datetime(*(parsedate(string)[:6]))
Beispiel #47
0
 def file_not_modified(self, static_file, environ):
     try:
         last_requested = environ['HTTP_IF_MODIFIED_SINCE']
     except KeyError:
         return False
     return parsedate(last_requested) >= static_file.last_modified
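parsedate() returns None for a missing or malformed date, and on Python 3 comparing None against a tuple raises TypeError. A defensive variant (a sketch; static_file.last_modified_parsed is assumed to hold the parsed tuple, as set up in Beispiel #56):

from email.utils import parsedate

def file_not_modified(static_file, environ):
    last_requested = environ.get('HTTP_IF_MODIFIED_SINCE')
    if not last_requested:
        return False
    parsed = parsedate(last_requested)
    if parsed is None:
        # Malformed If-Modified-Since header: serve the full response.
        return False
    return parsed >= static_file.last_modified_parsed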
Beispiel #48
0
async def handle_request(request, exception):
    start_time = time.time()
    format = 'html'
    url = request.path
    headers = dict()
    if url.startswith('/http'):
        url = url[1:]
    elif url.startswith('/html/http'):
        url = url[6:]
    elif url.startswith('/mhtml/http'):
        format = 'mhtml'
        url = url[7:]
    elif url.startswith('/pdf/http'):
        format = 'pdf'
        url = url[5:]
    elif url.startswith('/jpeg/http'):
        format = 'jpeg'
        url = url[6:]
    elif url.startswith('/png/http'):
        format = 'png'
        url = url[5:]
    if request.query_string:
        url = url + '?' + request.query_string
    parsed_url = urlparse(url)

    if not parsed_url.hostname:
        return response.text('Bad Request', status=400)

    if ALLOWED_DOMAINS:
        if parsed_url.hostname not in ALLOWED_DOMAINS:
            return response.text('Forbidden', status=403)

    skip_cache = request.method == 'POST'
    if not skip_cache:
        try:
            data = await cache.get(url, format)
            modified_since = await cache.modified_since(url) or time.time()
            headers['Last-Modified'] = formatdate(modified_since, usegmt=True)

            try:
                if_modified_since = parsedate(
                    request.headers.get('If-Modified-Since'))
                if_modified_since = time.mktime(if_modified_since)
            except TypeError:
                if_modified_since = 0

            if modified_since and if_modified_since >= modified_since:
                logger.info('Got 304 for %s in cache in %dms', url,
                            int((time.time() - start_time) * 1000))
                return response.text('', status=304, headers=headers)

            if data is not None:
                headers['X-Prerender-Cache'] = 'hit'
                logger.info('Got 200 for %s in cache in %dms', url,
                            int((time.time() - start_time) * 1000))
                if format == 'html':
                    return response.html(apply_filters(data.decode('utf-8'),
                                                       HTML_FILTERS),
                                         headers=headers)
                return response.raw(data, headers=headers)
        except Exception:
            logger.exception('Error reading cache')
            if sentry:
                sentry.captureException()

    if CONCURRENCY <= 0:
        # Read from cache only
        logger.warning('Got 502 for %s in %dms, prerender unavailable', url,
                       int((time.time() - start_time) * 1000))
        return response.text('Bad Gateway', status=502)

    try:
        if _ENABLE_CB:
            user_agent = request.headers.get('user-agent', '')
            _os, browser = httpagentparser.simple_detect(user_agent)
            breaker = _BREAKERS[browser]
            data, status_code = await breaker.run(
                lambda: _render(request.app.prerender, url, format))
        else:
            data, status_code = await _render(request.app.prerender, url,
                                              format)
        headers.update({
            'X-Prerender-Cache': 'miss',
            'Last-Modified': formatdate(usegmt=True)
        })
        logger.info('Got %d for %s in %dms', status_code, url,
                    int((time.time() - start_time) * 1000))
        if format == 'html':
            if 200 <= status_code < 300:
                executor.submit(_save_to_cache, url, data.encode('utf-8'),
                                format)
            return response.html(apply_filters(data, HTML_FILTERS),
                                 headers=headers,
                                 status=status_code)
        if 200 <= status_code < 300:
            executor.submit(_save_to_cache, url, data, format)
        return response.raw(data, headers=headers, status=status_code)
    except (asyncio.TimeoutError, asyncio.CancelledError,
            TemporaryBrowserFailure, RetriesExhausted):
        logger.warning('Got 504 for %s in %dms', url,
                       int((time.time() - start_time) * 1000))
        return response.text('Gateway timeout', status=504)
    except TooManyResponseError:
        logger.warning('Too many response error for %s in %dms', url,
                       int((time.time() - start_time) * 1000))
        return response.text('Service unavailable', status=503)
    except CircuitOpen:
        logger.warning('Circuit breaker open for %s', browser)
        return response.text('Service unavailable', status=503)
    except Exception:
        logger.exception('Internal Server Error for %s in %dms', url,
                         int((time.time() - start_time) * 1000))
        if sentry:
            sentry.captureException()
        return response.text('Internal Server Error', status=500)
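The cache branch above hinges on a Last-Modified / If-Modified-Since round trip in epoch seconds. A compact sketch of just that check (names are illustrative):

import time
from email.utils import parsedate

def not_modified(if_modified_since_header, modified_since):
    """modified_since: resource timestamp as epoch seconds."""
    try:
        ims = time.mktime(parsedate(if_modified_since_header))
    except TypeError:
        # Header missing or unparseable: parsedate() returned None.
        return False
    return ims >= modified_since

# The response side emits the header with email.utils.formatdate(modified_since, usegmt=True).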
Beispiel #49
0
t = Path(tmpdir / "KFMon")
t.mkdir(parents=True, exist_ok=True)

# Start with Plato
print("\n* Creating a one-click package for Plato . . .")
# It'll be staged in its own directory
pl = Path(t / "Plato")

# Download both packages...
print("* Downloading original package")
pl_main = Path(t / "Plato.zip")
with requests.get(plato_main_url, stream=True) as r:
    if r.status_code != 200:
        raise SystemExit("Couldn't download the latest Plato release!")
    # We'll restore its mtime later...
    plato_date = mktime(parsedate(r.headers["Last-Modified"]))
    clen = int(r.headers.get("Content-Length", 0))
    wrote = 0
    with pl_main.open(mode="w+b") as f:
        with tqdm(total=clen, unit='B', unit_scale=True,
                  unit_divisor=1024) as pbar:
            for data in r.iter_content(chunk_size=DEFAULT_BUFFER_SIZE):
                written = f.write(data)
                wrote += written
                pbar.update(written)
    if clen != 0 and wrote != clen:
        raise SystemExit(
            "Wrote {} bytes to disk instead of the {} expected!".format(
                wrote, clen))
pl_scripts = Path(t / "Plato-Scripts.zip")
with requests.get(plato_scripts_url, stream=True) as r:
Beispiel #50
0
def parse_mailbox(mailbox_path, my_name, my_email, timestamp_format, use_mbox):
    if not use_mbox:
        mailbox_path = os.path.join(mailbox_path, "")
        if not os.path.isdir(mailbox_path + 'new'):
            os.mkdir(mailbox_path + 'new')
        if not os.path.isdir(mailbox_path + 'tmp'):
            os.mkdir(mailbox_path + 'tmp')

    if use_mbox:
        mbox = mailbox.mbox(mailbox_path)
    else:
        mbox = mailbox.Maildir(mailbox_path, None)
    sorted_mails = sorted(mbox, key=extract_date_mbox)

    # Sometimes thunderbird will produce mbox files with duplicate messages.
    # Keep track of all seen Message-ID's to prevent writing out duplicate
    # lines to the logs.
    seen_ids = set()

    for message in sorted_mails:
        messageobj = []

        # Very rarely (happened to me with only 1 message out of 25,000),
        # Thunderbird/GMail will produce a malformed message with a payload,
        # but no metadata. Just skip these, but print a warning so the user
        # can ensure that this is not happening too often.
        if len(message.keys()) == 0:
            print("Warning: Skipping malformed message")
            continue

        # Skip duplicates
        if message['Message-ID'] in seen_ids:
            continue
        seen_ids.add(message['Message-ID'])

        name = re.sub("Chat with ", "", message['subject'])

        payload = message.get_payload()
        if type(payload) is str:
            # We're in one of the new hybrid-style single-use messages

            # Some (but not all) of these messages use quoted-printable
            # encoding (which uses = as an escape character).
            # The remainder are encoded with 7bit ASCII, which must not
            # be decoded, because treating = as an escape causes havoc.
            if message['Content-Transfer-Encoding'] == 'quoted-printable':
                payload = quopri.decodestring(payload)
                payload = payload.decode('utf-8')
            payload = payload.strip()
            to_name = re.sub(" <[^>]*>", "", message.get('To'))
            from_name = re.sub(" <[^>]*>", "", message.get('From'))
            if not name:
                name = to_name if to_name != my_name else from_name
            rawtimestr = message.get('Date')
            timestamp = time.strftime(timestamp_format, parsedate(rawtimestr))

            pars = HTMLParser.HTMLParser()
            outline = "%s <%s> %s\n" % (timestamp, from_name,
                                        pars.unescape(payload))
            messageobj.append(outline.encode('utf-8'))
        else:
            # We're in an old Google Talk Jabber conversation message

            payload = payload[0].as_string()
            # Seemingly all of these messages use quoted-printable encoding,
            # even though 'Content-Transfer-Encoding' is never set.
            payload = quopri.decodestring(payload)
            payload = payload.decode('utf-8')
            # The emails have a couple of chaff lines before the XML starts
            payload = re.sub(r'^[^<]*<', "<", payload)

            chatxml = xml.dom.minidom.parseString(payload.encode('utf-8'))

            for messagexml in chatxml.getElementsByTagName("cli:message"):
                speaker = messagexml.getAttribute("from")
                rawtimestr = messagexml.getElementsByTagName(
                    "time")[0].getAttribute("ms")
                timefloat = float(rawtimestr[:-3] + "." + rawtimestr[-3:])
                timestamp = time.strftime(timestamp_format,
                                          time.localtime(timefloat))
                try:
                    content = messagexml.getElementsByTagName(
                        "cli:body")[0].firstChild.data
                except AttributeError:
                    # No 'data' element means that it's an empty message
                    content = ""
                except IndexError:
                    # No "cli:body" elements means that it's a non-message event,
                    # like a time-gap or user-unavailable message
                    continue
                outline = "%s <%s> %s\n" % (timestamp, speaker, content)
                messageobj.append(outline.encode('utf-8'))

        write_to_file("%s.txt" % filename_sanitize(name)[:250], messageobj)
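The comments above describe conditional quoted-printable decoding. A small Python 3 adaptation of that step (the original snippet is Python 2 flavored; the payload is encoded to bytes first because quopri expects bytes):

import quopri

def decode_hybrid_payload(message):
    payload = message.get_payload()
    if message.get('Content-Transfer-Encoding') == 'quoted-printable':
        # '=' is the quoted-printable escape character; only decode when the
        # header says so, otherwise plain 7bit text would be mangled.
        payload = quopri.decodestring(payload.encode('ascii', 'replace')).decode('utf-8')
    return payload.strip()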
Beispiel #51
0
def run(environ, start_response):

    #    start_response('404 Not Found', [('Content-Type', 'text/html; charset=UTF-8')])
    #    return [str(environ)]

    path = environ['PATH_INFO'][1:].split('/')
    use_gzip = False
    try:
        if 'gzip' in environ['HTTP_ACCEPT_ENCODING'].split(','):
            use_gzip = True
    except KeyError:
        pass

    path0 = path[0]
    if path0 == '' and environ['PATH_INFO'][0] == '/':
        path0 = 'index.html'

    if path0 in arquivos:
        arquivo = arquivos[path0]

        if 'uncompressed_length' not in arquivo:
            fname = path0
            fp = open(fname, 'rb')
            arquivo['uncompressed_data'] = fp.read()
            fp.close()
            arquivo['uncompressed_length'] = str(os.path.getsize(fname))
            arquivo['last_modified_time'] = os.path.getmtime(fname)
            arquivo['last_modified_str'] = formatdate(
                arquivo['last_modified_time'], False, True)

        try:
            since_time = calendar.timegm(
                parsedate(environ['HTTP_IF_MODIFIED_SINCE']))
            if arquivo['last_modified_time'] <= since_time:
                start_response('304 Not Modified', [])
                return ['']
        except KeyError:
            pass

        content_length = arquivo['uncompressed_length']
        content = arquivo['uncompressed_data']
        content_encoding = None

        if use_gzip:
            if 'compressed_length' not in arquivo:
                fname = path0 + '.gz'
                fp = open(fname, 'rb')
                arquivo['compressed_data'] = fp.read()
                fp.close()
                arquivo['compressed_length'] = str(os.path.getsize(fname))

            content_length = arquivo['compressed_length']
            content = arquivo['compressed_data']
            content_encoding = 'gzip'

        headers = [
            ('Content-Type', arquivo['content_type']),
            #                  ('Expires', '-1'),
            ('Last-Modified', arquivo['last_modified_str']),
            ('X-Uncompressed-Content-Length', arquivo['uncompressed_length']),
            ('Content-Length', content_length),
        ]

        if content_encoding is not None:
            headers.append(('Content-Encoding', content_encoding))

        start_response('200 OK', headers)
        return [content]

    elif path0 == 'load2.cgi':
        fname = encoded_fname(environ)
        data = None
        headers = [('Content-Type', 'application/json'), ('Expires', '-1')]
        try:
            # The files are stored gzipped. If gzip was requested, the file is
            # opened as-is and is not decoded.
            if use_gzip:
                fp = open(dados_prefix + fname + '.gz', 'rb')
                headers.append(('Content-Encoding', 'gzip'))
            else:
                fp = gzip.open(dados_prefix + fname + '.gz', 'rb')
            data = fp.read()
            fp.close()
        except IOError:
            pass
        if data is None:
            data = ''
        start_response('200 OK', headers)
        return [data]
    elif path0 == 'save2.cgi':
        fname = encoded_fname(environ)
        data = environ['wsgi.input'].read()
        fp = gzip.open(dados_prefix + fname + '.gz', 'wb')
        fp.write(data)
        fp.close()
        start_response('200 OK', [('Content-Type', 'text/html'),
                                  ('Expires', '-1')])
        return ['OK']
    elif path0 == 'ping.cgi':
        content_disposition = 'attachment; filename=' + get_q(
            environ['QUERY_STRING'])
        wsgi_input = environ['wsgi.input'].read().split('\r\n')
        terminator = wsgi_input[0] + '--'
        data = []
        started = False
        for line in wsgi_input[1:]:
            if line == terminator:
                break
            if started:
                data.append(line)
            if line == '':
                started = True
        data = '\r\n'.join(data)
        start_response('200 OK', [('Content-Type', 'application/octet-stream'),
                                  ('Content-Disposition', content_disposition),
                                  ('Expires', '-1')])
        return [data]
    elif path0 == 'robots.txt':
        start_response('200 OK', [('Content-Type', 'text/plain')])
        data = "User-agent: *\nDisallow: /\n"
        return [data]

    raise IOError
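Serving pre-compressed files is the other pattern in this handler: the stored .gz bytes are shipped untouched when the client accepts gzip, and decompressed on the fly otherwise. A trimmed sketch of that branch (file layout assumed as above, names illustrative):

import gzip

def read_payload(base_path, client_accepts_gzip):
    if client_accepts_gzip:
        # Send the stored .gz bytes as-is and label them for the client.
        with open(base_path + '.gz', 'rb') as fp:
            return fp.read(), [('Content-Encoding', 'gzip')]
    # Fall back to transparent decompression for clients without gzip support.
    with gzip.open(base_path + '.gz', 'rb') as fp:
        return fp.read(), []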
Beispiel #52
0
 def get_time_remaining(self, request):
     """See if a request is static and how long it can be cached for"""
     from email.utils import parsedate
     re_max_age = re.compile(r'max-age[ ]*=[ ]*(?P<maxage>[\d]+)')
     is_static = False
     time_remaining = -1
     try:
         if 'response_headers' in request:
             content_length = self.get_header_value(
                 request['response_headers'], 'Content-Length')
             if content_length is not None:
                 content_length = int(
                     re.search(r'\d+', str(content_length)).group())
                 if content_length == 0:
                     return is_static, time_remaining
             if 'response_headers' in request:
                 content_type = self.get_header_value(
                     request['response_headers'], 'Content-Type')
                 if content_type is None or \
                         (content_type.find('/html') == -1 and \
                         content_type.find('/cache-manifest') == -1):
                     is_static = True
                     cache = self.get_header_value(
                         request['response_headers'], 'Cache-Control')
                     pragma = self.get_header_value(
                         request['response_headers'], 'Pragma')
                     expires = self.get_header_value(
                         request['response_headers'], 'Expires')
                     if cache is not None:
                         cache = cache.lower()
                         if cache.find('no-store') > -1 or cache.find(
                                 'no-cache') > -1:
                             is_static = False
                     if is_static and pragma is not None:
                         pragma = pragma.lower()
                         if pragma.find('no-cache') > -1:
                             is_static = False
                     if is_static:
                         time_remaining = 0
                         if cache is not None:
                             matches = re.search(re_max_age, cache)
                             if matches:
                                 time_remaining = int(
                                     matches.groupdict().get('maxage'))
                                 age = self.get_header_value(
                                     request['response_headers'], 'Age')
                                 if time_remaining == 0:
                                     is_static = False
                                     time_remaining = -1
                                 elif age is not None:
                                     time_remaining -= int(
                                         re.search(
                                             r'\d+',
                                             str(age).strip()).group())
                         elif expires is not None:
                             date = self.get_header_value(
                                 request['response_headers'], 'Date')
                             exp = time.mktime(parsedate(expires))
                             if date is not None:
                                 now = time.mktime(parsedate(date))
                             else:
                                 now = time.time()
                             time_remaining = int(exp - now)
                             if time_remaining < 0:
                                 is_static = False
     except Exception:
         pass
     return is_static, time_remaining
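Condensed, the freshness window computed above is max-age minus Age when Cache-Control is present, otherwise Expires minus Date. A standalone sketch over raw header strings (illustrative names):

import re
import time
from email.utils import parsedate

def freshness_seconds(cache_control, age, expires, date):
    if cache_control:
        match = re.search(r'max-age\s*=\s*(\d+)', cache_control.lower())
        if match:
            remaining = int(match.group(1))
            if age:
                remaining -= int(re.search(r'\d+', age).group())
            return remaining
    if expires:
        exp = time.mktime(parsedate(expires))
        # Prefer the origin's own Date header as "now" when it is available.
        now = time.mktime(parsedate(date)) if date else time.time()
        return int(exp - now)
    return -1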
Beispiel #53
0
def http_date_to_datetime(string):
    """
    >>> http_date_to_datetime('Thu, 26 Dec 2013 09:50:10 GMT')
    datetime.datetime(2013, 12, 26, 9, 50, 10)
    """
    return datetime.datetime(*parsedate(string)[:6])
Beispiel #54
0
def inbound(request):
    """Try to serve a 304 for resources under assets/.
    """
    uri = request.line.uri

    if not uri.startswith('/assets/'):

        # Only apply to the assets/ directory.

        return request

    if version_is_dash(request):

        # Special-case a version of '-' to never 304/404 here.

        return request

    if not version_is_available(request):

        # Don't serve one version of a file as if it were another.

        raise Response(404)

    ims = request.headers.get('If-Modified-Since')
    if not ims:

        # This client doesn't care about when the file was modified.

        return request

    if request.fs.endswith('.spt'):

        # This is a request for a dynamic resource. Perhaps in the future
        # we'll delegate to such resources to compute a sensible Last-Modified
        # or E-Tag, but for now we punt. This is okay, because we expect to
        # put our dynamic assets behind a CDN in production.

        return request

    try:
        ims = timegm(parsedate(ims))
    except:

        # Malformed If-Modified-Since header. Proceed with the request.

        return request

    last_modified = get_last_modified(request.fs)
    if ims < last_modified:

        # The file has been modified since. Serve the whole thing.

        return request

    # Huzzah!
    # =======
    # We can serve a 304! :D

    response = Response(304)
    response.headers['Last-Modified'] = format_date_time(last_modified)
    response.headers['Cache-Control'] = 'no-cache'
    raise response
Beispiel #55
0
def parse_datetime(string):
    return datetime(*(parsedate(string)[:6]))
Beispiel #56
0
 def add_last_modified_headers(self, static_file, url):
     mtime = os.stat(static_file.path).st_mtime
     last_modified = formatdate(mtime, usegmt=True)
     static_file.last_modified = last_modified
     static_file.last_modified_parsed = parsedate(last_modified)
     static_file.headers['Last-Modified'] = last_modified
Beispiel #57
0
def get_date(msg):
    if msg is not None:
        email_date = parsedate(msg.get('date'))
        return time.strptime(
            '%s-%s-%s' % (email_date[0], email_date[1], email_date[2]),
            '%Y-%m-%d')
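The same result can be built directly from the parsed tuple instead of re-parsing a formatted string; a sketch that returns a datetime.date rather than the struct_time of the original:

import datetime
from email.utils import parsedate

def get_message_date(msg):
    if msg is not None:
        parsed = parsedate(msg.get('date'))
        if parsed is not None:
            return datetime.date(parsed[0], parsed[1], parsed[2])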
Beispiel #58
0
    def get(self, urlpart):
        download = self.request.get('download', None) is not None

        # Redirect to usage page for visits from links (obviously not a browser PAC fetcher)
        if MAIN_SERVER and not download and 'Referer' in self.request.headers:
            self.redirect("/usage?u=" + urlpart, permanent=False)
            return

        if not self.parseRequest(urlpart):
            self.error(404)
            return

        rules = RuleList.getList('gfwlist')
        if rules is None:
            self.error(500)
            return

        pacTime = formatdate(
            timegm(
                max(self.settingTime,
                    datetime(*parsedate(rules.date)[:6])).timetuple()), False,
            True)
        self.response.headers['ETag'] = '"' + pacTime.replace(',', '').replace(
            ' ', '') + '"'
        self.lastModified(pacTime)

        # Load balance
        if MAIN_SERVER and len(
                self.customRules) <= MAX_CUSTOM_RULE_NUMBER_FOR_MIRROR:
            mirror = self.pickMirror()
            if mirror:
                query = ['e=' + urlsafe_b64encode(r) for r in self.customRules]
                if download: query.append('download')
                mirror = '%s/%s?%s' % (mirror, self.proxyDict['urlpart'],
                                       '&'.join(query))
                logging.debug('Redirect the PAC fetcher to %s', mirror)
                if not DEBUG:
                    # A fixed server for a rate-limiting cycle
                    self.response.headers[
                        'Cache-Control'] = 'public,max-age=%d' % (
                            RATELIMIT_DURATION * 3600)
                    self.redirect(mirror, permanent=False)
                    return

        if RATELIMIT_ENABLED and self.isRateLimited(): return

        customJs = autoproxy2pac.rule2js('\n'.join([''] + self.customRules))
        if self.proxyDict['name'] == 'privoxy':
            customJs = privoxyConfCode + customJs
        configs = {
            'proxyString': self.proxyString,
            'defaultString': 'DIRECT',
            'customCodePre': customJs,
        }
        pac = autoproxy2pac.generatePac(rules.toDict(), configs,
                                        autoproxy2pac.defaultPacTemplate)
        import base64
        pac = '''function decode64(_1){var _2="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";var _3="";var _4,_5,_6;var _7,_8,_9,_a;var i=0;_1=_1.replace(/[^A-Za-z0-9\+\/\=]/g,"");do{_7=_2.indexOf(_1.charAt(i++));_8=_2.indexOf(_1.charAt(i++));_9=_2.indexOf(_1.charAt(i++));_a=_2.indexOf(_1.charAt(i++));_4=(_7<<2)|(_8>>4);_5=((_8&15)<<4)|(_9>>2);_6=((_9&3)<<6)|_a;_3=_3+String.fromCharCode(_4);if(_9!=64){_3=_3+String.fromCharCode(_5);}if(_a!=64){_3=_3+String.fromCharCode(_6);}}while(i<_1.length);return _3;}eval(decode64("%s"))''' % base64.b64encode(
            pac)

        self.response.headers[
            'Content-Type'] = 'application/x-ns-proxy-autoconfig'
        if download:
            self.response.headers[
                'Content-Disposition'] = 'attachment; filename="autoproxy.pac"'
        self.response.out.write(pac)
Beispiel #59
0
 def _parse_date(str):
     if str is None:
         return datetime.now()
     return datetime(*parsedate(str)[:6])
Beispiel #60
0
def main():
    fp = open("/tmp/mail.log", "a")
    #fp.write("The file is " + sys.argv[1] + "\n")
    try:
        with open(sys.argv[1], 'rU') as email_fp:
            msg = email.message_from_file(email_fp)
    except Exception as errMess:
        fp.write("Failed to read e-mail message: " + str(errMess) + "\n")
        sys.exit("Failed to read e-mail message")
    raw_date = msg.get('Date', msg.get('Resent-Date', None))
    addr_return_path = msg.get('Return-path', None)
    addr_reply_to = msg.get('Reply-to', None)
    addr_to = msg.get('Envelope-to', None)
    addr_from = msg.get('From', msg.get('Sender', None))
    subject = msg.get('Subject', None)
    fp.write("Message to " + str(addr_to) + "\n")
    #fp.write("From was " + str(addr_from) + "\n")
    #fp.write("Subject was " + str(subject) + "\n")
    to_recipients = list()
    for recipient in getaddresses(msg.get_all('to', []) + msg.get_all('resent-to', [])):
        to_recipients.append(dict(name=recipient[0], address=recipient[1]))
    cc_recipients = list()
    for recipient in getaddresses(msg.get_all('cc', []) + msg.get_all('resent-cc', [])):
        cc_recipients.append(dict(name=recipient[0], address=recipient[1]))
    recipients = list()
    for recipient in getaddresses(msg.get_all('to', []) + msg.get_all('cc', []) + msg.get_all('resent-to', []) + msg.get_all('resent-cc', [])):
        recipients.append(dict(name=recipient[0], address=recipient[1]))
    if addr_to is None and len(recipients):
        addr_to = recipients[0]['address']
    #fp.write("recipients are " + str(recipients) + "\n")
    if addr_to is not None:
        #fp.write("parsed envelope-to: " + str(parseaddr(addr_to)) + "\n")
        short_code = re.sub(r'@.*', '', parseaddr(addr_to)[1])
    else:
        short_code = None
    #fp.write("short code is " + str(short_code) + "\n")
    record = db.session.query(Shortener).filter_by(short=short_code).first()
    if record is None:
        fp.write("short code not found\n")
        sys.exit("short code not found")
        #fp.write("short code found\n")
    #file_number = get_new_file_number(record.uid, 'email', yaml_file_name=record.filename)
    ##fp.write("file number is " + str(file_number) + "\n")
    #saved_file_email = SavedFile(file_number, fix=True)
    if addr_from is not None:
        #fp.write("parsed from: " + str(parseaddr(addr_from)[1]) + "\n")
        addr_from = dict(name=parseaddr(addr_from)[0], address=parseaddr(addr_from)[1])
    else:
        addr_from = dict(empty=True)
    if addr_return_path is not None:
        #fp.write("parsed return_path: " + str(parseaddr(addr_return_path)[1]) + "\n")
        addr_return_path = dict(name=parseaddr(addr_return_path)[0], address=parseaddr(addr_return_path)[1])
    else:
        addr_return_path = dict(empty=True)
    #fp.write("return_path is " + str(addr_return_path) + "\n")
    if addr_reply_to is not None:
        #fp.write("parsed reply-to: " + str(parseaddr(addr_reply_to)[1]) + "\n")
        addr_reply_to = dict(name=parseaddr(addr_reply_to)[0], address=parseaddr(addr_reply_to)[1])
        #fp.write("reply-to is " + str(addr_reply_to) + "\n")
    else:
        addr_reply_to = dict(empty=True)
    #fp.write("reply-to is " + str(addr_reply_to) + "\n")
    msg_current_time = datetime.datetime.now()
    if raw_date is not None:
        msg_date = datetime.datetime.fromtimestamp(mktime(parsedate(raw_date)))
        #fp.write("msg_date is " + str(msg_date) + "\n")
    else:
        msg_date = msg_current_time
        #fp.write("msg_date set to current time\n")
    headers = list()
    for item in msg.items():
        headers.append([item[0], item[1]])
    #fp.write("headers:\n" + json.dumps(headers) + "\n")
    
    email_record = Email(short=short_code, to_addr=json.dumps(to_recipients), cc_addr=json.dumps(cc_recipients), from_addr=json.dumps(addr_from), reply_to_addr=json.dumps(addr_reply_to), return_path_addr=json.dumps(addr_return_path), subject=subject, datetime_message=msg_date, datetime_received=msg_current_time)
    db.session.add(email_record)
    db.session.commit()

    save_attachment(record.uid, record.filename, 'headers.json', email_record.id, 0, 'application/json', 'json', json.dumps(headers))
    
    counter = 1
    for part in msg.walk():
        if part.get_content_maintype() == 'multipart':
            continue
        filename = part.get_filename()
        if part.get_content_type() == 'text/plain':
            ext = '.txt'
        else:
            ext = mimetypes.guess_extension(part.get_content_type())
        if not ext:
            ext = '.bin'
        if filename:
            filename = '%03d-%s' % (counter, secure_filename(filename))
        else:
            filename = '%03d-attachment%s' % (counter, ext)
        #fp.write("Filename is " + str(filename) + "\n")
        #fp.write("Content type is " + str(part.get_content_type()) + "\n")

        real_filename = re.sub(r'[0-9][0-9][0-9]-', r'', filename)
        real_ext = re.sub(r'^\.', r'', ext)
        save_attachment(record.uid, record.filename, real_filename, email_record.id, counter, part.get_content_type(), real_ext, part.get_payload(decode=True))
        
        counter += 1
    fp.close()
    user = None
    if record.user_id is not None:
        user = db.session.query(UserModel).filter_by(id=record.user_id).first()
    if user is None:
        user_info = dict(email=None, the_user_id='t' + str(record.temp_user_id), theid=record.temp_user_id, roles=list())
    else:
        user_info = dict(email=user.email, roles=[role.name for role in user.roles], the_user_id=user.id, theid=user.id, firstname=user.first_name, lastname=user.last_name, nickname=user.nickname, country=user.country, subdivisionfirst=user.subdivisionfirst, subdivisionsecond=user.subdivisionsecond, subdivisionthird=user.subdivisionthird, organization=user.organization)
    result = docassemble.webapp.worker.background_action.delay(record.filename, user_info, record.uid, None, 'http://localhost', 'http://localhost', dict(action='incoming_email', arguments=dict(id=email_record.id)), extra=None)