def tweet_processing(tweetFile):
  tweet_file = open(tweetFile)                        # Open the file for reading
  tweet_hash = {}                                     # Maps tweet ids to their hashtags
  tweet_id = {}                                       # Maps tweet ids to their created_at timestamps
  first_tweet = True
  latest_date = None
  print("Started reading tweets")
  for tweet_line in tweet_file:                       # Loop over every tweet in the tweets file
    tweet = json.loads(tweet_line)                    # Convert the JSON string to a dictionary
    if "entities" in tweet:                           # Check whether the entities tag is present
      hashtags = tweet["entities"]["hashtags"]        #  - if present, extract the hashtags
      if hashtags:
        if first_tweet:
          # Set latest_date to the current UTC datetime for the first tweet
          latest_date = datetime.datetime.fromtimestamp(time.mktime(datetime.datetime.now().timetuple()), pytz.utc)
          first_tweet = False
        # Extract the hashtags, clean them, and keep only those longer than one character
        tweet_hash[tweet["id"]] = remove_dups(
            ['#' + str(remove_unicode(ht['text'])).lower()
             for ht in hashtags
             if ht is not None and len(remove_unicode(ht['text'])) > 1])
        tweet_id[tweet["id"]] = [datetime.datetime.fromtimestamp(mktime_tz(parsedate_tz(tweet['created_at'])), pytz.utc), 0]
        create_graph(tweet_hash[tweet["id"]])         # Add this tweet's hashtags to the rolling graph

        for key, value in tweet_id.items():           # Evict tweets older than the 60-second window
          if (latest_date - value[0]).total_seconds() > 60:
            if len(tweet_hash[key]) >= 2:
              evict_graph(tweet_hash[key], key)

        tweet_date = datetime.datetime.fromtimestamp(mktime_tz(parsedate_tz(tweet['created_at'])), pytz.utc)
        if tweet_date >= latest_date:
          latest_date = tweet_date
  tweet_file.close()

  feature2 = open(outfile, 'w')
  for degree in rolling_degree:                       # Write the rolling average degree into the output file
    feature2.write(str(degree) + '\n')
  feature2.close()

  print("Processing is completed!!!")
Example #2
def main():
    ts = 'Fri Dec 07 16:12:48 +0100 2012'
    dt = int(mktime_tz(parsedate_tz(ts.strip())))
    print(dt)
    ts = 'Fri Dec 07 16:12:48 +0000 2012'
    dt = int(mktime_tz(parsedate_tz(ts.strip())))
    print(dt)
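The two printed timestamps differ by exactly the 3600-second zone offset; a quick self-contained check:

from email.utils import mktime_tz, parsedate_tz

a = mktime_tz(parsedate_tz('Fri Dec 07 16:12:48 +0100 2012'))
b = mktime_tz(parsedate_tz('Fri Dec 07 16:12:48 +0000 2012'))
assert b - a == 3600   # same wall-clock time, one hour apart in UTC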
Example #3
    def handle(self, **options):
        count = TLE.objects.count()

        if count:
            last_TLE = TLE.objects.first()
            last_mktime = mktime_tz(parsedate_tz(last_TLE.datetime_in_lines.ctime()))
        else:
            last_mktime = 0

        url = 'http://www.celestrak.com/NORAD/elements/stations.txt'

        resp = get(url)

        url_mktime = mktime_tz(parsedate_tz(resp.headers['last-modified']))
        url_datetime = datetime.utcfromtimestamp(url_mktime)
        if url_mktime > last_mktime:
            self.stdout.write('Date and time of creation TLE: {}'.format(url_datetime.isoformat()))
            self.stdout.write('New TLE found, downloading...')

            result = urlretrieve(url, 'TLE.txt')
            fh = open(result[0], 'r', encoding='utf8')
            lines = fh.readlines()[:3]
            fh.close()
            os.remove(result[0])
            title = lines[0].strip()
            norad_id = int(lines[1][2:7])
            sat, status = Satellite.objects.get_or_create(title=title, satellite_number=norad_id)
            try:
                self.stdout.write('Start create and save object - ' + sat.title + '\n')
                TLE.objects.bulk_create(unique_tle(lines, sat))
                self.stdout.write('Finished create and save object - ' + sat.title + '\n')
            except Exception:
                self.stdout.write('Fail create and save object - ' + sat.title + '\n')
        else:
            self.stdout.write('No new TLE. A new attempt after 5 minutes...')
Example #4
def parse_pubdate(text):
    """Parse a date string into a Unix timestamp

    >>> parse_pubdate('Fri, 21 Nov 1997 09:55:06 -0600')
    880127706

    >>> parse_pubdate('2003-12-13T00:00:00+02:00')
    1071266400

    >>> parse_pubdate('2003-12-13T18:30:02Z')
    1071340202

    >>> parse_pubdate('Mon, 02 May 1960 09:05:01 +0100')
    -305049299

    >>> parse_pubdate('')
    0

    >>> parse_pubdate('unknown')
    0
    """
    if not text:
        return 0

    parsed = parsedate_tz(text)
    if parsed is not None:
        try:
            pubtimeseconds = int(mktime_tz(parsed))
            return pubtimeseconds
        except (OverflowError, ValueError):
            logger.warning('bad pubdate %s is before epoch or after end of time (2038)', parsed)
            return 0
        
    try:
        parsed = time.strptime(text[:19], '%Y-%m-%dT%H:%M:%S')
        if parsed is not None:
            m = re.match(r'^(?:Z|([+-])([0-9]{2})[:]([0-9]{2}))$', text[19:])
            if m:
                parsed = list(iter(parsed))
                if m.group(1):
                    offset = 3600 * int(m.group(2)) + 60 * int(m.group(3))
                    if m.group(1) == '-':
                        offset = 0 - offset
                else:
                    offset = 0
                parsed.append(offset)
                return int(mktime_tz(tuple(parsed)))
            else:
                return int(time.mktime(parsed))
    except Exception:
        pass

    logger.error('Cannot parse date: %s', repr(text))
    return 0
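The ISO-8601 fallback above hand-builds the 10-tuple that mktime_tz expects (the nine struct_time fields plus a UTC offset). A minimal standalone version of that trick, reusing a value from the doctests:

import time
from email.utils import mktime_tz

parsed = time.strptime('2003-12-13T18:30:02'[:19], '%Y-%m-%dT%H:%M:%S')
# Append offset 0 because the trailing 'Z' means UTC
print(int(mktime_tz(tuple(parsed)[:9] + (0,))))   # 1071340202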
Example #5
def _retrieve_mails(uri):
    LOG.debug('Retrieving mail archive from uri: %s', uri)
    content = utils.read_uri(uri)
    if not content:
        LOG.error('Error reading mail archive from uri: %s', uri)
        return

    content = utils.gzip_decompress(content)
    LOG.debug('Mail archive is loaded, start processing')

    content += TRAILING_RECORD

    for rec in re.finditer(MAIL_BOX_PATTERN, content):
        email = rec.groupdict()
        email['author_email'] = email['author_email'].replace(' at ', '@', 1)
        if not utils.check_email_validity(email['author_email']):
            continue

        email['date'] = int(email_utils.mktime_tz(
            email_utils.parsedate_tz(email['date'])))

        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            collection = set()
            for item in re.finditer(pattern, email['body']):
                groups = item.groupdict()
                item_id = groups['id']
                if 'module' in groups:
                    item_id = groups['module'] + ':' + item_id
                    email['module'] = groups['module']
                collection.add(item_id)
            email[pattern_name] = list(collection)

        yield email
Example #6
 def fetch_photos(self, user):
     """Fetch up to 40 photos from the tumbleblog of a specified user"""
     user_cache_dir = os.path.join(self.cache_dir, user)
     if not os.path.exists(user_cache_dir):
         os.mkdir(user_cache_dir)
     feed = etree.parse(RSS_URL.format(user=user))
     for item in ITEM_XPATH(feed):
         (guid,) = GUID_XPATH(item)
         guid = guid.rsplit(':', 1)[1]
         cache_file_name = os.path.join(user_cache_dir, guid) + '.json'
         if os.path.exists(cache_file_name):
             continue
         attrs = json.loads(ATTR_XPATH(item)[0])
         if attrs['type'] != 'image':
             continue
         (datestr,) = DATE_XPATH(item)
         timestamp = int(mktime_tz(parsedate_tz(datestr)))
         image_url = attrs['url']
         image_file = image_url.rsplit('/', 1)[1]
         local_image = os.path.join(user_cache_dir, image_file)
         urlretrieve(image_url, local_image)
         details = dict(image_file=image_file, timestamp=timestamp,
                 body=attrs['body'])
          with open(cache_file_name, 'w') as descriptor:
             json.dump(details, descriptor)
Example #7
    def refresh(self, now=None):
        """
            This fairly complex and heuristic function refreshes a server
            response for replay.

                - It adjusts date, expires and last-modified headers.
                - It adjusts cookie expiration.
        """
        if not now:
            now = time.time()
        delta = now - self.timestamp_start
        refresh_headers = [
            "date",
            "expires",
            "last-modified",
        ]
        for i in refresh_headers:
            if i in self.headers:
                d = parsedate_tz(self.headers[i][0])
                if d:
                    new = mktime_tz(d) + delta
                    self.headers[i] = [formatdate(new)]
        c = []
        for i in self.headers["set-cookie"]:
            c.append(self._refresh_cookie(i, delta))
        if c:
            self.headers["set-cookie"] = c
Example #8
    def was_modified_since(self, header=None, mtime=0, size=0):
        """
        Was something modified since the user last downloaded it?

        header
          This is the value of the If-Modified-Since header.  If this is None,
          I'll just return True.

        mtime
          This is the modification time of the item we're talking about.

        size
          This is the size of the item we're talking about.
        """
        try:
            if header is None:
                raise ValueError
            matches = re.match(r"^([^;]+)(; length=([0-9]+))?$", header,
                               re.IGNORECASE)
            header_mtime = mktime_tz(parsedate_tz(matches.group(1)))
            header_len = matches.group(3)
            if header_len and int(header_len) != size:
                raise ValueError()
            if mtime > header_mtime:
                raise ValueError()
        except (AttributeError, ValueError, OverflowError):
            return True
        return False
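For reference, the If-Modified-Since format this method parses, shown standalone with a hypothetical header value:

import re
from email.utils import mktime_tz, parsedate_tz

header = 'Sat, 29 Oct 1994 19:43:31 GMT; length=4096'
matches = re.match(r"^([^;]+)(; length=([0-9]+))?$", header, re.IGNORECASE)
print(mktime_tz(parsedate_tz(matches.group(1))))   # 783459811
print(matches.group(3))                            # '4096'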
Example #9
File: stats.py Project: Debian/dak
def parse_prod(logdate):
    global stats
    global users
    maildate = ''.join([x[-2:] for x in logdate.split('-')])
    mailarchive = join(utils.get_conf()['Dir::Base'], 'mail/archive',
                       'mail-%s.xz' % maildate)
    if not isfile(mailarchive):
        return
    (fd, tmpfile) = utils.temp_filename(utils.get_conf()['Dir::TempPath'])
    system('xzcat %s > %s' % (mailarchive, tmpfile))
    for message in mbox(tmpfile):
        if (message['subject']
                and message['subject'].startswith('Comments regarding')):
            try:
                member = users[' '.join(message['From'].split()[:-1])]
            except KeyError:
                continue
            ts = mktime_tz(parsedate_tz(message['date']))
            timestamp = datetime.fromtimestamp(ts).strftime("%Y%m%d%H%M%S")
            date = parse_timestamp(timestamp)
            if date not in stats:
                stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                 'REJECT': 0, 'PROD': 0}, 'members': {}}
            if member not in stats[date]['members']:
                stats[date]['members'][member] = {'ACCEPT': 0, 'REJECT': 0,
                                                     'PROD': 0}
            if member not in stats['history']['members']:
                stats['history']['members'][member] = {'ACCEPT': 0,
                                                       'REJECT': 0, 'PROD': 0}
            stats[date]['stats']['PROD'] += 1
            stats[date]['members'][member]['PROD'] += 1
            stats['history']['stats']['PROD'] += 1
            stats['history']['members'][member]['PROD'] += 1
    unlink(tmpfile)
Example #10
def datetime_from_rfc822(value):
    '''
    Turns an RFC822 formatted date into a datetime object.

    Example::

        inputs.datetime_from_rfc822('Wed, 02 Oct 2002 08:00:00 EST')

    :param str value: The RFC822-complying string to transform
    :return: The parsed datetime
    :rtype: datetime
    :raises ValueError: if value is an invalid date literal

    '''
    raw = value
    if not time_regex.search(value):
        value = ' '.join((value, '00:00:00'))
    try:
        timetuple = parsedate_tz(value)
        timestamp = mktime_tz(timetuple)
        if timetuple[-1] is None:
            return datetime.fromtimestamp(timestamp).replace(tzinfo=pytz.utc)
        else:
            return datetime.fromtimestamp(timestamp, pytz.utc)
    except Exception:
        raise ValueError('Invalid date literal "{0}"'.format(raw))
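The `timetuple[-1] is None` branch exists because parsedate_tz only fills in the offset when the string names a zone; a standalone illustration:

from email.utils import parsedate_tz

print(parsedate_tz('Wed, 02 Oct 2002 08:00:00 EST')[-1])   # -18000 (zone applied)
print(parsedate_tz('Wed, 02 Oct 2002 08:00:00')[-1])       # None (naive input)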
Example #11
def get_mail_queue(site):
    """ Get a list of files that are still in the NEW mail_queue folder """
    join = os.path.join

    queue_path = os.environ.get('NAAYA_MAIL_QUEUE', None)
    if queue_path is None:
        return []

    mail_queue = []
    new_queue_path = join(queue_path, 'new')
    if os.path.isdir(new_queue_path):
        # Get all messages files
        messages = [join(new_queue_path, filename)
                    for filename in sorted(os.listdir(new_queue_path))]

        for message in messages:
            message_file = open(message, 'r+')
            mail = message_from_file(message_file)
            message_file.close()

            # Prepare the date to be formatted with utShowFullDateTime
            date = email_utils.parsedate_tz(mail.get('Date', ''))
            date = email_utils.mktime_tz(date)
            date = datetime.fromtimestamp(date)

            mail_queue.append({
                'subject': mail.get('Subject', '(no-subject)'),
                'content': mail.get_payload(decode=True),
                'recipients': mail.get_all('To'),
                'sender': mail.get('From'),
                'date': date,
                'filename': os.path.split(message)[-1]
            })

    return mail_queue
Example #12
 def parse_item(self, item):
     ep = Storage()
     ep.title = item.findtext('title')
     match = self.title_strip_quality.search(ep.title)
     if match:
         title_ = match.group(1)
     else:
         title_ = ep.title
     info = self.parse_title(title_)
     ep.update(info)
     ep.description = item.findtext('description')
     ep.link = item.find('enclosure').get('url')
     ep.pubdate = item.findtext('pubDate')
     ep.filename = item.findtext('title')
     ep.pubdate = datetime.datetime.utcfromtimestamp(mktime_tz(parsedate_tz(ep.pubdate)))
     ep.filterwith = ep.title
     ep.size = item.find('enclosure').get('length')
     try:
         ep.size = int(ep.size)
     except (TypeError, ValueError):
         ep.size = 0
     if ep.size < 100*1024*1024:
         ep.size = 300*1024*1024
     ep.guid = item.findtext('guid')
     return ep
Example #13
 def parse_item(self, item):
     ep = Storage()
     ep.filename = item.findtext('{http://xmlns.ezrss.it/0.1/}torrent/{http://xmlns.ezrss.it/0.1/}fileName')
     info = self.parse_title(ep.filename)
     ep.update(info)
     ep.title = item.findtext('title')
     ep.link = item.findtext('link')
     ep.description = item.findtext('description')
     ep.guid = item.findtext('guid')
     ep.pubdate = item.findtext('pubDate')
     ep.magnet = item.findtext('{http://xmlns.ezrss.it/0.1/}torrent/{http://xmlns.ezrss.it/0.1/}magnetURI')
     ep.filename = item.findtext('{http://xmlns.ezrss.it/0.1/}torrent/{http://xmlns.ezrss.it/0.1/}fileName')
     ep.pubdate = datetime.datetime.utcfromtimestamp(mktime_tz(parsedate_tz(ep.pubdate)))
     ep.size = item.find('enclosure').get('length')
     try:
         ep.size = int(ep.size)
     except (TypeError, ValueError):
         ep.size = 0
     if ep.size < 100*1024*1024:
         ep.size = 300*1024*1024
     if ep.magnet == 'magnet:?xt=urn:btih:&dn=':
         #check at least for a bt-chat info_hash
         btchat = self.magnetr.search(ep.link)
         if btchat:
             hash = btchat.group(1)
             ep.magnet = 'magnet:?xt=urn:btih:%s' % hash
             ep.link = None
     if not ep.guid:
         ep.guid = ep.description
     ep.filterwith = ep.title
     return ep
Example #14
def get_webex_email(site, filename, where_to_read="sent-webex"):
    """ Show a specific webex email saved on the disk """
    save_path = get_log_dir(site)
    join = os.path.join

    if save_path:
        save_path = join(save_path, where_to_read)
        if os.path.isdir(save_path):
            message_path = join(save_path, filename)

            try:
                message_file = open(message_path, "r+")
            except IOError:
                return None
            mail = message_from_file(message_file)
            message_file.close()

            # Prepare the date to be formatted with utShowFullDateTime
            date = email_utils.parsedate_tz(mail.get("Date", ""))
            date = email_utils.mktime_tz(date)
            date = datetime.fromtimestamp(date)

            return {
                "subject": mail.get("Subject", "(no-subject)"),
                "content": mail.get_payload(decode=True).replace("\n\n", "</p><p>").replace("\n", "<br/>"),
                "recipients": mail.get_all("To"),
                "sender": mail.get("From"),
                "date": date,
                "webex": mail.get("X-Accept-Webex-Data", ""),
            }
Example #15
def dbopen(environ, db_basename):
	stderr = environ['wsgi.errors']

	if not db_basename in databases.databases:
		db_filename = os.path.join(environ["DATADIR"], db_basename)
		stderr.write("db_filename: %s\n" % db_filename)
		conn = sqlite3.connect(db_filename)
		conn.enable_load_extension(True)
		conn.load_extension("mod_spatialite")
		conn.enable_load_extension(False)
		conn.row_factory = sqlite3.Row
		databases.databases[db_basename] = (conn.cursor(), int(os.path.getmtime(db_filename)))
	(cursor, last_modified) = databases.databases[db_basename]

	time_now = time.time()
	response_headers = [
		('Date', formatdate(time_now, usegmt=True)),
		('Last-Modified', formatdate(last_modified, usegmt=True)),
		('Cache-Control', 'public,max-age=86400'),
		]

	if_modified_since = environ.get("HTTP_IF_MODIFIED_SINCE")
	if if_modified_since is not None:
		stderr.write("If-Modified-Since: %s\n" % if_modified_since)
		stderr.write("Last-Modified: %s\n" % formatdate(last_modified, usegmt=True))
		if_modified_since = mktime_tz(parsedate_tz(if_modified_since))
		if last_modified <= if_modified_since:
			stderr.write("304 Not Modified\n")
			return (None, response_headers)

	return (cursor, response_headers)
Example #16
def get_bulk_email(site, filename, where_to_read="sent-bulk", message_file_path=None):
    """ Show a specific bulk email saved on the disk """
    try:
        if not message_file_path:
            save_path = _get_message_path(site, where_to_read)
            message_file_path = os.path.join(save_path, filename)
        message_file = open(message_file_path, "r+")
    except (IOError, TypeError, AttributeError):
        return None

    mail = message_from_file(message_file)
    message_file.close()

    # Prepare the date to be formatted with utShowFullDateTime
    date = email_utils.parsedate_tz(mail.get("Date", ""))
    date = email_utils.mktime_tz(date)
    date = datetime.fromtimestamp(date)

    r = {
        "subject": mail.get("Subject", "(no-subject)"),
        "content": mail.get_payload(decode=True).replace("\n\n", "</p><p>").replace("\n", "<br/>"),
        "recipients": mail.get_all("To"),
        "cc_recipients": mail.get_all("Cc"),
        "sender": mail.get("From"),
        "date": date,
        "webex": mail.get("X-Accept-Webex-Data", ""),
    }
    return r
Example #17
def get_bulk_email(site, filename, where_to_read='sent-bulk',
                   message_file_path=None):
    """ Show a specific bulk email saved on the disk """
    try:
        if not message_file_path:
            save_path = _get_message_path(site, where_to_read)
            message_file_path = os.path.join(save_path, filename)
        message_file = open(message_file_path, 'r+')
    except (IOError, TypeError, AttributeError):
        return None

    mail = message_from_file(message_file)
    message_file.close()

    # Prepare the date to be formatted with utShowFullDateTime
    date = email_utils.parsedate_tz(mail.get('Date', ''))
    date = email_utils.mktime_tz(date)
    date = datetime.fromtimestamp(date)

    r = {
        'subject': mail.get('Subject', '(no-subject)'),
        'content': mail.get_payload(decode=True).replace(
            '\n\n', '</p><p>').replace('\n', '<br/>'),
        'recipients': mail.get_all('To'),
        'cc_recipients': mail.get_all('Cc'),
        'sender': mail.get('From'),
        'date': date,
        'webex': mail.get('X-Accept-Webex-Data', '')
    }
    return r
Example #18
def get_bulk_email(site, filename):
    """ Show a specific bulk email saved on the disk """
    save_path = get_log_dir(site)
    join = os.path.join

    if save_path:
        save_path = join(save_path, 'sent-bulk')
        if os.path.isdir(save_path):
            message_path = join(save_path, filename)

            try:
                message_file = open(message_path, 'r+')
            except IOError:
                return None
            mail = message_from_file(message_file)
            message_file.close()

            # Prepare the date to be formatted with utShowFullDateTime
            date = email_utils.parsedate_tz(mail.get('Date', ''))
            date = email_utils.mktime_tz(date)
            date = datetime.fromtimestamp(date)

            return {
                'subject': mail.get('Subject', '(no-subject)'),
                'content': mail.get_payload(decode=True).replace(
                    '\n\n', '</p><p>').replace('\n', '<br/>'),
                'recipients': mail.get_all('To'),
                'sender': mail.get('From'),
                'date': date,
            }
Example #19
  def __init__(self, mbFile, listAddrProg, startDate=None, endDate=None):
    self.mbFile = mbFile
    mb = mailbox.UnixMailbox(open(mbFile, "r"), email.message_from_file)
    checked = set()
    count = 0

    config = yaml.safe_load(open("game.yaml"))
    tz = pytz.timezone(config.get("timezone", "UTC"))

    for msg in mb:
      try:
        if not listAddrProg.search(msg["to"]):
          continue
  
        timetuple = parsedate_tz(msg["date"])
        timestamp = mktime_tz(timetuple)
        date = datetime.fromtimestamp(timestamp)
        date = date.replace(tzinfo=tz)

        if startDate > date or date > endDate:
          continue

        mail = Mail(msg, date)
  
        self.msgs.append(mail)

#        count = count +1
#        print "count...", count
#        if count == 50:
#          break
      except Exception as e:
        print "failed analyzing a message!" , e
Example #20
 def _refresh_cookie(self, c, delta):
     """
         Takes a cookie string c and a time delta in seconds, and returns
         a refreshed cookie string.
     """
     try:
         c = Cookie.SimpleCookie(str(c))
     except Cookie.CookieError:
         raise ValueError("Invalid Cookie")
     for i in c.values():
         if "expires" in i:
             d = parsedate_tz(i["expires"])
             if d:
                 d = mktime_tz(d) + delta
                 i["expires"] = formatdate(d)
             else:
                 # This can happen when the expires tag is invalid.
                 # reddit.com sends an expires tag like this: "Thu, 31 Dec
                 # 2037 23:59:59 GMT", which is valid RFC 1123, but not
                 # strictly correct according to the cookie spec. Browsers
                 # appear to parse this tolerantly - maybe we should too.
                 # For now, we just ignore this.
                 del i["expires"]
     ret = c.output(header="").strip()
     if not ret:
         raise ValueError("Invalid Cookie")
     return ret
Example #21
def refresh_set_cookie_header(c, delta):
    """
    Args:
        c: A Set-Cookie string
        delta: Time delta in seconds
    Returns:
        A refreshed Set-Cookie string
    """

    name, value, attrs = parse_set_cookie_header(c)
    if not name or not value:
        raise ValueError("Invalid Cookie")

    if "expires" in attrs:
        e = parsedate_tz(attrs["expires"])
        if e:
            f = mktime_tz(e) + delta
            attrs = attrs.with_set_all("expires", [formatdate(f)])
        else:
            # This can happen when the expires tag is invalid.
            # reddit.com sends an expires tag like this: "Thu, 31 Dec
            # 2037 23:59:59 GMT", which is valid RFC 1123, but not
            # strictly correct according to the cookie spec. Browsers
            # appear to parse this tolerantly - maybe we should too.
            # For now, we just ignore this.
            attrs = attrs.with_delitem("expires")

    ret = format_set_cookie_header(name, value, attrs)
    if not ret:
        raise ValueError("Invalid Cookie")
    return ret
Example #22
 def setUp(self):
     """
     Reads an arbitrary number of mail messages and
     stores them in a brand new messages table.
     
     DANGER: Any existing message table WILL be lost.
     """
     curs.execute("DROP TABLE IF EXISTS message")
     conn.commit()
     curs.execute(TBLDEF)
     conn.commit()
     files = glob(FILESPEC)
     self.msgids = {} # Keyed by message_id
     self.message_ids = {} # keyed by id
     self.msgdates = []
     self.rowcount = 0
     for f in files:
         ff = open(f)
         text = ff.read()
         msg = message_from_string(text)
         id = self.msgids[msg['message-id']] = maildb.store(msg)
         self.message_ids[id] = msg['message-id']
         date = msg['date']
         self.msgdates.append(datetime.datetime.fromtimestamp(mktime_tz(parsedate_tz(date))))
         self.rowcount += 1 # Assuming no duplicated Message-IDs
Example #23
def fixDir(dname):
    for fname in os.listdir(dname):
        fname = os.path.join(dname, fname)
        fp = open(fname)
        msg = Parser().parse(fp, True)
        if 'Date' in msg:
            date = parsedate_tz(msg['Date'])
            if date:
                # Ok I had some old emails with messed up Date headers as so:
                # Date: Sun, 22 Aug US/E 13:01:00 -0400
                # I knew these were briefly from '99-'00 so I manually fix that here.
                '''
                if date[0] < 1900:
                    if date[1] < 3:
                        year = 2000
                    else:
                        year = 1999
                    date = (year,) + date[1:]
                    print >> sys.stderr, "Fixing up year '%s' => '%s' for %s" % (msg['Date'], date, fname)
                '''
                try:
                    timestamp = mktime_tz(date)
                    os.utime(fname, (timestamp, timestamp))
                except ValueError:
                    print >> sys.stderr, "Invalid date '%s' for %s: %s" % (msg['Date'], fname, date)
            else:
                print >> sys.stderr, "Could not parse date '%s' for %s" % (msg['Date'], fname)
        else:
            print >> sys.stderr, 'No Date header in %s' % (fname)
def handler(doc):
    # a 'rfc822' stores 'headers' as a dict, with each entry being a list.
    # We only care about headers which rfc5322 must appear 0 or 1 times, so
    # flatten the header values here...
    headers = dict((k, v[0]) for (k, v) in doc["headers"].items())
    # for now, 'from' etc are all tuples of [identity_type, identity_id]
    callbacks = []
    ret = {}
    if "from" in headers:
        name, addr = parseaddr(headers["from"])
        ret["from"] = ["email", addr.lower()]
        ret["from_display"] = name

    if "to" in headers:
        id_list = ret["to"] = []
        disp_list = ret["to_display"] = []
        fill_identity_info(headers["to"], id_list, disp_list)

    if "cc" in headers:
        id_list = ret["cc"] = []
        disp_list = ret["cc_display"] = []
        fill_identity_info(headers["cc"], id_list, disp_list)

    if "subject" in headers:
        ret["subject"] = headers["subject"]
    if "date" in headers:
        dval = headers["date"]
        if dval:
            try:
                ret["timestamp"] = mktime_tz(parsedate_tz(dval))
            except (ValueError, TypeError) as exc:
                logger.debug("Failed to parse date %r in doc %r: %s", dval, doc["_id"], exc)
                # later extensions will get upset if no attr exists
                # XXX - is this still true?  We should fix those extensions!
                ret["timestamp"] = 0
Example #25
    def refresh(self, now=None):
        """
            This fairly complex and heuristic function refreshes a server
            response for replay.

                - It adjusts date, expires and last-modified headers.
                - It adjusts cookie expiration.
        """
        if not now:
            now = time.time()
        delta = now - self.timestamp_start
        refresh_headers = [
            "date",
            "expires",
            "last-modified",
        ]
        for i in refresh_headers:
            if i in self.headers:
                d = parsedate_tz(self.headers[i])
                if d:
                    new = mktime_tz(d) + delta
                    self.headers[i] = formatdate(new)
        c = []
        for set_cookie_header in self.headers.get_all("set-cookie"):
            try:
                refreshed = self._refresh_cookie(set_cookie_header, delta)
            except ValueError:
                refreshed = set_cookie_header
            c.append(refreshed)
        if c:
            self.headers.set_all("set-cookie", c)
Example #26
 def setUp(self):
     """
     Reads an arbitrary number of mail messages and
     stores them in a brand new messages table.
     """
 
     self.conn = msc.Connect(**loginInfo)
     self.curs = self.conn.cursor()
     self.curs.execute("DROP TABLE IF EXISTS {0}".format(TBLNM))
     self.conn.commit()
     self.curs.execute(TBLDEF)
     self.conn.commit()
     files = glob(FILESPEC)
     self.msgIds = {} # Keyed by messageId
     self.messageIds = {} # Keyed by id
     self.msgdates = []
     self.rowcount = 0
     for f in files:
         ff = open(f)
         text = ff.read()
         msg = message_from_string(text)
         iD = self.msgIds[msg['message-id']] = maildb.store(msg, self.conn, self.curs, TBLNM)
         self.messageIds[iD] = msg['message-id']
         date = msg['date']
         self.msgdates.append(datetime.datetime.fromtimestamp(mktime_tz(parsedate_tz(date))))
         self.rowcount += 1 # Assuming no duplicated Message-IDs
         ff.close()
Example #27
def parse_redis_message(payload):
    try:
        headers, body = payload.split('\r\n'*2, 1)
        headers = dict(map(lambda h: h.split(': ', 1), headers.split('\r\n')))
        return Message(mktime_tz(parsedate_tz(headers['Last-Modified'])), int(headers['Etag']), headers['Content-Type'], body)
    except Exception:
        raise Message.Invalid()
Example #28
	def moveByLabel(self):
		from email.utils import parsedate_tz,mktime_tz
		from mailbox import NoSuchMailboxError

		for folder in self.oldBox.list_folders():
			_c_moved=0
			_c_rej=0
			_c_total = len(self.oldBox.get_folder(folder))
			print("\n[I] Folder " + folder + "", end="")
			for key, msg in self.oldBox.get_folder(folder).iteritems():
				_date=msg['Date']
				if _date:
					if (mktime_tz(parsedate_tz(_date)) - self.deltaT) < 0:
						if _c_moved == 0:
							#To detect if no thing is moved, so this can be a new folder
							try:
								self.newBox.get_folder(folder)
							except NoSuchMailboxError:
								print("[I]\tCreating in new: %s" % folder)
								self.newBox.add_folder(folder)
						# Mooooooooooooo'ving!
						self.newBox.get_folder(folder).add(msg)
						self.oldBox.get_folder(folder).remove(key)
						_c_moved += 1
						print("\r[I]\tStats: Not moved (Bad Mail): %d/%d // Moved: %d/%d" % (_c_rej,_c_total,_c_moved,_c_total), end="")
				else:
					_c_rej += 1
			if _c_moved >= _c_total:
				print("\n[W]\tRemoving folder %s" % folder, end="")
		print("")
Example #29
    def _parse_sibling(self, sibling, headers, data):
        """
        Parses a single sibling out of a response.
        """

        sibling.exists = True

        # Parse the headers...
        for header, value in headers:
            header = header.lower()
            if header == 'content-type':
                sibling.content_type, sibling.charset = \
                    self._parse_content_type(value)
            elif header == 'etag':
                sibling.etag = value
            elif header == 'link':
                sibling.links = self._parse_links(value)
            elif header == 'last-modified':
                sibling.last_modified = mktime_tz(parsedate_tz(value))
            elif header.startswith('x-riak-meta-'):
                metakey = header.replace('x-riak-meta-', '')
                sibling.usermeta[metakey] = value
            elif header.startswith('x-riak-index-'):
                field = header.replace('x-riak-index-', '')
                reader = csv.reader([value], skipinitialspace=True)
                for line in reader:
                    for token in line:
                        token = decode_index_value(field, token)
                        sibling.add_index(field, token)
            elif header == 'x-riak-deleted':
                sibling.exists = False

        sibling.encoded_data = data

        return sibling
Example #30
 def extract_file(self):
     extracted_file = {}
     i = 0  # Used to count actual JSON messages (with time and hashtags information)
     # j=0  # Used to count API messages about connection and rate limits (without time and hashtags information)
     for line in self.my_file:
         temp = json.loads(line)  # read data line by line
         if 'created_at' in temp.keys():  # select the actual JSON messages
             temp_time = temp['created_at']  # extract time
             temp_tag = temp['entities']['hashtags']  # extract hashtags
             # convert the time string to a datetime object
             # temp_time = datetime.strptime(temp_time, '%a %b %d %H:%M:%S %z %Y')
             # strptime's %z support is platform-dependent, so the portable parse below is used instead
             temp_time = datetime(1970, 1, 1) + timedelta(seconds=mktime_tz(parsedate_tz(temp_time)))
             # store time, hashtags information,and later extracted hashtag words to new file
             extracted_file[i] = [temp_time, temp_tag, []]
             # extract the hashtag words
             if temp_tag:
                 for tag in temp_tag:
                     extracted_file[i][2].append(tag['text'])
             else:  # no hashtags
                 pass
             i += 1
         else:
             # these messages are Twitter API resulting from the rate-limit
             # can be stored in apifile for future uses
             # here we remove these messages from dataset
             pass
             # apifile[j] = temp
             # j += 1
     return extracted_file
Example #31
 def parse_date_time(stamp):
     ts = parsedate_tz(stamp)
     ts = mktime_tz(ts)
     return datetime.fromtimestamp(ts)
Example #32
def _parse_date(date):
    tm = parsedate_tz(date)
    if tm:
        return mktime_tz(tm)
    return 0
Example #33
 def from_value(cls, value):
     dtime = datetime.datetime.fromtimestamp(eut.mktime_tz(eut.parsedate_tz(value)))
     return cls(dtime, value)
Example #34
def mail_date(mail):
    t = parsedate_tz(mail.get('Date', ''))
    if not t:
        return datetime.datetime.utcnow()
    return datetime.datetime.utcfromtimestamp(mktime_tz(t))
Example #35
    def run(self):
        params = self.cfg_params
        now_datetime = datetime.datetime.now()

        mailbox = None
        try:
            logging.info('Logging into the mailbox...')
            mailbox = poplib.POP3(params['server'])
            mailbox.user(params['username'])
            mailbox.pass_(params['#password'])
        except poplib.error_proto as error:
            raise RuntimeError(
                'Unable to connect to the server. Please check: server, username and password'
            )
        except socket.gaierror as error:
            raise RuntimeError('Unable to resolve the server name')

        for i in reversed(range(len(mailbox.list()[1]))):
            logging.info("Reading an email from the mailbox...")
            lines = mailbox.retr(i + 1)[1]
            msg_content = b'\r\n'.join(lines).decode('utf-8')
            email = Parser().parsestr(msg_content)
            email_datetime = datetime.datetime.fromtimestamp(
                mktime_tz(parsedate_tz(email.get_all('Date')[0])))

            if now_datetime - timedelta(
                    hours=params['accept_timedelta_hours']) > email_datetime:
                logging.info("Email is older than 'accept_timedelta_hours'. "
                             "The email is ignored and extracting is done")
                break

            if not (params['accept_from'] in email.get_all('From')[0]):
                logging.info(
                    "Email is not from the 'accept_from' address (<%s>) but from <%s>. "
                    "The email is ignored" %
                    (params['accept_from'], email.get_all('From')[0]))
                continue

            logging.info("Parsing the content of the email...")

            for part in email.walk():
                if (part.get_filename() is None):
                    continue

                filename = None
                raw_filename = part.get_filename()
                if (raw_filename
                        is not None) and (decode_header(raw_filename)[0][1]
                                          is not None):
                    filename = decode_header(raw_filename)[0][0].decode(
                        decode_header(raw_filename)[0][1])
                else:
                    filename = raw_filename

                if (('accept_filename' in params)
                        and (filename != params['accept_filename'])):
                    logging.info(
                        "Email attachment is not the name that is accepted but '%s'. "
                        "The attachment is ignored" % (filename))
                    continue
                elif (('accept_re_filename' in params)
                      and (re.match(params['accept_re_filename'],
                                    part.get_filename())) is None):
                    logging.info(
                        "Email attachment is not accepted by RE: '%s'. "
                        "The attachment is ignored" % (filename))
                    continue

                logging.info("Valid email attachment found, downloading...")

                output_filename = None
                if ('accept_filename' in params):
                    output_filename = '%s/out/files/%s' % (os.getenv(
                        'KBC_DATADIR',
                        '.'), self.cfg_params['accept_filename'])
                elif ('accept_re_filename' in params):
                    output_filename = '%s/out/files/%s' % (os.getenv(
                        'KBC_DATADIR', '.'), filename)

                if output_filename is None:
                    logging.info(
                        "Unable to determine the name of the output file")
                    continue

                fp = open(output_filename, 'wb')
                fp.write(part.get_payload(decode=True))
                fp.close()

        logging.info('Logging out...')
        mailbox.quit()
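The attachment-name handling above relies on email.header.decode_header returning (bytes, charset) pairs for RFC 2047-encoded filenames; a small sketch with a hypothetical encoded name:

from email.header import decode_header

raw = '=?utf-8?b?cmVwb3J0LmNzdg==?='   # hypothetical encoded filename
text, charset = decode_header(raw)[0]
print(text.decode(charset))            # 'report.csv'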
Example #36
def rfc2822_to_epoch(datestr):
    """Given rfc2822 date/time format, return seconds since epoch"""
    return mktime_tz(parsedate_tz(datestr))
Example #37
    def has_been_modified(self, request, response, spider):
        """Return whether the response was modified since last seen.

        We check against the database here.
        If the response has been modified, we update the database.
        If there is no stored last modified date, we save one.
        """
        if hasattr(spider.scanner.scan_object, 'filescan'):
            try:
                # Removes unneeded prefix
                file_path = response.url.replace('file://', '')
                # Transform URL string into normal string
                file_path = unquote(file_path)
                # Retrieves file timestamp from mounted drive
                last_modified = datetime.datetime.fromtimestamp(
                    os.path.getmtime(file_path), tz=pytz.utc)
            except OSError as e:
                logging.error(
                    'Error occurred while getting last modified for file %s' %
                    file_path)
                logging.error('Error message %s' % e)
        else:
            # Check the Last-Modified header to see if the content has been
            # updated since the last time we checked it.
            last_modified_header = response.headers.get("Last-Modified", None)
            if last_modified_header is not None:
                last_modified_header_date = datetime.datetime.fromtimestamp(
                    mktime_tz(
                        parsedate_tz(last_modified_header.decode('utf-8'))),
                    tz=pytz.utc)
            else:
                last_modified_header_date = None

            if last_modified_header_date is None and request.method == 'GET':
                content_type_header = response.headers.get(
                    "Content-Type", None).decode('utf-8')
                if content_type_header.startswith("text/html"):
                    # TODO: Check meta tag.
                    # TODO: This is correct, but find out where it goes :-)
                    try:
                        body_html = html.fromstring(response.body)
                    except Exception:
                        logging.info('error occurred.')
                    meta_dict = {
                        list(el.values())[0]: list(el.values())[1]
                        for el in body_html.findall('head/meta')
                    }
                    if 'last-modified' in meta_dict:
                        lm = meta_dict['last-modified']
                        try:
                            last_modified_header_date = arrow.get(lm).datetime
                        except Exception:
                            logging.error(
                                "Date format error on last modified: {0}".format(
                                    lm))
            # lastmod comes from a sitemap.xml file
            sitemap_lastmod_date = request.meta.get("lastmod", None)
            if sitemap_lastmod_date is None:
                last_modified = last_modified_header_date
                logging.debug("Using header's last-modified date: %s" %
                              last_modified)
            else:
                if last_modified_header_date is None:
                    # No Last-Modified header, use the lastmod from the sitemap
                    last_modified = sitemap_lastmod_date
                    logging.debug("Using lastmod from sitemap %s" %
                                  last_modified)
                else:
                    # Take the most recent of the two
                    logging.debug(
                        "Taking most recent of (header) %sand (sitemap) %s" %
                        (last_modified_header_date, sitemap_lastmod_date))
                    last_modified = max(last_modified_header_date,
                                        sitemap_lastmod_date)
                    logging.debug("Last modified %s" % last_modified)

        if last_modified is not None:
            # Check against the database
            canonical_url = canonicalize_url(response.url)
            try:
                url_last_modified = UrlLastModified.objects.get(
                    url=canonical_url, scanner=self.get_scanner_object(spider))
                stored_last_modified = url_last_modified.last_modified
                logging.info("Comparing header %s against stored %s" %
                             (last_modified, stored_last_modified))
                if (stored_last_modified is not None
                        and last_modified == stored_last_modified):
                    return False
                else:
                    # Update last-modified date in database
                    url_last_modified.last_modified = last_modified
                    url_last_modified.save()
                    return True
            except UrlLastModified.DoesNotExist:
                logging.debug("No stored Last-Modified header found.")
                url_last_modified = UrlLastModified(
                    url=canonical_url,
                    last_modified=last_modified,
                    scanner=self.get_scanner_object(spider))
                logging.debug("Saving new last-modified value %s" %
                              url_last_modified)
                url_last_modified.save()
                return True
        else:
            # If there is no Last-Modified header, we have to assume it has
            # been modified.
            logging.debug('No Last-Modified header found at all.')
            return True
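The Last-Modified branch above decodes the raw header bytes and converts them to a tz-aware datetime; the same conversion in isolation (pytz assumed installed, header value made up):

import datetime
import pytz
from email.utils import mktime_tz, parsedate_tz

last_modified_header = b'Sat, 29 Oct 1994 19:43:31 GMT'
print(datetime.datetime.fromtimestamp(
    mktime_tz(parsedate_tz(last_modified_header.decode('utf-8'))),
    tz=pytz.utc))                      # 1994-10-29 19:43:31+00:00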
Example #38
__project__ = "Veles Machine Learning Platform"
__versioninfo__ = 0, 9, 2
__version__ = ".".join(map(str, __versioninfo__))
__license__ = "Apache 2.0"
__copyright__ = u"© 2013-2015 Samsung Electronics Co., Ltd."
__authors__ = [
    "Gennady Kuznetsov", "Vadim Markovtsev", "Alexey Kazantsev",
    "Lyubov Podoynitsina", "Denis Seresov", "Dmitry Senin", "Alexey Golovizin",
    "Egor Bulychev", "Ernesto Sanches"
]
__contact__ = "Gennady Kuznetsov <*****@*****.**>"
__plugins__ = set()

try:
    __git__ = "$Commit$"
    __date__ = mktime_tz(parsedate_tz("$Date$"))
except Exception as ex:
    warn("Cannot expand variables generated by Git, setting them to None")
    __git__ = None
    __date__ = None

__logo_ext__ = ("Copyright %s" % __copyright__,
                "Released under Apache 2.0 license.", "https://velesnet.ml",
                "https://github.com/samsung/veles/issues")

__logo__ = \
    r" _   _ _____ _     _____ _____  " "\n" \
    r"| | | |  ___| |   |  ___/  ___| " + \
    (" Version %s." % __version__) + \
    (" %s\n" % formatdate(__date__, True)) + \
    r"| | | | |__ | |   | |__ \ `--.  " + \
Example #39
def get_timestamp(string_date):
    tt = parsedate_tz(string_date)
    return mktime_tz(tt)
Example #40
def generate_api(user, config):
    #global user_infos
    global auths
    user_infos = {}

    auth_key = config["consumer_key"] + config["consumer_secret"] +\
               config["access_token"] + config["access_secret"]

    if auth_key in auths:
        api = auths[auth_key]["api"]
    else:
        auth = tweepy.OAuthHandler(config["consumer_key"], config["consumer_secret"])
        auth.set_access_token(config["access_token"], config["access_secret"])

        api = tweepy.API(auth)

        auths[auth_key] = {
            "auth": auth,
            "api": api
        }

    if user not in user_infos:
        user_infos[user] = api.get_user(id=user)

    user_info = user_infos[user]

    username = user_info.screen_name.lower()

    title = "@" + user_info.screen_name

    if not config["author_username"] and "name" in user_info.__dict__ and len(user_info.name) > 0:
        title = user_info.name

    if "description" in user_info.__dict__ and len(user_info.description) > 0:
        description = user_info.description
    else:
        description = "%s's twitter" % title

    feed = {
        "title": title,
        "description": description,
        "author": username,
        "url": "https://twitter.com/" + username,
        "social": True,
        "entries": []
    }


    tl = []
    if config["count"] == -1:
        maxid = None

        while True:
            temp_tl = api.user_timeline(id=user, max_id=maxid, count=200, tweet_mode="extended")
            if not temp_tl:
                break

            tl = tl + temp_tl
            maxid = tl[-1].id - 1

            sys.stderr.write("\r" + str(len(tl)) + " / " + str(user_info.statuses_count))
    else:
        tl = api.user_timeline(id=user, count=config["count"], tweet_mode="extended")

    if not tl:
        return None

    for obj in tl:
        #caption = xml.sax.saxutils.unescape(re.sub(" *http[^ ]*t\.co/[^ ]*", "", obj.text))
        #caption = xml.sax.saxutils.unescape(obj.text)
        #pprint.pprint(obj.__dict__)

        is_retweeted = False
        if "retweeted_status" in obj.__dict__ and obj.retweeted_status:
            is_retweeted = True

        if is_retweeted and not config["with_retweets"]:
            continue

        origcaption = obj.full_text.replace("\r", "\n")
        newcaption = origcaption

        if "entities" in obj.__dict__:
            if "urls" in obj.entities:
                for url in obj.entities["urls"]:
                    newcaption = newcaption.replace(url["url"], url["expanded_url"])

        caption = xml.sax.saxutils.unescape(re.sub(" *https?://t\.co/[^ ]*", "", newcaption))
        #caption = xml.sax.saxutils.unescape(newcaption)

        date = rssit.util.localize_datetime(datetime.datetime.fromtimestamp(mktime_tz(parsedate_tz(obj._json["created_at"]))))

        entrydict = {
            "url": "https://twitter.com/" + obj.author.screen_name + "/status/" + obj.id_str,
            "caption": caption,
            "date": date,
            "updated_date": date,
            "author": obj.author.screen_name.lower(),
            "images": [],
            "videos": []
        }

        #pprint.pprint(obj.__dict__)

        if "extended_entities" in obj.__dict__:
            for media in obj.__dict__["extended_entities"]["media"]:
                if media["type"] == "photo":
                    url = media["media_url"]
                    url = get_orig_image(url)
                    entrydict["images"].append(url)
                    #entrydict["images"].append(media["media_url"])
                elif media["type"] == "video" or media["type"] == "animated_gif":
                    videodict = {
                        "image": media["media_url"]
                    }

                    variants = media["video_info"]["variants"]

                    max_bitrate = -1
                    curr = None
                    for variant in variants:
                        if "bitrate" in variant and variant["bitrate"] > max_bitrate:
                            curr = variant

                    if not curr:
                        curr = variants[0]

                    videodict["video"] = curr["url"]
                    entrydict["videos"].append(videodict)

        feed["entries"].append(entrydict)

    return feed
Example #41
 def parse_date(self, value):
     try:
         return datetime.utcfromtimestamp(mktime_tz(parsedate_tz(value)))
     except (TypeError, OverflowError):
         raise RuntimeError("Received an ill-formed timestamp")
Example #42
def rfc1123_to_epoch(date_str):
    try:
        date_str = to_unicode(date_str, encoding='ascii')
        return mktime_tz(parsedate_tz(date_str))
    except Exception:
        return None
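The try/except matters because parsedate_tz returns None for unparsable input, and mktime_tz(None) raises TypeError; a quick standalone check:

from email.utils import mktime_tz, parsedate_tz

print(parsedate_tz('not a date'))   # None
try:
    mktime_tz(parsedate_tz('not a date'))
except TypeError:
    print('rejected')               # this is the path rfc1123_to_epoch guards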
Example #43
"""
    Copyright 2011 by Brian C. Lane
"""
import sys
import email

raw_msg = sys.stdin.read()
msg = email.message_from_string(raw_msg)
date = msg.get('Date', None)
if date:
    from email.utils import mktime_tz, parsedate_tz, formatdate

    try:
        # Convert to local TZ
        tz_tuple = parsedate_tz(date)
        epoch_time = mktime_tz(tz_tuple)
        msg.add_header('X-Date', formatdate( epoch_time, localtime=True ))

        from cStringIO import StringIO
        from email.generator import Generator
        fp = StringIO()
        g = Generator(fp, mangle_from_=False, maxheaderlen=200)
        g.flatten(msg)
        sys.stdout.write(fp.getvalue())
    except Exception:
        import traceback
        print(traceback.format_exc())
        sys.stdout.write(raw_msg)
else:
    # just write it out
    sys.stdout.write(raw_msg)
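The X-Date round-trip used above, in isolation (date string borrowed from Example #4's doctests):

from email.utils import formatdate, mktime_tz, parsedate_tz

epoch_time = mktime_tz(parsedate_tz('Fri, 21 Nov 1997 09:55:06 -0600'))
print(formatdate(epoch_time, localtime=True))   # same instant, rendered in the local zone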
Example #44
    def fetch(self, url):
        """Attempts to fetch the URL requested which should refer to a 
        robots.txt file, e.g. http://example.com/robots.txt.
        """

        # ISO-8859-1 is the default encoding for text files per the specs for
        # HTTP 1.0 (RFC 1945 sec 3.6.1) and HTTP 1.1 (RFC 2616 sec 3.7.1).
        # ref: http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
        encoding = "iso-8859-1"
        content = ""
        expires_header = None
        content_type_header = None
        self._response_code = None
        self._source_url = url

        if self.user_agent:
            req = urllib_request.Request(url, None,
                                         {'User-Agent': self.user_agent})
        else:
            req = urllib_request.Request(url)

        try:
            f = urllib_request.urlopen(req)
            content = f.read(MAX_FILESIZE)
            # As of Python 2.5, f.info() looks like it returns the HTTPMessage
            # object created during the connection.
            expires_header = f.info().get("expires")
            content_type_header = f.info().get("Content-Type")
            # As of Python 2.4, this file-like object reports the response
            # code, too.
            if hasattr(f, "code"):
                self._response_code = f.code
            else:
                self._response_code = 200
            f.close()
        except urllib_error.URLError:
            # This is a slightly convoluted way to get the error instance,
            # but it works under Python 2 & 3.
            error_instance = sys.exc_info()
            if len(error_instance) > 1:
                error_instance = error_instance[1]
            if hasattr(error_instance, "code"):
                self._response_code = error_instance.code

        # MK1996 section 3.4 says, "...robots should take note of Expires
        # header set by the origin server. If no cache-control directives
        # are present robots should default to an expiry of 7 days".

        # This code is lazy and looks at the Expires header but not
        # Cache-Control directives.
        self.expiration_date = None
        if self._response_code >= 200 and self._response_code < 300:
            # All's well.
            if expires_header:
                self.expiration_date = email_utils.parsedate_tz(expires_header)

                if self.expiration_date:
                    # About time zones -- the call to parsedate_tz() returns a
                    # 10-tuple with the time zone offset in the 10th element.
                    # There are 3 valid formats for HTTP dates, and one of
                    # them doesn't contain time zone information. (UTC is
                    # implied since all HTTP header dates are UTC.) When given
                    # a date that lacks time zone information, parsedate_tz()
                    # returns None in the 10th element. mktime_tz() interprets
                    # None in the 10th (time zone) element to mean that the
                    # date is *local* time, not UTC.
                    # Therefore, if the HTTP timestamp lacks time zone info
                    # and I run that timestamp through parsedate_tz() and pass
                    # it directly to mktime_tz(), I'll get back a local
                    # timestamp which isn't what I want. To fix this, I simply
                    # convert a time zone of None to zero. It's much more
                    # difficult to explain than to fix. =)
                    # ref: http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3.1
                    if self.expiration_date[9] is None:
                        self.expiration_date = self.expiration_date[:9] + (0, )

                    self.expiration_date = email_utils.mktime_tz(
                        self.expiration_date)
                    if self.use_local_time:
                        # I have to do a little more converting to get this
                        # UTC timestamp into localtime.
                        self.expiration_date = time.mktime(
                            time.gmtime(self.expiration_date))
                #else:
                # The expires header was garbage.

        if not self.expiration_date:
            self.expiration_date = self._now() + SEVEN_DAYS

        if (self._response_code >= 200) and (self._response_code < 300):
            # All's well.
            media_type, encoding = _parse_content_type_header(
                content_type_header)
            # RFC 2616 sec 3.7.1 --
            # When no explicit charset parameter is provided by the sender,
            # media subtypes  of the "text" type are defined to have a default
            # charset value of "ISO-8859-1" when received via HTTP.
            # http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
            if not encoding:
                encoding = "iso-8859-1"
        elif self._response_code in (401, 403):
            # 401 or 403 ==> Go away or I will taunt you a second time!
            # (according to MK1996)
            content = "User-agent: *\nDisallow: /\n"
        elif self._response_code == 404:
            # No robots.txt ==> everyone's welcome
            content = ""
        else:
            # Uh-oh. I punt this up to the caller.
            raise urllib_error.URLError(str(self._response_code))

        if ((PY_MAJOR_VERSION == 2) and isinstance(content, str)) or \
           ((PY_MAJOR_VERSION > 2)  and (not isinstance(content, str))):
            # This ain't Unicode yet! It needs to be.

            # Unicode decoding errors are another point of failure that I punt
            # up to the caller.
            try:
                content = content.decode(encoding)
            except UnicodeError:
                _raise_error(
                    UnicodeError,
                    "Robots.txt contents are not in the encoding expected (%s)."
                    % encoding)
            except (LookupError, ValueError):
                # LookupError ==> Python doesn't have a decoder for that encoding.
                # One can also get a ValueError here if the encoding starts with
                # a dot (ASCII 0x2e). See Python bug 1446043 for details. This
                # bug was supposedly fixed in Python 2.5.
                _raise_error(
                    UnicodeError,
                    "I don't understand the encoding \"%s\"." % encoding)

        # Now that I've fetched the content and turned it into Unicode, I
        # can parse it.
        self.parse(content)
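
# A minimal standalone sketch of the pitfall the comments above describe: an
# HTTP date with no zone information makes parsedate_tz() return None in the
# 10th element, which mktime_tz() would interpret as *local* time unless it is
# forced to 0 (UTC). The timestamp below is just an illustrative RFC 1123 date.
from email.utils import mktime_tz, parsedate_tz

parts = parsedate_tz("Sun, 06 Nov 1994 08:49:37")   # no zone -> parts[9] is None
if parts[9] is None:
    parts = parts[:9] + (0,)          # treat the naive date as UTC
print(mktime_tz(parts))               # 784111777 (seconds since the epoch)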
Example #45
    def parse(self, m, prefix=None):
        """Parse messages sent by the 'buildbot-cvs-mail' program.
        """
        # The mail is sent from the person doing the checkin. Assume that the
        # local username is enough to identify them (this assumes a one-server
        # cvs-over-rsh environment rather than the server-dirs-shared-over-NFS
        # model)
        _, addr = parseaddr(m["from"])
        if not addr:
            # no From means this message isn't from buildbot-cvs-mail
            return None
        at = addr.find("@")
        if at == -1:
            author = addr  # might still be useful
        else:
            author = addr[:at]
        author = util.bytes2unicode(author, encoding="ascii")

        # CVS accepts RFC822 dates. buildbot-cvs-mail adds the date as
        # part of the mail header, so use that.
        # This assumes cvs is being accessed via ssh or pserver, so the time
        # will be the CVS server's time.

        # calculate a "revision" based on that timestamp, or the current time
        # if we're unable to parse the date.
        log.msg('Processing CVS mail')
        dateTuple = parsedate_tz(m["date"])
        if dateTuple is None:
            when = util.now()
        else:
            when = mktime_tz(dateTuple)

        theTime = datetime.datetime.utcfromtimestamp(float(when))
        rev = theTime.strftime('%Y-%m-%d %H:%M:%S')

        catRE = re.compile(r'^Category:\s*(\S.*)')
        cvsRE = re.compile(r'^CVSROOT:\s*(\S.*)')
        cvsmodeRE = re.compile(r'^Cvsmode:\s*(\S.*)')
        filesRE = re.compile(r'^Files:\s*(\S.*)')
        modRE = re.compile(r'^Module:\s*(\S.*)')
        pathRE = re.compile(r'^Path:\s*(\S.*)')
        projRE = re.compile(r'^Project:\s*(\S.*)')
        singleFileRE = re.compile(r'(.*) (NONE|\d(\.|\d)+) (NONE|\d(\.|\d)+)')
        tagRE = re.compile(r'^\s+Tag:\s*(\S.*)')
        updateRE = re.compile(r'^Update of:\s*(\S.*)')
        comments = ""
        branch = None
        cvsroot = None
        fileList = None
        files = []
        isdir = 0
        path = None
        project = None

        lines = list(body_line_iterator(m))
        while lines:
            line = lines.pop(0)
            m = catRE.match(line)
            if m:
                category = m.group(1)
                continue
            m = cvsRE.match(line)
            if m:
                cvsroot = m.group(1)
                continue
            m = cvsmodeRE.match(line)
            if m:
                cvsmode = m.group(1)
                continue
            m = filesRE.match(line)
            if m:
                fileList = m.group(1)
                continue
            m = modRE.match(line)
            if m:
                # We don't actually use this
                # module = m.group(1)
                continue
            m = pathRE.match(line)
            if m:
                path = m.group(1)
                continue
            m = projRE.match(line)
            if m:
                project = m.group(1)
                continue
            m = tagRE.match(line)
            if m:
                branch = m.group(1)
                continue
            m = updateRE.match(line)
            if m:
                # We don't actually use this
                # updateof = m.group(1)
                continue
            if line == "Log Message:\n":
                break

        # CVS 1.11 lists files as:
        #   repo/path file,old-version,new-version file2,old-version,new-version
        # Version 1.12 lists files as:
        #   file1 old-version new-version file2 old-version new-version
        #
        # files consists of tuples of 'file-name old-version new-version'
        # The versions are either dotted-decimal version numbers, ie 1.1
        # or NONE. New files are of the form 'NONE NUMBER', while removed
        # files are 'NUMBER NONE'. 'NONE' is a literal string
        # Parsing this instead of files list in 'Added File:' etc
        # makes it possible to handle files with embedded spaces, though
        # it could fail if the filename was 'bad 1.1 1.2'
        # For cvs version 1.11, we expect
        #  my_module new_file.c,NONE,1.1
        #  my_module removed.txt,1.2,NONE
        #  my_module modified_file.c,1.1,1.2
        # While cvs version 1.12 gives us
        #  new_file.c NONE 1.1
        #  removed.txt 1.2 NONE
        #  modified_file.c 1.1 1.2

        if fileList is None:
            log.msg('CVSMaildirSource Mail with no files. Ignoring')
            return None  # We don't have any files. Email not from CVS

        if cvsmode == '1.11':
            # Please, no repo paths with spaces!
            m = re.search('([^ ]*) ', fileList)
            if m:
                path = m.group(1)
            else:
                log.msg(
                    'CVSMaildirSource can\'t get path from file list. Ignoring mail'
                )
                return None
            fileList = fileList[len(path):].strip()
            singleFileRE = re.compile(
                r'(.+?),(NONE|(?:\d+\.(?:\d+\.\d+\.)*\d+)),(NONE|(?:\d+\.(?:\d+\.\d+\.)*\d+))(?: |$)')  # noqa pylint: disable=line-too-long
        elif cvsmode == '1.12':
            singleFileRE = re.compile(
                r'(.+?) (NONE|(?:\d+\.(?:\d+\.\d+\.)*\d+)) (NONE|(?:\d+\.(?:\d+\.\d+\.)*\d+))(?: |$)')  # noqa pylint: disable=line-too-long
            if path is None:
                raise ValueError(
                    'CVSMaildirSource cvs 1.12 requires a path. Check cvs loginfo config'
                )
        else:
            raise ValueError(f'Expected cvsmode 1.11 or 1.12. got: {cvsmode}')

        log.msg(f"CVSMaildirSource processing filelist: {fileList}")
        while fileList:
            m = singleFileRE.match(fileList)
            if m:
                curFile = path + '/' + m.group(1)
                files.append(curFile)
                fileList = fileList[m.end():]
            else:
                log.msg('CVSMaildirSource no files matched regex. Ignoring')
                return None  # bail - we couldn't parse the files that changed
        # Now get comments
        while lines:
            line = lines.pop(0)
            comments += line

        comments = comments.rstrip() + "\n"
        if comments == '\n':
            comments = None
        return ('cvs',
                dict(author=author,
                     committer=None,
                     files=files,
                     comments=comments,
                     isdir=isdir,
                     when=when,
                     branch=branch,
                     revision=rev,
                     category=category,
                     repository=cvsroot,
                     project=project,
                     properties=self.properties))
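
# A quick standalone check of the two file-list regexes above against
# illustrative (invented) file lists, showing how each CVS format is consumed
# and how the 1.12 form copes with an embedded space:
import re

cvs111 = re.compile(
    r'(.+?),(NONE|(?:\d+\.(?:\d+\.\d+\.)*\d+)),(NONE|(?:\d+\.(?:\d+\.\d+\.)*\d+))(?: |$)')
cvs112 = re.compile(
    r'(.+?) (NONE|(?:\d+\.(?:\d+\.\d+\.)*\d+)) (NONE|(?:\d+\.(?:\d+\.\d+\.)*\d+))(?: |$)')

print(cvs111.match('new_file.c,NONE,1.1 removed.txt,1.2,NONE').groups())
# ('new_file.c', 'NONE', '1.1')
print(cvs112.match('modified file.c 1.1 1.2').groups())
# ('modified file.c', '1.1', '1.2')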
Example #46
def rfc1123_to_epoch(date_str):
    """Convert an RFC 1123 date string to a Unix timestamp, or None if unparsable."""
    try:
        return mktime_tz(parsedate_tz(date_str))
    except Exception:
        # parsedate_tz() returns None on malformed input; mktime_tz(None) raises
        return None
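
# Usage sketch (the classic RFC 2616 example date):
#   >>> rfc1123_to_epoch('Sun, 06 Nov 1994 08:49:37 GMT')
#   784111777
#   >>> rfc1123_to_epoch('not a date') is None
#   True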
Example #47
    def parse_date_str(self, date_string):
        return datetime.datetime.fromtimestamp(
            mktime_tz(parsedate_tz(date_string)))
Example #48
    def handle(self, *args, **options):

        if not args:
            raise CommandError('Need xml file')

        if len(args) > 1:
            if args[1] == 'delete':
                print('Flushing all posts ... ', end='')
                models.Post.objects.all().delete()
                print('Done!')

        xmldoc = minidom.parse(args[0])

        authors = xmldoc.getElementsByTagName('wp:author')
        for author in authors:
            username = author.getElementsByTagName(
                'wp:author_login')[0].childNodes[0].nodeValue
            email = author.getElementsByTagName(
                'wp:author_email')[0].childNodes[0].nodeValue

            first_name_nodes = author.getElementsByTagName(
                'wp:author_first_name')[0].childNodes
            first_name = ''
            if first_name_nodes:
                first_name = first_name_nodes[0].nodeValue

            last_name_nodes = author.getElementsByTagName(
                'wp:author_last_name')[0].childNodes
            last_name = ''
            if last_name_nodes:
                last_name = last_name_nodes[0].nodeValue

            User.objects.get_or_create(username=username,
                                       email=email,
                                       first_name=first_name,
                                       last_name=last_name)

        posts = xmldoc.getElementsByTagName('item')

        for field in models.Post._meta.local_fields:
            if field.name == "created_at":
                field.auto_now_add = False
            elif field.name == "updated_at":
                field.auto_now_add = False
                field.auto_now = False

        for post in posts:
            title = post.getElementsByTagName(
                'title')[0].childNodes[0].nodeValue

            slug_nodes = post.getElementsByTagName(
                'wp:post_name')[0].childNodes
            slug = None
            if len(slug_nodes):
                slug = slug_nodes[0].nodeValue

            pub_date = post.getElementsByTagName(
                'pubDate')[0].childNodes[0].nodeValue
            pub_date = datetime.fromtimestamp(mktime_tz(
                parsedate_tz(pub_date)))
            pub_date = timezone.make_aware(pub_date,
                                           timezone.get_current_timezone())

            creator = post.getElementsByTagName(
                'dc:creator')[0].childNodes[0].nodeValue
            creator = User.objects.get(username=creator)

            status = post.getElementsByTagName(
                'wp:status')[0].childNodes[0].nodeValue
            published = status == 'publish'

            content = post.getElementsByTagName(
                'content:encoded')[0].childNodes[0].nodeValue

            print('Adding "{}" ... '.format(title), end='')

            post = models.Post.objects.create(title=title,
                                              slug=slug,
                                              published=published,
                                              body=content,
                                              created_at=pub_date,
                                              updated_at=pub_date)
            if post.published:
                post.published_at = pub_date
            post.edited_by.add(creator)

            print('Done!')
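
# Note on the pubDate handling above: datetime.fromtimestamp() returns naive
# *local* wall-clock time, so make_aware(..., get_current_timezone()) only
# round-trips cleanly when Django's current time zone matches the system zone.
# A minimal sketch that stays in UTC instead (pub_date_str is a hypothetical
# RFC 2822 string like the feed's pubDate):
from datetime import datetime, timezone as dt_timezone
from email.utils import mktime_tz, parsedate_tz

pub_date_str = 'Fri, 07 Dec 2012 16:12:48 +0000'
pub_date = datetime.fromtimestamp(mktime_tz(parsedate_tz(pub_date_str)),
                                  dt_timezone.utc)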
Example #49
def fromRfc2822(date):
    py_time = mktime_tz(parsedate_tz(str(date)))
    # note: %P (lowercase am/pm) is a glibc extension; portable code uses %p
    return time.strftime("%d %b  %Y,  %I:%M %P", time.localtime(py_time))
Example #50
def conv_time(some_time):
    timestamp = mktime_tz(parsedate_tz(some_time))
    # equivalent to datetime.fromtimestamp(timestamp, pytz.UTC), built up
    # explicitly from the Unix epoch
    return (datetime(1970, 1, 1) +
            timedelta(seconds=timestamp)).replace(tzinfo=pytz.UTC)
Example #51
def main():
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(
        description='AWS S3 website deployment tool')
    parser.add_argument('-f',
                        '--force',
                        action='store_true',
                        dest='force',
                        help='force upload of all files')
    parser.add_argument('-n',
                        '--dry-run',
                        action='store_true',
                        dest='dry',
                        help='run without uploading any files')
    parser.add_argument(
        'path',
        help='the .s3_website.yaml configuration file or directory',
        default='.',
        nargs='?')
    args = parser.parse_args()

    # Open configuration file
    conf, base_path = config.load_config_file(args.path)

    bucket_name = conf['s3_bucket']
    cache_rules = conf.get('cache_rules', [])
    reduced_redundancy = conf.get('s3_reduced_redundancy', False)

    logger.info('Connecting to bucket {}...'.format(bucket_name))

    conn = S3Connection(calling_format=OrdinaryCallingFormat())
    bucket = conn.get_bucket(bucket_name, validate=False)

    site_dir = os.path.join(base_path, conf['site'])

    logger.info('Site: {}'.format(site_dir))

    processed_keys = set()
    updated_keys = set()

    for key in bucket:
        processed_keys.add(key.key)
        path = os.path.join(site_dir, key.key)

        # Delete keys that have been deleted locally
        if not os.path.isfile(path):
            logger.info('Deleting {}...'.format(key.key))
            if not args.dry:
                key.delete()
            updated_keys.add(key.key)
            continue

        # Skip keys that have not been updated
        mtime = int(os.path.getmtime(path))
        if not args.force:
            # Update key metadata if not available.
            # The bucket list() call that is executed through the bucket
            # iteration above actually does obtain the last modified date
            # from the server, but boto currently does not update the key
            # variables based on that. We need to do an additional get_key()
            # request to get the field populated.
            key = bucket.get_key(key.key)
            key_mtime = mktime_tz(parsedate_tz(key.last_modified))
            if mtime <= key_mtime:
                logger.info('Not modified, skipping {}.'.format(key.key))
                continue

        upload_key(key,
                   path,
                   cache_rules,
                   args.dry,
                   replace=True,
                   reduced_redundancy=reduced_redundancy)
        updated_keys.add(key.key)

    for dirpath, dirnames, filenames in os.walk(site_dir):
        key_base = os.path.relpath(dirpath, site_dir)
        for name in filenames:
            path = os.path.join(dirpath, name)
            key_name = key_name_from_path(os.path.join(key_base, name))
            if key_name in processed_keys:
                continue

            # Create new key
            key = Key(bucket)
            key.key = key_name

            logger.info('Creating key {}...'.format(key_name))

            upload_key(key,
                       path,
                       cache_rules,
                       args.dry,
                       replace=False,
                       reduced_redundancy=reduced_redundancy)
            updated_keys.add(key_name)

    logger.info('Bucket update done.')

    # Invalidate files in cloudfront distribution
    if 'cloudfront_distribution_id' in conf:
        logger.info('Connecting to Cloudfront distribution {}...'.format(
            conf['cloudfront_distribution_id']))

        index_pattern = None
        if 'index_document' in conf:
            index_doc = conf['index_document']
            index_pattern = r'(^(?:.*/)?)' + re.escape(index_doc) + '$'

        def path_from_key_name(key_name):
            if index_pattern is not None:
                m = re.match(index_pattern, key_name)
                if m:
                    return m.group(1)
            return key_name

        t = PrefixCoverTree()
        for key_name in updated_keys:
            t.include(path_from_key_name(key_name))
        for key_name in processed_keys - updated_keys:
            t.exclude(path_from_key_name(key_name))

        paths = []
        for prefix, exact in t.matches():
            path = '/' + prefix + ('' if exact else '*')
            logger.info('Preparing to invalidate {}...'.format(path))
            paths.append(path)

        conn = boto.connect_cloudfront()

        if len(paths) > 0:
            dist_id = conf['cloudfront_distribution_id']
            if not args.dry:
                logger.info('Creating invalidation request...')
                conn.create_invalidation_request(dist_id, paths)
        else:
            logger.info('Nothing updated, skipping invalidation...')

        logger.info('Cloudfront invalidation done.')
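
# The freshness check above in isolation: S3 reports Last-Modified as an
# RFC 1123 string while os.path.getmtime() yields epoch seconds, so the header
# must pass through parsedate_tz()/mktime_tz() before the two are comparable.
# (The header value below is illustrative.)
import os
from email.utils import mktime_tz, parsedate_tz

local_mtime = int(os.path.getmtime(__file__))
key_mtime = mktime_tz(parsedate_tz('Wed, 28 Oct 2015 20:16:35 GMT'))
needs_upload = local_mtime > key_mtime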
Example #52
    def make_call(self, path, body=None, delete=False):
        """
        Make a single UMAPI call with error handling and retry on temporary failure.
        :param path: the string endpoint path for the call
        :param body: (optional) list of dictionaries to be serialized into the request body
        :return: the requests.result object (on 200 response), raise error otherwise
        """
        if body:
            request_body = json.dumps(body)

            def call():
                return self.session.post(self.endpoint + path,
                                         auth=self.auth,
                                         data=request_body,
                                         timeout=self.timeout,
                                         verify=self.ssl_verify)
        else:
            if not delete:

                def call():
                    return self.session.get(self.endpoint + path,
                                            auth=self.auth,
                                            timeout=self.timeout,
                                            verify=self.ssl_verify)
            else:

                def call():
                    return self.session.delete(self.endpoint + path,
                                               auth=self.auth,
                                               timeout=self.timeout,
                                               verify=self.ssl_verify)

        start_time = time()
        result = None
        for num_attempts in range(1, self.retry_max_attempts + 1):
            try:
                result = call()
                if result.status_code in [200, 201, 204]:
                    return result
                elif result.status_code in [429, 502, 503, 504]:
                    if self.logger:
                        self.logger.warning(
                            "UMAPI timeout...service unavailable (code %d on try %d)",
                            result.status_code, num_attempts)
                    retry_wait = 0
                    if "Retry-After" in result.headers:
                        advice = result.headers["Retry-After"]
                        advised_time = parsedate_tz(advice)
                        if advised_time is not None:
                            # header contains date
                            retry_wait = int(mktime_tz(advised_time) - time())
                        else:
                            # header contains delta seconds
                            retry_wait = int(advice)
                    if retry_wait <= 0:
                        # use exponential back-off with random delay
                        delay = randint(0, self.retry_random_delay)
                        retry_wait = (int(pow(2, num_attempts - 1)) *
                                      self.retry_first_delay) + delay
                elif 201 <= result.status_code < 400:
                    raise ClientError(
                        "Unexpected HTTP Status {:d}: {}".format(
                            result.status_code, result.text), result)
                elif 400 <= result.status_code < 500:
                    raise RequestError(result)
                else:
                    raise ServerError(result)
            except requests.Timeout:
                if self.logger:
                    self.logger.warning(
                        "UMAPI connection timeout...(%d seconds on try %d)",
                        self.timeout, num_attempts)
                retry_wait = 0
                result = None
            if num_attempts < self.retry_max_attempts:
                if retry_wait > 0:
                    if self.logger:
                        self.logger.warning("Next retry in %d seconds...",
                                            retry_wait)
                    sleep(retry_wait)
                else:
                    if self.logger:
                        self.logger.warning("Immediate retry...")
        total_time = int(time() - start_time)
        if self.logger:
            self.logger.error(
                "UMAPI timeout...giving up after %d attempts (%d seconds).",
                self.retry_max_attempts, total_time)
        raise UnavailableError(self.retry_max_attempts, total_time, result)
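
# The Retry-After handling above distilled into a standalone sketch: the header
# may carry either an HTTP-date or delta-seconds, and anything unusable falls
# back to exponential back-off with a random component. The parameter names
# here are illustrative, not part of the API above.
from email.utils import mktime_tz, parsedate_tz
from random import randint
from time import time

def retry_delay(retry_after, attempt, first_delay=3, max_random_delay=5):
    parsed = parsedate_tz(retry_after) if retry_after else None
    if parsed is not None:
        wait = int(mktime_tz(parsed) - time())   # header carried an HTTP-date
    elif retry_after:
        wait = int(retry_after)                  # header carried delta-seconds
    else:
        wait = 0                                 # no header at all
    if wait <= 0:
        wait = (2 ** (attempt - 1)) * first_delay + randint(0, max_random_delay)
    return wait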
Example #53
message_id = parsed.headers.get('Message-ID')

# str.strip() removes a character *set*, not a prefix; drop a leading "Re: " explicitly
subject = parsed.headers.get('Subject')
if subject and subject.startswith('Re: '):
    subject = subject[4:]

sender = parsed.headers.get('Sender')
delivered_to = parsed.headers.get('Delivered-To')

_to = parsed.headers.get('To')
to_addr = parse_email_address_list(_to)[0][1]

_from = parsed.headers.get('From')
from_addr = parse_email_address_list(_from)[0][1]

date = parsed.headers.get('Date')
parsed_date = parsedate_tz(date)
timestamp = mktime_tz(parsed_date)
received_date = datetime.fromtimestamp(timestamp)

# We have to hard-code these values unfortunately
msg_id = 2
thread_id = 2
mailing_list_headers = {
    "List-Id": "<golang-nuts.googlegroups.com>",
    "List-Post": "<http://groups.google.com/group/golang-nuts/post>, <mailto:[email protected]>",
    "List-Owner": None,
    "List-Subscribe": "<http://groups.google.com/group/golang-nuts/subscribe>, <mailto:[email protected]>",
    "List-Unsubscribe": "<http://groups.google.com/group/golang-nuts/subscribe>, <mailto:[email protected]>",
    "List-Archive": "<http://groups.google.com/group/golang-nuts>",
    "List-Help": "<http://groups.google.com/support/>, <mailto:[email protected]>"
    }
Example #54
    def genESDoc(self, msg, timeStampFromResponse=False):
        httpService = msg.getHttpService()
        doc = DocHTTPRequestResponse(protocol=httpService.getProtocol(),
                                     host=httpService.getHost(),
                                     port=httpService.getPort())
        doc.meta.index = self.confESIndex

        request = msg.getRequest()
        response = msg.getResponse()

        if request:
            iRequest = self.helpers.analyzeRequest(msg)
            doc.request.method = iRequest.getMethod()
            doc.request.url = iRequest.getUrl().toString()

            headers = iRequest.getHeaders()
            for header in headers:
                try:
                    doc.add_request_header(header)
                except:
                    doc.request.requestline = header

            parameters = iRequest.getParameters()
            for parameter in parameters:
                ptype = parameter.getType()
                if ptype == IParameter.PARAM_URL:
                    typename = "url"
                elif ptype == IParameter.PARAM_BODY:
                    typename = "body"
                elif ptype == IParameter.PARAM_COOKIE:
                    typename = "cookie"
                elif ptype == IParameter.PARAM_XML:
                    typename = "xml"
                elif ptype == IParameter.PARAM_XML_ATTR:
                    typename = "xmlattr"
                elif ptype == IParameter.PARAM_MULTIPART_ATTR:
                    typename = "multipartattr"
                elif ptype == IParameter.PARAM_JSON:
                    typename = "json"
                else:
                    typename = "unknown"

                name = parameter.getName()
                value = parameter.getValue()
                doc.add_request_parameter(typename, name, value)

            ctype = iRequest.getContentType()
            if ctype == IRequestInfo.CONTENT_TYPE_NONE:
                doc.request.content_type = "none"
            elif ctype == IRequestInfo.CONTENT_TYPE_URL_ENCODED:
                doc.request.content_type = "urlencoded"
            elif ctype == IRequestInfo.CONTENT_TYPE_MULTIPART:
                doc.request.content_type = "multipart"
            elif ctype == IRequestInfo.CONTENT_TYPE_XML:
                doc.request.content_type = "xml"
            elif ctype == IRequestInfo.CONTENT_TYPE_JSON:
                doc.request.content_type = "json"
            elif ctype == IRequestInfo.CONTENT_TYPE_AMF:
                doc.request.content_type = "amf"
            else:
                doc.request.content_type = "unknown"

            bodyOffset = iRequest.getBodyOffset()
            doc.request.body = request[bodyOffset:].tostring().decode(
                "ascii", "replace")

        if response:
            iResponse = self.helpers.analyzeResponse(response)

            doc.response.status = iResponse.getStatusCode()
            doc.response.content_type = iResponse.getStatedMimeType()
            doc.response.inferred_content_type = iResponse.getInferredMimeType()

            headers = iResponse.getHeaders()
            dateHeader = None
            for header in headers:
                try:
                    doc.add_response_header(header)
                    match = reDateHeader.match(header)
                    if match:
                        dateHeader = match.group(1)
                except:
                    doc.response.responseline = header

            cookies = iResponse.getCookies()
            for cookie in cookies:
                expCookie = cookie.getExpiration()
                expiration = None
                if expCookie:
                    try:
                        expiration = str(
                            datetime.fromtimestamp(expCookie.time / 1000))
                    except:
                        pass
                doc.add_response_cookie(cookie.getName(), cookie.getValue(),
                                        cookie.getDomain(), cookie.getPath(),
                                        expiration)

            bodyOffset = iResponse.getBodyOffset()
            doc.response.body = response[bodyOffset:].tostring().decode(
                "ascii", "replace")

            if timeStampFromResponse:
                if dateHeader:
                    try:
                        doc.timestamp = datetime.fromtimestamp(
                            mktime_tz(parsedate_tz(dateHeader)),
                            tz)  # try to use date from response header "Date"
                        self.lastTimestamp = doc.timestamp
                    except:
                        # fallback: last stored timestamp, or "now" if none was set yet
                        doc.timestamp = self.lastTimestamp

        return doc
Example #55
def parse_datetime(value):
    time_tuple = parsedate_tz(value)
    timestamp = mktime_tz(time_tuple)
    # fromtimestamp() without a tz argument returns naive *local* time
    return datetime.datetime.fromtimestamp(timestamp)
Example #56
def _onsuccess(boto_key):
    checksum = boto_key.etag.strip('"')
    last_modified = boto_key.last_modified
    modified_tuple = parsedate_tz(last_modified)
    modified_stamp = int(mktime_tz(modified_tuple))
    return {'checksum': checksum, 'last_modified': modified_stamp}
Example #57
    def parse_email(self, message_string, existing_email=None):
        """ Creates or replace a email from a string """
        parsed_email = email.message_from_string(message_string)
        body = None
        error_description = None

        def get_payload(message):
            """ Returns the first text/html body, and falls back to text/plain body """
            def process_part(part, default_charset, text_part, html_part):
                """ Returns the first text/plain body as a unicode object, and the first text/html body """
                if part.is_multipart():
                    for part in part.get_payload():
                        charset = part.get_content_charset(default_charset)
                        (text_part,
                         html_part) = process_part(part, charset, text_part,
                                                   html_part)
                else:
                    charset = part.get_content_charset(default_charset)
                    decoded_part = part.get_payload(decode=True)
                    decoded_part = decoded_part.decode(charset, 'replace')
                    content_type = part.get_content_type()
                    if content_type == 'text/plain' and text_part is None:
                        text_part = decoded_part
                    elif content_type == 'text/html' and html_part is None:
                        html_part = decoded_part
                return (text_part, html_part)

            html_part = None
            text_part = None
            default_charset = message.get_charset() or 'ISO-8859-1'
            (text_part, html_part) = process_part(message, default_charset,
                                                  text_part, html_part)
            if html_part:
                return ('text/html',
                        sanitize_html(
                            AbstractMailbox.strip_full_message_quoting_html(
                                html_part)))
            elif text_part:
                return ('text/plain',
                        AbstractMailbox.strip_full_message_quoting_plaintext(
                            text_part))
            else:
                return (
                    'text/plain',
                    u"Sorry, no assembl-supported mime type found in message parts"
                )

        (mimeType, body) = get_payload(parsed_email)

        def email_header_to_unicode(header_string, join_crlf=True):
            decoded_header = decode_email_header(header_string)
            default_charset = 'ASCII'

            text = ''.join([
                unicode(t[0], t[1] or default_charset) for t in decoded_header
            ])
            if join_crlf:
                text = u''.join(text.split('\r\n'))

            return text

        new_message_id = parsed_email.get('Message-ID', None)
        if new_message_id:
            new_message_id = self.clean_angle_brackets(
                email_header_to_unicode(new_message_id))
        else:
            error_description = "Unable to parse the Message-ID for message string: \n%s" % message_string
            return (None, None, error_description)

        assert new_message_id

        new_in_reply_to = parsed_email.get('In-Reply-To', None)
        if new_in_reply_to:
            new_in_reply_to = self.clean_angle_brackets(
                email_header_to_unicode(new_in_reply_to))

        sender = email_header_to_unicode(parsed_email.get('From'))
        sender_name, sender_email = parseaddr(sender)
        sender_email_account = EmailAccount.get_or_make_profile(
            self.db, sender_email, sender_name)
        creation_date = datetime.utcfromtimestamp(
            mktime_tz(parsedate_tz(parsed_email['Date'])))
        subject = email_header_to_unicode(parsed_email['Subject'], False)
        recipients = email_header_to_unicode(parsed_email['To'])
        body = body.strip()
        # Try/except around the normal case is an anti-pattern, but
        # sqlalchemy's Query.one() is the simplest single call that
        # distinguishes zero results, exactly one result, and duplicates.
        try:
            email_object = self.db.query(Email).filter(
                Email.source_post_id == new_message_id,
                Email.discussion_id == self.discussion_id,
                Email.source == self).one()
            if existing_email and existing_email != email_object:
                raise ValueError(
                    "The existing object isn't the same as the one found by message id"
                )
            email_object.recipients = recipients
            email_object.sender = sender
            email_object.creation_date = creation_date
            email_object.source_post_id = new_message_id
            email_object.in_reply_to = new_in_reply_to
            email_object.body_mime_type = mimeType
            email_object.imported_blob = message_string
            # TODO MAP: Make this nilpotent.
            email_object.subject = LangString.create(subject)
            email_object.body = LangString.create(body)
        except NoResultFound:
            email_object = Email(discussion=self.discussion,
                                 source=self,
                                 recipients=recipients,
                                 sender=sender,
                                 subject=LangString.create(subject),
                                 creation_date=creation_date,
                                 source_post_id=new_message_id,
                                 in_reply_to=new_in_reply_to,
                                 body=LangString.create(body),
                                 body_mime_type=mimeType,
                                 imported_blob=message_string)
        except MultipleResultsFound:
            """ TO find duplicates (this should no longer happen, but in case it ever does...

SELECT * FROM post WHERE id in (SELECT MAX(post.id) as max_post_id FROM imported_post JOIN post ON (post.id=imported_post.id) GROUP BY message_id, source_id HAVING COUNT(post.id)>1)

To kill them:


USE assembl;
UPDATE  post p
SET     parent_id = (
SELECT new_post_parent.id AS new_post_parent_id
FROM post AS post_to_correct
JOIN post AS bad_post_parent ON (post_to_correct.parent_id = bad_post_parent.id)
JOIN post AS new_post_parent ON (new_post_parent.message_id = bad_post_parent.message_id AND new_post_parent.id <> bad_post_parent.id)
WHERE post_to_correct.parent_id IN (
  SELECT MAX(post.id) as max_post_id
  FROM imported_post
  JOIN post ON (post.id=imported_post.id)
  GROUP BY message_id, source_id
  HAVING COUNT(post.id)>1
  )
AND p.id = post_to_correct.id
)

USE assembl;
DELETE
FROM post WHERE post.id IN (SELECT MAX(post.id) as max_post_id FROM imported_post JOIN post ON (post.id=imported_post.id) GROUP BY message_id, source_id HAVING COUNT(post.id)>1)

"""
            raise MultipleResultsFound("ID %s has duplicates in source %d" %
                                       (new_message_id, self.id))
        email_object.creator = sender_email_account.profile
        # email_object = self.db.merge(email_object)
        email_object.guess_languages()
        return (email_object, parsed_email, error_description)
Example #58
def parseRFC2616Date(s):
	"""returns seconds since the Unix epoch (UTC) for the HTTP-compatible
	time specification s.
	"""
	parts = emailutils.parsedate_tz(s)
	return emailutils.mktime_tz(parts)
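
# Note: parsedate_tz() returns None for unparsable input, so mktime_tz() would
# raise a TypeError on bad data. A defensive variant of the above:
#   parts = emailutils.parsedate_tz(s)
#   return emailutils.mktime_tz(parts) if parts is not None else None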
Example #59
def create_title_from_libsyn_rss(rss_feed_url):
    """Parses a libsyn-generated RSS feed"""

    if rss_feed_url.startswith('http'):
        feed = urllib.urlopen(rss_feed_url)
        feed_tree = ElementTree.parse(feed).getroot()
        libsyn_slug = re.search(r'//(.*)\.podiobooks', rss_feed_url).group(1)
    else:  # Only unit tests hit this side
        feed_tree = ElementTree.parse(rss_feed_url).getroot()
        libsyn_slug = 'linus'

    if feed_tree is None:
        return None

    feed_tree = feed_tree.find('channel')

    title = Title()

    title.name = feed_tree.find('title').text

    title.slug = slugify(title.name)
    existing_slug_count = Title.objects.all().filter(slug=title.slug).count()
    if existing_slug_count > 0:
        title.slug += "---CHANGEME--" + str(time.time())

    title.old_slug = title.slug
    title.libsyn_slug = libsyn_slug

    title.description = strip_tags(feed_tree.find('description').text).strip()
    explicit = feed_tree.find('{http://www.itunes.com/dtds/podcast-1.0.dtd}explicit')
    if explicit is not None and explicit.text == 'yes':
        title.is_explicit = True
    title.deleted = True

    title.libsyn_cover_image_url = feed_tree.find('image').find('url').text

    default_license = License.objects.get(slug='by-nc-nd')
    title.license = default_license

    title.save()
    items = feed_tree.findall('item')

    start_date = datetime.datetime.now(timezone.utc)

    for item in items:
        episode = Episode()
        episode.title = title
        episode.name = item.find('title').text
        episode.description = strip_tags(item.find('description').text).strip()
        episode.filesize = item.find('enclosure').get('length')
        episode.url = item.find('enclosure').get('url').replace(
            'traffic.libsyn.com', 'media.podiobooks.com')
        episode.duration = item.find(
            '{http://www.itunes.com/dtds/podcast-1.0.dtd}duration').text
        episode.media_date_created = datetime.datetime.fromtimestamp(
            mktime_tz(parsedate_tz(item.find('pubDate').text)), timezone.utc)
        try:
            episode.sequence = int(episode.url[episode.url.rfind('.') -
                                               2:episode.url.rfind('.')]
                                   )  # Use URL File Name to Calc Seq
            episode.media_date_created = start_date + datetime.timedelta(
                10, episode.sequence)
        except ValueError:
            print(episode.url)
            episode.sequence = 0
        episode.save()

    return title
Example #60
def rfc2822_to_datetime(rfc_date):
    """Converts a RFC 2822 date string to a Python datetime"""
    timestamp = mktime_tz(parsedate_tz(rfc_date))
    raw_dt = datetime.datetime.utcfromtimestamp(timestamp)
    return raw_dt.replace(tzinfo=pytz.utc)
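
# Usage sketch:
#   >>> rfc2822_to_datetime('Fri, 07 Dec 2012 16:12:48 +0100')
#   datetime.datetime(2012, 12, 7, 15, 12, 48, tzinfo=<UTC>)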