Example #1
    def get_parsed(self, message_set):
        parsed_list = []
        header_list = []
        header_time = []

        headers = None
        local_time = timezone('UTC')

        if not isinstance(message_set, str):
            for message in message_set:
                headers = message.get_all(
                    'RECEIVED')  # only handles a single email, will revisit

            while headers:
                header = headers.pop()  # walk the Received headers newest-first
                match = re.search('([a-zA-Z]{3},.+.[0-9].+[0-9]{4})',
                                  header, re.DOTALL)
                if match:
                    try:
                        header_time.append(
                            parsedate_to_datetime(
                                match.group().replace('\r\n', '')
                            ).astimezone(local_time).strftime(
                                "%Y-%m-%d %H:%M:%S"))
                    except (TypeError, ValueError):
                        # unparseable date: keep the header, skip its timestamp
                        pass
                header_list.append(header)
            parsed_list.append(list(zip(header_list, header_time)))
            return parsed_list
        else:
            self.error = message_set
            return self
Example #2
        def new_func(*args, **kwargs):
            r = func(*args, **kwargs)
            attempts = 5
            while attempts > 0 and r is not None and r.status_code == 429:
                attempts -= 1

                logger.info(
                    'Too many requests, sleeping for %d secs and retrying %d more times',
                    sleeptime, attempts)

                if 'retry-after' in r.headers:
                    logger.info('App suggested retrying after "%s"',
                                r.headers['retry-after'])
                    try:
                        secs = int(r.headers['retry-after'])
                    except ValueError:
                        # parse an HTTP-date (rfc5322 date-time); RFC 7231 says
                        # we MUST also support RFC 850 and ANSI C asctime
                        # formats, but those are skipped for now
                        secs = rfc5322.parsedate_to_datetime(
                            r.headers['retry-after'])
                        # the parsed datetime is usually tz-aware, so subtract
                        # an aware "now"; naive datetime.now() would raise
                        # TypeError here
                        secs = secs - datetime.now(timezone.utc)
                        secs = secs.total_seconds()

                    if secs <= 0:
                        secs = None

                    logger.info('App suggested sleeping for %s secs',
                                str(secs))
                    # todo: replace sleeptime with suggested retry-after value?
                    #       (3600 seems default, we really wanna wait an hour?)

                sleep(sleeptime)
                r = func(*args, **kwargs)
            return r
Example #3
def sync_installer(repo_url, local_dir: Path):
    logging.info("Start syncing {}".format(repo_url))
    local_dir.mkdir(parents=True, exist_ok=True)
    full_scan = random.random() < 0.1  # Do full version check less frequently

    def remote_list():
        r = requests.get(repo_url, timeout=TIMEOUT_OPTION)
        d = pq(r.content)
        for tr in d('table').find('tr'):
            tds = pq(tr).find('td')
            if len(tds) != 4:
                continue
            fname = tds[0].find('a').text
            md5 = tds[3].text
            yield (fname, md5)

    for filename, md5 in remote_list():
        pkg_url = "/".join([repo_url, filename])
        dst_file = local_dir / filename
        dst_file_wip = local_dir / ('.downloading.' + filename)
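        # download into a hidden ".downloading." file first; it is renamed
        # over dst_file only after a successful download (see below)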

        if dst_file.is_file():
            r = requests.head(pkg_url,
                              allow_redirects=True,
                              timeout=TIMEOUT_OPTION)
            len_avail = 'content-length' in r.headers
            if len_avail:
                remote_filesize = int(r.headers['content-length'])
            remote_date = parsedate_to_datetime(r.headers['last-modified'])
            stat = dst_file.stat()
            local_filesize = stat.st_size
            local_mtime = stat.st_mtime

            # Do content verification on ~5% of files (see issue #25)
            if (not len_avail or remote_filesize == local_filesize) and remote_date.timestamp() == local_mtime and \
                    (random.random() < 0.95 or md5_check(dst_file, md5)):
                logging.info("Skipping {}".format(filename))

                # Stop the scanning if the most recent version is present
                if not full_scan:
                    logging.info("Stop the scanning")
                    break

                continue

            logging.info("Removing {}".format(filename))
            dst_file.unlink()

        for retry in range(3):
            logging.info("Downloading {}".format(filename))
            err = ''
            try:
                err = curl_download(pkg_url, dst_file_wip, md5=md5)
                if err is None:
                    dst_file_wip.rename(dst_file)
            except sp.CalledProcessError:
                err = 'CalledProcessError'
            if err is None:
                break
            logging.error("Failed to download {}: {}".format(filename, err))
Example #4
    def inbox(self, actor_id):
        if request.json is None:
            abort(400, "json body is necessary")

        if request.headers.get("Host") != current_app.config["SERVER_NAME"]:
            abort(401, "Incorrect host")

        raw_date = request.headers.get("Date")
        if not raw_date:
            abort(401, "Date not provided")
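        # reject requests whose Date is too far from server time (basic replay protection)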
        time_difference = datetime.now(timezone.utc) - parsedate_to_datetime(raw_date)
        if abs(time_difference.total_seconds()) > 30:
            abort(401, "Invalid date")

        # TODO: implement digest check

        signature_check = signature_is_valid(
            self.manager.external_key_store.get,
            request.headers,
            url_for(".inbox", actor_id=actor_id),
            "POST",
        )
        if not signature_check and not current_app.config.get("DEBUG_INBOX_IGNORE_SIGNATURE"):
            abort(401, "Request signature could not be verified")

        actor = self.manager.find(actor_id)

        if not actor:
            abort(404)

        response = self.manager.handle_activity(actor, request.json)
        return activityjsonify(response)
Example #5
    def get_date_from_email_header(header: str) -> Optional[datetime]:
        """Parse an email header such as Date or Received. The format is either just the date
        or name-value pairs followed by ';' and the date specification. For example:
        by 2002:a17:90a:77cb:0:0:0:0 with SMTP id e11csp4670216pjs;        Mon, 21 Dec 2020 12:11:57 -0800 (PST)

        Args:
            header (str): header value to parse

        Returns:
            Optional[datetime]: parsed datetime
        """
        if not header:
            return None
        try:
            date_part = header.split(';')[-1].strip()
            res = parsedate_to_datetime(date_part)
            if res.tzinfo is None:
                # some headers may contain a non TZ date so we assume utc
                res = res.replace(tzinfo=timezone.utc)
            return res
        except Exception as ex:
            demisto.debug(
                f'Failed parsing date from header value: [{header}]. Err: {ex}. Will ignore and continue.'
            )
        return None
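
A quick check of the parsing step above, using the header value from the docstring (a minimal stdlib-only sketch):

from datetime import datetime, timedelta, timezone
from email.utils import parsedate_to_datetime

received = ('by 2002:a17:90a:77cb:0:0:0:0 with SMTP id e11csp4670216pjs;'
            '        Mon, 21 Dec 2020 12:11:57 -0800 (PST)')
# same steps the helper performs: take the part after the last ';' and parse it
parsed = parsedate_to_datetime(received.split(';')[-1].strip())
assert parsed == datetime(2020, 12, 21, 12, 11, 57,
                          tzinfo=timezone(timedelta(hours=-8)))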
Example #6
def save_email(message):
    """
    Saves a given email in the database
    :param message: email message
    :return:
    """
    ccs = message.cc
    recipients = message.sent_to

    parsed_date = parsedate_to_datetime(message.date)

    email = MyEmail(
        email_subject=message.subject,
        email_from=message.sent_from[0].get('email'),
        email_body=message.body.get('plain'),
        date=parsed_date
    )

    email.save()
    if ccs:
        for cc in ccs:
            email.copies.create(email_address=cc.get('email'))

    for r in recipients:
        email.recipients.create(email_address=r.get('email'))
Example #8
    def ingest(self, file_path, entity):
        entity.schema = model.get("Email")
        try:
            msg = Message(file_path.as_posix())
        except Exception as exc:
            msg = "Cannot open message file: %s" % exc
            raise ProcessingException(msg) from exc

        self.extract_olefileio_metadata(msg, entity)

        try:
            self.extract_msg_headers(entity, msg.header)
        except Exception:
            log.exception("Cannot parse Outlook-stored headers")

        entity.add("subject", msg.subject)
        entity.add("threadTopic", msg.getStringField("0070"))
        entity.add("encoding", msg.encoding)
        entity.add("bodyText", msg.body)
        entity.add("bodyHtml", msg.htmlBody)
        entity.add("messageId", self.parse_message_ids(msg.message_id))

        if not entity.has("inReplyTo"):
            entity.add("inReplyTo", self.parse_references(msg.references, []))

        try:
            date = parsedate_to_datetime(msg.date).isoformat()
            entity.add("date", date)
        except Exception:
            log.warning("Could not parse date: %s", msg.date)

        # sender name and email
        sender = self.get_identities(msg.sender)
        self.apply_identities(entity, sender, "emitters", "sender")

        # received by
        sender = self.get_identity(msg.getStringField("0040"),
                                   msg.getStringField("0076"))
        self.apply_identities(entity, sender, "emitters")

        froms = self.get_identities(msg.getStringField("1046"))
        self.apply_identities(entity, froms, "emitters", "from")

        tos = self.get_identities(msg.to)
        self.apply_identities(entity, tos, "recipients", "to")

        ccs = self.get_identities(msg.cc)
        self.apply_identities(entity, ccs, "recipients", "cc")

        bccs = self.get_identities(msg.bcc)
        self.apply_identities(entity, bccs, "recipients", "bcc")

        self.resolve_message_ids(entity)
        for attachment in msg.attachments:
            if attachment.type != "data":
                continue
            name = stringify(attachment.longFilename)
            name = name or stringify(attachment.shortFilename)
            self.ingest_attachment(entity, name, attachment.type,
                                   attachment.data)
Example #9
def write_table(mboxfile, mailTable):
    """
    Takes a list and extends it with lists of data, which is
    extracted from mbox messages.
    :param mboxfile: Mbox file name/path
    :param mailTable: A list (of lists)
    :return: An extended list of lists
    """
    mail_box_contents = mailbox.mbox(mboxfile)

    m_pbar = tqdm.tqdm(range(0, len(mail_box_contents)))
    m_pbar.set_description('Extracting mbox messages...')

    count = 0
    update_interval = min(50, len(mail_box_contents))

    for message in mail_box_contents:
        count += 1
        if count % update_interval == 0:
            m_pbar.update(update_interval)
        clean_from = clean_address(message['From'])
        clean_to = clean_addresses(message['To'])
        clean_cc = clean_addresses(message['Cc'])

        try:
            clean_date = email.parsedate_to_datetime(message['Date'])
        except Exception:  # missing or unparseable Date header
            clean_date = None

        mailTable.append([
            clean_from, clean_to, clean_cc, clean_date, message['Subject'],
            get_body(message)
        ])
Example #10
    def get_date(self: object) -> datetime:
        date_element = self.soup.find('time', {'datetime': True})

        if not date_element:
            raise ScraperError('Date missing from page.')

        date_string = date_element['datetime']

        for date_format in self.date_formats:
            try:
                return datetime.strptime(date_string, date_format)\
                    .replace(tzinfo=timezone.utc)
            except ValueError:
                pass

        try:
            # RFC-822
            return parsedate_to_datetime(date_string)
        except (TypeError, ValueError):  # ValueError on Python 3.10+, TypeError earlier
            pass
        try:
            # ISO-8601
            return datetime.fromisoformat(date_string)
        except ValueError:
            pass

        raise ScraperError(f'Unrecognized date format "{date_string}".')
Example #11
def _convert_to_timestamp(date):
    # convert an HTTP date to a POSIX timestamp
    from email.utils import parsedate_to_datetime
    try:
        return parsedate_to_datetime(date).timestamp()
    except (TypeError, ValueError):  # ValueError on Python 3.10+, TypeError earlier
        return None
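
A minimal check of the helper above (the HTTP-date value is illustrative):

assert _convert_to_timestamp('Wed, 21 Oct 2015 07:28:00 GMT') == 1445412480.0
assert _convert_to_timestamp('not a date') is None  # falls through via the (TypeError, ValueError) catch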
Example #12
def _record_call_status(request, related_cr):
    # Create dict to hold information of interest from call status response
    call_status_info = {}

    # Retrieve information we need to determine success of call
    # CallDuration will return the duration in seconds
    status_entries = ['CallStatus', 'Timestamp', 'AnsweredBy']

    for entry in status_entries:
        call_status_info[entry] = request.GET[entry]

    # Set Timestamp entry to datetime obj instead of RFC 2822
    call_status_info['Timestamp'] = parsedate_to_datetime(
        call_status_info['Timestamp'])

    # Check whether call was completed by a human (success metric)
    if (call_status_info['CallStatus'] == 'completed' and
            call_status_info['AnsweredBy'] == 'human'):
        # Save call duration and set related CallRequest.call_completed = True
        call_status_info['CallDuration'] = request.GET['CallDuration']
        call_status_info['Success'] = True
        related_cr.call_completed = True
        related_cr.save()
    else:
        # Save duration=0, log miss, don't change CallRequest.call_completed
        call_status_info['CallDuration'] = 0
        call_status_info['Success'] = False
        err_msg = 'Call ended with {} status, answered by {}'
        log.error(err_msg.format(
            call_status_info['CallStatus'], call_status_info['AnsweredBy']))

    return call_status_info
Example #13
async def get_rss_content(url: str) -> list:
    result = []

    async with aiohttp.ClientSession(trust_env=True) as sess:
        logger.debug(f'Start fetching {url}')
        resp = await sess.get(url)
        doc = fromstring(await resp.text())

        desc = doc.find('channel/description')
        if desc is not None:  # plain truthiness is False for childless lxml elements
            desc = desc.text

        for i in doc.iterfind('channel/item'):
            item = {}
            # RFC822
            item['date'] = parsedate_to_datetime(
                i.findtext('pubDate')).timestamp()
            item['title'] = i.findtext('title')
            item['description'] = i.findtext('description')
            item['link'] = i.findtext('link')
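            # assumes every item carries an <enclosure> element (podcast-style feed)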
            item['enclosure_url'] = i.find('enclosure').attrib['url']

            result.append(item)

    return result
Example #14
 def _process_message(self, message):
     msg = Message()
     msg.mailbox = self
     if 'subject' in message:
         msg.subject = convert_header_to_unicode(message['subject'])[0:255]
     if 'message-id' in message:
         msg.message_id = message['message-id'][0:255]
     if 'from' in message:
         msg.from_header = convert_header_to_unicode(message['from'])
     if 'to' in message:
         msg.to_header = convert_header_to_unicode(message['to'])
     if 'date' in message:
         sent_time_str = convert_header_to_unicode(message['date'])
         msg.sent_time = parsedate_to_datetime(sent_time_str)
     elif 'Delivered-To' in message:
         msg.to_header = convert_header_to_unicode(message['Delivered-To'])
     msg.save()
     message = self._get_dehydrated_message(message, msg)
     msg.set_body(message.as_string())
     if message['in-reply-to']:
         try:
             msg.in_reply_to = Message.objects.filter(
                 message_id=message['in-reply-to']
             )[0]
         except IndexError:
             pass
     msg.save()
     return msg
Example #15
def parse_http_date(value: str) -> Optional[datetime]:
    """Attempt to parse an HTTP (RFC 5322-compatible) timestamp"""
    try:
        return parsedate_to_datetime(value)
    except (TypeError, ValueError):
        logger.debug(f'Failed to parse timestamp: {value}')
        return None
Example #16
def downloading_worker(q):
    while True:
        item = q.get()
        if item is None:
            break

        try:
            url, dst_file, working_dir = item
            if dst_file.is_file():
                print("checking", url, flush=True)
                r = requests.head(url, timeout=TIMEOUT_OPTION)
                remote_filesize = int(r.headers['content-length'])
                remote_date = parsedate_to_datetime(r.headers['last-modified'])
                stat = dst_file.stat()
                local_filesize = stat.st_size
                local_mtime = stat.st_mtime

                if remote_filesize == local_filesize and remote_date.timestamp() == local_mtime:
                    print("skipping", dst_file.relative_to(working_dir), flush=True)
                    continue

                dst_file.unlink()
            print("downloading", url, flush=True)
            requests_download(url, dst_file)
        except Exception:
            traceback.print_exc()
            print("Failed to download", url, flush=True)
            if dst_file.is_file():
                dst_file.unlink()
        finally:
            q.task_done()
Example #17
def _fetch_entries(feed, loop=None):
    url = feed.url
    request_etag = feed.etag
    request_last_modified = feed.last_modified

    response = yield from _request_entries(
        url,
        etag=request_etag,
        last_modified=request_last_modified,
        loop=loop
    )

    if response is None:
        log.info('%s is not modified', feed)
        return None

    response_headers, xml = response

    etag = response_headers.get('etag', '')
    if isinstance(etag, bytes):
        etag = etag.decode('utf-8', 'ignore')

    last_modified = response_headers.get('last-modified')
    if last_modified:
        last_modified = parsedate_to_datetime(last_modified)

    data = feedparser.parse(xml)

    return FeedResponse(etag, last_modified, data.entries)
Example #18
    def new_format(self, navbar: BeautifulSoup,
                   content: BeautifulSoup) -> List[str]:
        """
        Extracts email message information if it uses the new Mailman format
        Args:
            navbar: BeautifulSoup
            content: BeautifulSoup

        Returns: List[str]
        """

        sender = content.find(id='from').text.split('via')[0][6:].strip()
        date_str = content.find(id='date').text.split(': ')[1].strip()
        date = parsedate_to_datetime(date_str).isoformat()
        body = content.find(id='body').text.strip()
        nxt, rep_to = None, None

        links = navbar.findAll('a')
        for link in links:
            if 'Next in thread' in str(link):
                nxt = '/'.join(
                    self.email_url.split('/')[:-1]) + '/' + link['href']
                nxt = nxt[1:] if nxt[0] == '/' else nxt
            elif 'reply to' in str(link):
                rep_to = '/'.join(
                    self.email_url.split('/')[:-1]) + '/' + link['href']
                rep_to = rep_to[1:] if rep_to[0] == '/' else rep_to
        return [str(i) for i in [sender, date, body, nxt, rep_to]]
Example #19
def getResponse_bf(path):
    global endPoint_bf
    url = endPoint_bf + path
    response = urllib.request.urlopen(url, timeout=30)
    res_date = parsedate_to_datetime(response.headers['date'])
    content = json.loads(response.read().decode('utf8'))
    return res_date, content
Example #20
def PREPROCESS_FUNC(filename, row):
    return {
        "id": int(row["TweetId"]),
        "text": row["TweetText"],
        "created_at": parsedate_to_datetime(row["TweetDate"]),
        "tags": [row["Topic"], row["Sentiment"]]
    }
Example #21
def asDate(raw):
    if raw:
        try:
            return parsedate_to_datetime(raw)
        except Exception:
            return None
    return None
Example #22
 def parse(cls, api, json):
     status = cls(api)
     setattr(status, '_json', json)
     for k, v in json.items():
         if k == 'user':
             user_model = getattr(api.parser.model_factory,
                                  'user') if api else User
             user = user_model.parse(api, v)
             setattr(status, 'author', user)
              setattr(status, 'user', user)  # DEPRECATED
         elif k == 'created_at':
             setattr(status, k, parsedate_to_datetime(v))
         elif k == 'source':
             if '<' in v:
                 # At this point, v should be of the format:
                 # <a href="{source_url}" rel="nofollow">{source}</a>
                 setattr(status, k, v[v.find('>') + 1:v.rfind('<')])
                 start = v.find('"') + 1
                 end = v.find('"', start)
                 setattr(status, 'source_url', v[start:end])
             else:
                 setattr(status, k, v)
                 setattr(status, 'source_url', None)
         elif k == 'retweeted_status':
             setattr(status, k, Status.parse(api, v))
         elif k == 'quoted_status':
             setattr(status, k, Status.parse(api, v))
         elif k == 'place':
             if v is not None:
                 setattr(status, k, Place.parse(api, v))
             else:
                 setattr(status, k, None)
         else:
             setattr(status, k, v)
     return status
Example #23
def download_list(url):
    headers = None

    cache = Path(config['cache'], hashlib.sha1(url.encode()).hexdigest())

    if cache.is_file():
        last_modified = datetime.utcfromtimestamp(cache.stat().st_mtime)
        headers = {
            'If-modified-since': eut.format_datetime(last_modified),
            'User-Agent': 'Bind adblock zonfile updater v1.0 (https://github.com/Trellmor/bind-adblock)'
        }

    try:
        r = requests.get(url, headers=headers, timeout=config['req_timeout_s'])

        if r.status_code == 200:
            with cache.open('w') as f:
                f.write(r.text)

            if 'last-modified' in r.headers:
                last_modified = eut.parsedate_to_datetime(
                    r.headers['last-modified']).timestamp()
                os.utime(str(cache), times=(last_modified, last_modified))

            return r.text
    except requests.exceptions.RequestException as e:
        print(e)

    if cache.is_file():
        with cache.open() as f:
            return f.read()
Example #24
def parse_email_date(value: str) -> datetime.datetime:
    """
    Parse a date in the format described by RFC 2822.
    Returns 1900-1-1 when the value cannot be parsed; the result may be
    naive or carry tzinfo.
    """
    try:
        return parsedate_to_datetime(value)
    except Exception:  # noqa
        pass
    match = re.search(
        r'(?P<date>\d{1,2}\s+(' + '|'.join(SHORT_MONTH_NAMES) +
        r')\s+\d{4})\s+' + r'(?P<time>\d{1,2}:\d{1,2}(:\d{1,2})?)\s*' +
        r'(?P<zone_sign>[+-])?(?P<zone>\d{4})?', value)
    if match:
        group = match.groupdict()
        day, month, year = group['date'].split()
        time_values = group['time'].split(':')
        zone_sign = int('{}1'.format(group.get('zone_sign') or '+'))
        zone = group['zone']
        return datetime.datetime(
            year=int(year),
            month=SHORT_MONTH_NAMES.index(month) + 1,
            day=int(day),
            hour=int(time_values[0]),
            minute=int(time_values[1]),
            second=int(time_values[2]) if len(time_values) > 2 else 0,
            tzinfo=datetime.timezone(
                datetime.timedelta(hours=int(zone[:2]) * zone_sign,
                                   minutes=int(zone[2:]) *
                                   zone_sign)) if zone else None,
        )
    else:
        return datetime.datetime(1900, 1, 1)
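
A small illustration of the two paths above: a well-formed RFC 2822 value takes the stdlib path, while input that defeats both the stdlib and the regex falls back to the 1900-1-1 sentinel.

parse_email_date('Mon, 21 Dec 2020 12:11:57 -0800')  # aware datetime, 2020-12-21
parse_email_date('garbage')                          # datetime.datetime(1900, 1, 1, 0, 0)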
Example #25
def parse_date(date_string):
    if type(date_string) is datetime:
        return date_string

    date_string = date_string.strip().rstrip("-").strip()

    dt = _parse_datestrings(date_string)

    if dt and type(dt) is datetime:
        return dt

    try:
        dt = parsedate_to_datetime(date_string)
        return dt
    except (TypeError, KeyError, ValueError, AttributeError):
        pass

    date9tuple = _parse_date(date_string)

    if date9tuple:
        try:
            # _parse_date returns a struct_time-style 9-tuple, so unpack
            # the first six fields for the datetime constructor
            dt = datetime(*date9tuple[:6])
        except Exception:
            pass
        return dt
Example #26
def check_and_download(url: str, dst_file: Path, caching=False) -> int:
    try:
        if caching:
            if url in download_cache:
                print(f"Using cached content: {url}", flush=True)
                with dst_file.open('wb') as f:
                    f.write(download_cache[url])
                return 0
            download_cache[url] = bytes()
        start = time.time()
        with requests.get(url, stream=True, timeout=(5, 10)) as r:
            r.raise_for_status()
            if 'last-modified' in r.headers:
                remote_ts = parsedate_to_datetime(
                    r.headers['last-modified']).timestamp()
            else:
                remote_ts = None

            with dst_file.open('wb') as f:
                for chunk in r.iter_content(chunk_size=1024**2):
                    if time.time() - start > DOWNLOAD_TIMEOUT:
                        raise TimeoutError("Download timeout")
                    if not chunk: continue  # filter out keep-alive new chunks

                    f.write(chunk)
                    if caching: download_cache[url] += chunk
            if remote_ts is not None:
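                # stamp the local file with the server's Last-Modified so
                # later runs can compare mtimes against the remote copy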
                os.utime(dst_file, (remote_ts, remote_ts))
        return 0
    except BaseException as e:
        print(e, flush=True)
        if dst_file.is_file(): dst_file.unlink()
        if url in download_cache: del download_cache[url]
    return 1
Example #27
def get_news():
    news = []
    try:
        feed = urlopen(
            'https://kozeagroup.wordpress.com/category/backoffice/feed/',
            timeout=3)
    except socket.timeout:
        return news
    tree = ElementTree.parse(feed)
    for item in tree.find('channel').findall('item'):
        date = parsedate_to_datetime(item.find('pubDate').text)
        entry = {
            'title': item.find('title').text,
            'description': item.find('description').text,
            'link': item.find('link').text,
            'isodate': date.strftime('%Y-%m-%d'),
            'date': date.strftime('%d %B %Y')
        }
        image = item.find(
            'media:thumbnail',
            namespaces={'media': 'http://search.yahoo.com/mrss/'})
        if image is not None:
            entry['image'] = image.attrib['url']
        news.append(entry)
    return news
Example #28
 def handle_selection(self, k):
     data = self.parent.parentApp.login_form.emails[self.cursor_line]
     self.parent.parentApp.email_detail_form.form_addr.value = data['from']
     self.parent.parentApp.email_detail_form.subject.value = data['subject']
     self.parent.parentApp.email_detail_form.date.value = parsedate_to_datetime(data['date']).strftime('%a, %d %b')
     self.parent.parentApp.email_detail_form.content.value = '\n\n'+data['body']
     self.parent.parentApp.switchForm('EMAIL_DETAIL')
Example #29
def insertNewsIntoTable(channel):
    '''
    1. fill table with news
    2. convert date into timestamp
    '''
    news = list()

    for index, item in enumerate(channel.entries):

        description = getDescription(item.description)

        try:
            pub_date_stamp = time.mktime(
                parsedate_to_datetime(item.published).timetuple())
        except (TypeError, ValueError) as error:
            logg.logging.error("Date parse error: " + str(error))
            pub_date_stamp = None  # keep the row; the timestamp is unknown

        media_content = checkMediaContent(item)
        image = ''

        if (media_content):
            try:
                response = requests.get(media_content)
                image = psycopg2.Binary(response.content)
            except Exception as error:
                logg.logging.error("Exception: " + str(error))

        row = (html.unescape(item.title), item.link, image, description,
               pub_date_stamp)
        news.append(row)
    return news
Example #30
    def check(since):
        conn = imaplib.IMAP4_SSL(settings.EMAIL_HOST)
        conn.login(settings.EMAIL_HOST_USER, settings.EMAIL_HOST_PASSWORD)
        conn.select('INBOX')

        # Format today as string
        today_string = date.today().strftime("%d-%b-%Y")

        # Search inbox (IMAP doesn't allow searching by timestamp)
        result, data = conn.search(
            None, f'(SENTSINCE {today_string} FROM {settings.CHRIS_EMAIL})')

        if not data:
            logger.debug('No emails found today.')
            return None

        # Look for emails received after the given 'since' datetime
        for email_id in data[0].split():
            msg_string = conn.fetch(email_id, '(RFC822)')[1][0][1]
            msg = message_from_bytes(msg_string)
            response_time_naive = parsedate_to_datetime(msg['Date'])
            response_time = utc.localize(response_time_naive)
            if response_time > since:
                return response_time
        logger.debug('No new emails found.')
        return None
Example #31
    def get_member_details(self, id):
        member = {}
        try:
            message = self.service.users().messages().get(
                userId='me', id=id, format='metadata').execute()

            header_data = message["payload"]["headers"]

            correct_subject = False

            for data in header_data:
                if data['name'].lower() == 'subject' and self.subject in data['value']:
                    correct_subject = True

            if not correct_subject:
                return ''

            for data in header_data:
                if "Date" == data["name"]:
                    date = parsedate_to_datetime(data["value"])
                    member["time"] = date.isoformat()
                if "From" == data["name"]:
                    print(data["value"])
                    email_id = data["value"]
                    if '<' in email_id:
                        start = email_id.find('<')
                        end = email_id.find('>')
                        email_id = email_id[start + 1:end]
                    member["email"] = email_id
            print(member)
            return member

        except errors.HttpError as error:
            print('An error occurred: %s' % error)
Example #32
    def test_last_modified_header_is_set(self):
        resp = self.app.get(self.changeset_uri, headers=self.headers)
        timestamp = resp.json["timestamp"]

        dt = parsedate_to_datetime(resp.headers["Last-Modified"])

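        # the changeset timestamp is in milliseconds, while Last-Modified
        # only carries second resolution, hence the truncating division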
        assert dt.timestamp() == int(timestamp / 1000)
Example #33
    def get(self, request, locale, collection, *args, **kwargs):
        self.collection = self.get_collection(collection).first()
        self.locale = locale

        pages = self.get_queryset()
        updated_pages = list(pages)

        if not pages:
            raise Http404

        last_updated = None
        if 'lastUpdated' in request.GET:
            last_updated = datetime.strptime(request.GET['lastUpdated'],
                                             '%Y-%m-%dT%H:%M:%SZ')

        if 'HTTP_IF_MODIFIED_SINCE' in request.META:
            last_updated = parsedate_to_datetime(
                request.META['HTTP_IF_MODIFIED_SINCE'])

        if last_updated:
            updated_pages = Page.objects.filter(
                last_published_at__gt=last_updated,
                id__in=pages.values_list("id", flat=True))
            if updated_pages.count() == 0:
                return HttpResponse(status=304)  # not modified
            updated_pages = list(updated_pages)

        tar = make_tar(list(pages),
                       updated_pages,
                       self.locale,
                       request,
                       content_serializer=self.content_serializer_class)
        return self.render_to_tar_response(tar)
Example #34
def fetch_csv_as_string(url):
  res = requests.get(url)
  last_modified = res.headers['Last-Modified']
  jst_datetime = eut.parsedate_to_datetime(last_modified).astimezone(JST)
  # NOTE: A char code of Fukushima pref's CSV is Shift-JIS
  res.encoding = 'shift_jis'
  return res.text, jst_datetime
Example #35
def parseweekmail(el, pl, st):
    '''
    :param el: mailbox length (number of messages)
    :param pl: poplib server object
    :param st: start date for parsing weekly reports
    :return: list of sender addresses
    '''
    sender_list = []
    for index in range(el, 0, -1):
        lines = pl.retr(index)[1]
        msg = BytesParser(EmailMessage).parsebytes(b'\r\n'.join(lines))

        # is this mail from the current week, and is the recipient the weekly-report group?
        mail_date = parsedate_to_datetime(msg.get('Date', "")).date()
        mail_receiver = parseaddr(msg.get('To', ""))[1]
        mail_cc = parseaddr(msg.get('Cc', ""))[1]
        if mail_date < st:
            break
        mail_subject = decode_str(msg.get('Subject', ""))
        if (mail_receiver == WEEKLY_GROUP or WEEKLY_GROUP in mail_cc) and not (
                mail_subject.startswith('项目周报')
                or decode_str(mail_subject).split('(')[0].endswith('项目周报')
                or decode_str(mail_subject).split('(')[0].endswith('项目周报')):
            sender_list.append(parseaddr(msg.get('From', ""))[1])
    return sender_list
Example #36
 def parse(cls, value, kwds):
     if not value:
         kwds["defects"].append(errors.HeaderMissingRequiredValue())
         kwds["datetime"] = None
         kwds["decoded"] = ""
         kwds["parse_tree"] = parser.TokenList()
         return
     if isinstance(value, str):
         value = utils.parsedate_to_datetime(value)
     kwds["datetime"] = value
     kwds["decoded"] = utils.format_datetime(kwds["datetime"])
     kwds["parse_tree"] = cls.value_parser(kwds["decoded"])
Example #37
    def gen_filename(mail_dict) :
        """ Generates a filename from a dictionary with keys 'Id', 'Date', 'Folder' """
        from email.utils import parsedate_to_datetime
        dateObj = parsedate_to_datetime(mail_dict['Date'])
        fulldate = dateObj.strftime("%Y-%m-%d_%H.%M_utc%z")
        year = dateObj.strftime("%Y")

        Id = mail_dict['Id'].replace('<', '').replace('>', '').replace('%', '').replace('/', '-').replace(' ', '')

        Folder = mail_dict['Folder'].replace(' ', '_')

        return os.path.join(Folder, year, fulldate + "_" + Id + '.eml')
Example #38
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-url", default=BASE_URL)
    parser.add_argument("--working-dir", default=WORKING_DIR)
    args = parser.parse_args()

    if args.working_dir is None:
        raise Exception("Working Directory is None")

    working_dir = Path(args.working_dir)

    remote_filelist = []
    rs = RemoteSite(args.base_url)
    for url in rs.files:
        dst_file = working_dir / rs.relpath(url)
        remote_filelist.append(dst_file.relative_to(working_dir))

        if dst_file.is_file():
            r = requests.head(url)
            remote_filesize = int(r.headers['content-length'])
            remote_date = parsedate_to_datetime(r.headers['last-modified'])
            stat = dst_file.stat()
            local_filesize = stat.st_size
            local_mtime = stat.st_mtime

            if remote_filesize == local_filesize and remote_date.timestamp() == local_mtime:
                print("Skipping", dst_file.relative_to(working_dir), flush=True)
                continue

            dst_file.unlink()
        else:
            dst_file.parent.mkdir(parents=True, exist_ok=True)

        print("downloading", url, flush=True)
        try:
            curl_download(url, dst_file)
        except Exception:
            print("Failed to download", url, flush=True)
            if dst_file.is_file():
                dst_file.unlink()

    local_filelist = []
    for local_file in working_dir.glob('**/*'):
        if local_file.is_file():
            local_filelist.append(local_file.relative_to(working_dir))

    for old_file in set(local_filelist) - set(remote_filelist):
        print("deleting", old_file, flush=True)
        old_file = working_dir / old_file
        old_file.unlink()
Example #39
 def __init__(self, feed_url):
     self.feed_url = feed_url
     xml = urlopen(feed_url).readall().decode()
     self.root = ET.fromstring(xml)
     self.items = []
     for item in self.root.iter('item'):
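         # children are accessed by fixed position (0=title, 1=link,
         # 2=description, 3=pubDate, 9=duration); assumes a rigid item layout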
         title = item[0].text
         link = item[1].text
         description = item[2].text
         pub_date = parsedate_to_datetime(item[3].text)
         duration_string = item[9].text
         duration = None
         if duration_string:
             duration = int(duration_string)
         feed_item = RssFeedItem(title, link, description, pub_date, duration)
         self.items.append(feed_item)
Example #40
    def get_sig_string(req, cano_req, scope):
        """
        Generate the AWS4 auth string to sign for the request.

        Checks whether the date has been passed as x-amz-date or date;
        if it came as date, parse it from the date string.

        req      -- Requests PreparedRequest object. This should already
                    include an x-amz-date or a date header.
        cano_req -- The Canonical Request, as returned by
                    get_canonical_request()
        scope    -- The credential scope string for the request

        """
        if 'x-amz-date' in req.headers:
            amz_date = req.headers['x-amz-date']
        elif 'date' in req.headers:
            amz_date = parsedate_to_datetime(req.headers['date']).strftime(
                '%Y%m%dT%H%M%SZ')
        hsh = hashlib.sha256(cano_req.encode())
        sig_items = ['AWS4-HMAC-SHA256', amz_date, scope, hsh.hexdigest()]
        sig_string = '\n'.join(sig_items)
        return sig_string
Example #41
    def _parse_mail(self, data):
        """
        Parse data from email and return a model Email object
        """
        msg = self._parser.parsestr(data)

        tos = msg.get_all('to', [])
        ccs = msg.get_all('cc', [])
        froms = msg.get_all('from', [])
        name, addr = getaddresses(froms)[0]

        email = {
            'sender': addr,
            'subject': msg['subject'],
            'to': [addr for name, addr in getaddresses(tos)],
            'cc': [addr for name, addr in getaddresses(ccs)],
            'epoch': int(parsedate_to_datetime(msg['Date']).timestamp()),
        }

        self._extract_content(email, msg)

        return email
Example #42
def sync_installer(repo_url, local_dir: Path):
    logging.info("Start syncing {}".format(repo_url))
    local_dir.mkdir(parents=True, exist_ok=True)

    def remote_list():
        r = requests.get(repo_url)
        d = pq(r.content)
        for tr in d('table').find('tr'):
            tds = pq(tr).find('td')
            if len(tds) != 4:
                continue
            fname = tds[0].find('a').text
            md5 = tds[3].text
            yield (fname, md5)

    for filename, md5 in remote_list():
        pkg_url = "/".join([repo_url, filename])
        dst_file = local_dir / filename

        if dst_file.is_file():
            r = requests.head(pkg_url)
            remote_filesize = int(r.headers['content-length'])
            remote_date = parsedate_to_datetime(r.headers['last-modified'])
            stat = dst_file.stat()
            local_filesize = stat.st_size
            local_mtime = stat.st_mtime

            if remote_filesize == local_filesize and remote_date.timestamp() == local_mtime:
                logging.info("Skipping {}".format(filename))
                continue

            dst_file.unlink()

        for retry in range(3):
            logging.info("Downloading {}".format(filename))
            err = curl_download(pkg_url, dst_file, md5=md5)
            if err is None:
                break
            logging.error("Failed to download {}: {}".format(filename, err))
Example #43
def datetime_from_rfc1123(string):
    return parsedate_to_datetime(string)
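
RFC 1123 dates carry an explicit zone, so the wrapper above yields an aware datetime (the date value is illustrative):

from datetime import timezone
assert datetime_from_rfc1123('Sun, 06 Nov 1994 08:49:37 GMT').tzinfo == timezone.utc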
Example #44
 def _get_message_date(self, message):
     return parsedate_to_datetime(message['Date'])
Example #45
 def handle(self, *args, **options):
     current_date = parsedate_to_datetime(options['current_date']) if options['current_date'] else now()
     for i in Interview.objects.filter(planned_date__lte=current_date, process__state__in=Process.OPEN_STATE_VALUES,
                                       state=Interview.PLANNED):
         i.state = Interview.WAIT_INFORMATION
         i.save()
Example #46
#!/usr/bin/env python3
import sys
import re
# import subprocess
from email.utils import format_datetime, parsedate_to_datetime

in_headers = True
for line in sys.stdin.readlines():
    if line == "\n": in_headers = False
    match = re.match(r'^Date: (.+)', line)

    if not in_headers or not match:
        print(line, end="")
        continue

    date_string = match.group(1)
    # use this if you do not have python 3.3+
    # converted_date = subprocess.Popen(['date','-d',date_string], stdout=subprocess.PIPE).communicate()[0].strip()
    converted_date = format_datetime(parsedate_to_datetime(date_string).astimezone(tz=None))
    print('Date:', converted_date)
Example #47
def _parse_ratelimit_header(request):
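    # use the server's own clock (Date header) as "now" so local clock skew
    # cannot distort the computed delay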
    now = parsedate_to_datetime(request.headers['Date'])
    reset = datetime.datetime.fromtimestamp(int(request.headers['X-Ratelimit-Reset']), datetime.timezone.utc)
    return (reset - now).total_seconds()
Example #48
    def request(self, route, *, header_bypass_delay=None, **kwargs):
        bucket = route.bucket
        method = route.method
        url = route.url

        lock = self._locks.get(bucket)
        if lock is None:
            lock = asyncio.Lock(loop=self.loop)
            if bucket is not None:
                self._locks[bucket] = lock

        # header creation
        headers = {
            'User-Agent': self.user_agent,
        }

        if self.token is not None:
            headers['Authorization'] = 'Bot ' + self.token if self.bot_token else self.token

        # some checking if it's a JSON request
        if 'json' in kwargs:
            headers['Content-Type'] = 'application/json'
            kwargs['data'] = utils.to_json(kwargs.pop('json'))

        kwargs['headers'] = headers

        if self._global_lock.locked():
            # wait until the global lock is complete
            yield from self._global_lock

        yield from lock
        with MaybeUnlock(lock) as maybe_lock:
            for tries in range(5):
                r = yield from self.session.request(method, url, **kwargs)
                log.debug(self.REQUEST_LOG.format(method=method, url=url, status=r.status, json=kwargs.get('data')))
                try:
                    # even errors have text involved in them so this is safe to call
                    data = yield from json_or_text(r)

                    # check if we have rate limit header information
                    remaining = r.headers.get('X-Ratelimit-Remaining')
                    if remaining == '0' and r.status != 429:
                        # we've depleted our current bucket
                        if header_bypass_delay is None:
                            now = parsedate_to_datetime(r.headers['Date'])
                            reset = datetime.datetime.fromtimestamp(int(r.headers['X-Ratelimit-Reset']), datetime.timezone.utc)
                            delta = (reset - now).total_seconds()
                        else:
                            delta = header_bypass_delay

                        fmt = 'A rate limit bucket has been exhausted (bucket: {bucket}, retry: {delta}).'
                        log.info(fmt.format(bucket=bucket, delta=delta))
                        maybe_lock.defer()
                        self.loop.call_later(delta, lock.release)

                    # the request was successful so just return the text/json
                    if 300 > r.status >= 200:
                        log.debug(self.SUCCESS_LOG.format(method=method, url=url, text=data))
                        return data

                    # we are being rate limited
                    if r.status == 429:
                        fmt = 'We are being rate limited. Retrying in {:.2} seconds. Handled under the bucket "{}"'

                        # sleep a bit
                        retry_after = data['retry_after'] / 1000.0
                        log.info(fmt.format(retry_after, bucket))

                        # check if it's a global rate limit
                        is_global = data.get('global', False)
                        if is_global:
                            log.info('Global rate limit has been hit. Retrying in {:.2} seconds.'.format(retry_after))
                            # acquire the global lock and block all processing
                            yield from self._global_lock

                        yield from asyncio.sleep(retry_after, loop=self.loop)

                        # release the global lock now that the
                        # global rate limit has passed
                        if is_global:
                            self._global_lock.release()

                        continue

                    # we've received a 502, unconditional retry
                    if r.status == 502 and tries <= 5:
                        yield from asyncio.sleep(1 + tries * 2, loop=self.loop)
                        continue

                    # the usual error cases
                    if r.status == 403:
                        raise Forbidden(r, data)
                    elif r.status == 404:
                        raise NotFound(r, data)
                    else:
                        raise HTTPException(r, data)
                finally:
                    # clean-up just in case
                    yield from r.release()
Example #49
 def test_parsedate_to_datetime_naive(self):
     self.assertEqual(
         utils.parsedate_to_datetime(self.datestring + ' -0000'),
         self.naive_dt)
Example #50
 def test_parsedate_to_datetime(self):
     self.assertEqual(
         utils.parsedate_to_datetime(self.datestring + self.offsetstring),
         self.aware_dt)
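
Together these two stdlib tests capture a subtlety worth remembering: an explicit "-0000" zone means "no usable timezone information" and parses to a naive datetime, while a real offset parses to an aware one. A standalone illustration:

from email.utils import parsedate_to_datetime

assert parsedate_to_datetime('Tue, 01 Jan 2019 06:00:00 -0000').tzinfo is None
assert parsedate_to_datetime('Tue, 01 Jan 2019 06:00:00 +0000').tzinfo is not None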
Example #51
def parsed(raw, uid, time, flags):
    # "email.message_from_bytes" uses "email.policy.compat32" policy
    # and that is intentional, because the newer policies don't work well
    # with real emails that have no encodings, badly formatted addresses, etc.
    orig = email.message_from_bytes(raw)
    htm, txt, files, headers, errors = parse_mime(orig, uid)
    meta = {'origin_uid': uid, 'files': [], 'errors': errors}
    if htm:
        embeds = {
            f['content-id']: f['url']
            for f in files if 'content-id' in f
        }
        htm, extra_meta = html.clean(htm, embeds)
        meta.update(extra_meta)
    elif txt:
        htm = html.from_text(txt)

    meta['preview'] = preview(htm, files)
    meta['files'] = files

    fields = (
        ('From', 1), ('Sender', 1),
        ('Reply-To', 0), ('To', 0), ('CC', 0), ('BCC', 0)
    )
    for n, one in fields:
        v = headers.get(n)
        if not v:
            continue
        v = addresses(v)
        meta[n.lower()] = v[0] if one else v

    subj = headers['Subject']
    meta['subject'] = str(subj).strip() if subj else ''

    refs = orig['references']
    refs = [i.strip().lower() for i in refs.split()] if refs else []
    parent = refs[-1] if refs else None
    in_reply_to = orig['in-reply-to'] and normalize_msgid(orig['in-reply-to'])
    if in_reply_to:
        parent = in_reply_to
        if not refs:
            refs = [in_reply_to]
    meta['parent'] = parent

    mid = orig['message-id']
    if mid is None:
        log.info('UID=%s has no "Message-ID" header', uid)
        mid = '<mailur@noid>'
    else:
        mid = normalize_msgid(mid)
    meta['msgid'] = mid

    arrived = dt.datetime.strptime(time.strip('"'), '%d-%b-%Y %H:%M:%S %z')
    meta['arrived'] = int(arrived.timestamp())

    date = orig['date']
    try:
        date = date and int(parsedate_to_datetime(date).timestamp())
    except Exception as e:
        meta['errors'].append('error on date: val=%r err=%r' % (date, e))
        log.error('UID=%s can\'t parse date: val=%r err=%r', uid, date, e)
        date = None
    meta['date'] = date or meta['arrived']

    msg = new()
    msg.add_header('X-UID', '<%s>' % uid)
    msg.add_header('Message-ID', mid)
    msg.add_header('Subject', meta['subject'])
    msg.add_header('Date', orig['Date'])

    for n, v in headers.items():
        if n in msg:
            continue
        msg.add_header(n, v)

    is_draft = '\\Draft' in flags
    if is_draft:
        draft_id = orig['X-Draft-ID'] or mid
        msg.add_header('X-Draft-ID', draft_id)
        meta['draft_id'] = draft_id
        txt = parse_draft(orig)[0]
    elif orig['X-Draft-ID']:
        msg.add_header('X-Draft-ID', orig['X-Draft-ID'])

    thrid = None
    if not is_draft:
        addrs = [msg['from'] or msg['sender'], msg['to']]
        addrs = (a for a in addrs if a)
        addrs = ','.join(sorted(
            '"%s" <%s>' % (n, a) if n else a
            for n, a in getaddresses(addrs)
        ))
        addrs_n_subj = ' '.join(i for i in (addrs, subj) if i)
        thrid = hashlib.md5(addrs_n_subj.encode()).hexdigest()
        thrid = '<*****@*****.**>' % thrid

    thrid = ' '.join(i for i in (thrid, orig['X-Thread-ID']) if i)
    if thrid:
        meta['thrid'] = thrid
        msg.add_header('X-Thread-ID', thrid)
        refs.insert(0, thrid)

    if refs:
        msg.add_header('In-Reply-To', refs[-1])
        msg.add_header('References', ' '.join(refs))

    msg.make_mixed()
    meta_txt = json.dumps(meta, sort_keys=True, ensure_ascii=False, indent=2)
    msg.attach(binary(meta_txt, 'application/json'))
    body = new()
    body.make_alternative()
    body.attach(binary(htm, 'text/html'))
    if txt:
        body.attach(binary(txt))
    msg.attach(body)

    flags = []
    if meta['errors']:
        flags.append('#err')
    return msg, flags
Example #52
def format_date(date):
    dt = parsedate_to_datetime(date)
    return {'timestamp': dt.timestamp(), 'date_str': dt.strftime('%Y-%m-%d %H:%M')}