Ejemplo n.º 1
0
    def parse_profile(self, profile, consts):
        """Populate this contact from a raw profile dict.

        :param profile: decoded profile payload (dict) from the site API
        :param consts: per-sex constant tables, indexed by int(profile['sex'])
        """
        if profile['online']:
            self.status = Contact.STATUS_ONLINE
            # Bug fix: a dead `self.status_msg = u'online'` assignment was
            # immediately overwritten here; only the final message is kept.
            self.status_msg = u'since %s' % profile['last_cnx']
        else:
            self.status = Contact.STATUS_OFFLINE
            self.status_msg = u'last connection %s' % profile['last_cnx']

        self.summary = unicode(unescape(profile.get('announce', '').strip()))
        # Append the "shopping list" section only when it is non-empty.
        if profile.get('shopping_list', ''):
            self.summary += u'\n\nLooking for:\n%s' % unescape(
                profile['shopping_list'].strip())

        for photo in profile['pics']:
            self.set_photo(photo.split('/')[-1],
                           url=photo + '/full',
                           thumbnail_url=photo + '/small',
                           hidden=False)
        self.profile = OrderedDict()

        if 'sex' in profile:
            for section, d in self.TABLE.items():
                flags = ProfileNode.SECTION
                # Leading '_' marks a head section; '+'/'-' restrict the
                # section to one sex value (1 resp. 0).
                if section.startswith('_'):
                    flags |= ProfileNode.HEAD
                if (section.startswith('+') and int(profile['sex']) != 1) or \
                   (section.startswith('-') and int(profile['sex']) != 0):
                    continue

                section = section.lstrip('_+-')

                s = ProfileNode(section,
                                section.capitalize(),
                                OrderedDict(),
                                flags=flags)

                for key, builder in d.items():
                    try:
                        value = builder.get_value(profile,
                                                  consts[int(profile['sex'])])
                    except KeyError:
                        # Source field missing from the profile: skip entry.
                        pass
                    else:
                        s.value[key] = ProfileNode(
                            key,
                            key.capitalize().replace('_', ' '), value)

                self.profile[section] = s

        self._aum_profile = profile
Ejemplo n.º 2
0
 def test_unescape(self):
     """Entity references must decode back to their literal characters."""
     escaped = '&lt;pre&gt;and then&lt;div&gt; &amp; other tags'
     expected = '<pre>and then<div> & other tags'
     self.assertEqual(expected, html2text.unescape(escaped))
Ejemplo n.º 3
0
	def decode(self, content):
		"""Decode HTML entities in *content*, keeping literal < and > intact."""
		content = self._replace(content, self.replace_html)
		# Literal < and > still present must remain distinguishable from
		# entities that decode to < and > in the unescape step below.
		marked = re.sub('(<|>)', r'_!!\1', content)
		decoded = html2text.unescape(marked)
		return decoded.replace('&nbsp_place_holder;', ' ')
Ejemplo n.º 4
0
    def parse_profile(self, profile, consts):
        """Populate this contact from a raw profile dict.

        :param profile: decoded profile payload (dict) from the site API
        :param consts: per-sex constant tables, indexed by int(profile['sex'])
        """
        if profile['online']:
            self.status = Contact.STATUS_ONLINE
            # Bug fix: a dead `self.status_msg = u'online'` assignment was
            # immediately overwritten here; only the final message is kept.
            self.status_msg = u'since %s' % profile['last_cnx']
        else:
            self.status = Contact.STATUS_OFFLINE
            self.status_msg = u'last connection %s' % profile['last_cnx']

        self.summary = unicode(unescape(profile.get('announce', '').strip()))
        # Append the "shopping list" section only when it is non-empty.
        if profile.get('shopping_list', ''):
            self.summary += u'\n\nLooking for:\n%s' % unescape(profile['shopping_list'].strip())

        for photo in profile['pics']:
            self.set_photo(photo.split('/')[-1],
                              url=photo + '/full',
                              thumbnail_url=photo + '/small',
                              hidden=False)
        self.profile = OrderedDict()

        if 'sex' in profile:
            for section, d in self.TABLE.iteritems():
                flags = ProfileNode.SECTION
                # Leading '_' marks a head section; '+'/'-' restrict the
                # section to one sex value (1 resp. 0).
                if section.startswith('_'):
                    flags |= ProfileNode.HEAD
                if (section.startswith('+') and int(profile['sex']) != 1) or \
                   (section.startswith('-') and int(profile['sex']) != 0):
                    continue

                section = section.lstrip('_+-')

                s = ProfileNode(section, section.capitalize(), OrderedDict(), flags=flags)

                for key, builder in d.iteritems():
                    try:
                        value = builder.get_value(profile, consts[int(profile['sex'])])
                    except KeyError:
                        # Source field missing from the profile: skip entry.
                        pass
                    else:
                        s.value[key] = ProfileNode(key, key.capitalize().replace('_', ' '), value)

                self.profile[section] = s

        self._aum_profile = profile
def unescape(value):
    """
    Convert HTML entities

    Uses html2text when available; otherwise returns *value* unchanged.
    """
    try:
        import html2text as h2t
    except ImportError:
        # Best-effort: without html2text we pass the text through as-is.
        return value
    text = h2t.unescape(value, unicode_snob=True)
    return text.replace("--", "-").replace('&nbsp_place_holder;', ' ')
Ejemplo n.º 6
0
    def _get_entry_name(self, parsed, entry):
        """Format the sender name for *entry* using ``self.name_format``.

        >>> import feedparser
        >>> f = Feed(name='test-feed')
        >>> parsed = feedparser.parse(
        ...     '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
        ...     '  <entry>\\n'
        ...     '    <author>\\n'
        ...     '      <name>Example author</name>\\n'
        ...     '      <email>[email protected]</email>\\n'
        ...     '      <url>http://example.com/</url>\\n'
        ...     '    </author>\\n'
        ...     '  </entry>\\n'
        ...     '</feed>\\n'
        ...     )
        >>> entry = parsed.entries[0]
        >>> f.name_format = ''
        >>> f._get_entry_name(parsed, entry)
        ''
        >>> f.name_format = '{author}'
        >>> f._get_entry_name(parsed, entry)
        'Example author'
        >>> f.name_format = '{feed-title}: {author}'
        >>> f._get_entry_name(parsed, entry)
        ': Example author'
        >>> f.name_format = '{author} ({feed.name})'
        >>> f._get_entry_name(parsed, entry)
        'Example author (test-feed)'
        """
        if not self.name_format:
            return ''
        feed = parsed.feed
        # Placeholders available to the format string; author/publisher
        # keep visible markers when no real value is found.
        substitutions = {
            'feed': self,
            'feed-name': self.name,
            'feed-url': self.url,
            'feed-title': feed.get('title', ''),
            'author': '<author>',
            'publisher': '<publisher>',
            }
        # Prefer the entry author, falling back to the feed author.
        for source in (entry, feed):
            detail = source.get('author_detail', [])
            if 'name' in detail and source.author_detail.name:
                substitutions['author'] = source.author_detail.name
                break
        if 'name' in feed.get('publisher_detail', []):
            substitutions['publisher'] = feed.publisher_detail.name
        formatted = self.name_format.format(**substitutions)
        return _html2text.unescape(formatted)
Ejemplo n.º 7
0
    def get_thread(self, thread):
        """Return *thread* populated with its full message history.

        Accepts a Thread object or a bare thread id.  Messages are chained
        root -> child through single-element ``children`` lists.
        """
        if not isinstance(thread, Thread):
            thread = Thread(thread)
            thread.flags = Thread.IS_DISCUSSION

        messages = self.browser.get_thread_messages(thread.id)

        contact = self.storage.get('s***s',
                                   thread.id,
                                   default={'lastmsg': datetime(1970, 1, 1)})
        thread.title = u'Discussion with %s' % messages['fields']['username']

        me = OkcContact(self.browser.get_profile(self.browser.me['userid']))
        other = OkcContact(self.browser.get_profile(thread.id))

        previous = None
        for raw in messages['messages']['messages']:
            sent_at = datetime.fromtimestamp(raw['timestamp'])

            # Anything newer than the last message we stored is unread.
            flags = Message.IS_UNREAD if contact['lastmsg'] < sent_at else 0

            if raw['from'] == thread.id:
                sender, receiver = other, me
            else:
                sender, receiver = me, other

            msg = Message(thread=thread,
                          id=raw['id'],
                          title=thread.title,
                          sender=sender.name,
                          receivers=[receiver.name],
                          date=sent_at,
                          content=to_unicode(unescape(raw['body'])),
                          children=[],
                          parent=previous,
                          signature=sender.get_text(),
                          flags=flags)

            if previous is None:
                thread.root = msg
            else:
                previous.children = [msg]

            previous = msg

        return thread
Ejemplo n.º 8
0
    def _get_entry_name(self, parsed, entry):
        """Format the sender name for *entry* using ``self.name_format``.

        >>> import feedparser
        >>> f = Feed(name='test-feed')
        >>> parsed = feedparser.parse(
        ...     '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
        ...     '  <entry>\\n'
        ...     '    <author>\\n'
        ...     '      <name>Example author</name>\\n'
        ...     '      <email>[email protected]</email>\\n'
        ...     '      <url>http://example.com/</url>\\n'
        ...     '    </author>\\n'
        ...     '  </entry>\\n'
        ...     '</feed>\\n'
        ...     )
        >>> entry = parsed.entries[0]
        >>> f.name_format = ''
        >>> f._get_entry_name(parsed, entry)
        ''
        >>> f.name_format = '{author}'
        >>> f._get_entry_name(parsed, entry)
        'Example author'
        >>> f.name_format = '{feed-title}: {author}'
        >>> f._get_entry_name(parsed, entry)
        ': Example author'
        >>> f.name_format = '{author} ({feed.name})'
        >>> f._get_entry_name(parsed, entry)
        'Example author (test-feed)'
        """
        if not self.name_format:
            return ''
        feed = parsed.feed
        # Placeholders available to the format string; author/publisher
        # keep visible markers when no real value is found.
        substitutions = {
            'feed': self,
            'feed-title': feed.get('title', ''),
            'author': '<author>',
            'publisher': '<publisher>',
            }
        # Prefer the entry author, falling back to the feed author.
        for source in (entry, feed):
            detail = source.get('author_detail', [])
            if 'name' in detail and source.author_detail.name:
                substitutions['author'] = source.author_detail.name
                break
        if 'name' in feed.get('publisher_detail', []):
            substitutions['publisher'] = feed.publisher_detail.name
        formatted = self.name_format.format(**substitutions)
        return _html2text.unescape(formatted)
Ejemplo n.º 9
0
def search_problems(cn):
    """Extract the leading description text from a cnblogs post page."""
    # Skip up to and past the opening tag of the post-body container.
    _, body = cn.split('id="cnblogs_post_body"', 1)
    body = body[body.find('>') + 1:]

    # Keep lines until the first one containing full-width punctuation.
    kept = []
    for line in body.split('\n'):
        if '。' in line or ',' in line:
            break
        kept.append(line)

    text = html2text.unescape(html2text.html2text(''.join(kept)))
    # Collapse trailing spaces before newlines into single newlines.
    text = re.sub(r' +\n+', '\n', text)
    return text.strip()
Ejemplo n.º 10
0
    def get_thread(self, thread):
        """Return *thread* populated with its full message history.

        Accepts a Thread object or a bare thread id.  Messages are chained
        root -> child through single-element ``children`` lists.
        """
        if not isinstance(thread, Thread):
            thread = Thread(thread)
            thread.flags = Thread.IS_DISCUSSION

        messages = self.browser.get_thread_messages(thread.id)

        contact = self.storage.get('s***s', thread.id, default={'lastmsg': datetime(1970,1,1)})
        thread.title = u'Discussion with %s' % messages['fields']['username']

        me = OkcContact(self.browser.get_profile(self.browser.me['userid']))
        other = OkcContact(self.browser.get_profile(thread.id))

        previous = None
        for raw in messages['messages']['messages']:
            sent_at = datetime.fromtimestamp(raw['timestamp'])

            # Anything newer than the last message we stored is unread.
            flags = Message.IS_UNREAD if contact['lastmsg'] < sent_at else 0

            if raw['from'] == thread.id:
                sender, receiver = other, me
            else:
                sender, receiver = me, other

            msg = Message(thread=thread,
                          id=raw['id'],
                          title=thread.title,
                          sender=sender.name,
                          receivers=[receiver.name],
                          date=sent_at,
                          content=to_unicode(unescape(raw['body'])),
                          children=[],
                          parent=previous,
                          signature=sender.get_text(),
                          flags=flags)

            if previous is None:
                thread.root = msg
            else:
                previous.children = [msg]

            previous = msg

        return thread
Ejemplo n.º 11
0
    def get_torrent(self, id):
        """Build a Torrent by scraping the currently loaded detail page.

        Walks every <div> in the document and dispatches on its id/class:
        'title' (name), 'download' (links), 'details' (size and peer
        counts), 'nfo' (description).

        NOTE(review): title/size/seed/leech/description are only bound if
        the matching div exists; a page without them would raise NameError.
        """
        url = NotAvailable
        magnet = NotAvailable
        for div in self.document.getiterator('div'):
            if div.attrib.get('id', '') == 'title':
                title = unicode(unescape(div.text.strip()))
            elif div.attrib.get('class', '') == 'download':
                for link in self.parser.select(div, 'a'):
                    href = link.attrib.get('href', '')
                    # https fails on the download server, so strip it
                    if href.startswith('https://'):
                        href = href.replace('https://', 'http://', 1)
                    if href.startswith('magnet:'):
                        magnet = unicode(href)
                    elif len(href):
                        url = unicode(href)
            elif div.attrib.get('id', '') == 'details':
                # Byte size is taken from inside "(NNN Bytes)".
                size = float(div.getchildren()[0].getchildren()[5].text.split(
                    '(')[1].split('Bytes')[0])
                # Peer counts live in the 'col2' column when it exists,
                # otherwise in the first column.
                if len(div.getchildren()) > 1 \
                and div.getchildren()[1].attrib.get('class', '') == 'col2':
                    child_to_explore = div.getchildren()[1]
                else:
                    child_to_explore = div.getchildren()[0]
                prev_child_txt = "none"
                seed = "-1"
                leech = "-1"
                # Values follow their label element, so remember the
                # previous element's text to pair label with value.
                for ch in child_to_explore.getchildren():
                    if prev_child_txt == "Seeders:":
                        seed = ch.text
                    if prev_child_txt == "Leechers:":
                        leech = ch.text
                    prev_child_txt = ch.text
            elif div.attrib.get('class', '') == 'nfo':
                description = unicode(
                    div.getchildren()[0].text_content().strip())
        torrent = Torrent(id, title)
        torrent.url = url or NotAvailable
        torrent.magnet = magnet
        torrent.size = size
        torrent.seeders = int(seed)
        torrent.leechers = int(leech)
        torrent.description = description
        torrent.files = NotAvailable

        return torrent
Ejemplo n.º 12
0
    def get_torrent(self, id):
        """Build a Torrent by scraping the currently loaded detail page.

        Walks every <div> in the document and dispatches on its id/class:
        'title' (name), 'download' (links), 'details' (size and peer
        counts), 'nfo' (description).

        NOTE(review): title/size/seed/leech/description are only bound if
        the matching div exists; a page without them would raise NameError.
        """
        url = NotAvailable
        magnet = NotAvailable
        for div in self.document.getiterator('div'):
            if div.attrib.get('id', '') == 'title':
                title = unicode(unescape(div.text.strip()))
            elif div.attrib.get('class', '') == 'download':
                for link in self.parser.select(div, 'a'):
                    href = link.attrib.get('href', '')
                    # https fails on the download server, so strip it
                    if href.startswith('https://'):
                        href = href.replace('https://', 'http://', 1)
                    if href.startswith('magnet:'):
                        magnet = unicode(href)
                    elif len(href):
                        url = unicode(href)
            elif div.attrib.get('id', '') == 'details':
                # Byte size is taken from inside "(NNN Bytes)".
                size = float(div.getchildren()[0].getchildren()[5].text.split('(')[1].split('Bytes')[0])
                # Peer counts live in the 'col2' column when it exists,
                # otherwise in the first column.
                if len(div.getchildren()) > 1 \
                and div.getchildren()[1].attrib.get('class', '') == 'col2':
                    child_to_explore = div.getchildren()[1]
                else:
                    child_to_explore = div.getchildren()[0]
                prev_child_txt = "none"
                seed = "-1"
                leech = "-1"
                # Values follow their label element, so remember the
                # previous element's text to pair label with value.
                for ch in child_to_explore.getchildren():
                    if prev_child_txt == "Seeders:":
                        seed = ch.text
                    if prev_child_txt == "Leechers:":
                        leech = ch.text
                    prev_child_txt = ch.text
            elif div.attrib.get('class', '') == 'nfo':
                description = unicode(div.getchildren()[0].text_content().strip())
        torrent = Torrent(id, title)
        torrent.url = url or NotAvailable
        torrent.magnet = magnet
        torrent.size = size
        torrent.seeders = int(seed)
        torrent.leechers = int(leech)
        torrent.description = description
        torrent.files = NotAvailable

        return torrent
Ejemplo n.º 13
0
    def _get_entry_name(self, parsed, entry):
        """Compose a friendly sender name from the feed title and author.

        >>> import feedparser
        >>> f = Feed(name='test-feed')
        >>> parsed = feedparser.parse(
        ...     '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
        ...     '  <entry>\\n'
        ...     '    <author>\\n'
        ...     '      <name>Example author</name>\\n'
        ...     '      <email>[email protected]</email>\\n'
        ...     '      <url>http://example.com/</url>\\n'
        ...     '    </author>\\n'
        ...     '  </entry>\\n'
        ...     '</feed>\\n'
        ...     )
        >>> entry = parsed.entries[0]
        >>> f.friendly_name = False
        >>> f._get_entry_name(parsed, entry)
        ''
        >>> f.friendly_name = True
        >>> f._get_entry_name(parsed, entry)
        'Example author'
        """
        if not self.friendly_name:
            return ''
        feed = parsed.feed
        pieces = ['', feed.get('title', '')]

        def _append(text):
            # Separate from any existing content with ': '.
            if ''.join(pieces):
                pieces.append(': ')
            pieces.append(text)

        # Prefer the entry author, falling back to the feed author.
        for source in (entry, feed):
            detail = source.get('author_detail', [])
            if 'name' in detail and source.author_detail.name:
                _append(source.author_detail.name)
                break
        if not ''.join(pieces) and self.use_publisher_email:
            if 'name' in feed.get('publisher_detail', []):
                _append(feed.publisher_detail.name)
        return _html2text.unescape(''.join(pieces))
Ejemplo n.º 14
0
    def _get_entry_name(self, parsed, entry):
        """Compose a friendly sender name from the feed title and author.

        >>> import feedparser
        >>> f = Feed(name='test-feed')
        >>> parsed = feedparser.parse(
        ...     '<feed xmlns="http://www.w3.org/2005/Atom">\\n'
        ...     '  <entry>\\n'
        ...     '    <author>\\n'
        ...     '      <name>Example author</name>\\n'
        ...     '      <email>[email protected]</email>\\n'
        ...     '      <url>http://example.com/</url>\\n'
        ...     '    </author>\\n'
        ...     '  </entry>\\n'
        ...     '</feed>\\n'
        ...     )
        >>> entry = parsed.entries[0]
        >>> f.friendly_name = False
        >>> f._get_entry_name(parsed, entry)
        ''
        >>> f.friendly_name = True
        >>> f._get_entry_name(parsed, entry)
        'Example author'
        """
        if not self.friendly_name:
            return ''
        feed = parsed.feed
        pieces = ['', feed.get('title', '')]

        def _append(text):
            # Separate from any existing content with ': '.
            if ''.join(pieces):
                pieces.append(': ')
            pieces.append(text)

        # Prefer the entry author, falling back to the feed author.
        for source in (entry, feed):
            detail = source.get('author_detail', [])
            if 'name' in detail and source.author_detail.name:
                _append(source.author_detail.name)
                break
        if not ''.join(pieces) and self.use_publisher_email:
            if 'name' in feed.get('publisher_detail', []):
                _append(feed.publisher_detail.name)
        return _html2text.unescape(''.join(pieces))
Ejemplo n.º 15
0
    def iter_torrents(self):
        """Yield Torrent objects scraped from the search-result table.

        Parses 'table#searchResult' row by row; the first row and any row
        with class "header" are column headers and are skipped.
        """
        try:
            table = self.parser.select(self.document.getroot(),
                                       'table#searchResult', 1)
        except BrokenPageError:
            # No result table on the page: nothing to yield.
            return
        first = True
        for tr in table.getiterator('tr'):
            if first:
                # Skip the table's first (header) row.
                first = False
                continue
            if tr.get('class', '') != "header":
                td = tr.getchildren()[1]
                div = td.getchildren()[0]
                link = div.find('a').attrib['href']
                title = unicode(unescape(div.find('a').text))
                # The torrent id is the third path component of the link.
                idt = link.split('/')[2]

                a = td.getchildren()[1]
                url = unicode(a.attrib['href'])

                # The size field seems to look like ", Size NNN\xa0MiB";
                # split out value and unit, dropping the 'i' of MiB/GiB.
                # TODO(review): confirm against a live results page.
                size = td.find('font').text.split(',')[1].strip()
                u = size.split(' ')[1].split(u'\xa0')[1].replace('i', '')
                size = size.split(' ')[1].split(u'\xa0')[0]

                seed = tr.getchildren()[2].text
                leech = tr.getchildren()[3].text

                torrent = Torrent(idt, title)
                torrent.url = url
                torrent.size = self.unit(float(size), u)
                torrent.seeders = int(seed)
                torrent.leechers = int(leech)
                # Detail-page fields are fetched lazily elsewhere.
                torrent.description = NotLoaded
                torrent.files = NotLoaded
                torrent.magnet = NotLoaded
                yield torrent
Ejemplo n.º 16
0
    def iter_torrents(self):
        """Yield Torrent objects scraped from the search-result table.

        Parses 'table#searchResult' row by row; the first row and any row
        with class "header" are column headers and are skipped.
        """
        try:
            table = self.parser.select(self.document.getroot(), 'table#searchResult', 1)
        except BrokenPageError:
            # No result table on the page: nothing to yield.
            return
        first = True
        for tr in table.getiterator('tr'):
            if first:
                # Skip the table's first (header) row.
                first = False
                continue
            if tr.get('class', '') != "header":
                td = tr.getchildren()[1]
                div = td.getchildren()[0]
                link = div.find('a').attrib['href']
                title = unicode(unescape(div.find('a').text))
                # The torrent id is the third path component of the link.
                idt = link.split('/')[2]

                a = td.getchildren()[1]
                url = unicode(a.attrib['href'])

                # The size field seems to look like ", Size NNN\xa0MiB";
                # split out value and unit, dropping the 'i' of MiB/GiB.
                # TODO(review): confirm against a live results page.
                size = td.find('font').text.split(',')[1].strip()
                u = size.split(' ')[1].split(u'\xa0')[1].replace('i', '')
                size = size.split(' ')[1].split(u'\xa0')[0]

                seed = tr.getchildren()[2].text
                leech = tr.getchildren()[3].text

                torrent = Torrent(idt, title)
                torrent.url = url
                torrent.size = self.unit(float(size), u)
                torrent.seeders = int(seed)
                torrent.leechers = int(leech)
                # Detail-page fields are fetched lazily elsewhere.
                torrent.description = NotLoaded
                torrent.files = NotLoaded
                torrent.magnet = NotLoaded
                yield torrent
Ejemplo n.º 17
0
    str).str.replace('_fr', '')
# Pair each English product row with its French translation row by SKU.
wc_translation_data = wc_en_data.merge(
    wc_fr_data, left_on=['SKU'], right_on=['original_sku'])

# Bare expression: displays the merged frame in the notebook cell output.
wc_translation_data
# %%
# After the merge, Name_x/Name_y hold the English/French product titles.
wc_title_translations = wc_translation_data[['Name_x', 'Name_y']]
wc_title_translations.columns = ['source', 'target']
wc_description_translations = wc_translation_data[[
    'Description_x', 'Description_y']].dropna(axis=0)
content_translation = pd.DataFrame()
# Wrap descriptions in <p> tags.
# NOTE(review): presumably langify expects paragraph-wrapped HTML — confirm.
content_translation['source'] = '<p>' + \
    wc_description_translations['Description_x']+'</p>'
content_translation['target'] = '<p>' + \
    wc_description_translations['Description_y']+'</p>'

wc_description_translations.columns = ['source', 'target']
# Combine title and description pairs into one import sheet.
shopify_langify_import = pd.concat(
    [wc_title_translations, content_translation], ignore_index=True)
shopify_langify_import.drop_duplicates(inplace=True)
# Decode HTML entities and collapse double spaces in both columns.
shopify_langify_import['source'] = shopify_langify_import['source'].apply(
    lambda x: unescape(x).replace('  ',' '))
shopify_langify_import['target'] = shopify_langify_import['target'].apply(
    lambda x: unescape(x).replace('  ',' '))
# %%
WC_TRANSLATION_CSV = 'translation/shopify-import-langify-aw.csv'
shopify_langify_import.to_csv(WC_TRANSLATION_CSV, mode='w+', index=False)


# %%
Ejemplo n.º 18
0
$NetBSD: patch-rss2email_feed.py,v 1.3 2019/08/28 19:44:48 schmonz Exp $

html2text no longer provides unescape(). Python 3.4's html module does.
<https://github.com/rss2email/rss2email/commit/81824e25723dcd2936f25f64ebc16f2e8ec9f310>

--- rss2email/feed.py.orig	2014-09-01 23:21:01.000000000 +0000
+++ rss2email/feed.py
@@ -48,6 +48,7 @@ import xml.sax.saxutils as _saxutils
 
 import feedparser as _feedparser
 import html2text as _html2text
+import html as _html
 
 from . import __url__
 from . import __version__
@@ -595,7 +596,7 @@ class Feed (object):
         if 'name' in feed.get('publisher_detail', []):
             data['publisher'] = feed.publisher_detail.name
         name = self.name_format.format(**data)
-        return _html2text.unescape(name)
+        return _html.unescape(name)
 
     def _validate_email(self, email, default=None):
         """Do a basic quality check on email address
Ejemplo n.º 19
0
    def do_login(self):
        """
        Attempt to log in.
        Note: this method does nothing if we are already logged in.

        Raises WebsiteNotSupported when the expected auth entry points are
        missing, and BrowserIncorrectPassword on authentication failure.
        """
        self.BASEURL = 'https://%s/' % self.first_domain
        self._sag = None

        if not self.home_page.is_here():
            self.home_page.go()

        if self.new_login:
            self.page.go_to_auth()
            # Rebase on the auth service's scheme and host.
            parsed = urlparse(self.url)
            self.BASEURL = '%s://%s' % (parsed.scheme, parsed.netloc)
        else:
            # On the homepage, we get the URL of the auth service.
            url = self.page.get_post_url()
            if url is None:
                raise WebsiteNotSupported()

            # First, post account number to get the password prompt.
            data = {
                'CCPTE': self.username[:11].encode('iso8859-15'),
                'canal': 'WEB',
                'hauteur_ecran': 768,
                'largeur_ecran': 1024,
                'liberror': '',
                'matrice': 'true',
                'origine': 'vitrine',
                'situationTravail': 'BANCAIRE',
                'typeAuthentification': 'CLIC_ALLER',
                'urlOrigine': self.page.url,
                'vitrine': 0,
            }

            parsed = urlparse(url)
            self.BASEURL = '%s://%s' % (parsed.scheme, parsed.netloc)
            self.location(url, data=data)

        assert self.login_page.is_here()

        # Then, post the password.
        self.page.login(self.username, self.password)

        if self.new_login:
            url = self.page.get_accounts_url()
        else:
            # The result of POST is the destination URL.
            url = self.page.get_result_url()

        # NOTE(review): a non-http result looks like it carries an error
        # message rather than a URL — confirm against server responses.
        if not url.startswith('http'):
            raise BrowserIncorrectPassword(unescape(url, unicode_snob=True))

        self.location(url.replace('Synthese', 'Synthcomptes'))

        if self.login_error.is_here():
            raise BrowserIncorrectPassword()

        if self.page is None:
            raise WebsiteNotSupported()

        if not self.accounts.is_here():
            # Sometimes the home page is Releves.
            new_url = re.sub('act=([^&=]+)', 'act=Synthcomptes', self.page.url,
                             1)
            self.location(new_url)

        if not self.accounts.is_here():
            raise BrowserIncorrectPassword()

        if self.code_caisse is None:
            self.code_caisse = self.page.get_code_caisse()

        # Store the current url to go back when requesting accounts list.
        self.accounts_url = re.sub('sessionSAG=[^&]+', 'sessionSAG={0}',
                                   self.page.url)

        # we can deduce the URL to "savings" and "loan" accounts from the regular accounts one
        self.savings_url = re.sub('act=([^&=]+)', 'act=Synthepargnes',
                                  self.accounts_url, 1)
        self.loans_url = re.sub('act=([^&=]+)', 'act=Synthcredits',
                                self.accounts_url, 1)
        self.advisor_url = re.sub('act=([^&=]+)', 'act=Contact',
                                  self.accounts_url, 1)
        self.profile_url = re.sub('act=([^&=]+)', 'act=Coordonnees',
                                  self.accounts_url, 1)

        if self.page.check_perimeters() and not self.broken_perimeters:
            self.perimeter_url = re.sub('act=([^&=]+)', 'act=Perimetre',
                                        self.accounts_url, 1)
            self.chg_perimeter_url = '%s%s' % (re.sub(
                'act=([^&=]+)', 'act=ChgPerim', self.accounts_url,
                1), '&typeaction=ChgPerim')
            self.location(self.perimeter_url.format(self.sag))
            self.page.check_multiple_perimeters()
Ejemplo n.º 20
0
def run(num=None):
    """Poll every configured feed and mail each unseen entry via SMTP.

    Loads the feed list from disk, fetches each active feed (optionally
    only the one at index *num*), sends every not-yet-seen entry as an
    email, records what was sent in the per-feed ``seen`` map, and
    always unlocks/saves the feed file on the way out.

    num -- optional integer index into the stored feed list; when given,
           only that single feed is processed.
    """
    feeds, feedfileObject = load()
    smtpserver = None
    try:
        # We store the default to address as the first item in the feeds list.
        # Here we take it out and save it for later.
        default_to = ""
        if feeds and isstr(feeds[0]):
            default_to = feeds[0]
            ifeeds = feeds[1:]
        else:
            ifeeds = feeds

        # Restrict the run to a single feed when a number was requested.
        if num: ifeeds = [feeds[num]]
        feednum = 0

        for f in ifeeds:
            try:
                feednum += 1
                if not f.active: continue

                if VERBOSE:
                    print >> warn, 'I: Processing [%d] "%s"' % (feednum, f.url)
                r = {}
                try:
                    # Fetch+parse under a hard time limit so one slow feed
                    # cannot stall the entire run.
                    r = timelimit(FEED_TIMEOUT, parse)(f.url, f.etag,
                                                       f.modified)
                except TimeoutError:
                    print >> warn, 'W: feed [%d] "%s" timed out' % (feednum,
                                                                    f.url)
                    continue

                # Handle various status conditions, as required
                if 'status' in r:
                    # 301: permanently moved -- remember the new URL.
                    if r.status == 301: f.url = r['url']
                    elif r.status == 410:
                        # 410: gone for good -- drop the feed entirely.
                        print >> warn, "W: feed gone; deleting", f.url
                        feeds.remove(f)
                        continue

                http_status = r.get('status', 200)
                if VERBOSE > 1: print >> warn, "I: http status", http_status
                http_headers = r.get('headers', {
                    'content-type': 'application/rss+xml',
                    'content-length': '1'
                })
                exc_type = r.get("bozo_exception", Exception()).__class__
                # No entries and no feed version (and not a 304 "not
                # modified"): diagnose why the fetch produced nothing,
                # report it, then skip this feed for this run.
                if http_status != 304 and not r.entries and not r.get(
                        'version', ''):
                    if http_status not in [200, 302]:
                        print >> warn, "W: error %d [%d] %s" % (http_status,
                                                                feednum, f.url)

                    elif contains(http_headers.get('content-type', 'rss'),
                                  'html'):
                        print >> warn, "W: looks like HTML [%d] %s" % (feednum,
                                                                       f.url)

                    elif http_headers.get('content-length', '1') == '0':
                        print >> warn, "W: empty page [%d] %s" % (feednum,
                                                                  f.url)

                    elif hasattr(socket,
                                 'timeout') and exc_type == socket.timeout:
                        print >> warn, "W: timed out on [%d] %s" % (feednum,
                                                                    f.url)

                    elif exc_type == IOError:
                        print >> warn, 'W: "%s" [%d] %s' % (r.bozo_exception,
                                                            feednum, f.url)

                    elif hasattr(feedparser,
                                 'zlib') and exc_type == feedparser.zlib.error:
                        print >> warn, "W: broken compression [%d] %s" % (
                            feednum, f.url)

                    elif exc_type in socket_errors:
                        exc_reason = r.bozo_exception.args[1]
                        print >> warn, "W: %s [%d] %s" % (exc_reason, feednum,
                                                          f.url)

                    elif exc_type == urllib2.URLError:
                        # URLError wraps the real reason; unwrap socket-level
                        # errors to their human-readable message.
                        if r.bozo_exception.reason.__class__ in socket_errors:
                            exc_reason = r.bozo_exception.reason.args[1]
                        else:
                            exc_reason = r.bozo_exception.reason
                        print >> warn, "W: %s [%d] %s" % (exc_reason, feednum,
                                                          f.url)

                    elif exc_type == AttributeError:
                        print >> warn, "W: %s [%d] %s" % (r.bozo_exception,
                                                          feednum, f.url)

                    elif exc_type == KeyboardInterrupt:
                        # Re-raise a user interrupt captured by feedparser.
                        raise r.bozo_exception

                    elif r.bozo:
                        print >> warn, 'E: error in [%d] "%s" feed (%s)' % (
                            feednum, f.url,
                            r.get("bozo_exception", "can't process"))

                    else:
                        # Unknown failure: dump full diagnostics for a bug report.
                        print >> warn, "=== rss2email encountered a problem with this feed ==="
                        print >> warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                        print >> warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                        print >> warn, "E:", r.get("bozo_exception",
                                                   "can't process"), f.url
                        print >> warn, r
                        print >> warn, "rss2email", __version__
                        print >> warn, "feedparser", feedparser.__version__
                        print >> warn, "html2text", h2t.__version__
                        print >> warn, "Python", sys.version
                        print >> warn, "=== END HERE ==="
                    continue

                # Oldest entries first, so mails arrive in chronological order.
                r.entries.reverse()

                for entry in r.entries:
                    id = getID(entry)

                    # If TRUST_GUID isn't set, we get back hashes of the content.
                    # Instead of letting these run wild, we put them in context
                    # by associating them with the actual ID (if it exists).

                    frameid = entry.get('id')
                    if not (frameid): frameid = id
                    if type(frameid) is DictType:
                        frameid = frameid.values()[0]

                    # If this item's ID is in our database
                    # then it's already been sent
                    # and we don't need to do anything more.

                    if frameid in f.seen:
                        if f.seen[frameid] == id: continue

                    # Without any destination address we cannot send anything
                    # for this feed at all.
                    if not (f.to or default_to):
                        print "No default email address defined. Please run 'r2e email emailaddress'"
                        print "Ignoring feed %s" % f.url
                        break

                    if 'title_detail' in entry and entry.title_detail:
                        title = entry.title_detail.value
                        if contains(entry.title_detail.type, 'html'):
                            title = html2text(title)
                    else:
                        # No title available: use a prefix of the content.
                        title = getContent(entry)[:70]

                    title = title.replace("\n", " ").strip()

                    # Fall back to "now" unless a date header is derived below.
                    datetime = time.gmtime()

                    if DATE_HEADER:
                        # Later entries in DATE_HEADER_ORDER win.
                        for datetype in DATE_HEADER_ORDER:
                            kind = datetype + "_parsed"
                            if kind in entry and entry[kind]:
                                datetime = entry[kind]

                    link = entry.get('link', "")

                    from_addr = getEmail(r, entry)

                    # Build the RFC 2822 headers for the outgoing message.
                    name = h2t.unescape(getName(r, entry))
                    fromhdr = formataddr((
                        name,
                        from_addr,
                    ))
                    tohdr = (f.to or default_to)
                    subjecthdr = title
                    datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000",
                                            datetime)
                    useragenthdr = "rss2email"

                    # Add post tags, if available
                    tagline = ""
                    if 'tags' in entry:
                        tags = entry.get('tags')
                        taglist = []
                        if tags:
                            for tag in tags:
                                taglist.append(tag['term'])
                        if taglist:
                            tagline = ",".join(taglist)

                    extraheaders = {
                        'Date': datehdr,
                        'User-Agent': useragenthdr,
                        'X-RSS-Feed': f.url,
                        'X-RSS-ID': id,
                        'X-RSS-URL': link,
                        'X-RSS-TAGS': tagline
                    }
                    if BONUS_HEADER != '':
                        # BONUS_HEADER holds user-supplied "Name: value" lines.
                        for hdr in BONUS_HEADER.strip().splitlines():
                            pos = hdr.strip().find(':')
                            if pos > 0:
                                extraheaders[hdr[:pos]] = hdr[pos + 1:].strip()
                            else:
                                print >> warn, "W: malformed BONUS HEADER", BONUS_HEADER

                    entrycontent = getContent(entry, HTMLOK=HTML_MAIL)
                    contenttype = 'plain'
                    content = ''
                    if USE_CSS_STYLING and HTML_MAIL:
                        # Styled HTML mail: wrap entry in a full HTML document
                        # with the configured stylesheet.
                        contenttype = 'html'
                        content = "<html>\n"
                        content += '<head><style><!--' + STYLE_SHEET + '//--></style></head>\n'
                        content += '<body>\n'
                        content += '<div id="entry">\n'
                        content += '<h1'
                        content += ' class="header"'
                        content += '><a href="' + link + '">' + subjecthdr + '</a></h1>\n'
                        if ishtml(entrycontent):
                            body = entrycontent[1].strip()
                        else:
                            body = entrycontent.strip()
                        if body != '':
                            content += '<div id="body"><table><tr><td>\n' + body + '</td></tr></table></div>\n'
                        content += '\n<p class="footer">URL: <a href="' + link + '">' + link + '</a>'
                        if hasattr(entry, 'enclosures'):
                            # Link (and for 'src' enclosures, inline) attachments.
                            for enclosure in entry.enclosures:
                                if (hasattr(enclosure, 'url')
                                        and enclosure.url != ""):
                                    content += ('<br/>Enclosure: <a href="' +
                                                enclosure.url + '">' +
                                                enclosure.url + "</a>\n")
                                if (hasattr(enclosure, 'src')
                                        and enclosure.src != ""):
                                    content += ('<br/>Enclosure: <a href="' +
                                                enclosure.src + '">' +
                                                enclosure.src +
                                                '</a><br/><img src="' +
                                                enclosure.src + '"\n')
                        if 'links' in entry:
                            # "via" links credit the original source of the item.
                            for extralink in entry.links:
                                if ('rel' in extralink
                                    ) and extralink['rel'] == u'via':
                                    extraurl = extralink['href']
                                    extraurl = extraurl.replace(
                                        'http://www.google.com/reader/public/atom/',
                                        'http://www.google.com/reader/view/')
                                    viatitle = extraurl
                                    if ('title' in extralink):
                                        viatitle = extralink['title']
                                    content += '<br/>Via: <a href="' + extraurl + '">' + viatitle + '</a>\n'
                        content += '</p></div>\n'
                        content += "\n\n</body></html>"
                    else:
                        if ishtml(entrycontent):
                            # Plain (unstyled) HTML mail.
                            contenttype = 'html'
                            content = "<html>\n"
                            content = (
                                "<html><body>\n\n" + '<h1><a href="' + link +
                                '">' + subjecthdr + '</a></h1>\n\n' +
                                entrycontent[1].strip()
                                +  # drop type tag (HACK: bad abstraction)
                                '<p>URL: <a href="' + link + '">' + link +
                                '</a></p>')

                            if hasattr(entry, 'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('Enclosure: <a href="' +
                                                    enclosure.url + '">' +
                                                    enclosure.url +
                                                    "</a><br/>\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink
                                        ) and extralink['rel'] == u'via':
                                        content += 'Via: <a href="' + extralink[
                                            'href'] + '">' + extralink[
                                                'title'] + '</a><br/>\n'

                            content += ("\n</body></html>")
                        else:
                            # Text-only mail.
                            content = entrycontent.strip() + "\n\nURL: " + link
                            if hasattr(entry, 'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('\nEnclosure: ' +
                                                    enclosure.url + "\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink
                                        ) and extralink['rel'] == u'via':
                                        content += '<a href="' + extralink[
                                            'href'] + '">Via: ' + extralink[
                                                'title'] + '</a>\n'

                    # send() reuses (and returns) the open SMTP connection.
                    smtpserver = send(fromhdr, tohdr, subjecthdr, content,
                                      contenttype, extraheaders, smtpserver)

                    # Mark this entry as sent so it is skipped next run.
                    f.seen[frameid] = id

                # Remember cache validators for conditional GET next time.
                f.etag, f.modified = r.get('etag',
                                           None), r.get('modified', None)
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                # Deliberate catch-all: one broken feed must not abort the
                # rest of the run; dump diagnostics and move on.
                print >> warn, "=== rss2email encountered a problem with this feed ==="
                print >> warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                print >> warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                print >> warn, "E: could not parse", f.url
                traceback.print_exc(file=warn)
                print >> warn, "rss2email", __version__
                print >> warn, "feedparser", feedparser.__version__
                print >> warn, "html2text", h2t.__version__
                print >> warn, "Python", sys.version
                print >> warn, "=== END HERE ==="
                continue

    finally:
        # Always persist state and close the SMTP connection, even on error.
        unlock(feeds, feedfileObject)
        if smtpserver:
            smtpserver.quit()
Ejemplo n.º 21
0
def run(num=None):
    """Poll every configured feed and deliver each unseen entry by mail.

    Variant of the rss2email main loop that threads messages (Message-ID /
    References headers), can prepend summaries, can inline images as data
    URIs, delivers through a mail server handle that supports IMAP folders,
    and post-processes IMAP folders (mark-as-read / move-read) at the end.

    num -- optional integer index into the stored feed list; when given,
           only that single feed is processed.
    """
    feeds, feedfileObject = load()
    mailserver = None
    try:
        # We store the default to address as the first item in the feeds list.
        # Here we take it out and save it for later.
        default_to = ""
        if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:]
        else: ifeeds = feeds

        # Restrict the run to a single feed when a number was requested.
        if num: ifeeds = [feeds[num]]
        feednum = 0

        for f in ifeeds:
            try:
                feednum += 1
                if not f.active: continue

                if VERBOSE: print >>warn, 'I: Processing [%d] "%s"' % (feednum, f.url)
                r = {}
                try:
                    # Fetch+parse under a hard time limit so one slow feed
                    # cannot stall the entire run.
                    r = timelimit(FEED_TIMEOUT, parse)(f.url, f.etag, f.modified)
                except TimeoutError:
                    print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, f.url)
                    continue

                # Handle various status conditions, as required
                if 'status' in r:
                    # 301: permanently moved -- remember the new URL.
                    if r.status == 301: f.url = r['url']
                    elif r.status == 410:
                        # 410: gone for good -- drop the feed entirely.
                        print >>warn, "W: feed gone; deleting", f.url
                        feeds.remove(f)
                        continue

                http_status = r.get('status', 200)
                if VERBOSE > 1: print >>warn, "I: http status", http_status
                http_headers = r.get('headers', {
                  'content-type': 'application/rss+xml',
                  'content-length':'1'})
                exc_type = r.get("bozo_exception", Exception()).__class__
                # No entries and no feed version (and not a 304 "not
                # modified"): diagnose why, report, then skip this feed.
                if http_status != 304 and not r.entries and not r.get('version', ''):
                    if http_status not in [200, 302]:
                        print >>warn, "W: error %d [%d] %s" % (http_status, feednum, f.url)

                    elif contains(http_headers.get('content-type', 'rss'), 'html'):
                        print >>warn, "W: looks like HTML [%d] %s"  % (feednum, f.url)

                    elif http_headers.get('content-length', '1') == '0':
                        print >>warn, "W: empty page [%d] %s" % (feednum, f.url)

                    elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
                        print >>warn, "W: timed out on [%d] %s" % (feednum, f.url)

                    elif exc_type == IOError:
                        print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, f.url)

                    elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
                        print >>warn, "W: broken compression [%d] %s" % (feednum, f.url)

                    elif exc_type in socket_errors:
                        exc_reason = r.bozo_exception.args[1]
                        print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)

                    elif exc_type == urllib2.URLError:
                        # URLError wraps the real reason; unwrap socket-level
                        # errors to their human-readable message.
                        if r.bozo_exception.reason.__class__ in socket_errors:
                            exc_reason = r.bozo_exception.reason.args[1]
                        else:
                            exc_reason = r.bozo_exception.reason
                        print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)

                    elif exc_type == AttributeError:
                        print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, f.url)

                    elif exc_type == KeyboardInterrupt:
                        # Re-raise a user interrupt captured by feedparser.
                        raise r.bozo_exception

                    elif r.bozo:
                        print >>warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, f.url, r.get("bozo_exception", "can't process"))

                    else:
                        # Unknown failure: dump full diagnostics for a bug report.
                        print >>warn, "=== rss2email encountered a problem with this feed ==="
                        print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                        print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                        print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
                        print >>warn, r
                        print >>warn, "rss2email", __version__
                        print >>warn, "feedparser", feedparser.__version__
                        print >>warn, "html2text", h2t.__version__
                        print >>warn, "Python", sys.version
                        print >>warn, "=== END HERE ==="
                    continue

                # Oldest entries first, so mails arrive in chronological order.
                r.entries.reverse()

                for entry in r.entries:
                    id = getID(entry)

                    # If TRUST_GUID isn't set, we get back hashes of the content.
                    # Instead of letting these run wild, we put them in context
                    # by associating them with the actual ID (if it exists).

                    frameid = entry.get('id')
                    if not(frameid): frameid = id
                    if type(frameid) is DictType:
                        frameid = frameid.values()[0]

                    # If this item's ID is in our database
                    # then it's already been sent
                    # and we don't need to do anything more.

                    if frameid in f.seen:
                        if f.seen[frameid] == id: continue

                    # Without any destination address we cannot send anything
                    # for this feed at all.
                    if not (f.to or default_to):
                        print "No default email address defined. Please run 'r2e email emailaddress'"
                        print "Ignoring feed %s" % f.url
                        break

                    if 'title_detail' in entry and entry.title_detail:
                        title = entry.title_detail.value
                        if contains(entry.title_detail.type, 'html'):
                            title = html2text(title)
                    else:
                        # No title available: use a prefix of the content.
                        title = getContent(entry)[:70]

                    title = title.replace("\n", " ").strip()

                    # Fall back to "now" unless a date header is derived below.
                    datetime = time.gmtime()

                    if DATE_HEADER:
                        # Later entries in DATE_HEADER_ORDER win.
                        for datetype in DATE_HEADER_ORDER:
                            kind = datetype+"_parsed"
                            if kind in entry and entry[kind]: datetime = entry[kind]

                    link = entry.get('link', "")

                    from_addr = getEmail(r, entry)

                    # Build the RFC 2822 headers for the outgoing message.
                    name = h2t.unescape(getName(r, entry))
                    fromhdr = formataddr((name, from_addr,))
                    tohdr = (f.to or default_to)
                    subjecthdr = title
                    datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
                    useragenthdr = "rss2email"

                    # Add post tags, if available
                    tagline = ""
                    if 'tags' in entry:
                        tags = entry.get('tags')
                        taglist = []
                        if tags:
                            for tag in tags:
                                taglist.append(tag['term'])
                        if taglist:
                            tagline = ",".join(taglist)

                    # Message-ID is a SHA-1 of the entry id so resends thread
                    # with the original; References is filled in below.
                    extraheaders = {'Date': datehdr, 'User-Agent': useragenthdr, 'X-RSS-Feed': f.url, 'Message-ID': '<%s>' % hashlib.sha1(id.encode('utf-8')).hexdigest(), 'X-RSS-ID': id, 'X-RSS-URL': link, 'X-RSS-TAGS' : tagline, 'X-MUNGED-FROM': getMungedFrom(r), 'References': ''}
                    if BONUS_HEADER != '':
                        # BONUS_HEADER holds user-supplied "Name: value" lines.
                        for hdr in BONUS_HEADER.strip().splitlines():
                            pos = hdr.strip().find(':')
                            if pos > 0:
                                extraheaders[hdr[:pos]] = hdr[pos+1:].strip()
                            else:
                                print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER

                    entrycontent = getContent(entry, HTMLOK=HTML_MAIL)
                    contenttype = 'plain'
                    content = ''
                    if THREAD_ON_TAGS and len(tagline):
                        # Thread messages sharing a tag: one synthetic
                        # reference per tag, hashed like Message-ID above.
                        extraheaders['References'] += ''.join([' <%s>' % hashlib.sha1(t.strip().encode('utf-8')).hexdigest() for t in tagline.split(',')])
                    if USE_CSS_STYLING and HTML_MAIL:
                        # Styled HTML mail: wrap entry in a full HTML document
                        # with the configured stylesheet.
                        contenttype = 'html'
                        content = "<html>\n"
                        content += '<head><meta http-equiv="Content-Type" content="text/html"><style>' + STYLE_SHEET + '</style></head>\n'
                        content += '<body style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;">\n'
                        content += '<div id="entry">\n'
                        content += '<h1 class="header"'
                        content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n'
                        if ishtml(entrycontent):
                            body = entrycontent[1].strip()
                            if SUMMARIZE:
                                # Prepend an auto-summary of the plain text.
                                content += '<div class="summary">%s</div>' % (summarize(html2text(body, plaintext=True), SUMMARIZE) + "<hr>")
                        else:
                            body = entrycontent.strip()
                            if SUMMARIZE:
                                content += '<div class="summary">%s</div>' % (summarize(body, SUMMARIZE) + "<hr>")
                        if THREAD_ON_LINKS:
                            # Thread messages sharing hyperlinks: hash each
                            # href found in the body into References.
                            parser = Parser()
                            parser.feed(body)
                            extraheaders['References'] += ''.join([' <%s>' % hashlib.sha1(h.strip().encode('utf-8')).hexdigest() for h in parser.attrs])
                        if INLINE_IMAGES_DATA_URI:
                            # Fetch each <img src> and embed it as a base64
                            # data: URI so the mail is self-contained.
                            parser = Parser(tag='img', attr='src')
                            parser.feed(body)
                            for src in parser.attrs:
                                try:
                                    img = feedparser._open_resource(src, None, None, feedparser.USER_AGENT, link, [], {})
                                    data = img.read()
                                    if hasattr(img, 'headers'):
                                        headers = dict((k.lower(), v) for k, v in dict(img.headers).items())
                                        ctype = headers.get('content-type', None)
                                        if ctype and INLINE_IMAGES_DATA_URI:
                                            body = body.replace(src,'data:%s;base64,%s' % (ctype, base64.b64encode(data)))
                                except:
                                    # Best-effort: a broken image must not
                                    # block delivery of the entry.
                                    print >>warn, "Could not load image: %s" % src
                                    pass
                        if body != '':
                            content += '<div id="body">\n' + body + '</div>\n'
                        content += '\n<p class="footer">URL: <a href="'+link+'">'+link+'</a>'
                        if hasattr(entry,'enclosures'):
                            # Link (and for 'src' enclosures, inline) attachments.
                            for enclosure in entry.enclosures:
                                if (hasattr(enclosure, 'url') and enclosure.url != ""):
                                    content += ('<br/>Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a>\n")
                                if (hasattr(enclosure, 'src') and enclosure.src != ""):
                                    content += ('<br/>Enclosure: <a href="'+enclosure.src+'">'+enclosure.src+'</a><br/><img src="'+enclosure.src+'"\n')
                        if 'links' in entry:
                            # "via" links credit the original source of the item.
                            for extralink in entry.links:
                                if ('rel' in extralink) and extralink['rel'] == u'via':
                                    extraurl = extralink['href']
                                    extraurl = extraurl.replace('http://www.google.com/reader/public/atom/', 'http://www.google.com/reader/view/')
                                    viatitle = extraurl
                                    if ('title' in extralink):
                                        viatitle = extralink['title']
                                    content += '<br/>Via: <a href="'+extraurl+'">'+viatitle+'</a>\n'
                        content += '</p></div>\n'
                        content += "\n\n</body></html>"
                    else:
                        if ishtml(entrycontent):
                            # Plain (unstyled) HTML mail.
                            contenttype = 'html'
                            content = "<html>\n"
                            content = ("<html><body>\n\n" +
                                       '<h1><a href="'+link+'">'+subjecthdr+'</a></h1>\n\n' +
                                       entrycontent[1].strip() + # drop type tag (HACK: bad abstraction)
                                       '<p>URL: <a href="'+link+'">'+link+'</a></p>' )

                            if hasattr(entry,'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a><br/>\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += 'Via: <a href="'+extralink['href']+'">'+extralink['title']+'</a><br/>\n'

                            content += ("\n</body></html>")
                        else:
                            # Text-only mail.
                            content = entrycontent.strip() + "\n\nURL: "+link
                            if hasattr(entry,'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('\nEnclosure: ' + enclosure.url + "\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += '<a href="'+extralink['href']+'">Via: '+extralink['title']+'</a>\n'

                    # send() reuses (and returns) the open mail connection;
                    # f.folder selects the IMAP folder for delivery.
                    mailserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, datetime, extraheaders, mailserver, f.folder)

                    # Mark this entry as sent so it is skipped next run.
                    f.seen[frameid] = id

                # Remember cache validators for conditional GET next time.
                f.etag, f.modified = r.get('etag', None), r.get('modified', None)
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                # Deliberate catch-all: one broken feed must not abort the
                # rest of the run; dump diagnostics and move on.
                print >>warn, "=== rss2email encountered a problem with this feed ==="
                print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                print >>warn, "E: could not parse", f.url
                traceback.print_exc(file=warn)
                print >>warn, "rss2email", __version__
                print >>warn, "feedparser", feedparser.__version__
                print >>warn, "html2text", h2t.__version__
                print >>warn, "Python", sys.version
                print >>warn, "=== END HERE ==="
                continue

    finally:
        # Always persist state, then run IMAP housekeeping before closing.
        unlock(feeds, feedfileObject)
        if mailserver:
            if IMAP_MARK_AS_READ:
                # Flag every unseen, unflagged message in these folders
                # as \Seen (STORE by UID to survive renumbering).
                for folder in IMAP_MARK_AS_READ:
                    mailserver.select(folder)
                    res, data = mailserver.search(None, '(UNSEEN UNFLAGGED)')
                    if res == 'OK':
                        items = data[0].split()
                        for i in items:
                            res, data = mailserver.fetch(i, "(UID)")
                            if data[0]:
                                u = uid(data[0])
                                res, data = mailserver.uid('STORE', u, '+FLAGS', '(\Seen)')
            if IMAP_MOVE_READ_TO:
                # Move already-read messages out of every folder into the
                # configured archive folder (copy, flag \Deleted, expunge).
                typ, data = mailserver.list(pattern='*')
                # Parse folder listing as a CSV dialect (automatically removes quotes)
                reader = csv.reader(StringIO.StringIO('\n'.join(data)),dialect='mailboxlist')
                # Iterate over each folder
                for row in reader:
                    folder = row[-1:][0]
                    # Skip the destination itself and non-selectable folders.
                    if folder == IMAP_MOVE_READ_TO or '\Noselect' in row[0]:
                        continue
                    mailserver.select(folder)
                    res, data = mailserver.search(None, '(SEEN UNFLAGGED)')
                    if res == 'OK':
                        items = data[0].split()
                        for i in items:
                            res, data = mailserver.fetch(i, "(UID)")
                            if data[0]:
                                u = uid(data[0])
                                res, data = mailserver.uid('COPY', u, IMAP_MOVE_READ_TO)
                                if res == 'OK':
                                    res, data = mailserver.uid('STORE', u, '+FLAGS', '(\Deleted)')
                                    mailserver.expunge()
            try:
                # SMTP connections close with quit(); IMAP ones with logout().
                mailserver.quit()
            except:
                mailserver.logout()
Ejemplo n.º 22
0
    def do_login(self):
        """
        Attempt to log in.
        Note: this method does nothing if we are already logged in.
        """
        # Point the browser at the primary domain and drop any previous
        # session token (self._sag — presumably a session id; TODO confirm).
        self.BASEURL = 'https://%s/' % self.first_domain
        self._sag = None

        if not self.home_page.is_here():
            self.home_page.go()

        # Some connections are served by the new API-based website, which is
        # handled by a different browser (hence the SiteSwitch exception).
        if self.new_website.is_here():
            self.logger.warning('This connection uses the new API website')
            raise SiteSwitch('api')

        if self.new_login:
            # New login flow: the page itself knows how to reach the auth
            # service; rebase all URLs on wherever it lands.
            self.page.go_to_auth()
            parsed = urlparse(self.url)
            self.BASEURL = '%s://%s' % (parsed.scheme, parsed.netloc)
        else:
            # On the homepage, we get the URL of the auth service.
            url = self.page.get_post_url()
            if url is None:
                raise WebsiteNotSupported()

            # First, post account number to get the password prompt.
            data = {'CCPTE':                self.username[:11].encode('iso8859-15'),
                    'canal':                'WEB',
                    'hauteur_ecran':        768,
                    'largeur_ecran':        1024,
                    'liberror':             '',
                    'matrice':              'true',
                    'origine':              'vitrine',
                    'situationTravail':     'BANCAIRE',
                    'typeAuthentification': 'CLIC_ALLER',
                    'urlOrigine':           self.page.url,
                    'vitrine':              0,
                }

            parsed = urlparse(url)
            self.BASEURL = '%s://%s' % (parsed.scheme, parsed.netloc)
            self.location(url, data=data)

        # Whichever flow was taken, we must now be on the password page.
        assert self.login_page.is_here()

        # Then, post the password.
        self.page.login(self.username, self.password)

        if self.new_login:
            url = self.page.get_accounts_url()
        else:
            # The result of POST is the destination URL.
            url = self.page.get_result_url()

        # A non-HTTP "URL" is actually an error message returned by the site.
        if not url.startswith('http'):
            raise BrowserIncorrectPassword(unescape(url, unicode_snob=True))

        self.location(url.replace('Synthese', 'Synthcomptes'))

        if self.login_error.is_here():
            raise BrowserIncorrectPassword()

        if self.page is None:
            raise WebsiteNotSupported()

        if not self.accounts.is_here():
            # Sometimes the home page is Releves.
            new_url  = re.sub('act=([^&=]+)', 'act=Synthcomptes', self.page.url, 1)
            self.location(new_url)

        if not self.accounts.is_here():
            raise BrowserIncorrectPassword()

        if self.code_caisse is None:
            self.code_caisse = self.page.get_code_caisse()

        # Store the current url to go back when requesting accounts list.
        # The session id is replaced by a '{0}' placeholder, filled in later.
        self.accounts_url = re.sub('sessionSAG=[^&]+', 'sessionSAG={0}', self.page.url)

        # we can deduce the URL to "savings" and "loan" accounts from the regular accounts one
        self.savings_url  = re.sub('act=([^&=]+)', 'act=Synthepargnes', self.accounts_url, 1)
        self.loans_url  = re.sub('act=([^&=]+)', 'act=Synthcredits', self.accounts_url, 1)
        self.advisor_url  = re.sub('act=([^&=]+)', 'act=Contact', self.accounts_url, 1)
        self.profile_url  = re.sub('act=([^&=]+)', 'act=Coordonnees', self.accounts_url, 1)

        if self.page.check_perimeters() and not self.broken_perimeters:
            # Accounts may be split over several "perimeters"; prepare the
            # URLs used to list and switch between them.
            self.perimeter_url = re.sub('act=([^&=]+)', 'act=Perimetre', self.accounts_url, 1)
            self.chg_perimeter_url = '%s%s' % (re.sub('act=([^&=]+)', 'act=ChgPerim', self.accounts_url, 1), '&typeaction=ChgPerim')
            self.location(self.perimeter_url.format(self.sag))
            self.page.check_multiple_perimeters()
Ejemplo n.º 23
0
    def get_thread(self, id, contacts=None, get_profiles=False):
        """
        Get a thread and its messages.

        The 'contacts' parameters is only used for internal calls.

        :param id: thread id, or a Thread object to fill in
        :param contacts: cache of already-fetched contacts (internal use)
        :param get_profiles: if True, fetch the profile of each correspondent
        :return: the Thread, with thread.root set to the message chain
        """
        thread = None
        if isinstance(id, Thread):
            thread = id
            id = thread.id

        if not thread:
            thread = Thread(int(id))
            thread.flags = Thread.IS_DISCUSSION
            full = False
        else:
            full = True

        with self.browser:
            mails = self.browser.get_thread_mails(id, 100)
            my_name = self.browser.get_my_name()

        child = None
        msg = None
        # BUG FIX: the previous assignment target was "s**t", which is not a
        # valid identifier (SyntaxError); use a proper local name matching
        # the _get_slut() helper.
        slut = self._get_slut(id)
        if contacts is None:
            contacts = {}

        if not thread.title:
            thread.title = u'Discussion with %s' % mails['who']['pseudo']

        self.storage.set('s***s', int(thread.id), 'status', mails['status'])
        self.storage.save()

        for mail in mails['results']:
            flags = 0
            if self.antispam and not self.antispam.check_mail(mail):
                self.logger.info('Skipped a spam-mail from %s' % mails['who']['pseudo'])
                self.report_spam(thread.id)
                break

            # Messages newer than the last one we saw are flagged unread.
            if parse_dt(mail['date']) > slut['lastmsg']:
                flags |= Message.IS_UNREAD

                if get_profiles:
                    if not mail['from'] in contacts:
                        try:
                            with self.browser:
                                contacts[mail['from']] = self.get_contact(mail['from'])
                        except BrowserHTTPNotFound:
                            pass
                    if self.antispam and mail['from'] in contacts and not self.antispam.check_contact(contacts[mail['from']]):
                        self.logger.info('Skipped a spam-mail-profile from %s' % mails['who']['pseudo'])
                        self.report_spam(thread.id)
                        break

            # Flag our own messages with their delivery status.
            if int(mail['from']) == self.browser.my_id:
                if mails['remote_status'] == 'new' and msg is None:
                    flags |= Message.IS_NOT_RECEIVED
                else:
                    flags |= Message.IS_RECEIVED

            signature = u''
            #if mail.get('src', None):
            #    signature += u'Sent from my %s\n\n' % mail['src']
            if mail['from'] in contacts:
                signature += contacts[mail['from']].get_text()

            msg = Message(thread=thread,
                          id=int(time.strftime('%Y%m%d%H%M%S', parse_dt(mail['date']).timetuple())),
                          title=thread.title,
                          sender=to_unicode(my_name if int(mail['from']) == self.browser.my_id else mails['who']['pseudo']),
                          receivers=[to_unicode(my_name if int(mail['from']) != self.browser.my_id else mails['who']['pseudo'])],
                          date=parse_dt(mail['date']),
                          content=to_unicode(unescape(mail['message'] or '').strip()),
                          signature=signature,
                          children=[],
                          flags=flags)
            # Link messages into a parent/child chain as we iterate.
            if child:
                msg.children.append(child)
                child.parent = msg

            child = msg

        if full and msg:
            # If we have get all the messages, replace NotLoaded with None as
            # parent.
            msg.parent = None
        if not full and not msg:
            # Perhaps there are hidden messages
            msg = NotLoaded

        thread.root = msg

        return thread
Ejemplo n.º 24
0
    rsuffix='_other',
    lsuffix='_original',
)

# %%
# Assign to shopify_data: map WooCommerce export columns onto Shopify's
# product-import schema.
shopify_data['Handle'] = jointed['slug_other']
shopify_data['Title'] = wc_data['Name']
shopify_data['Variant SKU'] = wc_data['SKU']
shopify_data['Body (HTML)'] = wc_data['Description']
shopify_data['Vendor'] = wc_data['Manufacturer']

# Merge WooCommerce tags and categories into one de-duplicated,
# HTML-unescaped, comma-separated tag list.
wc_data['new_tags'] = wc_data[['Tags', 'Categories']].fillna(value='').apply(
    lambda x: re.split('>|,', (x['Categories'] + x['Tags']).strip()), axis=1)
shopify_data['Tags'] = wc_data['new_tags'].apply(
    lambda x: unescape(','.join(np.unique([y.strip() for y in x]))))

# WooCommerce 'Published' is numeric; Shopify wants the strings TRUE/FALSE.
published_dict = defaultdict(lambda: 'FALSE')
published_dict[1] = 'TRUE'
shopify_data['Published'] = wc_data['Published'].map(published_dict)
shopify_data['WC Type'] = wc_data['Type']

is_variation = shopify_data['WC Type'] == 'variation'
is_simple = shopify_data['WC Type'] == 'simple'
is_variable = shopify_data['WC Type'] == 'variable'
is_not_variation = shopify_data['WC Type'] != 'variation'

# BUG FIX: fillna() returns a new Series; the original call discarded the
# result, so NaNs survived into 'Option1 Value'. Assign the result back.
wc_data['packaging'] = wc_data['packaging'].fillna(value='')

shopify_data['Option1 Name'] = 'Packaging'
shopify_data['Option1 Value'] = wc_data['packaging']
Ejemplo n.º 25
0
    def get_thread(self, id, contacts=None, get_profiles=False):
        """
        Get a thread and its messages.

        The 'contacts' parameters is only used for internal calls.

        :param id: thread id, or a Thread object to fill in
        :param contacts: cache of already-fetched contacts (internal use)
        :param get_profiles: if True, fetch the profile of each correspondent
        :return: the Thread, with thread.root set to the message chain
        """
        thread = None
        if isinstance(id, Thread):
            thread = id
            id = thread.id

        if not thread:
            thread = Thread(int(id))
            thread.flags = Thread.IS_DISCUSSION
            full = False
        else:
            full = True

        with self.browser:
            mails = self.browser.get_thread_mails(id, 100)
            my_name = self.browser.get_my_name()

        child = None
        msg = None
        # BUG FIX: the previous assignment target was "s**t", which is not a
        # valid identifier (SyntaxError); use a proper local name matching
        # the _get_slut() helper.
        slut = self._get_slut(mails["member"]["pseudo"])
        if contacts is None:
            contacts = {}

        if not thread.title:
            thread.title = u"Discussion with %s" % mails["member"]["pseudo"]

        for mail in mails["messages"]:
            flags = Message.IS_HTML
            # Messages from the other party newer than the last seen one
            # are flagged unread.
            if parse_dt(mail["date"]) > slut["lastmsg"] and mail["id_from"] != self.browser.get_my_name():
                flags |= Message.IS_UNREAD

                if get_profiles:
                    if not mail["id_from"] in contacts:
                        with self.browser:
                            contacts[mail["id_from"]] = self.get_contact(mail["id_from"])

            signature = u""
            if mail.get("src", None):
                signature += u"Sent from my %s\n\n" % mail["src"]
            if mail["id_from"] in contacts:
                signature += contacts[mail["id_from"]].get_text()

            msg = Message(
                thread=thread,
                id=int(time.strftime("%Y%m%d%H%M%S", parse_dt(mail["date"]).timetuple())),
                title=thread.title,
                sender=mail["id_from"],
                receivers=[my_name if mail["id_from"] != my_name else mails["member"]["pseudo"]],
                date=parse_dt(mail["date"]),
                content=unescape(mail["message"]).strip(),
                signature=signature,
                children=[],
                flags=flags,
            )
            # Link messages into a parent/child chain as we iterate.
            if child:
                msg.children.append(child)
                child.parent = msg

            child = msg

        if full and msg:
            # If we have get all the messages, replace NotLoaded with None as
            # parent.
            msg.parent = None
        if not full and not msg:
            # Perhaps there are hidden messages
            msg = NotLoaded

        thread.root = msg

        return thread
Ejemplo n.º 26
0
    def get_thread(self, id, contacts=None, get_profiles=False):
        """
        Get a thread and its messages.

        The 'contacts' parameters is only used for internal calls.

        :param id: thread id, or a Thread object to fill in
        :param contacts: cache of already-fetched contacts (internal use)
        :param get_profiles: if True, fetch the profile of each correspondent
        :return: the Thread, with thread.root set to the message chain
        """
        thread = None
        if isinstance(id, Thread):
            thread = id
            id = thread.id

        if not thread:
            thread = Thread(int(id))
            thread.flags = Thread.IS_DISCUSSION
            full = False
        else:
            full = True

        with self.browser:
            mails = self.browser.get_thread_mails(id, 100)
            my_name = self.browser.get_my_name()

        child = None
        msg = None
        # BUG FIX: the previous assignment target was "s**t", which is not a
        # valid identifier (SyntaxError); use a proper local name matching
        # the _get_slut() helper.
        slut = self._get_slut(id)
        if contacts is None:
            contacts = {}

        if not thread.title:
            thread.title = u'Discussion with %s' % mails['who']['pseudo']

        self.storage.set('s***s', int(thread.id), 'status', mails['status'])
        self.storage.save()

        for mail in mails['results']:
            flags = 0
            if self.antispam and not self.antispam.check_mail(mail):
                self.logger.info('Skipped a spam-mail from %s' %
                                 mails['who']['pseudo'])
                self.report_spam(thread.id)
                break

            # Messages newer than the last one we saw are flagged unread.
            if parse_dt(mail['date']) > slut['lastmsg']:
                flags |= Message.IS_UNREAD

                if get_profiles:
                    if not mail['from'] in contacts:
                        try:
                            with self.browser:
                                contacts[mail['from']] = self.get_contact(
                                    mail['from'])
                        except BrowserHTTPNotFound:
                            pass
                    if self.antispam and mail[
                            'from'] in contacts and not self.antispam.check_contact(
                                contacts[mail['from']]):
                        self.logger.info(
                            'Skipped a spam-mail-profile from %s' %
                            mails['who']['pseudo'])
                        self.report_spam(thread.id)
                        break

            # Flag our own messages with their delivery status.
            if int(mail['from']) == self.browser.my_id:
                if mails['remote_status'] == 'new' and msg is None:
                    flags |= Message.IS_NOT_RECEIVED
                else:
                    flags |= Message.IS_RECEIVED

            signature = u''
            #if mail.get('src', None):
            #    signature += u'Sent from my %s\n\n' % mail['src']
            if mail['from'] in contacts:
                signature += contacts[mail['from']].get_text()

            msg = Message(
                thread=thread,
                id=int(
                    time.strftime('%Y%m%d%H%M%S',
                                  parse_dt(mail['date']).timetuple())),
                title=thread.title,
                sender=to_unicode(my_name if int(mail['from']) == self.browser.
                                  my_id else mails['who']['pseudo']),
                receivers=[
                    to_unicode(my_name if int(mail['from']) != self.browser.
                               my_id else mails['who']['pseudo'])
                ],
                date=parse_dt(mail['date']),
                content=to_unicode(unescape(mail['message'] or '').strip()),
                signature=signature,
                children=[],
                flags=flags)
            # Link messages into a parent/child chain as we iterate.
            if child:
                msg.children.append(child)
                child.parent = msg

            child = msg

        if full and msg:
            # If we have get all the messages, replace NotLoaded with None as
            # parent.
            msg.parent = None
        if not full and not msg:
            # Perhaps there are hidden messages
            msg = NotLoaded

        thread.root = msg

        return thread
Ejemplo n.º 27
0
def run(num=None):
	"""
	Fetch each feed (or only feeds[num] if num is given), e-mail every
	entry whose ID is not yet in f.seen, then record it as seen.
	"""
	feeds, feedfileObject = load()
	smtpserver = None
	try:
		# We store the default to address as the first item in the feeds list.
		# Here we take it out and save it for later.
		default_to = ""
		if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:] 
		else: ifeeds = feeds
		
		if num: ifeeds = [feeds[num]]
		feednum = 0
		
		for f in ifeeds:
			try: 
				feednum += 1
				if not f.active: continue
				
				if VERBOSE: print >>warn, 'I: Processing [%d] "%s"' % (feednum, f.url)
				r = {}
				try:
					r = timelimit(FEED_TIMEOUT, parse)(f.url, f.etag, f.modified)
				except TimeoutError:
					print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, f.url)
					continue
				
				# Handle various status conditions, as required
				if 'status' in r:
					if r.status == 301: f.url = r['url']
					elif r.status == 410:
						print >>warn, "W: feed gone; deleting", f.url
						feeds.remove(f)
						continue
				
				http_status = r.get('status', 200)
				if VERBOSE > 1: print >>warn, "I: http status", http_status
				http_headers = r.get('headers', {
				  'content-type': 'application/rss+xml', 
				  'content-length':'1'})
				exc_type = r.get("bozo_exception", Exception()).__class__
				# Nothing parsed: diagnose why the fetch/parse failed and skip the feed.
				if http_status != 304 and not r.entries and not r.get('version', ''):
					if http_status not in [200, 302]: 
						print >>warn, "W: error %d [%d] %s" % (http_status, feednum, f.url)

					elif contains(http_headers.get('content-type', 'rss'), 'html'):
						print >>warn, "W: looks like HTML [%d] %s"  % (feednum, f.url)

					elif http_headers.get('content-length', '1') == '0':
						print >>warn, "W: empty page [%d] %s" % (feednum, f.url)

					elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
						print >>warn, "W: timed out on [%d] %s" % (feednum, f.url)
					
					elif exc_type == IOError:
						print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, f.url)
					
					elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
						print >>warn, "W: broken compression [%d] %s" % (feednum, f.url)
					
					elif exc_type in socket_errors:
						exc_reason = r.bozo_exception.args[1]
						print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)

					elif exc_type == urllib2.URLError:
						if r.bozo_exception.reason.__class__ in socket_errors:
							exc_reason = r.bozo_exception.reason.args[1]
						else:
							exc_reason = r.bozo_exception.reason
						print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
					
					elif exc_type == AttributeError:
						print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, f.url)
					
					elif exc_type == KeyboardInterrupt:
						raise r.bozo_exception
						
					elif r.bozo:
						print >>warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, f.url, r.get("bozo_exception", "can't process"))

					else:
						print >>warn, "=== rss2email encountered a problem with this feed ==="
						print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
						print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
						print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
						print >>warn, r
						print >>warn, "rss2email", __version__
						print >>warn, "feedparser", feedparser.__version__
						print >>warn, "html2text", h2t.__version__
						print >>warn, "Python", sys.version
						print >>warn, "=== END HERE ==="
					continue
				
				# Process entries in reverse feed order (presumably oldest first).
				r.entries.reverse()
				
				for entry in r.entries:
					id = getID(entry)
					
					# If TRUST_GUID isn't set, we get back hashes of the content.
					# Instead of letting these run wild, we put them in context
					# by associating them with the actual ID (if it exists).
					
					frameid = entry.get('id')
					if not(frameid): frameid = id
					if type(frameid) is DictType:
						frameid = frameid.values()[0]
					
					# If this item's ID is in our database
					# then it's already been sent
					# and we don't need to do anything more.
					
					if frameid in f.seen:
						if f.seen[frameid] == id: continue

					if not (f.to or default_to):
						print "No default email address defined. Please run 'r2e email emailaddress'"
						print "Ignoring feed %s" % f.url
						break
					
					if 'title_detail' in entry and entry.title_detail:
						title = entry.title_detail.value
						if contains(entry.title_detail.type, 'html'):
							title = html2text(title)
					else:
						title = getContent(entry)[:70]

					title = title.replace("\n", " ").strip()
					
					datetime = time.gmtime()

					if DATE_HEADER:
						for datetype in DATE_HEADER_ORDER:
							kind = datetype+"_parsed"
							if kind in entry and entry[kind]: datetime = entry[kind]
						
					link = entry.get('link', "")
					
					from_addr = getEmail(r, entry)
					
					name = h2t.unescape(getName(r, entry))
					fromhdr = formataddr((name, from_addr,))
					tohdr = (f.to or default_to)
					subjecthdr = title
					datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
					useragenthdr = "rss2email"
					
					# Add post tags, if available
					tagline = getTags(entry)

					extraheaders = {'Date': datehdr, 'User-Agent': useragenthdr, 'X-RSS-Feed': f.url, 'X-RSS-ID': id, 'X-RSS-URL': link, 'X-RSS-TAGS' : tagline}
					if BONUS_HEADER != '':
						for hdr in BONUS_HEADER.strip().splitlines():
							pos = hdr.strip().find(':')
							if pos > 0:
								extraheaders[hdr[:pos]] = hdr[pos+1:].strip()
							else:
								print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER
					
					# Build the mail body: styled HTML, plain HTML or text.
					entrycontent = getContent(entry, HTMLOK=HTML_MAIL)
					contenttype = 'plain'
					content = ''
					if USE_CSS_STYLING and HTML_MAIL:
						contenttype = 'html'
						content = "<html>\n" 
						content += '<head><style><!--' + STYLE_SHEET + '//--></style></head>\n'
						content += '<body>\n'
						content += '<div id="entry">\n'
						content += '<h1'
						content += ' class="header"'
						content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n'
						if ishtml(entrycontent):
							body = entrycontent[1].strip()
						else:
							body = entrycontent.strip()
						if body != '':	
							content += '<div id="body"><table><tr><td>\n' + body + '</td></tr></table></div>\n'
						content += '\n<p class="footer">URL: <a href="'+link+'">'+link+'</a>'
						if hasattr(entry,'enclosures'):
							for enclosure in entry.enclosures:
								if (hasattr(enclosure, 'url') and enclosure.url != ""):
									content += ('<br/>Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a>\n")
								if (hasattr(enclosure, 'src') and enclosure.src != ""):
									content += ('<br/>Enclosure: <a href="'+enclosure.src+'">'+enclosure.src+'</a><br/><img src="'+enclosure.src+'"\n')
						if 'links' in entry:
							for extralink in entry.links:
								if ('rel' in extralink) and extralink['rel'] == u'via':
									extraurl = extralink['href']
									extraurl = extraurl.replace('http://www.google.com/reader/public/atom/', 'http://www.google.com/reader/view/')
									viatitle = extraurl
									if ('title' in extralink):
									    viatitle = extralink['title']
									content += '<br/>Via: <a href="'+extraurl+'">'+viatitle+'</a>\n'
						content += '</p></div>\n'
						content += "\n\n</body></html>"
					else:	
						if ishtml(entrycontent):
							contenttype = 'html'
							# NOTE(review): this assignment is immediately overwritten below.
							content = "<html>\n" 
							content = ("<html><body>\n\n" + 
							           '<h1><a href="'+link+'">'+subjecthdr+'</a></h1>\n\n' +
							           entrycontent[1].strip() + # drop type tag (HACK: bad abstraction)
							           '<p>URL: <a href="'+link+'">'+link+'</a></p>' )

							if hasattr(entry,'enclosures'):
								for enclosure in entry.enclosures:
									if enclosure.url != "":
										content += ('Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a><br/>\n")
							if 'links' in entry:
								for extralink in entry.links:
									if ('rel' in extralink) and extralink['rel'] == u'via':
										content += 'Via: <a href="'+extralink['href']+'">'+extralink['title']+'</a><br/>\n'

							content += ("\n</body></html>")
						else:
							content = entrycontent.strip() + "\n\nURL: "+link
							if hasattr(entry,'enclosures'):
								for enclosure in entry.enclosures:
									if enclosure.url != "":
										content += ('\nEnclosure: ' + enclosure.url + "\n")
							if 'links' in entry:
								for extralink in entry.links:
									if ('rel' in extralink) and extralink['rel'] == u'via':
										content += '<a href="'+extralink['href']+'">Via: '+extralink['title']+'</a>\n'

					# The SMTP connection is threaded through send() so it can be reused.
					smtpserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, extraheaders, smtpserver)
			
					f.seen[frameid] = id
					
				f.etag, f.modified = r.get('etag', None), r.get('modified', None)
			except (KeyboardInterrupt, SystemExit):
				raise
			except:
				print >>warn, "=== rss2email encountered a problem with this feed ==="
				print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
				print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
				print >>warn, "E: could not parse", f.url
				traceback.print_exc(file=warn)
				print >>warn, "rss2email", __version__
				print >>warn, "feedparser", feedparser.__version__
				print >>warn, "html2text", h2t.__version__
				print >>warn, "Python", sys.version
				print >>warn, "=== END HERE ==="
				continue

	finally:		
		unlock(feeds, feedfileObject)
		if smtpserver:
			smtpserver.quit()
Ejemplo n.º 28
0
def run(num=None):
	"""
	Fetch each feed (or only feeds[num] if num is given), e-mail every
	entry whose ID is not yet in f.seen, then record it as seen.
	Mail bodies are built from the TPL_* templates.
	"""
	feeds, feedfileObject = load()
	try:
		# We store the default to address as the first item in the feeds list.
		# Here we take it out and save it for later.
		default_to = ""
		if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:] 
		else: ifeeds = feeds
		
		if num: ifeeds = [feeds[num]]
		feednum = 0
		
		smtpserver = None
		
		for f in ifeeds:
			try: 
				feednum += 1
				if VERBOSE: print >>warn, 'I: Processing [%d] "%s"' % (feednum, hidepass(f.url))
				r = {}
				try:
					r = parse(f.url, f.etag, f.modified, FEED_TIMEOUT)
				except TimeoutError:
					print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, hidepass(f.url))
					continue
				
				# Handle various status conditions, as required
				if 'status' in r:
					if r.status == 301: f.url = r['url']
					elif r.status == 410:
						print >>warn, "W: feed gone; deleting", hidepass(f.url)
						feeds.remove(f)
						continue
				
				http_status = r.get('status', 200)
				http_headers = r.get('headers', {
				  'content-type': 'application/rss+xml', 
				  'content-length':'1'})
				exc_type = r.get("bozo_exception", Exception()).__class__
				# Nothing parsed: diagnose why the fetch/parse failed and skip the feed.
				if http_status != 304 and not r.get('version', ''):
					if http_status not in [200, 302]: 
						print >>warn, "W: error %d [%d] %s" % (http_status, feednum, hidepass(f.url))

					elif contains(http_headers.get('content-type', 'rss'), 'html'):
						print >>warn, "W: looks like HTML [%d] %s"  % (feednum, hidepass(f.url))

					elif http_headers.get('content-length', '1') == '0':
						print >>warn, "W: empty page [%d] %s" % (feednum, hidepass(f.url))

					elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
						print >>warn, "W: timed out on [%d] %s" % (feednum, hidepass(f.url))
					
					elif exc_type == IOError:
						print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, hidepass(f.url))
					
					elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
						print >>warn, "W: broken compression [%d] %s" % (feednum, hidepass(f.url))
					
					elif exc_type in socket_errors:
						exc_reason = r.bozo_exception.args[1]
						print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, hidepass(f.url))

					elif exc_type == urllib2.URLError:
						if r.bozo_exception.reason.__class__ in socket_errors:
							exc_reason = r.bozo_exception.reason.args[1]
						else:
							exc_reason = r.bozo_exception.reason
						print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, hidepass(f.url))
					
					elif exc_type == AttributeError:
						print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, hidepass(f.url))
					
					elif exc_type == KeyboardInterrupt:
						raise r.bozo_exception
						
					elif r.bozo:
						print >>warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, hidepass(f.url), r.get("bozo_exception", "can't process"))

					else:
						print >>warn, "=== SEND THE FOLLOWING TO [email protected] ==="
						print >>warn, "E:", r.get("bozo_exception", "can't process"), hidepass(f.url)
						print >>warn, r
						print >>warn, "rss2email", __version__
						print >>warn, "feedparser", feedparser.__version__
						print >>warn, "html2text", h2t.__version__
						print >>warn, "Python", sys.version
						print >>warn, "=== END HERE ==="
					continue
				
				# Process entries in reverse feed order (presumably oldest first).
				r.entries.reverse()
				
				for entry in r.entries:
					id = getID(entry)
					
					# If TRUST_GUID isn't set, we get back hashes of the content.
					# Instead of letting these run wild, we put them in context
					# by associating them with the actual ID (if it exists).
					
					frameid = entry.get('id', id)
					
					# If this item's ID is in our database
					# then it's already been sent
					# and we don't need to do anything more.
					
					if f.seen.has_key(frameid) and f.seen[frameid] == id: continue

					if not (f.to or default_to):
						print "No default email address defined. Please run 'r2e email emailaddress'"
						print "Ignoring feed %s" % hidepass(f.url)
						break
					
					if 'title_detail' in entry and entry.title_detail:
						title = entry.title_detail.value
						if contains(entry.title_detail.type, 'html'):
							title = html2text(title)
					else:
						title = getContent(entry)[:70]

					title = title.replace("\n", " ").strip()
					
					datetime = time.gmtime()

					if DATE_HEADER:
						for datetype in DATE_HEADER_ORDER:
							kind = datetype+"_parsed"
							if kind in entry and entry[kind]: datetime = entry[kind]
						
					link = entry.get('link', "")
					
					from_addr = getEmail(r.feed, entry)
					
					name = getName(r, entry)

					entrycontent = getContent(entry, HTMLOK=HTML_MAIL)

					# Pick the body and enclosure templates matching the content type.
					force_html = USE_CSS_STYLING and HTML_MAIL
					if force_html or ishtml(entrycontent):
						contenttype = 'html'
						enctpl = TPL_ENCLOSURE_HTML
						if force_html:
							tpl = TPL_HTML_CSS
						else:
							tpl = TPL_HTML_PLAIN
					else:
						contenttype = 'plain'
						enctpl = TPL_ENCLOSURE_TEXT
						tpl = TPL_TEXT

					if hasattr(entry,'enclosures'):
						encs = [enclosure.url for enclosure in entry.enclosures if enclosure.url != ""]
					else:
						encs = ()

					# Substitution values fed into the selected template.
					substs = {
						'STYLE_SHEET': STYLE_SHEET,
						'link': link,
						'title': title,
						}

					if ishtml(entrycontent):
						substs['body'] = entrycontent[1].strip()
					else:
						substs['body'] = entrycontent.strip()

					if len(encs):
						substs['ENCLOSURE_TPL'] = (
							enctpl[0] +
							"".join([enctpl[1] % dict(substs.items() + {'enclosure': eurl}.items()) for eurl in encs]) +
							enctpl[2])
					else:
						substs['ENCLOSURE_TPL'] = ""

					fromhdr = '"'+ name + '" <' + from_addr + ">"
					tohdr = (f.to or default_to)
					subjecthdr = h2t.unescape(title)
					content = tpl % substs
					extraheaders = {
							'Date': time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime),
							'User-Agent': "rss2email/"+__version__ }
					if FEED_URL_HEADER:
						extraheaders[FEED_URL_HEADER] = hidepass(f.url)
					if BONUS_HEADER != '':
						for hdr in BONUS_HEADER.strip().splitlines():
							pos = hdr.strip().find(':')
							if pos > 0:
								extraheaders[hdr[:pos]] = hdr[pos+1:].strip()
							else:
								print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER

					# The SMTP connection is threaded through send() so it can be reused.
					smtpserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, extraheaders, smtpserver)
			
					f.seen[frameid] = id
					
				f.etag, f.modified = r.get('etag', None), r.get('modified', None)
			except (KeyboardInterrupt, SystemExit):
				raise
			except:
				print >>warn, "=== SEND THE FOLLOWING TO [email protected] ==="
				print >>warn, "E: could not parse", hidepass(f.url)
				traceback.print_exc(file=warn)
				print >>warn, "rss2email", __version__
				print >>warn, "feedparser", feedparser.__version__
				print >>warn, "html2text", h2t.__version__
				print >>warn, "Python", sys.version
				print >>warn, "=== END HERE ==="
				continue

	finally:		
		unlock(feeds, feedfileObject)
		if smtpserver:
			smtpserver.quit()
Ejemplo n.º 29
0
def run(num=None):
    """Poll every active feed (or only feed number *num*) and mail new entries.

    Loads the feed list from disk, fetches each feed with a timeout,
    diagnoses HTTP/parse failures, emails every unseen entry, records each
    entry as seen, and finally saves feed state and performs optional IMAP
    housekeeping (mark-as-read / move-read-to).
    """
    feeds, feedfileObject = load()
    # Mail connection is opened lazily by send() and reused across entries.
    mailserver = None
    try:
        # We store the default to address as the first item in the feeds list.
        # Here we take it out and save it for later.
        default_to = ""
        if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:] 
        else: ifeeds = feeds
        
        if num: ifeeds = [feeds[num]]
        feednum = 0
        
        for f in ifeeds:
            try: 
                feednum += 1
                if not f.active: continue
                
                if VERBOSE: print >>warn, 'I: Processing [%d] "%s"' % (feednum, f.url)
                r = {}
                # Fetch+parse under a time limit so one hung server cannot
                # stall the whole run.
                try:
                    r = timelimit(FEED_TIMEOUT, parse)(f.url, f.etag, f.modified)
                except TimeoutError:
                    print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, f.url)
                    continue
                
                # Handle various status conditions, as required
                if 'status' in r:
                    if r.status == 301: f.url = r['url']  # permanent redirect: remember the new URL
                    elif r.status == 410:  # gone for good: drop the feed
                        print >>warn, "W: feed gone; deleting", f.url
                        feeds.remove(f)
                        continue
                
                http_status = r.get('status', 200)
                if VERBOSE > 1: print >>warn, "I: http status", http_status
                http_headers = r.get('headers', {
                  'content-type': 'application/rss+xml', 
                  'content-length':'1'})
                exc_type = r.get("bozo_exception", Exception()).__class__
                # No entries and no recognized feed version: print the most
                # specific diagnostic we can work out from status/headers/bozo.
                if http_status != 304 and not r.entries and not r.get('version', ''):
                    if http_status not in [200, 302]: 
                        print >>warn, "W: error %d [%d] %s" % (http_status, feednum, f.url)

                    elif contains(http_headers.get('content-type', 'rss'), 'html'):
                        print >>warn, "W: looks like HTML [%d] %s"  % (feednum, f.url)

                    elif http_headers.get('content-length', '1') == '0':
                        print >>warn, "W: empty page [%d] %s" % (feednum, f.url)

                    elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
                        print >>warn, "W: timed out on [%d] %s" % (feednum, f.url)
                    
                    elif exc_type == IOError:
                        print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, f.url)
                    
                    elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
                        print >>warn, "W: broken compression [%d] %s" % (feednum, f.url)
                    
                    elif exc_type in socket_errors:
                        exc_reason = r.bozo_exception.args[1]
                        print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)

                    elif exc_type == urllib2.URLError:
                        if r.bozo_exception.reason.__class__ in socket_errors:
                            exc_reason = r.bozo_exception.reason.args[1]
                        else:
                            exc_reason = r.bozo_exception.reason
                        print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
                    
                    elif exc_type == AttributeError:
                        print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, f.url)
                    
                    elif exc_type == KeyboardInterrupt:
                        raise r.bozo_exception
                        
                    elif r.bozo:
                        print >>warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, f.url, r.get("bozo_exception", "can't process"))

                    else:
                        print >>warn, "=== rss2email encountered a problem with this feed ==="
                        print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                        print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                        print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
                        print >>warn, r
                        print >>warn, "rss2email", __version__
                        print >>warn, "feedparser", feedparser.__version__
                        print >>warn, "html2text", h2t.__version__
                        print >>warn, "Python", sys.version
                        print >>warn, "=== END HERE ==="
                    continue
                
                # Process oldest entries first so mails are sent in
                # chronological order.
                r.entries.reverse()
                
                for entry in r.entries:
                    id = getID(entry)
                    
                    # If TRUST_GUID isn't set, we get back hashes of the content.
                    # Instead of letting these run wild, we put them in context
                    # by associating them with the actual ID (if it exists).
                    
                    frameid = entry.get('id')
                    if not(frameid): frameid = id
                    if type(frameid) is DictType:
                        frameid = frameid.values()[0]
                    
                    # If this item's ID is in our database
                    # then it's already been sent
                    # and we don't need to do anything more.
                    
                    if frameid in f.seen:
                        if f.seen[frameid] == id: continue

                    if not (f.to or default_to):
                        print "No default email address defined. Please run 'r2e email emailaddress'"
                        print "Ignoring feed %s" % f.url
                        break
                    
                    if 'title_detail' in entry and entry.title_detail:
                        title = entry.title_detail.value
                        if contains(entry.title_detail.type, 'html'):
                            title = html2text(title)
                    else:
                        title = getContent(entry)[:70]

                    title = title.replace("\n", " ").strip()
                    
                    when = time.gmtime()

                    # Date header: prefer the entry's own timestamps (in the
                    # configured order) over "now" when DATE_HEADER is set.
                    if DATE_HEADER:
                        for datetype in DATE_HEADER_ORDER:
                            kind = datetype+"_parsed"
                            if kind in entry and entry[kind]: when = entry[kind]
                        
                    link = entry.get('link', "")
                    
                    from_addr = getEmail(r, entry)
                    
                    name = h2t.unescape(getName(r, entry))
                    fromhdr = formataddr((name, from_addr,))
                    tohdr = (f.to or default_to)
                    subjecthdr = title
                    datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", when)
                    useragenthdr = "rss2email"
                    
                    # Add post tags, if available
                    tagline = ""
                    if 'tags' in entry:
                        tags = entry.get('tags')
                        taglist = []
                        if tags:
                            for tag in tags:
                                taglist.append(tag['term'])
                        if taglist:
                            tagline = ",".join(taglist)
                    
                    # Message-ID is derived from the (stable) entry id so the
                    # same item always gets the same ID; References is filled
                    # in below for tag/link threading.
                    extraheaders = {'Date': datehdr, 'User-Agent': useragenthdr, 'X-RSS-Feed': f.url, 'Message-ID': '<%s>' % hashlib.sha1(id.encode('utf-8')).hexdigest(), 'X-RSS-ID': id, 'X-RSS-URL': link, 'X-RSS-TAGS' : tagline, 'X-MUNGED-FROM': getMungedFrom(r), 'References': ''}
                    if BONUS_HEADER != '':
                        for hdr in BONUS_HEADER.strip().splitlines():
                            pos = hdr.strip().find(':')
                            if pos > 0:
                                extraheaders[hdr[:pos]] = hdr[pos+1:].strip()
                            else:
                                print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER 
                    
                    entrycontent = getContent(entry, HTMLOK=HTML_MAIL)
                    contenttype = 'plain'
                    content = ''
                    if THREAD_ON_TAGS and len(tagline):
                        extraheaders['References'] += ''.join([' <%s>' % hashlib.sha1(t.strip().encode('utf-8')).hexdigest() for t in tagline.split(',')])
                    # Build the mail body: CSS-styled HTML, plain HTML, or
                    # plain text, depending on configuration and content type.
                    if USE_CSS_STYLING and HTML_MAIL:
                        contenttype = 'html'
                        content = "<html>\n" 
                        content += '<head><meta http-equiv="Content-Type" content="text/html"><style>' + STYLE_SHEET + '</style></head>\n'
                        content += '<body style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;">\n'
                        content += '<div id="entry">\n'
                        content += '<h1 class="header"'
                        content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n'
                        if ishtml(entrycontent):
                            body = entrycontent[1].strip()
                            if SUMMARIZE:
                                content += '<div class="summary">%s</div>' % (summarize(html2text(body, plaintext=True), SUMMARIZE) + "<hr>")
                        else:
                            body = entrycontent.strip()
                            if SUMMARIZE:
                                content += '<div class="summary">%s</div>' % (summarize(body, SUMMARIZE) + "<hr>")
                        if THREAD_ON_LINKS:
                            parser = Parser()
                            parser.feed(body)
                            extraheaders['References'] += ''.join([' <%s>' % hashlib.sha1(h.strip().encode('utf-8')).hexdigest() for h in parser.attrs])
                        if INLINE_IMAGES_DATA_URI:
                            # Replace each <img src> with a base64 data: URI so
                            # the mail is self-contained.
                            parser = Parser(tag='img', attr='src')
                            parser.feed(body)
                            for src in parser.attrs:
                                try:
                                    img = feedparser._open_resource(src, None, None, feedparser.USER_AGENT, link, [], {})
                                    data = img.read()
                                    if hasattr(img, 'headers'):
                                        headers = dict((k.lower(), v) for k, v in dict(img.headers).items())
                                        ctype = headers.get('content-type', None)
                                        if ctype and INLINE_IMAGES_DATA_URI:
                                            body = body.replace(src,'data:%s;base64,%s' % (ctype, base64.b64encode(data)))
                                except:
                                    print >>warn, "Could not load image: %s" % src
                                    pass
                        if body != '':  
                            content += '<div id="body">\n' + body + '</div>\n'
                        content += '\n<p class="footer">URL: <a href="'+link+'">'+link+'</a>'
                        if hasattr(entry,'enclosures'):
                            for enclosure in entry.enclosures:
                                if (hasattr(enclosure, 'url') and enclosure.url != ""):
                                    content += ('<br/>Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a>\n")
                                if (hasattr(enclosure, 'src') and enclosure.src != ""):
                                    content += ('<br/>Enclosure: <a href="'+enclosure.src+'">'+enclosure.src+'</a><br/><img src="'+enclosure.src+'"\n')
                        if 'links' in entry:
                            for extralink in entry.links:
                                if ('rel' in extralink) and extralink['rel'] == u'via':
                                    extraurl = extralink['href']
                                    extraurl = extraurl.replace('http://www.google.com/reader/public/atom/', 'http://www.google.com/reader/view/')
                                    viatitle = extraurl
                                    if ('title' in extralink):
                                        viatitle = extralink['title']
                                    content += '<br/>Via: <a href="'+extraurl+'">'+viatitle+'</a>\n'
                        content += '</p></div>\n'
                        content += "\n\n</body></html>"
                    else:   
                        if ishtml(entrycontent):
                            contenttype = 'html'
                            content = "<html>\n" 
                            content = ("<html><body>\n\n" + 
                                       '<h1><a href="'+link+'">'+subjecthdr+'</a></h1>\n\n' +
                                       entrycontent[1].strip() + # drop type tag (HACK: bad abstraction)
                                       '<p>URL: <a href="'+link+'">'+link+'</a></p>' )
                                       
                            if hasattr(entry,'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a><br/>\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += 'Via: <a href="'+extralink['href']+'">'+extralink['title']+'</a><br/>\n'
                                                                
                            content += ("\n</body></html>")
                        else:
                            content = entrycontent.strip() + "\n\nURL: "+link
                            if hasattr(entry,'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('\nEnclosure: ' + enclosure.url + "\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += '<a href="'+extralink['href']+'">Via: '+extralink['title']+'</a>\n'

                    mailserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, when, extraheaders, mailserver, f.folder)
            
                    # Only mark the entry seen after a successful send.
                    f.seen[frameid] = id
                    
                f.etag, f.modified = r.get('etag', None), r.get('modified', None)
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                # One broken feed must not abort the rest of the run: dump a
                # diagnostic and move on to the next feed.
                print >>warn, "=== rss2email encountered a problem with this feed ==="
                print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                print >>warn, "E: could not parse", f.url
                traceback.print_exc(file=warn)
                print >>warn, "rss2email", __version__
                print >>warn, "feedparser", feedparser.__version__
                print >>warn, "html2text", h2t.__version__
                print >>warn, "Python", sys.version
                print >>warn, "=== END HERE ==="
                continue

    finally:        
        # Always persist feed state (etags, seen entries) and release the
        # feed-file lock, even if a feed blew up above.
        unlock(feeds, feedfileObject)
        if mailserver:
            # Optional IMAP housekeeping on the delivery mailbox.
            if IMAP_MARK_AS_READ:
                for folder in IMAP_MARK_AS_READ:
                    mailserver.select(folder)
                    res, data = mailserver.search(None, '(UNSEEN UNFLAGGED)')
                    if res == 'OK':
                        items = data[0].split()
                        for i in items:
                            res, data = mailserver.fetch(i, "(UID)")
                            if data[0]:
                                u = uid(data[0])
                                res, data = mailserver.uid('STORE', u, '+FLAGS', '(\Seen)')
            if IMAP_MOVE_READ_TO:
                typ, data = mailserver.list(pattern='*')
                # Parse folder listing as a CSV dialect (automatically removes quotes)
                reader = csv.reader(StringIO.StringIO('\n'.join(data)),dialect='mailboxlist')
                # Iterate over each folder
                for row in reader:
                    folder = row[-1:][0]
                    if folder == IMAP_MOVE_READ_TO or '\Noselect' in row[0]:
                        continue
                    mailserver.select(folder)
                    # Move messages read before yesterday via COPY + \Deleted +
                    # expunge (IMAP has no atomic move in this protocol level).
                    yesterday = (datetime.now() - timedelta(days=1)).strftime("%d-%b-%Y")
                    res, data = mailserver.search(None, '(SEEN BEFORE %s UNFLAGGED)' % yesterday)
                    if res == 'OK':
                        items = data[0].split()
                        for i in items:
                            res, data = mailserver.fetch(i, "(UID)")
                            if data[0]:
                                u = uid(data[0])
                                res, data = mailserver.uid('COPY', u, IMAP_MOVE_READ_TO)
                                if res == 'OK':
                                    res, data = mailserver.uid('STORE', u, '+FLAGS', '(\Deleted)')
                                    mailserver.expunge()
            # SMTP connections close with quit(); IMAP ones with logout().
            try:
                mailserver.quit()
            except:
                mailserver.logout()
Ejemplo n.º 30
0
 def text(node):
     """Return the unescaped text content of *node*, or None for a None node."""
     if node is None:
         return None
     return unescape(node.text)
Ejemplo n.º 31
0
    def get_thread(self, id, contacts=None, get_profiles=False):
        """
        Get a thread and its messages.

        The 'contacts' parameters is only used for internal calls.

        :param id: a Thread instance or a numeric thread id.
        :param contacts: cache of already fetched contacts, keyed by pseudo.
        :param get_profiles: if True, fetch the profile of every sender of an
                             unread message so its text can be appended to the
                             message signature.
        :return: the Thread, with its messages chained via ``children``.
        """
        thread = None
        if isinstance(id, Thread):
            thread = id
            id = thread.id

        if not thread:
            thread = Thread(int(id))
            thread.flags = Thread.IS_DISCUSSION
            full = False
        else:
            # An existing Thread was passed in: after this refresh we will
            # hold the complete message list.
            full = True

        with self.browser:
            mails = self.browser.get_thread_mails(id, 100)
            my_name = self.browser.get_my_name()

        child = None
        msg = None
        # FIX: the original local name had been censored to 's**t', which is
        # not a valid identifier (syntax error); restored to match _get_slut().
        slut = self._get_slut(mails['member']['pseudo'])
        if contacts is None:
            contacts = {}

        if not thread.title:
            thread.title = u'Discussion with %s' % mails['member']['pseudo']

        for mail in mails['messages']:
            flags = Message.IS_HTML
            # Unread = newer than the last message we have seen, and not sent
            # by ourselves. Reuse the cached my_name instead of calling
            # self.browser.get_my_name() again outside the browser lock.
            if parse_dt(mail['date']) > slut['lastmsg'] and mail['id_from'] != my_name:
                flags |= Message.IS_UNREAD

                if get_profiles:
                    if not mail['id_from'] in contacts:
                        with self.browser:
                            contacts[mail['id_from']] = self.get_contact(
                                mail['id_from'])

            signature = u''
            if mail.get('src', None):
                signature += u'Sent from my %s\n\n' % mail['src']
            if mail['id_from'] in contacts:
                signature += contacts[mail['id_from']].get_text()

            # Messages are chained newest-to-oldest: each new Message gets the
            # previous one as its single child.
            msg = Message(thread=thread,
                          id=int(
                              time.strftime('%Y%m%d%H%M%S',
                                            parse_dt(
                                                mail['date']).timetuple())),
                          title=thread.title,
                          sender=mail['id_from'],
                          receivers=[
                              my_name if mail['id_from'] != my_name else
                              mails['member']['pseudo']
                          ],
                          date=parse_dt(mail['date']),
                          content=unescape(mail['message']).strip(),
                          signature=signature,
                          children=[],
                          flags=flags)
            if child:
                msg.children.append(child)
                child.parent = msg

            child = msg

        if full and msg:
            # If we have get all the messages, replace NotLoaded with None as
            # parent.
            msg.parent = None
        if not full and not msg:
            # Perhaps there are hidden messages
            msg = NotLoaded

        thread.root = msg

        return thread