Example #1
0
 def test_lxml_etree_bug(self):
     # Regression smoke test: the check only runs when lxml can be
     # imported, and it passes as long as parsing does not raise.
     try:
         import lxml.etree
     except ImportError:
         have_lxml = False
     else:
         have_lxml = True
     if have_lxml:
         # Importing lxml.etree currently causes libxml2 to
         # throw SAXException instead of SAXParseException.
         malformed = u"<feed>&illformed_charref</feed>".encode('utf8')
         feedparser.parse(feedparser._StringIO(malformed))
     self.assertTrue(True)
 def test_lxml_etree_bug(self):
     # Nothing to exercise without lxml: report success and stop.
     try:
         import lxml.etree
     except ImportError:
         self.assertTrue(True)
         return
     # Once lxml.etree has been imported, libxml2 currently raises
     # SAXException instead of SAXParseException; parsing an ill-formed
     # character reference must still complete without an error.
     raw_feed = u"<feed>&illformed_charref</feed>".encode('utf8')
     stream = feedparser._StringIO(raw_feed)
     feedparser.parse(stream)
     self.assertTrue(True)
Example #3
0
    def send_head(self):
        """Send the status line and headers for a test file; return its body stream.

        Response headers are read from ``Header:`` lines embedded in the
        requested file (matched with ``self.headers_re``).

        Example:
        <!--
        Header:   Content-type: application/atom+xml
        Header:   X-Foo: bar
        -->

        An embedded ``Status`` header overrides the computed status code.
        Without one, 304 is sent when the request's ``if-modified-since`` or
        ``if-none-match`` header equals the file's declared ``Last-Modified``
        or ``ETag`` header respectively; otherwise 200 is sent.

        Returns an open binary file object for the translated path, or an
        empty in-memory stream for the hard-wired 304 URL.  If sending the
        headers fails, the file is closed and the exception propagates.
        """
        # Short-circuit the HTTP status test `test_redirect_to_304()`
        if self.path == "/-/return-304.xml":
            self.send_response(304)
            self.send_header("Content-type", "text/xml")
            self.end_headers()
            return feedparser._StringIO(u"".encode("utf-8"))
        path = self.translate_path(self.path)
        # the compression tests' filenames determine the header sent
        if self.path.startswith("/tests/compression"):
            if self.path.endswith("gz"):
                headers = {"Content-Encoding": "gzip"}
            else:
                headers = {"Content-Encoding": "deflate"}
            headers["Content-type"] = "application/xml"
        else:
            # Read the file through a context manager so the handle used for
            # header extraction is closed immediately instead of leaking.
            with open(path, "rb") as header_source:
                raw = header_source.read()
            headers = dict(
                (k.decode("utf-8"), v.decode("utf-8").strip())
                for k, v in self.headers_re.findall(raw)
            )
        f = open(path, "rb")
        try:
            # The "nom"/"nomatch" defaults guarantee that an absent request
            # header (None) never compares equal to an absent file header.
            not_modified = (
                self.headers.get("if-modified-since") == headers.get("Last-Modified", "nom")
            ) or (self.headers.get("if-none-match") == headers.get("ETag", "nomatch"))
            status = 304 if not_modified else 200
            headers.setdefault("Status", status)
            self.send_response(int(headers["Status"]))
            headers.setdefault("Content-type", self.guess_type(path))
            self.send_header("Content-type", headers["Content-type"])
            # Size is taken from the handle that will actually be served.
            self.send_header("Content-Length", str(os.fstat(f.fileno()).st_size))
            for k, v in headers.items():
                # Status and Content-type were already sent above.
                if k not in ("Status", "Content-type"):
                    self.send_header(k, v)
            self.end_headers()
        except BaseException:
            # Do not leak the body file when a header call fails.
            f.close()
            raise
        return f
Example #4
0
    def send_head(self):
        """Send the status line and headers for a test file; return its body stream.

        Response headers are read from ``Header:`` lines embedded in the
        requested file (matched with ``self.headers_re``).

        Example:
        <!--
        Header:   Content-type: application/atom+xml
        Header:   X-Foo: bar
        -->

        An embedded ``Status`` header overrides the computed status code.
        Without one, 304 is sent when the request's ``if-modified-since`` or
        ``if-none-match`` header equals the file's declared ``Last-Modified``
        or ``ETag`` header respectively; otherwise 200 is sent.

        Returns an open binary file object for the translated path, or an
        empty in-memory stream for the hard-wired 304 URL.  If sending the
        headers fails, the file is closed and the exception propagates.
        """
        # Short-circuit the HTTP status test `test_redirect_to_304()`
        if self.path == '/-/return-304.xml':
            self.send_response(304)
            self.send_header('Content-type', 'text/xml')
            self.end_headers()
            return feedparser._StringIO(u''.encode('utf-8'))
        path = self.translate_path(self.path)
        # the compression tests' filenames determine the header sent
        if self.path.startswith('/tests/compression'):
            if self.path.endswith('gz'):
                headers = {'Content-Encoding': 'gzip'}
            else:
                headers = {'Content-Encoding': 'deflate'}
            headers['Content-type'] = 'application/xml'
        else:
            # Read the file through a context manager so the handle used for
            # header extraction is closed immediately instead of leaking.
            with open(path, 'rb') as header_source:
                raw = header_source.read()
            headers = dict(
                (k.decode('utf-8'), v.decode('utf-8').strip())
                for k, v in self.headers_re.findall(raw)
            )
        f = open(path, 'rb')
        try:
            # The 'nom'/'nomatch' defaults guarantee that an absent request
            # header (None) never compares equal to an absent file header.
            if (self.headers.get('if-modified-since') == headers.get('Last-Modified', 'nom')) \
                    or (self.headers.get('if-none-match') == headers.get('ETag', 'nomatch')):
                status = 304
            else:
                status = 200
            headers.setdefault('Status', status)
            self.send_response(int(headers['Status']))
            headers.setdefault('Content-type', self.guess_type(path))
            self.send_header("Content-type", headers['Content-type'])
            # Size is taken from the handle that will actually be served.
            self.send_header("Content-Length", str(os.fstat(f.fileno()).st_size))
            for k, v in headers.items():
                # Status and Content-type were already sent above.
                if k not in ('Status', 'Content-type'):
                    self.send_header(k, v)
            self.end_headers()
        except BaseException:
            # Do not leak the body file when a header call fails.
            f.close()
            raise
        return f
Example #5
0
    def send_head(self):
        """Send the status line and headers for a test file; return its body stream.

        Response headers are read from ``Header:`` lines embedded in the
        requested file (matched with ``self.headers_re``).

        Example:
        <!--
        Header:   Content-type: application/atom+xml
        Header:   X-Foo: bar
        -->

        Paths under ``/tests/compression`` only get a ``Content-Encoding``
        header here, so their ``Content-type`` falls back to
        ``self.guess_type(path)``.

        An embedded ``Status`` header overrides the computed status code.
        Without one, 304 is sent when the request's ``if-modified-since`` or
        ``if-none-match`` header equals the file's declared ``Last-Modified``
        or ``ETag`` header respectively; otherwise 200 is sent.

        Returns an open binary file object for the translated path, or an
        empty in-memory stream for the hard-wired 304 URL.  If sending the
        headers fails, the file is closed and the exception propagates.
        """
        # Short-circuit the HTTP status test `test_redirect_to_304()`
        if self.path == '/-/return-304.xml':
            self.send_response(304)
            self.send_header('Content-type', 'text/xml')
            self.end_headers()
            return feedparser._StringIO(u''.encode('utf-8'))
        path = self.translate_path(self.path)
        # the compression tests' filenames determine the header sent
        if self.path.startswith('/tests/compression'):
            if self.path.endswith('gz'):
                headers = {'Content-Encoding': 'gzip'}
            else:
                headers = {'Content-Encoding': 'deflate'}
        else:
            # Read the file through a context manager so the handle used for
            # header extraction is closed immediately instead of leaking.
            with open(path, 'rb') as header_source:
                raw = header_source.read()
            headers = dict(
                (k.decode('utf-8'), v.decode('utf-8').strip())
                for k, v in self.headers_re.findall(raw))
        f = open(path, 'rb')
        try:
            # The 'nom'/'nomatch' defaults guarantee that an absent request
            # header (None) never compares equal to an absent file header.
            if (self.headers.get('if-modified-since') == headers.get('Last-Modified', 'nom')) \
                    or (self.headers.get('if-none-match') == headers.get('ETag', 'nomatch')):
                status = 304
            else:
                status = 200
            headers.setdefault('Status', status)
            self.send_response(int(headers['Status']))
            headers.setdefault('Content-type', self.guess_type(path))
            self.send_header("Content-type", headers['Content-type'])
            # Size is taken from the handle that will actually be served.
            self.send_header("Content-Length", str(os.fstat(f.fileno()).st_size))
            for k, v in headers.items():
                # Status and Content-type were already sent above.
                if k not in ('Status', 'Content-type'):
                    self.send_header(k, v)
            self.end_headers()
        except BaseException:
            # Do not leak the body file when a header call fails.
            f.close()
            raise
        return f
Example #6
0
        handlers = [handlers]
    # Open the resource and read its entire body.  Any failure is recorded
    # on `result` as a "bozo" condition instead of propagating, and
    # execution continues with empty data.
    try:
        f = _open_resource(url_file_stream_or_string, etag, modified, agent,
                           referrer, handlers)
        data = f.read()
    except Exception, e:
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = ''
        f = None  # falsy, so the header-based decompression below is skipped

    # If the response declares a gzip or deflate content-encoding,
    # decompress the body.  Only objects exposing `headers` are considered.
    if f and data and hasattr(f, 'headers'):
        # NOTE(review): `gzip` and `zlib` are tested for truthiness --
        # presumably they are set to a false value when the module could not
        # be imported; confirm against the module's import section.
        if gzip and f.headers.get('content-encoding', '') == 'gzip':
            try:
                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
            except Exception, e:
                # Some feeds claim to be gzipped but they're not, so
                # we get garbage.  Ideally, we should re-request the
                # feed without the 'Accept-encoding: gzip' header,
                # but we don't.
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = ''
        elif zlib and f.headers.get('content-encoding', '') == 'deflate':
            try:
                # Negative wbits tells zlib to expect a raw deflate stream
                # with no zlib header or trailer.
                data = zlib.decompress(data, -zlib.MAX_WBITS)
            except Exception, e:
                # Undecodable deflate data is handled like bad gzip data:
                # flag the result and drop the body.
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = ''
Example #7
0
    if type(handlers) == types.InstanceType:
        handlers = [handlers]
    try:
        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
        data = f.read()
    except Exception, e:
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = ''
        f = None

    # if feed is gzip-compressed, decompress it
    if f and data and hasattr(f, 'headers'):
        if gzip and f.headers.get('content-encoding', '') == 'gzip':
            try:
                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
            except Exception, e:
                # Some feeds claim to be gzipped but they're not, so
                # we get garbage.  Ideally, we should re-request the
                # feed without the 'Accept-encoding: gzip' header,
                # but we don't.
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = ''
        elif zlib and f.headers.get('content-encoding', '') == 'deflate':
            try:
                data = zlib.decompress(data, -zlib.MAX_WBITS)
            except Exception, e:
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = ''
# Abort the run when `xyz` is empty; sys.exit() with a string prints the
# message and exits with a failure status.
# NOTE(review): `xyz` is populated outside this excerpt -- presumably from
# the config file the message refers to; confirm.
if len(xyz) == 0:
    sys.exit(
        "config file not setup. newsid wont be  updated,setup new config before run"
    )

# Load the RSS source table, using the first CSV column as the index.
df = pd.read_csv(
    '../../PythonFlask/Extraction/Sources/RSS_ExtractionFormatV2.csv',
    index_col=0)

# In[2]:

# Keep only the columns that the feed loop below reads.
df2 = df[['Name', 'Rss', 'NTags', 'SCOPE', 'Type']]

# In[4]:

# Replace feedparser's private resource opener so that every fetch goes
# through `requests` with a 15-second timeout; the response body is wrapped
# in feedparser._StringIO.  All arguments other than the first are ignored.
# NOTE(review): assumes the first positional argument is the feed URL --
# confirm against the installed feedparser version.
feedparser._open_resource = lambda *args, **kwargs: feedparser._StringIO(
    requests.get(args[0], timeout=15).content)
# Accumulators initialised before the feed loop.
feeds = []
posts = []
description = []
# Manual row counter used by the loop below to look up the Name/NTags/SCOPE/
# Type values for the current URL.
# NOTE(review): `df2[col][counter]` assumes the CSV index labels run
# 0..n-1 in row order -- confirm.
counter = 0
for url in df2['Rss']:

    feed = feedparser.parse(url)
    Name = df2['Name'][counter]
    Ntags = df2['NTags'][counter]
    Scope = df2['SCOPE'][counter]
    Type = df2['Type'][counter]
    counter = counter + 1
    for post in feed.entries:

        try: