Example #1
0
def get_rss2(url):
    """Fetch an RSS 2.0 feed from *url* and return a list of RSSItem.

    First tries the strict .NET SyndicationFeed reader; if the feed's XML
    is malformed for it (XmlException), falls back to downloading the raw
    text and scraping <item> children with XPath via XmlDocument.

    Returns:
        list of RSSItem(title, description, link).
    """
    try:
        with XmlReader.Create(url) as reader:
            return [
                RSSItem(
                    i.Title.Text, i.Summary.Text,
                    i.Links[0].Uri.AbsoluteUri if i.Links.Count > 0 else "")
                for i in SyndicationFeed.Load(reader).Items
            ]
    except XmlException:
        # Fallback: re-download as text and parse leniently.
        wc = WebClient()
        try:
            wc.Encoding = UTF8
            xmlstr = wc.DownloadString(url)
        finally:
            # FIX: WebClient is IDisposable -- release its resources instead
            # of leaking the instance.
            wc.Dispose()
        xdoc = XmlDocument()
        xdoc.LoadXml(xmlstr)
        xelem = xdoc.DocumentElement

        # Titles may wrap; strip raw newlines so each title is one line.
        titles = [
            i.InnerText.Replace("\n", "").Replace("\r", "")
            for i in xelem.SelectNodes("//item//title")
        ]
        links = [i.InnerText for i in xelem.SelectNodes("//item//link")]
        descriptions = [
            i.InnerText for i in xelem.SelectNodes("//item//description")
        ]
        # NOTE(review): zip truncates to the shortest list, so an <item>
        # missing a <link> or <description> mis-aligns the remaining items --
        # acceptable only for feeds that always carry all three children.
        return [
            RSSItem(t, d, l) for t, d, l in zip(titles, descriptions, links)
        ]
Example #2
0
def get_rss2(url):
    """Load RSS items from *url* as RSSItem objects.

    Uses SyndicationFeed when the feed parses strictly; on XmlException,
    re-downloads the document and scrapes it with XPath instead.
    """
    try:
        with XmlReader.Create(url) as reader:
            feed = SyndicationFeed.Load(reader)
            items = []
            for entry in feed.Items:
                if entry.Links.Count > 0:
                    href = entry.Links[0].Uri.AbsoluteUri
                else:
                    href = ""
                items.append(RSSItem(entry.Title.Text, entry.Summary.Text, href))
            return items
    except XmlException:
        # Strict parse failed; fetch the raw text and parse leniently.
        downloader = WebClient()
        downloader.Encoding = UTF8
        document = XmlDocument()
        document.LoadXml(downloader.DownloadString(url))
        root = document.DocumentElement

        def inner_texts(path):
            # InnerText of every node matched by the XPath *path*.
            return [node.InnerText for node in root.SelectNodes(path)]

        titles = [raw.Replace("\n", "").Replace("\r", "")
                  for raw in inner_texts("//item//title")]
        descriptions = inner_texts("//item//description")
        links = inner_texts("//item//link")
        return [RSSItem(title, desc, href)
                for title, desc, href in zip(titles, descriptions, links)]
Example #3
0
    def process(self):
        empty = False
        try:
            client = WebClient()
            client.Encoding = Encoding.UTF8
            client.Headers['Accept'] = 'text/html'
            client.Headers[
                'User-Agent'] = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)'

            body = client.DownloadString(
                'http://search.twitter.com/search/thread/%d' % self.status.Id)
            divs = re.findall(r'<div class="msg">(.*?)</div>', body, re.S)
            if divs:
                for div in divs:
                    match = re.search(
                        r'<a[^>]*>(.*?)</a>.*<span[^>]*>(.*?)</span>', div)
                    name = match.group(1)
                    text = re.sub(r'<[^>]*>', '', match.group(2))
                    self.notice(text, nick=name)
            else:
                empty = True
        except WebException, e:
            if e.Response.StatusCode == 404:
                # クロールされていないかプロテクトか
                empty = True
            else:
                raise
Example #4
0
def _get_htmldoc(url, encode=UTF8):
    """Download the page at *url* and return it as an HtmlDocument.

    Args:
        url: address to download.
        encode: response encoding for the WebClient (default UTF8).

    Returns:
        HtmlDocument on success, or None when the download fails with a
        WebException.
    """
    wc = WebClient()
    wc.Encoding = encode
    try:
        html = wc.DownloadString(url)
    except WebException:
        # FIX: dropped the unused exception binding. Network/HTTP failure
        # signals "no document" rather than propagating.
        return None

    htmlDoc = HtmlDocument()
    htmlDoc.LoadHtml(html)
    return htmlDoc
Example #5
0
def _get_htmldoc(url, encode=UTF8):
    """Fetch *url* as HTML text; return an HtmlDocument, or None on failure."""
    client = WebClient()
    client.Encoding = encode
    try:
        markup = client.DownloadString(url)
    except WebException as ex:
        # Download failed -- caller checks for None.
        return None

    document = HtmlDocument()
    document.LoadHtml(markup)
    return document
Example #6
0
    def process(self):
        empty = False
        try:
            client = WebClient()
            client.Encoding = Encoding.UTF8
            client.Headers['Accept'] = 'text/html'
            client.Headers['User-Agent'] = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)'

            body = client.DownloadString('http://search.twitter.com/search/thread/%d' % self.status.Id)
            divs = re.findall(r'<div class="msg">(.*?)</div>', body, re.S)
            if divs:
                for div in divs:
                    match = re.search(r'<a[^>]*>(.*?)</a>.*<span[^>]*>(.*?)</span>', div)
                    name = match.group(1)
                    text = re.sub(r'<[^>]*>', '', match.group(2))
                    self.notice(text, nick=name)
            else:
                empty = True
        except WebException, e:
            if e.Response.StatusCode == 404:
                # クロールされていないかプロテクトか
                empty = True
            else:
                raise