Example #1
File: fetcher.py Project: fay/wt
def update_from_google_reader():
    # Pull the full subscription list from Google Reader and fetch every subscribed feed.
    reader = googleclient.get_reader()
    feed_list = reader.get_subscription_list()['subscriptions']
    for f in feed_list:
        link = f['id'].encode('utf-8')
        print link
        fetch_from_google_reader(reader, link)
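
This helper relies on the authenticated client returned by googleclient.get_reader(). A minimal sketch of driving the updater from a standalone script, assuming the fetcher module is importable; the __main__ guard and the error message are illustrative, not part of the project:

# sketch: run a full subscription update from the command line (assumed entry point)
import fetcher

if __name__ == '__main__':
    try:
        fetcher.update_from_google_reader()
    except Exception, e:
        # surface authentication/network failures instead of failing silently
        print "update failed:", e
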
Example #2
File: fetcher.py Project: fay/wt
def addrss2google(url, count=100):
    # Subscribe to the feed (the full URL, including its http:// scheme) in
    # Google Reader, then fetch up to `count` items from it.
    reader = googleclient.get_reader()
    reader.add_subscription(feed='feed/' + url)
    fetch_from_google_reader(reader, 'feed/' + url, count)
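
A hedged usage sketch for addrss2google, assuming it is exported from the same fetcher module; the feed URL and item count below are placeholders:

# sketch: subscribe to one feed and pull its 50 most recent items (URL is a placeholder)
import fetcher

fetcher.addrss2google('http://example.com/blog/rss', count=50)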
    

    
Example #3
    def crawle(self, pages, depth=2):
        # Breadth-first crawl: follow outbound links from the seed pages for
        # `depth` rounds, record them in the Link table, and subscribe to any
        # RSS feeds discovered along the way via Google Reader.
        reader = googleclient.get_reader()
        for i in range(depth):
            newpages = []
            for page in pages:
                print "crawling... ", page
                query = Link.objects.filter(link=page)
                if query:
                    # skip pages that have already been crawled
                    if query.get().is_crawled == "y":
                        print page, "already crawled"
                        continue
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    if query:
                        linkModel = query.get()
                        if linkModel.is_crawled == "n":
                            linkModel.is_crawled = "y"
                            linkModel.save()
                    continue
                try:
                    soup = BeautifulSoup(c.read())
                    if query:
                        linkModel = query.get()
                        if linkModel.is_crawled == "n":
                            linkModel.is_crawled = "y"
                            linkModel.save()
                    else:
                        self.addtoDB(page, "y")
                    # find all hyperlinks on the page
                    links = soup("a")
                    for link in links:
                        if "href" in dict(link.attrs):
                            url = urljoin(page, link["href"])
                            if url.find("'") != -1:
                                continue
                            url = url.split("#")[0]  # remove location portion
                            # only queue links that leave the current site: compare the
                            # scheme+host prefix of the page with that of the candidate URL
                            match1 = re.match(r"(http://[\w.]*/)", page)
                            match2 = re.match(r"(http://[\w.]*/)", url)
                            if url[0:4] == "http" and not Link.objects.filter(link=url):
                                if match1 and match2 and match1.group() == match2.group():
                                    continue
                                newpages.append(url)
                                print "appending:", url
                                self.addtoDB(url, "n")
                    rss_links = soup("link")
                    for link in rss_links:
                        if "type" in dict(link.attrs) and link["type"] == "application/rss+xml":
                            url = urljoin(page, link["href"])
                            if url.find("'") != -1:
                                continue
                            if url[0:4] == "http" and not Feed.objects.filter(link=url):
                                ret = fetcher.fetch(url, False)
                                if ret == "pass":
                                    print url, ":already exists"
                                    continue
                                # skip RSS feeds served from delicious.com
                                if url.startswith("http://delicious.com"):
                                    continue
                                reader.add_subscription(feed="feed/" + url)
                                fetcher.fetch_from_google_reader(reader, "feed/" + url, 100)

                except Exception, e:
                    print e
                    if query:
                        linkModel = query.get()
                        if linkModel.is_crawled == "n":
                            linkModel.is_crawled = "y"
                            linkModel.save()
            pages = newpages
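
The class that owns crawle is not shown in this snippet, so the sketch below uses a hypothetical Crawler name; the seed URLs are placeholders:

# sketch: seed the crawler with two start pages and follow external links for two rounds
# (Crawler is a hypothetical class name wrapping the crawle method shown above)
crawler = Crawler()
seed_pages = ["http://example.com/", "http://example.org/news/"]
crawler.crawle(seed_pages, depth=2)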