# Module-level imports this method relies on:
import os
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3, matching the Python 2 code


def crawl_url(self, url, level=0):
    # If the user provides 'http://www.google.com', append a '/' to it.
    if url.count("/") == 2:
        url += "/"
    code = self.__simpleGetRequest(url)
    domain = self.getDomain(url, True)
    if code is not None:
        soup = None
        try:
            soup = BeautifulSoup(code)
        except:
            pass
        if soup is not None:
            for tag in soup.findAll('a'):
                isCool = False
                new_url = None
                try:
                    new_url = tag['href']
                except KeyError:
                    pass  # <a> tag without an href attribute
                # Skip fragment-only anchors and javascript: pseudo-links.
                if new_url is not None and not new_url.startswith("#") \
                        and not new_url.startswith("javascript:"):
                    if new_url.startswith("http://") or new_url.startswith("https://"):
                        # Absolute link: follow it only if it stays on this domain.
                        if new_url.lower().startswith(domain.lower()):
                            isCool = True
                    else:
                        # Relative link: resolve it against the domain root
                        # or against the current page's directory.
                        if new_url.startswith("/"):
                            new_url = os.path.join(domain, new_url[1:])
                        else:
                            new_url = os.path.join(os.path.dirname(url), new_url)
                        isCool = True
                    # Drop URLs that are already queued.
                    if isCool and self.isURLinPool(new_url):
                        isCool = False
                    if isCool:
                        # Check the extension whitelist, ignoring any query string.
                        tmpUrl = new_url
                        if tmpUrl.find("?") != -1:
                            tmpUrl = tmpUrl[:tmpUrl.find("?")]
                        for suffix in self.goodTypes:
                            if tmpUrl.endswith(suffix):
                                # Queue the link unless it exceeds the configured depth.
                                if level + 1 <= self.config["p_depth"]:
                                    self.urlpool.append((new_url, level + 1))
                                break
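# crawl_url above is a method: it leans on __simpleGetRequest, getDomain,
# isURLinPool, urlpool, goodTypes and config, none of which appear in the
# source. Below is a minimal, hypothetical sketch of the host class those
# names suggest (crawl_url would be pasted into it); the helper bodies are
# assumptions, not the original implementation.
import urllib2
from urlparse import urlparse


class Crawler(object):
    def __init__(self, depth=2, goodTypes=(".html", ".htm", "/")):
        self.config = {"p_depth": depth}   # maximum crawl depth
        self.goodTypes = goodTypes         # link suffixes worth queueing
        self.urlpool = []                  # pending (url, level) pairs

    def __simpleGetRequest(self, url):
        # Fetch the page body, or return None on any network error.
        try:
            return urllib2.urlopen(url).read()
        except Exception:
            return None

    def getDomain(self, url, withScheme=False):
        # 'http://host/path' -> 'http://host/' (or just 'host').
        parts = urlparse(url)
        if withScheme:
            return "%s://%s/" % (parts.scheme, parts.netloc)
        return parts.netloc

    def isURLinPool(self, url):
        return any(u == url for u, _ in self.urlpool)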
import re
import urllib2
from BeautifulSoup import BeautifulSoup


def google(search, tokenizer):
    # NOTE: 'tokenizer' is accepted but never used here.
    search = search.replace(" ", "%20")
    url = 'http://www.google.com/search?q=' + search
    # Spoof an old browser so Google serves plain, easily parsed HTML.
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req)
    html = response.read()
    soup = BeautifulSoup(html)
    # Drop the first 167 prettified lines (page header chrome). This magic
    # number is fragile: it breaks whenever Google changes its markup.
    html = soup.prettify().split("\n")
    html = html[167:]
    html = "\n".join(html)
    # Result links appear as Google redirects of the form 'url?q=<target>&s...'.
    links = re.findall(r"url\?q=(.+)&s", html)
    return links
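# Quick usage sketch for google() above, assuming the imports it needs.
# The tokenizer argument is never used by the function, so None is passed.
# Note that the greedy regex can leave Google redirect parameters attached
# to each captured link.
if __name__ == "__main__":
    for link in google("python web crawler", None):
        print link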
import os  # was missing: os.chdir below needs it

os.chdir('/home/caozhzh/work/rss')

import re
import urllib
from xgoogle.BeautifulSoup import BeautifulSoup
from xgoogle.browser import Browser, BrowserError
from xgoogle.GeneralFetch import GeneralFetch

url = "http://blog.sina.com.cn/u/1696709200"
b = Browser()
page = b.get_page(url)
# Strip the IE conditional comments (the source had their dashes mangled
# into en-dashes; the real markers are '<!--' and '-->').
page = page.replace('<!--[if lte IE 6]>', '')
page = page.replace('<![endif]-->', '')
#print page

be = BeautifulSoup(page)
div = be.find('div', {'class': 'diywidget'})
# Keep only the text nodes of the widget, dropping all markup.
txt = ''.join(div.findAll(text=True))
#print type(txt)

import feedparser
origin_feed = feedparser.parse('http://blog.sina.com.cn/rss/1696709200.xml')

from feedformatter import Feed
import time
import datetime
import uuid

# Create the feed
feed = Feed()
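# The script stops right after constructing the Feed. A hypothetical
# continuation is sketched below, assuming feedformatter's dict-style
# feed/item API and the fields feedparser exposes on origin_feed; the
# output filename is made up.
feed.feed["title"] = origin_feed.feed.get("title", "Sina blog mirror")
feed.feed["link"] = url
feed.feed["description"] = txt  # the scraped 'diywidget' text

for entry in origin_feed.entries:
    item = {}
    item["title"] = entry.get("title", "")
    item["link"] = entry.get("link", "")
    item["description"] = entry.get("description", "")
    item["guid"] = str(uuid.uuid4())
    # feedparser yields pubDate as a time.struct_time when it can parse it.
    item["pubDate"] = entry.get("published_parsed") or time.localtime()
    feed.items.append(item)

feed.format_rss2_file("1696709200.xml")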