Example #1
    def crawl_url(self, url, level=0):
        # Assumes a module-level "from urlparse import urljoin"
        # (urllib.parse.urljoin on Python 3).
        # If the user provides 'http://www.google.com', append a '/' to it.
        if url.count("/") == 2:
            url += "/"

        code = self.__simpleGetRequest(url)
        domain = self.getDomain(url, True)

        if code is None:
            return

        try:
            soup = BeautifulSoup(code)
        except Exception:
            return

        for tag in soup.findAll('a'):
            # tag.get() returns None for anchors without an href attribute.
            new_url = tag.get('href')
            if new_url is None or new_url.startswith(("#", "javascript:")):
                continue

            isCool = False
            if new_url.startswith(("http://", "https://")):
                # Absolute link: follow it only if it stays on our domain.
                if new_url.lower().startswith(domain.lower()):
                    isCool = True
            else:
                # Relative link: resolve it against the current page.
                # (os.path.join is wrong for URLs; urljoin handles both
                # root-relative and page-relative paths portably.)
                new_url = urljoin(url, new_url)
                isCool = True

            # Skip links that are already queued.
            if isCool and self.isURLinPool(new_url):
                isCool = False

            if isCool:
                # Match suffixes against the path only, not the query string.
                tmpUrl = new_url.split("?", 1)[0]
                for suffix in self.goodTypes:
                    if tmpUrl.endswith(suffix):
                        if level + 1 <= self.config["p_depth"]:
                            self.urlpool.append((new_url, level + 1))
                        break
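Nothing in this snippet drains self.urlpool; it only enqueues (url, level) pairs. A minimal driver-loop sketch for the same hypothetical crawler class follows; the crawl method name, the visited set, and the starting depth are illustrative, not from the source:

    def crawl(self, start_url):
        # Breadth-first drain of the pool that crawl_url() fills.
        self.urlpool = [(start_url, 0)]
        visited = set()
        while self.urlpool:
            url, level = self.urlpool.pop(0)
            if url in visited:
                continue
            visited.add(url)
            self.crawl_url(url, level)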
Example #2
import re
import urllib
import urllib2

from BeautifulSoup import BeautifulSoup


def google(search, tokenizer):
    # tokenizer is accepted by the original signature but never used here.
    # urllib.quote_plus escapes spaces and other unsafe query characters
    # (the original's replace(" ", "%20") handled only spaces).
    url = 'http://www.google.com/search?q=' + urllib.quote_plus(search)
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req)
    html = response.read()
    soup = BeautifulSoup(html)
    # Result links appear as redirect URLs of the form "/url?q=<target>&sa=...";
    # prettify() escapes "&" to "&amp;". Matching the pattern over the whole
    # document is more robust than the original's fixed slice at line 167.
    links = re.findall(r"url\?q=(.+?)&amp;s", soup.prettify())
    return links
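A hedged usage sketch: since tokenizer is never used by the function, None is fine, and the query string is purely illustrative.

links = google("beautifulsoup html parsing", None)
for link in links:
    print(link)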
Example #3
File: kq.py Project: caozhzh/rss
import os

os.chdir('/home/caozhzh/work/rss')

import re
import urllib
import time
import datetime
import uuid

import feedparser
from feedformatter import Feed
from xgoogle.BeautifulSoup import BeautifulSoup
from xgoogle.browser import Browser, BrowserError
from xgoogle.GeneralFetch import GeneralFetch

url = "http://blog.sina.com.cn/u/1696709200"
b = Browser()
page = b.get_page(url)
# Strip IE conditional comments; they confuse the bundled BeautifulSoup.
page = page.replace('<!--[if lte IE 6]>', '')
page = page.replace('<![endif]-->', '')
# print page

be = BeautifulSoup(page)
div = be.find('div', {'class': 'diywidget'})
txt = ''.join(div.findAll(text=True))
# print type(txt)

origin_feed = feedparser.parse('http://blog.sina.com.cn/rss/1696709200.xml')

# Create the feed
feed = Feed()
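The script stops right after constructing the Feed, so the step that copies origin_feed's entries into it is missing. A hedged sketch of what that continuation might look like, based on feedformatter's Feed.feed dict and Feed.items list; the field choices, the use of txt as the description, and the RSS 2.0 rendering are assumptions, not from the source file.

# Assumed continuation, not from the source file.
feed.feed["title"] = origin_feed.feed.get("title", "kq feed")
feed.feed["link"] = url
feed.feed["pubDate"] = time.localtime()

for entry in origin_feed.entries:
    item = {}
    item["title"] = entry.get("title", "")
    item["link"] = entry.get("link", "")
    item["description"] = txt  # widget text scraped above (assumption)
    item["guid"] = str(uuid.uuid4())
    item["pubDate"] = time.localtime()
    feed.items.append(item)

print(feed.format_rss2_string())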