コード例 #1
0
ファイル: Crawlable.py プロジェクト: WangCHX/Crawler
def isCrawlable(url):
    """
    Report whether robots.txt lets a generic crawler ("*") fetch ``url``.

    The site root is derived naively by splitting the URL on "/" and keeping
    the scheme and host parts; robots.txt is then looked up under that root.
    Parsers are kept in ``robotHash`` keyed by the robots.txt URL, so each
    site's robots.txt is fetched at most once.

    :param url: URL to check; it is first passed through
        ``CheckUrl.validifyUrl``.
    :return: True when the rules allow the URL.  False when they disallow
        it, when ``validifyUrl`` rejects the URL, or when fetching
        robots.txt raises ``urllib2.URLError``.
    """
    url = CheckUrl.validifyUrl(url)
    # validifyUrl reports a rejected URL as -1 (other callers in this project
    # test for that value); such a URL cannot be split or crawled.
    if url == -1:
        return False

    # Keep the full URL for the permission check and use a separate name
    # for the root, so the rules are evaluated for the requested page.
    parts = url.split('/')
    root = url
    if len(parts) > 2:
        root = parts[0] + "//" + parts[2]
    robotUrl = root + "/robots.txt"

    rerp = robotHash.get(robotUrl)
    if rerp is None:
        rerp = RobotExclusionRulesParser.RobotExclusionRulesParser()
        try:
            # Second argument is presumably a timeout in seconds -- confirm
            # against the RobotExclusionRulesParser API.
            rerp.fetch(robotUrl, 3)
        except urllib2.URLError:
            return False
        # Remember the parser so later URLs on this host reuse it.
        robotHash[robotUrl] = rerp

    return bool(rerp.is_allowed("*", url))
コード例 #2
0
def isCrawlable(url):
    """
    Report whether robots.txt lets a generic crawler ("*") fetch ``url``.

    The site root is derived naively by splitting the URL on "/" and keeping
    the scheme and host parts; robots.txt is then looked up under that root.
    Parsers are kept in ``robotHash`` keyed by the robots.txt URL, so each
    site's robots.txt is fetched at most once.

    :param url: URL to check; it is first passed through
        ``CheckUrl.validifyUrl``.
    :return: True when the rules allow the URL.  False when they disallow
        it, when ``validifyUrl`` rejects the URL, or when fetching
        robots.txt raises ``urllib2.URLError``.
    """
    url = CheckUrl.validifyUrl(url)
    # validifyUrl reports a rejected URL as -1 (other callers in this project
    # test for that value); such a URL cannot be split or crawled.
    if url == -1:
        return False

    # Keep the full URL for the permission check and use a separate name
    # for the root, so the rules are evaluated for the requested page.
    parts = url.split('/')
    root = url
    if len(parts) > 2:
        root = parts[0] + "//" + parts[2]
    robotUrl = root + "/robots.txt"

    rerp = robotHash.get(robotUrl)
    if rerp is None:
        rerp = RobotExclusionRulesParser.RobotExclusionRulesParser()
        try:
            # Second argument is presumably a timeout in seconds -- confirm
            # against the RobotExclusionRulesParser API.
            rerp.fetch(robotUrl, 3)
        except urllib2.URLError:
            return False
        # Remember the parser so later URLs on this host reuse it.
        robotHash[robotUrl] = rerp

    return bool(rerp.is_allowed("*", url))
コード例 #3
0
 def processUrl(self, href):
     """
     Record a link found on the current page in the shared score tables.

     :param href: link to process; it is resolved against ``self.baseUrl``
         and then passed through ``CheckUrl.validifyUrl``.

     A link for which ``validifyUrl`` returns -1 is ignored.
     A link with no entry in ``dict`` is stored in both ``queue`` and
     ``dict`` with ``self.score``.
     A link that already has an entry in ``dict`` gets ``self.score`` added
     to its ``queue`` entry, but only if ``queue`` still holds one;
     ``dict`` is left unchanged in that case.
     """
     target = CheckUrl.validifyUrl(urlparse.urljoin(self.baseUrl, href))
     if target == -1:
         return

     if dict.get(target) is None:
         # First sighting of this link: register it in both tables.
         queue[target] = self.score
         dict[target] = self.score
         return

     # Seen before: fold this page's score into the pending queue entry,
     # if the link is still waiting in the queue.
     if queue.get(target) is not None:
         queue[target] = queue[target] + self.score
コード例 #4
0
    this is used to computer priority score of that page, use naive method
    compute the number of keywords in the content of that page
    """
    content = content.lower()
    content = content.split()
    priorityScore = 0
    for keyword in keywords:
        for word in content:
            if keyword == word:
                priorityScore += 1
    return priorityScore

# Seed the crawl tables with search-engine results for `query`.
queryUrl = "https://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=" + urllib.quote(query)

# Results are requested in two pages: 8 hits first, then 2 more from offset 8.
for pageArgs in ("&rsz=8", "&rsz=2&start=8"):
    response = urllib2.urlopen(queryUrl + pageArgs)
    try:
        results = json.load(response)['responseData']['results']
    finally:
        # Always release the connection, even if the payload fails to parse.
        response.close()

    for result in results:
        url = CheckUrl.validifyUrl(result['unescapedUrl'])
        # validifyUrl reports a rejected URL as -1; do not seed the tables
        # with that marker as if it were a URL.
        if url == -1:
            continue
        # Seeds get score -1000.  Scores are stored negated because heapq in
        # Python is a min-heap, so negative numbers make it act as a max-heap.
        queue[url] = -1000
        dict[url] = -1000