Code Example #1
File: Crawlable.py  Project: WangCHX/Crawler
def isCrawlable(url):
    # Naive method to get the site root for the given url; the root is only
    # used to locate robots.txt, the original url is checked against the rules.
    url = CheckUrl.validifyUrl(url)
    strs = url.split('/')
    root = url
    if len(strs) > 2:
        root = strs[0] + "//" + strs[2]

    robotUrl = root + "/robots.txt"

    if robotHash.get(robotUrl) is None:
        # robots.txt for this site has not been fetched yet.
        rerp = RobotExclusionRulesParser.RobotExclusionRulesParser()
        try:
            rerp.fetch(robotUrl, 3)
        except urllib2.URLError:
            return False
        robotHash[robotUrl] = rerp  # cache the parser so later calls reuse it
    else:
        # Reuse the parser cached for this site.
        rerp = robotHash[robotUrl]
    return rerp.is_allowed("*", url)
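The example above keeps one parser per site in the module-level robotHash cache. Below is a minimal Python 3 sketch of the same check using only the standard library urllib.robotparser; the names _robot_cache and is_crawlable are illustrative and not part of the original project.

# Python 3 sketch of the same robots.txt check using only the standard
# library; names here are illustrative, not from the original project.
from urllib.parse import urlsplit, urlunsplit
from urllib.robotparser import RobotFileParser

_robot_cache = {}  # robots.txt URL -> parsed RobotFileParser

def is_crawlable(url, user_agent="*"):
    parts = urlsplit(url)
    robots_url = urlunsplit((parts.scheme, parts.netloc, "/robots.txt", "", ""))
    parser = _robot_cache.get(robots_url)
    if parser is None:
        parser = RobotFileParser(robots_url)
        try:
            parser.read()          # fetch and parse robots.txt
        except OSError:
            return False           # unreachable robots.txt: treat as not crawlable
        _robot_cache[robots_url] = parser
    return parser.can_fetch(user_agent, url)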
Code Example #2
File: WebCrawler.py  Project: max11max/WebCrawler-1
def Queue_Check_Push_Front(page):
    """Check the url and push it into the queue.

    If no check is needed, push it into the queue directly.

    :param page: {
        url: the url of the page,
        depth: the depth of the page, i.e., its minimum distance from one of the 10 start pages
    }
    """
    global number_collected_url
    global pagesNumber
    # Stop enqueuing once the frontier plus the collected pages exceed 1.5x the target.
    if len(queue) + number_collected_url > pagesNumber * 1.5:
        return
    href = page["url"]

    global hash_table
    global number_visited_url
    href = CheckUrl.checkUrl(href)
    if href != -1:
        if CheckSite.checkSite_Visitable(href) == 1:
            if href not in hash_table:
                print "queue push front: " + str(len(queue)) + " " + href
                queue.append(page)
                hash_table[href] = number_visited_url
                number_visited_url += 1
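Queue_Check_Push_Front combines three ideas: cap the frontier at 1.5x the page target, validate the URL, and deduplicate via hash_table before appending. Below is a minimal Python 3 sketch of that check-dedupe-enqueue pattern using a set and a deque; the names and the cap value are illustrative assumptions, not from the project.

# Python 3 sketch of the same "check, dedupe, then enqueue" pattern;
# names and the 1.5x cap are illustrative assumptions.
from collections import deque

frontier = deque()
seen = set()
TARGET_PAGES = 1000

def enqueue_page(page, collected_so_far):
    if len(frontier) + collected_so_far > TARGET_PAGES * 1.5:
        return                      # frontier already holds enough candidates
    url = page["url"]
    if url not in seen:
        seen.add(url)
        frontier.append(page)       # push to the back; pop from the front for BFS order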
Code Example #3
File: pakman.py  Project: RaminNietzsche/pakman
def main(argv):
    try:
        print "Loading PKGs list ..."
        if argv[1] == "-Syu":
            p = subprocess.Popen(["pacman -Syup | tac | head -n -5"], shell=True,
                                 stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        elif argv[1] == "-S" and len(argv) > 2:
            p = subprocess.Popen(["pacman -Sp " + argv[2]], shell=True,
                                 stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        elif len(argv) > 2:
            p = subprocess.Popen(["pacman " + argv[1] + " " + argv[2]], shell=True,
                                 stderr=subprocess.STDOUT)
            exit()
        else:
            p = subprocess.Popen(["pacman " + argv[1]], shell=True,
                                 stderr=subprocess.STDOUT)
            exit()
        # Check every download URL that pacman printed.
        for line in p.stdout.readlines():
            if CheckUrl.checkURL(line) == 0:
                return
        print "You must download : " + convert_bytes(CheckUrl.DownloadSize[0])
        if YesNoQ.query_yes_no("Proceed with installation? "):
            Runner(CheckUrl.DownloadList)
            # Wait until all downloader threads have finished.
            while threading.activeCount() > 1:
                time.sleep(1)
        else:
            print "BYE BYE :D"
            exit(1)
        if len(sys.argv) == 2:
            pacman = subprocess.Popen(["pacman " + argv[1]], shell=True,
                                      stdout=subprocess.PIPE, stdin=subprocess.PIPE)
        elif len(sys.argv) > 2:
            pacman = subprocess.Popen(["pacman -S " + argv[2]], shell=True,
                                      stdout=subprocess.PIPE, stdin=subprocess.PIPE)
        else:
            print bcolors.FAIL + "invalid args!" + bcolors.ENDC
            exit(1)
        stdout_data = pacman.communicate(input='')[0]
        print bcolors.OKBLUE + "FIN! ;) (PA|< Man)" + bcolors.ENDC
    except ValueError:
        print bcolors.FAIL + "What happened? Report it ([email protected])" + bcolors.ENDC
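The pakman example relies on pacman's -p/--print behaviour: pacman -Sp <pkg> (or -Syup) prints the download URLs instead of installing, and the script reads them line by line from the pipe. Below is a minimal Python 3 sketch of that pattern with subprocess.run; the helper name is illustrative and the package name is only an example.

# Python 3 sketch: ask pacman to print download URLs (-p/--print) and read
# them line by line; requires pacman on an Arch-based system.
import subprocess

def list_download_urls(package):
    result = subprocess.run(["pacman", "-Sp", package],
                            capture_output=True, text=True, check=True)
    return [line for line in result.stdout.splitlines() if line.startswith("http")]

# Example usage (package name is arbitrary):
# print(list_download_urls("vim"))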
Code Example #5
def processUrl(self, href):
    """
    :param href: current url to be processed.
    If this href is not in the dict yet, just push it in;
    otherwise it is already in the dict, so compute a new score for it
    (here the new score is simply added to the existing one).
    """
    href = urlparse.urljoin(self.baseUrl, href)
    href = CheckUrl.validifyUrl(href)
    if href != -1:
        if dict.get(href) is not None:
            # Already in the dict: if it is still queued, accumulate the score.
            #heapq.heappush(queue, [self.score, href])
            if queue.get(href) is not None:
                queue[href] = queue[href] + self.score
        else:
            # Not seen before: record it in both the queue and the dict.
            queue[href] = self.score
            dict[href] = self.score
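processUrl keeps scores in plain dicts rather than pushing onto the heap directly (the heappush call is commented out), because entries already inside a heapq cannot be re-prioritised in place. A common workaround is lazy deletion: push a fresh (score, url) entry on every update and discard stale entries when popping. Below is a minimal Python 3 sketch of that idea; the names are illustrative, not from the project.

# Python 3 sketch of the "lazy deletion" workaround for updating priorities
# in a heapq-based frontier; names here are illustrative.
import heapq

heap = []          # entries are [score, url]; heapq keeps the smallest score first
best_score = {}    # url -> most recent accumulated score

def push_or_update(url, score):
    new_score = best_score.get(url, 0) + score
    best_score[url] = new_score
    heapq.heappush(heap, [new_score, url])   # older entries for url become stale

def pop_best():
    while heap:
        score, url = heapq.heappop(heap)
        if best_score.get(url) == score:     # skip stale entries
            del best_score[url]
            return url
    return None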
Code Example #6
def computePriorityScore(content):  # note: the original signature is not in this excerpt; this def line is assumed
    """
    This is used to compute the priority score of a page, using a naive method:
    count the number of keyword occurrences in the content of that page.
    """
    content = content.lower()
    content = content.split()
    priorityScore = 0
    for keyword in keywords:
        for word in content:
            if keyword == word:
                priorityScore += 1
    return priorityScore
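The nested loop above counts how often each keyword occurs in the page text. The same count can be expressed with collections.Counter; a minimal sketch, assuming keywords is an iterable of lowercase words.

# Python 3 sketch of the same keyword count using collections.Counter;
# assumes `keywords` is an iterable of lowercase words.
from collections import Counter

def keyword_score(content, keywords):
    counts = Counter(content.lower().split())
    return sum(counts[keyword] for keyword in keywords)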

queryUrl = "https://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=" + urllib.quote(query)
response1 = urllib2.urlopen(queryUrl + "&rsz=8")
for result in json.load(response1)['responseData']['results']:
    url = CheckUrl.validifyUrl(result['unescapedUrl'])
    queue[url] = -1000  # seed with score -1000 so the start pages get the highest priority
    dict[url] = -1000

response1.close()
response2 = urllib2.urlopen(queryUrl + "&rsz=2&start=8")

for result in json.load(response2)['responseData']['results']:
    url = CheckUrl.validifyUrl(result['unescapedUrl'])
    #heapq.heappush(queue, [-1000, url])
    queue[url] = -1000
    # Because heapq in Python is a min-heap, storing negative scores
    # makes it behave like a max-heap.
    dict[url] = -1000
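The note in the loop above refers to the standard trick for getting max-heap behaviour out of heapq: since heapq always pops the smallest element, storing -score makes the highest score come out first. A tiny illustrative sketch:

# Python 3 sketch of the negative-score trick: heapq always pops the smallest
# value, so pushing -score pops the highest-scoring item first.
import heapq

heap = []
for score, url in [(10, "a.com"), (1000, "b.com"), (500, "c.com")]:
    heapq.heappush(heap, (-score, url))

neg_score, url = heapq.heappop(heap)
print(url, -neg_score)   # b.com 1000: the highest-scoring url comes out first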