Code example #1
File: WebCrawler.py  Project: Roknahr/pyCrawler
    def process_url(self, url):
        Helper.debug("process start")
        try:
            source = request.urlopen(url).read()
        except Exception:
            # Fetch failed (network error, HTTP error, bad URL): skip this page.
            return set()
        Helper.debug("process 1:db")

        self.db_cache(url, source)

        #db = sqlite3.connect("data/pages.db")
        #cursor = db.cursor()
        #cursor.execute("""SELECT url FROM pages""")
        #all_urls = [''.join(item) for item in cursor.fetchall()]
        #if url in all_urls:
        #    cursor.execute("""
        #        UPDATE pages SET html = ? WHERE url = ? """, (source, url))
        #else:
        #    cursor.execute("""
        #        INSERT INTO pages(url, html) VALUES (?,?)""", (url, source))
        #db.commit()
        #db.close()

        Helper.debug("process 2:re")
        # Regex for finding links
        rgx = re.compile(r'a href="(/\S+|[/aA-zZ0-9]\S+\.\S+)"')

        linkMatches = rgx.findall(str(source))

        tempFrontier = set()

        tempFrontier.add(url)
        Helper.debug("process 3:add links")
        if self.frontier.frontQueue.qsize() < 10:
            for link in linkMatches:
                if (('https://' in link or 'http://' in link or link[0] == '/')
                        and 'ftp.' not in link
                        and 'ftp://' not in link
                        and 'mailto:' not in link):
                    tempFrontier.add(
                        self.normalize_url(link, Helper.get_domain(url)))

        #tempFrontier = tempFrontier - set(self.get_disallowed_sites(url, 'GingerWhiskeyCrawler'))
        Helper.debug("process end")
        return tempFrontier
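
To make the extraction step above concrete, here is a standalone sketch (not part of the project) that runs the same href regex over the str() repr of a small, made-up html byte string and then applies the same keep/drop filter used in the loop; only the sample page content is invented.

import re

html = b'<a href="https://example.com/docs.html">a</a> ' \
       b'<a href="/about">b</a> <a href="mailto:me@example.com">c</a>'

# Same pattern as in process_url, applied to the bytes' str() repr.
rgx = re.compile(r'a href="(/\S+|[/aA-zZ0-9]\S+\.\S+)"')
links = rgx.findall(str(html))

# Same keep/drop rules as the loop in process_url.
kept = [link for link in links
        if ('https://' in link or 'http://' in link or link[0] == '/')
        and 'ftp.' not in link
        and 'ftp://' not in link
        and 'mailto:' not in link]

print(links)  # ['https://example.com/docs.html', '/about', 'mailto:me@example.com']
print(kept)   # ['https://example.com/docs.html', '/about']

Links that survive the filter are passed to normalize_url together with the page's domain, which presumably resolves relative paths into absolute URLs before they join the frontier.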
Code example #2
File: WebCrawler.py  Project: Roknahr/pyCrawler
    def get_disallowed_sites(self, url, myAgent):
        Helper.debug("Get disallowed sites 1")

        domain = Helper.get_domain(url)

        if domain in self.robots.keys():
            return self.robots[domain]

        try:
            robot = request.urlopen('http://' + domain + '/robots.txt')
            Helper.debug('    Fetching robots.txt: ' + domain)
        except Exception:
            # No robots.txt reachable: treat the domain as having no restrictions.
            return []

        reAgent = re.compile(r"User-[aA]gent: *(\S+) *$")
        reDis = re.compile(r"Disallow: *(/\S*) *$")

        agent = None
        disallowed = {}
        Helper.debug("Get disallowed sites 2")
        for line in robot:
            # The response yields raw bytes; decode and strip the line endings.
            l = line.decode('utf-8', errors='ignore').strip()
            agentMatch = reAgent.findall(l)
            if agentMatch:
                agent = agentMatch[0]
                disallowed[agent] = []
            disMatch = reDis.findall(l)
            if disMatch and agent in disallowed:
                disallowed[agent].append(disMatch[0])
        Helper.debug("Get disallowed sites 3")
        result = []
        if myAgent in disallowed:
            for link in disallowed[myAgent]:
                result.append(link)  # self.normalize_url(link, domain))
        if '*' in disallowed:
            for link in disallowed['*']:
                result.append(link)  # self.normalize_url(link, domain))
        Helper.debug("Get disallowed sites 4")
        self.robots[domain] = result
        return result
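
To illustrate the parsing step, the sketch below applies the same two regular expressions to a made-up in-memory sample_robots byte string instead of a live urlopen() response; the agent names and Disallow paths in the sample are invented for illustration.

import re

sample_robots = b"""User-agent: GingerWhiskeyCrawler
Disallow: /private/
User-agent: *
Disallow: /tmp/
"""

reAgent = re.compile(r"User-[aA]gent: *(\S+) *$")
reDis = re.compile(r"Disallow: *(/\S*) *$")

agent = None
disallowed = {}
for line in sample_robots.splitlines():
    l = line.decode('utf-8', errors='ignore').strip()
    agentMatch = reAgent.findall(l)
    if agentMatch:
        agent = agentMatch[0]       # start collecting rules for this agent
        disallowed[agent] = []
    disMatch = reDis.findall(l)
    if disMatch and agent in disallowed:
        disallowed[agent].append(disMatch[0])

print(disallowed)
# {'GingerWhiskeyCrawler': ['/private/'], '*': ['/tmp/']}

In the crawler itself, the resulting list of path prefixes is cached per domain in self.robots, and the commented-out line near the end of process_url marks where it was apparently meant to prune the candidate frontier.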