    def fetch_source_URI(self, sourceURI):
        if not Util.is_valid_uri(sourceURI, is_source_uri=True):
            raise CachingError("Invalid URI: %s" % sourceURI)
        visited = set()
        uri_queue = Queue()
        uri_queue.put(sourceURI)
        thread_list = []
        for index in xrange(FETCHING_THREAD_NUM):
            parser = _URIParser(visited,
                                uri_queue,
                                cache_root=self.cache_dir,
                                fetch_data=True,
                                print_out=self.print_out)
            thread_list.append(parser)
            parser.start()

        compiled_list = list()
        for t in thread_list:
            t.join()
            compiled_list.extend(t.compiled_list)

        # update Redis, since the fetched data has been saved to the cache
        self.update_cachedfile_info(compiled_list)
        Util.organize_compiled_list(compiled_list)
        return compiled_list
    @staticmethod
    def get_compiled_URIs(cache_root, sourceURI):
        """Return a list of URIItem.
        Items in the URIItem list have a tree structure, which is necessary for FUSE.
        """
        if not sourceURI.endswith("/"):
            sourceURI += "/"
        if not Util.is_valid_uri(sourceURI, is_source_uri=True):
            msg = "Invalid URI: %s" % sourceURI
            raise CachingError(msg)
        uri_queue = Queue()
        visited = set()
        uri_queue.put(sourceURI)
        visited.add(sourceURI)
        thread_list = []
        for index in xrange(FETCHING_THREAD_NUM):
            parser = _URIParser(visited, uri_queue,
                                cache_root=cache_root, fetch_data=False)
            thread_list.append(parser)
            parser.start()

        compiled_list = list()
        try:
            while len(thread_list) > 0:
                t = thread_list[0]
                t.join(timeout=1.0)
                if not t.is_alive():
                    compiled_list.extend(t.compiled_list)
                    thread_list.remove(t)
        except KeyboardInterrupt:
            for t in thread_list:
                t.terminate()
                t.join()
            sys.stderr.write("Keyboard Interrupt\n")
        return compiled_list
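Both methods above fan a shared Queue of URIs out to FETCHING_THREAD_NUM _URIParser workers and merge each worker's compiled_list after joining. Note the design choice in get_compiled_URIs: polling with join(timeout=1.0) instead of a bare join() keeps the main thread responsive to KeyboardInterrupt on Python 2. A minimal usage sketch follows; CacheManager stands in for the enclosing class, whose real name is not shown in this snippet, and the cache root and URI are placeholders.

# Hypothetical usage; CacheManager, the cache root, and the URI are
# assumptions, not names confirmed by the snippet above.
items = CacheManager.get_compiled_URIs("/tmp/cache", "http://example.com/data")
print "compiled %d URIItem entries" % len(items)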
Example 5
File: spider.py Project: baboy/BK
	def download_page(self, page):
		print "page:",page
		url = "http://123.57.80.206:8080/user/search?pagesize=20&business=&uid=725&industy=&area=&page="+str(page)
		print "url:",url
		stime = time.time()
		print "<download start> at time: %s"%time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(stime))
		# fetch the JSON search results
		urllib2.socket.setdefaulttimeout(10)
		response = urllib2.urlopen(url)
		print response.headers
		data = response.read() 
		response.close()
		etime = time.time()
		print "<download end> at time: %s cost:%f"%(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(etime)),etime-stime)
		# parse
		stime = etime
		self.parse_json(page, json.loads(data))
		etime = time.time()
		print "<parse end> at time: %s cost:%f"%(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(etime)),etime-stime)
		self.db.close()
	def start(self, page):
		self.download_page(page)

#parser = WeiboParser("http://m.tvie.com.cn/mcms/api2/mod/sns/feeds.php?uid=2214257545")
#parser = WeiboParser("http://m.tvie.com.cn/mcms/api2/mod/sns/feeds.php?uid=1640601392")
page = int(sys.argv[1])
parser = ContactParser()
parser.start(page)
#print sohu.jsonMap
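download_page lets any network failure or timeout propagate and abort the run. Below is a hedged sketch of one way to make the fetch step fault-tolerant under Python 2; fetch_data is a hypothetical helper, not part of the original spider.

import socket
import urllib2

def fetch_data(url, timeout=10):
	# Hypothetical helper: return the response body, or None on a
	# network error or timeout instead of raising.
	try:
		response = urllib2.urlopen(url, timeout=timeout)
		try:
			return response.read()
		finally:
			response.close()
	except (urllib2.URLError, socket.timeout), e:
		print "fetch failed for %s: %s" % (url, e)
		return None

The script itself is invoked as "python spider.py <page>", which matches the sys.argv[1] read above.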