Ejemplo n.º 1
0
    def my_fetcher(self):                           #fetcher的工作内容就是从fetcher_queue中取节点,操作后,放入crawler_queue中
        while self.fetcher_flag:
            if not self.fetcher_queue.empty():      #如果不为空
                tmp_node = self.fetcher_queue.get(block = False)
                fetcher(tmp_node,self.spider_type)
                print str(time.ctime()) + ' ' + tmp_node.url
                self.crawler_queue.put(tmp_node)

            else:                                   #如果下载队里为空
                gevent.sleep(0)
        return
Ejemplo n.º 2
0
    def my_fetcher(
            self):  # Fetcher worker: pull nodes from fetcher_queue, download, then push to crawler_queue
        """Greenlet worker loop that downloads queued nodes.

        Loops while self.fetcher_flag is truthy.  Each ready node is
        passed to fetcher() together with self.spider_type, its URL is
        printed with a timestamp, and the node is forwarded to
        crawler_queue for further processing.
        """
        while self.fetcher_flag:
            if not self.fetcher_queue.empty():  # a node is waiting to be downloaded
                tmp_node = self.fetcher_queue.get(block=False)
                fetcher(tmp_node, self.spider_type)
                print str(time.ctime()) + ' ' + tmp_node.url
                self.crawler_queue.put(tmp_node)

            else:  # download queue is empty -- yield to other greenlets
                gevent.sleep(0)
        return
Ejemplo n.º 3
0
			pass
			#print "Table data is already exists"

	def SaveDB(self, nodes):
		"""Persist an iterable of nodes into the `data` table and commit.

		Each node must expose .url and .html; every row is stored under
		this store's self.key.  The iterable is also kept on self.nodes,
		matching the original implementation's side effect.
		"""
		self.nodes = nodes
		# executemany batches all inserts in a single parameterized call
		# instead of issuing one execute() per node.
		self.cursor.executemany(
			"insert into data (key,url,html) values (?,?,?)",
			[(self.key, node.url, node.html) for node in self.nodes])
		self.conn.commit()

	def Fetch_url_from_DB(self, keyword = None):
		"""Return column 2 of every `data` row whose column-1 value equals self.key.

		NOTE(review): `keyword` is accepted but never used -- results are
		always filtered by self.key; confirm whether filtering by
		`keyword` was intended.  The meaning of the row columns depends
		on the table schema created elsewhere (not visible here).
		"""
		self.cursor.execute("select * from data")
		rows = self.cursor.fetchall()
		return [row[2] for row in rows if row[1] == self.key]

	def CloseDB(self):
		"""Release the cursor and then the underlying connection."""
		# Order matters: close the cursor before its connection.
		for handle in (self.cursor, self.conn):
			handle.close()

if __name__ == '__main__':
	# Smoke test: fetch one page and round-trip it through the sqlite store.
	t = DataNode("http://www.sohu.com")
	from Fetcher import fetcher
	fetcher(t)  # download the page into the node -- presumably fills t.html; TODO confirm
	key = 'sina'  # NOTE(review): assigned but never used; sql3_DB below gets the literal 'sina'
	sql_db = sql3_DB('sina')
	sql_db.SaveDB((t,))  # persist the single fetched node
	print sql_db.Fetch_url_from_DB()
	sql_db.CloseDB()
Ejemplo n.º 4
0
            link_list = list(set([i[2] for i in links]))
            node.set_links(link_list)
        except Exception, e:
            return 
        '''
        待补充其他操作
        如获取页面title等
        '''
        try:
            pass
        except Exception, e:
            pass 
        node.reset_html()
        return

if __name__ == '__main__':
    # Smoke test: crawl one page with the default (static) fetcher, then
    # again with the "dynamic" spider type, printing page/link counts.
    t = DataNode("http://www.sina.com.cn")

    fetcher(t)  # static fetch -- presumably fills t.html; TODO confirm
    print 'static:'
    print len(t.html)
    crawler(t)  # extracts links from the node -- populates t.links
    print len(set(t.links))

    print "dynamic:"
    fetcher(t,"dynamic")  # refetch the same node using the "dynamic" spider type
    print len(t.html)
    crawler(t)
    print len(set(t.links))