# -*- coding: utf-8 -*-
"""Scrapy spider module defining the ``amazonsearch`` spider.

Importing this module creates a ``RedisM`` helper, opens a MySQL connection
with the ``MYSQL_*`` settings, and creates a cursor on that connection.

NOTE(review): this text was previously collapsed onto one physical line that
began with ``#``, so the interpreter treated all of it as a comment.  The
line structure below was rebuilt from that text; nothing was added beyond
what it contained.
"""
import scrapy
from googleprj.items import DmozItem, gurl
from lxml import etree
import re
import time
import urllib
import requests
# The wildcard import supplies MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD,
# MYSQL_DB and IPPOOL used below, so it is kept as-is.
from googleprj.settings import *
from googleprj.redisM import RedisM
import MySQLdb

count = 0
rm = RedisM()
db = MySQLdb.connect(MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB)
cursor = db.cursor()

# Maps a category display name to a list of three ID strings.  The original
# author's note labels the three positions as ["id", "include", "star"];
# what each ID is used for is not visible here -- TODO confirm.
params = {
    "Home & Kitchen": ["1055398", "1248816011", "1248915011"],
    "Beauty & Personal Care": ["3760911", "1248798011", "1248873011"],
    "Electronics": ["172282", "1248801011", "1248879011"],
    "Health, Household & Baby Care": ["3760901", "1248810011", "1248903011"],
    "Pet Supplies": ["2619533011", "2661601011", "2661618011"],
    "Tools & Home Improvement": ["228013", "1248813011", "1248909011"],
    "Office Products": ["1064954", "1248831011", "1248945011"],
}


class amazon(scrapy.Spider):
    """Spider registered as ``amazonsearch`` and restricted to amazon.com."""

    name = "amazonsearch"
    allowed_domains = ["amazon.com"]

    def start_requests(self):
        """Refill the ``amazonpool`` entry when it is missing or too short.

        Reads ``amazonpool`` through the module-level ``rm`` helper and, if
        the value is ``None`` or its length is below 5, pushes ``IPPOOL``
        under the same key.

        NOTE(review): the text this method was rebuilt from ends after the
        refill step, so no requests are produced here.  If the method
        continues in the full file, that remainder must be restored --
        confirm against the original source.
        """
        global rm, cursor, db
        # Read the value once: a second lookup could observe a different
        # value (e.g. None) and make len() fail.
        pool = rm.getvalue("amazonpool")
        if pool is None or len(pool) < 5:
            rm.pushredis("amazonpool", IPPOOL)
# NOTE(review): the next line holds the text of an entire spider module that
# has been collapsed onto a single physical line.  Because that line starts
# with "#", Python reads all of it as one comment and none of it executes.
#
# What the collapsed text contains, in order:
#   * imports (scrapy, googleprj.items, lxml, re, time, urllib, requests,
#     a wildcard import of googleprj.settings, googleprj.redisM, MySQLdb);
#   * module-level setup: count = 0, rm = RedisM(), a MySQLdb connection
#     built from the MYSQL_* settings, and a cursor on that connection;
#   * class amazon(scrapy.Spider) with name "amazon" and allowed_domains
#     ["amazon.com"];
#   * start_requests(), which pushes IPPOOL under "amazonpool" when the
#     stored value is None or has length below 5, then begins executing
#     "select url from tb_tempurl where isvalid=1" inside a try block.
#
# The text stops inside that open `try:` -- no except/finally clause and no
# request generation are present, so the original line breaks cannot be
# restored from this line alone.  TODO: recover the full method from the
# original source before re-enabling this code.
# -*- coding: utf-8 -*- import scrapy from googleprj.items import DmozItem, gurl from lxml import etree import re, time import urllib import requests from googleprj.settings import * from googleprj.redisM import RedisM import MySQLdb count = 0 rm = RedisM() db = MySQLdb.connect(MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB) cursor = db.cursor() class amazon(scrapy.Spider): name = "amazon" allowed_domains = ["amazon.com"] def start_requests(self): global rm, cursor, db if rm.getvalue("amazonpool") == None or len( rm.getvalue("amazonpool")) < 5: rm.pushredis("amazonpool", IPPOOL) sql = "select url from tb_tempurl where isvalid=1" try: cursor.execute(sql)