Example no. 1
# -*- coding: utf-8 -*-
import scrapy
from googleprj.items import DmozItem, gurl
from lxml import etree
import re, time
import urllib
import requests
from googleprj.settings import *
from googleprj.redisM import RedisM
import MySQLdb

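# Shared module-level state: a Redis helper used to manage a proxy pool
# ("amazonpool") and a MySQL connection built from the googleprj settings.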
count = 0
rm = RedisM()
db = MySQLdb.connect(MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB)
cursor = db.cursor()
params = {"Home & Kitchen":["1055398","1248816011","1248915011"],"Beauty & Personal Care":["3760911","1248798011","1248873011"],\
          "Electronics":["172282","1248801011","1248879011"],"Health, Household & Baby Care":["3760901","1248810011","1248903011"],\
          "Pet Supplies":["2619533011","2661601011","2661618011"],"Tools & Home Improvement":["228013","1248813011","1248909011"],\
          "Office Products":["1064954","1248831011","1248945011"]}  #{"bigname":["id","include","star"]"}


class amazon(scrapy.Spider):

    name = "amazonsearch"
    allowed_domains = ["amazon.com"]

    def start_requests(self):
        global rm, cursor, db
        # Top up the shared proxy pool in Redis when it is missing or running low.
        pool = rm.getvalue("amazonpool")
        if pool is None or len(pool) < 5:
            rm.pushredis("amazonpool", IPPOOL)
Example no. 2
# -*- coding: utf-8 -*-
import scrapy
from googleprj.items import DmozItem, gurl
from lxml import etree
import re, time
import urllib
import requests
from googleprj.settings import *
from googleprj.redisM import RedisM
import MySQLdb

count = 0
rm = RedisM()
db = MySQLdb.connect(MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB)
cursor = db.cursor()


class amazon(scrapy.Spider):

    name = "amazon"
    allowed_domains = ["amazon.com"]

    def start_requests(self):
        global rm, cursor, db
        # Top up the shared proxy pool in Redis when it is missing or running low.
        pool = rm.getvalue("amazonpool")
        if pool is None or len(pool) < 5:
            rm.pushredis("amazonpool", IPPOOL)

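        # Fetch the URLs flagged as valid from the tb_tempurl table to seed the crawl.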
        sql = "select url from tb_tempurl where isvalid=1"
        try:
            cursor.execute(sql)