#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from spider.base import spider
from spider.base import request
from utils.database import mongo as db
from utils.log import log as _log
from lxml import html
import time
import random
import datetime
import re

log = _log('rq.' + __name__)


class email(spider):
    """Spider that seeds requests for a purchaser.mingluji.com category listing."""

    name = "email"
    factory = True

    def start(self, url=None):
        """Yield the seed requests: the category page, then its numbered sub-pages.

        Args:
            url: Accepted for interface compatibility but currently ignored;
                the category URL below always replaces it.
                NOTE(review): confirm whether callers expect ``url`` to be
                honoured.

        Yields:
            ``request`` objects whose callback is ``'link'``: one for the
            category URL itself and one for each ``<category>/<n>`` with
            ``n`` in ``0..22``.
        """
        # url = 'https://purchaser.mingluji.com/Gift'
        url = 'https://purchaser.mingluji.com/Electronic_and_Information_Products'
        yield request(url=url, callback='link')

        # The original loop called ``i.rstrip("%0A").strip()`` and threw the
        # result away (str methods return a new string), so it had no effect.
        # It is dropped rather than assigned: ``rstrip("%0A")`` strips the
        # *characters* '%', '0' and 'A', which would mangle URLs ending in 0.
        for page in range(0, 23):
            yield request(url=url + '/' + str(page), callback='link')
            # break

        # Disabled alternative seed source, kept from the original:
        # with open("erro.txt","r") as f:
        #     for i in f.readlines():
        #         i.rstrip("%0A").strip()
        #         if i is not None:
        #             yield request(url="https://purchaser.mingluji.com/BASIS_%26_BASIS_S.A.",callback='content')
        #             break

    def link(self, response):
        """Callback named by the requests yielded from ``start``.

        NOTE(review): the body of this method lies outside the visible
        source; only the signature is reproduced here. Restore the body
        from the complete file.
        """
from __future__ import unicode_literals, absolute_import, division, print_function

import os
import sys
import json
import time
import copy
import signal

# ``gevent.pool.Pool`` is used in ``GetProxy.__init__`` but ``gevent`` was
# never imported, which raised NameError on construction.
import gevent.pool
# NOTE(review): ``request`` (singular) is kept exactly as written; confirm
# this is the intended module.
import request
import geoip2.database

from .utils import signal_name, load_object
from utils.log import log as _log

logger = _log(__name__)


class GetProxy(object):
    """Holds the state used while collecting and validating proxies."""

    # Directory containing this module file.
    base_dir = os.path.dirname(os.path.realpath(__file__))

    def __init__(self, input_proxies_file=None, output_proxies_file=None):
        """Initialise empty collections and remember the input/output paths.

        Args:
            input_proxies_file: Stored as ``self.input_proxies_file``.
            output_proxies_file: Stored as ``self.output_proxies_file``.
        """
        # Greenlet pool with a size limit of 500.
        self.pool = gevent.pool.Pool(500)
        self.plugins = []
        self.web_proxies = []
        self.valid_proxies = []
        self.input_proxies = []
        self.input_proxies_file = input_proxies_file
        self.output_proxies_file = output_proxies_file
        self.proxies_hash = {}
        self.origin_ip = None
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from utils.log import log as _log

log = _log(__name__)


class request(object):
    """Plain attribute container describing one unit of crawl work."""

    def __init__(self, url=None, dupefilter=False, callback=None,
                 meta=None, setup=None, grab=None, timeout=1000,
                 result_ttl=1000, ttl=3000, queue_name='default'):
        """Store the request parameters as instance attributes.

        Args:
            url: Stored as ``self.url``.
            dupefilter: Stored as ``self.dupefilter``.
            callback: Stored as ``self.callback``.
            meta: Stored as ``self.meta``.
            setup: Stored as ``self.setup``.
            grab: Stored as ``self.grab``. Previously this argument was
                discarded and the attribute was always ``None``.
            timeout: Stored as ``self.timeout``.
            result_ttl: Stored as ``self.result_ttl``.
            ttl: Stored as ``self.ttl``.
            queue_name: Accepted but not stored.
                NOTE(review): left as-is because storing it would change
                what ``get('queue_name')`` returns for existing instances;
                confirm the intended behaviour.
        """
        self.url = url
        self.setup = setup
        self.dupefilter = dupefilter
        self.callback = callback
        self.meta = meta
        # Keep the caller-supplied value instead of overwriting it with None.
        self.grab = grab
        self.interval = None
        self.repeat = None
        self.timeout = timeout
        self.result_ttl = result_ttl
        self.ttl = ttl
        self.cron_string = None
        self.status = None
        self.connection = None
        # for item, value in kwargs.items():
        #     # if item=='meta':
        #     setattr(self,item,value)

    def get(self, key, default=None):
        """Return the attribute named ``key``, or ``default`` if it is not set."""
        return getattr(self, key, default)