Exemple #1
0
#-*- coding:utf-8 -*-
#!/usr/bin/python
from __future__ import absolute_import
from spider.base import spider
from spider.base import request
from utils.database import  mongo as db
from utils.log import log as _log
from lxml import html
import time,random
import datetime
import re
log = _log('rq.'+__name__)
class email(spider):
    name = "email"
    factory = True
    def start(self, url=None):
        """Yield the seed requests for one category and its numbered sub-pages.

        The first request targets the category page itself; the following
        ones target ``<category>/0`` through ``<category>/22``.  Every
        request names ``'link'`` as its callback.

        NOTE(review): ``url`` is overwritten unconditionally below, so a
        caller-supplied value is ignored -- confirm whether that is intended.
        """
        # Alternate category: 'https://purchaser.mingluji.com/Gift'
        url = 'https://purchaser.mingluji.com/Electronic_and_Information_Products'
        yield request(url=url, callback='link')

        # No stripping is applied to the page URLs: they are assembled right
        # here and carry no trailing whitespace.  Note that
        # ``str.rstrip("%0A")`` removes any trailing '%', '0' or 'A'
        # characters (not the literal suffix), so applying it would corrupt
        # page numbers ending in 0.
        for page in range(0, 23):
            yield request(url=url + '/' + str(page), callback='link')
        # with open("erro.txt","r") as f:
        #     for i in f.readlines():
        #         i.rstrip("%0A").strip()
        #         if  i is not None:
        #             yield request(url="https://purchaser.mingluji.com/BASIS_%26_BASIS_S.A.",callback='content')
                # break
    def link(self,response):
Exemple #2
0
from __future__ import unicode_literals, absolute_import, division, print_function

import os
import sys
import json
import time
import copy
import signal

import request
import geoip2.database

from .utils import signal_name, load_object
from utils.log import log as _log
logger = _log(__name__)


class GetProxy(object):
    """State container set up with a worker pool, empty proxy/plugin lists
    and the input/output file arguments given to the constructor."""

    # Directory containing this source file (path resolved via realpath).
    base_dir = os.path.dirname(os.path.realpath(__file__))

    def __init__(self, input_proxies_file=None, output_proxies_file=None):
        """Initialise every instance attribute to its starting value.

        :param input_proxies_file: stored as-is on the instance; presumably
            a path to previously collected proxies -- TODO confirm.
        :param output_proxies_file: stored as-is on the instance; presumably
            a path where results are written -- TODO confirm.
        """
        # NOTE(review): ``gevent`` does not appear among the imports visible
        # in this snippet; confirm it is imported elsewhere in the module.
        self.pool = gevent.pool.Pool(500)

        # Four independent, initially empty lists.
        self.plugins, self.web_proxies = [], []
        self.valid_proxies, self.input_proxies = [], []

        # Constructor arguments are kept untouched.
        self.input_proxies_file, self.output_proxies_file = (
            input_proxies_file, output_proxies_file)

        self.proxies_hash = dict()
        self.origin_ip = None
Exemple #3
0
#-*- coding:utf-8 -*-
#!/usr/bin/python
from __future__ import absolute_import
from utils.log import log as _log
log = _log(__name__)
class request(object):
    """Plain attribute container describing one request.

    The constructor copies its arguments onto the instance and initialises
    a few extra bookkeeping attributes to ``None``.  No validation or I/O
    happens here.
    """

    def __init__(self, url=None, dupefilter=False, callback=None,
                 meta=None, setup=None, grab=None, timeout=1000,
                 result_ttl=1000, ttl=3000, queue_name='default'):
        """Store the request parameters on the instance.

        :param url: target URL, stored as ``self.url``.
        :param dupefilter: stored as ``self.dupefilter``; presumably toggles
            duplicate filtering downstream -- TODO confirm.
        :param callback: stored as ``self.callback``; callers in this code
            base pass a method name string such as ``'link'``.
        :param meta: stored as ``self.meta``.
        :param setup: stored as ``self.setup``.
        :param grab: stored as ``self.grab``.
        :param timeout: stored as ``self.timeout`` (default 1000).
        :param result_ttl: stored as ``self.result_ttl`` (default 1000).
        :param ttl: stored as ``self.ttl`` (default 3000).
        :param queue_name: accepted for interface compatibility.
            NOTE(review): it is not stored on the instance, so
            ``get('queue_name')`` falls back to the default -- confirm
            whether that is intended.
        """
        self.url = url
        self.setup = setup
        self.dupefilter = dupefilter
        self.callback = callback
        self.meta = meta
        # Keep the caller-supplied value; it used to be discarded by an
        # unconditional ``self.grab = None``.
        self.grab = grab
        self.interval = None
        self.repeat = None
        self.timeout = timeout
        self.result_ttl = result_ttl
        self.ttl = ttl
        self.cron_string = None
        self.status = None
        self.connection = None

    def get(self, key, default=None):
        """Return attribute ``key`` of this request, or ``default`` if the
        instance has no such attribute."""
        return getattr(self, key, default)