Example #1
    def __init__(self):
        self.proxies_dict = []
        self.read_proxy("proxy_20160218.txt")
        Spider.__init__(self, len(self.proxies_dict))

        self.num_count = 0
        #self.filter_name = []
        self._aes_ = CCIQ_AES()
        # Full info of the company list found by querying the company name
        self.query_company_info = FileSaver("t-query_company_info.txt")
        # Partial info of the company list found by querying the company name
        #self.query_company_info_part = FileSaver("t-query_company_info_part.txt")
        # Company names for which the company-list query failed
        self.query_company_info_failure = FileSaver(
            "t-query_company_info_failure.txt")
        # Company names that have already been crawled
        self.already_cname = FileSaver("t-already_cname.txt")
        # Initialize the set of companies already crawled
        self.init_cname()
        # Company names whose detail query failed
        self.detail_failure = FileSaver("t-detail_failure1.txt")
        # Full company info available via the app, including shareholder info
        self.detail_company = FileSaver("t-detail_company.txt")
        self.extJson = self._aes_.encrypt(
            spider.util.utf8str({
                "cl_screenSize": "640x960",
                "cl_cookieId": "16923697-D73E-485A-BDCF-68FAD456AC02",
                "Org_iOS_Version": "2.0.1"
            }))
        self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)")
        self.headers = {"Content-Type": "application/json"}
Example #2
 def __init__(self, tc):
     Spider.__init__(self, tc)
     self._logport = 5556
     # self.channel = 'gsid'
     # self.job_queue = 'gsid'
     self.savebin = BinSaver("gongshang.bin")
     self.faillog = open("fail_list.txt", "w+b")
Example #3
    def __init__(self):
        self.is_debug = True
        self._can_use_proxy_num = 0
        if self.is_debug:
            Spider.__init__(self, 80)
        else:
            self.proxies_dict = []
            self.read_proxy("../../_ct_proxy/proxy_all_filter.txt")
            Spider.__init__(self, len(self.proxies_dict))
        self.error_cnt = 0
        self._aes_ = CCIQ_AES()
        # Full info of the company list found by querying the company name
        self.query_company_list = FileSaver("all_company_list.txt")

        # Company names that have already been crawled
        self.already_cname_list = FileSaver("all_company_list_already.txt")

        # Error types already encountered while crawling
        self.already_error_type = FileSaver("all_already_error_type.txt")

        self.need_flip_page_data = FileSaver("beijing_need_flip_page_data.txt")

        # Initialize the set of companies already crawled
        self.init_cname()
        self.extJsons = ["Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=",
                         "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=",
                         "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ="]

        self.user_agents = ["=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
                            "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
                            "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)"]

        self.bloom = set()
        self.proxy_error_cnt = 0
        self.lock = threading.Lock()
Example #4
    def __init__(self, thcnt, acs, type=1, process_num=0, max_process_cnt=1):
        Spider.__init__(self, thcnt)
        self._name = 'cvlpspider'
        self.lpm = MRLManager(acs, new_LPQYLogin)

        if type == 2:
            self.lpm = MRLManager(acs, new_LPLTLogin)
        self.pagestore = LPCVStore()
        self.hasher = spider.util.LocalHashChecker()
        self.lpm.ensure_login_do(None, lambda n: 1, None)
        self.lpm.release_obj()
        self.imgcnt = 0
        self._type = type

        self._process_num = process_num
        self._max_process_cnt = max_process_cnt

        self._spider_cnt = 0
        self._start_time = datetime.datetime.today()
        self.url_prefix = 'https://lpt.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0'
        if self._type == 2:
            self.url_prefix = 'https://h.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0'
        self.stat = spider.runtime.StatDict()

        self._limit_cnt = 200
Example #5
 def __init__(self, thcnt):
     Spider.__init__(self, thcnt)
     self.num_count = 0
     self.page_store = PageStoreJobUI()
     self.page_store.testmode = False
     self.bin_list = [
         'jobui_job_data1.bin', 'jobui_job_bu.bin', 'jobui_job_data2.bin'
     ]
Example #6
 def __init__(self, thcnt):
     Spider.__init__(self, thcnt)
     self.sessionReq = YouzyLogin()
     self.sessionReq.do_login(1)
     self.num_count = 0
     self.parse_count = 0
     self.savefile = CsvSaver("spider_url_zhuanke_np.csv", fixed + pfcolumn)
     self.__fail_urls = FileSaver("spider_url_fail_zhuanke_np.txt")
Example #7
 def __init__(self, thcnt, company):
     Spider.__init__(self, thcnt)
     self.default_headers = {'Cookie': 'guide=1'}
     self.pagestore = PageStore51()
     self._name = "jd51"
     self.list = []
     with open(company) as file_:
         for line in file_:
             self.list.append(line.strip())
Example #8
 def __init__(self):
     self.proxies_dict = []
     self.read_proxy("proxy_030814.txt")
     Spider.__init__(self, len(self.proxies_dict))
     self.success_count = 0
     self.request_count = 0
     self.__fail_urls = FileSaver("fail_urls.txt")
     self.start_time = time.time()
     self.page_store = PageStoreJobUI()
     self.page_store.testmode = False
Example #9
 def __init__(self):
     self.proxies_dict = []
     self.read_proxy("../spider/proxy/proxy.txt")
     Spider.__init__(self, len(self.proxies_dict))
     self.success_count = 0
     self.request_count = 0
     self.__fail_add_url = FileSaver("fail_add_url.txt")
     self.start_time = time.time()
     self.domain = self.read_domain()
     self.domain_file = FileSaver("domains.txt")
Example #10
 def __init__(self, thcnt):
     Spider.__init__(self, thcnt)
     # self.uc_count = 0
     # self.tc_count = 0
     # self.yy_count = 0
     self.all_count = 0
     self.bin_list = ['jobui_job_data1.bin', 'jobui_job_bu.bin', 'jobui_job_data2.bin']
     #self.bin_list = ['jobui_job.bin','jobui_job2.bin','jobui_job4.bin']
     self.domains = []
     self.file_s = FileSaver('domains.txt')
Example #11
 def __init__(self):
     self.proxies_dict = []
     self.read_proxy("../spider/proxy/proxy.txt")
     Spider.__init__(self, len(self.proxies_dict))
     self.success_count = 0
     self.request_count = 0
     self.__fail_ids = FileSaver("fail_ids.txt")
     self.start_time = time.time()
     self.page_store = PageStoreJobUI()
     self.page_store.testmode = True
Example #12
 def __init__(self, *proxyfile):
     threadcnt = self.prepare_proxy(*proxyfile)
     Spider.__init__(self, threadcnt)
     if not os.path.exists("data1"):
         os.makedirs("data1")
     self.namefile = open(
         "data1/corpname." + str(time.time()).split(".")[0] + ".txt", "w+b")
     self.failfile = open(
         "data1/fail." + str(time.time()).split(".")[0] + ".txt", "w+b")
     self.binsaver = BinSaver("data1/gsinfo" +
                              str(time.time()).split(".")[0] + ".bin")
Example #13
 def __init__(self, thcnt):
     Spider.__init__(self, thcnt)
     self.request = SessionRequests()
     self.view_state = None
     self.event_valid = None
     self.rand = None
     self.loc = "浙江"
     self.data_file = FileSaver("浙江_data.txt")
     self.have_get_url_file = FileSaver("浙江_get_url.txt")
     self.init_already()
     self.login("38037395", "773950")
Example #14
 def _crawl_link(self, link):
     spider = Spider(link, self.user_agent, get_tor_session(9150))
     spider.crawl()
     self.log.debug(
         'Creating document for: {0}, title {1}, body: {2}'.format(
             link, spider.title, spider.body[0::50]))
     self._create_document(link, spider.title, spider.html)
     self._manager.mark_link_crawled(link, spider.success)
     if spider.success:
         return spider.links
     else:
         return []
Example #15
 def __init__(self, thcnt):
     Spider.__init__(self, thcnt)
     self.fail_file = FileSaver("fail2db.txt")
     self.sus_file = FileSaver("SZ2DB.txt")
     #self.invest_detail_url = FileSaver("invest_detail_url.txt")
     self.init_filter()
     self.proxies = {'http': 'http://*****:*****@haohr.com:50001', 'https': 'https://*****:*****@haohr.com:50001'}
         #{'http': 'http://*****:*****@192.168.1.39:3428', 'https': 'https://*****:*****@192.168.1.39:3428'}
     self.select_user_agent("=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36")
     self.all_count = 0
     self.update_count = 0
     self.new_count = 0
     self.fail_count = 0
Example #16
 def __init__(self):
     #self.proxies_dict = []
     #self.read_proxy("../spider/proxy/proxy.txt")
     #Spider.__init__(self, len(self.proxies_dict))
     Spider.__init__(self, 1)
     self.num_count = 0
     self._aes_ = CCIQ_AES()
     # Full company info retrievable from the app
     self.save_success = FileSaver("exist_company.txt")
     # Partial company info retrievable from the app
     self.part_success = FileSaver("part_company.txt")
     # Company names whose query failed
     self.fail_name = FileSaver("fail_name.txt")
Example #17
 def __init__(self, thcnt):
     Spider.__init__(self, thcnt)
     self.counter = 0
     self.skipcnt = 0
     self.skip_j = 1
     self.ofile = FileSaver('people_result')
     self.headers = {
         'X-Requested-With': 'XMLHttpRequest',
         #'Referer':'https://www.baidu.com/',
         'DNT': 1,
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
     }
Example #18
    def __init__(self):
        Spider.__init__(self, 20)
        self._aes_ = CCIQ_AES()

        #self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)")
        self.proxy_filter = FileSaver("proxy_filter_030309_detail1.txt")


        self.extJsons = ['"Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr\/uapICH92P\/Crryt63u28aP4QP665AzcT\/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4="',
                         '"ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ\/kgBkJt\/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a\/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4="',
                         '"ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ\/kgBkJt\/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49\/aDwt3NZNp4TGa5iBFpYLm69F\/6PPFoXIR\/Aw5p48\/\/8OgZFpddDUwQ="']

        self.user_agents = ["=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
                            "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
                            "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)"]
Example #19
class Start:
    thisSpider = Spider()
    link_cache = LinkCache()
    link = None

    def __init__(self):
        pass

    def run(self):
        RedisCon().print()
        self.link = 'https://movie.douban.com/'
        self.loop_link()
        # json = self.thisSpider.get_json('https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start=0')
        # print('The JSON DATA:',json)

    def loop_link(self):
        while self.link is not None:
            print('Current link:', self.link)
            html = self.thisSpider.get_html(self.link)
            links = AnalysisLink().check_link(html)
            length = len(links)
            print('Parsed', length, 'links this time')
            for i in range(length):
                self.link_cache.add_link(links[i])

            self.link = str(self.link_cache.get_link())
            print('Current number of cached links:', self.link_cache.get_total())
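Example #20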
 def news_content(self, url):
     x = Spider(url)
     print "------------------------------------"
     print url, self.news_type
     news_content_page = self.news_page_object(x.get_page())
     news = {}
     news = news_content_page.news_all(news)
     news['news_url'] = url.encode('utf8')
     news['news_type'] = self.news_type
     if self.database_object.is_existed(url):
         print 'this url already exists'
         return False
     else:
         database = self.database_object(**news)
         database.save()
     return True
Example #21
 def _do_requests(self, url, **kwargs):
     r = Spider._do_requests(self, url, **kwargs)
     if r is None:
         return r
     if r.text.strip() == u"":
         raise ProxyError('ip blocked.')
     return r
Example #22
    def setUp(self):
        client = pymongo.MongoClient()
        client.drop_database('test')
        db = client['test']
        self.term_code = '021'
        self.major_code = '0120123111'
        self.p = mock.patch(
            'spider.spider.Spider.iter_term_and_major',
            lambda v: ((self.term_code, None), (self.term_code, self.major_code))
        )
        self.p.start()
        self.shortcut = hfut.Student(2013217413, '123456789012', 'XC')

        self.job_manager = JobManager(pool_size=20)
        self.db_manager = DatabaseManager(db, batch_size=80)

        self.j = Spider(self.shortcut, self.job_manager, self.db_manager)
Example #23
 def __init__(self):
     spider.util.use_utf8()
     self.is_debug = True
     if self.is_debug:
         Spider.__init__(self, 1)
     else:
         self.proxies_dict = []
         self.read_proxy("../../_ct_proxy/proxy_all_filter.txt")
         Spider.__init__(self, len(self.proxies_dict))
     self._curltls = threading.local()
     self.success_name = FileSaver("query_success_name.txt")
     self.success_detail = FileSaver("query_success_detail.txt")
     # Initialize URLs that have already been crawled
     self.init_spider_url()
     self.cnt = 1
     self.run_time = time.time()
     self.lock = threading.Lock()
Example #24
 def __init__(self):
     self.is_debug = True
     if self.is_debug:
         Spider.__init__(self, 1)
     else:
         self.proxies_dict = []
         self.read_proxy("../spider/proxy/proxy.txt")
         Spider.__init__(self, len(self.proxies_dict))
     self.success_count = 0
     self.request_count = 0
     self.__fail_ids = FileSaver("fail_url.txt")
     self.start_time = time.time()
     self.page_store = PageStoreJobUI()
     self.page_store.testmode = True
     self.init_time = time.time()
     self.already_url = FileSaver("already_url.txt")
     self.init_already_url()
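Example #25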
 def progress(self):
     while True:
         news_next_url = self.news_base_url
         while news_next_url:
             print news_next_url
             if news_next_url is not None:
                 x = Spider(news_next_url)
                 news_list_source = self.main_page_object(x.get_page())
                 news_list = news_list_source.find_news_list(self.base_url)
                 if not self.news_content_by_list(news_list):
                     break
                 news_next_url = news_list_source.find_news_next_page(self.base_url)
                 time.sleep(5)
             else:
                 break
         print 'sleep 300s current_thread name is %s' % threading.current_thread().getName()
         time.sleep(300)
Example #26
 def __init__(self):
     spider.util.use_utf8()
     self.saver = RunGansu.Saver()
     self.is_debug = True
     if self.is_debug:
         Spider.__init__(self, 200)
         # self.proxies_dict = [{'http': 'http://*****:*****@106.75.134.189:18889',
         #                       'https': 'https://*****:*****@106.75.134.189:18889'},
         #                      {'http': 'http://*****:*****@106.75.134.190:18889',
         #                       'https': 'https://*****:*****@106.75.134.190:18889'},
         #                      {'http': 'http://*****:*****@106.75.134.191:18889',
         #                       'https': 'https://*****:*****@106.75.134.191:18889'},
         #                      {'http': 'http://*****:*****@106.75.134.192:18889',
         #                       'https': 'https://*****:*****@106.75.134.192:18889'},
         #                      {'http': 'http://*****:*****@106.75.134.193:18889',
         #                       'https': 'https://*****:*****@106.75.134.193:18889'}]
         self.proxies_dict = [{
             'http':
             'http://*****:*****@192.168.1.39:3428',
             'https':
             'https://*****:*****@192.168.1.39:3428'
         }, {
             'http':
             'http://*****:*****@121.40.186.237:50001',
             'https':
             'https://*****:*****@121.40.186.237:50001'
         }]
         #self.proxies_dict = [{}]
         self.gsweb = SearchGSWebGansu(self.saver)
     else:
         self.proxies_dict = []
         self.read_proxy("../../_ct_proxy/proxy_040510.txt")
         Spider.__init__(self, len(self.proxies_dict))
         self._curltls = threading.local()
     self.gswebs = {}
     # Keywords that have already been queried successfully
     self.success_kw = FileSaver("gsinfo_Gansu_success_kw.txt")
     # For list results found, write them to this file once crawled successfully, to avoid re-crawling
     self.success_queries = FileSaver("gsinfo_Gansu_success_queries.txt")
     # Initialize URLs that have already been crawled
     #self.init_spider_url()
     #time.sleep(2)
     self.cnt = 1
     self.run_time = time.time()
     self.cnt_q = 1
Example #27
 def __init__(self, thcnt):
     Spider.__init__(self, thcnt)
     self._user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:42.0) Gecko/20100101 Firefox/42.0'
     self.baseurl = 'http://www.ewt360.com/LNLQXX/SearchResult?act=mark'
     self.headers = {'Referer': self.baseurl}
     #scores = range(450,750+1) + range(449, 0, -1) + [0]
     scores = range(750, 0, -1) + [0]
     self.possmap = {
         'Years': range(2009, 2014 + 1),
         'WL': ['l', 'w'],
         'BZ': ['b', 'z'],
         'PiCi': 0,
         'Score': scores,
         'ProvinceCode': 0,
         'page': 1
     }
     self.bs = BinSaver("fo.bin")
     self.racer = RaceValueByKey()
Example #28
def run_threading(words):
    crawl = Spider()
    threads = [threading.Thread(target=crawl.crawl_each, args=(w,)) for w in words]

    for t in threads:
        t.start()

    for t in threads:
        t.join()
Example #29
 def __init__(self, thcnt):
     Spider.__init__(self, thcnt)
     self._name = "GuangxiCourtSpider"
     self.test_mode = False
     self.enable_mainjob_timedlock = False
     self.prlist = []
     self.pagestore = GXCourtStore()
     self._paper_url_format = 'http://ws.gxcourt.gov.cn:23001/WDocManage.asmx/GetDocFileInfo?param={"Param":"{\'DocID\':\'%s\'}"}'
     self.case_types = [
         {'key': '案件种类', 'value': 1, 'info': '案.案件种类', 'count': 67381},
         {'key': '案件种类', 'value': 2, 'info': '案.案件种类', 'count': 178674},
         {'key': '案件种类', 'value': 3, 'info': '案.案件种类', 'count': 6839},
         {'key': '案件种类', 'value': 4, 'info': '案.案件种类', 'count': 46387},
         {'key': '案件涉及', 'value': 12, 'info': '案.J案件特征.J民事案件特征.J案件涉及.案件涉及', 'count': 1618},
         {'key': '案件类型', 'value': 16, 'info': '案.CLS', 'count': 40}
     ]
     self.pagesize = 20
     self.job_file = 'queries'
     self.param_format = "{'Param':{'Dic':[{'@Key':'%s','@Value':'%d','@SearchType':'eq'},{'@Key':'searchType','@Value':'高级检索'}]}}"
Example #30
class UpSpider(QThread):
    spiderDone = pyqtSignal(dict)

    def __init__(self, uid):
        super(UpSpider, self).__init__()
        self.spider = Spider('up', uid, 0)

    def run(self):
        for item in self.spider.parse():
            self.spiderDone.emit(item)
        self.exit(0)
Example #31
 def __init__(self):
     spider.util.use_utf8()
     self.is_debug = False
     if self.is_debug:
         Spider.__init__(self, 1)
         self.gsweb = gs_guangdong.SearchGSWebGuangdong(None)
     else:
         self.proxies_dict = []
         self.read_proxy("../../_ct_proxy/proxy_041209.txt")
         Spider.__init__(self, len(self.proxies_dict))
         self._curltls = threading.local()
     self.gswebs = {}
     self.already = FileSaver("gsinfo_out_spidered_cname1.txt")
     self.success = FileSaver("gsinfo_out.txt")
     self.result_null = FileSaver("gsinfo_out_null.txt")
     # Initialize the set of companies already crawled
     self.init_cname()
     time.sleep(2)
     self.cnt = 1
     self.run_time = time.time()
Example #32
    def __init__(self, thcnt, channel):

        # Subclasses must override this
        self.log = None

        Spider.__init__(self, thcnt)
        self.channel = channel
        self._name = "%s_download" % channel

        self._download_url = config.DOWNLOAD_URLS.get(channel, '')
        self._retry_times = 3

        assert self._download_url

        # cv status
        self._cv_status = {}

        # test search
        test_search = threading.Thread(target=self.test_search)
        test_search.start()
Example #33
        parser.error('invalid verbosity')

    return args

if __name__ == '__main__':
    args = parse_args()
    if args.verbose:
        level = logging.DEBUG
    elif args.quiet:
        level = logging.WARN
    elif args.silent:
        level = SILENT
    else:
        level = logging.INFO
    log(level=level)

    spider = Spider.site(args.target,
                         robots=args.robots,
                         sitemap=args.sitemap,
                         cookies=args.cookies,
                         workers=args.workers)
    if args.html:
        spider.events.register(html.Handler())
    if args.response:
        spider.events.register(response.Handler())
    if args.skip_forms:
        spider.events.register(skip_forms.Handler())
    if args.request_lulz:
        spider.events.register(request_lulz.Handler())
    spider.crawl()