def __init__(self): self.proxies_dict = [] self.read_proxy("proxy_20160218.txt") Spider.__init__(self, len(self.proxies_dict)) self.num_count = 0 #self.filter_name = [] self._aes_ = CCIQ_AES() #根据公司名字查询到的公司列表全部信息 self.query_company_info = FileSaver("t-query_company_info.txt") #根据公司名字查询到的公司列表局部信息 #self.query_company_info_part = FileSaver("t-query_company_info_part.txt") #根据公司名字查询到的公司列表信息失败的 self.query_company_info_failure = FileSaver( "t-query_company_info_failure.txt") #已经爬取过的公司名 self.already_cname = FileSaver("t-already_cname.txt") #初始化已经爬过的公司 self.init_cname() #查询详情失败的公司名 self.detail_failure = FileSaver("t-detail_failure1.txt") #APP可以拿到的公司全部信息 包含股东信息 self.detail_company = FileSaver("t-detail_company.txt") self.extJson = self._aes_.encrypt( spider.util.utf8str({ "cl_screenSize": "640x960", "cl_cookieId": "16923697-D73E-485A-BDCF-68FAD456AC02", "Org_iOS_Version": "2.0.1" })) self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)") self.headers = {"Content-Type": "application/json"}
def __init__(self, tc):
    Spider.__init__(self, tc)
    self._logport = 5556
    # self.channel = 'gsid'
    # self.job_queue = 'gsid'
    self.savebin = BinSaver("gongshang.bin")
    self.faillog = open("fail_list.txt", "w+b")
def __init__(self):
    self.is_debug = True
    self._can_use_proxy_num = 0
    if self.is_debug:
        Spider.__init__(self, 80)
    else:
        self.proxies_dict = []
        self.read_proxy("../../_ct_proxy/proxy_all_filter.txt")
        Spider.__init__(self, len(self.proxies_dict))
    self.error_cnt = 0
    self._aes_ = CCIQ_AES()
    # Full info of the company list returned by a company-name query
    self.query_company_list = FileSaver("all_company_list.txt")
    # Company names that have already been crawled
    self.already_cname_list = FileSaver("all_company_list_already.txt")
    # Error types already seen while crawling
    self.already_error_type = FileSaver("all_already_error_type.txt")
    self.need_flip_page_data = FileSaver("beijing_need_flip_page_data.txt")
    # Initialize the set of companies already crawled
    self.init_cname()
    self.extJsons = ["Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=",
                     "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=",
                     "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ="]
    self.user_agents = ["=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
                        "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
                        "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)"]
    self.bloom = set()
    self.proxy_error_cnt = 0
    self.lock = threading.Lock()
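# How a request picks among self.extJsons and self.user_agents is not shown in
# these snippets. A minimal sketch of one plausible rotation (the helper name
# and the assumption that any extJson / user-agent pairing is accepted by the
# server are both hypothetical):
import random

def pick_request_identity(self):
    i = random.randrange(len(self.extJsons))
    return self.extJsons[i], self.user_agents[i % len(self.user_agents)]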
def __init__(self, thcnt, acs, type=1, process_num=0, max_process_cnt=1):
    Spider.__init__(self, thcnt)
    self._name = 'cvlpspider'
    self.lpm = MRLManager(acs, new_LPQYLogin)
    if type == 2:
        self.lpm = MRLManager(acs, new_LPLTLogin)
    self.pagestore = LPCVStore()
    self.hasher = spider.util.LocalHashChecker()
    self.lpm.ensure_login_do(None, lambda n: 1, None)
    self.lpm.release_obj()
    self.imgcnt = 0
    self._type = type
    self._process_num = process_num
    self._max_process_cnt = max_process_cnt
    self._spider_cnt = 0
    self._start_time = datetime.datetime.today()
    self.url_prefix = 'https://lpt.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0'
    if self._type == 2:
        self.url_prefix = 'https://h.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0'
    self.stat = spider.runtime.StatDict()
    self._limit_cnt = 200
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self.num_count = 0
    self.page_store = PageStoreJobUI()
    self.page_store.testmode = False
    self.bin_list = ['jobui_job_data1.bin', 'jobui_job_bu.bin', 'jobui_job_data2.bin']
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self.sessionReq = YouzyLogin()
    self.sessionReq.do_login(1)
    self.num_count = 0
    self.parse_count = 0
    self.savefile = CsvSaver("spider_url_zhuanke_np.csv", fixed + pfcolumn)
    self.__fail_urls = FileSaver("spider_url_fail_zhuanke_np.txt")
def __init__(self, thcnt, company):
    Spider.__init__(self, thcnt)
    self.default_headers = {'Cookie': 'guide=1'}
    self.pagestore = PageStore51()
    self._name = "jd51"
    self.list = []
    with open(company) as file_:
        for line in file_:
            self.list.append(line.strip())
def __init__(self): self.proxies_dict = [] self.read_proxy("proxy_030814.txt") Spider.__init__(self, len(self.proxies_dict)) self.success_count = 0 self.request_count = 0 self.__fail_urls = FileSaver("fail_urls.txt") self.start_time = time.time() self.page_store = PageStoreJobUI() self.page_store.testmode = False
def __init__(self): self.proxies_dict = [] self.read_proxy("../spider/proxy/proxy.txt") Spider.__init__(self, len(self.proxies_dict)) self.success_count = 0 self.request_count = 0 self.__fail_add_url = FileSaver("fail_add_url.txt") self.start_time = time.time() self.domain = self.read_domain() self.domain_file = FileSaver("domains.txt")
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    # self.uc_count = 0
    # self.tc_count = 0
    # self.yy_count = 0
    self.all_count = 0
    self.bin_list = ['jobui_job_data1.bin', 'jobui_job_bu.bin', 'jobui_job_data2.bin']
    #self.bin_list = ['jobui_job.bin', 'jobui_job2.bin', 'jobui_job4.bin']
    self.domains = []
    self.file_s = FileSaver('domains.txt')
def __init__(self): self.proxies_dict = [] self.read_proxy("../spider/proxy/proxy.txt") Spider.__init__(self, len(self.proxies_dict)) self.success_count = 0 self.request_count = 0 self.__fail_ids = FileSaver("fail_ids.txt") self.start_time = time.time() self.page_store = PageStoreJobUI() self.page_store.testmode = True
def __init__(self, *proxyfile):
    threadcnt = self.prepare_proxy(*proxyfile)
    Spider.__init__(self, threadcnt)
    if not os.path.exists("data1"):
        os.makedirs("data1")
    self.namefile = open("data1/corpname." + str(time.time()).split(".")[0] + ".txt", "w+b")
    self.failfile = open("data1/fail." + str(time.time()).split(".")[0] + ".txt", "w+b")
    self.binsaver = BinSaver("data1/gsinfo" + str(time.time()).split(".")[0] + ".bin")
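# prepare_proxy() is referenced above but not defined in these snippets. A
# minimal sketch, assuming it loads every listed proxy file and the spider uses
# one worker thread per proxy; the real return value and file format may differ.
def prepare_proxy(self, *proxyfile):
    self.proxies_dict = []
    for fname in proxyfile:
        with open(fname) as f:
            for line in f:
                addr = line.strip()
                if addr:
                    self.proxies_dict.append({"http": "http://" + addr,
                                              "https": "https://" + addr})
    return len(self.proxies_dict)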
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self.request = SessionRequests()
    self.view_state = None
    self.event_valid = None
    self.rand = None
    self.loc = "浙江"
    self.data_file = FileSaver("浙江_data.txt")
    self.have_get_url_file = FileSaver("浙江_get_url.txt")
    self.init_already()
    self.login("38037395", "773950")
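# view_state and event_valid suggest an ASP.NET WebForms login flow. A minimal
# sketch of how those hidden fields might be pulled from the login page before
# posting credentials; the helper name and the exact page markup are
# assumptions, not the project's actual login() implementation.
import re

def load_form_tokens(self, html):
    m = re.search(r'id="__VIEWSTATE"\s+value="([^"]*)"', html)
    self.view_state = m.group(1) if m else None
    m = re.search(r'id="__EVENTVALIDATION"\s+value="([^"]*)"', html)
    self.event_valid = m.group(1) if m else None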
def _crawl_link(self, link):
    spider = Spider(link, self.user_agent, get_tor_session(9150))
    spider.crawl()
    self.log.debug('Creating document for: {0}, title {1}, body: {2}'.format(
        link, spider.title, spider.body[:50]))  # first 50 characters as a preview
    self._create_document(link, spider.title, spider.html)
    self._manager.mark_link_crawled(link, spider.success)
    if spider.success:
        return spider.links
    else:
        return []
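# get_tor_session() is used above but not defined here. A minimal sketch,
# assuming a local Tor SOCKS proxy (9150 is the Tor Browser default) and the
# requests library installed with SOCKS support (requests[socks]):
import requests

def get_tor_session(port=9150):
    session = requests.Session()
    session.proxies = {
        'http': 'socks5h://127.0.0.1:{}'.format(port),
        'https': 'socks5h://127.0.0.1:{}'.format(port),
    }
    return session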
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self.fail_file = FileSaver("fail2db.txt")
    self.sus_file = FileSaver("SZ2DB.txt")
    #self.invest_detail_url = FileSaver("invest_detail_url.txt")
    self.init_filter()
    self.proxies = {'http': 'http://*****:*****@haohr.com:50001',
                    'https': 'https://*****:*****@haohr.com:50001'}
    #{'http': 'http://*****:*****@192.168.1.39:3428', 'https': 'https://*****:*****@192.168.1.39:3428'}
    self.select_user_agent("=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36")
    self.all_count = 0
    self.update_count = 0
    self.new_count = 0
    self.fail_count = 0
def __init__(self):
    #self.proxies_dict = []
    #self.read_proxy("../spider/proxy/proxy.txt")
    #Spider.__init__(self, len(self.proxies_dict))
    Spider.__init__(self, 1)
    self.num_count = 0
    self._aes_ = CCIQ_AES()
    # Full company info the APP can retrieve
    self.save_success = FileSaver("exist_company.txt")
    # Partial company info the APP can retrieve
    self.part_success = FileSaver("part_company.txt")
    # Company names whose query failed
    self.fail_name = FileSaver("fail_name.txt")
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self.counter = 0
    self.skipcnt = 0
    self.skip_j = 1
    self.ofile = FileSaver('people_result')
    self.headers = {
        'X-Requested-With': 'XMLHttpRequest',
        #'Referer': 'https://www.baidu.com/',
        'DNT': 1,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
    }
def __init__(self):
    Spider.__init__(self, 20)
    self._aes_ = CCIQ_AES()
    #self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)")
    self.proxy_filter = FileSaver("proxy_filter_030309_detail1.txt")
    self.extJsons = ['"Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr\/uapICH92P\/Crryt63u28aP4QP665AzcT\/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4="',
                     '"ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ\/kgBkJt\/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a\/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4="',
                     '"ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ\/kgBkJt\/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49\/aDwt3NZNp4TGa5iBFpYLm69F\/6PPFoXIR\/Aw5p48\/\/8OgZFpddDUwQ="']
    self.user_agents = ["=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
                        "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
                        "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)"]
class Start:
    thisSpider = Spider()
    link_cache = LinkCache()
    link = None

    def __init__(self):
        pass

    def run(self):
        RedisCon().print()
        self.link = 'https://movie.douban.com/'
        self.loop_link()
        # json = self.thisSpider.get_json('https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start=0')
        # print('The JSON DATA:', json)

    def loop_link(self):
        while self.link is not None:
            print('Current link:', self.link)
            html = self.thisSpider.get_html(self.link)
            links = AnalysisLink().check_link(html)
            length = len(links)
            print('Links parsed this round:', length)
            for i in range(length):
                self.link_cache.add_link(links[i])
            self.link = str(self.link_cache.get_link())
            print('Links currently cached:', self.link_cache.get_total())
def news_content(self, url):
    x = Spider(url)
    print "------------------------------------"
    print url, self.news_type
    news_content_page = self.news_page_object(x.get_page())
    news = {}
    news = news_content_page.news_all(news)
    news['news_url'] = url.encode('utf8')
    news['news_type'] = self.news_type
    if self.database_object.is_existed(url):
        print 'this url already exists'
        return False
    else:
        database = self.database_object(**news)
        database.save()
        return True
def _do_requests(self, url, **kwargs):
    r = Spider._do_requests(self, url, **kwargs)
    if r is None:
        return r
    if r.text.strip() == u"":
        raise ProxyError('ip blocked.')
    return r
def setUp(self):
    client = pymongo.MongoClient()
    client.drop_database('test')
    db = client['test']
    self.term_code = '021'
    self.major_code = '0120123111'
    self.p = mock.patch(
        'spider.spider.Spider.iter_term_and_major',
        lambda v: ((self.term_code, None), (self.term_code, self.major_code))
    )
    self.p.start()
    self.shortcut = hfut.Student(2013217413, '123456789012', 'XC')
    self.job_manager = JobManager(pool_size=20)
    self.db_manager = DatabaseManager(db, batch_size=80)
    self.j = Spider(self.shortcut, self.job_manager, self.db_manager)
def __init__(self):
    spider.util.use_utf8()
    self.is_debug = True
    if self.is_debug:
        Spider.__init__(self, 1)
    else:
        self.proxies_dict = []
        self.read_proxy("../../_ct_proxy/proxy_all_filter.txt")
        Spider.__init__(self, len(self.proxies_dict))
        self._curltls = threading.local()
    self.success_name = FileSaver("query_success_name.txt")
    self.success_detail = FileSaver("query_success_detail.txt")
    # Initialize the URLs that have already been crawled
    self.init_spider_url()
    self.cnt = 1
    self.run_time = time.time()
    self.lock = threading.Lock()
def __init__(self):
    self.is_debug = True
    if self.is_debug:
        Spider.__init__(self, 1)
    else:
        self.proxies_dict = []
        self.read_proxy("../spider/proxy/proxy.txt")
        Spider.__init__(self, len(self.proxies_dict))
    self.success_count = 0
    self.request_count = 0
    self.__fail_ids = FileSaver("fail_url.txt")
    self.start_time = time.time()
    self.page_store = PageStoreJobUI()
    self.page_store.testmode = True
    self.init_time = time.time()
    self.already_url = FileSaver("already_url.txt")
    self.init_already_url()
def progress(self):
    while True:
        news_next_url = self.news_base_url
        while news_next_url:
            print news_next_url
            if news_next_url is not None:
                x = Spider(news_next_url)
                news_list_source = self.main_page_object(x.get_page())
                news_list = news_list_source.find_news_list(self.base_url)
                if not self.news_content_by_list(news_list):
                    break
                news_next_url = news_list_source.find_news_next_page(self.base_url)
                time.sleep(5)
            else:
                break
        print 'sleep 300s current_thread name is %s' % threading.current_thread().getName()
        time.sleep(300)
def __init__(self):
    spider.util.use_utf8()
    self.saver = RunGansu.Saver()
    self.is_debug = True
    if self.is_debug:
        Spider.__init__(self, 200)
        # self.proxies_dict = [{'http': 'http://*****:*****@106.75.134.189:18889',
        #                       'https': 'https://*****:*****@106.75.134.189:18889'},
        #                      {'http': 'http://*****:*****@106.75.134.190:18889',
        #                       'https': 'https://*****:*****@106.75.134.190:18889'},
        #                      {'http': 'http://*****:*****@106.75.134.191:18889',
        #                       'https': 'https://*****:*****@106.75.134.191:18889'},
        #                      {'http': 'http://*****:*****@106.75.134.192:18889',
        #                       'https': 'https://*****:*****@106.75.134.192:18889'},
        #                      {'http': 'http://*****:*****@106.75.134.193:18889',
        #                       'https': 'https://*****:*****@106.75.134.193:18889'}]
        self.proxies_dict = [{'http': 'http://*****:*****@192.168.1.39:3428',
                              'https': 'https://*****:*****@192.168.1.39:3428'},
                             {'http': 'http://*****:*****@121.40.186.237:50001',
                              'https': 'https://*****:*****@121.40.186.237:50001'}]
        #self.proxies_dict = [{}]
        self.gsweb = SearchGSWebGansu(self.saver)
    else:
        self.proxies_dict = []
        self.read_proxy("../../_ct_proxy/proxy_040510.txt")
        Spider.__init__(self, len(self.proxies_dict))
        self._curltls = threading.local()
        self.gswebs = {}
    # Keywords that have already been queried successfully
    self.success_kw = FileSaver("gsinfo_Gansu_success_kw.txt")
    # Listing entries whose crawl succeeded are written here to avoid re-crawling
    self.success_queries = FileSaver("gsinfo_Gansu_success_queries.txt")
    # Initialize the URLs that have already been crawled
    #self.init_spider_url()
    #time.sleep(2)
    self.cnt = 1
    self.run_time = time.time()
    self.cnt_q = 1
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self._user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:42.0) Gecko/20100101 Firefox/42.0'
    self.baseurl = 'http://www.ewt360.com/LNLQXX/SearchResult?act=mark'
    self.headers = {'Referer': self.baseurl}
    #scores = range(450, 750+1) + range(449, 0, -1) + [0]
    scores = range(750, 0, -1) + [0]
    self.possmap = {
        'Years': range(2009, 2014 + 1),
        'WL': ['l', 'w'],
        'BZ': ['b', 'z'],
        'PiCi': 0,
        'Score': scores,
        'ProvinceCode': 0,
        'page': 1
    }
    self.bs = BinSaver("fo.bin")
    self.racer = RaceValueByKey()
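# How possmap is expanded into concrete query parameter sets is not shown in
# these snippets. A minimal sketch, assuming every list-valued entry is
# enumerated and scalar entries stay fixed; the real scheduling done by
# RaceValueByKey may differ.
import itertools

def iter_possmap(possmap):
    keys = sorted(possmap)
    pools = [v if isinstance(v, list) else [v] for v in (possmap[k] for k in keys)]
    for combo in itertools.product(*pools):
        yield dict(zip(keys, combo))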
def run_threading(words):
    crawl = Spider()
    threads = [threading.Thread(target=crawl.crawl_each, args=(w,)) for w in words]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
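# Example call, assuming crawl_each takes a single keyword; the word list below
# is illustrative only.
if __name__ == '__main__':
    run_threading(['python', 'spider', 'threading'])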
def __init__(self, thcnt): Spider.__init__(self, thcnt) self._name = "GuangxiCourtSpider" self.test_mode = False self.enable_mainjob_timedlock = False self.prlist = [] self.pagestore = GXCourtStore() self._paper_url_format = 'http://ws.gxcourt.gov.cn:23001/WDocManage.asmx/GetDocFileInfo?param={"Param":"{\'DocID\':\'%s\'}"}' self.case_types = [ {'key': '案件种类', 'value': 1, 'info': '案.案件种类', 'count': 67381}, {'key': '案件种类', 'value': 2, 'info': '案.案件种类', 'count': 178674}, {'key': '案件种类', 'value': 3, 'info': '案.案件种类', 'count': 6839}, {'key': '案件种类', 'value': 4, 'info': '案.案件种类', 'count': 46387}, {'key': '案件涉及', 'value': 12, 'info': '案.J案件特征.J民事案件特征.J案件涉及.案件涉及', 'count': 1618}, {'key': '案件类型', 'value': 16, 'info': '案.CLS', 'count': 40} ] self.pagesize = 20 self.job_file = 'queries' self.param_format = "{'Param':{'Dic':[{'@Key':'%s','@Value':'%d','@SearchType':'eq'},{'@Key':'searchType','@Value':'高级检索'}]}}"
class UpSpider(QThread):
    spiderDone = pyqtSignal(dict)

    def __init__(self, uid):
        super(UpSpider, self).__init__()
        self.spider = Spider('up', uid, 0)

    def run(self):
        for item in self.spider.parse():
            self.spiderDone.emit(item)
        self.exit(0)
def __init__(self):
    spider.util.use_utf8()
    self.is_debug = False
    if self.is_debug:
        Spider.__init__(self, 1)
        self.gsweb = gs_guangdong.SearchGSWebGuangdong(None)
    else:
        self.proxies_dict = []
        self.read_proxy("../../_ct_proxy/proxy_041209.txt")
        Spider.__init__(self, len(self.proxies_dict))
        self._curltls = threading.local()
        self.gswebs = {}
    self.already = FileSaver("gsinfo_out_spidered_cname1.txt")
    self.success = FileSaver("gsinfo_out.txt")
    self.result_null = FileSaver("gsinfo_out_null.txt")
    # Initialize the set of companies already crawled
    self.init_cname()
    time.sleep(2)
    self.cnt = 1
    self.run_time = time.time()
def __init__(self, thcnt, channel):
    # Subclasses are expected to override this
    self.log = None
    Spider.__init__(self, thcnt)
    self.channel = channel
    self._name = "%s_download" % channel
    self._download_url = config.DOWNLOAD_URLS.get(channel, '')
    self._retry_times = 3
    assert self._download_url
    # cv status
    self._cv_status = {}
    # test search
    test_search = threading.Thread(target=self.test_search)
    test_search.start()
        parser.error('invalid verbosity')
    return args


if __name__ == '__main__':
    args = parse_args()
    if args.verbose:
        level = logging.DEBUG
    elif args.quiet:
        level = logging.WARN
    elif args.silent:
        level = SILENT
    else:
        level = logging.INFO
    log(level=level)
    spider = Spider.site(args.target, robots=args.robots, sitemap=args.sitemap,
                         cookies=args.cookies, workers=args.workers)
    if args.html:
        spider.events.register(html.Handler())
    if args.response:
        spider.events.register(response.Handler())
    if args.skip_forms:
        spider.events.register(skip_forms.Handler())
    if args.request_lulz:
        spider.events.register(request_lulz.Handler())
    spider.crawl()