def filter_corp_name():
    seen = set()
    save = FileSaver("hunan_cname.txt")
    with open("corp_name.txt", "r") as f:
        i = 0
        j = 0
        for line in f:
            line = line.strip()
            ary = line.split(" ")
            code = int(ary[1].strip())  # region code
            if code < 430000 or code >= 440000:  # keep Hunan (43xxxx) only
                continue
            a = 0
            for name in ary:
                if a == 0 or a == 1:  # skip the first two columns (id, region code)
                    a += 1
                    continue
                if name in seen:
                    a += 1
                    j += 1
                    #print name, " already in filter..."
                    continue
                print code, name
                save.append(name)
                seen.add(name)
                a += 1
                i += 1
    print '重复条数:', j, "去重后条数:", i
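# Sketch: the FileSaver these helpers rely on is defined elsewhere in the
# repo, so its exact behavior is an assumption here. Reading the call sites,
# it acts as an append-only, line-per-call writer, roughly like this
# (hypothetical stand-in, not the repo's class):
class FileSaverSketch(object):
    def __init__(self, fn):
        self.fd = open(fn, "a")  # assumed: append mode so reruns keep old output

    def append(self, line):
        self.fd.write(line + "\n")
        self.fd.flush()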
def guolv():
    filter1 = set()
    filter2 = set()
    save = FileSaver("r1_fenci_local_filter.txt")
    with open("r1_fenci.txt", "r") as f:
        for line in f:
            filter1.add(line.strip())
    # company-suffix stop words to drop from the segmented names
    cat = set([u"公司", u"有限公司", u"分公司", u"子公司", u"责任公司", u"集团",
               u"委员会", u"商行", u"合作社", u"经营部", u"工作室", u"维修部",
               u"影楼", u"生活馆", u"网吧", u"经销处", u"服饰店", u"营业厅",
               u"西餐厅", u"商店", u"票务部", u"五金厂", u"超市", u"咖啡店",
               u"咨询中心", u"茶餐厅", u"酒吧", u"针织厂", u"塑料厂", u"服务部",
               u"酒店", u"宾馆", u"旅馆"])
    with open("r1_fenci_local.txt", "r") as f:
        cnt = 0
        i = 0
        for line in f:
            line = line.strip()
            if line in filter2:
                print line, "...... 自身重复 ......"
                continue
            filter2.add(line)
            if line in filter1:
                i += 1
                print i, line, "...... 重复 ......"
            # decode to unicode so the comparison against the unicode stop
            # words actually matches (a bytes-vs-unicode check never would)
            elif line.decode("utf8") in cat:
                print cnt, line, "...... 要去掉 ......"
            else:
                cnt += 1
                save.append(line)
    print "重复 %d 行 , 获得 %d 个分词:" % (i, cnt)
def tiqu_spidered_cname():
    seen = set()
    save = FileSaver("guangdong_already_detail_cname.txt")
    with open("gsinfo_guangdong_success_url.txt", "r") as f:
        i = 0
        j = 0
        for line in f:
            line = line.strip()
            try:
                r = eval(line)
            except Exception as e:
                print "ERROR:", e, "--->", line
                time.sleep(2)
                continue
            name = r["name"]
            if name in seen:
                j += 1
                print j, name, 'already exist!!!'
            else:
                save.append(name)
                seen.add(name)
                i += 1
                #print "第", i, "行:", r
    #utf8str(r)
    #1443977
    print '重复条数:', j, "去重后条数:", i, "总条数:", (j + i)
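import ast

# tiqu_spidered_cname (and several loaders below) parse each saved line with
# eval(), which will execute arbitrary expressions. If the lines are plain
# dict/list literals, the standard library's ast.literal_eval is a safer
# drop-in; this helper is an alternative sketch, not what the repo uses.
def safe_parse(line):
    try:
        return ast.literal_eval(line)  # accepts only literals, never runs code
    except (ValueError, SyntaxError):
        return None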
def check_guangzhou():
    save = FileSaver("guangzhou4.txt")
    cnt = 0
    # extract runs of Chinese characters and keep those that look like real names
    pat = re.compile(u"([\u4e00-\u9fa5]+)")
    with open("guangzhou3.txt", "r") as f:
        for line in f:
            cnt += 1
            line = line.strip()
            result = pat.findall(line.decode("utf8"))
            for res in result:
                # drop fragments that are too short or are bare company suffixes
                if len(res) < 4 or res in (u"有限公司", u"贸易公司", u"服装公司"):
                    print "error: ", cnt, res
                    continue
                print cnt, len(res), res
                save.append(res.strip())
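import re

# Small demo of the CJK extraction used in check_guangzhou: the character
# class \u4e00-\u9fa5 covers the common Chinese range, so findall pulls the
# Chinese substrings out of mixed text (Python 2, hence the decode). The
# input string is illustrative only.
def demo_cjk_extract():
    pat = re.compile(u"([\u4e00-\u9fa5]+)")
    for res in pat.findall("abcdef阿瓦大实打实ghijk".decode("utf8")):
        print res  # -> 阿瓦大实打实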
def gen_queries(self):
    remove_file(self.job_file)
    fs = FileSaver(self.job_file)
    for ct in self.case_types:
        # Python 2 integer division: always allows one extra page
        pcnt = ct['count'] / self.pagesize + 1
        for page in range(1, pcnt + 1):
            fs.append(ct['key'] + '|' + str(ct['value']) + '|' + str(page) + '|' + str(self.pagesize))
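# gen_queries computes pcnt with floor division plus one, which yields one
# extra (empty) trailing page whenever count is an exact multiple of
# pagesize. A ceiling division sidesteps that; this helper is a sketch, not
# something the repo defines:
def page_count(count, pagesize):
    return (count + pagesize - 1) // pagesize

# e.g. 100 items at 20 per page: floor(100/20)+1 = 6 pages, ceiling = 5
assert page_count(100, 20) == 5
assert page_count(101, 20) == 6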
def test_check():
    seen = set()
    old = 0
    with open("gsinfo_out.txt", "r") as f:
        for line in f:
            line = line.strip()
            name = eval(line)["name"]
            if name not in seen:
                seen.add(name)
                old += 1
    print "公司名加载条数:", old
    time.sleep(1)
    i = 0
    j = 0
    save = FileSaver("guangdong_cname_new.txt")
    with open("guangdong_cname.txt", "r") as f:
        for line in f:
            r = line.strip()
            if r in seen:
                j += 1
                print j, 'already exist!!!'
            else:
                i += 1
                print i, r, "not in "
                save.append(r)
    print "旧的公司名条数:", old, '已经查到的公司名条数:', j, "没有查到的公司名条数:", i, "总条数:", (j + i)
def filter_corp_name_all():
    seen = set()
    save = FileSaver("all_cname.txt")
    cnt = 0
    with open("corp_name.txt", "r") as f:
        i = 0
        j = 0
        for line in f:
            cnt += 1
            line = line.strip()
            ary = line.split(" ")
            #code = int(ary[1].strip())  # region code
            #if code < 110000 or code >= 120000:
            #    continue
            a = 0
            for name in ary:
                if a == 0 or a == 1:  # skip the first two columns
                    a += 1
                    continue
                name = name.strip()
                if name in seen:
                    a += 1
                    j += 1
                    #print name, " already in filter..."
                    continue
                print cnt, name
                save.append(name)
                seen.add(name)
                a += 1
                i += 1
    print '重复条数:', j, "去重后条数:", i
def tiqu_cname_by_corp_name():
    seen_lines = set()
    seen_names = set()
    i = 0
    j = 0
    save = FileSaver("corp_name_tiqu.txt")
    with open("corp_name.txt", "r") as f:
        for line in f:
            line = line.strip()
            if line in seen_lines:
                i += 1
                continue
            ary = line.split(" ")
            x = 0
            for ay in ary:
                x += 1
                if x == 1 or x == 2:  # skip the first two columns
                    continue
                cname = ay.strip()
                if cname in seen_names:
                    continue
                save.append(cname)
                seen_names.add(cname)
                j += 1
            seen_lines.add(line)
    print "重复数:", i, "拿到公司名:", j
def daochu1():
    seen_codes = set()
    save = FileSaver("un_spider_queries.txt")
    with open("b_query_detail.txt", "r") as f:
        for line in f:
            line = line.strip()
            detail = eval(line)
            code = detail["list"][0]["oc_code"]
            seen_codes.add(code)
    dup = 0  # renamed so the counter does not shadow the re module
    new = 0
    with open("a_queried_company_list1.txt", "r") as f:
        for line in f:
            lst = eval(line.strip())
            code = lst['oc_code']
            if code in seen_codes:
                dup += 1
                print dup, 'already get details!'
            else:
                s = line.strip()
                new += 1
                seen_codes.add(code)
                save.append(s)
                print new, s
    print "new:", new, "re:", dup
def tiqu_already_spider_cname():
    dup = 0
    new = 0
    filter_self = set()
    filter_cname = set()
    save = FileSaver("already_guangzhou_query_inc_name2.txt")
    with open("gsinfo_guangdong_guangzhou_gz1.txt") as f:
        for line in f:
            line = line.strip()
            if line in filter_self:
                dup += 1
                continue
            filter_self.add(line)  # record the raw line; it was never added before, so dedup was a no-op
            try:
                r = eval(line)
                cname = r["basicInfo"]["名称"]
                if cname in filter_cname:
                    dup += 1
                    continue
                filter_cname.add(cname)  # likewise record the name
                save.append(cname)
                new += 1
            except Exception as e:
                print "ERROR:", e, line
    print "重复:", dup, "新:", new
def tiqu_oc_code():
    filter_code = set()
    save = FileSaver("oc_code.txt")
    files = ["nacao_queries_info_local.txt", "nacao_queries_info.txt"]
    s = []
    i = 0
    x = 0
    y = 0
    for fs in files:
        with open(fs, "r") as f:
            for line in f:
                i += 1
                jn = json.loads(line.strip())
                oc = jn["jgdm"]
                if oc in filter_code:
                    x += 1
                else:
                    filter_code.add(oc)
                    s.append(oc)
                    y += 1
    print "总条数:", i, "重复注册号:", x, "不重注册号:", y
    z = 0
    if len(s) != 0:
        s.sort()
        for c in s:
            z += 1
            save.append(c)
    print "结束,写入", z, "条数据..."
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self.sessionReq = YouzyLogin()
    self.sessionReq.do_login()
    self.num_count = 0
    self.savefile = FileSaver("youzy.txt")
    self.__fail_urls = FileSaver("fail_urls.txt")
def __init__(self):
    self.bs = BinSaver("gsinfo_Guangdong_html.bin")
    self.fs_QyxyDetail = FileSaver("gsinfo_guangdong_QyxyDetail.txt")
    self.fs_GSpublicityList = FileSaver("gsinfo_guangdong_GSpublicityList.txt")
    self.fs_entityShow = FileSaver("gsinfo_guangdong_entityShow.txt")
    self.fs_guangzhou = FileSaver("gsinfo_guangdong_guangzhou.txt")
def tiqu_oc_code_cname():
    save = FileSaver("oc_code_cname.txt")
    files = ["nacao_queries_info_local.txt", "nacao_queries_info.txt"]
    s2 = {}
    i = 0
    x = 0
    y = 0
    for fs in files:
        with open(fs, "r") as f:
            for line in f:
                i += 1
                r = line.strip()
                jn = json.loads(r)
                oc = jn["jgdm"]
                cname = jn["bzjgmcs"]
                try:
                    # count duplicates (x) and new codes (y); the counters
                    # were printed but never incremented in the old version
                    if oc.strip() in s2:
                        x += 1
                    else:
                        y += 1
                    s2[oc.strip()] = cname.strip()  # dict keeps the last value
                    print i, oc
                except Exception as e:
                    print "出错......", e, r
                    continue
    print "总条数:", i, "重复注册号:", x, "不重注册号:", y, "字典内元素个数:", len(s2)
    z = 0
    if len(s2) != 0:
        items = s2.items()
        print "排序开始...", time.time()
        items.sort()  # Python 2: sorts the (key, value) tuples in place
        print "排序完毕...", time.time()
        for k, v in items:
            z += 1
            save.append(k + " " + v)
    print "结束,写入", z, "条数据...", time.time()
def __init__(self):
    spider.util.use_utf8()
    self.saver = RunGuangdong.Saver()
    self.is_debug = False
    if self.is_debug:
        Spider.__init__(self, 1)
        self.proxies_dict = [{
            'http': 'http://*****:*****@106.75.134.189:18889',
            'https': 'https://*****:*****@106.75.134.189:18889'
        }]
    else:
        self.proxies_dict = []
        self.read_proxy("../../../_ct_proxy/proxy_041810.txt")
        Spider.__init__(self, len(self.proxies_dict))
    self._curltls = threading.local()
    self.gswebs = {}
    # URLs already fetched successfully
    self.success_url = FileSaver("gsinfo_guangdong_success_url.txt")
    # seed the set of already-crawled links
    self.init_spider_url()
    self.cnt = 1
    self.run_time = time.time()
    self.lock = threading.Lock()
    self.not_show_save = FileSaver("not_show_error_out.txt")
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self.success_count = 0
    self.fail_count = 0
    self.fail_file = FileSaver("fail2db.txt")
    self.sus_file = FileSaver("SZ2DB.txt")
    self.init_filter()
def __init__(self, fn, mode='w', buffer_size=100):
    FileSaver.__init__(self, fn)
    self.fd = open(fn, mode)
    self.lock = threading.Lock()
    self.link_buffer = []
    self.buffer_size = buffer_size
    self.count = 0
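import threading

# The buffered __init__ above only sets up state; its append/flush pair is
# not shown in this section. The class below is one plausible completion
# (method names and flush policy are assumptions, not the repo's code):
# buffer lines under the lock and write them out once per buffer_size batch.
class BufferedSaverSketch(object):
    def __init__(self, fn, mode='w', buffer_size=100):
        self.fd = open(fn, mode)
        self.lock = threading.Lock()
        self.link_buffer = []
        self.buffer_size = buffer_size
        self.count = 0

    def append(self, line):
        with self.lock:
            self.link_buffer.append(line)
            self.count += 1
            if len(self.link_buffer) >= self.buffer_size:
                self._flush_locked()

    def _flush_locked(self):
        # assumes the caller already holds self.lock
        if self.link_buffer:
            self.fd.write("\n".join(self.link_buffer) + "\n")
            self.fd.flush()
            del self.link_buffer[:]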
def __init__(self):
    self.proxies_dict = []
    self.read_proxy("proxy_20160218.txt")
    Spider.__init__(self, len(self.proxies_dict))
    self.num_count = 0
    #self.filter_name = []
    self._aes_ = CCIQ_AES()
    # full info of the company list returned by a company-name query
    self.query_company_info = FileSaver("t-query_company_info.txt")
    # partial info of the company list returned by a company-name query
    #self.query_company_info_part = FileSaver("t-query_company_info_part.txt")
    # company-name queries whose list lookup failed
    self.query_company_info_failure = FileSaver("t-query_company_info_failure.txt")
    # company names that have already been crawled
    self.already_cname = FileSaver("t-already_cname.txt")
    # seed the set of already-crawled companies
    self.init_cname()
    # company names whose detail lookup failed
    self.detail_failure = FileSaver("t-detail_failure1.txt")
    # full company info the app exposes, including shareholder info
    self.detail_company = FileSaver("t-detail_company.txt")
    self.extJson = self._aes_.encrypt(spider.util.utf8str({
        "cl_screenSize": "640x960",
        "cl_cookieId": "16923697-D73E-485A-BDCF-68FAD456AC02",
        "Org_iOS_Version": "2.0.1"
    }))
    self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)")
    self.headers = {"Content-Type": "application/json"}
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self.sessionReq = YouzyLogin()
    self.sessionReq.do_login(1)
    self.num_count = 0
    self.parse_count = 0
    self.savefile = CsvSaver("spider_url_zhuanke_np.csv", fixed + pfcolumn)
    self.__fail_urls = FileSaver("spider_url_fail_zhuanke_np.txt")
def init_conditions(self):
    GQDataHelper.add(self, 'companyType', companyType)
    GQDataHelper.add(self, 'degreeType', degreeType)
    GQDataHelper.add(self, 'experienceType', experienceType)
    GQDataHelper.add(self, 'industry', industry)
    GQDataHelper.add(self, 'jobType', jobType)
    GQDataHelper.add(self, 'payType', payType)
    GQDataHelper.add(self, 'cityKw', cityKw)
    self.bs2 = FileSaver("failed_urls.txt")
class PageStoreJobUI(PageStoreBase):
    def __init__(self):
        super(PageStoreJobUI, self).__init__('jd_jobui')
        self.crawlered_ids = set()
        self.log_file = FileSaver("./data/chentao/jobui_log/jobui-" + str(time.strftime('%Y-%m-%d')) + ".txt")

    def extract_content(self):
        content = spider.util.htmlfind(self.get_cur_doc().cur_content, 'class="hasVist cfix sbox fs16"', 0)
        try:
            content = content.get_text()
        except:
            Log.errorbin("invalid jd content %s" % self.get_cur_doc().cur_url, self.get_cur_doc().cur_content)
            return None
        return content

    def page_time(self):
        #TODO
        #tag = spider.util.htmlfind(self.get_cur_doc().cur_content, 'class="publish_time"', 0)
        tag = re.search('class="uptime common-icon"></em>(.*?)</dd>', self.get_cur_doc().cur_content)
        try:
            #tag = tag.get_text()
            tag = tag.group(1)
        except:
            Log.errorbin("invalid jd content %s" % self.get_cur_doc().cur_url, self.get_cur_doc().cur_content)
            raise
        return TimeHandler.fmt_time(tag)

    def getopath(self):
        dirs = ['./data/chentao/jobui_data_re']
        for di in dirs:
            if os.path.isdir(di) and os.access(di, os.W_OK):
                return di
        raise RuntimeError("no dir to write files.")

    def save_time_log(self, indexUrl, cur_tm):
        """Record the page's update time."""
        #db = conn.gaokao_crawler
        #content = db.page_store_jd_jobui.find_one({"indexUrl": indexUrl})
        db = conn.jobui
        content = db.page_store_jd_jobui.find_one({"indexUrl": indexUrl})
        # updateTime is stored in milliseconds; convert to seconds for localtime
        cur_tm = time.strftime("%Y-%m-%d", time.localtime(cur_tm / 1000))
        log = indexUrl + "|0|" + cur_tm
        if content is not None:
            pre_tm = time.strftime("%Y-%m-%d", time.localtime(content['updateTime'] / 1000))
            if pre_tm == cur_tm:
                print "time has not changed, don't record!!"
                return
            log = indexUrl + "|" + pre_tm + "|" + cur_tm
        self.log_file.append(log)
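import time

# save_time_log treats updateTime as a millisecond epoch: dividing by 1000
# gives the seconds that time.localtime expects. A quick check (the value is
# illustrative; the printed date depends on the local timezone):
def demo_ms_to_day():
    ms = 1460000000000
    print time.strftime("%Y-%m-%d", time.localtime(ms / 1000))  # e.g. 2016-04-07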
def __init__(self):
    self.bs = BinSaver("gsinfo_Guangdong_html_gz3.bin")
    self.pic = BinSaver("gsinfo_Guangdong_pic.bin")
    self.fs_QyxyDetail = FileSaver("gsinfo_guangdong_QyxyDetail_gz3.txt")
    self.fs_GSpublicityList = FileSaver("gsinfo_guangdong_GSpublicityList_gz3.txt")
    self.fs_entityShow = FileSaver("gsinfo_guangdong_entityShow_gz3.txt")
    self.fs_guangzhou = FileSaver("gsinfo_guangdong_guangzhou_gz3.txt")
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    # self.uc_count = 0
    # self.tc_count = 0
    # self.yy_count = 0
    self.all_count = 0
    self.bin_list = ['jobui_job_data1.bin', 'jobui_job_bu.bin', 'jobui_job_data2.bin']
    #self.bin_list = ['jobui_job.bin', 'jobui_job2.bin', 'jobui_job4.bin']
    self.domains = []
    self.file_s = FileSaver('domains.txt')
def __init__(self):
    self.proxies_dict = []
    self.read_proxy("../spider/proxy/proxy.txt")
    Spider.__init__(self, len(self.proxies_dict))
    self.success_count = 0
    self.request_count = 0
    self.__fail_ids = FileSaver("fail_ids.txt")
    self.start_time = time.time()
    self.page_store = PageStoreJobUI()
    self.page_store.testmode = True
def __init__(self):
    self.proxies_dict = []
    self.read_proxy("proxy_030814.txt")
    Spider.__init__(self, len(self.proxies_dict))
    self.success_count = 0
    self.request_count = 0
    self.__fail_urls = FileSaver("fail_urls.txt")
    self.start_time = time.time()
    self.page_store = PageStoreJobUI()
    self.page_store.testmode = False
def __init__(self):
    self.proxies_dict = []
    self.read_proxy("../spider/proxy/proxy.txt")
    Spider.__init__(self, len(self.proxies_dict))
    self.success_count = 0
    self.request_count = 0
    self.__fail_add_url = FileSaver("fail_add_url.txt")
    self.start_time = time.time()
    self.domain = self.read_domain()
    self.domain_file = FileSaver("domains.txt")
def __init__(self, thcnt):
    GenQueries.__init__(self, thcnt)
    self.thread_count = 1
    self._name = "jobui_queries"
    self.no_match_url = FileSaver("not_match_%s.txt" % self._name)
    self.bs2 = FileSaver("failed_urls.txt")
    self.job_url = FileSaver("job_url.txt")
    self.cnt = 0
    self.domains = FileSaver("domains.txt")
    self.start_time = time.time()
class LPCVStore(PageStoreBase):
    def __init__(self):
        PageStoreBase.__init__(self, 'cv_liepin', dburl=LPCVConfig.mongdb_url)
        self.testmode = False
        self._not_need_cv_fs = FileSaver(LPCVConfig.NOT_NEED_CV_FN)
        self._not_access_by_qiye = FileSaver(LPCVConfig.NOT_ACCESS_BY_QIYE)

    def extract_content(self):
        cur_content = self.get_cur_doc().cur_content
        if isinstance(cur_content, unicode):
            cur_content = cur_content.encode('utf-8')
        fields = htmlfind.findTag(cur_content, 'table')
        content = ''
        for field in fields:
            if r'所在行业:' in field:
                content = htmlfind.remove_tag(field, True)
                break
            elif r'Industry:' in field:
                print "Ignore..... is English page!"
                self._save_not_need_cv(self.get_cur_doc().cur_jdid)
                break
        if r'抱歉,该简历已经设置为对猎头顾问不开放!' in cur_content:
            print "Ignore..... can not access by lietou"
            return None
        if r'该简历人才已经设置了对企业不开放简历,可能该人才已经找到工作,或者暂时没有换工作的意向。' in cur_content:
            print "Ignore..... can not access by qiye"
            self._not_access_by_qiye.append(self.get_cur_doc().cur_jdid)
            return None
        return content

    def _save_not_need_cv(self, cvId):
        self._not_need_cv_fs.append(cvId)

    def page_time(self):
        try:
            t = htmlfind.findTag(self.get_cur_doc().cur_content, 'div', 'class="resume-info"')
            if not t:
                t = htmlfind.findTag(self.get_cur_doc().cur_content, 'div', 'class="tab"')  # headhunter page
            if not t:
                return None
            return TimeHandler.fmt_time(t[0])
        except Exception as e:
            self._save_not_need_cv(self.get_cur_doc().cur_jdid)

    def check_should_fetch(self, jdid):
        indexUrl = "%s://%s" % (self.channel, jdid)
        return not self.find_any(indexUrl)
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self.request = SessionRequests()
    self.view_state = None
    self.event_valid = None
    self.rand = None
    self.loc = "浙江"
    self.data_file = FileSaver("浙江_data.txt")
    self.have_get_url_file = FileSaver("浙江_get_url.txt")
    self.init_already()
    self.login("38037395", "773950")
class GenJobQuery(GenQueries):
    def __init__(self):
        GenQueries.__init__(self)
        self.thread_count = 1
        self._name = "jobui_queries"

    def init_conditions(self):
        GQDataHelper.add(self, 'companyType', companyType)
        GQDataHelper.add(self, 'degreeType', degreeType)
        GQDataHelper.add(self, 'experienceType', experienceType)
        GQDataHelper.add(self, 'industry', industry)
        GQDataHelper.add(self, 'jobType', jobType)
        GQDataHelper.add(self, 'payType', payType)
        GQDataHelper.add(self, 'cityKw', cityKw)
        self.bs2 = FileSaver("failed_urls.txt")

    def need_split(self, params, level, islast):
        url = self.gen_url(params)
        con = self.request_url(url)
        if con is not None:
            #m = re.search(ur"(?:共|多于)<em>(\d+)</em>个职位满足条件", con.text)
            m = re.search(ur'<small class="info"> ((\d+) 条信息)</small>', con.text)
            if m:
                found = m.group(1).encode('utf-8')
                count = parseInt(found)
                with self.locker:
                    print "[%d] %s ==> %s %s" % (level, url, found, 'failed' if count >= 1000 else '')
                # 1000 or more hits means the query is too broad and must be split
                return count >= 1000
        with self.locker:
            self.bs2.append("==failed get con: %s====" % url)
            print "===%s===" % url
        return False

    def process_failed_url(self, params):
        self.bs2.append(self.gen_url(params))
        return False

    def log_url(self, params):
        url = self.gen_url(params)
        self.fs.append(url)
        return True

    def gen_url(self, p):
        url = "http://www.jobui.com/jobs?jobKw=&cityKw=全国"
        for i in p:
            url = self.compose_url(url, i, p[i])
        return url
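# gen_url builds the query URL by folding each filter through compose_url,
# which is inherited from GenQueries and not shown in this section. A minimal
# sketch of what it plausibly does (an assumption, not the repo's
# implementation) is a plain query-string append:
def compose_url_sketch(url, key, value):
    sep = '&' if '?' in url else '?'
    return "%s%s%s=%s" % (url, sep, key, value)

# e.g. compose_url_sketch("http://www.jobui.com/jobs?jobKw=&cityKw=全国",
#                         "payType", "5") appends "&payType=5"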