Example #1
def filter_corp_name():
    filter = set()
    save = FileSaver("hunan_cname.txt")
    with open("corp_name.txt", "r") as f:
        i = 0
        j = 0
        for line in f:
            line = line.strip()
            ary = line.split(" ")
            code = int(ary[1].strip())
            # administrative region code
            if code < 430000 or code >= 440000:
                continue
            a = 0
            for name in ary:
                if a == 0 or a == 1:
                    a += 1
                    continue
                if name in filter:
                    a += 1
                    j += 1
                    #print name, " already in filter..."
                    continue
                print code, name
                save.append(name)
                filter.add(name)
                a += 1
                i += 1
    print '重复条数:', j, "去重后条数:", i
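
Every example in this listing leans on a FileSaver helper whose definition is not shown. A minimal sketch of the interface the examples rely on, a constructor taking a file name plus an append method that writes one record per line, could look like the following; the append mode, flushing and locking are assumptions, not the original implementation.

import threading

class FileSaver(object):
    # Minimal stand-in for the FileSaver used throughout these examples.
    # Only the constructor signature and append() are taken from the usage
    # above; append mode, flushing and locking are assumptions.
    def __init__(self, fn):
        self.fd = open(fn, "a")
        self.lock = threading.Lock()

    def append(self, line):
        with self.lock:
            self.fd.write(line + "\n")
            self.fd.flush()
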
Example #2
def guolv():
    filter1 = set()
    filter2 = set()
    save = FileSaver("r1_fenci_local_filter.txt")
    with open("r1_fenci.txt", "r") as f:
        for line in f:
            line = line.strip()
            filter1.add(line)

    cat = [u"公司", u"有限公司", u"分公司", u"子公司", u"责任公司", u"集团", u"委员会", u"商行", u"合作社", u"经营部", u"工作室", u"维修部", u"影楼", u"生活馆", u"网吧", u"经销处", u"服饰店", u"营业厅",
           u"西餐厅", u"商店", u"票务部", u"经销处", u"五金厂", u"超市", u"咖啡店", u"咨询中心", u"茶餐厅", u"酒吧", u"针织厂", u"塑料厂", u"服务部", u"酒店", u"宾馆", u"旅馆"]
    with open("r1_fenci_local.txt", "r") as f:
        cnt = 0
        i = 0
        for line in f:
            line = line.strip()
            if line in filter2:
                print line, "...... 自身重复 ......"
                continue
            else:
                filter2.add(line)

            if line in filter1:
                i += 1
                print i, line, "...... 重复 ......"
            else:
                if line in cat:
                    print cnt, line, "...... 要去掉 ......"
                    continue
                cnt += 1
                save.append(line)
        print "重复 %d 行 , 获得 %d 个分词:" % (i, cnt)
Example #3
def tiqu_spidered_cname():
    filter = set()
    save = FileSaver("guangdong_already_detail_cname.txt")
    with open("gsinfo_guangdong_success_url.txt", "r") as f:
        i = 0
        j = 0
        for line in f:
            line = line.strip()
            r = None
            try:
                r = eval(line)
            except Exception as e:
                print "ERROR:", e, "--->", line
                time.sleep(2)
                continue
            name = r["name"]
            if name in filter:
                j += 1
                print j, name, 'already exist!!!'
            else:
                save.append(name)
                filter.add(name)
                i += 1
                #print "第", i, "行:", r #utf8str(r)   #1443977
    print '重复条数:', j, "去重后条数:", i, "总条数:", (j + i)
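
Example #3 turns each line back into a dict with eval, which will execute any expression found in the file. If the records are plain Python literals, ast.literal_eval is a safer substitute; the sketch below is a hedged variant of that parsing step, not the original code.

import ast

def parse_record(line):
    # Safer variant of the eval-based parsing in Example #3: literal_eval
    # only accepts Python literals, so a malformed or hostile line cannot
    # execute code. Returns None when the line does not parse.
    try:
        return ast.literal_eval(line)
    except (ValueError, SyntaxError) as e:
        print "ERROR:", e, "--->", line
        return None
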
Example #4
def check_guangzhou():
    # m = re.search(".*([\u4e00-\u9fa5]+).*", "abcdef阿瓦大实打实ghijk")
    # if m:
    #     print m.group(1)
    # temp = "abcdef阿瓦大实打实ghijk"
    # xx = u"([\u4e00-\u9fa5]+)"
    # pat = re.compile(xx)
    # result = pat.findall(temp.decode("utf8"))
    # for res in result:
    #     print res

    save = FileSaver("guangzhou4.txt")
    cnt = 0
    with open("guangzhou3.txt", "r") as f:
        for line in f:
            cnt += 1
            line = line.strip()
            xx = u"([\u4e00-\u9fa5]+)"
            pat = re.compile(xx)
            result = pat.findall(line.decode("utf8"))
            for res in result:
                if len(res) < 4 or res in (u"有限公司", u"贸易公司", u"服装公司"):
                    print "error: ", cnt, res
                    continue
                print cnt, len(res), res
                save.append(res.strip())
Example #5
 def gen_queries(self):
     remove_file(self.job_file)
     fs = FileSaver(self.job_file)
     for ct in self.case_types:
         pcnt = ct['count'] / self.pagesize + 1
         for page in range(1, pcnt + 1):
             fs.append(ct['key'] + '|' + str(ct['value']) + '|' + str(page) + '|' + str(self.pagesize))
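
gen_queries writes one pipe-delimited job per line: key, value, page and page size. A small sketch of the matching reader, splitting each line back into those four fields, might look like this; only the record layout is taken from the code above, the helper name and dict shape are assumptions.

def load_queries(job_file):
    # Read back the 'key|value|page|pagesize' records written by gen_queries.
    jobs = []
    with open(job_file, "r") as f:
        for line in f:
            key, value, page, pagesize = line.strip().split("|")
            jobs.append({"key": key, "value": value,
                         "page": int(page), "pagesize": int(pagesize)})
    return jobs
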
Example #6
def test_check():
    filter = set()
    old = 0
    with open("gsinfo_out.txt", "r") as f:
        for line in f:
            line = line.strip()
            name = eval(line)["name"]
            if name not in filter:
                filter.add(name)
                old += 1
    print "公司名加载条数:", old
    time.sleep(1)
    i = 0
    j = 0
    save = FileSaver("guangdong_cname_new.txt")
    with open("guangdong_cname.txt", "r") as f:
        for line in f:
            r = line.strip()
            if r in filter:
                j += 1
                print j, 'already exist!!!'
            else:
                i += 1
                print i, r, "not in "
                save.append(r)

    print "旧的公司名条数:", old, '已经查到的公司名条数:', j, "没有查到的公司名条数:", i, "总条数:", (j + i)
Example #7
def filter_corp_name_all():
    filter = set()
    save = FileSaver("all_cname.txt")
    cnt = 0
    with open("corp_name.txt", "r") as f:
        i = 0
        j = 0
        for line in f:
            cnt += 1
            line = line.strip()
            ary = line.split(" ")
            #code = int(ary[1].strip())
            # administrative region code
            # if code < 110000 or code >= 120000:
            #     continue
            a = 0
            for name in ary:
                if a == 0 or a == 1:
                    a += 1
                    continue
                name = name.strip()
                if name in filter:
                    a += 1
                    j += 1
                    #print name, " already in filter..."
                    continue
                print cnt, name
                save.append(name)
                filter.add(name)
                a += 1
                i += 1
    print '重复条数:', j, "去重后条数:", i
Example #8
def tiqu_cname_by_corp_name():
    filter = set()
    filter_name = set()
    i = 0
    j = 0
    save = FileSaver("corp_name_tiqu.txt")
    with open("corp_name.txt", "r") as f:
        for line in f:
            line = line.strip()
            if line in filter:
                i += 1
                continue
            ary = line.split(" ")
            x = 0
            for ay in ary:
                x += 1
                if x == 1 or x == 2:
                    continue
                cname = ay.strip()
                if cname in filter_name:
                    continue
                save.append(cname)
                filter_name.add(cname)
                j += 1
            filter.add(line)
    print "重复数:", i, "拿到公司名:", j
Example #9
def daochu1():
    filter = set()
    save = FileSaver("un_spider_queries.txt")
    with open("b_query_detail.txt", "r") as f:
        for line in f:
            line = line.strip()
            #print line
            detail = eval(line)
            code = detail["list"][0]["oc_code"]
            filter.add(code)

    re = 0
    new = 0
    with open("a_queried_company_list1.txt", "r") as f:
        for line in f:
            lst = eval(line.strip())
            code = lst['oc_code']
            if code in filter:
                re += 1
                print re, 'already get details!'
            else:
                s = line.strip()
                new += 1
                filter.add(code)
                save.append(s)
                print new, s
    print "new:", new, "re:", re
Example #10
def tiqu_already_spider_cname():
    re = 0
    new = 0
    filter_self = set()
    filter_cname = set()
    save = FileSaver("already_guangzhou_query_inc_name2.txt")
    with open("gsinfo_guangdong_guangzhou_gz1.txt") as f:
        for line in f:
            line = line.strip()
            if line in filter_self:
                re += 1
                continue
            else:
                filter_self.add(line)  # track raw lines already seen so duplicates are skipped
                try:
                    r = eval(line)
                    cname = r["basicInfo"]["名称"]
                    if cname in filter_cname:
                        re += 1
                        continue
                    else:
                        filter_cname.add(cname)  # track company names already written
                        save.append(cname)
                        new += 1
                except Exception as e:
                    print "ERROR:", e, line
    print "重复:", re, "新:", new
Example #11
def tiqu_oc_code():
    filter_code = set()
    save = FileSaver("oc_code.txt")
    files = ["nacao_queries_info_local.txt", "nacao_queries_info.txt"]
    s = []
    i = 0
    x = 0
    y = 0
    for fs in files:
        with open(fs, "r") as f:
            for line in f:
                i += 1
                r = line.strip()
                jn = json.loads(r)
                oc = jn["jgdm"]
                if oc in filter_code:
                    x += 1
                else:
                    filter_code.add(oc)
                    s.append(oc)
                    y += 1
    print "总条数:", i, "重复注册号:", x, "不重注册号:", y
    z = 0
    if len(s) != 0:
        s.sort()
        for c in s:
            z += 1
            save.append(c)
    print "结束,写入", z, "条数据..."
Example #12
 def __init__(self, thcnt):
     Spider.__init__(self, thcnt)
     self.sessionReq = YouzyLogin()
     self.sessionReq.do_login()
     self.num_count = 0
     self.savefile = FileSaver("youzy.txt")
     self.__fail_urls = FileSaver("fail_urls.txt")
Example #13
 def __init__(self):
     self.bs = BinSaver("gsinfo_Guangdong_html.bin")
     self.fs_QyxyDetail = FileSaver("gsinfo_guangdong_QyxyDetail.txt")
     self.fs_GSpublicityList = FileSaver(
         "gsinfo_guangdong_GSpublicityList.txt")
     self.fs_entityShow = FileSaver("gsinfo_guangdong_entityShow.txt")
     self.fs_guangzhou = FileSaver("gsinfo_guangdong_guangzhou.txt")
Example #14
def tiqu_oc_code_cname():
    save = FileSaver("oc_code_cname.txt")
    files = ["nacao_queries_info_local.txt", "nacao_queries_info.txt"]
    s2 = {}
    i = 0
    x = 0
    y = 0
    for fs in files:
        with open(fs, "r") as f:
            for line in f:
                i += 1
                r = line.strip()
                jn = json.loads(r)
                oc = jn["jgdm"]
                cname = jn["bzjgmcs"]
                try:
                    s2[oc.strip()] = cname.strip()
                    print i, oc
                except Exception as e:
                    print "出错......", e, r
                    continue
    print "总条数:", i, "重复注册号:", x, "不重注册号:", y, "字典内元素个数:",len(s2)
    z = 0
    if len(s2) != 0:
        items = s2.items()
        print "排序开始...", time.time()
        items.sort()
        print "排序完毕...", time.time()
        for k, v in items:
            z += 1
            save.append(k + " " + v)
    print "结束,写入", z, "条数据...", time.time()
Example #15
    def __init__(self):
        spider.util.use_utf8()
        self.saver = RunGuangdong.Saver()
        self.is_debug = False
        if self.is_debug:
            Spider.__init__(self, 1)
            self.proxies_dict = [{
                'http':
                'http://*****:*****@106.75.134.189:18889',
                'https':
                'https://*****:*****@106.75.134.189:18889'
            }]
        else:
            self.proxies_dict = []
            self.read_proxy("../../../_ct_proxy/proxy_041810.txt")
            Spider.__init__(self, len(self.proxies_dict))
        self._curltls = threading.local()
        self.gswebs = {}
        # URLs that have already been fetched successfully
        self.success_url = FileSaver("gsinfo_guangdong_success_url.txt")
        # load the links that have already been crawled
        self.init_spider_url()
        self.cnt = 1
        self.run_time = time.time()
        self.lock = threading.Lock()

        self.not_show_save = FileSaver("not_show_error_out.txt")
Example #16
 def __init__(self, thcnt):
     Spider.__init__(self, thcnt)
     self.success_count = 0
     self.fail_count = 0
     self.fail_file = FileSaver("fail2db.txt")
     self.sus_file = FileSaver("SZ2DB.txt")
     self.init_filter()
Example #17
 def __init__(self, fn, mode='w', buffer_size=100):
     FileSaver.__init__(self, fn)
     self.fd = open(fn, mode)
     self.lock = threading.Lock()
     self.link_buffer = []
     self.buffer_size = buffer_size
     self.count = 0
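
Only the constructor of this buffered FileSaver subclass appears in the example. Given the link_buffer, buffer_size, lock and count fields it sets up, an append that batches writes until the buffer is full might look like the sketch below; this is a guess at the intended behaviour, not the missing method.

 def append(self, line):
     # Hypothetical buffered append: collect lines under the lock and only
     # hit the file once the buffer reaches buffer_size.
     with self.lock:
         self.link_buffer.append(line)
         self.count += 1
         if len(self.link_buffer) >= self.buffer_size:
             self.fd.write("\n".join(self.link_buffer) + "\n")
             self.fd.flush()
             del self.link_buffer[:]
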
Example #18
    def __init__(self):
        self.proxies_dict = []
        self.read_proxy("proxy_20160218.txt")
        Spider.__init__(self, len(self.proxies_dict))

        self.num_count = 0
        #self.filter_name = []
        self._aes_ = CCIQ_AES()
        # full company-list info returned by a company-name query
        self.query_company_info = FileSaver("t-query_company_info.txt")
        # partial company-list info returned by a company-name query
        #self.query_company_info_part = FileSaver("t-query_company_info_part.txt")
        # company-name queries whose list lookup failed
        self.query_company_info_failure = FileSaver(
            "t-query_company_info_failure.txt")
        # company names that have already been crawled
        self.already_cname = FileSaver("t-already_cname.txt")
        # load the companies that have already been crawled
        self.init_cname()
        # company names whose detail query failed
        self.detail_failure = FileSaver("t-detail_failure1.txt")
        # full company info available through the app, including shareholder info
        self.detail_company = FileSaver("t-detail_company.txt")
        self.extJson = self._aes_.encrypt(
            spider.util.utf8str({
                "cl_screenSize": "640x960",
                "cl_cookieId": "16923697-D73E-485A-BDCF-68FAD456AC02",
                "Org_iOS_Version": "2.0.1"
            }))
        self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)")
        self.headers = {"Content-Type": "application/json"}
Example #19
 def __init__(self, thcnt):
     Spider.__init__(self, thcnt)
     self.sessionReq = YouzyLogin()
     self.sessionReq.do_login(1)
     self.num_count = 0
     self.parse_count = 0
     self.savefile = CsvSaver("spider_url_zhuanke_np.csv", fixed + pfcolumn)
     self.__fail_urls = FileSaver("spider_url_fail_zhuanke_np.txt")
Example #20
 def init_conditions(self):
     GQDataHelper.add(self, 'companyType', companyType)
     GQDataHelper.add(self, 'degreeType', degreeType)
     GQDataHelper.add(self, 'experienceType', experienceType)
     GQDataHelper.add(self, 'industry', industry)
     GQDataHelper.add(self, 'jobType', jobType)
     GQDataHelper.add(self, 'payType', payType)
     GQDataHelper.add(self, 'cityKw', cityKw)
     self.bs2 = FileSaver("failed_urls.txt")
Example #21
class PageStoreJobUI(PageStoreBase):
    def __init__(self):
        super(PageStoreJobUI, self).__init__('jd_jobui')
        self.crawlered_ids = set()
        self.log_file = FileSaver("./data/chentao/jobui_log/jobui-" +
                                  str(time.strftime('%Y-%m-%d')) + ".txt")

    def extract_content(self):
        content = spider.util.htmlfind(self.get_cur_doc().cur_content,
                                       'class="hasVist cfix sbox fs16"', 0)
        try:
            content = content.get_text()
        except:
            Log.errorbin("invalid jd content %s" % self.get_cur_doc().cur_url,
                         self.get_cur_doc().cur_content)
            return None
        return content

    def page_time(self):
        #TODO
        #tag = spider.util.htmlfind(self.get_cur_doc().cur_content, 'class="publish_time"', 0)
        tag = re.search('class="uptime common-icon"></em>(.*?)</dd>',
                        self.get_cur_doc().cur_content)
        try:
            #tag = tag.get_text()
            tag = tag.group(1)
        except:
            Log.errorbin("invalid jd content %s" % self.get_cur_doc().cur_url,
                         self.get_cur_doc().cur_content)
            raise

        return TimeHandler.fmt_time(tag)

    def getopath(self):
        dirs = ['./data/chentao/jobui_data_re']
        for di in dirs:
            if os.path.isdir(di) and os.access(di, os.W_OK):
                return di
        raise RuntimeError("no dir to write files.")

    def save_time_log(self, indexUrl, cur_tm):
        """记录更新时间"""
        #db = conn.gaokao_crawler
        #content = db.page_store_jd_jobui.find_one({"indexUrl": indexUrl})
        db = conn.jobui
        content = db.page_store_jd_jobui.find_one({"indexUrl": indexUrl})
        cur_tm = time.strftime("%Y-%m-%d", time.localtime(cur_tm / 1000))
        log = indexUrl + "|0|" + cur_tm
        if content is not None:
            pre_tm = time.strftime(
                "%Y-%m-%d", time.localtime(content['updateTime'] / 1000))
            if pre_tm == cur_tm:
                print "time is not change , don't recorde !! "
                return
            log = indexUrl + "|" + pre_tm + "|" + cur_tm
        self.log_file.append(log)
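
save_time_log writes one pipe-delimited record per index URL: the URL, the previous update date (or 0 when there is none) and the current date. A hedged reader for that log format might look like this; the helper name and the handling of the 0 marker are assumptions.

def parse_time_log(line):
    # Split an 'indexUrl|pre_tm|cur_tm' record from the jobui time log.
    # pre_tm is the literal "0" when there was no earlier record.
    index_url, pre_tm, cur_tm = line.strip().rsplit("|", 2)
    return {"indexUrl": index_url,
            "pre_tm": None if pre_tm == "0" else pre_tm,
            "cur_tm": cur_tm}
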
Example #22
 def __init__(self):
     self.bs = BinSaver("gsinfo_Guangdong_html_gz3.bin")
     self.pic = BinSaver("gsinfo_Guangdong_pic.bin")
     self.fs_QyxyDetail = FileSaver(
         "gsinfo_guangdong_QyxyDetail_gz3.txt")
     self.fs_GSpublicityList = FileSaver(
         "gsinfo_guangdong_GSpublicityList_gz3.txt")
     self.fs_entityShow = FileSaver(
         "gsinfo_guangdong_entityShow_gz3.txt")
     self.fs_guangzhou = FileSaver("gsinfo_guangdong_guangzhou_gz3.txt")
Example #23
 def __init__(self, thcnt):
     Spider.__init__(self, thcnt)
     # self.uc_count = 0
     # self.tc_count = 0
     # self.yy_count = 0
     self.all_count = 0
     self.bin_list = ['jobui_job_data1.bin','jobui_job_bu.bin','jobui_job_data2.bin']
     #self.bin_list = ['jobui_job.bin','jobui_job2.bin','jobui_job4.bin']
     self.domains = []
     self.file_s = FileSaver('domains.txt')
Example #24
 def __init__(self):
     self.proxies_dict = []
     self.read_proxy("../spider/proxy/proxy.txt")
     Spider.__init__(self, len(self.proxies_dict))
     self.success_count = 0
     self.request_count = 0
     self.__fail_ids = FileSaver("fail_ids.txt")
     self.start_time = time.time()
     self.page_store = PageStoreJobUI()
     self.page_store.testmode = True
Example #25
 def __init__(self):
     self.proxies_dict = []
     self.read_proxy("proxy_030814.txt")
     Spider.__init__(self, len(self.proxies_dict))
     self.success_count = 0
     self.request_count = 0
     self.__fail_urls = FileSaver("fail_urls.txt")
     self.start_time = time.time()
     self.page_store = PageStoreJobUI()
     self.page_store.testmode = False
Example #26
 def __init__(self):
     self.proxies_dict = []
     self.read_proxy("../spider/proxy/proxy.txt")
     Spider.__init__(self, len(self.proxies_dict))
     self.success_count = 0
     self.request_count = 0
     self.__fail_add_url = FileSaver("fail_add_url.txt")
     self.start_time = time.time()
     self.domain = self.read_domain()
     self.domain_file = FileSaver("domains.txt")
Example #27
 def __init__(self, thcnt):
     GenQueries.__init__(self, thcnt)
     self.thread_count = 1
     self._name = "jobui_queries"
     self.no_match_url = FileSaver("not_match_%s.txt" % self._name)
     self.bs2 = FileSaver("failed_urls.txt")
     self.job_url = FileSaver("job_url.txt")
     self.cnt = 0
     self.domains = FileSaver("domains.txt")
     self.start_time = time.time()
Example #28
class LPCVStore(PageStoreBase):
    def __init__(self):
        PageStoreBase.__init__(self, 'cv_liepin', dburl=LPCVConfig.mongdb_url)
        self.testmode = False
        self._not_need_cv_fs = FileSaver(LPCVConfig.NOT_NEED_CV_FN)
        self._not_access_by_qiye = FileSaver(LPCVConfig.NOT_ACCESS_BY_QIYE)

    def extract_content(self):

        cur_content = self.get_cur_doc().cur_content
        if isinstance(cur_content, unicode):
            cur_content = cur_content.encode('utf-8')

        fields = htmlfind.findTag(cur_content, 'table')

        content = ''
        for field in fields:
            if r'所在行业:' in field:
                content = htmlfind.remove_tag(field, True)
                break
            elif r'Industry:' in field:
                print "Ignore..... is English page!"
                self._save_not_need_cv(self.get_cur_doc().cur_jdid)
                break

        if r'抱歉,该简历已经设置为对猎头顾问不开放!' in cur_content:
            print "Ignore..... can not access by lietou"
            return None
        if r'该简历人才已经设置了对企业不开放简历,可能该人才已经找到工作,或者暂时没有换工作的意向。' in cur_content:
            print "Ignore..... can not access by qiye"
            self._not_access_by_qiye.append(self.get_cur_doc().cur_jdid)
            return None

        return content

    def _save_not_need_cv(self, cvId):
        self._not_need_cv_fs.append(cvId)

    def page_time(self):
        try:
            t = htmlfind.findTag(self.get_cur_doc().cur_content, 'div',
                                 'class="resume-info"')
            if not t:
                t = htmlfind.findTag(self.get_cur_doc().cur_content, 'div',
                                     'class="tab"')  #猎头页面
                if not t:
                    return None
                return TimeHandler.fmt_time(t[0])
            return TimeHandler.fmt_time(t[0])
        except Exception as e:
            self._save_not_need_cv(self.get_cur_doc().cur_jdid)

    def check_should_fetch(self, jdid):
        indexUrl = "%s://%s" % (self.channel, jdid)
        return not self.find_any(indexUrl)
Example #29
 def __init__(self, thcnt):
     Spider.__init__(self, thcnt)
     self.request = SessionRequests()
     self.view_state = None
     self.event_valid = None
     self.rand = None
     self.loc = "浙江"
     self.data_file = FileSaver("浙江_data.txt")
     self.have_get_url_file = FileSaver("浙江_get_url.txt")
     self.init_already()
     self.login("38037395", "773950")
Example #30
class GenJobQuery(GenQueries):
    def __init__(self):
        GenQueries.__init__(self)
        self.thread_count = 1
        self._name = "jobui_queries"

    def init_conditions(self):
        GQDataHelper.add(self, 'companyType', companyType)
        GQDataHelper.add(self, 'degreeType', degreeType)
        GQDataHelper.add(self, 'experienceType', experienceType)
        GQDataHelper.add(self, 'industry', industry)
        GQDataHelper.add(self, 'jobType', jobType)
        GQDataHelper.add(self, 'payType', payType)
        GQDataHelper.add(self, 'cityKw', cityKw)
        self.bs2 = FileSaver("failed_urls.txt")

    def need_split(self, params, level, islast):
        url = self.gen_url(params)
        con = self.request_url(url)
        if con is not None:
            #m = re.search(ur"(?:共|多于)<em>(\d+)</em>个职位满足条件", con.text)
            m = re.search(ur'<small class="info">&nbsp;((\d+) 条信息)</small>',
                          con.text)
            if m:
                found = m.group(1).encode('utf-8')
                #print 'get count == ',found
                count = parseInt(found)
                with self.locker:
                    print "[%d] %s ==> %s %s" % (level, url, found, 'failed' if
                                                 (count >= 1000) else '')
                if parseInt(found) >= 1000:
                    return True
                else:
                    return False
        with self.locker:
            self.bs2.append("==failed get con: %s====" % url)
            print "===%s===" % url
            #print con.text
        return False

    def process_failed_url(self, params):
        self.bs2.append(self.gen_url(params))
        return False

    def log_url(self, params):
        url = self.gen_url(params)
        self.fs.append(url)
        return True

    def gen_url(self, p):
        url = "http://www.jobui.com/jobs?jobKw=&cityKw=全国"
        for i in p:
            url = self.compose_url(url, i, p[i])
        return url