Exemple #1
0
def get_cst():
    """Return the current date and time as published by beijing-time.org.

    The page is downloaded and the ``nyear``/``nmonth``/``nday``/``nhrs``/
    ``nmin``/``nsec`` values are parsed out of it.  When the page does not
    contain those fields the local clock (``datetime.datetime.now()``) is
    returned instead.

    NOTE(review): going by the name this is presumably China Standard Time;
    the returned value is a naive ``datetime`` either way.
    """
    from opener import Opener

    pattern = re.compile(r'nyear=(?P<year>\d+).*?nmonth=(?P<month>\d+).*?nday=(?P<day>\d+).*?nwday=(\d+).*?nhrs=(?P<hour>\d+).*?nmin=(?P<minute>\d+).*?nsec=(?P<second>\d+)', re.S)
    page = Opener(encoding='utf8').urlopen('http://www.beijing-time.org/time.asp', times=0)

    match = pattern.search(page)
    if match:
        # Only the named groups end up in groupdict(); the weekday group is
        # unnamed and therefore never reaches the datetime constructor.
        parts = {name: int(digits) for name, digits in match.groupdict().items()}
        return datetime.datetime(**parts)
    return datetime.datetime.now()
class Register_corp:
    """Crawl company records from wsnj.gdgs.gov.cn by enumerating register codes.

    Register codes are generated from an authority code, an enterprise-nature
    number and a serial number (see ``calc_code15``); each candidate code is
    requested from ``corp_url`` and whatever fields the regexes in ``regs``
    extract are stored through ``self.model``.
    """

    def __init__(self):
        logging.basicConfig(filename='log.txt', level=logging.INFO, format='%(asctime)s %(message)s', datefmt='%Y-%m-%d %I:%M:%S %p')
        self.model = CorpModel
        self.invalid_code_model = InvalidCodeModel
        self.max_code_model = MaxCodeModel
        self.info_from = '广东红盾网'
        self.charset = 'gbk'
        self.opener = Opener(encoding=self.charset)
        # Registration authority code; 6 digits.
        self.org_code = '441900'
        # Enterprise nature number; 2 digits, 00~30 is domestic capital,
        # 40~50 is foreign capital.
        self.nature_num = 0
        # Serial number; described as 4 digits (calc_code15 pads it to 6).
        self.ord_num = 1
        self.corp_url = 'http://wsnj.gdgs.gov.cn/aiccps/SearchServlet?service=getEntityInfoByPage&registerNo=%s'
        # Matches a response that consists solely of the "no data" table.
        self.search_text_reg = re.compile(r'^<table width="100%" border="0"><tr><td align=center height=200 >找不到相关的数据\.\.</td></tr></table>$')
        # One regex per extracted field; each contributes its named group.
        self.regs = [
            re.compile(r'<td align=left width=100% colspan=6 height=25>(?P<name>[^<]+)', re.S),
            re.compile(r'址:</td><td align=left valign=top colspan=5>(?P<addr>[^<]*)', re.S),
            re.compile(r'号:</td><td align=left valign=top><font color="red">(?P<register_code>[^<]*)', re.S),
            re.compile(r'[名人]:</td><td align=left valign=top colspan=3>(?P<representative>[^<]*)', re.S),
            re.compile(r'型:</td><td align=left valign=top>(?P<nature>[^<]*)', re.S),
            re.compile(r'限:</td><td align=left valign=top colspan=3>(?P<period>[^<]*)', re.S),
            re.compile(r'本:</td><td align=left valign=top>(?P<capital>[^<]*)', re.S),
            re.compile(r'关:</td><td align=left valign=top colspan=3>(?P<register_department>[^<]*)', re.S),
            re.compile(r'态:</td><td align=left valign=top>(?P<status>[^<]*)', re.S),
            re.compile(r'期:</td><td align=left valign=top colspan=3>(?P<establishment_data>[^<]+)', re.S),
            re.compile(r'围:</td><td align=left valign=top colspan=5>(?P<scope>[^<]*)', re.S),
        ]
        # Counts save() calls modulo 101 so a commit is issued periodically.
        self._save_times = 0
        self._today = get_cst()

    def _msg(self, text):
        """Return ``text`` prefixed with the current ``time.asctime()``."""
        return '%s %s' % (time.asctime(), text)

    def _process(self, corp_dict):
        """Stamp a scraped record with bookkeeping fields before it is stored.

        Adds ``insert_date`` and ``info_from`` and, when a non-empty
        ``establishment_data`` string is present, converts it through
        ``self.model._str2date``.  The dict is modified in place and returned.
        """
        corp_dict['insert_date'] = self._today
        corp_dict['info_from'] = self.info_from
        if corp_dict.get('establishment_data'):
            corp_dict['establishment_data'] = self.model._str2date(corp_dict['establishment_data'])
        return corp_dict

    def calc_code15(self, org_code, nature_num, ord_num):
        """Return the 15-character register code including its check digit.

        The first 14 characters are ``org_code``, ``nature_num`` zero-padded
        to 2 digits and ``ord_num`` zero-padded to 6 digits.  The check digit
        is folded over those characters; the recurrence appears to follow the
        ISO 7064 MOD 11,10 scheme.
        """
        code14 = '%s%02d%06d' % (org_code, nature_num, ord_num)
        acc = 10
        for digit in code14:
            # A zero remainder is replaced by 10 before doubling.
            acc = ((acc + int(digit)) % 10 or 10) * 2 % 11
        return '%s%s' % (code14, (11 - acc) % 10)

    def calc_code13(self, org_code, nature_num, ord_num):
        """Return the short-form ("13-digit") register code.

        NOTE(review): ``nature_num`` is inserted unpadded and ``ord_num`` is
        padded to 4 digits, so the result is only 13 characters long for
        suitably sized inputs -- confirm the intended layout.
        """
        return '%s%s%04d' % (org_code, nature_num, ord_num)

    def init_invalid_codes(self):
        """Record every code below each known maximum that has no company.

        For every row of ``max_code_model`` all serial numbers from 0 up to
        (excluding) the stored maximum are turned into 15-character codes;
        those not found in ``self.model`` are added to ``invalid_code_model``.
        A commit is issued after every 200 additions and once at the end.
        """
        added = 0
        for query_obj in self.max_code_model.get_all():
            org_code, nature_num, max_ord_num = query_obj.org_code, query_obj.nature_num, query_obj.ord_num
            for ord_num in range(max_ord_num):
                register_code = self.calc_code15(org_code, nature_num, ord_num)
                if self.model.exists_by(register_code=register_code):
                    continue
                self.invalid_code_model.add({'register_code': register_code}, is_commit=False)
                added += 1
                if added % 200 == 0:
                    # presumably all models share one session, so committing
                    # through self.model flushes the invalid codes too.
                    self.model.commit()
                    print('Save 200 invalid codes!')
        self.model.commit()

    def _parse_corp(self, content):
        """Collect the named groups of every regex in ``regs`` that matches."""
        fields = {}
        for reg in self.regs:
            match = reg.search(content)
            if match:
                fields.update(match.groupdict())
        return fields

    def fetch(self, code):
        """Download and parse the record for ``code``.

        Returns a list holding one dict of extracted fields, or an empty list
        when the site reports no data or when none of the field regexes
        matched (the latter case is also written to the log).
        """
        print('###############################################################')
        print(self._msg('注册码: %s' % code))
        content = self.opener.urlopen(self.corp_url % code, timeout=10, times=0)
        if self.search_text_reg.match(content):
            print('没找到相关信息.')
            return []
        corp = self._parse_corp(content)
        print(corp)
        if not corp:
            # Nothing could be extracted: log the code instead of handing an
            # empty record to save().
            logging.info('register code: %s', code)
            return []
        return [corp]

    def save(self, register_code):
        """Fetch ``register_code`` and queue any record found for storage.

        Returns True when at least one record was added, False otherwise.
        Every 101st call commits the pending work before fetching.
        """
        self._save_times = (self._save_times + 1) % 101
        if not self._save_times:
            self.model.commit()
        corps = self.fetch(register_code)
        if not corps:
            return False
        for corp in corps:
            self.model.add(self._process(corp), is_commit=False)
            print('添加成功!')
        print('###############################################################')
        return True

    def action(self):
        """Enumerate codes starting from the instance's current position.

        After 500 consecutive misses the serial number restarts at 0 and the
        nature number switches from the domestic range to 40; a further 500
        misses at nature >= 40 moves on to the next authority code.  The loop
        stops once the authority code reaches 440200.
        """
        invalid_times = 0
        while self.nature_num <= 99:
            if self.save(self.calc_code15(self.org_code, self.nature_num, self.ord_num)):
                invalid_times = 0
            else:
                invalid_times += 1
            self.ord_num += 1
            if invalid_times >= 500:
                invalid_times = 0
                self.ord_num = 0
                if self.nature_num >= 40:
                    self.nature_num = 0
                    self.org_code = str(int(self.org_code) + 1)
                    if int(self.org_code) >= 440200:
                        break
                else:
                    self.nature_num = 40
                logging.info('org_code: %s nature code: %s', self.org_code, self.nature_num)
        self.model.commit()

    def action_test(self):
        """Placeholder; does nothing."""
        pass

    def action_new(self, invalid_times=20):
        """Continue past each stored maximum serial number.

        For every row of ``max_code_model`` the serial numbers following the
        stored maximum are tried until ``invalid_times`` consecutive misses
        occur; each hit updates the row's ``ord_num``.
        """
        for query_obj in self.max_code_model.get_all():
            org_code, nature_num, ord_num = query_obj.org_code, query_obj.nature_num, query_obj.ord_num + 1
            times = 0
            while times < invalid_times:
                if self.save(self.calc_code15(org_code, nature_num, ord_num)):
                    times = 0
                    query_obj.ord_num = ord_num
                    # Guard against the ord_num update above being left
                    # uncommitted in some edge case.
                    if not self._save_times:
                        self.model.commit()
                else:
                    times += 1
                ord_num += 1
            self.model.commit()

    def action_from_invalid_codes(self):
        """Retry every stored invalid code and delete those that now resolve."""
        for query_obj in self.invalid_code_model.get_all():
            if self.save(query_obj.register_code):
                query_obj.delete()
        self.model.commit()

    def action_from_file(self):
        """Save every register code listed one per line in ``others.txt``.

        Line terminators are stripped and blank lines are skipped.
        """
        with open('others.txt') as f:
            for line in f:
                # Strip only the terminator so a final line without a newline
                # keeps its last character.
                code = line.rstrip('\r\n')
                if code:
                    self.save(code)
        self.model.commit()

    def report(self):
        """Export today's records with status '登记成立' to a GBK-encoded CSV."""
        corps = self.model.filter_by(status='登记成立', insert_date=datetime.date.today())
        # (column header, model attribute) pairs in output order.
        fields = (
            ('名称', 'name'),
            ('注册码', 'register_code'),
            ('地址', 'addr'),
            ('经营范围', 'scope'),
            ('注册资金', 'capital'),
            ('成立日期', 'establishment_data'),
            ('企业性质', 'nature'),
            ('法人', 'representative'),
            ('企业状态', 'status'),
            ('期限', 'period'),
            ('登记单位', 'register_department'),
            ('信息来源', 'info_from'),
            ('更新日期', 'insert_date'),
        )
        self.model.report('东莞红盾网最新注册公司信息_%s.csv' % time.strftime('%Y-%m-%d'), fields=fields, rows=corps, encoder='gbk')
Exemple #3
0
class Corp(multiprocessing.Process):
    """Worker process that scrapes a paginated company listing.

    Each list page is fetched and split into per-company dicts with
    ``corplist_reg``; companies not already known to ``self.model`` are
    optionally enriched from their detail page (``corp_regs``) and then put
    on the queue supplied through ``set_queue``.  A final ``None`` is queued
    when the run is over.
    """

    def __init__(self, corplist_url, corp_url, info_from, corplist_post_data=None, corp_post_data=None, corplist_reg=None, corp_regs=None, timeout=5, commit_each_times=30, has_cookie=True, charset='utf8', model=None):
        """Configure the scraper.

        ``corplist_url`` and ``corp_url`` are ``str.format`` templates using
        placeholders such as ``{0}``, ``{1}``.  ``corp_regs`` defaults to an
        empty list (no detail-page scraping); ``model`` defaults to
        ``lib.models.CorpModel``.
        """
        super().__init__()
        self.charset = charset
        self.info_from = info_from
        self.corplist_url = corplist_url
        self.corp_url = corp_url
        self.opener = Opener(has_cookie=has_cookie, encoding=self.charset)

        self.corplist_post_data = corplist_post_data
        self.corp_post_data = corp_post_data
        self.corplist_reg = corplist_reg
        # A fresh list per instance avoids sharing one default object.
        self.corp_regs = [] if corp_regs is None else corp_regs
        self.commit_each_times = commit_each_times
        self.timeout = timeout

        if model:
            self.model = model
        else:
            from lib.models import CorpModel
            self.model = CorpModel

        self._today = datetime.date.today()

    def _msg(self, msg=''):
        """Write ``msg`` to the log at INFO level."""
        logging.info(msg)

    def set_queue(self, queue):
        """Attach the queue that ``run`` puts finished records on."""
        self.queue = queue

    def process_corp_info(self, corp_info, date_reg=r'(?P<year>\d+)-(?P<month>\d+)-(?P<day>\d+)'):
        """Normalise a scraped record in place and return it.

        String values are stripped of surrounding whitespace.  A scraped
        ``insert_date`` is converted with ``self.model._str2date`` using
        ``date_reg``; when absent, today's date is filled in.
        """
        for key, value in corp_info.items():
            # Groups that did not take part in a match yield None; leave
            # non-string values untouched instead of failing on .strip().
            if isinstance(value, str):
                corp_info[key] = value.strip()
        if 'insert_date' in corp_info:
            corp_info['insert_date'] = self.model._str2date(corp_info['insert_date'], date_reg=date_reg)
        else:
            corp_info['insert_date'] = self._today
        return corp_info

    def get_next_page_url(self):
        """Return the list-page URLs to visit.

        Must be a plain (non-coroutine) generator or return an iterable.
        The default yields just ``corplist_url``.
        """
        # One-element tuple: a bare string would be iterated per character.
        return (self.corplist_url,)

    def get_corp_url(self, corp_info=None):
        """Return ``corp_url`` formatted with the fields of ``corp_info``."""
        return self.corp_url.format(**(corp_info or {}))

    def prepare(self):
        """Hook called at the start of ``run``; does nothing by default."""
        pass

    def fetch_corplist(self, page_url):
        """Fetch a list page and return an iterable of company-info dicts.

        One dict (the named groups of ``corplist_reg``) is produced per match
        found in the page; no matches means an empty iterable.
        """
        content = self.opener.urlopen(page_url, data=self.corplist_post_data, timeout=self.timeout, times=0)
        return (match.groupdict() for match in self.corplist_reg.finditer(content))

    def fetch_corp(self, corp_info=None):
        """Fetch a company's detail page and merge its fields into ``corp_info``.

        ``corp_info`` is updated in place with the named groups of every
        regex in ``corp_regs`` that matches, and returned.  ``None`` is
        treated as an empty dict.
        """
        if corp_info is None:
            corp_info = {}
        corp_url = self.get_corp_url(corp_info)
        content = self.opener.urlopen(corp_url, data=self.corp_post_data, timeout=self.timeout, times=0)
        for reg in self.corp_regs:
            match = reg.search(content)
            if match:
                corp_info.update(match.groupdict())
        return corp_info

    def before_save(self, corp_info):
        """Normalise ``corp_info`` and tag it with this scraper's source name."""
        corp_info = self.process_corp_info(corp_info)
        corp_info['info_from'] = self.info_from
        return corp_info

    def commit(self):
        """Commit pending changes through the model."""
        self.model.commit()

    @coroutine
    def check_exists(self):
        """Coroutine answering whether a company is already known.

        Send it a company-info dict; it yields the ``info_from`` of the
        existing record (or of the earlier sighting in this run) when the
        name is already known, otherwise ``None``.  Recently seen names are
        kept in a small cache bounded by ``commit_each_times`` entries.
        """
        corp_names_cache = {}
        # Newest name at the front, oldest at the back (eviction order).
        corp_names_cache_list = []
        cache_length = 0
        result = None
        while 1:
            corp_info = (yield result)
            result = None
            corp_name = corp_info['name'].strip()
            if corp_name not in corp_names_cache:
                corp_names_cache[corp_name] = self.info_from
                corp_names_cache_list.insert(0, corp_name)
                cache_length += 1
                if cache_length > self.commit_each_times:
                    del corp_names_cache[corp_names_cache_list.pop()]
                    cache_length -= 1
                exists_corp = self.model.filter_by(name=corp_name).first()
                if exists_corp:
                    result = exists_corp.info_from
                    corp_names_cache[corp_name] = result
            else:
                result = corp_names_cache[corp_name]

    def run(self):
        """Process entry point: scrape every page and queue new companies."""
        self.prepare()
        # presumably @coroutine primes the generator so send() works at once.
        check_exists = self.check_exists()
        for page_no, page_url in enumerate(self.get_next_page_url(), 1):
            print('\n%s 第%s页' % (self.info_from, page_no))
            for corp_info in self.fetch_corplist(page_url):
                self._msg('***************************************************')
                print(corp_info['name'], end=' ')
                info_from = check_exists.send(corp_info)
                if not info_from:
                    if self.corp_regs:
                        corp_info = self.fetch_corp(corp_info)
                    corp_info = self.before_save(corp_info)
                    self.queue.put(corp_info)
                    print('保存成功!')
                else:
                    print('已经存在于: %s' % info_from)
        self._msg('\n%s 抓取完毕!' % self.info_from)
        # Sentinel telling the consumer this producer is done.
        self.queue.put(None)

    def report(self, fields=None):
        """Export today's records from this source to a GBK-encoded CSV.

        ``fields`` is a sequence of (column header, attribute) pairs; a
        default column set is used when omitted.
        """
        corps = self.model.filter_by(info_from=self.info_from, insert_date=datetime.date.today())
        fields = fields or (
            ('名称', 'name'),
            ('地址', 'addr'),
            ('联系人', 'contact_person'),
            ('区号', 'contact_tel_code'),
            ('电话号码', 'contact_tel_no'),
            ('邮箱', 'mail'),
            ('网址', 'website'),
            ('信息来源', 'info_from'),
            ('更新日期', 'insert_date'),
            ('链接', self.corp_url),
        )
        self.model.report('%s最新公司信息_%s.csv' % (self.info_from, time.strftime('%Y-%m-%d')), fields=fields, rows=corps, encoder='gbk')
class Register_corp:
    """Crawl company records from www.gzaic.gov.cn by enumerating register codes.

    The site is queried through a form post that carries ``__VIEWSTATE`` and
    ``__EVENTVALIDATION`` values scraped from the previous response; a hit
    yields a ``key`` that addresses the company's detail page, which is then
    parsed with ``self.reg`` and stored through ``self.model``.
    """

    def __init__(self):
        logging.basicConfig(filename='log.txt', level=logging.INFO, format='%(asctime)s %(message)s', datefmt='%Y-%m-%d %I:%M:%S %p')
        self.model = CorpModel
        self.invalid_code_model = InvalidCodeModel
        self.max_code_model = MaxCodeModel
        self.info_from = '广东红盾网'
        self.charset = 'utf8'
        self.opener = Opener(has_cookie=True, encoding=self.charset)
        # Registration authority code; 6 digits.
        self.org_code = '440101'
        # Enterprise nature number; 2 digits, 00~30 is domestic capital,
        # 40~50 is foreign capital.
        self.nature_num = 0
        # Serial number; described as 4 digits (calc_code15 pads it to 6).
        self.ord_num = 0
        self.query_url = 'http://www.gzaic.gov.cn/gsbm/FrmRegBaseopeninfo.aspx'
        self.corp_url = 'http://www.gzaic.gov.cn/gsbm/FrmRegBaseOpeninfoDetail.aspx?key={0}'
        # Text whose presence in the query response signals a hit.
        self.search_string = '查看'
        self.reg = re.compile(r'lbSREGNO">(?P<register_code>[^<]*)</span></td>.*?lbSNAME">(?P<name>[^<]*)</span></td>.*?lbSDOM">(?P<addr>[^<]*)</span></td>.*?lbSMAN">(?P<representative>[^<]*)</span></td>.*?lbSENTCLASS">\s*(?P<nature>[^<]*)</span></td>.*?lbSREGCAP">(?P<capital>[^<]*)</span></td>.*?lbSSSZB">[^<]*</span></td>.*?lbSOPSCOPE">(?P<scope>[^<]*)</span></td>.*?LbSREGRDATE">(?P<establishment_data>[^<]*)</span></td>.*?lbSAPPRDATE">[^<]*</span></td>.*?lbSREGORG">(?P<register_department>[^<]*)</span></td>', re.S)
        # Extract the hidden form fields that must be echoed back on a post.
        self.event_regs = [
            re.compile(r'__VIEWSTATE" value="(?P<__VIEWSTATE>[^"]*)'),
            re.compile(r'__EVENTVALIDATION" value="(?P<__EVENTVALIDATION>[^"]*)'),
        ]
        self.key_reg = re.compile(r'key=(?P<key>[^\']*)')
        self.post_data = {
            '__EVENTTARGET': 'QueryButton',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': '',
            '__EVENTVALIDATION': '',
            'txtName': '',
            'txtReg': '',
        }

        # Counts save() calls modulo 101 so a commit is issued periodically.
        self._save_times = 0
        self._today = get_cst()

        self.initial()

    def initial(self):
        """Load the query page once so the hidden form fields are captured."""
        self.fetch_content(self.query_url)

    def _msg(self, text):
        """Return ``text`` prefixed with the current ``time.asctime()``."""
        return '%s %s' % (time.asctime(), text)

    def _process(self, corp_dict):
        """Stamp a scraped record with bookkeeping fields before it is stored.

        Sets ``insert_date``, ``info_from``, a fixed ``status`` and ``period``
        and, when a non-empty ``establishment_data`` string is present,
        converts it through ``self.model._str2date``.  The dict is modified
        in place and returned.
        """
        corp_dict.update({
            'insert_date': self._today,
            'info_from': self.info_from,
            'status': '登记成立',
            'period': '长期',
        })
        if corp_dict.get('establishment_data'):
            corp_dict['establishment_data'] = self.model._str2date(corp_dict['establishment_data'])
        return corp_dict

    def calc_code15(self, org_code, nature_num, ord_num):
        """Return the 15-character register code including its check digit.

        The first 14 characters are ``org_code``, ``nature_num`` zero-padded
        to 2 digits and ``ord_num`` zero-padded to 6 digits.  The check digit
        is folded over those characters; the recurrence appears to follow the
        ISO 7064 MOD 11,10 scheme.
        """
        code14 = '%s%02d%06d' % (org_code, nature_num, ord_num)
        acc = 10
        for digit in code14:
            # A zero remainder is replaced by 10 before doubling.
            acc = ((acc + int(digit)) % 10 or 10) * 2 % 11
        return '%s%s' % (code14, (11 - acc) % 10)

    def update_events(self, content):
        """Copy the hidden form fields found in ``content`` into ``post_data``."""
        for reg in self.event_regs:
            match = reg.search(content)
            if match:
                self.post_data.update(match.groupdict())

    def fetch_content(self, url, data=None, timeout=10, raw_string=False, update_events=True):
        """Request ``url`` and return the response text.

        When ``update_events`` is true the hidden form fields in the response
        are stored for the next post.  ``raw_string`` is accepted but unused.
        """
        content = self.opener.urlopen(url, data, timeout=timeout, times=0)
        if update_events:
            self.update_events(content)
        return content

    def fetch_query_content(self, code):
        """Post a search for register code ``code`` and return the response."""
        self.post_data.update({
            '__EVENTTARGET': 'QueryButton',
            'txtReg': code,
        })
        return self.fetch_content(self.query_url, self.post_data)

    def fetch_key_content(self):
        """Post the click on the first result row's link and return the response."""
        self.post_data['__EVENTTARGET'] = 'GridView1$ctl02$LinkButton1'
        return self.fetch_content(self.query_url, self.post_data)

    def fetch_corp_key(self, code):
        """Return the detail-page key for ``code``, or None when not found.

        None is returned both when the search has no hit and when the
        follow-up response does not contain a ``key=`` value.
        """
        content = self.fetch_query_content(code)
        if self.search_string not in content:
            return None
        content = self.fetch_key_content()
        match = self.key_reg.search(content)
        return match.group('key') if match else None

    def fetch_corp_info(self, code):
        """Look up ``code`` and return a list with its parsed record.

        The list is empty when the code is unknown or the detail page could
        not be parsed; the latter case is also written to the log.
        """
        print('###############################################################')
        print(self._msg('注册码: %s' % code))
        corp_key = self.fetch_corp_key(code)
        if corp_key is None:
            print('没找到相关信息.')
            return []
        print('key: %s' % corp_key)
        # The detail page is a plain GET; keep the stored form state intact.
        content = self.fetch_content(self.corp_url.format(corp_key), update_events=False)
        result = []
        match = self.reg.search(content)
        if match:
            fields = match.groupdict()
            if fields:
                result.append(fields)
            print(fields)
        if not result:
            logging.info('register code: %s', code)
        return result

    def save(self, register_code):
        """Fetch ``register_code`` and queue any record found for storage.

        Returns True when at least one record was added, False otherwise.
        Every 101st call commits the pending work before fetching.
        """
        self._save_times = (self._save_times + 1) % 101
        if not self._save_times:
            self.model.commit()
        corps = self.fetch_corp_info(register_code)
        if not corps:
            return False
        for corp in corps:
            self.model.add(self._process(corp), is_commit=False)
            print('添加成功!')
        print('###############################################################')
        return True

    def action(self):
        """Enumerate codes starting from the instance's current position.

        After 500 consecutive misses the serial number restarts at 0 and the
        nature number switches from the domestic range to 40; a further 500
        misses at nature >= 40 moves on to the next authority code.  The loop
        stops once the authority code reaches 440200.
        """
        invalid_times = 0
        while self.nature_num <= 99:
            if self.save(self.calc_code15(self.org_code, self.nature_num, self.ord_num)):
                invalid_times = 0
            else:
                invalid_times += 1
            self.ord_num += 1
            if invalid_times >= 500:
                invalid_times = 0
                self.ord_num = 0
                if self.nature_num >= 40:
                    self.nature_num = 0
                    self.org_code = str(int(self.org_code) + 1)
                    if int(self.org_code) >= 440200:
                        break
                else:
                    self.nature_num = 40
                logging.info('org_code: %s nature code: %s', self.org_code, self.nature_num)
        self.model.commit()

    def action_test(self):
        """Placeholder; does nothing."""
        pass

    def action_new(self, invalid_times=10):
        """Continue past each stored maximum serial number.

        For every row of ``max_code_model`` the serial numbers following the
        stored maximum are tried until ``invalid_times`` consecutive misses
        occur; each hit updates the row's ``ord_num``.
        """
        for query_obj in self.max_code_model.get_all():
            org_code, nature_num, ord_num = query_obj.org_code, query_obj.nature_num, query_obj.ord_num + 1
            times = 0
            while times < invalid_times:
                if self.save(self.calc_code15(org_code, nature_num, ord_num)):
                    times = 0
                    query_obj.ord_num = ord_num
                    # Guard against the ord_num update above being left
                    # uncommitted in some edge case.
                    if not self._save_times:
                        self.model.commit()
                else:
                    times += 1
                ord_num += 1
            self.model.commit()

    def action_from_invalid_codes(self):
        """Retry every stored invalid code and delete those that now resolve."""
        for query_obj in self.invalid_code_model.get_all():
            if self.save(query_obj.register_code):
                query_obj.delete()
        self.model.commit()

    def action_from_file(self):
        """Save every register code listed one per line in ``others.txt``.

        Line terminators are stripped and blank lines are skipped.
        """
        with open('others.txt') as f:
            for line in f:
                # Strip only the terminator so a final line without a newline
                # keeps its last character.
                code = line.rstrip('\r\n')
                if code:
                    self.save(code)
        self.model.commit()

    def report(self):
        """Export today's records with status '登记成立' to a GBK-encoded CSV."""
        corps = self.model.filter_by(status='登记成立', insert_date=datetime.date.today())
        # (column header, model attribute) pairs in output order.
        fields = (
            ('名称', 'name'),
            ('注册码', 'register_code'),
            ('地址', 'addr'),
            ('经营范围', 'scope'),
            ('注册资金', 'capital'),
            ('成立日期', 'establishment_data'),
            ('企业性质', 'nature'),
            ('法人', 'representative'),
            ('企业状态', 'status'),
            ('期限', 'period'),
            ('登记单位', 'register_department'),
            ('信息来源', 'info_from'),
            ('更新日期', 'insert_date'),
        )
        self.model.report('广州红盾网最新注册公司信息_%s.csv' % time.strftime('%Y-%m-%d'), fields=fields, rows=corps, encoder='gbk')