Ejemplo n.º 1
0
    def parse_info(self, detail_url, guid, pro_name, pro_date,
                   registration_num):
        """Look up the record for ``guid`` and store one project row.

        Sends a POST whose body is the bracketed, double-quoted guid followed
        by the current time in milliseconds to the TrueLore Ajax endpoint,
        pulls two values out of the raw response text with regular
        expressions and passes them, together with the caller-supplied
        fields, to ``ProjectModel().insert_project``.

        Args:
            detail_url: Detail-page URL; printed and stored as the row's
                original URL.
            guid: Identifier embedded in the POST body.
            pro_name: Project name, stored unchanged.
            pro_date: Project date, stored unchanged.
            registration_num: Registration number, stored unchanged.

        Side effects:
            Rebinds the module-level names ``ar_name`` and ``com_name`` and
            performs one HTTP request and one ``insert_project`` call.
        """
        # NOTE(review): kept global because code outside this method may read
        # these names; make them local if nothing else does.
        global ar_name, com_name
        ar_name = ''
        com_name = ''
        duration = ''
        money = str(0)
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
            'Ajax-method': 'GetZBJGGSHXRNewByGuid',
            'Connection': 'keep-alive',
            # Hard-coded: equals the body length only for a 36-character
            # guid plus a 13-digit millisecond timestamp (2 + 36 + 2 + 13).
            'Content-Length': '53',
            'Content-Type': 'text/plain; charset=UTF-8',
            'Cookie': 'ASP.NET_SessionId=xdoj1k24jseqj1l4dnutjdf2',
            'Host': 'www.kmggzy.com',
            'Origin': 'https://www.kmggzy.com',
            'Referer':
            'https://www.kmggzy.com/Jyweb/ZBJGGSNewView2.aspx?isBG=0&guid=71be0490-2e7c-4ce1-8663-4a71cf95112c&subType2=11&subType=1&type=%E4%BA%A4%E6%98%93%E4%BF%A1%E6%81%AF&area=1&zbtype=0',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36',
        }
        timestamp_ms = int(time.time() * 1000)
        p_data = '["' + guid + '"]' + str(timestamp_ms)
        url = 'https://www.kmggzy.com/TrueLoreAjax/TrueLore.Web.WebUI.WebAjaxService,TrueLore.Web.WebUI.ashx'
        print(detail_url)
        res = requests.post(url, data=p_data, headers=headers)

        # The SECOND match (index 1) of each pattern is the value that gets
        # stored.  Require two matches before indexing: the previous check
        # only tested for "any match" and raised IndexError when the response
        # held a single one.
        body = res.content
        company_hits = re.findall(',1:"(.*?)",', body)
        if len(company_hits) > 1:
            com_name = company_hits[1]
            person_hits = re.findall(',3:"(.*?)",', body)
            if len(person_hits) > 1:
                ar_name = person_hits[1]

        ProjectModel().insert_project(
            self.name, com_name, ar_name, pro_name, pro_date, money,
            self.source, detail_url, duration, registration_num,
            self.ch_area, self.ch_city, self.ch_region, self.en_area,
            self.en_city, self.en_region, self.crawler_id, self.spider_id,
            self.headers['User-Agent'])
Ejemplo n.º 2
0
    def parse_info(self, response):
        """Parse the result table of a detail page and store one project row.

        Reads every ``<tr>`` of the first table inside ``<div class="con">``,
        picks the text that follows a set of fixed labels (section number,
        section name, winning bidder, project manager, winning price) and
        passes the collected values to ``ProjectModel().insert_project``.
        The row is inserted even when nothing was found; the fields are then
        empty strings.

        Args:
            response: Response object exposing ``url``, ``body`` and a
                ``meta`` mapping with the keys ``pro_date``, ``ch_region``
                and ``en_region``.

        Side effects:
            Rebinds the module-level names listed in the ``global``
            statement and performs one ``insert_project`` call.
        """
        # NOTE(review): kept global because code outside this method may read
        # these names; make them local if nothing else does.
        global registration_num, pro_name, com_name, ar_name, money, tr_list
        money = ''
        tr_list = []
        registration_num = ''
        pro_name = ''
        com_name = ''
        ar_name = ''
        duration = ''
        original_url = response.url
        pro_date = response.meta['pro_date']
        ch_region = response.meta['ch_region']
        en_region = response.meta['en_region']
        soup = BeautifulSoup(response.body, 'lxml')

        # Look the container up once and tolerate a container without a
        # table (previously an AttributeError on None).
        container = soup.find('div', class_='con')
        table = container.find('table') if container is not None else None
        if table is not None:
            tr_list = table.find_all('tr')

        for tr in tr_list:
            tr_content = tr.text
            # Rows carrying the remarks label are ignored entirely.
            if '备注说明:' in tr_content:
                continue
            # Section number
            if '标段编号:' in tr_content:
                registration_num = tr_content.split('标段编号:')[1].strip()
            # Project name
            if '标段名称:' in tr_content:
                pro_name = tr_content.split('标段名称:')[1].strip()
            # Winning company
            if '中标人:' in tr_content:
                com_name = tr_content.split('中标人:')[1].strip()
            # Project lead: take the text after the label up to the next
            # colon, so a colon earlier in the row cannot shift the result.
            if '项目经理:' in tr_content:
                ar_name = tr_content.split('项目经理:')[1].split(
                    ':')[0].strip()
            # Project amount; anything from the unit marker onwards is cut.
            if '中标价:' in tr_content:
                money = tr_content.split('中标价:')[1].strip()
                if '万元' in money:
                    money = money.split('万元')[0]

        # Placeholder values mean "no project lead".
        if ar_name in ('无', '/'):
            ar_name = ''
        ProjectModel().insert_project(self.name, com_name, ar_name, pro_name,
                                      pro_date, money, self.source,
                                      original_url, duration, registration_num,
                                      self.ch_area, self.ch_city, ch_region,
                                      self.en_area, self.en_city, en_region,
                                      self.crawler_id, self.spider_id,
                                      self.headers['User-agent'])
Ejemplo n.º 3
0
    def parse_info(self, response):
        """Parse one of two page layouts and store the project row(s).

        Layout 1 (``<div class="con">``): every table row is scanned for a
        fixed set of labels; one row is inserted after the scan, and only
        when a company name was found.

        Layout 2 (``<div class="news-layout">``, used only when layout 1 is
        absent): rows are scanned for name / candidate / project-manager
        labels, and an insert is issued after every row for which the
        company name is currently non-empty.

        Args:
            response: Response object exposing ``url``, ``body`` and a
                ``meta`` mapping with the keys ``pro_date``, ``ch_region``
                and ``en_region``.

        Side effects:
            Rebinds the module-level names listed in the ``global``
            statement and calls ``ProjectModel().insert_project`` zero or
            more times.
        """
        # NOTE(review): kept global because code outside this method may read
        # these names; make them local if nothing else does.
        global registration_num, pro_name, com_name, ar_name, money, tr_list
        money = ''
        tr_list = []
        registration_num = ''
        pro_name = ''
        com_name = ''
        ar_name = ''
        duration = ''
        original_url = response.url
        pro_date = response.meta['pro_date']
        ch_region = response.meta['ch_region']
        en_region = response.meta['en_region']
        soup = BeautifulSoup(response.body, 'lxml')

        def store():
            # Single place for the insert call; reads the current values.
            ProjectModel().insert_project(
                self.name, com_name, ar_name, pro_name, pro_date, money,
                self.source, original_url, duration, registration_num,
                self.ch_area, self.ch_city, ch_region, self.en_area,
                self.en_city, en_region, self.crawler_id, self.spider_id,
                self.headers['User-agent'])

        def rows_of(container):
            # A container without a table yields no rows instead of raising
            # AttributeError on None.
            table = container.find('table')
            return table.find_all('tr') if table is not None else []

        con_div = soup.find('div', class_='con')
        if con_div is not None:
            tr_list = rows_of(con_div)
            for tr in tr_list:
                tr_content = tr.text
                # Rows carrying the remarks label are ignored entirely.
                if '备注说明:' in tr_content:
                    continue
                # Section number
                if '标段编号:' in tr_content:
                    registration_num = tr_content.split('标段编号:')[1].strip()
                # Project name; trailing text starting at the tender-document
                # marker is dropped.
                elif '标段名称:' in tr_content:
                    pro_name = tr_content.split('标段名称:')[1].strip()
                    if '招标文件' in pro_name:
                        pro_name = pro_name.split('招标文件')[0].strip()
                # Winning company
                elif '中标人:' in tr_content:
                    com_name = tr_content.split('中标人:')[1].strip()
                # Project lead: text after the label up to the next colon,
                # so a colon earlier in the row cannot shift the result.
                elif '项目经理:' in tr_content:
                    ar_name = tr_content.split('项目经理:')[1].split(
                        ':')[0].strip()
                # Project amount; anything from the unit marker on is cut.
                elif '中标价:' in tr_content:
                    money = tr_content.split('中标价:')[1].strip()
                    if '万元' in money:
                        money = money.split('万元')[0]
            if com_name != '':
                store()
            return

        news_div = soup.find('div', class_='news-layout')
        if news_div is None:
            return

        tr_list = rows_of(news_div)
        for tr in tr_list:
            tc = tr.text
            # Project name
            if '项目名称' in tc or '工程名称' in tc:
                pro_name = tc.split('名称')[1].strip()
                if ':' in pro_name:
                    pro_name = pro_name.split(':')[1].strip()
            # Winning company (candidate row)
            elif '中标候选人' in tc:
                com_name = tc.split('中标候选人')[1].strip()
                if '无' in com_name:
                    com_name = ''
                if '得分' in com_name:
                    com_name = com_name.split('得分')[0].strip()
                if ':' in com_name:
                    com_name = com_name.split(':')[1].strip()
                if ';' in com_name:
                    com_name = com_name.split(';')[0].strip()
            # Project lead: text after the label up to the next colon, minus
            # a trailing "manager number" caption.
            elif '项目经理:' in tc:
                ar_name = tc.split('项目经理:')[1].split(':')[0].split(
                    '项目经理编号')[0].strip()

            # NOTE(review): this insert runs once per row, so a page yields
            # several rows with partially filled fields.  It may be an
            # indentation slip (layout 1 inserts once, after the loop), but
            # a later candidate row can reset com_name to '', so moving it
            # would change which candidates get stored.  Left as is; confirm
            # the intended behaviour.
            if com_name != '':
                store()
Ejemplo n.º 4
0
    def parse_info(self, response):
        """Extract project fields from a detail page and store one row.

        The raw page body is searched with a series of label-anchored
        regular expressions (and, for the company name, two XPath lookups)
        to obtain the registration number, winning company, price, duration
        and project lead.  The results, together with ``pro_date`` and
        ``pro_name`` from ``response.meta``, are passed to
        ``ProjectModel().insert_project``.

        Company lookup order: four labelled regex patterns, then two table
        XPaths, then six further regex patterns.  The first source that
        matches decides the result, even if cleaning reduces it to ``""``;
        later sources are not tried in that case.

        Args:
            response: Response object exposing ``url``, ``body``, ``xpath``
                and a ``meta`` mapping with the keys ``pro_date`` and
                ``pro_name``.

        Side effects:
            Rebinds the module-level names listed in the ``global``
            statement and performs one ``insert_project`` call.
        """
        # NOTE(review): kept global because code outside this method may read
        # these names; make them local if nothing else does.
        global pro_name, com_name, ar_name, money, registration_num
        original_url = response.url
        pro_date = response.meta['pro_date']
        pro_name = response.meta['pro_name']
        web_data = response.body

        def first_group(pattern):
            # group(1) of the pattern matched against the page, else None.
            # Each pattern is evaluated once instead of once to test and
            # once more to extract.
            found = re.match(pattern, web_data, re.DOTALL)
            return found.group(1) if found else None

        def squeeze(fragment, drop_slash=False):
            # Strip markup, then remove line breaks, tabs, spaces,
            # (optionally) slashes and colons.
            text = remove_tags(fragment).strip()
            for junk in ("\n", "\r", "\t", " "):
                text = text.replace(junk, "")
            if drop_slash:
                text = text.replace("/", "")
            return text.replace(":", "")

        def tidy_company(fragment):
            # Cut the name right after the first organisation suffix, keep
            # only what follows the last double quote, and discard values
            # that still contain bidding captions.  The company-suffix cut
            # is applied to every regex source for consistency.
            name = squeeze(fragment, drop_slash=True)
            for suffix in ("公司", "院", "局", "中心"):
                if suffix in name:
                    name = name.split(suffix)[0] + suffix
            if '"' in name:
                name = name.split('"')[-1]
            if "中标" in name or "招标" in name:
                name = ""
            return name

        def company_from_patterns(patterns):
            # Cleaned company from the first matching pattern, else None.
            for pattern in patterns:
                fragment = first_group(pattern)
                if fragment is not None:
                    return tidy_company(fragment)
            return None

        def company_from_tables():
            # Company taken from fixed table cells, else None.
            cell_base = ("//div[@class='detail_contect']"
                         "/table/tr[8]/td/table/tr[2]")
            second_cell = response.xpath(cell_base + "/td[2]").extract_first()
            if second_cell is not None:
                name = remove_tags(second_cell)
                if "公司" in name:
                    return name
                # Second cell is not a company: fall back to the first cell.
                first_cell = response.xpath(
                    cell_base + "/td[1]").extract_first()
                return remove_tags(first_cell) if first_cell is not None else ""
            span = response.xpath(
                "//table[@class='MsoNormalTable']/tbody/tr[2]/td[3]/p[@class='MsoNormal']/span[1]"
            ).extract_first()
            if span is not None:
                return remove_tags(span)
            return None

        def tidy_person(fragment):
            # Keep the text before a bidding caption or comma; values longer
            # than three characters are rejected.
            person = squeeze(fragment)
            if "招标" in person:
                person = person.split("招标")[0]
            if "," in person:
                person = person.split(",")[0]
            if len(person) > 3:
                person = ""
            return person

        # Project number
        fragment = first_group(r".*?标段编号\:(.*?)标段")
        registration_num = squeeze(fragment) if fragment is not None else ''

        # Winning company
        com_name = company_from_patterns((
            r".*?中标人\:(.*?公司)",
            r".*?第一中标侯选人\:(.*?公司)",
            r".*?中标单位\:(.*?公司)",
            r".*?中标候选人\:(.*?公司)",
        ))
        if com_name is None:
            com_name = company_from_tables()
        if com_name is None:
            com_name = company_from_patterns((
                r".*?第一中标候选人为\:(.*?公司)",
                r".*?第一中标候选人(.*?公司)",
                r".*?中标单位(.*?公司)",
                r".*?中标人名称(.*?公司)",
                r".*?拟中标人(.*?公司)",
                r".*?第一中标候选单位\:(.*?公司)",
            ))
        if com_name is None:
            com_name = ''

        # Winning price: a value carrying the ten-thousand marker is used as
        # is, anything else is divided by 10000; unparsable text gives 0.
        money = 0
        fragment = first_group(r".*?中标价\:(.*?)元")
        if fragment is not None:
            try:
                price_text = squeeze(fragment)
                if "万" in price_text:
                    money = float(price_text.replace("万", ""))
                else:
                    money = float(price_text) / 10000
            except ValueError:
                money = 0

        # Construction duration; implausibly long values collapse to "0".
        fragment = first_group(r".*?中标工期\:(.*?)天")
        if fragment is None:
            duration = str(0)
        else:
            duration = squeeze(fragment)
            if "日" in duration:
                duration = duration.split("日")[0]
            if len(duration) > 5:
                duration = str(0)

        # Constructor or project lead
        fragment = first_group(r".*?项目经理\:(.*?)备注")
        if fragment is None:
            fragment = first_group(r".*?建造师名称(.*?)等")
        ar_name = tidy_person(fragment) if fragment is not None else ""

        money = str(money)
        ProjectModel().insert_project(
            self.name, com_name, ar_name, pro_name, pro_date, money,
            self.source, original_url, duration, registration_num,
            self.ch_area, self.ch_city, self.ch_region, self.en_area,
            self.en_city, self.en_region, self.crawler_id, self.spider_id,
            self.headers['User-agent'])