def generate_IDF_dic(self, col_name=None):
        if not self.input_sheet:
            self.input_sheet = ToolsBox.read_excel(self.file_name,
                                                   self.sheet_name)
        if col_name is None: col_name = self.col_name
        total_row = len(self.input_sheet)
        count_dict = {}
        # Count how many rows each character appears in
        for item in self.input_sheet:
            new_item = copy.deepcopy(item)  # copy so the source sheet is not mutated
            new_item[col_name] = ToolsBox.clearStr(new_item[col_name])
            for char in set(new_item[col_name]):  # count each char once per row: IDF uses document frequency
                count_dict[char] = count_dict.get(char, 0) + 1

        # Compute the IDF value for each character
        for k, v in count_dict.items():
            count_dict[k] = math.log(total_row / v)

        # Sort by IDF, highest first
        count_dict = dict(
            sorted(count_dict.items(), key=lambda x: x[1], reverse=True))

        return count_dict
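
A minimal standalone sketch of the same computation (the rows here are fabricated stand-ins for the Excel sheet ToolsBox reads):

import math

# Fabricated stand-in for the Excel rows read above.
rows = ["湖滨花园一期", "湖里国联大厦", "湖滨花园二期"]

df = {}
for row in rows:
    for char in set(row):            # document frequency: one count per row
        df[char] = df.get(char, 0) + 1

# IDF = log(N / df); characters appearing in fewer rows score higher.
idf = {ch: math.log(len(rows) / n) for ch, n in df.items()}
print(sorted(idf.items(), key=lambda kv: kv[1], reverse=True))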
Example 2
    def parse_datas(self, soup):
        totalfind = soup.select("h2.total.fl > span")
        if 0 == ToolsBox.strToInt(totalfind[0].get_text()): return '0'
        page_datas = []
        communitys = soup.select("div.info>div.title>a")
        regions = soup.select('a.district')
        blocks = soup.select('a.bizcircle')
        prices = soup.select('.totalPrice>span')
        forsales = soup.select('.totalSellCount>span')
        buildyears = soup.select('.positionInfo')

        for community, region, block, price, forsale, buildyear in zip(
                communitys, regions, blocks, prices, forsales, buildyears):
            each_data = dict()
            each_data['community_name'] = community.get_text()
            each_data['community_url'] = community.get('href')
            each_data['region'] = region.get_text()
            each_data['block'] = block.get_text()
            each_data['builded_year'] = ToolsBox.strToInt(buildyear.get_text())
            each_data['forsale_num'] = ToolsBox.strToInt(forsale.get_text())
            each_data['price'] = ToolsBox.strToInt(price.get_text())
            each_data['from'] = "LJ"
            page_datas.append(each_data)  # each_data is always non-empty here (no pipe() step)

        return page_datas
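
The selector-per-field, zip-them-together pattern used above, reduced to a runnable sketch over fabricated HTML (only bs4 is assumed):

from bs4 import BeautifulSoup

html = """
<div class="info"><div class="title"><a href="/c1">Community A</a></div></div>
<a class="district">Siming</a>
<div class="totalPrice"><span>450</span></div>
"""
soup = BeautifulSoup(html, "html.parser")

# One CSS selector per field, zipped row-wise, exactly as above; note that
# zip() truncates to the shortest list, so unequal selector results drop
# rows silently instead of raising.
names = soup.select("div.info > div.title > a")
regions = soup.select("a.district")
prices = soup.select(".totalPrice > span")
for name, region, price in zip(names, regions, prices):
    print(name.get_text(), region.get_text(), int(price.get_text()))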
Example 3
    def matchid(self, data):
        comm_id = 0  # default: unmatched (also covers the exception path below)
        getid = self.get_id_from_arr(data, self.comm_arr)
        try:
            if len(getid) == 1:  # exactly one community id matched
                comm_id = getid[0][2]
            elif len(getid) == 0:  # no community match; try matching by road instead
                getroad = self.get_id_from_arr(data, self.road_arr)
                if len(getroad) == 1:  # exactly one road matched
                    comm_id = getroad[0][2]
                elif len(getroad) == 0:
                    # road matching failed as well; leave the record unmatched
                    print("---------no match found---------")
                    ToolsBox.printDic(data)
                    comm_id = 0
                else:  # more than one road matched; disambiguate
                    comm_id = self.handle_match_mul(data, getroad)
            else:  # more than one community matched; disambiguate
                comm_id = self.handle_match_mul(data, getid)

        except MySQLdb.Error as e:
            if e.args[0] == 1062:  # duplicate key
                print("already stored")

        return comm_id
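
The cascade itself (unique hit wins, no hit falls back, several hits defer to disambiguation), as an illustrative standalone function with fabricated tuples:

def pick_unique(candidates, fallback):
    # Mirrors matchid: one hit wins outright, none triggers the fallback,
    # several defer to a disambiguation step (None here, handle_match_mul above).
    if len(candidates) == 1:
        return candidates[0][2]
    if not candidates:
        return fallback()
    return None  # ambiguous

print(pick_unique([(0, '湖滨花园', 42)], lambda: 0))  # 42: unique match
print(pick_unique([], lambda: 0))                     # 0: fallback result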
Example 4
    def parse_datas(self, soup):

        page_datas = []

        titles = soup.select("h1 > a")
        infos = soup.select("p.house_info")
        hots = soup.select("p.house_hot")
        areas = soup.select("div.the_area span")
        prices = soup.select("div.the_price span")
        splitby = re.compile(r']|,|\s')

        for title, info, hot, area, price in zip(titles, infos, hots, areas, prices):

            each_data = {'advantage': '', 'builded_year': 0, 'spatial_arrangement': '', 'floor_index': 0,
                         'total_floor': 0}

            each_data['title'] = title.get_text()
            each_data['details_url'] = 'http://xm.maitian.cn' + title.get('href')

            try:
                each_data['total_price'] = ToolsBox.strToInt(price.get_text())
            except Exception as e:
                with open('logtest.txt', 'a+') as fout:
                    fout.write('*************' + str(datetime.datetime.now()) + '*************\n')
                    fout.write('Maitian total_price parse failed; raw text: ' + price.get_text())
                    traceback.print_exc(file=fout)
                    print(traceback.format_exc())

            try:
                each_data['block'] = info.get_text().strip()
                each_data['community_name'] = splitby.split(each_data['block'])[-1].strip()
                each_data['block'] = each_data['block'].replace(each_data['community_name'],'')
            except Exception as e:
                with open('logtest.txt', 'a+') as fout:
                    fout.write('*************' + str(datetime.datetime.now()) + '*************\n')
                    fout.write('Parse failed for: %s \n' % info.get_text())
                    traceback.print_exc(file=fout)
                    print(traceback.format_exc())

            # Maitian packs layout, selling points and floor info into one '|'-separated field
            temp = ToolsBox.clearStr(hot.text).split('|')
            for item in temp:
                d1 = self.parse_item(item)
                each_data = self.add_advantage(d1, each_data)

            # Parse the floor area
            each_data = dict(each_data, **self.parse_item(area.get_text()))

            each_data['from'] = "MT"

            each_data = self.pipe(each_data)

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data)

        return page_datas
Example 5
    def parse_datas(self, soup):
        page_datas = []

        details = soup.select('dd.detail ')
        hrefs = soup.select('span.c_blue0041d9.aVisited.f14B > a')
        comms = soup.select('span.xuzhentian > a')
        prices = soup.select('span > em')

        for detail, href, comm, price in zip(details, hrefs, comms, prices):

            each_data = dict(advantage='',
                             builded_year=0,
                             spatial_arrangement='',
                             floor_index=0,
                             total_floor=0)
            each_data['title'] = href.get_text().strip()
            each_data['community_name'] = comm.get_text().strip()
            each_data['details_url'] = "http://esf.xmhouse.com" + href.get(
                'href')
            each_data['total_price'] = ToolsBox.strToInt(price.get_text())
            h_infos = re.search(r'<span style="margin-left: 5px; color: #000000">.*</span>(.*) <div', str(detail), re.S) \
                .group(1).replace('<br/>', '').replace('\r\n', '').replace(' ', '').split(',')

            for item in h_infos:
                try:
                    d1 = self.parse_item(item)
                    each_data = self.add_advantage(d1, each_data)
                except Exception as e:
                    with open('logtest.txt', 'a+') as fout:
                        fout.write('*************' +
                                   str(datetime.datetime.now()) +
                                   '*************\n')
                        fout.write('      raw items: ')
                        for i1 in h_infos:
                            fout.write(i1 + ',')
                        fout.write('\n      XmParser failed on item: ' +
                                   str(item) + '\n')
                        traceback.print_exc(file=fout)
                        print(traceback.format_exc())

            each_data['from'] = "XMHouse"
            # ToolsBox.printDic(each_data)
            # print('******************{0}******************'.format(i))
            # i += 1

            each_data = self.pipe(each_data)
            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)

        return page_datas
Example 6

    def get_Comprehensive_similar(self, in_str1, in_str2):
        str1 = ToolsBox.clearStr(in_str1).upper()
        str2 = ToolsBox.clearStr(in_str2).upper()
        fuzzSimilar = fuzz.ratio(str1, str2) / 100
        selfSimilar = self.get_similar(str1, str2)
        # Weighted blend of fuzz.ratio and the custom similarity score
        return fuzzSimilar * self.fuzzPercentage + selfSimilar * (
            1 - self.fuzzPercentage)
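
A self-contained sketch of the weighted blend; difflib stands in for fuzz.ratio and a made-up character-overlap score stands in for the project's get_similar():

import difflib

def blended_similarity(a, b, fuzz_weight=0.5):
    # difflib.SequenceMatcher is a stdlib stand-in for fuzz.ratio;
    # the character-overlap ratio below is only an illustrative custom score.
    a, b = a.upper(), b.upper()
    fuzz_like = difflib.SequenceMatcher(None, a, b).ratio()
    custom = len(set(a) & set(b)) / max(len(set(a) | set(b)), 1)
    return fuzz_weight * fuzz_like + (1 - fuzz_weight) * custom

print(blended_similarity("湖滨花园一期", "湖滨花园"))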
Example 7
    def parse_datas(self, soup):

        page_datas = []

        details = soup.select(".houseInfo")
        comms = soup.select(".positionInfo a")
        prices = soup.select(".totalPrice")
        titles = soup.select("div.title a.CLICKDATA")

        for title, detail, price, comm in zip(titles, details, prices, comms):
            each_data = dict(builded_year=0,
                             spatial_arrangement='',
                             floor_index=0,
                             total_floor=0,
                             details_url=title.get('href'),
                             advantage='')
            each_data['title'] = title.get_text().strip()

            houseInfos = re.split(r'\s*[|,\s]\s*',
                                  ToolsBox.clearStr(detail.get_text()))
            each_data['community_name'] = comm.get_text().strip()
            if len(each_data['community_name']) >= 20:
                # debug pause: names this long usually mean the selector grabbed the wrong node
                input(each_data['community_name'] + ':' +
                      str(len(each_data['community_name'])))

            for item in houseInfos:
                d1 = self.parse_item(item)
                each_data = self.add_advantage(d1, each_data)

            each_data['total_price'] = ToolsBox.strToInt(price.get_text())
            each_data['from'] = "Beike"
            each_data = self.pipe(each_data)

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)

        if not page_datas:
            # no listings parsed: return the page's result count (a string) instead
            item_num = soup.select(".fl span")
            if item_num:
                page_datas = item_num[0].get_text().strip()

        return page_datas
Example 8
    def parse_datas(self, soup):

        page_datas = []

        items = soup.select('div.info')
        titles = soup.select('p.title a ')
        comms = soup.select('p.hlistP  a span')
        addresses = soup.select('p.hlistP a.addressChange')
        regions = soup.select('p.hlistP > span')
        mores = soup.select('.moreInfo')
        prices = soup.select('.price')

        for item,title,comm,addr,region,price,more in \
                zip(items,titles,comms,addresses,regions,prices,mores):

            each_data = dict(builded_year=0,
                             spatial_arrangement='',
                             floor_index=0,
                             total_floor=0)

            each_data['title'] = title.get_text()
            each_data['details_url'] = 'http://www.917.com' + title.get('href')

            details = item.select('p')
            for string in details[1].stripped_strings:
                d1 = self.parse_item(string.strip())
                each_data = self.add_advantage(d1, each_data)

            each_data['community_name'] = comm.get_text()
            each_data['community_address'] = addr.get_text()
            each_data['region'] = region.get_text().replace('|', '').replace(
                ' ', '')
            each_data['total_price'] = ToolsBox.strToInt(price.get_text())
            each_data['from'] = "917"

            getP = more.select('p')
            for p in getP:
                if '建筑面积' in p.get_text():
                    d1 = self.parse_item(p.get_text().strip())
                    each_data = self.add_advantage(d1, each_data)

            each_data = self.pipe(each_data)

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)
        return page_datas
Example 9
    def parse_datas(self, soup):
        page_datas = []

        titles = soup.select("div.title > a")
        houseinfo = soup.select("div.houseInfo")
        positionInfo = soup.select("div.positionInfo")
        totalprices = soup.select("div.totalPrice")
        for title, info, position, totalPrice in zip(titles, houseinfo, positionInfo, totalprices):
            each_data = {'builded_year': 0, 'spatial_arrangement': '', 'floor_index': 0, 'total_floor': 0}
            each_data['title'] = title.get_text()
            each_data['details_url'] = title.get('href')
            each_data['total_price'] = ToolsBox.strToInt(totalPrice.get_text())

            info_item = info.get_text().split('|')

            for item in info_item:
                d1 = self.parse_item(item.strip())
                each_data = self.add_advantage(d1, each_data)

            position = position.get_text().replace('\t', '').replace('\n', '').split()
            each_data['community_name'] = position[0].strip()  # since Oct 21 the community name comes first
            each_data['block'] = position[-1]

            if ')' not in position[0]:  # Lianjia villas use the form '4层2008年建'; insert ')' so it splits cleanly
                position[0] = position[0].replace('层', '层)')

            for item in position[0].split(')'):  # Lianjia format change of 2017-04-01
                d1 = self.parse_item(item.strip())
                each_data = self.add_advantage(d1, each_data)

            each_data['from'] = "lianjia"

            each_data = self.pipe(each_data)

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data)

        if not page_datas:
            # no listings parsed: return the page's result count (a string) instead
            total_num = soup.select('.total span')
            if total_num:
                page_datas = total_num[0].get_text().strip()
        return page_datas
Example 10
    def get_comm_arr_fromMysql(self):
        sql = "SELECT id,com_name,alias_com_name FROM `t_base_community` WHERE city_name='厦门市'"
        arr = self.get_list_fromMysql(sql)
        comm_arr = {}
        for item in arr:
            if item['alias_com_name']:
                comms = item['alias_com_name'].split(';')
                for comm in comms:
                    comm = ToolsBox.clearStr(comm)
                    if comm not in comm_arr: comm_arr[comm] = item['id']
            if item['com_name']:
                comm = ToolsBox.clearStr(item['com_name'])
                if comm not in comm_arr: comm_arr[comm] = item['id']
        # Returns a name -> id dict, e.g.
        # {'城南阳翟教师楼': '04b90367549011ebb98a98039b073fcc', '国联大厦': '04bc8a7f549011ebb98a98039b073fcc', ...}
        return comm_arr
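
How the alias expansion populates the lookup dict, shown on fabricated rows in place of the t_base_community query result:

# Fabricated rows standing in for the MySQL result.
rows = [
    {'id': 'a1', 'com_name': '国联大厦', 'alias_com_name': '国联;国联中心'},
    {'id': 'b2', 'com_name': '湖滨花园', 'alias_com_name': None},
]

lookup = {}
for row in rows:
    names = (row['alias_com_name'] or '').split(';') + [row['com_name']]
    for name in filter(None, names):
        lookup.setdefault(name.strip(), row['id'])  # first writer wins, as in the original

print(lookup)  # {'国联': 'a1', '国联中心': 'a1', '国联大厦': 'a1', '湖滨花园': 'b2'}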
Example 11
    def parse_datas(self, soup):

        page_datas = []

        # title = soup.select("title")
        # if len(title) > 0:
        #     print("The page's title is : {0}".format(title[0].get_text()))
        # else:
        #     print("There is no title finded!")

        titles = soup.select(".shop_list > dl h4 a")
        houses = soup.select("p.tel_shop")
        comms = soup.select(".shop_list > dl dd p.add_shop a")
        comm_addresses = soup.select(".shop_list > dl dd p.add_shop span")
        prices = soup.select(".price_right .red b")
        for title, comm, comm_addresse, house, price in zip(
                titles, comms, comm_addresses, houses, prices):
            each_data = dict(builded_year=0,
                             spatial_arrangement='',
                             floor_index=0,
                             total_floor=0,
                             advantage='')

            each_data['title'] = title.get('title')
            each_data['details_url'] = "https://xm.esf.fang.com" + title.get(
                'href')
            for item in house.children:
                if isinstance(item, bs4.element.NavigableString):
                    d1 = self.parse_item(ToolsBox.clearStr(item))
                    each_data = self.add_advantage(d1, each_data)

            each_data['community_name'] = comm.get('title').strip()
            each_data['community_address'] = comm_addresse.get_text().strip()
            each_data['comm_url'] = comm.get('href').strip()
            each_data['total_price'] = ToolsBox.strToInt(price.get_text())
            each_data['from'] = "Soufan"
            each_data = self.pipe(each_data)
            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)

        return page_datas
Example 12
    def parse_test(self, soup):
        adds = soup.select('.property-content-info-comm-address')
        for add in adds:
            add_list = []
            for string in add.strings:
                add_list.append(ToolsBox.clearStr(string.strip()))
            print(add_list)

            print("*" * 50)
Example 13
    def __init__(self):
        try:
            self.conn = ToolsBox.get_database()
        except Exception:
            print("Connect failed")
            raise  # without a connection the cursor below would fail anyway
        self.cur = self.conn.cursor(cursor=pymysql.cursors.DictCursor)
        self.rows = []
        self.where = 'first_acquisition_time <= DATE_SUB(CURDATE(), INTERVAL 3 month )'
Example 14
    def __init__(self):
        try:
            self.db = ToolsBox.get_database()
        except Exception:
            self.db = None  # keep the attribute falsy so the guard below works
            print("Connect failed!")
        if self.db:
            self.cursor = self.db.cursor(cursor=pymysql.cursors.DictCursor)
            self.comm_arr = self.get_comm_arr_fromMysql(0)
            self.road_arr = self.get_comm_arr_fromMysql(1)
Example 15
    def parse_datas(self, soup):

        page_datas = []

        titles = soup.select("h2.fix a")
        houses = soup.select('p.moudle')
        houses1 = soup.select('td.sm222 p.msg')
        prices = soup.select('div.percent b')
        for title, detail, detail1, price in zip(titles, houses, houses1, prices):
            each_data = dict(advantage='', builded_year=0, spatial_arrangement='', floor_index=0, total_floor=0)
            each_data['title'] = title.get_text()
            each_data['details_url'] = 'https://danxia.com' + title.get('href')

            each_data['community_name'] = detail.select('a')[0].get_text()
            temp = detail.select('span')
            for item in temp:
                d1 = self.parse_item(item.get_text())
                each_data = self.add_advantage(d1, each_data)

            temp1 = detail1.select('span')
            for item in temp1:
                d1 = self.parse_item(item.get_text())
                each_data = self.add_advantage(d1, each_data)

            each_data['total_price'] = ToolsBox.strToInt(price.get_text())

            each_data['from'] = "Danxia"

            each_data = self.pipe(each_data)  # 2016-06-04: dedicated post-processing step

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data)

        return page_datas
Example 16
    def __init__(self):
        self.raw_datas = []                     # raw scraped rows
        self.datasWithoutClear = []
        self.dupli_count = 0                    # counter for duplicate rows
        self.now = datetime.date.today()        # insertion date for new records

        self.key_infos = bloomfilter.BloomFilter(0.001, 1000000)     # learning to use a Bloom filter for dedupe

        try:
            self.conn = ToolsBox.get_database()
        except Exception:
            print("Connect failed during init")
        self.cur = self.conn.cursor(cursor=pymysql.cursors.DictCursor)            # rows come back as dicts
Example 17
    def match_list2comm(self):
        # fetch distinct community names from the listings table
        ListRecords = ML.get_list_fromMysql(
            "SELECT distinct community_name FROM `ods_hse_detail`")
        for item in ListRecords:
            item['clear_name'] = ToolsBox.clearStr(item['community_name'])

        result = []  # for each master-table community name, its best match among the listings
        for key, value in self.comm_arr.items():
            name_dic = dict()
            name_dic['comm_name'] = key
            name_dic['comm_id'] = value
            name_dic['vol'] = 0
            name_dic['match_list_comm_name'] = ''
            name_dic['match_all'] = ''  # every listing name with similarity >= 0.8
            for item in ListRecords:
                vol = MyVote.cmntVol(key, item['clear_name'])
                if vol > name_dic['vol']:
                    name_dic['vol'] = vol
                    name_dic['match_list_comm_name'] = item['community_name']
                if vol >= 0.8:
                    name_dic['match_all'] = name_dic['match_all'] + item[
                        'community_name'] + '(' + '%f' % vol + ');'
            result.append(name_dic)

        for item in ListRecords:
            item['matchid'] = '0'
            item['match_vol'] = 0
            for key, value in self.comm_arr.items():
                vol = MyVote.cmntVol(key, item['clear_name'])
                if vol > item['match_vol']:
                    item['match_vol'] = vol
                    item['matchid'] = value
                    item['match_comm_name'] = key
        ToolsBox.saveExcel('match.xlsx', result, "Sheet1")
        ToolsBox.saveExcel('match.xlsx', ListRecords, "Sheet2")
Example 18
    def CommsController(self, url):
        self.craw_controller(url)
        while self.comms.has_new_url():
            comm = self.comms.get_new_url
            comm = ToolsBox.clear_comm(comm)
            c1, c2 = self.comms.get_quantity()
            comm_url = 'https://xm.ke.com/ershoufang/pg1rs' + parse.quote(comm) + '/'
            print('*******{0}/{1}:{2}*********'.format(self.comm_count, c1 + c2, comm))
            url_list = []
            url_list.append(comm_url)
            self.craw_controller(url_list)
            self.comm_count += 1

        self.total = self.total + self.outputer.out_mysql()
        print('================== {0} records crawled in total =================='.format(self.total))
Example 19
    def CommsController(self, url):
        self.craw_controller(url)
        while self.comms.has_new_url():
            comm = self.comms.get_new_url
            comm = ToolsBox.clear_comm(comm)
            c1, c2 = self.comms.get_quantity()
            comm_url = 'https://xm.anjuke.com/sale/p1-rd1/?kw=' + parse.quote(
                comm) + '&from_url=kw_final#filtersort'
            print('*******{0}/{1}:{2}*********'.format(self.comm_count,
                                                       c1 + c2, comm))
            url_list = []
            url_list.append(comm_url)
            self.craw_controller(url_list)
            self.comm_count += 1

        self.total = self.total + self.outputer.out_mysql()
        print('================== {0} records crawled in total =================='.format(
            self.total))
Example 20
    def CommsController(self, url):
        self.craw_controller(url)

        while self.comms.has_new_url():
            comm = self.comms.get_new_url
            comm = ToolsBox.clear_comm(comm)
            c1, c2 = self.comms.get_quantity()
            # comm_url = "http://xm.58.com/ershoufang/?key=" + (comm)
            comm_url = "https://xm.58.com/ershoufang/?key=" + quote(
                comm, safe='/:?=')
            print('*******{0}/{1}:{2}*********'.format(self.comm_count,
                                                       c1 + c2, comm))
            url_list = []
            url_list.append(comm_url)
            self.craw_controller(url_list)
            self.comm_count += 1

        self.total = self.total + self.outputer.out_mysql()
        print('================== {0} records crawled in total =================='.format(
            self.total))
Example 21
    def parse_datas(self, soup):
        totalfind = soup.select("span.tit em")
        if 0 == ToolsBox.strToInt(totalfind[1].get_text()): return '0'
        page_datas = []
        communitys = soup.select("h3 > a")
        adds = soup.select('.li-info>address')
        dates = soup.select('p.date')
        prices = soup.select('p>strong')
        forsales = soup.select('p.bot-tag>span>a')
        for community, add, date, price, forsale in zip(
                communitys, adds, dates, prices, forsales):
            each_data = dict()
            each_data['community_name'] = community.get('title')
            each_data['community_url'] = community.get('href')
            add1 = ToolsBox.clearStr(add.get_text())
            addlist = add1.split(']')
            if len(addlist) > 1:
                regionlist = addlist[0].replace('[', '').split('-')
                if len(regionlist) > 1:
                    each_data['region'], each_data['block'] = regionlist
                else:
                    each_data['region'] = regionlist[0]  # single element: unwrap the list
                each_data['address'] = addlist[1]
            else:
                each_data['address'] = add1
            each_data['builded_year'] = ToolsBox.strToInt(date.get_text())
            each_data['forsale_num'] = ToolsBox.strToInt(forsale.get_text())
            each_data['price'] = ToolsBox.strToInt(price.get_text())
            each_data['from'] = "AJK"

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)
        return page_datas
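
The bracketed '[region-block] address' format this parser splits, run standalone (the sample string is fabricated):

add1 = '[思明-莲前]前埔南区'          # fabricated sample in the AJK address format
addlist = add1.split(']')
region_block = addlist[0].replace('[', '').split('-')
print(region_block, addlist[1])      # ['思明', '莲前'] 前埔南区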
Example 22
    def handle_match_mul(self, data, getid):
        """
        处理 getid 中匹配成功不止一个id
        处理的原则是以起始字段在前的为准,
        如果起始字段相同,则以字符串长的为准
        如果起始与字串长度都一样,则人工判断
        """
        flag = False  #标志位,如果能解析出唯一id,则标志位设成ture

        getid.sort(key=lambda x: x[0])  #按照匹配关键字的起始位置排序

        # print('getid ofter sorted:%s'%getid)

        ##########以下是2010/10/21重写的##############
        result = []
        result.append(getid[0])  #先放第一个
        for l in range(1, len(getid)):  #循环比较getid里的每个元素
            if (getid[l][0] >
                    result[0][0]):  # 如果第二个匹配到的关键字起始位置大于第一个,就以第一个为准,不用再匹配了
                break
            else:  #如果有并列第一:
                if len(getid[l][1]) > len(result[0][1]):  #字符串长的优先
                    # 只要发现了比原来存的字串还长的小区,就把原来的清空,标志成无重复
                    result = []
                    result.append(getid[l])
                    flag = False
                    # result[0] = getid[l]
                elif len(getid[l][1]) == len(result[0][1]):  #字符串长度相同的
                    if getid[l][2] != result[0][2]:
                        # 起始位置相同,关键字长度也相同,如果小区id也相同,那就不处理了。
                        # 否则把标志位设成ture,要人工判断一下
                        result.append(getid[l])
                        flag = True


        if flag:
            print('********* multiple community ids matched **********')
            ToolsBox.printDic(data)
            print('|||||||||||||||| multiple communities ||||||||||||||||')
            ToolsBox.printDic(result)
            return len(result)  # ambiguous: returns the candidate count, not an id
        else:
            return result[0][2]
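
The earliest-start / longest-name tie-break from above, replayed standalone on fabricated (start_pos, matched_name, community_id) tuples:

matches = [(0, '湖滨', 'a1'), (0, '湖滨花园', 'b2'), (3, '花园', 'c3')]
matches.sort(key=lambda m: m[0])

best = [matches[0]]
for m in matches[1:]:
    if m[0] > best[0][0]:
        break                      # a later start can't win
    if len(m[1]) > len(best[0][1]):
        best = [m]                 # a longer name takes over outright
    elif len(m[1]) == len(best[0][1]) and m[2] != best[0][2]:
        best.append(m)             # genuine tie: flag for manual review

print(best)  # [(0, '湖滨花园', 'b2')] -> unique winner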
Example 23

    def IDF_dic_output_excel(self, out_file, sheet_name="Sheet1"):
        if not self.IDFDic: self.IDFDic = self.generate_IDF_dic()
        ToolsBox.dic2Excel(out_file, self.IDFDic)
Example 24
    def pipe(self, datadic):
        # Validity checks; also fold the region, block and community address into the title
        for key in datadic:
            datadic[key] = ToolsBox.clearStr(datadic[key])

        title_temp = ''
        if 'region' in datadic.keys():
            if self.excep(datadic['region'].strip()):
                return False
            else:
                title_temp += ' r:' + datadic['region'].strip()

        if 'block' in datadic.keys():
            if self.excep(datadic['block'].strip()):
                return False
            else:
                title_temp += ' b:' + datadic['block'].strip()
        if 'community_address' in datadic.keys():
            datadic['community_address'] = datadic['community_address'].strip()
            title_temp += ' a:' + datadic['community_address'].strip()
        if 'title' in datadic.keys():
            title2 = title_temp.strip() + ' ' + datadic['title']
        else:
            title2 = title_temp.strip()
        if len(title2) > 50:
            title2 = title2[:50]
        datadic['title'] = title2.strip()

        if ('community_name'
                not in datadic.keys()) or len(datadic['community_name']) < 2:
            return False

        datadic['community_id'] = self.MI.matchid(datadic)
        if ('total_floor' in datadic.keys()) and (
                'total_price' in datadic.keys()) and ('area'
                                                      in datadic.keys()):
            if datadic['total_price'] is None or datadic[
                    'area'] is None or datadic['area'] == 0:
                return False
            else:
                datadic['price'] = round(
                    float(datadic['total_price'] * 10000 / datadic['area']), 2)
            if datadic['price'] < 1500 or datadic['price'] > 300000:
                return False

            if datadic['total_floor'] > 60:
                datadic['total_floor'] = 35  # cap implausibly tall buildings at 35 floors
            if datadic['total_price'] == 0: return False  # 2016-09-13: drop zero-priced rows

            if 'builded_year' in datadic.keys():
                if datadic['builded_year'] < 1900: datadic['builded_year'] = 0

            if datadic['area'] > 20000:
                return False  # oversized areas are usually typos and useless as price references
            if 'price' not in datadic.keys():
                return False  # 2016-08-01: parsing occasionally skips price; drop such rows

            # 2017-04-14: details_url can exceed the column width; truncate
            if len(datadic['details_url']) > 250:
                datadic['details_url'] = datadic['details_url'][:249]
            if 'advantage' in datadic.keys():
                if len(datadic['advantage']) > 20:
                    datadic['advantage'] = datadic['advantage'][:20]
            return datadic
        else:
            if not ('total_floor' in datadic.keys()) and (
                    'total_price' in datadic.keys()) and (
                        'area' in datadic.keys()) and ('community_name'
                                                       in datadic.keys()):
                if u"别墅" in datadic['title']:
                    if datadic['total_price'] is None or datadic[
                            'area'] is None or datadic['area'] == 0:
                        return False
                    else:
                        datadic['price'] = round(
                            float(datadic['total_price'] * 10000 /
                                  datadic['area']), 2)
                    datadic['total_floor'] = 4
                    datadic['floor_index'] = 1
                    datadic['spatial_arrangement'] = datadic[
                        'spatial_arrangement'] + u"别墅" if 'spatial_arrangement' in datadic.keys(
                        ) else u"别墅"
                    return datadic
            return False
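
The unit-price rule pipe() enforces, extracted into a standalone sketch (total prices are quoted in 万元, i.e. units of 10,000 CNY):

def unit_price(total_price_wan, area_m2):
    # Derive price per square meter from a total price in 万元;
    # pipe() rejects rows falling outside the plausibility band below.
    if not total_price_wan or not area_m2:
        return None
    price = round(total_price_wan * 10000 / area_m2, 2)
    return price if 1500 <= price <= 300000 else None

print(unit_price(450, 89))   # 50561.8
print(unit_price(450, 0))    # None (bad area)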
Example 25
    def parse_item(self, string):
        # Added 2016-08-17: classify a string as area / layout / unit price / floor /
        # build year / selling point via regexes and return it as a key-value dict
        try:
            string = string.decode('utf8').strip()  # Python 2 byte strings
        except (UnicodeDecodeError, AttributeError):
            string = string.strip()  # already text (Python 3)

        parse_dict = {}

        r1_1 = r'(\d+)平方米'
        r1_2 = r'(\d+.?\d+)平米'  # xmhouse areas are floats
        r1_3 = r'(\d+.?\d+)㎡'  # 2016-09-13: Maitian area format
        r1_4 = r'(\d+.?\d+)m²'  # 2017-03-08: Anjuke
        r1_5 = r'(\d+.?\d+)�O'  # 2018-08-03: Soufang; this mojibake is ㎡ on GBK pages
        r1_6 = r'(\d+.?\d+)平'  # bare 平 suffix
        r2_1 = r'\d+室'
        r2_2 = r'\d+房'
        r3_1 = r'(\d+)元/'
        r3_2 = r'(\d+)万'
        r4 = r'\d+层'
        r4_1 = r'((?P<floor>[高中低])楼?层)?.?共?(?P<total>\d+)层'
        r5_1 = r'(\d{4})年'
        r5_2 = r'年.*(\d{4})'

        if re.search(r1_1, string, flags=0)\
            or re.search(r1_2, string, flags=0)\
            or re.search(r1_3, string, flags=0)\
            or re.search(r1_4, string, flags=0)\
            or re.search(r1_5, string, flags=0)\
            or re.search(r1_6, string, flags=0)\
            or re.search(r2_1, string, flags=0)\
            or re.search(r2_2, string, flags=0)\
            or re.search(r3_1, string, flags=0)\
            or re.search(r3_2, string, flags=0)\
            or re.search(r4, string, flags=0)\
            or re.search(r5_1, string, flags=0)\
            or re.search(r5_2, string, flags=0):

            if re.search(r1_1, string, flags=0):
                parse_dict['area'] = int(re.search(r1_1, string).groups(0)[0])
            elif re.search(r1_2, string, flags=0):
                parse_dict['area'] = int(
                    round(float(re.search(r1_2, string).groups(0)[0]), 0))
            elif re.search(r1_3, string, flags=0):  # Maitian (2016-09-13)
                parse_dict['area'] = int(
                    round(float(re.search(r1_3, string).groups(0)[0]), 0))
            elif re.search(r1_4, string, flags=0):  # Anjuke (2017-03-08)
                parse_dict['area'] = int(
                    round(float(re.search(r1_4, string).groups(0)[0]), 0))
            elif re.search(r1_5, string, flags=0):  # Soufang (2018-08-03)
                parse_dict['area'] = int(
                    round(float(re.search(r1_5, string).groups(0)[0]), 0))
            elif re.search(r1_6, string, flags=0):  # Leju (2019-09-09)
                parse_dict['area'] = int(
                    round(float(re.search(r1_6, string).groups(0)[0]), 0))
            else:
                pass

            if re.search(r4_1, string, flags=0):
                parse_dict['floor_index'], parse_dict[
                    'total_floor'] = self.parse_floor(string, r4_1)
                # strip the floor fragment so the layout check below sees only the rest
                string = ToolsBox.clearStr(
                    re.sub(r4_1, "", string, count=0, flags=0))

            if re.search(r2_1, string, flags=0) \
                    or re.search(r2_2, string, flags=0):
                parse_dict['spatial_arrangement'] = string.strip()

            if re.search(r5_1, string, flags=0):
                parse_dict['builded_year'] = int(
                    re.search(r5_1, string).groups(0)[0])
            elif re.search(r5_2, string, flags=0):
                parse_dict['builded_year'] = int(
                    re.search(r5_2, string).groups(0)[0])
            else:
                pass
        else:
            if string in ('|', '｜'):  # ASCII and fullwidth separators
                pass
            elif string == '':
                pass
            else:
                parse_dict['advantage'] = string.strip()
        return parse_dict
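
The floor pattern r4_1 is the trickiest regex here; a standalone run shows what its named groups capture (the sample strings are fabricated):

import re

# Same pattern parse_item uses: group 'floor' is the coarse position
# (高/中/低) and group 'total' is the building height.
floor_re = r'((?P<floor>[高中低])楼?层)?.?共?(?P<total>\d+)层'

for text in ['中层(共28层)', '高楼层 共33层', '共7层']:
    m = re.search(floor_re, text)
    print(text, '->', m.group('floor'), m.group('total'))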
Example 26
    def parse_datas(self, soup):

        page_datas = []

        # 2019-09-09: selectors for Leju's redesigned layout
        titles = soup.select("div.title_in")
        d_urls = soup.select("div.title_in > a")
        adds = soup.select("div.address")
        infos = soup.select("div.house_info")
        prices = soup.select("div.price > span")
        for title, add, d_url, info, price in zip(titles, adds, d_urls, infos,
                                                  prices):
            each_data = dict(builded_year=0,
                             spatial_arrangement='',
                             floor_index=0,
                             total_floor=0,
                             title=title.get('title'))
            comms = add.select('span')
            each_data['community_name'] = ToolsBox.clearStr(
                comms[0].get_text())
            for comm in comms:
                comm = ToolsBox.clearStr(comm.get_text())
                if '-' != comm:
                    if '-' in comm:
                        c_item = comm.split('-')
                        each_data['region'] = c_item[0]
                        each_data['block'] = c_item[1]
                    if '年' in comm:
                        out = self.parse_item(comm)
                        each_data = self.add_advantage(out, each_data)
            h_info = info.select('span')
            for item in h_info:
                item = ToolsBox.clearStr(item.get_text())
                each_data = self.add_advantage(self.parse_item(item),
                                               each_data)
            each_data['details_url'] = 'https:' + d_url.get('href')
            each_data['total_price'] = ToolsBox.strToInt(price.get_text())

            each_data['from'] = "lejv"
            each_data = self.pipe(each_data)

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)

        return page_datas
Example 27
            each_data['from'] = "lejv"
            each_data = self.pipe(each_data)

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)

        return page_datas


if __name__ == "__main__":
    downloader = Downloader.Downloader()
    parser = LejvPage()
    url = 'https://xm.esf.leju.com/house'
    headers = {
        "Host": "xm.esf.leju.com",
        "Referer": "http://xm.esf.leju.com/house/",
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/7.0)',
    }

    html_cont, code = downloader.download(url, headers=headers)
    urls, datas = parser.page_parse(html_cont)
    ToolsBox.priList(datas)
Example 28
            each_data['total_price'] = ToolsBox.strToInt(price.get_text())

            each_data['from'] = "Danxia"

            each_data = self.pipe(each_data)  # 2016-06-04: dedicated post-processing step

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data): page_datas.append(each_data)

        return page_datas


if __name__ == "__main__":
    downloader = Downloader.Downloader()
    parser = DanxiaPage()
    url = 'https://danxia.com/house/all/PG2'
    headers = {
        "Referer": "https://danxia.com/house/all",
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    html_cont, code = downloader.download(url, headers=headers)
    urls, datas = parser.page_parse(html_cont)
    for data in datas:
        print('='*50)
        ToolsBox.printDic(data)
Example 29
                    d1 = self.parse_item(p.get_text().strip())
                    each_data = self.add_advantage(d1, each_data)

            each_data = self.pipe(each_data)

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)
        return page_datas


if __name__ == "__main__":
    downloader = Downloader.Downloader()
    parser = www917Page()
    url = 'https://www.917.com/sell/pn10/'
    headers = {
        "Host": "www.917.com",
        "Referer": "http://www.917.com/",
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/7.0)',
    }
    html_cont, code = downloader.download(url, headers=headers)

    urls, datas = parser.page_parse(html_cont)

    ToolsBox.priList(urls)
Example 30
    def parse_page(self, soup):
        page_datas = []
        details_urls = soup.select(".property>a")
        titles = soup.select("h3.property-content-title-name")
        houses = soup.select("div.property-content-detail>section")
        comms = soup.select('.property-content-info-comm-name')
        adds = soup.select('.property-content-info-comm-address')
        prices = soup.select('.property-price-total-num')
        for details_url, title, details, comm, price, add in zip(
                details_urls, titles, houses, comms, prices, adds):
            each_data = dict(advantage='',
                             builded_year=0,
                             spatial_arrangement='',
                             floor_index=0,
                             total_floor=0)
            each_data['title'] = title.get_text()
            each_data['details_url'] = details_url.get('href')

            houses = details.select(".property-content-info")
            detail = houses[0].select("p")
            for string in detail:
                d1 = self.parse_item(
                    ToolsBox.clearStr(string.get_text().strip()))
                each_data = self.add_advantage(d1, each_data)

            each_data['community_name'] = comm.get_text().strip()

            add_list = []
            for string in add.strings:
                add_list.append(ToolsBox.clearStr(string.strip()))
            try:
                each_data['region'], each_data['block'], each_data[
                    'community_address'] = add_list
            except Exception as e:
                with open('logtest.txt', 'a+') as fout:
                    fout.write('*************' + str(datetime.datetime.now()) +
                               '*************\n')
                    fout.write('AJK failed to parse region/block/address; raw data: ')
                    traceback.print_exc(file=fout)
                    print(traceback.format_exc())
            each_data['total_price'] = ToolsBox.strToInt(price.get_text())
            each_data['from'] = "AJK"
            each_data = self.pipe(each_data)  # 2016-06-04: dedicated post-processing step

            if each_data:
                page_datas.append(each_data)
            else:
                if ToolsBox.ShowInvalideData(each_data):
                    page_datas.append(each_data)
        return page_datas