def listingMonthlyStatistics(listingId,statDate): db = dbSettings.db_connect() cursor = db.cursor() start = time.time() cursor.execute(sql_listingMonthlyStatistics(listingId,statDate)) item = cursor.fetchall()[0] monthlyPurchasePrice = item['monthlyPurchasePrice'] monthlySumPrice = item['monthlySumPrice'] rentRate = item['rentRate'] if monthlyPurchasePrice == None : monthlySumPrice = 0 rentRate = 0 monthlyPurchasePrice = 0 cursor.execute(sql_listingPriceMonthlyStatistics(listingId,statDate)) item = cursor.fetchall()[0] print(item) monthlyAvgPrice = item['monthlyAvgPrice'] if not item['monthlyAvgPrice'] == None else 0 end = time.time() # print(1000*(end-start),"sql_listingMonthlyStatistics") sm.release() sql = """ INSERT INTO `airbnb_scrapy_us`.`monthlystat_us` (`dimension`,`listingId`,`year`,`month`,`monthlyAvgPrice`,`monthlyPurchasePrice`,`monthlySumPrice`,`rentRate`,`repeat_flag`) VALUES ('listing','{}','{}','{}' ,'{}' ,'{}' ,'{}' ,'{}' ,'{}') """.format(str(listingId),statDate[2:4],statDate[5:7],monthlyAvgPrice,monthlyPurchasePrice,monthlySumPrice,rentRate,statDate[2:4]+"-"+statDate[5:7]+"-"+str(listingId)) # print(sql) cursor.execute(sql) db.commit() return {'listingId':listingId,'monthlyAvgPrice':monthlyAvgPrice,'monthlySumPrice':monthlySumPrice,'rentRate':rentRate}
def __init__(self): self.table = "`proxypool`" self.db = dbSettings.db_connect() self.cursor = self.db.cursor() self.mapTable = "`map`" self.listTable = "`houselist`" self.mapresponseTable = "`mapresponse`"
def parseStart(bias): # parse = calendarParse() # parse.getItem(bias*10) # sm.release() # return try: parse = calendarParse() parse.getItem(bias * 10) except Exception as e: print(e) try: parse = calendarParse() parse.getItem(bias * 10) except: try: parse = calendarParse() parse.getItem(bias * 10) except Exception as e: print(e) print("err in {}".format(bias)) pass sm.release() db = dbSettings.db_connect() cursor = db.cursor() sql = "INSERT INTO `calendarparselog` ( `type`, `infor`)\ VALUES ('{}','{}');".format("start parse", bias * 10) print(sql) cursor.execute(sql) db.commit()
def detailParse(): if __name__ == "__main__": # f_in = open( 'src.json', 'r',encoding = 'utf-8' ) # f_out = open( 'tgt.json', 'w',encoding = 'utf-8' ) # jsonData = json.loads(f_in.read()) db = dbSettings.db_connect() cursor = db.cursor() startResponseId = 0 endResponseId = getMaxNumOfDetailResponse(db,cursor) landmark = set() amenity = set() for responseId in range(startResponseId, endResponseId+1,1000): landmark,amenity = detailParse(responseId,landmark,amenity) decode = decodeDetail() meta = decode.decode(jsonData) landmark |= set(meta['landmark']) amenity |= set(meta['amenity']) pprint(landmark,amenity)
def batchGeoEncode(bias, existLocation): db = dbSettings.db_connect() cursor = db.cursor() cursor.execute( "SELECT id,listingid,lat,lng FROM `detail` where id between {} and {}". format(bias, bias + 100)) print( "SELECT id,listingid,lat,lng FROM `detail` where id between {} and {}". format(bias, bias + 100)) results = cursor.fetchall() sql = "INSERT INTO `listing_location`(`listingid`, `lng`, `lat`, `formatted_address`, `province`, `city`, `district`, `township`)\ VALUE (%s,%s,%s,%s,%s,%s,%s,%s)" vals = [] count = 0 for row in results: # count += 1 # print(count) if row["listingid"] in existLocation: continue print(row["lng"], ",", row["lat"]) addressComponent = decodeLocation(row["lng"], row["lat"]) vals.append( (row["listingid"], row["lng"], row["lat"], addressComponent["formatted_address"], addressComponent["province"], addressComponent["city"], addressComponent["district"], addressComponent["township"])) # print(row["listingid"],row["lng"],row["lat"],addressComponent["formatted_address"],addressComponent["province"],addressComponent["city"],addressComponent["district"],addressComponent["township"]) cursor.executemany(sql, vals) db.commit()
def dbInsert(listingId, Lng, Lat, locationInfor): db = dbSettings.db_connect() cursor = db.cursor() if locationInfor == None: sql = ''' INSERT INTO `airbnb_scrapy_us`.`listing_location_us` (`listingid`,`lng`,`lat`,`country`) VALUES ('{}',{},{},'unknown') '''.format(listingId, Lng, Lat) else: sql = ''' INSERT INTO `airbnb_scrapy_us`.`listing_location_us` (`listingid`,`lng`,`lat`,`country`,`state`,`county`,`postcode`,`town`,`formatted_address`) VALUES ('{}',{},{},'{}','{}','{}','{}','{}','{}'); '''.format(listingId, Lng, Lat, locationInfor['country'], locationInfor['state'], locationInfor['county'], locationInfor['postcode'], locationInfor['town'], locationInfor['display_name']) # print(sql) try: cursor.execute(sql) db.commit() except Exception as e: print(e.args) print(sql)
def __init__(self): self.db = dbSettings.db_connect() self.cursor = self.db.cursor() self.listTable = "`houselist`" self.calendarResponseTable = "`calendarresponse`" self.redis = redis.Redis.from_url(REDIS_URL) print(REDIS_URL)
def __init__(self): self.db = dbSettings.db_connect() self.cursor = self.db.cursor() self.listTable = "`houselist`" self.detailTable = "`detail`" self.redis = redis.Redis.from_url(REDIS_URL) print(REDIS_URL)
def __init__(self): self.table = "`proxypool`" self.db = dbSettings.db_connect() self.cursor = self.db.cursor() self.redis = redis.Redis.from_url(REDIS_URL) self.mapTable = "`map`" self.listTable = "`houselist`" self.mapresponseTable = "`mapresponse`"
def __init__(self): self.proxyId = 0 self.ip = "" self.table = "`proxypool_us`" # print("middleline-proxyPool1") self.db = dbSettings.db_connect() # print("middline-proxyPool2") self.cursor = self.db.cursor()
def __init__(self): self.db = dbSettings.db_connect() self.cursor = self.db.cursor() self.listTable = "`houselist`" self.responseTable = "calendarresponse" self.responseLogTable = "calendarparselog" self.redis = redis.Redis.from_url(REDIS_URL) print(time.asctime(time.localtime(time.time())))
def getListingId(): db = dbSettings.db_connect() cursor = db.cursor() sql = "SELECT house_id FROM `houselist_us` ORDER BY RAND() LIMIT 30000" cursor.execute(sql) db.commit() result = cursor.fetchall() return [i['house_id'] for i in result]
def dbInsertparselog(type, responseId, infor): # 数据库链接 db = dbSettings.db_connect() cursor = db.cursor() sql = "INSERT IGNORE INTO `calendarparselog` (`type`, `response_id`, `infor`)\ VALUES ('{}',{},'{}');".format(type, responseId, infor) print(sql) cursor.execute(sql) db.commit()
def getIp(): db = dbSettings.db_connect() cursor = db.cursor() while(1): print("test ip pool") time.sleep(0.5) sql = "SELECT * from `proxypool` WHERE `state` != 'del'" cursor.execute(sql) db.commit() results = cursor.fetchall() if(len(results) < 10): proxypool = proxyPool() proxies = proxypool.get(num=10)
def getIp(): db = dbSettings.db_connect() cursor = db.cursor() timeNow = datetime.datetime.now() while (1): print("test ip pool") time.sleep(1) sql = "SELECT * from `proxypool_us` WHERE `state` != 'del'" cursor.execute(sql) db.commit() results = cursor.fetchall() if (len(results) < 10): proxypool = proxyPool() proxies = proxypool.get(num=10) for row in results: deltaTime = (timeNow - row['init_time']).seconds
def __init__(self): self.table = "`proxypool`" self.db = dbSettings.db_connect() self.cursor = self.db.cursor() self.mapTable = "`map`" self.listTable = "`houselist`" self.mapresponseTable = "`mapresponse`" self.calendarresponseTable = "`calendarresponse`" localtime = time.localtime(time.time()) self.mouth = localtime[1] self.year = localtime[0] self.day = localtime[2] self.dtToday = "{}-{}-{}".format(self.year, self.mouth, self.day) self.orderList = [] self.priceList = [] self.house_id = 0
def dbInsert(listingId, Lng, Lat, locationInfor): db = dbSettings.db_connect() cursor = db.cursor() if locationInfor == None: sql = ''' INSERT INTO `airbnb_scrapy_us`.`listing_location_us` (`listingid`,`lng`,`lat`,`country`) VALUES ('{}',{},{},'abroad') '''.format(listingId, Lng, Lat) else: sql = ''' INSERT INTO `airbnb_scrapy_us`.`listing_location_us` (`listingid`,`lng`,`lat`,`country`,`state`,`city`,`MSACode`,`MSAName`) VALUES ('{}',{},{},'{}','{}','{}','{}','{}'); '''.format(listingId, Lng, Lat, "US", locationInfor['sStateName'], locationInfor['sCountyName'], locationInfor['sMSACode'], locationInfor['sMSAName']) cursor.execute(sql) db.commit()
def areaMonthlyStatistics(areaRow,statDate,dimension): db = dbSettings.db_connect() cursor = db.cursor() # print(sql_areaMonthlyStatistics(areaRow,statDate)) cursor.execute(sql_areaMonthlyStatistics(areaRow,statDate,dimension)) print(areaRow) result = cursor.fetchall()[0] print(result) if result['monthlySumPrice'] == None: return statResult = result cursor.execute(sql_areaFeatureStatistics(areaRow,statDate,dimension)) print(areaRow) result = cursor.fetchall()[0] FeatureResult = {} rentalType = {"Entire Home":0,"Private Room":0,"Shared Room":0} reviewSummary = {"reviewCount":0,"reviewSummary_wei_zhi_bian_li":0,"reviewSummary_ru_zhu_bian_jie":0,"reviewSummary_ru_shi_miao_shu":0,"reviewSummary_gan_jing_wei_sheng":0,"reviewSummary_gou_tong_shun_chang":0,"reviewSummary_gao_xing_jia_bi":0} for row in result : # rentalType if "整套" in row["propertyType"]: rentalType["Entire Home"] += 1 if "独立" in row["propertyType"]: rentalType["Private Room"] += 1 if "合住" in row["propertyType"]: rentalType["Shared Room"] += 1 # reviewSummary for item in ["reviewSummary_wei_zhi_bian_li","reviewSummary_ru_zhu_bian_jie","reviewSummary_ru_shi_miao_shu","reviewSummary_gan_jing_wei_sheng","reviewSummary_gou_tong_shun_chang","reviewSummary_gao_xing_jia_bi"]: reviewSummary[item] += row[item] reviewSummary["reviewCount"] += 1 for item in ["reviewSummary_wei_zhi_bian_li","reviewSummary_ru_zhu_bian_jie","reviewSummary_ru_shi_miao_shu","reviewSummary_gan_jing_wei_sheng","reviewSummary_gou_tong_shun_chang","reviewSummary_gao_xing_jia_bi"]: reviewSummary[item] = reviewSummary[item]/reviewSummary["reviewCount"] sql = "INSERT INTO `airbnb_scrapy_us`.`monthlystat_us` (`dimension`,`province` , `city`, `district`,`year`,`month`,`monthlyAvgPrice`,"+\ "`monthlySumPrice`,`rentRate`,`repeat_flag`,`monthlyPurchasePrice`) VALUES ('{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}')".format(dimension,areaRow['province'],areaRow['city'],areaRow['district'],statDate[2:4],statDate[5:7],result['monthlyAvgPrice'],result['monthlySumPrice'],result['rentRate'],dimension+"-"+statDate[2:4]+"-"+statDate[5:7]+"-"+areaRow['province']+areaRow['city']+areaRow['district'],result['monthlyPurchasePrice']) cursor.execute(sql) db.commit()
def getIp(): db = dbSettings.db_connect() cursor = db.cursor() timeNow = datetime.datetime.now() proxypool = proxyPool() while (1): time.sleep(1) sql = "SELECT * from `proxypool` WHERE `state` != 'del'" cursor.execute(sql) db.commit() results = cursor.fetchall() n_ip_kuaidaili = proxypool.checkBalance() if n_ip_kuaidaili == 0: SMTP("*****@*****.**", "发信内容", "快代理ip数耗尽警告", "DaduosuMonitor") time.sleep(180) print("test ip pool ,n_ip_proxypool: {} , n_ip_kuaidaili: {}".format( len(results), n_ip_kuaidaili)) if (len(results) < 5): proxies = proxypool.get(num=3) for row in results: deltaTime = (timeNow - row['init_time']).seconds
def __init__(self): self.table = "`proxypool_us`" self.db = dbSettings.db_connect() self.cursor = self.db.cursor() self.redis = redis.Redis.from_url(REDIS_URL)
def __init__(self): self.db = dbSettings.db_connect() self.cursor = self.db.cursor() self.detailresponseTable = "`detailresponse`"
cursor.execute(sql) db.commit() return { 'listingId': listingId, 'monthlyAvgPrice': monthlyAvgPrice, 'monthlySumPrice': monthlySumPrice, 'rentRate': rentRate } sm = threading.Semaphore(30) if __name__ == "__main__": statDate = '2021-03-01' db = dbSettings.db_connect() cursor = db.cursor() # YEAR & MONTH cursor.execute("SELECT YEAR('{}') as `YEAR`".format(statDate)) YEAR = cursor.fetchall()[0]['YEAR'] cursor.execute("SELECT MONTH('{}') as `MONTH`".format(statDate)) MONTH = cursor.fetchall()[0]['MONTH'] print(YEAR, MONTH) ########## listing start ########## # cursor.execute("SELECT `house_id` FROM houselist") # results = cursor.fetchall() # start = time.time() # count = 0
def dbUtils(sql): db = dbSettings.db_connect() cursor = db.cursor() cursor.execute(sql) db.commit() return cursor.fetchall()
def areaMonthlyStatistics(areaRow, statDate, dimension): db = dbSettings.db_connect() cursor = db.cursor() # print(sql_areaMonthlyStatistics(areaRow,statDate)) cursor.execute(sql_areaMonthlyStatistics(areaRow, statDate, dimension)) print(areaRow) result = cursor.fetchall()[0] print(result) if result['monthlySumPrice'] == None: return statResult = result cursor.execute(sql_areaFeatureStatistics(areaRow, statDate, dimension)) print(areaRow) result = cursor.fetchall() FeatureResult = {} rentalType = {"Entire Home": 0, "Private Room": 0, "Shared Room": 0} airbnbRating = { "reviewCount": 0, "reviewSummary_wei_zhi_bian_li": 0, "reviewSummary_ru_zhu_bian_jie": 0, "reviewSummary_ru_shi_miao_shu": 0, "reviewSummary_gan_jing_wei_sheng": 0, "reviewSummary_gou_tong_shun_chang": 0, "reviewSummary_gao_xing_jia_bi": 0 } rentalSize = {} amenity = {} # rentalSize init for count in [1, 2, 3, 4, 5]: rentalSize["roomCount_" + str(count)] = 0 rentalSize["roomCount_5+"] = 0 # amenity init for k, v in result[0].items(): # print(k[:6]) if k[:7] == "amenity": amenity[k] = 0 for row in result: # print(row) # rentalType if "整套" in row["propertyType"]: rentalType["Entire Home"] += 1 if "独立" in row["propertyType"]: rentalType["Private Room"] += 1 if "合住" in row["propertyType"]: rentalType["Shared Room"] += 1 # airbnbRating for item in [ "reviewSummary_wei_zhi_bian_li", "reviewSummary_ru_zhu_bian_jie", "reviewSummary_ru_shi_miao_shu", "reviewSummary_gan_jing_wei_sheng", "reviewSummary_gou_tong_shun_chang", "reviewSummary_gao_xing_jia_bi" ]: airbnbRating[item] += row[item] if row["reviewSummary_wei_zhi_bian_li"] > 0: airbnbRating["reviewCount"] += 1 # print(airbnbRating["reviewSummary_wei_zhi_bian_li"]) # rentalSize rowRoom = row["chinaTitleDetails_ROOM"] if rowRoom == None: rowRoom = "" if "卧室" in rowRoom: for count in [1, 2, 3, 4, 5]: if str(count) in rowRoom: rentalSize["roomCount_" + str(count)] += 1 for num in range(6, 20): if str(num) in rowRoom: rentalSize["roomCount_5+"] += 1 for k, v in row.items(): if k[:7] == "amenity": amenity[k] += row[k] for item in [ "reviewSummary_wei_zhi_bian_li", "reviewSummary_ru_zhu_bian_jie", "reviewSummary_ru_shi_miao_shu", "reviewSummary_gan_jing_wei_sheng", "reviewSummary_gou_tong_shun_chang", "reviewSummary_gao_xing_jia_bi" ]: if airbnbRating["reviewCount"] > 0: airbnbRating[ item] = airbnbRating[item] / airbnbRating["reviewCount"] pprint(rentalType) # pprint(airbnbRating) # pprint(rentalSize) # pprint(amenity) result = statResult sql = "INSERT INTO `airbnb_scrapy`.`monthlystat` (`dimension`,`province` , `city`, `district`,`year`,`month`,`monthlyAvgPrice`,"+\ "`monthlySumPrice`,`rentRate`,`repeat_flag`,`monthlyPurchasePrice`,`rentalType`,`airbnbRating`,`rentalSize`,`amenity`) "+\ "VALUES ('{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}','{}')".format(dimension,areaRow['province'],areaRow['city'],areaRow['district'],statDate[2:4],statDate[5:7],result['monthlyAvgPrice'],result['monthlySumPrice'],result['rentRate'],dimension+"-"+statDate[2:4]+"-"+statDate[5:7]+"-"+areaRow['province']+areaRow['city']+areaRow['district'],result['monthlyPurchasePrice'],json.dumps(rentalType),json.dumps(airbnbRating),json.dumps(rentalSize),json.dumps(amenity)) # print(sql) cursor.execute(sql) db.commit()
def __init__(self): self.proxyId = 0 self.ip = "" self.table = "`proxypool_us`" self.db = dbSettings.db_connect() self.cursor = self.db.cursor()