import os
import pickle
from datetime import datetime, timedelta
from multiprocessing import Pool

import pandas as pd

# `cursor`, `cursor1`..`cursor4` (MongoDB collections), `logger`, `utils`,
# `write_idList` and the pool worker `lowPrice_data` are assumed to be
# defined elsewhere at module level.


def orgPrice_data(params):
    # Dump today's (and later) fare records to a local CSV, then ship to HDFS.
    columns = ["orgPrice_id", "fc", "orgPrice"]
    with open(params["localFileName_org_orgPrice_data"], 'w') as f:
        f.write(','.join(columns))
        f.write('\n')
        for sample in cursor1.find(
                {'date': {'$gte': datetime.today().strftime("%Y-%m-%d")}}):
            orgPrice_id = sample.get('_id')
            del sample['_id']
            del sample["date"]
            del sample["ut"]
            del sample["src"]
            # The remaining keys are fare classes; keep only the Y-class price.
            for key, value in sample.items():
                try:
                    orgPrice = value.get('Y').get('price')
                    content = ','.join([orgPrice_id, key, orgPrice])
                    f.write(content + '\n')
                except Exception:  # skip malformed fare entries
                    continue
    logger.info("====\"{}\" finished====".format(
        params["localFileName_org_orgPrice_data"].split('/')[-1]))
    utils.delete_before4_localData(params["localFileName_org_orgPrice_data"],
                                   params)
    utils.upload_to_hdfs(params["localFileName_org_orgPrice_data"],
                         params["sparkDirName_org_orgPrice_data"], params)
def infoBase_data(params):
    # Dump flight base info (departure/arrival times, codeshare flag) to CSV.
    columns = ["infoBase_id", "departTime", "arriveTime", "isShare"]
    with open(params["localFileName_org_infoBase_data"], 'w') as f:
        f.write(','.join(columns))
        f.write('\n')
        for sample in cursor4.find(
                {'date': {'$gte': datetime.today().strftime("%Y-%m-%d")}}):
            infoBase_id = sample.get("_id")
            departtime = sample.get('origindeparttime')
            arrivetime = sample.get('originarrivetime')
            isShare = sample.get('isshare')
            try:
                content = ','.join(
                    [infoBase_id, departtime, arrivetime, str(isShare)])
                f.write(content + '\n')
            except Exception:  # skip records with missing fields
                continue
    logger.info("====\"{}\" finished====".format(
        params["localFileName_org_infoBase_data"].split('/')[-1]))
    utils.delete_before4_localData(params["localFileName_org_infoBase_data"],
                                   params)
    utils.upload_to_hdfs(params["localFileName_org_infoBase_data"],
                         params["sparkDirName_org_infoBase_data"], params)
def lowPrice_data_main(params):
    # Regenerate the id list, then fan out per-id extraction to a process pool.
    write_idList(params, cursor)
    utils.delete_before4_localData(params["localFileName_lowPrice_idList"],
                                   params)
    if os.path.exists(params["localFileName_org_lowPrice_data"]):
        os.remove(params["localFileName_org_lowPrice_data"])
    columns = ["queryDate", 'price', 'id', 'org', 'dst']
    # Write the header once; the pool workers append their own rows afterwards.
    with open(params["localFileName_org_lowPrice_data"], 'a') as f_write:
        f_write.write(",".join(columns))
        f_write.write("\n")
    p = Pool(10)
    counter = 0
    with open(params["localFileName_lowPrice_idList"], 'r') as f_read:
        for line in f_read:
            counter += 1
            L = line.strip().split(',')
            p.apply_async(lowPrice_data, args=(params, L))
    p.close()
    p.join()
    logger.info("=====\"{}\" finished======".format(
        params["localFileName_org_lowPrice_data"].split('/')[-1]))
    utils.delete_before4_localData(params["localFileName_org_lowPrice_data"],
                                   params)
    utils.upload_to_hdfs(params["localFileName_org_lowPrice_data"],
                         params["sparkDirName_org_lowPrice_data"], params)
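# `lowPrice_data` (the worker submitted to the pool above) is defined elsewhere
# in this repo and appends one id's rows to the shared CSV from ten processes
# at once. Below is a minimal sketch of a write-safe append helper, assuming a
# lock shared through the Pool initializer; the names here are hypothetical
# illustrations, not the repo's actual worker.


def _init_lowPrice_pool(lock):
    # Pool initializer: runs once per worker process and stores the shared lock.
    global _lowPrice_write_lock
    _lowPrice_write_lock = lock


def _locked_append(fileName, rows):
    # Serialize appends so rows from different workers never interleave.
    with _lowPrice_write_lock:
        with open(fileName, 'a') as f:
            f.writelines(row + '\n' for row in rows)


# Usage sketch:
#     from multiprocessing import Lock
#     p = Pool(10, initializer=_init_lowPrice_pool, initargs=(Lock(),))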
def seatleft_data(params):
    # Dump remaining-seat counts per query datetime to CSV via pandas appends.
    columns = ['queryDatetime', 'seatLeft', 'seatLeft_id']
    # Write the header first; rows are appended below via DataFrame.to_csv.
    with open(params["localFileName_org_seatLeft_data"], 'w') as f:
        f.write(','.join(columns))
        f.write('\n')
    for sample in cursor3.find({}):
        seatLeft_id = sample.get('_id')
        df = pd.DataFrame.from_dict(
            sample.get('fc'),
            orient='index').reset_index().rename(columns={
                'index': 'queryDatetime',
                0: 'seatLeft'
            })
        df['seatLeft_id'] = seatLeft_id
        df.to_csv(params["localFileName_org_seatLeft_data"],
                  header=False,
                  index=False,
                  mode='a')
    logger.info("====\"{}\" finished====".format(
        params["localFileName_org_seatLeft_data"].split('/')[-1]))
    utils.delete_before4_localData(params["localFileName_org_seatLeft_data"],
                                   params)
    utils.upload_to_hdfs(params["localFileName_org_seatLeft_data"],
                         params["sparkDirName_org_seatLeft_data"], params)
def lowPrice_online_data(params):
    # Build the online (serving) dataset: for each departure month-day in the
    # next 30 days, keep one record per id at its latest query date.
    tomorrow_str2 = datetime.strftime(
        params["generateDate"] + timedelta(days=1), '%Y-%m-%d')
    after30_str2 = datetime.strftime(
        params["generateDate"] + timedelta(days=30), '%Y-%m-%d')
    monthDay_list = pd.date_range(
        start=tomorrow_str2, end=after30_str2,
        freq='d').map(lambda x: datetime.strftime(x, '%m-%d'))
    columns = ["queryDate", 'price', 'id', 'org', 'dst']
    with open(params["localFileName_org_lowPrice_onlineData"], 'w') as f:
        f.write(','.join(columns))
        f.write('\n')
        for monthDay in monthDay_list:
            # For online data, filter by id and keep only the price at the
            # latest query date (one record per id); further checks are done
            # on Spark. If the latest query date falls within the last 7 days,
            # i.e. the id has a recent price record, it is rewritten as
            # yesterday, because the model may only use information up to
            # yesterday when predicting the price trend from today onward.
            for sample in cursor.find(
                    {'_id': {"$regex": r'.*{}$'.format(monthDay)}}):
                lowPrice_id = sample.get('_id')
                del sample['_id']
                org = sample.get('dairport')
                del sample['dairport']
                dst = sample.get('aairport')
                del sample['aairport']
                queryDate = max(sample.keys())
                price = sample.get(queryDate)
                # historyLowPrice_fn_domestic contains malformed records,
                # e.g. _id="3U3100_null_null_09-24", hence the guard below.
                try:
                    content = ','.join(
                        [queryDate, price, lowPrice_id, org, dst])
                    f.write(content + '\n')
                except Exception:
                    continue
    logger.info("=====\"{}\" finished======".format(
        params["localFileName_org_lowPrice_onlineData"].split('/')[-1]))
    utils.delete_before4_localData(
        params["localFileName_org_lowPrice_onlineData"], params)
    utils.upload_to_hdfs(params["localFileName_org_lowPrice_onlineData"],
                         params["sparkDirName_org_lowPrice_onlineData"],
                         params)
def write_DCN_featureDict(df_all, params):
    # Assign every distinct value of each categorical column a globally unique
    # integer index (numeric columns are skipped) and pickle the mapping for
    # the DCN embedding layer.
    df = df_all.drop(*params["dropFeatures"]).drop(params['label'])
    featureDict = {}
    tc = 0
    for colName in df.columns:
        if colName in params["numericCols"]:
            continue
        else:  # colName in categoryCols
            uniqueFeature = df.select(colName).distinct().toPandas(
            )[colName].astype('float').values
            featureDict[colName] = dict(
                zip(uniqueFeature, range(tc, len(uniqueFeature) + tc)))
            tc = tc + len(uniqueFeature)
    with open(params["featureDict_fileName"], 'wb') as f:
        pickle.dump(featureDict, f)
    logger.info("====\"{}\" finished ====".format(
        params["featureDict_fileName"].split('/')[-1]))
    utils.delete_before4_localData(params["featureDict_fileName"], params)
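# A minimal sketch of how the pickled featureDict could be consumed when
# assembling DCN inputs: each categorical value is replaced by the global
# embedding index assigned above. The function and row format here are
# hypothetical, not part of this repo.


def lookup_feature_indices(row, featureDict, default_index=0):
    # `row` maps colName -> raw value; unseen values fall back to default_index.
    # Keys were stored as floats in write_DCN_featureDict, hence float(value).
    return {
        colName: featureDict[colName].get(float(value), default_index)
        for colName, value in row.items() if colName in featureDict
    }


# Usage sketch:
#     with open(params["featureDict_fileName"], 'rb') as f:
#         featureDict = pickle.load(f)
#     indices = lookup_feature_indices({'org': 12.0, 'dst': 7.0}, featureDict)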
def globalAirport_data(params):
    # Dump airport coordinates (code, latitude, longitude) to CSV.
    columns = ["Airport_code", "latitude", "longitude"]
    with open(params["localFileName_org_Airport_data"], 'w') as f:
        f.write(','.join(columns))
        f.write('\n')
        for sample in cursor2.find({}):
            Airport_code = sample.get("_id")
            latitude = sample.get("latitude")
            longitude = sample.get("longitude")
            try:
                content = ','.join([Airport_code, latitude, longitude])
                f.write(content + '\n')
            except Exception:  # skip records with missing coordinates
                continue
    logger.info("====\"{}\" finished====".format(
        params["localFileName_org_Airport_data"].split('/')[-1]))
    utils.delete_before4_localData(params["localFileName_org_Airport_data"],
                                   params)
    utils.upload_to_hdfs(params["localFileName_org_Airport_data"],
                         params["sparkDirName_org_Airport_data"], params)
def seatleft_data_add(params):
    # Incremental seat-left extraction: departure month-days from yesterday
    # through 30 days out, excluding today.
    today_monthDay = datetime.strftime(params["generateDate"], '%m-%d')
    yesterday_str2 = (params["generateDate"] -
                      timedelta(days=1)).strftime('%Y-%m-%d')
    after30_str2 = datetime.strftime(
        params["generateDate"] + timedelta(days=30), '%Y-%m-%d')
    monthDay_list = pd.date_range(
        start=yesterday_str2, end=after30_str2,
        freq='d').map(lambda x: datetime.strftime(x, '%m-%d')).to_list()
    monthDay_list.remove(today_monthDay)
    columns = ['queryDatetime', 'seatLeft', 'seatLeft_id']
    # Write the header first; rows are appended below via DataFrame.to_csv.
    with open(params["localFileName_org_seatLeft_data_add"], 'w') as f:
        f.write(','.join(columns))
        f.write('\n')
    for monthDay in monthDay_list:
        for sample in cursor3.find(
                {'_id': {"$regex": r'.*{}$'.format(monthDay)}}):
            seatLeft_id = sample.get('_id')
            df = pd.DataFrame.from_dict(
                sample.get('fc'),
                orient='index').reset_index().rename(columns={
                    'index': 'queryDatetime',
                    0: 'seatLeft'
                })
            df['seatLeft_id'] = seatLeft_id
            df.to_csv(params["localFileName_org_seatLeft_data_add"],
                      header=False,
                      index=False,
                      mode='a')
    logger.info("====\"{}\" finished====".format(
        params["localFileName_org_seatLeft_data_add"].split('/')[-1]))
    utils.delete_before4_localData(
        params["localFileName_org_seatLeft_data_add"], params)
    utils.upload_to_hdfs(params["localFileName_org_seatLeft_data_add"],
                         params["sparkDirName_org_seatLeft_data_add"], params)
def lowPrice_train_data_add(params):
    # Incremental training data: all price histories for ids whose departure
    # month-day equals yesterday's.
    columns = ["queryDate", 'price', 'id', 'org', 'dst']
    yesterday_monthDay_str = (params["generateDate"] -
                              timedelta(days=1)).strftime('%m-%d')
    # Write the header first; rows are appended below via DataFrame.to_csv.
    with open(params["localFileName_org_lowPrice_data_add"], 'w') as f:
        f.write(','.join(columns))
        f.write('\n')
    for sample in cursor.find(
            {'_id': {"$regex": r'.*{}$'.format(yesterday_monthDay_str)}}):
        lowPrice_id = sample.get('_id')
        del sample['_id']
        org = sample.get('dairport')
        del sample['dairport']
        dst = sample.get('aairport')
        del sample['aairport']
        # The remaining keys are queryDate -> price pairs.
        df = pd.DataFrame.from_dict(
            sample, orient='index').reset_index().rename(columns={
                'index': 'queryDate',
                0: 'price'
            })
        df['id'] = lowPrice_id
        df['org'] = org
        df['dst'] = dst
        df.to_csv(params["localFileName_org_lowPrice_data_add"],
                  header=False,
                  index=False,
                  mode='a')
    logger.info("=====\"{}\" finished======".format(
        params["localFileName_org_lowPrice_data_add"].split('/')[-1]))
    utils.delete_before4_localData(
        params["localFileName_org_lowPrice_data_add"], params)
    utils.upload_to_hdfs(params["localFileName_org_lowPrice_data_add"],
                         params["sparkDirName_org_lowPrice_data_add"], params)
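# A hedged sketch of how a daily run might wire these extractors together. The
# params keys mirror the ones used above, but the concrete paths and the call
# order are assumptions, not this repo's actual scheduler.

if __name__ == '__main__':
    params = {
        "generateDate": datetime.today(),
        # Assumed local/HDFS paths; the remaining localFileName_* and
        # sparkDirName_* keys would be filled in the same way before calling
        # the other extractors.
        "localFileName_org_Airport_data": "/data/local/airport.csv",
        "sparkDirName_org_Airport_data": "/data/hdfs/airport",
    }
    globalAirport_data(params)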