Ejemplo n.º 1
0
def merge_xgboost_local_substitute():
    """Merge two prediction sets: start from the local-learning predictions
    and, for every shop recorded as a substitute, replace its row with the
    xgboost prediction; write shops 1..2000 to merge_xgboost_local.csv."""
    xgb_preds = get_pred_for_shops(pred_file = os.path.join(getHome(),"Dropbox",'dataset','Analysis','competition','xgBoost_comp_iterLength_7mode_filled_tarLength_14_iterOrC_copy_0.08607781.csv'))
    local_preds = get_pred_for_shops(pred_file = os.path.join(getHome(),"Dropbox",'dataset','Scripts','LocalLearning','neighbours_10000_alg_ETR_estimator_500_mss_2_msl_1_copy_7x2_drop_xiaohaozi_localLearning714_0.08547515.csv'))
    substitutes = get_substitutes()
    for shop_id in local_preds:
        if shop_id in substitutes:
            local_preds[shop_id] = xgb_preds[shop_id]
    with open('merge_xgboost_local.csv','w') as fw:
        for shop_id in xrange(1,2001):
            row = [shop_id] + local_preds[shop_id]
            fw.write(','.join(map(str, row)) + '\n')
Ejemplo n.º 2
0
def get_substitutes():
    """Read shops_recorded.txt and return the recorded shop ids as a set of ints."""
    recorded_path = os.path.join(getHome(),"Dropbox",'dataset','Scripts','VisualizePredictionResult','shops_recorded.txt')
    with open(recorded_path,'r') as fp:
        # One shop id per line; strip CR/LF before converting.
        return set(int(line.strip('\r\n')) for line in fp)
def get_original_pay_trend(shop_id):
    """
    Load the recorded customer-flow trend for one shop.

    :param shop_id: shop identifier used to build CustomerFlow_<shop_id>.csv
    :return: a dataframe with columns 'day' and 'cnt'
    """
    HOME = getHome()
    PayTrendFolder = os.path.join(HOME, "Dropbox", "dataset", "Analysis", "PayTrend")
    FileNameTemp = "CustomerFlow_%s.csv"
    csvFileName = os.path.join(PayTrendFolder, FileNameTemp % shop_id)
    shopTrend = pd.read_csv(csvFileName, header=None, names=['time', 'cnt'], parse_dates=[0])
    # Vectorized .dt.date replaces the original row-wise apply(lambda, axis=1):
    # same datetime.date values, one C-level pass instead of a Python call per row.
    shopTrend['day'] = shopTrend['time'].dt.date
    shopTrend.drop('time', axis=1, inplace=True)
    return shopTrend
def load_data_set(continuous_zero_filled_threshold, consider_anomaly, lag,
                  outputlength, startshop, endshop):
    """Concatenate the per-shop feature CSVs for shops startshop..endshop
    (inclusive) into one dataframe, skipping shops whose file is empty.

    lag and outputlength are accepted for interface compatibility but are
    not used when building the source path.
    """
    anomaly_tag = "consider" if consider_anomaly else "not_consider"
    folder_name = "PayTrend_Filled_threshold_%s_%s_anomaly" % (
        continuous_zero_filled_threshold, anomaly_tag)
    SourceFolder = os.path.join(getHome(), "Dropbox", "dataset", "Analysis",
                                folder_name, "NewFeatures")
    frames = []
    print("loading data")
    for shop_id in xrange(startshop, endshop + 1):
        if shop_id % 200 == 0:
            print(shop_id)  # progress marker every 200 shops
        each_shop = pd.read_csv(
            os.path.join(SourceFolder, "feature_shop_%s.csv" % shop_id))
        if each_shop.shape[0] != 0:
            frames.append(each_shop)
    merged = pd.concat(frames, ignore_index=True)
    print('finish loading')
    return merged
Ejemplo n.º 5
0
        sys_name = "Lin"
        HOME_modelFolder = os.path.expanduser('~')
    elif system.startswith("Win"):
        HOME = r"C:\Users\SI30YD"
        if not os.path.exists(HOME):
            HOME = r"C:\Users\KH44IM"
        sys_name = "Win"
        HOME_modelFolder = r"H:\Model"
    else:
        print "Unknown platform"
        sys_name = "No"
        sys.exit(0)
    return HOME, HOME_modelFolder


# NOTE(review): getHome is called here but its import (`from tianchi_api.system
# import getHome`) only appears below — as ordered this would raise NameError at
# import time. Looks like a paste/concatenation artifact; confirm original order.
HOME, HOME_modelFolder = getHome()
sys.path.append(os.path.join(HOME, "Dropbox", "dataset", "Scripts"))
from tianchi_api.system import getHome
from tianchi_api.metrics import loss, loss_reverse
from sklearn.model_selection import KFold
from tianchi_api.metrics import loss
from tianchi_api.competition import CompetitionPredictionModel, IterativePredictionModel, NewCompetitionPredictionModel
from tianchi_api.features import *
from tianchi_api.models import *


def Model_for_competition(algorithm_name,
                          pickle_model_file_name,
                          fgfs,
                          ReportFolder,
                          source,
# -*- coding: utf-8 -*-
'''
The improved KNN predictor considers all the features.
'''

import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import sys
import os
import numpy as np
from tianchi_api.system import getHome
#from zero_statistics import get_original_pay_trend
import zero_statistics as zt
# Module-wide accumulator of anomaly records; write_anamolies() expects each
# entry to be indexable with at least three fields. ("anamolies" [sic])
global_anamolies = set()
# Destination folder for the anomaly dump, under the user's Dropbox dataset tree.
DESFOLDER = os.path.join(getHome(), "Dropbox", "dataset", "Analysis",
                         "AnamolyDetect")
# Name of the CSV file written into DESFOLDER.
FILENAME = "firstpart.csv"


def write_anamolies():
    """Dump every record in global_anamolies to DESFOLDER/FILENAME as CSV.

    Each record must be indexable with at least three fields; the first three
    are written as one comma-separated line. Iteration order follows the set,
    so row order is unspecified.
    """
    try:
        os.makedirs(DESFOLDER)
    except OSError:
        # Directory already exists (or is not creatable) — keep the original
        # best-effort behavior, but no longer swallow unrelated exceptions
        # (e.g. KeyboardInterrupt) the way the bare `except:` did.
        pass
    with open(os.path.join(DESFOLDER, FILENAME), 'w') as fw:
        for rec in global_anamolies:
            fw.write("%s,%s,%s\n" % (rec[0], rec[1], rec[2]))


def click_data_show(shop_id):
    """
    Return the home directory according to the platform.

    NOTE(review): despite its name, this function only resolves a platform
    home directory and never touches click data; shop_id is unused — confirm
    against callers before renaming or removing the parameter.

    :param shop_id: unused (kept for interface compatibility)
    :return: home directory path string; exits the process on unknown platforms
    """
    # `platform` is not among this module's visible imports; import locally so
    # the function cannot fail with NameError when the module-level import is
    # missing.
    import platform
    system = platform.system()
    if system.startswith("Lin"):
        HOME = os.path.expanduser('~')
    elif system.startswith("Win"):
        HOME = r"C:\Users\KH44IM"
    else:
        # Single-argument print() form is equivalent under Python 2 and 3.
        print("Unknown platform")
        sys.exit(0)
    return HOME


HOME = getHome()
sys.path.append(os.path.join(HOME, "Dropbox", "dataset", "Scripts"))
from tianchi_api.system import getHome
from tianchi_api.metrics import loss, loss_reverse

import xgboost as xgb
from xgboost.sklearn import XGBRegressor
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

from sklearn.model_selection import KFold


def scorer(estimator, X, y):
    """sklearn-style scorer: reverse loss of the estimator's predictions on X
    against the targets y (verbose flag off)."""
    predictions = estimator.predict(X)
    return loss_reverse(predictions, y, False)
Ejemplo n.º 8
0
    filename = os.path.join(getHome(),"Dropbox",'dataset','Scripts','VisualizePredictionResult','shops_recorded.txt')
    shop_set = set()
    with open(filename,'r') as fp:
        lines = fp.readlines()
        for l in lines:
            shop_set.add(int(l.strip('\r\n')))
    return shop_set

def merge_xgboost_local_substitute():
    """Merge xgboost and local-learning predictions: for every shop in the
    recorded substitutes set, replace the local prediction with the xgboost
    one, then write shops 1..2000 to merge_xgboost_local.csv."""
    # Predictions keyed by shop id (indexed as local[shop] below).
    xgboost = get_pred_for_shops(pred_file = os.path.join(getHome(),"Dropbox",'dataset','Analysis','competition','xgBoost_comp_iterLength_7mode_filled_tarLength_14_iterOrC_copy_0.08607781.csv'))
    local = get_pred_for_shops(pred_file = os.path.join(getHome(),"Dropbox",'dataset','Scripts','LocalLearning','neighbours_10000_alg_ETR_estimator_500_mss_2_msl_1_copy_7x2_drop_xiaohaozi_localLearning714_0.08547515.csv'))
    shop_set = get_substitutes()
    # Prefer the xgboost prediction for shops recorded as substitutes.
    for shop in local:
        if shop in shop_set:
            local[shop] = xgboost[shop]
    # One CSV row per shop: shop id followed by its prediction values.
    with open('merge_xgboost_local.csv','w') as fw:
        for s in xrange(1,2001):
            fw.write(','.join(map(str,[s]+local[s]))+'\n')



# Script entry point: concatenate the chosen prediction file onto the original
# trends and save visualizations under BestScore.
if __name__=='__main__':
    pred_file = os.path.join(getHome(),"Dropbox",'dataset','Scripts','LocalLearning','Ext_LocalKernel_weight+114_77copy_0.08390_1.1_0.0832.csv')
    #print "Here"
    #pred_file = os.path.join(getHome(),"Dropbox",'dataset','Analysis','competition','features_final','xgBoost_comp_iterLength_7mode_filled_tarLength_14_iterOrC_iterative_77copy.csv')
    #print pred_file
    savefolder = os.path.join(getHome(), 'Dropbox', 'dataset', 'Analysis', 'Pred_Visulization','BestScore')
    concat_pred_to_original(range(1,2001),5,True,pred_file,savefolder)
    #for s in range(1,2001):
        #compare_preds(s)
    #merge_xgboost_local_substitute()
    # NOTE(review): everything below looks like a fragment of a getHome()-style
    # platform helper pasted into the __main__ block: `system` is undefined at
    # this point, and the `return` statement below is outside any function,
    # which is a SyntaxError. TODO: confirm against the original source and
    # restore this code to its own function or remove it.
    if system.startswith("Lin"):
        HOME = os.path.expanduser('~')
        sys_name = "Lin"
        HOME_modelFolder = os.path.expanduser('~')
    elif system.startswith("Win"):
        HOME = r"C:\Users\KH44IM"
        sys_name = "Win"
        HOME_modelFolder = r"H:\Model"
    else:
        print "Unknown platform"
        sys_name = "No"
        sys.exit(0)
    return HOME, HOME_modelFolder


sys.path.append(os.path.join(getHome()[0], "Dropbox", "dataset", "Scripts"))
import pandas as pd
import matplotlib.pyplot as plt

import numpy as np
from tianchi_api.system import getHome
from tianchi_api.metrics import loss
from tianchi_api.system import getHome
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import NearestNeighbors
from tianchi_api.competition import CompetitionPredictionModel, IterativePredictionModel, NewCompetitionPredictionModel
from tianchi_api.features import *
from tianchi_api.getPredictors import predictors_WeatherAirTem