Exemple #1
0
def featureConstruct(flag):
    """
    Build the processed train/test feature files.

    Columns 0, 1 and 2 are one-hot encoded as categorical features; every
    remaining column (index 3 onwards) is treated as continuous.

    flag=0: continuous columns are left without normalization
    flag=1: continuous columns are standardized with a scaler fitted on
            the training data only, then applied to both train and test
    """
    train_loader = load_data.Load_data(Config('feature.csv'))
    train_matrix = train_loader.feature_data()

    test_loader = load_data.Load_data(Config('feature_test.csv'))
    test_matrix = test_loader.feature_data()

    if flag == 1:
        # Fit on the training columns only so test statistics never leak in.
        scaler = preprocessing.StandardScaler()
        scaler.fit(train_matrix[:, 3:])
        for matrix in (train_matrix, test_matrix):
            matrix[:, 3:] = scaler.transform(matrix[:, 3:])

    categorical_columns = np.array([0, 1, 2])
    category_sizes = [67, 145, 7]
    enc = OneHotEncoder(categorical_features=categorical_columns,
                        n_values=category_sizes)
    enc.fit(train_matrix)

    encoded_train = enc.transform(train_matrix).toarray()
    encoded_test = enc.transform(test_matrix).toarray()

    # Both outputs are written through the training-side loader.
    train_loader.save_file("feature_train_processed.csv",
                           pd.DataFrame(encoded_train))
    train_loader.save_file("feature_test_processed.csv",
                           pd.DataFrame(encoded_test))
Exemple #2
0
def constructTrain():
    """
    Write one merged weather/order training CSV per date.

    For every date returned by constructTime(), the request, answer and gap
    tables are flattened into one row per (district, time slot) pair,
    merged with that date's weather table on 'time', and saved as
    'train/train_data_<date>.csv'.
    """
    date, week = constructTime()
    districtCount = 66
    slotCount = 144
    columnOrder = ['districtID', 'time', 'date', 'week', 'req', 'ans', 'gap']

    # Debugging hint: restrict the loop to the first date for a quick run.
    for idx, day in enumerate(date):
        configInstance = Config(day)
        tables = Table(configInstance).constructTable(0)
        # The fourth table is returned by constructTable but not used here.
        tableReq, tableAns, tableGap, _unusedDest, tableWeather = tables

        # District-major ordering: all slots of district 1, then district 2...
        cells = [(j, k) for j in range(districtCount) for k in range(slotCount)]
        weekday = week[idx]

        tableData = pd.DataFrame(
            {
                'districtID': [j + 1 for j, _ in cells],
                'time': [k + 1 for _, k in cells],
                'date': [day] * len(cells),
                'week': [weekday] * len(cells),
                'req': [tableReq[j][k] for j, k in cells],
                'ans': [tableAns[j][k] for j, k in cells],
                'gap': [tableGap[j][k] for j, k in cells]
            },
            columns=columnOrder
        )

        ResultTable = pd.merge(tableWeather, tableData, on=['time'])

        fileName = 'train/train_data_' + day + '.csv'
        load_data.Load_data(configInstance).save_file(fileName, ResultTable)
Exemple #3
0
# Remaining run settings: entries 4-8 of input_lines, each stripped of
# surrounding whitespace and cast to its expected type.
tree_classify_network_cell = str(input_lines[4].strip())
hidden_size = int(input_lines[5].strip())
nb_epoch = int(input_lines[6].strip())  # presumably the number of training epochs - confirm
batch_size = int(input_lines[7].strip())
divide_file_factor = int(input_lines[8].strip())


# The raw lines are no longer needed; the name is rebound to None right after
# the del, so it stays defined for any later reference.
del input_lines
input_lines = None


# Load Training Data

# The training file path is the second command-line argument.
train_file = str(sys.argv[2])

ld = load_data.Load_data(train_file, max_word_length, divide_file_factor)   # loader built from the training file name, max_word_length and divide_file_factor

# Each load below prints the shape (or size) of what it returned.
train_char_data = ld.load_train_char_data()
print ('train_char_data.shape: ' + str(train_char_data.shape))

train_data_class_annotation, unique_edit_trees_from_train_data = ld.load_class_annotation_and_unique_edit_trees_from_train_data()
print ('train_data_class_annotation: ' + str(train_data_class_annotation.shape))
print ('size of unique_edit_trees_from_train_data: ' + str(len(unique_edit_trees_from_train_data)))

train_data_applicable_trees = ld.load_applicable_trees_data(train_file)
print ('train_data_applicable_trees: ' + str(train_data_applicable_trees.shape))

train_data_global_vectors = ld.load_global_word_vectors(train_file, global_vector_file)
print ('train_data_global_vectors.shape: ' + str(train_data_global_vectors.shape))

# Number of tree classes = number of unique edit trees found in the training data.
nb_tree_classes = len(unique_edit_trees_from_train_data)
Exemple #4
0
    def constructTable(self, dataType):
        """
        Build the 66 by 144 order tables and the converted weather table.

        dataType selects the data source: 0 loads the training order and
        weather data, any other value loads the test order and weather data.
        The cluster map is always read via train_cluster_map().

        Returns a 5-tuple:
            tableReq     - number of orders per (district, time slot)
            tableAns     - orders whose driver id is not null
            tableGap     - orders whose driver id is null
            tableStart   - number of distinct start values per cell
            weatherData  - the weather frame with its 'time' column replaced
                           by self.timeConvert(time)
        The four tables are float arrays of shape (66, 144).
        """
        loadDataInstance = load_data.Load_data(self.config)

        if dataType == 0:
            orderData = loadDataInstance.train_order_data()
            weatherData = loadDataInstance.train_weather_data()
        else:
            orderData = loadDataInstance.test_order_data()
            weatherData = loadDataInstance.test_weather_data()

        # Convert the whole 'time' column in one assignment. The previous
        # per-row DataFrame.ix write relied on an indexer that was removed
        # in pandas 1.0.
        weatherData['time'] = [self.timeConvert(t) for t in weatherData['time']]

        # Maps the value in cluster-map column 0 to the value in column 1;
        # the mapped value is used below as a 1-based district row number.
        clusterMap = loadDataInstance.train_cluster_map()
        clusterDict = pd.Series(clusterMap[:, 1], index=clusterMap[:, 0])

        n = 66
        m = 144
        tableReq = np.zeros((n, m))
        tableAns = np.zeros((n, m))
        tableGap = np.zeros((n, m))

        # Distinct start values seen in each (district, slot) cell.
        startSet = [[set() for _ in range(m)] for _ in range(n)]

        for order in orderData:
            driverId = order[1]
            start = order[3]
            # Both lookups are 1-based; shift to 0-based array indices.
            d = clusterDict[start] - 1
            r = self.timeConvert(order[6]) - 1

            tableReq[d][r] += 1
            startSet[d][r].add(start)

            # An order without a driver id counts as a gap, otherwise as answered.
            if pd.isnull(driverId):
                tableGap[d][r] += 1
            else:
                tableAns[d][r] += 1

        tableStart = np.array(
            [[len(cell) for cell in cells] for cells in startSet],
            dtype=float
        )

        return tableReq, tableAns, tableGap, tableStart, weatherData
Exemple #5
0
def constructTest():
    """
    Write one merged weather/order test CSV per date listed in the test set.

    Each entry of test_set_1_readme1() is read as a '-'-separated string
    whose third field is the day and whose fourth field is a 1-based time
    slot. For every entry, the three slots immediately before it are
    collected under the key '01-<day>'. For each such date the request,
    answer and gap values of those slots are flattened into one row per
    (district, slot) pair, merged with the weather table on 'time', and
    saved as 'test/test_data_<date>.csv'.

    The month (01) and year (2016) are hard-coded.
    """
    configInstance = Config('01-22')
    loadInstance = load_data.Load_data(configInstance)
    timeArr = loadInstance.test_set_1_readme1()

    # date string -> 0-based indices of the slots preceding each listed slot
    dateDict = defaultdict(list)
    for entry in timeArr:
        fields = entry[0].split('-')
        ind = int(fields[3]) - 1
        date = '01-%02d' % int(fields[2])
        # NOTE(review): for slots 1-3 these indices are negative and would
        # index the tables from the end - confirm the test set never lists them.
        dateDict[date].extend((ind - 3, ind - 2, ind - 1))

    n = 66
    columns = ['districtID', 'time', 'date', 'week', 'req', 'ans', 'gap']

    for date, slots in dateDict.items():
        configInstance = Config(date)
        tableInstance = Table(configInstance)
        # The fourth table is returned by constructTable but is not part of
        # the output columns, so it is not read here.
        tableReq, tableAns, tableGap, _tableDest, tableWeather = tableInstance.constructTable(1)

        dayOfMonth = int(date.split('-')[1])
        week = datetime(2016, 1, dayOfMonth).weekday()

        # One row per (district, slot); districtID and time are 1-based.
        rows = [
            (j + 1, k + 1, date, week,
             tableReq[j][k], tableAns[j][k], tableGap[j][k])
            for j in range(n)
            for k in slots
        ]
        tableData = pd.DataFrame(rows, columns=columns)

        ResultTable = pd.merge(tableWeather, tableData, on=['time'])

        fileName = 'test/test_data_' + date + '.csv'
        loadInstance = load_data.Load_data(configInstance)
        loadInstance.save_file(fileName, ResultTable)