Example #1
0
def multi_index():
    """Print a two-level indexed frame, then a few selections from a flat one.

    A 4x4 frame of random normals is built with two-level labels on both the
    rows and the columns; the frame and its row index are printed.  A second
    4x4 integer frame with flat labels is then printed together with its
    third column (by position), the row labelled '3', and column 'C'.

    Returns nothing; all output goes to stdout.
    """
    df = DataFrame(np.random.randn(16).reshape(4, 4),
                   index=[['a', 'b', 'a', 'c'], [1, 2, 3, 2]],
                   columns=[["Hot", "Cold", "Hot", "Cold"],
                            ["Good", "Bad", "Bad", "Good"]])
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(df)
    print(df.index)
    df2 = DataFrame(np.arange(16).reshape(4, 4),
                    index=["1", '2', '3', '4'],
                    columns=["A", "B", "C", "D"])
    print(df2)
    # .iloc (by position) and .loc (by label) stand in for icol() and .ix.
    print(df2.iloc[:, 2])
    print(df2.loc['3'])
    print(df2['C'])
Example #2
0
def multi_index():
    """Demonstrate hierarchical labels and basic row/column selection.

    Prints a random 4x4 frame whose rows and columns both have two label
    levels, followed by its row index.  Then prints a flat-labelled 4x4
    integer frame, its third column selected by position, the row labelled
    '3', and column 'C' selected by label.

    Returns nothing; all output goes to stdout.
    """
    row_labels = [['a', 'b', 'a', 'c'], [1, 2, 3, 2]]
    col_labels = [["Hot", "Cold", "Hot", "Cold"], ["Good", "Bad", "Bad", "Good"]]
    df = DataFrame(np.random.randn(16).reshape(4, 4), index=row_labels, columns=col_labels)
    print(df)
    print(df.index)
    df2 = DataFrame(np.arange(16).reshape(4, 4), index=["1", '2', '3', '4'], columns=["A", "B", "C", "D"])
    print(df2)
    # icol() and .ix are gone from current pandas; use the explicit accessors.
    print(df2.iloc[:, 2])
    print(df2.loc['3'])
    print(df2['C'])
    Series([1, 2, 3]),
    Series(['John', 'Amy', 'Mark']),
    Series([True, False, True])
])
# Interactive-style snippets: each bare expression is meant to be evaluated
# for its displayed result.  frame2 and frame3 are expected to exist already.
DataFrame([[1, 2, 3], ['J', 'A', 'M'], [True, False, True]])
# Membership tests against the row labels, the column labels and the frame.
'c' in frame2.index
'Washu' in frame2.columns
'Washu' in frame2
# Drop a row label, then a column label (axis=1).
frame2.drop('d')
frame2.drop('Washu', axis=1)
# Boolean filtering on one column.
frame2[frame2['Washu'] > 0]
frame2 = frame2.fillna(0)
# Boolean masks on both axes; .loc replaces the removed .ix accessor.
frame2.loc[frame2.Washu > 0, frame2.loc['d'] > 0]
# Cross-sections by row label and by column label.
frame2.xs('d')
frame2.xs('UM', axis=1)
# Positional access; .iloc replaces the removed icol()/irow() methods.
frame2.iloc[:, 2]
frame2.iloc[4]
frame2.add(frame3, fill_value=0)
frame2.applymap(lambda x: '%.2f' % x)
# A frame whose row labels repeat.
df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df.index.is_unique
df.index.unique()
df.loc['a']
frame2.describe()
"""
Unique values, value counts, membership
Not unique indices
"""
obj = Series(list('cbdaabbcc'))
obj.unique()
s = obj.value_counts()
Example #4
0
def looding():
    """Build the per-city traffic workbook from one day of cell statistics.

    The function works in three timed phases and prints the elapsed minutes
    and a status message after each one:

    1. Read the LTE and WCDMA CSV exports, the GSM workbook and the campus
       cell list (all without header rows; the first line is skipped).
    2. Derive the traffic columns, name the columns, join each technology
       against the campus cell list, and pivot the results by city.
    3. Write every summary table to ``result_alltest<date>.xlsx``.

    The input date is hard-coded in ``data1``.  Returns nothing.
    """
    # Row labels shared by all four per-city summary tables.
    cities = [
        u'南宁', u'柳州', u'桂林', u'崇左', u'梧州', u'贵港', u'百色',
        u'北海', u'玉林', u'钦州', u'贺州', u'来宾', u'防城港', u'河池'
    ]

    # ---- Phase 1: read the inputs -------------------------------------
    start = clock()
    data1 = '0814'
    # Plain u'' literals (no backslashes involved) parse on Python 2 and 3,
    # unlike the ur'' prefix.
    dflte = read_csv(u'4G指标%s.csv' % data1,
                     skiprows=[0],
                     header=None,
                     encoding="gbk")
    dfwcdma = read_csv(u'%swcdma.csv' % data1,
                       skiprows=[0],
                       header=None,
                       encoding="gbk")
    dfgsm = read_excel(u'%sgsm.xls' % data1,
                       skiprows=[0],
                       header=None,
                       sheetname='Sheet')
    dfnamelist = read_excel(u'校园清单0802.xlsx', skiprows=[0], header=None)
    end = clock()
    print(str((end - start) / 60) + 'mins')
    print(u'完成读取')

    # ---- Phase 2: derive columns, join and aggregate ------------------
    start = clock()
    # Columns are still integer-labelled here (header=None); they get names
    # further down.  Column 11 becomes 'ltetraffic', column 12 'cellid'.
    dflte[11] = dflte[7].astype(float) + dflte[8].astype(float)
    dflte[11] = dflte[11] / (1024 * 1024)
    dflte[12] = dflte[2] * 256 + dflte[3]

    dfgsm[5] = dfgsm[5] / (1024 * 1024 * 1024)
    dfwcdma[19] = dfwcdma[10] + dfwcdma[11]
    dfwcdma[19] = dfwcdma[19] / (1024 * 1024 * 1024)

    # Split the campus list by the technology named in column 6.
    varssnamelte1 = dfnamelist[dfnamelist[6].str.contains(u'LTE')]
    # The column labels are integers, so label-based .loc selects the same
    # columns the old .ix call did.
    varssnamelte = varssnamelte1.loc[:, [8]]
    varssnamewcdma = dfnamelist[dfnamelist[6].str.contains(u'WCDMA')]
    varssnamegsm = dfnamelist[dfnamelist[6].str.contains(u'GSM')]
    varssnamewcdmagsm1 = varssnamewcdma.append(varssnamegsm, ignore_index=True)
    varssnamewcdmagsm = varssnamewcdmagsm1.loc[:, [7, 8]]
    varssnamewcdmagsm.columns = ['LAC', 'CI']
    varssnamelte.columns = ['cellid']
    varssnamelte = varssnamelte.drop_duplicates(['cellid'])

    dflte.columns = [
        'city', 'changjia', 'ENODB ID', 'id', 'cellid1', 'data', 'time',
        'shang', 'xia', 'PRB', 'rrc', 'ltetraffic', 'cellid'
    ]
    dfgsm.columns = [
        'time', 'nodbname', 'LAC', 'CI', 'huawu', 'gsmtraffic', 'city', 'PRB'
    ]
    dfwcdma.columns = [
        '0', 'city', '2', '3', 'LAC', 'CI', '6', '7', '8', 'huawu', '10', '11',
        '12', '13', '14', '15', '16', '17', 'PRB', 'wcdmatraffic'
    ]

    # Keep only the cells that appear in the campus list.
    dflteschool = merge(dflte, varssnamelte, how='inner', on='cellid')
    dfgsmschool = merge(dfgsm,
                        varssnamewcdmagsm,
                        how='inner',
                        on=['LAC', 'CI'])
    dfwcdmaschool = merge(dfwcdma,
                          varssnamewcdmagsm,
                          how='inner',
                          on=['LAC', 'CI'])

    # Campus cells with no traffic: GSM needs 'huawu' == 0, WCDMA needs both
    # 'huawu' and 'wcdmatraffic' == 0.
    dfgsmschool_zero = dfgsmschool[dfgsmschool['huawu'] == 0]
    dfwcdmaschool_zero = dfwcdmaschool[dfwcdmaschool['huawu'] == 0]
    dfwcdmaschool_zero = dfwcdmaschool_zero[dfwcdmaschool_zero['wcdmatraffic']
                                            == 0]

    # LTE zero-traffic cells: summed traffic of 0 over exactly 24 records.
    pivot_tablelte1_zero = pivot_table(dflte,
                                       index=['cellid1', 'city'],
                                       values=['ltetraffic'],
                                       aggfunc=[np.sum, len])
    dflteschool_zero = pivot_tablelte1_zero[pivot_tablelte1_zero.iloc[:, 0] == 0]
    dflteschool_zero = dflteschool_zero[dflteschool_zero.iloc[:, 1] == 24]
    # Record count / 24 is 1 for every remaining cell, so summing this column
    # per city later counts the cells.
    dflteschool_zero['ltetraffic'] = dflteschool_zero.iloc[:, 1] / 24
    dflteschool_zero.index.names = ['key1', 'city']

    pivot_tablelte = pivot_table(dflte,
                                 index=['city'],
                                 values=['ltetraffic', 'PRB'],
                                 aggfunc=[np.sum, np.mean, len])
    pivot_tablewcdma = pivot_table(dfwcdma,
                                   index=['city'],
                                   values=['wcdmatraffic', 'PRB'],
                                   aggfunc=[np.sum, np.mean, len])
    pivot_tablegsm = pivot_table(dfgsm,
                                 index=['city'],
                                 values=['gsmtraffic', 'PRB'],
                                 aggfunc=[np.sum, np.mean, len])
    pivot_tableschoollte = pivot_table(dflteschool,
                                       index=['city'],
                                       values=['ltetraffic', 'PRB'],
                                       aggfunc=[np.sum, np.mean, len])
    pivot_tableschoolwcdma = pivot_table(dfwcdmaschool,
                                         index=['city'],
                                         values=['wcdmatraffic', 'PRB'],
                                         aggfunc=[np.sum, np.mean, len])
    pivot_tableschoolgsm = pivot_table(dfgsmschool,
                                       index=['city'],
                                       values=['gsmtraffic', 'PRB'],
                                       aggfunc=[np.sum, np.mean, len])

    # One column per technology, rows in the fixed city order; the extra
    # column labelled 4 holds the row total.
    tempcell = DataFrame([
        pivot_tablelte.iloc[:, 1], pivot_tablewcdma.iloc[:, 1],
        pivot_tablegsm.iloc[:, 1]
    ])
    tempcell = DataFrame(tempcell.T, index=cities)
    tempcell[4] = tempcell.iloc[:, 0] + tempcell.iloc[:, 1] + tempcell.iloc[:, 2]

    tempschool = DataFrame([
        pivot_tableschoollte.iloc[:, 1], pivot_tableschoolwcdma.iloc[:, 1],
        pivot_tableschoolgsm.iloc[:, 1]
    ])
    tempschool = DataFrame(tempschool.T, index=cities)
    tempschool[4] = (tempschool.iloc[:, 0] + tempschool.iloc[:, 1] +
                     tempschool.iloc[:, 2])

    tempschoolprb = DataFrame([
        pivot_tableschoollte.iloc[:, 2] / (0.9 * 0.6 * 0.5 * 0.01),
        pivot_tableschoolwcdma.iloc[:, 2], pivot_tableschoolgsm.iloc[:, 2]
    ])
    tempschoolprb = DataFrame(tempschoolprb.T, index=cities)

    tempschool_zero = DataFrame(
        dflteschool_zero['ltetraffic'].sum(level='city'),
        index=cities)
    end = clock()
    print(str((end - start) / 60) + 'mins')
    print(u'完成统计')

    # ---- Phase 3: write the workbook ----------------------------------
    start = clock()
    with ExcelWriter('result_alltest%s.xlsx' % data1) as writer:
        tempcell.to_excel(writer, sheet_name=u'按地市流量汇总', encoding="gbk")
        tempschool.to_excel(writer, sheet_name=u'按校园地市流量汇总', encoding="gbk")
        tempschoolprb.to_excel(writer,
                               sheet_name=u'按校园地市资源利用率汇总',
                               encoding="gbk")
        tempschool_zero.to_excel(writer, sheet_name=u'零话务小区汇总', encoding="gbk")
        pivot_tablelte.to_excel(writer,
                                sheet_name=u'lte按地市透视图',
                                encoding="gbk")
        pivot_tableschoollte.to_excel(writer,
                                      sheet_name=u'lte校园按地市透视图',
                                      encoding="gbk")
        dflteschool.to_excel(writer, sheet_name=u'lte校园按地市清单', encoding="gbk")
        pivot_tablegsm.to_excel(writer,
                                sheet_name=u'gsm按地市透视图',
                                encoding="gbk")
        pivot_tableschoolgsm.to_excel(writer,
                                      sheet_name=u'gsm校园按地市透视图',
                                      encoding="gbk")
        pivot_tablewcdma.to_excel(writer,
                                  sheet_name=u'wcdma按地市透视图',
                                  encoding="gbk")
        pivot_tableschoolwcdma.to_excel(writer,
                                        sheet_name=u'wcdma校园按地市透视图',
                                        encoding="gbk")
        dflteschool_zero.to_excel(writer,
                                  sheet_name=u'lte校园零话务小区清单',
                                  encoding="gbk")
        dfwcdmaschool_zero.to_excel(writer,
                                    sheet_name=u'wcdma校园零话务小区清单',
                                    encoding="gbk")
        dfgsmschool_zero.to_excel(writer,
                                  sheet_name=u'gsm校园零话务小区清单',
                                  encoding="gbk")
    # Take the end time only after the with block has exited.  Previously
    # `end` still held the phase-2 value here, so the printed duration was
    # stale (and negative).
    end = clock()
    print(str((end - start) / 60) + 'mins')
    print(u'完成输出文档')
Example #5
0
data.w  # select column 'w' by attribute access; the result is a Series

data[['w']]  # select column 'w' with a list key; the result is a DataFrame

data[['w', 'z']]  # select columns 'w' and 'z'

data[0:2]  # rows 1 to 2; the slice is half-open, so the end is excluded

data[1:2]  # row 2 only (counting from 0), obtained as a one-row slice;
# data[1] would raise an error instead

data.iloc[1:2]  # a third way to get row 2; returns a DataFrame, same as data[1:2]

data['a':'b']  # slice by index label; returns a DataFrame with BOTH ends
# included, i.e. the end label is part of the result
data.iloc[0]  # first row of data
data.iloc[:, 0]  # first column of data

data.head()  # first rows of data, five by default; use data.head(10) for ten
data.tail()  # last rows of data, five by default; use data.tail(10) for ten

ser.iloc[0]  # first element of ser
ser.iloc[-1]  # last element of ser; when the index itself holds integers, ser[-1] is ambiguous and cannot be used for this

data.iloc[-1]  # last row of the DataFrame, returned as a Series
data.iloc[-1:]  # last row of the DataFrame, returned as a DataFrame

data.loc['a', ['w', 'x']]  # row 'a', columns 'w' and 'x'; for when row and column labels are known

data.iat[1, 1]  # second row, second column; for when row and column positions are known
Example #6
0
    index=[2,0,1]
)
frame
'''
   0  1
2  0  1
0  2  3
1  4  5
'''
# .iloc selects by position regardless of the (shuffled) integer labels;
# it replaces the removed irow()/icol() methods.
frame.iloc[0]
# get row 0 (index==2), get values of all its cols
'''
0    0
1    1
Name: 2, dtype: int64
'''
frame.iloc[2]
# get row 2 (index==1), get vals of all its cols
'''
0    4
1    5
Name: 1, dtype: int64
'''
frame.iloc[:, 1] # in this case, ==frame.iloc[:, -1]
'''
2    1
0    3
1    5
Name: 1, dtype: int64
'''
def main():
    """Walk through basic Series/DataFrame operations, printing each result.

    Sections, in order: reindexing, dropping entries, indexing and slicing,
    arithmetic with label alignment, frame/series broadcasting, apply and
    element-wise mapping, sorting, ranking, and axes with duplicate labels.
    A line holding a single space is printed between sections.

    Returns nothing; all output goes to stdout.
    """
    # reindex
    obj = Series(range(4), index="a b c d".split(" ")[::-1])
    print(obj)

    obj2 = obj.reindex("a b c d e".split(" "))
    print(obj2)

    # Replace the NaN introduced for new labels
    print(obj.reindex("a b c d e".split(" "), fill_value=0))
    colors = ["blue", "purple", "yellow"]
    index = [0, 2, 4]
    obj3 = Series(colors, index=index)
    print(obj3.reindex(range(6)))
    print(obj3.reindex(range(6), method="ffill"))  # forward fill labels not found
    print(obj3.reindex(range(6), method="backfill"))  # bfill

    # DataFrame
    states = ["Ohio", "Texas", "California"]
    frame = DataFrame(np.arange(9).reshape((3, 3)), index="a b c".split(" "), columns=["Ohio", "Texas", "California"])
    print(frame)
    frame2 = frame.reindex("a b c d".split(" "))
    print(frame2)
    states[0] = "Utah"
    # The right-hand side is evaluated first, so this swaps the first two
    # entries: states becomes ["Texas", "Utah", "California"].
    states[1], states[0] = states[:2]
    print(frame.reindex(columns=states))
    # fill: forward-fill along the rows only, then pick the columns, because
    # a fill method cannot be applied to the unsorted column labels.
    print(frame.reindex("a b c d".split(" "), method="ffill").reindex(columns=states))
    # Selecting labels that may be missing is a reindex (formerly .ix).
    print(frame.reindex("a b c d".split(" ")))
    print(frame.reindex(index="a b c d".split(" "), columns=states))

    # Drop an entry
    print(" ")
    obj = Series(range(5), index="a b c d e".split(" "))
    new_obj = obj.drop("c")
    print(new_obj)
    print(obj)

    # Index reference
    print(" ")
    obj = Series(np.arange(4.0), index="a b c d".split(" "))
    print(obj["b"])
    print(obj.iloc[1])  # same element, by position
    print(obj.iloc[2:4])
    print(obj[["b", "a", "c"]])
    print(obj.iloc[[1, 3]])
    print(obj[obj < 2])
    # Slice with label
    print(obj["b":"c"])  # include 'c'
    obj["b":"c"] = 5
    print(obj)

    data = DataFrame(
        np.arange(16).reshape((4, 4)),
        index=["Ohio", "Colorado", "Utah", "New York"],
        columns=["one", "two", "three", "four"],
    )
    print(data)
    # column
    print(data["two"])
    print(data[["three", "one"]])
    # row
    print(data[:2])
    print(data[data["three"] > 5])
    # all values
    print(data < 5)
    data[data < 5] = 0
    print(data)
    # row and column
    print(data.loc[["Colorado"], ["two", "three"]])
    # Row labels combined with column positions: translate the positions
    # into labels so that .loc can take both.
    print(data.loc[["Colorado", "Utah"], data.columns[[3, 0, 1]]])
    # row (by position)
    print(data.iloc[2])
    # label row and column, return column
    print(data.loc[:"Utah", "two"])
    # xs
    # row
    print(data.xs("Utah"))
    print(data.xs("Utah", axis=0))
    # column
    print(data.xs("two", axis=1))
    # positional column / row access (formerly icol/irow)
    print(data.iloc[:, 1])
    print(data.iloc[1])

    # Union
    print(" ")
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
    print(s1)
    print(s2)
    # index is union, but d, f, g are NaN
    print(s1 + s2)
    df1 = DataFrame(np.arange(9.0).reshape((3, 3)), columns=list("bcd"), index=["Ohio", "Texas", "Colorado"])
    df2 = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print(df1)
    print(df2)
    print(df1 + df2)

    # arithmetic method
    print(" ")
    df1 = DataFrame(np.arange(12.0).reshape((3, 4)), columns=list("abcd"))
    df2 = DataFrame(np.arange(20.0).reshape((4, 5)), columns=list("abcde"))
    print(df1)
    print(df2)
    print(df1.add(df2, fill_value=0))
    # reindex has fill_value argument
    # other arithmetic method are sub/div/mul(ti)

    # Calculation in a DataFrame and Series
    print(" ")
    # subtract from each row: broadcast
    arr = np.arange(12.0).reshape((3, 4))
    print(arr)
    print(arr[0])
    print(arr - arr[0])
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    series = frame.iloc[0]
    print(frame)
    print(series)
    print(frame - series)

    series2 = Series(range(3), index=list("bef"))
    print(frame + series2)

    series3 = frame["d"]
    series4 = frame.iloc[0]
    print(frame)
    print(series3)
    print(series4)
    print(frame.sub(series3, axis=0))
    print(frame.sub(series4, axis=1))

    # apply function and mapping
    print(" ")
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print(frame)
    f = lambda x: x.max() - x.min()
    print(frame.apply(f))
    print(frame.apply(f, axis=1))

    f = lambda x: Series([x.min(), x.max()], index=["min", "max"])
    print(frame.apply(f))

    # Named fmt rather than format so the builtin is not shadowed.
    fmt = lambda x: "{0:.2f}".format(x)
    # Element-wise mapping of a frame, column by column (applymap is
    # deprecated).
    print(frame.apply(lambda column: column.map(fmt)))  # frame
    print(frame["e"].map(fmt))  # series

    # sort and rank
    print(" ")
    obj = Series(range(4), index=list("dabc"))
    print(obj)
    print(obj.sort_index())

    frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"], columns=list("dabc"))
    print(frame)
    print(frame.sort_index())
    print(frame.sort_index(axis=1))
    print(frame.sort_index(axis=1, ascending=False))

    # Sorting series by value (formerly Series.order)
    print(" ")
    obj = Series([4, 7, -3, 2])
    print(obj.sort_values())
    obj = Series([4, np.nan, 7, np.nan, -3, 2])
    print(obj.sort_values())
    print(obj.sort_values(ascending=False))

    # order by multi columns (formerly sort_index(by=...))
    print(" ")
    frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
    print(frame.sort_values(by=["a", "b"]))

    # rank
    print(" ")
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    print(obj.rank())  # method is average
    print(obj.rank(method="first"))  # No Duplicates
    print(obj.rank(ascending=False, method="min"))
    print(obj.rank(ascending=False, method="max"))
    f1 = DataFrame(obj, columns=["data"])
    f2 = DataFrame(obj.rank(), columns=["rank"])
    # merge by each index
    print(pd.merge(f1, f2, left_index=True, right_index=True))

    # Index of the axis with duplicate values
    print(" ")
    obj = Series(range(5), index=list("aaabc"))
    print(obj)
    print(obj.index.is_unique)
    print(obj["a"])
    print(obj["c"])

    df = DataFrame(np.arange(12.0).reshape((4, 3)), index=list("aabb"), columns=list("ccd"))
    print(df)
    print(df.loc["b"])
    print(df["c"])
#%%
d[0]          # error
#%%
d['a']        # Series, column
#%%
d[['a','c']]  # DataFrame, columns
#%%
d[:5]         # DataFrame, rows
#%%
d.iloc[:5]    # position-based, rows
#%%
d1.loc[:5]    # label-based, rows
#%%
d.iloc[0]     # Series (first row)
#%%
d.iloc[:, 0]  # Series (first column)
#%%
d.at['e','a']           # at[row_name, col_name]
#%% force access to an element by position
d.iat[0,1]              # iat[irow, icol]

#%% filter with a condition
d[d>5]
#%%
d[d.a>5]
#%%
d[(d>5)&(d%3==0)]

#%% what conditional filtering is built on
d>5       # DataFrame
#%%
class GradeBook(object):
    """Hold the grades of a whole class in a pandas DataFrame.

    The method compute_total_grades computes the total grade of each student
    from per-item weights provided by the caller.
    """

    def __init__(self, grade_arr, student_ids, item_list, max_scores):
        """Store the raw grades and reset the derived attributes.

        Sets the following attributes:

        (1) self.raw_grades, a DataFrame with
                - row labels given by student_ids
                - column labels given by item_list
                - values given by grade_arr

        (2) self.total_grades, set to None

        (3) self.letter_grades, set to None

        (4) self.max_scores, set to max_scores

        Parameters
        ----------
        grade_arr : numpy array of grades as returned by simulate_grades

        student_ids : a list of student ids

        item_list : a list of grade items (e.g. ['HW', 'M', 'F'])

        max_scores : a list of the maximum possible score for each grade item

        Returns
        -------
        nothing

        Examples
        --------
        >>> a = GradeBook(array([[1,2],[3,4]]),['22','34'],['F','M'],[30, 50])
        >>> a.letter_grades is None
        True
        >>> a.total_grades is None
        True
        >>> a.raw_grades.shape == (2,2)
        True
        >>> a.raw_grades.iloc[0,0] == 1
        True
        >>> a.max_scores[0] == 30
        True
        """
        self.raw_grades = DataFrame(grade_arr, index=student_ids, columns=item_list)
        self.total_grades = None
        self.letter_grades = None
        self.max_scores = max_scores

    def compute_total_grades(self, item_weights=None, max_score=100):
        """Compute each student's total grade and summarise the distribution.

        Every column of self.raw_grades is divided by the matching entry of
        self.max_scores, multiplied by its weight, and the weighted fractions
        are summed and scaled to max_score.  The per-student totals are stored
        in the Series attribute self.total_grades.

        Parameters
        ----------
        item_weights : list of floats summing up to one, optional
            Weight applied to each grade item (e.g. [0.3, 0.4, 0.3]), in
            column order.  When omitted, every item gets the same weight.

        max_score : float
            Maximal possible score for the total class grade.

        Returns
        -------
        out : Series
            Numerical summary of the totals stored in self.total_grades, as
            produced by the Series method describe.

        Raises
        ------
        ValueError
            If item_weights or self.max_scores does not have exactly one entry
            per grade item.

        Examples
        --------
        >>> a = GradeBook(array([[5,5],[1,1]]),['22','34'],['F','M'],[10, 10])
        >>> b = a.compute_total_grades([0.5, 0.5], 100)
        >>> len(b) == 8
        True
        >>> a.total_grades['22'] == 50
        True
        >>> a.total_grades['34'] == 10
        True
        """
        n_items = len(self.raw_grades.columns)
        if item_weights is None:
            # No weights given: weigh all grade items equally.
            item_weights = [1.0 / n_items] * n_items if n_items else []
        if len(item_weights) != n_items:
            raise ValueError(
                "expected %d item weights, got %d" % (n_items, len(item_weights)))
        if len(self.max_scores) != n_items:
            raise ValueError(
                "expected %d max scores, got %d" % (n_items, len(self.max_scores)))

        # Per-item factor turning a raw score into its share of the total:
        # weight * max_score / item maximum.  Every item is included (the
        # earlier loop stopped one column short).
        scale = [
            weight * max_score / float(item_max)
            for weight, item_max in zip(item_weights, self.max_scores)
        ]
        self.total_grades = self.raw_grades.mul(scale, axis=1).sum(axis=1)
        return self.total_grades.describe()
Example #10
0
# Targets and test features for the day identified by day_time.
# NOTE(review): train_x is used below and is presumably loaded just above
# this point - confirm.
train_y = pd.read_csv('../train_knn_1/train_y' + day_time + '.csv')
test_x = pd.read_csv('../test_knn_1/test_x' + day_time + '.csv')
test_y = pd.read_csv('../test_knn_1/test_y' + day_time + '.csv')

# Fit the scaler on the training features only, then apply it to both sets.
scaler = preprocessing.MinMaxScaler()

train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

KNN = KNeighborsRegressor(n_neighbors=5,
                          weights='distance',
                          algorithm='auto',
                          p=1)

Bagging_KNN = BaggingRegressor(base_estimator=KNN,
                               n_estimators=100,
                               random_state=1,
                               max_features=0.2)

pre = DataFrame()

# One model per target column (seven in total); predictions are rounded and
# collected column by column in `pre`.
for i in range(7):
    # .iloc[:, i] selects the i-th column by position (formerly icol(i)).
    Bagging_KNN.fit(train_x, list(train_y.iloc[:, i].values))
    pre['col_' + str(i)] = (Bagging_KNN.predict(test_x)).round()
    tmp_score = calculate_score(pre.iloc[:, i].values, test_y.iloc[:, i].values)
    # Same text as the old `print a, b` statement, in a form that runs on
    # Python 2 and Python 3.
    print(str(i) + ':  ' + str(tmp_score))

# Overall score across all predicted columns.
score = calculate_score(pre.values, test_y.values)
print(score)