예제 #1
0
def train(path='train.csv', max_rows=35000):
    """Load booking rows from a CSV file and project their clusters via MCA.

    The first line of the file is read as the comma-separated header. For
    every data row the last field is split off as the row's label, and only
    rows whose ``is_booking`` field equals ``'1'`` are kept. The columns
    ``date_time``, ``srch_ci``, ``srch_co`` and ``cnt`` are dropped, as is
    every column that has an empty value in any kept row.

    The remaining data is handed to the ``CA`` module, which must already be
    in scope at module level (it is not imported here).

    Args:
        path: CSV file to read. Defaults to ``'train.csv'``.
        max_rows: Maximum number of non-blank data rows to read after the
            header; ``None`` reads the whole file. Files with fewer rows are
            read to the end rather than raising ``StopIteration``.

    Returns:
        A tuple ``(cluster_proj, u, feature_order)`` where ``cluster_proj``
        is the result of ``CA.projections``, ``u`` is the third value
        returned by ``CA.MCA`` and ``feature_order`` is the second value
        returned by ``CA.make_CDT``.

    Raises:
        KeyError: If the header lacks ``is_booking`` or one of the dropped
            column names.
        StopIteration: If the file is completely empty (no header line).
    """
    # Function-scope import: the file's top-level import block is not
    # visible from here, so keep this dependency local to the function.
    from itertools import islice

    drop_columns = ('date_time', 'srch_ci', 'srch_co', 'cnt')

    with open(path, 'r') as dataset:
        # Strip the newline so the last header name is usable as a key.
        header = next(dataset).strip('\n').split(',')
        keys = {name: index for index, name in enumerate(header)}

        # Blank lines carry no fields and would break the column lookups
        # below, so they are skipped before the row cap is applied.
        data_lines = (line for line in dataset if line.strip())
        rows = [
            line.strip('\n').replace(' ', '').split(',')
            for line in islice(data_lines, max_rows)
        ]

    # Split the label (last field) off every row and keep only the data
    # points where a room was actually booked.
    booking_col = keys['is_booking']
    features = []
    labels = []
    for row in rows:
        label = row.pop()
        if row[booking_col] == '1':
            features.append(row)
            labels.append(label)

    # Transpose to columns. zip() is materialised so the result can be
    # enumerated and measured on Python 3 as well as Python 2.
    columns = list(zip(*features))

    excluded = {keys[name] for name in drop_columns}
    # Deal with nulls later: for now drop every column that has a missing
    # value. Indexing by position (not by value lookup) makes sure identical
    # duplicate columns are all removed.
    excluded.update(
        index for index, column in enumerate(columns) if '' in column
    )

    kept_columns = [
        column for index, column in enumerate(columns)
        if index not in excluded
    ]
    # Transpose back to one tuple per row.
    features = list(zip(*kept_columns))

    # Make CDT
    CDT, feature_order, cluster_order = CA.make_CDT(features, labels)

    # Run MCA
    u_p, lambda_s, u, chi_sq, psi_sq = CA.MCA(CDT)

    # Project clusters onto principal axes
    cluster_proj = CA.projections(CDT, u)

    return cluster_proj, u, feature_order