Example #1
0
def _read_join_table(file_from, key_word):
    """Read file_from and return (header, lookup).

    header is the list of column names from the first line.  lookup maps
    each row's key_word value to the remaining fields of that row joined
    with ','.  When a key occurs more than once the last row wins.
    """
    with open(file_from, 'r') as fp_from:
        header = fp_from.readline().strip().split(',')
        key_index = header.index(key_word)
        lookup = {}
        for line in fp_from:
            row = line.strip()
            if not row:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            fields = row.split(',')
            rest = fields[:key_index] + fields[key_index + 1:]
            lookup[fields[key_index]] = ','.join(rest)
    return header, lookup


def _join_one(path_to, key_word, header_from, lookup):
    """Write '<path_to without extension>-joined.csv' for a single file."""
    with open(path_to, 'r') as fp_to:
        header_to = fp_to.readline().strip().split(',')
        # Resolved before the output file is created, so a missing key
        # column does not leave an empty '-joined.csv' behind.
        key_index = header_to.index(key_word)

        # Build a new list instead of appending to header_to in place.
        header_new = header_to + [w for w in header_from if w != key_word]

        # splitext leaves extension-less paths intact, unlike slicing at
        # rfind('.'), which drops the last character when there is no dot.
        path_new = os.path.splitext(path_to)[0] + '-joined.csv'
        with open(path_new, 'w') as fp_write:
            fp_write.write(','.join(header_new) + '\n')
            for line in fp_to:
                row = line.strip()
                if not row:
                    continue
                key = row.split(',')[key_index]
                fp_write.write(row + ',' + lookup[key] + '\n')


def file_join(file_from, file_to, key_word):
    """Append the columns of file_from to file_to, matching rows on key_word.

    Both inputs are comma-separated text files whose first line is a header
    containing key_word.  For every target file a sibling file named
    '<name>-joined.csv' is written: its header is the target header followed
    by the non-key columns of file_from, and each data row is the target row
    followed by the matching file_from fields.

    file_from -- path of the file providing the extra columns.
    file_to   -- a file path, a directory (expanded with
                 my_filter.walk_dir(file_to, 'csv')), or a list of paths.
    key_word  -- name of the column to join on.

    Raises ValueError if key_word is missing from a header and KeyError if a
    target row's key has no counterpart in file_from.  Returns None.
    """
    # Check for a list first: os.path.isdir() rejects list arguments.
    if not isinstance(file_to, list) and os.path.isdir(file_to):
        file_to = my_filter.walk_dir(file_to, 'csv')
    targets = file_to if isinstance(file_to, list) else [file_to]

    header_from, lookup = _read_join_table(file_from, key_word)
    for path_to in targets:
        _join_one(path_to, key_word, header_from, lookup)
Example #2
0
def _predict_temp(hour, temp_low, temp_high):
    """Estimate the temperature at `hour` from the day's low and high.

    Piecewise-linear: the value equals temp_low at hour 6, rises to
    temp_high at hour 14, then falls over the following 16 hours (hours
    0-5 are treated as the tail of that fall).
    """
    spread = temp_high - temp_low
    if hour < 6:
        return temp_high - (hour + 24 - 14) / 16.0 * spread
    if hour < 14:
        return temp_low + (hour - 6) / 8.0 * spread
    return temp_high - (hour - 14) / 16.0 * spread


def analyse_data(file_path_read, task_type=1):
    """Expand raw '<date><hh>[,num]' records into feature rows.

    For every input file a sibling '<name>-analysed.csv' is written with the
    columns date, what_day, hour, holiday, weekend, weather, temp_high,
    temp_low, temp_mean, temp_predict and, for training data, num.

    file_path_read -- a file path, or a directory expanded with
                      my_filter.walk_dir(file_path_read, 'txt').
    task_type      -- 1 for training data (second field is copied to 'num'),
                      2 for test data.

    Returns -1 (after printing a message) when task_type is neither 1 nor 2,
    otherwise None.  Weather values come from './weather-analysed.csv',
    loaded through data_dict and looked up by date.
    """
    # Validate before touching the weather file: a bad task_type must not
    # cost (or fail on) a file load.
    if task_type not in (1, 2):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('wrong task_type, please input 1 for train, 2 for test.\n')
        return -1

    labels = 'date,what_day,hour,holiday,weekend,weather,temp_high,temp_low,temp_mean,temp_predict'
    if task_type == 1:
        labels += ',num'
    label_list = labels.split(',')  # hoisted: identical for every row

    weather_dict = data_dict('./weather-analysed.csv', 'date')

    if os.path.isdir(file_path_read):
        paths = my_filter.walk_dir(file_path_read, 'txt')
    else:
        paths = [file_path_read]

    for path_read in paths:
        # splitext keeps extension-less paths intact (slicing at rfind('.')
        # would drop their last character).
        file_path_write = os.path.splitext(path_read)[0] + '-analysed.csv'

        # 'with' closes both files even when a row raises.
        with open(path_read, 'r') as fp_read, \
                open(file_path_write, 'w') as fp_write:
            fp_write.write(labels + '\n')
            for line in fp_read:
                record = line.strip()
                if not record:
                    # Blank lines carry no record.
                    continue
                fields = record.split(',')

                # First field is the date with a two-character hour suffix.
                date_hour = fields[0]
                date = date_hour[:-2]
                hour = date_hour[-2:]

                row = {'date': date, 'hour': hour}
                if task_type == 1:
                    row['num'] = fields[1]
                row['what_day'] = my_filter.what_day(date)

                is_holiday = date in holiday_date
                row['holiday'] = '1' if is_holiday else '0'
                # Weekend: day 6 or later unless listed in work_date, or
                # any date listed in holiday_date.
                day_off = int(row['what_day']) >= 6 and date not in work_date
                row['weekend'] = '1' if (day_off or is_holiday) else '0'

                weather = weather_dict[date]
                for name in ('weather', 'temp_high', 'temp_low', 'temp_mean'):
                    row[name] = weather[name]

                row['temp_predict'] = str(_predict_temp(
                    int(hour), float(row['temp_low']), float(row['temp_high'])))

                fp_write.write(','.join(row[l] for l in label_list) + '\n')


if __name__ == '__main__':
    # Script entry point: interactively asks for an input path, a line
    # number and an algorithm choice, then runs the chosen predictor.
    # `prompt`, `algorithm`, `my_linear_regression` and `my_random_forests`
    # are not defined in this part of the file.
    file_path_read = raw_input(prompt['file_path_read'])
    # In the visible code line_num is only used to build the output file name.
    # int() raises ValueError on a non-numeric answer.
    line_num = int(raw_input('what the line number: '))
    algorithm_type = int(raw_input('what kind of algorithm you want to use:\
                                \n1: linear regression\
                                \n2: random forest\n'))
    # Test data always comes from this hard-coded CSV path.
    df_test = pd.read_csv('./date-7-analysed.csv')
    # X_test = df_test[feature_cols]
    # X_test = preprocessing.normalize(X_test)
    # print X_test.head()

    # Normalise the input to a list of paths: a directory is expanded via
    # my_filter.walk_dir(..., 'csv'), a single path is wrapped in a list.
    if os.path.isdir(file_path_read):
        file_path_read = my_filter.walk_dir(file_path_read, 'csv')
    else:
        file_path_read = [file_path_read]

    # Re-prompt until the answer is a member of `algorithm`.
    while(algorithm_type not in algorithm):
        print '\n\nwrong algorithm type!\n'
        algorithm_type = int(raw_input('what kind of algorithm you want to use:\
                                \n1: linear regression\
                                \n2: random forests\n'))

    # 1 selects linear regression; every other accepted value falls through
    # to random forests.
    # NOTE(review): `result` and `file_path_write` are not used in the
    # visible code -- presumably the result is written to that file further
    # down; confirm.
    if algorithm_type == 1:
        result = my_linear_regression(file_path_read, df_test)
        file_path_write = 'predict' + '-linear' + '-line' + str(line_num) + '.txt'
    else:
        result = my_random_forests(file_path_read, df_test)
        file_path_write = 'predict' + '-randomforest' + '-line' + str(line_num) + '.txt'