def getCollum(feaName):
    """Return the value of feature ``feaName`` from every training example.

    Each line of ``trainProcessed/idexamples_train.txt`` is rewritten into a
    JSON object by inserting a double quote after every ',' and before every
    ':' and wrapping the result in ``{"`` ... ``}``; the resulting dict is
    then indexed by ``feaName``.

    Raises KeyError when a line has no entry named ``feaName``.
    """
    column = []
    for row in readFromFile.readLines('trainProcessed/idexamples_train.txt'):
        # name:value,name:value  ->  {"name":value,"name":value}
        quoted = row.replace(',', ',"').replace(':', '":')
        record = json.loads('{"' + quoted + '}')
        column.append(record[feaName])
    return column
def loadToArray(filePath):
    """Load a ``stuId,name:value,name:value,...`` file into a numpy array.

    Args:
        filePath: path handed to ``readFromFile.readLines``.

    Returns:
        A tuple ``(X_np, feaNames, X_stuId)`` where ``X_np`` is a numpy array
        with one row of float values per line, ``feaNames`` are the feature
        names taken from the first line, and ``X_stuId`` is the list of
        integer student ids in file order.  An empty file yields an empty
        array and two empty lists instead of raising IndexError.
    """
    lines = readFromFile.readLines(filePath)
    X_stuId = []
    X = []
    for line in lines:
        fields = line.split(',')
        X_stuId.append(int(fields[0]))
        # Every remaining field is "name:value"; keep the numeric part only.
        X.append([float(field.split(':')[1]) for field in fields[1:]])
    # Feature names are read from the first line only.
    if lines:
        feaNames = [field.split(':')[0] for field in lines[0].split(',')[1:]]
    else:
        feaNames = []
    return np.array(X), feaNames, X_stuId
def invertForm(filePath):
    """Group the records of a form by student id and write them to a file.

    Every input line is ``stuId,rest``.  All ``rest`` parts that share a
    student id are joined with '$' (in input order) and written as
    ``stuId$rest$rest...`` - one line per student - to
    ``../studentForm/test/<input base name>_invert.txt``.

    Args:
        filePath: path of the form to invert, handed to
            ``readFromFile.readLines``.

    Blank lines are skipped.  Students are written in dict iteration order,
    as before.
    """
    students = {}
    for each in readFromFile.readLines(filePath):
        stripped = each.strip()
        if not stripped:
            continue
        l = stripped.split(',', 1)
        stuId = int(l[0])
        students.setdefault(stuId, []).append(l[1])

    baseName = filePath.split('/')[-1].split('.')[0]
    # items() instead of iteritems() keeps this working on Python 2 and 3.
    ret = ''.join(
        str(key) + "$" + "$".join(records) + '\n'
        for key, records in students.items())
    # The context manager closes the file even when the write fails.
    with open("../studentForm/test/" + baseName + "_invert.txt", 'w') as fw:
        fw.write(ret)
def addLabel(students):
    """Append a ``label`` column to the file named by the global ``originalFile``.

    The first line of the file gets ``,label`` appended.  Every following
    line gets the ``'subsidy'`` value of the student whose ``'stuId'`` equals
    the line's first comma-separated field, or ``null`` when no such student
    is found.  Both ``students`` and the file rows are walked front to back
    in a single pass, so both are expected to be ordered by student id.

    Args:
        students: sequence of dicts with ``'stuId'`` and ``'subsidy'`` keys;
            ``'subsidy'`` must be a string because it is concatenated as-is.
    """
    rows = readFromFile.readLines(originalFile)
    pieces = [rows[0] + ',label\n']
    stuCount = len(students)
    stuPos = 0
    for row in rows[1:]:
        label = 'null'
        # The row id is only parsed while there are students left to match.
        if stuPos < stuCount:
            rowId = int(row.split(',')[0])
            # Drop students whose id is smaller than this row's id.
            while stuPos < stuCount and int(students[stuPos]['stuId']) < rowId:
                stuPos += 1
            if stuPos < stuCount and int(students[stuPos]['stuId']) == rowId:
                label = students[stuPos]['subsidy']
                stuPos += 1
        pieces.append(row + ',' + label + '\n')
    with open(originalFile, 'w') as fw:
        fw.write(''.join(pieces))
def distinctIds():
    """Return the sorted distinct student ids found in the six training forms.

    The id is the first comma-separated field of every line.  Lines whose
    first field is empty or whitespace only (blank lines, including ones
    ending in '\\r\\n') are skipped.  A short summary is printed.

    Returns:
        Sorted list of unique integer ids.
    """
    trainFiles = (
        '../data/train/borrow_train.txt',
        '../data/train/card_train.txt',
        '../data/train/dorm_train.txt',
        '../data/train/library_train.txt',
        '../data/train/score_train.txt',
        '../data/train/subsidy_train.txt',
    )
    # Read everything first so a missing file fails before any parsing.
    files = [readFromFile.readLines(path) for path in trainFiles]

    ids = set()
    for each in files:
        for line in each:
            first = line.split(',')[0].strip()
            if first:
                ids.add(int(first))
    uniqueIds = sorted(ids)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Get " + str(len(uniqueIds)) + " distinct ids in total.")
    print("The first 10 of them is:  " + str(uniqueIds[:10]))
    print("The last 10 of them is:  " + str(uniqueIds[-10:]))
    return uniqueIds
import json
import readFromFile
import sys

students = ""

# Choose the inverted score form from the first command-line argument.
# The slice comparison also covers a missing argument, and an unknown or
# missing mode now stops the script instead of falling through to code
# that uses the undefined name ``lines``.
if sys.argv[1:2] == ['train']:
    lines = readFromFile.readLines(
        '../studentForm/train/score_train_invert.txt')
elif sys.argv[1:2] == ['test']:
    lines = readFromFile.readLines('../studentForm/test/score_test_invert.txt')
else:
    print('Invalid arguments')
    sys.exit(1)

print(lines[0])

count = 0

# Largest rank seen per faculty.  Each line is '$'-separated; the first field
# after the leading one is read as "faculty,rank,...".
maxRankOfFaculties = {}
for line in lines:
    records = line.split('$')
    twoNumber = records[1].split(',')
    faculty = int(twoNumber[0])
    rank = int(twoNumber[1])
    maxRankOfFaculties[faculty] = max(
        rank, maxRankOfFaculties.get(faculty, rank))

print(maxRankOfFaculties)

linesTest = readFromFile.readLines('../studentForm/test/score_test_invert.txt')
for line in linesTest:
    records = line.split('$')
import json
import operator  # for sorting a dictionary by value
from sets import Set
from datetime import date, time, datetime
import readFromFile
import sys

timeCritera = [
    '00_00', '01_00', '03_00', '06_00', '09_00', '11_20', '12_50', '16_50',
    '19_00', '22_00', '24_00'
]  # '24:00' does not exist, but is useful as the last element.

# Choose the inverted dorm form from the first command-line argument.
# The slice comparison also covers a missing argument, and an unknown or
# missing mode now stops the script instead of falling through to code
# that uses the undefined name ``lines``.
if sys.argv[1:2] == ['train']:
    lines = readFromFile.readLines(
        '../studentForm/train/dorm_train_invert.txt')
elif sys.argv[1:2] == ['test']:
    lines = readFromFile.readLines('../studentForm/test/dorm_test_invert.txt')
else:
    print("Invalid arguments")
    sys.exit(1)

print(lines[0])

students_earest = {}
students_lastest = {}

# see rank:
for line in lines:
    # Split one student's records apart.
    records = line.split('$')
    # stuId = int(records[0])
    # If there exist records for this student
    if len(records) > 1:
        earest = '24:00:00'
def readFeaturesFromFile(filePath):
    """Parse one JSON object per line and return them ordered by student id.

    Args:
        filePath: path handed to ``readFromFile.readLines``; every line must
            be a JSON object containing a ``'stuId'`` entry.

    Returns:
        List of the decoded objects, sorted ascending by ``int(obj['stuId'])``.
    """
    records = []
    for row in readFromFile.readLines(filePath):
        records.append(json.loads(row))
    # list.sort is stable, just like sorted(), so ties keep file order.
    records.sort(key=lambda record: int(record['stuId']))
    return records
def _columnStatistics(students, feaNames):
    """Return ``{statisticName: {feaName: value}}`` computed over all students."""
    staDict = dict(
        (name, {})
        for name in ('-1', '0', 'mean', 'median', 'mostFre', 'min', 'max'))
    for key in feaNames:
        oneCollum = [x[key] for x in students]
        staDict['-1'][key] = -1
        staDict['0'][key] = 0
        staDict['mean'][key] = np.mean(oneCollum)
        staDict['median'][key] = np.median(oneCollum)
        # stats.mode returns length-1 arrays in older SciPy releases and
        # scalars in newer ones; atleast_1d accepts both.
        staDict['mostFre'][key] = np.atleast_1d(stats.mode(oneCollum)[0])[0]
        staDict['min'][key] = min(oneCollum)
        staDict['max'][key] = max(oneCollum)
    return staDict


def _fillStrategies(fileName, feaNames):
    """Return the ``'<file>-<feature>' -> statistic name`` fill table."""
    featuresFill = {
        'BorrowProcessed.txt-ifBorrowed': '0',
        'BorrowProcessed.txt-numInCate': '0',
        'BorrowProcessed.txt-numOfBorrowed': '0',
        'BorrowProcessed.txt-numOfCateBorrowed': '0',
        'BorrowProcessed.txt-ratioOfBorrowedMonths': '0',
        'BorrowProcessed.txt-timesOfKaoyan': '0',
        'BorrowProcessed.txt-timesOfProg': '0',
        'BorrowProcessed.txt-timesOfTOEFL': '0',
        'DormProcessed.txt-enterExit': 'mean',
        'DormProcessed.txt-maxMonthDensity': 'mean',
        'DormProcessed.txt-timesPerDay': 'mean',
        'DormProcessed.txt-totalDays': 'mean',
        'DormProcessed.txt-weekendTimes': 'mean',
        'LibraryProcessed.txt-eveningTimes': 'mean',
        'LibraryProcessed.txt-timesPerDay': 'mean',
        'LibraryProcessed.txt-totalDays': 'mean',
        'LibraryProcessed.txt-totalRecordTimes': 'mean',
        'LibraryProcessed.txt-weekendTimes': 'mean',
        'ScoreProcessed.txt-faculty': '0',
        'ScoreProcessed.txt-rankPercent': '-1',
    }
    # For the known files every feature actually present is filled with 0,
    # overriding the defaults above ...
    if fileName in ('CardProcessed.txt', 'LibraryProcessed.txt',
                    'ScoreProcessed.txt', 'DormProcessed.txt',
                    'BorrowProcessed.txt'):
        for key in feaNames:
            featuresFill[fileName + '-' + key] = '0'
    # ... except the score rank, which is marked with -1.
    if fileName == 'ScoreProcessed.txt':
        featuresFill['ScoreProcessed.txt-rankPercent'] = '-1'
    return featuresFill


def _fillTableKey(fileName, key):
    """Return the fill-table key for ``key``; grouped features share one entry."""
    if key[:-1] == 'numInCate':
        return fileName + '-numInCate'
    if key[5:] == 'enter' or key[5:] == 'exit':
        return fileName + '-enterExit'
    return fileName + '-' + key


def addFeatures(students, fileName, normalization=True):
    """Append the features in ``students`` to every row of ``originalFile``.

    The file named by the global ``originalFile`` is read, each row gets
    ``,<feature>:<value>`` appended for every feature (in sorted name order),
    and the file is rewritten.  Rows and ``students`` are merged in a single
    forward pass on the student id (first comma-separated field of a row,
    ``'stuId'`` of a student), so both must be sorted by that id.  A row with
    no matching student receives fill values chosen by the per-file fill
    table.  One ``fea<N> <fileName> <feature>`` line per feature is appended
    to ``sys.argv[4] + 'Processed/collumInfo.txt'``.

    Args:
        students: non-empty sequence of dicts sharing the same keys, one of
            which is ``'stuId'``; the other values must be numeric.
        fileName: name of the processed feature file the students came from;
            it selects the fill strategies.
        normalization: when true, values are written as
            ``(value - mean) / (max - min)``, except for ``ifBorrowed`` and
            ``faculty<c>`` features which are always written raw.

    Raises:
        KeyError: a student is missing and the fill table has no entry for
            ``fileName`` and one of the features.

    Changes from the previous implementation:
      * the fill table is looked up with ``fileName`` (it used the undefined
        name ``filePath`` although the table is keyed by ``fileName``);
      * rows after the last student are filled with the same rule as rows
        between students (the tail loop used to skip normalisation for
        numInCate/enter/exit and apply it to ifBorrowed/faculty);
      * all rows are computed before ``originalFile`` is reopened, and the
        file is closed by a context manager, so a failure no longer leaves
        a truncated file behind.
    """
    lines = readFromFile.readLines(originalFile)
    # Number of feature columns already present (first field is the id).
    existsColNum = len(lines[0].split(',')) - 1
    # Fix one feature order so every row lists features identically.
    sortedFeaNames = sorted(students[0])
    sortedFeaNames.remove('stuId')
    print(sortedFeaNames)

    with open(sys.argv[4] + 'Processed/collumInfo.txt', 'a') as fw:
        fw.write(''.join(
            'fea' + str(idx + existsColNum) + ' ' + fileName + ' ' + name + '\n'
            for idx, name in enumerate(sortedFeaNames)))

    staDict = _columnStatistics(students, sortedFeaNames)
    featuresFill = _fillStrategies(fileName, sortedFeaNames)
    mean = staDict['mean']
    low = staDict['min']
    high = staDict['max']

    def render(key, value):
        # Format one ",key:value" entry, normalising unless exempt.
        keepRaw = (not normalization or key == 'ifBorrowed'
                   or key[:-1] == 'faculty')
        if not keepRaw:
            # NOTE(review): a constant column makes max - min zero, which
            # yields nan/inf here - confirm whether that needs handling.
            value = (value - mean[key]) / float(high[key] - low[key])
        return ',' + key + ':' + str(value)

    def presentSuffix(student):
        # Feature entries for a student that has records.
        return ''.join(render(key, student[key]) for key in sortedFeaNames)

    def missingSuffix():
        # Feature entries for a student without records (fill values).
        return ''.join(
            render(key,
                   staDict[featuresFill[_fillTableKey(fileName, key)]][key])
            for key in sortedFeaNames)

    # Join the two forms (both are sorted by their 'stuId').
    out = []
    i = 0
    j = 0
    while i < len(students) and j < len(lines):
        print(lines[j])
        stuId = int(lines[j].split(',')[0])
        print(str(i) + ' ' + str(stuId))
        current = int(students[i]['stuId'])
        if current == stuId:
            out.append(lines[j] + presentSuffix(students[i]) + '\n')
            i += 1
            j += 1
        elif current < stuId:
            # Student does not appear in the file: skip it.
            i += 1
        else:
            out.append(lines[j] + missingSuffix() + '\n')
            j += 1
    # Rows left after the last student have no features at all.
    while j < len(lines):
        print(lines[j])
        out.append(lines[j] + missingSuffix() + '\n')
        j += 1

    with open(originalFile, 'w') as fw:
        fw.write(''.join(out))
def getCollumNames():
    """Return the last space-separated token of every line in
    ``trainProcessed/collumInfo.txt``."""
    rows = readFromFile.readLines('trainProcessed/collumInfo.txt')
    # Splitting once from the right yields the same final token as a full split.
    return [row.rsplit(' ', 1)[-1] for row in rows]