Exemple #1
0
def getCollum(feaName):
    """Collect one feature's value from every training example.

    Each line of 'trainProcessed/idexamples_train.txt' is expected to look
    like ``name:value,name:value,...``. The line is turned into a JSON
    object by inserting a double quote after every comma and before every
    colon and wrapping the result in ``{" ... }``, so ``a:1,b:2`` is parsed
    as ``{"a":1,"b":2}``.

    Args:
        feaName: name of the feature (JSON key) to extract.

    Returns:
        A list holding the value of ``feaName`` for each line, in file order.

    Raises:
        KeyError: if a line has no entry called ``feaName``.
        ValueError: if a rewritten line is not valid JSON.
    """
    column = []
    for raw in readFromFile.readLines('trainProcessed/idexamples_train.txt'):
        # Quote the keys so the line becomes the inside of a JSON object.
        quoted = raw.replace(',', ',"').replace(':', '":')
        record = json.loads('{"' + quoted + '}')
        column.append(record[feaName])
    return column
Exemple #2
0
def loadToArray(filePath):
    """Load a ``stuId,name:value,name:value,...`` file into a numeric array.

    Args:
        filePath: path handed to ``readFromFile.readLines``.

    Returns:
        A tuple ``(X_np, feaNames, X_stuId)`` where ``X_np`` is a NumPy array
        built from the float values of every line, ``feaNames`` are the
        feature names taken from the first line only, and ``X_stuId`` is the
        list of integer student ids in file order.

    Raises:
        IndexError: if the file yields no lines.
    """
    rows = readFromFile.readLines(filePath)
    ids = []
    matrix = []
    for row in rows:
        fields = row.split(',')
        # The first field is the bare student id; the rest are name:value.
        ids.append(int(fields[0]))
        matrix.append([float(field.split(':')[1]) for field in fields[1:]])
    featureArray = np.array(matrix)
    # Column names are read from the first line; the other lines are
    # presumably in the same column order (not checked here).
    headerFields = rows[0].split(',')[1:]
    names = [field.split(':')[0] for field in headerFields]
    return featureArray, names, ids
Exemple #3
0
def invertForm(filePath, outDir="../studentForm/test/"):
    """Group the records of a file by student and write one line per student.

    Every input line is stripped and split at its first comma into a student
    id and the remainder of the record. Records that share an id are joined
    with "$" in input order, giving output lines of the form
    ``<stuId>$<record>$<record>...``.

    The result is written to ``<outDir><name>_invert.txt`` where ``<name>``
    is the last path component of ``filePath`` up to its first dot.

    Args:
        filePath: path of the input file, handed to
            ``readFromFile.readLines``.
        outDir: directory prefix (including the trailing slash) for the
            output file. Defaults to the location that used to be
            hard-coded.

    Raises:
        ValueError: if the text before the first comma is not an integer.
        IndexError: if a line contains no comma.
    """
    students = {}
    for each in readFromFile.readLines(filePath):
        l = each.strip().split(',', 1)
        stuId = int(l[0])
        students.setdefault(stuId, []).append(l[1])

    # Build every output line before touching the file so a parsing error
    # cannot leave a half-written result behind.
    outLines = [
        str(key) + "$" + "$".join(records) + '\n'
        for key, records in students.items()
    ]

    outName = filePath.split('/')[-1].split('.')[0] + "_invert.txt"
    # The context manager closes the file even when the write fails.
    with open(outDir + outName, 'w') as fw:
        fw.writelines(outLines)
Exemple #4
0
def addLabel(students):
    """Append a ``label`` column to the file named by ``originalFile``.

    The first line of the file gets ``,label`` appended. Every later line
    gets the ``'subsidy'`` value of the student whose ``'stuId'`` equals the
    line's leading integer, or ``null`` when no such student is found.

    Lines and students are walked together in a single pass, so matches are
    only found if both are in ascending id order (presumably guaranteed by
    the caller; it is not checked here).

    Args:
        students: sequence of dict-like records with ``'stuId'`` and a
            string ``'subsidy'`` entry.

    The file is read in full first and then overwritten with the result.
    """
    rows = readFromFile.readLines(originalFile)
    pieces = [rows[0] + ',label\n']
    studentPos = 0
    rowPos = 1
    studentCount = len(students)
    rowCount = len(rows)
    while studentPos < studentCount and rowPos < rowCount:
        rowId = int(rows[rowPos].split(',')[0])
        candidateId = int(students[studentPos]['stuId'])
        if candidateId < rowId:
            # This student has no line in the file; move on to the next one.
            studentPos += 1
            continue
        if candidateId == rowId:
            pieces.append(
                rows[rowPos] + ',' + students[studentPos]['subsidy'] + '\n')
            studentPos += 1
        else:
            pieces.append(rows[rowPos] + ',' + 'null' + '\n')
        rowPos += 1
    # Students ran out: every remaining line is unlabeled.
    pieces.extend(row + ',null\n' for row in rows[rowPos:])
    with open(originalFile, 'w') as fw:
        fw.write(''.join(pieces))
Exemple #5
0
def distinctIds():
    """Return the sorted distinct student ids found in the training files.

    The id is the text before the first comma on each line of the six
    ``../data/train/*_train.txt`` files. Lines whose first field is empty
    or only whitespace are skipped. A short summary is printed.

    Returns:
        A list of the distinct integer ids in ascending order.

    Raises:
        ValueError: if a non-blank first field is not an integer.
    """
    paths = (
        '../data/train/borrow_train.txt',
        '../data/train/card_train.txt',
        '../data/train/dorm_train.txt',
        '../data/train/library_train.txt',
        '../data/train/score_train.txt',
        '../data/train/subsidy_train.txt',
    )

    # A set removes duplicates as ids arrive, so no full list is kept.
    seen = set()
    for path in paths:
        for line in readFromFile.readLines(path):
            first = line.split(',', 1)[0].strip()
            if first:
                seen.add(int(first))

    uniqueIds = sorted(seen)

    # Single-string print calls behave the same on Python 2 and Python 3.
    print("Get " + str(len(uniqueIds)) + " distinct ids in total.")
    print("The first 10 of them is:  " + str(uniqueIds[:10]))
    print("The last 10 of them is:  " + str(uniqueIds[-10:]))
    return uniqueIds
Exemple #6
0
import json
import readFromFile
import sys

# Script: find the largest rank recorded for each faculty in an inverted
# score form. Lines are expected to look like
# ``<stuId>$<faculty>,<rank>$...``; only the first record after the student
# id is inspected.
students = ""
# The data set ('train' or 'test') is chosen by the first command line
# argument; a missing argument is treated like an invalid one.
dataSet = sys.argv[1] if len(sys.argv) > 1 else None
if dataSet == 'train':
    lines = readFromFile.readLines(
        '../studentForm/train/score_train_invert.txt')
elif dataSet == 'test':
    lines = readFromFile.readLines('../studentForm/test/score_test_invert.txt')
else:
    print('Invalid arguments')
    # Stop here: `lines` is unbound on this path, so carrying on would only
    # crash with a NameError on the next statement.
    sys.exit(1)

print(lines[0])
count = 0
# faculty id -> largest rank seen so far
maxRankOfFaculties = {}
for line in lines:
    records = line.split('$')
    twoNumber = records[1].split(',')
    faculty = int(twoNumber[0])
    rank = int(twoNumber[1])
    maxRankOfFaculties[faculty] = max(
        rank, maxRankOfFaculties.get(faculty, rank))
print(maxRankOfFaculties)

# NOTE(review): the loop below only splits each line; the code that used
# `records` appears to be missing from this excerpt.
linesTest = readFromFile.readLines('../studentForm/test/score_test_invert.txt')
for line in linesTest:
    records = line.split('$')
Exemple #7
0
import json
import operator  # for sort diction by value
from sets import Set
from datetime import date, time, datetime
import readFromFile
import sys

# Clock times written as 'HH_MM'. NOTE(review): presumably the boundaries
# of time-of-day buckets; the code that uses them is not in this excerpt.
timeCritera = [
    '00_00', '01_00', '03_00', '06_00', '09_00', '11_20', '12_50', '16_50',
    '19_00', '22_00', '24_00'
]  # '24:00' does not exist, but is useful as the last element.

# The data set ('train' or 'test') is chosen by the first command line
# argument; a missing argument is treated like an invalid one.
dataSet = sys.argv[1] if len(sys.argv) > 1 else None
if dataSet == 'train':
    lines = readFromFile.readLines(
        '../studentForm/train/dorm_train_invert.txt')
elif dataSet == 'test':
    lines = readFromFile.readLines('../studentForm/test/dorm_test_invert.txt')
else:
    print("Invalid arguments")
    # Stop here: `lines` is unbound on this path, so carrying on would only
    # crash with a NameError on the next statement.
    sys.exit(1)
print(lines[0])

# NOTE(review): presumably keyed by student and holding the earliest and
# latest time found; they are not filled within this excerpt.
students_earest = {}
students_lastest = {}
for line in lines:
    # Split one student's records apart.
    records = line.split('$')
    # stuId  = int(records[0])
    # Only students that have at least one record after the id.
    if len(records) > 1:
        # Start from the latest possible time. NOTE(review): presumably
        # lowered while scanning the records; the rest of the loop is not
        # part of this excerpt.
        earest = '24:00:00'
Exemple #8
0
def readFeaturesFromFile(filePath):
    """Load one JSON record per line and return them ordered by student id.

    Args:
        filePath: path handed to ``readFromFile.readLines``; every line must
            be a JSON object containing a ``'stuId'`` entry.

    Returns:
        A list of the decoded records sorted by ``int(record['stuId'])`` in
        ascending order.
    """
    records = []
    for raw in readFromFile.readLines(filePath):
        records.append(json.loads(raw))
    # Compare ids numerically; the stored value may be a string.
    records.sort(key=lambda record: int(record['stuId']))
    return records
Exemple #9
0
def _columnStatistics(students, feaNames):
    """Compute, per feature, every statistic that can replace a missing value.

    Returns a dict mapping a strategy name ('-1', '0', 'mean', 'median',
    'mostFre', 'min', 'max') to a ``{feature: value}`` dict.
    """
    staDict = dict(
        (name, {})
        for name in ('-1', '0', 'mean', 'median', 'mostFre', 'min', 'max'))
    for key in feaNames:
        oneCollum = [x[key] for x in students]
        staDict['-1'][key] = -1
        staDict['0'][key] = 0
        staDict['mean'][key] = np.mean(oneCollum)
        staDict['median'][key] = np.median(oneCollum)
        # Depending on the SciPy version the mode is returned as a scalar or
        # as a one-element array; atleast_1d accepts both.
        staDict['mostFre'][key] = np.atleast_1d(stats.mode(oneCollum)[0])[0]
        staDict['min'][key] = min(oneCollum)
        staDict['max'][key] = max(oneCollum)
    return staDict


def _fillStrategyTable(fileName, feaNames):
    """Map '<file>-<feature>' keys to the name of the fill strategy to use."""
    table = {
        'BorrowProcessed.txt-ifBorrowed': '0',
        'BorrowProcessed.txt-numInCate': '0',
        'BorrowProcessed.txt-numOfBorrowed': '0',
        'BorrowProcessed.txt-numOfCateBorrowed': '0',
        'BorrowProcessed.txt-ratioOfBorrowedMonths': '0',
        'BorrowProcessed.txt-timesOfKaoyan': '0',
        'BorrowProcessed.txt-timesOfProg': '0',
        'BorrowProcessed.txt-timesOfTOEFL': '0',
        'DormProcessed.txt-enterExit': 'mean',
        'DormProcessed.txt-maxMonthDensity': 'mean',
        'DormProcessed.txt-timesPerDay': 'mean',
        'DormProcessed.txt-totalDays': 'mean',
        'DormProcessed.txt-weekendTimes': 'mean',
        'LibraryProcessed.txt-eveningTimes': 'mean',
        'LibraryProcessed.txt-timesPerDay': 'mean',
        'LibraryProcessed.txt-totalDays': 'mean',
        'LibraryProcessed.txt-totalRecordTimes': 'mean',
        'LibraryProcessed.txt-weekendTimes': 'mean',
        'ScoreProcessed.txt-faculty': '0',
        'ScoreProcessed.txt-rankPercent': '-1',
    }
    zeroFilledFiles = (
        'CardProcessed.txt',
        'LibraryProcessed.txt',
        'ScoreProcessed.txt',
        'DormProcessed.txt',
        'BorrowProcessed.txt',
    )
    # For these files every feature actually present is filled with 0,
    # overriding the static entries above.
    if fileName in zeroFilledFiles:
        for key in feaNames:
            table[fileName + '-' + key] = '0'
    # ...except the score rank, which keeps -1 as its filler.
    if fileName == 'ScoreProcessed.txt':
        table['ScoreProcessed.txt-rankPercent'] = '-1'
    return table


def _fillSuffix(key):
    """Return the table suffix under which the strategy for ``key`` is stored.

    Features named 'numInCate' plus one trailing character share the
    'numInCate' entry; features whose name ends in 'enter' or 'exit' after a
    five character prefix share the 'enterExit' entry.
    """
    if key[:-1] == 'numInCate':
        return 'numInCate'
    if key[5:] in ('enter', 'exit'):
        return 'enterExit'
    return key


def _featureText(key, value, staDict, normalization):
    """Render one ',name:value' field, normalising the value when requested.

    'ifBorrowed' and the 'faculty' plus one character features are always
    written unchanged.
    """
    keepRaw = key == 'ifBorrowed' or key[:-1] == 'faculty'
    if normalization and not keepRaw:
        spread = float(staDict['max'][key] - staDict['min'][key])
        value = (value - staDict['mean'][key]) / spread
    return ',' + key + ':' + str(value)


def addFeatures(students, fileName, normalization=True):
    """Append the features of ``students`` to every row of ``originalFile``.

    ``originalFile`` (a module-level name) is read in full and then
    rewritten. Each row starts with an integer student id. Rows and
    ``students`` are walked together in one pass, so both must be in
    ascending id order (presumably ensured by the caller; not checked here).
    A row whose id has a matching student gets that student's feature
    values. Any other row gets a filler value chosen per feature from the
    fill-strategy table. Both kinds of row go through the same
    normalisation rule, wherever they appear in the file.

    One line per feature is also appended to
    ``sys.argv[4] + 'Processed/collumInfo.txt'`` recording its column
    number, ``fileName`` and the feature name.

    Args:
        students: non-empty sequence of dicts sharing the same keys, one of
            which is 'stuId'; the other values must be numeric.
        fileName: name of the processed feature file, e.g.
            'BorrowProcessed.txt'. It selects the fill strategies.
        normalization: when true, values are written as
            ``(value - mean) / (max - min)`` of their column, except for
            'ifBorrowed' and the faculty features.

    Raises:
        IndexError: if ``students`` or the file is empty.
        KeyError: if no fill strategy is known for a needed feature.
    """
    lines = readFromFile.readLines(originalFile)
    # Number of feature columns already present (the id is not counted).
    existsColNum = len(lines[0].split(',')) - 1
    # A fixed key order so every row lists the features identically.
    sortedFeaNames = sorted(students[0])
    sortedFeaNames.remove('stuId')
    print(sortedFeaNames)
    with open(sys.argv[4] + 'Processed/collumInfo.txt', 'a') as fw:
        fw.write(''.join(
            'fea' + str(i + existsColNum) + '    ' + fileName + '    ' +
            name + '\n' for i, name in enumerate(sortedFeaNames)))

    staDict = _columnStatistics(students, sortedFeaNames)
    FeaturesFill = _fillStrategyTable(fileName, sortedFeaNames)

    def rowFor(line, student):
        """Return ``line`` extended with all features (fillers if no student)."""
        fields = [line]
        for key in sortedFeaNames:
            if student is None:
                # The table is keyed by fileName, so look it up with the
                # same name.
                strategy = FeaturesFill[fileName + '-' + _fillSuffix(key)]
                value = staDict[strategy][key]
            else:
                value = student[key]
            fields.append(_featureText(key, value, staDict, normalization))
        fields.append('\n')
        return ''.join(fields)

    # Join the two forms; the context manager makes sure the rewritten file
    # is flushed and closed.
    i = 0
    j = 0
    with open(originalFile, 'w') as fw:
        while i < len(students) and j < len(lines):
            print(lines[j])
            stuId = int(lines[j].split(',')[0])
            print(str(i) + ' ' + str(stuId))
            recordId = int(students[i]['stuId'])
            if recordId < stuId:
                # This student has no row in the file; try the next one.
                i += 1
                continue
            if recordId == stuId:
                fw.write(rowFor(lines[j], students[i]))
                i += 1
            else:
                fw.write(rowFor(lines[j], None))
            j += 1
        # Students ran out: the remaining rows are all filled in.
        for line in lines[j:]:
            print(line)
            fw.write(rowFor(line, None))
Exemple #10
0
def getCollumNames():
    """Return the feature names listed in 'trainProcessed/collumInfo.txt'.

    Every line of that file is split on runs of exactly four spaces and the
    last piece is taken as the name.

    Returns:
        A list with one name per line, in file order.
    """
    infoLines = readFromFile.readLines('trainProcessed/collumInfo.txt')
    return [info.split('    ')[-1] for info in infoLines]