import csv
import os

# NOTE: `validation` and `removeHeaderRows` are assumed to come from this project's own
# helper modules; they are used below but not defined in this file.


def inputFiles(trainingFile, testingFile):
    # we have two "header" rows before the data actually starts, which will throw off our csv parser
    trainingFile, dataDescriptionLine = removeHeaderRows(trainingFile)
    testingFile, testingDataDescriptionLine = removeHeaderRows(testingFile)

    # grab the dataDescription row and make it lowercase
    expectedRowLength = len(dataDescriptionLine)
    dataDescriptionRaw = [x.lower() for x in dataDescriptionLine]
    hasID, testHeaderValidationLength, hasCustomValidationSplit = validation.dataDescription(dataDescriptionRaw)

    # the user told us whether this is 'output regression' or 'output category'
    # we need to split out the problem type (regression, category, or multi-category),
    # and leave only 'output'
    dataDescription = []
    for columnType in dataDescriptionRaw:
        if columnType[0:6] == 'output':
            dataDescription.append('output')
            problemType = columnType[7:]
        elif columnType[0:8] == 'groupby ':
            dataDescription.append(columnType[8:])
        else:
            dataDescription.append(columnType)

    testingDataDescription = [x.lower() for x in testingDataDescriptionLine]

    # we will break out separately the ID column, the output column, and then the rest of the data
    outputData = []
    idColumn = []
    validationSplitColumn = []
    outputColumn = []

    with open(trainingFile, 'rU') as trainingInput:
        # detect the "dialect" of this type of csv file
        try:
            dialect = csv.Sniffer().sniff(trainingInput.read(1024))
        except csv.Error:
            dialect = 'excel'
        trainingInput.seek(0)
        trainingRows = csv.reader(trainingInput, dialect)

        rowCount = 0
        for row in trainingRows:
            validation.rowLength(row, expectedRowLength, rowCount)
            # grab the header row and make it lowercase
            if rowCount == 0:
                headerRow = [x.lower() for x in row]
            else:
                trimmedRow = []
                if not hasID:
                    # while we won't be using these IDs, we do need to make sure our idColumn
                    # has the right number of rows, so we are putting placeholder IDs in here
                    idColumn.append(int(rowCount + 8000000000))
                for idx, val in enumerate(row):
                    if dataDescription[idx] == 'id':
                        idColumn.append(val)
                    elif dataDescription[idx] == 'validation split':
                        validationSplitColumn.append(val)
                    elif dataDescription[idx] == 'output':
                        # TODO: add in some error handling around making sure everything
                        # in the outputColumn is the same type
                        try:
                            outputColumn.append(float(val))
                        except ValueError:
                            outputColumn.append(val)
                    elif dataDescription[idx] == 'ignore':
                        # some columns contain data we do not want to use. It is trivial to drop
                        # them from our dataset here, rather than forcing the user to open the
                        # dataset in some other program just to delete the column.
                        pass
                    else:
                        trimmedRow.append(val)
                outputData.append(trimmedRow)
            # keep track of which row we are on for error logging purposes
            rowCount += 1

    # keep track of how long our training data set is so we can split it back out again later
    trainingLength = len(outputData)

    # TODO: properly handle paring off the top line from our testing csv file
    with open(testingFile, 'rU') as testingInput:
        # detect the dialect of the csv file
        try:
            dialect = csv.Sniffer().sniff(testingInput.read(1024))
        except csv.Error:
            dialect = 'excel'
        testingInput.seek(0)
        testingRows = csv.reader(testingInput, dialect)

        testingRowCount = 0
        # set missingOutputIndex equal to infinity to start with
        missingOutputIndex = float('inf')
        for row in testingRows:
            if testingRowCount == 0:
                testingHeader = [x.lower() for x in row]
                # check to make sure that, with all the IGNOREs considered, we have the right
                # number of columns
                colsValidated = validation.testingHeaderRow(row, expectedRowLength, headerRow)
                if not colsValidated:
                    # if not, assume that the missing column is the output column,
                    # and store that index position
                    missingOutputIndex = dataDescription.index('output')
                expectedTestingRowLength = len(row)
            else:
                # build up each row in the testing dataset
                validation.testingRowLength(row, expectedTestingRowLength, testingRowCount)
                trimmedRow = []
                for idx, val in enumerate(row):
                    if testingDataDescription[idx] == 'id':
                        idColumn.append(val)
                    elif testingDataDescription[idx] == 'output':
                        outputColumn.append(val)
                    elif testingDataDescription[idx] == 'ignore':
                        pass
                    else:
                        trimmedRow.append(val)
                # NOTE: we are appending both the training and the testing data into one dataset.
                # this ensures we will be processing them consistently; if we treated them
                # separately, it could cause an issue if we had a feature present in the
                # testing data but not the training data, for example
                outputData.append(trimmedRow)
            testingRowCount += 1

    os.remove(trainingFile)
    os.remove(testingFile)

    try:
        idHeader = headerRow[dataDescription.index('id')]
    except ValueError:
        idHeader = testingHeader[testingDataDescription.index('id')]

    return dataDescription, headerRow, trainingLength, outputData, idColumn, outputColumn, idHeader, problemType, dataDescriptionRaw, hasCustomValidationSplit, validationSplitColumn
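
# A minimal, hypothetical usage sketch: the file names are assumptions, and removeHeaderRows
# and validation must be importable for this to run. Both csv files are expected to start
# with a data-description row, then a header row, then the data itself. Note that inputFiles
# deletes its (pared) input files once it has read them.
if __name__ == '__main__':
    (dataDescription, headerRow, trainingLength, outputData, idColumn, outputColumn,
     idHeader, problemType, dataDescriptionRaw, hasCustomValidationSplit,
     validationSplitColumn) = inputFiles('train.csv', 'test.csv')
    # training and testing rows live in one combined list; split them back out at trainingLength
    trainingData = outputData[:trainingLength]
    testingData = outputData[trainingLength:]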
def datasets(X, joinFileName, XHeaderRow, dataDescription, args, groupByIndices, dateIndices, idColumn=None):
    # idColumn is needed for the joinOnIdColumn path below, so callers joining on the id
    # column must pass it in
    # TODO: read in and remove the first row. Having two "header" rows appears to be throwing
    # off the sniffer when we have multiple commas in a single column, even if they are
    # quote-enclosed.
    # write all but the first row (or two, possibly) to the temp.csv file.
    # read in that temp.csv file later.
    # with open(joinFileName, 'rU') as f:
    #     with open(args['outputFolder'] + 'temp.csv', 'w+') as f1:
    #         f.next()  # skip header line
    #         for line in f:
    #             f1.write(line)

    # the plan: do the join ourselves so we have more control and don't need to convert to
    # dataFrames and back. Read the join file into a dict keyed on the matching column,
    # skipping any 'id' or 'ignore' columns, then iterate through X and, for each row,
    # append all the values in the joinFile dict for that id.

    # 1. read in data in joinFileName
    joinDict = {}
    with open(joinFileName, 'rU') as joinFile:
        # detect the "dialect" of this type of csv file
        try:
            dialect = csv.Sniffer().sniff(joinFile.read(2048))
        except csv.Error:
            dialect = 'excel'
        joinFile.seek(0)
        joinRows = csv.reader(joinFile, dialect)

        rowCount = 0
        joinedValsLength = -1
        joinOnIdColumn = False
        for row in joinRows:
            if rowCount < 2:
                # grab the joinDataDescription row and the header row, and make them both lowercase
                if rowCount == 0:
                    expectedRowLength = len(row)
                    # 2. get joinDataDescription row from that file
                    joinDataDescription = [x.lower() for x in row]
                    validation.joinDataDescription(joinDataDescription)
                else:
                    validation.rowLength(row, expectedRowLength, rowCount)
                    # 3. get header row from that file
                    headerRow = [x.lower() for x in row]
                    # determine joinIndex
                    try:
                        # 4. see if we have an args['on'] property to join the files on
                        joinHeader = args['on'].lower()
                        joinIndex = headerRow.index(joinHeader)
                        xJoinIndex = XHeaderRow.index(joinHeader)
                    except (KeyError, AttributeError, ValueError):
                        try:
                            # see if our idColumn is what we're joining on
                            # (which seems like it'll happen at some point)
                            joinIndex = headerRow.index(args['idHeader'])
                            joinOnIdColumn = True
                        except (KeyError, ValueError):
                            # 5. see if we have the same headerRow name in both files to join on
                            # joinHeader = set(headerRow).intersection(XHeaderRow)
                            for x in headerRow:
                                if x in XHeaderRow:
                                    joinHeader = x
                                    joinIndex = headerRow.index(joinHeader)
                                    xJoinIndex = XHeaderRow.index(joinHeader)
                                    break
            else:
                validation.rowLength(row, expectedRowLength, rowCount)
                trimmedRow = []
                joinVal = row[joinIndex]
                for idx, val in enumerate(row):
                    if joinDataDescription[idx] != 'id' and joinDataDescription[idx] != 'ignore' and idx != joinIndex:
                        trimmedRow.append(val)
                joinDict[joinVal] = trimmedRow
                if len(trimmedRow) > joinedValsLength:
                    joinedValsLength = len(trimmedRow)
            # keep track of which row we are on for error logging purposes
            rowCount += 1

    newX = []
    # 6. join the files
    blankVals = [None for x in range(0, joinedValsLength)]
    for rowIndex, row in enumerate(X):
        if joinOnIdColumn:
            try:
                joinID = idColumn[rowIndex]
                newX.append(row + joinDict[joinID])
            except (KeyError, IndexError, TypeError):
                # append blank values so all rows still have the same number of columns
                newX.append(row + blankVals)
        else:
            try:
                joinID = row[xJoinIndex]
                newX.append(row + joinDict[joinID])
            except (KeyError, IndexError):
                # append blank values so all rows still have the same number of columns
                newX.append(row + blankVals)
        # free the original row as we go to keep memory usage down
        X[rowIndex] = None

    # append header rows
    for idx, name in enumerate(headerRow):
        if joinDataDescription[idx] != 'id' and joinDataDescription[idx] != 'ignore' and idx != joinIndex:
            XHeaderRow.append(name)

    # append dataDescription rows, and our groupByIndices and dateIndices
    for idx, name in enumerate(joinDataDescription):
        if name != 'id' and name != 'ignore' and idx != joinIndex:
            if name[0:7] == 'groupby':
                # append only the non-groupby part of this name, and record the column's
                # actual position in the combined dataDescription
                dataDescription.append(name[8:])
                groupByIndices.append(len(dataDescription) - 1)
            elif name == 'date':
                dataDescription.append(name)
                dateIndices.append(len(dataDescription) - 1)
            else:
                dataDescription.append(name)

    del X
    return newX, dataDescription, XHeaderRow, groupByIndices, dateIndices
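
# A minimal, self-contained sketch of datasets. The file name, column names, and data are all
# hypothetical, and it assumes validation.joinDataDescription and validation.rowLength accept
# this tiny file: write a join file with its data-description and header rows, then join it
# onto X on the shared 'store' column via args['on'].
if __name__ == '__main__':
    with open('joinExample.csv', 'w') as f:
        f.write('id,regular,regular\n')  # data-description row
        f.write('store,city,size\n')     # header row
        f.write('1,portland,large\n')
        f.write('2,seattle,small\n')
    X = [['1', '10.5'], ['2', '3.2']]
    newX, newDescription, newHeader, groupBys, dates = datasets(
        X, 'joinExample.csv', ['store', 'price'], ['regular', 'regular'],
        {'on': 'store'}, [], [])
    os.remove('joinExample.csv')
    # newX is now [['1', '10.5', 'portland', 'large'], ['2', '3.2', 'seattle', 'small']]
    # and newHeader is ['store', 'price', 'city', 'size']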