Example No. 1
def testingRowLength( row, expectedRowLength, rowCount ):
    if len( row ) != expectedRowLength:
        printParent( 'This row did not have the same number of columns as the testing dataset header row.')
        printParent( row )
        printParent( 'Within the testing dataset, this is row number:')
        printParent( rowCount )
        printParent( 'Please make sure that all rows have the same number of columns, even if those values are blank')
def nlp(X, dataDescription, headerRow):
    hasnlpColumn = False
    try:
        nlpColumnIndex = dataDescription.index('nlp')
        hasnlpColumn = True
    except:
        printParent('we did not find any nlp column to perform feature engineering on')
        pass

    # defaults so the return at the bottom still works when no nlp column exists
    corpus = []
    nlpDataDescription = []
    nlpHeaderRow = []

    if hasnlpColumn:
        # TODO: use TfidfVectorizer
            # iterate through each row, grabbing the nlp column
            # run this entire collected corpus through TfidfVectorizer, store into tfVectorized
            # figure out what to add to the headerRow and dataDescription row
                # one option is to add the actual word, if we can get that (we should be able to). it appears to exist in get_feature_names()
                # we might decide that we want to keep all the nlp words, in which case we'd want to prefix all these columns in dataDescription and headerRow with "nlp"
            # don't actually add tfVectorized to X yet. X is still dense, while tfVectorized is sparse. 
                # simply pass tfVectorized (along with what should be added to headerRow and dataDescription) back.
                # then stack it horizontally to X once we turn X into a sparse matrix later on. 
                # no need to disrupt the entire rest of the process by converting everything to sparse right now
        
        corpus = []

        for rowIdx, row in enumerate(X):
            rawString = row[nlpColumnIndex]
            cleanedString = unicode(rawString, errors='replace')
            corpus.append(cleanedString)

            # right now the value stored at the nlpColumnIndex is the entire text string
            # go through and overwrite that with a simple number representing the number of characters in that string. We will have the fuller representation of the string (using bag of words or tf-idf) stored elsewhere in this row
            row[nlpColumnIndex] = len(row[nlpColumnIndex])
            X[rowIdx] = row
        dataDescription[nlpColumnIndex] = 'continuous'
        headerRow[nlpColumnIndex] = 'lengthOf' + headerRow[nlpColumnIndex]

        # TODO: properly set the parameters here. how many words do we want to include, etc.
        # if we face a decoding error, ignore it
        # strip the accents from words to make them more consistent
        # if analyzer='char', each word feature will be made up of character n-grams. this means 'calling' and 'called' will be more similar, because they share the characters 'c', 'a', 'l', and 'l'. with word analysis, they would be considered two completely unrelated entities
        # if analyzer='word', each word feature will simply be the count of times that word appears in this document
        # remove english "stop words": words like 'the', 'it', 'a' that appear so frequently as to be pretty useless for distinguishing between documents. research has shown that for most corpora, removing stop words speeds up calculation time and increases accuracy (removes noise)
        # convert all characters to lowercase before tokenizing
        # only include the most frequently occurring 'max_features' features when building the vocabulary. In other words, if we have 80,000 unique words that appear throughout our corpus, but max_features is only 5,000, we will only include the most popular 5,000 words in the final features. This reduces noise, memory, and computation time, at the risk of ignoring useful data.

        vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='unicode', analyzer='word', stop_words='english', lowercase=True, max_features=5000)
        corpus = vectorizer.fit_transform(corpus)

        # TODO:
            # Before writing vectorizer to file, remove the stop_words attribute. Otherwise, it will take up totally unnecessary space
            # vectorizer.stop_words = None

        # TODO: get the feature names
        nlpHeaderRow = vectorizer.get_feature_names()
        nlpHeaderRow = ['_nlp' + x for x in nlpHeaderRow]
        nlpDataDescription = ['continuous' for x in nlpHeaderRow]


    return X, corpus, nlpDataDescription, nlpHeaderRow
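The comments above defer stacking the sparse tf-idf output onto X until X itself has been converted to a sparse matrix. A minimal sketch of that later step, using assumed caller-side names (appendNlpColumns, tfVectorized) that are not part of this repo:

from scipy.sparse import csr_matrix, hstack

def appendNlpColumns(X, tfVectorized, headerRow, dataDescription, nlpHeaderRow, nlpDataDescription):
    # X is assumed to be fully numeric by this point, so it can be converted to a sparse matrix
    XSparse = csr_matrix(X)
    # stack the tf-idf columns onto the right-hand side of X
    # forcing 'csr' format keeps the result compatible with the csr-based save/load helpers used elsewhere in this listing
    XCombined = hstack([XSparse, tfVectorized], format='csr')
    # keep the header and dataDescription rows aligned with the newly added columns
    return XCombined, dataDescription + nlpDataDescription, headerRow + nlpHeaderRow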
Example No. 3
def testingRowLength(row, expectedRowLength, rowCount):
    if len(row) != expectedRowLength:
        printParent(
            'This row did not have the same number of columns as the testing dataset header row.'
        )
        printParent(row)
        printParent('Within the testing dataset, this is row number:')
        printParent(rowCount)
        printParent(
            'Please make sure that all rows have the same number of columns, even if those values are blank'
        )
def calculateReplacementValues(columnMatrix, columnsWithMissingValues,
                               dataDescription):

    # fillInVals will have keys for each column index, and values for what the filled in value should be
    # this way we only need to check continuous or categorical once
    fillInVals = {}
    # for colIndex, column in enumerate(columnMatrix):
    # do this only for columns with missing values
    for colIndex in columnsWithMissingValues:
        try:
            # we have a string in our columnsWithMissingValues obj (countOfMissingValues), so we need to try to convert it into an int to make sure we're actually on a numerical key representing a column number
            colIndex = int(colIndex)
            if dataDescription[colIndex] == 'continuous':
                # Manually calculating the median value
                # the numpy way of doing this assumes that None is a number and includes it when calculating the median value
                # whereas we want the median of all the values other than None.
                # copy the list
                copiedList = list(columnMatrix[colIndex])
                # sort the list
                copiedList.sort()
                # find the index of the first missing value (stored as an empty string)
                for rowIndex, value in enumerate(copiedList):
                    if value == "":
                        noneIndex = rowIndex
                        break
                        # TODO: delete the copied list
                # divide that number in half (make it an int)
                medianIndex = int(noneIndex / 2)
                # access that position in the copied & sorted list
                medianVal = copiedList[medianIndex]
                # store that number into fillInVals
                fillInVals[colIndex] = medianVal
                # TODO: delete that sorted/copied list

            elif dataDescription[colIndex] == 'categorical':
                column = columnMatrix[colIndex]
                # the mode value
                fillInVals[colIndex] = max(set(column), key=column.count)
        except:
            printParent('we failed to create a fillInVals value for this key')
            printParent(colIndex)
            pass

    # remove all values of None from fillInVals
    # this way we will only create imputed columns if we can replace missing values in that column with something useful
    fillInVals = {k: v for k, v in fillInVals.items() if v is not None}

    return fillInVals
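The manual median above works around numpy treating the missing markers as values. As an alternative, a hedged sketch (not part of the original module) that filters the missing markers out first and keeps the same upper-middle choice as int(noneIndex / 2):

def medianIgnoringMissing(column):
    # drop the markers used for missing values before sorting
    presentValues = sorted(val for val in column if val not in ("", None))
    if not presentValues:
        return None
    # len // 2 picks the upper middle value for even-length lists, matching the code above
    return presentValues[len(presentValues) // 2]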
Example No. 5
def writeDataSparse(X, args, headerRow, nn):

    # grab the name of the training and testing files from the full path to those datasets
    trainingFileName = args['trainingPrettyName'] + '.npz'
    testingFileName = args['testingPrettyName'] + args[
        'trainingPrettyName'] + '.npz'

    if type(nn) is not bool:
        trainingFileName = 'nn_' + trainingFileName
        testingFileName = 'nn_' + testingFileName
        yFileName = 'y_train_' + 'nn_' + args['trainingPrettyName'] + '.npz'
        y_train = path.join(args['outputFolder'], yFileName)

        y = np.array(nn)
        y = [float(i) for i in y]

        # if our values are not already stored as numbers, convert them to numbers
        try:
            ySparse = csr_matrix(y)
            printParent('successfully turned y into a sparse matrix!')
        except:
            yInt = [float(i) for i in y[0:args['trainingLength']]]
            ySparse = csr_matrix(yInt)

        save_sparse_csr(y_train, ySparse)

    # save the file names into variables; we will use them to create the files and in the fileNames hash messaged out to the parent.
    X_train = path.join(args['outputFolder'], 'X_train_' + trainingFileName)
    X_test = path.join(args['outputFolder'], 'X_test_' + testingFileName)

    # scipy sparse matrices need a list of indices to slice
    # http://stackoverflow.com/questions/13352280/slicing-sparse-matrices-in-scipy-which-types-work-best
    trainRange = range(args['trainingLength'])
    testRange = range(args['trainingLength'],
                      args['trainingLength'] + args['testingLength'])

    save_sparse_csr(X_train, X[trainRange, :])
    save_sparse_csr(X_test, X[testRange])

    if type(nn) is not bool:
        fileNames = {
            'X_train_nn': X_train,
            'X_test_nn': X_test,
            'y_train_nn': y_train
        }
    else:
        fileNames = {'X_train': X_train, 'X_test': X_test}
    messageParent(fileNames, 'fileNames')
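writeDataSparse relies on save_sparse_csr, and other examples in this listing call load_sparse_csr, but neither helper's definition is included here. A hedged sketch of the np.savez pairing commonly used for saving and loading CSR matrices (an assumption, not the repo's own code):

import numpy as np
from scipy.sparse import csr_matrix

def save_sparse_csr(filename, array):
    # persist the three arrays that define a CSR matrix, plus its shape
    np.savez(filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)

def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])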
Example No. 6
def format(X, y, idColumn, args):
    X = minMax.normalize(X).tolist()

    # TODO: just min-max-normalize, no need to format into an obj for brainjs
    # TODO: send back fileNames to parent
    # X_train_nn:
    # X_test_nn:

    brainArr = []
    for rowIndex, row in enumerate(X):
        rowObj = {}
        # the output value is expected to be an array, so when the output is a single number (as it is here), we wrap it in an array below.
        # 'input' is already an array (the row itself), so it can be passed through as-is.
        rowObj['output'] = []
        yRow = y[rowIndex]
        if (isinstance(yRow, list)):
            rowObj['output'].extend(yRow)
        else:
            # the output value is expected to be an array, so if y values are not arrays (they have no len() ability), then we need to wrap the y value in an array
            rowObj['output'].append(yRow)
        rowObj['id'] = idColumn[rowIndex]

        rowObj['input'] = row
        brainArr.append(rowObj)

    trainingFileName = path.split(args['trainingData'])[-1]
    testingFileName = path.split(args['testingData'])[-1]

    with open(path.join(args['outputFolder'], 'brainJS' + trainingFileName),
              'w+') as outputFile:
        csvOutputFile = csv.writer(outputFile)
        for rowIndex, row in enumerate(brainArr):
            if (rowIndex < args['trainingLength']):
                csvOutputFile.writerow([row])

    with open(path.join(args['outputFolder'], 'brainJS' + testingFileName),
              'w+') as outputFile:
        csvOutputFile = csv.writer(outputFile)
        for rowIndex, row in enumerate(brainArr):
            if (rowIndex >= args['trainingLength']):
                csvOutputFile.writerow([row])

    printParent(
        'we have written your fully transformed brainJS data to a file at:')
    printParent(args['outputFolder'])

    return brainArr
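For reference, a hedged illustration (made-up values, not taken from the repo) of what one element of brainArr looks like after the loop above:

# one brainArr entry, assuming a 3-feature row and a single regression output:
# {
#     'id': 42,
#     'input': [0.25, 0.0, 1.0],   # min-max normalized feature values
#     'output': [0.7]              # y wrapped in a list, even when it is a single number
# }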
Example No. 8
def select( X, y, trainingLength, featureImportanceThreshold, headerRow, test, problemType ):
    # after dictVectorizing.py, we do not have a dataDescription row, nor do we need one. 
    # however, we've modularized prune so that prune expects a dataDescription row
    # here, we are simply creating a dummy dataDescription row that we will not use except to avoid an error in prune
    dataDescription = ['dummyValue' for x in range(len(headerRow)) ]

    # first, train linearly to remove all the completely useless features
        # this lets us send fewer features into our random forest (or eventually RFECV), which leads to dramatically faster training times (~ 2-3x improvement)
    # repeat this process twice with different feature thresholds.
    # first four orders of magnitude less than the most important feature, then two orders of magnitude less
    # hopefully by removing the features that are pure noise, we can allow the signal to be found more reliably, and certainly more quickly.
    # if problemType == 'category':
    #     estimator = LogisticRegression(n_jobs=-1)
    # else:
    #     estimator = LinearRegression(n_jobs=-1)

    # estimator.fit( X[ 0 : trainingLength ], y[ 0 : trainingLength ] )

    # try:
    #     coefList = estimator.coef_[0]
    #     len(coefList)
    # except:
    #     coefList = estimator.coef_


    # # remove everything that is at least 4 orders of magnitude shy of the best feature
    # X, headerRow, printingOutput, dataDescription = cleanDataset(X, coefList, 10000, headerRow, dataDescription) 

    # printParent('here are the features that were kept by the first round of regression, sorted by their feature importance')
    # printParent(printingOutput)


    rfStartTime = time.time()

    # train a random forest
    if problemType == 'category':
        classifier = RandomForestClassifier( n_jobs=-1, n_estimators=20 )
    else:
        classifier = RandomForestRegressor( n_jobs=-1, n_estimators=20 )
    classifier.fit( X[ 0 : trainingLength ], y[ 0 : trainingLength ] )

    # remove features that are at least 3 orders of magnitude shy of our most important feature
    X, filteredHeaderRow, printingOutput, dataDescription = cleanDataset(X, classifier.feature_importances_, 1000, headerRow, dataDescription )
    

    if( not test ):
        printParent('here are the features that were kept, sorted by their feature importance')
        printParent(printingOutput)

    printParent('total time for the random forest part of feature selection, in minutes:')
    # this will get us execution time in minutes, to one decimal place
    printParent( round( (time.time() - rfStartTime)/60, 1 ) )


    return X, filteredHeaderRow 
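cleanDataset itself is not part of this listing. Based only on the comments above (keep features within the given number of orders of magnitude of the most important one, and report the kept features sorted by importance), a hedged sketch of the filtering it presumably performs, shown on a plain list-of-lists X for simplicity:

def cleanDatasetSketch(X, featureImportances, threshold, headerRow, dataDescription):
    maxImportance = max(featureImportances)
    # keep a column only if its importance is within a factor of `threshold` of the best feature
    keptIndices = [idx for idx, importance in enumerate(featureImportances)
                   if importance > maxImportance / threshold]
    filteredX = [[row[idx] for idx in keptIndices] for row in X]
    filteredHeaderRow = [headerRow[idx] for idx in keptIndices]
    filteredDataDescription = [dataDescription[idx] for idx in keptIndices]
    # report the kept features sorted by importance, most important first
    printingOutput = sorted(((headerRow[idx], featureImportances[idx]) for idx in keptIndices),
                            key=lambda pair: pair[1], reverse=True)
    return filteredX, filteredHeaderRow, printingOutput, filteredDataDescription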
Example No. 9
def format( X, y, idColumn, args ):
    X = minMax.normalize( X ).tolist()

    # TODO: just min-max-normalize, no need to format into an obj for brainjs
    # TODO: send back fileNames to parent
        # X_train_nn:
        # X_test_nn:

    brainArr = []
    for rowIndex, row in enumerate(X):
        rowObj = {}
        # the output value is expected to be an array, so when the output is a single number (as it is here), we wrap it in an array below.
            # 'input' is already an array (the row itself), so it can be passed through as-is.
        rowObj['output'] = []
        yRow = y[ rowIndex ]
        if( isinstance( yRow, list )):
            rowObj['output'].extend( yRow )
        else:
            # the output value is expected to be an array, so if y values are not arrays (they have no len() ability), then we need to wrap the y value in an array
            rowObj['output'].append( yRow )
        rowObj[ 'id' ] = idColumn[ rowIndex ]
            
        rowObj['input'] = row
        brainArr.append( rowObj )

    trainingFileName = path.split( args['trainingData'] )[ -1 ]
    testingFileName = path.split( args['testingData'] )[ -1 ]

    with open( path.join( args['outputFolder'], 'brainJS' + trainingFileName ), 'w+') as outputFile:
        csvOutputFile = csv.writer(outputFile)
        for rowIndex, row in enumerate(brainArr):
            if( rowIndex < args['trainingLength'] ):
                csvOutputFile.writerow( [ row ] )

    with open( path.join( args['outputFolder'], 'brainJS' + testingFileName ), 'w+') as outputFile:
        csvOutputFile = csv.writer(outputFile)
        for rowIndex, row in enumerate(brainArr):
            if( rowIndex >= args['trainingLength'] ):
                csvOutputFile.writerow( [ row ] )

    printParent('we have written your fully transformed brainJS data to a file at:')
    printParent( args['outputFolder'] )

    return brainArr
Example No. 10
def writeDataSparse(X, args, headerRow, nn):

    # grab the name of the training and testing files from the full path to those datasets
    trainingFileName = args["trainingPrettyName"] + ".npz"
    testingFileName = args["testingPrettyName"] + args["trainingPrettyName"] + ".npz"

    if type(nn) is not bool:
        trainingFileName = "nn_" + trainingFileName
        testingFileName = "nn_" + testingFileName
        yFileName = "y_train_" + "nn_" + args["trainingPrettyName"] + ".npz"
        y_train = path.join(args["outputFolder"], yFileName)

        y = np.array(nn)
        y = [float(i) for i in y]

        # if our values are not already stored as numbers, convert them to numbers
        try:
            ySparse = csr_matrix(y)
            printParent("successfully turned y into a sparse matrix!")
        except:
            yInt = [float(i) for i in y[0:args["trainingLength"]]]
            ySparse = csr_matrix(yInt)

        save_sparse_csr(y_train, ySparse)

    # save the file names into variables; we will use them to create the files and in the fileNames hash messaged out to the parent.
    X_train = path.join(args["outputFolder"], "X_train_" + trainingFileName)
    X_test = path.join(args["outputFolder"], "X_test_" + testingFileName)

    # scipy sparse matrices need a list of indices to slice
    # http://stackoverflow.com/questions/13352280/slicing-sparse-matrices-in-scipy-which-types-work-best
    trainRange = range(args["trainingLength"])
    testRange = range(args["trainingLength"], args["trainingLength"] + args["testingLength"])

    save_sparse_csr(X_train, X[trainRange, :])
    save_sparse_csr(X_test, X[testRange])

    if type(nn) is not bool:
        fileNames = {"X_train_nn": X_train, "X_test_nn": X_test, "y_train_nn": y_train}
    else:
        fileNames = {"X_train": X_train, "X_test": X_test}
    messageParent(fileNames, "fileNames")
Example No. 11
def prune(  X, y, trainingLength, featureImportanceThreshold, headerRow, dataDescription, test, problemType ):
    rfStartTime = time.time()

    # train a random forest
    if problemType == 'category':
        classifier = RandomForestClassifier( n_jobs=-1, n_estimators=20 )
    else:
        classifier = RandomForestRegressor( n_jobs=-1, n_estimators=20 )
    classifier.fit( X[ 0 : trainingLength ], y[ 0 : trainingLength ] )

    X, filteredHeaderRow, printingOutput, filteredDataDescription = cleanDataset(X, classifier.feature_importances_, featureImportanceThreshold, headerRow, dataDescription )
    

    if( not test ):
        printParent('here are the features that were kept, sorted by their feature importance')
        printParent(printingOutput)

    printParent('total time for the pruning part of feature selection, in minutes:')
    # this will get us execution time in minutes, to one decimal place
    printParent( round( (time.time() - rfStartTime)/60, 1 ) )


    return [ X, filteredHeaderRow, filteredDataDescription ]
Example No. 12
    trainingIndices = []
    for idx, item in enumerate(validationSplitColumn.todense().tolist()[0]):
        if item == 1:
            validationIndices.append(idx)
        else:
            trainingIndices.append(idx)

else:
    # try to load in existing validationIndices
    try:
        with open(validationIndicesFile, "rb") as openFile:
            validationIndices = pickle.load(openFile)

            # check to make sure that the validation length is less than the length of our X dataset
            if len(validationIndices) > numRows * (validationPercent + 0.02):
                printParent("validationIndices too long")
                # if it isn't, create a new validationIndices for this dataset, but do not write it to file
                # this lets us keep our larger validationIndices split (for the full training data set), while still having something to work with for this smaller dataset we're currently testing on.
                writeToFile = False
                raise IndexError("this dataset is shorter than the one we built the validation split on previously")

            # check to make sure that the validation length is within a few percentage points of our validationPercent number (in other words, if X is 10,000 rows long, and the length of the validationIndices is only 1,200, then we know validationIndices was built on a smaller test dataset earlier.)
            elif len(validationIndices) < numRows * validationPercent * 0.98:
                printParent("validationIndices too short")
                # If it is not, create a new validationIndices and write that to file
                raise IndexError("this dataset is longer than the one we built the validation split on previously")

            # In both cases, fall into the except state below
            # but create a variable that lays out whether to write that new validationIndices to file or not in the try block, and then use that in the except block below

            # if we found existing validationIndices that meet the criteria above, we still want to split our incoming dataset on those indices
Example No. 13
def dates(X, dataDescription, headerRow):
    hasDateColumn = False
    try:
        dateColumnIndex = dataDescription.index('date')
        hasDateColumn = True
    except:
        printParent('we were not able to feature engineer the dates')
        pass

    if hasDateColumn:
        headerRow.append('dayOfWeek')
        dataDescription.append('categorical')

        headerRow.append('year')
        dataDescription.append('categorical')

        headerRow.append('month')
        dataDescription.append('categorical')

        headerRow.append('dayOfMonth')
        dataDescription.append('categorical')

        headerRow.append('isWeekend')
        dataDescription.append('categorical')

        # the machine learning algorithms won't know how to interpret a datetime object
        # so we will instead replace the datetime object with a measure of how many days this row's date falls after the minimum date in the dataset.
        headerRow[dateColumnIndex] = 'daysSinceMinDate'
        dataDescription[dateColumnIndex] = 'continuous'

        # note, the holidays will only apply to US holidays at first.
        # i'd love a PR that expands support to other countries!
        # sweet, holidays shouldn't be too difficult!
        # http://stackoverflow.com/questions/2394235/detecting-a-us-holiday
        # headerRow.append('isFederalHoliday')
        # dataDescription.append('categorical')
        # headerRow.append('isNonFederalHoliday')
        # dataDescription.append('categorical')

        # set our minDate equal to the first date in the dataset as a starting value
        # we will then compare each date against this to find the lowest.
        minDate = parse(X[0][dateColumnIndex])

        for rowIdx, row in enumerate(X):
            # TODO: put each iteration inside its own try block, so that if we do not have a date for one row, the rest of the rows will be ok, and then we can go through and impute values for the missing date
            # turn the string of the date into a datetime object
            # dateutil.parser.parse will automatically detect the format of the string (or at least attempt to)
            rowDate = parse(row[dateColumnIndex])

            # save that datetime object in place of the original string (temporarily)
            # this will save us having to parse this date again on the second iteration through when we set daysSinceMinDate
            row[dateColumnIndex] = rowDate
            if rowDate < minDate:
                minDate = rowDate

            # turn all of these integers into strings, because they are going to be handled as categorical values.
            # data-formatter assumes categorical values are strings for things like using as the key in dictionaries
            dayOfWeek = rowDate.weekday()
            row.append(str(dayOfWeek))
            row.append(str(rowDate.year))
            row.append(str(rowDate.month))
            row.append(str(rowDate.day))

            # (stringified) boolean flag for whether this is a weekend or not
            # note that in Python, the week starts at 0 on Mondays, whereas in JS, the week starts at 0 on Sundays
            if dayOfWeek in [5, 6]:
                row.append(str(True))
            else:
                row.append(str(False))

            X[rowIdx] = row

        for rowIdx, row in enumerate(X):
            # right now the value stored at the dateColumnIndex is a datetime object from the previous iteration
            # go through and overwrite that with a simple number representing the number of days since the first day in the dataset
            row[dateColumnIndex] = (row[dateColumnIndex] - minDate).days
            X[rowIdx] = row

        printParent('successfully ran featureEngineering.dates!')

    return X, dataDescription, headerRow
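A small usage sketch (made-up data, not from the repo) showing what dates() does to a two-row dataset:

# X = [['2016-01-04', 'some value'], ['2016-01-09', 'another value']]
# dataDescription = ['date', 'categorical']
# headerRow = ['signupDate', 'otherColumn']
# X, dataDescription, headerRow = dates(X, dataDescription, headerRow)
#
# headerRow is now:
#   ['daysSinceMinDate', 'otherColumn', 'dayOfWeek', 'year', 'month', 'dayOfMonth', 'isWeekend']
# the first row becomes  [0, 'some value', '0', '2016', '1', '4', 'False']      (Jan 4, 2016 is a Monday)
# the second row becomes [5, 'another value', '5', '2016', '1', '9', 'True']    (Jan 9, 2016 is a Saturday)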
Example No. 15
    trainingIndices = []
    for idx, item in enumerate(validationSplitColumn.todense().tolist()[0]):
        if item == 1:
            validationIndices.append(idx)
        else:
            trainingIndices.append(idx)

else:
    # try to load in existing validationIndices
    try:
        with open(validationIndicesFile, 'rb') as openFile:
            validationIndices = pickle.load(openFile)

            # check to make sure that the validation length is less than the length of our X dataset
            if len(validationIndices) > numRows * ( validationPercent + .02):
                printParent('validationIndices too long')
                # if it isn't, create a new validationIndices for this dataset, but do not write it to file
                # this lets us keep our larger validationIndices split (for the full training data set), while still having something to work with for this smaller dataset we're currently testing on.
                writeToFile = False
                raise IndexError("this dataset is shorter than the one we built the validation split on previously")

            # check to make sure that the validation length is within a few percentage points of our validationPercent number (in other words, if X is 10,000 rows long, and the length of the validationIndices is only 1,200, then we know validationIndices was built on a smaller test dataset earlier.)
            elif len(validationIndices) < numRows * validationPercent * .98:
                printParent('validationIndices too short')
                # If it is not, create a new validationIndices and write that to file
                raise IndexError("this dataset is longer than the one we built the validation split on previously")
                
            # In both cases, fall into the except state below
            # but create a variable that lays out whether to write that new validationIndices to file or not in the try block, and then use that in the except block below

            # if we found existing validationIndices that meet the criteria above, we still want to split our incoming dataset on those indices
Example No. 16
def joinDataDescription(dataDescription):
    allowableValues = ['id','continuous','groupby continuous','categorical','groupby categorical','date','groupby date','ignore', 'validation split', 'nlp']

    for name in dataDescription:
        try:
            allowableValues.index(name)

        except:
            printParent('*********************************************************************')
            printParent('\n')
            printParent('Warning, we have received a value in the dataDescription row that is not valid:')
            printParent(name)
            printParent('The entire dataDescription row is:')
            printParent(dataDescription)
            printParent('Please remember that the first row must contain information describing that column of data')
            printParent('Acceptable values are: "ID", "Continuous", "Categorical", "Date", "IGNORE", "Validation Split", and "NLP", though they are not case sensitive')
            printParent('\n')
            printParent('*********************************************************************')
            printParent('This is an error that prevents the rest of the program from running. Please fix and run machineJS again.')
            printParent('\n')
            printParent('\n')
            printParent('\n')
            printParent('\n')

            raise
Example No. 17
def dataDescription(arr):
    expectedTestRowLength = 0
    expectedValues = {
        'id': False,
        'output category': False,
        'output regression': False,
        'output multi-category': False,
        'nlp': False,
        'validation split': False
    }
    allowableValues = ['id','output category','output multi-category','output regression','continuous','categorical','date','ignore','validation split','nlp']


    for colIndex, name in enumerate(arr):

        # remove groupBy from in front of any other dataDescription words it might be paired with
        if name[0:8] == 'groupby ':
            name = name[8:]

        try:
            if name == 'output multicategory' or name == 'output multi category':
                name = 'output multi-category'
            allowableValues.index(name)
            expectedValues[name] = True
            # sometimes we will include columns in our training dataset that we will not include in our testing dataset. we want to allow for that
            # we already have logic in place for handling missing output values in our testing dataset. 
            if name not in ['ignore', 'validation split']:
                expectedTestRowLength += 1

        except:
            printParent('*********************************************************************')
            printParent('')
            printParent('Warning, we have received a value in the first row that is not valid:')
            printParent(name)
            printParent('Please remember that the first row must contain information describing that column of data')
            printParent('Acceptable values are: "ID", "Output Category", "Output Multi-Category", "Output Regression", "Continuous", "Categorical", "Date", "IGNORE", "Validation Split", and "NLP", though they are not case sensitive.')
            printParent('')
            printParent('The column index of this unexpected value is:')
            printParent(colIndex)
            printParent('The entire row that we received is:')
            printParent(str(arr))
            printParent('*********************************************************************')
            printParent('This is an error that prevents the rest of the program from running. Please fix and run machineJS again.')
            printParent('')
            printParent('')
            printParent('')
            printParent('')
            raise
    if( not expectedValues['output category'] and not expectedValues['output regression'] and not expectedValues['output multi-category']):
        printParent('Warning, there is no column with an "Output" label in the first row')
        raise TypeError('dataDescription row incomplete')

    if( not expectedValues['id'] ):
        printParent('Warning, there is no column with an "ID" label in the first row')
        printParent('Not having an ID column is ok, as long as this is intentional.')
        # our testing dataset must have an id in it, so if our training data does not have an id column, we would expect our testing data to have one more column
        expectedTestRowLength += 1
        return False, expectedTestRowLength, expectedValues['validation split']
        raise TypeError('dataDescription row incomplete')

    # returning True means that we do have all the pieces we need to continue as normal
    return True, expectedTestRowLength, expectedValues['validation split']
Example No. 18
def rowLength(row, expectedRowLength, rowCount):
    if len(row) != expectedRowLength:
        printParent(
            'This row did not have the same number of columns as the dataDescription row.'
        )
        printParent(row)
        printParent('This is row number:')
        printParent(rowCount)
        printParent(
            'Please make sure that all rows have the same number of columns, even if those values are blank'
        )
        printParent(
            'And it might be worth double checking that your dataDescription row has an accurate description for each column in the dataset'
        )
Example No. 19
def testingHeaderRow( row, expectedRowLength, trainingHeader ):
    if len( row ) != expectedRowLength:
        printParent('We noticed that the testing and training datasets have different numbers of columns.')
        printParent('We are going to assume that the "Output" column is simply not included for the testing dataset.')
        printParent( 'Here is the header row for the training data set:')
        printParent( trainingHeader )
        printParent( 'And here is the header row for your testing dataset:')
        row = [x.lower() for x in row]
        printParent( row )
        return False
    return True
Example No. 20
def rowLength( row, expectedRowLength, rowCount ):
    if len( row ) != expectedRowLength:
        printParent( 'This row did not have the same number of columns as the dataDescription row.')
        printParent( row )
        printParent( 'This is row number:')
        printParent( rowCount )
        printParent( 'Please make sure that all rows have the same number of columns, even if those values are blank')
        printParent( 'And it might be worth double checking that your dataDescription row has an accurate description for each column in the dataset')
Example No. 21
import pandas as pd
import json
import sys
import csv

from os import path

from sendMessages import printParent

printParent('inside matrixOutput.py')

# for multi-category data, we can choose to output a single column with all the categories contained in that column, or we can translate that into a set of binary columns, where each column represents a single categorical value.
# if the final output is matrixOutput, create a separate file that can be easily referenced by the user

argv = json.loads(sys.argv[1])

printParent('argv')
printParent(argv)

resultsFileName = argv['resultsFile']

idHeader = ''
rowCount = 0
ids = []
predictions = []
with open(resultsFileName, 'rU') as resultsFile:
    inputRows = csv.reader(resultsFile)
    for row in inputRows:
        if rowCount == 0:
            idHeader = row[0]
        else:
Example No. 22
validationIndexFolder = path.dirname(args['predict'])
validationIndexFileName = 'dfValidationIndices' + args['testOutputFileName'] + '.pkl'
validationIndicesFile = path.join( validationIndexFolder, validationIndexFileName )


writeToFile = True
createNewSplit = False

# try to load in existing validationIndices
try:
    with open(validationIndicesFile, 'rb') as openFile:
        validationIndices = pickle.load(openFile)

        # check to make sure that the validation length is less than the length of our X dataset
        if len(validationIndices) > numRows * ( validationPercent + .02):
            printParent('validationIndices too long')
            # if it isn't, create a new validationIndices for this dataset, but do not write it to file
            # this lets us keep our larger validationIndices split (for the full training data set), while still having something to work with for this smaller dataset we're currently testing on.
            writeToFile = False
            raise IndexError("this dataset is shorter than the one we built the validation split on previously")

        # check to make sure that the validation length is within a few percentage points of our validationPercent number (in other words, if X is 10,000 rows long, and the length of the validationIndices is only 1,200, then we know validationIndices was built on a smaller test dataset earlier.)
        elif len(validationIndices) < numRows * validationPercent * .98:
            printParent('validationIndices too short')
            # If it is not, create a new validationIndices and write that to file
            raise IndexError("this dataset is longer than the one we built the validation split on previously")
            
        # In both cases, fall into the except state below
        # but create a variable that lays out whether to write that new validationIndices to file or not in the try block, and then use that in the except block below

        # if we found existing validationIndices that meet the criteria above, we still want to split our incoming dataset on those indices
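The except branch that the comments above describe is cut off in this excerpt. A hedged sketch, consistent with those comments, of what that branch presumably does (the exact exception types and split mechanics are assumptions):

# except (IOError, IndexError):
#     # build a fresh random validation split for this dataset
#     validationIndices = random.sample(range(numRows), int(numRows * validationPercent))
#     # only persist the new split when writeToFile is still True (i.e., it was built against the full training dataset)
#     if writeToFile:
#         with open(validationIndicesFile, 'wb') as openFile:
#             pickle.dump(validationIndices, openFile)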
Example No. 23
import pandas as pd
import json
import sys
import csv

from os import path

from sendMessages import printParent

printParent('inside matrixOutput.py')

# for multi-category data, we can choose to output a single column with all the categories contained in that column, or we can translate that into a set of binary columns, where each column represents a single categorical value. 
# if the final output is matrixOutput, create a separate file that can be easily referenced by the user

argv = json.loads(sys.argv[1])

printParent('argv')
printParent(argv)

resultsFileName = argv['resultsFile']

idHeader = ''
rowCount = 0
ids = []
predictions = []
with open(resultsFileName, 'rU') as resultsFile:
    inputRows = csv.reader(resultsFile)
    for row in inputRows:
        if rowCount == 0:
            idHeader = row[0]
        else:
Example No. 24
def testingHeaderRow(row, expectedRowLength, trainingHeader):
    if len(row) != expectedRowLength:
        printParent(
            'We noticed that the testing and training datasets have different numbers of columns.'
        )
        printParent(
            'We are going to assume that the "Output" column is simply not included for the testing dataset.'
        )
        printParent('Here is the header row for the training data set:')
        printParent(trainingHeader)
        printParent('And here is the header row for your testing dataset:')
        row = [x.lower() for x in row]
        printParent(row)
        return False
    return True
Example No. 25
    # our X_train file has a header row, so the user can see the results of data-formatter in a pretty way if they'd like.
    # we need to remove this row from our actual dataset
    # none of our other files from data-formatter have header rows
    with open(X_file_name, 'rU') as openInputFile:
        inputRows = csv.reader(openInputFile)
        firstRow=False
        for row in inputRows:
            if(firstRow):
                rowAsFloats = []
                # make sure that floats that were saved as scientific notation are actually read in as floats
                # this should be non-controversial, as by this point we should have turned all categorical data into binary representation (0 or 1).
                for idx, val in enumerate(row):
                    try:
                        val = float(val)
                    except:
                        printParent(headerRow[idx])
                        printParent(val)
                    rowAsFloats.append( val )
                X.append(rowAsFloats)
            else:
                headerRow = row
                firstRow=True
            

    X = np.array(X)

try:
    y = load_sparse_csr(y_file_name)

except:
    # supports dense input, which is used in validationRound
Example No. 26
def nlp(X, dataDescription, headerRow):
    hasnlpColumn = False
    try:
        nlpColumnIndex = dataDescription.index('nlp')
        hasnlpColumn = True
    except:
        printParent(
            'we did not find any nlp column to perform feature engineering on')
        pass

    # defaults so the return at the bottom still works when no nlp column exists
    corpus = []
    nlpDataDescription = []
    nlpHeaderRow = []

    if hasnlpColumn:
        # TODO: use TfidfVectorizer
        # iterate through each row, grabbing the nlp column
        # run this entire collected corpus through TfidfVectorizer, store into tfVectorized
        # figure out what to add to the headerRow and dataDescription row
        # one option is to add the actual word, if we can get that (we should be able to). it appears to exist in get_feature_names()
        # we might decide that we want to keep all the nlp words, in which case we'd want to prefix all these columns in dataDescription and headerRow with "nlp"
        # don't actually add tfVectorized to X yet. X is still dense, while tfVectorized is sparse.
        # simply pass tfVectorized (along with what should be added to headerRow and dataDescription) back.
        # then stack it horizontally to X once we turn X into a sparse matrix later on.
        # no need to disrupt the entire rest of the process by converting everything to sparse right now

        corpus = []

        for rowIdx, row in enumerate(X):
            rawString = row[nlpColumnIndex]
            cleanedString = unicode(rawString, errors='replace')
            corpus.append(cleanedString)

            # right now the value stored at the nlpColumnIndex is the entire text string
            # go through and overwrite that with a simple number representing the number of characters in that string. We will have the fuller representation of the string (using bag of words or tf-idf) stored elsewhere in this row
            row[nlpColumnIndex] = len(row[nlpColumnIndex])
            X[rowIdx] = row
        dataDescription[nlpColumnIndex] = 'continuous'
        headerRow[nlpColumnIndex] = 'lengthOf' + headerRow[nlpColumnIndex]

        # TODO: properly set the parameters here. how many words do we want to include, etc.
        # if we face a decoding error, ignore it
        # strip the accents from words to make them more consistent
        # if analyzer='char', each word feature will be made up of character n-grams. this means 'calling' and 'called' will be more similar, because they share the characters 'c', 'a', 'l', and 'l'. with word analysis, they would be considered two completely unrelated entities
        # if analyzer='word', each word feature will simply be the count of times that word appears in this document
        # remove english "stop words": words like 'the', 'it', 'a' that appear so frequently as to be pretty useless for distinguishing between documents. research has shown that for most corpora, removing stop words speeds up calculation time and increases accuracy (removes noise)
        # convert all characters to lowercase before tokenizing
        # only include the most frequently occurring 'max_features' features when building the vocabulary. In other words, if we have 80,000 unique words that appear throughout our corpus, but max_features is only 5,000, we will only include the most popular 5,000 words in the final features. This reduces noise, memory, and computation time, at the risk of ignoring useful data.

        vectorizer = TfidfVectorizer(decode_error='ignore',
                                     strip_accents='unicode',
                                     analyzer='word',
                                     stop_words='english',
                                     lowercase=True,
                                     max_features=5000)
        corpus = vectorizer.fit_transform(corpus)

        # TODO:
        # Before writing vectorizer to file, remove the stop_words attribute. Otherwise, it will take up totally unnecessary space
        # vectorizer.stop_words = None

        # TODO: get the feature names
        nlpHeaderRow = vectorizer.get_feature_names()
        nlpHeaderRow = ['_nlp' + x for x in nlpHeaderRow]
        nlpDataDescription = ['continuous' for x in nlpHeaderRow]

    return X, corpus, nlpDataDescription, nlpHeaderRow
Example No. 27
def dataDescription(arr):
    expectedTestRowLength = 0
    expectedValues = {
        'id': False,
        'output category': False,
        'output regression': False,
        'output multi-category': False,
        'nlp': False,
        'validation split': False
    }
    allowableValues = [
        'id', 'output category', 'output multi-category', 'output regression',
        'continuous', 'categorical', 'date', 'ignore', 'validation split',
        'nlp'
    ]

    for colIndex, name in enumerate(arr):

        # remove groupBy from in front of any other dataDescription words it might be paired with
        if name[0:8] == 'groupby ':
            name = name[8:]

        try:
            if name == 'output multicategory' or name == 'output multi category':
                name = 'output multi-category'
            allowableValues.index(name)
            expectedValues[name] = True
            # sometimes we will include columns in our training dataset that we will not include in our testing dataset. we want to allow for that
            # we already have logic in place for handling missing output values in our testing dataset.
            if name not in ['ignore', 'validation split']:
                expectedTestRowLength += 1

        except:
            printParent(
                '*********************************************************************'
            )
            printParent('')
            printParent(
                'Warning, we have received a value in the first row that is not valid:'
            )
            printParent(name)
            printParent(
                'Please remember that the first row must contain information describing that column of data'
            )
            printParent(
                'Acceptable values are: "ID", "Output Category", "Output Multi-Category", "Output Regression", "Continuous", "Categorical", "Date", "IGNORE", "Validation Split", and "NLP", though they are not case sensitive.'
            )
            printParent('')
            printParent('The column index of this unexpected value is:')
            printParent(colIndex)
            printParent('The entire row that we received is:')
            printParent(str(arr))
            printParent(
                '*********************************************************************'
            )
            printParent(
                'This is an error that prevents the rest of the program from running. Please fix and run machineJS again.'
            )
            printParent('')
            printParent('')
            printParent('')
            printParent('')
            raise
    if (not expectedValues['output category']
            and not expectedValues['output regression']
            and not expectedValues['output multi-category']):
        printParent(
            'Warning, there is no column with an "Output" label in the first row'
        )
        raise TypeError('dataDescription row incomplete')

    if (not expectedValues['id']):
        printParent(
            'Warning, there is no column with an "ID" label in the first row')
        printParent(
            'Not having an ID column is ok, as long as this is intentional.')
        # our testing dataset must have an id in it, so if our training data does not have an id column, we would expect our testing data to have one more column
        expectedTestRowLength += 1
        return False, expectedTestRowLength, expectedValues['validation split']
        raise TypeError('dataDescription row incomplete')

    # returning True means that we do have all the pieces we need to continue as normal
    return True, expectedTestRowLength, expectedValues['validation split']
Example No. 28
        validationYFile = fileNames['y_trainvalidationData']
    validationY = load_sparse_csr(validationYFile).todense().tolist()[0]

    if problemType == 'category':
        try:
            validationPredictions = classifier.predict_proba(validationData)
        except:
            validationPredictions = classifier.predict(validationData)

    else:
        # else will handle both regression and multi-category predictions for now
        validationPredictions = classifier.predict(validationData)

    validationScore = classifier.score(validationData, validationY)

    printParent('\n')
    printParent('***************')
    printParent(classifierName + "'s score on the validation set is:")
    printParent(validationScore)
    printParent('***************')
else:
    # we still need something to write to the file. we will write the score from the hyperparameter search, which is the cross-validation score on the holdout data from that search. in that way, it's actually a pretty accurate score to be using.
    validationScore = searchScore

# write our predictions on the test data to a file
if argv['validationRound']:
    predictionsPath = path.join(argv['predictionsFolder'],
                                'ensembledPredictions')

else:
    predictionsPath = argv['predictionsFolder']
Example No. 29

predictions = []
with open(predictionsFile, 'rU') as predictionsFile:
    inputRows = csv.reader(predictionsFile)
    for row in inputRows:
        predictions.append(row)

for rowIdx, predictionRow in enumerate(predictions):
    # printParent('predictionRow in predictions')
    # printParent(predictionRow)
    for colIdx, prediction in enumerate(predictionRow):
        try:
            predictions[rowIdx][colIdx] = float(prediction)
        except:
            printParent(repr(prediction))
            raise

# scipy.sparse matrices work on np.arrays, not python lists
# they also do not deal well with mixed data types
# TODO: this is potentially fragile as we expand to more and more problem types.
# the predictions files will likely always be floats, but the validation data oftentimes won't be
try:
    predictions = np.array(predictions, dtype=float)
except:
    predictions = np.array(predictions)
predictions = csr_matrix(predictions)

validationData = load_sparse_csr(validationData)

# we must pass in 'csr', otherwise it will oftentimes pick another sparse matrix format for us that is not compatible with our sparse saver and loader function
Example No. 30
def joinDataDescription(dataDescription):
    allowableValues = [
        'id', 'continuous', 'groupby continuous', 'categorical',
        'groupby categorical', 'date', 'groupby date', 'ignore',
        'validation split', 'nlp'
    ]

    for name in dataDescription:
        try:
            allowableValues.index(name)

        except:
            printParent(
                '*********************************************************************'
            )
            printParent('\n')
            printParent(
                'Warning, we have received a value in the dataDescription row that is not valid:'
            )
            printParent(name)
            printParent('The entire dataDescription row is:')
            printParent(dataDescription)
            printParent(
                'Please remember that the first row must contain information describing that column of data'
            )
            printParent(
                'Acceptable values are: "ID", "Continuous", "Categorical", "Date", "IGNORE", "Validation Split", and "NLP", though they are not case sensitive'
            )
            printParent('\n')
            printParent(
                '*********************************************************************'
            )
            printParent(
                'This is an error that prevents the rest of the program from running. Please fix and run machineJS again.'
            )
            printParent('\n')
            printParent('\n')
            printParent('\n')
            printParent('\n')

            raise
Example No. 31
    validationY = load_sparse_csr(validationYFile).todense().tolist()[0]


    if problemType == 'category':
        try:
            validationPredictions = classifier.predict_proba(validationData)
        except:
            validationPredictions = classifier.predict(validationData)
            
    else:
        # else will handle both regression and multi-category predictions for now
        validationPredictions = classifier.predict(validationData)

    validationScore = classifier.score(validationData,validationY)

    printParent('\n')
    printParent('***************')
    printParent(classifierName + "'s score on the validation set is:")
    printParent(validationScore)
    printParent('***************')
else:
    # we still need something to write to the file. we will write the score from the hyperparameter search, which is the cross-validation score on the holdout data from that search. in that way, it's actually a pretty accurate score to be using.
    validationScore = searchScore

# write our predictions on the test data to a file
if argv['validationRound']:
    predictionsPath = path.join( argv['predictionsFolder'], 'ensembledPredictions' )

else:
    predictionsPath = argv['predictionsFolder']

predictions = []
with open(predictionsFile, 'rU') as predictionsFile:
    inputRows = csv.reader(predictionsFile)
    for row in inputRows:
        predictions.append(row)

for rowIdx, predictionRow in enumerate(predictions):
    # printParent('predictionRow in predictions')
    # printParent(predictionRow)
    for colIdx, prediction in enumerate(predictionRow):
        try:
            predictions[rowIdx][colIdx] = float(prediction)
        except:
            printParent(repr(prediction))
            raise

# scipy.sparse matrices work on np.arrays, not python lists
# they also do not deal well with mixed data types
# TODO: this is potentially fragile as we expand to more and more problem types. 
    # the predictions files will likely always be floats, but the validation data oftentimes won't be
try:
    predictions = np.array(predictions, dtype=float)
except:
    predictions = np.array(predictions)
predictions = csr_matrix(predictions)

validationData = load_sparse_csr(validationData)

# we must pass in 'csr', otherwise it will oftentimes pick another sparse matrix format for us that is not compatible with our sparse saver and loader function