Example #1
    # assumes: from time import time; import numpy as np; printDoneTime is a project helper
    def fit(self, X, y):
        """
        @param y: an array of label strings such as '0100122', one character per classifier in self.clfs
        """
        assert len(y[0]) == len(self.clfs), "each label string in y must have one character per classifier in self.clfs"

        for col, clf in enumerate(self.clfs):
            t0 = time()
            print 'Fitting classifier for column', col
            curY = np.array([int(s[col]) for s in y])   # this column's digit from every label string
            clf.fit(X, curY)
            printDoneTime(t0)
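
The core trick in fit is decomposing each multi-digit label string into one integer target per classifier. A self-contained sketch of that extraction (toy labels; not from the source):

# per-column label extraction, as done inside fit above (toy data)
import numpy as np

y = ['0100122', '0100121', '1100120']
col = 6
curY = np.array([int(s[col]) for s in y])
print curY   # [2 1 0] -- the digit at position 6 of every label string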
Example #2
# assumed imports for this snippet; DatasetPair, fitClassifiers, reverseDict,
# mask2DArrayByCol, printDoneTime, and writeTestingResToFile are project helpers
# defined elsewhere in the repo
from collections import Iterable
from copy import deepcopy
from pprint import pprint
from time import time
import numpy as np
from scipy.stats import mode
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score

def buildModel(data, testData, fieldMaps, n_jobs, useJJ, selectedClfs=None, colNames='all', random_states=[None],
               writeResults=True, cvNumSplits=50, test_size=0.25, verbose=False, **fitArgs):
    """
    @type data DatasetPair
    @type testData DatasetPair
    @type fieldMaps dict
    @param selectedClfs: the classifiers to run; if None, runs all classifiers
    @param colNames: if 'all', no splicing is done; otherwise a list of field names to splice by
    @return: test results
    @rtype: Iterable
    """

    if colNames != 'all':
        pprint({k: fieldMaps[k] for k in colNames if k in fieldMaps})   # iterating the string 'all' would raise KeyError
    print fieldMaps.keys()

    t0 = time()
    res_all = {}
    bestClf_by_split = {}   # {colVals: bestClf}. If bestClf is a scalar, just use it as predictions regardless of the input

    # ------- set up data -------
    if colNames=='all':
        colIndices = range(len(data.fieldNames))
        splits_all = {'all': data}
        splits_test = {'all': testData}
    elif isinstance(colNames, Iterable):
        colIndices = [data.fieldNames.index(name) for name in colNames]
        splits_all = data.spliceByColumnNames(colNames, removeColumns=True)
        splits_test = testData.spliceByColumnNames(colNames, removeColumns=True)
    else:
        raise ValueError("colNames must be 'all' or an iterable of field names; got %s." % type(colNames))


    # ------- fit classifiers -------
    for colVals in splits_all.keys():

        if colNames == 'all':
            colVal_names = 'all'
        else:
            colVal_names = tuple(reverseDict(fieldMaps[name])[colVal] if name in fieldMaps else colVal
                                 for name, colVal in zip(colNames, colVals))

        print '='*10, colVal_names, '='*10, splits_all[colVals].dataCount, 'training data.'

        if splits_all[colVals].dataCount == 0:
            if colVals not in splits_test or splits_test[colVals].dataCount == 0:
                print 'Irrelevant category. Skipping...'
            else:   # there is testing data but no training data: fall back to the mode of all training Y values
                v = type(data.Y[0])(mode(data.Y)[0][0])
                res_all[colVals] = np.repeat(v, splits_test[colVals].dataCount)
                bestClf_by_split[colVals] = v
                print 'No training data. Using the mode of all training Y values:', v
            continue
        elif colVals not in splits_test or splits_test[colVals].dataCount == 0:
            print 'No testing data for this category. Skipping...'
            continue

        # get this slice's data
        train_cur = splits_all[colVals]
        test_cur = splits_test[colVals]
        print '%s has %d training data, %d testing data' % (colVal_names, train_cur.dataCount, test_cur.dataCount)

        # fit
        if len(np.unique(train_cur.Y)) == 1 or train_cur.dataCount <= 5:  # nothing to fit if the training data has a single class or too few rows
            v = type(train_cur.Y[0])(mode(train_cur.Y)[0][0])
            print 'Using the most common class (%s) in the training data for prediction.' % v
            res_all[colVals] = np.repeat(v, test_cur.dataCount)
            bestClf_by_split[colVals] = v

        else:
            _, (bestClfName, bestClf, bestscore) = fitClassifiers(train_cur, selectedClfs=selectedClfs, random_states=random_states,
                                                                  useJJ=useJJ, n_jobs=n_jobs, overwriteSavedResult=True,
                                                                  cvSplitNum=cvNumSplits, test_size=test_size, verbose=verbose, **fitArgs)
            print '>>>>>>> The best classifier for %s is %s, with score %f.' % (colVal_names, bestClfName, bestscore)
            res_all[colVals] = bestClf.fit(*train_cur.getPair()).predict(test_cur.X)
            bestClf_by_split[colVals] = bestClf

    # ------- compute overall cv score ---------
    cvResults = []

    for randomState in random_states:
        cvObj = StratifiedShuffleSplit(data.Y, cvNumSplits, test_size=test_size, random_state=randomState)

        for trainInds, testInds in cvObj:
            if colNames=='all':
                curTrainDataSplitted = {'all': DatasetPair(data.X[trainInds], data.Y[trainInds], data.fieldNames)}
                curTestDataSplitted = {'all': DatasetPair(data.X[testInds], data.Y[testInds], data.fieldNames)}
            else:
                curTrainDataSplitted = DatasetPair(data.X[trainInds], data.Y[trainInds], data.fieldNames).spliceByColumnNames(colNames, removeColumns=True)
                curTestDataSplitted = DatasetPair(data.X[testInds], data.Y[testInds], data.fieldNames).spliceByColumnNames(colNames, removeColumns=True)

            curTotalCount = len(testInds)
            curScore = 0

            for colVals in curTrainDataSplitted.keys():
                if colVals not in bestClf_by_split or curTestDataSplitted[colVals].dataCount==0 or curTrainDataSplitted[colVals].dataCount==0:
                    continue

                trainD = curTrainDataSplitted[colVals]
                testD = curTestDataSplitted[colVals]
                clf = deepcopy(bestClf_by_split[colVals])

                if isinstance(clf, (int, float)):
                    ypred = [clf] * len(testD.Y)            # scalar fallback: predict the stored constant
                elif len(np.unique(trainD.Y)) == 1:
                    ypred = [trainD.Y[0]] * len(testD.Y)    # single-class split: predict that class directly
                else:
                    ypred = clf.fit(*trainD.getPair()).predict(testD.X)

                curScore += accuracy_score(testD.Y, ypred) * testD.dataCount / curTotalCount

            cvResults.append(curScore)

    cvScore = np.mean(cvResults)
    print 'OVERALL CV SCORE =', cvScore

    # ------- collect results -------
    if colNames=='all':
        testRes = res_all['all']
    else:
        testRes = np.repeat(99, testData.dataCount)   # 99 is a sentinel marking rows not yet predicted
        for colVals, res in res_all.iteritems():
            _, curMask = mask2DArrayByCol(testData.X, dict(zip(colIndices, colVals)))
            testRes[curMask] = res


    assert np.logical_or(testRes == 0, testRes == 1).all()    # every sentinel must have been overwritten with a 0/1 prediction

    # ------- featureSelectionOutput results -------
    if writeResults:
        writeTestingResToFile("by" + ('all' if colNames == 'all' else '_'.join(colNames)), testRes)

    print 'Total amount of time spent:'
    printDoneTime(t0)

    return testRes, cvScore
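
The OVERALL CV SCORE printed above is a data-count-weighted average of per-split accuracies, averaged again over the shuffle splits. A self-contained sketch of the weighting step (toy numbers, not from the source):

# weighted-accuracy aggregation as in the CV loop above (toy values)
splitAccuracies = [0.80, 0.60]   # accuracy within each column-value split
splitCounts = [30, 10]           # test rows falling in each split
totalCount = sum(splitCounts)

curScore = sum(acc * n / float(totalCount) for acc, n in zip(splitAccuracies, splitCounts))
print curScore   # 0.75 = (0.8*30 + 0.6*10) / 40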
Example #3
        # excerpt: inside a loop over the rows of transIndexData (outer context not shown)
        if customerId in np.array(historyAndOffers.id):
            blockTransDict[customerId] = pandas.read_csv(IterStreamer(rawData), names=transHeaders)

        # ---- finished building. run the pool for this major block
        if len(blockTransDict) == chunkSize_major or rowNum == transIndexData.shape[0]-1:

            totalTime = time() - t_dict
            print "Building transactions dict total (s):", totalTime
            print "Building transactions dict each (us/row):", 1000000. * totalTime / sum(df.shape[0] for df in blockTransDict.values())

            print '--------- Finished building. Running pool. --------'

            t0 = time()
            pool = MyPool(processes=16, initializer=initStep,
                          initargs=(historyAndOffers, compEmptyDf, blockTransDict))
            printDoneTime(t0, "Making the pool")

            t0 = time()
            poolOutputs = runPool(pool, innerFunc, chunks(blockTransDict.keys(), chunkSize_minor))
            printDoneTime(t0, 'Running the pool')

            # dump pool output to file
            for chunk in poolOutputs:
                chunk.to_csv(compressedTransFile, header=False, index=False)

            print '--- dumping ---', len(poolOutputs), sum(chunk.shape[0] for chunk in poolOutputs)

            pool.close()
            pool.join()   # close() then join() waits for the workers to finish; terminate() after a clean join is redundant
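
runPool above feeds the pool fixed-size chunks of customer ids. A minimal chunks helper consistent with that call (an assumption; the repo's actual helper may behave differently):

# assumed chunks() helper: split a sequence into pieces of at most `size` items
# (illustrative sketch; the project's own implementation may differ)
def chunks(items, size):
    items = list(items)
    return [items[i:i + size] for i in range(0, len(items), size)]

print chunks(['c1', 'c2', 'c3', 'c4', 'c5'], 2)   # [['c1', 'c2'], ['c3', 'c4'], ['c5']]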