def selectFeaturesForKeyword(keyword, threshold = 0.223):
    """Iteratively drop features for `keyword` until isImportant accepts the weights.

    Starts from all but the last two entries of Trainer.featuresList and takes
    the initial data, weights and accuracy from getTrainedWeights(keyword).
    While isImportant(weights, threshold) is false and more than one weight is
    left, the feature chosen by eliminate() is removed, the value at the
    returned position is popped from every train/test point, and a new model
    is trained and evaluated on the reduced data.

    Parameters:
        keyword: passed to getTrainedWeights and echoed in the printed summary
            and in the returned row.
        threshold: forwarded to isImportant on every check.

    Returns:
        A pair (row, weightsRow):
        row        -- [keyword] + remaining features + ['X'] + removed features
                      (in removal order) + accuracies (initial one first, then
                      one per elimination round).
        weightsRow -- the initial weights, then for every elimination round an
                      'X' separator followed by that round's weights.
    """
    import Trainer, PythonVersionHandler
    from pyspark.mllib.regression import LabeledPoint
    featureList = Trainer.featuresList[:-2]
    Trainer.setFeatureVector(featureList)
    trainData, testData, weights, accuracy = getTrainedWeights(keyword)
    removedFeatures = []
    accuracies = [accuracy]
    weightsRow = list(weights)
    while (not isImportant(weights, threshold = threshold)) and len(weights) > 1:
        index, featureList, removedFeature = eliminate(weights, featureList)
        removedFeatures.append(removedFeature)
        Trainer.setFeatureVector(featureList)
        # Freeze this round's position as a default argument. trainData/testData
        # are expected to be Spark RDDs, whose map() is lazy, so the mappers of
        # all rounds pile up in one pipeline. Closing over `index` directly would
        # make every stacked mapper read the same variable and therefore pop the
        # position of the latest round whenever the pipeline is re-evaluated.
        def getReducedVector(lp, dropIndex = index):
            newFeatures = list(lp.features)
            newFeatures.pop(dropIndex)
            return LabeledPoint(lp.label, newFeatures)
        trainData = trainData.map(getReducedVector)
        testData = testData.map(getReducedVector)
        model = Trainer.trainPairWiseData(trainData, dataName = 'TrainData')
        accuracy = Trainer.evaluateModelOnData(model, testData, dataName = 'TestData')
        accuracies.append(accuracy)
        weights = list(model.weights)
        # 'X' separates the weight vectors of consecutive rounds in the flat row.
        weightsRow.append('X')
        weightsRow.extend(weights)
    PythonVersionHandler.print_('Keyword: ' + keyword)
    PythonVersionHandler.print_('Selected features: ' + str(featureList))
    PythonVersionHandler.print_('Following features have reduced by order: ' + str(removedFeatures))
    PythonVersionHandler.print_('Accuracies from each step: ' + str(accuracies))
    # Flat summary row: keyword, kept features, 'X' separator, dropped features, accuracies.
    row = [keyword]
    row.extend(featureList)
    row.append('X')
    row.extend(removedFeatures)
    row.extend(accuracies)
    return row, weightsRow
def extractPairs():
    """Run trainTesting on the keywords from position 23 onward of ReadyTests.get27Keywords().

    The Trainer feature vector is first set to a fixed list of twelve feature
    names; a numbered, upper-cased header is logged before each keyword.
    """
    import paths
    import PythonVersionHandler
    import Trainer
    import ReadyTests
    Trainer.setFeatureVector([
        'photos', 'soldCount', 'feedbackPercentage', 'memberSoldCount',
        'memberSegment', 'subtitleFlag', 'brandNew', 'freeCargo',
        'dailyOffer', 'windowOptionFlag', 'price', 'productCount',
    ])
    selectedKeywords = ReadyTests.get27Keywords()[23:]
    # Numbering is 1-based in the log output.
    for position, keyword in enumerate(selectedKeywords, 1):
        numbering = str(position) + '.'
        header = keyword.upper() + ':'
        PythonVersionHandler.print_logging(numbering, header)
        trainTesting(keyword)
def trainingTest21():
    """Train and save results for four fixed keywords from the 'secondWeek' folder.

    Sets the Trainer feature vector to a fixed list of nine feature names, then
    calls FinalizedRunners.trainForKeyword with saving enabled for each keyword,
    using <paths.HDFSRootFolder>/secondWeek/<keyword> as the folder.
    """
    import paths
    import FinalizedRunners
    import Trainer
    Trainer.setFeatureVector([
        'photos', 'feedbackPercentage', 'memberSoldCount', 'soldCount',
        'memberSegment', 'subtitleFlag', 'brandNew', 'freeCargo',
        'windowOptionFlag',
    ])
    for keyword in ('besiktas', 'kol_saati', 'iphone_7', 'iphone_7_kilif'):
        weekFolder = paths.joinPath(paths.HDFSRootFolder, 'secondWeek')
        keywordFolder = paths.joinPath(weekFolder, keyword)
        FinalizedRunners.trainForKeyword(keyword, keywordFolder, saving = True)
def trainExtendedPairsLoop(onlyFollowings = False, AllPageButId = False):
    """Call trainExtendedPairs for every keyword of ReadyTests.get27Keywords().

    The Trainer feature vector is set to a fixed list of eleven feature names
    beforehand. After all keywords are processed, the Trainer output table is
    saved and then printed.

    Parameters:
        onlyFollowings: forwarded unchanged to trainExtendedPairs.
        AllPageButId: forwarded unchanged to trainExtendedPairs.
    """
    import paths
    import PythonVersionHandler
    import Trainer
    import ReadyTests
    Trainer.setFeatureVector([
        'photos', 'soldCount', 'feedbackPercentage', 'memberSoldCount',
        'memberSegment', 'subtitleFlag', 'brandNew', 'freeCargo',
        'dailyOffer', 'windowOptionFlag', 'sameDay',
    ])
    allKeywords = ReadyTests.get27Keywords()
    # Numbering is 1-based in the log output.
    for position, keyword in enumerate(allKeywords, 1):
        numbering = str(position) + '.'
        header = keyword.upper() + ':'
        PythonVersionHandler.print_logging(numbering, header)
        trainExtendedPairs(
            keyword,
            onlyFollowings = onlyFollowings,
            AllPageButId = AllPageButId,
        )
    Trainer.saveOutputTable()
    Trainer.printOutputTable()
def trainingTestAllLoop(feature_names):
    """Train every keyword of ReadyTests.get27Keywords() from the 'weekAugust' folder.

    For each keyword a numbered header is logged with the keyword as listed;
    spaces are then replaced by underscores before it is used as folder name and
    passed to FinalizedRunners.trainForKeyword (saving disabled). Afterwards the
    Trainer output table is printed, saved, and reset to an empty list.

    Parameters:
        feature_names: handed to Trainer.setFeatureVector before training.
    """
    import paths
    import PythonVersionHandler
    import FinalizedRunners
    import Trainer
    import ReadyTests
    Trainer.setFeatureVector(feature_names)
    # Numbering is 1-based in the log output.
    for position, rawKeyword in enumerate(ReadyTests.get27Keywords(), 1):
        numbering = str(position) + '.'
        header = rawKeyword.upper() + ':'
        PythonVersionHandler.print_logging(numbering, header)
        keyword = rawKeyword.replace(' ', '_')
        weekFolder = paths.joinPath(paths.HDFSRootFolder, 'weekAugust')
        keywordFolder = paths.joinPath(weekFolder, keyword)
        FinalizedRunners.trainForKeyword(keyword, keywordFolder, saving = False)
    Trainer.printOutputTable()
    Trainer.saveOutputTable()
    # Start the next run with an empty table.
    Trainer.outputTable = []