Example #1
def extract_audio(input_file):
    output_args = ['-y', '-vn', '-acodec', 'copy']
    output = '{}/input_audio.aac'.format(TMP_FOLDER)
    ff = ffmpy.FFmpeg(inputs={input_file: None}, outputs={output: output_args})
    try:
        ff.run()
    except Exception as e:
        line_break(3)
        print('Failed to extract audio.\n{}'.format(e))

    return output
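ffmpy is a thin wrapper around the ffmpeg binary: the inputs/outputs mappings compile to a single command line. A minimal sketch of what Example #1 expands to (assuming TMP_FOLDER names a writable directory; its definition never appears in these snippets):

import ffmpy

TMP_FOLDER = '/tmp'  # assumption: not shown in the examples

ff = ffmpy.FFmpeg(
    inputs={'input.mp4': None},
    outputs={'{}/input_audio.aac'.format(TMP_FOLDER): ['-y', '-vn', '-acodec', 'copy']})
# ffmpy exposes the compiled command line before executing it
print(ff.cmd)
# ffmpeg -i input.mp4 -y -vn -acodec copy /tmp/input_audio.aac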
Example #2
def increase_audio(input_audio, amt):
    output_args = ['-y', '-af', 'volume=3, bass=g=5, treble=g=-10']
    audio_file = None
    for idx in xrange(0, amt):
        input_file = audio_file or input_audio
        output = '{}/tmp_audio_{}.wav'.format(TMP_FOLDER, idx)
        ff = ffmpy.FFmpeg(inputs={input_file: None},
                          outputs={output: output_args})
        try:
            ff.run()
            audio_file = output
        except Exception as e:
            line_break(3)
            print('Failed to increase audio.\n{}'.format(e))

    return audio_file
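Each pass feeds its own previous output back in, so the filter chain compounds with every iteration; a quick sanity check of the gain (pure arithmetic, ignoring clipping and the EQ filters):

amt = 3
per_pass_gain = 3.0                # the volume=3 filter above
total_gain = per_pass_gain ** amt  # 27x amplitude after three passes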
Example #3
def deep_fry_video(input_file, video_dip):
    emojified_video = add_random_emojis(input_file)
    inputs = create_inputs(emojified_video)

    output_args = create_base_args() + create_filter_args()

    for idx in xrange(0, video_dip):
        output = '{}/deep_fried_{}.mp4'.format(TMP_FOLDER, idx)
        outputs = create_outputs(output, output_args)

        ff = ffmpy.FFmpeg(inputs=inputs, outputs=outputs)
        try:
            ff.run()
            inputs = create_inputs(output)
        except Exception as e:
            line_break(3)
            print('Failed to deep fry video.\n{}'.format(e))

    return output
Example #4
def print_model_header(model):
    """Print header with model info
    """
    features_list = sorted(map(str, model.features.keys()))
    log.info(utils.line_break())
    log.info('MODEL: %s    SEGMENT: %s    TARGET: %s' % (model.model_name, model.segment, model.target))
    log.info('FEATURES: %s' % features_list)
    log.info('ESTIMATOR CLASS: %s' % model.estimator)
    log.info('POST-PROCESS SCALAR: %s' % model.postprocess_scalar)
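A minimal stand-in to exercise this helper (purely illustrative; in this project the real objects are ensembles.Model instances, as constructed in Example #12):

class FakeModel(object):
    model_name = 'gbm_views'   # hypothetical values throughout
    segment = 'remote_api_created'
    target = 'num_views'
    features = {'description_length': None, 'weekday': None}
    estimator = 'GradientBoostingRegressor'
    postprocess_scalar = 1.0

print_model_header(FakeModel())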
Example #5
def add_random_emojis(input_file):
    """
    Overlays emojis at random angles, size, durations, and start frames over a
    given input file. The amount of emojis is based on input file length.
    """
    emoji_filters = create_emoji_filters(input_file)
    inputs = create_inputs(input_file, emoji_filters)

    output_args = ['-an'] + create_base_args()

    output_args += ['-filter_complex', ''.join(emoji_filters)]

    tmp_output = '{}/emojied_video.mp4'.format(TMP_FOLDER)
    outputs = create_outputs(tmp_output, output_args)

    ff = ffmpy.FFmpeg(inputs=inputs, outputs=outputs)
    try:
        ff.run()
        return tmp_output
    except Exception as e:
        line_break(3)
        print('Failed to add emojis.\n{}'.format(e))
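create_emoji_filters isn't shown here, but -filter_complex accepts a graph of chained filters; a hedged sketch of the kind of fragment it might produce for one emoji (stream labels, sizes, and timings all hypothetical):

# Scale and rotate the emoji input, then composite it over the main video
# for a short window; between(t,3,7) limits the overlay to that time span.
emoji_filter = (
    "[1:v]scale=96:96,rotate=0.6[em0];"
    "[0:v][em0]overlay=x=120:y=340:enable='between(t,3,7)'[v0]"
)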
Example #6
def create_final_video(fried_video, boosted_audio, output_file):
    inputs = OrderedDict([
        (fried_video, None),
        (boosted_audio, None),
    ])
    outputs = OrderedDict([
        (output_file, ['-y', '-vcodec', 'libx264']),
    ])
    ff = ffmpy.FFmpeg(inputs=inputs, outputs=outputs)
    try:
        ff.run()
        line_break(3)
        print('Successfully deep fried video at {}!'.format(output_file))
        line_break(3)
        return output_file
    except Exception as e:
        line_break(3)
        print('Failed to create final video.\n{}'.format(e))
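Taken together, the helpers above form one pipeline; a sketch of how they chain (file names and pass counts hypothetical):

audio = extract_audio('cat_video.mp4')                    # Example #1
boosted = increase_audio(audio, 3)                        # Example #2
fried = deep_fry_video('cat_video.mp4', 5)                # Example #3, which calls add_random_emojis
create_final_video(fried, boosted, 'deep_fried_cat.mp4')  # mux fried video with boosted audio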
Example #7
def cross_validate_temporal(mtxTrn,mtxTest,mtxTrnTarget,mtxTestTarget,model):
    start_time = datetime.now()
    log.info('Temporal CV started at: %s' % (datetime.now().strftime('%m-%d-%y %H:%M')))
    utils.line_break()
    train_cv = mtxTrn
    test_cv = mtxTest
    y_target = mtxTrnTarget
    y_true = mtxTestTarget
    #If target variable has been transformed, transform y_true back to normal state for comparison to predictions
    y_true = [np.exp(x)-1 for x in y_true]
    #--------Hyperparameter optimization---------#
    #Make predictions
    try:
        model.estimator.fit(train_cv, y_target)
        preds = model.estimator.predict(test_cv)
    except TypeError:
        model.estimator.fit(train_cv.todense(), y_target)
        preds = model.estimator.predict(test_cv.todense())
    #----------Post processing rules----------#
    #If target variable has been transformed, transform predictions back to original state
    preds = [np.exp(x)-1 for x in preds]
    #Apply scalar
    if model.postprocess_scalar != 1:
        preds = [x*model.postprocess_scalar for x in preds]
    #set <0 predictions to 0 if views or comments, set <1 predictions to 1 if votes
    if model.target == 'num_votes':
        preds = [1 if x < 1 else x for x in preds]
    else:
        preds = [0 if x < 0 else x for x in preds]
    ##score the prediction by measuring the error using the chosen error metric
    score = ml_metrics.rmsle(y_true, preds)
    finish_time = datetime.now()
    log.info('Error Measure: %f' % score)
    log.info('Prediction metrics: mean=%f, std dev=%f, min/max= %f/%f' %
             (np.mean(preds), np.std(preds), np.min(preds), np.max(preds)))
    utils.line_break()
    log.info('Temporal CV completed at: %s.  Total runtime: %s' \
          % (datetime.now().strftime('%m-%d-%y %H:%M'),str(finish_time-start_time)))
    utils.line_break()
    return preds
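The np.exp(x)-1 lines undo a log(y+1) transform applied to the target upstream; numpy's log1p/expm1 pair expresses the same round trip (and is numerically safer for small values):

import numpy as np

y = np.array([0.0, 3.0, 120.0])
transformed = np.log1p(y)         # what the pipeline trains on: log(y + 1)
restored = np.expm1(transformed)  # equivalent to the np.exp(x) - 1 above
assert np.allclose(restored, y)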
Example #8
def cross_validate_using_benchmark(benchmark_name, dfTrn, mtxTrn,mtxTarget,model,folds=5,SEED=42,test_size=.15):
    fold_scores = []
    SEED = SEED *  time.localtime().tm_sec
    start_time = datetime.now()
    log.info('Benchmark CV started at: %s' % (datetime.now().strftime('%m-%d-%y %H:%M')))
    utils.line_break()
    for i in range(folds):
        #For each fold, create a test set (test_holdout) by randomly holding out X% of the data as CV set, where X is test_size (default .15)
        train_cv, test_cv, y_target, y_true = cross_validation.train_test_split(mtxTrn, mtxTarget, test_size=test_size, random_state=SEED*i+10)
        #If target variable has been transformed, transform y_true back to normal state for comparison to predictions
        y_true = [np.exp(x)-1 for x in y_true]
        #Calc benchmarks and use them to make a prediction
        benchmark_preds = 0
        if benchmark_name == 'global_mean':
            benchmark_preds = [13.899 for x in test_cv]
        if benchmark_name == 'all_ones':
            benchmark_preds = [1 for x in test_cv]
        if benchmark_name == '9999':
            benchmark_preds = [9999 for x in test_cv]
        log.info('Using benchmark %s:' % (benchmark_name))
        #For this CV fold, measure the error
        score = ml_metrics.rmsle(y_true, benchmark_preds)
        #print score
        fold_scores += [score]
        log.info('RMSLE (fold %d/%d): %f' % (i + 1, folds, score))

    ##Now that folds are complete, calculate and print the results
    finish_time = datetime.now()
    log.info('Prediction metrics: mean=%f, std dev=%f, min/max= %f/%f' %
             (np.mean(fold_scores), np.std(fold_scores), np.min(fold_scores), np.max(fold_scores)))
    utils.line_break()
    log.info('CV completed at: %s.  Total runtime: %s' % (datetime.now().strftime('%m-%d-%y %H:%M'),
                                                       str(finish_time-start_time)))
    utils.line_break()
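For reference, ml_metrics.rmsle computes root mean squared logarithmic error; a minimal equivalent under the standard definition:

import numpy as np

def rmsle(actual, predicted):
    # sqrt(mean((log(p + 1) - log(a + 1))^2))
    a, p = np.asarray(actual, dtype=float), np.asarray(predicted, dtype=float)
    return np.sqrt(np.mean((np.log1p(p) - np.log1p(a)) ** 2))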
Example #9
def cross_validate_kfold(mtxTrn,mtxTarget,model,folds=5,SEED=42,test_size=.15,pred_fg='false'):
    fold_scores = []
    SEED = SEED *  time.localtime().tm_sec
    start_time = datetime.now()
    log.info('K-Fold CV started at: %s' % (datetime.now().strftime('%m-%d-%y %H:%M')))
    utils.line_break()
    #If predictions are wanted, initialize the dict so that its length will match all records in the training set,
    #even if not all records are predicted during the CV (randomness is a bitch)
    if pred_fg == 'true':
        cv_preds = {key[0]:[] for key in mtxTrn.getcol(0).toarray()}
    for i in range(folds):
        ##For each fold, create a test set (test_cv) by randomly holding out test_size% of the data as CV set
        train_cv, test_cv, y_target, y_true = \
           cross_validation.train_test_split(mtxTrn, mtxTarget, test_size=test_size, random_state=i*SEED+1)
        #If target variable has been transformed, transform y_true back to normal state for comparison to predictions
        y_true = [np.exp(x)-1 for x in y_true]
        #if predictions are wanted, slice off the first column from the train and test CV sets; that column contains the ID
        if pred_fg == 'true':
            #TODO: create dense matrix copies for the clf's that only use dense matrices
            train_cv = sparse.csr_matrix(train_cv)[:,1:]
            test_cv2 = sparse.csr_matrix(test_cv)  #keep the ID column so predictions can be keyed by ID
            test_cv = sparse.csr_matrix(test_cv)[:,1:]
        #----------Hyperparameter optimization------#
        try:
            model.estimator.fit(train_cv, y_target)
            preds = model.estimator.predict(test_cv)
        except TypeError:
            model.estimator.fit(train_cv.todense(), y_target)
            preds = model.estimator.predict(test_cv.todense())
        #----------Post processing rules----------#
        #If target variable has been transformed, transform predictions back to original state
        preds = [np.exp(x)-1 for x in preds]
        #Apply scalar
        if model.postprocess_scalar != 1:
            preds = [x*model.postprocess_scalar for x in preds]
        #set <0 predictions to 0 if views or comments, set <1 predictions to 1 if votes
        if model.target == 'num_votes':
            preds = [1 if x < 1 else x for x in preds]
        else:
            preds = [0 if x < 0 else x for x in preds]
        ##For each fold, score the prediction by measuring the error using the chosen error metric
        score = ml_metrics.rmsle(y_true, preds)
        fold_scores += [score]
        log.info('RMSLE (fold %d/%d): %f' % (i + 1, folds, score))
        ##If we want to record predictions, then for each fold add the predictions to the cv_preds dict for later output
        if pred_fg == 'true':
            ids = test_cv2.getcol(0).toarray()
            for j in range(test_cv2.shape[0]):
                cv_preds.setdefault(ids[j][0], []).append(preds[j])
    ##Now that folds are complete, calculate and print the results
    finish_time = datetime.now()
    log.info('Prediction metrics: mean=%f, std dev=%f, min/max= %f/%f' %
             (np.mean(fold_scores), np.std(fold_scores), np.min(fold_scores), np.max(fold_scores)))
    utils.line_break()
    log.info('K-Fold CV completed at: %s.  Total runtime: %s' % (datetime.now().strftime('%m-%d-%y %H:%M'),
                                                              str(finish_time-start_time)))
    utils.line_break()
    if pred_fg == 'true':
        return cv_preds
Example #10
def main():
    #---Load environment settings from SETTINGS.json in root directory and build filepaths for all base submissions---#
    settings = utils.load_settings('SETTINGS.json')
    base_filepaths = (settings['file_bryan_submission'],
                      settings['file_miroslaw_submission'])
    segment_weights = settings['ensemble_segment_weights']
    segments = segment_weights.keys()
    targets = segment_weights[segments[0]].keys()

    #---Output the segment weights to be used for ensemble averaging of base submissions---#
    log.info('==========ENSEMBLE WEIGHTS (B,M)============')
    for segment in segment_weights:
        log.info(segment.upper()+':')
        for target in segment_weights[segment]:
            log.info('    '+target.upper()+' -- ['+segment_weights[segment][target]['0']+','+
                      segment_weights[segment][target]['1']+']')

    #---Load each base submission to a list of dataframes---#
    base_subs = []
    for file in base_filepaths:
        try:
            base_subs.append(pd.read_csv(file).set_index(['id'], drop=False).sort())
            log.info('Base submission successfully loaded: %s.' % file)
        except IOError:
            log.info('Base submission file does not exist: %s. Run base model to generate, or update filepath.' %file)
            sys.exit('---Exiting---')

    utils.line_break()

    #---Load id's labeled with segments to a dataframe used for segment based averaging---#
    file = settings['file_segment_ids']
    try:
        segment_ids = pd.read_csv(file)
        log.info('Segment IDs successfully loaded from: %s.' % file)
    except IOError:
        log.info('Segment IDs file does not exist: %s. Update filepath in SETTINGS.json.' % file)
        sys.exit('---Exiting---')
    utils.line_break()

    #---Transform base predictions to log space prior to averaging, if selected in settings---#
    if settings['avg_log_space'] == 'y':
        log.info('Transforming base predictions to log space prior to averaging.')
        for i in range(len(base_subs)):
            for target in targets:
                base_subs[i][target] = np.log(base_subs[i][target]+1)
        utils.line_break()

    #---Apply segment based weights to each base submission then combine them to create ensemble submission---#
    log.info('Applying segment weights to base submissions then combining to create ensemble.')
    for i in range(len(base_subs)):
        #Merge the segment labels from the segment id's file with the base submission dataframe
        base_subs[i] = base_subs[i].merge(segment_ids,on='id',how='inner')
        for segment in segments:
            for target in targets:
                base_subs[i][target][base_subs[i]['Segment'] == segment] \
                    *= float(segment_weights[segment][target][str(i)])
        del base_subs[i]['Segment']
    ensemble_sub = base_subs[0].ix[:]
    for i in range(len(base_subs)-1):
        for target in targets:
            ensemble_sub[target] += base_subs[i+1][target]
    utils.line_break()

    #---Transform ensemble predictions back to normal, if use log space averaging was selected in settings---#
    if settings['avg_log_space'] == 'y':
        log.info('Transforming ensemble predictions back to normal from log space.')
        for target in targets:
            ensemble_sub[target] = np.exp(ensemble_sub[target])-1
        utils.line_break()

    #---Apply any final target scalars to ensemble predictions---#
    for target in targets:
        ensemble_sub[target] *= float(settings['target_scalars'][target])

    #---Output ensemble submission to directory set in SETTINGS.json, appending creation date and time---#
    timestamp = datetime.now().strftime('%m-%d-%y_%H%M')
    filename = settings['dir_ensemble_submissions']+'ensemble_predictions_'+timestamp+'.csv'
    ensemble_sub.to_csv(filename, index=False)
    log.info('Ensemble submission saved: %s' % filename)
    utils.line_break()

    #End main
    log.info('Program executed successfully without error! Exiting.')
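main() reads only a handful of keys from SETTINGS.json; a hedged sketch of the dict utils.load_settings must return for this script, inferred from the lookups above (all paths and weights illustrative):

settings = {
    'file_bryan_submission': 'Submits/bryan_test_predictions.csv',       # hypothetical paths
    'file_miroslaw_submission': 'Submits/miroslaw_test_predictions.csv',
    'file_segment_ids': 'Data/segment_ids.csv',
    'dir_ensemble_submissions': 'Submits/',
    'avg_log_space': 'y',
    'target_scalars': {'num_views': '1.0', 'num_votes': '1.0', 'num_comments': '1.0'},
    'ensemble_segment_weights': {
        'remote_api_created': {                     # one block per segment
            'num_views': {'0': '0.6', '1': '0.4'},  # weights for the (Bryan, Miroslaw) base subs
            'num_votes': {'0': '0.5', '1': '0.5'},
            'num_comments': {'0': '0.5', '1': '0.5'},
        },
    },
}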
Example #11
def setup_repo():
  """Set up repository / checkout"""
  timestamp = run("date '+%Y%m%d%H%M%S'")
  timestamp_with_dots = timestamp[0:len(timestamp) - 2] + '.' + timestamp[len(timestamp)-2:len(timestamp)]

  sha = local('git ls-remote %(repo)s %(branch)s' % {'repo': env.repository, 'branch': env.branch}, capture=True)
  sha = sha.split("\t")[0]

  cache_dir = '%(app_dir)s/shared/cached-copy' % {'app_dir': env.applicationdir}
  deploy_dir = '%(app_dir)s/releases/%(timestamp)s' % {'app_dir': env.applicationdir, 'timestamp': timestamp}
  geo_lite_file = '%(app_dir)s/shared/config/GeoLiteCity.dat' % {'app_dir': env.applicationdir}

  if(files.exists(cache_dir)):
    with cd(cache_dir):
      run('git fetch -q origin')
      run('git reset -q --hard %(sha)s' % {'sha': sha})
      run('git clean -q -d -x -f')
  else:
    run('git clone -q %(repo)s %(cache_dir)s' % {'repo': env.repository, 'cache_dir': cache_dir})
    with cd('%(cache_dir)s' % {'cache_dir': cache_dir}):
      run('git checkout -q -b %(user)s %(sha)s;' % {'sha': sha, 'user': env.user})

  with cd(cache_dir):
    run('cp -RPp %(cache_dir)s %(deploy_dir)s' % {'cache_dir': cache_dir, 'deploy_dir': deploy_dir})
    run('echo %(sha)s > %(deploy_dir)s/REVISION' % {'sha': sha, 'deploy_dir': deploy_dir})

  with cd(deploy_dir):
    run('bundle install --gemfile %(deploy_dir)s/Gemfile --path %(app_dir)s/shared/bundle --deployment --quiet --without development test cucumber' % {'deploy_dir': deploy_dir, 'app_dir': env.applicationdir})
    run('./script/gem_downgrade_time')

  run('chmod -R g+w %(deploy_dir)s' % {'deploy_dir': deploy_dir})
  run('rm -rf %(deploy_dir)s/log %(deploy_dir)s/public/system %(deploy_dir)s/tmp/pids' % {'deploy_dir': deploy_dir})
  run('mkdir -p %(deploy_dir)s/public' % {'deploy_dir': deploy_dir})
  run('mkdir -p %(deploy_dir)s/tmp' % {'deploy_dir': deploy_dir})
  run('ln -s %(app_dir)s/shared/log %(deploy_dir)s/log' % {'app_dir': env.applicationdir, 'deploy_dir': deploy_dir})
  run('ln -s %(app_dir)s/shared/system %(deploy_dir)s/public/system' % {'app_dir': env.applicationdir, 'deploy_dir': deploy_dir})
  run('ln -s %(app_dir)s/shared/pids %(deploy_dir)s/tmp/pids' % {'app_dir': env.applicationdir, 'deploy_dir': deploy_dir})
  run("find %(deploy_dir)s/public/images %(deploy_dir)s/public/stylesheets %(deploy_dir)s/public/javascripts -exec touch -t %(timestamp_with_dots)s {} ';'; true" % {'deploy_dir': deploy_dir, 'timestamp': timestamp, 'timestamp_with_dots': timestamp_with_dots})

  if(env.env == 'production'):
    with cd(deploy_dir):
      run('bundle exec whenever --clear-crontab %(app_name)s' % {'app_name': env.application})

  if(not files.exists(geo_lite_file)):
    utils.line_break()
    print("ERROR: GeoLiteCity file doesn't exist: %(file)s" % {'file': geo_lite_file})
    utils.line_break()
    return False

  fs = [
    {'file': 'shards-replication.yml', 'final_file': 'shards.yml'},
    {'file': 'database.yml'},
    {'file': 'core.yml'},
    {'file': 'authorize_net.yml'},
    {'file': 'braintree.yml'},
    {'file': 'google_maps.yml'},
    {'file': 'server.yml'},
    {'file': 's3.yml'},
    {'file': 'GeoLiteCity.dat'},
    {'file': 'unicorn.rb'}
  ]

  for f in fs:
    try:
      final = f['final_file']
    except KeyError:
      final = f['file']
    run('ln -nfs %(app_dir)s/shared/config/%(f)s %(deploy_dir)s/config/%(final)s' % {'f': f['file'], 'final': final, 'app_dir': env.applicationdir, 'deploy_dir': deploy_dir})

  run('ln -nfs %(app_dir)s/shared/cache %(deploy_dir)s/public/cache' % {'app_dir': env.applicationdir, 'deploy_dir': deploy_dir})
  run('ls -x %(app_dir)s/releases' % {'app_dir': env.applicationdir})

  with cd(deploy_dir):
    run('bundle exec rake RAILS_ENV=%(state)s db:migrate compass:compile db:seed 1> /dev/null' % {'state': env.env})
    run('ln -sf %(deploy_dir)s %(app_dir)s/current' % {'deploy_dir': deploy_dir, 'app_dir': env.applicationdir})
    run('bundle exec jammit')
    run('cp public/robots_disallow.txt public/robots.txt')
    run('rm -f %(app_dir)s/current' % {'app_dir': env.applicationdir})
    run('ln -s %(deploy_dir)s %(app_dir)s/current' % {'deploy_dir': deploy_dir, 'app_dir': env.applicationdir})
  
  with cd('%(deploy_dir)s/..' % {'deploy_dir': deploy_dir}):
    to_delete = 5
    dirs = run("ls -ltr | awk '{print $8}'").split('\n')
    total = len(dirs)
    if(total > to_delete):
      del_dirs = dirs[0:total - to_delete]
      for d in del_dirs:
        d = d[0:len(d)-1]  # drop the trailing character (carriage return) left on each line of remote output
        run('rm -Rf %(dd)s' % {'dd': d})

  with cd('%(app_dir)s/current' % {'app_dir': env.applicationdir}):
    run('bundle exec whenever --update-crontab %(app_name)s --set environment=%(state)s' % {'state': env.env, 'app_name': env.application})

  if(env.env == 'staging'):
    utils.line_break()
    print(red("Killing unicorns, the bastards..."))
    utils.line_break()
    with settings(warn_only=True):
      run('pkill -KILL -f unicorn')
      run('pkill -KILL -f delayed')
      if(files.exists('%(app_dir)s/current/config/unicorn/%(state)s.rb' % {'state': env.env, 'app_dir': env.applicationdir})):
        with cd('%(app_dir)s/current' % {'app_dir': env.applicationdir}):
          run('BUNDLE_GEMFILE=%(app_dir)s/current/Gemfile bundle exec unicorn_rails -c %(app_dir)s/current/config/unicorn/%(state)s.rb -E %(state)s -D' % {'app_dir': env.applicationdir, 'state': env.env})
  elif(env.env == 'production'):
    if(files.exists('%(app_dir)s/current/tmp/pids/unicorn.pid' % {'app_dir': env.applicationdir})):
      print("PRODUCTION UNICORN RELOAD VOILA")

  with cd('%(app_dir)s' % {'app_dir': env.applicationdir}):
    with cd('%(app_dir)s/current' % {'app_dir': env.applicationdir}):
      run('bundle exec rake page_cache:refresher:disable_all cache:clear_rescue cache:clear_storehouse dj:enable dj:start RAILS_ENV=%(state)s' % {'state': env.env})
      run('rm -fr shared/cache/*')
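setup_repo assumes a handful of Fabric env attributes configured elsewhere in the fabfile (typically in per-stage tasks); a hedged sketch of that configuration, with every value illustrative:

from fabric.api import env

env.user = 'deploy'                         # hypothetical values; the real ones
env.repository = 'git@example.com:app.git'  # live in the fabfile's stage tasks
env.branch = 'master'
env.application = 'app'
env.applicationdir = '/var/www/app'
env.env = 'staging'                         # or 'production'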
Example #12
def main():
    log.info('********New program instance started********')

    #-------------Load Environment----------------------#
    #Get program settings and model settings from SETTINGS.json file in root directory
    settings, model_settings = utils.load_settings()

    #If not using cached data, then load raw data, clean/munge it, create hand-crafted features, slice it for CV
    if settings['use_cached_data'] == 'y':
        log.info('==========LOADING CACHED FEATURES===========')
        dfTrn = data_io.load_cached_object('dfTrn')
        dfTest = data_io.load_cached_object('dfTest')
        dfCV = data_io.load_flatfile_to_df('Data/CV.csv')
    else:
        #-------Data Loading/Cleaning/Munging------------#
        #Load the data
        log.info('===============LOADING DATA=================')
        dfTrn = data_io.load_flatfile_to_df(settings['file_data_train'])
        dfTest = data_io.load_flatfile_to_df(settings['file_data_test'])
        dfCV = data_io.load_flatfile_to_df('Data/CV.csv')

        #Clean/Munge the data
        log.info('=======CLEANING AND MUNGING DATA============')
        dfTrn = munge.clean(dfTrn)
        dfTest = munge.clean(dfTest)

        #-------Feature creation-------------------------#
        #Add all currently used hand crafted features to dataframes
        log.info('====CREATING HAND-CRAFTED DATA FEATURES=====')
        features.add(dfTrn)
        features.add(dfTest)

        #---------Data slicing/parsing--------------------------#
        #Split data for CV
        if settings['generate_cv_score'] == 'y':
            log.info('=====SPLITTING DATA FOR CROSS-VALIDATION====')
            if settings['cv_method'] == 'april':
                dfTrnCV, dfTestCV = munge.temporal_split(dfTrn, (2013, 4, 1))
            elif settings['cv_method'] == 'march':
                #take an additional week from February b/c of lack of remote_api source issues in March
                dfTrnCV, dfTestCV = munge.temporal_split(dfTrn, (2013, 2, 21))
            elif settings['cv_method'] == 'list_split':
                #load stored list of data points and use those for CV
                dfCVlist = pd.DataFrame({'id': data_io.load_cached_object('Cache/cv_issue_ids.pkl'), 'dummy': 0})
                dfTrnCV, dfTestCV = munge.list_split(dfTrn, dfCVlist)

    #--------------Modeling-------------------------#
    #If cached models exist then load them for reuse into segment_models.  Then run through model_settings and for
    # each model where 'use_cached_model' is false then clear the cached model and recreate it fresh
    log.info('=========LOADING CACHED MODELS==============')
    segment_models = data_io.load_cached_object('segment_models')
    if segment_models is None:
        log.info('=========CACHED MODELS NOT LOADED===========')
        for model in model_settings:
            model_settings[model]['use_cached_model'] = 'n'
        segment_models = []
    #Initialize new model for models not set to use cache
    log.info('=======INITIALIZING UN-CACHED MODELS========')
    index = 0
    for model in model_settings:
        if model_settings[model]['use_cached_model'] == 'n':
            new_model = ensembles.Model(model_name=model,target=model_settings[model]['target'],
                                        segment=model_settings[model]['segment'],
                                        estimator_class=model_settings[model]['estimator_class'],
                                        estimator_params=model_settings[model]['estimator_params'],
                                        features=model_settings[model]['features'],
                                        postprocess_scalar=model_settings[model]['postprocess_scalar'])
            #Flag the model as not cached, so that it does not get skipped when running the modeling process
            new_model.use_cached_model='n'
            #Project specific model attributes not part of base class
            new_model.KNN_neighborhood_threshold=model_settings[model]['KNN_neighborhood_threshold']
            new_model.sub_zip_neighborhood=model_settings[model]['sub_zip_neighborhood']
            if index < len(segment_models):
                segment_models[index] = new_model
            else:
                segment_models.append(new_model)
            log.info('Model %s initialized at index %i' % (model, index))
        index += 1

    #Cross validate all segment models (optional)
    if settings['export_cv_predictions_all_models'] == 'y' or settings['export_cv_predictions_new_models'] == 'y':
        log.info('============CROSS VALIDATION================')
        for model in segment_models[:]:
            #If model has cached CV predictions then skip predicting and just export them (if selected in settings)
            if hasattr(model,'dfCVPredictions'):
                log.info('Cached CV predictions found.  Using cached CV predictions.')
                if settings['export_cv_predictions_all_models'] == 'y':
                    data_io.save_predictions(model.dfCVPredictions,model.target,model_name=model.model_name,
                                             directory=settings['dir_submissions'],
                                             estimator_class=model.estimator_class, note='CV_list')
            else:
                print_model_header(model)
                #Prepare segment model:  segment and create feature vectors for the CV data set
                dfTrn_Segment, dfTest_Segment = prepare_segment_model(dfTrnCV,dfTestCV,model)
                #Generate CV predictions
                train.cross_validate(model, settings, dfTrn_Segment, dfTest_Segment)
                #Cache the CV predictions as a dataframe stored in each segment model
                model.dfCVPredictions = dfTest_Segment.ix[:,['id',model.target]]
                if settings['export_cv_predictions_new_models'] == 'y':
                    data_io.save_predictions(model.dfCVPredictions,model.target,model_name=model.model_name,
                                             directory=settings['dir_submissions'],
                                             estimator_class=model.estimator_class, note='CV_list')

    #Generate predictions on test set for all segment models (optional)
    if settings['export_predictions_all_models'] == 'y' or settings['export_predictions_new_models'] == 'y'\
        or settings['export_predictions_total'] == 'y':
        log.info('=======GENERATING TEST PREDICTIONS==========')
        for model in segment_models[:]:
            #If model has cached test predictions then skip predicting and just export them (if selected in settings)
            if hasattr(model,'dfPredictions'):
                log.info('Cached test predictions found for model %s.  Using cached predictions.' % model.model_name)
                if settings['export_predictions_all_models'] == 'y':
                    data_io.save_predictions(model.dfPredictions,model.target,model_name=model.model_name,
                             directory=settings['dir_submissions'],
                             estimator_class=model.estimator_class,note='TESTset')
            else:
                print_model_header(model)
                #Prepare segment model:  segment and create feature vectors for the full TEST data set
                dfTrn_Segment, dfTest_Segment = prepare_segment_model(dfTrn,dfTest,model)
                #Generate TEST set predictions
                model.predict(dfTrn_Segment, dfTest_Segment)
                if settings['export_predictions_all_models'] == 'y' or settings['export_predictions_new_models'] == 'y':
                    data_io.save_predictions(model.dfPredictions,model.target,model_name=model.model_name,
                                             directory=settings['dir_submissions'],
                                             estimator_class=model.estimator_class,note='TESTset')
                log.info(utils.line_break())

    #Cache the trained models and predictions to file (optional)
    if settings['export_cached_models'] == 'y':
        log.info('==========EXPORTING CACHED MODELS===========')
        data_io.save_cached_object(segment_models,'segment_models')

    #Merge each segment model's CV predictions into a master dataframe and export it (optional)----#
    if settings['export_cv_predictions_total'] == 'y':
        log.info('====MERGING CV PREDICTIONS FROM SEGMENTS====')
        dfTestPredictionsTotal = merge_segment_predictions(segment_models, dfTestCV, cv=True)
        #---Apply post process rules to master dataframe---#
        #Set all votes and comments for remote_api segment to 1 and 0
        dfTestPredictionsTotal = dfTestPredictionsTotal.merge(dfTest.ix[:][['source','id']], on='id', how='left')
        for x in dfTestPredictionsTotal.index:
            if dfTestPredictionsTotal.source[x] == 'remote_api_created':
                dfTestPredictionsTotal.num_votes[x] = 1
                dfTestPredictionsTotal.num_comments[x] = 0
        #Export
        timestamp = datetime.now().strftime('%m-%d-%y_%H%M')
        filename = 'Submits/'+timestamp+'--bryan_CV_predictions.csv'
        dfTestPredictionsTotal.to_csv(filename)


    #Merge each segment model's TEST predictions into a master dataframe and export it (optional)----#
    if settings['export_predictions_total'] == 'y':
        log.info('===MERGING TEST PREDICTIONS FROM SEGMENTS===')
        dfTestPredictionsTotal = merge_segment_predictions(segment_models, dfTest)
        #---Apply post process rules to master dataframe---#
        #Set all votes and comments for remote_api segment to 1 and 0
        dfTestPredictionsTotal = dfTestPredictionsTotal.merge(dfTest.ix[:][['source','id']], on='id', how='left')
        for x in dfTestPredictionsTotal.index:
            if dfTestPredictionsTotal.source[x] == 'remote_api_created':
                dfTestPredictionsTotal.num_votes[x] = 1
                dfTestPredictionsTotal.num_comments[x] = 0
        del dfTestPredictionsTotal['source']
        #Export
        filename = 'bryan_test_predictions.csv'
        data_io.save_combined_predictions(dfTestPredictionsTotal, settings['dir_submissions'], filename)

    #End main
    log.info('********Program ran successfully. Exiting********')
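Each entry in model_settings drives one ensembles.Model; a hedged sketch of the per-model dict this loop expects, with keys taken from the lookups above and every value illustrative:

model_settings = {
    'gbm_views_remote_api': {                 # hypothetical model name
        'use_cached_model': 'n',
        'target': 'num_views',
        'segment': 'remote_api_created',
        'estimator_class': 'GradientBoostingRegressor',
        'estimator_params': {'n_estimators': 300, 'learning_rate': 0.05},
        'features': {'description_length': None, 'weekday': None},
        'postprocess_scalar': 1.0,
        'KNN_neighborhood_threshold': 0,      # project-specific extras
        'sub_zip_neighborhood': 'n',
    },
}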