Code example #1
0
def run_MLA(XX,XXpredict,yy,yypredict,unique_IDS_tr,unique_IDS_pr,uniquetarget_tr,uniquetarget_pr,n_feat,ind_run_name,n_run):
    """Train a random-forest classifier and predict on a hold-out catalogue.

    Two back-ends are supported, selected by ``settings.pyspark_on``:
    a PySpark ``RandomForestClassifier`` pipeline (cross-node HPC runs),
    or a scikit-learn-style estimator pulled from ``settings.MLA``.

    Parameters
    ----------
    XX, yy : training feature array and training answers.
    XXpredict, yypredict : predict feature array and its true answers.
        The answers are stacked as the last column(s) of the feature arrays,
        which is why predictions slice ``[:, 0:n_feat]``.
    unique_IDS_tr, unique_IDS_pr, uniquetarget_tr, uniquetarget_pr :
        ID/label bookkeeping, passed through to ``run_opts.diagnostics``
        and used for tree-plot class names.
    n_feat : number of genuine feature columns.
    ind_run_name : run label; a name containing 'OvsA' suppresses tree
        plots and treeinterpreter contribution extraction.
    n_run : run index (not referenced in this body).

    Returns
    -------
    tuple
        (result, feat_importance, probs, bias, contributions, accuracy,
        recall, precision, score, clf, train_contributions).

    NOTE(review): in the PySpark branch, ``recall``, ``precision``,
    ``score``, ``feat_importance``, ``bias``, ``contributions``, ``clf``
    and ``train_contributions`` are never assigned, so the logging after
    the branch and the final return would raise NameError — confirm
    whether that branch is exercised end-to-end.

    NOTE(review): relies on module-level names not visible here:
    ``settings``, ``logger``, ``numpy``, ``time``, ``os``, ``tree``,
    ``metrics``, ``ti`` (treeinterpreter), ``run_opts``, ``temp_train``,
    ``temp_pred``, ``feat_names``, ``traindatanum``, ``predictdatanum``
    and ``n`` — verify they are defined at call time.
    """
    logger.info('Starting MLA run')
    logger.info('------------')
    if settings.pyspark_on == 1:                # Use pyspark or not? Pyspark makes cross node (HPC) calculation possible.
        from pyspark import SparkContext        # It's slower, manages resources between nodes using HTTP. 
        from pyspark.sql import SQLContext      # So far, it does not include feature importance outputs.
        from pyspark.ml import Pipeline         # I would have to program feature importances myself. May be time consuming.
        from pyspark.ml.feature import VectorAssembler
        from pyspark.ml.classification import RandomForestClassifier
        from pyspark.ml.feature import StringIndexer, VectorIndexer
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        # pyspark go
        
        if settings.pyspark_remake_csv == 1: # Making the csv files for the pyspark MLA to read in is time consuming, turn off the file generation?
            logger.info('Remaking csvs for pysparks...')
            # Spark reads its input from CSV on disk, so the in-memory
            # arrays are round-tripped through temp files first.
            numpy.savetxt(temp_train, XX, delimiter=",")
            logger.info('Training csv saved')
            numpy.savetxt(temp_pred, XXpredict, delimiter=",")
            logger.info('Predict csv saved')
        sc = SparkContext(appName="ML_RF") # Initiate spark
        
        sclogger=sc._jvm.org.apache.log4j # Initiate spark logging
        # Silence Spark's own INFO chatter; only errors from org/akka loggers.
        sclogger.LogManager.getLogger("org").setLevel(sclogger.Level.ERROR)
        sclogger.LogManager.getLogger("akka").setLevel(sclogger.Level.ERROR)
        sqlContext=SQLContext(sc)
        # Read in data
        data_tr = sqlContext.read.format("com.databricks.spark.csv").options(header='false',inferSchema='true').load(temp_train)
        data_pr = sqlContext.read.format("com.databricks.spark.csv").options(header='false',inferSchema='true').load(temp_pred)
        data_tr=data_tr.withColumnRenamed(data_tr.columns[-1],"label") # rename last column (answers), to label
        data_pr=data_pr.withColumnRenamed(data_pr.columns[-1],"label")
        
        # Pack all non-label columns into a single "features" vector column,
        # as required by Spark ML estimators.
        assembler=VectorAssembler(inputCols=data_tr.columns[:-1],outputCol="features")
        reduced=assembler.transform(data_tr.select('*')) # Assemble feature vectos for spark MLA
        
        assembler_pr=VectorAssembler(inputCols=data_pr.columns[:-1],outputCol="features")
        reduced_pr=assembler_pr.transform(data_pr.select('*'))
        
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(reduced) # Index vectors        
        featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(reduced)
        # Initiate MLA alg
        rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",numTrees=100,maxDepth=5,maxBins=200)
        
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf]) # Set up fitting pipeline
        start, end=[],[] # Timer
        logger.info('Fit start')
        logger.info('------------')
        start = time.time()
        model=pipeline.fit(reduced) # Fit
        end = time.time()
        logger.info('Fit ended in %s seconds' %(end-start))
        logger.info('------------')
        start, end=[],[]
        logger.info('Predict start')
        logger.info('------------')
        start = time.time()
        predictions = model.transform(reduced_pr) # Predict
        # NOTE(review): metricName="precision" was only valid in old Spark
        # versions; newer MulticlassClassificationEvaluator expects e.g.
        # "accuracy" — confirm against the deployed Spark version.
        evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",predictionCol="prediction",metricName="precision")
        accuracy = evaluator.evaluate(predictions)
        logger.info("Test Error = %g" %(1.0-accuracy))
        logger.info('------------')
        logger.info('Pulling results ...')
        yypredict=numpy.array(predictions.select("indexedLabel").collect()) # Pulls all results into numpy arrays to continue program
        yypredict=yypredict[:,0]
        result=numpy.array(predictions.select("prediction").collect())
        result=result[:,0]
        XXpredict=numpy.array(predictions.select("indexedFeatures").collect())
        XXpredict=XXpredict[:,0]
        probs=numpy.array(predictions.select("probability").collect())
        probs=probs[:,0]
        # Re-attach the answers as the last column, mirroring the sklearn path.
        XXpredict=numpy.column_stack((XXpredict,yypredict))
        end=time.time()
        logger.info('Predict ended in %s seconds' %(end-start))
        logger.info('------------')
    
    else:
        # Run sklearn MLA switch
        MLA = get_function(settings.MLA) # Pulls in machine learning algorithm from settings
        clf = MLA().set_params(**settings.MLAset)
        logger.info('MLA settings') 
        logger.info(clf)
        logger.info('------------')    
        start, end=[],[] # Timer
        logger.info('Fit start')
        logger.info('------------')
        start = time.time()
        clf = clf.fit(XX[:,0:n_feat],yy) # XX is train array, yy is training answers
        end = time.time()
        logger.info('Fit ended in %s seconds' %(end-start))
        logger.info('------------')
        # NOTE(review): this bound method is immediately overwritten by the
        # f1_score below; it is only kept so 'score' is bound on this path.
        score = clf.score
        if 'OvsA' not in ind_run_name:
            # Export decision trees to Graphviz .dot, render to PNG via the
            # external 'dot' binary, then delete the intermediate .dot file.
            if settings.output_all_trees == 1:
                i_tree = 0
                for tree_in_forest in clf.estimators_:
                    with open('plots/tree_' + str(i_tree) + '.dot', 'w') as my_file:
                        my_file = tree.export_graphviz(tree_in_forest, out_file = my_file,feature_names=feat_names,class_names=uniquetarget_tr[0], filled=True)
                    os.system('dot -Tpng plots/tree_%s.dot -o plots/tree_%s.png' %(i_tree,i_tree))
                    os.remove('plots/tree_%s.dot' %i_tree)
                    i_tree = i_tree + 1        
            else:
                with open('plots/tree_example.dot', 'w') as my_file:
                    my_file = tree.export_graphviz(clf.estimators_[0], out_file = my_file,feature_names=feat_names,class_names=uniquetarget_tr[0], filled=True)
                os.system('dot -Tpng plots/tree_example.dot -o plots/tree_example.png')
                os.remove('plots/tree_example.dot')
        start, end=[],[]
        # Split cats for RAM management
        # Heuristic: number of chunks scales with predict-array size (MB)
        # and worker count, so each parallel predict stays within RAM.
        numcats = numpy.int64((2*(XXpredict.size/1024/1024)*clf.n_jobs))
        if settings.get_contributions ==1:
            numcats=100
        if numcats < 1:
            numcats = 1
        logger.info('Predict start')
        logger.info('------------')
        start = time.time()
        result,probs,bias,contributions,train_contributions=[],[],[],[],[]
        XXpredict_cats=numpy.array_split(XXpredict,numcats)
        logger.info('Splitting predict array into %s' %numcats)
        logger.info('------------')
        for i in range(len(XXpredict_cats)):
            logger.info('Predicting cat %s/%s' %(i,len(XXpredict_cats)))
            result.extend(clf.predict(XXpredict_cats[i][:,0:n_feat])) # XX is predict array.
            probs.extend(clf.predict_proba(XXpredict_cats[i][:,0:n_feat])) # Only take from 0:n_feat because answers are tacked on end
            if 'OvsA' not in ind_run_name:            
                if (settings.get_contributions == 1) | (settings.get_perfect_contributions==1):           
                    logger.info('Getting contributions from predict catalogue %s' %i)
                    # treeinterpreter: tiresult = (prediction, bias, contributions)
                    tiresult = ti.predict(clf,XXpredict_cats[i][:,0:n_feat])
                    contributions.extend(tiresult[2])
                    bias = tiresult[1][0]
        feat_importance = clf.feature_importances_
        result=numpy.float32(result)
        probs=numpy.float32(probs)
        if 'OvsA' not in ind_run_name:            
            if settings.get_contributions == 1: 
                numpy.save('contributions',contributions)
            if settings.get_perfect_contributions == 1: 
                numpy.save('perfect_contributions',contributions)
            if settings.compute_contribution_mic == 1:
                logger.info('Getting contributions from train catalogue (for plot_mic_cont)')
                tiresult_train = ti.predict(clf,XX[:,0:n_feat])
                train_contributions=tiresult_train[2]
                bias_train = tiresult_train[1][0]
        
        # NOTE(review): arguments are (result, yypredict) — sklearn's
        # convention is (y_true, y_pred); swapping flips precision/recall.
        accuracy = metrics.accuracy_score(result,yypredict)
        recall = metrics.recall_score(result,yypredict,average=None)
        precision = metrics.precision_score(result,yypredict,average=None)
        score = metrics.f1_score(result, yypredict,average=None)
        
        end = time.time()
        logger.info('Predict ended in %s seconds' %(end-start))
        logger.info('------------')

    logger.info('Recall Score: %s' %recall)
    logger.info('Precision Score: %s' %precision)
    logger.info('Accuracy Score: %s' %accuracy)
    logger.info('F1 Score: %s' %score)
    # NOTE(review): 'n' is not defined in this scope — presumably a
    # module-level counter alongside predictdatanum; verify before relying
    # on 'percentage'.
    percentage=(n/predictdatanum)*100
    
    run_opts.diagnostics([result,yypredict,unique_IDS_tr, unique_IDS_pr,uniquetarget_tr,uniquetarget_pr],'result')
#    stats=numpy.array([])
#    stats=numpy.column_stack((clf.n_estimators,traindatanum,predictdatanum,percentage))
    # SAVE
    if settings.saveresults == 1:
        logger.info('Saving results')
        logger.info('------------')

        numpy.savetxt(settings.result_outfile+('_%s' %ind_run_name)+'.txt',numpy.column_stack((yypredict,result)),header="True_target Predicted_target")
        numpy.savetxt(settings.prob_outfile+('_%s' %ind_run_name)+'.txt',probs)
        numpy.savetxt(settings.feat_outfile+('_%s' %ind_run_name)+'.txt',feat_importance)
        numpy.savetxt(settings.stats_outfile+('_%s' %ind_run_name)+'.txt',numpy.column_stack((clf.n_estimators,traindatanum,predictdatanum,percentage,clf.max_depth)),header="n_est traindatanum predictdatanum percentage max_depth",fmt="%s")
    
    return result,feat_importance,probs,bias,contributions,accuracy,recall,precision,score,clf,train_contributions
Code example #2
0
def visualize_prediction(start_date = date(2000,1, 1),end_date=date(2000,2, 1)):
    """Fit a random forest on news-sentiment percentage changes and plot
    its S&P 500 predictions against the actual test-window values.

    Parameters
    ----------
    start_date, end_date : datetime.date
        Window passed to ``return_combined`` to build the combined
        sentiment / sp_500 DataFrame.

    Side effects: draws a matplotlib figure (predicted vs. actual
    ``sp_500`` over the hard-coded 2010 test window). Returns None.

    NOTE(review): the train/test windows below are hard-coded 2010 dates
    and ignore start_date/end_date — preserved as-is; confirm intent.
    """
    df_combined = return_combined(start_date, end_date)

    # Convert each sentiment column to day-over-day percentage change.
    for col in ('business', 'money', 'world'):
        df_combined[col] = df_combined[col].pct_change()

    # pct_change yields NaN for the first row and +/-inf on zero
    # denominators; neutralise all of them before modelling.
    df_combined.replace([np.inf, -np.inf, np.nan], 0, inplace=True)
    # fillna(method=...) is deprecated; bfill/ffill are the modern spelling.
    df_combined = df_combined.bfill().ffill()
    df_combined['sp_500'] = df_combined['sp_500'].apply(np.int64)

    train_start = '2010-01-02'
    train_end = '2010-08-01'
    test_start = '2010-08-05'
    test_end = '2010-12-24'
    # .ix was removed from pandas; .loc does the same label-based slicing.
    train = df_combined.loc[train_start:train_end]
    test = df_combined.loc[test_start:test_end]

    def _sentiment_matrix(frame):
        # One row of [business, money, world] scores per date in *frame*.
        # (Replaces the old DataFrame.T.iteritems() loop, which also
        # shadowed the imported 'date' name and rebuilt the array on
        # every iteration.)
        return np.asarray([
            [df_combined.loc[d, 'business'],
             df_combined.loc[d, 'money'],
             df_combined.loc[d, 'world']]
            for d in frame.index
        ])

    numpy_df_train = _sentiment_matrix(train)
    numpy_df_test = _sentiment_matrix(test)

    y_train = pd.DataFrame(train['sp_500'])
    y_test = pd.DataFrame(test['sp_500'])

    rf = RandomForestRegressor()
    rf.fit(numpy_df_train, y_train)

    # treeinterpreter decomposes each prediction into bias + contributions.
    prediction, bias, contributions = ti.predict(rf, numpy_df_test)

    idx = pd.date_range(test_start, test_end)
    predictions_df = pd.DataFrame(data=prediction[0:], index=idx, columns=['sp500_predicted'])

    # Overlay actual test values on the prediction plot.
    predictions_plot = predictions_df.plot()
    fig = y_test.plot(ax=predictions_plot).get_figure()
Code example #3
0
    num_max_images = 10
    for i in range(len(unique_IDS_pr)):
        url_list,url_objid_list,url_spectra_list,tiresult_list,img_values_list=[],[],[],[],[]
        if len(image_IDs[i]['good_ID']) > num_max_images:
            top_good = num_max_images
        else:
            top_good = len(image_IDs[i]['good_ID'])
        for j in range(0,top_good):   
            img_ID_good = image_IDs[i]['good_ID'][j]
            img_SPECOBJID_good = image_IDs[i]['good_SPECOBJID'][j]
            img_RA_good = image_IDs[i]['good_RA'][j]
            img_DEC_good = image_IDs[i]['good_DEC'][j]
            img_index_good = image_IDs[i]['good_index'][j]
            img_values_loop = XXpredict[:,0:n_feat][img_index_good]
            tiresult = ti.predict(clf,XXpredict[:,0:n_feat][img_index_good].reshape(1,-1))
            tiresult_list.append(tiresult)
            img_values_list.append(img_values_loop)
            url_objid_list.append('http://skyserver.sdss.org/dr12/en/tools/explore/Summary.aspx?id=%s' %img_ID_good)
            url_spectra_list.append('http://skyserver.sdss.org/dr12/en/get/SpecById.ashx?id=%s' %img_SPECOBJID_good)
            url_list.append('http://skyserver.sdss.org/SkyserverWS/dr12/ImgCutout/getjpeg?TaskName=Skyserver.Explore.Image&ra=%s&dec=%s&scale=0.2&width=200&height=200&opt=G' %(img_RA_good,img_DEC_good))
        image_IDs[i].update({'good_url':url_list,'good_spectra' : url_spectra_list, 'good_tiresult' : tiresult_list, 'good_url_objid' : url_objid_list, 'good_values' : img_values_list})
        url_list,url_objid_list,url_spectra_list,tiresult_list,img_values_list=[],[],[],[],[]
        if len(image_IDs[i]['ok_ID']) > num_max_images:
            top_ok = num_max_images
        else:
            top_ok = len(image_IDs[i]['ok_ID'])
        for j in range(0,top_ok):  
            img_ID_ok = image_IDs[i]['ok_ID'][j]
            img_SPECOBJID_ok = image_IDs[i]['ok_SPECOBJID'][j]
            img_RA_ok =  image_IDs[i]['ok_RA'][j]