def run_MLA(XX,XXpredict,yy,yypredict,unique_IDS_tr,unique_IDS_pr,uniquetarget_tr,uniquetarget_pr,n_feat,ind_run_name,n_run):
    """Fit a classifier on the training set and evaluate it on the predict set.

    Two back-ends: a PySpark random-forest pipeline (settings.pyspark_on == 1)
    for cross-node HPC runs, or the scikit-learn algorithm named in
    settings.MLA. Logs fit/predict timings and scores, optionally exports
    decision trees as PNGs and treeinterpreter contributions, and writes
    result files when settings.saveresults == 1.

    Parameters (shapes assumed from usage -- TODO confirm against callers):
      XX, XXpredict       -- 2-D numpy arrays; columns 0:n_feat are features,
                             with the target answers tacked on at the end.
      yy, yypredict       -- training / predict target vectors.
      unique_IDS_tr/_pr   -- identifier lists forwarded to run_opts.diagnostics.
      uniquetarget_tr/_pr -- class-name lists (uniquetarget_tr[0] labels trees).
      n_feat              -- number of feature columns.
      ind_run_name        -- run label; names containing 'OvsA' skip the
                             tree-export and contribution outputs.
      n_run               -- run index (unused inside this function body).

    Returns:
      (result, feat_importance, probs, bias, contributions, accuracy, recall,
       precision, score, clf, train_contributions)

    NOTE(review): the PySpark branch never defines feat_importance, clf,
    recall, etc. that are used at the return and in the saveresults block,
    so only the sklearn path can complete this function -- confirm intended.
    """
    logger.info('Starting MLA run')
    logger.info('------------')
    if settings.pyspark_on == 1: # Use pyspark or not? Pyspark makes cross node (HPC) calculation possible.
        # PySpark is slower and manages resources between nodes using HTTP.
        # So far it does not include feature importance outputs; implementing
        # them by hand may be time consuming.
        from pyspark import SparkContext
        from pyspark.sql import SQLContext
        from pyspark.ml import Pipeline
        from pyspark.ml.feature import VectorAssembler
        from pyspark.ml.classification import RandomForestClassifier
        from pyspark.ml.feature import StringIndexer, VectorIndexer
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        # pyspark go
        if settings.pyspark_remake_csv == 1: # Making the csv files for the pyspark MLA to read in is time consuming, turn off the file generation?
            logger.info('Remaking csvs for pysparks...')
            numpy.savetxt(temp_train, XX, delimiter=",")
            logger.info('Training csv saved')
            numpy.savetxt(temp_pred, XXpredict, delimiter=",")
            logger.info('Predict csv saved')
        sc = SparkContext(appName="ML_RF") # Initiate spark
        sclogger=sc._jvm.org.apache.log4j # Initiate spark logging
        sclogger.LogManager.getLogger("org").setLevel(sclogger.Level.ERROR)
        sclogger.LogManager.getLogger("akka").setLevel(sclogger.Level.ERROR)
        sqlContext=SQLContext(sc)
        # Read in data (headerless CSVs written above; schema inferred).
        data_tr = sqlContext.read.format("com.databricks.spark.csv").options(header='false',inferSchema='true').load(temp_train)
        data_pr = sqlContext.read.format("com.databricks.spark.csv").options(header='false',inferSchema='true').load(temp_pred)
        data_tr=data_tr.withColumnRenamed(data_tr.columns[-1],"label") # rename last column (answers), to label
        data_pr=data_pr.withColumnRenamed(data_pr.columns[-1],"label")
        assembler=VectorAssembler(inputCols=data_tr.columns[:-1],outputCol="features")
        reduced=assembler.transform(data_tr.select('*')) # Assemble feature vectors for spark MLA
        assembler_pr=VectorAssembler(inputCols=data_pr.columns[:-1],outputCol="features")
        reduced_pr=assembler_pr.transform(data_pr.select('*'))
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(reduced) # Index label/feature vectors
        featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(reduced)
        # Initiate MLA alg
        rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",numTrees=100,maxDepth=5,maxBins=200)
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf]) # Set up fitting pipeline
        start, end=[],[] # Timer
        logger.info('Fit start')
        logger.info('------------')
        start = time.time()
        model=pipeline.fit(reduced) # Fit
        end = time.time()
        logger.info('Fit ended in %s seconds' %(end-start))
        logger.info('------------')
        start, end=[],[]
        logger.info('Predict start')
        logger.info('------------')
        start = time.time()
        predictions = model.transform(reduced_pr) # Predict
        # NOTE(review): metricName="precision" was removed from
        # MulticlassClassificationEvaluator in Spark 2.x (use "accuracy") --
        # verify against the Spark version in use.
        evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",predictionCol="prediction",metricName="precision")
        accuracy = evaluator.evaluate(predictions)
        logger.info("Test Error = %g" %(1.0-accuracy))
        logger.info('------------')
        logger.info('Pulling results ...')
        # Pull all results back into numpy arrays so the rest of the program
        # can continue unchanged.
        yypredict=numpy.array(predictions.select("indexedLabel").collect())
        yypredict=yypredict[:,0]
        result=numpy.array(predictions.select("prediction").collect())
        result=result[:,0]
        XXpredict=numpy.array(predictions.select("indexedFeatures").collect())
        XXpredict=XXpredict[:,0]
        probs=numpy.array(predictions.select("probability").collect())
        probs=probs[:,0]
        XXpredict=numpy.column_stack((XXpredict,yypredict)) # re-attach answers as last column
        end=time.time()
        logger.info('Predict ended in %s seconds' %(end-start))
        logger.info('------------')
    else: # Run sklearn MLA switch
        MLA = get_function(settings.MLA) # Pulls in machine learning algorithm from settings
        clf = MLA().set_params(**settings.MLAset)
        logger.info('MLA settings')
        logger.info(clf)
        logger.info('------------')
        start, end=[],[] # Timer
        logger.info('Fit start')
        logger.info('------------')
        start = time.time()
        clf = clf.fit(XX[:,0:n_feat],yy) # XX is train array, yy is training answers
        end = time.time()
        logger.info('Fit ended in %s seconds' %(end-start))
        logger.info('------------')
        score = clf.score # NOTE(review): dead binding -- `score` is rebound to the f1 array below.
        if 'OvsA' not in ind_run_name:
            # Export fitted trees as PNGs (requires graphviz `dot` on PATH).
            if settings.output_all_trees == 1:
                i_tree = 0
                for tree_in_forest in clf.estimators_:
                    with open('plots/tree_' + str(i_tree) + '.dot', 'w') as my_file:
                        my_file = tree.export_graphviz(tree_in_forest, out_file = my_file,feature_names=feat_names,class_names=uniquetarget_tr[0], filled=True)
                    os.system('dot -Tpng plots/tree_%s.dot -o plots/tree_%s.png' %(i_tree,i_tree))
                    os.remove('plots/tree_%s.dot' %i_tree)
                    i_tree = i_tree + 1
            else:
                # Only export the first tree as an example.
                with open('plots/tree_example.dot', 'w') as my_file:
                    my_file = tree.export_graphviz(clf.estimators_[0], out_file = my_file,feature_names=feat_names,class_names=uniquetarget_tr[0], filled=True)
                os.system('dot -Tpng plots/tree_example.dot -o plots/tree_example.png')
                os.remove('plots/tree_example.dot')
        start, end=[],[]
        # Split cats for RAM management
        numcats = numpy.int64((2*(XXpredict.size/1024/1024)*clf.n_jobs))
        if settings.get_contributions ==1:
            numcats=100
        if numcats < 1:
            numcats = 1
        logger.info('Predict start')
        logger.info('------------')
        start = time.time()
        result,probs,bias,contributions,train_contributions=[],[],[],[],[]
        XXpredict_cats=numpy.array_split(XXpredict,numcats)
        logger.info('Splitting predict array into %s' %numcats)
        logger.info('------------')
        for i in range(len(XXpredict_cats)):
            logger.info('Predicting cat %s/%s' %(i,len(XXpredict_cats)))
            # Only take columns 0:n_feat because answers are tacked on the end.
            result.extend(clf.predict(XXpredict_cats[i][:,0:n_feat]))
            probs.extend(clf.predict_proba(XXpredict_cats[i][:,0:n_feat]))
            if 'OvsA' not in ind_run_name:
                if (settings.get_contributions == 1) | (settings.get_perfect_contributions==1):
                    logger.info('Getting contributions from predict catalogue %s' %i)
                    tiresult = ti.predict(clf,XXpredict_cats[i][:,0:n_feat]) # treeinterpreter: (prediction, bias, contributions)
                    contributions.extend(tiresult[2])
                    bias = tiresult[1][0]
        feat_importance = clf.feature_importances_
        result=numpy.float32(result)
        probs=numpy.float32(probs)
        if 'OvsA' not in ind_run_name:
            if settings.get_contributions == 1:
                numpy.save('contributions',contributions)
            if settings.get_perfect_contributions == 1:
                numpy.save('perfect_contributions',contributions)
            if settings.compute_contribution_mic == 1:
                logger.info('Getting contributions from train catalogue (for plot_mic_cont)')
                tiresult_train = ti.predict(clf,XX[:,0:n_feat])
                train_contributions=tiresult_train[2]
                bias_train = tiresult_train[1][0]
        # NOTE(review): sklearn metrics expect (y_true, y_pred); here the
        # predicted labels are passed first. Accuracy is symmetric, but
        # recall/precision/f1 swap meaning when reversed -- confirm intended.
        accuracy = metrics.accuracy_score(result,yypredict)
        recall = metrics.recall_score(result,yypredict,average=None)
        precision = metrics.precision_score(result,yypredict,average=None)
        score = metrics.f1_score(result, yypredict,average=None)
        end = time.time()
        logger.info('Predict ended in %s seconds' %(end-start))
        logger.info('------------')
        logger.info('Recall Score: %s' %recall)
        logger.info('Precision Score: %s' %precision)
        logger.info('Accuracy Score: %s' %accuracy)
        logger.info('F1 Score: %s' %score)
    # NOTE(review): `n`, `predictdatanum`, `traindatanum` are not parameters
    # of this function -- presumably module globals; verify against the caller.
    percentage=(n/predictdatanum)*100
    run_opts.diagnostics([result,yypredict,unique_IDS_tr, unique_IDS_pr,uniquetarget_tr,uniquetarget_pr],'result')
    # stats=numpy.array([])
    # stats=numpy.column_stack((clf.n_estimators,traindatanum,predictdatanum,percentage))
    # SAVE
    if settings.saveresults == 1:
        logger.info('Saving results')
        logger.info('------------')
        numpy.savetxt(settings.result_outfile+('_%s' %ind_run_name)+'.txt',numpy.column_stack((yypredict,result)),header="True_target Predicted_target")
        numpy.savetxt(settings.prob_outfile+('_%s' %ind_run_name)+'.txt',probs)
        numpy.savetxt(settings.feat_outfile+('_%s' %ind_run_name)+'.txt',feat_importance)
        numpy.savetxt(settings.stats_outfile+('_%s' %ind_run_name)+'.txt',numpy.column_stack((clf.n_estimators,traindatanum,predictdatanum,percentage,clf.max_depth)),header="n_est traindatanum predictdatanum percentage max_depth",fmt="%s")
    return result,feat_importance,probs,bias,contributions,accuracy,recall,precision,score,clf,train_contributions
def visualize_prediction(start_date = date(2000,1, 1),end_date=date(2000,2, 1)):
    """Train a RandomForestRegressor on section-sentiment percent changes and
    plot predicted vs. actual S&P 500 values over a fixed 2010 test window.

    Parameters:
      start_date, end_date -- datetime.date range passed to return_combined()
        to build the combined frame. NOTE(review): the train/test split below
        uses hard-coded 2010 dates regardless of these arguments -- confirm
        that is intended.

    Side effects: draws a matplotlib figure (predictions overlaid on actuals).

    Fixes vs. previous revision (behavior otherwise unchanged):
      - DataFrame.ix (removed in pandas 1.0) -> .loc
      - train.T.iteritems() (removed in pandas 2.0) -> iterate the row index
      - fillna(method='bfill'/'ffill') (deprecated) -> .bfill()/.ffill()
      - loop variable no longer shadows the imported `date` class
    """
    df_combined = return_combined(start_date, end_date)

    # Convert the three sentiment columns to day-over-day percent change.
    for col in ('business', 'money', 'world'):
        df_combined[col] = df_combined[col].pct_change()

    # Neutralise infinities/NaNs produced by pct_change, then fill any
    # remaining gaps from neighbouring rows.
    df_combined.replace([np.inf, -np.inf, np.nan], 0, inplace=True)
    df_combined = df_combined.bfill().ffill()

    df_combined['sp_500'] = df_combined['sp_500'].apply(np.int64)

    # Hard-coded split windows (see NOTE in the docstring).
    train_start, train_end = '2010-01-02', '2010-08-01'
    test_start, test_end = '2010-08-05', '2010-12-24'

    train = df_combined.loc[train_start:train_end]
    test = df_combined.loc[test_start:test_end]

    def _sentiment_matrix(frame):
        # One [business, money, world] row per date in `frame` (same order
        # as the frame's index).
        return np.asarray([
            np.asarray([df_combined.loc[d, 'business'],
                        df_combined.loc[d, 'money'],
                        df_combined.loc[d, 'world']])
            for d in frame.index
        ])

    numpy_df_train = _sentiment_matrix(train)
    numpy_df_test = _sentiment_matrix(test)

    y_train = pd.DataFrame(train['sp_500'])
    y_test = pd.DataFrame(test['sp_500'])

    rf = RandomForestRegressor()
    rf.fit(numpy_df_train, y_train)
    #print (rf.feature_importances_)

    # treeinterpreter decomposition: (prediction, bias, per-feature contributions).
    prediction, bias, contributions = ti.predict(rf, numpy_df_test)

    idx = pd.date_range(test_start, test_end)
    predictions_df = pd.DataFrame(data=prediction, index=idx, columns=['sp500_predicted'])
    predictions_plot = predictions_df.plot()
    fig = y_test.plot(ax=predictions_plot).get_figure()
num_max_images = 10 for i in range(len(unique_IDS_pr)): url_list,url_objid_list,url_spectra_list,tiresult_list,img_values_list=[],[],[],[],[] if len(image_IDs[i]['good_ID']) > num_max_images: top_good = num_max_images else: top_good = len(image_IDs[i]['good_ID']) for j in range(0,top_good): img_ID_good = image_IDs[i]['good_ID'][j] img_SPECOBJID_good = image_IDs[i]['good_SPECOBJID'][j] img_RA_good = image_IDs[i]['good_RA'][j] img_DEC_good = image_IDs[i]['good_DEC'][j] img_index_good = image_IDs[i]['good_index'][j] img_values_loop = XXpredict[:,0:n_feat][img_index_good] tiresult = ti.predict(clf,XXpredict[:,0:n_feat][img_index_good].reshape(1,-1)) tiresult_list.append(tiresult) img_values_list.append(img_values_loop) url_objid_list.append('http://skyserver.sdss.org/dr12/en/tools/explore/Summary.aspx?id=%s' %img_ID_good) url_spectra_list.append('http://skyserver.sdss.org/dr12/en/get/SpecById.ashx?id=%s' %img_SPECOBJID_good) url_list.append('http://skyserver.sdss.org/SkyserverWS/dr12/ImgCutout/getjpeg?TaskName=Skyserver.Explore.Image&ra=%s&dec=%s&scale=0.2&width=200&height=200&opt=G' %(img_RA_good,img_DEC_good)) image_IDs[i].update({'good_url':url_list,'good_spectra' : url_spectra_list, 'good_tiresult' : tiresult_list, 'good_url_objid' : url_objid_list, 'good_values' : img_values_list}) url_list,url_objid_list,url_spectra_list,tiresult_list,img_values_list=[],[],[],[],[] if len(image_IDs[i]['ok_ID']) > num_max_images: top_ok = num_max_images else: top_ok = len(image_IDs[i]['ok_ID']) for j in range(0,top_ok): img_ID_ok = image_IDs[i]['ok_ID'][j] img_SPECOBJID_ok = image_IDs[i]['ok_SPECOBJID'][j] img_RA_ok = image_IDs[i]['ok_RA'][j]