def dataGettingEvent(sc):
    """Scheduler callback: fetch the data once, then re-arm itself.

    `sc` is the sched-style scheduler instance passed through so the event can
    reschedule itself on the same scheduler.
    """
    print("Fetching data")
    getData.main()
    # re-setup scheduler: run again in 180 seconds at priority 1
    # todo set this way higher
    datagettingScheduler.enter(180, 1, dataGettingEvent, (sc,))
def main():
    """Run the ETL pipeline end to end.

    Steps: configure logging, optionally create database objects, download the
    raw data, prepare the shipping-lines data, and load it into the database.
    """
    getLogger()  # configure logging before anything else logs
    logging.info("start ETL process")
    # presumably a one-time bootstrap switch; DB objects are only (re)created on demand
    if config.CREATE_DATABASE_OBJECTS_FLAG:
        logging.info("create DB")
        databaseUtils.main()
    logging.info("download data")
    getData.main()
    logging.info("prepare data")
    shipping_lines_data = prepareData.get_shipping_lines_data()
    logging.info("load data")
    databaseUtils.load_shipping_lines_table(shipping_lines_data)
    logging.info("end ETL process")
def get_data():
    """Load the rb1901 tick data, regenerating the CSV via getData if missing.

    Reads "rb1901.csv" (datetime index) and "days_number.npy" (day boundary
    offsets), then slices the frame into per-trading-day chunks.
    """
    try:
        data = pd.read_csv("rb1901.csv", index_col=0)
        data.index = pd.to_datetime(data.index)
    # was a bare `except:` — only a missing/unreadable file should trigger a
    # re-fetch, not e.g. KeyboardInterrupt or a parse bug
    except (IOError, OSError):
        import getData
        getData.main()
        data = pd.read_csv("rb1901.csv", index_col=0)
        data.index = pd.to_datetime(data.index)
    days_number = np.load("days_number.npy")
    # slice the frame into per-day chunks using the stored day boundaries
    # (`range` replaces Py2-only `xrange`; result is identical)
    da = [
        data.iloc[days_number[i]:days_number[i + 1]]
        for i in range(len(days_number) - 1)
    ]
    # NOTE(review): `da` is built but never returned in the visible source —
    # looks truncated; confirm whether `return data, da` (or similar) is missing.
def stat():
    """Render the stat report page for the user-selected date range.

    Reads the from/to dates from the submitted form, fetches per-manager data,
    and filters it down to the configured manager/dealinger number lists.
    """
    form = UserDate(request.form)
    requsted_date_from = form.input_date_from._value()
    requsted_date_to = form.input_date_to._value()
    all_managers = getData.main(requsted_date_from, requsted_date_to)
    visible_managers = {}
    visible_dealingers = {}
    # dict.has_key() was removed in Python 3; `num in dict` works on both 2 and 3
    for num in managers_num:
        if num in all_managers:
            visible_managers[num] = all_managers[num]
    for num in dealingers_num:
        if num in all_managers:
            visible_dealingers[num] = all_managers[num]
    return render_template("stat.html",
                           work_hours=work_hours,
                           all_hours=all_hours,
                           today=requsted_date_from,
                           title='Get Report',
                           form=form,
                           dealingers=visible_dealingers,
                           managers_names=managers_names,
                           visible_managers=visible_managers,
                           managers=all_managers)
def main():
    """Build the standardized packet dataset and dump it to disk.

    Returns a 3-tuple of (data, labels, message size) after writing the data
    and labels out to 'data.txt' and 'labels.txt'.
    """
    message_size = 32
    most_frequent_words = 9999

    # fetch the raw log plus word-frequency table, then build the vocabulary
    full_log, word_freq = getData.main()
    vocab = convertFreqTable(word_freq, most_frequent_words)
    data, labels = standardizePackets(full_log, message_size, vocab)

    printToFile(data, 'data.txt')
    printToFile(labels, 'labels.txt')
    return data, labels, message_size
def main(careerData=None, seasonStats=None):
    """Build and persist the feature data for Non-Rookies and Rookies.

    This is the main module. It can be run without any inputs, in which case
    the careerData and seasonStats variables will be created using modules in
    the getData.py file. Once all the features are created they are saved as
    pickle files.

    Inputs:
        careerData - dict, open pickle file, or pickle-file path, optional
            (default=None). All relevant career data. Collected via getData
            when absent.
        seasonStats - dict, open pickle file, or pickle-file path, optional
            (default=None). End-of-season stats for all relevant years.
            Collected via getData when absent.

    Outputs:
        nonRookieData - dict mapping season -> Non-Rookie features
        rookieData - dict mapping season -> Rookie features
        careerData - dict, updated if new players were found during feature
            extraction
    """
    careerData = _load_if_pickled(careerData)
    seasonStats = _load_if_pickled(seasonStats)
    if not (careerData and seasonStats):
        careerData, seasonStats = getData.main()
    nonRookieData, rookieData, careerData = getTrainData(careerData, seasonStats)
    # persist everything so later runs can skip collection; `with` closes the
    # handles that the original bare open() calls leaked
    with open('nonRookieData.p', 'wb') as fh:
        pickle.dump(nonRookieData, fh)
    with open('rookieData.p', 'wb') as fh:
        pickle.dump(rookieData, fh)
    with open('careerData.p', 'wb') as fh:
        pickle.dump(careerData, fh)
    return (nonRookieData, rookieData, careerData)


def _load_if_pickled(obj):
    """Return obj unpickled if it is a pickle path or open file; else unchanged.

    The original called pickle.load() directly under a bare `except: pass`,
    which both swallowed every error and could never load from a path string.
    """
    if not obj:
        return obj
    try:
        if isinstance(obj, str):
            with open(obj, 'rb') as fh:
                return pickle.load(fh)
        return pickle.load(obj)  # already an open file-like object
    except (TypeError, AttributeError, OSError, pickle.UnpicklingError):
        return obj  # assume it is already the dictionary itself
def testMethods(nonRookieData=None, rookieData=None, careerData=None):
    """ The test suite for deciding the best model """
    # regenerate any inputs that were not supplied
    if not careerData:
        seasonStats, careerData, lookUp = getData.main()
    if not (nonRookieData and rookieData):
        nonRookieData, rookieData, careerData = getFeatures.main(careerData)

    # cross-validate each player group separately
    cross_val_non_rookies, preds_non_rookies = getCrossVal(nonRookieData, careerData)
    cross_val_rookies, preds_rookies = getRookieCrossVal(rookieData, careerData)

    frame_non_rookies = writeResToPandas(cross_val_non_rookies, 'nonRookies')
    frame_rookies = writeResToPandas(cross_val_rookies, 'rookies')
    return (frame_non_rookies, frame_rookies, preds_non_rookies, preds_rookies)
def predictNextSeason(year=2015, nonRookieData='nonRookieData.p', rookieData='rookieData.p',
                      careerData='careerData.p', seasonStats='seasonStats.p'):
    """Train the previously selected models and write next-season predictions.

    Previously selected algorithms:
        -Veterans: Random Forest; n_estimators=500, min_samples_split=125
        -Novices:  SVM; C=.15, gamma=.015, epsilon=.05

    Inputs:
        year - int, optional (default=2015)
        nonRookieData, rookieData, careerData, seasonStats - dict or pickle
            path, optional. If falsy, the variable is regenerated using the
            getData.py / getFeatures.py modules; otherwise tryPickle resolves
            paths into dictionaries.

    Outputs:
        predictionsNonRookies, predictionsRookies - pandas DataFrames with the
        predictions for both groups; both are also concatenated into a single
        '<season>_Predictions.csv' file.
    """
    import pandas as pd  # function-scope: needed for concat below

    t0 = time.time()
    # season label like '2014-15' for a 2015 prediction year
    last2digits = str(year)[-2:]
    season = str((datetime.datetime(year, 1, 1) - datetime.timedelta(days=365)).year) + '-' + last2digits
    if not (careerData and seasonStats):
        seasonStats, careerData, lookUp = getData.main()
    if not (nonRookieData and rookieData):
        nonRookieData, rookieData, careerData = getFeatures.main(careerData)
    nonRookieData, rookieData, careerData, seasonStats = tryPickle(nonRookieData, rookieData, careerData, seasonStats)
    print('All past data found! Now fitting models ', time.time() - t0)
    nonRookiesModel, nonRookiesTrain, nonRookiesScaler = getModel(nonRookieData, 'nonRookies')
    rookiesModel, rookiesTrain, rookiesScaler = getModel(rookieData, 'rookies')
    print('Models fitted! Now getting all current players features ', time.time() - t0)
    nonRookies, rookies = findPlayerFeatures(year, seasonStats[year], careerData,
                                             nonRookiesTrain, rookiesTrain,
                                             nonRookiesScaler, rookiesScaler)
    print('Features found! Now making predictions ', time.time() - t0)
    predictionsNonRookies = getPredictions(nonRookies, nonRookiesModel, 'nonRookies')
    print('Non-Rookie Predictions made! Now predicting Rookies ', time.time() - t0)
    predictionsRookies = getPredictions(rookies, rookiesModel, 'rookies')
    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
    pd.concat([predictionsNonRookies, predictionsRookies]).to_csv(season + '_Predictions.csv', index=False)
    print('Total Runtime is ', time.time() - t0, 's')
    return (predictionsNonRookies, predictionsRookies)
def test_production_analysis_with_energies():
    """Production analysis should also run when the 'energies' option is set."""
    args = dict(default_args(), energies='Yes')
    getData.main(args)
def test_production_analysis():
    """Smoke test: production analysis runs with the default arguments."""
    defaults = default_args()
    getData.main(defaults)
def main(runPams):
    """Train/test a CNN on samples, featureMap and optFeatureMap data, save the
    results to a timestamped Excel workbook, and print them as one CSV line.

    Inputs:
        runPams - run parameters (lr, minusMean, xn, stdBias, sampleNum,
                  numThreshold, ...) forwarded to getData and train_test.

    Returns an empty tuple (results are reported via the Excel file and stdout).
    """
    time_stamp = str(int(time.time()))
    save_excel_path = ("/N/project/zhangclab/pengtao/myProjectsDataRes/20200113Predicte/"
                       "results/l1NormCRNumCNN_small/block1/excelRes/" + time_stamp + ".xlsx")

    # get samples, featureMap, optFeatureMap
    olabel, samples, featureMap, optFeatureMap = getData.main(runPams)

    # choose cpu or gpu automatically
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def _train_test(tensor):
        """Build a CNN sized for `tensor` and run the shared train/test loop."""
        net, optimizer, lossFunc = getCNNPams(tensor.size()[1], tensor.size()[2],
                                              tensor.size()[3], device, runPams.lr)
        return Predicte.myUtils.myTrainTest.train_test(
            olabel, tensor, net, device, optimizer, lossFunc, runPams)

    sres, sytrue_ypred = _train_test(samples)        # raw samples
    fres, fytrue_ypred = _train_test(featureMap)     # feature maps
    ores, oytrue_ypred = _train_test(optFeatureMap)  # optimized feature maps

    # prepare the results row (order matters: it maps to Excel/CSV columns)
    res = list()
    res.append("c*r-E" if runPams.minusMean == 1 else "c*r")
    res.append(runPams.xn)
    res.append("N(0-" + str(runPams.stdBias / 10) + ")")
    res.append("10*" + str(runPams.sampleNum))
    res.append(runPams.numThreshold)
    res.append("7*" + str(samples.size()[2]))
    res.append(sres)
    res.append("7*" + str(featureMap.size()[3]))
    res.append(fres)
    res.append("7*" + str(optFeatureMap.size()[3]))
    res.append(ores)

    # save data to excel
    resDF = pd.DataFrame(res)
    resDF.columns = ["res"]
    sytrue_ypred = pd.DataFrame(sytrue_ypred)
    sytrue_ypred.columns = ["true", "pred"]
    fytrue_ypred = pd.DataFrame(fytrue_ypred)  # was mis-named `cytrue_ypred`
    fytrue_ypred.columns = ["true", "pred"]
    oytrue_ypred = pd.DataFrame(oytrue_ypred)
    oytrue_ypred.columns = ["true", "pred"]
    # write to the Excel file; the context manager replaces writer.save()/close(),
    # which were removed in pandas 2.0
    with pd.ExcelWriter(save_excel_path) as writer:
        resDF.to_excel(writer, index=False)
        sytrue_ypred.to_excel(writer, startcol=2, index=False)
        fytrue_ypred.to_excel(writer, startcol=5, index=False)
        oytrue_ypred.to_excel(writer, startcol=8, index=False)

    # output data as a single CSV line
    print(','.join(str(i) for i in res))
    return ()
import statsmodels.api as sm import statsmodels.formula.api as smf import seaborn as sns #import getData __all__ = [ "data", "futurepng", "days_number", "get_vwap", "vwap", "last", "rejectData", "getData", "get_vabp", "get_mabp" ] try: data = pd.read_csv("rb1901.csv", index_col=0) data.index = pd.to_datetime(data.index) except: import getData getData.main() data = pd.read_csv("rb1901.csv", index_col=0) data.index = pd.to_datetime(data.index) days_number = np.load("days_number.npy") da = [ data.iloc[days_number[i]:days_number[i + 1]] for i in xrange(len(days_number) - 1) ] def get_data(): try: data = pd.read_csv("rb1901.csv", index_col=0) data.index = pd.to_datetime(data.index) except:
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import getData

# Load the dataset via the project's data-fetching module.
# Presumably returns a pandas DataFrame (it supports .plot/.hist) — confirm in getData.
dataset = getData.main()

# box and whisker plots
dataset.plot(kind='box', subplots=True, layout=(2,2), sharex=False, sharey=False)
plt.show()

# histograms
dataset.hist()
plt.show()

# scatter plot matrix
scatter_matrix(dataset)
# NOTE(review): no plt.show() after scatter_matrix in the visible source — the
# matrix may not display outside interactive backends; confirm one isn't missing.