def crawl(args):
    addr_in = args[0]
    clf = args[1]
    X_test, y_test = gd.get(addr_in)
    prediction = clf.predict(X_test)
    return metrics.confusion_matrix(y_test, prediction)

def crawl(args):
    addr_in = args[0]
    clf = args[1]
    X_test, y_test = gd.get(addr_day=addr_in, features_to_get=__FEATURES_TO_GET)
    prediction = clf.predict(X_test)
    return metrics.confusion_matrix(y_test, prediction)

def test(addr_test, clf):
    path_in = os.path.join(addr_test, "day_samp_bin.npy")
    X_test, y_test = gd.get(addr_day=addr_test, features_to_get=__FEATURES_TO_GET)
    prediction = clf.predict(X_test)
    confusion_matrix = metrics.confusion_matrix(y_test, prediction)
    tp = confusion_matrix[1, 1]
    fp = confusion_matrix[0, 1]
    tn = confusion_matrix[0, 0]
    fn = confusion_matrix[1, 0]
    total = tp + fp + tn + fn
    recall = round(tp / float(tp + fn), 4)
    filtered = round(float(tn + fn) / total, 4)
    return [tn, fp, fn, tp], recall, filtered

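# --- Hedged usage sketch (not part of the original file) ---
# Assuming the same module-level helpers (gd, metrics, pickle, __FEATURES_TO_GET)
# and a previously pickled forest, this is one plausible way to drive test();
# the model file name and the day folder below are made-up placeholders.
if __name__ == "__main__":
    with open("RF_online_10_Model.p", "rb") as file_in:  # hypothetical model file
        clf = pickle.load(file_in)
    counts, recall, filtered = test("/path/to/some_day", clf)  # hypothetical day folder
    print "Confusion counts [tn, fp, fn, tp]: {}".format(counts)
    print "Recall: {}, Filtered: {}".format(recall, filtered)
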
def train(addr_train, clf, ratio, add_estimators):
    X_train, y_train = gd.get(addr_day=addr_train, ratio=ratio, features_to_get=__FEATURES_TO_GET)
    print "Fitting Model......"
    clf.n_estimators += add_estimators
    clf.fit(X_train, y_train)
    print "Done"

    if __SAVE_MODEL:
        model_name = "RF_" + onoff_line + "_" + str(ratio) + "_Model.p"
        dir_out = os.path.join(addr_train, "Random_Forest_Models")
        if not os.path.isdir(dir_out):
            os.mkdir(dir_out)
        path_out = os.path.join(dir_out, model_name)
        with open(path_out, "wb") as file_out:  # binary mode for pickle
            pickle.dump(clf, file_out)

    return clf

def train():
    print "\n========== Start Training =========="
    list_io_addr = get_io_addr(__TRAIN_DATA)
    clf = BernoulliNB(class_prior=[0.1, 0.9], alpha=0.5)

    for addr_in in list_io_addr:
        print "\nGenerating training set from {}".format(addr_in)
        X_train, y_train = gd.get(addr_in, __RATIO)
        print "Done"
        print "Fitting Model......"
        clf.partial_fit(X_train, y_train, classes=[0, 1])
        print "Done"

    if __SAVE_MODEL:
        with open(__ROOT_MODEL, "wb") as file_out:  # binary mode for pickle
            pickle.dump(clf, file_out)
        return None
    else:
        return clf

def train():
    clf = RandomForestClassifier(n_estimators=40,
                                 max_features="sqrt",
                                 oob_score=True,
                                 warm_start=False,
                                 n_jobs=-1,
                                 random_state=1514)
    list_io_addr = get_io_addr(__TRAIN_DATA)

    for path_in in list_io_addr:
        print "\n>>>>> Start Training on {}".format(path_in)
        X_train, y_train = gd.get(addr_day=path_in, ratio=__RATIO, features_to_get=__FEATURES_TO_GET)
        print "Fitting Model......"
        clf.fit(X_train, y_train)
        print "Done"

    if __SAVE_MODEL:
        with open(__ROOT_MODEL, "wb") as file_out:  # binary mode for pickle
            pickle.dump(clf, file_out)

    return clf

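# --- Hedged note (not part of the original file) ---
# With warm_start=False, each clf.fit() in the loop above discards the trees
# grown on earlier days, so the returned model reflects only the last training
# day. If the intent is to accumulate trees across days, scikit-learn's
# warm-start pattern (the one used by the incremental train() helpers in this
# collection) grows n_estimators between fits, roughly:
def train_incremental(add_estimators=10):
    clf = RandomForestClassifier(n_estimators=0, max_features="sqrt",
                                 warm_start=True, n_jobs=-1, random_state=1514)
    for path_in in get_io_addr(__TRAIN_DATA):
        X_train, y_train = gd.get(addr_day=path_in, ratio=__RATIO,
                                  features_to_get=__FEATURES_TO_GET)
        clf.n_estimators += add_estimators  # the new trees are fit on this day's data
        clf.fit(X_train, y_train)
    return clf
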
#!/usr/bin/python
import Get_Data as Data
import numpy as np

xData = []
dataFile = raw_input(
    'Enter the name of the datafile to find the standard deviation (Blank for \'data.txt\'): \n')
if dataFile == "":
    dataFile = "data.txt"
Data.getData(dataFile, xData)

# Sample mean.
tempAvg = 0.0
count = 0
for n in xData:
    tempAvg += n
    count += 1
avg = tempAvg / count

# Sample standard deviation (Bessel-corrected: divisor is count - 1).
tempSum = 0.0
count = 0
for n in xData:
    tempSum += pow(avg - n, 2)
    count += 1
stdDev = pow(tempSum / (count - 1), 0.5)

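# --- Hedged sanity check (not part of the original script) ---
# numpy is already imported, and the loops above reproduce np.mean and
# np.std(..., ddof=1); printing both is a quick way to confirm the manual math.
npAvg = np.mean(xData)
npStdDev = np.std(xData, ddof=1)  # ddof=1 matches the (count - 1) divisor above
print "mean: %f (numpy %f), std dev: %f (numpy %f)" % (avg, npAvg, stdDev, npStdDev)
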
def CreateDirectory(path):
    if not os.path.exists(path):
        os.mkdir(path)


def Directory():
    CreateDirectory(dataset_path + 'CT_Images/')
    CreateDirectory(dataset_path + 'CT_Images/' + 'Train')
    CreateDirectory(dataset_path + 'CT_Images/' + 'Test')


if __name__ == "__main__":
    # Create the output directories.
    Directory()

    ImagePaths = ["Chest_Xray/", "Covid_Xray/"]
    md = Data.MakeDataset(ImagePaths)
    md.TrainData()
    md.TestData()

    train_generator = ig.TrainGenerator()
    val_generator = ig.ValGenerator()

    VggModel = BuildModel()
    VggModel.GetModel(input_shape=(224, 224, 3), no_of_class=3, learning_rate=0.001)
    VggModel.TrainModel(train_generator,
                        val_generator,
                        epochs=50,
                        steps_per_epoch=39,
                        validation_steps=10)

def test(self, addr_in):
    X, y = gd.get(addr_in)
    prediction = self.clf.predict(X)
    return y, prediction

import G_API
import time
import Get_Data
import Parse_Data
import os
import Constant
import Post_Data

if __name__ == '__main__':
    # -----------------------------------------------------------------------------------------------------------------
    # Start to count execution time and number of shipments in the process.
    start = time.time()

    # -----------------------------------------------------------------------------------------------------------------
    # Create a dict object to save primary reference, file name, and weight information.
    weight_dict = dict()
    file_list = os.listdir(Constant.shearers_xml_path)
    for file in file_list:
        pri_ref, weight = Get_Data.get_info_xml(Constant.shearers_xml_path, file)
        weight_dict[pri_ref] = [weight, Constant.shearers_xml_path + file]

    # -----------------------------------------------------------------------------------------------------------------
    # Convert the primary reference list to a comma-separated string.
    pri_ref_str = ','.join(weight_dict.keys())

    # -----------------------------------------------------------------------------------------------------------------
    # Collect report OID, login username and password from .txt files.
    Get_Data.read_login_credentials()

    # -----------------------------------------------------------------------------------------------------------------
    # Use username and password combinations to log into TMS and extract the login token from the HTML response.
    session_requests, csrf = Post_Data.login_tms()

    # -----------------------------------------------------------------------------------------------------------------
    # Request the first page of the report in HTML format. Extract the response in text format.
    shipment_report_html_script = \
        Get_Data.get_shipment_report_by_report_format(session_requests, csrf, pri_ref_str).text
    # -----------------------------------------------------------------------------------------------------------------

@author: Admin
"""
from Get_Relation import *
from Get_Data import *
from Time_Tool import *
from PowLaw import *
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from Tools import *

Data = Get_Data('2006/4', '2007/3')
Data.index = range(len(Data))
Relation = Get_Relation(Data, 0, 0)
Relation = Relation.dropna()
Save_DataFrame(Relation, '2012')

'''
def Get_People_Num(Relation):
    P = set(Relation.ID) | set(Relation.Post_ID)
    return len(P)

Active.append(Get_People_Num(Data))
'''
#Relation.columns = [['Source','Target']]
#Save_DataFrame(Relation,'Relation_2008')

@author: Admin
"""
from Get_Relation import *
from Get_Data import *
from Time_Tool import *
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

Start = input('Start Time (year/month): ')
End = input('End Time (year/month): ')

Data = Get_Data(Start, End)
ALL_Relation = Get_Relation(Data, 0, 1)

# Remove duplicate relations.
ALL_Relation_RD = ALL_Relation.drop_duplicates(['P', 'R'])

# Remove self-replies.
Judge = [i != j for i, j in zip(ALL_Relation_RD.P, ALL_Relation_RD.R)]
Relation = ALL_Relation_RD[Judge]
Relation.index = range(len(Relation))

GTime = [Generailze_Time(time) for time in Relation.Time]
Relation['GTime'] = GTime
SRelation = Relation.sort_values('GTime')
SRelation.index = range(len(SRelation))

IDs = list(set(SRelation.P) | set(SRelation.R))

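# --- Hedged sketch (not part of the original script) ---
# A plausible next step, given the de-duplicated reply pairs and the ID list
# above, is a directed reply graph; networkx is an assumption here (it appears
# in the companion scripts but is not imported by this file).
import networkx as nx

G = nx.DiGraph()
G.add_nodes_from(IDs)
G.add_edges_from(zip(SRelation.P, SRelation.R))
print('{} nodes, {} edges'.format(G.number_of_nodes(), G.number_of_edges()))
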
from __future__ import division
import math
from textblob import TextBlob as tb
from scipy import spatial
import Get_Data as twitterdata

docs = twitterdata.get_data()
for i in range(len(docs)):
    docs[i] = tb(docs[i])
n_of_docs = len(docs)


def tf(word, blob):
    return blob.words.count(word) / len(blob.words)


def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)


def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))


def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)


bloblist = [blob for blob in docs]
# for i, blob in enumerate(bloblist):
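# --- Hedged completion of the commented-out loop (not in the original file) ---
# One common use of the helpers above is to rank each document's words by
# tf-idf and print the top few terms per document:
for i, blob in enumerate(bloblist):
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    top_words = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:3]
    for word, score in top_words:
        print("Document {}: {} (tf-idf {:.5f})".format(i + 1, word, score))
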
def train(self, addr_in, sampling_ratio, sampling_mode):
    X, y = gd.get(addr_in, ratio=sampling_ratio, mode=sampling_mode)
    self.clf.n_estimators += self.add_estimators
    self.clf.fit(X, y)

@author: Admin
"""
from Get_Relation import *
from Get_Data import *
from Time_Tool import *
from PowLaw import *
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import networkx as nx

Data = Get_Data('2006/4', '2006/5')
Relation = Get_Relation(Data, 0, 1)

GTime = [Generailze_Time(time) for time in Relation.Time]
Relation['GTime'] = GTime
Sort_Relation = Relation.sort_values('GTime')
R = Sort_Relation[['P', 'R']]

#Number = input('N: ')
rl = []
for Number in range(3, 3000):

import Get_Data
import json
import time
import Constant
import Post_Data
# Explicit numpy.random submodule imports (commonly needed when freezing with PyInstaller).
import numpy.random.common
import numpy.random.bounded_integers
import numpy.random.entropy

if __name__ == '__main__':
    # START COUNTING RUNNING TIME
    start = time.time()
    count = 0

    # LOGIN TMS
    Get_Data.read_login_credentials()
    Get_Data.read_report_oid()
    session_requests, csrf = Post_Data.login_tms()

    # Concatenate the route board report URL.
    url_report_routeboard = Constant.url_report_routeboard
    url_report_routeboard = url_report_routeboard.replace('OID_TO_REPLACE', str(Constant.oid_report_so))

    # Request the route board report.
    print('Reading data from route board...')
    response = session_requests.get(url_report_routeboard).text

    # Convert the JSON response.
    route_board_json_responses = json.loads(response)
    print('Route board data read successfully.')

    # Dictionaries for customer loads and execution loads.

import Constant
import Post_Data
import Get_Data
import Config_Post_Data
import time
import G_API
import Input_Collection as Input

if __name__ == '__main__':
    # READ USER LOGIN CREDENTIALS FROM TEXT FILE.
    Get_Data.read_login_credentials()

    # GET INPUT FROM USER.
    input_load_number = 'load number'
    input_pickup_appointment_date = 'pickup appointment date'
    input_pickup_appointment_time = 'pickup appointment time'
    load_numbers = Input.get_general_input(input_load_number)
    pickup_appointment_date = Input.get_date(input_pickup_appointment_date)
    pickup_appointment_time = Input.get_time(input_pickup_appointment_time)

    # START COUNTING RUNNING TIME
    start = time.time()
    print('Pickup appointment', pickup_appointment_date, pickup_appointment_time,
          'will be added to load(s):', load_numbers, '.')

    # LOGIN TMS
    session_requests, csrf = Post_Data.login_tms()

import Constant
import Get_Data
import G_API
import Post_Data
import Config_Post_Data
import time

if __name__ == '__main__':
    # START COUNTING RUNNING TIME
    start = time.time()

    # LOGIN TMS
    Get_Data.read_login_credentials()
    session_requests, csrf = Post_Data.login_tms()

    # READ GOOGLE SHEETS VALUES
    print('Loading Google Sheet...')
    workbook_cr = G_API.get_workbook_by_id(Constant.g_sheets_workbook_id_ops)
    worksheet_cr = G_API.get_worksheet_by_id(workbook_cr, Constant.g_sheets_worksheet_id_ops)
    sheet_values = G_API.get_values_in_list(worksheet_cr)
    # Convert the value list to a dictionary.
    sheet_values = G_API.convert_values_to_dict(sheet_values)
    assert sheet_values is not None, 'Error: Google Sheet is empty or reading failure.'
    for key, value_list in sheet_values.items():
        sheet_values[key] = ','.join(value_list)
    print('Google Sheet loading completed.')

import Get_Data as twitterdata
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import spatial

#corpus = ["all is well", "best of luck", "best of luck", "best of all"]
corpus = twitterdata.get_data()
n_of_articles = len(corpus)

vectorizer = TfidfVectorizer(min_df=1, norm=u'l1', smooth_idf=False)
X = vectorizer.fit_transform(corpus)
# print(dict(zip(vectorizer.get_feature_names(), idf)))
feature_names = vectorizer.get_feature_names()

scores_relative_to_comparing = []
ss = []
doc = 1
feature_index = X[doc, :].nonzero()[1]

# Write each feature's idf weight next to its name.
writer = open('idf.txt', 'w', encoding="utf8")
for i in range(len(vectorizer.idf_)):
    writer.write(str(vectorizer.idf_[i]))
    writer.write(" ")
    writer.write(str(feature_names[i]))
    writer.write("\n")
writer.close()

c = twitterdata.files_in_dir
cc = 0
# for i in range(vectorizer.idf_.__len__()):
#     print(feature_names[i])
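# --- Hedged sketch (not part of the original script) ---
# scipy.spatial is imported but never used above; given the empty
# scores_relative_to_comparing list and the `doc` index, one plausible follow-up
# is cosine similarity between document `doc` and every other tf-idf row of X.
reference_vector = X[doc].toarray().ravel()
for other in range(n_of_articles):
    other_vector = X[other].toarray().ravel()
    similarity = 1 - spatial.distance.cosine(reference_vector, other_vector)
    scores_relative_to_comparing.append(similarity)
print(scores_relative_to_comparing)
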
import Post_Data
import Constant
import os
import G_API
import time
import Get_Data

if __name__ == '__main__':
    # -----------------------------------------------------------------------------------------------------------------
    # Start to count execution time and number of shipments in the process.
    start = time.time()

    # -----------------------------------------------------------------------------------------------------------------
    # Remove Amplify 204 messages and other unused files.
    Get_Data.remove_unused_files()
    print('Unused files removal completed.')

    # -----------------------------------------------------------------------------------------------------------------
    # Create a dict object to save primary reference, file name, and weight information.
    weight_dict = dict()
    file_list = os.listdir(Constant.amplify_204_path)
    for file in file_list:
        pri_ref, weight = Get_Data.get_info_204(Constant.amplify_204_path, file)
        weight_dict[pri_ref] = [weight, Constant.amplify_204_path + file]

    # -----------------------------------------------------------------------------------------------------------------
    # Convert the primary reference list to a comma-separated string.
    pri_ref_str = ','.join(weight_dict.keys())

    # -----------------------------------------------------------------------------------------------------------------
    # Collect report OID, login username and password from .txt files.
    Get_Data.read_login_credentials()
    # -----------------------------------------------------------------------------------------------------------------

import pandas as pd
import Post_Data
import Constant
import time
import G_API
import Get_Data

if __name__ == '__main__':
    # START COUNTING RUNNING TIME
    start = time.time()

    # LOGIN TMS
    Get_Data.read_login_credentials()
    session_requests, csrf = Post_Data.login_tms()

    # Extract data from the spreadsheet.
    df_shipment = pd.read_excel('Dynamic.xlsx', dtype=str)
    oids = df_shipment['OID'].tolist()

    for oid in oids:
        # Concatenate the load page URL.
        url_load_html = Constant.url_prefix_load_html
        url_load_html = url_load_html.replace('TO_BE_REPLACE', oid)

        # Concatenate the reference page URL.
        url_ref_html = Constant.url_prefix_ref_html
        url_ref_html = url_ref_html.replace('TO_BE_REPLACE', oid)

        # Parse the HTML script of the load page and find the load attached.