def create_mono_gram(READ_DIR):
    """Tokenize every file in READ_DIR into a flat list of token objects."""
    one_gram = []
    dc.clean_data()
    os.chdir(READ_DIR)
    index_count = 0
    for filename in os.listdir(os.getcwd()):
        # Read the whole file and split it on single spaces.
        with open(filename, "r") as file:
            contents = file.read()
        tokens = contents.split(" ")
        for token in tokens:
            token = token.strip()
            if len(token) <= 0:
                continue
            value = token
            is_open = 0
            is_close = 0
            is_name = 0
            index = -1
            if token[0] == '<':
                is_open = 1
            if token[-1] == '>':
                is_close = 1
            if is_open == 1 and is_close == 1:
                # A self-contained <...> token is a name and gets a running index.
                is_name = 1
                index = index_count
                index_count += 1
            if is_open == 1 or is_close == 1:
                # Strip all non-word characters from tag-like tokens.
                value = re.sub(r'\W+', '', token)
            tk = ti.token(value, is_open, is_close, is_name, index)
            one_gram.append(tk)
    os.chdir('..')
    return one_gram
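# A minimal usage sketch for the function above. Assumptions: `dc`, `ti`, `os`,
# and `re` are the modules the enclosing file already imports, and "./corpus"
# is a hypothetical directory of space-separated token files.
if __name__ == "__main__":
    grams = create_mono_gram("./corpus")
    print(len(grams), "tokens loaded")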
def load_mails_info(self):
    """Load user labels and mails."""
    self.load_labels()
    self.loader = Loader(self.service, self.directory, self.label_ids)
    self.load_mails()
    # Sort label names by how many mails carry each label, most frequent first.
    self.counts = self.mails['label'].value_counts()
    self.label_names.sort(key=lambda label: self.counts[label], reverse=True)
    DataCleaner.pack(self.mails)
def __init__(self, num_outputs, num_cells=42, sequence_length=30, dropout=0.0,
             epochs_per_fold=10, activation_function='relu', optimizer='adam'):
    self.num_outputs = num_outputs
    self.cleaner = DataCleaner.DataCleaner()
    self.sequence_length = sequence_length
    self.epochs = epochs_per_fold
    self.batch_size = 1
    # Single-feature LSTM followed by dropout and a dense output layer.
    # Note: the output layer is hard-coded to 2 units; num_outputs is stored
    # but not used here.
    self.model = Sequential()
    self.model.add(LSTM(num_cells, input_shape=(None, 1),
                        activation=activation_function))
    self.model.add(Dropout(dropout))
    self.model.add(Dense(2))
    self.model.compile(optimizer=optimizer, loss='mse')
    self.test_data = []
    self.predictions = []
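# A minimal construction sketch. The snippet above does not name its enclosing
# class, so `LSTMModel` here is purely an illustrative placeholder.
model = LSTMModel(num_outputs=2, num_cells=64, sequence_length=30,
                  dropout=0.2, epochs_per_fold=5)
model.model.summary()  # inspect the Keras model built in __init__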
def get_average_profit():
    assets = 1000  # investing with $1000
    base = Baseline(assets)
    cleaner = DataCleaner.DataCleaner()
    profits = []
    invested = False
    num_folds = 4
    window_size = 30
    for k in range(1, num_folds + 1):
        training_data, cv_data = cleaner.get_clean_data(k)
        for index in range(0, len(training_data), window_size):
            window_data = training_data[index:index + window_size]
            open_values = []
            close_values = []
            for day in window_data:
                open_values.append(day[0])
                close_values.append(day[1])
            # Interleave open/close values into one flat window; the loop
            # variable must not shadow the outer window index used below.
            flat_window_data = []
            for i in range(len(open_values)):
                flat_window_data.append(open_values[i])
                flat_window_data.append(close_values[i])
            if len(flat_window_data) != 60:
                continue
            base.train(list(range(index, index + base.window_size)),
                       flat_window_data)
            base.predict(list(range(index + base.window_size,
                                    index + (2 * base.window_size))))
            if invested and (base.classify() == 4):
                # Class 4 signals a sell.
                invested = False
                profit = (base.y_train[-1] - base.purchase_value) * base.assets
                profits.append(profit)
                print("sold for ${} profit".format(profit))
            elif (not invested) and ((base.classify() == 0) or (base.classify() == 1)):
                # Classes 0 and 1 signal a buy.
                base.purchase_value = base.y_train[-1]
                invested = True
                print("Invested")
    print("Profits: {}".format(profits))
    print("Total profits = ${}".format(sum(profits)))
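# Illustration of the flattened window layout used above, with a hypothetical
# 2-day window: each day's (open, close) pair lands adjacently in the flat list.
window = [(10.0, 10.5), (10.4, 10.2)]   # [(open, close), ...]
flat = []
for o, c in window:
    flat.extend([o, c])
print(flat)  # [10.0, 10.5, 10.4, 10.2]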
import DataCleaner as dc
import pandas as pd
import os

os.chdir("./data")
data_train_class = dc.DataCleaner("application_test.csv")
data_train = data_train_class.get_data()
profile_train = data_train_class.get_profile(
    "/Users/FangzhouYu/Desktop/home-credit-default-risk/test_profile.html")
# Replace the "XNA" placeholder in ORGANIZATION_TYPE with "NA".
data_train_class.replace_missing_data("ORGANIZATION_TYPE", "XNA", "NA")
def runDataCleaner(test_file1, test_file2, type):
    if type == 'c':
        d = DataCleaner("tmdb_5000_movies_classification.csv", "tmdb_5000_credits.csv",
                        test_file1, test_file2, type)
        d.movies = pd.read_csv('trainC.csv')  # trainC.csv holds the raw, non-numeric data
        # loop over the two output files only (2 iterations)
        for i in tqdm(['trainC_data.csv', 'testC_data.csv']):
            if not os.path.exists(i):  # only build the dataset if it doesn't exist yet
                d.movies = d.movies.iloc[:, 1:]  # drop column 0, which is the id
                d.defineCategories()
                d.normalaize()
                d.movies.to_csv(i)
            d.movies = pd.read_csv('testC.csv')
            #print(d.movies.shape[0])
            # d.movies holds the train set on the first pass and the test set on the second
        train = handelMissingValues(pd.read_csv('trainC_data.csv').iloc[:, 1:])
        test = handelMissingValues(
            pd.read_csv('testC_data.csv').iloc[:, 1:])  # column 0 is just the row index 0,1,2,...
    else:
        d = DataCleaner("tmdb_5000_movies_train.csv", "tmdb_5000_credits_train.csv",
                        test_file1, test_file2, type)
        d.movies = pd.read_csv('trainR.csv')  # trainR.csv holds the raw, non-numeric data
        # loop over the two output files only (2 iterations)
        for i in tqdm(['trainR_data.csv', 'testR_data.csv']):
            if not os.path.exists(i):  # only build the dataset if it doesn't exist yet
                d.movies = d.movies.iloc[:, 1:]  # drop column 0, which is the id
                d.defineCategories()
                d.normalaize()
                d.movies.to_csv(i)
            d.movies = pd.read_csv('testR.csv')
            # d.movies holds the train set on the first pass and the test set on the second
        train = handelMissingValues(pd.read_csv('trainR_data.csv').iloc[:, 1:])
        test = handelMissingValues(
            pd.read_csv('testR_data.csv').iloc[:, 1:])  # column 0 is just the row index 0,1,2,...
    return train, test
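# A minimal call sketch for runDataCleaner. The two test CSV names below are
# hypothetical; pass whichever movie/credit test files you actually have.
train, test = runDataCleaner("tmdb_test_movies.csv", "tmdb_test_credits.csv", "c")
print(train.shape, test.shape)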
class NaiveBayesClassifier:
    # data members
    master = None
    filePath = ""
    numOfBins = 0
    train = None
    test = None
    structureFile = None
    wasBuilt = False
    structureDic = {}
    fileHandler = None
    classifier = None
    dataCleaner = None

    # Initialize the GUI
    def __init__(self, master):
        self.master = master
        master.title("Naive Bayes Classifier")
        master.geometry("650x300")

        # <editor-fold desc="init buttons, labels and entries">
        self.labelPath = Label(master, text="Directory Path:")
        self.entryPath = Entry(master, width=70)
        self.browse_button = Button(master, text="Browse", width=10, command=self.askopenfile)
        self.browse_button.pack()
        self.labelDiscBins = Label(master, text="Discretization Bins:")
        self.entryDiscBins = Entry(master, width=20, validate="key")
        self.build_button = Button(master, text="Build", width=20, command=self.build)
        self.build_button.pack()
        self.labelErr = Label(master, text="", fg="red", font="Verdana 10 bold")
        self.classify_button = Button(master, text="Classify", width=20, command=self.classify)
        self.classify_button.pack()
        self.close_button = Button(master, text="Exit", width=10, command=master.quit)
        self.close_button.pack()
        # </editor-fold>

        # Define grid
        self.gridDefinition(master)
        # layout the controls in the grid
        self.controlsLayout()

    # <editor-fold desc="Gui Functions">
    def controlsLayout(self):
        self.labelPath.grid(row=1, column=0, sticky=E)
        self.entryPath.grid(row=1, column=1, columnspan=2, sticky=W)
        self.browse_button.grid(row=1, column=3, sticky=W)
        self.labelDiscBins.grid(row=2, column=0, sticky=E)
        self.entryDiscBins.grid(row=2, column=1, sticky=W)
        self.labelErr.grid(row=4, column=0, columnspan=4, sticky=W)
        self.build_button.grid(row=5, column=1, columnspan=2)
        self.classify_button.grid(row=6, column=1, columnspan=2)
        self.close_button.grid(row=7, column=1, columnspan=2)

    def gridDefinition(self, master):
        master.grid_rowconfigure(0, weight=2)
        master.grid_rowconfigure(1, weight=1)
        master.grid_rowconfigure(2, weight=1)
        master.grid_rowconfigure(3, weight=1)
        master.grid_rowconfigure(4, weight=1)
        master.grid_rowconfigure(5, weight=1)
        master.grid_rowconfigure(6, weight=1)
        master.grid_rowconfigure(7, weight=1)
        master.grid_columnconfigure(0, weight=1)
        master.grid_columnconfigure(1, weight=1)
        master.grid_columnconfigure(2, weight=1)
        master.grid_columnconfigure(3, weight=2)
    # </editor-fold>

    # Build button was clicked
    def build(self):
        #try:
        self.train = pd.read_csv(self.entryPath.get() + "/train.csv")
        if self.validate(self.entryDiscBins.get()):
            # load train file, test file and structure file
            if os.path.getsize(self.entryPath.get() + "/Structure.txt") == 0:
                raise Exception("The structure file is empty")
            self.structureFile = open(self.entryPath.get() + "/Structure.txt")
            self.fileHandler = FilesHandler()
            self.structureDic = self.fileHandler.createStstructureDic(self.structureFile)
            self.dataCleaner = DataCleaner(self.structureDic, self.numOfBins)
            self.toLowerCase("train")
            self.train = self.dataCleaner.trainCleaning(self.train)
            self.classifier = Classifier(self.train, self.entryPath.get(),
                                         self.structureDic, self.numOfBins)
            self.wasBuilt = True
            tkMessageBox.showinfo("Build Message", "Building classifier using train-set is done!")
        #except Exception as e:
        #    tkMessageBox.showinfo("Error Message", "Something went wrong:\n" + str(e))

    # Classify button was clicked
    def classify(self):
        try:
            if self.wasBuilt:
                self.test = pd.read_csv(self.entryPath.get() + "/test.csv")
                self.toLowerCase("test")
                self.test = self.dataCleaner.testCleaning(self.test)
                self.classifier.classify(self.test)
                tkMessageBox.showinfo("Classify Message",
                                      "Classifying the test-set to the chosen path is done!")
                sys.exit(0)
            else:
                tkMessageBox.showinfo("Error Message", "Please build before Classifying")
        except Exception as e:
            tkMessageBox.showinfo("Error Message", "Something went wrong:\n" + str(e))

    # Transfer the dataset to lowercase
    def toLowerCase(self, file):
        for attribute in self.structureDic:
            if self.structureDic[attribute] != "NUMERIC" and attribute != "class":
                if file == "train":
                    self.train[attribute] = self.train[attribute].str.lower()
                else:
                    self.test[attribute] = self.test[attribute].str.lower()

    # Open a directory-chooser dialog
    def askopenfile(self):
        self.filePath = tkFileDialog.askdirectory()
        self.entryPath.delete(0, END)
        self.entryPath.insert(0, self.filePath)

    # Validate input
    def validate(self, new_text):
        if not new_text:  # the field is being cleared
            self.labelErr['text'] = "Please enter a number"
            return False
        try:
            self.numOfBins = int(new_text)
            # check that the number is valid
            if self.numOfBins < 1:
                self.labelErr['text'] = "The number for \"Discretization Bins\" should be bigger than 0"
                return False
            elif self.numOfBins > self.train.count()[0]:
                self.labelErr['text'] = "\"Discretization Bins\" shouldn't be higher than the number of records"
                return False
            # the number is valid
            else:
                self.labelErr['text'] = ""
                return True
        except ValueError:
            self.labelErr['text'] = "Invalid input - Please enter a number"
            return False
if mainin == "1":
    catimage = CattoPoster.catgetimgfile()
    catfact = CattoPoster.catgettextfile()
    # Log the used image so it is not posted again.
    with open('UsedImages.txt', 'a') as filer:
        filer.write("\n" + catimage)
    tweetMe.goTweet(catfact, catimage)
elif mainin == "2":
    dogimage = DoggoPoster.doggetimgfile()
    doggofacto = DoggoPoster.doggettextfile()
    with open('UsedImages.txt', 'a') as filer:
        filer.write("\n" + dogimage)
    tweetMe.goTweet(doggofacto, dogimage)
elif mainin == "3":
    print("Loading the cleaner...")
    print("THERE HAS TO BE AT LEAST 2 FILES IN EACH .txt FILE !\n")
    DataCleaner.cleaner()
elif mainin == "4":
    print("Downloading doggos...")
    dogsDownloader.dog()
elif mainin == "5":
    print("Downloading cattos...")
    catsDownloader.cat()
elif mainin == "6":
    print("Loading...")
    # Imported lazily: the module pulls in too many dependencies anyway,
    # and we don't want to bloat startup.
    import imageChecker
    imageChecker.checkmypics()
else:
    print("No. Don't be rude.")
# used to remove car names from the data array
cols_rmv = [8]

# represents the data split (training, validation)
size = [.75, .25]
split_selection = list()
runs = 15
for x in range(0, runs):
    split_selection.append(size)

print('Number of Runs: ', runs)
print("Data Split: ", split_selection[0])

# get the data using the data cleaner;
# returns a 2D array where rows are observations and columns
# are the attributes of a specific observation
data_array = DataCleaner.data_cleaner("CarData.txt")

# used to do Linear Regression. Arguments are:
#   data_array: the data array created with DataCleaner
#   imputation: the user's choice of imputation
#   cont_dis: array marking which columns/attributes are continuous or discrete (0/1)
#   cols_rmv: columns to remove from the data set; here it is car_name
#   bad data signal ('?'): used to determine if and which data points are missing
#   split_selection: array controlling how many tests are run and the split
#                    between test and validation sets
Regression.perform_regression(list(data_array), imputation, cont_dis, cols_rmv,
                              '?', 0, split_selection)
midfield = (
    fifaCleaner.getAllPlayersInPosition(midfielderPositions, fifaData))
defenders = fifaCleaner.convertHeightAndWeight(defenders)
midfield = fifaCleaner.convertHeightAndWeight(midfield)
defenders = fifaCleaner.ConvertMonetaryValue(defenders)
midfield = fifaCleaner.ConvertMonetaryValue(midfield)
# Note: filling a frame's NaNs from the same frame leaves them unchanged.
defenders = defenders.fillna(defenders)
midfield = midfield.fillna(midfield)
missing_Defender_Data = util.Missing(defenders)
missing_midfielder_Data = util.Missing(midfield)
util.SideSide(missing_Defender_Data, missing_midfielder_Data)
print('Missing Data')
X_train, X_test, y_train, y_test = dc.Spliting(defenders, 'BP')
RF_Model = RandomForestClassifier(max_features='sqrt', max_leaf_nodes=5)
analysis.ApplyModel(X_train, y_train, RF_Model)
for position in arrayOfPositions:
    players = fifaCleaner.getAllPlayersInPosition(position, fifaData)
    playerDataFrame = pd.DataFrame(players)
    print(playerDataFrame.info())
    analysis.CorrelationMatrix('Overall', playerDataFrame)
    index = arrayOfPositions.index(position)
    if index == 0:
        fileName = 'GoalKeepers'
    elif index == 1:
        fileName = 'Defenders'
rf = RandomForestClassifier(n_estimators=10, max_features=n_features, max_depth=None,
                            min_samples_split=2, bootstrap=True)
print('Starting RandomForest training process....')
rf.fit(trainData_X, trainData_Y)
# rf.score reports mean accuracy
print('Accuracy of training data: ' + str(rf.score(trainData_X, trainData_Y)))
print('Accuracy of testing data: ' + str(rf.score(testData_X, testData_Y)))

# In[52]:

data, label = DataCleaner.loadData()
trainData_X, trainData_Y, testData_X, testData_Y = seperateData(data, label)
KNN(trainData_X, trainData_Y, testData_X, testData_Y)

# In[53]:

DecisionTree(trainData_X, trainData_Y, testData_X, testData_Y)

# In[54]:

logisticRegression(trainData_X, trainData_Y, testData_X, testData_Y)

# In[66]:

SVM(trainData_X, trainData_Y, testData_X, testData_Y)
'highway:track', 'highway:trunk', 'highway:trunk_link', 'highway:unclassified'

# Color edges by highway type (this works, but is static)
ec = ['blue' if data['highway'] == 'tertiary_link'
      else 'green' if data['highway'] == 'residential'
      else 'yellow' if data['highway'] == 'tertiary'
      else 'orange' if data['highway'] == 'secondary'
      else 'red' if data['highway'] == 'primary'
      else 'blue'
      for u, v, key, data in G.edges(keys=True, data=True)]
ox.plot_graph(G, fig_height=8, fig_width=8, node_size=0, edge_color=ec)

# Run cleaning methods
df_bike_accidents_in_region = DataCleaner.clean_crashes(df_all_traffic_accidents, polygon,
                                                        north, south, east, west)
df_traffic_studies_in_region = DataCleaner.clean_traffic(df_all_traffic_studies, polygon,
                                                         north, south, east, west)
df_undirected_edges_with_features = DataCleaner.edge_featurizer(df_edges_undirected,
                                                                column_name_list)
# quick sanity checks on the cleaned data
df_undirected_edges_with_features.columns
len(df_traffic_studies_in_region)
len(df_bike_accidents_in_region)

## some plots
fig, ax = ox.plot_graph(G_undirected, node_zorder=2, node_size=0.03, node_alpha=0.1,
                        node_color='k', bgcolor='k', edge_linewidth=0.4, use_geom=True,
                        axis_off=False, show=False, close=False)
ax = df_bike_accidents_in_region.plot(kind='scatter', x='DEC_LONG', y='DEC_LAT', s=1,
                                      fig=fig, label='Bike Accident', ax=ax, color='r')
ax = df_traffic_studies_in_region.plot(title='Plotting the 3 Main Datasets', kind='scatter',
                                       x='X', y='Y', s=3, c='y', label='Traffic Study',
                                       fig=fig, ax=ax)
df_traffic_studies_in_region.setyear.describe()
df_traffic_studies_in_region.plot.scatter(x='setyear', y='aadb')
from DataCleaner import *

# dir format: /Users/mac/Desktop/CityWander/
cleaner = DataCleaner("Shanghai", "/Users/mac/Desktop/CityWander/")
cleaner.get_file_name()
cleaner.error_point_name()
cleaner.deleter_name()
cleaner.get_info()
cleaner.error_point_info()
cleaner.deleter_info()
cleaner.error_info_to_name()
cleaner.deleter_info_to_name()
from DataCleaner import *

# dir format: /Users/mac/Desktop/CityWander/
cleaner = DataCleaner("Guangzhou", "/data/yyh/CityWander/")
print("City:", cleaner.city_name)
cleaner.get_file_name()         # list every file in the street-view directory
cleaner.error_point_name()      # flag bad street-view points identified from file names
cleaner.deleter_name()          # delete the files flagged by error_point_name
cleaner.get_info()              # collect the info of the remaining street-view files
cleaner.error_point_info()      # using the info from get_info, find points with bad info
cleaner.deleter_info()          # delete the bad points found by error_point_info
cleaner.error_info_to_name()    # find points whose file name and info do not match
cleaner.deleter_info_to_name()  # delete the mismatched points
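# A minimal convenience wrapper (an assumption, not part of the original code):
# it runs the same cleaning stages as above, in order, for any city.
def run_full_clean(city, base_dir):
    cleaner = DataCleaner(city, base_dir)
    for step in (cleaner.get_file_name, cleaner.error_point_name, cleaner.deleter_name,
                 cleaner.get_info, cleaner.error_point_info, cleaner.deleter_info,
                 cleaner.error_info_to_name, cleaner.deleter_info_to_name):
        step()

run_full_clean("Guangzhou", "/data/yyh/CityWander/")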