def test_spelling_correction(self):
    prep2 = Preprocessor(corpus1, spellcorrect=True)
    corpus = prep2.preprocessed_corpus()[0]
    self.assertEqual(corpus[0], ["sentences", "word", "<positiveemoji>", "butterfly", "<positiveemoji>"])
def test_lemmatization(self):
    prep3 = Preprocessor(corpus1, lemmatize=True)
    corpus = prep3.preprocessed_corpus()[0]
    self.assertEqual(corpus[1], ['this', 'be', 'the', 'longest', 'sentence', 'in', 'the', 'corpus'])
# return [sent.string.strip() for sent in doc.sents]
# time.sleep(0.5)
predict_df[data_columns] = predict_df[data_columns].progress_apply(
    lambda x: sent_tokenize(x))
predict_df = predict_df.explode(data_columns)
predict_df = predict_df.reset_index(drop=True)
predict_df = predict_df.reset_index(drop=False)

## do the preprocessing
print("Preprocess")
preprocessor = Preprocessor(
    doLower=args["doLower"],
    doLemmatization=args["doLemmatization"],
    removeStopWords=args["removeStopWords"],
    doSpellingCorrection=args["doSpellingCorrection"],
    removeNewLine=args["removeNewLine"],
    removePunctuation=args["removePunctuation"],
    removeHtmlTags=args["removeHtmlTags"],
    minTextLength=args["minTextLength"])
predict_df["processed"] = preprocessor.fit_transform(predict_df["text_german"])
predict_df = predict_df.dropna(subset=["processed"], axis=0)

print("Tokenize")
tokenizer = Tokenizer(tokenizeStr=preperation_technique,
                      ngram=preperation_ngram,
                      fasttextFile=args["fasttext_file"],
                      doLower=args["doLower"])
predict_df["processed"] = tokenizer.fit_transform(predict_df["processed"])
def main(model_name, batch_size=64, model_params=None, preprocessing_params=None, test_run=False):
    model_wrapper = model_dict[model_name](*model_params)
    preprocessor = Preprocessor(**preprocessing_params)
    model_dir = os.path.join('test_runs' if test_run else 'output',
                             model_wrapper.path(), preprocessor.path())
    success_path = os.path.join(model_dir, 'history')
    if not os.path.exists(success_path):
        best_model_path = os.path.join(model_dir, 'model-best.hdf5')
        shutil.rmtree(model_dir, ignore_errors=True)
        os.makedirs(model_dir)
        model = model_wrapper.get_model()
        callbacks = [
            ModelCheckpoint(best_model_path, monitor='val_loss', verbose=1,
                            save_best_only=True, mode='min'),
            EarlyStopping(patience=1 if test_run else 5, verbose=1)
        ]
        if test_run:
            print('test run')
            train_generator = preprocessor.get_train_generator(vs.TINY_TRAIN_DIR, batch_size)
            val_generator = preprocessor.get_test_generator(vs.TINY_VALIDATION_DIR, batch_size)
            test_generator = preprocessor.get_test_generator(vs.TINY_TEST_DIR, batch_size)
            print('class indices')
            print(train_generator.class_indices)
        else:
            print('true run')
            train_generator = preprocessor.get_train_generator(vs.TRAIN_DIR, batch_size)
            val_generator = preprocessor.get_test_generator(vs.VALIDATION_DIR, batch_size)
            test_generator = preprocessor.get_test_generator(vs.TEST_DIR, batch_size)
        train_samples = train_generator.samples
        val_samples = val_generator.samples
        history = model.fit_generator(
            train_generator,
            train_samples // batch_size,
            validation_data=val_generator,
            validation_steps=val_samples // batch_size + 1,
            epochs=100,
            callbacks=callbacks).history
        with open(success_path, 'wb') as f:
            pickle.dump(history, f)
        model = load_model(best_model_path)
        make_submission(model, test_generator, os.path.join(model_dir, 'submission.csv'))
    with open(success_path, 'rb') as f:
        history = pickle.load(f)
    print(model_wrapper)
    print(preprocessor)
    print('best loss %.3f' % min(history['val_loss']))
    print('best accuracy %.3f' % max(history['val_acc']))
    -------
    y_pred : ndarray, shape (n_samples,)
        The predicted target.
    """
    return self.model.predict(X)


if __name__ == "__main__":
    # load data
    print('Loading data...')
    data = build_dataset('train')
    train_data, valid_data = train_test_split(data, test_size=0.2, random_state=42)

    preprocessor = Preprocessor()
    preprocessor.fit(train_data)
    train_data = preprocessor.transform(train_data)
    valid_data = preprocessor.transform(valid_data)
    save_dataset(pd.concat([train_data, valid_data]), 'train_preprocessed.csv')

    X_train = train_data.drop(['Sales', 'Customers'], axis=1)
    X_valid = valid_data.drop(['Sales', 'Customers'], axis=1)
    y_train = train_data['Sales']
    y_valid = valid_data['Sales']

    print('Training model on', len(X_train), 'samples')
    print('Validating model on', len(X_valid), 'samples')
    print('Training model on features: ', X_train.columns.tolist())

    # model selection with grid search
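    # --- illustrative sketch (not from the original source) ---
    # One way the "model selection with grid search" step announced above could
    # look, assuming a scikit-learn regressor is used; the estimator and the
    # parameter grid here are placeholders, not the project's actual choices.
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import GridSearchCV

    param_grid = {
        'n_estimators': [100, 300],
        'max_depth': [None, 10, 20],
    }
    search = GridSearchCV(RandomForestRegressor(random_state=42),
                          param_grid,
                          scoring='neg_root_mean_squared_error',
                          cv=3,
                          n_jobs=-1)
    search.fit(X_train, y_train)
    print('Best params:', search.best_params_)
    print('Validation score:', search.score(X_valid, y_valid))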
import codecs

import numpy as np
from multinomialNB import MultinomialNB
from preprocessing import Preprocessor
from Scorer.scorer import BinaryScorer
from matplotlib import pyplot as plt

# Open file and read content in a variable.
# Couldn't use standard python way of opening files due to ASCII decode errors.
raw = codecs.open('./SMSSpamCollection.txt', 'r', encoding='utf-8').readlines()

# Create a Multinomial Naive Bayes Classifier, in this case we only have 2 classes
model = MultinomialNB()

# Preprocess, Tokenize and Split data in train and test
# IMPORTANT: Unless seed parameter is removed from call, the split will always be the same.
x_tr, y_tr, x_ts, y_ts = Preprocessor(data=raw).preprocess().tokenize().split(
    percentage_train=0.8, seed=555, shuffle=True, functional=False)

# Fit the model
model.fit(x_tr, y_tr)
print('Fit complete')

# Predict train and test values
pred_tr = model.predict(x_tr, alpha=0.1, voc_size=20000)
pred_ts = model.predict(x_ts, alpha=0.1, voc_size=20000)

# Print train and test predictions performance
train_scores = BinaryScorer(y_tr, pred_tr, description='Training').describe()
test_scores = BinaryScorer(y_ts, pred_ts, description='Testing').describe()

# Create list of alphas with different i value
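# --- illustrative sketch (not from the original source) ---
# A plausible continuation of the alpha sweep announced above: evaluate the
# classifier for several smoothing values and plot test accuracy. Accuracy is
# computed directly with numpy so no assumptions are made about the
# BinaryScorer API.
alphas = [10 ** -i for i in range(1, 6)]  # 0.1, 0.01, ...
accuracies = []
for a in alphas:
    preds = model.predict(x_ts, alpha=a, voc_size=20000)
    accuracies.append(np.mean(np.asarray(preds) == np.asarray(y_ts)))

plt.plot(alphas, accuracies, marker='o')
plt.xscale('log')
plt.xlabel('alpha (Laplace smoothing)')
plt.ylabel('test accuracy')
plt.show()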
def __init__(self, g):
    self.G = deepcopy(g)
    self.queryCount = 0
    # Do pre-processing and get the initial query set
    p = Preprocessor(self.G)
    self.Q = deepcopy(p.query)
    # Find the lower limit tree
    self.Tl = deepcopy(p.Tl)
    # Find the upper limit tree
    self.Tu = deepcopy(p.Tu)

    g = list(deepcopy(self.Tu))
    g.sort(key=lambda x: -x.upper)
    # g is the set of edges in the upper limit tree sorted in descending order of upper limits
    component = {}

    # DFS function to traverse the tree
    def dfs(u, par, adj, c):
        component[u] = c
        for v in adj[u]:
            if v.u + v.v - u != par:
                dfs(v.u + v.v - u, u, adj, c)

    # Function to check whether the edge set contains an always minimal edge
    def check(edgeSet):
        if len(edgeSet) <= 1:
            return False
        lowers = []
        for edge in edgeSet:
            lowers.append(edge.lower)
        lowers.sort()
        for edge in edgeSet:
            if edge.lower == lowers[0] and (edge.trivial or edge.upper <= lowers[1]):
                return False
        return True

    # Go through each edge in g
    for edge in g:
        # Construct the graph as an adjacency list
        adj = [[] for _ in range(self.G.size + 1)]
        erased = None
        for edge2 in self.Tu:
            if {edge2.u, edge2.v} == {edge.u, edge.v}:
                erased = edge2
        self.Tu.remove(erased)
        for edge2 in self.Tu:
            adj[edge2.u].append(edge2)
            adj[edge2.v].append(edge2)

        # Do two DFS - once from each component
        component.clear()
        dfs(edge.u, -1, adj, 0)
        dfs(edge.v, -1, adj, 1)

        # C stores the edge set denoting the "cut" created
        C = []
        for edge2 in self.G.edges:
            if component[edge2.u] != component[edge2.v]:
                C.append(edge2)

        # While C does not have an always minimal edge
        while check(C):
            firstLower = 1e9
            firstInd = 0
            # Find the two edges with minimum lower limits
            for i in range(len(C)):
                if C[i].lower < firstLower:
                    firstLower = C[i].lower
                    firstInd = i
            secondLower = 1e9
            secondInd = 0
            firstEdge = C[firstInd]
            for i in range(len(C)):
                if i == firstInd:
                    continue
                if C[i].lower < secondLower:
                    secondLower = C[i].lower
                    secondInd = i
            secondEdge = C[secondInd]
            assert secondEdge.lower <= firstEdge.upper

            # If the first edge is not trivial, then query it
            if not firstEdge.trivial:
                self.Q.add(deepcopy(firstEdge))
                self.G.query(firstEdge)
                C.remove(firstEdge)
                firstEdge.lower = firstEdge.actual
                firstEdge.upper = firstEdge.actual
                firstEdge.trivial = True
                C.append(firstEdge)

            # If the second edge is not trivial, then query it
            if not secondEdge.trivial:
                self.Q.add(deepcopy(secondEdge))
                self.G.query(secondEdge)
                C.remove(secondEdge)
                secondEdge.lower = secondEdge.actual
                secondEdge.upper = secondEdge.actual
                secondEdge.trivial = True
                C.append(secondEdge)

        # If an always minimal edge is found, erase it from the upper limit tree
        if len(C):
            if len(C) == 1:
                self.Tu.add(C[0])
            else:
                lowers = []
                for edge in C:
                    lowers.append(edge.lower)
                lowers.sort()
                for edge in C:
                    if edge.lower == lowers[0] and (edge.trivial or edge.upper <= lowers[1]):
                        self.Tu.add(edge)
                        break
from preprocessing import Preprocessor
from nn import train
import torch
import torch.nn as nn

p = Preprocessor(dimensions=2)
data, focus_words, contexts = p.run()
model = train(input_dimension=len(p.vocabulary),
              embedding_dimension=2,
              learning_rate=0.1,
              focus_words=focus_words,
              contexts=contexts)


# check for similarity between batman and wayne vs joker and wayne
def who_is_wayne(m):
    w1 = model.layer1.weight
    # stack to merge rows like row_1 -> col_1 with row_2 -> col_2 as a single row
    # print w1 in case you need an explanation!
    w1 = torch.stack((w1[0], w1[1]), dim=1)
    b1 = model.layer1.bias
    # word vectors we're looking for from word2vec
    # are actually the backprop updated weights and biases of the
    # hidden layer
    vectors = w1 + b1
    print("")
    print("Preparing vectors")
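    # --- illustrative sketch (not from the original source) ---
    # One way the batman/wayne vs joker/wayne comparison could continue from
    # here: cosine similarity between rows of `vectors`. It assumes
    # `p.vocabulary` is an ordered iterable of words matching the rows of
    # `vectors`; the original project's lookup may differ.
    def cosine_similarity(a, b):
        return torch.dot(a, b) / (torch.norm(a) * torch.norm(b) + 1e-8)

    idx = {w: i for i, w in enumerate(p.vocabulary)}  # assumed word-to-row mapping
    print("batman~wayne:", cosine_similarity(vectors[idx["batman"]], vectors[idx["wayne"]]).item())
    print("joker~wayne:", cosine_similarity(vectors[idx["joker"]], vectors[idx["wayne"]]).item())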
class PostprocessingWorker(threading.Thread):
    """ Python script for Postprocessing worker... runs until cancelled or till max waiting time """

    pause_time = 2
    max_waiting_time = 60 * 60  # 60 seconds * 60 min = 1 hour in seconds
    base_path = ""
    saves_path = "mating_progress/"
    pickle_prefix = ""
    get_save = False
    pop_path = "population/"
    traces_path = "traces_afterVox/"
    traces_backup_path = "traces_afterVox_backup/"
    traces_during_pp_path = "traces_duringPP/"
    traces_after_pp_path = "traces_afterPP/"
    debug = False
    db = None
    queue = []
    vox_preamble = 8  # number of lines that voxelyze adds before the actual output in a trace file
    config = ConfigParser.RawConfigParser()
    arena_x = 0
    arena_y = 0
    arena_type = ""
    end_time = 0
    timeTolerance = 0.0  # maximum mating time distance
    spaceTolerance = 0.01  # maximum mating distance radius
    one_child = False
    infertile_birth = False
    infertile_birth_percent = 0.1
    area_birthcontrol = False
    area_birthcontrol_radius = 0.05
    area_birthcontrol_cutoff = 25
    population_cap = False
    pp = Preprocessor()
    indiv_max_age = 0
    indiv_infertile = False
    indiv_infertile_span = 0.25
    random_birth_place = False
    queue_length = 1
    timestep = 0.002865
    pick_from_pool = False

    def readConfig(self, config_path):
        self.config.read(config_path)
        self.exp_name = self.config.get('Experiment', 'name')
        self.path_prefix = self.config.get('Experiment', 'path_prefix')
        self.debug = self.config.get('Experiment', 'debug')
        self.end_time = self.config.getfloat('Experiment', 'end_time')
        self.base_path = os.path.expanduser(self.path_prefix + self.exp_name) + "/"
        self.queue_length = self.config.getint('Postprocessing', 'queue_len')
        self.pop_path = self.config.get('Postprocessing', 'pop_path')
        self.traces_path = self.config.get('Postprocessing', 'traces_path')
        self.traces_backup_path = self.config.get('Postprocessing', 'traces_backup_path')
        self.traces_during_pp_path = self.config.get('Postprocessing', 'traces_during_pp_path')
        self.traces_after_pp_path = self.config.get('Postprocessing', 'traces_after_pp_path')
        self.vox_preamble = self.config.getint('Postprocessing', 'vox_preamble')
        self.timestep = self.config.getfloat('Postprocessing', 'timestep')
        self.pause_time = self.config.getint('Workers', 'pause_time')
        self.max_waiting_time = self.config.getint('Workers', 'max_waiting_time')
        self.timeTolerance = self.config.getfloat('Mating', 'timeTolerance')
        self.spaceTolerance = self.config.getfloat('Mating', 'spaceTolerance')
        self.indiv_infertile = self.config.getboolean('Mating', 'indiv_infertile')
        self.indiv_infertile_span = self.config.getfloat('Mating', 'indiv_infertile_span')
        self.one_child = self.config.getboolean('Mating', 'onlyOneChildPerParents')
        self.infertile_birth = self.config.getboolean('Mating', 'infertileAfterBirth')
        self.infertile_birth_percent = self.config.getfloat('Mating', 'infertileAfterBirthPercentage')
        self.area_birthcontrol = self.config.getboolean('Mating', 'areaBirthControl')
        self.area_birthcontrol_radius = self.config.getfloat('Mating', 'areaBirthControlRadius')
        self.area_birthcontrol_cutoff = self.config.getfloat('Mating', 'areaBirthControlCutoff')
        self.population_cap = self.config.getboolean('Mating', 'populationCap')
        self.random_birth_place = self.config.getboolean('Mating', 'randomBirthPlace')
        self.pick_from_pool = self.config.getboolean('Mating', 'pickFromPool')
        self.arena_x = self.config.getfloat('Arena', 'x')
        self.arena_y = self.config.getfloat('Arena', 'y')
        self.arena_type = self.config.get('Arena', 'type')
        self.indiv_max_age = self.config.getfloat('Population', 'indiv_max_age')

    def __init__(self, dbParams, config_path):
        threading.Thread.__init__(self)
        self.db = DB(dbParams[0], dbParams[1], dbParams[2], dbParams[3])
        self.readConfig(config_path)
        self.stopRequest = threading.Event()

    def run(self):
        """ main thread function
        :return: None
        """
        waitCounter = 0
        startTime = time.time()
        obs_path = os.path.normpath(self.base_path + self.traces_path)
        while not self.stopRequest.isSet():  # and waitCounter < self.max_waiting_time
            self.dirCheck(obs_path)
            if len(self.queue) > 0:
                print('PP:', list(map(self.getIDfromTrace, self.queue)))
                self.queue = sorted(self.queue, key=lambda id: int(self.getIDfromTrace(id)))
                item = self.queue[0]
                self.queue = self.queue[1:]
                if self.debug:
                    print("PP: working on id", item)
                self.markAsVoxelyzed(item)
                self.moveFilesToTmp(item)
                self.adjustTraceFile(item)
                self.traceToDatabase(item)
                self.findMates(item)
                babies = self.calculateOffspring(item)
                self.makeBabies(babies)
                self.moveFilesToFinal(item)
                self.markAsPostprocessed(item)
                waitCounter = 0
            else:
                if self.debug:
                    print("PP: found nothing")
                waitCounter += time.time() - startTime
                startTime = time.time()
                jobsRunning = self.db.getJobsWaitingCount()
                if self.debug:
                    print("PP: {n} jobs currently waiting in LISA queue...".format(n=jobsRunning))
                    print("PP: sleeping now for " + str(self.pause_time) + "s")
                self.stopRequest.wait(self.pause_time)
        print("PP: got exit signal... cleaning up")

    def join(self, timeout=None):
        """ function to terminate the thread (softly)
        :param timeout: not implemented yet
        :return: None
        """
        if self.debug:
            print("PP: got kill request for thread")
        self.stopRequest.set()
        super(PostprocessingWorker, self).join(timeout)

    def getIDfromTrace(self, file_path):
        path, filename = os.path.split(file_path)
        name_parts = filename.split(".")
        return name_parts[0]

    def dirCheck(self, path):
        """ upon start check if there are files in the target directory, because the watcher
        only notices files being moved there while running
        :return: None
        """
        unprocessed = [
            os.path.join(path, f) for f in os.listdir(path)
            if os.path.isfile(os.path.join(path, f)) and f.endswith('.trace')
        ]
        for todo in unprocessed:
            if todo not in self.queue:
                self.addFile(todo)

    def markAsVoxelyzed(self, todo):
        """ mark the individual as voxelyzed, i.e. as successfully processed by Voxelyze
        :param todo: string with the trace file path
        :return: None
        """
        id = self.getIDfromTrace(todo)
        self.db.markAsVoxelyzed(id)
        self.db.setJobDone(id)

    def markAsPostprocessed(self, todo):
        """ mark the individual as postprocessed, i.e. all offspring has been calculated,
        files have been moved and the individual is basically done
        :param todo: string with the trace file path
        :return: None
        """
        id = self.getIDfromTrace(todo)
        self.db.markAsPostprocessed(id)
        self.db.setFinalTime(id)

    def adjustTraceFile(self, todo):
        """ put the individual into an arena, correct its coordinates, etc.
        :param todo: string with the individual trace filepath
        :return: None
        """
        id = self.getIDfromTrace(todo)
        # get initial coordinates from DB
        indiv = self.db.getIndividual(id)
        first_trace = self.db.getFirstTrace(id)
        self.pp.addStartingPointArenaAndTime(self.getPathDuringPP(id), self.vox_preamble,
                                             self.arena_x, self.arena_y, self.arena_type,
                                             first_trace["x"], first_trace["y"],
                                             indiv["born"], self.end_time, self.timestep)

    def traceToDatabase(self, todo):
        """ put the individual's traces into the database
        :param todo: string with the individual trace filepath
        :return: None
        """
        id = self.getIDfromTrace(todo)
        with open(self.getPathDuringPP(id), 'r') as inputFile:
            traces = []
            fileAsList = inputFile.readlines()
            fileLen = len(fileAsList)
            for i in range(0, fileLen):
                fertile = 1
                if self.infertile_birth:
                    if i <= self.infertile_birth_percent * fileLen:
                        fertile = 0
                traceLine = fileAsList[i].split()
                traces.append([
                    id, traceLine[1], traceLine[2], traceLine[3], traceLine[4], fertile
                ])
        if len(traces) == 0:
            print("PP-WARNING: individual {indiv} has 0 traces, so skipping... "
                  "please check this though!".format(indiv=id))
        else:
            if self.debug:
                print("PP: adding {len} traces for individual {indiv} to DB".format(
                    len=len(traces), indiv=id))
            self.db.addTraces(id, traces)

    def getPotentialBirthplace(self, parent1, parent2):
        x = (parent1["x"] + parent2["x"]) / 2
        y = (parent1["y"] + parent2["y"]) / 2
        return [x, y]

    def filterGlobalInfertility(self, id, mates):
        pass

    def filterIncestControl(self, id, mates):
        pass

    def filterAreaBirthControl(self, id, mates):
        pass

    def filterPopulationCap(self, id, mates):
        if len(mates) > 0:
            mate = random.choice(mates)
        else:
            if not self.pick_from_pool:
                # then we mate the individual with itself
                lastTrace = self.db.getLastTrace(id)
                mate = {}
                mate["id"] = 0
                mate["indiv_id"] = id
                mate["ltime"] = lastTrace["ltime"]
                mate["x"] = lastTrace["x"]
                mate["y"] = lastTrace["y"]
                mate["z"] = lastTrace["z"]
                mate["mate_id"] = 0
                mate["mate_indiv_id"] = id
                mate["mate_ltime"] = lastTrace["ltime"]
                mate["mate_x"] = lastTrace["x"]
                mate["mate_y"] = lastTrace["y"]
                mate["mate_z"] = lastTrace["z"]
            else:
                return [None]
        return [mate]

    def calculateOffspring(self, todo):
        """ generate offspring, calculate where the new individuals met friends on the way
        :param todo: string with the individual trace filepath
        :return: list of babies to make
        """
        babies = []
        if (not os.path.exists(todo)) or os.path.getsize(todo) == 0:
            return babies
        id = self.getIDfromTrace(todo)
        if self.debug:
            print("PP: looking for mates for individual {indiv}...".format(indiv=id))
        mates = self.db.getMates(id)
        # population cap is exclusive - if it is on, no other control works
        if self.population_cap:
            mates = self.filterPopulationCap(id, mates)
        else:
            if self.indiv_infertile:
                mates = self.filterGlobalInfertility(id, mates)
            if self.one_child:
                mates = self.filterIncestControl(id, mates)
            if self.area_birthcontrol:
                mates = self.filterAreaBirthControl(id, mates)
        if mates != [None]:  # this happens only if self.pick_from_pool is True and no mate was found
            babies += self.matesToBabies(id, mates)
        else:
            randomMate = self.db.getRandomMate(id)
            babies += self.matesToBabies(randomMate["id"], [randomMate])
        return babies

    def close_in_time(self, t1, t2):
        return abs(t1['ltime'] - t2['ltime']) <= self.timeTolerance

    def close_in_space(self, t1, t2):
        return math.sqrt((t1['x'] - t2['x'])**2 + (t1['y'] - t2['y'])**2) <= self.spaceTolerance

    def findMates(self, indiv_path):
        id = self.getIDfromTrace(indiv_path)
        traces = self.db.getTraces(id)
        territory = self.db.getTerritory(id)
        lifetime = self.db.getLifetime(id)
        if not all(territory.values()) or not all(lifetime.values()):
            return
        possibleMates = self.db.getPossibleMates(id, territory, lifetime)
        mates = []
        for t in traces:
            for p in possibleMates:
                if self.close_in_time(t, p) and self.close_in_space(t, p):
                    mates.append((t, p))
        print('PP: found', len(mates), 'possible mates for individual', id)
        self.db.insertMates(mates)

    def matesToBabies(self, id, mates):
        babies = []
        for mate in mates:
            parent2 = {}
            parent2["id"] = mate["mate_id"]
            parent2["indiv_id"] = mate["mate_indiv_id"]
            parent2["ltime"] = mate["mate_ltime"]
            parent2["x"] = mate["mate_x"]
            parent2["y"] = mate["mate_y"]
            parent2["z"] = mate["mate_z"]
            babies.append([mate, parent2, mate["ltime"]])
        return babies

    def makeBabies(self, babies):
        for baby in babies:
            self.db.makeBaby(baby[0], baby[1], baby[2], self.one_child,
                             self.indiv_max_age * self.indiv_infertile_span,
                             self.arena_x, self.arena_y, self.random_birth_place)

    def getPathDuringPP(self, id):
        return self.base_path + self.traces_during_pp_path + str(id) + ".trace"

    def moveFilesToTmp(self, indiv):
        """ before postprocessing starts, back the trace file up and copy it to the working location
        :param indiv: string with the individual trace filepath
        :return: None
        """
        id = self.getIDfromTrace(indiv)
        try:
            shutil.copy2(indiv, self.base_path + self.traces_backup_path + str(id) + ".trace")
            shutil.copy2(indiv, self.getPathDuringPP(id))
        except:
            pass

    def moveFilesToFinal(self, indiv):
        """ once all postprocessing is done, move the files to their target destination
        :param indiv: string with the individual trace filepath
        :return: None
        """
        id = self.getIDfromTrace(indiv)
        if os.path.isfile(self.getPathDuringPP(id)):
            shutil.move(self.getPathDuringPP(id),
                        self.base_path + self.traces_after_pp_path + str(id) + ".trace")
        if os.path.isfile(indiv):
            os.remove(indiv)

    def addFile(self, path):
        self.queue.append(path)
if args["validation_split"]: train_df, val_df = train_test_split( train_df, test_size=args["validation_split"], random_state=42) else: logging.error("vaidation_split needs to be given.") sys.exit("vaidation_split needs to be given.") ## get data and train columns data_column = list(set(train_df.columns) - set(args["targets"]))[0] ## do the preprocessing print("Preprocess") preprocessor = Preprocessor( doLower=args["doLower"], doLemmatization=args["doLemmatization"], removeStopWords=args["removeStopWords"], doSpellingCorrection=args["doSpellingCorrection"], removeNewLine=args["removeNewLine"], removePunctuation=args["removePunctuation"]) train_df[data_column] = preprocessor.fit_transform( train_df[data_column]) val_df[data_column] = preprocessor.transform(val_df[data_column]) test_df[data_column] = preprocessor.transform(test_df[data_column]) ## save the preprocessed data if not os.path.exists(os.path.join(args["data_path"], "temp")): os.makedirs(os.path.join(args["data_path"], "temp")) train_df.to_pickle(train_pre_path) val_df.to_pickle(val_pre_path) test_df.to_pickle(test_pre_path) else:
import pandas as pd
from preprocessing import Preprocessor
# from attention import AttentionLayer
import tensorflow as tf
keras = tf.keras
from keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint

if __name__ == "__main__":
    path = "/Users/seungyoungoh/workspace/text_summarization_project/"
    data = pd.read_csv(path + "/data/sample.csv", error_bad_lines=False)
    data = data.rename({'body': 'src', 'key_point': 'smry'}, axis='columns')[['src', 'smry']]
    pr = Preprocessor(data)
    src_max_len, smry_max_len, src_vocab, smry_vocab, X_train, X_test, y_train, y_test = pr.preprocess()

    # ### modeling
    # embedding_dim = 128
    # hidden_size = 256

    # # encoder
    # encoder_inputs = Input(shape=(src_max_len,))

    # # encoder embedding layer
    # enc_emb = Embedding(src_vocab, embedding_dim)(encoder_inputs)

    # # encoder LSTM 1
    # encoder_lstm1 = LSTM(hidden_size, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
def __init__(self, g):
    self.G = deepcopy(g)
    self.queryCount = 0
    # Do pre-processing and get the initial query set
    p = Preprocessor(g)
    self.Q = deepcopy(p.query)
    # Find the lower limit tree
    self.Tl = deepcopy(p.Tl)
    # Find the upper limit tree
    self.Tu = deepcopy(p.Tu)

    f = self.G.edges
    removed = set()
    for edge in self.Tl:
        removed.add(edge)
    erased = set()
    for edge in f:
        if edge in removed:
            erased.add(edge)
    for edge in erased:
        f.remove(edge)
    # print("f is", f)
    # f is the set of edges not present in the lower limit tree
    C = []

    # DFS function to traverse the tree
    def dfs(u, par, adj, last, edges):
        if u == last:
            C[:] = edges
        for v in adj[u]:
            if v.u + v.v - u != par:
                edges.append(v)
                dfs(v.u + v.v - u, u, adj, last, edges)
                edges.remove(v)

    # Function to check whether the edge set contains an always maximal edge
    def check(edgeSet):
        if len(edgeSet) <= 1:
            return False
        uppers = []
        for edge in edgeSet:
            uppers.append(edge.upper)
        uppers.sort()
        for edge in edgeSet:
            if edge.upper == uppers[-1] and (edge.trivial or edge.lower >= uppers[-2]):
                return False
        return True

    # Go through each edge in f
    for edge in f:
        # Construct the graph using an adjacency list
        adj = [[] for _ in range(self.G.size + 1)]
        for edge2 in self.Tl:
            adj[edge2.u].append(edge2)
            adj[edge2.v].append(edge2)
        C[:] = []
        dfs(edge.u, -1, adj, edge.v, [])
        C.append(edge)
        # C is the set of edges which denote the cycle formed by adding 'edge'
        self.Tl.add(edge)

        # While C does not contain an always maximal edge
        while check(C):
            uppers = []
            firstUpper = 0
            firstEdge = 0
            # Find the two edges with maximum upper limits
            for i in range(len(C)):
                if C[i].upper > firstUpper:
                    firstUpper = C[i].upper
                    firstInd = i
            secondUpper = 0
            secondInd = 0
            firstEdge = C[firstInd]
            for i in range(len(C)):
                if i == firstInd:
                    continue
                if C[i].upper > secondUpper:
                    secondUpper = C[i].upper
                    secondInd = i
            secondEdge = C[secondInd]
            print("firstEdge ", firstEdge)
            print("secondEdge ", secondEdge)

            # If the first edge is not trivial, then query it
            if (not firstEdge.trivial) and (secondEdge.trivial or firstEdge.cost <= secondEdge.cost):
                print("adding 1st", firstEdge)
                self.Q.add(deepcopy(firstEdge))
                self.Tl.remove(firstEdge)
                self.G.query(firstEdge)
                C.remove(firstEdge)
                firstEdge.lower = firstEdge.actual
                firstEdge.upper = firstEdge.actual
                firstEdge.trivial = True
                self.Tl.add(firstEdge)
                C.append(firstEdge)
            # If the second edge is not trivial, then query it
            elif (not secondEdge.trivial) and (firstEdge.trivial or secondEdge.cost < firstEdge.cost):
                print("adding 2nd", secondEdge)
                self.Q.add(deepcopy(secondEdge))
                self.Tl.remove(secondEdge)
                self.G.query(secondEdge)
                C.remove(secondEdge)
                secondEdge.lower = secondEdge.actual
                secondEdge.upper = secondEdge.actual
                secondEdge.trivial = True
                self.Tl.add(secondEdge)
                C.append(secondEdge)

        # If an always maximal edge is found, erase it from the lower limit tree
        if len(C):
            uppers = []
            for edge in C:
                uppers.append(edge.upper)
            uppers.sort()
            uCon, vCon = None, None
            for edge in C:
                if edge.upper == uppers[-1] and (edge.trivial or edge.lower >= uppers[-2]):
                    self.Tl.remove(edge)
                    break
def trainingModel(self):
    # Logging the start of Training
    self.log_writer.log(self.file_object, "ModelTrainingLog", 'Start of Training')

    # Getting the data from the source
    data_getter = data_loader.Data_Getter(self.file_object, self.log_writer)
    data = data_getter.get_data()

    """doing the data preprocessing"""
    preprocessor = Preprocessor(self.file_object, self.log_writer)
    data = preprocessor.remove_columns(
        data, ['Wafer', "Unnamed: 0"]
    )  # remove the unnamed column as it doesn't contribute to prediction.

    # create separate features and labels
    X, Y = preprocessor.separate_label_feature(data, label_column_name='Good/Bad')

    is_null_present = preprocessor.is_null_present(X)
    # if missing values are there, replace them appropriately.
    if is_null_present:
        X = preprocessor.impute_missing_values(X)  # missing value imputation

    # check further which columns do not contribute to predictions
    # if the standard deviation for a column is zero, it means that the column has constant values
    # and they are giving the same output both for good and bad sensors
    # prepare the list of such columns to drop
    cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(X)

    # drop the columns obtained above
    X = preprocessor.remove_columns(X, cols_to_drop)

    """ Applying the clustering approach"""
    kmeans = clustering.KMeansClustering(self.file_object, self.log_writer)  # object initialization.
    number_of_clusters = kmeans.elbow_plot(X)  # using the elbow plot to find the number of optimum clusters

    # Divide the data into clusters
    X = kmeans.create_clusters(X, number_of_clusters)

    # create a new column in the dataset consisting of the corresponding cluster assignments.
    X['Labels'] = Y

    # getting the unique clusters from our dataset
    list_of_clusters = X['Cluster'].unique()

    """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""
    for i in list_of_clusters:
        cluster_data = X[X['Cluster'] == i]  # filter the data for one cluster

        # Prepare the feature and Label columns
        cluster_features = cluster_data.drop(['Labels', 'Cluster'], axis=1)
        cluster_label = cluster_data['Labels']

        # splitting the data into training and test set for each cluster one by one
        x_train, x_test, y_train, y_test = train_test_split(
            cluster_features, cluster_label, test_size=1 / 3, random_state=355)

        model_finder = tuner.Model_Finder(self.file_object, self.log_writer)  # object initialization

        # getting the best model for each of the clusters
        best_model_name, best_model = model_finder.get_best_model(x_train, y_train, x_test, y_test)

        # saving the best model to the directory.
        file_op = file_methods.File_Operation(self.file_object, self.log_writer)
        save_model = file_op.save_model(best_model, best_model_name + str(i))

    # logging the successful Training
    self.log_writer.log(self.file_object, "ModelTrainingLog", 'Successful End of Training')
def test_emoji_replacement(self):
    prep4 = Preprocessor(corpus1, remove_short_tweets=False, verbose_emoji=True)
    corpus = prep4.preprocessed_corpus()[0]
    self.assertEqual(corpus[1], ['short', ':fearful_face:'])
def main(): """Main entry point""" # Unclear why this is needed but I get BLAS errors otherwise physical_devices = tf.config.list_physical_devices('GPU') tf.config.experimental.set_memory_growth(physical_devices[0], True) # Valiate command line args parser = argparse.ArgumentParser( description="Fit and evaluate all models in `models.py`") parser.add_argument('label') parser.add_argument('epochs', type=int) parser.add_argument('batch_size', type=int) opts = parser.parse_args(sys.argv[1:]) filename = os.path.join("output", opts.label+".json") if not os.path.exists(filename): parser.error("Could not find JSON preprocessed data: %s" % filename) # Create image output directory if it doesn't exist if not os.path.exists("images"): os.mkdir("images") # De-serialize preprocessor with open(filename, "r") as this_file: json_txt = this_file.read() pre_proc = Preprocessor() pre_proc.from_json(json_txt) # Create the models models_dict = models.generate_models(pre_proc) # Fitting with cross-validation kfolds = KFold(n_splits=models.K_FOLDS) results = {} # Loop over all models... for model_name, fold_models in models_dict.items(): # Reshape X based on model type... standard neural networks # take a different shape than LSTM and have an additional input # for position in vector. This is handled automatically for LSTM # networks. if model_name[0:4] == "DENS": x_data = pre_proc.x_train y_data = pre_proc.y_train elif model_name[0:4] == "LSTM": x_data, y_data, _, _ = pre_proc.get_rnn_format() else: raise ValueError("Uknown model prefix: %s" % model_name[0:4]) begin_time = time.time() train_err = [] val_err = [] history = [] model_idx = 0 for train_idx, val_idx in kfolds.split(x_data, y_data): # Grab the model for this fold model = fold_models[model_idx] train_dataset = tf.data.Dataset.from_tensor_slices(( tf.cast(x_data[train_idx], tf.float32), tf.cast(y_data[train_idx], tf.float32), )) train_dataset = train_dataset.batch(opts.batch_size) val_dataset = tf.data.Dataset.from_tensor_slices(( tf.cast(x_data[val_idx], tf.float32), tf.cast(y_data[val_idx], tf.float32), )) val_dataset = val_dataset.batch(opts.batch_size) # No suffle, already done hist = model.fit( x=train_dataset, epochs=opts.epochs, shuffle=False, # Shuffle already done verbose=0, callbacks=callbacks, validation_data=val_dataset, ) # Story history and cycle to next fold history.append(hist) train_err.append(hist.history['loss'][-1]) val_err.append(hist.history['val_loss'][-1]) print("{0:10} {1:7.4f} {2:7.4f}".format( model_name, train_err[-1], val_err[-1])) model_idx += 1 # Done with all the folds end_time = time.time() fit_time = end_time-begin_time print("time: {0:7.2f}".format(fit_time)) results[model_name] = (np.mean(train_err), np.mean(val_err), fit_time) fig = go.Figure() for i in range(models.K_FOLDS): fig.add_trace(go.Scatter( x=[t for t in range(len(history[i].history['loss']))], y=history[i].history['loss'], name="Train Fold {}".format(i+1), mode='lines', )) fig.add_trace(go.Scatter( x=[t for t in range(len(history[i].history['val_loss']))], y=history[i].history['val_loss'], name="Val Fold {}".format(i+1), mode='lines', )) fig.update_xaxes(title="Epoch") fig.update_yaxes(title="Loss") plot(fig) df_summary = pd.DataFrame(results).transpose() df_summary.columns = ['train', 'val', 'time'] df_summary['labels'] = df_summary.index df_summary = df_summary[['labels', 'train', 'val', 'time']] outfile = os.path.join("output", "{0}_param_search.csv".format(opts.label)) df_summary.to_csv(outfile)
def test_has_proper_noun(self):
    prep7 = Preprocessor([])
    self.assertTrue(prep7.has_proper_noun("Bernie will win."))
    self.assertFalse(prep7.has_proper_noun("not impeachable!"))
    self.assertFalse(prep7.has_proper_noun("🤕"))
    Directory of files used for training.
    """)
parser.add_argument('--architecture', type=str, default='vgg', help="""\
    Model architecture to use.
    """)
FLAGS, unparsed = parser.parse_known_args()

print("Loading model...")
tf.logging.set_verbosity(tf.logging.INFO)
sess = tf.InteractiveSession()

preprocessor = Preprocessor(feature_count=40, window_size_ms=20, window_stride_ms=10)
fingerprint_input = tf.placeholder(tf.float32, [None, preprocessor.fingerprint_size],
                                   name='fingerprint_input')
fingerprint_input_4d = tf.reshape(
    fingerprint_input,
    [-1, preprocessor.feature_count, preprocessor.window_number, 1])
logits = create_model(FLAGS.architecture, fingerprint_input_4d,
                      {'label_count': len(CLASSES)}, is_training=False)
predicted_indices = tf.argmax(logits, 1)
tf.global_variables_initializer().run()

if FLAGS.model:
from preprocessing import Preprocessor
import pandas as pd
import numpy as np
import boto

df = pd.read_csv('stackodata.csv')
p = Preprocessor()
cleaned_data = p.transform(df.values)
df_cleaned = pd.DataFrame(cleaned_data, columns=[
    'id', 'title', 'qscore', 'ascore', 'tags', 'q_nocode', 'q_code', 'a_nocode', 'a_code'
])
df_cleaned.to_csv('stack_data_cleaned.csv', encoding='utf-8')
db, log = DBProvider(), Log(config, 'update')
now_morning = datetime(now.year, now.month, now.day, 4)

start_time = time.time()
log.debug(f"Clear RAM {config.OS.clear_ram()}")
log.debug(f'Current DB is {config.Database}')
log.debug(f"Update current_matches to {now_morning}")
updater = CurrentUpdater(LeagueScraper(),
                         MatchScraper(from_time=None, to_time=now_morning),
                         db, log)
updater.update()
log.debug(f"Updated current_matches for {int(time.time() - start_time)} sec")

next_day = now + timedelta(days=1)
next_day_morning = datetime(next_day.year, next_day.month, next_day.day, 4)
start_time = time.time()
log.debug(f"Update future_matches from {now_morning} to {next_day_morning}")
updater = FutureUpdater(FutureLeagueScraper(),
                        MatchScraper(from_time=now_morning, to_time=next_day_morning),
                        db, log)
updater.update()
log.debug(f"Updated future_matches for {int(time.time() - start_time)} sec")

start_time = time.time()
log.debug(f"Clear RAM {config.OS.clear_ram()}")
preprocessor = Preprocessor(db, log)
log.debug('Preprocess matches')
num_matches = preprocessor.preprocess()
log.debug(f'Preprocessed {num_matches} matches for {int(time.time() - start_time)} sec')
def setup_to_train(self, train_data=None, dev_data=None, test_data=None):
    # create a model directory:
    if os.path.isdir(self.model_dir):
        shutil.rmtree(self.model_dir)
    os.mkdir(self.model_dir)

    self.train_tokens = train_data['token']
    if self.include_test:
        self.test_tokens = test_data['token']
    if self.include_dev:
        self.dev_tokens = dev_data['token']

    idx_cnt = 0
    if self.include_lemma:
        self.lemma_out_idx = idx_cnt
        idx_cnt += 1
        self.train_lemmas = train_data['lemma']
        self.known_lemmas = set(self.train_lemmas)
        if self.include_dev:
            self.dev_lemmas = dev_data['lemma']
        if self.include_test:
            self.test_lemmas = test_data['lemma']
    if self.include_pos:
        self.pos_out_idx = idx_cnt
        idx_cnt += 1
        self.train_pos = train_data['pos']
        if self.include_dev:
            self.dev_pos = dev_data['pos']
        if self.include_test:
            self.test_pos = test_data['pos']
    if self.include_morph:
        self.morph_out_idx = idx_cnt
        self.train_morph = train_data['morph']
        if self.include_dev:
            self.dev_morph = dev_data['morph']
        if self.include_test:
            self.test_morph = test_data['morph']

    self.preprocessor = Preprocessor().fit(
        tokens=self.train_tokens,
        lemmas=self.train_lemmas,
        pos=self.train_pos,
        morph=self.train_morph,
        include_lemma=self.include_lemma,
        include_morph=self.include_morph,
        max_token_len=self.max_token_len,
        focus_repr=self.focus_repr,
        min_lem_cnt=self.min_lem_cnt,
    )
    self.pretrainer = Pretrainer(nb_left_tokens=self.nb_left_tokens,
                                 nb_right_tokens=self.nb_right_tokens,
                                 size=self.nb_embedding_dims,
                                 minimum_count=self.min_token_freq_emb)
    self.pretrainer.fit(tokens=self.train_tokens)

    train_transformed = self.preprocessor.transform(tokens=self.train_tokens,
                                                    lemmas=self.train_lemmas,
                                                    pos=self.train_pos,
                                                    morph=self.train_morph)
    if self.include_dev:
        dev_transformed = self.preprocessor.transform(tokens=self.dev_tokens,
                                                      lemmas=self.dev_lemmas,
                                                      pos=self.dev_pos,
                                                      morph=self.dev_morph)
    if self.include_test:
        test_transformed = self.preprocessor.transform(tokens=self.test_tokens,
                                                       lemmas=self.test_lemmas,
                                                       pos=self.test_pos,
                                                       morph=self.test_morph)

    self.train_X_focus = train_transformed['X_focus']
    if self.include_dev:
        self.dev_X_focus = dev_transformed['X_focus']
    if self.include_test:
        self.test_X_focus = test_transformed['X_focus']
    if self.include_lemma:
        self.train_X_lemma = train_transformed['X_lemma']
        if self.include_dev:
            self.dev_X_lemma = dev_transformed['X_lemma']
        if self.include_test:
            self.test_X_lemma = test_transformed['X_lemma']
    if self.include_pos:
        self.train_X_pos = train_transformed['X_pos']
        if self.include_dev:
            self.dev_X_pos = dev_transformed['X_pos']
        if self.include_test:
            self.test_X_pos = test_transformed['X_pos']
    if self.include_morph:
        self.train_X_morph = train_transformed['X_morph']
        if self.include_dev:
            self.dev_X_morph = dev_transformed['X_morph']
        if self.include_test:
            self.test_X_morph = test_transformed['X_morph']

    self.train_contexts = self.pretrainer.transform(tokens=self.train_tokens)
    if self.include_dev:
        self.dev_contexts = self.pretrainer.transform(tokens=self.dev_tokens)
    if self.include_test:
        self.test_contexts = self.pretrainer.transform(tokens=self.test_tokens)

    print('Building model...')
    nb_tags = None
    try:
        nb_tags = len(self.preprocessor.pos_encoder.classes_)
    except AttributeError:
        pass
    nb_morph_cats = None
    try:
        nb_morph_cats = self.preprocessor.nb_morph_cats
    except AttributeError:
        pass
    max_token_len, token_char_dict = None, None
    try:
        max_token_len = self.preprocessor.max_token_len
        token_char_dict = self.preprocessor.token_char_dict
    except AttributeError:
        pass
    max_lemma_len, lemma_char_dict = None, None
    try:
        max_lemma_len = self.preprocessor.max_lemma_len
        lemma_char_dict = self.preprocessor.lemma_char_dict
    except AttributeError:
        pass
    nb_lemmas = None
    try:
        nb_lemmas = len(self.preprocessor.lemma_encoder.classes_)
    except AttributeError:
        pass

    self.model = build_model(
        token_len=max_token_len,
        token_char_vector_dict=token_char_dict,
        lemma_len=max_lemma_len,
        nb_tags=nb_tags,
        nb_morph_cats=nb_morph_cats,
        lemma_char_vector_dict=lemma_char_dict,
        nb_encoding_layers=self.nb_encoding_layers,
        nb_dense_dims=self.nb_dense_dims,
        nb_embedding_dims=self.nb_embedding_dims,
        nb_train_tokens=len(self.pretrainer.train_token_vocab),
        nb_context_tokens=self.nb_context_tokens,
        pretrained_embeddings=self.pretrainer.pretrained_embeddings,
        include_token=self.include_token,
        include_context=self.include_context,
        include_lemma=self.include_lemma,
        include_pos=self.include_pos,
        include_morph=self.include_morph,
        nb_filters=self.nb_filters,
        filter_length=self.filter_length,
        focus_repr=self.focus_repr,
        dropout_level=self.dropout_level,
        nb_lemmas=nb_lemmas,
    )
    self.save()
    self.setup = True
# evaluating SVM using cross validation
print("Evaluating model with cross validation...")
if speaker_indipendence:
    k_folds = len(db.test_sets)
    splits = zip(db.train_sets, db.test_sets)
else:
    k_folds = 10
    # cross-validation, split into k subsets
    sss = StratifiedShuffleSplit(n_splits=k_folds, test_size=0.2, random_state=1)
    splits = sss.split(Fglobal, y)

# setting preprocessing
pp = Preprocessor('standard', n_components=50)
n_classes = len(db.classes)
clf = OneVsRestClassifier(svm.SVC(kernel='rbf', C=10, gamma=0.01))
# C is the penalty parameter: larger values fit the training data more closely but generalize less well
# gamma is the RBF kernel coefficient
# one vs rest, also known as one-vs-all

prfs = []
scores = []
acc = np.zeros(n_classes)
mi_threshold = 0.0
for (train, test) in splits:
    # selecting features using mutual information
    Ftrain = Fglobal[train]
    Ftest = Fglobal[test]
    f_subset = pp.mutual_info_select(Ftrain, y[train], mi_threshold)