def parse(path, crawl = False):
    if crawl == True:
        raise StandardError()
    pos_filename = os.path.join(path, "pos.lst")
    neg_filename = os.path.join(path, "neg.lst")
    pos_dir = os.path.join(path, "pos")
    neg_dir = os.path.join(path, "neg")
    if not os.path.isfile(pos_filename):
        print "%s is not a file." % (pos_filename,)
        return None
    if not os.path.isfile(neg_filename):
        print "%s is not a file." % (neg_filename,)
        return None
    if not os.path.isdir(pos_dir):
        print "%s is not a directory." % (pos_dir,)
        return None
    if not os.path.isdir(neg_dir):
        print "%s is not a directory." % (neg_dir,)
        return None
    ret = DataSet()
    pos = open(pos_filename, "r")
    pos_names = [line[line.rfind("/") + 1:] for line in pos.read().split()]
    pos.close()
    for name in pos_names:
        filename = os.path.join(pos_dir, name)
        ret.add_obj(name, WholeImage(name))
    neg = open(neg_filename, "r")
    neg_names = [line[line.rfind("/") + 1:] for line in neg.read().split()]
    neg.close()
    for name in neg_names:
        ret.add_empty_image(name)
    return ret
def __init__(self, data, interval_type=ClassIntervalType.ROOT):
    f = []
    for d in data:
        f.append(float(d))
    data = f
    DataSet.__init__(self, data)
    self.interval_type = interval_type
    if self.interval_type != ClassIntervalType.THREESIGMA:
        self.class_interval = self.calc_class_interval(interval_type, self.min, self.max, self.n)
        self.construct_bins(self.min, self.max, self.class_interval, False)
    else:
        sigma_span = 6
        min = self.mean - self.stdev * (sigma_span / 2)
        max = self.mean + self.stdev * (sigma_span / 2)
        self.class_interval = self.calc_class_interval(ClassIntervalType.THREESIGMA, min, max, sigma_span)
        self.construct_bins(min, max, self.class_interval, True)
    self.fill_bins()
    self.sort_bins()
    total = 0
    for bin in self.bins:
        total = total + bin.count()
    self.bin_contents_count = total
def eval_classifier(classifierToUse, featuresToUse, testOrTrain="train"):
    print("Chosen feature: {0}".format(featuresToUse))
    print("Chosen classifier: {0}".format(classifierToUse))
    fe = FeatureExtractor(featuresToUse)
    dataset = DataSet(fe)
    classifier = Classifier()
    evaluate = Evaluation()
    print "test or Train %s" % testOrTrain
    for feature_class, files in getTestData(testOrTrain).items():
        print "%s" % testOrTrain
        for f in files:
            dataset.addFile(feature_class, f)
    print "Dataset initialized"
    print_class_stats(dataset.classes)
    print "Test set created."
    a_train, a_test, c_train, c_test = train_test_split(dataset.featureVector, dataset.classes, test_size=0.9)
    c_pred = classifier.classification(a_train, a_test, c_train, c_test, classifierToUse)
    evaluate.evaluate(c_pred, c_test, featuresToUse, classifierToUse)
def __vectorize(self, data):
    """\
    Train vectorization and subsequently vectorize. Accepts a DataSet or a list
    of dictionaries to be vectorized.
    """
    # no vectorization performed, only converted to matrix
    if self.vectorizer is None:
        if not isinstance(data, DataSet):
            data_set = DataSet()
            data_set.load_from_dict(data)
            data = data_set
        data.match_headers(self.data_headers, add_values=True)
        # TODO pre-filtering here?
        return data.as_bunch(target=self.class_attr, select_attrib=self.select_attr).data
    # vectorization needed: converted to dictionary
    # and passed to the vectorizer
    if isinstance(data, DataSet):
        data = data.as_dict(select_attrib=self.select_attr, mask_attrib=self.class_attr)
    else:
        data = [{key: val for key, val in inst.items()
                 if key != self.class_attr and key in self.select_attr}
                for inst in data]
    # pre-filter attributes if filter_attr is set
    if self.filter_attr:
        data = [{key: val for key, val in inst.items() if self.filter_attr(key, val)}
                for inst in data]
    if not self.vectorizer_trained:
        self.vectorizer.fit(data)
        self.vectorizer_trained = True
    return self.vectorizer.transform(data).tocsr()
def extract_data(raw_data_file, format_data_path, n_vectors, n_components, shift=-1, n_datums=-1, test_percentage=0):
    """
    Extracts the raw data from raw_data_file and, using the given formatting parameters,
    builds two preformatted data files, train and test.
    A data item (datum) is a matrix of shape (n_vectors, n_components*20), where n_vectors
    is the number of vectors that make up one data item and n_components is the number of
    components kept for each vector.

    :param file raw_data_file: File containing the raw data
    :param str format_data_path: Path to the directory where the preformatted data are written
    :param int n_vectors: Number of vectors considered as one data item
    :param int n_components: Number of components kept for each vector. If 1, a vector only
        contains the MFCCs; if 2, the MFCCs and their first derivatives; if 3, the second
        derivatives as well.
    :param int shift: Shift/overlapping. Number of vectors shared between the last extracted
        data item and the next one. Warning! Leave at -1 to disable overlapping. Introducing
        overlapping gives overestimated training results (see the internship report).
    :param int n_datums: Number of data items to read from the raw data file before stopping.
        -1 to extract data from the whole file.
    :param float test_percentage: Expected ratio of the number of items put in the
        generalization (test) set to the number put in the training set.
        (*100 = percentage of test data)
    :return: The train and test databases (as DataSet instances)
    """
    train = DataSet(format_data_path, "train")
    test = DataSet(format_data_path, "test")
    data = []
    datum = []
    feature_list = []
    line_count = 0
    total_line_count = 0
    for feature in raw_data_file:
        line_count += 1
        if feature[0] == ' ':
            # New data vector
            feature_list = feature.split()
            if feature_list[-1] == ']':
                feature_list.pop()  # remove ending "]" for the last vector of the signal
            datum.append([float(x) for x in feature_list[:(20 * n_components)]])
            if len(datum) >= n_vectors:
                # Append the datum
                data.append(datum)
                # Shift the datum
                datum = datum[shift:] if shift > 0 else []
                if len(data) % 20000 == 0:
                    print "extract data >> ", len(data), " datums extracted for", line_count, "lines read"
        else:
            # New signal
            new_str_label = feature.split('#')[0]
            if new_str_label != DataSet.str_label:
                if data:
                    # There is data to split in train/test
                    DataSet.split_train_test(data, test_percentage, train, test)
                    # Append to files
                    train.flush_buffer()
                    test.flush_buffer()
                    data = []
                    print "SPLIT : ", "train =", len(train), " - test =", len(test)
                    print "Line count for this label : ", line_count
                    print "TOTAL : ", len(train) + len(test), " datums extracted for", total_line_count + line_count, "lines read"
                    if n_datums > 0 and len(train) + len(test) >= n_datums:
                        break
                # Update current label
                DataSet.update_label(new_str_label)
                print "New LABEL : ", DataSet.str_label, "int : ", DataSet.int_label
                total_line_count += line_count
                line_count = 0
            datum = []
    print "extract data >> GRAND TOTAL : ", (len(train) + len(test)), " datums extracted for", total_line_count + line_count, "lines read"
    return train, test
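# A minimal usage sketch for extract_data. The feature file name, output directory and
# parameter values below are hypothetical; only the signature and return values above
# are taken from the code.
if __name__ == '__main__':
    with open("raw_mfcc_features.txt", "r") as raw_file:
        train_set, test_set = extract_data(raw_file, "formatted_data/",
                                           n_vectors=30, n_components=3,
                                           shift=-1, test_percentage=0.1)
    print "train items:", len(train_set), "- test items:", len(test_set)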
def setField(self, label, arr, **kwargs):
    """Set the given array `arr` as the new array of the field specified by `label`."""
    DataSet.setField(self, label, arr, **kwargs)
    # refresh dimensions, in case any of these fields were modified
    if label == 'input':
        self.indim = self.getDimension('input')
    elif label == 'target':
        self.outdim = self.getDimension('target')
def load_training_set(self, filename, encoding='UTF-8'):
    """\
    Load the given training data set into memory and strip it if configured to
    via the train_part parameter.
    """
    log_info('Loading training data set from ' + str(filename) + '...')
    train = DataSet()
    train.load_from_arff(filename, encoding)
    if self.train_part < 1:
        train = train.subset(0, int(round(self.train_part * len(train))), copy=False)
    return train
def parse(path, crawl = False):
    if crawl == True:
        raise StandardError()
    ret = DataSet()
    filenames = os.listdir(path)
    for filename in filenames:
        # TODO : check validity
        (fname, width, height, chans, bboxes) \
            = parse_file(os.path.join(path, filename))
        fname = os.path.basename(fname)
        for bbox in bboxes:
            ret.add_obj(fname, bbox, height, width)
    return ret
def ResultsToXY(sets, x, y, foreach=[]):
    """
    Combines observables x and y to build a list of DataSet with y vs x.

    This function is used to collect data from a hierarchy of DataSet objects, to prepare
    plots or evaluation. The inner-most list has to contain one DataSet with
    props['observable'] = x and one with props['observable'] = y; this will be the x-y pair
    used in the collection.

    The parameters are:
      sets:    hierarchy of datasets where the inner-most list must contain the x-y pair
      x:       the name of the observable to be used as x-value of the collected results
      y:       the name of the observable to be used as y-value of the collected results
      foreach: an optional list of properties used for grouping the results. A separate
               DataSet object is created for each unique set of values of the specified
               parameters.

    The function returns a list of DataSet objects.
    """
    dd = depth(sets)
    if dd < 2:
        raise Exception('The input hierarchy does not provide a unique pair x-y. '
                        'The input structure has to be a list of lists as minimum. '
                        'pyalps.groupSets might help you.')
    hgroups = flatten(sets, fdepth=-1)
    foreach_sets = {}
    for gg in hgroups:
        xset = None
        yset = None
        for d in gg:
            if d.props['observable'] == x:
                xset = d
            if d.props['observable'] == y:
                yset = d
        if xset is None or yset is None:
            continue
        common_props = dict_intersect([d.props for d in gg])
        fe_par_set = tuple((common_props[m] for m in foreach))
        if not fe_par_set in foreach_sets:
            foreach_sets[fe_par_set] = DataSet()
            foreach_sets[fe_par_set].props = common_props
            foreach_sets[fe_par_set].props['xlabel'] = x
            foreach_sets[fe_par_set].props['ylabel'] = y
        if len(xset.y) == len(yset.y):
            foreach_sets[fe_par_set].x = np.concatenate((foreach_sets[fe_par_set].x, xset.y))
            foreach_sets[fe_par_set].y = np.concatenate((foreach_sets[fe_par_set].y, yset.y))
        elif len(xset.y) == 1:
            foreach_sets[fe_par_set].x = np.concatenate((foreach_sets[fe_par_set].x, np.array([xset.y[0]] * len(yset.y))))
            foreach_sets[fe_par_set].y = np.concatenate((foreach_sets[fe_par_set].y, yset.y))
    for k, res in foreach_sets.items():
        order = np.argsort(res.x, kind='mergesort')
        res.x = res.x[order]
        res.y = res.y[order]
        res.props['label'] = ''
        for p in foreach:
            res.props['label'] += '%s = %s ' % (p, res.props[p])
    return foreach_sets.values()
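# A minimal usage sketch for ResultsToXY. Assumption: `results` is already a hierarchy of
# DataSet lists (e.g. the output of pyalps.groupSets) containing 'Temperature' and
# 'Magnetization' as props['observable'] values; the observable names and the 'L' property
# are illustrative only.
import matplotlib.pyplot as plt

grouped = ResultsToXY(results, 'Temperature', 'Magnetization', foreach=['L'])
for ds in grouped:
    plt.plot(ds.x, ds.y, label=ds.props['label'])
plt.legend()
plt.show()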
def load_test_data(self, sessions_df):
    data_df = read_from_csv(self.task_core.test_data_file, self.task_core.n_seed
                            # , max_rows=50000
                            )
    cache_file = os.path.join(self.task_core.cache_dir, 'features_test_' + str(len(data_df.index)) + '.p')
    if os.path.isfile(cache_file):
        print('Loading test features from file')
        x = DataSet.load_from_file(cache_file)
    else:
        x = ds_from_df(data_df, sessions_df, True)
        print('saving test features to file')
        DataSet.save_to_file(x, cache_file)
    return x
def __reduce__(self):
    # FIXME: This does actually not feel right: We have to use the DataSet
    # method here, although we inherit from sequential dataset.
    _, _, state, _, _ = DataSet.__reduce__(self)
    creator = self.__class__
    args = self.statedim, self.actiondim
    return creator, args, state, iter([]), iter({})
def _trim_data(self, extension_fraction=None, max_interval=None):
    """
    Toss out data outside of (extended) view range, and closer than max_interval seconds apart.
    """
    if extension_fraction is None:
        start_stamp = self._start_stamp
        end_stamp = self._end_stamp
    else:
        extension = rospy.Duration((self._end_stamp - self._start_stamp).to_sec() * extension_fraction)
        if extension.to_sec() >= self._start_stamp.to_sec():
            start_stamp = rospy.Time(0, 1)
        else:
            start_stamp = self._start_stamp - extension
        end_stamp = self._end_stamp + extension
    min_x = (start_stamp - self._timeline.start_stamp).to_sec()
    max_x = (end_stamp - self._timeline.start_stamp).to_sec()
    for series in list(self._data.keys()):
        points = self._data[series].points
        num_points = len(points)
        trimmed_points = []
        if num_points > 0 and points[0][0] < max_x and points[-1][0] > min_x:
            first_index = None
            last_x = None
            for i, (x, y) in enumerate(points):
                if x >= min_x:
                    trimmed_points.append((x, y))
                    first_index = i
                    last_x = x
                    break
            if first_index is not None:
                for i, (x, y) in enumerate(points[first_index + 1:]):
                    if x > max_x:
                        break
                    if (max_interval is None) or (x - last_x >= max_interval):
                        trimmed_points.append((x, y))
                        last_x = x
        new_data = DataSet()
        new_data.set(trimmed_points)
        self._data[series] = new_data
def load_train_data(self, sessions_df):
    data_df = read_from_csv(self.task_core.data_file, self.task_core.n_seed
                            # , max_rows=50000
                            )
    cache_file = os.path.join(self.task_core.cache_dir, 'features_train_' + str(len(data_df.index)) + '.p')
    if os.path.isfile(cache_file):
        print('Loading train features from file')
        x = DataSet.load_from_file(cache_file)
    else:
        x = ds_from_df(data_df, sessions_df, False)
        print('saving train features to file')
        DataSet.save_to_file(x, cache_file)
    labels = data_df['country_destination'].values
    y = le_.transform(labels)
    return x, y
def ds_from_df(data_df, sessions_df, is_test):
    print('ds_from_df <<')
    data_df = add_features(data_df)
    data_df = add_sessions_features(data_df, sessions_df)
    if not is_test:
        data_df = data_df.drop(['country_destination'], axis=1)
    print('ds_from_df >>')
    return DataSet.create_from_df(data_df)
def exp_(self):
    #"""
    data = DataSet()
    self.quick = DataSet()
    data.dataimport("D:\Dropbox\St Andrews\IT\IS5189 MSc Thesis\\02 Data\InnoCentive_Challenge_9933493_training_data.csv")
    data.labelencode(columns=self.configLE)
    xtest, xtrain, ytest, ytrain = data.split(quick=True)
    self.quick.import_split(xtest, xtrain, ytest, ytrain)
    self.output_str("10 percent of original dataset loaded (into train. Testset is 90 percent).")
    rows_train = len(xtrain)
    self.feedback("Challenge data loaded. self.quick init with " + str(rows_train) + " rows.")
    correlation_list, descstats = self.quick.correlation()
    self._output_last(correlation_list)
    #print(test)
    #a = test.sort_values(by='Correlation', ascending=True).head(20)
    #b = test.sort_values(by='Correlation', ascending=False).head(20)
    #print(a)
    #print(b)
    #print(descstats)
    #self.quick.descstats()
    #"""
    #Clock.schedule_once(lambda dt: self.feedback("this is good"), -1)
    #descstats = data.descstats(self.configLE)
    ############################################################
    # df is short for DataFrame, to make it more readable when manipulating the Pandas DataFrame.
    # Might be easier (and is shorter) to read by developers as an in-house var name.
    threshold = 0.7
    df = correlation_list[correlation_list['Correlation'] > threshold]
    df = df.sort_values(by='Correlation', ascending=False)
    column_a_b = df['Var1']
    column_a_b = column_a_b.append(df['Var2'])
    print(df[df['Var1'] == 'C31'])
    print(column_a_b.value_counts())
    #print(df.head(10))
    print(pd.crosstab(df['Var1'], df['Var2']))
def exp_quick_load(self):
    self.output_str("Import.")
    global data
    data = DataSet()
    data.dataimport("D:\Dropbox\St Andrews\IT\IS5189 MSc Thesis\\02 Data\InnoCentive_Challenge_9933493_training_data.csv")
    self.loaded = True
    self.output_str("Label Encode.")
    data.labelencode(columns=self.configLE)
    self.output_str("Split (quick = True).")
    data.split(target_column_name=self.configCV['target_value'],
               test_set_size=self.configCV['test_set_size'],
               seed=self.configCV['seed'],
               random_state_is=self.configCV['random_state_is'],
               quick=True)
    self.update_overview(trainrows=len(data.X_train),
                         testrows=len(data.X_test),
                         ncols=len(data.X_train.columns.values))
    self.output_str("Function 'exp_quick_load()' finished running.")
    data.descstats(self.configLE, write=True, workdir=self.configGeneral['workdir'])
def __init__(self):
    conf = Configuration()
    self.ptext = TextProcess(conf)
    self.ds = DataSet(conf)
    self.mongo = MongoDB(self.ds.db, self.ds.collection)
    self.tweet = ""
    self.tokens = ""
    self.i = 0
    self.enable_translation = self.ptext.translation
    self.translation_store = self.ptext.translation_store
def __init__(self, conf, q):
    self.ptext = TextProcess(conf)
    self.ds = DataSet(conf)
    self.cleaner = KeyCleaner()
    self.enable_translation = self.ptext.translation
    self.translation_store = self.ptext.translation_store
    self.tweets = q  # Tweets queue
    self.tweet = ""
    self.tokens = ""
    self.i = 0
    Thread.__init__(self)
def parse(filen, crawl = False):
    if crawl == True:
        raise StandardError()
    file = open(filen, "r")
    ret = DataSet()
    for line in file:
        line = line.strip().rstrip()
        splited = line.split()
        filename = splited[0]
        (left_eye_x, left_eye_y,
         right_eye_x, right_eye_y,
         nose_x, nose_y,
         left_corner_mouth_x, left_corner_mouth_y,
         center_mouth_x, center_mouth_y,
         right_corner_mouth_x, right_corner_mouth_y) = tuple([float(a) for a in splited[1:]])
        ret.add_obj(filename,
                    EyesNoseMouth(Point(left_eye_x, left_eye_y),
                                  Point(right_eye_x, right_eye_y),
                                  Point(nose_x, nose_y),
                                  Point(left_corner_mouth_x, left_corner_mouth_y),
                                  Point(center_mouth_x, center_mouth_y),
                                  Point(right_corner_mouth_x, right_corner_mouth_y)))
    file.close()
    return ret
def get_blend_feature_or_load_from_cache(
        classifier, scale, classes_count,
        x_train, y_train, x_test,
        feature_prefix, random_state, cache_dir,
        n_folds, bagging_count,
):
    file_suffix = "_cl" + str(classes_count) + "_" + feature_prefix + "fld" + str(n_folds) + "_bag" + str(bagging_count)
    cache_file_train = os.path.join(cache_dir, "f_train_" + str(len(x_train.ids_)) + file_suffix + ".p")
    cache_file_test = os.path.join(cache_dir, "f_test_" + str(len(x_test.ids_)) + file_suffix + ".p")
    if os.path.isfile(cache_file_train) and os.path.isfile(cache_file_test):
        print("loading features " + feature_prefix + " from files")
        feature_train = DataSet.load_from_file(cache_file_train)
        feature_test = DataSet.load_from_file(cache_file_test)
    else:
        feature_train, feature_test = get_blend_feature(
            classifier, scale, classes_count,
            x_train, y_train, x_test,
            feature_prefix, random_state, n_folds
        )
        print("saving features " + feature_prefix + " to files")
        DataSet.save_to_file(feature_train, cache_file_train)
        DataSet.save_to_file(feature_test, cache_file_test)
    return feature_train, feature_test
def parse(filen, crawl = False):
    file = open(filen, "r")
    ret = DataSet()
    for line in file:
        line = line.strip().rstrip()
        splited = line.split()
        filename = splited[0]
        # filename = filename[filename.rfind("/")+1:]
        # filename = filename[:filename.rfind(".")]
        height = int(splited[1])
        width = int(splited[2])
        class_id = int(splited[3])
        (confidence, x, y, x2, y2) = tuple([float(a) for a in splited[4:]])
        #if confidence > parse_confidence_min: #TODO
        if hratio != None:
            height = y2 - y
            height2 = height * hratio
            y += (height - height2) / 2.0
            y2 = y + height2
        if wratio != None:
            width = x2 - x
            width2 = width * wratio
            x += (width - width2) / 2.0
            x2 = x + width2
        if whratio != None:
            height = y2 - y
            width = x2 - x
            width2 = height * whratio
            x += (width - width2) / 2.0
            x2 = x + width2
        bb = BoundingBox(x, y, x2, y2)
        area = bb.area()
        if (min_area == None or area >= min_area) and \
           (max_area == None or area <= max_area):
            ret.add_obj(filename, bb)
    file.close()
    # print summary
    print 'Dataset ' + str(filen) + ' has ' + str(len(ret)) + ' images and ' \
        + str(ret.get_nobjs()) + ' positive objects.'
    return ret
def train(self, learning_rate, training_epochs, batch_size, keep_prob):
    # Load dataset for training and testing
    self.dataset = DataSet()
    # Define size of output
    self.Y = tf.placeholder(tf.float32, [None, 10], name='Y')
    # Define cost function
    self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.Y))
    # Define optimization method
    self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.cost)
    # Start logger
    if self.log:
        tf.summary.scalar('cost', self.cost)
        self.merged = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter('./log_train', self.sess.graph)
    self.sess.run(tf.global_variables_initializer())
    self.sess.run(tf.local_variables_initializer())
    print('Training...')
    weights = []
    # For each epoch, feed training data and perform updating parameters
    for epoch in range(training_epochs):
        avg_cost = 0
        # Number of batches = size of training set / batch_size
        total_batch = int(self.dataset.get_train_set_size() / batch_size)
        # For each batch
        for i in range(total_batch + 1):
            # Get next batch to feed to the network
            batch_xs, batch_ys = self.dataset.next_batch(batch_size)
            feed_dict = {
                self.X: batch_xs.reshape([batch_xs.shape[0], 28, 28, 1]),
                self.Y: batch_ys,
                self.keep_prob: keep_prob
            }
            weights, summary, c, _ = self.sess.run([self.parameters, self.merged, self.cost, self.optimizer],
                                                   feed_dict=feed_dict)
            avg_cost += c / total_batch
        if self.log:
            self.train_writer.add_summary(summary, epoch + 1)
        print('Epoch:', '%02d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))
    print('Training finished!')
    saver = tf.train.Saver()
    save_path = saver.save(self.sess, model_dir + "/mnist_lenet.ckpt")
    print("Trained model is saved in file: %s" % save_path)
def __init__(self, inp, target):
    """Initialize an empty supervised dataset.

    Pass `inp` and `target` to specify the dimensions of the input and target
    vectors."""
    DataSet.__init__(self)
    if isscalar(inp):
        # add input and target fields and link them
        self.addField('input', inp)
        self.addField('target', target)
    else:
        self.setField('input', inp)
        self.setField('target', target)
    self.linkFields(['input', 'target'])

    # reset the index marker
    self.index = 0

    # the input and target dimensions
    self.indim = self.getDimension('input')
    self.outdim = self.getDimension('target')
def get_region(region):
    """
    The main endpoint to get the information on the given region.

    :param region: The genomic region whose data is to be extracted. ( chrom:start-end )
    :type: str

    ADDITIONAL PARAMETERS of the endpoint :
    These parameters are to be added to the query url as so :
    /region/<string:region> **?param=<string>**

    :param dataset: Name of the dataset in which the region's data is to be fetched.
    :type: str

    :return: A JSONify dict with the formatted data under the "response" key.
    :rtype: dict
    """
    return_data = defaultdict(list)
    query_string = request.query_string.decode("utf-8")
    querys = query_string.split("&")
    datasets = []
    r = Region(region.split(":")[0],
               region.split(":")[1].split("-")[0],
               region.split("-")[1])
    for query in querys:
        if query.split("=")[0] == "dataset":
            dataset = DataSet(query.split("=")[1])
            data = dataset.get_region(r)
            dataset_name = os.path.splitext(os.path.basename(query.split("=")[1]))[0]
            return_data[dataset_name] = data
    return jsonify({"response": return_data, "sucess": 1})
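# Example request against the endpoint above (hypothetical host, region and file path,
# shown only to illustrate the query format described in the docstring):
#
#   GET /region/chr1:10000-20000?dataset=data/sample_coverage.bed
#
# The values returned by DataSet.get_region() are nested under "response", keyed by the
# dataset file's base name, e.g.
#   {"response": {"sample_coverage": [...]}, "sucess": 1}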
def __init__(self, statedim, actiondim):
    """ initialize the reinforcement dataset, add the 3 fields state, action and
        reward, and create an index marker. This class is basically a wrapper function
        that renames the fields of SupervisedDataSet into the more common reinforcement
        learning names. Instead of 'episodes' though, we deal with 'sequences' here. """
    DataSet.__init__(self)
    # add 3 fields: input, target, importance
    self.addField('state', statedim)
    self.addField('action', actiondim)
    self.addField('reward', 1)

    # link these 3 fields
    self.linkFields(['state', 'action', 'reward'])

    # reset the index marker
    self.index = 0

    # add field that stores the beginning of a new episode
    self.addField('sequence_index', 1)
    self.append('sequence_index', 0)
    self.currentSeq = 0
    self.statedim = statedim
    self.actiondim = actiondim

    # the input and target dimensions (for compatibility)
    self.indim = self.statedim
    self.outdim = self.actiondim
def __init__(self, config):
    self.config = config
    self.data = DataSet(self.config)
    self.add_placeholders()
    self.summarizer = tf.summary
    self.net = Network(config)
    self.saver = tf.train.Saver()
    self.epoch_count, self.second_epoch_count = 0, 0
    self.outputs, self.prob = self.net.neural_search()
    self.hyperparams = self.net.gen_hyperparams(self.outputs)
    self.hype_list = [1 for i in range(self.config.hyperparams)]  # [7, 7, 24, 5, 5, 36, 3, 3, 48, 64]
    self.reinforce_loss = self.net.REINFORCE(self.prob)
    self.tr_cont_step = self.net.train_controller(self.reinforce_loss, self.val_accuracy)
    self.cNet, self.y_pred = self.init_child(self.hype_list)
    self.cross_loss, self.accuracy, self.tr_model_step = self.grow_child()
    self.init = tf.global_variables_initializer()
    self.local_init = tf.local_variables_initializer()
class TweetDB():

    def __init__(self):
        conf = Configuration()
        self.ptext = TextProcess(conf)
        self.ds = DataSet(conf)
        self.mongo = MongoDB(self.ds.db, self.ds.collection)
        self.tweet = ""
        self.tokens = ""
        self.i = 0
        self.enable_translation = self.ptext.translation
        self.translation_store = self.ptext.translation_store

    def get_tweet_from_db(self):
        where = {
            "text": {"$exists": "true"},
            "geo.coordinates": {"$exists": "true"}
        }
        select = {"text": 1, "source": 1, "geo": 1,
                  "user": 1, "retweet_count": 1, "created_at": 1}
        results = self.mongo.find(where, select)
        return results

    def process_tweets(self):
        tweets = self.get_tweet_from_db()
        for rawTweet in tweets:
            if "text" in rawTweet:
                tokens = {}
                self.ptext.set_tweet_text(rawTweet['text'])
                self.ptext.set_tweet_source(rawTweet['source'])
                self.ptext.process_text()
                rawTweet['source'] = self.ptext.get_tweet_source()
                rawTweet['text'] = self.ptext.get_tweet_text()
                self.tokens = self.ptext.get_tweet_tokens()
                tokens['tokens'] = self.tokens
                rawTweet.update(tokens)
                self.tweet = self.cleaner.unset_tweet_keys(rawTweet)
                if not self.ptext.get_translate_status():
                    self.ds.output_tweet(self.tweet)
                    self.i += 1
                else:
                    if self.translation_store:
                        if self.enable_translation:
                            if not self.ptext.get_translate_failed():
                                self.ds.output_tweet(self.tweet)
                                self.i += 1
                        else:
                            self.ds.output_tweet(self.tweet)
                            self.i += 1

    def get_tweet_count(self):
        return self.i
def load(self, path, filename):
    global last_path
    global last_filename
    global data
    last_path = path
    last_filename = filename[0]
    try:
        data = DataSet()
        data.dataimport(filename[0])
        self.loaded = True
    except (RuntimeError, TypeError, NameError):
        data.dprint("Error: most likely not a csv file.")
    self.output_str("Successfully loaded the data set.")
    self.feedback("Fileimport completed")
    if self.configGeneral['desc_stats_on_load']:
        data.descstats(self.configLE)
        self.output_str("Descriptive statistics performed.")
    ncols = len(data.information())
    # Get the filename and cut it to fit the GUI.
    # Filename only used to remind the user of which dataset has been loaded.
    head, tail = os.path.split(filename[0])
    fname = tail[:5] + "." + tail[-4:]
    self.update_overview(fname=fname, ncols=ncols)
    self.dismiss_popup()
class ProcessTweets(Thread):

    def __init__(self, conf, q):
        self.ptext = TextProcess(conf)
        self.ds = DataSet(conf)
        self.cleaner = KeyCleaner()
        self.enable_translation = self.ptext.translation
        self.translation_store = self.ptext.translation_store
        self.tweets = q  # Tweets queue
        self.tweet = ""
        self.tokens = ""
        self.i = 0
        Thread.__init__(self)

    def run(self):
        while True:
            rawTweet = self.tweets.get()
            if "text" in rawTweet:
                tokens = {}
                self.ptext.set_tweet_text(rawTweet['text'])
                self.ptext.set_tweet_source(rawTweet['source'])
                self.ptext.process_text()
                rawTweet['source'] = self.ptext.get_tweet_source()
                rawTweet['text'] = self.ptext.get_tweet_text()
                self.tokens = self.ptext.get_tweet_tokens()
                tokens['tokens'] = self.tokens
                rawTweet.update(tokens)
                self.tweet = self.cleaner.unset_tweet_keys(rawTweet)
                if not self.ptext.get_translate_status():
                    self.ds.output_tweet(self.tweet)
                    self.i += 1
                else:
                    if self.translation_store:
                        if self.enable_translation:
                            if not self.ptext.get_translate_failed():
                                self.ds.output_tweet(self.tweet)
                                self.i += 1
                        else:
                            self.ds.output_tweet(self.tweet)
                            self.i += 1
            self.tweets.task_done()

    def get_tweet_count(self):
        return self.i
def train(self, user_limit, start_learning_rate, training_steps, decay_rate):
    # data set
    train_x, train_y = DataSet(user_limit, self.time_step).lstm_train()
    # error and optimize function
    with tf.name_scope('train'):
        error = tf.reduce_mean(tf.abs(self.prd - self.y))
        tf.summary.scalar('error', error)
        # Dynamic learning rate
        global_step = tf.placeholder(tf.int16, name='global_step')
        learning_rate = tf.train.exponential_decay(start_learning_rate, global_step, training_steps, decay_rate)
        tf.summary.scalar('learning_rate', learning_rate)
        update_op = tf.train.AdamOptimizer(learning_rate).minimize(error)
    # Run session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Merge summaries
        merged = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter('saved_models/%s_%d/' % (self.model_name, self.hidden_unit), sess.graph)
        # Initialize global variables
        sess.run(tf.global_variables_initializer())
        # Initialize mapping matrix
        map_file = tf.train.latest_checkpoint('saved_models/Prl_SVD_%d/' % self.map_size)
        self.map_saver.restore(sess, map_file)
        # Start learning
        err_sum = 0
        turns = 0
        data_len = len(train_x)
        for start in range(0, data_len * training_steps, self.batch_size):
            end = start + self.batch_size
            curr_step = start // data_len
            if curr_step == end // data_len:
                feed_dict = {
                    self.x: train_x[start % data_len:end % data_len],
                    self.y: train_y[start % data_len:end % data_len],
                    global_step: curr_step
                }
                _, curr_err, _ = sess.run([update_op, error, learning_rate], feed_dict=feed_dict)
                err_sum += curr_err
                turns += 1
            else:
                feed_dict = {
                    self.x: train_x[start % data_len:] + train_x[:end % data_len],
                    self.y: train_y[start % data_len:] + train_y[:end % data_len],
                    global_step: curr_step
                }
                _, curr_err, curr_lr, summary = sess.run([update_op, error, learning_rate, merged], feed_dict=feed_dict)
                err_sum += curr_err
                turns += 1
                # Write summaries
                summary_writer.add_summary(summary, global_step=curr_step)
                print('Step %d: error = %g, learning rate = %g' % (curr_step, err_sum / turns, curr_lr))
                err_sum = 0
                turns = 0
        # Save model
        saver = tf.train.Saver()
        saver.save(sess, 'saved_models/%s_%d/' % (self.model_name, self.hidden_unit), global_step=training_steps)
class Model:
    def __init__(self, args):
        self.dataName = args.dataName
        self.dataSet = DataSet(self.dataName)
        self.shape = self.dataSet.shape
        self.maxRate = self.dataSet.maxRate
        self.train = self.dataSet.train
        self.test = self.dataSet.test
        self.negNum = args.negNum
        #############
        self.initializer = args.initializer
        self.activation_func = args.activation
        self.regularizer_rate = args.regularizer
        self.inference()
        self.dropout = args.dropout
        self.embed_size = args.embed_size
        #############
        # self.testNeg = self.dataSet.getTestNeg(self.test, 99)
        self.maxEpochs = args.maxEpochs
        self.batchSize = args.batchSize
        self.topK = args.topK
        self.earlyStop = args.earlyStop
        self.add_embedding_matrix()
        self.add_placeholders()
        self.add_model()
        self.add_loss()
        self.lr = args.lr
        self.add_train_step()
        self.checkPoint = args.checkPoint
        self.init_sess()

    def inference(self):
        """ Initialize important settings """
        self.regularizer = tf.contrib.layers.l2_regularizer(self.regularizer_rate)
        if self.initializer == 'Normal':
            self.initializer = tf.truncated_normal_initializer(stddev=0.01)
        elif self.initializer == 'Xavier_Normal':
            self.initializer = tf.contrib.layers.xavier_initializer()
        else:
            self.initializer = tf.glorot_uniform_initializer()
        if self.activation_func == 'ReLU':
            self.activation_func = tf.nn.relu
        elif self.activation_func == 'Leaky_ReLU':
            self.activation_func = tf.nn.leaky_relu
        elif self.activation_func == 'ELU':
            self.activation_func = tf.nn.elu

    def add_placeholders(self):
        self.user = tf.placeholder(shape=(None, ), dtype=tf.int32, name="userid")
        self.item = tf.placeholder(shape=(None, ), dtype=tf.int32, name="itemid")
        self.rate = tf.placeholder(shape=(None, ), dtype=tf.float32, name='rate')
        self.drop = tf.placeholder(tf.float32, name="drop")

    def add_embedding_matrix(self):
        self.user_Embedding = tf.Variable(
            tf.truncated_normal(shape=[self.shape[0], self.embed_size], dtype=tf.float32, mean=0.0, stddev=0.01),
            name="user_Embedding")
        self.item_Embedding = tf.Variable(
            tf.truncated_normal(shape=[self.shape[1], self.embed_size], dtype=tf.float32, mean=0.0, stddev=0.01),
            name="item_Embedding")

    def add_model(self):
        self.user_input = tf.nn.embedding_lookup(self.user_Embedding, self.user)
        self.item_input = tf.nn.embedding_lookup(self.item_Embedding, self.item)
        with tf.name_scope("MNN"):
            self.interaction = tf.concat([self.user_input, self.item_input], axis=-1, name='interaction')
            self.layer1_MLP = tf.layers.dense(inputs=self.interaction,
                                              units=self.embed_size,
                                              activation=self.activation_func,
                                              kernel_initializer=self.initializer,
                                              kernel_regularizer=self.regularizer,
                                              name='layer1_MLP')
            self.layer1_MLP = tf.layers.dropout(self.layer1_MLP, rate=self.dropout)
            self.layer2_MLP = tf.layers.dense(inputs=self.layer1_MLP,
                                              units=self.embed_size // 2,
                                              activation=self.activation_func,
                                              kernel_initializer=self.initializer,
                                              kernel_regularizer=self.regularizer,
                                              name='layer2_MLP')
            # rate is the fraction of units dropped during training: 0 means no dropout,
            # 1 would drop the entire layer
            self.layer2_MLP = tf.layers.dropout(self.layer2_MLP, rate=self.dropout)
            self.layer3_MLP = tf.layers.dense(inputs=self.layer2_MLP,
                                              units=self.embed_size // 4,
                                              activation=self.activation_func,
                                              kernel_initializer=self.initializer,
                                              kernel_regularizer=self.regularizer,
                                              name='layer3_MLP')
            self.layer3_MLP = tf.layers.dropout(self.layer3_MLP, rate=self.dropout)
            self.logits = tf.layers.dense(inputs=self.layer3_MLP,
                                          units=1,
                                          activation=None,
                                          kernel_initializer=self.initializer,
                                          kernel_regularizer=self.regularizer,
                                          name='predict')
            self.logits_dense = tf.reshape(self.logits, [-1])

    def add_loss(self):
        losses = tf.square(self.rate - self.logits_dense)
        self.loss = tf.reduce_sum(losses)

    def add_train_step(self):
        '''
        global_step = tf.Variable(0, name='global_step', trainable=False)
        self.lr = tf.train.exponential_decay(self.lr, global_step, self.decay_steps,
                                             self.decay_rate, staircase=True)
        '''
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_step = optimizer.minimize(self.loss)

    def init_sess(self):
        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        self.config.allow_soft_placement = True
        self.sess = tf.Session(config=self.config)
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        if os.path.exists(self.checkPoint):
            [os.remove(f) for f in os.listdir(self.checkPoint)]
        else:
            os.mkdir(self.checkPoint)

    def run(self):
        best_hr = -1
        best_NDCG = -1
        best_epoch = -1
        loss = np.inf
        print("Start Training!")
        for epoch in range(self.maxEpochs):
            # print("="*20+"Epoch ", epoch, "="*20)
            loss_temp = self.run_epoch(self.sess)
            if loss_temp < loss:
                loss = loss_temp
            else:
                break
            print("Epoch:", epoch, "loss:", loss_temp)
        self.saver.save(self.sess, self.checkPoint + 'model.ckpt')
        print("Training complete!")

    def run_epoch(self, sess, verbose=10):
        train_u, train_i, train_r = self.dataSet.getInstances(self.train)
        # train_set = {'user': train_u, "item": train_i, 'rate': train_r}
        # dataset = tf.data.Dataset.from_tensor_slices(train_set)
        # dataset = dataset.shuffle(100000).batch(self.batchSize)
        # iterator = tf.data.Iterator.from_structure(dataset.output_types,
        #                                            dataset.output_shapes)
        # sess.run(iterator.make_initializer(dataset))
        train_len = len(train_u)
        shuffled_idx = np.random.permutation(np.arange(train_len))
        train_u = train_u[shuffled_idx]
        train_i = train_i[shuffled_idx]
        train_r = train_r[shuffled_idx]
        num_batches = len(train_u) // self.batchSize + 1
        losses = []
        for i in range(num_batches):
            min_idx = i * self.batchSize
            max_idx = np.min([train_len, (i + 1) * self.batchSize])
            train_u_batch = np.array(train_u[min_idx:max_idx])
            train_i_batch = train_i[min_idx:max_idx]
            train_r_batch = train_r[min_idx:max_idx]
            print("ssdsdsdds", train_u_batch.shape)
            print(train_u_batch)
            feed_dict = self.create_feed_dict(train_u_batch, train_i_batch, train_r_batch, self.drop)
            _, tmp_loss = sess.run([self.train_step, self.loss], feed_dict=feed_dict)
            losses.append(tmp_loss)
        loss = np.mean(losses)
        print("\nMean loss in this epoch is: {}".format(loss))
        return loss

    def create_feed_dict(self, u, i, r=None, drop=None):
        return {self.user: u,
                self.item: i,
                self.rate: r,
                self.drop: drop}

    def evaluate(self, sess, topK):
        def getHitRatio(ranklist, targetItem):
            for item in ranklist:
                if item == targetItem:
                    return 1
            return 0

        def getNDCG(ranklist, targetItem):
            for i in range(len(ranklist)):
                item = ranklist[i]
                if item == targetItem:
                    return math.log(2) / math.log(i + 2)
            return 0

        hr = []
        NDCG = []
        testUser = self.testNeg[0]
        testItem = self.testNeg[1]
        for i in range(len(testUser)):
            target = testItem[i][0]
            feed_dict = self.create_feed_dict(testUser[i], testItem[i])
            predict = sess.run(self.y_, feed_dict=feed_dict)
            item_score_dict = {}
            for j in range(len(testItem[i])):
                item = testItem[i][j]
                item_score_dict[item] = predict[j]
            ranklist = heapq.nlargest(topK, item_score_dict, key=item_score_dict.get)
            tmp_hr = getHitRatio(ranklist, target)
            tmp_NDCG = getNDCG(ranklist, target)
            hr.append(tmp_hr)
            NDCG.append(tmp_NDCG)
        return np.mean(hr), np.mean(NDCG)
# and DATASET3.TXT are within the DATASETS folder and that
# the paths specified in these files for database and query
# images folders are correct.
# Author   : Antoni Burguera ([email protected])
# History  : 27-June-2019 - Creation
# Citation : Please, refer to the README file to know how to properly cite
#            us if you use this software.
###############################################################################

from dataset import DataSet
import matplotlib.pyplot as plt
import sys

# Load three datasets
print('[[ LOADING DATASETS ]]')
dataSet1 = DataSet('DATASETS/DATASET1.TXT')
dataSet2 = DataSet('DATASETS/DATASET2.TXT')
dataSet3 = DataSet('DATASETS/DATASET3.TXT')
print('[[ DATASETS LOADED ]]\n\n')

# Let's print the dataSet1 info
print('[[ PRINTING DATASET1 INFO ]]')
dataSet1.print()
print('[[ DATASET1 PRINTED ]]\n\n')

# Let's compare dataSet1 to itself
print('[[ COMPARING DATASET1 TO ITSELF ]]')
dataSet1.compare(dataSet1)
print('[[ COMPARED ]]\n\n')

# Let's compare dataSet1 to dataSet3
def get_train_batch(x, y, batch_size=10):
    '''
    Rebuilt generator
    '''
    while 1:
        idx = np.random.randint(0, len(y), batch_size)
        x1 = get_im_cv2(x[0][idx])
        x2 = get_im_cv2(x[1][idx])
        y_train = y[idx]
        yield [x1, x2], y_train


# Load the image paths
data = DataSet()
images = data.data[:10000]
labels = data.labels[:10000]
same_face = []
diff_face = []
for i in range(9999):
    # If two images share the same label, treat the pair as a "same" sample
    if labels[i] == labels[i + 1]:
        same_face.append([images[i], images[i + 1]])
    # If the labels differ, treat the pair as a "different" sample
    else:
        diff_face.append([images[i], images[i + 1]])
# Convert to numpy.ndarray so the pairs can be fed into the Keras network
def fit(self, X_train, Y_train, X_val, Y_val, n_epoch=100):
    # initialize log directory
    if tf.gfile.Exists(self.logdir):
        tf.gfile.DeleteRecursively(self.logdir)
    tf.gfile.MakeDirs(self.logdir)

    # load some training params
    n_batch = self.opt_params['batch_size']

    # create saver
    self.saver = tf.train.Saver()

    # summarization
    summary = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(self.logdir, self.sess.graph)

    # load data into DataSet
    train_data = DataSet(X_train, Y_train)
    val_data = DataSet(X_val, Y_val)

    # train the model
    start_time = time.time()
    step, epoch = 0, train_data.epochs_completed
    while train_data.epochs_completed < n_epoch:
        step += 1

        # load the batch
        # alpha = min((n_epoch - train_data.epochs_completed) / 200, 1.)
        # alpha = 1.0 if epoch < 100 else 0.1
        alpha = 1.0
        batch = train_data.next_batch(n_batch)
        feed_dict = self.load_batch(batch, alpha)

        # take training step
        tr_objective = self.train(feed_dict)
        # tr_obj_snr = 20 * np.log10(1. / np.sqrt(tr_objective) + 1e-8)
        # if step % 50 == 0:
        #     print step, tr_objective, tr_obj_snr

        # log results at the end of each epoch
        if train_data.epochs_completed > epoch:
            epoch = train_data.epochs_completed
            end_time = time.time()
            tr_l2_loss, tr_l2_snr = self.eval_err(X_train, Y_train, n_batch=n_batch)
            va_l2_loss, va_l2_snr = self.eval_err(X_val, Y_val, n_batch=n_batch)

            print "Epoch {} of {} took {:.3f}s ({} minibatches)".format(
                epoch, n_epoch, end_time - start_time, len(X_train) // n_batch)
            print "  training l2_loss/segsnr:\t\t{:.6f}\t{:.6f}".format(tr_l2_loss, tr_l2_snr)
            print "  validation l2_loss/segsnr:\t\t{:.6f}\t{:.6f}".format(va_l2_loss, va_l2_snr)

            # compute summaries for overall loss
            objectives_summary = tf.Summary()
            objectives_summary.value.add(tag='tr_l2_loss', simple_value=tr_l2_loss)
            objectives_summary.value.add(tag='tr_l2_snr', simple_value=tr_l2_snr)
            objectives_summary.value.add(tag='va_l2_snr', simple_value=va_l2_loss)

            # compute summaries for all other metrics
            summary_str = self.sess.run(summary, feed_dict=feed_dict)
            summary_writer.add_summary(summary_str, step)
            summary_writer.add_summary(objectives_summary, step)

            # write summaries and checkpoints
            summary_writer.flush()
            self.saver.save(self.sess, self.checkpoint_root, global_step=step)

            # restart clock
            start_time = time.time()
    graph = tf.get_default_graph()
    X = graph.get_tensor_by_name("X:0")
    Y = graph.get_tensor_by_name("Y:0")
    keep_prob = graph.get_tensor_by_name("keep_prob:0")
    logits = graph.get_tensor_by_name("fc2/logits:0")
    softmax = graph.get_tensor_by_name("softmax:0")
    probs, chars = sess.run([logits, softmax],
                            feed_dict={X: character_image.reshape((1, 28, 28, 1)), keep_prob: 1})
    probs = (np.exp(probs) / np.sum(np.exp(probs))) * 100
    idx = np.argmax(chars)
    return (probs[0, idx], idx)


ds = DataSet(test_prob=1, one_hot=False)
characters = DataGenerator().get_list_characters()
x, y = ds.next_batch_test(1)
print('x.shape', x.shape)
print('y.shape', y.shape)
prob, idx = predict(x)
print('Input character: ', characters[int(y[0])])
print('Predicted: ', characters[idx], ' with probability = ', prob, '%')
print('Result: ', characters[int(y[0])] == characters[idx])
print('-' * 10)
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import cv2
from dataset import DataSet

batch_size = 100
sample_size = 100
epochs = 3001
steps = 1000
Z_dimension = 100

dataset = DataSet(test_prob=0, one_hot=False)


def conv2d(X, filters, kernel_size=2, strides=2, padding='same', is_training=True):
    X = tf.layers.conv2d(X, filters, kernel_size, strides=strides, padding=padding)
    X = tf.layers.batch_normalization(X, training=is_training)
    X = tf.nn.leaky_relu(X)
    return X


def deconv(Z,
def execute_runs(self, mode, num_runs, resume=False):
    """
    Executes several training runs, each with different parameters, and saves the results
    :param mode: experiment mode. MODE_FULL randomizes all parameters including the input data, per run.
                 MODE_PSA_RUNS generates different datasets and runs the psa separately for each
    :param num_runs: number of runs per experiment to add to the output
    :param resume: whether to resume the runs. if True, the runs will continue until there are num_runs records.
    :return:
    """
    iter_index = -1
    while True:
        iter_index += 1
        if mode == self.MODE_FULL:
            if iter_index == 1:
                break
            out_dir = '../output/full'
            self.create_dir(out_dir, clean=not resume)
            curr_data = None
        elif mode == self.MODE_PSA_RUNS:
            if iter_index >= len(self.mode_psa_datasets):
                break
            noise = self.fixed_noise
            dataset_name = self.mode_psa_datasets[iter_index]
            out_dir = '../output/' + dataset_name + '_' + str(noise)
            self.create_dir(out_dir, clean=not resume)
            input_filename = out_dir + '/input.txt'
            if resume and os.path.exists(input_filename):
                curr_data = DataSet.create_from_file(input_filename)
                curr_data.noise = noise
                curr_data.dataset_name = dataset_name
                assert(curr_data.num_samples() == Run.num_samples)
            else:
                curr_data = DataSet(dataset_name, num_samples=Run.num_samples, noise=noise)
                curr_data.save_to_file(input_filename)
        else:
            print("Invalid mode:" + str(mode))
            return

        run_id = 0
        index_filename = out_dir + '/runsInfo.txt'
        print('index table: ' + index_filename)
        if resume and os.path.exists(index_filename):
            index_table = np.genfromtxt(index_filename, dtype=None, delimiter='\t', names=True, autostrip=False)
            if len(index_table) > 0 and 'ID' in index_table.dtype.fields:
                run_id = index_table['ID'][-1] + 1
                print('Resuming from ID {}'.format(run_id))
        write_header = (not os.path.exists(index_filename)) or (not resume)

        # create and write the header for the runs.txt file
        f_runs = open(index_filename, 'a+' if resume else 'w+')
        all_param_info = \
            ([['ID', 'ID', self.PARAM_TYPE_OUTPUT, 'ID'],
              ['imagePath', 'Image path', self.PARAM_TYPE_OUTPUT, 'Output image path']] +
             self.param_info() +
             [['epoch', 'Epoch', self.PARAM_TYPE_INT, 'Number of Epochs (of processing all training data)'],
              ['iteration', 'Iterations', self.PARAM_TYPE_INT, 'Number of Iterations (of processing a batch)'],
              ['success', 'Success', self.PARAM_TYPE_OUTPUT, 'Whether the training finished successfully'],
              ['total_time', 'Total time (ms)', self.PARAM_TYPE_OUTPUT, 'Total time at this epoch'],
              ['mean_time', 'Mean time (ms)', self.PARAM_TYPE_OUTPUT, 'Mean time per epoch'],
              ['train_loss', 'Training loss', self.PARAM_TYPE_OUTPUT, 'Training loss at epoch'],
              ['test_loss', 'Test loss', self.PARAM_TYPE_OUTPUT, 'Test loss at epoch'],
              ['train_TPR', 'TPR for train', self.PARAM_TYPE_OUTPUT, 'True positive rate for training data'],
              ['train_FPR', 'FPR for train', self.PARAM_TYPE_OUTPUT, 'False positive rate for training data'],
              # ['train_TNR', 'TNR for train', self.PARAM_TYPE_OUTPUT, 'True negative rate for training data'],
              # ['train_FNR', 'FNR for train', self.PARAM_TYPE_OUTPUT, 'False negative Rate for training data'],
              ['test_TPR', 'TPR for test', self.PARAM_TYPE_OUTPUT, 'True positive rate for test data'],
              ['test_FPR', 'FPR for test', self.PARAM_TYPE_OUTPUT, 'False positive rate for test data'],
              # ['test_TNR', 'TNR for test', self.PARAM_TYPE_OUTPUT, 'True negative rate for test data'],
              # ['test_FNR', 'FNR for test', self.PARAM_TYPE_OUTPUT, 'False negative Rate for test data'],
              ])

        # save the paramInfo.txt
        with open(out_dir + '/paramInfo.txt', 'w') as fpi:
            fpi.write('\t'.join(self.param_info_header()) + '\n')
            fpi.write('\n'.join(['\t'.join(i) for i in all_param_info]))

        # write the header for the runs.txt
        if write_header:
            f_runs.write('\t'.join([i[0] for i in all_param_info]) + '\n')
            f_runs.flush()

        images_dir = out_dir + '/images'
        runs_dir = out_dir + '/runs'
        self.create_dir(images_dir, clean=not resume)
        self.create_dir(runs_dir, clean=not resume)

        while run_id < num_runs:
            if curr_data is None:
                self.randomize_data()  # randomize the data every time
            else:
                self.data = curr_data  # reuse the same data
            self.randomize_training_params()

            # print the parameters
            print('configuration (%d of %d)' % (int(run_id / len(self.epochs_per_config)) + 1,
                                                int(num_runs / len(self.epochs_per_config))))
            print(', '.join(a[0] + ': ' + a[1] for a in zip(self.param_names(), self.param_str())))

            prev_step = 0
            total_time = 0
            for epoch in self.epochs_per_config:
                curr_step = int(epoch * self.data.num_samples() / self.nn.batch_size)
                # curr_step = epoch  # in the online demo epoch == iter: https://github.com/tensorflow/playground/blob/67cf64ffe1fc53967d1c979d26d30a4625d18310/src/playground.ts#L898
                time_start = time.time()

                # train the network
                success = True
                try:
                    train_loss, test_loss = self.nn.train(self.data, restart=False, num_steps=curr_step - prev_step)
                except:
                    train_loss, test_loss = 1, 1
                    success = False
                total_time += (time.time() - time_start) * 1000.0
                mean_time = total_time / epoch
                try:
                    train_tpr, train_fpr, test_tpr, test_fpr = self.calc_tpr_fpr()
                except:
                    train_tpr, train_fpr, test_tpr, test_fpr = 0, 1, 0, 1
                    success = False

                print('(epoch: %d, step: %d), '
                      '(total_time: %g, mean_time: %g), '
                      '(training loss: %g, test loss: %g), '
                      '(train_tpr: %g, train_fpr: %g test_tpr: %g, test_fpr: %g)'
                      % (epoch, curr_step,
                         round(total_time, 2), round(mean_time, 2),
                         round(train_loss, 2), round(test_loss, 2),
                         round(train_tpr, 2), round(train_fpr, 2),
                         round(test_tpr, 2), round(test_fpr, 2)))

                image_filename = images_dir + '/' + str(run_id) + ".png"
                run_filename = runs_dir + '/' + str(run_id) + ".txt"
                self.save_plot(image_filename)
                self.save_current_run(run_filename)
                f_runs.write('\t'.join(
                    [str(run_id), image_filename[len(out_dir) + 1:]] +
                    self.param_str() +
                    [str(epoch), str(curr_step), str(success),
                     str(round(total_time, 3)), str(round(mean_time, 3)),
                     str(round(train_loss, 3)), str(round(test_loss, 3)),
                     str(round(train_tpr, 3)), str(round(train_fpr, 3)),
                     str(round(test_tpr, 3)), str(round(test_fpr, 3)),
                     ]) + '\n')
                f_runs.flush()

                prev_step = curr_step
                run_id += 1
                if run_id >= num_runs:
                    break

        f_runs.close()
class Model(object):
    def __init__(self, config):
        self.epoch_count = 0
        self.config = config
        self.data = DataSet(config)
        self.add_placeholders()
        self.summarizer = tf.summary
        self.net = Network(config, self.summarizer)
        self.optimizer = self.config.solver.optimizer
        self.y_pred = self.net.prediction(self.x, self.keep_prob)
        self.loss = self.net.loss_function(self.x, self.y, self.keep_prob)
        self.accuracy = self.net.accuracy(self.y_pred, self.y)
        self.summarizer.scalar("accuracy", self.accuracy)
        self.summarizer.scalar("loss", self.loss)
        self.train = self.net.train_step(self.loss)
        self.B = self.net.B
        self.A = self.net.A
        self.n_epoch_to_decay = list(range(800, 20000, 1000))[::-1]
        self.next_epoch_to_decay = self.n_epoch_to_decay.pop()
        self.saver = tf.train.Saver()
        self.init = tf.global_variables_initializer()
        self.local_init = tf.local_variables_initializer()
        self.kf = KFold(n_splits=10, random_state=0, shuffle=True)

    def add_placeholders(self):
        self.x = tf.placeholder(tf.float32, shape=[None, self.config.features_dim])
        self.y = tf.placeholder(tf.float32, shape=[None, self.config.labels_dim])
        self.keep_prob = tf.placeholder(tf.float32)

    def train_epoch(self, sess, summarizer):
        merged_summary = self.summarizer.merge_all()
        err, accuracy = list(), list()
        X, Y = self.data.get_train()
        for train, val in self.kf.split(X, y=Y):
            feed_dict = {self.x: X[train], self.y: Y[train], self.keep_prob: self.config.solver.dropout}  # attention!
            summ, _, loss_, accuracy_ = sess.run([merged_summary, self.train, self.loss, self.accuracy], feed_dict=feed_dict)
            summarizer.add_summary(summ)
            err.append(loss_)
            accuracy.append(accuracy_)
        return np.mean(err), np.mean(accuracy)

    def do_eval(self, sess, data):
        if data == "validation":
            err, accuracy = list(), list()
            X, Y = self.data.get_validation()
            for train, val in self.kf.split(X, y=Y):
                feed_dict = {self.x: X[val], self.y: Y[val], self.keep_prob: 1}
                loss_, Y_pred, accuracy_ = sess.run([self.loss, self.y_pred, self.accuracy], feed_dict=feed_dict)
                metrics = evaluate(predictions=Y_pred, labels=Y[val])
                err.append(loss_)
                accuracy.append(accuracy_)
            return np.mean(err), np.mean(accuracy), metrics
        if data == "test":
            X, Y = self.data.get_test()
            feed_dict = {self.x: X, self.y: Y, self.keep_prob: 1}
            loss_, Y_pred, accuracy_ = sess.run([self.loss, self.y_pred, self.accuracy], feed_dict=feed_dict)
            metrics = evaluate(predictions=Y_pred, labels=Y)
            return loss_, accuracy_, metrics

    def fit(self, sess, summarizer):
        sess.run(self.init)
        sess.run(self.local_init)
        max_epochs = self.config.max_epochs
        self.epoch_count = 0
        max_micro_f1 = 0
        max_macro_f1 = 0
        while self.epoch_count < max_epochs:
            if self.config.load:
                break
            loss_train, accuracy_train = self.train_epoch(sess, summarizer['train'])
            loss_val, accuracy_val, metrics_val = self.do_eval(sess, "validation")
            if self.epoch_count == self.next_epoch_to_decay:
                if len(self.n_epoch_to_decay) == 0:
                    self.next_epoch_to_decay = -1
                else:
                    self.next_epoch_to_decay = self.n_epoch_to_decay.pop()
                self.config.learning_rate *= self.config.lr_decay_factor
                print('Decaying learning rate ...')
                print(self.config.learning_rate)
            if max_micro_f1 < metrics_val['micro_f1'] and max_macro_f1 < metrics_val['macro_f1']:
                print(self.config.ckptdir_path)
                print("cur_max_Mi-F1 = %g, cur_max_Ma-F1 = %g, cur_epoch = %g."
                      % (metrics_val['micro_f1'], metrics_val['macro_f1'], self.epoch_count))
                self.saver.save(sess, self.config.ckptdir_path + "model.ckpt")
            max_micro_f1 = max(max_micro_f1, metrics_val['micro_f1'])
            max_macro_f1 = max(max_macro_f1, metrics_val['macro_f1'])
            if self.epoch_count % 5 == 0:
                print("After %d training epoch(s), Training : Loss = %g, Validation : Loss = %g."
                      % (self.epoch_count, loss_train, loss_val))
                print("train_accuracy = %g, val_accuracy = %g." % (accuracy_train, accuracy_val))
                print("Micro-F1 = %g, Macro-F1 = %g." % (metrics_val['micro_f1'], metrics_val['macro_f1']))
            self.epoch_count += 1
        returnDict = {"train_loss": loss_train, "val_loss": loss_val,
                      "train_accuracy": accuracy_train, "val_accuracy": accuracy_val}
        return returnDict

    def add_summaries(self, sess):
        if self.config.load or self.config.debug:
            path_ = os.path.join("../results/tensorboard" + self.config.dataset_name)
        else:
            path_ = os.path.join("../bin/results/tensorboard" + self.config.dataset_name)
        summary_writer_train = tf.summary.FileWriter(path_ + "/train", sess.graph)
        summary_writer_val = tf.summary.FileWriter(path_ + "/val", sess.graph)
        summary_writer_test = tf.summary.FileWriter(path_ + "/test", sess.graph)
        summary_writers = {'train': summary_writer_train, 'val': summary_writer_val, 'test': summary_writer_test}
        return summary_writers
def model():
    X_indices = tf.placeholder(tf.int64, name='X_indices', shape=None)
    X_data = tf.placeholder(tf.float32, name='X_data', shape=None)
    X_shape = tf.placeholder(tf.int64, name='X_shape', shape=None)
    '''
    Y_indices = tf.placeholder(tf.int64, name='Y_indices', shape=None)
    Y_data = tf.placeholder(tf.float32, name='Y_data', shape=None)
    Y_shape = tf.placeholder(tf.int64, name='Y_shape', shape=None)
    '''
    X = tf.SparseTensor(indices=X_indices, values=X_data, dense_shape=X_shape)
    #Y = tf.SparseTensor(indices=Y_indices, values=Y_data, dense_shape=Y_shape)
    Y = tf.placeholder(tf.float32, shape=[None, label_dim])
    Wx1 = tf.Variable(tf.random_normal(shape=[feature_dim, 700]))
    bx1 = tf.Variable(tf.random_normal(shape=[700]))
    Wx2 = tf.Variable(tf.random_normal(shape=[700, 983]))
    bx2 = tf.Variable(tf.random_normal(shape=[983]))
    act = tf.nn.relu
    hx1 = act(dot(X, Wx1) + bx1)
    hxe = dot(hx1, Wx2, sparse=False) + bx2
    print(hxe.get_shape())
    loss = ce_loss(hxe, Y)
    patk = tf.metrics.sparse_precision_at_k(labels=tf.cast(Y, tf.int64), predictions=tf.nn.sigmoid(hxe), k=3)
    train = tf.train.GradientDescentOptimizer(0.01).minimize(loss)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        for epoch in range(200):
            el, c = 0.0, 0
            dataobj = DataSet("./data/delicious/delicious-train", batch_size)
            for x_train, y_train, dummy in dataobj.next_batch("train", sparse_features=True, sparse_labels=False):
                x_props, y_props = get_sparse_props(x_train), None  # get_sparse_props(y_train)
                feed = {X_indices: x_props[0],
                        X_data: x_props[1],
                        X_shape: x_props[2],
                        Y: y_train}
                # , Y_indices: y_props[0], Y_data: y_props[1], Y_shape: y_props[2]}
                pl, _ = sess.run([loss, train], feed_dict=feed)
                el += pl
                c += 1
                print("Epoch #{} Loss : {}".format(epoch, pl), end='\r')
            test_obj = DataSet("./data/delicious/delicious-test", 3185)
            x_test, y_test = test_obj.get_test()
            x_props, y_props = get_sparse_props(x_test), None  # get_sparse_props(y_test)
            feed = {X_indices: x_props[0],
                    X_data: x_props[1],
                    X_shape: x_props[2],
                    Y: y_test}
            # Y_indices: y_props[0], Y_data: y_props[1], Y_shape: y_props[2]}
            pk = sess.run(patk, feed_dict=feed)
            output = "Epoch #{} Loss : {}, P@K : {}".format(epoch, el / c, pk)
            with open("train_test.log", "a+") as f:
                f.write(output)
            print(output)
def train(FLAGS):
    # read data
    dataset = DataSet(fpath=FLAGS.train_file,
                      seqlen=FLAGS.seq_len,
                      n_classes=FLAGS.num_classes,
                      num_feature=FLAGS.num_feature,
                      is_raw=FLAGS.is_raw,
                      need_shuffle=True)
    # set character set size
    FLAGS.charset_size = dataset.charset_size

    with tf.Graph().as_default():
        # get placeholders
        global_step = tf.placeholder(tf.int32)
        placeholders = get_placeholders(FLAGS)

        # prediction
        pred, layers = inference(placeholders['data'], FLAGS, for_training=True)

        # loss
        # slim.losses.softmax_cross_entropy(pred, placeholders['labels'])
        # class_weight = tf.constant([[1.0, 5.0]])
        # weight_per_label = tf.transpose(tf.matmul(placeholders['labels'], tf.transpose(class_weight)))
        # loss = tf.multiply(weight_per_label,
        #                    tf.nn.softmax_cross_entropy_with_logits(labels=placeholders['labels'], logits=pred))
        # loss = tf.losses.compute_weighted_loss(loss)
        tf.losses.softmax_cross_entropy(placeholders['labels'], pred)
        loss = tf.losses.get_total_loss()

        # accuracy
        _acc_op = tf.equal(tf.argmax(pred, 1), tf.argmax(placeholders['labels'], 1))
        acc_op = tf.reduce_mean(tf.cast(_acc_op, tf.float32))

        # optimization
        train_op = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(loss)
        # train_op = tf.train.RMSPropOptimizer(FLAGS.learning_rate).minimize(loss)

        # Create a saver.
        saver = tf.train.Saver(max_to_keep=None)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            if tf.train.checkpoint_exists(FLAGS.prev_checkpoint_path):
                if FLAGS.fine_tuning:
                    logging('%s: Fine Tuning Experiment!' % (datetime.now()), FLAGS)
                    restore_variables = slim.get_variables_to_restore(exclude=FLAGS.fine_tuning_layers)
                    restorer = tf.train.Saver(restore_variables)
                else:
                    restorer = tf.train.Saver()
                restorer.restore(sess, FLAGS.prev_checkpoint_path)
                logging('%s: Pre-trained model restored from %s' % (datetime.now(), FLAGS.prev_checkpoint_path), FLAGS)
                step = int(FLAGS.prev_checkpoint_path.split('/')[-1].split('-')[-1]) + 1
            else:
                step = 0

            # iter epoch
            # for data, labels in dataset.iter_batch(FLAGS.batch_size, 5):
            for data, labels in dataset.iter_once(FLAGS.batch_size):
                start_time = time.time()
                _, loss_val, acc_val = sess.run([train_op, loss, acc_op],
                                                feed_dict={placeholders['data']: data,
                                                           placeholders['labels']: labels,
                                                           global_step: step})
                duration = time.time() - start_time
                assert not np.isnan(loss_val), 'Model diverge'

                # logging
                if step > 0 and step % FLAGS.log_interval == 0:
                    examples_per_sec = FLAGS.batch_size / float(duration)
                    format_str = ('%s: step %d, loss = %.2f, acc = %.2f (%.1f examples/sec; %.3f sec/batch)')
                    logging(format_str % (datetime.now(), step, loss_val, acc_val, examples_per_sec, duration), FLAGS)

                # save model
                if step > 0 and step % FLAGS.save_interval == 0:
                    saver.save(sess, FLAGS.checkpoint_path, global_step=step)

                # counter
                step += 1

            # save for last
            saver.save(sess, FLAGS.checkpoint_path, global_step=step - 1)
def setUp(self): ''' Executed prior to each test. ''' self.ds = DataSet('test', NoSchema) return
k = int(train_set_idx[i][0]) l = int(train_set_idx[j][0]) doc1 = self.documents[k] doc2 = self.documents[l] val = self.kernel_obj(doc1, doc2) ret[i, j] = val ret[j, i] = val return ret def save_kernel(self): self.kernel_obj.save_kernel_entry() if __name__ == '__main__': from dataset import DataSet data_set = DataSet() # make a small subset for testing train_set = data_set.train_set train_labels = data_set.train_labels test_set = data_set.test_set test_labels = data_set.test_labels test_model = StringSVM("test_k5_lambda0.8", 5, 0.8) try: test_model.train(train_set, train_labels) except Exception as e: # re-raise exception raise e finally:
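# A short, self-contained sketch (assumptions, not the StringSVM code above) of the pattern the
# kernel-matrix loop uses: evaluate each kernel value once for the upper triangle and mirror it,
# since k(a, b) == k(b, a) for a symmetric kernel. The toy string kernel is purely illustrative.
import numpy as np

def gram_matrix(items, kernel):
    n = len(items)
    gram = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):                 # upper triangle only
            val = kernel(items[i], items[j])
            gram[i, j] = val
            gram[j, i] = val                  # mirror into the lower triangle
    return gram

# usage with a toy kernel: number of shared characters between two strings
if __name__ == "__main__":
    docs = ["abc", "abd", "xyz"]
    print(gram_matrix(docs, lambda a, b: len(set(a) & set(b))))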
def train(): unrelated_vs_all = LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000) disagree_vs_all = LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000) agree_vs_all = LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000) # create the training set with lemmatized bodies training_set = DataSet("csv/train_stances_csc483583.csv", "csv/lemmatized_bodies.csv") # create an original set that has original bodies orig_set = DataSet("csv/train_stances_csc483583.csv", "csv/train_bodies.csv") stances = training_set.stances articles = training_set.articles orig_articles = orig_set.articles similarity_vectors = [] similarity_labels = [] agree_labels = [] negation_vectors = [] negation_labels = [] count = 0 stanceVal = 0 for stance in stances: count += 1 print("Training article number: " + str(count)) headline = stance['Headline'] bodyID = stance['Body ID'] #get lemmatized body from DataSet created with lemmatized_bodies.csv body_lemmas = articles[bodyID] #get the original body from DataSet created with train_bodies.csv orig_body = orig_articles[bodyID] stance = stance['Stance'] #get the scores from the features similarity_score, similar_sentences, max_similarity, negation_avg = similarity_feature(headline, body_lemmas, orig_body) neg = max_similarity.get('Negates') if(neg == None): neg = 0 max_score = max_similarity.get('Score') if(max_score == None): max_score = 0.0 similarity_vectors.append([similarity_score, max_score]) if(stance == 'unrelated'): similarity_labels.append(1) else: similarity_labels.append(2) if(stance == 'agree'): agree_labels.append(1) else: agree_labels.append(2) negation_vectors.append([negation_avg]) if(stance == 'disagree'): negation_labels.append(1) else: negation_labels.append(2) np_sim_vectors = np.array(similarity_vectors) np_sim_labels = np.array(similarity_labels) unrelated_vs_all.fit(np_sim_vectors, np_sim_labels) save_object(unrelated_vs_all, 'unrelated_vs_all.pkl') np_neg_vectors = np.array(negation_vectors) np_neg_labels = np.array(negation_labels) disagree_vs_all.fit(np_neg_vectors, np_neg_labels) save_object(disagree_vs_all, 'disagree_vs_all.pkl') np_agree_labels = np.array(agree_labels) agree_vs_all.fit(np_sim_vectors, np_agree_labels) save_object(agree_vs_all, 'agree_vs_all.pkl')
''' from __future__ import print_function import numpy as np import tensorflow as tf from keras.layers.convolutional import UpSampling2D from keras import optimizers import math import os os.sys.path.append('../') import dataset.DataSet as DB os.environ["CUDA_VISIBLE_DEVICES"] ="0" dropout = 0.5 infodata = DB.Get5Class(ratio = 0.7, tag_array = [0, 0, 1, 1, 1], label_arry=[0, 0, 0, 0, 1], nlabel=0) x_scale_train, x_scale_test, _, _ = infodata.GetScaleData() N = infodata.GetSizeTrain() D = infodata.GetDim() num_classes = infodata.GetNumClass() num_fc_1 = 500 learning_rate = 1e-4 batch_size = 15 training_iters = 2100000 display_step = 14 # tf Graph input x = tf.placeholder(tf.float32, [None, D])
class Run: """ A single run """ num_samples = 200 # always a fixed number of data points noise_values = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50] perc_train_values = [10, 20, 30, 40, 50, 60, 70, 80, 90] # percentage of training to test range_batch_size = [1, 30] learning_rates = [0.00001, 0.0001, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10.0] regularization_rates = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10.0] range_hidden = [0, 6] range_hidden_neuron = [1, 8] epochs_per_config = [50, 100, 200, 400] # number of epochs to run each nnet configuration for activations_names = Classifier.activations_names regularization_names = Classifier.regularization_names fixed_feature_ids = None # for debug: """ num_samples = 200 # always a fixed number of data points noise_values = [25] perc_train_values = [50] # percentage of training to test range_batch_size = [10, 11] learning_rates = [0.1] regularization_rates = [3.0] range_hidden = [3, 3] range_hidden_neuron = [4, 4] epochs_per_config = [400] # number of epochs to run each nnet configuration for activations_names = [Classifier.ACTIVATION_TANH] regularization_names = [Classifier.REGULARIZATION_NONE] fixed_feature_ids = [DataSet.FEATURE_X1SQ, DataSet.FEATURE_X2SQ, DataSet.FEATURE_SIN_X1, DataSet.FEATURE_SIN_X2] """ PARAM_TYPE_INT = 'int' PARAM_TYPE_DOUBLE = 'double' PARAM_TYPE_STR = 'string' PARAM_TYPE_OUTPUT = 'output' MODE_FULL = 'full' # a single directory, with randomized data for each run MODE_PSA_RUNS = 'psa_runs' # a few randomized data, in separate directories fixed_noise = 25 # mode_psa_datasets = DataSet.all_data_names mode_psa_datasets = [DataSet.DATA_SPIRAL, DataSet.DATA_XOR, DataSet.DATA_CIRCLE, DataSet.DATA_GAUSS] # debug def __init__(self): self.data = None self.nn = None def randomize_data(self, dataset_name=None, noise=None): """ Build dataset with randomized parameters :param dataset_name: dataset name. if None, will randomize :param noise: noise [0 .. 50]. 
if None will randomly pick :return: None """ # dataset parameters dataset_name = random.choice(DataSet.all_data_names) if dataset_name is None else dataset_name noise = random.choice(self.noise_values) if noise is None else noise self.data = DataSet(dataset_name, self.num_samples, noise) def randomize_training_params(self): """ Creates classifier and network with randomized parameters :return: None """ self.nn = Classifier() self.nn.perc_train = random.choice(self.perc_train_values) self.nn.batch_size = random.randint(*self.range_batch_size) self.nn.learning_rate = random.choice(self.learning_rates) self.nn.neurons_per_layer = [random.randint(*self.range_hidden_neuron) for _ in range(random.randint(*self.range_hidden))] self.nn.activation_h = random.choice(self.activations_names) self.nn.regularization_type = random.choice(self.regularization_names) self.nn.regularization_rate = random.choice(self.regularization_rates) # select which input features to use if self.fixed_feature_ids is not None: self.nn.features_ids = self.fixed_feature_ids else: # random feature_bits = random.randint(0, pow(2, DataSet.NUM_FEATURES) - 1) self.nn.features_ids = [i for i in range(DataSet.NUM_FEATURES) if feature_bits & pow(2, i) != 0] self.nn.build() @staticmethod def param_info_header(): return ['label', 'name', 'type', 'info'] def param_info(self): max_hidden = self.range_hidden[1] return ([['data', 'Data', self.PARAM_TYPE_STR, 'Which dataset do you want to use?'], ['noise', 'Noise', self.PARAM_TYPE_INT, 'Noise'], ['training_ratio', 'Training Ratio', self.PARAM_TYPE_INT, 'Ratio of training to test data'], ['batch_size', 'Batch Size', self.PARAM_TYPE_INT, 'Batch Size']] + [[f, f, self.PARAM_TYPE_INT, f] for f in DataSet.feature_idx_to_name] + [['layer_count', 'Layers Count', self.PARAM_TYPE_INT, 'Number of hidden layers'], ['neuron_count', 'Neurons Count', self.PARAM_TYPE_INT, 'Total number of neurons in hidden layers']] + [['H'+str(i), 'H'+str(i), self.PARAM_TYPE_INT, 'H'+str(i)] for i in range(1, max_hidden + 1)] + [['learning_rate', 'Learning rate', self.PARAM_TYPE_DOUBLE, 'Learning rate'], ['activation', 'Activation', self.PARAM_TYPE_STR, 'Activation'], ['regularization', 'Regularization', self.PARAM_TYPE_STR, 'Regularization'], ['regularization_rate', 'Regularization rate', self.PARAM_TYPE_DOUBLE, 'Regularization rate']]) def param_names(self): """ returns array of string names for the parameters. matching 1-to-1 with param_str :return: """ info = self.param_info() return [info[i][0] for i in range(len(info))] def param_str(self): """ returns array of parameter values in string format. 
matching 1-to-1 to the param_names() :return: """ layer_count = len(self.nn.neurons_per_layer) max_hidden = self.range_hidden[1] return ([self.data.dataset_name, str(self.data.noise), str(self.nn.perc_train), str(self.nn.batch_size)] + ['1' if i in self.nn.features_ids else '0' for i in DataSet.all_features] + [str(layer_count), str(sum(self.nn.neurons_per_layer))] + [str(self.nn.neurons_per_layer[i]) if i < layer_count else '0' for i in range(max_hidden)] + [str(self.nn.learning_rate), self.nn.activation_h, self.nn.regularization_type, str(self.nn.regularization_rate)]) def save_plot(self, filename): """ Generates the plot using the current data and training state :param filename: output filename :return: None """ # matplotlib.interactive(False) # plot the resulting classifier colormap = colors.ListedColormap(["#f59322", "#e8eaeb", "#0877bd"]) x_min, x_max = -6, 6 # grid x bounds y_min, y_max = -6, 6 # grid y bounds xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300), np.linspace(y_min, y_max, 300)) data_points = np.c_[xx.ravel(), yy.ravel()] data_grid = DataSet(None, len(data_points), 0, data_points=data_points) try: z = self.nn.predict_labels(data_grid.features).reshape(xx.shape) except: z = np.zeros(np.shape(xx)) fig = plt.figure(figsize=(4, 4), dpi=75) # plt.imshow(z, cmap=colormap, interpolation='nearest') plt.contourf(xx, yy, z, cmap=colormap, alpha=0.8) num_training = self.data.num_training(self.nn.perc_train) point_color = self.data.labels # plot training data points plt.scatter(self.data.points[:num_training, 0], self.data.points[:num_training, 1], c=point_color[:num_training], edgecolors='w', s=40, cmap=colormap) # plot test data points plt.scatter(self.data.points[num_training:, 0], self.data.points[num_training:, 1], c=point_color[num_training:], edgecolors='k', s=30, cmap=colormap) plt.xlim(x_min, x_max) plt.ylim(y_min, y_max) fig.savefig(filename) plt.close() @staticmethod def create_dir(dirname, clean=False): """ Creates the directory if doesn't exist :param dirname: directory path :param clean: whether to clean the directory :return: None """ if clean: shutil.rmtree(dirname, ignore_errors=True) if not os.path.exists(dirname): os.makedirs(dirname) def save_current_run(self, filename): try: yp = self.nn.predict_labels(self.data.features) except: yp = 1 - self.data.labels if Config.SAVE_LABELS_NEG_POS: yp = [-1 if label == 0 else 1 for label in yp] header = 'label_pred' with open(filename, 'w') as f: f.write(header + '\n') for v in yp: f.write(str(v) + '\n') def calc_tpr_fpr(self): """ calculates the true positive rate and false positive rate :return: [train_tpr, train_fpr, test_tpr, test_fpr] """ labels_pred = self.nn.predict_labels(self.data.features) num_training = self.data.num_training(self.nn.perc_train) stats = [] for population in ['train', 'test']: if population == 'train': y = self.data.labels[:num_training] # true labels for training yp = labels_pred[:num_training] # predicted labels for training else: # population == 'test' y = self.data.labels[num_training:] # true labels for test yp = labels_pred[num_training:] # predicted labels for test num_p = list(y).count(1) # number of positive labels num_n = list(y).count(0) # number of negative labels num_tp = [l == 1 and lp == 1 for l, lp in zip(y, yp)].count(True) # true positives num_fp = [l == 0 and lp == 1 for l, lp in zip(y, yp)].count(True) # false positives # num_tn = [l == 0 and lp == 0 for l, lp in zip(y, yp)].count(True) # true positives # num_fn = [l == 1 and lp == 0 for l, lp in zip(y, yp)].count(True) # 
true positives tpr = 0 if num_tp == 0 else num_tp/num_p # true positive rate fpr = 0 if num_fp == 0 else num_fp/num_n # false positive rate # tnr = 0 if num_tn == 0 else num_tn/num_n # true negative rate # fnr = 0 if num_fn == 0 else num_fn/num_p # false negative rate stats = stats + [tpr, fpr] return stats def execute_runs(self, mode, num_runs, resume=False): """ Executes several training runs, each with different parameters and saves the results :param mode: experiment mode. MODE_FULL randomizes all parameters including the input data, per run MODE_PSA_RUNS generates different datasets and runs the psa separately for each :param num_runs: number of runs per experiment to add to the output :param resume: whether to resume the runs. if True, the runs will continue until there are num_runs records. :return: """ iter_index = -1 while True: iter_index += 1 if mode == self.MODE_FULL: if iter_index == 1: break out_dir = '../output/full' self.create_dir(out_dir, clean=not resume) curr_data = None elif mode == self.MODE_PSA_RUNS: if iter_index >= len(self.mode_psa_datasets): break noise = self.fixed_noise dataset_name = self.mode_psa_datasets[iter_index] out_dir = '../output/' + dataset_name + '_' + str(noise) self.create_dir(out_dir, clean=not resume) input_filename = out_dir + '/input.txt' if resume and os.path.exists(input_filename): curr_data = DataSet.create_from_file(input_filename) curr_data.noise = noise curr_data.dataset_name = dataset_name assert(curr_data.num_samples() == Run.num_samples) else: curr_data = DataSet(dataset_name, num_samples=Run.num_samples, noise=noise) curr_data.save_to_file(input_filename) else: print("Invalid mode:" + str(mode)) return run_id = 0 index_filename = out_dir + '/runsInfo.txt' print('index table: ' + index_filename) if resume and os.path.exists(index_filename): index_table = np.genfromtxt(index_filename, dtype=None, delimiter='\t', names=True, autostrip=False) if len(index_table) > 0 and 'ID' in index_table.dtype.fields: run_id = index_table['ID'][-1] + 1 print('Resuming from ID {}'.format(run_id)) write_header = (not os.path.exists(index_filename)) or (not resume) # create write the header for the runs.txt file f_runs = open(index_filename, 'a+' if resume else 'w+') all_param_info = \ ([['ID', 'ID', self.PARAM_TYPE_OUTPUT, 'ID'], ['imagePath', 'Image path', self.PARAM_TYPE_OUTPUT, 'Output image path']] + self.param_info() + [['epoch', 'Epoch', self.PARAM_TYPE_INT, 'Number of Epochs (of processing all training data)'], ['iteration', 'Iterations', self.PARAM_TYPE_INT, 'Number of Iterations (of processing a batch)'], ['success', 'Success', self.PARAM_TYPE_OUTPUT, 'Whether the training finished successfully'], ['total_time', 'Total time (ms)', self.PARAM_TYPE_OUTPUT, 'Total time at this epoch'], ['mean_time', 'Mean time (ms)', self.PARAM_TYPE_OUTPUT, 'Mean time per epoch'], ['train_loss', 'Training loss', self.PARAM_TYPE_OUTPUT, 'Training loss at epoch'], ['test_loss', 'Test loss', self.PARAM_TYPE_OUTPUT, 'Test loss at epoch'], ['train_TPR', 'TPR for train', self.PARAM_TYPE_OUTPUT, 'True positive rate for training data'], ['train_FPR', 'FPR for train', self.PARAM_TYPE_OUTPUT, 'False positive rate for training data'], # ['train_TNR', 'TNR for train', self.PARAM_TYPE_OUTPUT, 'True negative rate for training data'], # ['train_FNR', 'FNR for train', self.PARAM_TYPE_OUTPUT, 'False negative Rate for training data'], ['test_TPR', 'TPR for test', self.PARAM_TYPE_OUTPUT, 'True positive rate for test data'], ['test_FPR', 'FPR for test', self.PARAM_TYPE_OUTPUT, 'False 
positive rate for test data'], # ['test_TNR', 'TNR for test', self.PARAM_TYPE_OUTPUT, 'True negative rate for test data'], # ['test_FNR', 'FNR for test', self.PARAM_TYPE_OUTPUT, 'False negative Rate for test data'], ]) # save the paramInfo.txt with open(out_dir + '/paramInfo.txt', 'w') as fpi: fpi.write('\t'.join(self.param_info_header()) + '\n') fpi.write('\n'.join(['\t'.join(i) for i in all_param_info])) # write the header for the runs.txt if write_header: f_runs.write('\t'.join([i[0] for i in all_param_info]) + '\n') f_runs.flush() images_dir = out_dir + '/images' runs_dir = out_dir + '/runs' self.create_dir(images_dir, clean=not resume) self.create_dir(runs_dir, clean=not resume) while run_id < num_runs: if curr_data is None: self.randomize_data() # randomize the data every time else: self.data = curr_data # reuse the same data self.randomize_training_params() # print the parameters print('configuration (%d of %d)' % (int(run_id / len(self.epochs_per_config)) + 1, int(num_runs / len(self.epochs_per_config)))) print(', '.join(a[0] + ': ' + a[1] for a in zip(self.param_names(), self.param_str()))) prev_step = 0 total_time = 0 for epoch in self.epochs_per_config: curr_step = int(epoch * self.data.num_samples() / self.nn.batch_size) # curr_step = epoch # in the online demo epoch == iter: https://github.com/tensorflow/playground/blob/67cf64ffe1fc53967d1c979d26d30a4625d18310/src/playground.ts#L898 time_start = time.time() # train the network success = True try: train_loss, test_loss = self.nn.train(self.data, restart=False, num_steps=curr_step - prev_step) except: train_loss, test_loss = 1, 1 success = False total_time += (time.time() - time_start) * 1000.0 mean_time = total_time / epoch try: train_tpr, train_fpr, test_tpr, test_fpr = self.calc_tpr_fpr() except: train_tpr, train_fpr, test_tpr, test_fpr = 0, 1, 0, 1 success = False print('(epoch: %d, step: %d), ' '(total_time: %g, mean_time: %g), ' '(training loss: %g, test loss: %g), ' '(train_tpr: %g, train_fpr: %g test_tpr: %g, test_fpr: %g)' % (epoch, curr_step, round(total_time, 2), round(mean_time, 2), round(train_loss, 2), round(test_loss, 2), round(train_tpr, 2), round(train_fpr, 2), round(test_tpr, 2), round(test_fpr, 2))) image_filename = images_dir + '/' + str(run_id) + ".png" run_filename = runs_dir + '/' + str(run_id) + ".txt" self.save_plot(image_filename) self.save_current_run(run_filename) f_runs.write('\t'.join( [str(run_id), image_filename[len(out_dir)+1:]] + self.param_str() + [str(epoch), str(curr_step), str(success), str(round(total_time, 3)), str(round(mean_time, 3)), str(round(train_loss, 3)), str(round(test_loss, 3)), str(round(train_tpr, 3)), str(round(train_fpr, 3)), str(round(test_tpr, 3)), str(round(test_fpr, 3)), ]) + '\n') f_runs.flush() prev_step = curr_step run_id += 1 if run_id >= num_runs: break f_runs.close()
def __init__(self, data_name): self.dataset = DataSet.load_dataset(name=data_name)
output_sentence = [] for di in range(config.MAX_LENGTH): decoder_output, decoder_hidden, decoder_attention = decoder( decoder_input, decoder_hidden, encoder_outputs) topv, topi = decoder_output.data.topk(1) if topi.item() == config.EOS_token: break else: output_sentence.append(output_lang.index2word[topi.item()]) decoder_input = topi.squeeze().detach() return ' '.join(output_sentence) dataset = DataSet(config.input_lang, config.target_lang, config.path) dataset.prepareData() encoder = EncoderRNN(dataset.input_lang.n_words, config.hidden_size) decoder = AttnDecoderRNN(config.hidden_size, dataset.target_lang.n_words, config.MAX_LENGTH) encoder.load_state_dict( torch.load(config.curPath + 'annotation_encoder.pth', map_location=config.eval_device)) decoder.load_state_dict( torch.load(config.curPath + 'annotation_decoder.pth', map_location=config.eval_device)) code = sys.argv[1] #code = input() input_tensor = dataset.tensorFromSentence(code, dataset.input_lang)
num_epochs = 50 feature_extract = True device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") model_ft, input_size = initialize_model(model_name, num_classes, feature_extract, use_pretrained=True) #print(model_ft) # Create training and validation datasets image_datasets = { x: DataSet(data_dir[x], input_size, num_classes, x == 'train') for x in ['train', 'val'] } # Create training and validation dataloaders dataloaders_dict = { x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size, shuffle=True, num_workers=16) for x in ['train', 'val'] } for x in ['train', 'val']: print('dataset size', x, len(image_datasets[x])) print('loader size', x, len(dataloaders_dict[x]))
# svm.test() # SIFT_SVM = SIFT_SupportVectorMachine() # SIFT_SVM.setInputs( # DataSet(vectors_path=None, images_path="datasets/Train/TrainImages", labels_path="datasets/Train/trainLbls.csv"), # DataSet(vectors_path=None, images_path="datasets/Validate/ValidationImages", labels_path="datasets/Validate/valLbls.csv"), # DataSet(vectors_path=None, images_path="datasets/Test/TestImages") # ) # SIFT_SVM.train() # NN = NearestNeighbour() # NN.setInputs( # DataSet("datasets/Train/trainVectors.csv", # labels_path="datasets/Train/trainLbls.csv", normalize = False), # DataSet("datasets/Validate/valVectors.csv", # labels_path="datasets/Validate/valLbls.csv", normalize = False), # DataSet("datasets/Test/testVectors.csv", normalize = False) # ) # NN.validate() SIFT_NN = SIFT_NearestNeighbour() SIFT_NN.setInputs( DataSet(vectors_path=None, images_path="datasets/Train/TrainImages", labels_path="datasets/Train/trainLbls.csv"), DataSet(vectors_path=None, images_path="datasets/Validate/ValidationImages", labels_path="datasets/Validate/valLbls.csv"), DataSet(vectors_path=None, images_path="datasets/Test/TestImages")) SIFT_NN.validate()
class Model(object): def __init__(self, config): self.epoch_count = 0 self.config = config self.data = DataSet(config) self.add_placeholders() self.summarizer = tf.summary self.net = Network(config, self.summarizer) self.optimizer = self.config.solver.optimizer self.y_pred = self.net.prediction(self.x, self.keep_prob) self.loss = self.net.loss(self.x, self.y, self.keep_prob) self.accuracy = self.net.accuracy(tf.nn.sigmoid(self.y_pred), self.y) self.patk = self.net.patk(self.y, self.y_pred) self.summarizer.scalar("accuracy", self.accuracy) self.summarizer.scalar("loss", self.loss) self.train = self.net.train_step(self.loss) self.saver = tf.train.Saver() self.init = tf.global_variables_initializer() self.local_init = tf.local_variables_initializer() def add_placeholders(self): self.x = tf.placeholder(tf.float32, shape=[None, self.config.features_dim]) self.y = tf.placeholder(tf.float32, shape=[None, self.config.labels_dim]) self.keep_prob = tf.placeholder(tf.float32) #self.k = int() def run_epoch(self, sess, data, summarizer, epoch): err = list() i, p_k, y_pred, Y = 0, None, None, None step = epoch merged_summary = self.summarizer.merge_all() for X, Y, tot in self.data.next_batch(data): feed_dict = { self.x: X, self.y: Y, self.keep_prob: self.config.solver.dropout } if not self.config.load: summ, _, y_pred, loss = sess.run([ merged_summary, self.train, tf.nn.sigmoid(self.y_pred), self.loss ], feed_dict=feed_dict) err.append(loss) output = "Epoch ({}) Batch({}) - Loss : {}".format( self.epoch_count, i, loss) with open( "../stdout/{}_train.log".format( self.config.project_name), "a+") as log: log.write(output + "\n") print(" {}".format(output), end='\r') step = int(epoch * tot + i) summarizer.add_summary(summ, step) i += 1 #p_k = patk(predictions=y_pred, labels=Y) return np.mean(err), step, p_k def run_eval(self, sess, data, summary_writer=None, step=0): y, y_pred, loss_, metrics, p_k = list(), list(), 0.0, None, None accuracy, loss = 0.0, 0.0 merged_summary = self.summarizer.merge_all() i = 0 for X, Y, tot in self.data.next_batch(data): feed_dict = {self.x: X, self.y: Y, self.keep_prob: 1} if i == tot - 1 and summary_writer is not None: if data == "validation": summ, loss_ = sess.run([merged_summary, self.loss], feed_dict=feed_dict) else: summ, loss_, accuracy_val = sess.run( [merged_summary, self.loss, self.accuracy], feed_dict=feed_dict) summary_writer.add_summary(summ, step) else: if data == "validation": loss_, Y_pred = sess.run( [self.loss, tf.nn.sigmoid(self.y_pred)], feed_dict=feed_dict) p_k = patk(predictions=Y_pred, labels=Y) else: loss_, Y_pred, accuracy_val = sess.run( [self.loss, tf.nn.sigmoid(self.y_pred), self.accuracy], feed_dict=feed_dict) metrics = evaluate(predictions=Y_pred, labels=Y) accuracy += accuracy_val #metrics['accuracy'] loss += loss_ i += 1 if data == "test": X, Y = self.data.get_test() p_k = patk( sess.run(tf.nn.sigmoid(self.y_pred), feed_dict={ self.x: X, self.y: Y, self.keep_prob: 1 }), Y ) # sess.run(self.patk, feed_dict={self.x: X, self.y: Y, self.keep_prob: 1}) # return loss / i, accuracy / self.config.batch_size, metrics, p_k def add_summaries(self, sess): if self.config.load or self.config.debug: path_ = "../results/tensorboard" else: path_ = "../bin/results/tensorboard" summary_writer_train = tf.summary.FileWriter(path_ + "/train", sess.graph) summary_writer_val = tf.summary.FileWriter(path_ + "/val", sess.graph) summary_writer_test = tf.summary.FileWriter(path_ + "/test", sess.graph) summary_writers = { 'train': summary_writer_train, 'val': 
summary_writer_val, 'test': summary_writer_test } return summary_writers def fit(self, sess, summarizer): ''' - Patience Method : + Train for a particular no. of epochs, and based on the frequency, evaluate the model using validation data. + If Validation Loss increases, decrease the patience counter. + If patience becomes less than a certain threshold, divide the learning rate by 10 and switch back to the old model. + If the learning rate falls below a certain floor, stop training. ''' sess.run(self.init) sess.run(self.local_init) max_epochs = self.config.max_epochs patience = self.config.patience patience_increase = self.config.patience_increase improvement_threshold = self.config.improvement_threshold best_validation_loss = 1e6 self.epoch_count = 0 best_step, losses, learning_rate = -1, list( ), self.config.solver.learning_rate while self.epoch_count < max_epochs: if (self.config.load == True): break start_time = time.time() average_loss, tr_step, train_patk = self.run_epoch( sess, "train", summarizer['train'], self.epoch_count) duration = time.time() - start_time if not self.config.debug: if self.epoch_count % self.config.epoch_freq == 0: val_loss, _, _, val_patk = self.run_eval( sess, "validation", summarizer['val'], tr_step) test_loss, _, metrics, patk = self.run_eval( sess, "test", summarizer['test'], tr_step) output = "=> Training : Loss = {:.2f} | Validation : Loss = {:.2f} | Test : Loss = {:.2f}\n=> Training : P@K = {} | Validation : P@K = {} | Test : P@K {}".format( average_loss, val_loss, test_loss, train_patk, val_patk, patk) with open("../stdout/validation.log", "a+") as f: output_ = output + "\n=> Test : Coverage = {}, Average Precision = {}, Micro Precision = {}, Micro Recall = {}, Micro F Score = {}".format( metrics['coverage'], metrics['average_precision'], metrics['micro_precision'], metrics['micro_recall'], metrics['micro_f1']) output_ += "\n=> Test : Macro Precision = {}, Macro Recall = {}, Macro F Score = {}\n=> P@K = {}\n\n".format( metrics['macro_precision'], metrics['macro_recall'], metrics['macro_f1'], patk) f.write(output_) print(output) if self.config.have_patience: if val_loss < best_validation_loss: if val_loss < best_validation_loss * improvement_threshold: self.saver.save( sess, self.config.ckptdir_path + "/model_best.ckpt") best_validation_loss = val_loss best_step = self.epoch_count else: if patience < 1: self.saver.restore( sess, self.config.ckptdir_path + "/model_best.ckpt") if learning_rate <= 0.00001: print("=> Breaking by Patience Method") break else: learning_rate /= 10 patience = self.config.patience print( "\033[91m=> Learning rate dropped to {}\033[0m" .format(learning_rate)) else: patience -= 1 self.epoch_count += 1 print("=> Best epoch : {}".format(best_step)) if self.config.debug == True: sys.exit() test_loss, test_accuracy, test_metrics, p_k = self.run_eval( sess, "test", summarizer['test'], tr_step) returnDict = { "test_loss": test_loss, "test_accuracy": test_accuracy, 'test_metrics': test_metrics, "test_pak": p_k } if self.config.debug == False: returnDict["train"] = best_validation_loss return returnDict
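# A minimal, framework-agnostic sketch (an assumption for illustration, not the project's code)
# of the "patience" schedule that Model.fit() above describes: track the best validation loss,
# decrement patience while validation stops improving, and when patience runs out restore the
# best checkpoint and divide the learning rate by 10; stop once the learning rate hits a floor.
# train_one_epoch/eval_val/save/restore are hypothetical callables supplied by the caller.
def patience_loop(train_one_epoch, eval_val, save, restore,
                  max_epochs=100, patience=5, lr=1e-2, lr_floor=1e-5):
    best_val, patience_left = float("inf"), patience
    for epoch in range(max_epochs):
        train_one_epoch(lr)
        val_loss = eval_val()
        if val_loss < best_val:
            best_val = val_loss
            patience_left = patience
            save()                      # checkpoint the best model so far
        elif patience_left > 0:
            patience_left -= 1          # tolerate a few non-improving epochs
        else:
            restore()                   # roll back to the best checkpoint
            lr /= 10.0                  # retry with a smaller step size
            patience_left = patience
            if lr < lr_floor:
                break                   # learning rate exhausted: stop
    return best_val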
cur_pix_frames = [] for i, frame_data in enumerate(self.temp_data): printProgressBar(i + self.temp_data.start_frame, self.temp_data.end_frame, 'Generating temp history plot.') cur_pix_frames.append(i + self.temp_data.start_frame) cur_pix_history.append(frame_data[pixel]) pixelTempHistory.append(cur_pix_history) frame.append(cur_pix_frames) fig6, ax6 = plt.subplots() fig6.suptitle('Pixel {} Temperature History:\n'.format(self.pixels)) ax6.set_xlabel('Frame') ax6.set_ylabel('Temperature') for history, frames, pixel in zip(pixelTempHistory, frame, self.pixels): ax6.plot(frames, history, label=str(pixel[1]) + ',' + str(pixel[0])) plt.legend() if __name__ == '__main__': dataset = DataSet( '/home/troy/thermography/4-20_corrected/thermal_cam_temps.npy', end_frame=27000) plotter = Plots(dataset, [(50, 100), (123, 99)], threshold=500) plotter.plot3DBubble() plt.show()
def main(operation='train', code=None): step = 30 input_size = 73 train_steps = 1000000 batch_size = 512 learning_rate = 0.0002 hidden_size = 16 nclasses = 1 validation_size = 700 keep_rate = 0.7 selector = [ "ROCP", "OROCP", "HROCP", "LROCP", "MACD", "RSI", "VROCP", "BOLL", "MA", "VMA", "PRICE_VOLUME", "CROSS_PRICE" ] input_shape = [step, input_size] # [length of time series, length of feature] if operation == 'train': dataset_dir = "./dataset" train_features = [] train_labels = [] val_features = [] val_labels = [] for filename in os.listdir(dataset_dir): if filename != '000001.csv': continue print("processing file: " + filename) filepath = dataset_dir + "/" + filename raw_data = read_sample_data(filepath) moving_features, moving_labels = extract_feature( raw_data=raw_data, selector=selector, window=input_shape[0], with_label=True, flatten=False) train_features.extend(moving_features[:-validation_size]) train_labels.extend(moving_labels[:-validation_size]) val_features.extend(moving_features[-validation_size:]) val_labels.extend(moving_labels[-validation_size:]) train_features = numpy.transpose(numpy.asarray(train_features), [0, 2, 1]) train_labels = numpy.asarray(train_labels) train_labels = numpy.reshape(train_labels, [train_labels.shape[0], 1]) val_features = numpy.transpose(numpy.asarray(val_features), [0, 2, 1]) val_labels = numpy.asarray(val_labels) val_labels = numpy.reshape(val_labels, [val_labels.shape[0], 1]) train_set = DataSet(train_features, train_labels) val_set = DataSet(val_features, val_labels) # raw_data = read_sample_data("toy_stock.csv") # moving_features, moving_labels = extract_feature(raw_data=raw_data, selector=selector, window=input_shape[0], # with_label=True, flatten=False) # moving_features = numpy.asarray(moving_features) # moving_features = numpy.transpose(moving_features, [0, 2, 1]) # moving_labels = numpy.asarray(moving_labels) # moving_labels = numpy.reshape(moving_labels, [moving_labels.shape[0], 1]) # train_set = DataSet(moving_features[:-validation_size], moving_labels[:-validation_size]) # val_set = DataSet(moving_features[-validation_size:], moving_labels[-validation_size:]) trader = SmartTrader(step, input_size, learning_rate, hidden_size, nclasses) trader.build_graph() train(trader, train_set, val_set, train_steps, batch_size=batch_size, keep_rate=keep_rate) elif operation == "predict": predict_file_path = "./dataset/000001.csv" if code is not None: predict_file_path = "./dataset/%s.csv" % code print("processing file %s" % predict_file_path) raw_data = read_sample_data(predict_file_path) moving_features, moving_labels = extract_feature(raw_data=raw_data, selector=selector, window=input_shape[0], with_label=True, flatten=False) moving_features = numpy.asarray(moving_features) moving_features = numpy.transpose(moving_features, [0, 2, 1]) moving_labels = numpy.asarray(moving_labels) moving_labels = numpy.reshape(moving_labels, [moving_labels.shape[0], 1]) # train_set = DataSet(moving_features[:-validation_size], moving_labels[:-validation_size]) val_set = DataSet(moving_features[-validation_size:], moving_labels[-validation_size:]) predict(val_set, step=step, input_size=input_size, learning_rate=learning_rate, hidden_size=hidden_size, nclasses=nclasses) else: print("Operation not supported. ")
def main(args): check_path(args) # All 10 CIFAR-10 classes classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') # dataset data_builder = DataBuilder(args) dataSet = DataSet(data_builder.train_builder(), data_builder.test_builder(), classes) # choose the model if args.lenet: net = LeNet() model_name = args.name_le elif args.vgg: net = Vgg16_Net() model_name = args.name_vgg elif args.resnet18: net = ResNet18() model_name = args.name_res18 elif args.resnet34: net = ResNet34() model_name = args.name_res34 elif args.resnet50: net = ResNet50() model_name = args.name_res50 elif args.resnet101: net = ResNet101() model_name = args.name_res101 elif args.resnet152: net = ResNet152() model_name = args.name_res152 # cross-entropy loss criterion = nn.CrossEntropyLoss() # SGD optimizer optimizer = optim.SGD(net.parameters(), lr=args.learning_rate, momentum=args.sgd_momentum, weight_decay=args.weight_decay) # cosine-annealing learning-rate schedule scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=150) # path where the model parameters are saved model_path = os.path.join(args.model_path, model_name) # start training if args.do_train: print("Training...") trainer = Trainer(net, criterion, optimizer, scheduler, dataSet.train_loader, dataSet.test_loader, model_path, args) trainer.train(epochs=args.epoch) # t.save(net.state_dict(), model_path) # start evaluation; if --do_train is also given, test with the model that was just trained # otherwise test with the previously saved model if args.do_eval: if not args.do_train and not os.path.exists(model_path): print( "Sorry, there's no saved model yet, you need to train first.") return # --do_eval if not args.do_train: checkpoint = t.load(model_path) net.load_state_dict(checkpoint['net']) accuracy = checkpoint['acc'] epoch = checkpoint['epoch'] print("Using saved model, accuracy : %f epoch: %d" % (accuracy, epoch)) tester = Tester(dataSet.test_loader, net, args) tester.test() if args.show_model: if not os.path.exists(model_path): print( "Sorry, there's no saved model yet, you need to train first.") return show_model(args) if args.do_predict: device = t.device("cuda" if t.cuda.is_available() else "cpu") checkpoint = t.load(model_path, map_location=device) net.load_state_dict(checkpoint['net']) predictor = Predictor(net, classes) img_path = 'test' img_name = [os.path.join(img_path, x) for x in os.listdir(img_path)] for img in img_name: predictor.predict(img)
from params import * from least_squares import LeastSquares from regularized_least_squares import RegularizedLeastSquares from lasso import Lasso from robust_regression import RobustRegression from bayesian_regression import BayesianRegression from dataset import DataSet from common import * result_sub_path = result_path + 'part1c/' #get data gt_data = DataSet(gt_x_path, gt_y_path) gt_x, gt_y = gt_data.x, gt_data.y gt_phi = generate_polynomial_features(gt_x) sample_percent = [10, 25, 50, 75] num_sub_sample = 5 sample_data_list = [] for i in range(4): for j in range(num_sub_sample): sample_data = DataSet(sample_x_path, sample_y_path, percent_sample=sample_percent[i]) sample_data_list.append(sample_data) def deploy_least_square(sample_id): sample_data = sample_data_list[sample_id] sample_x, sample_y = sample_data.x, sample_data.y sample_phi = generate_polynomial_features(sample_x) title = 'LEAST SQUARES' log = open(result_sub_path + title + '_' + str(sample_id) + '.txt', 'w')
import cv2 import torch import numpy as np from dataset import DataSet from loss2 import MultiBoxLoss from ssd300 import SSD from ssdpytorch.utils.augmentations import SSDAugmentation from torch.utils.data import DataLoader from torch.autograd import Variable import matplotlib.pyplot as plt from PIL import ImageDraw model = SSD() model.cuda() model.load_state_dict(torch.load('ssd.pth')) test_dataset = DataSet('MiniSKU/test','MiniSKU/annotations/test.csv', SSDAugmentation(scale_only=True)) test_loader = DataLoader(test_dataset, 1, num_workers=2, collate_fn=test_dataset.collate_fn) def test(n=5): d = [] for i, (img, boxes, labels) in enumerate(test_loader): predicted_locs, predicted_scores = model(img[0].unsqueeze(0).cuda()) det_boxes, det_labels, det_scores = model.detect_objects(predicted_locs, predicted_scores, min_score=0.2, max_overlap=0.5, top_k=200) det_boxes = det_boxes[0].to('cpu') img = img[0].permute(1,2,0) img = torch.squeeze(img) print(img.shape) dist1 = cv2.convertScaleAbs(img.numpy()) w, h, _ = dist1.shape origin_dims = torch.FloatTensor([w,h,w,h]).unsqueeze(0)
def moving_extract(self, window=30, date=None, open_prices=None, close_prices=None, high_prices=None, low_prices=None, volumes=None, N_predict=1, flatten=True): self.extract(open_prices=open_prices, close_prices=close_prices, high_prices=high_prices, low_prices=low_prices, volumes=volumes) feature_arr = numpy.asarray(self.feature) p = 0 rows = feature_arr.shape[0] print("feature dimension: %s" % rows) all_data = DataSet([], [], []) predict = DataSet([], [], []) while p + window <= feature_arr.shape[1]: # The last self.prospective days can not produce complete labels if feature_arr.shape[1] - (p + window) >= N_predict: x = feature_arr[:, p:p + window] # Label the closing price of the next day -days y = make_label(close_prices, p + window, self.prospective) d = list(date[p + window:p + window + self.prospective]) if flatten: x = x.flatten("F") all_data.features.append(numpy.nan_to_num(x)) all_data.labels.append(y) all_data.date.append(d) else: x = feature_arr[:, p:p + window] if flatten: x = x.flatten("F") predict.features.append(numpy.nan_to_num(x)) predict.date.append(date[p + window - 1]) predict.closing_price.append(close_prices[p + window - 1]) predict.last_label.append(close_prices[p + window - 2]) p += 1 all_data._features = numpy.asarray(all_data.features) all_data._labels = numpy.asarray(all_data.labels) all_data._date = numpy.asarray(all_data.date) predict._features = numpy.asarray(predict.features) predict._date = numpy.asarray(predict.date) predict._last_label = numpy.asarray(predict.last_label) predict._closing_price = numpy.asarray(predict.closing_price) return all_data, predict
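# A minimal NumPy sketch (illustrative assumptions, not the project's DataSet API) of the
# sliding-window split performed by moving_extract() above: each sample is a (features x window)
# slice of the feature matrix, and its label is derived from the prices that follow the window;
# windows with fewer than n_predict future prices left go into an unlabelled "predict" pool.
import numpy as np

def sliding_windows(feature_arr, close_prices, window=30, n_predict=1):
    labelled_x, labelled_y, unlabelled_x = [], [], []
    n_cols = feature_arr.shape[1]
    for p in range(n_cols - window + 1):
        x = feature_arr[:, p:p + window]
        if n_cols - (p + window) >= n_predict:
            # enough future prices remain to build a label for this window
            labelled_x.append(x.flatten("F"))
            labelled_y.append(close_prices[p + window])   # e.g. next-day close
        else:
            unlabelled_x.append(x.flatten("F"))
    return np.asarray(labelled_x), np.asarray(labelled_y), np.asarray(unlabelled_x)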
def test(): BATCH_SIZE = 1 with tf.Graph().as_default(): dataset = DataSet(BATCH_SIZE) keep_conv = tf.placeholder(tf.float32) images, depths, invalid_depths, features = dataset.csv_inputs( TEST_FILE) coarse = model.inference(images, trainable=False) logits = model.inference_refine(images, coarse, keep_conv, trainable=False) loss1 = model.loss(coarse, depths, invalid_depths) loss2 = model.loss(logits, depths, invalid_depths) init_op = tf.global_variables_initializer() # changed # Session sess = tf.Session(config=tf.ConfigProto( log_device_placement=LOG_DEVICE_PLACEMENT)) # do not print device placement logs sess.run(init_op) coarse_params = {} # define a new dict refine_params = {} for variable in tf.all_variables(): variable_name = variable.name #print("parameter: %s" % (variable_name)) if variable_name.find("/") < 0 or variable_name.count("/") != 1: continue if variable_name.find('coarse') >= 0: coarse_params[variable_name] = variable #print("parameter: %s" %(variable_name)) if variable_name.find('fine') >= 0: refine_params[variable_name] = variable saver_coarse = tf.train.Saver(coarse_params) saver_refine = tf.train.Saver(refine_params) # fine-tune... if FINE_TUNE: coarse_ckpt = tf.train.get_checkpoint_state(COARSE_DIR) if coarse_ckpt and coarse_ckpt.model_checkpoint_path: #print(coarse_ckpt.model_checkpoint_path) saver_coarse.restore(sess, coarse_ckpt.model_checkpoint_path) refine_ckpt = tf.train.get_checkpoint_state(REFINE_DIR) if refine_ckpt and refine_ckpt.model_checkpoint_path: #print(refine_ckpt.model_checkpoint_path) saver_refine.restore(sess, refine_ckpt.model_checkpoint_path) # test coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess=sess, coord=coord) index = 0 ls1 = [] ls2 = [] print('\n', '---------Examples---------:') for step in range(NumOfTest): #print('-----------------------------------------') loss_value1, loss_value2, logits_val, coarse_val, images_val, features_ = sess.run( [loss1, loss2, logits, coarse, images, features], feed_dict={keep_conv: 1}) ls1.append(loss_value1) ls2.append(loss_value2) if step % 1 == 0: index = index + 1 print(features_, 'Coarse losses:', loss_value1, 'Refine losses:', loss_value2, '\n') output_save(coarse_val, logits_val, images_val, index, "data/test") ls1m = np.mean(ls1) ls2m = np.mean(ls2) print('---------Testing Results--------:') print('Coarse image mean losses:', ls1m) print('Refine image mean losses:', ls2m) coord.request_stop() # ask all threads to stop coord.join(threads) # wait for all threads to finish sess.close()
def main(args): config_file = args.config test = args.test cfg = Config(config_file) tr = None if test is None: tr = DataSet(cfg.tr_data, cfg) te = DataSet(cfg.te_data, cfg, sub_sample=1) tr0 = DataSet([cfg.tr_data[0]], cfg, sub_sample=1) cfg.att = te.sz[1] else: if test == 'te': te = DataSet([cfg.te_data[0]], cfg) else: te = DataSet([cfg.tr_data[0]], cfg) cfg.att = te.sz[1] iterations = 10000 loop = cfg.loop print "input attribute", cfg.att, "LR", cfg.lr, 'feature', cfg.feature_len n_att = cfg.att # n_length = cfg.feature_len n_hidden = cfg.nodes[1][-1] n_output = cfg.num_output hidden0 = ToTensor(np.ones(n_hidden).astype(np.float32)) mrnn = RNN(n_att, cfg.nodes, n_output, cfg.lr) if test: mrnn.load_state_dict(torch.load(cfg.netTest[:-3])) run_test(mrnn, te, cfg, hidden0) tr_loss, tr_median = run_test(mrnn, te, cfg, hidden0) for a in range(len(tr_loss)): print a, tr_loss[a], tr_median[a] exit(0) if cfg.renetFile: mrnn.load_state_dict(torch.load(cfg.renetFile[:-3])) t00 = datetime.datetime.now() T = 0 T_err = 0 for a in range(iterations): tr_pre_data = tr.prepare(multi=1) while tr_pre_data: for b in tr_pre_data: length = len(b[0]) x = ToTensor(b[0].reshape(length, cfg.feature_len, cfg.att).astype(np.float32)) y = ToTensor(b[1].astype(np.float32)) err = mrnn.train(y, x, hidden0) if a % loop == 0 and a > 0: t1 = datetime.datetime.now() print a, (t1 - t00).total_seconds() / 3600.0, T_err / T T_err = 0 T = 0 torch.save(mrnn.state_dict(), cfg.netFile[:-3]) T_err += err T += 1 tr_pre_data = tr.get_next()
def main(): # Load json file ds_task1 = DataSet('dataset/entity.json') # ds_task1.json_print() hash_type = ds_task1.entity_categories()
def train(REFINE_TRAIN): BATCH_SIZE = 8 with tf.Graph().as_default(): global_step = tf.Variable(0, trainable=False) dataset = DataSet(BATCH_SIZE) keep_conv = tf.placeholder(tf.float32) images, depths, invalid_depths, features = dataset.csv_inputs( TRAIN_FILE) if REFINE_TRAIN: print("refine train.") coarse = model.inference(images, trainable=False) logits = model.inference_refine(images, coarse, keep_conv) # ??? what is this argument else: print("coarse train.") logits = model.inference(images) loss = model.loss(logits, depths, invalid_depths) train_op = op.train(loss, global_step, BATCH_SIZE) init_op = tf.global_variables_initializer() # changed # Session sess = tf.Session(config=tf.ConfigProto( log_device_placement=LOG_DEVICE_PLACEMENT)) # do not print device placement logs sess.run(init_op) # parameters coarse_params = {} # define a new dict refine_params = {} if REFINE_TRAIN: for variable in tf.all_variables(): variable_name = variable.name print("parameter: %s" % (variable_name)) if variable_name.find("/") < 0 or variable_name.count( "/") != 1: continue if variable_name.find('coarse') >= 0: coarse_params[variable_name] = variable print("parameter: %s" % (variable_name)) if variable_name.find('fine') >= 0: refine_params[variable_name] = variable else: for variable in tf.trainable_variables(): variable_name = variable.name print("parameter: %s" % (variable_name)) if variable_name.find("/") < 0 or variable_name.count( "/") != 1: continue if variable_name.find('coarse') >= 0: coarse_params[variable_name] = variable if variable_name.find('fine') >= 0: refine_params[variable_name] = variable # define saver print(coarse_params) saver_coarse = tf.train.Saver(coarse_params) if REFINE_TRAIN: saver_refine = tf.train.Saver(refine_params) # fine-tune... if FINE_TUNE: coarse_ckpt = tf.train.get_checkpoint_state(COARSE_DIR) if coarse_ckpt and coarse_ckpt.model_checkpoint_path: print(coarse_ckpt.model_checkpoint_path) saver_coarse.restore(sess, coarse_ckpt.model_checkpoint_path) else: print("No Pretrained coarse Model.") if REFINE_TRAIN: refine_ckpt = tf.train.get_checkpoint_state(REFINE_DIR) if refine_ckpt and refine_ckpt.model_checkpoint_path: saver_refine.restore(sess, refine_ckpt.model_checkpoint_path) else: print("No Pretrained refine Model.") # train coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess=sess, coord=coord) lossli = [] lossli1 = [] for step in range(MAX_STEPS): index = 0 lossli = [] print('-------------------------------') for i in range(3000): _, loss_value, logits_val, images_val = sess.run( [train_op, loss, logits, images], feed_dict={keep_conv: 0.8}) if i % 100 == 0: print('[Epoch]:', step, '[iteration]:', i, '[Train losses]:', loss_value) lossli.append(loss_value) index += 1 lossli1.append(np.mean(lossli)) if step % 5 == 0 or (step + 1) == MAX_STEPS: if REFINE_TRAIN: refine_checkpoint_path = REFINE_DIR + '/model.ckpt' saver_refine.save(sess, refine_checkpoint_path, global_step=step) else: coarse_checkpoint_path = COARSE_DIR + '/model.ckpt' saver_coarse.save(sess, coarse_checkpoint_path, global_step=step) plt.figure() plt.plot(lossli1) plt.xlabel("Epoch") plt.ylabel("Train_loss") plt.title("Train_Loss for Each Epoch") plt.savefig("train_loss.jpg") coord.request_stop() # ask all threads to stop coord.join(threads) # wait for all threads to finish sess.close()