def __init__(self, features):
    Features.__init__(self)
    self.features = features
    # total count is the sum of the counts of all wrapped features
    total = sum(f.GetCount() for f in features)
    self.SetCount(total)
def __init__(self, conf):
    Features.__init__(self)
    # spatial-pyramid style bin count: level i contributes (i + 1)**2 cells
    nc = 0
    for i in range(kNumLevels):
        nc += (i + 1) ** 2
    self.SetCount(kNumBins * nc)
    print "Histogram bins:", self.GetCount()
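# Worked example of the bin count above, assuming hypothetical values
# kNumLevels = 3 and kNumBins = 16: nc = 1**2 + 2**2 + 3**2 = 14 pyramid
# cells, so SetCount receives 16 * 14 = 224 histogram bins in total.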
def generateFeatures(self):
    featureGenerator = Features()
    self.featuresFunctionsHandMade = featureGenerator.features()
    self.featuresFunctions = Features.featuresFromSentences(self.sentences)
    # one random initial weight per hand-made feature function
    self.weights = np.random.uniform(
        low=0.0, high=1.0, size=len(self.featuresFunctionsHandMade))
def extract_and_transform(avm, df, transform_y):
    f = Features()
    return f.extract_and_transform_X_y(
        df,
        f.ege(avm.features_group),
        layout_transactions.price,
        'natural',
        'natural',
        transform_y,
    )
def getFeatureFunctions(args):
    """
    Based on the user's choice, pass the appropriate feature functions.
    For more details, look into the Features class.
    :param args: parsed command-line arguments
    :return: a Features instance wrapping the selected functions
    """
    functionsArray = Features.getSupportedFunctions()
    functionsList = sub1FromList(args.features)  # shift the user's 1-based choices to 0-based indices
    featureFunctions = functionsArray[functionsList]
    return Features(featureFunctions)
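# A minimal usage sketch, assuming args.features holds 1-based indices that
# sub1FromList shifts to 0-based (the argument values here are hypothetical):
#
#   args = argparse.Namespace(features=[1, 3])
#   features = getFeatureFunctions(args)  # wraps supported functions 0 and 2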
def read_and_save_features(path_to_audio_dataset, path_to_save_features):
    Preprocesor.create_structure(path=path_to_save_features)
    labels = np.sort(os.listdir(path_to_audio_dataset))
    for dir_filename in labels:
        path_to_word_dataset = path_to_audio_dataset + os.sep + dir_filename
        if not os.path.isdir(path_to_word_dataset):
            continue
        # extract features for every audio file in this word's directory
        features_pack = []
        for file in tqdm(os.listdir(path_to_word_dataset)):
            audio, rate = sf.read(file=path_to_word_dataset + os.sep + file)
            features = Features(input=audio, rate=rate)
            features_to_save = features.wav_to_features()
            features_pack.append(features_to_save)
        np.save(file=path_to_save_features + os.sep + dir_filename, arr=features_pack)
def __init__(self, graph, settings):
    """
    :param graph (matplotlib.pyplot.scatter): graph instance that needs to be
        filled while exploring the neighbourhood
    :param settings (Settings): settings of the graph to be plotted and of the
        bees algorithm itself

    Settings that are used here:
    determining elite and non-elite sites:
        - BEESNUM
        - ELITE
        - NONELITE
        - RECRUITEDELITE
        - RECRUITEDNONELITE
    plotting the points on the graph:
        - getbest()
        - getlocalbest()
        - SIZELOCALBEST
        - OPACITYLOCALBEST
        - getrgbcolor()
    """
    self.graph = graph
    self.settings = settings
    # start with a sentinel "worst possible" global best at infinity
    self.globalbest = Point(
        Coordinate(float('inf'), float('inf'), float('inf')),
        Features(None, None, None))
    self.wasglobalbest = False
    self.fabric = CoordinateFabric(settings)
    self.sites = []
    self.pointscontroller = PointsController()
def __init__(self, row, store_score=False):
    self.essay_id = row[0]
    self.essay_set = int(row[1])
    text = row[2]
    if store_score:
        self.score = self.get_score(self.essay_set, row)
    self.features = Features(text)
def load_features(img_dirs, feature_names):
    """
    Loads features from file.

    :param img_dirs: a list of image paths
    :type img_dirs: list
    :param feature_names: a list of feature names
    :type feature_names: list
    :return: dict (img_dir, features) where features is also a dict
        (feature_name, feature vector)
    :rtype: dict
    """
    features = OrderedDict()
    for img_dir in img_dirs:
        features[img_dir] = OrderedDict()
        for feature_name in feature_names:
            if Features.is_TU_feature(feature_name):
                feature = _load_TU_feature(img_dir, feature_name)
                if feature is None:
                    # replace a missing feature with zeros of the correct
                    # vector size, taken from the first image's feature
                    feature = np.zeros(
                        features[img_dirs[0]][feature_name].shape)
            else:
                feature = load_precalc_feature(img_dir, feature_name)
            features[img_dir][feature_name] = feature
    return features
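# A minimal usage sketch (the image path and feature name are hypothetical):
#
#   feats = load_features(['videos/clip1/frame0.png'], ['color_hist'])
#   vec = feats['videos/clip1/frame0.png']['color_hist']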
def save_features(img_dir, feature_name, feature):
    """
    Saves features of an image in a .txt file (compressed in gzip format).

    :param img_dir: path of the image
    :type img_dir: str
    :param feature_name: name of the feature (should be one of the Features enum)
    :type feature_name: str
    :param feature: the feature vector
    :type feature: np.array
    """
    # create the feature file path
    feature_file_path = os.path.join(
        img_dir[:img_dir.find('videos')],
        'features', 'Features_From_TUWien', 'Image_Subtask', feature_name,
        img_dir[img_dir.find('videos') + 7:] + '.txt.gz')
    # strip the file name to get the directory, using the platform's separator
    if platform.system() == 'Linux':
        dirs = feature_file_path[:feature_file_path.rfind('/')]  # Linux system
    else:
        dirs = feature_file_path[:feature_file_path.rfind('\\')]  # Windows system
    if not os.path.exists(dirs):
        os.makedirs(dirs)
    if Features.is_single_val_feature(feature_name):
        # feature is a single value
        with gzip.open(feature_file_path, "w") as f:
            f.write(str(feature))
    else:
        np.savetxt(feature_file_path, feature, newline=' ')
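# Path construction example with hypothetical inputs: for
# img_dir = '/data/videos/clip1/frame3.png' and feature_name = 'color_hist',
# feature_file_path becomes
# '/data/features/Features_From_TUWien/Image_Subtask/color_hist/clip1/frame3.png.txt.gz'
# (the slice at img_dir.find('videos') + 7 skips the 'videos/' prefix).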
def add(self, point):
    # adds the point to the holder and updates other points if necessary
    rgba = self.decideoncolor(point.z)
    if point.z < self.currentbestz:
        # demote the previous best point before promoting the new one
        if self.currentbestid != -1:
            self.holder.changesize(self.currentbestid, self.settings.SIZEBAD)
            self.holder.changeopacity(self.currentbestid, self.settings.OPACITYBAD)
        rgba.append(self.settings.OPACITYGOOD)
        self.currentbestz = point.z
        self.currentbestid = point.id
        features = Features(self.settings.SIZEGOOD, rgba)
    else:
        rgba.append(self.settings.OPACITYBAD)
        features = Features(self.settings.SIZEBAD, rgba)
    self.holder.add(ColoredPoint(point, features))
def loadFeatures(self):
    import sys
    # threshold=np.nan is rejected by recent numpy; sys.maxsize disables truncation
    np.set_printoptions(threshold=sys.maxsize)
    features = loadmat(datadir + 'features/cache.binAudLSTM_' + self.type +
                       '_scene' + str(self.sceneid) + '/' + self.name)
    features = np.array(features["x"])
    self.fClass = Features(features)
def parseInputFile(inputFileName):
    featureObjects = []
    with open(inputFileName, 'r') as inputFile:
        csvFile = csv.reader(inputFile)
        for line in csvFile:
            feature = Features(line[0], line[1])
            featureObjects.append(feature)
    return featureObjects
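# A minimal usage sketch, assuming each CSV row holds a feature name and a
# value, e.g. a file containing the line "area,12.5":
#
#   featureObjects = parseInputFile('features.csv')
#   # featureObjects[0] would be Features('area', '12.5')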
def make_details(data, test_months, n_best, n_worst):
    'return a ColumnsTable and a list of extra-info rows'
    extra_info = []
    feature_names = Features().ege_names(control.arg.features)
    columns_table = ColumnsTable((
        ('test_month', 6, '%6s', ('test', 'month'), 'test month'),
        ('nth', 2, '%2d', (' ', 'n'), 'rank of feature (1 ==> most frequently included)'),
        ('probability', 4, '%4.1f', (' ', 'prob'), 'probability feature appears in a decision tree'),
        ('feature_name', 40, '%40s', (' ', 'feature name'), 'name of feature'),
        ),
        verbose=True)
    for test_month in test_months:
        value = data[ReductionKey(test_month)]
        if 'feature_importances' not in value.importances:
            # one month has an ensemble model; skip that month
            print 'chart a sees an unexpected ensemble model'
            print 'test_month', test_month
            print 'value', value
            print 'value.importances', value.importances
            print 'skipping the test month'
            print 'entering debugger'
            pdb.set_trace()
        importances = value.importances['feature_importances']
        assert value.importances['features_group'] == control.arg.features, value
        model = value.model
        assert type(model) == ResultKeyGbr or type(model) == ResultKeyRfr
        sorted_indices = importances.argsort()  # sorted lowest first, highest last
        for nth_best in xrange(n_best):
            if nth_best == len(feature_names):
                break
            index = sorted_indices[len(importances) - nth_best - 1]
            columns_table.append_detail(
                test_month=test_month,
                nth=nth_best + 1,
                probability=importances[index] * 100.0,
                feature_name=feature_names[index]
            )
            extra_info.append([test_month, nth_best + 1, importances[index] * 100.0, feature_names[index]])
        for nth in xrange(n_worst):
            break  # skip the worst features, for now
            if nth == len(feature_names):
                break
            nth_worst = n_worst - nth - 1
            index = sorted_indices[nth_worst]
            columns_table.append_detail(
                test_month=test_month,
                nth=len(importances) - nth_worst,
                probability=importances[index] * 100.0,
                feature_name=feature_names[index]
            )
        if n_best > 1 or n_worst > 1:
            # insert a blank line between test_months if more than 1 row per month
            columns_table.append_detail()
    columns_table.append_legend()
    return columns_table, extra_info
def gettor(cls, coordinate):
    """
    :param coordinate (Coordinate): coordinate of the point in Cartesian space
    :return (Point): TOR point that corresponds to the coordinate provided
        (see more in the settings documentation)
    """
    color = cls.FILLCOLORTOR
    features = Features(cls.SIZETOR, color, cls.BORDERCOLORTOR)
    res = Point(coordinate, features)
    return res
def classify(args):
    out = pickle.load(open(args.m, 'rb'))
    params = out[0]
    dict_rev = out[1]
    # Odia models use the fastText Odia embeddings; everything else uses GloVe
    if args.m in ("odia", "odia.torch"):
        mydata = Features(350, 'unk-odia.vec', "fasttext.wiki.300d.vec", args.i)
    else:
        mydata = Features(350, 'unk.vec', "glove.6B.50d.txt", args.i)
    model = NNComp(20, 0.01, 32, 40, 15000, 4)
    out, _ = model.forward(mydata.final_data, params)
    labels = np.argmax(out, axis=1)
    preds = np.array([dict_rev.get(str(k)) for k in labels])
    with open(args.o, "w") as file:
        for pred in preds:
            file.write(pred + "\n")
def getbad(cls, coordinate):
    """
    :param coordinate (Coordinate): coordinate of the point in Cartesian space
    :return (Point): BAD point that corresponds to the coordinate provided
        (see more in the settings documentation)
    """
    color = cls.getrgbcolor(coordinate.z)
    color.append(cls.OPACITYBAD)
    features = Features(cls.SIZEBAD, color, cls.BORDERCOLORBAD)
    res = Point(coordinate, features)
    return res
def do_work(control):
    'write predictions to output csv file'
    samples = pd.read_csv(
        control.path_in_samples,
        nrows=10 if control.arg.test else None,
        usecols=None,  # TODO: change to columns we actually use
        low_memory=False,
    )
    apns = samples[layout_transactions.apn]
    sale_dates = samples[layout_transactions.sale_date]
    print 'read %d rows of samples from file %s' % (len(samples), control.path_in_samples)
    # iterate over the fitted models
    hps_predictions = {}
    for root, dirnames, filenames in os.walk(control.path_in_fitted):
        assert len(dirnames) == 0, dirnames
        print root, len(filenames)
        for filename in filenames:
            suffix_we_process = '.pickle'
            if not filename.endswith(suffix_we_process):
                print 'skipping file without a fitted model: %s' % filename
                continue
            hps_string = filename[:-len(suffix_we_process)]
            hps = HPs.from_str(hps_string)
            path_to_file = os.path.join(root, filename)
            with open(path_to_file, 'r') as f:
                ok, fitted_model = pickle.load(f)
            if ok:
                print 'predicting samples using fitted model %s' % filename
                X, y = Features().extract_and_transform(
                    samples, hps['units_X'], hps['units_y'])
                predictions = fitted_model.predict(X)
                assert len(predictions) == len(samples)
                assert hps_string not in hps_predictions
                hps_predictions[hps_string] = predictions
            else:
                print 'could not predict samples using fitted model %s; reason: %s' % (
                    filename,
                    fitted_model,  # an error message
                )
    # have all the predictions for all filenames (= a set of hyperparameters)
    print 'walked all %d files' % len(filenames)
    out = {
        'apns': apns,
        'sale_dates': sale_dates,
        'hps_predictions': hps_predictions,
    }
    with open(control.path_out_file, 'w') as f:
        pickle.dump(out, f)
    print 'wrote results to %s' % control.path_out_file
    return
def train(args):
    if args.i == "datasets/odia.train.txt":
        mydata = Features(args.f, 'unk-odia.vec', args.E, args.i, "Train")
        dim = 300
    else:
        mydata = Features(args.f, 'unk.vec', args.E, args.i, "Train")
        dim = 50
    model = NNComp(args.u, args.l, args.b, args.e, args.f * dim, len(mydata.labelname))
    param, Train_cost, Test_cost = model.run(mydata.final_data, mydata.lables_number)
    # plot training loss in blue and validation loss in dashed red
    plt.plot(Train_cost, 'b', Test_cost, 'r--')
    plt.ylabel('Loss')
    plt.xlabel('epochs')
    # plt.xticks([0, 5, 10, 15, 20, 25])
    plt.suptitle('Train/validation loss')
    red_patch = mpatches.Patch(color='red', label='Validation')
    blue_patch = mpatches.Patch(color='blue', label='Train')
    plt.legend(handles=[red_patch, blue_patch])
    plt.show()
    out = (param, mydata.labeldict_rev)
def classify(args):
    out = pickle.load(open(args.m, 'rb'))
    dict_rev = out[5]
    model = out[0]
    # Odia models use the fastText Odia embeddings; everything else uses GloVe
    if args.m in ("odia", "odia.torch"):
        mydata = Features(out[2], 'unk-odia.vec', "fasttext.wiki.300d.vec", args.i)
    else:
        mydata = Features(out[2], 'unk.vec', "glove.6B.50d.txt", args.i)
    model.eval()
    out = model.forward(torch.from_numpy(mydata.final_data).float())
    labels = np.argmax(out.detach().numpy(), axis=1)
    preds = np.array([dict_rev.get(k) for k in labels])
    with open(args.o, "w") as file:
        for pred in preds:
            file.write(pred + "\n")
def make_mean_importance_by_feature(test_months):
    'return dict[feature_name] = float, the mean importance of the feature'
    feature_names = Features().ege_names(control.arg.features)
    mean_importance = {}  # key = feature_name
    for feature_index, feature_name in enumerate(feature_names):
        # build the vector of importances of feature_name across the test months
        feature_importances = np.zeros(len(test_months))
        for month_index, test_month in enumerate(test_months):
            month_importances = data[ReductionKey(test_month)]
            # guard before accessing, so a KeyError is not raised first
            if 'feature_importances' not in month_importances.importances:
                print 'chart b sees an unexpected ensemble model'
                print 'test_month', test_month
                print 'month_importances', month_importances
                print 'entering debugger'
                pdb.set_trace()
            all_feature_importances = month_importances.importances['feature_importances']
            feature_importances[month_index] = all_feature_importances[feature_index]
        mean_importance[feature_name] = np.mean(feature_importances)
    return mean_importance
def resultOfFeaturesInWeigths(self, sentence):
    # score every possible label arrangement and return the index of the best one
    arranges = list(
        itertools.permutations(self.possibleLabels, len(sentence.words) + 1))
    probabilityOfArrange = []
    for (indexTop, arrange) in enumerate(arranges):
        sumOfFeatures = 0
        for (index, feature) in enumerate(self.featuresFunctions):
            for position in range(0, len(sentence.labels)):
                current = arrange[position]
                previous = arrange[position - 1]
                response = Features.verify(feature, current, previous)
                sumOfFeatures += self.weights[index] * response
        probabilityOfArrange.append(sumOfFeatures)
    bestIndices = np.argpartition(probabilityOfArrange, -1)[-1:]
    return bestIndices[0]
def scale_features_old(features):
    """
    Scales all feature vectors in features.

    :param features:
    :type features: dict
    :return: scaled features
    :rtype: dict
    """
    scaled_features = OrderedDict()
    for img_dir in features:
        scaled_features[img_dir] = dict()
        for feature_name in features[img_dir]:
            if Features.is_single_val_feature(feature_name):
                # single values are passed through unscaled
                scaled_features[img_dir][feature_name] = features[img_dir][feature_name]
            else:
                scaled_features[img_dir][feature_name] = preprocessing.scale(
                    features[img_dir][feature_name])
                # scaled_features[img_dir][feature_name] = preprocessing.minmax_scale(features[img_dir][feature_name])
    return scaled_features
def gen_feature_matrices_per_feature(features):
    """
    Generates a feature matrix for each feature, which can be used to train an SVM.

    :param features:
    :type features: dict
    :return: feature_matrices: (feature_name, matrix)
    :rtype: dict
    """
    feature_matrices = dict()
    for img_dir in features:
        for feature_name in features[img_dir]:
            if Features.is_single_val_feature(feature_name):
                vector = np.asscalar(features[img_dir][feature_name])
            else:
                vector = features[img_dir][feature_name].tolist()
            # add the vector as a new row of this feature's matrix
            if feature_name not in feature_matrices:
                feature_matrices[feature_name] = list()
            feature_matrices[feature_name].append(vector)
    return feature_matrices
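# A minimal usage sketch with hypothetical data, assuming 'color_hist' is not
# a single-value feature:
#
#   features = {'img0': {'color_hist': np.array([0.1, 0.9])},
#               'img1': {'color_hist': np.array([0.4, 0.6])}}
#   gen_feature_matrices_per_feature(features)
#   # -> {'color_hist': [[0.1, 0.9], [0.4, 0.6]]}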
def gen_final_feature_matrix_old(features):
    """
    Generates the final feature matrix which can be used to train an SVM.

    :param features:
    :type features: dict
    :return: final feature matrix
    :rtype: np.array
    """
    final_feature_mat = []
    for img_dir in features:
        # concatenate all of this image's features into one row vector
        final_feature_vec = []
        for feature_name in features[img_dir]:
            if Features.is_single_val_feature(feature_name):
                final_feature_vec.append(
                    np.asscalar(features[img_dir][feature_name]))
            else:
                final_feature_vec.extend(
                    features[img_dir][feature_name].tolist())
        final_feature_mat.append(final_feature_vec)
    return np.asarray(final_feature_mat)
def choose_feature(self, list_of_methods, X, Y, columns, iteration):
    features = []
    for method in list_of_methods:
        print(method)
        # dispatch on the method name; unknown names are silently ignored
        {
            "sfm_lr": lambda: features.append(Features.select_features_select_from_model_LR(X, Y, columns, iteration).tolist()),
            "sfm_linearsvc": lambda: features.append(Features.select_features_select_from_model_linearsvc(X, Y, columns, iteration).tolist()),
            "sfm_rfc": lambda: features.append(Features.select_features_select_from_model_RandomForest(X, Y, columns, iteration).tolist()),
            "sfm_lasso": lambda: features.append(Features.select_features_select_from_model_lasso(X, Y, columns, iteration).tolist()),  # last sfm
            "rle_lr": lambda: features.append(Features.select_features_RFE_LR(X, Y, columns, iteration).tolist()),
            "rle_linearsvc": lambda: features.append(Features.select_features_RFE_linearsvc(X, Y, columns, iteration).tolist()),
            "rle_rfc": lambda: features.append(Features.select_features_RFE_RandomForest(X, Y, columns, iteration).tolist()),
            "rle_lasso": lambda: features.append(Features.select_features_RFE_lasso(X, Y, columns, iteration).tolist()),  # last rle
            "permutation_lr": lambda: features.append(Features.select_features_permutation_LR(X, Y, columns, iteration).tolist()),
            "permutation_linearsvc": lambda: features.append(Features.select_features_permutation_linearsvc(X, Y, columns, iteration).tolist()),
            "permutation_rfc": lambda: features.append(Features.select_features_permutation_RandomForest(X, Y, columns, iteration).tolist()),
            "permutation_lasso": lambda: features.append(Features.select_features_permutation_lasso(X, Y, columns, iteration).tolist()),
        }.get(method, lambda: None)()
    flatten = [val for sublist in features for val in sublist]  # flatten the list of lists
    features = list(dict.fromkeys(flatten))  # delete duplicates, preserving order
    return features
def do_work(control):
    'write fitted models to file system'

    def make_transaction_ids(df):
        'return dates and apns for the query samples'
        result = []
        for index, row in df.iterrows():
            next = TransactionId(
                sale_date=row[layout_transactions.sale_date],
                apn=row[layout_transactions.apn],
            )
            result.append(next)
        return result

    def read_csv(path):
        df = pd.read_csv(
            path,
            nrows=100 if control.arg.test else None,
            usecols=None,  # TODO: change to columns we actually use
            low_memory=False
        )
        print 'read %d samples from file %s' % (len(df), path)
        return df

    def in_prediction_month(query_samples, prediction_YYYYMM):
        'return DataFrame of samples in the month we are predicting'
        def splitYYYYMMDD(dates):
            year_factor = 10000.0
            years = (dates / year_factor).astype('int64')
            month_factor = 100.0
            months = ((dates - years * year_factor) / month_factor).astype('int64')
            return years, months

        def splitYYYYMM(date_str):
            date = int(date_str)
            year_factor = 100.0
            year = int(date / year_factor)
            month = int(date - year * year_factor)
            return year, month

        sale_dates = query_samples[layout_transactions.sale_date]
        query_years, query_months = splitYYYYMMDD(sale_dates)
        prediction_year, prediction_month = splitYYYYMM(prediction_YYYYMM)
        mask_year = query_years == prediction_year
        mask_month = query_months == prediction_month
        mask = mask_year & mask_month
        result = query_samples.loc[mask]
        return result

    # reduce process priority, to try to keep the system responsive
    lower_priority()
    with open(control.path_out_feature_names, 'w') as f:
        feature_names = Features().ege_names('swpn')
        pickle.dump(feature_names, f)
    training_samples = read_csv(control.path_in_training_samples)
    query_samples_all = read_csv(control.path_in_query_samples)
    query_samples = in_prediction_month(query_samples_all, control.arg.prediction_month)
    print 'read %s query samples of which %d are in the prediction month %s' % (
        len(query_samples_all),
        len(query_samples),
        control.arg.prediction_month,
    )
    with open(control.path_out_transaction_ids, 'w') as f:
        transaction_ids = make_transaction_ids(query_samples)
        pickle.dump(transaction_ids, f)
    with open(control.path_out_actuals, 'w') as f:
        X, actuals = Features().extract_and_transform(query_samples, 'natural', 'natural')
        pickle.dump(actuals, f)
    count_fitted = 0
    n_hps = make_n_hps(control.arg.model)
    # determine hps we have already fitted and predicted
    already_seen = set()
    if os.path.exists(control.path_out_predictions_attributes):
        with open(control.path_out_predictions_attributes, 'r') as f:
            unpickler = pickle.Unpickler(f)
            try:
                while True:
                    hps_str, predictions, fitted_attributes = unpickler.load()
                    print 'existing', hps_str
                    already_seen.add(hps_str)
            except EOFError as e:
                pass
        print 'have already seen %d hps_str values' % len(already_seen)
    # fit and predict HPs that we have not already seen
    with open(control.path_out_predictions_attributes, 'w') as results_file:
        pickler = pickle.Pickler(results_file)
        for hps in HPs.iter_hps_model(control.arg.model):
            count_fitted += 1
            start_time = time.clock()  # wall clock time on Windows, processor time on Unix
            hps_str = HPs.to_str(hps)
            if hps_str in already_seen:
                print 'skipping already seen: %s' % hps_str
                continue
            try:
                predictions, fitted_attributes, n_training_samples = fit_and_predict(
                    training_samples,
                    query_samples,
                    hps,
                    control,
                )
                pickler.dump((hps_str, predictions, fitted_attributes))
                pickler.clear_memo()  # don't build up a large data structure
                print 'fit-predict #%4d/%4d on:%6d in: %6.2f %s %s %s %s hps: %s' % (
                    count_fitted,
                    n_hps,
                    n_training_samples,
                    time.clock() - start_time,
                    control.arg.training_data,
                    control.arg.neighborhood,
                    control.arg.model,
                    control.arg.prediction_month,
                    hps_str,
                )
            except Exception as e:
                print 'exception: %s' % e
                pdb.set_trace()
                pickler.dump((hps_str, e))
            # collect to keep memory usage stable, so that we can run this program many times in parallel
            gc.collect()
            if control.arg.test and count_fitted == 5:
                print 'breaking because we are testing'
                break
def X_y(df):
    return Features().extract_and_transform(df, hps['units_X'], hps['units_y'])
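# A minimal usage sketch; hps is assumed to be in the enclosing scope, e.g.
# hps = {'units_X': 'natural', 'units_y': 'natural'} as elsewhere in this code:
#
#   X, y = X_y(samples)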
# if BASIC_MODEL = False, then the complex model will be created
BASIC_MODEL = True
# if TRAIN_WITH_MST = False, then the perceptron will train with the greedy method, which is faster
TRAIN_WITH_MST = True

max_accuracy = 0.83
if BASIC_MODEL:
    max_accuracy = 0.74

# load the train file
train_words, train_pos, train_heads = read_file_and_preprocess('train.labeled', include_y=True)
# divide the word lists into sentence lists
sent_word_list, sent_pos_list, sent_head_list = create_sentences_from_word_lists(
    train_words, train_pos, train_heads)

if BASIC_MODEL:
    # create the features instance for the basic model
    featurs_basic_obj = Features(train_words, train_pos, train_heads,
                                 features_to_include_list=[1, 2, 3, 4, 5, 6, 8, 10, 13])
else:
    # create the features instance for the complex model
    featurs_basic_obj = Features(train_words, train_pos, train_heads,
                                 features_to_include_list='ALL')

# init the weight vector
basic_feature_weights_vec = np.zeros(featurs_basic_obj.feature_wieghts_len, dtype=np.float64)

# create each sentence's full graph and assign the relevant feature list to each edge;
# also calc the feature vector for each empiric observation, for optimization
sent_graph_list = []
sent_real_feat_idx = []
sent_graph_edges_feats = []
for m in range(len(sent_word_list)):
    # full graph
    sent_graph_list.append(build_sentence_full_graph(len(sent_word_list[m])))
import argparse

from Features import Features

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--db_path', type=str, default='images/',
                        help='Path of the image database')
    args = parser.parse_args()
    dbpath = args.db_path

    # lists that collect the extracted features
    feat = []
    base_feat = []

    features = Features()
    features.input_img(dbpath, feat, base_feat)
    features.compute_codebook(feat)
    features.compute_bow(feat)
    features.compute_tfidf()
    features.compute_baseline(base_feat)