def main():
    experiment_set = final_experiment
    print("There are {} experiments to run".format(len(experiment_set)))

    train_data_path = "data/training.dat"
    dev_data_path = "data/full/dev.dat"
    tst_data_path = "data/full/evaluation.dat"
    feats_path = "data/model.features"
    num_feats = len([line for line in open(feats_path)])
    batch_size = 80
    runs_per_experiment = 5

    for experiment_name in experiment_set.keys():
        logger.info("Running experiment {}".format(experiment_name))
        exp_features = experiment_set[experiment_name]
        out_path = 'output/experiments_v3/{}'.format(experiment_name)
        makedirs(out_path, exist_ok=True)

        train_instances = load_data(train_data_path, num_feats, exp_features)
        dev_instances = load_data(dev_data_path, num_feats, exp_features)
        dev_eval_instances = load_eval_data(dev_data_path, num_feats,
                                            exp_features)
        tst_instances = load_eval_data(tst_data_path, num_feats, exp_features)
        logger.info("Loaded {} training instances with {} features".format(
            len(train_instances), num_feats))

        for i in range(runs_per_experiment):
            iter_path = out_path + '/v{}'.format(i)
            makedirs(iter_path, exist_ok=True)
            ranker = Ranker(num_feats, 256)
            trainer = RankerTrainer(ranker, batch_size, iter_path)
            trainer.train(train_instances, dev_instances, None,
                          dev_eval_instances, tst_instances)
def main(args):
    torch.manual_seed(333)
    if use_cuda:
        torch.cuda.manual_seed(333)
    random.seed(333)

    train_data_path = "data/training.dat"
    train_eval_data_path = "data/train-eval.dat"
    dev_data_path = "data/full/dev.dat"
    eval_data_path = "data/full/evaluation.dat"
    feats_path = "data/model.features"
    num_feats = len([line for line in open(feats_path)])
    batch_size = 80
    ranker = Ranker(num_feats, 256)

    # Instances for training - loaded as pairs.
    feat_indices = set(range(num_feats))
    train_instances = load_data(train_data_path, num_feats, feat_indices)
    train_eval_instances = load_eval_data(train_data_path, num_feats,
                                          feat_indices)
    dev_instances = load_data(dev_data_path, num_feats, feat_indices)
    dev_eval_instances = load_eval_data(dev_data_path, num_feats,
                                        feat_indices)
    tst_instances = load_eval_data(eval_data_path, num_feats, feat_indices)
    logger.info("Loaded {} training instances with {} features".format(
        len(train_instances), num_feats))

    trainer = RankerTrainer(ranker, batch_size, 'output/')
    trainer.train(train_instances, dev_instances, train_eval_instances,
                  dev_eval_instances, tst_instances)
    ranker.save('output/ranker.model')
def main(): """ The main function """ es = Elasticsearch() ic = IndicesClient(es) dl.create_wikipedia_index(ic) dl.load_data(es) print("The top ranked title without synonym:", search_and_rank(es)) add_synonyms_to_index(ic) print("The top ranked title with synonym:", search_and_rank(es))
def main(): """ The main function """ es = Elasticsearch() ic = IndicesClient(es) dl.create_wikipedia_index(ic) dl.load_data(es) print( f"There are {filter(es)['hits']['total']['value']} documents contains 'lake' or 'tour'" ) print( f"There are {search_without_improvement(es)['hits']['total']['value']} documents contains" " 'lake' or 'tour', but without the 'improvement required' sentense.")
def main(models, dataset_paths, best_grids, model_fitting_parameters):
    trained_model_list = list()
    for model, dset_path, grid, fitting_parameters in zip(
            models, dataset_paths, best_grids, model_fitting_parameters):
        # Load the data set.
        dataframe = data_loading.load_data(dset_path,
                                           constants.OUTPUT_DATA_PROC_PATH)
        # Split into training and test sets.
        x_train, x_test, y_train, y_test = data_loading.train_test_split(
            dataframe)
        # Load and train models.
        trained_model = load_trained_model(model, constants.OUTPUT_MODEL_PATH)
        if trained_model is None:
            # If no saved model was loaded, train and save one.
            trained_model = model
            grid.pop('scores')
            trained_model.set_params(**grid)
            trained_model.fit(x_train, y_train, **fitting_parameters)
            save_trained_model(trained_model, constants.OUTPUT_MODEL_PATH)
        trained_model_list.append(trained_model)
        # Report model results.
        report_models_results(x_train, x_test, y_train, y_test, model)
def get_trained_model():
    """Return a trained model, training it first if no pre-trained model exists."""
    model = tflearn.DNN(build_resnet(),
                        tensorboard_verbose=0,
                        tensorboard_dir='tensorboard')
    if os.path.exists(MODEL_FILE_PATH):
        print('-' * 80)
        print('Pretrained model was found.')
        model.load(os.path.splitext(MODEL_FILE_PATH)[0])
    else:
        print('-' * 80)
        print('Pretrained model was not found. Starting training:')
        images_train, labels_train, images_test, labels_test = \
            data_loading.load_data()
        model.fit(images_train,
                  labels_train,
                  n_epoch=10,
                  validation_set=(images_test, labels_test),
                  snapshot_step=100,
                  show_metric=True,
                  run_id='convnet_hand_recognition')
        model.save(os.path.splitext(MODEL_FILE_PATH)[0])
    return model
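# Hypothetical usage sketch: tflearn's DNN.predict expects a batch, so a
# single image is wrapped in a list. The input shape below is an assumption;
# the real one depends on build_resnet().
import numpy as np

model = get_trained_model()
dummy_image = np.zeros((64, 64, 1))  # placeholder input, shape assumed
class_probabilities = model.predict([dummy_image])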
def main():
    main_params = load_main_params()
    loop_features, loop_targets, loop_info, feature_names = load_data(
        main_params["read_data_prefixes"], False)

    # Pre-processing data analysis.
    analyze_data(loop_features.copy(), loop_targets.copy(),
                 feature_names.copy(), main_params["data_analysis"], True,
                 None)

    if main_params["create_models"]["regression"]["enabled"]:
        create_regression_models(loop_features.copy(), loop_targets.copy(),
                                 feature_names.copy(), main_params)
    if main_params["create_models"]["classification"]["enabled"]:
        create_classification_models(loop_features.copy(),
                                     loop_targets.copy(),
                                     feature_names.copy(), main_params)
    if main_params["make_prediction"]["enabled"]:
        loop_features, loop_targets, loop_info, feature_names = load_data(
            main_params["read_data_prefixes"], True)
        make_prediction_from_model(loop_features.copy(), loop_targets.copy(),
                                   main_params["make_prediction"],
                                   loop_info.copy())
    exit(0)
def get_model_and_df_grid_combinations(models, grids):
    all_grids_results = list()
    # Iterate over data sets.
    for path in data_processing.PROCESSED_DATASETS_PATH:
        df = data_loading.load_data(path, constants.OUTPUT_DATA_PROC_PATH)
        pgo = sku.grid_search.PersistentGrid.load_from_path(
            persistent_grid_path=constants.PERSITENT_GRID_PATH,
            dataset_path=path)
        # Iterate over models.
        for grid, model in zip(grids, models):
            best_grid = get_best_grid(df, model, grid, pgo).copy()
            best_grid['model'] = sku.get_estimator_name(model)
            best_grid['path'] = path
            all_grids_results.append(best_grid.copy())
    return all_grids_results
def search_results(user_dep, user_dest, user_time_dep, user_passengers,
                   postgres=False, redis=False):
    redis_db = None
    pg_conn = None
    if redis:
        redis_db = StrictRedis(socket_connect_timeout=3, **redis_config)
    elif postgres:
        pg_conn = psycopg2.connect(**pg_config)
    return load_data(user_dep, user_dest, user_time_dep, user_passengers,
                     redis_db, pg_conn)
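# Hypothetical usage sketch; the argument values are illustrative only. With
# neither flag set, load_data receives None for both connections and is
# presumably expected to fall back to another data source.
results = search_results('Warsaw', 'Krakow', '2020-05-01 08:00', 2, redis=True)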
def __init__(self, dataset, supergroups, size, epochs, learning_rate):
    self.dataset = dataset
    self.data_name = dataset
    self.supergroups = supergroups
    self.size = size
    self.epochs = epochs
    self.learning_rate = learning_rate
    self.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")
    print(self.device)
    self.model = Model(self.dataset, self.size,
                       supergroups=self.supergroups)
    self.model.to(self.device)
    # self.model.load_state_dict(torch.load("./agnews_40,0.0001.pt",
    #                                       map_location=torch.device('cpu')))
    if self.supergroups:
        trans = 'supergroups'
    else:
        trans = None
    self.train_loader, self.test_loader = load_data(dataset=self.dataset,
                                                    transformation=trans)
def main():
    # Set the random seed for the entire project.
    du.set_random_seed(0)
    # Rationale: ensure reproducibility of the results.

    # Flush previous runs.
    constants.flush_project_results(constants.TMP_PATH, constants.OUTPUT_PATH)
    # Rationale: provide a clean state for the project to run and enforce
    # reproducibility of the results.

    # Load, save and split data.
    dataframe = data_loading.load_data(constants.DATA_PATH)
    data_loading.save_data(dataframe, constants.TMP_PATH)
    x_train, x_test, y_train, y_test = data_loading.train_test_split(
        dataframe)
    # Rationale: *Loading*: load data in the main module and pass it as a
    # first argument to every other defined function (that relates to the
    # data set), thus saving precious time with data loading. *Saving*: for
    # big data sets, saving the data set in a fast-read format (such as HDF5)
    # saves time.

    # Load and combine data processing pipelines.
    # TODO:
    data_processing_pipelines = None

    # Perform exploratory data analyses.
    data_exploration.main(dataframe)
    # Rationale: conduct exploratory data analyses.

    # Perform grid search.
    persistent_grid_object = sku.grid_search.PersistentGrid.load_from_path(
        persistent_grid_path=constants.PERSITENT_GRID_PATH,
        dataset_path=constants.DATA_PATH)
    # Iteration over processed data sets may occur here since they are model
    # dependent.
    grid_search.main(dataframe, constants.MODELS, data_processing_pipelines,
                     constants.GRIDS, persistent_grid_object)
    best_grids = grid_search.get_best_grids(  # noqa
        constants.MODELS, data_processing_pipelines, persistent_grid_object)
DATA_PATH = './szwagropol_data/transactions.txt'
# The variable that holds the path to the data. Writing a variable name
# entirely in capital letters is a widely used convention for marking
# constants (variables whose value should not change while the program runs).

TRANSACTION_TIME_INDEX = 0
CUSTOMER_INDEX = 1
PRODUCT_NAME_INDEX = 2
CATEGORY_NAME_INDEX = 3
QUANTITY_INDEX = 4
UNIT_PRICE_INDEX = 5
TOTAL_VALUE_INDEX = 6
# Variables (constants) that record the index at which each piece of
# information is stored in a data row - improving code readability along
# the way.

columns, rows = load_data(DATA_PATH)
# Read the list of columns and the list of rows from the file, using the
# previously written and imported load_data function.

columns.append('total_transaction_values')
# Add an extra column that will hold the total value of each row. Initially a
# row contains the number of units bought and the price per unit; by storing
# their product in an extra column we avoid multiplying these values
# repeatedly.

for row in rows:  # for each of the loaded rows...
    total = row[QUANTITY_INDEX] * row[UNIT_PRICE_INDEX]
    # ...compute the total value of the transaction (units * price per unit)...
    row.append(total)
    # ...and append the result at the end of the row.


def calculate_total_revenue(rows):
    # A function that computes the total revenue over the given transaction
    # rows.
    total_revenue = 0  # initialise the variable that will hold the total revenue
    for row in rows:  # iterate over each of the given rows
        total_revenue += row[TOTAL_VALUE_INDEX]
        # add the value of the transaction we are currently iterating over to
        # the helper variable holding the total revenue
    return total_revenue
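# Usage sketch (not part of the original script): print the total revenue
# over all loaded transaction rows.
print(calculate_total_revenue(rows))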
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.backend import clear_session
from sklearn.model_selection import KFold
import numpy as np

from model import create_model, create_encoder
from data_loading import load_data, clean_sentence
from plot_results import HistoriesStorage, plot_model_histories
from hyperparameters import *

tweets_data = load_data('Data_tweets.csv')
tweets_data = tweets_data[[1, 6]]
tweets_data.columns = ['Class', 'Tweet']


def substitute_classes(x):
    """Maps classes (0, 2, 4) to indexes (0, 1, 2)."""
    sub_dict = {0: 0, 2: 1, 4: 2}
    return sub_dict[x]


# Clean tweets.
inputs = np.array(tweets_data['Tweet'].apply(lambda x: clean_sentence(x)))

# One-hot-encode classes.
targets = tweets_data["Class"].apply(lambda x: substitute_classes(x))
targets = to_categorical(targets, num_classes=3)

# Fit the encoder on the input data - create a vocabulary of the NUM_WORDS
# most frequent words.
encoder = create_encoder(inputs)
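# Quick worked example of the label encoding defined above (illustrative,
# not part of the original script): class 4 maps to index 2, which one-hot
# encodes to [0., 0., 1.].
print(to_categorical([substitute_classes(4)], num_classes=3))  # -> [[0. 0. 1.]]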
def main():
    # Filter warnings that pollute the project's stdout.
    filter_warnings()
    # Rationale: produce cleaner results.

    # Set the random seed for the entire project.
    du.set_random_seed(0)
    # Rationale: ensure reproducibility of the results.

    # Flush previous runs.
    # constants.flush_project_results(constants.TMP_PATH,
    #                                 constants.OUTPUT_PATH)
    # Rationale: provide a clean state for the project to run and enforce
    # reproducibility of the results.

    # Download, load and save data.
    data_loading.main()
    dataframe = data_loading.load_data(constants.DATASET_PATH,
                                       constants.TMP_PATH)
    data_loading.save_data(dataframe, constants.TMP_PATH,
                           constants.DATASET_PATH)
    # Rationale: *Loading*: load data in the main module and pass it as a
    # first argument to every other defined function (that relates to the
    # data set), thus saving precious time with data loading. *Saving*: for
    # big data sets, saving the data set in a fast-read format (such as HDF5)
    # saves time.

    # Load and combine data processing pipelines.
    data_processing.main(dataframe, nan_strategy='drop')
    # Rationale: prepare data to be fed into the models. Different algorithms
    # make use of different data structures. For instance, XGBoost allows for
    # nans; data transformations usually don't.

    # Perform exploratory data analyses.
    data_exploration.main(dataframe)
    # Rationale: conduct exploratory data analyses.

    # Data split: removed.
    # Rationale: module 'models' should execute this.

    # Perform grid search.
    # Iteration over processed data sets may occur here since they are model
    # dependent.
    grid_search.main(constants.MODELS, constants.GRIDS)
    best_combination_of_datasets_and_grids = (
        grid_search.dict_of_best_datasets_and_grids(constants.MODELS,
                                                    constants.GRIDS))
    best_datasets = best_combination_of_datasets_and_grids['best_datasets']
    best_grids = best_combination_of_datasets_and_grids['best_grids']
    # Rationale: perform grid search as part of machine learning best
    # practices.

    # Summary of what was executed so far:
    # 1) Setting of the random seed for reproducibility.
    # 2) Flushing of intermediate results for a clean run.
    # 3) Data loading and data saving.
    # 4) Conduction of exploratory data analyses.
    # 5) Grid search of the best model hyperparameters.

    # To conclude the project we need the grand finale: model selection and
    # evaluation/comparison.
    models.main(constants.MODELS, best_datasets, best_grids,
                constants.MODEL_FITTING_PARAMETERS)
# All file strings corresponding to BOLD data for subject 4.
files = ['task001_run001.bold_dico.nii',
         'task001_run002.bold_dico.nii',
         'task001_run003.bold_dico.nii',
         'task001_run004.bold_dico.nii',
         'task001_run005.bold_dico.nii',
         'task001_run006.bold.nii',
         'task001_run007.bold.nii',
         'task001_run008.bold.nii']

#
# Load the images as image objects.
# Load all the image data from the images.
# Drop the first four volumes, as we know these are outliers.
#
all_data = []
for index, filename in enumerate(files):
    new_data = dl.load_data(filename)  # load_data drops the first 4 volumes for us
    num_vols = new_data.shape[-1]
    if index != 0 and index != 7:
        new_num_vols = num_vols - 4
        new_data = new_data[:, :, :, :new_num_vols]  # drop the last 4 volumes of the middle runs
    all_data.append(new_data)

# * Get indices of outlier volumes for each data set.
# * Write each as its own file and save it in the 'vol_std_outliers' folder.
# * Takes 15 minutes to run.
all_bands_outliers = []
all_sdevs = []
all_iqr_outliers = []
for data in all_data:
import sys
import logging
from collections import defaultdict

from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer

from anoflows.hpo import find_best_flows
from data_loading import load_data

logging.getLogger().setLevel(logging.INFO)

if len(sys.argv) == 1:
    logging.error("YAML data specification missing from the command line arguments")
    exit(1)

spec_file = sys.argv[1]
df, spec = load_data(spec_file)
max_rows = min(len(df), spec.get("max_rows", 40000))
novelty_detection = spec.get("novelty", True)
normal_classes = spec["normal_classes"]

precision = defaultdict(list)
for rounds in range(spec.get("rounds", 1)):
    # Random sampling.
    df = df.sample(n=max_rows, replace=False)
    label_col = spec["label_column"]
    y = df[label_col].values
    other = df.drop(label_col, inplace=False, axis=1)
    X = other.values
    # Imputing.
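    # A hedged guess at the elided imputation step (truncated in the source),
    # reusing the SimpleImputer imported above; the 'median' strategy is an
    # assumption.
    X = SimpleImputer(strategy='median').fit_transform(X)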
import torch
import torch.nn as nn
import torch.optim as optim

from model import Model
from data_loading import load_data

data = input('Type Dataset Choice, AGNews or 20Newsground: ')
size = input('Pick model size, big or small: ')

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
print('device used: ', device)

model = Model(data, size)
model.to(device)

# Define train/test loaders here.
train_loader, test_loader = load_data(dataset=data)

loss_crit = nn.CrossEntropyLoss()
# optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

print("Starting training...")
for epoch in range(5):
    running_loss = 0.0
    for i, batch in enumerate(train_loader, 0):
        inputs, labels = batch
        inputs, labels = torch.tensor(inputs), torch.tensor(labels)
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_crit(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print('epoch {}: average loss {:.4f}'.format(
        epoch, running_loss / len(train_loader)))
    tsne_df = _tsne(norm_df, 3)
    joined_df = pd.concat((norm_df, tsne_df, y), axis=1)
    assert norm_df.shape[0] == tsne_df.shape[0] == joined_df.shape[0]
    data_loading.save_data(joined_df, constants.OUTPUT_DATA_PROC_PATH,
                           DATA_TSNE3)


def no_transform(dataframe):
    if data_loading.dataframe_already_exists(constants.OUTPUT_DATA_PROC_PATH,
                                             DATA_VANILLA):
        return None
    data_loading.save_data(dataframe, constants.OUTPUT_DATA_PROC_PATH,
                           DATA_VANILLA)


def main(dataframe, nan_strategy='drop'):
    df = _process_nan(dataframe, how=nan_strategy)
    x = df[data_loading.get_x_columns(df)]
    y = df[constants.Y_COLUMN]
    norm_pca2(x, y)
    norm_pca3(x, y)
    norm_tsne2(x, y)
    norm_tsne3(x, y)
    no_transform(df)


if __name__ == '__main__':
    dataframe = data_loading.load_data(constants.DATASET_PATH)
    main(dataframe)