def train_test():
    if os.path.isfile('pickle_train') and os.path.isfile('pickle_test'):
        test = util.load_pickle('pickle_test')
        train = util.load_pickle('pickle_train')
        test.index = [i for i in range(len(test))]
        train.index = [i for i in range(len(train))]
        return train, test
    else:
        # Load the dataset preprocessed earlier
        dataset = util.load_pickle('pickle_data_discrete')
        header = [
            'acceleration_mean', 'acceleration_stdev', 'pitch1', 'pitch2',
            'pitch3', 'roll1', 'roll2', 'roll3', 'classes',
            'total_accel_sensor_1', 'total_accel_sensor_2',
            'total_accel_sensor_4'
        ]
        # Write the header to the train and test csv files
        with open('train_dataset.csv', 'w', newline='') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(header)
        with open('test_dataset.csv', 'w', newline='') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(header)

        total_test = dataset[0:0]
        total_train = dataset[0:0]
        for i in range(5):
            # Select one class at a time
            c = dataset.loc[dataset['classes'] == i]
            # Split each class into 80% training and 20% testing
            train, test = train_test_split(c, test_size=0.2)
            total_test = total_test.append(test)
            total_train = total_train.append(train)
            # Append the results to the train and test csv files
            with open('train_dataset.csv', 'a', newline='') as csvFile:
                train.to_csv(csvFile, header=False, index=False)
            with open('test_dataset.csv', 'a', newline='') as csvFile:
                test.to_csv(csvFile, header=False, index=False)

        # Save train and test as pickle objects
        util.to_pickle(total_train, 'pickle_train')
        util.to_pickle(total_test, 'pickle_test')
        total_train.index = [i for i in range(len(total_train))]
        total_test.index = [i for i in range(len(total_test))]
        return total_train, total_test
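# A minimal usage sketch for train_test(). It assumes the surrounding script
# imports os, csv, util (the project's pickle helpers) and
# sklearn.model_selection.train_test_split, and that 'pickle_data_discrete'
# has already been created; the printed summary is illustrative only.
train, test = train_test()
print('train rows:', len(train), 'test rows:', len(test))
print(train['classes'].value_counts())  # roughly an 80/20 split per class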
def __init__(self, name_tagger, corpus, mwe=True):
    """
    When initialized it will load all the taggers. They are:
        * UnigramTagger
        * BigramTagger
        * TrigramTagger
    If that is not possible, it will create and save them.
    If Multi-Word Expressions are not allowed, it is necessary to split
    them and then train a UnigramTagger on the result.

    Args:
        name_tagger: root part of the name of the tagger, like cess_esp
        corpus: corpus that will train the tagger
        mwe: whether Multi-Word Expressions are allowed
    """
    self.mwe = mwe
    if not mwe:
        name_tagger += '_' + NOMWE_TEXT
    # Set the names of the taggers, like:
    #   cess_es_unigram.tagger, cess_es_bigram.tagger
    #   or cess_es_nomwe_unigram.tagger, cess_es_nomwe_bigram.tagger
    complete_names = [name_tagger + '_' + x for x in N_GRAM_NAMES]
    # Try to load the taggers.
    try:
        for x in complete_names:
            utilities.load_pickle(x, TAGGER_EXTENSION, TAGGER_PATH).tag(['hola'])
    # If that does not work, create them
    except IOError:
        print "\n*** First-time use of", name_tagger, "taggers ***"
        print "Training taggers ..."
        timer = utilities.Timer()
        if self.mwe:
            cess_sents = corpus.tagged_sents()
            train_tagger(name_tagger, cess_sents)
        else:
            # Without multi-word expressions we need to split them
            cess_sents = unchunk(corpus.tagged_sents())
            # We need the mwe tagger to train
            aux_tagger = tagger(name_tagger, corpus, mwe=True)
            tagged_cess_nomwe = aux_tagger.uni.tag_sents(cess_sents)
            train_tagger(name_tagger + '_' + NOMWE_TEXT, tagged_cess_nomwe)
        print "\nAll taggers trained in", timer.get_time(), "seconds"
    # Load the taggers
    self.uni = utilities.load_pickle(complete_names[0], TAGGER_EXTENSION, TAGGER_PATH)
    self.bi = utilities.load_pickle(complete_names[1], TAGGER_EXTENSION, TAGGER_PATH)
    self.tri = utilities.load_pickle(complete_names[2], TAGGER_EXTENSION, TAGGER_PATH)
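# A minimal usage sketch, assuming this __init__ belongs to the `tagger` class
# referenced above (see `aux_tagger = tagger(...)`) and that the NLTK cess_esp
# corpus is installed; the variable name is illustrative only.
from nltk.corpus import cess_esp
spanish_tagger = tagger('cess_esp', cess_esp)
print spanish_tagger.uni.tag(['hola', 'mundo'])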
def sent_tokenize(text, language="spanish"):
    """
    It splits the text into sentences

    Args:
        text: text to be split
        language: language of the tokenizer to be used

    Returns:
        List of sentences
    """
    # Try to use the local tokenizer
    try:
        from utilities import load_pickle
        tokenizer = load_pickle(language, ".pickle", path="nltk_data/tokenizer/punkt/")
        return tokenizer.tokenize(text)
    # If it is not available, fall back to nltk
    except IOError:
        from nltk import sent_tokenize
        return sent_tokenize(text, language)
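# A minimal usage sketch; the sample text is illustrative, and the fallback
# path requires the NLTK punkt data for Spanish to be available.
sentences = sent_tokenize("Hola. Esto es una prueba. Adios.")
print(sentences)  # ['Hola.', 'Esto es una prueba.', 'Adios.']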
def get_classifier(name, x_train=None, y_train=None, train=False):
    """
    It will load the specified classifier. If that is not possible it will
    train it, provided x_train and y_train are given.

    Args:
        name: name of the classifier
        x_train: metrics to train the classifier
        y_train: labels to train the classifier
        train: if true, it will force the classifier to be trained without loading it

    Returns:
        the classifier
    """
    # If asked by the user, train anyway
    if train:
        clf = train_classifier(name, x_train, y_train)
    # If not, try to load previously processed data
    else:
        try:
            clf = u.load_pickle(name, path=ML_path)
        # If not found, try to train it
        except IOError:
            print "Pickle object not found, starting to train the classifier"
            # Check if it is possible to train the classifier
            if x_train is None or y_train is None:
                print "\n\nNot possible to train the classifier without x_train and y_train\n\n"
                clf = None  # nothing can be returned without training data
            else:
                clf = train_classifier(name, x_train, y_train)
    return clf
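# A minimal usage sketch; 'svm' is an illustrative classifier name and the
# two-sample dataset is only a placeholder for real features and labels.
x, y = [[0.1, 0.2], [0.3, 0.4]], [0, 1]
clf = get_classifier('svm', x_train=x, y_train=y)
if clf is not None:
    print clf.predict([[0.1, 0.2]])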
def tokenize_outputs(model_path, test_x, output_path):
    RESOURCES_PATH = os.path.join(os.getcwd(), 'resources')
    char2idx_path = os.path.join(RESOURCES_PATH, 'char2idx.pkl')
    idx2label_path = os.path.join(RESOURCES_PATH, 'idx2label.pkl')
    char2idx = load_pickle(char2idx_path)
    idx2label = load_pickle(idx2label_path)

    vocab_size = len(char2idx.items())
    out_vocab_size = len(idx2label.items())

    # Init hyperparameters
    hyperparams = HyperParameters()
    hyperparams.vocab_size = vocab_size
    hyperparams.num_classes = out_vocab_size

    # Load model
    model = BaselineModel(hyperparams)
    try:
        model.load_state_dict(torch.load(model_path))
    except RuntimeError:
        model.load_state_dict(
            torch.load(model_path, map_location=torch.device('cpu')))

    # Compute predictions
    y_pred = []
    for data_x in tqdm(test_x, desc='Computing predictions'):
        data_x_ = [char2idx.get(char, 1) for char in data_x]
        data_x = torch.LongTensor(data_x_)
        logits, _ = predict(model, data_x.unsqueeze(0))
        pred_y = WikiDataset.decode_output(logits, idx2label)[0]
        y_pred.append(pred_y)

    # Save to text file
    with open(output_path, encoding='utf-8', mode='w+') as outputs_file:
        for prediction in tqdm(y_pred, desc='Writing predictions'):
            outputs_file.write(f"{''.join(prediction)}\n")
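# A minimal usage sketch; the checkpoint name and the input strings are
# illustrative, and the resources/ folder must contain the char2idx.pkl and
# idx2label.pkl files loaded above.
test_sentences = ['ciao come stai', 'hello world']
tokenize_outputs(os.path.join(os.getcwd(), 'resources', 'model.pt'),
                 test_sentences,
                 os.path.join(os.getcwd(), 'predictions.txt'))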
def plot_all(folder, config=None):
    # If no config object is given, load it from the selected folder
    if config is None:
        filename = glob.glob(folder + '/*.pkl')[-1]
        config = load_pickle(filename)

    plt.style.use(['seaborn-paper', 'seaborn-whitegrid'])
    plt.rc("font", family="serif")
    plt.rc('xtick', labelsize='x-small')
    plt.rc('ytick', labelsize='x-small')
    width = 4
    height = width / 1.618
    plt.close('all')
    config.width = width
    config.height = height

    # Read data from folder
    data_opt = fix_units(
        pd.read_excel(glob.glob(folder + '/Intermediates/*.xlsx')[-1]))
    data_ss = fix_units(pd.read_excel(glob.glob(folder + '/ss*.xlsx')[-1]))

    # Set output folder
    plot_folder = os.path.join(config.results_folder, 'Plots')
    if not os.path.exists(plot_folder):
        os.makedirs(plot_folder)
    config.plot_folder = plot_folder

    # Plot solar data
    solar_plots(data_ss, config)

    # Plot energy data
    energy_plots(data_opt, data_ss, config)

    # Plot 3D trajectory
    trajectory_plots_3D(data_opt, config)

    # Plot miscellaneous plots
    misc_plots(data_opt, config)
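# A minimal usage sketch; the results folder path is illustrative and must
# contain the config pickle, an Intermediates/*.xlsx file and an ss*.xlsx file
# that plot_all expects.
plot_all('Results/example_run')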
print("Extracting useful features from the dataset..") dataset = features_extraction(dataset) print('Dataset with extracted features') print(dataset[:100]) #Sample dataset print("Sampling dataset...") dataset_sampled = sample_dataset(dataset) print(dataset_sampled) #Save dataframe as pickle print('Saving dataset_sampled to pickle object...') util.to_pickle(dataset_sampled, 'dataset_sampled') print('\n Dataset sampled') dataset_sampled = util.load_pickle('dataset_sampled') print(dataset_sampled) #Feature selection (Mark Hall's algorithm, reference paper) print('Feature selection...') dataset_discrete = feature_selection_discretization(dataset_sampled) with open('discrete_dataset.csv', 'w', newline='') as c: dataset_discrete.to_csv(c, header=True, index=False) c.close() print("Discrete dataset\n") print(dataset_discrete) #Save dataframe as pickle print('Saving dataset_discrete to pickle object...')
def predict_multilingual(input_path: str, output_path: str, resources_path: str, lang: str) -> None:
    """
    DO NOT MODIFY THE SIGNATURE!
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the "<id> <BABELSynset>" format (e.g. "d000.s000.t000 bn:01234567n").

    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resource_path instead, otherwise we will not be able to run the code.
    If you don't know what HARD CODING means see: https://en.wikipedia.org/wiki/Hard_coding

    :param input_path: the path of the input file to predict in the same format as Raganato's framework (XML files you downloaded).
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :param lang: the language of the dataset specified in input_path; it specifies which model to load for which dataset
    :return: None
    """
    # Load the model
    model_path = os.path.join(resources_path, 'SensEmbed_BiLSTM_ATT_MultiTask_model.h5')
    model = load_model(model_path, custom_objects={'SeqSelfAttention': SeqSelfAttention})
    logging.info(f'{model._name} is loaded.')

    # Load the tokenizer and fetch our vocabulary size
    tokenizer_path = os.path.join(resources_path, 'multilingual_tokenizer.pkl')
    tokenizer = load_pickle(tokenizer_path)

    word_tokens = [word for word in tokenizer.word_index if 'bn:' not in word]
    sense_tokens = [word for word in tokenizer.word_index if 'bn:' in word]
    vocabulary_size = len(word_tokens)
    output_size = vocabulary_size + len(sense_tokens)
    batch_size = 8  # hard coded, as this was the configuration that worked on Google Colab

    # Parse the testing dataset
    gold_dict_path = input_path.replace("data.xml", "gold.key.txt")
    gold_dict = build_dict(gold_dict_path)
    data_x, mask_x = parse_test(input_path, tokenizer=tokenizer, gold_dict=gold_dict, batch_size=batch_size)

    # Get the model predictions
    predictions = []
    for batch_x, batch_mask in tqdm(test_generator(np.array(data_x), batch_size, output_size,
                                                   use_elmo=False, mask_builder=np.array(mask_x),
                                                   tokenizer=tokenizer, use_bert=False),
                                    desc="Predicting Senses"):
        # Output shape: (batch_size, max_len_per_batch, output_vocab_size)
        batch_pred = model.predict_on_batch([batch_x, batch_mask])
        y_hat = np.argmax(batch_pred[0], axis=-1)
        predictions.extend(y_hat)

    # Load lemma2synsets
    lemma2synsets_file_path = os.path.join(os.getcwd(), 'resources', 'lemma2synsets4.0.xx.wn.ALL.txt')
    lemma_synsets = get_lemma2synsets(lemma2synsets_file_path)

    # Load the WordNet to BabelNet synsets' mapping
    bn2wn_path = os.path.join(resources_path, "babelnet2wordnet.tsv")
    _, wordnet_babelnet_ = build_bn2wn_dict(bn2wn_path)

    # Save predictions to a file
    id_bn_list = []  # holds predictions in the {word_id babelnet_sense} format
    _predictions = []
    for i, sentence in enumerate(tqdm(data_x, desc="Preparing models' predictions")):
        for j, word in enumerate(sentence):
            if len(mask_x[i][j]) == 2:  # So it is an instance
                prediction = predictions[i][j]
                prediction_sense_ = tokenizer.index_word.get(prediction, '<OOV>')
                if 'wn:' not in prediction_sense_ or 'bn:' not in prediction_sense_:
                    # Fallback strategy
                    prediction_sense = predict_multilingual_sense(word=word, word2idx=tokenizer.word_index,
                                                                  lemma_synsets=lemma_synsets,
                                                                  wordnet_babelnet=wordnet_babelnet_)
                else:
                    prediction_sense = prediction_sense_[prediction_sense_.find('bn:'):]
                word_id = mask_x[i][j][1]
                bn = prediction_sense if prediction_sense is not None else '<OOV>'
                if word_id is None or bn is None:
                    continue
                id_bn_list.append(f'{word_id}\t{bn}')
                _predictions.append(bn)

    # Write model predictions
    with open(output_path, encoding='utf-8', mode="w+") as output_file:
        for id_bn in tqdm(id_bn_list, desc="Writing model predictions"):
            output_file.write(f'{id_bn}\n')

    # Fetch the ground truth of the data
    ground_truth = []
    ground_truth_path = input_path.replace("data.xml", "gold.key.txt")
    with open(ground_truth_path, encoding='utf-8', mode='r') as ground_truth_file:
        lines = ground_truth_file.read().splitlines()
        for line in lines:
            sense_key = line.split()[1]
            ground_truth.append(sense_key)

    # Compute the F1 score
    _, _, f1score, _ = precision_recall_fscore_support(ground_truth, _predictions, average='micro')
    print(f'{model._name} F1_score: {f1score}')
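# A minimal usage sketch for the multilingual predictor; the dataset paths and
# language code are illustrative, and the resources folder must contain the
# model, tokenizer and mapping files loaded above.
predict_multilingual(input_path='data/semeval2013.it.data.xml',
                     output_path='output/semeval2013.it.predictions.txt',
                     resources_path='resources',
                     lang='it')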
def predict_lexicographer(input_path: str, output_path: str, resources_path: str):
    """
    DO NOT MODIFY THE SIGNATURE!
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the "<id> <lexicographerId>" format (e.g. "d000.s000.t000 noun.animal").

    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resource_path instead, otherwise we will not be able to run the code.
    If you don't know what HARD CODING means see: https://en.wikipedia.org/wiki/Hard_coding

    :param input_path: the path of the input file to predict in the same format as Raganato's framework (XML files you downloaded).
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :return: None
    """
    # Load configuration params
    config_path = os.path.join(os.getcwd(), 'config.yaml')
    config_file = open(config_path)
    config_params = yaml.load(config_file)

    # Parse and process the test set
    elmo = config_params["use_elmo"]
    batch_size = config_params["batch_size"]
    tokenizer_path = os.path.join(resources_path, 'tokenizer.pkl')
    tokenizer = load_pickle(tokenizer_path)
    data_x, test_mask_builder = parse_test(input_path, tokenizer, batch_size, elmo)  # Raw data

    word_tokens = [
        word for word in tokenizer.word_index if not word.startswith('wn:')
    ]
    sense_tokens = [
        word for word in tokenizer.word_index if word.startswith('wn:')
    ]
    vocabulary_size = len(word_tokens)
    output_size = vocabulary_size + len(sense_tokens)

    # Load model
    model_path = os.path.join(resources_path, 'Baseline_model.h5')
    custom_objs = {'ElmoEmbeddingLayer': ElmoEmbeddingLayer}
    model = load_model(model_path, custom_objects=custom_objs)
    logging.info(f'{model._name} is loaded.')

    # Model predictions
    predictions = []
    for batch_x, batch_test in tqdm(test_generator(np.array(data_x), batch_size, output_size, elmo,
                                                   np.array(test_mask_builder), tokenizer, False),
                                    desc="Predicting_Lexes"):
        # Output shape: (batch_size, max_len_per_batch, output_vocab_size)
        batch_pred = model.predict_on_batch([batch_x, batch_test])
        # Output shape: (batch_size, max_len_per_batch)
        y_hat = np.argmax(batch_pred, axis=-1)
        predictions.extend(y_hat)

    # Dictionaries to get the mappings
    bn2wn_path = os.path.join(resources_path, 'babelnet2wordnet.tsv')
    _, wordnet_babelnet = build_bn2wn_dict(bn2wn_path)
    bn2lex_path = os.path.join(resources_path, 'babelnet2lexnames.tsv')
    babelnet_lex, _ = build_bn2lex_dict(bn2lex_path)

    # Save predictions to a file
    id_lex_list = []  # holds predictions in the {word_id lexicographer_id} format
    for i, sentence in enumerate(
            tqdm(data_x, desc='Formatting model predictions into ID_LEX')):
        for j, word in enumerate(sentence):
            if len(test_mask_builder[i][j]) == 2:  # So it is an instance
                prediction = predictions[i][j]
                prediction_sense = tokenizer.index_word.get(prediction, '<OOV>')
                if 'wn:' not in prediction_sense or 'bn:' not in prediction_sense:
                    prediction_sense = predict_sense(token=word)  # Fallback
                word_id = test_mask_builder[i][j][1]
                id_ = word_id[word_id.find('.') + 1:]
                bn = wordnet_babelnet.get(prediction_sense, None)
                lex = babelnet_lex.get(bn, 'factotum')
                if id_ is None or lex is None:
                    continue
                id_lex_list.append(f'{id_}\t{lex}')

    # Write model predictions
    with open(output_path, mode="w+") as output_file:
        for id_lex in tqdm(id_lex_list, desc="Writing model predictions"):
            output_file.write(f'{id_lex}\n')
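# A minimal usage sketch for the lexicographer-level predictor; the paths are
# illustrative, and config.yaml plus the resources listed above must exist.
predict_lexicographer(input_path='data/semeval2007.data.xml',
                      output_path='output/semeval2007.lex.predictions.txt',
                      resources_path='resources')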
model_path = os.path.join(RESOURCES_PATH, 'Stacked_BiLSTM_CRF_Fasttext_2315.pth')
configure_workspace(seed=1873337)

train_dataset, dev_dataset, test_dataset = prepare_data(CRF_MODEL)
train_dataset_ = DataLoader(dataset=train_dataset, batch_size=batch_size)
dev_dataset_ = DataLoader(dataset=dev_dataset, batch_size=batch_size)
test_dataset_ = DataLoader(dataset=test_dataset, batch_size=batch_size)

embeddings_path = os.path.join(RESOURCES_PATH, 'wiki.en.vec')
pretrained_embeddings = load_pretrained_embeddings(
    embeddings_path, train_dataset.word2idx, 300,
    is_crf=CRF_MODEL) if PRETRAINED else None

idx2label = load_pickle(
    os.path.join(RESOURCES_PATH, 'Stacked_BiLSTM_CRF_Fasttext_2315_idx2label.pkl'))
word2idx = load_pickle(
    os.path.join(RESOURCES_PATH, 'Stacked_BiLSTM_CRF_Fasttext_2315_word2idx.pkl'))

hp = HyperParameters(name_, word2idx, train_dataset.idx2label,
                     pretrained_embeddings, batch_size)

model = CRF_Model(hp).to(
    train_dataset.get_device) if CRF_MODEL else BaselineModel(hp).to(
        train_dataset.get_device)
model.load_model(model_path)

evaluator = Evaluator(model, test_dataset_, CRF_MODEL)
evaluator.check_performance(idx2label)
tokens = test_dataset.data_x
    if save_to is not None:
        plt.savefig(f'{save_to}_acc.png')
    plt.show()


def visualize_plot_mdl(visualize, plot, model):
    # Balabizo acts as placeholder
    name = model._name if model._name is not None else 'Balabizo'
    if visualize:
        print(f'\n{name} Model Summary: \n')
        model.summary()
    if plot:
        img_path = os.path.join(os.getcwd(), 'resources', f'{name}_Model.png')
        plot_model(model, to_file=img_path, show_shapes=True)
        logging.info(f"{name} model image saved")
    logging.info(f"{name} model is created & compiled")


def train_model_huge_dataset(dir_path, model):
    # TODO: LOOP ON PICKLE FILES, LOAD THEM, TRAIN MODEL, COMPUTE F1 SCORE (ITERATIVELY)
    # TODO: READ http://nlpprogress.com/english/word_sense_disambiguation.html
    # TODO: READ ABOUT REGULARIZERS IN LSTMs
    pass


if __name__ == '__main__':
    history_path = os.path.join(
        os.getcwd(), 'resources', 'SensEmbed_BiLSTM_ATT_MultiTask_history.pkl')
    hist = load_pickle(history_path)
    plot_history(hist)