Ejemplo n.º 1
0
def train_test():
    """Return the per-class 80/20 train/test split, building it on first use.

    When both ``pickle_train`` and ``pickle_test`` exist in the working
    directory they are loaded and returned. Otherwise the dataset stored in
    ``pickle_data_discrete`` is split one class at a time (classes 0..4,
    80% train / 20% test each), the partitions are appended to
    ``train_dataset.csv`` / ``test_dataset.csv`` and pickled for later calls.

    Returns:
        tuple: ``(train, test)``, both re-indexed from 0.
    """
    if os.path.isfile('pickle_train') and os.path.isfile('pickle_test'):
        test = util.load_pickle('pickle_test')
        train = util.load_pickle('pickle_train')

        # The pickles keep the row labels of the original dataset; renumber.
        test.index = range(len(test))
        train.index = range(len(train))
        return train, test

    # Local import: only the rebuild path needs pandas directly.
    import pandas as pd

    # Load the dataset preprocessed before
    dataset = util.load_pickle('pickle_data_discrete')

    # NOTE(review): the header is hard-coded while the rows below are written
    # in the frame's own column order - confirm both agree.
    header = [
        'acceleration_mean', 'acceleration_stdev', 'pitch1', 'pitch2',
        'pitch3', 'roll1', 'roll2', 'roll3', 'classes',
        'total_accel_sensor_1', 'total_accel_sensor_2',
        'total_accel_sensor_4'
    ]

    # Start both CSV files afresh with just the header row.
    for csv_path in ('train_dataset.csv', 'test_dataset.csv'):
        with open(csv_path, 'w', newline='') as csv_file:
            csv.writer(csv_file).writerow(header)

    train_parts = []
    test_parts = []
    for label in range(5):
        # Split each class on its own so every class keeps the 80/20 ratio.
        rows = dataset.loc[dataset['classes'] == label]
        train, test = train_test_split(rows, test_size=0.2)
        train_parts.append(train)
        test_parts.append(test)

        with open('train_dataset.csv', 'a', newline='') as csv_file:
            train.to_csv(csv_file, header=False, index=False)
        with open('test_dataset.csv', 'a', newline='') as csv_file:
            test.to_csv(csv_file, header=False, index=False)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    total_train = pd.concat(train_parts)
    total_test = pd.concat(test_parts)

    # Pickled before renumbering, matching what the cached branch expects.
    util.to_pickle(total_train, 'pickle_train')
    util.to_pickle(total_test, 'pickle_test')

    total_train.index = range(len(total_train))
    total_test.index = range(len(total_test))
    return total_train, total_test
Ejemplo n.º 2
0
	def __init__(self, name_tagger, corpus, mwe=True):
		"""
		Load the unigram, bigram and trigram taggers, training them on first use.

		Every tagger is loaded from its pickle and exercised once; if that
		raises IOError the taggers are trained with train_tagger() and then
		loaded.

		If Multi-Word Expressions are not allowed, the corpus sentences go
		through unchunk() and are re-tagged with the unigram tagger of an
		MWE-enabled instance before training.

		Args:
			name_tagger:	root part of the name of the tagger, like cess_esp
			corpus:			corpus that will train the tagger
			mwe:			It can allow Multi-Word Expressions
		"""

		self.mwe = mwe

		# Keep the unsuffixed root: the auxiliary MWE tagger below must be built
		# from it, otherwise its MWE-trained models are saved under the no-MWE
		# names and the real no-MWE models end up under a doubled suffix.
		base_name = name_tagger
		if not mwe:
			name_tagger = base_name + '_' + NOMWE_TEXT

		# set the names of the taggers like:
		#		cess_es_unigram.tagger,			cess_es_bigram.tagger
		#	or	cess_es_nomwe_unigram.tagger,	cess_es_nomwe_bigram.tagger
		complete_names = [name_tagger + '_' + x for x in N_GRAM_NAMES]

		# Try to load the taggers; tagging one word checks each is usable.
		try:
			for x in complete_names:
				utilities.load_pickle(x, TAGGER_EXTENSION, TAGGER_PATH).tag(['hola'])

		# If that does not work, create them
		except IOError:
			# Single-argument print(...) behaves the same on Python 2 and 3.
			print("\n*** First-time use of %s  taggers ***" % (name_tagger,))
			print("Training taggers ...")

			timer = utilities.Timer()

			if self.mwe:
				cess_sents = corpus.tagged_sents()
				train_tagger(name_tagger, cess_sents)

			else:
				# Without multiwords we need to split them
				cess_sents = unchunk(corpus.tagged_sents())

				# We need the MWE tagger (unsuffixed name) to tag the split corpus
				aux_tagger = tagger(base_name, corpus, mwe=True)
				tagged_cess_nomwe = aux_tagger.uni.tag_sents(cess_sents)

				# name_tagger already ends in the no-MWE suffix, so the models
				# are stored under the names loaded below.
				train_tagger(name_tagger, tagged_cess_nomwe)

			print("\nAll taggers trained in %s seconds" % (timer.get_time(),))

		# Load tagger
		self.uni = utilities.load_pickle(complete_names[0], TAGGER_EXTENSION, TAGGER_PATH)
		self.bi = utilities.load_pickle(complete_names[1], TAGGER_EXTENSION, TAGGER_PATH)
		self.tri = utilities.load_pickle(complete_names[2], TAGGER_EXTENSION, TAGGER_PATH)
Ejemplo n.º 3
0
def sent_tokenize(text, language="spanish"):
    """
    Split a text into sentences.

    A tokenizer pickled under ``nltk_data/tokenizer/punkt/`` is tried first;
    if loading or running it raises ``IOError`` the work is delegated to
    ``nltk.sent_tokenize``.

    Args:
        text:       text to be split
        language:   language of the tokenizer to be used

    Returns:
        List of sentences
    """
    try:
        # Preferred route: the tokenizer stored locally.
        from utilities import load_pickle

        local_tokenizer = load_pickle(
            language, ".pickle", path="nltk_data/tokenizer/punkt/")
        sentences = local_tokenizer.tokenize(text)
    except IOError:
        # Local tokenizer unavailable: let nltk do the work.
        from nltk import sent_tokenize as nltk_sent_tokenize

        sentences = nltk_sent_tokenize(text, language)

    return sentences
Ejemplo n.º 4
0
def get_classifier(name, x_train=None, y_train=None, train=False):
    """
    Load the named classifier, training it when loading is not possible.

    Args:
        name:       name of the classifier
        x_train:    metrics to train the classifier
        y_train:    labels to train the classifier
        train:      if true, train the classifier without trying to load it

    Returns:
        the classifier, or None when the pickle is missing and either
        x_train or y_train was not supplied (a message is printed).
    """
    # if asked by user, train anyway
    if train:
        return train_classifier(name, x_train, y_train)

    # if not, try to load previously processed data
    try:
        return u.load_pickle(name, path=ML_path)
    except IOError:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Pickle object not found, starting to train the classifier")

    # Training needs both inputs. Returning None here replaces the
    # UnboundLocalError the old `return clf` raised on this path.
    if x_train is None or y_train is None:
        print("\n\nNot possible to train the classifier without x_train and y_train\n\n")
        return None

    return train_classifier(name, x_train, y_train)
def tokenize_outputs(model_path, test_x, output_path):
    """Run the baseline model over ``test_x`` and write one prediction per line.

    Args:
        model_path: path of the saved state dict loaded into ``BaselineModel``.
        test_x: iterable of character sequences; every character is mapped
            through ``char2idx`` before prediction.
        output_path: text file (UTF-8) receiving the joined decoded labels,
            one line per element of ``test_x``.
    """
    resources_dir = os.path.join(os.getcwd(), 'resources')
    char2idx = load_pickle(os.path.join(resources_dir, 'char2idx.pkl'))
    idx2label = load_pickle(os.path.join(resources_dir, 'idx2label.pkl'))

    # Model dimensions follow the sizes of the two pickled mappings.
    params = HyperParameters()
    params.vocab_size = len(char2idx.items())
    params.num_classes = len(idx2label.items())

    network = BaselineModel(params)
    try:
        weights = torch.load(model_path)
        network.load_state_dict(weights)
    except RuntimeError:
        # Retry with every tensor mapped onto the CPU.
        cpu_weights = torch.load(model_path, map_location=torch.device('cpu'))
        network.load_state_dict(cpu_weights)

    decoded = []
    for sample in tqdm(test_x, desc='Computing predictions'):
        # Characters missing from the mapping fall back to index 1
        # (presumably the unknown-character id - TODO confirm).
        indices = [char2idx.get(char, 1) for char in sample]
        batch = torch.LongTensor(indices).unsqueeze(0)
        logits, _ = predict(network, batch)
        decoded.append(WikiDataset.decode_output(logits, idx2label)[0])

    with open(output_path, encoding='utf-8', mode='w+') as outputs_file:
        for labels in tqdm(decoded, desc='Writing predictions'):
            outputs_file.write(''.join(labels) + '\n')
Ejemplo n.º 6
0
def plot_all(folder, config=None):
    """Produce every plot for the results stored in ``folder``.

    Args:
        folder: directory holding ``Intermediates/*.xlsx`` and ``ss*.xlsx``
            (plus a ``*.pkl`` config when ``config`` is omitted).
        config: configuration object; when ``None`` it is unpickled from
            ``folder``. Its ``width``, ``height`` and ``plot_folder``
            attributes are set here and ``results_folder`` is read.

    Raises:
        IndexError: if one of the expected files is missing from ``folder``.
    """
    # glob returns matches in arbitrary order; sorting makes "[-1]" pick the
    # same file (the last one by name) on every run.
    if config is None:
        filename = sorted(glob.glob(folder + '/*.pkl'))[-1]
        config = load_pickle(filename)

    # NOTE(review): these 'seaborn-*' style names were renamed to
    # 'seaborn-v0_8-*' in recent matplotlib releases - confirm the pinned version.
    plt.style.use(['seaborn-paper', 'seaborn-whitegrid'])
    plt.rc("font", family="serif")
    plt.rc('xtick', labelsize='x-small')
    plt.rc('ytick', labelsize='x-small')
    width = 4
    height = width / 1.618  # golden-ratio aspect
    plt.close('all')
    config.width = width
    config.height = height

    # Read data from folder
    intermediates_file = sorted(glob.glob(folder + '/Intermediates/*.xlsx'))[-1]
    steady_state_file = sorted(glob.glob(folder + '/ss*.xlsx'))[-1]
    data_opt = fix_units(pd.read_excel(intermediates_file))
    data_ss = fix_units(pd.read_excel(steady_state_file))

    # Set output folder (exist_ok avoids the check-then-create race)
    plot_folder = os.path.join(config.results_folder, 'Plots')
    os.makedirs(plot_folder, exist_ok=True)
    config.plot_folder = plot_folder

    # Plot solar data
    solar_plots(data_ss, config)

    # Plot energy data
    energy_plots(data_opt, data_ss, config)

    # Plot 3D trajectory
    trajectory_plots_3D(data_opt, config)

    # Plot miscellaneous plots
    misc_plots(data_opt, config)
Ejemplo n.º 7
0
    print("Extracting useful features from the dataset..")
    dataset = features_extraction(dataset)
    print('Dataset with extracted features')
    print(dataset[:100])

    #Sample dataset
    print("Sampling dataset...")
    dataset_sampled = sample_dataset(dataset)
    print(dataset_sampled)

    #Save dataframe as pickle
    print('Saving dataset_sampled to pickle object...')
    util.to_pickle(dataset_sampled, 'dataset_sampled')

    print('\n Dataset sampled')
    dataset_sampled = util.load_pickle('dataset_sampled')
    print(dataset_sampled)

    #Feature selection (Mark Hall's algorithm, reference paper)
    print('Feature selection...')
    dataset_discrete = feature_selection_discretization(dataset_sampled)

    with open('discrete_dataset.csv', 'w', newline='') as c:
        dataset_discrete.to_csv(c, header=True, index=False)
    c.close()

    print("Discrete dataset\n")
    print(dataset_discrete)

    #Save dataframe as pickle
    print('Saving dataset_discrete to pickle object...')
def predict_multilingual(input_path: str, output_path: str, resources_path: str, lang: str) -> None:
    """
    DO NOT MODIFY THE SIGNATURE!
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the "<id> <BABELSynset>" format (e.g. "d000.s000.t000 bn:01234567n").

    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resource_path instead, otherwise we will not be able to run the code.
    If you don't know what HARD CODING means see: https://en.wikipedia.org/wiki/Hard_coding

    Implementation notes: every resource (model, tokenizer, lemma2synsets,
    babelnet2wordnet mapping) is read from ``resources_path``. The gold key
    file is expected next to the input, at the input path with "data.xml"
    replaced by "gold.key.txt"; it is used both while parsing and to print a
    micro-averaged F1 score after the predictions are written.

    :param input_path: the path of the input file to predict in the same format as Raganato's framework (XML files you downloaded).
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :param lang: the language of the dataset specified in input_path, specify which model to load on which dataset
        (currently not used by the body)
    :return: None
    """
    # load the model
    model_path = os.path.join(resources_path, 'SensEmbed_BiLSTM_ATT_MultiTask_model.h5')
    model = load_model(model_path, custom_objects={'SeqSelfAttention': SeqSelfAttention})
    logging.info(f'{model._name} is loaded.')

    # load tokenizer, fetch our vocabulary size
    tokenizer_path = os.path.join(resources_path, 'multilingual_tokenizer.pkl')
    tokenizer = load_pickle(tokenizer_path)

    # Sense tokens are the vocabulary entries carrying a BabelNet id.
    word_tokens = [word for word in tokenizer.word_index if 'bn:' not in word]
    sense_tokens = [word for word in tokenizer.word_index if 'bn:' in word]
    vocabulary_size = len(word_tokens)
    output_size = vocabulary_size + len(sense_tokens)

    batch_size = 8  # hard coded; as this was the one worked on Colab Google

    # Parse the testing dataset
    gold_path = input_path.replace("data.xml", "gold.key.txt")
    gold_dict = build_dict(gold_path)
    data_x, mask_x = parse_test(input_path, tokenizer=tokenizer, gold_dict=gold_dict, batch_size=batch_size)

    # Getting the model predictions
    predictions = []
    for batch_x, batch_mask in tqdm(test_generator(np.array(data_x), batch_size, output_size,
                                                   use_elmo=False, mask_builder=np.array(mask_x),
                                                   tokenizer=tokenizer, use_bert=False),
                                    desc="Predicting Senses"):
        # The first model output is reduced over its last axis to one index per token.
        batch_pred = model.predict_on_batch([batch_x, batch_mask])
        y_hat = np.argmax(batch_pred[0], axis=-1)
        predictions.extend(y_hat)

    # load lemma2synsets from the resources folder, like every other
    # resource (this path used to be hard-coded to <cwd>/resources)
    lemma2synsets_file_path = os.path.join(resources_path, 'lemma2synsets4.0.xx.wn.ALL.txt')
    lemma_synsets = get_lemma2synsets(lemma2synsets_file_path)

    # load wordnet 2 babelnet synsets' mapping
    bn2wn_path = os.path.join(resources_path, "babelnet2wordnet.tsv")
    _, wordnet_babelnet_ = build_bn2wn_dict(bn2wn_path)

    # "<word_id>\t<babelnet_sense>" lines for the output file
    id_bn_list = []
    # bare babelnet senses, aligned with id_bn_list, used for scoring
    _predictions = []
    for i, sentence in enumerate(tqdm(data_x, desc="Preparing models' predictions")):
        for j, word in enumerate(sentence):
            if len(mask_x[i][j]) != 2:  # only 2-element mask entries are instances
                continue

            prediction = predictions[i][j]
            prediction_sense_ = tokenizer.index_word.get(prediction, '<OOV>')

            # Fall back only when the predicted token carries no BabelNet id.
            # The former test ('wn:' not in ... or 'bn:' not in ...) also
            # threw away every valid 'bn:' prediction lacking a 'wn:' marker.
            if 'bn:' not in prediction_sense_:
                prediction_sense = predict_multilingual_sense(word=word, word2idx=tokenizer.word_index,
                                                              lemma_synsets=lemma_synsets,
                                                              wordnet_babelnet=wordnet_babelnet_)
            else:
                prediction_sense = prediction_sense_[prediction_sense_.find('bn:'):]

            word_id = mask_x[i][j][1]
            if word_id is None:
                continue
            bn = prediction_sense if prediction_sense is not None else '<OOV>'
            id_bn_list.append(f'{word_id}\t{bn}')
            _predictions.append(bn)

    # Writing model predictions
    with open(output_path, encoding='utf-8', mode="w+") as output_file:
        for id_bn in tqdm(id_bn_list, desc="Writing model predictions"):
            output_file.write(f'{id_bn}\n')

    # Fetching the ground truth of the data (second field of every gold line)
    with open(gold_path, encoding='utf-8', mode='r') as ground_truth_file:
        ground_truth = [line.split()[1] for line in ground_truth_file.read().splitlines()]

    # Compute F1_Score
    _, _, f1score, _ = precision_recall_fscore_support(ground_truth, _predictions, average='micro')
    print(f'{model._name} F1_score: {f1score}')
Ejemplo n.º 9
0
def predict_lexicographer(input_path: str, output_path: str,
                          resources_path: str):
    """
    DO NOT MODIFY THE SIGNATURE!
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the "<id> <lexicographerId>" format (e.g. "d000.s000.t000 noun.animal").

    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resource_path instead, otherwise we will not be able to run the code.
    If you don't know what HARD CODING means see: https://en.wikipedia.org/wiki/Hard_coding

    Implementation notes: ``use_elmo`` and ``batch_size`` are read from
    ``config.yaml`` in the current working directory; the tokenizer, model and
    mapping tables are read from ``resources_path``.

    :param input_path: the path of the input file to predict in the same format as Raganato's framework (XML files you downloaded).
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :return: None
    """
    # Load configuration params. The handle is closed by the context manager;
    # safe_load parses plain data only (yaml.load() without a Loader is
    # rejected by PyYAML >= 6 and unsafe on older versions).
    config_path = os.path.join(os.getcwd(), 'config.yaml')
    with open(config_path) as config_file:
        config_params = yaml.safe_load(config_file)

    # Parse and process Test set
    elmo = config_params["use_elmo"]
    batch_size = config_params["batch_size"]
    tokenizer_path = os.path.join(resources_path, 'tokenizer.pkl')
    tokenizer = load_pickle(tokenizer_path)

    data_x, test_mask_builder = parse_test(input_path, tokenizer, batch_size,
                                           elmo)  # Raw Data

    # Sense tokens are the vocabulary entries starting with a WordNet id.
    word_tokens = [
        word for word in tokenizer.word_index if not word.startswith('wn:')
    ]
    sense_tokens = [
        word for word in tokenizer.word_index if word.startswith('wn:')
    ]
    vocabulary_size = len(word_tokens)
    output_size = vocabulary_size + len(sense_tokens)

    # Load model
    model_path = os.path.join(resources_path, 'Baseline_model.h5')
    custom_objs = {'ElmoEmbeddingLayer': ElmoEmbeddingLayer}

    model = load_model(model_path, custom_objects=custom_objs)
    logging.info(f'{model._name} is loaded.')

    # Model Predictions
    predictions = []
    for batch_x, batch_test in tqdm(test_generator(np.array(data_x),
                                                   batch_size, output_size,
                                                   elmo,
                                                   np.array(test_mask_builder),
                                                   tokenizer, False),
                                    desc="Predicting_Lexes"):
        # Reduce the model output over its last axis to one index per token.
        batch_pred = model.predict_on_batch([batch_x, batch_test])
        y_hat = np.argmax(batch_pred, axis=-1)
        predictions.extend(y_hat)

    # Dictionaries to get mappings
    bn2wn_path = os.path.join(resources_path, 'babelnet2wordnet.tsv')
    _, wordnet_babelnet = build_bn2wn_dict(bn2wn_path)
    bn2lex_path = os.path.join(resources_path, 'babelnet2lexnames.tsv')
    babelnet_lex, _ = build_bn2lex_dict(bn2lex_path)

    # "<id>\t<lexname>" lines for the output file
    id_lex_list = []
    for i, sentence in enumerate(
            tqdm(data_x, desc='Formatting model predictions into ID_LEX')):
        for j, word in enumerate(sentence):
            if len(test_mask_builder[i][j]) != 2:  # only 2-element entries are instances
                continue

            prediction = predictions[i][j]
            prediction_sense = tokenizer.index_word.get(prediction, '<OOV>')

            # Fall back only when the predicted token is not a WordNet sense.
            # The former test ('wn:' not in ... or 'bn:' not in ...) was true
            # for every 'wn:' token without a 'bn:' marker, so the model's
            # prediction was always replaced by the fallback.
            if 'wn:' not in prediction_sense:
                prediction_sense = predict_sense(token=word)  # Fallback

            word_id = test_mask_builder[i][j][1]
            # Keep what follows the first '.' of the instance id.
            id_ = word_id[word_id.find('.') + 1:]
            bn = wordnet_babelnet.get(prediction_sense, None)
            # Senses without a known lexname are labelled 'factotum'.
            lex = babelnet_lex.get(bn, 'factotum')
            if id_ is None or lex is None:
                continue
            id_lex_list.append(f'{id_}\t{lex}')

    with open(output_path, encoding='utf-8', mode="w+") as output_file:
        for id_lex in tqdm(id_lex_list, desc="Writing model predictions"):
            output_file.write(f'{id_lex}\n')
                              'Stacked_BiLSTM_CRF_Fasttext_2315.pth')

    configure_workspace(seed=1873337)

    train_dataset, dev_dataset, test_dataset = prepare_data(CRF_MODEL)
    train_dataset_ = DataLoader(dataset=train_dataset, batch_size=batch_size)
    dev_dataset_ = DataLoader(dataset=dev_dataset, batch_size=batch_size)
    test_dataset_ = DataLoader(dataset=test_dataset, batch_size=batch_size)

    embeddings_path = os.path.join(RESOURCES_PATH, 'wiki.en.vec')
    pretrained_embeddings = load_pretrained_embeddings(
        embeddings_path, train_dataset.word2idx, 300,
        is_crf=CRF_MODEL) if PRETRAINED else None

    idx2label = load_pickle(
        os.path.join(RESOURCES_PATH,
                     'Stacked_BiLSTM_CRF_Fasttext_2315_idx2label.pkl'))
    word2idx = load_pickle(
        os.path.join(RESOURCES_PATH,
                     'Stacked_BiLSTM_CRF_Fasttext_2315_word2idx.pkl'))
    hp = HyperParameters(name_, word2idx, train_dataset.idx2label,
                         pretrained_embeddings, batch_size)

    model = CRF_Model(hp).to(
        train_dataset.get_device) if CRF_MODEL else BaselineModel(hp).to(
            train_dataset.get_device)
    model.load_model(model_path)

    evaluator = Evaluator(model, test_dataset_, CRF_MODEL)
    evaluator.check_performance(idx2label)
    tokens = test_dataset.data_x
    if save_to is not None:
        plt.savefig(f'{save_to}_acc.png')
    plt.show()


def visualize_plot_mdl(visualize, plot, model):
    """Optionally print the summary of ``model`` and save a diagram of it.

    Args:
        visualize: when truthy, print a header and call ``model.summary()``.
        plot: when truthy, render the model with ``plot_model`` into
            ``<cwd>/resources/<name>_Model.png``.
        model: object exposing ``_name`` and ``summary()``.
    """
    # 'Balabizo' stands in for a model that has no name
    name = model._name
    if name is None:
        name = 'Balabizo'

    if visualize:
        print('\n{} Model Summary: \n'.format(name))
        model.summary()

    if plot:
        target = os.path.join(os.getcwd(), 'resources',
                              '{}_Model.png'.format(name))
        plot_model(model, to_file=target, show_shapes=True)
        logging.info('{} model image saved'.format(name))

    logging.info('{} model is created & compiled'.format(name))


def train_model_huge_dataset(dir_path, model):
    """Placeholder for training ``model`` iteratively over pickle files.

    Not implemented yet: both arguments are ignored and ``None`` is returned.
    """
    # TODO: loop over the pickle files, load them, train the model and
    #       compute the F1 score iteratively
    # TODO: read http://nlpprogress.com/english/word_sense_disambiguation.html
    # TODO: read about regularizers in LSTMs
    return None


if __name__ == '__main__':
    # Script entry point: unpickle the stored history object from the
    # resources folder and hand it to plot_history.
    resources_dir = os.path.join(os.getcwd(), 'resources')
    history_file = os.path.join(
        resources_dir, 'SensEmbed_BiLSTM_ATT_MultiTask_history.pkl')
    plot_history(load_pickle(history_file))