def test_ignore_missing_cues():
    """Check that unknown cues raise unless ``ignore_missing_cues`` is set.

    The third event contains the cue ``'c4'``, which has no column in the
    weight matrix.  Without ``ignore_missing_cues`` the call is expected to
    raise ``KeyError``; with it, the single- and multi-threaded results must
    both match the reference activations.
    """
    weight_values = np.array([[0, 1, 0],
                              [1, 0, 0]])
    weights = xr.DataArray(
        weight_values,
        coords={
            'outcomes': ['o1', 'o2'],
            'cues': ['c1', 'c2', 'c3'],
        },
        dims=('outcomes', 'cues'),
    )
    events = [
        (['c1', 'c2', 'c3'], []),
        (['c1', 'c3'], []),
        (['c2', 'c4'], []),  # 'c4' is not part of the weight matrix
        (['c1', 'c1'], []),
    ]
    reference_activations = np.array([[1, 0, 1, 0],
                                      [1, 1, 0, 1]])

    # Strict mode: the missing cue has to surface as a KeyError.
    with pytest.raises(KeyError):
        activations = activation(events, weights, number_of_threads=1,
                                 remove_duplicates=True)

    lenient = {'remove_duplicates': True, 'ignore_missing_cues': True}
    activations = activation(events, weights, number_of_threads=1, **lenient)
    activations_mp = activation(events, weights, number_of_threads=3, **lenient)

    assert np.allclose(reference_activations, activations)
    assert np.allclose(reference_activations, activations_mp)
def test_exceptions():
    """Check that ``activation`` raises ``ValueError`` with the expected text.

    Two failure modes are covered: events with duplicated cues/outcomes while
    ``remove_duplicates`` is left unset, and a weights argument of an
    unsupported type.

    The message checks are placed *after* each ``pytest.raises`` block and
    compare against ``str(e_info.value)``.  An assertion placed after the
    raising call inside the ``with`` body is never executed, and comparing
    the ``ExceptionInfo`` object itself to a string can never be equal.
    """
    # Build the weights outside the ``raises`` block so that only the
    # ``activation`` call is allowed to satisfy the expected exception.
    wm = ndl.dict_ndl(FILE_PATH_SIMPLE, ALPHA, BETAS, remove_duplicates=None)

    with pytest.raises(ValueError) as e_info:
        activation(FILE_PATH_MULTIPLE_CUES, wm)
    # NOTE(review): expected text is taken verbatim from the previously
    # unreachable assertion - confirm it matches the message actually raised.
    assert str(e_info.value) == 'cues or outcomes needs to be unique: cues "a a"; outcomes "A"; use remove_duplicates=True'

    with pytest.raises(ValueError) as e_info:
        activation(FILE_PATH_MULTIPLE_CUES, "magic")
    assert str(e_info.value) == "Weights other than xarray.DataArray or dicts are not supported."
def test_activation_dict():
    """Check activations computed from nested-dict weights.

    The last event repeats the cue ``'c1'``; without ``remove_duplicates``
    the call is expected to raise ``ValueError``.  With
    ``remove_duplicates=True`` the per-outcome activation lists must match
    the reference values.
    """
    weights = defaultdict(lambda: defaultdict(float))
    weight_table = (
        ('o1', 'c1', 0),
        ('o1', 'c2', 1),
        ('o1', 'c3', 0),
        ('o2', 'c1', 1),
        ('o2', 'c2', 0),
        ('o2', 'c3', 0),
    )
    for outcome_name, cue_name, value in weight_table:
        weights[outcome_name][cue_name] = value

    events = [
        (['c1', 'c2', 'c3'], []),
        (['c1', 'c3'], []),
        (['c2'], []),
        (['c1', 'c1'], []),  # duplicated cue
    ]
    reference_activations = {
        'o1': [1, 0, 1, 0],
        'o2': [1, 1, 0, 1],
    }

    # Duplicates are rejected when remove_duplicates is not given.
    with pytest.raises(ValueError):
        activations = activation(events, weights, number_of_threads=1)

    activations = activation(events, weights, number_of_threads=1,
                             remove_duplicates=True)
    for outcome in activations:
        assert np.allclose(reference_activations[outcome], activations[outcome])
def predict_proba_eventfile_NDL(model, data_test, cue_index=None, outcome_index=None,
                                temp_dir=None, remove_temp_dir=True, T=1,
                                num_threads=1, chunksize=None):
    """Generate predicted probabilities for NDL.

    Parameters
    ----------
    model: object
        NDL model output; only its ``weights`` attribute is read here.
    data_test: str or pandas.DataFrame
        path to an event file, or a dataframe that is first written to a
        gz event file with ``df_to_gz``.
    cue_index: dict or None
        If None, all cues in the event file are used. Otherwise a dictionary
        that maps cues to indices should be given; only its keys are kept in
        the data. Default: None
    outcome_index: dict or None
        If None, all outcomes in the event file are used. Otherwise a
        dictionary that maps outcomes to indices should be given; only its
        keys are kept in the data. Default: None
    temp_dir: str or None
        directory where temporary event files (converted dataframe, filtered
        events) are written. Default: None (a folder 'TEMP_TRAIN_DIRECTORY'
        is created in the current working directory).
    remove_temp_dir: bool
        whether to remove the temporary directory afterwards. Only a
        directory created by this call is ever removed; a directory passed
        via ``temp_dir`` or one that already existed is left untouched.
        Default: True
    T: float
        temperature hyperparameter forwarded to ``activations_to_proba`` to
        adjust the confidence in the predictions. Low values increase the
        confidence in the predictions. Default: 1
    num_threads: int
        number of processes used when filtering the event file. The
        activations themselves are computed with a single thread.
        Default: 1
    chunksize: int or None
        chunk size handed to ``chunk``; the activations are computed chunk by
        chunk and the results joined. If None, the whole event file is
        processed in one call. Default: None

    Returns
    -------
    numpy array
        predicted probabilities of the outcomes for every event. Expected to
        be 2D with one row per event (assumes ``activations_to_proba``
        returns events along axis 0 - TODO confirm).

    Raises
    ------
    ValueError
        if ``data_test`` is neither a string nor a dataframe.
    """
    from pyndl.activation import activation
    from deep_text_modelling.preprocessing import df_to_gz

    # Resolve the working directory, remembering whether this call created it
    # so that cleanup never deletes a directory that belongs to someone else.
    created_temp_dir = False
    if temp_dir:
        work_dir = temp_dir
    else:
        work_dir = os.path.join(os.getcwd(), 'TEMP_TRAIN_DIRECTORY')
        try:
            os.mkdir(work_dir)
        except OSError:
            # Best effort: warn and carry on (e.g. the folder already exists).
            print("Creation of a temporary directory %s failed. This could be because "
                  "a folder with the same name already exists or you don't have the "
                  "required admin rights on the computer." % work_dir)
        else:
            created_temp_dir = True

    try:
        # Path to the test event file
        if isinstance(data_test, str):
            events_test_path = data_test
        elif isinstance(data_test, pd.DataFrame):
            # Use the resolved directory: temp_dir itself may be None.
            events_test_path = os.path.join(work_dir, 'data_test_temp.gz')
            df_to_gz(data=data_test, gz_outfile=events_test_path)
        else:
            raise ValueError("data_test should be either a path to an event file or a dataframe")

        # Keep only the cues/outcomes listed in the user-provided index
        # systems; without any index the event file is used unchanged.
        if cue_index or outcome_index:
            filtered_events_test_path = os.path.join(work_dir, 'filtered_events_test.gz')
            cues_to_keep = list(cue_index.keys()) if cue_index else 'all'
            outcomes_to_keep = list(outcome_index.keys()) if outcome_index else 'all'
            filter_event_file(events_test_path,
                              filtered_events_test_path,
                              number_of_processes=num_threads,
                              keep_cues=cues_to_keep,
                              keep_outcomes=outcomes_to_keep,
                              verbose=False)
        else:
            filtered_events_test_path = events_test_path

        if chunksize:
            proba_pred_arrays = []
            events = io.events_from_file(filtered_events_test_path)
            for events_chunk in chunk(events, chunksize):
                activations_test = activation(events=events_chunk,
                                              weights=model.weights,
                                              number_of_threads=1,
                                              remove_duplicates=True,
                                              ignore_missing_cues=True)
                # Predicted probabilities using softmax
                proba_pred_arrays.append(
                    activations_to_proba(activations=activations_test, T=T))
            # Join the chunks along the event axis. Stacking would add a
            # third dimension and fail when the last chunk is shorter.
            proba_pred = np.concatenate(proba_pred_arrays, axis=0)
        else:
            activations_test = activation(events=filtered_events_test_path,
                                          weights=model.weights,
                                          number_of_threads=1,
                                          remove_duplicates=True,
                                          ignore_missing_cues=True)
            # Predicted probabilities using softmax
            proba_pred = activations_to_proba(activations=activations_test, T=T)
    finally:
        # Clean up even when an error occurred, but only what we created.
        if remove_temp_dir and created_temp_dir:
            try:
                shutil.rmtree(work_dir)
            except OSError as e:
                print("Error: %s : %s" % (work_dir, e.strerror))

    return proba_pred
def predict_proba_eventfile_NDL(model, data_test, temp_dir=None, remove_temp_dir=True,
                                T=1, num_threads=1, chunksize=10000):
    """Generate predicted probabilities for NDL.

    Parameters
    ----------
    model: object
        NDL model output; only its ``weights`` attribute is read here.
    data_test: str or pandas.DataFrame
        path to an event file, or a dataframe that is first written to a
        gz event file with ``df_to_gz``.
    temp_dir: str or None
        directory where the converted gz file is stored if a dataframe is
        passed to ``data_test``. Default: None (a folder
        'TEMP_TRAIN_DIRECTORY' is created in the current working directory).
    remove_temp_dir: bool
        whether to remove the temporary directory afterwards. Only a
        directory created by this call is ever removed; a directory passed
        via ``temp_dir`` or one that already existed is left untouched.
        Default: True
    T: float
        temperature hyperparameter forwarded to ``activations_to_proba`` to
        adjust the confidence in the predictions. Low values increase the
        confidence in the predictions. Default: 1
    num_threads: int
        value passed as ``number_of_threads`` when computing the
        activations. Default: 1
    chunksize: int
        chunk size handed to ``chunk``; the activations are computed chunk by
        chunk and the results joined. Default: 10000

    Returns
    -------
    numpy array
        predicted probabilities for every event of the file, joined over all
        chunks (assumes ``activations_to_proba`` returns one row per event,
        i.e. events along axis 0 - TODO confirm).

    Raises
    ------
    ValueError
        if ``data_test`` is neither a string nor a dataframe, or if the
        event file yields no events.
    """
    from pyndl.activation import activation
    from deep_text_modelling.preprocessing import df_to_gz

    # Resolve the working directory, remembering whether this call created it
    # so that cleanup never deletes a directory that belongs to someone else.
    created_temp_dir = False
    if temp_dir:
        work_dir = temp_dir
    else:
        work_dir = os.path.join(os.getcwd(), 'TEMP_TRAIN_DIRECTORY')
        try:
            os.mkdir(work_dir)
        except OSError:
            # Best effort: warn and carry on (e.g. the folder already exists).
            print("Creation of a temporary directory %s failed. This could be because "
                  "a folder with the same name already exists or you don't have the "
                  "required admin rights on the computer." % work_dir)
        else:
            created_temp_dir = True

    try:
        # Path to the test event file
        if isinstance(data_test, str):
            events_test_path = data_test
        elif isinstance(data_test, pd.DataFrame):
            # Use the resolved directory: temp_dir itself may be None.
            events_test_path = os.path.join(work_dir, 'data_test_temp.gz')
            df_to_gz(data=data_test, gz_outfile=events_test_path)
        else:
            raise ValueError("data_test should be either a path to an event file or a dataframe")

        # Compute probabilities chunk by chunk and keep every chunk's result;
        # keeping only the last loop value would drop all earlier events.
        proba_chunks = []
        events = io.events_from_file(events_test_path)
        for events_chunk in chunk(events, chunksize):
            activations_test = activation(events=events_chunk,
                                          weights=model.weights,
                                          number_of_threads=num_threads,
                                          remove_duplicates=True,
                                          ignore_missing_cues=True)
            # Predicted probabilities using softmax
            proba_chunks.append(
                activations_to_proba(activations=activations_test, T=T))

        if not proba_chunks:
            raise ValueError("no events could be read from %s" % events_test_path)
        proba_pred = np.concatenate(proba_chunks, axis=0)
    finally:
        # Clean up even when an error occurred, but only what we created.
        if remove_temp_dir and created_temp_dir:
            try:
                shutil.rmtree(work_dir)
            except OSError as e:
                print("Error: %s : %s" % (work_dir, e.strerror))

    return proba_pred