コード例 #1
0
ファイル: test_activation.py プロジェクト: mikecroucher/pyndl
def test_ignore_missing_cues():
    """Activations with a cue that is absent from the weight matrix.

    The third event contains ``'c4'``, which is not among the ``cues``
    coordinates of the weights.  Without ``ignore_missing_cues`` the call is
    expected to raise ``KeyError``; with it, the single- and multi-threaded
    results must both match the reference activations.
    """
    cue_names = ['c1', 'c2', 'c3']
    outcome_names = ['o1', 'o2']
    weight_values = np.array([[0, 1, 0], [1, 0, 0]])
    weights = xr.DataArray(weight_values,
                           coords={'outcomes': outcome_names,
                                   'cues': cue_names},
                           dims=('outcomes', 'cues'))

    events = [
        (['c1', 'c2', 'c3'], []),
        (['c1', 'c3'], []),
        (['c2', 'c4'], []),
        (['c1', 'c1'], []),
    ]
    expected = np.array([[1, 0, 1, 0], [1, 1, 0, 1]])

    # Unknown cue 'c4' must not be silently accepted by default.
    with pytest.raises(KeyError):
        activation(events, weights, number_of_threads=1,
                   remove_duplicates=True)

    # Compute both variants first, then compare, so that each call is
    # executed regardless of the outcome of the other comparison.
    results = [
        activation(events, weights,
                   number_of_threads=n_threads,
                   remove_duplicates=True,
                   ignore_missing_cues=True)
        for n_threads in (1, 3)
    ]
    for result in results:
        assert np.allclose(expected, result)
コード例 #2
0
ファイル: test_activation.py プロジェクト: mikecroucher/pyndl
def test_exceptions():
    """Check the errors raised for duplicate cues and unsupported weights.

    The message checks are placed *after* each ``pytest.raises`` block:
    statements following the raising call inside the block are never
    executed.  The ``ExceptionInfo`` object returned by ``pytest.raises`` is
    unwrapped through ``.value`` before being compared with a string.
    """
    with pytest.raises(ValueError) as e_info:
        wm = ndl.dict_ndl(FILE_PATH_SIMPLE,
                          ALPHA,
                          BETAS,
                          remove_duplicates=None)
        activation(FILE_PATH_MULTIPLE_CUES, wm)
    message = str(e_info.value)
    # NOTE(review): both calls sit inside the raises-block, so either of them
    # may be the one raising.  Only the fragments of the expected message
    # that name the problem and the remedy are checked here; tighten this to
    # the exact wording once it has been confirmed against pyndl.
    assert 'needs to be unique' in message
    assert 'remove_duplicates=True' in message

    with pytest.raises(ValueError) as e_info:
        activation(FILE_PATH_MULTIPLE_CUES, "magic")
    assert str(e_info.value) == "Weights other than xarray.DataArray or dicts are not supported."
コード例 #3
0
ファイル: test_activation.py プロジェクト: mikecroucher/pyndl
def test_activation_dict():
    """Activations computed from weights given as nested dictionaries.

    The last event repeats the cue ``'c1'``; with the default
    ``remove_duplicates`` setting the call is expected to raise
    ``ValueError``, while ``remove_duplicates=True`` must yield the
    reference activations for every outcome.
    """
    weight_table = {
        'o1': {'c1': 0, 'c2': 1, 'c3': 0},
        'o2': {'c1': 1, 'c2': 0, 'c3': 0},
    }
    weights = defaultdict(lambda: defaultdict(float))
    for outcome_name, cue_weights in weight_table.items():
        for cue_name, value in cue_weights.items():
            weights[outcome_name][cue_name] = value

    events = [
        (['c1', 'c2', 'c3'], []),
        (['c1', 'c3'], []),
        (['c2'], []),
        (['c1', 'c1'], []),
    ]
    expected = {'o1': [1, 0, 1, 0], 'o2': [1, 1, 0, 1]}

    with pytest.raises(ValueError):
        activation(events, weights, number_of_threads=1)

    result = activation(events,
                        weights,
                        number_of_threads=1,
                        remove_duplicates=True)
    for outcome_name in result:
        assert np.allclose(expected[outcome_name], result[outcome_name])
コード例 #4
0
def predict_proba_eventfile_NDL(model, data_test, cue_index = None, outcome_index = None, temp_dir = None,
                                remove_temp_dir = True, T = 1, num_threads = 1, chunksize = None):

    """ Generate predicted probabilities for NDL

    Parameters
    ----------
    model: class
        NDL model outputs (its ``weights`` attribute is used to compute the activations)
    data_test: dataframe or str
        dataframe or path to an event file containing the test data
    cue_index: dict or None
        If None, all cues in the event file are used. Otherwise a dictionary that maps cues to indices should
        be given. The dictionary should include only the cues to keep in the data. Default: None
    outcome_index: dict or None
        If None, all outcomes in the event file are used. Otherwise a dictionary that maps outcomes to indices should
        be given. The dictionary should include only the outcomes to keep in the data. Default: None
    temp_dir: str
        directory where to store the converted gz file if a dataframe is passed to data_test, as well as
        the filtered event file. Default: None (will create a folder 'TEMP_TRAIN_DIRECTORY' in the current
        working directory)
    remove_temp_dir: Boolean
        whether or not to remove the temporary directory when it was created automatically (a directory
        passed through temp_dir is never removed). Default: True
    T: float
        temperature hyperparameter to adjust the confidence in the predictions from the activations.
        Low values increase the confidence in the predictions.
    num_threads: int
        number of processes used when filtering the event file by cue_index/outcome_index. The activations
        themselves are computed with a single thread. Default: 1
    chunksize : int or None
        number of lines to use for computing the activation matrix for these lines. If None, all lines will be used.
        Default: None

    Returns
    -------
    numpy 2D-array
        array of dim (num_events * num_outcomes), which contains, for each event, the predicted probabilities
        of the different outcomes

    Raises
    ------
    ValueError
        if data_test is neither a path (str) nor a dataframe
    """

    from pyndl.activation import activation
    from deep_text_modelling.preprocessing import df_to_gz

    # Create a temporary directory if not provided
    if not temp_dir:
        temp_dir0 = os.path.join(os.getcwd(), 'TEMP_TRAIN_DIRECTORY')
        # Warn (but carry on) if the creation of the temporary directory fails
        # (e.g. folder with the same name already existing)
        try:
            os.mkdir(temp_dir0)
        except OSError:
            # The fragments are concatenated into ONE string before '%' is applied;
            # formatting only the last fragment would raise a TypeError here.
            print("Creation of a temporary directory %s failed. This could be because "
                  "a folder with the same name already exists or you don't have the "
                  "required admin rights on the computer)." % temp_dir0)
    else:
        temp_dir0 = temp_dir

    try:
        ### Path to the test event file
        if isinstance(data_test, str):
            events_test_path = data_test
        elif isinstance(data_test, pd.DataFrame):
            # Use temp_dir0 (always set) rather than temp_dir, which is None
            # when the directory was created automatically
            events_test_path = os.path.join(temp_dir0, 'data_test_temp.gz')
            df_to_gz(data = data_test, gz_outfile = events_test_path)
        else:
            raise ValueError("data_test should be either a path to an event file or a dataframe")

        ### Filter the event files by retaining only the cues and outcomes that are in the index system
        ### (e.g. most frequent tokens) if these index systems are provided by the user.
        ### Otherwise, use all cues and/or outcomes
        if cue_index or outcome_index: # Filtering only if an index is provided

            # Path to the filtered file
            filtered_events_test_path = os.path.join(temp_dir0, 'filtered_events_test.gz')

            cues_to_keep = list(cue_index.keys()) if cue_index else 'all'
            outcomes_to_keep = list(outcome_index.keys()) if outcome_index else 'all'

            filter_event_file(events_test_path,
                              filtered_events_test_path,
                              number_of_processes = num_threads,
                              keep_cues = cues_to_keep,
                              keep_outcomes = outcomes_to_keep,
                              verbose = False)
        else:
            # No filtering requested: work directly on the original event file
            filtered_events_test_path = events_test_path

        if chunksize:
            proba_pred_arrays = []
            events = io.events_from_file(filtered_events_test_path)
            for events_chunk in chunk(events, chunksize):
                # Generate the activations for this chunk
                activations_test = activation(events = events_chunk,
                                              weights = model.weights,
                                              number_of_threads = 1,
                                              remove_duplicates = True,
                                              ignore_missing_cues = True)

                # Predicted probabilities using softmax
                proba_pred_arrays.append(activations_to_proba(activations = activations_test, T = T))
            # Join the chunks along the event axis. np.stack would add a new leading
            # axis and fail when the last chunk is shorter than the others.
            # NOTE(review): assumes activations_to_proba returns one row per event,
            # as stated in the Returns section -- confirm.
            proba_pred = np.concatenate(proba_pred_arrays, axis = 0)
        else:
            activations_test = activation(events = filtered_events_test_path,
                                          weights = model.weights,
                                          number_of_threads = 1,
                                          remove_duplicates = True,
                                          ignore_missing_cues = True)
            # Predicted probabilities using softmax
            proba_pred = activations_to_proba(activations = activations_test, T = T)
    finally:
        ### Remove temporary directory if it was automatically created and the option was selected
        ### by the user. Done in 'finally' so the folder does not linger after a failure.
        if remove_temp_dir and not temp_dir:
            try:
                shutil.rmtree(temp_dir0)
            except OSError as e:
                print("Error: %s : %s" % (temp_dir0, e.strerror))

    return proba_pred
コード例 #5
0
def predict_proba_eventfile_NDL(model, data_test, temp_dir = None, remove_temp_dir = True,
                                T = 1, num_threads = 1, chunksize = 10000):

    """ Generate predicted probabilities for NDL

    Parameters
    ----------
    model: class
        NDL model outputs (its ``weights`` attribute is used to compute the activations)
    data_test: dataframe or str
        dataframe or path to an event file containing the test data
    temp_dir: str
        directory where to store the converted gz file if a dataframe is passed to data_test
        (needed to compute the activation matrix). Default: None (will create a folder
        'TEMP_TRAIN_DIRECTORY' in the current working directory)
    remove_temp_dir: Boolean
        whether or not to remove the temporary directory when it was created automatically (a directory
        passed through temp_dir is never removed). Default: True
    T: float
        temperature hyperparameter to adjust the confidence in the predictions from the activations.
        Low values increase the confidence in the predictions.
    num_threads: int
        maximum number of processes to use when computing the activations. Default: 1
    chunksize : int
        number of lines to use for computing the activation matrix for these lines. Default: 10000

    Returns
    -------
    numpy array
        array containing the predicted probabilities of all events (the results of the
        individual chunks are concatenated along the first axis)

    Raises
    ------
    ValueError
        if data_test is neither a path (str) nor a dataframe
    """

    from pyndl.activation import activation
    from deep_text_modelling.preprocessing import df_to_gz

    # Create a temporary directory if not provided
    if not temp_dir:
        temp_dir0 = os.path.join(os.getcwd(), 'TEMP_TRAIN_DIRECTORY')
        # Warn (but carry on) if the creation of the temporary directory fails
        # (e.g. folder with the same name already existing)
        try:
            os.mkdir(temp_dir0)
        except OSError:
            # The fragments are concatenated into ONE string before '%' is applied;
            # formatting only the last fragment would raise a TypeError here.
            print("Creation of a temporary directory %s failed. This could be because "
                  "a folder with the same name already exists or you don't have the "
                  "required admin rights on the computer)." % temp_dir0)
    else:
        temp_dir0 = temp_dir

    try:
        ### Path to the test event file
        if isinstance(data_test, str):
            events_test_path = data_test
        elif isinstance(data_test, pd.DataFrame):
            # Use temp_dir0 (always set) rather than temp_dir, which is None
            # when the directory was created automatically
            events_test_path = os.path.join(temp_dir0, 'data_test_temp.gz')
            df_to_gz(data = data_test, gz_outfile = events_test_path)
        else:
            raise ValueError("data_test should be either a path to an event file or a dataframe")

        # Generate the activations chunk by chunk and keep the probabilities of
        # EVERY chunk (re-assigning a single variable in the loop would retain
        # only the last chunk)
        proba_pred_arrays = []
        events = io.events_from_file(events_test_path)
        for events_chunk in chunk(events, chunksize):
            activations_test = activation(events = events_chunk,
                                          weights = model.weights,
                                          number_of_threads = num_threads,
                                          remove_duplicates = True,
                                          ignore_missing_cues = True)

            # Predicted probabilities using softmax
            proba_pred_arrays.append(activations_to_proba(activations = activations_test, T = T))

        # NOTE(review): assumes activations_to_proba returns one row per event, so
        # that the chunks can be joined along axis 0 -- confirm.
        proba_pred = np.concatenate(proba_pred_arrays, axis = 0)
    finally:
        ### Remove temporary directory if it was automatically created (i.e. temp_dir was NOT given)
        ### and the option was selected by the user
        if remove_temp_dir and not temp_dir:
            try:
                shutil.rmtree(temp_dir0)
            except OSError as e:
                print("Error: %s : %s" % (temp_dir0, e.strerror))

    return proba_pred