def test_cues_outcomes():
    n_events, cues, outcomes = count.cues_outcomes(EVENT_RESOURCE_FILE)
    n_events3, cues3, outcomes3 = count.cues_outcomes(EVENT_RESOURCE_FILE,
                                                      number_of_processes=6,
                                                      verbose=True)
    assert n_events == 2772
    assert n_events == n_events3
    assert cues == cues3
    assert outcomes == outcomes3

def compare_arrays(file_path, arr1, arr2):
    _, cues, outcomes = count.cues_outcomes(file_path)
    cue_map, outcome_map, _ = generate_mapping(file_path)

    unequal = list()

    for outcome in outcomes:
        for cue in cues:
            values = list()
            for array in (arr1, arr2):
                if isinstance(array, np.ndarray):
                    values.append(array[outcome_map[outcome]][cue_map[cue]])
                elif isinstance(array, xr.DataArray):
                    values.append(array.loc[{'outcomes': outcome,
                                             'cues': cue}].values)
                elif isinstance(array, pd.DataFrame):
                    values.append(array.loc[outcome][cue])
                else:
                    values.append(array[outcome][cue])
            value1, value2 = values
            if not np.isclose(value1, value2, rtol=1e-02, atol=1e-05):
                unequal.append((outcome, cue, value1, value2))

    unequal_ratio = len(unequal) / (len(outcomes) * len(cues))
    return (unequal, unequal_ratio)

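# Illustrative only (not part of the original test suite): a minimal usage
# sketch of ``compare_arrays``. It builds two containers with identical
# values over the cues/outcomes of EVENT_RESOURCE_FILE and expects no
# mismatches. The helper name ``_example_compare_arrays`` is hypothetical.
def _example_compare_arrays():
    cue_map, outcome_map, _ = generate_mapping(EVENT_RESOURCE_FILE)
    arr = np.zeros((len(outcome_map), len(cue_map)))
    frame = pd.DataFrame(arr, index=list(outcome_map), columns=list(cue_map))
    # identical values in both containers, so nothing should compare unequal
    unequal, unequal_ratio = compare_arrays(EVENT_RESOURCE_FILE, arr, frame)
    assert unequal == [] and unequal_ratio == 0.0
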
def test_save_load():
    file_name = os.path.join(TEST_ROOT, "temp/cues.tab")
    _, cues, _ = count.cues_outcomes(EVENT_RESOURCE_FILE)
    count.save_counter(cues, file_name)
    cues_loaded = count.load_counter(file_name)
    assert cues == cues_loaded
    os.remove(file_name)

def generate_mapping(event_path):
    _, cues, outcomes = count.cues_outcomes(event_path)
    all_cues = list(cues.keys())
    all_outcomes = list(outcomes.keys())
    cue_map = OrderedDict((cue, ii) for ii, cue in enumerate(all_cues))
    outcome_map = OrderedDict((outcome, ii)
                              for ii, outcome in enumerate(all_outcomes))
    return (cue_map, outcome_map, all_outcomes)

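# Sanity sketch (illustrative, not an original test): ``generate_mapping``
# assigns consecutive integer ids in insertion order, so the values of each
# map enumerate 0..n-1. The helper name ``_example_generate_mapping`` is
# hypothetical.
def _example_generate_mapping():
    cue_map, outcome_map, all_outcomes = generate_mapping(EVENT_RESOURCE_FILE)
    assert list(cue_map.values()) == list(range(len(cue_map)))
    assert list(outcome_map.values()) == list(range(len(outcome_map)))
    assert list(outcome_map.keys()) == all_outcomes
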
def filter_tagged_event_file(input_event_file, filtered_event_file,
                             cues, outcomes,
                             fill_cues=0, fill_outcomes=0,
                             overwrite=False, number_of_processes=1):
    """Filters an event file with tokens and tags merged for collections of
    untagged cues and outcomes.

    Parameters
    ----------
    input_event_file : str or path
        Path to the event file with tokens and tags merged
    filtered_event_file : str or path
        Path to the resulting event file
    cues : collection
        Collection of target cues (without tags)
    outcomes : collection
        Collection of target outcomes (without tags)
    fill_cues : int
        Fill cues with the most frequent words up to size ``fill_cues``.
        If 0, no words will be added.
    fill_outcomes : int
        Fill outcomes with the most frequent words up to size
        ``fill_outcomes``. If 0, no words will be added.
    overwrite : bool
        Overwrite ``filtered_event_file`` if it exists
    number_of_processes : int
        Number of processes to use

    """
    if exists(filtered_event_file) and not overwrite:
        msg = f"'{filtered_event_file}' already exists and overwrite=False!"
        raise OSError(msg)

    _, all_cues, all_outcomes = cues_outcomes(
        input_event_file, number_of_processes=number_of_processes)

    cues = filter_tagged_vocabulary(all_cues, cues)
    outcomes = filter_tagged_vocabulary(all_outcomes, outcomes)

    if fill_cues:
        cues = add_most_frequent(cues, all_cues, fill_cues)
    if fill_outcomes:
        outcomes = add_most_frequent(outcomes, all_outcomes, fill_outcomes)

    filter_event_file(input_event_file, filtered_event_file,
                      keep_cues=cues, keep_outcomes=outcomes,
                      number_of_processes=number_of_processes)

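# Hypothetical usage sketch for ``filter_tagged_event_file`` (the file paths,
# word sets, and fill sizes below are assumptions for illustration, not
# fixtures of this test suite):
#
#     filter_tagged_event_file("corpus_tagged.tab.gz",
#                              "corpus_tagged_filtered.tab.gz",
#                              cues={"house", "garden"},
#                              outcomes={"house"},
#                              fill_cues=500, fill_outcomes=500,
#                              overwrite=True, number_of_processes=2)
#
# With ``fill_cues=500`` the kept cue set is padded with the most frequent
# cues from the input file until it contains 500 entries.
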
def test_preprocessing():
    corpus_file = os.path.join(TEST_ROOT, "resources/corpus.txt")
    event_file = os.path.join(TEST_ROOT, "temp/events_corpus.tab.gz")
    symbols = "abcdefghijklmnopqrstuvwxyzóąćęłńśźż"  # Polish

    # create the event file
    create_event_file(corpus_file, event_file, symbols,
                      context_structure="document",
                      event_structure="consecutive_words",
                      event_options=(3, ),
                      lower_case=True, verbose=True)

    # read in cues and outcomes
    _, cue_freq_map, outcome_freq_map = cues_outcomes(event_file,
                                                      number_of_processes=2)
    cues = sorted(cue_freq_map.keys())
    cue_id_map = {cue: ii for ii, cue in enumerate(cues)}

    # reduce the number of outcomes through bandsampling
    outcome_freq_map_filtered = bandsample(outcome_freq_map, 50, cutoff=1,
                                           seed=None)
    outcomes = sorted(outcome_freq_map_filtered.keys())
    outcome_id_map = {outcome: nn for nn, outcome in enumerate(outcomes)}

    # filter the event file by the reduced set of outcomes
    event_file_filtered = event_file + ".filtered"
    filter_event_file(event_file, event_file_filtered, keep_outcomes=outcomes)

    # TODO this is not working at the moment
    # create binary event files
    # path_name = event_file_filtered + ".events"
    # create_binary_event_files(event_file_filtered, path_name, cue_id_map,
    #                           outcome_id_map, sort_within_event=False,
    #                           number_of_processes=2, events_per_file=1000,
    #                           verbose=True)
    # with pytest.raises(IOError):
    #     create_binary_event_files(event_file_filtered, path_name, cue_id_map,
    #                               outcome_id_map, sort_within_event=False,
    #                               number_of_processes=2, events_per_file=1000,
    #                               verbose=True)
    # # overwrite=True
    # create_binary_event_files(event_file_filtered, path_name, cue_id_map,
    #                           outcome_id_map, sort_within_event=False,
    #                           number_of_processes=2, events_per_file=1000,
    #                           overwrite=True, verbose=True)

    # clean everything up
    os.remove(event_file)
    os.remove(event_file_filtered)

def test_bandsample():
    resource_file = os.path.join(
        TEST_ROOT, "resources/event_file_trigrams_to_word.tab.gz")
    _, _, outcome_freq_map = cues_outcomes(resource_file,
                                           number_of_processes=2)
    outcome_freq_map_filtered = bandsample(outcome_freq_map, 50, cutoff=1,
                                           seed=None, verbose=False)
    assert len(outcome_freq_map_filtered) == 50

    reference_file = os.path.join(TEST_ROOT,
                                  'reference/bandsampled_outcomes.tab')
    try:
        outcome_freq_map_filtered_reference = load_counter(reference_file)
    except FileNotFoundError:
        # no reference yet: store the current sample for manual inspection,
        # then fail the test
        temp_file = os.path.join(TEST_ROOT, 'temp/bandsampled_outcomes.tab')
        save_counter(outcome_freq_map_filtered, temp_file)
        raise
    # sanity check: the stored reference was bandsampled with the same size
    assert len(outcome_freq_map_filtered_reference) == 50

    # exercise the verbose code path
    bandsample(outcome_freq_map, 50, cutoff=1, verbose=True)

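# Reproducibility sketch (assumption: ``bandsample`` draws from a random
# number generator seeded with ``seed``, so a fixed seed should give an
# identical sample, while ``seed=None`` varies between runs):
#
#     sample_a = bandsample(outcome_freq_map, 50, cutoff=1, seed=42)
#     sample_b = bandsample(outcome_freq_map, 50, cutoff=1, seed=42)
#     assert sample_a == sample_b
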
def test_write_events():
    event_file = os.path.join(
        TEST_ROOT, "resources/event_file_trigrams_to_word.tab.gz")
    _, cue_freq_map, outcome_freq_map = cues_outcomes(event_file)
    outcomes = sorted(outcome_freq_map.keys())
    cues = sorted(cue_freq_map.keys())
    cue_id_map = {cue: ii for ii, cue in enumerate(cues)}
    outcome_id_map = {outcome: nn for nn, outcome in enumerate(outcomes)}

    events = event_generator(event_file, cue_id_map, outcome_id_map,
                             sort_within_event=True)
    file_name = os.path.join(TEST_ROOT, "temp/events.bin")
    with pytest.raises(StopIteration):
        write_events(events, file_name, remove_duplicates=True)
    os.remove(file_name)

    # start / stop window
    events = event_generator(event_file, cue_id_map, outcome_id_map,
                             sort_within_event=True)
    n_events = write_events(events, file_name, start=10, stop=20,
                            remove_duplicates=True)
    assert n_events == 10
    os.remove(file_name)

    # no events in the requested window
    events = event_generator(event_file, cue_id_map, outcome_id_map,
                             sort_within_event=True)
    n_events = write_events(events, file_name, start=100000, stop=100010,
                            remove_duplicates=True)
    assert n_events == 0

    _job_binary_event_file(file_name=file_name, event_file=event_file,
                           cue_id_map=cue_id_map,
                           outcome_id_map=outcome_id_map,
                           sort_within_event=False, start=0, stop=10,
                           remove_duplicates=True)
    os.remove(file_name)

    # bad event file
    with pytest.raises(ValueError):
        event_bad_file = os.path.join(
            TEST_ROOT, "resources/event_file_trigrams_to_word_BAD.tab.gz")
        events = event_generator(event_bad_file, cue_id_map, outcome_id_map)
        # traverse the generator to trigger the error
        for event in events:
            pass

def test_filter_event_file():
    input_event_file = os.path.join(
        TEST_ROOT, "resources/event_file_trigrams_to_word.tab.gz")
    output_event_file = os.path.join(TEST_ROOT,
                                     "temp/event_file_filtered.tab.gz")
    cues = sorted(["#of", "of#"])
    outcomes = ["of", ]
    filter_event_file(input_event_file, output_event_file,
                      keep_cues=cues, keep_outcomes=outcomes,
                      number_of_processes=2, verbose=True)
    _, cue_freq_map, outcome_freq_map = cues_outcomes(output_event_file)
    cues_new = sorted(cue_freq_map)
    outcomes_new = sorted(outcome_freq_map)
    assert cues == cues_new
    assert outcomes == outcomes_new
    os.remove(output_event_file)

def test_read_binary_file():
    file_path = "resources/event_file_trigrams_to_word.tab.gz"
    binary_path = "binary_resources/"
    abs_file_path = os.path.join(TEST_ROOT, file_path)
    abs_binary_path = os.path.join(TEST_ROOT, binary_path)
    abs_binary_file_path = os.path.join(abs_binary_path, "events_0_0.dat")

    _, cues, outcomes = cues_outcomes(abs_file_path)
    cue_id_map = OrderedDict((cue, ii) for ii, cue in enumerate(cues.keys()))
    outcome_id_map = OrderedDict((outcome, ii)
                                 for ii, outcome in enumerate(outcomes.keys()))
    number_events = create_binary_event_files(abs_file_path, abs_binary_path,
                                              cue_id_map, outcome_id_map,
                                              overwrite=True,
                                              remove_duplicates=False)
    bin_events = read_binary_file(abs_binary_file_path)
    events = ndl.events_from_file(abs_file_path)
    events_dup = ndl.events_from_file(abs_file_path)
    assert number_events == len(list(events_dup))

    for event, bin_event in zip(events, bin_events):
        cues, outcomes = event
        bin_cues, bin_outcomes = bin_event
        if len(cues) != len(bin_cues):
            raise ValueError('Cues have different length')
        if len(outcomes) != len(bin_outcomes):
            raise ValueError('Outcomes have different length')
        for cue, bin_cue in zip(cues, bin_cues):
            assert cue_id_map[cue] == bin_cue
        for outcome, bin_outcome in zip(outcomes, bin_outcomes):
            assert outcome_id_map[outcome] == bin_outcome

    # clean everything up
    os.remove(abs_binary_file_path)