def test_unfound_file_error():
    """Reading a nonexistent corpus path must raise FileNotFoundError.

    The old Python 2 shim (rebinding FileNotFoundError to OSError) was
    broken: assigning the name inside the function made it a local, so the
    bare reference always raised UnboundLocalError (a NameError subclass)
    and the shim took effect even on Python 3. Since FileNotFoundError is
    an OSError subclass the test still passed, but the shim was dead
    weight; test the builtin directly.
    """
    with pytest.raises(FileNotFoundError):
        read_corpus("foo")
def test_affixes():
    """Affix set computed from the corpus matches the fixture's affix keys."""
    lxa_object = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    test_object = set(lxa_object.affixes())

    expected_object_path = os.path.join(data_dir, 'affixes_to_signatures.txt')
    # eval() is acceptable only because the fixture is trusted test data;
    # the context manager fixes the previously leaked file handle.
    with open(expected_object_path) as f:
        expected_object = set(eval(f.read()).keys())

    assert test_object == expected_object
def test_words_to_sigtransforms():
    """words_to_sigtransforms() matches the stored fixture mapping."""
    lxa_object = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    test_object = lxa_object.words_to_sigtransforms()

    expected_object_path = os.path.join(data_dir, 'words_to_sigtransforms.txt')
    # eval() is acceptable only for trusted fixture data; the context
    # manager fixes the previously leaked file handle.
    with open(expected_object_path) as f:
        expected_object = eval(f.read())

    assert test_object == expected_object
def test_stems():
    """Stem set computed from the corpus matches the fixture's stem keys."""
    lxa_object = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    test_object = set(lxa_object.stems())

    expected_object_path = os.path.join(data_dir, 'stems_to_words.txt')
    # eval() is acceptable only for trusted fixture data; the context
    # manager fixes the previously leaked file handle.
    with open(expected_object_path) as f:
        expected_object = set(eval(f.read()).keys())

    assert test_object == expected_object
def corpus_dir_dialog(self):
    """Pop up the "open a file" dialog and ask for which corpus text file to use.

    Loads the chosen corpus into a new lexicon, rebuilds the lexicon tree
    and main window, and shows the selection in the status bar. Returns
    early (doing nothing) if the user cancels the dialog.
    """
    self.corpus_filename = self._get_filename_from_dialog(ftype='corpus')
    process_all_gui_events()

    # Dialog cancelled (or returned a non-path value): nothing to load.
    # isinstance() is the idiomatic type check, replacing `type(...) != str`.
    if not isinstance(self.corpus_filename, str):
        return

    # note that self.corpus_filename is an absolute full path
    self.corpus_name = os.path.basename(self.corpus_filename)
    self.corpus_stem_name = Path(self.corpus_name).stem

    self.lexicon = read_corpus(self.corpus_filename)

    self.initialize_lexicon_tree()
    self.load_main_window(major_display=QWidget(), parameter_window=QWidget())
    process_all_gui_events()

    self.status.clearMessage()
    self.status.showMessage(
        'Corpus selected: {}'.format(self.corpus_filename))
def test_wordlist_from_corpus_file():
    """Wordlist derived from the corpus matches the fixture wordlist."""
    lxa_object = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    test_object = lxa_object.wordlist()

    expected_object_path = os.path.join(data_dir, 'wordlist.txt')
    # eval() is acceptable only for trusted fixture data; the context
    # manager fixes the previously leaked file handle.
    with open(expected_object_path) as f:
        expected_object = eval(f.read())

    assert test_object == expected_object
def test_word_trigram_counter():
    """Word trigram counts match the stored fixture counts."""
    lxa_object = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    test_object = lxa_object.word_trigram_counter()

    expected_object_path = os.path.join(data_dir, 'word_trigram_counter.txt')
    # eval() is acceptable only for trusted fixture data; the context
    # manager fixes the previously leaked file handle.
    with open(expected_object_path) as f:
        expected_object = eval(f.read())

    assert test_object == expected_object
def test_neighbor_graph():
    """More than half of the computed neighbor-graph edges appear in the fixture graph."""
    lxa_object = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    test_object = lxa_object.neighbor_graph()

    # Rebuild the expected graph from the word -> neighbors fixture.
    expected_object = nx.Graph()
    words_to_neighbors_path = os.path.join(data_dir, 'words_to_neighbors.txt')
    # eval() is acceptable only for trusted fixture data; the context
    # manager fixes the previously leaked file handle.
    with open(words_to_neighbors_path) as f:
        words_to_neighbors = eval(f.read())
    for word, neighbors in words_to_neighbors.items():
        for neighbor in neighbors:
            expected_object.add_edge(word, neighbor)

    test_edges = set(test_object.edges())
    expected_edges = set(expected_object.edges())

    # Neighbor computation is approximate, so require only a majority of
    # expected edges to be recovered rather than exact equality.
    number_of_hits = sum(1 for edge in test_edges if edge in expected_edges)
    hit_ratio = number_of_hits / len(expected_edges)
    assert hit_ratio > 0.5
def test_predecessors():
    """predecessors() matches the stored fixture mapping."""
    lxa_object = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    test_object = lxa_object.predecessors()

    expected_object_path = os.path.join(data_dir, 'predecessors.txt')
    # eval() is acceptable only for trusted fixture data; the context
    # manager fixes the previously leaked file handle.
    with open(expected_object_path) as f:
        expected_object = eval(f.read())

    assert test_object == expected_object
def test_words_to_neighbors():
    """At least half the words share a similar neighbor set with the fixture."""
    lxa_object = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    number_of_neighbors = lxa_object.parameters()['n_neighbors']

    test_object = lxa_object.words_to_neighbors()
    number_of_words = len(test_object)

    expected_object_path = os.path.join(data_dir, 'words_to_neighbors.txt')
    # eval() is acceptable only for trusted fixture data; the context
    # manager fixes the previously leaked file handle.
    with open(expected_object_path) as f:
        expected_object = eval(f.read())

    # test if each word has a similar set of neighbor words
    # across test_object and expected_object:
    # a word counts as correct if its test and expected neighbor sets
    # overlap in all but at most 4 of the n_neighbors entries.
    correct_count = 0
    for word in test_object:
        overlap = set(test_object[word]) & set(expected_object[word])
        if len(overlap) >= (number_of_neighbors - 4):
            correct_count += 1

    correct_ratio = correct_count / number_of_words

    # test if the ratio of words having a similar set of neighbor words is
    # high enough to pass the test
    assert correct_ratio >= 0.5
def test_contexts_to_words():
    """contexts_to_words() matches the stored fixture mapping."""
    lxa_object = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    test_object = lxa_object.contexts_to_words()

    expected_object_path = os.path.join(data_dir, 'contexts_to_words.txt')
    # eval() is acceptable only for trusted fixture data; the context
    # manager fixes the previously leaked file handle.
    with open(expected_object_path) as f:
        expected_object = eval(f.read())

    assert test_object == expected_object
def test_broken_words_right_to_left():
    """broken_words_right_to_left() matches the stored fixture mapping."""
    lxa_object = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    test_object = lxa_object.broken_words_right_to_left()

    expected_object_path = os.path.join(data_dir,
                                        'broken_words_right_to_left.txt')
    # eval() is acceptable only for trusted fixture data; the context
    # manager fixes the previously leaked file handle.
    with open(expected_object_path) as f:
        expected_object = eval(f.read())

    assert test_object == expected_object
def linguistica_data(pickle_name):
    """Compute stems-to-signatures for the corpus and pickle them under dump/.

    :param pickle_name: file name for the pickle, created in the ``dump``
        directory next to this script.
    :return: the stems-to-signatures mapping.
    """
    print('Getting lexicon...')
    lex = lxa.read_corpus(corpus_path, min_stem_length=3, max_affix_length=2,
                          min_sig_count=2)

    print('Stems to signatures...')
    signatures = lex.stems_to_signatures()

    print('Pickling...')
    # Build the path with os.path.join instead of '/'-string concatenation
    # so it is correct on every platform.
    dump_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'dump')
    dump_path = os.path.join(dump_dir, pickle_name)
    with open(dump_path, 'wb') as f:
        pickle.dump(signatures, f)

    return signatures
def test_file_path_type_error():
    """A non-string corpus path must be rejected with TypeError."""
    bad_path = 123
    with pytest.raises(TypeError):
        read_corpus(bad_path)
def test_output_all_results():
    """Smoke test: running every module and writing all results raises nothing."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    lexicon.run_all_modules()
    lexicon.output_all_results(test=True)
    assert True  # reaching this point means no exception was raised
def test_unfound_parameter_error():
    """Passing an unknown parameter name to read_corpus raises KeyError."""
    unknown_kwargs = {'non_existing_parameter': 3}
    with pytest.raises(KeyError):
        read_corpus(corpus_path, **unknown_kwargs)
# Demo script: read the bundled Arabic corpus and pretty-print its stems.
import linguistica as lxa
import pprint

# Cap the corpus at 50,000 word tokens to keep the demo quick.
lxa_object = lxa.read_corpus('linguistica/datasets/Arabic.dx1',
                             max_word_tokens=50000)

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(lxa_object.stems())
def test_word_phonology_dict():
    """Smoke test: building the word phonology dict raises no errors."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    _ = lexicon.word_phonology_dict()
    assert True  # TODO: only checking that no exception is raised for now
def test_word_phonology_dict():
    """Smoke test: word_phonology_dict() runs to completion without raising."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    lexicon.word_phonology_dict()
def test_unfound_file_error():
    """Reading a nonexistent corpus path raises FileNotFoundError."""
    missing_path = "foo"
    with pytest.raises(FileNotFoundError):
        read_corpus(missing_path)
def test_change_parameters():
    """change_parameters with a valid key succeeds and returns None."""
    lexicon = read_corpus(corpus_path)
    assert lexicon.change_parameters(min_stem_length=4) is None
def main():
    """Drive the Linguistica CLI end to end.

    Asks the user for the input file kind, path, output directory, encoding,
    and any parameter overrides; then runs every Linguistica module on the
    file and writes all results into the output directory.
    """
    print(
        '\n================================================================\n'
        'Welcome to Linguistica {}!\n'
        '================================================================'
        .format(lxa_version))

    # --------------------------------------------------------------------------
    # determine if file is a wordlist or a corpus text

    use_wordlist = determine_use_wordlist()
    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # get file path

    file_abspath = get_file_abspath()
    print('\nFull file path:\n{}'.format(file_abspath))
    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # determine output directory

    output_dir = os.path.join(os.path.dirname(file_abspath), 'lxa_outputs')
    print('\nDefault output directory:\n{}'.format(output_dir))
    output_dir = get_output_dir(output_dir)
    # makedirs(exist_ok=True) avoids the check-then-create race of the old
    # isdir() + mkdir() pair and tolerates missing parent directories.
    os.makedirs(output_dir, exist_ok=True)
    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # change encoding, if instructed

    encoding = get_encoding()
    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # create the Linguistica object

    if use_wordlist:
        lxa_object = lxa.read_wordlist(file_abspath, encoding=encoding)
    else:
        lxa_object = lxa.read_corpus(file_abspath, encoding=encoding)

    # --------------------------------------------------------------------------
    # change parameters, if instructed

    print('\nParameters:\n{}'.format(pformat(lxa_object.parameters())))
    new_parameter_value_pairs = get_new_parameters()
    if new_parameter_value_pairs:
        lxa_object.change_parameters(**dict(new_parameter_value_pairs))
        print('\nParameters after the changes:\n{}'.format(
            pformat(lxa_object.parameters())))
    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # run all Linguistica modules on the given file

    print('\nRunning all Linguistica modules on the given file:')
    lxa_object.run_all_modules(verbose=True)
    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # output results as files

    print('\nGenerating output files...\n')
    lxa_object.output_all_results(directory=output_dir, verbose=True)
    print('\nResults are in ' + output_dir)
def test_use_default_parameters():
    """Restoring default parameters succeeds and returns None."""
    lexicon = read_corpus(corpus_path)
    assert lexicon.use_default_parameters() is None
def test_reset():
    """reset() completes without error and returns None."""
    lexicon = read_corpus(corpus_path)
    assert lexicon.reset() is None
def test_biphone_dict():
    """Smoke test: biphone_dict() runs to completion without raising."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    lexicon.biphone_dict()
def test_change_parameters_with_error():
    """change_parameters with an unknown key raises KeyError."""
    lexicon = read_corpus(corpus_path)
    with pytest.raises(KeyError):
        lexicon.change_parameters(non_existing_parameter=4)
def test_run_manifold_module():
    """Smoke test: the manifold module runs to completion without errors."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    lexicon.run_manifold_module()
    assert True  # reaching this point means no exception was raised
def main():
    """Drive the Linguistica CLI end to end.

    Asks the user for the input file kind, path, output directory, encoding,
    and any parameter overrides; then runs every Linguistica module on the
    file and writes all results into the output directory.
    """
    print('\n================================================================\n'
          'Welcome to Linguistica {}!\n'
          '================================================================'
          .format(lxa_version))

    # --------------------------------------------------------------------------
    # determine if file is a wordlist or a corpus text

    use_wordlist = determine_use_wordlist()
    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # get file path

    file_abspath = get_file_abspath()
    print('\nFull file path:\n{}'.format(file_abspath))
    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # determine output directory

    output_dir = os.path.join(os.path.dirname(file_abspath), 'lxa_outputs')
    print('\nDefault output directory:\n{}'.format(output_dir))
    output_dir = get_output_dir(output_dir)
    # makedirs(exist_ok=True) avoids the check-then-create race of the old
    # isdir() + mkdir() pair and tolerates missing parent directories.
    os.makedirs(output_dir, exist_ok=True)
    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # change encoding, if instructed

    encoding = get_encoding()
    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # create the Linguistica object

    if use_wordlist:
        lxa_object = lxa.read_wordlist(file_abspath, encoding=encoding)
    else:
        lxa_object = lxa.read_corpus(file_abspath, encoding=encoding)

    # --------------------------------------------------------------------------
    # change parameters, if instructed

    print('\nParameters:\n{}'.format(pformat(lxa_object.parameters())))
    new_parameter_value_pairs = get_new_parameters()
    if new_parameter_value_pairs:
        lxa_object.change_parameters(**dict(new_parameter_value_pairs))
        print('\nParameters after the changes:\n{}'
              .format(pformat(lxa_object.parameters())))
    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # run all Linguistica modules on the given file

    print('\nRunning all Linguistica modules on the given file:')
    lxa_object.run_all_modules(verbose=True)
    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # output results as files

    print('\nGenerating output files...\n')
    lxa_object.output_all_results(directory=output_dir, verbose=True)
    print('\nResults are in ' + output_dir)