Ejemplo n.º 1
0
def test_unfound_file_error():
    """read_corpus must raise FileNotFoundError for a nonexistent path.

    NOTE: the previous Python 2 fallback (``FileNotFoundError = OSError``)
    made the name function-local, so the bare reference raised
    UnboundLocalError (a NameError subclass) even on Python 3 and the
    fallback always fired, silently widening the expected exception to
    OSError. Python 3 has FileNotFoundError built in, so no fallback is
    needed; this also matches the other copies of this test in the suite.
    """
    with pytest.raises(FileNotFoundError):
        read_corpus("foo")
Ejemplo n.º 2
0
def test_affixes():
    """The affixes found for the corpus must match the stored results."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    actual = set(lexicon.affixes())

    expected_path = os.path.join(data_dir, 'affixes_to_signatures.txt')
    with open(expected_path) as f:
        expected = set(eval(f.read()).keys())
    assert actual == expected
Ejemplo n.º 3
0
def test_words_to_sigtransforms():
    """The word-to-signature-transforms map must equal the stored version."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    actual = lexicon.words_to_sigtransforms()

    expected_path = os.path.join(data_dir, 'words_to_sigtransforms.txt')
    with open(expected_path) as f:
        expected = eval(f.read())
    assert actual == expected
Ejemplo n.º 4
0
def test_stems():
    """The set of stems must match the keys of the stored stems mapping."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    actual = set(lexicon.stems())

    expected_path = os.path.join(data_dir, 'stems_to_words.txt')
    with open(expected_path) as f:
        expected = set(eval(f.read()).keys())
    assert actual == expected
Ejemplo n.º 5
0
    def corpus_dir_dialog(self):
        """
        Pop up the "open a file" dialog and ask for which corpus text file
        to use, then load the chosen corpus and refresh the main window.
        """
        self.corpus_filename = self._get_filename_from_dialog(ftype='corpus')

        process_all_gui_events()

        # The dialog returns a non-str value when the user cancels;
        # isinstance is the idiomatic type check (replaces `type(x) != str`).
        if not isinstance(self.corpus_filename, str):
            return

        # note that self.corpus_filename is an absolute full path
        self.corpus_name = os.path.basename(self.corpus_filename)
        self.corpus_stem_name = Path(self.corpus_name).stem

        # Read the corpus and rebuild the GUI around the new lexicon.
        self.lexicon = read_corpus(self.corpus_filename)
        self.initialize_lexicon_tree()
        self.load_main_window(major_display=QWidget(),
                              parameter_window=QWidget())
        process_all_gui_events()

        self.status.clearMessage()
        self.status.showMessage(
            'Corpus selected: {}'.format(self.corpus_filename))
Ejemplo n.º 6
0
def test_wordlist_from_corpus_file():
    """The wordlist derived from the corpus must equal the stored one."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    actual = lexicon.wordlist()

    expected_path = os.path.join(data_dir, 'wordlist.txt')
    with open(expected_path) as f:
        expected = eval(f.read())
    assert actual == expected
Ejemplo n.º 7
0
def test_word_trigram_counter():
    """Word trigram counts must equal the stored counter."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    actual = lexicon.word_trigram_counter()

    expected_path = os.path.join(data_dir, 'word_trigram_counter.txt')
    with open(expected_path) as f:
        expected = eval(f.read())
    assert actual == expected
Ejemplo n.º 8
0
def test_neighbor_graph():
    """At least half of the expected neighbor edges must be recovered."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    result_graph = lexicon.neighbor_graph()

    # Rebuild the expected graph from the stored word-to-neighbors map.
    expected_graph = nx.Graph()
    neighbors_path = os.path.join(data_dir, 'words_to_neighbors.txt')
    with open(neighbors_path) as f:
        words_to_neighbors = eval(f.read())

    for word, neighbors in words_to_neighbors.items():
        for neighbor in neighbors:
            expected_graph.add_edge(word, neighbor)

    result_edges = set(result_graph.edges())
    expected_edges = set(expected_graph.edges())

    # Set intersection replaces the explicit hit-counting loop.
    hit_ratio = len(result_edges & expected_edges) / len(expected_edges)

    assert hit_ratio > 0.5
Ejemplo n.º 9
0
def test_predecessors():
    """The predecessors mapping must equal the stored version."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    actual = lexicon.predecessors()

    expected_path = os.path.join(data_dir, 'predecessors.txt')
    with open(expected_path) as f:
        expected = eval(f.read())
    assert actual == expected
Ejemplo n.º 10
0
def test_affixes():
    """The affixes found for the corpus must match the stored results."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    actual = set(lexicon.affixes())

    expected_path = os.path.join(data_dir, 'affixes_to_signatures.txt')
    with open(expected_path) as f:
        expected = set(eval(f.read()).keys())
    assert actual == expected
Ejemplo n.º 11
0
def test_words_to_sigtransforms():
    """The word-to-signature-transforms map must equal the stored version."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    actual = lexicon.words_to_sigtransforms()

    expected_path = os.path.join(data_dir, 'words_to_sigtransforms.txt')
    with open(expected_path) as f:
        expected = eval(f.read())
    assert actual == expected
Ejemplo n.º 12
0
def test_stems():
    """The set of stems must match the keys of the stored stems mapping."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    actual = set(lexicon.stems())

    expected_path = os.path.join(data_dir, 'stems_to_words.txt')
    with open(expected_path) as f:
        expected = set(eval(f.read()).keys())
    assert actual == expected
Ejemplo n.º 13
0
def test_words_to_neighbors():
    """Most words' neighbor sets should largely agree with the stored data."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    n_neighbors = lexicon.parameters()['n_neighbors']
    actual = lexicon.words_to_neighbors()

    expected_path = os.path.join(data_dir, 'words_to_neighbors.txt')
    with open(expected_path) as f:
        expected = eval(f.read())

    # A word "matches" when its computed and stored neighbor sets overlap
    # in all but at most four members.
    matches = sum(
        1 for word in actual
        if len(set(actual[word]) & set(expected[word])) >= n_neighbors - 4)

    # Require at least half of the words to match for the test to pass.
    assert matches / len(actual) >= 0.5
Ejemplo n.º 14
0
def test_contexts_to_words():
    """The contexts-to-words mapping must equal the stored version."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    actual = lexicon.contexts_to_words()

    expected_path = os.path.join(data_dir, 'contexts_to_words.txt')
    with open(expected_path) as f:
        expected = eval(f.read())
    assert actual == expected
Ejemplo n.º 15
0
def test_broken_words_right_to_left():
    """Right-to-left word breaking must equal the stored results."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    actual = lexicon.broken_words_right_to_left()

    expected_path = os.path.join(data_dir,
                                 'broken_words_right_to_left.txt')
    with open(expected_path) as f:
        expected = eval(f.read())
    assert actual == expected
Ejemplo n.º 16
0
def linguistica_data(pickle_name):
    """Compute stems-to-signatures for the corpus and pickle the result.

    :param pickle_name: file name for the pickle, written into the local
        ``dump`` directory next to this script.
    :return: the stems-to-signatures mapping that was pickled.
    """
    print('Getting lexicon...')
    lex = lxa.read_corpus(corpus_path, min_stem_length=3, max_affix_length=2,
                          min_sig_count=2)
    print('Stems to signatures...')
    signatures = lex.stems_to_signatures()
    print('Pickling...')
    # Build the dump path with os.path.join rather than '/'-concatenation,
    # so separators are handled portably.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    dump_path = os.path.join(script_dir, 'dump', pickle_name)
    with open(dump_path, 'wb') as f:
        pickle.dump(signatures, f)
    return signatures
Ejemplo n.º 17
0
def test_file_path_type_error():
    """A non-string file path must raise TypeError."""
    with pytest.raises(TypeError):
        read_corpus(123)
Ejemplo n.º 18
0
def test_file_path_type_error():
    """Passing a non-string (here an int) as the path must raise TypeError."""
    with pytest.raises(TypeError):
        read_corpus(123)
Ejemplo n.º 19
0
def test_output_all_results():
    """Running all modules and writing outputs should complete without errors."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    lexicon.run_all_modules()
    lexicon.output_all_results(test=True)
Ejemplo n.º 20
0
def test_unfound_parameter_error():
    """An unknown keyword parameter must raise KeyError."""
    with pytest.raises(KeyError):
        read_corpus(corpus_path, non_existing_parameter=3)
Ejemplo n.º 21
0
"""Pretty-print the stems found in the Arabic sample corpus."""
import linguistica as lxa
import pprint

lexicon = lxa.read_corpus('linguistica/datasets/Arabic.dx1',
                          max_word_tokens=50000)

printer = pprint.PrettyPrinter(indent=4)
printer.pprint(lexicon.stems())
Ejemplo n.º 22
0
def test_word_phonology_dict():
    """Accessing the word phonology dict should not raise.

    TODO: only a smoke test — compare against stored data eventually.
    """
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    lexicon.word_phonology_dict()
Ejemplo n.º 23
0
def test_word_phonology_dict():
    """Smoke test: building the word phonology dict should not raise."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    lexicon.word_phonology_dict()
Ejemplo n.º 24
0
def test_unfound_file_error():
    """A path that does not exist must raise FileNotFoundError."""
    with pytest.raises(FileNotFoundError):
        read_corpus("foo")
Ejemplo n.º 25
0
def test_change_parameters():
    """change_parameters must accept a known parameter and return None."""
    lexicon = read_corpus(corpus_path)
    assert lexicon.change_parameters(min_stem_length=4) is None
Ejemplo n.º 26
0
def test_output_all_results():
    """Smoke test: all modules run and all results are written without errors."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    lexicon.run_all_modules()
    lexicon.output_all_results(test=True)
Ejemplo n.º 27
0
def test_unfound_file_error():
    """Reading a nonexistent corpus file must raise FileNotFoundError."""
    with pytest.raises(FileNotFoundError):
        read_corpus("foo")
Ejemplo n.º 28
0
def main():
    """Run the Linguistica command-line workflow end to end.

    Asks the user whether the input file is a wordlist or a corpus, for the
    file path, output directory, and encoding; optionally overrides
    parameters; then runs all Linguistica modules and writes the results to
    the output directory.
    """
    print(
        '\n================================================================\n'
        'Welcome to Linguistica {}!\n'
        '================================================================'.
        format(lxa_version))

    # --------------------------------------------------------------------------
    # determine if file is a wordlist or a corpus text

    use_wordlist = determine_use_wordlist()

    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # get file path

    file_abspath = get_file_abspath()

    print('\nFull file path:\n{}'.format(file_abspath))
    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # determine output directory
    # default: an 'lxa_outputs' folder next to the input file

    output_dir = os.path.join(os.path.dirname(file_abspath), 'lxa_outputs')

    print('\nDefault output directory:\n{}'.format(output_dir))

    output_dir = get_output_dir(output_dir)

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # change encoding, if instructed

    encoding = get_encoding()

    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # create the Linguistica object

    if use_wordlist:
        lxa_object = lxa.read_wordlist(file_abspath, encoding=encoding)
    else:
        lxa_object = lxa.read_corpus(file_abspath, encoding=encoding)

    # --------------------------------------------------------------------------
    # change parameters, if instructed

    print('\nParameters:\n{}'.format(pformat(lxa_object.parameters())))

    new_parameter_value_pairs = get_new_parameters()

    if new_parameter_value_pairs:
        lxa_object.change_parameters(**dict(new_parameter_value_pairs))

        print('\nParameters after the changes:\n{}'.format(
            pformat(lxa_object.parameters())))

    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # run all Linguistica modules on the given file

    print('\nRunning all Linguistica modules on the given file:')

    lxa_object.run_all_modules(verbose=True)

    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # output results as files

    print('\nGenerating output files...\n')

    lxa_object.output_all_results(directory=output_dir, verbose=True)

    print('\nResults are in ' + output_dir)
Ejemplo n.º 29
0
def test_use_default_parameters():
    """use_default_parameters must succeed and return None."""
    lexicon = read_corpus(corpus_path)
    assert lexicon.use_default_parameters() is None
Ejemplo n.º 30
0
def test_reset():
    """reset must succeed and return None."""
    lexicon = read_corpus(corpus_path)
    assert lexicon.reset() is None
Ejemplo n.º 31
0
def test_use_default_parameters():
    """Restoring default parameters must succeed and return None."""
    lexicon = read_corpus(corpus_path)
    assert lexicon.use_default_parameters() is None
Ejemplo n.º 32
0
def test_biphone_dict():
    """Smoke test: building the biphone dict should not raise."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    lexicon.biphone_dict()
Ejemplo n.º 33
0
def test_unfound_parameter_error():
    """read_corpus must raise KeyError for an unrecognized parameter name."""
    with pytest.raises(KeyError):
        read_corpus(corpus_path, non_existing_parameter=3)
Ejemplo n.º 34
0
def test_change_parameters_with_error():
    """change_parameters must raise KeyError for an unknown parameter."""
    lexicon = read_corpus(corpus_path)
    with pytest.raises(KeyError):
        lexicon.change_parameters(non_existing_parameter=4)
Ejemplo n.º 35
0
def test_change_parameters():
    """Setting a valid parameter via change_parameters must return None."""
    lexicon = read_corpus(corpus_path)
    assert lexicon.change_parameters(min_stem_length=4) is None
Ejemplo n.º 36
0
def test_reset():
    """Resetting the lexicon object must succeed and return None."""
    lexicon = read_corpus(corpus_path)
    assert lexicon.reset() is None
Ejemplo n.º 37
0
def test_change_parameters_with_error():
    """An unknown parameter name passed to change_parameters raises KeyError."""
    lexicon = read_corpus(corpus_path)
    with pytest.raises(KeyError):
        lexicon.change_parameters(non_existing_parameter=4)
Ejemplo n.º 38
0
def test_run_manifold_module():
    """Smoke test: the manifold module should run without errors."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    lexicon.run_manifold_module()
Ejemplo n.º 39
0
def test_run_manifold_module():
    """Running the manifold module on the corpus must not raise."""
    lexicon = lxa.read_corpus(corpus_path, max_word_tokens=50000)
    lexicon.run_manifold_module()
Ejemplo n.º 40
0
def main():
    """Interactive command-line entry point for Linguistica.

    Prompts for the input kind (wordlist vs. corpus), file path, output
    directory, and encoding; allows parameter overrides; then runs every
    Linguistica module and writes all results to the chosen directory.
    """
    print('\n================================================================\n'
          'Welcome to Linguistica {}!\n'
          '================================================================'
          .format(lxa_version))

    # --------------------------------------------------------------------------
    # determine if file is a wordlist or a corpus text

    use_wordlist = determine_use_wordlist()

    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # get file path

    file_abspath = get_file_abspath()

    print('\nFull file path:\n{}'.format(file_abspath))
    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # determine output directory
    # default: an 'lxa_outputs' folder next to the input file

    output_dir = os.path.join(os.path.dirname(file_abspath), 'lxa_outputs')

    print('\nDefault output directory:\n{}'.format(output_dir))

    output_dir = get_output_dir(output_dir)

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # change encoding, if instructed

    encoding = get_encoding()

    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # create the Linguistica object

    if use_wordlist:
        lxa_object = lxa.read_wordlist(file_abspath, encoding=encoding)
    else:
        lxa_object = lxa.read_corpus(file_abspath, encoding=encoding)

    # --------------------------------------------------------------------------
    # change parameters, if instructed

    print('\nParameters:\n{}'.format(pformat(lxa_object.parameters())))

    new_parameter_value_pairs = get_new_parameters()

    if new_parameter_value_pairs:
        lxa_object.change_parameters(**dict(new_parameter_value_pairs))

        print('\nParameters after the changes:\n{}'
              .format(pformat(lxa_object.parameters())))

    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # run all Linguistica modules on the given file

    print('\nRunning all Linguistica modules on the given file:')

    lxa_object.run_all_modules(verbose=True)

    print('--------------------------------------------')

    # --------------------------------------------------------------------------
    # output results as files

    print('\nGenerating output files...\n')

    lxa_object.output_all_results(directory=output_dir, verbose=True)

    print('\nResults are in ' + output_dir)