Beispiel #1
0
 def test_tag_ner_str_list_latin(self):
     """Test make_ner(), str, list."""
     text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
     jv_replacer = JVReplacer()
     text_str_iu = jv_replacer.replace(text_str)
     tokens = ner.tag_ner("lat", input_text=text_str_iu, output_type=list)
     target = [
         ("ut", ),
         ("Uenus", "Entity"),
         (",", ),
         ("ut", ),
         ("Sirius", "Entity"),
         (",", ),
         ("ut", ),
         ("Spica", "Entity"),
         (",", ),
         ("ut", ),
         ("aliae", ),
         ("quae", ),
         ("primae", ),
         ("dicuntur", ),
         ("esse", ),
         ("mangitudinis", ),
         (".", ),
     ]
     self.assertEqual(tokens, target)
Beispiel #2
0
def get_tags(inputfile, outputfile):
    try:
        f = open(inputfile, 'r', encoding="utf-8")
        #f = codecs.open(inputfile, 'r', encoding='utf-8')
        try:
            x = f.read()

        except IOError as e:
            logfile.write("I/O error({0}): {1}\n".format(e.errno, e.strerror))
        except:  #handle other exceptions such as attribute errors
            logfile.write("Unexpected error:\n" + sys.exc_info()[0] + "\n")

        f.close()
        #print("x:",x)
        j = JVReplacer()
        x = x.lower()
        x = j.replace(x)
        ofile = open(outputfile, "w", encoding="utf-8")
        ofile.write(x)
        ofile.close()
        logfile.write("processing done\n")

    except IOError as e:
        logfile.write("I/O error({0}): {1}\n".format(e.errno, e.strerror))
    except:  #handle other exceptions such as attribute errors
        logfile.write("Unexpected error:" + sys.exc_info()[0] + "\n")
Beispiel #3
0
def stage1(text):
    '''
    Lowercases text, normalizes spelling by converting 'j' to 'i'
    and 'v' to 'u', removes punctuation.
    '''
    text = JVReplacer().replace(text.lower())
    words = re.split('\W', text)
    return ' '.join([word for word in words if word != ''])
Beispiel #4
0
 def test_tag_ner_list_str_latin(self):
     """Test make_ner(), list, str."""
     text_list = ['ut', 'Venus', 'Sirius']
     jv_replacer = JVReplacer()
     text_list_iu = [jv_replacer.replace(x) for x in text_list]
     text = ner.tag_ner('latin', input_text=text_list_iu, output_type=str)
     target = ' ut Uenus/Entity Sirius/Entity'
     self.assertEqual(text, target)
Beispiel #5
0
 def test_tag_ner_str_str_latin(self):
     """Test make_ner(), str, str."""
     jv_replacer = JVReplacer()
     text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
     text_str_iu = jv_replacer.replace(text_str)
     text = ner.tag_ner("lat", input_text=text_str_iu, output_type=str)
     target = " ut Uenus/Entity, ut Sirius/Entity, ut Spica/Entity, ut aliae quae primae dicuntur esse mangitudinis."
     self.assertEqual(text, target)
Beispiel #6
0
 def test_tag_ner_list_list_latin(self):
     """Test make_ner(), list, list."""
     text_list = ['ut', 'Venus', 'Sirius']
     jv_replacer = JVReplacer()
     text_list_iu = [jv_replacer.replace(x) for x in text_list]
     tokens = ner.tag_ner('latin', input_text=text_list_iu, output_type=list)
     target = [('ut',), ('Uenus', 'Entity'), ('Sirius', 'Entity')]
     self.assertEqual(tokens, target)
Beispiel #7
0
 def test_tag_ner_list_list_latin(self):
     """Test make_ner(), list, list."""
     text_list = ['ut', 'Venus', 'Sirius']
     jv_replacer = JVReplacer()
     text_list_iu = [jv_replacer.replace(x) for x in text_list]
     tokens = ner.tag_ner('latin', input_text=text_list_iu, output_type=list)
     target = [('ut',), ('Uenus', 'Entity'), ('Sirius', 'Entity')]
     self.assertEqual(tokens, target)
Beispiel #8
0
 def test_tag_ner_str_list_latin(self):
     """Test make_ner(), str, list."""
     text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
     jv_replacer = JVReplacer()
     text_str_iu = jv_replacer.replace(text_str)
     tokens = ner.tag_ner('latin', input_text=text_str_iu, output_type=list)
     target = [('ut',), ('Uenus', 'Entity'), (',',), ('ut',), ('Sirius', 'Entity'), (',',), ('ut',), ('Spica', 'Entity'), (',',), ('ut',), ('aliae',), ('quae',), ('primae',), ('dicuntur',), ('esse',), ('mangitudinis',), ('.',)]
     self.assertEqual(tokens, target)
Beispiel #9
0
 def test_tag_ner_str_list_latin(self):
     """Test make_ner(), str, list."""
     text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
     jv_replacer = JVReplacer()
     text_str_iu = jv_replacer.replace(text_str)
     tokens = ner.tag_ner('latin', input_text=text_str_iu, output_type=list)
     target = [('ut',), ('Uenus', 'Entity'), (',',), ('ut',), ('Sirius', 'Entity'), (',',), ('ut',), ('Spica', 'Entity'), (',',), ('ut',), ('aliae',), ('quae',), ('primae',), ('dicuntur',), ('esse',), ('mangitudinis',), ('.',)]
     self.assertEqual(tokens, target)
Beispiel #10
0
 def test_tag_ner_list_str_latin(self):
     """Test make_ner(), list, str."""
     text_list = ["ut", "Venus", "Sirius"]
     jv_replacer = JVReplacer()
     text_list_iu = [jv_replacer.replace(x) for x in text_list]
     text = ner.tag_ner("lat", input_text=text_list_iu, output_type=str)
     target = " ut Uenus/Entity Sirius/Entity"
     self.assertEqual(text, target)
Beispiel #11
0
 def test_tag_ner_list_list_latin(self):
     """Test make_ner(), list, list."""
     text_list = ["ut", "Venus", "Sirius"]
     jv_replacer = JVReplacer()
     text_list_iu = [jv_replacer.replace(x) for x in text_list]
     tokens = ner.tag_ner("lat", input_text=text_list_iu, output_type=list)
     target = [("ut", ), ("Uenus", "Entity"), ("Sirius", "Entity")]
     self.assertEqual(tokens, target)
Beispiel #12
0
 def test_tag_ner_list_str_latin(self):
     """Test make_ner(), list, str."""
     text_list = ['ut', 'Venus', 'Sirius']
     jv_replacer = JVReplacer()
     text_list_iu = [jv_replacer.replace(x) for x in text_list]
     text = ner.tag_ner('latin', input_text=text_list_iu, output_type=str)
     target = ' ut Uenus/Entity Sirius/Entity'
     self.assertEqual(text, target)
    def __init__(
            self, pathDF, language='english',
            dataType='pickle', dataIndex='multi', colname='text',
            maxValues=2500, pathMeta=False, pathType=False, showLogging=False,
            model_params=(4,5,300)
            ):

        super(CorpusML, self).__init__(
            pathDF, dataType, dataIndex, colname,
            maxValues, pathMeta, pathType
            )

        if showLogging:
            logging.basicConfig(
                format='%(asctime)s : %(levelname)s : %(message)s',
                level=logging.INFO
                )

        self.model = gensim.models.Word2Vec(
            workers=model_params[0],
            min_count=model_params[1],
            size=model_params[2]
            )

        # self.model.random.seed(42)

        self.language = language

        if self.language == 'latin' or self.language == 'greek':
            from cltk.corpus.utils.importer import CorpusImporter
            corpus_importer = CorpusImporter(self.language)
            corpus_importer.import_corpus(
                '{0}_models_cltk'.format(self.language)
                )
            from cltk.stem.lemma import LemmaReplacer
            from cltk.tokenize.word import nltk_tokenize_words as tokenizer
            lemmatizer = LemmaReplacer(self.language)
            if self.language == 'latin':
                from cltk.stem.latin.j_v import JVReplacer
                from cltk.stop.latin.stops import STOPS_LIST as stopwords
                self.jvreplacer = JVReplacer()
            elif self.language == 'greek':
                from cltk.stop.greek.stops import STOPS_LIST as stopwords
        elif self.language == 'english' or 'german':
            import nltk
            nltk.download('stopwords')
            from nltk.stem import WordNetLemmatizer
            from nltk.tokenize import word_tokenize as tokenizer
            from nltk.corpus import stopwords
            stopwords = stopwords.words(self.language)
            lemmatizer = WordNetLemmatizer()
        else:
            raise ValueError(
                'Could not find lemmatizer, tokenizer,\
                 and stopwords for chosen language.')
        self.lemmatizer = lemmatizer
        self.tokenizer = tokenizer
        self.stopwords = stopwords
Beispiel #14
0
def get_sims(word, language, lemmatized=False, threshold=0.70):
    """Get similar Word2Vec terms from vocabulary or trained model.

    TODO: Add option to install corpus if not available.
    """
    # Normalize incoming word string
    jv_replacer = JVReplacer()
    if language == 'latin':
        # Note that casefold() seemingly does not work with diacritic
        # Greek, likely because of it expects single code points, not
        # diacritics. Look into global string normalization to code points
        # for all languages, especially Greek.
        word = jv_replacer.replace(word).casefold()

    model_dirs = {
        'greek': '~/cltk_data/greek/model/greek_word2vec_cltk',
        'latin': '~/cltk_data/latin/model/latin_word2vec_cltk'
    }
    assert language in model_dirs.keys(
    ), 'Langauges available with Word2Vec model: {}'.format(model_dirs.keys())
    if lemmatized:
        lemma_str = '_lemmed'
    else:
        lemma_str = ''
    model_name = '{0}_s100_w30_min5_sg{1}.model'.format(language, lemma_str)
    model_dir_abs = os.path.expanduser(model_dirs[language])
    model_path = os.path.join(model_dir_abs, model_name)
    w2v = Word2Vec()
    try:
        model = w2v.load(model_path)
    except FileNotFoundError as fnf_error:
        print(fnf_error)
        print(
            "CLTK's Word2Vec models cannot be found. Please import '{}_word2vec_cltk'."
            .format(language))
        raise
    try:
        similars = model.most_similar(word)
    except KeyError as key_err:
        print(key_err)
        possible_matches = []
        for term in model.vocab:
            if term.startswith(word[:3]):
                possible_matches.append(term)
        print(
            "The following terms in the Word2Vec model you may be looking for: '{}'."
            .format(possible_matches))
        return None
    returned_sims = []
    for similar in similars:
        if similar[1] > threshold:
            returned_sims.append(similar[0])
    if not returned_sims:
        print(
            "Matches found, but below the threshold of 'threshold={}'. Lower it to see these results."
            .format(threshold))
    return returned_sims
Beispiel #15
0
    def __init__(self, connection):
        super(LatinTokenizer, self).__init__(connection)

        # Set up patterns that will be reused
        self.jv_replacer = JVReplacer()
        self.lemmatizer = Lemmata('lemmata', 'lat')

        self.split_pattern = \
            '( / )|([\\s]+)|([^\\w' + self.diacriticals + ']+)'
Beispiel #16
0
    def __init__(self, connection):
        super(LatinTokenizer, self).__init__(connection)

        # Set up patterns that will be reused
        self.jv_replacer = JVReplacer()
        self.lemmatizer = Lemmata('lemmata', 'latin')

        self.split_pattern = \
            '[<].+[>][\s]| / | \. \. \.|\.\~\.\~\.|[^\w' + self.diacriticals + ']'
Beispiel #17
0
 def test_tag_ner_str_str_latin(self):
     """Test make_ner(), str, str."""
     jv_replacer = JVReplacer()
     text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
     jv_replacer = JVReplacer()
     text_str_iu = jv_replacer.replace(text_str)
     text = ner.tag_ner('latin', input_text=text_str_iu, output_type=str)
     target = ' ut Uenus/Entity, ut Sirius/Entity, ut Spica/Entity, ut aliae quae primae dicuntur esse mangitudinis.'
     self.assertEqual(text, target)
Beispiel #18
0
def jv_replace(text):
    """
    Will perform CLTK-based jv_replacement.
    """
    jv_replacer = JVReplacer()
    jv_normalized_text = jv_replacer.replace(text)
    # no lowercasing or Truecasing is done so far!
    # lowercasing probably won't be done but Truecasing needs bow first
    return jv_normalized_text
Beispiel #19
0
 def __init__(self):
     self.lemmatizer = Lemmata(dictionary='lemmata', language='latin')
     self.jv = JVReplacer()
     self.word_tokenizer = WordTokenizer('latin')
     self.count_dictionary = dict()
     self.punctuation_list = [
         '!', ';', ':', '?', '-', '–', '&', '*', '(', ')', '[', ']', ',',
         '"', '\''
     ]
Beispiel #20
0
def get_tags(inputfile, outputfile):
    tree = ET.ElementTree(file=inputfile)
    root = tree.getroot()
    j = JVReplacer()
    for w in root.iter('w'):
        if not w.text is None:
            w.text = w.text.lower()
            w.text = j.replace(w.text)

    tree.write(outputfile, xml_declaration=True, encoding="utf-8")
def latin_lem_replacement(input_words):
    replacer = JVReplacer()

    if type(input_words) == list:
        for i in range(len(input_words)):
            input_words[i] = normalize_word(replacer.replace(input_words[i]))
    else:
        input_words = normalize_word(replacer.replace(input_words))

    return input_words
Beispiel #22
0
def jv_transform(string_matrix: List[List[str]]) -> List[List[str]]:
    """

    :param string_matrix: a data matrix: a list wrapping a list of strings, with each sublist being a sentence.
    >>> jv_transform([['venio', 'jacet'], ['julius', 'caesar']])
    [['uenio', 'iacet'], ['iulius', 'caesar']]
    """
    jvreplacer = JVReplacer()
    return [[jvreplacer.replace(word)
             for word in sentence]
            for sentence in string_matrix]
Beispiel #23
0
 def test_roman_numeral_lemmatizer(self):
     """Test roman_numeral_lemmatizer()"""
     lemmatizer = RomanNumeralLemmatizer()
     test_str = 'i ii iii iv v vi vii vii ix x xx xxx xl l lx c cc'
     target = [('i', 'NUM'), ('ii', 'NUM'), ('iii', 'NUM'), ('iu', 'NUM'), ('u', 'NUM'), ('ui', 'NUM'), ('uii', 'NUM'), ('uii', 'NUM'), ('ix', 'NUM'), ('x', 'NUM'), ('xx', 'NUM'), ('xxx', 'NUM'), ('xl', 'NUM'), ('l', 'NUM'), ('lx', 'NUM'), ('c', 'NUM'), ('cc', 'NUM')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = test_str.split()
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #24
0
 def test_roman_numeral_lemmatizer(self):
     """Test roman_numeral_lemmatizer()"""
     lemmatizer = RomanNumeralLemmatizer()
     test_str = 'i ii iii iv v vi vii vii ix x xx xxx xl l lx c cc'
     target = [('i', 'NUM'), ('ii', 'NUM'), ('iii', 'NUM'), ('iu', 'NUM'), ('u', 'NUM'), ('ui', 'NUM'), ('uii', 'NUM'), ('uii', 'NUM'), ('ix', 'NUM'), ('x', 'NUM'), ('xx', 'NUM'), ('xxx', 'NUM'), ('xl', 'NUM'), ('l', 'NUM'), ('lx', 'NUM'), ('c', 'NUM'), ('cc', 'NUM')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = test_str.split()
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #25
0
 def test_bigram_pos_lemmatizer(self):
     train = [[('dixissem', 'dico', 'v')], [('de', 'de', 'r'), ('te', 'tu', 'p'), ('autem', 'autem', 'c'), (',', 'punc', 'u'), ('catilina', 'catilina', 'n'), (',', 'punc', 'u'), ('cum', 'cum2', 'c'), ('quiescunt', 'quiesco', 'v'), (',', 'punc', 'u'), ('probant', 'probo', 'v'), (',', 'punc', 'u'), ('cum', 'cum2', 'c'), ('patiuntur', 'patior', 'v'), (',', 'punc', 'u'), ('decernunt', 'decerno', 'v'), (',', 'punc', 'u'), ('cum', 'cum2', 'c'), ('tacent', 'taceo', 'v'), (',', 'punc', 'u'), ('clamant', 'clamo', 'v'), (',', 'punc', 'u'), ('neque', 'neque', 'c'), ('hi', 'hic', 'p'), ('solum', 'solus', 'd'), ('quorum', 'qui', 'p'), ('tibi', 'tu', 'p'), ('auctoritas', 'auctoritas', 'n'), ('est', 'sum', 'v'), ('uidelicet', 'uidelicet', 'd'), ('cara', 'carus', 'a'), (',', 'punc', 'u'), ('uita', 'uita', 'n'), ('uilissima', 'uilis', 'a'), (',', 'punc', 'u'), ('sed', 'sed', 'c'), ('etiam', 'etiam', 'c'), ('illi', 'ille', 'p'), ('equites', 'eques', 'n'), ('romani', 'romanus', 'a'), (',', 'punc', 'u'), ('honestissimi', 'honestus', 'a'), ('atque', 'atque', 'c'), ('optimi', 'bonus', 'a'), ('uiri', 'uir', 'n'), (',', 'punc', 'u'), ('ceteri', 'ceterus', 'a'), ('-que', '-que', 'c'), ('fortissimi', 'fortis', 'a'), ('ciues', 'ciuis', 'n'), ('qui', 'qui', 'p'), ('circumstant', 'circumsto', 'v'), ('senatum', 'senatus', 'n'), (',', 'punc', 'u'), ('quorum', 'qui', 'p'), ('tu', 'tu', 'p'), ('et', 'et', 'c'), ('frequentiam', 'frequentia', 'n'), ('uidere', 'uideo', 'v'), ('et', 'et', 'c'), ('studia', 'studium', 'n'), ('perspicere', 'perspicio', 'v'), ('et', 'et', 'c'), ('uoces', 'uox', 'n'), ('paulo', 'paulus', 'd'), ('ante', 'ante', 'd'), ('exaudire', 'exaudio', 'v'), ('potuisti', 'possum', 'v'), ('.', 'punc', 'u')]]
     lemmatizer = BigramPOSLemmatizer(train=train, include=['cum'])
     test_str = """Quod cum esset intellectum et animadversum fecit animo libentissimo populus Romanus"""
     target = [('quod', None), ('cum', 'cum2'), ('esset', None), ('intellectum', None), ('et', None), ('animaduersum', None), ('fecit', None), ('animo', None), ('libentissimo', None), ('populus', None), ('romanus', None)]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #26
0
 def test_identity_lemmatizer(self):
     """Test identity_lemmatizer()"""
     lemmatizer = IdentityLemmatizer()
     test_str = 'Ceterum antequam destinata componam'
     target = [('ceterum', 'ceterum'), ('antequam', 'antequam'), ('destinata', 'destinata'), ('componam', 'componam')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #27
0
 def test_latin_lemmata(self):
     """Test Lemmata class lookup() method"""
     lemmatizer = Lemmata(dictionary='lemmata', language='latin')
     test_str = 'Ceterum antequam destinata componam'
     target = [('ceterum', [('ceterus', 1.0)]), ('antequam', [('antequam', 1.0)]), ('destinata', [('destinatus', 0.25), ('destinatum', 0.25), ('destinata', 0.25), ('destino', 0.25)]), ('componam', [('compono', 1.0)])]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lookup(tokens)
     self.assertEqual(lemmas, target)
Beispiel #28
0
 def test_identity_lemmatizer(self):
     """Test identity_lemmatizer()"""
     lemmatizer = IdentityLemmatizer()
     test_str = 'Ceterum antequam destinata componam'
     target = [('ceterum', 'ceterum'), ('antequam', 'antequam'), ('destinata', 'destinata'), ('componam', 'componam')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #29
0
 def test_latin_lemmata(self):
     """Test Lemmata class lookup() method"""
     lemmatizer = Lemmata(dictionary = 'lemmata', language = 'latin')
     test_str = 'Ceterum antequam destinata componam'
     target = [('ceterum', [('ceterus', 1.0)]), ('antequam', [('antequam', 1.0)]), ('destinata', [('destinatus', 0.25), ('destinatum', 0.25), ('destinata', 0.25), ('destino', 0.25)]), ('componam', [('compono', 1.0)])]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lookup(tokens)
     self.assertEqual(lemmas, target)
Beispiel #30
0
def get_sims(word, language, lemmatized=False, threshold=0.70):
    """Get similar Word2Vec terms from vocabulary or trained model.

    TODO: Add option to install corpus if not available.
    """
    # Normalize incoming word string
    jv_replacer = JVReplacer()
    if language == "latin":
        # Note that casefold() seemingly does not work with diacritic
        # Greek, likely because of it expects single code points, not
        # diacritics. Look into global string normalization to code points
        # for all languages, especially Greek.
        word = jv_replacer.replace(word).casefold()

    model_dirs = {
        "greek": "~/cltk_data/greek/model/greek_word2vec_cltk",
        "latin": "~/cltk_data/latin/model/latin_word2vec_cltk",
    }
    assert language in model_dirs.keys(), "Langauges available with Word2Vec model: {}".format(model_dirs.keys())
    if lemmatized:
        lemma_str = "_lemmed"
    else:
        lemma_str = ""
    model_name = "{0}_s100_w30_min5_sg{1}.model".format(language, lemma_str)
    model_dir_abs = os.path.expanduser(model_dirs[language])
    model_path = os.path.join(model_dir_abs, model_name)
    w2v = Word2Vec()
    try:
        model = w2v.load(model_path)
    except FileNotFoundError as fnf_error:
        print(fnf_error)
        print("CLTK's Word2Vec models cannot be found. Please import '{}_word2vec_cltk'.".format(language))
        raise
    try:
        similars = model.most_similar(word)
    except KeyError as key_err:
        print(key_err)
        possible_matches = []
        for term in model.vocab:
            if term.startswith(word[:3]):
                possible_matches.append(term)
        print("The following terms in the Word2Vec model you may be looking for: '{}'.".format(possible_matches))
        return None
    returned_sims = []
    for similar in similars:
        if similar[1] > threshold:
            returned_sims.append(similar[0])
    if not returned_sims:
        print(
            "Matches found, but below the threshold of 'threshold={}'. Lower it to see these results.".format(threshold)
        )
    return returned_sims
Beispiel #31
0
 def test_backoff_latin_lemmatizer_verbose(self):
     """Test backoffLatinLemmatizer"""
     train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
     lemmatizer = BackoffLatinLemmatizer(verbose=True)
     test_str = """Ceterum antequam destinata componam"""
     target = [('ceterum', 'ceterum', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('antequam', 'antequam', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('destinata', 'destino', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('componam', 'compono', '<DictLemmatizer: Morpheus Lemmas>')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #32
0
 def test_regex_lemmatizer(self):
     """Test regex_lemmatizer()"""
     pattern = [(r'(\w*)abimus', 'o')]
     lemmatizer = RegexpLemmatizer(pattern)
     test_str = 'amabimus'
     target = [('amabimus', 'amo')]
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #33
0
 def test_backoff_latin_lemmatizer_verbose(self):
     """Test backoffLatinLemmatizer"""
     train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
     lemmatizer = BackoffLatinLemmatizer(verbose=True)
     test_str = """Ceterum antequam destinata componam"""
     target = [('ceterum', 'ceterum', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('antequam', 'antequam', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('destinata', 'destino', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('componam', 'compono', '<DictLemmatizer: Morpheus Lemmas>')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #34
0
 def test_regex_lemmatizer(self):
     """Test regex_lemmatizer()"""
     pattern = [(r'(\w*)abimus', 'o')]
     lemmatizer = RegexpLemmatizer(pattern)
     test_str = 'amabimus'
     target = [('amabimus', 'amo')]
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #35
0
 def test_regex_lemmatizer(self):
     """Test regex_lemmatizer()"""
     sub = [('(.)ab(o|is|it|imus|itis|unt)$', r'\1o')]
     lemmatizer = RegexpLemmatizer(sub)
     test_str = 'amabimus'
     target = [('amabimus', 'amo')]
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #36
0
 def test_model_lemmatizer(self):
     """Test model_lemmatizer()"""
     model = {'ceterum': 'ceterus', 'antequam': 'antequam', 'destinata': 'destino', 'componam': 'compono'}  # pylint: disable=line-too-long
     lemmatizer = TrainLemmatizer(model=model)
     test_str = 'Ceterum antequam destinata componam'
     target = [('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #37
0
 def test_model_lemmatizer(self):
     """Test model_lemmatizer()"""
     model = {'ceterum': 'ceterus', 'antequam': 'antequam', 'destinata': 'destino', 'componam': 'compono'}  # pylint: disable=line-too-long
     lemmatizer = TrainLemmatizer(model=model)
     test_str = 'Ceterum antequam destinata componam'
     target = [('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #38
0
 def test_regex_lemmatizer(self):
     """Test regex_lemmatizer()"""
     sub = [('(.)ab(o|is|it|imus|itis|unt)$', r'\1o')]
     lemmatizer = RegexpLemmatizer(sub)
     test_str = 'amabimus'
     target = [('amabimus', 'amo')]
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #39
0
def main():
    # setup()
    a = open('./Gratian1.txt', 'r').read()
    b = open('./Gratian2.txt', 'r').read()
    a_lemmas = process(JVReplacer().replace(a.lower()))
    b_lemmas = process(JVReplacer().replace(b.lower()))
    a_only = [lemma for lemma in a_lemmas if lemma not in b_lemmas]
    b_only = [lemma for lemma in b_lemmas if lemma not in a_lemmas]

    a_only.sort()
    b_only.sort()
    print(a_only)
    print(b_only)
Beispiel #40
0
 def test_roman_numeral_lemmatizer_with_default(self):
     """Test roman_numeral_lemmatizer()"""
     rn_patterns = [(r'(?=^[MDCLXVUI]+$)(?=^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|IU|V?I{0,3}|U?I{0,3})$)', 'NUM'), (r'(?=^[mdclxvui]+$)(?=^m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|iu|v?i{0,3}|u?i{0,3})$)', 'NUM')]
     lemmatizer = RomanNumeralLemmatizer(rn_patterns, default="RN")
     test_str = 'i ii'
     target = [('i', 'RN'), ('ii', 'RN')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #41
0
 def test_unigram_lemmatizer(self):
     """Test unigram_lemmatizer()"""
     train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
     lemmatizer = UnigramLemmatizer(train=train)
     test_str = """Ceterum antequam destinata componam"""
     target = [('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #42
0
 def test_unigram_lemmatizer(self):
     """Test unigram_lemmatizer()"""
     train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
     lemmatizer = UnigramLemmatizer(train=train)
     test_str = """Ceterum antequam destinata componam"""
     target = [('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #43
0
 def test_latin_pp_lemmatizer(self):
     """Test latin_pp_lemmatizer()"""
     pattern = [(r'(\w*)[a|ie]bimus\b', 1)]
     pps = { 'amo': [1, 'am', 'amare', 'amau', 'amat'] }
     lemmatizer = PPLemmatizer(pattern, pps=pps)
     test_str = 'amabimus'
     target = [('amabimus', 'amo')]
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #44
0
 def test_latin_pp_lemmatizer(self):
     """Test latin_pp_lemmatizer()"""
     pattern = [(r'(\w*)[a|ie]bimus\b', 1)]
     pps = {'amo': [1, 'am', 'amare', 'amau', 'amat']}
     lemmatizer = PPLemmatizer(pattern, pps=pps)
     test_str = 'amabimus'
     target = [('amabimus', 'amo')]
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #45
0
  def normalize(self, text, stopwords):
    """ Normalize the text """
    jv = JVReplacer()
    punkt = RegexpTokenizer(r'\w+')
    data = []

    for citation in text:
      line = punkt.tokenize(jv.replace(citation.text.lower()))
      data = data + line
      self.refs = self.refs + [citation.ref] * len(line)

    if stopwords:
      return " ".join([lem for lem in data if lem not in STOPS_LIST_LATIN])
    else:
      return " ".join(data)
Beispiel #46
0
def tokenize(text, language="latin"):
    jv_replacer = JVReplacer()
    text = jv_replacer.replace(text.lower())

    t = WordTokenizer(language)
    l = LemmaReplacer(language)

    text_word_tokens = t.tokenize(text)

    # Garde les mots de plus de trois characters
    ## text_word_tokens = [token for token in text_word_tokens if token not in ['.', ',', ':', ';','*']]
    text_word_tokens = [token for token in text_word_tokens if len(token) > 3]

    text_word_tokens = l.lemmatize(text_word_tokens)

    return text_word_tokens
Beispiel #47
0
def createNERListFromCorpus(string):
    """
    Will use CLTK NER method on a corpus (as string).
    Will perform jv replacement in the process.
    """
    ner_list = []
    jv_replacer = JVReplacer()
    text_str_iu = jv_replacer.replace(string)
    corpus_ner = ner.tag_ner('latin', input_text=text_str_iu)
    for tup in corpus_ner:
        if len(tup) > 1:
            ner_list.append(tup[0])
    NER_unique_values = set(ner_list)
    print('These NER were found in the given corpus:')
    print(NER_unique_values)
    return ner_list
Beispiel #48
0
def lemmatize(fname, tokenizer, lemmatizer):
    jv = JVReplacer()

    lemmacounts = {}
    formcounts = {}
    lemmaforms = {}
    with open(fname, "r") as f:
        i = 0
        hangingword = ""

        #        t = time.time()
        for line in f:

            line = hangingword.replace("-", "") + line.strip()
            if line and line[-1] == "-":
                splitline = line.split(" ")
                hangingword = " ".join(splitline[-1])
                line = " ".join(splitline[0:-1])
            else:
                hangingword = ""

            noaccents = remove_accents(line).replace("'", "").replace(
                "/", "").replace("-",
                                 "").replace("!",
                                             "").replace("?",
                                                         "").replace(".", "")
            lemmatized = lemmatizer.lemmatize(
                tokenizer.tokenize(jv.replace(noaccents.lower())))
            for form, lemma in lemmatized:
                if lemma.lower() == "punc" or lemma.lower() == "period":
                    continue
                if form not in formcounts:
                    formcounts[form] = 0
                formcounts[form] += 1
                if lemma not in lemmacounts:
                    lemmacounts[lemma] = 0
                    lemmaforms[lemma] = set([])
                lemmacounts[lemma] += 1
                lemmaforms[lemma].add(form)
            i += 1
            if not i % 100:

                print(basename(fname), i)
#                print(basename(fname), i, time.time() - t)
#                t = time.time()

    return formcounts, lemmacounts, lemmaforms
Beispiel #49
0
 def test_latin_synonyms(self):
     """Test Synonym class lookup() function and Lemmata class isolate() method"""
     #first build the lemma list as in test_latin_lemmata()
     lemmatizer = Lemmata(dictionary = 'lemmata', language = 'latin')
     test_str = 'Ceterum antequam destinata componam'
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lookup(tokens)
     #now isolate the list of lemmas
     lemmas = lemmatizer.isolate(lemmas)
     synonyms = Synonyms(dictionary = 'synonyms', language = 'latin')
     syns = synonyms.lookup_synonyms(lemmas)
     target = [('ceterus', [('ceteroqui', 0.5), ('perquiesco', 0.5)]), ('compono', [('struo', 0.5), ('condo', 0.5)])]
     self.assertEqual(syns, target)
Beispiel #50
0
 def test_latin_translations(self):
     """Test Synonym class lookup() function and Lemmata class isolate() method"""
     #first build the lemma list as in test_latin_lemmata()
     lemmatizer = Lemmata(dictionary = 'lemmata', language = 'latin')
     test_str = 'Ceterum antequam destinata componam'
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lookup(tokens)
     #now isolate the list of lemmas
     lemmas = lemmatizer.isolate(lemmas)
     translations = Synonyms(dictionary = 'translations', language = 'latin')
     translations = translations.lookup_synonyms(lemmas)
     target = [('destino', [('σκοπός', 1.0)]), ('compono', [('συντίθημι', 1.0)])]
     self.assertEqual(translations, target)
Beispiel #51
0
 def test_latin_i_u_transform(self):
     """Test converting ``j`` to ``i`` and ``v`` to ``u``."""
     jv_replacer = JVReplacer()
     trans = jv_replacer.replace('vem jam VEL JAM')
     self.assertEqual(trans, 'uem iam UEL IAM')
Beispiel #52
0
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus. Return a list of sentences for an author. Each sentence
    is itself a list of tokenized words.
    """

    assert corpus in ['phi5', 'tlg']

    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = WordTokenizer('latin')
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = WordTokenizer('greek')

        if rm_stops:
            stops = latin_stops
        else:
            stops = None

    if lemmatize:
        lemmatizer = LemmaReplacer(language)

    sent_tokenizer = TokenizeSentence(language)

    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        # doc_sentences = []
        for sentence in sent_tokens:
            # a second cleanup at sentence-level, to rm all punctuation
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            sentence = word_tokenizer(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [w[1:] if w.startswith('-') else w for w in sentence]

            if stops:
                sentence = [w for w in sentence if w not in stops]

            sentence = [w for w in sentence if len(w) > 1]  # rm short words

            if sentence:
                sentence = sentence

            if lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
Beispiel #53
0

# Import modules

# For XML
from xml.dom.minidom import parse, parseString
import codecs
# For CLTK
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.tag.pos import POSTag

# Initialize CLTK
lemmatizer = LemmaReplacer('latin')
tagger = POSTag('latin')
j = JVReplacer()

# Parse XML

xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/casanatensis.xml')
#xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/shorter_casanatensis.xml')
wordElementList = xmldoc.getElementsByTagName('w')

for w in wordElementList:
        form = w.attributes['ana'].value
        print(form)
        # Parse the inflected word
        try:
            lemmaList = lemmatizer.lemmatize(form.lower())
            lemma = lemmaList[0].replace('v', 'u')
            posList   = tagger.tag_tnt(j.replace(form.lower()))