Beispiel #1
0
 def test_tag_ner_str_str_latin(self):
     """Test ner.tag_ner() on Latin text: str input, str output.

     The input is j/v-normalized first, which is why the expected output
     contains 'Uenus' rather than 'Venus'.
     """
     text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
     # A single replacer is enough; the original built a second, unused one.
     jv_replacer = JVReplacer()
     text_str_iu = jv_replacer.replace(text_str)
     text = ner.tag_ner('latin', input_text=text_str_iu, output_type=str)
     target = ' ut Uenus/Entity, ut Sirius/Entity, ut Spica/Entity, ut aliae quae primae dicuntur esse mangitudinis.'
     self.assertEqual(text, target)
Beispiel #2
0
def main():
    """Print the lemmas that occur in only one of the two Gratian files.

    Both files are lowercased and j/v-normalized, then handed to
    ``process``. Two sorted lists are printed: lemmas found only in
    ``Gratian1.txt`` and lemmas found only in ``Gratian2.txt``.
    """
    # setup()
    # Context managers guarantee the handles are closed even if read() fails.
    with open('./Gratian1.txt', 'r') as handle:
        a = handle.read()
    with open('./Gratian2.txt', 'r') as handle:
        b = handle.read()

    a_lemmas = process(JVReplacer().replace(a.lower()))
    b_lemmas = process(JVReplacer().replace(b.lower()))

    a_only = sorted(lemma for lemma in a_lemmas if lemma not in b_lemmas)
    b_only = sorted(lemma for lemma in b_lemmas if lemma not in a_lemmas)

    print(a_only)
    print(b_only)
Beispiel #3
0
 def test_tag_ner_str_list_latin(self):
     """Check ner.tag_ner() on Latin text: str input, list output."""
     raw_text = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
     normalized = JVReplacer().replace(raw_text)
     tagged = ner.tag_ner("lat", input_text=normalized, output_type=list)
     # One tuple per token; recognized entities carry a second "Entity" field.
     expected = [
         ("ut", ), ("Uenus", "Entity"), (",", ),
         ("ut", ), ("Sirius", "Entity"), (",", ),
         ("ut", ), ("Spica", "Entity"), (",", ),
         ("ut", ), ("aliae", ), ("quae", ), ("primae", ),
         ("dicuntur", ), ("esse", ), ("mangitudinis", ), (".", ),
     ]
     self.assertEqual(tagged, expected)
Beispiel #4
0
def get_tags(inputfile, outputfile):
    """Lowercase and j/v-normalize a UTF-8 text file into ``outputfile``.

    Progress and errors are reported to the module-level ``logfile``
    rather than raised.

    :param inputfile: path of the UTF-8 text file to read.
    :param outputfile: path of the UTF-8 text file to (over)write.
    """
    try:
        # ``with`` closes the handles on every path, including failures.
        with open(inputfile, 'r', encoding="utf-8") as f:
            x = f.read()

        j = JVReplacer()
        x = j.replace(x.lower())

        with open(outputfile, "w", encoding="utf-8") as ofile:
            ofile.write(x)
        logfile.write("processing done\n")

    except IOError as e:
        logfile.write("I/O error({0}): {1}\n".format(e.errno, e.strerror))
    except Exception as e:  # handle other exceptions such as attribute errors
        # The original concatenated str + exception class here, which itself
        # raised TypeError; format the exception instead.
        logfile.write("Unexpected error: {0!r}\n".format(e))
Beispiel #5
0
 def test_tag_ner_list_list_latin(self):
     """Check ner.tag_ner() on Latin tokens: list input, list output."""
     replacer = JVReplacer()
     normalized_tokens = []
     for word in ["ut", "Venus", "Sirius"]:
         normalized_tokens.append(replacer.replace(word))
     tagged = ner.tag_ner("lat", input_text=normalized_tokens, output_type=list)
     expected = [("ut", ), ("Uenus", "Entity"), ("Sirius", "Entity")]
     self.assertEqual(tagged, expected)
Beispiel #6
0
 def test_tag_ner_list_str_latin(self):
     """Check ner.tag_ner() on Latin tokens: list input, str output."""
     replacer = JVReplacer()
     normalized_tokens = []
     for word in ["ut", "Venus", "Sirius"]:
         normalized_tokens.append(replacer.replace(word))
     tagged = ner.tag_ner("lat", input_text=normalized_tokens, output_type=str)
     expected = " ut Uenus/Entity Sirius/Entity"
     self.assertEqual(tagged, expected)
Beispiel #7
0
 def test_tag_ner_list_list_latin(self):
     """Check ner.tag_ner('latin', ...) with list input and list output."""
     replacer = JVReplacer()
     words = ['ut', 'Venus', 'Sirius']
     normalized_words = list(map(replacer.replace, words))
     result = ner.tag_ner('latin', input_text=normalized_words, output_type=list)
     # Entities are 2-tuples; plain tokens are 1-tuples.
     expected = [('ut',), ('Uenus', 'Entity'), ('Sirius', 'Entity')]
     self.assertEqual(result, expected)
Beispiel #8
0
 def test_tag_ner_str_list_latin(self):
     """Check ner.tag_ner('latin', ...) with str input and list output."""
     sentence = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
     normalized = JVReplacer().replace(sentence)
     result = ner.tag_ner('latin', input_text=normalized, output_type=list)
     expected = [
         ('ut',), ('Uenus', 'Entity'), (',',),
         ('ut',), ('Sirius', 'Entity'), (',',),
         ('ut',), ('Spica', 'Entity'), (',',),
         ('ut',), ('aliae',), ('quae',), ('primae',),
         ('dicuntur',), ('esse',), ('mangitudinis',), ('.',),
     ]
     self.assertEqual(result, expected)
Beispiel #9
0
def stage1(text):
    '''
    Lowercases text, normalizes spelling with JVReplacer (converting 'j'
    to 'i' and 'v' to 'u'), and removes punctuation.

    The text is split on every non-word character and the non-empty
    pieces are re-joined with single spaces.
    '''
    text = JVReplacer().replace(text.lower())
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    words = re.split(r'\W', text)
    return ' '.join(word for word in words if word)
Beispiel #10
0
 def test_tag_ner_list_str_latin(self):
     """Check ner.tag_ner('latin', ...) with list input and str output."""
     replacer = JVReplacer()
     words = ['ut', 'Venus', 'Sirius']
     normalized_words = list(map(replacer.replace, words))
     result = ner.tag_ner('latin', input_text=normalized_words, output_type=str)
     expected = ' ut Uenus/Entity Sirius/Entity'
     self.assertEqual(result, expected)
    def __init__(
            self, pathDF, language='english',
            dataType='pickle', dataIndex='multi', colname='text',
            maxValues=2500, pathMeta=False, pathType=False, showLogging=False,
            model_params=(4,5,300)
            ):
        """Set up the corpus, an untrained Word2Vec model and language tools.

        :param pathDF, dataType, dataIndex, colname, maxValues, pathMeta,
            pathType: forwarded unchanged to the parent class initializer.
        :param language: 'latin' or 'greek' (CLTK tooling), 'english' or
            'german' (NLTK tooling).
        :param showLogging: when true, configure INFO-level logging.
        :param model_params: ``(workers, min_count, size)`` passed to
            ``gensim.models.Word2Vec``.
        :raises ValueError: if ``language`` is none of the supported values.
        """

        super(CorpusML, self).__init__(
            pathDF, dataType, dataIndex, colname,
            maxValues, pathMeta, pathType
            )

        if showLogging:
            logging.basicConfig(
                format='%(asctime)s : %(levelname)s : %(message)s',
                level=logging.INFO
                )

        self.model = gensim.models.Word2Vec(
            workers=model_params[0],
            min_count=model_params[1],
            size=model_params[2]
            )

        # self.model.random.seed(42)

        self.language = language

        if self.language == 'latin' or self.language == 'greek':
            from cltk.corpus.utils.importer import CorpusImporter
            corpus_importer = CorpusImporter(self.language)
            corpus_importer.import_corpus(
                '{0}_models_cltk'.format(self.language)
                )
            from cltk.stem.lemma import LemmaReplacer
            from cltk.tokenize.word import nltk_tokenize_words as tokenizer
            lemmatizer = LemmaReplacer(self.language)
            if self.language == 'latin':
                from cltk.stem.latin.j_v import JVReplacer
                from cltk.stop.latin.stops import STOPS_LIST as stopwords
                self.jvreplacer = JVReplacer()
            elif self.language == 'greek':
                from cltk.stop.greek.stops import STOPS_LIST as stopwords
        # Membership test: the original `== 'english' or 'german'` was always
        # truthy, which made the ValueError branch below unreachable.
        elif self.language in ('english', 'german'):
            import nltk
            nltk.download('stopwords')
            from nltk.stem import WordNetLemmatizer
            from nltk.tokenize import word_tokenize as tokenizer
            from nltk.corpus import stopwords
            stopwords = stopwords.words(self.language)
            lemmatizer = WordNetLemmatizer()
        else:
            raise ValueError(
                'Could not find lemmatizer, tokenizer,\
                 and stopwords for chosen language.')
        self.lemmatizer = lemmatizer
        self.tokenizer = tokenizer
        self.stopwords = stopwords
Beispiel #12
0
def get_sims(word, language, lemmatized=False, threshold=0.70):
    """Get similar Word2Vec terms from vocabulary or trained model.

    :param word: the term to look up; j/v-normalized and casefolded
        for Latin.
    :param language: 'greek' or 'latin'; any other value fails the
        assertion below.
    :param lemmatized: when true, load the '_lemmed' variant of the model.
    :param threshold: only terms whose similarity is strictly greater
        than this are returned.
    :return: list of similar terms (possibly empty), or None when ``word``
        is not in the model's vocabulary.
    :raises FileNotFoundError: re-raised when the model file is missing.

    TODO: Add option to install corpus if not available.
    """
    # Normalize incoming word string
    if language == 'latin':
        # Note that casefold() seemingly does not work with diacritic
        # Greek, likely because of it expects single code points, not
        # diacritics. Look into global string normalization to code points
        # for all languages, especially Greek.
        # The replacer is only built when it is actually needed.
        word = JVReplacer().replace(word).casefold()

    model_dirs = {
        'greek': '~/cltk_data/greek/model/greek_word2vec_cltk',
        'latin': '~/cltk_data/latin/model/latin_word2vec_cltk'
    }
    # Kept as an assert so callers that catch AssertionError keep working.
    assert language in model_dirs.keys(
    ), 'Langauges available with Word2Vec model: {}'.format(model_dirs.keys())
    lemma_str = '_lemmed' if lemmatized else ''
    model_name = '{0}_s100_w30_min5_sg{1}.model'.format(language, lemma_str)
    model_dir_abs = os.path.expanduser(model_dirs[language])
    model_path = os.path.join(model_dir_abs, model_name)
    try:
        # Load through the class instead of first constructing an empty,
        # untrained Word2Vec instance just to call load() on it.
        model = Word2Vec.load(model_path)
    except FileNotFoundError as fnf_error:
        print(fnf_error)
        print(
            "CLTK's Word2Vec models cannot be found. Please import '{}_word2vec_cltk'."
            .format(language))
        raise
    try:
        similars = model.most_similar(word)
    except KeyError as key_err:
        print(key_err)
        # Suggest vocabulary entries sharing the first three characters.
        prefix = word[:3]
        possible_matches = [
            term for term in model.vocab if term.startswith(prefix)
        ]
        print(
            "The following terms in the Word2Vec model you may be looking for: '{}'."
            .format(possible_matches))
        return None
    returned_sims = [
        similar[0] for similar in similars if similar[1] > threshold
    ]
    if not returned_sims:
        print(
            "Matches found, but below the threshold of 'threshold={}'. Lower it to see these results."
            .format(threshold))
    return returned_sims
Beispiel #13
0
    def __init__(self, connection):
        """Initialize the tokenizer and the helpers it reuses.

        :param connection: passed straight to the parent initializer.
        """
        super(LatinTokenizer, self).__init__(connection)

        # Reusable helpers for normalization and lemma lookup.
        self.jv_replacer = JVReplacer()
        self.lemmatizer = Lemmata('lemmata', 'lat')

        # Three alternatives: a spaced slash, a whitespace run, or a run of
        # characters that are neither word characters nor diacriticals.
        alternatives = (
            '( / )',
            '([\\s]+)',
            '([^\\w' + self.diacriticals + ']+)',
        )
        self.split_pattern = '|'.join(alternatives)
Beispiel #14
0
 def __init__(self):
     """Create the Latin lemmatizer, j/v replacer and word tokenizer.

     Also starts with an empty count dictionary and a fixed list of
     punctuation marks.
     """
     self.lemmatizer = Lemmata(dictionary='lemmata', language='latin')
     self.jv = JVReplacer()
     self.word_tokenizer = WordTokenizer('latin')
     self.count_dictionary = {}
     self.punctuation_list = [
         '!', ';', ':', '?',
         '-', '–', '&', '*',
         '(', ')', '[', ']',
         ',', '"', "'",
     ]
Beispiel #15
0
def jv_replace(text):
    """
    Return ``text`` after CLTK's JVReplacer has been applied to it.

    No lowercasing or truecasing happens here.
    """
    # Lowercasing probably won't be done; truecasing needs bow first.
    return JVReplacer().replace(text)
Beispiel #16
0
    def __init__(self, connection):
        """Initialize the tokenizer and the helpers it reuses.

        :param connection: passed straight to the parent initializer.
        """
        super(LatinTokenizer, self).__init__(connection)

        # Set up patterns that will be reused
        self.jv_replacer = JVReplacer()
        self.lemmatizer = Lemmata('lemmata', 'latin')

        # Raw strings: the original literal relied on invalid escape
        # sequences (\s, \., \~, \w) surviving in a normal string, which
        # newer Python versions warn about. The resulting pattern text is
        # unchanged.
        self.split_pattern = (
            r'[<].+[>][\s]| / | \. \. \.|\.\~\.\~\.|[^\w'
            + self.diacriticals
            + r']'
        )
Beispiel #17
0
def get_tags(inputfile, outputfile):
    """Lowercase and j/v-normalize the text of every ``<w>`` element.

    The XML document at ``inputfile`` is parsed, each ``w`` element that
    has text is rewritten in place, and the tree is written to
    ``outputfile`` as UTF-8 with an XML declaration.
    """
    tree = ET.ElementTree(file=inputfile)
    root = tree.getroot()
    replacer = JVReplacer()
    for word_el in root.iter('w'):
        if word_el.text is None:
            continue
        word_el.text = replacer.replace(word_el.text.lower())

    tree.write(outputfile, xml_declaration=True, encoding="utf-8")
Beispiel #18
0
 def test_roman_numeral_lemmatizer(self):
     """Check RomanNumeralLemmatizer tags each numeral token as 'NUM'."""
     lemmatizer = RomanNumeralLemmatizer()
     numerals = 'i ii iii iv v vi vii vii ix x xx xxx xl l lx c cc'
     # Expected forms are j/v-normalized, e.g. 'iv' becomes 'iu'.
     expected_forms = [
         'i', 'ii', 'iii', 'iu', 'u', 'ui', 'uii', 'uii', 'ix',
         'x', 'xx', 'xxx', 'xl', 'l', 'lx', 'c', 'cc',
     ]
     expected = [(form, 'NUM') for form in expected_forms]
     replacer = JVReplacer()
     normalized = replacer.replace(numerals.lower())
     result = lemmatizer.lemmatize(normalized.split())
     self.assertEqual(result, expected)
Beispiel #19
0
def jv_transform(string_matrix: List[List[str]]) -> List[List[str]]:
    """
    Apply JVReplacer to every word of every sentence.

    :param string_matrix: a data matrix: a list wrapping a list of strings, with each sublist being a sentence.
    :return: a new matrix of the same layout holding the replaced words.
    >>> jv_transform([['venio', 'jacet'], ['julius', 'caesar']])
    [['uenio', 'iacet'], ['iulius', 'caesar']]
    """
    replacer = JVReplacer()
    transformed = []
    for sentence in string_matrix:
        transformed.append([replacer.replace(token) for token in sentence])
    return transformed
Beispiel #20
0
 def test_identity_lemmatizer(self):
     """Check IdentityLemmatizer pairs every token with itself."""
     lemmatizer = IdentityLemmatizer()
     sentence = 'Ceterum antequam destinata componam'
     expected = [
         ('ceterum', 'ceterum'),
         ('antequam', 'antequam'),
         ('destinata', 'destinata'),
         ('componam', 'componam'),
     ]
     replacer = JVReplacer()
     word_tokenizer = WordTokenizer('latin')
     normalized = replacer.replace(sentence.lower())
     result = lemmatizer.lemmatize(word_tokenizer.tokenize(normalized))
     self.assertEqual(result, expected)
Beispiel #21
0
 def test_latin_lemmata(self):
     """Check Lemmata.lookup() returns weighted lemma candidates per token."""
     lemmatizer = Lemmata(dictionary='lemmata', language='latin')
     sentence = 'Ceterum antequam destinata componam'
     expected = [
         ('ceterum', [('ceterus', 1.0)]),
         ('antequam', [('antequam', 1.0)]),
         ('destinata', [
             ('destinatus', 0.25),
             ('destinatum', 0.25),
             ('destinata', 0.25),
             ('destino', 0.25),
         ]),
         ('componam', [('compono', 1.0)]),
     ]
     replacer = JVReplacer()
     word_tokenizer = WordTokenizer('latin')
     normalized = replacer.replace(sentence.lower())
     result = lemmatizer.lookup(word_tokenizer.tokenize(normalized))
     self.assertEqual(result, expected)
Beispiel #22
0
 def test_backoff_latin_lemmatizer_verbose(self):
     """Test BackoffLatinLemmatizer with verbose=True.

     In verbose mode each result is expected to be a 3-tuple whose last
     field names the lemmatizer that produced the lemma.
     """
     # The unused local `train` from the original was removed; it was never
     # passed to the lemmatizer.
     lemmatizer = BackoffLatinLemmatizer(verbose=True)
     test_str = """Ceterum antequam destinata componam"""
     target = [('ceterum', 'ceterum', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('antequam', 'antequam', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('destinata', 'destino', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('componam', 'compono', '<DictLemmatizer: Morpheus Lemmas>')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Beispiel #23
0
 def test_regex_lemmatizer(self):
     """Check RegexpLemmatizer with a (pattern, ending) rule on 'amabimus'."""
     rules = [(r'(\w*)abimus', 'o')]
     lemmatizer = RegexpLemmatizer(rules)
     word = 'amabimus'
     expected = [('amabimus', 'amo')]
     replacer = JVReplacer()
     word_tokenizer = WordTokenizer('latin')
     normalized = replacer.replace(word.lower())
     result = lemmatizer.lemmatize(word_tokenizer.tokenize(normalized))
     self.assertEqual(result, expected)
Beispiel #24
0
 def test_regex_lemmatizer(self):
     """Check RegexpLemmatizer with a substitution rule on 'amabimus'."""
     substitutions = [('(.)ab(o|is|it|imus|itis|unt)$', r'\1o')]
     lemmatizer = RegexpLemmatizer(substitutions)
     word = 'amabimus'
     expected = [('amabimus', 'amo')]
     replacer = JVReplacer()
     word_tokenizer = WordTokenizer('latin')
     normalized = replacer.replace(word.lower())
     result = lemmatizer.lemmatize(word_tokenizer.tokenize(normalized))
     self.assertEqual(result, expected)
Beispiel #25
0
 def test_unigram_lemmatizer(self):
     """Check UnigramLemmatizer reproduces the lemmas it was trained on."""
     training_sentence = [
         ('ceterum', 'ceterus'),
         ('antequam', 'antequam'),
         ('destinata', 'destino'),
         ('componam', 'compono'),
     ]
     lemmatizer = UnigramLemmatizer(train=[training_sentence])
     sentence = """Ceterum antequam destinata componam"""
     expected = [
         ('ceterum', 'ceterus'),
         ('antequam', 'antequam'),
         ('destinata', 'destino'),
         ('componam', 'compono'),
     ]
     replacer = JVReplacer()
     word_tokenizer = WordTokenizer('latin')
     normalized = replacer.replace(sentence.lower())
     result = lemmatizer.lemmatize(word_tokenizer.tokenize(normalized))
     self.assertEqual(result, expected)
Beispiel #26
0
 def test_model_lemmatizer(self):
     """Check TrainLemmatizer maps each token through the supplied model."""
     lookup = {
         'ceterum': 'ceterus',
         'antequam': 'antequam',
         'destinata': 'destino',
         'componam': 'compono',
     }
     lemmatizer = TrainLemmatizer(model=lookup)
     sentence = 'Ceterum antequam destinata componam'
     expected = [
         ('ceterum', 'ceterus'),
         ('antequam', 'antequam'),
         ('destinata', 'destino'),
         ('componam', 'compono'),
     ]
     replacer = JVReplacer()
     word_tokenizer = WordTokenizer('latin')
     normalized = replacer.replace(sentence.lower())
     result = lemmatizer.lemmatize(word_tokenizer.tokenize(normalized))
     self.assertEqual(result, expected)
Beispiel #27
0
 def test_latin_pp_lemmatizer(self):
     """Check PPLemmatizer resolves 'amabimus' to 'amo' via principal parts."""
     rules = [(r'(\w*)[a|ie]bimus\b', 1)]
     principal_parts = {'amo': [1, 'am', 'amare', 'amau', 'amat']}
     lemmatizer = PPLemmatizer(rules, pps=principal_parts)
     word = 'amabimus'
     expected = [('amabimus', 'amo')]
     replacer = JVReplacer()
     word_tokenizer = WordTokenizer('latin')
     normalized = replacer.replace(word.lower())
     result = lemmatizer.lemmatize(word_tokenizer.tokenize(normalized))
     self.assertEqual(result, expected)
Beispiel #28
0
def tokenize(text, language="latin"):
    """Lowercase, j/v-normalize, tokenize and lemmatize ``text``.

    Tokens of three characters or fewer are dropped before lemmatizing.

    :param text: the raw input string.
    :param language: language name handed to WordTokenizer and LemmaReplacer.
    :return: whatever ``LemmaReplacer.lemmatize`` returns for the kept tokens.
    """
    normalized = JVReplacer().replace(text.lower())

    word_tokenizer = WordTokenizer(language)
    lemma_replacer = LemmaReplacer(language)

    # Keep only the words longer than three characters.
    long_tokens = [
        token for token in word_tokenizer.tokenize(normalized)
        if len(token) > 3
    ]

    return lemma_replacer.lemmatize(long_tokens)
Beispiel #29
0
def createNERListFromCorpus(string):
    """
    Run CLTK's Latin NER tagger over a corpus given as one string.

    The corpus is j/v-normalized first. Every tagged item with more than
    one field is treated as an entity; the set of distinct entities is
    printed, and the full list (duplicates included) is returned.
    """
    normalized = JVReplacer().replace(string)
    tagged = ner.tag_ner('latin', input_text=normalized)
    ner_list = [entry[0] for entry in tagged if len(entry) > 1]
    print('These NER were found in the given corpus:')
    print(set(ner_list))
    return ner_list
Beispiel #30
0
def lemmatize(fname, tokenizer, lemmatizer):
    """Count word forms and lemmas in a text file, line by line.

    Each line is stripped, has accents removed via ``remove_accents``, has
    the characters ``' / - ! ? .`` deleted, and is lowercased and
    j/v-normalized before being tokenized and lemmatized. A word hyphenated
    at the end of a line is carried over and glued to the start of the
    next line.

    :param fname: path of the text file to read.
    :param tokenizer: object whose ``tokenize(str)`` yields tokens.
    :param lemmatizer: object whose ``lemmatize(tokens)`` yields
        ``(form, lemma)`` pairs.
    :return: ``(formcounts, lemmacounts, lemmaforms)``: form -> count,
        lemma -> count, and lemma -> set of forms seen for it.
    """
    jv = JVReplacer()
    # One-pass deletion table, replacing the chain of .replace(c, "") calls.
    strip_table = str.maketrans("", "", "'/-!?.")

    lemmacounts = {}
    formcounts = {}
    lemmaforms = {}
    with open(fname, "r") as f:
        hangingword = ""

        for i, raw_line in enumerate(f, start=1):

            line = hangingword.replace("-", "") + raw_line.strip()
            if line and line[-1] == "-":
                splitline = line.split(" ")
                # Keep the hyphenated fragment as-is. The original did
                # " ".join(splitline[-1]), which put a space between every
                # character and so broke the word apart on the next line.
                hangingword = splitline[-1]
                line = " ".join(splitline[0:-1])
            else:
                hangingword = ""
            # NOTE: a fragment still hanging after the last line is dropped.

            noaccents = remove_accents(line).translate(strip_table)
            lemmatized = lemmatizer.lemmatize(
                tokenizer.tokenize(jv.replace(noaccents.lower())))
            for form, lemma in lemmatized:
                # Skip punctuation pseudo-lemmas.
                if lemma.lower() in ("punc", "period"):
                    continue
                formcounts[form] = formcounts.get(form, 0) + 1
                if lemma not in lemmacounts:
                    lemmacounts[lemma] = 0
                    lemmaforms[lemma] = set()
                lemmacounts[lemma] += 1
                lemmaforms[lemma].add(form)
            # Progress report every 100 lines.
            if not i % 100:
                print(basename(fname), i)

    return formcounts, lemmacounts, lemmaforms