Beispiel #1
0
 def __init__(self, root, items, encoding=None):
     """Initialize a tagged corpus reader over '_'-separated word/tag files.

     Sentences are split with a gap tokenizer that breaks after a "/."
     token and discards *_CODE / *_ID markers.

     :param root: corpus root directory, passed to ``TaggedCorpusReader``
     :param items: corpus fileids, passed to ``TaggedCorpusReader``
     :param encoding: optional file encoding; when given it is now
         forwarded to ``TaggedCorpusReader`` (the original accepted the
         argument but silently ignored it)
     """
     # Break sentences after "/." and swallow *_CODE / *_ID tokens.
     gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
     sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
     # Only pass ``encoding`` when the caller supplied one, so the
     # superclass default stays in effect for existing callers.
     extra = {} if encoding is None else {'encoding': encoding}
     TaggedCorpusReader.__init__(self,
                                 root,
                                 items,
                                 sep='_',
                                 sent_tokenizer=sent_tokenizer,
                                 **extra)
 def __init__(self, root, items, encoding="utf8"):
     """Initialize a tagged corpus reader over '_'-separated word/tag files.

     Sentences are split with a gap tokenizer that breaks after a "/."
     token and discards *_CODE / *_ID markers.

     :param root: corpus root directory, passed to ``TaggedCorpusReader``
     :param items: corpus fileids, passed to ``TaggedCorpusReader``
     :param encoding: file encoding; now forwarded to
         ``TaggedCorpusReader`` (the original accepted the argument but
         silently ignored it; "utf8" matches the superclass default, so
         default behavior is unchanged)
     """
     # Break sentences after "/." and swallow *_CODE / *_ID tokens.
     gaps_re = r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*"
     sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
     TaggedCorpusReader.__init__(self,
                                 root,
                                 items,
                                 sep="_",
                                 sent_tokenizer=sent_tokenizer,
                                 encoding=encoding)
    def __init__(self, poem_title):
        """Load the lemmatized text of *poem_title* as a tagged corpus.

        >>> pel_reader = PoeticEddaLemmatizationReader("Völuspá")

        :param poem_title: title of the poem; must be one of
            ``poetic_edda_titles``
        :raises ValueError: if *poem_title* is not a known Poetic Edda title
        """
        # Validate explicitly instead of with ``assert`` — asserts are
        # stripped under ``python -O`` and would let bad titles through.
        if poem_title not in poetic_edda_titles:
            raise ValueError(
                "Unknown Poetic Edda poem: {!r}".format(poem_title))
        TaggedCorpusReader.__init__(
            self,
            os.path.join(poetic_edda, poem_title, "txt_files",
                         "lemmatization"), "lemmatized.txt")
Beispiel #4
0
    def __init__(self, poem_title, _type=None):
        """Load a lemmatized version of *poem_title* as a tagged corpus.

        >>> pel_reader = PoeticEddaLemmatizationReader("Völuspá")

        :param poem_title: title of the poem; must be one of
            ``poetic_edda_titles``
        :param _type: which lemmatization file to load: ``"tei"`` for
            ``tei_lemmatized_complete.txt``, ``"test"`` for
            ``test_lemmatized_complete.txt``, anything else (default
            ``None``) for ``lemmatized.txt``
        :raises ValueError: if *poem_title* is not a known Poetic Edda title
        """
        # Validate explicitly instead of with ``assert`` — asserts are
        # stripped under ``python -O`` and would let bad titles through.
        if poem_title not in poetic_edda_titles:
            raise ValueError(
                "Unknown Poetic Edda poem: {!r}".format(poem_title))
        # All variants live in the same directory; only the file name
        # differs, so pick it via a lookup instead of three near-identical
        # superclass calls.
        filenames = {
            "tei": "tei_lemmatized_complete.txt",
            "test": "test_lemmatized_complete.txt",
        }
        TaggedCorpusReader.__init__(
            self,
            os.path.join(CORPUS_PATH, poetic_edda, poem_title, "txt_files",
                         "lemmatization"),
            filenames.get(_type, "lemmatized.txt"))
Beispiel #5
0
 def __init__(self, root, items, encoding='utf8'):
     """Initialize a tagged corpus reader over '_'-separated word/tag files.

     Sentences are split with a gap tokenizer that breaks after a "/."
     token and discards *_CODE / *_ID markers.

     :param root: corpus root directory, passed to ``TaggedCorpusReader``
     :param items: corpus fileids, passed to ``TaggedCorpusReader``
     :param encoding: file encoding; now forwarded to
         ``TaggedCorpusReader`` (the original accepted the argument but
         silently ignored it; 'utf8' matches the superclass default, so
         default behavior is unchanged)
     """
     # Break sentences after "/." and swallow *_CODE / *_ID tokens.
     gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
     sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
     TaggedCorpusReader.__init__(self, root, items, sep='_',
                                 sent_tokenizer=sent_tokenizer,
                                 encoding=encoding)
 def __init__(self, poem_title):
     """Load the syllabified text of *poem_title* as a tagged corpus.

     :param poem_title: title of the poem whose syllabified file to read
     """
     # The syllabified file always lives under <poem>/txt_files/syllabified.
     syllabified_dir = os.path.join(
         poetic_edda, poem_title, "txt_files", "syllabified")
     TaggedCorpusReader.__init__(self, syllabified_dir, "syllabified.txt")
 def __init__(self, poem_title):
     """Load the POS-tagged text of *poem_title* as a tagged corpus.

     :param poem_title: title of the poem; must be one of
         ``poetic_edda_titles``
     :raises ValueError: if *poem_title* is not a known Poetic Edda title
     """
     # Validate explicitly instead of with ``assert`` — asserts are
     # stripped under ``python -O`` and would let bad titles through.
     if poem_title not in poetic_edda_titles:
         raise ValueError(
             "Unknown Poetic Edda poem: {!r}".format(poem_title))
     TaggedCorpusReader.__init__(
         self, os.path.join(poetic_edda, poem_title, "txt_files", "pos"),
         "pos_tagged.txt")