Code example #1
    def create_reader(self):
        """Reads the corpus files with the XMLCorpusReader. See book for further explanations"""
        self.reader = XMLCorpusReader("C:\\Users\\"+self.user+"\\nltk_data\\corpora\\cookbook\\bambara\\", self.file_list, self.option_tone, self.option_tag)
        self.reader.all_tagging_sents()
        self.reader.all_sents()
        self.anzahl_sents = len(self.reader.tagged_sents)
        return self.reader, self.anzahl_sents
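This method belongs to the BambaraTagging class given in full in code example #2. A minimal usage sketch; the corpus file name and the tone/tag option values are placeholders:

tagging = BambaraTagging("cookbook\\bambara", ["corpus.xml"], True, True) # placeholder arguments
tagging.copy_files() # the files must be staged before the reader is created
reader, n = tagging.create_reader()
print(n, "tagged sentences loaded")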
Code example #2
# Imports this class relies on (os, codecs, getpass for file handling; untag and
# ConfusionMatrix from NLTK). XMLCorpusReader is the project's own reader (its
# constructor takes tone/tag options) and must be imported from wherever it is
# defined in this project; ConfusionMatrix is assumed to be NLTK's, though
# get_values_not_null() below suggests a project-local extension.
import os
import codecs
import getpass
from nltk.tag.util import untag
from nltk.metrics import ConfusionMatrix
class BambaraTagging(object):
    def __init__(self, root, file_list, option_tone, option_tag):
        self.root = root # path relative to C:/Users/<username>/nltk_data/corpora, e.g. 'cookbook\\bambara' (not the full absolute path)
        self.file_list = file_list
        self.option_tone = option_tone
        self.option_tag = option_tag
        self.reader = None
        self.anzahl_sents = 0
        self.train_sents = []
        self.test_sents = [] # development set, used to optimize the tagger
        self.unigramtagger = None
        self.bigramtagger = None
        self.trigramtagger = None
        self.contingenzliste = ["Getagged   :   Ursprüngliches Tag"+"\n"] # header line, German for "Tagged : Original tag"
        self.reference_tags = []
        self.test_tags = []
        self.evaluate = 0
        self.evaluate_final = 0
        self.user = getpass.getuser()
        self.testset = [] # real test set, held out for the final check
     
    def copy_files(self):
        """ Copies the corpus files (self.file_list) to the
        C:/Users/<username>/nltk_data/corpora/cookbook/bambara for further usage.
        If this directory does not exist yet, it will be created also.
        """
        print("Checking corpus directory...")
        if not os.path.exists("C:\\Users\\"+self.user+"\\nltk_data\\corpora\\cookbook\\bambara"):
            print("Creating corpus directories...")
            # os.makedirs creates all missing intermediate directories, so this also
            # works when e.g. nltk_data\corpora already exists (os.mkdir would fail there)
            os.makedirs("C:\\Users\\"+self.user+"\\nltk_data\\corpora\\cookbook\\bambara\\Corpus")
        else:
            print("nltk_data/corpora/cookbook/bambara folder exists")
        print("Checking corpus files...")
        for file in self.file_list:
            if not os.path.exists(file):
                print("Corpus file does not exist:", file)
                raise IOError("File does not exist: " + file)
            else:
                if not os.path.exists("C:\\Users\\"+self.user+"\\nltk_data\\corpora\\cookbook\\bambara\\"+file):
                    print("Corpus file ", file, "does not exist yet.")
                    print("Copying file to nltk_data/corpora/cookbook/bambara. Please wait. This may take a while.")
                    # copy the corpus file (from self.file_list) into the user's nltk_data directory
                    f = codecs.open(file, "r", "utf-8")
                    g = codecs.open("C:\\Users\\"+self.user+"\\nltk_data\\corpora\\cookbook\\bambara\\"+file, "w", "utf-8")
                    g.writelines(f.readlines())
                    g.close()
                    f.close()
                    print("Copied file: ", file)
                else:
                    print("Corpus file exists")     
    
    def create_reader(self):
        """Reads the corpus files with the XMLCorpusReader. See book for further explanations"""
        self.reader = XMLCorpusReader("C:\\Users\\"+self.user+"\\nltk_data\\corpora\\cookbook\\bambara\\", self.file_list, self.option_tone, self.option_tag)
        self.reader.all_tagging_sents()
        self.reader.all_sents()
        self.anzahl_sents = len(self.reader.tagged_sents)
        return self.reader, self.anzahl_sents
    
    def sets8_1_1(self, split):
        """
        Creates the training, development, and test sets; the development set serves to optimize the trained tagger.
        Out of every 10 sentences, the 1st-4th and 6th-9th go to the training set, the 5th goes to the
        development set (self.test_sents), and the 10th goes to the test set (self.testset).
        The corpus is partitioned in steps: <split> gives the number of sentences per bundle that are
        treated together; it should be a multiple of 10.
        """
        n = (self.anzahl_sents//split) # number of complete bundles of <split> sentences
        saetze = self.reader.sents[:(n*split)]
        saetzetagged = self.reader.tagged_sents[:(n*split)]
        for i in range(n):
            s_split = saetze[:split]
            s_tag_split = saetzetagged[:split]
            for j in range(0, split, 10):
                # 1st-4th and 6th-9th tagged sentences train the tagger
                self.train_sents += s_tag_split[j:j+4]
                self.train_sents += s_tag_split[j+5:j+9]
                self.test_sents.append(s_tag_split[j+4]) # 5th: development sentence to optimize the tagger
                self.testset.append(s_tag_split[j+9]) # 10th: test sentence to check the optimized tagger
            saetze = saetze[split:]
            saetzetagged = saetzetagged[split:]
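    # Worked example (assumed numbers): with split=20 and 45 corpus sentences,
    # n = 2, so only the first 40 sentences are used; each bundle of 20 yields
    # 16 training, 2 development, and 2 test sentences.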
        
    def sets9_1(self, split):
        """
        Creates the training and test sets without a separate development set.
        Out of every 10 sentences, the 1st-9th go to the training set and the 10th goes to the
        test set (self.test_sents).
        The corpus is partitioned in steps: <split> gives the number of sentences per bundle that are
        treated together; it should be a multiple of 10.
        """
        n = (self.anzahl_sents//split) # number of complete bundles of <split> sentences
        saetze = self.reader.sents[:(n*split)]
        saetzetagged = self.reader.tagged_sents[:(n*split)]
        for i in range(n):
            s_split = saetze[:split]
            s_tag_split = saetzetagged[:split]
            for j in range(0, split, 10):
                self.train_sents += s_tag_split[j:j+9] # 1st-9th tagged sentences train the tagger
                self.test_sents.append(s_tag_split[j+9]) # 10th: test sentence to evaluate the tagger
            saetze = saetze[split:]
            saetzetagged = saetzetagged[split:]
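    # Worked example (assumed numbers): with split=20, each bundle of 20
    # sentences yields 18 training and 2 test sentences (j = 0 and j = 10).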
   

    def calculate_contingenz_with_sets(self, tagger):
        """
        Compares the original tags with the tags created by the tagger.
        """
        # strip the gold tags off the development sentences and re-tag them with the given tagger
        tagger_tagged = tagger.tag_sents([untag(i) for i in self.test_sents])
        tagger_words = sum(tagger_tagged, []) # flatten into one list of (word, tag) pairs
        original_tagged = self.test_sents
        original_words = sum(original_tagged, [])
        # pair each original (reference) tag with the tag produced by the tagger
        tagged_org_zip = zip([i[1] for i in original_words], [i[1] for i in tagger_words])
        contingenzliste = []
        orig_tags = []
        tag_tags = []
        for i in tagged_org_zip:
            if i[0] != i[1]: # record mismatches only
                if i[1] is None: # the tagger left the word untagged
                    i = (i[0], "None")
                contingenzliste.append(i[1]+"   :   "+i[0]+"\n")
                orig_tags.append(i[0])
                tag_tags.append(i[1])

        self.contingenzliste = self.contingenzliste + contingenzliste
        self.reference_tags = self.reference_tags + orig_tags
        self.test_tags = self.test_tags + tag_tags


    def matrix(self):
        """Creates a Contingenz Matrix using ConfusionMatrix of NLTK"""
        cm = ConfusionMatrix(self.reference_tags, self.test_tags) # first reference, then test!
        #f = codecs.open("C:\\Users\\"+self.user+"\\Downloads\\continenzmatrix.txt", "w", "utf-8")
        f = codecs.open("Results\\contingenzmatrix.txt","w","utf-8")
        f.write(cm.pp())
        f.close()
        # optionally write the raw contingency list as well:
        #g = codecs.open("C:\\Users\\"+self.user+"\\Downloads\\contingenzliste.txt", "w", "utf-8")
        #g.writelines(self.contingenzliste)
        #g.close()
        values_not_null = cm.get_values_not_null()
        return values_not_null
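Taken together, the class supports a full train-and-evaluate cycle. A minimal driver sketch, assuming NLTK's UnigramTagger for the training step; the corpus file name, tone/tag options, and split size are placeholders:

from nltk.tag import UnigramTagger

bam = BambaraTagging("cookbook\\bambara", ["corpus.xml"], True, True) # placeholder arguments
bam.copy_files()                 # stage the corpus files under nltk_data
bam.create_reader()              # parse them with XMLCorpusReader
bam.sets8_1_1(100)               # 8/1/1 split in bundles of 100 sentences
bam.unigramtagger = UnigramTagger(bam.train_sents)
bam.calculate_contingenz_with_sets(bam.unigramtagger) # collect mismatches on the development set
print(bam.matrix())              # note: matrix() expects a Results\ directory to exist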