Example #1
def main():
    treebank_tagged_sents = TreebankNoTraces()  # Remove trace tokens.
    training_set = treebank_tagged_sents[:3000]  # This is the train-test split that we will use.
    test_set = treebank_tagged_sents[3000:]
    vocabulary = makeVocab(training_set)

    training_set_prep = PreprocessText(training_set, vocabulary)
    test_set_prep = PreprocessText(test_set, vocabulary)

    # Print the first sentence of each data set.
    print " ".join(untag(training_set_prep[0]))  # See nltk.tag.util module.
    print " ".join(untag(test_set_prep[0]))

    # Estimate Bigram HMM from the training set, report level of ambiguity.
    bigram_hmm = BigramHMM()
    bigram_hmm.Train(training_set_prep)
    print "Percent tag ambiguity in training set is %.2f%%." % bigram_hmm.ComputePercentAmbiguous(
        training_set_prep)
    print "Joint probability of the first sentence is %s." % bigram_hmm.JointProbability(
        training_set_prep[0])

    # Implement the most common class baseline. Report accuracy of the predicted tags.
    test_set_predicted_baseline = MostCommonClassBaseline(
        training_set_prep, test_set_prep)
    print "--- Most common class baseline accuracy ---"
    ComputeAccuracy(test_set_prep, test_set_predicted_baseline)

    # Use the Bigram HMM to predict tags for the test set. Report accuracy of the predicted tags.
    test_set_predicted_bigram_hmm = bigram_hmm.Test(test_set_prep)
    print "--- Bigram HMM accuracy ---"
    ComputeAccuracy(test_set_prep, test_set_predicted_bigram_hmm)
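
Not part of the example above: a minimal sketch of what a most-common-class baseline like the MostCommonClassBaseline call typically does, assuming it assigns each word its most frequent training tag and falls back to the overall most frequent tag for unseen words. The function name and behaviour below are assumptions, not the original implementation.

from collections import Counter, defaultdict

def most_common_class_baseline(training_set, test_set):
    # Count how often each tag was seen with each word in training.
    counts = defaultdict(Counter)
    for sent in training_set:
        for word, tag in sent:
            counts[word][tag] += 1
    # The overall most frequent tag is the fallback for unseen words.
    default_tag = Counter(tag for sent in training_set for _, tag in sent).most_common(1)[0][0]
    predicted = []
    for sent in test_set:
        predicted.append([(word,
                           counts[word].most_common(1)[0][0] if word in counts else default_tag)
                          for word, _ in sent])
    return predicted
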
Example #2
def replace_names(text, method="fake", replacechar="_"):
    newlines = []
    names = []
    fakenames = []
    print(st.tag_sents(text))
    lines = text.splitlines()
    for line in tqdm(lines):
        tagline = st.tag(line.split())
        for i, tag in enumerate(tagline):
            if tag[1] == "PERSON":
                newtag = ["", ""]
                word, classification = tag
                if method == "fake":
                    fake = getFakeFirstName()
                    fakenames.append(fake)
                    names.append(word)
                    if tag[0] in names:
                        newtag[0] = fakenames[names.index(word)]
                        print(newtag[0])
                    else:
                        newtag[0] = fakenames[i]
                else:
                    newtag[0] = replace_with_char(word, replacechar)
                newtag[1] = classification
                newtag = tuple(newtag)
                tagline[i] = newtag
        newline = " ".join(untag(tagline))
        newlines.append(newline)
    formatted = "\n".join(newlines)
    newlines.clear()
    fakenames.clear()
    return (formatted, names)
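
The snippet above relies on a module-level tagger st that labels tokens with classes such as "PERSON", plus getFakeFirstName() and replace_with_char() helpers, none of which are shown. A hedged setup sketch, assuming Stanford NER via nltk and the faker package; the model and jar paths are placeholders only:

from nltk.tag import StanfordNERTagger
from faker import Faker

# Placeholder paths; point these at a local Stanford NER download.
st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz', 'stanford-ner.jar')

_faker = Faker()

def getFakeFirstName():
    # Return a random plausible first name.
    return _faker.first_name()

def replace_with_char(word, replacechar="_"):
    # Mask a word with the replacement character, preserving its length.
    return replacechar * len(word)
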
Example #3
def replace_names_nltk(text, method="fake", replacechar="_"):
    newlines = []
    names = []
    fakenames = []
    lines = nltk.word_tokenize(text, preserve_line=True)
    tagline = nltk.pos_tag(lines)
    namedEnt = nltk.ne_chunk(tagline, binary=False)
    tree = namedEnt.pos()
    for i, tag in enumerate(tree):
        if "PERSON" in tag:
            print(tag[0][0])
            newtag = ["", ""]
            if method == "fake":
                fake = getFakeFirstName()
                names.append(tag[0][0])
                fakenames.append(fake)
                if tag[0][0] in names:
                    newtag[0] = fakenames[names.index(tag[0][0])]
                else:
                    newtag[0] = fake
            else:
                newtag[0] = replace_with_char(tag[0][0], replacechar)
            newtag[1] = "0"
            newtag = tuple(newtag)
            tagline[i] = newtag
    newline = d.detokenize(untag(tagline))
    # collapse the spaced right single quote (" ’ ") left by detokenization into an apostrophe
    subbed = re.sub(r'( ’ )', "'", newline)
    newlines.append(subbed)
    formatted = "\n".join(newlines)
    return (formatted, names)
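
This variant additionally relies on an unshown detokenizer d (and the same getFakeFirstName()/replace_with_char() helpers). Presumably the setup looks something like the following, though that is an assumption:

import re
import nltk
from nltk.tag.util import untag
from nltk.tokenize.treebank import TreebankWordDetokenizer

# d is the detokenizer used to glue the retagged tokens back into a sentence.
d = TreebankWordDetokenizer()
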
Example #4
def appenddata(sent_tag):
    X, y = [], []
    for tagged in sent_tag:
        X.append(
            [feature(untag(tagged), index) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])
    return X, y
Example #5
    def chunk(self, sentence):

        tagged_tree = self.parse(pos_tag(word_tokenize(sentence.lower())))

        chunks = []
        for subtree in tagged_tree.subtrees(filter=tree_filter):
            chunks.append(untag(subtree.leaves()))

        max_length = 0
        for i in range(len(chunks)):
            if len(chunks[i]) > max_length:
                chunk = chunks[i]
                max_length = len(chunks[i])

        output = ''
        if len(chunks) > 0:

            for i in range(len(chunk)):
                if not chunk[i] == '.' and not chunk[i] == ',' and not i == 0:
                    output = output + ' ' + chunk[i]
                else:
                    output = output + chunk[i]

            index = sentence.lower().find(output)
            output = sentence[index:len(output) + index]

        return output
Example #6
def transform_to_dataset(tagged_sentences):
    X, y = [], []
    for tagged in tagged_sentences:
        X.append(
            [features(untag(tagged), index) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])
    return X, y
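
Several of these transform_to_dataset() examples call a per-token features(sentence, index) helper that is not shown. A plausible sketch of such a feature extractor, purely illustrative (the original may differ):

def features(sentence, index):
    # `sentence` is an untagged list of words, `index` the position of the token.
    word = sentence[index]
    return {
        'word': word,
        'lowercase': word.lower(),
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': word[:1].isupper(),
        'is_numeric': word.isdigit(),
        'prefix-2': word[:2],
        'suffix-2': word[-2:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
    }
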
Example #7
def UDuntagged(path):
    # Use a context manager so the output file is closed properly.
    with open('UDuntagged.txt', 'w') as newfile:
        sents = [untag(sent) for sent in read_tagged_sents(path)]
        for sent in sents:
            for word in sent:
                newfile.write(word + ' ')
            newfile.write('\n')
Example #8
 def transform_to_dataset(tagged_sentences):
     x, y = [], []
 
     for tagged in tagged_sentences:
         x.append([features(untag(tagged), index) for index in range(len(tagged))])
         y.append([tag for _, tag in tagged])
 
     return x, y
Example #9
def evaluate(self, gold):
    "overriding evaluate from nltk.TaggerI, it seems to have a bug"
    tagged_sents = [
        list(s) for s in self.tag_sents(untag(sent) for sent in gold)
    ]
    gold_tokens = sum(gold, [])
    test_tokens = sum(tagged_sents, [])
    return accuracy(gold_tokens, test_tokens)
def main():
    treebank_tagged_sents = TreebankNoTraces()
    training_set = treebank_tagged_sents[:3000]
    test_set = treebank_tagged_sents[3000:]

    # 1. Preprocessing:
    vocabulary = Set_up_volcabulary(training_set)
    training_set_prep = PreprocessText(training_set, vocabulary)
    test_set_prep = PreprocessText(test_set, vocabulary)

    # Print the first sentence of each data set.
    print "--- First sentence of each data set after preprocessing ---"
    print " ".join(untag(training_set_prep[0]))
    print " ".join(untag(test_set_prep[0]))
    print "\n"

    # 2. Implement the most common class baseline. Report accuracy of the predicted tags.
    test_set_predicted_baseline = MostCommonClassBaseline(
        training_set_prep, test_set_prep)
    print "--- Most common class baseline accuracy ---"
    ComputeAccuracy(test_set_prep, test_set_predicted_baseline)
    print "\n"

    # Estimate Bigram HMM from the training set, report level of ambiguity.
    bigram_hmm = BigramHMM()
    bigram_hmm.Train(training_set_prep)
    print "--- Training ---"
    print "Percent tag ambiguity(tokens) in training set is %.2f%%." % bigram_hmm.ComputePercentAmbiguous(
        training_set_prep)
    print "For comparison, percent tag ambiguity(words) in training set is %.2f%%." % bigram_hmm.ComputePercentAmbiguous1(
        vocabulary)
    print "Joint probability of the first sentence is %s." % bigram_hmm.JointProbability(
        training_set_prep[0])
    print "\n"

    # Use the Bigram HMM to predict tags for the test set. Report accuracy of the predicted tags.
    test_set_predicted_bigram_hmm = bigram_hmm.Test(test_set_prep)
    print "--- Testing ---"
    print "--- Bigram HMM accuracy ---"
    ComputeAccuracy(test_set_prep, test_set_predicted_bigram_hmm)
    print "\n"

    # confusion matrix
    print "--- Confusion matrix ---"
    ConfusionMatrix(test_set_prep, test_set_predicted_bigram_hmm)
Example #11
    def splitWordsTagsTestingNoDelim(self):
        sentences = []
        tags = []

        for s in self.check_sents:
            # untag the sentence, and append it to the list of sentences
            sentences.append(untag(s))
            # add all tags in a sentence to a list of tags
            tags.append([t for (_, t) in s])
        return sentences, tags
Example #12
def BuildTrainVocab(training):
    first_occur = set()
    s_occur = set()
    for sent in training:
        for token in untag(sent):
            if (token not in s_occur and token in first_occur):
                s_occur.add(token)
            if (token not in first_occur):
                first_occur.add(token)
    return s_occur
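
BuildTrainVocab() above keeps only words seen at least twice in training. A hedged sketch of how such a vocabulary is typically consumed by a PreprocessText-style step, replacing out-of-vocabulary tokens with a sentinel; the sentinel and function name here are assumptions, not the original code:

UNK = '<UNK>'

def preprocess_text(tagged_sents, vocabulary):
    # Replace words outside the vocabulary with the UNK sentinel, keeping tags.
    return [[(word if word in vocabulary else UNK, tag) for word, tag in sent]
            for sent in tagged_sents]
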
Example #13
def main():
    character, pinyin = readCorpus(r'C:\Users\wruan02\Documents\GitHub\Comp150NLP')
    token_char = tokenizeCharacter(character[0:])
    token_pinyin = tokenizePinyin(pinyin[0:])
    dataset = organizeSentence(token_char, token_pinyin)

    training_set = dataset[0:500]
    test_set = dataset[501:]

    # get vocabulary first!
    vocabulary, labelset = getVoc(training_set)
    print len(vocabulary)
    print len(labelset)

    """ Transform the data sets by eliminating unknown words and adding sentence boundary tokens.
    """
    training_set_prep = PreprocessText(training_set, vocabulary)
    test_set_prep = PreprocessText(test_set, vocabulary)

    """ Print the first sentence of each data set.
        """
    # print training_set_prep[0]
    print " ".join(untag(training_set_prep[0]))  # See nltk.tag.util module.
    print " ".join(untag(test_set_prep[0]))
    print test_set_prep[0]

    bigram_hmm = BigramHMM(vocabulary, labelset)
    bigram_hmm.Train(training_set_prep)

    """ Implement the most common class baseline. Report accuracy of the predicted tags.
        """
    test_set_predicted_baseline = MostCommonClassBaseline(training_set_prep, test_set_prep, vocabulary, labelset)
    print "--- Most common class baseline accuracy ---"
    ComputeAccuracy(test_set_prep, test_set_predicted_baseline)

    print test_set_prep[0]
    print test_set_predicted_baseline[0]

    """ Use the Bigram HMM to predict tags for the test set. Report accuracy of the predicted tags.
    """
    test_set_predicted_bigram_hmm = bigram_hmm.Test(test_set_prep)
    print "--- Bigram HMM accuracy ---"
    ComputeAccuracy(test_set_prep, test_set_predicted_bigram_hmm)
Example #14
def RemoveWords_by_tag(text):
    remove_tag_list = ['JJ', 'JJR', 'JJS', 'RBR', 'RBS']
    token = ToktokTokenizer()
    words = token.tokenize(text)
    words_tagged = nltk.pos_tag(words)
    filtered = untag([
        w for w in words_tagged if not w[1] in remove_tag_list
    ])  # Keep only the words whose tag is not in the removal list

    return ' '.join(map(str, filtered))
def count_phrases(corpus, tagger, chunker):
    cfd = probability.ConditionalFreqDist()

    for sent in corpus.sents():
        tree = chunker.parse(tagger.tag(sent))

        for sub in tree.subtrees():
            if sub.node == 'S': continue
            words = untag(sub.leaves())
            if len(words) >= 2: cfd[sub.node].inc(' '.join(words))

    return cfd
def extract_address(chunker, sentence):
    """
    returns all addresses in sentence
    """
    def tree_filter(tree):
        return GPE_TAG == tree.label()

    tagged_tree = get_tagged_sentence(chunker, sentence)
    addresses = list()
    for subtree in tagged_tree.subtrees(filter=tree_filter):
        addresses.append(untag(subtree.leaves()))
    return addresses
Example #17
 def evaluate(self, gold):
     '''
     Score the accuracy of the tagger against the gold standard.
     Strip the tags from the gold standard text, retag it using
     the tagger, then compute the accuracy score.
     :param gold: the gold-standard tagged sentences
     :return: the accuracy score
     '''
     tagged_sents = self.tag_sents(untag(sent) for sent in gold)
     gold_tokens = sum(gold, [])
     test_tokens = sum(tagged_sents, [])
     return accuracy(gold_tokens, test_tokens)
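
The evaluate() overrides collected on this page all follow the strip-retag-score pattern described in the docstring above. A standalone sketch of that pattern using nltk's own helpers; any nltk.TaggerI-style tagger with a tag_sents() method would work here:

from nltk.tag.util import untag
from nltk.metrics import accuracy

def score_tagger(tagger, gold):
    # Strip the gold tags, retag the bare sentences, then compare token by token.
    retagged = tagger.tag_sents(untag(sent) for sent in gold)
    gold_tokens = [tok for sent in gold for tok in sent]
    test_tokens = [tok for sent in retagged for tok in sent]
    return accuracy(gold_tokens, test_tokens)
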
Example #18
        def pass_to_dataframe(taggedSentences):
            wordList = []  # X
            tagList = []  # Y

            for tagged in taggedSentences:
                wordList.append([
                    feature_extraction_function(untag(tagged), index)
                    for index in range(len(tagged))
                ])
                tagList.append([tag for _, tag in tagged])

            return wordList, tagList
    def metrics(self, gold, printout=True, confusion_matrix=False, oov=True):
        '''
        More sophisticated evalution method gives more numbers.

        :param gold: The sentences to use for testing
        :type gold: [[(str, str)]]
        :param printout: Should I print the results or just return them?
        :type printout: bool
        :param confusion_matrix: Should I create a Confusion Matrix?
        :type confusion_matrix: bool
        :param oov: Should the out of vocabulary words be calculated
        :type oov: bool
        :return: (acc, prec, rec, fsc, aov, None) the first five are the accuracy, precision, recall, fscore, and out of vocabulary words. The last one is the Confusion Matrix if requested, else None
        :rtype: (double, double, double, double, double, ConfusionMatrix or None)
        '''

        tagger_out = self._tagger.tag_sents(untag(sent) for sent in gold)
        gold_tokens = sum(gold, [])
        test_tokens = sum(tagger_out, [])
        gold_tokens_set = set(gold_tokens)
        test_tokens_set = set(test_tokens)

        gold_tags = [t for (_, t) in gold_tokens]
        test_tags = [t for (_, t) in test_tokens]

        # calculate out of vocabulary words
        if oov:
            d = {word: True for (word, _) in reduce(lambda a, b: a + b, self._tagged_sents, [])}
            aov = reduce(lambda a, b: a + 1 if not b in d else a, [w for (w, _) in gold_tokens], 0)
            aov = (aov * 100.0) / len(gold_tokens)
        else:
            aov = '-1'

        acc = accuracy(gold_tokens, test_tokens)
        prc = precision(gold_tokens_set, test_tokens_set)
        rec = recall(gold_tokens_set, test_tokens_set)
        fms = f_measure(gold_tokens_set, test_tokens_set)
        cfm = None

        if confusion_matrix:
            cfm = ConfusionMatrix(gold_tags, test_tags)

        if printout:
            print("accuracy:          " + str(acc))
            print("precision:         " + str(prc))
            print("recall:            " + str(rec))
            print("f-score:           " + str(fms))
            print("out of vocabulary: " + str(aov) + " %")
            if confusion_matrix:
                print(cfm)

        return acc, prc, rec, fms, aov, cfm
Example #20
def transform_to_dataset(tagged_sentences):
    """
    transform list of tagged sentences to list of untagged sentences and list of tags
    :param tagged_sentences: list of sentences, each contains tuples of (word, tag)
    :return: list of sentences, list of sentences tags
    the weights are currently ignored but still maintained
    """
    X, y = [], []
    for tagged, weight in tagged_sentences:
        X.append(
            [features(untag(tagged), index) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])
    return X, y
def count_phrases(corpus, tagger, chunker):
    cfd = probability.ConditionalFreqDist()

    for sent in corpus.sents():
        tree = chunker.parse(tagger.tag(sent))

        for sub in tree.subtrees():
            if sub.node == "S":
                continue
            words = untag(sub.leaves())
            if len(words) >= 2:
                cfd[sub.node].inc(" ".join(words))

    return cfd
Example #22
    def evaluate(self, gold):
        """
        Score the accuracy of the tagger against the gold standard.
        Strip the tags from the gold standard text, retag it using
        the tagger, then compute the accuracy score.

        :type gold: list(list(tuple(str, str)))
        :param gold: The list of tagged sentences to score the tagger on.
        :rtype: float
        """

        tagged_sents = self.tag_sents(untag(sent) for sent in gold)
        gold_tokens = sum(gold, [])
        test_tokens = sum(tagged_sents, [])
        return accuracy(gold_tokens, test_tokens)
Example #23
    def evaluate(self, gold):
        """
        Score the accuracy of the tagger against the gold standard.
        Strip the tags from the gold standard text, retag it using
        the tagger, then compute the accuracy score.

        :type gold: list(list(tuple(str, str)))
        :param gold: The list of tagged sentences to score the tagger on.
        :rtype: float
        """

        tagged_sents = self.tag_sents(untag(sent) for sent in gold)
        gold_tokens = sum(gold, [])
        test_tokens = sum(tagged_sents, [])
        return accuracy(gold_tokens, test_tokens)
Example #24
def transform_to_dataset(tagged_sentences, args):
    X, y = [], []
    for tagged in tagged_sentences:
        # sent = 'CNN reported that Republican leader Bill Frist should have what is known to be dangerous.'
        # token = nlp.word_tokenize(sent)
        token = untag(tagged)
        sent = " ".join(token)
        pos = nlp.pos_tag(sent)
        pos_tag = copy.deepcopy(pos)
        tagged = tupple_allign(tagged, pos_tag)
        token = [x[0] for x in pos]
        assert len(tagged) == len(pos)
        dparse = nlp.dependency_parse(sent)
        word_lem = [stemmer.stem(x) for x in token]
        X.append([features(sent, word_lem, token, pos, dparse, index, args) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])
    return X, y
Example #25
    def _confusion_cached(self, gold):
        """
        Inner function used after ``gold`` is converted to a
        ``tuple(tuple(tuple(str, str)))``. That way, we can use caching on
        creating a ConfusionMatrix.

        :param gold: The list of tagged sentences to run the tagger with,
            also used as the reference values in the generated confusion matrix.
        :type gold: tuple(tuple(tuple(str, str)))
        :rtype: ConfusionMatrix
        """

        tagged_sents = self.tag_sents(untag(sent) for sent in gold)
        gold_tokens = [token for _word, token in chain.from_iterable(gold)]
        test_tokens = [
            token for _word, token in chain.from_iterable(tagged_sents)
        ]
        return ConfusionMatrix(gold_tokens, test_tokens)
Example #26
def extract_address(chunker, sentence):
    """
    returns all addresses in sentence
    """
    def tree_filter(tree):
        return GPE_TAG == tree.label()

    tagged_tree = get_tagged_sentence(chunker, sentence)
    addresses = list()
    realAddresses = []
    for subtree in tagged_tree.subtrees(filter=tree_filter):
        addresses.append(untag(subtree.leaves()))
    for address in addresses:
      try:
        houseNum = int(address[0])
        realAddresses.append(address)
      except ValueError:
        continue
    return {"ADDRESSES": addresses, "PARSEDADDRESSES": realAddresses}
def tokenizacion(archivo):
    # Tokens
    palabras = nltk.word_tokenize(archivo.read())

    palabraSucias = []

    for palabra in palabras:
        palabraSucias.append(str2tuple(palabra))
        # print(palabraSucias)

    textoLimpio = untag(palabraSucias)

    # Remove punctuation marks (single-character tokens)
    textoLimpio = [palabra for palabra in textoLimpio if len(palabra) > 1]
    #
    # for texto in textoLimpio:
    #     if (texto == '``'):
    #         textoLimpio=textoLimpio.remove(texto)

    textoLimpio = [palabra.lower() for palabra in textoLimpio]
    return textoLimpio
Example #28
def person_connotation(tweet, name):
    """
    Decide whether a person is talked favorably about or not, based on the
    tone of the sentences in which their name appears
    """
    twtcontent = sent_tokenize(tweet)
    overall = {'compound': 0, 'neg': 0, 'neu': 0, 'pos': 0}
    mentions = 0
    # analyze each sentence talking about `name` person
    for s in twtcontent:
        tags = get_tweet_tags(s)
        # if the name appears in the tagged sentence, get its tone
        if (name, 'NNP') in tags:
            sentence = util.untag(tags)
            scores = tweet_connotation(' '.join(sentence))
            # add it up to the overall tweet's tone
            for z in scores:
                overall[z] += scores[z]
            mentions += 1
    # averaging all sentences' scores. don't wanna divide by zero now do we
    if mentions != 0:
        for v in overall:
            overall[v] = round(overall[v] / mentions, 3)
    return overall
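
person_connotation() averages per-sentence scores whose keys ('compound', 'neg', 'neu', 'pos') match nltk's VADER output, so the unshown tweet_connotation() helper is presumably something like the sketch below. This is an assumption, not the original code:

from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Requires the 'vader_lexicon' resource: nltk.download('vader_lexicon')
_sia = SentimentIntensityAnalyzer()

def tweet_connotation(text):
    # Returns a dict with 'neg', 'neu', 'pos' and 'compound' scores.
    return _sia.polarity_scores(text)
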
Example #29
    def viterbi(self, sequence):
        """ Runs the viterbi algorithm and returns the best labeling
        of the partially unlabeled sequence provided

        The viterbi modification shown here only predicts the hidden
        state if the state associated with the sequence is BLANK, otherwise
        the hidden state is given to us

        Note this implementation doesn't consider all states for an
        observation, just the ones seen with it at training time

        Args:
        sequence -- labeled sequence
        """

        viterbi = defaultdict(lambda: defaultdict(float))
        backpointer = defaultdict(lambda: defaultdict(str))

        most_common_state = self.state_counts.most_common(1)[0][0]

        # init, generalized to account for start token padding
        for i in range(1, self.n):
            viterbi[START_TOKEN][i] = 1.0
            backpointer[START_TOKEN][i] = (START_TOKEN,) * (self.n - 1)
        prev_states = [(START_TOKEN,) * (self.n - 1)]


        allinf = True   # if all log probs at time step are -inf
        t = len(sequence)
        # recursion
        for i in range(self.n, t):
            allinf = True
            obs = sequence[i - 1][0]
            potential_state = sequence[i - 1][1]

            # determine whether we need to attempt to label
            if potential_state == BLANK_TOKEN:
                # add most common tag in case no states for tag, 
                # also ensures prev not empty
                states = self.B[obs].keys() + [most_common_state]
            else:
                # if we don't need to label, only give one option
                states = [potential_state]
                
            for q in states:
                bp, max_prob = self.__viterbi_prob(viterbi, obs, q, \
                    prev_states, i)
                viterbi[q][i] = max_prob
                backpointer[q][i] = bp

                if max_prob != float("-inf"): # received non -inf log prob
                    allinf = False

            # if state prob is 0 for all obs, reset viterbi matrix at time i
            if allinf:
                for q in states:
                    viterbi[q][i] = 0.0

            # get list of obs from previous obs state
            prev_states = [tuple(backpointer[s][i][1:]) + (s,) for s in states]


        # finalize
        bp, max_prob = self.__viterbi_prob(viterbi, sequence[t-1][0], \
            END_TOKEN, prev_states, t)
        viterbi[END_TOKEN][t] = max_prob
        backpointer[END_TOKEN][t] = bp

        labeled_seq = zip(untag(sequence), \
                          self.__follow_backpointers(backpointer, t))
        return labeled_seq
def main():
    character, pinyin = readCorpus(r'C:\Users\wruan02\Documents\GitHub\Comp150NLP')
    token_char = tokenizeCharacter(character[0:])
    token_pinyin = tokenizePinyin(pinyin[0:])

    # print len(token_char)
    # print len(token_pinyin)

    # token_pinyin.pop(45730)

    # print len(token_pinyin)
    # print token_pinyin[45700:45734]
    # characterPrint(token_char[45700:45734])


    dataset = organizeSentence(token_char, token_pinyin)

    print len(dataset)

    """ Supervised Learning
    """

    training_set = copy.deepcopy(dataset[0:1000])
    test_set = copy.deepcopy(dataset[1001:])

    # get vocabulary first!
    vocabulary, labelset = getVoc(training_set)
    # print len(vocabulary)
    # print len(labelset)


    """ Transform the data sets by eliminating unknown words and adding sentence boundary tokens.
    """
    training_set_prep = PreprocessText(training_set, vocabulary)
    test_set_prep = PreprocessText(test_set, vocabulary)

    # supervised with laplace smoothing
    bigram_hmm_laplace = BigramHMM()
    bigram_hmm_laplace.supervisedLearning(training_set_prep,"Laplace")

    # supervised with no smoothing in learning probabilities but use most common to smooth in Viterbi
    bigram_hmm_mostcommon = BigramHMM()
    bigram_hmm_mostcommon.supervisedLearning(training_set_prep, "no smoothing")
    bigram_hmm_mostcommon.getMostCommon(training_set_prep)


    """ Implement the most common class baseline. Report accuracy of the predicted tags.
        """
    test_set_predicted_baseline = MostCommonClassBaseline(training_set_prep, test_set_prep)
    print "--- Most common class baseline accuracy ---"
    ComputeAccuracy(test_set_prep, test_set_predicted_baseline)

    # print test_set_prep[0]
    # print test_set_predicted_baseline[0]

    """ Use the Bigram HMM to predict tags for the test set. Report accuracy of the predicted tags.
    """
    test_set_predicted_bigram_hmm_laplace = bigram_hmm_laplace.Test(test_set_prep,"Laplace")
    test_set_predicted_bigram_hmm_mostcommon = bigram_hmm_mostcommon.Test(test_set_prep, "Most Common")
    print "--- Bigram HMM with most common accuracy ---"
    ComputeAccuracy(test_set_prep, test_set_predicted_bigram_hmm_mostcommon)
    print "--- Bigram HMM with Laplace accuracy ---"
    ComputeAccuracy(test_set_prep, test_set_predicted_bigram_hmm_laplace)
    # print test_set_predicted_bigram_hmm[0]
    # print " ".join(untag(test_set_predicted_bigram_hmm[20]))
    print "\n"    
    print "A sequence of Pinyin words:"
    print " ".join(untag(test_set_prep[25]))
    print "Common baseline results:"
    displayTag(test_set_predicted_baseline[25])
    print "HMM with most common results:"
    displayTag(test_set_predicted_bigram_hmm_mostcommon[25])
    print "HMM with laplace result:"
    displayTag(test_set_predicted_bigram_hmm_laplace[25])



    # """ Semi_supervised Learning starts here
    # """
    #
    # training_set_tagged = copy.deepcopy(dataset[0:800])
    # training_set_untagged = copy.deepcopy(dataset[801:1000])
    # test_set = copy.deepcopy(dataset[1001:])
    #
    # # get vocabulary first!
    # vocabulary, labelset = getVoc(training_set)
    #
    #
    # """ Transform the data sets by eliminating unknown words and adding sentence boundary tokens.
    # """
    # training_set_tagged_prep = PreprocessText(training_set_tagged, vocabulary)
    # training_set_untagged_prep = PreprocessText(training_set_untagged, vocabulary)
    # test_set_prep = PreprocessText(test_set, vocabulary)
    #
    # training_with_tag = training_set_tagged_prep
    # training_without_tag = []
    # for sent in training_set_untagged_prep:
    #     temp = []
    #     for tup in sent:
    #         temp.append(tup[0])
    #     training_without_tag.append(temp)
    #
    # bigram_hmm_semi = BigramHMM()
    # bigram_hmm_semi.semisupervisedLearning(training_with_tag, training_without_tag, "Laplace")
    # bigram_hmm_semi.getMostCommon(training_with_tag)
    #
    # """ Implement the most common class baseline. Report accuracy of the predicted tags.
    #     """
    # test_set_predicted_baseline = MostCommonClassBaseline(training_set_tagged_prep, test_set_prep)
    # print "--- Most common class baseline accuracy ---"
    # ComputeAccuracy(test_set_prep, test_set_predicted_baseline)
    #
    #
    # """ Use the Bigram HMM to predict tags for the test set. Report accuracy of the predicted tags.
    # """
    # test_set_predicted_bigram_hmm = bigram_hmm.Test(test_set_prep, "Laplace")
    # print "--- Bigram HMM accuracy ---"
    # ComputeAccuracy(test_set_prep, test_set_predicted_bigram_hmm)
    # print test_set_predicted_bigram_hmm[0]
    # print " ".join(untag(test_set_predicted_bigram_hmm[20]))
    # print "Common baseline results:"
    # displayTag(test_set_predicted_baseline[25])
    # print "HMM result:"
    # displayTag(test_set_predicted_bigram_hmm[25])



    """ Print the first sentence of each data set.
        """
    print '\n'
    print "A sequence of Pinyin words:"
    print " ".join(untag(training_set_prep[0]))  # See nltk.tag.util module.
    print "It's corresponding Chinese characters:"
    displayTag(training_set_prep[0])
    print "A sequence of Pinyin words:"
    print " ".join(untag(test_set_prep[1]))
    print "It's corresponding Chinese characters:"
    displayTag(test_set_prep[1])
Example #31
import nltk
from nltk.tag.util import str2tuple
from nltk.tag.util import untag
from nltk.tag.util import tuple2str

textoSucio = "It/pps recommended/vbd that/cs Fulton/np legislators/nns act/vb ``/`` to/to have/hv these/dts laws/nns studied/vbn and/cc revised/vbn to/in the/at end/nn of/in modernizing/vbg and/cc improving/vbg them/ppo ''/'' ./."
palabras = nltk.word_tokenize(textoSucio)
palabraSucias = []

for palabra in palabras:
    palabraSucias.append(str2tuple(palabra))
    # print(palabraSucias)

textoLimpio = untag(palabraSucias)

textoLimpio = [palabra for palabra in textoLimpio if len(palabra) > 1]

# Drop the backtick quote tokens left over from the tagged text
textoLimpio = [texto for texto in textoLimpio if texto != '``']

# stopwords = set(nltk.corpus.stopwords.words('english'))  # StopWords Configuracion
# textoLimpio = [palabra for palabra in textoLimpio if palabra not in stopwords]

print(textoLimpio)
Example #32
 def evaluate(self, gold):
     tagged_sents = self.tag_sents(untag(sent) for sent in gold)
     gold_tokens = list(itertools.chain(*gold))
     print(json.dumps(gold_tokens))
     print(len(tagged_sents), len(gold_tokens))
     test_tokens = list(itertools.chain(*tagged_sents))
     return accuracy(gold_tokens, test_tokens)
Example #33
        tagged_sents = self.tag_sents(untag(sent) for sent in gold)
        gold_tokens = list(itertools.chain(*gold))
        print(json.dumps(gold_tokens))
        print(len(tagged_sents), len(gold_tokens))
        test_tokens = list(itertools.chain(*tagged_sents))
        return accuracy(gold_tokens, test_tokens)


if __name__ == '__main__':
    sents = treebank.tagged_sents()
    PT = PerceptronTagger()

    print("Timing NLTK ...")
    pt_times = []
    for _ in range(5):
        now = time.time()
        PT.tag_sents(untag(sent) for sent in sents)
        pt_times.append(time.time() - now)
    pt_time = round(sum(pt_times) / len(pt_times), 3)
    '''NOTE: Moved to tag_test.go
    print("Timing prose ...")
    acc = round(APTagger().evaluate(sents), 3)
    ap_time = round(sum(AP_TIME) / len(AP_TIME), 3)
    '''

    print("Evaluating accuracy ...")
    headers = ['Library', 'Accuracy', '5-Run Average (sec)']
    table = [
        ['NLTK', round(PT.evaluate(sents), 3), pt_time],
        # ['`prose`', acc, ap_time]
    ]
    print(tabulate(table, headers, tablefmt='pipe'))
    def Viterbi(self, sent):
        # viterbi is a list of dictionaries mapping each tag t to the probability of the best tag sequence that ends in t
        viterbi = []
        # backpointer is a list of dictionaries mapping each tag t to the previous tag in the best tag sequence
        backpointer = []

        first_viterbi = {}
        first_backpointer = {}
        for tag in self.dictionary[sent[1][0]]:
            if tag == start_token:
                continue
            # we start with the first meaningful word in the sentence, since every sentence starts with <S>
            first_viterbi[tag] = self.log_transitions.get(
                (start_token, tag), -float("inf")) + self.log_emissions.get(
                    (sent[1][0], tag), -float("inf"))
            first_backpointer[tag] = start_token
        viterbi.append(first_viterbi)
        backpointer.append(first_backpointer)

        for word_index in range(2, len(sent) - 1):
            this_viterbi = {}
            this_backpointer = {}
            prev_viterbi = viterbi[-1]
            for cur_tag in self.dictionary[sent[word_index][0]]:
                if cur_tag == start_token:
                    continue
                best_pre_tag = None
                best_prob = -float("inf")
                for pre_tag in self.dictionary[sent[word_index - 1][0]]:
                    this_prob = prev_viterbi.get(
                        pre_tag, -float("inf")) + self.log_transitions.get(
                            (pre_tag, cur_tag), -float("inf"))
                    if this_prob > best_prob:
                        best_pre_tag = pre_tag
                        best_prob = this_prob
                this_viterbi[cur_tag] = prev_viterbi.get(
                    best_pre_tag, -float("inf")) + self.log_transitions.get(
                        (best_pre_tag, cur_tag),
                        -float("inf")) + self.log_emissions.get(
                            (sent[word_index][0], cur_tag), -float("inf"))
                this_backpointer[cur_tag] = best_pre_tag
            viterbi.append(this_viterbi)
            backpointer.append(this_backpointer)
        # Done with all meaningful words in the sentence. Now calculate the prob of each tag followed by </S>
        prev_viterbi = viterbi[-1]
        best_pre_tag = None
        best_prob = -float("inf")
        for pre_tag in self.dictionary[sent[-2][0]]:
            this_prob = prev_viterbi.get(
                pre_tag, -float("inf")) + self.log_transitions.get(
                    (pre_tag, end_token), -float("inf"))
            if this_prob > best_prob:
                best_pre_tag = pre_tag
                best_prob = this_prob
        log_prob_tag_seq = prev_viterbi.get(
            best_pre_tag, -float("inf")) + self.log_transitions.get(
                (best_pre_tag, end_token), -float("inf"))
        prob_tag_seq = exp(log_prob_tag_seq)

        # Get the best tag sequence
        if prob_tag_seq == 0.0:
            return []  # All branches are 0, so skip this sentence
        else:
            best_tag_seq = [end_token, best_pre_tag]
            backpointer.reverse()
            cur_best_tag = best_pre_tag
            for bp in backpointer:
                best_tag_seq.append(bp[cur_best_tag])
                cur_best_tag = bp[cur_best_tag]
            best_tag_seq.reverse()
            sent_predicted = zip(untag(sent), best_tag_seq)
            return sent_predicted
Example #35
 def all_sents(self):
     '''Set the class's "sents" attribute.'''
     self.sents = [untag(i) for i in self.tagged_sents]
Example #36
    def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
        """
        Trains the Brill tagger on the corpus *train_sents*,
        producing at most *max_rules* transformations, each of which
        reduces the net number of errors in the corpus by at least
        *min_score*, and each of which has accuracy not lower than
        *min_acc*.

        #imports
        >>> from nltk.tbl.template import Template
        >>> from nltk.tag.brill import Pos, Word
        >>> from nltk.tag import RegexpTagger, BrillTaggerTrainer

        #some data
        >>> from nltk.corpus import treebank
        >>> training_data = treebank.tagged_sents()[:100]
        >>> baseline_data = treebank.tagged_sents()[100:200]
        >>> gold_data = treebank.tagged_sents()[200:300]
        >>> testing_data = [untag(s) for s in gold_data]

        >>> backoff = RegexpTagger([
        ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
        ... (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        ... (r'.*able$', 'JJ'),                # adjectives
        ... (r'.*ness$', 'NN'),                # nouns formed from adjectives
        ... (r'.*ly$', 'RB'),                  # adverbs
        ... (r'.*s$', 'NNS'),                  # plural nouns
        ... (r'.*ing$', 'VBG'),                # gerunds
        ... (r'.*ed$', 'VBD'),                 # past tense verbs
        ... (r'.*', 'NN')                      # nouns (default)
        ... ])

        >>> baseline = backoff #see NOTE1

        >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS
        0.2450142...

        #templates
        >>> Template._cleartemplates() #clear any templates created in earlier tests
        >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

        #construct a BrillTaggerTrainer
        >>> tt = BrillTaggerTrainer(baseline, templates, trace=3)

        >>> tagger1 = tt.train(training_data, max_rules=10)
        TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None)
        Finding initial useful rules...
            Found 845 useful rules.
        <BLANKLINE>
                   B      |
           S   F   r   O  |        Score = Fixed - Broken
           c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
           o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
           r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
           e   d   n   r  |  e
        ------------------+-------------------------------------------------------
         132 132   0   0  | AT->DT if Pos:NN@[-1]
          85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
          69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
          51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
          47  63  16 161  | NN->IN if Pos:NNS@[-1]
          33  33   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
          26  26   0   0  | IN->. if Pos:NNS@[-1] & Word:.@[0]
          24  24   0   0  | IN->, if Pos:NNS@[-1] & Word:,@[0]
          22  27   5  24  | NN->-NONE- if Pos:VBD@[-1]
          17  17   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]

        >>> tagger1.rules()[1:3]
        (Rule('001', 'NN', ',', [(Pos([-1]),'NN'), (Word([0]),',')]), Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]))

        >>> train_stats = tagger1.train_stats()
        >>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
        [1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]

        >>> tagger1.print_template_statistics(printunused=False)
        TEMPLATE STATISTICS (TRAIN)  2 templates, 10 rules)
        TRAIN (   2417 tokens) initial  1775 0.2656 final:  1269 0.4750
        #ID | Score (train) |  #Rules     | Template
        --------------------------------------------
        001 |   305   0.603 |   7   0.700 | Template(Pos([-1]),Word([0]))
        000 |   201   0.397 |   3   0.300 | Template(Pos([-1]))
        <BLANKLINE>
        <BLANKLINE>

        >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS
        0.43996...

        >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)

        >>> tagged[33][12:] == [('foreign', 'IN'), ('debt', 'NN'), ('of', 'IN'), ('$', 'NN'), ('64', 'CD'),
        ... ('billion', 'NN'), ('*U*', 'NN'), ('--', 'NN'), ('the', 'DT'), ('third-highest', 'NN'), ('in', 'NN'),
        ... ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')]
        True

        >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
        [1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]

        # a high-accuracy tagger
        >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)
        TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99)
        Finding initial useful rules...
            Found 845 useful rules.
        <BLANKLINE>
                   B      |
           S   F   r   O  |        Score = Fixed - Broken
           c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
           o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
           r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
           e   d   n   r  |  e
        ------------------+-------------------------------------------------------
         132 132   0   0  | AT->DT if Pos:NN@[-1]
          85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
          69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
          51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
          36  36   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
          26  26   0   0  | NN->. if Pos:NNS@[-1] & Word:.@[0]
          24  24   0   0  | NN->, if Pos:NNS@[-1] & Word:,@[0]
          19  19   0   6  | NN->VB if Pos:TO@[-1]
          18  18   0   0  | CD->-NONE- if Pos:NN@[-1] & Word:0@[0]
          18  18   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]

        >>> tagger2.evaluate(gold_data)  # doctest: +ELLIPSIS
        0.44159544...
        >>> tagger2.rules()[2:4]
        (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')]))

        # NOTE1: (!!FIXME) A far better baseline uses nltk.tag.UnigramTagger,
        # with a RegexpTagger only as backoff. For instance,
        # >>> baseline = UnigramTagger(baseline_data, backoff=backoff)
        # However, as of Nov 2013, nltk.tag.UnigramTagger does not yield consistent results
        # between python versions. The simplistic backoff above is a workaround to make doctests
        # get consistent input.

        :param train_sents: training data
        :type train_sents: list(list(tuple))
        :param max_rules: output at most max_rules rules
        :type max_rules: int
        :param min_score: stop training when no rules better than min_score can be found
        :type min_score: int
        :param min_acc: discard any rule with lower accuracy than min_acc
        :type min_acc: float or None
        :return: the learned tagger
        :rtype: BrillTagger

        """
        # FIXME: several tests are a bit too dependent on tracing format
        # FIXME: tests in trainer.fast and trainer.brillorig are exact duplicates

        # Basic idea: Keep track of the rules that apply at each position.
        # And keep track of the positions to which each rule applies.

        # Create a new copy of the training corpus, and run the
        # initial tagger on it.  We will progressively update this
        # test corpus to look more like the training corpus.
        test_sents = [list(self._initial_tagger.tag(untag(sent)))
                      for sent in train_sents]

        # Collect some statistics on the training process
        trainstats = {}
        trainstats['min_acc'] = min_acc
        trainstats['min_score'] = min_score
        trainstats['tokencount'] = sum(len(t) for t in test_sents)
        trainstats['sequencecount'] = len(test_sents)
        trainstats['templatecount'] = len(self._templates)
        trainstats['rulescores'] = []
        trainstats['initialerrors'] = sum(
            tag[1] != truth[1]
            for paired in zip(test_sents, train_sents)
            for (tag, truth) in zip(*paired)
        )
        trainstats['initialacc'] = 1 - trainstats['initialerrors']/trainstats['tokencount']
        if self._trace > 0:
            print("TBL train (fast) (seqs: {sequencecount}; tokens: {tokencount}; "
                  "tpls: {templatecount}; min score: {min_score}; min acc: {min_acc})".format(**trainstats))

        # Initialize our mappings.  This will find any errors made
        # by the initial tagger, and use those to generate repair
        # rules, which are added to the rule mappings.
        if self._trace:
            print("Finding initial useful rules...")
        self._init_mappings(test_sents, train_sents)
        if self._trace:
            print(("    Found %d useful rules." % len(self._rule_scores)))

        # Let the user know what we're up to.
        if self._trace > 2:
            self._trace_header()
        elif self._trace == 1:
            print("Selecting rules...")

        # Repeatedly select the best rule, and add it to `rules`.
        rules = []
        try:
            while (len(rules) < max_rules):
                # Find the best rule, and add it to our rule list.
                rule = self._best_rule(train_sents, test_sents, min_score, min_acc)
                if rule:
                    rules.append(rule)
                    score = self._rule_scores[rule]
                    trainstats['rulescores'].append(score)
                else:
                    break  # No more good rules left!

                # Report the rule that we found.
                if self._trace > 1:
                    self._trace_rule(rule)

                # Apply the new rule at the relevant sites
                self._apply_rule(rule, test_sents)

                # Update _tag_positions[rule.original_tag] and
                # _tag_positions[rule.replacement_tag] for the affected
                # positions (i.e., self._positions_by_rule[rule]).
                self._update_tag_positions(rule)

                # Update rules that were affected by the change.
                self._update_rules(rule, train_sents, test_sents)

        # The user can cancel training manually:
        except KeyboardInterrupt:
            print("Training stopped manually -- %d rules found" % len(rules))

        # Discard our tag position mapping & rule mappings.
        self._clean()
        trainstats['finalerrors'] = trainstats['initialerrors'] - sum(trainstats['rulescores'])
        trainstats['finalacc'] = 1 - trainstats['finalerrors']/trainstats['tokencount']
        # Create and return a tagger from the rules we found.
        return BrillTagger(self._initial_tagger, rules, trainstats)
def processContent():
    
    for item in text4:
        
    
        grammar = r"""
        ProperNoun: {<NNP|NNS>}
        Verb: {<BT|VBD>|<VBD>|<BT>*<VBD>}
        OtherNouns: {<IN><NP>|*<NNS|NN>*|<PRP\$>?<NN>+|<PRP\$>?}
        jj: {*<JJ>*}
        Pronoun:{<PRP>|<PRP\$>|<PRO>}
        NonlivingPronoun: {<NLPRO>}
        """

        train_sents = [
                       [('it', 'NLPRO')],
                       [('walk', 'NN')],
                       [('discovers', 'VBD')],
                       [('finds','VBD')],
                       [('meets','VBD')]
                       ]
        tokenized = nltk.word_tokenize(item)
        tagger = nltk.UnigramTagger(train_sents, backoff=default_tagger)
        i=tagger.tag(tokenized)
                       
        cp = nltk.RegexpParser(grammar)
        result = cp.parse(i)
    
    G=nx.DiGraph()
    Gc=nx.DiGraph()
    Gcc=nx.DiGraph()
    
    flag=0
    for a in result:
        if type(a) is nltk.Tree:
            
            if(a.label()=='ProperNoun') :
                cur=a.leaves()
                ProperNoun.append(cur)
                
                if flag == 0:
                    n1=a.leaves()
                    un1=untag(n1)
                else:
                    n2=a.leaves()
                    un2=untag(n2)
                    print(un1, uv, un2)
                    G.add_node(tuple(un1))
                    G.add_node(tuple(un2))
                    G.add_edge(tuple(un1),tuple(un2),label=tuple(uv))
                    n1=n2
                    un1=untag(n1)
                    flag=0
            if(a.label()=='Noun2'):
                cur2=a.leaves()
                noun.append(cur2)
                
                if flag == 0:
                    n1=a.leaves()
                    un1=untag(n1)
                else:
                    n2=a.leaves()
                    un2=untag(n2)
                print(un1, uv, un2)
                Gcc.add_node(tuple(un1))
                Gcc.add_node(tuple(un2))
                Gcc.add_edge(tuple(un1),tuple(un2),label=tuple(uv))
                flag=0
            if(a.label()=='OtherNouns') :
                cur1=a.leaves()
                noun.append(cur1)
                if flag == 0:
                    n1=a.leaves()
                    un1=untag(n1)
                else:
                    n2=a.leaves()
                    un2=untag(n2)
                    print(un1, uv, un2)
                    Gc.add_node(tuple(un1))
                    Gc.add_node(tuple(un2))
                    Gc.add_edge(tuple(un1),tuple(un2),label=tuple(uv))
                    flag=0
                    n1=n2
                    un1=untag(n1)
            if(a.label()=='Pronoun') :
                #print('Pronoun',a.leaves(),'stands for',cur)
                ProperNoun.append(cur)
                if flag == 0:
                    n1=cur
                    un1=untag(n1)
                #print('n1', n1)
                else:
                    n2=cur
                    un2=untag(n2)
                    print('pronoun::', un1, uv, un2)
                    G.add_node(tuple(un1))
                    G.add_node(tuple(un2))
                    G.add_edge(tuple(un1),tuple(un2),label=tuple(uv))
                    flag=0
                    n1=n2
                    un1=untag(n1)
            if(a.label()=='NonlivingPronoun') :
                #print('Pronoun',a.leaves(),'stands for',cur1)
                noun.append(cur1)
                if flag == 0:
                    n1=cur1
                    un1=untag(n1)
                else:
                    n2=cur1
                    un2=untag(n2)
                    print(un1, uv, un2)
                    G.add_node(tuple(un1))
                    G.add_node(tuple(un2))
                    G.add_edge(tuple(un1),tuple(un2),label=tuple(uv))
                    flag=0
                    n1=n2
                    un1=untag(n1)
            if(a.label()=='Verb') :
                flag=1
                v=a.leaves()
                uv=untag(v)
                VerbList.append(v)
    
    print('List of living nouns')
    print(ProperNoun)
    print('List of Non living nouns')
    print(noun)
    print('List of verbs')
    print(VerbList)
    
    graph_pos=nx.spring_layout(G)
    graph_pos=nx.spring_layout(Gc)
    graph_pos=nx.spring_layout(Gcc)
    nx.draw_networkx_nodes(G,graph_pos,node_size=3000,
                           alpha=0.3, node_color='red',node_shape='o')
    nx.draw_networkx_nodes(Gc,graph_pos,node_size=2000,
                               alpha=0.3, node_color='green',node_shape='o')
    nx.draw_networkx_nodes(Gcc,graph_pos,node_size=1000,
                                   alpha=0.3, node_color='yellow',node_shape='o')
    nx.draw_networkx_edges(G,graph_pos,width=1,
                                                  alpha=0.3,edge_color='blue')
    nx.draw_networkx_labels(G, graph_pos,font_size=10,
                                                                    font_family='sans-serif')
    nx.draw_networkx_edges(Gc,graph_pos,width=1,alpha=0.3,edge_color='blue')
    nx.draw_networkx_labels(G, graph_pos,font_size=10,font_family='sans-serif')
    nx.draw_networkx_edges(Gcc,graph_pos,width=1,alpha=0.3,edge_color='blue')
    nx.draw_networkx_labels(G, graph_pos,font_size=10,font_family='sans-serif')
    nx.draw_networkx_edge_labels(G, graph_pos,font_size=10,label_pos=0.3)
    nx.draw_networkx_edge_labels(Gc, graph_pos,font_size=10,label_pos=0.3)
    nx.draw_networkx_edge_labels(Gcc, graph_pos,font_size=10,label_pos=0.3)
                                                                                                       
    plt.show()
    result.draw()
Example #38
from nltk.corpus import treebank
from nltk.tag.util import untag

sentences = treebank.tagged_sents()
text = []
for s in sentences:
    text.append(' '.join(untag(s)))
print(' '.join(text))
Example #39
 def evaluate(self, gold):
     tagged_sents = self.tag_sents(untag(sent) for sent in gold)
     gold_tokens = list(itertools.chain(*gold))
     test_tokens = list(itertools.chain(*tagged_sents))
     return accuracy(gold_tokens, test_tokens)
Example #40
    def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
        """
        Trains the Brill tagger on the corpus *train_sents*,
        producing at most *max_rules* transformations, each of which
        reduces the net number of errors in the corpus by at least
        *min_score*, and each of which has accuracy not lower than
        *min_acc*.

        #imports
        >>> from nltk.tbl.template import Template
        >>> from nltk.tag.brill import Pos, Word
        >>> from nltk.tag import RegexpTagger, BrillTaggerTrainer

        #some data
        >>> from nltk.corpus import treebank
        >>> training_data = treebank.tagged_sents()[:100]
        >>> baseline_data = treebank.tagged_sents()[100:200]
        >>> gold_data = treebank.tagged_sents()[200:300]
        >>> testing_data = [untag(s) for s in gold_data]

        >>> backoff = RegexpTagger([
        ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
        ... (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        ... (r'.*able$', 'JJ'),                # adjectives
        ... (r'.*ness$', 'NN'),                # nouns formed from adjectives
        ... (r'.*ly$', 'RB'),                  # adverbs
        ... (r'.*s$', 'NNS'),                  # plural nouns
        ... (r'.*ing$', 'VBG'),                # gerunds
        ... (r'.*ed$', 'VBD'),                 # past tense verbs
        ... (r'.*', 'NN')                      # nouns (default)
        ... ])

        >>> baseline = backoff #see NOTE1

        >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS
        0.2450142...

        #templates
        >>> Template._cleartemplates() #clear any templates created in earlier tests
        >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

        #construct a BrillTaggerTrainer
        >>> tt = BrillTaggerTrainer(baseline, templates, trace=3)

        >>> tagger1 = tt.train(training_data, max_rules=10)
        TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None)
        Finding initial useful rules...
            Found 845 useful rules.
        <BLANKLINE>
                   B      |
           S   F   r   O  |        Score = Fixed - Broken
           c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
           o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
           r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
           e   d   n   r  |  e
        ------------------+-------------------------------------------------------
         132 132   0   0  | AT->DT if Pos:NN@[-1]
          85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
          69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
          51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
          47  63  16 161  | NN->IN if Pos:NNS@[-1]
          33  33   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
          26  26   0   0  | IN->. if Pos:NNS@[-1] & Word:.@[0]
          24  24   0   0  | IN->, if Pos:NNS@[-1] & Word:,@[0]
          22  27   5  24  | NN->-NONE- if Pos:VBD@[-1]
          17  17   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]

        >>> tagger1.rules()[1:3]
        (Rule('001', 'NN', ',', [(Pos([-1]),'NN'), (Word([0]),',')]), Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]))

        >>> train_stats = tagger1.train_stats()
        >>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
        [1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]

        >>> tagger1.print_template_statistics(printunused=False)
        TEMPLATE STATISTICS (TRAIN)  2 templates, 10 rules)
        TRAIN (   2417 tokens) initial  1775 0.2656 final:  1269 0.4750
        #ID | Score (train) |  #Rules     | Template
        --------------------------------------------
        001 |   305   0.603 |   7   0.700 | Template(Pos([-1]),Word([0]))
        000 |   201   0.397 |   3   0.300 | Template(Pos([-1]))
        <BLANKLINE>
        <BLANKLINE>

        >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS
        0.43996...

        >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)

        >>> tagged[33][12:] == [('foreign', 'IN'), ('debt', 'NN'), ('of', 'IN'), ('$', 'NN'), ('64', 'CD'),
        ... ('billion', 'NN'), ('*U*', 'NN'), ('--', 'NN'), ('the', 'DT'), ('third-highest', 'NN'), ('in', 'NN'),
        ... ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')]
        True

        >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
        [1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]

        # a high-accuracy tagger
        >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)
        TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99)
        Finding initial useful rules...
            Found 845 useful rules.
        <BLANKLINE>
                   B      |
           S   F   r   O  |        Score = Fixed - Broken
           c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
           o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
           r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
           e   d   n   r  |  e
        ------------------+-------------------------------------------------------
         132 132   0   0  | AT->DT if Pos:NN@[-1]
          85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
          69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
          51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
          36  36   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
          26  26   0   0  | NN->. if Pos:NNS@[-1] & Word:.@[0]
          24  24   0   0  | NN->, if Pos:NNS@[-1] & Word:,@[0]
          19  19   0   6  | NN->VB if Pos:TO@[-1]
          18  18   0   0  | CD->-NONE- if Pos:NN@[-1] & Word:0@[0]
          18  18   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]

        >>> tagger2.evaluate(gold_data)  # doctest: +ELLIPSIS
        0.44159544...
        >>> tagger2.rules()[2:4]
        (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')]))

        # NOTE1: (!!FIXME) A far better baseline uses nltk.tag.UnigramTagger,
        # with a RegexpTagger only as backoff. For instance,
        # >>> baseline = UnigramTagger(baseline_data, backoff=backoff)
        # However, as of Nov 2013, nltk.tag.UnigramTagger does not yield consistent results
        # between python versions. The simplistic backoff above is a workaround to make doctests
        # get consistent input.
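        # A rough sketch of that stronger baseline, kept out of the executed
        # doctest for the same reason (UnigramTagger is the real nltk.tag class;
        # the other names simply reuse the setup above):
        #
        #     from nltk.tag import UnigramTagger
        #     baseline = UnigramTagger(baseline_data, backoff=backoff)
        #     baseline.evaluate(gold_data)  # typically well above the regexp-only score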

        :param train_sents: training data
        :type train_sents: list(list(tuple))
        :param max_rules: output at most max_rules rules
        :type max_rules: int
        :param min_score: stop training when no rules better than min_score can be found
        :type min_score: int
        :param min_acc: discard any rule with lower accuracy than min_acc
        :type min_acc: float or None
        :return: the learned tagger
        :rtype: BrillTagger

        """
        # FIXME: several tests are a bit too dependent on tracing format
        # FIXME: tests in trainer.fast and trainer.brillorig are exact duplicates

        # Basic idea: Keep track of the rules that apply at each position.
        # And keep track of the positions to which each rule applies.
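        # Concretely (these attributes are referenced further down in this method):
        # self._tag_positions maps a tag to the positions currently carrying it,
        # and self._positions_by_rule maps a candidate rule to the positions it
        # affects, so both directions of the bookkeeping can be updated
        # incrementally each time a selected rule is applied.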

        # Create a new copy of the training corpus, and run the
        # initial tagger on it.  We will progressively update this
        # test corpus to look more like the training corpus.
        test_sents = [
            list(self._initial_tagger.tag(untag(sent))) for sent in train_sents
        ]

        # Collect some statistics on the training process
        trainstats = {}
        trainstats['min_acc'] = min_acc
        trainstats['min_score'] = min_score
        trainstats['tokencount'] = sum(len(t) for t in test_sents)
        trainstats['sequencecount'] = len(test_sents)
        trainstats['templatecount'] = len(self._templates)
        trainstats['rulescores'] = []
        trainstats['initialerrors'] = sum(
            tag[1] != truth[1] for paired in zip(test_sents, train_sents)
            for (tag, truth) in zip(*paired))
        trainstats['initialacc'] = 1 - trainstats[
            'initialerrors'] / trainstats['tokencount']
        if self._trace > 0:
            print(
                "TBL train (fast) (seqs: {sequencecount}; tokens: {tokencount}; "
                "tpls: {templatecount}; min score: {min_score}; min acc: {min_acc})"
                .format(**trainstats))

        # Initialize our mappings.  This will find any errors made
        # by the initial tagger, and use those to generate repair
        # rules, which are added to the rule mappings.
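        # (Concretely: every position where test_sents and train_sents disagree
        # is visited, each template proposes candidate repair rules there, and
        # the candidates' net scores are accumulated in self._rule_scores.)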
        if self._trace:
            print("Finding initial useful rules...")
        self._init_mappings(test_sents, train_sents)
        if self._trace:
            print(("    Found %d useful rules." % len(self._rule_scores)))

        # Let the user know what we're up to.
        if self._trace > 2:
            self._trace_header()
        elif self._trace == 1:
            print("Selecting rules...")

        # Repeatedly select the best rule, and add it to `rules`.
        rules = []
        try:
            while (len(rules) < max_rules):
                # Find the best rule, and add it to our rule list.
                rule = self._best_rule(train_sents, test_sents, min_score,
                                       min_acc)
                if rule:
                    rules.append(rule)
                    score = self._rule_scores[rule]
                    trainstats['rulescores'].append(score)
                else:
                    break  # No more good rules left!

                # Report the rule that we found.
                if self._trace > 1:
                    self._trace_rule(rule)

                # Apply the new rule at the relevant sites
                self._apply_rule(rule, test_sents)

                # Update _tag_positions[rule.original_tag] and
                # _tag_positions[rule.replacement_tag] for the affected
                # positions (i.e., self._positions_by_rule[rule]).
                self._update_tag_positions(rule)

                # Update rules that were affected by the change.
                self._update_rules(rule, train_sents, test_sents)

        # The user can cancel training manually:
        except KeyboardInterrupt:
            print("Training stopped manually -- %d rules found" % len(rules))

        # Discard our tag position mapping & rule mappings.
        self._clean()
        trainstats['finalerrors'] = trainstats['initialerrors'] - sum(
            trainstats['rulescores'])
        trainstats['finalacc'] = 1 - trainstats['finalerrors'] / trainstats[
            'tokencount']
        # Create and return a tagger from the rules we found.
        return BrillTagger(self._initial_tagger, rules, trainstats)
Beispiel #41
0
    def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
        """
        Trains the Brill tagger on the corpus *train_sents*,
        producing at most *max_rules* transformations, each of which
        reduces the net number of errors in the corpus by at least
        *min_score*, and each of which has accuracy not lower than
        *min_acc*.

        #imports
        >>> from nltk.tbl.template import Template
        >>> from nltk.tag.brill import Pos, Word
        >>> from nltk.tag import RegexpTagger
        >>> from nltk.tag.brill_trainer_orig import BrillTaggerTrainer

        #some data
        >>> from nltk.corpus import treebank
        >>> training_data = treebank.tagged_sents()[:100]
        >>> baseline_data = treebank.tagged_sents()[100:200]
        >>> gold_data = treebank.tagged_sents()[200:300]
        >>> testing_data = [untag(s) for s in gold_data]

        >>> backoff = RegexpTagger([
        ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
        ... (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        ... (r'.*able$', 'JJ'),                # adjectives
        ... (r'.*ness$', 'NN'),                # nouns formed from adjectives
        ... (r'.*ly$', 'RB'),                  # adverbs
        ... (r'.*s$', 'NNS'),                  # plural nouns
        ... (r'.*ing$', 'VBG'),                # gerunds
        ... (r'.*ed$', 'VBD'),                 # past tense verbs
        ... (r'.*', 'NN')                      # nouns (default)
        ... ])

        >>> baseline = backoff #see NOTE1

        >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS
        0.2450142...

        #templates
        >>> Template._cleartemplates() #clear any templates created in earlier tests
        >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

        #construct a BrillTaggerTrainer
        >>> tt = BrillTaggerTrainer(baseline, templates, trace=3)
        >>> tagger1 = tt.train(training_data, max_rules=10)
        TBL train (orig) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None)
        <BLANKLINE>
                   B      |
           S   F   r   O  |        Score = Fixed - Broken
           c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
           o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
           r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
           e   d   n   r  |  e
        ------------------+-------------------------------------------------------
         132 132   0   0  | AT->DT if Pos:NN@[-1]
          85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
          69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
          51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
          47  63  16 161  | NN->IN if Pos:NNS@[-1]
          33  33   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
          26  26   0   0  | IN->. if Pos:NNS@[-1] & Word:.@[0]
          24  24   0   0  | IN->, if Pos:NNS@[-1] & Word:,@[0]
          22  27   5  24  | NN->-NONE- if Pos:VBD@[-1]
          17  17   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]

        >>> tagger1.rules()[1:3]
        (Rule('001', 'NN', ',', [(Pos([-1]),'NN'), (Word([0]),',')]), Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]))


        >>> train_stats = tagger1.train_stats()
        >>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
        [1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]


        ##FIXME: the following test fails -- why?
        #
        #>>> tagger1.print_template_statistics(printunused=False)
        #TEMPLATE STATISTICS (TRAIN)  2 templates, 10 rules)
        #TRAIN (   3163 tokens) initial  2358 0.2545 final:  1719 0.4565
        ##ID | Score (train) |  #Rules     | Template
        #--------------------------------------------
        #001 |   404   0.632 |   7   0.700 | Template(Pos([-1]),Word([0]))
        #000 |   235   0.368 |   3   0.300 | Template(Pos([-1]))
        #<BLANKLINE>
        #<BLANKLINE>

        >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS
        0.43996...

        >>> (tagged, test_stats) = tagger1.batch_tag_incremental(testing_data, gold_data)


        >>> tagged[33][12:] == [('foreign', 'IN'), ('debt', 'NN'), ('of', 'IN'), ('$', 'NN'), ('64', 'CD'),
        ... ('billion', 'NN'), ('*U*', 'NN'), ('--', 'NN'), ('the', 'DT'), ('third-highest', 'NN'), ('in', 'NN'),
        ... ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')]
        True


        >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
        [1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]

        ##a high-accuracy tagger
        >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)
        TBL train (orig) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99)
        <BLANKLINE>
                   B      |
           S   F   r   O  |        Score = Fixed - Broken
           c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
           o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
           r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
           e   d   n   r  |  e
        ------------------+-------------------------------------------------------
         132 132   0   0  | AT->DT if Pos:NN@[-1]
          85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
          69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
          51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
          36  36   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
          26  26   0   0  | NN->. if Pos:NNS@[-1] & Word:.@[0]
          24  24   0   0  | NN->, if Pos:NNS@[-1] & Word:,@[0]
          19  19   0   6  | NN->VB if Pos:TO@[-1]
          18  18   0   0  | CD->-NONE- if Pos:NN@[-1] & Word:0@[0]
          18  18   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]


        >>> tagger2.evaluate(gold_data) # doctest: +ELLIPSIS
        0.44159544...

        >>> tagger2.rules()[2:4]
        (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')]))

        #NOTE1: (!!FIXME) A far better baseline uses nltk.tag.UnigramTagger,
        #with a RegexpTagger only as backoff. For instance,
        #>>> baseline = UnigramTagger(baseline_data, backoff=backoff)
        #However, as of Nov 2013, nltk.tag.UnigramTagger does not yield consistent results
        #between python versions. The simplistic backoff above is a workaround to make doctests
        #get consistent input.

        :param train_sents: training data
        :type train_sents: list(list(tuple))
        :param max_rules: output at most max_rules rules
        :type max_rules: int
        :param min_score: stop training when no rules better than min_score can be found
        :type min_score: int
        :param min_acc: discard any rule with lower accuracy than min_acc
        :type min_acc: float or None
        :return: the learned tagger
        :rtype: BrillTagger

        """

        # Create a new copy of the training corpus, and run the
        # initial tagger on it.  We will progressively update this
        # test corpus to look more like the training corpus.
        test_sents = [self._initial_tagger.tag(untag(sent))
                      for sent in train_sents]
        trainstats = {}
        trainstats['min_acc'] = min_acc
        trainstats['min_score'] = min_score
        trainstats['tokencount'] = sum(len(t) for t in test_sents)
        trainstats['sequencecount'] = len(test_sents)
        trainstats['templatecount'] = len(self._templates)
        trainstats['rulescores'] = []
        trainstats['initialerrors'] = sum(tag[1] != truth[1]
                                          for paired in zip(test_sents, train_sents)
                                          for (tag, truth) in zip(*paired))
        trainstats['initialacc'] = 1 - trainstats['initialerrors'] / trainstats['tokencount']
        if self._trace > 0:
            print("TBL train (orig) (seqs: {sequencecount}; tokens: {tokencount}; "
                  "tpls: {templatecount}; min score: {min_score}; min acc: {min_acc})".format(**trainstats))

        if self._trace > 2:
            self._trace_header()

        # Look for useful rules.
        rules = []
        try:
            while len(rules) < max_rules:
                (rule, score, fixscore) = self._best_rule(test_sents,
                                                          train_sents, min_acc=min_acc)
                if rule is None or score < min_score:
                    if self._trace > 1:
                        print('Insufficient improvement; stopping')
                    break
                else:
                    # Add the rule to our list of rules.
                    rules.append(rule)
                    trainstats['rulescores'].append(score)
                    # Use the rules to update the test corpus.  Keep
                    # track of how many times the rule applied (k).
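                    # (rule.apply mutates each sentence in place and returns
                    # the positions it changed, so k totals how many tokens
                    # the rule was applied to across the corpus.)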
                    k = 0
                    for sent in test_sents:
                        k += len(rule.apply(sent))
                    # Display trace output.
                    if self._trace > 1:
                        self._trace_rule(rule, score, fixscore, k)
        # The user can also cancel training manually:
        except KeyboardInterrupt:
            print("Training stopped manually -- %d rules found" % len(rules))

        trainstats['finalerrors'] = trainstats['initialerrors'] - sum(trainstats['rulescores'])
        trainstats['finalacc'] = 1 - trainstats['finalerrors']/trainstats['tokencount']
        # Create and return a tagger from the rules we found.
        return BrillTagger(self._initial_tagger, rules, trainstats)
Beispiel #42
0
    # Method of a tagger class (older NLTK API; batch_tag was later renamed
    # tag_sents). Needs `from nltk.tag.util import untag` and
    # `from nltk.metrics import accuracy`.
    def evaluate(self, gold):
        # Re-tag the untagged gold sentences, then score token-level accuracy.
        tagged_sents = self.batch_tag([untag(sent) for sent in gold])
        gold_tokens = sum(gold, [])  # flatten the sentences into one token list
        test_tokens = sum(tagged_sents, [])
        return accuracy(gold_tokens, test_tokens)
Beispiel #43
0
    # Newer variant of the same method (tag_sents instead of batch_tag). Needs
    # `from itertools import chain`, `from nltk.tag.util import untag` and
    # `from nltk.metrics import accuracy`.
    def evaluate(self, gold):
        # Strip the gold tags, re-tag with this tagger, and compare token by token.
        tagged_sents = self.tag_sents(untag(sent) for sent in gold)
        gold_tokens = list(chain(*gold))
        test_tokens = list(chain(*tagged_sents))
        return accuracy(gold_tokens, test_tokens)
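
# A minimal, self-contained usage sketch for evaluate() above. It uses nltk's
# UnigramTagger purely as a stand-in for any trained tagger exposing evaluate();
# the variable names are illustrative, not part of the snippets above.
from nltk.corpus import treebank
from nltk.tag import UnigramTagger

train = treebank.tagged_sents()[:100]
gold = treebank.tagged_sents()[200:300]
tagger = UnigramTagger(train)
print(tagger.evaluate(gold))  # fraction of tokens whose predicted tag matches the gold tag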