Code example #1
File: hmm_train.py Project: rrichajalota/ws-20-21
def main():
    """main function
    """
    n = 2  # Bigram HMM
    args = parse_arguments()
    treebank = TaggedCorpusReader(
        os.path.split(args.train_f)[0],
        os.path.split(args.train_f)[1])
    observation_space = [item[0] for item in treebank.sents()]  # all words
    state_space = [item[1] for item in treebank.sents()]  # all pos tags

    words = dict.fromkeys(observation_space)
    tags = dict.fromkeys(state_space)

    # HMM parameter estimation: initial, transition and emission probability
    start = time.time()
    init_p = [item[1] for item in comp_initial(tags, treebank)]
    trans_p = comp_transition(n, tags, state_space)
    emission_p = comp_emission(words,
                               tags,
                               state_space,
                               treebank,
                               smoothing=args.smoothing)
    end = time.time()
    print("Runtime (training): %.3f s" % (end - start))

    # Test the trained HMM
    treebank = TaggedCorpusReader(
        os.path.split(args.eval_f)[0],
        os.path.split(args.eval_f)[1])
    viterbi_tags = []

    start = time.time()
    for sentence in treebank.paras():
        test_words = [item[0] for item in sentence]
        O, S, Y, pi, A, B = pre_process(words, tags, test_words, init_p,
                                        trans_p, emission_p)
        # Compute the most likely tag sequence with the Viterbi algorithm
        if args.log_prob:
            X = viterbi_log(O, S, Y, pi, A, B)
        else:
            X = viterbi(O, S, Y, pi, A, B)
        viterbi_tags.append(X)
    end = time.time()

    print("Runtime (viterbi): %.3f s" % (end - start))
    output_path = "./de-tagger.tt"
    post_processing(viterbi_tags, args.test_f, output_path)
Code example #2
import itertools

from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag.sequential import ClassifierBasedPOSTagger


def NER_HINDINBC():
    reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos')
    f1 = reader.fileids()
    print("The Files of Corpus are:", f1)
    sents = reader.tagged_sents()
    ls = len(sents)
    print("Length of Corpus Is:", ls)
    # hold out the first 30% of the sentences for evaluation
    size1 = int(ls * 0.3)
    test_sents = sents[:size1]
    train_sents = sents[size1:]
    # train a classifier-based tagger on the remaining 70%
    nbc_tagger = ClassifierBasedPOSTagger(train=train_sents)
    test = nbc_tagger.evaluate(test_sents)
    print("The Test Result is:", test)
    # THE GIVEN INPUT
    given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्‍य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं"
    gsw = given_sent.split()
    tag_gs = nbc_tagger.tag(gsw)
    print("GIVEN SENT TAG:", tag_gs)
    ftag_gs = " ".join(itertools.chain(*tag_gs))
    print("And its flattened Version is:", ftag_gs)
Code example #3
    def generate_corpus_from_segmented_reports(self):
        re = ReportEnviroments()
        new_corpus_of_segmented_reports = TaggedCorpusReader(
            re.segmented_reports_corpus_path,
            '.*',
            sent_tokenizer=LineTokenizer(blanklines='discard'),
            encoding='utf-8')
        raw_segmented_reports = []
        for fileid in new_corpus_of_segmented_reports.fileids():
            raw_segmented_reports.append(
                new_corpus_of_segmented_reports.sents(fileids=fileid))
        cut_of_segmented_reports = []
        topics = ['DISCENTE', 'DOCENTE', 'INFRAESTRUTURA', 'UNCATEGORIZED']
        # keep only the slice between the first and the last topic marker
        for report in raw_segmented_reports:
            cut_of_segmented_reports.append(
                report[report.index([topics[0]]):
                       report.index([topics[-1]]) + 1])
        return cut_of_segmented_reports, topics
Code example #4
from glob import glob

from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import RegexpTokenizer


def read_reviews():
    """
    Read reviews from the given file(s).
    """
    filenames = glob("input/food*.parsed")

    # a sentence ends at a token tagged ',' or '.', e.g. './.'
    sent_end_pattern = r"./[,.]"
    reader = TaggedCorpusReader(
        root=".",
        fileids=filenames,
        sep="/",
        sent_tokenizer=RegexpTokenizer(sent_end_pattern, gaps=True))

    return reader.sents()
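
To see what the gaps=True tokenizer above does, here is a small standalone sketch (the sample line is invented, not taken from the corpus): with gaps=True, RegexpTokenizer treats every match of the pattern, i.e. a sentence-final token such as './.', as a separator, so the raw text splits into one chunk per sentence.

from nltk.tokenize import RegexpTokenizer

# Hypothetical line in the word/TAG format the reader above expects
raw = "The/DT food/NN was/VBD great/JJ ./. Service/NN was/VBD slow/JJ ./."
splitter = RegexpTokenizer(r"./[,.]", gaps=True)
# gaps=True marks the pattern as a separator rather than a token,
# so each './.'-style token starts a new sentence chunk
print(splitter.tokenize(raw))
# ['The/DT food/NN was/VBD great/JJ ', ' Service/NN was/VBD slow/JJ ']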
Code example #5
from nltk.corpus.reader import TaggedCorpusReader


class CorpusParser:
    def __init__(self, root, fileids='.*', encoding='utf8'):
        """
        Reads all the files in root.

        :param root: Directory.
        :param fileids: List of files that have to be read. '.*' if all files have to be parsed.
        :param encoding: File encoding.
        """
        self._reader = TaggedCorpusReader(root, fileids, encoding=encoding)

    def words(self):
        """
        Returns all the words in the corpora.

        :return: List of words.
        """
        return self._reader.words()

    def tagged_words(self):
        """
        Returns all words of the corpora with their corresponding tag.

        :return: List of tuples (word, tag)
        """
        return self._reader.tagged_words()

    def sentences(self):
        """
        Returns a list of all sentences.

        :return: List of lists of words. Each list represents a sentence, with a list of its words in it.
        """
        return self._reader.sents()

    def tagged_sentences(self):
        """
        Returns a list of all sentences with the tag of each word.

        :return: List of lists of tuples. Each sentence is a list with all its members being tuples (word, tag).
        """
        return self._reader.tagged_sents()
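
The class above is defined but never exercised; here is a minimal usage sketch (the 'corpus/' directory and its .pos files are hypothetical, not part of the original example):

parser = CorpusParser('corpus/', r'.*\.pos')
print(parser.words()[:10])            # first ten tokens
print(parser.tagged_words()[:5])      # first five (word, tag) pairs
print(parser.tagged_sentences()[0])   # first sentence with its tags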
Code example #6
File: Treebank.py Project: rickstello/hmm-tagger
    def __init__(self, corpus_path, corpus_files):
        """
        Construct a Treebank object
        
        :param corpus_path: path to corpus files
        :param corpus_files: list of filenames for corpus text
        """

        msg("Importing treebank...")

        # get a corpus reader object for our corpus using NLTK
        treebank = TaggedCorpusReader(corpus_path, corpus_files)

        # get all sentences from corpus in a tagged format
        self.tagged_sents = treebank.tagged_sents()

        # get all sentences from corpus in an untagged format
        self.sents = treebank.sents()

        msg("done!\n")
Code example #7
File: pos_corpus.py Project: anderscui/nlpy
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

d = nltk.data.find('corpora/cookbook')
reader = TaggedCorpusReader(d, r'.*\.pos')
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())

# custom tokenizer
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.sents())
print(reader.tagged_sents())

# universal tagset
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer(), tagset='en-brown')
print(reader.tagged_sents(tagset='universal'))

# NLTK's bundled tagged corpora expose the same reader API
from nltk.corpus import treebank
print(treebank.tagged_words())
print(treebank.tagged_words(tagset='universal'))
Code example #8
########## TAGGED CORPUS READER ###############

from nltk.corpus.reader import TaggedCorpusReader
root = "C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
file = "brown.pos"
source = root + file

# Using a regex to match all files with extension .pos
reader = TaggedCorpusReader(root, r'.*\.pos')

print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())


# TaggedCorpusReader uses a default word tokenizer, but we can swap in our own
from nltk.tokenize import SpaceTokenizer
reader = TaggedCorpusReader(root, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.words())

# Customizing TaggedCorpusReader's sentence tokenizer
from nltk.tokenize import LineTokenizer
reader = TaggedCorpusReader(root, r'.*\.pos', sent_tokenizer=LineTokenizer())
print(reader.words())

# Customizing TaggedCorpusReader's paragraph block reader (sketch below)
# Customizing TaggedCorpusReader's tag separator - Pg 57 (sketch below)
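
These last two comments name customizations the excerpt stops short of showing. A minimal sketch of both, reusing the root defined above; read_line_block is NLTK's line-based block reader, and the word|TAG separator is only an illustration, not the corpus's actual format:

# Customizing the paragraph block reader: treat each line as its own paragraph
from nltk.corpus.reader.util import read_line_block
reader = TaggedCorpusReader(root, r'.*\.pos', para_block_reader=read_line_block)
print(reader.paras())

# Customizing the tag separator: read tokens written as word|TAG instead of word/TAG
reader = TaggedCorpusReader(root, r'.*\.pos', sep='|')
print(reader.tagged_words())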
Code example #9
File: NERHINDI1.py Project: subhabangalore/ML-Codes
import itertools
from collections import Counter

import dill
import nltk
from nltk.corpus.reader import TaggedCorpusReader


def NER_HINDI():
    reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos')
    f1 = reader.fileids()
    print("The Files of Corpus are:", f1)
    sents = reader.tagged_sents()
    ls = len(sents)
    print("Length of Corpus Is:", ls)
    # hold out the first 30% of the sentences for evaluation
    size1 = int(ls * 0.3)
    test_sents = sents[:size1]
    train_sents = sents[size1:]
    # train an HMM tagger on the remaining 70% and print its accuracy
    hmm_tagger = nltk.HiddenMarkovModelTagger.train(train_sents)
    hmm_tagger.test(test_sents)
    # THE GIVEN INPUT
    given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्‍य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं"
    gsw = given_sent.split()
    tag_gs = hmm_tagger.tag(gsw)
    print("GIVEN SENT TAG:", tag_gs)
    ftag_gs = " ".join(itertools.chain(*tag_gs))
    print("And its flattened Version is:", ftag_gs)
    # serialize the trained tagger with dill, then load it back
    with open('HINDIHMMNER1.dill', 'wb') as f:
        dill.dump(hmm_tagger, f)
    with open('HINDIHMMNER1.dill', 'rb') as f:
        hmm_tagger1 = dill.load(f)

    # re-tag the whole corpus and compare against the gold tags
    test_tags = [
        tag for sent in reader.sents() for (word, tag) in hmm_tagger1.tag(sent)
    ]
    gold_tags = [tag for (word, tag) in reader.tagged_words()]
    print("Test Tag Len:", len(test_tags))
    print("Gold Tag Len:", len(gold_tags))
    cm = nltk.ConfusionMatrix(gold_tags, test_tags)
    print(cm.pretty_format(sort_by_count=True, show_percents=False,
                           truncate=5))
    # the tag set as generated in the confusion matrix
    labels = set('NA GPE PERS DATE ORG'.split())
    true_positives = Counter()
    false_negatives = Counter()
    false_positives = Counter()
    for i in labels:
        for j in labels:
            if i == j:
                true_positives[i] += cm[i, j]
            else:
                false_negatives[i] += cm[i, j]
                false_positives[j] += cm[i, j]
    print("TP:", sum(true_positives.values()), true_positives)
    print("FN:", sum(false_negatives.values()), false_negatives)
    print("FP:", sum(false_positives.values()), false_positives)
    print()
    for i in sorted(labels):
        if true_positives[i] == 0:
            fscore = 0
        else:
            precision = true_positives[i] / float(true_positives[i] +
                                                  false_positives[i])
            recall = true_positives[i] / float(true_positives[i] +
                                               false_negatives[i])
            fscore = 2 * (precision * recall) / float(precision + recall)
        # report every tag, including those with no true positives
        print("TAG:", i, "FMEASURE:", fscore * 100)