def main():
    """Train a bigram HMM POS tagger, decode the evaluation corpus with
    Viterbi, and write the predicted tags to ``de-tagger.tt``."""
    n = 2  # bigram HMM: transitions condition on exactly one previous tag
    args = parse_arguments()

    # The corpus reader wants the directory and the filename separately.
    train_dir, train_name = os.path.split(args.train_f)
    treebank = TaggedCorpusReader(train_dir, train_name)

    observation_space = [item[0] for item in treebank.sents()]  # all words
    state_space = [item[1] for item in treebank.sents()]  # all pos tags
    # dict.fromkeys de-duplicates while preserving first-seen order.
    words = dict.fromkeys(observation_space)
    tags = dict.fromkeys(state_space)

    # HMM parameter estimation: initial, transition and emission probabilities.
    start = time.time()
    init_p = [item[1] for item in comp_initial(tags, treebank)]
    trans_p = comp_transition(n, tags, state_space)
    emission_p = comp_emission(words, tags, state_space, treebank,
                               smoothing=args.smoothing)
    end = time.time()
    print("Runtime (training): %.3f s" % (end - start))

    # Decode the held-out evaluation corpus with the trained model.
    eval_dir, eval_name = os.path.split(args.eval_f)
    treebank = TaggedCorpusReader(eval_dir, eval_name)

    viterbi_tags = []
    start = time.time()
    for sentence in treebank.paras():
        test_words = [item[0] for item in sentence]
        O, S, Y, pi, A, B = pre_process(words, tags, test_words,
                                        init_p, trans_p, emission_p)
        # Most likely tag sequence; the log-space variant avoids underflow.
        if args.log_prob:
            X = viterbi_log(O, S, Y, pi, A, B)
        else:
            X = viterbi(O, S, Y, pi, A, B)
        viterbi_tags.append(X)
    end = time.time()
    print("Runtime (viterbi): %.3f s" % (end - start))

    output_path = "./" + "de-tagger.tt"
    post_processing(viterbi_tags, args.test_f, output_path)
def NER_HINDINBC(): reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos') f1 = reader.fileids() print "The Files of Corpus are:", f1 sents = reader.tagged_sents() sentn = reader.sents() #words=sentn.split() ls = len(sents) #lw=len(words) print "Length of Corpus Is:", ls #print "The Words are:",lw size1 = int(ls * 0.3) test_sents = sents[:size1] train_sents = sents[size1:] nbc_tagger = ClassifierBasedPOSTagger(train=train_sents) test = nbc_tagger.evaluate(test_sents) print "The Test Result is:", test #THE GIVEN INPUT given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode( 'utf-8') gsw = given_sent.split() tag_gs = nbc_tagger.tag(gsw) print "GIVEN SENT TAG:", tag_gs ftag_gs = " ".join(list(itertools.chain(*tag_gs))) print "And its flattened Version is:", ftag_gs
def generate_corpus_from_segmented_reports(self):
    """Read the segmented-reports corpus and return, for each report, the
    sentence span between the first and last topic heading (inclusive),
    together with the topic list."""
    env = ReportEnviroments()
    reader = TaggedCorpusReader(env.segmented_reports_corpus_path, '.*',
                                sent_tokenizer=LineTokenizer(blanklines='discard'),
                                encoding='utf-8')
    # One entry per corpus file: the full sentence list of that report.
    raw_segmented_reports = [reader.sents(fileids=fid)
                             for fid in reader.fileids()]
    topics = ['DISCENTE', 'DOCENTE', 'INFRAESTRUTURA', 'UNCATEGORIZED']
    cut_of_segmented_reports = []
    for report in raw_segmented_reports:
        # Headings appear as single-element sentences; slice from the first
        # topic heading through the last one (inclusive).
        begin = report.index([topics[0].decode('utf-8')])
        end = report.index([topics[-1].decode('utf-8')]) + 1
        cut_of_segmented_reports.append(report[begin:end])
    return cut_of_segmented_reports, topics
def read_reviews():
    """Read reviews from the given file(s)."""
    from glob import glob
    filenames = glob("input/food*.parsed")
    # A sentence boundary is any token tagged "," or "." ("/" separates tags).
    sent_end_pattern = ".\/[,\.]"
    tokenizer = RegexpTokenizer(sent_end_pattern, gaps=True)
    reader = TaggedCorpusReader(root=".", fileids=filenames, sep="/",
                                sent_tokenizer=tokenizer)
    return reader.sents()
def read_reviews():
    """Read reviews from the given file(s)."""
    from glob import glob
    # Every parsed food-review file under input/.
    paths = glob("input/food*.parsed")
    # A token tagged "," or "." ends a sentence ("/" is the tag separator).
    end_of_sentence = ".\/[,\.]"
    corpus = TaggedCorpusReader(
        root=".",
        fileids=paths,
        sep="/",
        sent_tokenizer=RegexpTokenizer(end_of_sentence, gaps=True),
    )
    sentences = corpus.sents()
    return sentences
class CorpusParser:
    """Thin wrapper around NLTK's TaggedCorpusReader that exposes a corpus
    as words, tagged words, sentences and tagged sentences."""

    def __init__(self, root, fileids='.*', encoding='utf8'):
        """
        Reads all the files in root.

        :param root: Directory.
        :param fileids: List of files that have to be read. '.*' if all
            files have to be parsed.
        :param encoding: File encoding.
        """
        self._reader = TaggedCorpusReader(root, fileids, encoding=encoding)

    def words(self):
        """Return every word in the corpora as a flat list."""
        return self._reader.words()

    def tagged_words(self):
        """Return a list of (word, tag) tuples for all corpus words."""
        return self._reader.tagged_words()

    def sentences(self):
        """Return all sentences, each one a list of its words."""
        return self._reader.sents()

    def tagged_sentences(self):
        """Return all sentences, each one a list of (word, tag) tuples."""
        return self._reader.tagged_sents()
def __init__(self, corpus_path, corpus_files):
    """Construct a Treebank object.

    :param corpus_path: path to corpus files
    :param corpus_files: list of filenames for corpus text
    """
    msg("Importing treebank...")
    # NLTK corpus reader over the given treebank files.
    reader = TaggedCorpusReader(corpus_path, corpus_files)
    # Cache both the tagged and the untagged sentence views on the instance.
    self.tagged_sents = reader.tagged_sents()
    self.sents = reader.sents()
    msg("done!\n")
def generate_corpus_from_segmented_reports(self):
    """Load the segmented-reports corpus and trim every report down to the
    sentence span delimited by the first and last topic heading."""
    env = ReportEnviroments()
    corpus = TaggedCorpusReader(
        env.segmented_reports_corpus_path, '.*',
        sent_tokenizer=LineTokenizer(blanklines='discard'),
        encoding='utf-8')

    # Gather the sentence list of every file in the corpus.
    reports = []
    for fid in corpus.fileids():
        reports.append(corpus.sents(fileids=fid))

    topics = ['DISCENTE', 'DOCENTE', 'INFRAESTRUTURA', 'UNCATEGORIZED']
    trimmed = []
    for report in reports:
        # Headings are single-element sentences; keep everything from the
        # first heading up to and including the last one.
        begin = report.index([topics[0].decode('utf-8')])
        end = report.index([topics[-1].decode('utf-8')]) + 1
        trimmed.append(report[begin:end])
    return trimmed, topics
# Demonstrate NLTK's TaggedCorpusReader on the local cookbook corpus.
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

d = nltk.data.find('corpora/cookbook')

# Default reader: whitespace word tokenizer, word/TAG separator.
reader = TaggedCorpusReader(d, r'.*\.pos')
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())

# custom tokenizer
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.sents())
print(reader.tagged_sents())

# universal tagset: declare the corpus tagset so tags can be mapped
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.tagged_sents(tagset='universal'))

# NLTK tagged corpora
from nltk.corpus import treebank
# Fix: the original printed `reader` again here, leaving the freshly
# imported treebank corpus unused — this section is meant to show the
# built-in treebank corpus, so query it instead.
print(treebank.tagged_words())
print(treebank.tagged_words(tagset='universal'))
########## TAGGED CORPUS READER ############### from nltk.corpus.reader import TaggedCorpusReader root="C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\" file="brown.pos" source=root+file #Using Regex to match all files with extension .pos reader=TaggedCorpusReader(root,r'.*\.pos') print reader.words() print reader.tagged_words() print reader.sents() print reader.tagged_sents() print reader.paras() print reader.tagged_paras() #TaggedCorpus uses default tokenizer but we can change it by customizing it from nltk.tokenize import SpaceTokenizer reader=TaggedCorpusReader(root,r'.*\.pos',word_tokenizer=SpaceTokenizer()) print reader.words() #Customing TaggedCorpus's sentence tokenizer from nltk.tokenize import LineTokenizer reader=TaggedCorpusReader(root,r'.*\.pos',sent_tokenizer=LineTokenizer()) print reader.words() #Customizing TaggedCorpus's paragraph Block reader #Customizing TaggedCorpus's tag separator - Pg 57
def NER_HINDI(): reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos') f1 = reader.fileids() print "The Files of Corpus are:", f1 sents = reader.tagged_sents() sentn = reader.sents() #words=sentn.split() ls = len(sents) #lw=len(words) print "Length of Corpus Is:", ls #print "The Words are:",lw size1 = int(ls * 0.3) test_sents = sents[:size1] train_sents = sents[size1:] hmm_tagger = nltk.HiddenMarkovModelTagger.train(train_sents) test = hmm_tagger.test(test_sents) #THE GIVEN INPUT given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode( 'utf-8') gsw = given_sent.split() tag_gs = hmm_tagger.tag(gsw) print "GIVEN SENT TAG:", tag_gs ftag_gs = " ".join(list(itertools.chain(*tag_gs))) print "And its flattened Version is:", ftag_gs #INPUT FROM FILE with open('HINDIHMMNER1.dill', 'wb') as f: dill.dump(hmm_tagger, f) with open('HINDIHMMNER1.dill', 'rb') as f: hmm_tagger1 = dill.load(f) test_tags = [ tag for sent in reader.sents() for (word, tag) in hmm_tagger1.tag(sent) ] gold_tags = [tag for (word, tag) in reader.tagged_words()] ltesttag = len(test_tags) lgtags = len(gold_tags) print "Test Tag Len:", ltesttag print "Gold Tag Len:", lgtags cm = nltk.ConfusionMatrix(gold_tags, test_tags) print(cm.pretty_format(sort_by_count=True, show_percents=False, truncate=5)) labels = set('NA GPE PERS DATE ORG'.split() ) #THE TAG SETS AS GENERATED IN CONFUSION MATRIX true_positives = Counter() false_negatives = Counter() false_positives = Counter() for i in labels: for j in labels: if i == j: true_positives[i] += cm[i, j] else: false_negatives[i] += cm[i, j] false_positives[j] += cm[i, j] print "TP:", sum(true_positives.values()), true_positives print "FN:", sum(false_negatives.values()), false_negatives print "FP:", sum(false_positives.values()), false_positives print for i in sorted(labels): if true_positives[i] == 0: fscore = 0 else: precision = true_positives[i] / float(true_positives[i] + 
false_positives[i]) recall = true_positives[i] / float(true_positives[i] + false_negatives[i]) fscore = 2 * (precision * recall) / float(precision + recall) fscore1 = fscore * 100 print "TAG:", i, "FMEASURE:", fscore1