def tagSentences(path, training_list=[], testing_list=[]):
    ct = CRFTagger()
    train_list = getTrainList(training_list)
    ct.train(train_list, 'model.crf.tagger')
    sentences = getSentences(path, testing_list)
    tagged_sentences = ct.tag_sents(sentences)
    return tagged_sentences
def test_taggers():
    # load taggers
    unigram_tagger = load_pkl('unigram-tagger')
    tnt_tagger = load_pkl('tnt-tagger')
    perceptron_tagger = load_pkl('perceptron-tagger')
    # crf_tagger = load_pkl('crf-tagger')
    crf_tagger = CRFTagger()
    crf_tagger.set_model_file('crf-tagger.model')

    test_sents = load_pkl('test_sents')[:10]
    print(f'{len(test_sents)} sentences in testing set')

    taggers = [
        ['Unigram tagger', unigram_tagger, 0, 0],
        ['TnT tagger', tnt_tagger, 0, 0],
        ['Perceptron tagger', perceptron_tagger, 0, 0],
        ['CRF tagger', crf_tagger, 0, 0],
    ]
    for t in taggers:
        print(f'evaluating {t[0]} ... ', end='', flush=True)
        f1 = t[1].evaluate(test_sents)
        t[2] = f1
        # the evaluation result is the same as the f1 score calculated by sklearn
        # f1 = cal_f1_score(t[1], test_sents)
        # t[3] = f1
        # f1 = 0
        print(f'done. f1 score: {f1}')

    best_tagger_info = max(taggers, key=lambda t: t[2])
    print('best tagger is ' + best_tagger_info[0])
    best_tagger = best_tagger_info[1]
def train_pos_tag(dataset_dir, output_path):
    jumSample = 500000
    namaFile = dataset_dir
    with open(namaFile, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    pasangan = []
    allPasangan = []
    for line in lines[:min(jumSample, len(lines))]:
        # Remove Wiki Tags
        line = re.sub('<[^>]*>', '', line)
        if line == '':
            if len(pasangan) != 0:
                allPasangan.append(pasangan)
            pasangan = []
        else:
            kata, tag = line.split('\t')
            p = (kata, tag)
            pasangan.append(p)

    ct = CRFTagger()
    print("Training Tagger...")
    ct.train(allPasangan, output_path)
    print("Training Complete")
class CRF:
    def __init__(self):
        self.__model = type('test', (object,), {})()

    def train(self, X_training_data):
        self.__model = CRFTagger()
        self.__model.train(X_training_data, 'crf.model')

    def test(self, X_test_data):
        total = 0
        correct = 0
        for kalimat in X_test_data:
            temp = []
            for word in kalimat:
                temp.append(word[0])
            if len(temp) != 0:
                predicted_y = self.__model.tag(temp)
                for i in range(len(predicted_y)):
                    total += 1
                    if predicted_y[i][1] == kalimat[i][1]:
                        correct += 1
        print(correct, total)
        print(correct / total)
def question3():
    tagger = CRFTagger(feature_func=feature_func)
    tagger.train(train_sentences, 'model.crf.tagger')
    print(tagger.evaluate(test_sentences))
    return
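# Note: `feature_func` above is defined elsewhere in the original source and is not
# shown in this snippet. For context, nltk.tag.CRFTagger expects such a callable to
# take the token list and the index of the current token and return a list of
# feature strings. The sketch below is an assumed, illustrative implementation
# (the feature names are hypothetical), not the original one.
def feature_func(tokens, idx):
    word = tokens[idx]
    feats = [
        'WORD_' + word,                            # the token itself
        'SUF3_' + word[-3:],                       # 3-character suffix
        'CAP' if word[:1].isupper() else 'LOWER',  # simple capitalization cue
    ]
    # a window of size 1 around the current token
    if idx > 0:
        feats.append('PREV_' + tokens[idx - 1])
    if idx < len(tokens) - 1:
        feats.append('NEXT_' + tokens[idx + 1])
    return feats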
def train_taggers():
    train_sents = load_pkl('train_sents')

    # instantiate taggers
    unigram_tagger = nltk.UnigramTagger(train_sents)
    tnt_tagger = tnt.TnT()
    perceptron_tagger = perceptron.PerceptronTagger(load=False)
    # limit the number of iterations as the training takes too long
    crf_tagger = CRFTagger(training_opt={'max_iterations': 100})

    print('Unigram tagger has already been trained')
    save_pkl(unigram_tagger, 'unigram-tagger')

    print('training TnT tagger ...', end='', flush=True)
    tnt_tagger.train(train_sents)
    print('Done')
    save_pkl(tnt_tagger, 'tnt-tagger')

    print('training Perceptron tagger ...', end='', flush=True)
    perceptron_tagger.train(train_sents)
    print('Done')
    save_pkl(perceptron_tagger, 'perceptron-tagger')

    print('training CRF tagger ...', end='', flush=True)
    crf_tagger.train(train_sents, 'crf-tagger.model')
    print('Done')
def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    local_dir = os.path.expanduser(local_dir_rel)
    stdout_old = sys.stdout
    sys.stdout = open(os.path.join(local_dir, 'test_%d.out' % counter), 'w')

    # read POS corpora
    print("local_dir", local_dir)
    train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos' % counter)
    train_sents = train_reader.tagged_sents()
    test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
    test_sents = test_reader.tagged_sents()

    print('Loop #' + str(counter))
    sys.stdout.flush()

    # make crf tagger
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, 'model.crf.tagger')
    #crf_tagger = UnigramTagger(train_sents)

    # evaluate crf tagger
    crf_accuracy = crf_tagger.evaluate(test_sents)
    print('crf:', crf_accuracy)

    sys.stdout = stdout_old
class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)

        self.feature_detector = features
        self.tagger = CRFTagger(feature_func=features)
        self.tagger.train(train_sents, 'model.crf.tagger')
        # self.tagger = ClassifierBasedTagger(
        #     train=train_sents,
        #     feature_detector=features,
        #     **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
        # iob_triplets = [(w, t, 'O') for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
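# Note: the `features` callable used by NamedEntityChunker is defined elsewhere in the
# original source. Because the chunker trains the CRF on [((word, pos), iob), ...]
# pairs, the feature function receives a sequence of (word, pos) tuples. The sketch
# below is an assumed, minimal illustration, not the original feature detector.
def features(tokens, idx):
    word, pos = tokens[idx]
    feats = ['WORD_' + word, 'POS_' + pos]
    if idx > 0:
        _, prev_pos = tokens[idx - 1]
        feats.append('PREV_POS_' + prev_pos)
    else:
        feats.append('BOS')  # beginning of sentence
    return feats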
def __init__(self, train_sents=None, tagger="ClassifierBasedTagger", model=None,
             model_name="../results/modelCRF_featured", entities=None,
             language="english", **kwargs):
    self.all_entities = []
    self.acronyms = []
    self.language = language

    if not model:
        assert isinstance(train_sents, Iterable)

    if tagger == "ClassifierBasedTagger":
        self.feature_detector = iob_features
        self.tagger = ClassifierBasedTagger(train=train_sents,
                                            feature_detector=iob_features,
                                            **kwargs)
    elif tagger == "CRFTagger":
        self.set_entities(entities)
        if not model:
            self.tagger = CRFTagger(feature_func=self.crf_features)
            self.tagger.train(train_data=train_sents,
                              model_file="../results/{}".format(model_name))
        else:
            self.tagger = CRFTagger(feature_func=self.crf_features)
            self.tagger.set_model_file(model)
    else:
        raise Exception('Unknown tagger')
def __init__(self, data=[]):
    self.tagger = CRFTagger()
    self.tagger.set_model_file('model.crf.tagger')
    if data.count(True) > 0:
        self.data_tagging, self.data_testing = self.for_tagging_testing(data)
def tagpos(request):
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    tokenize = word_tokenize("Saya bekerja di Bandung")
    hasil = ct.tag_sents([tokenize])
    postag = nltk.pos_tag(tokenize)
    context = {
        'tokenize': tokenize,
        'postag': postag,
        'hasil': hasil,
    }
    template = loader.get_template('polls/tagged.html')
    # train_text = state_union.raw('2005-GWBush.txt')
    # sample_text = state_union.raw('2006-GWBush.txt')
    # custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    # tokenized = custom_sent_tokenizer.tokenize(sample_text)
    # tagged = []
    # for i in tokenized[:5]:
    #     words = nltk.word_tokenize(i)
    #     tagged.append(nltk.pos_tag(words))
    #
    # template = loader.get_template('polls/tagged.html')
    # context = {
    #     'tagged': tagged
    # }
    return HttpResponse(template.render(context, request))
def function_pos_tagging(new_stopwords_tweets):
    ct = CRFTagger()
    ct.set_model_file('data/all_indo_man_tag_corpus_model.crf.tagger')

    new_pos_tweets = []
    for n in range(len(new_stopwords_tweets)):
        pos_tweet_word = [new_stopwords_tweets[n][0]]
        pos_tweet_words = ct.tag_sents(pos_tweet_word)
        pos_tweet = [pos_tweet_words, new_stopwords_tweets[n][1]]
        new_pos_tweets.append(pos_tweet)

    new_features_tweets = []
    for n in range(len(new_pos_tweets)):
        pos_tweets_data = new_pos_tweets[n][0][0]
        features = []
        for tokenTag in pos_tweets_data:
            token, tag = tokenTag
            access = ['NN', 'JJ', 'RB', 'VBD']
            if tag in access:
                features.append(token)
        if features:
            features_tweets = [features, new_pos_tweets[n][1]]
            new_features_tweets.append(features_tweets)
    return new_features_tweets
def main(self):
    # own method
    file = open("forecast_corpus.txt", "r")
    call = file.read()
    corpus = call.split()
    file.close()
    verba = []

    # stopword removal
    # sfactory = StopWordRemoverFactory()
    # stopwords = sfactory.create_stop_word_remover()
    # stop = stopwords.remove(call)
    # c = stop.split()

    # print("Reading corpus.....")
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    hasil = ct.tag_sents([corpus])
    this = Verba_finder()
    # stop before the last token because the rule looks one token ahead
    for x in range(len(hasil[0]) - 1):
        if hasil[0][x][1] == 'VB' and this.afiks_check(hasil[0][x][0]) == 1 and \
                (hasil[0][x + 1][1] == 'NN' or hasil[0][x + 1][1] == 'JJ'):
            # print(hasil[0][x])
            verba.append(" " + hasil[0][x][0] + " ")
    return verba
class SlotTaggingModel(object):

    def __init__(self, **argparams):
        self.train_data = argparams['train_data']
        if self.train_data is not None:
            assert isinstance(self.train_data, DataSetCSVagentActPred)
        self.model_folder = argparams['model_folder']
        self.model_fname = '{}/slotTagging.model'.format(self.model_folder)

    def train(self, verbose=True):
        assert self.train_data is not None, 'train_data is required.'
        print('\ttraining ...')
        # transform data
        instance_list = self._transform_data(self.train_data)
        userUtterTag_train_fname = '{}/userUtterTag_train.txt'.format(self.model_folder)
        writeUtterTag(instance_list, userUtterTag_train_fname)
        print('\ttrain_data={}'.format(userUtterTag_train_fname))
        # train model
        self.model = CRFTagger(verbose=verbose)
        self.model.train(instance_list, self.model_fname)
        print('\tmodel_fname={}'.format(self.model_fname))
        print('\tsaving model ...')

    def _transform_data(self, data):
        '''convert textual utter and user tags into a list of lists
        that contain lists of (w, t) pairs
        '''
        userUtter_txt = data.userUtter_txt
        userTag_txt = data.userTag_txt
        instance_list = list()
        for words, tags in zip(userUtter_txt, userTag_txt):
            instance = [(word.strip(), tag.strip())
                        for word, tag in zip(words.decode('utf-8').strip().split(),
                                             tags.decode('utf-8').strip().split())]
            instance_list.append(instance)
        return instance_list

    def predict(self, test_data):
        '''return a list of lists, [[(w1, tag1), (w2, tag2), (w3, tag3)], [...], [...]]
        '''
        assert test_data is not None, 'test_data is required.'
        assert isinstance(test_data, DataSetCSVagentActPred)
        print('\tpredicting Slot Tags ...')
        # transform data
        instance_list = self._transform_data(test_data)
        userUtterTag_test_fname = '{}/userUtterTag_test.target'.format(self.model_folder)
        writeUtterTag(instance_list, userUtterTag_test_fname)
        print('\ttag_target={}'.format(userUtterTag_test_fname))
        instance_utter_list = getUtterList(instance_list)
        # testing
        results = self.model.tag_sents(instance_utter_list)
        self.result_fname = '{}/userUtterTag_test.pred'.format(self.model_folder)
        print('\ttag_pred={}'.format(self.result_fname))
        writeUtterTag(results, self.result_fname)
        precision, recall, fscore, accuracy_frame = eval_tagPredBaseline(
            instance_list, results, test_data.userTag2id, test_data.userTag_vocab_size)
        print('\tprecision={:.4f}, recall={:.4f}, fscore={:.4f}, accuracy_frame={:.4f}'.format(
            precision, recall, fscore, accuracy_frame))
        return results

    def load_model(self, verbose=True):
        print('\tloading model ...')
        self.model = CRFTagger(verbose=verbose)
        self.model.set_model_file(self.model_fname)
def crftagger(hasil_stem):
    result = []
    ct = CRFTagger()
    ct.set_model_file('D://dataset/all_indo_man_tag_corpus_model.crf.tagger')
    for i in hasil_stem:
        hasil = ct.tag_sents([i])
        for j in hasil:
            result.append(j)
    return result
def Postagging(data):
    postagOnly = []
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    postaggedData = ct.tag_sents(data)
    for i in range(len(postaggedData)):
        for j in range(len(postaggedData[i])):
            postagOnly.append(postaggedData[i][j][1])
    return postagOnly
def __init__(self, iob_predictor):
    self.iob_predictor = iob_predictor
    self.stemmer = StemmerFactory().create_stemmer()
    self.TAGGER3 = CRFTagger()
    self.TAGGER3.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    self.label_words = self.read_label_file('label-words.txt')
    self.label_posses = self.read_label_file('label-posses.txt')
    self.label_lemmas = self.read_label_file('label-lemmas.txt')
    self.label_iob_feature = self.read_label_file('label-iob_feature.txt')
    self.label_iob_classes = self.read_label_file('label-iob_classes.txt')
def question3():
    tagger = CRFTagger(feature_func=feature_func)
    tagger.train(train_sentences, 'model_windows_size_1.crf.tagger')
    # tagger = CRFTagger(feature_func=feature_func)
    # tagger.set_model_file('model_windows_size_1.crf.tagger')
    print(tagger.evaluate(test_sentences))
    return
def tag_crf(self, untagged_string: str):
    """Tag POS with CRF tagger.

    :param untagged_string: An untagged, untokenized string of text.
    :type untagged_string: str
    :rtype: list
    """
    untagged_tokens = wordpunct_tokenize(untagged_string)
    pickle_path = self.available_taggers['crf']
    tagger = CRFTagger()
    tagger.set_model_file(pickle_path)
    tagged_text = tagger.tag(untagged_tokens)
    return tagged_text
def getPosTag():
    global perLabel, jobLabel, subLabel, orgLabel, geoLabel
    raw_sent = sentInput.get()

    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    tokens = nltk.tokenize.word_tokenize(raw_sent)
    postagged = ct.tag_sents([tokens])

    data = []
    for token in postagged[0]:
        data.append(token + ('O', ))

    tagger_ner = pycrfsuite.Tagger()
    tagger_ner.open('model_ner.crfsuite')
    ner = tagger_ner.tag(sent2features(data, False))
    for i in range(len(ner)):
        data[i] = data[i][0:2] + (ner[i], )

    tagger_oh = pycrfsuite.Tagger()
    tagger_oh.open('model_oh.crfsuite')
    oh = tagger_oh.tag(sent2features(data, True))
    for i in range(len(oh)):
        data[i] += (oh[i], )

    per = []
    job = []
    sub = []
    org = []
    geo = []
    for token in data:
        if token[3] == '1':
            label = token[2][-3:]
            if label == 'PER':
                per.append(token[0])
            elif label == 'ORG':
                org.append(token[0])
            elif label == 'SUB':
                sub.append(token[0])
            elif label == 'JOB':
                job.append(token[0])
            elif label == 'GEO':
                geo.append(token[0])

    perLabel.config(text='PER: ' + ' '.join(per))
    jobLabel.config(text='JOB: ' + ' '.join(job))
    subLabel.config(text='SUB: ' + ' '.join(sub))
    orgLabel.config(text='ORG: ' + ' '.join(org))
    geoLabel.config(text='GEO: ' + ' '.join(geo))
def crf_tag():
    news_text = brown.tagged_sents(categories='news')
    train_sents = news_text[:3230]
    test_sents = news_text[3230:4600]
    ct = CRFTagger()
    ct.train(train_sents, 'model.crf.tagger')
    test = ct.evaluate(test_sents)
    print test
    sent3 = "Narendra Modi won Lok Sabha election with massive majority after long years".decode('utf-8')
    sent_w = sent3.lower().split()
    print sent_w
    tag = ct.tag(sent_w)
    print "The Tag Is:", tag
def pos_tagger(data, attr="paragraphs"):
    flatten = lambda l: [item for sublist in l for item in sublist]
    ct = CRFTagger()
    ct.set_model_file('dataset/all_indo_man_tag_corpus_model.crf.tagger')
    for category in data:
        category['word_tag_{}'.format(attr)] = []
        for paragraph in category[attr]:
            list_tag_kalimat = []
            for kalimat in paragraph:
                tag_kalimat = ct.tag_sents([kalimat])
                tag_kalimat = flatten(tag_kalimat)
                list_tag_kalimat.append(tag_kalimat)
            category['word_tag_{}'.format(attr)].append(list_tag_kalimat)
    return data
def __init__(self):
    # Load the pre-trained POS-tagger data
    uni, bi, tri, word = self.load_obj("tagger")
    self.TAGGER1 = Tagger(uni, bi, tri, word)

    # Load the second pre-trained POS-tagger data
    uni2, bi2, tri2, word2 = self.load_obj("tagger2")
    self.TAGGER2 = Tagger(uni2, bi2, tri2, word2)

    self.TAGGER3 = CRFTagger()
    self.TAGGER3.set_model_file('postagg/dataset/all_indo_man_tag_corpus_model.crf.tagger')

    # Load the grammar chunker data
    self.load_chunker()
def ner_tag(word):
    from nltk.tag import CRFTagger
    ct = CRFTagger()
    import pickle
    infolist = pickle.load(open('infolist.pickle', 'rb'))
    infodict = {}
    posdict = {}
    nerdict = {}
    # the loop variable is renamed so it no longer shadows the `word` argument
    for w, postag, nertag in infolist:
        if w not in posdict:
            posdict[w] = [postag]
        else:
            posdict[w].append(postag)
    for w, postag, nertag in infolist:
        if w not in nerdict:
            nerdict[w] = [nertag]
        else:
            nerdict[w].append(nertag)
    #print(most_common(posdict["van"]))
    ner_tag = most_common(nerdict[word])
    return ner_tag
def pos_tagger(text):
    # input: text (string)
    # instantiate the tagger
    ct = CRFTagger()
    # load the Indonesian tagger model
    ct.set_model_file('model_postagging_crf.tagger')
    # cleaning
    text = re.sub(r'\.?\,?\(?\)?\"?', '', text)
    text = re.sub("\n", " ", text)
    text = text.split(" ")
    # this call performs the POS tagging
    tagged_text = ct.tag_sents([text])
    # result
    return tagged_text
    # output: the text annotated with POS tags
def chunking(sents, chunked_file):
    '''
    Chunking
    param sents: a list of token lists, e.g. [['dog', 'is', 'dog'], ['dog', 'good']]
    '''
    os.chdir('/home/zqr/code/chunk2vec/')
    start_time = time.time()

    # PoS tagging
    print '\n-->Start PoS'
    #print '->Training PoS Tagger'
    #ct = CRFTagger()
    #ct.train(chunk_traindata(pos_trainfile), 'model.crf.tagger')
    #print '->Done'
    #pos_testdata_gold = chunk_traindata(pos_testfile)  # pos corpus
    print '->Load CRF Tagger model'
    ct = CRFTagger()
    # this model holds the PoS tags learned from the chunking task
    ct.set_model_file('model.crf.tagger')
    print '->Posing'
    tagged_sents = ct.tag_sents(sents)
    #print 'PoS acc.:', ct.evaluate(pos_testdata_gold)

    # write the PoS-tagged sentences to a file
    print '->Write posed file'
    pos_data(tagged_sents, 'tmp_for_chunking')
    end_time = time.time()
    print '-->Done, Time:', end_time - start_time, 's'

    # to save time, temporarily use the test corpus
    #pos_data(pos_testdata_gold, chunk_inputfile)

    start_time = time.time()
    # Chunking: requires YamCha to be installed on the system;
    # the CoNLL training corpus is used as training data
    print '\n-->Start Chunking'
    os.system('yamcha-config --libexecdir')
    #os.chdir('/home/zqr/code/sent2vec/')
    os.system('cp /home/zqr/local/libexec/yamcha/Makefile .')
    # train the chunking model
    #os.system('make CORPUS=' + pos_trainfile + ' MODEL=chunk_model train')
    os.system('yamcha -m chunk_model.model < tmp_for_chunking > ' + chunked_file)
    print '-->Done, Time:', time.time() - start_time, 's'
def train(self, modelfile):
    sa_feats = [x for x, _ in self.__speech_act_instance_list]
    sa_labels = [y for _, y in self.__speech_act_instance_list]

    self.__speech_act_lb = preprocessing.LabelBinarizer()
    sa_labels = self.__speech_act_lb.fit_transform(sa_labels)

    self.__speech_act_model = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LinearSVC(verbose=True)))])
    self.__speech_act_model.fit(sa_feats, sa_labels)

    with open('%s.act.model' % modelfile, 'wb') as f:
        pickle.dump((self.__speech_act_model, self.__speech_act_lb), f)

    self.__semantic_model = CRFTagger(verbose=True)
    self.__semantic_model.train(self.__semantic_instance_list, '%s.semantic.model' % modelfile)
def ExtractItemsFromJudgment(text, CodeTaggerFile, TitleTaggerFile):
    text = removeHTMLTags(text)
    tokenList = tokenizeTestData(text)

    CodesTagger = CRFTagger()
    TitleTagger = CRFTagger()
    CodesTagger.set_model_file(CodeTaggerFile)
    TitleTagger.set_model_file(TitleTaggerFile)

    taggedCodes = CodesTagger.tag_sents(tokenList)
    taggedTitles = TitleTagger.tag_sents(tokenList)

    return extract_entities(taggedCodes, taggedTitles)
def load(training, testing):
    ct = CRFTagger()

    # split the training data into sentences
    t = "\n".join(training)
    sents = t.split("###/###")

    # split the sentences into tokens
    train = []
    for sent in sents:
        if sent:
            new = []
            words = sent.split("\n")
            for word in words:
                if word:
                    # split the tokens into word and tag
                    new.append(tuple(word.split("/")))
            train.append(new)

    # remove any blank sentences that have been added
    train = [t for t in train if t]

    ct.train(train, 'model.crf.tagger')

    # test on the testing data
    s = "\n".join(testing)
    s_sents = s.split("###/###")
    test = []
    sent_tags = []
    for t in s_sents:
        if t:
            new = []
            right_tags = []
            words = t.split("\n")
            for word in words:
                if word:
                    # split the tokens into just words
                    new.append(word.split("/")[0])
                    # save the tags in a list to be used later
                    right_tags.append(word.split("/")[1])
            sent_tags.append(right_tags)
            test.append(new)

    tags = ct.tag_sents(test)
    return tags, sent_tags
def run_crf(trainfile, testfile, model_file=None):
    maxlen = 100
    sents_train, tags_train, unique_words_train, unique_tags_train = \
        P.retrieve_sentences_tags(trainfile, maxlen=maxlen)
    sents_test, tags_test, unique_word_test, unique_tags_test = \
        P.retrieve_sentences_tags(testfile, maxlen=maxlen, allowedtags=unique_tags_train)

    train_data = []
    for n, st in enumerate(sents_train):
        s = []
        for m, _ in enumerate(st):
            s.append((unicode(sents_train[n][m], "utf-8"), unicode(tags_train[n][m], "utf-8")))
        train_data.append(s)

    crf = CRFTagger()
    if model_file is None:
        crf.train(train_data, model_file='data/crf.mdl')
    else:
        crf.set_model_file(model_file)

    test_data = []
    for n, st in enumerate(sents_test):
        s = []
        for m, _ in enumerate(st):
            s.append((unicode(sents_test[n][m], "utf-8"), unicode(tags_test[n][m], "utf-8")))
        test_data.append(s)

    print(crf.evaluate(test_data))
def ExtractItemsFromJudgment(text):
    text = removeHTMLTags(text)
    tokenList = tokenizeTestData(text)

    CodesTagger = CRFTagger()
    titleTagger = CRFTagger()
    CodesTagger.set_model_file("models/CRF-Model-OnlyCodes")
    titleTagger.set_model_file("models/CRF-Model-OnlyTitles")

    taggedCodes = CodesTagger.tag_sents(tokenList)
    taggedTitles = titleTagger.tag_sents(tokenList)

    return extract_entities(taggedCodes, taggedTitles)
from nltk.corpus import treebank
from nltk.tag import tnt, CRFTagger

# split training data from test data
train_data = treebank.tagged_sents()[:3000]
test_data = treebank.tagged_sents()[3000:]

# train a Trigrams'n'Tags (TnT) tagger
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)
print tnt_pos_tagger.evaluate(test_data)

# train a CRF tagger
crf_tagger = CRFTagger()
crf_tagger.train(train_data, '~/Documents/NLP/NLP/crf_model.txt')
print crf_tagger.evaluate(test_data)
def create_trainingModel(train_data, ModelPath):
    if os.path.isfile(ModelPath):
        os.remove(ModelPath)
    ct = CRFTagger()
    ct.train(train_data, ModelPath)
class SimpleSLU:
    def __init__(self):
        self.__semantic_instance_list = []
        self.__speech_act_instance_list = []
        self.__semantic_model = None
        self.__speech_act_model = None
        self.__speech_act_lb = None

    def load_model(self, modelfile):
        # open in binary mode, matching the 'wb' mode used when the model was pickled
        with open('%s.act.model' % modelfile, 'rb') as f:
            self.__speech_act_model, self.__speech_act_lb = pickle.load(f)
        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.set_model_file('%s.semantic.model' % modelfile)
        return True

    def add_instance(self, utter, speech_act, semantic_tagged):
        tokenized = self.__tokenize(utter, semantic_tagged)
        if tokenized is None:
            return False

        semantic_instance = []
        for word, (bio, tag, attrs) in tokenized:
            if bio is None:
                sem_label = 'O'
            else:
                cat = None
                for attr, val in attrs:
                    if attr == 'cat':
                        cat = val
                sem_label = '%s-%s_%s' % (bio, tag, cat)
            semantic_instance.append((unicode(word.lower()), unicode(sem_label)))
        self.__semantic_instance_list.append(semantic_instance)

        sa_label_list = []
        for sa in speech_act:
            sa_labels = ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list += sa_labels
        sa_label_list = sorted(set(sa_label_list))

        word_feats = ' '.join([word.lower() for word, _ in tokenized])
        self.__speech_act_instance_list.append((word_feats, sa_label_list))
        return True

    def train(self, modelfile):
        sa_feats = [x for x, _ in self.__speech_act_instance_list]
        sa_labels = [y for _, y in self.__speech_act_instance_list]

        self.__speech_act_lb = preprocessing.MultiLabelBinarizer()
        sa_labels = self.__speech_act_lb.fit_transform(sa_labels)

        self.__speech_act_model = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', OneVsRestClassifier(LinearSVC(verbose=True)))])
        self.__speech_act_model.fit(sa_feats, sa_labels)

        with open('%s.act.model' % modelfile, 'wb') as f:
            pickle.dump((self.__speech_act_model, self.__speech_act_lb), f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.train(self.__semantic_instance_list, '%s.semantic.model' % modelfile)

    def pred(self, utter):
        tokenized = self.__tokenize(utter)
        word_feats = ' '.join([word.lower() for word, _ in tokenized])

        pred_act = self.__speech_act_lb.inverse_transform(
            self.__speech_act_model.predict([word_feats]))
        pred_semantic = self.__semantic_model.tag([word.lower() for word, _ in tokenized])

        return (pred_act, pred_semantic)

    def __tokenize(self, utter, semantic_tagged=None):
        result = None
        if semantic_tagged is None:
            result = [(word, None) for word in nltk.word_tokenize(utter)]
        else:
            parser_raw = SemanticTagParser(False)
            parser_tagged = SemanticTagParser(False)

            segmented = ' '.join(nltk.word_tokenize(utter))
            tagged = ' '.join(semantic_tagged)

            parser_raw.feed(segmented)
            parser_tagged.feed(tagged)

            raw_chr_seq = parser_raw.get_chr_seq()
            raw_space_seq = parser_raw.get_chr_space_seq()
            tagged_chr_seq = parser_tagged.get_chr_seq()
            tagged_space_seq = parser_tagged.get_chr_space_seq()

            if raw_chr_seq == tagged_chr_seq:
                merged_space_seq = [
                    x or y for x, y in zip(raw_space_seq, tagged_space_seq)]
                word_seq = parser_tagged.tokenize(merged_space_seq)
                tag_seq = parser_tagged.get_word_tag_seq()
                result = [(word, tag) for word, tag in zip(word_seq, tag_seq)]
        return result
    for word in wordsPerLine:
        requiredFormat.append(word)
    #print "\nrequiredFormat = ", requiredFormat
    return requiredFormat

print "\nReading training corpus...."
ListOfSentences_Training = corpusRead(Training_Data)
print "Reading test corpus...."
ListOfSentences_Test = corpusRead(Test_Data)

# CRF Training
ct = CRFTagger()
print "CRF Training starts..."
ct.train(ListOfSentences_Training, 'model.crf.tagger')
print "CRF Training is done."

print "Testing starts"
print "Accuracy of CRF is = ", ct.evaluate(ListOfSentences_Test) * 100

# Tagging by CRF Tagger
ch = 'y'
while (ch != 'n'):
    text = raw_input("Enter the text to be tagged : \n")
    text = converter(text)
    print ct.tag_sents(text)
    print "\nDo you want to continue ?"
    ch = raw_input()
#with codecs.open("/Users/Preethi/nlp_project/EMNLP/mandarin_english/training/mandarin-english-training.txt","r","utf-8") as f:
line = f.readline()
line_list = []
while line:
    #print(line)
    words = line.replace("\r", "").replace("\n", "").split("\t")
    #print(words)
    if len(words) < 2:
        train_data.append(line_list)
        line_list = []
    else:
        tup1 = (words[0], words[1])
        line_list.append(tup1)
    line = f.readline()
f.close()

ct = CRFTagger()
ct.train(train_data, 'model.crf.tagger')

test_actual = []
test_sentences = []
#with codecs.open("nepali-english-demo-20%training-data.txt","r","utf-8") as f:
with codecs.open("/Users/Preethi/nlp_project/EMNLP/spanish_english/training/spanish-english-training-20%.txt", "r", "utf-8") as f:
#with codecs.open("/Users/Preethi/nlp_project/EMNLP/mandarin_english/training/mandarin-english-testing-answers.txt","r","utf-8") as f:
    line = f.readline()
    test = []
    sentence = []
    while line:
        words = line.replace("\r", "").replace("\n", "").split("\t")
        #print(words)
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    crf_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part

        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]

        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()

        print('Loop #' + str(counter))

        # make crf tagger
        crf_tagger = CRFTagger()
        crf_tagger.train(train_sents, 'model.crf.tagger')

        # evaluate crf tagger
        crf_accuracy = crf_tagger.evaluate(test_sents)
        crf_accuracies.append(crf_accuracy)
        print('crf:', crf_accuracy)

        #if counter > 0: break

    final_accuracies_list = []

    mean_accuracy_crf = mean(crf_accuracies)
    standard_deviation_crf = stdev(crf_accuracies)
    uni = {'crf': {'mean': mean_accuracy_crf, 'sd': standard_deviation_crf}}
    final_accuracies_list.append(uni)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)

    return final_dict