def __init__(self, train_sents=None, tagger="ClassifierBasedTagger", model=None, model_name="../results/modelCRF_featured", entities=None, language="english", **kwargs):
    """Build the underlying tagger, training it or loading a saved CRF model.

    :param train_sents: training sentences; asserted to be an ``Iterable``
        whenever no ``model`` is supplied.
    :param tagger: ``"ClassifierBasedTagger"`` or ``"CRFTagger"``.
    :param model: CRF model file to load instead of training (only used on
        the ``"CRFTagger"`` path).
    :param model_name: interpolated into ``"../results/{}"`` to form the
        file a freshly trained CRF model is written to.
    :param entities: forwarded to ``self.set_entities`` on the CRF path.
    :param language: stored on ``self.language``.
    :param kwargs: forwarded to ``ClassifierBasedTagger`` only.
    :raises Exception: if ``tagger`` names neither supported tagger.
    """
    self.all_entities = []
    self.acronyms = []
    self.language = language

    must_train = not model
    if must_train:
        assert isinstance(train_sents, Iterable)

    if tagger == "ClassifierBasedTagger":
        self.feature_detector = iob_features
        self.tagger = ClassifierBasedTagger(
            train=train_sents, feature_detector=iob_features, **kwargs)
        return
    if tagger != "CRFTagger":
        raise Exception('Unknown tagger')

    self.set_entities(entities)
    self.tagger = CRFTagger(feature_func=self.crf_features)
    if must_train:
        # NOTE(review): the default model_name already starts with
        # "../results/", so the final path repeats that prefix -- confirm.
        self.tagger.train(
            train_data=train_sents,
            model_file="../results/{}".format(model_name))
    else:
        self.tagger.set_model_file(model)
def ner_tag(word):
    """Return the most frequent NER tag recorded for *word*.

    Reads ``infolist.pickle`` from the current working directory and
    iterates it as ``[word, postag, nertag]`` triples. Every NER tag seen
    with *word* is collected exactly once per occurrence and handed to
    ``most_common``.

    :param word: token to look up.
    :return: the result of ``most_common`` over the collected NER tags.
    :raises KeyError: if *word* never occurs in the pickled list.
    """
    import pickle

    # NOTE(review): pickle.load can execute arbitrary code; only use this
    # with a trusted ``infolist.pickle``.
    with open('infolist.pickle', 'rb') as handle:
        infolist = pickle.load(handle)

    # Use distinct loop names so the ``word`` argument is not overwritten.
    ner_tags = [nertag for token, _postag, nertag in infolist
                if token == word]
    if not ner_tags:
        raise KeyError(word)
    return most_common(ner_tags)
def tagSentences(path, training_list=None, testing_list=None):
    """Train a CRF tagger and tag the sentences obtained for *path*.

    :param path: forwarded to ``getSentences``.
    :param training_list: forwarded to ``getTrainList``; ``None`` (the
        default) is treated as an empty list.
    :param testing_list: forwarded to ``getSentences``; ``None`` (the
        default) is treated as an empty list.
    :return: the result of ``CRFTagger.tag_sents`` on those sentences.

    The trained model is written to ``model.crf.tagger``.
    """
    # Fresh lists per call: a shared ``[]`` default could be mutated by the
    # helpers and leak between calls.
    training_list = [] if training_list is None else training_list
    testing_list = [] if testing_list is None else testing_list

    ct = CRFTagger()
    ct.train(getTrainList(training_list), 'model.crf.tagger')
    return ct.tag_sents(getSentences(path, testing_list))
def question3():
    """Train a CRF tagger and print its evaluation on the test sentences.

    Uses the module-level ``feature_func``, ``train_sentences`` and
    ``test_sentences``; the model is written to ``model.crf.tagger``.
    """
    crf = CRFTagger(feature_func=feature_func)
    crf.train(train_sentences, 'model.crf.tagger')
    score = crf.evaluate(test_sentences)
    print(score)
def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    """Run one cross-validation fold of the CRF tagger, logging to a file.

    Reads ``train_<counter>.pos`` and ``test_<counter>.pos`` from the
    expanded *local_dir_rel*, trains a CRF tagger (model written to
    ``model.crf.tagger``) and prints the evaluation result. While the fold
    runs, ``sys.stdout`` is redirected to ``test_<counter>.out`` in the
    same directory.

    :param full_training_set: not used by this function.
    :param local_dir_rel: directory holding the fold files; ``~`` is expanded.
    :param counter: fold number used in the file names.
    """
    local_dir = os.path.expanduser(local_dir_rel)
    log_path = os.path.join(local_dir, 'test_%d.out' % counter)

    stdout_old = sys.stdout
    with open(log_path, 'w') as log_file:
        sys.stdout = log_file
        try:
            # read POS corpora
            print("local_dir", local_dir)
            train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos' % counter)
            train_sents = train_reader.tagged_sents()
            test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
            test_sents = test_reader.tagged_sents()

            print('Loop #' + str(counter))
            sys.stdout.flush()

            # make and evaluate the crf tagger
            crf_tagger = CRFTagger()
            crf_tagger.train(train_sents, 'model.crf.tagger')
            crf_accuracy = crf_tagger.evaluate(test_sents)
            print('crf:', crf_accuracy)
        finally:
            # Always restore stdout, even if training or evaluation fails.
            sys.stdout = stdout_old
def __init__(self, data=None):
    """Load the CRF model and optionally split *data* for tagging/testing.

    :param data: optional sequence; when it contains at least one ``True``
        it is passed to ``self.for_tagging_testing`` and the two results
        are stored on ``self.data_tagging`` and ``self.data_testing``.
        ``None`` (the default) behaves like an empty sequence.
    """
    self.tagger = CRFTagger()
    self.tagger.set_model_file('model.crf.tagger')
    # ``None`` replaces the shared mutable ``[]`` default.
    if data is not None and data.count(True) > 0:
        self.data_tagging, self.data_testing = self.for_tagging_testing(data)
def function_pos_tagging(new_stopwords_tweets):
    """POS-tag each tweet and keep only tokens with selected tags.

    :param new_stopwords_tweets: sequence of entries whose item ``0`` is
        the token list passed to the tagger and item ``1`` is carried
        through unchanged.
    :return: list of ``[tokens, item_1]`` pairs, where ``tokens`` are the
        tokens tagged ``NN``, ``JJ``, ``RB`` or ``VBD``; entries with no
        such token are omitted.
    """
    tagger = CRFTagger()
    tagger.set_model_file('data/all_indo_man_tag_corpus_model.crf.tagger')
    wanted_tags = ['NN', 'JJ', 'RB', 'VBD']

    # First pass: tag every tweet, keeping its companion value alongside.
    tagged_tweets = [
        [tagger.tag_sents([entry[0]]), entry[1]]
        for entry in new_stopwords_tweets
    ]

    # Second pass: filter tokens by tag and drop tweets left empty.
    selected = []
    for tagged, companion in tagged_tweets:
        kept = [token for token, tag in tagged[0] if tag in wanted_tags]
        if kept:
            selected.append([kept, companion])
    return selected
def main(self):
    """Collect affixed verbs that are directly followed by a noun or adjective.

    Reads ``forecast_corpus.txt``, splits it on whitespace and tags the
    whole token list as one sentence. A token is kept when it is tagged
    ``VB``, ``Verba_finder().afiks_check(token)`` returns ``1`` and the
    next token is tagged ``NN`` or ``JJ``.

    :return: list of matching tokens, each padded with one space on both
        sides.
    """
    with open("forecast_corpus.txt", "r") as corpus_file:
        corpus = corpus_file.read().split()

    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    tagged = ct.tag_sents([corpus])[0]
    finder = Verba_finder()

    verba = []
    # Pair each token with its successor; the last token has none, so it is
    # never inspected (indexing x + 1 there would raise IndexError).
    for (word, tag), (_next_word, next_tag) in zip(tagged, tagged[1:]):
        if (tag == 'VB' and finder.afiks_check(word) == 1
                and next_tag in ('NN', 'JJ')):
            verba.append(" " + word + " ")
    return verba
def train_taggers():
    """Train the unigram, TnT, perceptron and CRF taggers on ``train_sents``.

    The first three are saved with ``save_pkl``; the CRF tagger writes its
    model to ``crf-tagger.model`` during training.
    """
    train_sents = load_pkl('train_sents')

    # Instantiate everything up front; the unigram tagger trains on creation.
    unigram_tagger = nltk.UnigramTagger(train_sents)
    tnt_tagger = tnt.TnT()
    perceptron_tagger = perceptron.PerceptronTagger(load=False)
    # Cap the iterations because CRF training takes too long otherwise.
    crf_tagger = CRFTagger(training_opt={'max_iterations': 100})

    print('Unigram tagger has already been trained')
    save_pkl(unigram_tagger, 'unigram-tagger')

    picklable = (
        ('training TnT tagger ...', tnt_tagger, 'tnt-tagger'),
        ('training Perceptron tagger ...', perceptron_tagger,
         'perceptron-tagger'),
    )
    for banner, model, pkl_name in picklable:
        print(banner, end='', flush=True)
        model.train(train_sents)
        print('Done')
        save_pkl(model, pkl_name)

    print('training CRF tagger ...', end='', flush=True)
    crf_tagger.train(train_sents, 'crf-tagger.model')
    print('Done')
def tagpos(request):
    """Render ``polls/tagged.html`` with two taggings of a fixed sentence.

    The sentence is tokenised once and tagged both by the CRF model and by
    ``nltk.pos_tag``.

    :param request: passed through to ``template.render``.
    :return: ``HttpResponse`` wrapping the rendered template.
    """
    crf = CRFTagger()
    crf.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

    words = word_tokenize("Saya bekerja di Bandung")
    crf_tags = crf.tag_sents([words])
    nltk_tags = nltk.pos_tag(words)

    context = {'tokenize': words, 'postag': nltk_tags, 'hasil': crf_tags}
    template = loader.get_template('polls/tagged.html')
    return HttpResponse(template.render(context, request))
def train_pos_tag(dataset_dir, output_path, max_lines=500000):
    """Train a CRF POS tagger from a tab-separated ``word<TAB>tag`` file.

    Markup of the form ``<...>`` is stripped from each line. A line that is
    empty after stripping ends the current sentence.

    :param dataset_dir: path of the UTF-8 training file (despite the name,
        it is opened as a file).
    :param output_path: file the trained model is written to.
    :param max_lines: read at most this many lines; ``None`` reads all.
    :raises ValueError: if a non-empty line does not split into exactly
        two tab-separated fields.
    """
    with open(dataset_dir, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    truncated = max_lines is not None and len(lines) > max_lines

    sentences = []
    current = []
    for line in lines[:max_lines]:
        # Remove Wiki Tags
        line = re.sub('<[^>]*>', '', line)
        if line == '':
            if current:
                sentences.append(current)
                current = []
        else:
            kata, tag = line.split('\t')
            current.append((kata, tag))

    # Keep the last sentence when the file lacks a trailing blank line.
    # A sentence cut off by ``max_lines`` is still discarded.
    if current and not truncated:
        sentences.append(current)

    ct = CRFTagger()
    print("Training Tagger...")
    ct.train(sentences, output_path)
    print("Training Complete")
def test_taggers():
    """Evaluate the four saved taggers on ten test sentences and print the best.

    The unigram, TnT and perceptron taggers come from ``load_pkl``; the CRF
    tagger loads ``crf-tagger.model``. Each tagger's ``evaluate`` result is
    printed, followed by the name of the highest-scoring tagger (the first
    one wins a tie).
    """
    unigram_tagger = load_pkl('unigram-tagger')
    tnt_tagger = load_pkl('tnt-tagger')
    perceptron_tagger = load_pkl('perceptron-tagger')
    crf_tagger = CRFTagger()
    crf_tagger.set_model_file('crf-tagger.model')

    test_sents = load_pkl('test_sents')[:10]
    print(f'{len(test_sents)} sentences in testing set')

    candidates = [
        ('Unigram tagger', unigram_tagger),
        ('TnT tagger', tnt_tagger),
        ('Perceptron tagger', perceptron_tagger),
        ('CRF tagger', crf_tagger),
    ]

    scored = []
    for label, model in candidates:
        print(f'evaluating {label} ... ', end='', flush=True)
        score = model.evaluate(test_sents)
        scored.append((score, label))
        print(f'done. f1 score: {score}')

    _, best_label = max(scored, key=lambda entry: entry[0])
    print('best tagger is ' + best_label)
def __init__(self, train_sents, **kwargs):
    """Train a CRF tagger on *train_sents* using ``features``.

    :param train_sents: training sentences; asserted to be an ``Iterable``.
    :param kwargs: accepted but not used here.

    The model is written to ``model.crf.tagger``.
    """
    assert isinstance(train_sents, Iterable)
    self.feature_detector = features

    crf = CRFTagger(feature_func=features)
    self.tagger = crf
    crf.train(train_sents, 'model.crf.tagger')
def load_model(self, modelfile):
    """Load the speech-act and semantic models saved under *modelfile*.

    Unpickles ``<modelfile>.act.model`` into the speech-act model and its
    label binarizer, then points a fresh ``CRFTagger`` at
    ``<modelfile>.semantic.model``.

    :param modelfile: common path prefix of the two model files.
    :return: ``True``.
    """
    # Pickle data is binary, so the file must be opened in 'rb' mode.
    # NOTE(review): pickle.load can execute arbitrary code; only load
    # trusted model files.
    with open('%s.act.model' % modelfile, 'rb') as f:
        self.__speech_act_model, self.__speech_act_lb = pickle.load(f)

    self.__semantic_model = CRFTagger(verbose=True)
    self.__semantic_model.set_model_file('%s.semantic.model' % modelfile)
    return True
def crftagger(hasil_stem):
    """Tag each token list in *hasil_stem* with the CRF model.

    :param hasil_stem: iterable of token lists, each tagged as one sentence.
    :return: list with one tagged sentence per input item, in order.
    """
    tagger = CRFTagger()
    tagger.set_model_file('D://dataset/all_indo_man_tag_corpus_model.crf.tagger')

    tagged = []
    for tokens in hasil_stem:
        tagged.extend(tagger.tag_sents([tokens]))
    return tagged
def __init__(self, iob_predictor):
    """Store the predictor and load the stemmer, POS tagger and label files.

    :param iob_predictor: stored on ``self.iob_predictor``.
    """
    self.iob_predictor = iob_predictor
    self.stemmer = StemmerFactory().create_stemmer()

    pos_tagger = CRFTagger()
    self.TAGGER3 = pos_tagger
    pos_tagger.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

    # Attribute name -> label file read through ``self.read_label_file``.
    label_sources = (
        ('label_words', 'label-words.txt'),
        ('label_posses', 'label-posses.txt'),
        ('label_lemmas', 'label-lemmas.txt'),
        ('label_iob_feature', 'label-iob_feature.txt'),
        ('label_iob_classes', 'label-iob_classes.txt'),
    )
    for attribute, label_file in label_sources:
        setattr(self, attribute, self.read_label_file(label_file))
def question3():
    """Train a CRF tagger and print its evaluation on the test sentences.

    Uses the module-level ``feature_func``, ``train_sentences`` and
    ``test_sentences``; the model is written to
    ``model_windows_size_1.crf.tagger``.
    """
    crf = CRFTagger(feature_func=feature_func)
    crf.train(train_sentences, 'model_windows_size_1.crf.tagger')
    score = crf.evaluate(test_sentences)
    print(score)
def Postagging(data):
    """Return the POS tags of every token in *data* as one flat list.

    :param data: sentences passed to ``CRFTagger.tag_sents``.
    :return: the tag (item ``1`` of each tagged pair) of every token, in
        sentence order.
    """
    tagger = CRFTagger()
    tagger.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    tagged_sentences = tagger.tag_sents(data)
    return [pair[1] for sentence in tagged_sentences for pair in sentence]
def tag_crf(self, untagged_string: str):
    """Tag POS with the CRF tagger.

    :param untagged_string: an untagged, untokenized string of text.
    :return: the result of ``CRFTagger.tag`` on the tokens produced by
        ``wordpunct_tokenize``.
    """
    tokens = wordpunct_tokenize(untagged_string)
    model_path = self.available_taggers['crf']

    crf = CRFTagger()
    crf.set_model_file(model_path)
    return crf.tag(tokens)
def train(self, verbose=True):
    """Train the CRF model on ``self.train_data``, printing progress.

    The transformed training instances are also written with
    ``writeUtterTag`` to ``userUtterTag_train.txt`` inside
    ``self.model_folder``; the model is trained into ``self.model_fname``.

    :param verbose: forwarded to ``CRFTagger``.
    :raises AssertionError: if ``self.train_data`` is ``None``.
    """
    assert self.train_data is not None, 'train_data is required.'
    print('\ttraining ...')

    # Transform the data and dump it for inspection.
    instances = self._transform_data(self.train_data)
    dump_path = '{}/userUtterTag_train.txt'.format(self.model_folder)
    writeUtterTag(instances, dump_path)
    print('\ttrain_data={}'.format(dump_path))

    # Train the model.
    self.model = CRFTagger(verbose=verbose)
    self.model.train(instances, self.model_fname)
    print('\tmodel_fname={}'.format(self.model_fname))
    print('\tsaving model ...')
def getPosTag():
    """Tag the sentence in ``sentInput`` and show the entities per label.

    The input is POS-tagged, then labelled by the ``model_ner.crfsuite``
    model and by the ``model_oh.crfsuite`` model. Tokens whose second
    label is ``'1'`` are grouped by the last three characters of their NER
    label (PER, ORG, SUB, JOB, GEO) and written to the matching label widget.
    """
    raw_sent = sentInput.get()

    pos_model = CRFTagger()
    pos_model.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    words = nltk.tokenize.word_tokenize(raw_sent)
    # Each row starts as (token, pos, 'O'); 'O' is a placeholder NER label.
    rows = [pair + ('O', ) for pair in pos_model.tag_sents([words])[0]]

    ner_model = pycrfsuite.Tagger()
    ner_model.open('model_ner.crfsuite')
    for i, ner_label in enumerate(ner_model.tag(sent2features(rows, False))):
        rows[i] = rows[i][0:2] + (ner_label, )

    oh_model = pycrfsuite.Tagger()
    oh_model.open('model_oh.crfsuite')
    for i, flag in enumerate(oh_model.tag(sent2features(rows, True))):
        rows[i] += (flag, )

    found = {'PER': [], 'JOB': [], 'SUB': [], 'ORG': [], 'GEO': []}
    for row in rows:
        if row[3] != '1':
            continue
        kind = row[2][-3:]
        if kind in found:
            found[kind].append(row[0])

    perLabel.config(text='PER: ' + (' ').join(found['PER']))
    jobLabel.config(text='JOB: ' + (' ').join(found['JOB']))
    subLabel.config(text='SUB: ' + (' ').join(found['SUB']))
    orgLabel.config(text='ORG: ' + (' ').join(found['ORG']))
    geoLabel.config(text='GEO: ' + (' ').join(found['GEO']))
def crf_tag():
    """Train a CRF tagger on Brown news sentences and tag a sample sentence.

    Sentences ``[:3230]`` of the ``news`` category train the tagger (model
    written to ``model.crf.tagger``) and ``[3230:4600]`` evaluate it. The
    evaluation result, the lower-cased sample tokens and their tags are
    printed.
    """
    news_text = brown.tagged_sents(categories='news')
    train_sents = news_text[:3230]
    test_sents = news_text[3230:4600]

    ct = CRFTagger()
    ct.train(train_sents, 'model.crf.tagger')
    print(ct.evaluate(test_sents))

    # Python 3 string literals are already text, so no decode is needed.
    sent3 = "Narendra Modi won Lok Sabha election with massive majority after long years"
    sent_w = sent3.lower().split()
    print(sent_w)

    tag = ct.tag(sent_w)
    print("The Tag Is:", tag)
def pos_tagger(data, attr="paragraphs"):
    """Add CRF POS tags for every sentence under ``category[attr]``.

    For each category in *data*, ``category['word_tag_<attr>']`` is set to
    a list with one entry per paragraph; each entry lists the tagged
    sentences of that paragraph.

    :param data: iterable of dict-like categories, modified in place.
    :param attr: key holding the paragraphs (each an iterable of sentences).
    :return: the same *data* object.
    """
    tagger = CRFTagger()
    tagger.set_model_file('dataset/all_indo_man_tag_corpus_model.crf.tagger')
    target_key = 'word_tag_{}'.format(attr)

    for category in data:
        category[target_key] = []
        for paragraph in category[attr]:
            tagged_paragraph = []
            for sentence in paragraph:
                # tag_sents returns one list per input sentence; flatten
                # that single-sentence batch into one list of pairs.
                batch = tagger.tag_sents([sentence])
                tagged_paragraph.append(
                    [pair for tagged in batch for pair in tagged])
            category[target_key].append(tagged_paragraph)
    return data
def __init__(self):
    """Load the two pre-trained n-gram taggers, the CRF tagger and the chunker."""
    # Load the first pre-trained POS tagger data.
    unigrams, bigrams, trigrams, words = self.load_obj("tagger")
    self.TAGGER1 = Tagger(unigrams, bigrams, trigrams, words)

    # Load the second pre-trained POS tagger data.
    unigrams2, bigrams2, trigrams2, words2 = self.load_obj("tagger2")
    self.TAGGER2 = Tagger(unigrams2, bigrams2, trigrams2, words2)

    self.TAGGER3 = CRFTagger()
    self.TAGGER3.set_model_file(
        'postagg/dataset/all_indo_man_tag_corpus_model.crf.tagger')

    # Load the chunker grammar data.
    self.load_chunker()
def pos_tagger(text):
    """POS-tag *text* with the ``model_postagging_crf.tagger`` model.

    The characters ``. , ( ) "`` are removed, then the text is split on
    whitespace and tagged as a single sentence.

    :param text: input text as a string.
    :return: the result of ``CRFTagger.tag_sents`` for the one sentence.
    """
    ct = CRFTagger()
    # Load the Indonesian tagger model.
    ct.set_model_file('model_postagging_crf.tagger')

    # Cleaning: a raw-string character class avoids invalid escape
    # sequences and removes the same five characters.
    text = re.sub(r'[.,()"]', '', text)
    # split() covers newlines too and never yields empty tokens, which
    # split(" ") did after punctuation removal left double spaces.
    tokens = text.split()

    return ct.tag_sents([tokens])
def make_pos_model(model_type):
    """Load selected algorithm, save model to models repo.

    Trains the requested tagger on ``greek_training_set.pos`` in the
    current directory and writes it under
    ``~/greek_models_cltk/taggers/pos``. The CRF tagger writes its own
    model file during training; every other tagger is pickled.

    :param model_type: one of ``unigram``, ``bigram``, ``trigram``,
        ``backoff``, ``tnt`` or ``crf``. Any other value prints
        ``Invalid model_type.`` and returns without training.
    """
    file_names = {
        'unigram': 'unigram.pickle',
        'bigram': 'bigram.pickle',
        'trigram': 'trigram.pickle',
        'backoff': '123grambackoff.pickle',
        'tnt': 'tnt.pickle',
        'crf': 'crf.pickle',
    }
    # Reject unknown types before loading the corpus; the old fall-through
    # crashed on an unbound file name after printing the message.
    if model_type not in file_names:
        print('Invalid model_type.')
        return

    now = time.time()
    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()

    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    path = os.path.join(_dir, file_names[model_type])

    if model_type == 'crf':
        CRFTagger().train(train_sents, path)
    else:
        if model_type == 'unigram':
            tagger = UnigramTagger(train_sents)
        elif model_type == 'bigram':
            tagger = BigramTagger(train_sents)
        elif model_type == 'trigram':
            tagger = TrigramTagger(train_sents)
        elif model_type == 'backoff':
            tagger1 = UnigramTagger(train_sents)
            tagger2 = BigramTagger(train_sents, backoff=tagger1)
            tagger = TrigramTagger(train_sents, backoff=tagger2)
        else:  # 'tnt'
            tagger = tnt.TnT()
            tagger.train(train_sents)
        with open(path, 'wb') as f:
            pickle.dump(tagger, f)

    print('Completed training {0} model in {1} seconds to {2}.'.format(
        model_type, time.time() - now, path))
def getData(filename):
    """Read ``<filename>.csv`` and return its sentences, POS-tagged and shuffled.

    Consecutive rows sharing the same columns 1 and 2 form one sentence.
    Column 4 is the token; columns 5 and 6 are appended to each tagged
    token as two extra fields.

    :param filename: CSV path without the ``.csv`` extension.
    :return: list of sentences in random order, each a list of
        ``tagged_token + (column_5, column_6)`` tuples.
    """
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

    with open(filename + '.csv', 'r') as f:
        annotated = list(csv.reader(f))

    result = []

    def flush(sent, sent_gold, sent_oh):
        """Tag one collected sentence and append it to ``result``."""
        if not sent:
            # Nothing collected yet (before the first row or empty file).
            return
        hasil = ct.tag_sents([sent])
        # zip stops at the shortest input, which is what skipping
        # IndexError achieved, without hiding unrelated errors.
        result.append([
            tagged + (gold, oh)
            for tagged, gold, oh in zip(hasil[0], sent_gold, sent_oh)
        ])

    sent, sent_gold, sent_oh = [], [], []
    curr_sent = ''
    for token in annotated:
        sent_key = str(token[1]) + ' ' + str(token[2])
        if curr_sent != sent_key:
            flush(sent, sent_gold, sent_oh)
            sent, sent_gold, sent_oh = [], [], []
            curr_sent = sent_key
        sent.append(token[4])
        sent_gold.append(token[5])
        sent_oh.append(token[6])
    flush(sent, sent_gold, sent_oh)

    print('Total sentence: ' + str(len(result)))
    random.shuffle(result)
    return result
def train(self, modelfile):
    """Train and persist the speech-act and semantic models.

    The speech-act model is a count/tf-idf/one-vs-rest ``LinearSVC``
    pipeline fitted on binarized labels and pickled, together with the
    label binarizer, to ``<modelfile>.act.model``. The semantic CRF tagger
    is trained into ``<modelfile>.semantic.model``.

    :param modelfile: common path prefix of the two model files.
    """
    instances = self.__speech_act_instance_list
    utterances = [feats for feats, _label in instances]
    raw_labels = [label for _feats, label in instances]

    self.__speech_act_lb = preprocessing.LabelBinarizer()
    binarized = self.__speech_act_lb.fit_transform(raw_labels)

    steps = [
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LinearSVC(verbose=True))),
    ]
    self.__speech_act_model = Pipeline(steps)
    self.__speech_act_model.fit(utterances, binarized)

    with open('%s.act.model' % modelfile, 'wb') as f:
        pickle.dump((self.__speech_act_model, self.__speech_act_lb), f)

    self.__semantic_model = CRFTagger(verbose=True)
    self.__semantic_model.train(
        self.__semantic_instance_list, '%s.semantic.model' % modelfile)
def main(no_stopwords, use_manual_train_set):
    """Train the CRF tagger and print accuracy on both test sets.

    :param no_stopwords: forwarded to the data loaders and to
        ``calculate_micro_accuracy``; when falsy, the result of
        ``get_other_set`` is added to the domain set.
    :param use_manual_train_set: when truthy, the hand-labelled set is
        split 0.28 and its training part joins the training data;
        otherwise all of it is used for testing.

    The model is written to ``laptop.crf.tagger``. Every ``print`` takes a
    single argument so the output is the same on Python 2 and Python 3.
    """
    print("MAINTAIN COMMON WORDS: " + str(not no_stopwords))
    print("USING HAND LABELED TRAIN DATA: " + str(use_manual_train_set))

    full_set = get_domain_set(no_stopwords)
    if not no_stopwords:
        full_set.extend(get_other_set())
    train_set, test_set_auto = divide_sets(full_set, 0.75)

    set_manual = get_manual_set(no_stopwords)
    if use_manual_train_set:
        train_set_manual, test_set_manual = divide_sets(set_manual, 0.28)
        train_set.extend(train_set_manual)
    else:
        test_set_manual = set_manual

    tagger = CRFTagger(feature_func=feature_extraction)
    try:
        tagger.train(train_set, 'laptop.crf.tagger')
    except ValueError:
        # Dump the module-level DEBUG entries; ``with`` closes the file
        # even if a write fails.
        # NOTE(review): execution continues with an untrained tagger after
        # this dump -- confirm that is intended.
        # NOTE(review): on Python 3, str(bytes) writes a b'...' repr --
        # confirm the desired dump format.
        with open('DEBUG', 'w') as fi:
            for li in DEBUG:
                fi.write(str(li.encode('utf-8')) + '\n')

    print("AUTOMATIC LABELED TEST")
    tagged_sents_auto = tagger.tag_sents(map_test_set(test_set_auto, word=True))
    predicted_auto = create_vector_of_predicted_labels(tagged_sents_auto)
    golden_auto = create_vector_of_predicted_labels(test_set_auto)
    print(calculate_micro_accuracy(predicted_auto, golden_auto, no_stopwords))

    print("MANUAL LABELED TEST")
    tagged_sents_manual = tagger.tag_sents(map_test_set(test_set_manual, word=True))
    predicted_manual = create_vector_of_predicted_labels(tagged_sents_manual)
    golden_manual = create_vector_of_predicted_labels(test_set_manual)
    print(calculate_micro_accuracy(predicted_manual, golden_manual, no_stopwords))
    print("")
def train_tagger(language, model_type, feature, train_sents):
    """Train and return the tagger selected by *model_type*.

    :param language: used only in the CRF model path.
    :param model_type: one of ``unigram``, ``bigram``, ``trigram``,
        ``backoff``, ``crf`` or ``perceptron``.
    :param feature: used only in the CRF model path.
    :param train_sents: training sentences handed to the tagger.
    :return: the trained tagger. The CRF tagger also writes its model to
        ``taggers/<language>/<feature>/crf.pickle``.
    :raises ValueError: if *model_type* is not one of the supported names
        (previously this surfaced as an ``UnboundLocalError``).
    """
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
    elif model_type == 'crf':
        tagger = CRFTagger()
        tagger.train(train_sents,
                     'taggers/{0}/{1}/crf.pickle'.format(language, feature))
    elif model_type == 'perceptron':
        tagger = PerceptronTagger(load=False)
        tagger.train(train_sents)
    else:
        raise ValueError('Unknown model_type: {0!r}'.format(model_type))
    return tagger