def ExtractItemsFromJudgment(text, CodeTaggerFile, TitleTaggerFile):
    text = removeHTMLTags(text)
    tokenList = tokenizeTestData(text)
    CodesTagger = CRFTagger()
    TitleTagger = CRFTagger()
    CodesTagger.set_model_file(CodeTaggerFile)
    TitleTagger.set_model_file(TitleTaggerFile)
    taggedCodes = CodesTagger.tag_sents(tokenList)
    taggedTitles = TitleTagger.tag_sents(tokenList)
    return extract_entities(taggedCodes, taggedTitles)
def ExtractItemsFromJudgment(text):
    text = removeHTMLTags(text)
    tokenList = tokenizeTestData(text)
    CodesTagger = CRFTagger()
    titleTagger = CRFTagger()
    CodesTagger.set_model_file("models/CRF-Model-OnlyCodes")
    titleTagger.set_model_file("models/CRF-Model-OnlyTitles")
    taggedCodes = CodesTagger.tag_sents(tokenList)
    taggedTitles = titleTagger.tag_sents(tokenList)
    return extract_entities(taggedCodes, taggedTitles)
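Both variants above lean on project-specific helpers (removeHTMLTags, tokenizeTestData, extract_entities) that are defined elsewhere. A minimal invocation sketch, assuming those helpers and the two model files from the second variant exist; judgment_html is a hypothetical input string:

# Hypothetical call; judgment_html is raw HTML of a court judgment.
items = ExtractItemsFromJudgment(judgment_html,
                                 'models/CRF-Model-OnlyCodes',
                                 'models/CRF-Model-OnlyTitles')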
def tagpos(request):
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    tokenize = word_tokenize("Saya bekerja di Bandung")
    hasil = ct.tag_sents([tokenize])
    postag = nltk.pos_tag(tokenize)
    context = {
        'tokenize': tokenize,
        'postag': postag,
        'hasil': hasil,
    }
    template = loader.get_template('polls/tagged.html')
    # train_text = state_union.raw('2005-GWBush.txt')
    # sample_text = state_union.raw('2006-GWBush.txt')
    # custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    # tokenized = custom_sent_tokenizer.tokenize(sample_text)
    # tagged = []
    # for i in tokenized[:5]:
    #     words = nltk.word_tokenize(i)
    #     tagged.append(nltk.pos_tag(words))
    #
    # template = loader.get_template('polls/tagged.html')
    # context = {
    #     'tagged': tagged
    # }
    return HttpResponse(template.render(context, request))
def tagSentences(path, training_list=[], testing_list=[]):
    ct = CRFTagger()
    train_list = getTrainList(training_list)
    ct.train(train_list, 'model.crf.tagger')
    sentences = getSentences(path, testing_list)
    tagged_sentences = ct.tag_sents(sentences)
    return tagged_sentences
def main(self):
    # custom method
    file = open("forecast_corpus.txt", "r")
    call = file.read()
    corpus = call.split()
    file.close()
    verba = []
    # stopword removal (disabled)
    # sfactory = StopWordRemoverFactory()
    # stopwords = sfactory.create_stop_word_remover()
    # stop = stopwords.remove(call)
    # c = stop.split()
    # print("Reading corpus...")
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    hasil = ct.tag_sents([corpus])
    this = Verba_finder()
    # stop one token early so the x + 1 lookahead stays in range
    for x in range(len(hasil[0]) - 1):
        if (hasil[0][x][1] == 'VB' and this.afiks_check(hasil[0][x][0]) == 1
                and (hasil[0][x + 1][1] == 'NN' or hasil[0][x + 1][1] == 'JJ')):
            # print(hasil[0][x])
            verba.append(" " + hasil[0][x][0] + " ")
    return verba
def function_pos_tagging(new_stopwords_tweets):
    ct = CRFTagger()
    ct.set_model_file('data/all_indo_man_tag_corpus_model.crf.tagger')
    new_pos_tweets = []
    for n in range(len(new_stopwords_tweets)):
        pos_tweet_word = [new_stopwords_tweets[n][0]]
        pos_tweet_words = ct.tag_sents(pos_tweet_word)
        pos_tweet = [pos_tweet_words, new_stopwords_tweets[n][1]]
        new_pos_tweets.append(pos_tweet)
    new_features_tweets = []
    for n in range(len(new_pos_tweets)):
        pos_tweets_data = new_pos_tweets[n][0][0]
        features = []
        for tokenTag in pos_tweets_data:
            token, tag = tokenTag
            access = ['NN', 'JJ', 'RB', 'VBD']
            if tag in access:
                features.append(token)
        if features:
            features_tweets = [features, new_pos_tweets[n][1]]
            new_features_tweets.append(features_tweets)
    return new_features_tweets
class SlotTaggingModel(object):

    def __init__(self, **argparams):
        self.train_data = argparams['train_data']
        if self.train_data is not None:
            assert isinstance(self.train_data, DataSetCSVagentActPred)
        self.model_folder = argparams['model_folder']
        self.model_fname = '{}/slotTagging.model'.format(self.model_folder)

    def train(self, verbose=True):
        assert self.train_data is not None, 'train_data is required.'
        print('\ttraining ...')
        # transform data
        instance_list = self._transform_data(self.train_data)
        userUtterTag_train_fname = '{}/userUtterTag_train.txt'.format(self.model_folder)
        writeUtterTag(instance_list, userUtterTag_train_fname)
        print('\ttrain_data={}'.format(userUtterTag_train_fname))
        # train model
        self.model = CRFTagger(verbose=verbose)
        self.model.train(instance_list, self.model_fname)
        print('\tmodel_fname={}'.format(self.model_fname))
        print('\tsaving model ...')

    def _transform_data(self, data):
        '''convert textual utter and user tags into a list of lists
        that contain lists of (w, t) pairs
        '''
        userUtter_txt = data.userUtter_txt
        userTag_txt = data.userTag_txt
        instance_list = list()
        for words, tags in zip(userUtter_txt, userTag_txt):
            instance = [(word.strip(), tag.strip())
                        for word, tag in zip(words.decode('utf-8').strip().split(),
                                             tags.decode('utf-8').strip().split())]
            instance_list.append(instance)
        return instance_list

    def predict(self, test_data):
        '''return a list of lists, [[(w1, tag1), (w2, tag2), (w3, tag3)], [...], [...]]
        '''
        assert test_data is not None, 'test_data is required.'
        assert isinstance(test_data, DataSetCSVagentActPred)
        print('\tpredicting Slot Tags ...')
        # transform data
        instance_list = self._transform_data(test_data)
        userUtterTag_test_fname = '{}/userUtterTag_test.target'.format(self.model_folder)
        writeUtterTag(instance_list, userUtterTag_test_fname)
        print('\ttag_target={}'.format(userUtterTag_test_fname))
        instance_utter_list = getUtterList(instance_list)
        # testing
        results = self.model.tag_sents(instance_utter_list)
        self.result_fname = '{}/userUtterTag_test.pred'.format(self.model_folder)
        print('\ttag_pred={}'.format(self.result_fname))
        writeUtterTag(results, self.result_fname)
        precision, recall, fscore, accuracy_frame = eval_tagPredBaseline(
            instance_list, results, test_data.userTag2id, test_data.userTag_vocab_size)
        print('\tprecision={:.4f}, recall={:.4f}, fscore={:.4f}, accuracy_frame={:.4f}'.format(
            precision, recall, fscore, accuracy_frame))
        return results

    def load_model(self, verbose=True):
        print('\tloading model ...')
        self.model = CRFTagger(verbose=verbose)
        self.model.set_model_file(self.model_fname)
def crftagger(hasil_stem):
    result = []
    ct = CRFTagger()
    ct.set_model_file('D://dataset/all_indo_man_tag_corpus_model.crf.tagger')
    for i in hasil_stem:
        hasil = ct.tag_sents([i])
        for j in hasil:
            result.append(j)
    return result
def getData(filename):
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    result = []
    annotated = []
    with open(filename + '.csv', 'r') as f:
        reader = csv.reader(f)
        annotated = list(reader)
    sent = []
    sent_gold = []
    sent_oh = []
    curr_sent = ''
    for token in annotated:
        if curr_sent != str(token[1]) + ' ' + str(token[2]):
            hasil = ct.tag_sents([sent])
            mytuple = []
            for idx in range(len(sent)):
                try:
                    mytuple.append(hasil[0][idx] + (sent_gold[idx], sent_oh[idx]))
                except IndexError:
                    pass
            result.append(mytuple)
            sent = []
            sent_gold = []
            sent_oh = []
            curr_sent = str(token[1]) + ' ' + str(token[2])
        sent.append(token[4])
        sent_gold.append(token[5])
        sent_oh.append(token[6])
    # tag the final sentence left in the buffer
    hasil = ct.tag_sents([sent])
    mytuple = []
    for idx in range(len(sent)):
        try:
            mytuple.append(hasil[0][idx] + (sent_gold[idx], sent_oh[idx]))
        except IndexError:  # narrowed from a bare except to the error actually expected
            pass
    result.append(mytuple)
    result = result[1:]
    print('Total sentence: ' + str(len(result)))
    random.shuffle(result)
    return result
def main(no_stopwords, use_manual_train_set):
    print "MAINTAIN COMMON WORDS: " + str(not no_stopwords)
    print "USING HAND LABELED TRAIN DATA: " + str(use_manual_train_set)
    full_set = get_domain_set(no_stopwords)
    if not no_stopwords:
        full_set.extend(get_other_set())
    train_set, test_set_auto = divide_sets(full_set, 0.75)
    set_manual = get_manual_set(no_stopwords)
    train_set_manual = []
    test_set_manual = []
    if use_manual_train_set:
        train_set_manual, test_set_manual = divide_sets(set_manual, 0.28)
        train_set.extend(train_set_manual)
    else:
        test_set_manual = set_manual
    tagger = CRFTagger(feature_func=feature_extraction)
    try:
        tagger.train(train_set, 'laptop.crf.tagger')
    except ValueError:
        fi = open('DEBUG', 'w')
        for li in DEBUG:
            fi.write(str(li.encode('utf-8')) + '\n')
        fi.close()
    print "AUTOMATIC LABELED TEST"
    tagged_sents_auto = tagger.tag_sents(map_test_set(test_set_auto, word=True))
    predicted_auto = create_vector_of_predicted_labels(tagged_sents_auto)
    golden_auto = create_vector_of_predicted_labels(test_set_auto)
    print calculate_micro_accuracy(predicted_auto, golden_auto, no_stopwords)
    print "MANUAL LABELED TEST"
    tagged_sents_manual = tagger.tag_sents(map_test_set(test_set_manual, word=True))
    predicted_manual = create_vector_of_predicted_labels(tagged_sents_manual)
    golden_manual = create_vector_of_predicted_labels(test_set_manual)
    print calculate_micro_accuracy(predicted_manual, golden_manual, no_stopwords)
    print ""
def Postagging(data):
    postagOnly = []
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    postaggedData = ct.tag_sents(data)
    for i in range(len(postaggedData)):
        for j in range(len(postaggedData[i])):
            postagOnly.append(postaggedData[i][j][1])
    return postagOnly
def getPosTag():
    global perLabel, jobLabel, subLabel, orgLabel, geoLabel
    raw_sent = sentInput.get()
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    tokens = nltk.tokenize.word_tokenize(raw_sent)
    postagged = ct.tag_sents([tokens])
    data = []
    for token in postagged[0]:
        data.append(token + ('O', ))
    tagger_ner = pycrfsuite.Tagger()
    tagger_ner.open('model_ner.crfsuite')
    ner = tagger_ner.tag(sent2features(data, False))
    for i in range(len(ner)):
        data[i] = data[i][0:2] + (ner[i], )
    tagger_oh = pycrfsuite.Tagger()
    tagger_oh.open('model_oh.crfsuite')
    oh = tagger_oh.tag(sent2features(data, True))
    for i in range(len(oh)):
        data[i] += (oh[i], )
    per = []
    job = []
    sub = []
    org = []
    geo = []
    for token in data:
        if token[3] == '1':
            label = token[2][-3:]
            if label == 'PER':
                per.append(token[0])
            elif label == 'ORG':
                org.append(token[0])
            elif label == 'SUB':
                sub.append(token[0])
            elif label == 'JOB':
                job.append(token[0])
            elif label == 'GEO':
                geo.append(token[0])
    perLabel.config(text='PER: ' + ' '.join(per))
    jobLabel.config(text='JOB: ' + ' '.join(job))
    subLabel.config(text='SUB: ' + ' '.join(sub))
    orgLabel.config(text='ORG: ' + ' '.join(org))
    geoLabel.config(text='GEO: ' + ' '.join(geo))
def pos_tagger(data, attr="paragraphs"):
    flatten = lambda l: [item for sublist in l for item in sublist]
    ct = CRFTagger()
    ct.set_model_file('dataset/all_indo_man_tag_corpus_model.crf.tagger')
    for category in data:
        category['word_tag_{}'.format(attr)] = []
        for paragraph in category[attr]:
            list_tag_kalimat = []
            for kalimat in paragraph:
                tag_kalimat = ct.tag_sents([kalimat])
                tag_kalimat = flatten(tag_kalimat)
                list_tag_kalimat.append(tag_kalimat)
            category['word_tag_{}'.format(attr)].append(list_tag_kalimat)
    return data
def oninfolist():
    """TO DO: compare the format of the example data found online against
    all the lists built here, and check that they match the expected form."""
    # SEE: http://www.nltk.org/_modules/nltk/tag/crf.html
    infolist = pickle.load(open('sentencelist.pickle', 'rb'))
    limit = round(len(infolist) * 0.4)
    train_data = infolist[0:limit]
    # print("train_data = ", train_data[0:10])
    ct = CRFTagger()
    # train on the prepared split; the tagger needs a model before it can tag
    ct.train(train_data, 'model.crf.tagger')
    realsentences = []
    realsentence = ""
    # the disabled block below built realsentences from the held-out part of infolist
    # for sentence in infolist[limit:]:
    #     for (word, nertag) in sentence:
    #         realsentence = realsentence + " " + word
    #     realsentences.append(realsentence)
    #     realsentence = ""
    # pickle.dump(realsentences, open("realsentences.pickle", "wb"))
    # print("pickle file created")
    realsentences = pickle.load(open("realsentences.pickle", "rb"))
    print("REALSENTENCES:", realsentences[0:10])
    splitsentences = []  # e.g. [['dog', 'is', 'good'], ['cat', 'eat', 'meat']]
    for r in realsentences:
        splitsentences.append(r.split())
    # realsentences already holds only the held-out sentences, so no second slice is needed
    ct.tag_sents(splitsentences)
    gold_sentences = infolist[limit:]
    print("GOLD SENTENCES:", infolist[10:20])
    print(ct.evaluate(gold_sentences))
def pos_tagger(text):
    # input: text (string)
    # instantiate the tagger
    ct = CRFTagger()
    # load the Indonesian POS-tagging model
    ct.set_model_file('model_postagging_crf.tagger')
    # cleaning
    text = re.sub(r'\.?\,?\(?\)?\"?', '', text)
    text = re.sub(r"\n", " ", text)
    text = text.split(" ")
    # run the POS tagging
    tagged_text = ct.tag_sents([text])
    # output: the POS-tagged text
    return tagged_text
def chunking(sents, chunked_file):
    '''Chunking
    param sents: a list of token lists, e.g. [['dog', 'is', 'dog'], ['dog', 'good']]
    '''
    os.chdir('/home/zqr/code/chunk2vec/')
    start_time = time.time()
    # PoS tagging
    print '\n-->Start PoS'
    # print '->Training PoS Tagger'
    # ct = CRFTagger()
    # ct.train(chunk_traindata(pos_trainfile), 'model.crf.tagger')
    # print '->Done'
    # pos_testdata_gold = chunk_traindata(pos_testfile)  # pos corpus
    print '->Load CRF Tagger model'
    ct = CRFTagger()
    # this model holds the PoS tags learned from the chunking task
    ct.set_model_file('model.crf.tagger')
    print '->Posing'
    tagged_sents = ct.tag_sents(sents)
    # print 'PoS acc.:', ct.evaluate(pos_testdata_gold)
    # write the PoS-tagged sentences to a file
    print '->Write posed file'
    pos_data(tagged_sents, 'tmp_for_chunking')
    end_time = time.time()
    print '-->Done, Time:', end_time - start_time, 's'
    # to save time, temporarily use the test corpus
    # pos_data(pos_testdata_gold, chunk_inputfile)
    start_time = time.time()
    # chunking depends on a system-wide YamCha install; training data is the CoNLL corpus
    print '\n-->Start Chunking'
    os.system('yamcha-config --libexecdir')
    # os.chdir('/home/zqr/code/sent2vec/')
    os.system('cp /home/zqr/local/libexec/yamcha/Makefile .')
    # train the chunking model
    # os.system('make CORPUS=' + pos_trainfile + ' MODEL=chunk_model train')
    os.system('yamcha -m chunk_model.model < tmp_for_chunking > ' + chunked_file)
    print '-->Done, Time:', time.time() - start_time, 's'
def main():
    import pickle
    from nltk.tag import CRFTagger

    infolist = pickle.load(open('infolist.pickle', 'rb'))
    ct = CRFTagger()
    split = round(0.9 * len(infolist))
    train_data = [[(x, z) for [x, y, z] in infolist[:split]]]
    ct.train(train_data, 'model.crf.tagger')
    ners = ct.tag_sents([[x for [x, y, z] in infolist[split:]]])
    print(ners)
    gold_sentences = [[(x, z) for [x, y, z] in infolist[split:]]]
    print(ct.evaluate(gold_sentences))
def load(training, testing):
    ct = CRFTagger()
    # split the training into sentences
    t = "\n".join(training)
    sents = t.split("###/###")
    # split the sentences into tokens
    train = []
    for sent in sents:
        if sent:
            new = []
            words = sent.split("\n")
            for word in words:
                if word:
                    # split the tokens into word and tag
                    new.append(tuple(word.split("/")))
            train.append(new)
    # remove any blank sentences that have been added
    for t in train:
        if not t:
            train.remove(t)
    ct.train(train, 'model.crf.tagger')
    # test on the testing data
    s = "\n".join(testing)
    s_sents = s.split("###/###")
    test = []
    sent_tags = []
    for t in s_sents:
        if t:
            new = []
            right_tags = []
            words = t.split("\n")
            for word in words:
                if word:
                    # split the tokens into just words
                    new.append(word.split("/")[0])
                    # save the tags in a list to be used later
                    right_tags.append(word.split("/")[1])
            sent_tags.append(right_tags)
            test.append(new)
    tags = ct.tag_sents(test)
    return tags, sent_tags
def main():
    # start timer
    for item in ["UD_Ukrainian", "Brown"]:
        print("in process " + item)
        # open Brown training data
        infile = open(DATA_PATH + item + "_tagged_train.txt", "r", encoding="utf-8")
        brown_train = infile.readlines()
        infile.close()
        # split words and tags, and add start and stop symbols (question 1)
        brown_words, brown_tags = split_wordtags(brown_train)
        # calculate tag trigram probabilities (question 2)
        q_values = calc_trigrams(brown_tags)
        # question 2 output
        q2_output(q_values, OUTPUT_PATH + item + '_B2.txt')
        # calculate list of words with count > 5 (question 3)
        known_words = calc_known(brown_words)
        # get a version of brown_words with rare words replaced with '_RARE_' (question 3)
        brown_words_rare = replace_rare(brown_words, known_words)
        # question 3 output
        q3_output(brown_words_rare, OUTPUT_PATH + item + "_B3.txt")
        # calculate emission probabilities (question 4)
        e_values, taglist = calc_emission(brown_words_rare, brown_tags)
        # question 4 output
        q4_output(e_values, OUTPUT_PATH + item + "_B4.txt")
        # delete unnecessary data
        del brown_train
        del brown_words_rare
        # open Brown development data (question 5)
        infile = open(DATA_PATH + item + "_test.txt", "r")
        brown_dev = infile.readlines()
        infile.close()
        # format Brown development data here
        brown_dev_words = []
        for sentence in brown_dev:
            brown_dev_words.append(sentence.split(" ")[:-1])
        # do viterbi on brown_dev_words (question 5)
        viterbi_tagged = viterbi(brown_dev_words, taglist, known_words, q_values, e_values)
        # question 5 output
        q5_output(viterbi_tagged, OUTPUT_PATH + item + "_B5.txt")
        # # do nltk tagging here
        # nltk_tagged = nltk_tagger(brown_words, brown_tags, brown_dev_words)
        # # question 6 output
        # q6_output(nltk_tagged, OUTPUT_PATH + item + "_B6.txt")

    for item in ["Brown", "UD_Ukrainian"]:
        print("in crf process " + item)
        # open Brown training data; binary mode cannot take an encoding argument,
        # and the bytes are decoded explicitly below
        infile = open(DATA_PATH + item + "_tagged_train.txt", "rb")
        brown_train = infile.readlines()
        infile.close()
        brown_words, brown_tags = split_wordtags(brown_train)
        train_words_tags = []
        ct = CRFTagger()
        for i in range(len(brown_words)):
            tmp = []
            for j in range(len(brown_words[i])):
                tmp.append((brown_words[i][j].decode('utf-8'),
                            brown_tags[i][j].decode('utf-8')))
            train_words_tags.append(tmp)
        ct.train(train_words_tags, u'model.crf.tagger')
        # open Brown development data (question 5)
        infile = open(DATA_PATH + item + "_test.txt", "r")
        brown_dev = infile.readlines()
        infile.close()
        # format Brown development data here
        tests_words = []
        for sentence in brown_dev:
            tests_words.append([i for i in sentence.split(" ")[:-1]])
        result_cfg = ct.tag_sents(tests_words)
        with open(OUTPUT_PATH + item + "_CFG.txt", "w") as file:
            for line in result_cfg:
                for word in line:
                    file.write(word[0] + "/" + word[1] + " ")
                file.write("\n")
    # print total time to run Part B
    print("Part B time: ", str(time.clock()), ' sec')
with open('inputText.txt', 'r') as myfile:
    data = myfile.read().replace('\n', '')

import nltk
tokens = nltk.word_tokenize(data)

from nltk.tag import CRFTagger
ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
hasil = ct.tag_sents([tokens])

tagging = ""
for tokenTag in hasil[0]:
    token, tag = tokenTag
    tagging += token + "\t" + tag + "\n"

with open("outputText.txt", "w") as text_file:
    text_file.write(tagging)
class Chunker:
    UNIQ = '_UNIQUE_STRING_'
    CHUNK_PARSER = None

    def __init__(self):
        # load the pre-trained POS-tagger data
        uni, bi, tri, word = self.load_obj("tagger")
        self.TAGGER1 = Tagger(uni, bi, tri, word)
        # load a second pre-trained POS-tagger
        uni2, bi2, tri2, word2 = self.load_obj("tagger2")
        self.TAGGER2 = Tagger(uni2, bi2, tri2, word2)
        self.TAGGER3 = CRFTagger()
        self.TAGGER3.set_model_file(
            'postagg/dataset/all_indo_man_tag_corpus_model.crf.tagger')
        # load the chunker grammar
        self.load_chunker()

    def load_obj(self, name):
        with open('postagg/' + 'obj/' + name + '.pkl', 'rb') as f:
            return pickle.load(f)

    def format_to_re(self, format):
        """Format a string into a regular expression."""
        parts = (format % MarkPlaceholders()).split(self.UNIQ)
        for i in range(0, len(parts), 2):
            parts[i] = re.escape(parts[i])
        return ' '.join(parts).replace('\\', '')

    def tree_to_str(self, tree_data):
        """Convert a POS-tag tree into a list of chunk tuples."""
        ne_in_sent = []
        for subtree in tree_data:
            if type(subtree) == Tree:  # subtree is a noun chunk, i.e. NE != "O"
                ne_label = subtree.label()
                ne_string = " ".join([token for token, pos in subtree.leaves()])
                ne_in_sent.append((ne_string, ne_label))
            else:
                ne_in_sent.append((subtree[0], subtree[1]))
        return ne_in_sent

    def load_chunker(self):
        """Load the chunk grammar rules."""
        try:
            f = open('postagg/dataset/phrase_chunker_grammar_id.txt')
            grammars = self.format_to_re(f.read())
            f.close()
            self.CHUNK_PARSER = nltk.RegexpParser(grammars)
        except Exception as e:
            print(str(e))

    def get_only_str(self, tree_chunk):
        """Convert a chunk tree into a list of chunk strings."""
        output = []
        for chunk, tag in tree_chunk:
            output.append(chunk)
        return output

    def beautify(self, chunks):
        """Render a list of chunk strings as: [chunk1] [chunk2] ... [chunkN]"""
        strout = ""
        for s in chunks:
            strout += "[" + s + "] "
        return strout

    def chunk_me1(self, _str):
        """POS-tag each word with TAGGER1, chunk the sentence, return the chunk tree."""
        return self.CHUNK_PARSER.parse(self.TAGGER1.tagSentence(_str.split(" ")))

    def chunk_me2(self, _str):
        """POS-tag each word with TAGGER2, chunk the sentence, return the chunk tree."""
        return self.CHUNK_PARSER.parse(self.TAGGER2.tagSentence(_str.split(" ")))

    def chunk_me3(self, _str):
        """POS-tag each word with the CRF tagger, chunk the sentence, return the chunk tree."""
        strs = _str.split(" ")
        return self.CHUNK_PARSER.parse(self.TAGGER3.tag_sents([strs])[0])
while line:
    words = line.replace("\r", "").replace("\n", "").split("\t")
    # print(words)
    if len(words) < 2:
        test_actual.append(test)
        test_sentences.append(sentence)
        test = []
        sentence = []
    else:
        tup1 = (words[0], words[1])
        sentence.append(words[0])
        test.append(tup1)
    line = f.readline()
f.close()

res = ct.tag_sents(test_sentences)
tagged_result = []
tagged_actual = []
for i in range(len(res)):
    for j in range(len(res[i])):
        tagged_result.append(res[i][j][1])
        tagged_actual.append(test_actual[i][j][1])

print res[0]
print test_actual[0]
# print tagged_result[0]
# print tagged_actual[0]
gold_sentences = test_actual
accuracy = ct.evaluate(gold_sentences)
print "accuracy:" + str(accuracy)
class NERFeatureExtractor:

    def read_label_file(self, filename):
        return open(filename).read().split('\n')

    def __init__(self, iob_predictor):
        self.iob_predictor = iob_predictor
        self.stemmer = StemmerFactory().create_stemmer()
        self.TAGGER3 = CRFTagger()
        self.TAGGER3.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
        self.label_words = self.read_label_file('label-words.txt')
        self.label_posses = self.read_label_file('label-posses.txt')
        self.label_lemmas = self.read_label_file('label-lemmas.txt')
        self.label_iob_feature = self.read_label_file('label-iob_feature.txt')
        self.label_iob_classes = self.read_label_file('label-iob_classes.txt')

    def getPOSTag(self, _temporary_tokens):
        strin = []
        for token_tag in _temporary_tokens:
            strin.append(unicode(token_tag.decode('utf-8')))
        return [(token.encode('ascii', 'ignore'), tag.encode('ascii', 'ignore'))
                for (token, tag) in self.TAGGER3.tag_sents([strin])[0]]

    def features(self, tokens, index, history):
        """
        `tokens` = a POS-tagged sentence [(w1, t1), ...]
        `index` = the index of the token we want to extract features for
        `history` = the previous predicted IOB tags
        """
        # pad the sequence with placeholders
        tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] \
            + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
        history = ['[START2]', '[START1]'] + list(history)
        # shift the index by 2 to accommodate the padding
        index += 2

        word, pos = tokens[index]
        prevword, prevpos = tokens[index - 1]
        prevprevword, prevprevpos = tokens[index - 2]
        nextword, nextpos = tokens[index + 1]
        nextnextword, nextnextpos = tokens[index + 2]
        previob = history[index - 1]
        contains_dash = '-' in word
        contains_dot = '.' in word
        allascii = all([True for c in word if c in string.ascii_lowercase])
        allcaps = word == word.capitalize()
        capitalized = word[0] in string.ascii_uppercase
        prevallcaps = prevword == prevword.capitalize()
        prevcapitalized = prevword[0] in string.ascii_uppercase
        # these two are derived from nextword, not prevword
        nextallcaps = nextword == nextword.capitalize()
        nextcapitalized = nextword[0] in string.ascii_uppercase

        return [word, str(self.stemmer.stem(word)), str(pos), str(allascii),
                str(nextword), str(self.stemmer.stem(nextword)), str(nextpos),
                str(nextnextword), str(nextnextpos),
                str(prevword), str(self.stemmer.stem(prevword)), str(prevpos),
                str(prevprevword), str(prevprevpos), str(previob),
                str(contains_dash), str(contains_dot), str(allcaps),
                str(capitalized), str(prevallcaps), str(prevcapitalized),
                str(nextallcaps), str(nextcapitalized)]

    def normalizeFeature(self, featx):
        def index_of(label_list, value):
            # -1 marks an out-of-vocabulary value
            return label_list.index(value) if value in label_list else -1

        out = [
            index_of(self.label_words, featx[0]),
            index_of(self.label_lemmas, featx[1]),
            index_of(self.label_posses, featx[2]),
            1 if featx[3] == 'True' else 0,
            index_of(self.label_words, featx[4]),
            index_of(self.label_lemmas, featx[5]),
            index_of(self.label_posses, featx[6]),
            index_of(self.label_words, featx[7]),
            index_of(self.label_posses, featx[8]),
            index_of(self.label_words, featx[9]),
            index_of(self.label_lemmas, featx[10]),
            index_of(self.label_posses, featx[11]),
            index_of(self.label_words, featx[12]),
            index_of(self.label_posses, featx[13]),
            index_of(self.label_iob_feature, featx[14]),
        ]
        # boolean features arrive as the strings 'True'/'False', so compare
        # against 'True' rather than testing truthiness of a non-empty string
        for i in range(15, 23):
            out.append(1 if featx[i] == 'True' else 0)
        return out

    def parseEntityName(self, _sent=""):
        tokens = self.getPOSTag(_sent.split())
        history = []
        self.res_all = []
        last_feature = []
        for i in range(len(tokens)):
            last_feature = self.features(tokens, i, history)
            iob_res = self.iob_predictor([self.normalizeFeature(last_feature)])[0]
            history.append(iob_res)
            self.res_all.append((tokens[i], self.label_iob_classes[iob_res]))
def tag_strings(path_to_model, tokenized_string):
    ct = CRFTagger()
    ct.set_model_file(path_to_model)
    tagged_strings = ct.tag_sents([tokenized_string])
    # print("Tagged Strings:", tagged_strings)
    return tagged_strings
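A usage sketch for tag_strings, assuming a trained Indonesian model file as used elsewhere on this page; the tokens are an arbitrary example:

tagged = tag_strings('all_indo_man_tag_corpus_model.crf.tagger',
                     ['Saya', 'bekerja', 'di', 'Bandung'])
print(tagged)  # one tagged sentence: [[('Saya', ...), ('bekerja', ...), ...]]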
sentences = ""
for line in lines:
    arr = re.findall(r"[a-zA-Z]+", line)
    sentences = sentences + " " + " ".join([w for w in arr])

paragraph_nouns = []
if sentences.strip():
    for s in sentences.split("."):
        try:
            # s = remove_numbers(s)
            # s = remove_punctuation(s)
            # s = remove_stopwords(s)
            # s = remove_english_stopwords(s)
            s = remove_single_char(s)
            # s = stem_text(s)
            # s = stem_english_text(s)
            hasil = ct.tag_sents([s.split()])
            temp_noun = ""
            sentence_nouns = []
            prev_pos = ""
            for text, pos in hasil[0]:
                # print("{}:{}".format(text, pos))
                if (pos == "NN" or pos == "NNP") and (prev_pos == "NN" or prev_pos == "NNP"):
                    if len(temp_noun.split()) < 2:
                        temp_noun = temp_noun + " " + text
                        temp_noun = str(temp_noun).lower()
                # a non-noun token ends a noun phrase; the original condition
                # used `or` here, which is always true
                elif (pos != "NN" and pos != "NNP") and (prev_pos == "NN" or prev_pos == "NNP"):
                    if temp_noun:
                        temp_noun = remove_punctuation(temp_noun)
                        total_nouns.append(temp_noun)
        except Exception:  # handler assumed; the original fragment is truncated here
            pass
from nltk.tag import CRFTagger

ct = CRFTagger()
ct.set_model_file('model/all_indo_man_tag_corpus_model.crf.tagger')
hasil = ct.tag_sents([['Saya', 'bekerja', 'di', 'Bandung']])
# In[26]:

TAGGER_PATH = "crfpostagger"  # pre-trained POS-tagger

# In[27]:

tagger = CRFTagger()  # initialize tagger
tagger.set_model_file(TAGGER_PATH)

# In[30]:

# try some sentences out - must all be unicode strings - trained on lower case
print(tagger.tag([u"i", u"like", u"revision"]))
print(tagger.tag([u"i", u"like", u"natural", u"language", u"processing"]))

# In[31]:

# scaling up as you might get them in text - make sure unicode and lower case
# (unicode() is Python 2 only; on Python 3 the str type is already unicode)
sentences = ["I like revision", "I like Natural Language Processing"]
print(tagger.tag_sents([unicode(word.lower()) for word in s.split()]
                       for s in sentences))

# In[ ]:
from nltk.tag import CRFTagger

jumSample = 500000
namaFile = "Indonesian_Manually_Tagged_Corpus.tsv"

with open(namaFile, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

pasangan = []
allPasangan = []
for line in lines[:min(jumSample, len(lines))]:
    if line == '':
        # a blank line marks a sentence boundary
        allPasangan.append(pasangan)
        pasangan = []
    else:
        kata, tag = line.split('\t')
        pasangan.append((kata, tag))

ct = CRFTagger()
ct.train(allPasangan, 'all_indo_man_tag_corpus_model.crf.tagger')

# test
hasil = ct.tag_sents([['Saya', 'bekerja', 'di', 'Bandung'],
                      ['Nama', 'saya', 'Yudi']])
print(hasil)
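The snippet above trains on the full corpus and only spot-checks two sentences. A minimal sketch of a held-out evaluation under the same data layout (allPasangan as built above; the model filename is hypothetical). Note that TaggerI.evaluate, used throughout this page, was renamed accuracy in newer NLTK releases:

import random

random.shuffle(allPasangan)
split = int(0.9 * len(allPasangan))
train_set, test_set = allPasangan[:split], allPasangan[split:]

ct = CRFTagger()
ct.train(train_set, 'indo_crf_heldout.tagger')  # hypothetical model filename
print(ct.evaluate(test_set))  # ct.accuracy(test_set) on newer NLTK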
# CRF Training
ct = CRFTagger()
print "CRF Training starts..."
ct.train(ListOfSentences_Training, 'model.crf.tagger')
print "CRF Training is done."
print "Testing starts"
print "Accuracy of CRF is = ", ct.evaluate(ListOfSentences_Test) * 100

# Tagging by CRF Tagger
ch = 'y'
while ch != 'n':
    text = raw_input("Enter the text to be tagged : \n")
    text = converter(text)
    print ct.tag_sents(text)
    print "\nDo you want to continue ?"
    ch = raw_input()

# HMM Training
print "HMM Training using HiddenMarkovModelTrainer() starts.."
hmmTrain1 = HiddenMarkovModelTrainer().train_supervised(ListOfSentences_Training)
print "Training is completed.\n"
print "Testing starts now.."
hmmTrain1.test(ListOfSentences_Test)
print "Testing is completed.."
crf_tagger = CRFTagger()
tnt_tagger.train(training_data[ki])
hmm_tagger = nltk.HiddenMarkovModelTagger.train(training_data[ki])
perc_tagger.train(training_data[ki])
crf_tagger.train(training_data[ki], 'model.crf.tagger')
# t.tagdata(test_data[800:])

perc_pred = []
hmm_pred = []
for i in testing_data[ki]:
    perc_pred.append(perc_tagger.tag(i))
    hmm_pred.append(hmm_tagger.tag(i))
crf_pred = crf_tagger.tag_sents(testing_data[ki])
tnt_pred = tnt_tagger.tagdata(testing_data[ki])
pred = {'p': perc_pred, 'h': hmm_pred, 'c': crf_pred, 't': tnt_pred}


def most_frequent(List):
    # majority vote: the tag with the highest count
    return max(set(List), key=List.count)


import itertools


def picker(tag_seq, i, j):
    # collect the tag each tagger in tag_seq predicts for token j of sentence i
    tags = []
    for k in tag_seq:
        tags.append(pred[k][i][j][1])
    return tags, most_frequent(tags)


s = 'phct'
def onsentencelist():
    ct = CRFTagger()
    # sentencelist contains NER-tagged sentences
    sentencelist = pickle.load(open('sentencelist.pickle', 'rb'))
    # training size as a fraction
    trainingsize = 0.9
    # calculate where to split the data
    limit = round(trainingsize * len(sentencelist))
    # wordsentencelist contains the same sentences, not NER-tagged
    wordsentencelist = pickle.load(open("wordsentencelist.pickle", "rb"))

    # train the data / choose one of the two blocks
    # train_data = sentencelist[:limit]
    # ct.train(train_data, 'model.crf.tagger')
    ct.set_model_file('tweetmodel.crf.tagger')

    # test data and evaluate
    test_data = wordsentencelist[limit:]
    ct.tag_sents(test_data)  # tagging sentences
    gold_sentences = sentencelist[limit:]
    print("\nAccuracy:", ct.evaluate(gold_sentences))

    # turn the tagged list and test list into flat lists of predicted and true tags
    pred_nerlist = []
    for sentence in wordsentencelist[:limit]:
        for (word, nertag) in ct.tag(sentence):
            pred_nerlist.append(nertag.lower())
    true_nerlist = []
    for sentence in sentencelist[:limit]:
        for (word, nertag) in sentence:
            true_nerlist.append(nertag.lower())

    # baseline = 0.9048987094135446 (everything tagged O)

    # print F-scores and the confusion matrix
    print("\nF-score (micro):", f1_score(true_nerlist, pred_nerlist, average='micro'))
    print("\nF-score (macro):", f1_score(true_nerlist, pred_nerlist, average='macro'))
    print("\nF-score (weighted):", f1_score(true_nerlist, pred_nerlist, average='weighted'))
    print("\nF-score (None):", f1_score(
        true_nerlist, pred_nerlist, average=None,
        labels=["o", "b-per", "i-per", "b-loc", "i-loc", "b-org", "i-org", "b-misc", "i-misc"]))
    print("\nConfusion matrix:\n")
    for item in ["O", "B-per", "I-per", "B-loc", "I-loc", "B-org", "I-org", "B-misc", "I-misc"]:
        print(" ", item, end="")
    print("\n", confusion_matrix(
        true_nerlist, pred_nerlist,
        labels=["o", "b-per", "i-per", "b-loc", "i-loc", "b-org", "i-org", "b-misc", "i-misc"]))
from nltk.tag import CRFTagger

ct = CRFTagger()
train_data = [[('Universiteit', 'Noun'), ('is', 'Verb'), ('een', 'Det'),
               ('goed', 'Adj'), ('goede', 'Adj'), ('plek', 'Noun'),
               ('hond', 'Noun'), ('eet', 'Verb'), ('vlees', 'Noun')]]
ct.train(train_data, 'model.crf.tagger')
ct.tag_sents([['hond', 'is', 'goed'], ['kat', 'eet', 'vlees']])

gold_sentences = [[('hond', 'Noun'), ('is', 'Verb'), ('goed', 'Adj')],
                  [('kat', 'Noun'), ('eet', 'Verb'), ('vlees', 'Noun')]]
ct.evaluate(gold_sentences)

# reload the trained model from disk and evaluate again
ct = CRFTagger()
ct.set_model_file('model.crf.tagger')
print(ct.evaluate(gold_sentences))
class DataAdapter(object):

    def __init__(self, data=[]):
        self.tagger = CRFTagger()
        self.tagger.set_model_file('model.crf.tagger')
        # a plain non-empty check; data.count(True) is always 0 for a list of dicts
        if len(data) > 0:
            self.data_tagging, self.data_testing = self.for_tagging_testing(data)
            # print('TAGGING', self.data_tagging)
            # print('TESTING', self.data_testing)

    def tokenize_tag(self, text):
        text = text.replace('\r', ' | ').replace('\n', ' | ')
        tokens = word_tokenize(text, preserve_line=True)
        labels = []
        for label in self.tag(tokens):
            labels.append(label[1])
        return tokens, labels

    def for_tagging_testing(self, data):
        array_tagging = []
        array_testing = []
        for d in data:
            all_tags = []
            all_test = []
            for index, t in enumerate(d['text']):
                one_tag = [t, d['label'][index]]
                all_test.append(one_tag)
                all_tags.append(t)
            array_tagging.append(all_tags)
            array_testing.append(all_test)
        return array_tagging, array_testing

    def for_testing(self, data):
        array = []
        for d in data:
            all_tags = []
            for index, t in enumerate(d['text']):
                # one_tag = [t, (d['label'][index] if is_ascii(d['label'][index]) else 'O')]
                one_tag = [t, d['label'][index]]
                all_tags.append(one_tag)
            array.append(all_tags)
        return array

    def for_tagging(self, data):
        array = []
        for d in data:
            all_tags = []
            for t in d['text']:
                all_tags.append(t)
            array.append(all_tags)
        return array

    def tag_sents(self):
        if self.data_tagging is not None:
            return self.tagger.tag_sents(self.data_tagging)
        return 'NoData'

    def tag(self, data):
        return self.tagger.tag(data)

    def evaluate(self):
        if self.data_testing is not None:
            return self.tagger.evaluate(self.data_testing)
        return 'NoData'

    def train(self, data):
        data = self.for_testing(data)
        self.tagger.train(data, 'model.crf.tagger')
        print('ACCURACY:', self.tagger.evaluate(data))
def tagger(msg):
    ct = CRFTagger()
    ct.set_model_file('model/all_indo_man_tag_corpus_model.crf.tagger')
    hasil = ct.tag_sents([split(msg)])
    return hasil[0]
# Stop Words
StopWordFactory = StopWordRemoverFactory()
StopWord = StopWordFactory.create_stop_word_remover()

# Stemming
StemFactory = StemmerFactory()
Stemmer = StemFactory.create_stemmer()

# POS tagging
ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

# # stop word removal
# stop = StopWord.remove(kalimat)

# tokenize
tokenize = nltk.tokenize.word_tokenize(response)

# POS tagging
tag = ct.tag_sents([tokenize])
print(tag)
# print(direct)

# nltk; j was undefined in the original fragment, so the token index is used
for j, i in enumerate(tag[0]):
    # http request
    for DeviceData in DeviceDirect['data']:
        if i[1] == 'NN':  # look for NN tokens to identify the device
            # match the NN token against devices in the database
            if tokenize[j] == DeviceData['device_category']:
                id = DeviceData['id']
                Device = DeviceData['device_category']
                # print(