def demo(self, test_sents):
    """Tag the given sentences with the stored CRF model and print the results.

    Every sentence in ``test_sents`` is stripped of its tags with ``untag``,
    re-tagged by a ``CRFTagger`` loaded from ``self.modelpath``, and converted
    with ``self._to_sentence``; each item that conversion yields is printed.
    Finally the value of ``tagger.evaluate(test_sents)`` is printed.

    Args:
        test_sents: Iterable of tagged sentences accepted by ``untag`` and
            ``CRFTagger.evaluate``.
    """
    crf = CRFTagger(feature_func=self.feature_detector)
    crf.set_model_file(self.modelpath)

    for gold_sentence in test_sents:
        plain_tokens = untag(gold_sentence)
        predicted = crf.tag(plain_tokens)
        rebuilt = self._to_sentence(predicted)
        for sentence in rebuilt:
            print(sentence)

    score = crf.evaluate(test_sents)
    print(score)
def pyt_sent_tokenizer(self, paragraph):
    """Split a single paragraph into sentences (variant intended for pytest).

    Args:
        paragraph (str): The paragraph text. It is stripped and then split
            on runs of whitespace to obtain the tokens fed to the tagger.

    Returns:
        The value produced by ``self._to_sentence`` for the tagged tokens,
        i.e. the paragraph cut into sentence units.
    """
    tagger = CRFTagger(feature_func=self.feature_detector)
    tagger.set_model_file(self.modelpath)
    # Raw string: '\s' inside a regular literal is an invalid escape sequence
    # (a warning today, scheduled to become an error).
    words = re.split(r'\s+', paragraph.strip())
    tagged = tagger.tag(words)
    return self._to_sentence(tagged)
def batch_sent_tokenizer(self, paragraphs):
    """Split several paragraphs into sentences with one shared tagger.

    Args:
        paragraphs (list(str)): The paragraph texts. Each one is stripped
            and split on runs of whitespace before being tagged.

    Returns:
        list: One entry per paragraph, in input order; each entry is the
        value ``self._to_sentence`` returns for that paragraph's tagged
        tokens.
    """
    tagger = CRFTagger(feature_func=self.feature_detector)
    tagger.set_model_file(self.modelpath)
    sentences = []
    for paragraph in paragraphs:
        # Raw string avoids the invalid-escape warning, and '\s+' (rather
        # than a single '\s') keeps consecutive whitespace from producing
        # empty-string tokens, matching pyt_sent_tokenizer.
        words = re.split(r'\s+', paragraph.strip())
        tagged = tagger.tag(words)
        sentences.append(self._to_sentence(tagged))
    return sentences
# Accuracy of the HMM predictions: fraction of positions where the predicted
# tag equals the reference tag.
y = np.array(y)
y_hat = np.array(y_hat)
print("hmm acc : ", (y == y_hat).mean())

# Named-entity recognition
import pickle

# NOTE(security): unpickling executes code embedded in the file; only load
# pickles that come from a trusted source.
# The context manager closes the file handle, which the previous
# `pickle.load(open(...))` form left open.
with open(
        "/users/Etu0/3770640/M1/Sem2/TAL/TME1/maxent_ne_chunker/PY3/english_ace_multiclass.pickle",
        "rb") as chunker_file:
    a = pickle.load(chunker_file)

from nltk.tag.crf import CRFTagger

tagger = CRFTagger()
# The second argument is the file in which the result of training is stored.
tagger.train(alldocs, u'crf.model')
# NOTE(review): this passes the whole sentence as a single token and discards
# the result — presumably meant as a quick smoke test; confirm.
tagger.tag(['Je suis à la maison'])
print(tagger._get_features([u"Je"], 0))

from nltk.tag.perceptron import PerceptronTagger

tagger = PerceptronTagger(load=False)
tagger.train(alldocs)

# adT_seq: list of lists of words (i.e. a list of sentences)
# "smart": tag each sentence as a whole, so the tagger sees the context.
allpred_smart = [[t for w, t in tagger.tag(sentence)] for sentence in adT_seq]
# "stupid": tag every word on its own, without any context.
allpred_stupid = [[tagger.tag([w])[0][1] for w in sentence]
                  for sentence in adT_seq]
def _read_annotated_tweets(paths):
    """Collect tweet texts and their 'part1.Response' annotations from JSON Lines files."""
    texts = []
    responses = []
    for path in paths:
        # The context manager closes each reader once it has been consumed.
        with jsonlines.open(path) as reader:
            for obj in reader:
                texts.append(obj['text'])
                responses.append(obj['annotation']['part1.Response'])
    return texts, responses


def _lemmatize_tweet(lemmatizer, tweet):
    """Tokenize and POS-tag one tweet, returning the lemma of every token."""
    lemmas = []
    for word, tag in pos_tag(word_tokenize(tweet)):
        pos = tag[0].lower()
        # NOTE(review): pos_tag presumably returns Penn Treebank tags, whose
        # adjective tags start with 'J'; if so the 'a' case never matches —
        # confirm before relying on adjective lemmatization.
        if pos in ('a', 'n', 'v'):
            lemmas.append(lemmatizer.lemmatize(word, pos))
        else:
            lemmas.append(lemmatizer.lemmatize(word))
    return lemmas


def _label_tokens(tokens, response):
    """Pair each token with P-Yes/P-No/D-Yes/D-No/Irr based on keyword and response."""
    positive_keywords = ("test", "positive", "result")
    death_keywords = ("die", "dead", "death", "pass", "away")

    # Only the exact annotations ["yes"] and ["no"] yield event labels; any
    # other annotation makes every token of the tweet "Irr".
    if response == ["yes"]:
        answer = "Yes"
    elif response == ["no"]:
        answer = "No"
    else:
        answer = None

    labelled = []
    for token in tokens:
        lowered = token.lower()
        if answer is not None and lowered in positive_keywords:
            label = "P-" + answer
        elif answer is not None and lowered in death_keywords:
            label = "D-" + answer
        else:
            label = "Irr"
        labelled.append([token, label])
    return labelled


def _yes_precision_recall(gold, predicted):
    """Return (precision, recall) of the 'Yes' class over ALL rows; 0.0 when undefined."""
    true_positive = 0
    predicted_yes = 0
    gold_yes = 0
    for gold_label, predicted_label in zip(gold, predicted):
        gold_is_yes = 'Yes' in gold_label
        predicted_is_yes = 'Yes' in predicted_label
        gold_yes += gold_is_yes
        predicted_yes += predicted_is_yes
        true_positive += gold_is_yes and predicted_is_yes
    precision = true_positive / predicted_yes if predicted_yes else 0.0
    recall = true_positive / gold_yes if gold_yes else 0.0
    return precision, recall


def _f1(precision, recall):
    """Harmonic mean of precision and recall; 0.0 when both are zero."""
    if precision + recall == 0:
        return 0.0
    return (2 * precision * recall) / (precision + recall)


def main(positive, death):
    """Train CRF taggers on annotated tweets and report their performance.

    The function reads two JSON Lines files, cleans and lemmatizes the tweet
    texts, labels each token, splits the data 70/30 into train and dev sets,
    trains one ``CRFTagger`` per feature function, writes the dev predictions
    to ``<condition>_final_output.tsv`` and prints accuracy, precision,
    recall, F1 and a confusion matrix per condition.

    Args:
        positive: Path of the first JSON Lines file passed to
            ``jsonlines.open``.
        death: Path of the second JSON Lines file. Every record of both files
            must provide ``obj['text']`` and
            ``obj['annotation']['part1.Response']``.

    Returns:
        None. Results are printed and written to files in the working
        directory ('model.tagger' and the two TSV files).
    """
    ############# Compile the dataset ###############
    ## Load the dataset
    text, response = _read_annotated_tweets([positive, death])

    ## Tweet Preprocessing
    prep_text = [p.clean(tweet) for tweet in text]

    ## Tag Keywords and Create Labels
    ### Focus on verbs--therefore, try lemmatization first
    wnl = WordNetLemmatizer()
    token_data = [
        _label_tokens(_lemmatize_tweet(wnl, tweet), answer)
        for tweet, answer in zip(prep_text, response)
    ]

    ## Shuffle and split into train data and dev data
    token_data = shuffle(token_data, random_state=6)
    train_data, dev_data = train_test_split(token_data,
                                            test_size=0.3,
                                            random_state=616)
    print(
        f"The number of sentences in training data: {len(train_data)}; The number of sentences in dev data: {len(dev_data)};"
    )

    ############# Fit A CRF Model And Predict ###############
    condition_to_func = {
        "base": my_features,
        "include_neighbors": neighbor_features
    }
    for cond, func in condition_to_func.items():
        # initialize
        crf = CRFTagger(feature_func=func)
        # NOTE(review): both conditions train into the same 'model.tagger'
        # file, so the later run presumably replaces the earlier model.
        crf.train(train_data, 'model.tagger')

        # Test (results are discarded; these calls only exercise the tagger)
        # NOTE(review): index 7 assumes the first tweet has enough tokens for
        # the feature function — confirm.
        crf._feature_func(prep_text[0].split(), 7)
        crf.tag_sents([['I', 'get', 'covid'], ['he', 'test', 'positive']])

        # Output: one "token<TAB>gold<TAB>prediction" line per token.
        filename = cond + "_final_output.tsv"
        # Explicit UTF-8 so non-ASCII tweet text is written the same way on
        # every platform (the reader below uses the same encoding).
        with open(filename, 'w', encoding='utf-8') as pred_file:
            for sent in dev_data:
                sent_words = [item[0] for item in sent]
                gold_tags = [item[1] for item in sent]
                with_tags = crf.tag(sent_words)
                for (original_word, tag_prediction), gold_tag in zip(
                        with_tags, gold_tags):
                    pred_file.write(
                        f"{original_word}\t{gold_tag}\t{tag_prediction}\n")
                # add an empty line after each sentence
                pred_file.write("\n")

    ############# Evaluation ###############
    # Label frequencies of the training set do not depend on the condition,
    # so they are counted once. Only the label of each [token, label] pair is
    # inspected, which also works when sentences differ in length or are empty.
    labels = ["P-Yes", "P-No", "D-Yes", "D-No"]
    train_label_counts = dict.fromkeys(labels, 0)
    for sent in train_data:
        for _token, tag in sent:
            if tag in train_label_counts:
                train_label_counts[tag] += 1

    column_name = ['token', 'label', 'prediction']
    covid_event = ['Test Positive', 'Death Case']

    ## Extract Data with Meaning Labels
    for cond in ['base', 'include_neighbors']:
        filename = cond + "_final_output.tsv"
        P_data = []
        D_data = []
        with open(filename, encoding='utf-8') as fd:
            for row in csv.reader(fd, delimiter="\t", quotechar='"'):
                # Blank separator lines yield rows with fewer than two fields.
                if len(row) > 1:
                    if row[1] in ('P-Yes', 'P-No'):
                        P_data.append(row)
                    elif row[1] in ('D-Yes', 'D-No'):
                        D_data.append(row)

        P_df = pd.DataFrame(P_data, columns=column_name)
        D_df = pd.DataFrame(D_data, columns=column_name)
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
        Total_df = pd.concat([P_df, D_df])

        # Accuracy
        ## Overall Accuracy
        T_a = accuracy_score(Total_df['label'], Total_df['prediction'])

        ## Accuracy, Precision, Recall and F-1 for the two events
        accuracy = []
        precision = []
        recall = []
        f1 = []
        for df in (P_df, D_df):
            accuracy.append(accuracy_score(df['label'], df['prediction']))
            event_precision, event_recall = _yes_precision_recall(
                df['label'], df['prediction'])
            precision.append(event_precision)
            recall.append(event_recall)
            f1.append(_f1(event_precision, event_recall))

        # Report performance
        print("condition: " + cond)
        print(f"Overall Accuracy {T_a:0.03}")
        for num, event in enumerate(covid_event):
            print(
                f"Scores for {event} : \taccuracy {accuracy[num]:0.03}\tprecision {precision[num]:0.03}\trecall {recall[num]:0.03}\tF1 {f1[num]:0.03}"
            )

        ## Baseline Performance / Confusion Matrix
        print("Confusion Matrix:")
        print(pd.crosstab(Total_df['label'], Total_df['prediction']))
        print("Training data:")
        for label in labels:
            n_label = train_label_counts[label]
            print(f"Number of {label}: {n_label}")