Beispiel #1
0
 def demo(self, test_sents):
     tagger = CRFTagger(feature_func=self.feature_detector)
     tagger.set_model_file(self.modelpath)
     for sent in test_sents:
         tagged = tagger.tag(untag(sent))
         for s in self._to_sentence(tagged):
             print(s)
     print(tagger.evaluate(test_sents))
Beispiel #2
0
    def pyt_sent_tokenizer(self, paragraph):
        """단락을 문장으로 바꿔주는 함수입니다. 파이테스트용입니다.

        Args:
            paragraph(list(str)): 단락이 리스트 인자로 들어옵니다.

        Returns:
            sentences(list(list(str))): 단락을 문장단위로 잘라서 반환합니다.
        """
        tagger = CRFTagger(feature_func=self.feature_detector)
        tagger.set_model_file(self.modelpath)
        words = re.split('\s+', paragraph.strip())
        tagged = tagger.tag(words)
        return self._to_sentence(tagged)
Beispiel #3
0
    def batch_sent_tokenizer(self, paragraphs):
        """단락들을 문장으로 바꿔주는 함수입니다.

        Args:
            paragraphs(list(str)): 단락들이 리스트 인자로 들어옵니다.

        Returns:
            sentences(list(str)): 단락을 문장단위로 잘라서 반환합니다.
        """
        tagger = CRFTagger(feature_func=self.feature_detector)
        tagger.set_model_file(self.modelpath)
        sentences = []
        for paragraph in paragraphs:
            words = re.split('\s', paragraph.strip())
            tagged = tagger.tag(words)
            sentences.append(self._to_sentence(tagged))
        return sentences
y = np.array(y)
y_hat = np.array(y_hat)

print("hmm acc : ", (y == y_hat).mean())

#named entities recognition
import pickle

a = pickle.load(
    open(
        "/users/Etu0/3770640/M1/Sem2/TAL/TME1/maxent_ne_chunker/PY3/english_ace_multiclass.pickle",
        "rb"))

from nltk.tag.crf import CRFTagger

tagger = CRFTagger()
tagger.train(alldocs, u'crf.model'
             )  # donner en plus le fichier de stockage du calcul des features

tagger.tag(['Je suis à la maison'])
print(tagger._get_features([u"Je"], 0))

from nltk.tag.perceptron import PerceptronTagger
tagger = PerceptronTagger(load=False)
tagger.train(alldocs)

# adT_seq: liste de liste de mots (=liste de phrase)
allpred_smart = [[t for w, t in tagger.tag(adT_seq[i])]
                 for i in range(len(adT_seq))]
allpred_stupid = [[tagger.tag([w])[0][1] for w in adT_seq[i]]
                  for i in range(len(adT_seq))]
Beispiel #5
0
def main(positive, death):
    ############# Compile the dataset ###############
    ## Load the dataset
    text = list()
    response = list()
    file_path = [positive, death]

    for path in file_path:
        input_file = jsonlines.open(path)
        for obj in input_file:
            text.append(obj['text'])
            response.append(obj['annotation']['part1.Response'])

    ## Tweet Preprocessing
    prep_text = list()
    for i in text:
        prep_text.append(p.clean(i))

    ## Tag Keywords and Create Labels
    ### Focus on verbs--therefore, try lemmatization first
    wnl = WordNetLemmatizer()
    n_corpus = len(prep_text)
    token_data = ["test"] * n_corpus

    n = 0
    for sent in prep_text:
        token_data[n] = [
            wnl.lemmatize(i, j[0].lower())
            if j[0].lower() in ['a', 'n', 'v'] else wnl.lemmatize(i)
            for i, j in pos_tag(word_tokenize(sent))
        ]
        n = n + 1

    ### Create labels
    death_list = ["die", "dead", "death", "pass", "away"]

    n = 0
    for sent in token_data:
        for idx, token in enumerate(sent):
            if ((token.lower() in ["test", "positive", "result"])
                    and (response[n] == ["yes"])):
                sent[idx] = [sent[idx], "P-Yes"]
            elif ((token.lower() in ["test", "positive", "result"])
                  and (response[n] == ["no"])):
                sent[idx] = [sent[idx], "P-No"]
            elif ((token.lower() in death_list) and (response[n] == ["yes"])):
                sent[idx] = [sent[idx], "D-Yes"]
            elif ((token.lower() in death_list) and (response[n] == ["no"])):
                sent[idx] = [sent[idx], "D-No"]
            else:
                sent[idx] = [sent[idx], "Irr"]
        n = n + 1

    ## Shuffle and split into train data and dev data
    token_data = shuffle(token_data, random_state=6)
    train_data, dev_data = train_test_split(token_data,
                                            test_size=0.3,
                                            random_state=616)
    print(
        f"The number of sentences in training data: {len(train_data)}; The number of sentences in dev data: {len(dev_data)};"
    )

    ############# Fit A CRF Model And Predict ###############
    condition_to_func = {
        "base": my_features,
        "include_neighbors": neighbor_features
    }
    for cond, func in condition_to_func.items():
        # initialize
        crf = CRFTagger(feature_func=func)
        crf.train(train_data, 'model.tagger')
        # Test
        crf._feature_func(prep_text[0].split(), 7)
        crf.tag_sents([['I', 'get', 'covid'], ['he', 'test', 'positive']])

        # Output
        filename = cond + "_final_output.tsv"
        with open(filename, 'w') as pred_file:
            for sent in dev_data:
                sent_words = [item[0] for item in sent]
                gold_tags = [item[1] for item in sent]

                with_tags = crf.tag(sent_words)
                for i, output in enumerate(with_tags):
                    original_word, tag_prediction = output
                    line_as_str = f"{original_word}\t{gold_tags[i]}\t{tag_prediction}\n"
                    pred_file.write(line_as_str)
                # add an empty line after each sentence
                pred_file.write("\n")

    ############# Evaluation ###############
    ## Extract Data with Meaning Labels
    cond_list = ['base', 'include_neighbors']

    for cond in cond_list:
        filename = cond + "_final_output.tsv"

        with open(filename) as fd:
            rd = csv.reader(fd, delimiter="\t", quotechar='"')
            D_data = []
            P_data = []
            for row in rd:
                if len(row) > 1:
                    if row[1] in ['P-Yes', 'P-No']:
                        P_data.append(row)
                    elif row[1] in ['D-Yes', 'D-No']:
                        D_data.append(row)

        column_name = ['token', 'label', 'prediction']
        P_df = pd.DataFrame(P_data, columns=column_name)
        D_df = pd.DataFrame(D_data, columns=column_name)
        Total_df = P_df.append(D_df)

        # Accuracy
        ## Overall Accuracy
        T_a = accuracy_score(Total_df['label'], Total_df['prediction'])

        ## Accuracy, Precision, and Recall for two events
        accuracy = []
        precision = []
        recall = []
        for df in [P_df, D_df]:
            accuracy.append(accuracy_score(df['label'], df['prediction']))
            precision.append(
                sum(1 for item in range(0,
                                        len(df) - 1)
                    if ('Yes' in df['label'][item]
                        and 'Yes' in df['prediction'][item])) /
                sum(1 for item in range(0,
                                        len(df) - 1)
                    if ('Yes' in df['prediction'][item])))
            recall.append(
                sum(1 for item in range(0,
                                        len(df) - 1)
                    if ('Yes' in df['label'][item]
                        and 'Yes' in df['prediction'][item])) /
                sum(1 for item in range(0,
                                        len(df) - 1)
                    if ('Yes' in df['label'][item])))

        ## F-1
        f1 = []
        for num in [0, 1]:
            f1.append((2 * precision[num] * recall[num]) /
                      (precision[num] + recall[num]))

        # Report performance
        print("condition: " + cond)
        print(f"Overall Accuracy {T_a:0.03}")
        covid_event = ['Test Positive', 'Death Case']

        num = 0
        for event in covid_event:
            print(
                f"Scores for {event} : \taccuracy {accuracy[num]:0.03}\tprecision {precision[num]:0.03}\trecall {recall[num]:0.03}\tF1 {f1[num]:0.03}"
            )
            num = num + 1

    ## Basicline Performance / Confusion Matrix
    print("Confusion Matrix:")
    print(pd.crosstab(Total_df['label'], Total_df['prediction']))
    print("Training data:")
    labels = ["P-Yes", "P-No", "D-Yes", "D-No"]
    for label in labels:
        train_data2 = np.concatenate(train_data).flat
        n_label = sum(1 for item in train_data2 if item == label)
        print(f"Number of {label}: {n_label}")