Example #1
def start_report_classification_training(dset_name):
    print("Start report classification using MAUDE\n")

    if dset_name == "100000_random_entries":
        print("Dataset:", dset_name, "\n")
        loadpath = os.path.join("data", "tokenized",
                                "100000_random_entries_prod_codes.pkl")
        pd_maude = read_pickle(loadpath)
        print(pd_maude)

    elif dset_name == "subset_2":
        print("Dataset:", dset_name, "\n")
        loadpath = os.path.join("data", "tokenized", "subset_2.pkl")
        pd_maude = read_pickle(loadpath)
        print(pd_maude)

    elif dset_name == "whole_maude":
        pd_maude = read_whole_MAUDE(version="tokenized")

    else:
        raise ValueError(
            "dset_name must be one of [100000_random_entries, subset_2, whole_maude]. "
            f"Got {dset_name}")

    pd_dset_tokenized = Maude_pd_dataset(pd_maude)
    pd_dset_tokenized.unpack_device_column()
    print(pd_dset_tokenized.dataset)
    print(pd_dset_tokenized.dataset[[
        "tokenized_text", "device_report_product_code"
    ]])

    exit(99)  # debug stop: inspect the unpacked dataset without starting LSTM training

    train_LSTM(pd_dset_tokenized)
Example #2
def tokenize_pd_maude(maude_pd_dataset: Maude_pd_dataset):
    """Get all texts from the MAUDE entries, i.e. unpack the mdr_text field to be one row per text"""
    start = time.time()
    maude_pd_dataset.explode()
    print("Explode:", time.time() - start)
    maude_pd_dataset.unpack_mdr_text_column()
    maude_pd_dataset.dataset.dropna(subset=["text"], inplace=True)
    print(maude_pd_dataset.dataset)
    """Tokenize the dataset"""
    """takes some time. Afterwards, save as .pkl"""
    pd_tokenized = apply_tokenization_on_dataset(maude_pd_dataset.dataset)

    return Maude_pd_dataset(pd_tokenized)
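A minimal end-to-end sketch of how the two functions above are meant to chain together: load a raw MAUDE pickle, run tokenize_pd_maude, and cache the tokenized frame so start_report_classification_training can read it back later. The DataImporter import paths are the ones shown in Example #6; the module that exports tokenize_pd_maude is not shown in these examples, so that import is an assumption.

import os

from DataImporter.utils import load_pkl
from DataImporter.MAUDE.maude_dset import Maude_pd_dataset
# Assumption: tokenize_pd_maude lives in a training module; adjust to the real path.
from training import tokenize_pd_maude

pkl_name = "100000_random_entries_prod_codes.pkl"
raw_dset = Maude_pd_dataset(load_pkl(os.path.join("data", "MAUDE", pkl_name)))

# Explode and tokenize (slow), then cache the result under data/tokenized/.
tokenized = tokenize_pd_maude(raw_dset)
tokenized.dataset.to_pickle(os.path.join("data", "tokenized", pkl_name))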
Example #3
    def __init__(self, pickle_data):
        self.hobj = hunspell.HunSpell(os.path.join("..", "data", "hunspell", "en_US.dic.txt"),
                                      os.path.join("..", "data", "hunspell", "en_US.aff.txt"))
        ADDITIONAL_DICTIONARY = 'additional_vocabulary_terms.txt'
        # Extend the hunspell dictionary with project-specific terms (one per line).
        with codecs.open(os.path.join("data", ADDITIONAL_DICTIONARY), 'r', 'utf-8') as term_file:
            for term in term_file:
                if term.strip():
                    self.hobj.add(term.split()[0])

        pkl = load_pkl(os.path.join("..", "data", "tokenized", pickle_data))
        self.data = Maude_pd_dataset(pkl).dataset
        self.valid_tokens = ValidTokens().tokens
        self.vocabulary = Counter()
        self.OOV = Counter()
        # Tokens ordered by occurrence
        self.occ = Counter()
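The constructor above first extends the stock en_US hunspell dictionary with project-specific terms, so that domain vocabulary is not flagged as out-of-vocabulary later. A standalone sketch of just that step, reusing the paths from the example; the sample words passed to spell() are purely illustrative.

import codecs
import os

import hunspell

# Same dictionary files as in the example above (.dic first, then .aff).
hobj = hunspell.HunSpell(os.path.join("..", "data", "hunspell", "en_US.dic.txt"),
                         os.path.join("..", "data", "hunspell", "en_US.aff.txt"))

# Register extra domain terms: one term per line, first whitespace-separated field is used.
with codecs.open(os.path.join("data", "additional_vocabulary_terms.txt"), "r", "utf-8") as f:
    for line in f:
        if line.strip():
            hobj.add(line.split()[0])

# spell() now accepts standard English plus the added domain vocabulary.
print(hobj.spell("device"))        # True
print(hobj.spell("defibrillatr"))  # False unless listed in the additional terms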
Example #4
def read_whole_MAUDE(version):

    if version == "raw":
        pkl = read_all_pd_chunks(os.path.join("data", "MAUDE", "all_entries"))
        pd_maude_dset = Maude_pd_dataset(pkl)

    elif version == "tokenized":
        pd_maude = read_all_pd_chunks(
            os.path.join("data", "tokenized", "MAUDE"))
        pd_maude_dset = Maude_pd_dataset(pd_maude)

    else:
        raise ValueError("version must be one of [raw, tokenized]. Got",
                         version)

    return pd_maude_dset
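A brief usage sketch of the dispatcher above, assuming it is called from the same module and that the chunked pickles exist under the expected directories.

# Load the full tokenized corpus; pass version="raw" to read the untokenized chunks instead.
tokenized_dset = read_whole_MAUDE(version="tokenized")
print(tokenized_dset.dataset.shape)

# Any other value raises the ValueError from the else branch:
# read_whole_MAUDE(version="lemmatized")  # -> ValueError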
Example #5
def load_maude_dset(path):

    pkl = load_pkl(path)

    pd_maude = Maude_pd_dataset(pkl)

    return pd_maude
Example #6
from DataImporter.utils import open_json, load_pkl
import os
from DataImporter.MAUDE.maude_dset import Maude_pd_dataset

import pandas as pd

pd.set_option('display.max_columns', 80)
pd.set_option('display.width', 100000)
pd.set_option('display.max_rows', 50)

dset = open_json(
    os.path.join("data", "MAUDE", "2020q1", "device-event-0001-of-0004.json"))
m = Maude_pd_dataset(dset)
print(m.dataset)
print(m.get_all_report_texts())

pkl = load_pkl(
    os.path.join("data", "MAUDE", "100000_random_entries_prod_codes.pkl"))
subset_2 = Maude_pd_dataset(pkl)
print(subset_2.get_all_report_texts())

exit(99)
"""
print(m[0])
print(m.get_report_number(1))
print(m.get_product_problems(0))

list = list()
for i in range(len(m)):
    pp = m.get_product_problems(i)
    for p in pp:
Example #7
class MaudeVocabulary():

    def __init__(self, pickle_data):
        self.hobj = hunspell.HunSpell(os.path.join("..", "data", "hunspell", "en_US.dic.txt"),
                                      os.path.join("..", "data", "hunspell", "en_US.aff.txt"))
        ADDITIONAL_DICTIONARY = 'additional_vocabulary_terms.txt'
        # Extend the hunspell dictionary with project-specific terms (one per line).
        with codecs.open(os.path.join("data", ADDITIONAL_DICTIONARY), 'r', 'utf-8') as term_file:
            for term in term_file:
                if term.strip():
                    self.hobj.add(term.split()[0])

        pkl = load_pkl(os.path.join("..", "data", "tokenized", pickle_data))
        self.data = Maude_pd_dataset(pkl).dataset
        self.valid_tokens = ValidTokens().tokens
        self.vocabulary = Counter()
        self.OOV = Counter()
        # Tokens ordered by occurrence
        self.occ = Counter()



    def get_vocab(self):
        for idx, row in self.data.iterrows():
            try:
                tokens = row["tokenized text"]
                for token in tokens:
                    self.vocabulary[token] += 1
            except (KeyError, TypeError):
                # Row has no usable "tokenized text" entry (e.g. NaN); skip it.
                pass


    def OOV_words(self):
        for tok in self.vocabulary:
            if (not self.hobj.spell(tok)
                    and tok not in self.valid_tokens
                    and tok not in string.punctuation):
                self.OOV[tok] = self.vocabulary[tok]


    def print_numbers(self):
        print('token count: ', sum(self.vocabulary.values()))
        print('OOV tokens: ', sum(self.OOV.values()))
        print('type count: ', len(self.vocabulary))
        print('OOV types: ', len(self.OOV))
        for i in range(1, 10):
            print('Tokens occurring ', i, ' times: ', self.token_occurence(i))
        # Count OOV types that occur exactly once or exactly twice.
        OOV_hapax_count = sum(1 for word in self.occ.get(1, []) if word in self.OOV)
        print('OOV words that occur only once: ', OOV_hapax_count)
        OOV_hapax2_count = sum(1 for word in self.occ.get(2, []) if word in self.OOV)
        print('OOV words that occur only twice: ', OOV_hapax2_count)


    def write_to_file(self, filename, oov_filename):
        with open(filename, 'w') as vocab:
            for tok, count in self.vocabulary.most_common():
                vocab.write(tok + '\t' + str(count) + '\n')
        with open(oov_filename, 'w') as oov:
            for tok, count in self.OOV.most_common():
                oov.write(tok + '\t' + str(count) + '\n')


    def tokens_ordered_by_occurence(self):
        # Group types by frequency: self.occ[v] is the list of types that occur v times.
        for k, v in self.vocabulary.items():
            self.occ.setdefault(v, []).append(k)


    # How many types occur n times
    def token_occurence(self, n):
        return len(self.occ.get(n, []))


    def get_numbers(self):
        self.get_vocab()
        self.OOV_words()
        self.tokens_ordered_by_occurence()
        self.write_to_file('../data/vocab/100000_entries_vocab.txt', '../data/vocab/100000_entries_OOV_vocab.txt')
        self.print_numbers()
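Driving the class only takes a tokenized pickle plus the hunspell files referenced in the constructor. A hedged driver sketch; the module name maude_vocabulary is an assumption, and the pickle name is the one used in the other examples.

# Assumption: MaudeVocabulary is importable from a module named maude_vocabulary.
from maude_vocabulary import MaudeVocabulary

vocab = MaudeVocabulary("100000_random_entries_prod_codes.pkl")
# Builds the vocabulary, flags OOV types, groups types by frequency,
# writes ../data/vocab/*.txt and prints the summary counts.
vocab.get_numbers()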
Example #8
        return seg_text


    # Complete pipeline: raw text in; tokenize, optionally lemmatize, then clean; returns a list of tokens.
    def pipe(self, text, only_tokens=False):
        output = self.tokenizer(text)
        if not only_tokens:
            output = self.lemmatizer(output)
        output = self.clean(output)
        return output



if __name__ == "__main__":
    P = MaudePreprocessor()
    pkl_name = "100000_random_entries_prod_codes.pkl"
    savepath = os.path.join("data", "tokenized", pkl_name)

    pkl = load_pkl(os.path.join("data", "MAUDE", pkl_name))
    subset_2 = Maude_pd_dataset(pkl)
    pd_texts = subset_2.get_all_report_texts()
    #pd_texts = subset_2.get_all_report_texts(mode="reports+texts")
    for idx, row in pd_texts.iterrows():
        try:
            #prep = P.pipe(row["mdr_text"]["text"])
            prep = P.pipe(row["text"])
            pd_texts.at[idx, "tokenized text"] = prep
        except Exception:
            # Skip rows whose text is missing or cannot be tokenized.
            pass
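The __main__ block above defines savepath but the visible snippet ends before using it. A hedged continuation sketch, assuming pd_texts and savepath are the objects built above and that the intent is to cache the tokenized texts the same way the other examples read them back:

# Not part of the original snippet: persist the tokenized texts for later training runs.
pd_texts.to_pickle(savepath)
print("Saved tokenized texts to", savepath)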