import os

from pandas import read_pickle  # assumed to be pandas.read_pickle

from DataImporter.MAUDE.maude_dset import Maude_pd_dataset
# read_whole_MAUDE and train_LSTM are project-local helpers defined elsewhere
# in this repository; their import paths are not shown in this excerpt.


def start_report_classification_training(dset_name):
    print("Start report classification using MAUDE\n")
    if dset_name == "100000_random_entries":
        print("Dataset:", dset_name, "\n")
        loadpath = os.path.join("data", "tokenized",
                                "100000_random_entries_prod_codes.pkl")
        pd_maude = read_pickle(loadpath)
        print(pd_maude)
    elif dset_name == "subset_2":
        print("Dataset:", dset_name, "\n")
        loadpath = os.path.join("data", "tokenized", "subset_2.pkl")
        pd_maude = read_pickle(loadpath)
        print(pd_maude)
    elif dset_name == "whole_maude":
        pd_maude = read_whole_MAUDE(version="tokenized")
    else:
        raise ValueError(
            "dset_name must be one of "
            "[100000_random_entries, subset_2, whole_maude]. Got", dset_name)

    pd_dset_tokenized = Maude_pd_dataset(pd_maude)
    pd_dset_tokenized.unpack_device_column()
    print(pd_dset_tokenized.dataset)
    print(pd_dset_tokenized.dataset[["tokenized_text",
                                     "device_report_product_code"]])
    exit(99)  # debugging stop: the training call below is currently unreachable
    train_LSTM(pd_dset_tokenized)
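# Example usage (a sketch; assumes the tokenized pickle for subset_2 exists
# under data/tokenized/):
# start_report_classification_training("subset_2")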
import time


def tokenize_pd_maude(maude_pd_dataset: Maude_pd_dataset):
    """Get all texts from the MAUDE entries, i.e. unpack the mdr_text field
    to be one row per text, then tokenize the dataset."""
    start = time.time()
    maude_pd_dataset.explode()
    print("Explode:", time.time() - start)
    maude_pd_dataset.unpack_mdr_text_column()
    maude_pd_dataset.dataset.dropna(subset=["text"], inplace=True)
    print(maude_pd_dataset.dataset)
    # Tokenization takes some time; afterwards, save the result as .pkl.
    pd_tokenized = apply_tokenization_on_dataset(maude_pd_dataset.dataset)
    return Maude_pd_dataset(pd_tokenized)
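# For reference, a minimal sketch of what apply_tokenization_on_dataset could
# look like; the real implementation is not shown in this excerpt, and the
# name apply_tokenization_on_dataset_sketch below is illustrative only. It
# reuses MaudePreprocessor.pipe, the tokenizer driven in the preprocessor
# script's __main__ block.
def apply_tokenization_on_dataset_sketch(df):
    preprocessor = MaudePreprocessor()
    df = df.copy()
    # Tokenize every report text; rows without a string text are left as None.
    df["tokenized text"] = df["text"].map(
        lambda t: preprocessor.pipe(t) if isinstance(t, str) else None)
    return df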
def read_whole_MAUDE(version):
    if version == "raw":
        pkl = read_all_pd_chunks(os.path.join("data", "MAUDE", "all_entries"))
        pd_maude_dset = Maude_pd_dataset(pkl)
    elif version == "tokenized":
        pd_maude = read_all_pd_chunks(
            os.path.join("data", "tokenized", "MAUDE"))
        pd_maude_dset = Maude_pd_dataset(pd_maude)
    else:
        raise ValueError("version must be one of [raw, tokenized]. Got",
                         version)
    return pd_maude_dset
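# A minimal sketch of what read_all_pd_chunks could look like, assuming the
# chunks are pandas pickles stored side by side in one directory; the real
# helper is not shown in this excerpt and may differ.
import glob

import pandas as pd


def read_all_pd_chunks_sketch(directory):
    # Concatenate every .pkl chunk in the directory into one DataFrame.
    paths = sorted(glob.glob(os.path.join(directory, "*.pkl")))
    return pd.concat((pd.read_pickle(p) for p in paths), ignore_index=True)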
def load_maude_dset(path):
    pkl = load_pkl(path)
    pd_maude = Maude_pd_dataset(pkl)
    return pd_maude
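# Example usage (a sketch; assumes the pickle exists):
# dset = load_maude_dset(os.path.join("data", "tokenized", "subset_2.pkl"))
# print(dset.dataset)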
from DataImporter.utils import open_json, load_pkl
import os

from DataImporter.MAUDE.maude_dset import Maude_pd_dataset
import pandas as pd

pd.set_option('display.max_columns', 80)
pd.set_option('display.width', 100000)
pd.set_option('display.max_rows', 50)

dset = open_json(
    os.path.join("data", "MAUDE", "2020q1", "device-event-0001-of-0004.json"))
m = Maude_pd_dataset(dset)
print(m.dataset)
print(m.get_all_report_texts())

pkl = load_pkl(
    os.path.join("data", "MAUDE", "100000_random_entries_prod_codes.pkl"))
subset_2 = Maude_pd_dataset(pkl)
print(subset_2.get_all_report_texts())
exit(99)  # debugging stop; the block below is kept for reference only

"""
print(m[0])
print(m.get_report_number(1))
print(m.get_product_problems(0))
list = list()
for i in range(len(m)):
    pp = m.get_product_problems(i)
    for p in pp:
import codecs
import os
import string
from collections import Counter, defaultdict

import hunspell

from DataImporter.utils import load_pkl
from DataImporter.MAUDE.maude_dset import Maude_pd_dataset
# ValidTokens is a project-local helper; its import path is not shown in this
# excerpt.


class MaudeVocabulary:

    def __init__(self, pickle_data):
        self.hobj = hunspell.HunSpell(
            os.path.join("..", "data", "hunspell", "en_US.dic.txt"),
            os.path.join("..", "data", "hunspell", "en_US.aff.txt"))
        ADDITIONAL_DICTIONARY = 'additional_vocabulary_terms.txt'
        terms = codecs.open(os.path.join("data", ADDITIONAL_DICTIONARY),
                            'r', 'utf-8').readlines()
        for term in terms:
            term = term.split()[0]
            self.hobj.add(term)
        pkl = load_pkl(os.path.join("..", "data", "tokenized", pickle_data))
        self.data = Maude_pd_dataset(pkl).dataset
        self.valid_tokens = ValidTokens().tokens
        self.vocabulary = Counter()
        self.OOV = Counter()
        # Tokens grouped by occurrence count: {count: [token, ...]}.
        # A defaultdict(list) is used instead of a Counter so that
        # token_occurence(n) returns 0 for counts that never occur
        # (len() on a Counter's default value of 0 would raise a TypeError).
        self.occ = defaultdict(list)

    def get_vocab(self):
        for i, (idx, row) in enumerate(self.data.iterrows()):
            try:
                tokens = row["tokenized text"]
                for token in tokens:
                    self.vocabulary[token] += 1
            except (KeyError, TypeError):
                # Row has no tokenized text; skip it.
                pass

    def OOV_words(self):
        for word in self.vocabulary:
            if not self.hobj.spell(word):
                if word not in self.valid_tokens:
                    if word not in string.punctuation:
                        self.OOV[word] = self.vocabulary[word]

    def print_numbers(self):
        print('token count: ', sum(self.vocabulary.values()))
        print('OOV tokens: ', sum(self.OOV.values()))
        print('type count: ', len(self.vocabulary))
        print('OOV types: ', len(self.OOV))
        for i in range(1, 10):
            print('Tokens occurring ', i, ' times: ', self.token_occurence(i))
        OOV_hapax_count = 0
        for word in self.occ[1]:
            if word in self.OOV:
                OOV_hapax_count += 1
        print('OOV words that occur only once: ', OOV_hapax_count)
        OOV_hapax2_count = 0
        for word in self.occ[2]:
            if word in self.OOV:
                OOV_hapax2_count += 1
        print('OOV words that occur only twice: ', OOV_hapax2_count)

    def write_to_file(self, filename, oov_filename):
        with open(filename, 'w') as vocab:
            for word, count in self.vocabulary.most_common():
                vocab.write(word + '\t' + str(count) + '\n')
        with open(oov_filename, 'w') as oov:
            for word, count in self.OOV.most_common():
                oov.write(word + '\t' + str(count) + '\n')

    def tokens_ordered_by_occurence(self):
        for k, v in self.vocabulary.items():
            self.occ[v].append(k)

    # How many tokens occur n times.
    def token_occurence(self, n):
        return len(self.occ[n])

    def get_numbers(self):
        self.get_vocab()
        self.OOV_words()
        self.tokens_ordered_by_occurence()
        self.write_to_file('../data/vocab/100000_entries_vocab.txt',
                           '../data/vocab/100000_entries_OOV_vocab.txt')
        self.print_numbers()
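# Example usage (a sketch; assumes the tokenized pickle exists under
# ../data/tokenized/ and that ../data/vocab/ is writable):
# vocab = MaudeVocabulary("100000_random_entries_prod_codes.pkl")
# vocab.get_numbers()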
        return seg_text

    # Complete pipeline: raw text in, tokenize, lemmatize and clean,
    # list of tokens out.
    def pipe(self, text, only_tokens=False):
        output = self.tokenizer(text)
        if not only_tokens:
            output = self.lemmatizer(output)
            output = self.clean(output)
        return output


if __name__ == "__main__":
    P = MaudePreprocessor()
    pkl_name = "100000_random_entries_prod_codes.pkl"
    savepath = os.path.join("data", "tokenized", pkl_name)
    pkl = load_pkl(os.path.join("data", "MAUDE", pkl_name))
    subset_2 = Maude_pd_dataset(pkl)
    pd_texts = subset_2.get_all_report_texts()
    #pd_texts = subset_2.get_all_report_texts(mode="reports+texts")
    for i, (idx, row) in enumerate(pd_texts.iterrows()):
        try:
            #prep = P.pipe(row["mdr_text"]["text"])
            prep = P.pipe(row["text"])
            pd_texts.at[idx, "tokenized text"] = prep
        except (KeyError, TypeError):
            # Row has no usable text; leave it untokenized.
            pass
    # savepath was computed above but never used in the original script;
    # persisting the tokenized texts appears to be the intent (assumption).
    pd_texts.to_pickle(savepath)