def __init__(self, BASEDIR, session_only=False, cycle_time=1):
    super().__init__(BASEDIR, session_only, cycle_time)
    self.name = 'contentrank'

    mapper = Mapping()
    self.rec_mapping = mapper.get_header_rec()
    self.event_mapping = mapper.get_header_event()
    self.update_mapping = mapper.get_header_update()

    self.item_id_idx = self.rec_mapping.index('ITEM_SOURCE')
    self.publisher_id_idx = self.rec_mapping.index('PUBLISHER')
    self.recs_idx = self.event_mapping.index('recs')
    self.limit_idx = self.rec_mapping.index('limit')
    self.title_idx = self.update_mapping.index('title')
    self.text_idx = self.update_mapping.index('text')
    self.update_id_idx = self.update_mapping.index('id')
    self.update_domainid_idx = self.update_mapping.index('domainid')

    self.germanStemmer = GermanStemmer(ignore_stopwords=True)
    self.stopwords = stopwords.words('german')

    self.stems = {}  # (item, [stem, stem, stem])
    self.correct = 0
    self.total_events = 0
    self.nrrows = 0
    self.counts = {}
def build_stems(pattern: str, category: Category,
                elements: List[Tuple[Category, Set[str]]],
                total_stems: Set[str]) -> Set[str]:
    """
    Builds a set of stems for all words used in the pattern.

    Args:
        pattern: The pattern to tokenize and stem.
        category: The category of the pattern.
        elements: A mutable list of (category, stems) pairs that the new
            stems will be appended to.
        total_stems: The set of total stems before this function was
            invoked. Will not be mutated.

    Returns:
        The union of total_stems and the stems found in the pattern.
    """
    # Tokenize pattern into words
    words = nltk.word_tokenize(pattern)

    # Get stems for the pattern's words, as a set to avoid duplicates
    stemmer = GermanStemmer()
    stems: Set[str] = {stemmer.stem(w.lower()) for w in words}

    # Associate the new stems with the category and record them in the
    # elements list.
    elements.append((category, stems))

    # Return the union with the total set of stems, needed for conversion to a
    # numeric TensorFlow training array. Using `|` instead of `|=` keeps the
    # documented promise that the caller's set is not mutated.
    return total_stems | stems
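# Usage sketch for build_stems, with a hypothetical Category enum standing in
# for the project's own Category type; the patterns below are made-up examples.
from enum import Enum
from typing import List, Set, Tuple


class Category(Enum):
    GREETING = 1
    WEATHER = 2


elements: List[Tuple[Category, Set[str]]] = []
total_stems: Set[str] = set()

# Each call records (category, stems) in `elements` and returns the grown
# stem vocabulary, which later becomes the numeric training array's feature axis.
total_stems = build_stems("Guten Morgen", Category.GREETING, elements, total_stems)
total_stems = build_stems("Wie wird das Wetter morgen?", Category.WEATHER, elements, total_stems)
print(sorted(total_stems))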
def cosine_preprocess(texts, pickle_name, pickle_folder='pickle'):
    pickle_path = os.path.join(pickle_folder, pickle_name)

    # Return from disk if possible for efficiency reasons
    if os.path.exists(pickle_path):
        with open(pickle_path, 'rb') as f:
            return pickle.load(f)

    # Build the stemmer and stopword list once, not per text
    stemmer = GermanStemmer()
    words = stopwords.words('german')

    processed = []
    for text in tqdm(texts):
        tokens = [
            stemmer.stem(token) for token in word_tokenize(text)
            if token not in words
        ]
        processed.append(' '.join(tokens))

    # Pickle the output
    if not os.path.exists(pickle_folder):
        os.makedirs(pickle_folder)
    with open(pickle_path, 'wb') as f:
        pickle.dump(processed, f)

    return processed
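# Usage sketch for cosine_preprocess (the file name 'demo_corpus.p' is made up):
# the first call stems and stopword-filters the texts and caches the result
# under pickle/demo_corpus.p; later calls with the same name load the pickle.
texts = ["Die Häuser sind groß.", "Wir besichtigen viele Häuser."]
processed = cosine_preprocess(texts, pickle_name='demo_corpus.p')
print(processed)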
def tokenize(self, tweet):
    tweet = remove_handles(tweet)
    tweet = tweet.replace('#', ' ')
    tweet = tweet.replace('<', ' ')
    tweet = tweet.replace('>', ' ')
    tweet = tweet.replace('&', ' und ')
    tweet = tweet.replace('|LBR|', ' ')
    tweet = tweet.replace('-', ' ')
    tweet = tweet.replace('_', ' ')
    tweet = tweet.replace("'s", ' ')
    tweet = tweet.replace(",", ' ')
    tweet = tweet.replace(";", ' ')
    tweet = tweet.replace(":", ' ')
    tweet = tweet.replace("/", ' ')
    tweet = tweet.replace("+", ' ')

    tknzr = Tokenizer_NLTK(preserve_case=self.preserve_case, reduce_len=True)
    if self.join:
        return " ".join(tknzr.tokenize(tweet))
    elif self.use_stemmer:
        stmmr = Stemmer_NLTK()
        return [stmmr.stem(token) for token in tknzr.tokenize(tweet)]
    else:
        return tknzr.tokenize(tweet)
def evaluate_dnn(path: str):
    with open(os.path.join(path, "tag_to_int.json"), "rt") as f:
        tag_to_int = json.load(f)
    with open(os.path.join(path, "int_to_tag.json"), "rt") as f:
        int_to_tag = json.load(f)
    cv = pickle.load(open(os.path.join(path, "cv.p"), "rb"))

    stemmer = GermanStemmer()

    model_name = "dnn_intent_classification.h5"
    model = load_model(os.path.join(path, model_name))

    with open(os.path.join("Data", "commands", "Test", "testingdata.json"), "rt") as f:
        val_data = json.load(f)

    X = []
    y = []
    for tag, commands in val_data.items():
        for command in commands:
            command = " ".join(stemmer.stem(c) for c in sorted(word_tokenize(command)))
            X.append(transform_command_BoW(command, cv))
            y.append(tag_to_int[tag])

    X = np.array(X)
    y = np.array(y)

    predictions = model.predict(X)
    predicted_indices = np.argmax(predictions, 1)
    print("acc: ", accuracy_score(y, predicted_indices))

    cm = confusion_matrix(y, predicted_indices)
    cm = pd.DataFrame(cm, index=int_to_tag.values(), columns=int_to_tag.values())
    print(cm)
    return (accuracy_score(y, predicted_indices), cm)
def remove_stop_words(msg):
    # remove stop words and stem the remaining words
    stemmer = GermanStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(msg)
    stop_words = set(stopwords.words('german'))

    words_filtered = []
    for w in words:
        if w not in stop_words:
            words_filtered.append(stemmer.stem(w))
    return words_filtered
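# Example: stopwords such as "ist" and "eine" are removed before stemming.
# Note that the check is case-sensitive, so a capitalised stopword like "Das"
# slips through (and is then lower-cased by the stemmer).
print(remove_stop_words("Das ist eine wichtige Nachricht"))
# -> ['das', 'wichtig', 'nachricht']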
def __init__(self, config):
    self.config = config
    if config.stem:
        if config.lang == 'en':
            self.stemmer = PorterStemmer()
        elif config.lang == 'de':
            self.stemmer = GermanStemmer()
        else:
            self.stemmer = IdStemmer()
def _check_NE_yeah(gram):
    tag = entities.get(" ".join(gram), "O")
    if tag == "O":
        if len(gram) == 2:
            first, last = gram
            if first in vornamen and last in nachnamen:
                tag = "PER"
    if tag == "O":
        try:
            tag = entities.get(
                " ".join([GermanStemmer().stem(g) for g in gram]), "O")
        except:
            # Fallback for byte-string tokens that still need decoding
            # (Python 2 style input) before they can be stemmed.
            tag = entities.get(
                " ".join([
                    GermanStemmer().stem(g.decode(encoding="UTF-8"))
                    for g in gram
                ]), "O")
    return tag
def _preprocess(text, mode=None):
    '''helper function to preprocess text. returns a list of Sentences'''
    sentences = split_single(text)
    if mode:
        nlp = spacy.load('de_core_news_sm')
        if mode == 'lemmatize':
            sentences = [
                Sentence((' ').join([token.lemma_ for token in nlp(s)]))
                for s in sentences
            ]
        elif mode == 'stem':
            stemmer = GermanStemmer()
            sentences = [
                Sentence((' ').join(
                    [stemmer.stem(token.text) for token in nlp(s)]))
                for s in sentences
            ]
    else:
        sentences = [Sentence(s, use_tokenizer=True) for s in sentences]
    return sentences
def clean_text(text):
    """
    :param text:
    :return:
    """
    # stopwords = set(nltk.corpus.stopwords.words('german'))
    file_path = r'etc/models/german.txt'
    with open(file_path) as file:
        file_data = file.read()
    stopwords = file_data.split('\n')

    gs = GermanStemmer()

    text_cleaned = re.sub('[^a-zA-Z]', ' ', text)  # Keep only alphabet and space characters
    text_cleaned = text_cleaned.lower()            # All characters to lowercase
    text_cleaned = text_cleaned.split()            # Split into a list of words (split on whitespace)
    text_cleaned = [
        gs.stem(word) for word in text_cleaned if word not in stopwords
    ]
    text_cleaned = ' '.join(text_cleaned)
    return text_cleaned
def text_cleaner(text):
    use_GermanStemmer = False
    tokens = False

    # Remove username handles
    # -? do we need the user names
    text = remove_handles(text)

    # Remove punctuation marks
    text_blob = TextBlob(text)
    text = ' '.join(text_blob.words)

    # replace the umlauts
# =============================================================================
#     text = re.sub('ä', 'ae', text)
#     text = re.sub('ö', 'oe', text)
#     text = re.sub('ü', 'ue', text)
#     text = re.sub('Ä', 'Ae', text)
#     text = re.sub('Ö', 'Oe', text)
#     text = re.sub('Ü', 'Ue', text)
#     text = re.sub('ß', 'ss', text)
# =============================================================================

    # remove the numbers
    text = re.sub(r'[0-9]+', '', text)

    # Remove emojis
    german_char = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZäöüÄÖÜ"
    text = ''.join(c for c in text if c in german_char)

    tokenizer = TweetTokenizer(preserve_case=True, reduce_len=True)
    if tokens:
        return tokenizer.tokenize(text)
    elif use_GermanStemmer:
        stemmer = GermanStemmer()
        return [stemmer.stem(token) for token in tokenizer.tokenize(text)]
    else:
        return text
def ner_features(sentence, i, history):
    # TODO: try using TreeTagger's POS tag
    wordO = sentence[i]
    word = wordO.string
    pos = wordO.pos
    stemmed = GermanStemmer().stem(word)

    if i == 0:
        prevword, prevpos = "<START>", "<START>"
        last = "<START>"
        prevstemmed = "<START>"
    else:
        last = history[-1]
        prevword = sentence[i - 1].string
        prevpos = sentence[i - 1].pos
        prevstemmed = GermanStemmer().stem(sentence[i - 1].string)

    chunk = []
    if not wordO.chunk:
        chunk.append("START")
        knowledge_sources = "O"
    else:
        knowledge_sources = check_NE(convert(wordO.string), wordO.chunk)
        chunk = [w.string for w in wordO.chunk]

    stem_is_word = stemmed == word.lower()
    knowledge_sources_stemmed = _check_NE_yeah([stemmed])

    return {
        "knowledge": knowledge_sources,
        "knowledge_lemma": knowledge_sources_stemmed,
        "history": "+".join(history)[-2:],
        "pos": pos,
        "word": word,
        "stemmed": stemmed
    }
def __init__(self):
    self.tweets = 0
    self.related_tweets = 0
    self.stopwords = {}
    self.stemmers = {}
    self.stemmers["es"] = SpanishStemmer()
    self.stemmers["en"] = PorterStemmer()
    self.stemmers["fr"] = FrenchStemmer()
    self.stemmers["de"] = GermanStemmer()
    self.stopwords["es"] = self.load_stopwords_file("spanish_stopwords.txt")
    self.stopwords["en"] = self.load_stopwords_file("english_stopwords.txt")
    self.stopwords["fr"] = self.load_stopwords_file("french_stopwords.txt")
    # Use the same language key as self.stemmers ("de"); the original "ge" key
    # would never match the German stemmer's lookup key.
    self.stopwords["de"] = self.load_stopwords_file("german_stopwords.txt")
    self.output_file = open(sys.argv[2], 'a')
class CleanDoc(BaseEstimator, TransformerMixin, NoFit):
    def __init__(self):
        self.stemmer = GermanStemmer()

    def transform(self, docs):
        res = []
        for doc in docs:
            lines = doc.split("\n")
            lines = [
                " ".join(
                    self.stemmer.stem(word)
                    for word in re.findall("[a-zäöüß]{3,}", line.lower()))
                for line in lines
            ]
            res.append("\n".join(lines))
        return res
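# Usage sketch: CleanDoc keeps the line structure of each document, lower-cases
# it, keeps only runs of three or more (German) letters and stems them.
# (NoFit is the project's own mixin, presumably supplying a no-op fit() so the
# class can sit inside an sklearn Pipeline.)
cleaner = CleanDoc()
print(cleaner.transform(["Die Häuser\nsind sehr groß", "Straßenbahnen fahren"]))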
def __init__(self,
             lang,
             strip_accents=None,
             ngram_range=(1, 1),
             max_df=1.0,
             min_df=1,
             stop_words=None):
    if lang == 'de':
        self.stemmer = GermanStemmer()
    else:
        self.stemmer = EnglishStemmer()
    super(self.__class__, self).__init__(stop_words=stop_words,
                                         strip_accents=strip_accents,
                                         ngram_range=ngram_range,
                                         max_df=max_df,
                                         min_df=min_df)
def stemWord(self, word, lng):
    '''Separates the word's changeable part with a '|' for wordfast'''
    if lng == 'ru':
        stemmer = RussianStemmer()
    elif lng == 'en':
        stemmer = PorterStemmer()
    elif lng == 'de':
        stemmer = GermanStemmer()
    else:
        print('Language error. Exiting...')
        sys.exit(1)

    word = word.lower()  # otherwise the stemmer fails
    if len(word) <= 3:
        return word
    elif len(word) == len(stemmer.stem(word)):
        return "{0}|{1}".format(word[:-1], word[-1])
    else:
        return "{0}|{1}".format(word[:len(stemmer.stem(word))],
                                word[len(stemmer.stem(word)):])
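# Illustration of the '|' convention (assuming the containing class is
# instantiated as `wf`): the unchanging stem stays before the bar and the
# inflectional ending goes after it, matching Wordfast's fuzzy-term format.
#
#   wf.stemWord('Häuser', 'de')  -> 'häus|er'
#   wf.stemWord('laufen', 'de')  -> 'lauf|en'
#   wf.stemWord('Haus', 'de')    -> 'hau|s'   (stem equals the word, so only
#                                              the last character is split off)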
class Stringmatcher: def __init__(self, essay: str, name: str, gazetteer_version: int = 1): """ Initalizes the Stringmatcher. Takes a path to an essay and the gazetteer version, that should be used. See the above dict :param file_path: path to the essay that is to be processed :param gazetteer_version: the gazetteer version that should be used. See the above defined dict "version_subfolder" for what values are possible """ # Initialize data structures self.essay = essay self.essay_name = name self.gazetteer_version = gazetteer_version self.tokens_without_stopwords = [] self.found_entities = dict() self.stemmer = GermanStemmer() self.fastText_model = None self.spacy_model = None self.file_path = RESULTS_PATH + name if not os.path.exists(self.file_path): os.makedirs(self.file_path) # retrieve the gazetteers that should be used for annotation self.gazetteers = sorted([ f for f in os.listdir(PATH_GAZETTEERS + version_subfolder[gazetteer_version]) if os.path.isfile(PATH_GAZETTEERS + version_subfolder[gazetteer_version] + f) ]) print("Used gazetteers: %s" % (gazetteer_version)) # retrieve gazetteers with already preprocessed entries if available (for efficiency reasons) or create new one if os.path.isfile(PATH_GAZETTEERS + "tokenized_gazetteers"): self.tokenized_gazetteers = pickle.load( open(PATH_GAZETTEERS + "tokenized_gazetteers", "rb")) else: self.tokenized_gazetteers = dict() changed = False for gazetteer_filename in self.gazetteers: # if there is not already a tokenized version of this gazetteer, tokenize it if not gazetteer_filename in self.tokenized_gazetteers.keys(): self.tokenized_gazetteers[ gazetteer_filename] = self.tokenize_gazetteer( gazetteer_filename) changed = True if changed: pickle.dump(self.tokenized_gazetteers, open(PATH_GAZETTEERS + "tokenized_gazetteers", "wb")) def tokenize(self): """ tokenizes the complete essay and stores one version with stopwords and one that has neither stopwords nor special characters (except for '-' and '_') The latter one will be used for the extraction of n-grams """ # store tokens without stopwords if not self.essay.strip(): return ([], 0) stop_words = set( stopwords.words('german') ) # we remove stopwords, because they have little meaning. Thus we gain some efficiency tokenizer = RegexpTokenizer( r'[\w-]+' ) # finds all strings with alphanumerical characters or with '-' or '_' (the latter implicitly through \w) tokenized_essay = tokenizer.tokenize(self.essay) self.tokens_all = tokenized_essay # token_offsets = list(tokenizer.span_tokenize(self.essay)) # print(tokenized_essay) # print(token_offsets) # print(len(tokenized_essay)) # print(len(token_offsets)) # self.token_offsets = token_offsets # give every token an index for later backmapping tokenized_enumerated_essay = [ t for t in enumerate(tokenized_essay, start=0) ] # filter out stopwords self.tokens_without_stopwords = [] for token in tokenized_enumerated_essay: if token[1].lower() not in stop_words: self.tokens_without_stopwords.append( (token[0], token[1].lower())) def get_n_grams_up_to_n(self, words: list, N: int = 3): """ returns a list of all n_grams with n <= N Example for N=3: [[(idx1, t1)], [(idx2, t2)], [(idx3, t3)], [(idx1, t1), (idx2, t2)], [(idx2, t2), (idx3, t3)], [(idx1, t1), (idx2, t2), (idx3, t3)]] :param words: list of tokens to form n_grams from :param N: specifies the maximum n-gram size :return: list of n-grams """ n_grams_all = [] for n in range(1, N + 1): # is is the start word-index of each possible n-gram (note: not the character index!) 
n_grams = [words[i:i + n] for i in range(0, len(words) - n + 1)] n_grams_all.extend(n_grams) return n_grams_all def tokenize_gazetteer(self, gazetteer: str): """ creates a tokenized representation of a gazetteer :param gazetteer: the gazetteer to be tokenized :return: list containing the original and preprocessed gazetteer entries """ tokenized_gazetteer = [] with open( PATH_GAZETTEERS + version_subfolder[self.gazetteer_version] + gazetteer, "r") as gazetteer_f: for line in gazetteer_f: # keep the original and preprocessed version of this line tokenized_gazetteer.append((line, self.tokenize_line(line, 0)[0])) return tokenized_gazetteer def tokenize_line(self, line: str, start_index: int = 0): """ takes a string, and returns an variant with enumerated tokens, buth without stopwords and most special characters. tokens themselves are of the form (index, string), where index refers to its position in the input file :param line: line to be tokenized :param start_index: start index of the enumeration of returned tokens :return: returns a tuple of 1. List of tokens, that are enumerated for later backmapping, starting with the given start index 2. The next unused (free) index """ if not line.strip(): return ([], start_index) stop_words = set( stopwords.words('german') ) # we remove stopwords, because they have little meaning. Thus we gain some efficiency tokenizer = RegexpTokenizer( r'[\w-]+' ) # finds all strings with alphanumerical characters or with '-' or '_' (the latter implicitly through \w) tokenized_line = tokenizer.tokenize(line) # give every token an index for later backmapping tokenized_enumerated_line = [ t for t in enumerate(tokenized_line, start=start_index) ] next_index = tokenized_enumerated_line[-1][0] + 1 # filter out stopwords filtered_tokens = [] for token in tokenized_enumerated_line: if token[1].lower() not in stop_words: filtered_tokens.append((token[0], token[1].lower())) return (filtered_tokens, next_index) def match(self, stemmer: bool = False, similarity: float = 0, method: str = "exact", semantic_cmp: bool = False, oov: bool = True, store_responsible_word: bool = False): """ Annotates the essay that is associated with this stringmatcher object with the specified parameters :param stemmer: indicates whether or not the terms should be stemmed before comparison :param similarity: Similarity threshold, indicating how similar two strings have to be for a match. Matching based on similarity happens only if similarity is not 0 :param method: A method name to refer to this setup :param semantic_cmp: if flag is set, compares based on embeddings :param oov: if flag is set, fastText with cosine similarity (embeddings_oov) is used instead of spacy (embeddings). :param store_responsible_word: if flag is set, keeps the information of what phrase was detected as ME with the help of what gazetteer entry. :return: a list of all detected MEs (including stopwords) """ # "matches" is a list, that contains a tuple with the original n_gram and a set, that stores in which gazetteers it occurred # therefore it looks like this: [([(index1, word1), (index2, word2), ...], {NETDOK, ...}), ...] 
matches = [] # define how large the n_grams should be n = 3 # get essay n_grams essay_n_grams = self.get_n_grams_up_to_n(self.tokens_without_stopwords, n) # add a set to each n_gram, that keeps track of which gazetteer detects that n_gram essay_n_grams = [(n_gram, set()) for n_gram in essay_n_grams] if not method in self.found_entities.keys(): self.found_entities[method] = (self.tokens_all, set()) # find gazetteer entries in essays for gazetteer_filename in self.gazetteers: gazetteer = self.tokenized_gazetteers[gazetteer_filename] # compare every gazetteer entry to all essay n_grams for gaz_idx, (gaz_ME_string, gaz_n_gram) in enumerate(gazetteer): for essay_n_gram, matched_gazetteers in essay_n_grams: matched = False # compare essay- and gazetteer n_gram with given parameters if self.compare_n_grams( essay_n_gram, gaz_n_gram, similarity, semantic_cmp=semantic_cmp, oov=oov, store_responsible_word=store_responsible_word, gazetteer_filename=gazetteer_filename, method_name=method, stemmer=stemmer): matched = True # check whether the n_gram has been found in a previous gazetteer. # If that is the case, do not add the n_gram, but instead just update the set of gazetteers that is associated with this essay n_gram if matched and len(matched_gazetteers) == 0: # this is the first time this n_gram has been seen in a gazetteer, so append it to matches matched_gazetteers.add(gazetteer_filename) matches.append((essay_n_gram, matched_gazetteers)) elif matched: # this n_gram has been seen in a previous gazetteer already, therefore just update the set of gazetteers, in which it occured for n_gram, gaz in matches: if n_gram == essay_n_gram: gaz.add(gazetteer_filename) if matched: # finally, store the line index of the gazetteer entry that matched for later statistics entity_information = (tuple(essay_n_gram), gaz_ME_string.strip(), gaz_idx + 1, gazetteer_filename) if not entity_information in self.found_entities[ method][1]: self.found_entities[method][1].add( entity_information) # filter out sub_n_grams filtered_n_grams = self.filter_matches(matches=matches, n=n) # extend the n_grams to include stopwords again medical_terms = self.get_whole_terms(filtered_n_grams) # create an annotated version of the essay self.annotated_essay = self.annotate_essay(medical_terms, method=method) return self.found_entities def filter_matches(self, matches: list, n: int): """ filters and keeps only the biggest n_grams found in each gazetter. Discards those that are part of bigger n_grams note that elements of the returned list are now not tuples of the form (n_gram, set_of_gazetteers) as in the input, but instead (n_gram, gazetter) i.e. a tuple is not anymore stored with a set of gazetters but with one concrete gazetter (and thus may appear several times in the list) therefore the list may be larger afterwards, but n_grams are directly mapped to the gazetter they were found with This allows us to define something like a subset relation on these tuples in order to only keep the largest ones E.g. ([(1, "asthma")], NetDok) would be a subset of ([(1, "asthma"), (2, "bronchiale")], NetDok) but it would not be a subset of ([(1, "asthma"), (2, "bronchiale")], Wiki), as they were not found with the same gazetteer :param matches: list of tuples, which contain an n_gram and the gazetteer they were found with, i.e. (n_gram, set_of_gazetteers) n_grams have the form [(index1, word1), (index2, word2), ...] 
where index refers to the position of the token in the input file, word_i is the i'th word of the n_gram :param n: Indicates the maximum size of n_grams """ filtered_n_grams = [] # seen_idx_gaz is a list storing tuples of the form (idx, gazetteer) where index is the position of the word in the complete essay # and gazetteer is a set, storing the gazetteers in which the word occurred in seen_idx_gaz = set() # process n_grams with decreasing size for n in range(n, 0, -1): for n_gram, gazetteers in matches: for gaz in gazetteers: # process only n_grams of the current size if len(n_gram) == n: # check whether one of the words has already been seen in a previous (larger) n_gram using its index/position # keep the largest complete n_grams that were found for each gazetteer, discard all sub_n_grams already_seen = False temp_idx_gaz = set() for index, word in n_gram: if (index, gaz) in seen_idx_gaz: already_seen = True temp_idx_gaz.add((index, gaz)) if not already_seen: filtered_n_grams.append((n_gram, gaz)) seen_idx_gaz = seen_idx_gaz.union(temp_idx_gaz) return filtered_n_grams def get_whole_terms(self, filtered_n_grams: list): """ adds previously deleted stopwords to the n_grams again. :param filtered_n_grams: The list of n_grams to enrich with stopwords again. n_grams are here not only the n_grams of the form [(idx1, t1), (idx2, t2), etc...] but instead are tuples (n_gram, gazetteer), where "n_gram" has the format above :return: The list of (n_gram, gazetteer)-tuples that now also include the stopwords """ # find all contiguous words, that appear in the range of the n_gram tokens_all_words = self.tokens_all medical_terms = [] for n_gram, gazetteer in filtered_n_grams: # for each n_gram, retrieve all words between the index of the first and the last word of the n_gram start_idx = n_gram[0][0] end_idx = n_gram[-1][0] + 1 medical_term = [(idx, tokens_all_words[idx]) for idx in range(start_idx, end_idx)] medical_terms.append((medical_term, gazetteer)) return medical_terms def annotate_essay(self, medical_terms: list, method: str): """ takes a list of n_grams/medical_terms and stores an annotated version of the original essay Annotations indicate in which gazetteer the word was found in and which ME it is part of using a unique ID For the text "Asthma Bronchiale is a terrible disease", where the word "Asthma" was found in gazetteer 1 and "Asthma Bronchiale" was found in gazetteer 2, the annotated text would look the like the following: "asthma {(gazetteer1, 1), (gazetteer2, 2)} Bronchiale {(gazetteer2, 2)} is a terrible disease" :param medical_terms: list of medical terms, represented as lists of tuples [(term1, gazetteer1), ...], terms are represented as lists of the form [(index1, word1), (index2, word2), ...], index refers to the word's position in the essay :param method: Name for the used matching setup. 
Used to derive the name of the annotated essay """ # read all words from the original essay tokens_all_words = self.tokens_all[:] medical_terms.sort(key=lambda ngram_gaz: ngram_gaz[0][0]) # store all word indices that are part of a detected ME to later add annotations to them marked_words = set() for term in medical_terms: for idx_words in term[0]: marked_words.add(idx_words[0]) # iterate over all tokens of the essay for position in range(0, len(tokens_all_words)): # if this position/token is part of a medical_term/ME, append an annotation if position in marked_words: tokens_all_words[position] = tokens_all_words[position] + "\t{" # the index at which the medical term appears in the input list is also its ID for term_id in range(0, len(medical_terms)): # annotate the token at this position, if it is in the range of the first and the last word_index of the current medical_term/ME if position in range(medical_terms[term_id][0][0][0], medical_terms[term_id][0][-1][0] + 1): tokens_all_words[position] += "(%s, %s), " % ( medical_terms[term_id][1], term_id) tokens_all_words[position] = tokens_all_words[position].rstrip( ", ") + "}" # store the annotations in a file annotated_essay_name = self.essay_name + "_annotated_" + method + ".txt" with open(self.file_path + "/" + annotated_essay_name, "w") as essay_annotated: for token in tokens_all_words: essay_annotated.write(token + "\n") def compare_n_grams(self, n_gram1: list, n_gram2: list, similarity: float = 0, semantic_cmp: bool = False, oov: bool = True, store_responsible_word: bool = False, gazetteer_filename: str = "-", method_name: str = "", stemmer=False): """ takes two n_grams and returns True if all words at all indices of both n_grams satisfy the similarity constraints n_grams for this method are of the form [(index1, word1), (index2, word2), ...] :param n_gram1: The first n_gram to compare :param n_gram2: The second n_gram to compare :param similarity: similarity threshold. Uses similarity based comparison if in the range (0,1]. Uses exact or stemmed matching if 0 :param semantic_cmp: Compares based on semantic/embeddings if flag is set, based on lexical similarity otherwise. No effect if similarity = 0 :param oov: compares with FastText's model (embeddings_oov) if flag is set, else with Spacy's model (embeddings) :param store_responsible_word: if flag is set, keeps the gazetteer entries that are responsible for a match :param gazetteer_filename: the gazetteer's name from which the second n_gram was created. 
Used to keep the mapping of a match to the responsible gazetteer entry :param method: method name, used for correct mapping of method to match for later statistics :param stemmer: compares stemmed versions of the n_grams if flag is set :return: True if a match occurred """ # check for same length if not n_gram1 or not n_gram2 or not len(n_gram1) == len(n_gram2): return False same = True for i in range(len(n_gram1)): # exact matching if not similarity and not stemmer and not semantic_cmp and n_gram1[ i][1] != n_gram2[i][1]: same = False # exact matching with stemming elif not similarity and stemmer and not semantic_cmp and self.stemmer.stem( n_gram1[i][1]) != self.stemmer.stem(n_gram2[i][1]): same = False # lexical similarity with stemming elif similarity and stemmer and not semantic_cmp and not SequenceMatcher( None, self.stemmer.stem(n_gram1[i][1]), self.stemmer.stem(n_gram2[i][1])).ratio() >= similarity: same = False # lexical similarity matching elif similarity and not stemmer and not semantic_cmp and not SequenceMatcher( None, n_gram1[i][1], n_gram2[i][1]).ratio() >= similarity: same = False # semantic similarity matching elif similarity and not stemmer and semantic_cmp and not self.embedding_similarity( n_gram1[i][1], n_gram2[i][1], oov=oov) >= similarity: same = False #if store_responsible_word and same: # keep information of this match # self.store_responsible(n_gram1, n_gram2, gazetteer_filename, similarity, semantic_cmp, oov, method_name) return same def match_char_based(self, similarity: float = 0.9, store_responsible_word: bool = False): """ matches based on similarity of character sequences :param similarity: the similarity threshold, which is a value in the range of (0,1]. If two terms have a similarity above this threshold, a match occurs :param store_responsible_word: flag is propagated and indicates that matching information should be kept """ method = "char_based_%1.2f" % similarity #self.store_method_metadata(method) return self.match(similarity=similarity, method=method, semantic_cmp=False, store_responsible_word=store_responsible_word) def match_embeddings(self, similarity: float = 0.6, oov: bool = True, store_responsible_word: bool = False): """ matches based on semantic similarity, a.k.a. 
similarity of word embeddings :param similarity: similarity threshold in the range of (0,1] :param oov: if flag is set, FastText's model (embeddings_oov) is used, otherwise Spacy :param store_responsible_word: flag is propagated and indicates that matching information should be kept """ if oov: method = "embeddings_oov_%1.2f" % similarity else: method = "embeddings_%1.2f" % similarity return self.match(similarity=similarity, method=method, semantic_cmp=True, oov=oov, store_responsible_word=store_responsible_word) def match_stemmed(self, store_responsible_word: bool = False): """ matches based on exact string matching with stemming :param store_responsible_word: flag is propagated and indicates that matching information should be kept """ method = "stemmed" return self.match(stemmer=True, method=method, store_responsible_word=store_responsible_word) def match_stemmed_char_based(self, similarity: float = 0.95, store_responsible_word: bool = False): """ matches based on a combination of char_based and stemming :param store_responsible_word: flag is propagated and indicates that matching information should be kept """ method = "stemmed_char_based_%1.2f" % similarity return self.match(stemmer=True, similarity=similarity, method=method, semantic_cmp=False, store_responsible_word=store_responsible_word) def match_exact(self, store_responsible_word: bool = False): """ matches with comparison based on exact string matching :param store_responsible_word: flag is propagated and indicates that matching information should be kept """ method = "exact" return self.match(method=method, store_responsible_word=store_responsible_word) def embedding_similarity(self, word1: str, word2: str, oov: bool = True): """ accepts two strings and returns the similarity between both words according to their embeddings :param oov: if flag is set, uses the fastText model with cosine similarity. Otherwise a spacy model is used to for comparison """ # use fasttext if oov: # load model if this is the first comparison if not self.fastText_model: self.fastText_model = fastText.load_model(MODEL_PATH) # get embeddings embedding1 = self.fastText_model.get_word_vector(word1).reshape( 1, -1) # reshape needed to satisfy cosine function below embedding2 = self.fastText_model.get_word_vector(word2).reshape( 1, -1) similarity = cosine_similarity(embedding1, embedding2) # use spacy else: if not self.spacy_model: self.spacy_model = spacy.load(SPACY_EMBEDDINGS) embedding1 = self.spacy_model(word1) embedding2 = self.spacy_model(word2) similarity = embedding1.similarity(embedding2) return similarity
test_df.reset_index(inplace=True)
print test_df.isnull().sum()
print 'Unique restaurants: {}'.format(len(data['restaurant_id'].unique()))
print 'Unique menu_category: {}'.format(len(data['menu_category'].unique()))
print 'Unique product_name: {}'.format(len(data['product_name'].unique()))
print 'Unique ingredients: {}'.format(len(data['ingredients'].unique()))
print test_df.shape

encode_menu = test_df['menu_category'].str.encode('ascii', errors='ignore')
print len(encode_menu.unique())
encode_menu.replace({r'[^a-zA-Z0-9\s,]': ''}, regex=True, inplace=True)
print len(encode_menu.unique())
encode_menu = encode_menu.apply(lambda x: GermanStemmer().stem(x))
print len(encode_menu.unique())

encode_name = test_df['product_name'].str.encode('ascii', errors='ignore')
print len(encode_name.unique())
encode_name.replace({r'[^a-zA-Z0-9\s,]': ''}, regex=True, inplace=True)
print len(encode_name.unique())
encode_name = encode_name.apply(lambda x: GermanStemmer().stem(x))
print len(encode_name.unique())

# X = pd.concat([encode_menu, encode_name, test_df['restaurant_id'].astype('str')], axis=1)
# le = preprocessing.LabelEncoder()
# X_2 = X.apply(le.fit_transform)
# print X_2.head()
# print X_2.shape
def __init__(self): self.stemmer = GermanStemmer()
class Document: """ Represents a complete document. Attributes ---------- string: str Raw string contents of the file represented by this document. lang: str Language identifier, reflects classification of `langid` module. name: str Filename of source file, without extension. ID: str Alias for `name`. tokens: list[str] Tokens in document, generated by standard NLTK tokenizer. stems: list[str] Tokens after stemming. length: int Length of document. hashes: list[int] Representation of document as list of stems, hashed for efficiency. sents: list[Sentence] Sentences in this document. freq_dist: dict[int, int] Frequency distribution for this document, mapping hashes of stems to their frequencies. """ string = str() _stemmer = None def __init__(self, filename): """ Parameters ---------- filename: str Path to the (plaintext) file for this document. """ # open file with utf-8-sig to remove any BOMs with open(filename, "r", encoding="utf-8-sig", errors="ignore") as infile: self.string = clean_whitespace(infile.read()) self.lang = langid.classify(self.string)[0] if self.lang == 'de': self._stemmer = GermanStemmer() elif self.lang == 'en': self._stemmer = EnglishStemmer() else: print("no stemmer for '{}'".format(self.lang)) print("falling back to 'de'...") self._stemmer = GermanStemmer() self.name = os.path.splitext(os.path.split(filename)[1])[0] self.ID = self.name self.tokens = word_tokenize(self.string) self.stems = list(map(self.stem, self.tokens)) self.length = len(self.tokens) self.hashes = list(map(hash, self.stems)) self.sents = self._get_sents() self.freq_dist = dict(Counter(self.hashes)) def __repr__(self): return "<Document {0}...{1}>".format(self.name[:12], self.name[-5:]) def __eq__(self, other): return self.hashes == other def __hash__(self): return hash(self.name) def stem(self, s): """ Stem token `s` with appropriate stemmer. Notes ----- Running the result through the ASCII encoder ensures that no weird characters end up in the final stem (such as unusual space characters, which one might otherwise overlook). Examples -------- >>> d = Document("somefile.txt") >>> d.stem("Versicherungen") 'versicher' """ return self._stemmer.stem(s).encode('ascii', errors='ignore').decode() def _get_sents(self): """ Determine sentences in document according to locations of sentence boundary punctuation. Assumes that abbreviations, etc., have already been approprately tokenized. """ id_counter = 0 my_sents = [] last_start = 0 for i, h in enumerate(self.hashes): if h in SENT_PUNCT and i - last_start > 1: id_counter += 1 my_sents.append(Sentence("{0}_{1}".format(self.name, id_counter), self, (last_start, i + 1))) last_start = i + 1 if self.length - last_start > 0: id_counter += 1 my_sents.append(Sentence("{0}_{1}".format(self.name, id_counter), self, (last_start, self.length))) return my_sents
# In[3]:


def subwords(word):
    return [word[:2], word[2:]]


# In[27]:


stem = GermanStemmer().stem

cnt_vect_splits = [
    ("short", lambda doc: [line for line in doc if len(line) <= 1], {}),
    ("long", lambda doc: [line for line in doc if len(line) > 1], {}),
    ("subwords", lambda doc: [
        list(map(stem, concat(subwords(word) for word in line)))
        for line in doc
    ], {
        "ngram_range": (1, 1)
    }),
]

doc_funcs = [
    ("num_char", lambda doc: len(re.findall("[A-Za-zäöüÄÖÜß]", doc))),
]
class StringHandler:
    _STEMMER = GermanStemmer()
    _P_SIMILARITY_THRESHOLD: float = 0.9

    def __init__(self, string_series: pd.Series):
        self._ds = string_series.str.lower()
        self.ds_origin = string_series

    def optimize(self):
        self.remove_noise()
        self.split_text()
        self.build_sentence()
        self.stem_words()
        # self.correct_spelling()

    def reset(self):
        self.ds = self.ds_origin.copy()

    # string manipulation ##################################
    def stem_words(self):
        self.ds = self.ds.apply(StringHandler.stem_sentence)

    def split_text(self):
        self.ds = self.ds.str.split(' ')

    def remove_noise(self):
        self.ds = self.ds.str.replace(r'[^a-zA-Z0-9]', ' ')

    # remove leftover isolated substrings that are not words/digits
    def build_sentence(self):
        self.ds = self.ds.apply(
            lambda x: ' '.join(word.strip() for word in x if word))

    # nlp manipulation ##################################
    def correct_spelling(self):
        uniques = self.get_unique_series
        uniques.apply(lambda x: list(
            i for i in uniques
            if i != x and SequenceMatcher(None, x, i).ratio() > 0.9))

    @classmethod
    def stem_sentence(cls, sentence: str, split_char: str = ' '):
        return ' '.join(
            cls._STEMMER.stem(word) for word in sentence.split(split_char))

    # properties ##################################
    @property
    def get_unique_series(self):
        return pd.Series(self.ds.unique()).sort_values().reset_index(drop=True)

    @property
    def ds(self):
        return self._ds

    @ds.setter
    def ds(self, ds: pd.Series):
        if isinstance(ds, pd.Series) and not ds.empty:
            self._ds = ds
        else:
            raise TypeError('Wrong variable type or empty series')
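# Usage sketch for StringHandler on a small, made-up pandas Series of German
# product names. optimize() runs the noise-removal / split / re-join / stemming
# chain over the lower-cased copy kept in `ds`; the original series stays
# available as `ds_origin`.
import pandas as pd

names = pd.Series(["Gemüse-Suppe", "Gemüsesuppen", "Brot und Brötchen"])
handler = StringHandler(names)
handler.optimize()
print(handler.ds)                 # cleaned and stemmed strings
print(handler.get_unique_series)  # sorted unique values of the current series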
import nltk

text = "She looked at her father's arm-chair."
text_fr = "Qu'est-ce que c'est?"

text.split(' ')
text_fr.split(' ')

from sklearn.feature_extraction.text import CountVectorizer
CountVectorizer().build_tokenizer()(text)

from nltk.tokenize import word_tokenize
word_tokenize(text)

#from nltk.tokenize.punkt import PunktWordTokenizer
#tokenizer = PunktWordTokenizer()
#tokenizer.tokenize(text)

# Stemming
from nltk.stem.snowball import GermanStemmer
stemmer = GermanStemmer()
words = ["Wald", "Walde", "Wälder", "Wäldern", "Waldes", "Walds"]
stemmer.stem("Waldi")
#[stemmer.stem(w) for w in words]

# Chunking
import os
import numpy as np

corpus_path = os.path.join('/Users/Gourhari/Documents/Py/data', 'french-tragedy')
sorted(os.listdir(corpus_path))[0:5]
tragedy_filenames = [os.path.join(corpus_path, fn)
                     for fn in sorted(os.listdir(corpus_path))]


def split_text(filename, n_words):
    """Split a text into chunks approximately `n_words` words in length."""
    input = open(filename, 'r')
    words = input.read().split(' ')
import re

import preprocess_files
from nltk.stem.snowball import GermanStemmer

gs = GermanStemmer()
punctuations = '''!()-[]{};:'"\,<>/?@#$%^&*_~'''


def match_synms(tokens):
    syn_dict = preprocess_files.read_synms_list()
    for t in tokens:
        for (idx, val) in enumerate(t):
            if val in syn_dict:
                t[idx] = syn_dict[val]
    return tokens


def _remove_punctuation(tokens):
    tokens_filt = []
    for gT in tokens:
        if gT not in punctuations:
            tokens_filt.append(gT)
    return tokens_filt


def _remove_stopwords(tokens):
    '''Remove stop words from an array of tokens'''
    stopWords = ['the', 'to', '-', 'pr', 'der', 'is', 'of', 'die', 'in',
                 'and', 'und', '–', '•', '✔', '●', 'a']
# The Snowball stemmer, which is based on the Snowball stemming algorithm, can be used in NLTK like this:
from nltk.stem import SnowballStemmer
print(" ".join(SnowballStemmer.languages))


# In[20]:


snowball_stemmer = SnowballStemmer('english')
#snowball_stemmer.stem('maximum')
#snowball_stemmer.stem('presumably')
print(snowball_stemmer.stem('computing'))
print(snowball_stemmer.stem('nationality'))


# In[21]:


from nltk.stem.snowball import GermanStemmer
stemmer = GermanStemmer()
stemmer.stem("Autobahnen")


# In[22]:


# for more details and examples see http://www.nltk.org/api/nltk.tokenize.html1


# In[33]:


from nltk.corpus import stopwords
nltk.download("stopwords")

text = "Sachin Ramesh Tendulkar (/ˌsətʃɪn tɛnˈduːlkər/ (About this sound listen); born 24 April 1973) is a former Indian cricketer and a former captain, regarded as one of the greatest batsmen of all time.[4] The highest run scorer of all time in International cricket, Tendulkar took up cricket at the age of eleven, made his Test debut on 15 November 1989 against Pakistan in Karachi at the age of sixteen, and went on to represent Mumbai domestically and India internationally for close to twenty-four years. He is the only player to have scored one hundred international centuries, the first batsman to score a double century in a One Day International, the holder of the record for the most number of runs in both ODI and Test cricket, and the only player to complete more than 30,000 runs in international cricket.[5]"

stopwordsList = set(stopwords.words('english'))
stopwordsListUpdated = list(stopwordsList)
punctuationList = ["{", "}", "]", "[", ",", ";", ".", "/"]
stopwordsListUpdated += punctuationList
def tokenize_and_stem_german(text):
    stemmer = GermanStemmer()
    return [clean_token(stemmer.stem(token)) for token in tokenize(text)]
def get_stem_relations(sentences, gn): """Gets verb-noun relations between two sentences. Returns Array of word-pairs between two sentences """ # Init word pairs word_pairs = [] # Init stemmer stemmer = GermanStemmer(ignore_stopwords=True) # Loop over every sentence for val, sentence in enumerate(sentences): # Is current sentence not the last # sentence? If so carry on if val != (len(sentences) - 1): # Get stems of all words in current sentence stems_next_sentence = map(lambda x: stemmer.stem(x['lemma']), sentences[val + 1]) # Nouns in next sentence nouns_next_sentence = [ word['lemma'] for word in sentences[val + 1] if word['noun'] ] # Nouns of current sentence words_current_sentence = [ word for word in sentence if word['noun'] ] # Loop over every word in current sentece for word in sentences[val]: # Stem of current word stem_current_word = stemmer.stem(word['lemma']) # Is the stemmed word in the next sentence, great. # If word is a lame 'sein', ignore it if (stem_current_word in stems_next_sentence) and word['lemma'] != 'sein': # Get index of stem that is related to current word index_word_next_sentence = stems_next_sentence.index( stem_current_word) # Corresponding word in next sentence corresponding_word = sentences[val + 1][index_word_next_sentence] # Only add word pairs if verb or noun if word['noun'] or word['verb']: # Get dictionary of word in next sentence dict_next = sentences[val + 1][index_word_next_sentence] # We do not want to combine words # that have the same grammatical function # A noun should not be combined with a noun # We are only interested in verb-noun relations if word['verb'] and dict_next['noun']: # Get all combinations of corresponding noun # in next sentence an all nouns in current sentence for wordCurrent in words_current_sentence: # Append to list word_pairs.append({ 'source': { 'word': corresponding_word['orth'], 'lemma': corresponding_word['lemma'], 'sentence': val }, 'target': { 'word': wordCurrent['orth'], 'lemma': wordCurrent['lemma'], 'sentence': val + 1 }, 'device': 'verb noun relation' }) # Current word is noun and corresponding word is # verb elif word['noun'] and dict_next['verb']: # Get all combinations of of noun in this sentence # with nouns in next sentence for wordNext in sentences[val + 1]: # Do not use stupid 'sein' if wordNext['noun']: # Append to list word_pairs.append({ 'source': { 'word': word['orth'], 'lemma': word['lemma'], 'sentence': val }, 'target': { 'word': wordNext['orth'], 'lemma': wordNext['lemma'], 'sentence': val + 1 }, 'device': 'noun verb relation' }) return word_pairs
def rootform(infinite):
    stemmer = GermanStemmer()
    return stemmer.stem(infinite)
import logging

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
from boilerpipe.extract import Extractor
from nltk.stem.snowball import GermanStemmer
from nltk import word_tokenize
import nltk.data
import os
import re

logger = logging.getLogger(__name__)
logging.getLogger('pdfminer').setLevel(logging.CRITICAL)

satztokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
stemmer = GermanStemmer()
stoppwörter = []

'''Loads the stopword list'''
with open('traindata/german', 'r') as f:
    for line in f:
        wort = line.split('\n')[0]
        stoppwörter.append(wort.lower())


def preprocess(text):
    '''Filtering rules to normalize the text.'''
    try:
        text = re.sub(
            "/innen|\*innen|/-innen", "innen",
            text)  # Unifies the different gender-suffix variants
        text = re.sub("-\s*\n", "", text)  # Removes hyphenation at line breaks
def main(): now = datetime.now() page_dates = [] revision_dates = [] corpora = defaultdict(lambda: set()) stemmer = GermanStemmer() for page, revisions in MediawikiDump(sys.stdin).iterpages(): timestamp = revisions[0]['timestamp'] page_dates.append(timestamp) first = revisions[0] stems = set() for year in xrange(first['timestamp'].year, now.year+1): revisions_in_year = [r for r in revisions if r['timestamp'].year == year] revision_dates.extend(r['timestamp'] for r in revisions_in_year) if revisions_in_year: stems = set() for revision in revisions_in_year: html = WikiMarkup(revision['text'].encode('utf-8')).render() text = clean_html(html.decode('utf-8')) # TODO: remove remaining markup words = WORD_RE.findall(text) stems.update(stemmer.stem(word) for word in words) corpora[year].update(stems) page_dates.sort() revision_dates.sort() delta = relativedelta(revision_dates[-1], revision_dates[0]) months = delta.years * 12 + delta.months outdir = os.path.abspath('./out') if not os.path.exists(outdir): os.mkdir(outdir) fig = pyplot.figure() ax = fig.add_subplot(111) ax.plot_date(page_dates, range(1, len(page_dates)+1), '-') ax.hist(date2num(revision_dates), months, histtype='step') ax.set_xlabel(u'Year') ax.legend([u'Total No. of Pages', u'New Revisions per month']) fig.autofmt_xdate() fig.savefig(os.path.join(outdir, 'pages.png'), format='png') fig = pyplot.figure() ax = fig.add_subplot(111) timestamps = [datetime(year, 12, 31) for year in sorted(corpora.keys())] counts = [len(corpora[t.year]) for t in timestamps] ax.plot_date(timestamps, counts, '-') ax.set_xlabel(u'Year') ax.legend([u'No. of distinct tokens']) fig.autofmt_xdate() fig.savefig(os.path.join(outdir, 'tokens.png'), format='png') years = sorted(corpora.keys()) years = range(years[0], years[-1]) year_pairs = zip(years, years[1:]) for year1, year2 in year_pairs: current = corpora.get(year1, set()) next_ = corpora.get(year2, set()) filename = '{:04d}-{:04d}.diff'.format(year1, year2) with open(os.path.join(outdir, filename), 'w') as f: for token in sorted(current - next_): f.write(u'-{}\n'.format(token).encode('utf-8')) for token in sorted(next_ - current): f.write(u'+{}\n'.format(token).encode('utf-8'))
def load_stemmer(self):
    self._stemmer = None
    if self._stemming_lang == Language.GERMAN:
        self._stemmer = GermanStemmer()
    else:
        self._stemmer = EnglishStemmer()
def germanMapping(pattern,uri): array = pattern.split(" ") from nltk.stem.snowball import GermanStemmer stemmer = GermanStemmer() if len(array)== 3: if " n nn " in array[1]: marker = "" term = array[1] if " x " not in term and " y " not in term: term = term.split(" ")[1] return [NounPPFrame(term,uri,marker)] if " v vvfin " in array[1]: marker = "" term = array[1] if " x " not in term and " y " not in term: term = term.split(" ")[1] return [TransitiveFrame(term, uri,marker),TransitiveFrame(stemmer.stem(term), uri,marker)] if " v vvpp " in array[1]: marker = "" term = array[1] if " x " not in term and " y " not in term: term = term.split(" ")[1] return [AdjectivePredicateFrameMit(term,uri)] if len(array)==4: if " n nn " in array[1] and " prep appr " in array[2]: marker = array[2] if " x " not in marker and " y " not in marker: marker = marker.split(" ")[1] term = array[1] if " x " not in term and " y " not in term: term = term.split(" ")[1] return [NounPPFrame(term,uri,marker)] if " v vvfin " in array[1] and " n nn " in array[2]: marker = "" term = array[1] if " x " not in term and " y " not in term: term = term.split(" ")[1] return [TransitiveFrame(term, uri,marker),TransitiveFrame(stemmer.stem(term), uri,marker)] #return [TransitiveFrame(term, uri,marker)] if " v vvpp " in array[1] and " prep appr " in array[2]: marker = array[2] if " x " not in marker and " y " not in marker: marker = marker.split(" ")[1] term = array[1] if " x " not in term and " y " not in term: term = term.split(" ")[1] return [AdjectivePredicateFrameMarker(term,uri,marker)] if len(array) == 5: if " v vvpp " in array[1] and " prep appr " in array[2]: marker = array[2] if " x " not in marker and " y " not in marker: marker = marker.split(" ")[1] term = array[1] if " x " not in term and " y " not in term: term = term.split(" ")[1] return [AdjectivePredicateFrameMarker(term,uri,marker)] if " nn " in array[2] and " prep " in array[3]: marker = array[3] if " x " not in marker and " y " not in marker: marker = marker.split(" ")[1] term = array[2] if " x " not in term and " y " not in term: term = term.split(" ")[1] return [NounPPFrame(term,uri,marker)] return []
                               restore_best_weights=True)

model.fit(X_training,
          y_training_one_hot,
          epochs=200,
          batch_size=64,
          callbacks=[early_stopping])
model.save(os.path.join("Data", "models", "rnn_intent_classification.h5"))

predictions = model.predict(X_training)
locations = np.argmax(predictions, 1)
print(confusion_matrix(y_training, locations))
print(accuracy_score(y_training, locations))

stemmer = GermanStemmer()

# with open(os.path.join("Data", "commands", "stopwords.txt"), "rt") as f:
#     stopwords = set(f.read().splitlines())

while True:
    c = input("Your Input:")
    if c == "q":
        break
    c = " ".join(sorted([stemmer.stem(x) for x in word_tokenize(c.lower())]))
    c = np.array([transform_command(c)])
    prediction = model.predict(c)
    out_index = np.argmax(prediction)
def set_stemmer(stemmer_language):
    if stemmer_language == "GER":
        stemmers = GermanStemmer()
    else:
        stemmers = EnglishStemmer()
    return stemmers
def stem_words(self, words):
    stemmer = GermanStemmer()
    stemmed_words = []
    for word in words:
        stemmed_words.append(stemmer.stem(word))
    return stemmed_words