Example #1
    def __init__(self, BASEDIR, session_only=False, cycle_time=1):
        super().__init__(BASEDIR, session_only, cycle_time)
        self.name = 'contentrank'

        mapper = Mapping()
        self.rec_mapping = mapper.get_header_rec()
        self.event_mapping = mapper.get_header_event()
        self.update_mapping = mapper.get_header_update()
        self.item_id_idx = self.rec_mapping.index('ITEM_SOURCE')
        self.publisher_id_idx = self.rec_mapping.index('PUBLISHER')
        self.recs_idx = self.event_mapping.index('recs')
        self.limit_idx = self.rec_mapping.index('limit')
        self.title_idx = self.update_mapping.index('title')
        self.text_idx = self.update_mapping.index('text')
        self.update_id_idx = self.update_mapping.index('id')
        self.update_domainid_idx = self.update_mapping.index('domainid')

        self.germanStemmer = GermanStemmer(ignore_stopwords=True)
        self.stopwords = stopwords.words('german')
        self.stems = {}  # (item, [stem, stem, stem])

        self.correct = 0
        self.total_events = 0
        self.nrrows = 0

        self.counts = {}
Example #2
def build_stems(pattern: str, category: Category,
                elements: List[Tuple[Category, Set[str]]],
                total_stems: Set[str]) -> Set[str]:
    """
    Builds a set of stems for all words used in the pattern.

    Args:
        pattern: The pattern to tokenize and stem.
        category: The category of the pattern.
        elements:
            A mutable list of (category, stem) pairs that the new stems will
            be appended to.
        total_stems:
            The set of total stems before this function was invoked.
            Will not be mutated.

    Returns:
        The union of total_stems and stems found in the pattern.
    """

    # Tokenize pattern into words
    words = nltk.word_tokenize(pattern)
    # Get stems for the pattern's words, as a set to avoid duplicates
    stemmer = GermanStemmer()
    stems: Set[str] = {stemmer.stem(w.lower()) for w in words}
    # Associate the category with these stems and append the pair to the
    # elements list.
    elements.append((category, stems))
    # Add stems to total set of stems, needed for conversion to numeric
    # TensorFlow training array
    total_stems |= stems
    return total_stems
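
# A minimal standalone sketch of the same tokenize-and-stem step (assumes NLTK
# and its punkt tokenizer data are installed; the sample sentence is only an
# illustration, not part of the original snippet):
import nltk
from nltk.stem.snowball import GermanStemmer

stemmer = GermanStemmer()
words = nltk.word_tokenize("Wie wird das Wetter morgen")
stems = {stemmer.stem(w.lower()) for w in words}
print(stems)  # e.g. {'wie', 'wird', 'das', 'wett', 'morg'}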
Example #3
def cosine_preprocess(texts, pickle_name, pickle_folder='pickle'):
    pickle_path = os.path.join(pickle_folder, pickle_name)

    # Return from disk if possible for efficiency reasons
    if os.path.exists(pickle_path):
        with open(pickle_path, 'rb') as f:
            return pickle.load(f)

    processed = []
    for text in tqdm(texts):
        stemmer = GermanStemmer()
        words = stopwords.words('german')

        tokens = [
            stemmer.stem(token) for token in word_tokenize(text)
            if token not in words
        ]

        processed.append(' '.join(tokens))

    # Pickle the output
    if not os.path.exists(pickle_folder):
        os.makedirs(pickle_folder)

    with open(pickle_path, 'wb') as f:
        pickle.dump(processed, f)

    return processed
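
# Hedged usage sketch (relies on the imports of the example above; the file name
# is arbitrary). The second call is answered from the pickle cache written by the
# first one. Note that the stemmer and stopword list are re-created for every
# text; hoisting them above the loop would avoid that repeated work.
texts = ["Die Kinder spielen im Garten.", "Das Wetter ist heute gut."]
first = cosine_preprocess(texts, "demo.pickle")   # computed, then pickled
second = cosine_preprocess(texts, "demo.pickle")  # loaded from pickle/demo.pickle
assert first == second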
Example #4
    def tokenize(self, tweet):
        tweet = remove_handles(tweet)

        tweet = tweet.replace('#', ' ')
        tweet = tweet.replace('<', ' ')
        tweet = tweet.replace('>', ' ')
        tweet = tweet.replace('&', ' und ')
        tweet = tweet.replace('|LBR|', ' ')
        tweet = tweet.replace('-', ' ')
        tweet = tweet.replace('_', ' ')
        tweet = tweet.replace("'s", ' ')
        tweet = tweet.replace(",", ' ')
        tweet = tweet.replace(";", ' ')
        tweet = tweet.replace(":", ' ')
        tweet = tweet.replace("/", ' ')
        tweet = tweet.replace("+", ' ')

        tknzr = Tokenizer_NLTK(preserve_case=self.preserve_case,
                               reduce_len=True)

        if self.join:
            return " ".join(tknzr.tokenize(tweet))
        elif self.use_stemmer:
            stmmr = Stemmer_NLTK()
            return [stmmr.stem(token) for token in tknzr.tokenize(tweet)]
        else:
            return tknzr.tokenize(tweet)
Example #5
    def __init__(self, filename):
        """
        Parameters
        ----------
        filename: str
            Path to the (plaintext) file for this document.

        """
        # open file with utf-8-sig to remove any BOMs
        with open(filename, "r", encoding="utf-8-sig", errors="ignore") as infile:
            self.string = clean_whitespace(infile.read())

        self.lang = langid.classify(self.string)[0]
        if self.lang == 'de':
            self._stemmer = GermanStemmer()
        elif self.lang == 'en':
            self._stemmer = EnglishStemmer()
        else:
        	print("no stemmer for '{}'".format(self.lang))
        	print("falling back to 'de'...")
        	self._stemmer = GermanStemmer()

        self.name = os.path.splitext(os.path.split(filename)[1])[0]
        self.ID = self.name
        self.tokens = word_tokenize(self.string)
        self.stems = list(map(self.stem, self.tokens))
        self.length = len(self.tokens)
        self.hashes = list(map(hash, self.stems))
        self.sents = self._get_sents()
        self.freq_dist = dict(Counter(self.hashes))
Example #6
def evaluate_dnn(path:str):
    with open(os.path.join(path, "tag_to_int.json"), "rt") as f:
        tag_to_int = json.load(f)
    with open(os.path.join(path, "int_to_tag.json"), "rt") as f:
        int_to_tag = json.load(f)  

    cv = pickle.load(open(os.path.join(path, "cv.p"), "rb"))
    stemmer = GermanStemmer()
    model_name = "dnn_intent_classification.h5"
    model = load_model(os.path.join(path, model_name))

    with open(os.path.join("Data", "commands", "Test", "testingdata.json"), "rt") as f:
        val_data = json.load(f)

    X = []
    y = []

    for tag, commands in val_data.items():
        for command in commands:
            command = " ".join(stemmer.stem(c) for c in sorted(word_tokenize(command)))
            X.append(transform_command_BoW(command, cv))
            y.append(tag_to_int[tag])

    X = np.array(X)
    y = np.array(y)

    predictions = model.predict(X)
    predicted_indices = np.argmax(predictions, 1)

    print("acc: ", accuracy_score(y, predicted_indices))
    cm = confusion_matrix(y, predicted_indices)
    cm = pd.DataFrame(cm, index=int_to_tag.values(), columns=int_to_tag.values())
    print(cm)

    return (accuracy_score(y, predicted_indices), cm)
Example #7
def remove_stop_words(msg):
    # remove stop words and stem words
    stemmer = GermanStemmer()

    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(msg)

    stop_words = set(stopwords.words('german'))

    words_filtered = []

    for w in words:
        if w not in stop_words:
            words_filtered.append(stemmer.stem(w))

    return words_filtered
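
# Hedged usage sketch (requires the NLTK 'stopwords' corpus); the stems follow
# the German Snowball rules and are truncated forms, not full words:
print(remove_stop_words("die kinder spielen im garten"))
# expected: ['kind', 'spiel', 'gart'] -- "die" and "im" are dropped as stopwords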
Example #8
    def __init__(self, essay: str, name: str, gazetteer_version: int = 1):
        """
        Initializes the Stringmatcher with an essay and the gazetteer version that should be used.
        :param essay: the essay text that is to be processed
        :param name: name of the essay; used to derive the results path
        :param gazetteer_version: the gazetteer version that should be used. See the dict "version_subfolder" defined above for the possible values
        """
        # Initialize data structures
        self.essay = essay
        self.essay_name = name
        self.gazetteer_version = gazetteer_version
        self.tokens_without_stopwords = []
        self.found_entities = dict()
        self.stemmer = GermanStemmer()
        self.fastText_model = None
        self.spacy_model = None
        self.file_path = RESULTS_PATH + name

        if not os.path.exists(self.file_path):
            os.makedirs(self.file_path)

        # retrieve the gazetteers that should be used for annotation
        self.gazetteers = sorted([
            f for f in os.listdir(PATH_GAZETTEERS +
                                  version_subfolder[gazetteer_version])
            if os.path.isfile(PATH_GAZETTEERS +
                              version_subfolder[gazetteer_version] + f)
        ])
        print("Used gazetteers: %s" % (gazetteer_version))

        # retrieve gazetteers with already preprocessed entries if available (for efficiency reasons) or create new one
        if os.path.isfile(PATH_GAZETTEERS + "tokenized_gazetteers"):
            self.tokenized_gazetteers = pickle.load(
                open(PATH_GAZETTEERS + "tokenized_gazetteers", "rb"))
        else:
            self.tokenized_gazetteers = dict()
        changed = False
        for gazetteer_filename in self.gazetteers:
            # if there is not already a tokenized version of this gazetteer, tokenize it
            if not gazetteer_filename in self.tokenized_gazetteers.keys():
                self.tokenized_gazetteers[
                    gazetteer_filename] = self.tokenize_gazetteer(
                        gazetteer_filename)
                changed = True
        if changed:
            pickle.dump(self.tokenized_gazetteers,
                        open(PATH_GAZETTEERS + "tokenized_gazetteers", "wb"))
Example #9
 def __init__(self, config):
   self.config = config
   if config.stem:
     if config.lang == 'en':
       self.stemmer = PorterStemmer()
     elif config.lang == 'de':
       self.stemmer = GermanStemmer()
     else:
       self.stemmer = IdStemmer()
Example #10
def _check_NE_yeah(gram):
    tag = entities.get(" ".join(gram), "O")

    if tag == "O":
        if len(gram) == 2:
            first, last = gram
            if first in vornamen and last in nachnamen:
                tag = "PER"

    if tag == "O":
        try:
            tag = entities.get(
                " ".join([GermanStemmer().stem(g) for g in gram]), "O")
        except:
            tag = entities.get(
                " ".join([
                    GermanStemmer().stem(g.decode(encoding="UTF-8"))
                    for g in gram
                ]), "O")

    return tag
Example #11
    def _preprocess(text, mode=None):
        '''helper function to preprocess text. returns List of Sentences'''
        sentences = split_single(text)
        if mode:
            nlp = spacy.load('de_core_news_sm')
            if mode == 'lemmatize':
                sentences = [
                    Sentence((' ').join([token.lemma_ for token in nlp(s)]))
                    for s in sentences
                ]
            elif mode == 'stem':
                stemmer = GermanStemmer()
                sentences = [
                    Sentence((' ').join(
                        [stemmer.stem(token.text) for token in nlp(s)]))
                    for s in sentences
                ]
        else:
            sentences = [Sentence(s, use_tokenizer=True) for s in sentences]

        return sentences
Example #12
def clean_text(text):
    """
    :param text:
    :return:
    """
    # stopwords = set(nltk.corpus.stopwords.words('german'))
    file_path = r'etc/models/german.txt'
    with open(file_path) as file:
        file_data = file.read()
    stopwords = file_data.split('\n')
    gs = GermanStemmer()
    text_cleaned = ""
    text_cleaned = re.sub('[^a-zA-Z]', ' ',
                          text)  # Keep only alphabet and space characters
    text_cleaned = text_cleaned.lower()  # All character to lowercase
    text_cleaned = text_cleaned.split(
    )  # Split to list of word (split by space specify character)
    text_cleaned = [
        gs.stem(word) for word in text_cleaned if not word in stopwords
    ]
    text_cleaned = ' '.join(text_cleaned)
    return text_cleaned
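
# Note (sketch, not part of the original): the character class [^a-zA-Z] above
# also strips German umlauts and ß before stemming. A variant that keeps them
# could widen the class, assuming re is imported as in the example:
text_with_umlauts = re.sub('[^a-zA-ZäöüÄÖÜß]', ' ', "Straßenbahn-Haltestelle 7")
# keeps umlauts and ß, still replaces digits and punctuation with spaces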
Example #13
def text_cleaner(text):
    use_GermanStemmer = False
    tokens = False

    # Remove username handles
    # -? do we need the user names
    text = remove_handles(text)

    # Remove punctuation marks
    text_blob = TextBlob(text)
    text = ' '.join(text_blob.words)

    # replace the umlauts
    # =============================================================================
    #         text = re.sub('ä', 'ae', text)
    #         text = re.sub('ö', 'oe', text)
    #         text = re.sub('ü', 'ue', text)
    #         text = re.sub('Ä', 'Ae', text)
    #         text = re.sub('Ö', 'Oe', text)
    #         text = re.sub('Ü', 'Ue', text)
    #         text = re.sub('ß', 'ss', text)
    # =============================================================================

    # remove the numbers
    text = re.sub(r'[0-9]+', '', text)

    # Remove emojis
    german_char = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZäöüÄÖÜ"
    text = ''.join(c for c in text if c in german_char)

    tokenizer = TweetTokenizer(preserve_case=True, reduce_len=True)
    if tokens:
        return tokenizer.tokenize(text)
    elif use_GermanStemmer:
        stemmer = GermanStemmer()
        return [stemmer.stem(token) for token in tokenizer.tokenize(text)]
    else:
        return text
Example #14
def ner_features(sentence, i, history):
    # TODO: try using TreeTagger's POS tag
    wordO = sentence[i]
    word = wordO.string
    pos = wordO.pos
    stemmed = GermanStemmer().stem(word)

    if i == 0:
        prevword, prevpos = "<START>", "<START>"
        last = "<START>"
        prevstemmed = "<START>"
    else:
        last = history[-1]
        prevword = sentence[i - 1].string
        prevpos = sentence[i - 1].pos
        prevstemmed = GermanStemmer().stem(sentence[i - 1].string)

    chunk = []
    if not wordO.chunk:
        chunk.append("START")
        knowledge_sources = "O"
    else:
        knowledge_sources = check_NE(convert(wordO.string), wordO.chunk)
        chunk = [w.string for w in wordO.chunk]

    stem_is_word = stemmed == word.lower()

    knowledge_sources_stemmed = _check_NE_yeah([stemmed])

    return {
        "knowledge": knowledge_sources,
        "knowledge_lemma": knowledge_sources_stemmed,
        "history": "+".join(history)[-2:],
        "pos": pos,
        "word": word,
        "stemmed": stemmed
    }
Example #15
 def __init__(self):
     self.tweets = 0
     self.related_tweets = 0
     self.stopwords = {}
     self.stemmers = {}
     self.stemmers["es"] = SpanishStemmer()
     self.stemmers["en"] = PorterStemmer()
     self.stemmers["fr"] = FrenchStemmer()
     self.stemmers["de"] = GermanStemmer()
     self.stopwords["es"] = self.load_stopwords_file(
         "spanish_stopwords.txt")
     self.stopwords["en"] = self.load_stopwords_file(
         "english_stopwords.txt")
     self.stopwords["fr"] = self.load_stopwords_file("french_stopwords.txt")
     self.stopwords["ge"] = self.load_stopwords_file("german_stopwords.txt")
     self.output_file = open(sys.argv[2], 'a')
Example #16
class CleanDoc(BaseEstimator, TransformerMixin, NoFit):
    def __init__(self):
        self.stemmer = GermanStemmer()

    def transform(self, docs):
        res = []
        for doc in docs:
            lines = doc.split("\n")
            lines = [
                " ".join(
                    self.stemmer.stem(word)
                    for word in re.findall("[a-zäöüß]{3,}", line.lower()))
                for line in lines
            ]
            res.append("\n".join(lines))

        return res
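
# Hedged usage sketch (NoFit is assumed to be a mixin that provides a no-op
# fit(), as the base classes suggest; imports as in the example above):
cleaner = CleanDoc()
print(cleaner.transform(["Das Haus steht im Garten.\nDie Häuser sind alt."]))
# one cleaned, stemmed string per input document, line structure preserved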
Example #17
    def __init__(self,
                 lang,
                 strip_accents=None,
                 ngram_range=(1, 1),
                 max_df=1.0,
                 min_df=1,
                 stop_words=None):

        if lang == 'de':
            self.stemmer = GermanStemmer()
        else:
            self.stemmer = EnglishStemmer()

        super(self.__class__, self).__init__(stop_words=stop_words,
                                             strip_accents=strip_accents,
                                             ngram_range=ngram_range,
                                             max_df=max_df,
                                             min_df=min_df)
Example #18
    def stemWord(self, word, lng):
        '''Separates the word's changeable part with a '|' for wordfast'''
        if lng == 'ru':
            stemmer = RussianStemmer()
        elif lng == 'en':
            stemmer = PorterStemmer()
        elif lng == 'de':
            stemmer = GermanStemmer()
        else:
            print('Language error. Exiting...')
            sys.exit(1)

        word = word.lower()  #otherwise the stemmer fails
        if len(word) <= 3:
            return word
        elif len(word) == len(stemmer.stem(word)):
            return "{0}|{1}".format(word[:-1], word[-1])
        else:
            return "{0}|{1}".format(word[:len(stemmer.stem(word))], \
            word[len(stemmer.stem(word)):])
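
# Hedged sketch of the expected output (obj stands for an instance of the
# surrounding class; the split point is where the Snowball stem ends):
print(obj.stemWord("Wälder", 'de'))  # likely 'wäld|er' -- the stem 'wald' is 4 characters long
print(obj.stemWord("Hund", 'de'))    # likely 'hun|d'   -- stem equals the word, so only the last letter is split off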
Example #19
class Stringmatcher:
    def __init__(self, essay: str, name: str, gazetteer_version: int = 1):
        """
        Initializes the Stringmatcher with an essay and the gazetteer version that should be used.
        :param essay: the essay text that is to be processed
        :param name: name of the essay; used to derive the results path
        :param gazetteer_version: the gazetteer version that should be used. See the dict "version_subfolder" defined above for the possible values
        """
        # Initialize data structures
        self.essay = essay
        self.essay_name = name
        self.gazetteer_version = gazetteer_version
        self.tokens_without_stopwords = []
        self.found_entities = dict()
        self.stemmer = GermanStemmer()
        self.fastText_model = None
        self.spacy_model = None
        self.file_path = RESULTS_PATH + name

        if not os.path.exists(self.file_path):
            os.makedirs(self.file_path)

        # retrieve the gazetteers that should be used for annotation
        self.gazetteers = sorted([
            f for f in os.listdir(PATH_GAZETTEERS +
                                  version_subfolder[gazetteer_version])
            if os.path.isfile(PATH_GAZETTEERS +
                              version_subfolder[gazetteer_version] + f)
        ])
        print("Used gazetteers: %s" % (gazetteer_version))

        # retrieve gazetteers with already preprocessed entries if available (for efficiency reasons) or create new one
        if os.path.isfile(PATH_GAZETTEERS + "tokenized_gazetteers"):
            self.tokenized_gazetteers = pickle.load(
                open(PATH_GAZETTEERS + "tokenized_gazetteers", "rb"))
        else:
            self.tokenized_gazetteers = dict()
        changed = False
        for gazetteer_filename in self.gazetteers:
            # if there is not already a tokenized version of this gazetteer, tokenize it
            if not gazetteer_filename in self.tokenized_gazetteers.keys():
                self.tokenized_gazetteers[
                    gazetteer_filename] = self.tokenize_gazetteer(
                        gazetteer_filename)
                changed = True
        if changed:
            pickle.dump(self.tokenized_gazetteers,
                        open(PATH_GAZETTEERS + "tokenized_gazetteers", "wb"))

    def tokenize(self):
        """
        tokenizes the complete essay and stores one version with stopwords
        and one that has neither stopwords nor special characters (except for '-' and '_')
        The latter one will be used for the extraction of n-grams
        """
        # store tokens without stopwords
        if not self.essay.strip():
            return ([], 0)
        stop_words = set(
            stopwords.words('german')
        )  # we remove stopwords, because they have little meaning. Thus we gain some efficiency
        tokenizer = RegexpTokenizer(
            r'[\w-]+'
        )  # finds all strings with alphanumerical characters or with '-' or '_' (the latter implicitly through \w)
        tokenized_essay = tokenizer.tokenize(self.essay)
        self.tokens_all = tokenized_essay
        # token_offsets = list(tokenizer.span_tokenize(self.essay))
        # print(tokenized_essay)
        # print(token_offsets)
        # print(len(tokenized_essay))
        # print(len(token_offsets))
        # self.token_offsets = token_offsets

        # give every token an index for later backmapping
        tokenized_enumerated_essay = [
            t for t in enumerate(tokenized_essay, start=0)
        ]

        # filter out stopwords
        self.tokens_without_stopwords = []
        for token in tokenized_enumerated_essay:
            if token[1].lower() not in stop_words:
                self.tokens_without_stopwords.append(
                    (token[0], token[1].lower()))

    def get_n_grams_up_to_n(self, words: list, N: int = 3):
        """
        returns a list of all n_grams with n <= N
        Example for N=3: [[(idx1, t1)], [(idx2, t2)], [(idx3, t3)], [(idx1, t1), (idx2, t2)], [(idx2, t2), (idx3, t3)], [(idx1, t1), (idx2, t2), (idx3, t3)]]
        :param words: list of tokens to form n_grams from
        :param N: specifies the maximum n-gram size
        :return: list of n-grams
        """
        n_grams_all = []
        for n in range(1, N + 1):
            # i is the start word-index of each possible n-gram (note: not the character index!)
            n_grams = [words[i:i + n] for i in range(0, len(words) - n + 1)]
            n_grams_all.extend(n_grams)
        return n_grams_all

    def tokenize_gazetteer(self, gazetteer: str):
        """
        creates a tokenized representation of a gazetteer
        :param gazetteer: the gazetteer to be tokenized
        :return: list containing the original and preprocessed gazetteer entries
        """
        tokenized_gazetteer = []
        with open(
                PATH_GAZETTEERS + version_subfolder[self.gazetteer_version] +
                gazetteer, "r") as gazetteer_f:
            for line in gazetteer_f:
                # keep the original and preprocessed version of this line
                tokenized_gazetteer.append((line, self.tokenize_line(line,
                                                                     0)[0]))

        return tokenized_gazetteer

    def tokenize_line(self, line: str, start_index: int = 0):
        """
        takes a string and returns a variant with enumerated tokens, but without stopwords and most special characters.

        tokens themselves are of the form (index, string), where index refers to its position in the input file
        :param line: line to be tokenized
        :param start_index: start index of the enumeration of returned tokens
        :return: returns a tuple of
                1. List of tokens, that are enumerated for later backmapping, starting with the given start index
                2. The next unused (free) index
        """
        if not line.strip():
            return ([], start_index)
        stop_words = set(
            stopwords.words('german')
        )  # we remove stopwords, because they have little meaning. Thus we gain some efficiency
        tokenizer = RegexpTokenizer(
            r'[\w-]+'
        )  # finds all strings with alphanumerical characters or with '-' or '_' (the latter implicitly through \w)
        tokenized_line = tokenizer.tokenize(line)

        # give every token an index for later backmapping
        tokenized_enumerated_line = [
            t for t in enumerate(tokenized_line, start=start_index)
        ]
        next_index = tokenized_enumerated_line[-1][0] + 1

        # filter out stopwords
        filtered_tokens = []
        for token in tokenized_enumerated_line:
            if token[1].lower() not in stop_words:
                filtered_tokens.append((token[0], token[1].lower()))
        return (filtered_tokens, next_index)

    def match(self,
              stemmer: bool = False,
              similarity: float = 0,
              method: str = "exact",
              semantic_cmp: bool = False,
              oov: bool = True,
              store_responsible_word: bool = False):
        """
        Annotates the essay that is associated with this stringmatcher object with the specified parameters
        :param stemmer: indicates whether or not the terms should be stemmed before comparison
        :param similarity: Similarity threshold, indicating how similar two strings have to be for a match.
                           Matching based on similarity happens only if similarity is not 0
        :param method:  A method name to refer to this setup
        :param semantic_cmp: if flag is set, compares based on embeddings
        :param oov: if flag is set, fastText with cosine similarity (embeddings_oov) is used instead of spacy (embeddings). 
        :param store_responsible_word: if flag is set, keeps the information of what phrase was detected as ME with the help of what gazetteer entry.
        :return: a list of all detected MEs (including stopwords)
        """

        # "matches" is a list, that contains a tuple with the original n_gram and a set, that stores in which gazetteers it occurred
        # therefore it looks like this: [([(index1, word1), (index2, word2), ...], {NETDOK, ...}), ...]
        matches = []
        # define how large the n_grams should be
        n = 3
        # get essay n_grams
        essay_n_grams = self.get_n_grams_up_to_n(self.tokens_without_stopwords,
                                                 n)
        # add a set to each n_gram, that keeps track of which gazetteer detects that n_gram
        essay_n_grams = [(n_gram, set()) for n_gram in essay_n_grams]

        if not method in self.found_entities.keys():
            self.found_entities[method] = (self.tokens_all, set())

        # find gazetteer entries in essays
        for gazetteer_filename in self.gazetteers:
            gazetteer = self.tokenized_gazetteers[gazetteer_filename]

            # compare every gazetteer entry to all essay n_grams
            for gaz_idx, (gaz_ME_string, gaz_n_gram) in enumerate(gazetteer):
                for essay_n_gram, matched_gazetteers in essay_n_grams:
                    matched = False

                    # compare essay- and gazetteer n_gram with given parameters
                    if self.compare_n_grams(
                            essay_n_gram,
                            gaz_n_gram,
                            similarity,
                            semantic_cmp=semantic_cmp,
                            oov=oov,
                            store_responsible_word=store_responsible_word,
                            gazetteer_filename=gazetteer_filename,
                            method_name=method,
                            stemmer=stemmer):
                        matched = True

                    # check whether the n_gram has been found in a previous gazetteer.
                    # If that is the case, do not add the n_gram, but instead just update the set of gazetteers that is associated with this essay n_gram
                    if matched and len(matched_gazetteers) == 0:
                        # this is the first time this n_gram has been seen in a gazetteer, so append it to matches
                        matched_gazetteers.add(gazetteer_filename)
                        matches.append((essay_n_gram, matched_gazetteers))
                    elif matched:
                        # this n_gram has been seen in a previous gazetteer already, therefore just update the set of gazetteers in which it occurred
                        for n_gram, gaz in matches:
                            if n_gram == essay_n_gram:
                                gaz.add(gazetteer_filename)

                    if matched:
                        # finally, store the line index of the gazetteer entry that matched for later statistics
                        entity_information = (tuple(essay_n_gram),
                                              gaz_ME_string.strip(),
                                              gaz_idx + 1, gazetteer_filename)
                        if not entity_information in self.found_entities[
                                method][1]:
                            self.found_entities[method][1].add(
                                entity_information)

        # filter out sub_n_grams
        filtered_n_grams = self.filter_matches(matches=matches, n=n)
        # extend the n_grams to include stopwords again
        medical_terms = self.get_whole_terms(filtered_n_grams)
        # create an annotated version of the essay
        self.annotated_essay = self.annotate_essay(medical_terms,
                                                   method=method)

        return self.found_entities

    def filter_matches(self, matches: list, n: int):
        """
        filters and keeps only the biggest n_grams found in each gazetteer. Discards those that are part of bigger n_grams

        note that elements of the returned list are now not tuples of the form (n_gram, set_of_gazetteers) as in the input, but instead (n_gram, gazetteer)
        i.e. a tuple is no longer stored with a set of gazetteers but with one concrete gazetteer (and thus may appear several times in the list)
        therefore the list may be larger afterwards, but n_grams are directly mapped to the gazetteer they were found with
        This allows us to define something like a subset relation on these tuples in order to only keep the largest ones
        E.g. ([(1, "asthma")], NetDok) would be a subset of ([(1, "asthma"), (2, "bronchiale")], NetDok)
        but it would not be a subset of ([(1, "asthma"), (2, "bronchiale")], Wiki), as they were not found with the same gazetteer

        :param matches: list of tuples, which contain an n_gram and the gazetteer they were found with, i.e. (n_gram, set_of_gazetteers)
                        n_grams have the form [(index1, word1), (index2, word2), ...]
                        where index refers to the position of the token in the input file, word_i is the i'th word of the n_gram
        :param n:       Indicates the maximum size of n_grams
        """

        filtered_n_grams = []
        # seen_idx_gaz is a set storing tuples of the form (idx, gazetteer), where idx is the position of the word in the complete essay
        # and gazetteer is the gazetteer in which the word occurred
        seen_idx_gaz = set()

        # process n_grams with decreasing size
        for n in range(n, 0, -1):
            for n_gram, gazetteers in matches:
                for gaz in gazetteers:
                    # process only n_grams of the current size
                    if len(n_gram) == n:
                        # check whether one of the words has already been seen in a previous (larger) n_gram using its index/position
                        # keep the largest complete n_grams that were found for each gazetteer, discard all sub_n_grams
                        already_seen = False
                        temp_idx_gaz = set()
                        for index, word in n_gram:
                            if (index, gaz) in seen_idx_gaz:
                                already_seen = True
                            temp_idx_gaz.add((index, gaz))
                        if not already_seen:
                            filtered_n_grams.append((n_gram, gaz))
                            seen_idx_gaz = seen_idx_gaz.union(temp_idx_gaz)
        return filtered_n_grams

    def get_whole_terms(self, filtered_n_grams: list):
        """
        adds previously deleted stopwords to the n_grams again.

        :param filtered_n_grams: The list of n_grams to enrich with stopwords again.
                                 n_grams are here not only the n_grams of the form [(idx1, t1), (idx2, t2), etc...]
                                 but instead are tuples (n_gram, gazetteer), where "n_gram" has the format above
        :return: The list of (n_gram, gazetteer)-tuples that now also include the stopwords
        """

        # find all contiguous words, that appear in the range of the n_gram
        tokens_all_words = self.tokens_all
        medical_terms = []
        for n_gram, gazetteer in filtered_n_grams:
            # for each n_gram, retrieve all words between the index of the first and the last word of the n_gram
            start_idx = n_gram[0][0]
            end_idx = n_gram[-1][0] + 1
            medical_term = [(idx, tokens_all_words[idx])
                            for idx in range(start_idx, end_idx)]
            medical_terms.append((medical_term, gazetteer))
        return medical_terms

    def annotate_essay(self, medical_terms: list, method: str):
        """
        takes a list of n_grams/medical_terms and stores an annotated version of the original essay
        Annotations indicate in which gazetteer the word was found in and which ME it is part of using a unique ID

        For the text "Asthma Bronchiale is a terrible disease", where the word "Asthma" was found in gazetteer 1 and "Asthma Bronchiale" was
        found in gazetteer 2, the annotated text would look like the following:
        "asthma     {(gazetteer1, 1), (gazetteer2, 2)}
        Bronchiale  {(gazetteer2, 2)}
        is
        a
        terrible 
        disease"

        :param medical_terms: list of medical terms, represented as lists of tuples [(term1, gazetteer1), ...],
                              terms are represented as lists of the form [(index1, word1), (index2, word2), ...],
                              index refers to the word's position in the essay
        :param method: Name for the used matching setup. Used to derive the name of the annotated essay
        """

        # read all words from the original essay
        tokens_all_words = self.tokens_all[:]
        medical_terms.sort(key=lambda ngram_gaz: ngram_gaz[0][0])

        # store all word indices that are part of a detected ME to later add annotations to them
        marked_words = set()
        for term in medical_terms:
            for idx_words in term[0]:
                marked_words.add(idx_words[0])

        # iterate over all tokens of the essay
        for position in range(0, len(tokens_all_words)):
            # if this position/token is part of a medical_term/ME, append an annotation
            if position in marked_words:
                tokens_all_words[position] = tokens_all_words[position] + "\t{"
                # the index at which the medical term appears in the input list is also its ID
                for term_id in range(0, len(medical_terms)):
                    # annotate the token at this position, if it is in the range of the first and the last word_index of the current medical_term/ME
                    if position in range(medical_terms[term_id][0][0][0],
                                         medical_terms[term_id][0][-1][0] + 1):
                        tokens_all_words[position] += "(%s, %s), " % (
                            medical_terms[term_id][1], term_id)
                tokens_all_words[position] = tokens_all_words[position].rstrip(
                    ", ") + "}"

        # store the annotations in a file
        annotated_essay_name = self.essay_name + "_annotated_" + method + ".txt"
        with open(self.file_path + "/" + annotated_essay_name,
                  "w") as essay_annotated:
            for token in tokens_all_words:
                essay_annotated.write(token + "\n")

    def compare_n_grams(self,
                        n_gram1: list,
                        n_gram2: list,
                        similarity: float = 0,
                        semantic_cmp: bool = False,
                        oov: bool = True,
                        store_responsible_word: bool = False,
                        gazetteer_filename: str = "-",
                        method_name: str = "",
                        stemmer=False):
        """
        takes two n_grams and returns True if all words at all indices of both n_grams satisfy the similarity constraints
        n_grams for this method are of the form [(index1, word1), (index2, word2), ...]
        :param n_gram1: The first n_gram to compare
        :param n_gram2: The second n_gram to compare
        :param similarity: similarity threshold. Uses similarity based comparison if in the range (0,1]. Uses exact or stemmed matching if 0
        :param semantic_cmp: Compares based on semantic/embeddings if flag is set, based on lexical similarity otherwise. No effect if similarity = 0
        :param oov: compares with FastText's model (embeddings_oov) if flag is set, else with Spacy's model (embeddings)
        :param store_responsible_word: if flag is set, keeps the gazetteer entries that are responsible for a match
        :param gazetteer_filename: the gazetteer's name from which the second n_gram was created. Used to keep the mapping of a match to the responsible gazetteer entry
        :param method_name: method name, used for correct mapping of method to match for later statistics
        :param stemmer: compares stemmed versions of the n_grams if flag is set
        :return: True if a match occurred
        """
        # check for same length
        if not n_gram1 or not n_gram2 or not len(n_gram1) == len(n_gram2):
            return False

        same = True
        for i in range(len(n_gram1)):
            # exact matching
            if not similarity and not stemmer and not semantic_cmp and n_gram1[
                    i][1] != n_gram2[i][1]:
                same = False
            # exact matching with stemming
            elif not similarity and stemmer and not semantic_cmp and self.stemmer.stem(
                    n_gram1[i][1]) != self.stemmer.stem(n_gram2[i][1]):
                same = False
            # lexical similarity with stemming
            elif similarity and stemmer and not semantic_cmp and not SequenceMatcher(
                    None, self.stemmer.stem(n_gram1[i][1]),
                    self.stemmer.stem(n_gram2[i][1])).ratio() >= similarity:
                same = False
            # lexical similarity matching
            elif similarity and not stemmer and not semantic_cmp and not SequenceMatcher(
                    None, n_gram1[i][1], n_gram2[i][1]).ratio() >= similarity:
                same = False
            # semantic similarity matching
            elif similarity and not stemmer and semantic_cmp and not self.embedding_similarity(
                    n_gram1[i][1], n_gram2[i][1], oov=oov) >= similarity:
                same = False

        #if store_responsible_word and same:
        # keep information of this match
        #    self.store_responsible(n_gram1, n_gram2, gazetteer_filename, similarity, semantic_cmp, oov, method_name)

        return same

    def match_char_based(self,
                         similarity: float = 0.9,
                         store_responsible_word: bool = False):
        """
        matches based on similarity of character sequences
        :param similarity: the similarity threshold, which is a value in the range of (0,1]. If two terms have a similarity above this threshold, a match occurs
        :param store_responsible_word: flag is propagated and indicates that matching information should be kept
        """
        method = "char_based_%1.2f" % similarity
        #self.store_method_metadata(method)

        return self.match(similarity=similarity,
                          method=method,
                          semantic_cmp=False,
                          store_responsible_word=store_responsible_word)

    def match_embeddings(self,
                         similarity: float = 0.6,
                         oov: bool = True,
                         store_responsible_word: bool = False):
        """
        matches based on semantic similarity, a.k.a. similarity of word embeddings
        :param similarity: similarity threshold in the range of (0,1]
        :param oov: if flag is set, FastText's model (embeddings_oov) is used, otherwise Spacy
        :param store_responsible_word: flag is propagated and indicates that matching information should be kept
        """
        if oov:
            method = "embeddings_oov_%1.2f" % similarity
        else:
            method = "embeddings_%1.2f" % similarity

        return self.match(similarity=similarity,
                          method=method,
                          semantic_cmp=True,
                          oov=oov,
                          store_responsible_word=store_responsible_word)

    def match_stemmed(self, store_responsible_word: bool = False):
        """
        matches based on exact string matching with stemming
        :param store_responsible_word: flag is propagated and indicates that matching information should be kept
        """
        method = "stemmed"

        return self.match(stemmer=True,
                          method=method,
                          store_responsible_word=store_responsible_word)

    def match_stemmed_char_based(self,
                                 similarity: float = 0.95,
                                 store_responsible_word: bool = False):
        """
        matches based on a combination of char_based and stemming
        :param store_responsible_word: flag is propagated and indicates that matching information should be kept
        """
        method = "stemmed_char_based_%1.2f" % similarity

        return self.match(stemmer=True,
                          similarity=similarity,
                          method=method,
                          semantic_cmp=False,
                          store_responsible_word=store_responsible_word)

    def match_exact(self, store_responsible_word: bool = False):
        """
        matches with comparison based on exact string matching
        :param store_responsible_word: flag is propagated and indicates that matching information should be kept
        """
        method = "exact"

        return self.match(method=method,
                          store_responsible_word=store_responsible_word)

    def embedding_similarity(self, word1: str, word2: str, oov: bool = True):
        """
        accepts two strings and returns the similarity between both words according to their embeddings
        :param oov: if flag is set, uses the fastText model with cosine similarity. Otherwise a spacy model is used for comparison
        """

        # use fasttext
        if oov:
            # load model if this is the first comparison
            if not self.fastText_model:
                self.fastText_model = fastText.load_model(MODEL_PATH)

            # get embeddings
            embedding1 = self.fastText_model.get_word_vector(word1).reshape(
                1, -1)  # reshape needed to satisfy cosine function below
            embedding2 = self.fastText_model.get_word_vector(word2).reshape(
                1, -1)

            similarity = cosine_similarity(embedding1, embedding2)

        # use spacy
        else:
            if not self.spacy_model:
                self.spacy_model = spacy.load(SPACY_EMBEDDINGS)

            embedding1 = self.spacy_model(word1)
            embedding2 = self.spacy_model(word2)

            similarity = embedding1.similarity(embedding2)

        return similarity
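
# Hedged usage sketch for the class above (assumes RESULTS_PATH, PATH_GAZETTEERS,
# version_subfolder and the gazetteer files are configured as in the original
# module; the essay text is illustrative):
matcher = Stringmatcher("Der Patient leidet an Asthma bronchiale.", "essay_01")
matcher.tokenize()             # must run first so the essay n-grams can be built
found = matcher.match_exact()  # writes essay_01_annotated_exact.txt and returns the found entities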
Example #20
test_df.reset_index(inplace=True)
print(test_df.isnull().sum())



print('Unique restaurants: {}'.format(len(data['restaurant_id'].unique())))
print('Unique menu_category: {}'.format(len(data['menu_category'].unique())))
print('Unique product_name: {}'.format(len(data['product_name'].unique())))
print('Unique ingredients: {}'.format(len(data['ingredients'].unique())))
print(test_df.shape)

encode_menu = test_df['menu_category'].str.encode('ascii', errors='ignore')
print(len(encode_menu.unique()))
encode_menu.replace({r'[^a-zA-Z0-9\s,]': ''}, regex=True, inplace=True)
print(len(encode_menu.unique()))
encode_menu = encode_menu.apply(lambda x: GermanStemmer().stem(x))
print(len(encode_menu.unique()))
encode_name = test_df['product_name'].str.encode('ascii', errors='ignore')
print(len(encode_name.unique()))
encode_name.replace({r'[^a-zA-Z0-9\s,]': ''}, regex=True, inplace=True)
print(len(encode_name.unique()))
encode_name = encode_name.apply(lambda x: GermanStemmer().stem(x))
print(len(encode_name.unique()))


# X = pd.concat([encode_menu, encode_name, test_df['restaurant_id'].astype('str')], axis=1)

# le = preprocessing.LabelEncoder()
# X_2 = X.apply(le.fit_transform)
# print X_2.head()
# print X_2.shape
Example #21
 def __init__(self):
     self.stemmer = GermanStemmer()
Example #22
class Document:
    """
    Represents a complete document.

    Attributes
    ----------
    string: str
        Raw string contents of the file represented by this document.
    lang: str
        Language identifier, reflects classification of `langid` module.
    name: str
        Filename of source file, without extension.
    ID: str
        Alias for `name`.
    tokens: list[str]
        Tokens in document, generated by standard NLTK tokenizer.
    stems: list[str]
        Tokens after stemming.
    length: int
        Length of document.
    hashes: list[int]
        Representation of document as list of stems, hashed for efficiency.
    sents: list[Sentence]
        Sentences in this document.
    freq_dist: dict[int, int]
        Frequency distribution for this document, mapping hashes of stems to
        their frequencies.
    """

    string = str()
    _stemmer = None

    def __init__(self, filename):
        """
        Parameters
        ----------
        filename: str
            Path to the (plaintext) file for this document.

        """
        # open file with utf-8-sig to remove any BOMs
        with open(filename, "r", encoding="utf-8-sig", errors="ignore") as infile:
            self.string = clean_whitespace(infile.read())

        self.lang = langid.classify(self.string)[0]
        if self.lang == 'de':
            self._stemmer = GermanStemmer()
        elif self.lang == 'en':
            self._stemmer = EnglishStemmer()
        else:
        	print("no stemmer for '{}'".format(self.lang))
        	print("falling back to 'de'...")
        	self._stemmer = GermanStemmer()

        self.name = os.path.splitext(os.path.split(filename)[1])[0]
        self.ID = self.name
        self.tokens = word_tokenize(self.string)
        self.stems = list(map(self.stem, self.tokens))
        self.length = len(self.tokens)
        self.hashes = list(map(hash, self.stems))
        self.sents = self._get_sents()
        self.freq_dist = dict(Counter(self.hashes))

    def __repr__(self):
        return "<Document {0}...{1}>".format(self.name[:12], self.name[-5:])

    def __eq__(self, other):
        return self.hashes == other

    def __hash__(self):
        return hash(self.name)

    def stem(self, s):
        """
        Stem token `s` with appropriate stemmer.

        Notes
        -----
        Running the result through the ASCII encoder ensures that no weird
        characters end up in the final stem (such as unusual space characters,
        which one might otherwise overlook).

        Examples
        --------
        >>> d = Document("somefile.txt")
        >>> d.stem("Versicherungen")
        'versicher'

        """
        return self._stemmer.stem(s).encode('ascii', errors='ignore').decode()

    def _get_sents(self):
        """
        Determine sentences in document according to locations of sentence
        boundary punctuation.

        Assumes that abbreviations, etc., have already been appropriately tokenized.

        """
        id_counter = 0
        my_sents = []
        last_start = 0
        for i, h in enumerate(self.hashes):
            if h in SENT_PUNCT and i - last_start > 1:
                id_counter += 1
                my_sents.append(Sentence("{0}_{1}".format(self.name,
                                                          id_counter),
                                         self,
                                         (last_start, i + 1)))
                last_start = i + 1
        if self.length - last_start > 0:
            id_counter += 1
            my_sents.append(Sentence("{0}_{1}".format(self.name, id_counter),
                                     self,
                                     (last_start, self.length)))
        return my_sents
Example #23
            ]
            res.append("\n".join(lines))

        return res


# In[3]:


def subwords(word):
    return [word[:2], word[2:]]


# In[27]:

stem = GermanStemmer().stem

cnt_vect_splits = [
    ("short", lambda doc: [line for line in doc if len(line) <= 1], {}),
    ("long", lambda doc: [line for line in doc if len(line) > 1], {}),
    ("subwords", lambda doc: [
        list(map(stem, concat(subwords(word) for word in line)))
        for line in doc
    ], {
        "ngram_range": (1, 1)
    }),
]

doc_funcs = [
    ("num_char", lambda doc: len(re.findall("[A-Za-zäöüÄÖÜß]", doc))),
]
Example #24
class StringHandler:
    _STEMMER = GermanStemmer()
    _P_SIMILARITY_THRESHOLD: float = 0.9

    def __init__(self, string_series: pd.Series):
        self._ds = string_series.str.lower()
        self.ds_origin = string_series

    def optimize(self):
        self.remove_noise()
        self.split_text()
        self.build_sentence()
        self.stem_words()
        # self.correct_spelling()

    def reset(self):
        self.ds = self.ds_origin.copy()

    # string manipulation
    ##################################

    def stem_words(self):
        self.ds = self.ds.apply(StringHandler.stem_sentence)

    def split_text(self):
        self.ds = self.ds.str.split(' ')

    def remove_noise(self):
        self.ds = self.ds.str.replace(r'[^a-zA-Z0-9]', ' ')
        # remove leftover isolated substrings that are not words/digits

    def build_sentence(self):
        self.ds = self.ds.apply(lambda x: ' '.join(word.strip() for word in x if word))

    # nlp manipulation
    ##################################

    def correct_spelling(self):
        uniques = self.get_unique_series
        uniques.apply(lambda x: list(i for i in uniques if i != x and SequenceMatcher(None, x, i).ratio() > 0.9))

    @classmethod
    def stem_sentence(cls, sentence: str, split_char: str = ' '):
        return ' '.join(cls._STEMMER.stem(word) for word in sentence.split(split_char))

    # properties
    ##################################

    @property
    def get_unique_series(self):
        return pd.Series(self.ds.unique()).sort_values().reset_index(drop=True)

    @property
    def ds(self):
        return self._ds

    @ds.setter
    def ds(self, ds: pd.Series):
        if isinstance(ds, pd.Series) and not ds.empty:
            self._ds = ds
        else:
            raise TypeError('Wrong variable type or empty series')
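
# Hedged usage sketch (assumes pandas is imported as pd alongside the example's
# other imports; exact results depend on the pandas version):
series = pd.Series(["Gebäude-Versicherung 2021", "Versicherungen für Gebäude"])
handler = StringHandler(series)
handler.optimize()    # lowercase, strip noise, split, rejoin, stem
print(handler.ds)     # stemmed variants; handler.ds_origin keeps the raw strings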
Example #25
import nltk
text = "She looked at   her father's arm-chair."
text_fr = "Qu'est-ce que c'est?"
text.split(' ')
text_fr.split(' ')
from sklearn.feature_extraction.text import CountVectorizer
CountVectorizer().build_tokenizer()(text)
from nltk.tokenize import word_tokenize
word_tokenize(text)
#from nltk.tokenize.punkt import PunktWordTokenizer
#tokenizer = PunktWordTokenizer()
#tokenizer.tokenize(text)

#Stemming
from nltk.stem.snowball import GermanStemmer
stemmer=GermanStemmer()
words=["Wald", "Walde", "Wälder", "Wäldern", "Waldes","Walds"]
stemmer.stem("Waldi")
#[stemmer.stem(w) for w in words]

#Chunking
import os
import numpy as np
corpus_path = os.path.join('/Users/Gourhari/Documents/Py/data', 'french-tragedy')
sorted(os.listdir(corpus_path))[0:5]
tragedy_filenames = [os.path.join(corpus_path, fn) for fn in sorted(os.listdir(corpus_path))]

def split_text(filename, n_words):
    """Split a text into chunks approximately `n_words` words in length."""
    input = open(filename, 'r')
    words = input.read().split(' ')
Example #26
import re
import preprocess_files
from nltk.stem.snowball import GermanStemmer


gs = GermanStemmer()
punctuations = '''!()-[]{};:'"\,<>/?@#$%^&*_~'''



def match_synms(tokens):
    syn_dict = preprocess_files.read_synms_list()
    for t in tokens:
        for (idx, val) in enumerate(t):
            if val in syn_dict:
                t[idx] = syn_dict[val]

    return tokens

def _remove_punctuation(tokens):
    tokens_filt = []
    for gT in tokens:
        if gT not in punctuations: tokens_filt.append(gT)
    return tokens_filt


def _remove_stopwords(tokens):
    '''Remove stop words from an array of tokens'''

    stopWords = ['the', 'to', '-', 'pr', 'der', 'is', 'of', 'die', 'in', 'and', 'und', '–', '•', '✔', '●', 'a']
Example #27
# The Snowball stemmer, which is based on the Snowball stemming algorithm, can be used in NLTK like this:
from nltk.stem import SnowballStemmer
print(" ".join(SnowballStemmer.languages))

# In[20]:

snowball_stemmer = SnowballStemmer('english')
#snowball_stemmer.stem('maximum')
#snowball_stemmer.stem('presumably')
print(snowball_stemmer.stem('computing'))
print(snowball_stemmer.stem('nationality'))

# In[21]:

from nltk.stem.snowball import GermanStemmer
stemmer = GermanStemmer()
stemmer.stem("Autobahnen")

# In[22]:

#for more details and examples see http://www.nltk.org/api/nltk.tokenize.html1

# In[33]:

from nltk.corpus import stopwords
nltk.download("stopwords")
text = "Sachin Ramesh Tendulkar (/ˌsətʃɪn tɛnˈduːlkər/ (About this sound listen); born 24 April 1973) is a former Indian cricketer and a former captain, regarded as one of the greatest batsmen of all time.[4] The highest run scorer of all time in International cricket, Tendulkar took up cricket at the age of eleven, made his Test debut on 15 November 1989 against Pakistan in Karachi at the age of sixteen, and went on to represent Mumbai domestically and India internationally for close to twenty-four years. He is the only player to have scored one hundred international centuries, the first batsman to score a double century in a One Day International, the holder of the record for the most number of runs in both ODI and Test cricket, and the only player to complete more than 30,000 runs in international cricket.[5]"
stopwordsList = set(stopwords.words('english'))
stopwordsListUpdated = list(stopwordsList)
punctuationList = ["{", "}", "]", "[", ",", ";", ".", "/"]
stopwordsListUpdated += punctuationList
Example #28
def tokenize_and_stem_german(text):
    stemmer = GermanStemmer()
    return [clean_token(stemmer.stem(token)) for token in tokenize(text)]
Example #29
def get_stem_relations(sentences, gn):
    """Gets verb-noun relations
    between two sentences.

    Returns
        Array of word-pairs between two sentences
    """

    # Init word pairs
    word_pairs = []

    # Init stemmer
    stemmer = GermanStemmer(ignore_stopwords=True)

    # Loop over every sentence
    for val, sentence in enumerate(sentences):
        # Is current sentence not the last
        # sentence? If so carry on
        if val != (len(sentences) - 1):
            # Get stems of all words in the next sentence
            stems_next_sentence = [stemmer.stem(x['lemma'])
                                   for x in sentences[val + 1]]

            # Nouns in next sentence
            nouns_next_sentence = [
                word['lemma'] for word in sentences[val + 1] if word['noun']
            ]

            # Nouns of current sentence
            words_current_sentence = [
                word for word in sentence if word['noun']
            ]

            # Loop over every word in current sentence
            for word in sentences[val]:
                # Stem of current word
                stem_current_word = stemmer.stem(word['lemma'])

                # Is the stemmed word in the next sentence, great.
                # If word is a lame 'sein', ignore it
                if (stem_current_word
                        in stems_next_sentence) and word['lemma'] != 'sein':
                    # Get index of stem that is related to current word
                    index_word_next_sentence = stems_next_sentence.index(
                        stem_current_word)

                    # Corresponding word in next sentence
                    corresponding_word = sentences[val +
                                                   1][index_word_next_sentence]

                    # Only add word pairs if verb or noun
                    if word['noun'] or word['verb']:
                        # Get dictionary of word in next sentence
                        dict_next = sentences[val +
                                              1][index_word_next_sentence]

                        # We do not want to combine words
                        # that have the same grammatical function
                        # A noun should not be combined with a noun
                        # We are only interested in verb-noun relations
                        if word['verb'] and dict_next['noun']:
                            # Get all combinations of corresponding noun
                            # in next sentence an all nouns in current sentence
                            for wordCurrent in words_current_sentence:
                                # Append to list
                                word_pairs.append({
                                    'source': {
                                        'word': corresponding_word['orth'],
                                        'lemma': corresponding_word['lemma'],
                                        'sentence': val
                                    },
                                    'target': {
                                        'word': wordCurrent['orth'],
                                        'lemma': wordCurrent['lemma'],
                                        'sentence': val + 1
                                    },
                                    'device':
                                    'verb noun relation'
                                })

                        # Current word is noun and corresponding word is
                        # verb
                        elif word['noun'] and dict_next['verb']:
                            # Get all combinations of this noun with the
                            # nouns in the next sentence
                            for wordNext in sentences[val + 1]:
                                # Only pair with nouns in the next sentence
                                if wordNext['noun']:
                                    # Append to list
                                    word_pairs.append({
                                        'source': {
                                            'word': word['orth'],
                                            'lemma': word['lemma'],
                                            'sentence': val
                                        },
                                        'target': {
                                            'word': wordNext['orth'],
                                            'lemma': wordNext['lemma'],
                                            'sentence': val + 1
                                        },
                                        'device':
                                        'noun verb relation'
                                    })

    return word_pairs
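The function expects each sentence to be a list of token dictionaries with at least the keys 'orth', 'lemma', 'noun' and 'verb'; the gn argument (a GermaNet handle in the original project) is not used in the code shown above. A minimal sketch with hand-made, purely illustrative input:

# Hypothetical input illustrating the expected shape of 'sentences'.
sentences = [
    [
        {'orth': 'Der',   'lemma': 'der',    'noun': False, 'verb': False},
        {'orth': 'Hund',  'lemma': 'Hund',   'noun': True,  'verb': False},
        {'orth': 'bellt', 'lemma': 'bellen', 'noun': False, 'verb': True},
    ],
    [
        {'orth': 'Das',    'lemma': 'der',    'noun': False, 'verb': False},
        {'orth': 'Bellen', 'lemma': 'Bellen', 'noun': True,  'verb': False},
        {'orth': 'stört',  'lemma': 'stören', 'noun': False, 'verb': True},
    ],
]

# 'bellen' (verb, sentence 0) and 'Bellen' (noun, sentence 1) share the stem
# 'bell', so one verb-noun relation pair is expected in the result.
print(get_stem_relations(sentences, None))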
Exemple #30
0
def rootform(infinite):
    stemmer = GermanStemmer()
    return stemmer.stem(infinite)
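For reference, the Snowball stemmer returns truncated surface forms rather than dictionary roots, e.g.:

print(rootform("laufen"))   # 'lauf'
print(rootform("Häuser"))   # 'haus' (lowercased, suffix stripped, umlaut removed)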
Exemple #31
0
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
from boilerpipe.extract import Extractor
from nltk.stem.snowball import GermanStemmer
from nltk import word_tokenize
import nltk.data
import logging
import os
import re

logger = logging.getLogger(__name__)
logging.getLogger('pdfminer').setLevel(logging.CRITICAL)

satztokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
stemmer = GermanStemmer()
stoppwörter = []
'''Loads the stopword list.'''
with open('traindata/german', 'r') as f:
    for line in f:
        wort = line.split('\n')[0]
        stoppwörter.append(wort.lower())


def preprocess(text):
    '''Filter rules to normalize the text.'''
    try:
        text = re.sub(
            r"/innen|\*innen|/-innen", "innen",
            text)  # Normalizes the different gender-inclusive spellings
        text = re.sub(r"-\s*\n", "", text)  # Removes hyphenation at line breaks
Exemple #32
0
def main():
    now = datetime.now()
    page_dates = []
    revision_dates = []

    corpora = defaultdict(lambda: set())

    stemmer = GermanStemmer()

    for page, revisions in MediawikiDump(sys.stdin).iterpages():
        timestamp = revisions[0]['timestamp']
        page_dates.append(timestamp)

        first = revisions[0]
        stems = set()
        for year in xrange(first['timestamp'].year, now.year+1):
            revisions_in_year = [r for r in revisions if r['timestamp'].year == year]
            revision_dates.extend(r['timestamp'] for r in revisions_in_year)
            if revisions_in_year:
                stems = set()
                for revision in revisions_in_year:
                    html = WikiMarkup(revision['text'].encode('utf-8')).render()
                    text = clean_html(html.decode('utf-8'))
                    # TODO: remove remaining markup
                    words = WORD_RE.findall(text)
                    stems.update(stemmer.stem(word) for word in words)
            corpora[year].update(stems)

    page_dates.sort()
    revision_dates.sort()

    delta = relativedelta(revision_dates[-1], revision_dates[0])
    months = delta.years * 12 + delta.months

    outdir = os.path.abspath('./out')
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    ax.plot_date(page_dates, range(1, len(page_dates)+1), '-')
    ax.hist(date2num(revision_dates), months, histtype='step')
    ax.set_xlabel(u'Year')
    ax.legend([u'Total No. of Pages', u'New Revisions per month'])
    fig.autofmt_xdate()
    fig.savefig(os.path.join(outdir, 'pages.png'), format='png')

    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    timestamps = [datetime(year, 12, 31) for year in sorted(corpora.keys())]
    counts = [len(corpora[t.year]) for t in timestamps]
    ax.plot_date(timestamps, counts, '-')
    ax.set_xlabel(u'Year')
    ax.legend([u'No. of distinct tokens'])
    fig.autofmt_xdate()
    fig.savefig(os.path.join(outdir, 'tokens.png'), format='png')

    years = sorted(corpora.keys())
    years = range(years[0], years[-1])
    year_pairs = zip(years, years[1:])
    for year1, year2 in year_pairs:
        current = corpora.get(year1, set())
        next_ = corpora.get(year2, set())
        filename = '{:04d}-{:04d}.diff'.format(year1, year2)
        with open(os.path.join(outdir, filename), 'w') as f:
            for token in sorted(current - next_):
                f.write(u'-{}\n'.format(token).encode('utf-8'))
            for token in sorted(next_ - current):
                f.write(u'+{}\n'.format(token).encode('utf-8'))
Exemple #33
0
    def load_stemmer(self):
        self._stemmer = None
        if self._stemming_lang == Language.GERMAN:
            self._stemmer = GermanStemmer()
        else:
            self._stemmer = EnglishStemmer()
Exemple #34
0
def germanMapping(pattern, uri):
    array = pattern.split("  ")
    from nltk.stem.snowball import GermanStemmer
    stemmer = GermanStemmer()

    if len(array) == 3:
        if " n nn " in array[1]:
            marker = ""
            term = array[1]
            if " x " not in term and " y " not in term:
                term = term.split(" ")[1]
                return [NounPPFrame(term, uri, marker)]

        if " v vvfin " in array[1]:
            marker = ""
            term = array[1]
            if " x " not in term and " y " not in term:
                term = term.split(" ")[1]
                return [TransitiveFrame(term, uri, marker),
                        TransitiveFrame(stemmer.stem(term), uri, marker)]

        if " v vvpp " in array[1]:
            marker = ""
            term = array[1]
            if " x " not in term and " y " not in term:
                term = term.split(" ")[1]
                return [AdjectivePredicateFrameMit(term, uri)]

    if len(array) == 4:
        if " n nn " in array[1] and " prep appr " in array[2]:
            marker = array[2]
            if " x " not in marker and " y " not in marker:
                marker = marker.split(" ")[1]
            term = array[1]
            if " x " not in term and " y " not in term:
                term = term.split(" ")[1]
                return [NounPPFrame(term, uri, marker)]

        if " v vvfin " in array[1] and " n nn " in array[2]:
            marker = ""
            term = array[1]
            if " x " not in term and " y " not in term:
                term = term.split(" ")[1]
                return [TransitiveFrame(term, uri, marker),
                        TransitiveFrame(stemmer.stem(term), uri, marker)]
            # return [TransitiveFrame(term, uri, marker)]

        if " v vvpp " in array[1] and " prep appr " in array[2]:
            marker = array[2]
            if " x " not in marker and " y " not in marker:
                marker = marker.split(" ")[1]
            term = array[1]
            if " x " not in term and " y " not in term:
                term = term.split(" ")[1]
                return [AdjectivePredicateFrameMarker(term, uri, marker)]

    if len(array) == 5:
        if " v vvpp " in array[1] and " prep appr " in array[2]:
            marker = array[2]
            if " x " not in marker and " y " not in marker:
                marker = marker.split(" ")[1]
            term = array[1]
            if " x " not in term and " y " not in term:
                term = term.split(" ")[1]
                return [AdjectivePredicateFrameMarker(term, uri, marker)]

        if " nn " in array[2] and " prep " in array[3]:
            marker = array[3]
            if " x " not in marker and " y " not in marker:
                marker = marker.split(" ")[1]
            term = array[2]
            if " x " not in term and " y " not in term:
                term = term.split(" ")[1]
                return [NounPPFrame(term, uri, marker)]

    return []
Exemple #35
0
                               restore_best_weights=True)
model.fit(X_training,
          y_training_one_hot,
          epochs=200,
          batch_size=64,
          callbacks=[early_stopping])

model.save(os.path.join("Data", "models", "rnn_intent_classification.h5"))

predictions = model.predict(X_training)
locations = np.argmax(predictions, 1)

print(confusion_matrix(y_training, locations))
print(accuracy_score(y_training, locations))

stemmer = GermanStemmer()
# with open(os.path.join("Data", "commands", "stopwords.txt"), "rt") as f:
#     stopwords = set(f.read().splitlines())
while True:
    c = input("Your Input:")

    if c == "q":
        break

    c = " ".join(sorted([stemmer.stem(x) for x in word_tokenize(c.lower())]))

    c = np.array([transform_command(c)])
    prediction = model.predict(c)

    out_index = np.argmax(prediction)
Exemple #36
0
def set_stemmer(stemmer_language):
    if stemmer_language == "GER":
        stemmers = GermanStemmer()
    else:
        stemmers = EnglishStemmer()
    return stemmers
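Both returned stemmers expose the same stem() interface, so usage looks like:

stemmer = set_stemmer("GER")
print(stemmer.stem("Wälder"))    # 'wald'
stemmer = set_stemmer("EN")      # anything other than "GER" falls back to English
print(stemmer.stem("running"))   # 'run'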
Exemple #37
0
    def stem_words(self, words):
        stemmer = GermanStemmer()
        stemmed_words = []
        for word in words:
            stemmed_words.append(stemmer.stem(word))
        return stemmed_words
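Since stem_words is an instance method, a usage sketch needs some enclosing class; the name TextPreprocessor below is hypothetical and stands in for whatever class the method belongs to:

# TextPreprocessor is a made-up name for the enclosing class.
preprocessor = TextPreprocessor()
print(preprocessor.stem_words(["Kinder", "spielen", "draußen"]))
# e.g. ['kind', 'spiel', 'drauss']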