Example No. 1
def create_word2wordsense_dic(vocFile="", wwsFile="", vocWsFile=""):
    """
    sample call: create_word2wordsense_dic()
    :param vocFile:
    :param wwsFile:
    :param vocWsFile:
    :return:
    """
    if vocFile == "":
        vocFile = FILE_Loc["de"]["kyubyong"]["voc"]
    if wwsFile == "":
        wwsFile = FILE_Loc["de"]["kyubyong"]["w2ws"]
    if vocWsFile == "":
        vocWsFile = FILE_Loc["de"]["kyubyong"]["wsVoc"]
    wwlst = []
    wslst = []
    gn = load_germanet()
    with open(vocFile, 'r') as ifh:
        for ln in ifh:
            word = ln[:-1]
            lst = [get_name_of_synset(ele) for ele in gn.synsets(word)]
            if lst:
                wwlst.append(word + " " + " ".join(lst))
            wslst += lst
    wslst = list(set(wslst))
    wslst.sort()
    print(wwlst[:10])
    print(wslst[:10])
    with open(wwsFile, 'w') as ofh:
        ofh.write("\n".join(wwlst))
    with open(vocWsFile, 'w') as ofh:
        ofh.write("\n".join(wslst))
Example No. 2
    def _add_germanet_categories(self, word):
        gn = load_germanet()
        cats = []
        hp = gn.synset(word + '.n.1').hypernym_paths

        for h in hp:
            # collect synset names from a slice of each hypernym path as coarse categories
            for s in h[-4:-1]:
                cats.append(str(s).split('(')[1].split('.')[0])
        cats = list(set(cats))
        return cats
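The slice h[-4:-1] keeps only a few synsets from each hypernym path as category labels. To inspect what those labels would be for a concrete noun, the same traversal can be run standalone (the word 'Hund' is illustrative only; the printed names depend on the GermaNet release loaded into MongoDB):

from pygermanet import load_germanet

gn = load_germanet()
for path in gn.synset('Hund.n.1').hypernym_paths:
    # mirror the access pattern used in _add_germanet_categories above
    print([str(s).split('(')[1].split('.')[0] for s in path[-4:-1]])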
Example No. 3
def main():
    gn = load_germanet()

    if len(sys.argv) < 2:
        raise Exception(
            "Provide 1+ arguments:\n\t1+,model(s)")
    models = sys.argv[1:]

    senses2rel = compare(models, gn)
    print("senses reliability")
    for senses, rel in senses2rel:
        print(str(senses), str(rel))
Example No. 4
def germanet_processor(data):
    gn = load_germanet()
    results = []
    for record in data:
        word = record[0]
        value = record[1]
        synsets = gn.synsets(word)
        if not synsets:
            continue

        for synset in synsets:
            results += find_hyponyms(synset, value)

    return results
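germanet_processor depends on a find_hyponyms helper that is not part of the snippet. A minimal sketch, assuming the helper walks the hyponym subtree via pygermanet's Synset.hyponyms and pairs every orthographic form with the record's value, could look like this:

def find_hyponyms(synset, value):
    # Hypothetical reconstruction: collect (orthForm, value) pairs for the synset
    # itself and, recursively, for everything below it in the hyponym hierarchy.
    pairs = [(lemma.orthForm, value) for lemma in synset.lemmas]
    for hypo in synset.hyponyms:
        pairs += find_hyponyms(hypo, value)
    return pairs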
Example No. 5
    def load_tree(self, outputfile):
        """Creates a tree and fills it with words and hypernyms.

        :param outputfile: the output file to be created containing all words
        :return: complete tree
        """

        germanet = load_germanet()

        # step 1: extract words from GermaNet
        print("extracting words..")
        words, embedded_words = self.__extract_words(germanet, outputfile)

        # step 2: fill tree with hypernym paths
        count = 0
        skipped = 0
        mult_paths = 0
        tree = Tree()
        for word in words:
            synset = germanet.synset(word)
            if synset is None:
                skipped += 1
                continue
            paths = synset.hypernym_paths

            if len(paths) == 0:
                skipped += 1
                continue
            elif isinstance(paths[0], list):  # synset has multiple hypernym paths
                mult_paths += 1
                for path in paths:
                    count += 1
                    tree.add_hypernym_path(path, embedded_words,
                                           self.__ignore_duplicates)
            else:
                count += 1
                tree.add_hypernym_path(paths, embedded_words,
                                       self.__ignore_duplicates)

        print("number of words added = " + str(len(tree.words)))
        print("number of paths = " + str(count))
        print("number of synsets with multiple paths = " + str(mult_paths))
        print("skipped = " + str(skipped))

        return tree
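The isinstance check above guards against hypernym_paths coming back either as a single path or as a list of paths. If that pattern is needed elsewhere, a small normalisation helper (an assumption based solely on the check in this example, not on documented pygermanet behaviour) keeps callers uniform:

def normalise_hypernym_paths(paths):
    # Always return a list of paths (each path being a list of synsets),
    # regardless of which of the two shapes was passed in.
    if paths and isinstance(paths[0], list):
        return paths
    return [paths]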
Example No. 6
def create_wordsense_struc_dic(vocWsFile="",
                               childrenFile="",
                               parentFile="",
                               pathFile=""):
    if vocWsFile == "":
        vocWsFile = FILE_Loc["de"]["kyubyong"]["wsVoc"]
    if childrenFile == "":
        childrenFile = FILE_Loc["de"]["kyubyong"]["wsChildren"]
    if parentFile == "":
        parentFile = FILE_Loc["de"]["kyubyong"]["wsParent"]
    if pathFile == "":
        pathFile = FILE_Loc["de"]["kyubyong"]["wsPaths"]
    childrenLst = []
    parentLst = []
    pathLst = []
    gn = load_germanet()
    with open(vocWsFile, 'r') as ifh:
        for ln in ifh:
            ws = ln[:-1]
            ins = gn.synset(ws)
            if ins.__class__.__name__ != 'Synset':
                print(ins)
                continue
            children = [get_name_of_synset(ele) for ele in ins.hyponyms]
            childrenLst.append(ws + " " + " ".join(children))

            parent = [get_name_of_synset(ele) for ele in ins.hypernyms]
            parentLst.append(ws + " " + " ".join(parent))

            plst = []
            for apath in ins.hypernym_paths:
                pathStr = " ".join([get_name_of_synset(ele) for ele in apath])
                plst.append(pathStr)
            pathLst.append(":".join(plst))

    with open(childrenFile, 'w') as ofh:
        ofh.write("\n".join(chidrenLst))
    with open(parentFile, 'w') as ofh:
        ofh.write("\n".join(parentLst))
    with open(pathFile, 'w') as ofh:
        ofh.write("\n".join(pathLst))
Example No. 7
class Germanet:
    """This code is used to find all synonym words of a given word.
    The words that are searched here are nouns. If you are interested searching verbs
    please use str(lemmatisedWord) instead of str(lemmatisedWord).capitalize()

    To run this code you need to load germanet data using mongodb using pygerman. See details of how to run germanet here
    https://pypi.org/project/pygermanet/
    mkdir -p ./mongodb
    mongod --dbpath ./mongodb"""

    gn = load_germanet()

    def lemmatise(self, word):
        """This method returns the lemmatised form of the word if it has lemmatised form, otherwise returns
        the word as it is."""

        if word != "":

            return self.gn.lemmatise(word)

    def getSynonyms(self, word):
        """This method returns the written representation(orth) of all the possible synsets of the given word."""

        synonymSynset = []

        synonymWords = set()

        for lemmatisedWord in self.lemmatise(word):
            synsets = self.gn.synsets(str(lemmatisedWord).capitalize())

            for synset in synsets:
                synonymSynset.append(synset)

        for synset in synonymSynset:
            for lemma in synset.lemmas:

                synonymWords.add(lemma.orthForm.strip())

        return synonymWords
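A brief usage sketch for the class above (the word "Haus" is arbitrary, and the exact set of orthographic forms returned depends on the installed GermaNet version):

germanet = Germanet()
synonyms = germanet.getSynonyms("Haus")
print(synonyms)   # a set of orthForm strings drawn from every synset of "Haus"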
Example No. 8
from pygermanet import load_germanet
import numpy as np
from germanet.tree import Tree

germanet = load_germanet()
num_nodes = 0
words = {}
leaf_nodes = []
errors = 0


def __load_tree(file, log):
    """
    Creates and fills tree from the input file.

    :param file: file containing word-sense parents and their children
    :param log: log file
    """
    global words, errors

    tree = Tree()
    with open(file, 'r') as f:

        for line in f:
            parts = line.split()
            parent = parts[0]
            children = parts[1:] if len(parts) > 1 else None

            # validation step 1: check for duplicate nodes
            if parent in words:
                if words[parent] >= 2:
                    log.write("validation error: synset '" + parent +
Example No. 9
import configparser
import sys

from nltk.corpus import stopwords
from nltk.data import load
from nltk.tokenize import RegexpTokenizer
from pygermanet import load_germanet
from modules import file_reader as fr

########################
# GLOBAL FILE SETTINGS
########################
config = configparser.ConfigParser(
    interpolation=configparser.ExtendedInterpolation())
config.read('../config.ini')

########################
# GermaNet & WordNet
########################
try:
    ger = load_germanet()
except Exception:
    print(
        'Error! Please start mongodb on GermaNet xml files: mongod --dbpath ./mongodb or refer to README.md'
    )
    sys.exit()

# Tokenizer
sent_tok = load('tokenizers/punkt/german.pickle')
word_tok = RegexpTokenizer(r'\w+')

# Filter stopwords
german_stopwords = stopwords.words('german')
german_stopwords.extend(('dass', 'bzw', 'p', 'http', '0', '1', '2', '3', '4'))
stop_words = set(german_stopwords)
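A small, hypothetical snippet showing how the objects configured above are typically combined; the sample sentence is made up and the printed lemmas depend on the loaded GermaNet data:

sample = "Die Katze schläft im Haus. Der Hund bellt."
for sentence in sent_tok.tokenize(sample):
    tokens = [t for t in word_tok.tokenize(sentence) if t.lower() not in stop_words]
    lemmas = [ger.lemmatise(t)[0] for t in tokens]
    print(lemmas)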
Example No. 10
def analyzeTextCohesion(text):
    """Analyzed the cohesion of a txt.
    Args:
        text (String) - A string that is Analyzed
    Returns:
        Array - An array of word pairs
    """

    # Check if text is string or unicode
    # if type(text) is not str:
    #     raise TypeError('you did not pass a string as argument')
    #

    # Remove percent sign
    text = re.sub(r'%', '', text)
    text = re.sub(r'“', '', text)
    text = re.sub(r'–', '', text)
    text = re.sub(r'„', '', text)
    text = re.sub(r'ca\.', '', text)
    text = re.sub(r'Dr\.', 'Doktor', text)
    text = re.sub(r'St\.', 'Sankt', text)
    text = re.sub(r'bzw\.', 'beziehungsweise', text)
    text = re.sub(r'[zZ]\. ?[bB]\.', 'zum Beispiel', text)
    text = re.sub(r'usw\.', 'und so weiter', text)

    # Split text by line breaks
    paragraph_split = text.split('[LINEBREAK]')

    # Remove brackets and parenthesis from text
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)

    # Remove trailing white space
    text = text.strip()

    # If text doesn't end with a dot, fill it in
    if not text[-1:] in ['.', '!', '?']:
        text += '.'

    ############################################################################
    # Tag text
    ############################################################################
    # Save text to file
    f = open(constants.temp_text, 'w')
    f.write(text.encode('utf-8'))
    f.close()

    # Tokenize
    f = open(constants.temp_tokens, 'w')
    subprocess.call([constants.tokenizer, constants.temp_text], \
        stdout=f, shell=False)
    f.close()

    # Tag Tokens from temp_tokens
    f = open(constants.temp_tags, 'w')
    subprocess.call([constants.rftagger, constants.german_par, \
        constants.temp_tokens], stdout=f, shell=False)
    f.close()

    # Read tags from file
    f = open(constants.temp_tags, 'r')
    tags = f.readlines()
    f.close()

    # Split tags in array
    tags = [str.split(tag, '\t') for tag in tags]

    # Remove the last entry: it is only a \n character and can
    # be ignored. It is a peculiarity of the RFTagger output.
    tags.pop()

    # Remove \n from end of tag
    tags = [[tag[0].decode('utf-8'), tag[1][:-1]] for tag in tags]

    ############################################################################
    # Further processing
    ############################################################################

    # Load germanet
    gn = load_germanet()

    # Lemmatise all words
    tags = [{
        'orth': tag[0],
        'lemma': gn.lemmatise(tag[0])[0],
        'pos': tag[1]
    } for tag in tags]

    # Filter only relevant tags: Verbs, Nouns, Pronouns
    regex = re.compile(
        r'.*N.Name.*|.*N.Reg.*|.*SYM.Pun.Sent.*|.*VFIN.*|.*PRO.Pers.*|.*PRO.Dem'
    )

    # Filtered tags
    tags = [tag for tag in tags if regex.match(tag['pos']) is not None]

    # Get specific elements of words
    tags = getPOSElement('singular', r'.*Sg', tags)
    tags = getPOSElement('accusative', r'.*N.(Reg|Name).Acc', tags)
    tags = getPOSElement('dative', r'.*N.(Reg|Name).Dat', tags)
    tags = getPOSElement('nominative', r'.*N.(Reg|Name).Nom', tags)
    tags = getPOSElement('genitive', r'.*N.(Reg|Name).Gen', tags)
    tags = getPOSElement('feminin', r'.*Fem', tags)
    tags = getPOSElement('neutrum', r'.*Neut', tags)
    tags = getPOSElement('noun', r'.*N.Name.*|.*N.Reg', tags)
    tags = getPOSElement('pronoun', r'.*PRO.Dem.*|.*PRO.Pers', tags)
    tags = getPOSElement('verb', r'.*VFIN', tags)

    # Get sentences
    sentences = []
    sentenceArray = []

    for word in tags:
        if word['pos'] != 'SYM.Pun.Sent':
            sentenceArray.append(word)
        else:
            sentences.append(sentenceArray)
            sentenceArray = []

    ############################################################################
    # Build word pairs
    ############################################################################

    # Init word pairs array
    word_pairs = []

    # Build lexical overlap word pairs
    for val, sentence in enumerate(sentences):
        # Get all nouns
        nouns = [word['lemma'] for word in sentence if word['noun']]
        nouns_full = [word for word in sentence if word['noun']]
        nominatives = filter(lambda x: x['nominative'], sentence)

        # There is only one noun in the current sentence
        if len(nouns) == 1:
            # Append lonely noun
            word_pairs.append({
                'source': {
                    'word': nouns_full[0]['orth'],
                    'lemma': nouns_full[0]['lemma'],
                    'sentence': val
                },
                'target': {
                    'word': nouns_full[0]['orth'],
                    'lemma': nouns_full[0]['lemma'],
                    'sentence': val
                },
                'device': 'single word'
            })

        # There are at least two nouns in the sentence
        elif len(nouns) > 1:
            # There is a nominative among the nouns
            if len(nominatives) > 0:
                # Loop over every combination of nouns in current sentence
                for subset in itertools.combinations_with_replacement(
                        nouns_full, 2):
                    if subset[0] != subset[1]:
                        # Check if first word is nominative
                        if subset[0]['nominative']:
                            # Only combine nominatives with accusative, dative
                            # and genitive
                            if subset[1]['accusative'] or subset[1]['dative'] or \
                                subset[1]['genitive'] or subset[1]['nominative']:
                                # Append word pairs
                                word_pairs.append({
                                    'source': {
                                        'word': subset[0]['orth'],
                                        'lemma': subset[0]['lemma'],
                                        'sentence': val
                                    },
                                    'target': {
                                        'word': subset[1]['orth'],
                                        'lemma': subset[1]['lemma'],
                                        'sentence': val
                                    },
                                    'device': 'within sentence'
                                })
                        # Check if second word is nominative
                        if subset[1]['nominative']:
                            # Only combine nominatives with accusative, dative,
                            # and genitive
                            if subset[0]['accusative'] or subset[0]['dative'] or \
                                subset[0]['genitive'] or subset[0]['nominative']:
                                # Append word pairs
                                word_pairs.append({
                                    'source': {
                                        'word': subset[1]['orth'],
                                        'lemma': subset[1]['lemma'],
                                        'sentence': val
                                    },
                                    'target': {
                                        'word': subset[0]['orth'],
                                        'lemma': subset[0]['lemma'],
                                        'sentence': val
                                    },
                                    'device': 'within sentence'
                                })
            # There are no nominatives in the sentence
            else:
                # Loop over every combination of nouns in current sentence
                for subset in itertools.combinations_with_replacement(
                        nouns_full, 2):
                    if subset[0] != subset[1]:
                        # Combine accusative with dative
                        if subset[0]['accusative'] and subset[1]['dative'] and \
                           subset[0]['genitive']:
                            # Append word pairs
                            word_pairs.append({
                                'source': {
                                    'word': subset[0]['orth'],
                                    'lemma': subset[0]['lemma'],
                                    'sentence': val
                                },
                                'target': {
                                    'word': subset[1]['orth'],
                                    'lemma': subset[1]['lemma'],
                                    'sentence': val
                                },
                                'device': 'within sentence'
                            })
                        elif subset[1]['accusative'] and subset[0]['dative'] and \
                             subset[1]['genitive']:
                            # Append word pairs
                            word_pairs.append({
                                'source': {
                                    'word': subset[0]['orth'],
                                    'lemma': subset[0]['lemma'],
                                    'sentence': val
                                },
                                'target': {
                                    'word': subset[1]['orth'],
                                    'lemma': subset[1]['lemma'],
                                    'sentence': val
                                },
                                'device': 'within sentence'
                            })

    # Get hypernym hyponym pairs
    hyponym_hyper_pairs = []

    # Get coreference resolutions
    coreferences = []

    # Get compounds
    compounds = []

    # Get stem relations
    stem_relations = []

    # Get hypernym hyponym pairs
    # hyponym_hyper_pairs = getHypoHyperPairs(sentences, gn)

    # Get coreference resolutions
    # coreferences = get_coreferences(sentences, gn)

    # Get compounds
    # compounds = get_compounds(sentences)

    # Get stem relations
    # stem_relations = get_stem_relations(sentences, gn)

    # Merge all word pairs
    # word_pairs = word_pairs + hyponym_hyper_pairs + coreferences + compounds + \
    #     stem_relations

    ######################################
    # Calculate number of relations
    ######################################

    word_tuples = list(
        set([(pair['source']['lemma'], pair['target']['lemma'])
             for pair in word_pairs
             if pair['source']['lemma'] != pair['target']['lemma']]))

    # Calc number of sentences
    num_sentences = len(sentences)

    # Calculate local cohesion
    local_cohesion = calc_local_cohesion(word_pairs, sentences)

    # Calculate clusters
    cluster = get_clusters(word_pairs, sentences)

    # When clusters are calculated assign them to the word_pairs as
    # an additional value
    word_cluster_index = {}
    for index, single_cluster in enumerate(cluster):
        # Get words for current cluster
        source_words = map(lambda x: x['source']['lemma'], single_cluster)
        target_words = map(lambda x: x['target']['lemma'], single_cluster)

        # Concatenate sources and targets in to one array
        words = source_words + target_words

        # Assign index to word_cluster_index dict
        for word in words:
            word_cluster_index[word] = index

    # Now that we have the indexes for each cluster we can assign the index
    # to the word_pairs
    for word_pair in word_pairs:
        word_pair['cluster'] = word_cluster_index[word_pair['source']['lemma']]

    # Get dictionary of orthographic forms of all lemmas
    word_lemma_mapping = get_lemma_mapping(word_pairs)

    # Prepare data for frontend
    links = [{
        'source': pair['source']['lemma'],
        'target': pair['target']['lemma'],
        'device': pair['device'],
        'cluster': pair['cluster']
    } for pair in word_pairs]
    nodes = [{
        'id': word,
        'index': ind
    } for ind, word in enumerate(word_lemma_mapping['lemma_word'])]

    # Get number of concepts
    num_concepts = len(
        set([concept['lemma'] for concept in tags if concept['noun']]))

    # Generate html string for editor
    html_string = generateHTML(paragraph_split, word_lemma_mapping,
                               word_cluster_index)

    return {
        'word_pairs': word_pairs,
        'links': links,
        'nodes': nodes,
        'numSentences': num_sentences,
        'numConcepts': num_concepts,
        'clusters': cluster,
        'numRelations': len(word_tuples),
        'numCluster': len(cluster),
        'local cohesion': local_cohesion['local_cohesion'],
        'cohSentences': local_cohesion['cohSentences'],
        'cohNotSentences': local_cohesion['cohNotSentences'],
        'lemmaWordRelations': word_lemma_mapping['lemma_word'],
        'wordLemmaRelations': word_lemma_mapping['word_lemma'],
        'wordClusterIndex': word_cluster_index,
        'numCompounds': len(compounds),
        'numCoreferences': len(coreferences),
        'numStemRelations': len(stem_relations),
        'numHypoHyper': len(hyponym_hyper_pairs),
        'html_string': html_string
    }
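analyzeTextCohesion calls several project helpers that are not shown (getPOSElement, calc_local_cohesion, get_clusters, get_lemma_mapping, generateHTML). Only getPOSElement is simple enough to infer from its call sites; a hedged reconstruction that flags each tag dict with a boolean based on a POS regex could look like this:

import re

def getPOSElement(element, regex, tags):
    # Hypothetical helper: set e.g. tag['noun'] = True/False depending on
    # whether the tag's POS string matches the given pattern.
    pattern = re.compile(regex)
    for tag in tags:
        tag[element] = pattern.match(tag['pos']) is not None
    return tags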
Example No. 11
from rest_framework.response import Response
from rest_framework.decorators import api_view
from pygermanet import load_germanet
from django.conf import settings

gn_host = settings.MONGO_SETTINGS['host']
gn_port = settings.MONGO_SETTINGS['port']

gn = load_germanet(host=gn_host, port=gn_port)


@api_view()
def synset(request):
    """
    get:
    Expects a `token` parameter (e.g. ?token=flog) which will be checked against germanet.

    """
    token = request.GET.get('token')
    enriched = {}
    if token:
        lemma = gn.lemmatise("{}".format(token))
        synsets = []
        if len(lemma) > 0:
            for x in lemma:
                for y in gn.synsets("{}".format(x)):
                    synsets.append(y)
        else:
            # fall back to the raw token if lemmatisation returned nothing
            for y in gn.synsets("{}".format(token)):
                synsets.append(y)
        synonyms = []
Example No. 12
from os.path import join, exists
from time import time

import numpy as np
from pygermanet import load_germanet, Synset
from tqdm import tqdm

from constants import LDA_PATH
from evaluate_topics import parse_args
from utils import load, init_logging, log_args

np.set_printoptions(precision=3)
gn = load_germanet()
tqdm.pandas()


def orth(synset):
    return synset.lemmas[0].orthForm


def compare_synset_lists(synset_list1, synset_list2, sim_func, agg_func):
    try:
        return agg_func(
            sim_func(ss1, ss2) for ss1 in synset_list1 for ss2 in synset_list2)
    except ValueError:
        return np.nan


def similarities(topic,
                 topn,
                 ignore_unknown=True,
Example No. 13
    def get_tags(self):
        """
        Generates tags from a string.
        Takes a text as input and extracts nominatives using RFTagger.
        Args:
            None
        Returns:
            List with tags
        """

        # Create directory temp if not existent
        if not os.path.exists(constants.temp_dir):
            os.makedirs(constants.temp_dir)

        # Create random string
        rand_string = ''.join(
            random.choice(string.ascii_lowercase + string.digits)
            for _ in range(15))

        # Path for text files
        tokens = constants.temp_tokens + "_" + rand_string + ".txt"
        curr_text = constants.temp_text + "_" + rand_string + ".txt"

        # Save text to file
        f = open(curr_text, 'w')
        f.write(self.text)
        f.close()

        # Tokenize
        f = open(tokens, 'w')
        subprocess.call([constants.tokenizer, curr_text],
                        stdout=f,
                        shell=False)
        f.close()

        # Tag Tokens from temp_tokens
        f = open(constants.temp_tags + "_" + rand_string + ".txt", 'w')
        subprocess.call([constants.rftagger, constants.german_par, tokens],
                        stdout=f,
                        shell=False)
        f.close()

        # Read tags from file
        f = open(constants.temp_tags + "_" + rand_string + ".txt", 'r')
        tags = f.readlines()
        f.close()

        # Regular Expression
        # regex = re.compile(r'.*N.Name.*|.*N.Reg.*|.*SYM.Pun.Sent')

        # # Filtered tags
        # filtered_tags = [regex.match(tag).string for tag in tags
        #                  if regex.match(tag) is not None]

        # Split tags into lists
        splited_tags = [str.split(tag, '\t') for tag in tags]

        # Load germanet
        g = load_germanet()

        # Build Lemmas
        splited_tags_lemma = [[
            g.lemmatise(tag[0].decode('utf-8'))[0], tag[0], tag[1]
        ] for tag in splited_tags[:-1]]

        # Keep the lemmatised tags
        tags = splited_tags_lemma

        # Remove files
        os.remove(curr_text)
        os.remove(tokens)
        os.remove(constants.temp_tags + "_" + rand_string + ".txt")

        return splited_tags_lemma