# processExample, processSyntax, processDescription and the katum objects
# (syntax, semantics, syntaxFramesKatum, ...) are defined in the surrounding module.
from nltk.corpus import verbnet as vn


def processClass(katumClass, classID):
    frames = vn.frames(classID)
    for frame in frames:
        syntaxFrames = frame['syntax']
        semanticsFrames = frame['semantics']
        exampleKatum = processExample(frame['example'], katumClass)
        if syntaxFrames:
            syntaxInstance = syntax.get(syntax.countI)
            exampleKatum._is(syntaxInstance, False)
            for syntaxFrame in syntaxFrames:
                syntaxFramesInstance = syntaxFramesKatum.get(
                    syntaxFramesKatum.countI)
                syntaxInstance._is(syntaxFramesInstance, False)
                processSyntax(syntaxFramesInstance, syntaxFrame,
                              syntaxArguments)
        if semanticsFrames:
            semanticsInstance = semantics.get(semantics.countI)
            exampleKatum._is(semanticsInstance, False)
            for semanticsFrame in semanticsFrames:
                predicateVal = semanticsFrame.get('predicate_value')
                predicateKatum = predicateValue.get(predicateVal)
                numPredicateKatum = predicateKatum.get(predicateKatum.countI)
                semanticsFramesInstance = semanticsFramesKatum.get(
                    semanticsFramesKatum.countI)
                semanticsInstance._is(semanticsFramesInstance, False)
                semanticsFramesInstance._is(numPredicateKatum, False)
                for argument_ in semanticsFrame.get('arguments'):
                    argumentType = semanticsArguments.get(
                        argument_.get('type'))
                    argumentValue = argumentType.get(argument_.get('value'))
                    numPredicateKatum._is(argumentValue, False)
        processDescription(frame['description'], exampleKatum)
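
For reference, a minimal sketch of the frame structure processClass walks, assuming the standard NLTK VerbNet reader; the keys match those used above:

from nltk.corpus import verbnet as vn

frame = vn.frames(vn.classids('mix')[0])[0]
print(frame['example'])                  # example sentence
print(frame['description']['primary'])   # e.g. 'NP V NP'
for item in frame['syntax']:
    print(item['pos_tag'])
for pred in frame['semantics']:
    print(pred['predicate_value'], pred['arguments'])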
Example #2
    def test(self):
        # strip_roles, syntax_to_primary and classid_to_number are helpers
        # defined elsewhere in the test module; LooseVersion comes from
        # distutils.version.
        skips = [
            'Eggs and cream mix well together.',
            'The eggs and the cream mixed together.'
        ]
        warnings.simplefilter("ignore", ResourceWarning)
        classid_list = sorted(verbnet.classids(),
                              key=lambda c: LooseVersion(classid_to_number(c)))

        i = 0
        for classid in classid_list:
            for vn_frame in verbnet.frames(classid):
                text = vn_frame['frame'].find('EXAMPLES/EXAMPLE').text
                with self.subTest(i=i, msg='{}: {}'.format(classid, text)):
                    if text in skips:
                        continue
                    syntax = vn_frame['frame'].find('SYNTAX')
                    wanted_primary = strip_roles(
                        vn_frame['frame'].find('DESCRIPTION').get('primary'))
                    converted_primary = ' '.join(
                        [phrase for phrase, role in syntax_to_primary(syntax)])

                    self.assertEqual(wanted_primary, converted_primary)
                i += 1

        print('Total : {}'.format(i))
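
classid_to_number is one of the project helpers that is not shown; a plausible sketch, assuming VerbNet class ids of the form 'mix-22.1-1':

def classid_to_number(classid):
    # hypothetical helper: 'mix-22.1-1' -> '22.1-1', which LooseVersion
    # can then order numerically
    return classid.split('-', 1)[1]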
Example #3
def is_word_in_verb_frames(verb, word):
    # True if `word` occurs in the example sentence of any VerbNet frame
    # belonging to a class that lists `verb`.
    classids = vn.classids(verb)
    frames = [frame for cid in classids for frame in vn.frames(cid)]
    for frame in frames:
        if word.lower() in frame['example'].lower().replace('.', '').split():
            return True
    return False
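
A usage sketch; the result depends on the installed VerbNet data ('eggs' appears in the mix-22.1 example sentences quoted in the test above):

from nltk.corpus import verbnet as vn

print(is_word_in_verb_frames('mix', 'eggs'))  # expected: True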
Example #4
def is_transitive(lemma):
    # Note: only the first frame of the first VerbNet class is inspected;
    # the original draft looped over every frame instead.
    try:
        cids = verbnet.classids(lemma)
        frames = verbnet.frames(verbnet.vnclass(cids[0]))
        return "Transitive" in frames[0]['description']['primary']
    except Exception:
        return False
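
A variant (hypothetical name) that restores the every-frame scan the original draft gestured at, checking all classes rather than just the first:

def is_transitive_any(lemma):
    try:
        return any("Transitive" in frame['description']['primary']
                   for cid in verbnet.classids(lemma)
                   for frame in verbnet.frames(cid))
    except Exception:
        return False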
Example #5
def get_transitivity(verb):
    """
    Take a verb lemma as input.
    Return a transitivity score and the matching VerbNet (VN) frames, if any.

    The returned tuple is constructed in the following way:
        - the first element is the transitivity score, where
          1 means transitive and 0 means intransitive (at least according to VN);
        - the second element is a list of tuples, each of which consists of,
          first, the VN class_id of a given meaning of the verb and,
          second, the corresponding frame itself.

    Regardless of the length of the transitive-frames list,
    the transitivity score remains the same.
    """

    class_ids = vn.classids(verb)
    print(class_ids)

    # Collect the frames that carry transitive meanings of the given verb.
    trans_frames = []
    for class_id in class_ids:
        for frame in vn.frames(class_id):
            print(frame["description"]["primary"])
            if frame["description"]["primary"] == "NP V NP":
                trans_frames.append((class_id, frame))
            # Looser matches considered in earlier drafts:
            # elif "NP V NP" in frame["description"]["primary"]: ...
            # elif "Transitive" in frame["description"]["secondary"]: ...

    # A score of 1 means the verb has at least one transitive meaning.
    trans_score = 1 if trans_frames else 0

    return trans_score, trans_frames
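
A usage sketch; the printed class ids and frames depend on the installed VerbNet version:

score, frames = get_transitivity('break')
print(score)  # 1 if an 'NP V NP' frame was found, else 0
for class_id, frame in frames:
    print(class_id, frame['description']['primary'])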
Example #6
# extract_phrase, tree_str, before_verb, after_verb and nlp come from
# earlier in the original script.
nps = extract_phrase(tree_str, 'NP')
vps = extract_phrase(tree_str, 'VP')
pps = extract_phrase(tree_str, 'PP')

if before_verb in nps:
    print("YES BEFORE VERB")

if after_verb in nps:
    print("YES AFTER VERB")

print(nps)
print(vps)
print(pps)

for np in nps:
    print(np)

print("=============")

word = "come"
vn_results = vn.classids(lemma=word)
print(vn_results)

frame = vn.frames('51.2')[0]  # first frame of VerbNet class 51.2

syntax = frame['syntax']
for item in syntax:
    print(item['pos_tag'])
    print("=====================")
nlp.close()
Example #7
def process(text: str, params: dict) -> OrderedDict:
    """Process provided text"""

    # set JSON-NLP
    j: OrderedDict = base_document()
    t: OrderedDict = base_nlp_json()
    t['DC.source'] = 'NLTK {}'.format(__version__)
    t['documents'].append(j)
    j['text'] = text

    # collect parsers
    lemmatizer = get_lemmatizer()
    tokenizer = get_tokenizer(params)
    sentence_tokenizer = get_sentence_tokenizer()
    stemmer = get_stemmer()
    parser = get_parser()
    language = Counter()

    # tokenize and tag
    tokens: List[str] = tokenizer.tokenize(text)
    tokens_tagged: List[tuple] = nltk.pos_tag(tokens)
    conll_tagged = tree2conlltags(ne_chunk(tokens_tagged))

    offset_list: List[Tuple[int, int]] = list(tokenizer.span_tokenize(text))

    token_list: List[dict] = []
    for token_idx, token_tuple in enumerate(tokens_tagged):
        token = token_tuple[0]
        pos_tag = token_tuple[1]
        wordnet_pos = get_wordnet_pos(pos_tag)
        entity_tag = conll_tagged[token_idx][2].split("-")

        if wordnet_pos != '':
            synsets = wordnet.synsets(token, pos=wordnet_pos)
        else:
            synsets = wordnet.synsets(token)
        sys_id = 0
        sys_list = []
        for syn in synsets:
            s_hypo = {x.lemma_names()[0] for x in syn.hyponyms()}
            s_hyper = {x.lemma_names()[0] for x in syn.hypernyms()}
            s_examples = list(syn.examples())

            s = {
                'wordnet_id': syn.name(),
                'id': sys_id,
                'synonym': syn.lemma_names()[1:],
                'hyponym': list(s_hypo),
                'hypernym': list(s_hyper),
                'examples': s_examples,
                'definition': syn.definition()
            }

            if len(s['synonym']) == 0: s.pop('synonym')
            if len(s['hyponym']) == 0: s.pop('hyponym')
            if len(s['hypernym']) == 0: s.pop('hypernym')
            if len(s['examples']) == 0: s.pop('examples')
            if len(s['definition']) == 0: s.pop('definition')

            if s:
                sys_list.append(s)
            sys_id += 1

        verb_list = []
        vn_classids = vn.classids(token)
        for classid in vn_classids:
            verb_list.append({
                'class_id': classid,
                'frames': vn.frames(classid)
            })

        # Use a distinct name so the JSON-NLP header object `t` (see above)
        # is not clobbered by the per-token dict.
        tok = {
            'id': token_idx,
            'text': token,
            'lemma': lemmatizer(token, wordnet_pos) if wordnet_pos else lemmatizer(token),
            'stem': stemmer(token),
            'pos': pos_tag,
            'entity': entity_tag[1] if len(entity_tag) > 1 else "",
            'entity_iob': entity_tag[0],
            'overt': True,
            'characterOffsetBegin': offset_list[token_idx][0],
            'characterOffsetEnd': offset_list[token_idx][1],
            'synsets': sys_list,
            'verbnet': verb_list
        }
        if not tok['synsets']: tok.pop('synsets')
        if not tok['verbnet']: tok.pop('verbnet')
        token_list.append(tok)

    j['tokenList'] = token_list

    # sentence and dependency parsing
    sent_list = []
    token_from = 0
    sentence_tokens = sentence_tokenizer.sentences_from_tokens(tokens)
    sentence_texts = sentence_tokenizer.sentences_from_text(text)

    # check whether MALT parser is loaded! DC
    if parser:
        for sent_idx, sent in enumerate(zip(sentence_tokens, sentence_texts)):
            # Detecting language of each sentence
            la = pycountry.languages.get(alpha_2=detect(sent[1]))
            token_to = token_from + len(sent[0]) - 1
            dg = parser.parse_one(sent[1].split())
            s = {
                'id': sent_idx,
                'text': sent[1],
                'tokenFrom': token_from,
                'tokenTo': token_to,
                'tokens': list(range(token_from, token_to + 1))  # inclusive of token_to
            }

            for token in dg.nodes:
                head = dg.nodes[token]['head']
                head_word = [
                    dg.nodes[i]['word'] for i in dg.nodes
                    if dg.nodes[i]['address'] == head
                ]
                j['dependenciesBasic'].append({
                    'governor': head_word[0] if head_word else 'null',
                    'dependent': dg.nodes[token]['word'],
                    'type': dg.nodes[token]['rel']
                })
                # drop entries with a missing governor, dependent or relation
                dep = j['dependenciesBasic'][-1]
                if 'null' in (dep['governor'], dep['dependent'], dep['type']):
                    j['dependenciesBasic'].pop()
            token_from = token_to + 1  # the next sentence starts after the last token
            language[la.name] += 1
            sent_list.append(s)
        j['sentences'] = sent_list

    if params['language']:
        t['DC.language'] = params['language']
    else:
        # only if language has some elements can we check for the most common! DC
        if len(token_list) > 4 and language:
            t['DC.language'] = language.most_common(1)[0][0]
        else:
            t['DC.language'] = ''

    # TODO:
    # 1. Schema: clauses, coreferences, constituents, expressions, paragraphs
    # 2. fields: token: sentiment, embeddings; sentence: sentiment, complex, type, embeddings

    return j
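
A hypothetical invocation; the exact keys get_tokenizer expects from params depend on the surrounding module, but 'language' is read directly above:

doc = process('Maria mixed the eggs and the cream.', {'language': 'en'})
for tok in doc['tokenList']:
    print(tok['text'], tok['pos'])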
Example #8
    def process(text='',
                lang='en',
                coreferences=False,
                constituents=False,
                dependencies=False,
                expressions=False,
                **kwargs) -> OrderedDict:
        # build nlp-json
        j: OrderedDict = get_base()
        j['meta']['DC.language'] = lang
        d: OrderedDict = get_base_document(1)
        #j['documents'][d['id']] = d
        j['documents'].append(d)
        d['meta']['DC.source'] = 'NLTK {}'.format(nltk_version)
        j['meta']['DC.language'] = lang
        d['text'] = text

        # collect parsers
        lemmatizer = get_lemmatizer()
        stemmer = get_stemmer()

        # tokenization and pos
        words = []
        for sent in segment(text):
            for token in sent:
                words.append(token.value)

        # create the token list
        t_id = 1
        for word, xpos in pos_tag(words):
            wordnet_pos = get_wordnet_pos(xpos)
            lemma = lemmatizer(word, pos=wordnet_pos)

            # start the token
            t = {'id': t_id, 'text': word, 'stem': stemmer(word)}
            #d['tokenList'][t['id']] = t
            d['tokenList'].append(t)
            t_id += 1

            # wordnet
            try:
                synsets = wordnet.synsets(lemma, pos=wordnet_pos)
                senses = {}
                for s in synsets:
                    hyponyms = [
                        y for x in s.hyponyms() for y in x.lemma_names()
                    ]
                    hypernyms = [
                        y for x in s.hypernyms() for y in x.lemma_names()
                    ]
                    synonyms = s.lemma_names()[1:]
                    examples = s.examples()
                    sense = {
                        'wordnetId': s.name(),
                        'definition': s.definition()
                    }
                    if synonyms:
                        sense['synonyms'] = synonyms
                    if hypernyms:
                        sense['hypernyms'] = hypernyms
                    if hyponyms:
                        sense['hyponyms'] = hyponyms
                    if examples:
                        sense['examples'] = examples

                    antonyms = []
                    for lem in s.lemmas():
                        for a in lem.antonyms():
                            antonyms.append(a.name())
                    if antonyms:
                        sense['antonyms'] = antonyms

                    senses[sense['wordnetId']] = sense

                if senses:
                    t['synsets'] = senses
            except Exception:
                pass

            # verbnet
            try:
                verbs = {
                    class_id: {
                        'classId': class_id,
                        'frames': vn.frames(class_id)
                    }
                    for class_id in vn.classids(word)
                }

                if verbs:
                    t['verbFrames'] = verbs
            except Exception:
                pass

            # framenet
            try:
                frame_net = {}
                frames = invoke_frame(word)
                if frames is not None:
                    for fr in frames:
                        lu_temp = []
                        for lu in fn.lus(r'(?i)' + word.lower()):
                            fr_ = fn.frames_by_lemma(r'(?i)' + lu.name)
                            if len(fr_):
                                if fr_[0] == fr:
                                    lu_temp.append({
                                        'name': lu.name,
                                        'definition': lu.definition,
                                        'pos': lu.name.split('.')[1]
                                    })
                        frame_net[fr.ID] = {
                            'name': fr.name,
                            'frameId': fr.ID,
                            'definition': fr.definition,
                            # 'relations':fr.frameRelations,
                            'lu': lu_temp
                        }
                if frame_net:
                    t['frames'] = frame_net
            except Exception:
                pass

        return remove_empty_fields(j)
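
A minimal usage sketch, assuming process is exposed as a static entry point (the enclosing class is not shown) and that remove_empty_fields keeps the populated keys:

result = process('The chef mixed the eggs and the cream.')
print(result['documents'][0]['tokenList'])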