def processClass(katumClass, classID):
    # Walk every VerbNet frame of the class and attach its example,
    # syntax, and semantics to the katum graph.
    frames = vn.frames(classID)
    for frame in frames:
        syntaxFrames = frame['syntax']
        semanticsFrames = frame['semantics']
        exampleKatum = processExample(frame['example'], katumClass)

        if len(syntaxFrames) > 0:
            syntaxInstance = syntax.get(syntax.countI)
            exampleKatum._is(syntaxInstance, False)
            for syntaxFrame in syntaxFrames:
                syntaxFramesInstance = syntaxFramesKatum.get(
                    syntaxFramesKatum.countI)
                syntaxInstance._is(syntaxFramesInstance, False)
                processSyntax(syntaxFramesInstance, syntaxFrame, syntaxArguments)

        if len(semanticsFrames) > 0:
            semanticsInstance = semantics.get(semantics.countI)
            exampleKatum._is(semanticsInstance, False)
            for semanticsFrame in semanticsFrames:
                predicateVal = semanticsFrame.get('predicate_value')
                predicateKatum = predicateValue.get(predicateVal)
                numPredicateKatum = predicateKatum.get(predicateKatum.countI)
                semanticsFramesInstance = semanticsFramesKatum.get(
                    semanticsFramesKatum.countI)
                semanticsInstance._is(semanticsFramesInstance, False)
                semanticsFramesInstance._is(numPredicateKatum, False)
                for argument_ in semanticsFrame.get('arguments'):
                    argumentType = semanticsArguments.get(argument_.get('type'))
                    argumentValue = argumentType.get(argument_.get('value'))
                    numPredicateKatum._is(argumentValue, False)

        processDescription(frame['description'], exampleKatum)

def test(self):
    skips = [
        'Eggs and cream mix well together.',
        'The eggs and the cream mixed together.',
    ]
    warnings.simplefilter("ignore", ResourceWarning)
    classid_list = sorted(verbnet.classids(),
                          key=lambda c: LooseVersion(classid_to_number(c)))
    i = 0
    for classid in classid_list:
        for vn_frame in verbnet.frames(classid):
            text = vn_frame['frame'].find('EXAMPLES/EXAMPLE').text
            with self.subTest(i=i, msg='{}: {}'.format(classid, text)):
                if text in skips:
                    continue
                syntax = vn_frame['frame'].find('SYNTAX')
                wanted_primary = strip_roles(
                    vn_frame['frame'].find('DESCRIPTION').get('primary'))
                converted_primary = ' '.join(
                    [phrase for phrase, role in syntax_to_primary(syntax)])
                self.assertEqual(wanted_primary, converted_primary)
            i += 1
    print('Total : {}'.format(i))

def is_word_in_verb_frames(verb, word):
    """Return True if `word` occurs in any VerbNet example sentence for `verb`."""
    classids = vn.classids(verb)
    frames = [frame for cid in classids for frame in vn.frames(cid)]
    for frame in frames:
        if word.lower() in frame['example'].lower().replace('.', '').split(' '):
            return True
    return False

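# Hedged usage sketch for is_word_in_verb_frames: it assumes `vn` above is
# nltk.corpus.verbnet and that the VerbNet corpus data has been downloaded
# (nltk.download('verbnet')). The verb/word pair below is illustrative; the
# result depends on the example sentences in the installed VerbNet version.
from nltk.corpus import verbnet as vn

print(is_word_in_verb_frames('give', 'book'))  # True only if 'book' occurs in an example for 'give'
print(is_word_in_verb_frames('give', 'zzz'))   # expected False
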
def is_transitive(lemma):
    try:
        cids = verbnet.classids(lemma)
        frames = verbnet.frames(verbnet.vnclass(cids[0]))
        # Only the primary description of the first frame of the first class
        # is checked; the verb's other frames are ignored.
        return "Transitive" in frames[0]['description']['primary']
    except Exception:
        return False

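# Hedged usage sketch for is_transitive: assumes `verbnet` above is
# nltk.corpus.verbnet and the corpus data is available. The verbs are
# illustrative; the output depends on the first class VerbNet lists for each.
from nltk.corpus import verbnet

for verb in ('hit', 'sleep', 'arrive'):
    print(verb, is_transitive(verb))
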
def get_transitivity(verb):
    """
    Take a verb lemma as input and return its transitivity score together
    with the VerbNet (VN) frames, if available. The returned tuple is
    constructed as follows:
      - the first element is the transitivity score, where
        1 means transitive and 0 means intransitive (at least according to VN);
      - the second element is a list of tuples, each consisting of
        the VN class_id of a given meaning of the verb and
        the corresponding frame itself.
    Regardless of the length of the transitive frames list, the transitivity
    score remains the same.
    """
    class_ids = vn.classids(verb)
    print(class_ids)
    # Collect the frames with transitive meanings of the given verb.
    # (Looser matches, e.g. "NP V NP" as a substring of the primary
    # description or "Transitive" in the secondary description, are
    # deliberately not used here.)
    trans_frames = []
    for class_id in class_ids:
        frames = vn.frames(class_id)
        for frame in frames:
            print(frame["description"]["primary"])
            if frame["description"]["primary"] == "NP V NP":
                entry = class_id, frame
                trans_frames.append(entry)

    # A score of 1 means the verb has at least one transitive meaning.
    trans_score = 1 if len(trans_frames) != 0 else 0
    return trans_score, trans_frames

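# Hedged usage sketch for get_transitivity: assumes `vn` above is
# nltk.corpus.verbnet with the corpus data downloaded. 'hit' is an
# illustrative verb; class ids and frames vary with the VerbNet version.
from nltk.corpus import verbnet as vn

score, trans_frames = get_transitivity('hit')
print(score)  # 1 if any class of 'hit' has an "NP V NP" primary frame
for class_id, frame in trans_frames:
    print(class_id, frame['description']['primary'])
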
nps = extract_phrase(tree_str, 'NP')
vps = extract_phrase(tree_str, 'VP')
pps = extract_phrase(tree_str, 'PP')

if before_verb in nps:
    print("YES BEFORE VERB")
if after_verb in nps:
    print("YES AFTER VERB")

print(nps)
print(vps)
print(pps)
for np in nps:
    print(np)

print("=============")
word = "come"
vn_results = vn.classids(lemma=word)
print(vn_results)

# Inspect the syntax of the first frame of VerbNet class 51.2.
frame = vn.frames('51.2')[0]
syntax = frame['syntax']
for item in syntax:
    print(item['pos_tag'])
print("=====================")
nlp.close()

def process(text: str, params: dict) -> OrderedDict:
    """Process provided text"""
    # set up the JSON-NLP containers
    j: OrderedDict = base_document()
    t: OrderedDict = base_nlp_json()
    t['DC.source'] = 'NLTK {}'.format(__version__)
    t['documents'].append(j)
    j['text'] = text

    # collect parsers
    lemmatizer = get_lemmatizer()
    tokenizer = get_tokenizer(params)
    sentence_tokenizer = get_sentence_tokenizer()
    stemmer = get_stemmer()
    parser = get_parser()
    language = Counter()

    # tokenize and tag
    tokens: List[str] = tokenizer.tokenize(text)
    tokens_tagged: List[tuple] = nltk.pos_tag(tokens)
    conll_tagged = tree2conlltags(ne_chunk(tokens_tagged))
    offset_list: List[Tuple[int, int]] = list(tokenizer.span_tokenize(text))

    token_list: List[dict] = []
    for token_idx, token_tuple in enumerate(tokens_tagged):
        token = token_tuple[0]
        pos_tag = token_tuple[1]
        wordnet_pos = get_wordnet_pos(pos_tag)
        entity_tag = conll_tagged[token_idx][2].split("-")

        if wordnet_pos != '':
            synsets = wordnet.synsets(token, pos=wordnet_pos)
        else:
            synsets = wordnet.synsets(token)

        # WordNet senses for the token
        sys_id = 0
        sys_list = []
        for syn in synsets:
            s_hypo = set([x.lemma_names()[0] for x in syn.hyponyms()])
            s_hyper = set([x.lemma_names()[0] for x in syn.hypernyms()])
            s_examples = [x for x in syn.examples()]
            s = {
                'wordnet_id': syn.name(),
                'id': sys_id,
                'synonym': syn.lemma_names()[1:],
                'hyponym': list(s_hypo),
                'hypernym': list(s_hyper),
                'examples': s_examples,
                'definition': syn.definition()
            }
            if len(s['synonym']) == 0:
                s.pop('synonym')
            if len(s['hyponym']) == 0:
                s.pop('hyponym')
            if len(s['hypernym']) == 0:
                s.pop('hypernym')
            if len(s['examples']) == 0:
                s.pop('examples')
            if len(s['definition']) == 0:
                s.pop('definition')
            if s:
                sys_list.append(s)
            sys_id += 1

        # VerbNet classes and frames for the token
        verb_list = []
        vn_classids = vn.classids(token)
        for classid in vn_classids:
            verb_list.append({
                'class_id': classid,
                'frames': vn.frames(classid)
            })

        # the per-token dict gets its own name so that the document
        # metadata dict `t` above is not overwritten
        tok = {
            'id': token_idx,
            'text': token,
            'lemma': lemmatizer(token, wordnet_pos) if wordnet_pos else lemmatizer(token),
            'stem': stemmer(token),
            'pos': pos_tag,
            'entity': entity_tag[1] if len(entity_tag) > 1 else "",
            'entity_iob': entity_tag[0],
            'overt': True,
            'characterOffsetBegin': offset_list[token_idx][0],
            'characterOffsetEnd': offset_list[token_idx][1],
            'synsets': sys_list,
            'verbnet': verb_list
        }
        if len(tok['synsets']) == 0:
            tok.pop('synsets')
        if len(tok['verbnet']) == 0:
            tok.pop('verbnet')
        token_list.append(tok)

    j['tokenList'] = token_list

    # sentence and dependency parsing
    sent_list = []
    token_from = 0
    sentence_tokens = sentence_tokenizer.sentences_from_tokens(tokens)
    sentence_texts = sentence_tokenizer.sentences_from_text(text)

    # check whether the MALT parser is loaded
    if parser:
        for sent_idx, sent in enumerate(zip(sentence_tokens, sentence_texts)):
            # detect the language of each sentence
            la = pycountry.languages.get(alpha_2=detect(sent[1]))
            token_to = token_from + len(sent[0]) - 1
            dg = parser.parse_one(sent[1].split())
            s = {
                'id': sent_idx,
                'text': sent[1],
                'tokenFrom': token_from,
                'tokenTo': token_to,
                'tokens': list(range(token_from, token_to))
            }
            for token in dg.nodes:
                head = dg.nodes[token]['head']
                head_word = [
                    dg.nodes[i]['word'] for i in dg.nodes
                    if dg.nodes[i]['address'] == head
                ]
                if len(head_word) > 0:
                    j['dependenciesBasic'].append({
                        'governor': head_word[0],
                        'dependent': dg.nodes[token]['word'],
                        'type': dg.nodes[token]['rel']
                    })
                else:
                    j['dependenciesBasic'].append({
                        'governor': 'null',
                        'dependent': dg.nodes[token]['word'],
                        'type': dg.nodes[token]['rel']
                    })
                # drop dependencies with a missing governor, dependent or type
                if j['dependenciesBasic'][-1]['governor'] == 'null' \
                        or j['dependenciesBasic'][-1]['dependent'] == 'null' \
                        or j['dependenciesBasic'][-1]['type'] == 'null':
                    j['dependenciesBasic'].pop()
            token_from = token_to
            language[la.name] += 1
            sent_list.append(s)

    j['sentences'] = sent_list

    # document language: prefer the caller's setting, otherwise the most
    # frequently detected sentence language
    if params['language']:
        t['DC.language'] = params['language']
    else:
        # only if language has some elements can we check for the max
        if len(token_list) > 4 and language:
            t['DC.language'] = max(language, key=language.get)
        else:
            t['DC.language'] = ''

    # TODO:
    # 1. Schema: clauses, coreferences, constituents, expressions, paragraphs
    # 2. Fields: token: sentiment, embeddings; sentence: sentiment, complex, type, embeddings
    return j

def process(text='', lang='en', coreferences=False, constituents=False,
            dependencies=False, expressions=False, **kwargs) -> OrderedDict:
    # build nlp-json
    j: OrderedDict = get_base()
    j['meta']['DC.language'] = lang
    d: OrderedDict = get_base_document(1)
    j['documents'].append(d)
    d['meta']['DC.source'] = 'NLTK {}'.format(nltk_version)
    d['text'] = text

    # collect parsers
    lemmatizer = get_lemmatizer()
    stemmer = get_stemmer()

    # tokenization and pos tagging
    words = []
    for sent in segment(text):
        for token in sent:
            words.append(token.value)

    # create the token list
    t_id = 1
    for word, xpos in pos_tag(words):
        wordnet_pos = get_wordnet_pos(xpos)
        lemma = lemmatizer(word, pos=wordnet_pos)

        # start the token
        t = {'id': t_id, 'text': word, 'stem': stemmer(word)}
        d['tokenList'].append(t)
        t_id += 1

        # wordnet
        try:
            synsets = wordnet.synsets(lemma, pos=wordnet_pos)
            senses = {}
            for s in synsets:
                hyponyms = [y for x in s.hyponyms() for y in x.lemma_names()]
                hypernyms = [y for x in s.hypernyms() for y in x.lemma_names()]
                synonyms = s.lemma_names()[1:]
                examples = s.examples()
                sense = {
                    'wordnetId': s.name(),
                    'definition': s.definition()
                }
                if synonyms:
                    sense['synonyms'] = synonyms
                if hypernyms:
                    sense['hypernyms'] = hypernyms
                if hyponyms:
                    sense['hyponyms'] = hyponyms
                if examples:
                    sense['examples'] = examples
                antonyms = []
                for l in s.lemmas():
                    if l.antonyms():
                        for a in l.antonyms():
                            antonyms.append(a.name())
                if antonyms:
                    sense['antonyms'] = antonyms
                senses[sense['wordnetId']] = sense
            if senses:
                t['synsets'] = senses
        except Exception:
            pass

        # verbnet
        try:
            verbs = dict((class_id, {
                'classId': class_id,
                'frames': vn.frames(class_id)
            }) for class_id in vn.classids(word))
            if verbs:
                t['verbFrames'] = verbs
        except Exception:
            pass

        # framenet
        try:
            frame_net = {}
            frames = invoke_frame(word)
            if frames is not None:
                for fr in frames:
                    lu_temp = []
                    for lu in fn.lus(r'(?i)' + word.lower()):
                        fr_ = fn.frames_by_lemma(r'(?i)' + lu.name)
                        if len(fr_):
                            if fr_[0] == fr:
                                lu_temp.append({
                                    'name': lu.name,
                                    'definition': lu.definition,
                                    'pos': lu.name.split('.')[1]
                                })
                    frame_net[fr.ID] = {
                        'name': fr.name,
                        'frameId': fr.ID,
                        'definition': fr.definition,
                        'lu': lu_temp
                    }
            if frame_net:
                t['frames'] = frame_net
        except Exception:
            pass

    return remove_empty_fields(j)