def frame_chose():
    """Return frame pairs that share at least 10 lexical-unit names."""
    fs = fn.frames()
    fs_dic = {}
    fs_ID = []
    for f in fs:
        fs_ID.append(f.ID)
        fs_dic[f.name] = []
        lexes = f.lexUnit
        for lex in lexes:
            fs_dic[f.name].append(lexes[lex].name)
    # Copy the list: the original aliased fs_ID, so removing from the
    # "copy" mutated the list being iterated over and skipped frames.
    fs_ID_copy = list(fs_ID)
    result = []
    for f1 in fs_ID:
        fs_ID_copy.remove(f1)
        f1_name = fn.frame(f1).name
        set1 = set(fs_dic[f1_name])
        for f2 in fs_ID_copy:
            f2_name = fn.frame(f2).name
            set2 = set(fs_dic[f2_name])
            r = list(set1 & set2)
            result.append((f1_name, f2_name, r, len(r)))
    result = sorted(result, key=lambda x: x[3], reverse=True)
    # Renamed from frame_chose so the result list no longer shadows the
    # function's own name.
    chosen = [r for r in result if r[3] >= 10]
    return chosen
def getFrames(phrase):
    frames = []
    # string.maketrans was removed in Python 3; strip punctuation with
    # str.maketrans instead.
    content = [s.translate(str.maketrans('', '', string.punctuation))
               for s in phrase.split()]
    for lemma in content:
        frame = fn.frames(lemma)
        if frame != []:
            frames.append([f.name for f in frame])
    return frames
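# Usage sketch for getFrames (the input sentence is illustrative).
# Assumes `import string` and the framenet_v17 data are available. Note
# that fn.frames() matches each word against frame *names* as a regex,
# not against lemmas; fn.frames_by_lemma() searches lexical units.
print(getFrames("Medical cure for a disease"))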
def set_FN_embeddings():
    embeddings = {}
    frames = fn.frames()
    vector = np.zeros(len(frames))
    for i in range(len(frames)):
        name = frames[i].name
        vector_i = np.copy(vector)
        vector_i[i] = 1.0
        embeddings[name] = vector_i
    return embeddings, len(frames)
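# Usage sketch: set_FN_embeddings() yields a one-hot vector per frame
# name. Assumes numpy as np and the framenet_v17 data; 'Abandonment' is
# a real FrameNet frame used here only as an example key.
embeddings, n_frames = set_FN_embeddings()
vec = embeddings['Abandonment']
assert vec.shape == (n_frames,) and vec.sum() == 1.0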
def print_frame(name_re):
    for m_frame in fn.frames(name_re):
        # m_frame = fn.frame(299)
        print('Unincorporated',
              [x.name for x in m_frame.lexUnit.values()
               if 'incorporatedFE' not in x])
        for relation in m_frame['frameRelations']:
            print(' ', relation)
        for fe in m_frame['FE']:
            ailment_lus = [x for x in m_frame.lexUnit.values()
                           if 'incorporatedFE' in x and x.incorporatedFE == fe]
            print(' ', fe)
            print(' ', [x.name for x in ailment_lus])
        print('\n')
def expandByGraph(mappinglist):
    expandlist = list()
    for item in mappinglist:
        expandlist.append(item)
        for frame in fn.frames():
            if frame.name == item:
                for fr in frame.frameRelations:
                    if fr.type.name == 'Inheritance':
                        if 'Child' in fr and fr.Child.name != item:
                            expandlist.append(fr.Child.name)
                        elif 'Parent' in fr and fr.Parent.name != item:
                            expandlist.append(fr.Parent.name)
                    elif fr.type.name == 'See_also':
                        if ('ReferringEntry' in fr
                                and fr.ReferringEntry.name != item):
                            expandlist.append(fr.ReferringEntry.name)
    return list(set(expandlist))
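# Usage sketch: expand an illustrative one-frame mapping list with its
# Inheritance parents/children and See_also references ('Motion' is a
# real FrameNet frame chosen as an example).
expanded = expandByGraph(['Motion'])
print(len(expanded), sorted(expanded)[:5])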
def get_frame_to_root_information(di_g, fn, roots, verbose=0):
    """Get all the relations from frames to their roots."""
    frame_to_root_information = {}
    for frame_obj in fn.frames():
        frame = frame_obj.name
        if not di_g.has_node(frame):
            root_information = [{
                'subframe': frame,
                'root': frame,
                'the_path': [frame],
                'len_path': 1
            }]
        else:
            root_information = []
            for root in roots:
                if nx.has_path(di_g, root, frame):
                    the_path = nx.shortest_path(di_g, root, frame)
                    len_path = len(the_path)
                    root_info = {
                        'subframe': frame,
                        'root': root,
                        'the_path': the_path,
                        'len_path': len_path
                    }
                    root_information.append(root_info)

            # check for frames with more than one root path
            # chosen_root_info = {}
            # min_path_length = 100000
            # for root_info in root_information:
            #     if root_info['len_path'] < min_path_length:
            #         min_path_length = root_info['len_path']
            #         chosen_root_info = root_info
            # assert chosen_root_info != {}

        frame_to_root_information[frame] = root_information

    # FrameNet 1.7 contains 1221 frames.
    assert len(frame_to_root_information) == 1221

    # path_lengths = [root_info['len_path']
    #                 for root_info in frame_to_root_information.values()]
    # if verbose >= 1:
    #     print()
    #     print(f'distribution of path lengths: {Counter(path_lengths)}')

    return frame_to_root_information
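# Usage sketch (assumed setup, not part of the original function): build
# the Inheritance digraph that get_frame_to_root_information expects,
# with Parent -> Child edges, and treat frames with no incoming edge as
# roots.
import networkx as nx
from nltk.corpus import framenet as fn

di_g = nx.DiGraph()
for f in fn.frames():
    for fr in f.frameRelations:
        if fr.type.name == 'Inheritance':
            di_g.add_edge(fr.Parent.name, fr.Child.name)
roots = [n for n in di_g.nodes() if di_g.in_degree(n) == 0]
frame_to_root = get_frame_to_root_information(di_g, fn, roots)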
def extract_framenet():
    results = []
    frames = fn.frames()
    for f in frames:
        temp = {}
        f_name = f.name
        f_definition = f.definition
        f_lexunit = f.lexUnit
        f_fes = f.FE
        temp['name'] = f_name
        temp['definition'] = f_definition
        temp['lexunit'] = list(f_lexunit.keys())
        temp['fes'] = [[fe, f_fes[fe].coreType, f_fes[fe].definition]
                       for fe in f_fes]
        results.append(temp)
    # Use a context manager so the file is closed after the dump.
    with open('../data/frame.json', 'w') as file_object:
        json.dump(results, file_object)
def getFrameSetForStudent(surname, list_len=5):
    frameList = []
    nof_frames = len(fn.frames())
    base_idx = (abs(int(hashlib.sha512(surname.encode('utf-8')).hexdigest(),
                        16)) % nof_frames)
    print('\nstudent: ' + surname)
    framenet_IDs = get_frams_IDs()
    i = 0
    offset = 0
    seed(1)
    while i < list_len:
        fID = framenet_IDs[(base_idx + offset) % nof_frames]
        f = fn.frame(fID)
        fNAME = f.name
        print('\tID: {a:4d}\tframe: {framename}'.format(a=fID,
                                                        framename=fNAME))
        offset = randint(0, nof_frames)
        frameList.append(fID)
        i += 1
    return frameList
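# Usage sketch: the surname (a placeholder here) is hashed to pick a
# deterministic starting frame, and seed(1) makes the random offsets
# reproducible across runs.
ids = getFrameSetForStudent('Rossi')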
def extract_examples():
    results = []
    frames = fn.frames()
    for f in frames:
        for lu in f.lexUnit:
            examples = f.lexUnit[lu].exemplars
            for example in examples:
                temp = {}
                temp['name'] = f.name
                temp['lexunit'] = lu
                temp['text'] = example.text
                if 'Target' in example:
                    temp['target'] = example.Target
                else:
                    print(example.text)
                temp['fe'] = example.FE
                results.append(temp)
                if len(results) % 100 == 0:
                    print('Processing...', len(results))
    # Use a context manager so the file is closed after the dump.
    with open('../data/frame_examples.json', 'w') as file_object:
        json.dump(results, file_object)
def print_frames_with_IDs():
    for x in fn.frames():
        print('{}\t{}'.format(x.ID, x.name))
pickle.dump(lcs, f)
lcs_feats = ['lcs_eventive', 'lcs_stative']
type_embedder['lcs'] = lcs

# Wordnet supersenses (lexicographer names)
synsets = wordnet.all_synsets()
supersenses = sorted(list(set(['supersense=' + x.lexname()
                               for x in synsets])))

# Framenet
lem2frame = {}
for lm in framenet.lus():
    for lemma in lm['lexemes']:
        lem2frame[lemma['name'] + '.' +
                  framenet_posdict[lemma['POS']]] = lm['frame']['name']
frame_names = sorted(['frame=' + x.name for x in framenet.frames()])
type_embedder['lem2frame'] = lem2frame

# Verbnet classids
verbnet_classids = sorted(['classid=' + vcid
                           for vcid in verbnet.classids()])

type_hand_features = (verbnet_classids + supersenses + frame_names +
                      lcs_feats + conc_cols)
input_size += len(type_hand_features)
for f in type_hand_features:
    type_embedder['embedder'][f] = 0

# Write all the feature names to a text file
if args.type and args.token:
    with open('../../data/list_of_all_hand_eng_features.txt', 'w') as f:
#! /usr/bin/env python
# Author: Kapil Thadani ([email protected])

from __future__ import division, with_statement

from nltk.corpus import framenet


###############################################################################
# Names of all frames in Framenet (1019 total)
frames = sorted(frame.name for frame in framenet.frames())

# Names of all possible FEs (1170 total)
fes = sorted(set(fe for frame in framenet.frames()
                 for fe in frame.FE.keys()))

# Names of all possible frames and FEs (9633 total)
# (.keys()/.items() replace the Python-2-only iterkeys()/iteritems())
frame_fes = sorted([(frame.name, fe)
                    for frame in framenet.frames()
                    for fe in frame.FE.keys()],
                   key=lambda x: x[0] + x[1])

###############################################################################
# Core types of FEs
coretypes = ['Core', 'Peripheral', 'Extra-Thematic']

# Names of all possible FEs and coretypes (1491 total)
fe_coretypes = sorted(set((fe, frame_element.coreType)
                          for frame in framenet.frames()
                          for fe, frame_element in frame.FE.items()),
                      key=lambda x: x[0] + x[1])

# Names of all possible frames and FEs and coretypes (9633 total)
print_common_synsets(documents)
tps = corpus_probability(documents)
frames = extract_frames(documents)
counter = Counter(frames)
counter.most_common(25)

frames = fn.frames(r'Mental_stimulus_stimulus_focus')
for frame in frames:
    print(set(frame.lexUnit.keys()))
    lus = [x for x in frame.lexUnit.values() if 'incorporatedFE' in x]
    print(' ', [x.name for x in lus])

print_frame(r'Emotions_of_mental_activity')

frames = []
frames += fn.frames(r'.*(?i)mental.*')
frames += fn.frames(r'.*(?i)medical.*')
f.close()

# LCS eventivity
from lcsreader import LexicalConceptualStructureLexicon
lcs = LexicalConceptualStructureLexicon(
    home + '/Desktop/protocols/data/verbs-English.lcs')
lcs_feats = ['lcs_eventive', 'lcs_stative']

# Wordnet supersenses (lexicographer names)
supersenses = list(set(['supersense=' + x.lexname()
                        for x in wordnet.all_synsets()]))

# Framenet
lem2frame = {}
for lm in framenet.lus():
    for lemma in lm['lexemes']:
        lem2frame[lemma['name'] + '.' +
                  framnet_posdict[lemma['POS']]] = lm['frame']['name']
frame_names = ['frame=' + x.name for x in framenet.frames()]

# Verbnet classids
verbnet_classids = ['classid=' + vcid for vcid in verbnet.classids()]

dict_feats = {}
for f in (verbnet_classids + lexical_feats + supersenses + frame_names +
          lcs_feats + all_ud_feature_cols + conc_cols):
    dict_feats[f] = 0

x_pd = pd.DataFrame([features_func(sent_feat=sent, token=token, lemma=lemma,
                                   dict_feats=dict_feats.copy(),
                                   prot=args.prot,
                                   concreteness=concreteness, lcs=lcs,
                                   l2f=lem2frame)
                     for sent, token, lemma in zip(raw_x, tokens, lemmas)])

dev_x_pd = pd.DataFrame([features_func(sent_feat=sent, token=token,
                                       lemma=lemma,
                                       dict_feats=dict_feats.copy(),
                                       prot=args.prot,
                                       concreteness=concreteness, lcs=lcs,
                                       l2f=lem2frame)
                         for sent, token, lemma
                         in zip(raw_dev_x, dev_tokens, dev_lemmas)])

test_x_pd = pd.DataFrame([features_func(sent_feat=sent, token=token,
                                        lemma=lemma,
                                        dict_feats=dict_feats.copy(),
                                        prot=args.prot,
                                        concreteness=concreteness, lcs=lcs,
                                        l2f=lem2frame)
                          for sent, token, lemma
                          in zip(raw_test_x, test_tokens, test_lemmas)])

feature_names = (verbnet_classids, supersenses, frame_names, lcs_feats,
                 conc_cols, lexical_feats, all_ud_feature_cols)
framenetRoot = generalThing.find("framenet")
frameElement = framenetRoot.find("frame element")
lexicalUnit = framenetRoot.find("lexical unit")
semType = framenetRoot.find("semantic type")
id_ = framenetRoot.find("id")
frames = framenetRoot.find("frame")

for fE in fn.fes():
    if fE.semType is not None:
        semanticTypeKatum = exactSemType(fE.semType)
        frameElementkatum = exactFE(fE)
        if semanticTypeKatum is not None and frameElementkatum is not None:
            frameElementkatum._is(semanticTypeKatum, False)

for lU in fn.lus():
    if len(lU.semTypes) != 0:
        for semTypeInstance in lU.semTypes:
            semanticTypeKatum = exactSemType(semTypeInstance)
            lUkatum = exactlU(lU)
            if semanticTypeKatum is not None and lUkatum is not None:
                lUkatum._is(semanticTypeKatum, False)

for frame in fn.frames():
    if len(frame.semTypes) != 0:
        for semTypeInstance in frame.semTypes:
            semanticTypeKatum = exactSemType(semTypeInstance)
            frameKatum = exactFrame(frame)
            if semanticTypeKatum is not None and frameKatum is not None:
                frameKatum._is(semanticTypeKatum, False)

generalThing.save('wordnet-verbnet-framenet.datum')
bert_embedding = BertEmbeddings('bert-base-cased')

from flair.embeddings import StackedEmbeddings

# now create the StackedEmbedding object that combines all embeddings
stacked_embeddings = StackedEmbeddings(
    embeddings=[
        # flair_forward_embedding,
        # flair_backward_embedding,
        bert_embedding])

import nltk
nltk.download('framenet_v17')
from nltk.corpus import framenet as fn
len(fn.frames())

txt = preprocess.read_pg(data_root +
                         r'\EN_1818_Shelley,Mary_Frankenstein_Novel.txt')
print(len(txt), 'chars')

from segtok.segmenter import split_single
sentences = [Sentence(s, use_tokenizer=True) for s in split_single(txt)]
print(len(sentences), 'sentences')

import random as rand
t = range(100)  # rand.sample(range(len(sentences)), 100)
sents_sample = [sentences[i] for i in sorted(t)]
t = np.array(t)
_ = bert_embedding.embed(sents_sample)
        else:
            reverse_dict[word] = {}
        reverse_dict[word][event] = features_dict

reverse_dict = {}
nominal_dict = {}
for word in word_list:
    event = event_verb_mapping[word]
    nominal_dict[word] = []
    nominals = {}
    # added_nouns = set()
    # frames = fn.frames_by_lemma(word)
    if use_framenet and word in frame_dict:
        for frame_index in frame_dict[word]:
            if len(fn.frames(frame_index)) < 1:
                print("no frame for: ", frame_index)
                continue
            frame = fn.frames(frame_index)[0]
            for potential_noun in frame.lexUnit.keys():
                lemma = potential_noun.split('.')[0]
                pos = potential_noun.split('.')[1]
                if pos == 'n' or add_verbs:
                    features_dict = {}
                    features_dict['event'] = event
                    features_dict['word'] = lemma
                    features_dict['pos'] = pos
                    # features_dict['synset'] = float('nan')
                    features_dict['fn'] = 1
                    features_dict['num_wordnet'] = 0
                    features_dict['synset_percent'] = 0
"""Yields a graph for FN instead of the unwieldy labyrinth of nested dicts We use a philosophy similar to conllreader and put stuff in the nodes, maybe as dicts or maybe as a class""" from nltk.corpus import framenet as fn fn.propagate_semtypes() framekeys = set() frametypes = set() for fx in fn.frames(): for k in fx.keys(): framekeys.add(k) if fx['semTypes']: for t in fx['semTypes']: frametypes.add(t['name']) #We could read straight from the print(frametypes) #for k in framekeys: # print(k,fn.frames()[0][k]) # print(k,fn.frames()[1][k]) # print(k,fn.frames()[2][k])
def closure_graph(frame, rel):
    """Recursively follow rel() from a FrameNet frame, building a digraph.

    Note: the original mixed APIs, calling synset-style s.name() and
    hypernyms() on the result of fn.frames(); this sketch adapts the
    traversal to FrameNet frames (Inheritance parents stand in for
    hypernyms) so the demo below actually runs. The relation parameter is
    also renamed from fn, which shadowed the framenet import.
    """
    seen = set()
    graph = nx.DiGraph()

    def recurse(f):
        if f.name not in seen:
            seen.add(f.name)
            graph.add_node(f.name)
            for f1 in rel(f):
                graph.add_node(f1.name)
                graph.add_edge(f.name, f1.name)
                recurse(f1)

    recurse(frame)
    return graph


def inheritance_parents(f):
    # Parent frames via Inheritance relations (cf. expandByGraph above).
    return [fr.Parent for fr in f.frameRelations
            if fr.type.name == 'Inheritance' and fr.Child.name == f.name]


# fn.frames() returns a list, so index the first match instead of calling
# .name on the list itself.
med = fn.frames(r'(?i)medical')[0]
print(med.name)
G = closure_graph(med, inheritance_parents)
index = nx.betweenness_centrality(G)
plt.rc('figure', figsize=(12, 7))
node_size = [index[n] * 1000 for n in G]
pos = nx.spring_layout(G)
nx.draw_networkx(G, pos, node_size=node_size, edge_color='r', alpha=.3,
                 linewidths=0)
plt.show()
def load_framenet():
    edges = []
    for frm in fn.frames():
        # frame-frame relations
        for fe in frm.frameRelations:
            edges = pretty_frame_edge(edges, frm_id(fe.superFrameName),
                                      frm_id(fe.subFrameName),
                                      ncheck(fe.type.name))

        # lexical units
        for lu in frm.lexUnit.keys():
            edges.append([frm_id(frm.name),
                          'fn:HasLexicalUnit',
                          lu_format(lu, frm.name)])

        # FE
        for fe in frm.FE.values():
            if isinstance(fe.semType, nltk.corpus.reader.framenet.AttrDict):
                edges.append([fe_id(fe.name),
                              'fn:HasSemType',
                              fe_semtype_id(fe.semType.name)])
                edges.append([fe_semtype_id(fe.semType.name),
                              'fn:st:RootType',
                              fe_semtype_id(fe.semType.rootType.name)])
                edges.append([fe_semtype_id(fe.semType.name),
                              'fn:st:SuperType',
                              fe_semtype_id(fe.semType.superType.name)])
                for fesub in fe.semType.subTypes:
                    edges.append([fe_semtype_id(fe.semType.name),
                                  'fn:st:SubType',
                                  fe_semtype_id(fesub.name)])
            if isinstance(fe.requiresFE,
                          nltk.corpus.reader.framenet.AttrDict):
                edges.append([fe_id(fe.name),
                              'fn:fe:RequiresFE',
                              fe_id(fe.requiresFE.name)])
            if isinstance(fe.excludesFE,
                          nltk.corpus.reader.framenet.AttrDict):
                edges.append([fe_id(fe.name),
                              'fn:fe:ExcludesFE',
                              fe_id(fe.excludesFE.name)])
            # coreType as edge feature
            edges.append([frm_id(frm.name), 'fn:HasFrameElement',
                          fe_id(fe.name)])
    return edges
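# Usage sketch: frm_id, lu_format, fe_id, fe_semtype_id, pretty_frame_edge
# and ncheck are helpers assumed to be defined elsewhere in this codebase.
edges = load_framenet()
print(len(edges), edges[:2])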
def get_global_frame_dictionary():
    frame_dict = {f["name"]: i for i, f in enumerate(fn.frames())}
    return frame_dict
def get_frams_IDs():
    return [f.ID for f in fn.frames()]
def load_framenet():
    edges = []
    for frm in fn.frames():
        # frame-frame relations
        for fe in frm.frameRelations:
            edges = pretty_frame_edge(edges, frm_id(fe.superFrameName),
                                      frm_id(fe.subFrameName),
                                      ncheck(fe.type.name))

        # lexical units
        for lu in frm.lexUnit.keys():
            edges.append([frm_id(frm.name),
                          'fn:HasLexicalUnit',
                          lu_format(lu, frm.name)])

        # FE
        for fe in frm.FE.values():
            if isinstance(fe.semType, nltk.corpus.reader.framenet.AttrDict):
                # Sem type
                semtype_edge = [fe_id(fe.name),
                                '/r/IsA',  # 'fn:HasSemType'
                                fe_semtype_id(fe.semType.name)]
                if semtype_edge not in edges:
                    edges.append(semtype_edge)

                # Root type
                root_edge = [fe_semtype_id(fe.semType.name),
                             '/r/IsA',  # 'fn:st:RootType'
                             fe_semtype_id(fe.semType.rootType.name)]
                if root_edge not in edges:
                    edges.append(root_edge)

                # Super type
                super_edge = [fe_semtype_id(fe.semType.name),
                              '/r/IsA',  # 'fn:st:SuperType'
                              fe_semtype_id(fe.semType.superType.name)]
                if super_edge not in edges:
                    edges.append(super_edge)

                # Sub type
                for fesub in fe.semType.subTypes:
                    sub_edge = [fe_semtype_id(fesub.name),
                                '/r/IsA',
                                fe_semtype_id(fe.semType.name)]
                    if sub_edge not in edges:
                        edges.append(sub_edge)
                    # edges.append([fe_semtype_id(fe.semType.name),
                    #               'fn:st:SubType',
                    #               fe_semtype_id(fesub.name)])

            # Requires FE
            if isinstance(fe.requiresFE,
                          nltk.corpus.reader.framenet.AttrDict):
                req_edge = [fe_id(fe.name),
                            '/r/HasPrerequisite',
                            fe_id(fe.requiresFE.name)]
                if req_edge not in edges:
                    edges.append(req_edge)
                # edges.append([fe_id(fe.name), 'fn:fe:RequiresFE',
                #               fe_id(fe.requiresFE.name)])

            # Excludes FE
            if isinstance(fe.excludesFE,
                          nltk.corpus.reader.framenet.AttrDict):
                excl_edge = [fe_id(fe.name),
                             '/r/RelatedTo',
                             fe_id(fe.excludesFE.name)]
                if excl_edge not in edges:
                    edges.append(excl_edge)
                # edges.append([fe_id(fe.name), 'fn:fe:ExcludesFE',
                #               fe_id(fe.excludesFE.name)])

            # HasFrameElement - coreType as edge feature
            hasfe_edge = [frm_id(frm.name),
                          '/r/HasA',  # 'fn:HasFrameElement'
                          fe_id(fe.name)]
            if hasfe_edge not in edges:
                edges.append(hasfe_edge)
    return edges
def demo():
    from pprint import pprint
    from nltk.corpus import framenet as fn

    #
    # It is not necessary to explicitly build the indexes by calling
    # buildindexes(). We do this here just for demo purposes. If the
    # indexes are not built explicitly, they will be built as needed.
    #
    print('Building the indexes...')
    fn.buildindexes()

    #
    # Get some statistics about the corpus
    #
    print('Number of Frames:', len(fn.frames()))
    print('Number of Lexical Units:', len(fn.lexical_units()))
    print('Number of annotated documents:', len(fn.documents()))
    print()

    #
    # Frames
    #
    print('getting frames whose name matches the (case insensitive) '
          'regex: "(?i)medical"')
    medframes = fn.frames(r'(?i)medical')
    print('Found {0} Frames whose name matches "(?i)medical":'.format(
        len(medframes)))
    print([(f.name, f.ID) for f in medframes])

    #
    # store the first frame in the list of frames
    #
    tmp_id = medframes[0].ID
    m_frame = fn.frame(tmp_id)  # reads all info for the frame

    #
    # get the frame relations
    #
    print('\nNumber of frame relations for the "{0}" ({1}) frame:'.format(
        m_frame.name, m_frame.ID), len(m_frame.frameRelation))
    for fr in m_frame.frameRelation:
        print(' ', fr.type + ":", fr.relatedFrame)

    #
    # get the names of the Frame Elements
    #
    print('\nNumber of Frame Elements in the "{0}" frame:'.format(
        m_frame.name), len(m_frame.FE))
    print(' ', [x.name for x in m_frame.FE])

    #
    # get the names of the "Core" Frame Elements
    #
    print('\nThe "core" Frame Elements in the "{0}" frame:'.format(
        m_frame.name))
    print(' ', [x.name for x in m_frame.FE if x.coreType == "Core"])

    #
    # get all of the Lexical Units that are incorporated in the
    # 'Ailment' FE of the 'Medical_conditions' frame (id=239)
    #
    print('\nAll Lexical Units that are incorporated in the "Ailment" FE:')
    m_frame = fn.frame(239)
    ailment_lus = [x for x in m_frame.lexUnit
                   if x.incorporatedFE == 'Ailment']
    print([x.name for x in ailment_lus])

    #
    # get all of the Lexical Units for the frame
    #
    print('\nNumber of Lexical Units in the "{0}" frame:'.format(
        m_frame.name), len(m_frame.lexUnit))
    print(' ', [x.name for x in m_frame.lexUnit[:5]], '...')

    #
    # get basic info on the second LU in the frame
    #
    tmp_id = m_frame.lexUnit[1].ID  # grab the id of the second LU
    luinfo = fn.lu_basic(tmp_id)  # get basic info on the LU
    print('\nInformation on the LU: {0}'.format(luinfo.name))
    pprint(luinfo)

    #
    # Get a list of all of the corpora used for fulltext annotation
    #
    print('\nNames of all of the corpora used for fulltext annotation:')
    allcorpora = set([x.corpname for x in fn.documents()])
    pprint(list(allcorpora))

    #
    # Get the names of the annotated documents in the first corpus
    #
    firstcorp = list(allcorpora)[0]
    firstcorp_docs = fn.documents(firstcorp)
    print('\nNames of the annotated documents in the "{0}" corpus:'.format(
        firstcorp))
    pprint([x.filename for x in firstcorp_docs])

    #
    # Search for frames containing LUs whose name attribute matches a
    # regexp pattern.
    #
    # Note: if you were going to be doing a lot of this type of
    #       searching, you'd want to build an index that maps from
    #       lemmas to frames because each time frames_by_lemma() is
    #       called, it has to search through ALL of the frame XML files
    #       in the db.
    print('\nSearching for all Frames that have a lemma that matches '
          'the regexp: "^run.v$":')
    pprint(fn.frames_by_lemma(r'^run.v$'))
fn.lu(3238).frame.lexUnit['glint.v'] is fn.lu(3238)
fn.frame_by_name('Replacing') is fn.lus('replace.v')[0].frame
fn.lus('prejudice.n')[0].frame.frameRelations == fn.frame_relations('Partiality')
fn.lus('look.n')[0].frame
fn.lus('look.n')[1].frame
for f in fn.lus('look.n'):
    print(f.frame.name)
result = fn.frames(r'(?i)erception')
print(result)
f = fn.frame(1301)
f.ID
f.definition
for u in f.lexUnit:
    print(u)
# The raw-string prefix belongs outside the quotes.
fn.lexical_units(r'(?i)look')
from pattern.en import wordnet
from nltk.corpus import framenet as fn
import pattern.search as PS
from pattern.search import Pattern, Classifier, search
from pattern.en import parse, parsetree
from pattern.en import wordnet as pwn
from nltk.corpus import wordnet as wn
import pandas as pd
import numpy as np

from datasets.customers.tufamilia_dataset import TuFamilia

frames = fn.frames('Medical_conditions')
frames = fn.frames('Causation')
frame = frames[0]  # Take first match

lus = frame['lexUnit'].values()
for lu in lus:
    # dict.has_key() is Python 2 only; use the in operator.
    if 'incorporatedFE' in lu:
        print('%20s %10s' % (lu.name, lu['incorporatedFE']))
    else:
        print('%20s %10s' % (lu.name, 'No IFE'))

for relation in frame['frameRelations']:
    print(' ', relation)

for fe in frame['FE']:
def hand_engineering(prot, batch_size, data, data_dev):
    '''
    Hand engineered feature extraction. Supports the following -
    UD, Verbnet classids, Wordnet supersenses, concreteness ratings,
    LCS eventivity scores
    '''
    home = expanduser("~")
    framnet_posdict = {
        'V': 'VERB', 'N': 'NOUN', 'A': 'ADJ', 'ADV': 'ADV', 'PREP': 'ADP',
        'NUM': 'NUM', 'INTJ': 'INTJ', 'ART': 'DET', 'C': 'CCONJ',
        'SCON': 'SCONJ', 'PRON': 'PRON', 'IDIO': 'X', 'AVP': 'ADV'
    }
    # Load the features
    features = {}
    with open(home + '/Desktop/protocols/data/features-2.tsv', 'r') as f:
        for line in f.readlines():
            feats = line.split('\t')
            features[feats[0]] = (feats[1].split(), feats[2].split())

    # Load the predpatt objects for creating features
    files = ['/Downloads/UD_English-r1.2/en-ud-train.conllu',
             '/Downloads/UD_English-r1.2/en-ud-dev.conllu',
             '/Downloads/UD_English-r1.2/en-ud-test.conllu']
    home = expanduser("~")
    options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True,
                           resolve_conj=False, cut=True)  # Resolve relative clause
    patt = {}
    for file in files:
        path = home + file
        with open(path, 'r') as infile:
            for sent_id, ud_parse in load_conllu(infile.read()):
                patt[file[33:][:-7] + " " + sent_id] = PredPatt(ud_parse,
                                                                opts=options)

    data['Structure'] = data['Split.Sentence.ID'].map(
        lambda x: (patt[x], features[x]))
    data_dev['Structure'] = data_dev['Split.Sentence.ID'].map(
        lambda x: (patt[x], features[x]))

    raw_x = data['Structure'].tolist()
    raw_dev_x = data_dev['Structure'].tolist()

    all_x = raw_x + raw_dev_x
    all_feats = '|'.join(['|'.join(all_x[i][1][0])
                          for i in range(len(all_x))])
    feature_cols = Counter(all_feats.split('|'))

    # All UD dataset features
    all_ud_feature_cols = (list(feature_cols.keys()) +
                           [(a + "_dep") for a in feature_cols.keys()])

    # Concreteness (context manager closes the pickle file)
    with open(home + '/Desktop/protocols/data/concrete.pkl', 'rb') as f:
        concreteness = pickle.load(f)
    if prot == 'arg':
        conc_cols = ['concreteness']
    else:
        conc_cols = ['concreteness', 'max_conc', 'min_conc']

    # LCS eventivity
    from lcsreader import LexicalConceptualStructureLexicon
    lcs = LexicalConceptualStructureLexicon(
        home + '/Desktop/protocols/data/verbs-English.lcs')
    lcs_feats = ['lcs_eventive', 'lcs_stative']

    # Wordnet supersenses (lexicographer names)
    supersenses = list(set(['supersense=' + x.lexname()
                            for x in wordnet.all_synsets()]))

    # Framenet
    lem2frame = {}
    for lm in framenet.lus():
        for lemma in lm['lexemes']:
            lem2frame[lemma['name'] + '.' +
                      framnet_posdict[lemma['POS']]] = lm['frame']['name']
    frame_names = ['frame=' + x.name for x in framenet.frames()]

    # Verbnet classids
    verbnet_classids = ['classid=' + vcid for vcid in verbnet.classids()]

    # Lexical features
    lexical_feats = ['can', 'could', 'should', 'would', 'will', 'may',
                     'might', 'must', 'ought', 'dare', 'need'] + \
                    ['the', 'an', 'a', 'few', 'another', 'some', 'many',
                     'each', 'every', 'this', 'that', 'any', 'most', 'all',
                     'both', 'these']

    dict_feats = {}
    for f in (verbnet_classids + lexical_feats + supersenses + frame_names +
              lcs_feats + all_ud_feature_cols + conc_cols):
        dict_feats[f] = 0

    x_pd = pd.DataFrame([
        features_func(sent_feat=sent, token=token, lemma=lemma,
                      dict_feats=dict_feats.copy(), prot=prot,
                      concreteness=concreteness, lcs=lcs, l2f=lem2frame)
        for sent, token, lemma in zip(raw_x, data['Root.Token'].tolist(),
                                      data['Lemma'].tolist())
    ])

    dev_x_pd = pd.DataFrame([
        features_func(sent_feat=sent, token=token, lemma=lemma,
                      dict_feats=dict_feats.copy(), prot=prot,
                      concreteness=concreteness, lcs=lcs, l2f=lem2frame)
        for sent, token, lemma in zip(raw_dev_x,
                                      data_dev['Root.Token'].tolist(),
                                      data_dev['Lemma'].tolist())
    ])

    # Figure out which columns to drop (they're always zero)
    todrop1 = dev_x_pd.columns[(dev_x_pd == 0).all()].values.tolist()
    todrop = x_pd.columns[(x_pd == 0).all()].values.tolist()
    intdrop = [a for a in todrop if a not in todrop1]
    # The original assigned cols_to_drop twice in a single statement.
    cols_to_drop = list(set(todrop) - set(intdrop))

    x = x_pd.drop(cols_to_drop, axis=1).values.tolist()
    dev_x = dev_x_pd.drop(cols_to_drop, axis=1).values.tolist()

    x = [[a[:] for a in x[i:i + batch_size]]
         for i in range(0, len(data), batch_size)]
    dev_x = [[a[:] for a in dev_x[i:i + batch_size]]
             for i in range(0, len(data_dev), batch_size)]
    return x, dev_x
doccollections = ['NYT_19980407', 'NYT_19980403', 'NYT_19980315',
                  'APW_19980429', 'APW_19980424', 'APW_19980314']
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doccol in doccollections:
    for doc in nltk.corpus.ieer.parsed_docs(doccol):
        relations = nltk.sem.extract_rels('PER', 'LOC', doc,
                                          corpus='ieer', pattern=IN)
        for relation in relations:
            print(nltk.sem.relextract.rtuple(relation))

f = fn.frames(r'(?i)perception')
len(fn.frames())
f = fn.frame(66)
f.ID
f.definition
set(f.lexUnit.keys())
[x.name for x in f.FE]
f.frameRelations
fn.frames_by_lemma(r'(?i)a little')
                '\t_\t' + \
                tag + \
                '\t_\t' + \
                str(head_idx) + '\t' + \
                dep + \
                '\t_\t_\n'
        if len(s) > 0:
            conll += '\n'
    return conll


#
# Hoisted to module level so get_frame_from_name below can see it; as a
# local it would have been discarded (note the commented-out return).
FDD = defaultdict(list)


def compile_framenet_starters():
    print('loading dub frames')
    dub_frames = [full_frame.name for full_frame in fn.frames()
                  if len(full_frame.name.split('_')) > 1]
    for dub_frame in dub_frames:
        FDD[dub_frame.split('_')[0]].append(dub_frame)
    # return fdd


#
@clock
def get_frame_from_name(frame_name):
    try:
        frame = fn.frame_by_name(frame_name)
    except:
        if len(FDD[frame_name]) == 1:
            frame = fn.frame_by_name(FDD[frame_name][0])
from nltk.corpus import framenet as fn

fs = fn.frames()
for i in fs:
    print(i.FE)
    break
__author__ = 'juliewe'

from nltk.corpus import framenet as fn

if __name__ == '__main__':
    print(len(fn.frames()))