def frameCounter():
    """Tally frame annotations in the FrameNet full-text corpus.

    Walks every annotated document, and for each annotation set that
    evokes a frame (i.e. carries a ``frameID``), adds one count per
    "Target" label span under that annotation's frame name.

    Returns:
        Counter: mapping frame name -> number of target label spans seen.
    """
    entries = Counter()
    for doc in fn.documents():
        for sentence in fn.annotated_document(doc["ID"])["sentence"]:
            for annotation in sentence["annotationSet"]:
                # Only frame-evoking annotation sets carry a frameID.
                if "frameID" not in annotation:
                    continue
                framename = annotation["frameName"]
                for layer in annotation["layer"]:
                    if layer["name"] != "Target":
                        continue
                    # NOTE(review): a multi-word target contributes one
                    # count per label span — this mirrors the original
                    # logic; confirm it is intended rather than one
                    # count per target annotation.
                    for _label in layer["label"]:
                        entries[framename] += 1
    return entries
def demo():
    """Demonstrate basic queries against the NLTK FrameNet corpus reader."""
    from pprint import pprint
    from nltk.corpus import framenet as fn

    # Building the indexes up front is optional; lookups build them lazily.
    # Done here explicitly just for demo purposes.
    print('Building the indexes...')
    fn.buildindexes()

    # Corpus-wide statistics.
    print('Number of Frames:', len(fn.frames()))
    print('Number of Lexical Units:', len(fn.lexical_units()))
    print('Number of annotated documents:', len(fn.documents()))
    print()

    # Frames whose name matches a regular expression.
    print('getting frames whose name matches the (case insensitive) regex: "(?i)medical"')
    medical_frames = fn.frames(r'(?i)medical')
    print('Found {0} Frames whose name matches "(?i)medical":'.format(len(medical_frames)))
    print([(frm.name, frm.ID) for frm in medical_frames])

    # Load the full record for the first matching frame.
    frame_id = medical_frames[0].ID
    frame = fn.frame(frame_id)  # reads all info for the frame

    # Relations of that frame to other frames.
    print('\nNumber of frame relations for the "{0}" ({1}) frame:'.format(frame.name, frame.ID),
          len(frame.frameRelation))
    for relation in frame.frameRelation:
        print(' ', relation.type + ":", relation.relatedFrame)

    # Names of the Frame Elements.
    print('\nNumber of Frame Elements in the "{0}" frame:'.format(frame.name),
          len(frame.FE))
    print(' ', [fe.name for fe in frame.FE])

    # Names of the "Core" Frame Elements only.
    print('\nThe "core" Frame Elements in the "{0}" frame:'.format(frame.name))
    print(' ', [fe.name for fe in frame.FE if fe.coreType == "Core"])

    # Lexical Units incorporated in the 'Ailment' FE of the
    # 'Medical_conditions' frame (id=239).
    print('\nAll Lexical Units that are incorporated in the "Ailment" FE:')
    frame = fn.frame(239)
    ailment_units = [lu for lu in frame.lexUnit if lu.incorporatedFE == 'Ailment']
    print([lu.name for lu in ailment_units])

    # All Lexical Units of the frame.
    print('\nNumber of Lexical Units in the "{0}" frame:'.format(frame.name),
          len(frame.lexUnit))
    print(' ', [lu.name for lu in frame.lexUnit[:5]], '...')

    # Basic info on the second LU in the frame.
    lu_id = frame.lexUnit[1].ID  # grab the id of the second LU
    lu_info = fn.lu_basic(lu_id)  # get basic info on the LU
    print('\nInformation on the LU: {0}'.format(lu_info.name))
    pprint(lu_info)

    # Corpora used for fulltext annotation.
    print('\nNames of all of the corpora used for fulltext annotation:')
    corpus_names = {doc.corpname for doc in fn.documents()}
    pprint(list(corpus_names))

    # Annotated documents belonging to the first corpus.
    first_corpus = list(corpus_names)[0]
    first_corpus_docs = fn.documents(first_corpus)
    print('\nNames of the annotated documents in the "{0}" corpus:'.format(first_corpus))
    pprint([doc.filename for doc in first_corpus_docs])

    # Frames containing LUs whose name matches a regexp pattern.
    # Note: each frames_by_lemma() call scans ALL frame XML files in the
    # db, so heavy use of this kind of search would warrant building a
    # lemma -> frame index instead.
    print('\nSearching for all Frames that have a lemma that matches the regexp: "^run.v$":')
    pprint(fn.frames_by_lemma(r'^run.v$'))
print(targetfolder) for l in open("../res/pos_heads_for_mwe_targets.tsv").readlines(): idx, key = l.strip().split("\t") idx = int(idx) mwe_target_head_dict[key] = idx mwwtargets = 0 targets = 0 MWEs = Counter() from nltk.corpus import framenet as fn #from build_target_list import normalizeLemma sentences = [] FrameElement_bitmasks = [] for d in fn.documents(): for sentence in fn.annotated_document(d["ID"])["sentence"]: s = FrameSentence(sentence["text"]) local_frame_elements = ["O"] * len(s.textlist) #print(len(s.offsetdict.keys()),(s.offsetdict),len(s.text.split(" "))) #print(sentence.keys()) for annotation in sentence["annotationSet"]: if "frameID" in annotation.keys(): signature = annotation["luName"] lemma, pos = annotation["luName"].split(".") #lemma = normalizeLemma(lemma) frameID = str(annotation["frameID"]) framename = "TARGET" #annotation["frameName"] arguments = {} for x in annotation["layer"]: if x["name"] == "Target":
print(targetfolder) for l in open("../res/pos_heads_for_mwe_targets.tsv").readlines(): idx, key = l.strip().split("\t") idx = int(idx) mwe_target_head_dict[key] = idx mwwtargets=0 targets=0 MWEs=Counter() from nltk.corpus import framenet as fn #from build_target_list import normalizeLemma sentences = [] FrameElement_bitmasks = [] for d in fn.documents(): for sentence in fn.annotated_document(d["ID"])["sentence"]: s = FrameSentence(sentence["text"]) local_frame_elements = ["O"] * len(s.textlist) #print(len(s.offsetdict.keys()),(s.offsetdict),len(s.text.split(" "))) #print(sentence.keys()) for annotation in sentence["annotationSet"]: if "frameID" in annotation.keys(): signature = annotation["luName"] lemma, pos = annotation["luName"].split(".") #lemma = normalizeLemma(lemma) frameID = str(annotation["frameID"]) framename = "TARGET" #annotation["frameName"] arguments = {} for x in annotation["layer"]: if x["name"] == "Target":
# Interactive exploration snippet for the FrameNet corpus reader.
# Assumes ``f`` is a frame object and ``fn`` is the framenet reader,
# bound elsewhere — TODO confirm where ``f``/``fn`` come from; as written
# these are bare expression statements whose values only display in a REPL.
f.ID  # numeric identifier of the frame
f.definition  # prose definition of the frame
set(f.lexUnit.keys())  # names of the frame's lexical units
[x.name for x in f.FE]  # names of the frame's Frame Elements
f.frameRelations  # relations linking this frame to other frames
fn.frames_by_lemma(r'(?i)a little')  # frames whose LUs match the regex (case-insensitive)
fn.lu(256).name  # lexical-unit lookup by ID: LU name
fn.lu(256).definition  # ... its definition
fn.lu(256).frame  # ... the frame it belongs to
fn.lu(256).lexeme  # ... its lexeme(s)
docs = fn.documents()  # full-text annotated documents
len(docs)
docs[0].keys()
docs[0].filename
docs[0].annotated_document()  # full annotation data for the first document
def demo():
    """Demo of the FrameNet corpus reader.

    Prints corpus statistics, then walks through the main query entry
    points: frame lookup by regex and by ID, frame relations, Frame
    Elements, Lexical Units, full-text annotation corpora/documents, and
    frame search by lemma. All output goes to stdout.
    """
    from pprint import pprint
    from nltk.corpus import framenet as fn
    #
    # It is not necessary to explicitly build the indexes by calling
    # buildindexes(). We do this here just for demo purposes. If the
    # indexes are not built explicitely, they will be built as needed.
    #
    print('Building the indexes...')
    fn.buildindexes()
    #
    # Get some statistics about the corpus
    #
    print('Number of Frames:', len(fn.frames()))
    print('Number of Lexical Units:', len(fn.lexical_units()))
    print('Number of annotated documents:', len(fn.documents()))
    print()
    #
    # Frames whose name matches a (case-insensitive) regular expression
    #
    print(
        'getting frames whose name matches the (case insensitive) regex: "(?i)medical"'
    )
    medframes = fn.frames(r'(?i)medical')
    print('Found {0} Frames whose name matches "(?i)medical":'.format(
        len(medframes)))
    print([(f.name, f.ID) for f in medframes])
    #
    # store the first frame in the list of frames
    #
    tmp_id = medframes[0].ID
    m_frame = fn.frame(tmp_id)  # reads all info for the frame
    #
    # get the frame relations
    #
    print(
        '\nNumber of frame relations for the "{0}" ({1}) frame:'.format(
            m_frame.name, m_frame.ID), len(m_frame.frameRelation))
    for fr in m_frame.frameRelation:
        print(' ', fr.type + ":", fr.relatedFrame)
    #
    # get the names of the Frame Elements
    #
    print(
        '\nNumber of Frame Elements in the "{0}" frame:'.format(m_frame.name),
        len(m_frame.FE))
    print(' ', [x.name for x in m_frame.FE])
    #
    # get the names of the "Core" Frame Elements
    #
    print('\nThe "core" Frame Elements in the "{0}" frame:'.format(
        m_frame.name))
    print(' ', [x.name for x in m_frame.FE if x.coreType == "Core"])
    #
    # get all of the Lexical Units that are incorporated in the
    # 'Ailment' FE of the 'Medical_conditions' frame (id=239)
    #
    print('\nAll Lexical Units that are incorporated in the "Ailment" FE:')
    m_frame = fn.frame(239)
    ailment_lus = [x for x in m_frame.lexUnit if x.incorporatedFE == 'Ailment']
    print([x.name for x in ailment_lus])
    #
    # get all of the Lexical Units for the frame
    #
    print('\nNumber of Lexical Units in the "{0}" frame:'.format(m_frame.name),
          len(m_frame.lexUnit))
    print(' ', [x.name for x in m_frame.lexUnit[:5]], '...')
    #
    # get basic info on the second LU in the frame
    #
    tmp_id = m_frame.lexUnit[1].ID  # grab the id of the second LU
    luinfo = fn.lu_basic(tmp_id)  # get basic info on the LU
    print('\nInformation on the LU: {0}'.format(luinfo.name))
    pprint(luinfo)
    #
    # Get a list of all of the corpora used for fulltext annotation
    #
    print('\nNames of all of the corpora used for fulltext annotation:')
    allcorpora = set([x.corpname for x in fn.documents()])
    pprint(list(allcorpora))
    #
    # Get the names of the annotated documents in the first corpus
    #
    firstcorp = list(allcorpora)[0]
    firstcorp_docs = fn.documents(firstcorp)
    print('\nNames of the annotated documents in the "{0}" corpus:'.format(
        firstcorp))
    pprint([x.filename for x in firstcorp_docs])
    #
    # Search for frames containing LUs whose name attribute matches a
    # regexp pattern.
    #
    # Note: if you were going to be doing a lot of this type of
    #       searching, you'd want to build an index that maps from
    #       lemmas to frames because each time frames_by_lemma() is
    #       called, it has to search through ALL of the frame XML files
    #       in the db.
    print(
        '\nSearching for all Frames that have a lemma that matches the regexp: "^run.v$":'
    )
    pprint(fn.frames_by_lemma(r'^run.v$'))