def check(a, b):
    """Check whether two (word, POS) pairs evoke at least one common FrameNet frame.

    Each argument is a (word, pos) tuple, since the lookup pattern has two placeholders.
    Results are memoised in the module-level `dictionary` cache.
    """
    if a not in dictionary:
        dictionary[a] = set(lu.frame.name
                            for lu in fn.lus(name=r'(^|\s)%s(\s.+)?\.%s' % a))
    if b not in dictionary:
        dictionary[b] = set(lu.frame.name
                            for lu in fn.lus(name=r'(^|\s)%s(\s.+)?\.%s' % b))
    return len(dictionary[a].intersection(dictionary[b])) > 0
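# Minimal usage sketch for check() above (not part of the original source): the two
# '%s' placeholders mean each argument must be a (word, POS) tuple, and a module-level
# `dictionary` cache plus the NLTK FrameNet reader are assumed to be in scope.
from nltk.corpus import framenet as fn

dictionary = {}  # shared cache: (word, pos) -> set of frame names

print(check(('walk', 'v'), ('run', 'v')))  # True if the two verbs share a frame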
def findCoreType(self, wordList):
    dictim = []
    for word in wordList:
        word_ = '^{}$'.format(word)
        if len(fn.lus(word_)) > 0:
            ID = fn.lus(word_)[0].frame.ID
            dicti = [fename for fename, fe in fn.frame(ID).FE.items()
                     if fe.coreType == 'Core']
            if len(dicti) > 0:
                dictim.append(dicti[0])
    return dictim
def get_frames():
    lus = framenet.lus()
    print('num lus', len(lus))
    some_lu = random.choice(lus)
    print('Some LU:', some_lu.name, some_lu.POS, some_lu.frame.name)

    lus2frames = defaultdict(set)
    for lu in lus:
        lus2frames[lu.name].add(lu.frame.name)

    frames = chain.from_iterable(lus2frames.values())
    frames = sorted(set(frames))
    print('num frames', len(frames))

    mlb = MultiLabelBinarizer()
    mlb.fit(lus2frames.values())
    return lus, lus2frames, frames, mlb
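# Hypothetical follow-up (not in the original code): binarise the frame labels of one
# lexical unit with the fitted MultiLabelBinarizer. Assumes `framenet` is
# nltk.corpus.framenet and that random, defaultdict, chain and MultiLabelBinarizer are
# imported as in the function above; 'run.v' is just an example LU name.
lus, lus2frames, frames, mlb = get_frames()
y = mlb.transform([lus2frames['run.v']])  # one multi-hot row over all frame names
print(y.shape, int(y.sum()))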
def hieve_nltk_verbs(self, wildcard='<UNK>'):
    import nltk
    nltk.download('propbank')
    nltk.download('framenet_v17')
    from nltk.corpus import propbank
    from nltk.corpus import framenet as fn

    verbs = [x.lower() for x in propbank.verbs()]
    # Note: FrameNet LU IDs are not contiguous, so many lookups below raise and are
    # silently skipped; only the LUs whose ID happens to fall in range are collected.
    for i in range(len(fn.lus())):
        try:
            x = fn.lu(i).name[:-2].lower()
            x = x[:x.rindex('.')]
            verbs.append(x)
        except:
            pass
    verbs = set(verbs)
    verbs |= self.event_vocab

    for tt in self.text:
        rb = []
        for x in tt:
            if x not in verbs:
                rb.append(wildcard)
            else:
                rb.append(x)
        self.text_verbs.append(rb)
    print("Tokenized hieve text with FrameNet and PropBank verbs.")
def frames(words):
    """Check whether the noun lexical unit '<words>.n' exists in FrameNet."""
    f = fn.lus()
    word = words + ".n"
    names = set([w.name for w in f])
    found_in_framenet = word in names
    if found_in_framenet:
        print "found"
        print found_in_framenet
    else:
        print "not found"
def frame_count(verb: str) -> int:
    """
    Counts the number of frames evoked in FrameNet by the given verb.

    :param verb: String. Input verb for which the number of evoked frames will be counted
    :return: Integer. Number of evoked frames
    """
    verb_regex = regex(verb)
    frames = fn.lus(verb_regex)  # lexical units (one per evoked frame) matching the verb
    return len(frames)
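# frame_count() relies on a regex() helper that is not shown here. A plausible sketch
# (an assumption, not the original implementation) that anchors the pattern to a verb
# lexical-unit name such as 'run.v':
import re

def regex(verb: str) -> str:
    """Build a FrameNet LU name pattern for a verb, e.g. 'run' -> '^run\\.v$'."""
    return r'^%s\.v$' % re.escape(verb)

print(frame_count('run'))  # number of matching lexical units, i.e. evoked frames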
def getLU(phrase, tag):
    lus = fn.lus(r'(?i)%s' % phrase)
    if len(lus) == 0:
        return None
    # Prefer an exact match; otherwise fall back to a random LU for the POS tag
    exactlu = exactLU(phrase, lus)
    if exactlu:
        return exactlu
    else:
        return rndLU(phrase, tag, lus)
def getluHash():
    luHash = {}  # LU name -> list of LU IDs
    for x in fn.lus():
        name = x['name']
        id = x['ID']
        if name not in luHash:
            luHash[name] = [id]
        else:
            if id not in luHash[name]:
                luHash[name].append(id)
    print "..All LUs in FrameNet have been loaded into dict. #Item: " + str(len(luHash))
    return luHash
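# Quick usage sketch (not in the original code), assuming fn is the NLTK FrameNet reader:
luHash = getluHash()
print luHash.get('run.v')  # e.g. a list with one or more LU IDs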
def invoke_frame(token: str):
    word = token.lower()
    lu_list = [(i.name, i.definition) for i in fn.lus()]
    lu_temp = set(i for i in lu_list if word == i[0].split('.')[0])
    frames = []
    for lu, def_ in lu_temp:
        fr = fn.frames_by_lemma(r'(?i)' + lu)
        if len(frames) == 0:
            frames.append(fr[0])
        else:
            if fr[0] not in frames:
                frames.append(fr[0])
    return frames
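# Hedged usage example for invoke_frame() (not part of the original snippet); it assumes
# `fn` is nltk.corpus.framenet with the FrameNet data downloaded.
for frame in invoke_frame('run'):
    print(frame.name, frame.ID)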
def get_lu_instance(verb: str, rand=False) -> object:
    """
    Retrieves a Lexical Unit object from FrameNet given a verb.

    Note: If several Lexical Units exist for the given verb and rand is set to True,
    a random Lexical Unit will be retrieved. If rand is False (the default), the first
    Lexical Unit found will be retrieved.

    :param verb: String. A verb for which the Lexical Unit shall be retrieved
    :param rand: Boolean. If True, the returned object is chosen pseudo-randomly;
                 otherwise the first element is returned
    :return: Object. Lexical Unit object which can be processed within the FrameNet API
    """
    lu = regex(verb)
    lus_list = fn.lus(lu)
    if rand is False:
        return lus_list[0]
    amount_lus = len(lus_list)
    random_index = random.randint(0, amount_lus - 1)
    return lus_list[random_index]
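# Hypothetical calls (not from the original code), assuming the regex() helper sketched
# earlier plus `import random` and `from nltk.corpus import framenet as fn`:
lu = get_lu_instance('run')                  # deterministic: first matching LU
lu_rand = get_lu_instance('run', rand=True)  # pseudo-random choice among matches
print(lu.name, lu.frame.name)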
def map_cfs_lus(verbs: list, cfs: dict) -> dict:
    """Maps the Connotation Frames to the Lexical Units in FrameNet.

    Note: The distinction between ambiguous and unambiguous verbs has to be made
    beforehand, so the input list 'verbs' should already be filtered.

    The returned dict looks like this:
        ambiguous verbs:   { ("verb", (Lexical Unit IDs)) : {Connotation Frame} }
        unambiguous verbs: { ("verb", Lexical Unit ID)    : {Connotation Frame} }

    :param verbs: List. Common words that occur both in the Connotation Frame Lexicon
                  and in FrameNet
    :param cfs: Dictionary. Keys are verbs as strings, values are the Connotation Frames
                as nested dictionaries
    :return: Dictionary mapping (verb, LU ID(s)) tuples to Connotation Frames
    """
    mapping = {}
    for verb in verbs:
        connotation_frame = cfs[verb]
        key_information = [verb]
        verb_regex = fn_pre.regex(verb)
        lus = fn.lus(verb_regex)
        # The distinction between a single LU and multiple LUs is crucial,
        # otherwise an exception is raised.
        if len(lus) == 1:
            key_information.append(lus[0].ID)
        else:
            int_lus = [lu.ID for lu in lus]
            key_information.append(tuple(int_lus))
        mapping[tuple(key_information)] = connotation_frame
    return mapping
def get_frames(pos_tags: Iterable, frame_cache: dict, verbose: bool = False) -> set:
    results = set()
    # Iterate through each (token, POS) pair and collect the frames it evokes
    for token, pos in pos_tags:
        search_word = token.lower()
        # Ignore nouns, single-letter words and stopwords
        if pos[0] == 'N' or len(search_word) < 2 or search_word in STOPWORDS:
            continue
        # Get the WordNet POS. Words with no WordNet POS tag are added verbatim if eligible
        pos = nlp.get_wordnet_pos(pos)
        if pos == '':
            normalized_token = nlp.normalize_text(search_word, lemmatize=False, ignore_num=True)
            if len(normalized_token) > 1:
                results.add(normalized_token)
            continue
        # If the lemma is not a stopword, use it instead of the lowercase token
        lemma = LEMMATIZER.lemmatize(search_word, pos)
        if lemma not in STOPWORDS:
            search_word = lemma
        # Get lexical units matching the search word and POS
        search_word = nlp.normalize_text(search_word, lemmatize=False, ignore_num=True).replace('.', '')
        # Load frames from FrameNet only for keys that are not cached yet
        key = '%s__%s' % (search_word, pos)
        if key not in frame_cache:
            frame_cache[key] = sorted(
                set(lu.frame.name
                    for lu in fn.lus(r'(?i)(^|\s)(%s)(\s.+)?\.%s' % (search_word, pos))))
        # Add the frames for the current key to the results set
        results.update(frame_cache[key])
    if verbose:
        print('Frames: %d' % len(results))
    else:
        print('.', end='', flush=True)
    return results
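# A self-contained sketch (an assumption, not the original helpers) of the caching
# pattern used by get_frames() above: frame names are fetched from FrameNet once per
# 'word__pos' key and reused on later lookups.
from nltk.corpus import framenet as fn

frame_cache = {}

def cached_frames(word: str, pos: str) -> list:
    key = '%s__%s' % (word, pos)
    if key not in frame_cache:
        frame_cache[key] = sorted(
            set(lu.frame.name
                for lu in fn.lus(r'(?i)(^|\s)(%s)(\s.+)?\.%s' % (word, pos))))
    return frame_cache[key]

print(cached_frames('run', 'v'))  # hits FrameNet
print(cached_frames('run', 'v'))  # served from the cache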
def intersect_lemmas_with_framenet(corpus_lemmas, wikidata_properties):
    """
    Intersect verb lemmas extracted from the input corpus with FrameNet Lexical Units (LUs).

    :param dict corpus_lemmas: dict of verb lemmas with their ranking scores
    :param dict wikidata_properties: dict with all Wikidata properties
    :return: a dictionary of corpus lemmas enriched with FrameNet LUs data (dicts)
    :rtype: dict
    """
    # Each FrameNet LU triggers one frame, so assign them to the same corpus lemma
    enriched = defaultdict(list)
    for corpus_lemma, score in corpus_lemmas.iteritems():
        # Look up the FrameNet LUs given the corpus lemma.
        # Ensure an exact match, as the lookup can only be done via regex
        lus = framenet.lus(r'^%s\.' % corpus_lemma)
        if not lus:
            continue
        logger.debug("Found %d FrameNet Lexical Units (LUs) that match the corpus lemma '%s': %s"
                     % (len(lus), corpus_lemma, lus))
        for lu in lus:
            lu_label = lu['name']
            # Skip non-verbal LUs
            if lu['POS'] != 'V':
                logger.debug("Skipping non-verbal LU '%s' ..." % lu_label)
                continue
            logger.debug("Processing FrameNet LU '%s' ..." % lu_label)
            frame = lu['frame']
            frame_label = frame['name']
            core_fes = []
            extra_fes = []
            logger.debug("Processing Frame Elements (FEs) ...")
            for fe_label, fe_data in frame['FE'].iteritems():
                # Skip numerical FEs
                if fe_label in NUMERICAL_FES:
                    logger.debug("Skipping numerical FE '%s', frame '%s' ..." % (fe_label, frame_label))
                    continue
                mapping = defaultdict(list)
                # Compute exact matches between FEs and Wikidata property labels and aliases
                for pid, p_label_and_aliases in wikidata_properties.iteritems():
                    # Lowercase for better matching
                    p_label = p_label_and_aliases['label'].lower()
                    p_aliases = [p_alias.lower() for p_alias in p_label_and_aliases.get('aliases', [])]
                    fe = fe_label.lower()
                    if fe == p_label:
                        logger.debug("FE '%s' maps to '%s' label '%s'" % (fe_label, pid, p_label))
                        mapping[pid].append(p_label_and_aliases)
                    elif p_aliases and fe in p_aliases:
                        logger.debug("FE '%s' maps to one of '%s' aliases: %s" % (fe_label, pid, p_aliases))
                        mapping[pid].append(p_label_and_aliases)
                fe_type = fe_data['coreType']
                semantic_type_object = fe_data['semType']
                semantic_type = semantic_type_object['name'] if semantic_type_object else None
                to_be_added = {
                    'fe': fe_label,
                    'type': fe_type,
                    'semantic_type': semantic_type,
                    'mapping': mapping,
                }
                if fe_type == 'Core':
                    core_fes.append(to_be_added)
                else:
                    extra_fes.append(to_be_added)
            # Skip frames with no mapping to Wikidata
            if not core_fes and not extra_fes:
                logger.debug("No '%s' FEs could be mapped to Wikidata. Skipping ..." % frame_label)
                continue
            logger.debug("Core FEs: %s" % core_fes)
            logger.debug("Extra FEs: %s" % extra_fes)
            intersected_lu = {
                'lu': lu_label,
                'frame': frame_label,
                'pos': lu['POS'],
            }
            if core_fes:
                intersected_lu['core_fes'] = core_fes
            if extra_fes:
                intersected_lu['extra_fes'] = extra_fes
            enriched[score].append(intersected_lu)
            logger.debug("Corpus lemma '%s' enriched with frame data: %s"
                         % (corpus_lemma, json.dumps(intersected_lu, indent=2)))
    # Order by decreasing score
    return OrderedDict(sorted(enriched.items(), key=lambda x: x[0], reverse=True))
def hand_engineering(prot, batch_size, data, data_dev):
    '''
    Hand-engineered feature extraction. Supports the following:
    UD features, VerbNet class ids, WordNet supersenses, FrameNet frames,
    concreteness ratings and LCS eventivity scores.
    '''
    home = expanduser("~")
    framnet_posdict = {
        'V': 'VERB', 'N': 'NOUN', 'A': 'ADJ', 'ADV': 'ADV', 'PREP': 'ADP',
        'NUM': 'NUM', 'INTJ': 'INTJ', 'ART': 'DET', 'C': 'CCONJ',
        'SCON': 'SCONJ', 'PRON': 'PRON', 'IDIO': 'X', 'AVP': 'ADV'
    }

    # Load the features
    features = {}
    with open(home + '/Desktop/protocols/data/features-2.tsv', 'r') as f:
        for line in f.readlines():
            feats = line.split('\t')
            features[feats[0]] = (feats[1].split(), feats[2].split())

    # Load the PredPatt objects for creating features
    files = [
        '/Downloads/UD_English-r1.2/en-ud-train.conllu',
        '/Downloads/UD_English-r1.2/en-ud-dev.conllu',
        '/Downloads/UD_English-r1.2/en-ud-test.conllu'
    ]
    options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True,
                           resolve_conj=False, cut=True)  # Resolve relative clauses
    patt = {}
    for file in files:
        path = home + file
        with open(path, 'r') as infile:
            for sent_id, ud_parse in load_conllu(infile.read()):
                patt[file[33:][:-7] + " " + sent_id] = PredPatt(ud_parse, opts=options)

    data['Structure'] = data['Split.Sentence.ID'].map(lambda x: (patt[x], features[x]))
    data_dev['Structure'] = data_dev['Split.Sentence.ID'].map(lambda x: (patt[x], features[x]))

    raw_x = data['Structure'].tolist()
    raw_dev_x = data_dev['Structure'].tolist()
    all_x = raw_x + raw_dev_x
    all_feats = '|'.join(['|'.join(all_x[i][1][0]) for i in range(len(all_x))])
    feature_cols = Counter(all_feats.split('|'))

    # All UD dataset features
    all_ud_feature_cols = list(feature_cols.keys()) + [(a + "_dep") for a in feature_cols.keys()]

    # Concreteness
    with open(home + '/Desktop/protocols/data/concrete.pkl', 'rb') as f:
        concreteness = pickle.load(f)
    if prot == 'arg':
        conc_cols = ['concreteness']
    else:
        conc_cols = ['concreteness', 'max_conc', 'min_conc']

    # LCS eventivity
    from lcsreader import LexicalConceptualStructureLexicon
    lcs = LexicalConceptualStructureLexicon(home + '/Desktop/protocols/data/verbs-English.lcs')
    lcs_feats = ['lcs_eventive', 'lcs_stative']

    # WordNet supersenses (lexicographer names)
    supersenses = list(set(['supersense=' + x.lexname() for x in wordnet.all_synsets()]))

    # FrameNet: map lemma.POS -> frame name
    lem2frame = {}
    for lm in framenet.lus():
        for lemma in lm['lexemes']:
            lem2frame[lemma['name'] + '.' + framnet_posdict[lemma['POS']]] = lm['frame']['name']
    frame_names = ['frame=' + x.name for x in framenet.frames()]

    # VerbNet class ids
    verbnet_classids = ['classid=' + vcid for vcid in verbnet.classids()]

    # Lexical features
    lexical_feats = [
        'can', 'could', 'should', 'would', 'will', 'may', 'might', 'must',
        'ought', 'dare', 'need'
    ] + [
        'the', 'an', 'a', 'few', 'another', 'some', 'many', 'each', 'every',
        'this', 'that', 'any', 'most', 'all', 'both', 'these'
    ]

    dict_feats = {}
    for f in (verbnet_classids + lexical_feats + supersenses + frame_names +
              lcs_feats + all_ud_feature_cols + conc_cols):
        dict_feats[f] = 0

    x_pd = pd.DataFrame([
        features_func(sent_feat=sent, token=token, lemma=lemma,
                      dict_feats=dict_feats.copy(), prot=prot,
                      concreteness=concreteness, lcs=lcs, l2f=lem2frame)
        for sent, token, lemma in zip(raw_x, data['Root.Token'].tolist(),
                                      data['Lemma'].tolist())
    ])
    dev_x_pd = pd.DataFrame([
        features_func(sent_feat=sent, token=token, lemma=lemma,
                      dict_feats=dict_feats.copy(), prot=prot,
                      concreteness=concreteness, lcs=lcs, l2f=lem2frame)
        for sent, token, lemma in zip(raw_dev_x, data_dev['Root.Token'].tolist(),
                                      data_dev['Lemma'].tolist())
    ])

    # Figure out which columns to drop (they're always zero)
    todrop1 = dev_x_pd.columns[(dev_x_pd == 0).all()].values.tolist()
    todrop = x_pd.columns[(x_pd == 0).all()].values.tolist()
    intdrop = [a for a in todrop if a not in todrop1]
    cols_to_drop = list(set(todrop) - set(intdrop))

    x = x_pd.drop(cols_to_drop, axis=1).values.tolist()
    dev_x = dev_x_pd.drop(cols_to_drop, axis=1).values.tolist()
    x = [[a[:] for a in x[i:i + batch_size]]
         for i in range(0, len(data), batch_size)]
    dev_x = [[a[:] for a in dev_x[i:i + batch_size]]
             for i in range(0, len(data_dev), batch_size)]
    return x, dev_x
generalThing = datum.thing
framenetRoot = generalThing.find("framenet")
frameElement = framenetRoot.find("frame element")
lexicalUnit = framenetRoot.find("lexical unit")
semType = framenetRoot.find("semantic type")
id_ = framenetRoot.find("id")
frames = framenetRoot.find("frame")

# Link frame elements to their semantic types
for fE in fn.fes():
    if fE.semType is not None:
        semanticTypeKatum = exactSemType(fE.semType)
        frameElementkatum = exactFE(fE)
        if semanticTypeKatum is not None and frameElementkatum is not None:
            frameElementkatum._is(semanticTypeKatum, False)

# Link lexical units to their semantic types
for lU in fn.lus():
    if len(lU.semTypes) != 0:
        for semTypeInstance in lU.semTypes:
            semanticTypeKatum = exactSemType(semTypeInstance)
            lUkatum = exactlU(lU)
            if semanticTypeKatum is not None and lUkatum is not None:
                lUkatum._is(semanticTypeKatum, False)

# Link frames to their semantic types
for frame in fn.frames():
    if len(frame.semTypes) != 0:
        for semTypeInstance in frame.semTypes:
            semanticTypeKatum = exactSemType(semTypeInstance)
            frameKatum = exactFrame(frame)
            if semanticTypeKatum is not None and frameKatum is not None:
                frameKatum._is(semanticTypeKatum, False)
local_verbnet_api_path = "C:/Users/Kevin/PycharmProjects/verbnet/api/"
sys.path.append(local_verbnet_api_path)
import verbnet

VN_LOC = "C:/Users/Kevin/PycharmProjects/lexical_resources/verbnet3.3/"
vn = verbnet.VerbNetParser(directory=VN_LOC)

# VerbNet class id (class name without the leading lemma) -> member lemmas
possible_classes = {
    "-".join(c.split("-")[1:]): [m.name for m in vn.verb_classes_dict[c].members]
    for c in vn.verb_classes_dict
}

# FrameNet frame name -> lemmas of the lexical units that evoke it
possible_frames = {}
for lu in framenet.lus():
    if lu.frame.name not in possible_frames:
        possible_frames[lu.frame.name] = [lu.lexemes[0].name]
    else:
        possible_frames[lu.frame.name].append(lu.lexemes[0].name)


class Mapping():
    def __init__(self, member, vn_class, fn_frame):
        self.member = member
        self.vn_class = vn_class
        self.fn_frame = fn_frame
        self.errors = self.verify()

    def __str__(self):
        return self.member + " " + self.vn_class + " " + self.fn_frame
    key_list = key_list + temp_key
    value_list = value_list + temp_value
st.write(key_list)

# Short-circuit option
shortcircuit_option = st.checkbox(
    "For a given key would you like to see its lexical unit? (shortcircuit to FrameNet option)"
)
if shortcircuit_option == True:
    option_of_number = st.multiselect(
        "For which key(s) would you like to see its lexical unit?",
        list(range(len(key_list))))
    for q in option_of_number:
        st.write("You selected key: ", q, ".", key_list[q])
        lu = fn.lus(r'%s' % key_list[q])
        lu_list = []  # list for constructing the display string
        lu_nameID_dict = {}  # maps LU ID -> LU name (TODO: replace the list with this dict for efficiency)
        for lexical_unit in lu:
            lexical_unit_LU_Name = lexical_unit['name']
            lexical_unit_LU_ID = lexical_unit['ID']
            lu_nameID_dict[lexical_unit_LU_ID] = lexical_unit_LU_Name
            input_str = "LU Name=" + str(lexical_unit_LU_Name) + " (LU ID=" + str(lexical_unit_LU_ID) + ")"
            lu_list.append(input_str)
        if len(lu_list) == 0:
            st.write("No Lexical Units for the selected key")
            st.write("JSON PATH: ", jpath[q - 1][0])
        else:
for r in DBPedia().search(sparql, start=1, count=1000):
    print '%s (%s)' % (r.person.name, r.place.name)

# ______________________________ FrameNet ______________________________
from nltk.corpus import framenet as fn

fn.lu(3238).frame.lexUnit['glint.v'] is fn.lu(3238)
fn.frame_by_name('Replacing') is fn.lus('replace.v')[0].frame
fn.lus('prejudice.n')[0].frame.frameRelations == fn.frame_relations('Partiality')
fn.lus('look.n')[0].frame
fn.lus('look.n')[1].frame
for f in fn.lus('look.n'):
    print f.frame.name

result = fn.frames(r'(?i)erception')
print result
def process(text='', lang='en', coreferences=False, constituents=False,
            dependencies=False, expressions=False, **kwargs) -> OrderedDict:
    # Build the NLP-JSON skeleton
    j: OrderedDict = get_base()
    j['meta']['DC.language'] = lang
    d: OrderedDict = get_base_document(1)
    j['documents'].append(d)
    d['meta']['DC.source'] = 'NLTK {}'.format(nltk_version)
    d['text'] = text

    # Collect parsers
    lemmatizer = get_lemmatizer()
    stemmer = get_stemmer()

    # Tokenization and POS tagging
    words = []
    for sent in segment(text):
        for token in sent:
            words.append(token.value)

    # Create the token list
    t_id = 1
    for word, xpos in pos_tag(words):
        wordnet_pos = get_wordnet_pos(xpos)
        lemma = lemmatizer(word, pos=wordnet_pos)

        # Start the token
        t = {'id': t_id, 'text': word, 'stem': stemmer(word)}
        d['tokenList'].append(t)
        t_id += 1

        # WordNet senses
        try:
            synsets = wordnet.synsets(lemma, pos=wordnet_pos)
            senses = {}
            for s in synsets:
                hyponyms = [y for x in s.hyponyms() for y in x.lemma_names()]
                hypernyms = [y for x in s.hypernyms() for y in x.lemma_names()]
                synonyms = s.lemma_names()[1:]
                examples = s.examples()
                sense = {'wordnetId': s.name(), 'definition': s.definition()}
                if synonyms:
                    sense['synonyms'] = synonyms
                if hypernyms:
                    sense['hypernyms'] = hypernyms
                if hyponyms:
                    sense['hyponyms'] = hyponyms
                if examples:
                    sense['examples'] = examples
                antonyms = []
                for l in s.lemmas():
                    if l.antonyms():
                        for a in l.antonyms():
                            antonyms.append(a.name())
                if antonyms:
                    sense['antonyms'] = antonyms
                senses[sense['wordnetId']] = sense
            if senses:
                t['synsets'] = senses
        except:
            pass

        # VerbNet classes and frames
        try:
            verbs = dict((class_id, {'classId': class_id, 'frames': vn.frames(class_id)})
                         for class_id in vn.classids(word))
            if verbs:
                t['verbFrames'] = verbs
        except:
            pass

        # FrameNet frames and lexical units
        try:
            frame_net = {}
            frames = invoke_frame(word)
            if frames is not None:
                for fr in frames:
                    lu_temp = []
                    for lu in fn.lus(r'(?i)' + word.lower()):
                        fr_ = fn.frames_by_lemma(r'(?i)' + lu.name)
                        if len(fr_):
                            if fr_[0] == fr:
                                lu_temp.append({'name': lu.name,
                                                'definition': lu.definition,
                                                'pos': lu.name.split('.')[1]})
                    frame_net[fr.ID] = {
                        'name': fr.name,
                        'frameId': fr.ID,
                        'definition': fr.definition,
                        'lu': lu_temp
                    }
            if frame_net:
                t['frames'] = frame_net
        except:
            pass

    return remove_empty_fields(j)
    else:
        lcs = LexicalConceptualStructureLexicon(path2lcs + '/verbs-English.lcs')
        with open(path2lcs + '/' + pickled_filename, 'wb') as f:
            pickle.dump(lcs, f)
    lcs_feats = ['lcs_eventive', 'lcs_stative']
    type_embedder['lcs'] = lcs

    # WordNet supersenses (lexicographer names)
    synsets = wordnet.all_synsets()
    supersenses = sorted(list(set(['supersense=' + x.lexname() for x in synsets])))

    # FrameNet: map lemma.POS -> frame name
    lem2frame = {}
    for lm in framenet.lus():
        for lemma in lm['lexemes']:
            lem2frame[lemma['name'] + '.' + framenet_posdict[lemma['POS']]] = lm['frame']['name']
    frame_names = sorted(['frame=' + x.name for x in framenet.frames()])
    type_embedder['lem2frame'] = lem2frame

    # VerbNet class ids
    verbnet_classids = sorted(['classid=' + vcid for vcid in verbnet.classids()])

    type_hand_features = (verbnet_classids + supersenses + frame_names +
                          lcs_feats + conc_cols)
    input_size += len(type_hand_features)
    for f in type_hand_features:
        type_embedder['embedder'][f] = 0