def test(self):
    skips = [
        'Eggs and cream mix well together.',
        'The eggs and the cream mixed together.'
    ]
    warnings.simplefilter("ignore", ResourceWarning)
    classid_list = sorted(verbnet.classids(),
                          key=lambda c: LooseVersion(classid_to_number(c)))
    i = 0
    for classid in classid_list:
        for vn_frame in verbnet.frames(classid):
            text = vn_frame['frame'].find('EXAMPLES/EXAMPLE').text
            with self.subTest(i=i, msg='{}: {}'.format(classid, text)):
                if text in skips:
                    continue
                syntax = vn_frame['frame'].find('SYNTAX')
                wanted_primary = strip_roles(
                    vn_frame['frame'].find('DESCRIPTION').get('primary'))
                converted_primary = ' '.join(
                    [phrase for phrase, role in syntax_to_primary(syntax)])
                self.assertEqual(wanted_primary, converted_primary)
            i += 1
    print('Total : {}'.format(i))
def one_hot(list):
    print list
    array = numpy.zeros((len(verbnet.classids())))
    for i in list:
        # print i
        array[i - 1] = 1
    return array
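# A minimal usage sketch for one_hot(), assuming `numpy` and
# `from nltk.corpus import verbnet` are in scope as in the snippet above,
# and that callers pass 1-based positions into verbnet.classids().
all_ids = verbnet.classids()
positions = [all_ids.index(cid) + 1 for cid in verbnet.classids('run')]
vector = one_hot(positions)  # numpy array with a 1 at each class of "run"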
def get_verbs(verb):
    all_set_verbs = set()
    all_verbs = verbnet.classids(lemma=verb)
    for v in all_verbs:
        splitted = v.split("-")
        all_set_verbs.add(splitted[0])
    return all_set_verbs
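# A minimal usage sketch for get_verbs(): the set holds the name part of each
# class id (the text before the first "-"), so a lemma that is only a member
# of give-13.1 would yield {"give"}.
print(get_verbs("give"))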
def genVerbnetFeatures(word, pos, features):
    if pos != 'V':
        return
    vids = vn.classids(word)
    for vid in vids:
        v = vn.vnclass(vid)
        types = [t.attrib['type']
                 for t in v.findall('THEMROLES/THEMROLE/SELRESTRS/SELRESTR')]
        for type in types:
            fstr = "THEMETYPE_" + type
            features.append(fstr)
def is_transitive(lemma):
    try:
        cids = verbnet.classids(lemma)
        frames = verbnet.frames(verbnet.vnclass(cids[0]))
        ret = False
        # for frame in frames:
        #     print "primary:", frame['description']['primary']
        #     ret = ret or "Transitive" in frame['description']['primary']
        # "Transitive" appears in the frame's 'secondary' description
        # (e.g. "Basic Transitive"); 'primary' holds patterns like "NP V NP".
        ret = "Transitive" in frames[0]['description']['secondary']
        return ret
    except:
        return False
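# A minimal usage note for is_transitive(): only the first class id and its
# first frame are inspected, so a verb whose transitive frame is not listed
# first can still come back False.
print(is_transitive("break"))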
def get_verb_lemmas(verbs):
    """Return verbnet lemmas for the given verbs.

    These verbs are stemmed before lookup to prevent empty results.

    :param verbs (list) - The list of verbs to reference.
    :rtype lemmas (list) - A list of lemmas for all verbs - these are not
        separated by verb.
    """
    lemmas = []
    for verb in normalization.stem_words(verbs):
        _lemmas = verbnet.classids(lemma=verb)
        lemmas += [l.split('-')[0] for l in _lemmas]
    return lemmas
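# A minimal usage sketch for get_verb_lemmas(), assuming the project helper
# normalization.stem_words() returns a list of stemmed verb strings; the
# result concatenates the class-name parts for all inputs.
print(get_verb_lemmas(["running", "gave"]))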
def test_remove_before_verb(self):
    """
    Whenever we detect that the sentence starts with a verb, we'll remove
    everything before the verb from the VerbNet syntax.
    """
    from nltk.corpus import verbnet
    buy_first_classid = verbnet.classids('buy')[0]
    buy_first_syntax = verbnet.vnclass(buy_first_classid).find('FRAMES/FRAME/SYNTAX')
    altered_syntax = remove_before_v(buy_first_syntax)
    wanted_syntax = ET.fromstring(
        """<SYNTAX><VERB /><NP value="Theme" /></SYNTAX>""")
    self.assertEqual(syntax_to_str(altered_syntax), syntax_to_str(wanted_syntax))
def getFrames(verb, frames):
    for classid in verbnet.classids(verb):
        vnclass = verbnet.pprint(verbnet.vnclass(classid))
        members = re.compile(r"\s+").split(membersPattern.search(vnclass).group("members"))
        for i in framePattern.finditer(vnclass):
            frame = mergeintrans(mergeNPs("%s" % (i.group("frame"))))
            frame = scomp.sub("SCOMP", frame)
            frame = german.sub("VERB", frame)
            frame = shifted.sub("NP VERB NP", frame)
            frame = finalPPs.sub("", frame)
            if frame in frames:
                frames[frame] += members
            else:
                frames[frame] = members
    return frames
def get_transitivity(verb):
    """
    Take a verb lemma as input. Return the transitivity score and VerbNet (VN)
    frames if available.

    The returned tuple is constructed in the following way:
    - the first element is the transitivity score, where:
        - 1 equals transitive
        - 0 equals intransitive (or at least according to VN)
    - the second element is a list of tuples, each of which consists of:
        - first, the VN class_id of a given meaning of a verb
        - second, the corresponding frame itself

    Regardless of the length of the transitive frames list, the transitivity
    score remains the same.
    """
    class_ids = vn.classids(verb)
    print(class_ids)

    # Define a list containing frames with transitive meanings of the given verb.
    trans_frames = []
    for class_id in class_ids:
        frames = vn.frames(class_id)
        for frame in frames:
            print(frame["description"]["primary"])
            # print(frame['description']['secondary'])
            if frame["description"]["primary"] == "NP V NP":
                entry = class_id, frame
                trans_frames.append(entry)
            # elif "NP V NP" in frame["description"]["primary"]:
            #     entry = class_id, frame
            #     trans_frames.append(entry)
            # elif "Transitive" in frame["description"]["secondary"]:
            #     entry = class_id, frame
            #     trans_frames.append(entry)

    # If the trans_score is equal to one, the verb has a transitive meaning.
    if len(trans_frames) != 0:
        trans_score = 1
    else:
        trans_score = 0

    return trans_score, trans_frames
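# A minimal usage sketch for get_transitivity(): only frames whose primary
# description is exactly "NP V NP" are counted as transitive here.
score, frames = get_transitivity("break")
print(score, len(frames))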
def predicate_generator2(readme):
    for words1 in '.!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~':
        readme = readme.replace(words1, '')
    words1 = word_tokenize(readme)
    words1 = [lem.lemmatize(word) for word in words1]
    # words1 = [ps.stem(word) for word in words1]
    source = [word.lower() for word in words1]
    verb1 = []
    for word, pos in nltk.pos_tag(source):
        if (pos == 'VB'):
            verb1.append(word)
    verbs1 = []
    for token in verb1:
        lemma = [lemma for lemma in vn.classids(token)]
        verbs1.append(lemma)
    return verbs1
def run(self, query):
    "Extract the keywords from the tweets."
    words = []
    for tweet in query.tweets.all():
        words.extend(self.extract_words(tweet.text))
    # for answer in query.yahoo_answers.all():
    #     words.extend(self.extract_words(answer.content))
    #     words.extend(self.extract_words(answer.chosen_answer))

    # Turn to downcase.
    new_words = []
    for w in words:
        if not valid_word.match(w):
            continue
        if w.endswith('ing'):
            continue
        if w in query.theme.split(' '):
            continue
        if w in query.text.split(' '):
            continue
        if len(verbnet.classids(w)) > 0:
            continue
        if w not in stopset:
            try:
                new_words.append(w.decode('ascii'))
            except:
                pass

    # Sort list
    new_words.sort()
    keywords = ['%s:%d' % (k, len(list(g))) for k, g in groupby(new_words)]

    # Save the keywords in the query
    query.keywords = ' '.join(keywords)
    query.save()
    return query
def process(text: str, params: dict) -> OrderedDict: """Process provided text""" # set JSON-NLP j: OrderedDict = base_document() t: OrderedDict = base_nlp_json() t['DC.source'] = 'NLTK {}'.format(__version__) t['documents'].append(j) j['text'] = text # collect parsers lemmatizer = get_lemmatizer() tokenizer = get_tokenizer(params) sentence_tokenizer = get_sentence_tokenizer() stemmer = get_stemmer() parser = get_parser() language = Counter() # tokenize and tag tokens: List[str] = tokenizer.tokenize(text) tokens_tagged: List[tuple] = nltk.pos_tag(tokens) conll_tagged = tree2conlltags(ne_chunk(tokens_tagged)) offset_list: List[Tuple[int, int]] = list(tokenizer.span_tokenize(text)) token_list: List[dict] = [] for token_idx, token_tuple in enumerate(tokens_tagged): token = token_tuple[0] pos_tag = token_tuple[1] wordnet_pos = get_wordnet_pos(pos_tag) entity_tag = conll_tagged[token_idx][2].split("-") if wordnet_pos != '': synsets = wordnet.synsets(token, pos=wordnet_pos) else: synsets = wordnet.synsets(token) sys_id = 0 sys_list = [] for syn in synsets: s_hypo = set([x.lemma_names()[0] for x in syn.hyponyms()]) s_hyper = set([x.lemma_names()[0] for x in syn.hypernyms()]) s_examples = [x for x in syn.examples()] s = { 'wordnet_id': syn.name(), 'id': sys_id, 'synonym': syn.lemma_names()[1:], 'hyponym': list(s_hypo), 'hypernym': list(s_hyper), 'examples': s_examples, 'definition': syn.definition() } if len(s['synonym']) == 0: s.pop('synonym') if len(s['hyponym']) == 0: s.pop('hyponym') if len(s['hypernym']) == 0: s.pop('hypernym') if len(s['examples']) == 0: s.pop('examples') if len(s['definition']) == 0: s.pop('definition') if s: sys_list.append(s) sys_id += 1 verb_list = [] vn_classids = vn.classids(token) for classid in vn_classids: verb_list.append({ 'class_id': classid, 'frames': vn.frames(classid) }) t = { 'id': token_idx, 'text': token, 'lemma': lemmatizer(token, wordnet_pos) if wordnet_pos else lemmatizer(token), 'stem': stemmer(token), 'pos': pos_tag, 'entity': entity_tag[1] if len(entity_tag) > 1 else "", 'entity_iob': entity_tag[0], 'overt': True, 'characterOffsetBegin': offset_list[token_idx][0], 'characterOffsetEnd': offset_list[token_idx][1], 'synsets': sys_list, 'verbnet': verb_list } if len(t['synsets']) == 0: t.pop('synsets') if len(t['verbnet']) == 0: t.pop('verbnet') token_list.append(t) j['tokenList'] = token_list # sentence and dependency parsing sent_list = [] token_from = 0 sentence_tokens = sentence_tokenizer.sentences_from_tokens(tokens) sentence_texts = sentence_tokenizer.sentences_from_text(text) # check whether MALT parser is loaded! 
DC if parser: for sent_idx, sent in enumerate(zip(sentence_tokens, sentence_texts)): # Detecting language of each sentence la = pycountry.languages.get(alpha_2=detect(sent[1])) token_to = token_from + len(sent[0]) - 1 dg = parser.parse_one(sent[1].split()) s = { 'id': sent_idx, 'text': sent[1], 'tokenFrom': token_from, 'tokenTo': token_to, 'tokens': list(range(token_from, token_to)) } for token in dg.nodes: head = dg.nodes[token]['head'] head_word = [ dg.nodes[i]['word'] for i in dg.nodes if dg.nodes[i]['address'] == head ] if len(head_word) > 0: j['dependenciesBasic'].append({ 'governor': head_word[0], 'dependent': dg.nodes[token]['word'], 'type': dg.nodes[token]['rel'] }) else: j['dependenciesBasic'].append({ 'governor': 'null', 'dependent': dg.nodes[token]['word'], 'type': dg.nodes[token]['rel'] }) if j['dependenciesBasic'][-1]['governor'] == 'null' or j['dependenciesBasic'][-1]['dependent'] == 'null' \ or j['dependenciesBasic'][-1]['type'] == 'null': j['dependenciesBasic'].pop() token_from = token_to language[la.name] += 1 sent_list.append(s) j['sentences'] = sent_list if params['language']: t['DC.language'] = params['language'] else: # only if language has some elements can we check for max!!! DC if len(token_list) > 4 and language: t['DC.language'] = max(language) else: t['DC.language'] = '' # TODO: # 1. Schema: clauses, coreferences, constituents, expressions, paragraphs # 2. fields: token: sentiment, embeddings; sentence: sentiment, complex, type, embeddings return j
def getClasses(self, verb):
    return vn.classids(verb)
def get_verbnet_args(verb, verbose=False): lemmatizer = WordNetLemmatizer() lemmatized_verb = lemmatizer.lemmatize(verb.lower(), 'v') classids = verbnet.classids(lemma=lemmatized_verb) if verbose: print('Class IDs for "{}": {}'.format(lemmatized_verb, classids)) if len(classids) < 1: if verbose: print( 'No entry found on verbnet for "{}". Attempting WordNet synsets!' .format(lemmatized_verb)) wn_synsets = wordnet.synsets(lemmatized_verb) for synset in wn_synsets: if len(synset.lemmas()) < 1: continue candidate = str(synset.lemmas()[0].name()) classids = verbnet.classids(lemma=candidate) if verbose: print('Class IDs for "{}": {}'.format(candidate, classids)) if len(classids) > 0: break if len(classids) < 1: if verbose: print( 'Unable to find entries on verbnet for neither of the synsets... Will go recursive now (which is not a good thing!)' ) for synset in wn_synsets: if len(synset.lemmas()) < 1: continue candidate = str(synset.hypernyms()[0].lemmas()[0].name()) return NLPUtils.get_verbnet_args(candidate, verbose=verbose) if verbose: print('Exhausted attempts... returning an empty list.') return [] for id in classids: class_number = id[id.find('-') + 1:] try: v = verbnet.vnclass(class_number) roles = [ t.attrib['type'] for t in v.findall('THEMROLES/THEMROLE') ] pass except ValueError: print('VN class number not found: {}'.format(class_number)) # Will handle these both below v = [None] roles = [] pass while len(roles) < 1 and len(v) > 0: fallback_class_number = class_number[:class_number.rfind('-')] if verbose: print('No roles found for class {}, falling back to {}.'. format(class_number, fallback_class_number)) class_number = fallback_class_number try: v = verbnet.vnclass(class_number) roles = [ t.attrib['type'] for t in v.findall('THEMROLES/THEMROLE') ] pass except ValueError: # Go on with the loop v = [None] roles = [] pass if len(roles) > 0: if verbose: print('Roles found: {}'.format(roles)) return roles return None
def prim_fram(input): s = parse(input, relations=True, lemmata=True) # print s l = parse(input).split()[0] m = nltk.pos_tag(input.split(" ")) # print m oy = [] adj = [] nph = [] pph = [] vbp = [] adv = [] exc = [] for i in range(len(l)): tup = (l[i][2],l[i][0]) oy.append(tup) # print oy for i in range(len(m)): if m[i][1] == "JJ": adj.append((m[i][0], i + 1)) j=0 x=0 for i in range(len(oy)-1): k = i c = i np = "" vp = "" if oy[i][0]=="B-PP": pph.append((oy[i][1],i+1)) if oy[i][0] == "B-ADVP": adv.append((oy[i][1], i + 1)) if oy[i][1] in list: # print oy[i][1] exc.append((oy[i][1], i + 1)) if k >=j: while(oy[k][0] == "B-NP" or oy[k][0] == "I-NP") and (k <= range(len(oy))): np = np + oy[k][1]+" " k = k+1 j = k if np!='': nph.append((np,j)) if c >= x: while (oy[k][0] == "B-VP" or oy[k][0] == "I-VP") and (k <= range(len(oy))): vp = vp + oy[k][1] + " " k = k + 1 x = k if vp != '': vbp.append((vp, j)) # print vbp sen = nph+pph+vbp+adv+exc+adj # print sen sen1 = sorted(sen, key=lambda x: x[1]) # print sen1 senf = [] for i in range(len(sen1)-1): u = sen1[i + 1] if sen1[i][0] != u[0]: senf.append(sen1[i]) senf.append(sen1[-1]) # print senf frame = [] for z in range(len(senf)): if (senf[z] in nph): if(z>=2 and "ing" in senf[z][0]): frame.append((senf[z][0],"ING")) continue frame.append((senf[z][0], "NP")) continue if senf[z] in pph: if (z>2 and "ing" in senf[z][0]): frame.append((senf[z][0], "ING")) continue frame.append((senf[z][0], "PP")) continue if senf[z] in exc: frame.append((senf[z][0], senf[z][0])) continue if senf[z] in vbp: if (z>=2 and "ing" in senf[z][0]): frame.append((senf[z][0], "ING")) continue frame.append((senf[z][0], "VP")) continue if senf[z] in adv: if (z>2 and "ing" in senf[z][0]): frame.append((senf[z][0], senf[z][0])) continue frame.append((senf[z][0], "ADVP")) continue if senf[z] in adj: if (z>2 and "ing" in senf[z][0]): frame.append((senf[z][0], senf[z][0])) continue frame.append((senf[z][0], "ADJ")) continue vbf = [] ps = PorterStemmer() for i in vbp: h = vb.classids(ps.stem(i[0].lower().strip())) # print h if h != []: vbf.append(ps.stem(i[0].strip())) return vbf,frame
import os import re from nltk.corpus import verbnet as vbnet thematic_roles = [] selres = [] semantics = [] themroles_dict = {} semantics_dict = {} selres_dict = {} for file in os.listdir("D:/Downloads/new_vn"): if file.endswith(".xml"): # print(file.strip(".xml").split("-")[0]) # s=str(vbnet.pprint(file.strip(".xml").split("-")[0])) l = vbnet.classids(file.strip(".xml").split("-")[0]) if l!=[]: for i in l: t=2 s = str(vbnet.pprint(i)) # print(s) subclasses = s.split("Subclasses:")[1].split("Members")[0].strip() theme = s.split("Thematic roles:")[1].split("Frames")[0] seman = s.split("Semantics:")[1:] for j in seman: k = j.split("\n") for w in k: if '*' in w: if w.strip(" * ").split("(")[0] in semantics_dict: semantics.append(w.strip(" * ").split("(")[0].strip()) continue else:
synsets = wordnet.all_synsets()
supersenses = \
    sorted(list(set(['supersense=' + x.lexname() for x in synsets])))

# Framenet
lem2frame = {}
for lm in framenet.lus():
    for lemma in lm['lexemes']:
        (lem2frame[lemma['name'] + '.' +
                   framenet_posdict[lemma['POS']]]) = lm['frame']['name']
frame_names = sorted(['frame=' + x.name for x in framenet.frames()])
type_embedder['lem2frame'] = lem2frame

# Verbnet classids
verbnet_classids = \
    sorted(['classid=' + vcid for vcid in verbnet.classids()])

type_hand_features = (verbnet_classids + supersenses + frame_names +
                      lcs_feats + conc_cols)
input_size += len(type_hand_features)
for f in type_hand_features:
    type_embedder['embedder'][f] = 0

# Write all the feature names to a text file
if args.type and args.token:
    with open('../../data/list_of_all_hand_eng_features.txt', 'w') as f:
        for feature in token_hand_features + type_hand_features:
            f.write(feature + "\n")

# Preload embedders for bert, elmo, glove if necessary
def process(text='', lang='en', coreferences=False, constituents=False, dependencies=False, expressions=False, **kwargs) -> OrderedDict: # build nlp-json j: OrderedDict = get_base() j['meta']['DC.language'] = lang d: OrderedDict = get_base_document(1) #j['documents'][d['id']] = d j['documents'].append(d) d['meta']['DC.source'] = 'NLTK {}'.format(nltk_version) j['meta']['DC.language'] = lang d['text'] = text # collect parsers lemmatizer = get_lemmatizer() stemmer = get_stemmer() # tokenization and pos words = [] for sent in segment(text): for token in sent: words.append(token.value) # create the token list t_id = 1 for word, xpos in pos_tag(words): wordnet_pos = get_wordnet_pos(xpos) lemma = lemmatizer(word, pos=wordnet_pos) # start the token t = {'id': t_id, 'text': word, 'stem': stemmer(word)} #d['tokenList'][t['id']] = t d['tokenList'].append(t) t_id += 1 # wordnet try: synsets = wordnet.synsets(lemma, pos=wordnet_pos) senses = {} for s in synsets: hyponyms = [ y for x in s.hyponyms() for y in x.lemma_names() ] hypernyms = [ y for x in s.hypernyms() for y in x.lemma_names() ] synonyms = s.lemma_names()[1:] examples = s.examples() sense = { 'wordnetId': s.name(), 'definition': s.definition() } if synonyms: sense['synonyms'] = synonyms if hypernyms: sense['hypernyms'] = hypernyms if hyponyms: sense['hyponyms'] = hyponyms if examples: sense['examples'] = examples antonyms = [] for l in s.lemmas(): if l.antonyms(): for a in l.antonyms(): antonyms.append(a.name()) if antonyms: sense['antonyms'] = antonyms senses[sense['wordnetId']] = sense if senses: t['synsets'] = senses except: pass # verbnet try: verbs = dict((class_id, { 'classId': class_id, 'frames': vn.frames(class_id) }) for class_id in vn.classids(word)) if verbs: t['verbFrames'] = verbs except: pass # framenet try: frame_net = {} frames = invoke_frame(word) if frames is not None: for fr in frames: lu_temp = [] for lu in fn.lus(r'(?i)' + word.lower()): fr_ = fn.frames_by_lemma(r'(?i)' + lu.name) if len(fr_): if fr_[0] == fr: lu_temp.append({ 'name': lu.name, 'definition': lu.definition, 'pos': lu.name.split('.')[1] }) frame_net[fr.ID] = { 'name': fr.name, 'frameId': fr.ID, 'definition': fr.definition, # 'relations':fr.frameRelations, 'lu': lu_temp } if frame_net: t['frames'] = frame_net except: pass return remove_empty_fields(j)
def analyze_constructs(examples, role_mapping, evaluation_sets, verbnet): annotated_sentences, lemma_in_vn = 0, 0 n_correct_frames, n_frames = 0, 0 n_correct_roles, n_roles = 0, 0 n_classes_in_list, n_classes = 0, 0 for lexie, lemma, sentence_text, gold_syntax in examples: d = sentence_text in [ sentence for source, sentence in evaluation_sets['train'] ] test_context = sentence_text in [ sentence for source, sentence in evaluation_sets['test'] ] debug(d, []) if d == test_context: print(d, test_context, sentence_text) assert d != test_context debug(d, [lexie, lemma, sentence_text]) if test_context: annotated_sentences += 1 # First possible error: lemma does not exist in VerbNet if not verbnet.classids(lemma): continue if test_context: lemma_in_vn += 1 n_frames += 1 considered_syntax = [] for vn_frame in verbnet.frames_for_lemma(lemma): vn_syntax = vn_frame['frame'].find('SYNTAX') # If sentence starts with a verb, remove anything that's before # the verb in VerbNet if next(iter(gold_syntax)).tag == 'VERB': vn_syntax = remove_before_v(vn_syntax) considered_syntax.append((vn_frame['classid'], vn_syntax)) # Use an OrderedDict for now to get the same behavior than # with the tuple list vn_syntax_matches = OrderedDict() for classid, vn_syntax in considered_syntax: if matches_verbnet_frame(gold_syntax, vn_syntax): if classid not in vn_syntax_matches: vn_syntax_matches[classid] = [] # check if vn_syntax is already in there? vn_syntax_matches[classid].append(vn_syntax) # Second possible error: syntactic pattern is not in VerbNet if not vn_syntax_matches: debug(d, [' ', Fore.RED, syntax_to_str(gold_syntax), Fore.RESET]) continue if test_context: n_correct_frames += 1 n_classes += 1 if lexie not in role_mapping: raise Exception('Missing lexie {} ({}) in role mapping.'.format( lexie, lemma)) debug(d, [ ' ', Fore.GREEN, syntax_to_str(gold_syntax), '->', syntax_to_str( map_gold_frame(classid, gold_syntax, role_mapping[lexie])), Fore.RESET ]) for classid in vn_syntax_matches: debug(d, [ ' ', classid, ' -> ', [ syntax_to_str(vn_syntax) for vn_syntax in vn_syntax_matches[classid] ] ]) class_matches = set(vn_syntax_matches.keys()) & set( role_mapping[lexie]) if not class_matches: continue if test_context: n_classes_in_list += 1 classid = next(iter(class_matches)) vn_syntax = vn_syntax_matches[classid][0] if classid not in role_mapping[lexie]: continue for i, correct_syntax in enumerate(gold_syntax): # if this is a 'frame element', not a V or anything else if correct_syntax.tag in ['NP', 'S']: if role_mapping[lexie] == {}: # missing sense # TODO handle this explicitly using XML annotations pass elif classid not in role_mapping[lexie]: raise Exception('{} misses {} class'.format( lexie, classid)) elif correct_syntax.get( 'value') not in role_mapping[lexie][classid]: raise Exception('{} misses {} mapping'.format( lexie, correct_syntax.get('value'))) if test_context: n_roles += 1 candidate_roles = set() candidate_roles.add(list(vn_syntax)[i].get('value')) if role_mapping[lexie][classid][correct_syntax.get( 'value')] in candidate_roles: if test_context: n_correct_roles += 1 / len(candidate_roles) print(annotated_sentences, n_frames, n_classes, n_roles) print('- {:.0%} of lemma tokens are here'.format( lemma_in_vn / annotated_sentences)) print('- For these tokens, {:.1%} of constructions are correct'. 
format(n_correct_frames / n_frames)) print('- For these constructions, {:.1%} of classes are here'.format( n_classes_in_list / max(n_classes, 1))) print('- For these classes, {:.1%} of roles are correct'.format( n_correct_roles / max(n_roles, 1))) print()
def verbs_in_verbnet(verb):
    vn_results = vn.classids(lemma=verb)
    return 1 if vn_results else 0
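# A minimal usage sketch for verbs_in_verbnet(): a lemma with at least one
# VerbNet class yields 1, an out-of-vocabulary string yields 0.
print(verbs_in_verbnet("give"))   # 1
print(verbs_in_verbnet("xyzzy"))  # 0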
def print_if_passive(sent): """Given a sentence, tag it and print if we think it's a passive-voice formation.""" lancaster_stemmer = LancasterStemmer() tagged = tag_sentence(sent) tags = map(lambda (tup): tup[1], tagged) if passivep(tags): file.write(oneline(sent)) blob = TextBlob(oneline(sent)) flag = True prevnoun = "" verb = "" nextnoun = "" for word, pos in blob.tags: if (pos == 'NN' or pos == 'NNP') and flag == True: prevnoun = word if (pos == 'VBG' or pos == 'RB' or pos == 'VBN') and flag == True: verb = word flag = False if (pos == 'NN' or pos == 'NNP') and flag == False: nextnoun = word break lancaster_stemmer.stem(verb) print verb if len(verbnet.classids(verb)) == 0: ans = prevnoun + " " + verb + " " + nextnoun + " " else: ans1 = verbnet.classids(verb) ansstring = ''.join(ans1) ans = prevnoun + " " + ansstring + " " + nextnoun + " " fileans.write(ans + '\n') #print verbnet.classids('acclaim') #print "passive:", oneline(sent) else: file1.write(oneline(sent)) blob = TextBlob(oneline(sent)) flag1 = True prevnoun1 = "" verb1 = "" nextnoun1 = "" for word, pos in blob.tags: #print word,pos if (pos == 'NN' or pos == 'NNP') and flag1 == True: prevnoun1 = word if (pos == 'VBG' or pos == 'RB' or pos == 'VBN') and flag1 == True: verb1 = word flag1 = False if (pos == 'NN' or pos == 'NNP') and flag1 == False: nextnoun1 = word break lancaster_stemmer.stem(verb1) print verb1 if len(verbnet.classids(verb1)) == 0: ans = prevnoun1 + " " + verb1 + " " + nextnoun1 + " " else: ans1 = verbnet.classids(verb1) ansstring = ''.join(ans1) ans = prevnoun1 + " " + ansstring + " " + nextnoun1 + " " fileans.write(ans + '\n')
from lcsreader import LexicalConceptualStructureLexicon
lcs = LexicalConceptualStructureLexicon(home + '/Desktop/protocols/data/verbs-English.lcs')
lcs_feats = ['lcs_eventive', 'lcs_stative']

# Wordnet supersenses (lexicographer names)
supersenses = list(set(['supersense=' + x.lexname() for x in wordnet.all_synsets()]))

# Framenet
lem2frame = {}
for lm in framenet.lus():
    for lemma in lm['lexemes']:
        lem2frame[lemma['name'] + '.' + framnet_posdict[lemma['POS']]] = lm['frame']['name']
frame_names = ['frame=' + x.name for x in framenet.frames()]

# Verbnet classids
verbnet_classids = ['classid=' + vcid for vcid in verbnet.classids()]

dict_feats = {}
for f in verbnet_classids + lexical_feats + supersenses + frame_names + lcs_feats + all_ud_feature_cols + conc_cols:
    dict_feats[f] = 0

x_pd = pd.DataFrame([features_func(sent_feat=sent, token=token, lemma=lemma,
                                   dict_feats=dict_feats.copy(), prot=args.prot,
                                   concreteness=concreteness, lcs=lcs, l2f=lem2frame)
                     for sent, token, lemma in zip(raw_x, tokens, lemmas)])

dev_x_pd = pd.DataFrame([features_func(sent_feat=sent, token=token, lemma=lemma,
                                       dict_feats=dict_feats.copy(), prot=args.prot,
                                       concreteness=concreteness, lcs=lcs, l2f=lem2frame)
                         for sent, token, lemma in zip(raw_dev_x, dev_tokens, dev_lemmas)])

test_x_pd = pd.DataFrame([features_func(sent_feat=sent, token=token, lemma=lemma,
                                        dict_feats=dict_feats.copy(), prot=args.prot,
                                        concreteness=concreteness, lcs=lcs, l2f=lem2frame)
                          for sent, token, lemma in zip(raw_test_x, test_tokens, test_lemmas)])

feature_names = (verbnet_classids, supersenses, frame_names, lcs_feats,
                 conc_cols, lexical_feats, all_ud_feature_cols)
y = {}
dev_y = {}
from nltk.corpus import verbnet

# classids() expects a bare lemma such as 'kill', not an infinitive like 'to kill'
classID = verbnet.classids('kill')
for id in classID:
    print(verbnet.themroles(id))
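# A minimal alternative sketch for older NLTK releases, whose verbnet reader
# exposes pprint_themroles(vnclass) rather than themroles():
for cid in verbnet.classids('kill'):
    print(verbnet.pprint_themroles(verbnet.vnclass(cid)))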
def features_func(sent_feat, token, lemma, dict_feats, prot, concreteness, lcs, l2f): '''Extract features from a word''' sent = sent_feat[0] feats = sent_feat[1][0] all_lemmas = sent_feat[1][1] deps = [x[2] for x in sent.tokens[token].dependents] deps_text = [x[2].text for x in sent.tokens[token].dependents] deps_feats = '|'.join([(a + "_dep") for x in deps for a in feats[x.position].split('|')]) all_feats = (feats[token] + '|' + deps_feats).split('|') all_feats = list(filter(None, all_feats)) # UD Lexical features for f in all_feats: if f in dict_feats.keys(): dict_feats[f] = 1 # Lexical item features for f in deps_text: if f in dict_feats.keys(): dict_feats[f] = 1 # wordnet supersense of lemma for synset in wordnet.synsets(lemma): dict_feats['supersense=' + synset.lexname()] = 1 # framenet name pos = sent.tokens[token].tag if lemma + '.' + pos in l2f.keys(): frame = l2f[lemma + '.' + pos] dict_feats['frame=' + frame] = 1 # Predicate features if prot == "pred": # verbnet class f_lemma = verbnet.classids(lemma=lemma) for f in f_lemma: dict_feats['classid=' + f] = 1 # lcs eventiveness if lemma in lcs.verbs: if True in lcs.eventive(lemma): dict_feats['lcs_eventive'] = 1 else: dict_feats['lcs_stative'] = 1 dep_c_scores = [ concreteness_score(concreteness, g_lemma) for g_lemma in [all_lemmas[x[2].position] for x in sent.tokens[token].dependents] ] if len(dep_c_scores): dict_feats['concreteness'] = sum(dep_c_scores) / len(dep_c_scores) dict_feats['max_conc'] = max(dep_c_scores) dict_feats['min_conc'] = min(dep_c_scores) else: dict_feats['concreteness'] = 2.5 dict_feats['max_conc'] = 2.5 dict_feats['min_conc'] = 2.5 # Argument features else: dict_feats['concreteness'] = concreteness_score(concreteness, lemma) # lcs eventiveness score and verbnet class of argument head if sent.tokens[token].gov: gov_lemma = all_lemmas[sent.tokens[token].gov.position] # lexical features of dependent of governor deps_gov = [x[2].text for x in sent.tokens[token].gov.dependents] for f in deps_gov: if f in dict_feats.keys(): dict_feats[f] = 1 # lcs eventiveness if gov_lemma in lcs.verbs: if True in lcs.eventive(gov_lemma): dict_feats['lcs_eventive'] = 1 else: dict_feats['lcs_stative'] = 1 for f_lemma in verbnet.classids(lemma=gov_lemma): dict_feats['classid=' + f_lemma] = 1 # framenet name of head pos = sent.tokens[token].gov.tag if gov_lemma + '.' + pos in l2f.keys(): frame = l2f[gov_lemma + '.' + pos] dict_feats['frame=' + frame] = 1 return dict_feats
maxTerm = term # print name, maxTerm, maxCount total += maxCount maxCount = 0 avg = total / l # print bagOfWords["elizabeth"] # print avg allverbs = [] # Creating training set fr = open(extfile, 'r') for line in fr: token = line.strip("\n") extList[token] = avg words = verbnet.classids(token) for w in words: finalWord = w.decode("UTF-8", "ignore") allverbs += verbnet.lemmas(finalWord) for v in allverbs: extList[v] = avg / 2 # print len(extList) allverbs = [] fr = open(intfile, 'r') for line in fr: token = line.strip("\n") intList[token] = avg words = verbnet.classids(token)
nps = extract_phrase(tree_str, 'NP')
vps = extract_phrase(tree_str, 'VP')
pps = extract_phrase(tree_str, 'PP')

if before_verb in nps:
    print("YES BEFORE VERB")
if after_verb in nps:
    print("YES AFTER VERB")

print(nps)
print(vps)
print(pps)

for np in nps:
    print(np)

print("=============")

word = "come"
vn_results = vn.classids(lemma=word)
print(vn_results)

frames = vn.frames('51.2')[0]
syntax = frames['syntax']
for item in syntax:
    print(item['pos_tag'])

print("=====================")
nlp.close()
# In[8]:

verbNet = []
for unit in data:
    sentence = []
    for cap in unit[3]:
        # print(unit[3])
        tagged = nltk.pos_tag(cap)
        verb_lists = []
        for idx, tp in enumerate(tagged):
            if (tp[1][:2] == 'VB'):
                base_form = WordNetLemmatizer().lemmatize(cap[idx], 'v')
                if (verbnet.classids(base_form) != []):
                    attr = verbnet.classids(base_form)
                    at_list = []
                    for at in attr:
                        splitted_at = []
                        splitted_string = at.split('-')
                        splitted_at.append(splitted_string[0])
                        splitted_at.append(splitted_string[1].split('.')[0])
                        at_list.append([])
                        at_list[-1] = splitted_at
                    verb_lists.append([base_form, at_list, len(attr)])
        sentence.append(verb_lists)
        # print(sentence[-1])
    verbNet.append(sentence)

# print(verbNet)
# print(i)
# # for i in featuresset:
# #     print(i)
# random.shuffle(featuresset)
# classifier = nltk.NaiveBayesClassifier.train(featuresset)
# save_classifier_NBC(classifier)

# ----------------------------------------- testing ---------------------------------------------------
input = "He need a ride from his home."
verb_list, frames_list = prim_fram(input)
print(frames_list)
print(nltk.pos_tag(nltk.word_tokenize(input)))
print(verb_list)

for r in range(len(verb_list)):
    keys = []
    ids = vb.classids(verb_list[r])
    for i in ids:
        u = vb.vnclass(i)
        for j in [l.attrib['type'] for l in u.findall('THEMROLES/THEMROLE/SELRESTRS/SELRESTR')]:
            keys.append(j)
        for j in [l.attrib['type'] for l in u.findall('THEMROLES/THEMROLE')]:
            keys.append(j)
        for j in [l.attrib['value'] for l in u.findall('FRAMES/FRAME/SEMANTICS/PRED')]:
            keys.append(j)

f = open("tmp/features_verbs.txt", "r")
word_features = []
for l, i in enumerate(f):
    word_features.append(i)
f.close()
import numpy
from sklearn import linear_model, cross_validation, neural_network
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from nltk.corpus import verbnet

goog = utility.get_news_prices('google')
goog.append(utility.get_news_prices('microsoft'))
goog.append(utility.get_news_prices('apple'))
goog.append(utility.get_news_prices('yahoo'))
goog.append(utility.get_news_prices('adobe'))
goog.append(utility.get_news_prices('ford'))

# Select model of computation:
model = neural_network.MLPRegressor(
    [len(verbnet.classids()) + 200, 500, 300, 100], 'relu', 'adam', 0.0001,
    200, 'constant', 0.001, 0.5, 200, True, None, 0.0001, False, False, 0.9,
    True, False, 0.1, 0.9, 0.999, 1e-08)
# model = RandomForestRegressor(n_estimators=50, max_features=30, max_depth=9, n_jobs=1)
# model = SVC(kernel='linear', probability=True, random_state=40)
# model = linear_model.LinearRegression()
# model = utility.pipeline_setup(model)
# model_fitted = model.fit(goog['message'], goog['Threshold Change'])

# Select columns:
x = goog.message.apply(
    lambda sentence: utility.get_feature_vector(sentence + "."))
# x.to_csv('data/google_msg_id.csv')
# x = pandas.read_csv('data/google_msg_id.csv')
# In[ ]:


# In[ ]:


# In[33]:

nltk.download()


# In[34]:

from nltk.corpus import verbnet


# In[38]:

verbnet.classids(lemma='add')


# In[39]:

verbnet.classids(lemma='buy')


# In[41]:

verbnet.classids(lemma='take')


# In[42]:

verbnet.classids(lemma='give')


# In[44]:
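# Hedged follow-up sketch: the cells above just return lists of class-id
# strings; printing makes the lookups visible when run as a script (exact ids
# depend on the installed VerbNet version, e.g. roughly ['give-13.1'] for 'give').
for lemma in ('add', 'buy', 'take', 'give'):
    print(lemma, verbnet.classids(lemma=lemma))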
def hand_engineering(prot, batch_size, data, data_dev): ''' Hand engineered feature extraction. Supports the following - UD, Verbnet classids, Wordnet supersenses, concreteness ratings, LCS eventivity scores ''' home = expanduser("~") framnet_posdict = { 'V': 'VERB', 'N': 'NOUN', 'A': 'ADJ', 'ADV': 'ADV', 'PREP': 'ADP', 'NUM': 'NUM', 'INTJ': 'INTJ', 'ART': 'DET', 'C': 'CCONJ', 'SCON': 'SCONJ', 'PRON': 'PRON', 'IDIO': 'X', 'AVP': 'ADV' } # Load the features features = {} with open(home + '/Desktop/protocols/data/features-2.tsv', 'r') as f: for line in f.readlines(): feats = line.split('\t') features[feats[0]] = (feats[1].split(), feats[2].split()) # Load the predpatt objects for creating features files = [ '/Downloads/UD_English-r1.2/en-ud-train.conllu', '/Downloads/UD_English-r1.2/en-ud-dev.conllu', '/Downloads/UD_English-r1.2/en-ud-test.conllu' ] home = expanduser("~") options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True, resolve_conj=False, cut=True) # Resolve relative clause patt = {} for file in files: path = home + file with open(path, 'r') as infile: for sent_id, ud_parse in load_conllu(infile.read()): patt[file[33:][:-7] + " " + sent_id] = PredPatt(ud_parse, opts=options) data['Structure'] = data['Split.Sentence.ID'].map(lambda x: (patt[x], features[x])) data_dev['Structure'] = data_dev['Split.Sentence.ID'].map( lambda x: (patt[x], features[x])) raw_x = data['Structure'].tolist() raw_dev_x = data_dev['Structure'].tolist() all_x = raw_x + raw_dev_x all_feats = '|'.join(['|'.join(all_x[i][1][0]) for i in range(len(all_x))]) feature_cols = Counter(all_feats.split('|')) # All UD dataset features all_ud_feature_cols = list( feature_cols.keys()) + [(a + "_dep") for a in feature_cols.keys()] # Concreteness f = open(home + '/Desktop/protocols/data/concrete.pkl', 'rb') concreteness = pickle.load(f) if prot == 'arg': conc_cols = ['concreteness'] else: conc_cols = ['concreteness', 'max_conc', 'min_conc'] f.close() # LCS eventivity from lcsreader import LexicalConceptualStructureLexicon lcs = LexicalConceptualStructureLexicon( home + '/Desktop/protocols/data/verbs-English.lcs') lcs_feats = ['lcs_eventive', 'lcs_stative'] # Wordnet supersenses(lexicographer names) supersenses = list( set(['supersense=' + x.lexname() for x in wordnet.all_synsets()])) # Framenet lem2frame = {} for lm in framenet.lus(): for lemma in lm['lexemes']: lem2frame[lemma['name'] + '.' 
+ framnet_posdict[lemma['POS']]] = lm['frame']['name'] frame_names = ['frame=' + x.name for x in framenet.frames()] # Verbnet classids verbnet_classids = ['classid=' + vcid for vcid in verbnet.classids()] # Lexical features lexical_feats = [ 'can', 'could', 'should', 'would', 'will', 'may', 'might', 'must', 'ought', 'dare', 'need' ] + [ 'the', 'an', 'a', 'few', 'another', 'some', 'many', 'each', 'every', 'this', 'that', 'any', 'most', 'all', 'both', 'these' ] dict_feats = {} for f in verbnet_classids + lexical_feats + supersenses + frame_names + lcs_feats + all_ud_feature_cols + conc_cols: dict_feats[f] = 0 x_pd = pd.DataFrame([ features_func(sent_feat=sent, token=token, lemma=lemma, dict_feats=dict_feats.copy(), prot=prot, concreteness=concreteness, lcs=lcs, l2f=lem2frame) for sent, token, lemma in zip(raw_x, data['Root.Token'].tolist(), data['Lemma'].tolist()) ]) dev_x_pd = pd.DataFrame([ features_func(sent_feat=sent, token=token, lemma=lemma, dict_feats=dict_feats.copy(), prot=prot, concreteness=concreteness, lcs=lcs, l2f=lem2frame) for sent, token, lemma in zip(raw_dev_x, data_dev['Root.Token'].tolist( ), data_dev['Lemma'].tolist()) ]) # Figure out which columns to drop(they're always zero) todrop1 = dev_x_pd.columns[(dev_x_pd == 0).all()].values.tolist() todrop = x_pd.columns[(x_pd == 0).all()].values.tolist() intdrop = [a for a in todrop if a not in todrop1] cols_to_drop = cols_to_drop = list(set(todrop) - set(intdrop)) x = x_pd.drop(cols_to_drop, axis=1).values.tolist() dev_x = dev_x_pd.drop(cols_to_drop, axis=1).values.tolist() x = [[a[:] for a in x[i:i + batch_size]] for i in range(0, len(data), batch_size)] dev_x = [[a[:] for a in dev_x[i:i + batch_size]] for i in range(0, len(data_dev), batch_size)] return x, dev_x
def root_word_verbnet_features(self):
    self.root_word_lemma = lemmatizer.lemmatize(self.root_word, 'v')
    all_classids = vn.classids(lemma=self.root_word_lemma)
    self.verb_class = ' '.join([c_id.split('-')[0] for c_id in all_classids])
    self.only_classids = ' '.join([vn.shortid(c_id) for c_id in all_classids])
from nltk.corpus import verbnet


def GetVerbnetRestrictions(vnclass):
    role_restrictions = {}
    while True:
        for role in vnclass.findall('THEMROLES/THEMROLE'):
            restrictions = role.find('SELRESTRS')
            if restrictions:
                restriction_set = set()
                for restriction in restrictions.findall('SELRESTR'):
                    predicate = restriction.attrib
                    restriction_set.add((predicate['Value'], predicate['type']))
                total = (restrictions.get('logic', 'and'), list(restriction_set))
                role_restrictions[role.attrib['type']] = total
        if vnclass.tag == 'VNCLASS':
            break
        else:
            parent_class = vnclass.attrib['ID'].rsplit('-', 1)[0]
            vnclass = verbnet.vnclass(parent_class)
    return role_restrictions


vnclasses = verbnet.classids('drink')
v = verbnet.vnclass('39.1-2')
GetVerbnetRestrictions(v)
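# Hedged usage sketch: run GetVerbnetRestrictions over every class returned
# for a lemma instead of the hard-coded '39.1-2'.
for cid in verbnet.classids('drink'):
    print(cid, GetVerbnetRestrictions(verbnet.vnclass(cid)))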
from nltk.corpus import verbnet

my_classids = verbnet.classids(lemma='take')
print(my_classids)
# my_lemmas = verbnet.lemmas(my_classids)
# my_longid = longid(my_shortid)
# my_shortid = shortid(my_longid)

for i in my_classids:
    my_vnclass = verbnet.vnclass(i)
    # my_wordnetids = verbnet.wordnetids(mi)

    # Human-friendly methods
    verbnet.pprint(my_vnclass)
    # vnframe = my_vnclass.findall('FRAMES/FRAME')
    # print(verbnet.pprint_description(vnframe))
    # print(verbnet.pprint_frames(vnframe))
    print(verbnet.pprint_members(my_vnclass))
    # print(verbnet.pprint_semantics(vnframe))
    print(verbnet.pprint_subclasses(my_vnclass))
    # print(verbnet.pprint_syntax(vnframe))
    # x = verbnet.pprint_themroles(my_vnclass)
    print(verbnet.pprint_themroles(my_vnclass))
    '''for j in x.split("]"):
        print(j)'''
def process_srl(srl_output, actual_data, just_phrases): porter_stemmer = PorterStemmer() wn_lem = WordNetLemmatizer() file_open = open (srl_output, "r") output = file_open.read() srl_output = output.split("\n================\n") srl_list = [] [srl_list.append(line.strip()) for line in srl_output] phrase_sentence = create_vector(just_phrases) corpus_data = create_vector(actual_data) number = 0 for line in corpus_data: sline = line.split("\t") sense = sline[2] # figurative or literal metaphor = sline[1] # along the line <- the metaphor itself try: current_srl = srl_list[number].split("\n") # semantic role labeling of give sentece except: import pdb; pdb.set_trace() #mtokens = metaphor.split(" ") mtokens_t = word_tokenize(phrase_sentence[number]) mtokens_t = [w for w in mtokens_t if not w.decode('utf8') in nlcor.stopwords.words('english')] mtokens = filter(lambda word: word not in ",-'", mtokens_t) sane_mt = [mt.decode('utf8') for mt in mtokens] pos_mtokens = nltk.pos_tag(sane_mt) only_verbs = [tkn[0] for tkn in pos_mtokens if 'VB' in tkn[1]] #print "===============================================" line_score = 0 token_count = 1 number += 1 #print "phrase tokens: %s" % mtokens_t #print "only verbs: %s" % only_verbs for mtoken in only_verbs: vnclasses = verbnet.classids(mtoken) if not vnclasses: vnclasses = verbnet.classids(wn_lem.lemmatize(mtoken)) if not vnclasses: continue #print "vnclasses: %s" % vnclasses mindex = [index for index, sl in enumerate(current_srl) if porter_stemmer.stem(mtoken) in sl.decode('utf8')] if not mindex: # print 0 continue token_count += 1 class_score = 0 class_count = 1 #print '----- %s -----' % mtoken for vn in vnclasses: v=verbnet.vnclass(vn) try: restrictions = GetVerbnetRestrictions(v) except: continue # print restrictions if restrictions: class_score = check_validity(current_srl, mindex[0], restrictions) class_count += 1 #print class_score else: #print "No restrictions for %s" % vn pass if class_count < 2: avg_class_score = class_score / class_count else: avg_class_score = class_score / (class_count - 1) #print '---------------' line_score += avg_class_score token_count += 1 if token_count < 2: avg_line_score = line_score / token_count else: avg_line_score = line_score / (token_count - 1) # print "%s - %s - %s" % (sline[1], sline[2], line_score) print avg_line_score
def run(self): print("Performing action identifier experiment ...") open(self.config['log_file'], 'w') count = 0 sentences_total = 0 start_time = time.time() utils.write_log(self.config, "RUNNING CONFIGURATION: {}".format(self.config)) # Create dataset object wikihow = Wikihow.Wikihow(self.config) statistic_list = [] statistic_similarity = [] ground_truth_count = 0 dataset_length = int( wikihow.get_length() * self.config['action_identifier']['dataset_evaluation_percent']) if dataset_length < 1: print("No examples to process in dataset. Aborting ...") return verbs = [] for idx in trange(dataset_length): instance = wikihow.get_entry(idx) text = wikihow.process_example(instance[1]) utils.write_log( self.config, "\n---------------------------------------------------------------------------\n" ) utils.write_log(self.config, "FILE: {}\n".format(instance[0])) spacy_en = spacy.load('en_core_web_sm') for sentence in text: sentences_total += 1 # Tokenize if self.config['action_identifier'][ 'ground_truth_generator'] == 'nltk': sentence_tokens = nltk.word_tokenize(sentence) sentence_tags = nltk.pos_tag(sentence_tokens) ground_truth_verbs = [ v[0] for v in sentence_tags if len(verbnet.classids(v[0])) > 0 ] elif self.config['action_identifier'][ 'ground_truth_generator'] == 'spacy': doc = spacy_en(sentence) sentence_tokens = [t for t in doc] sentence_tags = [(str(t), t.pos_) for t in doc] ground_truth_verbs = [v for v in doc if v.pos_ == 'VERB'] else: print("No ground-truth mechanism defined! Aborting ...") return utils.write_log(self.config, "\n>SENTENCE: {}".format(sentence)) utils.write_log(self.config, "\n >SENTENCE TAGS: {}".format(sentence_tags)) if len(ground_truth_verbs) == 0: ground_truth_count += 1 utils.write_log( self.config, "\n >GROUND-TRUTH VERBS: {}".format(ground_truth_verbs)) embedding_verbs = [] for token, tag in zip(sentence_tokens, sentence_tags): keyword_similarity = [] for keyword in self.config['action_identifier'][ 'keywords']: try: similarity = 1.0 - self.word_embedding.get_distance( str(token), str(keyword))[2] except KeyError: similarity = 0.0 keyword_similarity.append(similarity) mean = np.mean(keyword_similarity) if mean >= float(self.config['action_identifier'] ['similarity_threshold']): embedding_verbs.append((str(token), mean)) statistic_similarity.append(mean) verbs.append(token) ground_truth_set = {str(v) for v in ground_truth_verbs} print("Ground truth set: ", ground_truth_set) embedding_verbs_set = {str(v[0]) for v in embedding_verbs} print("Embedding set: ", embedding_verbs_set) true_positive = embedding_verbs_set.intersection( ground_truth_set) print("True positive: ", true_positive) false_positive = embedding_verbs_set.difference( ground_truth_set) print("False positive: ", false_positive) false_negative = ground_truth_set.difference( embedding_verbs_set.intersection(ground_truth_set)) print("False negative: ", false_negative) # false_negative # true_positive = [e[0] in ground_truth_verbs for e in embedding_verbs] # true_positive = np.count_nonzero(true_positive) # # false_positive = [e[0] not in ground_truth_verbs for e in embedding_verbs] # false_positive = np.count_nonzero(false_positive) # # true_negative = [] # false_negative = np.count_nonzero(true_negative) # # false_negative = [e not in embedding_verbs for e in ground_truth_verbs] # false_negative = np.count_nonzero(false_negative) true_positive = len(true_positive) false_positive = len(false_positive) false_negative = len(false_negative) sentence_entry = (token, tag, 
self.word_embedding.get_word_vector(token), keyword_similarity, mean) utils.write_log( self.config, "\n >EMBEDDING VERBS: {}".format(embedding_verbs)) # Text statistics [true positive, false negative, precision, recall, f-score] try: precision = true_positive / (true_positive + false_positive) except ZeroDivisionError: precision = 0.0 try: recall = true_positive / (true_positive + false_negative) except ZeroDivisionError: recall = 0.0 try: f_score = 2 * (recall * precision) / (recall + precision) except ZeroDivisionError: f_score = 0.0 utils.write_log( self.config, "\n >TP: {} FP: {} FN: {} Precision: {} Recall: {} F-Score: {}" .format(true_positive, false_positive, false_negative, precision, recall, f_score)) statistic_list.append([ true_positive, false_positive, false_negative, precision, recall, f_score ]) count += 1 print("Calculating statistics ...") statistic_mean = np.mean(statistic_list, axis=0) statistic_std = np.std(statistic_list, axis=0) utils.write_log( self.config, "\n=======================================================================\n" ) utils.write_log( self.config, "RESULTS (Elapsed time: {:.4f} seconds)".format(time.time() - start_time)) utils.write_log(self.config, "\n Total of examples: {}".format(count)) utils.write_log(self.config, "\n Total of sentences: {} - Mean per example: {:.4f} - Ground-truth sentences with zero verbs: {} ({:.4f} %)".format(sentences_total, \ sentences_total / count, ground_truth_count, ground_truth_count / sentences_total)) utils.write_log( self.config, "\n Mean True Positive: {:.4f} - Std: {:.4f}".format( statistic_mean[0], statistic_std[0])) utils.write_log( self.config, "\n Mean False Positive: {:.4f} - Std: {:.4f}".format( statistic_mean[1], statistic_std[1])) utils.write_log( self.config, "\n Mean False Negative: {:.4f} - Std: {:.4f}".format( statistic_mean[2], statistic_std[2])) utils.write_log( self.config, "\n Mean Similarity: {:.4f} - Std: {:.4f}".format( np.mean(statistic_similarity), np.std(statistic_similarity))) utils.write_log( self.config, "\n Mean Precision: {:.4f} - Recall: {:.4f} - F-Score: {:.4f}". format(statistic_mean[3], statistic_mean[4], statistic_mean[5])) # flatten = lambda l: [item for sublist in l for item in sublist] # # verbs = flatten(verbs) verbs = [str(v) for v in verbs] import pandas as pd df = pd.DataFrame(verbs)[0].value_counts().to_csv( self.config['log_file'] + "-dataframe")
def create_fvect(self, createfrom=None): """ adds features to feature vector, this function does not add labels, that is up to classes that derive from the CorrectionFeatures class @params list createfrom - a feature vector list to base this feature vector on (append to createfrom) """ if createfrom: fvect = createfrom else: fvect = [] corr = self.instance.correction error = self.instance.error #extract data needed for features subj = self.sentence.get_subject_token()[0] left = self.sentence.get_token_left(error.first().tid) right = self.sentence.get_token_right(error.last().tid) left2 = self.sentence.get_token_left(left.tid) left3 = self.sentence.get_token_left(left2.tid) left4 = self.sentence.get_token_left(left3.tid) right2 = self.sentence.get_token_right(right.tid) right3 = self.sentence.get_token_right(right2.tid) right4 = self.sentence.get_token_right(right3.tid) leftnoun = closest_noun(error.first(), self.sentence, True) rightnoun = closest_noun(error.last(), self.sentence, False) gov_tuple = self.sentence.get_gov(error.head().tid) gov_token = self.sentence.get_token(gov_tuple[1]) governee_list = self.sentence.get_governees(error.head().tid) governee_tuple = governee_list[0] governee_token = self.sentence.get_token(governee_tuple[1]) prevphrase = prev_vphrase(error, self.sentence) ladv = time_adverb(error.first(), self.sentence, True) radv = time_adverb(error.last(), self.sentence, False) governee_rels = [x[0] + "governeerel" for x in governee_list] governees = [self.sentence.get_token(x[1]).abbv_to_word() + "governee" for x in governee_list] governeespos = [self.sentence.get_token(x[1]).pos + "governee" for x in governee_list] det = self.sentence.get_det(subj.tid) vnet_classes = verbnet.classids(error.head().lemma) if not vnet_classes: vnet_class = [] else: vnet_class = ["".join([x for x in classes if str.isalpha(x)]) for classes in vnet_classes] vnet_class = [x + "class" for x in vnet_class] if prevphrase: prevhead = prevphrase.head() c = verbnet.classids(prevhead.lemma) if not c: prevclass = None else: prevclass = c[0] prevclass = "".join([x for x in prevclass if str.isalpha(x)]) prevaspect = get_aspect(prevphrase) else: prevhead = None prevclass = None prevaspect = None fvect.append(error.head().abbv_to_word() + "self") # fvect.extend(vnet_class) if prevhead: fvect.append(prevhead.abbv_to_word() + "prevword") fvect.append(prevhead.pos + "prevpos") # if prevclass: # fvect.append(prevclass + "prevclass") if prevaspect: fvect.append(prevaspect + "prevaspect") # fvect.append(right2.abbv_to_word()) # fvect.append(left2.abbv_to_word()) # fvect.append(right2.pos) # fvect.append(left2.pos) # fvect.append(right3.word + "right") # fvect.append(left3.word + "left") # fvect.append(right3.pos + "right") # fvect.append(left3.pos + "left") # # fvect.append(right4.word + "right") # fvect.append(left4.word + "left") # fvect.append(right4.pos + "right") # fvect.append(left4.pos + "left") fvect.append(right.abbv_to_word() + "right") fvect.append(right.pos + "right") fvect.append(left.abbv_to_word() + "left") fvect.append(left.pos + "left") fvect.append(subj.pos + "subj") fvect.append(subj.abbv_to_word() + "subjlem") fvect.append(str(subj.noun_person()) + "subj") fvect.append(str(subj.singular_noun()) + "subj") # fvect.append(det.word + "det") fvect.append(str(self.sentence.ispassive()) + "passive") # if leftnoun.isvalid(): # fvect.append(str(leftnoun.singular_noun()) + "leftn") # fvect.append(str(leftnoun.noun_person()) + "leftn") # fvect.append(leftnoun.pos + "leftn") # 
fvect.append(leftnoun.abbv_to_word() + "leftn") # if rightnoun.isvalid(): # fvect.append(str(rightnoun.noun_person()) + "rightn") # fvect.append(str(rightnoun.singular_noun()) + "rightn") # fvect.append(rightnoun.pos + "rightn") # fvect.append(rightnoun.abbv_to_word() + "rightn") # fvect.extend(governee_rels) # fvect.extend(governees) # fvect.extend(governeespos) fvect.append(gov_token.word + "gov") fvect.append(gov_token.pos + "gov") fvect.append(gov_tuple[0] + "govrel") fvect.append(governee_token.word + "governee") fvect.append(governee_token.pos + "governee") fvect.append(governee_tuple[0] + "governeerel") if ladv.isvalid(): fvect.append(ladv.word + "adverb") if radv.isvalid(): fvect.append(radv.word + "adverb") return fvect
def extractFeatures(token, sentence, filename, syntacticFeatures): rowOfFeats = [] verb = token['word'] idVerb = token['id'] Features = Verb(token['word'], token['lemma'], token['pos']) Features.set_metadata(sentence['id'], idVerb, filename) if token.has_key('attribution'): role = token['role'] if role == 'cue': Features.set_label('Y') else: Features.set_label('N') else: Features.set_label('N') if idVerb > 0: prevToken = sentence['tokens'][idVerb - 1] else: prevToken = None if idVerb < len(sentence['tokens']) - 1: nexToken = sentence['tokens'][idVerb + 1] else: nexToken = None if prevToken != None: Features.set_previousToken(prevToken['word'], prevToken['lemma'], prevToken['pos']) if prevToken['word'] == ':': Features.set_colonAdjacent() elif prevToken['word'] == '``': Features.set_quoteAdjacentInside() elif prevToken['word'] == "''": Features.set_quoteAdjacentOutside() elif prevToken['word'] == ',': beforeComma = sentence['tokens'][idVerb - 2] if beforeComma['word'] == '``': Features.set_quoteAdjacentInside() elif beforeComma['word'] == "''": Features.set_quoteAdjacentOutside() if nexToken != None: Features.set_nextToken(nexToken['word'], nexToken['lemma'], nexToken['pos']) if nexToken['word'] == ':': Features.set_colonAdjacent() elif nexToken['word'] == '``': Features.set_quoteAdjacentOutside() elif nexToken['word'] == "''": Features.set_quoteAdjacentInside() elif nexToken['word'] == ',': try: afterComma = sentence['tokens'][idVerb + 2] if afterComma['word'] == '``': Features.set_quoteAdjacentOutside() elif afterComma['word'] == "''": Features.set_quoteAdjacentInside() except: print 'out of range' else: Features.set_nextToken('NONE!!', 'NONE!!', 'NONE!!') Features.set_verbNet(";!".join(vn.classids(token['lemma']))) Features.set_distances(token['id'], len(sentence['tokens']) - (token['id'] + 1)) quoteMarkers = findQuoteMarkers(sentence) FEATinQuotes = 'False' for (beg, end) in quoteMarkers: if idVerb > beg and idVerb < end: Features.set_insideQuotes() (depth, parentNode, parentSiblings) = syntacticFeatures Features.set_syntactic(depth, parentNode, ";!".join(parentSiblings)) Features.makeList() rowOfFeats = Features.getList() return rowOfFeats
from nltk.corpus import verbnet as vn
from nltk.corpus import framenet as fn
from nltk.corpus import propbank as pb

word1 = "melt"
word2 = "oxidize"
input = word1

vn_results = vn.classids(lemma=input)
if not vn_results:
    print(input + ' not in verbnet.')
else:
    print('verbnet:')
    for ele in vn_results:
        print(ele)
print("")

fn_results = fn.frames_by_lemma(input)
if not fn_results:
    print(input + ' not in framenet.')
else:
    print('framenet:')
    for ele in fn_results:
        print(ele)
print("")

pb_results = []
try:
generalThing = datum.thing
verbnetRoot = generalThing.get("verbnet")
wordnetRoot = generalThing.find("wordnet")
class_ = verbnetRoot.get("class")
verbclassID = verbnetRoot.get("verb class id")
verbroot = verbnetRoot.get("verbroot")
example = verbnetRoot.get("example")
semantics = verbnetRoot.get("semantics")
syntax = verbnetRoot.get("syntax")
verbclass_ = verbnetRoot.get("verb class")
description = verbnetRoot.get("description")
semanticsArguments = verbnetRoot.get("semantics argument")
syntaxArguments = verbnetRoot.get("syntax argument")
syntaxFramesKatum = verbnetRoot.get("syntactic argument")
semanticsFramesKatum = verbnetRoot.get("semantics predicate")
predicateValue = verbnetRoot.get("predicate value")
themroles = verbnetRoot.get("thematic role")
roleType = verbnetRoot.get("role")

listOfAllLemmas = vn.lemmas()
uniqueClassIDs = []
for lemma in listOfAllLemmas:
    uniqueClassIDs.extend(vn.classids(lemma))
uniqueClassIDs = list(set(uniqueClassIDs))
processClassID(uniqueClassIDs)

for v in vn.lemmas():
    verbRootInstance = verbroot.get(v)
    for verbclass in vn.classids(v):
        verbRootInstance._is(classToKatumDict[verbclass], False)

generalThing.save('wordnet-verbnet.datum')
def findpassives(sent): # Feature extraction code here. """Given a sentence, tag it and print if we think it's a passive-voice formation.""" lancaster_stemmer = LancasterStemmer() tagged = tag_sentence(sent) tags = map( lambda(tup): tup[1], tagged) ansi=[] # print sent if passivep(tags): #file.write(oneline(sent)) blob=TextBlob(oneline(sent)) flag =True prevnoun="" negative=0 number=0 verb="" nextnoun="" for word, pos in blob.tags: #print word,pos if (pos=='NN' or pos =='NNP') and flag== True: prevnoun= word if (pos=='RB'): negative=1 if (pos=='CD'): number= word if (pos=='VBG' or pos=='RB' or pos=='VBN'or pos=='VB') and flag==True: verb=word flag= False if (pos=='NN' or pos=='NNP') and flag== False: nextnoun=word break lancaster_stemmer.stem(verb) #print verb if verb=="": ansi.append([0]) ansi.append(negative) ansi.append(number) elif len(verbnet.classids(verb))==0: ans= prevnoun+" "+verb+" "+nextnoun+" " ansi.append([0]) ansi.append(negative) ansi.append(number) else: #ans1=verbnet.lemmas()[0:3620].index(verb) temp=verbnet.classids(verb) ans1 = [verbnet.classids().index(i) for i in temp] ansi.append(ans1) ansi.append(negative) ansi.append(number) #fileans.write(ans+'\n') result.append(ansi) if(len(ansi)==0): ansi=[[0],0,0] print ansi return ansi else: #file1.write(oneline(sent)) blob=TextBlob(oneline(sent)) flag1 =True prevnoun1="" verb1="" nextnoun1="" negative=0 number=0 for word, pos in blob.tags: #print word,pos if (pos=='NN' or pos =='NNP') and flag1== True: prevnoun1= word if (pos=='RB'): negative=1 if (pos=='CD'): number= word if (pos=='VBG' or pos=='RB' or pos=='VBN'or pos=='VB') and flag1==True: verb1=word flag1= False if (pos=='NN' or pos=='NNP') and flag1== False: nextnoun1=word break lancaster_stemmer.stem(verb1) #print verb1 if verb1=="": ansi.append([0]) ansi.append(negative) ansi.append(number) elif len(verbnet.classids(verb1))==0: ans= prevnoun1+" "+verb1+" "+nextnoun1+" " ansi.append([0]) ansi.append(negative) ansi.append(number) else: #ans1=ans1=verbnet.lemmas()[0:3620].index(verb1) temp=verbnet.classids(verb1) ans1 = [verbnet.classids().index(i) for i in temp] ansi.append(ans1) ansi.append(negative) ansi.append(number) if(len(ansi)==0): ansi=[[0],0,0] print ansi return ansi