Code Example #1
File: verbnetguess.py Project: aymara/knowledgesrl
    def test(self):
        skips = [
            'Eggs and cream mix well together.',
            'The eggs and the cream mixed together.'
        ]
        warnings.simplefilter("ignore", ResourceWarning)
        classid_list = sorted(verbnet.classids(), key=lambda c: LooseVersion(classid_to_number(c)))

        i = 0
        for classid in classid_list:
            for vn_frame in verbnet.frames(classid):
                text = vn_frame['frame'].find('EXAMPLES/EXAMPLE').text
                with self.subTest(i=i, msg='{}: {}'.format(classid, text)):
                    if text in skips:
                        continue
                    syntax = vn_frame['frame'].find('SYNTAX')
                    wanted_primary = strip_roles(
                        vn_frame['frame'].find('DESCRIPTION').get('primary'))
                    converted_primary = ' '.join(
                        [phrase for phrase, role in syntax_to_primary(syntax)])

                    self.assertEqual(wanted_primary, converted_primary)
                i += 1

        print('Total : {}'.format(i))
Code Example #2
import numpy
from nltk.corpus import verbnet


def one_hot(indices):
    # Build a one-hot vector with one slot per VerbNet class id; the incoming
    # indices are treated as 1-based positions into verbnet.classids().
    print(indices)
    array = numpy.zeros(len(verbnet.classids()))
    for i in indices:
        # print(i)
        array[i - 1] = 1
    return array
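
A minimal usage sketch (not part of the original snippet), assuming the NLTK VerbNet corpus has been downloaded and that the indices passed in are 1-based positions into verbnet.classids():

vec = one_hot([1, 5, 10])      # 1, 5 and 10 are illustrative indices only
print(vec.shape)               # one slot per VerbNet class id
print(vec[[0, 4, 9]])          # [1. 1. 1.]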
Code Example #3
def get_verbs(verb):
    all_set_verbs = set()
    all_verbs = verbnet.classids(lemma=verb)
    for v in all_verbs:
        splitted = v.split("-")
        all_set_verbs.add(splitted[0])
    return all_set_verbs
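
A hedged usage example, assuming the NLTK VerbNet corpus is installed; the returned set holds the class-name prefixes of the matching class ids rather than the ids themselves:

# 'buy' is listed under a class id such as 'get-13.5.1', so the printed set
# contains prefixes like 'get'.
print(get_verbs('buy'))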
Code Example #4
def genVerbnetFeatures(word, pos, features):
    # Collect the selectional-restriction types from every VerbNet class the
    # verb belongs to and append them as "THEMETYPE_<type>" features.
    if pos != 'V':
        return
    vids = vn.classids(word)
    for vid in vids:
        v = vn.vnclass(vid)
        types = [t.attrib['type'] for t in v.findall('THEMROLES/THEMROLE/SELRESTRS/SELRESTR')]
        for seltype in types:
            fstr = "THEMETYPE_" + seltype
            features.append(fstr)
Code Example #5
File: qa_utils.py Project: mjhosseini/entgraph_eval
def is_transitive(lemma):
    try:
        cids = verbnet.classids(lemma)
        frames = verbnet.frames(verbnet.vnclass(cids[0]))
        ret = False
        # for frame in frames:
        #     print "primary:", frame['description']['primary']
        #     ret = ret or "Transitive" in frame['description']['primary']

        ret = "Transitive" in frames[0]['description']['primary']
        return ret
    except:
        return False
Code Example #6
File: nlp.py Project: vijayendra-g/namebot
def get_verb_lemmas(verbs):
    """Return verbnet lemmas for the given verbs.

    These verbs are stemmed before lookup to prevent empty results.

    :param verbs (list) - The list of verbs to reference.
    :rtype lemmas (list) - A list of lemmas for all verbs;
                           these are not separated by verb.
    """
    lemmas = []
    for verb in normalization.stem_words(verbs):
        _lemmas = verbnet.classids(lemma=verb)
        lemmas += [l.split('-')[0] for l in _lemmas]
    return lemmas
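
The stemming mentioned in the docstring matters because verbnet.classids() only matches bare member lemmas; a small check, independent of namebot, illustrates this (outputs are indicative only):

from nltk.corpus import verbnet

print(verbnet.classids(lemma='running'))  # usually [] - inflected form
print(verbnet.classids(lemma='run'))      # e.g. ['run-51.3.2', ...]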
Code Example #7
    def test_remove_before_verb(self):
        """
        Whenever we detect that the sentence starts with a verb, we'll remove it from
        the VerbNet syntax
        """
        from nltk.corpus import verbnet

        buy_first_classid = verbnet.classids('buy')[0]
        buy_first_syntax = verbnet.vnclass(buy_first_classid).find('FRAMES/FRAME/SYNTAX')

        altered_syntax = remove_before_v(buy_first_syntax)
        wanted_syntax = ET.fromstring("""<SYNTAX><VERB /><NP value="Theme" /></SYNTAX>""")

        self.assertEqual(syntax_to_str(altered_syntax), syntax_to_str(wanted_syntax))
Code Example #8
File: getframes.py Project: AllanRamsay/COMP34411
def getFrames(verb, frames):
    for classid in verbnet.classids(verb):
        vnclass = verbnet.pprint(verbnet.vnclass(classid))
        members = re.compile("\s+").split(membersPattern.search(vnclass).group("members"))
        for i in framePattern.finditer(vnclass):
            frame = mergeintrans(mergeNPs("%s"%(i.group("frame"))))
            frame = scomp.sub("SCOMP", frame)
            frame = german.sub("VERB", frame)
            frame = shifted.sub("NP VERB NP", frame)
            frame = finalPPs.sub("", frame)
            if frame in frames:
                frames[frame] += members
            else:
                frames[frame] = members
    return frames
Code Example #9
def get_transitivity(verb):
    """ 
    Take a verb lemma as input.
    Return transitivity score and VerbNet (VN) frames if available. 
    
    The returned tuple is constructed in the following way:
        -the first element is the transitivity score, where:
            -1 equals transitive
            -0 equals intransitive (or at least according to VN)
        -the second element is a list of tuples, each of which consists of:
            -first, the VN class_id of a given meaning of a verb
            -second, the corresponding frame itself
            
    Regardless of the length of the transitive frames list,
    the transitivity score remains the same.
    """

    class_ids = vn.classids(verb)

    print(class_ids)

    # Define a list containing frames with transitive meanings of the given verb.
    trans_frames = []
    for class_id in class_ids:
        frames = vn.frames(class_id)
        for frame in frames:
            print(frame["description"]["primary"])
            #print(frame['description']['secondary'])
            if frame["description"]["primary"] == "NP V NP":
                entry = class_id, frame
                trans_frames.append(entry)


#            elif "NP V NP" in frame["description"]["primary"]:
#                entry = class_id, frame
#                trans_frames.append(entry)
#            elif "Transitive" in frame["description"]["secondary"]:
#                entry = class_id, frame
#                trans_frames.append(entry)

    # If the trans_score is equal to one, the verb has a transitive meaning.
    if len(trans_frames) != 0:
        trans_score = 1

    else:
        trans_score = 0

    return trans_score, trans_frames
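
A hedged usage sketch of get_transitivity(); the exact class ids depend on the installed VerbNet data, and the score is 1 only when at least one "NP V NP" frame is found:

score, trans_frames = get_transitivity('eat')
print(score)  # expected to be 1 if 'eat' has an "NP V NP" frame in the data
for class_id, frame in trans_frames:
    print(class_id, frame["description"]["primary"])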
Code Example #10
 def predicate_generator2(readme):
     for words1 in '.!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~':
         readme = readme.replace(words1, '')
     words1 = word_tokenize(readme)
     words1 = [lem.lemmatize(word) for word in words1]
     # words1 = [ps.stem(word)for word in words1]
     source = [word.lower() for word in words1]
     verb1 = []
     for word, pos in nltk.pos_tag(source):
         if (pos == 'VB'):
             verb1.append(word)
     verbs1 = []
     for token in verb1:
         lemma = [lemma for lemma in vn.classids(token)]
         verbs1.append(lemma)
     return verbs1
Code Example #11
File: tasks.py Project: jorgeecardona/muse
    def run(self, query):
        " From tweets extract the keywords"

        words = []

        for tweet in query.tweets.all():
            words.extend(self.extract_words(tweet.text))

        # for answer in query.yahoo_answers.all():
        #     words.extend(self.extract_words(answer.content))
        #     words.extend(self.extract_words(answer.chosen_answer))

        # Turn to downcase.
        new_words = []
        for w in words:

            if not valid_word.match(w):
                continue

            if w.endswith('ing'):
                continue

            if w in query.theme.split(' '):
                continue

            if w in query.text.split(' '):
                continue

            if len(verbnet.classids(w)) > 0:
                continue

            if w not in stopset:
                try:
                    new_words.append(w.decode('ascii'))
                except:
                    pass

        # Sort list
        new_words.sort()
        keywords = ['%s:%d' % (k, len(list(g))) for k, g in groupby(new_words)]

        # Save the keywords in the query
        query.keywords = ' '.join(keywords)
        query.save()

        return query
Code Example #12
def process(text: str, params: dict) -> OrderedDict:
    """Process provided text"""

    # set JSON-NLP
    j: OrderedDict = base_document()
    t: OrderedDict = base_nlp_json()
    t['DC.source'] = 'NLTK {}'.format(__version__)
    t['documents'].append(j)
    j['text'] = text

    # collect parsers
    lemmatizer = get_lemmatizer()
    tokenizer = get_tokenizer(params)
    sentence_tokenizer = get_sentence_tokenizer()
    stemmer = get_stemmer()
    parser = get_parser()
    language = Counter()

    # tokenize and tag
    tokens: List[str] = tokenizer.tokenize(text)
    tokens_tagged: List[tuple] = nltk.pos_tag(tokens)
    conll_tagged = tree2conlltags(ne_chunk(tokens_tagged))

    offset_list: List[Tuple[int, int]] = list(tokenizer.span_tokenize(text))

    token_list: List[dict] = []
    for token_idx, token_tuple in enumerate(tokens_tagged):
        token = token_tuple[0]
        pos_tag = token_tuple[1]
        wordnet_pos = get_wordnet_pos(pos_tag)
        entity_tag = conll_tagged[token_idx][2].split("-")

        if wordnet_pos != '':
            synsets = wordnet.synsets(token, pos=wordnet_pos)
        else:
            synsets = wordnet.synsets(token)
        sys_id = 0
        sys_list = []
        for syn in synsets:
            s_hypo = set([x.lemma_names()[0] for x in syn.hyponyms()])
            s_hyper = set([x.lemma_names()[0] for x in syn.hypernyms()])
            s_examples = [x for x in syn.examples()]

            s = {
                'wordnet_id': syn.name(),
                'id': sys_id,
                'synonym': syn.lemma_names()[1:],
                'hyponym': list(s_hypo),
                'hypernym': list(s_hyper),
                'examples': s_examples,
                'definition': syn.definition()
            }

            if len(s['synonym']) == 0: s.pop('synonym')
            if len(s['hyponym']) == 0: s.pop('hyponym')
            if len(s['hypernym']) == 0: s.pop('hypernym')
            if len(s['examples']) == 0: s.pop('examples')
            if len(s['definition']) == 0: s.pop('definition')

            if s:
                sys_list.append(s)
            sys_id += 1

        verb_list = []
        vn_classids = vn.classids(token)
        for classid in vn_classids:
            verb_list.append({
                'class_id': classid,
                'frames': vn.frames(classid)
            })

        t = {
            'id':
            token_idx,
            'text':
            token,
            'lemma':
            lemmatizer(token, wordnet_pos)
            if wordnet_pos else lemmatizer(token),
            'stem':
            stemmer(token),
            'pos':
            pos_tag,
            'entity':
            entity_tag[1] if len(entity_tag) > 1 else "",
            'entity_iob':
            entity_tag[0],
            'overt':
            True,
            'characterOffsetBegin':
            offset_list[token_idx][0],
            'characterOffsetEnd':
            offset_list[token_idx][1],
            'synsets':
            sys_list,
            'verbnet':
            verb_list
        }
        if len(t['synsets']) == 0: t.pop('synsets')
        if len(t['verbnet']) == 0: t.pop('verbnet')
        token_list.append(t)

    j['tokenList'] = token_list

    # sentence and dependency parsing
    sent_list = []
    token_from = 0
    sentence_tokens = sentence_tokenizer.sentences_from_tokens(tokens)
    sentence_texts = sentence_tokenizer.sentences_from_text(text)

    # check whether MALT parser is loaded! DC
    if parser:
        for sent_idx, sent in enumerate(zip(sentence_tokens, sentence_texts)):
            # Detecting language of each sentence
            la = pycountry.languages.get(alpha_2=detect(sent[1]))
            token_to = token_from + len(sent[0]) - 1
            dg = parser.parse_one(sent[1].split())
            s = {
                'id': sent_idx,
                'text': sent[1],
                'tokenFrom': token_from,
                'tokenTo': token_to,
                'tokens': list(range(token_from, token_to))
            }

            for token in dg.nodes:
                head = dg.nodes[token]['head']
                head_word = [
                    dg.nodes[i]['word'] for i in dg.nodes
                    if dg.nodes[i]['address'] == head
                ]
                if len(head_word) > 0:
                    j['dependenciesBasic'].append({
                        'governor':
                        head_word[0],
                        'dependent':
                        dg.nodes[token]['word'],
                        'type':
                        dg.nodes[token]['rel']
                    })
                else:
                    j['dependenciesBasic'].append({
                        'governor':
                        'null',
                        'dependent':
                        dg.nodes[token]['word'],
                        'type':
                        dg.nodes[token]['rel']
                    })
                if j['dependenciesBasic'][-1]['governor'] == 'null' or j['dependenciesBasic'][-1]['dependent'] == 'null' \
                        or j['dependenciesBasic'][-1]['type'] == 'null':
                    j['dependenciesBasic'].pop()
            token_from = token_to
            language[la.name] += 1
            sent_list.append(s)
        j['sentences'] = sent_list

    if params['language']:
        t['DC.language'] = params['language']
    else:
        # only if language has some elements can we check for max!!! DC
        if len(token_list) > 4 and language:
            t['DC.language'] = max(language, key=language.get)
        else:
            t['DC.language'] = ''

    # TODO:
    # 1. Schema: clauses, coreferences, constituents, expressions, paragraphs
    # 2. fields: token: sentiment, embeddings; sentence: sentiment, complex, type, embeddings

    return j
Code Example #13
 def getClasses(self, verb):
     return vn.classids(verb)
Code Example #14
File: nlputils.py Project: ping543f/KGen
    def get_verbnet_args(verb, verbose=False):
        lemmatizer = WordNetLemmatizer()
        lemmatized_verb = lemmatizer.lemmatize(verb.lower(), 'v')

        classids = verbnet.classids(lemma=lemmatized_verb)
        if verbose:
            print('Class IDs for "{}": {}'.format(lemmatized_verb, classids))

        if len(classids) < 1:
            if verbose:
                print(
                    'No entry found on verbnet for "{}". Attempting WordNet synsets!'
                    .format(lemmatized_verb))

            wn_synsets = wordnet.synsets(lemmatized_verb)
            for synset in wn_synsets:
                if len(synset.lemmas()) < 1:
                    continue

                candidate = str(synset.lemmas()[0].name())
                classids = verbnet.classids(lemma=candidate)
                if verbose:
                    print('Class IDs for "{}": {}'.format(candidate, classids))

                if len(classids) > 0:
                    break

            if len(classids) < 1:
                if verbose:
                    print(
                        'Unable to find entries on verbnet for any of the synsets... Will go recursive now (which is not a good thing!)'
                    )

                for synset in wn_synsets:
                    if len(synset.lemmas()) < 1:
                        continue

                    candidate = str(synset.hypernyms()[0].lemmas()[0].name())
                    return NLPUtils.get_verbnet_args(candidate,
                                                     verbose=verbose)

                if verbose:
                    print('Exhausted attempts... returning an empty list.')
                return []

        for id in classids:
            class_number = id[id.find('-') + 1:]
            try:
                v = verbnet.vnclass(class_number)
                roles = [
                    t.attrib['type'] for t in v.findall('THEMROLES/THEMROLE')
                ]
                pass
            except ValueError:
                print('VN class number not found: {}'.format(class_number))

                # Will handle these both below
                v = [None]
                roles = []
                pass

            while len(roles) < 1 and len(v) > 0:
                fallback_class_number = class_number[:class_number.rfind('-')]
                if verbose:
                    print('No roles found for class {}, falling back to {}.'.
                          format(class_number, fallback_class_number))
                class_number = fallback_class_number

                try:
                    v = verbnet.vnclass(class_number)
                    roles = [
                        t.attrib['type']
                        for t in v.findall('THEMROLES/THEMROLE')
                    ]
                    pass
                except ValueError:
                    # Go on with the loop
                    v = [None]
                    roles = []
                    pass

            if len(roles) > 0:
                if verbose:
                    print('Roles found: {}'.format(roles))

                return roles

        return None
Code Example #15
def prim_fram(input):
    s = parse(input, relations=True, lemmata=True)
    # print s
    l = parse(input).split()[0]
    m = nltk.pos_tag(input.split(" "))
    # print m
    oy = []
    adj = []
    nph = []
    pph = []
    vbp = []
    adv = []
    exc = []
    for i in range(len(l)):
        tup = (l[i][2],l[i][0])
        oy.append(tup)
    # print oy
    for i in range(len(m)):
        if m[i][1] == "JJ":
            adj.append((m[i][0], i + 1))
    j=0
    x=0
    for i in range(len(oy)-1):
        k = i
        c = i
        np = ""
        vp = ""
        if oy[i][0]=="B-PP":
            pph.append((oy[i][1],i+1))
        if oy[i][0] == "B-ADVP":
            adv.append((oy[i][1], i + 1))
        if oy[i][1] in list:
            # print oy[i][1]
            exc.append((oy[i][1], i + 1))
        if k >=j:

            # Bound check first: compare k to len(oy), not to a range object.
            while k < len(oy) and (oy[k][0] == "B-NP" or oy[k][0] == "I-NP"):
                np = np + oy[k][1]+" "
                k = k+1
            j = k
            if np!='':
             nph.append((np,j))
        if c >= x:

            while k < len(oy) and (oy[k][0] == "B-VP" or oy[k][0] == "I-VP"):
                vp = vp + oy[k][1] + " "
                k = k + 1
            x = k
            if vp != '':
                vbp.append((vp, j))

    # print vbp
    sen = nph+pph+vbp+adv+exc+adj
    # print sen
    sen1 = sorted(sen, key=lambda x: x[1])
    # print sen1
    senf = []
    for i in range(len(sen1)-1):
        u = sen1[i + 1]
        if sen1[i][0] != u[0]:
            senf.append(sen1[i])
    senf.append(sen1[-1])
    # print senf
    frame = []
    for z in range(len(senf)):
        if (senf[z] in nph):
            if(z>=2 and "ing" in senf[z][0]):
                frame.append((senf[z][0],"ING"))
                continue
            frame.append((senf[z][0], "NP"))
            continue
        if senf[z] in pph:
            if (z>2 and "ing" in senf[z][0]):
                frame.append((senf[z][0], "ING"))
                continue
            frame.append((senf[z][0], "PP"))
            continue
        if senf[z] in exc:
            frame.append((senf[z][0], senf[z][0]))
            continue
        if senf[z] in vbp:
            if (z>=2 and "ing" in senf[z][0]):
                frame.append((senf[z][0], "ING"))
                continue
            frame.append((senf[z][0], "VP"))
            continue
        if senf[z] in adv:
            if (z>2 and "ing" in senf[z][0]):
                frame.append((senf[z][0], senf[z][0]))
                continue
            frame.append((senf[z][0], "ADVP"))
            continue

        if senf[z] in adj:
            if (z>2 and "ing" in senf[z][0]):
                frame.append((senf[z][0], senf[z][0]))
                continue
            frame.append((senf[z][0], "ADJ"))
            continue
    vbf = []

    ps = PorterStemmer()
    for i in vbp:
        h = vb.classids(ps.stem(i[0].lower().strip()))
        # print h
        if h != []:
             vbf.append(ps.stem(i[0].strip()))

    return vbf,frame
Code Example #16
import os
import re
from nltk.corpus import verbnet as vbnet

thematic_roles = []
selres = []
semantics = []
themroles_dict = {}
semantics_dict = {}
selres_dict = {}
for file in os.listdir("D:/Downloads/new_vn"):
    if file.endswith(".xml"):
        # print(file.strip(".xml").split("-")[0])
        # s=str(vbnet.pprint(file.strip(".xml").split("-")[0]))
        # str.strip(".xml") removes characters, not the suffix; use splitext instead.
        l = vbnet.classids(os.path.splitext(file)[0].split("-")[0])
        if l!=[]:
            for i in l:
                t=2
                s = str(vbnet.pprint(i))
                # print(s)
                subclasses = s.split("Subclasses:")[1].split("Members")[0].strip()
                theme = s.split("Thematic roles:")[1].split("Frames")[0]
                seman = s.split("Semantics:")[1:]
                for j in seman:
                    k = j.split("\n")
                    for w in k:
                        if '*' in w:
                            if w.strip("        * ").split("(")[0] in semantics_dict:
                                semantics.append(w.strip("        * ").split("(")[0].strip())
                                continue
                            else:
Code Example #17
        synsets = wordnet.all_synsets()
        supersenses = \
              sorted(list(set(['supersense=' + x.lexname() for x in synsets])))

        # Framenet
        lem2frame = {}
        for lm in framenet.lus():
            for lemma in lm['lexemes']:
                (lem2frame[lemma['name'] + '.' + \
                        framenet_posdict[lemma['POS']]]) = lm['frame']['name']
        frame_names = sorted(['frame=' + x.name for x in framenet.frames()])
        type_embedder['lem2frame'] = lem2frame

        # Verbnet classids
        verbnet_classids = \
                     sorted(['classid=' + vcid for vcid in verbnet.classids()])

        type_hand_features = (verbnet_classids + supersenses + frame_names +
                              lcs_feats + conc_cols)
        input_size += len(type_hand_features)
        for f in type_hand_features:
            type_embedder['embedder'][f] = 0

    # Write all the feature names to a text file
    if args.type and args.token:
        with open('../../data/list_of_all_hand_eng_features.txt', 'w') as f:
            for feature in token_hand_features + type_hand_features:
                f.write(feature + "\n")

    # Preload embedders for bert, elmo, glove if necessary
Code Example #18
File: __init__.py Project: dcavar/NLTK-JSON-NLP
    def process(text='',
                lang='en',
                coreferences=False,
                constituents=False,
                dependencies=False,
                expressions=False,
                **kwargs) -> OrderedDict:
        # build nlp-json
        j: OrderedDict = get_base()
        j['meta']['DC.language'] = lang
        d: OrderedDict = get_base_document(1)
        #j['documents'][d['id']] = d
        j['documents'].append(d)
        d['meta']['DC.source'] = 'NLTK {}'.format(nltk_version)
        j['meta']['DC.language'] = lang
        d['text'] = text

        # collect parsers
        lemmatizer = get_lemmatizer()
        stemmer = get_stemmer()

        # tokenization and pos
        words = []
        for sent in segment(text):
            for token in sent:
                words.append(token.value)

        # create the token list
        t_id = 1
        for word, xpos in pos_tag(words):
            wordnet_pos = get_wordnet_pos(xpos)
            lemma = lemmatizer(word, pos=wordnet_pos)

            # start the token
            t = {'id': t_id, 'text': word, 'stem': stemmer(word)}
            #d['tokenList'][t['id']] = t
            d['tokenList'].append(t)
            t_id += 1

            # wordnet
            try:
                synsets = wordnet.synsets(lemma, pos=wordnet_pos)
                senses = {}
                for s in synsets:
                    hyponyms = [
                        y for x in s.hyponyms() for y in x.lemma_names()
                    ]
                    hypernyms = [
                        y for x in s.hypernyms() for y in x.lemma_names()
                    ]
                    synonyms = s.lemma_names()[1:]
                    examples = s.examples()
                    sense = {
                        'wordnetId': s.name(),
                        'definition': s.definition()
                    }
                    if synonyms:
                        sense['synonyms'] = synonyms
                    if hypernyms:
                        sense['hypernyms'] = hypernyms
                    if hyponyms:
                        sense['hyponyms'] = hyponyms
                    if examples:
                        sense['examples'] = examples

                    antonyms = []
                    for l in s.lemmas():
                        if l.antonyms():
                            for a in l.antonyms():
                                antonyms.append(a.name())
                    if antonyms:
                        sense['antonyms'] = antonyms

                    senses[sense['wordnetId']] = sense

                if senses:
                    t['synsets'] = senses
            except:
                pass

            # verbnet
            try:
                verbs = dict((class_id, {
                    'classId': class_id,
                    'frames': vn.frames(class_id)
                }) for class_id in vn.classids(word))

                if verbs:
                    t['verbFrames'] = verbs
            except:
                pass

            # framenet
            try:
                frame_net = {}
                frames = invoke_frame(word)
                if frames is not None:
                    for fr in frames:
                        lu_temp = []
                        for lu in fn.lus(r'(?i)' + word.lower()):
                            fr_ = fn.frames_by_lemma(r'(?i)' + lu.name)
                            if len(fr_):
                                if fr_[0] == fr:
                                    lu_temp.append({
                                        'name': lu.name,
                                        'definition': lu.definition,
                                        'pos': lu.name.split('.')[1]
                                    })
                        frame_net[fr.ID] = {
                            'name': fr.name,
                            'frameId': fr.ID,
                            'definition': fr.definition,
                            # 'relations':fr.frameRelations,
                            'lu': lu_temp
                        }
                if frame_net:
                    t['frames'] = frame_net
            except:
                pass

        return remove_empty_fields(j)
Code Example #19
def analyze_constructs(examples, role_mapping, evaluation_sets, verbnet):
    annotated_sentences, lemma_in_vn = 0, 0
    n_correct_frames, n_frames = 0, 0
    n_correct_roles, n_roles = 0, 0
    n_classes_in_list, n_classes = 0, 0

    for lexie, lemma, sentence_text, gold_syntax in examples:
        d = sentence_text in [
            sentence for source, sentence in evaluation_sets['train']
        ]
        test_context = sentence_text in [
            sentence for source, sentence in evaluation_sets['test']
        ]

        debug(d, [])

        if d == test_context:
            print(d, test_context, sentence_text)
        assert d != test_context

        debug(d, [lexie, lemma, sentence_text])
        if test_context:
            annotated_sentences += 1

        # First possible error: lemma does not exist in VerbNet
        if not verbnet.classids(lemma):
            continue

        if test_context:
            lemma_in_vn += 1
            n_frames += 1

        considered_syntax = []
        for vn_frame in verbnet.frames_for_lemma(lemma):
            vn_syntax = vn_frame['frame'].find('SYNTAX')
            # If sentence starts with a verb, remove anything that's before
            # the verb in VerbNet
            if next(iter(gold_syntax)).tag == 'VERB':
                vn_syntax = remove_before_v(vn_syntax)
            considered_syntax.append((vn_frame['classid'], vn_syntax))

        # Use an OrderedDict for now to get the same behavior as
        # with the tuple list
        vn_syntax_matches = OrderedDict()
        for classid, vn_syntax in considered_syntax:
            if matches_verbnet_frame(gold_syntax, vn_syntax):
                if classid not in vn_syntax_matches:
                    vn_syntax_matches[classid] = []
                # check if vn_syntax is already in there?
                vn_syntax_matches[classid].append(vn_syntax)

        # Second possible error: syntactic pattern is not in VerbNet
        if not vn_syntax_matches:
            debug(d, ['   ', Fore.RED, syntax_to_str(gold_syntax), Fore.RESET])
            continue

        if test_context:
            n_correct_frames += 1
            n_classes += 1

        if lexie not in role_mapping:
            raise Exception('Missing lexie {} ({}) in role mapping.'.format(
                lexie, lemma))

        debug(d, [
            '   ', Fore.GREEN,
            syntax_to_str(gold_syntax), '->',
            syntax_to_str(
                map_gold_frame(classid, gold_syntax, role_mapping[lexie])),
            Fore.RESET
        ])

        for classid in vn_syntax_matches:
            debug(d, [
                '    ', classid, ' -> ',
                [
                    syntax_to_str(vn_syntax)
                    for vn_syntax in vn_syntax_matches[classid]
                ]
            ])

        class_matches = set(vn_syntax_matches.keys()) & set(
            role_mapping[lexie])
        if not class_matches:
            continue

        if test_context:
            n_classes_in_list += 1

        classid = next(iter(class_matches))
        vn_syntax = vn_syntax_matches[classid][0]

        if classid not in role_mapping[lexie]:
            continue

        for i, correct_syntax in enumerate(gold_syntax):
            # if this is a 'frame element', not a V or anything else
            if correct_syntax.tag in ['NP', 'S']:
                if role_mapping[lexie] == {}:
                    # missing sense
                    # TODO handle this explicitly using XML annotations
                    pass
                elif classid not in role_mapping[lexie]:
                    raise Exception('{} misses {} class'.format(
                        lexie, classid))
                elif correct_syntax.get(
                        'value') not in role_mapping[lexie][classid]:
                    raise Exception('{} misses {} mapping'.format(
                        lexie, correct_syntax.get('value')))

                if test_context:
                    n_roles += 1

                candidate_roles = set()
                candidate_roles.add(list(vn_syntax)[i].get('value'))

                if role_mapping[lexie][classid][correct_syntax.get(
                        'value')] in candidate_roles:
                    if test_context:
                        n_correct_roles += 1 / len(candidate_roles)

    print(annotated_sentences, n_frames, n_classes, n_roles)
    print('-                          {:.0%} of lemma tokens are here'.format(
        lemma_in_vn / annotated_sentences))
    print('- For these tokens,        {:.1%} of constructions are correct'.
          format(n_correct_frames / n_frames))
    print('- For these constructions, {:.1%} of classes are here'.format(
        n_classes_in_list / max(n_classes, 1)))
    print('- For these classes,       {:.1%} of roles are correct'.format(
        n_correct_roles / max(n_roles, 1)))
    print()
Code Example #20
def verbs_in_verbnet(verb):

    vn_results = vn.classids(lemma=verb)
    return 1 if vn_results else 0
Code Example #21
def print_if_passive(sent):
    """Given a sentence, tag it and print if we think it's a passive-voice
    formation."""
    lancaster_stemmer = LancasterStemmer()
    tagged = tag_sentence(sent)
    tags = [tag for (word, tag) in tagged]  # Python 3: no tuple-unpacking lambdas

    if passivep(tags):
        file.write(oneline(sent))
        blob = TextBlob(oneline(sent))
        flag = True
        prevnoun = ""
        verb = ""
        nextnoun = ""
        for word, pos in blob.tags:
            if (pos == 'NN' or pos == 'NNP') and flag == True:
                prevnoun = word
            if (pos == 'VBG' or pos == 'RB' or pos == 'VBN') and flag == True:
                verb = word
                flag = False
            if (pos == 'NN' or pos == 'NNP') and flag == False:
                nextnoun = word
                break
        lancaster_stemmer.stem(verb)
        print(verb)
        if len(verbnet.classids(verb)) == 0:
            ans = prevnoun + " " + verb + " " + nextnoun + " "
        else:
            ans1 = verbnet.classids(verb)
            ansstring = ''.join(ans1)
            ans = prevnoun + " " + ansstring + " " + nextnoun + " "
        fileans.write(ans + '\n')

        #print verbnet.classids('acclaim')
        #print "passive:", oneline(sent)
    else:
        file1.write(oneline(sent))
        blob = TextBlob(oneline(sent))
        flag1 = True
        prevnoun1 = ""
        verb1 = ""
        nextnoun1 = ""
        for word, pos in blob.tags:
            #print word,pos
            if (pos == 'NN' or pos == 'NNP') and flag1 == True:
                prevnoun1 = word
            if (pos == 'VBG' or pos == 'RB' or pos == 'VBN') and flag1 == True:
                verb1 = word
                flag1 = False
            if (pos == 'NN' or pos == 'NNP') and flag1 == False:
                nextnoun1 = word
                break
        lancaster_stemmer.stem(verb1)
        print(verb1)
        if len(verbnet.classids(verb1)) == 0:
            ans = prevnoun1 + " " + verb1 + " " + nextnoun1 + " "
        else:
            ans1 = verbnet.classids(verb1)
            ansstring = ''.join(ans1)
            ans = prevnoun1 + " " + ansstring + " " + nextnoun1 + " "
        fileans.write(ans + '\n')
Code Example #22
        from lcsreader import LexicalConceptualStructureLexicon
        lcs = LexicalConceptualStructureLexicon(home + '/Desktop/protocols/data/verbs-English.lcs')
        lcs_feats = ['lcs_eventive', 'lcs_stative']

        # Wordnet supersenses(lexicographer names)
        supersenses = list(set(['supersense=' + x.lexname() for x in wordnet.all_synsets()]))

        # Framenet
        lem2frame = {}
        for lm in framenet.lus():
            for lemma in lm['lexemes']:
                lem2frame[lemma['name'] + '.' + framnet_posdict[lemma['POS']]] = lm['frame']['name']
        frame_names = ['frame=' + x.name for x in framenet.frames()]

        # Verbnet classids
        verbnet_classids = ['classid=' + vcid for vcid in verbnet.classids()]

        dict_feats = {}
        for f in verbnet_classids + lexical_feats + supersenses + frame_names + lcs_feats + all_ud_feature_cols + conc_cols:
            dict_feats[f] = 0

        x_pd = pd.DataFrame([features_func(sent_feat=sent, token=token, lemma=lemma, dict_feats=dict_feats.copy(), prot=args.prot, concreteness=concreteness, lcs=lcs, l2f=lem2frame) for sent, token, lemma in zip(raw_x, tokens, lemmas)])

        dev_x_pd = pd.DataFrame([features_func(sent_feat=sent, token=token, lemma=lemma, dict_feats=dict_feats.copy(), prot=args.prot, concreteness=concreteness, lcs=lcs, l2f=lem2frame) for sent, token, lemma in zip(raw_dev_x, dev_tokens, dev_lemmas)])

        test_x_pd = pd.DataFrame([features_func(sent_feat=sent, token=token, lemma=lemma, dict_feats=dict_feats.copy(), prot=args.prot, concreteness=concreteness, lcs=lcs, l2f=lem2frame) for sent, token, lemma in zip(raw_test_x, test_tokens, test_lemmas)])

        feature_names = (verbnet_classids, supersenses, frame_names, lcs_feats, conc_cols, lexical_feats, all_ud_feature_cols)

        y = {}
        dev_y = {}
Code Example #23
from nltk.corpus import verbnet

# classids() looks up a bare verb lemma, so pass 'kill' rather than 'to kill'.
classID = verbnet.classids('kill')

for id in classID:
    print(verbnet.themroles(id))
Code Example #24
def analyze_constructs(examples, role_mapping, evaluation_sets, verbnet):
    annotated_sentences, lemma_in_vn = 0, 0
    n_correct_frames, n_frames = 0, 0
    n_correct_roles, n_roles = 0, 0
    n_classes_in_list, n_classes = 0, 0

    for lexie, lemma, sentence_text, gold_syntax in examples:
        d = sentence_text in [sentence for source, sentence in evaluation_sets['train']]
        test_context = sentence_text in [sentence for source, sentence in evaluation_sets['test']]

        debug(d, [])

        if d == test_context:
            print(d, test_context, sentence_text)
        assert d != test_context

        debug(d, [lexie, lemma, sentence_text])
        if test_context:
            annotated_sentences += 1

        # First possible error: lemma does not exist in VerbNet
        if not verbnet.classids(lemma):
            continue

        if test_context:
            lemma_in_vn += 1
            n_frames += 1

        considered_syntax = []
        for vn_frame in verbnet.frames_for_lemma(lemma):
            vn_syntax = vn_frame['frame'].find('SYNTAX')
            # If sentence starts with a verb, remove anything that's before
            # the verb in VerbNet
            if next(iter(gold_syntax)).tag == 'VERB':
                vn_syntax = remove_before_v(vn_syntax)
            considered_syntax.append((vn_frame['classid'], vn_syntax))

        # Use an OrderedDict for now to get the same behavior as
        # with the tuple list
        vn_syntax_matches = OrderedDict()
        for classid, vn_syntax in considered_syntax:
            if matches_verbnet_frame(gold_syntax, vn_syntax):
                if classid not in vn_syntax_matches:
                    vn_syntax_matches[classid] = []
                # check if vn_syntax is already in there?
                vn_syntax_matches[classid].append(vn_syntax)

        # Second possible error: syntactic pattern is not in VerbNet
        if not vn_syntax_matches:
            debug(d, ['   ', Fore.RED, syntax_to_str(gold_syntax), Fore.RESET])
            continue

        if test_context:
            n_correct_frames += 1
            n_classes += 1

        if lexie not in role_mapping:
            raise Exception('Missing lexie {} ({}) in role mapping.'.format(lexie, lemma))

        debug(d, ['   ', Fore.GREEN, syntax_to_str(gold_syntax), '->', syntax_to_str(map_gold_frame(classid, gold_syntax, role_mapping[lexie])), Fore.RESET])

        for classid in vn_syntax_matches:
            debug(d, ['    ', classid, ' -> ',
                [syntax_to_str(vn_syntax) for vn_syntax in
                    vn_syntax_matches[classid]]])

        class_matches = set(vn_syntax_matches.keys()) & set(role_mapping[lexie])
        if not class_matches:
            continue

        if test_context:
            n_classes_in_list += 1

        classid  = next(iter(class_matches))
        vn_syntax = vn_syntax_matches[classid][0]

        if classid not in role_mapping[lexie]:
            continue

        for i, correct_syntax in enumerate(gold_syntax):
            # if this is a 'frame element', not a V or anything else
            if correct_syntax.tag in ['NP', 'S']:
                if role_mapping[lexie] == {}:
                    # missing sense
                    # TODO handle this explicitly using XML annotations
                    pass
                elif classid not in role_mapping[lexie]:
                    raise Exception('{} misses {} class'.format(lexie, classid))
                elif correct_syntax.get('value') not in role_mapping[lexie][classid]:
                    raise Exception('{} misses {} mapping'.format(lexie, correct_syntax.get('value')))

                if test_context:
                    n_roles += 1

                candidate_roles = set()
                candidate_roles.add(list(vn_syntax)[i].get('value'))

                if role_mapping[lexie][classid][correct_syntax.get('value')] in candidate_roles:
                    if test_context:
                        n_correct_roles += 1 / len(candidate_roles)

    print(annotated_sentences, n_frames, n_classes, n_roles)
    print('-                          {:.0%} of lemma tokens are here'.format(lemma_in_vn/annotated_sentences))
    print('- For these tokens,        {:.1%} of constructions are correct'.format(n_correct_frames/n_frames))
    print('- For these constructions, {:.1%} of classes are here'.format(n_classes_in_list/max(n_classes, 1)))
    print('- For these classes,       {:.1%} of roles are correct'.format(n_correct_roles/max(n_roles, 1)))
    print()
Code Example #25
def features_func(sent_feat, token, lemma, dict_feats, prot, concreteness, lcs,
                  l2f):
    '''Extract features from a word'''
    sent = sent_feat[0]
    feats = sent_feat[1][0]
    all_lemmas = sent_feat[1][1]
    deps = [x[2] for x in sent.tokens[token].dependents]
    deps_text = [x[2].text for x in sent.tokens[token].dependents]
    deps_feats = '|'.join([(a + "_dep") for x in deps
                           for a in feats[x.position].split('|')])

    all_feats = (feats[token] + '|' + deps_feats).split('|')
    all_feats = list(filter(None, all_feats))

    # UD Lexical features
    for f in all_feats:
        if f in dict_feats.keys():
            dict_feats[f] = 1

    # Lexical item features
    for f in deps_text:
        if f in dict_feats.keys():
            dict_feats[f] = 1

    # wordnet supersense of lemma
    for synset in wordnet.synsets(lemma):
        dict_feats['supersense=' + synset.lexname()] = 1

    # framenet name
    pos = sent.tokens[token].tag
    if lemma + '.' + pos in l2f.keys():
        frame = l2f[lemma + '.' + pos]
        dict_feats['frame=' + frame] = 1

    # Predicate features
    if prot == "pred":
        # verbnet class
        f_lemma = verbnet.classids(lemma=lemma)
        for f in f_lemma:
            dict_feats['classid=' + f] = 1

        # lcs eventiveness
        if lemma in lcs.verbs:
            if True in lcs.eventive(lemma):
                dict_feats['lcs_eventive'] = 1
            else:
                dict_feats['lcs_stative'] = 1

        dep_c_scores = [
            concreteness_score(concreteness, g_lemma) for g_lemma in
            [all_lemmas[x[2].position] for x in sent.tokens[token].dependents]
        ]
        if len(dep_c_scores):
            dict_feats['concreteness'] = sum(dep_c_scores) / len(dep_c_scores)
            dict_feats['max_conc'] = max(dep_c_scores)
            dict_feats['min_conc'] = min(dep_c_scores)
        else:
            dict_feats['concreteness'] = 2.5
            dict_feats['max_conc'] = 2.5
            dict_feats['min_conc'] = 2.5
    # Argument features
    else:
        dict_feats['concreteness'] = concreteness_score(concreteness, lemma)

        # lcs eventiveness score and verbnet class of argument head
        if sent.tokens[token].gov:
            gov_lemma = all_lemmas[sent.tokens[token].gov.position]

            # lexical features of dependent of governor
            deps_gov = [x[2].text for x in sent.tokens[token].gov.dependents]
            for f in deps_gov:
                if f in dict_feats.keys():
                    dict_feats[f] = 1

            # lcs eventiveness
            if gov_lemma in lcs.verbs:
                if True in lcs.eventive(gov_lemma):
                    dict_feats['lcs_eventive'] = 1
                else:
                    dict_feats['lcs_stative'] = 1

            for f_lemma in verbnet.classids(lemma=gov_lemma):
                dict_feats['classid=' + f_lemma] = 1

            # framenet name of head
            pos = sent.tokens[token].gov.tag
            if gov_lemma + '.' + pos in l2f.keys():
                frame = l2f[gov_lemma + '.' + pos]
                dict_feats['frame=' + frame] = 1

    return dict_feats
Code Example #26
            maxTerm = term
    # print name, maxTerm, maxCount
    total += maxCount
    maxCount = 0
avg = total / l
# print bagOfWords["elizabeth"]
# print avg

allverbs = []

# Creating training set
fr = open(extfile, 'r')
for line in fr:
    token = line.strip("\n")
    extList[token] = avg
    words = verbnet.classids(token)
    for w in words:
        finalWord = w.decode("UTF-8", "ignore")
        allverbs += verbnet.lemmas(finalWord)

for v in allverbs:
    extList[v] = avg / 2
# print len(extList)

allverbs = []

fr = open(intfile, 'r')
for line in fr:
    token = line.strip("\n")
    intList[token] = avg
    words = verbnet.classids(token)
Code Example #27
nps = extract_phrase(tree_str, 'NP')
vps = extract_phrase(tree_str, 'VP')
pps = extract_phrase(tree_str, 'PP')

if before_verb in nps:
    print("YES BEFORE VERB")

if after_verb in nps:
    print("YES AFTER VERB")

print(nps)
print(vps)
print(pps)

for np in nps:
    print(np)

print("=============")

word = "come"
vn_results = vn.classids(lemma=word)
print(vn_results)

frames = vn.frames('51.2')[0]

syntax = frames['syntax']
for item in syntax:
    print(item['pos_tag'])
    print("=====================")
nlp.close()
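
As a follow-up note to the snippet above: a frame dict returned by vn.frames() also carries 'description', 'example' and 'semantics' entries in the NLTK reader, alongside 'syntax' (a short, hedged sketch):

frame = vn.frames('51.2')[0]
print(frame['description']['primary'])  # primary frame description string
print(frame['example'])                 # example sentence from the class XML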
Code Example #28

# In[8]:


verbNet = []
for unit in data:
    sentence = []
    for cap in unit[3]:
#         print(unit[3])
        tagged = nltk.pos_tag(cap)
        verb_lists = []
        for idx, tp in enumerate(tagged):
            if (tp[1][:2] == 'VB'):
                base_form = WordNetLemmatizer().lemmatize(cap[idx], 'v')
                if (verbnet.classids(base_form) != []):
                    attr = verbnet.classids(base_form)
                    at_list = []
                    for at in attr:
                        splitted_at = []
                        splitted_string = at.split('-')
                        splitted_at.append(splitted_string[0])
                        splitted_at.append(splitted_string[1].split('.')[0])
                        at_list.append([])
                        at_list[-1] = splitted_at
                    verb_lists.append([base_form, at_list, len(attr)])
        sentence.append(verb_lists)
#         print(sentence[-1])
    verbNet.append(sentence)

# print(verbNet)
Code Example #29
#     print(i)
# # for i in featuresset:
# #     print(i)
# random.shuffle(featuresset)
# classifier = nltk.NaiveBayesClassifier.train(featuresset)
# save_classifier_NBC(classifier)

#-----------------------------------------testing---------------------------------------------------
input = "He need a ride from his home."
verb_list, frames_list = prim_fram(input)
print(frames_list)
print(nltk.pos_tag(nltk.word_tokenize(input)))
print(verb_list)
for r in range(len(verb_list)):
    keys = []
    ids = vb.classids(verb_list[r])
    for i in ids:
        u = vb.vnclass(i)
        for j in [l.attrib['type'] for l in u.findall('THEMROLES/THEMROLE/SELRESTRS/SELRESTR')]:
            keys.append(j)
        for j in [l.attrib['type'] for l in u.findall('THEMROLES/THEMROLE')]:
            keys.append(j)
        for j in [l.attrib['value'] for l in u.findall('FRAMES/FRAME/SEMANTICS/PRED')]:
            keys.append(j)
    f = open("tmp/features_verbs.txt","r")
    word_features = []

    for l,i in enumerate(f):
        word_features.append(i)
    f.close()
Code Example #30
import numpy
from sklearn import linear_model, cross_validation, neural_network
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from nltk.corpus import verbnet

goog = utility.get_news_prices('google')
goog.append(utility.get_news_prices('microsoft'))
goog.append(utility.get_news_prices('apple'))
goog.append(utility.get_news_prices('yahoo'))
goog.append(utility.get_news_prices('adobe'))
goog.append(utility.get_news_prices('ford'))

# Select model of computation:
model = neural_network.MLPRegressor(
    [len(verbnet.classids()) + 200, 500, 300, 100], 'relu', 'adam', 0.0001,
    200, 'constant', 0.001, 0.5, 200, True, None, 0.0001, False, False, 0.9,
    True, False, 0.1, 0.9, 0.999, 1e-08)
# model = RandomForestRegressor(n_estimators=50, max_features=30, max_depth=9, n_jobs=1)
# model = SVC(kernel='linear', probability=True, random_state=40)
# model = linear_model.LinearRegression()

# model = utility.pipeline_setup(model)

# model_fitted = model.fit(goog['message'], goog['Threshold Change'])

# Select columns:
x = goog.message.apply(
    lambda sentence: utility.get_feature_vector(sentence + "."))
# x.to_csv('data/google_msg_id.csv')
# x = pandas.read_csv('data/google_msg_id.csv')
Code Example #31
# In[ ]:

# In[ ]:

# In[33]:

nltk.download()

# In[34]:

from nltk.corpus import verbnet

# In[38]:

verbnet.classids(lemma='add')

# In[39]:

verbnet.classids(lemma='buy')

# In[41]:

verbnet.classids(lemma='take')

# In[42]:

verbnet.classids(lemma='give')

# In[44]:
Code Example #32
def hand_engineering(prot, batch_size, data, data_dev):
    '''
        Hand engineered feature extraction. Supports the following - UD,
        Verbnet classids, Wordnet supersenses, concreteness ratings, LCS
        eventivity scores
    '''
    home = expanduser("~")
    framnet_posdict = {
        'V': 'VERB',
        'N': 'NOUN',
        'A': 'ADJ',
        'ADV': 'ADV',
        'PREP': 'ADP',
        'NUM': 'NUM',
        'INTJ': 'INTJ',
        'ART': 'DET',
        'C': 'CCONJ',
        'SCON': 'SCONJ',
        'PRON': 'PRON',
        'IDIO': 'X',
        'AVP': 'ADV'
    }
    # Load the features
    features = {}
    with open(home + '/Desktop/protocols/data/features-2.tsv', 'r') as f:
        for line in f.readlines():
            feats = line.split('\t')
            features[feats[0]] = (feats[1].split(), feats[2].split())

    # Load the predpatt objects for creating features
    files = [
        '/Downloads/UD_English-r1.2/en-ud-train.conllu',
        '/Downloads/UD_English-r1.2/en-ud-dev.conllu',
        '/Downloads/UD_English-r1.2/en-ud-test.conllu'
    ]
    home = expanduser("~")
    options = PredPattOpts(resolve_relcl=True,
                           borrow_arg_for_relcl=True,
                           resolve_conj=False,
                           cut=True)  # Resolve relative clause
    patt = {}

    for file in files:
        path = home + file
        with open(path, 'r') as infile:
            for sent_id, ud_parse in load_conllu(infile.read()):
                patt[file[33:][:-7] + " " + sent_id] = PredPatt(ud_parse,
                                                                opts=options)

    data['Structure'] = data['Split.Sentence.ID'].map(lambda x:
                                                      (patt[x], features[x]))
    data_dev['Structure'] = data_dev['Split.Sentence.ID'].map(
        lambda x: (patt[x], features[x]))

    raw_x = data['Structure'].tolist()
    raw_dev_x = data_dev['Structure'].tolist()

    all_x = raw_x + raw_dev_x
    all_feats = '|'.join(['|'.join(all_x[i][1][0]) for i in range(len(all_x))])
    feature_cols = Counter(all_feats.split('|'))

    # All UD dataset features
    all_ud_feature_cols = list(
        feature_cols.keys()) + [(a + "_dep") for a in feature_cols.keys()]

    # Concreteness
    f = open(home + '/Desktop/protocols/data/concrete.pkl', 'rb')
    concreteness = pickle.load(f)
    if prot == 'arg':
        conc_cols = ['concreteness']
    else:
        conc_cols = ['concreteness', 'max_conc', 'min_conc']
    f.close()

    # LCS eventivity
    from lcsreader import LexicalConceptualStructureLexicon
    lcs = LexicalConceptualStructureLexicon(
        home + '/Desktop/protocols/data/verbs-English.lcs')
    lcs_feats = ['lcs_eventive', 'lcs_stative']

    # Wordnet supersenses(lexicographer names)
    supersenses = list(
        set(['supersense=' + x.lexname() for x in wordnet.all_synsets()]))

    # Framenet
    lem2frame = {}
    for lm in framenet.lus():
        for lemma in lm['lexemes']:
            lem2frame[lemma['name'] + '.' +
                      framnet_posdict[lemma['POS']]] = lm['frame']['name']
    frame_names = ['frame=' + x.name for x in framenet.frames()]

    # Verbnet classids
    verbnet_classids = ['classid=' + vcid for vcid in verbnet.classids()]

    # Lexical features
    lexical_feats = [
        'can', 'could', 'should', 'would', 'will', 'may', 'might', 'must',
        'ought', 'dare', 'need'
    ] + [
        'the', 'an', 'a', 'few', 'another', 'some', 'many', 'each', 'every',
        'this', 'that', 'any', 'most', 'all', 'both', 'these'
    ]

    dict_feats = {}
    for f in verbnet_classids + lexical_feats + supersenses + frame_names + lcs_feats + all_ud_feature_cols + conc_cols:
        dict_feats[f] = 0

    x_pd = pd.DataFrame([
        features_func(sent_feat=sent,
                      token=token,
                      lemma=lemma,
                      dict_feats=dict_feats.copy(),
                      prot=prot,
                      concreteness=concreteness,
                      lcs=lcs,
                      l2f=lem2frame) for sent, token, lemma in
        zip(raw_x, data['Root.Token'].tolist(), data['Lemma'].tolist())
    ])

    dev_x_pd = pd.DataFrame([
        features_func(sent_feat=sent,
                      token=token,
                      lemma=lemma,
                      dict_feats=dict_feats.copy(),
                      prot=prot,
                      concreteness=concreteness,
                      lcs=lcs,
                      l2f=lem2frame)
        for sent, token, lemma in zip(raw_dev_x, data_dev['Root.Token'].tolist(
        ), data_dev['Lemma'].tolist())
    ])

    # Figure out which columns to drop(they're always zero)
    todrop1 = dev_x_pd.columns[(dev_x_pd == 0).all()].values.tolist()
    todrop = x_pd.columns[(x_pd == 0).all()].values.tolist()
    intdrop = [a for a in todrop if a not in todrop1]
    cols_to_drop = list(set(todrop) - set(intdrop))

    x = x_pd.drop(cols_to_drop, axis=1).values.tolist()
    dev_x = dev_x_pd.drop(cols_to_drop, axis=1).values.tolist()

    x = [[a[:] for a in x[i:i + batch_size]]
         for i in range(0, len(data), batch_size)]
    dev_x = [[a[:] for a in dev_x[i:i + batch_size]]
             for i in range(0, len(data_dev), batch_size)]
    return x, dev_x
Code Example #33
 def root_word_verbnet_features(self):
     self.root_word_lemma = lemmatizer.lemmatize(self.root_word, 'v')
     all_classids = vn.classids(lemma=self.root_word_lemma)
     self.verb_class = ' '.join([c_id.split('-')[0] for c_id in all_classids])
     self.only_classids = ' '.join([vn.shortid(c_id) for c_id in all_classids])
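
For reference, a small hedged illustration of the two formats joined above: splitting a class id on '-' keeps the verb/class name, while vn.shortid() keeps only the numeric part.

example_id = 'give-13.1'           # illustrative VerbNet class id
print(example_id.split('-')[0])    # 'give'  (verb_class style)
print(vn.shortid(example_id))      # '13.1'  (only_classids style)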
Code Example #34
from nltk.corpus import verbnet

def GetVerbnetRestrictions(vnclass):
  role_restrictions = {}

  while True:
    for role in vnclass.findall('THEMROLES/THEMROLE'):
      restrictions = role.find('SELRESTRS')
      if restrictions:
        restriction_set = set()
        for restriction in restrictions.findall('SELRESTR'):
          predicate = restriction.attrib
          restriction_set.add((predicate['Value'], predicate['type']))

        total = (restrictions.get('logic', 'and'), list(restriction_set))
        role_restrictions[role.attrib['type']] = total

    if vnclass.tag == 'VNCLASS':
      break
    else:
      parent_class = vnclass.attrib['ID'].rsplit('-', 1)[0]
      vnclass = verbnet.vnclass(parent_class)

  return role_restrictions

vnclasses = verbnet.classids('drink')
v=verbnet.vnclass('39.1-2')
GetVerbnetRestrictions(v)
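
The calls at the bottom discard their results; a hedged variant that prints the restriction mapping for every class of 'drink', reusing only names already defined above:

for classid in verbnet.classids('drink'):
    vnclass = verbnet.vnclass(classid)
    print(classid, GetVerbnetRestrictions(vnclass))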
Code Example #35
from nltk.corpus import verbnet

my_classids = verbnet.classids(lemma='take')
print(my_classids)
# my_lemmas = verbnet.lemmas(my_classids)
# my_longid = longid(my_shortid)
# my_shortid = shortid(my_longid)
for i in my_classids:
    my_vnclass = verbnet.vnclass(i)
    # my_wordnetids = verbnet.wordnetids(mi)
    # Human-friendly methods
    verbnet.pprint(my_vnclass)
    # vnframe = my_vnclass.findall('FRAMES/FRAME')
    # print(verbnet.pprint_description(vnframe))
    # print(verbnet.pprint_frames(vnframe))
    print(verbnet.pprint_members(my_vnclass))
    # print(verbnet.pprint_semantics(vnframe))
    print(verbnet.pprint_subclasses(my_vnclass))
    # print(verbnet.pprint_syntax(vnframe))
    # x = verbnet.pprint_themroles(my_vnclass)
    print(verbnet.pprint_themroles(my_vnclass))
    '''for j in x.split("]"):
        print(j)'''
Code Example #36
def process_srl(srl_output, actual_data, just_phrases):
    porter_stemmer = PorterStemmer()
    wn_lem = WordNetLemmatizer()
    file_open = open (srl_output, "r")
    output    = file_open.read()
    srl_output = output.split("\n================\n")
    srl_list = []
    [srl_list.append(line.strip()) for line in srl_output]

    phrase_sentence = create_vector(just_phrases)

    corpus_data = create_vector(actual_data)
    number = 0
    for line in corpus_data:
        sline       = line.split("\t")
        sense       = sline[2] # figurative or literal
        metaphor    = sline[1] # along the line <- the metaphor itself
        try:
            current_srl = srl_list[number].split("\n") # semantic role labeling of give sentece
        except:
            import pdb; pdb.set_trace()

        #mtokens = metaphor.split(" ")
        mtokens_t = word_tokenize(phrase_sentence[number])
        mtokens_t = [w for w in mtokens_t if not w.decode('utf8') in nlcor.stopwords.words('english')]
        mtokens   = filter(lambda word: word not in ",-'", mtokens_t)
        sane_mt = [mt.decode('utf8') for mt in mtokens]
        pos_mtokens = nltk.pos_tag(sane_mt)
        only_verbs = [tkn[0] for tkn in pos_mtokens if 'VB' in tkn[1]]
        #print "==============================================="
        line_score = 0
        token_count = 1
        number += 1
        #print "phrase tokens: %s" % mtokens_t
        #print "only verbs: %s" % only_verbs

        for mtoken in only_verbs:
            vnclasses = verbnet.classids(mtoken)
            if not vnclasses:
                vnclasses = verbnet.classids(wn_lem.lemmatize(mtoken))
                if not vnclasses:
                    continue
            #print "vnclasses: %s" % vnclasses

            mindex = [index for index, sl in enumerate(current_srl) if porter_stemmer.stem(mtoken) in sl.decode('utf8')]
            if not mindex:
         #       print 0
                continue
            token_count += 1

            class_score = 0
            class_count = 1
            #print '----- %s -----' % mtoken
            for vn in vnclasses:
                v=verbnet.vnclass(vn)
                try:
                    restrictions = GetVerbnetRestrictions(v)
                except:
                    continue

             #   print restrictions
                if restrictions:
                    class_score = check_validity(current_srl, mindex[0], restrictions)
                    class_count += 1
                    #print class_score
                else:
                    #print "No restrictions for %s" % vn
                    pass
            if class_count < 2:
                avg_class_score = class_score / class_count
            else:
                avg_class_score = class_score / (class_count - 1)
            #print '---------------'

            line_score += avg_class_score
            token_count += 1
        if token_count < 2:
            avg_line_score = line_score / token_count
        else:
            avg_line_score = line_score / (token_count - 1)

#        print "%s - %s - %s" % (sline[1], sline[2], line_score)
        print avg_line_score
コード例 #37
    def run(self):
        print("Performing action identifier experiment ...")
        open(self.config['log_file'], 'w').close()  # truncate any previous log
        count = 0
        sentences_total = 0
        start_time = time.time()

        utils.write_log(self.config,
                        "RUNNING CONFIGURATION: {}".format(self.config))

        # Create dataset object
        wikihow = Wikihow.Wikihow(self.config)

        statistic_list = []
        statistic_similarity = []
        ground_truth_count = 0
        dataset_length = int(
            wikihow.get_length() *
            self.config['action_identifier']['dataset_evaluation_percent'])

        if dataset_length < 1:
            print("No examples to process in dataset. Aborting ...")
            return

        verbs = []

        # Load the spaCy model once instead of reloading it for every example
        spacy_en = spacy.load('en_core_web_sm')

        for idx in trange(dataset_length):
            instance = wikihow.get_entry(idx)
            text = wikihow.process_example(instance[1])
            utils.write_log(
                self.config,
                "\n---------------------------------------------------------------------------\n"
            )
            utils.write_log(self.config, "FILE: {}\n".format(instance[0]))

            for sentence in text:
                sentences_total += 1

                # Tokenize
                if self.config['action_identifier'][
                        'ground_truth_generator'] == 'nltk':
                    sentence_tokens = nltk.word_tokenize(sentence)
                    sentence_tags = nltk.pos_tag(sentence_tokens)
                    ground_truth_verbs = [
                        v[0] for v in sentence_tags
                        if len(verbnet.classids(v[0])) > 0
                    ]
                elif self.config['action_identifier'][
                        'ground_truth_generator'] == 'spacy':
                    doc = spacy_en(sentence)
                    sentence_tokens = [t for t in doc]
                    sentence_tags = [(str(t), t.pos_) for t in doc]
                    ground_truth_verbs = [v for v in doc if v.pos_ == 'VERB']
                else:
                    print("No ground-truth mechanism defined! Aborting ...")
                    return

                utils.write_log(self.config,
                                "\n>SENTENCE: {}".format(sentence))
                utils.write_log(self.config,
                                "\n  >SENTENCE TAGS: {}".format(sentence_tags))

                if len(ground_truth_verbs) == 0:
                    ground_truth_count += 1

                utils.write_log(
                    self.config,
                    "\n  >GROUND-TRUTH VERBS: {}".format(ground_truth_verbs))

                embedding_verbs = []

                for token, tag in zip(sentence_tokens, sentence_tags):
                    keyword_similarity = []
                    for keyword in self.config['action_identifier'][
                            'keywords']:
                        try:
                            similarity = 1.0 - self.word_embedding.get_distance(
                                str(token), str(keyword))[2]
                        except KeyError:
                            similarity = 0.0

                        keyword_similarity.append(similarity)

                    mean = np.mean(keyword_similarity)

                    if mean >= float(self.config['action_identifier']
                                     ['similarity_threshold']):
                        embedding_verbs.append((str(token), mean))
                        statistic_similarity.append(mean)
                        verbs.append(token)

                ground_truth_set = {str(v) for v in ground_truth_verbs}
                print("Ground truth set: ", ground_truth_set)

                embedding_verbs_set = {str(v[0]) for v in embedding_verbs}
                print("Embedding set: ", embedding_verbs_set)

                true_positive = embedding_verbs_set.intersection(
                    ground_truth_set)
                print("True positive: ", true_positive)

                false_positive = embedding_verbs_set.difference(
                    ground_truth_set)
                print("False positive: ", false_positive)

                false_negative = ground_truth_set.difference(
                    embedding_verbs_set.intersection(ground_truth_set))
                print("False negative: ", false_negative)

                # false_negative

                # true_positive = [e[0] in ground_truth_verbs for e in embedding_verbs]
                # true_positive = np.count_nonzero(true_positive)
                #
                # false_positive = [e[0] not in ground_truth_verbs for e in embedding_verbs]
                # false_positive = np.count_nonzero(false_positive)
                #
                # true_negative = []
                # false_negative = np.count_nonzero(true_negative)
                #
                # false_negative = [e not in embedding_verbs for e in ground_truth_verbs]
                # false_negative = np.count_nonzero(false_negative)

                true_positive = len(true_positive)
                false_positive = len(false_positive)
                false_negative = len(false_negative)

                # note: built from the last token/tag of the loop above; currently unused
                sentence_entry = (token, tag,
                                  self.word_embedding.get_word_vector(token),
                                  keyword_similarity, mean)

                utils.write_log(
                    self.config,
                    "\n  >EMBEDDING VERBS: {}".format(embedding_verbs))

                # Text statistics [true positive, false negative, precision, recall, f-score]
                try:
                    precision = true_positive / (true_positive +
                                                 false_positive)
                except ZeroDivisionError:
                    precision = 0.0

                try:
                    recall = true_positive / (true_positive + false_negative)
                except ZeroDivisionError:
                    recall = 0.0

                try:
                    f_score = 2 * (recall * precision) / (recall + precision)
                except ZeroDivisionError:
                    f_score = 0.0

                utils.write_log(
                    self.config,
                    "\n  >TP: {} FP: {} FN: {} Precision: {} Recall: {} F-Score: {}"
                    .format(true_positive, false_positive, false_negative,
                            precision, recall, f_score))
                statistic_list.append([
                    true_positive, false_positive, false_negative, precision,
                    recall, f_score
                ])
            count += 1

        print("Calculating statistics ...")
        statistic_mean = np.mean(statistic_list, axis=0)
        statistic_std = np.std(statistic_list, axis=0)

        utils.write_log(
            self.config,
            "\n=======================================================================\n"
        )
        utils.write_log(
            self.config,
            "RESULTS (Elapsed time: {:.4f} seconds)".format(time.time() -
                                                            start_time))
        utils.write_log(self.config, "\n  Total of examples: {}".format(count))
        utils.write_log(self.config, "\n  Total of sentences: {} - Mean per example: {:.4f} - Ground-truth sentences with zero verbs: {} ({:.4f} %)".format(sentences_total, \
                            sentences_total / count, ground_truth_count, ground_truth_count / sentences_total))
        utils.write_log(
            self.config, "\n  Mean True Positive: {:.4f} - Std: {:.4f}".format(
                statistic_mean[0], statistic_std[0]))
        utils.write_log(
            self.config,
            "\n  Mean False Positive: {:.4f} - Std: {:.4f}".format(
                statistic_mean[1], statistic_std[1]))
        utils.write_log(
            self.config,
            "\n  Mean False Negative: {:.4f} - Std: {:.4f}".format(
                statistic_mean[2], statistic_std[2]))
        utils.write_log(
            self.config, "\n  Mean Similarity: {:.4f} - Std: {:.4f}".format(
                np.mean(statistic_similarity), np.std(statistic_similarity)))
        utils.write_log(
            self.config,
            "\n  Mean Precision: {:.4f} - Recall: {:.4f} - F-Score: {:.4f}".
            format(statistic_mean[3], statistic_mean[4], statistic_mean[5]))

        # flatten = lambda l: [item for sublist in l for item in sublist]
        #
        # verbs = flatten(verbs)
        verbs = [str(v) for v in verbs]

        import pandas as pd
        verb_counts = pd.DataFrame(verbs)[0].value_counts()
        verb_counts.to_csv(self.config['log_file'] + "-dataframe")
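
A stripped-down sketch of the NLTK ground-truth verb filter used in the loop above (the sentence is a made-up example; requires the usual punkt and tagger data):

import nltk
from nltk.corpus import verbnet

sentence = "Pour the mixture into a pan and bake it."  # hypothetical example
tags = nltk.pos_tag(nltk.word_tokenize(sentence))
# a token counts as a ground-truth verb when VerbNet lists at least one class for it
ground_truth_verbs = [w for w, _ in tags if len(verbnet.classids(w)) > 0]
print(ground_truth_verbs)
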
コード例 #38
ファイル: lingstructs.py プロジェクト: nweb2000/verb-checker
    def create_fvect(self, createfrom=None): 
        """
            adds features to feature vector, this function does not add labels, that is up to 
            classes that derive from the CorrectionFeatures class
            @params
                list createfrom - a feature vector list to base this feature vector on (append to createfrom)
        """
        if createfrom:
            fvect = createfrom
        else:
            fvect = []

        corr = self.instance.correction
        error = self.instance.error
        #extract data needed for features
        subj = self.sentence.get_subject_token()[0]
        left = self.sentence.get_token_left(error.first().tid)
        right = self.sentence.get_token_right(error.last().tid)

        left2 = self.sentence.get_token_left(left.tid)
        left3 = self.sentence.get_token_left(left2.tid)
        left4 = self.sentence.get_token_left(left3.tid)
        right2 = self.sentence.get_token_right(right.tid)
        right3 = self.sentence.get_token_right(right2.tid)
        right4 = self.sentence.get_token_right(right3.tid)

        leftnoun = closest_noun(error.first(), self.sentence, True)
        rightnoun = closest_noun(error.last(), self.sentence, False)
        gov_tuple = self.sentence.get_gov(error.head().tid)
        gov_token = self.sentence.get_token(gov_tuple[1])
        governee_list = self.sentence.get_governees(error.head().tid)
        governee_tuple = governee_list[0]
        governee_token = self.sentence.get_token(governee_tuple[1])
        prevphrase = prev_vphrase(error, self.sentence)

        ladv = time_adverb(error.first(), self.sentence, True)
        radv = time_adverb(error.last(), self.sentence, False)

        governee_rels = [x[0] + "governeerel" for x in governee_list]
        governees = [self.sentence.get_token(x[1]).abbv_to_word() + "governee" for x in governee_list]
        governeespos = [self.sentence.get_token(x[1]).pos + "governee" for x in governee_list]

        det = self.sentence.get_det(subj.tid) 

        vnet_classes = verbnet.classids(error.head().lemma)
        if not vnet_classes:
            vnet_class = []
        else:
            vnet_class = ["".join([x for x in classes if str.isalpha(x)]) for classes in vnet_classes]
            vnet_class = [x + "class" for x in vnet_class]

        if prevphrase:
            prevhead = prevphrase.head()
            c = verbnet.classids(prevhead.lemma)
            if not c:   
                prevclass = None 
            else:
                prevclass = c[0] 
                prevclass = "".join([x for x in prevclass if str.isalpha(x)])
            prevaspect = get_aspect(prevphrase)
        else:
            prevhead = None
            prevclass = None  
            prevaspect = None

        fvect.append(error.head().abbv_to_word() + "self")
#       fvect.extend(vnet_class)

        if prevhead:    
            fvect.append(prevhead.abbv_to_word() + "prevword")
            fvect.append(prevhead.pos + "prevpos")
#       if prevclass:
#           fvect.append(prevclass + "prevclass")
        if prevaspect:
            fvect.append(prevaspect + "prevaspect")

#       fvect.append(right2.abbv_to_word())
#       fvect.append(left2.abbv_to_word())
#       fvect.append(right2.pos)
#       fvect.append(left2.pos)

#       fvect.append(right3.word + "right")
#       fvect.append(left3.word + "left")
#       fvect.append(right3.pos + "right")
#       fvect.append(left3.pos + "left")
#
#       fvect.append(right4.word + "right")
#       fvect.append(left4.word + "left")
#       fvect.append(right4.pos + "right")
#       fvect.append(left4.pos + "left")

        fvect.append(right.abbv_to_word() + "right")
        fvect.append(right.pos + "right")
        fvect.append(left.abbv_to_word() + "left")
        fvect.append(left.pos + "left")


        fvect.append(subj.pos + "subj")
        fvect.append(subj.abbv_to_word() + "subjlem")
        fvect.append(str(subj.noun_person()) + "subj")
        fvect.append(str(subj.singular_noun()) + "subj")
#       fvect.append(det.word + "det")
        fvect.append(str(self.sentence.ispassive()) + "passive")
#       if leftnoun.isvalid():
#           fvect.append(str(leftnoun.singular_noun()) + "leftn")
#           fvect.append(str(leftnoun.noun_person()) + "leftn")
#       fvect.append(leftnoun.pos + "leftn")
#           fvect.append(leftnoun.abbv_to_word() + "leftn")
#       if rightnoun.isvalid():
#           fvect.append(str(rightnoun.noun_person()) + "rightn")
#           fvect.append(str(rightnoun.singular_noun()) + "rightn")
#       fvect.append(rightnoun.pos + "rightn")
#           fvect.append(rightnoun.abbv_to_word() + "rightn")
#       fvect.extend(governee_rels)
#       fvect.extend(governees)
#       fvect.extend(governeespos)

        fvect.append(gov_token.word + "gov")
        fvect.append(gov_token.pos + "gov")
        fvect.append(gov_tuple[0] + "govrel")
        fvect.append(governee_token.word + "governee")
        fvect.append(governee_token.pos + "governee")
        fvect.append(governee_tuple[0] + "governeerel")
        if ladv.isvalid():
            fvect.append(ladv.word + "adverb")
        if radv.isvalid():
            fvect.append(radv.word + "adverb")

        return fvect
コード例 #39
def extractFeatures(token, sentence, filename, syntacticFeatures):
    rowOfFeats = []

    verb = token['word']
    idVerb = token['id']

    Features = Verb(token['word'], token['lemma'], token['pos'])
    Features.set_metadata(sentence['id'], idVerb, filename)

    if 'attribution' in token:
        role = token['role']
        if role == 'cue':
            Features.set_label('Y')
        else:
            Features.set_label('N')
    else:
        Features.set_label('N')

    if idVerb > 0:
        prevToken = sentence['tokens'][idVerb - 1]
    else:
        prevToken = None
    if idVerb < len(sentence['tokens']) - 1:
        nexToken = sentence['tokens'][idVerb + 1]
    else:
        nexToken = None

    if prevToken != None:
        Features.set_previousToken(prevToken['word'], prevToken['lemma'],
                                   prevToken['pos'])

        if prevToken['word'] == ':':
            Features.set_colonAdjacent()
        elif prevToken['word'] == '``':
            Features.set_quoteAdjacentInside()
        elif prevToken['word'] == "''":
            Features.set_quoteAdjacentOutside()
        elif prevToken['word'] == ',':
            beforeComma = sentence['tokens'][idVerb - 2]
            if beforeComma['word'] == '``':
                Features.set_quoteAdjacentInside()
            elif beforeComma['word'] == "''":
                Features.set_quoteAdjacentOutside()

    if nexToken != None:
        Features.set_nextToken(nexToken['word'], nexToken['lemma'],
                               nexToken['pos'])

        if nexToken['word'] == ':':
            Features.set_colonAdjacent()
        elif nexToken['word'] == '``':
            Features.set_quoteAdjacentOutside()
        elif nexToken['word'] == "''":
            Features.set_quoteAdjacentInside()
        elif nexToken['word'] == ',':
            try:
                afterComma = sentence['tokens'][idVerb + 2]
                if afterComma['word'] == '``':
                    Features.set_quoteAdjacentOutside()
                elif afterComma['word'] == "''":
                    Features.set_quoteAdjacentInside()
            except IndexError:
                print 'out of range'
    else:
        Features.set_nextToken('NONE!!', 'NONE!!', 'NONE!!')

    Features.set_verbNet(";!".join(vn.classids(token['lemma'])))
    Features.set_distances(token['id'],
                           len(sentence['tokens']) - (token['id'] + 1))

    quoteMarkers = findQuoteMarkers(sentence)
    FEATinQuotes = 'False'

    for (beg, end) in quoteMarkers:
        if idVerb > beg and idVerb < end:
            Features.set_insideQuotes()

    (depth, parentNode, parentSiblings) = syntacticFeatures
    Features.set_syntactic(depth, parentNode, ";!".join(parentSiblings))

    Features.makeList()
    rowOfFeats = Features.getList()

    return rowOfFeats
コード例 #40
def GetVerbnetRestrictions(vnclass):
    role_restrictions = {}

    while True:
        for role in vnclass.findall('THEMROLES/THEMROLE'):
            restrictions = role.find('SELRESTRS')
            if restrictions is not None and len(restrictions):
                restriction_set = set()
                for restriction in restrictions.findall('SELRESTR'):
                    predicate = restriction.attrib
                    restriction_set.add(
                        (predicate['Value'], predicate['type']))

                total = (restrictions.get('logic',
                                          'and'), list(restriction_set))
                role_restrictions[role.attrib['type']] = total

        if vnclass.tag == 'VNCLASS':
            break
        else:
            parent_class = vnclass.attrib['ID'].rsplit('-', 1)[0]
            vnclass = verbnet.vnclass(parent_class)

    return role_restrictions


vnclasses = verbnet.classids('drink')
v = verbnet.vnclass('39.1-2')
GetVerbnetRestrictions(v)
コード例 #41
from nltk.corpus import verbnet as vn
from nltk.corpus import framenet as fn
from nltk.corpus import propbank as pb

word1 = "melt"
word2 = "oxidize"

input = word1

vn_results = vn.classids(lemma=input)

if not vn_results:
    print(input + ' not in verbnet.')
else:
    print('verbnet:')
    for ele in vn_results:
        print(ele)
    print("")

fn_results = fn.frames_by_lemma(input)

if not fn_results:
    print(input + ' not in framenet.')
else:
    print('framenet:')
    for ele in fn_results:
        print(ele)
    print("")

pb_results = []
try:
    # assumed completion: the original snippet was truncated after "try:".
    # rolesets(baseform) raises ValueError when no PropBank frame file exists.
    pb_results = pb.rolesets(input)
except ValueError:
    pass

if not pb_results:
    print(input + ' not in propbank.')
else:
    print('propbank:')
    for ele in pb_results:
        print(ele)

コード例 #42
generalThing = datum.thing
verbnetRoot = generalThing.get("verbnet")
wordnetRoot = generalThing.find("wordnet")
class_ = verbnetRoot.get("class")
verbclassID = verbnetRoot.get("verb class id")
verbroot = verbnetRoot.get("verbroot")
example = verbnetRoot.get("example")
semantics = verbnetRoot.get("semantics")
syntax = verbnetRoot.get("syntax")
verbclass_ = verbnetRoot.get("verb class")
description = verbnetRoot.get("description")
semanticsArguments = verbnetRoot.get("semantics argument")
syntaxArguments = verbnetRoot.get("syntax argument")
syntaxFramesKatum = verbnetRoot.get("syntactic argument")
semanticsFramesKatum = verbnetRoot.get("semantics predicate")
predicateValue = verbnetRoot.get("predicate value")
themroles = verbnetRoot.get("thematic role")
roleType = verbnetRoot.get("role")
listOfAllLemmas = vn.lemmas()
uniqueClassIDs = []
for lemma in listOfAllLemmas:
    uniqueClassIDs.extend(vn.classids(lemma))
uniqueClassIDs = list(set(uniqueClassIDs))
processClassID(uniqueClassIDs)
for v in vn.lemmas():
    verbRootInstance = verbroot.get(v)
    for verbclass in vn.classids(v):
        verbRootInstance._is(classToKatumDict[verbclass], False)

generalThing.save('wordnet-verbnet.datum')
コード例 #43
def findpassives(sent):
    # Feature extraction code here.
    """Given a sentence, tag it and print if we think it's a passive-voice
    formation."""
    lancaster_stemmer = LancasterStemmer()
    tagged = tag_sentence(sent)
    tags = [tag for (word, tag) in tagged]
    ansi = []
    # print sent
    if passivep(tags):
        #file.write(oneline(sent))
        blob=TextBlob(oneline(sent))
        flag =True
        prevnoun=""
        negative=0
        number=0
        verb=""
        nextnoun=""
        for word, pos in blob.tags:
            #print word,pos
            if (pos=='NN' or pos =='NNP') and flag== True:
                prevnoun= word
            if (pos=='RB'):
                negative=1
            if (pos=='CD'):
                number= word
            if (pos=='VBG' or pos=='RB' or pos=='VBN'or pos=='VB') and flag==True:
                verb=word
                flag= False
            if (pos=='NN' or pos=='NNP') and flag== False:
                nextnoun=word
                break
        lancaster_stemmer.stem(verb)
        #print verb
        if verb=="":
            ansi.append([0])
            ansi.append(negative)
            ansi.append(number)
        elif len(verbnet.classids(verb))==0:
            ans= prevnoun+" "+verb+" "+nextnoun+" "

            ansi.append([0])
            ansi.append(negative)
            ansi.append(number)
        else:
            #ans1=verbnet.lemmas()[0:3620].index(verb)
            temp=verbnet.classids(verb)
            ans1 = [verbnet.classids().index(i) for i in temp]
            ansi.append(ans1)
            ansi.append(negative)
            ansi.append(number)
        #fileans.write(ans+'\n')
        result.append(ansi)
        if(len(ansi)==0):
            ansi=[[0],0,0]
        print ansi
        return ansi


    else:
        #file1.write(oneline(sent))
        blob=TextBlob(oneline(sent))
        flag1 =True
        prevnoun1=""
        verb1=""
        nextnoun1=""
        negative=0
        number=0
        for word, pos in blob.tags:
            #print word,pos
            if (pos=='NN' or pos =='NNP') and flag1== True:
                prevnoun1= word
            if (pos=='RB'):
                negative=1
            if (pos=='CD'):
                number= word
            if (pos=='VBG' or pos=='RB' or pos=='VBN'or pos=='VB') and flag1==True:
                verb1=word
                flag1= False
            if (pos=='NN' or pos=='NNP') and flag1== False:
                nextnoun1=word
                break
        lancaster_stemmer.stem(verb1)
        #print verb1
        if verb1=="":
            ansi.append([0])
            ansi.append(negative)
            ansi.append(number)
        elif len(verbnet.classids(verb1))==0:
            ans= prevnoun1+" "+verb1+" "+nextnoun1+" "

            ansi.append([0])
            ansi.append(negative)
            ansi.append(number)

        else:
            #ans1=ans1=verbnet.lemmas()[0:3620].index(verb1)
            temp=verbnet.classids(verb1)
            ans1 = [verbnet.classids().index(i) for i in temp]
            ansi.append(ans1)
            ansi.append(negative)
            ansi.append(number)

        if(len(ansi)==0):
            ansi=[[0],0,0]
        print ansi
        return ansi
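
A small illustrative sketch of the class-to-index mapping used above (the lemma 'give' is just an example): each class id is mapped to its position in the full VerbNet class inventory.

from nltk.corpus import verbnet

all_ids = verbnet.classids()           # full inventory of class ids
verb_ids = verbnet.classids('give')    # classes listed for one verb
indices = [all_ids.index(c) for c in verb_ids]
print(indices)
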
コード例 #44
def print_if_passive(sent):
    """Given a sentence, tag it and print if we think it's a passive-voice
    formation."""
    lancaster_stemmer = LancasterStemmer()
    tagged = tag_sentence(sent)
    tags = [tag for (word, tag) in tagged]

    if passivep(tags):
        file.write(oneline(sent))
        blob=TextBlob(oneline(sent))
        flag =True
        prevnoun=""
        verb=""
        nextnoun=""
        for word, pos in blob.tags:
            if (pos=='NN' or pos =='NNP') and flag== True:
                prevnoun= word
            if (pos=='VBG' or pos=='RB' or pos=='VBN') and flag==True:
                verb=word
                flag= False
            if (pos=='NN' or pos=='NNP') and flag== False:
                nextnoun=word
                break
        lancaster_stemmer.stem(verb)
        print verb
        if len(verbnet.classids(verb))==0:
            ans= prevnoun+" "+verb+" "+nextnoun+" "
        else:
            ans1=verbnet.classids(verb)
            ansstring=''.join(ans1)
            ans= prevnoun+" "+ansstring+" "+nextnoun+" "
        fileans.write(ans+'\n')

        #print verbnet.classids('acclaim')
        #print "passive:", oneline(sent)
    else:
        file1.write(oneline(sent))
        blob=TextBlob(oneline(sent))
        flag1 =True
        prevnoun1=""
        verb1=""
        nextnoun1=""
        for word, pos in blob.tags:
            #print word,pos
            if (pos=='NN' or pos =='NNP') and flag1== True:
                prevnoun1= word
            if (pos=='VBG' or pos=='RB' or pos=='VBN') and flag1==True:
                verb1=word
                flag1= False
            if (pos=='NN' or pos=='NNP') and flag1== False:
                nextnoun1=word
                break
        lancaster_stemmer.stem(verb1)
        print verb1
        if len(verbnet.classids(verb1))==0:
            ans= prevnoun1+" "+verb1+" "+nextnoun1+" "
        else:
            ans1=verbnet.classids(verb1)
            ansstring=''.join(ans1)
            ans= prevnoun1+" "+ansstring+" "+nextnoun1+" "
        fileans.write(ans+'\n')