Example #1
def sdfprocess(rvdata, partidx):
    os.environ["MALT_PARSER"] = "/home/cosmo/Dropbox/Purdue/nlp/maltparser-1.8"
    parser = MaltParser(
        mco='engmalt.poly-1.7',
        working_dir='/home/cosmo/Dropbox/Purdue/nlp/maltparser-1.8',
        additional_java_args=['-Xmx5000m'])
    sdfdata = []
    cnn = 1
    # demo()
    print parser.raw_parse("I am a student.")
    for eg in rvdata:
        if cnn % 100 == 0:
            print "%f%% of document %d finished" % (cnn * 100 * 1.0 /
                                                    len(rvdata), partidx + 1)
        cmt = eg[3].decode('utf-8')  # 3 is the index of the comment field
        sentences = nltk.sent_tokenize(cmt)
        sdfparsed = [parser.raw_parse(sentence) for sentence in sentences]
        sdfdata.append(eg[:3] + [sdfparsed])
        # print cnn
        print sdfparsed
        # print sdfdata
        cnn += 1
        if cnn > 5: break

    return sdfdata
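The snippet above targets the older NLTK MaltParser API (the working_dir/mco keyword arguments, raw_parse returning a graph directly, and Python 2 print statements). For comparison, here is a minimal sketch of the equivalent setup against the NLTK 3 interface; the paths below simply reuse the ones from the example and are placeholders for your own installation.

from nltk.parse.malt import MaltParser

# NLTK 3 style: pass the MaltParser installation directory and the .mco model file.
# Both paths are placeholders taken from the example above.
parser = MaltParser("/home/cosmo/Dropbox/Purdue/nlp/maltparser-1.8",
                    "/home/cosmo/Dropbox/Purdue/nlp/maltparser-1.8/engmalt.poly-1.7.mco",
                    additional_java_args=["-Xmx5000m"])

# parse_one() takes a token list and returns a single DependencyGraph.
graph = parser.parse_one("I am a student .".split())
print(graph.tree())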
Example #2
    def dep_parse(self, sentence='every cat leaves'):
        #Lazy-initialize the depparser
        if self.depparser is None:
            self.depparser = MaltParser(tagger=self.get_pos_tagger())
        if not self.depparser._trained:
            self.train_depparser()

        return [self.depparser.parse(sentence, verbose=self.verbose)]
Example #3
    def parse_sents(self, sents):
        os.environ['MALT_PARSER'] = config['MALT_WORKING_DIR'] + '/malt.jar'
        parser = NltkMaltParser(working_dir=config['MALT_WORKING_DIR'],
                                mco=config['MALT_MCO'],
                                additional_java_args=config['MALT_JAVA_ARGS'],
                                tagger=self.tagger)
        graphs = [list(graph)[0] for graph in parser.parse_sents(sents)]

        # Sometimes, there is an empty graph at the end of the list. Delete it.
        if len(graphs) > 0 and len(graphs[-1].nodes) == 1:
            del graphs[-1]

        return graphs
Example #5
class Parser(object):
    '''Write something here.'''
    nouns = defaultdict(set)
    verbs = defaultdict(set)

    def __init__(self, all_modules, malt_working_dir=None, malt_mco=None):
        '''...'''

        if not malt_working_dir:
            import os
            malt_working_dir = os.environ['MALTPARSERHOME']

        if not malt_mco:
            malt_mco = 'engmalt.linear-1.7'

        self._parser = MaltParser(working_dir=malt_working_dir, mco=malt_mco, additional_java_args=['-Xmx512m'])

        for module in all_modules:
            for noun in module.nouns:
                self.nouns[noun].add(module)
            for verb in module.verbs:
                self.verbs[verb].add(module)

    def _is_verb(self, word):
        return word in self.verbs

    def _is_noun(self, word):
        return word in self.nouns

    def _check_command(self, noun, verb):
        if not noun or not verb:
            return False
        print 'Noun:', noun
        print 'Verb:', verb
        return self.nouns[noun] & self.verbs[verb]

    def parse(self, command):
        '''...'''
        words = word_tokenize(command)
        pos = pos_tag(words)
        graph = self._parser.tagged_parse(pos)
        tree = graph.tree()
        print "Parse Tree: ", tree

        c_noun = None
        c_verb = None

        for subtree in tree.subtrees():
            words = [x for x in subtree if not isinstance(x, Tree)] + [subtree.node]
            for x in words:
                # TODO: Use the tagged sentence so that we can resolve this better
                if self._is_verb(x):
                    c_verb = x
                elif self._is_noun(x):
                    c_noun = x
                command_modules = self._check_command(c_noun, c_verb)
                if command_modules:
                    yield Command(command, c_noun, c_verb, [x for x in words if x not in (c_noun, c_verb)])
                    c_noun = None
Example #6
def setup_module():
    import pytest
    from nltk.parse.malt import MaltParser

    try:
        depparser = MaltParser("maltparser-1.7.2")
    except LookupError as e:
        pytest.skip("MaltParser is not available")
def setup_module(module):
    from nose import SkipTest
    from nltk.parse.malt import MaltParser

    try:
        depparser = MaltParser("maltparser-1.7.2")
    except LookupError:
        raise SkipTest("MaltParser is not available")
Example #8
def demo():
    discourse_demo()

    tagger = RegexpTagger([('^(chases|runs)$', 'VB'), ('^(a)$', 'ex_quant'),
                           ('^(every)$', 'univ_quant'), ('^(dog|boy)$', 'NN'),
                           ('^(he)$', 'PRP')])
    depparser = MaltParser(tagger=tagger)
    drt_discourse_demo(
        DrtGlueReadingCommand(remove_duplicates=False, depparser=depparser))
Example #9
def mltprocess(tp, path, filenamels, docid):
    parser = MaltParser(working_dir='/home/cosmo/Dropbox/Purdue/nlp/maltparser-1.8/maltparser-1.8.jar',
                        mco='engmalt.poly-1.7.mco',
                        additional_java_args=['-Xmx5000m'])
    sdfdata = []
    for i in range(len(filenamels)):
        if (i+1) % 100 == 0:
            print "%f%% of document %d of %s finished" % ((i+1)*100*1.0/len(filenamels), docid, tp)
        filename = filenamels[i]
        h = open(path + filename, 'r')
        lines = h.readlines()
        h.close()
        headraw, bodyraw = preprocess(lines[0]), preprocess(lines[1])

        sentences = [headraw] + nltk.sent_tokenize(bodyraw)
        sdfparsed = [parser.raw_parse(sentence) for sentence in sentences]
        sdfdata.append(sdfparsed)
        # print sdfparsed
        # print sdfdata      
        # if i > 5: break
    return sdfdata
Example #10
def setup_module():
    import pytest

    from nltk.parse.malt import MaltParser

    try:
        depparser = MaltParser()
    except (AssertionError, LookupError) as e:
        pytest.skip("MaltParser is not available")
    def _init_glue(self):
        tagger = RegexpTagger([
            ('^(David|Mary|John)$', 'NNP'),
            ('^(walks|sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$',
             'VB'), ('^(go|order|vanish|find|approach)$', 'VB'),
            ('^(a)$', 'ex_quant'), ('^(every)$', 'univ_quant'),
            ('^(sandwich|man|dog|pizza|unicorn|cat|senator)$', 'NN'),
            ('^(big|gray|former)$', 'JJ'), ('^(him|himself)$', 'PRP')
        ])

        depparser = MaltParser(tagger=tagger)
        self._glue = DrtGlue(depparser=depparser, remove_duplicates=False)
Example #12
def demo():
    discourse_demo()

    tagger = RegexpTagger([
        ("^(chases|runs)$", "VB"),
        ("^(a)$", "ex_quant"),
        ("^(every)$", "univ_quant"),
        ("^(dog|boy)$", "NN"),
        ("^(he)$", "PRP"),
    ])
    depparser = MaltParser(tagger=tagger)
    drt_discourse_demo(
        DrtGlueReadingCommand(remove_duplicates=False, depparser=depparser))
class MaltParser(DependencyParserWrapper):
    # currently this can only be run from the malt parser directory
    # TODO: make more general/easy to run
    def __init__(self, version="maltparser-1.9.2", model="engmalt.linear-1.7.mco"):
        super().__init__()
        root = os.getcwd()
        version_path = os.path.join(root, "schemata", "parse", "maltparser", version)
        model_path = os.path.join(root, "schemata", "parse", "maltparser", model)
        self.base = MP(version_path, model_path)

    def get_spans(self, sent):
        dparse = self.base.parse_one(sent.split())
        heads = [node['head'] for _, node in sorted(dparse.nodes.items())][1:]
        tree = DependencyParserWrapper.head_to_tree(heads)
        non_singletons = DependencyParserWrapper.compute_spans(tree)
        singletons = [(n, n+1) for n in range(len(heads))]
        return set(non_singletons) | set(singletons)
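A usage sketch for the wrapper class defined above (not NLTK's own MaltParser), assuming it is run from a directory containing the schemata/parse/maltparser/ layout it expects; the sentence and variable names are only illustrative.

wrapper = MaltParser()  # defaults to maltparser-1.9.2 with engmalt.linear-1.7.mco
spans = wrapper.get_spans("I shot an elephant in my pajamas .")
print(sorted(spans))    # (start, end) token spans derived from the dependency heads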
Example #14
    def __init__(self, all_modules, malt_working_dir=None, malt_mco=None):
        '''...'''

        if not malt_working_dir:
            import os
            malt_working_dir = os.environ['MALTPARSERHOME']

        if not malt_mco:
            malt_mco = 'engmalt.linear-1.7'

        self._parser = MaltParser(working_dir=malt_working_dir, mco=malt_mco, additional_java_args=['-Xmx512m'])

        for module in all_modules:
            for noun in module.nouns:
                self.nouns[noun].add(module)
            for verb in module.verbs:
                self.verbs[verb].add(module)
Example #15
def demo(show_example=-1):
    examples = ['David sees Mary',
                'David eats a sandwich',
                'every man chases a dog',
                'every man believes a dog sleeps',
                'John gives David a sandwich',
                'John chases himself',
#                'John persuades David to order a pizza',
#                'John tries to go',
#                'John tries to find a unicorn',
#                'John seems to vanish',
#                'a unicorn seems to approach',
                'every big cat leaves',
                'every gray cat leaves',
                'every big gray cat leaves',
                'a former senator leaves']

    print '============== DEMO =============='
    
    tagger = RegexpTagger(
        [('^(David|Mary|John)$', 'NNP'),
         ('^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$', 'VB'),
         ('^(go|order|vanish|find|approach)$', 'VB'),
         ('^(a)$', 'ex_quant'),
         ('^(every)$', 'univ_quant'),
         ('^(sandwich|man|dog|pizza|unicorn|cat|senator)$', 'NN'),
         ('^(big|gray|former)$', 'JJ'),
         ('^(him|himself)$', 'PRP')
    ])

    depparser = MaltParser(tagger=tagger)
    glue = Glue(depparser=depparser, verbose=False)
    
    for (i, sentence) in enumerate(examples):
        if i==show_example or show_example==-1:
            print '[[[Example %s]]]  %s' % (i, sentence)
            for reading in glue.parse_to_meaning(sentence):
                print reading.simplify()
            print ''
Example #16
#!/usr/bin/env python
from nltk.parse.malt import MaltParser
parser = MaltParser('maltparser-1.8.1', 'espmalt-1.0.mco')
txt = "This is a test sentence"
parser.train_from_file('Tibidabo_Treebank.txt')
parser.raw_parse(txt)
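The final raw_parse call above discards its return value. A minimal follow-up sketch, assuming Tibidabo_Treebank.txt is a CoNLL-formatted treebank (which is what train_from_file expects) and that training succeeded:

graph = parser.parse_one(txt.split())  # naive whitespace tokenization
print(graph.tree())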
Example #17
import sys
from nltk.parse.malt import MaltParser

PATH_TO_MALTPARSER = "maltparser-1.9.2"
PATH_TO_MODEL = "kaist-conll.mco"


def print_usage():
    print("usage: $ python3 parser.py <input text>")


if __name__ == '__main__':
    argv = sys.argv[1:]
    argc = len(sys.argv)

    if (argc != 2):
        print_usage()
        sys.exit()

    user_input = argv[0]
    tokens = user_input.split()

    mp = MaltParser(PATH_TO_MALTPARSER, PATH_TO_MODEL)
    graph = mp.parse_one(tokens).tree()
    print(graph)
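parse_one returns a DependencyGraph, so the dependency structure itself is available in addition to the tree view. A small optional sketch that could follow the last two lines of the script above (dep_graph is a hypothetical extra variable):

    dep_graph = mp.parse_one(tokens)
    print(dep_graph.to_conll(4))  # one token per line: word, POS, head index, relation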
Example #18
def get_feature_names(pairs):
    feature_names = []
    c_vect = CountVectorizer(min_df=5, ngram_range=(1,3), tokenizer=word_tokenize)
    pos_vect = CountVectorizer(tokenizer=iterate_pos, ngram_range=(1,3), lowercase=False)
    mp = MaltParser("/home/lena/opt/maltparser-1.9.2","/home/lena/opt/russian.mco")
    model_d = Doc2Vec.load('vec/model_d.w2v')
    model_w = Word2Vec.load('vec/model_w.w2v')
    DataDict = {'edu1_position': [],
                'edu2_position': [],
                'edu1_endsent': [],
                'edu1_startsent': [],
                'edu2_endsent': [],
                'edu2_startsent': [],
                'edu1_len': [],
                'edu2_len': [],
                'same_tokens': [],
                'distance': [],
                'attribution1': [],
                'cause-effect1': [],
                'concession1': [],
                'condition1': [],
                'contrast1': [],
                'elaboration1': [],
                'joint1': [],
                'purpose1' :[],
                'attribution2': [],
                'cause-effect2': [],
                'concession2': [],
                'condition2': [],
                'contrast2': [],
                'elaboration2': [],
                'joint2': [],
                'purpose2' :[]}
    for pair in pairs:
        markers_dict1 = has_markers(pair.edu1.lemmatized_tokens)
        markers_dict2 = has_markers(pair.edu2.lemmatized_tokens)
        DataDict['edu1_position'].append(int(pair.edu1.position))
        DataDict['edu2_position'].append(int(pair.edu2.position))
        DataDict['edu1_endsent'].append(int(pair.edu1.sentence_end))
        DataDict['edu2_endsent'].append(int(pair.edu2.sentence_end))
        DataDict['edu1_startsent'].append(int(pair.edu1.sentence_start))
        DataDict['edu2_startsent'].append(int(pair.edu2.sentence_start))
        DataDict['edu1_len'].append(len(pair.edu1.tokens))
        DataDict['edu2_len'].append(len(pair.edu2.tokens))
        # number of overlapping tokens (by lemma)
        DataDict['same_tokens'].append(len(set(pair.edu1.lemmatized_tokens).intersection(
            pair.edu2.lemmatized_tokens)))
        DataDict['distance'].append(int(pair.edu2.position)-int(pair.edu1.position)-1)
        for rel_name in ['attribution','cause-effect','concession','condition','contrast','elaboration','joint','purpose']:
            DataDict[rel_name+'1'].append(markers_dict1[rel_name])
            DataDict[rel_name+'2'].append(markers_dict2[rel_name])


    X = pd.DataFrame(DataDict)
    feature_names.extend(X.columns)
    # word-level count vectorizer
    all_texts = [pair.edu1.text for pair in pairs] + [pair.edu2.text for pair in pairs]
    c_vect.fit(all_texts)
    # edus1_vect = c_vect.transform([pair.edu1.text for pair in pairs])
    # edus2_vect = c_vect.transform([pair.edu2.text for pair in pairs])

    feature_names.extend([i+'1' for i in c_vect.get_feature_names() ])
    feature_names.extend([i+'2' for i in c_vect.get_feature_names() ])

    # count vectorizer over POS tags
    all_pos = [pair.edu1.pos for pair in pairs] + [pair.edu2.pos for pair in pairs]
    pos_vect.fit(all_pos)
    pos1_vect = pos_vect.transform([pair.edu1.pos for pair in pairs])
    pos2_vect = pos_vect.transform([pair.edu2.pos for pair in pairs])

    feature_names.extend([i+'1' for i in pos_vect.get_feature_names() ])
    feature_names.extend([i+'2' for i in pos_vect.get_feature_names() ])

    # Doc2Vec - vector for each EDU (elementary discourse unit)
    d2v1 = csr_matrix(np.array([model_d.infer_vector(pair.edu1.tokens) for pair in pairs]))
    d2v2 = csr_matrix(np.array([model_d.infer_vector(pair.edu2.tokens) for pair in pairs]))

    feature_names.extend(['d2v1'+str(i) for i in range(100)])
    feature_names.extend(['d2v2'+str(i) for i in range(100)])

    # Word2Vec - vectors of the first and last token of each EDU
    w2v1_first = csr_matrix(np.array([w2v_word_vector(model_w, pair.edu1.tokens[0]) for pair in pairs]))
    w2v2_first = csr_matrix(np.array([w2v_word_vector(model_w, pair.edu2.tokens[0]) for pair in pairs]))
    w2v1_last = csr_matrix(np.array([w2v_word_vector(model_w, pair.edu1.tokens[-1]) for pair in pairs]))
    w2v2_last = csr_matrix(np.array([w2v_word_vector(model_w, pair.edu2.tokens[-1]) for pair in pairs]))

    feature_names.extend(['w2v1_first'+str(i) for i in range(100)])
    feature_names.extend(['w2v2_first'+str(i) for i in range(100)])

    feature_names.extend(['w2v1_last'+str(i) for i in range(100)])
    feature_names.extend(['w2v2_last'+str(i) for i in range(100)])

    # # SYNTAX: Word2Vec - vectors of the EDU heads, POS tags of the EDU heads
    # head_ids_edu1 = [detect_head(mp.parse_one(pair.edu1.tokens)) for pair in pairs]
    # head_ids_edu2 = [detect_head(mp.parse_one(pair.edu2.tokens)) for pair in pairs]
    # head_vectors_edu1 = csr_matrix(np.array([w2v_word_vector(model_w, pairs[i].edu1.tokens[head_ids_edu1[i]]) for i in range(len(pairs))]))
    # head_vectors_edu2 = csr_matrix(np.array([w2v_word_vector(model_w, pairs[i].edu2.tokens[head_ids_edu2[i]]) for i in range(len(pairs))]))
    # head_pos_edu1 = pos_vect.transform([[pairs[i].edu1.pos[head_ids_edu1[i]]] for i in range(len(pairs))])
    # head_pos_edu2 = pos_vect.transform([[pairs[i].edu2.pos[head_ids_edu2[i]]] for i in range(len(pairs))])

    # X_sparse = csr_matrix(np.array(X))
    # X_concat = hstack((X_sparse, edus1_vect, edus2_vect, pos1_vect, pos2_vect, d2v1, d2v2,
    #                    w2v1_first, w2v2_first, w2v1_last, w2v2_last))
    print(len(feature_names))

    return feature_names
Example #19
def drt_discourse_demo(reading_command=None):
    """
    Illustrate the various methods of C{DiscourseTester}
    """
    dt = DiscourseTester(['every dog chases a boy', 'he runs'],
                         reading_command)
    dt.models()
    print
    dt.sentences()
    print
    dt.readings()
    print
    dt.readings(show_thread_readings=True)
    print
    dt.readings(filter=True, show_thread_readings=True)


def spacer(num=30):
    print '-' * num


if __name__ == '__main__':
    discourse_demo()

    tagger = RegexpTagger([('^(chases|runs)$', 'VB'), ('^(a)$', 'ex_quant'),
                           ('^(every)$', 'univ_quant'), ('^(dog|boy)$', 'NN'),
                           ('^(he)$', 'PRP')])
    depparser = MaltParser(tagger=tagger)
    drt_discourse_demo(
        DrtGlueReadingCommand(remove_duplicates=False, depparser=depparser))
Example #20
    #  S VP NP Bob <-> Bob Np VP S
    #  always the reversed list goes first and that's it?
    sublist1 = path1[j:]
    #print("sublist1",sublist1)
    if j < len(path2) - 1:
        j = j + 1
    sublist2 = path2[j:]
    #print("sublist2",sublist2)
    sublist2.reverse()
    #print("sublist2",sublist2)
    shortestpath = sublist2 + sublist1

    return shortestpath


dparser = MaltParser('../data/grammars/maltparser-1.8.1/',
                     'engmalt.linear-1.7.mco')
pt = dparser.parse_one('I shot an elephant in my pajamas .'.split()).tree()

# print(pt)

# print(pt)
# print(shortestPath(pt,'I','pajamas'))
# print(shortestPath(pt,'I','pajamas'))
# print(shortestPath(pt,'elephant','pajamas'))
# print(shortestPath(pt,'I','elephant'))

# parsing many sentences

tagged_sents = [
    "The other day I went to the beach.".split(),
    "It was a hot day so I swimmed in the water.".split()
Example #21
class Glue(object):
    def __init__(self, semtype_file=None, remove_duplicates=False, 
                 depparser=None, verbose=False):
        self.verbose = verbose
        self.remove_duplicates = remove_duplicates
        self.depparser = depparser
        
        if semtype_file:
            self.semtype_file = semtype_file
        else:
            self.semtype_file = 'glue.semtype'
        
    def train_depparser(self, depgraphs=None):
        if depgraphs:
            self.depparser.train(depgraphs)
        else:
            self.depparser.train_from_file(data.find(os.path.join('grammars', 'sample_grammars', 'glue_train.conll')))
    
    def parse_to_meaning(self, sentence):
        readings = []
        for agenda in self.parse_to_compiled(sentence):
            readings.extend(self.get_readings(agenda))
        return readings
    
    def get_readings(self, agenda):
        readings = []
        agenda_length = len(agenda)
        atomics = dict()
        nonatomics = dict()
        while agenda: # is not empty
            cur = agenda.pop()
            glue_simp = cur.glue.simplify()
            if isinstance(glue_simp, linearlogic.ImpExpression): # if cur.glue is non-atomic
                for key in atomics:
                    try:
                        if isinstance(cur.glue, linearlogic.ApplicationExpression):
                            bindings = cur.glue.bindings
                        else:
                            bindings = linearlogic.BindingDict()
                        glue_simp.antecedent.unify(key, bindings)
                        for atomic in atomics[key]:
                            if not (cur.indices & atomic.indices): # if the sets of indices are disjoint
                                try:
                                    agenda.append(cur.applyto(atomic))
                                except linearlogic.LinearLogicApplicationException:
                                    pass
                    except linearlogic.UnificationException:
                        pass
                try:
                    nonatomics[glue_simp.antecedent].append(cur)
                except KeyError:
                    nonatomics[glue_simp.antecedent] = [cur]
    
            else: # else cur.glue is atomic
                for key in nonatomics:
                    for nonatomic in nonatomics[key]:
                        try:
                            if isinstance(nonatomic.glue, linearlogic.ApplicationExpression):
                                bindings = nonatomic.glue.bindings
                            else:
                                bindings = linearlogic.BindingDict()
                            glue_simp.unify(key, bindings)
                            if not (cur.indices & nonatomic.indices): # if the sets of indices are disjoint
                                try:
                                    agenda.append(nonatomic.applyto(cur))
                                except linearlogic.LinearLogicApplicationException:
                                    pass
                        except linearlogic.UnificationException:
                            pass
                try:
                    atomics[glue_simp].append(cur)
                except KeyError:
                    atomics[glue_simp] = [cur]
                    
        for entry in atomics:
            for gf in atomics[entry]:
                if len(gf.indices) == agenda_length:
                    self._add_to_reading_list(gf, readings)
        for entry in nonatomics:
            for gf in nonatomics[entry]:
                if len(gf.indices) == agenda_length:
                    self._add_to_reading_list(gf, readings)
        return readings
            
    def _add_to_reading_list(self, glueformula, reading_list):
        add_reading = True
        if self.remove_duplicates:
            for reading in reading_list:
                try:
                    if reading.tp_equals(glueformula.meaning, 'Prover9'):
                        add_reading = False
                        break
                except:
                    #if there is an exception, the syntax of the formula  
                    #may not be understandable by the prover, so don't
                    #throw out the reading.
                    pass
        if add_reading:
            reading_list.append(glueformula.meaning)
        
    def parse_to_compiled(self, sentence='a man sees Mary'):
        gfls = [self.depgraph_to_glue(dg) for dg in self.dep_parse(sentence)]
        return [self.gfl_to_compiled(gfl) for gfl in gfls]
    
    def dep_parse(self, sentence='every cat leaves'):
        #Lazy-initialize the depparser
        if self.depparser is None:
            self.depparser = MaltParser(tagger=self.get_pos_tagger())
        if not self.depparser._trained:
            self.train_depparser()

        return [self.depparser.parse(sentence, verbose=self.verbose)]
    
    def depgraph_to_glue(self, depgraph):
        return self.get_glue_dict().to_glueformula_list(depgraph)
    
    def get_glue_dict(self):
        return GlueDict(self.semtype_file)
    
    def gfl_to_compiled(self, gfl):
        index_counter = Counter()
        return_list = []
        for gf in gfl:
            return_list.extend(gf.compile(index_counter))
        
        if self.verbose:
            print 'Compiled Glue Premises:'
            for cgf in return_list:
                print cgf    
        
        return return_list
    
    def get_pos_tagger(self):
        regexp_tagger = RegexpTagger(
            [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
             (r'(The|the|A|a|An|an)$', 'AT'),   # articles
             (r'.*able$', 'JJ'),                # adjectives
             (r'.*ness$', 'NN'),                # nouns formed from adjectives
             (r'.*ly$', 'RB'),                  # adverbs
             (r'.*s$', 'NNS'),                  # plural nouns
             (r'.*ing$', 'VBG'),                # gerunds
             (r'.*ed$', 'VBD'),                 # past tense verbs
             (r'.*', 'NN')                      # nouns (default)
        ])
        brown_train = brown.tagged_sents(categories='news')
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)
        
        #Override particular words
        main_tagger = RegexpTagger(
            [(r'(A|a|An|an)$', 'ex_quant'),
             (r'(Every|every|All|all)$', 'univ_quant')
        ], backoff=trigram_tagger)
        
        return main_tagger
Example #22
    def get_maltparse_tagger():
        maltparse_dir = os.environ['MALTPARSE_DIR']
        maltparse_model = os.path.join(maltparse_dir, 'TRL_maltparser_modul_ES.rar')  # Provided by IULA as a pretrained model, but it needs to be a .mco file, not a .rar.
        return MaltParser(maltparse_dir)
Example #23
]
# hack for subprocess.DEVNULL on python 2.7
try:
    from subprocess import DEVNULL  # py3k
except ImportError:
    import os
    import subprocess
    subprocess.DEVNULL = open(os.devnull, 'wb')

os.environ['STANFORD_MODELS'] = 'libs/stanford-postagger-2018-10-16/models'

tagger = StanfordPOSTagger(
    'english-bidirectional-distsim.tagger',
    'libs/stanford-postagger-2018-10-16/stanford-postagger.jar')
parser = MaltParser(os.path.dirname(os.path.abspath(__file__)) +
                    '/libs/maltparser-1.9.1',
                    'libs/engmalt.linear-1.7.mco',
                    tagger=tagger.tag)
stemmer = WordNetLemmatizer()


def wsd_of(tree, node):
    head, pobj = getLink(tree, node, 'head'), getLink(tree, node, 'dep:pobj')
    if head['tag'] == 'CD' or head['word'] == 'many':  # 3 of them
        return 'f_part_whole'
    elif pobj['tag'] == 'CD':  # a total of 20
        return 'Scale_value'
    else:
        return 'Entity_association'


frames = {
Features
1. unigrams
2. bigrams
3. 
'''

# MaltParser for dependency triples
# how to get maltparser to work: http://stackoverflow.com/questions/13207394/step-by-step-to-getting-malt-parser-in-nltk-to-work
# Note: os.system("export ...") would only set the variable in a throw-away child
# shell, so set the environment of this process directly instead.
os.environ['MALT_PARSER'] = os.path.abspath("maltparser-1.8.1")
os.environ['MALT_MODEL'] = os.path.abspath("engmalt.linear-1.7.mco")
mp = MaltParser('maltparser-1.8.1', 'engmalt.linear-1.7.mco')

# Name entity tagger
#add the jar and model via their path:
ner_jar = os.path.abspath("stanford-ner-2016-10-31/stanford-ner-3.7.0.jar")
ner_model = os.path.abspath(
    "stanford-ner-2016-10-31/classifiers/english.all.3class.distsim.crf.ser.gz"
)
ner_st = StanfordNERTagger(ner_model, ner_jar)
pos_jar = os.path.abspath(
    "stanford-postagger-2016-10-31/stanford-postagger.jar")
pos_model = os.path.abspath(
    "stanford-postagger-2016-10-31/models/english-bidirectional-distsim.tagger"
)
pos_st = StanfordPOSTagger(pos_model, pos_jar)
    #  always the reversed list goes first and that's it?
    sublist1 = path1[j:]
    #print("sublist1",sublist1)
    if j < len(path2) - 1:
        j = j + 1
    sublist2 = path2[j:]
    #print("sublist2",sublist2)
    sublist2.reverse()
    #print("sublist2",sublist2)
    shortestpath = sublist2 + sublist1

    return shortestpath



dparser = MaltParser('../data/grammars/maltparser-1.8.1/', 'engmalt.linear-1.7.mco')
pt = dparser.parse_one('I shot an elephant in my pajamas .'.split()).tree()

# print(pt)


# print(pt) 
# print(shortestPath(pt,'I','pajamas'))
# print(shortestPath(pt,'I','pajamas'))
# print(shortestPath(pt,'elephant','pajamas'))
# print(shortestPath(pt,'I','elephant'))


# parsing many sentences

tagged_sents = [
Example #26
from role.corpus import *
from nltk.corpus import brown
from nltk.sem.glue import *
nltk.sem.logic._counter._value = 0
from nltk.parse.malt import MaltParser

# file = read_stanza_document_file("role/corpus/kant/fpmm_1_stanza.ann")
# tagged = [[(w.text, w.xpos) for w in s.words] for s in file.sentences]

brown_train = brown.tagged_sents(categories="news")
unigram_tagger = UnigramTagger(brown_train)
# bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
# trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)
main_tagger = RegexpTagger(
    [(r"(A|a|An|an|The|the)$", "ex_quant"),
     (r"(Every|every|All|all|Any|any)$", "univ_quant")],
    backoff=unigram_tagger,
)

depparser = MaltParser('./maltparser-1.9.2', tagger=main_tagger.tag)
glue = DrtGlue(depparser=depparser)
print(
    main_tagger.tag(
        "The grand jury produced no evidence that any irregularities took place"
        .split()))
readings = glue.parse_to_meaning(
    "The grand jury produced no evidence that any irregularities took place".
    split())