Code example #1
from nltk.corpus import treebank
from nltk.parse.stanford import StanfordParser


def main(transform_func=None, n=10):
    parser = StanfordParser(
        path_to_jar = "/cs/fs/home/hxiao/code/stanford-parser-full-2015-01-30/stanford-parser.jar",
        path_to_models_jar = "/cs/fs/home/hxiao/code/stanford-parser-full-2015-01-30/stanford-parser-3.5.1-models.jar",
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    )

    test_sents = treebank.sents()[-n:]

    print "len(test_sents) = %d" %(len(test_sents))

    if transform_func and callable(transform_func):
        print "transforming it using ", transform_func
        test_sents = [[transform_func(w) for w in s] 
                      for s in test_sents] # transform it

    print test_sents[:10]

    print "predicting"
    pred_parses = parser.parse_sents(test_sents)
    
    gold_parses = treebank.parsed_sents()[-n:]  # align gold trees with the evaluated test sentences
    
    print "evaluating"

    correct_n = gold_n = predicted_n = 0.0
    
    for gparse, pparse in zip(gold_parses, pred_parses):
        cn, gn, pn = precision_and_recall_stat(get_nodes_with_range(gparse), 
                                               get_nodes_with_range(pparse))
        correct_n += cn
        gold_n += gn
        predicted_n += pn
        
    print "Prediction: %f, Recall: %f" %(correct_n / predicted_n, correct_n / gold_n)
Code example #2
File: advanced.py  Project: hajoki/EliseMichon_HW3
def dependencies():
    #english_parser = StanfordParser('stanford-parser.jar', 'stanford-parser-3.6.0-models.jar')
    #english_parser.raw_parse_sents(("this is the english parser test", "the parser is from stanford parser"))                         
    # model_path expects the serialized grammar inside the models jar, not the jar
    # itself; pass the jar as path_to_models_jar and let model_path keep its default.
    parser = StanfordParser(path_to_models_jar=r"C:\Python27\stanford-parser-full-2015-12-09\stanford-parser-3.6.0-models.jar")
    sentences = parser.raw_parse_sents(("IBlood B cells secrete PROTX1  ( s )   upon stimulation via the PROTX2.", "Furthermore ,  blocking PROTX0 or PROTX0 had no effect on the levels of PROTX2 released in response to the anti -  PROTX1 mAb."))
    print sentences

    # GUI
    for line in sentences:
        for sentence in line:
            sentence.draw()
Code example #3
File: newsTest.py  Project: choon94/choon94.github.io
def parser():
	os.environ['STANFORD_PARSER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09'
	os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser.jar'
	os.environ['STANFORD_MODELS'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'

	eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',java_options="-mx2048m")
	for x in content:
		a = list(eng_parser.parse(x.split()))[0]
		print(a)
		# a.draw()

	eng_dep_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
	for x in content:
		a = list(eng_dep_parser.parse(x.split()))[0]
		for row in a.triples():
			print(row)
Code example #4
File: parse_viz.py  Project: qcs4tracy/NER-Project
class SyntaxTreeParser:
    def __init__(self):
        self.parser = StanfordParser()
        if not self.parser:
            raise RuntimeError('Stanford Parser could not be initialized.')
    
    def raw_parse(self, sent):
        tree = next(self.parser.raw_parse(sent))
        return tree

    def parse(self, sent):
        one_sent = sent
        if sent and not isinstance(sent[0], tuple):
            # plain token list: POS-tag it before calling tagged_parse
            one_sent = nltk.pos_tag(sent)
        tree = self.parser.tagged_parse(one_sent)
        return tree
Code example #5
def main():
    parser = StanfordParser(path_to_jar=script_wrapper.stanford_parser_jar, path_to_models_jar=script_wrapper.stanford_model_jar)
    st = StanfordNERTagger(model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
    raw_sent = "Dempsey was drafted by Major League Soccer club New England Revolution."
    sent = word_tokenize(raw_sent)
    ne_tuple = st.cur_tag(sent)  # ## need to write an interface for tokenized sentences (http://nlp.stanford.edu/software/crf-faq.shtml#tokenized)
    print ne_tuple
    
    print parser.raw_parse(raw_sent).next()

    return
    # find named entities
    f = 0
    ne_list = []
    for (ne, label) in ne_tuple:
        if label == 'PERSON':
            f = 1
        if f and label != 'PERSON':
            break
        if f:
            ne_list.append(ne)
    # print ne_list

    init_file(main_tree)
                    ####### my issue here: 1. don't know how to get NP. 2. is there a quicker way to find PERSON?
    # try head to ask who/what
    pattern = "S < NP=np"
    head = check_output(['bash',  ###add bash !!!!
                         tregex_path,
                         '-s',
                         pattern,
                         init_tree_file])
    print head

    def get_main_verbs(tree):
        pattern = '/(VB.?)/=main >+ (VP) (S > ROOT)'
        main_verbs = check_output(['bash',  ###add bash !!!!
                                   tregex_path,
                                   '-s',
                                   pattern,
                                   init_tree_file])
        print main_verbs
        main_verbs = main_verbs.split('\n')[:-1]
        main_verbs = [Tree.fromstring(main_verb) for main_verb in main_verbs]
        return main_verbs
Code example #6
File: sdfpreprocess.py  Project: cosmozhang/satire
def sdfprocess(tp, path, filenamels, docid):
    parser = StanfordParser(
        path_to_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar',
        path_to_models_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1-models.jar',
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        java_options='-mx5000m')
    sdfdata = []
    for i in range(len(filenamels)):
        if (i+1)%100 == 0: print "%f%% of document %d of %s finished" % ((i+1)*100*1.0/len(filenamels), docid, tp) 
        filename = filenamels[i]
        h = open(path + filename, 'r')
        lines = h.readlines()
        h.close()
        headraw, bodyraw = preprocess(lines[0]), preprocess(lines[1])

        sentences = [headraw] + nltk.sent_tokenize(bodyraw)
        sdfparsed = parser.raw_parse_sents(sentences)
        sdfdata.append(sdfparsed)
        # print sdfparsed
        # print sdfdata      
        # if i > 5: break
    return sdfdata
Code example #7
    def setup_extractor(self):
        self.splitter = PunktSentenceSplitter(self.language)
        self.parser = StanfordParser(path_to_jar='dev/stanford-corenlp-3.6.0.jar',
                                     path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
                                     java_options=' -mx2G -Djava.ext.dirs=dev/')

        self.token_to_lemma = {}
        for lemma, tokens in self.lemma_to_token.iteritems():
            for t in tokens:
                self.token_to_lemma[t] = lemma
        self.all_verbs = set(self.token_to_lemma.keys())
Code example #8
File: textparser.py  Project: Zarnosch/AVaAoMDTV
 def __init__(self):
     """ The Stanford Parser is required, download from http://nlp.stanford.edu/software/lex-parser.shtml and unpack somewhere """
     # insert path to java home
     if os.name == "nt":
         os.environ['JAVAHOME'] = r'C:\Program Files\Java\jdk1.8.0_66\bin\java.exe'  # raw string: avoids the \b escape in \bin
         # insert path to the directory containing stanford-parser.jar and stanford-parser-3.5.2-models.jar
         self.english_parser = StanfordParser(
             'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser.jar',
             'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
     elif os.name != "posix":
         os.environ['JAVAHOME'] = 'C:/Program Files (x86)/Java/jdk1.8.0_65/bin/java.exe'
         # insert path to the directory containing stanford-parser.jar and stanford-parser-3.5.2-models.jar
         self.english_parser = StanfordParser(
             'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser.jar',
             'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
     else:
         os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-1.8.0-openjdk-amd64'
         # insert path to the directory containing stanford-parser.jar and stanford-parser-3.5.2-models.jar
         self.english_parser = StanfordParser(
             expanduser("~") + '/lib/stanford-parser-full-2015-04-20/stanford-parser.jar',
             expanduser("~") + '/lib/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
Code example #9
File: parser.py  Project: BioGeek/Lango
class OldStanfordLibParser(Parser):
    """For StanfordParser < 3.6.0"""

    def __init__(self):
        self.parser = StanfordParser()

    def parse(self, line):
        """Returns tree objects from a sentence

        Args:
            line: Sentence to be parsed into a tree

        Returns:
            Tree object representing parsed sentence
        """
        tree = list(self.parser.raw_parse(line))[0]
        tree = tree[0]
        return tree
Code example #10
File: textparser.py  Project: Zarnosch/AVaAoMDTV
class Stanford:
    def __init__(self):
        """ The Stanford Parser is required, download from http://nlp.stanford.edu/software/lex-parser.shtml and unpack somewhere """
        # insert path to java home
        if os.name == "nt":
            os.environ['JAVAHOME'] = r'C:\Program Files\Java\jdk1.8.0_66\bin\java.exe'  # raw string: avoids the \b escape in \bin
            # insert path to the directory containing stanford-parser.jar and stanford-parser-3.5.2-models.jar
            self.english_parser = StanfordParser(
                'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser.jar',
                'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
        elif os.name != "posix":
            os.environ['JAVAHOME'] = 'C:/Program Files (x86)/Java/jdk1.8.0_65/bin/java.exe'
            # insert path to the directory containing stanford-parser.jar and stanford-parser-3.5.2-models.jar
            self.english_parser = StanfordParser(
                'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser.jar',
                'C:/Python34/Lib/site-packages/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
        else:
            os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-1.8.0-openjdk-amd64'
            # insert path to the directory containing stanford-parser.jar and stanford-parser-3.5.2-models.jar
            self.english_parser = StanfordParser(
                expanduser("~") + '/lib/stanford-parser-full-2015-04-20/stanford-parser.jar',
                expanduser("~") + '/lib/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')

    def get_sent_depth(self, s):
        # remove linebreaks for syntax tree
        s = s.replace('\n', ' ').replace('\r', ' ')

        sentence = self.english_parser.raw_parse(s)
        current_tree = None
        depth = 0

        for line in sentence:
            current_tree = line
            depth = line.height() - 1

        sent_depth_feature_value = (depth - 4) / 20

        if sent_depth_feature_value < 0: return current_tree, 0
        if sent_depth_feature_value > 1: return current_tree, 1
        return current_tree, round(sent_depth_feature_value, 2)
Code example #11
File: data.py  Project: strin/counter-squad
def create_stanford_parser():
    from nltk.parse.stanford import StanfordParser
    return StanfordParser(
        '/home/durin/software/stanford-parser-full-2015-12-09/stanford-parser.jar',
        '/home/durin/software/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar',
        java_options='-mx32000m')
Code example #12
from nltk.parse.stanford import StanfordParser
from nltk.tag import StanfordNERTagger
import en
import utils
sentences = utils.get_tokenized_sentences("data/set1/a1.txt")
parser=StanfordParser()

print len(sentences)
print len([ x for x in sentences if "is" in x])

[parser.raw_parse((x)) for x in sentences]
Code example #13
def simplify(sent):
    from anytree import NodeMixin, Node, AnyNode, RenderTree
    from nltk.parse.stanford import StanfordParser

    def make_tree(tree, t, sent_list):
        #this fn. converts nltk tree to anytree
        if tree not in sent_list:
            ttt = AnyNode(id=str(tree.label()), parent=t)
            for tt in tree:
                make_tree(tt, ttt, sent_list)
        else:
            AnyNode(id=str(tree), parent=t)

    parser = StanfordParser()

    #SBAR CASE
    def find_sbar(t):
        if t.id == 'SBAR':
            global sbar
            sbar = t
        for tt in t.children:
            find_sbar(tt)

    def find_vp_in_sbar(t):
        if t.id == 'VP':
            global vp_sbar
            vp_sbar.append(t)
        for tt in t.children:
            find_vp_in_sbar(tt)

    def find_np_in_sbar(t):
        global f
        global ff
        if t.id == 'VP':
            ff = False
        if (t.id == 'NP') and f == True and ff == True:
            global np_sbar
            np_sbar = t
            f = False
        for tt in t.children:
            find_np_in_sbar(tt)

    def find_vp(t):
        if t.id == 'SBAR':
            return
        global f
        if t.id == 'VP' and f == True:
            global vp
            vp = t
            f = False
        for tt in t.children:
            find_vp(tt)

    def find_np(t):
        if t.id == 'SBAR':
            return
        global f
        if t.id == 'NP' and f == True:
            global np
            np = t
            f = False
        for tt in t.children:
            find_np(tt)

    def find_vbz(t):
        if t.id == 'SBAR':
            return
        global f
        if t.id == 'VBZ' and f == True:
            global vbz
            vbz = t.children[0].id
            f = False
        for tt in t.children:
            find_vbz(tt)

    def make_sent(t):
        global simple_sentences
        if t.id in sent_list:
            simple_sentences[-1].append(t.id)
        for tt in t.children:
            make_sent(tt)

    #sent=sent8

    parse_trees = parser.raw_parse(sent)
    global sent_list
    sent_list = [s for s in sent.split()]
    tree = next(parse_trees)[0]
    #tree.draw()
    t = AnyNode(id='ROOT')
    make_tree(tree, t, sent_list)
    global sbar
    sbar = t
    global vp_sbar
    global f
    global ff
    global np_sbar
    global vp
    global np
    global vbz
    vp_sbar = []
    vp = t
    np = t
    vbz = 'bn2'
    np_sbar = t
    find_sbar(t)
    find_vp_in_sbar(sbar)
    f = True
    ff = True
    find_np_in_sbar(sbar)
    f = True
    find_vp(t)
    f = True
    find_np(t)
    f = True
    find_vbz(t)
    global simple_sentences
    simple_sentences = []
    simple_sentences.append([])
    make_sent(np)
    make_sent(vp)
    for i in range(len(vp_sbar)):
        simple_sentences.append([])
        if np_sbar == t:
            make_sent(np)
        else:
            make_sent(np_sbar)
        if vbz != 'bn2':
            simple_sentences[-1].append(vbz)
        make_sent(vp_sbar[i])
    #print (simple_sentences)
    simple = []
    for sentence in simple_sentences:
        string = ''
        for word in sentence:
            string += word + ' '
        string += '.'
        simple.append(string)

    def is_any_sbar(t):
        if t.id == 'SBAR':
            global f
            f = True
            return
        for tt in t.children:
            is_any_sbar(tt)

    f = False
    is_any_sbar(t)
    if f == False:
        simple = [sent]
    return simple
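A quick way to see what simplify() produces is to feed it a sentence containing an SBAR clause; a minimal usage sketch, assuming the Stanford parser jars are on the CLASSPATH so StanfordParser() can start (the exact split depends on the parse that comes back):

# Usage sketch for simplify(); the output is indicative, not guaranteed.
if __name__ == '__main__':
    for part in simplify("The dog which was barking loudly ran into the garden"):
        print(part)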
Code example #14
    if (type(r) != Tree):
        print r,
        return

    thresh = C.get(f, {}).get(r.label(), -1)
    p = np.random.random_sample()
    # print p, thresh
    if thresh != -1 and p < thresh:
        return

    for i in range(0, len(r)):
        decoder(r[i], r.label(), k + 1)


dep_parser = StanfordParser(path_to_jar="./stanford-parser.jar",
                            path_to_models_jar="./stanford-models.jar")

load_model()

import sys
filename = sys.argv[1]
text = list(open(filename).readlines())
text = [s.strip() for s in text]

for i in range(len(text)):

    s1 = clean_str(text[i])
    if s1 == "":
        continue

    print 201
Code example #15
import os
import sys
from nltk.parse.bllip import BllipParser

from nltk.parse.stanford import StanfordParser
from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.stanford import StanfordNeuralDependencyParser
from nltk.tag.stanford import StanfordPOSTagger, StanfordNERTagger
from nltk.tokenize.stanford import StanfordTokenizer

#parser_path='/home/jihuni/.local/share/bllipparser/WSJ-PTB3'
#bllip = BllipParser.from_unified_model_dir(parser_path)
model_path = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
#stanford=StanfordParser(model_path)
stanford = StanfordParser()
parser = stanford


def IsLeaf(node):
    # From the NLTK documentation for Tree.height(): the height of a tree
    # containing no children is 1; the height of a tree containing only
    # leaves is 2; and the height of any other tree is one plus the maximum
    # of its children's heights.
    return node.height() == 2


def ToASCIIstring(node):
    if IsLeaf(node):
        return node[0]
    return '(%s %s)' % (ToASCIIstring(node[0]), ToASCIIstring(node[1]))
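ToASCIIstring only recurses into node[0] and node[1], so it assumes a strictly binary tree. A minimal usage sketch, collapsing unary chains and binarising the parse first with the Tree methods NLTK provides:

# Usage sketch: make the parse strictly binary before calling ToASCIIstring.
if __name__ == '__main__':
    tree = next(parser.raw_parse("the quick brown fox jumps over the lazy dog"))
    tree.collapse_unary(collapsePOS=True, collapseRoot=True)
    tree.chomsky_normal_form()
    print(ToASCIIstring(tree))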

Code example #16
File: parser.py  Project: techscientist/Lango
 def __init__(self):
     self.parser = StanfordParser()
Code example #17
import os
import sys
import nltk
from nltk.parse.stanford import StanfordParser


f = open(sys.argv[1])

text = f.read()
text = text.decode('utf-8')


sents = nltk.sent_tokenize(text)

print sents

modelPath = 'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'

parser = StanfordParser(model_path = modelPath)


for s in sents:
	print list(parser.raw_parse(s))




Code example #18
import os 
from nltk.parse.stanford import StanfordParser
from nltk.parse.stanford import StanfordDependencyParser


os.environ['STANFORD_PARSER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09'
os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser.jar'
os.environ['STANFORD_MODELS'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'

os.environ['JAVA_HOME'] = '/Library/Java/JavaVirtualMachines/jdk1.8.0_40.jdk/Contents/Home'

eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
print(list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split())))
a = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))[0]
a.draw()

eng_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
for row in res[0].triples():
	print(row)
res[0].tree().draw()
# -*- coding: utf-8 -*-
# export CLASSPATH=$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar
from __future__ import unicode_literals
import os
import sys
import io
import copy
import nltk
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser
parser=StanfordParser(model_path="stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
stanford_dir = parser._classpath[0].rpartition('/')[0]
parser._classpath = tuple(find_jars_within_path(stanford_dir))
# from set_parser import parse_it
class Node(object):
	"""
		A generic representation of a tree node. Includes a string label and a list of a children.
	"""

	def __init__(self, label):
		"""
			Creates a node with the given label. The label must be a string for use with the PQ-Gram
			algorithm.
		"""
		self.label = label
		self.children = list()

	def addkid(self, node, before=False):
		"""
			Adds a child node. When the before flag is true, the child node will be inserted at the
			beginning of the list of children, otherwise the child node is appended.
Code example #20
File: featExtrWithSmor.py  Project: cyriaka90/MyFiles
# -*- coding: utf-8 -*-

## This code extracts the features for several glosses and stores them in two text files to be fed to evaluation.py or predictGoodness.py

## import everything needed
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from nltk.parse.stanford import StanfordParser
from nltk.tag.stanford import StanfordPOSTagger
import string
from pattern.de import singularize
import subprocess
import os

## set variables
parser=StanfordParser(model_path="edu/stanford/nlp/models/lexparser/germanPCFG.ser.gz")
st = StanfordPOSTagger('german-dewac.tagger')
featuresPhrases = []
finalRatings = []
count=0
path = '/home/hanna/Documents/SMOR/'

## read in the word frequencies from DeReWo
derewo = open('derewo-v-ww-bll-320000g-2012-12-31-1.0.txt')
freqWo= []
freqNo= []
for lines in derewo:
	lines = lines.strip()               
	parts = lines.split(" ")
	freqWo.append(parts[0].lower())
	freqNo.append(int(float(parts[1])))
Code example #21
File: importSD.py  Project: fpang0502/PDC-Parser
"""
source from: https://pypi.org/project/PyStanfordDependencies/
			https://stackoverflow.com/questions/13883277/stanford-parser-and-nltk


"""
import StanfordDependencies, os.path, sys
from nltk.parse.stanford import StanfordParser
parser = StanfordParser(
)  #be sure to have set environmental path to englishPCFG.ser.gz
sd = StanfordDependencies.get_instance(backend='subprocess')


def getTypeD(input):
    'returns our the string with the dependency tags'
    sS = ""
    myList = list(parser.raw_parse(input))

    for l in myList:
        sS += str(l)

    return sS


def createDepData(tag_sent):
    'method from the PyStanfordDependencies 0.3.1 package'
    data = sd.convert_tree(tag_sent)

    return data
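Chaining the two helpers gives the constituency string from getTypeD and token-level dependencies from createDepData; a minimal usage sketch, assuming the parser and the subprocess backend are configured as the module docstring describes:

# Usage sketch: parse a sentence, then convert the Penn-style tree string
# into PyStanfordDependencies tokens.
if __name__ == '__main__':
    tree_str = getTypeD("The quick brown fox jumps over the lazy dog.")
    print(tree_str)
    for token in createDepData(tree_str):
        print("%d\t%s\t%s\t%d" % (token.index, token.form, token.deprel, token.head))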

Code example #22
__author__ = 'laceyliu'

parser_path ='/Users/laceyliu/Documents/workspace/WikiQA/stanford-parser-full'
which_java = '/Library/Java/JavaVirtualMachines/jdk1.8.0_11.jdk/Contents/HOME/bin/java'
import os
from nltk.parse.stanford import StanfordParser
os.environ['JAVAHOME'] =  which_java
os.environ['CLASSPATH'] = parser_path
os.environ['STANFORD_MODELS'] = parser_path
sentence = "hello world"
sp=StanfordParser()

sentences = ['Clinton Drew \"Clint\" Dempsey (born March 9, 1983) is an American soccer player who plays for Tottenham Hotspur and the United States national team.',
             'Growing up in Nacogdoches, Texas, Dempsey played for one of the top youth soccer clubs in the state, the Dallas Texans, before playing for Furman University\'s men\'s soccer team. ',
             'In 2004, Dempsey was drafted by Major League Soccer club New England Revolution, where he quickly integrated himself into the starting lineup. ',
             'Hindered initially by a jaw injury, he would eventually score 25 goals in 71 appearances with the Revolution.',
             'Between 2007 and 2012, Dempsey played for Premier League team Fulham and is the club\'s highest Premier League goalscorer of all time.',
             'Dempsey first represented the United States at the 2003 FIFA World Youth Championship in the United Arab Emirates. He made his first appearance with the senior team on November 17, 2004, against Jamaica; he was then named to the squad for the 2006 World Cup and scored the team\'s only goal of the tournament. ',
             'In the 2010 FIFA World Cup, Dempsey scored against England, becoming the second American, after Brian McBride, to score goals in multiple World Cup tournaments.']

ss2 = []
for s in sentences:
    if s.count(' ') < 20 and s.count(' ') > 7:
        ss2.append(s.decode('utf-8').encode('ascii', 'ignore'))
trees = sp.raw_parse_sents(ss2)
for t in trees:
    print list(t)
Code example #23
File: SentenceParser.py  Project: frozstone/concept
 def __init__(self):
     self.__parser = StanfordParser()
     self.__var_d  = 12.0/math.log(2.0)
     self.__var_s  = 4.0 * 1.0/math.log(2)
Code example #24
# sent = "the big dog."
#
# p = parser.raw_parse(sent)
#
# # for tree in (p):
# #     print(list(tree))
#
# for line in p:
#     for sentence in line:
#         sentence.draw()




st=StanfordPOSTagger('english-bidirectional-distsim.tagger')
parser=StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

# setup corpus of texts
childStoryCorpusDir = '../resources/org_transcripts'
robotStoryCorpusDir = '../resources/robot_stories'

childStoryCorpus = PlaintextCorpusReader(childStoryCorpusDir, ".*\.txt")
robotStoryCorpus = PlaintextCorpusReader(robotStoryCorpusDir, ".*\.txt")


# average word length, average sentence length, and the number of times each vocabulary item appears in the text on average (our lexical diversity score)
# for fileid in childStoryCorpus.fileids():
#     num_chars = len(childStoryCorpus.raw(fileid))
#     num_words = len(childStoryCorpus.words(fileid))
#     num_sents = len(childStoryCorpus.sents(fileid))
#     num_vocab = len(set([w.lower() for w in childStoryCorpus.words(fileid)]))
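The commented-out loop above already gathers every count needed for the three statistics it mentions; a minimal sketch of the finished loop (average word length, average sentence length, and lexical diversity per file):

# Sketch: derive the three per-file statistics from the counts gathered above.
for fileid in childStoryCorpus.fileids():
    num_chars = len(childStoryCorpus.raw(fileid))
    num_words = len(childStoryCorpus.words(fileid))
    num_sents = len(childStoryCorpus.sents(fileid))
    num_vocab = len(set(w.lower() for w in childStoryCorpus.words(fileid)))
    print("%s: avg word len %.2f, avg sent len %.2f, lexical diversity %.2f" % (
        fileid,
        num_chars / float(num_words),
        num_words / float(num_sents),
        num_words / float(num_vocab)))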
Code example #25
File: nlquery.py  Project: codejitsu/labr
class NLQueryEngine(LoggingInterface):
    """
    Grammar mapping for knowledge queries of the form:
    - What is the X of Y
    - What is X's Y
    """
    def __init__(self, properties={'lang': 'en'}):
        LoggingInterface.__init__(self)
        self.parser = StanfordParser(
            model_path=MODELS_PATHS[properties['lang']])
        self.wd = WikiData()
        self.wd.set_properties(properties)
        self.properties = properties

    def subject_query(self,
                      qtype,
                      subject,
                      action,
                      jj=None,
                      prop=None,
                      prop2=None,
                      prop3=None):
        """Transforms matched context into query parameters and performs query

        Args:
            qtype: Matched type of query (what, who, where, etc.)
            subject: Matched subject (Obama)
            action: Matched verb action (is, was, ran)
            jj (optional): Matched adverb
            prop (optional): Matched prop
            prop2 (optional): Matched prop
            prop3 (optional): Matched prop

        Returns:
            Answer: Answer from query, or empty Answer if None
        """
        if (self.properties['lang'] == 'en'):
            if jj == 'old':
                # How old is Obama?
                prop = 'age'

            if jj in ['tall', 'high']:
                # How tall is Yao Ming / Eifel tower?
                prop = 'height'
        elif (self.properties['lang'] == 'de'):
            if jj == 'alt':
                # Wie alt ist Obama?
                prop = 'age'

            if jj in ['hoch', 'groß']:
                # Wie hoch ist die Zugspitze?
                prop = 'height'

            if prop in ['sprache', 'sprachen']:
                # Welche Sprache spricht man in Sweden?
                prop = 'language official'

        if prop2:
            prop = prop + ' ' + prop2

        if prop3 and not prop:
            prop = prop3

        if not prop:
            if self.properties['lang'] == 'en' and action not in ['is', 'was']:
                prop = action
            elif self.properties['lang'] == 'de' and action not in [
                    'ist', 'sind', 'war', 'hat', 'wurde', 'bedeutet'
            ]:
                prop = action

        ans = self.get_property(qtype, subject, prop)
        if not ans:
            ans = Answer()

        ans.params = {
            'qtype': qtype,
            'subject': subject,
            'prop': prop,
        }
        return ans

    def get_prop_tuple(self,
                       prop=None,
                       value=None,
                       op=None,
                       value_units=None,
                       pp_t=None):
        """Returns a property tuple (prop, value, op). E.g. (population, 1000000, >)

        Args:
            prop (str): Property to search for (e.g. population)
            value (str): Value property should equal (e.g. 10000000)
            op (str): Operator for value of property (e.g. >)

        Returns:
            tuple: Property tuple, e.g: (population, 10000000, >)
        """

        self.info('Prop tuple: {0},{1},{2},{3},{4}', prop, value, op,
                  value_units, pp_t)

        if op in ['in', 'by', 'of', 'from']:
            oper = op
        elif op in ['over', 'above', 'more', 'greater']:
            oper = '>'
        elif op in ['under', 'below', 'less']:
            oper = '<'
        else:
            self.error('NO OP {0}', op)
            return None

        # Infer property to match value
        if prop is None:
            if value_units is not None:
                if value_units in ['people']:
                    prop = 'population'
                if not prop:
                    return None

        props = [(prop, value, oper)]

        if pp_t:
            prop_tuple = match_rules(pp_t,
                                     RULES[self.properties['lang']]['prop_rules'],
                                     self.get_prop_tuple)
            if not prop_tuple:
                return None
            props += prop_tuple

        return props

    def find_entity_query(self,
                          qtype,
                          inst,
                          prop_match_t=None,
                          prop_match2_t=None):
        """Transforms matched context into query parameters and performs query for
        queries to find entities

        Args:
            qtype (str): Matched type of query (what, who, where, etc.)
            inst (str): Matched instance of entity to match (Obama)
            action (str): Matched verb action (is, was, ran)
            prop_match_t (Tree): Matched property Tree
            prop_match2_t (Tree): Matched property Tree

        Returns:
            Answer: Answer from query, or empty Answer if None
        """

        props = []
        if prop_match_t:
            prop = match_rules(prop_match_t,
                               RULES[self.properties['lang']]['prop_rules'],
                               self.get_prop_tuple)

            if not prop:
                return

            props += prop

        if prop_match2_t:
            prop = match_rules(prop_match2_t,
                               RULES[self.properties['lang']]['prop_rules'],
                               self.get_prop_tuple)

            if not prop:
                return

            props += prop

        if not inst.isupper():
            inst = singularize(inst)

        ans = self.wd.find_entity(qtype, inst, props)
        if not ans:
            ans = Answer()

        ans.params = {
            'qtype': qtype,
            'inst': inst,
            'props': props,
        }
        return ans

    def get_property(self, qtype, subject, prop):
        """Gets property of a subject
        Example:
            get_property('who', 'Obama', 'wife') = 'Michelle Obama'

        Args:
            subject: Subject to get property of
            prop: Property to get of subject

        Todo:
            * Add other APIs here

        Returns:
            Answer: Answer from query
        """
        return self.wd.get_property(qtype, subject, prop)

    def preprocess(self, sent):
        """Preprocesses a query by adding punctuation"""
        if sent[-1] != '?':
            sent = sent + '?'
        return sent

    def cleanup(self, sent):
        """Remove some stop words"""
        stopwords = ['der', 'die', 'das', 'ein', 'eine', 'einen']
        words = sent.split()

        result = [word for word in words if word.lower() not in stopwords]

        return ' '.join(result)

    def query(self, sent, format_='plain'):
        """Answers a query

        If format is plain, will return the answer as a string
        If format is raw, will return the raw context of query

        Args:
            sent: Query sentence
            format_: Format of answer to return (Default to plain)

        Returns:
            dict: Answer context
            str: Answer as a string

        Raises:
            ValueError: If format_ is incorrect
        """

        sent = self.preprocess(sent)
        sent = self.cleanup(sent)
        tree = next(self.parser.raw_parse(sent))

        pos = [tag for word, tag in tree.pos()]

        if self.properties['lang'] == 'de':
            if len(set(['PWS', 'PWAV', 'PWAT']) & set(pos)) == 0:
                print("Tree before:")
                for e in tree:
                    print(str(e))

                sent = "Was ist " + sent
                tree = next(self.parser.raw_parse(sent))
        # TODO
        #elif self.properties['lang'] == 'en':
        #    if len(set(['WHNP']) & set(pos)) == 0:
        #        print("Tree before:")
        #        for e in tree:
        #            print(str(e))
        #
        #        sent = "What is " + sent
        #        tree = next(self.parser.raw_parse(sent))

        context = {'query': sent, 'tree': tree}

        for e in tree:
            print(str(e))

        ans = first([
            match_rules(tree,
                        RULES[self.properties['lang']]['find_entity_rules'],
                        self.find_entity_query),
            match_rules(tree,
                        RULES[self.properties['lang']]['subject_prop_rules'],
                        self.subject_query),
        ])

        print("-> " + str(ans))

        if not ans:
            ans = Answer()

        ans.query = sent
        ans.tree = str(tree)

        if format_ == 'raw':
            return ans.to_dict()
        elif format_ == 'plain':
            return ans.to_plain()
        else:
            raise ValueError('Undefined format: %s' % format_)
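End to end, the class is driven through query(); a minimal usage sketch, assuming MODELS_PATHS, RULES, and the WikiData client used elsewhere in the module are configured:

# Usage sketch for NLQueryEngine, matching the query forms in the class docstring.
if __name__ == '__main__':
    engine = NLQueryEngine({'lang': 'en'})
    print(engine.query("What is the capital of France"))        # plain-text answer
    print(engine.query("How tall is Yao Ming", format_='raw'))  # raw answer dict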
Code example #26
File: SentenceParser.py  Project: frozstone/concept
class SentenceParser:
    __parser = None
    __alpha  = 1.0
    __beta   = 1.0
    __gamma  = 0.1

    __var_d  = 0.0
    __var_s  = 0.0

    def __init__(self):
        self.__parser = StanfordParser()
        self.__var_d  = 12.0/math.log(2.0)
        self.__var_s  = 4.0 * 1.0/math.log(2)

    def __parse_sent(self, sentence):
        result = self.__parser.raw_parse(sentence) 
        return result.next()

    def __obtain_nps(self, sentence):
        parse_tree = self.__parse_sent(sentence)
        nps = set()
        for phrase in parse_tree.subtrees():
            if phrase.label() != "NP": continue
            nps.add(' '.join(phrase.leaves()))

        sent_tokens = " ".join(parse_tree.leaves())
        
        #Get the smallest NPs
        nps_smallest = set()
        for np1 in nps:
            if all(np2 not in np1 for np2 in nps if np2 != np1): 
                nps_smallest.add(np1)
        return sent_tokens, nps_smallest

    def __gaussian_weight(self, distance, variance):
        return math.exp(-0.5 * (distance**2)/variance)

    def __weight_tokens(self, mid, nps, sentences, sent_id):
        st          = PorterStemmer()
        sent_target = sentences[sent_id]
        token_id    = [idx for idx, token in enumerate(sent_target.strip().split(" ")) if mid in token][0]

        sent_lengths= [len(s.split(" ")) for s in sentences]

        nps_base = {np:" ".join(st.stem(token) for token in np.split(" ")) for np in nps}
        nps_proc = {}

        for sent_idx, sent in enumerate(sentences):
            sent_stem = " ".join(st.stem(token) for token in sent.split(" "))
            for np_ori, np in nps_base.iteritems():
                if np_ori not in nps_proc: nps_proc[np_ori] = {}

                if "dist_sent" not in nps_proc[np_ori] or abs(sent_idx - sent_id) < nps_proc[np_ori]["dist_sent"]:
                    #always update the info
                    if np not in sent_stem: 
                        continue
                    np_idx      = sent_stem.rindex(np)
                    np_token_idx= len(sent_target[:np_idx].strip().split(" "))
                    dist_start  = len(sent_stem[:np_idx].strip().split(" "))
                    dist_end    = len(sent_stem[np_idx+len(np):].strip().split(" "))

                    dist_sent   = abs(sent_idx - sent_id)
                    dist_token  = -1

                    if dist_sent == 0:
                        if mid in np_ori:
                            dist_token = 0
                        elif np_token_idx < token_id:
                            dist_token = token_id - np_token_idx - (len(np.split(" ")) - 1) - 1
                        elif np_token_idx > token_id:
                            dist_token = np_token_idx - token_id - 1
                    elif sent_idx < sent_id: 
                        dist_token = dist_end + sum(sent_lengths[sent_idx+1:sent_id]) + token_id
                    elif sent_idx > sent_id:
                        dist_token = (len(sent_target.strip().split(" "))-1-token_id) + sum(sent_lengths[sent_id+1:sent_idx]) + dist_start

                    nps_proc[np_ori]["dist_sent"]  = dist_sent
                    nps_proc[np_ori]["dist_token"] = dist_token

                np_count = sent_stem.count(np)
                nps_proc[np_ori]["tf"] = (nps_proc[np_ori].get("tf") or 0) + np_count

        nps_weight = {}
        for np, vals in nps_proc.iteritems():
            term1 = self.__alpha * self.__gaussian_weight(vals["dist_token"], self.__var_d)
            term2 = self.__beta  * self.__gaussian_weight(vals["dist_sent"],  self.__var_s)
            term3 = self.__gamma * vals["tf"]
            nps_weight[np] = (term1 + term2 + term3) / (self.__alpha + self.__beta + self.__gamma)
        return nps_weight

    def obtain_nps_from_sentences(self, mid, text):
        lst_sentences = sent_tokenize(text)
        lst_sent_pr  = []
        set_nps      = set()

        sent_match_id= -1
        for sent_idx, sent in enumerate(lst_sentences):
            if sent_match_id == -1 and mid in sent: 
                sent_match_id = sent_idx

            sent_tokens, nps = self.__obtain_nps(sent)
            lst_sent_pr.append(sent_tokens)
            set_nps.update(nps)

        dct_nps_weight = self.__weight_tokens(mid, set_nps, lst_sent_pr, sent_match_id)
        return lst_sent_pr, dct_nps_weight
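obtain_nps_from_sentences is the public entry point: it takes the identifier to anchor on (mid) and the surrounding text, and returns the tokenised sentences plus a weight per noun phrase. A minimal usage sketch, assuming the Stanford parser jars are on the CLASSPATH:

# Usage sketch: weight the noun phrases around the mention "T2" in a short text.
if __name__ == "__main__":
    sp = SentenceParser()
    sentences, np_weights = sp.obtain_nps_from_sentences(
        "T2", "The lesion shows a high T2 signal. The signal is continuous.")
    print(sentences)
    for np, weight in np_weights.items():
        print("%s\t%.3f" % (np, weight))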
Code example #27
import os
#Set standford parser and models in your environment variables.
os.environ['STANFORD_PARSER'] = 'stanford-parser'
os.environ['STANFORD_MODELS'] = 'stanford-parser'
from nltk.parse.stanford import StanfordParser
from nltk.tree import ParentedTree, Tree

parser = StanfordParser()


def find_subject(t):
    for s in t.subtrees(lambda t: t.label() == 'NP'):
        for n in s.subtrees(lambda n: n.label().startswith('NN')):
            return (n[0], find_attrs(n))


def find_predicate(t):
    v = None

    for s in t.subtrees(lambda t: t.label() == 'VP'):
        for n in s.subtrees(lambda n: n.label().startswith('VB')):
            v = n
        return (v[0], find_attrs(v))


def find_object(t):
    for s in t.subtrees(lambda t: t.label() == 'VP'):
        for n in s.subtrees(lambda n: n.label() in ['NP', 'PP', 'ADJP']):
            if n.label() in ['NP', 'PP']:
                for c in n.subtrees(lambda c: c.label().startswith('NN')):
                    return (c[0], find_attrs(c))
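With the parser located via the environment variables above, the three helpers are applied to a single parse tree; a minimal usage sketch, assuming the find_attrs helper elided from this snippet is available:

# Usage sketch (find_attrs is defined elsewhere in the original file).
if __name__ == '__main__':
    t = next(parser.raw_parse("The quick brown fox jumps over the lazy dog"))
    print(find_subject(t))    # e.g. ('fox', ...)
    print(find_predicate(t))  # e.g. ('jumps', ...)
    print(find_object(t))     # e.g. ('dog', ...)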
Code example #28
File: nltk_test.py  Project: xiabofei/python_details
# for word, tag in chi_tagger.tag(sent.split()):
#     print word.encode('utf-8'), tag
#
# # English part-of-speech tagging
from nltk.tag import StanfordPOSTagger
# eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
# print eng_tagger.tag('What is the airspeed of an unladen swallow ?'.split())
# # Chinese part-of-speech tagging
chi_tagger = StanfordPOSTagger('chinese-distsim.tagger')
# sent = u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'
sent = u'宫体 子宫 呈 垂直位 宫内膜 高 T2 信号 连续'
for _, word_and_tag in chi_tagger.tag(sent.split()):
    word, tag = word_and_tag.split('#')
    print word.encode('utf-8'), tag


# Chinese and English constituency parsing; the only difference is which model is loaded
from nltk.parse.stanford import StanfordParser
eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
sent = list(u'子宫 呈 垂直位 , 宫内膜 高 T2 信号 连续'.split())
for tree in eng_parser.parse(sent):
    tree.pprint()


# Dependency parsing
from nltk.parse.stanford import StanfordDependencyParser
eng_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
res = list(eng_parser.parse(u'子宫 呈 垂直位 , 宫内膜 高 T2 信号 连续'.split()))
# st(context=21)
for row in res[0].triples():
    print '(' + row[0][0] + ',' + row[0][1] + ')', row[1], '(' + row[2][0] + ',' + row[2][1] + ')'
Code example #29
            parse_string = remove_formatting(
                parser.raw_parse(test["sentence1"]))
            test["sentence1_parse"] = parse_string
            test["sentence1_binary_parse"] = format_binary_tree(
                Tree.fromstring(parse_string))
            test["sentence2"] = f.readline().strip()
            parse_string = remove_formatting(
                parser.raw_parse(test["sentence2"]))
            test["sentence2_parse"] = parse_string
            test["sentence2_binary_parse"] = format_binary_tree(
                Tree.fromstring(parse_string))
            test["gold_label"] = f.readline().strip()
            test = json.dumps(test)
            print(test)
            f1.write(test)
            f1.write("\n")
            count = count + 1
    f.close()
    f1.close()


"""
Stanford PCFG Parser 3.9.1
Dan Klein and Christopher D. Manning. 2003. Accurate Unlexicalized Parsing. Proceedings of the 41st Meeting of the Association for Computational Linguistics, pp. 423-430.
"""
jar = 'apps/stanford-parser-full-2018-02-27/stanford-parser.jar'
model = 'apps/stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar'
parser = StanfordParser(model, jar, encoding='utf8')

get_json(sys.argv[1], sys.argv[2], parser, count)
Code example #30
File: parse.py  Project: radi9/python_talk
import os
import sys
from nltk.parse.stanford import StanfordParser

if __name__ == '__main__':
	if not os.environ.has_key('STANFORD_PARSE_CLASSPATH'):
		if not len(sys.argv) == 2:
			print 'no stanford parser folder identified'
			stanford_path = raw_input('please give stanford parse folder path : ')
	else:
		stanford_path = os.environ['STANFORD_PARSE_CLASSPATH']
	parser = StanfordParser(stanford_path+'/stanford-parser-3.5.1-models.jar',
							stanford_path+'/stanford-parser.jar')
	#sentence = 'A man previously convicted of harassing Yahoo CEO Marissa Mayer has been arrested by Austin police on suspicion of sending her sexually graphic emails, according to police records released on Friday.'
#	sentence = 'Type 2 diabetes (T2D) and Alzheimer`` disease (AD) are two major health issues nowadays. T2D is an ever increasing epidemic, affecting millions of elderly people worldwide, with major repercussions in the patients  daily life.'
	#sentence = 'MiR-145 is reported to be significantly down-regulated in ovarian cancer.'
	#sentence = 'In this report, we find out that up-regulation of miR-145 in OVCAR-3 and SKOV-3 cells inhibit cell proliferation and promote cell apoptosis.'
	sentence = 'promoted the proliferation of ovarian cancer cells'
	parse_result = list(parser.raw_parse(sentence))
	print parse_result
	print 'print out sentence structure'
	print parse_result[0].draw()
Code example #31
from nltk.stem import PorterStemmer
import nltk
import sys
from nltk.parse.stanford import StanfordParser
from nltk.tree import ParentedTree

inputString = " "
import os
java_path = "C:\\Program Files\\Java\\jdk-9.0.4\\bin\\java.exe"
os.environ['JAVAHOME'] = java_path

for each in range(1, len(sys.argv)):
    inputString += sys.argv[each]
    inputString += " "

# inputString = raw_input("Enter the String to convert to ISL: ")

parser = StanfordParser(
    model_path=
    'D:/stanford-parser-full-2018-02-27/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
)

# o=parser.parse(s.split())

englishtree = [tree for tree in parser.parse(inputString.split())]
parsetree = englishtree[0]

dict = {}

# "***********subtrees**********"

parenttree = ParentedTree.convert(parsetree)
for sub in parenttree.subtrees():
    dict[sub.treeposition()] = 0
Code example #32
class SVOSENT(object):
    """
    Class Methods to Extract Subject Verb Object Tuples from a Sentence
    """
    def __init__(self, language='english'):
        """
        Initialize 
        """
        self.parser = StanfordParser()
        self.sent_detector = data.load('tokenizers/punkt/' + language +
                                       '.pickle')
        self.analyzer = SentimentIntensityAnalyzer()

    def getTexts(self, directory):
        # function by Tye
        # Input: Directory
        # Output:List of all text files in the directory fully loaded into memory
        texts = []
        pathnames = file_io.getFilesRecurse(directory, '.txt')
        for pathname in pathnames:
            texts.append(file_io.openFile(pathname))
        return texts

    def split_and_clean(self, text):
        '''
        Temporary function, only useful for corpus data
        '''
        textlist = text.split(
            '______________________________________________________')
        result = [
            text[text.find("Full text:") + 10:text.find("Publication title")]
            for text in textlist if len(text) != 0
        ]
        return result

    # find all ancestors of a subtree
    def find_ancestors(self, t):
        parents = []

        def find(t):
            parents.append(t.parent().label())
            if t.parent().label() == 'ROOT':
                return parents
            else:
                return find(t.parent())

        result = find(t)
        return result

    # Search for NN, NNP, PRP etc. in subtrees based on some restrictions

    def find_subject(self, t):
        subjects = []
        for a in t.subtrees(lambda t: t.label() == 'S' and t.parent().label()
                            not in ['S']):
            for s in a.subtrees(lambda a: a.label() == 'NP' and a.parent().
                                label() != 'VP'):
                for n in s.subtrees(
                        lambda n: n.label() in ['NN', 'NNP', 'NNS', 'PRP'] and
                        len(set(self.find_ancestors(n)).intersection(['VP'])
                            ) == 0):
                    subjects.append(n[0])
        return list(set(subjects))

    # Depth First Search the tree and take verbs in VP subtree.
    def find_predicate(self, t):
        v = None
        predicates = []
        for s in t.subtrees(lambda t: t.label() == 'VP'):
            for n in s.subtrees(lambda n: n.label().startswith('VB')):
                v = n
                predicates.append(v[0])
        return list(set(predicates))

    def find_object(self, t):
        objects = []
        for s in t.subtrees(lambda t: t.label() == 'VP'):
            for n in s.subtrees(lambda n: n.label() in ['NP', 'PP', 'ADJP']):
                if n.label() in ['NP', 'PP']:
                    for c in n.subtrees(lambda c: c.label().startswith('NN')):
                        objects.append(c[0])
        return list(set(objects))

    def sentence_split(self, text):
        """
        split article to sentences
        """
        sentences = self.sent_detector.tokenize(text)
        return sentences

    def get_svo(self, sent):
        t = list(self.parser.raw_parse(sent))[0]
        t = ParentedTree.convert(t)
        return {
            'Subjects': self.find_subject(t),
            'Predicates': self.find_predicate(t),
            'Objects': self.find_object(t),
            'Sentence': sent
        }

    # return a dataframe
    def get_svo_from_article(self, article):
        sentences = self.sentence_split(article)
        val = []
        for sent in sentences:
            svoresult = self.get_svo(sent)
            val.append(svoresult)
        return pd.DataFrame(val)

    ####################################################
    # below are the functions for sentiment analysis

    def sentimentAnalysis(self, sentence):
        result = self.analyzer.polarity_scores(sentence)
        result['Sentence'] = sentence
        return result

    def get_senti_from_article(self, article):
        sentences = self.sentence_split(article)
        val = []
        for sent in sentences:
            result = self.sentimentAnalysis(sent)
            val.append(result)
        return pd.DataFrame(val)

    ###############################################
    # get both SVO and sent in one dataframe

    def svo_senti_from_article(self, article, subject=None):
        try:
            date = list(datefinder.find_dates(article))[0]
        except:
            date = '------'
        sentences = self.sentence_split(article)
        val1 = []
        val2 = []

        for sent in sentences:
            val1.append(self.sentimentAnalysis(sent))
            val2.append(self.get_svo(sent))
        result = pd.merge(pd.DataFrame(val1),
                          pd.DataFrame(val2),
                          on='Sentence')[[
                              'Sentence', 'Subjects', 'Predicates', 'Objects',
                              'compound', 'pos', 'neu', 'neg'
                          ]]
        try:
            result['date'] = date
        except:
            result['date'] = '-----'
        if subject == None:
            return result
        else:
            return result[result['Subjects'].apply(lambda x: subject in x)]
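Everything funnels through svo_senti_from_article, which merges the SVO triples and the VADER sentiment scores per sentence into one DataFrame; a minimal usage sketch, assuming the module-level imports elided from this snippet (pandas as pd, datefinder, the StanfordParser/ParentedTree/SentimentIntensityAnalyzer imports, and file_io) are present:

# Usage sketch: SVO triples plus sentiment scores per sentence for a short article.
if __name__ == '__main__':
    extractor = SVOSENT()
    article = ("The committee praised the new policy. "
               "Critics attacked the decision in the press.")
    df = extractor.svo_senti_from_article(article)
    print(df[['Sentence', 'Subjects', 'Predicates', 'Objects', 'compound']])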
Code example #33
tag_aux_map = {"VBD": "did", "VB": "do", "VBZ": "does", "VBP": "do"}
qhead_map = {
    "GPE": "Where",
    "PERSON": "Who",
    "ORGANIZATION": "What",
    "DATE": "When",
    "MONEY": "How much",
    "LOCATION": "Where"
}
where_prep = [
    'in', 'at', 'on', 'between', 'under', 'behind', 'upon', 'outside', 'above',
    'across', 'inside', 'toward', 'into', 'up', 'near', 'through', 'over', 'to'
]

parser = StanfordParser(path_to_jar=stanford_parser.stanford_parser_jar,
                        path_to_models_jar=stanford_parser.stanford_model_jar)
st = StanfordNERTagger(
    model_filename=
    '../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
    path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
lemmatizer = WordNetLemmatizer()
# embedded = []
'''
For future use.

def save_embedded_clause(tree):
    pattern = '/SBAR|S/=embed > VP'
    has_embedded =  check_output([tregex_path,
                        '-s',
                        pattern,
                        init_tree_file])
Code example #34
    for wrds in depsEDU:
        dep.write(str(wrds))
        dep.write("\t")
    dep.write("\n")
    depsEDU = []
    return wrdroot


mys = "sentencepos2all" + ".txt"
#mys1 = "dep2"  + ".txt"
pos = open(mys, "w")
#dep = open(mys1,"w")
english_postagger = POSTagger(
    '../postagger/models/english-bidirectional-distsim.tagger',
    '../postagger/stanford-postagger.jar')
english_parser = StanfordParser('../postagger/stanford-parser.jar',
                                '../parser/stanford-parser-3.5.0-models.jar')
length = 0
i = 0
for fname in os.listdir('dev_data'):

    if fname.endswith('.edus'):
        print i
        print fname
        i = i + 1
        f = open(os.path.join('dev_data', fname), 'r')
        mys1 = os.path.join('dev_data', fname.split(".")[0] + ".pos")
        print mys1

        dep = open(mys1, "w")
        data = f.read().splitlines()
        edus = deque()
Code example #35
path_to_jar = "D:\Constituency parsing\Parser\stanford-parser-full-2020-11-17\stanford-postagger-4.2.0.jar"
tagger = StanfordPOSTagger(path_to_model, path_to_jar)
# tagger.java_options='-mx4096m'          ### Setting higher memory limit for long sentences

sentences = []
for x in range(len(df['sentences'])):
    sentence = df.at[x, 'sentences']
    tagged = tagger.tag(sentence.split())
    sentences.append(tagged)
df['tagged'] = sentences

##Constituent Parser
from nltk.parse.stanford import StanfordParser
path_to_model_1 = "D:\Constituency parsing\Parser\stanford-parser-full-2020-11-17\model.ser.gz"
path_to_jar_1 = "D:\Constituency parsing\Parser\stanford-parser-full-2020-11-17\stanford-parser.jar"
parser = StanfordParser(path_to_model_1, path_to_jar_1)
# parser.java_options='-mx4096m'          ### Setting higher memory limit for long sentences
parse_string = []
for y in range(len(df['tagged'])):
    tagged = df.at[y, 'tagged']
    cons = next(parser.tagged_parse(tagged))
    cons = ' '.join(str(cons).split())
    parse_string.append(cons)
df['Parse_String'] = parse_string

# parse_string = ' '.join(str(cons).split())
# print(parse_string)

#Move into excel csv
import pandas as pd
df_new = df
Code example #36
File: rnn.py  Project: giahy2507/rnn
                err += np.abs(dWh_l[i, j] - grad)
                count+=1

        if 0.001 > err/count:
            print "Grad check passed for dWh"
        else:
            print "Grad check failed for dWh: sum of error = %.9f"%(err/count)



from nltk.parse.stanford import StanfordParser
from nltk.treetransforms import chomsky_normal_form
from nltk.tree import Tree
from vector.wordvectors import WordVectors
parser = StanfordParser(path_to_jar="/Users/HyNguyen/Downloads/stanford-parser-full-2015-12-09/stanford-parser.jar",
                        path_to_models_jar="/Users/HyNguyen/Downloads/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar"
                        ,model_path="/Users/HyNguyen/Downloads/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

if __name__ == "__main__":

    rng = np.random.RandomState(4488)
    wordvector = WordVectors.load_from_text_format("model/word2vec.txt", "word2vec")
    pos_sent = []
    neg_sent = []
    with open("data/rt-polarity.neg.txt",mode="r") as f:
        neg_sent.append(f.readline())
        neg_sent.append(f.readline())
        neg_sent.append(f.readline())

    with open("data/rt-polarity.pos.txt",mode="r") as f:
        pos_sent.append(f.readline())
# -*- coding: utf-8 -*-
# export CLASSPATH=$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar
from __future__ import unicode_literals
import os
import sys
import io
import copy
import nltk
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser
parser = StanfordParser(
    model_path=
    "stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
)
stanford_dir = parser._classpath[0].rpartition('/')[0]
parser._classpath = tuple(find_jars_within_path(stanford_dir))


# from set_parser import parse_it
class Node(object):
    """
		A generic representation of a tree node. Includes a string label and a list of a children.
	"""
    def __init__(self, label):
        """
			Creates a node with the given label. The label must be a string for use with the PQ-Gram
			algorithm.
		"""
        self.label = label
        self.children = list()
Code example #38
##export CLASSPATH=$STANFORDTOOLSDIR/stanford-postagger-full-2015-04-20/stanford-postagger.jar:$STANFORDTOOLSDIR/stanford-ner-2015-04-20/stanford-ner.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-04-20/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar
##export STANFORD_MODELS=$STANFORDTOOLSDIR/stanford-postagger-full-2015-04-20/models:$STANFORDTOOLSDIR/stanford-ner-2015-04-20/classifiers

from nltk.tag.stanford import StanfordPOSTagger
from nltk.parse.stanford import StanfordParser
from nltk.corpus import stopwords

print("Sentence segmentation")
tokens = "this is pune.Pune is a great city"
tokens = tokens.split(".")
print(tokens)

print("\nTokenizer:")
tokens = "this is pune"
tokens = tokens.split(" ")
print(tokens)

print("\nStop Words Removal:")
stop_words = set(stopwords.words('english'))
filtered_words = [w for w in tokens if not w in stop_words]
print(filtered_words)

st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
print("\nPOS tagging:")
print(st.tag('What is the airspeed of an unladen swallow ?'.split()))

parser = StanfordParser(
    model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
print("\nSyntax Parser:")
print(list(parser.raw_parse("rahul daksh fire")))
Code example #39
os.environ['JAVAHOME'] = 'C:/Program Files/Java/jdk-9.0.4/bin/java.exe'
os.environ[
    'STANFORD_PARSER'] = 'C:/stanford/stanford-parser-full-2017-06-09/stanford-parser.jar'
os.environ[
    'STANFORD_MODELS'] = 'C:/stanford/stanford-parser-full-2017-06-09/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar'

path_model = 'C:/stanford/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
path_jar = 'C:/stanford/stanford-parser-full-2017-06-09/stanford-parser.jar'
path_models_jar = 'C:/stanford/stanford-parser-full-2017-06-09/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar'

# wordsAndTagsParser and typedDependenciesParser are custom classes assumed to be defined
# elsewhere in the original file (e.g. GenericStanfordParser subclasses that select a
# different Stanford output format); they are not part of NLTK itself.
wordsAndTags = wordsAndTagsParser(model_path=path_model,
                                  path_to_jar=path_jar,
                                  path_to_models_jar=path_models_jar)
parser = StanfordParser(model_path=path_model,
                        path_to_jar=path_jar,
                        path_to_models_jar=path_models_jar)
typedDependencies = typedDependenciesParser(model_path=path_model,
                                            path_to_jar=path_jar,
                                            path_to_models_jar=path_models_jar)

#a_sentence = "The strongest rain ever recorded in India shut down the financial hub of Mumbai, snapped communication lines, closed airports and forced thousands of people to sleep in their offices or walk home during the night, officials said today."
#a_sentence = a_sentence+" "+a_sentence
#sentence_list = [a_sentence]

# words and tags format for stanford parser
#sentences = wordsAndTags.raw_parse_sents(sentence_list)
#print(sentences)

# penn format for stanford parser
#sentences = parser.raw_parse_sents(sentence_list)
コード例 #40
0
ファイル: ask.py プロジェクト: yutongl1/661-NLP-Group-Project
from collections import Counter
import random
from ginger_python2 import get_ginger_result
import re

from nltk.parse.stanford import StanfordParser
from nltk.tag.stanford import StanfordNERTagger
from nltk.stem import WordNetLemmatizer

# stanford_pos = 'stanford/stanford-postagger-full-2015-04-20/'
# stanford_pos_model = stanford_pos + 'models/english-left3words-distsim.tagger'
# stanford_pos_jar = stanford_pos + 'stanford-postagger.jar'
# st_pos = StanfordPOSTagger(model_filename=stanford_pos_model, path_to_jar=stanford_pos_jar)

stanford_parser = 'stanford/stanford-parser-full-2015-04-20/'
eng_model_path = stanford_parser + "englishPCFG.caseless.ser.gz"
stanford_parser_model = stanford_parser + 'stanford-parser-3.5.2-models.jar'
stanford_parser_jar = stanford_parser + 'stanford-parser.jar'
st_parser = StanfordParser(model_path=eng_model_path,
                           path_to_models_jar=stanford_parser_model,
                           path_to_jar=stanford_parser_jar)

stanford_ner = 'stanford/stanford-ner-2015-04-20/'
stanford_ner_model1 = stanford_ner + 'classifiers/english.all.3class.distsim.crf.ser.gz'
stanford_ner_model2 = stanford_ner + 'classifiers/english.muc.7class.distsim.crf.ser.gz'
stanford_ner_jar = stanford_ner + 'stanford-ner.jar'
st_ner1 = StanfordNERTagger(model_filename=stanford_ner_model1,
                            path_to_jar=stanford_ner_jar)
st_ner2 = StanfordNERTagger(model_filename=stanford_ner_model2,
                            path_to_jar=stanford_ner_jar)

punctuation = ['\\', '/', ';', '@', '?', '^', '~', '`', '|']
lmtzr = WordNetLemmatizer()

# -------- Yutong's Editing ---------
コード例 #41
0
class SyntacticExtractor(SentenceExtractor):
    """ Tries to split sentences into sub-sentences so that each of them
        contains only one LU
    """

    splitter = None
    parser = None
    token_to_lemma = None
    all_verbs = None

    def setup_extractor(self):
        self.splitter = PunktSentenceSplitter(self.language)
        self.parser = StanfordParser(path_to_jar='dev/stanford-corenlp-3.6.0.jar',
                                     path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
                                     java_options=' -mx2G -Djava.ext.dirs=dev/')

        self.token_to_lemma = {}
        for lemma, tokens in self.lemma_to_token.iteritems():
            for t in tokens:
                self.token_to_lemma[t] = lemma
        self.all_verbs = set(self.token_to_lemma.keys())

    def extract_from_item(self, item):
        extracted = []
        bio = item.get(self.document_key, '').lower()
        url = item.get('url')
        if not bio or not url:
            logger.warn('skipping item without url or bio')
            return

        try:
            roots = self.parser.raw_parse_sents(self.splitter.split(bio))
        except (OSError, UnicodeDecodeError):
            logger.exception('cannot parse biography, skipping')
            return

        for root in roots:
            root = root.next()
            try:
                sub_sents = self.find_sub_sentences(root)
            except:
                logger.exception('cannot find sub-sentences')
                continue

            for sub in sub_sents:
                try:
                    text = ' '.join(chunk for _, chunk in self.find_terminals(sub))
                    logger.debug('processing text ' + text)
                    verbs = set(chunk for _, chunk in self.find_terminals(sub, 'V'))
                except:
                    logger.exception('cannot extract verbs or parse sentence')
                    continue

                found = verbs.intersection(self.all_verbs)

                if len(found) == 0:
                    logger.debug('No matching verbs found in sub sentence')
                elif len(found) == 1:
                    extracted.append({
                        'lu': self.token_to_lemma[found.pop()],
                        'text': text,
                        'url': url,
                    })
                else:
                    logger.debug('More than one matching verbs found in sentence %s: %s',
                                 text, repr(found))

        if extracted:
            logger.debug("%d sentences extracted...", len(extracted))
            return item, extracted
        else:
            logger.debug("No sentences extracted. Skipping the whole item ...")

    def find_sub_sentences(self, tree):
        # sub-sentences are the lowest S nodes in the parse tree
        if not isinstance(tree, Tree):
            return []

        s = reduce(lambda x, y: x + y, map(self.find_sub_sentences, iter(tree)), [])
        if tree.label() == 'S':
            return s or [tree]
        else:
            return s

    def find_terminals(self, tree, label=None):
        # finds all terminals in the tree with the given label prefix
        if len(tree) == 1 and not isinstance(tree[0], Tree):
            if label is None or tree.label().startswith(label):
                yield (tree.label(), tree[0])
        else:
            for child in tree:
                for each in self.find_terminals(child, label):
                    yield each
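
# A minimal standalone sketch of the "lowest S node" idea used by find_sub_sentences above,
# assuming a hand-written bracketing; lowest_s is a hypothetical helper, not an extractor method.
from nltk.tree import Tree

def lowest_s(tree):
    if not isinstance(tree, Tree):
        return []
    below = [s for child in tree for s in lowest_s(child)]
    if tree.label() == 'S':
        return below or [tree]
    return below

coordinated = Tree.fromstring(
    "(ROOT (S (S (NP (PRP He)) (VP (VBD sang))) (CC and) (S (NP (PRP she)) (VP (VBD danced)))))")
for sub in lowest_s(coordinated):
    print(' '.join(sub.leaves()))  # prints "He sang" then "she danced"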
コード例 #42
0
class question_handler():
    def __init__(self, conf, query_text):
        self.conf = conf
        self.stanford_parser_loc = self.conf.stanford_parser_home + 'stanford-parser.jar'
        self.stanford_parser_model_loc = self.conf.stanford_parser_home + 'stanford-parser-3.9.2-models.jar'
        self.parse_model = StanfordParser(self.stanford_parser_loc, self.stanford_parser_model_loc,
                                          model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
        self.query_text = query_text

    def question_phrase_extract(self):
        # Get the 'question phrase', for example:
        # 'How many' in 'How many movies in 2016', 'What actors' in 'What actors were born in 2014'

        # How: 1. get the lowest-level Wh phrase - exceptions are internal 'who'
        # 2. take the first child and traverse up to the first Wh phrase - rare exceptions where the first word isn't WP

        if '?' not in self.query_text:
            self.query_text = self.query_text + '?'

        it = self.parse_model.raw_parse(self.query_text)
        tree = [i for i in it]
        t = tree[0][0]

        wh_tags = [u'WHPP', u'WHNP', u'WHADJP', u'WHADVP']

        p = t.leaf_treeposition(0)
        assert (t[p[:-1]].label() in [u'WDT', u'WP', u'WRB'] + wh_tags)

        while len(p) > 0:
            p = p[:-1]
            if '+' not in t[p].label():
                if t[p].label() in wh_tags:
                    wh_type = t[p].label()
                    wh_position = p
                    break
            else:
                print("+ in label!!")

        all_leaves_positions = []
        for i in range(len(t.leaves())):
            all_leaves_positions.append(t.leaf_treeposition(i))

        wh_leaves_positions1 = wh_position + t[wh_position].leaf_treeposition(0)
        wh_leaves_positions2 = wh_position + t[wh_position].leaf_treeposition(len(t[wh_position].leaves()) - 1)

        # print wh_leaves_positions1, wh_leaves_positions2
        absolute_position1 = all_leaves_positions.index(wh_leaves_positions1)
        absolute_position2 = all_leaves_positions.index(wh_leaves_positions2)

        # print absolute_position1, absolute_position2, wh_type
        return absolute_position1, absolute_position2, wh_type

    def is_question(self, spacy_doc):
        if spacy_doc[0].tag_ in [u'WDT', u'WP', u'WRB']:
            return True

        it = self.parse_model.raw_parse(self.query_text)
        tree = [i for i in it]
        root = tree[0][0]
        root_label = root.label()
        if root_label in [u'SBARQ', u'SQ']:
            return True
        elif u'SBAR' in root_label:
            # TODO incorrect logic - 'who' coming in between
            nodes = [root]
            while type(nodes[0]) == type(root):
                label_str = ' '.join([n.label() for n in nodes])
                if u'WHADJP' in label_str or u'WHNP' in label_str or u'WHPP' in label_str:
                    return True
                level_nodes = []
                for n in nodes:
                    for i in n:
                        level_nodes.append(i)
                nodes = level_nodes
            return False
        else:
            return False
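
# A hypothetical usage sketch for the class above: it only reads conf.stanford_parser_home,
# so a bare config object with a placeholder install path (assumed here) is enough.
class _Conf(object):
    stanford_parser_home = '/opt/stanford-parser-full-2018-10-17/'  # placeholder location of the 3.9.2 jars

qh = question_handler(_Conf(), 'How many movies were released in 2016')
start, end, wh_type = qh.question_phrase_extract()
print(start, end, wh_type)  # leaf span of the question phrase and its tag, e.g. WHADJP/WHNP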
コード例 #43
0
ファイル: parser.py プロジェクト: AmalfiTrader/Text-Analysis
    """
    doc = spacy_nlp(sent)
    for sent in doc.sents:
        for token in sent:
            if token.tag_ == 'WRB':
                if token.nbor().tag_ == 'JJ':
                    if token.nbor().nbor().tag_ in ('NN', 'NNS'):
                        print(conversion_chart)


##########################################################################
# Stanford/NLTK
##########################################################################
# Required: download Stanford jar dependencies
# https://stackoverflow.com/questions/13883277/stanford-parser-and-nltk
stanford_parser = StanfordParser(
    model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")


def nltk_stanford_parse(sent):
    """
    Use the pretrained Stanford model to extract a constituency parse tree
    for use by other methods
    :param sent: str
    :return: list of trees
    """
    parse = stanford_parser.raw_parse(sent)
    return list(parse)
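
# A quick usage sketch for the helper above; the sentence is arbitrary and the Stanford
# jars are assumed to be discoverable as described in the link above.
trees = nltk_stanford_parse("The quick brown fox jumps over the lazy dog")
trees[0].pretty_print()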


def nltk_stanford_tree(sent):
    """
コード例 #44
0
def extract_subcat_insights(subcat):
    #if not subcategory_tweets_queue.empty():
    subcat_dict=subcat#subcategory_tweets_queue.get(False)
    cat_details=subcat_dict['cat_details']
    top_subcat_tweets_df=subcat_dict['top_subcat_tweets_df']
    top_trending_subcat = subcat_dict['top_trending_subcat']
    print os.getpid()," - ",top_trending_subcat

    english_parser = StanfordParser(os.path.join(os.path.curdir,'resources','stanford-parser.jar'),os.path.join(os.path.curdir,'resources','stanford-parser-3.4.1-models.jar'))
    clean_tweets_list = []
    raw_tweets = []

    unigrams={}
    phrases={}
    hashtags={}
    entites={}
    for tweet in top_subcat_tweets_df['tweet']:
        try:
            tweet = convert_to_ascii(tweet)
            tknzr = TweetTokenizer(reduce_len=True)
            tweet = ' '.join(tknzr.tokenize(tweet))
            raw_tweets.append(tweet)
            sentences = [
                clean_tags.clean(sent).replace('<hashtag>', '').replace('<allcaps>', '') for
                sent in tokenize.sent_tokenize(tweet)]
        except Exception as e:
            logger.debug(e.message)
            logger.debug('Error on line {} in file {}'.format(sys.exc_info()[-1].tb_lineno,
                                                              sys.exc_info()[-1].tb_frame.f_code.co_filename))
        for each_sent in sentences:
            try:
                clean_tweets_list.append(
                    each_sent.encode('utf-8').translate(string.maketrans("", ""), string.punctuation))

            except Exception as e:
                logger.debug(e.message)
                logger.debug('Error on line {} in file {}'.format(sys.exc_info()[-1].tb_lineno,
                                                                  sys.exc_info()[-1].tb_frame.f_code.co_filename))
    total_tweets_cat = len(raw_tweets)
    raw_tweets_doc = ' '.join(raw_tweets)
    try:
        entites['category'] = cat_details['category']
        entites['subcategory'] = top_trending_subcat
        entites['rank'] = cat_details['rank']
        entites['total_tweets_cat'] = total_tweets_cat
        entites['sentences'] = clean_tweets_list
        entities_queue.put(entites)

    except Exception as e:
        entites = {}
        entities_queue.put(entites)
        entities_queue.put("entities_done")
        logger.debug(e.message)
        logger.debug('Error on line {} in file {}'.format(sys.exc_info()[-1].tb_lineno,
                                                          sys.exc_info()[-1].tb_frame.f_code.co_filename))
    try:
        hashtags['category'] = cat_details['category']
        hashtags['subcategory'] = top_trending_subcat
        hashtags['rank'] = cat_details['rank']
        hashtags['hashtags'] = collections.Counter(re.findall(r"#(\w+)", raw_tweets_doc.lower())).most_common(50)
        hashtags['total_tweets_cat'] = total_tweets_cat
        hashtags_queue.put(hashtags)
    except Exception as e:
        hashtags = {}
        hashtags_queue.put(hashtags)
        logger.debug(e.message)
        logger.debug('Error on line {} in file {}'.format(sys.exc_info()[-1].tb_lineno,
                                                          sys.exc_info()[-1].tb_frame.f_code.co_filename))
    try:
        kw_unigrams, kw_phrases = information_parser.fetch_phrases_and_words(clean_tweets_list, english_parser)

        keywords_uni = [eachWord.encode('utf-8').translate(string.maketrans("", ""), string.punctuation).lower() for
                        eachWord in kw_unigrams if eachWord not in stopwords.words('english')]
        kw_uni = collections.Counter(keywords_uni).most_common(50)
        unigrams['category'] = cat_details['category']
        unigrams['subcategory'] = top_trending_subcat
        unigrams['rank'] = cat_details['rank']
        unigrams['kw_uni'] = kw_uni
        unigrams['total_tweets_cat'] = total_tweets_cat
        unigram_queue.put(unigrams)
    except Exception as e:
        logger.debug(e.message)
        logger.debug('Error on line {} in file {}'.format(sys.exc_info()[-1].tb_lineno,
                                                          sys.exc_info()[-1].tb_frame.f_code.co_filename))
        unigrams = {}
        unigram_queue.put(unigrams)
    try:
        keywords_phr = [eachWord.encode('utf-8').translate(string.maketrans("", ""), string.punctuation).lower() for
                        eachWord in kw_phrases if eachWord not in stopwords.words('english')]
        kw_phr = collections.Counter(keywords_phr).most_common(50)
        phrases['category'] = cat_details['category']
        phrases['subcategory'] = top_trending_subcat
        phrases['rank'] = cat_details['rank']
        phrases['kw_phr'] = kw_phr
        phrases['total_tweets_cat'] = total_tweets_cat
        phrases_queue.put(phrases)
    except Exception as e:
        logger.debug(e.message)
        logger.debug('Error on line {} in file {}'.format(sys.exc_info()[-1].tb_lineno,
                                                          sys.exc_info()[-1].tb_frame.f_code.co_filename))
        phrases = {}
        phrases_queue.put(phrases)
コード例 #45
0
ファイル: parser.py プロジェクト: techscientist/Lango
def __init__(self):
    self.parser = StanfordParser(
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    stanford_dir = self.parser._classpath[0].rpartition('/')[0]
    self.parser._classpath = tuple(find_jars_within_path(stanford_dir))
コード例 #46
0
# find_entity_t = test.find_entity()
# find_VP_t = test.firstVP()
# test.drawTree()
test.show(firstNP_t)
# test.show(find_entity_t)
# test.show(find_VP_t)
# # test.show(find_entity_t)
# test.show(firstMinNP_t)
result = test.find_realtionship(firstNP_t)
print(result)
test.drawTree()
#
#
# print(test.rel)
# test.show(test.find_realtionship())

# Comparison experiment
chi_parser = StanfordParser(path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar',
                            path_to_models_jar='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
                            model_path='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
data_dir='../stanford-segmenter-2018-02-27/'
segmenter = StanfordSegmenter(path_to_jar=data_dir+"stanford-segmenter-3.9.1.jar",
                              path_to_sihan_corpora_dict=data_dir+"/data", path_to_model=data_dir+"/data/pku.gz",
                              path_to_dict=data_dir+"/data/dict-chris6.ser.gz",
                              java_class='edu.stanford.nlp.ie.crf.CRFClassifier',
                              )
result=segmenter.segment(test_str)
result_ls = result.split()
ch_tree = list(chi_parser.parse(result_ls))[0]
ch_tree.draw()
# print(result)
コード例 #47
0
        example['question_raw_tree'] = raw_tree

        full_tree_data.append(example)

    assert len(raw_tree_data) == 0
    return full_tree_data


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tree Pre-processing script for training and dev data.')
    parser.add_argument('-mini', action='store_true', default=False, help='To generate mini version of dataset.')
    parser.add_argument('-dev_only', action='store_true', default=False)
    args = parser.parse_args()

    mini_str = '/mini' if args.mini else ''
    # note: this rebinds `parser` (previously the argparse parser) to the Stanford parser
    parser = StanfordParser(java_options='-mx5g')

    categories = ['dev'] if args.dev_only else ['dev', 'train']

    for category in categories:
        print('Generating %s squad trees...' % category)

        version_suffix = '_v2.0' if CONSTANTS['SQUAD_VERSION'] == 2.0 else ''
        tokenized_data_in_path = 'data%s/squad_%s_tokens%s.json' % (mini_str, category, version_suffix)
        tokenized_data = json.load(open(tokenized_data_in_path))

        tree_data = generate_raw_trees(tokenized_data, _generate_raw_trees(tokenized_data, parser))

        out_path = 'data%s/squad_%s_raw_trees%s.npy' % (mini_str, category, version_suffix)
        save_as_pk(tree_data, out_path)
        print('Saved %s squad raw trees to %s' % (category, out_path))
コード例 #48
0
ファイル: nlp.py プロジェクト: elainemartin/text-analytics
from nltk.tokenize import sent_tokenize
from nltk.tag.stanford import NERTagger
from nltk.parse.stanford import StanfordParser
from corenlp import StanfordCoreNLP

wsj = open('wsj_0063.txt')

#extract named entities
nerTagger=NERTagger('stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz', 'stanford-ner-2014-08-27/stanford-ner.jar')
ner = []
for line in wsj:
	ner.append(nerTagger.tag(unicode(line,errors='ignore').split()))

#parse sentences
paragraph = ""
for line in wsj:
	paragraph += line.replace('\n',' ')
sentences = sent_tokenize(paragraph)
parser = StanfordParser('stanford-parser-full-2014-10-31/stanford-parser.jar','stanford-parser-full-2014-10-31/stanford-parser-3.5.0-models.jar')
parsed = parser.raw_parse_sents(sentences)

#coreference
corenlp_dir = "stanford-corenlp-full-2014-08-27"
corenlp = StanfordCoreNLP(corenlp_dir)
corenlp.batch_parse(paragraph)

wsj.close()
コード例 #49
0
from Record import Record
from syntaxJudge import *
from config import *

import ServerPrint as sp
import numpy as np
import nltk
import os
from nltk.parse.stanford import StanfordParser
"""
    This program is an implementation of dependency parsing.
    The parser is the standard Stanford parser.
"""

# Load stanford parser
model_path = "/home/sunner/nltk_data/stanford-english-corenlp-2016-01-10-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
dep_parser = StanfordParser(model_path=model_path)
sp.show("Finish stanford parser loading")

# Value judgement(should be initialized)
valueCount = 0  # Counter that helps the parser track how many NPs have been walked through
valuePhrase = None  # Stores the value phrase; helps organize the value sentence
itemPhrase = None  # Stores the item phrase; helps organize the item sentence
subjectSentence = None  # Subject that may be used when the subject is an item; helps organize the subject sentence
valueSentence = None  # Stores the value phrase list and the value info
itemSentence = None  # Stores the item phrase list and the item info

# Variables
wordEmbedded = np.array([[1, 0], [0, 1]])


def parseSentence(parseTree):
コード例 #50
0
ファイル: stanford_utils.py プロジェクト: BinbinBian/WikiQA-1
def new_parser():
    os.environ['JAVAHOME'] = which_java
    os.environ['CLASSPATH'] = parser_path
    os.environ['STANFORD_MODELS'] = parser_path
    return StanfordParser()
コード例 #51
0
from ..lib.Tree import Tree

SETTINGS = utils.read_settings()
MAXLENGTH = 200  # number of words in the longest possible sentence. (longer sentences will be discarded)

stanford_parser_dir = os.path.join(os.getcwd(),
                                   SETTINGS.get('paths', 'stanfordParser'))
my_path_to_jar = os.path.join(stanford_parser_dir, 'stanford-parser.jar')
my_path_to_models_jar = os.path.join(stanford_parser_dir,
                                     'stanford-parser-3.6.0-models.jar')
eng_model_path = os.path.join(
    stanford_parser_dir,
    'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')

PARSER = StanfordParser(model_path=eng_model_path,
                        path_to_models_jar=my_path_to_models_jar,
                        path_to_jar=my_path_to_jar,
                        java_options='-mx5000m')
PARSER._classpath = tuple([j for j in PARSER._classpath] + [
    stanford_parser_dir + '/slf4j-api.jar', stanford_parser_dir +
    '/slf4j-simple.jar'
])
SENT_TOKENIZER = nltk.data.load('tokenizers/punkt/english.pickle')


def get_sentences(f_path, sent_tokenize=False):
    with codecs.open(f_path, 'r', 'utf-8') as f:
        sents = [sent.strip() for sent in f.readlines()]
    if sent_tokenize:
        sents = SENT_TOKENIZER.tokenize(' '.join(sents))
    return sents
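
# A small usage sketch; 'data/sample.txt' is a placeholder path, and raw_parse_sents is
# the standard NLTK call on the PARSER object constructed above.
if __name__ == '__main__':
    sents = get_sentences('data/sample.txt', sent_tokenize=True)
    trees = [next(parse_iter) for parse_iter in PARSER.raw_parse_sents(sents)]
    print('parsed %d sentences' % len(trees))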
コード例 #52
0
import nltk
import os
from nltk.parse.stanford import StanfordParser
from nltk.tag.stanford import StanfordPOSTagger, StanfordNERTagger
from nltk.tokenize.stanford import StanfordTokenizer
from nltk.tree import *
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

s = raw_input("Enter string")
parser = StanfordParser(
    model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
o = parser.parse(s.split())
tree1 = [tree for tree in parser.parse(s.split())]
parsetree = tree1[0]
dict = {}
#output = '(ROOT (S (PP (IN As) (NP (DT an) (NN accountant))) (NP (PRP I)) (VP (VBP want) (S (VP (TO to) (VP (VB make) (NP (DT a) (NN payment))))))))'
#parsetree=Tree.fromstring(output)
#parsetree=parser.raw_parse(s)
print parsetree

print "***********subtrees**********"

ptree = ParentedTree.convert(parsetree)
for sub in ptree.subtrees():
    #print sub
    dict[sub.treeposition()] = 0
# print sub.label()

print "----------------------------------------------"
コード例 #53
0
ファイル: offlineDEP.py プロジェクト: parry2403/CodeRepo
                            print 's'
                          #    print depsEDU

        dep.write(str(curr.strip()))
        dep.write("@#%^&*")
        for wrds in depsEDU:
                dep.write(str(wrds))
                dep.write("\t")
        dep.write("\n")
        depsEDU =[]
        return wrdroot

#mys1 = "dpossall"  + ".txt"
#dep = open(mys1,"w")
english_postagger = POSTagger('../postagger/models/english-bidirectional-distsim.tagger', '../postagger/stanford-postagger.jar')
english_parser = StanfordParser('../postagger/stanford-parser.jar', '../parser/stanford-parser-3.5.0-models.jar')

i=0
for fname in os.listdir('test_data'):

     if fname.endswith('.edus') :
            print i
            print fname
            i=i+1
            if True:
                f = open(os.path.join('test_data',fname),'r')
                mys1 =os.path.join('test_data', fname.split(".")[0] +".dep")
                print mys1
                dep = open(mys1,"w")
                data = f.read().splitlines()
                edus = deque()
コード例 #54
0
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize
from nltk.parse.stanford import StanfordParser

import script_wrapper as stanford_parser


sentence = "Dempsey was drafted by Major League Soccer club New England Revolution."
st = StanfordNERTagger(model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
tags = st.tag(word_tokenize(sentence))
print(tags)

prev_tag_name = tags[0][1]
cur_entity = tags[0][0]
entities = {}
for i in range(1, len(tags)):
    cur_tag = tags[i]
    cur_token = cur_tag[0]
    cur_tag_name = cur_tag[1]
    if cur_tag_name == prev_tag_name:
        cur_entity = cur_entity + " " + cur_token
    else:
        if not prev_tag_name in entities:
            entities[prev_tag_name] = []
        entities[prev_tag_name].append(cur_entity)
        cur_entity = cur_token
    prev_tag_name = cur_tag_name
# flush the entity collected for the final run of tags
if prev_tag_name not in entities:
    entities[prev_tag_name] = []
entities[prev_tag_name].append(cur_entity)
if 'O' in entities:
    del entities['O']
print(entities)

parser = StanfordParser(path_to_jar=stanford_parser.stanford_parser_jar, path_to_models_jar=stanford_parser.stanford_model_jar)
print(next(parser.raw_parse("Dempsey was drafted by Major League Soccer club New England Revolution.")))
コード例 #55
0
                           path_to_jar=stanford_pos_jar)

# # NER Tagging:
stanford_ner = '../stanford/stanford-ner-2015-04-20/'
stanford_ner_model = stanford_ner + 'classifiers/english.muc.7class.distsim.crf.ser.gz'
stanford_ner_jar = stanford_ner + 'stanford-ner.jar'
ner = StanfordNERTagger(model_filename=stanford_ner_model,
                        path_to_jar=stanford_ner_jar)

# Set up the stanford PCFG parser
stanford_parser_dir = '../stanford/stanford-parser-full-2015-04-20/'
eng_model_path = stanford_parser_dir + "englishPCFG.ser.gz"
my_path_to_models_jar = stanford_parser_dir + "stanford-parser-3.5.2-models.jar"
my_path_to_jar = stanford_parser_dir + "stanford-parser.jar"
parser = StanfordParser(model_path=eng_model_path,
                        path_to_models_jar=my_path_to_models_jar,
                        path_to_jar=my_path_to_jar)

stopWords = stopwords.words('english')


# cur: current tree
# label: target label
# record: candidates
def searchLabel(cur, label, record):
    answer = None
    if cur.label() == label:
        # record.append(cur.leaves())
        record.append(cur)
    for i in cur:
        # print "--",    (i), isinstance(i, (str, unicode)), i
コード例 #56
0
# -*- coding: utf-8 -*-
"""
Created on Sat May 13 01:29:33 2017

@author: DIP
"""

from nltk.parse.stanford import StanfordParser

sentence = 'The quick brown fox jumps over the lazy dog'

# create parser object
scp = StanfordParser(path_to_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser.jar',
                   path_to_models_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')


# get parse tree
result = list(scp.raw_parse(sentence)) 
tree = result[0]

# print the constituency parse tree
print(tree) 

# visualize constituency parse tree
tree.draw() 
コード例 #57
0
ファイル: main.py プロジェクト: 5aurabhpathak/all-I-ve-done
#!/bin/env python3.5
#Author: Saurabh Pathak
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser
from nltk.tokenize import sent_tokenize
from nltk import download
from nltk.tree import ParentedTree
import os

#download('punkt', quiet=True)
#download('names', quiet=True)

os.environ['CLASSPATH'] = (os.getenv('CLASSPATH', '') + ':'
                           + os.path.join(os.getcwd(), 'data/stanford-parser-full-2015-12-09/stanford-parser.jar') + ':'
                           + os.path.join(os.getcwd(), 'data/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'))

parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
parser._classpath = find_jars_within_path(os.path.join(os.getcwd(), 'data/stanford-parser-full-2015-12-09'))

text = input('Enter some text:')

tlist = [ParentedTree.fromstring(str(list(parsetree)[0])) for parsetree in parser.raw_parse_sents(sent_tokenize(text))]

tlist2 = [tree.copy(True) for tree in tlist]
from hobbs import *
from lappinleasse import *

print('Input text was:\n', text)
def resolve(ls, algo):
    print('\nResolving with', algo)
    i = -1
    for parsetree in ls:
        i += 1
コード例 #58
0
# Better question: Who is going to school on Monday?
# (Maybe)Even better question: Who is going to school on Monday to work on the project? **OR** Who is going to work on the project on Monday?

# Why:
# Not-so-good question: Why is Rohan going to school by car along with his friend on Monday?
# Good question: Why is Rohan going to school on Monday?

# How:
# Very Bad question: How is Rohan going to school by car on Monday to work on the project? (Answer: along with his friend. 
# This is valid but not a good question)
# Not-so-good question: How is Rohan going to school along with his friend on Monday to work on the project?

from nltk.tokenize import PunktSentenceTokenizer
from nltk.parse.stanford import StanfordParser

with open("example_article.txt") as f:
	tokenizer = PunktSentenceTokenizer()
	sentences = tokenizer.tokenize(f.read().decode('utf-8').replace("\n"," "))
	parser=StanfordParser()	

	print len(sentences)
	print len([ x for x in sentences if "is" in x])

	sentences[0] = "I am going to watch a movie in the evening."
	sentences[0] = "I have always wondered how I have always been so good on the guitar."
	sentences[0] =  "Our dinner has been eaten by the dog."
	sentences[0] = "Playing golf is my favorite pastime"
	sentences[0] = "He plays golf for a living"
	
	sentences[0] = sentences[0].rstrip('.')
	parseTree = list(parser.raw_parse((sentences[0])))
	print sentences[0] 
	
	# the parse tree for the entire sentence
コード例 #59
0
ファイル: parser.py プロジェクト: ayoungprogrammer/Lango
def __init__(self):
    self.parser = StanfordParser()
コード例 #60
0
ファイル: Playaround.py プロジェクト: abiraja2004/NLP_Project
import nltk
import nltk.internals
from nltk.tag.stanford import StanfordNERTagger
from nltk.parse.stanford import StanfordParser, StanfordDependencyParser

nltk.internals.config_java(
    "C:/Program Files (x86)/Java/jre1.8.0_151/bin/java.exe")
eng_tagger = StanfordNERTagger(model_filename = 'C:\\Users\\jingx\\Dropbox\\MSCF Course\\NLP\\stanford-ner-2017-06-09\\classifiers\\english.all.3class.distsim.crf.ser.gz',\
                               path_to_jar = 'C:\\Users\\jingx\\Dropbox\\MSCF Course\\NLP\\stanford-ner-2017-06-09\\stanford-ner.jar')
#print(eng_tagger.tag('Rami Eid is studying at Stony Brook University in NY'.split()))
a = eng_tagger.tag(
    'Rami Eid is studying at Stony Brook University in NY and loves Mike'.
    split())

#for tag, chunk in groupby(a, lambda x:x[1]):
#    if tag != "O":
#        print("%-12s"%tag, " ".join(w for w, t in chunk))
#b = eng_parser.parse("Rami Eid is studying at Stony Brook University in NY".split())

eng_parser = StanfordParser(
    r"C:\Users\jingx\Dropbox\MSCF Course\NLP\stanford-parser-full-2017-06-09\stanford-parser.jar",
    r"C:\Users\jingx\Dropbox\MSCF Course\NLP\stanford-parser-full-2017-06-09\stanford-parser-3.8.0-models.jar"
)
#print(list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split())))

eng_parser = StanfordDependencyParser(
    r"C:\Users\jingx\Dropbox\MSCF Course\NLP\stanford-parser-full-2017-06-09\stanford-parser.jar",
    r"C:\Users\jingx\Dropbox\MSCF Course\NLP\stanford-parser-full-2017-06-09\stanford-parser-3.8.0-models.jar"
)
res = list(
    eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
#for row in res[0].triples():
#    print(row)

trainfile = r'C:\Users\jingx\Dropbox\MSCF Course\NLP\NLP_Project\data\set1\a6.txt'
with open(trainfile, encoding='utf8') as fin:
    train = fin.readlines()