Example #2
import networkx as nx
from nltk.parse.stanford import StanfordParser, StanfordNeuralDependencyParser


class RelationExtractor:
    '''relation extraction'''

    def __init__(
            self,
            model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
    ):
        '''initialization'''
        self.eng_parser = StanfordParser()
        # self.eng_parser_dependency = StanfordDependencyParser()
        self.eng_parser_dependency = StanfordNeuralDependencyParser()

    def PCFG_parse(self, sentence, draw_graph=True):
        res = list(self.eng_parser.parse(sentence.split()))
        if draw_graph:
            res[0].draw()
        return res

    def dependency_parse(self, sentence, draw_graph=True):
        res = list(self.eng_parser_dependency.parse(sentence.split()))
        if draw_graph:
            res[0].tree().draw()
        return res

    def generate_relation(self,
                          sentence,
                          nes,
                          draw_graph=False,
                          dep_path_max=10**2):
        # Candidate pairs: named entities with differing types.
        pairs = [(nes[i], nes[j]) for i in range(len(nes) - 1)
                 for j in range(i + 1, len(nes)) if nes[i][1] != nes[j][1]]
        # Skip very long sentences and sentences with too few candidate pairs.
        if len(sentence.split()) > 60 or len(pairs) < 3:
            return

        def get_relation(n1, n2):
            # Entity spans appear to be 0-based token indices (n[2]..n[3]);
            # dependency-graph node ids are 1-based, hence the +1 shift.
            get_range = lambda n: range(n[2] + 1, n[3] + 2)
            e1ind, e2ind = get_range(n1), get_range(n2)
            dep_path = nx.shortest_path(G, source=e1ind[-1], target=e2ind[-1])
            vbs = [n for n in dep_path if G.nodes[n]['tag'].startswith('VB')]
            if len(dep_path) <= dep_path_max and vbs:
                ws = sentence.split()
                # The last verb on the shortest dependency path is taken
                # as the relation word.
                r = G.nodes[vbs[-1]]['word']
                e1 = ' '.join(ws[i - 1] for i in e1ind)
                e2 = ' '.join(ws[i - 1] for i in e2ind)
                print('{0}\n{1} | {2} | {3} | {4}'.format(
                    sentence, e1, e2, r, len(dep_path)))
                return e1, e2, r, len(dep_path)
            else:
                return None, None, None, None

        rels = []
        res = self.dependency_parse(sentence, draw_graph=False)
        # Build an undirected graph over the dependency parse so that
        # shortest dependency paths between entities can be computed.
        G = nx.Graph()
        for addr, node in res[0].nodes.items():
            if node['word'] is None:  # skip the artificial ROOT node
                continue
            G.add_node(addr, word=node['word'], tag=node['tag'])
            for dep_addrs in node['deps'].values():
                G.add_edges_from((addr, dep) for dep in dep_addrs)
        for n1, n2 in pairs:
            rels.append(get_relation(n1, n2))
        return rels
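
A minimal usage sketch. The entity tuples are hypothetical: each nes entry is assumed to be (text, type, start_token, end_token) with 0-based token offsets, as implied by get_range and the nes[i][1] type check, and the CoreNLP jars are assumed to be on the CLASSPATH:

extractor = RelationExtractor()
sentence = 'Bell makes and distributes electronic products in Los Angeles'
nes = [('Bell', 'ORG', 0, 0),  # hypothetical NER output
       ('products', 'MISC', 5, 5),
       ('Los Angeles', 'LOC', 7, 8)]
relations = extractor.generate_relation(sentence, nes)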
Example #3
from nltk.parse.stanford import StanfordNeuralDependencyParser

split = ["train", "trial", "test_annotated"]
for s in split:
    f = open("SICK_" + s + ".txt", "r")
    lines = f.readlines()
    sentences = []
    labels = []
    for i in range(1, len(lines)):  # skip the header line
        a = lines[i].split("\t")
        sentences.extend([a[1], a[2]])  # both sentences of the pair
        labels.extend([a[3], a[3]])  # the pair's label, once per sentence

    parser = StanfordNeuralDependencyParser(
        model_path="edu/stanford/nlp/models/parser/nndep/english_UD.gz")
    # The neural parser needs slf4j on the classpath, which NLTK does not
    # add by itself, so append it manually.
    stanford_dir = parser._classpath[0].rpartition('/')[0]
    slf4j_jar = stanford_dir + '/slf4j-api.jar'
    parser._classpath = list(parser._classpath) + [slf4j_jar]
    parser.java_options = '-mx5000m'  # To increase the amount of RAM it can use.
    out_file = open("SICK_dep_parse_" + s + ".txt", "w")
    #a=[parse.tree()._pformat_flat(" ","()",False) for parse in parser.raw_parse("The young boys are playing outdoors and the man is smiling nearby")]

    parsed_sents = [[
        parse.tree()._pformat_flat("", "()", False) for parse in dep_graphs
    ] for dep_graphs in parser.raw_parse_sents(sentences)]
    for i in range(len(parsed_sents)):
        for j in range(len(parsed_sents[i])):
            # One output line per parse: "(label (parse))".
            sent1 = "(" + labels[i] + " " + parsed_sents[i][j] + ")"
            out_file.write(sent1 + "\n")
    out_file.close()
    f.close()
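
Each output line has the form (label parse). A minimal sketch, assuming the train split was written as above, for reading the labels back out:

with open("SICK_dep_parse_train.txt") as fh:
    for line in fh:
        label, _, parse = line.strip()[1:-1].partition(" ")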
Example #4
import os
import re
import nltk
from html.parser import HTMLParser  # sgmllib.SGMLParser is gone in Python 3
from langdetect import detect
from nltk.parse.stanford import StanfordNeuralDependencyParser


def main(argv):
  indir = argv[0]
  files = os.listdir(indir)
  # Count the .txt files up front for progress reporting.
  numfiles = 0
  for filename in files:
    if filename.endswith('.txt'):
      numfiles = numfiles + 1

  cnt = 0
  for filename in files:
    if not filename.endswith('.txt'):
      continue
    cnt = cnt + 1
    print('(%d/%d) - Processing %s' % (cnt, numfiles, filename))
    fname = os.path.join(indir, filename)
    text_file = open(fname, "r+")
    text = text_file.read()

    # ----- remove html tags -----
    class TextExtracter(HTMLParser):  # HTMLParser stands in for Python 2's SGMLParser
      def __init__(self):
        self.text = []
        HTMLParser.__init__(self)
      def handle_data(self, data):
        self.text.append(data)
      def getvalue(self):
        return ''.join(self.text)  # join this instance's text, not the global ex
    ex = TextExtracter()
    ex.feed(text)
    text = ex.getvalue()
        
    # ----- remove urls -----
    URL_RE = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
    text = URL_RE.sub('', text)

    # ----- remove non-ascii characters -----
    text = re.sub(r'[^\x00-\x7f]', r'', text)

    # ----- remove the literal "urlLink" marker -----
    text = text.replace("urlLink", "")
     
     # #-------remove emoji--------------------
     # emoji_RE=re.compile(r'\*\s[a-z]+\s\*')
     # emoj_list = (':)',': )',':~',':-)',': - )',':-(',': - (',':(',': (',':B',':|','8-)',':<',':$',':X',': X',':Z',':\'(',':-|',': - |',':@',':P',': P',':D',': D',':O',':+','Cb',':Q',':T',',@P',',@-D',':d',',@o',':g','|-)',':!',':L',':>',',@f',':-S',',@x',',@@',',@!','xx','&-(','B-)','<@','@>',':-O',': - O','>-|','P-(',':\'|','X-)',':*','@x','8*','pd','<W>','@)','jj','@@','lvu','<L>','<O>','/[]','#-0','/[]','<&','&>','oY')
     # text=emoji_RE.sub('',text)
     # for i in xrange(len(emoj_list)):
     #  text=text.replace(emoj_list[i],'')

    # Parse sentences and keep only the valid ones.
    dep_parser = StanfordNeuralDependencyParser()
    sents = nltk.sent_tokenize(text)
    valid_sents = []
    invalid_sents = 0
    for sent in sents:
      try:
        dep_parser.raw_parse(sent)  # unparseable sentences raise and are dropped
        if detect(sent) == 'en':  # retain only English sentences
          valid_sents.append(sent)
        else:
          invalid_sents = invalid_sents + 1
      except Exception:
        invalid_sents = invalid_sents + 1
    print('%d/%d sentences were valid' % (len(sents) - invalid_sents, len(sents)))
    text = ' '.join(valid_sents)
                
    text_file.close()
    if len(text) > 0:
      text_file = open(fname, "w")
      text_file.write(text)
      text_file.close()
    else:
      print('Deleting %s' % fname)
      os.remove(fname)
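
A minimal entry point, assuming the input directory is passed as the first command-line argument:

if __name__ == '__main__':
  import sys
  main(sys.argv[1:])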
Example #5
from nltk.parse.stanford import StanfordNeuralDependencyParser


# parser_modelpath, parser_jarpath and parser_modeljar are module-level
# configuration values defined elsewhere in the original project.
def load_dep_parser():
    return StanfordNeuralDependencyParser(
        model_path=parser_modelpath,
        path_to_jar=parser_jarpath, path_to_models_jar=parser_modeljar,
        java_options='-Xmx4g')
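
A sketch of the configuration this function assumes. The model path reuses the one from Example #3; the jar paths are hypothetical and should point at your own CoreNLP installation:

parser_modelpath = 'edu/stanford/nlp/models/parser/nndep/english_UD.gz'
parser_jarpath = '/opt/corenlp/stanford-corenlp-3.7.0.jar'  # hypothetical path
parser_modeljar = '/opt/corenlp/stanford-corenlp-3.7.0-models.jar'  # hypothetical path

dep_parser = load_dep_parser()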
Example #6
from nltk.parse.stanford import StanfordNeuralDependencyParser
import os
import json

os.environ["JAVAHOME"] = 'C:/Program Files/Java/jdk1.8.0_121/bin/java.exe'
os.environ['CLASSPATH'] = ('C:/Users/PuneetGrover/AppData/Roaming/nltk_data/'
                           'stanford-corenlp/stanford-corenlp-3.7.0.jar')
_path_to_models_jar = 'C:/Users/PuneetGrover/Downloads/stanfordNLP/stanford-english-corenlp-2016-10-31-models.jar'

dep_parser = StanfordNeuralDependencyParser(
    path_to_models_jar=_path_to_models_jar, java_options='-mx4g')

#print([parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")])
print(
    json.dumps([
        list(parse.triples()) for parse in dep_parser.raw_parse(
            "Bell, based in Los Angeles, makes and distributes electronic, computer and building products."
        )
    ],
               indent=4))
Example #7
from nltk.parse.stanford import StanfordNeuralDependencyParser
import os

os.environ['CLASSPATH'] = ('C:/Users/PuneetGrover/AppData/Roaming/nltk_data/'
                           'stanford-corenlp/stanford-corenlp-3.7.0.jar')
_path_to_models_jar = 'C:/Users/PuneetGrover/Downloads/stanfordNLP/stanford-english-kbp-corenlp-2016-10-31-models.jar'
dep_parser = StanfordNeuralDependencyParser(
    path_to_models_jar=_path_to_models_jar, java_options='-mx4g')
print([
    parse.tree() for parse in dep_parser.raw_parse(
        "The quick brown fox jumps over the lazy dog.")
])
print([
    list(parse.triples()) for parse in dep_parser.raw_parse(
        "The quick brown fox jumps over the lazy dog.")
])
sum([[parse.tree() for parse in dep_graphs]
     for dep_graphs in dep_parser.raw_parse_sents((
         "The quick brown fox jumps over the lazy dog.",
         "The quick grey wolf jumps over the lazy fox."))], [])
sum([[parse.tree() for parse in dep_graphs]
     for dep_graphs in dep_parser.parse_sents((
         "I 'm a dog".split(),
         "This is my friends ' cat ( the tabby )".split()))], [])