Example #1
0
def parser_nltk(word_lists, filename):
    """Parse every non-blank sentence with the Stanford parser via NLTK.

    Each sentence's parse trees are persisted into a shelve database keyed
    by the running sentence index, and a file-oriented string rendering of
    the first tree of each sentence is collected for the return value.

    word_lists -- iterable of whitespace-tokenizable sentence strings
    filename   -- path of the shelve database to write parses into
    Returns the list of rendered parse strings, one per parsed sentence.
    """
    os.environ['JAVAHOME'] = JAVA_PATH
    os.environ["STANFORD_PARSER"] = STANFORD_PARSER_PATH
    os.environ["STANFORD_MODELS"] = STANFORD_PARSER_MODELS
    chinese_parser = StanfordParser(model_path=nltk_parse_model_path)
    # NOTE(review): _classpath is a private NLTK attribute; it is widened so
    # every jar sitting next to the parser jar lands on the JVM classpath.
    stanford_dir = chinese_parser._classpath[0].rpartition('/')[0]
    chinese_parser._classpath = tuple(find_jars_within_path(stanford_dir))
    chinese_parser.java_options = '-mx15000m'  # 15 GB JVM heap for big parses
    all_parser_sentence = []
    # Bug fix: the shelf was opened into a variable named `file` (shadowing
    # the builtin) and never closed, which can lose buffered writes.
    shelf = shelve.open(filename)
    try:
        flag = 0
        for sentence in word_lists:
            stripped = sentence.strip()
            if stripped != "":
                res = list(chinese_parser.parse(stripped.split()))
                new_str = return_str_tofile(sentence_parse=str(res[0]))
                shelf[str(flag)] = res
                all_parser_sentence.append(new_str)
                flag += 1
                print("###### NLTK Dependency Parser Have finished " + str(flag) +
                      " sentences ###")
    finally:
        shelf.close()
    return all_parser_sentence
# Locate the Stanford parser distribution relative to the working directory;
# the sub-path is read from the [paths] section of the project settings.
stanford_parser_dir = os.path.join(os.getcwd(),
                                   SETTINGS.get('paths', 'stanfordParser'))
my_path_to_jar = os.path.join(stanford_parser_dir, 'stanford-parser.jar')
my_path_to_models_jar = os.path.join(stanford_parser_dir,
                                     'stanford-parser-3.6.0-models.jar')
# Serialized English PCFG grammar, addressed by its path inside the models jar
# layout.
eng_model_path = os.path.join(
    stanford_parser_dir,
    'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')

# -mx5000m gives the parser JVM a 5 GB heap.
parser = StanfordParser(model_path=eng_model_path,
                        path_to_models_jar=my_path_to_models_jar,
                        path_to_jar=my_path_to_jar,
                        java_options='-mx5000m')
# NOTE(review): _classpath is a private NLTK attribute; the slf4j jars are
# appended here, presumably so the JVM finds a logging binding — confirm the
# jars exist at these paths.
parser._classpath = tuple([j for j in parser._classpath] + [
    stanford_parser_dir + '/slf4j-api.jar', stanford_parser_dir +
    '/slf4j-simple.jar'
])

# Walk the raw held-out corpus in sorted (deterministic) order, selecting only
# files whose names start with 'E', 'F' or 'G' and that do not already have a
# counterpart in the parse output directory.
for r, ds, fs in os.walk(heldout_raw_path):
    ds.sort()
    fs.sort()
    file_counter = 0
    already_parsed = os.listdir(heldout_parse_path)
    files = [
        f for f in fs if f[:1] in ('E', 'F', 'G') and f not in already_parsed
    ]

    files_count = len(files)
    for f in files:
        file_counter += 1
        # Python 2 print statement: file name plus fraction of files handled.
        print f, file_counter / float(files_count)
Example #3
0
#!/bin/env python3.5
#Author: Saurabh Pathak
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser
from nltk.tokenize import sent_tokenize
from nltk import download
from nltk.tree import ParentedTree
import os

#download('punkt', quiet=True)
#download('names', quiet=True)

# Bug fix: the original concatenation produced broken paths — there was no
# path separator between the pre-existing CLASSPATH and the first jar, and no
# '/' between os.getcwd() and 'data/...' (yielding e.g. '/cwddata/...').
_parser_dir = os.path.join(os.getcwd(), 'data',
                           'stanford-parser-full-2015-12-09')
_jars = [os.path.join(_parser_dir, 'stanford-parser.jar'),
         os.path.join(_parser_dir, 'stanford-parser-3.6.0-models.jar')]
_existing = os.getenv('CLASSPATH', '')
os.environ['CLASSPATH'] = os.pathsep.join(
    ([_existing] if _existing else []) + _jars)

parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
# NOTE(review): _classpath is a private NLTK attribute; scanning the
# distribution directory puts every shipped jar on the JVM classpath.
parser._classpath = find_jars_within_path(_parser_dir)

text = input('Enter some text:')

# One ParentedTree per input sentence: take the first parse of each sentence.
tlist = [ParentedTree.fromstring(str(list(parsetree)[0]))
         for parsetree in parser.raw_parse_sents(sent_tokenize(text))]

# Deep copies kept aside so the second resolution algorithm sees pristine trees.
tlist2 = [tree.copy(True) for tree in tlist]
from hobbs import *
from lappinleasse import *

print('Input text was:\n', text)
def resolve(ls, algo):
    print('\nResolving with', algo)
    i = -1
    for parsetree in ls:
        i += 1
# -*- coding: utf-8 -*-
# export CLASSPATH=$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar
from __future__ import unicode_literals
import os
import sys
import io
import copy
import nltk
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser
# Stanford parser loaded with the serialized English PCFG model, addressed by
# its path under the unpacked models directory.
parser = StanfordParser(
    model_path=
    "stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
)
# NOTE(review): _classpath is a private NLTK attribute. The first entry is the
# parser jar; its parent directory is rescanned so every sibling jar (models,
# logging bindings, ...) ends up on the JVM classpath.
stanford_dir = parser._classpath[0].rpartition('/')[0]
parser._classpath = tuple(find_jars_within_path(stanford_dir))


# from set_parser import parse_it
class Node(object):
    """A generic tree node: a string label plus an ordered list of children.

    The label must be a string so the node can be fed to the PQ-Gram
    algorithm.
    """

    def __init__(self, label):
        """Create a childless node carrying *label*."""
        self.label = label
        self.children = []
Example #5
0
def preprocess(flist, folder_path):
    """ (file open for reading, str) -> Nonetype

    flist contains one filename per line and folder_path represents a
    directory. Do preprocessing on each file from flist in folder_path,
    writing <name>.stem, <name>.pos and <name>.pars files, plus an
    'error_log' file listing inputs that could not be processed.
    """
    error_log = []

    # Hoisted out of the loop: the stemmer and the Stanford parser are
    # loop-invariant, and constructing a parser per file rescans the jars.
    stemmer = PorterStemmer()
    parser = StanfordParser(
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        verbose=True)
    # NOTE(review): _classpath is a private NLTK attribute; widened so every
    # jar next to the parser jar is on the JVM classpath.
    stanford_dir = parser._classpath[0].rpartition('/')[0]
    parser._classpath = tuple(find_jars_within_path(stanford_dir))

    for path in flist:
        with open(path, 'r') as rf:
            try:
                sent = [line.strip('\n ') for line in rf]
            except UnicodeDecodeError:
                error_log.append('Unicode Decode Error:\t' + path + '\n')
                continue
        if not sent:
            error_log.append('Empty File Error:\t' + path + '\n')
            continue

        # Output basename: input filename without extension, in folder_path.
        base = folder_path + path.split('.')[0].split('/')[-1]

        # Stemming with Porter Stemmer.
        # NOTE(review): the .stem file receives the raw joined lines, not the
        # stemmer output — preserved as-is; confirm whether that is intended.
        pars_stem = stemmer.stem(' '.join(sent))
        stemmed = '\n'.join(sent)
        with open(base + '.stem', 'w') as wf:
            wf.write(stemmed)

        # POS tagging after tokenizing and stemming.
        pos = nltk.pos_tag(pars_stem.split())
        with open(base + '.pos', 'w') as wf:
            wf.write(str(pos))

        # CFG parsing. Bug fixes vs the original: the log entry used '/t' and
        # '/n' instead of '\t' and '\n', and execution fell through after the
        # except to use an unbound (or stale) `parsed`.
        try:
            parsed = parser.raw_parse(pars_stem)
        except (TypeError, IndexError, NameError):
            error_log.append('Unparsable Error:\t' + path + '\n')
            continue
        s_pars = " ".join(str(x) for x in list(parsed))
        for junk in ("Tree", "[", "]", "'"):
            s_pars = s_pars.replace(junk, "")
        with open(base + '.pars', 'w') as wf:
            wf.write(s_pars)

    # Record the paths of files that raised errors.
    if error_log:
        # Bug fix: opened 'wb' but written str lines (TypeError on Python 3).
        with open(folder_path + 'error_log', 'w') as wf:
            wf.writelines(error_log)
# -*- coding: utf-8 -*-
# export CLASSPATH=$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar
from __future__ import unicode_literals
import os
import sys
import io
import copy
import nltk
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser
# Build the Stanford PCFG parser and widen its classpath so that every jar
# shipped with the distribution (models, logging bindings, ...) is visible
# to the JVM. _classpath is a private NLTK attribute.
parser = StanfordParser(
    model_path="stanford-parser-full-2015-12-09/"
               "stanford-parser-3.6.0-models/edu/stanford/nlp/models/"
               "lexparser/englishPCFG.ser.gz")
stanford_dir = parser._classpath[0].rpartition('/')[0]
parser._classpath = tuple(find_jars_within_path(stanford_dir))
# from set_parser import parse_it
class Node(object):
	"""
		A generic representation of a tree node. Includes a string label and a list of a children.
	"""

	def __init__(self, label):
		"""
			Create a node carrying the given label and an empty list of children.
			The label must be a string for use with the PQ-Gram algorithm.
		"""
		self.label = label
		self.children = list()

	def addkid(self, node, before=False):
		"""
			Adds a child node. When the before flag is true, the child node will be inserted at the
			beginning of the list of children, otherwise the child node is appended.