Example #1
0
def Init():
    """Build the Stanford dependency parser and load the English stop words.

    Returns:
        A 3-tuple ``(parser, None, stop)``: the configured
        ``StanfordDependencyParser``, a ``None`` placeholder (no NER tagger is
        created here), and the value of ``stopwords.words('english')``.
    """
    parser = stanford.StanfordDependencyParser(
        model_path="./stanford_libs/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

    # Replace the classpath with every jar that lives next to its first entry.
    stanford_dir = parser._classpath[0].rpartition('/')[0]
    parser._classpath = tuple(find_jars_within_path(stanford_dir))

    stop = stopwords.words('english')
    return parser, None, stop
Example #2
0
    def __init__(self,
                 model_filename,
                 path_to_jar=None,
                 encoding='utf8',
                 verbose=False,
                 java_options='-mx1000m'):
        """Locate the tagger jar and model, then record the runtime settings.

        A warning is issued when the class has no ``_JAR`` set. After the main
        jar is found, ``_stanford_jar`` becomes a tuple of every jar found in
        that jar's directory.

        :param model_filename: model file, resolved through ``find_file``
            (which is given the ``STANFORD_MODELS`` environment variable).
        :param path_to_jar: optional explicit jar location for ``find_jar``.
        :param encoding: stored as ``_encoding``.
        :param verbose: forwarded to ``find_jar`` and ``find_file``.
        :param java_options: stored as ``java_options``.
        """
        if not self._JAR:
            warnings.warn(
                'The StanfordTagger class is not meant to be '
                'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?'
            )

        main_jar = find_jar(self._JAR,
                            path_to_jar,
                            searchpath=(),
                            url=_stanford_url,
                            verbose=verbose)
        self._stanford_model = find_file(model_filename,
                                         env_vars=('STANFORD_MODELS', ),
                                         verbose=verbose)

        # Pick up the sibling jars (e.g. logging) that sit beside the main jar.
        jar_dir = os.path.dirname(main_jar)
        self._stanford_jar = tuple(find_jars_within_path(jar_dir))

        self.java_options = java_options
        self._encoding = encoding
    def nonlocal_ner_tag_tokens(self):
        """NER-tag every line of every document in a single tagger call.

        Reads ``self.tokenized_docs_by_lines`` (a sequence of documents, each
        a sequence of lines), tags all lines in one ``tag_sents`` batch and
        stores the result, regrouped per document, in
        ``self.nonlocal_ner_doc_tokens``.
        """
        ner_home = expanduser("~") + '/stanford-ner-2015-12-09'
        os.environ['CLASSPATH'] = ner_home
        os.environ['STANFORD_MODELS'] = ner_home + '/classifiers'

        st = StanfordNERTagger("english.all.3class.distsim.crf.ser.gz",
                               java_options='-mx4000m')

        # NOTE(review): indexing ``_stanford_jar[0]`` assumes it is a sequence
        # of paths rather than a single path string -- confirm against the
        # tagger class in use.
        jar_dir = st._stanford_jar[0].rpartition('/')[0]
        st._stanford_jar = ':'.join(find_jars_within_path(jar_dir))

        # do not tokenise text
        nltk.internals.config_java(
            options=
            '-tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions "tokenizeNLs=true"'
        )

        self.nonlocal_ner_doc_tokens = []
        documents = self.tokenized_docs_by_lines
        lines_per_doc = [len(document) for document in documents]

        # Flatten all documents so the tagger is invoked only once.
        flat_lines = [line for document in documents for line in document]
        tagged_lines = st.tag_sents(flat_lines)

        # Cut the flat result back into per-document slices.
        start = 0
        for line_count in lines_per_doc:
            stop = start + line_count
            self.nonlocal_ner_doc_tokens.append(tagged_lines[start:stop])
            start = stop
        print("NER nonlocal tagged tokens")
Example #4
0
def get_postag_with_index(sources, idx2word, word2idx):
    """POS-tag every source sequence with the Stanford POS tagger.

    Args:
        sources: sized iterable of encoded sources; each entry is passed to
            ``keyphrase_utils.cut_zero`` together with ``idx2word``.
        idx2word: mapping forwarded to ``keyphrase_utils.cut_zero``.
        word2idx: not used here; kept so existing callers keep working.

    Returns:
        A list with one ``pos_tagger.tag`` result per entry of ``sources``.
    """
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0,
                            len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)

    # Put every jar from the tagger directory on the classpath.
    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    total = len(sources)
    # enumerate() instead of xrange(len(...)): xrange is undefined on Python 3.
    for idx, test_s_o in enumerate(sources):
        source_text = keyphrase_utils.cut_zero(test_s_o, idx2word)
        text = pos_tagger.tag(source_text)
        print('[%d/%d] : %s' % (idx, total, str(text)))

        tagged_source.append(text)

    return tagged_source
Example #5
0
def get_postag_with_record(records, pairs):
    """POS-tag the first element of every pair, logging progress per record.

    Args:
        records: sequence of mappings; each one's ``'name'`` entry is printed.
        pairs: sequence walked in lockstep with ``records``; ``pair[0]`` is
            the token sequence handed to the tagger.

    Returns:
        A list holding the ``pos_tagger.tag`` output for each pair, in order.
    """
    here = os.path.dirname(__file__)
    cut = here.rfind(os.sep, 0, len(here) - 10) + 1
    path = here[:cut] + 'stanford-postagger/'
    print(path)
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)

    # Put every jar from the tagger directory on the classpath.
    jar_dir, _, _ = jar.rpartition('/')
    pos_tagger._stanford_jar = ':'.join(find_jars_within_path(jar_dir))

    tagged_source = []
    for idx, (record, pair) in enumerate(zip(records, pairs)):
        tokens = pair[0]
        print('*' * 100)
        print('File: ' + record['name'])
        print('Input: ' + str(tokens))
        text = pos_tagger.tag(tokens)
        print('[%d/%d][%d] : %s' %
              (idx, len(records), len(tokens), str(text)))
        tagged_source.append(text)

    return tagged_source
Example #6
0
def get_word_dependencies(text):
    """Parse ``text`` and group its dependency triples by their first word.

    Only the first parse produced by ``raw_parse`` is used.

    Args:
        text: raw sentence handed to ``StanfordDependencyParser.raw_parse``.

    Returns:
        dict mapping the word of each triple's first element to a list of
        ``(word_of_third_element, middle_element)`` tuples, in triple order.
        Presumably these are governor -> (dependent, relation); verify against
        the NLTK ``DependencyGraph.triples`` documentation.
    """
    dep_parser = StanfordDependencyParser(
        model_path=osp.join(
            datadir,
            "stanford_data/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
        ),
        java_options="-mx4g -XX:-UseGCOverheadLimit")

    # Only the dependency parser is needed to produce the result; no POS
    # tagger is constructed here.
    dep = next(dep_parser.raw_parse(text))

    dependencies = {}
    for head, relation, dependent in dep.triples():
        dependencies.setdefault(head[0], []).append((dependent[0], relation))
    return dependencies
Example #7
0
def dependency_parser_nltk(word_lists, filename):
    """Dependency-parse each sentence and back the node tables up to a shelf.

    Args:
        word_lists: iterable of whitespace-separated sentences (strings).
        filename: path handed to ``shelve.open``; for sentence number ``i``
            the key ``str(i)`` stores a dict of that parse's nodes keyed by
            their ``"address"`` field.

    Returns:
        A list with, per sentence, the list of triples of its first parse.
    """
    os.environ['JAVAHOME'] = JAVA_PATH
    os.environ["STANFORD_PARSER"] = STANFORD_PARSER_PATH
    os.environ["STANFORD_MODELS"] = STANFORD_PARSER_MODELS
    chinese_parser = StanfordDependencyParser(model_path=nltk_parse_model_path)
    STANFORD_DIR = chinese_parser._classpath[0].rpartition('/')[0]
    chinese_parser._classpath = tuple(find_jars_within_path(STANFORD_DIR))
    chinese_parser.java_options = '-mx15000m'

    all_dependency_list = []
    node_file = shelve.open(filename)
    # try/finally so the shelf is closed even when a parse raises.
    try:
        for index, sentence in enumerate(word_lists):
            # Collected in all_dependency_list; each entry is a list.
            res = list(chinese_parser.parse(sentence.strip().split()))
            print("we have finished ", index + 1, " sentence!!!")

            graph = res[0]
            all_dependency_list.append(list(graph.triples()))

            # Stored in node_file as a plain dict; serves as a backup file.
            node_dict = {}
            node = graph.nodes
            # NOTE(review): this probes twice as many indices as there are
            # nodes, which relies on ``nodes`` yielding a default entry for
            # missing keys -- confirm this is intended.
            for inner_index in range(len(node.items()) * 2):
                entry = node[inner_index]
                if entry['word'] is not None or entry['ctag'] is not None:
                    node_dict[entry["address"]] = entry
            node_file[str(index)] = node_dict
    finally:
        node_file.close()
    return all_dependency_list
Example #8
0
 def tokernizer(self,tagger):
     """Tag the tokens of ``self.sentence`` using ``tagger``.

     The tagger's ``_stanford_jar`` is first replaced by a ':'-joined list of
     all jars found in the directory of its current jar path.
     """
     jar_dir, _, _ = tagger._stanford_jar.rpartition('/')
     tagger._stanford_jar = ':'.join(find_jars_within_path(jar_dir))
     return tagger.tag(word_tokenize(self.sentence))
Example #9
0
    def __init__(self, model='stanford/models/english-bidirectional-distsim.tagger', libpath='stanford/', verbose=False):
        """Store the model path, collect the jars under ``libpath`` and configure Java.

        Also precompiles the regex used to pull ``pos``, ``lemma`` and the
        word text out of ``<word ...>`` XML lines.
        """
        word_line = r'  <word wid="[0-9]*" pos="([^"]*)" lemma="([^"]*)">(.*?)</word>'

        self._model, self._verbose = model, verbose
        self._libs = find_jars_within_path(libpath)
        self._xml_regex = re.compile(word_line)

        config_java(verbose=self._verbose)
Example #10
0
def load_pos_tagger(stanford_base_dir):
    """Create a Stanford POS tagger rooted at ``stanford_base_dir``.

    The jar and the bidirectional English model are looked up under that
    directory, and the tagger's classpath is set to every jar found beside
    the main jar, joined with ':'.
    """
    jar_path = stanford_base_dir + '/stanford-postagger.jar'
    model_path = stanford_base_dir + '/models/english-bidirectional-distsim.tagger'
    tagger = StanfordPOSTagger(model_filename=model_path, path_to_jar=jar_path)

    jar_dir, _, _ = jar_path.rpartition('/')
    tagger._stanford_jar = ':'.join(find_jars_within_path(jar_dir))
    return tagger
Example #11
0
    def __init__(self,
                 model='stanford/models/english-bidirectional-distsim.tagger',
                 libpath='stanford/',
                 verbose=False):
        """Store the model path, collect the jars under ``libpath`` and configure Java.

        Also precompiles the regex used to pull ``pos``, ``lemma`` and the
        word text out of ``<word ...>`` XML lines.
        """
        word_line = r'  <word wid="[0-9]*" pos="([^"]*)" lemma="([^"]*)">(.*?)</word>'

        self._model, self._verbose = model, verbose
        self._libs = find_jars_within_path(libpath)
        self._xml_regex = re.compile(word_line)

        config_java(verbose=self._verbose)
Example #12
0
def load_pos_tagger():
    """Create a Stanford POS tagger from the ``stanford-postagger`` directory.

    The directory is resolved relative to the part of ``file_dir`` that ends
    with ``'pykp'``; the tagger's classpath is then set to every jar found in
    that directory, joined with ':'.
    """
    # NOTE(review): ``file_dir`` is not defined inside this function --
    # presumably a module-level path; verify it exists where this is used.
    package_root = file_dir[: file_dir.rfind('pykp') + 4]
    path = os.path.join(package_root, 'stanford-postagger')
    print(path)

    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    tagger = StanfordPOSTagger(model, jar)

    jar_dir, _, _ = jar.rpartition('/')
    tagger._stanford_jar = ':'.join(find_jars_within_path(jar_dir))
    return tagger
Example #13
0
    def xxtest_StanfordPOSTagger(self):
        """Tag one tokenized sentence and check that a result comes back."""
        install_dir = '\\usr\\stanford-postagger-full-2015-12-09'
        jar = install_dir + '\\stanford-postagger.jar'
        model = install_dir + '\\models\\english-left3words-distsim.tagger'

        tagger = StanfordPOSTagger(model, jar)

        # Widen the classpath to all jars next to the first classpath entry.
        jar_dir, _, _ = tagger._stanford_jar[0].rpartition('\\')
        tagger._stanford_jar = ':'.join(find_jars_within_path(jar_dir))

        tokens = word_tokenize("What's the airspeed of an unladen swallow ?")
        text = tagger.tag(tokens)

        self.assertTrue(text is not None)
Example #14
0
    def __init__(
        self,
        path_to_jar=None,
        path_to_models_jar=None,
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        encoding='utf8',
        verbose=False,
        java_options='-mx4g',
        corenlp_options='',
    ):
        """Locate the parser and model jars and assemble the classpath.

        Among the candidates returned by ``find_jar_iter`` the one whose
        directory name compares greatest is chosen, for both the code jar and
        the model jar. The classpath is the model jar followed by every jar
        found in the code jar's directory.
        """
        # Lookup options shared by the code-jar and model-jar searches.
        lookup = dict(searchpath=(), url=_stanford_url, verbose=verbose,
                      is_regex=True)

        code_candidates = find_jar_iter(
            self._JAR,
            path_to_jar,
            env_vars=('STANFORD_PARSER', 'STANFORD_CORENLP'),
            **lookup
        )
        stanford_jar = max(code_candidates, key=os.path.dirname)

        model_candidates = find_jar_iter(
            self._MODEL_JAR_PATTERN,
            path_to_models_jar,
            env_vars=('STANFORD_MODELS', 'STANFORD_CORENLP'),
            **lookup
        )
        model_jar = max(model_candidates, key=os.path.dirname)

        # Include the sibling jars (e.g. logging) of the code jar.
        code_dir = os.path.dirname(stanford_jar)
        self._classpath = tuple([model_jar] + find_jars_within_path(code_dir))

        self.model_path = model_path
        self._encoding = encoding
        self.corenlp_options = corenlp_options
        self.java_options = java_options
def get_pos_tag(sen):
    """Add Stanford POS tags for the ``'Arg'`` text of every row.

    Args:
        sen: pandas DataFrame of sentences; the ``'Arg'`` cell of each row is
            split on whitespace and tagged.

    Returns:
        The same DataFrame, with each row's ``'POStag'`` cell set to the list
        of tags (the second item of every tagged pair).
    """
    st = StanfordPOSTagger('/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',path_to_jar=
                           '/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar')

    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)

    # Iterate the index directly and assign through ``.at``:
    # ``Index.get_values()`` and ``DataFrame.set_value()`` were removed in
    # pandas 1.0.
    for i in list(sen.index):
        tagged = st.tag(sen.loc[i, 'Arg'].split())
        sen.at[i, 'POStag'] = [pair[1] for pair in tagged]
    return sen
Example #16
0
def stanford_ne_tagger(tokens):
    """Return the set of lower-cased LOCATION entities found in ``tokens``.

    The tokens are tagged with the Stanford NER tagger and grouped by
    ``get_continuous_chunks``; a chunk counts when the tag of its first
    token is ``LOCATION``, and its tokens are joined with single spaces.
    """
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')

    # Put every jar from the tagger's directory on the classpath.
    jar_dir, _, _ = st._stanford_jar.rpartition('/')
    st._stanford_jar = ':'.join(find_jars_within_path(jar_dir))

    chunks = get_continuous_chunks(st.tag(tokens))
    return set(
        lower(u' '.join([token for token, tag in ne]))
        for ne in chunks
        if ne[0][1] == u'LOCATION'
    )
Example #17
0
    def __init__(
        self,
        path_to_jar=None,
        path_to_models_jar=None,
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        encoding='utf8',
        verbose=False,
        java_options='-mx4g',
        corenlp_options='',
    ):
        """Locate the parser and model jars and assemble the classpath.

        Among the candidates returned by ``find_jar_iter`` the one whose
        directory name compares greatest is chosen, for both the code jar and
        the model jar. The classpath is the model jar followed by every jar
        found in the code jar's directory.
        """
        # Lookup options shared by the code-jar and model-jar searches.
        lookup = dict(searchpath=(), url=_stanford_url, verbose=verbose,
                      is_regex=True)

        code_candidates = find_jar_iter(
            self._JAR,
            path_to_jar,
            env_vars=('STANFORD_PARSER', 'STANFORD_CORENLP'),
            **lookup
        )
        stanford_jar = max(code_candidates, key=os.path.dirname)

        model_candidates = find_jar_iter(
            self._MODEL_JAR_PATTERN,
            path_to_models_jar,
            env_vars=('STANFORD_MODELS', 'STANFORD_CORENLP'),
            **lookup
        )
        model_jar = max(model_candidates, key=os.path.dirname)

        # Include the sibling jars (e.g. logging) of the code jar.
        code_dir = os.path.dirname(stanford_jar)
        self._classpath = tuple([model_jar] + find_jars_within_path(code_dir))

        self.model_path = model_path
        self._encoding = encoding
        self.corenlp_options = corenlp_options
        self.java_options = java_options
Example #18
0
    def __init__(self, path_to_jar=None, encoding='utf8', options=None, verbose=False, java_options='-mx1000m'):
        """Locate the jar, gather its sibling jars and build the options string.

        ``options`` (a mapping, or ``None`` for no options) is rendered as
        comma-separated ``key=value`` pairs in ``_options_cmd``.
        """
        main_jar = find_jar(
            self._JAR, path_to_jar,
            env_vars=('STANFORD_POSTAGGER',),
            searchpath=(), url=_stanford_url,
            verbose=verbose
        )

        # The classpath is every jar in the main jar's directory.
        self._stanford_jar = tuple(find_jars_within_path(os.path.dirname(main_jar)))

        self._encoding = encoding
        self.java_options = java_options

        if options is None:
            options = {}
        pairs = ('{0}={1}'.format(key, val) for key, val in options.items())
        self._options_cmd = ','.join(pairs)
def get_pos_tag(sen):
    """Add Stanford POS tags for the ``'Arg'`` text of every row.

    Sets the ``CLASSPATH`` and ``STANFORD_MODELS`` environment variables
    before creating the tagger.

    Args:
        sen: pandas DataFrame of sentences; the ``'Arg'`` cell of each row is
            split on whitespace and tagged.

    Returns:
        The same DataFrame, with each row's ``'POStag'`` cell set to the list
        of tags (the second item of every tagged pair).
    """
    # NOTE(review): 'STANFORDTOOLSDIR' is a literal path component here, not
    # an expanded variable -- confirm this is intended.
    os.environ['CLASSPATH']='STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/stanford-postagger.jar' #set classpath to pos tagger
    os.environ['STANFORD_MODELS']='STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/models'
    st = StanfordPOSTagger('/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',path_to_jar=
                           '/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar')

    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)

    # Iterate the index directly and assign through ``.at``:
    # ``Index.get_values()`` and ``DataFrame.set_value()`` were removed in
    # pandas 1.0.
    for i in list(sen.index):
        tagged = st.tag(sen.loc[i, 'Arg'].split())
        sen.at[i, 'POStag'] = [pair[1] for pair in tagged]
    return sen
Example #20
0
    def __init__(self, path_to_jar=None, encoding='utf8', options=None, verbose=False, java_options='-mx1000m'):
        """Locate the jar, gather its sibling jars and build the options string.

        ``options`` (a mapping, or ``None`` for no options) is rendered as
        comma-separated ``key=value`` pairs in ``_options_cmd``.
        """
        main_jar = find_jar(
            self._JAR, path_to_jar,
            env_vars=('STANFORD_POSTAGGER',),
            searchpath=(), url=_stanford_url,
            verbose=verbose
        )

        # The classpath is every jar in the main jar's directory.
        self._stanford_jar = tuple(find_jars_within_path(os.path.dirname(main_jar)))

        self._encoding = encoding
        self.java_options = java_options

        if options is None:
            options = {}
        pairs = ('{0}={1}'.format(key, val) for key, val in options.items())
        self._options_cmd = ','.join(pairs)
Example #21
0
def find_maltparser(parser_dirname):
    """
    Find the MaltParser .jar file and its dependencies.

    ``parser_dirname`` is used directly when it exists on disk; otherwise it
    is resolved with ``find_dir`` using the ``MALT_PARSER`` environment
    variable. Asserts that the required dependency jars and a
    ``maltparser-*.jar`` are present, then returns the jar paths as a list.
    """
    if os.path.exists(parser_dirname):
        malt_dir = parser_dirname  # A full path was given.
    else:
        malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',))

    jar_paths = set(find_jars_within_path(malt_dir))
    jar_names = set(os.path.basename(jar_path) for jar_path in jar_paths)

    # The directory must contain all the necessary .jar files.
    required = set(['log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar'])
    assert required.issubset(jar_names)
    assert any(name.startswith('maltparser-') and name.endswith('.jar')
               for name in jar_names)
    return list(jar_paths)
Example #22
0
def find_maltparser(parser_dirname):
    """
    Find the MaltParser .jar file and its dependencies.

    ``parser_dirname`` is used directly when it exists on disk; otherwise it
    is resolved with ``find_dir`` using the ``MALT_PARSER`` environment
    variable. Asserts that the required dependency jars and a
    ``maltparser-*.jar`` are present, then returns the jar paths as a list.
    """
    if os.path.exists(parser_dirname):
        malt_dir = parser_dirname  # A full path was given.
    else:
        malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',))

    jar_paths = set(find_jars_within_path(malt_dir))
    jar_names = set(os.path.basename(jar_path) for jar_path in jar_paths)

    # The directory must contain all the necessary .jar files.
    required = set(['log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar'])
    assert required.issubset(jar_names)
    assert any(name.startswith('maltparser-') and name.endswith('.jar')
               for name in jar_names)
    return list(jar_paths)
Example #23
0
def check_postag(config):
    """POS-tag and print every source sequence of each testing dataset.

    Args:
        config: mapping read for ``'dataset'`` (passed to
            ``deserialize_from_file``) and ``'testing_datasets'`` (names of
            the datasets to walk); it is also forwarded whole to
            ``load_additional_testing_data``.
    """
    train_set, validation_set, test_set, idx2word, word2idx = deserialize_from_file(
        config['dataset'])

    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0,
                            len(path) - 10) + 1] + 'stanford-postagger/'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)

    for dataset_name in config['testing_datasets']:
        test_sets = load_additional_testing_data(config['testing_datasets'],
                                                 idx2word, word2idx, config)
        test_set = test_sets[dataset_name]

        # Materialise the pairs: zip() is a lazy iterator on Python 3.
        test_data_plain = list(zip(test_set['source'], test_set['target']))

        # Alternatively to setting the CLASSPATH add the jar and model via their path:
        jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
        model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'
        pos_tagger = StanfordPOSTagger(model, jar)

        # Add other jars from Stanford directory (once per tagger rather
        # than once per sample).
        stanford_dir = jar.rpartition('/')[0]
        stanford_jars = find_jars_within_path(stanford_dir)
        pos_tagger._stanford_jar = ':'.join(stanford_jars)

        for test_s_o, test_t_o in test_data_plain:
            source = keyphrase_utils.cut_zero(test_s_o, idx2word)

            print(source)

            text = pos_tagger.tag(source)
            print(text)
Example #24
0
    def __init__(self, model_filename, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
        """Locate the tagger jar and model, then record the runtime settings.

        A warning is issued when the class has no ``_JAR`` set. After the main
        jar is found, ``_stanford_jar`` becomes a tuple of every jar found in
        that jar's directory.
        """
        if not self._JAR:
            warnings.warn('The StanfordTagger class is not meant to be '
                    'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?')

        main_jar = find_jar(
                self._JAR, path_to_jar,
                searchpath=(), url=_stanford_url,
                verbose=verbose)
        self._stanford_model = find_file(model_filename,
                env_vars=('STANFORD_MODELS',), verbose=verbose)

        # Pick up the sibling jars (e.g. logging) that sit beside the main jar.
        jar_dir = os.path.dirname(main_jar)
        self._stanford_jar = tuple(find_jars_within_path(jar_dir))

        self.java_options = java_options
        self._encoding = encoding
Example #25
0
def parser_nltk(word_lists, filename):
    """Parse every non-blank sentence and back the parses up to a shelf.

    Args:
        word_lists: iterable of whitespace-separated sentences (strings);
            entries that are empty after stripping are skipped.
        filename: path handed to ``shelve.open``; the parse list of the
            ``k``-th parsed sentence is stored under ``str(k)``.

    Returns:
        A list of the strings ``return_str_tofile`` produces for the first
        parse of each sentence.
    """
    os.environ['JAVAHOME'] = JAVA_PATH
    os.environ["STANFORD_PARSER"] = STANFORD_PARSER_PATH
    os.environ["STANFORD_MODELS"] = STANFORD_PARSER_MODELS
    chinese_parser = StanfordParser(model_path=nltk_parse_model_path)
    STANFORD_DIR = chinese_parser._classpath[0].rpartition('/')[0]
    chinese_parser._classpath = tuple(find_jars_within_path(STANFORD_DIR))
    chinese_parser.java_options = '-mx15000m'

    all_parser_sentence = []
    parse_store = shelve.open(filename)
    # try/finally so the shelf is always closed and flushed to disk.
    try:
        flag = 0
        for sentence in word_lists:
            stripped = sentence.strip()
            if stripped == "":
                continue
            res = list(chinese_parser.parse(stripped.split()))
            new_str = return_str_tofile(sentence_parse=str(res[0]))
            parse_store[str(flag)] = res
            all_parser_sentence.append(new_str)
            flag += 1
            print("###### NLTK Dependency Parser Have finished " + str(flag) +
                  " sentences ###")
    finally:
        parse_store.close()
    return all_parser_sentence
# -*- coding: utf-8 -*-
# export CLASSPATH=$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar
from __future__ import unicode_literals
import os
import sys
import io
import copy
import nltk
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser
# Module-level parser shared by the code below; the model path is relative,
# so it is resolved against the current working directory.
parser = StanfordParser(
    model_path=
    "stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
)
# Replace the classpath with every jar found in the directory of its first
# entry.
stanford_dir = parser._classpath[0].rpartition('/')[0]
parser._classpath = tuple(find_jars_within_path(stanford_dir))


# from set_parser import parse_it
class Node(object):
    """Generic tree node: a string label plus an ordered list of children."""

    def __init__(self, label):
        """Create a childless node carrying ``label``.

        The label must be a string for use with the PQ-Gram algorithm.
        """
        self.children = []
        self.label = label
# -*- coding: utf-8 -*-
# export CLASSPATH=$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar
from __future__ import unicode_literals
import os
import sys
import io
import copy
import nltk
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser
# Module-level parser shared by the code below; the model path is relative,
# so it is resolved against the current working directory.
parser=StanfordParser(model_path="stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
# Replace the classpath with every jar found in the directory of its first
# entry.
stanford_dir = parser._classpath[0].rpartition('/')[0]
parser._classpath = tuple(find_jars_within_path(stanford_dir))
# from set_parser import parse_it
class Node(object):
	"""
		A generic representation of a tree node. Includes a string label and a list of a children.
	"""

	def __init__(self, label):
		"""
			Creates a node with the given label. The label must be a string for use with the PQ-Gram
			algorithm.
		"""
		self.label = label
		self.children = list()

	def addkid(self, node, before=False):
		"""
			Adds a child node. When the before flag is true, the child node will be inserted at the
			beginning of the list of children, otherwise the child node is appended.
Example #28
0
def preprocess(flist, folder_path):
    """ (sequence of str, str) -> NoneType

    flist is a sequence of file paths and folder_path is the prefix under
    which results are written. For every input file three outputs named
    after the file's base name are produced: '.stem' (the stripped input
    lines), '.pos' (NLTK POS tags of the stemmed text) and '.pars' (the
    Stanford parse of the stemmed text). Files that cannot be decoded, are
    empty or cannot be parsed are recorded in folder_path + 'error_log'.
    """
    if len(flist) == 0:
        return

    error_log = []

    # The stemmer and parser do not depend on the file, so build them once.
    stemmer = PorterStemmer()
    parser = StanfordParser(
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        verbose=True)
    stanford_dir = parser._classpath[0].rpartition('/')[0]
    parser._classpath = tuple(find_jars_within_path(stanford_dir))

    for path in flist:
        out_base = folder_path + path.split('.')[0].split('/')[-1]

        try:
            with open(path, 'r') as rf:
                sent = [line.strip('\n ') for line in rf]
        except UnicodeDecodeError:
            error_log.append('Unicode Decode Error:\t' + path + '\n')
            continue

        if not sent:
            error_log.append('Empty File Error:\t' + path + '\n')
            continue

        # Stemming with Porter Stemmer
        pars_stem = stemmer.stem(' '.join(sent))

        # NOTE(review): the '.stem' file receives the stripped input lines,
        # not ``pars_stem`` -- confirm this is intended.
        with open(out_base + '.stem', 'w') as wf:
            wf.write('\n'.join(sent))

        # POS Tagging after tokenizing and stemming
        pos = nltk.pos_tag(pars_stem.split())
        with open(out_base + '.pos', 'w') as wf:
            wf.write(str(pos))

        # CFG parser
        try:
            parsed = parser.raw_parse(pars_stem)
        except (TypeError, IndexError, NameError):
            # Nothing to write for this file; log it and move on.
            error_log.append('Unparsable Error:\t' + path + '\n')
            continue

        s_pars = " ".join(str(x) for x in list(parsed))
        for unwanted in ("Tree", "[", "]", "\'"):
            s_pars = s_pars.replace(unwanted, "")
        with open(out_base + '.pars', 'w') as wf:
            wf.write(s_pars)

    # Print files paths with Errors
    if error_log:
        # Text mode: the log entries are str, not bytes.
        with open(folder_path + 'error_log', 'w') as wf:
            wf.writelines(error_log)
Example #29
0
def update_tagger_jars(tagger):
    """Point ``tagger`` at every jar in its jar's directory and return it.

    ``tagger._stanford_jar`` is replaced in place by the ':'-joined paths
    that ``find_jars_within_path`` reports for the directory of the current
    jar path.
    """
    jar_dir, _, _ = tagger._stanford_jar.rpartition('/')
    tagger._stanford_jar = ':'.join(find_jars_within_path(jar_dir))
    return tagger
Example #30
0
    def __init__(self, libpath='stanford/', verbose=False):
        """Remember the verbosity, collect the jars under ``libpath`` and configure Java."""
        self._verbose = verbose

        jar_paths = find_jars_within_path(libpath)
        self._libs = jar_paths

        config_java(verbose=self._verbose)
Example #31
0
#!/bin/env python3.5
from nltk.tag.stanford import StanfordNERTagger
from nltk.internals import find_jars_within_path
from nltk.tokenize import sent_tokenize
import os

tagger = StanfordNERTagger('data/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', 'data/stanford-ner-2015-12-09/stanford-ner.jar')
# os.getcwd() has no trailing separator, so the NER directory must be joined
# onto it rather than concatenated.
tagger._stanford_jar = ':'.join(find_jars_within_path(os.path.join(os.getcwd(), 'data', 'stanford-ner-2015-12-09')))
# Split the input into sentences, drop the listed punctuation characters from
# each one, split on whitespace and tag all sentences in one call.
print(tagger.tag_sents([''.join([c for c in x if c not in '",:.?/!@#$%^&*()][{}~']).split() for x in sent_tokenize(input('Enter a sentence: '))]))
Example #32
0
 def __init__(self):
     """Create the English PCFG parser with all sibling jars on its classpath."""
     parser = StanfordParser(
         model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
     jar_dir, _, _ = parser._classpath[0].rpartition('/')
     parser._classpath = tuple(find_jars_within_path(jar_dir))
     self.parser = parser
Example #33
0
    word_tf = []
    for word in unique_terms:
        word_tf.append(collection.tf(word, document))
    return word_tf


# Shared English stemmer and WordNet lemmatizer instances.
stemmer = SnowballStemmer("english")
wordnet_lemmatizer = WordNetLemmatizer()

# Hard-coded Windows locations of the Java runtime and the Stanford NER
# distribution.
java_path = 'C:/Program Files (x86)/Java/jre1.8.0_101/bin/'
os.environ['JAVA_HOME'] = java_path
stanford_dir = 'C:/stanford-ner-2016-10-31/'
jarfile = stanford_dir + 'stanford-ner.jar'
modelfile = stanford_dir + 'classifiers/english.muc.7class.distsim.crf.ser.gz'
st = StanfordNERTagger(modelfile, jarfile)
# Classpath is every jar in the NER directory, joined with ';'.
stanford_jars = find_jars_within_path(stanford_dir)
st._stanford_jar = ';'.join(stanford_jars)

if __name__ == "__main__":
    folder = "Thomas_Baker"
    # Empty list to hold text documents.
    texts = []

    listing = os.listdir(folder)
    for file in sorted(listing):
        if file.endswith(".txt"):
            url = folder + "/" + file
            f = open(url, encoding="latin-1")
            raw = f.read()
            f.close()
            tokens = nltk.word_tokenize(raw)
Example #34
0
# Chinese part-of-speech tagging
chi_tagger=StanfordPOSTagger(model_filename='/home/hsiao/Develops/nlp/stanford-postagger-full-2016-10-31/models/chinese-distsim.tagger',
                             path_to_jar='/home/hsiao/Develops/nlp/stanford-postagger-full-2016-10-31/stanford-postagger.jar')
# The sample sentence is already segmented: words are separated by spaces.
print(chi_tagger.tag('四川省 成都 信息 工程 大学 我 在 博客 园 开 了 一个 博客 , 我 的 博客 名叫 伏 草 惟 存 , 写 了 一些 自然语言 处理 的 文章 。'.split()))



# English syntactic (constituency) parsing
#import os
#java_path='/usr/lib/jvm/jdk/jdk1.8.0_121'
#os.environ['JAVAHOME']=java_path
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser
eng_parser=StanfordParser('/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser.jar',
                          '/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar',
                          '/home/hsiao/Develops/nlp/stanford-corenlp-full-2016-10-31/englishPCFG.ser.gz')
# NOTE(review): this sets an attribute named ``__classpath``; other code in
# this file configures ``_classpath`` -- confirm which one was intended. The
# directory scanned here (2016-10-31) also differs from the directory of the
# jars passed to the constructor (2015-04-20).
eng_parser.__classpath=tuple(find_jars_within_path('/home/hsiao/Develops/nlp/stanford-parser-full-2016-10-31/'))
print (list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split())))


# English dependency parsing
from nltk.parse.stanford import StanfordDependencyParser
eng_parser=StanfordDependencyParser('/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser.jar',
                          '/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar',
                          '/home/hsiao/Develops/nlp/stanford-corenlp-full-2016-10-31/englishPCFG.ser.gz')
res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
# Print the first parse, then each of its triples on its own line.
print (res[0])
for row in res[0].triples():
    print(row)

Example #35
0
#!/bin/env python3.5
#Author: Saurabh Pathak
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser
from nltk.tokenize import sent_tokenize
from nltk import download
from nltk.tree import ParentedTree
import os

#download('punkt', quiet=True)
#download('names', quiet=True)

# os.getcwd() has no trailing separator, so the parser directory is joined
# onto it; classpath entries are separated with os.pathsep, including the
# boundary to any CLASSPATH that was already set.
_parser_dir = os.path.join(os.getcwd(), 'data', 'stanford-parser-full-2015-12-09')
_parser_jars = [
    os.path.join(_parser_dir, 'stanford-parser.jar'),
    os.path.join(_parser_dir, 'stanford-parser-3.6.0-models.jar'),
]
_existing_classpath = os.getenv('CLASSPATH', '')
os.environ['CLASSPATH'] = os.pathsep.join(
    ([_existing_classpath] if _existing_classpath else []) + _parser_jars)

parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
parser._classpath = find_jars_within_path(_parser_dir)

text = input('Enter some text:')

# Split the text into sentences, parse them, and rebuild the first parse of
# each sentence as a ParentedTree from its string form.
tlist = [ParentedTree.fromstring(str(list(parsetree)[0])) for parsetree in parser.raw_parse_sents(sent_tokenize(text))]

# Second, independent list of trees (copy(True) -- presumably a deep copy;
# verify against the NLTK Tree.copy documentation).
tlist2 = [tree.copy(True) for tree in tlist]
from hobbs import *
from lappinleasse import *

print('Input text was:\n', text)
def resolve(ls, algo):
    print('\nResolving with', algo)
    i = -1
    for parsetree in ls:
        i += 1
Example #36
0
    def __init__(self, libpath='stanford/', verbose=False):
        """Remember the verbosity, collect the jars under ``libpath`` and configure Java."""
        self._verbose = verbose

        jar_paths = find_jars_within_path(libpath)
        self._libs = jar_paths

        config_java(verbose=self._verbose)
Example #37
0
 def __init__(self):
     """Create the English PCFG parser with all sibling jars on its classpath."""
     parser = StanfordParser(
         model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
     jar_dir, _, _ = parser._classpath[0].rpartition('/')
     parser._classpath = tuple(find_jars_within_path(jar_dir))
     self.parser = parser
Example #38
0
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# NER tagger built from hard-coded model and jar paths.
st = StanfordNERTagger('/home/ubuntu/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar='/home/ubuntu/stanford-ner-2015-12-09/stanford-ner.jar')
text = 'While in Frabce'

# Tokenize the sample text; the tagging calls below are commented out.
tokenized_text = word_tokenize(text)
#print tokenized_text
#classified_text = st.tag(tokenized_text)
#print(classified_text)




import nltk
from nltk.tag import StanfordNERTagger
from nltk.internals import find_jars_within_path

# NER tagger built from hard-coded model and jar paths.
st = StanfordNERTagger('/home/ubuntu/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar='/home/ubuntu/stanford-ner-2015-12-09/stanford-ner.jar')
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print(st._stanford_jar)

# Replace the single jar with every jar from the same directory.
stanford_dir = st._stanford_jar.rpartition('/')[0]
stanford_jars = find_jars_within_path(stanford_dir)
print(":".join(stanford_jars))
st._stanford_jar = ':'.join(stanford_jars)
print(st._stanford_jar)

text = st.tag('Rami Eid is studying at Stony Brook University in NY'.split())
print(text)