Code Example #1
def sdfprocess(rvdata):
    parser = StanfordParser(
        path_to_jar=
        '/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar',
        path_to_models_jar=
        '/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1-models.jar',
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        java_options='-mx15000m')
    sdfdata = []
    cnn = 0
    widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker=RotatingMarker()), ' ',
        ETA(), ' ',
        FileTransferSpeed()
    ]
    pbar = ProgressBar(widgets=widgets, maxval=len(rvdata)).start()
    for eg in rvdata:
        # if cnn%100 == 0: print "%f%% of document %d finished" % (cnn*100*1.0/len(rvdata), partidx+1)
        cmt = eg[3].decode('utf-8')  # index 3 holds the raw comment text
        sentences = nltk.sent_tokenize(cmt)
        parsedls = []
        for snt in sentences:
            sntparsed = parser.raw_parse(snt)
            parsedls.append(sntparsed)
        sdfdata.append(eg[:3] + [parsedls])
        # print cnn
        # print sdfparsed
        # print sdfdata
        # if cnn > 5: break
        pbar.update(cnn + 1)
        cnn += 1
    pbar.finish()
    return sdfdata
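For reference, sdfprocess expects rows whose fourth field (index 3) holds the raw comment text; a minimal hypothetical call (row values are placeholders, not taken from the original project):

# Hypothetical input: three metadata fields followed by the comment string.
rows = [['rev-001', 'user-42', 5, 'Great movie. Would watch it again.']]
parsed_rows = sdfprocess(rows)  # each returned row ends with a list of parse trees, one per sentence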
Code Example #2
    def __init__(self, sentence):
        en_parser = StanfordParser(
            path_to_jar=
            '../stanford-parser-full-2018-02-27/stanford-parser.jar',
            path_to_models_jar=
            '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
            model_path=
            '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
        )
        sg = StanfordTokenizer(
            path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar'
        )
        self.status = 0
        self.trans = googletrans.Translator()

        self.sentence = sentence.strip("\n").replace(" ", "")

        en_trans = self.trans.translate(sentence).text
        en_trans = sg.tokenize(en_trans)
        try:
            tree = list(en_parser.parse(en_trans))
            self.tree = tree[0]
            # print(self.tree)
            self.rel = []
        except Exception:
            self.status = 1
Code Example #3
    def ConstituencyParser(sentence):

        from nltk.parse.stanford import StanfordParser
        # create parser object
        scp = StanfordParser(path_to_jar='/path/to/stanford-parser.jar', path_to_models_jar='path/to/stanford-parser-models.jar')
        # get parse tree
        result = list(scp.raw_parse(sentence))
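raw_parse returns an iterator of nltk.tree.Tree objects, so result above is a list of trees; a minimal inspection sketch using only standard nltk Tree methods:

# Inspect the first tree produced by raw_parse above.
tree = result[0]
tree.pretty_print()   # ASCII drawing of the constituency tree
print(tree.label())   # root label, typically 'ROOT'
print([sub.leaves() for sub in tree.subtrees(lambda t: t.label() == 'NP')])  # all noun phrases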
Code Example #4
    def __init__(self, corpus):
        """
        We'll use the Stanford Parser to do the heavy lifting here.
        """
        def n_productions(parse_tree, production):
            """
            Returns the number of productions of type `production` in
            parse_tree.
            """
            productions = list(parse_tree.subtrees(
                filter=lambda t: t.label() == production))
            return len(productions)

        jar = ('/usr/local/Cellar/stanford-parser/'
               '3.6.0/libexec/stanford-parser.jar')
        model = ('/usr/local/Cellar/stanford-parser/'
                 '3.6.0/libexec/stanford-parser-3.6.0-models.jar')
        self.corpus = [corpus] if isinstance(corpus[0], tuple) else corpus
        self.parser = StanfordParser(path_to_jar=jar, path_to_models_jar=model)
        self.stats = []

        parsed_sents = self.parser.tagged_parse_sents(self.corpus)
        self.trees = [t for tree in parsed_sents for t in tree]

        for tree in self.trees:
            self.stats.append({
                'depth': tree.height(),
                'noun_phrases': n_productions(tree, 'NP'),
                'prepositional_phrases': n_productions(tree, 'PP'),
                'sbars': n_productions(tree, 'SBAR'),
                'nonterminals': len(tree.productions()),
            })
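tagged_parse_sents expects each sentence as a list of (token, POS-tag) pairs; a hypothetical construction of the class above (ParseStats is an invented name, since the snippet does not show the class header):

# Hypothetical usage; ParseStats stands in for the class whose __init__ is shown above.
corpus = [[('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD'), ('on', 'IN'), ('the', 'DT'), ('mat', 'NN')]]
stats = ParseStats(corpus).stats
print(stats[0]['noun_phrases'], stats[0]['depth'])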
Code Example #5
def Parser(parser_folder_name='',
		parser_folder='',
		parser_model_name='',
		parser_model_path='',
		parser_jarpath=''):
	###
	default_parser_folder_name = 'stanford-parser-full-2017-06-09'
	if len(parser_folder_name)==0:
		parser_folder_name = default_parser_folder_name
	###
	default_parser_folder = os.path.join(os.path.expanduser('~'), 'Stanford NLP', parser_folder_name)
	if len(parser_folder)==0:
		parser_folder = default_parser_folder
	###
	if len(parser_model_path)==0:
		default_parser_model_name = 'stanford-chinese-corenlp-2017-06-09-models.jar'
		if len(parser_model_name)==0:
			parser_model_name = default_parser_model_name
		parser_model = os.path.join(os.path.expanduser('~'), 'Stanford NLP', 'models', parser_model_name)
	else:
		parser_model = parser_model_path
	###
	default_parser_jarpath = os.path.join(parser_folder,'stanford-parser.jar')
	if len(parser_jarpath)==0:
		parser_jarpath = default_parser_jarpath
	###
	parser=StanfordParser(path_to_jar=parser_jarpath, path_to_models_jar=parser_model)
	return parser
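A hypothetical call: the factory's default models jar is a Chinese CoreNLP jar while StanfordParser's own default model_path is the English PCFG, so this sketch passes an English models jar name explicitly (a placeholder file that must exist under ~/Stanford NLP/models):

# Hypothetical usage of the Parser factory above.
eng_parser = Parser(parser_model_name='stanford-parser-3.8.0-models.jar')
for tree in eng_parser.parse('the quick brown fox jumps over the lazy dog'.split()):
    tree.pretty_print()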
Code Example #6
File: nlp_helper.py Project: evadantir/ta-newsX
    def __init__(self):

        classifier_path1 = "stanford/english.muc.7class.distsim.crf.ser.gz"

        # scenario 1
        # classifier_path2 = "stanford/id-ner-model-half.ser.gz"
        # scenario 2
        # classifier_path2 = "stanford/id-ner-model-id.ser.gz"
        # scenario 3
        # classifier_path2 = "stanford/id-ner-model-2.ser.gz"
        ner_jar_path = "stanford/stanford-ner.jar"

        # for handling error nltk internals
        nltk.internals.config_java(options='-xmx5g')

        self.pre = Preprocess()
        self.scp = StanfordParser(
            './stanford/stanford-parser.jar',
            './stanford/stanford-parser-3.9.1-models.jar',
            encoding='utf8')
        self.ner_tagger = StanfordNERTagger(classifier_path1,
                                            ner_jar_path,
                                            encoding='utf8')  # for scenario 3
        self.pos_tagger = StanfordPOSTagger(
            './stanford/english-bidirectional-distsim.tagger',
            './stanford/stanford-postagger.jar',
            encoding='utf8')
        # combining classifier from Stanford with custom classifier
        # self.com_tagger = NERComboTagger(classifier_path1,ner_jar_path,stanford_ner_models=classifier_path1+","+classifier_path2) #for scenario 1 and 2
        self.core_nlp = StanfordCoreNLP('http://localhost', port=9000)
Code Example #7
File: ask.py Project: abiraja2004/NLP_Project
def generateCandidateSentence(file_name, num_sentence):
    file = open(file_name, 'r').read().decode("utf8")
    sent_tokenize_list = sent_tokenize(file)
    #sent_tokenize_list =  [x.encode("utf8") for x in sent_tokenize_list]
    eng_parser = StanfordParser(
        'stanford-parser-full-2017-06-09/stanford-parser.jar',
        'stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar')
    num = 0
    name = sent_tokenize_list[0].split("\n")[0]
    #print(name)
    for sent in sent_tokenize_list:
        #filter some sentences:
        tmp_s = sent.split('\n')
        for s in tmp_s:
            if len(s) < 5:
                continue
            if checkNPVP(s, eng_parser):
                for p in pronoun:
                    if findWholeWord(p):
                        s = re.sub(p, name, s.lower(), count=1)
                print(s)
                num += 1
            if num == num_sentence:
                break
    if num < num_sentence:
        for i in range(num_sentence - num):
            print("None")
Code Example #8
def convert_eng_to_isl(input_string):
    # get all required packages
    download_required_packages()

    if len(input_string.split(' ')) == 1:
        return list(input_string.split(' '))

    # Initializing stanford parser
    parser = StanfordParser()

    # Generates all possible parse trees sort by probability for the sentence
    possible_parse_tree_list = [
        tree for tree in parser.parse(input_string.split())
    ]

    # Get most probable parse tree
    parse_tree = possible_parse_tree_list[0]
    print(parse_tree)
    # output = '(ROOT
    #               (S
    #                   (PP (IN As) (NP (DT an) (NN accountant)))
    #                   (NP (PRP I))
    #                   (VP (VBP want) (S (VP (TO to) (VP (VB make) (NP (DT a) (NN payment))))))
    #                )
    #             )'

    # Convert into tree data structure
    parent_tree = ParentedTree.convert(parse_tree)

    modified_parse_tree = modify_tree_structure(parent_tree)

    parsed_sent = modified_parse_tree.leaves()
    return parsed_sent
Code Example #9
def main():
    """Main function of script."""
    args = utils.read_arguments(__doc__)

    # Read dataset. Each row of x_matrix is a sentence.
    x_matrix, y_vector = utils.pickle_from_file(args['input_filename'])

    # Get Stanford model
    parser = StanfordParser(
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        encoding='utf8')
    # Get parse trees.
    parsed_matrix = []
    for index, document in tqdm(enumerate(x_matrix), total=len(x_matrix)):
        parsed_document = []
        for paragraph_index, paragraph in enumerate(document):
            parsed_paragraph = []
            for sentence_index, sentence in enumerate(paragraph):
                try:
                    parsed_paragraph.append(
                        list(
                            parser.raw_parse(
                                six.text_type(sentence.decode('utf-8')))))
                except UnicodeDecodeError:
                    logging.warning(
                        'Skip sentence {}-{}-{} for unicode error'.format(
                            index, paragraph_index, sentence_index))
                    y_vector[index].pop(sentence_index)
            parsed_document.append(parsed_paragraph)
        parsed_matrix.append(parsed_document)

    # Save output
    logging.info('Saving {} documents'.format(len(parsed_matrix)))
    utils.pickle_to_file((parsed_matrix, y_vector), args['output_filename'])
    logging.info('All operations finished')
Code Example #10
File: parser.py Project: ptravers/centi
def parse_sentences(raw_sentences):
    parser = StanfordParser()

    raw_trees = parser.raw_parse_sents(raw_sentences)

    # Converts messy iterables into simple list of trees
    return [raw_tree[0] for sublist in raw_trees for raw_tree in sublist]
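A hypothetical call, assuming the jars are discoverable through the environment (e.g. STANFORD_PARSER / STANFORD_MODELS or CLASSPATH), since StanfordParser() is built with no explicit paths:

# Hypothetical usage of parse_sentences above.
trees = parse_sentences(["The cat sat on the mat.", "Parsers build constituency trees."])
trees[0].pretty_print()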
Code Example #11
File: nlquery.py Project: codejitsu/labr
 def __init__(self, properties={'lang': 'en'}):
     LoggingInterface.__init__(self)
     self.parser = StanfordParser(
         model_path=MODELS_PATHS[properties['lang']])
     self.wd = WikiData()
     self.wd.set_properties(properties)
     self.properties = properties
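MODELS_PATHS is defined elsewhere in that project; presumably it maps language codes to lexparser model paths, roughly like this (illustrative only, not the project's actual mapping):

# Illustrative shape of MODELS_PATHS; the real mapping lives elsewhere in the project.
MODELS_PATHS = {
    'en': 'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
    'zh': 'edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz',
}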
Code Example #12
    def clean_apriori_data(self, sentences):
        """
        filter apriori data
        methods:
        - clean stop words
        - stemming
        - fuzzy matching within sentence
        """
        stop_words = stopwords.words('english')
        eng_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')

        if config.apriori_test_size < 6:
            for sent in sentences:
                print(sent)
        '''POS'''
        pos_sent = []
        for sent in sentences:
            pos_sent.append(list(eng_parser.parse(
                [w for w in sent.split()]))[0])

        '''filter noun phrase & NLTK stemming'''
        cleaned_sent = []
        for sent in pos_sent:
            wnl = WordNetLemmatizer()
            tmp_sent = []
            for s in sent.subtrees(lambda t: t.height() <= 4 and t.label() == 'NP'):
                '''clean stop words & stemming'''
                tmp = [wnl.lemmatize(w, pos='n') for w in s.leaves() if w not in stop_words]
                '''length <= 3 & filter repeated lists'''
                if 0 < len(tmp) <= 3 and tmp not in tmp_sent:
                    tmp_sent.append(tmp)
            cleaned_sent.append(tmp_sent)

        return cleaned_sent
Code Example #13
 def __init__(self, conf, query_text):
     self.conf = conf
     self.stanford_parser_loc = self.conf.stanford_parser_home + 'stanford-parser.jar'
     self.stanford_parser_model_loc = self.conf.stanford_parser_home + 'stanford-parser-3.9.2-models.jar'
     self.parse_model = StanfordParser(self.stanford_parser_loc, self.stanford_parser_model_loc,
                                       model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
     self.query_text = query_text
Code Example #14
def main(corpus, verbs, processes, outfile, sub_sentences):
    """ Compute the LU distribution in the corpus, i.e. how many LUs per sentence
    """
    global splitter, tagger, parser, all_verbs
    splitter = PunktSentenceSplitter('en')
    tagger = TTPosTagger('en')
    parser = StanfordParser(
        path_to_jar='dev/stanford-corenlp-3.6.0.jar',
        path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
        java_options=' -mx1G -Djava.ext.dirs=dev/'
    )  # no way to make classpath work
    all_verbs = reduce(lambda x, y: x.union(y),
                       imap(set,
                            json.load(verbs).values()), set())
    all_verbs.discard('be')
    all_verbs.discard('have')

    args = load_corpus(corpus, 'bio', text_only=True)
    worker = worker_with_sub_sentences if sub_sentences else worker_with_sentences
    counter = defaultdict(int)

    for i, counts in enumerate(parallel.map(worker, args, processes)):
        for k, v in counts.iteritems():
            counter[k] += v

        if (i + 1) % 10000 == 0:
            logger.info('Processed %d documents', i + 1)

    counter = OrderedDict(sorted(counter.items(), key=lambda (k, v): k))
    for k, v in counter.iteritems():
        print k, v

    json.dump(counter, outfile, indent=2)
Code Example #15
def main():
    TRAINING_INPUT_FILE = 'data/positive_negative_reviews_sentiment_2k.csv'
    OUTPUT_FILE = 'data/positive_negative_trigrams_2k.csv'
    rows = csv.getRows(TRAINING_INPUT_FILE)
    cols = csv.getHeader(TRAINING_INPUT_FILE)
    cols.append('trigrams')
    for row in rows:
        row.append('dummy data')
    csv.writeFile(OUTPUT_FILE, rows, cols)
    print cols

    # parser = stanford.StanfordParser(model_path="/location/of/the/englishPCFG.ser.gz")
    parser = StanfordParser(
        model_path=
        "/Users/rohankohli/Documents/workspace/CoreNLP/models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    )
    sentences = parser.raw_parse_sents(
        ("Hello, My name is Melroy.", "What is your name?"))
    print sentences
    print sentences.next()
    return

    EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."
    print(sent_tokenize(EXAMPLE_TEXT))
    return

    # text = 'Punkt knows that the periods in Mr. Smith and Johann S. Bach do not mark sentence boundaries. And sometimes sentences can start with non-capitalized words.  i is a good variable name.'
    # sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    # print('\n-----\n'.join(sent_detector.tokenize(text.strip())))

    return
Code Example #16
    def POS_data(self):
        """POS sentences"""
        tag = 'pos'
        idx = 19
        file_name = 'data/normalize_{}_piece/nor_{}_{}.csv'.format(tag, tag, idx)
        with open(file_name, 'r') as file:
            sentences = file.read().strip().split('\n')

        stop_words = stopwords.words('english')
        eng_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
        eng_parser.java_options = '-mx3000m'

        print('=' * 100)
        print('current tag: {}, file idx: {}'.format(tag, idx))

        '''POS'''
        print('=' * 100)
        print('Starting POS...')
        pos_sent = []
        for sent in tqdm(sentences):
            pos_sent.append(list(eng_parser.parse(
                [w for w in sent.split()]))[0])

        '''save file'''
        save_file = 'data/{}_sent/{}_sent_{}.csv'.format(tag, tag, idx)
        with open(save_file, mode='w') as file:
            for sent, pos in zip(sentences, pos_sent):
                file.write(sent + '\t')
                file.write(str(pos) + '\t')
        print('Finish! Saved in {}'.format(save_file))
Code Example #17
def cStructure():
    print '######## C Structure'
    parser = StanfordParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    example = parser.raw_parse("Who were the CEO of IBM?")
#    example = parser.raw_parse("Steve Jobs was Founder of Apple. He was born in United States of America.")

    #for line in example:
        #for sentence in line:
            #sentence.draw()

    #print type(example)

    example = list(example)
    #print example
    abcabc = example[0]
    abcabc1 = abcabc[0]
    print type(abcabc)
    hello = str(abcabc)
    print type(abcabc)
    print hello
    #print abcabc1.label()

    for a in abcabc:
        #print a.height()
        if a.height() > 1:
            extractNP(a)


    print myNounPhrasesTree
Code Example #18
File: converter.py Project: codejitsu/labr
    def __init__(self, conversion_path=CONVERSION_PATH):
        with open(conversion_path, 'r') as f:
            self.metrics = json.load(f)

        self.inflect = inflect.engine()
        self.stemmer = SnowballStemmer('english')
        self.parser = StanfordParser(model_path=MODELS_PATH)
Code Example #19
File: featuremaker.py Project: marjanhs/AA_CNN
 def parser(self):
     if self._stf_parser is None:
         self._stf_parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
     result = self._stf_parser.parse_sents(self._split_data)
     result = sum([[parse for parse in dep_graphs] for dep_graphs in result], [])
     for i in result:
         print(i)
Code Example #20
def check(sent):

    parser = StanfordParser()

    # Parse the example sentence

    print(sent)
    t = list(parser.raw_parse(sent))[0]
    print(t)
    t = ParentedTree.convert(t)
    print(t)
    t.pretty_print()
    try:
        subj = find_subject(t)
    except:
        subj = []
    try:
        pred = find_predicate(t)
    except:
        pred = []
    try:
        obj = find_object(t)
    except:
        obj = []

    print(subj)
    print(pred)
    print(obj)
    return subj, pred, obj
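find_subject, find_predicate and find_object are helpers defined elsewhere in that project; a hypothetical call (the sentence is a placeholder, and StanfordParser() again relies on jars found via the environment):

# Hypothetical usage of check() above.
subj, pred, obj = check("The dog chased the cat.")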
Code Example #21
 def extract_h4_parser(self, sentence):
     subtrees = []
     parser = StanfordParser(model_path="E:/Stanford parser/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
     t = parser.raw_parse(str(sentence))
     for i in t:
         for j in i.subtrees(lambda st: st.height() == 4):
             subtrees.append(str(j))
     return subtrees
Code Example #22
 def __init__(self, language='english'):
     """
     Initialize 
     """
     self.parser = StanfordParser()
     self.sent_detector = data.load('tokenizers/punkt/' + language +
                                    '.pickle')
     self.analyzer = SentimentIntensityAnalyzer()
Code Example #23
def parser(sentence):
    chi_parser = StanfordParser(
        path_to_jar=path_dit.get('path_to_jar'),
        path_to_models_jar=path_dit.get('path_to_models_jar'),
        model_path=path_dit.get('model_path'))
    re = chi_parser.parse(sentence.split())

    return re
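path_dit is assumed to be a configuration dict supplying path_to_jar, path_to_models_jar and model_path; a hypothetical call with a pre-segmented Chinese sentence (placeholder text):

# Hypothetical usage of parser() above; the input is already whitespace-segmented.
for tree in parser(u'他 喜欢 自然 语言 处理'):
    tree.pretty_print()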
Code Example #24
 def define_stanford_parser(
         self,
         path_to_models_jar='/Library/Tools/stanford/parser/stanford-parser-models.jar',
         model_path=u"edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz"
 ):
     _stanford_parser = StanfordParser(
         path_to_models_jar=path_to_models_jar, model_path=model_path)
     return _stanford_parser
Code Example #25
def parseSentence(inputSentence):
    parser = StanfordParser(
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    parsedSentence = parser.raw_parse(inputSentence)
    sent = printSentence(parsedSentence)
    ret = (str(sent)
           .replace("\n", "")
           .replace('    ', "")
           .replace("(", "{")
           .replace(")", "}")
           .replace(" {", "{"))
    return ret
Code Example #26
def en_parse(sent):
    """
    Syntactically parse an English sentence.
    """
    parser = StanfordParser(
        r'E:\standford_nlp\stanford-parser-full-2018-02-27\stanford-parser.jar',
        r'E:\standford_nlp\stanford-parser-full-2018-02-27\stanford-parser-3.9.1-models.jar'
    )
    return list(parser.raw_parse(sent))[0]
Code Example #27
def cn_parse(sent):
    """
    Syntactically parse a Chinese sentence; remember to change model_path.
    """
    parser = StanfordParser(
        r'E:\standford_nlp\stanford-parser-full-2018-02-27\stanford-parser.jar',
        r'E:\standford_nlp\stanford-parser-full-2018-02-27\stanford-parser-3.9.1-models.jar',
        model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
    return list(parser.raw_parse(sent))[0]
Code Example #28
 def __init__(self):
     self.token_handler_obj = token_handler()
     self.semantic_group_obj = semantic_group()
     jar_file = '/root/Stanford_CoreNLP/stanford-corenlp-3.8.0.jar'
     model_path = '/root/Stanford_CoreNLP/stanford-corenlp-3.8.0-models.jar'
     self.parser = StanfordParser(jar_file, model_path)
     self.ambiguities = {'type': 'options', 'data': []}
     self.ambi_phrases = [[]]
     self.index = -1  # Variable used in the populate_ambiguites() definition
Code Example #29
File: analisisChinese.py Project: jack1545/nltkdemo
def parser(tokens):
    from nltk.parse.stanford import StanfordParser

    chi_parser = StanfordParser(
        r"E:\03_tools\machine learning\stanfordnlp\3.7\stanford-parser-full-2016-10-31\stanford-parser.jar",
        r"E:\03_tools\machine learning\stanfordnlp\3.7\stanford-parser-full-2016-10-31\stanford-parser-3.7.0-models.jar",
        r"E:\03_tools\machine learning\stanfordnlp\3.7\stanford-chinese-corenlp-2016-10-31-models\edu\stanford\nlp"
        r"\models\lexparser\chinesePCFG.ser.gz")
    print(list(chi_parser.parse(tokens)))
Code Example #30
def main():
    parser = StanfordParser(
        path_to_jar=script_wrapper.stanford_parser_jar,
        path_to_models_jar=script_wrapper.stanford_model_jar)
    st = StanfordNERTagger(
        model_filename=
        '../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
        path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
    raw_sent = "Dempsey was drafted by Major League Soccer club New England Revolution."
    sent = word_tokenize(raw_sent)
    ne_tuple = st.cur_tag(
        sent
    )  # ##need write interface for tokenized sent (http://nlp.stanford.edu/software/crf-faq.shtml#tokenized)
    print ne_tuple

    print parser.raw_parse(raw_sent).next()

    return
    # find name entity
    f = 0
    ne_list = []
    for (ne, label) in ne_tuple:
        if label == 'PERSON':
            f = 1
        if f and label != 'PERSON':
            break
        if f:
            ne_list.append(ne)
    # print ne_list

    init_file(main_tree)
    ####### my issue here: 1. don't know how to get NP. 2. is there a quicker way to find PERSON ?
    # try head to ask who/what
    pattern = "S < NP=np"
    head = check_output([
        'bash',  ###add bash !!!!
        tregex_path,
        '-s',
        pattern,
        init_tree_file
    ])
    print head

    def get_main_verbs(tree):
        pattern = '/(VB.?)/=main >+ (VP) (S > ROOT)'
        main_verbs = check_output([
            'bash',  ###add bash !!!!
            tregex_path,
            '-s',
            pattern,
            init_tree_file
        ])
        print main_verbs
        main_verbs = main_verbs.split('\n')[:-1]
        main_verbs = [Tree.fromstring(main_verb) for main_verb in main_verbs]
        return main_verbs