def sdfprocess(rvdata):
    parser = StanfordParser(
        path_to_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar',
        path_to_models_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1-models.jar',
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        java_options='-mx15000m')
    sdfdata = []
    cnn = 0
    widgets = ['Progress: ', Percentage(), ' ', Bar(marker=RotatingMarker()),
               ' ', ETA(), ' ', FileTransferSpeed()]
    pbar = ProgressBar(widgets=widgets, maxval=len(rvdata)).start()
    for eg in rvdata:
        cmt = eg[3].decode('utf-8')  # index 3 holds the comment text
        sentences = nltk.sent_tokenize(cmt)
        parsedls = []
        for snt in sentences:
            parsedls.append(parser.raw_parse(snt))
        sdfdata.append(eg[:3] + [parsedls])
        pbar.update(cnn + 1)
        cnn += 1
    pbar.finish()
    return sdfdata
def __init__(self, sentence):
    en_parser = StanfordParser(
        path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar',
        path_to_models_jar='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
        model_path='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    sg = StanfordTokenizer(
        path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar')
    self.status = 0
    self.trans = googletrans.Translator()
    self.sentence = sentence.strip("\n").replace(" ", "")
    # Translate to English, tokenize, then parse the translation.
    en_trans = self.trans.translate(sentence).text
    en_trans = sg.tokenize(en_trans)
    try:
        tree = list(en_parser.parse(en_trans))
        self.tree = tree[0]
        self.rel = []
    except Exception:  # was a bare `except:`
        self.status = 1  # mark the sentence as unparseable
def ConstituencyParser(sentence):
    from nltk.parse.stanford import StanfordParser
    # create parser object
    scp = StanfordParser(path_to_jar='/path/to/stanford-parser.jar',
                         path_to_models_jar='/path/to/stanford-parser-models.jar')
    # get parse tree (the original computed it but never returned it)
    result = list(scp.raw_parse(sentence))
    return result
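# A minimal usage sketch for the helper above (not from the original source);
# assumes the two jar paths inside ConstituencyParser point at a real Stanford
# Parser download. raw_parse yields nltk Tree objects, so element 0 is the
# best-scoring parse.
trees = ConstituencyParser("The quick brown fox jumps over the lazy dog.")
best = trees[0]          # best-scoring constituency parse
best.pretty_print()      # ASCII rendering of the tree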
def __init__(self, corpus):
    """
    We'll use the Stanford Parser to do the heavy lifting here.
    """
    def n_productions(parse_tree, production):
        """
        Returns the number of productions of type `production` in parse_tree.
        """
        productions = list(parse_tree.subtrees(
            filter=lambda t: t.label() == production))
        return len(productions)

    jar = ('/usr/local/Cellar/stanford-parser/'
           '3.6.0/libexec/stanford-parser.jar')
    model = ('/usr/local/Cellar/stanford-parser/'
             '3.6.0/libexec/stanford-parser-3.6.0-models.jar')
    self.corpus = [corpus] if isinstance(corpus[0], tuple) else corpus
    self.parser = StanfordParser(path_to_jar=jar, path_to_models_jar=model)
    self.stats = []
    parsed_sents = self.parser.tagged_parse_sents(self.corpus)
    self.trees = [t for tree in parsed_sents for t in tree]
    for tree in self.trees:
        self.stats.append({
            'depth': tree.height(),
            'noun_phrases': n_productions(tree, 'NP'),
            'prepositional_phrases': n_productions(tree, 'PP'),
            'sbars': n_productions(tree, 'SBAR'),
            'nonterminals': len(tree.productions()),
        })
def Parser(parser_folder_name='', parser_folder='', parser_model_name='',
           parser_model_path='', parser_jarpath=''):
    # Fall back to the default parser folder name.
    default_parser_folder_name = 'stanford-parser-full-2017-06-09'
    if len(parser_folder_name) == 0:
        parser_folder_name = default_parser_folder_name
    # Fall back to ~/Stanford NLP/<folder name>.
    default_parser_folder = os.path.join(os.path.expanduser('~'),
                                         'Stanford NLP', parser_folder_name)
    if len(parser_folder) == 0:
        parser_folder = default_parser_folder
    # Resolve the model jar: an explicit path wins, else ~/Stanford NLP/models/<name>.
    if len(parser_model_path) == 0:
        default_parser_model_name = 'stanford-chinese-corenlp-2017-06-09-models.jar'
        if len(parser_model_name) == 0:
            parser_model_name = default_parser_model_name
        parser_model = os.path.join(os.path.expanduser('~'), 'Stanford NLP',
                                    'models', parser_model_name)
    else:
        parser_model = parser_model_path
    # The parser jar lives inside the parser folder unless overridden.
    default_parser_jarpath = os.path.join(parser_folder, 'stanford-parser.jar')
    if len(parser_jarpath) == 0:
        parser_jarpath = default_parser_jarpath
    parser = StanfordParser(path_to_jar=parser_jarpath,
                            path_to_models_jar=parser_model)
    return parser
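# A sketch of how the factory above resolves its defaults (illustrative, not
# from the original source); assumes the standard layout under ~/Stanford NLP.

# An all-defaults call resolves to:
#   ~/Stanford NLP/stanford-parser-full-2017-06-09/stanford-parser.jar
#   ~/Stanford NLP/models/stanford-chinese-corenlp-2017-06-09-models.jar
parser = Parser()

# An explicit model path (hypothetical here) bypasses the name-based lookup.
parser = Parser(parser_model_path='/opt/nlp/stanford-parser-3.8.0-models.jar')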
def __init__(self):
    classifier_path1 = "stanford/english.muc.7class.distsim.crf.ser.gz"
    # scenario 1
    # classifier_path2 = "stanford/id-ner-model-half.ser.gz"
    # scenario 2
    # classifier_path2 = "stanford/id-ner-model-id.ser.gz"
    # scenario 3
    # classifier_path2 = "stanford/id-ner-model-2.ser.gz"
    ner_jar_path = "stanford/stanford-ner.jar"
    # Raise the JVM heap so NLTK internals do not run out of memory
    # (the flag is case-sensitive; the original '-xmx5g' would be rejected).
    nltk.internals.config_java(options='-Xmx5g')
    self.pre = Preprocess()
    self.scp = StanfordParser(
        './stanford/stanford-parser.jar',
        './stanford/stanford-parser-3.9.1-models.jar',
        encoding='utf8')
    self.ner_tagger = StanfordNERTagger(classifier_path1, ner_jar_path,
                                        encoding='utf8')
    # for scenario 3
    self.pos_tagger = StanfordPOSTagger(
        './stanford/english-bidirectional-distsim.tagger',
        './stanford/stanford-postagger.jar',
        encoding='utf8')
    # Combining the Stanford classifier with a custom classifier
    # (scenarios 1 and 2):
    # self.com_tagger = NERComboTagger(
    #     classifier_path1, ner_jar_path,
    #     stanford_ner_models=classifier_path1 + "," + classifier_path2)
    self.core_nlp = StanfordCoreNLP('http://localhost', port=9000)
def generateCandidateSentence(file_name, num_sentence):
    text = open(file_name, 'r').read().decode("utf8")
    sent_tokenize_list = sent_tokenize(text)
    eng_parser = StanfordParser(
        'stanford-parser-full-2017-06-09/stanford-parser.jar',
        'stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar')
    num = 0
    # The first line of the document is taken as the subject's name.
    name = sent_tokenize_list[0].split("\n")[0]
    for sent in sent_tokenize_list:
        # Filter out fragments that are too short to be sentences.
        tmp_s = sent.split('\n')
        for s in tmp_s:
            if len(s) < 5:
                continue
            if checkNPVP(s, eng_parser):
                # Replace the first pronoun with the subject's name.
                for p in pronoun:
                    if findWholeWord(p):
                        s = re.sub(p, name, s.lower(), count=1)
                print(s)
                num += 1  # was `num += 0`, which made the cut-off unreachable
                if num == num_sentence:
                    break
    if num < num_sentence:
        for i in range(num_sentence - num):
            print("None")
def convert_eng_to_isl(input_string):
    # Get all required packages.
    download_required_packages()

    # Single-word inputs need no parsing (`== 1`, not the identity test `is 1`).
    if len(input_string.split(' ')) == 1:
        return input_string.split(' ')

    # Initialize the Stanford parser.
    parser = StanfordParser()

    # Generate all possible parse trees, sorted by probability.
    possible_parse_tree_list = [tree for tree in parser.parse(input_string.split())]

    # Take the most probable parse tree, e.g.
    # (ROOT
    #   (S
    #     (PP (IN As) (NP (DT an) (NN accountant)))
    #     (NP (PRP I))
    #     (VP (VBP want) (S (VP (TO to) (VP (VB make) (NP (DT a) (NN payment))))))))
    parse_tree = possible_parse_tree_list[0]
    print(parse_tree)

    # Convert into a mutable tree structure and reorder for ISL.
    parent_tree = ParentedTree.convert(parse_tree)
    modified_parse_tree = modify_tree_structure(parent_tree)

    parsed_sent = modified_parse_tree.leaves()
    return parsed_sent
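# Hypothetical usage of convert_eng_to_isl (not from the original source);
# assumes download_required_packages and modify_tree_structure are defined
# elsewhere in the project, and that CLASSPATH/STANFORD_MODELS point at a
# parser download so the no-argument StanfordParser() constructor works.
words = convert_eng_to_isl("As an accountant I want to make a payment")
print(words)  # leaves of the reordered tree, i.e. tokens in ISL word order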
def main(): """Main function of script.""" args = utils.read_arguments(__doc__) # Read dataset. Each row of x_matrix is a sentence. x_matrix, y_vector = utils.pickle_from_file(args['input_filename']) # Get Stanford model parser = StanfordParser( model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz', encoding='utf8') # Get parse trees. parsed_matrix = [] for index, document in tqdm(enumerate(x_matrix), total=len(x_matrix)): parsed_document = [] for paragraph_index, paragraph in enumerate(document): parsed_paragraph = [] for sentence_index, sentence in enumerate(paragraph): try: parsed_paragraph.append( list( parser.raw_parse( six.text_type(sentence.decode('utf-8'))))) except UnicodeDecodeError: logging.warning( 'Skip sentence {}-{}-{} for unicode error'.format( index, paragraph_index, sentence_index)) y_vector[index].pop(sentence_index) parsed_document.append(parsed_paragraph) parsed_matrix.append(parsed_document) # Save output logging.info('Saving {} documents'.format(len(parsed_matrix))) utils.pickle_to_file((parsed_matrix, y_vector), args['output_filename']) logging.info('All operations finished')
def parse_sentences(raw_sentences):
    parser = StanfordParser()
    raw_trees = parser.raw_parse_sents(raw_sentences)
    # Convert the messy nested iterables into a simple list of trees.
    return [raw_tree[0] for sublist in raw_trees for raw_tree in sublist]
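# Usage sketch (not from the original source); assumes CLASSPATH and
# STANFORD_MODELS are set so the no-argument StanfordParser() can find its jars.
trees = parse_sentences([
    "The cat sat on the mat.",
    "Parsers build constituency trees.",
])
for tree in trees:
    tree.pretty_print()  # one tree per input sentence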
def __init__(self, properties=None):
    if properties is None:
        properties = {'lang': 'en'}  # avoid a shared mutable default argument
    LoggingInterface.__init__(self)
    self.parser = StanfordParser(model_path=MODELS_PATHS[properties['lang']])
    self.wd = WikiData()
    self.wd.set_properties(properties)
    self.properties = properties
def clean_apriori_data(self, sentences):
    """
    Filter apriori data.

    Methods:
        - remove stop words
        - stemming
        - fuzzy matching within sentence
    """
    stop_words = stopwords.words('english')
    eng_parser = StanfordParser(
        model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')

    if config.apriori_test_size < 6:
        for sent in sentences:
            print(sent)

    # Parse each sentence into a constituency tree.
    pos_sent = []
    for sent in sentences:
        pos_sent.append(list(eng_parser.parse(sent.split()))[0])

    # Keep noun phrases and lemmatize with NLTK.
    cleaned_sent = []
    wnl = WordNetLemmatizer()
    for sent in pos_sent:
        tmp_sent = []
        for s in sent.subtrees(lambda t: t.height() <= 4 and t.label() == 'NP'):
            # Remove stop words and lemmatize the remaining nouns.
            tmp = [wnl.lemmatize(w, pos='n') for w in s.leaves()
                   if w not in stop_words]
            # Keep phrases of length <= 3 and drop duplicates.
            if 0 < len(tmp) <= 3 and tmp not in tmp_sent:
                tmp_sent.append(tmp)
        cleaned_sent.append(tmp_sent)
    return cleaned_sent  # was `return pos_sent`, which discarded the cleaning
def __init__(self, conf, query_text):
    self.conf = conf
    self.stanford_parser_loc = self.conf.stanford_parser_home + 'stanford-parser.jar'
    self.stanford_parser_model_loc = (self.conf.stanford_parser_home +
                                      'stanford-parser-3.9.2-models.jar')
    self.parse_model = StanfordParser(
        self.stanford_parser_loc,
        self.stanford_parser_model_loc,
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    self.query_text = query_text
def main(corpus, verbs, processes, outfile, sub_sentences):
    """ Compute the LU distribution in the corpus, i.e. how many LUs per sentence """
    global splitter, tagger, parser, all_verbs
    splitter = PunktSentenceSplitter('en')
    tagger = TTPosTagger('en')
    parser = StanfordParser(
        path_to_jar='dev/stanford-corenlp-3.6.0.jar',
        path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
        java_options=' -mx1G -Djava.ext.dirs=dev/'  # no way to make classpath work
    )
    all_verbs = reduce(lambda x, y: x.union(y),
                       imap(set, json.load(verbs).values()), set())
    all_verbs.discard('be')
    all_verbs.discard('have')

    args = load_corpus(corpus, 'bio', text_only=True)
    worker = worker_with_sub_sentences if sub_sentences else worker_with_sentences
    counter = defaultdict(int)

    for i, counts in enumerate(parallel.map(worker, args, processes)):
        for k, v in counts.iteritems():
            counter[k] += v
        if (i + 1) % 10000 == 0:
            logger.info('Processed %d documents', i + 1)

    counter = OrderedDict(sorted(counter.items(), key=lambda (k, v): k))
    for k, v in counter.iteritems():
        print k, v
    json.dump(counter, outfile, indent=2)
def main():
    TRAINING_INPUT_FILE = 'data/positive_negative_reviews_sentiment_2k.csv'
    OUTPUT_FILE = 'data/positive_negative_trigrams_2k.csv'

    # Copy the input CSV, adding a placeholder 'trigrams' column.
    rows = csv.getRows(TRAINING_INPUT_FILE)
    cols = csv.getHeader(TRAINING_INPUT_FILE)
    cols.append('trigrams')
    for row in rows:
        row.append('dummy data')
    csv.writeFile(OUTPUT_FILE, rows, cols)
    print cols

    # parser = stanford.StanfordParser(model_path="/location/of/the/englishPCFG.ser.gz")
    parser = StanfordParser(
        model_path="/Users/rohankohli/Documents/workspace/CoreNLP/models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    sentences = parser.raw_parse_sents(
        ("Hello, My name is Melroy.", "What is your name?"))
    print sentences
    print sentences.next()  # parse iterator for the first sentence
def POS_data(self):
    """POS-tag sentences."""
    tag = 'pos'
    idx = 19
    file_name = 'data/normalize_{}_piece/nor_{}_{}.csv'.format(tag, tag, idx)
    with open(file_name, 'r') as file:
        sentences = file.read().strip().split('\n')

    eng_parser = StanfordParser(
        model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    eng_parser.java_options = '-mx3000m'  # raise the JVM heap limit

    print('=' * 100)
    print('current tag: {}, file idx: {}'.format(tag, idx))

    # Parse each sentence into a constituency tree.
    print('=' * 100)
    print('Starting POS...')
    pos_sent = []
    for sent in tqdm(sentences):
        pos_sent.append(list(eng_parser.parse(sent.split()))[0])

    # Save the results.
    save_file = 'data/{}_sent/{}_sent_{}.csv'.format(tag, tag, idx)
    with open(save_file, mode='w') as file:
        for sent, pos in zip(sentences, pos_sent):
            file.write(sent + '\t')
            file.write(str(pos) + '\t')
    print('Finish! Saved in {}'.format(save_file))
def cStructure():
    print '######## C Structure'
    parser = StanfordParser(path_to_jar=path_to_jar,
                            path_to_models_jar=path_to_models_jar)
    example = parser.raw_parse("Who were the CEO of IBM?")
    # example = parser.raw_parse("Steve Jobs was Founder of Apple. "
    #                            "He was born in United States of America.")
    example = list(example)
    tree = example[0]  # best-scoring parse
    print type(tree)
    print str(tree)
    # Extract noun phrases from every non-leaf child of the root.
    for a in tree:
        if a.height() > 1:
            extractNP(a)
    print myNounPhrasesTree
def __init__(self, conversion_path=CONVERSION_PATH):
    with open(conversion_path, 'r') as f:
        self.metrics = json.load(f)
    self.inflect = inflect.engine()
    self.stemmer = SnowballStemmer('english')
    self.parser = StanfordParser(model_path=MODELS_PATH)
def parser(self):
    if self._stf_parser is None:
        self._stf_parser = StanfordParser(
            model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    result = self._stf_parser.parse_sents(self._split_data)
    # Flatten the per-sentence iterators into one list of parses.
    result = sum([[parse for parse in dep_graphs] for dep_graphs in result], [])
    for i in result:
        print(i)
def check(sent):
    parser = StanfordParser()

    # Parse the example sentence
    print(sent)
    t = list(parser.raw_parse(sent))[0]
    print(t)
    t = ParentedTree.convert(t)
    print(t)
    t.pretty_print()

    # Fall back to empty lists when a constituent cannot be found
    # (the bare `except:` clauses are narrowed to `except Exception:`).
    try:
        subj = find_subject(t)
    except Exception:
        subj = []
    try:
        pred = find_predicate(t)
    except Exception:
        pred = []
    try:
        obj = find_object(t)
    except Exception:
        obj = []

    print(subj)
    print(pred)
    print(obj)
    return subj, pred, obj
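# Hypothetical call (not from the original source); assumes find_subject,
# find_predicate, and find_object are defined elsewhere and the parser jars
# are discoverable via CLASSPATH.
subj, pred, obj = check("The dog chased the ball.")
# Each value is whatever the find_* helpers extract, or [] on failure.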
def extract_h4_parser(self, sentence):
    # `results` instead of `list`, which shadowed the builtin.
    results = []
    parser = StanfordParser(
        model_path="E:/Stanford parser/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    t = parser.raw_parse(str(sentence))
    for i in t:
        # Collect every subtree exactly four levels high.
        for j in i.subtrees(lambda st: st.height() == 4):
            results.append(str(j))
    return results
def __init__(self, language='english'):
    """ Initialize """
    self.parser = StanfordParser()
    self.sent_detector = data.load('tokenizers/punkt/' + language + '.pickle')
    self.analyzer = SentimentIntensityAnalyzer()
def parser(sentence):
    chi_parser = StanfordParser(
        path_to_jar=path_dit.get('path_to_jar'),
        path_to_models_jar=path_dit.get('path_to_models_jar'),
        model_path=path_dit.get('model_path'))
    # `result` instead of `re`, which shadowed the regex module name.
    result = chi_parser.parse(sentence.split())
    return result
def define_stanford_parser(
        self,
        path_to_models_jar='/Library/Tools/stanford/parser/stanford-parser-models.jar',
        model_path=u"edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz"):
    _stanford_parser = StanfordParser(path_to_models_jar=path_to_models_jar,
                                      model_path=model_path)
    return _stanford_parser
def parseSentence(inputSentence):
    parser = StanfordParser(
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    parsedSentence = parser.raw_parse(inputSentence)
    sent = printSentence(parsedSentence)
    # Flatten the tree string and swap parentheses for braces.
    ret = (str(sent)
           .replace("\n", "")
           .replace(' ', "")
           .replace("(", "{")
           .replace(")", "}")
           .replace(" {", "{"))
    return ret
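# Illustration of the brace conversion above (not from the original source);
# assumes printSentence returns the tree's bracketed string. Because all
# spaces are stripped, tags run into their tokens:
#   "(ROOT (S (NP (PRP I)) (VP (VBP run))))"
# becomes
#   "{ROOT{S{NP{PRPI}}{VP{VBPrun}}}}"
print(parseSentence("I run"))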
def en_parse(sent):
    """Constituency-parse an English sentence."""
    parser = StanfordParser(
        r'E:\standford_nlp\stanford-parser-full-2018-02-27\stanford-parser.jar',
        r'E:\standford_nlp\stanford-parser-full-2018-02-27\stanford-parser-3.9.1-models.jar')
    return list(parser.raw_parse(sent))[0]
def cn_parse(sent):
    """Constituency-parse a Chinese sentence; note the changed model_path."""
    parser = StanfordParser(
        r'E:\standford_nlp\stanford-parser-full-2018-02-27\stanford-parser.jar',
        r'E:\standford_nlp\stanford-parser-full-2018-02-27\stanford-parser-3.9.1-models.jar',
        model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
    return list(parser.raw_parse(sent))[0]
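# Hypothetical usage of the pair above (not from the original source);
# assumes the jar paths under E:\standford_nlp exist. The Chinese model is
# given pre-segmented, space-separated input here.
en_parse("The weather is nice today").pretty_print()
cn_parse("今天 天气 很 好").pretty_print()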
def __init__(self):
    self.token_handler_obj = token_handler()
    self.semantic_group_obj = semantic_group()
    jar_file = '/root/Stanford_CoreNLP/stanford-corenlp-3.8.0.jar'
    model_path = '/root/Stanford_CoreNLP/stanford-corenlp-3.8.0-models.jar'
    self.parser = StanfordParser(jar_file, model_path)
    self.ambiguities = {'type': 'options', 'data': []}
    self.ambi_phrases = [[]]
    self.index = -1  # variable used in the populate_ambiguites() definition
def parser(tokens):
    from nltk.parse.stanford import StanfordParser
    chi_parser = StanfordParser(
        r"E:\03_tools\machine learning\stanfordnlp\3.7\stanford-parser-full-2016-10-31\stanford-parser.jar",
        r"E:\03_tools\machine learning\stanfordnlp\3.7\stanford-parser-full-2016-10-31\stanford-parser-3.7.0-models.jar",
        r"E:\03_tools\machine learning\stanfordnlp\3.7\stanford-chinese-corenlp-2016-10-31-models\edu\stanford\nlp"
        r"\models\lexparser\chinesePCFG.ser.gz")
    print(list(chi_parser.parse(tokens)))
def main():
    parser = StanfordParser(path_to_jar=script_wrapper.stanford_parser_jar,
                            path_to_models_jar=script_wrapper.stanford_model_jar)
    st = StanfordNERTagger(
        model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
        path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
    raw_sent = "Dempsey was drafted by Major League Soccer club New England Revolution."
    sent = word_tokenize(raw_sent)
    # Need an interface for pre-tokenized sentences
    # (http://nlp.stanford.edu/software/crf-faq.shtml#tokenized).
    ne_tuple = st.cur_tag(sent)
    print ne_tuple
    print parser.raw_parse(raw_sent).next()
    return

    # --- Unreachable scratch code below, kept as in the original. ---
    # Find the name entity.
    f = 0
    ne_list = []
    for (ne, label) in ne_tuple:
        if label == 'PERSON':
            f = 1
        if f and label != 'PERSON':
            break
        if f:
            ne_list.append(ne)

    init_file(main_tree)
    # Open issues: 1. how to get the NP; 2. is there a quicker way to find PERSON?
    # Try the head to ask a who/what question.
    pattern = "S < NP=np"
    head = check_output(['bash',  # must invoke via bash
                         tregex_path, '-s', pattern, init_tree_file])
    print head

def get_main_verbs(tree):
    pattern = '/(VB.?)/=main >+ (VP) (S > ROOT)'
    main_verbs = check_output(['bash',  # must invoke via bash
                               tregex_path, '-s', pattern, init_tree_file])
    print main_verbs
    main_verbs = main_verbs.split('\n')[:-1]
    main_verbs = [Tree.fromstring(main_verb) for main_verb in main_verbs]
    return main_verbs