class RPG:
    """Pipeline that distills Reddit comment dumps (JSON-lines) into
    question-answer training pairs.

    Stages: subreddit filter -> parent/child comment pairing ->
    question detection via CoreNLP constituency parse -> entity filter.
    Each stage reads a JSON-lines file, writes a derived file into
    ``storage_path`` and returns the new file's path; a stage is skipped
    when its output already exists unless ``update`` is true.
    """

    def __init__(self, storage_path, corenlp_url):
        # Directory that receives every derived file.
        self.storage_path = os.path.abspath(storage_path)
        # CoreNLP HTTP endpoint used for constituency parsing.
        self.parser = CoreNLPParser(url=corenlp_url)

    def create_question_json(self, filepath: str,
                             max_question_word_count: int = 30,
                             update: bool = False) -> str:
        """Keep only (question, answer) line pairs whose question contains a
        question constituent (SBARQ/SQ) and has at most
        ``max_question_word_count`` words.

        ``filepath`` must alternate question line / answer line, as produced
        by :meth:`find_comment_pairs`.  Returns the filtered file's path.
        """
        newfilepath = os.path.join(
            self.storage_path, 'questions_' + os.path.basename(filepath))
        if not update and os.path.exists(newfilepath):
            return newfilepath
        question_symbols = {'SBARQ', 'SQ'}
        with open(newfilepath, 'w') as wfile, \
                open(os.path.abspath(filepath), 'r') as rfile:
            for line in rfile:
                # Lines come in pairs; a missing answer means a truncated
                # file, so stop cleanly instead of raising StopIteration.
                answer = next(rfile, None)
                if answer is None:
                    break
                body = str(json.loads(line)['body'])
                if len(body.split()) > max_question_word_count:
                    continue
                try:
                    parse_trees = self.parser.parse_text(body)
                    # Write the pair exactly once if any tree contains a
                    # question constituent (replaces the former
                    # raise-an-Exception-to-break-loops hack).
                    if any(subtree.label() in question_symbols
                           for tree in parse_trees
                           for subtree in tree.subtrees()):
                        wfile.write(line)
                        wfile.write(answer)
                except Exception:
                    # Best effort: skip text the parser cannot handle.
                    pass
        return newfilepath

    def perform_ner(self, filepath: str, entity_type: str = 'number',
                    update: bool = False) -> str:
        """Keep (question, answer) pairs whose ANSWER contains at least one
        entity of ``entity_type`` according to recognizers-text's English
        number model.  Returns the filtered file's path.
        """
        newfilepath = os.path.join(
            self.storage_path, entity_type + '_' + os.path.basename(filepath))
        if not update and os.path.exists(newfilepath):
            return newfilepath
        model = NumberRecognizer(Culture.English).get_number_model()
        with open(newfilepath, 'w') as wfile, \
                open(os.path.abspath(filepath), 'r') as rfile:
            for line in rfile:
                answer = next(rfile, None)
                if answer is None:
                    break
                text = str(json.loads(answer)['body'])
                try:
                    matches = model.parse(text)
                except Exception as e:
                    print(e)
                    continue
                if any(m.type_name == entity_type for m in matches):
                    wfile.write(line)
                    wfile.write(answer)
        return newfilepath

    def create_subreddit_json(self, filepath: str, subreddit: str,
                              update: bool = False) -> str:
        """Copy only the comment lines belonging to ``subreddit``
        (case-insensitive) into a new file and return its path.
        """
        subreddit = subreddit.lower()
        newfilepath = os.path.join(
            self.storage_path, subreddit + '_' + os.path.basename(filepath))
        if not update and os.path.exists(newfilepath):
            return newfilepath
        with open(newfilepath, 'w') as wfile, \
                open(os.path.abspath(filepath), 'r') as rfile:
            for line in rfile:
                try:
                    if json.loads(line)['subreddit'].lower() == subreddit:
                        wfile.write(line)
                except Exception as e:
                    # Malformed lines are reported and skipped.
                    print(e)
        return newfilepath

    def find_comment_pairs(self, filepath: str, min_score: int = 0,
                           update: bool = False) -> str:
        """For each comment scoring above ``min_score``, locate its
        highest-scoring parent in the same file and write the pair
        (parent line, then comment line) to a new file; return its path.
        """
        newfilepath = os.path.join(
            self.storage_path, 'pairs_' + os.path.basename(filepath))
        if not update and os.path.exists(newfilepath):
            return newfilepath
        filepath = os.path.abspath(filepath)
        with open(newfilepath, 'w') as wfile, open(filepath, 'r') as rfile:
            for line in rfile:
                try:
                    comment = json.loads(line)
                except Exception as e:
                    print(e)
                    continue
                if comment['score'] <= min_score:
                    continue
                # Reddit parent_ids carry a "t1_"/"t3_" prefix; strip it
                # before comparing against bare comment ids.
                parent_id = comment['parent_id'][3:]
                best = {'score': 0}
                # Second pass over the file per qualifying comment:
                # O(n^2) but memory-flat.  The comparison handle is now
                # closed each iteration (previously leaked).
                with open(filepath, 'r') as cmpfile:
                    for cmp_line in cmpfile:
                        try:
                            candidate = json.loads(cmp_line)
                        except Exception as e:
                            print(e)
                            continue
                        if (candidate['id'] == parent_id
                                and candidate['score'] > best['score']):
                            best = candidate
                if best['score'] > 0:
                    wfile.write(json.dumps(best) + '\n')
                    wfile.write(line)
        return newfilepath

    def perform_all(self, filepath: str, subreddits: list, update: bool):
        """Run the full pipeline for each subreddit, timing each stage."""
        for subreddit in subreddits:
            start = time.time()
            sub = self.create_subreddit_json(filepath, subreddit,
                                             update=update)
            print('Subreddit comments file created in {:.2f} seconds'.format(
                time.time() - start))
            start = time.time()
            pairs = self.find_comment_pairs(sub, update=update)
            print('Comment pairs file created in {:.2f} seconds'.format(
                time.time() - start))
            start = time.time()
            questions = self.create_question_json(pairs, update=update)
            print('Question-answer file created in {:.2f} seconds'.format(
                time.time() - start))
            start = time.time()
            # Bug fix: the update flag was previously dropped here, so a
            # stale entity file was never rebuilt even with update=True.
            self.perform_ner(questions, update=update)
            print('Entity file created in {:.2f} seconds'.format(
                time.time() - start))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 27 13:05:59 2018

@author: raja

Parse every line of text_alone.csv with a local CoreNLP server and save
the flattened constituency trees to pos_tree1.csv (one tree per row).
Requires a CoreNLP server listening on http://localhost:9010.
"""
import pandas as pd
# Import moved to the top of the file (was buried mid-script).
from nltk.parse.corenlp import CoreNLPParser

df = pd.read_csv('text_alone.csv', header=None, delimiter="\t")
parser = CoreNLPParser(url='http://localhost:9010')

i = len(df[0])  # countdown used only for coarse progress reporting
pp = []
for one_t in df[0]:
    # parse_text yields one tree per sentence; keep only the first,
    # matching the original behaviour.
    tree = next(parser.parse_text(one_t))
    # Flatten the pretty-printed tree onto a single line.
    pp.append(' '.join(str(tree).split()))
    i -= 1
    if i % 10 == 0:
        print(i)

pd.DataFrame(pp).to_csv('pos_tree1.csv', index=False, header=False)
import os
import nltk
from nltk.parse.corenlp import CoreNLPServer
from nltk.parse.corenlp import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser

# Location of the CoreNLP distribution and the two jars the server needs.
STANFORD = "stanford-corenlp-full-2018-10-05"
jars = (
    os.path.join(STANFORD, "stanford-corenlp-3.9.2.jar"),
    os.path.join(STANFORD, "stanford-corenlp-3.9.2-models.jar"),
)

text = "turn right and go up the stairs and stand at the top."
print(text)

# Launch a local CoreNLP server for the duration of the block, then print
# both the constituency parse and the dependency parse of the instruction.
with CoreNLPServer(*jars):
    constituency = CoreNLPParser()
    for tree in constituency.parse_text(text):
        print(tree)

    dependency = CoreNLPDependencyParser()
    for graph in dependency.raw_parse(text):
        print(graph)