def set_dependency_parser(self, config):
    """Configure the dependency parser backend from a config dict.

    Supported config["name"] values:
      * "spacy"          - loads the spaCy model named in config["model"].
      * "corenlp"        - local Stanford CoreNLP jars; needs config["model"]
                           and config["parser"] on the Java CLASSPATH.
      * "corenlp-server" - a running CoreNLP server at config["url"].

    Non-dict configs are ignored silently.
    """
    if not isinstance(config, dict):
        return

    helpers.cond_print("Dependency Parser: " + config["name"], self.verbose)
    parser_name = config["name"]
    self.dependency_parser = parser_name

    if parser_name == "spacy":
        # spacy.load accepts a shortcut link ("en"), a package name
        # ("en_core_web_sm"), or a filesystem/pathlib path to a model.
        self.dependency_parser_instance = spacy.load(config["model"])
    elif parser_name == "corenlp":
        # Make sure the model + parser jars are visible to the JVM via
        # CLASSPATH, prepending them only if not already present.
        if 'CLASSPATH' not in os.environ:
            os.environ['CLASSPATH'] = ""
        jar_classpath = config["model"] + os.pathsep + config["parser"]
        if jar_classpath not in os.environ['CLASSPATH']:
            os.environ['CLASSPATH'] = jar_classpath + os.pathsep + os.environ['CLASSPATH']
        # TODO:- DEPRECATED
        self.dependency_parser_instance = StanfordDependencyParser(
            path_to_models_jar=config["model"], encoding='utf8')
    elif parser_name == "corenlp-server":
        # Requires a CoreNLPServer already running at config["url"]
        # (typically https://localhost:9000). Start it from the JARs dir:
        # `java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 30000`
        self.dependency_parser_instance = CoreNLPDependencyParser(url=config["url"])
def set_attribute_datatype(self, attr_type_obj):
    """Override the detected datatype of one or more attributes.

    attr_type_obj maps attribute name -> desired datatype string. Each
    valid datatype is written into the attribute map and the dataset
    metadata is refreshed for that attribute. An invalid datatype prints
    the allowed choices and terminates the process with an error code.
    """
    allowed_types = constants.attribute_types.values()
    for attribute, data_type in attr_type_obj.items():
        if data_type not in allowed_types:
            helpers.cond_print(
                "Invalid Target DataType. Choose from " + str(constants.attribute_types.values()),
                debug=True)
            sys.exit(error_codes.BAD_INPUT_ATTRIBUTE_DATA_TYPE)
        self.data_attribute_map[attribute]['dataType'] = data_type
        self.populate_dataset_meta_for_attr(attribute, data_type)
def analyze_query(self, query_raw, dialog=False, debug=False):
    """Run the full NL-query analysis pipeline and return the result dict.

    Pipeline stages (each timed into self.execution_durations):
      1. clean/process the raw query into tokens, n-grams, dependencies
      2. extract explicit + implicit data attributes
      3. extract an explicit visualization type, if any
      4. extract explicit + implicit analytic tasks
      5. recommend a list of visualizations

    When `dialog` is False the previous query's extracted state is reset;
    when True this is treated as a follow-up query. When `debug` is False,
    internal keys are stripped from the output via
    helpers.delete_keys_from_dict.
    """
    self.execution_durations = {}
    self.dialog = dialog
    self.debug = debug

    # Fresh (non-follow-up) queries start from a clean slate.
    if not dialog:
        self.extracted_vis_type = None
        self.extracted_vis_token = None
        self.extracted_tasks = OrderedDict()
        self.extracted_attributes = OrderedDict()
        self.vis_list = None

    # --- CLEAN AND PROCESS QUERY ---
    self.query_raw = query_raw
    helpers.cond_print("Raw Query: " + self.query_raw, self.verbose)
    stage_start = time.time()
    query_genie = self.query_genie_instance
    self.query_processed = query_genie.process_query(self.query_raw)
    self.query_tokens = query_genie.clean_query_and_get_query_tokens(
        self.query_processed, self.reserve_words, self.ignore_words)
    self.query_ngrams = query_genie.get_query_ngrams(' '.join(self.query_tokens))
    self.dependencies = query_genie.create_dependency_tree(self.query_processed)
    helpers.cond_print("Processed Query: " + self.query_processed, self.verbose)
    self.execution_durations['clean_query'] = time.time() - stage_start

    # --- DETECT EXPLICIT AND IMPLICIT ATTRIBUTES ---
    stage_start = time.time()
    self.extracted_attributes = self.attribute_genie_instance.extract_attributes(
        self.query_ngrams)
    helpers.cond_print(
        "Final Extracted Attributes: " + str(list(self.extracted_attributes.keys())),
        self.verbose)
    self.execution_durations['extract_attributes'] = time.time() - stage_start

    # --- DETECT EXPLICIT VISUALIZATION UTTERANCES ---
    stage_start = time.time()
    self.extracted_vis_type, self.extracted_vis_token = \
        self.vis_genie_instance.extract_vis_type(self.query_ngrams)
    self.execution_durations['extract_vis_type'] = time.time() - stage_start

    # --- DETECT IMPLICIT AND EXPLICIT TASKS ---
    stage_start = time.time()
    tasks = self.task_genie_instance.extract_explicit_tasks_from_dependencies(
        self.dependencies)
    # Filter tasks using the dataset's domain values.
    tasks = self.task_genie_instance.extract_explicit_tasks_from_domain_value(tasks)
    # Infer additional tasks from the datatypes of the encodeable attributes.
    encodeable_attributes = self.attribute_genie_instance.get_encodeable_attributes()
    tasks = self.task_genie_instance.extract_implicit_tasks_from_attributes(
        tasks, encodeable_attributes)
    # Drop task keys whose value lists came back empty.
    self.extracted_tasks = self.task_genie_instance.filter_empty_tasks(tasks)
    self.execution_durations['extract_tasks'] = time.time() - stage_start

    # --- RECOMMEND VISUALIZATIONS FROM ATTRIBUTES, TASKS, AND VIS TYPE ---
    stage_start = time.time()
    final_encodeable_attributes = \
        self.attribute_genie_instance.update_encodeable_attributes_based_on_tasks()
    self.vis_list = self.vis_genie_instance.get_vis_list(
        attribute_list=final_encodeable_attributes)
    self.execution_durations['get_vis_list'] = time.time() - stage_start

    self.execution_durations['total'] = sum(self.execution_durations.values())

    # Assemble the response payload.
    output = {
        'status': 'SUCCESS' if len(self.vis_list) > 0 else 'FAILURE',
        'debug': {
            'execution_durations': self.execution_durations
        },
        'query_raw': self.query_raw,
        'query': self.query_processed,
        'dataset': self.data_url,
        'visList': self.vis_list,
        'attributeMap': self.extracted_attributes,
        'taskMap': self.extracted_tasks,
        'followUpQuery': self.dialog,
        'contextObj': None
    }
    if debug:
        return output
    return helpers.delete_keys_from_dict(output, keys=constants.keys_to_delete_in_output)