Python DataGenie Examples

Programming Language: Python

Namespace/Package Name: nl4dv.datagenie

Class/Type: DataGenie

Examples at hotexamples.com: 3

Python DataGenie - 3 examples found. These are the top-rated real-world Python examples of nl4dv.datagenie.DataGenie, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

DataGenie(2)

set_alias_map(1)

set_attribute_datatype(1)

set_data(1)

set_ignore_words(1)

set_label_attribute(1)

set_reserve_words(1)

Example #1

Show file

File: __init__.py Project: MarcelRuoff-KIT/nl4dv

class NL4DV:
    """
    Class exposed to users to interact with the package. Exposes modules in the package via
    public methods.

    An instance stores the user-supplied inputs, the intermediate/final outputs of the most
    recent ``analyze_query`` call, and one instance of each internal helper (DataGenie,
    QueryGenie, AttributeGenie, TaskGenie, VisGenie). Every helper receives this object in
    its constructor.
    """

    def __init__(self,
                 data_url=None,
                 alias_url=None,
                 alias_map=None,
                 label_attribute=None,
                 ignore_words=None,
                 reserve_words=None,
                 dependency_parser_config=None,
                 verbose=False):
        """
        Store the inputs, reset the outputs and create the internal helper instances.

        :param data_url: stored as ``self.data_url`` and echoed back as ``'dataset'`` in the
            ``analyze_query`` output.
        :param alias_url: stored as ``self.alias_url``.
        :param alias_map: stored as ``self.alias_map``.
        :param label_attribute: stored as ``self.label_attribute``.
        :param ignore_words: list of words handed to the query cleaner; defaults to a fresh
            empty list.
        :param reserve_words: list of words handed to the query cleaner; defaults to a fresh
            empty list.
        :param dependency_parser_config: passed to ``set_dependency_parser``; anything that
            is not a dict leaves the parser unset.
        :param verbose: flag passed to ``helpers.cond_print`` for progress messages.
        """
        # inputs
        self.data_url = data_url
        self.alias_url = alias_url
        self.alias_map = alias_map
        self.label_attribute = label_attribute
        # ``None`` sentinels instead of ``list()`` defaults: a default list is created once
        # at definition time and would be shared (and mutated) across all instances.
        self.ignore_words = [] if ignore_words is None else ignore_words
        self.reserve_words = [] if reserve_words is None else reserve_words
        self.verbose = verbose

        # outputs
        self.execution_durations = dict()
        self.query_raw = None
        self.query_processed = ""
        self.query_tokens = list()
        self.query_ngrams = dict()
        self.extracted_vis_type = None
        self.extracted_vis_token = None
        self.extracted_tasks = OrderedDict()
        self.extracted_attributes = OrderedDict()
        self.vis_list = None
        self.dependencies = list()

        # Load constants: thresholds, mappings, scores
        # NOTE(review): these are references to the objects in ``constants``, not copies, so
        # set_thresholds / set_importance_scores appear to mutate the shared module-level
        # dicts - confirm whether that is intended.
        self.vis_keyword_map = constants.vis_keyword_map
        self.task_keyword_map = constants.task_keyword_map
        self.match_scores = constants.match_scores
        self.match_thresholds = constants.match_thresholds

        # Others
        self.dialog = False
        self.debug = False

        # initialize porter stemmer instance
        self.porter_stemmer_instance = PorterStemmer()

        # Set the dependency parser
        self.dependency_parser = None
        self.dependency_parser_instance = None
        self.set_dependency_parser(dependency_parser_config)

        # Internal Class Instances (each one receives this NL4DV object)
        self.data_genie_instance = DataGenie(self)
        self.query_genie_instance = QueryGenie(self)
        self.attribute_genie_instance = AttributeGenie(self)
        self.task_genie_instance = TaskGenie(self)
        self.vis_genie_instance = VisGenie(self)

    # returns a VegaLite object of the best (1st) visualization after analyzing the query.
    def render_vis(self, query_raw):
        # type: (str) -> VegaLite
        """
        Analyze ``query_raw`` and wrap the first entry of the resulting ``visList``.

        :param query_raw: the query text, forwarded unchanged to ``analyze_query``.
        :return: ``VegaLite`` built from the first visualization's ``'vlSpec'``; an empty
            ``VegaLite({})`` (after printing a notice) when no visualization was produced.
        """
        response = self.analyze_query(query_raw)
        # Truthiness also covers a ``None`` vis list, which ``len()`` would choke on.
        if not response['visList']:
            print("No best Viz; please try again.")
            return VegaLite({})
        return VegaLite(response['visList'][0]['vlSpec'])

    # ToDo:- Discuss support for non-ascii characters? Fallback from unicode to ascii good enough?
    # ToDo:- Discuss ERROR Handling
    # ToDo:- Utilities to perform unit conversion (eg. seconds > minutes). Problem: Tedious to infer base unit from data. - LATER
    def analyze_query(self, query_raw, dialog=False, debug=False):
        # type: (str, bool, bool) -> dict
        """
        Run the full pipeline on a query: clean it, extract attributes, the visualization
        type and tasks, then build the list of visualizations.

        :param query_raw: the query text as typed by the user.
        :param dialog: when falsy, the previously extracted vis type/token, tasks,
            attributes and vis list are reset before processing.
        :param debug: when truthy, the complete output dict is returned; otherwise the keys
            listed in ``constants.keys_to_delete_in_output`` are removed first.
        :return: dict with the keys ``status``, ``debug``, ``query_raw``, ``query``,
            ``dataset``, ``visList``, ``attributeMap``, ``taskMap``, ``followUpQuery`` and
            ``contextObj`` (possibly pruned, see ``debug``).
        """
        self.execution_durations = dict()
        self.dialog = dialog
        self.debug = debug

        # If not a follow-up query, reset the output variables.
        if not dialog:
            self.extracted_vis_type = None
            self.extracted_vis_token = None
            self.extracted_tasks = OrderedDict()
            self.extracted_attributes = OrderedDict()
            self.vis_list = None

        # CLEAN AND PROCESS QUERY
        self.query_raw = query_raw
        helpers.cond_print("Raw Query: " + self.query_raw, self.verbose)
        st = time.time()
        query_genie = self.query_genie_instance
        self.query_processed = query_genie.process_query(self.query_raw)
        self.query_tokens = query_genie.clean_query_and_get_query_tokens(
            self.query_processed, self.reserve_words, self.ignore_words)
        self.query_ngrams = query_genie.get_query_ngrams(
            ' '.join(self.query_tokens))
        self.dependencies = query_genie.create_dependency_tree(
            self.query_processed)
        helpers.cond_print("Processed Query: " + self.query_processed,
                           self.verbose)
        self.execution_durations['clean_query'] = time.time() - st

        # DETECT EXPLICIT AND IMPLICIT ATTRIBUTES
        st = time.time()
        attribute_genie = self.attribute_genie_instance
        self.extracted_attributes = attribute_genie.extract_attributes(
            self.query_ngrams)
        helpers.cond_print(
            "Final Extracted Attributes: " +
            str(list(self.extracted_attributes.keys())), self.verbose)
        self.execution_durations['extract_attributes'] = time.time() - st

        # DETECT EXPLICIT VISUALIZATION UTTERANCES
        st = time.time()
        self.extracted_vis_type, self.extracted_vis_token = (
            self.vis_genie_instance.extract_vis_type(self.query_ngrams))
        self.execution_durations['extract_vis_type'] = time.time() - st

        # DETECT IMPLICIT AND EXPLICIT TASKS
        st = time.time()
        task_genie = self.task_genie_instance
        task_map = task_genie.extract_explicit_tasks_from_dependencies(
            self.dependencies)

        # Filters from Domain Values
        task_map = task_genie.extract_explicit_tasks_from_domain_value(
            task_map)

        # At this stage, which attributes are encodeable?
        encodeable_attributes = attribute_genie.get_encodeable_attributes()

        # INFER tasks based on (encodeable) attribute Datatypes
        task_map = task_genie.extract_implicit_tasks_from_attributes(
            task_map, encodeable_attributes)

        # From the generated TaskMap, ensure that the task "keys" are NOT EMPTY LISTS
        self.extracted_tasks = task_genie.filter_empty_tasks(task_map)
        self.execution_durations['extract_tasks'] = time.time() - st

        # RECOMMEND VISUALIZATIONS FROM ATTRIBUTES, TASKS, and VISUALIZATIONS
        st = time.time()

        # Final list of encodeable attributes in the VIS
        final_encodeable_attributes = (
            attribute_genie.update_encodeable_attributes_based_on_tasks())

        self.vis_list = self.vis_genie_instance.get_vis_list(
            attribute_list=final_encodeable_attributes)
        self.execution_durations['get_vis_list'] = time.time() - st
        # 'total' is the sum of the per-stage timings recorded above.
        self.execution_durations['total'] = sum(
            self.execution_durations.values())

        # Prepare output
        output = {
            # Truthiness instead of len(): an empty or missing vis list both mean FAILURE.
            'status': 'SUCCESS' if self.vis_list else 'FAILURE',
            'debug': {
                'execution_durations': self.execution_durations
            },
            'query_raw': self.query_raw,
            'query': self.query_processed,
            'dataset': self.data_url,
            'visList': self.vis_list,
            'attributeMap': self.extracted_attributes,
            'taskMap': self.extracted_tasks,
            'followUpQuery': self.dialog,
            'contextObj': None
        }

        if debug:
            return output
        return helpers.delete_keys_from_dict(
            output, keys=constants.keys_to_delete_in_output)

    # Update the attribute datatypes that were not correctly detected by NL4DV
    def set_attribute_datatype(self, attr_type_obj):
        """Delegate to ``DataGenie.set_attribute_datatype`` and return its result."""
        return self.data_genie_instance.set_attribute_datatype(
            attr_type_obj=attr_type_obj)

    # Set Label attribute for the dataset, i.e. one that defines what the dataset is about.
    # e.g. "Correlate horsepower and MPG for sports car models" should NOT apply an explicit attribute for models since there are two explicit attributes already present.
    def set_label_attribute(self, label_attribute):
        """Delegate to ``DataGenie.set_label_attribute`` and return its result."""
        return self.data_genie_instance.set_label_attribute(
            label_attribute=label_attribute)

    # WORDS that should be IGNORED in the query, i.e. NOT lead to the detection of attributes and tasks
    # `Movie` in movies dataset
    # `Car` in cars dataset
    def set_ignore_words(self, ignore_words):
        """Delegate to ``DataGenie.set_ignore_words`` and return its result."""
        return self.data_genie_instance.set_ignore_words(
            ignore_words=ignore_words)

    # Custom STOPWORDS that should NOT be removed from the query, as they might be present in the domain.
    # e.g. `A` in grades dataset
    def set_reserve_words(self, reserve_words):
        """Delegate to ``DataGenie.set_reserve_words`` and return its result."""
        return self.data_genie_instance.set_reserve_words(
            reserve_words=reserve_words)

    # Sets the AliasMap
    def set_alias_map(self, alias_map=None, alias_url=None):
        """Delegate to ``DataGenie.set_alias_map`` and return its result."""
        return self.data_genie_instance.set_alias_map(alias_map=alias_map,
                                                      alias_url=alias_url)

    # Sets the Dataset
    def set_data(self, data_url=None):
        """Delegate to ``DataGenie.set_data`` and return its result."""
        return self.data_genie_instance.set_data(data_url=data_url)

    # Sets the String Matching, Domain Word Limit, ... Thresholds
    def set_thresholds(self, thresholds):
        """
        Override entries of ``self.match_thresholds``.

        Only keys that already exist in ``self.match_thresholds`` and whose new value is a
        ``float`` or ``int`` are applied; everything else is silently skipped.

        :param thresholds: dict mapping threshold name to its new numeric value.
        :return: always ``True``.
        """
        for name, value in thresholds.items():
            if name in self.match_thresholds and isinstance(value, (float, int)):
                self.match_thresholds[name] = value
        return True

    # Sets the Scoring Weights for the way attributes / tasks and visualizations are detected.
    def set_importance_scores(self, scores):
        """
        Override individual entries of the nested ``self.match_scores`` dict.

        ``scores`` maps a domain to a dict of ``{setting: float}``. A value is applied only
        when the domain and the setting already exist in ``self.match_scores`` and the new
        value is a ``float``; everything else is silently skipped.

        :param scores: dict of dicts with the scores to override.
        :return: always ``True``.
        """
        for domain, domain_scores in scores.items():
            if domain not in self.match_scores or not isinstance(
                    domain_scores, dict):
                continue
            current = self.match_scores[domain]
            for setting, value in domain_scores.items():
                if setting in current and isinstance(value, float):
                    # Assign to the individual setting. Assigning to
                    # ``self.match_scores[domain]`` would replace the whole per-domain
                    # dict with a float and break the next membership test.
                    current[setting] = value

        return True

    # Get the dataset metadata
    def get_metadata(self):
        """Return ``data_attribute_map`` of the internal DataGenie instance."""
        return self.data_genie_instance.data_attribute_map

    # Create a dependency parser instance
    def set_dependency_parser(self, config):
        """
        Create the dependency parser described by ``config``.

        Nothing happens unless ``config`` is a dict. Otherwise ``config["name"]`` is stored
        in ``self.dependency_parser`` and selects the parser:

        * ``"spacy"``: ``spacy.load(config["model"])``
        * ``"corenlp"``: ``StanfordDependencyParser`` using ``config["model"]``; the
          ``config["model"]`` and ``config["parser"]`` paths are prepended to the
          ``CLASSPATH`` environment variable if not already contained in it.
        * ``"corenlp-server"``: ``CoreNLPDependencyParser(url=config["url"])``

        Any other name leaves ``self.dependency_parser_instance`` untouched.

        :param config: dict with a ``"name"`` key plus the parser-specific keys above.
        :raises KeyError: if a key required by the selected parser is missing.
        """
        if not isinstance(config, dict):
            return

        parser_name = config["name"]
        helpers.cond_print("Dependency Parser: " + parser_name, self.verbose)
        self.dependency_parser = parser_name

        if parser_name == "spacy":
            # Sets the model and returns the Spacy NLP instance. Example ways from the Spacy docs:
            #   spacy.load("en")  # shortcut link
            #   spacy.load("en_core_web_sm")  # package
            #   spacy.load("/path/to/en")  # unicode path
            #   spacy.load(Path("/path/to/en"))  # pathlib Path
            self.dependency_parser_instance = spacy.load(config["model"])

        elif parser_name == "corenlp":
            # Make sure CLASSPATH exists so it can be searched and extended below.
            os.environ.setdefault('CLASSPATH', "")

            cpath = config["model"] + os.pathsep + config["parser"]
            if cpath not in os.environ['CLASSPATH']:
                os.environ['CLASSPATH'] = (cpath + os.pathsep +
                                           os.environ['CLASSPATH'])

            # TODO:- DEPRECATED
            self.dependency_parser_instance = StanfordDependencyParser(
                path_to_models_jar=config["model"], encoding='utf8')

        elif parser_name == "corenlp-server":
            # Requires the CoreNLPServer running in the background at the below URL (generally http://localhost:9000)
            # Start server by running the following command in the JARs directory.
            # `java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 30000`
            self.dependency_parser_instance = CoreNLPDependencyParser(
                url=config["url"])

Example #2

Show file

File: __init__.py Project: MarcelRuoff-KIT/nl4dv

    def __init__(self,
                 data_url=None,
                 alias_url=None,
                 alias_map=None,
                 label_attribute=None,
                 ignore_words=None,
                 reserve_words=None,
                 dependency_parser_config=None,
                 verbose=False):
        """
        Store the inputs, reset the outputs and create the internal helper instances.

        :param data_url: stored as ``self.data_url``.
        :param alias_url: stored as ``self.alias_url``.
        :param alias_map: stored as ``self.alias_map``.
        :param label_attribute: stored as ``self.label_attribute``.
        :param ignore_words: stored as ``self.ignore_words``; defaults to a fresh empty list.
        :param reserve_words: stored as ``self.reserve_words``; defaults to a fresh empty
            list.
        :param dependency_parser_config: passed to ``self.set_dependency_parser``.
        :param verbose: stored as ``self.verbose``.
        """
        # inputs
        self.data_url = data_url
        self.alias_url = alias_url
        self.alias_map = alias_map
        self.label_attribute = label_attribute
        # ``None`` sentinels instead of ``list()`` defaults: a default list is created once
        # at definition time and would be shared (and mutated) across all instances.
        self.ignore_words = [] if ignore_words is None else ignore_words
        self.reserve_words = [] if reserve_words is None else reserve_words
        self.verbose = verbose

        # outputs
        self.execution_durations = dict()
        self.query_raw = None
        self.query_processed = ""
        self.query_tokens = list()
        self.query_ngrams = dict()
        self.extracted_vis_type = None
        self.extracted_vis_token = None
        self.extracted_tasks = OrderedDict()
        self.extracted_attributes = OrderedDict()
        self.vis_list = None
        self.dependencies = list()

        # Load constants: thresholds, mappings, scores
        # (references to the objects in ``constants``, not copies)
        self.vis_keyword_map = constants.vis_keyword_map
        self.task_keyword_map = constants.task_keyword_map
        self.match_scores = constants.match_scores
        self.match_thresholds = constants.match_thresholds

        # Others
        self.dialog = False
        self.debug = False

        # initialize porter stemmer instance
        self.porter_stemmer_instance = PorterStemmer()

        # Set the dependency parser
        self.dependency_parser = None
        self.dependency_parser_instance = None
        self.set_dependency_parser(dependency_parser_config)

        # Internal Class Instances (each one receives this object)
        self.data_genie_instance = DataGenie(self)
        self.query_genie_instance = QueryGenie(self)
        self.attribute_genie_instance = AttributeGenie(self)
        self.task_genie_instance = TaskGenie(self)
        self.vis_genie_instance = VisGenie(self)

Example #3

Show file

File: __init__.py Project: pnz16101/SmartEMR-Local

    def __init__(self,
                 data_url=None,
                 data_value=None,
                 alias_url=None,
                 alias_value=None,
                 label_attribute=None,
                 ignore_words=None,
                 reserve_words=None,
                 dependency_parser_config=None,
                 thresholds=None,
                 importance_scores=None,
                 attribute_datatype=None,
                 verbose=False,
                 debug=False):
        """
        Store the inputs, create the internal helper instances and apply optional overrides.

        :param data_url: stored as ``self.data_url``.
        :param data_value: stored as ``self.data_value``.
        :param alias_url: stored as ``self.alias_url``.
        :param alias_value: stored as ``self.alias_value``.
        :param label_attribute: stored as ``self.label_attribute``.
        :param ignore_words: stored as ``self.ignore_words``; defaults to a fresh empty list.
        :param reserve_words: stored as ``self.reserve_words``; defaults to a fresh empty
            list.
        :param dependency_parser_config: when not ``None``, passed to
            ``self.set_dependency_parser``.
        :param thresholds: when not ``None``, passed to ``self.set_thresholds``.
        :param importance_scores: when not ``None``, passed to
            ``self.set_importance_scores``.
        :param attribute_datatype: when not ``None``, passed to
            ``self.set_attribute_datatype``.
        :param verbose: stored as ``self.verbose``.
        :param debug: stored as ``self.debug``.
        """
        # inputs
        self.data_url = data_url
        self.data_value = data_value
        self.alias_url = alias_url
        self.alias_value = alias_value
        self.label_attribute = label_attribute
        # ``None`` sentinels instead of ``list()`` defaults: a default list is created once
        # at definition time and would be shared (and mutated) across all instances.
        self.ignore_words = [] if ignore_words is None else ignore_words
        self.reserve_words = [] if reserve_words is None else reserve_words
        self.dependency_parser_config = dependency_parser_config
        self.verbose = verbose
        self.debug = debug

        # Load constants: thresholds, mappings, scores
        # (references to the objects in ``constants``, not copies)
        self.vis_keyword_map = constants.vis_keyword_map
        self.task_keyword_map = constants.task_keyword_map
        self.match_scores = constants.match_scores
        self.match_thresholds = constants.match_thresholds

        # Initialize intermediate/output variables
        self.execution_durations = dict()
        self.query_raw = None
        self.query_processed = ""
        self.query_for_task_inference = ""
        self.query_tokens = list()
        self.query_ngrams = dict()
        self.extracted_vis_type = None
        self.extracted_vis_token = None
        self.extracted_tasks = OrderedDict()
        self.extracted_attributes = OrderedDict()
        self.vis_list = None
        self.dependencies = list()
        self.dependency_parser = None
        self.dependency_parser_instance = None

        # Others
        self.dialog = False

        # initialize porter stemmer instance
        self.porter_stemmer_instance = PorterStemmer()

        # Initialize internal Class Instances (each one receives this object)
        self.data_genie_instance = DataGenie(self)
        self.query_genie_instance = QueryGenie(self)
        self.attribute_genie_instance = AttributeGenie(self)
        self.task_genie_instance = TaskGenie(self)
        self.vis_genie_instance = VisGenie(self)

        # The optional overrides below run only after the helper instances exist.

        # Set the dependency parser if config is not None
        if self.dependency_parser_config is not None:
            self.set_dependency_parser(dependency_parser_config)

        # Set the thresholds, e.g., string matching
        if thresholds is not None:
            self.set_thresholds(thresholds=thresholds)

        # Set the importance scores, i.e., weights assigned to different ways in which attributes, tasks, and vis are detected/inferred.
        if importance_scores is not None:
            self.set_importance_scores(scores=importance_scores)

        # Override the attribute datatypes
        if attribute_datatype is not None:
            self.set_attribute_datatype(attr_type_obj=attribute_datatype)