def __init__(self):
     # initialize all parsing objects, functions and regex matchers
     self.words_to_numbers = WordsToNumbers()
     self.create_argument_pattern_dict()
     self.basic_name_matcher = re.compile(self.basic_name_pattern)
     self.web_jargon_matcher = re.compile(self.valid_web_jargon_pattern)
     self.url_matcher = re.compile(self.url_pattern)
     self.percent_matcher = re.compile(self.percentage_pattern)
     self.action_text_mappings = h.load_web_action_template(h.DEFAULT_ACTIONS_PATH, False)
     self.split_action_keys = [x.split("_") for x in self.action_text_mappings.keys()]
class TextProcessor():
    """
    Class for processing text commands received from the Web Jargon Chrome Extension and
    transforming those commands into action templates to send to the web text action mapper.
    """
    # store patterns for matching
    basic_name_pattern = "[a-zA-Z\s\.]+$"
    valid_web_jargon_pattern = "^[\s\w\d\>\<\;\,\{\}\[\]\-\_\+\=\!\@\#\$\%\^\&\*\|\'\.\:\(\)\\\/\"\?]+$"
    url_pattern = ".* ?(\.|dot){0,1} ?[a-z]{2,3}"
    percentage_pattern = "\d+ ?(%|percent)"

    # store pre-compiled matchers for fast matching
    basic_name_matcher = None
    web_jargon_matcher = None
    words_to_numbers = None
    url_matcher = None
    percent_matcher = None

    # initialize action-text mapping dict and patter_dict
    action_text_mappings = dict()
    PATTERN_DICT = dict()

    def __init__(self):
        # initialize all parsing objects, functions and regex matchers
        self.words_to_numbers = WordsToNumbers()
        self.create_argument_pattern_dict()
        self.basic_name_matcher = re.compile(self.basic_name_pattern)
        self.web_jargon_matcher = re.compile(self.valid_web_jargon_pattern)
        self.url_matcher = re.compile(self.url_pattern)
        self.percent_matcher = re.compile(self.percentage_pattern)
        self.action_text_mappings = h.load_web_action_template(h.DEFAULT_ACTIONS_PATH, False)
        self.split_action_keys = [x.split("_") for x in self.action_text_mappings.keys()]

    def create_argument_pattern_dict(self):
        # creates a dictionary from command argument token type to the callable match function to parse that argument
        self.PATTERN_DICT = {'ELEMENT_NAME': self.match_web_jargon, 'NUM_PAGES': self.words_to_numbers.parse,
                             'PERCENT': self.percentage, 'TAB_INDEX': self.tab_index,
                             'TAB_NAME': self.basic_names, 'URL': self.url, 'FORM_NAME': self.basic_names,
                             'EXCERPT': self.match_web_jargon, 'BUTTON_NAME': self.basic_names, 'DOMAIN_NAME': self.url,
                             'PAGE_NUM': self.words_to_numbers.parse, 'ARTIST': self.match_web_jargon,
                             'ALBUM': self.match_web_jargon, 'SONG': self.match_web_jargon}

    def basic_names(self, text):
        # matches to the basic names pattern in this class
        return extract_match(text, self.basic_name_matcher)

    def match_web_jargon(self, text):
        # matches to the general web jargon pattern in this class
        return extract_match(text, self.web_jargon_matcher)

    def valid_web_jargon(self, text):
        """
        Text is valid web jargon if it is good English of type str or unicode that is non-empty.
        :param text: the web jargon request
        :return: whether the input text is valid web jargon aka good English, no weird characters
        """
        return h.is_text_type(text) and len(text) > 0 and len(self.web_jargon_matcher.match(text).group()) > 0

    def process_web_action_request(self, text, curr_url):
        """
        Parses the provided text into web text actions that will be converted into
        web actions by the web text to action mapper. The order will be maintained.
        :param text: the input command text
        :param curr_url: the url of the current web page
        :return: the action request response, which will be empty or None if in error
        """
        web_action_request = None
        if self.valid_web_jargon(text) and h.is_text_type(curr_url) and len(curr_url) > 0:
            # extract action request from the current command and add to web action token list
            words = text.split(" ")
            words = [x for x in words if len(x) > 0]
            curr_request = self.extract_action_request(text, words, curr_url)
            if curr_request is not None:
                web_action_request = curr_request
        else:
            h.log_to_console(["request error: ", text])

        return web_action_request

    def extract_action_request(self, text, words, url):
        """
        Figure out the web actions that exist in the provided sentence using
        the given words as well as action command templates.
        :param text: the text said by the user
        :param words: the words of the sentence
        :param url: the url of the current web page
        :return: the web action token and arguments
        """
        curr_text = text
        command_text = ''

        # extract necessary part of sentence
        # only need command part of text
        for word in words:
            end_index = curr_text.index(word) + len(word)
            command_text += curr_text[:end_index]
            curr_text = curr_text[end_index:]

        # try to use templates to determine desired actions
        action_request = self.template_action_interpreter(command_text, words, url)

        # check if request is received
        if action_request is None or len(action_request) == 0 or h.CMD not in action_request.keys():
            print "error interpreting request"

        return action_request

    def template_action_interpreter(self, command_text, command_words, command_url):
        """
        This method will not always work. multiple instances of the same string may be detected
        in matching and may throw off the interpreter.
        :param command_text: the command text for the current action request
        :param command_words: the command words for the current action request
        :param command_url: the url of the command given used for context determination
        :return: the current action request response
        """

        # store lowercase of all strings and filter out quotes
        command_words = [x.lower() for x in command_words if x != '``' and x != '\'\'']

        # store lowercase, parens removed, stripped version of command text input
        command_text = command_text.lower().strip().lstrip("\"").lstrip('``').lstrip('\'\'')\
            .rstrip('\'\'').rstrip("\"").rstrip('``').strip()

        # clean up command url and get command context
        command_url = command_url.strip()
        command_context, context_type = h.determine_url_context(command_url)

        # get possible action mappings
        possible_action_text_mapping_keys = h.get_possible_action_text_mapping_keys(command_context,
                                                                                    self.action_text_mappings.keys())

        # store matches list
        matches = []
        has_exact_match = False
        # try to find match for command in templates
        for action_key in possible_action_text_mapping_keys:
            if not has_exact_match:
                for u_map in self.action_text_mappings[action_key]:
                    indices = []
                    curr_command_text = command_text
                    curr_command_words = [x for x in command_words]
                    # track the words found in the command words list
                    for part in u_map[h.PARTS]:
                        # check if part of the utterance is in the command
                        if part in curr_command_text:
                            part_start = command_text.index(part)
                            part_end = part_start + len(part)
                            indices.append((part_start, part_end))
                            # replace that part of string with underscore to signify removal
                            curr_command_text = curr_command_text.replace(part, '')
                            # remove this part from the word list (if not in list, problem but neglect)
                            part_split = part.split(" ")
                            for p in part_split:
                                if p in curr_command_words:
                                    curr_command_words.remove(p)

                    # store match if parts are in command
                    if len(indices) == len(u_map[h.PARTS]):

                        # store indices where args will be extracted from in string
                        arg_sections = h.extract_arg_sections(command_text, indices)

                        # do smart argument parsing use regex, parse trees, etc.
                        args = u_map[h.CMD_ARGS_DICT].copy()
                        if len(arg_sections) > 0:
                            for arg_type in u_map[h.CMD_ARGS_DICT]:
                                # extract argument using argument type
                                parsed_arg = self.match_arg(arg_type, curr_command_words, arg_sections)
                                if (type(parsed_arg) == int and parsed_arg > 0)\
                                        or (type(parsed_arg) == list
                                            or h.is_text_type(parsed_arg) and len(parsed_arg) > 0):
                                    args[arg_type] = parsed_arg
                        matches.append((action_key, " ".join(u_map[h.PARTS]), args, min(indices[:][0])))

        curr_action_request = dict()
        # select the earliest and/or longest command match for the current action request
        if len(matches) > 0:
            longest_phrase = 0
            earliest_pos = 0
            earliest_index = 0
            ctr = 0
            for match in matches:
                # get length of parts string that matched command
                mlen = len(match[1])
                # get start pos of command match
                start_pos = match[3]

                # look for longer phrase
                if mlen > longest_phrase:
                    longest_phrase = mlen
                    # take longer phrase (still same starting location)
                    if start_pos == earliest_pos:
                        earliest_pos = start_pos
                        earliest_index = ctr

                # look for same length phrase with earlier command match
                if start_pos < earliest_pos or (start_pos == earliest_pos and mlen == longest_phrase):
                    earliest_pos = start_pos
                    earliest_index = ctr
                ctr += 1

            # set command and args from action text mappings
            curr_action_request[h.CMD] = matches[earliest_index][0]
            curr_action_request[h.CMD_ARGS_DICT] = matches[earliest_index][2]
            curr_action_request[h.CONTEXT_TYPE] = context_type

            # handle music context boolean setting for music actions
            if command_context == h.MUSIC_CONTEXT:
                if "spotify" in command_url:
                    curr_action_request[h.CMD_ARGS_DICT][IS_SPOTIFY] = 'true'
                else:
                    curr_action_request[h.CMD_ARGS_DICT][IS_SPOTIFY] = 'false'

        return curr_action_request

    def tab_index(self, words):
        """
        Convert the words to a number index
        :param words:
        :return:
        """
        result = self.words_to_numbers.parse(words)
        if result < 0:
            result = self.get_index(words.split(" "))
        return result

    def percentage(self, words):
        """
        Try to match the given words string to a regular expression for percentages
        or use the number parser for long-form words.
        :param words: the word string that might contain a percentage-like number
        :return: the first match for a percentage in the string
        """
        parsed_arg = ''
        # try to match the percentage pattern
        match = self.percent_matcher.match(words)
        # check if match is valid
        if match is not None and len(match.group()) > 0:
            # extract match
            parsed_arg = match.group()
            # remove percent and strip off whitespace
            parsed_arg = parsed_arg.rstrip("%").rstrip("percent").strip()
        # check if nothing was found to match
        if len(parsed_arg) == 0:
            # try to use number parser to extract percentage
            result = self.words_to_numbers.parse(words)
            if result >= 0:
                parsed_arg = result
        # try to convert the parsed number into an integer, or fall back to the default value if any error occurs
        try:
            parsed_arg = int(parsed_arg)
        except:
            parsed_arg = 25
        return parsed_arg

    @staticmethod
    def get_index(words):
        """
        Returns the number of an English number index (indicating element position) found in the provided list of words.
        :param words: the list of words to find the English number index in
        :return: the number version of the found index
        """
        result = -1
        for word in words:
            if word in NUM_TO_INT.keys():
                result = NUM_TO_INT[word]
                break
        return result

    def url(self, words):
        # try to fix words and parse out a URL
        words = words.replace(' dot ', '.')
        words = words.replace('dot ', '.')
        words = words.replace(' dot', '.')
        words = words.replace('dot', '.')
        words = words.replace(' w w w ', 'www')
        words = words.replace('w w w ', 'www')
        words = words.replace(' w w w', 'www')
        words = words.replace('w w w', 'www')
        return extract_match(words, self.url_matcher)

    def match_arg(self, orig_arg_type, command_words, arg_sections):
        """
        Tries to find the given arg type in the list of argument sections,
        using the provided command words as backup evidence in decision making.
        :param orig_arg_type: the type of argument to search for as addressed by the global pattern dictionary
        in this class
        :param command_words: the words of the command to match to
        :param arg_sections: the already known argument sections in the command
        :return: the parsed argument from the given command and data
        """
        arg_sections = [x.strip() for x in arg_sections]
        parsed_arg = ''
        # may accept multiple argument types, so treat them independently
        if "|" in orig_arg_type:
            arg_types = orig_arg_type.split("|")
        else:
            # otherwise, just have one argument type to look for
            arg_types = [orig_arg_type]

        # run search for pattern matches to argument types in the command text
        for arg_type in arg_types:
            if len(command_words) > 0 and len(arg_sections) > 0 and arg_type in self.PATTERN_DICT.keys():
                # extract the proper pattern
                pattern = self.PATTERN_DICT[arg_type]
                # The pattern may be a function call, strings mean regex patterns are given
                if not h.is_text_type(pattern):
                    # match using a matching function that is callable
                    valid_match = False
                    for arg_section in arg_sections:
                        match = pattern(arg_section)
                        valid_match = (type(match) == int and match > 0) or (type(match) != int and match is not None)
                        if valid_match:
                            parsed_arg = match
                            break
                    if valid_match:
                        break
                else:
                    # compile a regex pattern on the fly (not really used in practice but always an option)
                    pat = re.compile(pattern)

                    # try to match to words first using regex
                    for word in command_words:
                        match = pat.match(word)
                        if match is not None and len(match.group()) > 0:
                            parsed_arg = match.group()
                            break

                    # otherwise, try to match to argument phrase sections
                    for arg_section in arg_sections:
                        match = pat.match(arg_section)
                        if match is not None and len(match.group()) > 0:
                            parsed_arg = match.group()
                            break
        return parsed_arg