def get_name_using_pos_tagger(self, text):
    """
    Detect a person name in free text.

    First rejects text containing cardinals or interrogative words, then
    tries a set of "my name is ..." style templates in priority order and,
    as a last resort for short messages in an ongoing dialogue, falls back
    to words tagged as nouns or adjectives.

    Args:
        text (string): The text obtained from the user.
            Example: text = "my name is yash modi"

    Returns:
        tuple: (entity_value, original_text), e.g.
            ([{first_name: "yash", middle_name: None, last_name: "modi"}],
             ["yash modi"])
    """
    entity_value, original_text = [], []
    pos_tagger_object = POS()
    name_tokens = text.split()
    # Passing empty tokens to tag will cause IndexError
    tagged_names = pos_tagger_object.tag(name_tokens)

    # Bail out on questions and numeric content:
    # WR* = wh-adverbs, WP* = wh-pronouns, CD = cardinal numbers.
    is_question = [
        word[0] for word in tagged_names
        if word[1].startswith(('WR', 'WP', 'CD'))
    ]
    if is_question:
        return entity_value, original_text

    # Name-introduction templates, tried in priority order; the first
    # pattern that matches wins (same precedence as the original chain).
    name_templates = [
        re.compile(r"name\s+(?:is\s+)?([\w\s]+)"),
        re.compile(r"myself\s+([\w\s]+)"),
        re.compile(r"call\s+me\s+([\w\s]+)"),
        re.compile(r"i\s+am\s+([\w\s]+)"),
    ]
    for pattern in name_templates:
        matches = pattern.findall(text)
        if matches:
            entity_value, original_text = self.get_format_name(
                matches[0].split(), self.text)
            break
    else:
        # No template matched: for short messages mid-conversation, fall
        # back to tokens tagged as nouns/adjectives (NN*/JJ*).
        if len(name_tokens) < 4 and self.bot_message:
            pos_words = [
                word[0] for word in tagged_names
                if word[1].startswith(('NN', 'JJ'))
            ]
            if pos_words:
                entity_value, original_text = self.get_format_name(
                    pos_words, self.text)

    return entity_value, original_text
def get_name_using_pos_tagger(self, text):
    """
    Detect a person name in free text using POS tags.

    Rejects text containing cardinals or interrogative words, then — for
    short messages in an ongoing dialogue — collects the tokens tagged as
    nouns/adjectives (spaCy tags for European languages, Penn Treebank
    tags otherwise) and formats them as a name.

    Args:
        text (string): The text obtained from the user.
            Example: text = "my name is yash modi"

    Returns:
        tuple: (entity_value, original_text), e.g.
            ([{first_name: "yash", middle_name: None, last_name: "modi"}],
             ["yash modi"])
    """
    entity_value, original_text = [], []

    if self.language in EUROPEAN_LANGUAGES_SET:
        tagged_names = spacy_utils.tag(text=text.strip(), language=self.language)
    else:
        # Passing empty tokens to tag will cause IndexError
        tagged_names = POS().tag(text.split())

    # WR* = wh-adverbs, WP* = wh-pronouns, CD = cardinal numbers —
    # any of these means the message is a question / numeric, not a name.
    if any(word[1].startswith(('WR', 'WP', 'CD')) for word in tagged_names):
        return entity_value, original_text

    if len(tagged_names) < 4 and self.bot_message:
        # Tag prefixes that may carry a name, per tag scheme in use.
        if self.language in EUROPEAN_LANGUAGES_SET:
            name_tag_prefixes = ('NOUN', 'ADJ', 'PROPN')
        else:
            name_tag_prefixes = ('NN', 'JJ')
        pos_words = [
            word[0] for word in tagged_names
            if word[1].startswith(name_tag_prefixes)
        ]
        if pos_words:
            entity_value, original_text = self.get_format_name(
                pos_words, self.text)

    return entity_value, original_text
def get_pos_tagged_dict(docs):
    """
    Assign a POS tag to every token of every sentence in ``docs``.

    Args:
        docs (dict): Dict holding the tokenised sentences (and their
            labels) under SENTENCE_LIST; mutated in place.

    Returns:
        dict: The same ``docs`` dict with a CRF_POS_TAGS entry added —
        one list of tags per sentence, aligned with the tokens.

    Example:
        For city entity
        docs = {
            'labels': [['O', 'O', 'O', 'O', 'B', 'O', 'B'],
                       ['O', 'O', 'O', 'O', 'B']],
            'text_list': [['book', 'a', 'flight', 'from', 'Mumbai', 'to', 'Delhi'],
                          ['Book', 'a', 'flight', 'to', 'Pune']]}
        get_pos_tagged_dict(docs)
        >> {
            'labels': [['O', 'O', 'O', 'O', 'B', 'O', 'B'],
                       ['O', 'O', 'O', 'O', 'B']],
            'pos_tags': [['NN', 'DT', 'NN', 'IN', 'NNP', 'TO', 'VB'],
                         ['VB', 'DT', 'NN', 'TO', 'VB']],
            'text_list': [['book', 'a', 'flight', 'from', 'Mumbai', 'to', 'Delhi'],
                          ['Book', 'a', 'flight', 'to', 'Pune']]
        }
    """
    # Hoist the inner tagger lookup out of the loop; one tag list per sentence.
    tagger = POS().tagger
    docs[CRF_POS_TAGS] = [
        [tagged[1] for tagged in tagger.tag(sentence)]
        for sentence in docs[SENTENCE_LIST]
    ]
    return docs
def __init__(self):
    # CRFPP tagger instance; populated lazily (None until a model is selected)
    self.tagger = None
    # Filesystem path of the CRF model backing self.tagger; also set lazily
    self._model_path = None
    # POS tagger used to annotate tokens before they are fed to the CRF
    self.pos_tagger = POS()
class PredictCRF(object):
    """
    Wrapper around pre-trained CRFPP models that tags entities
    (currently city and date) in a bot/user message pair.
    """

    def __init__(self):
        # CRFPP tagger, selected lazily by initialize_files()
        self.tagger = None
        # Path of the CRF model backing self.tagger
        self._model_path = None
        # POS tagger used to annotate tokens before CRF tagging
        self.pos_tagger = POS()

    def get_model_output(self, entity_type, bot_message, user_message):
        """
        Run the full pipeline and return the final json list of tagged data.

        If the model can run, this loads the model for ``entity_type`` via
        initialize_files(), feeds the messages through add_data_to_tagger()
        and run_crf(), then converts the raw CRF output into the
        entity-specific json structure. Returns an empty list when CRFPP
        is not installed or the entity type is unknown.
        """
        output_list = []
        if not MODEL_RUN:
            ner_logger.debug('MODEL IS NOT RUNNING: CRFPP not installed')
            return output_list

        self.initialize_files(entity_type=entity_type)
        self.add_data_to_tagger(bot_message, user_message)
        crf_output = self.run_crf()

        # Dispatch to the entity-specific output generator.
        generators = {
            CITY_ENTITY_TYPE: generate_city_output,
            DATE_ENTITY_TYPE: generate_date_output,
        }
        generate = generators.get(entity_type)
        if generate is not None:
            output_list = generate(crf_data=crf_output)
            ner_logger.debug('NER MODEL OUTPUT: %s' % output_list)
        return output_list

    def initialize_files(self, entity_type):
        """
        Select (and lazily load) the CRF model for ``entity_type``.

        Loaded CRFPP taggers are cached in module-level globals, so
        repeated calls reuse the already-loaded model instead of reading
        it from disk again.

        Args:
            entity_type: type of entity (city or date)
        """
        global CITY_MODEL_OBJECT, DATE_MODEL_OBJECT
        if entity_type == CITY_ENTITY_TYPE:
            self._model_path = CITY_MODEL_PATH
            if not CITY_MODEL_OBJECT:
                CITY_MODEL_OBJECT = CRFPP.Tagger("-m %s -v 3 -n2" % self._model_path)
                ner_logger.debug('CITY CRF model loaded %s' % self._model_path)
            self.tagger = CITY_MODEL_OBJECT
        elif entity_type == DATE_ENTITY_TYPE:
            self._model_path = DATE_MODEL_PATH
            if not DATE_MODEL_OBJECT:
                DATE_MODEL_OBJECT = CRFPP.Tagger("-m %s -v 3 -n2" % self._model_path)
                ner_logger.debug('date CRF model loaded %s' % self._model_path)
            self.tagger = DATE_MODEL_OBJECT

    def add_data_to_tagger(self, bot_message, user_message):
        """
        Feed the conversation into the CRF tagger in its expected format.

        Both messages are tokenised and POS-tagged, then each token is
        added as a "<token> <pos_tag> <direction>" row, where direction
        marks the sender (outbound = bot, inbound = user).

        Args:
            bot_message: message from bot (may be None)
            user_message: message from user

        Example:
            bot_message = 'none'
            user_message = 'flights from delhi to goa'
        produces tagger rows such as:
            none NN o
            flights NNS i
            from VBP i
            delhi NN i
            to TO i
            goa VB i
        """
        if bot_message is None:
            bot_message = ''
        # (tokens, direction) per sender, added in bot-then-user order.
        message_groups = (
            (nltk_tokenizer.tokenize(bot_message), OUTBOUND),
            (nltk_tokenizer.tokenize(user_message), INBOUND),
        )
        for tokens, direction in message_groups:
            for token in self.pos_tagger.tag(tokens):
                self.tagger.add(
                    str(token[0]) + ' ' + str(token[1]) + ' ' + direction)

    def run_crf(self):
        """
        Execute the CRF on the data previously added to the tagger.

        Returns:
            list: [word, predicted_label] pairs, later consumed by the
            entity-specific output generators.
        """
        self.tagger.parse()
        tagged_rows = [
            [self.tagger.x(position, 0), self.tagger.y2(position)]
            for position in range(self.tagger.size())
        ]
        # Reset the tagger so the next call starts from a clean slate.
        self.tagger.clear()
        return tagged_rows