def get_num_features_for_given_tuple(self, data_tuple):
    """Return the indices of all features that fire for one (history, tag) pair.

    Args:
        data_tuple: A pair ``(history, word_tag)``. ``history`` is indexed as
            ``(tag_minus2, tag_minus, sentence, word_index)``; ``sentence`` is
            a whitespace-separated string and ``word_index`` selects the
            current word inside it.

    Returns:
        A list with the value stored in ``self.features`` for every candidate
        feature key that is present there, in template order. Keys missing
        from ``self.features`` are skipped.
    """
    history, word_tag = data_tuple[0], data_tuple[1]
    tag_minus2 = history[0]
    tag_minus = history[1]
    word_index = history[3]
    word = history[2].split()[word_index]

    # Candidate keys are collected in the order their indices must appear in
    # the result; the lookup against self.features happens once at the end.
    candidate_keys = [
        (word, word_tag),
        (tag_minus, word_tag),
        ((tag_minus2, tag_minus), word_tag),
    ]

    if self.mode in ('Improved', 'Comp'):
        candidate_keys += [
            (Utilities.get_suffix(word, 2), word_tag),
            (Utilities.get_suffix(word, 3), word_tag),
            (Utilities.get_prefix(word, 2), word_tag),
        ]
        # These three templates used to be indexed without a membership
        # check and raised KeyError when absent; they now go through the
        # same guarded lookup as every other template.
        if Utilities.check_number(word) and word_tag == 'CD':
            candidate_keys.append(('number', 'number'))
        if Utilities.check_capital(word, word_index) and word_tag == 'NNP':
            candidate_keys.append(('capital', 'capital'))
        if Utilities.check_bar(word) and word_tag == 'JJ':
            candidate_keys.append(('bar', 'bar'))

    # Features only for improved
    if self.mode == 'Improved':
        candidate_keys += [
            (Utilities.get_suffix(word, 1), word_tag),
            (Utilities.get_suffix(word, 4), word_tag),
            (Utilities.get_prefix(word, 3), word_tag),
            (Utilities.get_prefix(word, 4), word_tag),
            (Utilities.get_prefix(word, 1), word_tag),
            # NOTE(review): the tag-only template is assumed to belong to
            # the Improved-only group -- confirm against the feature builder.
            (word_tag, ''),
        ]

    features = self.features
    return [features[key] for key in candidate_keys if key in features]
def add_general_prefix_to_dict(self, word, word_tag):
    """Count one occurrence of the word's two-character prefix with a tag.

    Updates the nested mapping ``feature_prefix_dict[prefix][tag] -> count``
    and increments ``self.num_features`` whenever a (prefix, tag) pair is
    recorded for the first time.

    Args:
        word: The word whose prefix (``Utilities.get_prefix(word, 2)``) is
            counted.
        word_tag: The tag observed together with ``word``.
    """
    # NOTE(review): feature_prefix_dict is resolved as a bare (non-self)
    # name here -- confirm it is meant to be a module-level mapping.
    two_char_prefix = Utilities.get_prefix(word, 2)

    # Prefix never seen: start its tag table and count a new feature.
    if two_char_prefix not in feature_prefix_dict:
        feature_prefix_dict[two_char_prefix] = {word_tag: 1}
        self.num_features += 1
        return

    tag_counts = feature_prefix_dict[two_char_prefix]

    # Known prefix, new tag: also a new feature.
    if word_tag not in tag_counts:
        tag_counts[word_tag] = 1
        self.num_features += 1
        return

    # Known (prefix, tag) pair: only the occurrence count grows.
    tag_counts[word_tag] += 1
def calculate_all_dot_f_for_tuple(self):
    """Precompute the active feature indices for every (history, tag) pair.

    For each entry of ``self.history_tag_tuples`` and each tag in
    ``self.tags`` this fills two caches:

    * ``self.calculated_features[(history, tag)]`` -- list of feature indices
      found in ``self.features`` for that pair, in template order;
    * ``self.tuples_per_feature[index]`` -- list of the ``(history, tag)``
      pairs in which that feature index is active.

    ``history`` is indexed as ``(tag_minus2, tag_minus, sentence,
    word_index)``. The affix and number/capital/bar templates are tried
    regardless of ``self.mode``; templates missing from ``self.features`` are
    skipped.
    """
    features = self.features

    for data_tuple in self.history_tag_tuples:
        history = data_tuple[0]
        tag_minus2 = history[0]
        tag_minus = history[1]
        word_index = history[3]
        word = history[2].split()[word_index]

        # Everything below depends only on the history, so it is computed
        # once per history instead of once per candidate tag.
        affixes = (
            Utilities.get_suffix(word, 3),
            Utilities.get_suffix(word, 2),
            Utilities.get_prefix(word, 2),
        )
        is_number = Utilities.check_number(word)
        is_capital = Utilities.check_capital(word, word_index)
        has_bar = Utilities.check_bar(word)

        for word_tag in self.tags:
            candidate_keys = [
                (word, word_tag),
                (tag_minus, word_tag),
                ((tag_minus2, tag_minus), word_tag),
            ]
            candidate_keys.extend((affix, word_tag) for affix in affixes)
            # These templates are looked up with the same membership guard
            # as the rest, so a missing entry no longer raises KeyError.
            if is_number and word_tag == 'CD':
                candidate_keys.append(('number', 'number'))
            if is_capital and word_tag == 'NNP':
                candidate_keys.append(('capital', 'capital'))
            if has_bar and word_tag == 'JJ':
                candidate_keys.append(('bar', 'bar'))

            temp_arr = [features[key] for key in candidate_keys if key in features]

            pair = (history, word_tag)
            self.calculated_features[pair] = temp_arr
            for num_feature in temp_arr:
                self.tuples_per_feature.setdefault(num_feature, []).append(pair)