def check_if_np_contains_number(self, np):
    """Report whether the leading token of a noun phrase is a number word.

    The leading token is taken first from a split on spaces and, when the
    phrase contains a dash, from a split on dashes as well.

    Args:
        np: the noun phrase string to inspect.

    Returns:
        1 if ``math_modifiers.is_word_number`` accepts either leading token,
        otherwise 0.
    """
    leading_word = np.split(' ')[0]
    if math_modifiers.is_word_number(leading_word) == True:
        return 1
    if '-' not in np:
        return 0
    leading_dash_piece = np.split('-')[0]
    if math_modifiers.is_word_number(leading_dash_piece) == True:
        return 1
    return 0
def write_features(self):
    """Write one labelled feature line per noun-phrase pair.

    For every data index in ``[data_start_index, data_end_index)`` that is
    not listed in ``file_path_refrence.problematic_indexes``, the question is
    loaded with ``read_data`` and every ordered pair of distinct noun phrases
    is written to ``self.feature_file_path`` as::

        <label> 0:<f0> 1:<f1> ...

    where the label text comes from ``class_finder.check_for_label`` and the
    feature values from ``logLinear_Feature_finder``.
    """
    # The class-finder mode depends only on the configured mode, so it is
    # decided once instead of once per pair.
    class_finder_mode = 'joint'
    if self.binary_multiclass_mode == 0:
        class_finder_mode = 'pair_relevant'

    # The context manager closes the file even if a question fails midway.
    with open(self.feature_file_path, 'w') as output_file:
        self.question_prop = entity_properties()
        self.question_prop.find_word_list(
            self.file_path_refrence.synonym_list_path,
            self.file_path_refrence.num_of_dimentions)
        for i in range(self.data_start_index, self.data_end_index):
            print("i is:")
            print(i)
            if i in self.file_path_refrence.problematic_indexes:
                continue
            self.read_data(i)
            noun_phrases = self.question_prop.question_strings_np
            for np1 in noun_phrases:
                for np2 in noun_phrases:
                    if np1 == np2:
                        continue
                    # Skip pairs where one phrase is the other preceded by a
                    # number (e.g. "100 shoes" vs "shoes").  The prefix is
                    # measured from the matched suffix, not from the first
                    # occurrence of the shorter phrase inside the longer one.
                    if np1.endswith(np2) and math_modifiers.is_word_number(
                            np1[:len(np1) - len(np2) - 1]):
                        continue
                    if np2.endswith(np1) and math_modifiers.is_word_number(
                            np2[:len(np2) - len(np1) - 1]):
                        continue

                    # The label is written first, then the indexed features.
                    class_finder_obj = class_finder(
                        class_finder_mode, self.question_prop)
                    output_line = class_finder_obj.check_for_label(np1, np2)
                    output_file.write(output_line)

                    loglinear_features = logLinear_Feature_finder(
                        self.question_prop)
                    # Features are computed for the same (np1, np2) pair that
                    # was labelled; passing np1 twice would make the features
                    # independent of np2.
                    features_list = loglinear_features.appropriate_feature_finder_list(
                        [], np1, np2, output_line,
                        self.binary_multiclass_mode, self.test_train_mode)
                    # A separate name keeps the outer data index ``i`` intact.
                    for feature_index, feature_value in enumerate(features_list):
                        output_file.write(
                            ' ' + str(feature_index) + ':' + str(feature_value))
                    output_file.write('\n')
            self.question_prop.flush()
def check_if_np_contains_number_anywhere(self, np):
    """Report whether any token of a noun phrase is a number word.

    Tokens are first taken from a split on spaces.  If none of them is a
    number and the phrase contains a dash, the dash-separated pieces are
    tested as well.

    Args:
        np: the noun phrase string to inspect.

    Returns:
        1 if ``math_modifiers.is_word_number`` accepts any token, else 0.
    """
    for token in np.split(' '):
        if math_modifiers.is_word_number(token) == True:
            return 1
    if '-' in np:
        # Each dash-separated piece is tested on its own: is_word_number is
        # called with a single word everywhere else, so handing it the whole
        # list from split('-') could never test the individual pieces.
        for piece in np.split('-'):
            if math_modifiers.is_word_number(piece) == True:
                return 1
    return 0
def check_for_math_modifier_or_number_inside_np(self, np):
    """Tell whether a noun phrase starts with a known word or a number.

    The leading token from a split on spaces is tested first; when the
    phrase contains a dash, the leading token from a split on dashes is
    tested too.

    Args:
        np: the noun phrase string to inspect.

    Returns:
        The string '1' if ``self.check_for_known_word`` returns '1' for a
        leading token or ``math_modifiers.is_word_number`` accepts it,
        otherwise the string '0'.
    """
    leading_tokens = [np.split(' ')[0]]
    if '-' in np:
        leading_tokens.append(np.split('-')[0])
    for token in leading_tokens:
        # Known-word lookup runs before the number test, as a short circuit.
        if (self.check_for_known_word(token) == '1'
                or math_modifiers.is_word_number(token) == True):
            return '1'
    return '0'
def check_if_np_contains_number_anywhere(self, np):
    """Find a number word inside a multi-word noun phrase.

    Single-token phrases (no space) are rejected immediately.  Otherwise
    every space-separated token after the first is tested, and if none
    matches and the phrase contains a dash, the dash-separated pieces are
    tested too.

    Args:
        np: the noun phrase string to inspect.

    Returns:
        ``(1, token)`` for the first token accepted by
        ``math_modifiers.is_word_number``, or ``(0, -1)`` when there is none.
    """
    parts = np.split(' ')
    if len(parts) == 1:
        return (0, -1)
    # The first token is deliberately not examined here.
    for part in parts[1:]:
        if math_modifiers.is_word_number(part) == True:
            return (1, part)
    if '-' in np:
        # Test each dash-separated piece individually and report the piece
        # that matched; is_word_number takes a single word, not the list
        # produced by split('-').
        for dash_part in np.split('-'):
            if math_modifiers.is_word_number(dash_part) == True:
                return (1, dash_part)
    return (0, -1)
def write_features(self):
    """Write 'str_disjoint' labelled feature lines for noun-phrase pairs.

    For every data index in ``[data_start_index, data_end_index)`` that is
    not listed in ``file_path_refrence.problematic_indexes``, the question is
    loaded with ``read_data``.  Each ordered pair of distinct noun phrases
    is written to ``self.feature_file_path`` as::

        <label> 0:<f0> 1:<f1> ...

    Phrases found in ``question_prop.pronoun_list`` or made of six or more
    space-separated tokens are left out.  The features are the basic
    features of each phrase followed by the useful-np and disjoint features
    of the pair.
    """
    # The context manager closes the file even if a question fails midway.
    with open(self.feature_file_path, 'w') as output_file:
        self.question_prop = entity_properties()
        self.question_prop.find_word_list(
            self.file_path_refrence.synonym_list_path,
            self.file_path_refrence.num_of_dimentions)
        for i in range(self.data_start_index, self.data_end_index):
            print("i is:")
            print(i)
            if i in self.file_path_refrence.problematic_indexes:
                continue
            self.read_data(i)
            for np1 in self.question_prop.question_strings_np:
                if (np1 in self.question_prop.pronoun_list
                        or len(np1.split(' ')) >= 6):
                    continue
                for np2 in self.question_prop.question_strings_np:
                    if (np2 in self.question_prop.pronoun_list
                            or len(np2.split(' ')) >= 6):
                        continue
                    if np1 == np2:
                        continue
                    # Skip pairs where one phrase is the other preceded by a
                    # number.  The prefix is measured from the matched
                    # suffix, not from the first occurrence of the shorter
                    # phrase inside the longer one.
                    if np1.endswith(np2) and math_modifiers.is_word_number(
                            np1[:len(np1) - len(np2) - 1]):
                        continue
                    if np2.endswith(np1) and math_modifiers.is_word_number(
                            np2[:len(np2) - len(np1) - 1]):
                        continue

                    class_finder_obj = class_finder(
                        'str_disjoint', self.question_prop)
                    output_line = class_finder_obj.check_for_label(np1, np2)
                    output_file.write(output_line)

                    feature_list = []
                    basic_features = basic_faeture_finder(self.question_prop)
                    feature_list = basic_features.appropriate_feature_finder_list(
                        feature_list, np1)
                    feature_list = basic_features.appropriate_feature_finder_list(
                        feature_list, np2)
                    useful_np_features = useful_np_feature_finder(
                        self.question_prop)
                    feature_list = useful_np_features.appropriate_feature_finder_list(
                        feature_list, np1, np2)
                    disjoint_features = disjoint_feature_finder(
                        self.question_prop)
                    feature_list = disjoint_features.appropriate_feature_finder_list(
                        feature_list, np1, np2)

                    # A separate name keeps the outer data index ``i`` intact.
                    for feature_index, feature_value in enumerate(feature_list):
                        output_file.write(
                            ' ' + str(feature_index) + ':' + str(feature_value))
                    output_file.write('\n')
            self.question_prop.flush()
def calc_merge_count_feature(self, feature_list, feature_name_list, chain1, chain2):
    """Append four count-based features describing two noun-phrase chains.

    A phrase contributes to its chain's count when it is found in
    ``self.noun_phrase_with_counts``.  It contributes once more when its
    first token is a number word and the remainder of the phrase is found
    there.

    Args:
        feature_list: list that receives four values, each 1 or -1.
        feature_name_list: list that receives the four matching names.
        chain1: iterable of noun-phrase strings.
        chain2: iterable of noun-phrase strings.

    Returns:
        The tuple ``(feature_list, feature_name_list)``; both lists are
        extended in place.
    """
    def count_phrases_with_counts(chain):
        # Number of hits for one chain, using the rule in the docstring.
        total = 0
        for phrase in chain:
            if phrase in self.noun_phrase_with_counts:
                total += 1
            if ' ' in phrase:
                # Split once so the remainder carries no leading space;
                # slicing from the space itself kept that space and the
                # lookup below could not match a normally stored phrase.
                first_word, rest_noun = phrase.split(' ', 1)
                if (math_modifiers.is_word_number(first_word) == True
                        and rest_noun in self.noun_phrase_with_counts):
                    total += 1
        return total

    chain1_num_of_counts = count_phrases_with_counts(chain1)
    chain2_num_of_counts = count_phrases_with_counts(chain2)

    checks = (
        (chain1_num_of_counts > 0 and chain2_num_of_counts > 0,
         "chain 1 has counts and chain 2 has counts"),
        (chain1_num_of_counts > 0 or chain2_num_of_counts > 0,
         "chain 1 has counts or chain 2 has counts"),
        (chain1_num_of_counts > 2 and chain2_num_of_counts > 2,
         "chain 1 and chain 2 has counts more than 2"),
        (chain1_num_of_counts > 2 or chain2_num_of_counts > 2,
         "chain 1 or chain 2 has counts more than 2"),
    )
    for holds, feature_name in checks:
        feature_list.append(1 if holds else -1)
        feature_name_list.append(feature_name)
    return (feature_list, feature_name_list)
def write_features(self):
    """Write 'eq_noRel' labelled feature lines for the relevant pairs.

    For every data index in ``[data_start_index, data_end_index)`` except the
    hard-coded skipped ones, the question is loaded with ``read_data``.  Each
    tab-separated pair in ``question_prop.relevant_pairs`` is written to
    ``self.feature_file_path`` as::

        <label> 0:<f0> 1:<f1> ...

    The features are the basic features of each phrase followed by the
    equivalence features of the pair.
    """
    # NOTE(review): presumably the indexes of questions known to break
    # processing -- confirm against the data set.
    skipped_indexes = (17, 30, 33, 70)

    # The context manager closes the file even if a question fails midway.
    with open(self.feature_file_path, 'w') as output_file:
        self.question_prop = entity_properties()
        for i in range(self.data_start_index, self.data_end_index):
            print("i is:")
            print(i)
            if i in skipped_indexes:
                continue
            self.read_data(i)
            for pair in self.question_prop.relevant_pairs:
                # Skip pairs that start with a non-relevant string.  The
                # test must `continue` the pair loop; a `continue` placed
                # inside a loop over the prefixes only advances that inner
                # loop and filters nothing.
                if any(pair.startswith(str_sample)
                       for str_sample in self.non_relevant_string):
                    continue
                parts = pair.split('\t')
                np1 = parts[0]
                np2 = parts[1]
                # Skip pairs where one phrase is the other preceded by a
                # number.  The prefix is measured from the matched suffix,
                # not from the first occurrence of the shorter phrase.
                if np1.endswith(np2) and math_modifiers.is_word_number(
                        np1[:len(np1) - len(np2) - 1]):
                    continue
                if np2.endswith(np1) and math_modifiers.is_word_number(
                        np2[:len(np2) - len(np1) - 1]):
                    continue

                # The label is written first, then the indexed features.
                class_finder_obj = class_finder('eq_noRel', self.question_prop)
                output_line = class_finder_obj.check_for_label(np1, np2)
                output_file.write(output_line)

                feature_list = []
                basic_features = basic_faeture_finder(self.question_prop)
                feature_list = basic_features.appropriate_feature_finder_list(
                    feature_list, np1)
                feature_list = basic_features.appropriate_feature_finder_list(
                    feature_list, np2)
                eq_features = equivalence_feature_finder(self.question_prop)
                feature_list = eq_features.appropriate_feature_finder_list(
                    feature_list, np1, np2)

                # A separate name keeps the outer data index ``i`` intact.
                for feature_index, feature_value in enumerate(feature_list):
                    output_file.write(
                        ' ' + str(feature_index) + ':' + str(feature_value))
                output_file.write('\n')
            self.question_prop.flush()
def find_number_after(self, whole_question, type_index):
    """Scan forward from ``type_index`` for the next number word.

    Words are delimited by the positions returned from
    ``self.find_next_space``.  Scanning stops at the first word accepted by
    ``math_modifiers.is_word_number`` or when the returned space position
    equals ``len(whole_question)``.

    Args:
        whole_question: the question text being scanned.
        type_index: position in ``whole_question`` to start scanning from.

    Returns:
        ``(word_count, start_index, words)`` where ``words`` lists every word
        examined (the number word last), ``word_count`` is how many there
        are and ``start_index`` is where the last one begins; or
        ``(-1, -1, [])`` when no number word was found.
    """
    found_number = False
    next_space = self.find_next_space(whole_question, type_index + 1)
    word_start = type_index
    examined_words = []
    while found_number == False:
        # The next word starts right after the previously located space.
        word_start = next_space + 1
        next_space = self.find_next_space(whole_question, next_space + 1)
        current_word = whole_question[word_start:next_space]
        examined_words.append(current_word)
        found_number = math_modifiers.is_word_number(current_word)
        if next_space == len(whole_question):
            break
    if found_number == False:
        return (-1, -1, [])
    return (len(examined_words), word_start, examined_words)
def find_number_before(self, whole_question, type_index):
    """Scan backward from ``type_index`` for the previous number word.

    Words are delimited by the positions returned from
    ``self.find_prev_space``.  Scanning stops at the first word accepted by
    ``math_modifiers.is_word_number`` or when the returned space position
    is 0.

    Args:
        whole_question: the question text being scanned.
        type_index: position in ``whole_question`` to start scanning from.

    Returns:
        ``(word_count, start_index, words)`` where ``words`` lists every word
        examined (the number word last), ``word_count`` is how many there
        are and ``start_index`` is where the last one begins; or
        ``(-1, -1, [])`` when no number word was found.
    """
    is_word_before_number = False
    space_index = self.find_prev_space(whole_question, type_index)
    beginning_of_the_word_index = 0
    word_list_inBetween = []
    while is_word_before_number == False:
        end_of_the_word_index = space_index
        space_index = self.find_prev_space(whole_question, space_index - 1)
        beginning_index = space_index + 1
        if space_index == 0:
            # NOTE(review): presumably find_prev_space returns 0 when it
            # reaches the start of the text, so the word starts at 0 and not
            # one past a space -- confirm against find_prev_space.
            beginning_index = 0
        beginning_of_the_word_index = beginning_index
        word = whole_question[beginning_index:end_of_the_word_index]
        word_list_inBetween.append(word)
        is_word_before_number = math_modifiers.is_word_number(word)
        if space_index == 0:
            break
    if is_word_before_number == False:
        return (-1, -1, [])
    # One count per examined word, matching find_number_after; incrementing
    # the counter twice per pass made it disagree with the word list.
    word_count = len(word_list_inBetween)
    return (word_count, beginning_of_the_word_index, word_list_inBetween)
def write_features(self):
    """Write noun-phrase pairs and their 'joint' feature lines.

    For every data index in ``[data_start_index, data_end_index)`` that is
    not listed in ``self.file_path_refrence.problematic_indexes``, the
    question is loaded with ``read_data``.  For each ordered pair of distinct
    noun phrases the file named by ``file_refrence().pair_visualize_data_file``
    receives:

    * a ``np1<TAB>np2`` line for every pair, and
    * for pairs that are not a number instantiation of one another, a
      ``<label> 0:<f0> 1:<f1> ...`` line built from the basic, useful-np,
      disjoint, subset and equivalence feature finders.

    ``self.feature_file_path`` is only created/truncated; nothing is written
    to it.
    """
    # NOTE(review): all output goes to the visualisation file, as before.
    # The feature file is still truncated to keep that side effect, but its
    # handle is closed right away instead of being overwritten and leaked.
    # Confirm whether the feature lines were meant to go to feature_file_path.
    open(self.feature_file_path, 'w').close()

    self.question_prop = entity_properties()
    self.question_prop.find_word_list(
        self.file_path_refrence.synonym_list_path,
        self.file_path_refrence.num_of_dimentions)
    file_path_refrence = file_refrence()

    # The context manager closes the file even if a question fails midway.
    with open(file_path_refrence.pair_visualize_data_file, 'w') as output_file:
        for i in range(self.data_start_index, self.data_end_index):
            print("i is:")
            print(i)
            if i in self.file_path_refrence.problematic_indexes:
                continue
            self.read_data(i)
            noun_phrases = self.question_prop.question_strings_np
            for np1 in noun_phrases:
                for np2 in noun_phrases:
                    if np1 == np2:
                        continue
                    # The pair is logged before any further filtering.
                    output_file.write(np1 + '\t' + np2 + '\n')

                    # Skip pairs where one phrase is a number instantiation
                    # of the other (e.g. "100 shoes" vs "shoes").  The prefix
                    # is measured from the matched suffix, not from the first
                    # occurrence of the shorter phrase.
                    if np1.endswith(np2) and math_modifiers.is_word_number(
                            np1[:len(np1) - len(np2) - 1]):
                        continue
                    if np2.endswith(np1) and math_modifiers.is_word_number(
                            np2[:len(np2) - len(np1) - 1]):
                        continue

                    # The label is written first, then the indexed features.
                    class_finder_obj = class_finder('joint', self.question_prop)
                    output_line = class_finder_obj.check_for_label(np1, np2)
                    output_file.write(output_line)

                    features_list = []
                    basic_features = basic_feature_finder_revised(
                        self.question_prop)
                    features_list = basic_features.appropriate_feature_finder_list(
                        features_list, np1, np2)
                    useful_np_features = useful_np_feature_finder_revised(
                        self.question_prop)
                    features_list = useful_np_features.appropriate_feature_finder_list(
                        features_list, np1, np2)
                    disjoint_features = disjoint_feature_finder_revised(
                        self.question_prop)
                    features_list = disjoint_features.appropriate_feature_finder_list(
                        features_list, np1, np2)
                    subset_features = subset_feature_finder_revised(
                        self.question_prop)
                    features_list = subset_features.appropriate_feature_finder_list(
                        features_list, np1, np2)
                    eq_features = equivalence_feature_finder_revised(
                        self.question_prop)
                    features_list = eq_features.appropriate_feature_finder_list(
                        features_list, np1, np2)

                    # A separate name keeps the outer data index ``i`` intact.
                    for feature_index, feature_value in enumerate(features_list):
                        output_file.write(
                            ' ' + str(feature_index) + ':' + str(feature_value))
                    output_file.write('\n')
            self.question_prop.flush()
if not line_parts[useful_indexes[j]].startswith('-'): count = count + 1 row_index.append(line_count) column_index.append(line_parts[useful_indexes[j]]) seen_nps.append(line_parts[useful_indexes[j]]) box_row.append(line_parts[useful_indexes[j]]) if len(box_row) != 0: true_table.append(box_row) # print 'here' num_list = [] word_list = [] for np in np_list: if len(np.split(' ')) == 1 and '-' not in np: if math_modifiers.is_word_number(np) == True: num_list.append(np) else: if np not in word_list: word_list.append(np) elif '-' in np: parts = np.split(' ') if math_modifiers.is_word_number(parts[0]): num_list.append(parts[0]) rest_noun = '' for jj in range(1, len(parts)): rest_noun = rest_noun + ' ' + parts[jj] rest_noun = rest_noun[1:] if rest_noun not in word_list: word_list.append(rest_noun) else: