def get_f_contain_name_without_last(self):
    """Score how well the name parts other than the last one match the e-mail prefix.

    ``self.person_name`` is split on single spaces; the final part is treated
    as the last name and the remaining parts are scored against the text of
    ``self.email_addr`` that precedes the first '@'.

    Each remaining part found verbatim in the prefix contributes its full
    length; each part that is not found contributes 1 if its first character
    still occurs in what is left of the prefix.

    Returns:
        float: the accumulated score divided by
        ``len(del_a_from_b(last_name, prefix)) + 1``.
    """
    # NOTE(review): del_a_from_b is defined outside this block; presumably it
    # returns its second argument with the first argument removed - confirm.
    parts = self.person_name.split(' ')
    surname = parts[-1]

    # Longest parts are matched first.
    given_parts = parts[:-1]
    given_parts.sort(key=len, reverse=True)

    local_part = self.email_addr[:self.email_addr.find('@')]

    # Match against the prefix with '.', '_' and '-' separators stripped.
    cleaned = local_part
    for separator in ('.', '_', '-'):
        cleaned = cleaned.replace(separator, '')
    cleaned = del_a_from_b(surname, cleaned)

    score = 0
    unmatched = []
    for part in given_parts:
        if part in cleaned:
            # Fully contained part: consume it and credit its whole length.
            cleaned = del_a_from_b(part, cleaned)
            score += len(part)
        else:
            unmatched.append(part)

    # Parts that were not fully contained still earn one point when their
    # initial appears in the remaining prefix.
    score += sum(1 for part in unmatched if part and part[0] in cleaned)

    # Normalise by the length of the raw prefix without the last name (+1).
    return float(score) / (float(len(del_a_from_b(surname, local_part))) + 1)
def get_f_contain_name_without_last(self):
    """Return a normalised measure of how much of the e-mail prefix is
    explained by the person's name parts, excluding the last name part.

    The name (``self.person_name``) is split on ' '. The last token is
    removed from the prefix up front; every other token is then looked up in
    the prefix (longest token first). A token found as a substring adds its
    length to the score; a token not found adds 1 when its first character is
    present in the remaining prefix.

    Returns:
        float: score / (len(del_a_from_b(last_name, raw_prefix)) + 1), where
        raw_prefix is ``self.email_addr`` up to the first '@'.
    """
    # NOTE(review): del_a_from_b lives outside this block; it is assumed to
    # return the second string with the first one removed - confirm.
    tokens = self.person_name.split(' ')
    family_name = tokens[-1]
    # Sort by length, longest first, so long tokens take part first.
    remaining = sorted(tokens[:-1], key=len, reverse=True)

    raw_prefix = self.email_addr[:self.email_addr.find('@')]
    stripped = raw_prefix.replace('.', '').replace('_', '').replace('-', '')
    stripped = del_a_from_b(family_name, stripped)

    matched_chars = 0
    # Iterate over a snapshot; matched entries are blanked in `remaining`
    # rather than removed, so positions stay stable.
    for token in list(remaining):
        if token not in stripped:
            continue
        stripped = del_a_from_b(token, stripped)
        matched_chars += len(token)
        remaining[remaining.index(token)] = ''

    # Tokens that were not fully contained: check their initial only.
    initial_hits = len([t for t in remaining if t and t[0] in stripped])

    # Normalisation.
    denominator = float(len(del_a_from_b(family_name, raw_prefix))) + 1
    return float(matched_chars + initial_hits) / denominator
def get_f_first_char_all_in_addr(self):
    """Return the fraction of the e-mail prefix accounted for by name initials.

    ``self.person_name`` is split on single spaces and the first character of
    every non-empty part is looked up in the e-mail prefix (the text of
    ``self.email_addr`` before the first '@'). Each hit consumes one
    occurrence of that character, so a single prefix character cannot be
    credited to two different name parts.

    Returns:
        float: number of matched initials divided by the length of the
        original (unconsumed) prefix; 0.0 when the prefix is empty.
    """
    # partition() keeps the whole string when there is no '@'; slicing with
    # find() would silently drop the last character in that case (find -> -1).
    email_prefix = self.email_addr.partition('@')[0]
    prefix_len = len(email_prefix)
    if not prefix_len:
        # Nothing to match against; also avoids dividing by zero.
        return 0.0

    remaining = email_prefix
    first_char_in_num = 0
    for name_part in self.person_name.split(' '):
        if not name_part:
            # Consecutive spaces produce empty parts, which have no initial.
            continue
        initial = name_part[0]
        if initial in remaining:
            first_char_in_num += 1
            # Strings are immutable, so the shortened prefix must be
            # re-bound for the consumption to take effect.
            remaining = remaining.replace(initial, '', 1)
    return float(first_char_in_num) / float(prefix_len)
def get_binary_relationship_list(self):
    """Collect the binary grounding strings for the nodes in ``self.node_mln_list``.

    Every unordered pair of nodes with different ``node_name`` values is
    examined once, and a grounding string (built by the first node's
    ``grounding_string_binary``) is appended for each relation that holds:

    * ``same_addr`` - equal domain and equal prefix.
    * ``same_domain_with_invalid`` - domains equal or one contained in the
      other, prefixes differ, and the second node's
      ``prefix_is_invalid_keyword()[0]`` is truthy.
    * ``a_contain_prefix_b`` - prefixes differ, one contains the other, and
      the first character of what ``del_a_from_b`` leaves of the longer
      prefix is in the longer node's ``first_char_list``.
    * ``same_prefix`` - equal prefix, different domain, and the first node's
      ``prefix_is_invalid_keyword()[0]`` is falsy.

    In addition, ``addr_repeat_over_3`` (node paired with itself) is appended
    for a node whose ``addr_repeat_time`` is greater than 2.

    Returns:
        list: the grounding strings in the order they were produced.
    """
    nodes = self.node_mln_list
    groundings = []
    for position, node in enumerate(nodes):
        # Only nodes after `node` are paired with it, so each pair is seen once.
        for other in nodes[position + 1:]:
            if node.node_name == other.node_name:
                continue

            same_domain = node.domain == other.domain
            same_prefix = node.prefix == other.prefix

            if same_domain and same_prefix:
                groundings.append(
                    node.grounding_string_binary('same_addr', other.node_name))

            related_domains = (same_domain
                               or node.domain in other.domain
                               or other.domain in node.domain)
            if (related_domains and not same_prefix
                    and other.prefix_is_invalid_keyword()[0]):
                groundings.append(
                    node.grounding_string_binary('same_domain_with_invalid',
                                                 other.node_name))

            if not same_prefix and (other.prefix in node.prefix
                                    or node.prefix in other.prefix):
                # Work out which node carries the containing (longer) prefix.
                if node.prefix in other.prefix:
                    longer, shorter = other, node
                else:
                    longer, shorter = node, other
                leftover = del_a_from_b(shorter.prefix, longer.prefix)
                if leftover[0] in longer.first_char_list:
                    groundings.append(
                        node.grounding_string_binary('a_contain_prefix_b',
                                                     other.node_name))

            if (same_prefix and not same_domain
                    and not node.prefix_is_invalid_keyword()[0]):
                groundings.append(
                    node.grounding_string_binary('same_prefix', other.node_name))

        # NOTE(review): this check depends only on `node`, so it is evaluated
        # once per node, after its pairings - confirm against the original
        # layout.
        if node.addr_repeat_time > 2:
            groundings.append(
                node.grounding_string_binary('addr_repeat_over_3', node.node_name))
    return groundings
def get_binary_relationship_list(self):
    """Return the grounding strings describing pairwise node relations.

    Walks every pair (i, j) with i < j in ``self.node_mln_list``, skipping
    pairs whose ``node_name`` values are equal, and records a grounding
    string (via the left node's ``grounding_string_binary``) for each of the
    relations ``same_addr``, ``same_domain_with_invalid``,
    ``a_contain_prefix_b`` and ``same_prefix`` that applies to the pair.
    A node whose ``addr_repeat_time`` exceeds 2 additionally records
    ``addr_repeat_over_3`` against its own name.

    Returns:
        list: grounding strings in production order.
    """
    result = []
    all_nodes = self.node_mln_list
    total = len(all_nodes)

    def record(relation, source, target_name):
        # Every relation is emitted through the source node's formatter.
        result.append(source.grounding_string_binary(relation, target_name))

    for i in range(total):
        left = all_nodes[i]
        for j in range(i + 1, total):
            right = all_nodes[j]
            if left.node_name == right.node_name:
                continue

            prefixes_equal = left.prefix == right.prefix
            domains_equal = left.domain == right.domain

            # Identical address: same domain and same prefix.
            if domains_equal and prefixes_equal:
                record('same_addr', left, right.node_name)

            # Equal or nested domains, different prefixes, and the right
            # node's prefix is flagged by prefix_is_invalid_keyword().
            domains_overlap = (domains_equal
                               or left.domain in right.domain
                               or right.domain in left.domain)
            if domains_overlap and not prefixes_equal:
                if right.prefix_is_invalid_keyword()[0]:
                    record('same_domain_with_invalid', left, right.node_name)

            # One prefix strictly contains the other.
            if not prefixes_equal:
                if right.prefix in left.prefix or left.prefix in right.prefix:
                    big_node, small_node = left, right
                    if left.prefix in right.prefix:
                        big_node, small_node = right, left
                    remainder = del_a_from_b(small_node.prefix, big_node.prefix)
                    if remainder[0] in big_node.first_char_list:
                        record('a_contain_prefix_b', left, right.node_name)

            # Same prefix on different domains, left prefix not flagged.
            if prefixes_equal and not domains_equal:
                if not left.prefix_is_invalid_keyword()[0]:
                    record('same_prefix', left, right.node_name)

        # NOTE(review): evaluated once per node because it only reads `left`;
        # confirm this matches the original nesting level.
        if left.addr_repeat_time > 2:
            record('addr_repeat_over_3', left, left.node_name)

    return result