def get_features(self, token):
    """Build the feature list for a single token.

    Returns a list of string features: the lowercased token itself,
    symbol-regex tags, POS-dictionary features, orthographic features,
    prefixes/suffixes, and cluster features.  For tokens that look like
    Twitter symbols (usr/rt/ht/url) only the symbol tag is returned.
    """
    ltoken = token.lower()
    feature_list = [ltoken]

    # If it looks like it's a RT, HT, USR, or URL, don't add any
    # features but the symbol tag.
    pos = symbol_tag.tag_token(ltoken)
    if pos:
        feature_list.append('SYMBOL_REGX=' + str(pos))
        if pos in ['usr', 'rt', 'ht', 'url']:
            return ['SYMBOL_REGX=' + str(pos)]

    # Use the dictionaries to see what common tags exist.
    # .items() instead of .iteritems(): consistent with the
    # pos_set.items() call below and works on both Python 2 and 3.
    in_pos_dict = False
    for dict_name, dictionary in self.dictionaries.items():
        if ltoken in dictionary.token_pos_set:
            in_pos_dict = True
            # Record all POS tags the token has been seen with.
            pos_set = dictionary.token_pos_set[ltoken]
            for pos in pos_set:
                feature_list.append(dict_name + '=' + pos)
            # Record if it has only been seen with one tag.
            if len(pos_set) == 1:
                feature_list.append(dict_name + '_ONLY=' + pos)
            # Record the majority POS tag, but only when it is a real
            # majority: a lone tag with count > 1, or a top tag with
            # 1.5x the count of the runner-up.
            pos_l = sorted(((count, pos) for pos, count in pos_set.items()),
                           reverse=True)
            if len(pos_l) == 1 and pos_l[0][0] > 1:
                feature_list.append(dict_name + '_MAJORITY=' + pos_l[0][1])
            elif len(pos_l) > 1 and pos_l[0][0] > 1.5 * pos_l[1][0]:
                feature_list.append(dict_name + '_MAJORITY=' + pos_l[0][1])

    # Check if the token occurs in the new dictionaries (lexicons).
    for dictname, dictionary in self.occurences.items():
        if ltoken in dictionary.token_pos_set:
            feature_list.append(dictname)

    # Basic orthographic (regular-expression) features.
    # All caps and no symbols (single chars excluded).
    if len(token) > 1 and re.match('^[A-Z]*$', token):
        feature_list.append('ALL_CAPS')
    # Guard token[0]: an empty token would raise IndexError.
    if token and re.match('[A-Z]', token[0]):
        feature_list.append('IS_CAPITALIZED')
    # Token contains a digit anywhere.
    if re.match('.*[0-9].*', token):
        feature_list.append('IS_NUM')

    # New ortho features.
    if re.match(r'[0-9]', token):
        feature_list.append('SINGLEDIGIT')
    if re.match(r'[0-9][0-9]', token):
        feature_list.append('DOUBLEDIGIT')
    if re.match(r'.*-.*', token):
        feature_list.append('HASDASH')
    # Bug fix: the original class [.,;:?!-+\'"] contained the range !-+
    # (chars '!' through '+'), which unintentionally matched #$%&()* and
    # did NOT match a literal dash.  The dash is placed last here so it
    # is taken literally.
    if re.match(r'[.,;:?!+\'"-]', token):
        feature_list.append('PUNCTUATION')

    # Prefix/suffix features, only for words with 4 or more chars.
    if len(ltoken) >= 4:
        for i in range(1, 5):
            if i <= len(ltoken):
                feature_list.append('PREFIX=' + ltoken[:i])
        for i in range(1, 5):
            if i <= len(ltoken):
                feature_list.append('SUFFIX=' + ltoken[-1 * i:])

    # Add cluster features (RAW cluster ids).
    if self.cluster_dictionary:
        feature_list.extend(self.cluster_dictionary.get_clusters(token))

    # Cluster-similarity back-off for tokens unseen in the POS
    # dictionaries.  '<' is excluded — presumably a markup sentinel;
    # TODO(review): confirm against the tokenizer.
    if not in_pos_dict and token != '<':
        m, s = cluster_sim.get_best_match(token.lower())
        if m:
            feature_list.append('CLUST_' + m)
    return feature_list
def get_features(self, token, use_dicts=True, use_cap=True, use_num=True,
                 use_prefix='PREFIX=', use_suffix='SUFFIX=',
                 use_major_pos=True, diction_avail=False, new_maj=True,
                 all_caps=True, new_dictions=True, use_domain_transfer=False,
                 dt_label='TARG', use_symbol_tag=True, lim_suffix=True,
                 new_ortho=True, lim_tags=True):
    """Build the feature list for a single token.

    Every feature family can be toggled with a keyword flag; the
    defaults reproduce the standard configuration.  Returns a list of
    string features, or the domain-transfer-decorated version of it
    when use_domain_transfer is set.
    """
    ltoken = token.lower()
    feature_list = [ltoken]

    # If it looks like it's a RT, HT, USR, or URL, don't add any
    # features but the symbol tag (when lim_tags is on).
    if use_symbol_tag:
        pos = symbol_tag.tag_token(ltoken)
        if pos:
            feature_list.append('SYMBOL_REGX=' + str(pos))
            if lim_tags and pos in ['usr', 'rt', 'ht', 'url']:
                return ['SYMBOL_REGX=' + str(pos)]

    # Use the dictionaries to see what common tags exist.
    if use_dicts:
        dictionary_list = []
        # .items() instead of .iteritems(): consistent with the
        # pos_set.items() call below and works on both Python 2 and 3.
        for dict_name, dictionary in self.dictionaries.items():
            if ltoken in dictionary.token_pos_set:
                # Record all POS tags the token has been seen with.
                pos_set = dictionary.token_pos_set[ltoken]
                for pos in pos_set:
                    feature_list.append(dict_name + '=' + pos)
                # Record if it has only been seen with one tag.
                if len(pos_set) == 1:
                    feature_list.append(dict_name + '_ONLY=' + pos)
                # Record the majority POS tag.
                if use_major_pos:
                    if not new_maj:
                        # Old behaviour: trust the precomputed majority.
                        major = dictionary.token_pos_majority[ltoken]
                        feature_list.append(dict_name + '_MAJORITY=' + major)
                    else:
                        # Make sure the majority is a real majority: a
                        # lone tag with count > 1, or a top tag with
                        # 1.5x the count of the runner-up.
                        pos_l = sorted(
                            ((count, pos) for pos, count in pos_set.items()),
                            reverse=True)
                        if len(pos_l) == 1 and pos_l[0][0] > 1:
                            feature_list.append(
                                dict_name + '_MAJORITY=' + pos_l[0][1])
                        elif len(pos_l) > 1 and pos_l[0][0] > 1.5 * pos_l[1][0]:
                            feature_list.append(
                                dict_name + '_MAJORITY=' + pos_l[0][1])
                # Record that this dictionary found something.
                dictionary_list.append(dict_name)
        # Record which dictionaries the token is found in.
        if diction_avail:
            if not dictionary_list:
                feature_list.append('NOT_IN_DICTS')
            elif len(dictionary_list) > 1:
                feature_list.append('IN_MULTIPLE_DICTS')
            else:
                feature_list.append('ONLY_IN=' + dictionary_list[0])

    # Check if the token occurs in the new dictionaries (lexicons).
    if new_dictions:
        for dictname, dictionary in self.occurences.items():
            if ltoken in dictionary.token_pos_set:
                feature_list.append(dictname)

    # Basic orthographic (regular-expression) features.
    # All caps and no symbols (single chars excluded).
    if all_caps:
        if len(token) > 1 and re.match('^[A-Z]*$', token):
            feature_list.append('ALL_CAPS')
    # Guard token[0]: an empty token would raise IndexError.
    if token and re.match('[A-Z]', token[0]):
        feature_list.append('IS_CAPITALIZED')
    elif not new_ortho:
        feature_list.append('IS_LOWERCASE')
    # Token contains a digit anywhere.
    if re.match('.*[0-9].*', token):
        feature_list.append('IS_NUM')
    elif not new_ortho:
        feature_list.append('NOT_NUM')

    # New ortho features.
    if new_ortho:
        if re.match(r'[0-9]', token):
            feature_list.append('SINGLEDIGIT')
        if re.match(r'[0-9][0-9]', token):
            feature_list.append('DOUBLEDIGIT')
        if re.match(r'.*-.*', token):
            feature_list.append('HASDASH')
        # Bug fix: the original class [.,;:?!-+\'"] contained the range
        # !-+ (chars '!' through '+'), which unintentionally matched
        # #$%&()* and did NOT match a literal dash.  The dash is placed
        # last here so it is taken literally.
        if re.match(r'[.,;:?!+\'"-]', token):
            feature_list.append('PUNCTUATION')

    # Prefix/suffix features; lim_suffix restricts them to words with
    # 4 or more chars.
    if (not lim_suffix) or len(ltoken) >= 4:
        for i in range(1, 5):
            if i <= len(ltoken):
                feature_list.append(use_prefix + ltoken[:i])
        for i in range(1, 5):
            if i <= len(ltoken):
                feature_list.append(use_suffix + ltoken[-1 * i:])

    # Modify features for domain transfer if necessary.
    if use_domain_transfer:
        return create_dt_features(feature_list, dt_label)
    else:
        return feature_list
def get_features(self, token):
    """Return the feature list for one token.

    Features emitted (in order): the lowercased token, symbol-regex
    tags, per-dictionary POS features, lexicon-occurrence features,
    orthographic features, prefixes/suffixes, and cluster features.
    Twitter-symbol tokens (usr/rt/ht/url) short-circuit to just the
    symbol tag.
    """
    ltoken = token.lower()
    feature_list = [ltoken]

    # Symbol tags first: a RT/HT/USR/URL token gets only its tag.
    pos = symbol_tag.tag_token(ltoken)
    if pos:
        feature_list.append('SYMBOL_REGX=' + str(pos))
        if pos in ['usr', 'rt', 'ht', 'url']:
            return ['SYMBOL_REGX=' + str(pos)]

    # POS-dictionary features.  .items() replaces .iteritems() for
    # consistency with pos_set.items() and Python 2/3 compatibility.
    in_pos_dict = False
    for dict_name, dictionary in self.dictionaries.items():
        if ltoken not in dictionary.token_pos_set:
            continue
        in_pos_dict = True
        pos_set = dictionary.token_pos_set[ltoken]
        # Every POS tag the token has been observed with.
        for pos in pos_set:
            feature_list.append(dict_name + '=' + pos)
        # Flag tokens seen with exactly one tag.
        if len(pos_set) == 1:
            feature_list.append(dict_name + '_ONLY=' + pos)
        # Majority tag, only when it is a real majority: a lone tag
        # with count > 1, or 1.5x the count of the runner-up.
        by_count = sorted(((c, p) for p, c in pos_set.items()), reverse=True)
        if len(by_count) == 1 and by_count[0][0] > 1:
            feature_list.append(dict_name + '_MAJORITY=' + by_count[0][1])
        elif len(by_count) > 1 and by_count[0][0] > 1.5 * by_count[1][0]:
            feature_list.append(dict_name + '_MAJORITY=' + by_count[0][1])

    # Lexicon occurrence features (new dictionaries).
    for dictname, dictionary in self.occurences.items():
        if ltoken in dictionary.token_pos_set:
            feature_list.append(dictname)

    # Orthographic features.
    if len(token) > 1 and re.match('^[A-Z]*$', token):
        feature_list.append('ALL_CAPS')
    # Guard token[0]: an empty token would raise IndexError.
    if token and re.match('[A-Z]', token[0]):
        feature_list.append('IS_CAPITALIZED')
    if re.match('.*[0-9].*', token):
        feature_list.append('IS_NUM')
    if re.match(r'[0-9]', token):
        feature_list.append('SINGLEDIGIT')
    if re.match(r'[0-9][0-9]', token):
        feature_list.append('DOUBLEDIGIT')
    if re.match(r'.*-.*', token):
        feature_list.append('HASDASH')
    # Bug fix: the original class [.,;:?!-+\'"] contained the range !-+
    # (chars '!' through '+'), which unintentionally matched #$%&()*
    # and did NOT match a literal dash.  Dash is now last, so literal.
    if re.match(r'[.,;:?!+\'"-]', token):
        feature_list.append('PUNCTUATION')

    # Prefixes and suffixes, only for words of length >= 4.
    if len(ltoken) >= 4:
        for i in range(1, 5):
            if i <= len(ltoken):
                feature_list.append('PREFIX=' + ltoken[:i])
        for i in range(1, 5):
            if i <= len(ltoken):
                feature_list.append('SUFFIX=' + ltoken[-1 * i:])

    # Raw cluster-id features.
    if self.cluster_dictionary:
        feature_list.extend(self.cluster_dictionary.get_clusters(token))

    # Cluster-similarity back-off for dictionary-unseen tokens; '<' is
    # excluded — presumably a markup sentinel (TODO(review): confirm).
    if not in_pos_dict and token != '<':
        m, s = cluster_sim.get_best_match(token.lower())
        if m:
            feature_list.append('CLUST_' + m)
    return feature_list