Example #1
    def get_features(self, token):
        ltoken = token.lower()
        feature_list = [ltoken]

        # First off, if it looks like it's a RT, HT, USR, or URL don't
        # add any features but the symbol tags
        # Get potential symbol tags
        pos = symbol_tag.tag_token(ltoken)
        if pos:
            feature_list.append('SYMBOL_REGX=' + str(pos))
        if pos in ['usr', 'rt', 'ht', 'url']:
            return ['SYMBOL_REGX=' + str(pos)]

        # Use the dictionaries to see what common tags exist
        in_pos_dict = False
        dictionary_list = []
        for dict_name, dictionary in self.dictionaries.items():
            if ltoken in dictionary.token_pos_set:
                in_pos_dict = True
                # Record all POS tags the token has been seen with
                pos_set = dictionary.token_pos_set[ltoken]
                for pos in pos_set:
                    feature_list.append(dict_name + '=' + pos)
                # Record if it has only been seen with one
                if len(pos_set) == 1:
                    feature_list.append(dict_name + '_ONLY=' + pos)
                # Record the majority POS tag
                # Make sure the majority is a real majority
                pos_l = sorted(((count, pos) for pos, count in pos_set.items()),
                               reverse=True)
                # A lone tag counts as a majority only if seen more than once
                if len(pos_l) == 1 and pos_l[0][0] > 1:
                    feature_list.append(dict_name + '_MAJORITY=' + pos_l[0][1])
                # Otherwise the top tag must beat the runner-up by 1.5x
                elif len(pos_l) > 1 and pos_l[0][0] > 1.5 * pos_l[1][0]:
                    feature_list.append(dict_name + '_MAJORITY=' + pos_l[0][1])

                # Record that dictionary found something
                dictionary_list.append(dict_name)

        # Check if the token occurs in new dictionaries (lexicons)
        for dictname, dictionary in self.occurences.items():
            if ltoken in dictionary.token_pos_set:
                feature_list.append(dictname)

        # Get basic regular expression features
        # Check if the token is all caps and no symbols
        if len(token) > 1 and re.match('^[A-Z]*$', token):
            feature_list.append('ALL_CAPS')

        # Check if the token is capitalized
        if re.match('[A-Z]', token[0]):
            feature_list.append('IS_CAPITALIZED')

        # Check if the token contains a number
        if re.match('.*[0-9].*', token):
            feature_list.append('IS_NUM')

        # New ortho features
        # Anchored so these fire only for bare one- and two-digit tokens
        if re.match(r'^[0-9]$', token):
            feature_list.append('SINGLEDIGIT')
        if re.match(r'^[0-9][0-9]$', token):
            feature_list.append('DOUBLEDIGIT')
        if '-' in token:
            feature_list.append('HASDASH')
        # Dash escaped so the class is not read as the range '!-+'
        if re.match(r'[.,;:?!\-+\'"]', token):
            feature_list.append('PUNCTUATION')

        # Only for words with 4 or longer chars
        if len(ltoken) >= 4:
            # Get prefixes
            for i in range(1, 5):
                if i <= len(ltoken):
                    feature_list.append('PREFIX=' + ltoken[:i])
            # Get suffixes
            for i in range(1, 5):
                if i <= len(ltoken):
                    feature_list.append('SUFFIX=' + ltoken[-1 * i:])

        # Add cluster features (RAW)
        if self.cluster_dictionary:
            feature_list.extend(self.cluster_dictionary.get_clusters(token))

        # Add cluster similarity features
        if not in_pos_dict and token != '<':
            m, s = cluster_sim.get_best_match(ltoken)
            if m:
                feature_list.append('CLUST_' + m)

        return feature_list
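
The `_MAJORITY` rule above only fires when the top tag is a "real" majority: a lone tag needs a count above 1, and with competing tags the leader must beat the runner-up by 1.5x. Below is a self-contained sketch of that rule, assuming `token_pos_set[ltoken]` maps each POS tag to a count (the shape implied by how the method uses it); the helper name `majority_tag` is ours, not part of the original class.

def majority_tag(pos_counts):
    """Return the majority POS tag, or None when there is no clear majority.

    pos_counts maps POS tag -> count, mirroring token_pos_set[ltoken] above.
    """
    pos_l = sorted(((count, pos) for pos, count in pos_counts.items()),
                   reverse=True)
    # A lone tag counts as a majority only if it was seen more than once
    if len(pos_l) == 1 and pos_l[0][0] > 1:
        return pos_l[0][1]
    # With competing tags, the leader must beat the runner-up by 1.5x
    if len(pos_l) > 1 and pos_l[0][0] > 1.5 * pos_l[1][0]:
        return pos_l[0][1]
    return None

print(majority_tag({'NN': 4}))           # NN
print(majority_tag({'NN': 1}))           # None: seen only once
print(majority_tag({'NN': 6, 'VB': 2}))  # NN: 6 > 1.5 * 2
print(majority_tag({'NN': 3, 'VB': 2}))  # None: 3 <= 1.5 * 2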
Example #2
    def get_features(self, token, use_dicts=True, use_cap=True, use_num=True,
                     use_prefix='PREFIX=', use_suffix='SUFFIX=',
                     use_major_pos=True, diction_avail=False,
                     new_maj=True, all_caps=True, new_dictions=True,
                     use_domain_transfer=False, dt_label='TARG',
                     use_symbol_tag=True, lim_suffix=True,
                     new_ortho=True, lim_tags=True):
        ltoken = token.lower()
        feature_list = [ltoken]

        # First off, if it looks like it's a RT, HT, USR, or URL don't
        # add any features but the symbol tags
        # Get potential symbol tags
        if use_symbol_tag:
            pos = symbol_tag.tag_token(ltoken)
            if pos:
                feature_list.append('SYMBOL_REGX=' + str(pos))        
            if lim_tags and pos in ['usr', 'rt', 'ht', 'url']:
                return ['SYMBOL_REGX=' + str(pos)]

        # Use the dictionaries to see what common tags exist
        if use_dicts:
            dictionary_list = []
            for dict_name, dictionary in self.dictionaries.items():
                if ltoken in dictionary.token_pos_set:
                    # Record all POS tags the token has been seen with
                    pos_set = dictionary.token_pos_set[ltoken]
                    for pos in pos_set:
                        feature_list.append(dict_name + '=' + pos)
                    # Record if it has only been seen with one
                    if len(pos_set) == 1:
                        feature_list.append(dict_name + '_ONLY=' + pos)
                    # Record the majority POS tag
                    if use_major_pos:
                        if not new_maj:
                            major = dictionary.token_pos_majority[ltoken]
                            feature_list.append(dict_name + '_MAJORITY=' 
                                                + major)
                        else:
                            # Make sure the majority is a real majority
                            pos_l = sorted(((count, pos) for pos, count
                                            in pos_set.items()),
                                           reverse=True)
                            # A lone tag counts as a majority only if seen
                            # more than once
                            if len(pos_l) == 1 and pos_l[0][0] > 1:
                                feature_list.append(dict_name + '_MAJORITY='
                                                    + pos_l[0][1])
                            # Otherwise the top tag must beat the runner-up
                            # by 1.5x
                            elif len(pos_l) > 1 and (pos_l[0][0] >
                                                     1.5 * pos_l[1][0]):
                                feature_list.append(dict_name + '_MAJORITY='
                                                    + pos_l[0][1])

                    # Record that dictionary found something
                    dictionary_list.append(dict_name)

            # Record which dictionaries the token is found in
            if diction_avail:
                if not dictionary_list:
                    feature_list.append('NOT_IN_DICTS')
                elif len(dictionary_list) > 1:
                    feature_list.append('IN_MULTIPLE_DICTS')
                else:
                    feature_list.append('ONLY_IN=' + dictionary_list[0])

        # Check if the token occurs in new dictionaries
        if new_dictions:
            for dictname, dictionary in self.occurences.items():
                if ltoken in dictionary.token_pos_set:
                    feature_list.append(dictname)

        # Get basic regular expression features
        # Check if the token is all caps and no symbols
        if all_caps:
            if len(token) > 1 and re.match('^[A-Z]*$', token):
                feature_list.append('ALL_CAPS')

        # Check if the token is capitalized  
        if re.match('[A-Z]', token[0]):
            feature_list.append('IS_CAPITALIZED')
        elif not new_ortho:
            feature_list.append('IS_LOWERCASE')

        # Check if the token contains a number 
        if re.match('.*[0-9].*', token):
            feature_list.append('IS_NUM')
        elif not new_ortho:
            feature_list.append('NOT_NUM')

        # New ortho features
        if new_ortho:
            # Anchored so these fire only for bare one- and two-digit tokens
            if re.match(r'^[0-9]$', token):
                feature_list.append('SINGLEDIGIT')
            if re.match(r'^[0-9][0-9]$', token):
                feature_list.append('DOUBLEDIGIT')
            if '-' in token:
                feature_list.append('HASDASH')
            # Dash escaped so the class is not read as the range '!-+'
            if re.match(r'[.,;:?!\-+\'"]', token):
                feature_list.append('PUNCTUATION')

        # Only for words with 4 or longer chars
        if (not lim_suffix) or len(ltoken) >= 4:
            # Get prefixes
            for i in range(1, 5):
                if i <= len(ltoken):
                    feature_list.append(use_prefix + ltoken[:i])
            # Get suffixes                            
            for i in range(1, 5):
                if i <= len(ltoken):
                    feature_list.append(use_suffix + ltoken[-1*i:])

        # Modify features for domain transfer if necessary
        if use_domain_transfer:
            return create_dt_features(feature_list, dt_label)
        else:
            return feature_list
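
`create_dt_features` is not shown in this example. Judging by the name and the `dt_label='TARG'` default, it most likely performs feature augmentation in the style of Daumé III's "frustratingly easy" domain adaptation: every feature is kept as a shared copy and duplicated with a domain-specific prefix. The sketch below is an assumption about its behavior, not the original helper.

def create_dt_features(feature_list, dt_label):
    """Assumed sketch of Daume-style feature augmentation.

    Each feature appears twice: once unchanged (shared across domains) and
    once prefixed with the domain label, so the learner can weight general
    and domain-specific evidence independently.
    """
    augmented = []
    for feature in feature_list:
        augmented.append(feature)                   # shared copy
        augmented.append(dt_label + '_' + feature)  # domain-specific copy
    return augmented

# With dt_label='TARG', 'SUFFIX=ing' also yields 'TARG_SUFFIX=ing'
print(create_dt_features(['SUFFIX=ing', 'IS_CAPITALIZED'], 'TARG'))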