Ejemplo n.º 1
0
def make_feature(fc):
    '''Builds a new `StringCounter` from the many `StringCounters` in the
    input `fc`.  This StringCounter will define one of the targets for
    the `MultinomialNB` classifier.

    This crucial function decides the relative importance of features
    extracted by the ETL pipeline.  This is essentially a form of
    domain fitting that allows us to tune the extraction to the fields
    that are important to a domain.  However, if the NER for a domain
    is inadequate, then the primary purpose of these relative
    weightings is to remove bogus NER extractions.

    :param fc: mapping indexed by feature name (``fc[name]``); each
      value is a counter of strings that supports ``.items()`` and
      iteration over its keys.  Every name in the weights table below
      is looked up.

    :returns: a 3-tuple ``(feat, rejects, keepers)`` where ``feat`` is
      the weighted sum of the selected counters with the empty-string
      key removed, ``rejects`` is the set of strings seen in any of
      the reject features, and ``keepers`` is the set of strings seen
      in any of the keeper features.

    '''
    # Feature names whose strings are collected into `keepers`.
    # Other combinations tried previously:
    #   ['GPE', 'PERSON', 'ORGANIZATION', 'usernames']
    #   ['usernames', 'phone', 'email', 'ORGANIZATION', 'PERSON']
    keepers_keys = ('phone', 'email')
    # Feature names whose strings are collected into `rejects`.
    rejects_keys = ('keywords', 'usernames', 'ORGANIZATION', 'PERSON')

    # (feature name, multiplier) pairs: each count in fc[name] is
    # scaled by the multiplier before being added into `feat`.
    weighted_features = (
        ('keywords', 10**4),
        ('GPE', 1),
        ('bow', 1),
        ('bowNP_sip', 10**8),
        ('phone', 10**12),
        ('email', 10**12),
        ('bowNP', 10**3),
        ('PERSON', 10**8),
        ('ORGANIZATION', 10**6),
        ('usernames', 10**12),
    )

    feat = StringCounter()
    rejects = set()
    keepers = set()
    for name, strength in weighted_features:
        counts = fc[name]
        if strength == 1:
            # Unit weight: add the counter directly, no scaled copy needed.
            feat += counts
        else:
            feat += StringCounter({key: strength * count
                                   for key, count in counts.items()})
        # Use set.update rather than map(set.add, ...): map() is lazy on
        # Python 3, so the old form silently left both sets empty.
        if name in rejects_keys:
            rejects.update(counts)
        if name in keepers_keys:
            keepers.update(counts)

    # The empty string is never a meaningful feature key; drop it once
    # after everything has been accumulated.
    if u'' in feat:
        feat.pop(u'')
    return feat, rejects, keepers