def get_feature_vectors_from_data(data, interest=['obfuscation'], exclude=['verification','imitation']):
    """
       Flatten nested data into a list of feature vectors and a parallel
       list of binary class labels.

       data is a mapping whose values are themselves mappings from an
       entry name to its feature vector. Entries whose name matches
       `exclude` are skipped; every remaining entry is labelled 1 when its
       name matches `interest` and 0 otherwise. Matching is delegated to
       name_has_substring (presumably "name contains any of the given
       substrings" -- confirm against that helper).

       Returns the tuple (features, labels); both lists have equal length.
    """
    features = []
    labels = []
    for entries in data.values():
        for name, vector in entries.items():
            if name_has_substring(name, exclude):
                continue    # Excluded entries contribute neither features nor labels
            features.append(vector)
            labels.append(1 if name_has_substring(name, interest) else 0)

    return features, labels
# Example #2
# 0
def create_splits(data, samples=10, exclude=['verification', 'imitation', 'obfuscation'], attack=['obfuscation']):
    """
        Creates leave-one-author-out splits for attributing obfuscated texts.

        `samples` sets of 40 authors are drawn with a fixed seed. Every author
        of a set is held out in turn, so 39 other authors remain per split:
        natural vs obfuscation is learned on those, and the author of the
        held-out obfuscated text must be identified on the basis of only
        natural texts.
        Works similarly to att_classify.create_splits_attack, but also adds
        obfuscated vs natural pairs.

        data maps an author to a mapping of story name -> feature vector.
        Story names are classified with name_has_substring (presumably
        "name contains any of the given substrings" -- confirm against that
        helper): stories matching `attack` are the obfuscated texts, and
        stories matching `exclude` are left out of the training material.

        Returns a list with one entry per (author set, held-out author):
            ((inset_f, inset_c), (outset_f, outset_c), reg_obf_pairs)
        - inset_f / inset_c: features and author labels of the non-attack,
          non-excluded stories of all authors in the set (including the
          held-out author).
        - outset_f / outset_c: features and author labels of the held-out
          author's attack stories.
        - reg_obf_pairs: for every other author, a tuple
          (list of regular feature vectors, obfuscated feature vector or None).

        NOTE: this reseeds the random generator as a side effect, and sample
        (presumably random.sample) will fail when data holds fewer than 40
        authors.
    """
    sets = []
    seed(1)     # Set seed to always have same outcome
    authors = sorted(data.keys())   # sort for same outcome on different systems
    # range instead of the Python 2-only xrange, so this runs on Python 2 and 3.
    authorsets = [sample(authors, 40) for _ in range(samples)]
    for authorset in authorsets:    # Loop over different selections of authors
        for exclude_author in authorset:    # Author to leave out and attribute the obfuscated text of.
            inset_f, inset_c = [], []
            outset_f, outset_c = [], []
            reg_obf_pairs = []

            # The held-out author: attack stories become the test material,
            # the remaining non-excluded stories still go to the training set.
            for story, features in data[exclude_author].items():
                if name_has_substring(story, attack):
                    outset_f.append(features)
                    outset_c.append(exclude_author)
                elif not name_has_substring(story, exclude):
                    inset_f.append(features)
                    inset_c.append(exclude_author)

            for include_author in authorset:
                if include_author == exclude_author:
                    continue
                regular_texts = []
                # There only ever is one obfuscation per author in the EBG
                # data set; if there were more, the last one seen would win.
                obfuscation = None
                for story, features in data[include_author].items():
                    if name_has_substring(story, attack):
                        obfuscation = features
                    elif not name_has_substring(story, exclude):
                        regular_texts.append(features)
                        inset_f.append(features)
                        inset_c.append(include_author)
                reg_obf_pairs.append((regular_texts, obfuscation))

            sets.append(((inset_f, inset_c), (outset_f, outset_c), reg_obf_pairs))

    return sets