def get_feature_vectors_from_data(data, interest=['obfuscation'], exclude=['verification','imitation']):
    """
    Flatten the nested data into parallel lists of features and class labels.

    Parameters:
        data: two-level mapping; every inner mapping goes from a text name to
            its feature vector. (The outer keys are presumably author names;
            they are only used to reach the inner mappings.)
        interest: name substrings handed to name_has_substring to decide the
            positive class.
        exclude: name substrings handed to name_has_substring to decide which
            texts are left out entirely.

    Returns:
        A tuple (features, labels) of equally long lists. labels[i] is 1 when
        name_has_substring(name, interest) is true for the text behind
        features[i], and 0 otherwise.
    """
    features = []
    labels = []
    for outer_key in data:
        for text_name, vector in data[outer_key].items():
            # Excluded texts contribute neither a feature nor a label, so the
            # two lists stay aligned.
            if name_has_substring(text_name, exclude):
                continue
            features.append(vector)
            labels.append(1 if name_has_substring(text_name, interest) else 0)
    return features, labels
def create_splits(data, samples=10, exclude=['verification', 'imitation', 'obfuscation'], attack=['obfuscation'], authors_per_split=40):
    """
    Create leave-one-author-out splits for attributing obfuscated texts.

    With the defaults, each split uses 39 authors to learn natural vs
    obfuscation, and the author of an obfuscated text must then be identified
    on the basis of only natural texts. Works similarly to
    att_classify.create_splits_attack, but also adds obfuscated vs natural
    pairs.

    Parameters:
        data: mapping author -> mapping story name -> feature vector.
        samples: number of random author selections to draw.
        exclude: name substrings (matched via name_has_substring) of stories
            that are kept out of the training set.
        attack: name substrings (matched via name_has_substring) of the
            attack stories. This check runs before the exclude check.
        authors_per_split: number of authors drawn per selection. The default
            of 40 gives 39 training authors plus one held-out author.

    Returns:
        A list with one entry per (author selection, held-out author), each of
        the form ((inset_f, inset_c), (outset_f, outset_c), reg_obf_pairs):
          * inset_f / inset_c: features and author labels of every story in
            the selection that matches neither attack nor exclude, including
            those of the held-out author.
          * outset_f / outset_c: features and author label of the held-out
            author's attack stories.
          * reg_obf_pairs: one (regular_texts, obfuscation) tuple per other
            author in the selection; obfuscation is None when that author has
            no attack story.

    Raises:
        ValueError: raised by random.sample when data holds fewer than
            authors_per_split authors.
    """
    sets = []
    seed(1)  # Set seed to always have the same outcome
    # Sort the authors so the draw is the same on different systems.
    authors = sorted(data.keys())
    authorsets = [sample(authors, authors_per_split) for _ in range(samples)]

    for authorset in authorsets:  # Loop over different selections of authors
        # Loop over the authors to leave out; the attack text of the left-out
        # author is the one to be attributed.
        for exclude_author in authorset:
            inset_f = []
            inset_c = []
            outset_f = []
            outset_c = []
            reg_obf_pairs = []

            for story, features in data[exclude_author].items():
                if name_has_substring(story, attack):
                    outset_f.append(features)
                    outset_c.append(exclude_author)
                elif not name_has_substring(story, exclude):
                    inset_f.append(features)
                    inset_c.append(exclude_author)

            for include_author in authorset:
                if include_author == exclude_author:
                    continue
                regular_texts = []
                # There only ever is one of those in the EBG data set; if
                # several stories match, the last one seen is kept.
                obfuscation = None
                for story, features in data[include_author].items():
                    if name_has_substring(story, attack):
                        obfuscation = features
                    elif not name_has_substring(story, exclude):
                        regular_texts.append(features)
                        inset_f.append(features)
                        inset_c.append(include_author)
                reg_obf_pairs.append((regular_texts, obfuscation))

            sets.append(((inset_f, inset_c), (outset_f, outset_c), reg_obf_pairs))
    return sets