set_f.append(f) if name_has_substring(n, interest): set_c.append(1) else: set_c.append(0) return set_f, set_c if __name__ == '__main__': from matplotlib import pyplot as plt print "Loading data.." from feature_extraction.Cached_Features import data print "Normalizing..." # Select features data = data_select_specific_features(data, ['bi_char_dist', 'legomena', 'word_length', 'tri_char_dist', 'mono_tag_dist', 'sentence_length', 'readability']) # Get the data separated in features and classes features, classes = get_feature_vectors_from_data(data) # Compres the features to two numbers (points) FP = Feature_Preprocessor(features, False, True, 2) features = FP.batch_normalize(features) print "Data processed, now plotting..." # Convert a list of points to two lists of x and y points (fortran style) x = [ p[0] for p in features ] y = [ p[1] for p in features ]
Decide based on _pairs_ if text described as _outset_f_ is obfuscated or not """ regular = [] obfuscated = [] for (reg, obf) in pairs: obfuscated.append(obf) for r in reg: regular.append(r) features = regular + obfuscated classes = [ 0 for _ in xrange(len(regular)) ] + [ 1 for _ in xrange(len(obfuscated)) ] return AdaBoostClassifier_predict_texttype(features, classes, outset_f)[0] == 1 if __name__ == '__main__': print "Loading data.." from feature_extraction.Cached_Features import data print "Working..." features=['mono_char_dist', 'mono_chunk_dist', 'bi_tag_dist', 'word_length', 'legomena', 'bi_char_dist', 'readability', 'mono_tag_dist'] samples = 1 # Set to high number for more accurate mearusements data = data_select_specific_features(data, features) sets = create_splits(data, samples=samples) average = lambda x : sum(x) / len(x) for deobf in ['never', 'detect', 'always']: ranks = get_precision_at_rank(sets, deobf=deobf) print "deobf:"+deobf, ", ave(recall):", average(ranks), ranks print "Done"