Esempio n. 1
0
def packet_sequences_only(X):
    """Flatten every packet sequence in X and zero-pad to a common length.

    Each element of X is a sequence of packets, and each packet is itself
    an iterable of values. Every sequence is flattened into a single list
    of values and right-padded with zeros (two per missing packet) so that
    it lines up with the longest sequence in X.

    NOTE: X is modified in place (each element is replaced by its
    flattened, padded list); the same data is returned as a numpy array.

    The padding arithmetic assumes every packet contributes exactly two
    values -- TODO confirm against the dataset loader.
    """
    # Longest sequence, counted in packets; 0 when X is empty.
    # (List + `or [0]` instead of max(..., default=0) keeps Python 2 support.)
    maximum = max([len(seq) for seq in X] or [0])
    # Report the actual target length, not the length of whichever
    # sequence happened to be visited last.
    log('Padding each sequence to {}'.format(maximum))
    for i, seq in enumerate(X):
        # Flatten the packets, then pad with two zeros per missing packet.
        flat = [value for packet in seq for value in packet]
        X[i] = flat + [0] * (2 * (maximum - len(seq)))

    return np.array(X)
Esempio n. 2
0
def split_training_test(D, Y, train_size, test_size):
    """Splits distances D and labels Y into training and test
    sets as explained below.

    Note: train_size + test_size <= len(D)
    (this precondition is not checked by the function).

    The test set of size test_size is first sampled.
    Then, a training set of size train_size is sampled from the
    remaining examples.
    This allows varying the size of the training set, keeping
    a fixed test set.

    If train_size + test_size = len(D), the function simply
    splits (D,Y) into a training and test set using all the
    examples.

    Both sampling steps are stratified on the labels and use
    ``args.seed`` as random state; ``args`` is not a parameter, so it
    must already exist in the enclosing (module) scope when this
    function is called.

    Returns the tuple (Dtrain, Ytrain, Dtest, Ytest), where:
      - Dtrain is D restricted to the training indexes on both rows
        and columns;
      - Ytrain, Ytest are Y indexed by the training/test indexes;
      - Dtest, as currently written, is ``D[:, Itest]``: every row of
        D with only the test columns (see the NOTE(review) in the body).

    Assumes D and Y support indexing with a list of integer positions
    (e.g. NumPy arrays) -- TODO confirm against callers.
    """
    n = len(D)
    # NOTE: we need to remove both rows AND columns from
    # the distance matrix.
    log('Splitting into training/test set keeping uniform labels')
    # First get the test set
    I = range(n)  # Indexes
    # Stratified split of the index list: Itest gets test_size entries,
    # Iother keeps everything else.
    Iother, Itest = train_test_split(I,
                                     test_size=test_size,
                                     stratify=Y,
                                     random_state=args.seed)
    Ytest = Y[Itest]
    # Need to do this in two steps
    # NOTE(review): the next line is a dead store. The line after it slices
    # D again instead of Dtest, so the row selection is discarded and the
    # returned Dtest keeps ALL rows of D (only columns are restricted).
    # This disagrees with the "rows AND columns" note above and with the
    # Dtrain code below. Confirm what callers expect before changing it.
    Dtest = D[Itest, :]
    Dtest = D[:, Itest]
    # Now the training set
    if train_size < len(Iother):
        log('Reduced training set')
        # Now sample train_size instances from Iother to create the training set
        # The second return value (the unused remainder) is discarded.
        Itrain, _ = train_test_split(Iother,
                                     train_size=train_size,
                                     stratify=Y[Iother],
                                     random_state=args.seed)
    else:
        # train_size covers all remaining examples: use them all.
        Itrain = Iother
    log('Training set has size {}'.format(len(Itrain)))
    Ytrain = Y[Itrain]
    # Need to do this in two steps
    # (rows first, then columns of the row-restricted matrix; presumably
    # because indexing with two index lists at once would pair them
    # element-wise under NumPy semantics -- assumes D is a NumPy array).
    Dtrain = D[Itrain, :]
    Dtrain = Dtrain[:, Itrain]

    return Dtrain, Ytrain, Dtest, Ytest
def run(traces, outfname):
    """Compute pairwise Levenshtein distances for a dataset and store them.

    Loads the dataset found at `traces`, encodes its examples with
    `encode_sizes`, computes the pairwise distances between the encoded
    examples, and serialises the result to `outfname` with dill.

    The stored object is a dict with the keys 'webpage-id', 'label'
    (labels converted to a numpy array) and 'pairdist' (the distances).
    """
    # Only the first three of the five values returned by the loader are used.
    examples, labels, webpage_ids, _, _ = load_dataset(traces)

    encoded = encode_sizes(examples)

    log('Computing pairwise distances')
    distances = pairwise_levenshtein_distances(encoded)
    # NOTE(review): no computation follows this message in this function.
    log('Computing subtractions')

    log('Storing distances into {}'.format(outfname))

    payload = {}
    payload['webpage-id'] = webpage_ids
    payload['label'] = np.array(labels)
    payload['pairdist'] = distances

    with open(outfname, 'wb') as out:
        dill.dump(payload, out)
Esempio n. 4
0
                        action='store_true',
                        help='Compute on packet sequences.',
                        required=False,
                        default=False)
    parser.add_argument('--out',
                        type=str,
                        help='Distance file (.distances).',
                        required=True)
    args = parser.parse_args()

    if not args.sequences:
        X, Y, W, _, _ = load_features(args.features)
    else:
        X, Y, W, _, _ = load_dataset(args.features)
        X = packet_sequences_only(X)

    log('Computing pairwise distances')
    D = pairwise_distances(X)
    log('Computing subtractions')

    log('Storing distances into {}'.format(args.out))

    data = {
        'webpage-id': W,
        'label': np.array(Y),
        'pairdist': D,
    }

    with open(args.out, 'wb') as f:
        dill.dump(data, f)
Esempio n. 5
0
                        default=0)
    parser.add_argument('--bootstrap',
                        help='Use bootstrap.',
                        action='store_true',
                        default=False)
    parser.add_argument('--target',
                        help='Target page for 1 vs All setting.',
                        required=False,
                        type=int)
    parser.add_argument('--out',
                        type=str,
                        help='Results file (.json).',
                        required=True)
    args = parser.parse_args()

    log('Loading distances from {}'.format(args.distances))
    with open(args.distances, 'rb') as f:
        data = dill.load(f)

    D = data['pairdist']
    Y = np.array(data['label'])

    if args.target is not None:
        log('One-vs-all setting using {} as target'.format(args.target))
        log('Reducing the dataset for one-vs-all')
        D, Y = one_vs_all_setting(D, Y, args.target)

    log('Seed is {}'.format(args.seed))

    # (Maybe) apply bootstrap
    if args.bootstrap:
Esempio n. 6
0
                        help='Percentage (or number) of test instances.',
                        required=True)
    parser.add_argument('--seed',
                        type=int,
                        help='PRNG seed (default: 0).',
                        required=False,
                        default=0)
    parser.add_argument('--out',
                        type=str,
                        help='Output file (.json).',
                        required=True)
    args = parser.parse_args()

    X, Y, _, Npages, Nloads = load_features(args.features)

    log('Seed is {}'.format(args.seed))

    n = len(X)
    # Get training/test set size
    if args.train > 1:
        train_size = int(args.train)
    else:
        train_size = int(args.train * n)
    if args.test > 1:
        test_size = int(args.test)
    else:
        test_size = int(args.test * n)

    log('Training set size: {}. Test set size: {}.'.format(
        train_size, test_size))
    if train_size + test_size != n: