Exemple #1
0
def load_project_data(project_name, n_folds=5):
    """Build stratified n-fold cross-validation splits for a project.

    The project's examples are read with ``parse_c45`` from ``DATA_PATH`` and
    separated by class: an example whose last element is truthy is treated
    as positive, every other example as negative. Each class is shuffled and
    dealt evenly across ``n_folds`` folds, so the per-class counts of any two
    folds differ by at most one.

    Args:
        project_name: name of the project, passed through to ``parse_c45``.
        n_folds: number of folds to build (default 5). Must be at least 1.

    Returns:
        A list of ``n_folds`` ``(train, test)`` tuples of ``ExampleSet``
        objects. In the i-th tuple, ``test`` holds fold ``i`` and ``train``
        holds the examples of all the other folds.

    Raises:
        ValueError: if ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError("n_folds must be at least 1, got %r" % (n_folds,))

    root_dir = abspath(DATA_PATH)
    data = parse_c45(project_name, root_dir)

    # Separate the classes so each fold can receive its share of both.
    pos_data = []
    neg_data = []
    for ex in data:
        if ex[-1]:
            pos_data.append(ex)
        else:
            neg_data.append(ex)

    random.shuffle(pos_data)
    random.shuffle(neg_data)

    folds = []
    for i in range(n_folds):
        # Strided slices hand out the shuffled examples round-robin, which
        # keeps every fold populated; contiguous ceil-sized chunks would
        # leave the trailing folds short or empty.
        fold = pos_data[i::n_folds]
        # Negatives are dealt starting from the last fold so that leftover
        # positives and leftover negatives land on opposite ends, keeping
        # the total fold sizes within one example of each other as well.
        fold.extend(neg_data[n_folds - 1 - i::n_folds])

        # Mix the two classes inside the fold so order-sensitive
        # (stochastic) training does not see all positives first.
        random.shuffle(fold)
        folds.append(fold)

    # Pair every fold, as the test set, with the union of the other folds.
    fold_sets = []
    for i, test in enumerate(folds):
        train = []
        for j, fold in enumerate(folds):
            if j != i:
                train.extend(fold)
        fold_sets.append((ExampleSet(train), ExampleSet(test)))
    return fold_sets
Exemple #2
0
def load_project_data(project_name, n_folds=5):
    """Return stratified cross-validation (train, test) pairs for a project.

    Examples are loaded through ``parse_c45`` from the ``DATA_PATH``
    directory. An example counts as positive when its last element is
    truthy and as negative otherwise. Both classes are shuffled and then
    distributed round-robin over the folds, so no fold ends up short of a
    class while another has a surplus: per-class counts differ by at most
    one between folds.

    Args:
        project_name: project identifier handed to ``parse_c45``.
        n_folds: how many folds to create (default 5); must be >= 1.

    Returns:
        A list with one ``(ExampleSet(train), ExampleSet(test))`` tuple per
        fold. ``test`` is the held-out fold and ``train`` is the
        concatenation of the remaining folds.

    Raises:
        ValueError: if ``n_folds`` is less than 1.
    """
    if n_folds < 1:
        raise ValueError("n_folds must be >= 1, got {0!r}".format(n_folds))

    examples = parse_c45(project_name, abspath(DATA_PATH))

    # Partition by class label (last element of each example).
    positives, negatives = [], []
    for example in examples:
        (positives if example[-1] else negatives).append(example)

    random.shuffle(positives)
    random.shuffle(negatives)

    # Deal the shuffled examples round-robin. The previous ceil-sized
    # contiguous chunks could leave the last folds empty for a class.
    folds = [[] for _ in range(n_folds)]
    for idx, example in enumerate(positives):
        folds[idx % n_folds].append(example)
    # Negatives are dealt from the last fold backwards so the remainders of
    # the two classes do not pile up on the same folds.
    for idx, example in enumerate(negatives):
        folds[n_folds - 1 - idx % n_folds].append(example)

    # Interleave the classes inside each fold; this matters for
    # order-sensitive (stochastic) training.
    for fold in folds:
        random.shuffle(fold)

    # Each fold serves once as the test set against all the others.
    fold_sets = []
    for held_out, test in enumerate(folds):
        train = []
        for fold in folds[:held_out] + folds[held_out + 1:]:
            train.extend(fold)
        fold_sets.append((ExampleSet(train), ExampleSet(test)))
    return fold_sets
def load_project_data(project_name):
    """Load a project's examples and split them randomly into train and test.

    The examples are read with ``parse_c45`` from ``DATA_PATH``. Four fifths
    of them (rounded down) are picked at random for training; all remaining
    examples form the test set. Within each set the examples keep the order
    in which ``parse_c45`` returned them.

    Args:
        project_name: name of the project, passed through to ``parse_c45``.

    Returns:
        A ``(train, test)`` tuple of ``ExampleSet`` objects.
    """
    root_dir = abspath(DATA_PATH)

    data = parse_c45(project_name, root_dir)

    n_data = len(data)
    # Integer arithmetic gives floor(4/5 * n_data) without a float round trip.
    n_train = (4 * n_data) // 5

    # Sample the indices of the training examples; ``range`` works on both
    # Python 2 and Python 3, unlike ``xrange``.
    train_choices = set(random.sample(range(n_data), n_train))

    train_data, test_data = [], []
    for i, ex in enumerate(data):
        if i in train_choices:
            train_data.append(ex)
        else:
            test_data.append(ex)

    return ExampleSet(train_data), ExampleSet(test_data)
def load_project_data(project_name):
    """Return a random 4/5 train and 1/5 test split of a project's examples.

    Data is loaded via ``parse_c45`` from the ``DATA_PATH`` directory. The
    training set receives ``floor(4/5 * n)`` randomly chosen examples and the
    test set receives every example that was not chosen. Both sets preserve
    the original relative order of the loaded examples.

    Args:
        project_name: project identifier handed to ``parse_c45``.

    Returns:
        A tuple ``(ExampleSet(train_data), ExampleSet(test_data))``.
    """
    examples = parse_c45(project_name, abspath(DATA_PATH))

    total = len(examples)
    # Same value as int(floor(4 / 5.0 * total)), computed with integers only.
    train_size = (4 * total) // 5

    # Indices selected for training. ``range`` is used instead of the
    # Python-2-only ``xrange`` so the function also runs on Python 3.
    train_indices = set(random.sample(range(total), train_size))

    train_data, test_data = [], []
    for index, example in enumerate(examples):
        target = train_data if index in train_indices else test_data
        target.append(example)

    return ExampleSet(train_data), ExampleSet(test_data)