def run_train():
    features, labels = torch.load("../calculated_features/features.tr"), \
        torch.load("../calculated_features/labels.tr")
    permutation = torch.randperm(labels.shape[0])
    features, labels = features[permutation], labels[permutation]
    train_test_split_id = int(labels.shape[0] * 0.75)
    train_features, train_labels = features[:train_test_split_id], labels[:train_test_split_id]
    test_features, test_labels = features[train_test_split_id:], labels[train_test_split_id:]
    trainloader = torch.utils.data.DataLoader(
        PsobDataset(train_features, train_labels),
        batch_size=BATCH_SIZE, shuffle=True, num_workers=2
    )
    testloader = torch.utils.data.DataLoader(
        PsobDataset(test_features, test_labels),
        batch_size=BATCH_SIZE, shuffle=False, num_workers=2
    )
    model = Model(features.shape[1])
    criterion = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    for epoch in range(EPOCHS):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs
            inputs, labels = data
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # print statistics
            running_loss += loss.item()
            if i % 10 == 9:  # print every 10 mini-batches
                print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 10))
                running_loss = 0.0
    print('Finished Training')
    correct = 0
    total = 0
    labels_correct = torch.zeros(NUM_OF_AUTHORS)
    with torch.no_grad():
        for data in testloader:
            features, labels = data
            outputs = model(features)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            for i, label in enumerate(labels):
                labels_correct[label] += predicted[i] == labels[i]
    print('Accuracy of the network: %d / %d = %d %%' % (
        correct, total, 100 * correct / total))
    print(labels_correct)
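
# The training routines in this module wrap their tensors in a PsobDataset,
# which is defined elsewhere in the repository. The sketch below is only a
# minimal, plausible implementation inferred from the call sites (features,
# labels, and an optional `metrics` list selecting feature columns); the real
# class may differ.
class PsobDataset(torch.utils.data.Dataset):
    def __init__(self, features, labels, metrics=None):
        # Keep only the requested metric columns when a subset is given.
        self.features = features if metrics is None else features[:, metrics]
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        # Return one (feature vector, author label) pair for the DataLoader.
        return self.features[index], self.labels[index]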
def get_test_accuracy_by_epoch() -> Tuple[List[int], List[float], List[int]]:
    logger = logging.getLogger('early_stopping')
    configure_logger_by_default(logger)
    logger.info("START get_test_accuracy_by_epoch")
    # Single train/test split taken from the configured cross-validator.
    train_index, test_index = next(CONFIG['cv'].split(INPUT_FEATURES, INPUT_LABELS))
    model = Model(INPUT_FEATURES.shape[1])
    criterion = CONFIG['criterion']()
    optimizer = CONFIG['optimizer'](model.parameters(), lr=CONFIG['lr'])
    train_features, train_labels = INPUT_FEATURES[train_index], INPUT_LABELS[train_index]
    test_features, test_labels = INPUT_FEATURES[test_index], INPUT_LABELS[test_index]
    # Standardize features using statistics of the training fold only.
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_features = scaler.transform(train_features)
    test_features = scaler.transform(test_features)
    trainloader = torch.utils.data.DataLoader(
        PsobDataset(train_features, train_labels),
        batch_size=CONFIG['batch_size'], shuffle=CONFIG['shuffle'], num_workers=2
    )
    testloader = torch.utils.data.DataLoader(
        PsobDataset(test_features, test_labels),
        batch_size=CONFIG['batch_size'], shuffle=CONFIG['shuffle'], num_workers=2
    )
    accuracies = []
    best_accuracy = -1
    durations = []
    current_duration = 0
    for epoch in tqdm(range(CONFIG['epochs'])):
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        # Evaluate on the held-out fold after every epoch.
        correct = 0
        total = 0
        with torch.no_grad():
            for data in testloader:
                features, labels = data
                outputs = model(features)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = correct / total
        # Track how many consecutive epochs passed without improvement.
        if best_accuracy >= accuracy:
            current_duration += 1
        else:
            if current_duration != 0:
                durations.append(current_duration)
            current_duration = 0
        best_accuracy = max(best_accuracy, accuracy)
        accuracies.append(accuracy)
        logger.info(str(epoch) + ": " + str(accuracy))
    if current_duration != 0:
        durations.append(current_duration)
    logger.info("END get_test_accuracy_by_epoch")
    return [i for i in range(CONFIG['epochs'])], accuracies, durations
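
# `Model` is defined elsewhere in the repository. The sketch below is only one
# plausible shape, assumed here to be a small fully connected classifier; the
# hidden size is an arbitrary placeholder. Because run_train pairs the model
# with nn.NLLLoss, this sketch emits log-probabilities; the functions that use
# nn.CrossEntropyLoss would instead expect raw logits.
class Model(nn.Module):
    def __init__(self, input_dim, number_of_authors=NUM_OF_AUTHORS, hidden_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, number_of_authors),
            nn.LogSoftmax(dim=1)  # log-probabilities over authors
        )

    def forward(self, x):
        return self.net(x)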
def get_accuracies_for_lr() -> Dict[float, float]:
    logger = logging.getLogger('learning_rate')
    configure_logger_by_default(logger)
    logger.info("START get_accuracies_for_lr")
    accuracies_by_lr = defaultdict(lambda: -1.0)
    for lr in CONFIG['params']['lr']:
        logger.info("lr = " + str(lr))
        skf = CONFIG['cv']
        train_index, test_index = next(skf.split(INPUT_FEATURES, INPUT_LABELS))
        model = Model(INPUT_FEATURES.shape[1])
        criterion = CONFIG['criterion']()
        optimizer = CONFIG['optimizer'](model.parameters(), lr=lr, momentum=CONFIG['momentum'])
        train_features, train_labels = INPUT_FEATURES[train_index], INPUT_LABELS[train_index]
        test_features, test_labels = INPUT_FEATURES[test_index], INPUT_LABELS[test_index]
        trainloader = torch.utils.data.DataLoader(
            PsobDataset(train_features, train_labels),
            batch_size=CONFIG['batch_size'], shuffle=CONFIG['shuffle'], num_workers=2
        )
        testloader = torch.utils.data.DataLoader(
            PsobDataset(test_features, test_labels),
            batch_size=CONFIG['batch_size'], shuffle=CONFIG['shuffle'], num_workers=2
        )
        current_duration = 0
        for epoch in tqdm(range(CONFIG['epochs'])):
            for i, data in enumerate(trainloader, 0):
                inputs, labels = data
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
            # Evaluate on the held-out fold after every epoch.
            correct = 0
            total = 0
            with torch.no_grad():
                for data in testloader:
                    features, labels = data
                    outputs = model(features)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()
            accuracy = correct / total
            # Early stopping: count epochs without improvement for this lr.
            if accuracies_by_lr[lr] >= accuracy:
                current_duration += 1
            else:
                current_duration = 0
            accuracies_by_lr[lr] = max(accuracies_by_lr[lr], accuracy)
            if current_duration > CONFIG['early_stopping_rounds']:
                break
            if epoch % 10 == 0:
                logger.info("CHECKPOINT EACH 10th EPOCH " + str(epoch) + ": " + str(accuracy))
            if epoch % 100 == 0:
                logger.info("CHECKPOINT EACH 100th EPOCH " + str(epoch) + ": " + str(accuracy))
            logger.info(str(epoch) + ": " + str(accuracy))
    logger.info("END get_accuracies_for_lr")
    return accuracies_by_lr
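
# Hypothetical usage sketch (not part of the original module): pick the learning
# rate with the highest held-out accuracy from the dict returned above.
def print_best_lr():
    accuracies_by_lr = get_accuracies_for_lr()
    best_lr = max(accuracies_by_lr, key=accuracies_by_lr.get)
    print("best lr:", best_lr, "accuracy:", accuracies_by_lr[best_lr])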
def get_best_metrics_and_accuracy_from_metrics_set(metrics_sets) -> Tuple[List[int], float]:
    logger = logging.getLogger('finding_best_metrics_and_accuracy')
    configure_logger_by_default(logger)
    logger.info("STARTED FINDING BEST METRICS SET")
    loaded_features, loaded_labels = \
        torch.load("../calculated_features/split_each_file_features.tr"), \
        torch.load("../calculated_features/split_each_file_labels.tr")
    skf = StratifiedKFold(n_splits=10, shuffle=True)
    train_index, test_index = next(skf.split(loaded_features, loaded_labels))
    best_metrics = None
    best_accuracy = -1
    # Train and evaluate one model per candidate subset of metrics.
    for metrics in tqdm(metrics_sets):
        metrics = list(metrics)
        if len(metrics) == 0:
            continue
        model = Model(len(metrics))
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
        train_features, train_labels = loaded_features[train_index], loaded_labels[train_index]
        test_features, test_labels = loaded_features[test_index], loaded_labels[test_index]
        trainloader = torch.utils.data.DataLoader(
            PsobDataset(train_features, train_labels, metrics),
            batch_size=BATCH_SIZE, shuffle=True, num_workers=2
        )
        testloader = torch.utils.data.DataLoader(
            PsobDataset(test_features, test_labels, metrics),
            batch_size=BATCH_SIZE, shuffle=False, num_workers=2
        )
        for _ in range(EPOCHS):
            for i, data in enumerate(trainloader, 0):
                inputs, labels = data
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
        correct = 0
        total = 0
        with torch.no_grad():
            for data in testloader:
                features, labels = data
                outputs = model(features)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = correct / total
        log_info = str(metrics) + ": " + str(accuracy)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_metrics = metrics
            log_info += " NEW BEST"
        logger.info(log_info)
    logger.info("END FINDING BEST METRICS SET")
    return best_metrics, best_accuracy
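
# configure_logger_by_default is a repo helper not shown in this file. The
# sketch below is only an assumption about what it might do (attach a stream
# handler with a simple format at INFO level); the real implementation may
# write to log files or use a different format.
def configure_logger_by_default(logger):
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s"))
        logger.addHandler(handler)
    logger.setLevel(logging.INFO)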
def train_bp(model, train_features, train_labels, test_features, test_labels, config):
    print_info = config['pso_options']['print_info']
    criterion = config['criterion']
    optimizer = config['optimizer'](model.parameters(), lr=config['lr'])
    trainloader = torch.utils.data.DataLoader(
        PsobDataset(train_features, train_labels),
        batch_size=config['batch_size'], shuffle=config['shuffle'], num_workers=2
    )
    testloader = torch.utils.data.DataLoader(
        PsobDataset(test_features, test_labels),
        batch_size=config['batch_size'], shuffle=config['shuffle'], num_workers=2
    )
    best_accuracy = -1.0
    train_accuracy = -1.0
    train_loss = -1.0
    test_loss = -1.0
    current_duration = 0
    print_evaluation_before_train(model, criterion, train_features, train_labels,
                                  test_features, test_labels, print_info)
    for epoch in range(config['epochs']):
        for inputs, labels in trainloader:
            inputs = inputs.to(config['device'])
            labels = labels.to(config['device'])
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        # Evaluate test accuracy after every epoch.
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in testloader:
                inputs = inputs.to(config['device'])
                labels = labels.to(config['device'])
                outputs = model(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = correct / total
        if best_accuracy >= accuracy:
            current_duration += 1
        else:
            current_duration = 0
            # Record train accuracy and losses whenever test accuracy improves.
            with torch.no_grad():
                train_correct = 0
                train_total = 0
                for inputs, labels in trainloader:
                    inputs = inputs.to(config['device'])
                    labels = labels.to(config['device'])
                    outputs = model(inputs)
                    _, predicted = torch.max(outputs.data, 1)
                    train_total += labels.size(0)
                    train_correct += (predicted == labels).sum().item()
                train_accuracy = train_correct / train_total
                train_loss = criterion(model(train_features), train_labels).item()
                test_loss = criterion(model(test_features), test_labels).item()
        best_accuracy = max(best_accuracy, accuracy)
        if current_duration > config['early_stopping_rounds']:
            print_info("On epoch " + str(epoch) + " training was early stopped")
            break
        if epoch % 100 == 0:
            with torch.no_grad():
                print_100th_checkpoint_evaluation(epoch, model, criterion,
                                                  train_features, train_labels,
                                                  test_features, test_labels, print_info)
    # Final validation pass with per-author statistics.
    correct = 0
    total = 0
    labels_dist = torch.zeros(config['number_of_authors'])
    labels_correct = torch.zeros(config['number_of_authors'])
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs = inputs.to(config['device'])
            labels = labels.to(config['device'])
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            for i, label in enumerate(labels):
                labels_dist[label] += 1
                labels_correct[label] += predicted[i] == labels[i]
    print_info('Finished training')
    best_accuracy = max(best_accuracy, correct / total)
    print_info('Best accuracy: ' + str(best_accuracy))
    print_info('Accuracy of the last validation of the network: %d / %d = %d %%' %
               (correct, total, 100 * correct / total))
    print_info("Correct labels / labels for each author of last validation:\n" +
               str(torch.stack((labels_correct, labels_dist), dim=1)))
    return best_accuracy, train_accuracy, test_loss, train_loss
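
# print_evaluation_before_train and print_100th_checkpoint_evaluation are repo
# helpers not shown here. The sketches below are assumptions inferred only from
# their call sites in train_bp: they report train/test loss and accuracy via
# the supplied print_info callback. The _accuracy helper is hypothetical.
def _accuracy(model, features, labels):
    # Fraction of correctly predicted labels over a full feature tensor.
    _, predicted = torch.max(model(features).data, 1)
    return (predicted == labels).sum().item() / labels.size(0)


def print_evaluation_before_train(model, criterion, train_features, train_labels,
                                  test_features, test_labels, print_info):
    with torch.no_grad():
        print_info("before training: train loss %.4f, test loss %.4f, "
                   "train acc %.4f, test acc %.4f" % (
                       criterion(model(train_features), train_labels).item(),
                       criterion(model(test_features), test_labels).item(),
                       _accuracy(model, train_features, train_labels),
                       _accuracy(model, test_features, test_labels)))


def print_100th_checkpoint_evaluation(epoch, model, criterion, train_features, train_labels,
                                      test_features, test_labels, print_info):
    print_info("epoch %d: train loss %.4f, test loss %.4f, train acc %.4f, test acc %.4f" % (
        epoch,
        criterion(model(train_features), train_labels).item(),
        criterion(model(test_features), test_labels).item(),
        _accuracy(model, train_features, train_labels),
        _accuracy(model, test_features, test_labels)))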
def fit_model(file_to_print):
    logger = logging.getLogger('one_split_fit')
    configure_logger_by_default(logger)
    logger.info("START fit_model")

    def print_info(info):
        # Report to the logger, stdout, and the provided output file.
        logger.info(info)
        print(info)
        file_to_print.write(info + "\n")

    train_index, test_index = next(CONFIG['cv'].split(INPUT_FEATURES, INPUT_LABELS))
    model = Model(len(CONFIG['metrics']))
    criterion = CONFIG['criterion']()
    optimizer = CONFIG['optimizer'](model.parameters(), lr=CONFIG['lr'], momentum=CONFIG['momentum'])
    train_features, train_labels = INPUT_FEATURES[train_index], INPUT_LABELS[train_index]
    test_features, test_labels = INPUT_FEATURES[test_index], INPUT_LABELS[test_index]
    # Standardize features using statistics of the training fold only.
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_features = scaler.transform(train_features)
    test_features = scaler.transform(test_features)
    trainloader = torch.utils.data.DataLoader(
        PsobDataset(train_features, train_labels, CONFIG['metrics']),
        batch_size=CONFIG['batch_size'], shuffle=CONFIG['shuffle'], num_workers=2
    )
    testloader = torch.utils.data.DataLoader(
        PsobDataset(test_features, test_labels, CONFIG['metrics']),
        batch_size=CONFIG['batch_size'], shuffle=CONFIG['shuffle'], num_workers=2
    )
    best_accuracy = -1.0
    current_duration = 0
    for epoch in range(CONFIG['epochs']):
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        # Evaluate on the held-out fold after every epoch.
        correct = 0
        total = 0
        with torch.no_grad():
            for data in testloader:
                features, labels = data
                outputs = model(features)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = correct / total
        if best_accuracy >= accuracy:
            current_duration += 1
        else:
            current_duration = 0
        best_accuracy = max(best_accuracy, accuracy)
        if current_duration > CONFIG['early_stopping_rounds']:
            print_info("On epoch " + str(epoch) + " training was early stopped")
            break
        if epoch % 10 == 0:
            logger.info("CHECKPOINT EACH 10th EPOCH " + str(epoch) + ": " + str(accuracy))
        if epoch % 100 == 0:
            print_info("CHECKPOINT EACH 100th EPOCH " + str(epoch) + ": current accuracy " +
                       str(accuracy) + ", best " + str(best_accuracy))
        logger.info(str(epoch) + ": " + str(accuracy))
    logger.info('Finished Training')
    # Final validation pass with per-author statistics.
    correct = 0
    total = 0
    labels_dist = torch.zeros(CONFIG['number_of_authors'])
    labels_correct = torch.zeros(CONFIG['number_of_authors'])
    with torch.no_grad():
        for data in testloader:
            features, labels = data
            outputs = model(features)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            for i, label in enumerate(labels):
                labels_dist[label] += 1
                labels_correct[label] += predicted[i] == labels[i]
    print_info('Best accuracy: ' + str(max(best_accuracy, correct / total)))
    print_info('Final accuracy of the network: %d / %d = %d %%' %
               (correct, total, 100 * correct / total))
    print_info("Correct labels / labels for each author:\n" +
               str(torch.stack((labels_correct, labels_dist), dim=1)))
    logger.info("END fit_model")
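
# The CONFIG dict is built elsewhere in the repository. The example below only
# illustrates the keys that the functions above actually read; every value is
# an assumption and should be adjusted to the real experiment setup. (train_bp
# receives a separate config that additionally provides 'device' and
# 'pso_options'.)
EXAMPLE_CONFIG = {
    'cv': StratifiedKFold(n_splits=10, shuffle=True),
    'criterion': nn.CrossEntropyLoss,   # stored as a class, instantiated via CONFIG['criterion']()
    'optimizer': optim.SGD,
    'lr': 0.01,
    'momentum': 0.9,
    'batch_size': 32,
    'shuffle': True,
    'epochs': 5000,
    'early_stopping_rounds': 500,
    'metrics': [0, 1, 2, 3, 4, 6, 7, 9],
    'number_of_authors': NUM_OF_AUTHORS,
    'params': {'lr': [0.001, 0.01, 0.1]},
}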
def run_cross_validation():
    k_fold = 10
    loaded_features, loaded_labels = \
        torch.load("../calculated_features/split_each_file_features.tr"), \
        torch.load("../calculated_features/split_each_file_labels.tr")
    skf = RepeatedStratifiedKFold(n_splits=k_fold, n_repeats=10)
    # metrics = [i for i in range(19)]
    metrics = [0, 1, 2, 3, 4, 6, 7, 9]
    accuracies = torch.zeros((10, 10))
    loop = 0
    for train_index, test_index in skf.split(loaded_features, loaded_labels):
        model = Model(len(metrics))
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
        train_features, train_labels = loaded_features[train_index], loaded_labels[train_index]
        test_features, test_labels = loaded_features[test_index], loaded_labels[test_index]
        trainloader = torch.utils.data.DataLoader(
            PsobDataset(train_features, train_labels, metrics),
            batch_size=BATCH_SIZE, shuffle=True, num_workers=2
        )
        testloader = torch.utils.data.DataLoader(
            PsobDataset(test_features, test_labels, metrics),
            batch_size=BATCH_SIZE, shuffle=False, num_workers=2
        )
        for epoch in range(EPOCHS):
            running_loss = 0.0
            for i, data in enumerate(trainloader, 0):
                # get the inputs
                inputs, labels = data
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward + backward + optimize
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                # print statistics
                running_loss += loss.item()
                if i % 10 == 9:
                    # print('[%d, %5d] loss: %.3f' %
                    #       (epoch + 1, i + 1, running_loss / 10))
                    running_loss = 0.0
        print('Finished Training')
        correct = 0
        total = 0
        labels_correct = torch.zeros(NUM_OF_AUTHORS)
        with torch.no_grad():
            for data in testloader:
                features, labels = data
                outputs = model(features)
                _, predicted = torch.max(outputs.data, 1)
                print(predicted)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                for i, label in enumerate(labels):
                    labels_correct[label] += predicted[i] == labels[i]
        print('Accuracy of the network: %d / %d = %d %%' % (
            correct, total, 100 * correct / total))
        print(labels_correct)
        accuracies[loop % 10][int(loop / 10)] = correct / total
        loop += 1
    # Summarize accuracy across folds and repeats.
    print(torch.mean(accuracies, 1))
    print(torch.std(accuracies, 1))
    print(accuracies)
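
# Hypothetical entry point (not in the original module) showing how the
# training routines above might be invoked.
if __name__ == '__main__':
    run_train()
    run_cross_validation()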