# NOTE: this snippet assumes module-level imports of os, utils,
# extract_features, and test_classifier, plus the constants DATA_DIR and
# HYPERPARAMETER_TUNING_DIR, none of which are shown in the fragment.
def optimize_syntactic_feature_sets2():

    # We'll write results for this hyperparameter optimization here:
    out_path = os.path.join(HYPERPARAMETER_TUNING_DIR,
                            'optimize_syntactic_feature_sets2.tsv')

    # Read in the training set splits and the features
    train, test = utils.get_train_test_split()
    features_path = os.path.join(DATA_DIR,
                                 'relational-noun-features-lexical-wordnet',
                                 '0ba')
    features = extract_features.FeatureAccumulator(
        vocabulary=utils.read_wordnet_index(), load=features_path)

    # Define the ranges over which parameters should be varied
    parameter_ranges = {
        'syntax_feature_types': [
            #[],
            #['baseline'],
            #['dependency'],
            #['hand_picked'],
            ['pos_unigram'],
            ['pos_unigram', 'pos_bigram'],
            ['lemma_unigram'],
            ['lemma_unigram', 'lemma_bigram'],
            ['surface_unigram', 'surface_bigram'],
            #['dependency', 'hand_picked'],
            #['baseline', 'hand_picked'],
            #['baseline', 'dependency'],
            #['baseline', 'dependency', 'hand_picked'],
        ]
    }

    # Define the values of parameters to be held constant
    constants = {
        'kind': 'svm',
        'on_unk': False,
        'C': 0.01,
        'semantic_similarity': 'res',
        'include_suffix': True,
        'syntactic_multiplier': 10.0,
        'semantic_multiplier': 2.0,
        'suffix_multiplier': 0.2
    }

    # Generate all combinations of the varied parameters, merging in the
    # constant parameters.
    classifier_definitions = test_classifier.generate_classifier_definitions(
        parameter_ranges, constants)

    # Evaluate a classifier for each of the classifier definitions
    test_classifier.optimize_classifier(classifier_definitions,
                                        features,
                                        train['pos'],
                                        train['neg'],
                                        test['pos'],
                                        test['neg'],
                                        out_path,
                                        num_procs=1)
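# A minimal sketch of what test_classifier.generate_classifier_definitions
# presumably does (hypothetical reimplementation, not the project's code):
# take the Cartesian product of the varied parameters and merge each
# combination into a copy of the constants, one definition dict per run.
import itertools

def generate_classifier_definitions_sketch(parameter_ranges, constants):
    keys = sorted(parameter_ranges)
    definitions = []
    for values in itertools.product(*(parameter_ranges[k] for k in keys)):
        definition = dict(constants)
        definition.update(zip(keys, values))
        definitions.append(definition)
    return definitions

# e.g. generate_classifier_definitions_sketch({'C': [0.01, 0.1]}, {'kind': 'svm'})
# -> [{'kind': 'svm', 'C': 0.01}, {'kind': 'svm', 'C': 0.1}]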


def optimize_pruning2():

    # We'll write results for this hyperparameter optimization here:
    out_path = os.path.join(HYPERPARAMETER_TUNING_DIR, 'optimize_pruning2.tsv')

    # Read in the training set splits and the features
    train, test = utils.get_train_test_split()
    features = extract_features.FeatureAccumulator(load=os.path.join(
        DATA_DIR, 'relational-noun-features-wordnet-only', 'accumulated'))

    # Define the ranges over which parameters should be varied
    parameter_ranges = {
        'min_feature_frequency': [
            200,
            500,
            1000,
            2000,
            5000,
            10000,
            #20000, 50000, 100000, 200000, 500000, 1000000,
        ]
    }

    # Define the values of parameters to be held constant
    constants = {
        'kind': 'svm',
        'on_unk': False,
        'C': 0.01,
        'syntax_feature_types': ['baseline', 'dependency', 'hand_picked'],
        'semantic_similarity': 'res',
        'include_suffix': True,
        'syntactic_multiplier': 0.33,
        'semantic_multiplier': 0.33,
        'suffix_multiplier': 0.33,
    }

    # Generate all combinations of the varied parameters, merging in the
    # constant parameters.
    classifier_definitions = test_classifier.generate_classifier_definitions(
        parameter_ranges, constants)

    # Evaluate a classifier for each of the classifier definitions
    test_classifier.optimize_classifier(classifier_definitions,
                                        features,
                                        train['pos'],
                                        train['neg'],
                                        test['pos'],
                                        test['neg'],
                                        out_path,
                                        num_procs=12)
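# Hypothetical sketch of the pruning that min_feature_frequency suggests:
# drop any feature observed fewer than min_feature_frequency times before
# training (the real FeatureAccumulator internals are not shown here).
from collections import Counter

def prune_features_sketch(feature_counts, min_feature_frequency):
    """feature_counts: Counter mapping feature name -> corpus frequency."""
    return {f: c for f, c in feature_counts.items()
            if c >= min_feature_frequency}

# e.g. prune_features_sketch(Counter({'pos:NN': 12000, 'lemma:of': 800}), 1000)
# -> {'pos:NN': 12000}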
Example 3
    # kw holds parsed command-line arguments; bail out early if the saved
    # tokenizer file is missing
    tokenizer_fname = kw['tokenizer']
    if not path.isfile(tokenizer_fname):
        print('tokenizer file "%s" does not exist' % tokenizer_fname)
        exit()

    # load model
    embedding_matrix = np.load(embedding_fname)
    variant = kw['model']
    model = full_model(embedding_matrix=embedding_matrix, variant=variant)
    model.load_weights(weights_fname)

    # load data
    cards = pd.read_csv('processed_sets.csv', sep='\t')

    _, _, m_test, _ = get_train_test_split(cards, FULL_INPUTS)
    _, _, x_test, y_test = get_train_test_split(cards, ['text'])

    # load the fitted tokenizer saved during training (the pickle load
    # replaces the object wholesale, so no fresh Tokenizer() is needed)
    with open(tokenizer_fname, 'rb') as handle:
        tokenizer = pickle.load(handle)

    train_split = 0.8
    valid_split = 0.1
    test_split = 1 - train_split - valid_split
    frac = int((valid_split / (valid_split + test_split)) * len(x_test))

    m_test = m_test[frac:]
    x_test = x_test[frac:]
    y_test = y_test[frac:]
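    # The frac arithmetic above mirrors the training-time split: of the rows
    # held out of training, the first frac were used for validation, so only
    # the remainder is evaluated here. E.g. with train_split=0.8,
    # valid_split=0.1, test_split=0.1 and 200 held-out rows,
    # frac = int((0.1 / 0.2) * 200) = 100, and rows [100:] form the test set.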
Example 4

    wv_size = kw['size']
    # substitute the word-vector size into a templated embedding filename
    if 'zzz' in embedding_fname:
        embedding_fname = embedding_fname.replace('zzz', str(wv_size))

    batch_size = 48
    epochs = 50

    # load data
    cards = pd.read_csv('processed_sets.csv', sep='\t')

    # split data
    train_split = 0.8
    valid_split = 0.1
    test_split = 1 - train_split - valid_split
    assert test_split > 0, 'there is no data to test on'
    manas_train, _, manas_test, _ = get_train_test_split(
        cards, FULL_INPUTS, train_split)
    x_train, y_train, x_test, y_test = get_train_test_split(
        cards, ['text'], train_split)

    # split the test set into validation and test sets
    frac = int((valid_split / (valid_split + test_split)) * len(x_test))
    x_valid = x_test[:frac]
    y_valid = y_test[:frac]
    manas_valid = manas_test[:frac]

    x_test = x_test[frac:]
    y_test = y_test[frac:]
    manas_test = manas_test[frac:]

    # tokenize descriptions
    corpus = cards['text'].str.split().values
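    # The fragment ends here. A plausible continuation (hypothetical, not the
    # original code), assuming the standard Keras Tokenizer/pad_sequences API
    # and that pad_sequences is imported from keras.preprocessing.sequence:
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)                        # accepts token lists
    x_train_seq = tokenizer.texts_to_sequences(x_train)   # tokens -> integer ids
    x_train_pad = pad_sequences(x_train_seq)              # pad to uniform length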
Example 5
    # (reconstructed: the fragment begins mid-call; kw['classifier'] below
    # implies an add_argument call like this one)
    parser.add_argument('--classifier',
                        '-c',
                        help='choose classifier',
                        default='mlp')
    args = parser.parse_args()
    kw = vars(args)
    method = kw['classifier']

    #type_cmc = ['type', 'cmc', 'legendary']
    full_inputs = ['type', 'C', 'R', 'U', 'B', 'G', 'W', 'X',
                   'B/G', 'B/R', 'G/U', 'G/W', 'R/G', 'R/W', 'U/B',
                   'U/R', 'W/B', 'W/U', 'legendary', 'text']

    # load dataset
    cards = pd.read_csv('processed_sets.csv', sep='\t')
    train_values, train_labels, test_values, test_labels = \
        get_train_test_split(cards, full_inputs)

    # choose and train model
    if method == 'mlp':
        print('classifying with MLP...')
        model = mlp_classification(train_values, train_labels)
    elif method == 'rf':
        print('classifying with random forest...')
        model = rf_classification(train_values, train_labels)
    elif method == 'svm':
        print('classifying with SVM...')
        model = svm_classification(train_values, train_labels)
    else:
        print('Unrecognized classifier: "%s"' % method)
        exit()
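    # Example invocation (the script filename is assumed; it is not shown in
    # this fragment):
    #
    #   $ python classify.py -c rf
    #   classifying with random forest...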