Example #1
    def __init__(
        self,
        classifier,
        train_positives,
        train_negatives,
        test_positives,
        test_negatives,
    ):

        # Register parameters
        self.classifier = classifier
        self.training_data = train_positives | train_negatives
        self.testing_data = test_positives | test_negatives
        self.test_positives = test_positives
        self.test_negatives = test_negatives
        self.vocabulary = utils.read_wordnet_index()

        # Make a MAP evaluator
        descriminant_func = get_descriminant_func(test_positives,
                                                  test_negatives)
        self.evaluator = MapEvaluator(descriminant_func)

        # This will hold the "relational-nounishness" scores used to rank
        # the nouns that seem most relational
        self.scores = None
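A note on the evaluator: MapEvaluator and get_descriminant_func are defined elsewhere in the project, but the metric they implement is mean average precision over a ranking of candidate nouns. The following is a minimal, self-contained sketch of that computation; the names and toy data are illustrative, not the project's actual API.

def average_precision(ranked_items, positives):
    """Average precision of a ranking against a set of known positives."""
    hits = 0
    precision_sum = 0.0
    for rank, item in enumerate(ranked_items, start=1):
        if item in positives:
            hits += 1
            precision_sum += float(hits) / rank
    return precision_sum / len(positives) if positives else 0.0

# Example: rank nouns by score and evaluate against the test positives.
scores = {'teacher': 0.9, 'rock': 0.1, 'friend': 0.8, 'table': 0.2}
ranking = sorted(scores, key=scores.get, reverse=True)
ap = average_precision(ranking, positives={'teacher', 'friend'})  # -> 1.0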
Example #2
def prune_to_top_k_features(k):
    """
    Prunes the features to only the k features having highest mutual 
    information.  Only the features listed in
    extract_features.COUNT_BASED_FEATURES were subjected to this filtering.
    """

    in_path = os.path.join(
        RELATIONAL_NOUN_FEATURES_DIR, 'accumulated450-min_token_5-min_feat1000')

    out_path = os.path.join(
        RELATIONAL_NOUN_FEATURES_DIR, 'accumulated450-min_token_5-top_%d' % k)

    # Get the top k features to be kept
    print 'getting the top k features'
    keep_features = get_top_k_features(k)

    # Load the base set of features that we'll be pruning
    features = ef.FeatureAccumulator(
        vocabulary=utils.read_wordnet_index(), load=in_path)

    # Do the pruning
    print 'pruning...'
    features.prune_features_keep_only(keep_features)

    # Save the pruned features
    print 'writing pruned features to disc...'
    features.write(out_path)
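get_top_k_features is defined elsewhere in this module; a plausible minimal sketch, assuming the mutual information scores were previously written as a two-column, tab-separated file of feature name and score (the path argument and file format here are assumptions, not the project's actual layout):

def get_top_k_features_sketch(k, scores_path):
    """Return the k feature names with the highest mutual information,
    read from a tab-separated file of (feature, score) rows."""
    scored = []
    with open(scores_path) as f:
        for line in f:
            feature, score = line.rstrip('\n').split('\t')
            scored.append((feature, float(score)))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return set(feature for feature, _ in scored[:k])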
Example #3
def optimize_syntactic_feature_sets2():

    # We'll write results for this hyperparameter optimization here:
    out_path = os.path.join(HYPERPARAMETER_TUNING_DIR,
                            'optimize_syntactic_feature_sets2.tsv')

    # Read in the training set splits and the features
    train, test = utils.get_train_test_split()
    features_path = os.path.join(DATA_DIR,
                                 'relational-noun-features-lexical-wordnet',
                                 '0ba')
    features = extract_features.FeatureAccumulator(
        vocabulary=utils.read_wordnet_index(), load=features_path)

    # Define the ranges over which parameters should be varied
    parameter_ranges = {
        'syntax_feature_types': [
            #[],
            #['baseline'],
            #['dependency'],
            #['hand_picked'],
            ['pos_unigram'],
            ['pos_unigram', 'pos_bigram'],
            ['lemma_unigram'],
            ['lemma_unigram', 'lemma_bigram'],
            ['surface_unigram', 'surface_bigram'],
            #['dependency', 'hand_picked'],
            #['baseline', 'hand_picked'],
            #['baseline', 'dependency'],
            #['baseline', 'dependency', 'hand_picked'],
        ]
    }

    # Define the values of parameters to be held constant
    constants = {
        'kind': 'svm',
        'on_unk': False,
        'C': 0.01,
        'semantic_similarity': 'res',
        'include_suffix': True,
        'syntactic_multiplier': 10.0,
        'semantic_multiplier': 2.0,
        'suffix_multiplier': 0.2
    }

    # Generate all combinations of the variable parameters, while including
    # the constant parameters.
    classifier_definitions = test_classifier.generate_classifier_definitions(
        parameter_ranges, constants)

    # Evaluate a classifier for each of the classifier definitions
    test_classifier.optimize_classifier(classifier_definitions,
                                        features,
                                        train['pos'],
                                        train['neg'],
                                        test['pos'],
                                        test['neg'],
                                        out_path,
                                        num_procs=1)
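test_classifier.generate_classifier_definitions presumably expands the parameter ranges into one classifier definition per combination of the varied values, with the constants merged into each. A minimal sketch of that expansion using itertools.product (the helper name and exact output shape are assumptions):

from itertools import product

def generate_definitions_sketch(parameter_ranges, constants):
    """Yield one settings dict per combination of the varied parameters,
    with the constant parameters merged into each."""
    names = sorted(parameter_ranges)
    for values in product(*(parameter_ranges[name] for name in names)):
        definition = dict(constants)
        definition.update(zip(names, values))
        yield definition

# With the single varied parameter above (five candidate feature sets), this
# yields five definitions, each also carrying 'kind': 'svm', 'C': 0.01, etc.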
Example #4
def prune_features_more():
    in_path = os.path.join(
        RELATIONAL_NOUN_FEATURES_DIR, 'accumulated450-min_token_5-min_feat5000')
    out_path = os.path.join(
        RELATIONAL_NOUN_FEATURES_DIR, 'accumulated450-min_token_5-min_feat1000')
    features = ef.FeatureAccumulator(
        vocabulary=utils.read_wordnet_index(), load=in_path)
    features.prune_features(1000)
    features.write(out_path)
Example #5
def combine_batches():
    # Note: when this was run, pruning to features having at least 5000
    # occurrences didn't work, so I ran ``prune_features_more()`` later,
    # after noticing that.
    in_dir = os.path.join(DATA_DIR, 'relational-noun-features-lexical-wordnet')
    feature_dirs = t4k.ls(
        in_dir, absolute=True, match='.*accumulated50-', files=False
    )
    out_dir = os.path.join(
        in_dir, 'accumulated450-min_token_5-min_feat5000')

    ef.coalesce_features(
        out_dir=out_dir,
        min_token_occurrences=5,
        min_feature_occurrences=5000,
        vocabulary=utils.read_wordnet_index(),
        feature_dirs=feature_dirs
    )
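ef.coalesce_features belongs to the project's feature-extraction module; conceptually it sums per-directory feature counts for each token and then drops rare tokens and rare features. A rough, self-contained sketch of that idea using collections.Counter (the in-memory data layout assumed here is hypothetical):

from collections import Counter

def coalesce_counts_sketch(per_dir_counts, min_token, min_feature):
    """Sum feature counts per token across directories, then drop tokens
    seen fewer than min_token times and features seen fewer than
    min_feature times overall."""
    combined = {}                      # token -> Counter of feature counts
    for dir_counts in per_dir_counts:  # one dict per feature directory
        for token, counts in dir_counts.items():
            combined.setdefault(token, Counter()).update(counts)

    feature_totals = Counter()
    for counts in combined.values():
        feature_totals.update(counts)

    kept = {}
    for token, counts in combined.items():
        if sum(counts.values()) < min_token:
            continue
        pruned = Counter({f: c for f, c in counts.items()
                          if feature_totals[f] >= min_feature})
        if pruned:
            kept[token] = pruned
    return kept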
Example #6
def calculate_mutual_information(feature_sets, out_fname):

    # Tolerate a single feature set being provided; convert it into a proper set.
    if isinstance(feature_sets, basestring):
        feature_sets = set([feature_sets])
    else:
        feature_sets = set(feature_sets)

    # Separate count based features and non-count features
    count_based_features = list(feature_sets & set(ef.COUNT_BASED_FEATURES))
    non_count_features = list(feature_sets & set(ef.NON_COUNT_FEATURES))

    # Validation: ensure no unexpected features were provided
    unexpected_features = (feature_sets - set(ef.COUNT_BASED_FEATURES) -
                           set(ef.NON_COUNT_FEATURES))
    # Make sure no misspelled features were included
    if len(unexpected_features):
        raise ValueError('Unexpected feature(s): %s' %
                         ', '.join(unexpected_features))

    # Define the path at which the mutual information results will be written
    out_path = os.path.join(RELATIONAL_NOUN_FEATURES_DIR, out_fname)

    # Load the features if not provided
    wni = utils.read_wordnet_index()
    features_path = os.path.join(RELATIONAL_NOUN_FEATURES_DIR,
                                 'accumulated450-min_token_5-min_feat1000')
    start = time.time()
    features = ef.FeatureAccumulator(wni, load=features_path)
    print 'time elapsed reading features: %s' % (time.time() - start)

    # Load relational noun annotations
    annots = annotations.Annotations(features.dictionary)

    features.calculate_mutual_information(annots, out_path,
                                          count_based_features,
                                          non_count_features)
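The mutual information calculation itself lives inside FeatureAccumulator; for a binary feature and the binary relational/non-relational label it reduces to the standard discrete formula. A self-contained sketch of that formula (purely illustrative, not the project's implementation):

import math

def binary_mutual_information(n11, n10, n01, n00):
    """Mutual information (in nats) between a binary feature and a binary
    label, given the four cells of their contingency table:
    n11 = feature present & positive, n10 = present & negative,
    n01 = absent & positive,          n00 = absent & negative."""
    total = float(n11 + n10 + n01 + n00)
    mi = 0.0
    for n, row, col in ((n11, n11 + n10, n11 + n01),
                        (n10, n11 + n10, n10 + n00),
                        (n01, n01 + n00, n11 + n01),
                        (n00, n01 + n00, n10 + n00)):
        if n:
            mi += (n / total) * math.log(n * total / (row * col))
    return mi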
Example #7
def coalesce_batch(batch_num):
    """
    Coalesce some of the feature extracts (a batch of 50 of them).  
    batch_num determines which 50 extracts will be coalesced.
    Do some light pruning.
    """

    in_dir = os.path.join(DATA_DIR, 'relational-noun-features-lexical-wordnet')
    start = 50*batch_num
    stop = 50*(batch_num+1)
    feature_dirs = t4k.ls(
        in_dir, absolute=True, match='/[0-9a-f]{3,3}$', files=False
    )[start:stop]
    out_dir = os.path.join(
        in_dir, 'accumulated50-min_token_5-min_feat100-%d'%batch_num)

    ef.coalesce_features(
        out_dir=out_dir,
        min_token_occurrences=2,
        min_feature_occurrences=100,
        vocabulary=utils.read_wordnet_index(),
        feature_dirs=feature_dirs
    )
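t4k.ls with a match pattern returns the subdirectories whose paths match the regex; the batching itself is just a contiguous slice of 50 of them. A standard-library sketch of the same selection (the three-hex-digit directory naming is taken from the regex above; everything else is assumed):

import os
import re

def select_batch_sketch(in_dir, batch_num, batch_size=50):
    """Return the batch_num-th slice of feature directories under in_dir,
    keeping only names made of exactly three hex digits, in sorted order."""
    pattern = re.compile(r'^[0-9a-f]{3}$')
    dirs = sorted(
        os.path.join(in_dir, name) for name in os.listdir(in_dir)
        if pattern.match(name) and os.path.isdir(os.path.join(in_dir, name))
    )
    return dirs[batch_size * batch_num : batch_size * (batch_num + 1)]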
Example #8
def extract_all_featurea_for_wordet_nouns():
    do_extract_all_features(
        out_dir=os.path.join(DATA_DIR, 'relational-noun-features-wordnet-only'),
        vocabulary=utils.read_wordnet_index())