def __init__(
        self,
        classifier,
        train_positives,
        train_negatives,
        test_positives,
        test_negatives,
):
    """
    Bundle a classifier with its train/test splits, and set up a MAP
    evaluator over the test data.
    """
    # Keep the classifier and the combined (positive | negative) datasets.
    self.classifier = classifier
    self.training_data = train_positives | train_negatives
    self.testing_data = test_positives | test_negatives
    self.test_positives = test_positives
    self.test_negatives = test_negatives
    self.vocabulary = utils.read_wordnet_index()

    # Build a MAP evaluator from a discriminant over the test split.
    # (Helper name spelling "descriminant" is defined elsewhere.)
    discriminant = get_descriminant_func(test_positives, test_negatives)
    self.evaluator = MapEvaluator(discriminant)

    # Filled in later with the "relational-nounishness-scores" used to
    # rank the nouns that seem most relational.
    self.scores = None
def prune_to_top_k_features(k): """ Prunes the features to only the k features having highest mutual information. Only the features listed in extract_features.COUNT_BASED_FEATURES were subjected to this filtering. """ in_path = os.path.join( RELATIONAL_NOUN_FEATURES_DIR, 'accumulated450-min_token_5-min_feat1000') out_path = os.path.join( RELATIONAL_NOUN_FEATURES_DIR, 'accumulated450-min_token_5-top_%d' % k) # Get the top k features to be kept print 'getting the top k features' keep_features = get_top_k_features(k) # Load the base set of features that we'll be pruning features = ef.FeatureAccumulator( vocabulary=utils.read_wordnet_index(), load=in_path) # Do the pruning print 'pruning...' features.prune_features_keep_only(keep_features) # Save the pruned features print 'writing pruned features to disc...' features.write(out_path)
def optimize_syntactic_feature_sets2():
    """
    Run a hyperparameter search over combinations of syntactic feature
    types for an SVM classifier, writing the results to a TSV file.
    """
    # Results for this hyperparameter optimization are written here.
    out_path = os.path.join(
        HYPERPARAMETER_TUNING_DIR, 'optimize_syntactic_feature_sets2.tsv')

    # Load the train/test split and the extracted features.
    train, test = utils.get_train_test_split()
    features_path = os.path.join(
        DATA_DIR, 'relational-noun-features-lexical-wordnet', '0ba')
    features = extract_features.FeatureAccumulator(
        vocabulary=utils.read_wordnet_index(), load=features_path)

    # Feature-set combinations to vary.  Combinations from earlier runs
    # are left commented out as a record of what was already tried.
    parameter_ranges = {
        'syntax_feature_types': [
            #[],
            #['baseline'],
            #['dependency'],
            #['hand_picked'],
            ['pos_unigram'],
            ['pos_unigram', 'pos_bigram'],
            ['lemma_unigram'],
            ['lemma_unigram', 'lemma_bigram'],
            ['surface_unigram', 'surface_bigram'],
            #['dependency', 'hand_picked'],
            #['baseline', 'hand_picked'],
            #['baseline', 'dependency'],
            #['baseline', 'dependency', 'hand_picked'],
        ]
    }

    # Parameters held fixed across every run.
    constants = {
        'kind': 'svm',
        'on_unk': False,
        'C': 0.01,
        'semantic_similarity': 'res',
        'include_suffix': True,
        'syntactic_multiplier': 10.0,
        'semantic_multiplier': 2.0,
        'suffix_multiplier': 0.2
    }

    # Expand the cross product of variable parameters, merged with the
    # constant parameters, into concrete classifier definitions.
    classifier_definitions = test_classifier.generate_classifier_definitions(
        parameter_ranges, constants)

    # Evaluate the classifier for every generated definition.
    test_classifier.optimize_classifier(
        classifier_definitions, features,
        train['pos'], train['neg'], test['pos'], test['neg'],
        out_path, num_procs=1)
def prune_features_more():
    """
    Further prune the accumulated features: load the min_feat5000
    accumulator and keep only features occurring at least 1000 times.
    """
    in_path = os.path.join(
        RELATIONAL_NOUN_FEATURES_DIR,
        'accumulated450-min_token_5-min_feat5000')
    out_path = os.path.join(
        RELATIONAL_NOUN_FEATURES_DIR,
        'accumulated450-min_token_5-min_feat1000')

    accumulator = ef.FeatureAccumulator(
        vocabulary=utils.read_wordnet_index(), load=in_path)
    accumulator.prune_features(1000)
    accumulator.write(out_path)
def combine_batches():
    """
    Coalesce all of the 50-extract batch directories into a single
    accumulated feature set.

    Note, when run, pruning to features having at least 5000 occurrences
    didn't work.  So I ran ``prune_features_more()`` somewhat later after
    noticing that.
    """
    in_dir = os.path.join(DATA_DIR, 'relational-noun-features-lexical-wordnet')

    # Every batch directory produced by coalesce_batch matches this pattern.
    batch_dirs = t4k.ls(
        in_dir, absolute=True, match='.*accumulated50-', files=False)

    out_dir = os.path.join(in_dir, 'accumulated450-min_token_5-min_feat5000')
    ef.coalesce_features(
        out_dir=out_dir,
        min_token_occurrences=5,
        min_feature_occurrences=5000,
        vocabulary=utils.read_wordnet_index(),
        feature_dirs=batch_dirs
    )
def calculate_mutual_information(feature_sets, out_fname): # Tolerate providing a single feature set. Make into a proper set. if isinstance(feature_sets, basestring): feature_sets = set([feature_sets]) else: feature_sets = set(feature_sets) # Separate count based features and non-count features count_based_features = list(feature_sets & set(ef.COUNT_BASED_FEATURES)) non_count_features = list(feature_sets & set(ef.NON_COUNT_FEATURES)) # Validation: ensure no unexpected features were provided unexpected_features = (feature_sets - set(ef.COUNT_BASED_FEATURES) - set(ef.NON_COUNT_FEATURES)) # Make sure no misspelled features were included if len(unexpected_features): raise ValueError('Unexpected feature(s): %s' % ', '.join(unexpected_features)) # Define the path at which to write. If no fname was given, then name # the file after the first element of names_of_runs out_path = os.path.join(RELATIONAL_NOUN_FEATURES_DIR, out_fname) # Load the features if not provided wni = utils.read_wordnet_index() features_path = os.path.join(RELATIONAL_NOUN_FEATURES_DIR, 'accumulated450-min_token_5-min_feat1000') start = time.time() features = ef.FeatureAccumulator(wni, load=features_path) print 'time to read features elapsed: %s' % (time.time() - start) # Load relational noun annotations annots = annotations.Annotations(features.dictionary) features.calculate_mutual_information(annots, out_path, count_based_features, non_count_features)
def coalesce_batch(batch_num):
    """
    Coalesce some of the feature extracts (a batch of 50 of them).
    batch_num determines which 50 extracts will be coalesced.  Do some
    light pruning.
    """
    in_dir = os.path.join(DATA_DIR, 'relational-noun-features-lexical-wordnet')

    # Take the batch_num'th run of 50 from the three-hex-digit extract dirs.
    first = 50 * batch_num
    last = 50 * (batch_num + 1)
    feature_dirs = t4k.ls(
        in_dir, absolute=True, match='/[0-9a-f]{3,3}$', files=False
    )[first:last]

    out_dir = os.path.join(
        in_dir, 'accumulated50-min_token_5-min_feat100-%d' % batch_num)
    ef.coalesce_features(
        out_dir=out_dir,
        min_token_occurrences=2,
        min_feature_occurrences=100,
        vocabulary=utils.read_wordnet_index(),
        feature_dirs=feature_dirs
    )
def extract_all_featurea_for_wordet_nouns():
    """
    Extract all features for the wordnet noun vocabulary.

    NOTE(review): the misspellings in this function's name ("featurea",
    "wordet") are preserved because callers may reference it by name.
    """
    target_dir = os.path.join(
        DATA_DIR, 'relational-noun-features-wordnet-only')
    do_extract_all_features(
        out_dir=target_dir, vocabulary=utils.read_wordnet_index())