def generate_candidates(num_to_generate, out_path, pos, neg, exclude):

    # Open the file to which we will write candidates
    out_f = open(out_path, 'w')

    # Read in the extracted features, which we'll also need for a couple things
    features = extract_features.FeatureAccumulator(
        load=BEST_WORDNET_ONLY_FEATURES_PATH)

    # Make the best performing classifier.  This is what we'll use to score
    # the "relationalness" of new words.
    clf = classifier.make_classifier(
        kind='svm',
        features=features,
        positives=pos,
        negatives=neg,
        **BEST_CLASSIFIER_CONFIG
    )

    # Now generate the candidates.  We only keep track of the number of
    # positives generated, because there are always more negatives
    num_generated = 0
    for token in features.dictionary.get_token_list():

        if token in exclude:
            print '\t\tx\t%s' % token
            continue

        score = clf.score(token)[0]
        if score > clf.threshold:
            print '%s\t+' % token
            out_f.write('%s\t+\t%f\n' % (token, score))
            num_generated += 1
            if num_generated == num_to_generate:
                break
        else:
            print '\t-\t%s' % token
            out_f.write('%s\t-\t%f\n' % (token, score))
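
# Hypothetical usage sketch for generate_candidates.  The output filename, the
# number of candidates, and the exclusion set below are placeholders (not
# values from the original experiments); they only illustrate the expected
# argument types.
def example_generate_candidates():
    train, test = utils.get_train_test_split()
    already_labeled = set(train['pos']) | set(train['neg'])
    generate_candidates(
        num_to_generate=500,
        out_path='candidates.tsv',
        pos=train['pos'],
        neg=train['neg'],
        exclude=already_labeled
    )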

def prune_to_top_k_features(k):
    """
    Prunes the features to only the k features having highest mutual
    information.  Only the features listed in
    extract_features.COUNT_BASED_FEATURES are subjected to this filtering.
    """
    in_path = os.path.join(
        RELATIONAL_NOUN_FEATURES_DIR,
        'accumulated450-min_token_5-min_feat1000')
    out_path = os.path.join(
        RELATIONAL_NOUN_FEATURES_DIR,
        'accumulated450-min_token_5-top_%d' % k)

    # Get the top k features to be kept
    print 'getting the top k features'
    keep_features = get_top_k_features(k)

    # Load the base set of features that we'll be pruning
    features = ef.FeatureAccumulator(
        vocabulary=utils.read_wordnet_index(), load=in_path)

    # Do the pruning
    print 'pruning...'
    features.prune_features_keep_only(keep_features)

    # Save the pruned features
    print 'writing pruned features to disc...'
    features.write(out_path)
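
# Hypothetical usage: keep only the 1000 count-based features with the highest
# mutual information.  This would write the pruned feature set to
# 'accumulated450-min_token_5-top_1000' under RELATIONAL_NOUN_FEATURES_DIR.
def example_prune_to_top_1000():
    prune_to_top_k_features(1000)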

def optimize_syntactic_feature_sets2():

    # We'll write results for this hyperparameter optimization here:
    out_path = os.path.join(
        HYPERPARAMETER_TUNING_DIR, 'optimize_syntactic_feature_sets2.tsv')

    # Read in the training set splits and the features
    train, test = utils.get_train_test_split()
    features_path = os.path.join(
        DATA_DIR, 'relational-noun-features-lexical-wordnet', '0ba')
    features = extract_features.FeatureAccumulator(
        vocabulary=utils.read_wordnet_index(), load=features_path)

    # Define the ranges over which parameters should be varied
    parameter_ranges = {
        'syntax_feature_types': [
            #[],
            #['baseline'],
            #['dependency'],
            #['hand_picked'],
            ['pos_unigram'],
            ['pos_unigram', 'pos_bigram'],
            ['lemma_unigram'],
            ['lemma_unigram', 'lemma_bigram'],
            ['surface_unigram', 'surface_bigram'],
            #['dependency', 'hand_picked'],
            #['baseline', 'hand_picked'],
            #['baseline', 'dependency'],
            #['baseline', 'dependency', 'hand_picked'],
        ]
    }

    # Define the values of parameters to be held constant
    constants = {
        'kind': 'svm',
        'on_unk': False,
        'C': 0.01,
        'semantic_similarity': 'res',
        'include_suffix': True,
        'syntactic_multiplier': 10.0,
        'semantic_multiplier': 2.0,
        'suffix_multiplier': 0.2
    }

    # Generate all combinations of the variable parameters, including the
    # constant parameters in each combination.
    classifier_definitions = test_classifier.generate_classifier_definitions(
        parameter_ranges, constants)

    # Evaluate a classifier for each of the classifier definitions
    test_classifier.optimize_classifier(
        classifier_definitions, features,
        train['pos'], train['neg'], test['pos'], test['neg'],
        out_path, num_procs=1
    )
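
# For reference, a rough sketch of what generate_classifier_definitions is
# assumed to do here: take the Cartesian product of the variable parameter
# ranges and merge each combination with the constant parameters.  The real
# implementation lives in test_classifier; this illustrative version exists
# only to show the shape of output it is expected to produce.
def generate_classifier_definitions_sketch(parameter_ranges, constants):
    import itertools
    names = sorted(parameter_ranges.keys())
    definitions = []
    for values in itertools.product(*(parameter_ranges[n] for n in names)):
        # Each definition gets all constant parameters plus one combination of
        # the variable parameters.
        definition = dict(constants)
        definition.update(zip(names, values))
        definitions.append(definition)
    return definitions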

def prune_features_more():
    in_path = os.path.join(
        RELATIONAL_NOUN_FEATURES_DIR,
        'accumulated450-min_token_5-min_feat5000')
    out_path = os.path.join(
        RELATIONAL_NOUN_FEATURES_DIR,
        'accumulated450-min_token_5-min_feat1000')
    features = ef.FeatureAccumulator(
        vocabulary=utils.read_wordnet_index(), load=in_path)
    features.prune_features(1000)
    features.write(out_path)

def test_get_dep_tree_features(self):

    # Make a mock (empty) dictionary (does not affect test, but needed to
    # create the feature accumulator).
    dictionary = set()

    # Make a mock dependency tree
    F = {'parents': [], 'children': [], 'pos': 'pos_F'}
    E = {'parents': [('rel_F', F)], 'children': [], 'pos': 'pos_E'}
    D = {'parents': [], 'children': [], 'pos': 'pos_D'}
    C = {
        'parents': [('rel_E', E)],
        'children': [('rel_D', D)],
        'pos': 'pos_C'
    }
    B = {'parents': [], 'children': [], 'pos': 'pos_B'}
    BB = {'parents': [], 'children': [], 'pos': 'pos_BB'}
    A = {
        'parents': [('rel_C', C)],
        'children': [('rel_B', B), ('rel_BB', BB)],
        'pos': 'pos_A'
    }

    accumulator = extract_features.FeatureAccumulator(dictionary)
    features = accumulator.get_dep_tree_features_recurse(A, depth=2)

    # Note that because we called it with depth=2, no feature is made for
    # token F
    expected_features = [
        'parent:rel_C:pos_C',
        'parent:rel_C:pos_C-parent:rel_E:pos_E',
        'parent:rel_C:pos_C-child:rel_D:pos_D',
        'child:rel_B:pos_B',
        'child:rel_BB:pos_BB'
    ]
    self.assertItemsEqual(features, expected_features)

def optimize_pruning2():

    # We'll write results for this hyperparameter optimization here:
    out_path = os.path.join(
        HYPERPARAMETER_TUNING_DIR, 'optimize_pruning2.tsv')

    # Read in the training set splits and the features
    train, test = utils.get_train_test_split()
    features = extract_features.FeatureAccumulator(load=os.path.join(
        DATA_DIR, 'relational-noun-features-wordnet-only', 'accumulated'))

    # Define the ranges over which parameters should be varied
    parameter_ranges = {
        'min_feature_frequency': [
            200, 500, 1000, 2000, 5000, 10000,
            #20000, 50000, 100000, 200000, 500000, 1000000,
        ]
    }

    # Define the values of parameters to be held constant
    constants = {
        'kind': 'svm',
        'on_unk': False,
        'C': 0.01,
        'syntax_feature_types': ['baseline', 'dependency', 'hand_picked'],
        'semantic_similarity': 'res',
        'include_suffix': True,
        'syntactic_multiplier': 0.33,
        'semantic_multiplier': 0.33,
        'suffix_multiplier': 0.33,
    }

    # Generate all combinations of the variable parameters, including the
    # constant parameters in each combination.
    classifier_definitions = test_classifier.generate_classifier_definitions(
        parameter_ranges, constants)

    # Evaluate a classifier for each of the classifier definitions
    test_classifier.optimize_classifier(
        classifier_definitions, features,
        train['pos'], train['neg'], test['pos'], test['neg'],
        out_path, num_procs=12
    )

def generate_candidates_ordinal(
    num_to_generate, out_path, pos, neg, neut, exclude, kernel=None,
    features=None
):
    # Open the file to which we will write candidates
    out_f = open(out_path, 'w')

    # Read in the extracted features, which we'll also need for a couple things
    if features is None:
        features = extract_features.FeatureAccumulator(
            load=BEST_WORDNET_ONLY_FEATURES_PATH)

    # Make the best performing classifier.  This is what we'll use to score
    # the "relationalness" of new words.
    clf = classifier.make_classifier(
        kind='osvm',
        kernel=kernel,
        features=features,
        positives=pos,
        negatives=neg,
        neutrals=neut,
        **BEST_CLASSIFIER_CONFIG
    )

    # Now generate the candidates.  We only keep track of the number of
    # positives generated, because there are always more negatives
    num_generated = 0
    filtered_tokens = [
        t for t in features.dictionary.get_token_list() if t not in exclude
    ]
    for token, score in clf.score_parallel(filtered_tokens):
        if score >= 1:
            print '%s\t+' % token
            out_f.write('%s\t+\t%f\n' % (token, score))
            num_generated += 1
            if num_generated == num_to_generate:
                break
        elif score > -1:
            print '\t0\t%s' % token
            out_f.write('%s\t0\t%f\n' % (token, score))
        else:
            print '\t-\t%s' % token
            out_f.write('%s\t-\t%f\n' % (token, score))
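
# Hypothetical call to generate_candidates_ordinal.  The word lists are
# placeholders for whatever labeled data is on hand; the point is only to show
# the extra neutral class and the optional kernel argument.  Scores at or above
# +1 are written as '+', scores between -1 and +1 as '0', and the rest as '-'.
def example_generate_candidates_ordinal(pos_words, neg_words, neut_words):
    generate_candidates_ordinal(
        num_to_generate=500,
        out_path='candidates_ordinal.tsv',
        pos=pos_words,
        neg=neg_words,
        neut=neut_words,
        exclude=set(pos_words) | set(neg_words) | set(neut_words),
        kernel=None  # use the classifier's default kernel
    )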

def calculate_mutual_information(feature_sets, out_fname):

    # Tolerate providing a single feature set.  Make it into a proper set.
    if isinstance(feature_sets, basestring):
        feature_sets = set([feature_sets])
    else:
        feature_sets = set(feature_sets)

    # Separate count-based features from non-count features
    count_based_features = list(feature_sets & set(ef.COUNT_BASED_FEATURES))
    non_count_features = list(feature_sets & set(ef.NON_COUNT_FEATURES))

    # Validation: make sure no unexpected (e.g. misspelled) feature names were
    # provided
    unexpected_features = (
        feature_sets - set(ef.COUNT_BASED_FEATURES)
        - set(ef.NON_COUNT_FEATURES))
    if len(unexpected_features):
        raise ValueError(
            'Unexpected feature(s): %s' % ', '.join(unexpected_features))

    # Define the path at which to write the mutual information results
    out_path = os.path.join(RELATIONAL_NOUN_FEATURES_DIR, out_fname)

    # Load the features
    wni = utils.read_wordnet_index()
    features_path = os.path.join(
        RELATIONAL_NOUN_FEATURES_DIR,
        'accumulated450-min_token_5-min_feat1000')
    start = time.time()
    features = ef.FeatureAccumulator(wni, load=features_path)
    print 'time to read features elapsed: %s' % (time.time() - start)

    # Load relational noun annotations
    annots = annotations.Annotations(features.dictionary)

    # Calculate the mutual information between features and the relational
    # noun annotations, writing the results to out_path
    features.calculate_mutual_information(
        annots, out_path, count_based_features, non_count_features)
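
# Hypothetical usage of calculate_mutual_information.  The feature-set names
# below are illustrative only -- the valid names are whichever strings appear
# in ef.COUNT_BASED_FEATURES and ef.NON_COUNT_FEATURES -- and the output
# filename is a placeholder.
def example_calculate_mutual_information():
    calculate_mutual_information(
        ['dependency', 'hand_picked'],  # assumed to be valid feature-set names
        'mutual-information.tsv'
    )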