def test_from_entries_and_from_matrix():
    """Exercise AssocSpace.from_entries() under several parameter settings."""
    # from_matrix() has no dedicated test; per the original note it is
    # covered implicitly by the calls below.

    # An empty entry list, or one with too few entries, yields no space.
    assert AssocSpace.from_entries([], k=1) is None
    assert AssocSpace.from_entries([(1, 'apple', 'red')], k=1) is None

    # Near-default build: verify a handful of basic properties.
    stripped = AssocSpace.from_entries(ENTRIES, k=4)
    eq_(stripped.k, 4)
    eq_(stripped.sigma[0], 1.0)
    assert stripped.assoc_between_two_terms('apple', 'red') > 0.5
    assert stripped.assoc_between_two_terms('red', 'red') > 0.999
    assert stripped.assoc_between_two_terms('lemon', 'red') < 0.2

    # With strip_a0=False there are negative eigenvalues, so an eigenvalue
    # from the middle is dropped to make room for a0.
    unstripped = AssocSpace.from_entries(ENTRIES, k=4, strip_a0=False)
    eq_(unstripped.k, 4)
    sigma_ratio = unstripped.sigma[-1] / unstripped.sigma[1]
    assert np.allclose(sigma_ratio, stripped.sigma[-1])
    # The matching axis may come back with either sign.
    assert (
        np.allclose(unstripped.u[:, 1], stripped.u[:, 0])
        or np.allclose(unstripped.u[:, 1], -stripped.u[:, 0])
    )

    # Same build, but with normalize_gm=False.
    unnormalized = AssocSpace.from_entries(ENTRIES, k=4, normalize_gm=False)
    eq_(unnormalized.k, 4)
def merge_vector_spaces(subspace_dir, mergers):
    """
    Merge saved AssocSpaces pairwise, then filter the final merge result.

    `mergers` is an iterable of `(sourceA, sourceB, target)` name triples.
    For each triple, the spaces stored under `subspace_dir/sourceA` and
    `subspace_dir/sourceB` are loaded and merged, and the result is saved
    under `subspace_dir/target`.

    After the last step, rows of the final merged space whose squared
    magnitude in `u` is below 1e-5 are dropped; the remaining space is saved
    under `subspace_dir/merged_filtered` and returned.

    Raises ValueError if `mergers` yields no steps, because there is then no
    merged space to filter.
    """
    merged = None
    for sourceA, sourceB, target in mergers:
        print('Merging: %s + %s -> %s' % (sourceA, sourceB, target))
        spaceA = AssocSpace.load_dir(os.path.join(subspace_dir, sourceA))
        spaceB = AssocSpace.load_dir(os.path.join(subspace_dir, sourceB))

        # On the first step, we want to keep all the axes from merging
        # subparts. Through most of the merging, we want to maintain that
        # number of axes. At the end, we want to go back to the original
        # number of axes.
        #
        # For example, when we are merging 300-dimensional spaces, the
        # intermediate merge results will have 600 dimensions, and the final
        # result will have 300 dimensions again.
        #
        # We don't refer to the number of axes in spaceB in this code,
        # because we're assuming all the sub-parts have equal numbers of axes.
        if target.startswith('part'):
            k = spaceA.k * 2
        elif target == 'merged_complete':
            k = spaceA.k // 2
        else:
            k = spaceA.k

        merged = spaceA.merged_with(spaceB, k=k)
        # Drop the inputs before saving so they are not kept alive longer
        # than needed.
        del spaceA
        del spaceB
        merged.save_dir(os.path.join(subspace_dir, target))

    if merged is None:
        # Previously this fell through to `merged.u` and failed with an
        # AttributeError on None; fail with a clear message instead.
        raise ValueError('mergers is empty: there is no merged space to filter')

    magnitudes = (merged.u ** 2).sum(1)
    good_indices = np.flatnonzero(magnitudes >= 1e-5)
    filtered = merged[good_indices]
    filtered.save_dir(os.path.join(subspace_dir, 'merged_filtered'))
    return filtered
def build_assoc_space(input_file, output_dir):
    """
    Build an AssocSpace from a tab-separated association file and save it.

    Each line of `input_file` (read as UTF-8) must consist of exactly three
    tab-separated fields: a left concept, a right concept, and a numeric
    value. Lines where either concept is rejected by `concept_is_bad` are
    skipped. Associations are kept only when both concepts pass
    `concept_is_frequent_enough` and the two concepts differ. Every
    sufficiently frequent concept additionally gets a self-link of weight 1
    and, if its negation is also frequent enough, a link of weight -1 to that
    negation.

    The resulting space is saved to `output_dir`.
    """
    print('loading')
    counts = defaultdict(int)
    triples = []
    # Use a context manager so the input file is closed even when a line
    # fails to parse.
    with codecs.open(input_file, encoding='utf-8') as infile:
        for line in infile:
            left, right, value = line.strip().split('\t')
            if not concept_is_bad(left) and not concept_is_bad(right):
                value = float(value)
                triples.append((value, left, right))
                counts[left] += 1
                counts[right] += 1

    print('filtering entries')
    sparse = SparseEntryStorage()
    for (value, left, right) in triples:
        if (concept_is_frequent_enough(left, counts)
                and concept_is_frequent_enough(right, counts)
                and left != right):
            sparse.add_entry((value, left, right))
    del triples

    # Add links from a concept to itself, and negative links to its opposite
    # if it's there.
    #
    # `counts` is a defaultdict that is handed to helpers inside the loop, and
    # looking up an unseen key (such as a negation) in a defaultdict inserts
    # it. Iterate over a snapshot of the keys so such an insertion cannot
    # invalidate the iteration.
    for concept in list(counts):
        if concept_is_frequent_enough(concept, counts):
            sparse.add_entry((1., concept, concept))
            negation = negate_concept(concept)
            if concept_is_frequent_enough(negation, counts):
                sparse.add_entry((-1., concept, negation))

    print('making assoc space')
    space = AssocSpace.from_sparse_storage(sparse, 150, offset_weight=4e-5)

    print('saving')
    space.save_dir(output_dir)
def test_vectorizing_and_similar_terms():
    """Check vector_from_terms() and the similar-term lookups built on it."""
    assoc = AssocSpace.from_entries(ENTRIES, k=3)

    # Vectorize a weighted bag of terms, one of which is not in the space.
    apple_row = assoc.row_named('apple')
    banana_row = assoc.row_named('banana')
    vector = assoc.vector_from_terms(
        [('apple', 5), ('banana', 22), ('not a term', 17)]
    )

    # A term is (approximately) perfectly associated with itself.
    apple_self = assoc.assoc_between_two_terms('apple', 'apple')
    assert abs(apple_self - 1.0) < 1e-3

    # 'apple' and 'banana' are at least 10% less similar to each other than
    # to themselves.
    assert assoc.assoc_between_two_terms('apple', 'banana') < 0.9

    # The vector should be a linear combination of apple and banana:
    # removing its apple component, and then the component along the part of
    # banana orthogonal to apple, should leave nothing behind.
    unit_apple = normalize(apple_row)
    unit_banana_perp = normalize(
        banana_row - unit_apple * unit_apple.dot(banana_row)
    )
    leftover = vector - unit_apple * unit_apple.dot(vector)
    leftover -= unit_banana_perp * unit_banana_perp.dot(leftover)
    assert norm(leftover) < 1e-3

    # Similar terms come back ordered by descending score.
    labels, scores = zip(*assoc.terms_similar_to_vector(vector))
    eq_(list(scores), sorted(scores, reverse=True))

    # The single best match agrees with the head of the full ranking.
    top_hit = assoc.most_similar_to_vector(vector)
    eq_(top_hit[0], labels[0])
    eq_(top_hit[1], scores[0])

    # Expected relative ranking of a few terms.
    rank_of = labels.index
    assert rank_of('banana') < rank_of('apple')
    assert rank_of('apple') < rank_of('green')
    assert rank_of('apple') < rank_of('celery')
def build_assoc_space(input_file, output_dir):
    """
    Build an AssocSpace from a tab-separated association file and save it.

    Each line of `input_file` (read as UTF-8) must have at least three
    tab-separated fields; the first three are taken as the left concept, the
    right concept, and a numeric value, and any further fields are ignored.
    Lines where either concept is rejected by `concept_is_bad` are skipped.
    Associations are kept only when both concepts pass
    `concept_is_frequent_enough` and the two concepts differ. Every
    sufficiently frequent concept additionally gets a self-link of weight 1
    and, if its negation is also frequent enough, a link of weight -1 to that
    negation.

    The resulting space is saved to `output_dir`.
    """
    print('loading')
    counts = defaultdict(int)
    triples = []
    # Use a context manager so the input file is closed even when a line
    # fails to parse.
    with codecs.open(input_file, encoding='utf-8') as infile:
        for line in infile:
            left, right, value = line.strip().split('\t')[:3]
            if not concept_is_bad(left) and not concept_is_bad(right):
                value = float(value)
                triples.append((value, left, right))
                counts[left] += 1
                counts[right] += 1

    print('filtering entries')
    sparse = SparseEntryStorage()
    for (value, left, right) in triples:
        if (concept_is_frequent_enough(left, counts)
                and concept_is_frequent_enough(right, counts)
                and left != right):
            sparse.add_entry((value, left, right))
    del triples

    # Add links from a concept to itself, and negative links to its opposite
    # if it's there.
    #
    # `counts` is a defaultdict that is handed to helpers inside the loop, and
    # looking up an unseen key (such as a negation) in a defaultdict inserts
    # it. Iterate over a snapshot of the keys so such an insertion cannot
    # invalidate the iteration.
    for concept in list(counts):
        if concept_is_frequent_enough(concept, counts):
            sparse.add_entry((1., concept, concept))
            negation = negate_concept(concept)
            if concept_is_frequent_enough(negation, counts):
                sparse.add_entry((-1., concept, negation))

    print('making assoc space')
    space = AssocSpace.from_sparse_storage(sparse, k=300, offset_weight=1e-4)

    print('saving')
    space.save_dir(output_dir)
def optimizeAllAndInferConceptsModelTwo(assocDir):
    """
    Build and solve the "psl2" optimization model, then write the solution.

    Loads the AssocSpace stored in `assocDir`, gathers target words and
    decision variables through the module's helper functions, builds the
    objective with `createObjective`, minimizes it, and writes the model to
    'out2.lp' and 'out2.sol'. The solution is finally printed via
    `printSolution` into the file named
    `sys.argv[1] + sys.argv[2] + "_inferred.txt"`.

    NOTE(review): `Model`, `LinExpr` and `GRB` are presumably gurobipy names;
    confirm against the file's imports.
    """
    # Load the association space.
    assocSpace = AssocSpace.load_dir(assocDir)

    # Targets and image-indices dictionaries, filled in by the loader below.
    targetsToImageIndicesAndWeights = {}  # target-word -> [(index, weight_i), ...]
    targetsToCentralities = {}  # target-word -> centrality-score
    loadTargetWordsFromAllImages(targetsToCentralities,
                                 targetsToImageIndicesAndWeights)

    # Model
    m = Model("psl2")
    variables = set()
    targets = {}
    loadDecisionVariablesForTargets(m, targets, variables,
                                    targetsToImageIndicesAndWeights)

    ## TODO: populate the rules
    objective = LinExpr()
    objective = createObjective(m, targets, variables, objective, assocSpace,
                                targetsToCentralities,
                                targetsToImageIndicesAndWeights)

    m.update()
    m.setObjective(objective)

    # The objective is to minimize the costs
    m.modelSense = GRB.MINIMIZE

    # Update model to integrate new variables
    m.update()

    m.optimize()
    m.write('out2.lp')
    m.write('out2.sol')

    # Close the output file deterministically; it was previously left open.
    with open(sys.argv[1] + sys.argv[2] + "_inferred.txt", "w") as outputFile:
        printSolution(m, targets, outputFile)
def test_truncation():
    """truncated_to() keeps the leading axes and leaves the labels alone."""
    kept_axes = 2
    full = AssocSpace.from_entries(ENTRIES, k=3)
    shortened = full.truncated_to(kept_axes)

    # u and sigma are cut down to their first two axes.
    assert np.allclose(shortened.u, full.u[:, :kept_axes])
    assert np.allclose(shortened.sigma, full.sigma[:kept_axes])

    # The label set is unchanged.
    eq_(shortened.labels, full.labels)

    # The first row of the truncated assoc matrix has a norm just below 1.
    first_row_norm = norm(shortened.assoc[0])
    assert 0.999 < first_row_norm < 1.0
def test_strip_a0():
    """Stripping a0 makes AssocSpace keep axes [1,k] rather than [0,k-1]."""
    unstripped = AssocSpace.from_entries(entries, 3, strip_a0=False)
    stripped = AssocSpace.from_entries(entries, 3, strip_a0=True)

    # The unstripped space still has the requested number of axes.
    eq_(unstripped.u.shape[1], 3)

    def check_shifted_by_one_axis(candidate):
        # Axis 0 of the candidate matches axis 1 of the unstripped space,
        # up to sign.
        assert np.allclose(np.abs(unstripped.u[:, 1]),
                           np.abs(candidate.u[:, 0]))
        # The ratio between consecutive sigma values is preserved.
        assert np.allclose(unstripped.sigma[1] / unstripped.sigma[2],
                           candidate.sigma[0] / candidate.sigma[1])

    check_shifted_by_one_axis(stripped)

    # Dropping the first axis of a default build must satisfy the same checks.
    dropped = AssocSpace.from_entries(entries, 3).with_first_axis_dropped()
    check_shifted_by_one_axis(dropped)
def load_assoc():
    """
    Load the association matrix.

    The loaded space is cached in the module-level `commonsense_assoc`
    global: if that global is already truthy it is returned as-is, otherwise
    the space is loaded from `ASSOC_DIR`, stored in the global, and returned.

    Requires the open source Python package 'assoc_space'.
    """
    global commonsense_assoc
    if commonsense_assoc:
        return commonsense_assoc
    commonsense_assoc = AssocSpace.load_dir(ASSOC_DIR)
    return commonsense_assoc
def test_merging():
    """Spot-check that AssocSpace.merged_with() behaves sensibly."""
    # The merging math itself is covered in test_eigenmath; this only checks
    # that AssocSpace wires it up reasonably.
    first = AssocSpace.from_entries(ENTRIES, k=4)
    second = AssocSpace.from_entries(MORE_ENTRIES, k=4)

    # Without an explicit k, the merge result has 8 axes.
    merged = first.merged_with(second)
    eq_(merged.k, 8)

    # With k=4 the result is limited to four axes.
    merged = first.merged_with(second, k=4)
    eq_(merged.k, 4)

    merged_labels = ' '.join(merged.labels)
    eq_(merged_labels,
        'apple red green celery orange banana yellow lemon blue tasty ferret')
    assert merged.assoc_between_two_terms('ferret', 'yellow') > 0.5

    # The merged association of apple/red lies strictly between the values
    # from the two input spaces.
    in_second = second.assoc_between_two_terms('apple', 'red')
    in_merged = merged.assoc_between_two_terms('apple', 'red')
    in_first = first.assoc_between_two_terms('apple', 'red')
    assert in_second < in_merged < in_first
def test_merging():
    """Spot-check that AssocSpace.merged_with() behaves sensibly."""
    # The merging math itself is covered in test_eigenmath; this only checks
    # that AssocSpace wires it up reasonably.
    first = AssocSpace.from_entries(ENTRIES, k=4)
    second = AssocSpace.from_entries(MORE_ENTRIES, k=4)

    # Without an explicit k, the merge result has 8 axes.
    merged = first.merged_with(second)
    eq_(merged.k, 8)

    # With k=4 the result is limited to four axes.
    merged = first.merged_with(second, k=4)
    eq_(merged.k, 4)

    merged_labels = ' '.join(merged.labels)
    eq_(merged_labels,
        'apple red green celery orange banana yellow lemon blue tasty ferret')
    assert merged.assoc_between_two_terms('ferret', 'yellow') > 0.5

    # The merged association of apple/red lies strictly between the values
    # from the two input spaces.
    in_second = second.assoc_between_two_terms('apple', 'red')
    in_merged = merged.assoc_between_two_terms('apple', 'red')
    in_first = first.assoc_between_two_terms('apple', 'red')
    assert in_second < in_merged < in_first
def load(self):
    """
    Load the AssocSpace at `self.path` into `self.assoc`, once.

    Does nothing when `self.assoc` is already set. Raises MissingAssocSpace
    if an ImportError or a ZeroDivisionError occurs while importing the
    `assoc_space` package or loading the directory.
    """
    if self.assoc is None:
        not_installed = "The assoc_space package is not installed."
        not_loadable = ("The space of term associations could not "
                        "be loaded.")
        try:
            # Imported lazily so the package is only needed when a space is
            # actually requested.
            from assoc_space import AssocSpace
            self.assoc = AssocSpace.load_dir(self.path)
        except ImportError:
            raise MissingAssocSpace(not_installed)
        except ZeroDivisionError:
            raise MissingAssocSpace(not_loadable)
def test_assoc_constructor():
    """Check the AssocSpace constructor: happy path, errors, assoc hinting."""
    # A well-formed space: two labelled rows, three axes.
    u = np.asarray([[0, 1, 0.6], [1, 0, 0.8]])
    sigma = np.asarray([0.5, 0.3, 0.2])
    labels = LabelSet(['A', 'B'])

    space = AssocSpace(u, sigma, labels)
    eq_(space.k, 3)
    # The assoc matrix has not been stored on the instance at this point.
    assert 'assoc' not in space.__dict__

    # Each of these sigma vectors must be rejected with ValueError.
    # NOTE(review): presumably rejected for non-positive values, a length
    # that does not match u's three columns, and non-descending order,
    # respectively -- confirm against the constructor.
    rejected_sigmas = (
        [0.0, -0.2, -0.4],
        [0.6, 0.4],
        [0.6, 0.7, 0.2],
    )
    for bad_sigma in rejected_sigmas:
        with assert_raises(ValueError):
            AssocSpace(u, np.asarray(bad_sigma), labels)

    # Supplying a precomputed assoc matrix gives the same rows.
    hint = space.assoc.copy()
    hinted_space = AssocSpace(u, sigma, labels, assoc=hint)
    assert np.allclose(hinted_space.row_named('A'), space.row_named('A'))
def load(self):
    """
    Load the AssocSpace at `self.path` into `self.assoc`, once.

    Does nothing when `self.assoc` is already set. Raises MissingAssocSpace
    if an ImportError or a ZeroDivisionError occurs while importing the
    `assoc_space` package or loading the directory.
    """
    if self.assoc is None:
        not_installed = "The assoc_space package is not installed."
        not_loadable = ("The space of term associations could not "
                        "be loaded.")
        try:
            # Imported lazily so the package is only needed when a space is
            # actually requested.
            from assoc_space import AssocSpace
            self.assoc = AssocSpace.load_dir(self.path)
        except ImportError:
            raise MissingAssocSpace(not_installed)
        except ZeroDivisionError:
            raise MissingAssocSpace(not_loadable)
def test_filter():
    """Filtering an AssocSpace keeps k, narrows labels, and redecomposes."""
    original = AssocSpace.from_entries(ENTRIES, k=5)
    filtered = original.filter(_filter)

    # Basic properties of the filtered space.
    eq_(filtered.k, 5)
    kept_labels = ' '.join(filtered.labels)
    eq_(kept_labels, 'red green celery banana lemon')

    # Redecomposition happened: the second column of u has unit norm.
    second_axis_norm = norm(filtered.u[:, 1])
    assert np.allclose(second_axis_norm, 1.0)

    # Redecomposition can be kind of weird, but this ordering is intuitive.
    red_banana_before = original.assoc_between_two_terms('red', 'banana')
    red_banana_after = filtered.assoc_between_two_terms('red', 'banana')
    yellow_banana_before = original.assoc_between_two_terms('yellow', 'banana')
    assert red_banana_before < red_banana_after < yellow_banana_before
def test_filter():
    """Filtering an AssocSpace keeps k, narrows labels, and redecomposes."""
    original = AssocSpace.from_entries(ENTRIES, k=5)
    filtered = original.filter(_filter)

    # Basic properties of the filtered space.
    eq_(filtered.k, 5)
    kept_labels = ' '.join(filtered.labels)
    eq_(kept_labels, 'red green celery banana lemon')

    # Redecomposition happened: the second column of u has unit norm.
    second_axis_norm = norm(filtered.u[:, 1])
    assert np.allclose(second_axis_norm, 1.0)

    # Redecomposition can be kind of weird, but this ordering is intuitive.
    red_banana_before = original.assoc_between_two_terms('red', 'banana')
    red_banana_after = filtered.assoc_between_two_terms('red', 'banana')
    yellow_banana_before = original.assoc_between_two_terms('yellow', 'banana')
    assert red_banana_before < red_banana_after < yellow_banana_before
def run():
    """Build a small AssocSpace from fixed entries and save it as test data."""
    # (weight, term, term) triples over a handful of English concepts.
    sample_entries = [
        (4, '/c/en/apple', '/c/en/red'),
        (1, '/c/en/apple', '/c/en/green'),
        (3, '/c/en/apple', '/c/en/orange'),
        (3, '/c/en/banana', '/c/en/orange'),
        (1, '/c/en/banana', '/c/en/yellow'),
        (0.5, '/c/en/lemon', '/c/en/yellow'),
        (1.5, '/c/en/orange', '/c/en/lemon'),
        (0.1, '/c/en/apple', '/c/en/lemon'),
        (0.2, '/c/en/banana', '/c/en/lemon'),
        (0.5, '/c/en/ideas', '/c/en/colorless'),
        (0.5, '/c/en/ideas', '/c/en/green'),
        (1, '/c/en/example', '/c/en/green'),
    ]
    destination = '../conceptnet5/support_data/testdata/input/assoc_space'
    AssocSpace.from_entries(sample_entries, k=4).save_dir(destination)
from assoc_space import AssocSpace import sys import threading import math def computeNormalizedValue(value, maxV, minV, addOne=False): if addOne: return (value - minV + 1) / (maxV - minV + 1) return (value - minV) / (maxV - minV) if len(sys.argv) < 4: print "python conceptnetAssocSpace.py <seedsfile> <targetfile> <AssocSpaceDirectory>" sys.exit() assocSpace = AssocSpace.load_dir(sys.argv[3]) words = [] minSimilarity = -0.358846 maxSimilarity = 0.999747 minCentrality = -0.00188222 maxCentrality = 0.00324597 with open(sys.argv[1], "r") as f: i = 0 for line in f: if line.startswith("##"): continue words = line.split("\t") word1 = "/c/en/" + words[0].strip() with open(sys.argv[2], "r") as f2: for line in f2: if line.startswith("##"):
def test_pickle_round_trip():
    """Pickling and then unpickling an AssocSpace yields an equal space."""
    original = AssocSpace.from_entries(entries, 3)
    restored = pickle.loads(pickle.dumps(original))
    eq_(original, restored)
def test_dir_round_trip():
    """An AssocSpace survives being saved to a directory and loaded back."""
    import os
    import shutil
    import tempfile

    # Work inside a private scratch directory instead of the fixed, shared
    # '/tmp/assoc_test' path: that path can collide between concurrent runs
    # or users, does not exist on every platform, and was never cleaned up.
    scratch = tempfile.mkdtemp()
    try:
        target = os.path.join(scratch, 'assoc_test')
        assoc = AssocSpace.from_entries(ENTRIES, k=3)
        assoc.save_dir(target)
        assoc2 = AssocSpace.load_dir(target)
        eq_(assoc, assoc2)
    finally:
        # Best-effort cleanup; a leftover scratch directory must not fail
        # the test.
        shutil.rmtree(scratch, ignore_errors=True)
def test_dir_round_trip():
    """An AssocSpace survives being saved to a directory and loaded back."""
    import os
    import shutil
    import tempfile

    # Work inside a private scratch directory instead of the fixed, shared
    # '/tmp/assoc_test' path: that path can collide between concurrent runs
    # or users, does not exist on every platform, and was never cleaned up.
    scratch = tempfile.mkdtemp()
    try:
        target = os.path.join(scratch, 'assoc_test')
        assoc = AssocSpace.from_entries(entries, 3)
        assoc.save_dir(target)
        assoc2 = AssocSpace.load_dir(target)
        eq_(assoc, assoc2)
    finally:
        # Best-effort cleanup; a leftover scratch directory must not fail
        # the test.
        shutil.rmtree(scratch, ignore_errors=True)
def test_pickle_round_trip():
    """Pickling and then unpickling an AssocSpace yields an equal space."""
    original = AssocSpace.from_entries(ENTRIES, k=3)
    restored = pickle.loads(pickle.dumps(original))
    eq_(original, restored)
def test_association_calculations():
    """Sanity-check association scores between a few terms."""
    space = AssocSpace.from_entries(entries, 3)

    # A term is (approximately) perfectly associated with itself.
    self_score = space.assoc_between_two_terms('apple', 'apple')
    assert abs(self_score - 1.0) < 1e-3

    # Two different terms score noticeably lower.
    cross_score = space.assoc_between_two_terms('apple', 'banana')
    assert cross_score < 0.9
def main(dir):
    """Load the AssocSpace stored in `dir` and pass it to `test`."""
    test(AssocSpace.load_dir(dir))
if not os.path.isfile(sortedIndicesFileName): sim = assocSpace.assoc.dot(assocSpace.row_named("/c/en/" + word)) indices = np.argsort(sim)[::-1] np.savez_compressed(sortedIndicesFileName, indices[:1000]) sim_first1k = np.array([sim[index] for index in indices[:1000]]) np.savez_compressed(simFileName, sim_first1k) sim = np.load(simFileName) indices = np.load(sortedIndicesFileName) data = [] for index in indices: if len(data) == limit: break if filterEnglishWords(names[index]): data.append((names[index], sim[index])) return data minSimilarity = -1 maxSimilarity = 1 minCentrality = -0.00188222 maxCentrality = 0.00324597 assocDir = "../conceptnet5/data/assoc/assoc-space-5.4" assocSpace = AssocSpace.load_dir(assocDir) names = assocSpace.labels word2vec_model = models.word2vec.Word2Vec.load_word2vec_format( '../../../DATASETS/GoogleNews-vectors-negative300.bin', binary=True) word2vec_model.init_sims(replace=True) TOO_RARE_WORD_CODE = -3 NOT_FOUND_IN_CORPUS_CODE = -2