def test_combined_vector_sparse(self):
    """Check ``combined_vector`` on a space built from ``sparse_space``.

    The combined vector of several words must equal the element-wise sum
    of the matching rows of the dense ``space`` fixture.
    """
    sparse_semspace = SemanticSpace(sparse_space, rows, columns,
                                    readme_title, readme_desc,
                                    prenorm=False)
    # (words to combine, positions of those words' rows in ``space``)
    cases = (
        (['first', 'third'], [0, 2]),
        (['second', 'fourth'], [1, 3]),
    )
    for words, row_ids in cases:
        combined = sparse_semspace.combined_vector(words)
        assert (space[row_ids, :].sum(0) == combined).all()
def test_metrics(self):
    """Smoke-test ``pair_distances`` with every allowed metric.

    No values are asserted; the test only requires that each metric
    reported by ``allowed_metrics()`` can be used without raising.
    """
    semspace = SemanticSpace(space, rows, columns, readme_title,
                             readme_desc, prenorm=False)
    # Loop-invariant, so built once. The list mixes ordinary pairs with
    # pairs containing 'twelfth' / 'thirteenth'.
    pairs = [('first', 'second'), ('third', 'eighth'),
             ('twelfth', 'first'), ('twelfth', 'thirteenth')]
    for metric in semspace.allowed_metrics():
        # Function-call form works on both Python 2 and Python 3.
        print(metric)
        # Exercise the same instance whose metric list is being iterated.
        semspace.pair_distances(pairs, metric=metric)
def test_combined_vector_prenorm(self):
    """A combined vector from a prenormalized space must have norm 1."""
    semspace = SemanticSpace(space, rows, columns, readme_title,
                             readme_desc, prenorm=True)
    vector = semspace.combined_vector(['first', 'third'])
    # The norm is a floating-point result, so compare with a tolerance
    # instead of demanding bit-exact equality with 1.
    self.assertAlmostEqual(np.linalg.norm(vector), 1.0)
def test_prenorm_exception_on_non_cosine(self):
    """On a prenormalized space cosine is accepted, euclidean raises."""
    prenormed = SemanticSpace(space, rows, columns, readme_title,
                              readme_desc, prenorm=True)
    word_pair = ('first', 'second')

    # The cosine metric has to go through without an error ...
    prenormed.pair_distance(*word_pair, metric='cosine')

    # ... whereas asking for euclidean distance must fail.
    with self.assertRaises(Exception):
        prenormed.pair_distance(*word_pair, metric='euclidean')
def test_metrics(self):
    """Smoke-test ``pair_distances`` with every allowed metric.

    No values are asserted; the test only requires that each metric
    reported by ``allowed_metrics()`` can be used without raising.
    """
    semspace = SemanticSpace(space, rows, columns, readme_title,
                             readme_desc, prenorm=False)
    # Loop-invariant, so built once. The list mixes ordinary pairs with
    # pairs containing 'twelfth' / 'thirteenth'.
    pairs = [('first', 'second'), ('third', 'eighth'),
             ('twelfth', 'first'), ('twelfth', 'thirteenth')]
    for metric in semspace.allowed_metrics():
        print(metric)
        # Exercise the same instance whose metric list is being iterated,
        # rather than the separately configured ``self.semspace``.
        semspace.pair_distances(pairs, metric=metric)
def test_combined_vector_sparse(self):
    """Compare ``combined_vector`` of a ``sparse_space``-based space
    with the summed rows of the dense ``space`` fixture."""
    semspace_under_test = SemanticSpace(
        sparse_space, rows, columns, readme_title, readme_desc,
        prenorm=False)

    # Each word list is paired with the indices of its rows in ``space``.
    expectations = [
        (['first', 'third'], [0, 2]),
        (['second', 'fourth'], [1, 3]),
    ]
    for word_list, indices in expectations:
        result = semspace_under_test.combined_vector(word_list)
        row_sum = space[indices, :].sum(0)
        assert (row_sum == result).all()
def load_semspace(semspace_path, semspace_format='semspace'):
    """Load a semantic space based on the path and format.

    The loaded space is stored in the module-level ``semspace`` variable.

    Args:
        semspace_path: Location of the stored space; handed unchanged to
            the ``SemanticSpace`` loader.
        semspace_format: ``'ssmarket'`` or ``'csv'``.

    Returns:
        ``True`` once the space has been loaded.

    Raises:
        ValueError: If ``semspace_format`` is not a recognised format.

    NOTE(review): the default value ``'semspace'`` is not one of the
    handled formats, so calling this without an explicit format raises.
    Confirm whether a loader for that format is missing here.
    """
    global semspace

    if semspace_format == 'ssmarket':
        semspace = SemanticSpace.from_ssmarket(semspace_path,
                                               prenorm=prenormalize)
    elif semspace_format == 'csv':
        semspace = SemanticSpace.from_csv(semspace_path,
                                          prenorm=prenormalize,
                                          dtype=numpy_dtype)
    else:
        # ValueError is still caught by callers using ``except Exception``.
        raise ValueError("Space format '%s' unknown!" % semspace_format)
    return True
def test_cosine_equals_prenorm(self):
    """Cosine distances must agree between raw and prenormalized spaces.

    Checked for a single pair via ``pair_distance`` and for a batch of
    pairs via ``pair_distances``.
    """
    semspace = SemanticSpace(space, rows, columns, readme_title,
                             readme_desc, prenorm=True)

    cosine_non_prenorm = self.semspace.pair_distance(
        'first', 'second', metric='cosine')
    cosine_prenorm = semspace.pair_distance(
        'first', 'second', metric='cosine')
    # The two spaces hold differently scaled vectors, so the results may
    # differ in the last bits; compare with a tolerance.
    self.assertAlmostEqual(cosine_non_prenorm, cosine_prenorm)

    pairs = [('first', 'second'), ('third', 'eighth'),
             ('twelfth', 'first'), ('twelfth', 'thirteenth')]
    pairs_sims_non_prenorm = self.semspace.pair_distances(pairs)
    pairs_sims_prenorm = semspace.pair_distances(pairs)

    # Both results must cover exactly the same pairs ...
    self.assertEqual(set(pairs_sims_non_prenorm.keys()),
                     set(pairs_sims_prenorm.keys()))
    # ... and agree on every distance up to floating-point noise.
    for pair in pairs_sims_non_prenorm.keys():
        self.assertAlmostEqual(pairs_sims_non_prenorm[pair],
                               pairs_sims_prenorm[pair])
def test_prenormalization(self):
    """Every vector stored in a prenormalized space has norm 1."""
    normed_space = SemanticSpace(space, rows, columns, readme_title,
                                 readme_desc, prenorm=True)
    for vec in normed_space.vectors:
        length = np.linalg.norm(vec)
        # Echo the value so a failing row is easy to identify.
        print(length, vec)
        self.assertAlmostEqual(length, 1.0)
def test_allowed_metrics_when_prenormed(self):
    """``allowed_metrics`` depends on whether the space is prenormalized."""
    raw_space = SemanticSpace(space, rows, columns, readme_title,
                              readme_desc, prenorm=False)
    normed_space = SemanticSpace(space, rows, columns, readme_title,
                                 readme_desc, prenorm=True)

    # Cosine is offered by both kinds of space.
    for candidate in (raw_space, normed_space):
        self.assertIn('cosine', candidate.allowed_metrics())

    # Manhattan is offered only when the space is not prenormalized.
    self.assertNotIn('manhattan', normed_space.allowed_metrics())
    self.assertIn('manhattan', raw_space.allowed_metrics())
from semspaces.space import SemanticSpace
import numpy as np

# Demo fixture: eleven row vectors with three components each.
space = np.array([
    [0.61502426, 0.35800892, 0.46591138],
    [0.06256679, 0.80705953, 0.87805124],
    [0.18189868, 0.37707662, 0.89973192],
    [0.32667934, 0.0994168, 0.75457225],
    [0.43300126, 0.17586539, 0.88097073],
    [0.62085788, 0.29817756, 0.62991792],
    [0.37163458, 0.86633926, 0.31679958],
    [0.37416635, 0.82935107, 0.34275204],
    [0.26996958, 0.57101081, 0.60706083],
    [0.36690094, 0.70666147, 0.3300295],
    [0.19479401, 0.3334173, 0.79296408],
])

# Row labels, one per vector of ``space`` and in the same order.
rows = ('first second third fourth fifth sixth '
        'seventh eighth ninth tenth eleventh').split()

# Column labels, one per vector component.
columns = 'one two three'.split()

readme_title = 'Random semantic space'
readme_desc = 'Demo semantic space description.'

# Ready-made space assembled from the fixtures above.
example_semspace = SemanticSpace(space, rows, columns,
                                 readme_title, readme_desc)
def setUp(self):
    """Create a fresh ``SemanticSpace`` from the shared fixtures and
    store it on ``self.semspace`` for the test about to run."""
    fixture_args = (space, rows, columns, readme_title, readme_desc)
    self.semspace = SemanticSpace(*fixture_args)
class TestSemanticSpace(unittest.TestCase):
    """Unit tests for ``SemanticSpace`` built on the module-level fixtures
    (``space``, ``rows``, ``columns``, ``readme_title``, ``readme_desc``)."""

    def setUp(self):
        """Create a fresh space with default options for every test."""
        self.semspace = SemanticSpace(space, rows, columns, readme_title,
                                      readme_desc)

    def test_defined_at_words(self):
        """``defined_at`` is true for a row label and false otherwise."""
        assert self.semspace.defined_at('first')
        assert not self.semspace.defined_at('twelfth')
        # 'one' is a column label, not a row label.
        assert not self.semspace.defined_at('one')

    def test_defined_at_seqs(self):
        """A word list with any unknown word is not defined."""
        assert self.semspace.defined_at(['first', 'second'])
        assert not self.semspace.defined_at(['first', 'twelfth'])
        assert not self.semspace.defined_at(['one', 'twelfth'])

    def test_similarity_pairs(self):
        """Self-distance is ~0; two different words are a positive
        distance apart."""
        self.assertAlmostEqual(
            self.semspace.pair_distance('first', 'first'), 0)
        assert self.semspace.pair_distance('first', 'second') > 1e-10

    def test_prenormalization(self):
        """Every vector stored in a prenormalized space has norm 1."""
        semspace = SemanticSpace(space, rows, columns, readme_title,
                                 readme_desc, prenorm=True)
        for row in semspace.vectors:
            row_norm = np.linalg.norm(row)
            print(row_norm, row)
            self.assertAlmostEqual(row_norm, 1.0)

    def test_prenorm_exception_on_non_cosine(self):
        """On a prenormalized space cosine works, euclidean raises."""
        semspace = SemanticSpace(space, rows, columns, readme_title,
                                 readme_desc, prenorm=True)
        # cosine should work on prenormalized space
        semspace.pair_distance('first', 'second', metric='cosine')
        # but not euclidean
        with self.assertRaises(Exception):
            semspace.pair_distance('first', 'second', metric='euclidean')

    def test_cosine_equals_prenorm(self):
        """Cosine distances agree between raw and prenormalized spaces."""
        semspace = SemanticSpace(space, rows, columns, readme_title,
                                 readme_desc, prenorm=True)
        cosine_non_prenorm = self.semspace.pair_distance(
            'first', 'second', metric='cosine')
        cosine_prenorm = semspace.pair_distance(
            'first', 'second', metric='cosine')
        # The two spaces hold differently scaled vectors, so the results
        # may differ in the last bits; compare with a tolerance.
        self.assertAlmostEqual(cosine_non_prenorm, cosine_prenorm)

        pairs = [('first', 'second'), ('third', 'eighth'),
                 ('twelfth', 'first'), ('twelfth', 'thirteenth')]
        pairs_sims_non_prenorm = self.semspace.pair_distances(pairs)
        pairs_sims_prenorm = semspace.pair_distances(pairs)
        # Same pairs on both sides, and matching distances up to
        # floating-point noise.
        self.assertEqual(set(pairs_sims_non_prenorm.keys()),
                         set(pairs_sims_prenorm.keys()))
        for pair in pairs_sims_non_prenorm.keys():
            self.assertAlmostEqual(pairs_sims_non_prenorm[pair],
                                   pairs_sims_prenorm[pair])

    def test_subset(self):
        """``subset`` keeps only the requested words and leaves the
        distances between them unchanged."""
        words = ['third', 'second', 'tenth', 'eighth']
        subset = self.semspace.subset(words)
        assert subset.vectors.shape == (4, 3)
        assert not subset.defined_at('first')
        assert subset.defined_at('second')
        assert subset.defined_at('third')
        assert subset.defined_at('eighth')
        assert not subset.defined_at('ninth')
        assert subset.defined_at('tenth')
        self.assertAlmostEqual(
            self.semspace.pair_distance('second', 'third'),
            subset.pair_distance('second', 'third'))
        self.assertAlmostEqual(
            self.semspace.pair_distance('third', 'tenth'),
            subset.pair_distance('third', 'tenth'))

    def test_pair_distances(self):
        """``pair_distances`` omits pairs with unknown words by default
        and reports them as NaN when ``na_val=True``."""
        pairs = [('first', 'second'), ('third', 'eighth'),
                 ('twelfth', 'first'), ('twelfth', 'thirteenth')]
        first_second = self.semspace.pair_distance('first', 'second')

        pairs_sims = self.semspace.pair_distances(pairs)
        assert pairs_sims[('first', 'second')] == first_second
        assert ('third', 'eighth') in pairs_sims
        assert ('twelfth', 'first') not in pairs_sims
        assert ('twelfth', 'thirteenth') not in pairs_sims

        pairs_sims_nan = self.semspace.pair_distances(pairs, na_val=True)
        assert pairs_sims_nan[('first', 'second')] == first_second
        assert ('third', 'eighth') in pairs_sims_nan
        assert ('twelfth', 'first') in pairs_sims_nan
        assert ('twelfth', 'thirteenth') in pairs_sims_nan
        # Test for NaN by value: an identity check against ``np.nan``
        # breaks as soon as the NaN is produced any other way.
        assert np.isnan(pairs_sims_nan[('twelfth', 'first')])
        assert np.isnan(pairs_sims_nan[('twelfth', 'thirteenth')])

    def test_vector_entropy(self):
        """``vector_entropy`` reproduces the recorded reference values."""
        self.assertAlmostEqual(
            self.semspace.vector_entropy('first'), 1.5502257500054266)
        self.assertAlmostEqual(
            self.semspace.vector_entropy('fifth'), 1.3302170534376188)
        self.assertAlmostEqual(
            self.semspace.vector_entropy('second'), 0.99871934706694587)

    def test_allowed_metrics_when_prenormed(self):
        """Cosine is always allowed; manhattan only without prenorm."""
        semspace = SemanticSpace(space, rows, columns, readme_title,
                                 readme_desc, prenorm=False)
        semspace_p = SemanticSpace(space, rows, columns, readme_title,
                                   readme_desc, prenorm=True)
        self.assertIn('cosine', semspace.allowed_metrics())
        self.assertIn('cosine', semspace_p.allowed_metrics())
        self.assertNotIn('manhattan', semspace_p.allowed_metrics())
        self.assertIn('manhattan', semspace.allowed_metrics())

    def test_metrics(self):
        """Smoke-test ``pair_distances`` with every allowed metric."""
        semspace = SemanticSpace(space, rows, columns, readme_title,
                                 readme_desc, prenorm=False)
        # Loop-invariant, so built once.
        pairs = [('first', 'second'), ('third', 'eighth'),
                 ('twelfth', 'first'), ('twelfth', 'thirteenth')]
        for metric in semspace.allowed_metrics():
            print(metric)
            # Use the instance whose metric list is being iterated.
            semspace.pair_distances(pairs, metric=metric)

    def test_combined_vector(self):
        """Without prenorm the combined vector is the sum of the rows."""
        semspace = SemanticSpace(space, rows, columns, readme_title,
                                 readme_desc, prenorm=False)
        vector = semspace.combined_vector(['first', 'third'])
        assert (space[[0, 2], :].sum(0) == vector).all()
        vector = semspace.combined_vector(['second', 'fourth'])
        assert (space[[1, 3], :].sum(0) == vector).all()

    def test_combined_vector_prenorm(self):
        """With prenorm the combined vector has norm 1."""
        semspace = SemanticSpace(space, rows, columns, readme_title,
                                 readme_desc, prenorm=True)
        vector = semspace.combined_vector(['first', 'third'])
        # Floating-point result: compare with a tolerance, not exactly.
        self.assertAlmostEqual(np.linalg.norm(vector), 1.0)

    def test_combined_vector_sparse(self):
        """Same as ``test_combined_vector`` for a ``sparse_space`` input."""
        # NOTE(review): ``sparse_space`` is not defined in the visible part
        # of this module - confirm it is provided elsewhere.
        semspace = SemanticSpace(sparse_space, rows, columns, readme_title,
                                 readme_desc, prenorm=False)
        vector = semspace.combined_vector(['first', 'third'])
        assert (space[[0, 2], :].sum(0) == vector).all()
        vector = semspace.combined_vector(['second', 'fourth'])
        assert (space[[1, 3], :].sum(0) == vector).all()
class TestSemanticSpace(unittest.TestCase):
    """Unit tests for ``SemanticSpace`` built on the module-level fixtures
    (``space``, ``rows``, ``columns``, ``readme_title``, ``readme_desc``)."""

    def setUp(self):
        """Create a fresh space with default options for every test."""
        self.semspace = SemanticSpace(space, rows, columns, readme_title,
                                      readme_desc)

    def test_defined_at_words(self):
        """``defined_at`` is true for a row label and false otherwise."""
        assert self.semspace.defined_at('first')
        assert not self.semspace.defined_at('twelfth')
        # 'one' is a column label, not a row label.
        assert not self.semspace.defined_at('one')

    def test_defined_at_seqs(self):
        """A word list with any unknown word is not defined."""
        assert self.semspace.defined_at(['first', 'second'])
        assert not self.semspace.defined_at(['first', 'twelfth'])
        assert not self.semspace.defined_at(['one', 'twelfth'])

    def test_similarity_pairs(self):
        """Self-distance is ~0; two different words are a positive
        distance apart."""
        self.assertAlmostEqual(
            self.semspace.pair_distance('first', 'first'), 0)
        assert self.semspace.pair_distance('first', 'second') > 1e-10

    def test_prenormalization(self):
        """Every vector stored in a prenormalized space has norm 1."""
        semspace = SemanticSpace(space, rows, columns, readme_title,
                                 readme_desc, prenorm=True)
        for row in semspace.vectors:
            row_norm = np.linalg.norm(row)
            # Function-call form: the print statement is a syntax error
            # on Python 3.
            print(row_norm, row)
            self.assertAlmostEqual(row_norm, 1.0)

    def test_prenorm_exception_on_non_cosine(self):
        """On a prenormalized space cosine works, euclidean raises."""
        semspace = SemanticSpace(space, rows, columns, readme_title,
                                 readme_desc, prenorm=True)
        # cosine should work on prenormalized space
        semspace.pair_distance('first', 'second', metric='cosine')
        # but not euclidean
        with self.assertRaises(Exception):
            semspace.pair_distance('first', 'second', metric='euclidean')

    def test_cosine_equals_prenorm(self):
        """Cosine distances agree between raw and prenormalized spaces."""
        semspace = SemanticSpace(space, rows, columns, readme_title,
                                 readme_desc, prenorm=True)
        cosine_non_prenorm = self.semspace.pair_distance(
            'first', 'second', metric='cosine')
        cosine_prenorm = semspace.pair_distance(
            'first', 'second', metric='cosine')
        # The two spaces hold differently scaled vectors, so the results
        # may differ in the last bits; compare with a tolerance.
        self.assertAlmostEqual(cosine_non_prenorm, cosine_prenorm)

        pairs = [('first', 'second'), ('third', 'eighth'),
                 ('twelfth', 'first'), ('twelfth', 'thirteenth')]
        pairs_sims_non_prenorm = self.semspace.pair_distances(pairs)
        pairs_sims_prenorm = semspace.pair_distances(pairs)
        # Same pairs on both sides, and matching distances up to
        # floating-point noise.
        self.assertEqual(set(pairs_sims_non_prenorm.keys()),
                         set(pairs_sims_prenorm.keys()))
        for pair in pairs_sims_non_prenorm.keys():
            self.assertAlmostEqual(pairs_sims_non_prenorm[pair],
                                   pairs_sims_prenorm[pair])

    def test_subset(self):
        """``subset`` keeps only the requested words and leaves the
        distances between them unchanged."""
        words = ['third', 'second', 'tenth', 'eighth']
        subset = self.semspace.subset(words)
        assert subset.vectors.shape == (4, 3)
        assert not subset.defined_at('first')
        assert subset.defined_at('second')
        assert subset.defined_at('third')
        assert subset.defined_at('eighth')
        assert not subset.defined_at('ninth')
        assert subset.defined_at('tenth')
        self.assertAlmostEqual(
            self.semspace.pair_distance('second', 'third'),
            subset.pair_distance('second', 'third'))
        self.assertAlmostEqual(
            self.semspace.pair_distance('third', 'tenth'),
            subset.pair_distance('third', 'tenth'))

    def test_pair_distances(self):
        """``pair_distances`` omits pairs with unknown words by default
        and reports them as NaN when ``na_val=True``."""
        pairs = [('first', 'second'), ('third', 'eighth'),
                 ('twelfth', 'first'), ('twelfth', 'thirteenth')]
        first_second = self.semspace.pair_distance('first', 'second')

        pairs_sims = self.semspace.pair_distances(pairs)
        assert pairs_sims[('first', 'second')] == first_second
        assert ('third', 'eighth') in pairs_sims
        assert ('twelfth', 'first') not in pairs_sims
        assert ('twelfth', 'thirteenth') not in pairs_sims

        pairs_sims_nan = self.semspace.pair_distances(pairs, na_val=True)
        assert pairs_sims_nan[('first', 'second')] == first_second
        assert ('third', 'eighth') in pairs_sims_nan
        assert ('twelfth', 'first') in pairs_sims_nan
        assert ('twelfth', 'thirteenth') in pairs_sims_nan
        # Test for NaN by value: an identity check against ``np.nan``
        # breaks as soon as the NaN is produced any other way.
        assert np.isnan(pairs_sims_nan[('twelfth', 'first')])
        assert np.isnan(pairs_sims_nan[('twelfth', 'thirteenth')])

    def test_vector_entropy(self):
        """``vector_entropy`` reproduces the recorded reference values."""
        self.assertAlmostEqual(
            self.semspace.vector_entropy('first'), 1.5502257500054266)
        self.assertAlmostEqual(
            self.semspace.vector_entropy('fifth'), 1.3302170534376188)
        self.assertAlmostEqual(
            self.semspace.vector_entropy('second'), 0.99871934706694587)

    def test_allowed_metrics_when_prenormed(self):
        """Cosine is always allowed; manhattan only without prenorm."""
        semspace = SemanticSpace(space, rows, columns, readme_title,
                                 readme_desc, prenorm=False)
        semspace_p = SemanticSpace(space, rows, columns, readme_title,
                                   readme_desc, prenorm=True)
        self.assertIn('cosine', semspace.allowed_metrics())
        self.assertIn('cosine', semspace_p.allowed_metrics())
        self.assertNotIn('manhattan', semspace_p.allowed_metrics())
        self.assertIn('manhattan', semspace.allowed_metrics())

    def test_metrics(self):
        """Smoke-test ``pair_distances`` with every allowed metric."""
        semspace = SemanticSpace(space, rows, columns, readme_title,
                                 readme_desc, prenorm=False)
        # Loop-invariant, so built once.
        pairs = [('first', 'second'), ('third', 'eighth'),
                 ('twelfth', 'first'), ('twelfth', 'thirteenth')]
        for metric in semspace.allowed_metrics():
            print(metric)
            # Use the instance whose metric list is being iterated.
            semspace.pair_distances(pairs, metric=metric)

    def test_combined_vector(self):
        """Without prenorm the combined vector is the sum of the rows."""
        semspace = SemanticSpace(space, rows, columns, readme_title,
                                 readme_desc, prenorm=False)
        vector = semspace.combined_vector(['first', 'third'])
        assert (space[[0, 2], :].sum(0) == vector).all()
        vector = semspace.combined_vector(['second', 'fourth'])
        assert (space[[1, 3], :].sum(0) == vector).all()

    def test_combined_vector_prenorm(self):
        """With prenorm the combined vector has norm 1."""
        semspace = SemanticSpace(space, rows, columns, readme_title,
                                 readme_desc, prenorm=True)
        vector = semspace.combined_vector(['first', 'third'])
        # Floating-point result: compare with a tolerance, not exactly.
        self.assertAlmostEqual(np.linalg.norm(vector), 1.0)

    def test_combined_vector_sparse(self):
        """Same as ``test_combined_vector`` for a ``sparse_space`` input."""
        # NOTE(review): ``sparse_space`` is not defined in the visible part
        # of this module - confirm it is provided elsewhere.
        semspace = SemanticSpace(sparse_space, rows, columns, readme_title,
                                 readme_desc, prenorm=False)
        vector = semspace.combined_vector(['first', 'third'])
        assert (space[[0, 2], :].sum(0) == vector).all()
        vector = semspace.combined_vector(['second', 'fourth'])
        assert (space[[1, 3], :].sum(0) == vector).all()
# Run configuration flags.
random_baseline = False
sample_vocab = True

# Input and output folders live next to the current working directory.
data_dir = os.path.join(os.getcwd(), 'data/')
out_dir = os.path.join(os.getcwd(), 'output/')
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# LOAD DATA
print(
    datetime.now().strftime("%d/%m/%Y %H:%M:%S: Started loading the data..."))

# NOTE(review): the second component is an absolute path, so on POSIX
# os.path.join() drops ``data_dir`` and the file is read from the
# hard-coded location below. Confirm this is intended.
embedding_space = SemanticSpace.from_csv(os.path.join(
    data_dir,
    '/home/gcassani/Resources/Embeddings/embedding_space.cbow.ukwac.subtlex.300dims.w5.w2v'
), prenorm=True)
w2v_words = embedding_space.included_words()

# Read the JSON through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(os.path.join(data_dir, 'celex_dict.json')) as celex_file:
    celex = json.load(celex_file)

aoa_words, aoa_norms = aoa.read(os.path.join(data_dir, "AoA.xlsx"))
# Map each entry of the "Word" column to its "Rating.Mean" value.
w2aoa = pd.Series(aoa_norms["Rating.Mean"].values,
                  index=aoa_norms["Word"]).to_dict()
w2concr = concreteness.read(os.path.join(data_dir, "concreteness.txt"))
w2val = valence.read(os.path.join(data_dir, "valence.csv"))
w2freq = subtlex.read(os.path.join(data_dir, "subtlex.csv"))
w2old = old20.read(os.path.join(data_dir, "word2old.csv"))
mono = list(morpholex.read_mono(os.path.join(data_dir, "MorphoLEX_en.xlsx")))
def load_space():
    """Announce the load on stdout, then read the lemma space from its
    CSV file and return it."""
    space_path = 'spaces/lemmas.w2v.gz'
    print('Loading semantic space...')
    loaded_space = SemanticSpace.from_csv(space_path)
    return loaded_space