def __maxSimC(self, vecs_a, context_a, vecs_b, context_b):
    """Compare the two sense vectors that best match their own contexts.

    For each word, the sense vector closest to that word's context vector is
    selected with ``__closestSenseContext``; the value returned is
    ``VectorManipulation.cosine_similarity`` of the two selected vectors.

    Args:
        vecs_a: iterable of sense vectors for the first word.
        context_a: context vector for the first word.
        vecs_b: iterable of sense vectors for the second word.
        context_b: context vector for the second word.

    Returns:
        Whatever ``cosine_similarity`` returns for the two selected vectors.
    """
    cosine = VectorManipulation().cosine_similarity
    sense_a = self.__closestSenseContext(vecs_a, context_a)
    sense_b = self.__closestSenseContext(vecs_b, context_b)
    return cosine(sense_a, sense_b)
def globalSim(self, vecs_a, vecs_b):
    """Return the similarity between the averaged vectors of two words.

    Each collection is averaged along axis 0 with ``numpy.average`` and the
    two averages are compared with ``VectorManipulation.cosine_similarity``.

    Args:
        vecs_a: sequence (list or NumPy array) of vectors for the first word.
        vecs_b: sequence (list or NumPy array) of vectors for the second word.

    Returns:
        0.0 when either collection is ``None`` or empty; otherwise the value
        returned by ``cosine_similarity`` for the two averaged vectors.
    """
    # Emptiness is tested with len() instead of truthiness: ``not array``
    # raises ValueError for a NumPy array holding more than one element.
    a_missing = vecs_a is None or len(vecs_a) == 0
    if a_missing or vecs_b is None or len(vecs_b) == 0:
        return 0.0

    global_a = numpy.average(vecs_a, axis=0)
    global_b = numpy.average(vecs_b, axis=0)
    # The helper is only needed once there is something to compare.
    return VectorManipulation().cosine_similarity(global_a, global_b)
def maxSim(self, vecs_a, vecs_b):
    """Return the highest pairwise similarity between two vector collections.

    Every vector of ``vecs_a`` is compared with every vector of ``vecs_b``
    through ``VectorManipulation.cosine_similarity``.

    Args:
        vecs_a: iterable of vectors for the first word.
        vecs_b: iterable of vectors for the second word (iterated once per
            element of ``vecs_a``).

    Returns:
        The largest similarity found that is strictly greater than -1.0, or
        -1.0 when no pair exceeds it (including when either input is empty).
    """
    scorer = VectorManipulation()
    best = -1.0
    pair_scores = (
        scorer.cosine_similarity(left, right)
        for left in vecs_a
        for right in vecs_b
    )
    for score in pair_scores:
        # Strict comparison: ties keep the value already recorded.
        best = score if score > best else best
    return best
def test_SpearmanGlobal(self):
    # Checks the first value returned by spearmanCorrelation for the
    # 'globsim' and 'globsimc' score files against stored reference values.
    # The second returned value is discarded.
    reader = FileManipulation()
    stats = VectorManipulation()

    gold_scores = reader.readFileLine('scws.txt')
    global_scores = reader.readFileLine('scws_globsim.txt')
    globalc_scores = reader.readFileLine('scws_globsimc.txt')

    corr_global, _unused = stats.spearmanCorrelation(gold_scores, global_scores)
    corr_globalc, _unused = stats.spearmanCorrelation(gold_scores, globalc_scores)

    # First check compares the string form, second compares as numpy.float64.
    expected_global = '0.6670118503142607'
    expected_globalc = numpy.float64('0.2969117412433547')
    self.assertEqual(expected_global, str(corr_global))
    self.assertEqual(expected_globalc, corr_globalc)
def test_SpearmanMax(self):
    # Checks the first value returned by spearmanCorrelation for the
    # 'maxsim' and 'maxsimc' score files against stored reference values.
    # The second returned value is discarded.
    reader = FileManipulation()
    stats = VectorManipulation()

    gold_scores = reader.readFileLine('scws.txt')
    max_scores = reader.readFileLine('scws_maxsim.txt')
    maxc_scores = reader.readFileLine('scws_maxsimc.txt')

    corr_max, _unused = stats.spearmanCorrelation(gold_scores, max_scores)
    corr_maxc, _unused = stats.spearmanCorrelation(gold_scores, maxc_scores)

    # First check compares the string form, second compares as numpy.float64.
    expected_max = '0.6127420529962664'
    expected_maxc = numpy.float64('0.6367583108796157')
    self.assertEqual(expected_max, str(corr_max))
    self.assertEqual(expected_maxc, corr_maxc)
def test_SpearmanAvg(self):
    # Checks the first value returned by spearmanCorrelation for the
    # 'avgsim' and 'avgsimc' score files against stored reference values.
    # The second returned value is discarded.
    reader = FileManipulation()
    stats = VectorManipulation()

    gold_scores = reader.readFileLine('scws.txt')
    avg_scores = reader.readFileLine('scws_avgsim.txt')
    avgc_scores = reader.readFileLine('scws_avgsimc.txt')

    corr_avg, _unused = stats.spearmanCorrelation(gold_scores, avg_scores)
    corr_avgc, _unused = stats.spearmanCorrelation(gold_scores, avgc_scores)

    # First check compares the string form, second compares as numpy.float64.
    expected_avg = '0.6672948584312471'
    expected_avgc = numpy.float64('0.5809138966365319')
    self.assertEqual(expected_avg, str(corr_avg))
    self.assertEqual(expected_avgc, corr_avgc)
def __closestSenseContext(self, synset_vecs, contextvec):
    """Pick the sense vector most similar to a context vector.

    Each candidate in ``synset_vecs`` is scored against ``contextvec`` with
    ``VectorManipulation.cosine_similarity``.

    Args:
        synset_vecs: iterable of candidate sense vectors.
        contextvec: context vector the candidates are compared with.

    Returns:
        The first candidate with the highest score strictly above -1.0, or
        an empty list when no candidate qualifies (e.g. empty input).
    """
    scorer = VectorManipulation()
    best_vec, best_sim = [], -1.0
    for candidate in synset_vecs:
        sim = scorer.cosine_similarity(candidate, contextvec)
        if sim > best_sim:
            # Strictly better than anything seen so far: remember it.
            best_vec, best_sim = candidate, sim
    return best_vec
def avgSim(self, vecs_a, vecs_b):
    """Return the mean pairwise similarity between two vector collections.

    Every vector of ``vecs_a`` is compared with every vector of ``vecs_b``
    through ``VectorManipulation.cosine_similarity`` and the scores are
    averaged over ``len(vecs_a) * len(vecs_b)`` pairs.

    Args:
        vecs_a: sequence (list or NumPy array) of vectors for the first word.
        vecs_b: sequence (list or NumPy array) of vectors for the second word.

    Returns:
        0.0 when either collection is empty, otherwise the average score.
    """
    # Guard first, and with len() rather than truthiness: ``not array``
    # raises ValueError for a NumPy array holding more than one element.
    if len(vecs_a) == 0 or len(vecs_b) == 0:
        return 0.0

    vecmanip = VectorManipulation()
    total = sum(
        (
            vecmanip.cosine_similarity(vec_a, vec_b)
            for vec_a in vecs_a
            for vec_b in vecs_b
        ),
        0.0,
    )
    return total / (len(vecs_a) * len(vecs_b))
def __avgSimC(self, vecs_a, context_a, vecs_b, context_b):
    """Return the context-weighted mean pairwise similarity of two words.

    For every pair ``(vec_a, vec_b)`` the product of three scores from
    ``VectorManipulation.cosine_similarity`` is accumulated:
    ``(vec_a, context_a)``, ``(vec_b, context_b)`` and ``(vec_a, vec_b)``.
    The sum is divided by ``len(vecs_a) * len(vecs_b)``.

    Args:
        vecs_a: sequence (list or NumPy array) of vectors for the first word.
        context_a: context vector for the first word.
        vecs_b: sequence (list or NumPy array) of vectors for the second word.
        context_b: context vector for the second word.

    Returns:
        0.0 when either collection is empty, otherwise the weighted average.
    """
    # Guard first, and with len() rather than truthiness: ``not array``
    # raises ValueError for a NumPy array holding more than one element.
    if len(vecs_a) == 0 or len(vecs_b) == 0:
        return 0.0

    cosine = VectorManipulation().cosine_similarity

    # The similarity of each vec_b to its context does not depend on vec_a,
    # so it is computed once instead of once per element of vecs_a.
    ctx_sims_b = [cosine(vec_b, context_b) for vec_b in vecs_b]

    partial_sim = 0.0
    for vec_a in vecs_a:
        pcwa = cosine(vec_a, context_a)
        for vec_b, pcwb in zip(vecs_b, ctx_sims_b):
            dwab = cosine(vec_a, vec_b)
            partial_sim += pcwa * pcwb * dwab

    return partial_sim / (len(vecs_a) * len(vecs_b))
def __contextParser(self, text_items, trained_model):
    """Build a context vector by averaging the vectors of all known senses.

    For each item in ``text_items`` every synset returned by ``wn.synsets``
    is converted to a key with ``SynsetParserVector.keyParser`` and looked up
    through ``trained_model.word_vec``. Keys that raise ``KeyError`` are
    skipped.

    Args:
        text_items: iterable of tokens making up the context.
        trained_model: object exposing ``word_vec(key)`` that raises
            ``KeyError`` for unknown keys.

    Returns:
        ``numpy.average`` over axis 0 of all vectors found. When no key is
        found, ``numpy.average`` is called on an empty list; the result is
        then not a usable vector (NOTE(review): callers should confirm how
        they handle that case).
    """
    track_synset = SynsetParserVector()
    context_vector = []
    for text_item in text_items:
        for synset in wn.synsets(text_item):
            key = track_synset.keyParser(text_item, synset)
            try:
                key_vector = trained_model.word_vec(key)
            except KeyError:
                # Key absent from the model: leave it out of the average.
                continue
            # Collect the vectors of every sense of every item together.
            context_vector.append(key_vector)
    return numpy.average(context_vector, axis=0)
def test_cosine_similarity(self):
    # Two opposite vectors on the same axis are expected to score -1.
    first, second = [1, 0], [-1, 0]
    similarity = VectorManipulation().cosine_similarity(first, second)
    self.assertEqual(-1, similarity)
def __globalSimC(self, context_a, context_b):
    """Return the similarity between two context vectors.

    Args:
        context_a: context vector for the first word.
        context_b: context vector for the second word.

    Returns:
        The value of ``VectorManipulation.cosine_similarity`` for the pair.
    """
    return VectorManipulation().cosine_similarity(context_a, context_b)
# python module absolute path pydir_name = os.path.dirname(os.path.abspath(__file__)) ppydir_name = os.path.dirname(pydir_name) # python path definition sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) # local-imports from utilities.commandLine import CommandLineStats from utilities.fileOperations import FileManipulation from vecmanip.vectorOperations import VectorManipulation if __name__ == '__main__': params = CommandLineStats() # command line parameter validation fio = FileManipulation() stats_metrics = VectorManipulation() gold_path = os.path.join(ppydir_name, params.gold_input) ruby_path = os.path.join(ppydir_name, params.ruby_input) ou_loc = os.path.join(ppydir_name, params.output_folder) gold = fio.readFileLine(gold_path) docs = fio.doclist_multifolder(ruby_path) result = "Metric\tSpearman\tS-Rho\n" for doc in docs: ruby = fio.readFileLine(doc) fname = doc.split(os.sep) fname = fname[-1] tmp_sp, tmp_rho = stats_metrics.spearmanCorrelation(gold, ruby) result += fname[:-3] + '\t' + str(tmp_sp) + '\t' + str(tmp_rho) + '\n'