import sys

import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer

# Project-local imports: the module names below are assumptions inferred from
# the identifiers used in this file; adjust them to the actual repo layout.
from cs_inferrer import CsInferrer, vec_to_str
from embedding import Embedding
from vocab import load_vocabulary_counts, to_wordnet_pos
from conll_utils import read_conll, get_deps
class CsEmbeddingInferrer(CsInferrer):

    def __init__(self, vocabfile, ignore_target, context_math, word_path,
                 context_path, conll_filename, window_size,
                 top_inferences_to_analyze):
        CsInferrer.__init__(self)
        self.ignore_target = ignore_target
        self.context_math = context_math
        self.word_vecs = Embedding(word_path)
        self.context_vecs = Embedding(context_path)
        self.use_stopwords = False
        # Window-based and dependency-based (CoNLL) contexts are mutually
        # exclusive: a non-negative window size must not come with a CoNLL file.
        assert not (window_size >= 0 and conll_filename is not None)
        self.window_size = window_size
        if window_size < 0:
            self.conll_file = open(conll_filename, 'r')
            self.sents = read_conll(self.conll_file, True)
        self.top_inferences_to_analyze = top_inferences_to_analyze
        self.w2counts, _, self.stopwords = load_vocabulary_counts(vocabfile)

#    def close(self):
#        self.conll_file.close()

    def represent(self, target, deps, avg_flag, tfo):
        # Build a context-sensitive representation: the target vector plus
        # the (optionally averaged) sum of the context vectors, normalized
        # to unit length. Returns None if nothing could be represented.
        target_vec = None if target is None else np.copy(
            self.word_vecs.represent(target))
        dep_vec = None
        deps_found = 0
        for dep in deps:
            if dep in self.context_vecs:
                deps_found += 1
                if dep_vec is None:
                    dep_vec = np.copy(self.context_vecs.represent(dep))
                else:
                    dep_vec += self.context_vecs.represent(dep)
            else:
                tfo.write("NOTICE: %s not in context embeddings. Ignoring.\n"
                          % dep)
        ret_vec = None
        if target_vec is not None:
            ret_vec = target_vec
        if dep_vec is not None:
            if avg_flag:
                dep_vec /= deps_found
            if ret_vec is None:
                ret_vec = dep_vec
            else:
                ret_vec += dep_vec
        if ret_vec is not None:
            ret_vec /= np.linalg.norm(ret_vec)
        return ret_vec

    def mult(self, target, deps, geo_mean_flag, tfo):
        # Multiplicative combination of per-context similarity scores.
        # TODO: support target=None (ignore_target); a target is required here.
        target_vec = self.word_vecs.represent(target)
        scores = self.word_vecs.pos_scores(target_vec)
        for dep in deps:
            if dep in self.context_vecs:
                dep_vec = self.context_vecs.represent(dep)
                mult_scores = self.word_vecs.pos_scores(dep_vec)
                if geo_mean_flag:
                    mult_scores = mult_scores ** (1.0 / len(deps))
                scores = np.multiply(scores, mult_scores)
            else:
                tfo.write("NOTICE: %s not in context embeddings. Ignoring.\n"
                          % dep)
        result_vec = self.word_vecs.top_scores(scores, -1)
        return result_vec

    def extract_contexts(self, lst_instance):
        if self.window_size < 0:
            # Dependency-based contexts: align the LST target index with the
            # corresponding token in the CoNLL sentence (offset by one), and
            # scan forwards, then backwards, if the word forms do not match.
            cur_sent = next(self.sents)
            cur_sent_target_ind = lst_instance.target_ind + 1
            while (cur_sent_target_ind < len(cur_sent) and
                   cur_sent[cur_sent_target_ind].form != lst_instance.target):
                sys.stderr.write(
                    "Target word form mismatch in target id %s: %s != %s "
                    "Checking next word.\n" %
                    (lst_instance.target_id,
                     cur_sent[cur_sent_target_ind].form,
                     lst_instance.target))
                cur_sent_target_ind += 1
            if cur_sent_target_ind == len(cur_sent):
                sys.stderr.write("Start looking backwards.\n")
                cur_sent_target_ind = lst_instance.target_ind
                while (cur_sent_target_ind > 0 and
                       cur_sent[cur_sent_target_ind].form !=
                       lst_instance.target):
                    sys.stderr.write(
                        "Target word form mismatch in target id %s: %s != %s "
                        "Checking previous word.\n" %
                        (lst_instance.target_id,
                         cur_sent[cur_sent_target_ind].form,
                         lst_instance.target))
                    cur_sent_target_ind -= 1
                if cur_sent_target_ind == 0:
                    # Fall back to the original guess.
                    sys.stderr.write(
                        "ERROR: Couldn't find a match for target.\n")
                    cur_sent_target_ind = lst_instance.target_ind + 1
            stopwords = self.stopwords if self.use_stopwords else set()
            contexts = get_deps(cur_sent, cur_sent_target_ind, stopwords)
        else:
            # Window-based contexts: neighbors around the target word.
            contexts = lst_instance.get_neighbors(self.window_size)
        return contexts

    def find_inferred(self, lst_instance, tfo):
        contexts = self.extract_contexts(lst_instance)
        tfo.write("Contexts for target %s are: %s\n" %
                  (lst_instance.target, contexts))
        contexts = [c for c in contexts if c in self.context_vecs]
        tfo.write("Contexts in vocabulary for target %s are: %s\n" %
                  (lst_instance.target, contexts))

        if self.ignore_target:
            target = None
        elif lst_instance.target not in self.word_vecs:
            tfo.write("ERROR: %s not in word embeddings. Trying lemma.\n" %
                      lst_instance.target)
            if lst_instance.target_lemma not in self.word_vecs:
                tfo.write("ERROR: lemma %s also not in word embeddings. "
                          "Giving up.\n" % lst_instance.target_lemma)
                return None
            target = lst_instance.target_lemma
        else:
            target = lst_instance.target

        # The 'add' and 'avg' metrics are implemented more efficiently with
        # vector arithmetic: as shown in Levy and Goldberg's linguistic
        # regularities paper, ranking by a dot product against the summed
        # representation is equivalent to summing per-vector cosine
        # similarities, as long as all vectors are normalized to 1
        # (see the sketch after this class).
        if self.context_math == 'add':
            cs_rep = self.represent(target, contexts, False, tfo)
            if cs_rep is None:
                cs_rep = self.word_vecs.zeros()
            result_vec = self.word_vecs.closest_vec(cs_rep, -1)
        elif self.context_math == 'avg':
            cs_rep = self.represent(target, contexts, True, tfo)
            if cs_rep is None:
                cs_rep = self.word_vecs.zeros()
            result_vec = self.word_vecs.closest_vec(cs_rep, -1)
        elif self.context_math == 'mult':
            result_vec = self.mult(target, contexts, False, tfo)
        elif self.context_math == 'geomean':
            result_vec = self.mult(target, contexts, True, tfo)
        elif self.context_math == 'none' and not self.ignore_target:
            result_vec = self.word_vecs.closest(target, -1)
        else:
            raise Exception('Unknown context math: %s' % self.context_math)

        if result_vec is not None:
            tfo.write("Top most similar embeddings: " +
                      vec_to_str(result_vec, self.top_inferences_to_analyze) +
                      '\n')
        else:
            tfo.write("Top most similar embeddings: contexts: None\n")
        return result_vec
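# A minimal numpy sketch (toy data, not part of the inferrer) of the
# equivalence that 'add'/'avg' rely on: the dot product is linear, so
# w . (t + c1 + ... + cn) = cos(w, t) + cos(w, c1) + ... + cos(w, cn) when
# every vector has unit length. Re-normalizing the summed vector, as
# represent() does, only scales all scores by one constant and leaves the
# ranking unchanged.
def _demo_add_equivalence():
    rng = np.random.RandomState(0)
    vecs = rng.rand(4, 8)
    vecs /= np.linalg.norm(vecs, axis=1, keepdims=True)  # unit-normalize rows
    w, t, c1, c2 = vecs
    combined = w.dot(t + c1 + c2)                 # one dot vs. the summed rep
    separate = w.dot(t) + w.dot(c1) + w.dot(c2)   # sum of per-vector cosines
    assert np.isclose(combined, separate)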
class EmbeddingInferrer(CsInferrer):
    '''
    Context-insensitive baseline: ranks substitutes by embedding similarity
    to the target word alone, ignoring the sentential context.
    '''

    def __init__(self, path, vocabfile, top_inferences_to_analyze):
        CsInferrer.__init__(self)
        self.embeddings = Embedding(path)
        self.top_inferences_to_analyze = top_inferences_to_analyze
        self.w2counts, _, _ = load_vocabulary_counts(vocabfile)

    def new_target_key(self, target_key):
        pass

    def find_inferred(self, lst_instance, tfo):
        if lst_instance.target in self.embeddings:
            result_vec, deltatime = self.embeddings.closest_with_time(
                lst_instance.target, -1)
        else:
            result_vec, deltatime = None, 0
        tfo.write("\nDeltatime: %f msec\n" % (deltatime * 1000))
        self.inference_time(deltatime)
        if result_vec is not None:
            tfo.write("Top most similar embeddings: " +
                      vec_to_str(result_vec, self.top_inferences_to_analyze) +
                      '\n')
        else:
            tfo.write("Top most similar embeddings: contexts: None\n")
        return result_vec

    def filter_inferred(self, result_vec, candidates, pos,
                        orig_subvec_ignored):
        filtered_results = {}
        candidates_found = set()
        if result_vec is not None:
            lemmatizer = WordNetLemmatizer()
            wn_pos = to_wordnet_pos[pos]
            for word, weight in result_vec:
                lemma = lemmatizer.lemmatize(word, wn_pos)
                if lemma in candidates:
                    self.add_inference_result(
                        lemma, weight, filtered_results, candidates_found)
                # match capitalized candidates as well
                if lemma.title() in candidates:
                    self.add_inference_result(
                        lemma.title(), weight, filtered_results,
                        candidates_found)
                # in a few cases the candidates are not lemmatized
                if word in candidates:
                    self.add_inference_result(
                        word, weight, filtered_results, candidates_found)
                if word.title() in candidates:
                    self.add_inference_result(
                        word.title(), weight, filtered_results,
                        candidates_found)
        # Assign negative scores to candidates that received no similarity
        # score, so they appear last, ordered by their unigram counts.
        candidates_left = candidates - candidates_found
        for candidate in candidates_left:
            count = self.w2counts.get(candidate, 1)
            score = -1 - (1.0 / count)  # in [-2, -1): higher count -> closer to -1
            filtered_results[candidate] = score
        return filtered_results
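# A self-contained illustration (made-up counts) of the fallback scoring in
# filter_inferred() above: candidates absent from result_vec get
# -1 - 1/count, which lies in [-2, -1), so they rank below any candidate
# with a (typically positive) similarity score, ordered among themselves by
# unigram count.
def _demo_fallback_scores():
    w2counts = {'cat': 1000, 'dog': 10, 'axolotl': 1}
    scores = {w: -1 - (1.0 / c) for w, c in w2counts.items()}
    # cat: -1.001, dog: -1.1, axolotl: -2.0 -> frequent words rank first
    return sorted(scores, key=scores.get, reverse=True)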