def measure_embeddings_correlations(embeddings_file1, embeddings_file2, sample_size=10):
    """
    :param embeddings_file1:
    :param embeddings_file2:
    :param sample_size: randomly select this many words from the embeddings, and compute their rankings.
     That way we can compute the expected rank correlation coefficient.
    :return: None
    """
    warnings.filterwarnings("ignore")

    # First step: read in embeddings files
    embeddings1 = kNearestNeighbors.read_in_embeddings(embeddings_file1)
    embeddings2 = kNearestNeighbors.read_in_embeddings(embeddings_file2)

    # Second step: prune out the uncommon words from both embeddings
    forbidden = set()
    for k, v in embeddings1.items():
        if k not in embeddings2:
            forbidden.add(k)
    for f in forbidden:
        del embeddings1[f]
    forbidden = set()
    for k, v in embeddings2.items():
        if k not in embeddings1:
            forbidden.add(k)
    for f in forbidden:
        del embeddings2[f]

    # Third step: sample words from the (now common) vocabulary
    words = embeddings1.keys()
    shuffle(words)
    words = words[0:sample_size]
    print words

    # Fourth step: build ranked lists and compute per-word rank correlations
    correlations = dict()
    pvalues = dict()
    for word in words:
        score_dict = kNearestNeighbors._generate_scored_dict(embeddings1, word)
        l1 = kNearestNeighbors._extract_top_k(score_dict, k=0, disable_k=True)
        score_dict = kNearestNeighbors._generate_scored_dict(embeddings2, word)
        l2 = kNearestNeighbors._extract_top_k(score_dict, k=0, disable_k=True)
        results = EmbeddingsAnalyses._compute_rank_corr_coeff(l1, l2)
        correlations[word] = results[0]
        pvalues[word] = results[1]

    print 'word: corr. coeff'
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(correlations)
    print 'word: p_value'
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(pvalues)
    print 'Expected spearman\'s rank corr. coeff.: ',
    print np.mean(correlations.values())

# path = '/home/mayankkejriwal/Downloads/memex-cp4-october/'
# EmbeddingsAnalyses.measure_embeddings_correlations(path+'unigram-embeddings-gt.json', path+'unigram-embeddings-10000docs.json')
def filter_r_lines(sample_file, embeddings_file, output_file,
                   preprocess_function=TextPreprocessors.TextPreprocessors._preprocess_tokens):
    """
    The goal of this function is to take a sample file, and to print to file all 'r' annotated lines
    such that at least some token from the last column has an embedding.
    :param sample_file:
    :param embeddings_file:
    :param output_file:
    :param preprocess_function:
    :return: None
    """
    embeddings = set(kNearestNeighbors.read_in_embeddings(embeddings_file).keys())
    out = codecs.open(output_file, 'w', 'utf-8')
    with codecs.open(sample_file, 'r', 'utf-8') as f:
        for line in f:
            cols = re.split('\t', line)
            if cols[1] != 'r':
                continue
            last_field = cols[-1][0:-1]  # take the last value, then strip out the newline.
            fields = re.split(',', last_field)
            if preprocess_function:
                fields = preprocess_function(fields)
            # convert to a set regardless, so the intersection below works even if preprocess_function is None
            fields = set(fields)
            if len(fields.intersection(embeddings)) > 0:
                out.write(line)
    out.close()
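# Illustrative usage sketch (file names are hypothetical). sample_file is expected to be tab-separated,
# with the 'r'/'nr' annotation in the second column and a comma-separated token list in the last column.
# path = '/path/to/data/'
# EmbeddingsAnalyses.filter_r_lines(path+'sample.tsv', path+'unigram-embeddings.json', path+'sample-r-filtered.tsv')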
def cluster_embeddings(doc_embeddings_file, cluster_file, output_file):
    """
    Uses a doc embedding file to generate cluster embeddings. The cluster_file is a jlines file where
    a cluster id refers to doc ids. We get the embedding by looking up the doc_embeddings; we ignore
    docs that don't have embeddings. We do a sum and normalize.
    :param doc_embeddings_file:
    :param cluster_file:
    :param output_file:
    :return: None
    """
    doc_embeddings_dict = kNearestNeighbors.read_in_embeddings(doc_embeddings_file)
    cluster_dict = dict()
    with codecs.open(cluster_file, 'r', 'utf-8') as f:
        for line in f:
            obj = json.loads(line)
            for k, v in obj.items():
                cluster_dict[k] = v
    out = codecs.open(output_file, 'w', 'utf-8')
    for k, v in cluster_dict.items():
        list_of_vecs = list()
        for doc in v:
            if doc in doc_embeddings_dict:
                list_of_vecs.append(doc_embeddings_dict[doc])
            else:
                print 'doc not in doc embedding ',
                print doc
        if not list_of_vecs:
            # none of the cluster's docs had an embedding; skip rather than normalizing an empty sum
            print 'no doc embeddings for cluster. Skipping cluster...',
            print k
            continue
        tmp = dict()
        tmp[k] = VectorUtils.normalize_vector(np.sum(list_of_vecs, axis=0)).tolist()
        json.dump(tmp, out)
        out.write('\n')
    out.close()
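# Illustrative usage sketch (file names are hypothetical). cluster_file is a json-lines file where each
# line maps a cluster id to a list of doc ids, e.g. {"cluster-1": ["doc-1", "doc-2"]}; doc_embeddings_file
# is a doc-embedding file such as the output of sum_and_normalize.
# path = '/path/to/data/'
# EmbeddingsAnalyses.cluster_embeddings(path+'doc-embeddings.jl', path+'clusters.jl', path+'cluster-embeddings.jl')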
def sum_and_normalize(embeddings_file, tokens_file, output_file):
    """
    Doc embeddings are computed by summing all tokens that exist in the embeddings file,
    following which we do an l2 normalization
    :param embeddings_file:
    :param tokens_file:
    :param output_file:
    :return: None
    """
    embeddings_dict = kNearestNeighbors.read_in_embeddings(embeddings_file)
    out = codecs.open(output_file, 'w', 'utf-8')
    count = 0
    with codecs.open(tokens_file, 'r', 'utf-8') as f:
        for line in f:
            count += 1
            if count % 5 == 0:
                print 'in document...',
                print count
            obj = json.loads(line)
            list_of_vectors = list()
            flag = False
            tmp = dict()
            for token in obj.values()[0]:
                if token not in embeddings_dict:
                    continue
                else:
                    list_of_vectors.append(embeddings_dict[token])
                    flag = True
            if flag:
                tmp[obj.keys()[0]] = VectorUtils.normalize_vector(np.sum(list_of_vectors, axis=0)).tolist()
                json.dump(tmp, out)
                out.write('\n')
    out.close()
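# Illustrative usage sketch (file names are hypothetical). tokens_file is expected to be a json-lines file
# with one object per line of the form {"doc_id": ["token1", "token2", ...]}, since the code treats
# obj.keys()[0] as the doc id and obj.values()[0] as the token list.
# path = '/path/to/data/'
# EmbeddingsAnalyses.sum_and_normalize(path+'unigram-embeddings.json', path+'tokens.jl', path+'doc-embeddings.jl')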
def filter_lines_with_embeddings(sample_file, embeddings_file, output_file,
                                 preprocess_function=TextPreprocessors.TextPreprocessors._preprocess_tokens):
    """
    The goal of this function is to take a sample file, and to print to file all lines such that
    at least some token from the last column has an embedding.
    :param sample_file:
    :param embeddings_file:
    :param output_file:
    :param preprocess_function:
    :return: None
    """
    embeddings = set(kNearestNeighbors.read_in_embeddings(embeddings_file).keys())
    out = codecs.open(output_file, 'w', 'utf-8')
    with codecs.open(sample_file, 'r', 'utf-8') as f:
        for line in f:
            cols = re.split('\t', line)
            last_field = cols[-1][0:-1]  # take the last value, then strip out the newline.
            fields = re.split(',', last_field)
            if preprocess_function:
                fields = preprocess_function(fields)
            # convert to a set regardless, so the intersection below works even if preprocess_function is None
            fields = set(fields)
            if len(fields.intersection(embeddings)) > 0:
                out.write(line)
    out.close()
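# Illustrative usage sketch (file names are hypothetical). Same tab-separated input format as
# filter_r_lines, but all lines (not just 'r' annotated ones) are kept if some token in the last
# column has an embedding.
# path = '/path/to/data/'
# EmbeddingsAnalyses.filter_lines_with_embeddings(path+'sample.tsv', path+'unigram-embeddings.json', path+'sample-filtered.tsv')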
def idf_weighted_embedding(embeddings_file, tokens_file, idf_file, output_file):
    """
    Doc embeddings are computed by doing a weighted idf sum of tokens that exist in the embeddings
    file, following which we do an l2 normalization
    :param embeddings_file:
    :param tokens_file:
    :param idf_file:
    :param output_file:
    :return: None
    """
    embeddings_dict = kNearestNeighbors.read_in_embeddings(embeddings_file)
    idf_dict = TextAnalyses.TextAnalyses.read_in_and_prune_idf(idf_file, lower_prune_ratio=0.0,
                                                               upper_prune_ratio=1.0)
    out = codecs.open(output_file, 'w', 'utf-8')
    count = 0
    with codecs.open(tokens_file, 'r', 'utf-8') as f:
        for line in f:
            count += 1
            tmp = dict()
            total_weights = 0.0
            vector = list()
            obj = json.loads(line)
            for token in obj.values()[0]:
                if token not in embeddings_dict:
                    continue
                elif token not in idf_dict:  # token has an embedding but no idf weight; skip it
                    continue
                else:
                    # print idf_dict[token]
                    weight = float(idf_dict[token])
                    total_weights += weight
                    vector1 = [element * weight for element in list(embeddings_dict[token])]
                    if not vector:
                        vector = vector1
                    else:
                        vector = list(np.sum([vector, vector1], axis=0))
            if total_weights == 0.0:
                print 'no doc embedding. Skipping document...'
                continue
            tmp[obj.keys()[0]] = list(VectorUtils.normalize_vector(vector))
            print count
            json.dump(tmp, out)
            out.write('\n')
    out.close()
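# Illustrative usage sketch (file names are hypothetical). tokens_file has the same json-lines
# {doc_id: [tokens]} format as in sum_and_normalize; idf_file is whatever
# TextAnalyses.TextAnalyses.read_in_and_prune_idf expects (a token-to-idf mapping).
# path = '/path/to/data/'
# EmbeddingsAnalyses.idf_weighted_embedding(path+'unigram-embeddings.json', path+'tokens.jl', path+'idf.json', path+'doc-embeddings-idf.jl')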
def filter_dict_terms_with_embeddings(dictionary_file, embeddings_file, output_file):
    """
    :param dictionary_file:
    :param embeddings_file:
    :param output_file:
    :return: None
    """
    inp = codecs.open(dictionary_file, 'r', 'utf-8')  # renamed from 'input' to avoid shadowing the builtin
    dictionary_set = set(json.load(inp))
    inp.close()
    embeddings = set(kNearestNeighbors.read_in_embeddings(embeddings_file).keys())
    out = codecs.open(output_file, 'w', 'utf-8')
    for m in dictionary_set.intersection(embeddings):
        out.write(m)
        out.write('\n')
    out.close()
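# Illustrative usage sketch (file names are hypothetical). dictionary_file is expected to be a single
# json list of terms; only terms that also have an embedding are written out, one per line.
# path = '/path/to/data/'
# EmbeddingsAnalyses.filter_dict_terms_with_embeddings(path+'dictionary.json', path+'unigram-embeddings.json', path+'dictionary-filtered.txt')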
def _check_embeddings_coverage(sample_file, embeddings_file,
                               preprocess_function=TextPreprocessors.TextPreprocessors._preprocess_tokens):
    """
    Designed for any sample file. Will first read in all tokens (using space as separator) from the
    first column of sample_file, and if preprocess_function is not None, will preprocess the token list.
    Next, we'll read in the embeddings file and compute token coverage. Note that if multiple tokens
    appear in some line, it will be as if they are different lines. This is because each token would be
    a separate 'instance' in any ML algorithm we use.
    :param sample_file:
    :param embeddings_file:
    :param preprocess_function: a function
    :return: None
    """
    list_of_r_tokens = list()
    list_of_nr_tokens = list()
    with codecs.open(sample_file, 'r', 'utf-8') as f:
        for line in f:
            cols = re.split('\t', line)
            first_field = cols[0]
            fields = re.split(' ', first_field)
            if preprocess_function:
                fields = preprocess_function(fields)
            if cols[1] == 'r':
                list_of_r_tokens += fields
            elif cols[1] == 'nr\n':
                list_of_nr_tokens += fields
            else:
                print 'Error in line! Run sample validation code'
    embeddings = set(kNearestNeighbors.read_in_embeddings(embeddings_file).keys())
    covered_r = 0
    covered_nr = 0
    for r in list_of_r_tokens:
        if r in embeddings:
            covered_r += 1
    for nr in list_of_nr_tokens:
        if nr in embeddings:
            covered_nr += 1
    print 'Covered r is ' + str(covered_r) + ' out of a total of ' + str(len(list_of_r_tokens)) + ' tokens'
    print 'Covered nr is ' + str(covered_nr) + ' out of a total of ' + str(len(list_of_nr_tokens)) + ' tokens'
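# Illustrative usage sketch (file names are hypothetical). Prints how many of the 'r' and 'nr' tokens
# from the first column of sample_file are covered by the embeddings vocabulary.
# path = '/path/to/data/'
# EmbeddingsAnalyses._check_embeddings_coverage(path+'sample.tsv', path+'unigram-embeddings.json')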
def _build_vector_set_for_attribute(embeddings_file, ground_truth_file, attribute):
    """
    :param embeddings_file:
    :param ground_truth_file:
    :param attribute:
    :return: A dictionary whose keys are the (preprocessed) tokens of the attribute's values and
     whose values are the corresponding embedding vectors.
    """
    ground_truth_list = FieldAnalyses.read_in_ground_truth_file(ground_truth_file)
    embeddings = kNearestNeighbors.read_in_embeddings(embeddings_file)
    attribute_vectors = dict()
    for obj in ground_truth_list:
        if attribute in obj:
            tokens_list = TextPreprocessors.TextPreprocessors.tokenize_field(obj, attribute)
            processed_tokens_list = TextPreprocessors.TextPreprocessors.preprocess_tokens(tokens_list)
            for token in processed_tokens_list:
                if token in embeddings:
                    attribute_vectors[token] = embeddings[token]
                # else:
                #     print 'token in file, but not in embeddings: ',
                #     print token
    # print len(attribute_vectors)
    return attribute_vectors
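# Illustrative usage sketch (file name and attribute name are hypothetical). Returns the token-to-vector
# dict for one ground-truth attribute, restricted to tokens that have embeddings.
# path = '/path/to/data/'
# vectors = EmbeddingsAnalyses._build_vector_set_for_attribute(path+'unigram-embeddings.json', path+'ground-truth.jl', 'city')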