def check_all(repo_kernel_file, threshold_list, top_k):
    total = 0
    tp_list = [0] * len(threshold_list)
    fp_list = [0] * len(threshold_list)
    tn_list = [0] * len(threshold_list)
    fn_list = [0] * len(threshold_list)
    acc_list = [0] * len(threshold_list)
    # Read the kernel file only once.
    sim = Similarity()
    sim.read_graph_kernels(repo_kernel_file)
    with open(repo_kernel_file, 'r') as fi:
        for line in fi:
            line = line.rstrip()
            parts = line.split('\t')
            dot_file = parts[0]
            result_program_list_with_score = sim.find_top_k_similar_graphs(
                dot_file, 'g', top_k, 3)  # num_iter = 3
            path_parts = dot_file.split(os.sep)
            true_prob = path_parts[-4]
            total += 1
            for (i, threshold) in enumerate(threshold_list):
                cr = check_result(true_prob, result_program_list_with_score,
                                  threshold)
                if cr == 'tp':
                    tp_list[i] += 1
                elif cr == 'fp':
                    fp_list[i] += 1
                elif cr == 'fn':
                    fn_list[i] += 1
                else:
                    tn_list[i] += 1
                acc = check_top_k_result(true_prob,
                                         result_program_list_with_score,
                                         threshold, top_k)
                acc_list[i] += acc
    return total, tp_list, fp_list, tn_list, fn_list, acc_list
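# A small, hypothetical post-processing helper (not part of the original corpus):
# it sketches how the per-threshold counters returned by check_all above could be
# turned into precision/recall. Only the counter names come from check_all; the
# metric choice is an assumption.
def summarize_counts(threshold_list, tp_list, fp_list, fn_list):
    """Compute (threshold, precision, recall) triples from raw counts."""
    summary = []
    for threshold, tp, fp, fn in zip(threshold_list, tp_list, fp_list, fn_list):
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        summary.append((threshold, precision, recall))
    return summary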
def __init__(self):
    self.faces = {}
    self.similarity = Similarity()
    self.bling = Image.open('images/bling.png').resize((50, 50))
    for i in ikon_categories:
        self.faces[i] = Image.open(
            'images/' + ikon_categories[i] + '.png').resize((60, 60))
def test_generate(self):
    df_features = self.spark.read.csv(
        'tests/fixtures/similarity/features.csv', header=True)
    columns_to_convert = [
        col for col in df_features.columns if 'id' not in col
    ]
    df_features_int = df_features
    for col in columns_to_convert:
        df_features_int = df_features_int.withColumn(
            col, f.col(col).cast(IntegerType()))

    similarity_cos = Similarity(df_features=df_features_int,
                                similarity_type='cosine')
    pd_df_similarity_cos, _ = similarity_cos.generate()
    self.assertEqual(pd_df_similarity_cos.shape[0], df_features.count())
    self.assertEqual(pd_df_similarity_cos.shape[1], df_features.count())

    similarity_euc = Similarity(df_features=df_features_int,
                                similarity_type='euclidean')
    pd_df_similarity_euc, _ = similarity_euc.generate()
    self.assertEqual(pd_df_similarity_euc.shape[0], df_features.count())
    self.assertEqual(pd_df_similarity_euc.shape[1], df_features.count())

    similarity_fail = Similarity(df_features=df_features_int,
                                 similarity_type='test')
    with self.assertRaises(ValueError):
        similarity_fail.generate()
def contentBasedFiltering(self, key, n=3):
    '''Return a list of the n top match scores along with the other keys.'''
    dataset = self.dataset
    scores = []
    for other_key in dataset:
        if other_key == key:
            continue
        # Fetch the common inner keys needed to calculate a similarity score.
        common_inner_keys = self.fetchCommonInnerKeys(key, other_key)
        # If there is no common inner key, skip this other key.
        if len(common_inner_keys) == 0:
            continue
        x = [dataset[key][inner_key] for inner_key in common_inner_keys]
        y = [dataset[other_key][inner_key] for inner_key in common_inner_keys]
        # Append the similarity score to the list.
        sim = Similarity()
        scores.append((sim.pearson(x, y), other_key))
    # Sort the list so the highest scores appear at the top.
    scores.sort(reverse=True)
    return scores[0:n]
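# A minimal standalone sketch of the Pearson correlation that sim.pearson(x, y)
# above presumably computes; this is an assumption about Similarity's internals,
# shown only to make the scoring step concrete.
import math

def pearson(x, y):
    """Pearson correlation coefficient of two equal-length numeric sequences."""
    n = len(x)
    mean_x, mean_y = sum(x) / n, sum(y) / n
    cov = sum((a - mean_x) * (b - mean_y) for a, b in zip(x, y))
    var_x = sum((a - mean_x) ** 2 for a in x)
    var_y = sum((b - mean_y) ** 2 for b in y)
    denom = math.sqrt(var_x * var_y)
    return cov / denom if denom else 0.0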
def find_top_k_similar_program(repo_kernel_file, user_prog_graph_dot_file,
                               graph_name, k, num_iter, cluster_json):
    sim = Similarity()
    sim.read_graph_kernels(repo_kernel_file)
    result_program_list_with_score = sim.find_top_k_similar_graphs(
        user_prog_graph_dot_file, graph_name, k, num_iter, cluster_json)
    return result_program_list_with_score
def test__check_nulls_in_feature_columns(self):
    df_nulls_features = self.spark.read.csv(
        'tests/fixtures/similarity/nulls_features.csv', header=True)
    df_no_nulls_features = self.spark.read.csv(
        'tests/fixtures/similarity/no_nulls_features.csv', header=True)
    columns_to_convert_nulls = [
        col for col in df_nulls_features.columns if 'id' not in col
    ]
    for col in columns_to_convert_nulls:
        df_nulls_features = df_nulls_features.withColumn(
            col, f.col(col).cast(IntegerType()))
    columns_to_convert_no_nulls = [
        col for col in df_no_nulls_features.columns if 'id' not in col
    ]
    for col in columns_to_convert_no_nulls:
        df_no_nulls_features = df_no_nulls_features.withColumn(
            col, f.col(col).cast(IntegerType()))
    Similarity(df_features=df_no_nulls_features)
    with self.assertRaises(AssertionError):
        Similarity(df_features=df_nulls_features)
def extractPhrasePaireFeature(phrasedir):
    for lec in annotation.Lectures:
        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            prefix = os.path.join(path, '%s.%s.' % (prompt, method))
            filename = path + prompt + sim_exe
            print(filename)

            featureset = []
            feature_extractor = Similarity(prefix)
            phrasefile = os.path.join(path, "%s.%s.key" % (prompt, method))
            phrases = fio.LoadList(phrasefile)
            for p1 in phrases:
                for p2 in phrases:
                    featureset.append(
                        (feature_extractor.get_features(p1, p2), 0.0,
                         {'p1': p1, 'p2': p2}))
            fio.SaveDict2Json(featureset, filename)
            feature_extractor.save()
def build_sim_matrix(self, sentence_list, logger):
    sim = Similarity()
    self.sentences = sentence_list
    sim_matrix = np.empty([len(self.sentences), len(self.sentences)])
    for i in range(0, len(self.sentences)):
        logger.info('Processing sentence # {} => {}'.format(
            i, self.sentences[i]))
        for j in range(i + 1, len(self.sentences)):
            s1 = self.sentences[i]
            s2 = self.sentences[j]
            try:
                score = sim.calculate_similarity_score(s1, s2)
            except ZeroDivisionError:
                # Degenerate pair (e.g. an empty sentence); score it as 0.0
                # instead of silently reusing the previous pair's score.
                score = 0.0
            sim_matrix[i][j] = round(score, 2)
            sim_matrix[j][i] = sim_matrix[i][j]
        sim_matrix[i][i] = 1.00
    return sim_matrix
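# Illustrative sanity check (not from the original project) for the matrix that
# build_sim_matrix returns: it should be square and symmetric with a unit diagonal.
import numpy as np

def is_valid_sim_matrix(m, tol=1e-9):
    """True if m is square, symmetric, and has 1.0 on the diagonal."""
    return (m.shape[0] == m.shape[1]
            and np.allclose(m, m.T, atol=tol)
            and np.allclose(np.diag(m), 1.0, atol=tol))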
def similarity_action(self):
    dialog = Similarity(parent=self, df=self.table)
    if dialog.exec_():
        res = dialog.execute()
        QMessageBox.information(
            self, f'Similarity: {dialog.method}',
            f'Columns {dialog.first_column} and {dialog.second_column} '
            f'have a similarity value of {res}', QMessageBox.Ok)
def __init__(self):
    self.m_preprocessor = Preprocessor()
    self.m_similarity = Similarity()
    self.m_plt = Plot()
    self.m_evaluator = Evaluator()
    self.m_file = "sts-train"
    self.m_metric = "path"
    self.m_ic = "brown"
    self.m_metric_w2v = "cosine"
    self.m_metric_t = "path"
    self.m_thr = 20
    self.m_mode = "ontology"
def collaborativeRecommendation(self, key, n=3):
    '''Return a list of the n top match scores along with the inner keys.'''
    dataset = self.dataset
    weighted_inner_values = {}
    total_scores = {}
    for other_key in dataset:
        if other_key == key:
            continue
        # Fetch the common inner keys needed to calculate a similarity score.
        common_inner_keys = self.fetchCommonInnerKeys(key, other_key)
        # If there is no common inner key, skip this other key.
        if len(common_inner_keys) == 0:
            continue
        x = [dataset[key][inner_key] for inner_key in common_inner_keys]
        y = [dataset[other_key][inner_key] for inner_key in common_inner_keys]
        # Find the similarity score.
        sim = Similarity()
        score = sim.pearson(x, y)
        # Ignore scores of zero or below.
        if score <= 0:
            continue
        for inner_key in dataset[other_key]:
            if inner_key not in dataset[key] or dataset[key][inner_key] == 0:
                # Weighted sum of value times similarity score.
                weighted_inner_values.setdefault(inner_key, 0)
                weighted_inner_values[inner_key] += (
                    score * dataset[other_key][inner_key])
                # Sum of similarity scores.
                total_scores.setdefault(inner_key, 0)
                total_scores[inner_key] += score
    scores = [(weighted_inner_values[inner_key] / total_scores[inner_key],
               inner_key) for inner_key in weighted_inner_values]
    # Sort the list so that the highest scores appear at the top.
    scores.sort(reverse=True)
    return scores[0:n]
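# Worked toy example (hypothetical numbers) of the weighted average computed in
# collaborativeRecommendation above: predicted value = sum(similarity * value) /
# sum(similarity) over the neighbors that rated the item.
neighbor_scores = [(0.9, 4.0), (0.5, 3.0)]  # (similarity to target key, neighbor's value)
weighted_sum = sum(s * v for s, v in neighbor_scores)  # 0.9*4.0 + 0.5*3.0 = 5.1
score_sum = sum(s for s, _ in neighbor_scores)         # 0.9 + 0.5 = 1.4
print(round(weighted_sum / score_sum, 2))              # 3.64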
def extractPhrasePaireFromAnnotation(phrasedir, annotators, id):
    for doc, lec, annotator in annotation.generate_all_files(
            annotation.datadir + 'json/', '.json',
            anotators=annotators, lectures=annotation.Lectures):
        print(doc)

        # Load the task.
        task = annotation.Task()
        task.loadjson(doc)

        path = phrasedir + str(lec) + '/'
        fio.NewPath(path)

        for prompt in ['q1', 'q2']:
            prefix = os.path.join(path, '%s.%s.' % (prompt, method))
            filename = path + prompt + sim_exe
            print(filename)

            featureset = []
            feature_extractor = Similarity(prefix)
            phrase_annotation = task.get_phrase_annotation(prompt)
            # Pairs within the same rank are positive examples (score 1.0),
            # pairs across ranks are negative (score 0.0).
            for rank1 in sorted(phrase_annotation):
                for rank2 in sorted(phrase_annotation):
                    if rank1 == rank2:
                        score = 1.0
                    else:
                        score = 0.0
                    phrases1 = phrase_annotation[rank1]
                    phrases2 = phrase_annotation[rank2]
                    for phrasedict1 in phrases1:
                        p1 = phrasedict1['phrase'].lower().strip()
                        for phrasedict2 in phrases2:
                            p2 = phrasedict2['phrase'].lower().strip()
                            featureset.append(
                                (feature_extractor.get_features(p1, p2),
                                 score, {'p1': p1, 'p2': p2}))
            fio.SaveDict2Json(featureset, filename)
            feature_extractor.save()
def main(args, config):
    wDir = os.getcwd()

    # Instantiate the Preprocessing class.
    window = Preprocessing(args.fasta_file, config['win_length'],
                           config['win_step'])
    window.output_window()
    print("Creating windows_sequence.fasta", file=sys.stderr)

    # Instantiate the Similarity and Composition classes.
    sim = Similarity(args.fasta_file, config['score_adj'], wDir)
    sim_matrix = sim.mcl_perform()
    comp_results = Composition(config['kmer_len'])
    comp_matrix = comp_results.joined()

    # Join the similarity and composition matrices for PCA.
    join = pd.concat([comp_matrix, sim_matrix], axis=1, join='inner')
    print("Calculating similarity and composition matrix", file=sys.stderr)

    # Instantiate the Reduction class.
    pca = Reduction(join, config['pca_comp'])
    pca_data = pca.perform_pca()
    print("Performing PCA", file=sys.stderr)

    # Instantiate the Clustering class.
    cluster = Clustering(pca_data)
    clust_obj = cluster.plot()
    print("Performing clustering plot", file=sys.stderr)

    # Instantiate the ClusterReport class.
    report = ClusterReport(clust_obj)
    file_name, querySeq = report.output_queryseq()
    print("Doing report of clusters", file=sys.stderr)

    # Instantiate the Validate class.
    valid = Validate(file_name, args.fasta_file, wDir)
    jfileComp, jfileMinus = valid.roundTwo()
    print("Validation of results", file=sys.stderr)

    # Instantiate the ParseJplace class.
    parsing = ParseJplace(jfileComp, jfileMinus)
    corrMat = parsing.correlation()
    print("Doing profiles", file=sys.stderr)

    # Instantiate the Profiles class.
    ttest = Profiles(corrMat, querySeq)
    bestWin = ttest.windowsAssigment()
    print("Doing permutations", file=sys.stderr)

    # Instantiate the StatsBinom class.
    finalResult = StatsBinom(args.fasta_file, config['win_length'], bestWin)
    finalResult.binomial()

    cleaning(file_name)
def main(experiment_name, phenotypes, data_directory, anchor_genes,
         num_replicates=1, percent=0.4, num_anchors=50, min_dangle_size=3,
         max_dangle_size=10, test_ratio=0.5):
    assert isinstance(phenotypes, list)
    alphas = random.choices(range(min_dangle_size, max_dangle_size),
                            k=int(num_anchors * test_ratio))
    assert len(alphas) < len(anchor_genes)
    anchor_train_groups = []
    anchor_test_groups = []
    backbones = []
    # Create all backbones.
    for rep_id in range(num_replicates):
        random.shuffle(anchor_genes)
        candidates = anchor_genes[:int(num_anchors)]
        genes_of_interest_train, genes_of_interest_test = train_test_split(
            candidates, shuffle=True, test_size=test_ratio)
        anchor_train_groups.append(genes_of_interest_train)
        anchor_test_groups.append(genes_of_interest_test)
        backbones.append(
            build_backbone(anchors=anchor_train_groups[rep_id],
                           alphas=alphas,
                           weight=1,
                           edge_percentage=percent))
    # Write the train anchors to file.
    with open(os.path.join(experiment_name, 'train_anchors.csv'), 'w') as fout:
        for gene_group in anchor_train_groups:
            fout.write(','.join(gene_group))
            fout.write("\n")
    # Write the test anchors to file.
    with open(os.path.join(experiment_name, 'test_anchors.csv'), 'w') as fout:
        for gene_group in anchor_test_groups:
            fout.write(','.join(gene_group))
            fout.write("\n")
    # Add the backbones and create the similarity objects.
    for pheno in phenotypes:
        file_name = os.path.join(data_directory, "{}.csv".format(pheno))
        for rep_id in range(num_replicates):
            sim_file_name = "anchored_{}_{}.csv".format(pheno, str(rep_id))
            out_address = os.path.join(experiment_name, sim_file_name)
            similarity = Similarity(file_name,
                                    anchors=anchor_train_groups[rep_id],
                                    alphas=alphas,
                                    string_id=True)
            similarity.transform()
            similarity.apply_threshold(lower_cor=0.2, upper_cor=0.8, value=0)
            similarity.augment(backbones[rep_id])
            similarity.to_csv(out_address)
class Prediction:
    def __init__(self, messages, model, questions: set, answers: set,
                 pc_questions: dict, pc_answers: dict, tokenizer):
        self.questions = questions
        self.answers = answers
        self.pc_questions = pc_questions
        self.pc_answers = pc_answers
        self.tokenizer = tokenizer
        self.model = model
        self.messages = messages
        self.pp = PreProcessing()
        self.s = Similarity(questions=self.questions, answers=self.answers)

    def predict(self, msg):
        if msg == '' or msg is None:
            return emergency_message()
        try:
            msg = self.pp.pre_processing_text_for_similarity(msg)
            msg_nn = self.pp.pre_processing_text_for_neural_network(msg)
        except Exception as e:
            save_content_to_log(e)
            return BOT_PREFIX + emergency_message() + '\n' + str(e)
        if msg == '' or msg is None:
            return emergency_message()
        p = self.tokenizer.texts_to_matrix([msg_nn])
        res = self.model.predict(p)
        if res >= 0.5:
            pc = self.pc_questions
        else:
            pc = self.pc_answers
        conversations = self.s.return_conversation_by_cossine(msg, res)
        conversations = self.s.return_conversation_by_page_rank(
            msg, conversations, page_compute=pc, reverse=True)
        return self.s.get_the_next_conversation(conversations, self.messages)
def train_IE256_svm(traincourse, model_dir, name='simlearn_cv'):
    sim_extractor = Similarity()
    allfeatures = sorted(sim_extractor.features.keys())
    features = allfeatures
    name = '_'.join(features)

    if traincourse == 'IE256':
        train = [x for x in range(14, 26) if x != 22]
    else:
        train = [x for x in range(3, 27)]

    model_file = os.path.join(model_dir, '%s_%s.model' % (traincourse, name))
    if fio.IsExist(model_file):
        with open(model_file, 'rb') as handle:
            clf = pickle.load(handle)
    else:
        train_X, train_Y = combine_files_course(traincourse, train, features)
        clf = svm.SVC()
        clf.fit(train_X, train_Y)
        with open(model_file, 'wb') as handle:
            pickle.dump(clf, handle)
def combine_files(lectures, features=None, prompts=['q1', 'q2']):
    phrasedir1 = '../data/%s/oracle_annotator_1/phrase/' % course
    phrasedir2 = '../data/%s/oracle_annotator_2/phrase/' % course

    X = []
    Y = []

    if features is None:
        sim_extractor = Similarity()
        features = sorted(sim_extractor.features.keys())

    for i, lec in enumerate(lectures):
        for q in prompts:
            for phrasedir in [phrasedir1, phrasedir2]:
                path = phrasedir + str(lec) + '/'
                filename = os.path.join(path, q + sim_exe)
                data = fio.LoadDictJson(filename)
                for fdict, score, _ in data:
                    row = []
                    for name in features:
                        x = fdict[name]
                        if str(x) == 'nan':
                            x = 0.0
                        row.append(x)
                    X.append(row)
                    Y.append(score)
    return X, Y
def gather_performance(output):
    sim_extractor = Similarity()
    allfeatures = sorted(sim_extractor.features.keys())

    allbody = []
    for k in range(len(allfeatures) + 1):
        if k == len(allfeatures):  # use all features
            features = allfeatures
        else:
            features = [allfeatures[k]]

        name = '_'.join(features)
        resultfile = '../data/%s/simlearning.cv.svm.%s.txt' % (course, name)

        head, body = fio.ReadMatrix(resultfile, hasHead=True)

        # Get the average of each column, starting from the third one.
        allhead = ['name'] + head[2:]
        average = [name]
        for i in range(2, len(head)):
            values = [float(row[i]) for row in body]
            average.append(np.mean(values))
        allbody.append(average)

    fio.WriteMatrix(output, allbody, allhead)
def test__convert_to_long_format(self):
    pd_df_similarities_wide = pd.read_csv(
        'tests/fixtures/similarity/similarities_wide.csv', index_col=0)
    df_simple_table = self.spark.read.csv(
        'tests/fixtures/similarity/simple_table.csv', header=True)
    columns_to_convert = [
        col for col in df_simple_table.columns if 'id' not in col
    ]
    for col in columns_to_convert:
        df_simple_table = df_simple_table.withColumn(
            col, f.col(col).cast(IntegerType()))
    similarity = Similarity(df_features=df_simple_table)
    pd_df_similarities_long = similarity._convert_to_long_format(
        pd_df_similarities_wide)
    self.assertEqual(
        pd_df_similarities_long.shape[0],
        pd_df_similarities_wide.shape[0] * pd_df_similarities_wide.shape[1])
    check_1_3 = pd_df_similarities_long.loc[
        (pd_df_similarities_long['recipe_id_1'] == 1)
        & (pd_df_similarities_long['recipe_id_2'] == '3'
           )]['similarity'].values[0]
    self.assertEqual(check_1_3, 9)
    check_3_1 = pd_df_similarities_long.loc[
        (pd_df_similarities_long['recipe_id_1'] == 3)
        & (pd_df_similarities_long['recipe_id_2'] == '1'
           )]['similarity'].values[0]
    self.assertEqual(check_3_1, 6)
    check_2_3 = pd_df_similarities_long.loc[
        (pd_df_similarities_long['recipe_id_1'] == 2)
        & (pd_df_similarities_long['recipe_id_2'] == '3'
           )]['similarity'].values[0]
    self.assertEqual(check_2_3, 1)
    check_1_2 = pd_df_similarities_long.loc[
        (pd_df_similarities_long['recipe_id_1'] == 1)
        & (pd_df_similarities_long['recipe_id_2'] == '2'
           )]['similarity'].values[0]
    self.assertEqual(check_1_2, 6)
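# A hedged sketch of one way _convert_to_long_format could be implemented with
# pandas; only the expected shape and column names come from the test above, and
# the stack()-based implementation is an assumption.
import pandas as pd

def wide_to_long(pd_df_wide, id_1='recipe_id_1', id_2='recipe_id_2',
                 value='similarity'):
    """Turn an n x n similarity matrix into n*n (id_1, id_2, value) rows."""
    pd_df_long = pd_df_wide.stack().reset_index()
    pd_df_long.columns = [id_1, id_2, value]
    return pd_df_long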
def train_leave_one_lecture_out(model_dir, name='simlearn_cv'):
    sim_extractor = Similarity()
    allfeatures = sorted(sim_extractor.features.keys())

    # Use all features (swap in a single-element list to train on one feature).
    features = allfeatures
    name = '_'.join(features)

    lectures = annotation.Lectures

    MSE = []
    for i, lec in enumerate(lectures):
        train = [x for x in lectures if x != lec]
        test = [lec]

        print(train)
        print(test)

        model_file = os.path.join(model_dir, '%d_%s.model' % (lec, name))
        if fio.IsExist(model_file):
            with open(model_file, 'rb') as handle:
                clf = pickle.load(handle)
        else:
            train_X, train_Y = combine_files(train, features)
            clf = svm.SVR()
            clf.fit(train_X, train_Y)
            with open(model_file, 'wb') as handle:
                pickle.dump(clf, handle)

        for q in ['q1', 'q2']:
            test_X, test_Y = combine_files(test, features, prompts=[q])
            predict_Y = clf.predict(test_X)
            mse = mean_squared_error(test_Y, predict_Y)
            MSE.append([lec, q, mse])

    output = '../data/%s/simlearning.cv.%s.txt' % (course, name)
    fio.WriteMatrix(output, MSE, header=['lec', 'prompt', 'MSE'])
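# Minimal self-contained sketch of the leave-one-group-out loop above, using
# synthetic data; the real features come from combine_files, which is
# project-specific, so everything below is illustrative only.
import numpy as np
from sklearn import svm
from sklearn.metrics import mean_squared_error

groups = {lec: (np.random.rand(10, 4), np.random.rand(10)) for lec in (1, 2, 3)}
for held_out in groups:
    train_X = np.vstack([X for g, (X, _) in groups.items() if g != held_out])
    train_Y = np.hstack([Y for g, (_, Y) in groups.items() if g != held_out])
    clf = svm.SVR().fit(train_X, train_Y)
    test_X, test_Y = groups[held_out]
    print(held_out, mean_squared_error(test_Y, clf.predict(test_X)))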
def calculate(self):
    self.allPredicts = np.zeros((4, self.testSize))

    bias = Bias(self.trainData, self.testData)
    bias.calculateBias()
    answers, predicts = bias.predict()
    self.biasClass = bias
    self.allPredicts[0, :] = predicts
    # print("Bias: %f" % evaluationRMSE(answers, predicts))

    similarity = Similarity(self.trainData, self.testData)
    similarity.calculateBias()
    similarity.calcSimiMatrix()
    answers, predicts = similarity.predict()
    self.similarityClass = similarity
    self.allPredicts[1, :] = predicts
    # print("Similarity: %f" % evaluationRMSE(answers, predicts))

    svd = SVD(self.trainData, self.testData)
    svd.generaterMat()
    svd.calcSVD()
    answers, predicts = svd.predict()
    self.svdClass = svd
    self.allPredicts[2, :] = predicts
    # print("SVD: %f" % evaluationRMSE(answers, predicts))

    matFactory = MatFactory(self.trainData, self.testData)
    matFactory.train(10, 11)
    answers, predicts = matFactory.predict()
    self.matFactoryClass = matFactory
    self.allPredicts[3, :] = predicts
    # print("MatFactory: %f" % evaluationRMSE(answers, predicts))

    # Use a context manager so the pickle file is flushed and closed.
    with open(predictsFile, 'wb') as pickleFile:
        pickle.dump(self.allPredicts, pickleFile)
def correlation_analysis(course):
    phrasedir1 = '../data/%s/oracle_annotator_1/phrase/' % course
    phrasedir2 = '../data/%s/oracle_annotator_2/phrase/' % course

    outdir = '../data/%s/simlearning/' % course
    fio.NewPath(outdir)

    sim_extractor = Similarity()
    features = sorted(sim_extractor.features.keys())
    head = features + ['score', 'predict']
    body = []

    lectures = annotation.Lectures
    name = '_'.join(features)

    for i, lec in enumerate(lectures):
        model_file = os.path.join(model_dir, '%d_%s.model' % (lec, name))
        with open(model_file, 'rb') as handle:
            clf = pickle.load(handle)

        for q in ['q1', 'q2']:
            outfile = os.path.join(outdir, str(lec), '%s%s' % (q, sim_exe))
            for phrasedir in [phrasedir1, phrasedir2]:
                path = phrasedir + str(lec) + '/'
                filename = os.path.join(path, q + sim_exe)
                data = fio.LoadDictJson(filename)

                for fdict, score, _ in data:
                    row = []
                    for fname in features:
                        x = fdict[fname]
                        if str(x) == 'nan':
                            x = 0.0
                        row.append(x)
                    predict_score = clf.predict([row])
                    row.append(score)
                    row.append(predict_score[0])
                    body.append(row)

    out_correlation = os.path.join(outdir, 'data.txt')
    print(out_correlation)
    fio.WriteMatrix(out_correlation, body, head)
def test__check_is_spark_data_frame(self):
    df_simple_table = self.spark.read.csv(
        'tests/fixtures/similarity/simple_table.csv', header=True)
    pd_df_simple_table = pd.read_csv(
        'tests/fixtures/similarity/simple_table.csv')
    columns_to_convert = [
        col for col in df_simple_table.columns if 'id' not in col
    ]
    for col in columns_to_convert:
        df_simple_table = df_simple_table.withColumn(
            col, f.col(col).cast(IntegerType()))
    Similarity(df_features=df_simple_table)
    with self.assertRaises(AssertionError):
        Similarity(df_features=pd_df_simple_table)
class QueryEngine:
    def __init__(self, path_to_tfcsv):
        self.database = Database(path_to_tfcsv)
        self.dictionary = self.database.get_dictionary()
        self.similarity = Similarity(self.database.documents)
        self.rank_limit = 6
        self.num_leaders = 5
        self.similarity.k_means_cluster(self.num_leaders)

    def handle_query(self, query):
        """Process a user search query."""
        query = query.lower()
        query_terms = query.split()
        if query_terms[0] == "stop":
            exit(0)

        # Only include terms that are in the data set.
        query_terms = [x for x in query_terms if x in self.dictionary]
        if not query_terms:
            print("No results for the given query")
            return

        # Generate ranked results and print them to the console.
        ranked_results = sorted(self.similarity.cosine_scores(query_terms),
                                key=lambda x: x[1],
                                reverse=True)
        print("\nQUERY RESULTS: ")
        print("--------------------------------------")
        # Include the top k results that have a score > 0.
        for i, res in enumerate(filter(lambda x: x[1] > 0, ranked_results)):
            if i < self.rank_limit:
                doc_title = res[0]
                cos_score = res[1]
                doc_url = self.database.get_url(doc_title)
                print("[{}] {}\n{}\n(cosine_score={})".format(
                    i + 1, doc_title, doc_url, cos_score))
        print("--------------------------------------")
        print()
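# Hypothetical driver for QueryEngine (the CSV path is made up): handle_query
# prints ranked results for each query and exits when the user types "stop".
if __name__ == '__main__':
    engine = QueryEngine('data/term_frequencies.csv')
    while True:
        engine.handle_query(input('search> '))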
def test__check_is_numerical_data(self):
    df_numerical = self.spark.read.csv(
        'tests/fixtures/similarity/numerical_data.csv', header=True)
    columns_to_convert = [
        col for col in df_numerical.columns if 'id' not in col
    ]
    df_numerical_int = df_numerical
    df_numerical_float = df_numerical
    for col in columns_to_convert:
        df_numerical_int = df_numerical_int.withColumn(
            col, f.col(col).cast(IntegerType()))
        df_numerical_float = df_numerical_float.withColumn(
            col, f.col(col).cast(DoubleType()))
    Similarity(df_features=df_numerical_int)
    Similarity(df_features=df_numerical_float)
    with self.assertRaises(AssertionError):
        Similarity(df_features=df_numerical)
def generate_walks(edge_list_address, walk_per_node, walk_length, workers=4):
    similarity = Similarity(correlation_file_path=edge_list_address,
                            anchors=[], alphas=[], sep=',', prefix='pseudo')
    genes = list(similarity.idx.keys())
    start_time = time.time()
    gen_walk = WalkGenerator(similarity.matrix, genes, walk_length,
                             walk_per_node)
    print("takes {} seconds to create walk object.".format(
        time.time() - start_time))

    num_cpus = workers
    arguments = list(range(len(gen_walk)))
    chunk_size = len(gen_walk) // num_cpus
    # Use a context manager so the worker pool is always cleaned up.
    with mp.Pool(num_cpus) as pool:
        walks = pool.map(gen_walk, arguments, chunksize=chunk_size)
    return walks
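# Hypothetical invocation of generate_walks (the edge-list path and walk sizes
# are made-up values, not from the original project):
walks = generate_walks('data/edge_list.csv', walk_per_node=10, walk_length=80,
                       workers=4)
print(len(walks))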
def main():
    cats = prepare_categories()
    test_token_freqs = prepare_test_data()
    similarity = Similarity(cats=cats, data=test_token_freqs)
    # Compute each score dict once instead of twice.
    jaccard_scores = similarity.jaccard()
    cosine_scores = similarity.cosine()
    print(max(jaccard_scores, key=jaccard_scores.get))
    print(max(cosine_scores, key=cosine_scores.get))
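# Standalone sketch (an assumption about Similarity's internals) of Jaccard and
# cosine scores between two token-frequency dicts, the kind of data main()
# prepares above.
import math

def jaccard(a, b):
    """Intersection over union of the two token sets."""
    keys_a, keys_b = set(a), set(b)
    union = keys_a | keys_b
    return len(keys_a & keys_b) / len(union) if union else 0.0

def cosine(a, b):
    """Cosine of the angle between the two frequency vectors."""
    dot = sum(a[k] * b[k] for k in set(a) & set(b))
    norm_a = math.sqrt(sum(v * v for v in a.values()))
    norm_b = math.sqrt(sum(v * v for v in b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0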
def main():
    similarity = Similarity()
    pg.init()
    clock = pg.time.Clock()
    clock.tick(30)

    # Create the screen.
    display_width = 960
    display_height = 650
    pos_x = 0
    pos_y = 30
    os.environ['SDL_VIDEO_WINDOW_POS'] = '%i,%i' % (pos_x, pos_y)
    gameDisplay = pg.display.set_mode((display_width, display_height))
    pg.display.set_caption('Pose Dance')

    game = PoseDance(gameDisplay, similarity)
    game.run()
def run(self, corpus_path, test_path, minfreq):
    self._database = self._construct_database(corpus_path)
    before_unique, before_total = self._stas(self._database)
    self._database.apply_minfreq(minfreq)
    after_unique, after_total = self._stas(self._database)
    sim = Similarity(self._database)
    test_phrases = self.IO.read_phrases(test_path)
    with open('trace.txt', 'w', encoding='utf8') as f:
        # Write the head line.
        args = [before_unique, after_unique, before_total, after_total]
        f.write('\n')
        self._write_head(f, args)
        for phrase in test_phrases:
            most_similar = self._find_k_similar(phrase, sim, 5)
            self._write_result(f, phrase, most_similar)
class TestSimilarityFunkcions(unittest.TestCase):
    def setUp(self):
        self.sim = Similarity(1, 1)

    def test_getIndex(self):
        self.sim.indexes = {}
        self.assertEqual(0, self.sim.getIndex('aa'))
        self.sim.indexes = {'aa': 0}
        self.assertEqual(0, self.sim.getIndex('aa'))
        self.sim.indexes = {'aa': 0}
        self.assertEqual(1, self.sim.getIndex('bb'))
        self.sim.indexes = {'bb': 0, 'aa': 1}
        self.assertEqual(0, self.sim.getIndex('bb'))
        self.sim.indexes = {'bb': 0, 'aa': 1}
        self.assertEqual(1, self.sim.getIndex('aa'))
        self.sim.indexes = {1: 0, 'aa': 1}
        self.assertEqual(0, self.sim.getIndex(1))
        self.sim.indexes = {'bb': 0, 'aa': 1}
        self.assertEqual(1, self.sim.getIndex('aa'))
def calAll(self):
    self.errs = [0] * 5

    bias = Bias(self.data, self.test)
    bias.calculateBias()
    answers, predicts = bias.predict()
    err = evaluationRMSE(answers, predicts)
    self.errs[0] = err
    print("Bias: %f" % err)

    similarity = Similarity(self.data, self.test)
    similarity.calculateBias()
    similarity.calcSimiMatrix()
    answers, predicts = similarity.predict()
    err = evaluationRMSE(answers, predicts)
    self.errs[1] = err
    print("Similarity: %f" % err)

    svd = SVD(self.data, self.test)
    svd.generaterMat()
    svd.calcSVD()
    answers, predicts = svd.predict()
    err = evaluationRMSE(answers, predicts)
    self.errs[2] = err
    print("SVD: %f" % err)

    matFactory = MatFactory(self.data, self.test)
    matFactory.train(20, 35)
    answers, predicts = matFactory.predict()
    err = evaluationRMSE(answers, predicts)
    self.errs[3] = err
    print("MatFactory: %f" % err)  # reuse err instead of recomputing the RMSE

    combination = Combination(self.data)
    combination.separateData()
    combination.calculate()
    combination.train(alpha=0.01, iter=10000)
    answers, predicts = combination.predict(self.test)
    err = evaluationRMSE(answers, predicts)
    self.errs[4] = err
    print("Combination: %f" % err)

    return self.errs
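# Hedged sketch of the evaluationRMSE helper used throughout calAll; its real
# definition lives elsewhere in the project, but root-mean-squared error is the
# standard reading of the name.
import numpy as np

def evaluation_rmse(answers, predicts):
    answers = np.asarray(answers, dtype=float)
    predicts = np.asarray(predicts, dtype=float)
    return float(np.sqrt(np.mean((answers - predicts) ** 2)))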
def find_top_k_similar_program(repo_kernel_file, user_prog_graph_dot_file,
                               graph_name, k, num_iter):
    sim = Similarity()
    sim.read_graph_kernels(repo_kernel_file)
    result_program_list_with_score = sim.find_top_k_similar_graphs(
        user_prog_graph_dot_file, graph_name, k, num_iter)
    return result_program_list_with_score
# data = vstack((rand(5, 2) + array([.5, .5]), rand(5, 2)))
# print(data)

# Compute k-means with k = 3 (3 clusters).
centroids, _ = kmeans(data, 3)
print('centroids')
print(centroids)

# Assign each sample to a cluster.
idx, cc = vq(data, centroids)
print('index of each species')
print(cc)

# Some plotting using numpy's logical indexing.
plot(data[idx == 0, 0], data[idx == 0, 1], 'ob',
     data[idx == 1, 0], data[idx == 1, 1], 'or')
plot(centroids[:, 0], centroids[:, 1], 'sg', markersize=8)
# show()

for c in c1:
    for c_ in c2:
        coef = Similarity.intersection(c, c_)
        # print('%d -> %d : %f' % (c.code, c_.code, coef))

# Characteristics of a species:
# for cs in s2.characters():
#     print(cs[0].name + " -> " + cs[1].label)
def main(corpus, annotations):
    """
    SUMMARY: use case of the user-driven functionality of PASCALI.
    Scenario: the user provides the concept of Sequence and the equivalent
    Java types, plus the concept of Sorted Sequence and the relevant type
    invariant.
    Goal: learn how to get from Sequence -> Sorted Sequence.
    """

    """ INPUT: annotations, a dictionary mapping string -> list of strings.
    OUTPUT: recompiles generic-inference-solver with the new annotations.
    """
    run_pa2checker(annotations)

    """ Look for a new mapping from 'ontology concept' -> 'java type' and run
    the Checker Framework. Should be implemented in type_inference.
    Mapping example: Sequence -> java.lang.Array, java.util.List,
    LinkedHashSet, etc.
    INPUT: corpus, a file containing the set of concept -> java_type mappings.
    OUTPUT: a set of jaif files that are merged into the classes. The jaif
    files are stored as default.jaif in each project's directory.
    BODY: this also triggers back-end labeled-graph generation.
    """
    for project in corpus:
        run_inference(project)

    """ Missing step: interact with PA to add a definition of Sorted Sequence,
    a specialization of Sequence that has a sortedness invariant. The
    sortedness invariant gets turned into a Daikon template.
    INPUT: user interaction.
    OUTPUT: type_annotation and type_invariant (for sorted sequence).
    """
    ordering_operator = "<="
    ontology_invariant_file = "TODO_from_Howie.txt"
    with open(ontology_invariant_file, 'w') as f:
        f.write(ordering_operator)
    invariant_name = "TODO_sorted_sequence"
    daikon_pattern_java_file = ontology_to_daikon.create_daikon_invariant(
        ontology_invariant_file, invariant_name)

    """ Find all methods that have one input parameter annotated as Sequence
    and return a variable also annotated as Sequence.
    INPUT: the corpus and the desired annotations on the method signature.
    OUTPUT: a list of methods that have the desired signature.
    NOTE: this is a stub and will be implemented as an LB query in the future.
    """
    sig_methods = find_methods_with_signature(
        corpus, "@ontology.qual.Sequence", ["@ontology.qual.Sequence"])
    print("\n ************")
    print("The following corpus methods have the signature Sequence -> Sequence:")
    for (project, package, clazz, method) in sig_methods:
        print("{}:\t{}.{}.{}".format(project, package, clazz, method))
    print("\n ************")

    """ Search for methods that have a return type annotated with Sequence and
    for which we can establish a sortedness invariant (may be done by LB).
    INPUT: the dtrace file of a project and the daikon_pattern_java_file that
    we want to check on the dtrace file.
    OUTPUT: a list of ppt names that establish the invariant. Here a ppt is a
    Daikon program point, such as test01.TestClass01.sort(int[]):::EXIT.
    Note: this step translates the type_invariant into a Daikon template
    (which is a Java file).
    """
    pattern_class_name = invariant_name
    pattern_class_dir = os.path.join(common.WORKING_DIR, "invClass")
    if os.path.isdir(pattern_class_dir):
        shutil.rmtree(pattern_class_dir)
    os.mkdir(pattern_class_dir)
    cmd = ["javac", "-g", "-classpath", common.get_jar('daikon.jar'),
           daikon_pattern_java_file, "-d", pattern_class_dir]
    common.run_cmd(cmd)

    list_of_methods = []
    for project in corpus:
        dtrace_file = backend.get_dtrace_file_for_project(project)
        if not dtrace_file:
            print("Ignoring folder {} because it does not contain a dtrace file"
                  .format(project))
            continue
        ppt_names = inv_check.find_ppts_that_establish_inv(
            dtrace_file, pattern_class_dir, pattern_class_name)
        methods = set()
        for ppt in ppt_names:
            method_name = ppt[:ppt.find(':::EXIT')]
            methods.add(method_name)
        list_of_methods += [(project, methods)]

    print("\n ************")
    print("The following corpus methods return a sequence sorted by {}:".format(
        ordering_operator))
    for project, methods in list_of_methods:
        if len(methods) > 0:
            print(project)
            for m in methods:
                print("\t{}".format(m))
    print("\n ************")
    shutil.rmtree(pattern_class_dir)

    """ Expansion of the dynamic analysis results:
    find a list of methods that are similar to the ones found above
    (list_of_methods).
    INPUT: list_of_methods, the corpus with labeled graphs generated, and a
    threshold value for similarity.
    OUTPUT: superset_list_of_methods.
    """
    # WENCHAO
    print("Expanding the dynamic analysis results using graph-based similarity:")
    union_set = set()
    for project, methods in list_of_methods:
        # Map the Daikon output on a sort method to the method signature in
        # methods.txt in the generated graphs.
        for m in methods:
            method_name = common.get_method_from_daikon_out(m)
            method_file = common.get_method_path(project)
            dot_name = common.find_dot_name(method_name, method_file)
            if dot_name:
                # Find the right dot file for each method.
                dot_file = common.get_dot_path(project, dot_name)
                # Find all graphs that are similar to it using WL, based on
                # some threshold.
                sys.path.insert(0, 'simprog')
                from similarity import Similarity
                sim = Similarity()
                sim.read_graph_kernels("corpus_kernel.txt")
                top_k = 3
                iter_num = 3
                result_program_list_with_score = sim.find_top_k_similar_graphs(
                    dot_file, 'g', top_k, iter_num)
                print(project + ":")
                print(result_program_list_with_score)
                result_set = set([x[0] for x in result_program_list_with_score])
                # Take the union of all these graphs.
                union_set = union_set | result_set
    print("Expanded set:")
    print([x.split('/')[-4] for x in union_set])

    # Return this set as a list of (project, method) pairs.
    expanded_list = []
    with open("methods.txt", "w") as fo:
        for dot_path in union_set:
            method_summary = common.get_method_summary_from_dot_path(dot_path)
            fo.write(method_summary)
            fo.write("\n")

    """ Update the type annotations for the expanded dynamic analysis results.
    INPUT: superset_list_of_methods, the annotation to be added.
    OUTPUT: nothing.
    EFFECT: updates the type annotations of the methods in
    superset_list_of_methods. This requires some additional checks to make
    sure that the methods actually perform some kind of sorting. Note that we
    do it on the superset because the original list_of_methods might miss many
    implementations, since fuzz testing could not reach them.
    """
    for class_file in []:  # MARTIN
        generated_jaif_file = "TODO"
        insert_jaif.merge_jaif_into_class(class_file, generated_jaif_file)

    """ Ordering of the expanded dynamic analysis results:
    find the k 'best' implementations in the superset of list_of_methods.
    INPUT: superset_list_of_methods, corpus, k.
    OUTPUT: k_list_of_methods.
    Note: the similarity score is used; may consider using other scores,
    e.g., TODO: ???
    """
    # TODO: create an input file for Huascar where each line is formatted like:
    # ../corpus/Sort05/src/Sort05.java::sort(int[]):int[]
    ordering_dir = os.path.join(common.WORKING_DIR, "ordering_results/")
    methods_file = os.path.join(common.WORKING_DIR, 'methods.txt')
    with common.cd(ordering_dir):
        # TODO: generate a proper relevant-methods file.
        cmd = ["./run.sh", "-k", "3", "-t", "typicality", "-f", methods_file]
        common.run_cmd(cmd, print_output=True)

    """ Close the loop and add the best implementation found in the previous
    step back to the ontology.
    INPUT: k_list_of_methods.
    OUTPUT: a patch file for the ontology.
    Worst case: just add the 'best' implementation found in the corpus as a
    blob to the ontology. Best case: generate an equivalent flow-graph in the
    ontology.
    """
    print("TODO")  # ALL