def main(results_dir='/Users/jonathan/eden/no_backup/designs/Ct_8parts_10.2/prediction/results/',
         purple_threshold=12):
    """Build a 0/1 design-pair matrix from score files and prune it.

    Every entry of *results_dir* whose name matches the ``.score`` pattern is
    read. The file name is split on ``_``; field 1 selects the matrix row and
    field 3 the matrix column (both must be in the built-in design list,
    otherwise ``list.index`` raises ``ValueError``). The cell becomes 1 when
    ``how_many_purples_in_file`` reports at least *purple_threshold* for the
    file, else 0.

    The matrix is then reduced in a loop: as long as ``are_all_ones`` rejects
    the degree vector, the first value of the degree vector's last entry is
    handed to ``remove_from_matrix``, the result is passed through
    ``clean_all_zeros`` and the degree vector is recomputed. Finally the
    matrix, the degree vector and the module globals ``coh_names`` and
    ``doc_names`` are printed.

    Args:
        results_dir: directory containing the score files. Defaults to the
            location that used to be hard-coded here.
        purple_threshold: minimum purple count for a pair to be marked 1.
    """
    import numpy as np
    from rosetta_score_files import how_many_purples_in_file
    import os
    import re
    # NOTE(review): these globals are only read here; presumably one of the
    # helper functions called below populates them -- confirm.
    global coh_names, doc_names
    design_list = [
        'ct11', 'ct12', 'ct13', 'ct15', 'ct16', 'ct17', 'ct29', 'ct31', 'ct33',
        'ct36', 'ct38', 'ct41', 'ct44', 'ct45', 'ct46', 'ct47', 'ct48', 'ct49',
        'ct50', 'ct51', 'ct52', 'ct53', 'ct54', 'ct55', 'ct59',
    ]
    mtrx = np.zeros([len(design_list), len(design_list)], dtype=int)
    score_file_list = [x for x in os.listdir(results_dir)
                       if re.match(r'.*\.score', x)]
    for score_file in score_file_list:
        name_fields = score_file.split('_')
        coh_name = name_fields[1]
        doc_name = name_fields[3]
        purple_num = int(how_many_purples_in_file(
            os.path.join(results_dir, score_file)))
        row = design_list.index(coh_name)
        col = design_list.index(doc_name)
        mtrx[row, col] = 1 if purple_num >= purple_threshold else 0

    dof_vec = find_degree_vector(mtrx)
    print(dof_vec)
    print(mtrx)
    mtrx = clean_all_zeros(mtrx)
    while not are_all_ones(dof_vec):
        # NOTE(review): the return value is discarded in the original too;
        # presumably called for a side effect -- confirm.
        new_all_ones(dof_vec)
        # list(...) keeps this working where dict.values() is a view.
        to_remove = list(dof_vec[-1].values())[0]
        mtrx = remove_from_matrix(mtrx, to_remove)
        mtrx = clean_all_zeros(mtrx)
        dof_vec = find_degree_vector(mtrx)
    print(mtrx)
    print(dof_vec)
    print(coh_names)
    print(doc_names)
def main():
    """Collect purple counts for every score file and show them as a heat map.

    Score files are the entries of the current directory whose name matches
    the ``.score`` pattern. A file name is split on ``_VS_``: the part before
    it, minus its first ``_``-separated field, is the column (coh) name; the
    part after it, minus its last ``_``-separated field, is the row (doc)
    name. A name without ``_VS_`` raises ``IndexError``.

    The table is pre-filled with ``-len(doc names)`` so pairs that have no
    score file stay recognisable, then each scored pair is overwritten with
    the integer returned by ``how_many_purples_in_file``. A copy of the table
    is passed to ``show_prediction_heat_map``.
    """
    from pandas import DataFrame
    from rosetta_score_files import how_many_purples_in_file
    import os
    import re

    score_file_list = [x for x in os.listdir('.') if re.match(r'.*\.score', x)]

    # Parse every file name once: (file, coh name, doc name).
    parsed = []
    for score_file in score_file_list:
        vs_parts = score_file.split('_VS_')
        coh_name = '_'.join(vs_parts[0].split('_')[1:])
        doc_name = '_'.join(vs_parts[1].split('_')[:-1])
        parsed.append((score_file, coh_name, doc_name))

    coh_name_list = sorted(set(coh for _, coh, _ in parsed))
    doc_name_list = sorted(set(doc for _, _, doc in parsed))

    # Scalar fill instead of a one-element Series per column: same sentinel
    # value, but it does not depend on length-1 broadcasting.
    df = DataFrame(-len(doc_name_list), index=doc_name_list,
                   columns=coh_name_list)

    for score_file, coh_name, doc_name in parsed:
        purple_num = int(how_many_purples_in_file(score_file))
        # Single .loc assignment; df[col][row] = v may write to a temporary.
        df.loc[doc_name, coh_name] = purple_num

    show_prediction_heat_map(df.copy())
def main():
    """Find the largest sets of mutually non-crossing coh/doc pairs.

    Score files (entries of ``./`` matching the ``.score`` pattern) are named
    so that ``_``-field 1 is the coh name and field 3 the doc name. Two
    tables are filled from them: a binary one (1 when
    ``how_many_purples_in_file`` returns at least 10, else 0) and one with
    the raw counts. Pairs without a score file keep the sentinel
    ``-len(doc names)`` in both tables.

    A graph is built whose nodes are the pairs marked 1; two nodes
    ``(c1, d1)`` and ``(c2, d2)`` are connected when both cross pairs
    ``(c1, d2)`` and ``(c2, d1)`` are marked 0. The maximum-size cliques of
    that graph are ranked with ``best_clique_by_purples``; the best one and
    the best among the cliques least similar to it (per
    ``clique_similarity``) are printed and drawn with
    ``show_clique_heatmap``.

    Raises ``ValueError`` from ``max``/``min`` when the graph has no cliques.
    """
    from pandas import DataFrame
    from rosetta_score_files import how_many_purples_in_file
    import os
    import re
    import networkx as nx

    score_file_list = [x for x in os.listdir('./') if re.match(r'.*\.score', x)]
    coh_name_list = sorted(set(a.split('_')[1] for a in score_file_list))
    doc_name_list = sorted(set(a.split('_')[3] for a in score_file_list))

    # Scalar fill: same sentinel as before without relying on a one-element
    # Series being broadcast to the index length.
    sentinel = -len(doc_name_list)
    df = DataFrame(sentinel, index=doc_name_list, columns=coh_name_list)
    df_true_score = DataFrame(sentinel, index=doc_name_list,
                              columns=coh_name_list)

    for score_file in score_file_list:
        name_fields = score_file.split('_')
        coh_name = name_fields[1]
        doc_name = name_fields[3]
        purple_num = int(how_many_purples_in_file('./' + score_file))
        # .loc[row, col] writes into the frame itself; chained
        # df[col][row] = v is not guaranteed to.
        df.loc[doc_name, coh_name] = 1 if purple_num >= 10 else 0
        df_true_score.loc[doc_name, coh_name] = purple_num

    G = nx.Graph()
    for coh in coh_name_list:
        for doc in doc_name_list:
            if df.loc[doc, coh] == 1:
                G.add_node((coh, doc))

    # Snapshot the nodes: works on networkx 1.x and 2.x (nodes_iter is gone
    # in 2.x) and keeps iteration independent of edge insertion.
    nodes = list(G.nodes())
    for c1, d1 in nodes:
        for c2, d2 in nodes:
            if df.loc[d2, c1] == 0 and df.loc[d1, c2] == 0:
                G.add_edge((c1, d1), (c2, d2))

    cliques = list(nx.find_cliques(G))
    max_len = max(len(a) for a in cliques)
    max_cliques = [a for a in cliques if len(a) == max_len]
    print(len(max_cliques))

    clique_coh_list, clique_doc_list = coh_doc_set_span_maximal_cliques(max_cliques)
    print('cohs that span entire clique list %s' % (clique_coh_list,))
    print('docs that span entire clique list %s' % (clique_doc_list,))

    best_ranker, best_rank = best_clique_by_purples(max_cliques, df_true_score)
    print('best ranker\n%s %s' % (best_ranker, best_rank))

    # Find the cliques least similar to the best-ranked one.
    min_similarity = min(clique_similarity(best_ranker, a) for a in max_cliques)
    min_similars = []
    for clique in max_cliques:
        similarity = clique_similarity(best_ranker, clique)
        if similarity == min_similarity:
            min_similars.append(clique)
    best_min_similar_ranker, best_min_similar_rank = best_clique_by_purples(
        min_similars, df_true_score)
    print('%s %s' % (best_min_similar_ranker, best_min_similar_rank))
    print(min_similarity)

    # True-score heat map for the best-ranked clique.
    show_clique_heatmap(best_ranker, df_true_score, coh_name_list, doc_name_list)

    # True-score heat map for the least similar clique.
    show_clique_heatmap(best_min_similar_ranker, df_true_score, coh_name_list,
                        doc_name_list)
def main(results_dir='/Users/jonathan/eden/no_backup/designs/Ct_8parts_10.2/prediction/results/',
         purple_threshold=12):
    """Threshold pairwise score files into a square 0/1 matrix, then prune it.

    Files in *results_dir* matching the ``.score`` pattern are processed one
    by one. ``_``-field 1 of the file name gives the row design and field 3
    the column design; a name missing from the built-in design list makes
    ``list.index`` raise ``ValueError``. A cell is set to 1 when
    ``how_many_purples_in_file`` returns at least *purple_threshold*,
    otherwise to 0.

    Pruning: until ``are_all_ones`` accepts the degree vector returned by
    ``find_degree_vector``, the first value stored in the vector's last entry
    is removed through ``remove_from_matrix``, the matrix is cleaned with
    ``clean_all_zeros`` and the vector is recomputed. The resulting matrix
    and vector are printed together with the globals ``coh_names`` and
    ``doc_names``.

    Args:
        results_dir: directory to scan for score files; the default is the
            path previously hard-coded in this function.
        purple_threshold: purple count from which a pair counts as 1.
    """
    import numpy as np
    from rosetta_score_files import how_many_purples_in_file
    import os
    import re
    # NOTE(review): only read in this function; presumably filled in by the
    # matrix helpers called below -- confirm.
    global coh_names, doc_names
    design_list = [
        'ct11', 'ct12', 'ct13', 'ct15', 'ct16', 'ct17', 'ct29', 'ct31', 'ct33',
        'ct36', 'ct38', 'ct41', 'ct44', 'ct45', 'ct46', 'ct47', 'ct48', 'ct49',
        'ct50', 'ct51', 'ct52', 'ct53', 'ct54', 'ct55', 'ct59',
    ]
    size = len(design_list)
    mtrx = np.zeros([size, size], dtype=int)

    score_file_list = [
        x for x in os.listdir(results_dir) if re.match(r'.*\.score', x)
    ]
    for score_file in score_file_list:
        parts = score_file.split('_')
        coh_name = parts[1]
        doc_name = parts[3]
        score_path = os.path.join(results_dir, score_file)
        purple_num = int(how_many_purples_in_file(score_path))
        cell = (design_list.index(coh_name), design_list.index(doc_name))
        mtrx[cell] = 1 if purple_num >= purple_threshold else 0

    dof_vec = find_degree_vector(mtrx)
    print(dof_vec)
    print(mtrx)
    mtrx = clean_all_zeros(mtrx)
    while not are_all_ones(dof_vec):
        # NOTE(review): result unused, as in the original -- presumably a
        # side-effecting call; confirm.
        new_all_ones(dof_vec)
        # Materialise the values so indexing also works on dict views.
        to_remove = list(dof_vec[-1].values())[0]
        mtrx = remove_from_matrix(mtrx, to_remove)
        mtrx = clean_all_zeros(mtrx)
        dof_vec = find_degree_vector(mtrx)
    print(mtrx)
    print(dof_vec)
    print(coh_names)
    print(doc_names)
def main():
    """Iteratively prune a binary coh/doc table, showing a heat map per step.

    Score files are the entries of ``./`` matching the ``.score`` pattern;
    ``_``-field 1 of the name is the column (coh) name and field 3 the row
    (doc) name. The table starts filled with ``-len(doc names)`` and every
    scored pair is set to 1 when ``how_many_purples_in_file`` returns at
    least 10, else 0.

    While ``all_dof_ones`` rejects the table, the first value of the last
    entry of ``find_degree_vector(df)`` is removed via ``remove_from_df`` and
    the table is cleaned with ``clean_zeroes``. Each round prints its number,
    the entry that was used and the new degree vector, and displays the table
    with ``show_prediction_heat_map``.
    """
    from pandas import DataFrame
    from rosetta_score_files import how_many_purples_in_file
    import os
    import re

    score_file_list = [x for x in os.listdir('./') if re.match(r'.*\.score', x)]
    coh_name_list = sorted(set(a.split('_')[1] for a in score_file_list))
    doc_name_list = sorted(set(a.split('_')[3] for a in score_file_list))

    # Scalar fill keeps the original sentinel value without depending on a
    # one-element Series being stretched to the index length.
    df = DataFrame(-len(doc_name_list), index=doc_name_list,
                   columns=coh_name_list)

    for score_file in score_file_list:
        name_fields = score_file.split('_')
        coh_name = name_fields[1]
        doc_name = name_fields[3]
        purple_num = int(how_many_purples_in_file('./' + score_file))
        # One .loc assignment instead of chained df[col][row] = v, which may
        # modify a temporary copy.
        df.loc[doc_name, coh_name] = 1 if purple_num >= 10 else 0

    i = 1
    while not all_dof_ones(df):
        dof_vec = find_degree_vector(df)
        # list(...) so the lookup also works when values() is a view.
        df = remove_from_df(df, list(dof_vec[-1].values())[0])
        df = clean_zeroes(df)
        print('printing dof for %i time' % i)
        print(dof_vec[-1])
        print(find_degree_vector(df))
        # The original guarded this with ``i > -1``, which is always true
        # because i starts at 1 and only grows.
        show_prediction_heat_map(df)
        i += 1
def main():
    """Reduce a thresholded coh/doc table step by step and plot each step.

    Every entry of ``./`` matching the ``.score`` pattern is treated as a
    score file whose name, split on ``_``, carries the coh (column) name in
    field 1 and the doc (row) name in field 3. Cells start at the sentinel
    ``-len(doc names)``; a scored pair becomes 1 when
    ``how_many_purples_in_file`` returns 10 or more and 0 otherwise.

    The reduction loop runs until ``all_dof_ones(df)`` is true. Each round
    takes the last entry of ``find_degree_vector(df)``, passes its first
    value to ``remove_from_df``, cleans the table with ``clean_zeroes``,
    prints the round number, that entry and the refreshed degree vector, and
    shows the table through ``show_prediction_heat_map``.
    """
    from pandas import DataFrame
    from rosetta_score_files import how_many_purples_in_file
    import os
    import re

    score_file_list = [x for x in os.listdir('./') if re.match(r'.*\.score', x)]
    coh_name_list = sorted(set(a.split('_')[1] for a in score_file_list))
    doc_name_list = sorted(set(a.split('_')[3] for a in score_file_list))

    # Build the table from a scalar: same fill value as the original
    # one-element Series, but valid regardless of the number of rows.
    fill_value = -len(doc_name_list)
    df = DataFrame(fill_value, index=doc_name_list, columns=coh_name_list)

    for score_file in score_file_list:
        fields = score_file.split('_')
        coh_name, doc_name = fields[1], fields[3]
        purple_num = int(how_many_purples_in_file('./' + score_file))
        # Write through .loc; chained indexing may assign to a copy.
        df.loc[doc_name, coh_name] = 1 if purple_num >= 10 else 0

    i = 1
    while not all_dof_ones(df):
        dof_vec = find_degree_vector(df)
        last_entry = dof_vec[-1]
        # Wrap in list() so this does not depend on values() being a list.
        df = remove_from_df(df, list(last_entry.values())[0])
        df = clean_zeroes(df)
        print('printing dof for %i time' % i)
        print(last_entry)
        print(find_degree_vector(df))
        # Previously wrapped in ``if i > -1``; i is never below 1, so the
        # check was a no-op.
        show_prediction_heat_map(df)
        i += 1
def main():
    """Search for maximum cliques of compatible coh/doc pairs and plot two.

    Input is every entry of ``./`` matching the ``.score`` pattern. The file
    name, split on ``_``, holds the coh name in field 1 and the doc name in
    field 3. ``how_many_purples_in_file`` supplies a count per file, stored
    raw in one table and thresholded (1 if at least 10, else 0) in another;
    cells with no file keep the sentinel ``-len(doc names)``.

    Pairs marked 1 become graph nodes. Nodes ``(c1, d1)`` and ``(c2, d2)``
    get an edge when the cross pairs ``(c1, d2)`` and ``(c2, d1)`` are both
    marked 0. Among the cliques of maximum size, ``best_clique_by_purples``
    selects the best one; the cliques with minimal ``clique_similarity`` to
    it are ranked the same way. Both winners are printed and rendered with
    ``show_clique_heatmap``.

    Raises ``ValueError`` from ``max``/``min`` when no clique exists.
    """
    from pandas import DataFrame
    from rosetta_score_files import how_many_purples_in_file
    import os
    import re
    import networkx as nx

    score_file_list = [x for x in os.listdir('./') if re.match(r'.*\.score', x)]
    coh_name_list = sorted(set(a.split('_')[1] for a in score_file_list))
    doc_name_list = sorted(set(a.split('_')[3] for a in score_file_list))

    def new_table():
        # Scalar-filled frame: same sentinel as the original one-element
        # Series, without relying on it being broadcast over the index.
        return DataFrame(-len(doc_name_list), index=doc_name_list,
                         columns=coh_name_list)

    df = new_table()
    df_true_score = new_table()

    for score_file in score_file_list:
        fields = score_file.split('_')
        coh_name, doc_name = fields[1], fields[3]
        purple_num = int(how_many_purples_in_file('./' + score_file))
        # .loc[row, col] assigns in place; chained df[col][row] = v can end
        # up writing to a temporary.
        df.loc[doc_name, coh_name] = 1 if purple_num >= 10 else 0
        df_true_score.loc[doc_name, coh_name] = purple_num

    G = nx.Graph()
    for coh in coh_name_list:
        for doc in doc_name_list:
            if df.loc[doc, coh] == 1:
                G.add_node((coh, doc))

    # list(G.nodes()) is valid on networkx 1.x and 2.x, unlike nodes_iter().
    node_pairs = list(G.nodes())
    for c1, d1 in node_pairs:
        for c2, d2 in node_pairs:
            if df.loc[d2, c1] == 0 and df.loc[d1, c2] == 0:
                G.add_edge((c1, d1), (c2, d2))

    cliques = list(nx.find_cliques(G))
    max_len = max(len(a) for a in cliques)
    max_cliques = [a for a in cliques if len(a) == max_len]
    print(len(max_cliques))

    clique_coh_list, clique_doc_list = coh_doc_set_span_maximal_cliques(
        max_cliques)
    print('cohs that span entire clique list %s' % (clique_coh_list,))
    print('docs that span entire clique list %s' % (clique_doc_list,))

    best_ranker, best_rank = best_clique_by_purples(max_cliques, df_true_score)
    print('best ranker\n%s %s' % (best_ranker, best_rank))

    # Find the cliques least similar to the best-ranked one.
    min_similarity = min(
        clique_similarity(best_ranker, a) for a in max_cliques)
    min_similars = []
    for clique in max_cliques:
        similarity = clique_similarity(best_ranker, clique)
        if similarity == min_similarity:
            min_similars.append(clique)
    best_min_similar_ranker, best_min_similar_rank = best_clique_by_purples(
        min_similars, df_true_score)
    print('%s %s' % (best_min_similar_ranker, best_min_similar_rank))
    print(min_similarity)

    # True-score heat map for the best-ranked clique.
    show_clique_heatmap(best_ranker, df_true_score, coh_name_list,
                        doc_name_list)

    # True-score heat map for the least similar clique.
    show_clique_heatmap(best_min_similar_ranker, df_true_score, coh_name_list,
                        doc_name_list)