def multi_class_misprediction_freq(res_folder):
    """
    Print how often multi-genre instances are mispredicted.

    Only instances with more than one distinct actual genre are considered.
    Such an instance is treated as mispredicted when its set of actual genres
    is not fully contained in the first ``len(actual)`` predictions.

    :param res_folder: location passed to ``WrongResultsIter.load_iter_from_file``
        and ``RightResultsIter.load_iter_from_file``.
    :return: None. Prints the (actual genres, predicted genres) pairs sorted by
        descending frequency, followed by ``right_count``.
    """
    print("Loading Iter")
    wrong_res_iter = WrongResultsIter.load_iter_from_file(res_folder)
    right_res_iter = RightResultsIter.load_iter_from_file(res_folder)

    genre_to_wrong_genre_count = coll.Counter()
    right_count = 0

    for c, res_obj in enumerate(it.chain(wrong_res_iter, right_res_iter)):
        # progress indicator
        if c % 500 == 0:
            print(c)

        actual = set(res_obj.actual)

        # check multiple genres
        if len(actual) > 1:
            if not actual <= set(res_obj.predicted[:len(actual)]):
                # Sort the labels so the same genre combination always maps to
                # the same Counter key; a tuple built straight from a set
                # follows the set's iteration order, which is not stable.
                key = (tuple(sorted(actual)), tuple(res_obj.predicted))
                genre_to_wrong_genre_count[key] += 1
            # NOTE(review): the nesting level of this increment could not be
            # recovered from the source layout; it is placed inside the
            # multi-genre branch (so it counts every multi-genre instance).
            # Confirm against the intended meaning of "right_count".
            right_count += 1

    # sort the whole thing, most frequent misprediction first
    sorted_genre_to_wrong = sorted(
        genre_to_wrong_genre_count.items(),
        key=op.itemgetter(1),
        reverse=True,
    )
    print(sorted_genre_to_wrong)
    print(right_count)
def frequently_predicted_class(res_path, top_x=2):
    """
    Print the most frequently co-occurring classes among multi-genre instances.

    Two counters are built over every result whose actual labels contain more
    than one distinct genre: one keyed by the sorted top ``top_x`` predictions,
    one keyed by the sorted distinct actual genres. Keys are sorted tuples so
    the same combination is never counted under two different orderings.

    :param res_path: location passed to ``WrongResultsIter.load_iter_from_file``
        and ``RightResultsIter.load_iter_from_file``.
    :param top_x: how many leading predictions form the predicted key.
    :return: None. Prints both counters.
    """
    print("Loading Iter")
    wrong_res_iter = WrongResultsIter.load_iter_from_file(res_path)
    right_res_iter = RightResultsIter.load_iter_from_file(res_path)

    predicted_counter = coll.Counter()
    actual_counter = coll.Counter()

    # Single pass: both counters see exactly the same instances, and the
    # loaded iterators are only traversed once (a second traversal would be
    # empty if they turn out to be one-shot iterators).
    for res_obj in it.chain(wrong_res_iter, right_res_iter):
        actual = set(res_obj.actual)
        # Multi-genre means more than one *distinct* label; the same filter is
        # applied to both counters.
        if len(actual) <= 1:
            continue

        predicted_counter[tuple(sorted(res_obj.predicted[:top_x]))] += 1
        actual_counter[tuple(sorted(actual))] += 1

    print("Predicted")
    print(predicted_counter)

    print("Actual")
    print(actual_counter)
def single_class_mispredition_freq(res_path, save_path=None):
    """
    Plot the frequency of mispredictions for single-genre instances.

    For every result with exactly one actual genre whose first prediction
    differs from it, the (actual genre, predicted genre) pair is counted. The
    counts are handed to ``plot_word_frequency`` and the figure is saved with
    ``save_fig``.

    :param res_path: location passed to ``WrongResultsIter.load_iter_from_file``
        and ``RightResultsIter.load_iter_from_file``.
    :param save_path: where to save the figure. When omitted, the location
        that used to be hard-coded in this function is used.
    :return: None.
    """
    if save_path is None:
        # Default output location kept unchanged from the original code.
        save_path = "C:\\\\Users\\\\Kevin\\\\Desktop\\\\GitHub\\\\Research\\\\Webscraper\\\\classification_res\\\\genre_analysis\\\\single_miss.pdf"

    print("Loading Iter")
    wrong_res_iter = WrongResultsIter.load_iter_from_file(res_path)
    right_res_iter = RightResultsIter.load_iter_from_file(res_path)

    genre_to_wrong_genre_count = coll.Counter()

    for c, res_obj in enumerate(it.chain(wrong_res_iter, right_res_iter)):
        # progress indicator
        if c % 500 == 0:
            print(c)

        actual = res_obj.actual

        # single genre whose top prediction is wrong
        if len(actual) == 1 and actual[0] != res_obj.predicted[0]:
            genre_to_wrong_genre_count[(actual[0], res_obj.predicted[0])] += 1

    # plot
    figure = plot_word_frequency("Single Genre Mispredition", genre_to_wrong_genre_count)
    figure.tight_layout()
    save_fig(save_path, figure)
def top_level_cdf(res_folder):
    """
    Print how many results have each number of distinct actual genres.

    :param res_folder: location passed to ``WrongResultsIter.load_iter_from_file``
        and ``RightResultsIter.load_iter_from_file``.
    :return: None. Prints a dict mapping "number of distinct actual genres"
        to the number of results with that many genres.
    """
    print("Loading Iter")
    wrong_results = WrongResultsIter.load_iter_from_file(res_folder)
    right_results = RightResultsIter.load_iter_from_file(res_folder)

    histogram = {}
    for result in it.chain(wrong_results, right_results):
        genre_total = len(set(result.actual))
        if genre_total in histogram:
            histogram[genre_total] += 1
        else:
            histogram[genre_total] = 1

    print(histogram)
def consensus_class(res_path, top_prediction=2, filter_func=lambda x:len(x)==4, total_per_instance=6):
    """
    Print how strongly the predictions agree with each of an instance's classes.

    Result objects are grouped by ``ref_id``. For every instance whose distinct
    actual genres pass ``filter_func``, each genre's hit count is the number of
    the instance's result objects that list the genre among their first
    ``top_prediction`` predictions. The hit counts are sorted in descending
    order, and the count at each rank (0 = most agreed-upon genre) is
    accumulated across instances.

    :param res_path: location passed to ``RightResultsIter.load_iter_from_file``.
    :param top_prediction: how many leading predictions count as a hit.
    :param filter_func: predicate receiving the tuple of distinct actual genres;
        instances for which it returns a falsy value are skipped.
    :param total_per_instance: amount added to every rank's total for each
        instance processed. Defaults to 6, the value previously hard-coded.
        NOTE(review): presumably the number of result objects per instance;
        confirm.
    :return: None. Prints the sorted (rank, hits) pairs, then the sorted
        (rank, total) pairs.
    """
    # rank of a class within its instance -> accumulated agreements / totals
    consensus_count = coll.defaultdict(int)
    consensus_total = coll.defaultdict(int)

    print("Loading Iter")
    right_res_iter = RightResultsIter.load_iter_from_file(res_path)

    # gather together all of an instance's result objects under its id
    right_res_instances = coll.defaultdict(list)
    for res_obj in right_res_iter:
        right_res_instances[res_obj.ref_id].append(res_obj)

    for list_of_res_obj in right_res_instances.values():
        actual = tuple(set(list_of_res_obj[0].actual))

        # Todo:change here
        if not filter_func(actual):
            continue

        # Build each result's top-prediction set once rather than once per genre.
        top_predictions = [
            set(res_obj.predicted[:top_prediction]) for res_obj in list_of_res_obj
        ]

        # hits per genre, ordered from most to least agreed-upon
        genre_by_consensus = sorted(
            (sum(g in top for top in top_predictions) for g in actual),
            reverse=True,
        )

        for rank, hits in enumerate(genre_by_consensus):
            consensus_count[rank] += hits
            consensus_total[rank] += total_per_instance

    print(sorted(consensus_count.items()))
    print(sorted(consensus_total.items()))
def consensus_class_per_genre(res_path, top_prediction=2, filter_func=lambda x:len(x)==4, total_per_instance=6):
    """
    Collect, per rank and per genre, how strongly the predictions agree.

    Result objects are grouped by ``ref_id``. For every instance whose distinct
    actual genres pass ``filter_func``, each genre's hit count is the number of
    the instance's result objects that list the genre among their first
    ``top_prediction`` predictions. Genres are ranked by descending hit count
    and the count is appended under ``[rank][genre]``.

    :param res_path: location passed to ``RightResultsIter.load_iter_from_file``.
    :param top_prediction: how many leading predictions count as a hit.
    :param filter_func: predicate receiving the tuple of distinct actual genres;
        instances for which it returns a falsy value are skipped.
    :param total_per_instance: value appended to the totals for each ranked
        genre. Defaults to 6, the value previously hard-coded.
        NOTE(review): presumably the number of result objects per instance;
        confirm.
    :return: ``(consensus_count, consensus_total)``, two nested defaultdicts
        mapping rank -> genre -> list of hit counts / list of totals.
    """
    # rank -> genre -> list of agreements / totals, one entry per instance
    consensus_count = coll.defaultdict(lambda: coll.defaultdict(list))
    consensus_total = coll.defaultdict(lambda: coll.defaultdict(list))

    print("Loading Iter")
    right_res_iter = RightResultsIter.load_iter_from_file(res_path)

    # gather together all of an instance's result objects under its id
    right_res_instances = coll.defaultdict(list)
    for res_obj in right_res_iter:
        right_res_instances[res_obj.ref_id].append(res_obj)

    for list_of_res_obj in right_res_instances.values():
        actual = tuple(set(list_of_res_obj[0].actual))

        # Todo:change here
        if not filter_func(actual):
            continue

        # Build each result's top-prediction set once rather than once per genre.
        top_predictions = [
            set(res_obj.predicted[:top_prediction]) for res_obj in list_of_res_obj
        ]
        genre_consensus = {
            g: sum(g in top for top in top_predictions) for g in actual
        }

        # Highest consensus first. Ties are broken by genre label so the rank
        # a genre lands in does not depend on set iteration order.
        ranked = sorted(
            genre_consensus.items(),
            key=lambda item: (-item[1], item[0]),
        )

        for index, (g, count) in enumerate(ranked):
            consensus_count[index][g].append(count)
            consensus_total[index][g].append(total_per_instance)

    return consensus_count, consensus_total