def write_prediction_to_file(prediction_seq, label_seq, overlapped_predictions, output, options_dict):
    crf_sequence_length = options_dict['crf_sequence_length']
    sliding_window_length = options_dict['sliding_window_length']

    # Earlier approach, kept disabled for reference:
    '''
    if len(overlapped_predictions) == 0:
        if crf_sequence_length != len(prediction_seq):
            # write from 0 to len(prediction_seq)
            return []
        else:
            # write from 0 to (len(prediction_seq) - sliding_window_length)
            new_start = crf_sequence_length - sliding_window_length
            return prediction_seq[new_start:len(prediction_seq)]
    '''

    # Voting mechanism: overlap between subsequent sequences is handled as follows.
    #   1. For the overlap received from the previous sequence, vote and write the winner.
    #   2. For the non-overlapping region, write the predictions as they are.
    #   3. If the sequence is not cut off, it shares an overlap with the next sequence,
    #      so that overlap is returned and passed into the next call of this method for voting.
    # Assumption: the prediction sequence is at least as long as the overlapped predictions.
    # This fails if the sequence is cut off exactly at the configured length
    # (crf_sequence_length). An exact way of finding overlaps would need an additional
    # feature in the input file indicating whether a sequence shares an overlap with the next one.
    for i in range(0, len(overlapped_predictions)):
        if prediction_seq[i] != overlapped_predictions[i]:
            print("applying voting. i:", i, ", prediction_seq[i]:", prediction_seq[i],
                  ", overlapped_predictions[i]:", overlapped_predictions[i],
                  ", label_seq[i]:", label_seq[i])
            # Vote by how dominant each candidate label is within its own sequence.
            similarity_cur = util.get_similarity(prediction_seq[i], prediction_seq)
            similarity_prev = util.get_similarity(overlapped_predictions[i], overlapped_predictions)
            print("similarity_cur:", similarity_cur, ", similarity_prev:", similarity_prev)
            if similarity_prev < similarity_cur:
                output.write(prediction_seq[i] + "," + label_seq[i] + "\n")
            else:
                output.write(overlapped_predictions[i] + "," + label_seq[i] + "\n")
        else:
            output.write(prediction_seq[i] + "," + label_seq[i] + "\n")

    # Write the region that does not overlap with the next sequence.
    no_overlap_region = min(crf_sequence_length - sliding_window_length, len(prediction_seq))
    for i in range(len(overlapped_predictions), no_overlap_region):
        output.write(prediction_seq[i] + "," + label_seq[i] + "\n")

    # If the sequence ended within the non-overlap region, there is nothing to pass on.
    if no_overlap_region == len(prediction_seq):
        return []

    # Return the tail that overlaps with the next sequence, for voting in the next call.
    return prediction_seq[no_overlap_region:crf_sequence_length]
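
# A minimal usage sketch of the chunk-to-chunk overlap handoff above. Everything here is
# illustrative: the toy label sequences are made up, and the fraction-based stub below
# stands in for the project's util.get_similarity (assumed semantics: how dominant a
# label is within its own sequence). Drop the stub if the real util module is importable.
import io

class util:  # hypothetical stand-in for the project's util module
    @staticmethod
    def get_similarity(label, seq):
        return seq.count(label) / float(len(seq))

options_dict = {'crf_sequence_length': 6, 'sliding_window_length': 2}
output = io.StringIO()

# Full-length chunk: its last sliding_window_length predictions are returned as overlap.
overlap = write_prediction_to_file(list("AABBAB"), list("aabbab"), [], output, options_dict)
# Final, shorter chunk: its first len(overlap) positions are voted against `overlap`.
overlap = write_prediction_to_file(list("BBA"), list("xxy"), overlap, output, options_dict)
assert overlap == []  # the sequence ended inside the non-overlap region
print(output.getvalue())  # A,a A,a B,b B,b B,x B,x A,y (one prediction,label pair per line)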
def process_feedback_movie_vector(feedback, input_movie_ids, movies_list, movies_df):
    # Process feedback to get the relevant movies and the movies to be excluded.
    relevant_movies, movie_to_exclude = util.process_feedback(feedback, input_movie_ids)
    relevant_movie_count = len(relevant_movies)

    # If all recommended movies were relevant, leave the suggestions unchanged.
    if relevant_movie_count == 5:
        print("\nAll the movies were relevant, hence no modification to the suggestion")
        return

    # Fetch data frames for the relevant movies and the feedback movies.
    relevant_movies_df = movies_df.loc[relevant_movies]
    feedback_movies_df = movies_df.loc[list(feedback.keys())]

    modified_query = util.probabilistic_feedback_query(
        feedback_movies_df, relevant_movies_df, movies_list, relevant_movie_count)
    similarity_matrix = util.get_similarity(movies_df, modified_query)
    return similarity_matrix, movie_to_exclude
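
# Caller-facing subtlety worth noting: the all-relevant early return above yields None
# rather than a (similarity_matrix, movie_to_exclude) pair, so callers must branch.
# A hedged usage sketch; the ids and 0/1 judgments below are invented, and movies_list /
# movies_df are assumed to come from the surrounding recommender code.
feedback = {101: 1, 102: 0, 103: 1, 104: 0, 105: 1}  # judgments on the 5 recommendations
input_movie_ids = [101, 102, 103, 104, 105]
result = process_feedback_movie_vector(feedback, input_movie_ids, movies_list, movies_df)
if result is None:
    pass  # all five movies were relevant; keep the original suggestions
else:
    similarity_matrix, movie_to_exclude = result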
def update_similarity_and_mmr(hyp, importances, batch, enc_tokens, vocab):
    summ_sents, summ_tokens = get_summ_sents_and_tokens(hyp.tokens, batch, vocab)
    hyp.similarity = get_similarity(enc_tokens, summ_tokens, vocab)
    hyp.mmr = calc_mmr_from_sim_and_imp(hyp.similarity, importances)
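
# calc_mmr_from_sim_and_imp is defined elsewhere in this codebase. For orientation only,
# the classic Maximal Marginal Relevance trade-off (Carbonell & Goldstein, 1998) combines
# the same two inputs as below; this is the textbook formula, not necessarily the exact
# one the repo implements.
import numpy as np

def classic_mmr(importances, similarity, lambda_=0.6):
    # Reward relevance (importance), penalize redundancy with the summary so far.
    return lambda_ * np.asarray(importances) - (1.0 - lambda_) * np.asarray(similarity)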
import os

import numpy as np
import PIL.Image

# FLAGS, plot_importances, get_summ_sents_and_tokens, get_similarity, and
# calc_mmr_from_sim_and_imp are provided by the surrounding module.

def save_distribution_plots(importances, enc_sentences, enc_tokens, hyp, batch, vocab, ex_index, sort=True):
    enc_sentences_str = [' '.join(sent) for sent in enc_sentences]
    summ_sents, summ_tokens = get_summ_sents_and_tokens(hyp.tokens, batch, vocab)
    if sort:
        sort_order = np.argsort(importances, 0)[::-1]
    mmr_for_sentences = None
    for sent_idx in range(0, len(summ_sents)):
        # Score the source sentences against the partial summary produced so far.
        cur_summ_sents = summ_sents[:sent_idx]
        cur_summ_tokens = summ_tokens[:sent_idx]
        summ_str = ' '.join([' '.join(sent) for sent in cur_summ_sents])
        similarity_amount = get_similarity(enc_tokens, cur_summ_tokens, vocab)
        if FLAGS.pg_mmr:
            mmr_for_sentences = calc_mmr_from_sim_and_imp(similarity_amount, importances)
        else:
            mmr_for_sentences = None  # don't use MMR if no sentence-level option is used

        distr_dir = os.path.join(FLAGS.log_root, 'mmr_distributions')
        if not os.path.exists(distr_dir):
            os.makedirs(distr_dir)

        # Save the raw distributions for this decoding step.
        base_name = '%06d_decoded_%d_sent' % (ex_index, sent_idx)
        np.savez(os.path.join(distr_dir, base_name), mmr=mmr_for_sentences,
                 importances=importances, enc_sentences=enc_sentences, summ_str=summ_str)

        # Plot each available distribution over the source sentences.
        distributions = [('similarity', similarity_amount),
                         ('importance', importances),
                         ('mmr', mmr_for_sentences)]
        for distr_str, distribution in distributions:
            if distribution is None:
                continue  # mmr is unavailable when FLAGS.pg_mmr is off
            if sort:
                distribution = distribution[sort_order]
            save_name = '%06d_decoded_%s_%d_sent' % (ex_index, distr_str, sent_idx)
            plot_importances(enc_sentences_str, distribution, summ_str,
                             save_location=distr_dir, save_name=save_name)

        # Stitch this step's per-distribution plots into a single combined image,
        # then remove the individual files.
        img_file_names = sorted([
            file_name for file_name in os.listdir(distr_dir)
            if '%06d_decoded' % ex_index in file_name
            and '_%d_sent' % sent_idx in file_name
            and 'jpg' in file_name and 'combined' not in file_name
        ])
        imgs = [PIL.Image.open(os.path.join(distr_dir, file_name)) for file_name in img_file_names]
        # Resize everything to the largest image's size before stacking vertically.
        max_shape = sorted([(np.sum(i.size), i.size) for i in imgs])[-1][1]
        combined_img = np.vstack([np.asarray(i.resize(max_shape)) for i in imgs])
        combined_img = PIL.Image.fromarray(combined_img)
        combined_img.save(os.path.join(distr_dir, base_name + '_combined.jpg'))
        for file_name in img_file_names:
            os.remove(os.path.join(distr_dir, file_name))
    return mmr_for_sentences
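
# Standalone sketch of the stitch-and-stack step above, since it is the one tricky
# numpy/PIL interaction: np.vstack needs a concrete list of equally shaped arrays
# (passing a generator is deprecated in modern NumPy), so every image is resized to a
# common (width, height) first. Paths are placeholders; images are assumed to share
# one mode (e.g. RGB).
import numpy as np
import PIL.Image

def stack_vertically(paths, out_path):
    imgs = [PIL.Image.open(p) for p in paths]
    # Use the (width, height) of the largest image as the common size.
    max_shape = max(imgs, key=lambda im: im.size[0] * im.size[1]).size
    stacked = np.vstack([np.asarray(im.resize(max_shape)) for im in imgs])
    PIL.Image.fromarray(stacked).save(out_path)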