def get_last_accuracy_in_sequence(estimator, n_votes_to_sample, texts, vote_lists, truths, X, text_similarity, idx=None, return_final=False, *args):
    """Randomly sample votes and re-calculate estimates.

    Draws one shuffled vote for a random document per step until
    n_votes_to_sample votes are known, then runs `estimator` on the
    accumulated votes and returns its accuracy against `truths`.

    Returns None when a chosen document has no votes left, or when the
    estimator raises (the traceback is printed).
    """
    random.seed()
    unknown_votes = copy_and_shuffle_sublists(vote_lists)
    known_votes = [[] for _ in unknown_votes]

    # This is a crowdsourcing procedure: draw one vote per iteration.
    for index in xrange(n_votes_to_sample):
        # Draw one vote for a random document.
        updated_doc_idx = random.randrange(len(vote_lists))
        if not unknown_votes[updated_doc_idx]:
            # We ran out of votes for this document, disregard this sequence.
            return None
        vote = unknown_votes[updated_doc_idx].pop()
        known_votes[updated_doc_idx].append(vote)

    # Calculate the estimates from all sampled votes and score them.
    try:
        estimates = estimator(texts, known_votes, X, text_similarity, *args)
        return get_accuracy(estimates, truths)
    except Exception as e:
        traceback.print_exc()
        return None
def get_last_accuracy_in_sequence(estimator, n_votes_to_sample, texts, vote_lists, truths, X, text_similarity, idx=None, return_final=False, *args):
    """Randomly sample votes and re-calculate estimates.

    Draws one shuffled vote for a random document per step until
    n_votes_to_sample votes are known, then runs `estimator` on the
    accumulated votes and returns its accuracy against `truths`.

    Returns None when a chosen document has no votes left, or when the
    estimator raises (the traceback is printed).
    """
    random.seed()
    unknown_votes = copy_and_shuffle_sublists(vote_lists)
    known_votes = [[] for _ in unknown_votes]

    # This is a crowdsourcing procedure: draw one vote per iteration.
    for index in xrange(n_votes_to_sample):
        # Draw one vote for a random document.
        updated_doc_idx = random.randrange(len(vote_lists))
        if not unknown_votes[updated_doc_idx]:
            # We ran out of votes for this document, disregard this sequence.
            return None
        vote = unknown_votes[updated_doc_idx].pop()
        known_votes[updated_doc_idx].append(vote)

    # Calculate the estimates from all sampled votes and score them.
    try:
        estimates = estimator(texts, known_votes, X, text_similarity, *args)
        return get_accuracy(estimates, truths)
    except Exception as e:
        traceback.print_exc()
        return None
def sample_and_minimise_entropy(final_estimator, estimator, n_votes_to_sample, texts, vote_lists, truths, X, text_similarity, idx=None, return_final=False, *args):
    """Active learning scenario, where we decide at each step what document to pick.

    Then we ask a worker to label it for us. Our hypothesis is that it should
    be better than randomly asking for labels of some documents. We're only
    picking docs that minimise the uncertainty in the system.

    Returns the accuracy of `final_estimator` on the accumulated votes, or
    None when the final estimation fails (traceback is printed).
    """
    random.seed()
    unknown_votes = copy_and_shuffle_sublists(vote_lists)
    known_votes = [[] for _ in unknown_votes]

    random_vote_taken = False
    doc_to_be_sampled = None

    # Initialisation: keep trying random documents until one vote is taken.
    while not random_vote_taken:
        updated_doc_idx = random.randrange(len(vote_lists))
        if not unknown_votes[updated_doc_idx]:
            continue
        # Note that the vote is not replaced here for now.
        # Since we're dealing with a low budget scenario, this should be fine.
        vote = unknown_votes[updated_doc_idx].pop()
        known_votes[updated_doc_idx].append(vote)
        random_vote_taken = True

    for index in xrange(n_votes_to_sample):
        if doc_to_be_sampled is not None:
            # Take a worker vote for the document chosen in the last round.
            try:
                vote = unknown_votes[doc_to_be_sampled].pop()
                known_votes[doc_to_be_sampled].append(vote)
            except IndexError:
                vote = None
                # All real votes exhausted for this document.
                # TODO(review): falling back to a random vote - can we do better?
                known_votes[doc_to_be_sampled].append(bool(random.randint(0, 1)))
            labels = estimator(texts, known_votes, X, text_similarity, *args)
            known_votes = add_lambda_votes_to_vote_lists(doc_to_be_sampled, known_votes, labels)

        last_iter_entropy = float("inf")
        # Pick the document that minimises the expected system entropy.
        for doc_index, doc_vote_list in enumerate(known_votes):
            copy_known_votes = copy.deepcopy(known_votes)

            # Hypothetically add a positive (relevant) vote to this document.
            # At this point, we can either add full votes, or a portion of
            # those. Consider GP, we can add the probability it outputs.
            copy_known_votes[doc_index].append(True)
            labels = estimator(texts, copy_known_votes, X, text_similarity, *args)
            known_votes_plus_labels = add_lambda_votes_to_vote_lists(doc_index, copy_known_votes, labels)
            relevance_label_added_system_entropy = get_system_entropy(known_votes_plus_labels)

            # Now hypothetically add a negative (non-relevant) vote instead.
            copy_known_votes[doc_index].pop(-1)
            copy_known_votes[doc_index].append(False)
            labels = estimator(texts, copy_known_votes, X, text_similarity, *args)
            known_votes_plus_labels = add_lambda_votes_to_vote_lists(doc_index, copy_known_votes, labels)
            non_relevance_label_added_system_entropy = get_system_entropy(known_votes_plus_labels)

            # Average entropy of the system over both hypothetical outcomes.
            doc_avg_system_entropy = (relevance_label_added_system_entropy + non_relevance_label_added_system_entropy) / 2

            # Restore the state of the copied vote list.
            copy_known_votes[doc_index].pop()

            if doc_avg_system_entropy < last_iter_entropy:
                doc_to_be_sampled = doc_index
                last_iter_entropy = doc_avg_system_entropy

    # Consume the document selection made in the final round.
    if doc_to_be_sampled is not None:
        try:
            vote = unknown_votes[doc_to_be_sampled].pop()
            known_votes[doc_to_be_sampled].append(vote)
        except IndexError:
            vote = None
            # TODO(review): falling back to a random vote - can we do better?
            known_votes[doc_to_be_sampled].append(bool(random.randint(0, 1)))
        labels = estimator(texts, known_votes, X, text_similarity, *args)
        known_votes = add_lambda_votes_to_vote_lists(doc_to_be_sampled, known_votes, labels)

    try:
        estimates = final_estimator(texts, known_votes, X, text_similarity, *args)
        return get_accuracy(estimates, truths)
    except Exception as e:
        traceback.print_exc()
        return None
def sample_gp_variance_min_entropy(estimator, n_votes_to_sample, texts, vote_lists, truths, X, text_similarity, idx=None, return_final=False, *args): """ Randomly sample votes and re-calculate estimates. """ random.seed() unknown_votes = copy_and_shuffle_sublists(vote_lists) known_votes = [ [] for _ in unknown_votes ] estimates = [None for _ in vote_lists] accuracy_sequence = [None] * n_votes_to_sample curr_doc_selected = None # This is a crowdsourcing procedure for index in xrange(n_votes_to_sample): # Counter # sys.stderr.write(str(index)+'\n') max_variance_seen = 0 # Draw one vote for a random document if curr_doc_selected is None: updated_doc_idx = random.randrange(len(vote_lists)) if not unknown_votes[updated_doc_idx]: # We ran out of votes for this document, diregard this sequence return None vote = unknown_votes[updated_doc_idx].pop() known_votes[updated_doc_idx].append(vote) else: #print "Selected doc number ", curr_doc_selected try: vote = unknown_votes[curr_doc_selected].pop() except IndexError: # We ran out of votes for this document, disregard this. 
return None known_votes[curr_doc_selected].append(vote) print known_votes estimates = estimator(texts, known_votes, X, text_similarity, *args) #print estimates #sorted_estimates = sorted(enumerate(estimates), key=lambda x: x[1][1]) estimates = list(estimates) #print len(estimates) num_votes_step = sum(map(lambda x: bool(x), known_votes))/ len(unknown_votes) print num_votes_step possibilities = filter(lambda x: len(known_votes[x[0]]) < num_votes_step + 1 ,enumerate(estimates)) #print possibilities, len(possibilities), list(enumerate(estimates)) #Just need to get the document index, which is element[0] for enumerate(estimates) try: curr_doc_selected = get_best_sample(possibilities,[x[1][1] for x in possibilities])[0] #curr_doc_selected = get_weighted_sample(possibilities,[x[1][1] for x in possibilities])[0] except: print "Excepted" curr_doc_selected = get_best_sample(enumerate(estimates),[x[1] for x in estimates])[0] #curr_doc_selected = get_weighted_sample(enumerate(estimates),[x[1] for x in estimates])[0] print curr_doc_selected #curr_doc_selected = random.choice([element[0] for element in sorted_estimates][:5]) #for doc_index, (label, variance) in enumerate(estimates): #print doc_index, label, variance #if variance > max_variance_seen: # max_variance_seen = variance # curr_doc_selected = doc_index # Calculate all the estimates try: estimates = estimator(texts, known_votes, X, text_similarity, *args) labels = [x[0] for x in estimates] #sys.stderr.write('Success\n') return get_accuracy(labels, truths) except Exception, e: traceback.print_exc() #sys.stdout.write('Fail\n') return None
def sample_and_minimise_entropy(final_estimator, estimator, n_votes_to_sample, texts, vote_lists, truths, X, text_similarity, idx=None, return_final=False, *args):
    """Active learning scenario, where we decide at each step what document to pick.

    Then we ask a worker to label it for us. Our hypothesis is that it should
    be better than randomly asking for labels of some documents. We're only
    picking docs that minimise the uncertainty in the system.

    Returns the accuracy of `final_estimator` on the accumulated votes, or
    None when the final estimation fails (traceback is printed).
    """
    random.seed()
    unknown_votes = copy_and_shuffle_sublists(vote_lists)
    known_votes = [[] for _ in unknown_votes]

    random_vote_taken = False
    doc_to_be_sampled = None

    # Initialisation: keep trying random documents until one vote is taken.
    while not random_vote_taken:
        updated_doc_idx = random.randrange(len(vote_lists))
        if not unknown_votes[updated_doc_idx]:
            continue
        # Note that the vote is not replaced here for now.
        # Since we're dealing with a low budget scenario, this should be fine.
        vote = unknown_votes[updated_doc_idx].pop()
        known_votes[updated_doc_idx].append(vote)
        random_vote_taken = True

    for index in xrange(n_votes_to_sample):
        if doc_to_be_sampled is not None:
            # Take a worker vote for the document chosen in the last round.
            try:
                vote = unknown_votes[doc_to_be_sampled].pop()
                known_votes[doc_to_be_sampled].append(vote)
            except IndexError:
                vote = None
                # All real votes exhausted for this document.
                # TODO(review): falling back to a random vote - can we do better?
                known_votes[doc_to_be_sampled].append(bool(random.randint(0, 1)))
            labels = estimator(texts, known_votes, X, text_similarity, *args)
            known_votes = add_lambda_votes_to_vote_lists(doc_to_be_sampled, known_votes, labels)

        last_iter_entropy = float("inf")
        # Pick the document that minimises the expected system entropy.
        for doc_index, doc_vote_list in enumerate(known_votes):
            copy_known_votes = copy.deepcopy(known_votes)

            # Hypothetically add a positive (relevant) vote to this document.
            # At this point, we can either add full votes, or a portion of
            # those. Consider GP, we can add the probability it outputs.
            copy_known_votes[doc_index].append(True)
            labels = estimator(texts, copy_known_votes, X, text_similarity, *args)
            known_votes_plus_labels = add_lambda_votes_to_vote_lists(doc_index, copy_known_votes, labels)
            relevance_label_added_system_entropy = get_system_entropy(known_votes_plus_labels)

            # Now hypothetically add a negative (non-relevant) vote instead.
            copy_known_votes[doc_index].pop(-1)
            copy_known_votes[doc_index].append(False)
            labels = estimator(texts, copy_known_votes, X, text_similarity, *args)
            known_votes_plus_labels = add_lambda_votes_to_vote_lists(doc_index, copy_known_votes, labels)
            non_relevance_label_added_system_entropy = get_system_entropy(known_votes_plus_labels)

            # Average entropy of the system over both hypothetical outcomes.
            doc_avg_system_entropy = (relevance_label_added_system_entropy + non_relevance_label_added_system_entropy) / 2

            # Restore the state of the copied vote list.
            copy_known_votes[doc_index].pop()

            if doc_avg_system_entropy < last_iter_entropy:
                doc_to_be_sampled = doc_index
                last_iter_entropy = doc_avg_system_entropy

    # Consume the document selection made in the final round.
    if doc_to_be_sampled is not None:
        try:
            vote = unknown_votes[doc_to_be_sampled].pop()
            known_votes[doc_to_be_sampled].append(vote)
        except IndexError:
            vote = None
            # TODO(review): falling back to a random vote - can we do better?
            known_votes[doc_to_be_sampled].append(bool(random.randint(0, 1)))
        labels = estimator(texts, known_votes, X, text_similarity, *args)
        known_votes = add_lambda_votes_to_vote_lists(doc_to_be_sampled, known_votes, labels)

    try:
        estimates = final_estimator(texts, known_votes, X, text_similarity, *args)
        return get_accuracy(estimates, truths)
    except Exception as e:
        traceback.print_exc()
        return None
def sample_gp_variance_min_entropy(estimator, n_votes_to_sample, texts, vote_lists, truths, X, text_similarity, idx=None, return_final=False, *args): """ Randomly sample votes and re-calculate estimates. """ random.seed() unknown_votes = copy_and_shuffle_sublists(vote_lists) known_votes = [[] for _ in unknown_votes] estimates = [None for _ in vote_lists] accuracy_sequence = [None] * n_votes_to_sample curr_doc_selected = None # This is a crowdsourcing procedure for index in xrange(n_votes_to_sample): # Counter # sys.stderr.write(str(index)+'\n') max_variance_seen = 0 # Draw one vote for a random document if curr_doc_selected is None: updated_doc_idx = random.randrange(len(vote_lists)) if not unknown_votes[updated_doc_idx]: # We ran out of votes for this document, diregard this sequence return None vote = unknown_votes[updated_doc_idx].pop() known_votes[updated_doc_idx].append(vote) else: #print "Selected doc number ", curr_doc_selected try: vote = unknown_votes[curr_doc_selected].pop() except IndexError: # We ran out of votes for this document, disregard this. 
return None known_votes[curr_doc_selected].append(vote) print known_votes estimates = estimator(texts, known_votes, X, text_similarity, *args) #print estimates #sorted_estimates = sorted(enumerate(estimates), key=lambda x: x[1][1]) estimates = list(estimates) #print len(estimates) num_votes_step = sum(map(lambda x: bool(x), known_votes)) / len(unknown_votes) print num_votes_step possibilities = filter( lambda x: len(known_votes[x[0]]) < num_votes_step + 1, enumerate(estimates)) #print possibilities, len(possibilities), list(enumerate(estimates)) #Just need to get the document index, which is element[0] for enumerate(estimates) try: curr_doc_selected = get_best_sample( possibilities, [x[1][1] for x in possibilities])[0] #curr_doc_selected = get_weighted_sample(possibilities,[x[1][1] for x in possibilities])[0] except: print "Excepted" curr_doc_selected = get_best_sample(enumerate(estimates), [x[1] for x in estimates])[0] #curr_doc_selected = get_weighted_sample(enumerate(estimates),[x[1] for x in estimates])[0] print curr_doc_selected #curr_doc_selected = random.choice([element[0] for element in sorted_estimates][:5]) #for doc_index, (label, variance) in enumerate(estimates): #print doc_index, label, variance #if variance > max_variance_seen: # max_variance_seen = variance # curr_doc_selected = doc_index # Calculate all the estimates try: estimates = estimator(texts, known_votes, X, text_similarity, *args) labels = [x[0] for x in estimates] #sys.stderr.write('Success\n') return get_accuracy(labels, truths) except Exception, e: traceback.print_exc() #sys.stdout.write('Fail\n') return None
def sample_min_entropy_kde(estimator_dict, start_idx, n_votes_to_sample, texts, vote_lists, truths, X, text_similarity, idx=None, return_final=False, *args): """ Randomly sample votes and re-calculate estimates. """ random.seed() unknown_votes = copy_and_shuffle_sublists(vote_lists) accuracy_sequences = {} for estimator_name, estimator_args in estimator_dict.iteritems(): estimator, args = estimator_args accuracy_sequences[estimator_name] = [] known_votes = [ [] for _ in unknown_votes ] estimates = [None for _ in vote_lists] curr_doc_selected = None document_idx_vote_seq = [] document_vote_counts = [ 0 for _ in vote_lists ] for votes_required in range(start_idx): min_vote_doc_idxs = get_indexes_of_smallest_elements(document_vote_counts) updated_doc_idx = random.choice(min_vote_doc_idxs) document_vote_counts[updated_doc_idx] += 1 # Randomly pick a vote for this document vote_idx = random.randrange(len(vote_lists[updated_doc_idx])) vote = vote_lists[updated_doc_idx][vote_idx] document_idx_vote_seq.append( (updated_doc_idx, vote ) ) for document_idx, vote in document_idx_vote_seq: known_votes[document_idx].append(vote) print n_votes_to_sample # This is a crowdsourcing procedure for index in xrange(n_votes_to_sample): print "Sampling vote number ", index # Draw one vote for a random document if curr_doc_selected is None: updated_doc_idx = random.randrange(len(vote_lists)) if not unknown_votes[updated_doc_idx]: # We ran out of votes for this document, diregard this sequence return None vote = random.choice(unknown_votes[updated_doc_idx]) known_votes[updated_doc_idx].append(vote) else: #print "Selected doc number ", curr_doc_selected try: vote = random.choice(unknown_votes[curr_doc_selected]) except IndexError: # We ran out of votes for this document, disregard this sequence return None known_votes[curr_doc_selected].append(vote) print "Known votes ", known_votes estimates = estimator(texts, known_votes, X, text_similarity, *args) #Just need to get the document index, 
which is element[0] for enumerate(estimates) estimates = list(estimates) #print len(estimates) num_votes_step = sum(map(lambda x: len(x), known_votes))/ len(unknown_votes) #print num_votes_step possibilities = filter(lambda x: len(known_votes[x[0]]) < 1 + num_votes_step ,enumerate(estimates)) #print possibilities, len(possibilities), list(enumerate(estimates)) #Just need to get the document index, which is element[0] for enumerate(estimates) try: #curr_doc_selected = get_best_sample(possibilities,[x[1][1] for x in possibilities])[0] #curr_doc_selected = get_weighted_sample(possibilities,[x[1][1] for x in possibilities])[0] curr_doc_selected = get_min_entropy_sample(known_votes, possibilities) except: #print "Excepted" #curr_doc_selected = get_best_sample(enumerate(estimates),[x[1] for x in estimates])[0] #curr_doc_selected = get_weighted_sample(enumerate(estimates),[x[1] for x in estimates])[0] curr_doc_selected = get_min_entropy_sample(known_votes, enumerate(estimates)) #print "Curr_doc_selected ", curr_doc_selected #objects = list(enumerate(estimates)) #print "estimates ", objects #curr_doc_selected = get_weighted_sample(objects,[x[1][1] for x in objects]) #print curr_doc_selected # Calculate all the estimates estimates = estimator(texts, known_votes, X, text_similarity, *args) #labels = [x[0] for x in estimates] try: accuracy = get_accuracy(estimates, truths) accuracy_sequences[estimator_name].append(accuracy) except Exception, e: accuracy_sequences[estimator_name].append(None)
def sample_min_entropy(estimator_dict, n_votes_to_sample, texts, vote_lists, truths, X, text_similarity, idx=None, return_final=False, *args): """ Randomly sample votes and re-calculate estimates. """ random.seed() unknown_votes = copy_and_shuffle_sublists(vote_lists) accuracy_sequences = {} for estimator_name, estimator_args in estimator_dict.iteritems(): estimator, args = estimator_args accuracy_sequences[estimator_name] = [] known_votes = [ [] for _ in unknown_votes ] estimates = [None for _ in vote_lists] curr_doc_selected = None document_idx_vote_seq = [] document_vote_counts = [ 0 for _ in vote_lists ] #Randomly sampling 30 votes first for avoiding bias etc. """for votes_required in range(0): min_vote_doc_idxs = get_indexes_of_smallest_elements(document_vote_counts) updated_doc_idx = random.choice(min_vote_doc_idxs) document_vote_counts[updated_doc_idx] += 1 # Randomly pick a vote for this document vote_idx = random.randrange(len(vote_lists[updated_doc_idx])) vote = vote_lists[updated_doc_idx][vote_idx] document_idx_vote_seq.append( (updated_doc_idx, vote ) ) for document_idx, vote in document_idx_vote_seq: known_votes[document_idx].append(vote) estimates = estimator(texts, known_votes, X, text_similarity, *args) try: accuracy = get_accuracy(estimates, truths) accuracy_sequences[estimator_name].append(accuracy) except Exception, e: print "Pooped" return None print "known_votes ", known_votes""" # This is a crowdsourcing procedure, random sampling end for index in xrange(n_votes_to_sample): print "Sampling vote number ", index # Draw one vote for a random document if curr_doc_selected is None: print "Sampling random vote yall" updated_doc_idx = random.randrange(len(vote_lists)) if not unknown_votes[updated_doc_idx]: # We ran out of votes for this document, diregard this sequence return None vote = random.choice(unknown_votes[updated_doc_idx]) known_votes[updated_doc_idx].append(vote) else: print "Selected doc number ", curr_doc_selected try: vote = 
random.choice(unknown_votes[curr_doc_selected]) except IndexError: # We ran out of votes for this document, disregard this sequence return None known_votes[curr_doc_selected].append(vote) #print "Known votes ", known_votes if not index % 50: # While doing density based sampling, we don't really need to do label aggregation at each point. # Still doing it at every 50th vote, just to keep this code around for other # sampling methods like entropy based. estimates = estimator(texts, known_votes, X, text_similarity, *args) estimates = list(estimates) print estimates, len(estimates) num_votes_step = sum(map(lambda x: len(x), known_votes))/ len(unknown_votes) print 'num_vote_step ', num_votes_step possibilities = filter(lambda x: len(known_votes[x[0]]) < 1 + num_votes_step ,enumerate(known_votes)) #print possibilities, len(possibilities), list(enumerate(estimates)) #Just need to get the document index, which is element[0] for enumerate(estimates) try: #curr_doc_selected = get_best_sample(possibilities,[x[1][1] for x in possibilities])[0] #curr_doc_selected = get_weighted_sample(possibilities,[x[1][1] for x in possibilities])[0] #curr_doc_selected = get_min_entropy_sample(known_votes, possibilities) #curr_doc_selected = get_density_based_best_sample(X, known_votes, possibilities) #curr_doc_selected = get_covariance_based_best_sample(X, known_votes, possibilities) curr_doc_selected = get_mutual_information_based_best_sample(X, known_votes, possibilities) except Exception as e: print "Excepted", e #curr_doc_selected = get_best_sample(enumerate(estimates),[x[1] for x in estimates])[0] #curr_doc_selected = get_weighted_sample(enumerate(estimates),[x[1] for x in estimates])[0] #curr_doc_selected = get_min_entropy_sample(known_votes, enumerate(estimates)) curr_doc_selected = get_density_based_best_sample(X, known_votes, enumerate(estimates)) #print "Curr_doc_selected ", curr_doc_selected #objects = list(enumerate(estimates)) #print "estimates ", objects #curr_doc_selected = 
get_weighted_sample(objects,[x[1][1] for x in objects]) #print curr_doc_selected # Calculate all the estimates estimates = estimator(texts, known_votes, X, text_similarity, *args) #labels = [x[0] for x in estimates] try: accuracy = get_accuracy(estimates, truths) accuracy_sequences[estimator_name].append(accuracy) except Exception, e: return None