def get_worker(self) -> Worker:
    """
    Return the worker that is using this agent for a task.

    The worker is fetched through `Worker.get` with this agent's `db` and
    `worker_id` the first time it is needed and kept in `self._worker`
    for later calls.
    """
    cached = self._worker
    if cached is not None:
        return cached
    self._worker = Worker.get(self.db, self.worker_id)
    return self._worker
def test_worker(self) -> None:
    """Test creation and querying of workers"""
    assert self.db is not None, "No db initialized"
    db: MephistoDB = self.db

    # A freshly created worker gets a string id and can be read back both as
    # a raw row and as a Worker object
    expected_name = "test_worker"
    new_id = db.new_worker(expected_name, PROVIDER_TYPE)
    self.assertIsNotNone(new_id)
    self.assertIsInstance(new_id, str)
    row = db.get_worker(new_id)
    self.assertEqual(row["worker_name"], expected_name)
    loaded = Worker.get(db, new_id)
    self.assertEqual(loaded.worker_name, expected_name)

    # The worker must be the single hit for an unfiltered search as well as
    # for a search by its own name
    for query in ({}, {"worker_name": expected_name}):
        found = db.find_workers(**query)
        self.assertEqual(len(found), 1)
        self.assertIsInstance(found[0], Worker)
        self.assertEqual(found[0].db_id, new_id)
        self.assertEqual(found[0].worker_name, expected_name)

    # Searching for an unknown name finds nothing
    missing = db.find_workers(worker_name="fake_name")
    self.assertEqual(len(missing), 0)
def _extract_response_by_index( self, unit_details: Dict[str, Any], idx: int ) -> Optional[Dict[str, Any]]: """ Extract response data from task data. :param unit_details: full extracted data from a unit :param idx: index of the singular evaluation within unit_details to extract :return response: Formatted worker's response data from the task """ task_data = unit_details['data'][idx] response: Dict[str, Any] = { 'run_id': self.run_id, 'worker': unit_details['worker_id'], 'worker_name': Worker.get( self.mephisto_db, unit_details['worker_id'] ).worker_name, 'time_taken': unit_details['task_end'] - unit_details['task_start'], 'question': task_data['task_specs']['question'], 'unit_id': unit_details['unit_id'], 'task_start': unit_details['task_start'], } onboarding = task_data['task_specs'].get('is_onboarding', False) if 'speakerChoice' not in task_data or task_data['speakerChoice'] == '': print('speakerChoice not in task data!') return choice = task_data['speakerChoice'] if onboarding: response['correct'] = choice == task_data['pairing_dict']['correct_answer'] else: response['correct'] = -1 speakers_to_eval = sorted(task_data["pairing_dict"]["speakers_to_eval"]) response.update( { 'winner': choice, 'loser': speakers_to_eval[1 - (speakers_to_eval.index(choice))], 'eval_choice_0': speakers_to_eval[0], 'eval_choice_1': speakers_to_eval[1], 'reason': task_data['textReason'], 'is_onboarding': onboarding, 'matchup': f"{'__vs__'.join(speakers_to_eval)}", 'pairing_id': task_data['pair_id'], } ) # If it exists, add in which checkboxes of possible reasons the Turkers checked if len(task_data.get('speakerReasons', {})) > 0: response.update( { self.checkbox_prefix + reason: checked for reason, checked in task_data['speakerReasons'].items() } ) return response
def get_workers_with_qualification(
    self, qualification_name: str
) -> List[Worker]:
    """
    Return the `Worker`s that have been granted `qualification_name`
    with a value of 1.

    The first qualification found under that name is used; an
    AssertionError is raised when no qualification has that name.
    """
    db = self.db
    matches = db.find_qualifications(qualification_name=qualification_name)
    assert len(matches) >= 1, f"No qualification found named {qualification_name}"

    grants = db.check_granted_qualifications(
        qualification_id=matches[0].db_id, value=1
    )
    return [Worker.get(self.db, grant.worker_id) for grant in grants]
def test_worker(self) -> None:
    """Ensure we can query and use a worker"""
    db: MephistoDB = self.db
    requester = self.get_test_requester()
    WorkerClass = self.CrowdProviderClass.WorkerClass

    # A worker created through the provider class must be loadable again
    # through the generic Worker entry point
    created = WorkerClass.new(db, self.get_test_worker_name())
    reloaded = Worker.get(db, created.db_id)
    self.assertEqual(
        created.worker_name,
        reloaded.worker_name,
        "Worker gotten from db not same as first init",
    )

    # Ensure blocking is doable
    created.block_worker("Test reason", requester=requester)
    blocked_now = created.is_blocked(requester)
    self.assertTrue(blocked_now)

    # ... and that it can be undone
    created.unblock_worker("Test reason", requester=requester)
    still_blocked = created.is_blocked(requester)
    self.assertFalse(still_blocked)
def test_create_and_find_worker(self) -> None:
    """Ensure we can find a worker by MTurk id"""
    db = self.db
    TEST_MTURK_WORKER_ID = "ABCDEFGHIJ"
    mismatch_msg = "Worker gotten from db not same as first init"

    # Lookup by database id
    created = MTurkWorker.new(db, TEST_MTURK_WORKER_ID)
    by_db_id = Worker.get(db, created.db_id)
    self.assertEqual(created.worker_name, by_db_id.worker_name, mismatch_msg)

    # Lookup by MTurk worker id
    by_mturk_id = MTurkWorker.get_from_mturk_worker_id(db, TEST_MTURK_WORKER_ID)
    assert by_mturk_id is not None
    self.assertEqual(created.worker_name, by_mturk_id.worker_name, mismatch_msg)

    # An unknown MTurk worker id yields no worker
    failed_worker = MTurkWorker.get_from_mturk_worker_id(db, "FAKE_ID")
    self.assertIsNone(failed_worker, f"Found worker {failed_worker} from a fake id")
def format_for_printing_data(data):
    """
    Render one unit's data as a printable multi-line string.

    Custom tasks can define methods for how to display their data in a
    relevant way; this one shows the worker name, unit id, duration and
    status, then the character inputs, then the rating and any attached
    files (with the assigned agent's data directory when files exist).
    Uses the module-level `db`.
    """
    global db

    worker_name = Worker.get(db, data["worker_id"]).worker_name
    contents = data["data"]
    times = contents["times"]
    duration = times["task_end"] - times["task_start"]
    parts = [
        "-------------------\n",
        f"Worker: {worker_name}\nUnit: {data['unit_id']}\n",
        f"Duration: {int(duration)}\nStatus: {data['status']}\n",
    ]

    inputs = contents["inputs"]
    parts.append(
        f"Character: {inputs['character_name']}\n"
        f"Description: {inputs['character_description']}\n"
    )

    outputs = contents["outputs"]
    parts.append(f" Rating: {outputs['rating']}\n")

    found_files = outputs.get("files")
    if found_files is None:
        parts.append(" Files: No files attached\n")
    else:
        # Only look up the unit's agent when there are files to point at
        agent = Unit.get(db, data["unit_id"]).get_assigned_agent()
        file_dir = agent.get_data_dir()
        parts.append(f" Files: {found_files}\n")
        parts.append(f" File directory {file_dir}\n")

    return "".join(parts)
def test_worker_fails(self) -> None:
    """Ensure workers fail to be created or loaded under failure conditions"""
    assert self.db is not None, "No db initialized"
    db: MephistoDB = self.db

    # Loading an id that was never stored must fail
    with self.assertRaises(EntryDoesNotExistException):
        Worker.get(db, self.get_fake_id("Worker"))

    name, provider = "test_worker", PROVIDER_TYPE
    db.new_worker(name, provider)

    # A second worker with the same name must be rejected
    with self.assertRaises(EntryAlreadyExistsException):
        db.new_worker(name, provider)

    # An empty name must be rejected
    with self.assertRaises(MephistoDBException):
        db.new_worker("", provider)

    # Only the one successfully created worker may exist afterwards
    remaining = db.find_workers()
    self.assertEqual(len(remaining), 1)
def make_registered_worker(self, worker_name) -> Worker:
    """
    Register a new "mock" provider worker named `worker_name` with a
    "_sandbox" suffix and return the corresponding Worker.
    """
    sandbox_name = worker_name + "_sandbox"
    new_id = self.db.new_worker(sandbox_name, "mock")
    registered = Worker.get(self.db, new_id)
    return registered
def run_examine_by_worker(
    db: "MephistoDB",
    format_data_for_printing: Callable[[Dict[str, Any]], str],
    task_name: Optional[str] = None,
    block_qualification: Optional[str] = None,
    approve_qualification: Optional[str] = None,
):
    """
    Basic script for reviewing work, grouped by worker for convenience.

    First gets the required information to run a review (prompting for the
    task name and qualifications when no task name is given), then walks
    through every completed unit of the task, worker by worker, asking on
    stdin whether to accept, pass (soft reject) or reject each one.
    Entering the decision in caps applies it to the remaining units of the
    current worker.

    :param db: database to read the task's units and workers from
    :param format_data_for_printing: turns the data of one unit into the
        text shown to the reviewer
    :param task_name: name of the task to review; prompted for when None
    :param block_qualification: if given, qualification that can be granted
        to a worker whose work is passed on (soft block)
    :param approve_qualification: if given, qualification that can be
        granted to a worker whose work is approved with "A"
    """
    data_browser = DataBrowser(db=db)

    # Get initial arguments
    if task_name is None:
        task_name, block_qualification, approve_qualification = prompt_for_options(
            task_name, block_qualification, approve_qualification
        )

    tasks = db.find_tasks(task_name=task_name)
    assert len(tasks) >= 1, f"No task found under name {task_name}"

    print(
        "You will be reviewing actual tasks with this flow. Tasks that you either Accept or Pass "
        "will be paid out to the worker, while rejected tasks will not. Passed tasks will be "
        "specially marked such that you can leave them out of your dataset. \n"
        "You may enter the option in caps to apply it to the rest of the units for a given worker."
    )
    if block_qualification is not None:
        # Called for its side effect only: the qualification must exist
        # before it can be granted below.
        find_or_create_qualification(db, block_qualification)
        print(
            "When you pass or reject a task, the script gives you an option to disqualify the worker "
            "from future tasks by assigning a qualification. If provided, this worker will no "
            "longer be able to work on tasks where the set --block-qualification shares the same name "
            f"you provided above: {block_qualification}\n"
        )
    if approve_qualification is not None:
        find_or_create_qualification(db, approve_qualification)
        print(
            "You may use this script to establish a qualified worker pool by granting the provided "
            f"approve qualification {approve_qualification} to workers you think understand the task "
            "well. This will be provided as an option for workers you (A)pprove all on. "
            "Future tasks can use this qual as a required qualification, as described in the "
            "common qualification flows document."
        )
    print(
        "**************\n"
        "You should only reject tasks when it is clear the worker has acted in bad faith, and "
        "didn't actually do the task. Prefer to pass on tasks that were misunderstandings.\n"
        "**************\n"
    )

    # Split the task's units into those awaiting review ("completed") and
    # everything else, asking each unit for its status only once.
    units = []
    others = []
    for u in data_browser.get_units_for_task_name(task_name):
        if u.get_status() == "completed":
            units.append(u)
        else:
            others.append(u)
    reviews_left = len(units)
    previous_work_by_worker = get_worker_stats(others)

    # Determine allowed options
    options = ["a", "p", "r"]

    units_by_worker: Dict[str, List["Unit"]] = {}
    for u in units:
        units_by_worker.setdefault(u.worker_id, []).append(u)

    # Run the review
    for w_id, w_units in units_by_worker.items():
        worker = Worker.get(db, w_id)
        worker_name = worker.worker_name
        # Lower-cased decision to reuse for the rest of this worker's units,
        # set once the reviewer answers in caps
        apply_all_decision = None
        reason = None
        for idx, unit in enumerate(w_units):
            print(
                f"Reviewing for worker {worker_name}, ({idx+1}/{len(w_units)}), "
                f"Previous {format_worker_stats(w_id, previous_work_by_worker)} "
                f"(total remaining: {reviews_left})"
            )
            reviews_left -= 1
            print(format_data_for_printing(data_browser.get_data_from_unit(unit)))
            if apply_all_decision is not None:
                decision = apply_all_decision
            else:
                decision = input(
                    "Do you want to accept this work? (a)ccept, (r)eject, (p)ass: "
                )
            while decision.lower() not in options:
                decision = input(
                    "Decision must be one of a, p, r. Use CAPS to apply to all remaining for worker: "
                )

            agent = unit.get_assigned_agent()
            assert (
                agent is not None
            ), f"Can't make decision on None agent... issue with {unit}"

            if decision.lower() == "a":
                agent.approve_work()
                # `decision` is only upper case on the unit where the
                # reviewer typed it, so this prompt appears once per worker
                if decision == "A" and approve_qualification is not None:
                    should_special_qualify = input(
                        "Do you want to approve qualify this worker? (y)es/(n)o: "
                    )
                    if should_special_qualify.lower() in ["y", "yes"]:
                        worker.grant_qualification(approve_qualification, 1)
            elif decision.lower() == "p":
                agent.soft_reject_work()
                if apply_all_decision is None and block_qualification is not None:
                    should_soft_block = input(
                        "Do you want to soft block this worker? (y)es/(n)o: "
                    )
                    if should_soft_block.lower() in ["y", "yes"]:
                        worker.grant_qualification(block_qualification, 1)
            else:  # decision = 'r'
                # When rejecting all, the reason given for the first unit is
                # reused for the remaining ones
                if apply_all_decision is None:
                    reason = input("Why are you rejecting this work? ")
                    should_block = input(
                        "Do you want to hard block this worker? (y)es/(n)o: "
                    )
                    if should_block.lower() in ["y", "yes"]:
                        block_reason = input("Why permanently block this worker? ")
                        worker.block_worker(block_reason)
                agent.reject_work(reason)

            # An answer given in caps applies to the worker's remaining units
            if decision.lower() != decision:
                apply_all_decision = decision.lower()
def get_named_test_worker(self, worker_name: str) -> Worker:
    """Create a test worker with the given worker name"""
    db = self.db
    # Register under the "mock" provider, then load the stored entry
    new_id = db.new_worker(worker_name, "mock")
    return Worker.get(self.db, new_id)
def compile_results(self) -> pd.DataFrame:
    """
    Compile all acceptable conversations into one DataFrame and save stats.

    Loads the task units via `self.get_task_data()`, skips units without
    save data or with a status other than 'completed'/'approved', and
    filters out conversations that fail `self.acceptability_checker` or
    (when `self.filter_uniform_hits` is set) whose annotations are all
    identical. Each remaining conversation contributes one context row plus
    one row per message that has text.

    Side effects: writes the worker names of unacceptable conversations to
    `self.unacceptable_worker_ids_path`, the annotation bucket selection
    rates to `self.annotation_selection_rate_path` and the final rating
    statistics to `self.likert_score_stat_path`.

    :return: DataFrame of all acceptable conversations, with two-level
        column keys
    :raises ValueError: if no acceptable conversation is found, or if a
        conversation has no annotations while uniform HITs are filtered
    """
    # Load task data
    logging.info('Retrieving task data from Mephisto.')
    task_units_data = self.get_task_data()
    logging.info(
        f'Data for {len(task_units_data)} units loaded successfully.')

    num_convos_with_no_save_data = 0
    num_wrong_status_convos = 0
    num_complete_convos = 0

    unacceptable_task_units = []
    unacceptable_worker_ids = []
    conversation_idx = 0
    conversation_dfs = []

    for task_unit in task_units_data:

        worker_id = task_unit['worker_id']
        assignment_id = task_unit['assignment_id']

        # Skipping this conversation if save data is not found or the status is
        # invalid
        if task_unit['data']['save_data'] is None:
            logging.info('Found a task unit with no save data! Skipping.')
            num_convos_with_no_save_data += 1
            continue
        elif task_unit['status'] not in ['completed', 'approved']:
            logging.info(
                f'Found a HIT with the status "{task_unit["status"]}"!.'
                f'Skipping.')
            num_wrong_status_convos += 1
            continue
        else:
            num_complete_convos += 1

        # Extract out useful conversation-level data
        custom_data = task_unit['data']['save_data']['custom_data']
        mturk_worker_id = Worker.get(self.get_mephisto_db(),
                                     worker_id).worker_name
        task_start = datetime.utcfromtimestamp(task_unit['task_start'])
        task_end = datetime.utcfromtimestamp(task_unit['task_end'])
        info_dict = {
            ('worker_id', ''): worker_id,
            ('mturk_worker_id', ''): mturk_worker_id,
            ('unit_id', ''): task_unit['unit_id'],
            ('assignment_id', ''): assignment_id,
            ('conversation_idx', ''): conversation_idx,
            ('date', ''): task_start.strftime('%Y-%m-%d'),
            ('completion_time', ''): (task_end - task_start).total_seconds(),
        }

        # Check that the conversation consists of pairs of comments between
        # Speaker 1 and Speaker 2, with Speaker 1 speaking first
        all_messages = task_unit['data']['messages']
        assert 'final_rating' in all_messages[-1]['task_data']
        convo_messages = all_messages[:-1]
        # The final message is just a final rating
        # The expected speaker is picked per index *before* comparing:
        # without the grouping, `a == b if c else d` parses as
        # `(a == b) if c else d`, which never checks even-indexed messages.
        expected_speakers = ('Speaker 1', 'Speaker 2')
        assert all(
            message['id'] == expected_speakers[message_idx % 2]
            for message_idx, message in enumerate(convo_messages)
        )
        messages_1 = [m for m in convo_messages if m['id'] == 'Speaker 1']
        messages_2 = [m for m in convo_messages if m['id'] == 'Speaker 2']
        assert len(messages_1) + len(messages_2) == len(convo_messages)

        # Determine whether the HIT contains unacceptable messages. (We do this for
        # every HIT, even if acceptability violation info was already saved, because
        # the violation criteria may have changed since the HIT was collected.)
        utterances_1 = [m['text'] for m in messages_1]
        assert utterances_1[0] == 'Hi!', (
            'This script assumes that the first human message is "Hi!", which is '
            'set by default and cannot be changed by the crowdsourcing worker.'
        )
        acceptability_violations = self.acceptability_checker.check_messages(
            messages=utterances_1[1:],  # Don't use the initial "Hi!"
            is_worker_0=True,
            violation_types=self.acceptability_checker.ALL_VIOLATION_TYPES,
        )
        # Here, "worker 0" refers to Speaker 1, because we mix 0- and 1-indexing
        if acceptability_violations != '':
            logging.info(
                f'Conversation fails acceptability checks with a violation of '
                f'"{acceptability_violations}", given the following utterances: '
                f'{utterances_1[1:]}. Skipping.')
            unacceptable_task_units.append(task_unit)
            assert (
                mturk_worker_id is not None
            ), "MTurk worker ID cannot be determined for this unacceptable conversation!"
            unacceptable_worker_ids.append(mturk_worker_id)
            continue

        # Ignore the conversation if ratings for all turns are the same, because
        # it's somewhat implausible that *all* turns in a conversation should garner
        # the same rating of engagingness, humanness, interestingness, or none.
        # (However, don't put these workers on the "unacceptable worker IDs" list,
        # to give them a little bit of the benefit of the doubt: i.e. maybe the
        # worker just didn't try hard enough to find which responses were more
        # engaging, etc. than others, but that doesn't mean that all of their HITs
        # across all evals are bad and should be removed.)
        if self.filter_uniform_hits:
            annotations = [
                m['task_data']['problem_data_for_prior_message']
                for m in all_messages
                if 'problem_data_for_prior_message' in m.get('task_data', {})
            ]
            # Dicts are not hashable, so compare their values in key order
            hashable_annotations = [
                tuple(a[key] for key in sorted(a.keys())) for a in annotations
            ]
            unique_annotations = set(hashable_annotations)
            if len(unique_annotations) < 1:
                raise ValueError('No annotations found for this HIT!')
            elif len(unique_annotations) == 1:
                logging.info(
                    f'All model responses in the conversation received the same '
                    f'annotation: {hashable_annotations[0]}. Skipping.')
                unacceptable_task_units.append(task_unit)
                continue

        single_turn_dicts = []

        # Compile personas and previous utterances
        text_parts = []
        personas = custom_data['personas']
        if personas is not None and len(personas) > 0:
            assert len(personas) == 2
            text_parts += [
                'HUMAN PERSONA: ' + ' '.join(personas[0]),
                'BOT PERSONA: ' + ' '.join(personas[1]),
            ]
        additional_context = custom_data['additional_context']
        if additional_context is not None and len(additional_context) > 0:
            text_parts.append('ADDITIONAL CONTEXT: ' + additional_context)
        single_turn_dicts.append({
            **info_dict,
            ('context', ''): ' '.join(text_parts),
        })

        # Loop over conversation turns
        turns_per_speaker = defaultdict(int)
        for message in all_messages:
            if 'text' not in message:
                continue

            speaker_id = message['id']
            message_task_data = message.get('task_data', {})

            # Add in annotation results, if they exist
            if 'problem_data_for_prior_message' in message_task_data:
                bucket_data = {
                    ('annotation_bucket', bucket): value
                    for bucket, value in message['task_data']
                    ['problem_data_for_prior_message'].items()
                }
            else:
                bucket_data = {}

            # Add in results from the final rating(s), if they exist
            if 'final_rating' in message_task_data:
                ratings = message['task_data']['final_rating'].split('|')
                final_rating_data = {
                    ('final_rating', str(idx)): value
                    for idx, value in enumerate(ratings)
                }
            else:
                final_rating_data = {}

            turns_per_speaker[speaker_id] += 1

            single_turn_dicts.append({
                **info_dict,
                ('speaker_id', ''): speaker_id,
                ('speaker_turn_idx', ''): turns_per_speaker[speaker_id],
                ('text', ''): message['text'].replace('\n', '__newline__'),
                **bucket_data,
                **final_rating_data,
            })

        # Adding the full conversation to the list of conversations
        single_turn_series = [
            pd.Series(dict_).to_frame().transpose()
            for dict_ in single_turn_dicts
        ]
        single_convo_df = pd.concat(single_turn_series, axis=0, sort=False)
        conversation_dfs.append(single_convo_df)
        conversation_idx += 1

    logging.info(
        f'{num_convos_with_no_save_data:d} conversations found with no save data.'
    )
    logging.info(
        f'{num_wrong_status_convos:d} conversations found with the wrong status.'
    )
    logging.info(f'{num_complete_convos:d} complete conversations found:')
    logging.info(
        f'\t{len(unacceptable_task_units):d} unacceptable conversations.')
    logging.info(f'\t{len(conversation_dfs):d} acceptable conversations.')

    # # Compile results across all conversations

    if len(conversation_dfs) == 0:
        raise ValueError('No acceptable conversations found!')
    unordered_conversation_df = pd.concat(conversation_dfs, axis=0)
    initial_ordered_columns = list(info_dict.keys()) + [
        ('context', ''),
        ('speaker_id', ''),
        ('speaker_turn_idx', ''),
        ('text', ''),
    ]
    all_ordered_columns = initial_ordered_columns + [
        col for col in unordered_conversation_df.columns
        if col not in initial_ordered_columns
    ]
    conversation_df = unordered_conversation_df[all_ordered_columns]
    # TODO: is there a less hacky way than this, which relies on the most recent
    # value of `info_dict`, to put the columns back into the right order?

    # # Calculate and save auxiliary stats

    logging.info(
        f'Saving MTurk IDs of workers with unacceptable conversations to '
        f'{self.unacceptable_worker_ids_path}.')
    with open(self.unacceptable_worker_ids_path, 'w') as f:
        for unacceptable_id in unacceptable_worker_ids:
            f.write(unacceptable_id + '\n')

    # Calculate rates of selecting various annotation buckets
    annotation_bucket_df = conversation_df['annotation_bucket'].dropna(
        axis=0, how='any')
    # NOTE(review): rows containing any NaN were just dropped, so this check
    # cannot trigger as written; confirm whether how='all' was intended.
    if annotation_bucket_df.isna().sum().sum() > 0:
        raise ValueError(
            'There is at least one row in which only partial annotation bucket data exists!'
        )
    annotation_selection_rate_df = annotation_bucket_df.mean().to_frame(
        'selection_rate')
    annotation_selection_rate_df.to_csv(self.annotation_selection_rate_path)
    logging.info(
        f'Annotation bucket selection rates saved to {self.annotation_selection_rate_path}.'
    )
    output_strings = [
        f'{series.name}: {100*series["selection_rate"]:0.0f}%'
        for _, series in annotation_selection_rate_df.iterrows()
    ]
    logging.info('Annotation bucket selection rates:\n' +
                 '\n'.join(output_strings))

    # Calculate Likert score stats
    final_rating_df = conversation_df['final_rating'].dropna(axis=0,
                                                             how='any')
    # NOTE(review): same as above, no NaN can remain after dropna(how='any').
    if final_rating_df.isna().sum().sum() > 0:
        raise ValueError(
            'There is at least one row in which only partial final rating data exists!'
        )
    likert_score_stat_df = final_rating_df.astype(int).describe()
    likert_score_stat_df.to_csv(self.likert_score_stat_path)
    logging.info(
        f'Likert score statistics saved to {self.likert_score_stat_path}.')
    logging.info(
        f'Mean Likert scores:\n{likert_score_stat_df.loc["mean"]}')

    return conversation_df