def test_worker(self) -> None:
    """Test creation and querying of workers"""
    assert self.db is not None, "No db initialized"
    db: MephistoDB = self.db

    # A freshly created worker should be retrievable by its returned id
    name = "test_worker"
    new_id = db.new_worker(name, PROVIDER_TYPE)
    self.assertIsNotNone(new_id)
    self.assertIsInstance(new_id, str)
    row = db.get_worker(new_id)
    self.assertEqual(row["worker_name"], name)
    self.assertEqual(Worker(db, new_id).worker_name, name)

    # An unfiltered find should surface exactly the one worker we made
    found = db.find_workers()
    self.assertEqual(len(found), 1)
    self.assertIsInstance(found[0], Worker)
    self.assertEqual(found[0].db_id, new_id)
    self.assertEqual(found[0].worker_name, name)

    # Filtering by the exact name should return the same single worker
    found = db.find_workers(worker_name=name)
    self.assertEqual(len(found), 1)
    self.assertIsInstance(found[0], Worker)
    self.assertEqual(found[0].db_id, new_id)
    self.assertEqual(found[0].worker_name, name)

    # Filtering by an unknown name should match nothing
    self.assertEqual(len(db.find_workers(worker_name="fake_name")), 0)
def get_worker(self) -> Worker:
    """Return the worker that is using this agent for a task"""
    # Lazily construct the Worker on first access, then memoize it
    if self._worker is not None:
        return self._worker
    self._worker = Worker(self.db, self.worker_id)
    return self._worker
def _extract_response_by_index(
    self, unit_details: Dict[str, Any], idx: int
) -> Optional[Dict[str, Any]]:
    """
    Extract response data from task data.

    :param unit_details: full extracted data from a unit
    :param idx: index of the singular evaluation within unit_details to extract
    :return response: Formatted worker's response data from the task, or None
        when the unit recorded no speaker choice
    """
    task_data = unit_details['data'][idx]
    # Base fields shared by every response row
    response: Dict[str, Any] = {
        'run_id': self.run_id,
        'worker': unit_details['worker_id'],
        # Resolve the Mephisto worker id to its provider-facing name
        'worker_name': Worker.get(
            self.mephisto_db, unit_details['worker_id']
        ).worker_name,
        'time_taken': unit_details['task_end'] - unit_details['task_start'],
        'question': task_data['task_specs']['question'],
        'unit_id': unit_details['unit_id'],
        'task_start': unit_details['task_start'],
    }
    onboarding = task_data['task_specs'].get('is_onboarding', False)
    # A missing/empty choice means the worker never completed the evaluation
    if 'speakerChoice' not in task_data or task_data['speakerChoice'] == '':
        # NOTE(review): uses print rather than a logger — consider logging
        print('speakerChoice not in task data!')
        return
    choice = task_data['speakerChoice']
    if onboarding:
        # Onboarding pairs carry a known correct answer to grade against
        response['correct'] = choice == task_data['pairing_dict']['correct_answer']
    else:
        # -1 sentinel: correctness is only defined for onboarding pairs
        response['correct'] = -1
    speakers_to_eval = sorted(task_data["pairing_dict"]["speakers_to_eval"])
    response.update(
        {
            'winner': choice,
            # The loser is whichever of the two sorted speakers was not chosen
            'loser': speakers_to_eval[1 - (speakers_to_eval.index(choice))],
            'eval_choice_0': speakers_to_eval[0],
            'eval_choice_1': speakers_to_eval[1],
            'reason': task_data['textReason'],
            'is_onboarding': onboarding,
            'matchup': f"{'__vs__'.join(speakers_to_eval)}",
            'pairing_id': task_data['pair_id'],
        }
    )
    # If it exists, add in which checkboxes of possible reasons the Turkers checked
    if len(task_data.get('speakerReasons', {})) > 0:
        response.update(
            {
                self.checkbox_prefix + reason: checked
                for reason, checked in task_data['speakerReasons'].items()
            }
        )
    return response
def timing_charts(run_id: int) -> None:
    """
    Summarize timing and quiz outcomes for all completed units of a run:
    prints job start/end times, plots HIT-duration and per-question pass-rate
    histograms, and reports crash counts and raw feedback.
    """
    completed_units = retrieve_units(run_id)
    db = LocalMephistoDB()
    data_browser = DataBrowser(db=db)
    workers = {"total": []}
    unit_timing = {"total": [], "end": []}
    # Four quiz questions; answers collected per question across all units
    question_results = {1: [], 2: [], 3: [], 4: []}
    pass_rates = {1: [], 2: [], 3: [], 4: []}
    starttime = math.inf
    endtime = -math.inf
    feedback = []
    num_correct_hist = []
    bug_count = 0
    for unit in completed_units:
        data = data_browser.get_data_from_unit(unit)
        worker = Worker(db, data["worker_id"]).worker_name
        workers["total"].append(worker)
        # hit_timing folds each unit's times into the running min/max and lists
        starttime, endtime, unit_timing = hit_timing(data["data"], starttime, endtime, unit_timing)
        outputs = data["data"]["outputs"]
        feedback.append(outputs["feedback"])
        # "bug" flag is a string, not a bool, in the task outputs
        if outputs["bug"] == "true":
            bug_count += 1
        num_correct = 0
        for q in question_results.keys():
            key = "q" + str(q) + "Answer"
            question_results[q].append(outputs[key])
            if outputs[key] == "true":
                num_correct += 1
        num_correct_hist.append(num_correct)
    print(f"Job start time: {datetime.fromtimestamp(starttime)}")
    print(f"Job end time: {datetime.fromtimestamp(endtime)}")
    plot_hist_sorted(
        unit_timing["total"], cutoff=1200, target_val=600, xlabel="", ylabel="Total HIT Time (sec)"
    )
    calc_percentiles(unit_timing["total"], "HIT Length")
    for q in question_results.keys():
        results_dict = Counter(question_results[q])
        # NOTE(review): divides by true+false counts only — raises
        # ZeroDivisionError if a question has no "true"/"false" answers; confirm
        # answers are always one of those two strings
        pass_rates[q] = (
            results_dict["true"] / (results_dict["true"] + results_dict["false"])
        ) * 100
        print(
            f"Question #{q} pass rate: {(results_dict['true']/(results_dict['true'] + results_dict['false']))*100:.1f}%"
        )
    plot_hist(pass_rates, xlabel="Question #", ylabel=f"Pass Rate %")
    print(
        f"Number of workers who didn't get any right: {len([x for x in num_correct_hist if x == 0])}"
    )
    keys = range(len(num_correct_hist))
    vals_dict = dict(zip(keys, num_correct_hist))
    plot_hist(vals_dict, xlabel="HIT #", ylabel="# Correct", ymax=4)
    print(f"Number of workers who experienced a window crash: {bug_count}")
    print(feedback)
def format_for_printing_data(data):
    """
    Render one unit's review data as a human-readable string: worker/unit
    metadata, the task's input domain, and whichever timing/feedback output
    fields the unit actually recorded.
    """
    # Custom tasks can define methods for how to display their data in a relevant way
    worker_name = Worker(db, data["worker_id"]).worker_name
    contents = data["data"]
    duration = contents["times"]["task_end"] - contents["times"]["task_start"]
    metadata_string = (
        f"Worker: {worker_name}\nUnit: {data['unit_id']}\n"
        f"Duration: {int(duration)}\nStatus: {data['status']}\n"
    )
    inputs = contents["inputs"]
    inputs_string = f"Domain: {inputs['subdomain']}\n"
    outputs = contents["outputs"]
    # (label, output key) pairs replace the original per-field bare
    # try/except: pass ladder; a field is simply skipped when absent.
    display_fields = [
        ("Usability Rating", "usability-rating"),
        ("Self Performance Rating", "self-rating"),
        ("Instructions Read Time (sec)", "instructionsReadTime"),
        ("Pre Interaction Time (sec)", "preInteractTime"),
        ("Interaction Time (sec)", "interactTime"),
        ("Clicks (timestamp)", "clickedElements"),
        ("OS & Browser Info", "userAgent"),
        ("User Feeback", "feedback"),  # sic: label typo kept for output parity
    ]
    output_string = "".join(
        f"{label}: {outputs[key]}\n"
        for label, key in display_fields
        if key in outputs
    )
    return f"-------------------\n{metadata_string}{inputs_string}{output_string}"
def get_workers_with_qualification(self, qualification_name: str) -> List[Worker]:
    """
    Returns a list of 'Worker's for workers who are qualified wrt `qualification_name`.
    """
    matching_quals = self.db.find_qualifications(
        qualification_name=qualification_name
    )
    assert (
        len(matching_quals) >= 1
    ), f"No qualification found named {qualification_name}"
    # Workers holding the first matching qualification with value 1
    grants = self.db.check_granted_qualifications(
        qualification_id=matching_quals[0].db_id, value=1
    )
    return [Worker.get(self.db, grant.worker_id) for grant in grants]
def test_worker(self) -> None:
    """Ensure we can query and use a worker"""
    db: MephistoDB = self.db
    requester = self.get_test_requester()

    # A worker created via the provider class should round-trip through the
    # generic Worker constructor
    provider_worker = self.CrowdProviderClass.WorkerClass.new(
        db, self.get_test_worker_name()
    )
    reloaded = Worker(db, provider_worker.db_id)
    self.assertEqual(
        provider_worker.worker_name,
        reloaded.worker_name,
        "Worker gotten from db not same as first init",
    )

    # Blocking should be doable, observable, and reversible
    provider_worker.block_worker("Test reason", requester=requester)
    self.assertTrue(provider_worker.is_blocked(requester))
    provider_worker.unblock_worker("Test reason", requester=requester)
    self.assertFalse(provider_worker.is_blocked(requester))
def find_workers(
    self, worker_name: Optional[str] = None, provider_type: Optional[str] = None
) -> List[Worker]:
    """
    Try to find any worker that matches the above. When called with no
    arguments, return all workers.
    """
    with self.table_access_condition:
        # NULL-valued parameters disable their corresponding filter clause
        cursor = self._get_connection().cursor()
        cursor.execute(
            """
            SELECT * from workers
            WHERE (?1 IS NULL OR worker_name = ?1)
            AND (?2 IS NULL OR provider_type = ?2)
            """,
            (worker_name, provider_type),
        )
        return [
            Worker(self, str(row["worker_id"]), row=row)
            for row in cursor.fetchall()
        ]
def format_for_printing_data(data):
    """
    Render one unit's review data as a human-readable string: worker/unit
    metadata, the task's input domain, and whichever quiz-answer/feedback
    output fields the unit actually recorded.
    """
    # Custom tasks can define methods for how to display their data in a relevant way
    worker_name = Worker(db, data["worker_id"]).worker_name
    contents = data["data"]
    duration = contents["times"]["task_end"] - contents["times"]["task_start"]
    metadata_string = (
        f"Worker: {worker_name}\nUnit: {data['unit_id']}\n"
        f"Duration: {int(duration)}\nStatus: {data['status']}\n"
    )
    inputs = contents["inputs"]
    inputs_string = f"Domain: {inputs['subdomain']}\n"
    outputs = contents["outputs"]
    # (label, output key) pairs replace the original per-field bare
    # try/except: pass ladder; a field is simply skipped when absent.
    display_fields = [
        ("Question #1 Result", "q1Answer"),
        ("Question #2 Result", "q2Answer"),
        ("Question #3 Result", "q3Answer"),
        ("Question #4 Result", "q4Answer"),
        ("OS & Browser Info", "userAgent"),
        ("User Feeback", "feedback"),  # sic: label typo kept for output parity
    ]
    output_string = "".join(
        f"{label}: {outputs[key]}\n"
        for label, key in display_fields
        if key in outputs
    )
    return f"-------------------\n{metadata_string}{inputs_string}{output_string}"
def test_create_and_find_worker(self) -> None:
    """Ensure we can find a worker by MTurk id"""
    db = self.db
    mturk_id = "ABCDEFGHIJ"

    created = MTurkWorker.new(db, mturk_id)
    # Generic lookup by db id should agree with the created worker
    by_db_id = Worker(db, created.db_id)
    self.assertEqual(
        created.worker_name,
        by_db_id.worker_name,
        "Worker gotten from db not same as first init",
    )
    # Provider-specific lookup by MTurk id should also agree
    by_mturk_id = MTurkWorker.get_from_mturk_worker_id(db, mturk_id)
    self.assertEqual(
        created.worker_name,
        by_mturk_id.worker_name,
        "Worker gotten from db not same as first init",
    )
    # Unknown MTurk ids should yield None rather than a worker
    failed_worker = MTurkWorker.get_from_mturk_worker_id(db, "FAKE_ID")
    self.assertIsNone(failed_worker, f"Found worker {failed_worker} from a fake id")
def format_for_printing_data(data):
    """
    Render one unit's review data as a human-readable string: worker/unit
    metadata, the character inputs, the rating, and any attached files.
    """
    # Custom tasks can define methods for how to display their data in a relevant way
    worker_name = Worker(db, data["worker_id"]).worker_name
    contents = data["data"]
    elapsed = contents["times"]["task_end"] - contents["times"]["task_start"]
    header = (
        f"Worker: {worker_name}\nUnit: {data['unit_id']}\n"
        f"Duration: {int(elapsed)}\nStatus: {data['status']}\n"
    )
    inputs = contents["inputs"]
    character_block = f"Character: {inputs['character_name']}\nDescription: {inputs['character_description']}\n"
    outputs = contents["outputs"]
    result_lines = [f" Rating: {outputs['rating']}\n"]
    found_files = outputs.get("files")
    if found_files is None:
        result_lines.append(f" Files: No files attached\n")
    else:
        file_dir = Unit(db, data["unit_id"]).get_assigned_agent().get_data_dir()
        result_lines.append(f" Files: {found_files}\n")
        result_lines.append(f" File directory {file_dir}\n")
    return f"-------------------\n{header}{character_block}{''.join(result_lines)}"
def test_worker_fails(self) -> None:
    """Ensure workers fail to be created or loaded under failure conditions"""
    assert self.db is not None, "No db initialized"
    db: MephistoDB = self.db

    # Fetching a nonexistent worker must raise
    with self.assertRaises(EntryDoesNotExistException):
        Worker(db, self.get_fake_id("Worker"))

    name = "test_worker"
    provider = PROVIDER_TYPE
    db.new_worker(name, provider)

    # Creating the same worker a second time must fail
    with self.assertRaises(EntryAlreadyExistsException):
        db.new_worker(name, provider)

    # Empty worker names must be rejected
    with self.assertRaises(MephistoDBException):
        db.new_worker("", provider)

    # Only the single successful creation should be present
    self.assertEqual(len(db.find_workers()), 1)
def _register_agent(self, packet: Packet, channel_info: ChannelInfo):
    """
    Process an agent registration packet to register an agent.

    Handles, in order: reconnection of a previously registered agent,
    rejection when the worker has no valid units, routing through onboarding
    when the blueprint requires it, and finally direct unit assignment.
    Replies to the requester with a PROVIDER_DETAILS packet in every path.
    """
    # First see if this is a reconnection
    crowd_data = packet.data["provider_data"]
    agent_registration_id = crowd_data["agent_registration_id"]
    logger.debug(
        f"Incoming request to register agent {agent_registration_id}.")
    if agent_registration_id in self.agents_by_registration_id:
        agent = self.agents_by_registration_id[agent_registration_id].agent
        # Update the source channel, in case it has changed
        self.agents[
            agent.get_agent_id()].used_channel_id = channel_info.channel_id
        self.message_queue.append(
            Packet(
                packet_type=PACKET_TYPE_PROVIDER_DETAILS,
                sender_id=SYSTEM_CHANNEL_ID,
                receiver_id=channel_info.channel_id,
                data={
                    "request_id": packet.data["request_id"],
                    "agent_id": agent.get_agent_id(),
                },
            ))
        logger.debug(
            f"Found existing agent_registration_id {agent_registration_id}, "
            f"reconnecting to agent {agent.get_agent_id()}.")
        return
    # Process a new agent
    task_runner = channel_info.job.task_runner
    task_run = task_runner.task_run
    worker_id = crowd_data["worker_id"]
    worker = Worker(self.db, worker_id)
    # get the list of tentatively valid units
    units = task_run.get_valid_units_for_worker(worker)
    if len(units) == 0:
        # No work available for this worker: reply with a null agent id
        self.message_queue.append(
            Packet(
                packet_type=PACKET_TYPE_PROVIDER_DETAILS,
                sender_id=SYSTEM_CHANNEL_ID,
                receiver_id=channel_info.channel_id,
                data={
                    "request_id": packet.data["request_id"],
                    "agent_id": None
                },
            ))
        # NOTE(review): this log message says "Found existing
        # agent_registration_id" but this branch is for a NEW registration —
        # looks copy-pasted from the reconnection branch above
        logger.debug(
            f"Found existing agent_registration_id {agent_registration_id}, "
            f"had no valid units.")
        return
    # If there's onboarding, see if this worker has already been disqualified
    # NOTE(review): worker_id/worker are re-fetched here although identical
    # values were computed above — redundant but harmless
    worker_id = crowd_data["worker_id"]
    worker = Worker(self.db, worker_id)
    blueprint = task_run.get_blueprint(args=task_runner.args)
    if isinstance(blueprint, OnboardingRequired) and blueprint.use_onboarding:
        if worker.is_disqualified(blueprint.onboarding_qualification_name):
            # Worker previously failed onboarding: reply with a null agent id
            self.message_queue.append(
                Packet(
                    packet_type=PACKET_TYPE_PROVIDER_DETAILS,
                    sender_id=SYSTEM_CHANNEL_ID,
                    receiver_id=channel_info.channel_id,
                    data={
                        "request_id": packet.data["request_id"],
                        "agent_id": None,
                    },
                ))
            logger.debug(
                f"Worker {worker_id} is already disqualified by onboarding "
                f"qual {blueprint.onboarding_qualification_name}.")
            return
        elif not worker.is_qualified(
                blueprint.onboarding_qualification_name):
            # Send a packet with onboarding information
            onboard_data = blueprint.get_onboarding_data(worker.db_id)
            onboard_agent = OnboardingAgent.new(self.db, worker, task_run)
            onboard_agent.state.set_init_state(onboard_data)
            agent_info = AgentInfo(agent=onboard_agent,
                                   used_channel_id=channel_info.channel_id)
            onboard_id = onboard_agent.get_agent_id()
            # register onboarding agent
            self.agents[onboard_id] = agent_info
            # Keep the original packet so registration can resume after onboarding
            self.onboarding_packets[onboard_id] = packet
            self.message_queue.append(
                Packet(
                    packet_type=PACKET_TYPE_PROVIDER_DETAILS,
                    sender_id=SYSTEM_CHANNEL_ID,
                    receiver_id=channel_info.channel_id,
                    data={
                        "request_id": packet.data["request_id"],
                        "agent_id": onboard_id,
                        "onboard_data": onboard_data,
                    },
                ))
            logger.debug(
                f"Worker {worker_id} is starting onboarding thread with "
                f"onboarding agent id {onboard_id}.")
            # Create an onboarding thread
            onboard_thread = threading.Thread(
                target=self._launch_and_run_onboarding,
                args=(agent_info, channel_info.job.task_runner),
                name=f"Onboard-thread-{onboard_id}",
            )
            onboard_agent.update_status(AgentState.STATUS_ONBOARDING)
            agent_info.assignment_thread = onboard_thread
            onboard_thread.start()
            return
    # Not onboarding, so just register directly
    self._assign_unit_to_agent(packet, channel_info, units)
def _assign_unit_to_agent(self, packet: Packet, channel_info: ChannelInfo,
                          units: List["Unit"]):
    """
    Handle creating an agent for the specific worker to register an agent.

    Pops candidate units until one can be reserved; on failure replies with a
    null agent id, on success creates the provider agent, registers it, and
    launches (or defers, for concurrent tasks) the task-running thread.
    """
    crowd_data = packet.data["provider_data"]
    task_run = channel_info.job.task_runner.task_run
    crowd_provider = channel_info.job.provider
    worker_id = crowd_data["worker_id"]
    worker = Worker(self.db, worker_id)
    logger.debug(f"Worker {worker_id} is being assigned one of "
                 f"{len(units)} units.")
    # Try each candidate unit in order until one reserves successfully;
    # reserve_unit returning None means another agent claimed it first
    reserved_unit = None
    while len(units) > 0 and reserved_unit is None:
        unit = units.pop(0)
        reserved_unit = task_run.reserve_unit(unit)
    if reserved_unit is None:
        # Nothing could be reserved: reply with a null agent id
        self.message_queue.append(
            Packet(
                packet_type=PACKET_TYPE_PROVIDER_DETAILS,
                sender_id=SYSTEM_CHANNEL_ID,
                receiver_id=channel_info.channel_id,
                data={
                    "request_id": packet.data["request_id"],
                    "agent_id": None
                },
            ))
    else:
        agent = crowd_provider.AgentClass.new_from_provider_data(
            self.db, worker, unit, crowd_data)
        logger.debug(f"Created agent {agent}, {agent.db_id}.")
        self.message_queue.append(
            Packet(
                packet_type=PACKET_TYPE_PROVIDER_DETAILS,
                sender_id=SYSTEM_CHANNEL_ID,
                receiver_id=channel_info.channel_id,
                data={
                    "request_id": packet.data["request_id"],
                    "agent_id": agent.get_agent_id(),
                },
            ))
        # Track the new agent both by agent id and by registration id so a
        # later reconnect can find it
        agent_info = AgentInfo(agent=agent,
                               used_channel_id=channel_info.channel_id)
        self.agents[agent.get_agent_id()] = agent_info
        self.agents_by_registration_id[
            crowd_data["agent_registration_id"]] = agent_info
        # Launch individual tasks
        if not channel_info.job.task_runner.is_concurrent:
            unit_thread = threading.Thread(
                target=self._launch_and_run_unit,
                args=(unit, agent_info, channel_info.job.task_runner),
                name=f"Unit-thread-{unit.db_id}",
            )
            agent_info.assignment_thread = unit_thread
            unit_thread.start()
        else:
            # See if the concurrent unit is ready to launch
            assignment = unit.get_assignment()
            agents = assignment.get_agents()
            if None in agents:
                agent.update_status(AgentState.STATUS_WAITING)
                return  # need to wait for all agents to be here to launch
            # Launch the backend for this assignment
            agent_infos = [
                self.agents[a.db_id] for a in agents if a is not None
            ]
            assign_thread = threading.Thread(
                target=self._launch_and_run_assignment,
                args=(assignment, agent_infos, channel_info.job.task_runner),
                name=f"Assignment-thread-{assignment.db_id}",
            )
            # Mark every participating agent as in-task before starting
            for agent_info in agent_infos:
                agent_info.agent.update_status(AgentState.STATUS_IN_TASK)
                agent_info.assignment_thread = assign_thread
            assign_thread.start()
def make_registered_worker(self, worker_name) -> Worker:
    """Register a sandbox-suffixed worker under the mock provider and return it."""
    new_id = self.db.new_worker(f"{worker_name}_sandbox", "mock")
    return Worker.get(self.db, new_id)
def get_named_test_worker(self, worker_name: str) -> Worker:
    """Create a test worker with the given worker name"""
    # Register under the mock provider, then hydrate the Worker from the db
    return Worker.get(self.db, self.db.new_worker(worker_name, "mock"))
def run_examine_by_worker(
    db: "MephistoDB",
    format_data_for_printing: Callable[[Dict[str, Any]], str],
    task_name: Optional[str] = None,
    block_qualification: Optional[str] = None,
    approve_qualification: Optional[str] = None,
):
    """
    Basic script for reviewing work, grouped by worker for convenience. First
    gets the required information to run a review, then interactively walks
    through every completed unit, grouped by worker, letting the reviewer
    accept, pass, or reject each (with optional soft-block/approve
    qualifications and hard blocking).
    """
    data_browser = DataBrowser(db=db)
    # Get initial arguments
    if task_name is None:
        task_name, block_qualification, approve_qualification = prompt_for_options(
            task_name, block_qualification, approve_qualification
        )
    tasks = db.find_tasks(task_name=task_name)
    assert len(tasks) >= 1, f"No task found under name {task_name}"
    print(
        "You will be reviewing actual tasks with this flow. Tasks that you either Accept or Pass "
        "will be paid out to the worker, while rejected tasks will not. Passed tasks will be "
        "specially marked such that you can leave them out of your dataset. \n"
        "You may enter the option in caps to apply it to the rest of the units for a given worker."
    )
    if block_qualification is not None:
        # Ensure the qualification exists before offering it during review
        # NOTE(review): created_block_qual is assigned but never used
        created_block_qual = find_or_create_qualification(db, block_qualification)
        print(
            "When you pass or reject a task, the script gives you an option to disqualify the worker "
            "from future tasks by assigning a qualification. If provided, this worker will no "
            "longer be able to work on tasks where the set --block-qualification shares the same name "
            f"you provided above: {block_qualification}\n"
        )
    if approve_qualification is not None:
        # NOTE(review): created_approve_qual is assigned but never used
        created_approve_qual = find_or_create_qualification(db, approve_qualification)
        print(
            "You may use this script to establish a qualified worker pool by granting the provided "
            f"approve qualification {approve_qualification} to workers you think understand the task "
            "well. This will be provided as an option for workers you (A)pprove all on. "
            "Future tasks can use this qual as a required qualification, as described in the "
            "common qualification flows document."
        )
    print(
        "**************\n"
        "You should only reject tasks when it is clear the worker has acted in bad faith, and "
        "didn't actually do the task. Prefer to pass on tasks that were misunderstandings.\n"
        "**************\n"
    )
    units = data_browser.get_units_for_task_name(task_name)
    # Non-completed units still feed the per-worker history shown during review
    others = [u for u in units if u.get_status() != "completed"]
    units = [u for u in units if u.get_status() == "completed"]
    reviews_left = len(units)
    previous_work_by_worker = get_worker_stats(others)
    # Determine allowed options
    options = ["a", "p", "r"]
    # NOTE(review): options_string is defined but never used — the prompts
    # below repeat the literal instead
    options_string = "Do you want to accept this work? (a)ccept, (r)eject, (p)ass:"
    # Group completed units by the worker that did them
    units_by_worker: Dict[str, List["Unit"]] = {}
    for u in units:
        w_id = u.worker_id
        if w_id not in units_by_worker:
            units_by_worker[w_id] = []
        units_by_worker[w_id].append(u)
    # Run the review
    for w_id, w_units in units_by_worker.items():
        worker = Worker.get(db, w_id)
        worker_name = worker.worker_name
        # A capitalized decision applies to all of this worker's remaining units
        apply_all_decision = None
        reason = None
        for idx, unit in enumerate(w_units):
            print(
                f"Reviewing for worker {worker_name}, ({idx+1}/{len(w_units)}), "
                f"Previous {format_worker_stats(w_id, previous_work_by_worker)} "
                f"(total remaining: {reviews_left})"
            )
            reviews_left -= 1
            print(format_data_for_printing(data_browser.get_data_from_unit(unit)))
            if apply_all_decision is not None:
                decision = apply_all_decision
            else:
                decision = input(
                    "Do you want to accept this work? (a)ccept, (r)eject, (p)ass: "
                )
            while decision.lower() not in options:
                decision = input(
                    "Decision must be one of a, p, r. Use CAPS to apply to all remaining for worker: "
                )
            agent = unit.get_assigned_agent()
            assert (
                agent is not None
            ), f"Can't make decision on None agent... issue with {unit}"
            if decision.lower() == "a":
                agent.approve_work()
                if decision == "A" and approve_qualification is not None:
                    should_special_qualify = input(
                        "Do you want to approve qualify this worker? (y)es/(n)o: "
                    )
                    if should_special_qualify.lower() in ["y", "yes"]:
                        worker.grant_qualification(approve_qualification, 1)
            elif decision.lower() == "p":
                # Pass: paid out but marked so it can be excluded from the dataset
                agent.soft_reject_work()
                if apply_all_decision is None and block_qualification is not None:
                    should_soft_block = input(
                        "Do you want to soft block this worker? (y)es/(n)o: "
                    )
                    if should_soft_block.lower() in ["y", "yes"]:
                        worker.grant_qualification(block_qualification, 1)
            else:  # decision = 'r'
                if apply_all_decision is None:
                    reason = input("Why are you rejecting this work? ")
                    should_block = input(
                        "Do you want to hard block this worker? (y)es/(n)o: "
                    )
                    if should_block.lower() in ["y", "yes"]:
                        block_reason = input("Why permanently block this worker? ")
                        worker.block_worker(block_reason)
                agent.reject_work(reason)
            # An uppercase decision locks in the lowercase equivalent for the
            # rest of this worker's units
            if decision.lower() != decision:
                apply_all_decision = decision.lower()
def compile_results(self) -> pd.DataFrame:
    """
    Compile per-turn conversation results from Mephisto into one DataFrame.

    Loads all task units, skips those without save data or with an invalid
    status, filters out conversations that fail acceptability checks (and,
    optionally, HITs whose per-turn annotations are all identical), then
    flattens each remaining conversation into per-turn rows.

    Side effects: writes unacceptable-worker IDs, annotation selection rates,
    and Likert score statistics to the paths configured on self.

    :raises ValueError: if no acceptable conversations remain, if a HIT has no
        annotations, or if partial annotation/rating rows are found
    :return: DataFrame of per-turn conversation data, columns ordered with
        conversation-level info first
    """
    # Load task data
    logging.info('Retrieving task data from Mephisto.')
    task_units_data = self.get_task_data()
    logging.info(
        f'Data for {len(task_units_data)} units loaded successfully.')
    num_convos_with_no_save_data = 0
    num_wrong_status_convos = 0
    num_complete_convos = 0
    unacceptable_task_units = []
    unacceptable_worker_ids = []
    conversation_idx = 0
    conversation_dfs = []
    for task_unit in task_units_data:
        worker_id = task_unit['worker_id']
        assignment_id = task_unit['assignment_id']
        # Skipping this conversation if save data is not found or the status is
        # invalid
        if task_unit['data']['save_data'] is None:
            logging.info('Found a task unit with no save data! Skipping.')
            num_convos_with_no_save_data += 1
            continue
        elif task_unit['status'] not in ['completed', 'approved']:
            logging.info(
                f'Found a HIT with the status "{task_unit["status"]}"!.'
                f'Skipping.')
            num_wrong_status_convos += 1
            continue
        else:
            num_complete_convos += 1
        # Extract out useful conversation-level data
        custom_data = task_unit['data']['save_data']['custom_data']
        mturk_worker_id = Worker.get(self.get_mephisto_db(),
                                     worker_id).worker_name
        task_start = datetime.utcfromtimestamp(task_unit['task_start'])
        task_end = datetime.utcfromtimestamp(task_unit['task_end'])
        info_dict = {
            ('worker_id', ''): worker_id,
            ('mturk_worker_id', ''): mturk_worker_id,
            ('unit_id', ''): task_unit['unit_id'],
            ('assignment_id', ''): assignment_id,
            ('conversation_idx', ''): conversation_idx,
            ('date', ''): task_start.strftime('%Y-%m-%d'),
            ('completion_time', ''): (task_end - task_start).total_seconds(),
        }
        # Check that the conversation consists of pairs of comments between
        # Speaker 1 and Speaker 2, with Speaker 1 speaking first
        assert 'final_rating' in task_unit['data']['messages'][-1][
            'task_data']
        convo_messages = [m for m in task_unit['data']['messages'][:-1]]
        # The final message is just a final rating
        # BUGFIX: the original expression
        #   message['id'] == 'Speaker 2' if message_idx % 2 else 'Speaker 1'
        # parsed as a conditional expression whose even-index branch was the
        # always-truthy string 'Speaker 1', so Speaker 1 turns were never
        # actually checked. Parenthesize the expected id to validate both.
        assert all(
            message['id'] == ('Speaker 2' if message_idx % 2 else 'Speaker 1')
            for message_idx, message in enumerate(convo_messages)
        )
        messages_1 = [m for m in convo_messages if m['id'] == 'Speaker 1']
        messages_2 = [m for m in convo_messages if m['id'] == 'Speaker 2']
        assert len(messages_1) + len(messages_2) == len(convo_messages)
        # Determine whether the HIT contains unacceptable messages. (We do this for
        # every HIT, even if acceptability violation info was already saved, because
        # the violation criteria may have changed since the HIT was collected.)
        utterances_1 = [m['text'] for m in messages_1]
        assert utterances_1[0] == 'Hi!', (
            'This script assumes that the first human message is "Hi!", which is '
            'set by default and cannot be changed by the crowdsourcing worker.'
        )
        acceptability_violations = self.acceptability_checker.check_messages(
            messages=utterances_1[1:],  # Don't use the initial "Hi!"
            is_worker_0=True,
            violation_types=self.acceptability_checker.ALL_VIOLATION_TYPES,
        )
        # Here, "worker 0" refers to Speaker 1, because we mix 0- and 1-indexing
        if acceptability_violations != '':
            logging.info(
                f'Conversation fails acceptability checks with a violation of '
                f'"{acceptability_violations}", given the following utterances: '
                f'{utterances_1[1:]}. Skipping.')
            unacceptable_task_units.append(task_unit)
            assert (
                mturk_worker_id is not None
            ), "MTurk worker ID cannot be determined for this unacceptable conversation!"
            unacceptable_worker_ids.append(mturk_worker_id)
            continue
        # Ignore the conversation if ratings for all turns are the same, because
        # it's somewhat implausible that *all* turns in a conversation should garner
        # the same rating of engagingness, humanness, interestingness, or none.
        # (However, don't put these workers on the "unacceptable worker IDs" list,
        # to give them a little bit of the benefit of the doubt: i.e. maybe the
        # worker just didn't try hard enough to find which responses were more
        # engaging, etc. than others, but that doesn't mean that all of their HITs
        # across all evals are bad and should be removed.)
        if self.filter_uniform_hits:
            annotations = [
                m['task_data']['problem_data_for_prior_message']
                for m in task_unit['data']['messages']
                if 'problem_data_for_prior_message' in m.get('task_data', {})
            ]
            # Make annotations hashable so uniqueness can be tested via a set
            hashable_annotations = [
                tuple(a[key] for key in sorted(a.keys()))
                for a in annotations
            ]
            unique_annotations = set(hashable_annotations)
            if len(unique_annotations) < 1:
                raise ValueError('No annotations found for this HIT!')
            elif len(unique_annotations) == 1:
                logging.info(
                    f'All model responses in the conversation received the same '
                    f'annotation: {hashable_annotations[0]}. Skipping.')
                unacceptable_task_units.append(task_unit)
                continue
        single_turn_dicts = []
        # Compile personas and previous utterances
        text_parts = []
        if custom_data['personas'] is not None and len(
                custom_data['personas']) > 0:
            assert len(custom_data['personas']) == 2
            text_parts += [
                'HUMAN PERSONA: ' + ' '.join(custom_data['personas'][0]),
                'BOT PERSONA: ' + ' '.join(custom_data['personas'][1]),
            ]
        if (custom_data['additional_context'] is not None
                and len(custom_data['additional_context']) > 0):
            text_parts.append('ADDITIONAL CONTEXT: ' +
                              custom_data['additional_context'])
        # First row of each conversation carries the context only
        single_turn_dicts.append({
            **info_dict, ('context', ''): ' '.join(text_parts)
        })
        # Loop over conversation turns
        turns_per_speaker = defaultdict(int)
        for message in task_unit['data']['messages']:
            if 'text' in message:
                speaker_id = message['id']
                # Add in annotation results, if they exist
                if 'problem_data_for_prior_message' in message.get(
                        'task_data', {}):
                    bucket_data = {
                        ('annotation_bucket', bucket): value
                        for bucket, value in message['task_data']
                        ['problem_data_for_prior_message'].items()
                    }
                else:
                    bucket_data = {}
                # Add in results from the final rating(s), if they exist
                if 'final_rating' in message.get('task_data', {}):
                    ratings = message['task_data']['final_rating'].split('|')
                    final_rating_data = {
                        ('final_rating', str(idx)): value
                        for idx, value in enumerate(ratings)
                    }
                else:
                    final_rating_data = {}
                turns_per_speaker[speaker_id] += 1
                single_turn_dicts.append({
                    **info_dict,
                    ('speaker_id', ''): speaker_id,
                    ('speaker_turn_idx', ''): turns_per_speaker[speaker_id],
                    ('text', ''): message['text'].replace('\n', '__newline__'),
                    **bucket_data,
                    **final_rating_data,
                })
        # Adding the full conversation to the list of conversations
        single_turn_series = [
            pd.Series(dict_).to_frame().transpose()
            for dict_ in single_turn_dicts
        ]
        single_convo_df = pd.concat(single_turn_series, axis=0, sort=False)
        conversation_dfs.append(single_convo_df)
        conversation_idx += 1
    logging.info(
        f'{num_convos_with_no_save_data:d} conversations found with no save data.'
    )
    logging.info(
        f'{num_wrong_status_convos:d} conversations found with the wrong status.'
    )
    logging.info(f'{num_complete_convos:d} complete conversations found:')
    logging.info(
        f'\t{len(unacceptable_task_units):d} unacceptable conversations.')
    logging.info(f'\t{len(conversation_dfs):d} acceptable conversations.')

    # Compile results across all conversations
    if len(conversation_dfs) == 0:
        raise ValueError('No acceptable conversations found!')
    unordered_conversation_df = pd.concat(conversation_dfs, axis=0)
    initial_ordered_columns = list(info_dict.keys()) + [
        ('context', ''),
        ('speaker_id', ''),
        ('speaker_turn_idx', ''),
        ('text', ''),
    ]
    all_ordered_columns = initial_ordered_columns + [
        col for col in unordered_conversation_df.columns
        if col not in initial_ordered_columns
    ]
    conversation_df = unordered_conversation_df[all_ordered_columns]
    # TODO: is there a less hacky way than this, which relies on the most recent
    # value of `info_dict`, to put the columns back into the right order?

    # Calculate and save auxiliary stats
    logging.info(
        f'Saving MTurk IDs of workers with unacceptable conversations to '
        f'{self.unacceptable_worker_ids_path}.')
    with open(self.unacceptable_worker_ids_path, 'w') as f:
        for worker_id in unacceptable_worker_ids:
            f.write(worker_id + '\n')
    # Calculate rates of selecting various annotation buckets
    annotation_bucket_df = conversation_df['annotation_bucket'].dropna(
        axis=0, how='any')
    if annotation_bucket_df.isna().sum().sum() > 0:
        raise ValueError(
            'There is at least one row in which only partial annotation bucket data exists!'
        )
    annotation_selection_rate_df = annotation_bucket_df.mean().to_frame(
        'selection_rate')
    annotation_selection_rate_df.to_csv(
        self.annotation_selection_rate_path)
    logging.info(
        f'Annotation bucket selection rates saved to {self.annotation_selection_rate_path}.'
    )
    output_strings = [
        f'{series.name}: {100*series["selection_rate"]:0.0f}%'
        for _, series in annotation_selection_rate_df.iterrows()
    ]
    logging.info('Annotation bucket selection rates:\n' +
                 '\n'.join(output_strings))
    # Calculate Likert score stats
    final_rating_df = conversation_df['final_rating'].dropna(axis=0,
                                                             how='any')
    if final_rating_df.isna().sum().sum() > 0:
        raise ValueError(
            'There is at least one row in which only partial final rating data exists!'
        )
    likert_score_stat_df = final_rating_df.astype(int).describe()
    likert_score_stat_df.to_csv(self.likert_score_stat_path)
    logging.info(
        f'Likert score statistics saved to {self.likert_score_stat_path}.')
    logging.info(
        f'Mean Likert scores:\n{likert_score_stat_df.loc["mean"]}')
    return conversation_df
def issue_bonuses(task_name: str) -> list:
    """
    Issue interaction-quality bonuses for completed units of the given Mephisto
    task, skipping units already recorded in the shared S3 bonus ledger, then
    append the new records to the ledger and re-upload it.

    NOTE(review): annotated to return list but ends with a bare `return`
    (returns None); presumably bonus_results was intended — confirm callers.
    """
    logging.info(f"Initializing bonus script for Mephisto task_name: {task_name}")
    # Download the shared list of issued bonuses and pull out unique reference tuples to check against
    logging.info(f"Downloading interaction bonus records from S3...")
    with open("bonus_records.csv", "wb") as f:
        s3.download_fileobj("droidlet-hitl", "bonus_records.csv", f)
    logging.info(f"Building list of already issued bonuses...")
    previously_issued_units = []
    with open("bonus_records.csv", newline="") as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            previously_issued_units.append(
                (row[0], row[1])
            )  # the combination of task_name and unit_id is essentially unique
    # Get completed units from the run_id
    logging.info(f"Retrieving units from Mephisto based on task_name...")
    units = data_browser.get_units_for_task_name(task_name)
    completed_units = []
    for unit in units:
        if unit.db_status == "completed":
            completed_units.append(unit)
    logging.info(f"Completed units for job {task_name} retrieved")
    # Retrieve bonus info from DB and issue
    new_bonus_records = []
    bonus_results = []
    total_bonus = 0
    units_skipped = 0
    for unit in completed_units:
        data = data_browser.get_data_from_unit(unit)
        unit_id = data["unit_id"]
        if (task_name, unit_id) not in previously_issued_units:
            worker = Worker(db, data["worker_id"])
            outputs = data["data"]["outputs"]
            # Click log is stored with single quotes; strip them for json.loads
            clean_click_string = outputs["clickedElements"].replace("'", "")
            clicks = json.loads(clean_click_string)
            bonus_result = False
            if clicks:
                for click in clicks:
                    if "interactionScores" in click["id"]:
                        try:
                            # Bonus = 30% of the stoplight score, rounded to cents
                            amount = float(
                                f'{(click["id"]["interactionScores"]["stoplight"] * 0.30):.2f}'
                            )
                            bonus_result, _ = worker.bonus_worker(
                                amount, "Virtual assistant interaction quality bonus", unit
                            )
                            total_bonus += amount
                            new_bonus_records.append(
                                (task_name, unit_id, worker.worker_name, amount)
                            )
                        # NOTE(review): bare except swallows all errors
                        # (including KeyboardInterrupt); narrow if possible
                        except:
                            logging.error(
                                f"Exception raised on bonus issue for {worker.worker_name}, debug"
                            )
                            # "ERR" marks the failed attempt in the ledger
                            new_bonus_records.append(
                                (task_name, unit_id, worker.worker_name, "ERR")
                            )
                            pass  # NOTE(review): dead statement after append
                if not bonus_result:
                    logging.info(
                        f"Bonus NOT successfully issued for worker {worker.worker_name}, but no error was raised. \
                        Make sure interaction score exists and retry."
                    )
            else:
                logging.info(
                    f"Recorded click data not found for {worker.worker_name}, no bonus will be issued"
                )
            bonus_results.append(bonus_result)
        else:
            units_skipped += 1
    logging.info(f"Num completed units: {len(completed_units)}")
    logging.info(
        f"Num bonuses skipped because bonus was issued previously for the same unit: {units_skipped}"
    )
    logging.info(f"Num new bonuses issued: {len([x for x in bonus_results if x])}")
    logging.info(f"Num bonuses FAILED: {len([x for x in bonus_results if not x])}")
    logging.info(f"Total bonus amount issued: {total_bonus}")
    if new_bonus_records:
        logging.info(f"There are newly issued bonuses to record")
        logging.info(f"Writing new bonuses to csv and uploading to S3...")
        # Append new records to the local ledger and push it back to S3
        with open("bonus_records.csv", "a") as f:
            writer = csv.writer(f)
            for record in new_bonus_records:
                writer.writerow(record)
        s3.upload_file("bonus_records.csv", "droidlet-hitl", "bonus_records.csv")
    # Clean up the local copy of the ledger in all cases
    os.remove("bonus_records.csv")
    logging.info(f"Finished issuing bonuses!")
    return