Example #1
import math
from collections import Counter
from datetime import datetime

from mephisto.abstractions.databases.local_database import LocalMephistoDB
from mephisto.data_model.worker import Worker
from mephisto.tools.data_browser import DataBrowser

# retrieve_units, hit_timing, plot_hist_sorted, calc_percentiles, and plot_hist
# are project-local helpers assumed to be defined elsewhere in this script.


def timing_charts(run_id: int) -> None:
    completed_units = retrieve_units(run_id)
    db = LocalMephistoDB()
    data_browser = DataBrowser(db=db)
    workers = {"total": []}
    unit_timing = {"total": [], "end": []}
    question_results = {1: [], 2: [], 3: [], 4: []}
    pass_rates = {1: [], 2: [], 3: [], 4: []}
    starttime = math.inf
    endtime = -math.inf
    feedback = []
    num_correct_hist = []
    bug_count = 0
    for unit in completed_units:
        data = data_browser.get_data_from_unit(unit)
        worker = Worker(db, data["worker_id"]).worker_name
        workers["total"].append(worker)
        starttime, endtime, unit_timing = hit_timing(data["data"], starttime, endtime, unit_timing)

        outputs = data["data"]["outputs"]
        feedback.append(outputs["feedback"])
        if outputs["bug"] == "true":
            bug_count += 1
        num_correct = 0
        for q in question_results.keys():
            key = "q" + str(q) + "Answer"
            question_results[q].append(outputs[key])
            if outputs[key] == "true":
                num_correct += 1
        num_correct_hist.append(num_correct)

    print(f"Job start time: {datetime.fromtimestamp(starttime)}")
    print(f"Job end time: {datetime.fromtimestamp(endtime)}")

    plot_hist_sorted(
        unit_timing["total"], cutoff=1200, target_val=600, xlabel="", ylabel="Total HIT Time (sec)"
    )
    calc_percentiles(unit_timing["total"], "HIT Length")

    for q in question_results.keys():
        results_dict = Counter(question_results[q])
        pass_rates[q] = (
            results_dict["true"] / (results_dict["true"] + results_dict["false"])
        ) * 100
        print(f"Question #{q} pass rate: {pass_rates[q]:.1f}%")
    plot_hist(pass_rates, xlabel="Question #", ylabel="Pass Rate %")
    print(
        f"Number of workers who didn't get any right: {len([x for x in num_correct_hist if x == 0])}"
    )

    keys = range(len(num_correct_hist))
    vals_dict = dict(zip(keys, num_correct_hist))
    plot_hist(vals_dict, xlabel="HIT #", ylabel="# Correct", ymax=4)

    print(f"Number of workers who experienced a window crash: {bug_count}")
    print(feedback)
Example #2
    def mephistoDBReader():
        from mephisto.abstractions.databases.local_database import LocalMephistoDB
        from mephisto.tools.data_browser import DataBrowser as MephistoDataBrowser

        db = LocalMephistoDB()
        mephisto_data_browser = MephistoDataBrowser(db=db)

        # `database_task_name` is expected to be defined in the enclosing scope
        units = mephisto_data_browser.get_units_for_task_name(database_task_name)
        for unit in units:
            yield mephisto_data_browser.get_data_from_unit(unit)
Example #3
    def mephistoDBReader():
        from mephisto.abstractions.databases.local_database import LocalMephistoDB
        from mephisto.tools.data_browser import DataBrowser as MephistoDataBrowser

        db = LocalMephistoDB()
        mephisto_data_browser = MephistoDataBrowser(db=db)

        def format_data_for_review(data):
            contents = data["data"]  # task payload, currently unused; the full record is stringified below
            return f"{data}"

        units = mephisto_data_browser.get_units_for_task_name(
            database_task_name)
        for unit in units:
            yield format_data_for_review(
                mephisto_data_browser.get_data_from_unit(unit))
Example #4
def print_results(
    db: "MephistoDB",
    task_name: str,
    format_data_for_printing: Callable[[Dict[str, Any]], str],
    start: Optional[int] = None,
    end: Optional[int] = None,
) -> None:
    """
    Script to write out to stdout from start to end results from the task with the given task name
    """
    data_browser = DataBrowser(db=db)
    units = data_browser.get_units_for_task_name(task_name)
    if end is None:
        end = len(units)
    if start is None:
        start = 0
    units.reverse()
    for unit in units[start:end]:
        print(_get_and_format_data(data_browser, format_data_for_printing, unit))
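
A minimal usage sketch for the print_results helper above, assuming a local Mephisto database; the task name and formatter here are hypothetical placeholders:

from mephisto.abstractions.databases.local_database import LocalMephistoDB

def show_unit(unit_data):
    # Hypothetical formatter: just stringify the whole unit data dict
    return str(unit_data)

db = LocalMephistoDB()
print_results(db, "example-task-name", show_unit, start=0, end=10)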
Example #5
def review(
    review_app_dir,
    port,
    output,
    output_method,
    csv_headers,
    json,
    database_task_name,
    all_data,
    debug,
):
    """Launch a local review UI server. Reads in rows froms stdin and outputs to either a file or stdout."""
    from mephisto.client.review.review_server import run

    if output == "" and output_method == "file":
        raise click.UsageError(
            "You must specify an output file via --output=<filename>, unless the --stdout flag is set."
        )
    if database_task_name is not None:
        from mephisto.abstractions.databases.local_database import LocalMephistoDB
        from mephisto.tools.data_browser import DataBrowser as MephistoDataBrowser

        db = LocalMephistoDB()
        mephisto_data_browser = MephistoDataBrowser(db=db)
        name_list = mephisto_data_browser.get_task_name_list()
        if database_task_name not in name_list:
            raise click.BadParameter(
                f'The task name "{database_task_name}" did not exist in MephistoDB.\n\nPerhaps you meant one of these? {", ".join(name_list)}\n\nFlag usage: mephisto review --db [task_name]\n'
            )

    run(
        review_app_dir,
        port,
        output,
        csv_headers,
        json,
        database_task_name,
        all_data,
        debug,
    )
Example #6
    def test_find_workers_by_quals(self) -> None:
        """Ensure we can find a worker by an assigned qualification"""
        db = self.db
        WORKER_1_NAME = "worker_1"
        WORKER_2_NAME = "worker_2"
        WORKER_3_NAME = "worker_3"
        QUAL_NAME = "test_qualification"
        worker_1 = self.get_named_test_worker(WORKER_1_NAME)
        worker_2 = self.get_named_test_worker(WORKER_2_NAME)
        worker_3 = self.get_named_test_worker(WORKER_3_NAME)

        find_or_create_qualification(db, QUAL_NAME)

        worker_1.grant_qualification(QUAL_NAME, skip_crowd=True)
        worker_3.grant_qualification(QUAL_NAME, skip_crowd=True)

        data_browser = DataBrowser(db)

        qualified_workers = data_browser.get_workers_with_qualification(QUAL_NAME)
        qualified_ids = [w.db_id for w in qualified_workers]
        self.assertEqual(
            len(qualified_workers),
            2,
            f"Should only be two qualified workers, found {qualified_ids}",
        )

        self.assertIn(
            worker_1.db_id,
            qualified_ids,
            f"Worker 1 not in qualified list, found {qualified_ids}",
        )
        self.assertIn(
            worker_3.db_id,
            qualified_ids,
            f"Worker 3 not in qualified list, found {qualified_ids}",
        )
        self.assertNotIn(
            worker_2.db_id,
            qualified_ids,
            "Worker 2 should not be in qualified list",
        )
Example #7
    def __init__(self, opt: Dict, remove_failed: bool = True):
        """
        Initialize the analyzer.

        Builds up the dataframe

        :param opt:
            opt dict

        :param remove_failed:
            Whether to remove ratings from turkers who failed onboarding
        """
        assert ',' not in opt['run_ids'], "AcuteAnalyzer can only handle one run ID!"
        self.run_id = opt['run_ids']
        self.pairings_filepath = opt['pairings_filepath']
        self.outdir = opt['outdir']
        self.root_dir = opt['root_dir']
        # Get task for loading pairing files
        self.task = opt.get('task', 'q')
        if opt.get('model_ordering') is not None:
            self.custom_model_ordering = opt['model_ordering'].split(',')
        else:
            self.custom_model_ordering = None
        if not self.outdir or not self.pairings_filepath:
            # Default to using self.root_dir as the root directory for outputs
            assert self.root_dir is not None and os.path.isdir(
                self.root_dir
            ), '--root-dir must be a real directory!'
        if not self.pairings_filepath:
            # Will be set to a non-empty path later
            self.pairings_filepath = ''
        if not self.outdir:
            self.outdir = os.path.join(self.root_dir, f'{self.run_id}-results')
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir, exist_ok=True)
        mephisto_root_path = opt['mephisto_root']
        if not mephisto_root_path:
            mephisto_root_path = None
        self.mephisto_db = LocalMephistoDB(database_path=mephisto_root_path)
        self.mephisto_data_browser = MephistoDataBrowser(db=self.mephisto_db)
        self.checkbox_prefix = self.CHECKBOX_PREFIX
        # Prepended to checkbox columns in self.dataframe
        self.dataframe = self._extract_to_dataframe()
        self._check_eval_question()
        if remove_failed:
            self._remove_failed_onboarding()
        if self.dataframe.index.size == 0:
            raise ValueError('No valid results found!')
        self._get_model_nick_names()
        self._load_pairing_files()
Example #8
class AcuteAnalyzer(object):
    """
    Analyzer.

    Given a run_id, we can do lots of fun things!
    """

    def __init__(self, opt: Dict, remove_failed: bool = True):
        """
        Initialize the analyzer.

        Builds up the dataframe

        :param opt:
            opt dict

        :param remove_failed:
            Whether to remove ratings from turkers who failed onboarding
        """
        self.root_dir = opt['root_dir']
        assert os.path.isdir(self.root_dir), '--root-dir must be a real directory!'
        self.run_id = opt['run_id']
        self.outdir = opt['outdir']
        # Get task for loading pairing files
        self.task = opt.get('task', 'q')
        if opt.get('model_ordering') is not None:
            self.custom_model_ordering = opt['model_ordering'].split(',')
        else:
            self.custom_model_ordering = None
        if not self.outdir:
            self.outdir = os.path.join(self.root_dir, f'{self.run_id}-results')
        if not os.path.exists(self.root_dir):
            os.makedirs(self.root_dir, exist_ok=True)
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir, exist_ok=True)
        mephisto_root_path = opt['mephisto_root']
        if not mephisto_root_path:
            mephisto_root_path = None
        mephisto_db = LocalMephistoDB(database_path=mephisto_root_path)
        self.mephisto_data_browser = MephistoDataBrowser(db=mephisto_db)
        self.checkbox_prefix = CHECKBOX_PREFIX
        # Prepended to checkbox columns in self.dataframe
        self.dataframe = self._extract_to_dataframe()
        if remove_failed:
            self._remove_failed_onboarding()
        if self.dataframe.index.size == 0:
            raise ValueError('No valid results found!')
        self._get_model_nick_names()
        self._load_pairing_files()

    def _extract_response_by_index(
        self, unit_details: Dict[str, Any], idx: int
    ) -> Optional[Dict[str, Any]]:
        """
        Extract response data from task data.

        :param unit_details:
            full extracted data from a unit
        :param idx:
            index of the singular evaluation within unit_details to extract

        :return response:
            Formatted worker's response data from the task
        """
        task_data = unit_details['data'][idx]
        response: Dict[str, Any] = {
            'run_id': self.run_id,
            'worker': unit_details['worker_id'],
            'time_taken': unit_details['task_end'] - unit_details['task_start'],
            'question': task_data['task_specs']['question'],
            'unit_id': unit_details['unit_id'],
            'task_start': unit_details['task_start'],
        }
        onboarding = task_data['task_specs'].get('is_onboarding', False)
        if 'speakerChoice' not in task_data or task_data['speakerChoice'] == '':
            print('speakerChoice not in task data!')
            return None
        choice = task_data['speakerChoice']
        if onboarding:
            response['correct'] = choice == task_data['pairing_dict']['correct_answer']
        else:
            response['correct'] = -1

        speakers_to_eval = sorted(task_data["pairing_dict"]["speakers_to_eval"])
        response.update(
            {
                'winner': choice,
                'loser': speakers_to_eval[1 - (speakers_to_eval.index(choice))],
                'eval_choice_0': speakers_to_eval[0],
                'eval_choice_1': speakers_to_eval[1],
                'reason': task_data['textReason'],
                'is_onboarding': onboarding,
                'matchup': f"{'__vs__'.join(speakers_to_eval)}",
                'pairing_id': task_data['pair_id'],
            }
        )

        # If it exists, add in which checkboxes of possible reasons the Turkers checked
        if len(task_data.get('speakerReasons', {})) > 0:
            response.update(
                {
                    self.checkbox_prefix + reason: checked
                    for reason, checked in task_data['speakerReasons'].items()
                }
            )
        return response

    def _parse_unit(self, unit: MephistoUnit) -> Optional[Dict[str, Any]]:
        """
        Return data for a given unit.

        If the data is corrupt for whatever reason, we return None

        :param unit:
            MephistoUnit of what should be a completed task by a worker

        :return data:
            Optional dict with the task's formatted data
        """
        try:
            return self.mephisto_data_browser.get_data_from_unit(unit)
        except AssertionError:
            print(
                f"WARNING: Data for run_id `{self.run_id}` not found for "
                f"unit id {unit.db_id}"
            )
            return None

    def _extract_to_dataframe(self) -> pd.DataFrame:
        """
        Extract the data from the run to a pandas dataframe.
        """
        units = self.mephisto_data_browser.get_units_for_task_name(self.run_id)
        responses: List[Dict[str, Any]] = []
        for unit in units:
            unit_details = self._parse_unit(unit)
            if unit_details is None:
                continue
            for idx in range(len(unit_details['data'])):
                response = self._extract_response_by_index(unit_details, idx)
                if response is not None:
                    responses.append(response)

        if len(responses) == 0:
            raise ValueError('No valid results found!')
        else:
            return pd.DataFrame(responses)

    def _remove_failed_onboarding(self):
        """
        Remove workers who failed onboarding.
        """
        df = self.dataframe

        all_workers_failing_onboarding = df.loc[
            df['is_onboarding'] & (df['correct'] == False), 'worker'  # noqa: E712
        ].values

        workers_failing_onboarding = sorted(
            np.unique(all_workers_failing_onboarding).tolist()
        )

        self.dataframe = df[
            ~df["worker"].isin(workers_failing_onboarding) & ~df["is_onboarding"]
        ]
        print(
            f'{self.dataframe.size:d} dataframe entries remaining after removing users who failed onboarding.'
        )

    def _load_pairing_files(self):
        df = self.dataframe
        self.pairings_filepath = get_hashed_combo_path(
            root_dir=self.root_dir,
            subdir='pairings_files',
            task=self.task,
            combos=self.combos,
        )
        if not os.path.exists(self.pairings_filepath):
            print(
                f'WARNING: Pairings filepath {self.pairings_filepath} could not be found.'
            )
            self.pairings_filepath = os.path.join(
                self.root_dir,
                'pairings_files',
                hashlib.sha1(
                    '___vs___'.join(
                        [f"{m}.{'q'.replace(':', '_')}" for m in self.models]
                    ).encode('utf-8')
                ).hexdigest()[:10],
            )
        if not os.path.exists(self.pairings_filepath):
            # For backward compatibility
            print(
                f'WARNING: Pairings filepath {self.pairings_filepath} could not be found.'
            )
            self.pairings_filepath = os.path.join(
                self.root_dir,
                'pairings_files',
                '___vs___'.join(
                    [f"{m}.{self.task.replace(':', '_')}" for m in self.models]
                ),
            )
        if not os.path.exists(self.pairings_filepath):
            print(
                f'NOTE: Pairings filepath {self.pairings_filepath} could not be found!'
            )
            return
        self.pairings = []
        with open(self.pairings_filepath, 'r') as f:
            for line in f:
                pair = json.loads(line)
                model1, model2 = pair['speakers_to_eval']
                pair[model1] = pair['dialogue_dicts'][0]
                pair[model2] = pair['dialogue_dicts'][1]
                del pair['dialogue_dicts']
                self.pairings.append(pair)
        self.pairs_to_eval = [self.pairings[i] for i in df.pairing_id.values.tolist()]
        # Build dialogue_ids => dialogue mappings

        winner_dialogues = []
        loser_dialogues = []
        for i, (_, row) in enumerate(df.iterrows()):
            winner = row['winner']
            loser = row['loser']
            winner_dialogues.append(self.pairs_to_eval[i][winner])
            loser_dialogues.append(self.pairs_to_eval[i][loser])
        df['pairs_to_eval'] = pd.Series(self.pairs_to_eval, index=df.index)
        df['winner_dialogue'] = pd.Series(winner_dialogues, index=df.index)
        df['loser_dialogue'] = pd.Series(loser_dialogues, index=df.index)
        self.dataframe = df

    def _get_model_nick_names(self):
        df = self.dataframe
        df = df[df['run_id'] == self.run_id]
        matchups = list(df.matchup.unique())
        models = set()
        combos = set()
        for matchup in matchups:
            model1, model2 = matchup.split('__vs__')
            models.add(model1)
            models.add(model2)
            combos.add(tuple(sorted((model1, model2))))
        self.models = list(models)
        self.models.sort()
        self.combos = list(combos)
        self.combos.sort()

    def get_reasons(self) -> List[str]:
        """
        Return dataframe reasons.
        """
        return self.dataframe['reason'].values.tolist()

    def get_max_hits_per_worker(self) -> int:
        """
        Get the max number of HITs completed by any single worker.
        """
        return self.dataframe.groupby('worker')['run_id'].count().max()

    def get_wins_per_model_matchup(self) -> pd.DataFrame:
        """
        Return the wins for each model by matchup.
        """
        self.matchup_total_df = (
            self.dataframe.groupby(['eval_choice_0', 'eval_choice_1'])['run_id']
            .count()
            .to_frame('matchup_total')
        )
        self.win_total_df = (
            self.dataframe.groupby(
                ['eval_choice_0', 'eval_choice_1', 'winner', 'loser']
            )['loser']
            .count()
            .to_frame('win_total')
            .reset_index()
            .set_index(['eval_choice_0', 'eval_choice_1'])
        )
        return self.win_total_df

    def get_win_fractions(self) -> pd.DataFrame:
        """
        Return the joined matchup + win totals, get win fractions.

        Sorted according to win percentage
        """
        if not hasattr(self, 'win_total_df'):
            self.get_wins_per_model_matchup()

        self.win_fraction_df = self.matchup_total_df.join(self.win_total_df).assign(
            win_frac=lambda df: df['win_total'] / df['matchup_total']
        )

        pivoted_df = self.win_fraction_df.pivot(
            index="loser", columns="winner", values="win_frac"
        )
        if self.custom_model_ordering is not None:
            # Use the ordering of the models supplied by the user
            assert set(self.custom_model_ordering) == set(pivoted_df.columns)
            self.model_ordering = self.custom_model_ordering
        else:
            self.model_ordering = (
                self.win_fraction_df.groupby("winner")["win_frac"]
                .mean()
                .sort_values()
                .index.values.tolist()
            )
        self.sorted_win_frac_df = pivoted_df.reindex(
            index=self.model_ordering, columns=self.model_ordering
        )
        return self.sorted_win_frac_df

    def get_num_hits_per_matchup(self):
        """
        Return the number of hits per matchup.
        """
        matchup_total_1_df = self.matchup_total_df.reset_index()
        matchup_total_2_df = matchup_total_1_df.rename(
            columns={'eval_choice_0': 'eval_choice_1', 'eval_choice_1': 'eval_choice_0'}
        )
        self.num_hits_per_matchup_df = (
            pd.concat([matchup_total_1_df, matchup_total_2_df], axis=0)
            .pivot(
                index='eval_choice_0', columns='eval_choice_1', values='matchup_total'
            )
            .reindex(index=self.model_ordering, columns=self.model_ordering)
        )
        return self.num_hits_per_matchup_df

    def _compile_checkbox_stats(self) -> Dict[str, pd.DataFrame]:
        """
        Return the fraction of time that Turkers selected each checkbox.

        Results are cut both (1) by matchup and winner and (2) by just the winner. Each
        checkbox represents one reason that the Turkers could have chosen the speaker
        that they did.
        """
        checkbox_columns = [
            col
            for col in self.dataframe.columns
            if col.startswith(self.checkbox_prefix)
        ]
        group_column_types = {
            'matchup_and_winner': ['matchup', 'winner'],
            'winner': ['winner'],
        }
        grouped_dataframes = {}
        for group_type, group_columns in group_column_types.items():
            selected_columns = (
                self.dataframe[group_columns + checkbox_columns]
                .rename(
                    columns={
                        col: col[len(self.checkbox_prefix) :]
                        for col in checkbox_columns
                    }
                )
                .set_index(group_columns)
                .fillna(False)
            )
            grouped_dataframes[group_type] = selected_columns.groupby(
                group_columns
            ).mean()
        return grouped_dataframes

    def _compile_convos_and_reasons(self) -> str:
        """
        Create a human-readable string of all pairs of conversations, as well as which
        conversation each Turker chose and their reason for choosing it.
        """

        pairing_outputs = []

        for _, pairing_sr in self.dataframe.iterrows():
            winning_dialogue = self._dialogue_to_string(
                pairing_sr['winner_dialogue']['dialogue']
            )
            loser_dialogue = self._dialogue_to_string(
                pairing_sr['loser_dialogue']['dialogue']
            )
            pairing_output = f"""CONVO PAIR ID: {pairing_sr['pairing_id']}

WINNING DIALOGUE: {pairing_sr['winner']}
{winning_dialogue}

LOSING DIALOGUE: {pairing_sr['loser']}
{loser_dialogue}

QUESTION: {pairing_sr['question']}
TURKER'S CHOICE: {pairing_sr['winner']}
REASON: {pairing_sr['reason']}



"""
            pairing_outputs.append(pairing_output)

        return ''.join(pairing_outputs)

    @staticmethod
    def _dialogue_to_string(dialogue: List[dict]) -> str:
        """
        Convert a list of dictionaries into a human-readable conversation.

        Each dictionary represents one utterance.
        """
        utterance_strings = []
        for utterance_dict in dialogue:
            if utterance_dict["id"] == "human_evaluator":
                speaker_string = "HUMAN"
            else:
                speaker_string = utterance_dict["id"]
            utterance = utterance_dict["text"]
            utterance_strings.append(f"[{speaker_string}]: {utterance}")
        return "\n".join(utterance_strings)

    def get_matchup_totals_with_significance(self) -> pd.DataFrame:
        """
        Return dataframe with matchup win totals + significance.
        """

        def _signf_level(p):
            if p < 0.001:
                return "***", "p<.001"
            elif p < 0.01:
                return "**", "p<.01"
            elif p < 0.05:
                return "*", "p<.05"
            else:
                return "", "p>.05"

        output = []
        for _, run_annotations in self.dataframe.groupby('run_id'):
            question = list(run_annotations.question)[0]
            for matchup, annotations in run_annotations.groupby('matchup'):
                model1, model2 = matchup.split('__vs__')
                wincount1 = np.sum(annotations['winner'] == model1)
                wincount2 = np.sum(annotations['winner'] == model2)
                numratings = wincount1 + wincount2
                winrate1 = np.mean(annotations['winner'] == model1)
                winrate2 = np.mean(annotations['winner'] == model2)
                p = binom_test([wincount1, wincount2])

                stars, plevel = _signf_level(p)

                agreements = []
                for _, pairing_annotations in annotations.groupby('pairing_id'):
                    pair_wincount1 = np.sum(pairing_annotations['winner'] == model1)
                    pair_wincount2 = np.sum(pairing_annotations['winner'] == model2)
                    if pair_wincount1 < 2 and pair_wincount2 < 2:
                        if pair_wincount1 == 1 and pair_wincount2 == 1:
                            agreements.append(0)
                    else:
                        majority_wincount = max(pair_wincount1, pair_wincount2)
                        num_pair_annotations = pair_wincount1 + pair_wincount2
                        pair_agreement = majority_wincount / num_pair_annotations
                        agreements.append(pair_agreement)
                total_agreement = np.mean(agreements)

                output.append(
                    {
                        'question': question,
                        'matchup': matchup,
                        'model1': model1,
                        'model2': model2,
                        'numwins1': wincount1,
                        'numwins2': wincount2,
                        'winrate1': winrate1,
                        'winrate2': winrate2,
                        'numratings': numratings,
                        'p': p,
                        'stars': stars,
                        'sigf': plevel,
                        'agree': total_agreement,
                    }
                )
        output = pd.DataFrame(output)
        # order the columns how we want
        self.significance_df = output[
            [
                'question',
                'matchup',
                'model1',
                'numwins1',
                'winrate1',
                'model2',
                'numwins2',
                'winrate2',
                'numratings',
                'sigf',
                'stars',
                'p',
                'agree',
            ]
        ]
        return self.significance_df

    def save_results(self, path: Optional[str] = None):
        """
        Save results to a certain path.
        """
        if not hasattr(self, 'significance_df'):
            self.get_matchup_totals_with_significance()
        if path is None:
            path = self.outdir

        # Save raw dataframe
        self.dataframe.to_csv(f'{path}/{self.run_id}.full.csv', index=False)

        with open('{}/{}.significance.csv'.format(path, self.run_id), 'w') as f:
            f.write(self.significance_df.to_csv(index=False))
        print(
            'To visualize significance result, try cat {} | column -t -s, | less -S'.format(
                '{}/{}.significance.csv'.format(path, self.run_id)
            )
        )
        with open('{}/{}.grid.csv'.format(path, self.run_id), 'w') as f:
            f.write(self.get_win_fractions().to_csv(index=True))
        with open(f'{path}/{self.run_id}.grid.winners_as_rows.csv', 'w') as f:
            f.write(self.get_win_fractions().transpose().to_csv(index=True))
        print(
            'To visualize grid result, try cat {} | column -t -s, | less -S'.format(
                '{}/{}.grid.csv'.format(path, self.run_id)
            )
        )

        # Save stats on how many ratings each worker did
        ratings_per_worker = (
            self.dataframe.groupby('worker')['run_id']
            .count()
            .sort_values(ascending=False)
        )
        ratings_per_worker.to_csv(f'{path}/{self.run_id}.ratings_per_worker.csv')

        # Save stats on how often Turkers selected each checkbox that represents one
        # reason to pick the speaker they did
        if any(col.startswith(self.checkbox_prefix) for col in self.dataframe.columns):
            checkbox_stats_dataframes = self._compile_checkbox_stats()
            for group_type, stats in checkbox_stats_dataframes.items():
                stats.to_csv(f'{path}/{self.run_id}.checkbox_stats.{group_type}.csv')

        if not hasattr(self, 'pairings'):
            print('No pairing file found, skipping conversation visualizations.')
        else:
            with open('{}/{}.reason.html'.format(path, self.run_id), 'w') as f:
                f.write(render_conversations_per_matchups(self.dataframe, True).data)
            print(
                'To visualize conversations with reasons only result, '
                'try scp username@devfair:{} to your local machine'.format(
                    ' {}/{}.reason.html'.format(path, self.run_id)
                )
            )
            with open('{}/{}.all.html'.format(path, self.run_id), 'w') as f:
                f.write(render_conversations_per_matchups(self.dataframe, False).data)
            print(
                'To visualize conversations result, try scp username@devfair:{}'
                ' to your local machine'.format(
                    '{}/{}.all.html'.format(path, self.run_id)
                )
            )

            # Write all pairs of dialogues, as well as the Turkers' choices and reasons, as
            # a text file
            compiled_text = self._compile_convos_and_reasons()
            with open(f'{path}/{self.run_id}.all_convo_pairs.txt', 'w') as f:
                f.write(compiled_text)
Example #9
    # `db`, `worker`, `task_run`, `assignment_id`, and `annotation` are defined
    # earlier in the surrounding script.
    assignment = Assignment(db, assignment_id)
    assignment.write_assignment_data(
        InitializationData(unit_data={}, shared=annotation["inputs"])
    )

    unit_id = db.new_unit(
        task_run.task_id,
        task_run.db_id,
        task_run.requester_id,
        assignment_id,
        0,  # Unit_index
        0,  # reward
        task_run.provider_type,
        task_run.task_type,
        task_run.sandbox,
    )

    unit = Unit(db, unit_id)
    agent = MockAgent.new(db, worker, unit)
    agent.state.state["inputs"] = annotation["inputs"]
    agent.state.state["outputs"] = annotation["outputs"]
    agent.state.save_data()
    agent.mark_done()
    agent.update_status(AgentState.STATUS_COMPLETED)

# Show that the tasks now appear in MephistoDB:
mephisto_data_browser = MephistoDataBrowser(db=db)
units = mephisto_data_browser.get_units_for_task_name(input("Input task name: "))
for unit in units:
    print(mephisto_data_browser.get_data_from_unit(unit))
Example #10
from mephisto.abstractions.databases.local_database import LocalMephistoDB
from mephisto.tools.data_browser import DataBrowser


def check_run_status(run_id: int, qual_name: str) -> None:
    db = LocalMephistoDB()
    units = db.find_units(task_run_id=run_id)
    units_num = len(units)
    completed_num = 0
    launched_num = 0
    assigned_num = 0
    accepted_num = 0
    completed_units = []
    for unit in units:
        if unit.db_status == "completed":
            completed_num += 1
            completed_units.append(unit)
        elif unit.db_status == "launched":
            launched_num += 1
        elif unit.db_status == "assigned":
            assigned_num += 1
        elif unit.db_status == "accepted":
            accepted_num += 1
    print(
        f"Total HIT num: {units_num}\tCompleted HIT num: {completed_num}\tCompleted rate: {completed_num / units_num * 100}%"
    )
    print(
        f"Total HIT num: {units_num}\tLaunched HIT num: {launched_num}\tLaunched rate: {launched_num / units_num * 100}%"
    )
    print(
        f"Total HIT num: {units_num}\tAssigned HIT num: {assigned_num}\tAssigned rate: {assigned_num / units_num * 100}%"
    )
    print(
        f"Total HIT num: {units_num}\tAccepted HIT num: {accepted_num}\tAccepted rate: {accepted_num / units_num * 100}%"
    )

    data_browser = DataBrowser(db=db)
    total_time_completed_in_min = 0
    total_cnt = 0
    passed_time = 0
    passed_cnt = 0
    turkers_with_mturk_qual_cnt = 0
    for unit in completed_units:
        data = data_browser.get_data_from_unit(unit)
        duration = (
            data["data"]["times"]["task_end"] - data["data"]["times"]["task_start"]
        ) / 60  # in minutes
        if duration > 0:
            total_time_completed_in_min += duration
            total_cnt += 1
        worker_name = db.get_worker(worker_id=unit.worker_id)["worker_name"]
        turkers_with_mturk_qual_cnt += 1
        worker = db.find_workers(worker_name=worker_name)[0]
        if worker.get_granted_qualification(qual_name):
            passed_time += duration
            passed_cnt += 1

    print(
        f"For mephisto/mturk debug: total num: {total_cnt}, # who pass mturk qual: {turkers_with_mturk_qual_cnt}"
    )
    print(
        f"Total completed HITS\t\t{total_cnt}\tavg time spent\t{total_time_completed_in_min / total_cnt} mins"
    )
    print(
        f"HITS passed qualification\t{passed_cnt}\tavg time spent\t{passed_time / passed_cnt} mins"
    )
    try:
        print(
            f"HITS failed qualification\t{total_cnt - passed_cnt}\tavg time spent\t{(total_time_completed_in_min - passed_time) / (total_cnt - passed_cnt)} mins"
        )
    except ZeroDivisionError:
        # Nothing to report if every completed HIT passed the qualification
        pass
Example #11
import argparse
import logging
import json
import boto3
import csv
import os

from mephisto.abstractions.databases.local_database import LocalMephistoDB
from mephisto.tools.data_browser import DataBrowser
from mephisto.data_model.worker import Worker

logging.basicConfig(level="INFO")

db = LocalMephistoDB()
data_browser = DataBrowser(db=db)

s3 = boto3.client("s3")


def issue_bonuses(task_name: str) -> list:
    logging.info(f"Initializing bonus script for Mephisto task_name: {task_name}")

    # Download the shared list of issued bonuses and pull out unique reference tuples to check against
    logging.info(f"Downloading interaction bonus records from S3...")
    with open("bonus_records.csv", "wb") as f:
        s3.download_fileobj("droidlet-hitl", "bonus_records.csv", f)

    logging.info(f"Building list of already issued bonuses...")
    previously_issued_units = []
    with open("bonus_records.csv", newline="") as csvfile:
Example #12
def run_examine_by_worker(
    db: "MephistoDB",
    format_data_for_printing: Callable[[Dict[str, Any]], str],
    task_name: Optional[str] = None,
    block_qualification: Optional[str] = None,
    approve_qualification: Optional[str] = None,
):
    """
    Basic script for reviewing work, grouped by worker for convenience. First gets
    the required information to run a review, then
    """
    data_browser = DataBrowser(db=db)

    # Get initial arguments
    if task_name is None:
        task_name, block_qualification, approve_qualification = prompt_for_options(
            task_name, block_qualification, approve_qualification
        )

    tasks = db.find_tasks(task_name=task_name)
    assert len(tasks) >= 1, f"No task found under name {task_name}"

    print(
        "You will be reviewing actual tasks with this flow. Tasks that you either Accept or Pass "
        "will be paid out to the worker, while rejected tasks will not. Passed tasks will be "
        "specially marked such that you can leave them out of your dataset. \n"
        "You may enter the option in caps to apply it to the rest of the units for a given worker."
    )
    if block_qualification is not None:
        created_block_qual = find_or_create_qualification(db, block_qualification)
        print(
            "When you pass or reject a task, the script gives you an option to disqualify the worker "
            "from future tasks by assigning a qualification. If provided, this worker will no "
            "longer be able to work on tasks where the set --block-qualification shares the same name "
            f"you provided above: {block_qualification}\n"
        )
    if approve_qualification is not None:
        created_approve_qual = find_or_create_qualification(db, approve_qualification)
        print(
            "You may use this script to establish a qualified worker pool by granting the provided "
            f"approve qualification {approve_qualification} to workers you think understand the task "
            "well. This will be provided as an option for workers you (A)pprove all on. "
            "Future tasks can use this qual as a required qualification, as described in the "
            "common qualification flows document."
        )
    print(
        "**************\n"
        "You should only reject tasks when it is clear the worker has acted in bad faith, and "
        "didn't actually do the task. Prefer to pass on tasks that were misunderstandings.\n"
        "**************\n"
    )

    units = data_browser.get_units_for_task_name(task_name)

    others = [u for u in units if u.get_status() != "completed"]
    units = [u for u in units if u.get_status() == "completed"]
    reviews_left = len(units)
    previous_work_by_worker = get_worker_stats(others)

    # Determine allowed options
    options = ["a", "p", "r"]
    options_string = "Do you want to accept this work? (a)ccept, (r)eject, (p)ass:"

    units_by_worker: Dict[str, List["Unit"]] = {}

    for u in units:
        w_id = u.worker_id
        if w_id not in units_by_worker:
            units_by_worker[w_id] = []
        units_by_worker[w_id].append(u)

    # Run the review
    for w_id, w_units in units_by_worker.items():
        worker = Worker.get(db, w_id)
        worker_name = worker.worker_name
        apply_all_decision = None
        reason = None
        for idx, unit in enumerate(w_units):

            print(
                f"Reviewing for worker {worker_name}, ({idx+1}/{len(w_units)}), "
                f"Previous {format_worker_stats(w_id, previous_work_by_worker)} "
                f"(total remaining: {reviews_left})"
            )
            reviews_left -= 1
            print(format_data_for_printing(data_browser.get_data_from_unit(unit)))
            if apply_all_decision is not None:
                decision = apply_all_decision
            else:
                decision = input(
                    "Do you want to accept this work? (a)ccept, (r)eject, (p)ass: "
                )
            while decision.lower() not in options:
                decision = input(
                    "Decision must be one of a, p, r. Use CAPS to apply to all remaining for worker: "
                )

            agent = unit.get_assigned_agent()
            assert (
                agent is not None
            ), f"Can't make decision on None agent... issue with {unit}"
            if decision.lower() == "a":
                agent.approve_work()
                if decision == "A" and approve_qualification is not None:
                    should_special_qualify = input(
                        "Do you want to approve qualify this worker? (y)es/(n)o: "
                    )
                    if should_special_qualify.lower() in ["y", "yes"]:
                        worker.grant_qualification(approve_qualification, 1)
            elif decision.lower() == "p":
                agent.soft_reject_work()
                if apply_all_decision is None and block_qualification is not None:
                    should_soft_block = input(
                        "Do you want to soft block this worker? (y)es/(n)o: "
                    )
                    if should_soft_block.lower() in ["y", "yes"]:
                        worker.grant_qualification(block_qualification, 1)
            else:  # decision = 'r'
                if apply_all_decision is None:
                    reason = input("Why are you rejecting this work? ")
                    should_block = input(
                        "Do you want to hard block this worker? (y)es/(n)o: "
                    )
                    if should_block.lower() in ["y", "yes"]:
                        block_reason = input("Why permanently block this worker? ")
                        worker.block_worker(block_reason)
                agent.reject_work(reason)

            if decision.lower() != decision:
                apply_all_decision = decision.lower()