def main(argv=None):
    args = docopt.docopt(__doc__, argv=argv)

    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )
    if args['--class-weight'] is not None:
        class_weights = dict(
            map(_parse_class_weight_option, args['--class-weight'])
        )
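        # Merge user-supplied weights into the module-level CLASS_WEIGHTS.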
        global CLASS_WEIGHTS
        CLASS_WEIGHTS.update(class_weights)

    paths = args['<dump-file>']
    with open(args['--model']) as f:
        model = Model.load(f)

    sunset = mwtypes.Timestamp(args['--sunset'])

    if args['--score-at'] not in SCORE_ATS:
        raise ValueError("--score-at value {0} not available in {1}"
                         .format(args['--score-at'], SCORE_ATS))
    else:
        score_at = args['--score-at']

    if args['--rev-scores'] == "<stdout>":
        rev_scores = mysqltsv.Writer(sys.stdout, headers=HEADERS)
    else:
        rev_scores = mysqltsv.Writer(
            open(args['--rev-scores'], "w"), headers=HEADERS)

    if args['--extend'] is None:
        skip_scores_before = {}
    else:
        logger.info("Reading in past scores from {0}".format(args['--extend']))
        skip_scores_before = {}
        rows = mysqltsv.read(
            open(args['--extend']),
            types=[int, str, int, mwtypes.Timestamp, str, float])
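        # Record the last scored timestamp per page_id so already-scored
        # revisions can be skipped.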
        for row in rows:
            skip_scores_before[row.page_id] = row.timestamp
        logger.info("Completed reading scores from old output.")

    if args['--processes'] == "<cpu count>":
        processes = cpu_count()
    else:
        processes = int(args['--processes'])

    verbose = args['--verbose']
    run(paths, model, sunset, score_at, rev_scores, skip_scores_before,
        processes, verbose=verbose)
Example #2
def main(argv=None):
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        level=logging.INFO if not args['--debug'] else logging.DEBUG,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    input_building_data_file = mysqltsv.Reader(
        open(args['<input_building_data>'], 'rt'),
        headers=False,
        types=[str, str, str, str, str])

    input_i1_testing_data_file = mysqltsv.Reader(
        open(args['<input_i1_testing_data>'], 'rt'),
        headers=False,
        types=[str, str, str, str, str])

    input_i2_testing_data_file = mysqltsv.Reader(
        open(args['<input_i2_testing_data>'], 'rt'),
        headers=False,
        types=[str, str, str, str, str])

    output_file = mysqltsv.Writer(open(args['<output>'], "w"))

    verbose = args['--verbose']

    run(input_building_data_file, input_i1_testing_data_file,
        input_i2_testing_data_file, output_file, verbose)
Example #3
def main(argv=None):
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        level=logging.INFO if not args['--debug'] else logging.DEBUG,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    input_file = mysqltsv.Reader(open(args['<input>'], 'rt'),
                                 headers=True,
                                 types=[
                                     str, str, str, str, str, int, str, str,
                                     str, str, str, str, str
                                 ])

    input_file_used_in_second_iteration = mysqltsv.Reader(
        open(args['<input>'], 'rt'),
        headers=True,
        types=[
            str, str, str, str, str, int, str, str, str, str, str, str, str
        ])

    output_file = mysqltsv.Writer(open(args['<output>'], "w"),
                                  headers=[
                                      'title', 'rev_id', 'user', 'username',
                                      'comment', 'namespace', 'timestamp',
                                      'prev_timestamp', 'session_start',
                                      'session_end', 'session_index',
                                      'session_events', 'event_index'
                                  ])

    verbose = args['--verbose']

    run(input_file, input_file_used_in_second_iteration, output_file, verbose)
Example #4
def main(argv=None):
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        level=logging.INFO if not args['--debug'] else logging.DEBUG,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    input_file = mysqltsv.Reader(open(args['<input>'], 'rt'),
                                 headers=True,
                                 types=[
                                     str, str, str, str, str, int, str, str,
                                     str, str, str, str, str, str
                                 ])

    output_file = mysqltsv.Writer(
        open(args['<output>'], "w"),
        headers=[
            'user', 'username', 'session_start', 'mean_in_seconds',
            'std_in_seconds', 'namespace_0_edits', 'namespace_1_edits',
            'namespace_2_edits', 'namespace_3_edits', 'namespace_4_edits',
            'namespace_5_edits', 'namespace_120_edits', 'namespace_121_edits',
            'edits', 'bot', 'human', 'session_length_in_seconds',
            'inter_edits_less_than_5_seconds',
            'inter_edits_between_5_and_20_seconds',
            'inter_edits_greater_than_20_seconds', 'claims', 'distinct_claims',
            'distinct_pages', 'disinct_edit_kinds', 'generic_bot_comment',
            'bot_revision_comment', 'sitelink_changes', 'alias_changed',
            'label_changed', 'description_changed', 'edit_war',
            'inter_edits_less_than_2_seconds', 'things_removed',
            'things_modified'
        ])

    verbose = args['--verbose']

    run(input_file, output_file, verbose)
Example #5
def run(observations, output, features, label_name, model, additional_fields,
        verbose):
    headers = [str(f) for f in features] + [label_name]
    if model is not None:
        headers.append("score_doc")
    headers.extend(additional_fields)
    writer = mysqltsv.Writer(output, headers=headers)

    for ob in observations:
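        # Solve the feature values, append the label, then optionally a
        # JSON-encoded score doc and any additional fields.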
        try:
            feature_values = list(solve(features, cache=ob['cache']))
            row = feature_values + [ob[label_name]]
            if model is not None:
                score_doc = model.score(feature_values)
                row += [json.dumps(score_doc)]
            for field_name in additional_fields:
                row += [ob[field_name]]
            writer.write(row)
            if verbose:
                sys.stderr.write(".")
                sys.stderr.flush()
        except:  # noqa: E722
            # Naughty indiscriminate exception consumption.
            sys.stderr.write(traceback.format_exc())

    if verbose:
        sys.stderr.write("\n")
Example #6
def run(dump_files, extractors):
    writer = mysqltsv.Writer(sys.stdout, headers=HEADERS)

    cites = extract(dump_files, extractors=extractors)
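    # Write one row per extracted cite.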
    for page_id, title, rev_id, timestamp, type, id in cites:
        writer.write(
            [page_id, title, rev_id,
             timestamp.long_format(), type, id])
Example #7
def main(argv=None):
    args = docopt.docopt(__doc__, argv=argv)

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    revert_radius = int(args['--revert-radius'])
    revert_window = int(args['--revert-window']) * (60 * 60)  # hrs --> secs

    if args['--host']:
        session = mwapi.Session(args['--host'],
                                user_agent="ORES revert labeling utility")
    else:
        session = None
    dumps = args['<dump-file>']

    verbose = args['--verbose']
    start = args['--start']
    if start:
        start = Timestamp(start)
    end = args['--end']
    if end:
        end = Timestamp(end)
    reverted_only = args['--reverted-only']
    trusted_groups = args['--trusted-groups']
    if trusted_groups:
        trusted_groups = trusted_groups.split(',')
        trusted_users = load_user_group_members(trusted_groups, session)
    else:
        trusted_users = None
    trusted_edits = args['--trusted-edits']
    if trusted_edits:
        trusted_edits = int(trusted_edits)

    if args['--rev-reverteds'] == "<stdout>":
        rev_reverteds = mysqltsv.Writer(sys.stdout)
    else:
        rev_reverteds = mysqltsv.Writer(open(args['--rev-reverteds'], "w"))

    check_blocked = args['--check-blocked']
    run(dumps, session, start, end, revert_radius, revert_window,
        reverted_only, trusted_users, trusted_edits, rev_reverteds,
        check_blocked, verbose=verbose)
Example #8
def run(ores_url, context, model, revs):

    writer = mysqltsv.Writer(sys.stdout, headers=revs.headers + ['proba'])

    for batch in batches(revs):
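        # Fetch probabilities for the whole batch from ORES, then append
        # each probability to its input row.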
        rev_ids = [r.rev_id for r in batch]
        probas = get_probas(ores_url, context, model, rev_ids)

        for rev, proba in zip(batch, probas):
            writer.write(list(rev) + [proba])
Example #9
def run(paths, rate):

    writer = mysqltsv.Writer(sys.stdout, headers=HEADERS)

    def process_path(path):
        f = mwcli.files.reader(path)

        return sample_tokens((json.loads(line) for line in f), rate)

    for values in para.map(process_path, paths):
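        # para.map runs process_path across the paths in parallel and
        # yields the sampled rows.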
        writer.write(values)
Example #10
def main(argv=None):
    args = docopt.docopt(__doc__, argv=argv)

    if len(args['<source>']) > 0:
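        # One Reader per listed source file; otherwise fall back to stdin.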
        sources = [
            mysqltsv.Reader(open(path, errors='replace'),
                            error_handler=log_error)
            for path in args['<source>']
        ]
    else:
        input_stream = io.TextIOWrapper(sys.stdin.buffer,
                                        encoding='utf-8',
                                        errors='replace')
        sources = [mysqltsv.Reader(input_stream, error_handler=log_error)]

    user_cols = args['--user']
    timestamp_col = args['--timestamp']
    cutoff = float(args['--cutoff'])

    if args['--sessions'] == "<stdout>":
        session_writer = mysqltsv.Writer(sys.stdout,
                                         headers=user_cols + SESSION_SUFFIX)
    else:
        session_writer = mysqltsv.Writer(open(args['--sessions'], 'w'),
                                         headers=user_cols + SESSION_SUFFIX)

    if args['--events'] is not None:
        event_writer = mysqltsv.Writer(open(args['--events'], 'w'),
                                       headers=sources[0].headers +
                                       EVENT_SUFFIX)
    else:
        event_writer = None

    verbose = args['--verbose']
    debug = args['--debug']

    run(sources, cutoff, session_writer, event_writer, user_cols,
        timestamp_col, verbose, debug)
Example #11
def main(argv=None):
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        level=logging.INFO if not args['--debug'] else logging.DEBUG,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    input_files = args['<input>']

    revisions_output_file = mysqltsv.Writer(
        open(args['--revisions-output'], "w"))

    verbose = args['--verbose']

    run(input_files, revisions_output_file, verbose)
Example #12
def run(rev_ids, score_processor, cache, verbose, debug):

    writer = mysqltsv.Writer(sys.stdout, headers=['rev_id', 'true_proba'])

    for rev_id, score in score_processor.score(rev_ids, cache=cache):
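        # A score doc with a 'type' field is treated as an error ('e');
        # successful scores carry a 'probability' dict.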
        if 'type' in score:
            sys.stderr.write("e")
        elif 'probability' in score:
            writer.write([rev_id, score['probability'][True]])
            sys.stderr.write(".")
        else:
            sys.stderr.write(json.dumps(score))

        sys.stderr.flush()

    sys.stderr.write("\n")
Example #13
def run(observations, features, label_name, verbose):
    writer = mysqltsv.Writer(
        sys.stdout, headers=[str(f) for f in features] + [label_name])

    for ob in observations:
        try:
            row = list(solve(features, cache=ob['cache'])) + [ob[label_name]]
            writer.write(row)
            if verbose:
                sys.stderr.write(".")
                sys.stderr.flush()
        except:  # noqa: E722
            # Naughty indiscriminate exception consumption.
            sys.stderr.write(traceback.format_exc())

    if verbose:
        sys.stderr.write("\n")
Example #14
def main():
    args = docopt.docopt(__doc__)

    HEADINGS = [
        "month", "page_namespace", "reverts", "bot_reverts", "bot_reverteds",
        "bot2bot_reverts"
    ]

    if args['--bots']:
        bots = {u.strip() for u in open(args['--bots'])}
    else:
        bots = set()  # no --bots file given; treat every editor as non-bot

    logging.basicConfig(
        level=logging.WARNING,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    writer = mysqltsv.Writer(sys.stdout, headers=HEADINGS)

    nmc = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
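    # nmc: month -> namespace -> revert counters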

    for doc in read_json_lines(sys.stdin):
        reverted_username = doc['reverteds'][-1].get('user', {}).get('text')
        reverting_username = doc['reverting'].get('user', {}).get('text')
        if reverted_username == reverting_username:
            continue
        dbts = Timestamp(doc['reverting']['timestamp']).short_format()
        month = dbts[:6] + "01"
        namespace = doc['reverting']['page']['namespace']

        nmc[month][namespace]['reverts'] += 1
        nmc[month][namespace]['bot_reverts'] += reverting_username in bots
        nmc[month][namespace]['bot_reverteds'] += reverted_username in bots
        nmc[month][namespace]['bot2bot_reverts'] += (reverting_username in bots
                                                     and reverted_username
                                                     in bots)

    for month in sorted(nmc.keys()):
        for page_namespace in sorted(nmc[month].keys()):
            counts = nmc[month][page_namespace]
            writer.write([
                month, page_namespace, counts['reverts'],
                counts['bot_reverts'], counts['bot_reverteds'],
                counts['bot2bot_reverts']
            ])
Example #15
def run(models, obs, output, workers, verbose):
    writer = mysqltsv.Writer(
        output, headers=['rev_id', 'i', 'sentence', 'length', 'productions'] +
                        [name for name, model in models])

    se_scorer = SentenceExtractorScorer(models)
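    # Each scored observation yields
    # (error, [(rev_id, i, sentence, length, scores), ...]).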
    with ProcessPoolExecutor(max_workers=workers) as executor:
        logger.info("Processing sentences...")
        for error, r_i_s_l_scores in executor.map(se_scorer.score, obs):
            if error is None:
                for rev_id, i, sent, slen, scores in r_i_s_l_scores:
                    writer.write([rev_id, i, sent, slen,
                                  scores[0]['productions']] +
                                 [s['log_proba'] for s in scores])
                    if verbose:
                        sys.stderr.write(".")
                        sys.stderr.flush()
            else:
                sys.stderr.write(str(error))
                sys.stderr.write('\n')

            if verbose:
                sys.stderr.flush()
Example #16
                        process_language(detect(
                            wikicode.strip_code().strip())))
                except:
                    pass

                languages = list(set(languages))
                detected_languages = list(set(detected_languages))

                yield page.id, wikitext_length, has_infobox, has_description_field, str(
                    languages), str(detected_languages)

    print("total files: " + str(number_of_files))


output = mysqltsv.Writer(
    open("data/sdoc/commonswiki_20171120_files_description.tsv", "w"),
    headers=[
        "page_id", "wikitext_length", "has_infobox", "has_description_field",
        "languages", "detected_languages"
    ])

for page_id, wikitext_length, has_infobox, has_description_field, languages, detected_languages in mwxml.map(
        process_dump, dump_files):
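    # mwxml.map runs process_dump over each dump file and yields its tuples.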
    output.write([
        page_id, wikitext_length, has_infobox, has_description_field,
        languages, detected_languages
    ])

# compress the tsv file
# tar -czvf commonswiki_20171120_files_description.tar.gz commonswiki_20171120_files_description.tsv
Example #17
import io
import sys

import mysqltsv

writer = mysqltsv.Writer(sys.stdout, headers=['user_id', 'user_text', 'edits'])
writer.write([10, 'Foobar_Barman', 2344])
writer.write({'user_text': 'Barfoo_Fooman', 'user_id': 11, 'edits': 20})
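# None is written as NULL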
writer.write([None, "127.0.0.1", 42])

my_file = io.StringIO("user_id\tuser_text\tedits\n" +
                      "10\tFoobar_Barman\t2344\n" + "11\tBarfoo_Fooman\t20\n" +
                      "NULL\t127.0.0.1\t42\n")
reader = mysqltsv.Reader(my_file, types=[int, str, int])
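# Rows support attribute, key, and positional access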
for row in reader:
    print(repr(row.user_id), repr(row['user_text']), repr(row[2]))
Example #18
def main():
    args = docopt.docopt(__doc__)

    HEADINGS = [
        "rev_id", "rev_timestamp", "rev_user", "rev_user_text", "rev_page",
        "rev_sha1", "rev_minor_edit", "rev_deleted", "rev_parent_id",
        "archived", "reverting_id", "reverting_timestamp", "reverting_user",
        "reverting_user_text", "reverting_page", "reverting_sha1",
        "reverting_minor_edit", "reverting_deleted", "reverting_parent_id",
        "reverting_archived", "reverting_comment", "rev_revert_offset",
        "revisions_reverted", "reverted_to_rev_id", "page_namespace"
    ]

    if args['--users']:
        users = {u.strip() for u in open(args['--users'])}
    else:
        users = None

    writer = mysqltsv.Writer(sys.stdout, headers=HEADINGS)

    for doc in (json.loads(line) for line in sys.stdin):
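        # Each doc describes one revert: the 'reverting' revision, the list
        # of 'reverteds', and the 'reverted_to' revision.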
        reverted_username = doc['reverteds'][-1].get('user', {}).get('text')
        reverting_username = doc['reverting'].get('user', {}).get('text')
        if reverted_username == reverting_username:
            continue
        if users is not None and \
           not (reverted_username in users and reverting_username in users):
            continue

        writer.write([
            doc['reverteds'][-1]['id'],  # rev_id
            Timestamp(doc['reverteds'][-1]
                      ['timestamp']).short_format(),  # rev_timestamp
            doc['reverteds'][-1].get('user', {}).get('id'),  # rev_user
            doc['reverteds'][-1].get('user', {}).get('text'),  # rev_user_text
            doc['reverteds'][-1]['page']['id'],  # rev_page
            doc['reverteds'][-1].get('sha1'),  # rev_sha1
            doc['reverteds'][-1]['minor'],  # rev_minor_edit
            doc['reverteds'][-1]['deleted']['text'],  # rev_deleted
            doc['reverteds'][-1].get('parent_id'),  # rev_parent_id
            False,  # archived
            doc['reverting']['id'],  # reverting_id
            Timestamp(doc['reverting']
                      ['timestamp']).short_format(),  # reverting_timestamp
            doc['reverting'].get('user', {}).get('id'),  # reverting_user
            doc['reverting'].get('user',
                                 {}).get('text'),  # reverting_user_text
            doc['reverting']['page']['id'],  #  reverting_page
            doc['reverting'].get('sha1'),  # reverting_sha1
            doc['reverting']['minor'],  # reverting_minor_edit
            doc['reverting']['deleted']['text'],  # reverting_deleted
            doc['reverting'].get('parent_id'),  # reverting_parent_id
            False,  # reverting_archived
            doc['reverting'].get('comment', '-'),  # reverting_comment
            len(doc['reverteds']),  # rev_revert_offset
            len(doc['reverteds']),  # revisions_reverted
            doc['reverted_to']['id'],  # reverted_to_rev_id
            doc['reverting']['page']['namespace']  # page_namespace
        ])
        sys.stderr.write(".")
        sys.stderr.flush()

    sys.stderr.write("\n")
Example #19
def main(argv=None):
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        level=logging.INFO if not args['--debug'] else logging.DEBUG,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    input_testing_file = mysqltsv.Reader(
        open(args['<input_testing>'], 'rt'),
        headers=True,
        types=[
            str, str, str, float, float, int, int, int, int, int, int, int,
            int, int, float, int, int, int, str, str, int
        ])

    input_anonymous_user_threshold_scores_file = mysqltsv.Reader(
        open(args['<input_anonymous_user_threshold_scores>'], 'rt'),
        headers=True,
        types=[
            str, str, float, float, int, int, int, int, int, int, int, int,
            int, float, int, int, int, float
        ])

    input_anonymous_user_threshold_scores_i2_file = mysqltsv.Reader(
        open(args['<input_anonymous_user_threshold_scores_i2>'], 'rt'),
        headers=True,
        types=[
            str, str, float, float, int, int, int, int, int, int, int, int,
            int, float, int, int, int, int, int, int, int, int, int, int, int,
            int, int, int, int, int, int, float
        ])

    anonymous_user_samples_output_file = mysqltsv.Writer(
        open(args['<anonymous_user_samples_output>'], "w"),
        headers=['session start timestamp', 'session completed timestamp',
                 'url', 'Consistent revision frequency', 
                 'Comment is "Updated item"', 
                 'Similar operations occur to different pages', 
                 'More than one claim edited per revision', 
                 'At least one rev. comment is prefixed by "bot" or "robot"', 
                 'Short session with rapid revisions', 'Not-obviously a bot'])


    anonymous_user_samples_i2_output_file = mysqltsv.Writer(
        open(args['<anonymous_user_samples_i2_output>'], "w"),
        headers=['session start timestamp', 'session completed timestamp',
                 'url', 'Consistent revision frequency', 
                 'Comment is "Updated item"', 
                 'Similar operations occur to different pages', 
                 'More than one claim edited per revision', 
                 'At least one rev. comment is prefixed by "bot" or "robot"', 
                 'Short session with rapid revisions', 'Not-obviously a bot'])

    testing_samples_output_file = mysqltsv.Writer(
        open(args['<testing_samples_output>'], "w"),
        headers=['session start timestamp', 'session completed timestamp',
                 'url', 'Consistent revision frequency', 
                 'Comment is "Updated item"', 
                 'Similar operations occur to different pages', 
                 'More than one claim edited per revision', 
                 'At least one rev. comment is prefixed by "bot" or "robot"', 
                 'Short session with rapid revisions', 'Not-obviously a bot'])

    verbose = args['--verbose']

    run(input_testing_file, input_anonymous_user_threshold_scores_file, 
        input_anonymous_user_threshold_scores_i2_file,
        anonymous_user_samples_output_file, 
        anonymous_user_samples_i2_output_file, testing_samples_output_file, 
        verbose)
Example #20
def main(argv=None):
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        level=logging.INFO if not args['--debug'] else logging.DEBUG,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    input_training_file = mysqltsv.Reader(
        open(args['<input_training>'], 'rt'),
        headers=True,
        types=[
            str, str, str, float, float, int, int, int, int, int, int, int,
            int, int, str, str, float, int, int, int, int, int, int, int, int,
            int, int, int, int, int, int, int, int, int
        ])

    input_testing_file = mysqltsv.Reader(
        open(args['<input_testing>'], 'rt'),
        headers=True,
        types=[
            str, str, str, float, float, int, int, int, int, int, int, int,
            int, int, str, str, float, int, int, int, int, int, int, int, int,
            int, int, int, int, int, int, int, int, int
        ])

    input_anonymous_data_file = mysqltsv.Reader(
        open(args['<input_anonymous_data>'], 'rt'),
        headers=True,
        types=[
            str, str, float, float, int, int, int, int, int, int, int, int,
            int, float, int, int, int, int, int, int, int, int, int, int, int,
            int, int, int, int, int, int
        ])

    r_forest_predictions_output_file = mysqltsv.Writer(
        open(args['<r_forest_predictions_output>'], "w"),
        headers=[
            'username', 'session_start', 'mean_in_seconds', 'std_in_seconds',
            'namespace_0_edits', 'namespace_1_edits', 'namespace_2_edits',
            'namespace_3_edits', 'namespace_4_edits', 'namespace_5_edits',
            'namespace_120_edits', 'namespace_121_edits', 'edits',
            'session_length_in_seconds', 'inter_edits_less_than_5_seconds',
            'inter_edits_between_5_and_20_seconds',
            'inter_edits_greater_than_20_seconds', 'bot_prediction'
        ])

    gradient_b_predictions_output_file = mysqltsv.Writer(
        open(args['<gradient_b_predictions_output>'], "w"),
        headers=[
            'username', 'session_start', 'mean_in_seconds', 'std_in_seconds',
            'namespace_0_edits', 'namespace_1_edits', 'namespace_2_edits',
            'namespace_3_edits', 'namespace_4_edits', 'namespace_5_edits',
            'namespace_120_edits', 'namespace_121_edits', 'edits',
            'session_length_in_seconds', 'inter_edits_less_than_5_seconds',
            'inter_edits_between_5_and_20_seconds',
            'inter_edits_greater_than_20_seconds', 'bot_prediction'
        ])

    gradient_b_predictions_i2_output_file = mysqltsv.Writer(
        open(args['<gradient_b_predictions_i2_output>'], "w"),
        headers=[
            'username', 'session_start', 'mean_in_seconds', 'std_in_seconds',
            'namespace_0_edits', 'namespace_1_edits', 'namespace_2_edits',
            'namespace_3_edits', 'namespace_4_edits', 'namespace_5_edits',
            'namespace_120_edits', 'namespace_121_edits', 'edits',
            'session_length_in_seconds', 'inter_edits_less_than_5_seconds',
            'inter_edits_between_5_and_20_seconds',
            'inter_edits_greater_than_20_seconds', 'claims', 'distinct_claims',
            'distinct_pages', 'disinct_edit_kinds', 'generic_bot_comment',
            'bot_revision_comment', 'sitelink_changes', 'alias_changed',
            'label_changed', 'description_changed', 'edit_war',
            'inter_edits_less_than_2_seconds', 'things_removed',
            'things_modified', 'bot_prediction'
        ])

    gradient_b_threshold_scores_output_file = mysqltsv.Writer(
        open(args['<gradient_b_threshold_scores_output>'], "w"),
        headers=[
            'username', 'session_start', 'mean_in_seconds', 'std_in_seconds',
            'namespace_0_edits', 'namespace_1_edits', 'namespace_2_edits',
            'namespace_3_edits', 'namespace_4_edits', 'namespace_5_edits',
            'namespace_120_edits', 'namespace_121_edits', 'edits',
            'session_length_in_seconds', 'inter_edits_less_than_5_seconds',
            'inter_edits_between_5_and_20_seconds',
            'inter_edits_greater_than_20_seconds', 'threshold_score'
        ])

    gradient_b_threshold_scores_i2_output_file = mysqltsv.Writer(
        open(args['<gradient_b_threshold_scores_i2_output>'], "w"),
        headers=[
            'username', 'session_start', 'mean_in_seconds', 'std_in_seconds',
            'namespace_0_edits', 'namespace_1_edits', 'namespace_2_edits',
            'namespace_3_edits', 'namespace_4_edits', 'namespace_5_edits',
            'namespace_120_edits', 'namespace_121_edits', 'edits',
            'session_length_in_seconds', 'inter_edits_less_than_5_seconds',
            'inter_edits_between_5_and_20_seconds',
            'inter_edits_greater_than_20_seconds', 'claims', 'distinct_claims',
            'distinct_pages', 'disinct_edit_kinds', 'generic_bot_comment',
            'bot_revision_comment', 'sitelink_changes', 'alias_changed',
            'label_changed', 'description_changed', 'edit_war',
            'inter_edits_less_than_2_seconds', 'things_removed',
            'things_modified', 'threshold_score'
        ])

    testing_output_file = mysqltsv.Writer(
        open(args['<testing_output>'], "w"),
        headers=[
            'user', 'username', 'session_start', 'mean_in_seconds',
            'std_in_seconds', 'namespace_0_edits', 'namespace_1_edits',
            'namespace_2_edits', 'namespace_3_edits', 'namespace_4_edits',
            'namespace_5_edits', 'namespace_120_edits', 'namespace_121_edits',
            'edits', 'session_length_in_seconds',
            'inter_edits_less_than_5_seconds',
            'inter_edits_between_5_and_20_seconds',
            'inter_edits_greater_than_20_seconds', 'bot', 'human',
            'bot_prediction'
        ])

    pr_output_file = mysqltsv.Writer(open(args['<pr_output>'], "w"),
                                     headers=['precision', 'recall'])

    roc_output_file = mysqltsv.Writer(
        open(args['<roc_output>'], "w"),
        headers=['false_positives', 'true_positives'])

    verbose = args['--verbose']

    run(input_training_file, input_testing_file, input_anonymous_data_file,
        r_forest_predictions_output_file, gradient_b_predictions_output_file,
        gradient_b_predictions_i2_output_file,
        gradient_b_threshold_scores_output_file,
        gradient_b_threshold_scores_i2_output_file, testing_output_file,
        pr_output_file, roc_output_file, verbose)