Example #1
0
def main(argv=None):
    """Parse the command line, open the data files and delegate to ``run``.

    Args:
        argv: Optional list of command-line arguments. When ``None``,
            docopt parses ``sys.argv[1:]``.
    """
    import contextlib

    # Forward ``argv`` so callers (and tests) can supply their own arguments
    # instead of always parsing the process command line.
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    # The ExitStack closes every file opened below once ``run`` returns or
    # raises, so no handle is leaked and the output is flushed.
    with contextlib.ExitStack() as stack:

        def open_reader(arg_name):
            # Every input is read without a header row, as five str columns.
            handle = stack.enter_context(open(args[arg_name], 'rt'))
            return mysqltsv.Reader(handle, headers=False, types=[str] * 5)

        input_building_data_file = open_reader('<input_building_data>')
        input_i1_testing_data_file = open_reader('<input_i1_testing_data>')
        input_i2_testing_data_file = open_reader('<input_i2_testing_data>')

        output_file = mysqltsv.Writer(
            stack.enter_context(open(args['<output>'], "w")))

        verbose = args['--verbose']

        run(input_building_data_file, input_i1_testing_data_file,
            input_i2_testing_data_file, output_file, verbose)
def main(argv=None):
    """Parse the command line, open the input twice and delegate to ``run``.

    The same ``<input>`` path is opened through two independent readers so
    that ``run`` can iterate over the data in two separate passes.

    Args:
        argv: Optional list of command-line arguments. When ``None``,
            docopt parses ``sys.argv[1:]``.
    """
    import contextlib

    # Forward ``argv`` so callers (and tests) can supply their own arguments
    # instead of always parsing the process command line.
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    # 13 input columns: five str, one int, seven str.
    # NOTE(review): the int column presumably lines up with 'namespace' in the
    # output headers below -- confirm against the input schema.
    column_types = [str] * 5 + [int] + [str] * 7

    output_headers = [
        'title', 'rev_id', 'user', 'username',
        'comment', 'namespace', 'timestamp',
        'prev_timestamp', 'session_start',
        'session_end', 'session_index',
        'session_events', 'event_index',
    ]

    # The ExitStack closes every file opened below once ``run`` returns or
    # raises, so no handle is leaked and the output is flushed.
    with contextlib.ExitStack() as stack:

        def open_reader():
            handle = stack.enter_context(open(args['<input>'], 'rt'))
            # Each reader gets its own copy of the type list.
            return mysqltsv.Reader(handle, headers=True,
                                   types=list(column_types))

        input_file = open_reader()
        input_file_used_in_second_iteration = open_reader()

        output_file = mysqltsv.Writer(
            stack.enter_context(open(args['<output>'], "w")),
            headers=output_headers)

        verbose = args['--verbose']

        run(input_file, input_file_used_in_second_iteration, output_file,
            verbose)
def main(argv=None):
    """Parse the command line, open input/output files and call ``run``.

    Args:
        argv: Optional list of command-line arguments. When ``None``,
            docopt parses ``sys.argv[1:]``.
    """
    # Forward ``argv`` so callers (and tests) can supply their own arguments
    # instead of always parsing the process command line.
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    # 14 input columns: five str, one int, eight str.
    input_types = [str] * 5 + [int] + [str] * 8

    # NOTE(review): 'disinct_edit_kinds' looks like a typo for "distinct", but
    # it is an emitted column name, so it is kept as-is -- renaming it would
    # change the output file format.
    output_headers = [
        'user', 'username', 'session_start', 'mean_in_seconds',
        'std_in_seconds', 'namespace_0_edits', 'namespace_1_edits',
        'namespace_2_edits', 'namespace_3_edits', 'namespace_4_edits',
        'namespace_5_edits', 'namespace_120_edits', 'namespace_121_edits',
        'edits', 'bot', 'human', 'session_length_in_seconds',
        'inter_edits_less_than_5_seconds',
        'inter_edits_between_5_and_20_seconds',
        'inter_edits_greater_than_20_seconds', 'claims', 'distinct_claims',
        'distinct_pages', 'disinct_edit_kinds', 'generic_bot_comment',
        'bot_revision_comment', 'sitelink_changes', 'alias_changed',
        'label_changed', 'description_changed', 'edit_war',
        'inter_edits_less_than_2_seconds', 'things_removed',
        'things_modified',
    ]

    # Context managers guarantee both files are closed (and the output is
    # flushed) once ``run`` returns or raises. The reader is built before the
    # output file is opened, matching the original ordering.
    with open(args['<input>'], 'rt') as input_handle:
        input_file = mysqltsv.Reader(input_handle, headers=True,
                                     types=input_types)

        with open(args['<output>'], "w") as output_handle:
            output_file = mysqltsv.Writer(output_handle,
                                          headers=output_headers)

            verbose = args['--verbose']

            run(input_file, output_file, verbose)
Example #4
0
def main():
    """Read the CLI arguments and feed revisions from stdin to ``run``.

    The ``<ores>``, ``<context>`` and ``<model>`` arguments are passed through
    unchanged; revisions are read as TSV rows from standard input.
    """
    cli = docopt.docopt(__doc__)

    # Pull the three positional arguments before touching stdin.
    ores_url, context, model = (
        cli[name] for name in ('<ores>', '<context>', '<model>'))

    run(ores_url, context, model, mysqltsv.Reader(sys.stdin))
Example #5
0
def main(argv=None):
    """Parse the command line, set up readers/writers and call ``run``.

    Sources are read from the ``<source>`` paths when any are given, otherwise
    from standard input. Sessions are written to standard output when
    ``--sessions`` is the literal ``"<stdout>"``, otherwise to that path.
    An event writer is only created when ``--events`` is supplied.

    Args:
        argv: Optional list of command-line arguments, forwarded to docopt.
    """
    import contextlib

    args = docopt.docopt(__doc__, argv=argv)

    # The ExitStack closes every file *this function* opens once ``run``
    # returns or raises. stdin/stdout are deliberately not registered: they
    # belong to the process, not to us.
    with contextlib.ExitStack() as stack:
        if args['<source>']:
            sources = [
                mysqltsv.Reader(
                    stack.enter_context(open(path, errors='replace')),
                    error_handler=log_error)
                for path in args['<source>']
            ]
        else:
            # Re-wrap the raw stdin buffer so undecodable bytes are replaced
            # rather than raising.
            input_stream = io.TextIOWrapper(sys.stdin.buffer,
                                            encoding='utf-8',
                                            errors='replace')
            sources = [mysqltsv.Reader(input_stream, error_handler=log_error)]

        user_cols = args['--user']
        timestamp_col = args['--timestamp']
        cutoff = float(args['--cutoff'])

        if args['--sessions'] == "<stdout>":
            session_stream = sys.stdout
        else:
            session_stream = stack.enter_context(
                open(args['--sessions'], 'w'))
        session_writer = mysqltsv.Writer(session_stream,
                                         headers=user_cols + SESSION_SUFFIX)

        if args['--events'] is not None:
            # Event rows carry the first source's columns plus EVENT_SUFFIX.
            event_writer = mysqltsv.Writer(
                stack.enter_context(open(args['--events'], 'w')),
                headers=sources[0].headers + EVENT_SUFFIX)
        else:
            event_writer = None

        verbose = args['--verbose']
        debug = args['--debug']

        run(sources, cutoff, session_writer, event_writer, user_cols,
            timestamp_col, verbose, debug)
def main(argv=None):
    """Parse the command line, open all data files and delegate to ``run``.

    Three typed TSV readers (training, testing, anonymous data) and eight TSV
    writers (predictions, threshold scores, testing output, precision/recall
    and ROC points) are created and handed to ``run`` together with the
    ``--verbose`` flag.

    Args:
        argv: Optional list of command-line arguments. When ``None``,
            docopt parses ``sys.argv[1:]``.
    """
    import contextlib

    # Forward ``argv`` so callers (and tests) can supply their own arguments
    # instead of always parsing the process command line.
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    # Training/testing rows, 34 columns:
    # 3 str, 2 float, 9 int, 2 str, 1 float, 17 int.
    labelled_types = ([str] * 3 + [float] * 2 + [int] * 9 + [str] * 2 +
                      [float] + [int] * 17)
    # Anonymous rows, 31 columns: 2 str, 2 float, 9 int, 1 float, 17 int.
    anonymous_types = ([str] * 2 + [float] * 2 + [int] * 9 + [float] +
                       [int] * 17)

    # Columns shared by every prediction / threshold-score output.
    base_headers = [
        'username', 'session_start', 'mean_in_seconds', 'std_in_seconds',
        'namespace_0_edits', 'namespace_1_edits', 'namespace_2_edits',
        'namespace_3_edits', 'namespace_4_edits', 'namespace_5_edits',
        'namespace_120_edits', 'namespace_121_edits', 'edits',
        'session_length_in_seconds', 'inter_edits_less_than_5_seconds',
        'inter_edits_between_5_and_20_seconds',
        'inter_edits_greater_than_20_seconds',
    ]
    # Extra columns that only the "i2" outputs carry.
    # NOTE(review): 'disinct_edit_kinds' looks like a typo for "distinct", but
    # it is an emitted column name and is therefore kept unchanged.
    i2_headers = [
        'claims', 'distinct_claims',
        'distinct_pages', 'disinct_edit_kinds', 'generic_bot_comment',
        'bot_revision_comment', 'sitelink_changes', 'alias_changed',
        'label_changed', 'description_changed', 'edit_war',
        'inter_edits_less_than_2_seconds', 'things_removed',
        'things_modified',
    ]

    # The ExitStack closes every file opened below once ``run`` returns or
    # raises, so no handle is leaked and all outputs are flushed.
    with contextlib.ExitStack() as stack:

        def open_reader(arg_name, types):
            handle = stack.enter_context(open(args[arg_name], 'rt'))
            return mysqltsv.Reader(handle, headers=True, types=list(types))

        def open_writer(arg_name, headers):
            handle = stack.enter_context(open(args[arg_name], "w"))
            return mysqltsv.Writer(handle, headers=list(headers))

        input_training_file = open_reader('<input_training>', labelled_types)
        input_testing_file = open_reader('<input_testing>', labelled_types)
        input_anonymous_data_file = open_reader('<input_anonymous_data>',
                                                anonymous_types)

        r_forest_predictions_output_file = open_writer(
            '<r_forest_predictions_output>',
            base_headers + ['bot_prediction'])

        gradient_b_predictions_output_file = open_writer(
            '<gradient_b_predictions_output>',
            base_headers + ['bot_prediction'])

        gradient_b_predictions_i2_output_file = open_writer(
            '<gradient_b_predictions_i2_output>',
            base_headers + i2_headers + ['bot_prediction'])

        gradient_b_threshold_scores_output_file = open_writer(
            '<gradient_b_threshold_scores_output>',
            base_headers + ['threshold_score'])

        gradient_b_threshold_scores_i2_output_file = open_writer(
            '<gradient_b_threshold_scores_i2_output>',
            base_headers + i2_headers + ['threshold_score'])

        testing_output_file = open_writer(
            '<testing_output>',
            ['user'] + base_headers + ['bot', 'human', 'bot_prediction'])

        pr_output_file = open_writer('<pr_output>', ['precision', 'recall'])

        roc_output_file = open_writer(
            '<roc_output>', ['false_positives', 'true_positives'])

        verbose = args['--verbose']

        run(input_training_file, input_testing_file,
            input_anonymous_data_file, r_forest_predictions_output_file,
            gradient_b_predictions_output_file,
            gradient_b_predictions_i2_output_file,
            gradient_b_threshold_scores_output_file,
            gradient_b_threshold_scores_i2_output_file, testing_output_file,
            pr_output_file, roc_output_file, verbose)
def main(argv=None):
    """Parse the command line, open all sample files and delegate to ``run``.

    Three typed TSV readers (testing data and two anonymous-user threshold
    score files) and three TSV writers sharing one header layout are created
    and handed to ``run`` together with the ``--verbose`` flag.

    Args:
        argv: Optional list of command-line arguments. When ``None``,
            docopt parses ``sys.argv[1:]``.
    """
    import contextlib

    # Forward ``argv`` so callers (and tests) can supply their own arguments
    # instead of always parsing the process command line.
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    # Testing rows, 21 columns:
    # 3 str, 2 float, 9 int, 1 float, 3 int, 2 str, 1 int.
    testing_types = ([str] * 3 + [float] * 2 + [int] * 9 + [float] +
                     [int] * 3 + [str] * 2 + [int])
    # Threshold-score rows, 18 columns:
    # 2 str, 2 float, 9 int, 1 float, 3 int, 1 float.
    threshold_types = ([str] * 2 + [float] * 2 + [int] * 9 + [float] +
                       [int] * 3 + [float])
    # "i2" threshold-score rows, 32 columns:
    # 2 str, 2 float, 9 int, 1 float, 17 int, 1 float.
    threshold_i2_types = ([str] * 2 + [float] * 2 + [int] * 9 + [float] +
                          [int] * 17 + [float])

    # All three sample outputs use this same header row.
    sample_headers = [
        'session start timestamp', 'session completed timestamp',
        'url', 'Consistent revision frequency',
        'Comment is "Updated item"',
        'Similar operations occur to different pages',
        'More than one claim edited per revision',
        'At least one rev. comment is prefixed by "bot" or "robot"',
        'Short session with rapid revisions', 'Not-obviously a bot',
    ]

    # The ExitStack closes every file opened below once ``run`` returns or
    # raises, so no handle is leaked and all outputs are flushed.
    with contextlib.ExitStack() as stack:

        def open_reader(arg_name, types):
            handle = stack.enter_context(open(args[arg_name], 'rt'))
            return mysqltsv.Reader(handle, headers=True, types=types)

        def open_sample_writer(arg_name):
            handle = stack.enter_context(open(args[arg_name], "w"))
            # Each writer gets its own copy of the header list.
            return mysqltsv.Writer(handle, headers=list(sample_headers))

        input_testing_file = open_reader('<input_testing>', testing_types)

        input_anonymous_user_threshold_scores_file = open_reader(
            '<input_anonymous_user_threshold_scores>', threshold_types)

        input_anonymous_user_threshold_scores_i2_file = open_reader(
            '<input_anonymous_user_threshold_scores_i2>', threshold_i2_types)

        anonymous_user_samples_output_file = open_sample_writer(
            '<anonymous_user_samples_output>')

        anonymous_user_samples_i2_output_file = open_sample_writer(
            '<anonymous_user_samples_i2_output>')

        testing_samples_output_file = open_sample_writer(
            '<testing_samples_output>')

        verbose = args['--verbose']

        run(input_testing_file, input_anonymous_user_threshold_scores_file,
            input_anonymous_user_threshold_scores_i2_file,
            anonymous_user_samples_output_file,
            anonymous_user_samples_i2_output_file,
            testing_samples_output_file,
            verbose)
Example #8
0
import io
import sys

import mysqltsv

# --- Writing -------------------------------------------------------------
# Rows may be given positionally (list) or keyed by header name (dict).
writer = mysqltsv.Writer(sys.stdout, headers=['user_id', 'user_text', 'edits'])
for record in (
        [10, 'Foobar_Barman', 2344],
        {'user_text': 'Barfoo_Fooman', 'user_id': 11, 'edits': 20},
        [None, "127.0.0.1", 42],
):
    writer.write(record)

# --- Reading -------------------------------------------------------------
# An in-memory TSV with a header row followed by three data rows.
tsv_lines = [
    "user_id\tuser_text\tedits",
    "10\tFoobar_Barman\t2344",
    "11\tBarfoo_Fooman\t20",
    "NULL\t127.0.0.1\t42",
]
my_file = io.StringIO("\n".join(tsv_lines) + "\n")
reader = mysqltsv.Reader(my_file, types=[int, str, int])

# Each row is accessed three ways: by attribute, by header name, by index.
for row in reader:
    fields = (row.user_id, row['user_text'], row[2])
    print(*map(repr, fields))
Example #9
0
import sys

import mysqltsv

# Read TSV rows from standard input and print each row's values as they are
# consumed (rows are processed lazily, one at a time).
for rev in mysqltsv.Reader(sys.stdin):
    print(rev.values())