コード例 #1
0
def process_args(args):
    """
    Convert parsed command-line arguments into the settings used for
    content transformation.

    :Parameters:
        args : `dict`
            Parsed arguments keyed by option name (e.g.
            ``'<content-transformer>'``, ``'--siteinfo'``, ``'--param'``).

    :Returns:
        `dict` with the keys ``'transformer'``, ``'include_criteria'``,
        ``'include_redirects'``, ``'allowed_namespaces'``,
        ``'allowed_content_models'`` and ``'min_content_length'``.
    """
    transformer_path = args['<content-transformer>']
    try:
        Transformer = yamlconf.import_path(transformer_path)
    except ImportError:
        # Retry with the name resolved inside mwtext.content_transformers.
        Transformer = yamlconf.import_path(
            "mwtext.content_transformers." + transformer_path)

    if args['--siteinfo'] is not None:
        # Context manager so the siteinfo file is closed as soon as it has
        # been parsed (the handle used to be leaked).
        with open(args['--siteinfo']) as siteinfo_f:
            siteinfo = json.load(siteinfo_f)['query']
    else:
        logger.info("Gathering siteinfo from {0}".format(args['--wiki-host']))
        session = mwapi.Session(args['--wiki-host'],
                                user_agent="mwtext transform_content")
        siteinfo = get_siteinfo(session)

    # Each --param entry is turned into one keyword argument for the
    # transformer constructor.
    kwarg_params = {}
    for kv in args['--param']:
        key, value = process_param(kv)
        kwarg_params[key] = value

    transformer = Transformer.from_siteinfo(siteinfo, **kwarg_params)

    if args['--include']:
        try:
            include_criteria = yamlconf.import_path(args['--include'])
        except ImportError:
            # Retry with the name resolved inside mwtext.filter_functions.
            include_criteria = yamlconf.import_path(
                "mwtext.filter_functions." + args['--include'])
    else:
        include_criteria = all_pages_and_revisions

    include_redirects = bool(args['--include-redirects'])

    # None is used when no --namespace / --content-model was supplied.
    if args['--namespace']:
        allowed_namespaces = {int(v) for v in args['--namespace']}
    else:
        allowed_namespaces = None

    if args['--content-model']:
        allowed_content_models = set(args['--content-model'])
    else:
        allowed_content_models = None

    min_content_length = int(args['--min-content-length'])

    return {
        'transformer': transformer,
        'include_criteria': include_criteria,
        'include_redirects': include_redirects,
        'allowed_namespaces': allowed_namespaces,
        'allowed_content_models': allowed_content_models,
        'min_content_length': min_content_length
    }
コード例 #2
0
def main(argv=None):
    """
    Command-line entry point: parse the arguments, read the observations
    and pass the solved values and labels to ``run()``.

    :Parameters:
        argv : `list` ( `str` ) | `None`
            Argument vector handed to docopt unchanged.
    """
    args = docopt.docopt(__doc__, argv=argv)

    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    dependent = yamlconf.import_path(args['<dependent>'])

    label_name = args['<label>']

    if args['--input'] == "<stdin>":
        input_f = sys.stdin
    else:
        input_f = open(args['--input'])

    try:
        observations = read_observations(input_f)

        logger.info("Reading observations...")
        value_labels = [(list(solve(dependent.dependencies,
                                    cache=ob['cache'])), ob[label_name])
                        for ob in observations]
    finally:
        # Everything needed has been copied into value_labels, so the input
        # can be released here.  stdin is never closed.
        if input_f is not sys.stdin:
            input_f.close()
    logger.debug(" -- {0} observations gathered".format(len(value_labels)))

    # The output handle is passed to run(); its lifetime is left to run().
    if args['--datasource-file'] == "<stdout>":
        datasource_f = sys.stdout
    else:
        datasource_f = open(args['--datasource-file'], 'w')

    debug = args['--debug']

    run(dependent, label_name, value_labels, datasource_f, debug)
コード例 #3
0
def main(argv=None):
    """
    Command-line entry point: parse the arguments, resolve the requested
    dependents, open the input/output streams and call ``run()``.

    :Parameters:
        argv : `list` ( `str` ) | `None`
            Argument vector handed to docopt unchanged.
    """
    args = docopt.docopt(__doc__, argv=argv)

    log_level = logging.DEBUG if args['--debug'] else logging.WARNING
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    # Each path imports either a single Dependent or an iterable of them.
    dependents = []
    for path in args['<dependent>']:
        imported = yamlconf.import_path(path)
        if not isinstance(imported, Dependent):
            dependents.extend(imported)
            continue
        dependents.append(imported)

    input_f = (sys.stdin if args['--input'] == "<stdin>"
               else open(args['--input']))
    observations = read_observations(input_f)

    output = (sys.stdout if args['--output'] == "<stdout>"
              else open(args['--output'], 'w'))

    extractors = (cpu_count() if args['--extractors'] == "<cpu count>"
                  else int(args['--extractors']))

    verbose = args['--verbose']

    run(observations, dependents, output, extractors, verbose)
コード例 #4
0
def main(argv=None):
    """
    Command-line entry point: parse the arguments, collect the dependents
    named on the command line, pick the input/output streams and the number
    of extractors, then call ``run()``.

    :Parameters:
        argv : `list` ( `str` ) | `None`
            Argument vector handed to docopt unchanged.
    """
    args = docopt.docopt(__doc__, argv=argv)

    level = logging.WARNING
    if args['--debug']:
        level = logging.DEBUG
    logging.basicConfig(
        level=level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    dependents = []
    for import_name in args['<dependent>']:
        loaded = yamlconf.import_path(import_name)
        # A lone Dependent is appended as-is; any other value is treated as
        # an iterable of dependents.
        if isinstance(loaded, Dependent):
            dependents.append(loaded)
        else:
            dependents.extend(loaded)

    # Default to the standard streams unless a path was given.
    source = sys.stdin
    if args['--input'] != "<stdin>":
        source = open(args['--input'])
    observations = read_observations(source)

    output = sys.stdout
    if args['--output'] != "<stdout>":
        output = open(args['--output'], 'w')

    if args['--extractors'] != "<cpu count>":
        extractors = int(args['--extractors'])
    else:
        extractors = cpu_count()

    verbose = args['--verbose']

    run(observations, dependents, output, extractors, verbose)
コード例 #5
0
ファイル: fit_scorer.py プロジェクト: FajneFarita/wikigrammar
def main(argv=None):
    """
    Command-line entry point: parse the arguments, import the parser, open
    the sentence input and model output streams and call ``run()``.

    :Parameters:
        argv : `list` ( `str` ) | `None`
            Argument vector handed to docopt unchanged.
    """
    args = docopt.docopt(__doc__, argv=argv)

    log_level = logging.DEBUG if args['--debug'] else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    logger.info("Loading parser...")
    parser = yamlconf.import_path(args['<parser>'])

    min_freq = int(args['--min-freq'])

    verbose = args['--verbose']

    sentences_f = (sys.stdin if args['--sentences'] == "<stdin>"
                   else open(args['--sentences']))
    sentences = read_sentences(sentences_f, verbose)

    output = (sys.stdout if args['--ss-model'] == "<stdout>"
              else open(args['--ss-model'], "w"))

    run(parser, min_freq, sentences, output, verbose)
コード例 #6
0
ファイル: fit.py プロジェクト: wiki-ai/revscoring
def main(argv=None):
    """
    Command-line entry point: parse the arguments, read the observations
    and pass the solved values and labels to ``run()``.

    :Parameters:
        argv : `list` ( `str` ) | `None`
            Argument vector handed to docopt unchanged.
    """
    args = docopt.docopt(__doc__, argv=argv)

    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    dependent = yamlconf.import_path(args['<dependent>'])

    label_name = args['<label>']

    if args['--input'] == "<stdin>":
        input_f = sys.stdin
    else:
        input_f = open(args['--input'])

    try:
        observations = read_observations(input_f)

        logger.info("Reading observations...")
        value_labels = [
            (list(solve(dependent.dependencies, cache=ob['cache'])),
             ob[label_name])
            for ob in observations]
    finally:
        # The observations are fully materialised in value_labels at this
        # point, so the input file can be closed.  stdin is left open.
        if input_f is not sys.stdin:
            input_f.close()
    logger.debug(" -- {0} observations gathered".format(len(value_labels)))

    # The output handle is passed to run(); its lifetime is left to run().
    if args['--datasource-file'] == "<stdout>":
        datasource_f = sys.stdout
    else:
        datasource_f = open(args['--datasource-file'], 'w')

    debug = args['--debug']

    run(dependent, label_name, value_labels, datasource_f, debug)
コード例 #7
0
def main(argv=None):
    """
    Command-line entry point: parse the arguments, load the parameter
    configuration, features and observations, then call ``run()``.

    :Parameters:
        argv : `list` ( `str` ) | `None`
            Argument vector handed to docopt unchanged.
    """
    args = docopt.docopt(__doc__, argv=argv)

    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')
    logging.getLogger("revscoring.scoring.models").setLevel(logging.WARNING)

    # Close the config file once it has been loaded instead of leaking it.
    with open(args['<params-config>']) as params_f:
        params_config = yamlconf.load(params_f)

    features_path = args['<features>']
    features = yamlconf.import_path(features_path)

    if args['--observations'] == "<stdin>":
        observations_f = sys.stdin
    else:
        observations_f = open(args['--observations'])

    try:
        observations = read_observations(observations_f)

        logger.info("Reading feature values & labels...")
        label_name = args['<label>']
        value_labels = [
            (list(solve(features, cache=ob['cache'])), ob[label_name])
            for ob in observations]
    finally:
        # value_labels now holds everything read from the input, so the
        # file can be released.  stdin is never closed.
        if observations_f is not sys.stdin:
            observations_f.close()

    statistic_path = args['<statistic>']
    additional_params = {}

    # The first returned item (labels) is not used by this utility.
    _, label_weights, population_rates = \
        util.read_labels_and_population_rates(
            None, args['--label-weight'], args['--pop-rate'])
    if label_weights is not None:
        additional_params['label_weights'] = label_weights
    if population_rates is not None:
        additional_params['population_rates'] = population_rates

    maximize = not args['--minimize']

    folds = int(args['--folds'])

    # The report handle is passed to run(); its lifetime is left to run().
    if args['--report'] == "<stdout>":
        report = sys.stdout
    else:
        report = open(args['--report'], "w")

    if args['--processes'] == "<cpu-count>":
        processes = multiprocessing.cpu_count()
    else:
        processes = int(args['--processes'])

    if args['--cv-timeout'] == "<forever>":
        cv_timeout = None
    else:
        cv_timeout = float(args['--cv-timeout']) * 60  # Convert to seconds

    verbose = args['--verbose']

    run(params_config, features, features_path, value_labels, statistic_path,
        additional_params, maximize, folds, report, processes, cv_timeout,
        verbose)
コード例 #8
0
def main(argv=None):
    """
    Command-line entry point: parse the arguments, load the parameter
    configuration, features and observations, then call ``run()``.

    :Parameters:
        argv : `list` ( `str` ) | `None`
            Argument vector handed to docopt unchanged.
    """
    args = docopt.docopt(__doc__, argv=argv)

    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    # Close the config file once it has been loaded instead of leaking it.
    with open(args['<params-config>']) as params_f:
        params_config = yamlconf.load(params_f)

    features_path = args['<features>']
    features = yamlconf.import_path(features_path)

    if args['--observations'] == "<stdin>":
        observations_f = sys.stdin
    else:
        observations_f = open(args['--observations'])

    try:
        observations = read_observations(observations_f)

        logger.info("Reading feature values & labels...")
        label_name = args['<label>']
        value_labels = [
            (list(solve(features, cache=ob['cache'])), ob[label_name])
            for ob in observations]
    finally:
        # value_labels now holds everything read from the input, so the
        # file can be released.  stdin is never closed.
        if observations_f is not sys.stdin:
            observations_f.close()

    # Get a specialized scorer if we have one; otherwise the raw option
    # value is passed through.
    scoring = metrics.SCORERS.get(args['--scoring'], args['--scoring'])

    folds = int(args['--folds'])

    # The report handle is passed to run(); its lifetime is left to run().
    if args['--report'] == "<stdout>":
        report = sys.stdout
    else:
        report = open(args['--report'], "w")

    if args['--processes'] == "<cpu-count>":
        processes = multiprocessing.cpu_count()
    else:
        processes = int(args['--processes'])

    if args['--cv-timeout'] == "<forever>":
        cv_timeout = None
    else:
        cv_timeout = float(args['--cv-timeout']) * 60  # Convert to seconds

    scale_features = args['--scale-features']
    verbose = args['--verbose']

    run(params_config, features_path, value_labels, scoring, folds,
        report, processes, cv_timeout, scale_features, verbose)
コード例 #9
0
def main(argv=None):
    """
    Command-line entry point: parse the arguments, resolve the dependents,
    build an API extractor, choose the streams and sizing options and call
    ``run()``.

    :Parameters:
        argv : `list` ( `str` ) | `None`
            Argument vector handed to docopt unchanged.
    """
    args = docopt.docopt(__doc__, argv=argv)

    log_level = logging.DEBUG if args['--debug'] else logging.WARNING
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    # Each path imports either a single Dependent or an iterable of them.
    dependents = []
    for path in args['<dependent>']:
        imported = yamlconf.import_path(path)
        if not isinstance(imported, Dependent):
            dependents.extend(imported)
            continue
        dependents.append(imported)

    host = args['--host']
    session = mwapi.Session(host, user_agent="Revscoring extract utility")
    if args['--login']:
        mwapi.cli.do_login(session, host)
    extractor = api.Extractor(session)

    input_f = (sys.stdin if args['--input'] == "<stdin>"
               else open(args['--input']))
    observations = read_observations(input_f)

    output = (sys.stdout if args['--output'] == "<stdout>"
              else open(args['--output'], 'w'))

    extractors = (cpu_count() if args['--extractors'] == "<cpu count>"
                  else int(args['--extractors']))

    batch_size = int(args['--batch-size'])

    # No profile file is opened unless --profile was given.
    profile_f = None
    if args['--profile'] is not None:
        profile_f = open(args['--profile'], 'w')

    verbose = args['--verbose']
    debug = args['--debug']

    run(observations, output, dependents, extractor, extractors, batch_size,
        profile_f, verbose, debug)
コード例 #10
0
ファイル: extract.py プロジェクト: wiki-ai/revscoring
def main(argv=None):
    """
    Command-line entry point for the extract utility: parse the arguments,
    gather the dependents, create the API-backed extractor, select the
    input/output/profile streams and call ``run()``.

    :Parameters:
        argv : `list` ( `str` ) | `None`
            Argument vector handed to docopt unchanged.
    """
    args = docopt.docopt(__doc__, argv=argv)

    level = logging.WARNING
    if args['--debug']:
        level = logging.DEBUG
    logging.basicConfig(
        level=level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    dependents = []
    for import_name in args['<dependent>']:
        loaded = yamlconf.import_path(import_name)
        # A lone Dependent is appended as-is; any other value is treated as
        # an iterable of dependents.
        if isinstance(loaded, Dependent):
            dependents.append(loaded)
        else:
            dependents.extend(loaded)

    session = mwapi.Session(
        args['--host'], user_agent="Revscoring extract utility")
    if args['--login']:
        mwapi.cli.do_login(session, args['--host'])
    extractor = api.Extractor(session)

    # Default to the standard streams unless a path was given.
    source = sys.stdin
    if args['--input'] != "<stdin>":
        source = open(args['--input'])
    observations = read_observations(source)

    output = sys.stdout
    if args['--output'] != "<stdout>":
        output = open(args['--output'], 'w')

    if args['--extractors'] != "<cpu count>":
        extractors = int(args['--extractors'])
    else:
        extractors = cpu_count()

    batch_size = int(args['--batch-size'])

    profile_path = args['--profile']
    profile_f = open(profile_path, 'w') if profile_path is not None else None

    verbose = args['--verbose']
    debug = args['--debug']

    run(observations, output, dependents, extractor, extractors, batch_size,
        profile_f, verbose, debug)
コード例 #11
0
def extract_features(label_file, context):
    """
    Extract the damaging and goodfaith feature lists for a set of labels.

    :Parameters:
        label_file
            Passed to ``load_labels()``; every item it yields is parsed as
            JSON.
        context : `str`
            Name used to build both the feature-list import paths and the
            Wikipedia host (the substring ``"wiki"`` is removed to form the
            subdomain).

    :Returns:
        Whatever ``extract()`` returns for the parsed labels.
    """
    rev_ids = [json.loads(label) for label in load_labels(label_file)]

    session = mwapi.Session(
        host="https://{0}.wikipedia.org".format(context.replace("wiki", "")),
        user_agent="Ores bias analysis project by Nate TeBlunthuis <*****@*****.**>")

    dependent_names = [
        "editquality.feature_lists.{0}.damaging".format(context),
        "editquality.feature_lists.{0}.goodfaith".format(context)]
    dependents = []
    for dependent_path in dependent_names:
        dependent_or_list = yamlconf.import_path(dependent_path)
        # A single Dependent is appended; anything else is treated as an
        # iterable of dependents.
        if isinstance(dependent_or_list, Dependent):
            dependents.append(dependent_or_list)
        else:
            dependents.extend(dependent_or_list)

    extractor = api.Extractor(session)

    # Leave one CPU free, but never ask for fewer than one extractor:
    # os.cpu_count() may return None, and on a single-CPU host the plain
    # subtraction would yield 0.
    available_cpus = os.cpu_count() or 1
    extractors = max(1, available_cpus - 1)

    features = extract(dependents, rev_ids, extractor, extractors=extractors)
    return features
コード例 #12
0
ファイル: tune.py プロジェクト: wiki-ai/revscoring
def main(argv=None):
    """
    Command-line entry point for the tune utility: parse the arguments,
    load the parameter configuration, features, observations and label
    settings, then call ``run()``.

    :Parameters:
        argv : `list` ( `str` ) | `None`
            Argument vector handed to docopt unchanged.
    """
    args = docopt.docopt(__doc__, argv=argv)

    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )
    logging.getLogger("revscoring.scoring.models").setLevel(logging.WARNING)

    # Close the config file once it has been loaded instead of leaking it.
    with open(args['<params-config>']) as params_f:
        params_config = yamlconf.load(params_f)

    features_path = args['<features>']
    features = yamlconf.import_path(features_path)

    if args['--observations'] == "<stdin>":
        observations_f = sys.stdin
    else:
        observations_f = open(args['--observations'])

    try:
        observations = read_observations(observations_f)

        logger.info("Reading feature values & labels...")
        label_name = args['<label>']
        value_labels = [
            (list(solve(features, cache=ob['cache'])), ob[label_name])
            for ob in observations]
    finally:
        # value_labels now holds everything read from the input, so the
        # file can be released.  stdin is never closed.
        if observations_f is not sys.stdin:
            observations_f.close()

    statistic_path = args['<statistic>']
    additional_params = {}

    labels, label_weights, population_rates = \
        util.read_labels_and_population_rates(
            args['--labels'], args['--label-weight'], args['--pop-rate'],
            args['--labels-config'])
    if label_weights is not None:
        additional_params['label_weights'] = label_weights
    if population_rates is not None:
        additional_params['population_rates'] = population_rates

    if args['--center']:
        additional_params['center'] = args['--center']
    if args['--scale']:
        # Store the option value itself; a stray trailing comma used to wrap
        # it in a 1-tuple.
        additional_params['scale'] = args['--scale']

    if args['--multilabel']:
        additional_params['multilabel'] = True

    maximize = not args['--minimize']

    folds = int(args['--folds'])

    # The report handle is passed to run(); its lifetime is left to run().
    if args['--report'] == "<stdout>":
        report = sys.stdout
    else:
        report = open(args['--report'], "w")

    if args['--processes'] == "<cpu-count>":
        processes = multiprocessing.cpu_count()
    else:
        processes = int(args['--processes'])

    if args['--cv-timeout'] == "<forever>":
        cv_timeout = None
    else:
        cv_timeout = float(args['--cv-timeout']) * 60  # Convert to seconds

    verbose = args['--verbose']

    run(params_config, features, labels, features_path, value_labels,
        statistic_path, additional_params, maximize, folds, report,
        processes, cv_timeout, verbose)