def process_args(args):
    """Turn a docopt argument dict into keyword arguments for transforming.

    Returns a dict with keys: 'transformer', 'include_criteria',
    'include_redirects', 'allowed_namespaces', 'allowed_content_models',
    and 'min_content_length'.
    """
    try:
        Transformer = yamlconf.import_path(args['<content-transformer>'])
    except ImportError:
        # Fall back to the transformers bundled with mwtext.
        Transformer = yamlconf.import_path(
            "mwtext.content_transformers." + args['<content-transformer>'])

    if args['--siteinfo'] is not None:
        # Use a context manager so the siteinfo file handle is closed
        # promptly (the original leaked it).
        with open(args['--siteinfo']) as siteinfo_f:
            siteinfo = json.load(siteinfo_f)['query']
    else:
        logger.info("Gathering siteinfo from {0}".format(args['--wiki-host']))
        session = mwapi.Session(
            args['--wiki-host'], user_agent="mwtext transform_content")
        siteinfo = get_siteinfo(session)

    kwarg_params = {}
    for kv in args['--param']:
        key, value = process_param(kv)
        kwarg_params[key] = value

    transformer = Transformer.from_siteinfo(siteinfo, **kwarg_params)

    if args['--include']:
        try:
            include_criteria = yamlconf.import_path(args['--include'])
        except ImportError:
            include_criteria = yamlconf.import_path(
                "mwtext.filter_functions." + args['--include'])
    else:
        include_criteria = all_pages_and_revisions

    include_redirects = bool(args['--include-redirects'])

    if len(args['--namespace']) == 0:
        allowed_namespaces = None
    else:
        allowed_namespaces = set(int(v) for v in args['--namespace'])

    if len(args['--content-model']) == 0:
        allowed_content_models = None
    else:
        # set() on the list directly; the generator wrapper was redundant.
        allowed_content_models = set(args['--content-model'])

    min_content_length = int(args['--min-content-length'])

    return {
        'transformer': transformer,
        'include_criteria': include_criteria,
        'include_redirects': include_redirects,
        'allowed_namespaces': allowed_namespaces,
        'allowed_content_models': allowed_content_models,
        'min_content_length': min_content_length}
def main(argv=None):
    """CLI entry point: solve a dependent's dependencies per observation."""
    args = docopt.docopt(__doc__, argv=argv)
    log_level = logging.DEBUG if args['--debug'] else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    dependent = yamlconf.import_path(args['<dependent>'])
    label_name = args['<label>']

    input_path = args['--input']
    observations = read_observations(
        sys.stdin if input_path == "<stdin>" else open(input_path))

    logger.info("Reading observations...")
    value_labels = []
    for ob in observations:
        values = list(solve(dependent.dependencies, cache=ob['cache']))
        value_labels.append((values, ob[label_name]))
    logger.debug(" -- {0} observations gathered".format(len(value_labels)))

    out_path = args['--datasource-file']
    datasource_f = sys.stdout if out_path == "<stdout>" else open(out_path, 'w')

    debug = args['--debug']
    run(dependent, label_name, value_labels, datasource_f, debug)
def main(argv=None):
    """CLI entry point: collect dependents and extract from observations."""
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.WARNING,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    # Each path may import either one Dependent or an iterable of them.
    dependents = []
    for path in args['<dependent>']:
        loaded = yamlconf.import_path(path)
        if isinstance(loaded, Dependent):
            dependents.append(loaded)
        else:
            dependents.extend(loaded)

    in_path = args['--input']
    observations = read_observations(
        sys.stdin if in_path == "<stdin>" else open(in_path))

    out_path = args['--output']
    output = sys.stdout if out_path == "<stdout>" else open(out_path, 'w')

    raw_extractors = args['--extractors']
    if raw_extractors == "<cpu count>":
        extractors = cpu_count()
    else:
        extractors = int(raw_extractors)

    verbose = args['--verbose']
    run(observations, dependents, output, extractors, verbose)
def main(argv=None):
    """CLI entry point: load dependents, then run extraction over input."""
    args = docopt.docopt(__doc__, argv=argv)
    debug = args['--debug']
    logging.basicConfig(
        level=logging.DEBUG if debug else logging.WARNING,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    dependents = []
    for dep_path in args['<dependent>']:
        imported = yamlconf.import_path(dep_path)
        # A path may resolve to a single Dependent or to a collection.
        if isinstance(imported, Dependent):
            dependents.append(imported)
        else:
            dependents.extend(imported)

    if args['--input'] == "<stdin>":
        source = sys.stdin
    else:
        source = open(args['--input'])
    observations = read_observations(source)

    if args['--output'] == "<stdout>":
        sink = sys.stdout
    else:
        sink = open(args['--output'], 'w')

    extractors = (cpu_count() if args['--extractors'] == "<cpu count>"
                  else int(args['--extractors']))

    run(observations, dependents, sink, extractors, args['--verbose'])
def main(argv=None):
    """CLI entry point: build a sentence-segmentation model from sentences."""
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    logger.info("Loading parser...")
    parser = yamlconf.import_path(args['<parser>'])

    min_freq = int(args['--min-freq'])
    verbose = args['--verbose']

    sentence_source = args['--sentences']
    if sentence_source == "<stdin>":
        sentences = read_sentences(sys.stdin, verbose)
    else:
        sentences = read_sentences(open(sentence_source), verbose)

    model_path = args['--ss-model']
    output = sys.stdout if model_path == "<stdout>" else open(model_path, "w")

    run(parser, min_freq, sentences, output, verbose)
def main(argv=None):
    """CLI entry point: gather (values, label) pairs and hand off to run()."""
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    dependent = yamlconf.import_path(args['<dependent>'])
    label_name = args['<label>']

    if args['--input'] == "<stdin>":
        obs_source = sys.stdin
    else:
        obs_source = open(args['--input'])
    observations = read_observations(obs_source)

    logger.info("Reading observations...")
    value_labels = [
        (list(solve(dependent.dependencies, cache=ob['cache'])),
         ob[label_name])
        for ob in observations]
    logger.debug(" -- {0} observations gathered".format(len(value_labels)))

    ds_path = args['--datasource-file']
    datasource_f = sys.stdout if ds_path == "<stdout>" else open(ds_path, 'w')

    run(dependent, label_name, value_labels, datasource_f, args['--debug'])
def main(argv=None):
    """CLI entry point: cross-validated hyperparameter search over features."""
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')
    # Quiet the (noisy) model-internal logger.
    logging.getLogger("revscoring.scoring.models").setLevel(logging.WARNING)

    params_config = yamlconf.load(open(args['<params-config>']))
    features_path = args['<features>']
    features = yamlconf.import_path(features_path)

    obs_arg = args['--observations']
    observations = read_observations(
        sys.stdin if obs_arg == "<stdin>" else open(obs_arg))

    logger.info("Reading feature values & labels...")
    label_name = args['<label>']
    value_labels = [
        (list(solve(features, cache=ob['cache'])), ob[label_name])
        for ob in observations]

    statistic_path = args['<statistic>']

    additional_params = {}
    labels, label_weights, population_rates = \
        util.read_labels_and_population_rates(
            None, args['--label-weight'], args['--pop-rate'])
    if label_weights is not None:
        additional_params['label_weights'] = label_weights
    if population_rates is not None:
        additional_params['population_rates'] = population_rates

    maximize = not args['--minimize']
    folds = int(args['--folds'])

    report_arg = args['--report']
    report = sys.stdout if report_arg == "<stdout>" else open(report_arg, "w")

    if args['--processes'] == "<cpu-count>":
        processes = multiprocessing.cpu_count()
    else:
        processes = int(args['--processes'])

    if args['--cv-timeout'] == "<forever>":
        cv_timeout = None
    else:
        cv_timeout = float(args['--cv-timeout']) * 60  # Convert to seconds

    verbose = args['--verbose']

    run(params_config, features, features_path, value_labels,
        statistic_path, additional_params, maximize, folds,
        report, processes, cv_timeout, verbose)
def main(argv=None):
    """CLI entry point: tune model params via cross-validated scoring."""
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    params_config = yamlconf.load(open(args['<params-config>']))
    features_path = args['<features>']
    features = yamlconf.import_path(features_path)

    obs_arg = args['--observations']
    observations = read_observations(
        sys.stdin if obs_arg == "<stdin>" else open(obs_arg))

    logger.info("Reading feature values & labels...")
    label_name = args['<label>']
    value_labels = [
        (list(solve(features, cache=ob['cache'])), ob[label_name])
        for ob in observations]

    # Get a sepecialized scorer if we have one
    scoring = metrics.SCORERS.get(args['--scoring'], args['--scoring'])

    folds = int(args['--folds'])

    report_arg = args['--report']
    report = sys.stdout if report_arg == "<stdout>" else open(report_arg, "w")

    if args['--processes'] == "<cpu-count>":
        processes = multiprocessing.cpu_count()
    else:
        processes = int(args['--processes'])

    if args['--cv-timeout'] == "<forever>":
        cv_timeout = None
    else:
        cv_timeout = float(args['--cv-timeout']) * 60  # Convert to seconds

    scale_features = args['--scale-features']
    verbose = args['--verbose']

    run(params_config, features_path, value_labels, scoring, folds,
        report, processes, cv_timeout, scale_features, verbose)
def main(argv=None):
    """CLI entry point: extract dependent values via the MediaWiki API."""
    args = docopt.docopt(__doc__, argv=argv)
    debug = args['--debug']
    logging.basicConfig(
        level=logging.DEBUG if debug else logging.WARNING,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    # Each <dependent> path may import a single Dependent or a list of them.
    dependents = []
    for dep_path in args['<dependent>']:
        imported = yamlconf.import_path(dep_path)
        if isinstance(imported, Dependent):
            dependents.append(imported)
        else:
            dependents.extend(imported)

    session = mwapi.Session(
        args['--host'], user_agent="Revscoring extract utility")
    if args['--login']:
        mwapi.cli.do_login(session, args['--host'])
    extractor = api.Extractor(session)

    in_arg = args['--input']
    observations = read_observations(
        sys.stdin if in_arg == "<stdin>" else open(in_arg))

    out_arg = args['--output']
    output = sys.stdout if out_arg == "<stdout>" else open(out_arg, 'w')

    extractors = (cpu_count() if args['--extractors'] == "<cpu count>"
                  else int(args['--extractors']))

    batch_size = int(args['--batch-size'])

    profile_f = (open(args['--profile'], 'w')
                 if args['--profile'] is not None else None)

    verbose = args['--verbose']

    run(observations, output, dependents, extractor, extractors,
        batch_size, profile_f, verbose, debug)
def extract_features(label_file, context):
    """Extract damaging/goodfaith feature values for labeled revisions.

    :param label_file: path to a file of JSON-encoded rev_ids (one per line)
    :param context: a wiki database name, e.g. "enwiki"
    :returns: whatever `extract()` returns for the gathered dependents
    """
    rev_ids = [json.loads(label) for label in load_labels(label_file)]
    # NOTE(review): str.replace removes *every* "wiki" substring, not just a
    # trailing one -- confirm this is safe for all contexts passed in.
    session = mwapi.Session(
        host="https://{0}.wikipedia.org".format(context.replace("wiki", "")),
        user_agent="Ores bias analysis project by Nate TeBlunthuis "
                   "<*****@*****.**>")

    dependent_names = [
        "editquality.feature_lists.{0}.damaging".format(context),
        "editquality.feature_lists.{0}.goodfaith".format(context)]

    # Each path may import a single Dependent or an iterable of them.
    dependents = []
    for dependent_path in dependent_names:
        dependent_or_list = yamlconf.import_path(dependent_path)
        if isinstance(dependent_or_list, Dependent):
            dependents.append(dependent_or_list)
        else:
            dependents.extend(dependent_or_list)

    extractor = api.Extractor(session)
    # Clamp to at least one worker: on a single-CPU machine,
    # os.cpu_count() - 1 would be zero (the original bug).
    n_extractors = max(1, os.cpu_count() - 1)
    features = extract(dependents, rev_ids, extractor,
                       extractors=n_extractors)
    return features
def main(argv=None):
    """CLI entry point: cross-validated hyperparameter search over features.

    Reads a params config, a feature list, and labeled observations;
    delegates the actual tuning to run().
    """
    args = docopt.docopt(__doc__, argv=argv)
    logging.basicConfig(
        level=logging.INFO if not args['--debug'] else logging.DEBUG,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')
    # Quiet the (noisy) model-internal logger.
    logging.getLogger("revscoring.scoring.models").setLevel(logging.WARNING)

    params_config = yamlconf.load(open(args['<params-config>']))
    features_path = args['<features>']
    features = yamlconf.import_path(features_path)

    if args['--observations'] == "<stdin>":
        observations = read_observations(sys.stdin)
    else:
        observations = read_observations(open(args['--observations']))

    logger.info("Reading feature values & labels...")
    label_name = args['<label>']
    value_labels = \
        [(list(solve(features, cache=ob['cache'])), ob[label_name])
         for ob in observations]

    statistic_path = args['<statistic>']

    additional_params = {}
    labels, label_weights, population_rates = \
        util.read_labels_and_population_rates(
            args['--labels'], args['--label-weight'], args['--pop-rate'],
            args['--labels-config'])
    if label_weights is not None:
        additional_params['label_weights'] = label_weights
    if population_rates is not None:
        additional_params['population_rates'] = population_rates
    if args['--center']:
        additional_params['center'] = args['--center']
    if args['--scale']:
        # BUG FIX: the original had a trailing comma here, which stored a
        # 1-tuple (value,) instead of the value itself.
        additional_params['scale'] = args['--scale']
    if args['--multilabel']:
        additional_params['multilabel'] = True

    maximize = not args['--minimize']
    folds = int(args['--folds'])

    if args['--report'] == "<stdout>":
        report = sys.stdout
    else:
        report = open(args['--report'], "w")

    if args['--processes'] == "<cpu-count>":
        processes = multiprocessing.cpu_count()
    else:
        processes = int(args['--processes'])

    if args['--cv-timeout'] == "<forever>":
        cv_timeout = None
    else:
        cv_timeout = float(args['--cv-timeout']) * 60  # Convert to seconds

    verbose = args['--verbose']

    run(params_config, features, labels, features_path, value_labels,
        statistic_path, additional_params, maximize, folds,
        report, processes, cv_timeout, verbose)