def main(argv=None):
    """
    Command-line entry point.

    Parses `argv` with docopt against the module docstring, configures
    logging, imports every requested dependent, resolves the input and
    output streams plus the number of extractors, and hands everything
    to `run()`.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt.docopt(__doc__, argv=argv)

    # --debug switches the log level from WARNING to DEBUG.
    log_level = logging.DEBUG if args['--debug'] else logging.WARNING
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    # An import path may yield one Dependent or an iterable of them;
    # wrap the single case so both are merged the same way.
    dependents = []
    for path in args['<dependent>']:
        imported = yamlconf.import_path(path)
        if isinstance(imported, Dependent):
            imported = [imported]
        dependents.extend(imported)

    input_arg = args['--input']
    source = sys.stdin if input_arg == "<stdin>" else open(input_arg)
    observations = read_observations(source)

    output_arg = args['--output']
    output = sys.stdout if output_arg == "<stdout>" else open(output_arg, 'w')

    extractors_arg = args['--extractors']
    if extractors_arg == "<cpu count>":
        extractors = cpu_count()
    else:
        extractors = int(extractors_arg)

    run(observations, dependents, output, extractors, args['--verbose'])
def main(argv=None):
    """
    Command-line entry point.

    Reads the docopt arguments, sets up logging, picks the observation
    source and output sink, determines the worker count and calls
    `run()`.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt.docopt(__doc__, argv=argv)

    debug = args['--debug']
    logging.basicConfig(
        level=logging.DEBUG if debug else logging.WARNING,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    # "<stdin>" / "<stdout>" are the sentinel values meaning
    # "use the standard stream instead of a file path".
    in_path = args['--input']
    in_stream = sys.stdin if in_path == "<stdin>" else open(in_path)
    obs = read_observations(in_stream)

    out_path = args['--output']
    output = sys.stdout if out_path == "<stdout>" else open(out_path, 'w')

    requested_workers = args['--workers']
    workers = (cpu_count() if requested_workers == "<cpu-count>"
               else int(requested_workers))

    run(obs, output, workers, args['--verbose'])
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, configures logging, opens the
    observation input and the output sink, builds an `mwapi.Session`
    for `--api-host` and calls `run()`.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt(__doc__, argv=argv)

    logging.basicConfig(
        level=logging.INFO if not args['--debug'] else logging.DEBUG,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    if args['--input'] == '<stdin>':
        observations = read_observations(sys.stdin)
    else:
        observations = read_observations(open(args['--input']))

    if args['--output'] == '<stdout>':
        output = sys.stdout
    else:
        # The output file must be opened for writing. The default mode of
        # open() is read-only, which fails for a not-yet-existing path and
        # rejects every write on an existing one.
        output = open(args['--output'], 'w')

    claims = args['--claim']

    session = mwapi.Session(args['--api-host'],
                            user_agent="ArticleQuality fetch_text utility.")

    verbose = args['--verbose']

    run(session, observations, claims, output, verbose)
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, configures logging, imports the feature
    list, optionally loads a model, resolves the input/output streams
    and calls `run()`.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt.docopt(__doc__, argv=argv)

    log_level = logging.DEBUG if args['--debug'] else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    # Put the working directory at the front of the module search path
    # so locally defined feature modules are found first.
    sys.path.insert(0, ".")
    features = yamlconf.import_module(args['<features>'])

    label_name = args['<label>']

    # The model is optional; without a path `run()` receives None.
    model_path = args['<model>']
    model = None if model_path is None else Model.load(open(model_path))

    additional_fields = args['<additional-field>']

    input_arg = args['--input']
    source = sys.stdin if input_arg == "<stdin>" else open(input_arg)
    observations = read_observations(source)

    output_arg = args['--output']
    output = sys.stdout if output_arg == "<stdout>" else open(output_arg, 'w')

    run(observations, output, features, label_name, model,
        additional_fields, args['--verbose'])
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, configures logging (raising the
    'requests' logger to WARNING), resolves the input/output streams
    and thread count, and calls `run()`.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt.docopt(__doc__, argv=argv)

    debug = args['--debug']
    logging.basicConfig(
        level=logging.DEBUG if debug else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')
    requests_logger = logging.getLogger('requests')
    requests_logger.setLevel(logging.WARNING)

    host = args['--host']
    try_deleted_first = args['--deleted-1st']

    in_path = args['--input']
    in_stream = sys.stdin if in_path == "<stdin>" else open(in_path)
    obs = read_observations(in_stream)

    out_path = args['--output']
    output = sys.stdout if out_path == "<stdout>" else open(out_path, 'w')

    # "<cpu-count>" is the sentinel for "one thread per CPU".
    threads_arg = args['--threads']
    if threads_arg == "<cpu-count>":
        threads = cpu_count()
    else:
        threads = int(threads_arg)

    run(host, obs, try_deleted_first, output, threads, args['--verbose'])
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, configures logging, collects the
    dependents named on the command line, resolves input, output and
    extractor count, and calls `run()`.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt.docopt(__doc__, argv=argv)

    verbosity = logging.DEBUG if args['--debug'] else logging.WARNING
    logging.basicConfig(
        level=verbosity,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    dependents = []
    for dependent_path in args['<dependent>']:
        loaded = yamlconf.import_path(dependent_path)
        if not isinstance(loaded, Dependent):
            # Anything that is not a single Dependent is treated as an
            # iterable of them.
            dependents.extend(loaded)
            continue
        dependents.append(loaded)

    input_is_stdin = args['--input'] == "<stdin>"
    in_stream = sys.stdin if input_is_stdin else open(args['--input'])
    observations = read_observations(in_stream)

    output_is_stdout = args['--output'] == "<stdout>"
    output = sys.stdout if output_is_stdout else open(args['--output'], 'w')

    use_all_cpus = args['--extractors'] == "<cpu count>"
    extractors = cpu_count() if use_all_cpus else int(args['--extractors'])

    verbose = args['--verbose']
    run(observations, dependents, output, extractors, verbose)
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, reads the labelings, groups them by
    'page_title' (each group ordered by 'timestamp'), resolves the thread
    count and the output sink, and calls `run()`.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt.docopt(__doc__, argv=argv)

    dump_paths = args['<dump-file>']

    if args['--labelings'] == "<stdin>":
        labelings = read_observations(sys.stdin)
    else:
        path = os.path.expanduser(args['--labelings'])
        labelings = read_observations(open(path))

    # Bucket explicitly instead of using itertools.groupby: groupby only
    # merges *adjacent* items, so with input that is not sorted by title a
    # repeated title would form several groups and the dict would keep
    # only the last one, silently dropping labelings.  For input that is
    # already sorted by title the result is unchanged.
    page_labelings = {}
    for labeling in labelings:
        page_labelings.setdefault(labeling['page_title'], []).append(labeling)
    for title_labelings in page_labelings.values():
        title_labelings.sort(key=lambda l: l['timestamp'])

    if args['--threads'] == "<cpu_count>":
        threads = cpu_count()
    else:
        threads = int(args['--threads'])

    if args['--output'] == "<stdout>":
        output = sys.stdout
    else:
        output = open(os.path.expanduser(args['--output']), "w")

    verbose = args['--verbose']

    run(dump_paths, page_labelings, output, threads, verbose=verbose)
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, configures logging, silences urllib3
    warnings, collects the 'rev_id' of every input observation and the
    numeric options, then calls `run()`.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt.docopt(__doc__, argv=argv)

    log_level = logging.DEBUG if args['--debug'] else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )
    # Suppress the warnings emitted by the urllib3 bundled with requests.
    requests.packages.urllib3.disable_warnings()

    ores_urls = args['<ores-url>']
    context = args['<context>']

    input_arg = args['--input']
    source = sys.stdin if input_arg == "<stdin>" else open(input_arg)
    rev_ids = []
    for ob in read_observations(source):
        rev_ids.append(ob['rev_id'])

    # No --model given means "no explicit model list".
    models = args['--model']
    if models is None:
        models = []

    batch_size = int(args['--batch-size'])
    delay = float(args['--delay'])

    run(ores_urls, context, models, rev_ids, batch_size, delay,
        args['--verbose'])
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, configures logging, loads one
    `SentenceScorer` per `<model>` path (paired with the file's base
    name), resolves input, output and worker count, and calls `run()`.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt.docopt(__doc__, argv=argv)

    debug = args['--debug']
    logging.basicConfig(
        level=logging.DEBUG if debug else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )
    logging.getLogger('requests').setLevel(logging.WARNING)

    logger.info("Loading models...")
    models = []
    for model_path in args['<model>']:
        model_name = os.path.basename(model_path)
        scorer = SentenceScorer.load(open(model_path))
        models.append((model_name, scorer))

    in_path = args['--input']
    in_stream = sys.stdin if in_path == "<stdin>" else open(in_path)
    obs = read_observations(in_stream)

    out_path = args['--output']
    output = sys.stdout if out_path == "<stdout>" else open(out_path, 'w')

    workers_arg = args['--workers']
    workers = cpu_count() if workers_arg == "<cpu-count>" else int(workers_arg)

    run(models, obs, output, workers, args['--verbose'])
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, reads the labelings, builds a mapping
    from 'page_title' to that page's labelings ordered by 'timestamp',
    resolves the thread count and the output sink, and calls `run()`.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt.docopt(__doc__, argv=argv)

    dump_paths = args['<dump-file>']

    if args['--labelings'] == "<stdin>":
        labelings = read_observations(sys.stdin)
    else:
        path = os.path.expanduser(args['--labelings'])
        labelings = read_observations(open(path))

    # itertools.groupby only merges adjacent items, so feeding it input
    # that is not pre-sorted by title yields several groups per title and
    # the dict keeps just the last one.  Accumulating per title avoids
    # that data loss and gives the same result for already-sorted input.
    page_labelings = {}
    for labeling in labelings:
        title = labeling['page_title']
        page_labelings.setdefault(title, []).append(labeling)
    for labs in page_labelings.values():
        labs.sort(key=lambda l: l['timestamp'])

    if args['--threads'] == "<cpu_count>":
        threads = cpu_count()
    else:
        threads = int(args['--threads'])

    if args['--output'] == "<stdout>":
        output = sys.stdout
    else:
        output = open(os.path.expanduser(args['--output']), "w")

    verbose = args['--verbose']

    run(dump_paths, page_labelings, output, threads, verbose=verbose)
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, configures logging, opens the
    observation input and the output sink, creates an `mwapi.Session`
    for `--api-host` and calls `run()`.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt(__doc__, argv=argv)

    logging.basicConfig(
        level=logging.INFO if not args['--debug'] else logging.DEBUG,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    if args['--input'] == '<stdin>':
        observations = read_observations(sys.stdin)
    else:
        observations = read_observations(open(args['--input']))

    if args['--output'] == '<stdout>':
        output = sys.stdout
    else:
        # Open for writing: open() defaults to read-only mode, which
        # cannot create the file and refuses writes.
        output = open(args['--output'], 'w')

    claims = args['--claim']

    session = mwapi.Session(args['--api-host'],
                            user_agent="WikiClass fetch_text utility.")

    verbose = args['--verbose']

    run(session, observations, claims, output, verbose)
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, resolves the labelings source and the
    output sink, creates an `mwapi.Session` for `--api-host` and calls
    `run()`.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt(__doc__, argv=argv)

    # '<stdin>' / '<stdout>' select the standard streams instead of files.
    labelings_arg = args['--labelings']
    source = sys.stdin if labelings_arg == '<stdin>' else open(labelings_arg)
    labelings = read_observations(source)

    output_arg = args['--output']
    output = sys.stdout if output_arg == '<stdout>' else open(output_arg, 'w')

    api_host = args['--api-host']
    session = mwapi.Session(api_host, user_agent=DRAFTTOPIC_UA)

    run(labelings, output, session, args['--verbose'])
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, resolves the labelings source and the
    output sink, creates an `mwapi.Session` for `--api-host` and calls
    `run()`.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt(__doc__, argv=argv)

    if args['--labelings'] == '<stdin>':
        labelings = read_observations(sys.stdin)
    else:
        labelings = read_observations(open(args['--labelings']))

    if args['--output'] == '<stdout>':
        output = sys.stdout
    else:
        # The sink has to be writable; open() without a mode is
        # read-only and would fail on a new path or on the first write.
        output = open(args['--output'], 'w')

    session = mwapi.Session(args['--api-host'],
                            user_agent="WikiClass fetch_text utility.")

    verbose = args['--verbose']

    run(labelings, output, session, verbose)
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, configures logging, resolves the
    observation source and output sink, and calls `write_labels()` with
    the `<label>` field name.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt(__doc__, argv=argv)

    log_level = logging.DEBUG if args['--debug'] else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    label_field = args['<label>']

    input_arg = args['--input']
    source = sys.stdin if input_arg == '<stdin>' else open(input_arg)
    observations = read_observations(source)

    output_arg = args['--output']
    output = sys.stdout if output_arg == '<stdout>' else open(output_arg, 'w')

    write_labels(observations, label_field, output)
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, resolves the labelings source and the
    output sink, creates an `mwapi.Session` for `--api-host` and calls
    `run()`.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt(__doc__, argv=argv)

    if args['--labelings'] == '<stdin>':
        labelings = read_observations(sys.stdin)
    else:
        labelings = read_observations(open(args['--labelings']))

    if args['--output'] == '<stdout>':
        output = sys.stdout
    else:
        # Open in write mode; the read-only default of open() cannot
        # create the output file and rejects writes.
        output = open(args['--output'], 'w')

    session = mwapi.Session(args['--api-host'],
                            user_agent="ArticleQuality fetch_text utility.")

    verbose = args['--verbose']

    run(labelings, output, session, verbose)
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, configures logging, resolves the
    observation source and output sink, creates an `mwapi.Session`,
    loads and inverts the mid-level WikiProjects JSON file, and calls
    `run()`.  When `--verbose` is set the elapsed wall-clock time of
    `run()` is logged afterwards.

    `argv` is forwarded to docopt unchanged (including `None`).

    Exits with status 1 if the `--mid_level_wp` file cannot be opened
    or parsed.
    """
    args = docopt(__doc__, argv=argv)

    logging.basicConfig(
        level=logging.INFO if not args['--debug'] else logging.DEBUG,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    if args['--input'] == '<stdin>':
        observations = read_observations(sys.stdin)
    else:
        observations = read_observations(open(args['--input']))

    if args['--output'] == '<stdout>':
        output = sys.stdout
    else:
        output = open(args['--output'], 'w')

    # Adjacent literals are concatenated by the parser.  The previous
    # backslash continuation inside the string embedded the source
    # indentation in the user agent.
    session = mwapi.Session(
        args['--api-host'],
        user_agent="WikiProjects "
                   "fetch_wikiprojects utility.")

    try:
        with open(args['--mid_level_wp']) as fwp:
            mid_level_wp = json.load(fwp)
    except (OSError, TypeError, ValueError):
        # OSError: unreadable/missing file; TypeError: option not given
        # (None path); ValueError: invalid JSON.  No debugger breakpoint
        # here: it would hang non-interactive runs.
        logger.error(
            "Failed to load mid-level wikiprojects file, check and run "
            "again")
        # Non-zero status so callers can detect the failure.
        sys.exit(1)

    mid_level_wp = invert_mid_level_projects(mid_level_wp)

    verbose = args['--verbose']

    start_time = datetime.now()
    run(session, observations, output, mid_level_wp, verbose)
    end_time = datetime.now()

    time_elapsed = end_time - start_time
    if verbose:
        logger.info('Time taken (hh:mm:ss.ms): {}'.format(time_elapsed))
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, configures logging, reads `<lang>` and
    the integer `-n` option, resolves the observation source and output
    sink, and calls `balance_sample()`.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt(__doc__, argv=argv)

    debug = args['--debug']
    logging.basicConfig(
        level=logging.DEBUG if debug else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    lang_code = args['<lang>']
    max_n = int(args['-n'])

    in_path = args['--input']
    in_stream = sys.stdin if in_path == '<stdin>' else open(in_path)
    observations = read_observations(in_stream)

    out_path = args['--output']
    output = sys.stdout if out_path == '<stdout>' else open(out_path, 'w')

    balance_sample(observations, lang_code, max_n, output)
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, configures logging, resolves the
    observation source and output sink, builds a config with
    `process_labels()` and writes it to the output as JSON indented by
    four spaces.  The output is closed afterwards.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt(__doc__, argv=argv)

    log_level = logging.DEBUG if args['--debug'] else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    input_arg = args['--input']
    source = sys.stdin if input_arg == '<stdin>' else open(input_arg)
    observations = read_observations(source)

    output_arg = args['--output']
    output = sys.stdout if output_arg == '<stdout>' else open(output_arg, 'w')

    config = process_labels(observations, args['<label>'])
    serialized = json.dumps(config, indent=4)
    output.write(serialized)
    output.close()
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, configures logging, reads observations
    from standard input, loads the `ScorerModel` stored at
    `<model-file>` and calls `run()`.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt.docopt(__doc__, argv=argv)

    log_level = logging.DEBUG if args['--debug'] else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    # Observations always come from stdin for this utility.
    observations = read_observations(sys.stdin)

    model_file = open(args['<model-file>'])
    model = ScorerModel.load(model_file)

    run(observations, model, args['--verbose'])
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, configures logging (raising the
    urllib3 connection-pool logger to WARNING), resolves the observation
    source and output sink, reads the integer `--threads` option,
    creates an `mwapi.Session` for `--api-host` and calls `run()`.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt(__doc__, argv=argv)

    debug = args['--debug']
    logging.basicConfig(
        level=logging.DEBUG if debug else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')
    pool_logger = logging.getLogger("urllib3.connectionpool")
    pool_logger.setLevel(logging.WARNING)

    in_path = args['--input']
    in_stream = sys.stdin if in_path == '<stdin>' else open(in_path)
    observations = read_observations(in_stream)

    out_path = args['--output']
    output = sys.stdout if out_path == '<stdout>' else open(out_path, 'w')

    threads = int(args['--threads'])

    api_host = args['--api-host']
    session = mwapi.Session(api_host, user_agent=DRAFTTOPIC_UA)

    run(observations, session, threads, output)
def main(argv=None):
    """
    Command-line entry point.

    Parses the docopt arguments, configures logging, reads observations
    from standard input, imports the feature list named by `<features>`
    and calls `run()`.

    `argv` is forwarded to docopt unchanged (including `None`).
    """
    args = docopt.docopt(__doc__, argv=argv)

    log_level = logging.DEBUG if args['--debug'] else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    observations = read_observations(sys.stdin)

    # The working directory goes first on the module search path so that
    # locally defined feature modules take precedence.
    sys.path.insert(0, ".")
    feature_module_path = args['<features>']
    features = yamlconf.import_module(feature_module_path)

    run(observations, features, args['<label>'], args['--verbose'])
# Extract features for every training revision.  Each successful
# extraction is serialized as a JSON observation line; the lines are then
# parsed back through read_observations() into trainingInfo.
trainingData = []
trainingInfo = []
for revTrainId in trainingRevId:
    revTrainId = int(revTrainId)
    try:
        trainingRevData = list(api_extractor.extract(revTrainId, features))
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C (KeyboardInterrupt) and
        # SystemExit still stop the loop; a failed extraction just skips
        # the revision.
        print('Revision Data Not Found')
        continue
    trainingObserv = {"rev_id": revTrainId, "cache": trainingRevData}
    trainingObserv = json.dumps(trainingObserv)
    trainingData.append(trainingObserv)
for trainings in read_observations(trainingData):
    trainingInfo.append(trainings)

# Same extraction for the test revisions; only the JSON lines are
# collected in this part of the script.
testData = []
testInfo = []
for revTestId in testRevId:
    revTestId = int(revTestId)
    try:
        testRevData = list(api_extractor.extract(revTestId, features))
    except Exception:
        # See the note on the training loop: skip revisions whose
        # extraction fails, but let interrupts propagate.
        print('Revision Data Not Found')
        continue
    testObserv = {"rev_id": revTestId, "cache": testRevData}
    testObserv = json.dumps(testObserv)
    testData.append(testObserv)
} except RuntimeError as e: sys.stderr.write(str(e)) else: print(observation) training_features.append(observation) print("Dump observations to file") from revscoring.utilities.util import dump_observation, read_observations with open("observations.json.bz2", "wt") as dumpfile: for observation in training_features: dump_observation(observation, dumpfile) with open("observations.json.bz2", "r") as dumpfile: training_features = list(read_observations(dumpfile)) from revscoring.scoring.models import GradientBoosting is_approved = GradientBoosting(features, labels=[True, False], version="Demo", learning_rate=0.01, max_features="log2", n_estimators=700, max_depth=5, population_rates={ False: 0.5, True: 0.5 }, scale=True,