def load_models(self, draft_model_file, wp10_model_file):
    '''
    Load in the ORES models.
    '''
    self.draft_model = Model.load(open(draft_model_file, 'rb'))
    self.wp10_model = Model.load(open(wp10_model_file, 'rb'))
def from_config(cls, config, name, section_key="scoring_contexts"):
    """
    Expects:

        scoring_contexts:
            enwiki:
                scorer_models:
                    damaging: enwiki_damaging_2014
                    good-faith: enwiki_good-faith_2014
                extractor: enwiki
            ptwiki:
                scorer_models:
                    damaging: ptwiki_damaging_2014
                    good-faith: ptwiki_good-faith_2014
                extractor: ptwiki
        extractors:
            enwiki_api: ...
            ptwiki_api: ...
        scorer_models:
            enwiki_damaging_2014: ...
            enwiki_good-faith_2014: ...
    """
    logger.info("Loading {0} '{1}' from config.".format(cls.__name__, name))
    section = config[section_key][name]

    model_map = {}
    for model_name, key in section['scorer_models'].items():
        scorer_model = Model.from_config(config, key)
        model_map[model_name] = scorer_model

    extractor = Extractor.from_config(config, section['extractor'])

    return cls(name, model_map=model_map, extractor=extractor)
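# A minimal usage sketch for from_config above, assuming the classmethod
# lives on a ScoringContext-style class as suggested by the docstring below.
# The config file name and the "enwiki" context are illustrative, not from
# the source.
import yamlconf

config = yamlconf.load(open("ores.yaml"))
enwiki_context = ScoringContext.from_config(config, "enwiki")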
def map_from_config(cls, config, context_names, section_key="scoring_contexts"):
    """
    Loads a whole set of ScoringContexts from a configuration file while
    maintaining a cache of model names.  This aids memory management and
    allows model aliases to be implemented at the configuration level.

    :Returns:
        A map of context_names to ScoringContexts where each model is
        loaded once and reused across contexts.
    """
    model_key_map = {}
    context_map = {}
    for context_name in context_names:
        section = config[section_key][context_name]

        model_map = {}
        for model_name, key in section['scorer_models'].items():
            if key in model_key_map:
                scorer_model = model_key_map[key]
            else:
                scorer_model = Model.from_config(config, key)
                model_key_map[key] = scorer_model

            model_map[model_name] = scorer_model

        extractor = Extractor.from_config(config, section['extractor'])

        context_map[context_name] = cls(
            context_name, model_map=model_map, extractor=extractor)

    return context_map
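# Hedged sketch of map_from_config's caching behavior: if two contexts
# alias the same scorer_models key in the config, the underlying model is
# constructed once and the same object is shared between both contexts.
# Config file name and context names are illustrative.
import yamlconf

config = yamlconf.load(open("ores.yaml"))
contexts = ScoringContext.map_from_config(config, ["enwiki", "ptwiki"])
# contexts["enwiki"] and contexts["ptwiki"] now reuse any models whose
# config keys coincide, rather than loading a second copy into memory.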
def main(argv=None):
    args = docopt.docopt(__doc__, argv=argv)

    logging.basicConfig(
        level=logging.INFO if not args['--debug'] else logging.DEBUG,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s')

    sys.path.insert(0, ".")  # Search local directory first
    features = yamlconf.import_module(args['<features>'])
    label_name = args['<label>']

    if args['<model>'] is not None:
        model = Model.load(open(args['<model>']))
    else:
        model = None

    additional_fields = args['<additional-field>']

    if args['--input'] == "<stdin>":
        observations = read_observations(sys.stdin)
    else:
        observations = read_observations(open(args['--input']))

    if args['--output'] == "<stdout>":
        output = sys.stdout
    else:
        output = open(args['--output'], 'w')

    verbose = args['--verbose']

    run(observations, output, features, label_name, model,
        additional_fields, verbose)
def load(self):
    with open("enwiki.goodfaith.gradient_boosting.model") as f:
        self.model = Model.load(f)
    self.extractor = api.Extractor(
        mwapi.Session("https://en.wikipedia.org",
                      user_agent="KFServing revscoring demo"))
    self.ready = True
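# Hedged sketch of how the model and extractor loaded above might be used on
# the serving side.  The predict signature matches the KFServing convention,
# but the request/response shape here is an assumption, not from the source.
def predict(self, request):
    rev_id = request["instances"][0]["rev_id"]  # assumed request layout
    # Extract feature values for the revision, then score them.
    feature_values = list(self.extractor.extract(rev_id, self.model.features))
    return {"predictions": [self.model.score(feature_values)]}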
def main(argv=None):
    args = docopt.docopt(__doc__, argv=argv)

    scorer_model = Model.load(open(args['<model-file>'], 'rb'))

    if args['<text>'] == "<stdin>":
        text = sys.stdin.read()
    else:
        text = open(args['<text>']).read()

    print(score(scorer_model, text))
def main(argv=None):
    args = docopt.docopt(__doc__, argv=argv)

    logging.basicConfig(
        level=logging.DEBUG if args['--debug'] else logging.INFO,
        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
    )

    if args['--class-weight'] is not None:
        class_weights = dict(
            map(_parse_class_weight_option, args['--class-weight'])
        )
        global CLASS_WEIGHTS
        CLASS_WEIGHTS.update(class_weights)

    paths = args['<dump-file>']

    with open(args['--model']) as f:
        model = Model.load(f)

    sunset = mwtypes.Timestamp(args['--sunset'])

    if args['--score-at'] not in SCORE_ATS:
        raise ValueError("--score-at value {0} not available in {1}"
                         .format(args['--score-at'], SCORE_ATS))
    else:
        score_at = args['--score-at']

    if args['--rev-scores'] == "<stdout>":
        rev_scores = mysqltsv.Writer(sys.stdout, headers=HEADERS)
    else:
        rev_scores = mysqltsv.Writer(
            open(args['--rev-scores'], "w"), headers=HEADERS)

    if args['--extend'] is None:
        skip_scores_before = {}
    else:
        logger.info("Reading in past scores from {0}".format(args['--extend']))
        skip_scores_before = {}
        rows = mysqltsv.read(
            open(args['--extend']),
            types=[int, str, int, mwtypes.Timestamp, str, float])
        for row in rows:
            skip_scores_before[row.page_id] = row.timestamp
        logger.info("Completed reading scores from old output.")

    if args['--processes'] == "<cpu count>":
        processes = cpu_count()
    else:
        processes = int(args['--processes'])

    verbose = args['--verbose']

    run(paths, model, sunset, score_at, rev_scores, skip_scores_before,
        processes, verbose=verbose)
def load_model_and_queue(self, q, config, key):
    model = Model.from_config(config, key)
    # Just return the model info and the root of the features
    q.put((model.info, list(dig(model.features))))
def load(self, config, key):
    return Model.from_config(config, key)
def load_model_and_queue(self, q, config, key):
    model = Model.from_config(config, key)
    model.info = None  # We don't need info on the server-side
    q.put(model)
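# The two queue-based loaders above suggest a pattern where a child process
# builds the model and ships it back over a Queue.  A minimal standalone
# sketch of that pattern; the loader name, config file, and model key are
# assumptions for illustration.
from multiprocessing import Process, Queue

import yamlconf
from revscoring import Model


def load_model_into(q, config, key):
    q.put(Model.from_config(config, key))  # The Model must be picklable


if __name__ == "__main__":
    config = yamlconf.load(open("models.yaml"))  # hypothetical config file
    q = Queue()
    p = Process(target=load_model_into,
                args=(q, config, "enwiki_damaging_2014"))
    p.start()
    model = q.get()  # Blocks until the child has loaded the model
    p.join()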
from pprint import pprint

import articlequality
from revscoring import Model

scorer_model = Model.load(open(
    '../revscoring_models/enwiki.nettrom_wp10.gradient_boosting.model', 'rb'))

# Classifies a revision of an article based on wikitext alone
text = "An '''anachronism''' {{cite }}(from the [[Ancient Greek|Greek]] <ref ..."
prediction_results = articlequality.score(scorer_model, text)

# Print predicted assessment class and probabilities for all classes.
pprint(("assessment", prediction_results['prediction']))
pprint(("probs", prediction_results['probability']))
import bz2

import mwapi
from revscoring import Model
from revscoring.extractors import api

model = Model.load(
    bz2.open("models/ptwiki.draft_quality.gradient_boosting.model.bz2", "rb"))

extractor = api.Extractor(
    mwapi.Session(host="https://pt.wikipedia.org",
                  user_agent="draftquality test"))

values = extractor.extract(58071111, model.features)
print(model.score(values))
import mwapi
from revscoring import Model
from revscoring.extractors import api

with open("models/enwiki.damaging.linear_svc.model") as f:
    model = Model.load(f)

extractor = api.Extractor(
    mwapi.Session(host="https://en.wikipedia.org",
                  user_agent="revscoring demo"))

values = extractor.extract(123456789, model.features)
print(model.score(values))
import mwapi
import numpy as np
import pandas as pd
from revscoring import Model
from revscoring.errors import RevisionNotFound, TextDeleted
from revscoring.extractors.api.extractor import Extractor

# enwiki.goodfaith.gradient_boosting.model
# enwiki.damaging.gradient_boosting.model
with open("models/enwiki.damaging.gradient_boosting.model") as f:
    scorer_model = Model.load(f)

extractor = Extractor(mwapi.Session(host="https://en.wikipedia.org",
                                    user_agent="revscoring demo"))


def get_score(rev_id):
    feature_values = list(extractor.extract(rev_id, scorer_model.features))
    results = scorer_model.score(feature_values)
    return results


df = pd.read_csv("data.csv")
df["label_damage"] = ""
df["confidence_damage"] = ""

for i in range(len(df["rev_id"])):
    print(str(i) + "/" + str(len(df["rev_id"])))
    try:
        results = get_score(df["rev_id"][i])
        # Use .loc rather than chained indexing so the assignment actually
        # writes into the DataFrame.
        df.loc[i, "label_damage"] = results["prediction"]
        # The original snippet is truncated here; a plausible completion
        # records the model's confidence and skips unscorable revisions.
        df.loc[i, "confidence_damage"] = results["probability"][True]
    except (RevisionNotFound, TextDeleted):
        df.loc[i, "label_damage"] = np.nan
        df.loc[i, "confidence_damage"] = np.nan