Example #1
0
 def inject_model(self):
     """Create a ``UrlModel`` from the validated config and upload it.

     The config object is obtained by passing ``self.model_path(CFG_FILE)``
     and ``CFG_SPEC`` to ``validate_config``. Its ``model_name`` entry names
     the new ``UrlModel``; its ``target_label_name`` entry must resolve via
     ``Label.by_name`` before anything is created.

     Raises:
         AssertionError: if ``Label.by_name`` returns a falsy value for the
             configured ``target_label_name``.
     """
     cfg_obj = validate_config(self.model_path(CFG_FILE), CFG_SPEC)
     model_name = cfg_obj['model_name']
     label_name = cfg_obj['target_label_name']
     # Explicit raise rather than ``assert`` so the check is not stripped
     # when Python runs with -O; AssertionError is kept for callers.
     if not Label.by_name(label_name):
         raise AssertionError('Label %s does not exist' % label_name)
     url_model = UrlModel.create(name=model_name)
     self.tar_and_upload(url_model)
     logger.info('URL model injected %s', url_model)
def process_page(page):
    """Run langid's language detection on webpage text and store the result.

    The language name is obtained from
    ``LanguageDetector.detect_language(page.title_and_text)`` and resolved to
    a label with ``Label.by_name``. Existing results of the single
    ``LanguageDetector`` row for this page are deleted before the new result
    is saved.

    Args:
        page: object exposing ``id`` and ``title_and_text``.

    Returns:
        The label object returned by ``Label.by_name`` for the detected
        language.

    Raises:
        AssertionError: if no label exists for the detected language name.
    """
    logger.info("Detecting language for page: %d", page.id)
    lang_name = LanguageDetector.detect_language(page.title_and_text)
    lang_label = Label.by_name(lang_name)
    # Explicit raise rather than ``assert`` so the check is not stripped
    # when Python runs with -O; exception type and message are unchanged.
    if lang_label is None:
        raise AssertionError("Label %s does not exist" % lang_name)

    det = LanguageDetector.query.one()
    # Drop this detector's earlier results for the page before saving the
    # fresh one.
    LanguageDetector.delete_detector_results(page, [det.id])
    det.save_result(page.id, lang_label.id)

    return lang_label
Example #3
0
 def inject_classifier(self, replace_old):
     """Create or replace a ``UrlClassifier`` from the validated config.

     The config object is obtained by passing ``self.model_path(CFG_FILE)``
     and ``CFG_SPEC`` to ``validate_config``. The classifier is looked up by
     its ``classifier_name`` entry. After the upload, ``updated_at`` is set
     to the current UTC time, the session is flushed and the configured
     target label is added to the classifier.

     Args:
         replace_old: when truthy, a classifier with the configured name
             must already exist and is reused; when falsy, no such
             classifier may exist and a new one is created.

     Raises:
         AssertionError: if the target label cannot be found, if
             ``replace_old`` is truthy but the classifier does not exist,
             or if ``replace_old`` is falsy but it already exists.
     """
     # TODO: This seems like it could be generalized for all classifiers
     cfg_obj = validate_config(self.model_path(CFG_FILE), CFG_SPEC)
     clf_name = cfg_obj['classifier_name']
     label_name = cfg_obj['target_label_name']
     label = Label.by_name(label_name)
     # The checks below raise explicitly rather than using ``assert`` so
     # they are not stripped when Python runs with -O; AssertionError is
     # kept so existing callers see the same exception type.
     if not label:
         raise AssertionError('Label %s does not exist' % label_name)
     clf = UrlClassifier.by_name(clf_name)
     if replace_old:
         if not clf:
             raise AssertionError(
                 'UrlClassifier with name %s does not exist!' % clf_name)
     else:
         if clf:
             raise AssertionError(
                 'UrlClassifier with name %s already exists!' % clf_name)
         # create the new classifier
         clf = UrlClassifier.create(name=clf_name)
     # note that failures above while running the script do not roll back
     # previously inserted models
     self.tar_and_upload(clf)
     clf.updated_at = datetime.utcnow()
     session.flush()
     clf.add_targets([label])
     logger.info('URL classifier injected %s', clf)