class SuggestCategoriesView(formbase.PageForm):
    """Suggest categories to the user and let them apply a selection.

    The offered choices are rebuilt from the content classifier's result for
    the current context every time ``form_fields`` is read.
    """
    implements(IPlonePageForm)

    label = _(u"Suggested categories")
    description = _(u"Choose among the proposed subjects. Clicking on apply "
                    "will add the chosen categories to the existing ones.")

    def getSuggestedSubjects(self):
        """Return ``probabilityClassify(uid)`` of the classifier utility.

        ``uid`` is the ``UID`` attribute of the context adapted to
        ``IClassifiable``.
        """
        classifier = getUtility(IContentClassifier)
        uid = IClassifiable(self.context).UID
        return classifier.probabilityClassify(uid)

    @property
    def form_fields(self):
        """Build the form fields with one choice per suggested subject.

        When the classifier yields nothing, an error status message is
        queued, a redirect to the context URL is issued and an empty list is
        returned instead of form fields.
        """
        ff = form.Fields(ISuggestCategories)
        suggestions = self.getSuggestedSubjects()
        if not suggestions:
            url = getMultiAdapter((self.context, self.request),
                                  name='absolute_url')()
            IStatusMessage(self.request).addStatusMessage(
                _(u"Classifier has not been trained or has "
                  "not sufficient information."),
                type="error")
            self.request.response.redirect(url)
            return []

        # Most probable subjects first; ties fall back to the subject name
        # because the tuples are compared element by element.
        subject_prob_list = sorted(
            [(suggestions.prob(subject), subject)
             for subject in suggestions.samples()],
            reverse=True)
        vocab_terms = [
            SimpleTerm(value=subject,
                       token=b64encode(subject),
                       title="%s %2.1f%%" % (subject, probability * 100))
            for (probability, subject) in subject_prob_list]
        choice = schema.Choice(vocabulary=SimpleVocabulary(vocab_terms))
        # NOTE(review): this assigns value_type on the field reached through
        # form.Fields(ISuggestCategories); if that is the interface's own
        # field instance the change is shared between requests -- confirm.
        ff['suggestions'].field.value_type = choice
        return ff

    @form.action(_(u"Apply"))
    def action_submit(self, action, data):
        """Add the chosen suggestions to the context's categories.

        Subjects already present are not added twice. Afterwards an info
        status message is queued and the response is redirected to the
        context URL. Returns an empty string.
        """
        obj = IClassifiable(self.context)
        # Work on a copy: appending to the adapter's own return value would
        # fail if it is a tuple and would mutate it in place if it is a list.
        subjects = list(obj.categories)
        for subject in data['suggestions']:
            if subject not in subjects:
                subjects.append(subject)
        obj.categories = subjects
        url = getMultiAdapter((self.context, self.request),
                              name='absolute_url')()
        IStatusMessage(self.request).addStatusMessage(
            _(u"Categories saved."), type="info")
        self.request.response.redirect(url)
        return ''
class ISuggestCategories(Interface):
    """Schema of the suggested-categories form: one list field."""

    suggestions = schema.List(
        title=_(u"Suggestions"),
        description=_(u""),
        default=[],
    )
class IClassifierSettingsSchema(Interface):
    """Schema of the classifier settings: a single boolean switch."""

    train_after_update = schema.Bool(
        description=_(
            u"Enabling this will trigger training the classifier "
            "every time tagged content is added, modified or deleted. "
            "Disabling it means you will have to periodically manually "
            "retrain the classifier."
        ),
        title=_(u"Train after update"),
    )
def form_fields(self):
    """Return the form fields with one choice per suggested subject.

    If ``getSuggestedSubjects()`` yields nothing, an error status message is
    queued, the response is redirected to the context URL and an empty list
    is returned.
    """
    fields = form.Fields(ISuggestCategories)
    distribution = self.getSuggestedSubjects()
    if not distribution:
        context_url = getMultiAdapter(
            (self.context, self.request), name='absolute_url')()
        messages = IStatusMessage(self.request)
        messages.addStatusMessage(
            _(u"Classifier has not been trained or has "
              "not sufficient information."),
            type="error")
        self.request.response.redirect(context_url)
        return []

    # Highest probability first.
    ranked = sorted(
        ((distribution.prob(name), name) for name in distribution.samples()),
        reverse=True)
    terms = [
        SimpleTerm(
            value=name,
            token=b64encode(name),
            title="%s %2.1f%%" % (name, prob * 100))
        for prob, name in ranked
    ]
    fields['suggestions'].field.value_type = schema.Choice(
        vocabulary=SimpleVocabulary(terms))
    return fields
class ClassifierSettings(ControlPanelForm):
    """Control panel form with save, retrain, statistics and cancel actions."""

    form_fields = form.FormFields(IClassifierSettingsSchema)

    label = _("Classification settings")
    description = _("Settings for collective.classification.")
    form_name = _("Classification settings")

    @form.action(_(u"Save"))
    def save_action(self, action, data):
        """Apply the submitted values and set the status message."""
        form.applyChanges(
            self.context, self.form_fields, data, self.adapters)
        self.status = _(u"Changes saved.")

    @form.action(_(u"Retrain classifier"))
    def retrain_classifier_action(self, action, data):
        """Call ``train()`` on the content classifier utility."""
        getUtility(IContentClassifier).train()
        self.status = _(u"Classifier trained.")

    @form.action(_(u"Statistics"), validator=null_validator)
    def stats_action(self, action, data):
        """Redirect to the ``@@classification-stats`` view of the context."""
        url_view = getMultiAdapter(
            (self.context, self.request), name='absolute_url')
        stats_url = url_view() + '/@@classification-stats'
        self.request.response.redirect(stats_url)
        return ''

    @form.action(_(u"Cancel"), validator=null_validator)
    def cancel_action(self, action, data):
        """Set the cancelled status and redirect to ``plone_control_panel``."""
        self.status = _(u"Changes cancelled.")
        url_view = getMultiAdapter(
            (self.context, self.request), name='absolute_url')
        panel_url = url_view() + '/plone_control_panel'
        self.request.response.redirect(panel_url)
        return ''
def retrain_classifier_action(self, action, data):
    """Clear the classifier, feed it the catalogued content and train it.

    Only catalog results with a truthy ``Subject`` are added as training
    documents. The portal types searched come from the noun phrase storage's
    ``friendlyTypes``, falling back to ``self._friendlyContentTypes()``.
    """
    storage = getUtility(INounPhraseStorage)
    classifier = getUtility(IContentClassifier)
    classifier.clear()

    catalog = getToolByName(self.context, "portal_catalog")
    portal_types = storage.friendlyTypes or self._friendlyContentTypes()
    for brain in catalog.searchResults(portal_type=portal_types):
        if not brain.Subject:
            # Nothing to learn from content without subjects.
            continue
        classifier.addTrainingDocument(brain["UID"], brain["Subject"])

    classifier.train()
    self.status = _(u"Classifier trained.")
def action_submit(self, action, data):
    """Add the chosen suggestions to the context's categories.

    Subjects already present are not added twice. Afterwards an info status
    message is queued and the response is redirected to the context URL.
    Returns an empty string.
    """
    obj = IClassifiable(self.context)
    # Work on a copy: appending to the adapter's own return value would fail
    # if it is a tuple and would mutate it in place if it is a list.
    subjects = list(obj.categories)
    for subject in data["suggestions"]:
        if subject not in subjects:
            subjects.append(subject)
    obj.categories = subjects
    url = getMultiAdapter((self.context, self.request), name="absolute_url")()
    IStatusMessage(self.request).addStatusMessage(
        _(u"Categories saved."), type="info")
    self.request.response.redirect(url)
    return ""
def action_submit(self, action, data):
    """Add the chosen suggestions to the context's categories.

    Subjects already present are not added twice. Afterwards an info status
    message is queued and the response is redirected to the context URL.
    Returns an empty string.
    """
    obj = IClassifiable(self.context)
    # Work on a copy: appending to the adapter's own return value would fail
    # if it is a tuple and would mutate it in place if it is a list.
    subjects = list(obj.categories)
    for subject in data['suggestions']:
        if subject not in subjects:
            subjects.append(subject)
    obj.categories = subjects
    url = getMultiAdapter((self.context, self.request),
                          name='absolute_url')()
    IStatusMessage(self.request).addStatusMessage(
        _(u"Categories saved."), type="info")
    self.request.response.redirect(url)
    return ''
def retrain_termextractor_action(self, action, data):
    """Clear the noun phrase storage and refill it from catalogued content.

    Each catalog result is loaded as a full object; its ``SearchableText()``
    is converted to plain text and stored under the object's ``UID()``.
    """
    storage = getUtility(INounPhraseStorage)
    storage.clear()

    catalog = getToolByName(self.context, "portal_catalog")
    portal_types = storage.friendlyTypes or self._friendlyContentTypes()
    for brain in catalog.searchResults(portal_type=portal_types):
        # The text is read from the full object rather than from the brain;
        # the original author noted it could not be obtained from the brain
        # (possibly too big for catalog metadata -- unconfirmed).
        content = brain.getObject()
        uid = content.UID()
        plain_text = convertHtmlToWebIntelligentPlainText(
            content.SearchableText())
        storage.addDocument(uid, plain_text)

    self.status = _(
        u"Term extractor trained and NP storage updated."
        " You will need to re-train the classifier as well."
    )
def save_action(self, action, data):
    """Apply the submitted values and swap the tagger if its settings changed.

    The tagger is replaced when the submitted ``tagger_type`` or
    ``brown_categories`` differ from the extractor's ``tagger_metadata``.
    The status message is set in every case.
    """
    form.applyChanges(self.context, self.form_fields, data, self.adapters)
    extractor = getUtility(ITermExtractor)

    requested_type = data["tagger_type"]
    requested_categories = data["brown_categories"]
    tagger_changed = (
        extractor.tagger_metadata["type"] != requested_type
        or extractor.tagger_metadata["categories"] != requested_categories
    )

    if tagger_changed and requested_type == "N-Gram":
        # The N-Gram tagger is trained on the selected Brown categories.
        training_sents = brown.tagged_sents(categories=requested_categories)
        ngram_tagger = getUtility(
            IPOSTagger,
            name="collective.classification.taggers.NgramTagger")
        ngram_tagger.train(training_sents)
        extractor.setTagger(
            ngram_tagger,
            {"type": "N-Gram", "categories": requested_categories})
    elif tagger_changed:
        treebank_tagger = getUtility(
            IPOSTagger,
            name="collective.classification.taggers.PennTreebankTagger")
        extractor.setTagger(
            treebank_tagger,
            {"type": "Pen TreeBank", "categories": []})

    self.status = _(u"Changes saved. You will need to reparse the "
                    "content and then retrain the classifier.")
class ClassificationStatsView(formbase.PageForm):
    """Page form exposing classifier statistics to its template.

    Sets ``classifier``, ``informativeFeatures`` and ``parsedDocs`` on the
    instance at construction time.
    """

    form_fields = form.Fields(IStats)
    template = ViewPageTemplateFile('classificationstats.pt')

    def __init__(self, *args, **kwargs):
        """Initialise the form and collect the statistics attributes."""
        super(ClassificationStatsView, self).__init__(*args, **kwargs)
        portal_catalog = getToolByName(self.context, 'portal_catalog')
        self.classifier = getUtility(IContentClassifier)
        self.informativeFeatures = self.classifier.informativeFeatures()
        # Size of the 'noun_terms' index's _unindex mapping.
        noun_index = portal_catalog._catalog.getIndex('noun_terms')
        self.parsedDocs = len(noun_index._unindex)

    @form.action(_(u"Apply"))
    def action_apply(self, action, data):
        """Refresh ``informativeFeatures`` using the submitted count."""
        classifier = self.classifier
        self.informativeFeatures = classifier.informativeFeatures(
            data['no_features'])
class ClusterizeView(formbase.PageForm):
    """Page form that clusters content with ``KMeans``.

    After the "Clusterize" action runs, ``self.clusters`` holds a list of
    clusters, each a list of ``(url, title, description)`` tuples.
    """
    form_fields = form.Fields(IClusterize)
    template = ViewPageTemplateFile('clusterize.pt')

    @form.action(_(u"Clusterize"))
    def action_clusterize(self, action, data):
        """Run the clusterer and store the result on ``self.clusters``.

        ``data`` supplies ``no_clusters``, ``no_noun_ranks`` and ``repeats``.
        UIDs for which the catalog returns no result are skipped.
        """
        catalog = getToolByName(self.context, 'portal_catalog')
        clusterer = KMeans()
        clusters = clusterer.clusterize(data['no_clusters'],
                                        data['no_noun_ranks'],
                                        repeats=data['repeats'])
        result = []
        for cluster in clusters.values():
            clusterlist = []
            for uid in cluster:
                # NOTE(review): the unrestricted search presumably bypasses
                # the catalog's permission filtering -- confirm that listing
                # these titles to the current user is intended.
                brains = catalog.unrestrictedSearchResults(UID=uid)
                if not brains:
                    # A UID without a catalog entry would otherwise raise
                    # IndexError and abort the whole action.
                    continue
                item = brains[0]
                clusterlist.append(
                    (item.getURL(), item.Title, item.Description))
            result.append(clusterlist)
        self.clusters = result
class IClusterize(Interface):
    """Schema of the clusterize form: three required integer fields."""

    no_clusters = schema.Int(
        title=_(u"Number of clusters"),
        description=_(u""),
        required=True,
    )

    no_noun_ranks = schema.Int(
        title=_(u"Important nouns to keep"),
        # The first fragment ends with a space so the concatenated text reads
        # "building the list", not "building thelist".
        description=_(u"Indicates how many nouns to keep when building the "
                      "list of most frequent nouns in the text."),
        default=20,
        required=True)

    repeats = schema.Int(
        title=_(u"Number of runs"),
        description=_(u""),
        default=10,
        required=True,
    )
def save_action(self, action, data):
    """Apply the submitted values to the context and set the status message."""
    form.applyChanges(
        self.context,
        self.form_fields,
        data,
        self.adapters,
    )
    self.status = _(u"Changes saved.")
class IStats(Interface):
    """Schema of the statistics form: one required integer field."""

    no_features = schema.Int(
        title=_(u"Number of informative features to show"),
        default=10,
        required=True,
    )
def retrain_classifier_action(self, action, data):
    """Call ``train()`` on the content classifier utility and set the status."""
    getUtility(IContentClassifier).train()
    self.status = _(u"Classifier trained.")
def cancel_action(self, action, data):
    """Set the cancelled status and redirect to ``plone_control_panel``.

    The target is the context's absolute URL with ``/plone_control_panel``
    appended. Returns an empty string.
    """
    self.status = _(u"Changes cancelled.")
    url_view = getMultiAdapter(
        (self.context, self.request), name='absolute_url')
    panel_url = url_view() + '/plone_control_panel'
    self.request.response.redirect(panel_url)
    return ''