def submit_selected_urls(self, positive, negative): #Perform ranking and diversifing on all urls with regard to the positive urls # #Args: # labeled_urls: a list of pair <url, label>. Label 1 means positive and 0 means negative. #Returns: # urls: list of urls with ranking scores # Test new positive and negative examples with exisitng classifier # If accuracy above threshold classify pages # Ranking # Diversification print '\n\nsubmit_selected_urls\n\n' entries = [] for pos_url in positive: entry = { 'url': pos_url, 'relevance': 1 } entries.append(entry) for neg_url in negative: entry = { 'url': pos_url, 'relevance': 0 } entries.append(entry) if len(entries) > 0: update_document(entries) other = [] for url in positive: if url in self.urls_set: self.positive_urls_set.add(url) self.negative_urls_set.discard(url) for url in negative: if url in self.urls_set: self.negative_urls_set.add(url) self.positive_urls_set.discard(url) for url in self.urls_set: if (len(self.negative_urls_set) == 0) or (url not in self.negative_urls_set): if url not in self.positive_urls_set: other.append(url) chdir(self.memex_home + '/seed_crawler/ranking') ranker = rank.rank() [ranked_urls,scores] = ranker.results(self.tfidf,self.positive_urls_set, other) return [ranked_urls, scores] # classified, ranked, diversified
def updateColors(self, session, colors):
    """Persist the tag color configuration for the session's domain."""
    domain_id = session['domainId']
    # NOTE(review): es_info is never read below; the _esInfo call is kept
    # in case it has side effects — confirm before removing.
    es_info = self._esInfo(domain_id)
    payload = {
        "colors": colors["colors"],
        "index": colors["index"],
    }
    update_document({domain_id: payload}, "config", "tag_colors", self._es)
def submit_selected_urls(self, positive, negative): #Perform ranking and diversifing on all urls with regard to the positive urls # #Args: # labeled_urls: a list of pair <url, label>. Label 1 means positive and 0 means negative. #Returns: # urls: list of urls with ranking scores # Test new positive and negative examples with exisitng classifier # If accuracy above threshold classify pages # Ranking # Diversification print '\n\nsubmit_selected_urls\n\n' entries = [] for pos_url in positive: entry = {'url': pos_url, 'relevance': 1} entries.append(entry) for neg_url in negative: entry = {'url': pos_url, 'relevance': 0} entries.append(entry) if len(entries) > 0: update_document(entries) other = [] for url in positive: if url in self.urls_set: self.positive_urls_set.add(url) self.negative_urls_set.discard(url) for url in negative: if url in self.urls_set: self.negative_urls_set.add(url) self.positive_urls_set.discard(url) for url in self.urls_set: if (len(self.negative_urls_set) == 0) or (url not in self.negative_urls_set): if url not in self.positive_urls_set: other.append(url) chdir(self.memex_home + '/seed_crawler/ranking') ranker = rank.rank() [ranked_urls, scores] = ranker.results(self.tfidf, self.positive_urls_set, other) return [ranked_urls, scores] # classified, ranked, diversified
def saveModelTags(self, session): """ Method to save tags to be considered positive or negative for building a model Parameters: session (json): should have domainId, should have {"model": {"positive": []}} to set positive tags, should have {"model": {"negative": []}} to set negative tags Returns: None """ domainId = session["domainId"] es_info = self._esInfo(domainId) pos_tags = [] try: pos_tags = session['model']['positive'] except KeyError: print "Using default positive tags" neg_tags = [] try: neg_tags = session['model']['negative'] except KeyError: print "Using default negative tags" model_tags = self.getModelTags(domainId) entry = { domainId: { "positive": pos_tags, "index": es_info["activeDomainIndex"] } } update_document(entry, "config", "model_tags", self._es) entry = { domainId: { "negative": neg_tags, "index": es_info["activeDomainIndex"] } } update_document(entry, "config", "model_tags", self._es)
def saveModelTags(self, session): domainId = session["domainId"] es_info = self._esInfo(domainId) pos_tags = [] try: pos_tags = session['model']['positive'] except KeyError: print "Using default positive tags" neg_tags = [] try: neg_tags = session['model']['negative'] except KeyError: print "Using default negative tags" model_tags = self.getModelTags(domainId) entry = { domainId: { "positive": pos_tags, "index": es_info["activeDomainIndex"] } } update_document(entry, "config", "model_tags", self._es) entry = { domainId: { "negative": neg_tags, "index": es_info["activeDomainIndex"] } } update_document(entry, "config", "model_tags", self._es)
def setTermsTag(self, terms, tag, applyTagFlag, session):
    """Apply or remove `tag` on each of `terms` in the terms index.

    Parameters:
        terms: list of term strings.
        tag: tag string to apply or remove.
        applyTagFlag: True to apply the tag, False to remove it.
        session (json): should have domainId.

    Returns:
        None (writes to the terms index as a side effect).
    """
    # TODO(Yamuna): Apply tag to page and update in elastic search.
    # Suggestion: concatenate tags with semi colon, removing repetitions.
    es_info = self.esInfo(session['domainId'])

    s_fields = {
        "term": "",
        "index": es_info['activeCrawlerIndex'],
        "doc_type": es_info['docType'],
    }

    # Fetch the current tag (if any) for each term.
    tags = []
    for term in terms:
        s_fields["term"] = term
        res = multifield_term_search(s_fields, 1, ['tag'], self._termsIndex, 'terms', self._es)
        tags.extend(res)

    results = {result['id']: result['tag'][0] for result in tags}

    # BUG FIX: `results` is keyed by the document id, which this method itself
    # constructs as term_index_docType (see the "_id" fields below), but the
    # original looked entries up with the bare term — so existing tags were
    # never found: duplicates were re-added and untagging never matched.
    # Look up by the same composite id instead.
    # NOTE(review): assumes multifield_term_search returns the ES doc id in
    # 'id' — confirm against the helper.
    add_entries = []
    update_entries = {}

    if applyTagFlag:
        for term in terms:
            _id = term + '_' + es_info['activeCrawlerIndex'] + '_' + es_info['docType']
            old_tag = results.get(_id)
            if old_tag is None:
                # Term has no tag document yet: create one.
                add_entries.append({
                    "term": term,
                    "tag": tag,
                    "index": es_info['activeCrawlerIndex'],
                    "doc_type": es_info['docType'],
                    "_id": _id
                })
            elif tag not in old_tag:
                # Term already has a tag document: overwrite its tag.
                update_entries[_id] = {
                    "term": term,
                    "tag": tag,
                    "index": es_info['activeCrawlerIndex'],
                    "doc_type": es_info['docType'],
                }
    else:
        for term in terms:
            _id = term + '_' + es_info['activeCrawlerIndex'] + '_' + es_info['docType']
            old_tag = results.get(_id)
            if old_tag is not None and tag in old_tag:
                # Clear the tag (original behavior: blanks the whole field).
                update_entries[_id] = {
                    "term": term,
                    "tag": "",
                    "index": es_info['activeCrawlerIndex'],
                    "doc_type": es_info['docType']
                }

    if add_entries:
        add_document(add_entries, self._termsIndex, 'terms', self._es)
    if update_entries:
        update_document(update_entries, self._termsIndex, 'terms', self._es)
def setPagesTag(self, pages, tag, applyTagFlag, session): es_info = self.esInfo(session['domainId']) entries = {} results = get_documents(pages, 'url', [es_info['mapping']['tag']], es_info['activeCrawlerIndex'], es_info['docType'], self._es) if applyTagFlag and len(results) > 0: print '\n\napplied tag ' + tag + ' to pages' + str(pages) + '\n\n' for page in pages: if not results.get(page) is None: # pages to be tagged exist records = results[page] for record in records: entry = {} if record.get(es_info['mapping']['tag']) is None: # there are no previous tags entry[es_info['mapping']['tag']] = tag else: current_tag = record[es_info['mapping']['tag']][0] tags = [] if current_tag != '': # all previous tags were removed tags = list(set(current_tag.split(';'))) if len(tags) != 0: # previous tags exist if not tag in tags: # append new tag entry[es_info['mapping']['tag']] = ';'.join(tags)+';'+tag else: # add new tag entry[es_info['mapping']['tag']] = tag if entry: entries[record['id']] = entry elif len(results) > 0: print '\n\nremoved tag ' + tag + ' from pages' + str(pages) + '\n\n' for page in pages: if not results.get(page) is None: records = results[page] for record in records: entry = {} if not record.get(es_info['mapping']['tag']) is None: current_tag = record[es_info['mapping']['tag']][0] if tag in current_tag: tags = list(set(current_tag.split(';'))) tags.remove(tag) entry[es_info['mapping']['tag']] = ';'.join(tags) entries[record['id']] = entry if entries: update_try = 0 while (update_try < 10): try: update_document(entries, es_info['activeCrawlerIndex'], es_info['docType'], self._es) break except: update_try = update_try + 1