def _getMostRecentPages(self, session):
    """Return the most recent pages for the session's domain.

    Dispatch on the session's date range and filter:
      - no fromDate: most recent documents (optionally filtered),
      - fromDate but no filter: timestamp range search,
      - fromDate and filter: multi-field query on text + timestamp range.

    Returns the raw hit list from the underlying ES search helper.
    """
    es_info = self.esInfo(session['domainId'])

    fields = ["url", "x", "y", es_info['mapping']["tag"],
              es_info['mapping']["timestamp"], es_info['mapping']["text"]]

    hits = []
    if session['fromDate'] is None:
        hits = get_most_recent_documents(session['pagesCap'], es_info['mapping'],
                                         fields, session['filter'],
                                         es_info['activeCrawlerIndex'],
                                         es_info['docType'], self._es)
    else:
        if session['filter'] is None:
            # BUG FIX: the original called the *builtin* range() here with a
            # search-helper argument list; the intended helper is
            # range_search(), as used by getPagesSummaryDomain.
            hits = range_search(es_info['mapping']["timestamp"],
                                session['fromDate'], session['toDate'],
                                fields, True, session['pagesCap'],
                                es_index=es_info['activeCrawlerIndex'],
                                es_doc_type=es_info['docType'], es=self._es)
        else:
            # BUG FIX: the original used replace('"','\"'), which is a no-op
            # in Python source ('\"' == '"'); escape embedded quotes so the
            # filter is safe inside the query string.
            s_fields = {
                es_info['mapping']["text"]:
                    "(" + session['filter'].replace('"', '\\"') + ")",
                es_info['mapping']["timestamp"]:
                    "[" + str(session['fromDate']) + " TO " + str(session['toDate']) + "]"
            }
            hits = multifield_query_search(s_fields, session['pagesCap'], fields,
                                           es_info['activeCrawlerIndex'],
                                           es_info['docType'], self._es)
    return hits
def getPagesSummarySeedCrawler(self, opt_ts1 = None, opt_ts2 = None, opt_applyFilter = False, session = None):
    """Count Relevant / Irrelevant / Neutral pages in [opt_ts1, opt_ts2].

    opt_ts1 defaults to the epoch (effectively -infinity), opt_ts2 to now,
    both as local-time epoch milliseconds. When opt_applyFilter is true the
    session filter is applied instead of the timestamp range.

    NOTE(review): this method is redefined later in the file with slightly
    different counting rules; that later definition shadows this one —
    confirm which version is intended.
    """
    es_info = self.esInfo(session['domainId'])

    # If ts1 not specified, set it to the epoch (i.e. -Infinity), in ms.
    if opt_ts1 is None:
        now = time.localtime(0)
        opt_ts1 = float(time.mktime(now)) * 1000
    else:
        opt_ts1 = float(opt_ts1)

    # If ts2 not specified, set it to now, in ms.
    if opt_ts2 is None:
        now = time.localtime()
        opt_ts2 = float(time.mktime(now)) * 1000
    else:
        opt_ts2 = float(opt_ts2)

    if opt_applyFilter:
        # TODO(Yamuna): apply filter if it is None. Otherwise, match_all.
        results = get_most_recent_documents(session['pagesCap'], es_info['mapping'],
                                            ["url", es_info['mapping']["tag"]],
                                            session['filter'],
                                            es_info['activeCrawlerIndex'],
                                            es_info['docType'], self._es)
    else:
        # BUG FIX: the original called the builtin range() with keyword
        # arguments (es_index=..., es=...), which raises TypeError at
        # runtime; the intended helper is range_search().
        results = range_search(es_info['mapping']["timestamp"], opt_ts1, opt_ts2,
                               ['url', es_info['mapping']['tag']], True,
                               session['pagesCap'],
                               es_index=es_info['activeCrawlerIndex'],
                               es_doc_type=es_info['docType'], es=self._es)

    relevant = 0
    irrelevant = 0
    neutral = 0

    # TODO(Yamuna): Double check the return values for crawler
    for res in results:
        try:
            tags = res[es_info['mapping']['tag']]
            if 'Relevant' in tags:
                relevant = relevant + 1
            elif 'Irrelevant' in tags:
                irrelevant = irrelevant + 1
            else:
                # Page has tags, but not Relevant or Irrelevant.
                neutral = neutral + 1
        except KeyError:
            # Page does not have tags.
            neutral = neutral + 1

    return {'Relevant': relevant, 'Irrelevant': irrelevant, 'Neutral': neutral}
def getPagesSummarySeedCrawler(self, opt_ts1 = None, opt_ts2 = None, opt_applyFilter = False, session = None):
    """Count Relevant / Irrelevant / Neutral pages in [opt_ts1, opt_ts2].

    opt_ts1 defaults to the epoch (effectively -infinity), opt_ts2 to now,
    both as UTC epoch seconds. When opt_applyFilter is true and the session
    filter is non-empty, the filter is applied instead of the timestamp
    range. A page counts as Relevant if it is tagged with anything other
    than 'Irrelevant' and no empty tag; Neutral if untagged or carrying an
    empty tag.
    """
    es_info = self.esInfo(session['domainId'])

    # If ts1 not specified, set it to the epoch (i.e. -Infinity), in UTC seconds.
    if opt_ts1 is None:
        now = time.gmtime(0)
        opt_ts1 = float(calendar.timegm(now))
    else:
        opt_ts1 = float(opt_ts1)

    # If ts2 not specified, set it to now, in UTC seconds.
    if opt_ts2 is None:
        now = time.gmtime()
        opt_ts2 = float(calendar.timegm(now))
    else:
        opt_ts2 = float(opt_ts2)

    if opt_applyFilter and session['filter'] != "":
        results = get_most_recent_documents(session['pagesCap'], es_info['mapping'],
                                            ["url", es_info['mapping']["tag"]],
                                            session['filter'],
                                            es_info['activeCrawlerIndex'],
                                            es_info['docType'], self._es)
    else:
        # BUG FIX: the original called the builtin range() with keyword
        # arguments (es_index=..., es=...), which raises TypeError at
        # runtime; the intended helper is range_search().
        results = range_search(es_info['mapping']["timestamp"], opt_ts1, opt_ts2,
                               ['url', es_info['mapping']['tag']], True,
                               session['pagesCap'],
                               es_index=es_info['activeCrawlerIndex'],
                               es_doc_type=es_info['docType'], es=self._es)

    relevant = 0
    irrelevant = 0
    neutral = 0

    for res in results:
        try:
            tags = res[es_info['mapping']['tag']]
            if 'Irrelevant' in tags:
                irrelevant = irrelevant + 1
            else:
                # Page has tags: Relevant or a custom tag — counts as
                # relevant unless an empty tag is present.
                if "" not in tags:
                    relevant = relevant + 1
                else:
                    neutral = neutral + 1
        except KeyError:
            # Page does not have tags.
            neutral = neutral + 1

    return {'Relevant': relevant, 'Irrelevant': irrelevant, 'Neutral': neutral}
def getTermsSummarySeedCrawler(self, opt_maxNumberOfTerms = 40, session = None):
    """Build a ranked term summary for the seed crawler UI.

    Returns a list of entries [term, pos_frequency, neg_frequency, tag_list],
    covering the top single terms followed by top bigrams and trigrams
    (bigrams/trigrams always carry 0 frequencies and no tags).

    Terms are ranked from relevant-page text via tf-idf when no session
    filter is set, or via significant-terms on the filtered result set
    otherwise; user-tagged "Custom" terms are prepended, and "Negative"
    terms are excluded.
    """
    es_info = self.esInfo(session['domainId'])

    # NOTE(review): `format` shadows the builtin; `long` means this is
    # Python 2 code.
    format = '%m/%d/%Y %H:%M %Z'
    # Convert the session date bounds to epoch milliseconds in place.
    if not session['fromDate'] is None:
        session['fromDate'] = long(CrawlerModel.convert_to_epoch(datetime.strptime(session['fromDate'], format)) * 1000)
    if not session['toDate'] is None:
        session['toDate'] = long(CrawlerModel.convert_to_epoch(datetime.strptime(session['toDate'], format)) * 1000)

    # User-labeled terms for this crawler index: "Positive" terms seed the
    # ranking, "Negative" terms are filtered out of every result list below.
    s_fields = {
        "tag": "Positive",
        "index": es_info['activeCrawlerIndex'],
        "doc_type": es_info['docType']
    }
    pos_terms = [field['term'][0] for field in multifield_term_search(s_fields, self._capTerms, ['term'], self._termsIndex, 'terms', self._es)]

    s_fields["tag"] = "Negative"
    neg_terms = [field['term'][0] for field in multifield_term_search(s_fields, self._capTerms, ['term'], self._termsIndex, 'terms', self._es)]

    # Pages tagged Relevant provide the positive document pool.
    results = term_search(es_info['mapping']['tag'], ['Relevant'], self._pagesCapTerms, ['url', es_info['mapping']['text']], es_info['activeCrawlerIndex'], es_info['docType'], self._es)
    pos_urls = [field["id"] for field in results]

    top_terms = []
    top_bigrams = []
    top_trigrams = []

    if session['filter'] is None:
        urls = []
        if len(pos_urls) > 0:
            # If positive urls are available, search for more documents like
            # them and pool them with the relevant pages.
            results_more_like_pos = get_more_like_this(pos_urls, ['url', es_info['mapping']["text"]], self._pagesCapTerms, es_info['activeCrawlerIndex'], es_info['docType'], self._es)
            results.extend(results_more_like_pos)
            urls = pos_urls[0:self._pagesCapTerms] + [field['id'] for field in results_more_like_pos]
        if not urls:
            # If positive urls are not available then fall back to the most
            # recent documents.
            results = get_most_recent_documents(self._pagesCapTerms, es_info['mapping'], ['url', es_info['mapping']["text"]], session['filter'], es_info['activeCrawlerIndex'], es_info['docType'], self._es)
            urls = [field['id'] for field in results]

        if len(results) > 0:
            text = [field[es_info['mapping']["text"]][0] for field in results]
            if len(urls) > 0:
                # Rank single terms by tf-idf over the pooled documents;
                # when positive terms exist they guide the ranking.
                tfidf_all = tfidf.tfidf(urls, pos_tags=self.pos_tags, mapping=es_info['mapping'], es_index=es_info['activeCrawlerIndex'], es_doc_type=es_info['docType'], es=self._es)
                if pos_terms:
                    extract_terms_all = extract_terms.extract_terms(tfidf_all)
                    [ranked_terms, scores] = extract_terms_all.results(pos_terms)
                    # Drop user-rejected ("Negative") terms, then cap.
                    top_terms = [term for term in ranked_terms if (term not in neg_terms)]
                    top_terms = top_terms[0:opt_maxNumberOfTerms]
                else:
                    top_terms = tfidf_all.getTopTerms(opt_maxNumberOfTerms)
            if len(text) > 0:
                # Extra slots (+len(neg_terms)) compensate for negatives
                # removed right after.
                [_, _, _, _, _, _, _, _, top_bigrams, top_trigrams] = get_bigrams_trigrams.get_bigrams_trigrams(text, urls, opt_maxNumberOfTerms + len(neg_terms), self.w2v, self._es)
                top_bigrams = [term for term in top_bigrams if term not in neg_terms]
                top_trigrams = [term for term in top_trigrams if term not in neg_terms]
    else:
        # A filter is set: query the filtered pages and use significant
        # terms instead of tf-idf.
        # NOTE(review): '\"' == '"' in Python source, so this replace is a
        # no-op; it was presumably meant to escape quotes ('\\"') — confirm.
        s_fields = {
            es_info['mapping']["text"]: "(" + session['filter'].replace('"', '\"') + ")"
        }
        if not session['fromDate'] is None:
            s_fields[es_info['mapping']["timestamp"]] = "[" + str(session['fromDate']) + " TO " + str(session['toDate']) + "]"
        results = multifield_query_search(s_fields, self._pagesCapTerms, ["url", es_info['mapping']["text"]], es_info['activeCrawlerIndex'], es_info['docType'], self._es)
        ids = [field['id'] for field in results]
        text = [field[es_info['mapping']["text"]][0] for field in results]
        urls = [field[es_info['mapping']["url"]][0] for field in results]
        top_terms = get_significant_terms(ids, opt_maxNumberOfTerms, mapping=es_info['mapping'], es_index=es_info['activeCrawlerIndex'], es_doc_type=es_info['docType'], es=self._es)
        if len(text) > 0:
            [_, _, _, _, _, _, _, _, top_bigrams, top_trigrams] = get_bigrams_trigrams.get_bigrams_trigrams(text, urls, opt_maxNumberOfTerms + len(neg_terms), self.w2v, self._es)
            top_bigrams = [term for term in top_bigrams if term not in neg_terms]
            top_trigrams = [term for term in top_trigrams if term not in neg_terms]

    # Prepend user-defined "Custom" terms to the ranked list.
    s_fields = {
        "tag": "Custom",
        "index": es_info['activeCrawlerIndex'],
        "doc_type": es_info['docType']
    }
    custom_terms = [field['term'][0] for field in multifield_query_search(s_fields, 500, ['term'], self._termsIndex, 'terms', self._es)]
    top_terms = custom_terms + top_terms

    if not top_terms:
        return []

    # Relative frequency of each top term within the Relevant pages
    # (0 for every term if there are not at least 2 positive pages).
    pos_freq = {}
    if len(pos_urls) > 1:
        tfidf_pos = tfidf.tfidf(pos_urls, pos_tags=self.pos_tags, mapping=es_info['mapping'], es_index=es_info['activeCrawlerIndex'], es_doc_type=es_info['docType'], es=self._es)
        [_, corpus, ttfs_pos] = tfidf_pos.getTfArray()
        total_pos_tf = np.sum(ttfs_pos, axis=0)
        total_pos = np.sum(total_pos_tf)
        pos_freq = {}
        for key in top_terms:
            try:
                pos_freq[key] = (float(total_pos_tf[corpus.index(key)]) / total_pos)
            except ValueError:
                # Term not present in the positive corpus.
                pos_freq[key] = 0
    else:
        pos_freq = {key: 0 for key in top_terms}

    # Same computation over the Irrelevant pages.
    neg_urls = [field['id'] for field in term_search(es_info['mapping']['tag'], ['Irrelevant'], self._pagesCapTerms, ['url'], es_info['activeCrawlerIndex'], es_info['docType'], self._es)]
    neg_freq = {}
    if len(neg_urls) > 1:
        tfidf_neg = tfidf.tfidf(neg_urls, pos_tags=self.pos_tags, mapping=es_info['mapping'], es_index=es_info['activeCrawlerIndex'], es_doc_type=es_info['docType'], es=self._es)
        [_, corpus, ttfs_neg] = tfidf_neg.getTfArray()
        total_neg_tf = np.sum(ttfs_neg, axis=0)
        total_neg = np.sum(total_neg_tf)
        neg_freq = {}
        for key in top_terms:
            try:
                neg_freq[key] = (float(total_neg_tf[corpus.index(key)]) / total_neg)
            except ValueError:
                neg_freq[key] = 0
    else:
        neg_freq = {key: 0 for key in top_terms}

    terms = []
    # Look up the stored tag string (semicolon-separated) for each top term.
    s_fields = {
        "term": "",
        "index": es_info['activeCrawlerIndex'],
        "doc_type": es_info['docType'],
    }
    results = []
    for term in top_terms:
        s_fields["term"] = term
        res = multifield_term_search(s_fields, self._capTerms, ['tag', 'term'], self._termsIndex, 'terms', self._es)
        results.extend(res)
    tags = {result['term'][0]: result['tag'][0] for result in results}

    for term in top_terms:
        entry = [term, pos_freq[term], neg_freq[term], []]
        if tags and not tags.get(term) is None:
            entry[3] = tags[term].split(';')
        terms.append(entry)
    # Bigrams and trigrams carry no frequencies or tags.
    for term in top_bigrams:
        entry = [term, 0, 0, []]
        terms.append(entry)
    for term in top_trigrams:
        entry = [term, 0, 0, []]
        terms.append(entry)

    return terms
def getPagesSummaryDomain(self, opt_ts1 = None, opt_ts2 = None, opt_applyFilter = False, session = None):
    """Summarize tag counts for the active domain.

    Returns counts both for the pages selected by the current query/range
    ('Relevant', 'Irrelevant', 'Neutral', 'OtherTags') and for an overall
    sample of up to 2000 recent pages ('TotalRelevant', 'TotalIrrelevant',
    'TotalNeutral').

    opt_ts1 defaults to the epoch and opt_ts2 to now (UTC epoch seconds);
    when opt_applyFilter is true and the session filter is non-empty, the
    session query replaces the timestamp range.
    """
    es_info = self._esInfo(session['domainId'])

    # Default ts1 to the epoch (i.e. -Infinity) in UTC seconds.
    if opt_ts1 is None:
        now = time.gmtime(0)
        opt_ts1 = float(calendar.timegm(now))
    else:
        opt_ts1 = float(opt_ts1)

    # Default ts2 to now in UTC seconds.
    if opt_ts2 is None:
        now = time.gmtime()
        opt_ts2 = float(calendar.timegm(now))
    else:
        opt_ts2 = float(opt_ts2)

    # Overall sample used for the Total* counters (independent of the
    # current query). The original also pre-initialized this to [], which
    # was dead code and has been removed.
    total_results = get_most_recent_documents(2000, es_info['mapping'],
                                              ["url", es_info['mapping']["tag"]],
                                              None,
                                              es_info['activeDomainIndex'],
                                              es_info['docType'], self._es)

    if opt_applyFilter and session['filter'] != "":
        results = self._getPagesQuery(session)
    else:
        results = range_search(es_info['mapping']["timestamp"], opt_ts1, opt_ts2,
                               ['url', es_info['mapping']['tag']], True,
                               session['pagesCap'],
                               es_index=es_info['activeDomainIndex'],
                               es_doc_type=es_info['docType'], es=self._es)

    relevant = 0
    irrelevant = 0
    neutral = 0
    otherTags = 0
    total_relevant = 0
    total_irrelevant = 0
    total_neutral = 0

    for res_total in total_results:
        try:
            total_tags = res_total[es_info['mapping']['tag']]
            if 'Irrelevant' in total_tags:
                total_irrelevant = total_irrelevant + 1
            else:
                # NOTE(review): pages whose tag list contains an empty tag
                # are not counted in any Total* bucket here — confirm this
                # is intended.
                if "" not in total_tags:
                    if 'Relevant' in total_tags:
                        total_relevant = total_relevant + 1
                    else:
                        total_neutral = total_neutral + 1
        except KeyError:
            # Untagged page.
            total_neutral = total_neutral + 1

    for res in results:
        try:
            tags = res[es_info['mapping']['tag']]
            if 'Irrelevant' in tags:
                irrelevant = irrelevant + 1
            else:
                if "" not in tags:
                    if 'Relevant' in tags:
                        relevant = relevant + 1
                    else:
                        # Tagged with a custom (non-Relevant) tag.
                        otherTags = otherTags + 1
                else:
                    neutral = neutral + 1
        except KeyError:
            # Untagged page.
            neutral = neutral + 1

    return {
        'Relevant': relevant,
        'Irrelevant': irrelevant,
        'Neutral': neutral,
        'OtherTags': otherTags,
        'TotalRelevant': total_relevant,
        'TotalIrrelevant': total_irrelevant,
        'TotalNeutral': total_neutral
    }