Code Example #1
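Retrieves the most recent pages for a session. Without a date range it returns the newest documents (optionally narrowed by the session filter); with a date range it either runs a timestamp range search (no filter) or a multi-field query combining the filter with a [fromDate TO toDate] window.
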
  def _getMostRecentPages(self, session):
    es_info = self.esInfo(session['domainId'])

    hits = []
    if session['fromDate'] is None:
      hits = get_most_recent_documents(session['pagesCap'], es_info['mapping'], ["url", "x", "y", es_info['mapping']["tag"], es_info['mapping']["timestamp"], es_info['mapping']["text"]],  
                                       session['filter'],
                                       es_info['activeCrawlerIndex'],
                                       es_info['docType'],
                                       self._es)
    else:
      if session['filter'] is None:
        # Date-range query over the timestamp field (range_search, as in Example #5).
        hits = range_search(es_info['mapping']["timestamp"], session['fromDate'], session['toDate'],
                            ['url', "x", "y", es_info['mapping']['tag'], es_info['mapping']["timestamp"], es_info['mapping']["text"]],
                            True, session['pagesCap'],
                            es_info['activeCrawlerIndex'],
                            es_info['docType'],
                            self._es)
      else:
        s_fields = {
          es_info['mapping']["text"]: "(" + session['filter'].replace('"','\"') + ")",
          es_info['mapping']["timestamp"]: "[" + str(session['fromDate']) + " TO " + str(session['toDate']) + "]" 
        }
        hits = multifield_query_search(s_fields, session['pagesCap'], ["url", "x", "y", es_info['mapping']["tag"], es_info['mapping']["timestamp"], es_info['mapping']["text"]], 
                                       es_info['activeCrawlerIndex'], 
                                       es_info['docType'],
                                       self._es)
    return hits
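
A minimal usage sketch (hypothetical: the session keys are inferred from the code above, and "model" stands for the enclosing crawler-model instance, which this listing does not show):

  # Hypothetical session; keys inferred from _getMostRecentPages above.
  session = {
    'domainId': 'my_domain',   # hypothetical domain id
    'pagesCap': 100,           # maximum number of pages to return
    'filter': None,            # optional full-text filter
    'fromDate': None,          # optional lower bound, epoch milliseconds
    'toDate': None             # optional upper bound, epoch milliseconds
  }
  hits = model._getMostRecentPages(session)
  for hit in hits:
    print hit.get('url')       # each hit maps field names to values
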
Code Example #2
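Summarizes seed-crawler pages within an optional [opt_ts1, opt_ts2] window, counting how many are tagged Relevant, Irrelevant, or neither (Neutral). Missing bounds default to the epoch and to now, here computed in local time and scaled to milliseconds.
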
  def getPagesSummarySeedCrawler(self, opt_ts1 = None, opt_ts2 = None, opt_applyFilter = False, session = None):
    es_info = self.esInfo(session['domainId'])

    # If ts1 is not specified, default to the epoch (effectively -Infinity).
    if opt_ts1 is None:
      now = time.localtime(0)
      opt_ts1 = float(time.mktime(now)) * 1000
    else:
      opt_ts1 = float(opt_ts1)

    # If ts2 is not specified, default to now.
    if opt_ts2 is None:
      now = time.localtime()
      opt_ts2 = float(time.mktime(now)) * 1000
    else:
      opt_ts2 = float(opt_ts2)

    if opt_applyFilter:
      # TODO(Yamuna): apply filter if it is None. Otherwise, match_all.
      results = get_most_recent_documents(session['pagesCap'], es_info['mapping'], ["url", es_info['mapping']["tag"]], 
                                          session['filter'], es_info['activeCrawlerIndex'], es_info['docType'],  \
                                          self._es)
    else:
      # Date-range query over the timestamp field (range_search, as in Example #5).
      results = range_search(es_info['mapping']["timestamp"], opt_ts1, opt_ts2,
                             ['url', es_info['mapping']['tag']], True, session['pagesCap'],
                             es_index=es_info['activeCrawlerIndex'],
                             es_doc_type=es_info['docType'], es=self._es)

    relevant = 0
    irrelevant = 0
    neutral = 0

    # TODO(Yamuna): Double check the return values for crawler
    for res in results:
        try:
          tags = res[es_info['mapping']['tag']]
          if 'Relevant' in tags:
            relevant = relevant + 1
          elif 'Irrelevant' in tags:
            irrelevant = irrelevant + 1
          else:
            # Page has tags, but neither Relevant nor Irrelevant.
            neutral = neutral + 1
        except KeyError:
          # Page does not have tags.
          neutral = neutral + 1

    return {
      'Relevant': relevant,
      'Irrelevant': irrelevant,
      'Neutral': neutral
    }
Code Example #3
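A later variant of the same summary: the default bounds are computed in UTC seconds with calendar.timegm, the filter is applied only when non-empty, and every non-empty tag other than Irrelevant (Relevant or a custom tag) counts toward the relevant total.
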
  def getPagesSummarySeedCrawler(self, opt_ts1 = None, opt_ts2 = None, opt_applyFilter = False, session = None):
    es_info = self.esInfo(session['domainId'])

    # If ts1 is not specified, default to the epoch (effectively -Infinity).
    if opt_ts1 is None:
      now = time.gmtime(0)
      opt_ts1 = float(calendar.timegm(now))
    else:
      opt_ts1 = float(opt_ts1)

    # If ts2 is not specified, default to now.
    if opt_ts2 is None:
      now = time.gmtime()
      opt_ts2 = float(calendar.timegm(now))
    else:
      opt_ts2 = float(opt_ts2)

    if opt_applyFilter and session['filter'] != "":
      results = get_most_recent_documents(session['pagesCap'], es_info['mapping'], ["url", es_info['mapping']["tag"]], 
                                          session['filter'], es_info['activeCrawlerIndex'], es_info['docType'],  \
                                          self._es)
    else:
      # Date-range query over the timestamp field (range_search, as in Example #5).
      results = range_search(es_info['mapping']["timestamp"], opt_ts1, opt_ts2,
                             ['url', es_info['mapping']['tag']], True, session['pagesCap'],
                             es_index=es_info['activeCrawlerIndex'],
                             es_doc_type=es_info['docType'], es=self._es)

    relevant = 0
    irrelevant = 0
    neutral = 0

    for res in results:
        try:
          tags = res[es_info['mapping']['tag']]
          if 'Irrelevant' in tags:
            irrelevant = irrelevant + 1
          else:
            # Any non-empty tag other than Irrelevant (Relevant or custom).
            if "" not in tags:
              relevant = relevant + 1
            else:
              neutral = neutral + 1
        except KeyError:
          # Page does not have tags.
          neutral = neutral + 1

    return {
      'Relevant': relevant,
      'Irrelevant': irrelevant,
      'Neutral': neutral
    }
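
Note that the two variants disagree on timestamp units: Example #2 builds local-time epoch values scaled to milliseconds via time.mktime, while Example #3 builds UTC epoch seconds via calendar.timegm; which is correct depends on how the timestamp field was indexed.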
Code Example #4
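Computes a term summary for the seed crawler: it loads Positive/Negative terms from the terms index, gathers text from Relevant pages (expanded with more-like-this results) or from pages matching the session filter, ranks top terms, bigrams, and trigrams, and finally attaches positive/negative frequencies and stored tags to each term.
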
  def getTermsSummarySeedCrawler(self, opt_maxNumberOfTerms = 40, session = None):

    es_info = self.esInfo(session['domainId'])

    date_format = '%m/%d/%Y %H:%M %Z'
    if session['fromDate'] is not None:
      session['fromDate'] = long(CrawlerModel.convert_to_epoch(datetime.strptime(session['fromDate'], date_format)) * 1000)
    if session['toDate'] is not None:
      session['toDate'] = long(CrawlerModel.convert_to_epoch(datetime.strptime(session['toDate'], date_format)) * 1000)

    s_fields = {
      "tag": "Positive",
      "index": es_info['activeCrawlerIndex'],
      "doc_type": es_info['docType']
    }

    pos_terms = [field['term'][0] for field in multifield_term_search(s_fields, self._capTerms, ['term'], self._termsIndex, 'terms', self._es)]
        
    s_fields["tag"]="Negative"
    neg_terms = [field['term'][0] for field in multifield_term_search(s_fields, self._capTerms, ['term'], self._termsIndex, 'terms', self._es)]

    results = term_search(es_info['mapping']['tag'], ['Relevant'], self._pagesCapTerms, ['url', es_info['mapping']['text']], es_info['activeCrawlerIndex'], es_info['docType'], self._es)

    pos_urls = [field["id"] for field in results]
    
    top_terms = []
    top_bigrams = []
    top_trigrams = []

    if session['filter'] is None:
      urls = []
      if len(pos_urls) > 0:
        # If positive urls are available search for more documents like them
        results_more_like_pos = get_more_like_this(pos_urls, ['url', es_info['mapping']["text"]], self._pagesCapTerms,  es_info['activeCrawlerIndex'], es_info['docType'],  self._es)
        results.extend(results_more_like_pos)
        urls = pos_urls[0:self._pagesCapTerms] + [field['id'] for field in results_more_like_pos] 
        
      if not urls:
        # If positive urls are not available then get the most recent documents
        results = get_most_recent_documents(self._pagesCapTerms, es_info['mapping'], ['url',es_info['mapping']["text"]], session['filter'], es_info['activeCrawlerIndex'], es_info['docType'], self._es)
        urls = [field['id'] for field in results]
        
      if len(results) > 0:
        text = [field[es_info['mapping']["text"]][0] for field in results]
                
        if len(urls) > 0:
          tfidf_all = tfidf.tfidf(urls, pos_tags=self.pos_tags, mapping=es_info['mapping'], es_index=es_info['activeCrawlerIndex'], es_doc_type=es_info['docType'], es=self._es)
          if pos_terms:
            extract_terms_all = extract_terms.extract_terms(tfidf_all)
            [ranked_terms, scores] = extract_terms_all.results(pos_terms)
            top_terms = [ term for term in ranked_terms if (term not in neg_terms)]
            top_terms = top_terms[0:opt_maxNumberOfTerms]
          else:
            top_terms = tfidf_all.getTopTerms(opt_maxNumberOfTerms)
        
        if len(text) > 0:
          [_,_,_,_,_,_,_,_,top_bigrams, top_trigrams] = get_bigrams_trigrams.get_bigrams_trigrams(text, urls, opt_maxNumberOfTerms+len(neg_terms), self.w2v, self._es)
          top_bigrams = [term for term in top_bigrams if term not in neg_terms]
          top_trigrams = [term for term in top_trigrams if term not in neg_terms]
    else:
      s_fields = {
        es_info['mapping']["text"]: "(" + session['filter'].replace('"','\"') + ")"
      }
      if session['fromDate'] is not None:
        s_fields[es_info['mapping']["timestamp"]] = "[" + str(session['fromDate']) + " TO " + str(session['toDate']) + "]" 
        
      results = multifield_query_search(s_fields, self._pagesCapTerms, ["url", es_info['mapping']["text"]], 
                                        es_info['activeCrawlerIndex'], 
                                        es_info['docType'],
                                        self._es)
      
      ids = [field['id'] for field in results]
      text = [field[es_info['mapping']["text"]][0] for field in results]
      urls = [field[es_info['mapping']["url"]][0] for field in results]
      top_terms = get_significant_terms(ids, opt_maxNumberOfTerms, mapping=es_info['mapping'], es_index=es_info['activeCrawlerIndex'], es_doc_type=es_info['docType'], es=self._es)
      if len(text) > 0:
        [_,_,_,_,_,_,_,_,top_bigrams, top_trigrams] = get_bigrams_trigrams.get_bigrams_trigrams(text, urls, opt_maxNumberOfTerms+len(neg_terms), self.w2v, self._es)
        top_bigrams = [term for term in top_bigrams if term not in neg_terms]
        top_trigrams = [term for term in top_trigrams if term not in neg_terms]

    s_fields = {
      "tag": "Custom",
      "index": es_info['activeCrawlerIndex'],
      "doc_type": es_info['docType']
    }

    # Exact-match term lookup, consistent with the Positive/Negative searches above.
    custom_terms = [field['term'][0] for field in multifield_term_search(s_fields, 500, ['term'], self._termsIndex, 'terms', self._es)]

    top_terms = custom_terms + top_terms

    if not top_terms:  
      return []

    pos_freq = {}
    if len(pos_urls) > 1:
      tfidf_pos = tfidf.tfidf(pos_urls, pos_tags=self.pos_tags, mapping=es_info['mapping'], es_index=es_info['activeCrawlerIndex'], es_doc_type=es_info['docType'], es=self._es)
      [_,corpus,ttfs_pos] = tfidf_pos.getTfArray()
      
      total_pos_tf = np.sum(ttfs_pos, axis=0)
      total_pos = np.sum(total_pos_tf)
      pos_freq={}
      for key in top_terms:
        try:
          pos_freq[key] = (float(total_pos_tf[corpus.index(key)])/total_pos)
        except ValueError:
          pos_freq[key] = 0
    else:
      pos_freq = { key: 0 for key in top_terms }      

    neg_urls = [field['id'] for field in term_search(es_info['mapping']['tag'], ['Irrelevant'], self._pagesCapTerms, ['url'], es_info['activeCrawlerIndex'], es_info['docType'], self._es)]
    neg_freq = {}
    if len(neg_urls) > 1:
      tfidf_neg = tfidf.tfidf(neg_urls, pos_tags=self.pos_tags, mapping=es_info['mapping'], es_index=es_info['activeCrawlerIndex'], es_doc_type=es_info['docType'], es=self._es)
      [_,corpus,ttfs_neg] = tfidf_neg.getTfArray()
      total_neg_tf = np.sum(ttfs_neg, axis=0)
      total_neg = np.sum(total_neg_tf)
      neg_freq={}
      for key in top_terms:
        try:
          neg_freq[key] = (float(total_neg_tf[corpus.index(key)])/total_neg)
        except ValueError:
          neg_freq[key] = 0
    else:
      neg_freq = { key: 0 for key in top_terms }      

    terms = []

    s_fields = {
      "term": "",
      "index": es_info['activeCrawlerIndex'],
      "doc_type": es_info['docType'],
    }

    results = []
    for term in top_terms:
      s_fields["term"] = term
      res = multifield_term_search(s_fields, self._capTerms, ['tag', 'term'], self._termsIndex, 'terms', self._es)
      results.extend(res)

    tags = {result['term'][0]: result['tag'][0] for result in results}    

    for term in top_terms:
      entry = [term, pos_freq[term], neg_freq[term], []]
      if tags.get(term) is not None:
        entry[3] = tags[term].split(';')
      terms.append(entry)
      
    for term in top_bigrams:
      entry = [term, 0, 0, []]
      terms.append(entry)

    for term in top_trigrams:
      entry = [term, 0, 0, []]
      terms.append(entry)
    
    return terms
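
Each entry in the returned list has the shape [term, positive_frequency, negative_frequency, tags]; bigrams and trigrams are appended with zero frequencies and empty tag lists.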
Code Example #5
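The domain-level counterpart of the page summary: besides classifying the pages matched by the filter or date range (including an OtherTags bucket for custom tags), it pulls up to 2000 recent pages to compute domain-wide totals.
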
  def getPagesSummaryDomain(self, opt_ts1 = None, opt_ts2 = None, opt_applyFilter = False, session = None):
    es_info = self._esInfo(session['domainId'])

    if opt_ts1 is None:
      now = time.gmtime(0)
      opt_ts1 = float(calendar.timegm(now))
    else:
      opt_ts1 = float(opt_ts1)

    if opt_ts2 is None:
      now = time.gmtime()
      opt_ts2 = float(calendar.timegm(now))
    else:
      opt_ts2 = float(opt_ts2)

    total_results = get_most_recent_documents(2000, es_info['mapping'], ["url", es_info['mapping']["tag"]],
                                              None, es_info['activeDomainIndex'], es_info['docType'],
                                              self._es)
    if opt_applyFilter and session['filter'] != "":
      results = self._getPagesQuery(session)
    else:
      results = range_search(es_info['mapping']["timestamp"], opt_ts1, opt_ts2,
                             ['url', es_info['mapping']['tag']], True, session['pagesCap'],
                             es_index=es_info['activeDomainIndex'],
                             es_doc_type=es_info['docType'], es=self._es)

    relevant = 0
    irrelevant = 0
    neutral = 0
    otherTags = 0
    total_relevant = 0
    total_irrelevant = 0
    total_neutral = 0

    for res_total in total_results:
        try:
          total_tags = res_total[es_info['mapping']['tag']]
          if 'Irrelevant' in total_tags:
            total_irrelevant = total_irrelevant + 1
          else:
            if "" not in total_tags:
              if 'Relevant' in total_tags:
                total_relevant = total_relevant + 1
            else:
              total_neutral = total_neutral + 1
        except KeyError:
          # Page does not have tags.
          total_neutral = total_neutral + 1

    for res in results:
        try:
          tags = res[es_info['mapping']['tag']]
          if 'Irrelevant' in tags:
            irrelevant = irrelevant + 1
          else:
            if "" not in tags:
              if 'Relevant' in tags:
                relevant = relevant + 1
              else:
                otherTags = otherTags + 1
            else:
              neutral = neutral + 1
        except KeyError:
          # Page does not have tags.
          neutral = neutral + 1

    return {
      'Relevant': relevant,
      'Irrelevant': irrelevant,
      'Neutral': neutral,
      'OtherTags': otherTags,
      'TotalRelevant': total_relevant,
      'TotalIrrelevant': total_irrelevant,
      'TotalNeutral': total_neutral
    }
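
The returned dictionary might look like this (hypothetical counts):

  {'Relevant': 12, 'Irrelevant': 3, 'Neutral': 25, 'OtherTags': 2,
   'TotalRelevant': 40, 'TotalIrrelevant': 11, 'TotalNeutral': 190}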