def _getPagesForTags(self, session):
    es_info = self.esInfo(session['domainId'])

    s_fields = {}
    if session['filter'] is not None:
      s_fields[es_info['mapping']["text"]] = session['filter'].replace('"', '\\"')

    if session['fromDate'] is not None:
      s_fields[es_info['mapping']["timestamp"]] = "[" + str(session['fromDate']) + " TO " + str(session['toDate']) + "]"
      
    hits=[]
    tags = session['selected_tags'].split(',')
    for tag in tags:
      if tag != "":
        if tag == "Neutral":
          query_field_missing = {
            "filtered" : {
              "filter" : {
                "missing" : { "field" : "tag" }
              }
            }
          }

          s_fields["queries"] = [query_field_missing]

          results = multifield_term_search(s_fields, session['pagesCap'], ["url", "x", "y", es_info['mapping']["tag"], es_info['mapping']["timestamp"], es_info['mapping']["text"]], 
                                           es_info['activeCrawlerIndex'], 
                                           es_info['docType'],
                                           self._es)

          hits.extend(results)
          
          s_fields["tag"] = ""

          results = multifield_term_search(s_fields, session['pagesCap'], ["url", "x", "y", es_info['mapping']["tag"], es_info['mapping']["timestamp"], es_info['mapping']["text"]], 
                                           es_info['activeCrawlerIndex'], 
                                           es_info['docType'],
                                           self._es)

          hits.extend(results)
          
          s_fields.pop("tag")

        else:  
          # Use a wildcard query because the tag field is not analyzed
          query = {
            "wildcard": {es_info['mapping']["tag"]:"*" + tag + "*"}
          }
          s_fields["queries"] = [query]
          results= multifield_term_search(s_fields, session['pagesCap'], ["url", "x", "y", es_info['mapping']["tag"], es_info['mapping']["timestamp"], es_info['mapping']["text"]], 
                                          es_info['activeCrawlerIndex'], 
                                          es_info['docType'],
                                          self._es)
          hits.extend(results)
        
    return hits
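Because the tag field is not analyzed and a page can carry several labels joined by semicolons (for example "Relevant;Deep Crawl", see the split on ';' further down), the loop above matches a single label with a wildcard query rather than an exact term. A minimal sketch of the query body it builds; the field name "tag" and the label "Relevant" are placeholders, while the real field name comes from es_info['mapping']["tag"]:

# Sketch: wildcard query for one label inside a non-analyzed, semicolon-joined tag field.
# "tag" and "Relevant" are illustrative values, not taken from a live mapping.
def build_tag_wildcard(tag_field, label):
    return {"wildcard": {tag_field: "*" + label + "*"}}

# A document tagged "Relevant;Deep Crawl" matches "*Relevant*", which an exact term
# query on the whole stored string would miss.
print build_tag_wildcard("tag", "Relevant")   # {'wildcard': {'tag': '*Relevant*'}}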
Example #2
  def getAnnotatedTerms(self, session):
    es_info = self._esInfo(session['domainId'])

    s_fields = {
      "index": es_info['activeDomainIndex'],
      "doc_type": es_info['docType']
    }

    results = multifield_term_search(s_fields, 0, self._all, ['tag','term'], self._termsIndex, 'terms', self._es)

    hits = results["results"]
    terms = {}
    for hit in hits:
      term = hit['term'][0]
      terms[term] = {'tag':hit['tag'][0]}

    return terms
Example #3
  def getTermSnippets(self, term, session):
    es_info = self._esInfo(session['domainId'])


    s_fields = {
      "term": term,
      "index": es_info['activeDomainIndex'],
      "doc_type": es_info['docType'],
    }

    results = multifield_term_search(s_fields, 0, self._capTerms, ['tag'], self._termsIndex, 'terms', self._es)
    tags = results["results"]

    tag = []
    if tags:
      tag = tags[0]['tag'][0].split(';')

    return {'term': term, 'tags': tag, 'context': get_context(term.split('_'), es_info['mapping']['text'], 500, es_info['activeDomainIndex'], es_info['docType'],  self._es)}
Example #4
  def getTermSnippets(self, term, session):
    es_info = self.esInfo(session['domainId'])

    #tags = get_documents(term, 'term', ['tag'], es_info['activeCrawlerIndex'], 'terms', self._es)


    s_fields = {
      "term": term,
      "index": es_info['activeCrawlerIndex'],
      "doc_type": es_info['docType'],
    }

    tags = multifield_term_search(s_fields, self._capTerms, ['tag'], self._termsIndex, 'terms', self._es)
    
    tag = []
    if tags:
      tag = tags[0]['tag'][0].split(';')

    return {'term': term, 'tags': tag, 'context': get_context(term.split('_'), es_info['mapping']['text'], es_info['activeCrawlerIndex'], es_info['docType'],  self._es)}
Example #5
  def setTermsTag(self, terms, tag, applyTagFlag, session):
    # TODO(Yamuna): Apply tag to page and update in Elasticsearch. Suggestion: concatenate tags
    # with semicolons, removing repetitions.

    es_info = self.esInfo(session['domainId'])

    s_fields = {
      "term": "",
      "index": es_info['activeCrawlerIndex'],
      "doc_type": es_info['docType'],
    }

    tags = []
    for term in terms:
      s_fields["term"] = term
      res = multifield_term_search(s_fields, 1, ['tag'], self._termsIndex, 'terms', self._es)
      tags.extend(res)

    results = {result['id']: result['tag'][0] for result in tags}

    add_entries = []
    update_entries = {}

    if applyTagFlag:
      for term in terms:
        if len(results) > 0:
          if results.get(term) is None:
            entry = {
              "term" : term,
              "tag" : tag,
              "index": es_info['activeCrawlerIndex'],
              "doc_type": es_info['docType'],
              "_id" : term+'_'+es_info['activeCrawlerIndex']+'_'+es_info['docType']
            }
            add_entries.append(entry)
          else:
            old_tag = results[term]
            if tag not in old_tag:
              entry = {
                "term" : term,
                "tag" : tag,
                "index": es_info['activeCrawlerIndex'],
                "doc_type": es_info['docType'],
              }
              update_entries[term+'_'+es_info['activeCrawlerIndex']+'_'+es_info['docType']] = entry
        else:
          entry = {
            "term" : term,
            "tag" : tag,
            "index": es_info['activeCrawlerIndex'],
            "doc_type": es_info['docType'],
            "_id": term+'_'+es_info['activeCrawlerIndex']+'_'+es_info['docType']
          }
          add_entries.append(entry)
    else:
      for term in terms:
        if len(results) > 0:
          if results.get(term) is not None:
            if tag in results[term]:
              entry = {
                "term" : term,
                "tag" : "",
                "index": es_info['activeCrawlerIndex'],
                "doc_type": es_info['docType']
              }
              update_entries[term+'_'+es_info['activeCrawlerIndex']+'_'+es_info['docType']] = entry

    if add_entries:
      add_document(add_entries, self._termsIndex, 'terms', self._es)
    
    if update_entries:
      update_document(update_entries, self._termsIndex, 'terms', self._es)
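The TODO at the top of setTermsTag suggests concatenating tags with semicolons while removing repetitions. A minimal sketch of that merge, assuming each term's tags are stored as a single semicolon-joined string; merge_tag is a hypothetical helper, not part of the code above:

# Hypothetical helper: merge a new tag into an existing semicolon-joined tag string,
# keeping the original order and dropping duplicates.
def merge_tag(old_tag, new_tag):
    tags = [t for t in old_tag.split(';') if t]
    if new_tag not in tags:
        tags.append(new_tag)
    return ';'.join(tags)

print merge_tag("Relevant;Deep Crawl", "Relevant")   # Relevant;Deep Crawl
print merge_tag("Relevant", "Deep Crawl")            # Relevant;Deep Crawl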
Example #6
  def getTermsSummarySeedCrawler(self, opt_maxNumberOfTerms = 40, session = None):

    es_info = self.esInfo(session['domainId'])

    format = '%m/%d/%Y %H:%M %Z'
    if session['fromDate'] is not None:
      session['fromDate'] = long(CrawlerModel.convert_to_epoch(datetime.strptime(session['fromDate'], format)) * 1000)
    if session['toDate'] is not None:
      session['toDate'] = long(CrawlerModel.convert_to_epoch(datetime.strptime(session['toDate'], format)) * 1000)

    s_fields = {
      "tag": "Positive",
      "index": es_info['activeCrawlerIndex'],
      "doc_type": es_info['docType']
    }

    pos_terms = [field['term'][0] for field in multifield_term_search(s_fields, self._capTerms, ['term'], self._termsIndex, 'terms', self._es)]
        
    s_fields["tag"]="Negative"
    neg_terms = [field['term'][0] for field in multifield_term_search(s_fields, self._capTerms, ['term'], self._termsIndex, 'terms', self._es)]

    results = term_search(es_info['mapping']['tag'], ['Relevant'], self._pagesCapTerms, ['url', es_info['mapping']['text']], es_info['activeCrawlerIndex'], es_info['docType'], self._es)

    pos_urls = [field["id"] for field in results]
    
    top_terms = []
    top_bigrams = []
    top_trigrams = []

    if session['filter'] is None:
      urls = []
      if len(pos_urls) > 0:
        # If positive urls are available search for more documents like them
        results_more_like_pos = get_more_like_this(pos_urls, ['url', es_info['mapping']["text"]], self._pagesCapTerms,  es_info['activeCrawlerIndex'], es_info['docType'],  self._es)
        results.extend(results_more_like_pos)
        urls = pos_urls[0:self._pagesCapTerms] + [field['id'] for field in results_more_like_pos] 
        
      if not urls:
        # If positive urls are not available then get the most recent documents
        results = get_most_recent_documents(self._pagesCapTerms, es_info['mapping'], ['url',es_info['mapping']["text"]], session['filter'], es_info['activeCrawlerIndex'], es_info['docType'], self._es)
        urls = [field['id'] for field in results]
        
      if len(results) > 0:
        text = [field[es_info['mapping']["text"]][0] for field in results]
                
        if len(urls) > 0:
          tfidf_all = tfidf.tfidf(urls, pos_tags=self.pos_tags, mapping=es_info['mapping'], es_index=es_info['activeCrawlerIndex'], es_doc_type=es_info['docType'], es=self._es)
          if pos_terms:
            extract_terms_all = extract_terms.extract_terms(tfidf_all)
            [ranked_terms, scores] = extract_terms_all.results(pos_terms)
            top_terms = [ term for term in ranked_terms if (term not in neg_terms)]
            top_terms = top_terms[0:opt_maxNumberOfTerms]
          else:
            top_terms = tfidf_all.getTopTerms(opt_maxNumberOfTerms)
        
        if len(text) > 0:
          [_,_,_,_,_,_,_,_,top_bigrams, top_trigrams] = get_bigrams_trigrams.get_bigrams_trigrams(text, urls, opt_maxNumberOfTerms+len(neg_terms), self.w2v, self._es)
          top_bigrams = [term for term in top_bigrams if term not in neg_terms]
          top_trigrams = [term for term in top_trigrams if term not in neg_terms]
    else:
      s_fields = {
        es_info['mapping']["text"]: "(" + session['filter'].replace('"','\"') + ")"
      }
      if not session['fromDate'] is None:
        s_fields[es_info['mapping']["timestamp"]] = "[" + str(session['fromDate']) + " TO " + str(session['toDate']) + "]" 
        
      results = multifield_query_search(s_fields, self._pagesCapTerms, ["url", es_info['mapping']["text"]], 
                                        es_info['activeCrawlerIndex'], 
                                        es_info['docType'],
                                        self._es)
      
      ids = [field['id'] for field in results]
      text = [field[es_info['mapping']["text"]][0] for field in results]
      urls = [field[es_info['mapping']["url"]][0] for field in results]
      top_terms = get_significant_terms(ids, opt_maxNumberOfTerms, mapping=es_info['mapping'], es_index=es_info['activeCrawlerIndex'], es_doc_type=es_info['docType'], es=self._es)
      if len(text) > 0:
        [_,_,_,_,_,_,_,_,top_bigrams, top_trigrams] = get_bigrams_trigrams.get_bigrams_trigrams(text, urls, opt_maxNumberOfTerms+len(neg_terms), self.w2v, self._es)
        top_bigrams = [term for term in top_bigrams if term not in neg_terms]
        top_trigrams = [term for term in top_trigrams if term not in neg_terms]

    s_fields = {
      "tag": "Custom",
      "index": es_info['activeCrawlerIndex'],
      "doc_type": es_info['docType']
    }

    custom_terms = [field['term'][0] for field in multifield_query_search(s_fields, 500, ['term'], self._termsIndex, 'terms', self._es)]

    top_terms = custom_terms + top_terms

    if not top_terms:  
      return []

    pos_freq = {}
    if len(pos_urls) > 1:
      tfidf_pos = tfidf.tfidf(pos_urls, pos_tags=self.pos_tags, mapping=es_info['mapping'], es_index=es_info['activeCrawlerIndex'], es_doc_type=es_info['docType'], es=self._es)
      [_,corpus,ttfs_pos] = tfidf_pos.getTfArray()
      
      total_pos_tf = np.sum(ttfs_pos, axis=0)
      total_pos = np.sum(total_pos_tf)
      pos_freq={}
      for key in top_terms:
        try:
          pos_freq[key] = (float(total_pos_tf[corpus.index(key)])/total_pos)
        except ValueError:
          pos_freq[key] = 0
    else:
      pos_freq = { key: 0 for key in top_terms }      

    neg_urls = [field['id'] for field in term_search(es_info['mapping']['tag'], ['Irrelevant'], self._pagesCapTerms, ['url'], es_info['activeCrawlerIndex'], es_info['docType'], self._es)]
    neg_freq = {}
    if len(neg_urls) > 1:
      tfidf_neg = tfidf.tfidf(neg_urls, pos_tags=self.pos_tags, mapping=es_info['mapping'], es_index=es_info['activeCrawlerIndex'], es_doc_type=es_info['docType'], es=self._es)
      [_,corpus,ttfs_neg] = tfidf_neg.getTfArray()
      total_neg_tf = np.sum(ttfs_neg, axis=0)
      total_neg = np.sum(total_neg_tf)
      neg_freq={}
      for key in top_terms:
        try:
          neg_freq[key] = (float(total_neg_tf[corpus.index(key)])/total_neg)
        except ValueError:
          neg_freq[key] = 0
    else:
      neg_freq = { key: 0 for key in top_terms }      

    terms = []

    s_fields = {
      "term": "",
      "index": es_info['activeCrawlerIndex'],
      "doc_type": es_info['docType'],
    }

    results = []
    for term in top_terms:
      s_fields["term"] = term
      res = multifield_term_search(s_fields, self._capTerms, ['tag', 'term'], self._termsIndex, 'terms', self._es)
      results.extend(res)

    tags = {result['term'][0]: result['tag'][0] for result in results}    

    for term in top_terms:
      entry = [term, pos_freq[term], neg_freq[term], []]
      if tags and tags.get(term) is not None:
        entry[3] = tags[term].split(';')
      terms.append(entry)
      
    for term in top_bigrams:
      entry = [term, 0, 0, []]
      terms.append(entry)

    for term in top_trigrams:
      entry = [term, 0, 0, []]
      terms.append(entry)
    
    return terms
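The pos_freq and neg_freq values above are plain term-frequency ratios: a term's count summed over the tagged pages, divided by the total number of term occurrences in those pages, defaulting to 0 when the term is not in the corpus. A small self-contained sketch of that computation; the corpus and counts are made up for illustration:

import numpy as np

# ttfs: (pages x corpus terms) term-frequency matrix; corpus: the corpus term list.
corpus = ["solar", "panel", "energy"]
ttfs = np.array([[2, 1, 0],
                 [1, 0, 3]])

total_tf = np.sum(ttfs, axis=0)   # per-term counts over all pages -> [3, 1, 3]
total = np.sum(total_tf)          # all term occurrences -> 7

freq = {}
for key in ["solar", "battery"]:  # "battery" is not in the corpus
    try:
        freq[key] = float(total_tf[corpus.index(key)]) / total
    except ValueError:
        freq[key] = 0

print freq["solar"], freq["battery"]   # ~0.4286 0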
Example #7
  def createModel(self, session):
    es_info = self.esInfo(session['domainId'])

    data_dir = environ["DDT_HOME"] + "/data/"
    data_crawler  = data_dir + es_info['activeCrawlerIndex']
    data_training = data_crawler + "/training_data/"
    data_negative = data_crawler + "/training_data/negative/"
    data_positive = data_crawler + "/training_data/positive/"

    if (not isdir(data_positive)):
      makedirs(data_positive)
    if (not isdir(data_negative)):
      makedirs(data_negative)

    s_fields = {}
    query = {
      "wildcard": {es_info['mapping']["tag"]:"*Relevant*"}
    }
    s_fields["queries"] = [query]
    pos_urls = [field['url'][0] for field in multifield_term_search(s_fields, self._all, ["url", es_info['mapping']['tag']], 
                                    es_info['activeCrawlerIndex'], 
                                    es_info['docType'],
                                    self._es) if "irrelevant" not in field["tag"]]

    query = {
      "wildcard": {es_info['mapping']["tag"]:"*Irrelevant*"}
    }
    s_fields["queries"] = [query]
    neg_urls = [field['url'][0] for field in multifield_term_search(s_fields, self._all, ["url", es_info['mapping']['tag']], 
                                    es_info['activeCrawlerIndex'], 
                                    es_info['docType'],
                                    self._es)]

    pos_html = get_documents(pos_urls, 'url', [es_info['mapping']["html"]], es_info['activeCrawlerIndex'], es_info['docType'])
    neg_html = get_documents(neg_urls, 'url', [es_info['mapping']["html"]], es_info['activeCrawlerIndex'], es_info['docType'])

    seeds_file = data_crawler +"/seeds.txt"
    print "Seeds path ", seeds_file
    with open(seeds_file, 'w') as s:
      for url in pos_html:
        try:
          file_positive = data_positive + self.encode(url.encode('utf8'))
          print file_positive
          s.write(url.encode('utf8') + '\n')
          with open(file_positive, 'w') as f:
            f.write(pos_html[url][0][es_info['mapping']['html']][0])

        except IOError:
          _, exc_obj, tb = exc_info()
          f = tb.tb_frame
          lineno = tb.tb_lineno
          filename = f.f_code.co_filename
          linecache.checkcache(filename)
          line = linecache.getline(filename, lineno, f.f_globals)
          print 'EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj)

    for url in neg_html:
      try:
        file_negative = data_negative + self.encode(url.encode('utf8'))
        with open(file_negative, 'w') as f:
          f.write(neg_html[url][0][es_info['mapping']['html']][0])
      except IOError:
        _, exc_obj, tb = exc_info()
        f = tb.tb_frame
        lineno = tb.tb_lineno
        filename = f.f_code.co_filename
        linecache.checkcache(filename)
        line = linecache.getline(filename, lineno, f.f_globals)
        print 'EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj)
    
    models_dir = environ["DDT_HOME"] + "/vis/html/models/"
    crawlermodel_dir = models_dir + es_info['activeCrawlerIndex']
    
    if (not isdir(models_dir)):
      makedirs(models_dir)

    if (not isdir(crawlermodel_dir)):
      makedirs(crawlermodel_dir)

    ache_home = environ['ACHE_HOME']
    comm = ache_home + "/bin/ache buildModel -t " + data_training + " -o "+ crawlermodel_dir + " -c " + ache_home + "/config/stoplist.txt"
    p = Popen(comm, shell=True, stderr=PIPE)
    output, errors = p.communicate()
    print output
    print errors

    zip_filename = models_dir + es_info['activeCrawlerIndex'] + "_model.zip"
    with ZipFile(zip_filename, "w") as modelzip:
      if (isfile(crawlermodel_dir + "/pageclassifier.features")):
        print "zipping file: "+crawlermodel_dir + "/pageclassifier.features"
        modelzip.write(crawlermodel_dir + "/pageclassifier.features", "pageclassifier.features")
      
      if (isfile(crawlermodel_dir + "/pageclassifier.model")):
        print "zipping file: "+crawlermodel_dir + "/pageclassifier.model"
        modelzip.write(crawlermodel_dir + "/pageclassifier.model", "pageclassifier.model")

      if (exists(data_crawler + "/training_data/positive")):
        print "zipping file: "+ data_crawler + "/training_data/positive"
        for (dirpath, dirnames, filenames) in walk(data_crawler + "/training_data/positive"):
          for html_file in filenames:
            modelzip.write(dirpath + "/" + html_file, "training_data/positive/" + html_file)

      if (exists(data_crawler + "/training_data/negative")):
        print "zipping file: "+ data_crawler + "/training_data/negative"
        for (dirpath, dirnames, filenames) in walk(data_crawler + "/training_data/negative"):
          for html_file in filenames:
            modelzip.write(dirpath + "/" + html_file, "training_data/negative/" + html_file)
        
      if (isfile(data_crawler +"/seeds.txt")):
        print "zipping file: "+data_crawler +"/seeds.txt"
        modelzip.write(data_crawler +"/seeds.txt", es_info['activeCrawlerIndex'] + "_seeds.txt")

    chmod(zip_filename, 0o777)

    return "models/" + es_info['activeCrawlerIndex'] + "_model.zip"
Example #8
    def createModel(self, session=None, zip=True):
        """ Create an ACHE model to be applied to SeedFinder and focused crawler.
        It saves the classifiers, features, the training data in the <project>/data/<domain> directory.
        If zip=True all generated files and folders are zipped into a file.

        Parameters:
        session (json): should have domainId

        Returns:
        Zip file url or message text
        """
        path = self._path

        es_info = self._esInfo(session["domainId"])

        data_dir = path + "/data/"
        data_domain  = data_dir + es_info['activeDomainIndex']
        data_training = data_domain + "/training_data/"
        data_negative = data_domain + "/training_data/negative/"
        data_positive = data_domain + "/training_data/positive/"

        if (not isdir(data_positive)):
            # Create dir if it does not exist
            makedirs(data_positive)
        else:
            # Remove all previous files
            for filename in listdir(data_positive):
                remove(data_positive+filename)

        if (not isdir(data_negative)):
            # Create dir if it does not exist
            makedirs(data_negative)
        else:
            # Remove all previous files
            for filename in listdir(data_negative):
                remove(data_negative+filename)

        pos_tags = ["Relevant"]
        neg_tags = ["Irrelevant"]

        try:
            pos_tags = session['model']['positive']
        except KeyError:
            print "Using default positive tags"

        try:
            neg_tags = session['model']['negative']
        except KeyError:
            print "Using default negative tags"

        pos_docs = []

        for tag in pos_tags: #.split(','):
            s_fields = {}
            query = {
                "wildcard": {es_info['mapping']["tag"]:tag}
            }
            s_fields["queries"] = [query]

            results = multifield_term_search(s_fields,
                                             0, self._all,
                                             ["url", es_info['mapping']['html']],
                                             es_info['activeDomainIndex'],
                                             es_info['docType'],
                                             self._es)

            pos_docs = pos_docs + results['results']

        pos_html = {field['url'][0]:field[es_info['mapping']["html"]][0] for field in pos_docs}

        neg_docs = []
        for tag in neg_tags: #.split(','):
            s_fields = {}
            query = {
                "wildcard": {es_info['mapping']["tag"]:tag}
            }
            s_fields["queries"] = [query]
            results = multifield_term_search(s_fields,
                                             0, self._all,
                                             ["url", es_info['mapping']['html']],
                                             es_info['activeDomainIndex'],
                                             es_info['docType'],
                                             self._es)
            neg_docs = neg_docs + results['results']

        neg_html = {field['url'][0]:field[es_info['mapping']["html"]][0] for field in neg_docs}

        seeds_file = data_domain +"/seeds.txt"
        print "Seeds path ", seeds_file
        with open(seeds_file, 'w') as s:
            for url in pos_html:
                try:
                    file_positive = data_positive + self._encode(url.encode('utf8'))
                    s.write(url.encode('utf8') + '\n')
                    with open(file_positive, 'w') as f:
                        f.write(pos_html[url].encode('utf8'))

                except IOError:
                    _, exc_obj, tb = exc_info()
                    f = tb.tb_frame
                    lineno = tb.tb_lineno
                    filename = f.f_code.co_filename
                    linecache.checkcache(filename)
                    line = linecache.getline(filename, lineno, f.f_globals)
                    print 'EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj)

        for url in neg_html:
            try:
                file_negative = data_negative + self._encode(url.encode('utf8'))
                with open(file_negative, 'w') as f:
                    f.write(neg_html[url].encode('utf8'))
            except IOError:
                _, exc_obj, tb = exc_info()
                f = tb.tb_frame
                lineno = tb.tb_lineno
                filename = f.f_code.co_filename
                linecache.checkcache(filename)
                line = linecache.getline(filename, lineno, f.f_globals)
                print 'EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj)

        domainmodel_dir = data_domain + "/models/"

        if (not isdir(domainmodel_dir)):
            makedirs(domainmodel_dir)
        else:
            # Remove all previous files
            for filename in listdir(domainmodel_dir):
                remove(domainmodel_dir+filename)


        if len(neg_docs) > 0:
            ache_home = environ['ACHE_HOME']
            comm = ache_home + "/bin/ache buildModel -t " + data_training + " -o "+ domainmodel_dir + " -c " + ache_home + "/config/sample_config/stoplist.txt"
            p = Popen(comm, shell=True, stderr=PIPE)
            output, errors = p.communicate()
            print output
            print errors
        else:
            return "No irrelevant pages to build domain model"

        if zip:
            return self._createModelZip(session)

        return "Model created successfully"
Example #9
    def getRecommendations(self, num_pages, session):
        """ Method to recommend tlds for deep crawling. These are tlds in the crawled relevant pages
        which have not yet been marked for deep crawl and are sorted by the number of relevant urls
        in the tld that were crawled.

        Parameters:
        session (json): should have domainId

        Returns:
        {<tld>:<number of relevant pages crawler>}
        """

        domainId = session['domainId']

        es_info = self._esInfo(domainId)

        s_fields = {
            "tag": "Positive",
            "index": es_info['activeDomainIndex'],
            "doc_type": es_info['docType']
        }

        results = multifield_term_search(s_fields, 0, self._all, ['term'], self._termsIndex, 'terms', self._es)
        pos_terms = [field['term'][0] for field in results["results"]]

        unique_tlds = {}

        if len(pos_terms) > 0:
            mm_queries = []
            for term in pos_terms:
                mm_queries.append({'multi_match': {
                    'query': term,
                    'fields': [es_info['mapping']["text"], es_info['mapping']["title"]+"^2",es_info['mapping']["domain"]+"^3"],
                    'type': 'cross_fields',
                    'operator': 'and'
                }})
            query = {
                'query':{
                    'bool':{
                        'must_not':{
                            'term': {'isRelevant': 'irrelevant' }
                        },
                        'should': mm_queries,
                        "minimum_number_should_match": 1
                    }
                }
            }
            
            results = exec_query(query,
                                 ['url', 'domain'],
                                 0, self._all,
                                 es_info['activeDomainIndex'],
                                 es_info['docType'],
                                 self._es)
            
            domain_scored_pages = {}
            for result in results['results']:
                if result.get('domain') is None:
                    continue
                domain = result['domain'][0]
                domain_info = domain_scored_pages.get(domain)
                if domain_info is not None:
                    domain_info[0] = domain_info[0] + result['score']
                    domain_info[1] = domain_info[1] + 1
                    domain_info[2] = domain_info[0] / float(domain_info[1])
                else:
                    domain_info = [result['score'], 1, result['score']]
                    
                domain_scored_pages[domain] = domain_info

            unique_tlds = {k:{'count':v[1],'score':v[2]} for k,v in domain_scored_pages.items()}
            
        else:    
            query = {
                'bool':{
                    'must_not':{
                        'term': {'isRelevant': 'irrelevant' }
                    }
                }
            }
            for k, v in get_unique_values('domain.exact', query, self._all, es_info['activeDomainIndex'], es_info['docType'], self._es).items():
                if "." in k:
                    unique_tlds[k] = {'count':v}

        #Get tlds in pages annotated deep crawl
        query = {
            "term": {
                "tag": {
                    "value": "Deep Crawl"
                }
            }
        }

        unique_dp_tlds = {}

        for k, v in get_unique_values('domain.exact', query, self._all, es_info['activeDomainIndex'], es_info['docType'], self._es).items():
            unique_dp_tlds[k.replace("www.","")] = v

        #Get tlds that are not already annotated deep crawl
        recommendations = list(set([k.replace('www.','') for k in unique_tlds.keys()]).difference(set(unique_dp_tlds.keys())))

        recommended_tlds = {}

        for k, v in unique_tlds.items(): 
            if k in recommendations and v['count'] >= int(num_pages):
                recommended_tlds[k] = v
                
        return recommended_tlds
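getRecommendations aggregates query hits per top-level domain as a triple [total score, page count, running average score], then keeps only the count and the average. A compact, self-contained sketch of that accumulation; the result rows below are made up:

# Sketch of the per-domain score aggregation: [total_score, count, avg_score] per domain.
results = [
    {"domain": ["example.org"], "score": 2.0},
    {"domain": ["example.org"], "score": 4.0},
    {"domain": ["another.net"], "score": 1.5},
]

domain_scored_pages = {}
for result in results:
    domain = result["domain"][0]
    info = domain_scored_pages.get(domain)
    if info is not None:
        info[0] += result["score"]          # accumulate score
        info[1] += 1                        # accumulate page count
        info[2] = info[0] / float(info[1])  # running average
    else:
        info = [result["score"], 1, result["score"]]
    domain_scored_pages[domain] = info

unique_tlds = {k: {"count": v[1], "score": v[2]} for k, v in domain_scored_pages.items()}
print unique_tlds
# example.org -> count 2, score 3.0; another.net -> count 1, score 1.5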