Esempio n. 1
0
    def submit_selected_urls(self, positive, negative):
    #Perform ranking and diversifing on all urls with regard to the positive urls
    #
    #Args:
    #   labeled_urls: a list of pair <url, label>. Label 1 means positive and 0 means negative.
    #Returns:
    #   urls: list of urls with ranking scores

        # Test new positive and negative examples with exisitng classifier
        # If accuracy above threshold classify pages
        # Ranking 
        # Diversification
        
        print '\n\nsubmit_selected_urls\n\n'

        entries = []
        for pos_url in positive:
            entry = {
                'url': pos_url,
                'relevance': 1
            }
            entries.append(entry)
            
        for neg_url in negative:
            entry = {
                'url': pos_url,
                'relevance': 0
            }
            entries.append(entry)

        if len(entries) > 0:
            update_document(entries)

        other = []
        
        for url in positive:
            if url in self.urls_set:
                self.positive_urls_set.add(url)
                self.negative_urls_set.discard(url)

        for url in negative:
            if url in self.urls_set:
                self.negative_urls_set.add(url)
                self.positive_urls_set.discard(url)
                
        for url in self.urls_set:
            if (len(self.negative_urls_set) == 0) or (url not in self.negative_urls_set):
                if url not in self.positive_urls_set:
                    other.append(url)

        chdir(self.memex_home + '/seed_crawler/ranking')
        ranker = rank.rank()
        
        [ranked_urls,scores] = ranker.results(self.tfidf,self.positive_urls_set, other)
        return [ranked_urls, scores] # classified, ranked, diversified 
Esempio n. 2
0
  def updateColors(self, session, colors):
    """Persist the tag color settings for the session's domain."""
    domain_id = session['domainId']
    # NOTE(review): return value is unused; the call is preserved in case
    # _esInfo has side effects — confirm before removing.
    es_info = self._esInfo(domain_id)

    payload = {
      domain_id: {
        "colors": colors["colors"],
        "index": colors["index"],
      }
    }
    update_document(payload, "config", "tag_colors", self._es)
Esempio n. 3
0
    def submit_selected_urls(self, positive, negative):
        #Perform ranking and diversifing on all urls with regard to the positive urls
        #
        #Args:
        #   labeled_urls: a list of pair <url, label>. Label 1 means positive and 0 means negative.
        #Returns:
        #   urls: list of urls with ranking scores

        # Test new positive and negative examples with exisitng classifier
        # If accuracy above threshold classify pages
        # Ranking
        # Diversification

        print '\n\nsubmit_selected_urls\n\n'

        entries = []
        for pos_url in positive:
            entry = {'url': pos_url, 'relevance': 1}
            entries.append(entry)

        for neg_url in negative:
            entry = {'url': pos_url, 'relevance': 0}
            entries.append(entry)

        if len(entries) > 0:
            update_document(entries)

        other = []

        for url in positive:
            if url in self.urls_set:
                self.positive_urls_set.add(url)
                self.negative_urls_set.discard(url)

        for url in negative:
            if url in self.urls_set:
                self.negative_urls_set.add(url)
                self.positive_urls_set.discard(url)

        for url in self.urls_set:
            if (len(self.negative_urls_set)
                    == 0) or (url not in self.negative_urls_set):
                if url not in self.positive_urls_set:
                    other.append(url)

        chdir(self.memex_home + '/seed_crawler/ranking')
        ranker = rank.rank()

        [ranked_urls, scores] = ranker.results(self.tfidf,
                                               self.positive_urls_set, other)
        return [ranked_urls, scores]  # classified, ranked, diversified
Esempio n. 4
0
    def saveModelTags(self, session):
        """ Method to save tags to be considered positive or negative for building a model

        Parameters:
        session (json): should have domainId, should have {"model": {"positive": []}} to set positive tags,
                        should have {"model": {"negative": []}}  to set negative tags

        Returns:
        None
        """
        domainId = session["domainId"]

        es_info = self._esInfo(domainId)

        pos_tags = []
        try:
            pos_tags = session['model']['positive']
        except KeyError:
            print "Using default positive tags"

        neg_tags = []
        try:
            neg_tags = session['model']['negative']
        except KeyError:
            print "Using default negative tags"

        model_tags = self.getModelTags(domainId)

        entry = {
            domainId: {
                "positive": pos_tags,
                "index":  es_info["activeDomainIndex"]
            }
        }

        update_document(entry, "config", "model_tags", self._es)

        entry = {
            domainId: {
                "negative": neg_tags,
                "index":  es_info["activeDomainIndex"]
            }
        }

        update_document(entry, "config", "model_tags", self._es)
Esempio n. 5
0
    def saveModelTags(self, session):
        """Save the tags considered positive or negative for building a model.

        Parameters:
        session (json): must contain "domainId"; may contain
            {"model": {"positive": [...]}} to set positive tags and
            {"model": {"negative": [...]}} to set negative tags.

        Returns:
        None
        """

        domainId = session["domainId"]

        es_info = self._esInfo(domainId)

        # Fall back to an empty tag list when the session carries no
        # explicit positive/negative model tags.
        pos_tags = []
        try:
            pos_tags = session['model']['positive']
        except KeyError:
            print "Using default positive tags"

        neg_tags = []
        try:
            neg_tags = session['model']['negative']
        except KeyError:
            print "Using default negative tags"

        # NOTE(review): model_tags is never read below; presumably left over —
        # confirm whether getModelTags has side effects before removing.
        model_tags = self.getModelTags(domainId)

        # Store the positive tags for this domain in the config index.
        entry = {
            domainId: {
                "positive": pos_tags,
                "index":  es_info["activeDomainIndex"]
            }
        }

        update_document(entry, "config", "model_tags", self._es)

        # Store the negative tags under the same domain key.
        entry = {
            domainId: {
                "negative": neg_tags,
                "index":  es_info["activeDomainIndex"]
            }
        }

        update_document(entry, "config", "model_tags", self._es)
  def setTermsTag(self, terms, tag, applyTagFlag, session):
    """Apply or remove *tag* on the given terms in the terms index.

    Parameters:
    terms: list of term strings to tag/untag.
    tag (str): the tag to apply (applyTagFlag truthy) or remove (falsy).
    applyTagFlag: when truthy, add the tag; otherwise clear it.
    session (json): must contain 'domainId'.

    New terms are added via add_document; existing ones via update_document.
    """
    # TODO(Yamuna): Apply tag to page and update in elastic search. Suggestion: concatenate tags
    # with semi colon, removing repetitions.

    es_info = self.esInfo(session['domainId'])

    # Search template; "term" is filled in per term inside the loop below.
    s_fields = {
      "term": "",
      "index": es_info['activeCrawlerIndex'],
      "doc_type": es_info['docType'],
    }

    # Collect the existing tag records (at most 1 hit per term).
    tags = []
    for term in terms:
      s_fields["term"] = term
      res = multifield_term_search(s_fields, 1, ['tag'], self._termsIndex, 'terms', self._es)
      tags.extend(res)

    # NOTE(review): keyed by result['id'] but looked up below with the bare
    # term; if 'id' is the composite term_index_doctype id, results.get(term)
    # would always miss — confirm what 'id' holds for these records.
    results = {result['id']: result['tag'][0] for result in tags}

    add_entries = []      # brand-new term records (carry an explicit "_id")
    update_entries = {}   # existing records to modify, keyed by composite id

    if applyTagFlag:
      for term in terms:
        if len(results) > 0:
          if results.get(term) is None:
            # Term has no record yet: create one with the tag.
            entry = {
              "term" : term,
              "tag" : tag,
              "index": es_info['activeCrawlerIndex'],
              "doc_type": es_info['docType'],
              "_id" : term+'_'+es_info['activeCrawlerIndex']+'_'+es_info['docType']
            }
            add_entries.append(entry)
          else:
            # Term exists: only update when the tag is not already present.
            # NOTE(review): substring containment, not exact membership.
            old_tag = results[term]
            if tag not in old_tag:
              entry = {
                "term" : term,
                "tag" : tag,
                "index": es_info['activeCrawlerIndex'],
                "doc_type": es_info['docType'],
              }
              update_entries[term+'_'+es_info['activeCrawlerIndex']+'_'+es_info['docType']] = entry
        else:
          # No existing records at all: every term becomes a new record.
          entry = {
            "term" : term,
            "tag" : tag,
            "index": es_info['activeCrawlerIndex'],
            "doc_type": es_info['docType'],
            "_id": term+'_'+es_info['activeCrawlerIndex']+'_'+es_info['docType']
          }
          add_entries.append(entry)
    else:
      # Removal path: clear the tag field on records that currently carry it.
      for term in terms:
        if len(results) > 0:
          if not results.get(term) is None:
            if tag in results[term]:
              entry = {
                "term" : term,
                "tag" : "",
                "index": es_info['activeCrawlerIndex'],
                "doc_type": es_info['docType']
              }
              update_entries[term+'_'+es_info['activeCrawlerIndex']+'_'+es_info['docType']] = entry

    if add_entries:
      add_document(add_entries, self._termsIndex, 'terms', self._es)
    
    if update_entries:
      update_document(update_entries, self._termsIndex, 'terms', self._es)
  def setPagesTag(self, pages, tag, applyTagFlag, session):
    
    es_info = self.esInfo(session['domainId'])

    entries = {}
    results = get_documents(pages, 'url', [es_info['mapping']['tag']], es_info['activeCrawlerIndex'], es_info['docType'],  self._es)

    if applyTagFlag and len(results) > 0:
      print '\n\napplied tag ' + tag + ' to pages' + str(pages) + '\n\n'
      
      for page in pages:
        if not results.get(page) is None:
          # pages to be tagged exist
          records = results[page]
          for record in records:
            entry = {}
            if record.get(es_info['mapping']['tag']) is None:
              # there are no previous tags
              entry[es_info['mapping']['tag']] = tag
            else:
              current_tag = record[es_info['mapping']['tag']][0]
              tags = []
              if  current_tag != '':
                # all previous tags were removed
                tags = list(set(current_tag.split(';')))
                
              if len(tags) != 0:
                # previous tags exist
                if not tag in tags:
                  # append new tag    
                  entry[es_info['mapping']['tag']] = ';'.join(tags)+';'+tag
              else:
                # add new tag
                entry[es_info['mapping']['tag']] = tag

            if entry:
                  entries[record['id']] =  entry

    elif len(results) > 0:
      print '\n\nremoved tag ' + tag + ' from pages' + str(pages) + '\n\n'

      for page in pages:
        if not results.get(page) is None:
          records = results[page]
          for record in records:
            entry = {}
            if not record.get(es_info['mapping']['tag']) is None:
              current_tag = record[es_info['mapping']['tag']][0]
              if tag in current_tag:
                tags = list(set(current_tag.split(';')))
                tags.remove(tag)
                entry[es_info['mapping']['tag']] = ';'.join(tags)
                entries[record['id']] = entry
    
    if entries:
      update_try = 0
      while (update_try < 10):
        try:
          update_document(entries, es_info['activeCrawlerIndex'], es_info['docType'], self._es)
          break
        except:
          update_try = update_try + 1