Example 1
    def __update_topic_language_model(self, text_list):

        topic_text = '{title} {title} {title} {content}'.format(
            **self._topic.__dict__)

        n = len(text_list)
        snippet_text = ' '.join(text_list)

        term_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        term_extractor.extract_queries_from_text(topic_text)
        topic_term_counts = term_extractor.query_count

        term_extractor.extract_queries_from_text(snippet_text)
        new_text_term_counts = term_extractor.query_count

        for term in topic_term_counts:
            if term in new_text_term_counts:
                new_text_term_counts[term] += topic_term_counts[term]
            else:
                new_text_term_counts[term] = topic_term_counts[term]

        new_language_model = LanguageModel(term_dict=new_text_term_counts)

        self.topic_language_model = SmoothedLanguageModel(
            new_language_model, self.background_language_model, self.mu)

        log.debug("Updating topic {0}".format(self._topic.id))
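The merge loop above folds the topic term counts into the snippet term counts so that topic terms are never lost when the model is rebuilt. As a minimal sketch of that merge step (the toy dictionaries below are hypothetical, not taken from the original code), the same result can be obtained with collections.Counter:

from collections import Counter

# Hypothetical term counts standing in for the extractor's query_count output.
topic_term_counts = {'wildlife': 3, 'photography': 3, 'tips': 1}
new_text_term_counts = {'photography': 2, 'camera': 1}

# Summing the two Counters reproduces the if/else merge loop above:
# photography -> 5, wildlife -> 3, tips -> 1, camera -> 1.
new_text_term_counts = Counter(new_text_term_counts) + Counter(topic_term_counts)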
Example 2
    def make_topic_lm(self):
        topic_text = self.topic.content + self.topic.title

        doc_extractor = SingleQueryGeneration(minlen=3,
                                              stopwordfile=self.stopword_file)
        doc_extractor.extract_queries_from_text(topic_text)
        doc_term_counts = doc_extractor.query_count
        lm = LanguageModel(term_dict=doc_term_counts)
        self.topicLM = SmoothedLanguageModel(lm, self.backgroundLM, 100)
        print "making topic", self.topicLM.docLM.total_occurrences
Example 3
    def make_topic_language_model(self):
        """
        
        """
        topic_text = self._topic.content + self._topic.title

        document_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        document_extractor.extract_queries_from_text(topic_text)
        document_term_counts = document_extractor.query_count

        language_model = LanguageModel(term_dict=document_term_counts)
        self.topic_language_model = SmoothedLanguageModel(
            language_model, self.background_language_model, 100)
        print "making topic", self.topic_language_model.docLM.total_occurrences
Example 4
    def update_model(self, search_context):
        if not self.updating:
            return False

        snippet_text = self._get_snip_text(search_context)
        snippet_text = self._check_terms(snippet_text)

        if snippet_text:
            topic_text = search_context.topic.get_topic_text()
            all_text = '{0} {1}'.format(topic_text, snippet_text)

            #snippet_term_counts = lm_methods.extract_term_dict_from_text(snippet_text, self._stopword_file)
            #topic_term_counts = lm_methods.extract_term_dict_from_text(topic_text, self._stopword_file)
            #title_language_model = LanguageModel(term_dict=topic_term_counts)
            #snippet_language_model = LanguageModel(term_dict=snippet_term_counts)
            #topic_language_model = BayesLanguageModel(title_language_model, snippet_language_model, beta=10)

            term_counts = lm_methods.extract_term_dict_from_text(
                all_text, self._stopword_file)
            language_model = LanguageModel(term_dict=term_counts)

            self.topic_lang_model = language_model
            if self.background_language_model:
                smoothed_topic_language_model = SmoothedLanguageModel(
                    language_model, self.background_language_model)
                self.topic_lang_model = smoothed_topic_language_model

            return True
        else:
            return False
Example 5
class IFindTextClassifier(BaseTextClassifier):
    """
    
    """
    def __init__(self, topic, stopword_file=[], background_file=[]):
        """
        
        """
        super(IFindTextClassifier, self).__init__(topic, stopword_file,
                                                  background_file)
        self.make_topic_language_model()

    def make_topic_language_model(self):
        """
        
        """
        topic_text = self._topic.content + self._topic.title

        document_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        document_extractor.extract_queries_from_text(topic_text)
        document_term_counts = document_extractor.query_count

        language_model = LanguageModel(term_dict=document_term_counts)
        self.topic_language_model = SmoothedLanguageModel(
            language_model, self.background_language_model, 100)
        print "making topic", self.topic_language_model.docLM.total_occurrences

    def is_relevant(self, document):
        """
        
        """
        score = 0.0
        count = 0.0

        for term in document.title.split(' '):
            score = score + self.__get_term_score(term)
            count = count + 1.0

        for term in document.content.split(' '):
            score = score + self.__get_term_score(term)
            count = count + 1.0

        if (score / count) > self.threshold:
            return True

        return False

    def __get_term_score(self, term):
        """
        Returns a probability score for the given term when considering both the background and topic language models.
        """
        topic_term_prob = self.topic_language_model.get_term_prob(term)
        background_term_prob = self.background_language_model.get_term_prob(
            term)

        if background_term_prob == 0.0:
            return 0.0
        else:
            return math.log(topic_term_prob / background_term_prob, 2)
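The private __get_term_score method scores a term as the base-2 log ratio of its topic-model probability to its background probability, so terms that are more likely under the topic than in general text score positively, and is_relevant averages these scores over the document. A small self-contained sketch of that scoring rule, using made-up probabilities in place of the real language models (the topic model is smoothed, so in the classifier its probabilities are never zero):

import math

# Hypothetical per-term probabilities; in the classifier these come from
# topic_language_model.get_term_prob and background_language_model.get_term_prob.
topic_probs = {'wildlife': 0.04, 'photography': 0.05, 'the': 0.02}
background_probs = {'wildlife': 0.001, 'photography': 0.002, 'the': 0.05}

def term_score(term):
    p_topic = topic_probs.get(term, 0.0)
    p_background = background_probs.get(term, 0.0)
    if p_background == 0.0:
        return 0.0  # mirrors the guard in __get_term_score
    return math.log(p_topic / p_background, 2)

document_terms = ['wildlife', 'photography', 'the']
mean_score = sum(term_score(t) for t in document_terms) / len(document_terms)
# 'wildlife' and 'photography' score positively (more topical than general English),
# 'the' scores negatively; the document is relevant if mean_score > threshold.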
Example 6
    def __update_topic_language_model(self, text_list):

        topic_text = '{title} {title} {title} {content}'.format(**self._topic.__dict__)

        n = len(text_list)
        snippet_text = ' '.join(text_list)

        term_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
        term_extractor.extract_queries_from_text(topic_text)
        topic_term_counts = term_extractor.query_count

        term_extractor.extract_queries_from_text(snippet_text)
        new_text_term_counts = term_extractor.query_count

        for term in topic_term_counts:
            if term in new_text_term_counts:
                new_text_term_counts[term] += topic_term_counts[term]
            else:
                new_text_term_counts[term] = topic_term_counts[term]

        new_language_model = LanguageModel(term_dict=new_text_term_counts)

        self.topic_language_model = SmoothedLanguageModel(new_language_model, self.background_language_model, self.mu)

        log.debug("Updating topic {0}".format(self._topic.id))
Example 7
    def make_topic_language_model(self):
        """
        
        """
        topic_text = '{title} {title} {title} {content}'.format(
            **self._topic.__dict__)

        document_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        document_extractor.extract_queries_from_text(topic_text)
        document_term_counts = document_extractor.query_count

        language_model = LanguageModel(term_dict=document_term_counts)

        self.topic_language_model = SmoothedLanguageModel(
            language_model, self.background_language_model, self.mu)
        log.debug("Making topic {0}".format(
            self.topic_language_model.docLM.total_occurrences))
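The format string '{title} {title} {title} {content}' repeats the topic title so that title terms carry three times the weight of body terms when the counts are extracted. A short sketch of the effect, using a hypothetical stand-in Topic class (only the attributes read by the format call):

class Topic(object):
    # Hypothetical stand-in; the real simulator topic object also has other fields.
    def __init__(self, title, content):
        self.title = title
        self.content = content

topic = Topic(title='wildlife photography', content='photography techniques for wildlife')
topic_text = '{title} {title} {title} {content}'.format(**topic.__dict__)
# 'wildlife' and 'photography' each appear four times in topic_text (three from the
# repeated title plus one from the content), so title terms dominate the extracted
# counts relative to body-only terms such as 'techniques'.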
Example 8
    def make_topic_lm(self):
        topic_text = self.topic.content + self.topic.title

        doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file)
        doc_extractor.extract_queries_from_text(topic_text)
        doc_term_counts = doc_extractor.query_count
        lm = LanguageModel(term_dict=doc_term_counts)
        self.topicLM = SmoothedLanguageModel(lm, self.backgroundLM, 100)
        print "making topic", self.topicLM.docLM.total_occurrences
Example 9
class IFindTextClassifier(BaseTextClassifier):
    """
    
    """
    def __init__(self, topic, stopword_file=[], background_file=[]):
        """
        
        """
        super(IFindTextClassifier, self).__init__(topic, stopword_file, background_file)
        self.make_topic_language_model()
    
    def make_topic_language_model(self):
        """
        
        """
        topic_text = self._topic.content + self._topic.title
        
        document_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
        document_extractor.extract_queries_from_text(topic_text)
        document_term_counts = document_extractor.query_count
        
        language_model = LanguageModel(term_dict=document_term_counts)
        self.topic_language_model = SmoothedLanguageModel(language_model, self.background_language_model, 100)
        print "making topic", self.topic_language_model.docLM.total_occurrences
    
    def is_relevant(self, document):
        """
        
        """
        score = 0.0
        count = 0.0
        
        for term in document.title.split(' '):
            score = score + self.__get_term_score(term)
            count = count + 1.0
        
        for term in document.content.split(' '):
            score = score + self.__get_term_score(term)
            count = count + 1.0
        
        if (score / count) > self.threshold:
            return True
        
        return False
    
    def __get_term_score(self, term):
        """
        Returns a probability score for the given term when considering both the background and topic language models.
        """
        topic_term_prob = self.topic_language_model.get_term_prob(term)
        background_term_prob = self.background_language_model.get_term_prob(term)
        
        if background_term_prob == 0.0:
            return 0.0
        else:
            return math.log(topic_term_prob/background_term_prob, 2)
Example 10
class iFindTextClassifier(TextClassifier):

    def __init__(self, stopword_file=[], background_file=[]):
        TextClassifier.__init__(self, stopword_file, background_file)
        self.topicLM = None
        self.threshold = -0.20


    def set_topic(self, topic):
        self.topic = topic
        self.make_topic_lm()


    def make_topic_lm(self):
        topic_text = self.topic.content + self.topic.title

        doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file)
        doc_extractor.extract_queries_from_text(topic_text)
        doc_term_counts = doc_extractor.query_count
        lm = LanguageModel(term_dict=doc_term_counts)
        self.topicLM = SmoothedLanguageModel(lm, self.backgroundLM, 100)
        print "making topic", self.topicLM.docLM.total_occurrences


    def is_relevant(self, document):
        #print "computing relevance", document.docid

        score = 0.0
        count = 0.0
        for t in document.title.split(' '):
            score = score + self._get_term_score(t)
            count += 1.0

        for t in document.content.split(' '):
            score = score + self._get_term_score(t)
            count += 1.0


        if (score / count) > self.threshold:
            return True
        else:
            return False


    def _get_term_score(self, term):

        ptd = self.topicLM.get_term_prob(term)
        pt = self.backgroundLM.get_term_prob(term)
        if pt == 0.0:
            return 0.0
        else:
            return math.log(ptd / pt, 2)
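A hedged sketch of how this classifier might be driven, assuming the classifier and its framework dependencies are importable; the Topic and Document stand-ins, the sample text, and the file paths are placeholders, and how the stopword and background files are interpreted is defined by the base TextClassifier:

from collections import namedtuple

# Hypothetical stand-ins exposing only the .title and .content attributes the classifier reads.
Topic = namedtuple('Topic', ['title', 'content'])
Document = namedtuple('Document', ['title', 'content'])

classifier = iFindTextClassifier(stopword_file='stopwords.txt',     # placeholder path
                                 background_file='background.txt')  # placeholder path
classifier.set_topic(Topic('wildlife photography',
                           'photography techniques for wildlife'))

document = Document('wildlife photography tips',
                    'how to photograph wildlife in low light')
relevant = classifier.is_relevant(document)  # True if the mean log-ratio score exceeds -0.20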
Example 11
    def make_topic_language_model(self):
        """
        Builds the topic language model from the topic's content and title, smoothed against the background language model.
        """
        topic_text = self._topic.content + self._topic.title

        document_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
        document_extractor.extract_queries_from_text(topic_text)
        document_term_counts = document_extractor.query_count

        language_model = LanguageModel(term_dict=document_term_counts)
        self.topic_language_model = SmoothedLanguageModel(language_model, self.background_language_model, 100)
        print "making topic", self.topic_language_model.docLM.total_occurrences
Example 12
    def make_topic_language_model(self):
        """
        
        """
        topic_text = '{title} {title} {title} {content}'.format(**self._topic.__dict__)

        document_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
        document_extractor.extract_queries_from_text(topic_text)
        document_term_counts = document_extractor.query_count
        
        language_model = LanguageModel(term_dict=document_term_counts)

        self.topic_language_model = SmoothedLanguageModel(language_model, self.background_language_model, self.mu)
        log.debug("Making topic {0}".format(self.topic_language_model.docLM.total_occurrences))
Example 13
    def _generate_topic_language_model(self, search_context):
        """
        creates an empirical language model based on the search topic, or a smoothed language model if a background model has been loaded.
        """
        topic_text = self._make_topic_text(search_context)
        topic_term_counts = lm_methods.extract_term_dict_from_text(
            topic_text, self._stopword_file)

        topic_language_model = LanguageModel(term_dict=topic_term_counts)
        if self.background_language_model:
            smoothed_topic_language_model = SmoothedLanguageModel(
                topic_language_model, self.background_language_model)
            return smoothed_topic_language_model
        else:
            return topic_language_model
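SmoothedLanguageModel blends the empirical topic counts with the background model. The usual formulation for this kind of smoothing is the Dirichlet-smoothed estimate shown below; this is an illustrative standalone function, not the library's actual implementation, and mu corresponds to the smoothing parameter (e.g. 100) passed in the other examples:

def dirichlet_smoothed_prob(term_count, total_count, background_prob, mu=100.0):
    # Blends the empirical estimate term_count/total_count with the background
    # probability, with mu controlling how much weight the background receives.
    return (term_count + mu * background_prob) / (total_count + mu)

# A term seen 3 times in a 50-term topic text, with background probability 0.001:
p = dirichlet_smoothed_prob(3, 50, 0.001, mu=100.0)
# (3 + 100 * 0.001) / (50 + 100) = 3.1 / 150, roughly 0.0207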
Example 14
class iFindTextClassifier(TextClassifier):
    def __init__(self, stopword_file=[], background_file=[]):
        TextClassifier.__init__(self, stopword_file, background_file)
        self.topicLM = None
        self.threshold = -0.20

    def set_topic(self, topic):
        self.topic = topic
        self.make_topic_lm()

    def make_topic_lm(self):
        topic_text = self.topic.content + self.topic.title

        doc_extractor = SingleQueryGeneration(minlen=3,
                                              stopwordfile=self.stopword_file)
        doc_extractor.extract_queries_from_text(topic_text)
        doc_term_counts = doc_extractor.query_count
        lm = LanguageModel(term_dict=doc_term_counts)
        self.topicLM = SmoothedLanguageModel(lm, self.backgroundLM, 100)
        print "making topic", self.topicLM.docLM.total_occurrences

    def is_relevant(self, document):
        #print "computing relevance", document.docid

        score = 0.0
        count = 0.0
        for t in document.title.split(' '):
            score = score + self._get_term_score(t)
            count += 1.0

        for t in document.content.split(' '):
            score = score + self._get_term_score(t)
            count += 1.0

        if (score / count) > self.threshold:
            return True
        else:
            return False

    def _get_term_score(self, term):

        ptd = self.topicLM.get_term_prob(term)
        pt = self.backgroundLM.get_term_prob(term)
        if pt == 0.0:
            return 0.0
        else:
            return math.log(ptd / pt, 2)
Example 15
class IFindTextClassifier(BaseTextClassifier):
    """
    
    """
    def __init__(self,
                 topic,
                 search_context,
                 stopword_file=[],
                 background_file=[]):
        """
        
        """
        super(IFindTextClassifier,
              self).__init__(topic, search_context, stopword_file,
                             background_file)
        self.threshold = 0.0
        self.mu = 100.0
        self.make_topic_language_model()

    def make_topic_language_model(self):
        """
        
        """
        topic_text = '{title} {title} {title} {content}'.format(
            **self._topic.__dict__)

        document_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        document_extractor.extract_queries_from_text(topic_text)
        document_term_counts = document_extractor.query_count

        language_model = LanguageModel(term_dict=document_term_counts)

        self.topic_language_model = SmoothedLanguageModel(
            language_model, self.background_language_model, self.mu)
        log.debug("Making topic {0}".format(
            self.topic_language_model.docLM.total_occurrences))

    def is_relevant(self, document):
        """
        
        """
        score = 0.0
        count = 0.0

        for term in document.title.split(' '):
            score = score + self.__get_term_score(term)
            count = count + 1.0

        for term in document.content.split(' '):
            score = score + self.__get_term_score(term)
            count = count + 1.0

        self.doc_score = (score / count)
        if self.doc_score > self.threshold:
            return True

        return False

    def __get_term_score(self, term):
        """
        Returns a probability score for the given term when considering both the background and topic language models.
        """
        topic_term_prob = self.topic_language_model.get_term_prob(term)
        background_term_prob = self.background_language_model.get_term_prob(
            term)

        if background_term_prob == 0.0:
            return 0.0
        else:
            return math.log(topic_term_prob / background_term_prob, 2.0)

    def update_model(self, search_context):

        if self.updating:
            ## Once we develop more update methods, it is probably worth making this a strategy
            ## so that setting the update_method changes the list of documents to use.
            if self.update_method == 1:
                document_list = search_context.get_all_examined_documents()
            else:
                document_list = search_context.get_all_examined_snippets()

            # iterate through document_list, pull out relevant snippets / text
            rel_text_list = []
            for doc in document_list:
                if doc.judgment > 0:
                    rel_text_list.append('{0} {1}'.format(
                        doc.title, doc.content))
            if rel_text_list:
                self.__update_topic_language_model(rel_text_list)
                return True
            else:
                return False

    def __update_topic_language_model(self, text_list):

        topic_text = '{title} {title} {title} {content}'.format(
            **self._topic.__dict__)

        n = len(text_list)
        snippet_text = ' '.join(text_list)

        term_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        term_extractor.extract_queries_from_text(topic_text)
        topic_term_counts = term_extractor.query_count

        term_extractor.extract_queries_from_text(snippet_text)
        new_text_term_counts = term_extractor.query_count

        for term in topic_term_counts:
            if term in new_text_term_counts:
                new_text_term_counts[term] += topic_term_counts[term]
            else:
                new_text_term_counts[term] = topic_term_counts[term]

        new_language_model = LanguageModel(term_dict=new_text_term_counts)

        self.topic_language_model = SmoothedLanguageModel(
            new_language_model, self.background_language_model, self.mu)

        log.debug("Updating topic {0}".format(self._topic.id))
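update_model only rebuilds the topic model when at least one examined item carries a positive judgment; the relevant titles and contents are concatenated and folded back into the title-weighted topic counts. A hedged sketch of the gathering step with stand-in snippet objects (the Snippet namedtuple and the sample data are hypothetical; in the simulator the items come from search_context.get_all_examined_snippets() or get_all_examined_documents()):

from collections import namedtuple

# Stand-in for examined snippets; only the fields read by update_model.
Snippet = namedtuple('Snippet', ['title', 'content', 'judgment'])

examined = [
    Snippet('wildlife photography tips', 'using long lenses for wildlife', 1),
    Snippet('camera store opening hours', 'unrelated retail information', 0),
]

# Mirrors the filtering loop in update_model: keep only positively judged items.
rel_text_list = ['{0} {1}'.format(s.title, s.content)
                 for s in examined if s.judgment > 0]

if rel_text_list:
    snippet_text = ' '.join(rel_text_list)
    # __update_topic_language_model would now merge these counts with the
    # title-weighted topic counts and rebuild the smoothed model.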
Example 16
class IFindTextClassifier(BaseTextClassifier):
    """
    
    """
    def __init__(self, topic, stopword_file=[], background_file=[]):
        """
        
        """
        super(IFindTextClassifier, self).__init__(topic, stopword_file, background_file)
        self.threshold = 0.0
        self.mu = 100.0
        self.make_topic_language_model()

    def make_topic_language_model(self):
        """
        
        """
        topic_text = '{title} {title} {title} {content}'.format(**self._topic.__dict__)

        document_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
        document_extractor.extract_queries_from_text(topic_text)
        document_term_counts = document_extractor.query_count
        
        language_model = LanguageModel(term_dict=document_term_counts)

        self.topic_language_model = SmoothedLanguageModel(language_model, self.background_language_model, self.mu)
        log.debug("Making topic {0}".format(self.topic_language_model.docLM.total_occurrences))
    
    def is_relevant(self, document):
        """
        
        """
        score = 0.0
        count = 0.0
        
        for term in document.title.split(' '):
            score = score + self.__get_term_score(term)
            count = count + 1.0
        
        for term in document.content.split(' '):
            score = score + self.__get_term_score(term)
            count = count + 1.0

        self.doc_score = (score/count)
        if self.doc_score > self.threshold:
            return True
        
        return False
    
    def __get_term_score(self, term):
        """
        Returns a probability score for the given term when considering both the background and topic language models.
        """
        topic_term_prob = self.topic_language_model.get_term_prob(term)
        background_term_prob = self.background_language_model.get_term_prob(term)
        
        if background_term_prob == 0.0:
            return 0.0
        else:
            return math.log(topic_term_prob/background_term_prob, 2.0)


    def update_model(self, search_context):

        if self.updating:
            ## Once we develop more update methods, it is probably worth making this a strategy
            ## so that setting the update_method changes the list of documents to use.
            if self.update_method == 1:
                document_list = search_context.get_all_examined_documents()
            else:
                document_list = search_context.get_all_examined_snippets()

            # iterate through document_list, pull out relevant snippets / text
            rel_text_list = []
            for doc in document_list:
                if doc.judgment > 0:
                    rel_text_list.append('{0} {1}'.format(doc.title, doc.content))
            if rel_text_list:
                self.__update_topic_language_model(rel_text_list)
                return True
            else:
                return False

    def __update_topic_language_model(self, text_list):

        topic_text = '{title} {title} {title} {content}'.format(**self._topic.__dict__)

        n = len(text_list)
        snippet_text = ' '.join(text_list)

        term_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
        term_extractor.extract_queries_from_text(topic_text)
        topic_term_counts = term_extractor.query_count

        term_extractor.extract_queries_from_text(snippet_text)
        new_text_term_counts = term_extractor.query_count

        for term in topic_term_counts:
            if term in new_text_term_counts:
                new_text_term_counts[term] += topic_term_counts[term]
            else:
                new_text_term_counts[term] = topic_term_counts[term]

        new_language_model = LanguageModel(term_dict=new_text_term_counts)

        self.topic_language_model = SmoothedLanguageModel(new_language_model, self.background_language_model, self.mu)

        log.debug("Updating topic {0}".format(self._topic.id))