def assertTopicInserted(self, claim_id, expected_terms):
     """Assert that the topic stored for ``claim_id`` holds the expected terms.

     The topic rows and term rows are read back from ``self._db`` and grouped
     into a ``topic id -> set of term descriptions`` mapping. The topic id
     assigned to the claim must be present in that mapping, and its term set
     must equal both the space-split ``clean_tweet`` output of the claim's
     description and ``set(expected_terms)``.

     Args:
         claim_id: key into ``self._claim_dictionary`` and into the
             claim-id/topic-id dictionary of ``self._preprocess_visualization``.
         expected_terms: iterable of the term strings the topic should contain.
     """
     topic_rows = self._db.get_topics()
     description_by_term_id = dict(
         (term.term_id, term.description) for term in self._db.get_terms())

     # Group term descriptions per topic; the third column of a topic row is
     # not needed for a set comparison.
     terms_by_topic = defaultdict(set)
     for row_topic_id, row_term_id, _ in topic_rows:
         terms_by_topic[row_topic_id].add(description_by_term_id[row_term_id])

     claim_topic_map = (
         self._preprocess_visualization.get_claim_id_topic_dictionary())
     claim_topic_id = claim_topic_map[claim_id]
     claim_description = self._claim_dictionary[claim_id].description
     claim_terms = set(clean_tweet(claim_description).split(' '))

     self.assertIn(claim_topic_id, terms_by_topic)
     stored_terms = terms_by_topic[claim_topic_id]
     self.assertSetEqual(claim_terms, stored_terms)
     self.assertSetEqual(set(expected_terms), stored_terms)
    def generate_topics_tables(self, source_id_target_elements_dict,
                               target_fields):
        """Build term rows and topic-item rows, one topic per source element.

        The source elements are fetched from ``self._db`` by id. Each
        element's text field is passed through ``clean_tweet`` (and
        ``clean_content_by_nltk_stopwords`` when ``self._remove_stop_words``
        is set), split on single spaces, and turned into one new topic whose
        items carry each term's relative frequency inside that element.

        Args:
            source_id_target_elements_dict: mapping whose keys are the ids of
                the source elements to load; only the keys are read here.
            target_fields: dict whose ``'source'`` entry provides
                ``'table_name'``, ``'id'``, ``'target_field'`` and
                ``'where_clauses'``.

        Side effects:
            Adds unseen terms to ``self.term_dictionary`` (advancing
            ``self._last_term_index``), increments ``self._num_topics`` per
            element, records ``source id -> topic id`` in
            ``self._source_id_topic_dict``, stores the created rows in
            ``self._term_id_term_texts`` / ``self._topic_id_term_ids`` and
            prints a progress line to stdout.
        """
        # Function-scope import keeps this block self-contained.
        from collections import Counter

        source_fields = target_fields['source']
        table_name = source_fields['table_name']
        id_name = source_fields['id']
        field_name = source_fields['target_field']
        where_clauses = source_fields['where_clauses']

        source_ids = list(source_id_target_elements_dict.keys())
        topic_objects = self._db.get_table_elements_by_ids(
            table_name, id_name, source_ids, where_clauses)
        total = len(topic_objects)
        progress_template = "\r generate topic {0}/{1}"

        term_id_term_texts = []
        topic_id_term_ids = []
        for index, topic_obj in enumerate(topic_objects):
            if index % 100 == 0:
                print(progress_template.format(index, total), end="")
            source_id = getattr(topic_obj, id_name)
            topic_content = clean_tweet(getattr(topic_obj, field_name))
            if self._remove_stop_words:
                topic_content = clean_content_by_nltk_stopwords(topic_content)
            topic_terms = topic_content.split(' ')

            # Give every unseen term the next free term id.
            for term in topic_terms:
                if term not in self.term_dictionary:
                    self.term_dictionary[term] = self._last_term_index
                    self._last_term_index += 1

            # Count whole tokens. str.count() on the full text would also
            # match the term inside longer words ("a" in "banana") and could
            # push the resulting probability above 1.
            term_counts = Counter(topic_terms)
            # float() keeps the division true division under Python 2 as well.
            term_total = float(len(topic_terms))

            self._num_topics += 1
            topic_id = self._num_topics
            # One row per token occurrence, exactly as before: a term repeated
            # in the text produces repeated rows.
            term_id_term_texts.extend(
                self._db.create_term(self.term_dictionary[term], term)
                for term in topic_terms)
            topic_id_term_ids.extend(
                self._db.create_topic_item(
                    topic_id, self.term_dictionary[term],
                    term_counts[term] / term_total)
                for term in topic_terms)
            self._source_id_topic_dict[source_id] = topic_id

        # Final progress line (all elements processed), then end the line.
        print(progress_template.format(total, total), end="")
        print()
        self._term_id_term_texts = term_id_term_texts
        self._topic_id_term_ids = topic_id_term_ids
Example #3
0
 def _generate_terms_and_topic_items(self, claim, topic_items, terms):
     """Create a new topic for ``claim`` and append its term/topic-item rows.

     The claim description is passed through ``clean_tweet`` (and
     ``clean_content_by_nltk_stopwords`` when ``self._remove_stop_words`` is
     set) and split on single spaces. For every resulting token a term row
     and a topic-item row are created through ``self._db``; the topic item
     carries the token's relative frequency within the claim.

     Args:
         claim: object exposing ``claim_id`` and ``description``.
         topic_items: list that receives the created topic-item rows.
         terms: list that receives the created term rows.

     Side effects:
         Calls ``self._update_term_to_term_id_dict`` with the tokens
         (presumably registering ids for unseen terms -- confirm against
         that helper), increments ``self._num_topics`` and records
         ``claim id -> topic id`` in ``self._claim_id_topic_dict``.
     """
     # Function-scope import keeps this block self-contained.
     from collections import Counter

     claim_id = claim.claim_id
     topic_content = clean_tweet(claim.description)
     if self._remove_stop_words:
         topic_content = clean_content_by_nltk_stopwords(topic_content)
     topic_terms = topic_content.split(' ')
     self._update_term_to_term_id_dict(topic_terms)

     # Count whole tokens. str.count() on the full text would also match the
     # term inside longer words ("a" in "banana") and could push the
     # resulting probability above 1.
     term_counts = Counter(topic_terms)
     # float() keeps the division true division under Python 2 as well.
     term_total = float(len(topic_terms))

     self._num_topics += 1
     topic_id = self._num_topics
     # One pair of rows per token occurrence, exactly as before.
     for term in topic_terms:
         term_id = self.term_dictionary[term]
         term_probability = term_counts[term] / term_total
         terms.append(self._db.create_term(term_id, term))
         topic_items.append(
             self._db.create_topic_item(topic_id, term_id, term_probability))
     self._claim_id_topic_dict[claim_id] = topic_id
 def _clean_text(self, text):
     """Normalise ``text`` and blank out a fixed set of punctuation marks.

     The text is lower-cased, passed through ``clean_tweet``, converted with
     ``unicode(...)`` and encoded as UTF-8. Each of the characters
     ``" ( ) - % ? ! * ' $ @ # &`` is then replaced by a single space.

     Args:
         text: the string to clean.

     Returns:
         The cleaned, UTF-8 encoded text.
     """
     cleaned = clean_tweet(text.lower())
     # ``unicode`` is the Python 2 builtin; this code targets that runtime.
     cleaned = unicode(cleaned).encode('utf-8')
     # Same characters, same order as the original chain of replace() calls.
     for symbol in ('"', '(', ')', '-', '%', '?', '!', '*',
                    "'", "$", "@", "#", "&"):
         cleaned = cleaned.replace(symbol, ' ')
     return cleaned
Example #5
0
 def assertTopicInserted(self, claim_id):
     """Assert that the topic stored for ``claim_id`` matches its post text.

     The topic rows and term rows are read back from ``self._db`` and grouped
     into a ``topic id -> set of term descriptions`` mapping. The topic id
     assigned to ``claim_id`` must be present in that mapping and its term
     set must equal the terms expected from the post content.

     The expected terms are built the same way the topic generators visible
     alongside this helper build them: ``clean_tweet`` first, then
     ``clean_content_by_nltk_stopwords`` when stop-word removal is enabled,
     then a split on single spaces. Previously the stop-word branch skipped
     ``clean_tweet`` and cleaned the raw content instead.

     Args:
         claim_id: key into ``self._post_dictionary`` and into the
             source-id/topic-id dictionary of ``self._preprocess_visualization``.
     """
     topics = self._db.get_topics()
     terms = self._db.get_terms()
     term_dict = {term.term_id: term.description for term in terms}

     # Group term descriptions per topic; the probability column is not
     # needed for a set comparison.
     topic_dict = defaultdict(set)
     for topic_id, term_id, _prob in topics:
         topic_dict[topic_id].add(term_dict[term_id])

     topic_id = self._preprocess_visualization.get_source_id_topic_dictionary(
     )[claim_id]
     self.assertIn(topic_id, topic_dict)

     expected_content = clean_tweet(self._post_dictionary[claim_id].content)
     if self._preprocess_visualization._remove_stop_words:
         expected_content = clean_content_by_nltk_stopwords(expected_content)
     expected = set(expected_content.split(' '))
     self.assertSetEqual(expected, topic_dict[topic_id])