def assertTopicInserted(self, claim_id, expected_terms):
    """Assert that the topic stored for ``claim_id`` holds the expected terms.

    The stored topic rows are grouped into a ``topic_id -> set of term
    descriptions`` mapping.  The topic assigned to the claim must be present
    in that mapping, and its term set must equal both the tokens of the
    cleaned claim description and ``expected_terms``.
    """
    stored_topics = self._db.get_topics()
    stored_terms = self._db.get_terms()

    terms_by_topic = defaultdict(set)
    description_by_term_id = dict(
        (stored_term.term_id, stored_term.description)
        for stored_term in stored_terms)
    for row_topic_id, row_term_id, _prob in stored_topics:
        terms_by_topic[row_topic_id].add(description_by_term_id[row_term_id])

    claim_topics = self._preprocess_visualization.get_claim_id_topic_dictionary()
    topic_id = claim_topics[claim_id]
    claim = self._claim_dictionary[claim_id]
    description_tokens = set(clean_tweet(claim.description).split(' '))

    self.assertIn(topic_id, terms_by_topic)
    actual_terms = terms_by_topic[topic_id]
    self.assertSetEqual(description_tokens, actual_terms)
    self.assertSetEqual(set(expected_terms), actual_terms)
def generate_topics_tables(self, source_id_target_elements_dict, target_fields):
    """Create one topic per source element and stage its term/topic rows.

    The source elements are fetched from the table described by
    ``target_fields['source']`` (keys ``table_name``, ``id``,
    ``target_field`` and ``where_clauses``), restricted to the ids that are
    keys of ``source_id_target_elements_dict``.  Each element's text field is
    cleaned, optionally stripped of stop words, and split on single spaces.

    Side effects:
      * unseen terms are registered in ``self.term_dictionary`` using
        ``self._last_term_index``;
      * ``self._num_topics`` is incremented once per element and recorded in
        ``self._source_id_topic_dict`` under the element's id;
      * the created term and topic-item objects are stored on
        ``self._term_id_term_texts`` and ``self._topic_id_term_ids``;
      * progress is printed to stdout every 100 elements.
    """
    source_spec = target_fields['source']
    table_name = source_spec['table_name']
    id_name = source_spec['id']
    field_name = source_spec['target_field']
    where_clauses = source_spec['where_clauses']

    source_ids = list(source_id_target_elements_dict.keys())
    topic_objects = self._db.get_table_elements_by_ids(
        table_name, id_name, source_ids, where_clauses)
    total = len(topic_objects)

    term_id_term_texts = []
    topic_id_term_ids = []
    for index, topic_obj in enumerate(topic_objects):
        if index % 100 == 0:
            msg = "\r generate topic {0}/{1}".format(str(index), str(total))
            print(msg, end="")

        source_id = getattr(topic_obj, id_name)
        topic_content = clean_tweet(getattr(topic_obj, field_name))
        if self._remove_stop_words:
            topic_content = clean_content_by_nltk_stopwords(topic_content)
        topic_terms = topic_content.split(' ')

        # Register unseen terms and tally whole-token occurrences in one pass.
        # Counting tokens (not substrings of the full text) keeps every
        # probability within [0, 1]: "the" must not be counted inside
        # "theory".
        term_counts = {}
        for term in topic_terms:
            if term not in self.term_dictionary:
                self.term_dictionary[term] = self._last_term_index
                self._last_term_index += 1
            term_counts[term] = term_counts.get(term, 0) + 1

        # float() keeps the division true division under Python 2 as well.
        term_count = float(len(topic_terms))
        self._num_topics += 1
        topic_id = self._num_topics

        # One object per token occurrence (duplicates included), terms first
        # and topic items second, exactly as before.
        term_id_term_texts.extend([
            self._db.create_term(self.term_dictionary[term], term)
            for term in topic_terms
        ])
        topic_id_term_ids.extend([
            self._db.create_topic_item(
                topic_id, self.term_dictionary[term],
                term_counts[term] / term_count)
            for term in topic_terms
        ])
        self._source_id_topic_dict[source_id] = topic_id

    # Final progress line reports the full count, then terminates the line.
    msg = "\r generate topic {0}/{1}".format(str(total), str(total))
    print(msg, end="")
    print()

    self._term_id_term_texts = term_id_term_texts
    self._topic_id_term_ids = topic_id_term_ids
def _generate_terms_and_topic_items(self, claim, topic_items, terms):
    """Create a new topic for ``claim`` and append its term/topic-item rows.

    The claim description is cleaned, optionally stripped of stop words, and
    split on single spaces.  For every token occurrence a term object is
    appended to ``terms`` and a topic item (topic id, term id, probability)
    is appended to ``topic_items``; both lists are mutated in place.  The
    probability of a term is the number of tokens equal to it divided by the
    total number of tokens.

    Side effects: ``self._num_topics`` is incremented and stored in
    ``self._claim_id_topic_dict`` under the claim's id.
    """
    claim_id = claim.claim_id
    topic_content = clean_tweet(claim.description)
    if self._remove_stop_words:
        topic_content = clean_content_by_nltk_stopwords(topic_content)
    topic_terms = topic_content.split(' ')

    # Presumably registers any unseen terms in self.term_dictionary, which is
    # read below -- confirm against the helper's definition.
    self._update_term_to_term_id_dict(topic_terms)

    # Tally whole tokens instead of substring matches in the full text, so
    # that "the" is not also counted inside "theory" and every probability
    # stays within [0, 1].
    term_counts = {}
    for term in topic_terms:
        term_counts[term] = term_counts.get(term, 0) + 1

    # float() keeps the division true division under Python 2 as well.
    term_count = float(len(topic_terms))
    self._num_topics += 1
    topic_id = self._num_topics

    for term in topic_terms:
        term_id = self.term_dictionary[term]
        term_probability = term_counts[term] / term_count
        terms.append(self._db.create_term(term_id, term))
        topic_items.append(
            self._db.create_topic_item(topic_id, term_id, term_probability))

    self._claim_id_topic_dict[claim_id] = topic_id
def _clean_text(self, text):
    """Return ``text`` lower-cased, tweet-cleaned and stripped of punctuation.

    After lower-casing and ``clean_tweet``, the text is converted with the
    Python 2 ``unicode`` builtin and encoded as UTF-8.  Each of the
    punctuation characters listed below is then replaced by a single space.
    """
    cleaned = clean_tweet(text.lower())
    cleaned = unicode(cleaned).encode('utf-8')
    # Every character maps to a space, so the replacement order is irrelevant.
    for symbol in ('"', '(', ')', '-', '%', '?', '!', '*',
                   "'", "$", "@", "#", "&"):
        cleaned = cleaned.replace(symbol, ' ')
    return cleaned
def assertTopicInserted(self, claim_id):
    """Assert that the topic stored for ``claim_id`` matches its post content.

    The stored topic rows are grouped into a ``topic_id -> set of term
    descriptions`` mapping.  The topic assigned to ``claim_id`` must be
    present in that mapping, and its term set must equal the tokens of the
    post content (stop words removed when the preprocessor is configured to
    do so).
    """
    stored_topics = self._db.get_topics()
    stored_terms = self._db.get_terms()

    terms_by_topic = defaultdict(set)
    description_by_term_id = dict(
        (stored_term.term_id, stored_term.description)
        for stored_term in stored_terms)
    for row_topic_id, row_term_id, _prob in stored_topics:
        terms_by_topic[row_topic_id].add(description_by_term_id[row_term_id])

    visualization = self._preprocess_visualization
    topic_id = visualization.get_source_id_topic_dictionary()[claim_id]
    self.assertIn(topic_id, terms_by_topic)

    content = self._post_dictionary[claim_id].content
    expected_terms = set(clean_tweet(content).split(' '))
    if visualization._remove_stop_words:
        # NOTE(review): the stop-word variant is applied to the raw content,
        # without clean_tweet first -- confirm this matches the preprocessor.
        expected_terms = set(
            clean_content_by_nltk_stopwords(content).split(' '))
    self.assertSetEqual(expected_terms, terms_by_topic[topic_id])