def format(self, raw_dataset_path, target_dataset_dir, is_save=True):
    """Convert a CoNLL/AIDA-style annotated file into document + mention JSON.

    Args:
        raw_dataset_path: path to the tab-separated raw dataset. Documents are
            delimited by ``-DOCSTART-`` lines; annotated token lines carry
            columns token / flag(B|I) / mention label / YAGO id / wiki URL
            -- assumed format, TODO confirm against the actual dataset.
        target_dataset_dir: directory that receives the generated JSON files.
        is_save: when True, dump annotations, docs, misses and valid entities
            to disk as JSON.

    Returns:
        Tuple ``(total_mentions, NIL_mentions, xlore_misses, nme_mentions)``,
        each a list of mention labels / wiki URLs collected during the pass.
    """
    total_mentions, nme_mentions, NIL_mentions = [], [], []
    mentions = []  # type: List[List[tuple]]
    docs, xlore_misses, valid_entities = [], [], []
    with open(raw_dataset_path, "r", encoding="utf-8") as rf:
        doc_mentions = []  # type: List[List]
        doc = ""
        for line in rf:
            if line.startswith("-DOCSTART-"):
                # A new document begins: flush the previous one (if any).
                doc = doc.strip()
                if len(doc) > 0:
                    docs.append(doc)
                    mentions.append(doc_mentions)
                doc, doc_mentions = "", []
            elif len(line.strip()) == 0:  # blank line -> sentence break
                doc = doc.strip(' ')
                doc += "\n"
            elif line.strip() in string.punctuation:  # punctuation token: attach without a space
                doc = doc.strip(' ')
                doc += line.strip()
            else:
                line_arr = line.strip().split("\t")
                if len(line_arr) > 1:  # annotated token line
                    token, flag, mention_label, yago_id = line_arr[0], line_arr[1], line_arr[2], line_arr[3]
                    if flag == 'B':  # beginning of a mention span
                        total_mentions.append(mention_label)
                        # Offsets are character positions in the rebuilt document text.
                        mention = Mention(len(doc), len(doc) + len(mention_label), mention_label)
                        if yago_id != '--NME--':
                            # Drop the first 23 chars of the URL column — presumably the
                            # "http://en.wikipedia.org" prefix (23 chars) — TODO confirm.
                            wiki_url = line_arr[4][23:]
                            entity = self.entity_manager.entity_dictionary.get_entity_from_uri(wiki_url)  # type: Entity
                            if entity is not None:
                                valid_entities.append(wiki_url)
                                mention.set_gold_entity(entity)
                                doc_mentions.append((mention.start, mention.end, mention.label, mention.gold_entity.ID))
                            else:
                                # Linked in the raw data but missing from the XLore dictionary.
                                NIL_mentions.append(mention_label)
                                doc_mentions.append((mention.start, mention.end, mention.label, 'NIL'))
                                xlore_misses.append(wiki_url)
                        else:
                            # Explicitly unlinkable mention (--NME--).
                            NIL_mentions.append(mention_label)
                            nme_mentions.append(mention.label)
                            doc_mentions.append((mention.start, mention.end, mention.label, "NIL"))
                    if flag != 'I':
                        # 'I' continuation tokens are skipped: the full mention label
                        # was already appended at the 'B' token.
                        doc += mention_label + ' '
                else:
                    # Unannotated token: append as-is.
                    doc += line_arr[0] + " "
    if len(doc_mentions) > 0:
        # Flush the final document (no trailing -DOCSTART- marker follows it).
        mentions.append(doc_mentions)
        docs.append(doc)
    if is_save:
        json.dump(mentions, open(os.path.join(target_dataset_dir, "annotations.json"), "w", encoding="utf-8"), indent=4, ensure_ascii=False)
        json.dump(docs, open(os.path.join(target_dataset_dir, "docs.json"), "w", encoding="utf-8"), indent=4, ensure_ascii=False)
        json.dump(xlore_misses,
                  open(os.path.join(target_dataset_dir, "xlore_misses.json"), "w", encoding="utf-8"), indent=4, ensure_ascii=False)
        json.dump(valid_entities, open(os.path.join(target_dataset_dir, "valid_entities.json"), "w", encoding="utf-8"), indent=4, ensure_ascii=False)
    self.report_result(docs, total_mentions, NIL_mentions, xlore_misses, nme_mentions)
    return total_mentions, NIL_mentions, xlore_misses, nme_mentions
def process_mentions(message):
    """Extract @-mentions from *message* and persist one Mention row per match.

    Args:
        message: object exposing a ``.message`` text attribute; it is also
            attached to each created Mention as its source message.
    """
    # Raw string: the original pattern spelled '\(' / '\)' inside a plain
    # string literal, which is an invalid escape sequence (DeprecationWarning,
    # a SyntaxWarning/error on newer Python). The matched pattern is unchanged.
    mentions = re.findall(
        r'@(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        message.message)
    for mention in mentions:
        new_mention = Mention(name=mention, message=message)
        new_mention.save()
def test_get_kweek_mentions():
    """Mentions of a kweek round-trip as Mention objects; a mention-less kweek yields []."""
    # Kweek that carries one mention.
    sql = " SELECT ID FROM KWEEK WHERE USERNAME = '******' LIMIT 1"
    kid = db_manager.execute_query(sql)[0]['id']
    got = actions.get_kweek_mentions(kid)[0]
    want = Mention({'username': '******', 'indices': [25, 35]})
    assert isinstance(got, Mention)
    assert got.to_json() == want.to_json()
    # Kweek without mentions.
    sql = "SELECT ID FROM KWEEK WHERE USERNAME = '******' LIMIT 1"
    kid = db_manager.execute_query(sql)[0]['id']
    assert actions.get_kweek_mentions(kid) == []
def get_twitter_mentions():
    """Search Twitter for each configured query and store unseen mentions.

    Returns:
        int: number of Mention rows newly added during this run.
    """
    statuses = []
    for query in QUERIES:
        # BUG FIX: the original ignored the loop variable and searched the
        # literal 'jeffknupp.com' on every iteration, making QUERIES dead.
        response = client.api.search.tweets.get(q=query, count=100)
        statuses += response.data.statuses
    session = Session()
    twitter = session.query(Source).get(1)
    new_mentions = 0
    for status in statuses:
        # Deduplicate on the tweet's string id.
        if not session.query(Mention).filter(
                Mention.domain_id == status.id_str).count():
            created_at = datetime.datetime.strptime(
                status.created_at, r"%a %b %d %H:%M:%S +0000 %Y")
            m = Mention(text=status.text,
                        associated_user='******'.format(
                            status.user.screen_name,
                            status.user.followers_count),
                        recorded_at=datetime.datetime.now(),
                        occurred_at=created_at,
                        source=twitter,
                        domain_id=status.id_str)
            new_mentions += 1
            session.add(m)
    session.commit()
    return new_mentions
def handle_mention(request_params, mention):
    """Build a Mention from the raw dict, run the matching action, and respond.

    Args:
        request_params: parameters forwarded to the response builder.
        mention: raw dict with 'action_text', 'message_id', 'mentioned_by'
            and 'full_message' keys.

    Returns:
        Tuple (mention, action, response); action and response are None when
        no actions are configured.
    """
    mention = Mention(mention['action_text'], mention['message_id'],
                      mention['mentioned_by'], mention['full_message'])
    # BUG FIX: the original left ``action``/``response`` unbound when
    # get_actions() returned None, raising UnboundLocalError at the return.
    action, response = None, None
    actions = get_actions()
    if actions is not None:  # identity check, not != None
        action = perform_action(actions, mention)
        response = mention_response(request_params, action.action_result,
                                    mention.message_id)
    return (mention, action, response)
def create_entity_mentions(mentions):
    """Convert API entity-mention records into local Mention objects."""
    return [
        Mention(
            content=record.text.content,
            begin_offset=record.text.begin_offset,
            type=record.type,
            sentiment_score=record.sentiment.score,
            sentiment_magnitude=record.sentiment.magnitude,
        )
        for record in mentions
    ]
def create_tweet():
    """Persist a new tweet for the current user and record any @-mentions.

    Reads 'content' from the POSTed form, stores the Tweet, then creates a
    Mention row for every valid @username found in the content.
    """
    content = request.form['content']
    tweet = Tweet(content, g.user.id)
    db.session.add(tweet)
    db.session.commit()
    pattern = re.compile(r'@(\w+)')  # raw string for the regex escape
    for name in pattern.findall(content):
        user = User.query.filter_by(username=name).first()
        # BUG FIX: the original dereferenced user.id before checking that the
        # username lookup succeeded, crashing on mentions of unknown users.
        if user is None or user is g.user:
            continue
        if Mention.validate(user.id, tweet.id):
            mention = Mention(user.id, tweet.id)
            db.session.add(mention)
            db.session.commit()
    return redirect(url_for('timeline'))
def prioritize(words):
    """Rank quotes by how many of *words* they mention.

    Args:
        words: iterable of words to look up in the dictionary.

    Returns:
        List of (quote_key, count) pairs sorted by ascending count.
    """
    counters = {}
    for word in words:
        dict_match = DictionaryWord.query(
            DictionaryWord.word == word).fetch()
        if not dict_match:
            continue
        mentions = Mention.query(Mention.word == dict_match[0].key).fetch()
        for mention in mentions:
            # Plain increment: the original built a one-item dict and merged
            # it via counters.update(), which is equivalent but clunky.
            counters[mention.quote] = counters.get(mention.quote, 0) + 1
    # NOTE(review): ascending order puts the *least*-mentioned quote first —
    # confirm that is the intended meaning of "prioritize".
    # (The original also called .copy() on sorted()'s result, which already
    # returns a fresh list.)
    ordered = sorted(counters.items(), key=lambda x: x[1])
    logging.debug(" ====================================== ")
    logging.debug(ordered)
    logging.debug(" ====================================== ")
    return ordered
def create_mentions(self, asin, comment_attrs):
    """Link a comment to a product, creating the Mention row if absent."""
    logging.info(comment_attrs)
    logging.info(asin)
    subsite = self.find_or_create_subsite(comment_attrs['subsite_name'])
    logging.info("Subsite: " + str(subsite))
    comment_attrs['subsite_id'] = subsite.id
    comment = self.find_or_create_comment(comment_attrs)
    product = self.find_or_create_product(asin)
    logging.info(comment)
    logging.info(product)
    # Guard clause: nothing to link unless both ends exist.
    if not (product and comment):
        return
    existing = session.query(Mention).filter_by(
        product_id=product.id, comment_id=comment.id).first()
    if existing:
        return
    link = Mention(product_id=product.id, comment_id=comment.id)
    session.add(link)
    session.commit()
def parser(word):
    """Map an individual word to the quotes that mention it.

    Args:
        word: the word to look up.

    Returns:
        List of dicts with 'line', 'context' and 'movie' keys; empty list
        when the word is not in the dictionary.
    """
    dict_match = DictionaryWord.query(
        DictionaryWord.word == word).fetch()
    # Early return flattens the original if/else (which also misspelled the
    # accumulator as ``yeild_text``).
    if not dict_match:
        return []
    mentions = Mention.query(Mention.word == dict_match[0].key).fetch()
    quotes = []
    for mention in mentions:
        quote = mention.quote.get()
        quotes.append({
            'line': quote.line,
            'context': quote.context,
            'movie': quote.movie,
        })
    return quotes
def get_kweek_mentions(kweek_id):
    """
    Gets the mentions in a given kweek.

    *Parameters:*
        - *kweek_id (int)*: The id of the kweek.

    *Returns:*
        - *List of models.Mention objects*
    """
    rows = query_factory.get_kweek_mentions(kweek_id)
    return [
        Mention({
            'username': row['username'],
            'indices': [row['starting_index'], row['ending_index']],
        })
        for row in rows
    ]
def parse_text(self, text: str) -> List[Mention]:
    """Parse *text* into Mention objects, each carrying its Candidate list."""
    # The underlying parser runs on the JVM; attach this thread if needed.
    if not isThreadAttachedToJVM():
        attachThreadToJVM()
    raw = self.parser.parseText(text)
    items = self.solve_conflict(self.format_output(raw, text))
    results = []  # type: List[Mention]
    for item in items:
        # item layout: (start, end, label, candidate_ids)
        mention = Mention(int(item[0]), int(item[1]), item[2])
        mention.candidates = []
        for cand_id in item[3]:
            mention.add_candidate(Candidate(cand_id))
        mention.parse_from = self.param_config.name
        results.append(mention)
    return results
query: str = """SELECT COUNT(*) FROM HASHTAG """ third_count = db_manager.execute_query(query)[0]['count'] assert third_count - second_count == 0 check, message = actions.insert_kweek(kweek_test_3) assert message == 'Repeated mention in the same kweek' check, message = actions.insert_kweek(kweek_test_4) assert message == 'the user mentioned does not exist in the database' @pytest.mark.parametrize("text, expected_hashtags, expected_mentions", [ ('#hashtag and @mention', [Hashtag({'indices': (0, 8), 'text': '#hashtag', 'id': 0})], [Mention({'indices': (13, 21), 'username': '******'})]), ('#hashtag and @mention ', [Hashtag({'indices': (0, 8), 'text': '#hashtag', 'id': 0})], [Mention({'indices': (13, 21), 'username': '******'})]), ('@mention and #hashtag', [Hashtag({'indices': (13, 21), 'text': '#hashtag', 'id': 0})], [Mention({'indices': (0, 8), 'username': '******'})]), ('@mention and #hashtag ', [Hashtag({'indices': (13, 21), 'text': '#hashtag', 'id': 0})], [Mention({'indices': (0, 8), 'username': '******'})]), ('@mention and # ', [Hashtag({'indices': (13, 14), 'text': '#', 'id': 0})], [Mention({'indices': (0, 8), 'username': '******'})]), ('@mention and #', [], [Mention({'indices': (0, 8), 'username': '******'})]),
def test_insert_kweek():
    """End-to-end test of actions.insert_kweek: hashtag/mention persistence,
    hashtag deduplication, and the error paths for repeated / unknown
    mentioned users.
    """
    # kweek_test_1: valid kweek with two mentions and one hashtag.
    kweek_test_1 = Kweek({
        'id': 0,
        'created_at': datetime.utcnow(),
        'text': '#test1',
        'media_url': None,
        'user': User({
            'username': '******',
            'screen_name': 'test1',
            'profile_image_url': 'image_url',
            'following': False,
            'follows_you': False,
            'muted': False,
            'blocked': False
        }),
        'mentions': [
            Mention({
                'username': '******',
                'indices': [10, 16]}),
            Mention({
                'username': '******',
                'indices': [18, 20]},
            )
        ],
        'hashtags': [
            Hashtag({
                'text': '#sky',
                'indices': [10, 16],
                'id': 0
            })
        ],
        'number_of_likes': 0,
        'number_of_rekweeks': 0,
        'number_of_replies': 0,
        'reply_to': None,
        'rekweek_info': None,
        'liked_by_user': False,
        'rekweeked_by_user': False
    })
    # kweek_test_2: same shape, different text — reuses the '#sky' hashtag
    # (hashtag count must NOT grow when it is inserted).
    kweek_test_2 = Kweek({
        'id': 0,
        'created_at': datetime.utcnow(),
        'text': '#test2',
        'media_url': None,
        'user': User({
            'username': '******',
            'screen_name': 'test1',
            'profile_image_url': 'image_url',
            'following': False,
            'follows_you': False,
            'muted': False,
            'blocked': False
        }),
        'mentions': [
            Mention({
                'username': '******',
                'indices': [10, 16]}),
            Mention({
                'username': '******',
                'indices': [18, 20]})
        ],
        'hashtags': [
            Hashtag({
                'text': '#sky',
                'indices': [10, 16],
                'id': 0
            })
        ],
        'number_of_likes': 0,
        'number_of_rekweeks': 0,
        'number_of_replies': 0,
        'reply_to': None,
        'rekweek_info': None,
        'liked_by_user': False,
        'rekweeked_by_user': False
    })
    # kweek_test_3: expected to fail with a repeated-mention error.
    kweek_test_3 = Kweek({
        'id': 0,
        'created_at': datetime.utcnow(),
        'text': '#test3',
        'media_url': None,
        'user': User({
            'username': '******',
            'screen_name': 'test1',
            'profile_image_url': 'image_url',
            'following': False,
            'follows_you': False,
            'muted': False,
            'blocked': False
        }),
        'mentions': [
            Mention({
                'username': '******',
                'indices': [10, 16]}),
            Mention({
                'username': '******',
                'indices': [18, 20]},
            )
        ],
        'hashtags': [
            Hashtag({
                'text': '#sky',
                'indices': [10, 16],
                'id': 0
            })
        ],
        'number_of_likes': 0,
        'number_of_rekweeks': 0,
        'number_of_replies': 0,
        'reply_to': None,
        'rekweek_info': None,
        'liked_by_user': False,
        'rekweeked_by_user': False
    })
    # kweek_test_4: expected to fail because the mentioned user doesn't exist.
    kweek_test_4 = Kweek({
        'id': 0,
        'created_at': datetime.utcnow(),
        'text': '#test1',
        'media_url': None,
        'user': User({
            'username': '******',
            'screen_name': 'test1',
            'profile_image_url': 'image_url',
            'following': False,
            'follows_you': False,
            'muted': False,
            'blocked': False
        }),
        'mentions': [
            Mention({
                'username': '******',
                'indices': [10, 16]}),
            Mention({
                'username': '******',
                'indices': [18, 20]},
            )
        ],
        'hashtags': [
            Hashtag({
                'text': '#sky',
                'indices': [10, 16],
                'id': 0
            })
        ],
        'number_of_likes': 0,
        'number_of_rekweeks': 0,
        'number_of_replies': 0,
        'reply_to': None,
        'rekweek_info': None,
        'liked_by_user': False,
        'rekweeked_by_user': False
    })
    # Clean up any pre-existing '#sky' hashtag so the count deltas below are
    # deterministic.
    query: str = """SELECT ID FROM HASHTAG WHERE TEXT=%s """
    data = ('#sky',)
    hid = db_manager.execute_query(query, data)
    if len(hid) != 0:
        query: str = """DELETE FROM HASHTAG WHERE ID=%s """
        data = (hid[0]['id'],)
        db_manager.execute_query_no_return(query, data)
        query: str = """DELETE FROM KWEEK_HASHTAG WHERE HASHTAG_ID=%s """
        data = (hid[0]['id'],)
        db_manager.execute_query_no_return(query, data)
    query: str = """SELECT COUNT(*) FROM HASHTAG """
    first_count = db_manager.execute_query(query)[0]['count']
    actions.insert_kweek(kweek_test_1)
    # Fetch the freshly inserted kweek and hashtag (latest ids).
    query: str = """SELECT ID FROM KWEEK ORDER BY ID DESC LIMIT 1 """
    kid = db_manager.execute_query(query)[0]['id']
    print("kweek id", kid)
    query: str = """SELECT ID FROM HASHTAG ORDER BY ID DESC LIMIT 1 """
    hid = db_manager.execute_query(query)[0]['id']
    print("hahstag id ", hid)
    query: str = """SELECT ID,TEXT,media_url,username,reply_to FROM KWEEK WHERE ID= %s """
    data = (kid,)
    resulted_kweek = db_manager.execute_query(query, data)[0]
    print("kweek", resulted_kweek)
    query: str = """SELECT * FROM MENTION WHERE KWEEK_ID= %s"""
    data = (kid,)
    resulted_mention = db_manager.execute_query(query, data)[0]
    query: str = """SELECT TEXT, KWEEK_ID, HASHTAG_ID, STARTING_INDEX, ENDING_INDEX
                    FROM KWEEK_HASHTAG JOIN HASHTAG ON ID = HASHTAG_ID WHERE KWEEK_ID = %s"""
    data = (kid,)
    resulted_hashtag = db_manager.execute_query(query, data)[0]
    print("hashtag", resulted_hashtag)
    expected_mention = {'kweek_id': kid, 'username': '******',
                        'starting_index': 10, 'ending_index': 16}
    expected_hahstag = {'text': '#sky', 'kweek_id': kid, 'hashtag_id': hid,
                        'starting_index': 10, 'ending_index': 16}
    # NOTE(review): expected text '#testtest' does not match kweek_test_1's
    # text '#test1' — confirm which value is correct.
    expected_kweek = {'id': kid, 'text': '#testtest', 'media_url': None,
                      'username': '******', 'reply_to': None}
    assert expected_kweek == resulted_kweek
    assert expected_hahstag == resulted_hashtag
    assert expected_mention == resulted_mention
    query: str = """SELECT COUNT(*) FROM HASHTAG """
    second_count = db_manager.execute_query(query)[0]['count']
    # Exactly one new hashtag row from kweek_test_1.
    assert (second_count - first_count) == 1
    check, message = actions.insert_kweek(kweek_test_2)
    assert message == 'success'
    query: str = """SELECT COUNT(*) FROM HASHTAG """
    third_count = db_manager.execute_query(query)[0]['count']
    # '#sky' already exists, so no new hashtag row.
    assert third_count - second_count == 0
    check, message = actions.insert_kweek(kweek_test_3)
    assert message == 'Repeated mention in the same kweek'
    check, message = actions.insert_kweek(kweek_test_4)
    assert message == 'the user mentioned does not exist in the database'
def predict(self, document) -> List[Mention]:
    """Detect and disambiguate entity mentions in *document*.

    Pipeline:
      1. Parse mentions; score every candidate by context-word similarity and
         keep those above ``context_words_sim_th``.
      2. Pick each mention's best candidate as a "seed" when confident enough.
      3. Score all candidates against the seed entities (context-entity sim).
      4. Combine both similarities into a believe_score; keep mentions whose
         top candidate exceeds ``believe_score_th``.

    Returns:
        List[Mention] with ``result_cand`` set on each returned mention.
    """
    mention_list = self.mention_parser.parse_text(document)
    mentions = []
    for start, end, mention_str, candidates in mention_list:
        # Clamp the character window around the mention to document bounds.
        prev_start = start - self.context_words_window
        if prev_start < 0:
            prev_start = 0
        after_end = end + self.context_words_window
        if after_end > len(document):
            after_end = len(document)
        prev_context_words = [
            word for word in self.word_parser.parse_text(
                document[prev_start:start])
            if word in self.word_manager.vec_model.vectors
        ]
        after_context_words = [
            word for word in self.word_parser.parse_text(
                document[end:after_end])
            if word in self.word_manager.vec_model.vectors
        ]
        context_words = prev_context_words
        context_words.extend(after_context_words)
        # Pre-filter candidates by context-word similarity.
        valid_candidates = []  # type: List[Candidate]
        for candidate_id in candidates:
            if self.entity_manager.is_entity_has_embed(candidate_id) and \
                    self.entity_manager.entity_dictionary.entity_dict.get(candidate_id) is not None:
                candidate = Candidate(candidate_id)
                candidate.set_entity(
                    self.entity_manager.entity_dictionary.entity_dict.get(
                        candidate_id))
                candidate.set_context_words_sim(
                    self.cal_candidate_context_words_sim(
                        candidate_id, context_words))
                if candidate.context_words_sim > self.context_words_sim_th:
                    valid_candidates.append(candidate)
        if len(valid_candidates) > 0:
            mention = Mention(start, end, mention_str, valid_candidates)
            mention.set_prev_context(prev_context_words)
            mention.set_after_context(after_context_words)
            mentions.append(mention)
    # Start computing context_entities_similarity.
    seed_candidates = []  # type: List[Candidate]
    # Select seed candidates: each mention's best candidate, if confident.
    for i, mention in enumerate(mentions):
        max_sim = -1
        max_cand = None
        for candidate in mention.candidates:
            if candidate.context_words_sim > max_sim:
                max_cand = candidate
                # BUG FIX: the original never updated max_sim, so max_cand was
                # simply the *last* candidate rather than the most similar one.
                max_sim = candidate.context_words_sim
        if max_cand.context_words_sim > self.seed_candidates_sim_th:
            seed_candidates.append(max_cand)
            mention.set_result_cand(max_cand)
    # Build the context-entity pool for not-yet-disambiguated mentions.
    context_entities = []
    for cand in seed_candidates:
        context_entities.append(cand.entity)
    # Compute context_entities_sim for every candidate of every mention.
    for i, mention in enumerate(mentions):
        if mention.result_cand is None:
            # Undisambiguated mention: compare directly against all seeds.
            for j, candidate in enumerate(mentions[i].candidates):
                mentions[i].set_context_entities(context_entities)
                mentions[i].candidates[j].set_context_entities_sim(
                    self.cal_candidate_context_entities_sim(
                        candidate.entity_id, seed_candidates))
        else:
            # Already disambiguated: drop this mention's own candidates from
            # the seed set before scoring against it.
            seed_entities_for_mention = []  # type: List[Candidate]
            for seed_cand in seed_candidates:
                belong_to_mention = False
                for cand in mention.candidates:
                    if cand.entity_id == seed_cand.entity_id:
                        belong_to_mention = True
                if not belong_to_mention:
                    seed_entities_for_mention.append(seed_cand)
            for j, candidate in enumerate(mentions[i].candidates):
                mentions[i].set_context_entities(context_entities)
                mentions[i].candidates[j].set_context_entities_sim(
                    self.cal_candidate_context_entities_sim(
                        candidate.entity_id, seed_entities_for_mention))
    # Set each mention's believe_score and re-rank its candidates.
    for i, mention in enumerate(mentions):
        for cand in mention.candidates:
            cand.set_believe_score(
                self.words_sim_weight * cand.context_words_sim +
                (1 - self.words_sim_weight) * cand.context_entities_sim)
        mentions[i].candidates = sorted(
            mention.candidates,
            key=lambda item: item.believe_score,
            reverse=True)
        mentions[i].set_result_cand(mention.candidates[0])
    # Filter mentions again by believe_score.
    refined_mentions = []
    for m in mentions:
        if m.result_cand.believe_score > self.believe_score_th:
            refined_mentions.append(m)
    # TODO: expand seed candidates here
    # for i, mention in enumerate(mentions):
    #     for j, candidate in enumerate(mentions[i].candidates):
    #         mentions[i].candidates[j].set_context_entities_sim(
    #             self.cal_candidate_context_entities_sim(candidate.entity_id, seed_candidates))
    return refined_mentions
def build_sample(self, mention_list: List, document: str, context_window=-1,
                 context_words_sim_th=-1, seed_candidats_sim_th=-1,
                 believe_score_th=-1):
    """Build disambiguated Mention samples from a raw mention list.

    1. Build List[Mention] from the mention list and document (mainly
       prev_context, after_context and context_words_sim).
    2. Compute context_entities_sim over the resulting List[Mention].

    Args:
        mention_list: [(start, end, mention, candidates)]; the result should
            come from MentionParser.parse_text(document).
        document: the input document.
        context_window: window size in characters, not words. -1 (default)
            keeps the configured value; same convention for the thresholds.

    Return:
        List[Mention]
    """
    # Per-call overrides (-1 means "keep the instance-configured value").
    if context_window != -1:
        self.context_window = context_window
    if context_words_sim_th != -1:
        self.context_words_sim_th = context_words_sim_th
    if seed_candidats_sim_th != -1:
        self.seed_candidates_sim_th = seed_candidats_sim_th
    if believe_score_th != -1:
        self.believe_score_th = believe_score_th
    mentions = []
    for start, end, mention_str, candidates in mention_list:
        # Clamp the character window around the mention to document bounds.
        prev_start = start - self.context_window
        if prev_start < 0:
            prev_start = 0
        after_end = end + self.context_window
        if after_end > len(document):
            after_end = len(document)
        prev_context_words = [
            word for word in self.word_parser.parse_text(
                document[prev_start:start])
            if word in self.word_manager.vec_model.vectors
        ]
        after_context_words = [
            word for word in self.word_parser.parse_text(
                document[end:after_end])
            if word in self.word_manager.vec_model.vectors
        ]
        context_words = prev_context_words
        context_words.extend(after_context_words)
        # Pre-filter candidates by context-word similarity.
        valid_candidates = []  # type: List[Candidate]
        for candidate_id in candidates:
            if self.entity_manager.is_entity_has_embed(candidate_id) and \
                    self.entity_manager.entity_dictionary.entity_dict.get(candidate_id) is not None:
                candidate = Candidate(candidate_id)
                candidate.set_entity(
                    self.entity_manager.entity_dictionary.entity_dict.get(
                        candidate_id))
                candidate.set_context_words_sim(
                    self.cal_candidate_context_words_sim(
                        candidate_id, context_words))
                if candidate.context_words_sim > self.context_words_sim_th:
                    valid_candidates.append(candidate)
        if len(valid_candidates) > 0:
            mention = Mention(start, end, mention_str, valid_candidates)
            mention.set_prev_context(prev_context_words)
            mention.set_after_context(after_context_words)
            mentions.append(mention)
    # Start computing context_entities_similarity.
    seed_candidates = []  # type: List[Candidate]
    # Select seed candidates: each mention's best candidate, if confident.
    for i, mention in enumerate(mentions):
        max_sim = -1
        max_cand = None
        for candidate in mention.candidates:
            if candidate.context_words_sim > max_sim:
                max_cand = candidate
                # BUG FIX: the original never updated max_sim, so max_cand was
                # simply the *last* candidate rather than the most similar one.
                max_sim = candidate.context_words_sim
        if max_cand.context_words_sim > self.seed_candidates_sim_th:
            seed_candidates.append(max_cand)
            mention.set_result_cand(max_cand)
    # Build the context-entity pool for not-yet-disambiguated mentions.
    context_entities = []
    for cand in seed_candidates:
        context_entities.append(cand.entity)
    # Compute context_entities_sim for every candidate of every mention.
    for i, mention in enumerate(mentions):
        if mention.result_cand is None:
            # Undisambiguated mention: compare directly against all seeds.
            for j, candidate in enumerate(mentions[i].candidates):
                mentions[i].set_context_entities(context_entities)
                mentions[i].candidates[j].set_context_entities_sim(
                    self.cal_candidate_context_entities_sim(
                        candidate.entity_id, seed_candidates))
        else:
            # Already disambiguated: drop this mention's own candidates from
            # the seed set before scoring against it.
            seed_entities_for_mention = []  # type: List[Candidate]
            for seed_cand in seed_candidates:
                belong_to_mention = False
                for cand in mention.candidates:
                    if cand.entity_id == seed_cand.entity_id:
                        belong_to_mention = True
                if not belong_to_mention:
                    seed_entities_for_mention.append(seed_cand)
            for j, candidate in enumerate(mentions[i].candidates):
                mentions[i].set_context_entities(context_entities)
                mentions[i].candidates[j].set_context_entities_sim(
                    self.cal_candidate_context_entities_sim(
                        candidate.entity_id, seed_entities_for_mention))
    # Set each mention's believe_score and re-rank its candidates.
    for i, mention in enumerate(mentions):
        for cand in mention.candidates:
            # NOTE(review): weights are hard-coded (0.3/0.7) here while
            # predict() uses self.words_sim_weight — confirm the divergence
            # is intentional.
            cand.set_believe_score(0.3 * cand.context_words_sim +
                                   0.7 * cand.context_entities_sim)
        mentions[i].candidates = sorted(
            mention.candidates,
            key=lambda item: item.believe_score,
            reverse=True)
        mentions[i].set_result_cand(mention.candidates[0])
    # Filter mentions again by believe_score.
    refined_mentions = []
    for m in mentions:
        if m.result_cand.believe_score > self.believe_score_th:
            refined_mentions.append(m)
    # TODO: expand seed candidates here
    # for i, mention in enumerate(mentions):
    #     for j, candidate in enumerate(mentions[i].candidates):
    #         mentions[i].candidates[j].set_context_entities_sim(
    #             self.cal_candidate_context_entities_sim(candidate.entity_id, seed_candidates))
    return refined_mentions
def extract_mentions_hashtags(text):
    """
    Extract mentions and replies for the given kweek.

    *Parameters:*
        - *text*: The text of the kweek to be inserted .

    *Returns:*
        -*Tuple*: {
            | *hashtags (hashtag object )*: The list of kweek hashtags,
            | *mention (mention object )*: The list of kweek mentions.
            | }
    """
    hashtags = []
    mentions = []
    size = len(text)
    i = 0
    # Single pass over the text. NOTE: the inner for-loops deliberately rebind
    # the outer index ``i`` so scanning resumes after each extracted token.
    while i < size:
        hashtag_indices_list = []
        mention_indices_list = []
        if text[i] == '#':
            hashtag_indices_list.append(i)  # start index of the hashtag
            for i in range(i + 1, len(text)):
                # A space terminates the token; at the final character the end
                # index is one past it (exclusive slice bound).
                if (i == size - 1 and text[i] == ' ') or text[i] == ' ':
                    hashtag_indices_list.append(i)
                elif i == size - 1:
                    hashtag_indices_list.append(i + 1)
                else:
                    continue
                # Reached only when an end index was appended above.
                hashtag_text = text[
                    hashtag_indices_list[0]:hashtag_indices_list[1]]
                hashtag = {
                    'indices': hashtag_indices_list,
                    'text': hashtag_text,
                    'id': 0
                }
                hashtags.append(Hashtag(hashtag))
                break
        if text[i] == '@':
            mention_indices_list.append(i)  # start index of the mention
            for i in range(i + 1, len(text)):
                if (i == size - 1 and text[i] == ' ') or text[i] == ' ':
                    mention_indices_list.append(i)
                elif i == size - 1:
                    mention_indices_list.append(i + 1)
                else:
                    continue
                # Username excludes the leading '@'.
                mention_username = text[mention_indices_list[0] +
                                        1:(mention_indices_list[1])]
                mention = {
                    'indices': mention_indices_list,
                    'username': mention_username
                }
                mentions.append(Mention(mention))
                break
        i += 1
    return hashtags, mentions  # lists of objects
def get_kweek(kid, authorized_username, replies_only):
    """
    Get the requested kweek with its credentials.

    *Parameters:*
        - *kid*: The id of the kweek to be retrieved.
        - *authorized_username(string)*: The user currently logged in.
        - *replies_only (bool)*: To indicate whether the kweek with its
          replies is to be retrieved or the replies only

    *Returns:*
        -*Tuple*: {
            | *check (bool)*: To indicate whether kweek credentials creation
            |                 was successful or not.,
            | *message (str)*: To specify the reason of failure if detected.
            | *kweekobj (kweek object )*: the kweek to be retrieved,
            | *replies (list of int )*: Ids of the replies to the retrieved kweek .
            | *code*: The code to be returned in the request.
            | }
    """
    check, message, code = validate_request(kid)
    if not check:
        return check, message, None, None, code
    replies = retrieve_replies(
        kid
    )  # rows of kweek table who is set as a reply to the retrieved kweek (ids)
    if replies_only:
        return True, message, None, replies, code
    hashtags = retrieve_hashtags(kid)  # rows of hahstag-kweek table (*)
    mentions = retrieve_mentions(kid)  # rows of mention table (*)
    rekweeks = retrieve_user(kid, 3)
    likers = retrieve_user(
        kid, 2)  # rows of likers table for those who liked the kweek (usernames)
    user = retrieve_user(kid, 1)
    hashtags_list = []  # list of hashtag objects
    mentions_list = []  # list of mention objects
    rekweeked_by_user = False
    liked_by_user = False
    if hashtags:
        # Convert each hashtag row into a Hashtag model object.
        for hash_obj in hashtags:
            hid = hash_obj['hashtag_id']
            s_index = hash_obj['starting_index']
            e_index = hash_obj['ending_index']
            indices = [s_index, e_index]
            text = hash_obj['text']
            hash_dic = {'id': hid, 'indices': indices, 'text': text}
            hashtag = Hashtag(hash_dic)
            hashtags_list.append(hashtag)
    if mentions:
        # Convert each mention row into a Mention model object.
        for ment in mentions:
            s_index = ment['starting_index']
            e_index = ment['ending_index']
            indices = [s_index, e_index]
            username = ment['username']
            ment_dic = {'indices': indices, 'username': username}
            mention = Mention(ment_dic)
            mentions_list.append(mention)
    user = user[0]
    extrauser = {}
    me = authorized_username  # should be replaced by the function getting the current user
    # Relationship flags between the viewer (me) and the kweek author.
    check = check_following(me, user['username'])
    if check:
        extrauser['following'] = True
    else:
        extrauser['following'] = False
    check = check_following(user['username'], me)
    if check:
        extrauser['follows_you'] = True
    else:
        extrauser['follows_you'] = False
    check = check_blocked(user['username'], me)
    if check:
        extrauser['blocked'] = True
    else:
        extrauser['blocked'] = False
    check = check_muted(user['username'], me)
    if check:
        extrauser['muted'] = True
    else:
        extrauser['muted'] = False
    extrauser.update(user)
    userobj = User(extrauser)
    if replies:
        num_of_replies = len(replies)
    else:
        num_of_replies = 0
    if likers:
        num_of_likes = len(likers)
        # NOTE: rebinds ``user`` — the author row is no longer needed here.
        for user in likers:
            if user['username'] == me:
                liked_by_user = True
    else:
        num_of_likes = 0
    if rekweeks:
        num_of_rekweeks = len(rekweeks)
        for user in rekweeks:
            if user['username'] == me:
                rekweeked_by_user = True
    else:
        num_of_rekweeks = 0
    kweekdic = {
        'hashtags': hashtags_list,
        'mentions': mentions_list,
        'number_of_likes': num_of_likes,
        'number_of_rekweeks': num_of_rekweeks,
        'number_of_replies': num_of_replies,
        'rekweek_info': None,
        'liked_by_user': liked_by_user,
        'rekweeked_by_user': rekweeked_by_user,
        'user': userobj
    }
    kweek = retrieve_kweek(kid)  # a row of kweek table
    kweek = kweek[0]
    kweekdic.update(kweek)
    kweekdic['reply_info'] = get_reply_to_info(kid)
    kweekobj = Kweek(kweekdic)
    return True, 'success.', kweekobj, replies, 200
def update(self):
    """
    Update tweets related to movies in local DB

    Searches Twitter for each stored movie's hashtag and upserts the tweet
    author, the tweet itself, the tweet-movie link, and any mentioned users.
    Per-row DB failures are deliberately swallowed so one bad record does not
    abort the whole sync — TODO confirm this best-effort policy is intended.

    :return: None
    """
    # Get name for all the stored movies in the DB
    movie_obj = Movie(db)
    movies = movie_obj.get_names()
    print('Got movies')
    if movies:
        for movie in movies:
            hashtag = self.get_hashtag(movie['MV_NAME'])
            mv_id = movie['MV_ID']
            # Search twitter for current movie hashtag in english language
            print('Searching for hashtag {}'.format(hashtag))
            results = self.api.GetSearch(hashtag, lang='en', count=100)
            # Get data for each tweet in search results and save to respective tables
            for tweet in results:
                print(tweet)
                # 'created_at' appears twice on purpose: slots 3 and 4 are
                # overwritten below with the date part and the time part.
                user_keys = [
                    'id_str', 'name', 'description', 'created_at',
                    'created_at', 'followers_count', 'friends_count'
                ]
                user_data = []
                for k in user_keys:
                    user_data.append(tweet.user.__getattribute__(k))
                # split time format before saving to the DB
                timestamp = datetime.strptime(user_data[3],
                                              self.tweet_time_format)
                user_data[3] = timestamp.strftime(self.date_format)
                user_data[4] = timestamp.strftime(self.time_format)
                try:
                    # Upsert the tweet author keyed by user id.
                    u = User(db)
                    if u.select_one(user_data[0]):
                        u.update(user_data)
                    else:
                        u.insert(user_data)
                except Exception:
                    # pass any exception occurred during the insert/update operation
                    pass
                timestamp = datetime.strptime(tweet.created_at,
                                              self.tweet_time_format)
                date = timestamp.strftime(self.date_format)
                time = timestamp.strftime(self.time_format)
                tweet_data = [
                    tweet.id, tweet.full_text, hashtag, user_data[0], date,
                    time, tweet.retweet_count
                ]
                try:
                    t = Tweet(db)
                    t.insert(tweet_data)
                except Exception:
                    # pass any exception occurred during the insert operation
                    pass
                try:
                    tm = TweetMovie(db)
                    tm.insert([tweet.id, mv_id])
                except Exception:
                    # pass any exception occurred during the insert operation
                    pass
                # Add tweet mentions to the mentions table and any new user
                # mentioned to the user table
                mentions = tweet.user_mentions
                if mentions:
                    for mention in mentions:
                        m = Mention(db)
                        try:
                            m.insert([tweet.id, mention.id])
                        except Exception:
                            pass
                        try:
                            # Add user to the user table if not exists
                            u = User(db)
                            u.insert_mention_user(
                                [mention.id, mention.name])
                        except Exception:
                            # pass any exception occurred during the insert/update operation
                            pass
def test_get_kweek_with_replies():
    """Insert one root kweek plus two replies (with hashtags, a mention,
    a rekweek and favorites), then verify actions.get_kweek_with_replies
    reconstructs the root kweek and both replies exactly.
    """
    # first kweek #
    query: str = """INSERT INTO KWEEK (CREATED_AT,TEXT,MEDIA_URL,USERNAME,REPLY_TO) VALUES(%s, %s, %s, %s,%s) """
    data = ('01-01-2010', 'test1', None, 'test_user1', None)
    db_manager.execute_query_no_return(query, data)
    # Newest kweek id = the row just inserted.
    kid1 = str(db_manager.execute_query(
        """SELECT ID FROM KWEEK ORDER BY ID DESC LIMIT 1 """)[0]['id'])
    query: str = """INSERT INTO HASHTAG(TEXT) VALUES (%s) """
    data = ('hashtag1---',)
    db_manager.execute_query_no_return(query, data)
    query: str = """SELECT ID FROM HASHTAG WHERE TEXT = %s """
    data = ('hashtag1---',)
    hid1 = db_manager.execute_query(query, data)[0]['id']
    query: str = """INSERT INTO KWEEK_HASHTAG VALUES (%s,%s,%s,%s)"""
    data = (kid1, hid1, 0, 9,)
    db_manager.execute_query_no_return(query, data)
    query: str = """INSERT INTO MENTION VALUES(%s,%s,%s,%s) """
    data = (kid1, 'test_user2', 10, 15)
    db_manager.execute_query_no_return(query, data)
    query: str = """INSERT INTO REKWEEK VALUES(%s,%s,%s) """
    data = ('test_user2', kid1, '01-01-2010')
    db_manager.execute_query_no_return(query, data)
    query: str = """INSERT INTO FAVORITE VALUES(%s,%s,%s) """
    data = ('test_user2', kid1, '01-01-2010')
    db_manager.execute_query_no_return(query, data)
    # second kweek # (reply to kweek 1, liked by test_user1)
    query: str = """INSERT INTO KWEEK (CREATED_AT,TEXT,MEDIA_URL,USERNAME,REPLY_TO) VALUES(%s, %s, %s, %s,%s) """
    data = ('01-01-2010', 'test2', None, 'test_user2', kid1)
    db_manager.execute_query_no_return(query, data)
    kid2 = str(db_manager.execute_query(
        """SELECT ID FROM KWEEK ORDER BY ID DESC LIMIT 1 """)[0]['id'])
    query: str = """INSERT INTO HASHTAG(TEXT) VALUES (%s) """
    data = ('hashtag2---',)
    db_manager.execute_query_no_return(query, data)
    query: str = """SELECT ID FROM HASHTAG WHERE TEXT = %s """
    data = ('hashtag2---',)
    hid2 = db_manager.execute_query(query, data)[0]['id']
    query: str = """INSERT INTO KWEEK_HASHTAG VALUES (%s,%s,%s,%s)"""
    data = (kid2, hid2, 0, 9,)
    db_manager.execute_query_no_return(query, data)
    query: str = """INSERT INTO FAVORITE VALUES(%s,%s,%s) """
    data = ('test_user1', kid2, '01-01-2010')
    db_manager.execute_query_no_return(query, data)
    # third kweek # (reply to kweek 1, liked by its own author test_user3)
    query: str = """INSERT INTO KWEEK (CREATED_AT,TEXT,MEDIA_URL,USERNAME,REPLY_TO) VALUES(%s, %s, %s, %s,%s) """
    data = ('01-01-2010', 'test3', None, 'test_user3', kid1)
    db_manager.execute_query_no_return(query, data)
    kid3 = str(db_manager.execute_query(
        """SELECT ID FROM KWEEK ORDER BY ID DESC LIMIT 1 """)[0]['id'])
    query: str = """INSERT INTO HASHTAG(TEXT) VALUES (%s) """
    data = ('hashtag3---',)
    db_manager.execute_query_no_return(query, data)
    query: str = """SELECT ID FROM HASHTAG WHERE TEXT = %s """
    data = ('hashtag3---',)
    hid3 = db_manager.execute_query(query, data)[0]['id']
    query: str = """INSERT INTO KWEEK_HASHTAG VALUES (%s,%s,%s,%s)"""
    data = (kid3, hid3, 0, 9,)
    db_manager.execute_query_no_return(query, data)
    query: str = """INSERT INTO FAVORITE VALUES(%s,%s,%s) """
    data = ('test_user3', kid3, '01-01-2010')
    db_manager.execute_query_no_return(query, data)
    # Expected root kweek, as seen by viewer 'test_user3'.
    kweek_test1 = Kweek({
        'id': int(kid1),
        'created_at': datetime(2010, 1, 1, 0, 0),
        'text': 'test1',
        'media_url': None,
        'user': User({
            'username': '******',
            'screen_name': 'test1',
            'profile_image_url': 'image_url',
            'following': True,
            'follows_you': True,
            'muted': False,
            'blocked': False
        }),
        'mentions': [
            Mention({
                'username': '******',
                'indices': [10, 15]})
        ],
        'hashtags': [
            Hashtag({
                'text': 'hashtag1---',
                'indices': [0, 9],
                'id': hid1
            })
        ],
        'number_of_likes': 1,
        'number_of_rekweeks': 1,
        'number_of_replies': 2,
        'reply_to': None,
        'rekweek_info': None,
        'liked_by_user': False,
        'rekweeked_by_user': False
    })
    # Expected replies, in retrieval order (kweek 2 then kweek 3).
    replies_test1 = [
        Kweek({
            'id': int(kid2),
            'created_at': datetime(2010, 1, 1, 0, 0),
            'text': 'test2',
            'media_url': None,
            'user': User({
                'username': '******',
                'screen_name': 'test2',
                'profile_image_url': 'image_url',
                'following': False,
                'follows_you': True,
                'muted': False,
                'blocked': False
            }),
            'mentions': [
            ],
            'hashtags': [
                Hashtag({
                    'text': 'hashtag2---',
                    'indices': [0, 9],
                    'id': hid2
                })
            ],
            'number_of_likes': 1,
            'number_of_rekweeks': 0,
            'number_of_replies': 0,
            'reply_to': int(kid1),
            'rekweek_info': None,
            'liked_by_user': False,
            'rekweeked_by_user': False
        }),
        Kweek({
            'id': int(kid3),
            'created_at': datetime(2010, 1, 1, 0, 0),
            'text': 'test3',
            'media_url': None,
            'user': User({
                'username': '******',
                'screen_name': 'test3',
                'profile_image_url': 'image_url',
                'following': False,
                'follows_you': False,
                'muted': False,
                'blocked': False
            }),
            'mentions': [
            ],
            'hashtags': [
                Hashtag({
                    'text': 'hashtag3---',
                    'indices': [0, 9],
                    'id': hid3
                })
            ],
            'number_of_likes': 1,
            'number_of_rekweeks': 0,
            'number_of_replies': 0,
            'reply_to': int(kid1),
            'rekweek_info': None,
            'liked_by_user': True,
            'rekweeked_by_user': False
        }),
    ]
    check_replies, message, k, r = actions.get_kweek_with_replies(
        kid1, 'test_user3')
    print('kwweeek')
    print(k)
    print('replies')
    print(r)
    assert True == check_replies
    assert message == 'success'
    assert k.to_json() == kweek_test1.to_json()
    for n, i in enumerate(r):
        assert i.to_json() == replies_test1[n].to_json()
# Bootstrap script: create the schema and seed one Source and one Mention row.
from sqlalchemy import create_engine
from models import Source, Mention, Base
from sqlalchemy.orm import sessionmaker

# Connection string for the dockerized Postgres instance.
engine = create_engine('postgresql+psycopg2://docker:docker@db/docker')
Session = sessionmaker(bind=engine)
# Create all tables declared on the models' Base (no-op if they already exist).
Base.metadata.create_all(engine)
session = Session()
s = Source(id=1, name='Twitter')
m = Mention(id=1, source=s, text='jeffknupp.com is the best website ever!')
session.add(s)
session.add(m)
session.commit()