def consume_external_label(label, group, label_to_titles):
    """Assemble the display metadata for an external-link label.

    Args:
        label: Key used to look up ``label_to_titles``; it also serves as the
            fallback for the title and the domainless title.
        group: Mapping with a ``"statuses"`` iterable. It is only read when no
            thumbnail URL is found for ``label``.
        label_to_titles: Mapping of label -> metadata. The entries read here
            are ``"parsed_title"``, ``"domain"``, ``"domainless_title"`` and
            the nested ``"news_image" -> "media_url_news_thumb"``.

    Returns:
        A ``(label_info, title)`` tuple. ``label_info`` is a dict with the
        keys ``type``, ``media_url``, ``image_ref``, ``domain`` and
        ``domainless_title``.
    """
    # Thumbnail recorded for the label itself; "" when there is none.
    media_url = utils.val_or_default2(
        [label, "news_image", "media_url_news_thumb"], label_to_titles, [""]
    )[0]

    # Without a thumbnail, fall back to the first status in the group that
    # yields a non-empty media URL.
    image_ref = ""
    if media_url == "":
        for status in group["statuses"]:
            image_ref = utils.val_or_default2(
                ["extended_entities", "media", 0, "media_url"],
                status,
                default=[""],
            )[0]
            if image_ref == "":
                continue
            print("Extracted image ref", image_ref)
            break

    # Resolve the label's metadata once; an unknown label behaves like an
    # entry with no fields, so every lookup below falls through to its default.
    if label in label_to_titles:
        meta = label_to_titles[label]
    else:
        meta = {}

    title = meta["parsed_title"] if "parsed_title" in meta else label
    domain = meta["domain"] if "domain" in meta else ""
    if "domainless_title" in meta:
        domainless_title = meta["domainless_title"]
    else:
        domainless_title = label

    return {
        "type": "external_link",
        "media_url": media_url,
        "image_ref": image_ref,
        "domain": domain,
        "domainless_title": domainless_title,
    }, title
def get_twitter_info(label, stat):
    """Describe the tweet quoted by ``stat`` as a ``twitter_id`` record.

    Args:
        label: Not used by this function; kept for the existing call signature.
        stat: Status dict. Must contain ``["satellite_enhanced"]["labels"]``.

    Returns:
        A dict with the keys ``type``, ``id``, ``user``, ``text`` and
        ``image_ref`` when the selected quoted status has ``"full_text"``;
        otherwise an empty dict.
    """
    source_status = utils.get_quoted_status(stat)

    # A "deep" quoted status stored on the enhanced labels takes precedence
    # over the one returned by utils.get_quoted_status.
    enhanced_labels = stat["satellite_enhanced"]["labels"]
    if "quoted_labels_twrefs_deep_status" in enhanced_labels:
        source_status = enhanced_labels["quoted_labels_twrefs_deep_status"]
        print("Found deep status")

    if "full_text" not in source_status:
        return {}

    cleaned_text = utils.clean_one(source_status)
    text = utils.clean_title(cleaned_text)
    # NOTE(review): consume_external_label indexes the result of this same
    # lookup with [0]; here the raw return value is stored. Confirm which
    # shape downstream consumers of "image_ref" expect.
    image_ref = utils.val_or_default2(
        ["extended_entities", "media", 0, "media_url"],
        source_status,
        default="",
    )
    return {
        "type": "twitter_id",
        "id": str(source_status["id"]),
        "user": source_status["user"]["id_str"],
        "text": text,
        "image_ref": image_ref,
    }
def get_labels(status):
    """Return the expanded URLs and the quoted-status id found on a status.

    NOTE(review): this file also defines ``get_labels(status, seed=None)``
    further down; if both live in the same module, the later definition
    replaces this one.

    Args:
        status: Status dict; any falsy value (``None``, ``{}``) is treated as
            "no status".

    Returns:
        A ``(urls, embed)`` tuple. Both are ``[]`` for a falsy status.
        Otherwise they are whatever ``utils.val_or_default2`` yields for the
        ``entities -> urls -> [] -> expanded_url`` path and for
        ``quoted_status_id`` (each with a default of ``[]``).
    """
    if not status:
        return [], []

    expanded_urls = utils.val_or_default2(
        ["entities", "urls", [], "expanded_url"], status, default=[]
    )
    # A disabled earlier variant also fell back to the last video variant URL
    # from extended_entities when no URLs were present; that path is not active.
    quoted_id = utils.val_or_default2(["quoted_status_id"], status, default=[])
    return expanded_urls, quoted_id
def get_labels(status, seed=None):
    """Collect label rows from a status, grouped by reference type.

    For every ``ref_type -> spec`` pair in the module-level ``specs`` mapping,
    the labels found via ``utils.val_or_default2(spec, status, default=[])``
    are appended to ``seed[ref_type]`` as ``{"source": <id_str>, "label": l}``
    rows.

    NOTE(review): an earlier ``get_labels(status)`` appears in this file; if
    both live in the same module, this definition is the one that stays bound.

    Args:
        status: Status dict with an ``"id_str"`` key. Any falsy value is
            treated as "no status" and leaves ``seed`` untouched.
        seed: Optional accumulator mapping ref type -> list of rows. It is
            extended in place and returned, so results from several statuses
            can be gathered into one dict. When omitted, a fresh dict with an
            empty list per ref type in ``specs`` is created.

    Returns:
        The accumulator dict (``seed`` itself when one was passed in).

    Raises:
        KeyError: If a non-empty ``status`` has no ``"id_str"``.
    """
    # Identity check: `== None` can be fooled by objects overriding __eq__.
    if seed is None:
        seed = {ref_type: [] for ref_type in specs}
    if not status:
        return seed

    # The source id is the same for every ref type, so look it up once.
    status_id = status["id_str"]
    for ref_type, spec in specs.items():
        found_labels = utils.val_or_default2(spec, status, default=[])
        rows = [
            {"source": status_id, "label": found}
            for found in found_labels
        ]
        # setdefault tolerates a caller-supplied seed that lacks this ref type.
        seed.setdefault(ref_type, []).extend(rows)
    return seed
def make_filtered_label_dict(batch_enhanced_full, threshhold = 2):
    """Group statuses by their first media URL and keep the frequent groups.

    Args:
        batch_enhanced_full: Iterable of status dicts.
        threshhold: Minimum number of statuses a label needs in order to be
            kept (spelling retained for existing keyword callers).

    Returns:
        A dict mapping ``str(label)`` to
        ``{"statuses": <grouped value>, "count": <its length>}`` for every
        label whose group has at least ``threshhold`` members.
    """
    label_dict = {}
    for status in batch_enhanced_full:
        media_urls = utils.val_or_default2(
            ["extended_entities", "media", [], "media_url"], status
        )
        # Truthiness also covers a None result, should val_or_default2's
        # implicit default not be a list (not visible from here).
        if media_urls:
            # Presumably records `status` under the first media URL in
            # label_dict, in place; the helper is defined outside this block.
            uate_dict_nodup(label_dict, media_urls[0], status)

    filt_label_dict = {
        str(label): {"statuses": statuses, "count": len(statuses)}
        for label, statuses in label_dict.items()
        if len(statuses) >= threshhold
    }

    # Report the number of statuses per kept label. The previous code printed
    # len() of the info dict itself, which is always 2.
    for label, info in filt_label_dict.items():
        print(label, str(info["count"]))
    return filt_label_dict
def get_embedded_id(stat):
    """Look up quoted-status ids and expanded URLs for a status.

    Args:
        stat: Status dict to inspect.

    Returns:
        A ``(first, embed, first_urls)`` tuple:
        ``first`` is the ``quoted_status_id`` lookup on ``stat`` itself,
        ``embed`` is the same lookup on the status returned by
        ``utils.get_quoted_status(stat)``, and ``first_urls`` is the
        ``entities -> urls -> [] -> expanded_url`` lookup on ``stat``.
        Missing values are whatever ``utils.val_or_default2`` returns when no
        default is given.
    """
    quoted = utils.get_quoted_status(stat)

    # One level down: the id quoted by the quoted status.
    nested_quoted_id = utils.val_or_default2(["quoted_status_id"], quoted)

    # Top level: the id and the URLs referenced by the status itself.
    direct_quoted_id = utils.val_or_default2(["quoted_status_id"], stat)
    direct_urls = utils.val_or_default2(
        ["entities", "urls", [], "expanded_url"], stat
    )

    return direct_quoted_id, nested_quoted_id, direct_urls