Example #1
0
def consume_external_label(label, group, label_to_titles):
    """Assemble the descriptor of an external-link label and choose its title.

    Args:
        label: Key looked up in ``label_to_titles``; also the fallback for
            the title and the domainless title.
        group: Mapping whose ``"statuses"`` entry is iterated to look for an
            image, but only when no news thumbnail URL was found.
        label_to_titles: Mapping from label to a metadata entry that may
            hold ``news_image``, ``parsed_title``, ``domain`` and
            ``domainless_title``.

    Returns:
        A ``(label_info, title)`` pair. ``label_info`` is a dict with the
        keys ``type`` (always ``"external_link"``), ``media_url``,
        ``image_ref``, ``domain`` and ``domainless_title``.
    """
    media_url = utils.val_or_default2(
        [label, "news_image", "media_url_news_thumb"], label_to_titles, [""]
    )[0]

    # Only fall back to the statuses' own media when there is no thumbnail.
    image_ref = ""
    if media_url == "":
        for status in group["statuses"]:
            # The path list is rebuilt on every call on purpose: it is not
            # visible here whether the helper mutates the list it is given.
            image_ref = utils.val_or_default2(
                ["extended_entities", "media", 0, "media_url"],
                status,
                default=[""],
            )[0]
            if image_ref == "":
                continue
            print("Extracted image ref", image_ref)
            break

    # Unknown labels get an empty entry so every field uses its fallback.
    entry = label_to_titles[label] if label in label_to_titles else {}

    def pick(field, fallback):
        # Return the stored metadata field, or the fallback when absent.
        if field in entry:
            return entry[field]
        return fallback

    title = pick("parsed_title", label)
    domain = pick("domain", "")
    domainless_title = pick("domainless_title", label)

    label_info = {
        "type": "external_link",
        "media_url": media_url,
        "image_ref": image_ref,
        "domain": domain,
        "domainless_title": domainless_title,
    }
    return label_info, title
def get_twitter_info(label, stat):
    """Describe the status quoted by ``stat`` as a ``twitter_id`` record.

    Args:
        label: Not used by this function.
        stat: Status mapping. It must contain
            ``stat["satellite_enhanced"]["labels"]``; when that mapping has a
            ``"quoted_labels_twrefs_deep_status"`` entry, that entry replaces
            the status returned by ``utils.get_quoted_status``.

    Returns:
        A dict with the keys ``type``, ``id``, ``user``, ``text`` and
        ``image_ref`` when the selected quoted status has a ``"full_text"``
        key, otherwise an empty dict.
    """
    quoted = utils.get_quoted_status(stat)

    enhanced_labels = stat["satellite_enhanced"]["labels"]
    if "quoted_labels_twrefs_deep_status" in enhanced_labels:
        quoted = enhanced_labels["quoted_labels_twrefs_deep_status"]
        print("Found deep status")

    if "full_text" not in quoted:
        return {}

    text = utils.clean_title(utils.clean_one(quoted))
    # NOTE(review): other call sites in this file index the result of
    # val_or_default2 with [0]; here the raw return value is stored as-is.
    # Confirm which shape consumers of "image_ref" expect.
    image_ref = utils.val_or_default2(
        ["extended_entities", "media", 0, "media_url"],
        quoted,
        default="",
    )
    return {
        "type": "twitter_id",
        "id": str(quoted["id"]),
        "user": quoted["user"]["id_str"],
        "text": text,
        "image_ref": image_ref,
    }
def get_labels(status):
    """Return the expanded URLs and the quoted-status reference of a status.

    NOTE(review): a later ``get_labels(status, seed=None)`` definition in
    this file rebinds the same name, so this version is shadowed once the
    whole module has been executed.

    Args:
        status: Status mapping; any falsy value (``None``, ``{}``) is
            treated as "no status".

    Returns:
        A ``(urls, embed_fin)`` tuple. Both elements are empty lists for a
        falsy status. Otherwise ``urls`` is read from
        ``entities -> urls -> [] -> expanded_url`` and ``embed_fin`` from
        ``quoted_status_id``, each defaulting to ``[]``.
    """
    if not status:
        return [], []

    # A disabled experiment used to fall back to the last video variant URL
    # (extended_entities -> media[0] -> video_info -> variants) when the
    # status carried no URLs; hashtag extraction was likewise switched off.
    expanded_urls = utils.val_or_default2(
        ["entities", "urls", [], "expanded_url"], status, default=[]
    )
    quoted_ref = utils.val_or_default2(["quoted_status_id"], status, default=[])
    return expanded_urls, quoted_ref
def get_labels(status, seed=None):
    """Collect the references of ``status`` into per-type row lists.

    For every ``(ref_type, spec)`` pair in the module-level ``specs``
    mapping, the values found by ``utils.val_or_default2(spec, status)`` are
    appended to ``seed[ref_type]`` as ``{"source": <id_str>, "label": <value>}``
    rows.

    Args:
        status: Status mapping with an ``"id_str"`` key. A falsy status
            leaves the seed untouched.
        seed: Optional accumulator mapping each reference type to a list of
            rows. It is extended in place. When omitted, a fresh mapping
            with one empty list per key of ``specs`` is created.

    Returns:
        The ``seed`` mapping (the same object that was passed in, if any).

    Raises:
        KeyError: If ``status`` is truthy but has no ``"id_str"`` key, or if
            ``seed`` lacks a reference type present in ``specs``.
    """
    # Identity check: `== None` would invoke a custom __eq__ on the seed.
    if seed is None:
        seed = {ref_type: [] for ref_type in specs}
    if not status:
        return seed

    for ref_type, spec in specs.items():
        status_id = status["id_str"]
        ref_list = utils.val_or_default2(spec, status, default=[])
        seed[ref_type].extend(
            {"source": status_id, "label": ref} for ref in ref_list
        )
    return seed
def make_filtered_label_dict(batch_enhanced_full, threshhold = 2):
    """Group statuses by their first media URL and keep the frequent groups.

    Args:
        batch_enhanced_full: Iterable of status mappings.
        threshhold: Minimum number of statuses a label must have collected
            to be kept (default 2). The spelling is part of the existing
            keyword interface.

    Returns:
        A dict mapping ``str(label)`` to
        ``{"statuses": <collected statuses>, "count": <how many>}`` for every
        label whose status count is at least ``threshhold``.
    """
    label_dict = {}
    for status in batch_enhanced_full:
        norm_labels = utils.val_or_default2(
            ["extended_entities", "media", [], "media_url"], status
        )
        # Only the first media URL of a status is used as its label.
        if len(norm_labels) > 0:
            uate_dict_nodup(label_dict, norm_labels[0], status)

    filt_label_dict = {
        str(label): {"statuses": statuses, "count": len(statuses)}
        for label, statuses in label_dict.items()
        if len(statuses) >= threshhold
    }

    # Report the number of statuses per kept label. The previous code printed
    # len() of the {"statuses", "count"} dict itself, which is always 2.
    for label, info in filt_label_dict.items():
        print(label, str(info["count"]))

    return filt_label_dict
Example #6
0
def get_embedded_id(stat):
    """Return the quoted-status references and expanded URLs of a status.

    Args:
        stat: Status mapping, passed to ``utils.get_quoted_status`` and
            queried directly.

    Returns:
        A ``(first, embed, first_urls)`` tuple:
        ``first`` is the ``quoted_status_id`` lookup on ``stat`` itself,
        ``embed`` is the same lookup on the status returned by
        ``utils.get_quoted_status(stat)``, and ``first_urls`` is the
        ``entities -> urls -> [] -> expanded_url`` lookup on ``stat``.
    """
    quoted = utils.get_quoted_status(stat)
    nested_quote_id = utils.val_or_default2(["quoted_status_id"], quoted)
    own_quote_id = utils.val_or_default2(["quoted_status_id"], stat)
    own_urls = utils.val_or_default2(
        ["entities", "urls", [], "expanded_url"], stat
    )
    return own_quote_id, nested_quote_id, own_urls