Example #1
    def __init__(self, config: object, session):
        # Defaults, so the reads further down are safe even when FOLLOW
        # is absent from the config.
        self.__enable = False
        follow = {}
        if key_exists(FOLLOW, config):
            follow = config[FOLLOW]
            if key_exists(ENABLE, follow):
                self.__set_enable(follow[ENABLE])

            if key_exists(USERNAMES, follow):
                self.__set_usernames(follow[USERNAMES])

            if key_exists(AMOUNT, follow):
                self.__set_amount(follow[AMOUNT])

            if key_exists(RANDOMIZE, follow):
                self.__set_randomize(follow[RANDOMIZE])

            if key_exists(INTERACT, follow):
                self.__set_interact(follow[INTERACT])

            if key_exists(SLEEP_DELAY, follow):
                self.__set_sleep_delay(follow[SLEEP_DELAY])

        self.__session = session
        if self.__enable:
            if key_exists(BY_TAGS, follow):
                FollowByTags(follow[BY_TAGS], session)
            elif key_exists(BY_LOCATIONS, follow):
                FollowByLocations(follow[BY_LOCATIONS], session)
            else:
                self.__follow()
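The key_exists helper that these InstaPy-style config snippets call is not shown in this section. A minimal sketch, assuming it only needs key-first argument order and to tolerate a missing config dict:

def key_exists(key, config):
    # True when config is dict-like and contains key; tolerates None.
    return config is not None and key in config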
Example #2
    def __init__(self, config: object, session):
        if config:
            if key_exists(LOCATIONS, config):
                self.__locations = config[LOCATIONS]
            if key_exists(AMOUNT, config):
                self.__amount = config[AMOUNT]
            if key_exists(MEDIA, config):
                self.__media = config[MEDIA]
            if key_exists(SKIP_TOP_POSTS, config):
                self.__skip_top_posts = config[SKIP_TOP_POSTS]

        self.__session = session
        self.__follow_by_locations()
Example #3
def process_label_dict(filt_label_dict, label_to_titles):
    threshold = 2
    key = 1
    label_data = {}
    for label, group in filt_label_dict.items():
        if group["count"] >= threshold:
            if utils.key_exists(["label_info", "type"], group) and group["label_info"]["type"] == "twitter_id":
                label_info = group["label_info"]
                link, title = consume_twitter_label(label_info)
            else:
                label_info, title = consume_external_label(label, group, label_to_titles)
                link = label
            final_image_url = pick_image(label_info)
            tweet_data = make_tweet_data(group)
            row = {
                "label_info": label_info,
                "title": title,
                "count": group["count"],
                "key": key,
                "tag": link,
                "final_image_url": final_image_url,
                "tweet_data": tweet_data,
            }
            label_data[label] = row
            key += 1
    return label_data
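Note that utils.key_exists is called here with a list of keys rather than a single key, e.g. utils.key_exists(["label_info", "type"], group). A minimal sketch of that path-walking variant, assuming plain nested dicts:

def key_exists(path, obj):
    # Walk the key path into nested dicts; True only if every level exists.
    for key in path:
        if not isinstance(obj, dict) or key not in obj:
            return False
        obj = obj[key]
    return True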
Example #4
def featurize_test_images(bucket, prefix, batch_size):
    imgnt = imagenet.ImageNetData()
    to_featurize = []
    to_featurize_keys = []
    client = utils.get_s3_client()
    start = timer()
    num_batches = 0
    for k in imgnt.test_filenames:
        key_name = os.path.join(prefix, f"{k}.npy")
        key_exists = utils.key_exists(bucket, key_name)
        if not key_exists:
            img = imgnt.load_image(k,
                                   size='scaled_256',
                                   force_rgb=True,
                                   verbose=False)
            img = skimage.transform.resize(img,
                                           FEATURIZE_SIZE,
                                           preserve_range=True)
            to_featurize.append(img)
            to_featurize_keys.append(k)
            if len(to_featurize) >= batch_size:
                num_batches += 1
                featurize_and_upload_batch(to_featurize, to_featurize_keys,
                                           batch_size, bucket, prefix, client)
                end = timer()
                print('processing batch {} (size {}) took {} seconds'.format(
                    num_batches, len(to_featurize), end - start))
                start = timer()
                to_featurize = []
                to_featurize_keys = []
    if len(to_featurize) > 0:
        featurize_and_upload_batch(to_featurize, to_featurize_keys, batch_size,
                                   bucket, prefix, client)
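featurize_and_upload_batch is not defined in this section, but Example #8 below inlines essentially the same steps (stack the batch, run featurize.vgg16_features, np.save each feature to S3), so a sketch consistent with that code might look like:

import io
import os
import numpy as np
# featurize and utils refer to the same project modules used above.

def featurize_and_upload_batch(to_featurize, to_featurize_keys,
                               batch_size, bucket, prefix, client):
    # Sketch only: mirrors the inline featurization in Example #8.
    # client is unused here; the backoff helper manages its own connection.
    batch = np.stack(to_featurize, axis=0)
    features = featurize.vgg16_features(batch,
                                        batch_size=min(len(batch), batch_size))
    for key, feature in zip(to_featurize_keys, features):
        key_name = os.path.join(prefix, f"{key}.npy")
        bio = io.BytesIO()
        np.save(bio, feature)
        utils.put_s3_object_bytes_with_backoff(bio.getvalue(), key_name)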
Example #5
def return_not_exists(lst):
    ret_lst = []
    for e in lst:
        key = "{0}/{1}.jpg".format("imagenet2candidates_scaled", e)
        exists = utils.key_exists(bucket="imagenet2datav2", key=key)
        print(exists, key)
        if not exists:
            ret_lst.append(e)
    return ret_lst
Example #6
def quick_image_check(img_dict, bucket, prefix):
    url = img_dict['url']
    ext = url.split(".")[-1]
    key = img_dict["id_ours"]
    mturk_key = "{2}_mturk/{0}.{1}".format(key, ext, prefix)
    if utils.key_exists(bucket, mturk_key):
        return True
    return None
Example #7
def return_not_exists_encrypted(lst):
    ret_lst = []
    for e in lst:
        e = utils.encrypt_string_with_magic(e)
        key = "{0}/{1}.jpg".format("encrypted", e)
        exists = utils.key_exists(bucket="imagenet2datav2", key=key)
        print(exists, key)
        if not exists:
            ret_lst.append(e)
    return ret_lst
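These three helpers use a different overload: utils.key_exists(bucket, key) checks for an object in S3 rather than a key in a dict. A plausible sketch with boto3, assuming utils.get_s3_client returns a standard boto3 S3 client:

from botocore.exceptions import ClientError

def key_exists(bucket, key):
    # HEAD the object; a 404 means it does not exist, anything else re-raises.
    client = utils.get_s3_client()
    try:
        client.head_object(Bucket=bucket, Key=key)
        return True
    except ClientError as e:
        if e.response['Error']['Code'] == '404':
            return False
        raise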
Example #8
def featurize_candidates(bucket, prefix, batch_size, source_filename):
    imgnt = imagenet.ImageNetData()
    cds = candidate_data.CandidateData(verbose=False)
    filenames_to_ignore = [
        '2018-08-06_17:33_vaishaal.json',
        '2018-08-17_17:24_vaishaal.json',
        'vaishaal_hits_submitted_2018-08-17-18:28:33-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-18:50:38-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:28:24-PDT.json',
        'vaishaal_hits_submitted_2018-08-17-19:56:28-PDT.json',
        'vaishaal_hits_submitted_2018-08-25-09:47:26-PDT.json']
    mturk = mturk_data.MTurkData(live=True, load_assignments=True, source_filenames_to_ignore=filenames_to_ignore, verbose=False)
    to_featurize = []
    to_featurize_keys = []
    client = utils.get_s3_client()
    i = 0
    #candidate_list = dataset_sampling.get_histogram_sampling_ndc_candidates(imgnet=imgnt, cds=cds, mturk=mturk)
    start = timer()
    with open('../data/metadata/fc7_candidates.json', 'r') as f:
        candidate_list = json.load(f)
    for k in candidate_list:
        key_name = os.path.join(prefix, str(k) + ".npy")
        key_exists = utils.key_exists(bucket, key_name)
        if not key_exists:
            img = cds.load_image(k, size='original', verbose=False)
            img = skimage.transform.resize(img, FEATURIZE_SIZE,
                                           preserve_range=True)
            to_featurize.append(img)
            to_featurize_keys.append(k)
            i += 1
            print('Got candidate {}'.format(i))
    end = timer()
    print(f"Took {end-start} seconds to get remaining candidates.")
    print('Beginning featurization of {} items'.format(len(to_featurize_keys)))
    if len(to_featurize) > 0:
        to_featurize = np.stack(to_featurize, axis=0)
        print(f"input shape {to_featurize.shape}")
        batch_size = min(len(to_featurize), batch_size)
        features = featurize.vgg16_features(to_featurize, batch_size=batch_size)
        print(f"features shape {features.shape}")
        for i, f in enumerate(features):
            key_name = os.path.join(prefix, to_featurize_keys[i]+".npy")
            bio = io.BytesIO()
            np.save(bio, f)
            print("writing key {0}".format(key_name))
            utils.put_s3_object_bytes_with_backoff(bio.getvalue(), key_name)
    print(f"Took {end-start} seconds to get remaining candidates.")
Example #9
    def __init__(self, config: object, session):
        if config:
            if key_exists(TAGS, config):
                self.__tags = config[TAGS]
            if key_exists(AMOUNT, config):
                self.__amount = config[AMOUNT]
            if key_exists(SKIP_TOP_POSTS, config):
                self.__skip_top_posts = config[SKIP_TOP_POSTS]
            if key_exists(RANDOMIZE, config):
                self.__randomize = config[RANDOMIZE]
            if key_exists(INTERACT, config):
                self.__interact = config[INTERACT]
            if key_exists(MEDIA, config):
                self.__media = config[MEDIA]
            if key_exists(USE_SMART_HASHTAGS, config):
                self.__use_smart_hashtags = config[USE_SMART_HASHTAGS]
            if key_exists(USE_SMART_LOCATION_HASHTAGS, config):
                self.__use_smart_location_hashtags = config[
                    USE_SMART_LOCATION_HASHTAGS]

        if not config or key_not_exists(TAGS, config):
            sys.exit('[ERROR]: tags not found.')
        self.__session = session
        self.__follow_by_tags()
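key_not_exists appears only in this example; given how it is used alongside key_exists, a one-line sketch:

def key_not_exists(key, config):
    # Negation of the key_exists helper sketched after Example #1.
    return not key_exists(key, config)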
Example #10
def compute_nearest_neighbors(distance_measures, candidate_filenames,
                              reference_filenames, top_k, window_size, cache,
                              cache_root):
    cache_key = compute_hash(distance_measures, candidate_filenames,
                             reference_filenames, top_k, window_size)
    full_key = f"{cache_root}/{cache_key}"
    timing_info = {}
    if cache:
        if utils.key_exists(BUCKET, full_key):
            load_start = timer()
            ret_value = pickle.loads(
                utils.get_s3_object_bytes_with_backoff(full_key)[0])
            load_end = timer()
            compute_start = compute_end = timer()
            timing_info['load_start'] = load_start
            timing_info['load_end'] = load_end
            timing_info['compute_start'] = compute_start
            timing_info['compute_end'] = compute_end
            timing_info['cached'] = True
            return ret_value, timing_info

    imgnt = imagenet.ImageNetData(cache_on_local_disk=True,
                                  verbose=False,
                                  cache_root_path='/tmp/imagenet2_cache')
    cds = candidate_data.CandidateData(cache_on_local_disk=True,
                                       load_metadata_from_s3=True,
                                       verbose=False,
                                       cache_root_path='/tmp/imagenet2_cache')
    loader = image_loader.ImageLoader(imgnt,
                                      cds,
                                      cache_on_local_disk=True,
                                      num_tries=4,
                                      cache_root_path='/tmp/imagenet2_cache')
    load_start = timer()
    if ('l2' in distance_measures) or ('dssim' in distance_measures):
        candidate_image_dict = loader.load_image_batch(candidate_filenames,
                                                       size='scaled_256',
                                                       force_rgb=True,
                                                       verbose=False)
        reference_image_dict = loader.load_image_batch(reference_filenames,
                                                       size='scaled_256',
                                                       force_rgb=True,
                                                       verbose=False)
    if 'fc7' in distance_measures:
        candidate_feature_dict = loader.load_features_batch(
            candidate_filenames, verbose=False)
        reference_feature_dict = loader.load_features_batch(
            reference_filenames, verbose=False)
    load_end = timer()

    compute_start = timer()
    result = {}
    for distance_measure in distance_measures:
        if distance_measure == 'l2':
            result['l2'] = compute_l2_distances(candidate_image_dict,
                                                reference_image_dict, 196608)
        elif distance_measure == 'dssim':
            result['dssim'] = compute_dssim_distances(candidate_image_dict,
                                                      reference_image_dict,
                                                      window_size)
        elif distance_measure == 'fc7':
            result['fc7'] = compute_l2_distances(candidate_feature_dict,
                                                 reference_feature_dict, 4096)
        else:
            raise ValueError('Unknown distance measure')
    compute_end = timer()
    timing_info['load_start'] = load_start
    timing_info['load_end'] = load_end
    timing_info['compute_start'] = compute_start
    timing_info['compute_end'] = compute_end
    timing_info['cached'] = False

    res = compute_top_k(result, top_k)
    if cache:
        utils.put_s3_object_bytes_with_backoff(pickle.dumps(res), full_key)

    return res, timing_info
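put_s3_object_bytes_with_backoff and get_s3_object_bytes_with_backoff are not shown here; the retry loop inlined in Example #12's write_image_to_s3 suggests their shape, and the [0] index above implies the get side returns a tuple whose first element is the payload. A sketch of the write side, assuming a module-level BUCKET default and a capped exponential backoff (max_tries is an illustrative parameter):

import time
from botocore.exceptions import ClientError

def put_s3_object_bytes_with_backoff(data, key, bucket=BUCKET, max_tries=5):
    # Sketch: retry put_object with a doubling sleep, as in Example #12.
    client = utils.get_s3_client()
    backoff = 1
    for attempt in range(max_tries):
        try:
            client.put_object(Key=key, Bucket=bucket, Body=data)
            return
        except ClientError:
            if attempt == max_tries - 1:
                raise
            time.sleep(backoff)
            backoff *= 2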
Example #11
    def __init__(self, config: object, session):
        if key_exists(UNFOLLOW, config):
            unfollow = config[UNFOLLOW]
            if key_exists(WHITE_LIST, unfollow):
                self.__set_white_list(unfollow[WHITE_LIST])
            if key_exists(ENABLE, unfollow):
                self.__enable = unfollow[ENABLE]
            if key_exists(AMOUNT, unfollow):
                self.__set_amount(unfollow[AMOUNT])
            if key_exists(CUSTOM_LIST_ENABLED, unfollow):
                self.__set_custom_list_enabled(unfollow[CUSTOM_LIST_ENABLED])
            if key_exists(CUSTOM_LIST, unfollow):
                self.__set_custom_list(unfollow[CUSTOM_LIST])
            if key_exists(CUSTOM_LIST_PARAM, unfollow):
                self.__set_custom_list_param(unfollow[CUSTOM_LIST_PARAM])
            if key_exists(INSTAPY_FOLLOWED_ENABLED, unfollow):
                self.__set_instapy_followed_enabled(
                    unfollow[INSTAPY_FOLLOWED_ENABLED])
            if key_exists(INSTAPY_FOLLOWED_PARAM, unfollow):
                self.__set_instapy_followed_param(
                    unfollow[INSTAPY_FOLLOWED_PARAM])
            if key_exists(NON_FOLLOWERS, unfollow):
                self.__set_non_followers(unfollow[NON_FOLLOWERS])
            if key_exists(ALL_FOLLOWING, unfollow):
                self.__set_all_following(unfollow[ALL_FOLLOWING])
            if key_exists(STYLE, unfollow):
                self.__set_style(unfollow[STYLE])
            if key_exists(UNFOLLOW_AFTER, unfollow):
                self.__set_unfollow_after(unfollow[UNFOLLOW_AFTER])
            if key_exists(DELAY_FOLLOWBACKERS, unfollow):
                self.__set_delay_followbackers(unfollow[DELAY_FOLLOWBACKERS])
            if key_exists(SLEEP_DELAY, unfollow):
                self.__set_sleep_delay(unfollow[SLEEP_DELAY])
        self.__session = session
        self.__unfollow()
Example #12
def make_tweet_data(group):
    tweet_data = []
    for s in group["statuses"]:
        count = s["retweet_count"] + s["favorite_count"]
        date = pd.to_datetime(s["created_at"], errors='coerce').tz_localize('UTC').tz_convert('America/Los_Angeles')
        date = int(round(date.timestamp() * 1000))
        if "name_top" in s["satellite_enhanced"]:
            screen_name = s["satellite_enhanced"]["name_top"]["screen_name"]
        else:
            screen_name = s["user"]["id_str"]
        if utils.key_exists(["satellite_enhanced", "name_top", "profile_image_url"], s):
            profile_image_url = s["satellite_enhanced"]["name_top"]["profile_image_url"]
        else:
            profile_image_url = None
        clean_full_text = utils.clean_one(s)
        clean_full_text = utils.clean_title(clean_full_text)
        row = {
            "count": count,
            'date': date,
            'text': clean_full_text,
            'user_id': s["user"]["id_str"],
            'tweet_id': s['id_str'],
            'screen_name': screen_name,
            'profile_image_url': profile_image_url
        }
        tweet_data.append(row)
    tweet_data_sorted = sorted(tweet_data, key=lambda item: item['date']) 
    return tweet_data_sorted
def write_image_to_s3(img_dict, bucket, prefix):
    t = time.time()
    url = img_dict['url']
    ext = url.split(".")[-1]
    key = img_dict["id_ours"]
    mturk_key = "{2}_mturk/{0}.{1}".format(key, ext, prefix)
    if utils.key_exists(bucket, mturk_key):
        return img_dict
    gcontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    img_bytes = urllib.request.urlopen(url, context=gcontext).read()
    pil_image = Image.open(io.BytesIO(img_bytes))
    rotated_img, _ = fix_orientation(pil_image)
    np_image = np.array(rotated_img)
    np_image = utils.make_rgb(np_image)
    try:
        image_ndc = skimage.transform.resize(np_image, (NDC_SIZE, NDC_SIZE),
                                             preserve_range=True)
    except MemoryError:
        raise Exception(f"Image {img_dict} memory error")

    bigger_side = max(np_image.shape)
    scale_fac = MTURK_RESCALE_SIZE / bigger_side
    image_mturk = skimage.transform.rescale(np_image,
                                            scale=scale_fac,
                                            preserve_range=True)

    bio_mturk = io.BytesIO()
    bio_orig = io.BytesIO()
    bio_ndc = io.BytesIO()

    imageio.imwrite(uri=bio_orig, im=np_image, format="jpg", quality=90)
    try:
        imageio.imwrite(uri=bio_mturk,
                        im=image_mturk,
                        format="jpg",
                        quality=90)
    except Exception:
        raise Exception(f"Image {img_dict} error")

    imageio.imwrite(uri=bio_ndc, im=image_ndc, format="jpg", quality=90)

    client = utils.get_s3_client()
    ext = "jpg"
    backoff = 1
    while True:
        try:
            client.put_object(Key="{2}_scaled/{0}.{1}".format(
                key, ext, prefix),
                              Bucket=bucket,
                              Body=bio_ndc.getvalue())
            client.put_object(Key="{2}_original/{0}.{1}".format(
                key, ext, prefix),
                              Bucket=bucket,
                              Body=bio_orig.getvalue())
            client.put_object(Key="{2}_mturk/{0}.{1}".format(key, ext, prefix),
                              Bucket=bucket,
                              Body=bio_mturk.getvalue())
            break
        except Exception:
            time.sleep(backoff)
            backoff *= 2
    e = time.time()
    print("One image took ", e - t)
    img_dict["width"] = np_image.shape[1]
    img_dict["height"] = np_image.shape[0]
    return img_dict
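Neither function's caller appears in this section. A hypothetical driver (name and loop are illustrative, not from the source) showing how quick_image_check from Example #6 and write_image_to_s3 compose:

def upload_new_images(img_dicts, bucket, prefix):
    # Skip images whose mturk rendition already exists, upload the rest.
    uploaded = []
    for img_dict in img_dicts:
        if quick_image_check(img_dict, bucket, prefix):
            continue
        uploaded.append(write_image_to_s3(img_dict, bucket, prefix))
    return uploaded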