def main():
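    # NOTE (assumption): this snippet relies on the repo's shared imports and helpers
    # being in scope -- os, time, json, pickle, numpy as np, datetime/timedelta, plus
    # Timer, intify, obj2str and the constants T (number of days) and MAX_POSITION.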
    # == == == == == == == == Part 1: Set up experiment parameters == == == == == == == == #
    total_start_time = time.time()

    data_prefix = '../data/'
    forecast_filepath = 'vevo_forecast_data_60k.tsv'
    recsys_dirpath = 'recsys'
    snapshot_dirpath = 'network_pickle'

    if not os.path.exists(os.path.join(data_prefix, snapshot_dirpath)):
        os.mkdir(os.path.join(data_prefix, snapshot_dirpath))

    # == == == == == == Part 2: Load vevo en videos 61k dataset == == == == == == #
    vid_embed_dict = {}
    vid_view_dict = {}
    with open(os.path.join(data_prefix, forecast_filepath), 'r') as fin:
        for line in fin:
            embed, vid, ts_view, total_view = line.rstrip().split('\t')
            vid_embed_dict[vid] = int(embed)
            ts_view = np.array(intify(ts_view.split(',')))
            vid_view_dict[vid] = ts_view
    vevo_en_vid_list = list(sorted(vid_embed_dict.keys()))
    vevo_en_vid_set = set(vevo_en_vid_list)
    num_videos = len(vevo_en_vid_list)

    for t in range(T):
        timer = Timer()
        timer.start()

        target_date_str = obj2str(datetime(2018, 9, 1) + timedelta(days=t))
        recsys_filepath = 'recsys_{0}.json'.format(target_date_str)
        snapshot_filepath = 'network_{0}.p'.format(target_date_str)
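        # network_mat maps each target embed to a list of
        # (source embed, recommendation position, source's views on day t) triples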
        network_mat = {embed: [] for embed in range(num_videos)}

        with open(os.path.join(data_prefix, recsys_dirpath, recsys_filepath), 'r') as fin:
            for line in fin:
                network_json = json.loads(line.rstrip())
                source = network_json['vid']
                targets = network_json['relevant_list'][: MAX_POSITION]
                for position, target in enumerate(targets):
                    if target in vevo_en_vid_set:
                        # record the source embed, the target's position in the source's relevant list,
                        # and the source's views on day t
                        network_mat[vid_embed_dict[target]].append((vid_embed_dict[source], position, vid_view_dict[source][t]))

        with open(os.path.join(data_prefix, snapshot_dirpath, snapshot_filepath), 'wb') as fout:
            pickle.dump(network_mat, fout)

        print('>>> Finish dumping date {0}'.format(target_date_str))
        timer.stop()

    print('>>> Network structure has been dumped!')
    print('>>> Total elapsed time: {0}\n'.format(str(timedelta(seconds=time.time() - total_start_time))[:-3]))
    def _extract_tweet(self, filequeue):
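        # NOTE (assumption): self.tweet_stats_path, self.user_stats_path and self.logger
        # are initialised elsewhere in the class; bz2, os and json are imported at module level.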
        while not filequeue.empty():
            filepath = filequeue.get()
            try:
                filedata = bz2.BZ2File(filepath, mode='r')
            except Exception:
                self.logger.warning(
                    'Non-bz2 file {0} found in dataset folder'.format(filepath))
                continue
            filename, filetype = os.path.basename(
                os.path.normpath(filepath)).split('.')

            tweet_output = bz2.open(
                os.path.join(self.tweet_stats_path,
                             '{0}.bz2'.format(filename)), 'at')
            user_output = bz2.open(
                os.path.join(self.user_stats_path, '{0}.bz2'.format(filename)),
                'at')

            visited_user_ids = set()
            for line in filedata:
                try:
                    if line.rstrip():
                        tweet_json = json.loads(line)

                        # 3. ratemsg, timestamp_ms, track
                        if 'limit' in tweet_json:
                            # rate limit message
                            # {"limit":{"track":283540,"timestamp_ms":"1483189188944"}}
                            tweet_output.write('{0},{1},{2}\n'.format(
                                'ratemsg', tweet_json['limit']['timestamp_ms'],
                                tweet_json['limit']['track']))
                            continue

                        if 'id_str' not in tweet_json:
                            continue

                        # 1. tweet_id_str, created_at, timestamp_ms, user_id_str,
                        #    original_lang, retweeted_lang, quoted_lang,
                        #    original_vids, retweeted_vids, quoted_vids,
                        #    original_mentions, retweeted_mentions, quoted_mentions,
                        #    original_hashtags, retweeted_hashtags, quoted_hashtags,
                        #    original_geoname, retweeted_geoname, quoted_geoname,
                        #    original_countrycode, retweeted_countrycode, quoted_countrycode,
                        #    original_filter, retweeted_filter, quoted_filter,
                        #    original_retweet_count, retweeted_retweet_count, quoted_retweet_count,
                        #    original_favorite_count, retweeted_favorite_count, quoted_favorite_count,
                        #    original_user_followers_count, retweeted_user_followers_count, quoted_user_followers_count,
                        #    original_user_friends_count, retweeted_user_friends_count, quoted_user_friends_count,
                        #    original_user_statuses_count, retweeted_user_statuses_count, quoted_user_statuses_count,
                        #    original_user_favourites_count, retweeted_user_favourites_count, quoted_user_favourites_count,
                        #    reply_tweet_id_str, retweeted_tweet_id_str, quoted_tweet_id_str,
                        #    reply_user_id_str, retweeted_user_id_str, quoted_user_id_str,
                        #    original_text, retweeted_text, quoted_text
                        tweet_id = tweet_json['id_str']
                        created_at = obj2str(str2obj(tweet_json['created_at'],
                                                     fmt='tweet'),
                                             fmt='youtube')
                        timestamp_ms = tweet_json['timestamp_ms']
                        user_id_str = tweet_json['user']['id_str']
                        if 'lang' in tweet_json:
                            lang = tweet_json['lang']
                        else:
                            lang = 'N'

                        original_vids, retweeted_vids, quoted_vids = self._extract_vids(
                            tweet_json)
                        original_mentions, retweeted_mentions, quoted_mentions = self._extract_mentions(
                            tweet_json)
                        original_hashtags, retweeted_hashtags, quoted_hashtags = self._extract_hashtags(
                            tweet_json)

                        if tweet_json['place'] is not None:
                            original_geo = self._replace_comma_space(
                                tweet_json['place']['full_name'])
                            original_cc = self._replace_comma_space(
                                tweet_json['place']['country_code'])
                        else:
                            original_geo = 'N'
                            original_cc = 'N'

                        original_filter = tweet_json['filter_level']

                        original_retweet_count = tweet_json['retweet_count']
                        original_favorite_count = tweet_json['favorite_count']

                        original_user_followers_count = tweet_json['user'][
                            'followers_count']
                        original_user_friends_count = tweet_json['user'][
                            'friends_count']
                        original_user_statuses_count = tweet_json['user'][
                            'statuses_count']
                        original_user_favourites_count = tweet_json['user'][
                            'favourites_count']

                        reply_tweet_id_str = self._replace_with_nan(
                            tweet_json['in_reply_to_status_id_str'])
                        reply_user_id_str = self._replace_with_nan(
                            tweet_json['in_reply_to_user_id_str'])

                        if 'extended_tweet' in tweet_json and 'full_text' in tweet_json[
                                'extended_tweet']:
                            text = self._replace_comma_space(
                                tweet_json['extended_tweet']['full_text'])
                        elif tweet_json['text'] is not None:
                            text = self._replace_comma_space(
                                tweet_json['text'])
                        else:
                            text = 'N'

                        retweeted_tweet_id_str, retweeted_user_id_str, retweeted_user_location,\
                        retweeted_lang, retweeted_geo, retweeted_cc, retweeted_filter, \
                        retweeted_retweet_count, retweeted_favorite_count, retweeted_user_followers_count, \
                        retweeted_user_friends_count, retweeted_user_statuses_count, retweeted_user_favourites_count, \
                        retweeted_text = self._extract_entities(tweet_json, 'retweeted_status')

                        quoted_tweet_id_str, quoted_user_id_str, quoted_user_location,\
                        quoted_lang, quoted_geo, quoted_cc, quoted_filter, \
                        quoted_retweet_count, quoted_favorite_count, quoted_user_followers_count, \
                        quoted_user_friends_count, quoted_user_statuses_count, quoted_user_favourites_count, \
                        quoted_text = self._extract_entities(tweet_json, 'quoted_status')

                        tweet_output.write(
                            '{0},{1},{2},{3},'
                            '{4},{5},{6},'
                            '{7},{8},{9},'
                            '{10},{11},{12},'
                            '{13},{14},{15},'
                            '{16},{17},{18},'
                            '{19},{20},{21},'
                            '{22},{23},{24},'
                            '{25},{26},{27},'
                            '{28},{29},{30},'
                            '{31},{32},{33},'
                            '{34},{35},{36},'
                            '{37},{38},{39},'
                            '{40},{41},{42},'
                            '{43},{44},{45},'
                            '{46},{47},{48},'
                            '{49},{50},{51}\n'.format(
                                tweet_id, created_at, timestamp_ms,
                                user_id_str, lang, retweeted_lang, quoted_lang,
                                strify(original_vids, delimiter=';'),
                                strify(retweeted_vids, delimiter=';'),
                                strify(quoted_vids, delimiter=';'),
                                strify(original_mentions, delimiter=';'),
                                strify(retweeted_mentions, delimiter=';'),
                                strify(quoted_mentions, delimiter=';'),
                                strify(original_hashtags, delimiter=';'),
                                strify(retweeted_hashtags, delimiter=';'),
                                strify(quoted_hashtags, delimiter=';'),
                                original_geo, retweeted_geo, quoted_geo,
                                original_cc, retweeted_cc, quoted_cc,
                                original_filter, retweeted_filter,
                                quoted_filter, original_retweet_count,
                                retweeted_retweet_count, quoted_retweet_count,
                                original_favorite_count,
                                retweeted_favorite_count,
                                quoted_favorite_count,
                                original_user_followers_count,
                                retweeted_user_followers_count,
                                quoted_user_followers_count,
                                original_user_friends_count,
                                retweeted_user_friends_count,
                                quoted_user_friends_count,
                                original_user_statuses_count,
                                retweeted_user_statuses_count,
                                quoted_user_statuses_count,
                                original_user_favourites_count,
                                retweeted_user_favourites_count,
                                quoted_user_favourites_count,
                                reply_tweet_id_str, retweeted_tweet_id_str,
                                quoted_tweet_id_str, reply_user_id_str,
                                retweeted_user_id_str, quoted_user_id_str,
                                text, retweeted_text, quoted_text))

                        # 2. user_id_str, screen_name, created_at, verified, location, followers_count, friends_count, listed_count, statuses_count, description
                        if user_id_str not in visited_user_ids:
                            user_screen_name, user_created_at, user_verified, user_location, user_followers_count, user_friends_count, user_listed_count, user_statuses_count, user_description = self._extract_user_entities(
                                tweet_json['user'])
                            user_output.write(
                                '{0},{1},{2},{3},{4},{5},{6},{7},{8},{9}\n'.
                                format(user_id_str, user_screen_name,
                                       user_created_at, user_verified,
                                       user_location, user_followers_count,
                                       user_friends_count, user_listed_count,
                                       user_statuses_count, user_description))
                            visited_user_ids.add(user_id_str)

                        if 'retweeted_status' in tweet_json:
                            ruser_id_str = tweet_json['retweeted_status'][
                                'user']['id_str']
                            if ruser_id_str not in visited_user_ids:
                                ruser_screen_name, ruser_created_at, ruser_verified, ruser_location, ruser_followers_count, ruser_friends_count, ruser_listed_count, ruser_statuses_count, ruser_description = self._extract_user_entities(
                                    tweet_json['retweeted_status']['user'])
                                user_output.write(
                                    '{0},{1},{2},{3},{4},{5},{6},{7},{8},{9}\n'
                                    .format(ruser_id_str, ruser_screen_name,
                                            ruser_created_at, ruser_verified,
                                            ruser_location,
                                            ruser_followers_count,
                                            ruser_friends_count,
                                            ruser_listed_count,
                                            ruser_statuses_count,
                                            ruser_description))
                                visited_user_ids.add(ruser_id_str)

                        if 'quoted_status' in tweet_json:
                            quser_id_str = tweet_json['quoted_status']['user'][
                                'id_str']
                            if quser_id_str not in visited_user_ids:
                                quser_screen_name, quser_created_at, quser_verified, quser_location, quser_followers_count, quser_friends_count, quser_listed_count, quser_statuses_count, quser_description = self._extract_user_entities(
                                    tweet_json['quoted_status']['user'])
                                user_output.write(
                                    '{0},{1},{2},{3},{4},{5},{6},{7},{8},{9}\n'
                                    .format(quser_id_str, quser_screen_name,
                                            quser_created_at, quser_verified,
                                            quser_location,
                                            quser_followers_count,
                                            quser_friends_count,
                                            quser_listed_count,
                                            quser_statuses_count,
                                            quser_description))
                                visited_user_ids.add(quser_id_str)

                except EOFError:
                    self.logger.error(
                        'EOFError: {0} ended before the logical end-of-stream was detected.'
                        .format(filename))

            tweet_output.close()
            user_output.close()
            filedata.close()
            self.logger.debug('{0} done!'.format(filename))
            print('{0} done!'.format(filename))
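
    # NOTE (assumption): the two helpers below are not part of this snippet; they are
    # minimal sketches of what the calls above expect (flattening values into single
    # CSV columns), shown only for readability. The real class supplies its own versions.
    def _replace_comma_space(self, text):
        # flatten commas/newlines so a value fits in one comma-separated column
        if not text:
            return 'N'
        return text.replace(',', ' ').replace('\n', ' ').replace('\r', ' ')

    def _replace_with_nan(self, value):
        # map missing values to a placeholder ('N' here, following the convention used above)
        return 'N' if value is None else value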
def main():
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_avg_view_dict = data_loader.embed_avg_view_dict
    num_videos = data_loader.num_videos
    data_loader.load_embed_content_dict()
    embed_cid_dict = data_loader.embed_cid_dict
    embed_genre_dict = data_loader.embed_genre_dict

    # == == == == == == Part 3: Load dynamic network snapshot == == == == == == #
    network_dict_list = []
    for t in range(T):
        target_date_str = obj2str(datetime(2018, 9, 1) + timedelta(days=t))
        filename = 'network_{0}.p'.format(target_date_str)
        with open(os.path.join(data_prefix, 'network_pickle', filename), 'rb') as fin:
            network_dict = pickle.load(fin)
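        # keep only sources that recommend this video within the top NUM_REL positions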
        for embed in network_dict:
            network_dict[embed] = [
                x[0] for x in network_dict[embed] if x[1] < NUM_REL
            ]
        network_dict_list.append(network_dict)

    persistent_src_embed_set = set()
    persistent_tar_embed_set = set()
    existing_edges = set()
    num_reciprocal_edges = 0
    num_same_artist = 0
    num_same_genre = 0

    with open(os.path.join(data_prefix, 'persistent_network.csv'),
              'w') as fout:
        fout.write('Source,Target\n')

        for tar_embed in range(num_videos):
            src_union_set = set()
            for t in range(T):
                src_union_set.update(set(network_dict_list[t][tar_embed]))

            for src_embed in src_union_set:
                linkage_list = [0] * T
                for t in range(T):
                    if src_embed in network_dict_list[t][tar_embed]:
                        linkage_list[t] = 1
                if is_persistent_link(linkage_list):
                    # filter: at least 100 daily views for target video,
                    # and the mean daily views of source video is at least 1% of the target video
                    src_mean = embed_avg_view_dict[src_embed]
                    tar_mean = embed_avg_view_dict[tar_embed]

                    if tar_mean >= 100 and src_mean >= 0.01 * tar_mean:
                        fout.write('{0},{1}\n'.format(src_embed, tar_embed))
                        persistent_src_embed_set.add(src_embed)
                        persistent_tar_embed_set.add(tar_embed)
                        if '{1}-{0}'.format(src_embed,
                                            tar_embed) in existing_edges:
                            num_reciprocal_edges += 1
                        if embed_cid_dict[src_embed] == embed_cid_dict[
                                tar_embed]:
                            num_same_artist += 1
                        if is_same_genre(embed_genre_dict[src_embed],
                                         embed_genre_dict[tar_embed]):
                            num_same_genre += 1
                        existing_edges.add('{0}-{1}'.format(
                            src_embed, tar_embed))

    print('{0} edges in the persistent network'.format(len(existing_edges)))
    print(
        '{0} source videos, {1} target videos, {2} videos appear in both sets'.
        format(
            len(persistent_src_embed_set), len(persistent_tar_embed_set),
            len(persistent_src_embed_set.intersection(
                persistent_tar_embed_set))))
    print('{0} pairs of reciprocal edges'.format(num_reciprocal_edges))
    print('{0} ({1:.1f}%) edges connect videos by the same artist'.format(
        num_same_artist, 100 * num_same_artist / len(existing_edges)))
    print('{0} ({1:.1f}%) edges connect videos of the same genre'.format(
        num_same_genre, 100 * num_same_genre / len(existing_edges)))

    timer.stop()
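

# Hypothetical sketches of the two predicates used above; the repo supplies the real
# implementations. They are shown only to make the filtering criteria concrete.
def is_persistent_link(linkage_list):
    # assumption: a link is persistent if it appears in every one of the T daily snapshots
    return all(x == 1 for x in linkage_list)


def is_same_genre(genres_a, genres_b):
    # two videos share a genre if their genre lists overlap
    return len(set(genres_a) & set(genres_b)) > 0
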
def main():
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data'

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict
    num_videos = data_loader.num_videos
    total_views = []

    # == == == == == == Part 3: Load network snapshot as cutoff value changes == == == == == == #
    for t in range(T):
        total_views.append(
            sum([embed_view_dict[embed][t] for embed in range(num_videos)]))

        snapshot_date = obj2str(datetime(2018, 9, 1) + timedelta(days=t))
        snapshot_filename = 'network_{0}.p'.format(snapshot_date)
        nodes_set = set()
        num_edges = 0
        embedded_graph = defaultdict(list)
        with open(
                os.path.join(data_prefix, 'network_pickle', snapshot_filename),
                'rb') as fin:
            network_dict = pickle.load(fin)
            # embed_tar: [(embed_src, pos_src, view_src)]
            for embed_tar in range(num_videos):
                for embed_src, pos_src, _ in network_dict[embed_tar]:
                    if pos_src < CUTOFF:
                        embedded_graph[embed_src].append(embed_tar)
                        nodes_set.add(embed_src)
                        nodes_set.add(embed_tar)
                        num_edges += 1

        logging.info('>>> Graph embedding @ date {0} has been loaded!'.format(
            snapshot_date))
        logging.info('>>> {0} nodes and {1} edges in the graph'.format(
            len(nodes_set), num_edges))
        logging.info('    {0} views throughout the graph'.format(
            total_views[t]))

        # == == == == == == Part 4: Extract bow-tie structure == == == == == == #
        scc_content = tarjan(embedded_graph)
        scc_content = sorted(scc_content, key=lambda x: len(x), reverse=True)

        # largest SCC
        largest_scc = scc_content.pop(0)
        logging.info('>>> {0} ({1:.2f}%) nodes in the largest SCC'.format(
            len(largest_scc),
            len(largest_scc) / num_videos * 100))
        largest_scc_views = sum(
            [embed_view_dict[embed][t] for embed in largest_scc])
        logging.info('    {0} ({1:.2f}%) views in the largest SCC'.format(
            largest_scc_views, largest_scc_views / total_views[t] * 100))

        # find IN, OUT, Tendrils, Disconnected
        in_component = []
        num_scc_in = 0
        to_visit_scc = []
        for scc in scc_content:
            ret = is_in_component(scc, embedded_graph, largest_scc)
            if ret:
                in_component.extend(scc)
                num_scc_in += 1
            else:
                to_visit_scc.append(scc)
        logging.info('>>> {0} ({1:.2f}%) nodes in the IN component'.format(
            len(in_component),
            len(in_component) / num_videos * 100))
        logging.info('    {0} scc in the IN component'.format(num_scc_in))
        in_views = sum([embed_view_dict[embed][t] for embed in in_component])
        logging.info('    {0} ({1:.2f}%) views in the IN component'.format(
            in_views, in_views / total_views[t] * 100))

        out_component = []
        num_scc_out = 0
        to_visit_scc2 = []
        for scc in to_visit_scc:
            ret = is_out_component(scc, embedded_graph, largest_scc)
            if ret:
                out_component.extend(scc)
                num_scc_out += 1
            else:
                to_visit_scc2.append(scc)
        logging.info('>>> {0} ({1:.2f}%) nodes in the OUT component'.format(
            len(out_component),
            len(out_component) / num_videos * 100))
        logging.info('    {0} scc in the OUT component'.format(num_scc_out))
        out_views = sum([embed_view_dict[embed][t] for embed in out_component])
        logging.info('    {0} ({1:.2f}%) views in the OUT component'.format(
            out_views, out_views / total_views[t] * 100))

        tendrils = []
        num_scc_tendrils = 0
        disconnected = []
        num_disconnected = num_videos - len(nodes_set)
        num_scc_disconnected = 0
        for scc in to_visit_scc2:
            ret = is_out_component(scc, embedded_graph, in_component)
            if ret:
                tendrils.extend(scc)
                num_scc_tendrils += 1
            else:
                ret = is_in_component(scc, embedded_graph, out_component)
                if ret:
                    tendrils.extend(scc)
                    num_scc_tendrils += 1
                else:
                    disconnected.extend(scc)
                    num_scc_disconnected += 1
        logging.info('>>> {0} ({1:.2f}%) nodes in the Tendrils'.format(
            len(tendrils),
            len(tendrils) / num_videos * 100))
        logging.info('    {0} scc in the Tendrils'.format(num_scc_tendrils))
        tendrils_views = sum([embed_view_dict[embed][t] for embed in tendrils])
        logging.info('    {0} ({1:.2f}%) views in the Tendrils'.format(
            tendrils_views, tendrils_views / total_views[t] * 100))

        logging.info('>>> {0} ({1:.2f}%) nodes in the Disconnected'.format(
            num_disconnected + len(disconnected),
            (num_disconnected + len(disconnected)) / num_videos * 100))
        logging.info(
            '    {0} scc in the Disconnected'.format(num_disconnected +
                                                     num_scc_disconnected))
        disc_views = (total_views[t] - largest_scc_views - in_views
                      - out_views - tendrils_views)
        logging.info('    {0} ({1:.2f}%) views in the Disconnected'.format(
            disc_views, disc_views / total_views[t] * 100))

        print('>>> Finish computing bowtie at day {0}...'.format(t + 1))

    timer.stop()
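

# Hypothetical sketches of the reachability tests used above; the repo provides the real
# ones. An SCC is treated as part of IN if any of its nodes links directly into the target
# component, and as part of OUT if any node of the target component links directly into it.
def is_in_component(scc, graph, component):
    component_set = set(component)
    return any(tar in component_set for node in scc for tar in graph[node])


def is_out_component(scc, graph, component):
    scc_set = set(scc)
    return any(tar in scc_set for node in component for tar in graph[node])
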
def main():
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'
    year_labels = [
        "all years", "'09", "'10", "'11", "'12", "'13", "'14", "'15", "'16",
        "'17", "'18"
    ]
    num_year = len(year_labels) - 1

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    data_loader.load_embed_content_dict()
    embed_avg_view_dict = data_loader.embed_avg_view_dict
    embed_uploadtime_dict = data_loader.embed_uploadtime_dict
    num_videos = data_loader.num_videos

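    # bucket each video by upload year; anything uploaded before 2009 is folded into the '09 bin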
    for embed in range(num_videos):
        upload_year = int(embed_uploadtime_dict[embed][:4])
        if upload_year >= 2009:
            year_idx = upload_year - 2009
        else:
            year_idx = 0
        embed_uploadtime_dict[embed] = year_idx

    views_by_years_list = [[] for _ in range(num_year)]
    indegrees_by_years_list = [[] for _ in range(num_year)]

    # == == == == == == Part 3: Load dynamic network snapshot == == == == == == #
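    # per-video daily indegree, counting only links that appear within the top NUM_REL_15
    # recommendation positions (presumably the top 15, hence the suffix)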
    embed_indegree_dict_15 = {
        embed: np.zeros((T, ))
        for embed in np.arange(num_videos)
    }
    for t in range(T):
        filename = 'network_{0}.p'.format(
            obj2str(datetime(2018, 9, 1) + timedelta(days=t)))
        with open(os.path.join(data_prefix, 'network_pickle', filename),
                  'rb') as fin:
            network_dict = pickle.load(fin)
            # embed_tar: [(embed_src, pos_src, view_src)]
            for embed in range(num_videos):
                embed_indegree_dict_15[embed][t] = len(
                    [1 for x in network_dict[embed] if x[1] < NUM_REL_15])
        print('>>> Finish loading day {0}...'.format(t + 1))
    print('>>> Network structure has been loaded!')

    for embed in range(num_videos):
        views_by_years_list[embed_uploadtime_dict[embed]].append(
            embed_avg_view_dict[embed])
        indegrees_by_years_list[embed_uploadtime_dict[embed]].append(
            np.mean(embed_indegree_dict_15[embed]))

    spearman_traces = []
    all_views, all_indegrees = [], []
    for i in range(num_year):
        all_views.extend(views_by_years_list[i])
        all_indegrees.extend(indegrees_by_years_list[i])
    print('\n>>> {0}'.format(year_labels[0]),
          spearmanr(all_views, all_indegrees))
    spearman_traces.append(spearmanr(all_views, all_indegrees)[0])
    for i in range(num_year):
        spearman_traces.append(
            spearmanr(views_by_years_list[i], indegrees_by_years_list[i])[0])
        print('>>> {0} year'.format(year_labels[1 + i]),
              spearmanr(views_by_years_list[i], indegrees_by_years_list[i]))

    # == == == == == == Part 4: Plotting script == == == == == == #
    fig, ax1 = plt.subplots(1, 1, figsize=(8, 2))
    tomato = ColorPalette.TOMATO
    blue = ColorPalette.BLUE

    bar1 = ax1.bar(range(num_year + 1),
                   spearman_traces,
                   edgecolor=['k'] * (num_year + 1),
                   color=[tomato] + [blue] * num_year,
                   lw=1)
    for rect in bar1:
        height = rect.get_height()
        plt.text(rect.get_x() + rect.get_width() / 2.0,
                 height,
                 '{0:.3f}'.format(height),
                 ha='center',
                 va='bottom')

    ax1.set_xticks(np.arange(11))
    ax1.set_xticklabels(year_labels)
    ax1.set_ylabel(r'spearman $\rho$')

    hide_spines(ax1)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/measure_spearmanr.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()
def main():
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict
    embed_avg_view_dict = data_loader.embed_avg_view_dict

    # == == == == == == Part 3: Load persistent and non-persistent network == == == == == == #
    reciprocal_link_set = set()
    persistent_link_set = set()
    non_persistent_link_set = set()

    with open(os.path.join(data_prefix, 'persistent_network.csv'), 'r') as fin:
        fin.readline()
        for line in fin:
            src_embed, tar_embed = map(int, line.rstrip().split(','))

            link = '{0}-{1}'.format(src_embed, tar_embed)
            rec_link = '{1}-{0}'.format(src_embed, tar_embed)
            if rec_link in persistent_link_set:
                persistent_link_set.remove(rec_link)
                reciprocal_link_set.add(link)
            else:
                persistent_link_set.add(link)

    for t in range(T):
        target_date_str = obj2str(datetime(2018, 9, 1) + timedelta(days=t))
        filename = 'network_{0}.p'.format(target_date_str)
        with open(os.path.join(data_prefix, 'network_pickle', filename), 'rb') as fin:
            network_dict = pickle.load(fin)
        for tar_embed in network_dict:
            src_embed_list = [x[0] for x in network_dict[tar_embed] if x[1] < NUM_REL]
            if len(src_embed_list) > 0:
                for src_embed in src_embed_list:
                    # filter: at least 100 daily views for target video,
                    # and the mean daily views of source video is at least 1% of the target video
                    if embed_avg_view_dict[tar_embed] >= 100 and embed_avg_view_dict[src_embed] >= 0.01 * embed_avg_view_dict[tar_embed]:
                        link = '{0}-{1}'.format(src_embed, tar_embed)
                        rec_link = '{1}-{0}'.format(src_embed, tar_embed)
                        if link not in persistent_link_set and rec_link not in persistent_link_set \
                                and link not in reciprocal_link_set and rec_link not in reciprocal_link_set \
                                and link not in non_persistent_link_set and rec_link not in non_persistent_link_set:
                            non_persistent_link_set.add(link)

    print('>>> Number of reciprocal links: {0}'.format(len(reciprocal_link_set)))
    print('>>> Number of persistent links (non-reciprocal): {0}'.format(len(persistent_link_set)))
    print('>>> Number of ephemeral links: {0}'.format(len(non_persistent_link_set)))

    for link_set, log_filename in zip([reciprocal_link_set, persistent_link_set, non_persistent_link_set],
                                      ['./reciprocal_pearsonr.log', './persistent_pearsonr.log',
                                       './ephemeral_pearsonr.log']):
        with open(log_filename, 'w') as log_file:
            for link in link_set:
                src_embed, tar_embed = map(int, link.split('-'))
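                # detsn is a repo helper (assumed to de-trend / de-seasonalize the daily
                # view series) so the correlation reflects co-fluctuation rather than scale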
                eff_size, pvalue = pearsonr(detsn(embed_view_dict[src_embed]), detsn(embed_view_dict[tar_embed]))
                log_file.write('{0},{1},{2},{3}\n'.format(src_embed, tar_embed, eff_size, pvalue))

    timer.stop()