Example 1
def create_edges(min_users, timespan, final_date, end):
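    # Exports the YouTube graph edge list one day at a time. MIN_DATE, SELECT_EDGES
    # and CREATE_YOUTUBE_GRAPH_EDGE are SQL templates presumably defined elsewhere
    # in the module. For each day between final_date and end, the edges for the
    # trailing timespan-day window are downloaded from Athena, bz2-compressed and
    # uploaded to S3 under a Hive-style partition path; the youtube_graph_edge
    # table is then recreated and its partitions repaired.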
    s3 = boto3.resource('s3')
    athena_db = AthenaDatabase(database='internet_scholar',
                               s3_output='internet-scholar-admin')
    min_date = athena_db.query_athena_and_get_result(
        query_string=MIN_DATE)['min_date']
    min_date = datetime.strptime(min_date, '%Y-%m-%d').date()
    initial_date = final_date - timedelta(days=timespan - 1)
    while final_date <= end:
        print('Edges - {}'.format(str(final_date)))
        if initial_date >= min_date:
            edges = athena_db.query_athena_and_download(
                query_string=SELECT_EDGES.format(
                    initial_date=str(initial_date),
                    final_date=str(final_date),
                    min_users=min_users),
                filename='edges.csv')
            compressed_file = compress(filename=edges, delete_original=True)
            s3_filename = "youtube_graph_edge/min_users={min_users}/" \
                          "timespan={timespan}/final_date={final_date}/edges.csv.bz2".format(
                min_users=min_users,
                timespan=timespan,
                final_date=str(final_date))
            s3.Bucket('internet-scholar').upload_file(str(compressed_file),
                                                      s3_filename)
        final_date = final_date + timedelta(days=1)
        initial_date = initial_date + timedelta(days=1)
    athena_db.query_athena_and_wait(
        query_string='drop table if exists youtube_graph_edge')
    athena_db.query_athena_and_wait(
        query_string=CREATE_YOUTUBE_GRAPH_EDGE.format(
            s3_data='internet-scholar'))
    athena_db.query_athena_and_wait(
        query_string='MSCK REPAIR TABLE youtube_graph_edge')
Example 2
    def export_twint(self, yesterday):
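        # Converts the two local twint SQLite result files into newline-delimited
        # JSON, bz2-compresses them, uploads them to S3 under reference_date
        # partitions, and rebuilds the twint_video_id and twint_screen_name Athena
        # tables. ATHENA_CREATE_TWINT_VIDEO_ID, ATHENA_CREATE_TWINT_SCREEN_NAME and
        # STRUCTURE_TWINT_ATHENA are presumably defined elsewhere in the module.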
        tweet_from_video_id = Path(
            Path(__file__).parent, 'tmp', 'tweet_from_video_id.sqlite')
        json_video_id_file = Path(
            Path(__file__).parent, 'tmp', 'twint_from_video_id.json')
        self.create_json_twint_file(source=tweet_from_video_id,
                                    destination=json_video_id_file)
        json_video_id_file_compressed = compress(json_video_id_file)
        tweet_from_screen_name = Path(
            Path(__file__).parent, 'tmp', 'tweet_from_screen_name.sqlite')
        json_screen_name_file = Path(
            Path(__file__).parent, 'tmp', 'twint_from_screen_name.json')
        self.create_json_twint_file(source=tweet_from_screen_name,
                                    destination=json_screen_name_file)
        json_screen_name_file_compressed = compress(json_screen_name_file)

        s3 = boto3.resource('s3')
        s3_filename = "twint_video_id/reference_date={}/twint_from_video_id.json.bz2".format(
            yesterday)
        s3.Bucket(self.s3_data).upload_file(str(json_video_id_file_compressed),
                                            s3_filename)

        s3_filename = "twint_screen_name/reference_date={}/twint_from_screen_name.json.bz2".format(
            yesterday)
        s3.Bucket(self.s3_data).upload_file(
            str(json_screen_name_file_compressed), s3_filename)

        athena_db = AthenaDatabase(database=self.athena_data,
                                   s3_output=self.s3_admin)
        athena_db.query_athena_and_wait(
            query_string="DROP TABLE twint_video_id")
        athena_db.query_athena_and_wait(
            query_string=ATHENA_CREATE_TWINT_VIDEO_ID.format(
                structure=STRUCTURE_TWINT_ATHENA, s3_bucket=self.s3_data))
        athena_db.query_athena_and_wait(
            query_string="MSCK REPAIR TABLE twint_video_id")

        athena_db.query_athena_and_wait(
            query_string="DROP TABLE twint_screen_name")
        athena_db.query_athena_and_wait(
            query_string=ATHENA_CREATE_TWINT_SCREEN_NAME.format(
                structure=STRUCTURE_TWINT_ATHENA, s3_bucket=self.s3_data))
        athena_db.query_athena_and_wait(
            query_string="MSCK REPAIR TABLE twint_screen_name")
Example 3
def create_gexf(min_users, timespan, final_date, end):
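    # Builds one dynamic GEXF file per day from the node, Louvain and edge CSVs
    # previously uploaded to S3. Static node attributes (ids 1 to 6) hold view,
    # subscriber and video counts; the dynamic attribute (id 7) holds the Louvain
    # cluster, timestamped with the resolution at which it was computed. The
    # resulting file is compressed and uploaded back to S3.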
    s3 = boto3.resource('s3')
    while final_date <= end:
        print('GEXF - {}'.format(str(final_date)))
        gexf = Element(
            'gexf', {
                'xmlns':
                "http://www.gexf.net/1.3",
                'version':
                "1.3",
                'xmlns:viz':
                "http://www.gexf.net/1.3/viz",
                'xmlns:xsi':
                "http://www.w3.org/2001/XMLSchema-instance",
                'xsi:schemaLocation':
                "http://www.gexf.net/1.3 http://www.gexf.net/1.3/gexf.xsd"
            })
        graph = SubElement(
            gexf, 'graph', {
                'mode': "dynamic",
                'defaultedgetype': "undirected",
                'timeformat': "double",
                'timerepresentation': "timestamp"
            })

        attributes = SubElement(graph, 'attributes', {
            'class': "node",
            'mode': "static"
        })
        SubElement(attributes, 'attribute', {
            'id': '1',
            'title': 'view_count',
            'type': 'long'
        })
        SubElement(attributes, 'attribute', {
            'id': '2',
            'title': 'cumulative_view_count',
            'type': 'long'
        })
        SubElement(attributes, 'attribute', {
            'id': '3',
            'title': 'subscriber_count',
            'type': 'long'
        })
        SubElement(attributes, 'attribute', {
            'id': '4',
            'title': 'cumulative_subscriber_count',
            'type': 'long'
        })
        SubElement(attributes, 'attribute', {
            'id': '5',
            'title': 'video_count',
            'type': 'long'
        })
        SubElement(attributes, 'attribute', {
            'id': '6',
            'title': 'cumulative_video_count',
            'type': 'long'
        })

        attributes = SubElement(graph, 'attributes', {
            'class': "node",
            'mode': "dynamic"
        })
        SubElement(attributes, 'attribute', {
            'id': '7',
            'title': 'cluster',
            'type': 'long'
        })
        nodes = SubElement(graph, 'nodes')
        edges = SubElement(graph, 'edges')

        s3_filename = "youtube_graph_node/min_users={min_users}/" \
                      "timespan={timespan}/final_date={final_date}/nodes.csv.bz2".format(
            min_users=min_users,
            timespan=timespan,
            final_date=str(final_date))
        s3.Bucket('internet-scholar').download_file(s3_filename,
                                                    './nodes.csv.bz2')
        nodes_file = decompress(filename='./nodes.csv.bz2')
        with open(nodes_file, newline='', encoding="utf8") as csv_reader:
            reader = csv.DictReader(csv_reader)
            dict_attvalues = dict()
            for node_record in reader:
                node = SubElement(
                    nodes, 'node', {
                        'id': node_record['channel_id'],
                        'label': node_record['channel_title']
                    })
                dict_attvalues[node_record['channel_id']] = SubElement(
                    node, 'attvalues')
                SubElement(dict_attvalues[node_record['channel_id']],
                           'attvalue', {
                               'for': '1',
                               'value': node_record['view_count']
                           })
                SubElement(dict_attvalues[node_record['channel_id']],
                           'attvalue', {
                               'for': '2',
                               'value': node_record['cumulative_view_count']
                           })
                SubElement(dict_attvalues[node_record['channel_id']],
                           'attvalue', {
                               'for': '3',
                               'value': node_record['subscriber_count']
                           })
                SubElement(dict_attvalues[node_record['channel_id']],
                           'attvalue', {
                               'for': '4',
                               'value': node_record['cumulative_subscriber_count']
                           })
                SubElement(dict_attvalues[node_record['channel_id']],
                           'attvalue', {
                               'for': '5',
                               'value': node_record['video_count']
                           })
                SubElement(dict_attvalues[node_record['channel_id']],
                           'attvalue', {
                               'for': '6',
                               'value': node_record['cumulative_video_count']
                           })

        s3_filename = "youtube_graph_louvain/min_users={min_users}/" \
                      "timespan={timespan}/final_date={final_date}/louvain.csv.bz2".format(
            min_users=min_users,
            timespan=timespan,
            final_date=str(final_date))
        s3.Bucket('internet-scholar').download_file(s3_filename,
                                                    './louvain.csv.bz2')
        louvain_file = decompress(filename='./louvain.csv.bz2')
        with open(louvain_file, newline='', encoding="utf8") as csv_reader:
            reader = csv.DictReader(csv_reader)
            for louvain_record in reader:
                SubElement(
                    dict_attvalues[louvain_record['channel_id']], 'attvalue', {
                        'for': '7',
                        'value': louvain_record['cluster'],
                        'timestamp': louvain_record['resolution']
                    })

        s3_filename = "youtube_graph_edge/min_users={min_users}/" \
                      "timespan={timespan}/final_date={final_date}/edges.csv.bz2".format(
            min_users=min_users,
            timespan=timespan,
            final_date=str(final_date))
        s3.Bucket('internet-scholar').download_file(s3_filename,
                                                    './edges.csv.bz2')
        edges_file = decompress(filename='./edges.csv.bz2')
        with open(edges_file, newline='', encoding="utf8") as csv_reader:
            reader = csv.DictReader(csv_reader)
            for edge_record in reader:
                SubElement(
                    edges, 'edge', {
                        'source': edge_record['source_id'],
                        'target': edge_record['target_id'],
                        'Weight': edge_record['Weight']
                    })

        with open('./network.gexf', 'wb') as f:
            f.write(prettify_xml(gexf))
        compressed_gexf = compress(filename='./network.gexf')
        s3_filename = "youtube_graph_gexf/min_users={min_users}/" \
                      "timespan={timespan}/final_date={final_date}/network.gexf.bz2".format(min_users=min_users,
                                                                                            timespan=timespan,
                                                                                            final_date=str(final_date))
        s3.Bucket('internet-scholar').upload_file(str(compressed_gexf),
                                                  s3_filename)

        final_date = final_date + timedelta(days=1)
Example 4
def create_louvain(min_users, timespan, final_date, end):
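    # For each day, downloads the edge list for the trailing timespan-day window,
    # builds an undirected weighted graph with networkx and runs python-louvain
    # community detection for resolutions from 10.0 down to 0.1 in steps of 0.1.
    # Each cluster is relabeled with the node-list index of its smallest
    # channel_id, presumably to keep cluster ids comparable across resolutions.
    # The CSV is compressed, uploaded to S3 and exposed through the
    # youtube_graph_louvain table.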
    s3 = boto3.resource('s3')
    athena_db = AthenaDatabase(database='internet_scholar',
                               s3_output='internet-scholar-admin')
    while final_date <= end:
        print('Louvain - {}'.format(str(final_date)))
        edges = athena_db.query_athena_and_download(
            query_string=EDGES_LOUVAIN.format(final_date=str(final_date),
                                              min_users=min_users,
                                              timespan=timespan),
            filename='edges_louvain.csv')
        g = nx.Graph()
        with open(edges, newline='', encoding="utf8") as csv_reader:
            reader = csv.DictReader(csv_reader)
            for edge in reader:
                g.add_edge(edge['source_id'],
                           edge['target_id'],
                           weight=int(edge['weight']))

        with open('./louvain.csv', 'w', encoding="utf8") as csv_writer:
            writer = csv.DictWriter(csv_writer,
                                    fieldnames=[
                                        'resolution', 'channel_id', 'cluster',
                                        'graph_size', 'cluster_size',
                                        'cluster_count'
                                    ],
                                    dialect='unix')
            writer.writeheader()
            nodes = list(g)
            graph_size = len(nodes)
            for resolution in numpy.arange(10, 0, -0.1):
                partition = community.best_partition(g,
                                                     resolution=resolution,
                                                     randomize=False)
                cluster_count = len(set(partition.values()))
                for partition_number in set(partition.values()):
                    new_partition = list()
                    for channel_id in partition.keys():
                        if partition[channel_id] == partition_number:
                            new_partition.append(channel_id)
                    cluster_size = len(new_partition)
                    new_partition_number = nodes.index(min(new_partition))
                    for item in new_partition:
                        new_record = dict()
                        new_record['resolution'] = "{:.1f}".format(resolution)
                        new_record['channel_id'] = item
                        new_record['cluster'] = new_partition_number
                        new_record['graph_size'] = graph_size
                        new_record['cluster_size'] = cluster_size
                        new_record['cluster_count'] = cluster_count
                        writer.writerow(new_record)

        compressed_file = compress(filename='./louvain.csv',
                                   delete_original=True)
        s3_filename = "youtube_graph_louvain/min_users={min_users}/" \
                      "timespan={timespan}/final_date={final_date}/louvain.csv.bz2".format(
            min_users=min_users,
            timespan=timespan,
            final_date=str(final_date))
        s3.Bucket('internet-scholar').upload_file(str(compressed_file),
                                                  s3_filename)
        final_date = final_date + timedelta(days=1)
    athena_db.query_athena_and_wait(
        query_string='drop table if exists youtube_graph_louvain')
    athena_db.query_athena_and_wait(
        query_string=CREATE_YOUTUBE_GRAPH_LOUVAIN.format(
            s3_data='internet-scholar'))
    athena_db.query_athena_and_wait(
        query_string='MSCK REPAIR TABLE youtube_graph_louvain')
Example 5
    def collect_video_snippets(self):
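        # Downloads snippet metadata for every YouTube video referenced in the
        # Twitter stream (and in youtube_related_video, if that table exists) that
        # has not been processed yet. Developer keys are rotated on 403 errors;
        # connection resets and 503 responses are retried up to 10 times each.
        # Unavailable videos get a stub record. The results are written as
        # newline-delimited JSON, compressed, uploaded to S3 and exposed through
        # the youtube_video_snippet table.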
        logging.info("Start collecting video snippets")
        athena = AthenaDatabase(database=self.athena_data, s3_output=self.s3_admin)
        if not athena.table_exists("youtube_video_snippet"):
            select_twitter_stream_video = SELECT_TWITTER_STREAM_VIDEO
            select_youtube_related_video = SELECT_YOUTUBE_RELATED_VIDEO
        else:
            logging.info("Table youtube_video_snippet exists")
            select_twitter_stream_video = SELECT_TWITTER_STREAM_VIDEO + EXTRA_TWITTER_STREAM_VIDEO
            select_youtube_related_video = SELECT_YOUTUBE_RELATED_VIDEO + EXTRA_YOUTUBE_RELATED_VIDEO
        queries = [select_twitter_stream_video]
        if athena.table_exists("youtube_related_video"):
            queries.append(select_youtube_related_video)
        query = " union all ".join(queries)
        query_count = SELECT_COUNT.format(query)
        query_group_by = SELECT_GROUP_BY.format(query)
        logging.info("Download IDs for all Youtube videos that have not been processed yet")
        video_count = int(athena.query_athena_and_get_result(query_string=query_count)['video_count'])
        logging.info("There are %d links to be processed: download them", video_count)
        video_ids_csv = athena.query_athena_and_download(query_string=query_group_by, filename="video_ids.csv")

        output_json = Path(Path(__file__).parent, 'tmp', 'youtube_video_snippet.json')
        Path(output_json).parent.mkdir(parents=True, exist_ok=True)
        current_key = 0
        try:
            youtube = googleapiclient.discovery.build(
                serviceName="youtube",
                version="v3",
                developerKey=self.credentials[current_key]['developer_key'],
                cache_discovery=False)
        except UnknownApiNameOrVersion as e:
            service = read_dict_from_url(
                url="https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest")
            youtube = googleapiclient.discovery.build_from_document(
                service=service,
                developerKey=self.credentials[current_key]['developer_key'])
        with open(video_ids_csv, newline='') as csv_reader:
            with open(output_json, 'w') as json_writer:
                reader = csv.DictReader(csv_reader)
                num_videos = 0
                for video_id in reader:
                    if num_videos % self.LOGGING_INTERVAL == 0:
                        logging.info("%d out of %d videos processed", num_videos, video_count)
                    num_videos = num_videos + 1

                    connection_reset_by_peer = 0
                    service_unavailable = 0
                    no_response = True
                    response = dict()
                    while no_response:
                        try:
                            response = youtube.videos().list(part="snippet",id=video_id['video_id']).execute()
                            no_response = False
                        except SocketError as e:
                            if e.errno != errno.ECONNRESET:
                                logging.info("Other socket error!")
                                raise
                            else:
                                connection_reset_by_peer = connection_reset_by_peer + 1
                                logging.info("Connection reset by peer! {}".format(connection_reset_by_peer))
                                if connection_reset_by_peer <= 10:
                                    time.sleep(self.WAIT_WHEN_CONNECTION_RESET_BY_PEER)
                                    try:
                                        youtube = googleapiclient.discovery.build(
                                            serviceName="youtube",
                                            version="v3",
                                            developerKey=self.credentials[current_key]['developer_key'],
                                            cache_discovery=False)
                                    except UnknownApiNameOrVersion as e:
                                        service = read_dict_from_url(
                                            url="https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest")
                                        youtube = googleapiclient.discovery.build_from_document(
                                            service=service,
                                            developerKey=self.credentials[current_key]['developer_key'])
                                else:
                                    raise
                        except HttpError as e:
                            if "403" in str(e):
                                logging.info("Invalid {} developer key: {}".format(
                                    current_key,
                                    self.credentials[current_key]['developer_key']))
                                current_key = current_key + 1
                                if current_key >= len(self.credentials):
                                    raise
                                else:
                                    try:
                                        youtube = googleapiclient.discovery.build(
                                            serviceName="youtube",
                                            version="v3",
                                            developerKey=self.credentials[current_key]['developer_key'],
                                            cache_discovery=False)
                                    except UnknownApiNameOrVersion as e:
                                        service = read_dict_from_url(
                                            url="https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest")
                                        youtube = googleapiclient.discovery.build_from_document(
                                            service=service,
                                            developerKey=self.credentials[current_key]['developer_key'])
                            elif "503" in str(e):
                                logging.info("Service unavailable")
                                service_unavailable = service_unavailable + 1
                                if service_unavailable <= 10:
                                    time.sleep(self.WAIT_WHEN_SERVICE_UNAVAILABLE)
                                else:
                                    raise
                            else:
                                raise
                    if len(response.get('items', [])) == 0:
                        response['id'] = video_id['video_id']
                        response['retrieved_at'] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
                        response['description'] = "Video unavailable. It has probably been removed by the user."
                        json_writer.write("{}\n".format(json.dumps(response)))
                    else:
                        for item in response['items']:
                            item['snippet']['publishedAt'] = item['snippet']['publishedAt'].rstrip('Z').replace('T', ' ')
                            item['retrieved_at'] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
                            json_writer.write("{}\n".format(json.dumps(item)))

        logging.info("Compress file %s", output_json)
        compressed_file = compress(filename=output_json, delete_original=True)

        s3 = boto3.resource('s3')
        s3_filename = "youtube_video_snippet/creation_date={}/{}-{}.json.bz2".format(datetime.utcnow().strftime("%Y-%m-%d"),
                                                                                     uuid.uuid4().hex,
                                                                                     num_videos)
        logging.info("Upload file %s to bucket %s at %s", compressed_file, self.s3_data, s3_filename)
        s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename)

        logging.info("Recreate table for Youtube channel stats")
        athena.query_athena_and_wait(query_string="DROP TABLE IF EXISTS youtube_video_snippet")
        athena.query_athena_and_wait(query_string=CREATE_VIDEO_SNIPPET_JSON.format(s3_bucket=self.s3_data))
        athena.query_athena_and_wait(query_string="MSCK REPAIR TABLE youtube_video_snippet")

        logging.info("Concluded collecting video snippets")
Example 6
    def collect_related_video(self, region_code, creation_date=None):
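        # For each recent trending video selected from Athena, queries the YouTube
        # search API for related videos in the given region. With creation_date set
        # to None only ids are requested; otherwise snippets are requested (up to
        # 50 per seed video) and filtered so that only videos published on or
        # before creation_date count towards NUMBER_OF_RELATED_VIDEOS. Developer
        # keys are rotated on 403 errors; deleted or missing seed videos are
        # skipped. The results are uploaded to S3 and exposed through the
        # youtube_related_video table.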
        athena_db = AthenaDatabase(database=self.athena_data,
                                   s3_output=self.s3_admin)

        trending_filename = Path(Path(__file__).parent, 'tmp', 'trending.csv')
        Path(trending_filename).parent.mkdir(parents=True, exist_ok=True)

        if creation_date is None:
            query_string = TRENDING_VIDEOS.format(
                creation_date=(
                    date.today() -
                    timedelta(days=self.NUMBER_OF_DAYS)).strftime("%Y-%m-%d"),
                number_of_videos=self.NUMBER_OF_VIDEOS)
        else:
            query_string = TRENDING_VIDEOS.format(
                creation_date=(
                    datetime.strptime(creation_date, '%Y-%m-%d') -
                    timedelta(days=self.NUMBER_OF_DAYS)).strftime("%Y-%m-%d"),
                number_of_videos=self.NUMBER_OF_VIDEOS)

        trending_videos = athena_db.query_athena_and_download(
            query_string=query_string, filename=trending_filename)

        with open(trending_videos, newline='', encoding="utf8") as csv_reader:
            output_json = Path(
                Path(__file__).parent, 'tmp', 'youtube_related_video.json')
            Path(output_json).parent.mkdir(parents=True, exist_ok=True)

            with open(output_json, 'w') as json_writer:
                reader = csv.DictReader(csv_reader)
                current_key = 0
                try:
                    youtube = googleapiclient.discovery.build(
                        serviceName="youtube",
                        version="v3",
                        developerKey=self.credentials[current_key]
                        ['developer_key'],
                        cache_discovery=False)
                except UnknownApiNameOrVersion as e:
                    service = read_dict_from_url(
                        url=
                        "https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest"
                    )
                    youtube = googleapiclient.discovery.build_from_document(
                        service=service,
                        developerKey=self.credentials[current_key]
                        ['developer_key'])
                num_videos = 0
                if creation_date is None:
                    max_results = self.NUMBER_OF_RELATED_VIDEOS
                    part = 'id'
                else:
                    part = 'snippet'
                    max_results = self.NUMBER_OF_RELATED_VIDEOS * 3
                    if max_results > 50:
                        max_results = 50
                for trending_video in reader:
                    service_unavailable = 0
                    connection_reset_by_peer = 0
                    no_response = True
                    response = dict()
                    while no_response:
                        try:
                            response = youtube.search().list(
                                part=part,
                                type='video',
                                regionCode=region_code,
                                relatedToVideoId=trending_video['id'],
                                maxResults=max_results).execute()
                            no_response = False
                        except SocketError as e:
                            if e.errno != errno.ECONNRESET:
                                logging.info("Other socket error!")
                                raise
                            else:
                                connection_reset_by_peer = connection_reset_by_peer + 1
                                logging.info(
                                    "Connection reset by peer! {}".format(
                                        connection_reset_by_peer))
                                if connection_reset_by_peer <= 10:
                                    time.sleep(
                                        self.WAIT_WHEN_CONNECTION_RESET_BY_PEER
                                    )
                                    try:
                                        youtube = googleapiclient.discovery.build(
                                            serviceName="youtube",
                                            version="v3",
                                            developerKey=self.credentials[
                                                current_key]['developer_key'],
                                            cache_discovery=False)
                                    except UnknownApiNameOrVersion as e:
                                        service = read_dict_from_url(
                                            url=
                                            "https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest"
                                        )
                                        youtube = googleapiclient.discovery.build_from_document(
                                            service=service,
                                            developerKey=self.credentials[
                                                current_key]['developer_key'])
                                else:
                                    raise
                        except HttpError as e:
                            if "403" in str(e):
                                logging.info(
                                    "Invalid {} developer key: {}".format(
                                        current_key,
                                        self.credentials[current_key]
                                        ['developer_key']))
                                current_key = current_key + 1
                                if current_key >= len(self.credentials):
                                    raise
                                else:
                                    try:
                                        youtube = googleapiclient.discovery.build(
                                            serviceName="youtube",
                                            version="v3",
                                            developerKey=self.credentials[
                                                current_key]['developer_key'],
                                            cache_discovery=False)
                                    except UnknownApiNameOrVersion as e:
                                        service = read_dict_from_url(
                                            url=
                                            "https://www.googleapis.com/discovery/v1/apis/youtube/v3/rest"
                                        )
                                        youtube = googleapiclient.discovery.build_from_document(
                                            service=service,
                                            developerKey=self.credentials[
                                                current_key]['developer_key'])
                            elif "Backend Error" in str(e):
                                # Backend errors are usually associated with requesting
                                # recommended videos for a video that was deleted by the user.
                                # In that case, just move on.
                                logging.info(
                                    "Backend error. Video %s will be ignored",
                                    trending_video['id'])
                                no_response = False
                            elif "Not Found" in str(e):
                                # "Not Found" errors are usually associated with requesting
                                # recommended videos for a video that was deleted by the user.
                                # In that case, just move on.
                                logging.info(
                                    "Not Found error. Video %s will be ignored",
                                    trending_video['id'])
                                no_response = False
                            elif "404" in str(e):
                                logging.info(
                                    "Requested entity was not found. Video %s will be ignored",
                                    trending_video['id'])
                                no_response = False
                            elif "400" in str(e):
                                logging.info(
                                    "Invalid argument. Video %s will be ignored",
                                    trending_video['id'])
                                no_response = False
                            elif "503" in str(e):
                                logging.info("Service unavailable")
                                service_unavailable = service_unavailable + 1
                                if service_unavailable <= 10:
                                    time.sleep(
                                        self.WAIT_WHEN_SERVICE_UNAVAILABLE)
                                else:
                                    raise
                            else:
                                raise

                    rank = 1
                    for item in response.get('items', {}):
                        item['relatedToVideoId'] = trending_video['id']
                        item['retrieved_at'] = datetime.utcnow().strftime(
                            "%Y-%m-%d %H:%M:%S.%f")[:-3]
                        item['rank'] = rank
                        if creation_date is None:
                            rank = rank + 1
                            num_videos = num_videos + 1
                            json_writer.write("{}\n".format(json.dumps(item)))
                        else:
                            item['snippet']['publishedAt'] = item['snippet'][
                                'publishedAt'].rstrip('Z').replace('T', ' ')
                            if rank <= self.NUMBER_OF_RELATED_VIDEOS:
                                if item['snippet'][
                                        'publishedAt'] <= creation_date + ' 00:00:00.000':
                                    rank = rank + 1
                                    num_videos = num_videos + 1
                                    json_writer.write("{}\n".format(
                                        json.dumps(item)))

        logging.info("Compress file %s", output_json)
        compressed_file = compress(filename=output_json, delete_original=True)

        s3 = boto3.resource('s3')
        if creation_date is None:
            s3_filename = "youtube_related_video/creation_date={creation_date}/{num_videos}.json.bz2".format(
                creation_date=datetime.utcnow().strftime("%Y-%m-%d"),
                num_videos=num_videos)
        else:
            s3_filename = "youtube_related_video/creation_date={creation_date}/{num_videos}.json.bz2".format(
                creation_date=creation_date, num_videos=num_videos)
        logging.info("Upload file %s to bucket %s at %s", compressed_file,
                     self.s3_data, s3_filename)
        s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename)

        logging.info("Recreate table for Youtube related video snippets")
        athena_db.query_athena_and_wait(
            query_string="DROP TABLE IF EXISTS youtube_related_video")
        athena_db.query_athena_and_wait(
            query_string=CREATE_VIDEO_RELATED_JSON.format(
                s3_bucket=self.s3_data))
        athena_db.query_athena_and_wait(
            query_string="MSCK REPAIR TABLE youtube_related_video")

        logging.info("Concluded collecting related video snippets")
Example 7
    def collect_channel_stats(self):
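        # Downloads statistics for every distinct channel id returned by
        # SELECT_DISTINCT_CHANNEL, rotating developer keys on 403 errors and
        # retrying up to 10 times on 503. The results are written as
        # newline-delimited JSON, compressed, uploaded to S3 and exposed through
        # the youtube_channel_stats table.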
        logging.info("Start collecting Youtube channel stats")
        channel_ids = Path(Path(__file__).parent, 'tmp', 'channel_ids.csv')
        athena = AthenaDatabase(database=self.athena_data,
                                s3_output=self.s3_admin)
        athena.query_athena_and_download(query_string=SELECT_DISTINCT_CHANNEL,
                                         filename=channel_ids)
        channel_count = int(
            athena.query_athena_and_get_result(
                query_string=SELECT_COUNT_DISTINCT_CHANNEL)['channel_count'])
        logging.info("There are %d channels to be processed: download them",
                     channel_count)

        current_key = 0
        youtube = googleapiclient.discovery.build(
            serviceName="youtube",
            version="v3",
            developerKey=self.credentials[current_key]['developer_key'],
            cache_discovery=False)
        with open(channel_ids, newline='') as csv_reader:
            output_json = Path(
                Path(__file__).parent, 'tmp', 'youtube_channel_stats.json')
            with open(output_json, 'w') as json_writer:
                reader = csv.DictReader(csv_reader)
                num_channels = 0
                for channel_id in reader:
                    if num_channels % self.LOGGING_INTERVAL == 0:
                        logging.info("%d out of %d channels processed",
                                     num_channels, channel_count)
                    num_channels = num_channels + 1

                    service_unavailable = 0
                    no_response = True
                    while no_response:
                        try:
                            response = youtube.channels().list(
                                part="statistics",
                                id=channel_id['channel_id']).execute()
                            no_response = False
                        except HttpError as e:
                            if "403" in str(e):
                                logging.info(
                                    "Invalid {} developer key: {}".format(
                                        current_key,
                                        self.credentials[current_key]
                                        ['developer_key']))
                                current_key = current_key + 1
                                if current_key >= len(self.credentials):
                                    raise
                                else:
                                    youtube = googleapiclient.discovery.build(
                                        serviceName="youtube",
                                        version="v3",
                                        developerKey=self.credentials[
                                            current_key]['developer_key'],
                                        cache_discovery=False)
                            elif "503" in str(e):
                                logging.info("Service unavailable")
                                service_unavailable = service_unavailable + 1
                                if service_unavailable <= 10:
                                    time.sleep(
                                        self.WAIT_WHEN_SERVICE_UNAVAILABLE)
                                else:
                                    raise
                            else:
                                raise
                    for item in response.get('items', []):
                        item['retrieved_at'] = datetime.utcnow().strftime(
                            "%Y-%m-%d %H:%M:%S.%f")[:-3]
                        json_writer.write("{}\n".format(json.dumps(item)))

        logging.info("Compress file %s", output_json)
        compressed_file = compress(filename=output_json, delete_original=True)

        s3 = boto3.resource('s3')
        s3_filename = "youtube_channel_stats/creation_date={}/{}.json.bz2".format(
            datetime.utcnow().strftime("%Y-%m-%d"), num_channels)
        logging.info("Upload file %s to bucket %s at %s", compressed_file,
                     self.s3_data, s3_filename)
        s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename)

        logging.info("Recreate table for Youtube channel stats")
        athena.query_athena_and_wait(
            query_string="DROP TABLE IF EXISTS youtube_channel_stats")
        athena.query_athena_and_wait(
            query_string=CREATE_CHANNEL_STATS_JSON.format(
                s3_bucket=self.s3_data))
        athena.query_athena_and_wait(
            query_string="MSCK REPAIR TABLE youtube_channel_stats")

        logging.info("Concluded collecting channel stats")
Example 8
    def collect_video_snippets(self):
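        # A variant of the snippet collector in Example 5: same key rotation and
        # 503 retry logic, but it does not handle connection resets, does not
        # partition the S3 key by creation_date, and only creates the Athena
        # table at the end instead of dropping and repairing it.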
        logging.info("Start collecting video snippets")
        athena = AthenaDatabase(database=self.athena_data,
                                s3_output=self.s3_admin)
        query = SELECT_YOUTUBE_VIDEOS
        query_count = SELECT_COUNT_YOUTUBE_VIDEOS
        if athena.table_exists("youtube_video_snippet"):
            logging.info("Table youtube_video_snippet exists")
            query = query + TABLE_YOUTUBE_VIDEO_SNIPPET_EXISTS
            query_count = query_count + TABLE_YOUTUBE_VIDEO_SNIPPET_EXISTS
        logging.info(
            "Download IDs for all Youtube videos that have not been processed yet"
        )
        video_count = int(
            athena.query_athena_and_get_result(
                query_string=query_count)['video_count'])
        logging.info("There are %d links to be processed: download them",
                     video_count)
        video_ids_csv = athena.query_athena_and_download(
            query_string=query, filename="video_ids.csv")

        output_json = Path(
            Path(__file__).parent, 'tmp', 'youtube_video_snippet.json')
        Path(output_json).parent.mkdir(parents=True, exist_ok=True)
        current_key = 0
        youtube = googleapiclient.discovery.build(
            serviceName="youtube",
            version="v3",
            developerKey=self.credentials[current_key]['developer_key'],
            cache_discovery=False)
        with open(video_ids_csv, newline='') as csv_reader:
            with open(output_json, 'w') as json_writer:
                reader = csv.DictReader(csv_reader)
                num_videos = 0
                for video_id in reader:
                    if num_videos % self.LOGGING_INTERVAL == 0:
                        logging.info("%d out of %d videos processed",
                                     num_videos, video_count)
                    num_videos = num_videos + 1

                    service_unavailable = 0
                    no_response = True
                    while no_response:
                        try:
                            response = youtube.videos().list(
                                part="snippet",
                                id=video_id['video_id']).execute()
                            no_response = False
                        except HttpError as e:
                            if "403" in str(e):
                                logging.info(
                                    "Invalid {} developer key: {}".format(
                                        current_key,
                                        self.credentials[current_key]
                                        ['developer_key']))
                                current_key = current_key + 1
                                if current_key >= len(self.credentials):
                                    raise
                                else:
                                    youtube = googleapiclient.discovery.build(
                                        serviceName="youtube",
                                        version="v3",
                                        developerKey=self.credentials[
                                            current_key]['developer_key'],
                                        cache_discovery=False)
                            elif "503" in str(e):
                                logging.info("Service unavailable")
                                service_unavailable = service_unavailable + 1
                                if service_unavailable <= 10:
                                    time.sleep(
                                        self.WAIT_WHEN_SERVICE_UNAVAILABLE)
                                else:
                                    raise
                            else:
                                raise
                    if len(response.get('items', [])) == 0:
                        response['id'] = video_id['video_id']
                        response['retrieved_at'] = datetime.utcnow().strftime(
                            "%Y-%m-%d %H:%M:%S.%f")[:-3]
                        response[
                            'description'] = "Video unavailable. It has probably been removed by the user."
                        json_writer.write("{}\n".format(json.dumps(response)))
                    else:
                        for item in response['items']:
                            item['snippet']['publishedAt'] = item['snippet'][
                                'publishedAt'].rstrip('Z').replace('T', ' ')
                            item['retrieved_at'] = datetime.utcnow().strftime(
                                "%Y-%m-%d %H:%M:%S.%f")[:-3]
                            json_writer.write("{}\n".format(json.dumps(item)))

        logging.info("Compress file %s", output_json)
        compressed_file = compress(filename=output_json, delete_original=True)

        s3 = boto3.resource('s3')
        s3_filename = "youtube_video_snippet/{}-{}.json.bz2".format(
            datetime.utcnow().strftime("%Y-%m-%d"), num_videos)
        logging.info("Upload file %s to bucket %s at %s", compressed_file,
                     self.s3_data, s3_filename)
        s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename)

        logging.info("Concluded collecting video snippets")
        athena.query_athena_and_wait(
            query_string=CREATE_VIDEO_SNIPPET_JSON.format(
                s3_bucket=self.s3_data))
Example 9
query = """
select
  twitter_stream.id_str as tweet_id,
  twitter_stream.user.id_str as user_id,
  url.expanded_url as url
from
  internet_scholar.twitter_stream_raw as twitter_stream,
  unnest(entities.urls) as t(url)
where
  creation_date = '{creation_date}' and
  url.display_url not like 'twitter.com/%'
order by
  tweet_id,
  user_id,
  url;
"""

while current_date <= date(2019, 11, 27):
    print(str(current_date))
    tweet_user_url = athena_db.query_athena_and_download(
        query_string=query.format(creation_date=str(current_date)),
        filename=str(current_date) + '.csv')
    compressed_file = compress(filename=tweet_user_url)

    s3 = boto3.resource('s3')
    s3_filename = "tweet_user_url/creation_date={creation_date}/{code}.csv.bz2".format(
        creation_date=str(current_date), code=uuid.uuid4().hex)
    s3.Bucket('internet-scholar').upload_file(str(compressed_file),
                                              s3_filename)

    current_date = current_date + timedelta(days=1)
Example 10
def import_data(related_date, end_related_date, graph_date_difference,
                timespan):
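    # Assembles a local SQLite database (youtube_recommendations.sqlite) from
    # several Athena queries: recommendations, Twitter-user/channel overlaps,
    # political leaning and channel statistics. Aggregate counts and stats are
    # added as extra columns on the recommendation table, auxiliary tables are
    # dropped, and the vacuumed file is compressed and uploaded to S3. The SQL
    # templates in capital letters are presumably defined elsewhere in the module.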
    database = sqlite3.connect('./youtube_recommendations.sqlite')
    sqlite_aws = SqliteAWS(database=database,
                           s3_admin='internet-scholar-admin',
                           s3_data='internet-scholar',
                           athena_db='internet_scholar')
    logging.info('Retrieve recommendations...')
    sqlite_aws.convert_athena_query_to_sqlite(
        table_name='recommendation_aux',
        query=RECOMMENDATION.format(begin_date=str(related_date),
                                    end_date=str(end_related_date)))

    logging.info('Add primary key to recommendation table...')
    database.execute(CREATE_TABLE_RECOMMENDATION)
    database.execute(INSERT_TABLE_RECOMMENDATION)
    database.execute('DROP TABLE recommendation_aux')
    logging.info('Update categories and null values...')
    database.execute(UPDATE_CATEGORY_SEED)
    database.execute(UPDATE_CATEGORY_RECOMMENDED)
    database.execute(UPDATE_NULL_SEED)
    database.execute(UPDATE_NULL_RECOMMENDED)

    logging.info('Retrieve Twitter users and YouTube channel data...')
    initial_date = related_date + timedelta(
        days=graph_date_difference) - timedelta(days=timespan - 1)
    final_date = end_related_date + timedelta(days=graph_date_difference)
    sqlite_aws.convert_athena_query_to_sqlite(
        table_name='twitter_user_channel',
        query=TWITTER_USER_CHANNEL.format(initial_date=str(initial_date),
                                          final_date=str(final_date)))

    logging.info('Calculate number of common Twitter users per channel...')
    database.execute(CREATE_YOUTUBE_CHANNEL_COMMON_TWITTER_USERS)
    current_date = related_date
    while current_date <= end_related_date:
        logging.info(str(current_date))
        initial_date = current_date + timedelta(
            days=graph_date_difference) - timedelta(days=timespan - 1)
        final_date = current_date + timedelta(days=graph_date_difference)
        database.execute(
            INSERT_YOUTUBE_CHANNEL_COMMON_TWITTER_USERS.format(
                initial_date=initial_date, final_date=final_date))
        current_date = current_date + timedelta(days=1)
    logging.info('Update aggregate on SQLite table 1...')
    database.execute(
        "ALTER TABLE recommendation ADD COLUMN seed_user_count INT")
    database.execute(UPDATE_SEED_USER_COUNT)
    logging.info('Update aggregate on SQLite table 2...')
    database.execute(
        "ALTER TABLE recommendation ADD COLUMN recommended_user_count INT")
    database.execute(UPDATE_RECOMMENDED_USER_COUNT)
    logging.info('Update aggregate on SQLite table 3...')
    database.execute(
        "ALTER TABLE recommendation ADD COLUMN common_user_count INT")
    database.execute(UPDATE_COMMON_USER_COUNT)

    logging.info('Retrieve info about political leaning...')
    sqlite_aws.convert_athena_query_to_sqlite(
        table_name='channel_political_leaning',
        query=SELECT_POLITICAL_LEANING.format(
            initial_date=str(related_date), final_date=str(end_related_date)))
    logging.info('Update political leaning info on SQLite 1...')
    database.execute(
        "ALTER TABLE recommendation ADD COLUMN seed_political_leaning TEXT")
    database.execute(UPDATE_SEED_POLITICAL_LEANING)
    logging.info('Update political leaning info on SQLite 2...')
    database.execute(
        "ALTER TABLE recommendation ADD COLUMN recommended_political_leaning TEXT"
    )
    database.execute(UPDATE_RECOMMENDED_POLITICAL_LEANING)

    logging.info('Retrieve data on channel stats...')
    athena_db = AthenaDatabase(database='internet_scholar',
                               s3_output='internet-scholar-admin')
    athena_db.query_athena_and_wait(
        query_string=CREATE_VIEW_ENHANCED_CHANNEL_STATS)
    sqlite_aws.convert_athena_query_to_sqlite(
        table_name='channel_stats',
        query=SELECT_ENHANCED_STATS.format(initial_date=str(related_date),
                                           final_date=str(end_related_date)))
    logging.info('Add primary key to channel stats...')
    database.execute(CREATE_CHANNEL_STATS_WITH_PRIMARY_KEY)
    database.execute(INSERT_CHANNEL_STATS_WITH_PRIMARY_KEY)
    add_stat_to_sqlite(database, field='view_count')
    add_stat_to_sqlite(database, field='cumulative_view_count')
    add_stat_to_sqlite(database, field='subscriber_count')
    add_stat_to_sqlite(database, field='cumulative_subscriber_count')
    add_stat_to_sqlite(database, field='video_count')
    add_stat_to_sqlite(database, field='cumulative_video_count')
    add_stat_to_sqlite(database, field='comment_count')
    add_stat_to_sqlite(database, field='cumulative_comment_count')

    database.execute('DROP TABLE channel_political_leaning')
    database.execute('DROP TABLE channel_stats')
    database.execute('DROP TABLE channel_stats_with_primary_key')
    database.execute('DROP TABLE twitter_user_channel')
    database.execute('DROP TABLE youtube_channel_common_twitter_users')
    database.commit()

    database.execute('VACUUM')
    database.close()

    new_filename = compress('./youtube_recommendations.sqlite')
    s3_filename = "youtube_data_export_r/{timestamp}.sqlite.bz2".format(
        timestamp=datetime.utcnow().strftime("%Y%m%d-%H%M%S"))
    s3 = boto3.resource('s3')
    s3.Bucket('internet-scholar').upload_file(str(new_filename), s3_filename)
Example 11
    def expand_urls(self, creation_date=None):
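        # Resolves the URLs tweeted on creation_date (yesterday by default).
        # The tweet/user/URL triples are first exported to the tweet_user_url
        # table; URLs not already present in validated_url are then expanded with
        # URLExpander, written to a CSV, uploaded as validated_url_raw, and the
        # validated_url table is rebuilt from it through a CTAS query.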
        logging.info("begin: expand URLs")
        athena = AthenaDatabase(database=self.athena_data,
                                s3_output=self.s3_admin)

        yesterday = (date.today() - timedelta(days=1)).strftime("%Y-%m-%d")
        if creation_date is None:
            creation_date = yesterday
        logging.info("Expand URLs that were tweeted on {creation_date}".format(
            creation_date=creation_date))

        query_tweet_user_url = self.__TWEET_USER_URL.format(
            creation_date=creation_date)
        query = self.__UNVALIDATED_URLS.format(creation_date=creation_date)
        query_count = self.__COUNT_UNVALIDATED_URLS.format(
            creation_date=creation_date)
        if athena.table_exists("validated_url"):
            logging.info("Table validated_url exists")
            query = query + " and url not in (select validated_url.url from validated_url)"
            query_count = query_count + " and url not in (select validated_url.url from validated_url)"

        logging.info('Update table tweet_user_url')
        tweet_user_url = athena.query_athena_and_download(
            query_string=query_tweet_user_url,
            filename=creation_date + '.csv')
        compressed_file = compress(filename=tweet_user_url)
        s3 = boto3.resource('s3')
        s3_filename = "tweet_user_url/creation_date={creation_date}/{code}.csv.bz2".format(
            creation_date=creation_date, code=uuid.uuid4().hex)
        logging.info('Upload data file that will comprise tweet_user_url')
        s3.Bucket(self.s3_data).upload_file(str(compressed_file), s3_filename)

        logging.info('Update table tweet_user_url on Athena')
        logging.info(
            "Create Athena table tweet_user_url if does not exist already")
        athena.query_athena_and_wait(
            query_string=self.__CREATE_TABLE_TWEET_USER_URL.format(
                s3_data=self.s3_data))
        athena.query_athena_and_wait(
            query_string="MSCK REPAIR TABLE tweet_user_url")

        link_count = int(
            athena.query_athena_and_get_result(
                query_string=query_count)['link_count'])
        logging.info("There are %d links to be processed: download them",
                     link_count)
        unvalidated_urls = athena.query_athena_and_download(
            query_string=query, filename="unvalidated_urls.csv")

        with open(unvalidated_urls, newline='') as csv_reader:
            validated_urls = Path(
                Path(__file__).parent, 'tmp', 'validated_urls.csv')
            Path(validated_urls).parent.mkdir(parents=True, exist_ok=True)
            logging.info("Create file %s for validated URLs", validated_urls)
            with open(str(validated_urls), 'w') as csv_writer:
                reader = csv.DictReader(csv_reader)
                writer = csv.DictWriter(csv_writer,
                                        fieldnames=[
                                            'url', 'validated_url',
                                            'status_code', 'content_type',
                                            'content_length', 'created_at'
                                        ],
                                        dialect='unix')
                url_expander = URLExpander()
                num_links = 0
                for url in reader:
                    if num_links % self.LOGGING_INTERVAL == 0:
                        logging.info("%d out of %d links processed", num_links,
                                     link_count)
                    num_links = num_links + 1
                    for expanded_url in url_expander.expand_url(url['url']):
                        writer.writerow(expanded_url)
                logging.info("All links processed")

        logging.info("Compress file %s", validated_urls)
        compressed_file = compress(filename=validated_urls,
                                   delete_original=True)

        if creation_date == yesterday:
            filename_s3 = 'validated_url_raw/{}-{}.csv.bz2'.format(
                time.strftime('%Y-%m-%d-%H-%M-%S', time.gmtime()), link_count)
        else:
            filename_s3 = 'validated_url_raw/{}-{}.csv.bz2'.format(
                creation_date + '-23-59-59', link_count)
        logging.info("Upload file %s to bucket %s at %s", compressed_file,
                     self.s3_data, filename_s3)
        s3.Bucket(self.s3_data).upload_file(str(compressed_file), filename_s3)

        logging.info(
            "Delete previous validated_url data: will be generated again")
        s3.Bucket(
            self.s3_data).objects.filter(Prefix="validated_url/").delete()

        logging.info(
            "Create Athena table validated_url_raw if does not exist already")
        athena.query_athena_and_wait(
            query_string=self.__CREATE_TABLE_VALIDATED_URL_RAW.format(
                s3_data=self.s3_data))
        logging.info("Drop Athena table validated_url")
        athena.query_athena_and_wait(
            query_string="drop table if exists validated_url")
        logging.info("Creates Athena table validated_url through CTAS")
        athena.query_athena_and_wait(
            query_string=self.__CREATE_TABLE_VALIDATED_URL.format(
                s3_data=self.s3_data))
        logging.info("END: expand URLs")