Example no. 1
def process(video):
    id, platform = video
    try:
        cap = cv2.VideoCapture(video_helper.get_path(platform, id))
        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        if frame_count < NUM_FRAMES * NUM_SEGMENTS:
            return "Too few frames", id, platform, None

        # Divide the video into NUM_SEGMENTS segments and take the center NUM_FRAMES frames for analysis.
        padding_frames = int((frame_count / NUM_SEGMENTS - NUM_FRAMES) / 2)
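        # Worked example with hypothetical numbers: frame_count = 300, NUM_SEGMENTS = 3, NUM_FRAMES = 60
        # -> padding_frames = int((300 / 3 - 60) / 2) = 20, so each segment skips 20 frames,
        #    reads the 60 center frames, then skips another 20 (20 + 60 + 20 = 100 frames).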
        segments = np.zeros((NUM_SEGMENTS, NUM_FRAMES, FRAME_WIDTH,
                             FRAME_HEIGHT, NUM_RGB_CHANNELS))
        for i in range(NUM_SEGMENTS):
            # Skip ahead padding_frames
            for _ in range(padding_frames):
                cap.read()
            # Take NUM_FRAMES frames
            for j in range(NUM_FRAMES):
                _, frame = cap.read()
                segments[i][j] = cv2.resize(
                    video_helper.crop_center_square(frame),
                    (FRAME_WIDTH, FRAME_HEIGHT))
            # Again, skip ahead padding_frames
            for _ in range(padding_frames):
                cap.read()
        cap.release()

        # batch size 5 allows for 4 workers on a 12GB GPU
        prediction = rgb_model.predict(np.array(segments), batch_size=5)
        # The original model averages at this point as well, so the same is done here; all NUM_SEGMENTS outputs are then averaged again.
        mean = prediction.mean(axis=1).mean(axis=0)[0][0]
        # Compression to reduce memory footprint of sparse vectors
        return "Success", id, platform, zlib.compress(mean, 9)
    except Exception as e:
        return str(e), id, platform, None
Example no. 2
def process(video):
    # Takes a video, preprocesses every nth frame, and returns the mean-pooled model features
    id, platform = video

    images = []
    cap = cv2.VideoCapture(video_helper.get_path(platform, id))
    count = 0
    while True:
        success, image = cap.read()
        if success:
            if count % EVERY_FRAME == 0:
                x = resize(image, (224, 224), mode='constant') * 255
                x = preprocess_input(x)
                images.append(x)
            count += 1
        else:
            # Reached the end of the video
            cap.release()
            break

    if len(images) > MIN_IMAGES:
        # Batch predict
        frame_results = model.predict(np.array(images))
        # The shape is (n_frames, 1, 1, layer_output)
        frame_results = frame_results.reshape(-1, frame_results.shape[-1])
        # Mean pooling
        mean = np.mean(frame_results, axis=0)
        # Compression to reduce memory footprint of sparse vectors
        return "Success", id, platform, zlib.compress(mean, 9)
    else:
        return "Too few frames", id, platform, None
Example no. 3
def display_video(platform="facebook", id="CadburyBournvita/1937970696267088"):
    path = video_helper.get_path(platform=platform, id=id)
    # path = os.path.relpath(path, os.getcwd())
    with open(path, "rb") as f:
        video_encoded = base64.b64encode(f.read())
    display(
        HTML(data='''<video alt="test" controls>
                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                 </video>'''.format(video_encoded.decode('ascii'))))
Example no. 4
def download(youtube_video_id):
    ret = dict()

    try:
        video_path = video_helper.get_path("youtube")
        video_file = video_path + youtube_video_id + ".mp4"
        ydl_opts = {
            # Download the smallest file but not less than 240p (so not 144p, for example)
            'format':
            'worst[height>=240][ext=mp4]/worst[height>=240]/worst',  # best[height<=360][ext=mp4]
            'outtmpl': video_file,
            'quiet': True,
            'logger': QuietLogger()
        }

        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ret = dict()
            info = ydl.extract_info(youtube_video_id, download=False)

            if "_type" not in info or info["_type"] != "playlist":

                # Likes might be disabled
                ret["likes"] = info[
                    "like_count"] if "like_count" in info else -1
                ret["views"] = info[
                    "view_count"] if "view_count" in info else -1
                ret["duration"] = info["duration"] * 1000
                # Youtube-dl does not extract these at this point, neither does pytube.
                ret["comments"] = -1
                ret["shares"] = -1

                if ret["duration"] <= video_helper.LENGTH_CUTOFF:
                    # Only download if it's not too long
                    ydl.extract_info(youtube_video_id, download=True)
                    ffprobe = video_helper.get_ffprobe_json(video_file)
                    size = int(ffprobe['format']['size'])

                    if size <= video_helper.SIZE_CUTOFF:
                        ret["crawling_status"] = "Success"
                    else:  # File is too big.
                        os.remove(video_file)
                        ret["crawling_status"] = "Too big"
                else:  # Video is too long.
                    ret["crawling_status"] = "Too long"
            else:
                ret["crawling_status"] = "Is stream recording"
    except Exception as e:
        # traceback.print_exc()
        ret["crawling_status"] = str(
            e)[:100]  # to prevent filling the db with stack traces
    return ret
Example no. 5
def process(video):
    # Takes a video, extracts its audio track, and returns the mean-pooled SoundNet prediction
    id, platform = video
    try:
        path = extract_audio(video_helper.get_path(platform, id))
        audio, _ = librosa.load(path, dtype='float32', sr=22050, mono=True)

        # SoundNet needs the range to be between -256 and 256
        # Going beyond the research this is based on, we also scale the amplitude to fill that range
        maximum = max(audio.max(), -audio.min())
        if maximum != 0.0:
            audio *= 256.0 / maximum
            # reshaping the audio data so it fits into the graph (batch_size, num_samples, num_filter_channels)
            audio = np.reshape(audio, (1, -1, 1))
            prediction = model.predict(audio)
            subprocess.call(["rm", path])
            prediction = prediction.mean(axis=1)[0]
            return "Success", id, platform, zlib.compress(prediction, 9)
        else:
            subprocess.call(["rm", path])
            return "No Audio", id, platform, None
    except Exception as e:
        return str(e), id, platform, None
Example no. 6
def run():
    MODEL = "yolov3"  # Postfix -tiny
    net, meta = darknet_wrapper.initialize_classifier(
        config="cfg/%s.cfg" % MODEL,
        weights="weights/%s.weights" % MODEL,
        data="cfg/coco.data")

    conn = psycopg2.connect(database="video_article_retrieval",
                            user="******")
    c = conn.cursor()
    # Just classifying facebook videos for now
    c.execute(
        "SELECT id, platform FROM videos WHERE object_detection_yolo_status<>'Success' AND platform = 'facebook'"
    )
    videos = c.fetchall()

    print("%d videos left to analyze" % len(videos))

    crawling_progress = StatusVisualization(len(videos), update_every=10)
    for id, platform in videos:
        # print(platform, id)
        # We need to extract the images first
        # start = time.time()
        images = []
        cap = cv2.VideoCapture(video_helper.get_path(platform, id))
        count = 0
        while True:
            success, image = cap.read()
            if success:
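                # Every 30th frame, i.e. roughly one frame per second for a 30 fps video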
                if count % 30 == 0:
                    path = tempfile.gettempdir() + "/%05d.jpg" % count
                    cv2.imwrite(path, image)
                    images.append(path)
                count += 1
            else:
                # Reached the end of the video
                break

        # print("Extracted %d images in %d seconds" % (len(images), time.time() - start))
        # start = time.time()

        for index, image in enumerate(images):
            try:
                result = darknet_wrapper.detect(net, meta, image)

                # print("%d: Found %d rois in %s" % (index, len(result), image))
                for entity in result:
                    # format is (class, probability, (x, y, width, height)), ANCHORED AT THE CENTER!
                    (label, probability, (x, y, width, height)) = entity
                    # x,y,height and width are not saved for now.
                    # print("%d,%d (%dx%d): %s (%.3f)" % (x, y, width, height, label, probability))
                    c.execute(
                        "INSERT INTO object_detection_yolo(id,platform,second,class,probability) VALUES (%s,%s,%s,%s,%s)",
                        [
                            id, platform, index,
                            str(label, "utf-8"), probability
                        ])
                    conn.commit()
            except Exception as e:
                print(e)

        # Update the classification status
        c.execute(
            "UPDATE videos SET object_detection_yolo_status = 'Success' WHERE id=%s AND platform=%s",
            [id, platform])
        conn.commit()
        # print("Detection took %d seconds" % (time.time() - start))
        crawling_progress.inc()
Example no. 7
def download(tweet_id):
    """
    :param tweet_id:
    :return: dict: crawling_status, views, duration (ms), comments (=reply), shares (=retweets), likes (=favorite)
    """
    ret = dict()

    try:
        video_path = video_helper.get_path("twitter")

        # Get an authorization and a guest Token by extracting it from the Twitter source code
        video_player_url = 'https://twitter.com/i/videos/tweet/' + tweet_id
        video_player_response = requests.get(video_player_url)
        video_player_soup = BeautifulSoup(video_player_response.text, 'lxml')
        js_file_url = video_player_soup.find('script')['src']
        js_file_response = requests.get(js_file_url)
        bearer_token_pattern = re.compile('Bearer ([a-zA-Z0-9%-])+')
        bearer_token = bearer_token_pattern.search(js_file_response.text)
        bearer_token = bearer_token.group(0)
        # For now I'm manually getting one by going to https://twitter.com/i/videos/tweet/1041730759613046787
        # And looking at the request headers for the config request.
        guest_token = "1049750915719340034"
        # Talk to the API to get the m3u8 URL using the token just extracted
        player_config_url = 'https://api.twitter.com/1.1/videos/tweet/config/%s.json' % tweet_id
        player_config_response = requests.get(player_config_url,
                                              headers={
                                                  'Authorization':
                                                  bearer_token,
                                                  "x-guest-token": guest_token
                                              })

        if player_config_response.status_code == 200:
            player_config = json.loads(player_config_response.text)
            if player_config['track']['contentType'] == 'media_entity':

                m3u8_url = player_config['track']['playbackUrl']
                ret["views"] = util.convert_si_to_number(
                    player_config['track']['viewCount'])
                ret["duration"] = int(player_config['track']['durationMs'])

                # Get some more information by extracting it from the website embedding the tweet
                status_url = "http://twitter.com/i/status/" + tweet_id
                status_response = requests.get(status_url)
                status_soup = BeautifulSoup(status_response.text, 'lxml')

                stats = status_soup.find(
                    "div", {'class': "permalink-tweet-container"})
                # Sometimes comments are disabled, then this is just 0.
                ret["comments"] = int(
                    stats.find("span", {
                        'class': "ProfileTweet-action--reply"
                    }).find("span", {'class': "ProfileTweet-actionCount"
                                     })['data-tweet-stat-count'])
                ret["shares"] = int(
                    stats.find("span", {
                        'class': "ProfileTweet-action--retweet"
                    }).find("span", {'class': "ProfileTweet-actionCount"
                                     })['data-tweet-stat-count'])
                ret["likes"] = int(
                    stats.find("span", {
                        'class': "ProfileTweet-action--favorite"
                    }).find("span", {'class': "ProfileTweet-actionCount"
                                     })['data-tweet-stat-count'])

                # Get m3u8
                m3u8_response = requests.get(
                    m3u8_url, headers={'Authorization': bearer_token})
                m3u8_url_parse = urllib.parse.urlparse(m3u8_url)
                video_host = m3u8_url_parse.scheme + '://' + m3u8_url_parse.hostname
                m3u8_parse = m3u8.loads(m3u8_response.text)

                if m3u8_parse.is_variant:

                    # Find video with 480p resolution or higher (or lower if not available)
                    # ...sort by res
                    sorted_by_res = sorted(
                        m3u8_parse.playlists,
                        key=lambda video: video.stream_info.resolution[0])
                    correct_res = None
                    for video in sorted_by_res:
                        if video.stream_info.resolution[0] >= 480:
                            correct_res = video
                            break
                    if correct_res is None:
                        # No video with resolution >= 480p found
                        correct_res = sorted_by_res[-1]

                    ts_m3u8_response = requests.get(video_host +
                                                    correct_res.uri)
                    ts_m3u8_parse = m3u8.loads(ts_m3u8_response.text)

                    video_file = os.path.join(video_path, tweet_id + ".ts")
                    with open(video_file, 'ab+') as wfd:
                        for ts_uri in ts_m3u8_parse.segments.uri:
                            ts_file = requests.get(video_host + ts_uri)
                            wfd.write(ts_file.content)

                    ffprobe = video_helper.get_ffprobe_json(video_file)
                    duration = int(float(ffprobe['format']['duration']) * 1000)
                    size = int(ffprobe['format']['size'])

                    if duration <= video_helper.LENGTH_CUTOFF:
                        if size <= video_helper.SIZE_CUTOFF:
                            ret["crawling_status"] = "Success"
                        else:  # File is too big.
                            os.remove(video_file)
                            ret["crawling_status"] = "Too big"
                    else:  # Video is too long.
                        os.remove(video_file)
                        ret["crawling_status"] = "Too long"
                else:  # No playlists are contained in the response
                    ret["crawling_status"] = "Not is_variant"
            else:  # The playable media is not a video (e.g. its a gif)
                ret["crawling_status"] = "Content Type: %s" % player_config[
                    'track']['contentType']
        else:  # The server returned an error (usually a 404, meaning the tweet has no playable media attached)
            ret["crawling_status"] = "Player Config: %d" % player_config_response.status_code

    except Exception as e:
        traceback.print_exc()
        ret["crawling_status"] = str(e)
    return ret
Example no. 8
def download(facebook_video_id):
    """

    :param facebook_video_id: Combination of the actual video id and the username, id + "/" + user_name
    :return:
    """
    ret = dict()
    user_name, video_id = facebook_video_id.split("/")
    try:
        video_path = os.path.join(video_helper.get_path("facebook"), user_name)
        url = "https://www.facebook.com/%s/videos/%s" % (user_name, video_id)

        res = requests.get(url, timeout=5, allow_redirects=True)
        if res.status_code == 200:
            # Alternatively, there's also hd_src, and both exist with a _no_ratelimit postfix
            # (but if one doesn't exist, neither does the other)
            mp4_url_occurences = re.findall("sd_src:\"(.*?)\",", res.text)
            if len(mp4_url_occurences) > 0:
                ret["comments"] = int(
                    re.findall("commentcount:([0-9]*),", res.text)[0])
                ret["shares"] = int(
                    re.findall("sharecount:([0-9]*),", res.text)[0])
                ret["likes"] = int(
                    re.findall("likecount:([0-9]*),", res.text)[0])
                view_count = re.findall("viewCount:\"([0-9,]*)\",", res.text)
                # The view count is not always present
                if len(view_count) == 1:
                    ret["views"] = int(view_count[0].replace(",", ""))
                else:
                    ret["views"] = -1

                r = requests.get(mp4_url_occurences[0], stream=True)
                if not os.path.exists(video_path):
                    # Every user gets their own directory
                    os.makedirs(video_path)
                video_file = video_path + "/" + video_id + ".mp4"
                with open(video_file, 'wb+') as file:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:
                            file.write(chunk)
                ffprobe = video_helper.get_ffprobe_json(video_file)
                ret["duration"] = int(
                    float(ffprobe['format']['duration']) * 1000)
                size = int(ffprobe['format']['size'])

                if ret["duration"] <= video_helper.LENGTH_CUTOFF:
                    if size <= video_helper.SIZE_CUTOFF:
                        ret["crawling_status"] = "Success"
                    else:  # File is too big.
                        os.remove(video_file)
                        ret["crawling_status"] = "Too big"
                else:  # Video is too long.
                    os.remove(video_file)
                    ret["crawling_status"] = "Too long"
            else:
                ret["crawling_status"] = "Video not available"
        else:
            ret["crawling_status"] = res.status_code
    except (HTTPError, ConnectionError):
        ret["crawling_status"] = "Invalid URL"
    except Exception as e:
        traceback.print_exc()
        ret["crawling_status"] = str(e)
    return ret
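The download helpers in Examples 4, 7 and 8 all return the same kind of status dictionary. A minimal, hypothetical usage sketch (the id is the one shown in Example 3; the .get defaults are assumptions, since some fields are only set on success):

result = download("CadburyBournvita/1937970696267088")
if result["crawling_status"] == "Success":
    print("views:", result.get("views", -1), "duration (ms):", result.get("duration"))
else:
    print("skipped:", result["crawling_status"])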