def run():
    conn = psycopg2.connect(database="video_article_retrieval",
                            user="******")
    article_cursor = conn.cursor()
    update_cursor = conn.cursor()
    article_cursor.execute(
        "SELECT count(1) FROM articles WHERE text_extraction_status='Success'")
    article_count, = article_cursor.fetchone()
    # Iterate the cursor directly to avoid loading all articles into memory.
    article_cursor.execute(
        "SELECT id, tokens FROM articles WHERE text_extraction_status='Success'"
    )

    crawling_progress = StatusVisualization(article_count, update_every=1000)

    with Pool(8, initializer=init_worker) as pool:
        for status, article_id, compressed_bow, compressed_w2v in pool.imap_unordered(
                extract_features, article_cursor):
            if status == 'Success':
                update_cursor.execute(
                    "UPDATE articles SET bow_2048=%s, w2v_2048=%s, feature_extraction_status='Success' WHERE id=%s",
                    [compressed_bow, compressed_w2v, article_id])
            else:
                update_cursor.execute(
                    "UPDATE articles SET feature_extraction_status=%s WHERE id=%s",
                    [status, article_id])
            crawling_progress.inc()
            conn.commit()
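
The pool above assumes two helpers defined elsewhere in the project: init_worker, run once per worker process, and extract_features, which receives an (id, tokens) row from the cursor and returns a (status, article_id, compressed_bow, compressed_w2v) tuple. A minimal, self-contained sketch of that contract; the hashed bag-of-words stand-in and the zlib packing are illustrative assumptions, not the project's actual feature extraction:

import zlib

import numpy as np


def init_worker():
    # In the real project this is presumably where per-process models
    # (BoW vocabulary, word2vec weights, ...) would be loaded once.
    pass


def extract_features(row):
    article_id, tokens_string = row
    try:
        tokens = tokens_string.split()
        # Stand-in feature: a 2048-dimensional hashed bag of words. The real
        # bow_2048 / w2v_2048 features come from the project's own models.
        bow = np.zeros(2048, dtype=np.float32)
        for token in tokens:
            bow[zlib.crc32(token.encode("utf-8")) % 2048] += 1.0
        # bytes values are adapted to bytea by psycopg2 in the parent process.
        compressed = zlib.compress(bow.tobytes())
        # The real worker returns two distinct features; the stand-in is reused here.
        return "Success", article_id, compressed, compressed
    except Exception as e:
        return type(e).__name__, article_id, None, None
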
Example #2
def run(db, user):
    # Use a Pool of threads rather than processes, since this task is I/O-bound anyway.
    conn = psycopg2.connect(database=db, user=user)
    c = conn.cursor()
    c.execute(
        """SELECT id, platform FROM videos WHERE crawling_status='Not Crawled'"""
    )
    videos = c.fetchall()  #[:10000]
    shuffle(videos)

    print("%d videos to crawl" % len(videos))

    pool = Pool(16)
    crawling_progress = StatusVisualization(len(videos), update_every=100)
    for video in pool.imap_unordered(download_video, videos):
        if video["crawling_status"] == "Player Config: 429":
            print("Twitter rate limit hit. Try again in 15 minutes")
            sys.exit(1)
        query = (
            "UPDATE videos SET %s" %
            postgres_helper.dict_mogrifying_string(video)) + " WHERE id=%(id)s"
        c.execute(query, video)
        conn.commit()
        crawling_progress.inc(by=1)
    pool.close()
    pool.join()
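
The UPDATE above is built from the returned video dict via postgres_helper.dict_mogrifying_string, which is not shown. A plausible sketch, assuming it simply expands each key into a named psycopg2 placeholder so the values are still passed safely through execute(); the skip argument is an illustrative guess:

def dict_mogrifying_string(d, skip=("id",)):
    # Assumed behaviour: "col_a=%(col_a)s, col_b=%(col_b)s" for every key except the
    # ones already used in the WHERE clause; psycopg2 fills the values in from the dict.
    return ", ".join("%s=%%(%s)s" % (key, key) for key in d if key not in skip)

With that, the statement executed above would read roughly UPDATE videos SET crawling_status=%(crawling_status)s, ... WHERE id=%(id)s, with the video dict supplying the values.
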
Example #3
def run(db, user):
    conn = psycopg2.connect(database=db, user=user)
    c = conn.cursor()
    # Only crawl articles that have not yet been crawled
    c.execute("SELECT source_url, source_name FROM articles WHERE crawling_status<>'Success'")
    articles = c.fetchall()
    crawling_progress = StatusVisualization(len(articles), update_every=10000)
    # parallel crawling and parsing to speed things up
    with Pool(32) as pool:  # 16 seems to be around optimum
        for (index, status, videos) in pool.imap_unordered(crawl_article, enumerate(articles), chunksize=100):
            source_url = articles[index][0]
            source_name = articles[index][1]
            # Update article crawling status
            c.execute("UPDATE articles SET crawling_status=%s WHERE source_url=%s", [status, source_url])
            # If the article has been successfully crawled...
            if status == 'Success':
                # ...Update the article count in the sources table
                c.execute(
                    "INSERT INTO sources (source_name)  VALUES (%s) ON CONFLICT (source_name) DO UPDATE SET article_count = sources.article_count + 1",
                    [source_name])
                # ...Save all the found videos to the database
                for platform, video_id in videos:
                    # Insert it into the videos table s.t. it contains all videos in the end
                    c.execute("INSERT INTO videos (platform, id) VALUES (%s,%s) ON CONFLICT DO NOTHING",
                              [platform, video_id])
                    c.execute("""INSERT INTO article_videos (source_url, source_name, platform, video_id)
                                  VALUES (%s, %s, %s, %s)""", [source_url, source_name, platform, video_id])
            conn.commit()
            crawling_progress.inc(1)
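
crawl_article is not shown here. Judging by how its results are consumed, it takes an (index, (source_url, source_name)) pair from enumerate() and returns (index, status, videos), where videos is a list of (platform, video_id) tuples; the index is passed through because imap_unordered does not preserve order. A minimal sketch under those assumptions; the YouTube-only regex is purely illustrative:

import re

import requests

YOUTUBE_EMBED = re.compile(r"youtube\.com/embed/([A-Za-z0-9_-]{11})")


def crawl_article(indexed_article):
    # Assumed contract: (index, (source_url, source_name)) in,
    # (index, status, [(platform, video_id), ...]) out.
    index, (source_url, _source_name) = indexed_article
    try:
        response = requests.get(source_url, timeout=10)
        response.raise_for_status()
        videos = [("youtube", vid) for vid in set(YOUTUBE_EMBED.findall(response.text))]
        return index, "Success", videos
    except Exception as e:
        return index, type(e).__name__, []
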
Example #4
def run():
    conn = psycopg2.connect(database="video_article_retrieval",
                            user="******")
    video_cursor = conn.cursor()
    update_cursor = conn.cursor()
    video_cursor.execute(
        "SELECT id, platform FROM videos WHERE i3d_rgb_status<>'Success' AND resnet_status='Success'"
    )
    videos = video_cursor.fetchall()
    crawling_progress = StatusVisualization(len(videos), update_every=100)
    # Four workers work best; with more, each worker does not get the GPU memory it needs.
    with Pool(4, initializer=init_worker) as pool:
        for status, id, platform, compressed_feature in pool.imap_unordered(
                process, videos, chunksize=10):
            if status == 'Success':
                # Insert embedding and update the classification status
                update_cursor.execute(
                    "UPDATE videos SET i3d_rgb_status = 'Success', i3d_rgb_1024 = %s WHERE id=%s AND platform=%s",
                    [compressed_feature, id, platform])
            else:
                update_cursor.execute(
                    "UPDATE videos SET i3d_rgb_status = %s WHERE id=%s AND platform=%s",
                    [status, id, platform])
            conn.commit()
            crawling_progress.inc()
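
The i3d_rgb_1024 column above (like bow_2048 / w2v_2048 in the first example) receives a compressed_feature blob. One plausible packing, assuming a bytea column and float32 vectors; this is a sketch of the storage scheme, not the project's confirmed format:

import zlib

import numpy as np


def pack_feature(feature):
    # Assumed scheme: float32 bytes, zlib-compressed. psycopg2 adapts a Python
    # bytes value to a bytea column automatically.
    return zlib.compress(np.asarray(feature, dtype=np.float32).tobytes())


def unpack_feature(blob, dim=1024):
    # Inverse of pack_feature; psycopg2 hands bytea columns back as memoryview/bytes.
    return np.frombuffer(zlib.decompress(bytes(blob)), dtype=np.float32).reshape(dim)
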
Example #5
def run():
    conn = psycopg2.connect(database="video_article_retrieval",
                            user="******")
    c = conn.cursor()
    # Reset the topics table
    c.execute("DROP TABLE IF EXISTS topics")
    num_topics = 10
    query = "CREATE TABLE topics (source_url TEXT PRIMARY KEY, "
    query += ",".join("topic_%d FLOAT DEFAULT 0" % index
                      for index in range(0, num_topics))
    query += ")"
    c.execute(query)
    conn.commit()
    c.execute(
        "SELECT count(1) FROM articles WHERE text_extraction_status='Success'")
    article_count, = c.fetchone()

    crawling_progress = StatusVisualization(article_count, update_every=1000)
    articles_cursor = conn.cursor()
    articles_cursor.execute(
        "SELECT source_url, text FROM articles WHERE text_extraction_status='Success'"
    )

    # parallel classification
    with Pool(8) as pool:  # 16 seems to be around optimum
        for source_url, topics in pool.imap_unordered(classify,
                                                      articles_cursor,
                                                      chunksize=100):
            query = "INSERT INTO topics (source_url, " + ",".join("topic_%d" % topic[0] for topic in topics) + ")" \
                    + ("VALUES ('%s', " % source_url) + ",".join("%f" % topic[1] for topic in topics) + ")"
            c.execute(query)
            conn.commit()
            crawling_progress.inc(1)
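
classify is not defined in this example. One plausible shape for it, assuming a pre-trained gensim LDA model with num_topics matching the table created above; the model paths are hypothetical placeholders:

from gensim import corpora, models

# Assumed: a dictionary and LDA model trained elsewhere. Loading them at module
# import time means each forked worker process gets its own copy.
dictionary = corpora.Dictionary.load("models/articles.dict")
lda = models.LdaModel.load("models/articles_lda_10")


def classify(article):
    # Assumed contract: takes a (source_url, text) row and returns
    # (source_url, [(topic_index, weight), ...]) with only the topics the model
    # assigns non-negligible weight to, which is what get_document_topics yields.
    source_url, text = article
    bow = dictionary.doc2bow(text.lower().split())
    topics = lda.get_document_topics(bow)
    # Plain Python floats keep the result simple to hand back across the process boundary.
    return source_url, [(topic_id, float(weight)) for topic_id, weight in topics]
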
Example #6
def run():
    conn = psycopg2.connect(database="gdelt_social_video", user="******")
    c = conn.cursor()
    c.execute("SELECT * FROM sources LIMIT 1")
    # We're only interested in hosts that had any video etc. in them
    c.execute('SELECT source_name FROM sources')
    sources = c.fetchall()

    progress = StatusVisualization(total_count=len(sources), update_every=1000)
    for source in sources:
        source = source[0]
        features = dict()
        # article_count is already computed when the db is populated

        for platform in ["twitter", "youtube", "facebook"]:
            # Get all videos from that source, of that platform:
            c.execute('SELECT video_url FROM article_videos WHERE source_name=%s AND platform=%s', [source, platform])
            videos = c.fetchall()

            # Get the number of videos per article from this source on this platform
            c.execute('SELECT Count(1) FROM article_videos WHERE source_name=%s AND platform=%s GROUP BY source_url',
                      [source, platform])
            video_counts = c.fetchall()

            features[platform + "_std_dev"] = np.std(video_counts) if len(video_counts) > 0 else -1
            features[platform + "_count"] = len(video_counts)
            features[platform + "_sum"] = len(videos)
            features[platform + "_sum_distinct"] = len(set(videos))

        query = "UPDATE sources SET %s WHERE source_name=\'%s\'" % (postgres_helper.dict_set_string(features), source)
        c.execute(query)
        conn.commit()
        progress.inc()
Example #7
def run(year, month):
    # Make sure the data directories for the interesting collections exist.
    for collection in INTERESTING_COLLECTIONS:
        path = "%s/external/%s/" % (os.environ["DATA_PATH"], collection)
        if not os.path.exists(path):
            os.makedirs(path)

    with open(os.environ["DATA_PATH"] +
              "/external/masterfilelist.txt") as master_file_list:

        urls = list()
        malformed_lines = 0
        relevant_lines = 0

        for line in master_file_list:
            # Example line: 134072 f1c7a45aa0292b0aee2bc5b674841096 http://data.gdeltproject.org/gdeltv2/20180731191500.export.CSV.zip
            # But some files are missing, then the master file just contains http://data.gdeltproject.org/gdeltv2/
            try:
                url = line.rstrip("\n").split(" ")[2]
                file_name = url.split("/")[-1].lower()  # casing is inconsistent in the data source, so normalize it
                # Correct time?
                if file_name.startswith("%d%02d" % (year, month)):
                    relevant_lines += 1
                    # One of the collections we're interested in?
                    collection = file_name.split(".")[-3]
                    if collection in INTERESTING_COLLECTIONS:
                        file_path = "%s/external/%s/%s" % (
                            os.environ["DATA_PATH"], collection, file_name)
                        # Not already downloaded?
                        # if not os.path.isfile(file_path):
                        urls.append((url, file_path))
                elif file_name.startswith("%d%02d" % ((year + 1, 1) if month == 12 else (year, month + 1))):
                    # We're done: the dates are in order in the master file
                    # (this also handles the December -> January rollover).
                    break
            except Exception as e:
                malformed_lines += 1  # Some lines just contain http://data.gdeltproject.org/gdeltv2/

        print("\nDone reading the master file.\n")
        print("%d relevant files,\n %d malformed,\n %d already downloaded,\n %d now downloading...\n" \
              % (relevant_lines, malformed_lines, relevant_lines - len(urls), len(urls)))

        crawling_progress = StatusVisualization(len(urls), update_every=50)
        with Pool(16) as pool:
            for _ in pool.imap_unordered(retrieve_and_save, urls):
                crawling_progress.inc()
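
retrieve_and_save is not shown; from the way the work items are built above, it presumably takes a (url, file_path) tuple and downloads the zip archive to that path. A minimal sketch under that assumption:

import os
import urllib.request


def retrieve_and_save(url_and_path):
    # Assumed contract for the download pool above: each work item is a (url, file_path) tuple.
    url, file_path = url_and_path
    try:
        urllib.request.urlretrieve(url, file_path)
    except Exception:
        # Clean up so a failed download leaves no partial file behind.
        if os.path.exists(file_path):
            os.remove(file_path)
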
Example #8
def run():
    conn = psycopg2.connect(database="video_article_retrieval",
                            user="******")
    article_cursor = conn.cursor()
    update_cursor = conn.cursor()
    # Get the count.
    article_cursor.execute(
        "SELECT count(1) FROM articles WHERE text_extraction_status='Success'")
    article_count, = article_cursor.fetchone()
    # Iterate the cursor directly to avoid loading all articles into memory.
    article_cursor.execute(
        "SELECT id, text FROM articles WHERE text_extraction_status='Success'")
    # Parallel tokenization, since it takes by far the most time

    crawling_progress = StatusVisualization(article_count, update_every=1000)
    with Pool(8) as pool:
        for article_id, tokens_string in pool.imap_unordered(tokenize_parallel,
                                                             article_cursor,
                                                             chunksize=100):
            update_cursor.execute("UPDATE articles SET tokens=%s WHERE id=%s",
                                  [tokens_string, article_id])
            crawling_progress.inc()

        # Commit once, after all articles have been tokenized.
        conn.commit()
Example #9
def run():
    MODEL = "yolov3"  # append "-tiny" for the tiny model variant
    net, meta = darknet_wrapper.initialize_classifier(
        config="cfg/%s.cfg" % MODEL,
        weights="weights/%s.weights" % MODEL,
        data="cfg/coco.data")

    conn = psycopg2.connect(database="video_article_retrieval",
                            user="******")
    c = conn.cursor()
    # Just classifying facebook videos for now
    c.execute(
        "SELECT id, platform FROM videos WHERE object_detection_yolo_status<>'Success' AND platform = 'facebook'"
    )
    videos = c.fetchall()

    print("%d videos left to analyze" % len(videos))

    crawling_progress = StatusVisualization(len(videos), update_every=10)
    for id, platform in videos:
        # print(platform, id)
        # We need to extract the images first
        # start = time.time()
        images = []
        cap = cv2.VideoCapture(video_helper.get_path(platform, id))
        count = 0
        while True:
            success, image = cap.read()
            if success:
                if count % 30 == 0:  # sample roughly one frame per second (assuming ~30 fps)
                    path = tempfile.gettempdir() + "/%05d.jpg" % count
                    cv2.imwrite(path, image)
                    images.append(path)
                count += 1
            else:
                # Reached the end of the video
                break

        # print("Extracted %d images in %d seconds" % (len(images), time.time() - start))
        # start = time.time()

        for index, image in enumerate(images):
            try:
                result = darknet_wrapper.detect(net, meta, image)

                # print("%d: Found %d rois in %s" % (index, len(result), image))
                for entity in result:
                    # format is (class, probability, (x, y, width, height)); the box is anchored at its center
                    (label, probability, (x, y, width, height)) = entity
                    # x,y,height and width are not saved for now.
                    # print("%d,%d (%dx%d): %s (%.3f)" % (x, y, width, height, label, probability))
                    c.execute(
                        "INSERT INTO object_detection_yolo(id,platform,second,class,probability) VALUES (%s,%s,%s,%s,%s)",
                        [
                            id, platform, index,
                            str(label, "utf-8"), probability
                        ])
                    conn.commit()
            except Exception as e:
                print(e)

        # Update the classification status
        c.execute(
            "UPDATE videos SET object_detection_yolo_status = 'Success' WHERE id=%s AND platform=%s",
            [id, platform])
        conn.commit()
        # print("Detection took %d seconds" % (time.time() - start))
        crawling_progress.inc()
Example #10
c.execute(
    "SELECT count(1) FROM articles WHERE text_extraction_status='Success'")
article_count, = c.fetchone()
# Iterate the cursor directly to avoid loading all articles into memory.
c.execute(
    "SELECT source_url, text FROM articles WHERE text_extraction_status='Success'"
)


def tokenize_parallel(article):
    source_url, text = article
    return source_url, tokenize(text)


# Parallel tokenization, since it takes by far the most time
crawling_progress = StatusVisualization(article_count, update_every=1000)
articles = list()
token_count = 0
with Pool(8) as pool:
    for source_url, tokens in pool.imap_unordered(tokenize_parallel,
                                                  c,
                                                  chunksize=100):
        articles.append((source_url, tokens))
        token_count += len(tokens)
        crawling_progress.inc()

# articles = [(source_url, tokenize.tokenize(text)) for (source_url, text) in c]
# The rest is not parallelized.
print("Extracting %d tokens took %.2f seconds" %
      (token_count, time.time() - start))
start = time.time()
Example #11
import os

import psycopg2

from src import util
from src.data.articles import article as article_helper
from src.data.articles.boilerpipe import BoilerPipeArticleExtractor
from src.visualization.console import StatusVisualization

articles_base_path = os.environ["DATA_PATH"] + "/raw/articles/"

if __name__ == "__main__":
    conn = psycopg2.connect(database="video_article_retrieval", user="******")
    c = conn.cursor()
    c.execute("SELECT source_url FROM articles WHERE text_extraction_status = 'Not Tried'")
    extractor = BoilerPipeArticleExtractor()
    article_urls = list(c)
    crawling_progress = StatusVisualization(len(article_urls), update_every=100)

    for source_url, in article_urls:
        article_path, article_file = article_helper.get_article_html_filepath(source_url)
        html = util.load_gzip_text(os.path.join(article_path, article_file))
        try:
            text = extractor.get_text(html)
            # Save it to the DB
            c.execute("UPDATE articles SET text=%s, text_extraction_status=%s WHERE source_url=%s", [text, "Success", source_url])
            conn.commit()
        except Exception as e:
            c.execute("UPDATE articles SET text_extraction_status=%s WHERE source_url=%s", [type(e).__name__, source_url])


        crawling_progress.inc(by=1)
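
util.load_gzip_text is not shown; given that the crawler stores raw article HTML on disk, a plausible reading, assuming the files are gzip-compressed UTF-8 text:

import gzip


def load_gzip_text(path):
    # Assumed behaviour of util.load_gzip_text: read a gzip-compressed text file back as a string.
    with gzip.open(path, "rt", encoding="utf-8") as f:
        return f.read()
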