Exemple #1
0
def ml_scraper(
    USER_ID=None,
    PASSCODE=None,
    tag_hashes=None,
    bucket_ids=None,
    pages=None,
    mode=None,
    targeting=None,
):
    """Scrape trending Sharechat posts and save them locally or to S3/MongoDB.

    Args:
        USER_ID: Forwarded unchanged to the ``sharechat_helper`` calls.
        PASSCODE: Forwarded unchanged to the ``sharechat_helper`` calls.
        tag_hashes: Tags to scrape. Overwritten with the result of
            ``sharechat_helper.get_tag_hashes`` when ``targeting`` is
            ``"bucket"``.
        bucket_ids: Only used when ``targeting`` is ``"bucket"``.
        pages: Forwarded unchanged to ``get_trending_data``.
        mode: ``"archive"`` uploads the scraped data to S3 and MongoDB in
            addition to writing local files; ``"local"`` only writes local
            files. Any other value makes the function return ``None``
            without scraping.
        targeting: ``"bucket"`` or ``"tag"``. Selects the range from which
            the random ``delay`` handed to ``get_trending_data`` is drawn.

    Returns:
        The scraped dataframe, or ``None`` when ``mode`` is not recognised
        or when archive-mode initialization fails.

    Raises:
        ValueError: If ``targeting`` is not ``"bucket"``/``"tag"`` for a
            recognised ``mode``, or if no posts were scraped.
    """
    if targeting == "bucket":
        tag_hashes = sharechat_helper.get_tag_hashes(USER_ID, PASSCODE,
                                                     bucket_ids)
        delay = uniform(10, 15)
    elif targeting == "tag":
        delay = uniform(30, 35)
    else:
        delay = None

    if mode not in ("archive", "local"):
        # Unrecognised modes have always been a silent no-op; keep that.
        return None
    if delay is None:
        # Previously this surfaced later as an UnboundLocalError on `delay`.
        raise ValueError(
            'targeting must be "bucket" or "tag", got {!r}'.format(targeting))

    if mode == "archive":
        print("Scraping in archive mode")
        start_time = time.time()
        print("Initializing ...")
        try:
            coll = sharechat_helper.ml_initialize_mongo()
            aws, bucket, s3 = sharechat_helper.ml_initialize_s3()
        except Exception:
            # Without storage handles there is nothing useful to do.
            print("Initialization failure")
            print(logging.traceback.format_exc())
            return None
        print("Initialized successfully")

        # Scrape data from tags
        print("Scraping in progress ...")
        sharechat_df = sharechat_helper.get_trending_data(
            USER_ID, PASSCODE, tag_hashes, pages, delay)
        if len(sharechat_df) < 1:
            raise ValueError(
                "Returned empty dataframe. No posts were scraped.")

        # Save data locally before attempting any upload
        sharechat_df.to_pickle("sharechat_df.pkl")

        # Save data to S3 & Mongo DB (best effort: failures are reported,
        # not raised, so the CSV below is still written)
        s3_upload_success = False
        try:
            print("S3 upload in progress ... ")
            sharechat_df = sharechat_helper.ml_sharechat_s3_upload(
                sharechat_df, aws, bucket, s3)
            s3_upload_success = True
            print("Data uploaded to S3")
        except Exception:
            print("S3 upload failed")
            print(logging.traceback.format_exc())

        # The preview and the MongoDB upload are only attempted once the
        # S3 upload has succeeded.
        if s3_upload_success:
            try:
                print("HTML preview file creation in progress ...")
                (
                    sharechat_df,
                    sharechat_df_html,
                ) = sharechat_helper.get_thumbnails_from_s3(sharechat_df)
                with open("sharechat_ml_data_preview.html", "w") as f:
                    f.write(sharechat_df_html.data)
                    print("HTML preview file created")
            except Exception:
                print("HTML preview file creation failed")
                print(logging.traceback.format_exc())
            try:
                print("MongoDB upload in progress ...")
                sharechat_helper.sharechat_mongo_upload(sharechat_df, coll)
                print("Data uploaded to MongoDB")
            except Exception:
                print("MongoDB upload failed")
                print(logging.traceback.format_exc())

        try:
            print("CSV file creation in progress ... ")
            sharechat_df.to_csv("sharechat_ml_data.csv")
            print("CSV file created")
            print("{} posts scraped".format(len(sharechat_df)))
        except Exception:
            print("CSV file creation failed")
            print(logging.traceback.format_exc())
        print("Scraping complete")
        print("Time taken: %s seconds" % (time.time() - start_time))
        return sharechat_df

    # mode == "local"
    print("Scraping in local mode")
    start_time = time.time()
    print("Scraping in progress ...")
    sharechat_df = sharechat_helper.get_trending_data(
        USER_ID, PASSCODE, tag_hashes, pages, delay)
    if len(sharechat_df) < 1:
        raise ValueError(
            "Returned empty dataframe. No posts were scraped.")

    # Save data locally
    sharechat_df.to_pickle("sharechat_df.pkl")
    try:
        print("HTML preview file creation in progress ...")
        (
            sharechat_df,
            sharechat_df_html,
        ) = sharechat_helper.get_thumbnails_from_sharechat(sharechat_df)
        with open("sharechat_ml_data_preview.html", "w") as f:
            f.write(sharechat_df_html.data)
            print("HTML preview file created")
    except Exception:
        print("HTML preview file creation failed")
        print(logging.traceback.format_exc())
    try:
        print("CSV file creation in progress ... ")
        sharechat_df.to_csv("sharechat_ml_data.csv")
        print("CSV file created")
        print("{} posts scraped".format(len(sharechat_df)))
    except Exception:
        print("CSV file creation failed")
        print(logging.traceback.format_exc())
    print("Scraping complete")
    print("Time taken: %s seconds" % (time.time() - start_time))
    return sharechat_df
Exemple #2
0
def fresh_content_scraper(
    USER_ID=None,
    PASSCODE=None,
    tag_hashes=None,
    bucket_ids=None,
    pages=None,
    unix_timestamp=None,
    mode=None,
    targeting=None,
):
    """Scrape fresh Sharechat posts and save them locally or to S3/MongoDB.

    Args:
        USER_ID: Forwarded unchanged to the ``sharechat_helper`` calls.
        PASSCODE: Forwarded unchanged to the ``sharechat_helper`` calls.
        tag_hashes: Tags to scrape. Overwritten with the result of
            ``sharechat_helper.get_tag_hashes`` when ``targeting`` is
            ``"bucket"``.
        bucket_ids: Only used when ``targeting`` is ``"bucket"``.
        pages: Forwarded unchanged to ``get_fresh_data``.
        unix_timestamp: Forwarded unchanged to ``get_fresh_data``.
        mode: ``"archive"`` uploads the scraped data, an HTML preview, a
            duplicates log and a CSV to S3 and the data to MongoDB;
            ``"local"`` only writes local files. Any other value makes the
            function return ``None`` without scraping.
        targeting: ``"bucket"`` or ``"tag"``. Selects the range from which
            the random ``delay`` handed to ``get_fresh_data`` is drawn.

    Returns:
        The scraped dataframe, or ``None`` when ``mode`` is not recognised
        or when archive-mode initialization fails.

    Raises:
        ValueError: If ``targeting`` is not ``"bucket"``/``"tag"`` for a
            recognised ``mode``, or if no posts were scraped.
    """
    if targeting == "bucket":
        tag_hashes = sharechat_helper.get_tag_hashes(USER_ID, PASSCODE,
                                                     bucket_ids)
        delay = uniform(10, 15)
    elif targeting == "tag":
        delay = uniform(30, 35)
    else:
        delay = None

    if mode not in ("archive", "local"):
        # Unrecognised modes have always been a silent no-op; keep that.
        return None
    if delay is None:
        # Previously this surfaced later as an UnboundLocalError on `delay`.
        raise ValueError(
            'targeting must be "bucket" or "tag", got {!r}'.format(targeting))

    if mode == "archive":
        print("Scraping in archive mode")
        start_time = time.time()
        # Initialize S3 and Mongo DB
        print("Initializing ...")
        try:
            aws, bucket, s3 = s3_mongo_helper.initialize_s3()
            coll = s3_mongo_helper.initialize_mongo()
        except Exception:
            # Stop here: continuing used to crash with UnboundLocalError
            # because nothing had been scraped or initialized.
            print("Initialization failure")
            print(logging.traceback.format_exc())
            return None
        print("Initialized successfully")

        # Scrape data from Sharechat tags
        print("Scraping in progress ...")
        sharechat_df = sharechat_helper.get_fresh_data(
            USER_ID, PASSCODE, tag_hashes, pages, unix_timestamp, delay)
        if len(sharechat_df) < 1:
            raise ValueError(
                "Returned empty dataframe. No posts were scraped.")

        # Save data to S3 & Mongo DB
        s3_upload_success = False
        try:
            print("S3 upload in progress ...")
            # the returned df includes s3 urls
            sharechat_df, tagwise_duplicates = (
                sharechat_helper.sharechat_s3_upload(
                    sharechat_df, aws, bucket, s3, coll))
            s3_upload_success = True
            print("Data uploaded to S3")
        except Exception:
            print("S3 upload failed")
            print(logging.traceback.format_exc())

        # Logs and the MongoDB upload are only attempted once the S3
        # upload has succeeded; each step is best effort.
        if s3_upload_success:
            # NOTE(review): not wrapped in try/except, so a failure here
            # propagates and skips the MongoDB upload — confirm intended.
            # Rebinds `aws` and `s3` to the log-bucket handles.
            aws, logbucket, s3 = sharechat_helper.initialize_s3_logbucket()
            today = datetime.utcnow().strftime("%Y%m%d")
            try:
                print("HTML file creation in progress ...")
                (
                    sharechat_df,
                    sharechat_df_html,
                ) = sharechat_helper.get_thumbnails_from_s3(sharechat_df)
                with open("sharechat_fresh_data_preview.html", "w") as f:
                    f.write(sharechat_df_html.data)
                    print("HTML file created")
                print("Uploading HTML file to S3 ...")
                sharechat_helper.upload_logs(
                    s3=s3,
                    filename="sharechat_fresh_data_preview.html",
                    key="fresh_preview_" + today,
                    bucket=logbucket,
                )
                print("HTML file uploaded")
            except Exception:
                print("HTML file upload failed")
                print(logging.traceback.format_exc())
            try:
                print("Duplicates log creation in progress ...")
                with open("tagwise_duplicates.json", "w") as fp:
                    json.dump(tagwise_duplicates, fp)
                print("Duplicates log created")
                print("Uploading duplicates log to S3 ...")
                sharechat_helper.upload_logs(
                    s3=s3,
                    filename="tagwise_duplicates.json",
                    key="fresh_duplicates_" + today,
                    bucket=logbucket,
                )
                print("Duplicates log uploaded")
            except Exception:
                print("Duplicates log upload failed")
                print(logging.traceback.format_exc())
            try:
                print("CSV file creation in progress ... ")
                sharechat_df.to_csv("sharechat_fresh_data.csv")
                print("CSV file created")
                print("Uploading CSV file to S3 ...")
                sharechat_helper.upload_logs(
                    s3=s3,
                    filename="sharechat_fresh_data.csv",
                    key="fresh_posts_" + today,
                    bucket=logbucket,
                )
                print("CSV file uploaded")
            except Exception:
                print("CSV file upload failed")
                print(logging.traceback.format_exc())
            try:
                print("MongoDB upload in progress ...")
                sharechat_helper.sharechat_mongo_upload(sharechat_df, coll)
                print("Data uploaded to MongoDB")
                print("{} posts saved".format(len(sharechat_df)))
            except Exception:
                print("MongoDB upload failed")
                print(logging.traceback.format_exc())

        print("Scraping complete")
        print("Time taken: %s seconds" % (time.time() - start_time))
        return sharechat_df

    # mode == "local"
    print("Scraping in local mode")
    start_time = time.time()
    print("Scraping in progress ...")
    sharechat_df = sharechat_helper.get_fresh_data(USER_ID, PASSCODE,
                                                   tag_hashes, pages,
                                                   unix_timestamp, delay)
    if len(sharechat_df) < 1:
        raise ValueError(
            "Returned empty dataframe. No posts were scraped.")

    # Save data locally
    sharechat_df.to_pickle("sharechat_df.pkl")
    try:
        print("HTML preview file creation in progress ...")
        (
            sharechat_df,
            sharechat_df_html,
        ) = sharechat_helper.get_thumbnails_from_sharechat(sharechat_df)
        with open("sharechat_fresh_data_preview.html", "w") as f:
            f.write(sharechat_df_html.data)
            print("HTML preview file created")
    except Exception:
        print("HTML preview file creation failed")
        print(logging.traceback.format_exc())
    try:
        print("CSV file creation in progress ... ")
        sharechat_df.to_csv("sharechat_fresh_data.csv")
        print("CSV file created")
        print("{} posts saved".format(len(sharechat_df)))
    except Exception:
        print("CSV file creation failed")
        print(logging.traceback.format_exc())
    print("Scraping complete")
    print("Time taken: %s seconds" % (time.time() - start_time))
    return sharechat_df