def _ml_request_delay(targeting):
    """Return the randomised delay value used for the given targeting strategy.

    Raises:
        ValueError: If ``targeting`` is neither ``"bucket"`` nor ``"tag"``.
    """
    if targeting == "bucket":
        return uniform(10, 15)
    if targeting == "tag":
        return uniform(30, 35)
    raise ValueError(
        "Unsupported targeting {!r}; expected 'bucket' or 'tag'.".format(targeting)
    )


def _ml_report_failure(message):
    """Print ``message`` followed by the traceback of the active exception."""
    # Imported locally so this does not rely on ``logging`` happening to
    # expose its own internal ``traceback`` import as an attribute.
    import traceback

    print(message)
    print(traceback.format_exc())


def _ml_fetch_posts(USER_ID, PASSCODE, tag_hashes, pages, delay):
    """Scrape trending posts, reject an empty result and pickle it locally."""
    print("Scraping in progress ...")
    sharechat_df = sharechat_helper.get_trending_data(
        USER_ID, PASSCODE, tag_hashes, pages, delay
    )
    if len(sharechat_df) < 1:
        raise ValueError("Returned empty dataframe. No posts were scraped.")
    # Save data locally
    sharechat_df.to_pickle("sharechat_df.pkl")
    return sharechat_df


def _ml_write_html_preview(sharechat_df, from_s3):
    """Best-effort creation of the HTML preview; return the (possibly updated) data."""
    try:
        print("HTML preview file creation in progress ...")
        # Resolved inside the try so a lookup problem is reported like any
        # other preview failure instead of aborting the run.
        if from_s3:
            get_thumbnails = sharechat_helper.get_thumbnails_from_s3
        else:
            get_thumbnails = sharechat_helper.get_thumbnails_from_sharechat
        sharechat_df, sharechat_df_html = get_thumbnails(sharechat_df)
        # Explicit UTF-8 so non-ASCII text in the preview does not depend on
        # the platform's default encoding.
        with open("sharechat_ml_data_preview.html", "w", encoding="utf-8") as f:
            f.write(sharechat_df_html.data)
        print("HTML preview file created")
    except Exception:
        _ml_report_failure("HTML preview file creation failed")
    return sharechat_df


def _ml_write_csv(sharechat_df):
    """Best-effort export of the scraped data to ``sharechat_ml_data.csv``."""
    try:
        print("CSV file creation in progress ... ")
        sharechat_df.to_csv("sharechat_ml_data.csv")
        print("CSV file created")
        print("{} posts scraped".format(len(sharechat_df)))
    except Exception:
        _ml_report_failure("CSV file creation failed")


def ml_scraper(
    USER_ID=None,
    PASSCODE=None,
    tag_hashes=None,
    bucket_ids=None,
    pages=None,
    mode=None,
    targeting=None,
):
    """Scrape trending ShareChat posts and save them locally and/or remotely.

    Args:
        USER_ID: Credential forwarded unchanged to ``sharechat_helper``.
        PASSCODE: Credential forwarded unchanged to ``sharechat_helper``.
        tag_hashes: Tags to scrape. Ignored (recomputed from ``bucket_ids``)
            when ``targeting == "bucket"``.
        bucket_ids: Buckets whose tag hashes are looked up when
            ``targeting == "bucket"``.
        pages: Forwarded unchanged to ``sharechat_helper.get_trending_data``.
        mode: ``"archive"`` (upload to S3 and MongoDB, plus local files) or
            ``"local"`` (local files only).
        targeting: ``"bucket"`` or ``"tag"``; also selects the range the
            random delay is drawn from.

    Returns:
        The scraped data object (the code calls ``to_pickle``/``to_csv`` on
        it, so presumably a pandas DataFrame), or ``None`` when ``mode`` is
        unsupported or archive-mode initialization fails.

    Raises:
        ValueError: If ``targeting`` is unsupported, or if no posts were
            scraped.
    """
    if mode not in ("archive", "local"):
        print("Unsupported mode {!r}; nothing was scraped".format(mode))
        return None

    # Validate targeting up front: previously an unknown value surfaced much
    # later as an UnboundLocalError on ``delay``.
    delay = _ml_request_delay(targeting)
    if targeting == "bucket":
        tag_hashes = sharechat_helper.get_tag_hashes(USER_ID, PASSCODE, bucket_ids)

    if mode == "archive":
        print("Scraping in archive mode")
        start_time = time.time()
        print("Initializing ...")
        try:
            coll = sharechat_helper.ml_initialize_mongo()
            aws, bucket, s3 = sharechat_helper.ml_initialize_s3()
        except Exception:
            _ml_report_failure("Initialization failure")
            # Nothing can be archived without the storage handles.
            return None
        print("Initialized successfully")

        # Scrape data from tags
        sharechat_df = _ml_fetch_posts(USER_ID, PASSCODE, tag_hashes, pages, delay)

        # Save data to S3 & Mongo DB
        try:
            print("S3 upload in progress ... ")
            sharechat_df = sharechat_helper.ml_sharechat_s3_upload(
                sharechat_df, aws, bucket, s3
            )
        except Exception:
            _ml_report_failure("S3 upload failed")
        else:
            print("Data uploaded to S3")
            # The S3-backed preview and the MongoDB upload only run once the
            # S3 upload has succeeded.
            sharechat_df = _ml_write_html_preview(sharechat_df, from_s3=True)
            try:
                print("MongoDB upload in progress ...")
                sharechat_helper.sharechat_mongo_upload(sharechat_df, coll)
                print("Data uploaded to MongoDB")
            except Exception:
                _ml_report_failure("MongoDB upload failed")
    else:
        print("Scraping in local mode")
        start_time = time.time()
        sharechat_df = _ml_fetch_posts(USER_ID, PASSCODE, tag_hashes, pages, delay)
        sharechat_df = _ml_write_html_preview(sharechat_df, from_s3=False)

    # The CSV is written in both modes, even if the S3 upload failed.
    _ml_write_csv(sharechat_df)
    print("Scraping complete")
    print("Time taken: %s seconds" % (time.time() - start_time))
    return sharechat_df
def _fresh_request_delay(targeting):
    """Return the randomised delay value used for the given targeting strategy.

    Raises:
        ValueError: If ``targeting`` is neither ``"bucket"`` nor ``"tag"``.
    """
    if targeting == "bucket":
        return uniform(10, 15)
    if targeting == "tag":
        return uniform(30, 35)
    raise ValueError(
        "Unsupported targeting {!r}; expected 'bucket' or 'tag'.".format(targeting)
    )


def _fresh_report_failure(message):
    """Print ``message`` followed by the traceback of the active exception."""
    # Imported locally so this does not rely on ``logging`` happening to
    # expose its own internal ``traceback`` import as an attribute.
    import traceback

    print(message)
    print(traceback.format_exc())


def _fresh_fetch_posts(USER_ID, PASSCODE, tag_hashes, pages, unix_timestamp, delay):
    """Scrape fresh posts and reject an empty result."""
    print("Scraping in progress ...")
    sharechat_df = sharechat_helper.get_fresh_data(
        USER_ID, PASSCODE, tag_hashes, pages, unix_timestamp, delay
    )
    if len(sharechat_df) < 1:
        raise ValueError("Returned empty dataframe. No posts were scraped.")
    return sharechat_df


def _fresh_publish_logs(sharechat_df, tagwise_duplicates):
    """Create the HTML preview, duplicates log and CSV, uploading each to the log bucket.

    Each of the three steps is best-effort: a failure is printed and the next
    step still runs. Returns the (possibly updated) scraped data.
    """
    from datetime import timezone

    # Deliberately not guarded: a failure here propagates to the caller.
    # The log-bucket client gets its own name so it cannot be confused with
    # the data-bucket client used for the post upload.
    _aws, logbucket, log_s3 = sharechat_helper.initialize_s3_logbucket()
    # Aware UTC "now" gives the same date stamp as the deprecated utcnow().
    today = datetime.now(timezone.utc).strftime("%Y%m%d")

    try:
        print("HTML file creation in progress ...")
        sharechat_df, sharechat_df_html = sharechat_helper.get_thumbnails_from_s3(
            sharechat_df
        )
        # Explicit UTF-8 so non-ASCII text in the preview does not depend on
        # the platform's default encoding.
        with open("sharechat_fresh_data_preview.html", "w", encoding="utf-8") as f:
            f.write(sharechat_df_html.data)
        print("HTML file created")
        print("Uploading HTML file to S3 ...")
        sharechat_helper.upload_logs(
            s3=log_s3,
            filename="sharechat_fresh_data_preview.html",
            key="fresh_preview_" + today,
            bucket=logbucket,
        )
        print("HTML file uploaded")
    except Exception:
        _fresh_report_failure("HTML file upload failed")

    try:
        print("Duplicates log creation in progress ...")
        with open("tagwise_duplicates.json", "w", encoding="utf-8") as fp:
            json.dump(tagwise_duplicates, fp)
        print("Duplicates log created")
        print("Uploading duplicates log to S3 ...")
        sharechat_helper.upload_logs(
            s3=log_s3,
            filename="tagwise_duplicates.json",
            key="fresh_duplicates_" + today,
            bucket=logbucket,
        )
        print("Duplicates log uploaded")
    except Exception:
        _fresh_report_failure("Duplicates log upload failed")

    try:
        print("CSV file creation in progress ... ")
        sharechat_df.to_csv("sharechat_fresh_data.csv")
        print("CSV file created")
        print("Uploading CSV file to S3 ...")
        sharechat_helper.upload_logs(
            s3=log_s3,
            filename="sharechat_fresh_data.csv",
            key="fresh_posts_" + today,
            bucket=logbucket,
        )
        print("CSV file uploaded")
    except Exception:
        _fresh_report_failure("CSV file upload failed")

    return sharechat_df


def _fresh_write_local_files(sharechat_df):
    """Pickle the data, then best-effort write the HTML preview and CSV locally."""
    # Save data locally
    sharechat_df.to_pickle("sharechat_df.pkl")

    try:
        print("HTML preview file creation in progress ...")
        (
            sharechat_df,
            sharechat_df_html,
        ) = sharechat_helper.get_thumbnails_from_sharechat(sharechat_df)
        with open("sharechat_fresh_data_preview.html", "w", encoding="utf-8") as f:
            f.write(sharechat_df_html.data)
        print("HTML preview file created")
    except Exception:
        _fresh_report_failure("HTML preview file creation failed")

    try:
        print("CSV file creation in progress ... ")
        sharechat_df.to_csv("sharechat_fresh_data.csv")
        print("CSV file created")
        print("{} posts saved".format(len(sharechat_df)))
    except Exception:
        _fresh_report_failure("CSV file creation failed")

    return sharechat_df


def fresh_content_scraper(
    USER_ID=None,
    PASSCODE=None,
    tag_hashes=None,
    bucket_ids=None,
    pages=None,
    unix_timestamp=None,
    mode=None,
    targeting=None,
):
    """Scrape fresh ShareChat posts and save them locally or to S3 and MongoDB.

    Args:
        USER_ID: Credential forwarded unchanged to ``sharechat_helper``.
        PASSCODE: Credential forwarded unchanged to ``sharechat_helper``.
        tag_hashes: Tags to scrape. Ignored (recomputed from ``bucket_ids``)
            when ``targeting == "bucket"``.
        bucket_ids: Buckets whose tag hashes are looked up when
            ``targeting == "bucket"``.
        pages: Forwarded unchanged to ``sharechat_helper.get_fresh_data``.
        unix_timestamp: Forwarded unchanged to
            ``sharechat_helper.get_fresh_data`` (presumably a cut-off time;
            confirm against the helper).
        mode: ``"archive"`` (upload posts and logs to S3, then MongoDB) or
            ``"local"`` (pickle, HTML preview and CSV on disk only).
        targeting: ``"bucket"`` or ``"tag"``; also selects the range the
            random delay is drawn from.

    Returns:
        The scraped data object (the code calls ``to_pickle``/``to_csv`` on
        it, so presumably a pandas DataFrame), or ``None`` when ``mode`` is
        unsupported or archive-mode initialization fails.

    Raises:
        ValueError: If ``targeting`` is unsupported, or if no posts were
            scraped.
    """
    if mode not in ("archive", "local"):
        print("Unsupported mode {!r}; nothing was scraped".format(mode))
        return None

    # Validate targeting up front: previously an unknown value surfaced much
    # later as an UnboundLocalError on ``delay``.
    delay = _fresh_request_delay(targeting)
    if targeting == "bucket":
        tag_hashes = sharechat_helper.get_tag_hashes(USER_ID, PASSCODE, bucket_ids)

    if mode == "local":
        print("Scraping in local mode")
        start_time = time.time()
        sharechat_df = _fresh_fetch_posts(
            USER_ID, PASSCODE, tag_hashes, pages, unix_timestamp, delay
        )
        sharechat_df = _fresh_write_local_files(sharechat_df)
        print("Scraping complete")
        print("Time taken: %s seconds" % (time.time() - start_time))
        return sharechat_df

    print("Scraping in archive mode")
    start_time = time.time()
    # Initialize S3 and Mongo DB
    print("Initializing ...")
    try:
        aws, bucket, s3 = s3_mongo_helper.initialize_s3()
        coll = s3_mongo_helper.initialize_mongo()
    except Exception:
        _fresh_report_failure("Initialization failure")
        # Nothing can be archived without the storage handles.
        return None
    print("Initialized successfully")

    # Scrape data from Sharechat tags
    sharechat_df = _fresh_fetch_posts(
        USER_ID, PASSCODE, tag_hashes, pages, unix_timestamp, delay
    )

    # Save data to S3 & Mongo DB
    try:
        print("S3 upload in progress ...")
        # the returned df includes s3 urls
        sharechat_df, tagwise_duplicates = sharechat_helper.sharechat_s3_upload(
            sharechat_df, aws, bucket, s3, coll
        )
    except Exception:
        _fresh_report_failure("S3 upload failed")
    else:
        print("Data uploaded to S3")
        # Logs and the MongoDB upload only run once the S3 upload succeeded.
        sharechat_df = _fresh_publish_logs(sharechat_df, tagwise_duplicates)
        try:
            print("MongoDB upload in progress ...")
            sharechat_helper.sharechat_mongo_upload(sharechat_df, coll)
            print("Data uploaded to MongoDB")
            print("{} posts saved".format(len(sharechat_df)))
        except Exception:
            _fresh_report_failure("MongoDB upload failed")

    print("Scraping complete")
    print("Time taken: %s seconds" % (time.time() - start_time))
    return sharechat_df