def get_thumbnails_from_s3(df):
    # Build a "thumbnail" column for every scraped post and return the dataframe
    # along with an HTML preview that renders the thumbnails inline.
    def path_to_image_html(path):
        return '<img src="' + path + '" width="200">'

    thumbnail = []
    aws, bucket, s3 = s3_mongo_helper.initialize_s3()
    temp_dir = tempfile.mkdtemp(dir=os.getcwd())
    for link in df["s3_url"]:
        if link is not None:
            if link.split(".")[-1] == "mp4":
                # Extract the first frame of the video with ffmpeg and upload it
                # to S3 to serve as the thumbnail.
                video_input_path = link
                img_output_path = (temp_dir.split("/")[-1] + "/" +
                                   link.split("/")[-1].split(".")[0] + ".jpg")
                filename = link.split("/")[-1].split(".")[0] + ".jpg"
                subprocess.call(
                    [
                        "ffmpeg",
                        "-i",
                        video_input_path,
                        "-ss",
                        "00:00:00.000",
                        "-vframes",
                        "1",
                        img_output_path,
                    ],
                    stderr=subprocess.DEVNULL,
                    stdout=subprocess.DEVNULL,
                )
                s3_mongo_helper.upload_to_s3(
                    s3=s3,
                    file=img_output_path,
                    filename=filename,
                    bucket=bucket,
                    content_type="image/jpeg",
                )
                thumbnail.append(aws + bucket + "/" + filename)
            elif link.split(".")[-1] == "txt":
                thumbnail.append(None)  # text posts have no thumbnail
            else:
                thumbnail.append(link)  # jpg/jpeg/png: use the image itself
        else:
            thumbnail.append(None)  # missing media (NaN)
    df["thumbnail"] = np.array(thumbnail)
    pd.set_option("display.max_colwidth", None)  # don't truncate the <img> tags
    df_html = HTML(
        df.to_html(
            index=False,
            escape=False,
            formatters=dict(thumbnail=path_to_image_html),
            render_links=True,
        ))
    shutil.rmtree(temp_dir)
    return df, df_html
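# Usage sketch (illustrative, not part of the scraper): get_thumbnails_from_s3 expects a
# dataframe with an "s3_url" column and returns it with a "thumbnail" column plus an
# IPython HTML preview. The URLs below are hypothetical placeholders, and the call assumes
# the S3 credentials read by s3_mongo_helper.initialize_s3() are set in the environment.
def _demo_thumbnail_preview():
    import pandas as pd  # pandas is already a module-level dependency of this helper

    preview_df = pd.DataFrame({"s3_url": [
        "https://s3.example.com/test-bucket/abc123.jpg",  # image links are used as-is
        "https://s3.example.com/test-bucket/def456.mp4",  # first frame extracted with ffmpeg
        None,                                             # missing media -> no thumbnail
    ]})
    preview_df, preview_html = get_thumbnails_from_s3(preview_df)
    with open("thumbnail_preview.html", "w") as f:
        f.write(preview_html.data)  # the IPython HTML object exposes its markup via .data
    return preview_df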
def save_to_server(all_msgs: List[Msg], merged_msgs: List[Msg],
                   media_files: list, drive_id: str) -> None:
    """Save msgs and media to the Tattle server.

    This requires setting environment variables.
    """
    # 0. Initialize
    all_coll, merged_coll = initialize_mongo()
    aws, bucket, s3 = initialize_s3()

    # 1. Insert all "raw" msgs, grouped by the chat file they came from
    all_coll = initialize_mongo(var_prefix="whatsapp_all")
    msgs_by_file = group_by_file(all_msgs)
    to_insert = []
    insert_dt = datetime.datetime.utcnow().isoformat()
    for msgs in msgs_by_file.values():
        to_insert.append({
            'scrape_datetime': insert_dt,
            'source': GOOGLE_DRIVE,
            'source_loc': drive_id,
            'msgs': [m.as_dict() for m in msgs],
        })
    all_coll.insert_many(to_insert)

    # 2. Upsert merged msgs
    merged_coll = initialize_mongo(var_prefix="whatsapp_merged")
    msg_gids = [msg.group_id for msg in merged_msgs]
    # Materialize the cursor so the emptiness check and len() below behave correctly.
    existing_msgs = list(merged_coll.find({"group_id": {"$in": msg_gids}}))
    if existing_msgs:
        logging.warning("Not overwriting %d msgs already in server.",
                        len(existing_msgs))
        merge_msgs_from_server(merged_msgs, existing_msgs)
    merged_coll.insert_many([m.as_dict() for m in merged_msgs])

    # 3. Upload media files to S3
    for fl in media_files:
        logging.info("Uploading %r", fl['hash'])
        upload_to_s3(s3, fl['content'], fl['hash'], bucket, fl['media_mime_type'])
    logging.info("Wrote %d files to S3. Done", len(media_files))
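# Usage sketch (illustrative): save_to_server expects parsed WhatsApp messages plus a list
# of media dicts carrying "hash", "content" and "media_mime_type" keys, and the Google Drive
# folder id the exports were pulled from. The names parse_chat_export, export_path and
# drive_folder_id are hypothetical stand-ins for whatever upstream step produces these
# objects in this pipeline.
def _demo_save_to_server(export_path: str, drive_folder_id: str) -> None:
    # Hypothetical helper that returns (all_msgs, merged_msgs, media_files).
    all_msgs, merged_msgs, media_files = parse_chat_export(export_path)
    save_to_server(all_msgs, merged_msgs, media_files, drive_id=drive_folder_id)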
def fresh_content_scraper(
    USER_ID=None,
    PASSCODE=None,
    tag_hashes=None,
    bucket_ids=None,
    pages=None,
    unix_timestamp=None,
    mode=None,
    targeting=None,
):
    if targeting == "bucket":
        tag_hashes = sharechat_helper.get_tag_hashes(USER_ID, PASSCODE, bucket_ids)
        delay = uniform(10, 15)
    elif targeting == "tag":
        delay = uniform(30, 35)
    if mode == "archive":
        print("Scraping in archive mode")
        start_time = time.time()
        # Initialize S3 and Mongo DB
        print("Initializing ...")
        initializationSuccess = False
        try:
            aws, bucket, s3 = s3_mongo_helper.initialize_s3()
            coll = s3_mongo_helper.initialize_mongo()
            initializationSuccess = True
            print("Initialized successfully")
        except Exception:
            print("Initialization failure")
            print(logging.traceback.format_exc())
        # Scrape data from Sharechat tags
        if initializationSuccess:
            print("Scraping in progress ...")
            sharechat_df = sharechat_helper.get_fresh_data(
                USER_ID, PASSCODE, tag_hashes, pages, unix_timestamp, delay)
            if len(sharechat_df) < 1:
                raise ValueError("Returned empty dataframe. No posts were scraped.")
            else:
                # Save data to S3 & Mongo DB
                s3UploadSuccess = False
                try:
                    print("S3 upload in progress ...")
                    sharechat_df, tagwise_duplicates = sharechat_helper.sharechat_s3_upload(
                        sharechat_df, aws, bucket, s3, coll)  # the returned df includes s3 urls
                    s3UploadSuccess = True
                    print("Data uploaded to S3")
                except Exception:
                    print("S3 upload failed")
                    print(logging.traceback.format_exc())
                if s3UploadSuccess:
                    aws, logbucket, s3 = sharechat_helper.initialize_s3_logbucket()
                    today = datetime.utcnow().strftime("%Y%m%d")
                    try:
                        print("HTML file creation in progress ...")
                        sharechat_df, sharechat_df_html = sharechat_helper.get_thumbnails_from_s3(
                            sharechat_df)
                        with open("sharechat_fresh_data_preview.html", "w") as f:
                            f.write(sharechat_df_html.data)
                        print("HTML file created")
                        print("Uploading HTML file to S3 ...")
                        sharechat_helper.upload_logs(
                            s3=s3,
                            filename="sharechat_fresh_data_preview.html",
                            key="fresh_preview_" + today,
                            bucket=logbucket,
                        )
                        print("HTML file uploaded")
                    except Exception:
                        print("HTML file upload failed")
                        print(logging.traceback.format_exc())
                    try:
                        print("Duplicates log creation in progress ...")
                        with open("tagwise_duplicates.json", "w") as fp:
                            json.dump(tagwise_duplicates, fp)
                        print("Duplicates log created")
                        print("Uploading duplicates log to S3 ...")
                        sharechat_helper.upload_logs(
                            s3=s3,
                            filename="tagwise_duplicates.json",
                            key="fresh_duplicates_" + today,
                            bucket=logbucket,
                        )
                        print("Duplicates log uploaded")
                    except Exception:
                        print("Duplicates log upload failed")
                        print(logging.traceback.format_exc())
                    try:
                        print("CSV file creation in progress ...")
                        sharechat_df.to_csv("sharechat_fresh_data.csv")
                        print("CSV file created")
                        print("Uploading CSV file to S3 ...")
                        sharechat_helper.upload_logs(
                            s3=s3,
                            filename="sharechat_fresh_data.csv",
                            key="fresh_posts_" + today,
                            bucket=logbucket,
                        )
                        print("CSV file uploaded")
                    except Exception:
                        print("CSV file upload failed")
                        print(logging.traceback.format_exc())
                    try:
                        print("MongoDB upload in progress ...")
                        sharechat_helper.sharechat_mongo_upload(sharechat_df, coll)
                        print("Data uploaded to MongoDB")
                        print("{} posts saved".format(len(sharechat_df)))
                    except Exception:
                        print("MongoDB upload failed")
                        print(logging.traceback.format_exc())
                print("Scraping complete")
                print("Time taken: %s seconds" % (time.time() - start_time))
                return sharechat_df
    elif mode == "local":
        print("Scraping in local mode")
        start_time = time.time()
        print("Scraping in progress ...")
        sharechat_df = sharechat_helper.get_fresh_data(
            USER_ID, PASSCODE, tag_hashes, pages, unix_timestamp, delay)
        if len(sharechat_df) < 1:
            raise ValueError("Returned empty dataframe. No posts were scraped.")
        else:
            # Save data locally
            sharechat_df.to_pickle("sharechat_df.pkl")
            try:
                print("HTML preview file creation in progress ...")
                sharechat_df, sharechat_df_html = sharechat_helper.get_thumbnails_from_sharechat(
                    sharechat_df)
                with open("sharechat_fresh_data_preview.html", "w") as f:
                    f.write(sharechat_df_html.data)
                print("HTML preview file created")
            except Exception:
                print("HTML preview file creation failed")
                print(logging.traceback.format_exc())
            try:
                print("CSV file creation in progress ...")
                sharechat_df.to_csv("sharechat_fresh_data.csv")
                print("CSV file created")
                print("{} posts saved".format(len(sharechat_df)))
            except Exception:
                print("CSV file creation failed")
                print(logging.traceback.format_exc())
            print("Scraping complete")
            print("Time taken: %s seconds" % (time.time() - start_time))
            return sharechat_df
def trending_content_scraper(USER_ID=None, PASSCODE=None, tag_hashes=None,
                             bucket_ids=None, pages=None, mode=None, targeting=None):
    if targeting == "bucket":
        tag_hashes = sharechat_helper.get_tag_hashes(USER_ID, PASSCODE, bucket_ids)
        delay = uniform(10, 15)
    elif targeting == "tag":
        delay = uniform(30, 35)
    if mode == "archive":
        print("Scraping in archive mode")
        start_time = time.time()
        # Initialize S3 and Mongo DB
        print("Initializing ...")
        initializationSuccess = False
        try:
            aws, bucket, s3 = s3_mongo_helper.initialize_s3()
            coll = s3_mongo_helper.initialize_mongo()
            initializationSuccess = True
            print("Initialized successfully")
        except Exception:
            print("Initialization failure")
            print(logging.traceback.format_exc())
        # Scrape data from Sharechat tags
        if initializationSuccess:
            print("Scraping in progress ...")
            sharechat_df = sharechat_helper.get_trending_data(
                USER_ID, PASSCODE, tag_hashes, pages, delay)
            if len(sharechat_df) < 1:
                raise ValueError("get_data() returned empty dataframe. No posts were scraped.")
            else:
                # Save data locally
                sharechat_df.to_pickle("sharechat_df.pkl")
                # Save data to S3 & Mongo DB
                s3UploadSuccess = False
                try:
                    print("S3 upload in progress ...")
                    sharechat_df = sharechat_helper.sharechat_s3_upload(
                        sharechat_df, aws, bucket, s3)  # the returned df includes s3 urls
                    s3UploadSuccess = True
                    print("Data uploaded to S3")
                except Exception:
                    print("S3 upload failed")
                    print(logging.traceback.format_exc())
                if s3UploadSuccess:
                    try:
                        print("HTML preview file creation in progress ...")
                        sharechat_df, sharechat_df_html = sharechat_helper.get_thumbnails_from_s3(
                            sharechat_df)
                        with open("sharechat_trending_data_preview.html", "w") as f:
                            f.write(sharechat_df_html.data)
                        print("HTML preview file created")
                    except Exception:
                        print("HTML preview file creation failed")
                        print(logging.traceback.format_exc())
                    try:
                        print("MongoDB upload in progress ...")
                        sharechat_helper.sharechat_mongo_upload(sharechat_df, coll)
                        print("Data uploaded to MongoDB")
                    except Exception:
                        print("MongoDB upload failed")
                        print(logging.traceback.format_exc())
                try:
                    print("CSV file creation in progress ...")
                    sharechat_df.to_csv("sharechat_trending_data.csv")
                    print("CSV file created")
                    print("{} posts scraped".format(len(sharechat_df)))
                except Exception:
                    print("CSV file creation failed")
                    print(logging.traceback.format_exc())
                print("Scraping complete")
                print("Time taken: %s seconds" % (time.time() - start_time))
                return sharechat_df
    elif mode == "local":
        print("Scraping in local mode")
        start_time = time.time()
        print("Scraping in progress ...")
        sharechat_df = sharechat_helper.get_trending_data(
            USER_ID, PASSCODE, tag_hashes, pages, delay)
        if len(sharechat_df) < 1:
            raise ValueError("get_data() returned empty dataframe. No posts were scraped.")
        else:
            # Save data locally
            sharechat_df.to_pickle("sharechat_df.pkl")
            try:
                print("HTML preview file creation in progress ...")
                sharechat_df, sharechat_df_html = sharechat_helper.get_thumbnails_from_sharechat(
                    sharechat_df)
                with open("sharechat_trending_data_preview.html", "w") as f:
                    f.write(sharechat_df_html.data)
                print("HTML preview file created")
            except Exception:
                print("HTML preview file creation failed")
                print(logging.traceback.format_exc())
            try:
                print("CSV file creation in progress ...")
                sharechat_df.to_csv("sharechat_trending_data.csv")
                print("CSV file created")
                print("{} posts scraped".format(len(sharechat_df)))
            except Exception:
                print("CSV file creation failed")
                print(logging.traceback.format_exc())
            print("Scraping complete")
            print("Time taken: %s seconds" % (time.time() - start_time))
            return sharechat_df