def compile_detections(aggregated_df, dir_name):
    metadata_path = os.path.join(metadata_dir, '%s_images.csv' % dir_name)
    metadata_df = pd.read_csv(metadata_path,
                              names=[
                                  'CameraID', 'Latitude', 'Longitude', 'Date',
                                  'Time', 'Filename', 'Dimensions'
                              ]).drop_duplicates()
    combined = metadata_df.merge(aggregated_df,
                                 left_on='Filename',
                                 right_on='image',
                                 how='left').drop(columns=['image'])
    combined['Detections'].fillna(0, inplace=True)
    logging.info('Combined DF shape: %s' % str(combined.shape))

    if not os.path.isdir(aggregated_dir):
        os.makedirs(aggregated_dir)
        logging.info('Created %s' % aggregated_dir)
    aggregated_path = os.path.join(aggregated_dir,
                                   '%s_aggregated.csv.xz' % dir_name)
    combined.to_csv(aggregated_path,
                    index=False,
                    header=False,
                    compression='xz')
    logging.info('Saved aggregated data to %s' % aggregated_path)

    upload_blob(bucket, aggregated_path,
                'traffic-images-aggregated/%s_aggregated.csv.xz' % dir_name)
    os.remove(aggregated_path)
    logging.info('Deleted %s' % aggregated_path)
def check_notebook_result(self):
    # Workaround because papermill does not directly return exit code.
    exit_code = '1' if _PAPERMILL_ERR_MSG in \
        open('%s.ipynb' % self._test_name).read() else '0'
    os.chdir(self.TEST_DIR)
    if self._test_name == 'dsl_static_type_checking':
        subprocess.call([
            sys.executable, 'check_notebook_results.py', '--testname',
            self._test_name, '--result', self._sample_test_result,
            '--exit-code', exit_code
        ])
    else:
        subprocess.call([
            sys.executable, 'check_notebook_results.py', '--experiment',
            '%s-test' % self._test_name, '--testname', self._test_name,
            '--result', self._sample_test_result, '--namespace',
            self._namespace, '--exit-code', exit_code
        ])
    print('Copy the test results to GCS %s/' % self._results_gcs_dir)
    utils.upload_blob(
        self._bucket_name, self._sample_test_result,
        os.path.join(self._results_gcs_dir, self._sample_test_result))
def check_notebook_result(self):
    # Workaround because papermill does not directly return exit code.
    exit_code = '1' if PAPERMILL_ERR_MSG in \
        open('%s.ipynb' % self._test_name).read() else '0'
    os.chdir(TEST_DIR)
    if self._test_name == 'dsl_static_type_checking':
        nbchecker = NoteBookChecker(testname=self._test_name,
                                    result=self._sample_test_result,
                                    exit_code=exit_code)
        nbchecker.check()
    else:
        nbchecker = NoteBookChecker(testname=self._test_name,
                                    result=self._sample_test_result,
                                    exit_code=exit_code,
                                    experiment=None,
                                    namespace='kubeflow')
        nbchecker.check()
    print('Copy the test results to GCS %s/' % self._results_gcs_dir)
    utils.upload_blob(
        self._bucket_name, self._sample_test_result,
        os.path.join(self._results_gcs_dir, self._sample_test_result))
def run_detections_for_dir(dir_name):
    logging.info('Running detections for %s' % dir_name)
    images = [
        x for x in os.listdir(os.path.join(images_dir, dir_name))
        if x.endswith('.jpg')
    ]
    compiled = []
    logging.info('%s images to process' % len(images))

    for image_file in images:
        # possible to stack?
        image_path = os.path.join(images_dir, dir_name, image_file)
        image_np = load_image_into_numpy_array(image_path)
        input_tensor = tf.convert_to_tensor(np.expand_dims(image_np, 0),
                                            dtype=tf.float32)
        detections, predictions_dict, shapes = detect_fn(input_tensor)
        # / possible to stack?

        label_id_offset = 1
        for i in range(detections['num_detections'].numpy()[0]):
            score = detections['detection_scores'][:, i].numpy()[0]
            if score > .2:
                object_class = detections['detection_classes'][:, i].numpy(
                )[0] + label_id_offset
                box = detections['detection_boxes'][:, i, :].numpy()[0]
                relative_size = (box[2] - box[0]) * (box[3] - box[1])
                compiled.append([
                    image_file.split('.')[0], object_class, score, box[1],
                    box[3], box[0], box[2], relative_size
                ])

    if not os.path.isdir(detections_dir):
        os.makedirs(detections_dir)
        logging.info('Created %s' % detections_dir)
    detections_path = os.path.join(detections_dir,
                                   '%s_detections.csv.xz' % dir_name)
    detections_df = pd.DataFrame(compiled,
                                 columns=[
                                     'image', 'class', 'score', 'x1', 'x2',
                                     'y1', 'y2', 'relative_size'
                                 ])
    logging.info('Detections DF shape: %s' % str(detections_df.shape))
    detections_df.to_csv(detections_path,
                         index=False,
                         header=False,
                         compression='xz')
    logging.info('Saved detections to %s' % detections_path)

    upload_blob(bucket, detections_path,
                'traffic-images-detections/%s_detections.csv.xz' % dir_name)
    os.remove(detections_path)
    logging.info('Deleted %s' % detections_path)
    return detections_df
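# A minimal sketch of the load_image_into_numpy_array helper called above.
# This is an assumption following the usual TensorFlow Object Detection API
# pattern (the project's own helper may differ): decode the JPEG with Pillow
# and return a (height, width, 3) uint8 array that detect_fn can consume
# after np.expand_dims.
import numpy as np
from PIL import Image


def load_image_into_numpy_array(path):
    """Load an image file into a (height, width, 3) uint8 numpy array."""
    return np.array(Image.open(path).convert('RGB'), dtype=np.uint8)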
def downloader_function(thing, thing2):
    date = (datetime.datetime.utcnow() - datetime.datetime(1970, 1, 1)).days
    people = [
        "bijanmustard", "7e5h", "dadurath", "sagebeans", "spinebashed",
        "theonion", "clickhole"
    ]
    who = people[date % len(people)]
    tweet_dumper.get_all_tweets(who)
    delete_blob("twitter_bot_bucket", f"{who}-clean.txt")
    upload_blob("twitter_bot_bucket", f"/tmp/{who}-clean.txt",
                f"{who}-clean.txt")
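# Hedged sketch of the upload_blob and delete_blob helpers used throughout
# these snippets; the signatures below are an assumption inferred from the
# calls above (bucket_name, source_file_name, destination_blob_name) and
# follow the standard google-cloud-storage client, not the projects' own code.
from google.cloud import storage


def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to the given Cloud Storage bucket."""
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    bucket.blob(destination_blob_name).upload_from_filename(source_file_name)


def delete_blob(bucket_name, blob_name):
    """Delete a blob from the given Cloud Storage bucket."""
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    bucket.blob(blob_name).delete()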
def check_result(self):
    os.chdir(self.TEST_DIR)
    subprocess.call([
        sys.executable, 'run_sample_test.py', '--input',
        '%s/%s.yaml' % (self._work_dir, self._test_name), '--result',
        self._sample_test_result, '--output', self._sample_test_output,
        '--testname', self._test_name, '--namespace', self._namespace
    ])
    print('Copy the test results to GCS %s/' % self._results_gcs_dir)
    utils.upload_blob(
        self._bucket_name, self._sample_test_result,
        os.path.join(self._results_gcs_dir, self._sample_test_result))
def check_result(self):
    os.chdir(self.TEST_DIR)
    pysample_checker = PySampleChecker(testname=self._test_name,
                                       input='%s/%s.yaml' %
                                       (self._work_dir, self._test_name),
                                       output=self._sample_test_output,
                                       result=self._sample_test_result,
                                       namespace=self._namespace)
    pysample_checker.check()
    print('Copy the test results to GCS %s/' % self._results_gcs_dir)
    utils.upload_blob(
        self._bucket_name, self._sample_test_result,
        os.path.join(self._results_gcs_dir, self._sample_test_result))
def main():
    for api in apis:
        logging.info('Loading data for %s' % api)
        request_datetime = datetime.now(
            tz=pytz.timezone('Singapore')).strftime('%Y%m%d%H%M')
        df = generate_table(api)

        dest_dir = os.path.join(data_dir, api)
        if not os.path.isdir(dest_dir):
            os.makedirs(dest_dir)
            logging.info('Created %s' % dest_dir)
        dest_path = os.path.join(dest_dir,
                                 '%s_%s.csv.xz' % (request_datetime, api))
        df.to_csv(dest_path, index=False, header=False, compression='xz')
        logging.info('Saved data to %s' % dest_path)

        upload_blob(bucket, dest_path,
                    '%s/%s_%s.csv.xz' % (api, request_datetime, api))
        os.remove(dest_path)
        logging.info('Deleted %s' % dest_path)
try:
    project_id = json.load(f)['project_id']
    bucket = storage_client.bucket('tyeoh-streetcred', user_project=project_id)

    metadata_dir = os.path.join(app_dir, 'metadata')
    if not os.path.isdir(metadata_dir):
        os.makedirs(metadata_dir)
        logging.info('Created %s' % metadata_dir)

    for api in apis:
        logging.info('Loading data for %s' % api)
        request_date = datetime.now(
            tz=pytz.timezone('Singapore')).strftime('%Y%m%d')
        df = generate_table(api)
        dest_path = os.path.join(
            metadata_dir, '%s_%s_metadata.csv.xz' % (request_date, api))
        df.to_csv(dest_path, index=False, header=True, compression='xz')
        logging.info('Saved data to %s' % dest_path)

        upload_blob(
            bucket, dest_path,
            '%s_metadata/%s_%s_metadata.csv.xz' % (api, request_date, api))
        os.remove(dest_path)
        logging.info('Deleted %s' % dest_path)
except Exception as e:
    logging.error("Exception occurred", exc_info=True)
    raise
else:
    logging.info('Script complete')
def main(argv):
    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        'marketplace',
        help='Shortcut of mba marketplace. I.e "com" or "de", "uk"',
        type=str)
    parser.add_argument(
        '--number_images',
        default=10,
        type=int,
        help='Number of images that should be crawled. If 0, every image that is not already crawled will be crawled.')

    # if python file path is in argv remove it
    if ".py" in argv[0]:
        argv = argv[1:len(argv)]

    # get all arguments
    args = parser.parse_args(argv)
    marketplace = args.marketplace
    number_images = args.number_images

    # get all arguments
    args = parser.parse_args()

    # get already crawled asin list
    #asin_crawled_list = get_asin_images_crawled("mba_de.products_images")

    df_images = get_images_urls_not_crawled(marketplace)

    # if number_images is equal to 0, every image should be crawled
    if number_images == 0:
        number_images = len(df_images)

    for j, image_row in df_images.iloc[0:number_images].iterrows():
        asin = image_row["asin"]
        url_image_hq = image_row["url_image_hq"]
        url_image_lowq = image_row["url_image_lowq"]
        #headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
        #proxy_list = get_proxies("de", True)
        #proxy = next(iter(proxy_list))
        #proxies={"http": proxy, "https": proxy}
        r = ProxyRequests(url_image_hq)
        r.get()
        print("Proxy used: " + str(r.get_proxy_used()))
        if 200 == r.get_status_code():
            print(r.get_status_code())
            # save image locally
            with open("data/shirts/shirt.jpg", 'wb') as f:
                f.write(r.get_raw())
            utils.upload_blob(
                "5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg",
                "mba-shirts/" + marketplace + "/" + asin + ".jpg")
            df_img = pd.DataFrame(data={
                "asin": [asin],
                "url": [
                    "https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"
                    + marketplace + "/" + asin + ".jpg"
                ],
                "url_gs": [
                    "gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/" +
                    marketplace + "/" + asin + ".jpg"
                ],
                "url_mba_lowq": [url_image_lowq],
                "url_mba_hq": [url_image_hq],
                "timestamp": [datetime.datetime.now()]
            }, dtype=np.object)
            df_img['timestamp'] = df_img['timestamp'].astype('datetime64')
            df_img.to_gbq("mba_" + marketplace + ".products_images",
                          project_id="mba-pipeline",
                          if_exists="append")
            print("Successfully crawled image: %s | %s of %s" %
                  (asin, j + 1, number_images))
        else:
            print("Could not crawl image: %s | %s of %s" %
                  (asin, j + 1, number_images))
        #response = requests.get(quote_plus(url_image_hq),proxies=proxies,headers=headers, stream=True)
        test = 0

    bucket_name = "5c0ae2727a254b608a4ee55a15a05fb7"
    folder_name = "mba-shirts"
    file_path = "mba-pipeline/crawler/mba/data/test.jpg"
    #upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", file_path , "mba-shirts/test.jpg")
    test = 0
def main(argv):
    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        'marketplace',
        help='Shortcut of mba marketplace. I.e "com" or "de", "uk"',
        type=str)
    parser.add_argument('--telegram_api_key',
                        default="",
                        help='API key of mba bot',
                        type=str)
    parser.add_argument(
        '--telegram_chatid',
        default="",
        help='Id of channel like private chat or group channel',
        type=str)
    parser.add_argument(
        '--number_products',
        default=10,
        type=int,
        help='Number of products/shirts that should be crawled. If 0, every product that is not already crawled will be crawled.')
    parser.add_argument(
        '--connection_timeout',
        default=10.0,
        type=float,
        help='Time that the request operation has until it breaks up. Default: 10.0 sec')
    parser.add_argument(
        '--time_break_sec',
        default=240,
        type=int,
        help='Time in seconds the script tries to get a response for a certain product. Default 240 sec')
    parser.add_argument(
        '--seconds_between_crawl',
        default=20,
        type=int,
        help='Time in seconds in which no proxy/ip should be used twice for crawling. Important to prevent being blacklisted. Default 20 sec')
    parser.add_argument(
        '--preemptible_code',
        default="0",
        type=str,
        help='Identifier of instance for preemptible logs. Default 0 which leads to GUID.')
    parser.add_argument(
        '--pre_instance_name',
        default="",
        type=str,
        help='Name of instance. Important: if set, script will stop instance after successful operation. Default "".')
    parser.add_argument(
        '--zone',
        default="",
        type=str,
        help='Zone of instance. Must fit to close the instance correctly after successful run. Default major zone of marketplace.')

    print(os.getcwd())
    print(argv)
    # if python file path is in argv remove it
    if ".py" in argv[0]:
        argv = argv[1:len(argv)]

    # get all arguments
    args = parser.parse_args(argv)
    marketplace = args.marketplace
    api_key = args.telegram_api_key
    chat_id = args.telegram_chatid
    number_products = args.number_products
    connection_timeout = args.connection_timeout
    time_break_sec = args.time_break_sec
    seconds_between_crawl = args.seconds_between_crawl
    preemptible_code = args.preemptible_code
    pre_instance_name = args.pre_instance_name
    zone = args.zone
    if zone == "":
        zone = utils.get_zone_of_marketplace(marketplace)
    ip_address = utils.get_extrenal_ip(pre_instance_name, zone)
    if preemptible_code == "0":
        preemptible_code = uuid.uuid4().hex

    # get all arguments
    args = parser.parse_args()

    # get asins which are not already crawled
    df_product_details_tocrawl = get_asin_product_detail_crawled(marketplace)
    if len(df_product_details_tocrawl) == 0:
        print("no data to crawl")
        if pre_instance_name != "" and "pre" in pre_instance_name:
            utils.stop_instance(pre_instance_name,
                                zone,
                                msg="No data to crawl",
                                api_key=api_key,
                                chat_id=chat_id)
        return 0

    #df_product_details = pd.DataFrame(data={"asin": ["B07RVNJHZL"], "url_product": ["adwwadwad"]})
    df_product_details_tocrawl["url_product_asin"] = df_product_details_tocrawl.apply(
        lambda x: "https://www.amazon." + marketplace + "/dp/" + x["asin"],
        axis=1)

    # if number_products is equal to 0, every product should be crawled
    if number_products == 0:
        number_products = len(df_product_details_tocrawl)

    reservationdate = datetime.datetime.now()
    df_reservation = df_product_details_tocrawl.iloc[0:number_products][["asin"]].copy()
    df_reservation['status'] = "blocked"
    df_reservation['pree_id'] = preemptible_code
    df_reservation['ip_address'] = ip_address
    df_reservation['error_log'] = ""
    df_reservation['timestamp'] = reservationdate
    df_reservation['timestamp'] = df_reservation['timestamp'].astype('datetime64')
    try:
        df_reservation.to_gbq(
            "preemptible_logs.mba_detail_" + marketplace +
            "_preemptible_%s_%s_%s" %
            (reservationdate.year, reservationdate.month, reservationdate.day),
            project_id="mba-pipeline",
            if_exists="append")
    except:
        utils.stop_instance(pre_instance_name,
                            zone,
                            msg="Can not update big query reservation",
                            api_key=api_key,
                            chat_id=chat_id)

    for j, product_row in df_product_details_tocrawl.iloc[0:number_products].iterrows():
        asin = product_row["asin"]
        url_product = product_row["url_product"]
        url_product_asin = product_row["url_product_asin"]
        if True:
            # try to get response with free proxies
            response = get_response(marketplace,
                                    url_product_asin,
                                    use_proxy=False,
                                    connection_timeout=connection_timeout,
                                    time_break_sec=time_break_sec,
                                    seconds_between_crawl=seconds_between_crawl)
            if response == None:
                # update reservation logs with blacklist of ip
                update_reservation_logs(marketplace, "blacklist", "blacklist",
                                        preemptible_code, ip_address,
                                        pre_instance_name, zone, api_key,
                                        chat_id)
                # if script is called by preemptible instance it should be deleted by itself
                if pre_instance_name != "":
                    utils.stop_instance(
                        pre_instance_name,
                        zone,
                        msg="Response is none because of time break condition",
                        api_key=api_key,
                        chat_id=chat_id)
                else:
                    assert response != None, "Could not get response within time break condition"
            if response == 404:
                crawlingdate = [datetime.datetime.now()]
                df_product_details = pd.DataFrame(
                    data={
                        "asin": [asin],
                        "title": ["404"],
                        "brand": ["404"],
                        "url_brand": ["404"],
                        "price": ["404"],
                        "fit_types": [["404"]],
                        "color_names": [["404"]],
                        "color_count": [404],
                        "product_features": [["404"]],
                        "description": ["404"],
                        "weight": ["404"],
                        "upload_date_str": ["1995-01-01"],
                        "upload_date": ["1995-01-01"],
                        "customer_review_score": ["404"],
                        "customer_review_count": [404],
                        "mba_bsr_str": ["404"],
                        "mba_bsr": [["404"]],
                        "mba_bsr_categorie": [["404"]],
                        "timestamp": crawlingdate
                    })
                # transform date/timestamp columns to datetime objects
                df_product_details['timestamp'] = df_product_details['timestamp'].astype('datetime64')
                df_product_details['upload_date'] = df_product_details['upload_date'].astype('datetime64')
                df_product_details.to_gbq("mba_" + marketplace + ".products_details",
                                          project_id="mba-pipeline",
                                          if_exists="append")
                update_reservation_logs(marketplace, asin, "404",
                                        preemptible_code, ip_address,
                                        pre_instance_name, zone, api_key,
                                        chat_id)
                print("No Match: Got 404: %s | %s of %s" %
                      (asin, j + 1, number_products))
                continue

            # save product detail page locally
            with open("data/mba_detail_page.html", "w") as f:
                f.write(response.text)
            # transform html response to soup format
            soup = BeautifulSoup(
                utils.get_div_in_html(response.text, 'id="dp-container"'),
                'html.parser')
            # save html in storage
            utils.upload_blob(
                "5c0ae2727a254b608a4ee55a15a05fb7", "data/mba_detail_page.html",
                "logs/" + marketplace + "/product_detail/" + str(asin) + ".html")
        else:
            with open("data/mba_detail_page.html") as f:
                html_str = f.read()
            asin = "B086D9RL8Q"
            soup = BeautifulSoup(
                utils.get_div_in_html(html_str, 'id="dp-container"'),
                'html.parser')

        try:
            df_product_details = get_product_detail_df(soup, asin,
                                                       url_product_asin,
                                                       marketplace, chat_id,
                                                       api_key)
        except:
            utils.send_msg(
                chat_id,
                "Error while trying to get information for asin: " + str(asin),
                api_key)
            continue
        try:
            df_product_details.to_gbq("mba_" + marketplace + ".products_details",
                                      project_id="mba-pipeline",
                                      if_exists="append")
        except:
            update_reservation_logs(marketplace, asin, "failure",
                                    preemptible_code, ip_address,
                                    pre_instance_name, zone, api_key, chat_id)
        update_reservation_logs(marketplace, asin, "success", preemptible_code,
                                ip_address, pre_instance_name, zone, api_key,
                                chat_id)
        print("Match: Successfully crawled product: %s | %s of %s" %
              (asin, j + 1, number_products))

    global df_successfull_proxies
    if type(df_successfull_proxies) != type(None):
        print(df_successfull_proxies.iloc[0])
        #df_successfull_proxies.to_csv("data/successfull_proxies.csv")

    # if script is called by preemptible instance it should be deleted by itself
    if pre_instance_name != "" and "pre" in pre_instance_name:
        utils.stop_instance(pre_instance_name,
                            zone,
                            msg="Success",
                            api_key=api_key,
                            chat_id=chat_id)
    test = 0
all_upvotes = memedroid.union(twitter).union(reddit).union(imgur)
X = X.join(all_upvotes, on=['id'], how="left_outer")
X = X.na.drop()
X.cache()

X_df = X.toPandas()
X_df.loc[X_df['source'] == 'reddit', 'percentile'] = X_df[
    X_df['source'] == 'reddit']['upvotes'].apply(get_percentile_reddit)
X_df.loc[X_df['source'] == 'twitter', 'percentile'] = X_df[
    X_df['source'] == 'twitter']['upvotes'].apply(get_percentile_twitter)
X_df.loc[X_df['source'] == 'imgur', 'percentile'] = X_df[
    X_df['source'] == 'imgur']['upvotes'].apply(get_percentile_imgur)
X_df.loc[X_df['source'] == 'memedroid', 'percentile'] = X_df[
    X_df['source'] == 'memedroid']['upvotes'].apply(get_percentile_memedroid)

X_df = X_df.loc[:, [
    'id', 'url', 'image_path', 'source', 'timestamp', 'upvotes', 'percentile',
    'cluster'
]]
X_df.columns = [
    'meme_id', 'url', 'image_path', 'source', 'meme_datetime', 'upvotes',
    'upvotes_centile', 'cluster'
]

X_df.to_json('/home/data_to_upload_on_bucket/one_hour_data.json')
upload_blob('/home/data_to_upload_on_bucket/one_hour_data.json',
            'one_hour_data.json')
def main(argv):
    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        'marketplace',
        help='Shortcut of mba marketplace. I.e "com" or "de", "uk"',
        type=str)
    parser.add_argument(
        '--number_chunks',
        default=1,
        type=int,
        help='Number of chunks that should be crawled. If 0, every image that is not already crawled will be crawled.')
    parser.add_argument('--chunk_size',
                        default=10,
                        type=int,
                        help='Chunk of images to batch upload to bigquery.')

    # if python file path is in argv remove it
    if ".py" in argv[0]:
        argv = argv[1:len(argv)]

    # get all arguments
    args = parser.parse_args(argv)
    marketplace = args.marketplace
    number_chunks = args.number_chunks
    chunk_size = args.chunk_size

    # get all arguments
    args = parser.parse_args()

    # get already crawled asin list
    #asin_crawled_list = get_asin_images_crawled("mba_de.products_images")

    df_images = get_images_urls_not_crawled(marketplace)

    pool = multiprocessing.Pool(4)

    def crawl_img(image_row):
        asin = image_row["asin"]
        url_image_hq = image_row["url_image_hq"]
        print(asin)
        r = ProxyRequests(url_image_hq)
        r.get()
        print("Proxy used: " + str(r.get_proxy_used()))
        if 200 == r.get_status_code():
            print(r.get_status_code())
            # save image locally
            with open("data/shirts/shirt.jpg", 'wb') as f:
                f.write(r.get_raw())
            #df_img = pd.DataFrame(data={"asin":[asin],"url":["https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"+marketplace+"/"+asin+".jpg"],"url_gs":["gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"+marketplace+"/"+asin+".jpg"],"url_mba_lowq":[url_image_lowq],"url_mba_hq":[url_image_hq], "timestamp":[datetime.datetime.now()]}, dtype=np.object)
            #df_imgs = df_imgs.append(df_img)
            #utils.upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg", "mba-shirts/"+marketplace+"/" + asin + ".jpg")
            print("Successfully crawled image: %s" % (asin))
        else:
            print("Could not crawl image: %s" % (asin))

    df_images_chunks = [
        df_images[i:i + chunk_size]
        for i in range(0, df_images.shape[0], chunk_size)
    ]
    # if number_chunks is equal to 0, every image should be crawled
    if number_chunks == 0:
        number_chunks = len(df_images_chunks)

    for j, df_images in enumerate(df_images_chunks[0:number_chunks]):
        df_imgs = pd.DataFrame(data={
            "asin": [],
            "url": [],
            "url_gs": [],
            "url_mba_lowq": [],
            "url_mba_hq": [],
            "timestamp": []
        }, dtype=np.object)
        #df_dask = ddf.from_pandas(df_images, npartitions=chunk_size) # where the number of partitions is the number of cores you want to use
        #df_dask.apply(lambda x: crawl_img(x), meta=('str'), axis=1).compute(scheduler='multiprocessing')
        for i, image_row in df_images.iterrows():
            asin = image_row["asin"]
            url_image_hq = image_row["url_image_hq"]
            url_image_lowq = image_row["url_image_lowq"]
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
            }
            #proxy_list = get_proxies("de", True)
            #proxy = next(iter(proxy_list))
            while True:
                try:
                    proxies = proxy_handler.get_random_proxy_url_dict(
                        path_proxy_json='mba_crawler/proxy/proxies.json',
                        only_working=True)
                    r = requests.get(url_image_hq,
                                     proxies=proxies,
                                     headers=headers,
                                     stream=True)
                    break
                except Exception as e:
                    print(str(e), proxies)
                    continue
            #print("Proxy used: " + str(r.meta))
            #r = ProxyRequests(url_image_hq)
            #r.get()
            #print("Proxy used: " + str(r.get_proxy_used()))
            if 200 == r.status_code:
                # save image locally
                with open("data/shirts/shirt.jpg", 'wb') as f:
                    r.raw.decode_content = True
                    shutil.copyfileobj(r.raw, f)
                df_img = pd.DataFrame(data={
                    "asin": [asin],
                    "url": [
                        "https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"
                        + marketplace + "/" + asin + ".jpg"
                    ],
                    "url_gs": [
                        "gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/" +
                        marketplace + "/" + asin + ".jpg"
                    ],
                    "url_mba_lowq": [url_image_lowq],
                    "url_mba_hq": [url_image_hq],
                    "timestamp": [datetime.datetime.now()]
                }, dtype=np.object)
                df_imgs = df_imgs.append(df_img)
                utils.upload_blob(
                    "5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg",
                    "mba-shirts/" + marketplace + "/" + asin + ".jpg",
                    verbose=False)
            else:
                print("Could not crawl image: %s" % (asin))
            #response = requests.get(quote_plus(url_image_hq),proxies=proxies,headers=headers, stream=True)
            test = 0
        print("%s of %s chunks" % (j + 1, number_chunks))
        df_imgs['timestamp'] = df_imgs['timestamp'].astype('datetime64')
        df_imgs.to_gbq("mba_" + marketplace + ".products_images",
                       project_id="mba-pipeline",
                       if_exists="append")
        test = 0

    bucket_name = "5c0ae2727a254b608a4ee55a15a05fb7"
    folder_name = "mba-shirts"
    file_path = "mba-pipeline/crawler/mba/data/test.jpg"
    #upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", file_path , "mba-shirts/test.jpg")
    test = 0
credential_path = "C:\\Users\\moacy\\OneDrive\\Documentos\\Python Scripts\\cpdoc-text-align-master\\AudioCPDOC-d89584a0ad75.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

#%% INITIAL DATA - THESE VALUES MUST BE CHANGED FOR EACH AUDIO FILE
audio_path = 'Entrevistas\\'
# audio_file = "pho_1880_luiz_moraes_2011-05-20_01-4.wav"
audio_file = "A-Anjos-PsicologiaDeUmVencido.wav"
audio_name = audio_file.split('.')[0]  # keep only the part before the extension
bucket_name = 'alinhamento-audio'
source_file_name = audio_path + audio_file
destination_blob_name = audio_name
gcs_uri = 'gs://' + bucket_name + '/' + destination_blob_name

utils.upload_blob(bucket_name, source_file_name, destination_blob_name)

config = speech.RecognitionConfig(
    #encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    #sample_rate_hertz=16000,
    audio_channel_count=1,
    language_code='pt-BR',
    #model='video',       # These two options (model and use_enhanced) are more expensive...
    #use_enhanced=True,   # worth testing whether they pay off in some specific case.
    #language_code='en-US',
    enable_word_time_offsets=True)

# subtitle characteristics
max_linhas_por_bloco = 3
max_caracteres_linha = 50
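# Hypothetical continuation (not part of the original script) showing how the
# config and gcs_uri defined above would typically be sent to Cloud
# Speech-to-Text; long_running_recognize is the usual call for GCS-hosted
# audio longer than one minute, and word-level timestamps come back because
# enable_word_time_offsets=True.
from google.cloud import speech

client = speech.SpeechClient()
audio = speech.RecognitionAudio(uri=gcs_uri)
operation = client.long_running_recognize(config=config, audio=audio)
response = operation.result(timeout=3600)
for result in response.results:
    print(result.alternatives[0].transcript)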
def main(argv):
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('keyword', help='Keyword that you like to query in mba', type=str)
    parser.add_argument('api_key', help='API key of proxycrawl', type=str)
    parser.add_argument('marketplace', help='Shortcut of mba marketplace. I.e "com" or "de", "uk"', type=str)
    parser.add_argument('pod_product', help='Name of Print on Demand product. I.e "shirt", "premium", "longsleeve", "sweatshirt", "hoodie", "popsocket", "kdp"', type=str)
    parser.add_argument('sort', help='What kind of sorting do you want? I.e "best_seller", "price_up", "price_down", "cust_rating", "oldest", "newest"', type=str)
    parser.add_argument('--pages', default=0, type=int, help='Count of pages that should be crawled on amazon. Asin break condition is ignored if not 0')
    parser.add_argument('--start_page', default=1, type=int, help='Starting page number. Default is 1 (first page)')

    print(os.getcwd())
    print(argv)
    # if python file path is in argv remove it
    if ".py" in argv[0]:
        argv = argv[1:len(argv)]

    # get all arguments
    args = parser.parse_args(argv)
    keyword = args.keyword
    api_key = args.api_key
    marketplace = args.marketplace
    pod_product = args.pod_product
    sort = args.sort
    pages = args.pages
    start_page = args.start_page
    language = "de"

    # get all arguments
    args = parser.parse_args()

    #df = get_df_hobbies("de")
    df = pd.read_csv("data/hobbies_de.csv")
    hobbies_list = df["hobby"].tolist()
    test_hobby = hobbies_list[4]

    # get already crawled asin list
    asin_crawled_list = get_asin_crawled("mba_de.products")

    url_mba = url_creator.main([keyword, marketplace, pod_product, sort])

    # if start_page is other than 1, crawler should start from a different page
    if start_page != 1:
        url_mba = url_mba + "&page=" + str(start_page) + "&ref=sr_pg_" + str(start_page)

    #headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
    #proxy_list = get_proxies("de", True)
    #proxy = next(iter(proxy_list))
    #proxies={"http": proxy, "https": proxy}

    # If pages is 0, i.e. the default value, the loop should break as soon as a product appears that was already crawled
    # At least 30 pages should be crawled to avoid too many requests to the API
    if pages == 0:
        count_pages = 30
    else:
        count_pages = pages

    no_response = False
    for current_page in np.arange(start_page, start_page + count_pages, 1):
        #print(current_page)
        #'''
        timeout = time.time() + 60
        response = requests.get(make_url_to_proxy_crawl_url(api_key, url_mba), stream=True)
        while response.status_code != 200:
            response = requests.get(make_url_to_proxy_crawl_url(api_key, url_mba), stream=True)
            if time.time() > timeout:
                no_response = True
                break

        if no_response:
            print("Error: No response found. Status code: " + str(response.status_code))
            print("Current page: " + str(current_page))
            break
        else:
            print("Crawling mba data was successful")

        # transform html response to soup format
        soup = BeautifulSoup(get_shirt_div(response.text, "s-main-slot s-result-list s-search-results sg-row"), 'html.parser')
        with open("data/mba_page.html", "w") as f:
            f.write(response.text)
        # save html page in storage
        utils.upload_blob(
            "5c0ae2727a254b608a4ee55a15a05fb7", "data/mba_page.html",
            "logs/" + marketplace + "/product_overview/" + str(datetime.date.today()) +
            "_" + keyword + "_" + sort + "_" + str(current_page) + ".html")

        # use this code block to read html without applying proxy crawl
        #with open("data/newest_2.html") as f:
        #    html_str = f.read()
        #    soup = BeautifulSoup(get_shirt_div(html_str, "s-main-slot s-result-list s-search-results sg-row"), 'html.parser')

        shirts = soup.find_all("div", class_="sg-col-inner")
        # get dataframe with product information
        df_products, asin_already_crawled = get_shirt_product_df(shirts, asin_crawled_list, pages, url_mba)
        # save data in big query
        df_products.to_gbq("mba_" + marketplace + ".products", project_id="mba-pipeline", if_exists="append")
        # get link to next page
        url_mba = "/".join(url_mba.split("/")[0:3]) + soup.find("ul", class_="a-pagination").find(class_="a-last").find("a")["href"]
        print("Page " + str(current_page) + " successfully crawled")

        # BREAK CONDITION only if pages parameter is not set
        if pages == 0 and asin_already_crawled:
            break
        #'''

    bucket_name = "5c0ae2727a254b608a4ee55a15a05fb7"
    folder_name = "mba-shirts"
    file_path = "mba-pipeline/crawler/mba/data/test.jpg"
    #upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", file_path , "mba-shirts/test.jpg")
    test = 0
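# Hypothetical sketch of the make_url_to_proxy_crawl_url helper called above;
# the endpoint and parameter names are assumptions based on ProxyCrawl's
# documented pattern of passing a token plus the URL-encoded target address,
# not code taken from this project.
from urllib.parse import quote_plus


def make_url_to_proxy_crawl_url(api_key, url_mba):
    """Wrap a target URL so the request is routed through the ProxyCrawl API."""
    return "https://api.proxycrawl.com/?token=%s&url=%s" % (api_key,
                                                            quote_plus(url_mba))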