def update_reservation_logs(marketplace, asin, status, preemptible_code, ip_address, pre_instance_name, zone, api_key, chat_id):
    """Append one reservation-status row for ``asin`` to the daily BigQuery log table.

    The destination table is
    ``preemptible_logs.mba_detail_<marketplace>_preemptible_<year>_<month>_<day>``.
    If the module-level ``df_successfull_proxies`` DataFrame is set, the first
    row's error count and messages are embedded in the ``error_log`` column.
    On upload failure the compute instance is stopped via ``utils.stop_instance``
    instead of raising.

    Args:
        marketplace: Marketplace shortcut, e.g. "com" or "de".
        asin: Product ASIN (or a sentinel such as "blacklist").
        status: Status string written to the log row (e.g. "success", "404").
        preemptible_code: Identifier of the preemptible instance run.
        ip_address: External IP of the crawling instance.
        pre_instance_name: Instance name; passed through to stop_instance on failure.
        zone: GCP zone of the instance.
        api_key: Telegram bot API key for failure notification.
        chat_id: Telegram chat id for failure notification.

    Returns:
        None.
    """
    global df_successfull_proxies
    error_str = ""
    # Fix: use `is not None` instead of the `type(x) != type(None)` anti-pattern.
    if df_successfull_proxies is not None:
        print(df_successfull_proxies.iloc[0])
        # Hoist the repeated iloc[0] lookup.
        first_proxy = df_successfull_proxies.iloc[0]
        error_str = ("Error count: " + str(first_proxy["errorCount"])
                     + " errors: " + ",".join(first_proxy["errors"]))
    reservationdate = datetime.datetime.now()
    df_reservation = pd.DataFrame({
        "asin": [str(asin)],
        "timestamp": [reservationdate],
        "status": [str(status)],
        "pree_id": [str(preemptible_code)],
        "ip_address": [ip_address],
        "error_log": [error_str],
    })
    # NOTE(review): bare 'datetime64' is deprecated in newer pandas; kept for
    # compatibility with the rest of the file — confirm pandas version before changing.
    df_reservation['timestamp'] = df_reservation['timestamp'].astype(
        'datetime64')
    try:
        df_reservation.to_gbq(
            "preemptible_logs.mba_detail_" + marketplace +
            "_preemptible_%s_%s_%s" % (reservationdate.year,
                                       reservationdate.month,
                                       reservationdate.day),
            project_id="mba-pipeline",
            if_exists="append")
    # Fix: narrow the bare `except:` so SystemExit/KeyboardInterrupt still propagate.
    except Exception:
        utils.stop_instance(pre_instance_name, zone,
                            msg="Can not update big query reservation",
                            api_key=api_key, chat_id=chat_id)
# -----------------------------------------------------------------------------
# Build the final submission from the per-variable test predictions.
# -----------------------------------------------------------------------------

# Collapse the per-fold axis: one mean prediction per (row, variable).
p_test_mean = p_test_all.mean(axis=2)

# Start from the prior odds (1:9) and multiply in the odds contribution of
# every variable whose out-of-fold AUC is at least chance level.
p_test_odds = np.ones(100000) * 1 / 9
for j in range(var_len):
    auc = roc_auc_score(y_train, oof[:, j])
    if auc >= 0.500:
        p_test_odds = p_test_odds * (9 * p_test_mean[:, j] /
                                     (1 - p_test_mean[:, j]))

# Map the accumulated odds back onto a probability scale.
p_test_odds = p_test_odds / (1 + p_test_odds)

# Align predictions with the sample-submission ID order; IDs without a
# prediction fall back to 0.
sub1 = pd.read_csv("../input/sample_submission.csv.zip")
sub2 = pd.DataFrame({"ID_code": test.ID_code.values, "target": p_test_odds})
sub = pd.merge(sub1[["ID_code"]], sub2, how="left").fillna(0)

# save
sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')
print(sub.target.describe())

# Copy the submission and this run's log to cloud storage.
os.system(f'gsutil cp {SUBMIT_FILE_PATH} gs://malware_onodera/')
os.system(f'cp LOG/log_{__file__}.txt LOG/log_{__file__}_{SEED}.txt')
os.system(f'gsutil cp LOG/log_{__file__}_{SEED}.txt gs://malware_onodera/')
"""
gsutil cp gs://malware_onodera/*.gz ../output/
gsutil cp gs://malware_onodera/*.txt LOG/
"""
#==============================================================================
utils.end(__file__)
utils.stop_instance()
def main(argv):
    """Crawl MBA product-detail pages for not-yet-crawled ASINs and push results to BigQuery.

    Flow: parse CLI args -> reserve a batch of ASINs in the preemptible log
    table -> fetch each product page -> parse it -> append details to
    ``mba_<marketplace>.products_details`` -> log success/failure per ASIN ->
    optionally stop the (preemptible) instance when done.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        'marketplace',
        help='Shortcut of mba marketplace. I.e "com" or "de", "uk"',
        type=str)
    parser.add_argument('--telegram_api_key',
                        default="",
                        help='API key of mba bot',
                        type=str)
    parser.add_argument(
        '--telegram_chatid',
        default="",
        help='Id of channel like private chat or group channel',
        type=str)
    parser.add_argument(
        '--number_products',
        default=10,
        type=int,
        help=
        'Number of products/shirts that shoul be crawled. If 0, every image that is not already crawled will be crawled.'
    )
    parser.add_argument(
        '--connection_timeout',
        default=10.0,
        type=float,
        help=
        'Time that the request operation has until its breaks up. Default: 10.0 sec'
    )
    parser.add_argument(
        '--time_break_sec',
        default=240,
        type=int,
        help=
        'Time in seconds the script tries to get response of certain product. Default 240 sec'
    )
    parser.add_argument(
        '--seconds_between_crawl',
        default=20,
        type=int,
        help=
        'Time in seconds in which no proxy/ip shoul be used twice for crawling. Important to prevent being blacklisted. Default 20 sec'
    )
    parser.add_argument(
        '--preemptible_code',
        default="0",
        type=str,
        help=
        'Identifier of instance for pree logs. Default 0 which leads to GUID.')
    parser.add_argument(
        '--pre_instance_name',
        default="",
        type=str,
        help=
        'Name of instance. Important: if set, script will stop instance after successfull operation. Default "".'
    )
    parser.add_argument(
        '--zone',
        default="",
        type=str,
        help=
        'Zone of instance. Must fit to close the instance correctly after successfull run. Default mayor zone of marketplace.'
    )
    print(os.getcwd())
    print(argv)
    # if python file path is in argv remove it
    if ".py" in argv[0]:
        argv = argv[1:len(argv)]
    # get all arguments
    args = parser.parse_args(argv)
    marketplace = args.marketplace
    api_key = args.telegram_api_key
    chat_id = args.telegram_chatid
    number_products = args.number_products
    connection_timeout = args.connection_timeout
    time_break_sec = args.time_break_sec
    seconds_between_crawl = args.seconds_between_crawl
    preemptible_code = args.preemptible_code
    pre_instance_name = args.pre_instance_name
    zone = args.zone
    # Fall back to the marketplace's default zone when none was given.
    if zone == "":
        zone = utils.get_zone_of_marketplace(marketplace)
    ip_address = utils.get_extrenal_ip(pre_instance_name, zone)
    # "0" sentinel -> generate a fresh GUID for this run's reservation rows.
    if preemptible_code == "0":
        preemptible_code = uuid.uuid4().hex
    # get all arguments
    # NOTE(review): this second parse_args() re-parses sys.argv and overwrites
    # the args parsed from *argv* above — looks redundant/unintended; confirm.
    args = parser.parse_args()
    # get asins which are not already crawled
    df_product_details_tocrawl = get_asin_product_detail_crawled(marketplace)
    if len(df_product_details_tocrawl) == 0:
        print("no data to crawl")
        # Only preemptible instances ("pre" in name) self-terminate here.
        if pre_instance_name != "" and "pre" in pre_instance_name:
            utils.stop_instance(pre_instance_name,
                                zone,
                                msg="No data to crawl",
                                api_key=api_key,
                                chat_id=chat_id)
        return 0
    #df_product_details = pd.DataFrame(data={"asin": ["B07RVNJHZL"], "url_product": ["adwwadwad"]})
    # Build the canonical product URL from marketplace + ASIN.
    df_product_details_tocrawl[
        "url_product_asin"] = df_product_details_tocrawl.apply(
            lambda x: "https://www.amazon." + marketplace + "/dp/" + x["asin"],
            axis=1)
    # if number_images is equal to 0, evry image should be crawled
    if number_products == 0:
        number_products = len(df_product_details_tocrawl)
    # Reserve the batch: mark the first number_products ASINs as "blocked" in
    # the daily preemptible log table so parallel instances skip them.
    reservationdate = datetime.datetime.now()
    df_reservation = df_product_details_tocrawl.iloc[0:number_products][[
        "asin"
    ]].copy()
    df_reservation['status'] = "blocked"
    df_reservation['pree_id'] = preemptible_code
    df_reservation['ip_address'] = ip_address
    df_reservation['error_log'] = ""
    df_reservation['timestamp'] = reservationdate
    df_reservation['timestamp'] = df_reservation['timestamp'].astype(
        'datetime64')
    try:
        df_reservation.to_gbq(
            "preemptible_logs.mba_detail_" + marketplace +
            "_preemptible_%s_%s_%s" % (reservationdate.year,
                                       reservationdate.month,
                                       reservationdate.day),
            project_id="mba-pipeline",
            if_exists="append")
    except:
        # Upload failed: shut the instance down rather than crawl unreserved.
        utils.stop_instance(pre_instance_name,
                            zone,
                            msg="Can not update big query reservation",
                            api_key=api_key,
                            chat_id=chat_id)
    for j, product_row in df_product_details_tocrawl.iloc[
            0:number_products].iterrows():
        asin = product_row["asin"]
        url_product = product_row["url_product"]
        url_product_asin = product_row["url_product_asin"]
        # NOTE(review): `if True:` disables the debug branch below (local html
        # + hard-coded asin "B086D9RL8Q"); presumably a manual debug toggle.
        if True:
            # try to get reponse with free proxies
            response = get_response(
                marketplace,
                url_product_asin,
                use_proxy=False,
                connection_timeout=connection_timeout,
                time_break_sec=time_break_sec,
                seconds_between_crawl=seconds_between_crawl)
            if response == None:
                # update reservation logs with blacklist of ip
                # ("blacklist" sentinel is passed in the asin position)
                update_reservation_logs(marketplace, "blacklist", "blacklist",
                                        preemptible_code, ip_address,
                                        pre_instance_name, zone, api_key,
                                        chat_id)
                # if script is called by preemptible instance it should be deleted by itself
                if pre_instance_name != "":
                    # NOTE(review): execution continues after stop_instance;
                    # response is still None when response.text is read below
                    # — looks like it relies on the instance dying first. Confirm.
                    utils.stop_instance(
                        pre_instance_name,
                        zone,
                        msg="Response is none because of time break condition",
                        api_key=api_key,
                        chat_id=chat_id)
                else:
                    assert response != None, "Could not get response within time break condition"
            if response == 404:
                # Page is gone: write a row of "404" placeholders so the ASIN
                # is marked as crawled and skipped in future runs.
                crawlingdate = [datetime.datetime.now()]
                df_product_details = pd.DataFrame(
                    data={
                        "asin": [asin],
                        "title": ["404"],
                        "brand": ["404"],
                        "url_brand": ["404"],
                        "price": ["404"],
                        "fit_types": [["404"]],
                        "color_names": [["404"]],
                        "color_count": [404],
                        "product_features": [["404"]],
                        "description": ["404"],
                        "weight": ["404"],
                        "upload_date_str": ["1995-01-01"],
                        "upload_date": ["1995-01-01"],
                        "customer_review_score": ["404"],
                        "customer_review_count": [404],
                        "mba_bsr_str": ["404"],
                        "mba_bsr": [["404"]],
                        "mba_bsr_categorie": [["404"]],
                        "timestamp": crawlingdate
                    })
                # transform date/timestamo columns to datetime objects
                df_product_details['timestamp'] = df_product_details[
                    'timestamp'].astype('datetime64')
                df_product_details['upload_date'] = df_product_details[
                    'upload_date'].astype('datetime64')
                df_product_details.to_gbq("mba_" + marketplace +
                                          ".products_details",
                                          project_id="mba-pipeline",
                                          if_exists="append")
                update_reservation_logs(marketplace, asin, "404",
                                        preemptible_code, ip_address,
                                        pre_instance_name, zone, api_key,
                                        chat_id)
                print("No Match: Got 404: %s | %s of %s" %
                      (asin, j + 1, number_products))
                continue
            # save product detail page locally
            with open("data/mba_detail_page.html", "w") as f:
                f.write(response.text)
            # transform html response to soup format
            soup = BeautifulSoup(
                utils.get_div_in_html(response.text, 'id="dp-container"'),
                'html.parser')
            # save html in storage
            utils.upload_blob(
                "5c0ae2727a254b608a4ee55a15a05fb7", "data/mba_detail_page.html",
                "logs/" + marketplace + "/product_detail/" + str(asin) +
                ".html")
        else:
            # Debug path (unreachable while the toggle above is `if True:`):
            # parse a previously saved local page with a fixed ASIN.
            with open("data/mba_detail_page.html") as f:
                html_str = f.read()
            asin = "B086D9RL8Q"
            soup = BeautifulSoup(
                utils.get_div_in_html(html_str, 'id="dp-container"'),
                'html.parser')
        try:
            df_product_details = get_product_detail_df(soup, asin,
                                                       url_product_asin,
                                                       marketplace, chat_id,
                                                       api_key)
        except:
            # Parsing failed: notify via Telegram and move on to the next ASIN.
            utils.send_msg(
                chat_id,
                "Error while trying to get information for asin: " + str(asin),
                api_key)
            continue
        try:
            df_product_details.to_gbq("mba_" + marketplace +
                                      ".products_details",
                                      project_id="mba-pipeline",
                                      if_exists="append")
        except:
            update_reservation_logs(marketplace, asin, "failure",
                                    preemptible_code, ip_address,
                                    pre_instance_name, zone, api_key, chat_id)
        # NOTE(review): "success" is logged even when the upload above failed
        # (the except branch does not `continue`) — confirm this is intended.
        update_reservation_logs(marketplace, asin, "success",
                                preemptible_code, ip_address,
                                pre_instance_name, zone, api_key, chat_id)
        print("Match: Successfully crawled product: %s | %s of %s" %
              (asin, j + 1, number_products))
    global df_successfull_proxies
    if type(df_successfull_proxies) != type(None):
        print(df_successfull_proxies.iloc[0])
    #df_successfull_proxies.to_csv("data/successfull_proxies.csv")
    # if script is called by preemptible instance it should be deleted by itself
    if pre_instance_name != "" and "pre" in pre_instance_name:
        utils.stop_instance(pre_instance_name,
                            zone,
                            msg="Success",
                            api_key=api_key,
                            chat_id=chat_id)
    # NOTE(review): dead assignment — presumably a leftover debug breakpoint anchor.
    test = 0
def stop_instance(request, instance_id):
    """Stop the compute instance identified by ``instance_id``.

    Delegates to ``utils.stop_instance``. On failure, returns an HttpResponse
    with a JSON error payload; on success it returns None (unchanged from the
    original — callers relying on that are unaffected).

    Args:
        request: The incoming (Django-style) request object, forwarded to utils.
        instance_id: Identifier of the instance to stop.
    """
    try:
        utils.stop_instance(request, instance_id)
    # Fix: Python-2-only `except Exception, e:` is a SyntaxError on Python 3.
    except Exception as e:
        # Fix: serialize str(e) — exception objects are not JSON-serializable,
        # so dumps({"message": e}) would raise TypeError inside the handler.
        return HttpResponse(
            simplejson.dumps({"status": "error", "message": str(e)},
                             ensure_ascii=False))