Example #1
def update_reservation_logs(marketplace, asin, status, preemptible_code,
                            ip_address, pre_instance_name, zone, api_key,
                            chat_id):
    global df_successfull_proxies
    error_str = ""
    if df_successfull_proxies is not None:
        print(df_successfull_proxies.iloc[0])
        error_str = "Error count: " + str(
            df_successfull_proxies.iloc[0]["errorCount"]
        ) + " errors: " + ",".join(df_successfull_proxies.iloc[0]["errors"])

    reservationdate = datetime.datetime.now()
    df_reservation = pd.DataFrame({
        "asin": [str(asin)],
        "timestamp": [reservationdate],
        "status": [str(status)],
        "pree_id": [str(preemptible_code)],
        "ip_address": [ip_address],
        "error_log": [error_str]
    })
    df_reservation['timestamp'] = df_reservation['timestamp'].astype(
        'datetime64[ns]')
    # TODO: fix this error
    try:
        df_reservation.to_gbq(
            "preemptible_logs.mba_detail_" + marketplace +
            "_preemptible_%s_%s_%s" %
            (reservationdate.year, reservationdate.month, reservationdate.day),
            project_id="mba-pipeline",
            if_exists="append")
    except Exception:
        utils.stop_instance(pre_instance_name,
                            zone,
                            msg="Can not update big query reservation",
                            api_key=api_key,
                            chat_id=chat_id)
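A minimal call-site sketch for the logger above, assuming the surrounding module provides utils, the pandas/datetime imports, and the df_successfull_proxies global; every argument value here is a hypothetical placeholder:

    # hypothetical call: record a successful crawl for one ASIN
    update_reservation_logs(
        marketplace="de",
        asin="B07RVNJHZL",
        status="success",
        preemptible_code=uuid.uuid4().hex,    # requires `import uuid`
        ip_address="203.0.113.7",             # documentation-range IP
        pre_instance_name="pre-mba-crawler",  # hypothetical instance name
        zone="europe-west1-b",                # hypothetical GCE zone
        api_key="",
        chat_id="")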

Example #2
p_test_mean = p_test_all.mean(axis=2)

p_test_odds = np.ones(100000) * 1 / 9
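# naive-Bayes-style odds combination (editorial note, not a comment from the
# original script): the 1/9 above is the prior odds for a ~10% positive rate;
# the loop multiplies in one likelihood ratio per variable whose out-of-fold
# AUC is at least 0.5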
for j in range(var_len):
    if roc_auc_score(y_train, oof[:, j]) >= 0.500:
        p_test_odds *= (9 * p_test_mean[:, j] / (1 - p_test_mean[:, j]))

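# convert the combined odds back to a probability: p = odds / (1 + odds)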
p_test_odds = p_test_odds / (1 + p_test_odds)

sub1 = pd.read_csv("../input/sample_submission.csv.zip")
sub2 = pd.DataFrame({"ID_code": test.ID_code.values, "target": p_test_odds})
sub = pd.merge(sub1[["ID_code"]], sub2, how="left").fillna(0)

# save
sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

print(sub.target.describe())

os.system(f'gsutil cp {SUBMIT_FILE_PATH} gs://malware_onodera/')
os.system(f'cp LOG/log_{__file__}.txt LOG/log_{__file__}_{SEED}.txt')
os.system(f'gsutil cp LOG/log_{__file__}_{SEED}.txt gs://malware_onodera/')
"""
gsutil cp gs://malware_onodera/*.gz ../output/
gsutil cp gs://malware_onodera/*.txt LOG/
"""

#==============================================================================
utils.end(__file__)
utils.stop_instance()
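For intuition, a self-contained toy version of the odds-combination step used above (array sizes and values are illustrative only, not from the original script):

    import numpy as np

    # per-variable P(target=1) for two hypothetical rows and two variables
    p = np.array([[0.6, 0.7],
                  [0.5, 0.2]])
    odds = np.full(2, 1 / 9)                 # prior odds for a ~10% positive rate
    for j in range(p.shape[1]):
        odds *= 9 * p[:, j] / (1 - p[:, j])  # likelihood ratio vs. the prior
    prob = odds / (1 + odds)                 # back to probabilities
    print(prob)                              # approx. [0.969 0.692]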

Example #3
def main(argv):
    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        'marketplace',
        help='Shortcut of the MBA marketplace, e.g. "com", "de", or "uk"',
        type=str)
    parser.add_argument('--telegram_api_key',
                        default="",
                        help='API key of mba bot',
                        type=str)
    parser.add_argument(
        '--telegram_chatid',
        default="",
        help='Id of channel like private chat or group channel',
        type=str)
    parser.add_argument(
        '--number_products',
        default=10,
        type=int,
        help=
        'Number of products/shirts that should be crawled. If 0, every image that is not already crawled will be crawled.'
    )
    parser.add_argument(
        '--connection_timeout',
        default=10.0,
        type=float,
        help=
        'Time the request operation has before it is aborted. Default: 10.0 sec'
    )
    parser.add_argument(
        '--time_break_sec',
        default=240,
        type=int,
        help=
        'Time in seconds the script tries to get a response for a given product. Default: 240 sec'
    )
    parser.add_argument(
        '--seconds_between_crawl',
        default=20,
        type=int,
        help=
        'Time in seconds during which no proxy/IP should be used twice for crawling. Important to prevent being blacklisted. Default: 20 sec'
    )
    parser.add_argument(
        '--preemptible_code',
        default="0",
        type=str,
        help=
        'Identifier of the instance for preemptible logs. Default "0", which leads to a random GUID.')
    parser.add_argument(
        '--pre_instance_name',
        default="",
        type=str,
        help=
        'Name of the instance. Important: if set, the script will stop the instance after a successful operation. Default "".'
    )
    parser.add_argument(
        '--zone',
        default="",
        type=str,
        help=
        'Zone of the instance. Must match in order to stop the instance correctly after a successful run. Defaults to the major zone of the marketplace.'
    )

    print(os.getcwd())
    print(argv)

    # if the python file path is in argv, remove it
    if ".py" in argv[0]:
        argv = argv[1:]

    # get all arguments
    args = parser.parse_args(argv)
    marketplace = args.marketplace
    api_key = args.telegram_api_key
    chat_id = args.telegram_chatid
    number_products = args.number_products
    connection_timeout = args.connection_timeout
    time_break_sec = args.time_break_sec
    seconds_between_crawl = args.seconds_between_crawl
    preemptible_code = args.preemptible_code
    pre_instance_name = args.pre_instance_name
    zone = args.zone
    if zone == "":
        zone = utils.get_zone_of_marketplace(marketplace)
    ip_address = utils.get_extrenal_ip(pre_instance_name, zone)

    if preemptible_code == "0":
        preemptible_code = uuid.uuid4().hex

    # get asins which are not already crawled
    df_product_details_tocrawl = get_asin_product_detail_crawled(marketplace)
    if len(df_product_details_tocrawl) == 0:
        print("no data to crawl")
        if pre_instance_name != "" and "pre" in pre_instance_name:
            utils.stop_instance(pre_instance_name,
                                zone,
                                msg="No data to crawl",
                                api_key=api_key,
                                chat_id=chat_id)
        return 0
    #df_product_details = pd.DataFrame(data={"asin": ["B07RVNJHZL"], "url_product": ["adwwadwad"]})
    df_product_details_tocrawl[
        "url_product_asin"] = df_product_details_tocrawl.apply(
            lambda x: "https://www.amazon." + marketplace + "/dp/" + x["asin"],
            axis=1)

    # if number_products is equal to 0, every product should be crawled
    if number_products == 0:
        number_products = len(df_product_details_tocrawl)

    reservationdate = datetime.datetime.now()
    df_reservation = df_product_details_tocrawl.iloc[0:number_products][[
        "asin"
    ]].copy()
    df_reservation['status'] = "blocked"
    df_reservation['pree_id'] = preemptible_code
    df_reservation['ip_address'] = ip_address
    df_reservation['error_log'] = ""
    df_reservation['timestamp'] = reservationdate
    df_reservation['timestamp'] = df_reservation['timestamp'].astype(
        'datetime64[ns]')
    try:
        df_reservation.to_gbq(
            "preemptible_logs.mba_detail_" + marketplace +
            "_preemptible_%s_%s_%s" %
            (reservationdate.year, reservationdate.month, reservationdate.day),
            project_id="mba-pipeline",
            if_exists="append")
    except Exception:
        utils.stop_instance(pre_instance_name,
                            zone,
                            msg="Can not update big query reservation",
                            api_key=api_key,
                            chat_id=chat_id)

    for j, product_row in df_product_details_tocrawl.iloc[
            0:number_products].iterrows():
        asin = product_row["asin"]
        url_product = product_row["url_product"]
        url_product_asin = product_row["url_product_asin"]

        if True:  # debug toggle: the else branch parses a locally saved page instead
            # try to get a response (use_proxy=False, i.e. without proxies)
            response = get_response(
                marketplace,
                url_product_asin,
                use_proxy=False,
                connection_timeout=connection_timeout,
                time_break_sec=time_break_sec,
                seconds_between_crawl=seconds_between_crawl)

            if response is None:
                # update reservation logs with blacklist of ip
                update_reservation_logs(marketplace, "blacklist", "blacklist",
                                        preemptible_code, ip_address,
                                        pre_instance_name, zone, api_key,
                                        chat_id)
                # if script is called by preemptible instance it should be deleted by itself
                if pre_instance_name != "":
                    utils.stop_instance(
                        pre_instance_name,
                        zone,
                        msg="Response is none because of time break condition",
                        api_key=api_key,
                        chat_id=chat_id)
                else:
                    assert response is not None, "Could not get response within time break condition"

            if response == 404:
                crawlingdate = [datetime.datetime.now()]
                df_product_details = pd.DataFrame(
                    data={
                        "asin": [asin],
                        "title": ["404"],
                        "brand": ["404"],
                        "url_brand": ["404"],
                        "price": ["404"],
                        "fit_types": [["404"]],
                        "color_names": [["404"]],
                        "color_count": [404],
                        "product_features": [["404"]],
                        "description": ["404"],
                        "weight": ["404"],
                        "upload_date_str": ["1995-01-01"],
                        "upload_date": ["1995-01-01"],
                        "customer_review_score": ["404"],
                        "customer_review_count": [404],
                        "mba_bsr_str": ["404"],
                        "mba_bsr": [["404"]],
                        "mba_bsr_categorie": [["404"]],
                        "timestamp": crawlingdate
                    })
                # transform date/timestamp columns to datetime objects
                df_product_details['timestamp'] = df_product_details[
                    'timestamp'].astype('datetime64[ns]')
                df_product_details['upload_date'] = df_product_details[
                    'upload_date'].astype('datetime64[ns]')
                df_product_details.to_gbq("mba_" + marketplace +
                                          ".products_details",
                                          project_id="mba-pipeline",
                                          if_exists="append")
                update_reservation_logs(marketplace, asin, "404",
                                        preemptible_code, ip_address,
                                        pre_instance_name, zone, api_key,
                                        chat_id)
                print("No Match: Got 404: %s | %s of %s" %
                      (asin, j + 1, number_products))
                continue

            # save product detail page locally
            with open("data/mba_detail_page.html", "w") as f:
                f.write(response.text)

            # transform html response to soup format
            soup = BeautifulSoup(
                utils.get_div_in_html(response.text, 'id="dp-container"'),
                'html.parser')

            # save html in storage
            utils.upload_blob(
                "5c0ae2727a254b608a4ee55a15a05fb7",
                "data/mba_detail_page.html", "logs/" + marketplace +
                "/product_detail/" + str(asin) + ".html")
        else:
            with open("data/mba_detail_page.html") as f:
                html_str = f.read()
                asin = "B086D9RL8Q"
                soup = BeautifulSoup(
                    utils.get_div_in_html(html_str, 'id="dp-container"'),
                    'html.parser')
        try:
            df_product_details = get_product_detail_df(soup, asin,
                                                       url_product_asin,
                                                       marketplace, chat_id,
                                                       api_key)
        except Exception:
            utils.send_msg(
                chat_id,
                "Error while trying to get information for asin: " + str(asin),
                api_key)
            continue
        try:
            df_product_details.to_gbq("mba_" + marketplace +
                                      ".products_details",
                                      project_id="mba-pipeline",
                                      if_exists="append")
        except Exception:
            update_reservation_logs(marketplace, asin, "failure",
                                    preemptible_code, ip_address,
                                    pre_instance_name, zone, api_key, chat_id)
        update_reservation_logs(marketplace, asin, "success", preemptible_code,
                                ip_address, pre_instance_name, zone, api_key,
                                chat_id)
        print("Match: Successfully crawled product: %s | %s of %s" %
              (asin, j + 1, number_products))

    global df_successfull_proxies
    if df_successfull_proxies is not None:
        print(df_successfull_proxies.iloc[0])
    #df_successfull_proxies.to_csv("data/successfull_proxies.csv")

    # if script is called by preemptible instance it should be deleted by itself
    if pre_instance_name != "" and "pre" in pre_instance_name:
        utils.stop_instance(pre_instance_name,
                            zone,
                            msg="Success",
                            api_key=api_key,
                            chat_id=chat_id)

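The original module presumably invokes main with sys.argv; a minimal entry-point sketch (the guard and the script name in the comment are assumptions, mirroring the argv[0] handling above):

    import sys

    if __name__ == "__main__":
        # e.g. python crawl_mba_details.py de --number_products 10
        main(sys.argv)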

Example #4
def stop_instance(request, instance_id):
    try:
        utils.stop_instance(request, instance_id)
    except Exception as e:
        return HttpResponse(
            simplejson.dumps({"status": "error", "message": str(e)},
                             ensure_ascii=False))
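A hypothetical urls.py entry wiring this view up (the route, module layout, and use of Django 2+ path() are assumptions, not part of the original snippet):

    # urls.py (hypothetical)
    from django.urls import path
    from . import views

    urlpatterns = [
        path("instances/<instance_id>/stop/", views.stop_instance),
    ]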