def compile_detections(aggregated_df, dir_name):
    metadata_path = os.path.join(metadata_dir, '%s_images.csv' % dir_name)
    metadata_df = pd.read_csv(metadata_path,
                              names=[
                                  'CameraID', 'Latitude', 'Longitude', 'Date',
                                  'Time', 'Filename', 'Dimensions'
                              ]).drop_duplicates()

    combined = metadata_df.merge(aggregated_df,
                                 left_on='Filename',
                                 right_on='image',
                                 how='left').drop(columns=['image'])
    combined['Detections'] = combined['Detections'].fillna(0)
    logging.info('Combined DF shape: %s' % str(combined.shape))

    if not os.path.isdir(aggregated_dir):
        os.makedirs(aggregated_dir)
        logging.info('Created %s' % aggregated_dir)

    aggregated_path = os.path.join(aggregated_dir,
                                   '%s_aggregated.csv.xz' % dir_name)
    combined.to_csv(aggregated_path,
                    index=False,
                    header=False,
                    compression='xz')
    logging.info('Saved aggregated data to %s' % aggregated_path)

    upload_blob(bucket, aggregated_path,
                'traffic-images-aggregated/%s_aggregated.csv.xz' % dir_name)
    os.remove(aggregated_path)
    logging.info('Deleted %s' % aggregated_path)
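
# Note: the upload_blob helper called above is not defined in these snippets.
# A minimal sketch of what it might look like, assuming `bucket` is a
# google-cloud-storage Bucket object as in the call above (some snippets
# further down pass a bucket name string instead, so their helper differs):
def upload_blob(bucket, source_file_path, destination_blob_name):
    """Upload a local file to the given Cloud Storage bucket."""
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_path)
    logging.info('Uploaded %s to %s' % (source_file_path, destination_blob_name))
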
    def check_notebook_result(self):
        # Workaround because papermill does not directly return an exit code.
        exit_code = ('1' if _PAPERMILL_ERR_MSG in
                     open('%s.ipynb' % self._test_name).read() else '0')

        os.chdir(self.TEST_DIR)
        if self._test_name == 'dsl_static_type_checking':
            subprocess.call([
                sys.executable, 'check_notebook_results.py', '--testname',
                self._test_name, '--result', self._sample_test_result,
                '--exit-code', exit_code
            ])
        else:
            subprocess.call([
                sys.executable, 'check_notebook_results.py', '--experiment',
                '%s-test' % self._test_name, '--testname', self._test_name,
                '--result', self._sample_test_result, '--namespace',
                self._namespace, '--exit-code', exit_code
            ])

        print('Copy the test results to GCS %s/' % self._results_gcs_dir)

        utils.upload_blob(
            self._bucket_name, self._sample_test_result,
            os.path.join(self._results_gcs_dir, self._sample_test_result))
Example #3
    def check_notebook_result(self):
        # Workaround because papermill does not directly return an exit code.
        exit_code = ('1' if PAPERMILL_ERR_MSG in
                     open('%s.ipynb' % self._test_name).read() else '0')

        os.chdir(TEST_DIR)

        if self._test_name == 'dsl_static_type_checking':
            nbchecker = NoteBookChecker(testname=self._test_name,
                                        result=self._sample_test_result,
                                        exit_code=exit_code)
            nbchecker.check()
        else:
            nbchecker = NoteBookChecker(testname=self._test_name,
                                        result=self._sample_test_result,
                                        exit_code=exit_code,
                                        experiment=None,
                                        namespace='kubeflow')
            nbchecker.check()

        print('Copy the test results to GCS %s/' % self._results_gcs_dir)

        utils.upload_blob(
            self._bucket_name, self._sample_test_result,
            os.path.join(self._results_gcs_dir, self._sample_test_result))
def run_detections_for_dir(dir_name):
    logging.info('Running detections for %s' % dir_name)
    images = [
        x for x in os.listdir(os.path.join(images_dir, dir_name))
        if x.endswith('.jpg')
    ]
    compiled = []

    logging.info('%s images to process' % len(images))
    for image_file in images:
        # possible to stack?
        image_path = os.path.join(images_dir, dir_name, image_file)
        image_np = load_image_into_numpy_array(image_path)
        input_tensor = tf.convert_to_tensor(np.expand_dims(image_np, 0),
                                            dtype=tf.float32)
        detections, predictions_dict, shapes = detect_fn(input_tensor)
        # / possible to stack?

        label_id_offset = 1
        num_detections = int(detections['num_detections'].numpy()[0])
        for i in range(num_detections):
            score = detections['detection_scores'][:, i].numpy()[0]
            if score > .2:
                object_class = (detections['detection_classes'][:, i].numpy()[0]
                                + label_id_offset)
                box = detections['detection_boxes'][:, i, :].numpy()[0]
                relative_size = (box[2] - box[0]) * (box[3] - box[1])

                compiled.append([
                    image_file.split('.')[0], object_class, score, box[1],
                    box[3], box[0], box[2], relative_size
                ])

    if not os.path.isdir(detections_dir):
        os.makedirs(detections_dir)
        logging.info('Created %s' % detections_dir)

    detections_path = os.path.join(detections_dir,
                                   '%s_detections.csv.xz' % dir_name)
    detections_df = pd.DataFrame(compiled,
                                 columns=[
                                     'image', 'class', 'score', 'x1', 'x2',
                                     'y1', 'y2', 'relative_size'
                                 ])
    logging.info('Detections DF shape: %s' % str(detections_df.shape))
    detections_df.to_csv(detections_path,
                         index=False,
                         header=False,
                         compression='xz')
    logging.info('Saved detections to %s' % detections_path)

    upload_blob(bucket, detections_path,
                'traffic-images-detections/%s_detections.csv.xz' % dir_name)
    os.remove(detections_path)
    logging.info('Deleted %s' % detections_path)

    return detections_df
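
# A minimal sketch of the batching idea flagged by the "possible to stack?"
# comments above: if every camera image has the same dimensions, the per-image
# arrays can be stacked and run through detect_fn once per batch instead of
# once per image. detect_fn and load_image_into_numpy_array are the helpers
# assumed above; batch_size is a hypothetical tuning knob.
def detect_in_batches(image_paths, batch_size=8):
    batched_detections = []
    for start in range(0, len(image_paths), batch_size):
        chunk = image_paths[start:start + batch_size]
        batch_np = np.stack([load_image_into_numpy_array(p) for p in chunk])
        batch_tensor = tf.convert_to_tensor(batch_np, dtype=tf.float32)
        detections, predictions_dict, shapes = detect_fn(batch_tensor)
        batched_detections.append(detections)
    return batched_detections
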
Example #5
def downloader_function(thing, thing2):
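    # thing / thing2 look like unused trigger arguments (presumably a Cloud
    # Functions event and context); they are not used below.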
    date = (datetime.datetime.utcnow() - datetime.datetime(1970, 1, 1)).days
    people = [
        "bijanmustard", "7e5h", "dadurath", "sagebeans", "spinebashed",
        "theonion", "clickhole"
    ]
    who = people[date % len(people)]
    tweet_dumper.get_all_tweets(who)
    delete_blob("twitter_bot_bucket", f"{who}-clean.txt")
    upload_blob("twitter_bot_bucket", f"/tmp/{who}-clean.txt",
                f"{who}-clean.txt")
    def check_result(self):
        os.chdir(self.TEST_DIR)
        subprocess.call([
            sys.executable, 'run_sample_test.py', '--input',
            '%s/%s.yaml' % (self._work_dir, self._test_name), '--result',
            self._sample_test_result, '--output', self._sample_test_output,
            '--testname', self._test_name, '--namespace', self._namespace
        ])
        print('Copy the test results to GCS %s/' % self._results_gcs_dir)

        utils.upload_blob(
            self._bucket_name, self._sample_test_result,
            os.path.join(self._results_gcs_dir, self._sample_test_result))
Example #7
    def check_result(self):
        os.chdir(self.TEST_DIR)
        pysample_checker = PySampleChecker(testname=self._test_name,
                                           input='%s/%s.yaml' %
                                           (self._work_dir, self._test_name),
                                           output=self._sample_test_output,
                                           result=self._sample_test_result,
                                           namespace=self._namespace)
        pysample_checker.check()

        print('Copy the test results to GCS %s/' % self._results_gcs_dir)
        utils.upload_blob(
            self._bucket_name, self._sample_test_result,
            os.path.join(self._results_gcs_dir, self._sample_test_result))
Example #8
def main():
    for api in apis:
        logging.info('Loading data for %s' % api)
        request_datetime = datetime.now(
            tz=pytz.timezone('Singapore')).strftime('%Y%m%d%H%M')
        df = generate_table(api)
        dest_dir = os.path.join(data_dir, api)
        if not os.path.isdir(dest_dir):
            os.makedirs(dest_dir)
            logging.info('Created %s' % dest_dir)
        dest_path = os.path.join(dest_dir,
                                 '%s_%s.csv.xz' % (request_datetime, api))
        df.to_csv(dest_path, index=False, header=False, compression='xz')
        logging.info('Saved data to %s' % dest_path)

        upload_blob(bucket, dest_path,
                    '%s/%s_%s.csv.xz' % (api, request_datetime, api))
        os.remove(dest_path)
        logging.info('Deleted %s' % dest_path)
Example #9
            project_id = json.load(f)['project_id']
        bucket = storage_client.bucket('tyeoh-streetcred',
                                       user_project=project_id)

        metadata_dir = os.path.join(app_dir, 'metadata')

        if not os.path.isdir(metadata_dir):
            os.makedirs(metadata_dir)
            logging.info('Created %s' % metadata_dir)

        for api in apis:
            logging.info('Loading data for %s' % api)
            request_date = datetime.now(
                tz=pytz.timezone('Singapore')).strftime('%Y%m%d')
            df = generate_table(api)
            dest_path = os.path.join(
                metadata_dir, '%s_%s_metadata.csv.xz' % (request_date, api))
            df.to_csv(dest_path, index=False, header=True, compression='xz')
            logging.info('Saved data to %s' % dest_path)

            upload_blob(
                bucket, dest_path,
                '%s_metadata/%s_%s_metadata.csv.xz' % (api, request_date, api))
            os.remove(dest_path)
            logging.info('Deleted %s' % dest_path)

    except Exception as e:
        logging.error("Exception occurred", exc_info=True)
        raise
    else:
        logging.info('Script complete')
def main(argv):
    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        'marketplace',
        help='Shortcut of the MBA marketplace, e.g. "com", "de", or "uk"',
        type=str)
    parser.add_argument(
        '--number_images',
        default=10,
        type=int,
        help=
        'Number of images that should be crawled. If 0, every image that is not already crawled will be crawled.'
    )

    # if python file path is in argv remove it
    if ".py" in argv[0]:
        argv = argv[1:len(argv)]

    # get all arguments
    args = parser.parse_args(argv)
    marketplace = args.marketplace
    number_images = args.number_images

    # get already crawled asin list
    #asin_crawled_list = get_asin_images_crawled("mba_de.products_images")

    df_images = get_images_urls_not_crawled(marketplace)

    # if number_images is equal to 0, every image should be crawled
    if number_images == 0:
        number_images = len(df_images)

    for j, image_row in df_images.iloc[0:number_images].iterrows():
        asin = image_row["asin"]
        url_image_hq = image_row["url_image_hq"]
        url_image_lowq = image_row["url_image_lowq"]

        #headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
        #proxy_list = get_proxies("de", True)
        #proxy = next(iter(proxy_list))
        #proxies={"http": proxy, "https": proxy}

        r = ProxyRequests(url_image_hq)
        r.get()
        print("Proxy used: " + str(r.get_proxy_used()))
        if 200 == r.get_status_code():
            print(r.get_status_code())
            # save image locally
            with open("data/shirts/shirt.jpg", 'wb') as f:
                f.write(r.get_raw())

            utils.upload_blob(
                "5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg",
                "mba-shirts/" + marketplace + "/" + asin + ".jpg")
            df_img = pd.DataFrame(data={
                "asin": [asin],
                "url": [
                    "https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"
                    + marketplace + "/" + asin + ".jpg"
                ],
                "url_gs": [
                    "gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/" +
                    marketplace + "/" + asin + ".jpg"
                ],
                "url_mba_lowq": [url_image_lowq],
                "url_mba_hq": [url_image_hq],
                "timestamp": [datetime.datetime.now()]
            },
                                  dtype=object)
            df_img['timestamp'] = df_img['timestamp'].astype('datetime64')
            df_img.to_gbq("mba_" + marketplace + ".products_images",
                          project_id="mba-pipeline",
                          if_exists="append")
            print("Successfully crawled image: %s | %s of %s" %
                  (asin, j + 1, number_images))
        else:
            print("Could not crawl image: %s | %s of %s" (asin, j + 1,
                                                          number_images))

        #response = requests.get(quote_plus(url_image_hq),proxies=proxies,headers=headers, stream=True)
        test = 0

    bucket_name = "5c0ae2727a254b608a4ee55a15a05fb7"
    folder_name = "mba-shirts"
    file_path = "mba-pipeline/crawler/mba/data/test.jpg"
    #upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", file_path , "mba-shirts/test.jpg")

    test = 0
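
# A hedged alternative to the ProxyRequests download used above: plain requests
# with a timeout and streamed writes. download_image is a hypothetical helper
# name, not part of the original pipeline.
import requests

def download_image(url, local_path, timeout=10.0):
    response = requests.get(url, stream=True, timeout=timeout)
    if response.status_code == 200:
        with open(local_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        return True
    return False
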
Example #11
def main(argv):
    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        'marketplace',
        help='Shortcut of the MBA marketplace, e.g. "com", "de", or "uk"',
        type=str)
    parser.add_argument('--telegram_api_key',
                        default="",
                        help='API key of mba bot',
                        type=str)
    parser.add_argument(
        '--telegram_chatid',
        default="",
        help='Id of channel like private chat or group channel',
        type=str)
    parser.add_argument(
        '--number_products',
        default=10,
        type=int,
        help=
        'Number of products/shirts that should be crawled. If 0, every product that is not already crawled will be crawled.'
    )
    parser.add_argument(
        '--connection_timeout',
        default=10.0,
        type=float,
        help=
        'Time in seconds the request operation has before it is aborted. Default: 10.0 sec'
    )
    parser.add_argument(
        '--time_break_sec',
        default=240,
        type=int,
        help=
        'Time in seconds the script tries to get a response for a given product. Default 240 sec'
    )
    parser.add_argument(
        '--seconds_between_crawl',
        default=20,
        type=int,
        help=
        'Time in seconds in which no proxy/IP should be used twice for crawling. Important to prevent being blacklisted. Default 20 sec'
    )
    parser.add_argument(
        '--preemptible_code',
        default="0",
        type=str,
        help=
        'Identifier of the instance for preemptible logs. Default 0, which generates a GUID.')
    parser.add_argument(
        '--pre_instance_name',
        default="",
        type=str,
        help=
        'Name of instance. Important: if set, the script will stop the instance after a successful operation. Default "".'
    )
    parser.add_argument(
        '--zone',
        default="",
        type=str,
        help=
        'Zone of the instance. Must match in order to stop the instance correctly after a successful run. Default: major zone of the marketplace.'
    )

    print(os.getcwd())
    print(argv)

    # if python file path is in argv remove it
    if ".py" in argv[0]:
        argv = argv[1:len(argv)]

    # get all arguments
    args = parser.parse_args(argv)
    marketplace = args.marketplace
    api_key = args.telegram_api_key
    chat_id = args.telegram_chatid
    number_products = args.number_products
    connection_timeout = args.connection_timeout
    time_break_sec = args.time_break_sec
    seconds_between_crawl = args.seconds_between_crawl
    preemptible_code = args.preemptible_code
    pre_instance_name = args.pre_instance_name
    zone = args.zone
    if zone == "":
        zone = utils.get_zone_of_marketplace(marketplace)
    ip_address = utils.get_extrenal_ip(pre_instance_name, zone)

    if preemptible_code == "0":
        preemptible_code = uuid.uuid4().hex

    # get asins which are not already crawled
    df_product_details_tocrawl = get_asin_product_detail_crawled(marketplace)
    if len(df_product_details_tocrawl) == 0:
        print("no data to crawl")
        if pre_instance_name != "" and "pre" in pre_instance_name:
            utils.stop_instance(pre_instance_name,
                                zone,
                                msg="No data to crawl",
                                api_key=api_key,
                                chat_id=chat_id)
        return 0
    #df_product_details = pd.DataFrame(data={"asin": ["B07RVNJHZL"], "url_product": ["adwwadwad"]})
    df_product_details_tocrawl[
        "url_product_asin"] = df_product_details_tocrawl.apply(
            lambda x: "https://www.amazon." + marketplace + "/dp/" + x["asin"],
            axis=1)

    # if number_products is equal to 0, every product should be crawled
    if number_products == 0:
        number_products = len(df_product_details_tocrawl)

    reservationdate = datetime.datetime.now()
    df_reservation = df_product_details_tocrawl.iloc[0:number_products][[
        "asin"
    ]].copy()
    df_reservation['status'] = "blocked"
    df_reservation['pree_id'] = preemptible_code
    df_reservation['ip_address'] = ip_address
    df_reservation['error_log'] = ""
    df_reservation['timestamp'] = reservationdate
    df_reservation['timestamp'] = df_reservation['timestamp'].astype(
        'datetime64')
    try:
        df_reservation.to_gbq(
            "preemptible_logs.mba_detail_" + marketplace +
            "_preemptible_%s_%s_%s" %
            (reservationdate.year, reservationdate.month, reservationdate.day),
            project_id="mba-pipeline",
            if_exists="append")
    except:
        utils.stop_instance(pre_instance_name,
                            zone,
                            msg="Can not update big query reservation",
                            api_key=api_key,
                            chat_id=chat_id)

    for j, product_row in df_product_details_tocrawl.iloc[
            0:number_products].iterrows():
        asin = product_row["asin"]
        url_product = product_row["url_product"]
        url_product_asin = product_row["url_product_asin"]

        if True:
            # try to get a response with free proxies
            response = get_response(
                marketplace,
                url_product_asin,
                use_proxy=False,
                connection_timeout=connection_timeout,
                time_break_sec=time_break_sec,
                seconds_between_crawl=seconds_between_crawl)

            if response is None:
                # update reservation logs with blacklist of ip
                update_reservation_logs(marketplace, "blacklist", "blacklist",
                                        preemptible_code, ip_address,
                                        pre_instance_name, zone, api_key,
                                        chat_id)
                # if script is called by preemptible instance it should be deleted by itself
                if pre_instance_name != "":
                    utils.stop_instance(
                        pre_instance_name,
                        zone,
                        msg="Response is none because of time break condition",
                        api_key=api_key,
                        chat_id=chat_id)
                else:
                    assert response is not None, "Could not get response within time break condition"

            if response == 404:
                crawlingdate = [datetime.datetime.now()]
                df_product_details = pd.DataFrame(
                    data={
                        "asin": [asin],
                        "title": ["404"],
                        "brand": ["404"],
                        "url_brand": ["404"],
                        "price": ["404"],
                        "fit_types": [["404"]],
                        "color_names": [["404"]],
                        "color_count": [404],
                        "product_features": [["404"]],
                        "description": ["404"],
                        "weight": ["404"],
                        "upload_date_str": ["1995-01-01"],
                        "upload_date": ["1995-01-01"],
                        "customer_review_score": ["404"],
                        "customer_review_count": [404],
                        "mba_bsr_str": ["404"],
                        "mba_bsr": [["404"]],
                        "mba_bsr_categorie": [["404"]],
                        "timestamp": crawlingdate
                    })
                # transform date/timestamp columns to datetime objects
                df_product_details['timestamp'] = df_product_details[
                    'timestamp'].astype('datetime64')
                df_product_details['upload_date'] = df_product_details[
                    'upload_date'].astype('datetime64')
                df_product_details.to_gbq("mba_" + marketplace +
                                          ".products_details",
                                          project_id="mba-pipeline",
                                          if_exists="append")
                update_reservation_logs(marketplace, asin, "404",
                                        preemptible_code, ip_address,
                                        pre_instance_name, zone, api_key,
                                        chat_id)
                print("No Match: Got 404: %s | %s of %s" %
                      (asin, j + 1, number_products))
                continue

            # save product detail page locally
            with open("data/mba_detail_page.html", "w") as f:
                f.write(response.text)

            # transform html response to soup format
            soup = BeautifulSoup(
                utils.get_div_in_html(response.text, 'id="dp-container"'),
                'html.parser')

            # save html in storage
            utils.upload_blob(
                "5c0ae2727a254b608a4ee55a15a05fb7",
                "data/mba_detail_page.html", "logs/" + marketplace +
                "/product_detail/" + str(asin) + ".html")
        else:
            with open("data/mba_detail_page.html") as f:
                html_str = f.read()
                asin = "B086D9RL8Q"
                soup = BeautifulSoup(
                    utils.get_div_in_html(html_str, 'id="dp-container"'),
                    'html.parser')
        try:
            df_product_details = get_product_detail_df(soup, asin,
                                                       url_product_asin,
                                                       marketplace, chat_id,
                                                       api_key)
        except:
            utils.send_msg(
                chat_id,
                "Error while trying to get information for asin: " + str(asin),
                api_key)
            continue
        try:
            df_product_details.to_gbq("mba_" + marketplace +
                                      ".products_details",
                                      project_id="mba-pipeline",
                                      if_exists="append")
        except:
            update_reservation_logs(marketplace, asin, "failure",
                                    preemptible_code, ip_address,
                                    pre_instance_name, zone, api_key, chat_id)
        update_reservation_logs(marketplace, asin, "success", preemptible_code,
                                ip_address, pre_instance_name, zone, api_key,
                                chat_id)
        print("Match: Successfully crawled product: %s | %s of %s" %
              (asin, j + 1, number_products))

    global df_successfull_proxies
    if df_successfull_proxies is not None:
        print(df_successfull_proxies.iloc[0])
    #df_successfull_proxies.to_csv("data/successfull_proxies.csv")

    # if script is called by preemptible instance it should be deleted by itself
    if pre_instance_name != "" and "pre" in pre_instance_name:
        utils.stop_instance(pre_instance_name,
                            zone,
                            msg="Success",
                            api_key=api_key,
                            chat_id=chat_id)

    test = 0
Example #12
all_upvotes = memedroid.union(twitter).union(reddit).union(imgur)

X = X.join(all_upvotes, on=['id'], how="left_outer")

X = X.na.drop()
X.cache()

X_df = X.toPandas()

X_df.loc[X_df['source'] == 'reddit', 'percentile'] = X_df[
    X_df['source'] == 'reddit']['upvotes'].apply(get_percentile_reddit)
X_df.loc[X_df['source'] == 'twitter', 'percentile'] = X_df[
    X_df['source'] == 'twitter']['upvotes'].apply(get_percentile_twitter)
X_df.loc[X_df['source'] == 'imgur', 'percentile'] = X_df[
    X_df['source'] == 'imgur']['upvotes'].apply(get_percentile_imgur)
X_df.loc[X_df['source'] == 'memedroid', 'percentile'] = X_df[
    X_df['source'] == 'memedroid']['upvotes'].apply(get_percentile_memedroid)

X_df = X_df.loc[:, [
    'id', 'url', 'image_path', 'source', 'timestamp', 'upvotes', 'percentile',
    'cluster'
]]

X_df.columns = [
    'meme_id', 'url', 'image_path', 'source', 'meme_datetime', 'upvotes',
    'upvotes_centile', 'cluster'
]

X_df.to_json('/home/data_to_upload_on_bucket/one_hour_data.json')
upload_blob('/home/data_to_upload_on_bucket/one_hour_data.json',
            'one_hour_data.json')
Example #13
def main(argv):
    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        'marketplace',
        help='Shortcut of the MBA marketplace, e.g. "com", "de", or "uk"',
        type=str)
    parser.add_argument(
        '--number_chunks',
        default=1,
        type=int,
        help=
        'Number of chunks that should be crawled. If 0, all chunks of not-yet-crawled images will be crawled.'
    )
    parser.add_argument('--chunk_size',
                        default=10,
                        type=int,
                        help='Chunk of images to batch upload to bigquery.')

    # if python file path is in argv remove it
    if ".py" in argv[0]:
        argv = argv[1:len(argv)]

    # get all arguments
    args = parser.parse_args(argv)
    marketplace = args.marketplace
    number_chunks = args.number_chunks
    chunk_size = args.chunk_size

    # get already crawled asin list
    #asin_crawled_list = get_asin_images_crawled("mba_de.products_images")

    df_images = get_images_urls_not_crawled(marketplace)

    pool = multiprocessing.Pool(4)

    def crawl_img(image_row):
        asin = image_row["asin"]
        url_image_hq = image_row["url_image_hq"]
        print(asin)
        r = ProxyRequests(url_image_hq)
        r.get()
        print("Proxy used: " + str(r.get_proxy_used()))
        if 200 == r.get_status_code():
            print(r.get_status_code())
            # save image locally
            with open("data/shirts/shirt.jpg", 'wb') as f:
                f.write(r.get_raw())

            #df_img = pd.DataFrame(data={"asin":[asin],"url":["https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"+marketplace+"/"+asin+".jpg"],"url_gs":["gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"+marketplace+"/"+asin+".jpg"],"url_mba_lowq":[url_image_lowq],"url_mba_hq":[url_image_hq], "timestamp":[datetime.datetime.now()]}, dtype=np.object)
            #df_imgs = df_imgs.append(df_img)
            #utils.upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", "data/shirts/shirt.jpg", "mba-shirts/"+marketplace+"/" + asin + ".jpg")

            print("Successfully crawled image: %s" % (asin))
        else:
            print("Could not crawl image: %s" % (asin))

    df_images_chunks = [
        df_images[i:i + chunk_size]
        for i in range(0, df_images.shape[0], chunk_size)
    ]

    # if number_chunks is equal to 0, every chunk should be crawled
    if number_chunks == 0:
        number_chunks = len(df_images_chunks)

    for j, df_images in enumerate(df_images_chunks[0:number_chunks]):
        df_imgs = pd.DataFrame(data={
            "asin": [],
            "url": [],
            "url_gs": [],
            "url_mba_lowq": [],
            "url_mba_hq": [],
            "timestamp": []
        },
                               dtype=object)
        #df_dask = ddf.from_pandas(df_images, npartitions=chunk_size)   # where the number of partitions is the number of cores you want to use
        #df_dask.apply(lambda x: crawl_img(x), meta=('str'), axis=1).compute(scheduler='multiprocessing')
        for i, image_row in df_images.iterrows():
            asin = image_row["asin"]
            url_image_hq = image_row["url_image_hq"]
            url_image_lowq = image_row["url_image_lowq"]

            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
            }
            #proxy_list = get_proxies("de", True)
            #proxy = next(iter(proxy_list))
            while True:
                try:
                    proxies = proxy_handler.get_random_proxy_url_dict(
                        path_proxy_json='mba_crawler/proxy/proxies.json',
                        only_working=True)
                    r = requests.get(url_image_hq,
                                     proxies=proxies,
                                     headers=headers,
                                     stream=True)
                    break
                except Exception as e:
                    print(str(e), proxies)
                    continue
            #print("Proxy used: " + str(r.meta))

            #r = ProxyRequests(url_image_hq)
            #r.get()
            #print("Proxy used: " + str(r.get_proxy_used()))
            if 200 == r.status_code:
                # save image locally
                with open("data/shirts/shirt.jpg", 'wb') as f:
                    r.raw.decode_content = True
                    shutil.copyfileobj(r.raw, f)

                df_img = pd.DataFrame(data={
                    "asin": [asin],
                    "url": [
                        "https://storage.cloud.google.com/5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/"
                        + marketplace + "/" + asin + ".jpg"
                    ],
                    "url_gs": [
                        "gs://5c0ae2727a254b608a4ee55a15a05fb7/mba-shirts/" +
                        marketplace + "/" + asin + ".jpg"
                    ],
                    "url_mba_lowq": [url_image_lowq],
                    "url_mba_hq": [url_image_hq],
                    "timestamp": [datetime.datetime.now()]
                },
                                      dtype=object)
                df_imgs = pd.concat([df_imgs, df_img], ignore_index=True)
                utils.upload_blob("5c0ae2727a254b608a4ee55a15a05fb7",
                                  "data/shirts/shirt.jpg",
                                  "mba-shirts/" + marketplace + "/" + asin +
                                  ".jpg",
                                  verbose=False)
            else:
                print("Could not crawl image: %s" % (asin))

            #response = requests.get(quote_plus(url_image_hq),proxies=proxies,headers=headers, stream=True)
            test = 0
        print("%s of %s chunks" % (j + 1, number_chunks))
        df_imgs['timestamp'] = df_imgs['timestamp'].astype('datetime64')
        df_imgs.to_gbq("mba_" + marketplace + ".products_images",
                       project_id="mba-pipeline",
                       if_exists="append")
        test = 0

    bucket_name = "5c0ae2727a254b608a4ee55a15a05fb7"
    folder_name = "mba-shirts"
    file_path = "mba-pipeline/crawler/mba/data/test.jpg"
    #upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", file_path , "mba-shirts/test.jpg")

    test = 0
Example #14
credential_path = "C:\\Users\\moacy\\OneDrive\\Documentos\\Python Scripts\\cpdoc-text-align-master\\AudioCPDOC-d89584a0ad75.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

#%%  INITIAL DATA - THESE VALUES MUST BE CHANGED FOR EACH AUDIO FILE

audio_path = 'Entrevistas\\'
# audio_file = "pho_1880_luiz_moraes_2011-05-20_01-4.wav"
audio_file = "A-Anjos-PsicologiaDeUmVencido.wav"
audio_name = audio_file.split('.')[0]  # take only the first part (file name without extension)

bucket_name = 'alinhamento-audio'
source_file_name = audio_path + audio_file
destination_blob_name = audio_name
gcs_uri = 'gs://' + bucket_name + '/' + destination_blob_name  # must match the blob name uploaded below
utils.upload_blob(bucket_name, source_file_name, destination_blob_name)

config = speech.RecognitionConfig(
    #encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    #sample_rate_hertz=16000,
    audio_channel_count=1,
    language_code='pt-BR',
    #model='video',  # These two options (model and use_enhanced) are more expensive...
    #use_enhanced=True,  # worth testing whether they pay off in some specific case.
    #language_code='en-US',
    enable_word_time_offsets=True)

# subtitle settings
max_linhas_por_bloco = 3
max_caracteres_linha = 50
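
# A minimal sketch of the transcription step this config presumably feeds into,
# assuming the standard google-cloud-speech v1 client; gcs_uri and config come
# from the snippet above, everything else is an assumption.
client = speech.SpeechClient()
audio = speech.RecognitionAudio(uri=gcs_uri)
operation = client.long_running_recognize(config=config, audio=audio)
response = operation.result(timeout=3600)
# enable_word_time_offsets=True exposes per-word start/end times, which is what
# the subtitle limits above (max_linhas_por_bloco, max_caracteres_linha) would
# be applied to when building caption blocks.
for result in response.results:
    for word_info in result.alternatives[0].words:
        print(word_info.word, word_info.start_time.total_seconds(),
              word_info.end_time.total_seconds())
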
Example #15
def main(argv):
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('keyword', help='Keyword that you would like to query in MBA', type=str)
    parser.add_argument('api_key', help='API key of proxycrawl', type=str)
    parser.add_argument('marketplace', help='Shortcut of the MBA marketplace, e.g. "com", "de", or "uk"', type=str)
    parser.add_argument('pod_product', help='Name of the Print on Demand product, e.g. "shirt", "premium", "longsleeve", "sweatshirt", "hoodie", "popsocket", "kdp"', type=str)
    parser.add_argument('sort', help='What kind of sorting do you want? E.g. "best_seller", "price_up", "price_down", "cust_rating", "oldest", "newest"', type=str)
    parser.add_argument('--pages', default=0, type=int, help='Count of pages that should be crawled on Amazon. The ASIN break condition is ignored if not 0')
    parser.add_argument('--start_page', default=1, type=int, help='Starting page number. Default is 1 (first page)')

    print(os.getcwd())
    print(argv)
    # if python file path is in argv remove it 
    if ".py" in argv[0]:
        argv = argv[1:len(argv)]

    # get all arguments
    args = parser.parse_args(argv)
    keyword = args.keyword
    api_key = args.api_key
    marketplace = args.marketplace
    pod_product = args.pod_product
    sort = args.sort
    pages = args.pages
    start_page = args.start_page

    language = "de"
    
    #df = get_df_hobbies("de")
    df = pd.read_csv("data/hobbies_de.csv")
    hobbies_list = df["hobby"].tolist()
    test_hobby = hobbies_list[4]

    # get already crawled asin list
    asin_crawled_list = get_asin_crawled("mba_de.products")

    url_mba = url_creator.main([keyword, marketplace, pod_product, sort])

    # if start_page is other than 1, the crawler should start from a different page
    if start_page != 1:
        url_mba = url_mba + "&page="+str(start_page)+"&ref=sr_pg_"+str(start_page)

    #headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
    #proxy_list = get_proxies("de", True)
    #proxy = next(iter(proxy_list))
    #proxies={"http": proxy, "https": proxy}

    # If pages is 0 (the default), the loop should break at the first product that was already crawled
    # At most 30 pages are crawled to limit the number of API requests
    if pages == 0:
        count_pages = 30
    else:
        count_pages = pages

    no_response = False
    for current_page in np.arange(start_page, start_page+count_pages, 1):
        #print(current_page)
        #'''
        timeout = time.time() + 60
        response = requests.get(make_url_to_proxy_crawl_url(api_key,url_mba), stream=True)
        while response.status_code != 200:
            response = requests.get(make_url_to_proxy_crawl_url(api_key, url_mba), stream=True)
            if time.time() > timeout:
                no_response = True
                break
        if no_response:
            print("Error: No response found. Status code: " + str(response.status_code))
            print("Current page: " + str(current_page))
            break
        else:
            print("Crawling mba data was successfull")
        # transform html response to soup format
        soup = BeautifulSoup(get_shirt_div(response.text, "s-main-slot s-result-list s-search-results sg-row"), 'html.parser')

        with open("data/mba_page.html", "w") as f:
            f.write(response.text)
        # save html page in storage
        utils.upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", "data/mba_page.html" , "logs/"+marketplace+"/product_overview/"+str(datetime.date.today())+"_"+keyword+"_"+sort+"_"+str(current_page)+".html")
        
        # use this code block to read html without appling proxy crawl
        #with open("data/newest_2.html") as f:
        #    html_str = f.read()
        #    soup = BeautifulSoup(get_shirt_div(html_str, "s-main-slot s-result-list s-search-results sg-row"), 'html.parser') 

        shirts = soup.find_all("div", class_="sg-col-inner")

        # get dataframe with product information
        df_products, asin_already_crawled = get_shirt_product_df(shirts, asin_crawled_list, pages, url_mba)

        # save data in big query
        df_products.to_gbq("mba_" + marketplace + ".products",project_id="mba-pipeline", if_exists="append")

        # get link to next page 
        url_mba = "/".join(url_mba.split("/")[0:3]) + soup.find("ul", class_="a-pagination").find(class_="a-last").find("a")["href"]
        
        print("Page " + str(current_page) + " successfully crawled")
        # BREAK CONDITION only if pages parameter is not set
        if pages == 0 and asin_already_crawled:
            break
        #'''
    bucket_name = "5c0ae2727a254b608a4ee55a15a05fb7"
    folder_name = "mba-shirts"
    file_path = "mba-pipeline/crawler/mba/data/test.jpg"
    #upload_blob("5c0ae2727a254b608a4ee55a15a05fb7", file_path , "mba-shirts/test.jpg")

    
    test = 0