Example #1
    def index_new_files(self, processed_files):
        s3 = S3(self.Config.AWS.Key, self.Config.AWS.Secret,
                self.Config.AWS.Bucket)

        s3_loader = S3Helper(s3)

        # todo
        # for each processed file
        #   Is the index file already in memory
        #   No - pull file from server
        #        deserialize into dataindex and datafile
        #        add to local collection
        #   Is file and chart already in the index
        #   No - add file to index
        # for each index file
        #    serialize to json
        #    save to s3

        month_indexes = dict()

        for processed_file in processed_files:
            key = self.build_index_key(processed_file.Sitename,
                                       processed_file.StationCallsign,
                                       processed_file.Datetime)

            if key not in month_indexes:
                month_indexes[key] = self.get_index_file(
                    s3, processed_file.Sitename,
                    processed_file.StationCallsign, key)

            month_index = month_indexes[key]

        return
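
The method above stops at collecting the month indexes; its TODO block spells out the remaining steps. A minimal sketch of the completed flow, where DataIndex.contains/add/to_json and s3_loader.upload_string are hypothetical names assumed for illustration (the original code does not define them):

    def index_new_files(self, processed_files):
        s3 = S3(self.Config.AWS.Key, self.Config.AWS.Secret,
                self.Config.AWS.Bucket)
        s3_loader = S3Helper(s3)

        month_indexes = dict()

        for processed_file in processed_files:
            key = self.build_index_key(processed_file.Sitename,
                                       processed_file.StationCallsign,
                                       processed_file.Datetime)

            # Pull and deserialize the index file once per month key.
            if key not in month_indexes:
                month_indexes[key] = self.get_index_file(
                    s3, processed_file.Sitename,
                    processed_file.StationCallsign, key)

            month_index = month_indexes[key]

            # Hypothetical DataIndex API: add the file/chart pair only if it
            # is not already indexed.
            if not month_index.contains(processed_file):
                month_index.add(processed_file)

        # Hypothetical: serialize each touched index to JSON and push it back
        # to S3 under its index key.
        for key, month_index in month_indexes.items():
            s3_loader.upload_string(key, month_index.to_json())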
Example #2
def process_missing_face(local_path, person, img):
    bucket = settings.S3_MISSING_BUCKET
    col_id = settings.REKOG_FACEMATCH_COLLECTION
    col = Collection(collection_id=col_id)
    s3 = S3()
    uploaded = s3.upload_public_file(bucket_name=bucket, file_name=local_path)
    face = MissingFace.objects.filter(photo=img, person=person).first()

    if not face:
        try:
            ret = col.addFaceToCollection(bucket=bucket, photo_s3_path=img)
            if not ret:
                logger.error("Unable to add face %s of %s" % (img, person.code))
                face = MissingFace(person=person, id=uuid.uuid4(), is_face=False, photo=img)
                face.save()
                return
            face_id = ret["indexed"]["face_id"]
            face, created = MissingFace.objects.get_or_create(person=person, id=face_id)
            face.bounding_box = json.dumps(ret["indexed"]["bounding_box"])
            face.photo = img
            face.is_person = True
            face.save()
            logger.info("Saved missing face %s" % face_id)
        except ClientError as e:
            logger.error("Unable to index missing face %s. Error: '%s'" % (img, str(e)))
    else:
        logger.info("Already processed image %s, face %s" % (img, face.id))
Example #3
def test_get_s3file(event, context):

    s3_bucket = "buyma-app-dev"
    s3 = S3(s3_bucket)

    file_name = "Lyst.com&margiela.collected.csv"
    s3.download_item(key="tests/%s" % (file_name, ),
                     path="/tmp/%s" % (file_name, ))

    df = pd.read_csv("/tmp/%s" % (file_name, ))
    print(df.head())

    import random
    import time

    for index, row in df.iterrows():

        # Crude rate limiting between Lambda invocations.
        # TODO: handle status code 429 properly; see the backoff sketch after
        # this example.
        r = random.randint(1, 5)
        time.sleep(r)

        payload = {"href": row["href"]}
        payload = json.dumps(payload)
        print(payload)
        response = boto3.client('lambda').invoke(
            FunctionName='buyma-app-dev-testGetHrefRequests',
            InvocationType='Event',  # Event or RequestResponse
            Payload=payload)

    return True
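
The fixed random sleep above only approximates the TODO about status code 429. A minimal backoff sketch, assuming the 429 responses come from the site being scraped and are surfaced through requests; get_with_backoff is a hypothetical helper, not part of the original code:

import random
import time

import requests


def get_with_backoff(url, max_retries=5, base_delay=1.0):
    """Fetch a URL, backing off exponentially (with jitter) on HTTP 429."""
    for attempt in range(max_retries):
        response = requests.get(url)
        if response.status_code != 429:
            return response
        # Too many requests: wait base_delay * 2**attempt seconds plus jitter.
        time.sleep(base_delay * (2 ** attempt) + random.uniform(0, 1))
    # Out of retries: raise for the last 429 response.
    response.raise_for_status()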
Example #4
def get_item_list(event, context):
    # {"name": "IMPORT SELECT musee", "id": "841549", "url": "https://www.buyma.com/buyer/841549/sales_2150.html"}

    data = _get_event_data(event)
    s3 = S3(bucket_name=constants.S3_BUCKET)

    try:
        response = requests.get(data["url"])
        soup = BeautifulSoup(response.text, 'html.parser')
        hrefs = [c["href"] for c in soup.select('p.buyeritem_name a')]
        titles = [
            c.get_text().strip() for c in soup.select('p.buyeritem_name a')
        ]
        order_amounts = [
            c.get_text().strip().replace("注文数:", "")
            for c in soup.select('p.buyeritem_name ~p:contains("注文数")')
        ]
        order_dates = [
            c.get_text().strip().replace("注文日:", "")
            for c in soup.select('p.buyeritem_name ~p:contains("注文日")')
        ]

        # items = []

        for href, title, order_amount, order_date in zip(
                hrefs, titles, order_amounts, order_dates):
            # items.append([href, title, order_amount, order_date])
            ptn = r"^/item/(\d+)/$"
            id = re.sub(ptn, r"\1", href)
            csv = ",".join([id, href, title, order_amount, order_date]) + "\n"
            response = s3.upload_item(key="items/%s" % (id, ), item=csv)
            # job_log delete
            response = s3.delete_item(key="jobs/shop/%s/%s" % (
                data["name"],
                data["url"],
            ))

            # Fetch the item details via a separate Lambda invocation
            payload = {
                "id": id,
                "href": href,
                "title": title,
                "order_amount": order_amount,
                "order_date": order_date
            }
            payload = json.dumps(payload)
            response = boto3.client('lambda').invoke(
                FunctionName='buyma-app-dev-getItemDetail',
                InvocationType='Event',  # Event or RequestResponse
                Payload=payload)
    except Exception as e:
        # job_log delete
        print("Exception Occured!!!")
        response = s3.upload_item(key="jobs/shop/%s/%s" % (
            data["name"],
            data["url"],
        ),
                                  item=e.args[0])
Example #5
def merge_s3_files():
    s3 = S3(bucket_name=constants.S3_BUCKET)
    # keys = s3.list_objects(key="trains/")
    keys = s3.list_objects(key="items/")

    print(len(keys))

    # with open("/tmp/trains_data.csv", mode="w") as f:
    with open("/tmp/items_data.csv", mode="w") as f:
        for key in keys:
            response = s3.get_item(key=key)
            f.write(response["Body"].read().decode("utf-8"))
Example #6
def save_shoppers(event, context):

    for shopper in constants.shopper_list:
        shop = Shop(shopper, ("Sales", ))
        pkl_name = "%s.pkl" % (shop.name, )
        pd.to_pickle(shop, "/tmp/" + pkl_name)
        pkl_obj = open("/tmp/" + pkl_name, "rb")

        s3 = S3(bucket_name=constants.S3_BUCKET)
        response = s3.upload_item(key="shoppers/%s" % (pkl_name, ),
                                  item=pkl_obj)

        print(response)

    return True
Example #7
def read_io(path):
    """Reads the contents of a local or S3 path into a StringIO.
    """
    note = StringIO()
    if path.startswith("s3://"):
        s3 = S3(env='prod')
        for line in s3.read(path):
            note.write(line)
            note.write("\n")
    else:
        with open(path) as local:
            for line in local.readlines():
                note.write(line)

    note.seek(0)

    return note
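
A quick usage sketch for read_io; both paths below are hypothetical, and the S3 branch relies on whatever credentials S3(env='prod') resolves internally:

# Hypothetical local path: returns a StringIO rewound to the start.
note = read_io("notes/example.txt")
first_line = note.readline()

# Hypothetical S3 path: lines read from the object are newline-terminated.
note = read_io("s3://example-bucket/notes/example.txt")
contents = note.read()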
Example #8
def get_shopper(event, context):

    data = _get_event_data(event)
    shop_name = data["name"]

    pkl_name = "%s.pkl" % (shop_name, )

    s3 = S3(bucket_name=constants.S3_BUCKET)
    response = s3.download_item(key="shoppers/%s" % (pkl_name, ),
                                path="/tmp/%s" % (pkl_name, ))

    print(response)

    pkl_obj = pd.read_pickle("/tmp/%s" % (pkl_name, ))

    print(pkl_obj)

    return True
Example #9
    def process(self):
        s3 = S3(self.Config.AWS.Key, self.Config.AWS.Secret,
                self.Config.AWS.Bucket)

        s3_loader = S3Helper(s3)

        utc_now = datetime.utcnow()
        utc_string = utc_now.date().isoformat()
        site_name = self.Config.SiteName

        processed_files = []

        # Loop through the stations
        for station in self.Config.Stations:
            # Get the files for the specified station
            files = self.list_files(self.Config.DataPath, site_name,
                                    station.CallSign)

            for file in files:
                # Get the filename
                basename = os.path.basename(file)

                # pull the date from the filename
                current_file_datepart = basename[11:21]

                # don't process current file
                if utc_string != current_file_datepart:
                    # Generate chart and load datafile
                    currentDate = datetime.strptime(current_file_datepart,
                                                    "%Y-%m-%d")
                    remote_chart = self.generate_load_chart(
                        s3_loader, station, file)

                    # upload and archive the datafile
                    remote_datafile = self.load_data_file(
                        s3_loader, station, file)

                    # add the file to list of processed files for later indexing
                    processed_files.append(
                        ProcessedFile(site_name, station.CallSign, currentDate,
                                      remote_chart, remote_datafile))

        # Sort processed files by date and return
        return sorted(processed_files, key=lambda x: getattr(x, "Datetime"))
Example #10
def save_year_data(year='2014', load_pickle=False):

    if load_pickle:
        with open('data.pickle', 'rb') as handle:
            year_data = pickle.load(handle)

    else:
        s3 = S3()
        print('downloading')
        df = s3.get_df('rac', columns='web_columns', year=year)
        chi_data = df[df['h_tract'].isin(tracts)]
        print('start group by')
        year_data = chi_data.groupby(['h_tract']).sum().to_json()
        year_data = json.loads(year_data)
        print(year_data)
        with open('data.pickle', 'wb') as handle:
            pickle.dump(year_data, handle)
    keys = year_data.keys()

    # write each statistic's data to its own JSON file under data_temp/
    for stat in keys:
        print(stat)
        file_data = {}

        ## create file if it doesn't exist
        if not os.path.exists('data_temp/%s.json' % stat):
            with open('data_temp/%s.json' % stat, 'w+') as f:
                f.write(json.dumps({}))

        with open('data_temp/%s.json' % stat, 'r+') as f:
            file_data = json.loads(f.read())
            file_data[year] = year_data[stat]
            # need to write at beginning of file
            f.seek(0)
            f.write(json.dumps(file_data))
            f.truncate()
Example #11

class Download():
    def __init__(self):
        self.name = 'Ben'

    def download_data(self):
        return True

    def transform_json(self):
        return True


db = DynamoConn()
d = Download()
s3 = S3()
S3FS = s3fs.S3FileSystem()

datasets = {
    # 'building_permits': {'key': '9pkb-4fbf', 'date': '_issue_date'},
    # 'business_liscenses': {'key': 'r5kz-chrr', 'date': 'license_start_date'}  ## 900,000 rows,
    'business_grants': {
        'key': 'jp7n-tgmf',
        'date': 'completion_date'
    }
}
address_lookup = pickle.load(open("address_lookup.p", "rb"))


def handler(event, context):
    client = Socrata("data.cityofchicago.org", None)
Example #12
def save_data(run_all=False):
    S3FS = s3fs.S3FileSystem()

    s3 = S3()
    db = DynamoConn()

    dates = {}
    datasets = db.get_datasets()
    for dataset in datasets:
        # print dataset
        if datasets[dataset]['source'] == 'Plenario':

            today = datetime.datetime.today().date()
            date_list = set([today.strftime('%Y-%m')])
            date_list.add(
                (today - datetime.timedelta(days=32)).strftime('%Y-%m'))
            date_list = sorted(
                list(
                    set([(today - datetime.timedelta(days=x)).strftime('%Y-%m')
                         for x in range(32)])))
            paths = []

            if run_all:
                paths = ['bnroths/chicago-data/%s' % dataset]
                cnts = {}

            else:
                for month in date_list:
                    year, month = month.split('-')
                    paths.append('bnroths/chicago-data/%s/year=%s/month=%s' %
                                 (dataset, year, month))
                print(paths)
                cnts = datasets[dataset]['cnts']
                # exit(0)

            print(paths)
            for path in paths:
                ds = pq.ParquetDataset(path_or_paths=path,
                                       filesystem=S3FS,
                                       validate_schema=False)

                columns = datasets[dataset]['columns']
                dt = columns[1]
                table = ds.read()
                df = table.to_pandas()
                print(df.columns)
                print(df.head())
                df['dt'] = df[dt].astype(str).str[:7]

                dts = []
                groups = dict(list(df.groupby('dt')))
                print(groups.keys())
                # exit(0)
                for group in groups:
                    print(group)
                    year, month = group.split('-')

                    a = groups[group][['longitude',
                                       'latitude']].to_json(orient='values')
                    cnts[group] = groups[group].count()[0]
                    dts.append(group)

                    filename = '../data/%s/%s-%s/all.json' % (dataset, year,
                                                              month)

                    if not os.path.exists(os.path.dirname(filename)):
                        try:
                            os.makedirs(os.path.dirname(filename))
                        except OSError as exc:  # Guard against race condition
                            if exc.errno != errno.EEXIST:
                                raise

                    with open(filename, 'w') as f:
                        f.write(a)

                    ## write to s3
                    s3.save_file_public(local='../data/%s/%s-%s/all.json' %
                                        (dataset, year, month),
                                        dataset=dataset,
                                        dt="%s-%s" % (year, month),
                                        filename='all.json')
                    db.update_col(dataset=dataset,
                                  col='cnts',
                                  update=json.dumps(cnts))
Example #13
    def set_s3(self):
        '''Create an S3 client bound to this object's bucket and key.'''
        self.s3 = S3(self.bucket, self.key)
Example #14
    def delete(self, *args, **kwargs):
        super().delete(*args, **kwargs)
        S3().delete_file(UnidentifiedStorage.bucket_name, file_name=self.photo)
Example #15
    def delete(self, *args, **kwargs):
        super().delete(*args, **kwargs)
        S3().delete_file(MissingStorage.bucket_name, file_name=self.photo)
Example #16
def get_item_detail(event, context):
    #  {"id": "55301234", "href": "/item/55301234/", "title": "DIESEL スイムウェア 水着 海パン SV9U KAXH","order_amount": "1個","order_date": "2020/07/03"}
    # Resolve the event payload and the S3 client before the try block so the
    # exception handler below can safely reference them.
    data = _get_event_data(event)
    s3 = S3(bucket_name=constants.S3_BUCKET)

    try:

        items = [
            data["id"], data["href"], data["title"], data["order_amount"],
            data["order_date"]
        ]

        TOP_URL = "https://www.buyma.com"
        url = TOP_URL + data["href"]

        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        if soup.select_one(".notfoundSection_txt") is None:
            brands = [
                c.get_text().strip()
                for c in soup.select('dt.pt1:contains("ブランド") ~ dd a')
                if c.get_text().strip() != "商品一覧"
            ]
            brand_1 = brands[0] if len(brands) > 0 else ""
            brand_2 = brands[1] if len(brands) > 1 else ""
            brand_3 = brands[2] if len(brands) > 2 else ""

            categories = [
                c.get_text().strip()
                for c in soup.select('dt.pt1:contains("カテゴリ") ~ dd a')
            ]
            category_1 = categories[0] if len(categories) > 0 else ""
            category_2 = categories[1] if len(categories) > 1 else ""
            category_3 = categories[2] if len(categories) > 2 else ""

            price_origin = soup.select_one(
                'p.price_dd strike').get_text().strip() if soup.select_one(
                    'p.price_dd strike') is not None else ""

            price = soup.select_one('span.price_txt').get_text().strip(
            ) if soup.select_one('span.price_txt') is not None else ""

            colors = [
                c.get_text().strip()
                for c in soup.select('span.item_color_name')
            ]

            sizes = [
                c.get_text().strip()
                for c in soup.select('table.cse-set__table tr>td:first-child')
            ]

            item_details = [
                brand_1, brand_2, brand_3, category_1, category_2, category_3,
                price_origin, price, "@".join(colors), "@".join(sizes)
            ]

            items += item_details

            csv = ",".join(items) + "\n"
            response = s3.upload_item(key="trains/%s" % (data["id"], ),
                                      item=csv)
            # job_log delete
            response = s3.delete_item(key="jobs/item/%s" % (data["id"]), )
    except Exception as e:
        print("Exception Occured!!!")
        # job_log delete
        response = s3.upload_item(key="jobs/item/%s" % (data["id"], ),
                                  item=e.args[0])