def index_new_files(self, processed_files):
    s3 = S3(self.Config.AWS.Key, self.Config.AWS.Secret, self.Config.AWS.Bucket)
    s3_loader = S3Helper(s3)

    # todo
    # for each processed file
    #     Is the index file already in memory
    #         No - pull file from server
    #             deserialize into dataindex and datafile
    #             add to local collection
    #     Is file and chart already in the index
    #         No - add file to index
    # for each index file
    #     serialize to json
    #     save to s3

    month_indexes = dict()
    for processed_file in processed_files:
        key = self.build_index_key(processed_file.Sitename,
                                   processed_file.StationCallsign,
                                   processed_file.Datetime)
        if key not in month_indexes:
            month_indexes[key] = self.get_index_file(
                s3, processed_file.Sitename, processed_file.StationCallsign, key)
        month_index = month_indexes[key]
    return

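# A minimal sketch of the remaining TODO steps above (add each file to its month
# index, then serialize the indexes back to S3). The DataIndex methods
# contains()/add_file()/to_dict(), the ProcessedFile attributes RemoteChart and
# RemoteDatafile, and S3Helper.upload_json() are assumptions for illustration,
# not confirmed by the surrounding code.
def _finish_indexing(self, s3_loader, month_indexes, processed_files):
    # Add the chart and datafile of each processed file to its month index,
    # unless the index already knows about the datafile.
    for processed_file in processed_files:
        key = self.build_index_key(processed_file.Sitename,
                                   processed_file.StationCallsign,
                                   processed_file.Datetime)
        month_index = month_indexes[key]
        if not month_index.contains(processed_file.RemoteDatafile):  # assumed helper
            month_index.add_file(processed_file.RemoteChart,
                                 processed_file.RemoteDatafile)

    # Serialize every touched index to JSON and push it back to S3.
    for key, month_index in month_indexes.items():
        s3_loader.upload_json(key, json.dumps(month_index.to_dict()))  # assumed helper
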
def process_missing_face(local_path, person, img):
    bucket = settings.S3_MISSING_BUCKET
    col_id = settings.REKOG_FACEMATCH_COLLECTION
    col = Collection(collection_id=col_id)
    s3 = S3()
    uploaded = s3.upload_public_file(bucket_name=bucket, file_name=local_path)
    face = MissingFace.objects.filter(photo=img, person=person).first()
    if not face:
        try:
            ret = col.addFaceToCollection(bucket=bucket, photo_s3_path=img)
            if not ret:
                logger.error("Unable to add face %s of %s" % (img, person.code))
                face = MissingFace(person=person, id=uuid.uuid4(), is_face=False, photo=img)
                face.save()
                return
            face_id = ret["indexed"]["face_id"]
            face, created = MissingFace.objects.get_or_create(person=person, id=face_id)
            face.bounding_box = json.dumps(ret["indexed"]["bounding_box"])
            face.photo = img
            face.is_person = True
            face.save()
            logger.info("Saved missing face %s" % face_id)
        except ClientError as e:
            logger.error("Unable to index missing face %s. Error: '%s'" % (img, str(e)))
    else:
        logger.info("Already processed image %s, face %s" % (img, face.id))

def test_get_s3file(event, context):
    import random
    import time

    s3_bucket = "buyma-app-dev"
    s3 = S3(s3_bucket)
    file_name = "Lyst.com&margiela.collected.csv"
    s3.download_item(key="tests/%s" % (file_name, ),
                     path="/tmp/%s" % (file_name, ))
    df = pd.read_csv("/tmp/%s" % (file_name, ))
    print(df.head())
    for index, row in df.iterrows():
        # TODO:: for status code 429
        r = random.randint(1, 5)
        time.sleep(r)
        payload = {"href": row["href"]}
        payload = json.dumps(payload)
        print(payload)
        response = boto3.client('lambda').invoke(
            FunctionName='buyma-app-dev-testGetHrefRequests',
            InvocationType='Event',  # Event or RequestResponse
            Payload=payload)
    return True

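# A possible way to address the "status code 429" TODO above: retry the Lambda
# invoke with exponential backoff plus jitter instead of a fixed random sleep.
# This is only a sketch; the function name, retry limits, and the decision to
# retry solely on HTTP 429 are illustrative assumptions.
import json
import random
import time

import boto3
from botocore.exceptions import ClientError


def invoke_with_backoff(function_name, payload, max_attempts=5):
    client = boto3.client('lambda')
    for attempt in range(max_attempts):
        try:
            return client.invoke(FunctionName=function_name,
                                 InvocationType='Event',
                                 Payload=json.dumps(payload))
        except ClientError as e:
            status = e.response.get('ResponseMetadata', {}).get('HTTPStatusCode')
            if status != 429:
                raise  # only retry throttling errors
            # exponential backoff with jitter: 1s, 2s, 4s, ... plus up to 1s
            time.sleep(2 ** attempt + random.random())
    raise RuntimeError("Giving up after %d throttled attempts" % max_attempts)
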
def get_item_list(event, context):
    # {"name": "IMPORT SELECT musee", "id": "841549", "url": "https://www.buyma.com/buyer/841549/sales_2150.html"}
    data = _get_event_data(event)
    s3 = S3(bucket_name=constants.S3_BUCKET)
    try:
        response = requests.get(data["url"])
        soup = BeautifulSoup(response.text, 'html.parser')
        hrefs = [c["href"] for c in soup.select('p.buyeritem_name a')]
        titles = [
            c.get_text().strip() for c in soup.select('p.buyeritem_name a')
        ]
        order_amounts = [
            c.get_text().strip().replace("注文数:", "")
            for c in soup.select('p.buyeritem_name ~p:contains("注文数")')
        ]
        order_dates = [
            c.get_text().strip().replace("注文日:", "")
            for c in soup.select('p.buyeritem_name ~p:contains("注文日")')
        ]

        for href, title, order_amount, order_date in zip(
                hrefs, titles, order_amounts, order_dates):
            ptn = r"^/item/(\d+)/$"
            id = re.sub(ptn, r"\1", href)
            csv = ",".join([id, href, title, order_amount, order_date]) + "\n"
            response = s3.upload_item(key="items/%s" % (id, ), item=csv)

            # job_log delete
            response = s3.delete_item(key="jobs/shop/%s/%s" % (
                data["name"],
                data["url"],
            ))

            # fetch item details
            payload = {
                "id": id,
                "href": href,
                "title": title,
                "order_amount": order_amount,
                "order_date": order_date
            }
            payload = json.dumps(payload)
            response = boto3.client('lambda').invoke(
                FunctionName='buyma-app-dev-getItemDetail',
                InvocationType='Event',  # Event or RequestResponse
                Payload=payload)
    except Exception as e:
        # job_log delete
        print("Exception Occurred!!!")
        response = s3.upload_item(key="jobs/shop/%s/%s" % (
            data["name"],
            data["url"],
        ), item=e.args[0])

def merge_s3_files():
    s3 = S3(bucket_name=constants.S3_BUCKET)
    # keys = s3.list_objects(key="trains/")
    keys = s3.list_objects(key="items/")
    print(len(keys))
    # with open("/tmp/trains_data.csv", mode="w") as f:
    with open("/tmp/items_data.csv", mode="w") as f:
        for key in keys:
            response = s3.get_item(key=key)
            f.write(response["Body"].read().decode("utf-8"))

def save_shoppers(event, context):
    for shopper in constants.shopper_list:
        shop = Shop(shopper, ("Sales", ))
        pkl_name = "%s.pkl" % (shop.name, )
        pd.to_pickle(shop, "/tmp/" + pkl_name)
        s3 = S3(bucket_name=constants.S3_BUCKET)
        with open("/tmp/" + pkl_name, "rb") as pkl_obj:
            response = s3.upload_item(key="shoppers/%s" % (pkl_name, ), item=pkl_obj)
        print(response)
    return True

def read_io(path):
    """Reads the contents of a local or S3 path into a StringIO."""
    note = StringIO()
    if path.startswith("s3://"):
        s3 = S3(env='prod')
        for line in s3.read(path):
            note.write(line)
            note.write("\n")
    else:
        with open(path) as local:
            for line in local.readlines():
                note.write(line)
    note.seek(0)
    return note

def get_shopper(event, context):
    data = _get_event_data(event)
    shop_name = data["name"]
    pkl_name = "%s.pkl" % (shop_name, )
    s3 = S3(bucket_name=constants.S3_BUCKET)
    response = s3.download_item(key="shoppers/%s" % (pkl_name, ),
                                path="/tmp/%s" % (pkl_name, ))
    print(response)
    pkl_obj = pd.read_pickle("/tmp/%s" % (pkl_name, ))
    print(pkl_obj)
    return True

def process(self):
    s3 = S3(self.Config.AWS.Key, self.Config.AWS.Secret, self.Config.AWS.Bucket)
    s3_loader = S3Helper(s3)
    utc_now = datetime.utcnow()
    utc_string = utc_now.date().isoformat()
    site_name = self.Config.SiteName
    processed_files = []

    # Loop through the stations
    for station in self.Config.Stations:
        # Get the files for the specified station
        files = self.list_files(self.Config.DataPath, site_name, station.CallSign)
        for file in files:
            # Get the filename
            basename = os.path.basename(file)
            # pull the date from the filename
            current_file_datepart = basename[11:21]
            # don't process current file
            if utc_string != current_file_datepart:
                # Generate chart and load datafile
                currentDate = datetime.strptime(current_file_datepart, "%Y-%m-%d")
                remote_chart = self.generate_load_chart(s3_loader, station, file)
                # upload and archive the datafile
                remote_datafile = self.load_data_file(s3_loader, station, file)
                # add the file to list of processed files for later indexing
                processed_files.append(
                    ProcessedFile(site_name, station.CallSign, currentDate,
                                  remote_chart, remote_datafile))

    # Sort processed files by date and return
    return sorted(processed_files, key=lambda x: x.Datetime)

def save_year_data(year='2014', load_pickle=False):
    if load_pickle:
        with open('data.pickle', 'rb') as handle:
            year_data = pickle.load(handle)
    else:
        s3 = S3()
        print('downloading')
        df = s3.get_df('rac', columns='web_columns', year=year)
        chi_data = df[df['h_tract'].isin(tracts)]
        print('start group by')
        year_data = chi_data.groupby(['h_tract']).sum().to_json()
        year_data = json.loads(year_data)
        print(year_data)
        with open('data.pickle', 'wb') as handle:
            pickle.dump(year_data, handle)

    keys = year_data.keys()

    # writes one JSON file per stat to data_temp/
    for stat in keys:
        print(stat)
        file_data = {}
        ## create file if it doesn't exist
        if not os.path.exists('data_temp/%s.json' % stat):
            with open('data_temp/%s.json' % stat, 'w+') as f:
                f.write(json.dumps({}))
        with open('data_temp/%s.json' % stat, 'r+') as f:
            file_data = json.loads(f.read())
            file_data[year] = year_data[stat]
            # need to write at beginning of file
            f.seek(0)
            f.write(json.dumps(file_data))
            f.truncate()

class Download():
    def __init__(self):
        self.name = 'Ben'

    def download_data(self):
        return True

    def transform_json(self):
        return True


db = DynamoConn()
d = Download()
s3 = S3()
S3FS = s3fs.S3FileSystem()

datasets = {
    # 'building_permits': {'key': '9pkb-4fbf', 'date': '_issue_date'},
    # 'business_liscenses': {'key': 'r5kz-chrr', 'date': 'license_start_date'}  ## 900,000 rows
    'business_grants': {
        'key': 'jp7n-tgmf',
        'date': 'completion_date'
    }
}

address_lookup = pickle.load(open("address_lookup.p", "rb"))


def handler(event, context):
    client = Socrata("data.cityofchicago.org", None)

def save_data(run_all=False):
    S3FS = s3fs.S3FileSystem()
    s3 = S3()
    db = DynamoConn()
    dates = {}
    datasets = db.get_datasets()
    for dataset in datasets:
        # print(dataset)
        if datasets[dataset]['source'] == 'Plenario':
            today = datetime.datetime.today().date()
            date_list = set([today.strftime('%Y-%m')])
            date_list.add((today - datetime.timedelta(days=32)).strftime('%Y-%m'))
            date_list = sorted(
                list(
                    set([(today - datetime.timedelta(days=x)).strftime('%Y-%m')
                         for x in range(32)])))
            paths = []
            if run_all:
                paths = ['bnroths/chicago-data/%s' % dataset]
                cnts = {}
            else:
                for month in date_list:
                    year, month = month.split('-')
                    paths.append('bnroths/chicago-data/%s/year=%s/month=%s' %
                                 (dataset, year, month))
                print(paths)
                cnts = datasets[dataset]['cnts']
                # exit(0)

            print(paths)
            for path in paths:
                ds = pq.ParquetDataset(path_or_paths=path,
                                       filesystem=S3FS,
                                       validate_schema=False)
                columns = datasets[dataset]['columns']
                dt = columns[1]
                table = ds.read()
                df = table.to_pandas()
                print(df.columns)
                print(df.head())
                df['dt'] = df[dt].astype(str).str[:7]
                dts = []
                groups = dict(list(df.groupby('dt')))
                print(groups.keys())
                # exit(0)
                for group in groups:
                    print(group)
                    year, month = group.split('-')
                    a = groups[group][['longitude', 'latitude']].to_json(orient='values')
                    cnts[group] = groups[group].count()[0]
                    dts.append(group)
                    filename = '../data/%s/%s-%s/all.json' % (dataset, year, month)
                    if not os.path.exists(os.path.dirname(filename)):
                        try:
                            os.makedirs(os.path.dirname(filename))
                        except OSError as exc:
                            # Guard against race condition
                            if exc.errno != errno.EEXIST:
                                raise
                    with open(filename, 'w') as f:
                        f.write(a)

                    ## write to s3
                    s3.save_file_public(local='../data/%s/%s-%s/all.json' %
                                        (dataset, year, month),
                                        dataset=dataset,
                                        dt="%s-%s" % (year, month),
                                        filename='all.json')
            db.update_col(dataset=dataset, col='cnts', update=json.dumps(cnts))

def set_s3(self):
    '''Initialize the S3 client for this instance's bucket and key.'''
    self.s3 = S3(self.bucket, self.key)

def delete(self, *args, **kwargs):
    super().delete(*args, **kwargs)
    S3().delete_file(UnidentifiedStorage.bucket_name, file_name=self.photo)

def delete(self, *args, **kwargs):
    super().delete(*args, **kwargs)
    S3().delete_file(MissingStorage.bucket_name, file_name=self.photo)

def get_item_detail(event, context):
    # {"id": "55301234", "href": "/item/55301234/", "title": "DIESEL スイムウェア 水着 海パン SV9U KAXH", "order_amount": "1個", "order_date": "2020/07/03"}
    try:
        data = _get_event_data(event)
        s3 = S3(bucket_name=constants.S3_BUCKET)
        items = [
            data["id"], data["href"], data["title"], data["order_amount"],
            data["order_date"]
        ]
        TOP_URL = "https://www.buyma.com"
        url = TOP_URL + data["href"]
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        if soup.select_one(".notfoundSection_txt") is None:
            brands = [
                c.get_text().strip()
                for c in soup.select('dt.pt1:contains("ブランド") ~ dd a')
                if c.get_text().strip() != "商品一覧"
            ]
            brand_1 = brands[0] if len(brands) > 0 else ""
            brand_2 = brands[1] if len(brands) > 1 else ""
            brand_3 = brands[2] if len(brands) > 2 else ""
            categories = [
                c.get_text().strip()
                for c in soup.select('dt.pt1:contains("カテゴリ") ~ dd a')
            ]
            category_1 = categories[0] if len(categories) > 0 else ""
            category_2 = categories[1] if len(categories) > 1 else ""
            category_3 = categories[2] if len(categories) > 2 else ""
            price_origin = soup.select_one(
                'p.price_dd strike').get_text().strip() if soup.select_one(
                    'p.price_dd strike') is not None else ""
            price = soup.select_one('span.price_txt').get_text().strip(
            ) if soup.select_one('span.price_txt') is not None else ""
            colors = [
                c.get_text().strip()
                for c in soup.select('span.item_color_name')
            ]
            sizes = [
                c.get_text().strip()
                for c in soup.select('table.cse-set__table tr>td:first-child')
            ]
            item_details = [
                brand_1, brand_2, brand_3, category_1, category_2, category_3,
                price_origin, price, "@".join(colors), "@".join(sizes)
            ]
            items += item_details
            csv = ",".join(items) + "\n"
            response = s3.upload_item(key="trains/%s" % (data["id"], ), item=csv)

        # job_log delete
        response = s3.delete_item(key="jobs/item/%s" % (data["id"], ))
    except Exception as e:
        print("Exception Occurred!!!")
        # job_log delete
        response = s3.upload_item(key="jobs/item/%s" % (data["id"], ), item=e.args[0])