def get_page_text(self):
    """Fetch the stored page text for this object from S3.

    Returns None when ``self.s3_page_text`` is falsy. Otherwise the
    ``page_text/<id>`` key is downloaded from the configured bucket as a
    string and returned decoded from UTF-8.
    """
    if not self.s3_page_text:
        return None

    bucket_name = config.s3_bucket()
    s3_key = "%s/%s" % ('page_text', self.id)
    raw_text = s3client.download_from_s3_as_string(bucket_name, s3_key)
    return raw_text.decode('utf-8')
def generate_info_files(s3_info_filename, info_filename, pos_dir, neg_dir):
    """Generate info files for training and testing.

    Reads ``s3_info_filename`` line by line. Each line is split on single
    spaces; fields 1, 2 and 3 are parsed as the integer image label, video id
    and timestamp (field 0 is not used). For every line whose video exists
    and whose timestamp is among ``video.s3_timestamps()``, the thumbnail is
    downloaded from S3 unless the target file already exists, and a line
    ``"<outfile> <label> <video_id> <timestamp>"`` is written to
    ``info_filename``. Lines that fail either check are skipped silently.

    :param s3_info_filename: path of the input listing to read.
    :param info_filename: path of the info file to (over)write.
    :param pos_dir: directory for images whose label is > 0; created if
        missing.
    :param neg_dir: directory for all other images; created if missing.
    :raises ValueError: if a label, video id or timestamp field is not an
        integer.
    :raises IndexError: if a line has fewer than four space-separated fields.
    """
    for directory in (pos_dir, neg_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    bucket = affine_config.s3_bucket()

    # Context managers guarantee both files are closed even when parsing,
    # the video lookup or the S3 download raises part-way through.
    with open(s3_info_filename, 'r') as info_in, \
            open(info_filename, 'w') as info_out:
        for raw_line in info_in:
            fields = raw_line.split(' ')
            image_label = int(fields[1])
            video_id = int(fields[2])
            time_stamp = int(fields[3])

            video = Video.get(video_id)
            if not video or time_stamp not in video.s3_timestamps():
                continue

            filename = '%012d_%012d.jpg' % (video_id, time_stamp)
            target_dir = pos_dir if image_label > 0 else neg_dir
            outfile = os.path.join(target_dir, filename)

            if not os.path.exists(outfile):
                # A parenthesised single-argument print behaves the same as
                # a Python 2 statement and as a Python 3 function call.
                print("downloading data from s3")
                # using the affine bucket for negative images
                img_path = 'thumbnail/%d/%d' % (video_id, time_stamp)
                s3client.download_from_s3(bucket, img_path, outfile)

            line_item = '%s %i %i %i' % (
                outfile, image_label, video_id, time_stamp)
            info_out.write(line_item + '\n')
def upload_screenshot_full(self, path):
    """Upload the full screenshot at ``path`` and its thumbnail to S3.

    The result of ``resize_image(path)`` is uploaded first, under
    ``screenshot_full/<id>_thumb``. ``convert_png_to_jpeg`` is then called
    with ``path`` as both positional arguments and ``quality=60``, and the
    file at ``path`` is uploaded under ``screenshot_full/<id>``. Both
    uploads pass ``public=True``.
    """
    s3_bucket = config.s3_bucket()
    full_key = "%s/%s" % ('screenshot_full', self.id)
    thumb_key = full_key + '_thumb'

    # The thumbnail is produced and uploaded before the conversion step.
    thumbnail_file = resize_image(path)
    s3client.upload_to_s3(s3_bucket, thumb_key, thumbnail_file, public=True)

    convert_png_to_jpeg(path, path, quality=60)
    s3client.upload_to_s3(s3_bucket, full_key, path, public=True)
def get_page_text_dict(page_ids, silent=False):
    """
    Retrieves page_text for given page_ids.

    :param page_ids: Iterable of page_ids to retrieve page text for.
        Duplicates are collapsed.
    :param silent: If set to true, errors for non-existent page_ids are not
        forwarded and those page_ids map to empty text. Defaults to False,
        in which case the error is re-raised.
    :return: dictionary with mapping page_id -> page_text.
    :raises AttributeError: when fetching a page_id's key fails with an
        AttributeError and ``silent`` is false.
    """
    page_ids = set(page_ids)
    if not page_ids:
        # Nothing to fetch, so do not open an S3 connection at all.
        return {}

    output = dict.fromkeys(page_ids, "")
    bucket = config.s3_bucket()
    s3_conn = s3client.connect(bucket)
    for page_id in page_ids:
        urlpath = "%s/%s" % ('page_text', page_id)
        try:
            # NOTE(review): presumably get_key() returns None for a missing
            # key, which is what makes this attribute access raise
            # AttributeError -- confirm against the S3 client in use.
            text = s3_conn.get_key(urlpath).get_contents_as_string()
        except AttributeError:
            if not silent:
                # Bare raise keeps the original traceback intact.
                raise
            continue
        output[page_id] = text.decode('utf-8')
    return output
def upload_favicon(self, path):
    """Upload the file at ``path`` to S3 under ``favicon/<id>``.

    The upload targets the configured bucket and passes ``public=True``.
    """
    s3_bucket = config.s3_bucket()
    favicon_key = "%s/%s" % ('favicon', self.id)
    s3client.upload_to_s3(s3_bucket, favicon_key, path, public=True)
def upload_page_text(self, text):
    """Upload ``text`` to S3 under ``page_text/<id>``.

    ``text`` is encoded as UTF-8 before being handed to the string upload
    helper; the upload passes ``public=True``.
    """
    s3_bucket = config.s3_bucket()
    s3_key = "%s/%s" % ('page_text', self.id)
    encoded_text = text.encode('utf-8')
    s3client.upload_to_s3_from_string(
        s3_bucket, s3_key, encoded_text, public=True)
def upload_page_source(self, path):
    """Upload the file at ``path`` to S3 under ``page_source/<id>``.

    The upload targets the configured bucket and passes ``public=True``.
    """
    s3_bucket = config.s3_bucket()
    source_key = "%s/%s" % ('page_source', self.id)
    s3client.upload_to_s3(s3_bucket, source_key, path, public=True)
def upload_screenshot(self, path):
    """Upload the screenshot at ``path`` and a resized copy to S3.

    The file at ``path`` is uploaded under ``screenshot/<id>`` first. The
    result of ``resize_image(path)`` is then uploaded under
    ``screenshot/<id>_thumb``. Both uploads pass ``public=True``.
    """
    s3_bucket = config.s3_bucket()
    screenshot_key = "%s/%s" % ('screenshot', self.id)
    s3client.upload_to_s3(s3_bucket, screenshot_key, path, public=True)

    thumbnail_file = resize_image(path)
    thumb_key = screenshot_key + '_thumb'
    s3client.upload_to_s3(s3_bucket, thumb_key, thumbnail_file, public=True)
def s3_favicon_url(self):
    """Return the S3 URL of this object's favicon.

    Returns None when ``self.s3_favicon`` is falsy.
    """
    if not self.s3_favicon:
        return None
    url_template = "http://%s.s3.amazonaws.com/favicon/%s"
    return url_template % (config.s3_bucket(), self.id)
def s3_page_text_url(self):
    """Return the S3 URL of this object's page text.

    Returns None when ``self.s3_page_text`` is falsy.
    """
    if not self.s3_page_text:
        return None
    url_template = "http://%s.s3.amazonaws.com/page_text/%s"
    return url_template % (config.s3_bucket(), self.id)
def s3_screenshot_full_url(self):
    """Return the S3 URL of this object's full screenshot.

    Returns None when ``self.s3_screenshot_full`` is falsy.
    """
    if not self.s3_screenshot_full:
        return None
    url_template = "http://%s.s3.amazonaws.com/screenshot_full/%s"
    return url_template % (config.s3_bucket(), self.id)
def s3_screenshot_url(self, for_new_screenshot=False):
    """Return the S3 URL of this object's screenshot.

    The URL is built when ``self.s3_screenshot`` is truthy or when
    ``for_new_screenshot`` is truthy; otherwise None is returned.

    NOTE(review): the key prefix here is ``screenshot/app/``, while
    ``upload_screenshot`` in this source writes to ``screenshot/<id>`` --
    confirm which layout is intended.
    """
    if not self.s3_screenshot and not for_new_screenshot:
        return None
    url_template = "http://%s.s3.amazonaws.com/screenshot/app/%s"
    return url_template % (config.s3_bucket(), self.id)
def construct_s3_image_url(video_id, timestamp):
    """Build the S3 thumbnail URL for ``video_id`` at ``timestamp``.

    The URL has the form
    ``http://<bucket>.s3.amazonaws.com/thumbnail/<video_id>/<timestamp>``,
    where ``<bucket>`` comes from ``config.s3_bucket()``.
    """
    url_template = "http://%s.s3.amazonaws.com/thumbnail/%s/%s"
    return url_template % (config.s3_bucket(), video_id, timestamp)
def bucket(self):
    """Return the value of ``config.s3_bucket()``."""
    configured_bucket = config.s3_bucket()
    return configured_bucket