def get_resource_file(resource_file):
    """Download <resource_file>.tar.gz from the detector bucket and extract it locally."""
    bucket = config.s3_detector_bucket()
    tarball_name = resource_file + '.tar.gz'
    s3client.download_from_s3(bucket, tarball_name, tarball_name)
    with tarfile.open(tarball_name, 'r:*') as tar:
        tar.extractall()
    assert os.path.isfile(resource_file)
def create_new_tm_version(cls, classifier_model_files_dir, op_dir=None):
    """Creates a new versioned tarball and uploads it to the s3 affine detector bucket"""
    assert os.path.isdir(classifier_model_files_dir), \
        "Directory missing : %s" % classifier_model_files_dir
    if op_dir is None:
        op_dir = mkdtemp()
    if not os.path.isdir(op_dir):
        os.makedirs(op_dir)
    print('Storing model files at location: %s' % op_dir)
    # get new version and folder with appropriate name
    timestamp_int = cls.get_current_timestamp()
    # create tarball with version from input folder
    versioned_tarball_path, versioned_tarball_name = cls.create_versioned_tarball(
        classifier_model_files_dir, op_dir, timestamp_int)
    # upload to s3
    bucket = config.s3_detector_bucket()
    s3client.upload_to_s3(bucket, versioned_tarball_name,
                          versioned_tarball_path, public=False)
    # create new row in DB
    tmv = cls(detector_type='topic_model',
              timestamp=datetime.utcfromtimestamp(timestamp_int))
    session.flush()
    return tmv
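# Hedged usage sketch (not from the original source): how create_new_tm_version
# might be called once training has written its model files to disk. The
# directory path and the TopicModelVersion class name are assumptions.
#
#   tmv = TopicModelVersion.create_new_tm_version('/data/topic_model_files')
#   print('Uploaded topic model version: %s' % tmv.timestamp)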
def grab_files(self):
    """Download and extract this model's tarball from s3, retrying on IOError."""
    bucket = config.s3_detector_bucket()
    retry_operation(s3client.download_tarball, bucket, self.tarball_basename,
                    self.local_dir(), sleep_time=0.1, error_class=IOError)
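# retry_operation is not defined in this section. Below is a minimal sketch of
# what such a helper plausibly does, inferred from the call above; the name
# (suffixed _sketch), num_tries default, and exact signature are assumptions.
import time

def retry_operation_sketch(func, *args, **kwargs):
    """Call func(*args), retrying on error_class with a pause between attempts."""
    sleep_time = kwargs.pop('sleep_time', 1.0)
    error_class = kwargs.pop('error_class', Exception)
    num_tries = kwargs.pop('num_tries', 3)
    for attempt in range(num_tries):
        try:
            return func(*args, **kwargs)
        except error_class:
            if attempt == num_tries - 1:
                raise
            time.sleep(sleep_time)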
def grab_projection_files(self):
    """Download the PCA projection tarball from s3 and return the path to pca.xml."""
    download_dir = os.path.join(config.scratch_detector_path(),
                                self.PCA_TAR_BALL_NAME)
    bucket = config.s3_detector_bucket()
    logger.info('Downloading files from s3')
    s3client.download_tarball(bucket, self.PCA_TAR_BALL_NAME, download_dir)
    pca_file = os.path.join(download_dir, 'pca.xml')
    assert os.path.exists(pca_file), pca_file
    return pca_file
def fetch_domain_stopwords(self):
    """Build per-domain stop-word sets from word-frequency pickles stored on s3."""
    self.stop_dict = defaultdict(set)
    download_tarball(config.s3_detector_bucket(), DOMAIN_WF, DOMAIN_WF)
    for pp in os.listdir(DOMAIN_WF):
        pickle_file = os.path.join(DOMAIN_WF, pp)
        with open(pickle_file, 'rb') as fi:
            wf = pickle.load(fi)
        # words above the frequency threshold become stop words for that domain
        for i in wf:
            if wf[i] > STOP_THRESH:
                self.stop_dict[pp.replace('.pickle', '')].add(i)
def get_negative_set_s3(config, inter_dir, train_pos, test_pos):
    ''' Predetermined set of negative images from s3 '''
    bucket = affine_config.s3_detector_bucket()  # CHANGE
    min_neg_lines_to_use = int(
        config['train_detector_params']['neg_train_min_num_frames'])
    neg_train = get_set_lines(bucket, min_neg_lines_to_use, train_pos, inter_dir,
                              's3_neg_test_info_readymade.txt')
    min_neg_lines_to_use = int(
        config['train_detector_params']['neg_test_min_num_frames'])
    neg_test = get_set_lines(bucket, min_neg_lines_to_use, test_pos, inter_dir,
                             's3_neg_train_info_readymade.txt')
    return neg_train, neg_test
def tar_and_upload(self, clf):
    """
    Creates a tarball with model files/directories and uploads it to s3

    Args:
        clf: classifier or model that has a tarball_basename property.
            The uploaded tarball will have this name with a .tar.gz extension.
    """
    tarball_path = self.create_tarball(self.model_paths, clf.tarball_basename,
                                       op_dir=self.model_dir)
    bucket = config.s3_detector_bucket()
    s3client.upload_to_s3(bucket, os.path.basename(tarball_path), tarball_path,
                          public=False)
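# Hedged usage sketch (not from the original source): tar_and_upload only
# requires an object exposing a tarball_basename property; the classifier
# class and basename below are assumptions.
#
#   clf = SceneClassifier()       # clf.tarball_basename == 'scene_classifier'
#   trainer.tar_and_upload(clf)   # uploads scene_classifier.tar.gz, private ACL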
def write_neg_json(self, config_dict):
    """Creates train/test files containing negative data"""
    skip_labels = [
        int(i) for i in config_dict['youtube_data_ignore_labels']
    ]
    bucket = config.s3_detector_bucket()
    tarball_name = config_dict['neg_tarball_s3']
    # strip the '.tar.gz' suffix (str.rstrip would drop arbitrary trailing characters)
    if tarball_name.endswith('.tar.gz'):
        fname = tarball_name[:-len('.tar.gz')]
    else:
        fname = tarball_name
    s3client.download_from_s3(bucket, tarball_name, tarball_name)
    with tarfile.open(tarball_name, 'r:*') as tar:
        tar.extractall()
    assert os.path.isfile(fname)
    test_split = config_dict['test_split']
    neg_train_json = config_dict['neg_train_json']
    neg_test_json = config_dict['neg_test_json']
    fo_train = open(neg_train_json, 'w')
    fo_test = open(neg_test_json, 'w')
    count = 0
    skip_labels_set = set(skip_labels)
    skip_lines = set()
    # first pass: count lines and mark those with ignored labels
    for lnum, jsn in enumerate(open(fname)):
        yvt = YoutubeVideoText.to_object(jsn)
        if yvt.label_id in skip_labels_set:
            skip_lines.add(lnum)
        count += 1
    # shuffle usable line numbers and hold out a test fraction
    rr = list(range(count))
    random.shuffle(rr)
    line_nums = list(set(rr) - skip_lines)
    test_size = len(line_nums) * test_split
    test_line_nums = set(line_nums[:int(test_size)])
    # second pass: route each line to the train or test file
    for lnum, jsn in enumerate(open(fname)):
        if lnum in skip_lines:
            continue
        if lnum in test_line_nums:
            fo_test.write(jsn)
        else:
            fo_train.write(jsn)
    fo_train.close()
    fo_test.close()
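# A minimal config_dict sketch for write_neg_json, listing only the keys the
# method reads above; the values are illustrative assumptions, not real settings.
EXAMPLE_NEG_CONFIG = {
    'youtube_data_ignore_labels': ['101', '102'],   # label ids whose lines are skipped
    'neg_tarball_s3': 'neg_youtube_text.tar.gz',    # tarball name in the detector bucket
    'test_split': 0.2,                              # fraction of usable lines held out for test
    'neg_train_json': 'neg_train.json',             # output file for training lines
    'neg_test_json': 'neg_test.json',               # output file for test lines
}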
def grab_s3_files(self):
    """Download the current versioned classifier model files from s3."""
    bucket = config.s3_detector_bucket()
    tar_ball_name = 'classifier_model_files_%d' % self.current_version
    logger.info("downloading files from S3")
    s3client.download_tarball(bucket, tar_ball_name, self.model_files_dir)
    logger.info("done downloading files from S3")
def grab_s3_files(self):
    """Download model files from s3 and untar them to destination dir."""
    bucket = config.s3_detector_bucket()
    logger.info("downloading files from S3")
    s3client.download_tarball(bucket, 'asr_model', self.model_dir)
    logger.info("done downloading files from S3")
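# s3client.download_tarball is used throughout this section but not shown.
# A hedged sketch of what it plausibly does, following the download-then-extract
# pattern of get_resource_file above; the name (suffixed _sketch) and body are
# assumptions, and s3client.download_from_s3 is the same helper used there.
import os
import tarfile

def download_tarball_sketch(bucket, tarball_basename, dest_dir):
    """Download <tarball_basename>.tar.gz from bucket and extract it into dest_dir."""
    if not os.path.isdir(dest_dir):
        os.makedirs(dest_dir)
    tarball_name = tarball_basename + '.tar.gz'
    local_path = os.path.join(dest_dir, tarball_name)
    s3client.download_from_s3(bucket, tarball_name, local_path)
    with tarfile.open(local_path, 'r:*') as tar:
        tar.extractall(dest_dir)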