def get_resource_file(resource_file):
    """Download <resource_file>.tar.gz from the detector bucket and extract it in place."""
    bucket = config.s3_detector_bucket()
    tarball_name = resource_file + '.tar.gz'
    s3client.download_from_s3(bucket, tarball_name, tarball_name)
    with tarfile.open(tarball_name, 'r:*') as tar:
        tar.extractall()
    assert os.path.isfile(resource_file)
Example #2
def create_new_tm_version(cls, classifier_model_files_dir, op_dir=None):
    """Creates a new versioned tarball and uploads it to the s3 affine detector bucket"""
    assert os.path.isdir(
        classifier_model_files_dir
    ), "Directory missing: %s" % classifier_model_files_dir
    if op_dir is None:
        op_dir = mkdtemp()
    if not os.path.isdir(op_dir):
        os.makedirs(op_dir)
    print('Storing model files at location: %s' % op_dir)
    # get new version and folder with appropriate name
    timestamp_int = cls.get_current_timestamp()
    # create tarball with version from input folder
    versioned_tarball_path, versioned_tarball_name = cls.create_versioned_tarball(
        classifier_model_files_dir, op_dir, timestamp_int)
    # upload to s3
    bucket = config.s3_detector_bucket()
    s3client.upload_to_s3(bucket,
                          versioned_tarball_name,
                          versioned_tarball_path,
                          public=False)
    # create new row in DB
    tmv = cls(detector_type='topic_model',
              timestamp=datetime.utcfromtimestamp(timestamp_int))
    session.flush()
    return tmv
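
The create_versioned_tarball classmethod is not shown in this example. A minimal sketch of what it might look like, assuming the version is just the timestamp embedded in the tarball name (the layout here is hypothetical):

import os
import tarfile

def create_versioned_tarball(src_dir, op_dir, timestamp_int):
    """Pack src_dir into op_dir as <basename>_<timestamp>.tar.gz.

    Hypothetical sketch; on the class above this would be a classmethod.
    """
    versioned_name = '%s_%d' % (os.path.basename(os.path.normpath(src_dir)),
                                timestamp_int)
    tarball_name = versioned_name + '.tar.gz'
    tarball_path = os.path.join(op_dir, tarball_name)
    with tarfile.open(tarball_path, 'w:gz') as tar:
        # store the directory contents under the versioned name
        tar.add(src_dir, arcname=versioned_name)
    return tarball_path, tarball_name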
Example #3
def grab_files(self):
    bucket = config.s3_detector_bucket()
    retry_operation(s3client.download_tarball,
                    bucket,
                    self.tarball_basename,
                    self.local_dir(),
                    sleep_time=0.1,
                    error_class=IOError)
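
retry_operation is an external helper that is not shown here. A minimal sketch matching the call above (a callable, its positional args, a sleep between attempts, and the exception class to retry on); max_tries is an assumed default, not from the source:

import time

def retry_operation(func, *args, **kwargs):
    """Call func(*args), retrying on error_class with sleep_time between tries.

    Hypothetical sketch of the helper used above.
    """
    sleep_time = kwargs.pop('sleep_time', 1.0)
    error_class = kwargs.pop('error_class', Exception)
    max_tries = kwargs.pop('max_tries', 3)  # assumed default
    for attempt in range(max_tries):
        try:
            return func(*args, **kwargs)
        except error_class:
            if attempt == max_tries - 1:
                raise  # out of retries, re-raise the last error
            time.sleep(sleep_time)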
def grab_projection_files(self):
    download_dir = os.path.join(config.scratch_detector_path(), self.PCA_TAR_BALL_NAME)
    bucket = config.s3_detector_bucket()
    logger.info('Downloading files from s3')
    s3client.download_tarball(bucket, self.PCA_TAR_BALL_NAME, download_dir)
    pca_file = os.path.join(download_dir, 'pca.xml')
    assert os.path.exists(pca_file), pca_file
    return pca_file
Example #5
def fetch_domain_stopwords(self):
    """Build per-domain stopword sets from word-frequency pickles on s3."""
    self.stop_dict = defaultdict(set)
    download_tarball(config.s3_detector_bucket(), DOMAIN_WF, DOMAIN_WF)
    for pp in os.listdir(DOMAIN_WF):
        pickle_file = os.path.join(DOMAIN_WF, pp)
        with open(pickle_file, 'rb') as fi:
            wf = pickle.load(fi)
        for word in wf:
            if wf[word] > STOP_THRESH:
                self.stop_dict[pp.replace('.pickle', '')].add(word)
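
DOMAIN_WF and STOP_THRESH are module-level constants that are not shown here; the method expects DOMAIN_WF to be a directory of per-domain word-frequency pickles. A hypothetical sketch of what one such pickle might contain (the domain name and values are made up):

import pickle

# word -> frequency score; the file basename becomes the stop_dict key
wf = {'the': 0.93, 'goal': 0.04}
with open('sports.pickle', 'wb') as fo:
    pickle.dump(wf, fo)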
def get_negative_set_s3(config, inter_dir, train_pos, test_pos):
    ''' Predetermined set of negative images from s3 '''
    # the 'config' argument shadows the config module, hence affine_config here
    bucket = affine_config.s3_detector_bucket()
    min_neg_lines_to_use = int(
        config['train_detector_params']['neg_train_min_num_frames'])
    neg_train = get_set_lines(bucket, min_neg_lines_to_use, train_pos,
                              inter_dir, 's3_neg_train_info_readymade.txt')
    min_neg_lines_to_use = int(
        config['train_detector_params']['neg_test_min_num_frames'])
    neg_test = get_set_lines(bucket, min_neg_lines_to_use, test_pos, inter_dir,
                             's3_neg_test_info_readymade.txt')
    return neg_train, neg_test
def tar_and_upload(self, clf):
    """Creates a tarball with model files/directories and uploads it to s3

    Args:
        clf: classifier or model that has a tarball_basename property.
            The uploaded tarball will have this name with a .tar.gz
            extension.
    """
    tarball_path = self.create_tarball(self.model_paths,
                                       clf.tarball_basename,
                                       op_dir=self.model_dir)
    bucket = config.s3_detector_bucket()
    s3client.upload_to_s3(bucket,
                          os.path.basename(tarball_path),
                          tarball_path,
                          public=False)
def write_neg_json(self, config_dict):
    """Creates train/test files containing negative data"""
    skip_labels = [
        int(i) for i in config_dict['youtube_data_ignore_labels']
    ]
    bucket = config.s3_detector_bucket()
    tarball_name = config_dict['neg_tarball_s3']
    # strip the literal '.tar.gz' suffix; rstrip would remove a character
    # set, not the suffix, and can eat into the basename
    fname = tarball_name[:-len('.tar.gz')]
    s3client.download_from_s3(bucket, tarball_name, tarball_name)
    with tarfile.open(tarball_name, 'r:*') as tar:
        tar.extractall()
    assert os.path.isfile(fname)

    test_split = config_dict['test_split']
    neg_train_json = config_dict['neg_train_json']
    neg_test_json = config_dict['neg_test_json']
    fo_train = open(neg_train_json, 'w')
    fo_test = open(neg_test_json, 'w')
    count = 0
    skip_labels_set = set(skip_labels)
    skip_lines = set()
    # first pass: count lines and record the ones with ignored labels
    with open(fname) as fi:
        for lnum, jsn in enumerate(fi):
            yvt = YoutubeVideoText.to_object(jsn)
            if yvt.label_id in skip_labels_set:
                skip_lines.add(lnum)
            count += 1

    rr = list(range(count))
    random.shuffle(rr)
    # keep the shuffled order; passing through set() would discard it
    line_nums = [i for i in rr if i not in skip_lines]
    test_size = len(line_nums) * test_split
    test_line_nums = set(line_nums[:int(test_size)])
    # second pass: route each kept line to the train or test file
    with open(fname) as fi:
        for lnum, jsn in enumerate(fi):
            if lnum in skip_lines:
                continue
            if lnum in test_line_nums:
                fo_test.write(jsn)
            else:
                fo_train.write(jsn)

    fo_train.close()
    fo_test.close()
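
Note the suffix handling above: str.rstrip removes a trailing character set, not a literal suffix, which is why the slice tarball_name[:-len('.tar.gz')] is used. A quick illustration:

name = 'negative_data.tar.gz'
print(name.rstrip('.tar.gz'))    # 'negative_d' -- over-strips
print(name[:-len('.tar.gz')])    # 'negative_data'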
def grab_s3_files(self):
    bucket = config.s3_detector_bucket()
    tar_ball_name = 'classifier_model_files_%d' % self.current_version
    logger.info("downloading files from S3")
    s3client.download_tarball(bucket, tar_ball_name, self.model_files_dir)
    logger.info("done downloading files from S3")
Example #10
def grab_s3_files(self):
    """Download model files from s3 and untar them to destination dir."""
    bucket = config.s3_detector_bucket()
    logger.info("downloading files from S3")
    s3client.download_tarball(bucket, 'asr_model', self.model_dir)
    logger.info("done downloading files from S3")