Example #1
    def __init__(self, cluster, url, component, service):
        '''
        @param cluster: Cluster name, set in the configuration file or via the command line.
        @param url: URL where each component exposes its metrics. For example, http://ip:9870/jmx
                    returns the HDFS metrics, while http://ip:8088/jmx returns the ResourceManager metrics.
        @param component: Component name, e.g. "hdfs", "resourcemanager", "mapreduce", "hive", "hbase".
        @param service: Service name, e.g. "namenode", "resourcemanager", "mapreduce".
        '''
        self._cluster = cluster
        # Strip any trailing /
        self._url = url.rstrip('/')
        self._component = component
        # Metric prefix, in the form hadoop_<component>_<service>
        self._prefix = 'hadoop_{0}_{1}'.format(component, service)
        # Get the list of JSON files named after the service, e.g. "namenode" loads the JSON
        # files from every namenode folder; the returned entries are file names.
        self._file_list = utils.get_file_list(service)
        # Get all JSON files in the common directory
        self._common_file = utils.get_file_list("common")
        # Merge both JSON file lists
        self._merge_list = self._file_list + self._common_file
        # Holds the metric objects
        self._metrics = {}
        for i in range(len(self._file_list)):
            # Use the file name as the key and load the corresponding metric configuration (JSON) file
            self._metrics.setdefault(self._file_list[i], utils.read_json_file(service, self._file_list[i]))
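
Example #1 relies on a utils module whose get_file_list(service) and read_json_file(service, name) helpers load the per-service metric definitions from JSON files; that module is not shown above. Below is a minimal sketch of such helpers, assuming the JSON definitions live under a metrics/<service>/ directory next to the code (the layout and helper bodies are assumptions, not the project's actual implementation).

import json
import os

# Assumed layout: a metrics/ directory next to this module, one subdirectory per service.
METRICS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "metrics")


def get_file_list(service):
    # Return the JSON file names (without extension) found under metrics/<service>/.
    service_dir = os.path.join(METRICS_DIR, service)
    return [os.path.splitext(name)[0]
            for name in os.listdir(service_dir)
            if name.endswith(".json")]


def read_json_file(service, name):
    # Load metrics/<service>/<name>.json and return the parsed dict.
    with open(os.path.join(METRICS_DIR, service, name + ".json")) as fh:
        return json.load(fh)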
Example #2
def main(hypes_file, output_dir, override):
    """
    Orchestrate.

    Parameters
    ----------
    hypes_file : str
        Path to a JSON file
    output_dir : str
        Path where the output should be stored
    override : bool
        If True, then override the model if it exists.
    """
    # Load hyperparameters
    with open(hypes_file, 'r') as f:
        hypes = json.load(f)

    # Set serialization path
    base = os.path.dirname(hypes_file)
    model_file_path = os.path.join(base, '%s.json' % hypes['model']['name'])
    model_file_path = os.path.abspath(model_file_path)

    if not os.path.isfile(model_file_path) or override:
        if not os.path.isfile(model_file_path):
            logging.info("Did not find '%s'. Start training...",
                         model_file_path)
        else:
            logging.info("Override '%s'. Start training...",
                         model_file_path)

        # Get training data
        x_files, y_files = get_file_list(hypes, 'train')

        # "Train" "classifier" (it just counts the classes)
        model = {}
        for i in range(len(hypes['classes'])):
            model[i] = 0

        for y_file in y_files:
            logging.info("Read '%s'...", y_file)
            mask = load_segmentation_mask(hypes, y_file)
            for row in mask:
                for pixel in row:
                    model[pixel] += 1

        # save model as json file
        with open(model_file_path, 'w') as f:
            json.dump(model, f)
    else:
        # load model from json file
        with open(model_file_path) as f:
            model = json.load(f)
    # Evaluate
    data = get_file_list(hypes, 'test')
    analyze.evaluate(hypes,
                     data,
                     output_dir,
                     model,
                     elements=[0, 1],
                     get_segmentation=get_segmentation)
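
The hypes_file consumed by Example #2 only needs hypes['model']['name'] and hypes['classes'] in the code above, plus whatever its get_file_list(hypes, phase) helper reads. A minimal illustrative configuration might look like the following sketch; the "data" block is an assumed shape, not taken from the project.

import json

# Hypothetical hyperparameter file for Example #2.
hypes = {
    "model": {"name": "class_counter"},
    "classes": [{"name": "background"}, {"name": "road"}],
    # Assumed: file lists that get_file_list(hypes, 'train'/'test') would resolve.
    "data": {"train": "data/train.txt", "test": "data/test.txt"},
}

with open("class_counter_hypes.json", "w") as f:
    json.dump(hypes, f, indent=2)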
Example #3
    def generate_file_diff(self, source_file_directory, target_file_directory):
        source_file_list = get_file_list(source_file_directory)
        target_file_list = get_file_list(target_file_directory)

        for filename in target_file_list:
            if '.txt' in filename and filename in source_file_list:
                target_file_path = os.path.join(target_file_directory, filename)
                source_file_path = os.path.join(source_file_directory, filename)

                if os.path.isfile(source_file_path) and os.path.isfile(target_file_path):
                    results = generate_file_diff(source_file_path, target_file_path)
                    # Are there any changes in the logs
                    insertion_count = results.count('ins style')
                    deletion_count = results.count('del style')

                    if insertion_count > 0 or deletion_count > 0:
                        results = results.replace(' ', '&nbsp;')

                        rep_dict = {"ins&nbsp;style": "ins style", "del&nbsp;style": "del style", "&para;": ''}
                        results = multiple_replace(results, rep_dict)

                        source_filename = 'File 1: ' + filename + ' (created on ' + \
                                          get_datetime_string(get_file_timestamp(source_file_path)) + ')'
                        target_filename = 'File 2: ' + filename + ' (created on ' + \
                                          get_datetime_string(get_file_timestamp(target_file_path)) + ')'

                        # Add insertion and deletion status
                        html_code = source_filename + '<br>' + target_filename + '<br><br>' + \
                                    '<ins style="background:#e6ffe6;">Insertions</ins>:&nbsp;' + str(insertion_count) + \
                                    '&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;' + \
                                    '<del style="background:#ffe6e6;">Deletions</del>:&nbsp;' + str(deletion_count) + \
                                    '<hr>'
                        diff_file_name = os.path.join(target_file_directory, filename + '.diff.html')
                        with open(diff_file_name, 'w') as fo:
                            fo.write('<pre>' + html_code + results + '</pre>')
Example #4
def predict_test(inference, result_dir, style_dir, content_dir):
    list_path_content = get_file_list(content_dir)
    list_path_style = get_file_list(style_dir)

    dir_out_img = os.path.join(result_dir, 'image')
    check_folder(dir_out_img)

    for style_file in list_path_style:
        style_prefix, _ = os.path.splitext(style_file)
        style_prefix = os.path.basename(style_prefix)
        style_img = load_data_testing(style_file)

        for content_file in list_path_content:
            content_prefix, _ = os.path.splitext(content_file)
            content_prefix = os.path.basename(content_prefix)
            content_img = load_data_testing(content_file)

            print("Processing: size_content: (%d,%d)   size_style: (%d,%d)" %
                  (content_img.shape[1], content_img.shape[2],
                   style_img.shape[1], style_img.shape[2]))
            print(style_file)
            print(content_file)

            results = inference.predict(content_img, style_img)

            img_fakes = results['img_fakes'][0]
            for i in range(len(img_fakes)):
                image_path = os.path.join(
                    dir_out_img, '{}-{}-{}.jpg'.format(style_prefix,
                                                       content_prefix, str(i)))
                save_images(img_fakes[i], [1, 1], image_path)
Example #5
    def generate_file_diff(self, source_file_directory, target_file_directory):
        source_file_list = get_file_list(source_file_directory)
        target_file_list = get_file_list(target_file_directory)

        for filename in target_file_list:
            if '.txt' in filename and filename in source_file_list:
                target_file_path = os.path.join(target_file_directory, filename)
                source_file_path = os.path.join(source_file_directory, filename)

                if os.path.isfile(source_file_path) and os.path.isfile(target_file_path):
                    results = generate_file_diff(source_file_path, target_file_path)
                    # Are there any changes in the logs
                    insertion_count = results.count('ins style')
                    deletion_count = results.count('del style')

                    if insertion_count > 0 or deletion_count > 0:
                        results = results.replace(' ', '&nbsp;')

                        rep_dict = {"ins&nbsp;style": "ins style", "del&nbsp;style": "del style", "&para;": ''}
                        results = multiple_replace(results, rep_dict)

                        source_filename = 'File 1: ' + filename + ' (created on ' + \
                                          get_datetime_string(get_file_timestamp(source_file_path)) + ')'
                        target_filename = 'File 2: ' + filename + ' (created on ' + \
                                          get_datetime_string(get_file_timestamp(target_file_path)) + ')'

                        # Add insertion and deletion status
                        html_code = source_filename + '<br>' + target_filename + '<br><br>' + \
                                    '<ins style="background:#e6ffe6;">Insertions</ins>:&nbsp;' + str(insertion_count) + \
                                    '&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;' + \
                                    '<del style="background:#ffe6e6;">Deletions</del>:&nbsp;' + str(deletion_count) + \
                                    '<hr>'
                        diff_file_name = os.path.join(target_file_directory, filename + '.diff.html')
                        with open(diff_file_name, 'w') as fo:
                            fo.write('<pre>' + html_code + results + '</pre>')
Example #6
    def generate_file_diff(self, source_string, target_string, source_file_directory, target_file_directory):
        source_file_list = get_file_list(source_file_directory)
        target_file_list = get_file_list(target_file_directory)

        for filename in target_file_list:
            if target_string in filename and filename.replace(target_string, source_string) in source_file_list:
                target_file_path = os.path.join(target_file_directory, filename)
                source_file_path = os.path.join(
                    source_file_directory, filename.replace(target_string, source_string))

                if os.path.isfile(source_file_path) and os.path.isfile(target_file_path):
                    results = generate_file_diff(source_file_path, target_file_path)
                    # Are there any changes in the logs
                    insertion_count = results.count('ins style')
                    deletion_count = results.count('del style')

                    if insertion_count > 0 or deletion_count > 0:
                        results = results.replace(' ', '&nbsp;')

                        # Performs a one-pass replacements
                        rep = {"ins&nbsp;style": "ins style", "del&nbsp;style": "del style", "&para;": ''}
                        rep = dict((re.escape(k), v) for k, v in rep.iteritems())
                        pattern = re.compile("|".join(rep.keys()))
                        results = pattern.sub(lambda m: rep[re.escape(m.group(0))], results)

                        # Add insertion and deletion status
                        html_code = '<ins style="background:#e6ffe6;">Insertions</ins>:&nbsp;' + str(insertion_count) + \
                                    '&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;' + \
                                    '<del style="background:#ffe6e6;">Deletions</del>:&nbsp;' + str(deletion_count) + \
                                    '<hr>'
                        diff_file_name = os.path.join(target_file_directory, filename + '.diff.html')
                        with open(diff_file_name, 'w') as fo:
                            fo.write('<pre>' + html_code + results + '</pre>')
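
Examples #3 and #5 call a multiple_replace(results, rep_dict) helper that is not defined in those snippets; Example #6 inlines the same one-pass regex substitution using the Python 2 iteritems(). A minimal Python 3 sketch of that helper, following the pattern shown in Example #6:

import re


def multiple_replace(text, rep_dict):
    # Perform all replacements in rep_dict over text in a single regex pass.
    escaped = {re.escape(k): v for k, v in rep_dict.items()}
    pattern = re.compile("|".join(escaped.keys()))
    return pattern.sub(lambda m: escaped[re.escape(m.group(0))], text)

With the rep_dict used above, this turns the escaped 'ins&nbsp;style' / 'del&nbsp;style' markers back into plain 'ins style' / 'del style' and drops the '&para;' pilcrows.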
Example #7
def main(hypes_file, output_dir, override):
    """
    Orchestrate.

    Parameters
    ----------
    hypes_file : str
        Path to a JSON file
    output_dir : str
        Path where the output should be stored
    override : bool
        If True, then override the model if it exists.
    """
    # Load hyperparameters
    with open(hypes_file, 'r') as f:
        hypes = json.load(f)

    # Set serialization path
    base = os.path.dirname(hypes_file)
    model_file_path = os.path.join(base, '%s.pickle' % hypes['model']['name'])
    model_file_path = os.path.abspath(model_file_path)

    if not os.path.isfile(model_file_path) or override:
        if not os.path.isfile(model_file_path):
            logging.info("Did not find '%s'. Start training...",
                         model_file_path)
        else:
            logging.info("Override '%s'. Start training...", model_file_path)

        # Get training data
        x_files, y_files = get_file_list(hypes, 'train')

        # "Train" "classifier" (it just counts the classes)
        model = {'positions': None, 'files': 0}

        for y_file in y_files:
            logging.info("Read '%s'...", y_file)
            mask = load_segmentation_mask(hypes, y_file)
            if model['positions'] is None:
                model['positions'] = mask
            else:
                model['positions'] += mask
            model['files'] += 1

        # save model as pickle file
        scipy.misc.imsave("instruments.png", model['positions'])
        with open(model_file_path, 'wb') as handle:
            pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        # load model from pickle file
        with open(model_file_path, 'rb') as handle:
            model = pickle.load(handle)
    # Evaluate
    data = get_file_list(hypes, 'test')
    analyze.evaluate(hypes,
                     data,
                     output_dir,
                     model,
                     elements=[0, 1],
                     get_segmentation=get_segmentation)
Example #8
def mirflickr_annotations(min_tag_count=50):
    global nlp

    # tags that appear at least 50 times among all images
    fname = join(settings.MIRFLICKR_PATH, "mirflickr25k", "doc", "common_tags.txt")
    with open(fname) as fh:
        frequent_tags = [lin.split()[0] for lin in fh.readlines() if int(lin.split()[1]) >= min_tag_count]
    frequent_tags = sorted(frequent_tags)

    # read images
    imlist, impath = get_file_list(settings.MIRFLICKR_PATH, (".jpg",))
    potential_tags = {"tag2im": {}, "im2tag": {im: [] for im in imlist}}
    relevant_tags = {"tag2im": {}, "im2tag": {im: [] for im in imlist}}

    # read annotations
    flist, fpath = get_file_list(join(settings.MIRFLICKR_PATH, "annotations"), (".txt",))
    flist.remove("README.txt")

    id2im = lambda id_: "im{}.jpg".format(id_)
    im2id = lambda im_: int(im_[2:-4])

    # 24 potential tags
    for f in [f_ for f_ in flist if not f_.endswith("_r1.txt")]:
        tag = splitext(f)[0]
        with open(join(fpath, f)) as fh:
            potential_tags["tag2im"][tag] = sorted([id2im(id_.strip()) for id_ in fh.readlines()])

    for tag, imlist in potential_tags["tag2im"].items():
        for im in imlist:
            potential_tags["im2tag"][im].append(tag)

    # 14 relevant tags
    for f in [f_ for f_ in flist if f_.endswith("_r1.txt")]:
        tag = splitext(f)[0].replace("_r1", "")
        with open(join(fpath, f)) as fh:
            relevant_tags["tag2im"][tag] = sorted(["im{}.jpg".format(id_.strip()) for id_ in fh.readlines()])

    for tag, imlist in relevant_tags["tag2im"].items():
        for im in imlist:
            relevant_tags["im2tag"][im].append(tag)

    potential_tags_ = list(potential_tags["tag2im"].keys())
    potential_images_ = list(set(sum(potential_tags["tag2im"].values(), [])))
    relevant_tags_ = list(relevant_tags["tag2im"].keys())
    relevant_images_ = list(set(sum(relevant_tags["tag2im"].values(), [])))

    print(" >> tags w/ more than {} counts: {}".format(min_tag_count, len(frequent_tags)))
    print(" >> potential tags: {} ({} images)".format(len(potential_tags_), len(potential_images_)))
    print(" >> relevant tags: {} ({} images)".format(len(relevant_tags_), len(relevant_images_)))

    mirflickr = {
        "tags": sorted(list(set(frequent_tags + potential_tags_ + relevant_tags_))),
        "frequent": frequent_tags,
        "potential": potential_tags,
        "relevant": relevant_tags,
    }

    return mirflickr
Example #9
def split_imagenet_dataset(image_dir, anno_dir):
    """
    Split imagenet dataset to train, validation and test
    :param image_dir: Downloaded images folder
    :type image_dir: String
    :param anno_dir: Annotation directory
    :type anno_dir: String
    """
    data_map = {}
    img_count = 0
    for img in get_file_list(image_dir, format=".jpg") + get_file_list(
            image_dir, format=".JPEG"):
        img_base = os.path.splitext(os.path.basename(img))[0]
        data_map[img_base] = False
        img_count += 1

    ann_count = 0

    for anno in get_file_list(anno_dir, format=".xml"):
        anno_base = os.path.splitext(os.path.basename(anno))[0]
        if anno_base in data_map:
            img_count -= 1
            data_map[anno_base] = True
        else:
            ann_count += 1

    print("Img not annotated:", img_count)
    print("Anno without image:", ann_count)

    filtered_set = [k for k in data_map.keys() if data_map[k]]
    wnid_set = set([k.split('_')[0] for k in filtered_set])

    # Split filtered set to train, test and validation sets
    wnid_map = {k: v for v, k in enumerate(wnid_set)}
    x_all = range(len(filtered_set))
    y_all = [wnid_map[x.split('_')[0]] for x in filtered_set]
    X_train, test_data, _, test_label = sk.train_test_split(x_all,
                                                            y_all,
                                                            test_size=0.2,
                                                            random_state=42,
                                                            stratify=y_all)
    X_test, X_eval, _, _ = sk.train_test_split(test_data,
                                               test_label,
                                               test_size=0.5,
                                               random_state=42,
                                               stratify=test_label)

    split = {
        'data': filtered_set,
        'label_map': wnid_map,
        'train': X_train,
        'test': X_test,
        'validation': X_eval
    }

    return split
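
split_imagenet_dataset applies train_test_split twice with stratify, first carving out 20% of the data and then halving that remainder, so every WNID keeps the same proportion in the roughly 80/10/10 train/test/validation split. A small self-contained illustration of the same two-stage pattern (toy data, sklearn imported directly instead of through the sk alias used above):

from sklearn.model_selection import train_test_split

# Toy data: 10 samples for each of 3 classes; labels double as the strata.
x_all = list(range(30))
y_all = [i // 10 for i in x_all]

x_train, x_rest, _, y_rest = train_test_split(
    x_all, y_all, test_size=0.2, random_state=42, stratify=y_all)
x_test, x_eval, _, _ = train_test_split(
    x_rest, y_rest, test_size=0.5, random_state=42, stratify=y_rest)

print(len(x_train), len(x_test), len(x_eval))  # 24 3 3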
Example #10
def process_openimages(data_dir, class_filter=None):
    """
    Process the Open Images dataset into per-split image and box maps.
    :param data_dir: root directory containing 'train', 'test' and 'validation' subdirectories.
    :type data_dir: str
    :param class_filter: optional collection of lower-case class names to keep; keep all if None.
    :type class_filter: collection or None
    :return: dict mapping each split name to {'images': {...}, 'boxes': {...}}.
    :rtype: dict
    """

    global OPEN_IMAGES_OBJECTS_SET

    dataset = dict()
    splits = [
        os.path.join(data_dir, split)
        for split in ['train', 'test', 'validation']
    ]
    for split in splits:
        split_dir = os.path.basename(split)
        print("Current split:", split_dir)
        dataset[split_dir] = {'images': dict(), 'boxes': dict()}
        obj_list = get_immediate_subdirectories(split)
        for obj in tqdm(obj_list):
            obj_name = os.path.basename(obj).lower()

            if class_filter:
                if obj_name not in class_filter:
                    continue

            img_file_list = get_file_list(obj, format=".jpg")

            if len(img_file_list) > 0:
                OPEN_IMAGES_OBJECTS_SET.add(obj_name)

            label_dir = os.path.join(obj, 'Label')
            label_list = get_file_list(label_dir, format=".txt")
            for img in img_file_list:
                img_name, _ = os.path.splitext(os.path.basename(img))
                dataset[split_dir]['images'][img_name] = img
            for label in label_list:
                label_name, _ = os.path.splitext(os.path.basename(label))
                if label_name not in dataset[split_dir]['boxes']:
                    dataset[split_dir]['boxes'][label_name] = list()
                with open(label, 'r') as label_file:
                    annotations = label_file.readlines()
                for annotation in annotations:
                    dataset[split_dir]['boxes'][label_name].append(
                        annotation.lower().split())

    OPEN_IMAGES_OBJECTS_SET = sorted(OPEN_IMAGES_OBJECTS_SET)
    update_label_map(OPEN_IMAGES_OBJECTS_SET)

    return dataset
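
The dict returned by process_openimages nests one 'images' map and one 'boxes' map per split, keyed by the image base name. A tiny illustration of the resulting shape (paths, ids and box values below are made up):

dataset = {
    "train": {
        "images": {"0001eeaf4aed83f9": "train/apple/0001eeaf4aed83f9.jpg"},
        # Each box is the lower-cased, whitespace-split line from the Label/*.txt file.
        "boxes": {"0001eeaf4aed83f9": [["apple", "10.0", "20.0", "150.0", "200.0"]]},
    },
    "test": {"images": {}, "boxes": {}},
    "validation": {"images": {}, "boxes": {}},
}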
Example #11
def vectorize_images(input_dir, mask_dir, output_dir, cache_dir, image_level,
                     patch_size):
    """
    Converts a set of whole-slide images into numpy arrays with valid tissue patches for fast processing.

    :param input_dir: folder containing the whole-slide images.
    :param mask_dir: folder containing the whole-slide masks.
    :param output_dir: destination folder to store the vectorized images.
    :param cache_dir: folder to store whole-slide images temporarily for fast access.
    :param image_level: image resolution to read the patches.
    :param patch_size: size of the read patches.
    :return: nothing
    """

    # Output dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Read image file names
    image_paths = get_file_list(input_dir,
                                ext='tif')  # get all the .tif whole-slide images

    # Read mask file names
    mask_paths = get_file_list(mask_dir)  # get all the mask files

    total_images = len(image_paths)

    for index in range(total_images):
        image_id = (os.path.basename(image_paths[index])).split('.')[0]
        output_pattern = output_dir + '/' + image_id + '_{item}.npy'  # by convention on NIC it has to be a .npy
        vectorized_png = output_dir + '/' + image_id + '_{item}.png'
        if not check_file_exists(vectorized_png):
            print(f'Processing image {image_id}')
            vectorize_wsi(image_path=cache_file(image_paths[index],
                                                cache_dir,
                                                overwrite=False),
                          mask_path=mask_paths[index],
                          output_pattern=output_pattern,
                          image_level=image_level,
                          mask_level=image_level,
                          patch_size=patch_size,
                          stride=patch_size,
                          downsample=1,
                          select_bounding_box=False)
            print(
                f'Successfully vectorized {image_id}: {total_images - index} images left'
            )
        else:
            print(
                f'Already existing file {image_id} - {total_images - index - 1} images left'
            )
    print('Finished processing all images!')
Example #12
    def __init__(self, cluster, component, service):
        self.cluster = cluster
        self.component = component
        self.prefix = 'hadoop_{0}_{1}'.format(component, service)

        self.file_list = utils.get_file_list(service)
        self.metrics = {}
        for i in range(len(self.file_list)):
            self.metrics.setdefault(
                self.file_list[i],
                utils.read_json_file(service, self.file_list[i]))

        common_file = utils.get_file_list("common")
        self.merge_list = self.file_list + common_file
Example #13
def connect_base(current_commit, base_commit):
    """Creates a merge commit that takes files from base_commit.

    It is effectively the same as running git merge base_commit in current_commit.

    Args:
        current_commit: commit hash to commit onto.
        base_commit: commit hash that contains the file histories to take.
    """
    current_files = utils.get_file_list(current_commit)
    base_files = utils.get_file_list(base_commit)
    tree = utils.git_mktree(current_files + base_files)
    return utils.git_commit(tree, [current_commit, base_commit],
                            message=b'Connect history with base %s' %
                            (base_commit.encode('ascii')))
Example #14
File: log.py Project: smjurcak/csm
def api_get_session_logs(table):
    id = request.args.get("record_id")

    db_session = DBSession()
    if table == 'install_job':
        install_job = db_session.query(InstallJob).filter(InstallJob.id == id).first()
    elif table == 'install_job_history':
        install_job = db_session.query(InstallJobHistory).filter(InstallJobHistory.id == id).first()
    elif table == 'inventory_job_history':
        install_job = db_session.query(InventoryJobHistory).filter(InventoryJobHistory.id == id).first()

    if install_job is None:
        abort(404)

    log_folder = install_job.session_log
    file_path = os.path.join(get_log_directory(), log_folder)

    if not os.path.isdir(file_path):
        abort(404)

    rows = []
    log_file_list = get_file_list(file_path)
    for file in log_file_list:
        row = dict()
        row['filepath'] = os.path.join(file_path, file)
        row['filename'] = file
        rows.append(row)

    return jsonify(**{'data': rows})
Example #15
def make_chunks(num_links, min_chunk_lines=MIN_CHUNK_LINES):
    text_fns = utils.get_file_list(utils.TEXTS_DIR, num_links)
    max_chunks = min(utils.CHUNKS_FOR_SOURCE, len(text_fns))
    texts_processed = 0
    for text_idx, text_fn in enumerate(text_fns[:utils.CHUNKS_FOR_SOURCE],
                                       start=1):
        chunk_fn = text_fn.replace(utils.TEXTS_DIR, utils.CHUNKS_DIR)
        assert chunk_fn != text_fn, 'ERROR: invalid path to text file'
        if not os.path.isfile(chunk_fn):
            with open(text_fn, 'rt', encoding='utf-8') as f_in:
                text = f_in.read().split('\n')[1:]
            with open(chunk_fn, 'wt', encoding='utf-8') as f_out:
                lines, chunk_words = [], 0
                for line_no, line in enumerate(text):
                    line = re.sub(r'\s+', ' ',
                           re.sub(r'[\u2800\uFE00-\uFE0F]', '', line)).strip()
                    if not line:
                        continue
                    chunk_words += len(line.split())
                    if line_no < min_chunk_lines \
                    or chunk_words <= MAX_CHUNK_WORDS:
                        lines.append(line)
                    else:
                        break
                f_out.write('\n'.join(lines))
                print('\r{} (of {})'.format(text_idx, max_chunks),
                      end='')
                texts_processed += 1
    if texts_processed:
        print()
Example #16
File: log.py Project: smjurcak/csm
def api_get_session_logs(table):
    id = request.args.get("record_id")

    db_session = DBSession()
    if table == 'install_job':
        install_job = db_session.query(InstallJob).filter(
            InstallJob.id == id).first()
    elif table == 'install_job_history':
        install_job = db_session.query(InstallJobHistory).filter(
            InstallJobHistory.id == id).first()
    elif table == 'inventory_job_history':
        install_job = db_session.query(InventoryJobHistory).filter(
            InventoryJobHistory.id == id).first()

    if install_job is None:
        abort(404)

    log_folder = install_job.session_log
    file_path = os.path.join(get_log_directory(), log_folder)

    if not os.path.isdir(file_path):
        abort(404)

    rows = []
    log_file_list = get_file_list(file_path)
    for file in log_file_list:
        row = dict()
        row['filepath'] = os.path.join(file_path, file)
        row['filename'] = file
        rows.append(row)

    return jsonify(**{'data': rows})
Example #17
def prepload(extension='fits', N=-1, flist=None):
    
    if flist is None:
        # read the file list
        flist = np.array(utils.get_file_list(imtype=extension))
        #, verbose=True)

    imlist_dict = {}
    # argsort index over the number embedded in each image name, e.g. 75605 for srch75605.fits
    imlist_dict["fnumbers"] = np.argsort(np.array(
        [int(''.join(filter(str.isdigit, f.split("/")[-1]))) for f in flist]))
    # stores the name of the images as a list
    imlist_dict["flist"] = flist[imlist_dict["fnumbers"]]
    # reads only N images - if -1 reads all images
    if N > -1:
        imlist_dict["flist"] = imlist_dict["flist"][:N * 3]
    # checks that the images are in triplets (template, search, difference)
    if imlist_dict["fnumbers"].shape[0] % 3 :
        print("warning: images not in triplets")
        return np.nan
    
    # the total number of triplets
    imlist_dict["nimgs"] = int(imlist_dict["fnumbers"].shape[0] / 3)
    # the list of objects by numbers
    imlist_dict["nobjects"] = np.unique(imlist_dict["fnumbers"])

    # reads in the images choosing method depending on extension
    if extension == "fits":
        imlist_dict["imshp"] = utils.fits2stamp(imlist_dict["flist"][0]).shape
    else:
        imlist_dict["imshp"] = plt.imread(imlist_dict["flist"][0]).shape

    return imlist_dict, imlist_dict["flist"]
Example #18
def infer(model_path, image_shape, label_dict_path, infer_file_list_path):

    infer_file_list = get_file_list(infer_file_list_path)
    # Load the label dictionary
    char_dict = load_dict(label_dict_path)
    # Load the reversed label dictionary
    reversed_char_dict = load_reverse_dict(label_dict_path)
    # Get the dictionary size
    dict_size = len(char_dict)
    # Build the data reader
    data_generator = DataGenerator(char_dict=char_dict,
                                   image_shape=image_shape)
    # Initialize PaddlePaddle
    paddle.init(use_gpu=True, trainer_count=2)
    # Load the trained parameters
    parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))
    # Build the network model
    model = Model(dict_size, image_shape, is_infer=True)
    # Create the inferer
    inferer = paddle.inference.Inference(output_layer=model.log_probs,
                                         parameters=parameters)
    # Run inference
    test_batch = []
    labels = []
    for i, (image, label) in enumerate(
            data_generator.infer_reader(infer_file_list)()):
        test_batch.append([image])
        labels.append(label)
    infer_batch(inferer, test_batch, labels, reversed_char_dict)
Example #19
  def batch_features_extract(self, opts, flag='query', enable_cache=True):
    print("===================================")
    fea = None
  
    fea_smps = getattr(opts, flag)
    cache_file = os.path.join(self.cache_root, self.feature_info + '_' + fea_smps.replace('/','_') +'.pkl')
    if enable_cache and os.path.isfile(cache_file):
      print('Feature cache file exists : '+ cache_file)
      print('loading '+flag+' features...')
      with open(cache_file, 'rb') as f:
        fea = pickle.load(f)
      print('Loading '+flag+' features completed.')
    else:
      if enable_cache:
        print('Feature cache file not found : '+ cache_file)
      file_dir = os.path.join(opts.data_root, fea_smps)
      file_list = get_file_list(file_dir)
    
      print('Extracting '+flag+' features...')
      fea = self.feature_extract(file_list)
      print('Extracting '+flag+' features completed.')

      if enable_cache:
        print('Saving ' + flag + ' features...')
        if not os.path.exists(self.cache_root):
          os.makedirs(self.cache_root)
        with open(cache_file, 'wb') as f:
          pickle.dump(fea, f)
        print('Saving ' + flag + ' features completed.')
    return fea
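
batch_features_extract in Example #19 follows a common compute-or-load pattern: derive a cache file name from the request, unpickle it if present, otherwise compute the features and pickle them. A generic sketch of that pattern on its own (the function and its name are illustrative, not from the project):

import os
import pickle


def cached(cache_path, compute):
    # Return the value stored at cache_path, computing and pickling it on a cache miss.
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    value = compute()
    os.makedirs(os.path.dirname(cache_path) or '.', exist_ok=True)
    with open(cache_path, 'wb') as f:
        pickle.dump(value, f)
    return value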
Example #20
    def _prepare(self):
        
        file_list = get_file_list(self.data_path, regx='.tif')
        self.im_file_list = [os.path.join(self.data_path, im_file) for im_file in file_list]
        interval          = 50

        self._sample_and_get_statistics(self.im_file_list, interval, low=0.2, high=95)
Example #21
def resample_directory(input_path, output_path, sample_rate):
    for filepath, filename in get_file_list(input_path):
        print('Reading: ', filename)
        df = load_dataset(filepath)
        print('Resampling:', filename)
        df = resample_dataset(df, sample_rate)
        df.to_csv(os.path.join(output_path, filename))
Example #22
def train_preprocessed_img_lazy_batch(train_folder, max_epochs, config, name=None, debug=True):
    fnames, bboxes = get_file_list(train_folder)
    y = np.array(list(bboxes), dtype=np.float32)
    assert not (np.any(np.isnan(y)) or np.any(np.isinf(y))), \
        "Invalid `y` detected"
    X = np.array(list(fnames))
    train(X, y, config, max_epochs, 'LazyBatchIterator', name, debug)
Example #23
def make_chunks(num_links, min_chunk_lines=MIN_CHUNK_LINES):
    text_fns = utils.get_file_list(utils.TEXTS_DIR, num_links)
    max_chunks = min(utils.CHUNKS_FOR_SOURCE, len(text_fns))
    texts_processed = 0
    for text_idx, text_fn in enumerate(text_fns[:utils.CHUNKS_FOR_SOURCE],
                                       start=1):
        chunk_fn = text_fn.replace(utils.TEXTS_DIR, utils.CHUNKS_DIR)
        assert chunk_fn != text_fn, 'ERROR: invalid path to text file'
        if not os.path.isfile(chunk_fn):
            with open(text_fn, 'rt', encoding='utf-8') as f_in, \
                 open(chunk_fn, 'wt', encoding='utf-8') as f_out:
                f_in.readline()
                text = f_in.read()
                text = re.sub(r'[\u2800\uFE00-\uFE0F]', '', text)
                lines = ('\t'.join(x.strip() for x in x.split('\t'))
                             for x in text.split('\n')
                             if re.search(r'\w', x)
                            and not all(x.startswith('#') for x in x.split()))
                #lines = (x.strip() for x in text.split('\n')
                #                   if re.search('\w', x)
                #                  and not all(x.startswith('#')
                #                                  for x in x.split()))
                f_out.write('\n'.join(x for x in lines if x))
                print('\r{} (of {})'.format(text_idx, max_chunks),
                      end='')
                texts_processed += 1
    if texts_processed:
        print()
Example #24
def get_file_list_route():
    app.logger.debug(request.form)
    ip = dict(request.form)['ip'][0]
    dir_name = dict(request.form)['dir'][0]
    port = get_port(conf, ip)
    file_list = get_file_list(conf, ip, dir_name, port)
    return " ".join(file_list)
Example #25
def get_annotation_map(data_directory):
    """
    Collect annotation file names for the concerned objects.

    :param data_directory: root directory with one subdirectory per object WNID.
    :type data_directory: str
    :return: dict mapping each concerned object WNID to its annotation base names.
    :rtype: dict
    """

    new_obj_map = {k.strip(): v.strip() for k, v in OBJECT_WNID_MAP.items()}

    concerned_objs = [new_obj_map[k] for k in new_obj_map if k in objects]

    result = {}
    obj_dir_list = get_immediate_subdirectories(data_directory)
    for obj_dir in obj_dir_list:
        obj = os.path.basename(obj_dir)
        if obj not in concerned_objs:
            continue
        result[obj] = []
        anno_dir = os.path.join(obj_dir, 'Annotation')
        for anno in get_file_list(anno_dir, format=".xml"):
            anno_name = os.path.splitext(os.path.basename(anno))[0]
            result[obj].append(anno_name)
    return result
Example #26
    def get_result(self, audio_path):
        def softmax(x):
            e_x = np.exp(x - np.max(x))
            return e_x / e_x.sum(axis=0)

        print('Generate side channel...')
        y_s = utils.get_side(audio_path)
        print('Rendering spectrum...')
        utils.get_spectrum(y_s, 0, 'temp', max=20)
        spectrum_list = utils.get_file_list('temp')
        print('Valid samples...')
        fin = np.zeros(4)
        for i_idx in range(len(spectrum_list)):
            norm_img = self.img_preprocess(spectrum_list[i_idx])
            result = self.session.run([], {self.model_input: norm_img})[0][0]
            result = softmax(result)
            fin[np.argmax(result)] += 1
            print(
                f'Sample {i_idx+1} -> {self.r_map[np.argmax(result)]}, Prob:{np.max(result)*100:.3f}%'
            )
        if fin[0] != len(spectrum_list):
            fin[0] = 0
            print(f'Final result: {self.r_map[np.argmax(fin)]}')
        else:
            print('Final result: Lossless')
Example #27
def enrich_directory(sample_path, rich_path):
    for filepath, filename in get_file_list(sample_path):
        print('Reading:', filename)
        df = load_sampleset(filepath)
        print('Enriching:', filename)
        df = enrich_sampleset(df)
        print('Saving:', filename)
        df.to_csv(os.path.join(rich_path, filename))
Example #28
def disconnect(source_commit, ref_commit):
    """Creates a commit that disconnects files from source_commit.

    All files existing in ref_commit will be removed from source_commit.

    Args:
        source_commit: commit hash to disconnect from.
        ref_commit: commit hash to be a file list reference.
    """
    source_files = utils.get_file_list(source_commit)
    ref_files = utils.get_file_list(ref_commit)
    ref_files_set = set(ref.path for ref in ref_files)
    kept_files = [ref for ref in source_files if ref.path not in ref_files_set]
    tree = utils.git_mktree(kept_files)
    return utils.git_commit(tree, [source_commit],
                            message=b'Disconnect history from %s' %
                            (source_commit.encode('ascii')))
Example #29
def main():
    # Init args
    parser = argparse.ArgumentParser(
        description='Copy file from given commits')
    parser.add_argument('commit_hash',
                        metavar='commit',
                        type=str,
                        nargs=1,
                        help='commit hash to copy files from')
    parser.add_argument('--dry_run',
                        dest='dry_run',
                        action='store_const',
                        const=True,
                        default=False)
    arg = parser.parse_args(sys.argv[1:])

    # Read file list from HEAD and upstream commit.
    upstream_files = utils.get_file_list(arg.commit_hash[0])
    our_files = utils.get_file_list('HEAD')

    # Calculate target file list
    target_files = filters.filter_file(our_files, upstream_files)

    # Calculate operations needed
    ops = utils.gen_op(our_files, target_files)

    if arg.dry_run:
        # Print ops only on dry-run mode.
        print('\n'.join(repr(x) for x in ops))
        return
    for op, f in ops:
        # Ignore if op is REP because we only want to copy missing files, not to
        # revert custom Chromium OS libchrome patch.
        assert type(op) == utils.DiffOperations
        if op == utils.DiffOperations.DEL:
            subprocess.check_call(['git', 'rm', f.path])
        elif op == utils.DiffOperations.ADD:
            # Create directory recursively if not exist.
            os.makedirs(os.path.dirname(f.path), exist_ok=True)
            # Read file by git cat-file with blob object id to avoid heavy git checkout.
            with open(f.path, 'wb') as outfile:
                subprocess.check_call(['git', 'cat-file', 'blob', f.id],
                                      stdout=outfile)
            # Add to git index
            subprocess.check_call(['git', 'add', f.path])
Example #30
    def _refine_database(self):
        # load cached data if it existed.
        cache_file = os.path.join(self._cache_dir,
                                  self._db_name + '_gt_db.pkl')
        if os.path.exists(cache_file):
            with open(cache_file, 'rb') as fid:
                refinedb = pickle.load(fid)
            print('{} database loaded from {}'.format(self._db_name,
                                                      cache_file))
            return refinedb

        # generate db used for refining results
        # the file list is generated from the predicted xml files
        file_name_itr = get_file_list(self._data_dir, file_suffix='.xml')
        refine_boxes = []
        # refine_boxes_ext = []
        refine_class = []
        refine_img = []
        img_path_list = []
        for file_name in file_name_itr:
            print(file_name)
            annt_file = os.path.join(self._data_dir, file_name + '.xml')
            gt_annt_file = os.path.join(self._gt_dir, file_name + '.xml')
            img_file = os.path.join(self._img_dir, file_name + self._suffix)
            img = Image.open(img_file)
            try:
                # boxes, boxes_ext, classes, patches = self._get_box_info(annt_file, gt_annt_file, img)
                boxes, classes, patches = self._get_box_info(
                    annt_file, gt_annt_file, img)
            except ValueError as e:
                print(e)
                print(annt_file)
                print(gt_annt_file)
                print(img_file)
                classes = []

            if len(classes) == 0:
                continue
            img_path_list.extend(
                [os.path.join(self._img_dir, file_name + self._suffix)] *
                len(boxes))
            refine_boxes.extend(boxes)
            # refine_boxes_ext.extend(boxes_ext)
            refine_class.extend(classes)
            refine_img.extend(patches)

        # db_info = zip(refine_boxes, refine_boxes_ext, refine_class, img_path_list, refine_img)
        # refinedb = [dict(zip(('box', 'box_ext', 'class', 'path', 'image'), item_info)) for item_info in db_info]
        db_info = zip(refine_boxes, refine_class, img_path_list, refine_img)
        refinedb = [
            dict(zip(('box', 'class', 'path', 'image'), item_info))
            for item_info in db_info
        ]
        with open(cache_file, 'wb') as fid:
            pickle.dump(refinedb, fid, pickle.HIGHEST_PROTOCOL)
        print('wrote {} database to {}'.format(self._db_name, cache_file))
        return refinedb
Example #31
def train_model(unique_labels, file_dir, num_samples, le_file_path, num_epochs, model_file_path):
    model = create_model(unique_labels)
    file_list = utils.get_file_list(file_dir, samples=num_samples, training=True)
    logger.info('Beginning training model')
    images, labels = utils.read_images(file_list, le_file_path, training=True)
    model.fit(images, labels, epochs=num_epochs)
    logger.info('Finished fitting model')
    logger.info(f'Saving model to {model_file_path}')
    model.save(model_file_path)
Example #32
    def get_ins_photo_list(self):
        """Get all the files in the instagram folder. """
        photo_fn_list = get_file_list(self.ins_folder)
        # print(self.home_folder+self.ins_folder)
        # print(len(photo_list), photo_list[:10])
        if len(photo_fn_list) == 0:
            logging.error("The Ins folder is empty.")

        return photo_fn_list
Example #33
def verify_commit(original_commit, new_tree):
    """Verifies if new_tree is exactly original_commit after filters.

    Args:
        original_commit: commit hash in Chromium browser tree.
        new_tree: tree hash created for upstream branch commit.
    """
    expected_file_list = filters.filter_file(
        [], utils.get_file_list(original_commit))
    assert utils.git_mktree(expected_file_list) == new_tree
Example #34
    def _patch_database(self):
        # load cached data if it existed.
        cache_file = os.path.join(self._cache_dir,
                                  self._db_name + '_gt_db.pkl')
        if os.path.exists(cache_file):
            with open(cache_file, 'rb') as fid:
                patchdb = pickle.load(fid)
            print('{} database loaded from {}'.format(self._db_name,
                                                      cache_file))
            return patchdb

        # the file list is generated from the predicted xml files
        file_name_itr = get_file_list(self._gt_dir, file_suffix=self._suffix)
        db_boxes = []
        db_class = []
        patch_img = []
        img_path_list = []
        for file_name in file_name_itr:
            # filter the required files
            if not (self._db_name in file_name):
                continue
            print('processing ' + file_name)
            annt_file = os.path.join(self._gt_dir, file_name + '.xml')
            img_file = os.path.join(self._img_dir, file_name + self._suffix)
            try:
                img = Image.open(img_file)
                boxes, classes, patches = self._get_box_info(annt_file, img)
            except ValueError as e:
                print(e)
                classes = []
            except OSError as e:
                print(e)
                classes = []

            # continue if the annt file is empty
            if len(classes) == 0:
                continue

            # add box to db
            img_path_list.extend(
                [os.path.join(self._img_dir, file_name + self._suffix)] *
                len(classes))
            db_boxes.extend(boxes)
            db_class.extend(classes)
            patch_img.extend(patches)

        db_info = zip(db_boxes, db_class, img_path_list, patch_img)
        db = [
            dict(zip(('box', 'class', 'path', 'image'), item_info))
            for item_info in db_info
        ]
        with open(cache_file, 'wb') as fid:
            pickle.dump(db, fid, pickle.HIGHEST_PROTOCOL)
        print('wrote {} database to {}'.format(self._db_name, cache_file))
        return db
Example #35
    def check_command_file_diff(self, install_job, message):
        file_suffix = '.diff.html'
        file_list = get_file_list(os.path.join(get_log_directory(), install_job.session_log))
        diff_file_list = [file for file in file_list if file_suffix in file]

        if len(diff_file_list) > 0:
            message += 'The following command outputs have changed between different installation phases<br><br>'
            for file in diff_file_list:
                message += file.replace(file_suffix, '') + '<br>'
            message += '<br>'

        return message
Example #36
File: log.py Project: smjurcak/csm
def host_session_log(hostname, table, id):
    """
    This route is also used by mailer.py for email notification.
    """
    db_session = DBSession()

    record = None
    doc_central_log_file_path = ''

    if table == 'install_job':
        record = db_session.query(InstallJob).filter(InstallJob.id == id).first()
    elif table == 'install_job_history':
        record = db_session.query(InstallJobHistory).filter(InstallJobHistory.id == id).first()

        doc_central_log_file_path = get_doc_central_log_path(record)
    elif table == 'inventory_job_history':
        record = db_session.query(InventoryJobHistory).filter(InventoryJobHistory.id == id).first()

    if record is None:
        abort(404)

    file_path = request.args.get('file_path')
    log_file_path = get_log_directory() + file_path

    if not(os.path.isdir(log_file_path) or os.path.isfile(log_file_path)):
        abort(404)

    file_pairs = {}
    log_file_contents = ''

    file_suffix = '.diff.html'
    if os.path.isdir(log_file_path):
        # Returns all files under the requested directory
        log_file_list = get_file_list(log_file_path)
        diff_file_list = [filename for filename in log_file_list if file_suffix in filename]

        for filename in log_file_list:
            diff_file_path = ''
            if file_suffix not in filename:
                if filename + file_suffix in diff_file_list:
                    diff_file_path = os.path.join(file_path, filename + file_suffix)
                file_pairs[os.path.join(file_path, filename)] = diff_file_path

        file_pairs = collections.OrderedDict(sorted(file_pairs.items()))
    else:
        with io.open(log_file_path, "rt", encoding='latin-1') as fo:
            log_file_contents = fo.read()

    return render_template('host/session_log.html', hostname=hostname, table=table,
                           record_id=id, file_pairs=file_pairs, log_file_contents=log_file_contents,
                           is_file=os.path.isfile(log_file_path),
                           doc_central_log_file_path=doc_central_log_file_path)
Example #37
def main(root_folder, batch_size=256, train_split=0.2):
    fnames, bboxes = get_file_list(root_folder)
    fnames = np.asarray(list(fnames))
    bboxes = np.asarray(list(bboxes), dtype=np.float32)
    num_samples = fnames.shape[0]
    num_val = int(round(num_samples * train_split))
    # Perform train validation split
    idx = np.arange(num_samples)
    rng = np.random.RandomState(seed=12345)
    rng.shuffle(idx)
    train_fnames = fnames[idx[num_val:]]
    train_bboxes = bboxes[idx[num_val:]]
    val_fnames = fnames[idx[:num_val]]
    val_bboxes = bboxes[idx[:num_val]]
    print "%d training samples and %d validation samples" % (train_fnames.shape[0], val_fnames.shape[0])
    # Create (key, value) pairs for storing in db
    X_t = []
    y_t = []
    for i in xrange(len(train_fnames)):
        X_t.append(('%08d' % i, train_fnames[i]))
        y_t.append(('%08d' % i, train_bboxes[i]))
    X_v = []
    y_v = []
    for i in xrange(len(val_fnames)):
        X_v.append(('%08d' % i, val_fnames[i]))
        y_v.append(('%08d' % i, val_bboxes[i]))

    # Training set
    train_image_db = lmdb.open('train_image', map_size=1e+12)
    train_label_db = lmdb.open('train_label', map_size=1e+12)

    prev_j = 0
    for j in xrange(batch_size, len(X_t), batch_size):
        print "Starting train batch #%d processing" % (prev_j / batch_size)
        process_batch(train_image_db, train_label_db,
                      X_t[prev_j:j], y_t[prev_j:j])
        prev_j = j

    train_image_db.close()
    train_label_db.close()
    # Validation set
    val_image_db = lmdb.open('val_image', map_size=1e+12)
    val_label_db = lmdb.open('val_label', map_size=1e+12)

    prev_j = 0
    for j in xrange(batch_size, len(X_v), batch_size):
        print "Starting val batch #%d processing" % (prev_j / batch_size)
        process_batch(val_image_db, val_label_db, X_v[prev_j:j], y_v[prev_j:j])
        prev_j = j

    val_image_db.close()
    val_label_db.close()
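
Example #37 hands each slice of keys and values to a process_batch helper that is not shown. A minimal sketch of how such a helper could write the pairs into the two LMDB environments; raw bytes are stored here purely to keep the sketch self-contained, whereas the original project likely serializes Caffe Datum records:

def process_batch(image_db, label_db, image_items, label_items):
    # image_db / label_db are the lmdb environments opened in Example #37.
    # Write one batch of (key, value) pairs into each environment.
    with image_db.begin(write=True) as txn:
        for key, fname in image_items:
            txn.put(key.encode('ascii'), fname.encode('utf-8'))
    with label_db.begin(write=True) as txn:
        for key, bbox in label_items:
            txn.put(key.encode('ascii'), bbox.tobytes())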
Example #38
def get_sp_files_from_csm_repository():
    rows = []
    file_list = get_file_list(get_repository_directory())

    for filename in file_list:
        if '.pie' in filename:
            statinfo = os.stat(get_repository_directory() + filename)
            row = {}
            row['image_name'] = filename
            row['image_size'] = '{} bytes'.format(statinfo.st_size)
            rows.append(row)

    return jsonify(**{'data': rows})
Example #39
def get_full_software_tar_files_from_csm_repository():
    rows = []
    file_list = get_file_list(get_repository_directory())

    for filename in file_list:
        if '-iosxr-' in filename and filename.endswith('.tar'):
            statinfo = os.stat(get_repository_directory() + filename)
            row = {}
            row['image_name'] = filename
            row['image_size'] = '{} bytes'.format(statinfo.st_size)
            rows.append(row)

    return jsonify(**{'data': rows})
Example #40
def main():
    """
    Enter the absolute paths of the directories containing your
    Movies/TV Shows, separated by spaces.
    """
    # creates the project dir if it doesn't exist
    utils.create_project_directory(project_path)
    directories = sys.argv[1:]
    print('dir entered:', directories)
    # if len(directories) == 0:
    #     print("Please mention the directories")
    #     return

    movie_paths = dataset_db['movie_paths']
    movie_data = dataset_db['movie_data']

    for directory in directories:
        # Ensure that the directory path begins with '/',
        # but doesn't end with one, and is NOT root
        path = '/{}'.format(directory.strip('/'))
        if path == '/':
            continue
        if os.path.exists(path):
            movie_paths.upsert(dict(directory=path), ['directory'])

    file_list = utils.get_file_list()
    if file_list is None:
        print('No video files found')
        return

    failed_list = []
    movie_data = dataset_db['movie_data']
    for i, filename in enumerate(file_list):
        try:
            data_exists = movie_data.find_one(Filename=filename)
            if data_exists:
                print('Already indexed:', filename)
                continue
        except:
            pass

        print(i, filename)
        temp = utils.fetch_movie_details(filename)
        if temp is False:
            failed_list.append(filename)
            continue

    print('Failed for:', failed_list)
Example #41
def inputs(hypes, _, phase, data_dir):
    """
    Get data.

    Parameters
    ----------
    hypes : dict
    _ : ignore this
    phase : {'train', 'val'}
    data_dir : str

    Returns
    -------
    tuple
        (xs, ys), where xs and ys are lists of the same length.
        xs are paths to the input images and ys are paths to the expected
        output
    """
    x_files, y_files = get_file_list(hypes, 'train')
    x_files, y_files = sklearn.utils.shuffle(x_files,
                                             y_files,
                                             random_state=0)
    # x_files = x_files[:40]  # reducing data
    # y_files = y_files[:40]  # reducing data

    xs, ys = [], []
    i = 0
    for x, y in zip(x_files, y_files):
        logging.info("Read '%s' for data...", x)
        image = get_image(x, 'RGB')
        # from scipy.ndimage.filters import gaussian_filter
        # image = gaussian_filter(image, sigma=10)
        label = get_image(y, 'L')
        label = normalize_labels(label)
        im = Image.open(x, 'r')
        width, height = im.size
        for x in range(width):
            for y in range(height):
                image_val = get_features(x, y, image, hypes['model_nr'])
                label_val = (label[y][x][0] == 0)  # only 0 is background

                xs.append(image_val)
                ys.append(label_val)
        i += 1
        if i == 10:  # TODO: For testing
            break
    return xs, numpy.array(ys, dtype=int)
Example #42
File: cco.py Project: smjurcak/csm
def get_smu_or_sp_list(hostname, hide_installed_packages, smu_info_list, file_suffix):
    """
    Return the SMU/SP list.  If hostname is given, compare its active packages.
    """
    file_list = get_file_list(get_repository_directory(), '.' + file_suffix)

    host_packages = [] if hostname is None else get_host_active_packages(hostname)

    rows = []
    for smu_info in smu_info_list:

        # Verify if the package has already been installed.
        installed = False
        for host_package in host_packages:
            if smu_info.name in host_package:
                installed = True
                break

        include = False if (hide_installed_packages == 'true' and installed) else True
        if include:
            row = dict()
            row['ST'] = 'True' if smu_info.name + '.' + file_suffix in file_list else 'False'
            row['package_name'] = smu_info.name + '.' + file_suffix
            row['posted_date'] = smu_info.posted_date.split()[0]
            row['ddts'] = smu_info.ddts
            row['ddts_url'] = BUG_SEARCH_URL + smu_info.ddts
            row['type'] = smu_info.type
            row['description'] = smu_info.description
            row['impact'] = smu_info.impact
            row['functional_areas'] = smu_info.functional_areas
            row['id'] = smu_info.id
            row['name'] = smu_info.name
            row['status'] = smu_info.status
            row['package_bundles'] = smu_info.package_bundles
            row['compressed_image_size'] = smu_info.compressed_image_size
            row['uncompressed_image_size'] = smu_info.uncompressed_image_size
            row['is_installed'] = installed

            if not is_empty(hostname) and SMU_INDICATOR in smu_info.name:
                row['is_applicable'] = is_smu_applicable(host_packages, smu_info.package_bundles)
            else:
                row['is_applicable'] = True

            rows.append(row)

    return jsonify(**{'data': rows})
Example #43
File: cco.py Project: smjurcak/csm
def api_get_tar_list(platform, release):
    smu_loader = SMUInfoLoader(platform, release, from_cco=False)

    if not smu_loader.is_valid:
        return jsonify(**{'data': []})
    else:
        file_list = get_file_list(get_repository_directory(), '.tar')
        tars_list = smu_loader.get_tar_list()
        rows = []
        for tar_info in tars_list:
            row = dict()
            row['ST'] = 'True' if tar_info.name in file_list else 'False'
            row['name'] = tar_info.name
            row['compressed_size'] = tar_info.compressed_image_size
            row['description'] = ""
            rows.append(row)

    return jsonify(**{'data': rows})
Example #44
def generate_batch(hypes, phase):
    """
    Generate patches.

    Parameters
    ----------
    hypes : dict
    phase : 'train' or 'test'
    """
    x_files, y_files = get_file_list(hypes, phase)
    x_files, y_files = sklearn.utils.shuffle(x_files,
                                             y_files,
                                             random_state=0)
    batch_x, batch_y = [], []
    while True:
        for x, y in zip(x_files, y_files):
            logging.info("Read '%s' for data...", x)
            image = get_image(x, 'RGB')
            label = load_segmentation_mask(hypes, y)
            im = Image.open(x, 'r')
            width, height = im.size
            image_vals = get_features(hypes, image, 'data')
            label_vals = get_features(hypes, label, 'label')
            # print("image_vals = %s" % str(list(image_vals)))
            for patch, label_ in zip(image_vals, label_vals):
                patch = img_to_array(patch)
                label_ = img_to_array(label_)
                _, w, h = label_.shape
                label_ = label_.reshape((w, h))
                if phase == 'val' and 1.0 not in label_:
                    # skip validation patches that contain no positive pixel
                    continue
                # scipy.misc.imshow(patch)
                # scipy.misc.imshow(label_)
                batch_x.append(patch)
                batch_y.append(label_)  # .flatten()
                if len(batch_x) == hypes['solver']['batch_size']:
                    yield (np.array(batch_x), np.array(batch_y))
                    batch_x, batch_y = [], []
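
generate_batch loops forever and yields (batch_x, batch_y) pairs sized by hypes['solver']['batch_size'], which is the contract Keras' fit_generator expects (see Example #48 further down). It can also be sampled by hand; a minimal sketch, assuming hypes is the loaded hyperparameter dict used throughout these examples:

gen = generate_batch(hypes, 'train')
batch_x, batch_y = next(gen)            # one batch of image patches and label patches
print(batch_x.shape, batch_y.shape)
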
def inputs(hypes, _, phase, data_dir):
    """
    Get data.

    Parameters
    ----------
    hypes : dict
    _ : ignore this
    phase : {'train', 'val'}
    data_dir : str

    Returns
    -------
    tuple
        (xs, ys), where xs and ys are lists of the same length.
        xs are paths to the input images and ys are paths to the expected
        output
    """
    x_files, y_files = get_file_list(hypes, phase)
    x_files, y_files = sklearn.utils.shuffle(x_files,
                                             y_files,
                                             random_state=0)

    xs, ys = [], []
    for x, y in zip(x_files, y_files):
        logging.info("Read '%s' for data...", x)
        image = get_image(x, 'RGB')
        label = load_segmentation_mask(hypes, y)
        im = Image.open(x, 'r')
        width, height = im.size
        # use distinct loop names so the file paths x, y above are not shadowed
        for col in range(width):
            for row in range(height):
                image_val = get_features(col, row, image, hypes['model_nr'])
                label_val = label[row][col]

                xs.append(image_val)
                ys.append(label_val)
    return xs, np.array(ys, dtype=int)
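
Note that inputs expands every pixel of every training image into its own sample, so xs grows by width * height entries per image; a single 640 x 480 image, for example, already contributes 640 * 480 = 307,200 feature vectors. For anything beyond small datasets, the generator above is the more practical path.
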
def main(hypes_file, data_dir, override):
    """Orchestrate."""
    with open(hypes_file, 'r') as f:
        hypes = json.load(f)
    if 'training' not in hypes:
        hypes['training'] = {}
    if 'make_equal' not in hypes['training']:
        hypes['training']['make_equal'] = False

    base = os.path.dirname(hypes_file)
    model_file_path = os.path.join(base, '%s.yaml' % hypes['model']['name'])
    model_file_path = os.path.abspath(model_file_path)
    weights_file_path = os.path.join(base, '%s.hdf5' % hypes['model']['name'])
    weights_file_path = os.path.abspath(weights_file_path)

    if not os.path.isfile(model_file_path) or override:
        if not os.path.isfile(model_file_path):
            logging.info("Did not find '%s'. Start training...",
                         model_file_path)
        else:
            logging.info("Override '%s'. Start training...",
                         model_file_path)

        # Get data
        # x_files, y_files = inputs(hypes, None, 'train', data_dir)
        x_files, y_files = get_file_list(hypes, 'train')
        x_files, y_files = sklearn.utils.shuffle(x_files,
                                                 y_files,
                                                 random_state=0)

        x_train, y_train = get_traindata_single_file(hypes,
                                                     x_files[0],
                                                     y_files[0])

        nb_features = x_train[0].shape[0]
        logging.info("Input gets %i features", nb_features)

        # Make model
        model = Sequential()
        model.add(Dense(64,
                  input_dim=nb_features,
                  init='uniform',
                  activation='sigmoid'))
        model.add(Dropout(0.5))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer='adagrad',  # rmsprop
                      metrics=['accuracy'])

        generator = generate_training_data(hypes, x_files, y_files)
        t0 = time.time()
        sep = hypes['solver']['samples_per_epoch']
        if True:  # fit via the generator with class weights; the else branch is kept for reference only
            class_weight = get_class_weight(hypes)
            logging.info("class_weights = %s", class_weight)
            model.fit_generator(generator,
                                samples_per_epoch=sep,
                                nb_epoch=hypes['solver']['epochs'],
                                verbose=1,
                                validation_data=(x_train, y_train),
                                class_weight=class_weight)
        else:
            logging.info("Fit with .fit")
            x_train, y_train = inputs(hypes, None, 'train', data_dir)
            model.fit(x_train, y_train, batch_size=128, nb_epoch=1)
        t1 = time.time()
        print("Training Time: %0.4f" % (t1 - t0))

        # save the architecture as YAML and the weights as HDF5
        yaml_string = model.to_yaml()
        with open(model_file_path, 'w') as f:
            f.write(yaml_string)
        model.save_weights(weights_file_path)

        # Evaluate
        data = get_file_list(hypes, 'test')
        logging.info("Start segmentation")
        analyze.evaluate(hypes,
                         data,
                         data_dir,
                         model,
                         elements=[0, 1],
                         get_segmentation=get_segmentation)
    else:
        logging.info("## Found '%s'.", model_file_path)
        with open(model_file_path) as f:
            yaml_string = f.read()
        model = model_from_yaml(yaml_string)
        model.load_weights(weights_file_path)
        model.compile(optimizer='adagrad', loss='binary_crossentropy')
        data = get_file_list(hypes, 'test')
        analyze.evaluate(hypes,
                         data,
                         data_dir,
                         model,
                         elements=[0, 1],
                         get_segmentation=get_segmentation)
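
get_class_weight is called above but not shown in this example. A minimal stand-in, assuming the weights live in the hyperparameter file (the key name is hypothetical), could look like:

def get_class_weight(hypes):
    # Hypothetical stand-in for the helper used above: read per-class weights
    # from the hypes dict, falling back to uniform weights.
    weights = hypes.get('training', {}).get('class_weights')
    if weights is None:
        return {i: 1.0 for i in range(len(hypes['classes']))}
    return {i: w for i, w in enumerate(weights)}
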
def main(hypes_file, data_dir, override):
    """Orchestrate."""
    with open(hypes_file, 'r') as f:
        hypes = json.load(f)

    model_file_path = os.path.abspath('%s.pkl' % hypes['model']['name'])

    color_changes = {0: (0, 0, 0, 0),
                     1: (0, 255, 0, 127),
                     'default': (0, 0, 0, 0)}

    if not os.path.isfile(model_file_path) or override:
        if not os.path.isfile(model_file_path):
            logging.info("Did not find '%s'. Start training...",
                         model_file_path)
        else:
            logging.info("Override '%s'. Start training...",
                         model_file_path)

        # Get data
        # x_files, y_files = inputs(hypes, None, 'train', data_dir)
        x_files, y_files = get_file_list(hypes, 'train')
        x_files, y_files = sklearn.utils.shuffle(x_files,
                                                 y_files,
                                                 random_state=0)

        x_train, y_train = get_traindata_single_file(hypes,
                                                     x_files[0],
                                                     y_files[0])

        nb_features = x_train[0].shape[0]
        logging.info("Input gets %i features", nb_features)

        # Make model
        from sklearn.svm import LinearSVC, SVC
        from sklearn.tree import DecisionTreeClassifier
        # Each assignment below overwrites the previous one, so only the
        # DecisionTreeClassifier is actually trained.
        model = SVC(probability=False,  # cache_size=200,
                    kernel="linear", C=2.8, gamma=.0073)
        model = LinearSVC(C=2.8)
        model = DecisionTreeClassifier()

        print("Start fitting. This may take a while")

        generator = generate_training_data(hypes, x_files, y_files)
        t0 = time.time()

        if False:  # disabled: scikit-learn estimators do not provide fit_generator
            sep = hypes['solver']['samples_per_epoch']
            model.fit_generator(generator,
                                samples_per_epoch=sep,
                                nb_epoch=hypes['solver']['epochs'],
                                verbose=1,
                                # callbacks=[callb],
                                validation_data=(x_train, y_train))
        else:
            logging.info("Fit with .fit")
            x_train, y_train = inputs(hypes, None, 'train', data_dir)
            print(len(y_train))
            model.fit(x_train, y_train)
        t1 = time.time()
        print("Training Time: %0.4f" % (t1 - t0))

        # persist the fitted model with joblib
        joblib.dump(model, model_file_path)

        # Evaluate
        data = get_file_list(hypes, 'test')
        logging.info("Start segmentation")
        analyze.evaluate(hypes,
                         data,
                         data_dir,
                         model,
                         elements=[0, 1],
                         load_label_seg=load_label_seg,
                         color_changes=color_changes,
                         get_segmentation=get_segmentation)
    else:
        model = joblib.load(model_file_path)
        data = get_file_list(hypes, 'test')
        analyze.evaluate(hypes,
                         data,
                         data_dir,
                         model,
                         elements=[0, 1],
                         load_label_seg=load_label_seg,
                         color_changes=color_changes,
                         get_segmentation=get_segmentation)
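
Persistence in this variant is a plain joblib round trip: joblib.dump on the fitted estimator and joblib.load in the else branch. The same pattern works in isolation, for example:

import joblib                                    # older projects import this via sklearn.externals
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier().fit([[0], [1]], [0, 1])
joblib.dump(clf, 'model.pkl')                    # same call as above
restored = joblib.load('model.pkl')
print(restored.predict([[0.2]]))                 # -> [0]
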
Example #48
0
def main(hypes_file, out_dir, override):
    """Orchestrate."""
    with open(hypes_file, 'r') as f:
        hypes = json.load(f)

    model_file_path = '%s.yaml' % hypes['model']['name']
    weights_file_path = '%s.hdf5' % hypes['model']['name']

    if not os.path.isfile(model_file_path) or override:
        patch_size = hypes['arch']['patch_size']
        img_channels = hypes['arch']['num_channels']
        nb_out = hypes['arch']['stride']**len(hypes['classes'])

        model = Sequential()
        model.add(Convolution2D(64, 3, 3, border_mode='valid',
                                init='glorot_normal',
                                activation='sigmoid',
                                input_shape=(img_channels,
                                             patch_size,
                                             patch_size)))
        model.add(Convolution2D(32, 3, 3,
                                activation='relu',
                                init='glorot_normal'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.5))

        # model.add(Convolution2D(64, 3, 3, border_mode='same'))
        # model.add(Activation('relu'))
        # model.add(Convolution2D(64, 3, 3))
        # model.add(Activation('relu'))
        # model.add(MaxPooling2D(pool_size=(2, 2)))
        # model.add(Dropout(0.25))

        model.add(Flatten())
        # model.add(Dense(64, activation='sigmoid'))
        # # model.add(Dropout(0.5))
        # model.add(Dense(64, activation='relu'))
        # model.add(Dropout(0.5))
        model.add(Dense(nb_out,
                        activation='sigmoid',
                        init='glorot_normal'))
        model.add(Reshape((hypes['arch']['stride'], hypes['arch']['stride'])))

        # sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
        opt = keras.optimizers.Adadelta(lr=hypes['solver']['learning_rate'],
                                        rho=0.95,
                                        epsilon=1e-08)
        model.compile(loss=hypes['solver']['loss'],
                      optimizer=opt)  # hypes['solver']['optimizer']
        logging.info("model compiled")

        # while 1:
        #     b = generate_batch(hypes, 'train')

        # for e in range(10):
        #     print 'Epoch', e
        #     batches = 0
        #     for X_batch, Y_batch in generate_batch(hypes, 'train'):
        #         Y_batch = np.reshape(Y_batch, (-1, 400))
        #         loss = model.fit(X_batch,
        #                          Y_batch,
        #                          batch_size=hypes['solver']['batch_size'])
        #         print(loss)
        #         batches += 1
        #         if e > 2:
        #             # we need to break the loop by hand because
        #             # the generator loops indefinitely
        #             break

        # # Train
        g = generate_batch(hypes, 'train')
        logging.info("generate_batch")
        X_test, Y_test = next(g)  # next() works on both Python 2 and 3
        # print("#" * 80)
        # print(X_test.shape)
        # print(Y_test.shape)
        logging.info("start fit_generator")
        model.fit_generator(generate_batch(hypes, 'train'),
                            samples_per_epoch=hypes['solver']['samples_per_epoch'],
                            nb_epoch=hypes['solver']['epochs'],
                            verbose=1,
                            validation_data=(X_test, Y_test))
        x_files, y_files = get_file_list(hypes, 'train')
        x_files, y_files = sklearn.utils.shuffle(x_files,
                                                 y_files,
                                                 random_state=0)
        # ij = 0
        # for epoch in range(1, hypes['solver']['epochs'] + 1):
        #     print("#" * 80)
        #     print("# Epoch %i" % epoch)
        #     print("#" * 80)
        #     x_files, y_files = sklearn.utils.shuffle(x_files,
        #                                              y_files,
        #                                              random_state=epoch)
        #     for x_train_file, y_train_file in zip(x_files, y_files):
        #         x_train, y_train = get_traindata_single_file(hypes,
        #                                                      x_train_file,
        #                                                      y_train_file)
        #         # Reduce data
        #         # x_train, y_train = reduce_data_equal(x_train,
        #         #                                      y_train)

        #         t0 = time.time()
        #         model.fit(x_train, y_train,
        #                   batch_size=128,
        #                   nb_epoch=1,
        #                   )
        #         ij += 1
        #         print("%i of %i" %
        #               (ij, hypes['solver']['epochs'] * len(x_files)))
        #         t1 = time.time()
        #         print("Training Time: %0.4f" % (t1 - t0))
        print("done with fit_generator")
        # save the architecture as YAML and the weights as HDF5
        yaml_string = model.to_yaml()
        with open(model_file_path, 'w') as f:
            f.write(yaml_string)
        model.save_weights(weights_file_path)

        # Evaluate
        data = get_file_list(hypes, 'test')
        analyze.evaluate(hypes,
                         data,
                         out_dir,
                         model,
                         elements=[0, 1],
                         get_segmentation=get_segmentation,
                         verbose=True)
    else:
        with open(model_file_path) as f:
            yaml_string = f.read()
        model = model_from_yaml(yaml_string)
        model.load_weights(weights_file_path)
        model.compile(optimizer=hypes['solver']['optimizer'],
                      loss='binary_crossentropy')
        data = get_file_list(hypes, 'test')
        analyze.evaluate(hypes,
                         data,
                         out_dir,
                         model,
                         elements=[0, 1],
                         get_segmentation=get_segmentation,
                         verbose=True)
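
One detail worth noting in the model above: the dense head has stride**len(classes) outputs and is then reshaped to (stride, stride), which only lines up when there are exactly two classes. With stride = 20 and two classes, for example, nb_out = 20**2 = 400, matching the (-1, 400) reshape in the commented-out manual training loop. A tiny sanity check (the values 20 and 2 are illustrative):

stride, num_classes = 20, 2
nb_out = stride ** num_classes        # 400
assert nb_out == stride * stride      # holds only because num_classes == 2
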
def train(data_path, config):
    with tf.Graph().as_default(), tf.Session() as session:
        word_to_id_path = os.path.join(data_path, config.vocab_file)
        with open(word_to_id_path, "rb") as f:
            word_to_id = pickle.load(f)

        vocab_size = len(word_to_id)
        print("Vocab size: %d" % vocab_size)
        sys.stdout.flush()

        train_pattern = config.data_pattern.replace("{-type-}", "train") + ".part*"
        valid_pattern = config.data_pattern.replace("{-type-}", "valid") + ".part*"

        train_files = get_file_list(config, data_path, train_pattern, "train")
        valid_files = get_file_list(config, data_path, valid_pattern, "valid")

        if config.copy_temp:
            temp_dir = tempfile.mkdtemp()
            print("Copying data files to %s" % temp_dir)
            train_files = copy_temp_files(train_files, temp_dir)
            valid_files = copy_temp_files(valid_files, temp_dir)

        config.vocab_size = vocab_size

        train_batcher = PreBatched(train_files, config.batch_size, description="train") if config.use_prebatched \
            else QueuedSequenceBatcher(train_files, config.seq_length, config.batch_size, description="train",
                                       attns=config.attention)
        valid_batcher = PreBatched(valid_files, config.batch_size, description="valid") if config.use_prebatched \
            else QueuedSequenceBatcher(valid_files, config.seq_length, config.batch_size, description="valid",
                                       attns=config.attention)

        t0 = datetime.datetime.now()
        initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)

        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = create_model(config, True)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = create_model(config, False)

        summary_writer = tf.train.SummaryWriter(config.events_path, graph=session.graph)
        valid_perplexity = PerplexityHook(summary_writer, mvalid, valid_batcher)

        hooks = [
            SpeedHook(summary_writer, config.status_iterations, config.batch_size),
            LossHook(summary_writer, config.status_iterations),
            valid_perplexity,
            SaveModelHook(config.checkpoint_path, 1, config.__dict__, 5)
        ]
        t1 = datetime.datetime.now()
        print("Building models took: %s" % (t1 - t0))

        def load_func():
            if config.model_path is not None:
                load_model(session, config.model_path)
                print("Continuing training from model: %s" % config.model_path)
            if config.embedding_path is not None:
                load_variables(session, os.path.join(config.embedding_path, "embedding.tf"),
                               [m.embedding_variable])
                print("Loading embedding vectors from: %s" % config.embedding_path)

        trainer = Trainer(m.optimizer, config.epochs, hooks, m, m.train_op)
        trainer(train_batcher, m.loss, session, config.learning_rate, config.lr_decay, load_func)

        saver = tf.train.Saver(tf.trainable_variables())
        embedding_saver = tf.train.Saver([m.embedding_variable])
        print("Saving model...")
        out_path = save_model(saver, session, config.save_path, m.predict, config.__dict__)
        embedding_saver.save(session, os.path.join(out_path, "embedding.tf"))

        if config.copy_temp:
            shutil.rmtree(temp_dir)
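
The get_file_list helper used in this example has a different signature than in the others (config, data_path, pattern, name); judging from the call sites, it resolves a shard glob such as '...train.part*' under data_path. A hypothetical minimal version, not taken from the original project:

import glob
import os

def get_file_list(config, data_path, pattern, name):
    # config is accepted only to mirror the call signature above; the sketch
    # simply globs the shard pattern under data_path and reports what it found.
    files = sorted(glob.glob(os.path.join(data_path, pattern)))
    print("Found %d %s files" % (len(files), name))
    return files
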
def main(hypes_file, output_dir, override):
    """
    Orchestrate.

    Parameters
    ----------
    hypes_file : str
        Path to a JSON file
    output_dir : str
        Path where the output should be stored
    override : bool
        If True, then override the model if it exists.
    """
    # Load hyperparameters
    with open(hypes_file, 'r') as f:
        hypes = json.load(f)

    # Set serialization path
    base = os.path.dirname(hypes_file)
    model_file_path = os.path.join(base, '%s.pickle' % hypes['model']['name'])
    model_file_path = os.path.abspath(model_file_path)

    if not os.path.isfile(model_file_path) or override:
        if not os.path.isfile(model_file_path):
            logging.info("Did not find '%s'. Start training...",
                         model_file_path)
        else:
            logging.info("Override '%s'. Start training...",
                         model_file_path)

        # Get training data
        x_files, y_files = get_file_list(hypes, 'train')

        # "Train" "classifier" (it just counts the classes)
        model = {'positions': None, 'files': 0}

        for y_file in y_files:
            logging.info("Read '%s'...", y_file)
            mask = load_segmentation_mask(hypes, y_file)
            if model['positions'] is None:
                model['positions'] = mask
            else:
                model['positions'] += mask
            model['files'] += 1

        # visualize the accumulated positions, then save the model as a pickle file
        scipy.misc.imsave("instruments.png", model['positions'])
        with open(model_file_path, 'wb') as handle:
            pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        # load model from pickle file
        with open(model_file_path, 'rb') as handle:
            model = pickle.load(handle)
    # Evaluate
    data = get_file_list(hypes, 'test')
    analyze.evaluate(hypes,
                     data,
                     output_dir,
                     model,
                     elements=[0, 1],
                     get_segmentation=get_segmentation)
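
The pickled model above holds a per-pixel count of positive labels ('positions') plus the number of masks it was built from ('files'), so turning it into a per-pixel probability map is a single division. A minimal sketch (the get_segmentation actually passed to analyze.evaluate is not shown in this example):

import numpy as np

def position_probability(model):
    # Fraction of training masks in which each pixel was labelled positive.
    return np.asarray(model['positions'], dtype=float) / max(model['files'], 1)
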
Example #51
0
 def get_file_list(self):
     return get_file_list(self.server.server_directory)