def __init__(self, cluster, url, component, service):
    '''
    @param cluster: cluster name, set in the configuration file or via the command line.
    @param url: URL where each component exposes its metrics. For example, HDFS metrics
                can be fetched from http://ip:9870/jmx, while ResourceManager metrics
                are available at http://ip:8088/jmx.
    @param component: component name, e.g. "hdfs", "resourcemanager", "mapreduce", "hive", "hbase".
    @param service: service name, e.g. "namenode", "resourcemanager", "mapreduce".
    '''
    self._cluster = cluster
    # Strip any trailing /
    self._url = url.rstrip('/')
    self._component = component
    # Metric prefix, named hadoop_<component>_<service>
    self._prefix = 'hadoop_{0}_{1}'.format(component, service)

    # List of JSON files named after the service; e.g. for "namenode" every JSON
    # file in the namenode folders is loaded. Only the file names are returned.
    self._file_list = utils.get_file_list(service)
    # All JSON files in the common directory
    self._common_file = utils.get_file_list("common")
    # Combine both JSON file lists
    self._merge_list = self._file_list + self._common_file

    # Holds the metric objects
    self._metrics = {}
    for i in range(len(self._file_list)):
        # Key on the file name and read the corresponding metric configuration file (a JSON file)
        self._metrics.setdefault(self._file_list[i], utils.read_json_file(service, self._file_list[i]))
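# Note: the class that owns the constructor above is not shown in the snippet.
# Purely as an illustration (the class name MetricCollector and the endpoint are
# assumptions, not taken from the snippet), it might be instantiated like this:
collector = MetricCollector(cluster="test-cluster",
                            url="http://127.0.0.1:9870/jmx",
                            component="hdfs",
                            service="namenode")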
def main(hypes_file, output_dir, override): """ Orchestrate. Parameters ---------- hypes_file : str Path to a JSON file output_dir : str Path where the output should be stored override : bool If True, then override the model if it exists. """ # Load hyperparameters with open(hypes_file, 'r') as f: hypes = json.load(f) # Set serialization path base = os.path.dirname(hypes_file) model_file_path = os.path.join(base, '%s.json' % hypes['model']['name']) model_file_path = os.path.abspath(model_file_path) if not os.path.isfile(model_file_path) or override: if not os.path.isfile(model_file_path): logging.info("Did not find '%s'. Start training...", model_file_path) else: logging.info("Override '%s'. Start training...", model_file_path) # Get training data x_files, y_files = get_file_list(hypes, 'train') # "Train" "classifier" (it just counts the classes) model = {} for i in range(len(hypes['classes'])): model[i] = 0 for y_file in y_files: logging.info("Read '%s'...", y_file) mask = load_segmentation_mask(hypes, y_file) for row in mask: for pixel in row: model[pixel] += 1 # save model as json file with open(model_file_path, 'w') as f: json.dump(model, f) else: # load model from json file with open(model_file_path) as f: model = json.load(f) # Evaluate data = get_file_list(hypes, 'test') analyze.evaluate(hypes, data, output_dir, model, elements=[0, 1], get_segmentation=get_segmentation)
def generate_file_diff(self, source_file_directory, target_file_directory): source_file_list = get_file_list(source_file_directory) target_file_list = get_file_list(target_file_directory) for filename in target_file_list: if '.txt' in filename and filename in source_file_list: target_file_path = os.path.join(target_file_directory, filename) source_file_path = os.path.join(source_file_directory, filename) if os.path.isfile(source_file_path) and os.path.isfile(target_file_path): results = generate_file_diff(source_file_path, target_file_path) # Are there any changes in the logs insertion_count = results.count('ins style') deletion_count = results.count('del style') if insertion_count > 0 or deletion_count > 0: results = results.replace(' ', ' ') rep_dict = {"ins style": "ins style", "del style": "del style", "¶": ''} results = multiple_replace(results, rep_dict) source_filename = 'File 1: ' + filename + ' (created on ' + \ get_datetime_string(get_file_timestamp(source_file_path)) + ')' target_filename = 'File 2: ' + filename + ' (created on ' + \ get_datetime_string(get_file_timestamp(target_file_path)) + ')' # Add insertion and deletion status html_code = source_filename + '<br>' + target_filename + '<br><br>' + \ '<ins style="background:#e6ffe6;">Insertions</ins>: ' + str(insertion_count) + \ ' ' + \ '<del style="background:#ffe6e6;">Deletions</del>: ' + str(deletion_count) + \ '<hr>' diff_file_name = os.path.join(target_file_directory, filename + '.diff.html') with open(diff_file_name, 'w') as fo: fo.write('<pre>' + html_code + results + '</pre>')
def predict_test(inference, result_dir, style_dir, content_dir): list_path_content = get_file_list(content_dir) list_path_style = get_file_list(style_dir) dir_out_img = os.path.join(result_dir, 'image') check_folder(dir_out_img) for style_file in list_path_style: style_prefix, _ = os.path.splitext(style_file) style_prefix = os.path.basename(style_prefix) style_img = load_data_testing(style_file) for content_file in list_path_content: content_prefix, _ = os.path.splitext(content_file) content_prefix = os.path.basename(content_prefix) content_img = load_data_testing(content_file) print("Processing: size_content: (%d,%d) size_style: (%d,%d)" % (content_img.shape[1], content_img.shape[2], style_img.shape[1], style_img.shape[2])) print(style_file) print(content_file) results = inference.predict(content_img, style_img) img_fakes = results['img_fakes'][0] for i in range(len(img_fakes)): image_path = os.path.join( dir_out_img, '{}-{}-{}.jpg'.format(style_prefix, content_prefix, str(i))) save_images(img_fakes[i], [1, 1], image_path)
def generate_file_diff(self, source_string, target_string, source_file_directory, target_file_directory): source_file_list = get_file_list(source_file_directory) target_file_list = get_file_list(target_file_directory) for filename in target_file_list: if target_string in filename and filename.replace(target_string, source_string) in source_file_list: target_file_path = os.path.join(target_file_directory, filename) source_file_path = os.path.join( source_file_directory, filename.replace(target_string, source_string)) if os.path.isfile(source_file_path) and os.path.isfile(target_file_path): results = generate_file_diff(source_file_path, target_file_path) # Are there any changes in the logs insertion_count = results.count('ins style') deletion_count = results.count('del style') if insertion_count > 0 or deletion_count > 0: results = results.replace(' ', ' ') # Performs a one-pass replacements rep = {"ins style": "ins style", "del style": "del style", "¶": ''} rep = dict((re.escape(k), v) for k, v in rep.iteritems()) pattern = re.compile("|".join(rep.keys())) results = pattern.sub(lambda m: rep[re.escape(m.group(0))], results) # Add insertion and deletion status html_code = '<ins style="background:#e6ffe6;">Insertions</ins>: ' + str(insertion_count) + \ ' ' + \ '<del style="background:#ffe6e6;">Deletions</del>: ' + str(deletion_count) + \ '<hr>' diff_file_name = os.path.join(target_file_directory, filename + '.diff.html') with open(diff_file_name, 'w') as fo: fo.write('<pre>' + html_code + results + '</pre>')
def main(hypes_file, output_dir, override): """ Orchestrate. Parameters ---------- hypes_file : str Path to a JSON file output_dir : str Path where the output should be stored override : bool If True, then override the model if it exists. """ # Load hyperparameters with open(hypes_file, 'r') as f: hypes = json.load(f) # Set serialization path base = os.path.dirname(hypes_file) model_file_path = os.path.join(base, '%s.pickle' % hypes['model']['name']) model_file_path = os.path.abspath(model_file_path) if not os.path.isfile(model_file_path) or override: if not os.path.isfile(model_file_path): logging.info("Did not find '%s'. Start training...", model_file_path) else: logging.info("Override '%s'. Start training...", model_file_path) # Get training data x_files, y_files = get_file_list(hypes, 'train') # "Train" "classifier" (it just counts the classes) model = {'positions': None, 'files': 0} for y_file in y_files: logging.info("Read '%s'...", y_file) mask = load_segmentation_mask(hypes, y_file) if model['positions'] is None: model['positions'] = mask else: model['positions'] += mask model['files'] += 1 # save model as pickle file scipy.misc.imsave("instruments.png", model['positions']) with open(model_file_path, 'wb') as handle: pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL) else: # load model from pickle file with open(model_file_path, 'rb') as handle: model = pickle.load(handle) # Evaluate data = get_file_list(hypes, 'test') analyze.evaluate(hypes, data, output_dir, model, elements=[0, 1], get_segmentation=get_segmentation)
def mirflickr_annotations(min_tag_count=50): global nlp # tags that appear at least 50 times among all images fname = join(settings.MIRFLICKR_PATH, "mirflickr25k", "doc", "common_tags.txt") with open(fname) as fh: frequent_tags = [lin.split()[0] for lin in fh.readlines() if int(lin.split()[1]) >= min_tag_count] frequent_tags = sorted(frequent_tags) # read images imlist, impath = get_file_list(settings.MIRFLICKR_PATH, (".jpg",)) potential_tags = {"tag2im": {}, "im2tag": {im: [] for im in imlist}} relevant_tags = {"tag2im": {}, "im2tag": {im: [] for im in imlist}} # read annotations flist, fpath = get_file_list(join(settings.MIRFLICKR_PATH, "annotations"), (".txt",)) flist.remove("README.txt") id2im = lambda id_: "im{}.jpg".format(id_) im2id = lambda im_: int(im_[2:-4]) # 24 potential tags for f in [f_ for f_ in flist if not f_.endswith("_r1.txt")]: tag = splitext(f)[0] with open(join(fpath, f)) as fh: potential_tags["tag2im"][tag] = sorted([id2im(id_.strip()) for id_ in fh.readlines()]) for tag, imlist in potential_tags["tag2im"].items(): for im in imlist: potential_tags["im2tag"][im].append(tag) # 14 relevant tags for f in [f_ for f_ in flist if f_.endswith("_r1.txt")]: tag = splitext(f)[0].replace("_r1", "") with open(join(fpath, f)) as fh: relevant_tags["tag2im"][tag] = sorted(["im{}.jpg".format(id_.strip()) for id_ in fh.readlines()]) for tag, imlist in relevant_tags["tag2im"].items(): for im in imlist: relevant_tags["im2tag"][im].append(tag) potential_tags_ = list(potential_tags["tag2im"].keys()) potential_images_ = list(set(sum(potential_tags["tag2im"].values(), []))) relevant_tags_ = list(relevant_tags["tag2im"].keys()) relevant_images_ = list(set(sum(relevant_tags["tag2im"].values(), []))) print(" >> tags w/ more than {} counts: {}".format(min_tag_count, len(frequent_tags))) print(" >> potential tags: {} ({} images)".format(len(potential_tags_), len(potential_images_))) print(" >> relevant tags: {} ({} images)".format(len(relevant_tags_), len(relevant_images_))) mirflickr = { "tags": sorted(list(set(frequent_tags + potential_tags_ + relevant_tags_))), "frequent": frequent_tags, "potential": potential_tags, "relevant": relevant_tags, } return mirflickr
def split_imagenet_dataset(image_dir, anno_dir): """ Split imagenet dataset to train, validation and test :param image_dir: Downloaded images folder :type image_dir: String :param anno_dir: Annotation directory :type anno_dir: String """ data_map = {} img_count = 0 for img in get_file_list(image_dir, format=".jpg") + get_file_list( image_dir, format=".JPEG"): img_base = os.path.splitext(os.path.basename(img))[0] data_map[img_base] = False img_count += 1 ann_count = 0 for anno in get_file_list(anno_dir, format=".xml"): anno_base = os.path.splitext(os.path.basename(anno))[0] if anno_base in data_map: img_count -= 1 data_map[anno_base] = True else: ann_count += 1 print("Img not annotated:", img_count) print("Anno without image:", ann_count) filtered_set = [k for k in data_map.keys() if data_map[k]] wnid_set = set([k.split('_')[0] for k in filtered_set]) # Split filtered set to train, test and validation sets wnid_map = {k: v for v, k in enumerate(wnid_set)} x_all = range(len(filtered_set)) y_all = [wnid_map[x.split('_')[0]] for x in filtered_set] X_train, test_data, _, test_label = sk.train_test_split(x_all, y_all, test_size=0.2, random_state=42, stratify=y_all) X_test, X_eval, _, _ = sk.train_test_split(test_data, test_label, test_size=0.5, random_state=42, stratify=test_label) split = { 'data': filtered_set, 'label_map': wnid_map, 'train': X_train, 'test': X_test, 'validation': X_eval } return split
def process_openimages(data_dir, class_filter=None): """ Process open Images dataset :param data_dir: :type data_dir: :return: :rtype: """ global OPEN_IMAGES_OBJECTS_SET dataset = dict() splits = [ os.path.join(data_dir, split) for split in ['train', 'test', 'validation'] ] for split in splits: split_dir = os.path.basename(split) print("Current split:", split_dir) dataset[split_dir] = {'images': dict(), 'boxes': dict()} obj_list = get_immediate_subdirectories(split) for obj in tqdm(obj_list): obj_name = os.path.basename(obj).lower() if class_filter: if obj_name not in class_filter: continue img_file_list = get_file_list(obj, format=".jpg") if len(img_file_list) > 0: OPEN_IMAGES_OBJECTS_SET.add(obj_name) label_dir = os.path.join(obj, 'Label') label_list = get_file_list(label_dir, format=".txt") for img in img_file_list: img_name, _ = os.path.splitext(os.path.basename(img)) dataset[split_dir]['images'][img_name] = img for label in label_list: label_name, _ = os.path.splitext(os.path.basename(label)) if label_name not in dataset[split_dir]['boxes']: dataset[split_dir]['boxes'][label_name] = list() with open(label, 'r') as label_file: annotations = label_file.readlines() for annotation in annotations: dataset[split_dir]['boxes'][label_name].append( annotation.lower().split()) OPEN_IMAGES_OBJECTS_SET = sorted(OPEN_IMAGES_OBJECTS_SET) update_label_map(OPEN_IMAGES_OBJECTS_SET) return dataset
def vectorize_images(input_dir, mask_dir, output_dir, cache_dir, image_level, patch_size):
    """
    Converts a set of whole-slide images into numpy arrays with valid tissue patches for fast processing.

    :param input_dir: folder containing the whole-slide images.
    :param mask_dir: folder containing the whole-slide masks.
    :param output_dir: destination folder to store the vectorized images.
    :param cache_dir: folder to store whole-slide images temporarily for fast access.
    :param image_level: image resolution to read the patches.
    :param patch_size: size of the read patches.
    :return: nothing
    """
    # Output dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Read image file names
    image_paths = get_file_list(input_dir, ext='tif')  # get all the WSI files
    # Read mask file names
    mask_paths = get_file_list(mask_dir)  # get all the mask files
    total_images = len(image_paths)
    for index in range(total_images):
        image_id = (os.path.basename(image_paths[index])).split('.')[0]
        output_pattern = output_dir + '/' + image_id + '_{item}.npy'  # by convention on NIC it has to be an .npy
        vectorized_png = output_dir + '/' + image_id + '_{item}.png'
        if not check_file_exists(vectorized_png):
            print(f'Processing image {image_id}')
            vectorize_wsi(image_path=cache_file(image_paths[index], cache_dir, overwrite=False),
                          mask_path=mask_paths[index],
                          output_pattern=output_pattern,
                          image_level=image_level,
                          mask_level=image_level,
                          patch_size=patch_size,
                          stride=patch_size,
                          downsample=1,
                          select_bounding_box=False)
            print(f'Successfully vectorized {image_id}: {total_images - index} images left')
        else:
            print(f'File already exists for {image_id} - {total_images - index - 1} images left')

    print('Finished processing all images!')
def __init__(self, cluster, component, service): self.cluster = cluster self.component = component self.prefix = 'hadoop_{0}_{1}'.format(component, service) self.file_list = utils.get_file_list(service) self.metrics = {} for i in range(len(self.file_list)): self.metrics.setdefault( self.file_list[i], utils.read_json_file(service, self.file_list[i])) common_file = utils.get_file_list("common") self.merge_list = self.file_list + common_file
def connect_base(current_commit, base_commit):
    """Creates a merge commit that takes files from base_commit.

    It is effectively the same as running git merge base_commit on current_commit.

    Args:
        current_commit: commit hash to create the merge commit on.
        base_commit: commit hash that contains the file histories.
    """
    current_files = utils.get_file_list(current_commit)
    base_files = utils.get_file_list(base_commit)
    tree = utils.git_mktree(current_files + base_files)
    return utils.git_commit(tree, [current_commit, base_commit],
                            message=b'Connect history with base %s' % (base_commit.encode('ascii')))
def api_get_session_logs(table): id = request.args.get("record_id") db_session = DBSession() if table == 'install_job': install_job = db_session.query(InstallJob).filter(InstallJob.id == id).first() elif table == 'install_job_history': install_job = db_session.query(InstallJobHistory).filter(InstallJobHistory.id == id).first() elif table == 'inventory_job_history': install_job = db_session.query(InventoryJobHistory).filter(InventoryJobHistory.id == id).first() if install_job is None: abort(404) log_folder = install_job.session_log file_path = os.path.join(get_log_directory(), log_folder) if not os.path.isdir(file_path): abort(404) rows = [] log_file_list = get_file_list(file_path) for file in log_file_list: row = dict() row['filepath'] = os.path.join(file_path, file) row['filename'] = file rows.append(row) return jsonify(**{'data': rows})
def make_chunks(num_links, min_chunk_lines=MIN_CHUNK_LINES): text_fns = utils.get_file_list(utils.TEXTS_DIR, num_links) max_chunks = min(utils.CHUNKS_FOR_SOURCE, len(text_fns)) texts_processed = 0 for text_idx, text_fn in enumerate(text_fns[:utils.CHUNKS_FOR_SOURCE], start=1): chunk_fn = text_fn.replace(utils.TEXTS_DIR, utils.CHUNKS_DIR) assert chunk_fn != text_fn, 'ERROR: invalid path to text file' if not os.path.isfile(chunk_fn): with open(text_fn, 'rt', encoding='utf-8') as f_in: text = f_in.read().split('\n')[1:] with open(chunk_fn, 'wt', encoding='utf-8') as f_out: lines, chunk_words = [], 0 for line_no, line in enumerate(text): line = re.sub('\s+', ' ', re.sub(r'[\u2800\uFE00-\uFE0F]', '', line)).strip() if not line: continue chunk_words += len(line.split()) if line_no < min_chunk_lines \ or chunk_words <= MAX_CHUNK_WORDS: lines.append(line) else: break f_out.write('\n'.join(lines)) print('\r{} (of {})'.format(text_idx, max_chunks), end='') texts_processed += 1 if texts_processed: print()
def api_get_session_logs(table): id = request.args.get("record_id") db_session = DBSession() if table == 'install_job': install_job = db_session.query(InstallJob).filter( InstallJob.id == id).first() elif table == 'install_job_history': install_job = db_session.query(InstallJobHistory).filter( InstallJobHistory.id == id).first() elif table == 'inventory_job_history': install_job = db_session.query(InventoryJobHistory).filter( InventoryJobHistory.id == id).first() if install_job is None: abort(404) log_folder = install_job.session_log file_path = os.path.join(get_log_directory(), log_folder) if not os.path.isdir(file_path): abort(404) rows = [] log_file_list = get_file_list(file_path) for file in log_file_list: row = dict() row['filepath'] = os.path.join(file_path, file) row['filename'] = file rows.append(row) return jsonify(**{'data': rows})
def prepload(extension='fits', N=-1, flist=None): if flist is None: # read the file list flist = np.array(utils.get_file_list(imtype=extension)) #, verbose=True) imlist_dict = {} # stores the number that makes the name of each image, e.g. 75605 for srch75605.fits imlist_dict["fnumbers"] = np.argsort(np.array([int( ''.join(filter(str.isdigit, f.split("/")[-1]))) for f in flist])) # stores the name of the images as a list imlist_dict["flist"] = flist[imlist_dict["fnumbers"]] # reads only N images - if -1 reads all images if N > -1: imlist_dict["flist"] = imlist_dict["flist"][:N * 3] # checks that the images are in triplets (template, search, difference) if imlist_dict["fnumbers"].shape[0] % 3 : print("warning: images not in triplets") return np.nan # the total number of triplets imlist_dict["nimgs"] = int(imlist_dict["fnumbers"].shape[0] / 3) # the list of objects by numbers imlist_dict["nobjects"] = np.unique(imlist_dict["fnumbers"]) # reads in the images choosing method depending on extension if extension == "fits": imlist_dict["imshp"] = utils.fits2stamp(imlist_dict["flist"][0]).shape else: imlist_dict["imshp"] = plt.imread(imlist_dict["flist"][0]).shape return imlist_dict, imlist_dict["flist"]
def infer(model_path, image_shape, label_dict_path, infer_file_list_path):
    infer_file_list = get_file_list(infer_file_list_path)
    # Load the label dictionary
    char_dict = load_dict(label_dict_path)
    # Load the reversed label dictionary
    reversed_char_dict = load_reverse_dict(label_dict_path)
    # Get the dictionary size
    dict_size = len(char_dict)
    # Build the data reader
    data_generator = DataGenerator(char_dict=char_dict, image_shape=image_shape)
    # Initialize PaddlePaddle
    paddle.init(use_gpu=True, trainer_count=2)
    # Load the trained parameters
    parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))
    # Build the network model
    model = Model(dict_size, image_shape, is_infer=True)
    # Create the inferer
    inferer = paddle.inference.Inference(output_layer=model.log_probs, parameters=parameters)
    # Run inference
    test_batch = []
    labels = []
    for i, (image, label) in enumerate(data_generator.infer_reader(infer_file_list)()):
        test_batch.append([image])
        labels.append(label)
    infer_batch(inferer, test_batch, labels, reversed_char_dict)
def batch_features_extract(self, opts, flag='query', enable_cache=True): print("===================================") fea = None fea_smps = getattr(opts, flag) cache_file = os.path.join(self.cache_root, self.feature_info + '_' + fea_smps.replace('/','_') +'.pkl') if enable_cache and os.path.isfile(cache_file): print('Feature cache file exists : '+ cache_file) print('loading '+flag+' features...') with open(cache_file, 'rb') as f: fea = pickle.load(f) print('Loading '+flag+'features completed.') else: if enable_cache: print('Feature cache file not found : '+ cache_file) file_dir = os.path.join(opts.data_root, fea_smps) file_list = get_file_list(file_dir) print('Extracting '+flag+' features...') fea = self.feature_extract(file_list) print('Extracting '+flag+'features completed.') if enable_cache: print('Saving ' + flag + 'features...') if not os.path.exists(self.cache_root): os.makedirs(self.cache_root) with open(cache_file, 'wb') as f: pickle.dump(fea, f) print('Saving ' + flag + ' features completed.') return fea
def _prepare(self): file_list = get_file_list(self.data_path, regx='.tif') self.im_file_list = [os.path.join(self.data_path, im_file) for im_file in file_list] interval = 50 self._sample_and_get_statistics(self.im_file_list, interval, low=0.2, high=95)
def resample_directory(input_path, output_path, sample_rate): for filepath, filename in get_file_list(input_path): print('Reading: ', filename) df = load_dataset(filepath) print('Resampling:', filename) df = resample_dataset(df, sample_rate) df.to_csv(os.path.join(output_path, filename))
def train_preprocessed_img_lazy_batch(train_folder, max_epochs, config, name=None, debug=True):
    fnames, bboxes = get_file_list(train_folder)
    y = np.array(list(bboxes), dtype=np.float32)
    assert not (np.any(np.isnan(y)) or np.any(np.isinf(y))), "Invalid `y` detected"
    X = np.array(list(fnames))
    train(X, y, config, max_epochs, 'LazyBatchIterator', name, debug)
def make_chunks(num_links, min_chunk_lines=MIN_CHUNK_LINES): text_fns = utils.get_file_list(utils.TEXTS_DIR, num_links) max_chunks = min(utils.CHUNKS_FOR_SOURCE, len(text_fns)) texts_processed = 0 for text_idx, text_fn in enumerate(text_fns[:utils.CHUNKS_FOR_SOURCE], start=1): chunk_fn = text_fn.replace(utils.TEXTS_DIR, utils.CHUNKS_DIR) assert chunk_fn != text_fn, 'ERROR: invalid path to text file' if not os.path.isfile(chunk_fn): with open(text_fn, 'rt', encoding='utf-8') as f_in, \ open(chunk_fn, 'wt', encoding='utf-8') as f_out: f_in.readline() text = f_in.read() text = re.sub(r'[\u2800\uFE00-\uFE0F]', '', text) lines = ('\t'.join(x.strip() for x in x.split('\t')) for x in text.split('\n') if re.search('\w', x) and not all(x.startswith('#') for x in x.split())) #lines = (x.strip() for x in text.split('\n') # if re.search('\w', x) # and not all(x.startswith('#') # for x in x.split())) f_out.write('\n'.join(x for x in lines if x)) print('\r{} (of {})'.format(text_idx, max_chunks), end='') texts_processed += 1 if texts_processed: print()
def get_file_list_route(): app.logger.debug(request.form) ip = dict(request.form)['ip'][0] dir_name = dict(request.form)['dir'][0] port = get_port(conf, ip) file_list = get_file_list(conf, ip, dir_name, port) return " ".join(file_list)
def get_annotation_map(data_directory): """ :param data_directory: :type data_directory: :return: :rtype: """ new_obj_map = {k.strip(): v.strip() for k, v in OBJECT_WNID_MAP.items()} concerned_objs = [new_obj_map[k] for k in new_obj_map if k in objects] result = {} obj_dir_list = get_immediate_subdirectories(data_directory) for obj_dir in obj_dir_list: obj = os.path.basename(obj_dir) if obj not in concerned_objs: continue result[obj] = [] anno_dir = os.path.join(obj_dir, 'Annotation') for anno in get_file_list(anno_dir, format=".xml"): anno_name = os.path.splitext(os.path.basename(anno))[0] result[obj].append(anno_name) return result
def get_result(self, audio_path): def softmax(x): e_x = np.exp(x - np.max(x)) return e_x / e_x.sum(axis=0) print('Generate side channel...') y_s = utils.get_side(audio_path) print('Rendering spectrum...') utils.get_spectrum(y_s, 0, 'temp', max=20) spectrum_list = utils.get_file_list('temp') print('Valid samples...') fin = np.zeros(4) for i_idx in range(len(spectrum_list)): norm_img = self.img_preprocess(spectrum_list[i_idx]) result = self.session.run([], {self.model_input: norm_img})[0][0] result = softmax(result) fin[np.argmax(result)] += 1 print( f'Sample {i_idx+1} -> {self.r_map[np.argmax(result)]}, Prob:{np.max(result)*100:.3f}%' ) if fin[0] != len(spectrum_list): fin[0] = 0 print(f'Final result: {self.r_map[np.argmax(fin)]}') else: print('Final result: Lossless')
def enrich_directory(sample_path, rich_path): for filepath, filename in get_file_list(sample_path): print('Reading:', filename) df = load_sampleset(filepath) print('Enriching:', filename) df = enrich_sampleset(df) print('Saving:', filename) df.to_csv(os.path.join(rich_path, filename))
def disconnect(source_commit, ref_commit): """Creates a commit that disconnects files from source_commit. All files existing in ref_commit will be removed from source_commit. Args: source_commit: commit hash to disconnect from. ref_commit: commit hash to be a file list reference. """ source_files = utils.get_file_list(source_commit) ref_files = utils.get_file_list(ref_commit) ref_files_set = set(ref.path for ref in ref_files) kept_files = [ref for ref in source_files if ref.path not in ref_files_set] tree = utils.git_mktree(kept_files) return utils.git_commit(tree, [source_commit], message=b'Disconnect history from %s' % (source_commit.encode('ascii')))
def main(): # Init args parser = argparse.ArgumentParser( description='Copy file from given commits') parser.add_argument('commit_hash', metavar='commit', type=str, nargs=1, help='commit hash to copy files from') parser.add_argument('--dry_run', dest='dry_run', action='store_const', const=True, default=False) arg = parser.parse_args(sys.argv[1:]) # Read file list from HEAD and upstream commit. upstream_files = utils.get_file_list(arg.commit_hash[0]) our_files = utils.get_file_list('HEAD') # Calculate target file list target_files = filters.filter_file(our_files, upstream_files) # Calculate operations needed ops = utils.gen_op(our_files, target_files) if arg.dry_run: # Print ops only on dry-run mode. print('\n'.join(repr(x) for x in ops)) return for op, f in ops: # Ignore if op is REP because we only want to copy missing files, not to # revert custom Chromium OS libchrome patch. assert type(op) == utils.DiffOperations if op == utils.DiffOperations.DEL: subprocess.check_call(['git', 'rm', f.path]), elif op == utils.DiffOperations.ADD: # Create directory recursively if not exist. os.makedirs(os.path.dirname(f.path), exist_ok=True) # Read file by git cat-file with blob object id to avoid heavy git checkout. with open(f.path, 'wb') as outfile: subprocess.check_call(['git', 'cat-file', 'blob', f.id], stdout=outfile) # Add to git index subprocess.check_call(['git', 'add', f.path])
def _refine_database(self): # load cached data if it existed. cache_file = os.path.join(self._cache_dir, self._db_name + '_gt_db.pkl') if os.path.exists(cache_file): with open(cache_file, 'rb') as fid: refinedb = pickle.load(fid) print('{} database loaded from {}'.format(self._db_name, cache_file)) return refinedb # generate db used for refine result. # file list is generated from predict xml files file_name_itr = get_file_list(self._data_dir, file_suffix='.xml') refine_boxes = [] # refine_boxes_ext = [] refine_class = [] refine_img = [] img_path_list = [] for file_name in file_name_itr: print(file_name) annt_file = os.path.join(self._data_dir, file_name + '.xml') gt_annt_file = os.path.join(self._gt_dir, file_name + '.xml') img_file = os.path.join(self._img_dir, file_name + self._suffix) img = Image.open(img_file) try: # boxes, boxes_ext, classes, patches = self._get_box_info(annt_file, gt_annt_file, img) boxes, classes, patches = self._get_box_info( annt_file, gt_annt_file, img) except ValueError as e: print(e) print(annt_file) print(gt_annt_file) print(img_file) classes = [] if len(classes) == 0: continue img_path_list.extend( [os.path.join(self._img_dir, file_name + self._suffix)] * len(boxes)) refine_boxes.extend(boxes) # refine_boxes_ext.extend(boxes_ext) refine_class.extend(classes) refine_img.extend(patches) # db_info = zip(refine_boxes, refine_boxes_ext, refine_class, img_path_list, refine_img) # refinedb = [dict(zip(('box', 'box_ext', 'class', 'path', 'image'), item_info)) for item_info in db_info] db_info = zip(refine_boxes, refine_class, img_path_list, refine_img) refinedb = [ dict(zip(('box', 'class', 'path', 'image'), item_info)) for item_info in db_info ] with open(cache_file, 'wb') as fid: pickle.dump(refinedb, fid, pickle.HIGHEST_PROTOCOL) print('wrote {} database to {}'.format(self._db_name, cache_file)) return refinedb
def train_model(unique_labels, file_dir, num_samples, le_file_path, num_epochs, model_file_path): model = create_model(unique_labels) file_list = utils.get_file_list(file_dir, samples=num_samples, training=True) logger.info('Beginning training model') images, labels = utils.read_images(file_list, le_file_path, training=True) model.fit(images, labels, epochs=num_epochs) logger.info('Finished fitting model') logger.info(f'Saving model to {model_file_path}') model.save(model_file_path)
def get_ins_photo_list(self): """Get all the files in the instagram folder. """ photo_fn_list = get_file_list(self.ins_folder) # print(self.home_folder+self.ins_folder) # print(len(photo_list), photo_list[:10]) if len(photo_fn_list) == 0: logging.error("The Ins folder is empty.") return photo_fn_list
def verify_commit(original_commit, new_tree): """Verifies if new_tree is exactly original_commit after filters. Args: original_commit: commit hash in Chromium browser tree. new_tree: tree hash created for upstream branch commit. """ expected_file_list = filters.filter_file( [], utils.get_file_list(original_commit)) assert utils.git_mktree(expected_file_list) == new_tree
def _patch_database(self): # load cached data if it existed. cache_file = os.path.join(self._cache_dir, self._db_name + '_gt_db.pkl') if os.path.exists(cache_file): with open(cache_file, 'rb') as fid: patchdb = pickle.load(fid) print('{} database loaded from {}'.format(self._db_name, cache_file)) return patchdb # file list is generated from predict xml files file_name_itr = get_file_list(self._gt_dir, file_suffix=self._suffix) db_boxes = [] db_class = [] patch_img = [] img_path_list = [] for file_name in file_name_itr: # filter the required files if not (self._db_name in file_name): continue print('processing ' + file_name) annt_file = os.path.join(self._gt_dir, file_name + '.xml') img_file = os.path.join(self._img_dir, file_name + self._suffix) try: img = Image.open(img_file) boxes, classes, patches = self._get_box_info(annt_file, img) except ValueError as e: print(e) classes = [] except OSError as e: print(e) classes = [] # continue if the annt file is empty if len(classes) == 0: continue # add box to db img_path_list.extend( [os.path.join(self._img_dir, file_name + self._suffix)] * len(classes)) db_boxes.extend(boxes) db_class.extend(classes) patch_img.extend(patches) db_info = zip(db_boxes, db_class, img_path_list, patch_img) db = [ dict(zip(('box', 'class', 'path', 'image'), item_info)) for item_info in db_info ] with open(cache_file, 'wb') as fid: pickle.dump(db, fid, pickle.HIGHEST_PROTOCOL) print('wrote {} database to {}'.format(self._db_name, cache_file)) return db
def check_command_file_diff(self, install_job, message): file_suffix = '.diff.html' file_list = get_file_list(os.path.join(get_log_directory(), install_job.session_log)) diff_file_list = [file for file in file_list if file_suffix in file] if len(diff_file_list) > 0: message += 'The following command outputs have changed between different installation phases<br><br>' for file in diff_file_list: message += file.replace(file_suffix, '') + '<br>' message += '<br>' return message
def host_session_log(hostname, table, id): """ This route is also used by mailer.py for email notification. """ db_session = DBSession() record = None doc_central_log_file_path = '' if table == 'install_job': record = db_session.query(InstallJob).filter(InstallJob.id == id).first() elif table == 'install_job_history': record = db_session.query(InstallJobHistory).filter(InstallJobHistory.id == id).first() doc_central_log_file_path = get_doc_central_log_path(record) elif table == 'inventory_job_history': record = db_session.query(InventoryJobHistory).filter(InventoryJobHistory.id == id).first() if record is None: abort(404) file_path = request.args.get('file_path') log_file_path = get_log_directory() + file_path if not(os.path.isdir(log_file_path) or os.path.isfile(log_file_path)): abort(404) file_pairs = {} log_file_contents = '' file_suffix = '.diff.html' if os.path.isdir(log_file_path): # Returns all files under the requested directory log_file_list = get_file_list(log_file_path) diff_file_list = [filename for filename in log_file_list if file_suffix in filename] for filename in log_file_list: diff_file_path = '' if file_suffix not in filename: if filename + file_suffix in diff_file_list: diff_file_path = os.path.join(file_path, filename + file_suffix) file_pairs[os.path.join(file_path, filename)] = diff_file_path file_pairs = collections.OrderedDict(sorted(file_pairs.items())) else: with io.open(log_file_path, "rt", encoding='latin-1') as fo: log_file_contents = fo.read() return render_template('host/session_log.html', hostname=hostname, table=table, record_id=id, file_pairs=file_pairs, log_file_contents=log_file_contents, is_file=os.path.isfile(log_file_path), doc_central_log_file_path=doc_central_log_file_path)
def main(root_folder, batch_size=256, train_split=0.2): fnames, bboxes = get_file_list(root_folder) fnames = np.asarray(list(fnames)) bboxes = np.asarray(list(bboxes), dtype=np.float32) num_samples = fnames.shape[0] num_val = int(round(num_samples * train_split)) # Perform train validation split idx = np.arange(num_samples) rng = np.random.RandomState(seed=12345) rng.shuffle(idx) train_fnames = fnames[idx[num_val:]] train_bboxes = bboxes[idx[num_val:]] val_fnames = fnames[idx[:num_val]] val_bboxes = bboxes[idx[:num_val]] print "%d training samples and %d validation samples" % (train_fnames.shape[0], val_fnames.shape[0]) # Create (key, value) pairs for storing in db X_t = [] y_t = [] for i in xrange(len(train_fnames)): X_t.append(('%08d' % i, train_fnames[i])) y_t.append(('%08d' % i, train_bboxes[i])) X_v = [] y_v = [] for i in xrange(len(val_fnames)): X_v.append(('%08d' % i, val_fnames[i])) y_v.append(('%08d' % i, val_bboxes[i])) # Training set train_image_db = lmdb.open('train_image', map_size=1e+12) train_label_db = lmdb.open('train_label', map_size=1e+12) prev_j = 0 for j in xrange(batch_size, len(X_t), batch_size): print "Starting train batch #%d processing" % (prev_j / batch_size) process_batch(train_image_db, train_label_db, X_t[prev_j:j], y_t[prev_j:j]) prev_j = j train_image_db.close() train_label_db.close() # Validation set val_image_db = lmdb.open('val_image', map_size=1e+12) val_label_db = lmdb.open('val_label', map_size=1e+12) prev_j = 0 for j in xrange(batch_size, len(X_v), batch_size): print "Starting val batch #%d processing" % (prev_j / batch_size) process_batch(val_image_db, val_label_db, X_v[prev_j:j], y_v[prev_j:j]) prev_j = j val_image_db.close() val_label_db.close()
def get_sp_files_from_csm_repository(): rows = [] file_list = get_file_list(get_repository_directory()) for filename in file_list: if '.pie' in filename: statinfo = os.stat(get_repository_directory() + filename) row = {} row['image_name'] = filename row['image_size'] = '{} bytes'.format(statinfo.st_size) rows.append(row) return jsonify(**{'data': rows})
def get_full_software_tar_files_from_csm_repository(): rows = [] file_list = get_file_list(get_repository_directory()) for filename in file_list: if '-iosxr-' in filename and filename.endswith('.tar'): statinfo = os.stat(get_repository_directory() + filename) row = {} row['image_name'] = filename row['image_size'] = '{} bytes'.format(statinfo.st_size) rows.append(row) return jsonify(**{'data': rows})
def main(): """ Enter the absolute paths of the directories containing your Movies/TV Shows, separated by an empty space """ # creates the project dir if it doesn't exists utils.create_project_directory(project_path) directories = sys.argv[1:] print('dir entered:', directories) # if len(directories) == 0: # print("Please mention the directories") # return movie_paths = dataset_db['movie_paths'] movie_data = dataset_db['movie_data'] for directory in directories: # Ensure that the directory path begins with '/', # but doesn't end with one, and is NOT root path = '/{}'.format(directory.strip('/')) if path == '/': continue if os.path.exists(path): movie_paths.upsert(dict(directory=path), ['directory']) file_list = utils.get_file_list() if file_list is None: print('No video files found') return failed_list = [] movie_data = dataset_db['movie_data'] for i, filename in enumerate(file_list): try: data_exists = movie_data.find_one(Filename=filename) if data_exists: print('Already indexed:', filename) continue except: pass print(i, filename) temp = utils.fetch_movie_details(filename) if temp is False: failed_list.append(filename) continue print('Failed for:', failed_list)
def inputs(hypes, _, phase, data_dir): """ Get data. Parameters ---------- hypes : dict _ : ignore this phase : {'train', 'val'} data_dir : str Returns ------- tuple (xs, ys), where xs and ys are lists of the same length. xs are paths to the input images and ys are paths to the expected output """ x_files, y_files = get_file_list(hypes, 'train') x_files, y_files = sklearn.utils.shuffle(x_files, y_files, random_state=0) # x_files = x_files[:40] # reducing data # y_files = y_files[:40] # reducing data xs, ys = [], [] i = 0 for x, y in zip(x_files, y_files): logging.info("Read '%s' for data...", x) image = get_image(x, 'RGB') # from scipy.ndimage.filters import gaussian_filter # image = gaussian_filter(image, sigma=10) label = get_image(y, 'L') label = normalize_labels(label) im = Image.open(x, 'r') width, height = im.size for x in range(width): for y in range(height): image_val = get_features(x, y, image, hypes['model_nr']) label_val = (label[y][x][0] == 0) # only 0 is background xs.append(image_val) ys.append(label_val) i += 1 if i == 10: # TODO: For testing break return xs, numpy.array(ys, dtype=int)
def get_smu_or_sp_list(hostname, hide_installed_packages, smu_info_list, file_suffix): """ Return the SMU/SP list. If hostname is given, compare its active packages. """ file_list = get_file_list(get_repository_directory(), '.' + file_suffix) host_packages = [] if hostname is None else get_host_active_packages(hostname) rows = [] for smu_info in smu_info_list: # Verify if the package has already been installed. installed = False for host_package in host_packages: if smu_info.name in host_package: installed = True break include = False if (hide_installed_packages == 'true' and installed) else True if include: row = dict() row['ST'] = 'True' if smu_info.name + '.' + file_suffix in file_list else 'False' row['package_name'] = smu_info.name + '.' + file_suffix row['posted_date'] = smu_info.posted_date.split()[0] row['ddts'] = smu_info.ddts row['ddts_url'] = BUG_SEARCH_URL + smu_info.ddts row['type'] = smu_info.type row['description'] = smu_info.description row['impact'] = smu_info.impact row['functional_areas'] = smu_info.functional_areas row['id'] = smu_info.id row['name'] = smu_info.name row['status'] = smu_info.status row['package_bundles'] = smu_info.package_bundles row['compressed_image_size'] = smu_info.compressed_image_size row['uncompressed_image_size'] = smu_info.uncompressed_image_size row['is_installed'] = installed if not is_empty(hostname) and SMU_INDICATOR in smu_info.name: row['is_applicable'] = is_smu_applicable(host_packages, smu_info.package_bundles) else: row['is_applicable'] = True rows.append(row) return jsonify(**{'data': rows})
def api_get_tar_list(platform, release): smu_loader = SMUInfoLoader(platform, release, from_cco=False) if not smu_loader.is_valid: return jsonify(**{'data': []}) else: file_list = get_file_list(get_repository_directory(), '.tar') tars_list = smu_loader.get_tar_list() rows = [] for tar_info in tars_list: row = dict() row['ST'] = 'True' if tar_info.name in file_list else 'False' row['name'] = tar_info.name row['compressed_size'] = tar_info.compressed_image_size row['description'] = "" rows.append(row) return jsonify(**{'data': rows})
def generate_batch(hypes, phase): """ Generate patches. Parameters ---------- hypes : dict phase : 'train' or 'test' """ x_files, y_files = get_file_list(hypes, phase) x_files, y_files = sklearn.utils.shuffle(x_files, y_files, random_state=0) batch_x, batch_y = [], [] while True: for x, y in zip(x_files, y_files): logging.info("Read '%s' for data...", x) image = get_image(x, 'RGB') label = load_segmentation_mask(hypes, y) im = Image.open(x, 'r') width, height = im.size image_vals = get_features(hypes, image, 'data') label_vals = get_features(hypes, label, 'label') # print("image_vals = %s" % str(list(image_vals))) for patch, label_ in zip(image_vals, label_vals): patch = img_to_array(patch) label_ = img_to_array(label_) _, w, h = label_.shape label_ = label_.reshape((w, h)) if phase == 'val' and 1.0 not in label_: print("continue") continue # scipy.misc.imshow(patch) # scipy.misc.imshow(label_) batch_x.append(patch) batch_y.append(label_) # .flatten() if len(batch_x) == hypes['solver']['batch_size']: yield (np.array(batch_x), np.array(batch_y)) batch_x, batch_y = [], []
def inputs(hypes, _, phase, data_dir): """ Get data. Parameters ---------- hypes : dict _ : ignore this phase : {'train', 'val'} data_dir : str Returns ------- tuple (xs, ys), where xs and ys are lists of the same length. xs are paths to the input images and ys are paths to the expected output """ x_files, y_files = get_file_list(hypes, 'train') x_files, y_files = sklearn.utils.shuffle(x_files, y_files, random_state=0) xs, ys = [], [] for x, y in zip(x_files, y_files): logging.info("Read '%s' for data...", x) image = get_image(x, 'RGB') label = load_segmentation_mask(hypes, y) im = Image.open(x, 'r') width, height = im.size for x in range(width): for y in range(height): image_val = get_features(x, y, image, hypes['model_nr']) label_val = label[y][x] xs.append(image_val) ys.append(label_val) return xs, np.array(ys, dtype=int)
def main(hypes_file, data_dir, override): """Orchestrate.""" with open(hypes_file, 'r') as f: hypes = json.load(f) if 'training' not in hypes: hypes['training'] = {} if 'make_equal' not in hypes['training']: hypes['training']['make_equal'] = False base = os.path.dirname(hypes_file) model_file_path = os.path.join(base, '%s.yaml' % hypes['model']['name']) model_file_path = os.path.abspath(model_file_path) weights_file_path = os.path.join(base, '%s.hdf5' % hypes['model']['name']) weights_file_path = os.path.abspath(weights_file_path) if not os.path.isfile(model_file_path) or override: if not os.path.isfile(model_file_path): logging.info("Did not find '%s'. Start training...", model_file_path) else: logging.info("Override '%s'. Start training...", model_file_path) # Get data # x_files, y_files = inputs(hypes, None, 'train', data_dir) x_files, y_files = get_file_list(hypes, 'train') x_files, y_files = sklearn.utils.shuffle(x_files, y_files, random_state=0) x_train, y_train = get_traindata_single_file(hypes, x_files[0], y_files[0]) nb_features = x_train[0].shape[0] logging.info("Input gets %i features", nb_features) # Make model model = Sequential() model.add(Dense(64, input_dim=nb_features, init='uniform', activation='sigmoid')) model.add(Dropout(0.5)) model.add(Dense(64, activation='relu')) model.add(Dropout(0.5)) model.add(Dense(1, activation='sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adagrad', # rmsprop metrics=['accuracy']) generator = generate_training_data(hypes, x_files, y_files) t0 = time.time() sep = hypes['solver']['samples_per_epoch'] if True: class_weight = get_class_weight(hypes) logging.info("class_weights = %s", class_weight) model.fit_generator(generator, samples_per_epoch=sep, nb_epoch=hypes['solver']['epochs'], verbose=1, validation_data=(x_train, y_train), class_weight=class_weight) else: logging.info("Fit with .fit") x_train, y_train = inputs(hypes, None, 'train', data_dir) model.fit(x_train, y_train, batch_size=128, nb_epoch=1) t1 = time.time() print("Training Time: %0.4f" % (t1 - t0)) # save as YAML yaml_string = model.to_yaml() with open(model_file_path, 'w') as f: f.write(yaml_string) model.save_weights(weights_file_path) # Evaluate data = get_file_list(hypes, 'test') logging.info("Start segmentation") analyze.evaluate(hypes, data, data_dir, model, elements=[0, 1], get_segmentation=get_segmentation) else: logging.info("## Found '%s'.", model_file_path) with open(model_file_path) as f: yaml_string = f.read() model = model_from_yaml(yaml_string) model.load_weights(weights_file_path) model.compile(optimizer='adagrad', loss='binary_crossentropy') data = get_file_list(hypes, 'test') analyze.evaluate(hypes, data, data_dir, model, elements=[0, 1], get_segmentation=get_segmentation)
def main(hypes_file, data_dir, override): """Orchestrate.""" with open(hypes_file, 'r') as f: hypes = json.load(f) model_file_path = os.path.abspath('%s.pkl' % hypes['model']['name']) color_changes = {0: (0, 0, 0, 0), 1: (0, 255, 0, 127), 'default': (0, 0, 0, 0)} if not os.path.isfile(model_file_path) or override: if not os.path.isfile(model_file_path): logging.info("Did not find '%s'. Start training...", model_file_path) else: logging.info("Override '%s'. Start training...", model_file_path) # Get data # x_files, y_files = inputs(hypes, None, 'train', data_dir) x_files, y_files = get_file_list(hypes, 'train') x_files, y_files = sklearn.utils.shuffle(x_files, y_files, random_state=0) x_train, y_train = get_traindata_single_file(hypes, x_files[0], y_files[0]) nb_features = x_train[0].shape[0] logging.info("Input gets %i features", nb_features) # Make model from sklearn.svm import LinearSVC, SVC from sklearn.tree import DecisionTreeClassifier model = SVC(probability=False, # cache_size=200, kernel="linear", C=2.8, gamma=.0073) model = LinearSVC(C=2.8) model = DecisionTreeClassifier() print("Start fitting. This may take a while") generator = generate_training_data(hypes, x_files, y_files) t0 = time.time() if False: sep = hypes['solver']['samples_per_epoch'] model.fit_generator(generator, samples_per_epoch=sep, nb_epoch=hypes['solver']['epochs'], verbose=1, # callbacks=[callb], validation_data=(x_train, y_train)) else: logging.info("Fit with .fit") x_train, y_train = inputs(hypes, None, 'train', data_dir) print(len(y_train)) model.fit(x_train, y_train) t1 = time.time() print("Training Time: %0.4f" % (t1 - t0)) # save as YAML joblib.dump(model, model_file_path) # Evaluate data = get_file_list(hypes, 'test') logging.info("Start segmentation") analyze.evaluate(hypes, data, data_dir, model, elements=[0, 1], load_label_seg=load_label_seg, color_changes=color_changes, get_segmentation=get_segmentation) else: model = joblib.load(model_file_path) data = get_file_list(hypes, 'test') analyze.evaluate(hypes, data, data_dir, model, elements=[0, 1], load_label_seg=load_label_seg, color_changes=color_changes, get_segmentation=get_segmentation)
def main(hypes_file, out_dir, override):
    """Orchestrate."""
    with open(hypes_file, 'r') as f:
        hypes = json.load(f)

    model_file_path = '%s.yaml' % hypes['model']['name']
    weights_file_path = '%s.hdf5' % hypes['model']['name']

    if not os.path.isfile(model_file_path) or override:
        patch_size = hypes['arch']['patch_size']
        img_channels = hypes['arch']['num_channels']
        nb_out = hypes['arch']['stride']**len(hypes['classes'])

        model = Sequential()
        model.add(Convolution2D(64, 3, 3,
                                border_mode='valid',
                                init='glorot_normal',
                                activation='sigmoid',
                                input_shape=(img_channels,
                                             patch_size,
                                             patch_size)))
        model.add(Convolution2D(32, 3, 3,
                                activation='relu',
                                init='glorot_normal'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.5))
        # model.add(Convolution2D(64, 3, 3, border_mode='same'))
        # model.add(Activation('relu'))
        # model.add(Convolution2D(64, 3, 3))
        # model.add(Activation('relu'))
        # model.add(MaxPooling2D(pool_size=(2, 2)))
        # model.add(Dropout(0.25))
        model.add(Flatten())
        # model.add(Dense(64, activation='sigmoid'))
        # # model.add(Dropout(0.5))
        # model.add(Dense(64, activation='relu'))
        # model.add(Dropout(0.5))
        model.add(Dense(nb_out,
                        activation='sigmoid',
                        init='glorot_normal'))
        model.add(Reshape((hypes['arch']['stride'], hypes['arch']['stride'])))

        # sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
        opt = keras.optimizers.Adadelta(lr=hypes['solver']['learning_rate'],
                                        rho=0.95,
                                        epsilon=1e-08)
        model.compile(loss=hypes['solver']['loss'],
                      optimizer=opt)  # hypes['solver']['optimizer']
        logging.info("model compiled")

        # while 1:
        #     b = generate_batch(hypes, 'train')

        # for e in range(10):
        #     print 'Epoch', e
        #     batches = 0
        #     for X_batch, Y_batch in generate_batch(hypes, 'train'):
        #         Y_batch = np.reshape(Y_batch, (-1, 400))
        #         loss = model.fit(X_batch,
        #                          Y_batch,
        #                          batch_size=hypes['solver']['batch_size'])
        #         print(loss)
        #         batches += 1
        #         if e > 2:
        #             # we need to break the loop by hand because
        #             # the generator loops indefinitely
        #             break
        #
        # Train
        g = generate_batch(hypes, 'train')
        logging.info("generate_batch")
        X_test, Y_test = g.next()
        # print("#" * 80)
        # print(X_test.shape)
        # print(Y_test.shape)
        logging.info("start fit_generator")
        model.fit_generator(generate_batch(hypes, 'train'),
                            samples_per_epoch=hypes['solver']['samples_per_epoch'],
                            nb_epoch=hypes['solver']['epochs'],
                            verbose=1,
                            validation_data=(X_test, Y_test))
        x_files, y_files = get_file_list(hypes, 'train')
        x_files, y_files = sklearn.utils.shuffle(x_files,
                                                 y_files,
                                                 random_state=0)

        # ij = 0
        # for epoch in range(1, hypes['solver']['epochs'] + 1):
        #     print("#" * 80)
        #     print("# Epoch %i" % epoch)
        #     print("#" * 80)
        #     x_files, y_files = sklearn.utils.shuffle(x_files,
        #                                              y_files,
        #                                              random_state=epoch)
        #     for x_train_file, y_train_file in zip(x_files, y_files):
        #         x_train, y_train = get_traindata_single_file(hypes,
        #                                                      x_train_file,
        #                                                      y_train_file)
        #         # Reduce data
        #         # x_train, y_train = reduce_data_equal(x_train,
        #         #                                      y_train)
        #         t0 = time.time()
        #         model.fit(x_train, y_train,
        #                   batch_size=128,
        #                   nb_epoch=1,
        #                   )
        #         ij += 1
        #         print("%i of %i" %
        #               (ij, hypes['solver']['epochs'] * len(x_files)))
        #         t1 = time.time()
        #         print("Training Time: %0.4f" % (t1 - t0))
        print("done with fit_generator")
        # save as YAML
        yaml_string = model.to_yaml()
        with open(model_file_path, 'w') as f:
            f.write(yaml_string)
        model.save_weights(weights_file_path)

        # Evaluate
        data = get_file_list(hypes, 'test')
        analyze.evaluate(hypes,
                         data,
                         out_dir,
                         model,
                         elements=[0, 1],
                         get_segmentation=get_segmentation,
                         verbose=True)
    else:
        with open(model_file_path) as f:
            yaml_string = f.read()
        model = model_from_yaml(yaml_string)
        model.load_weights(weights_file_path)
        model.compile(optimizer=hypes['solver']['optimizer'],
                      loss='binary_crossentropy')
        data = get_file_list(hypes, 'test')
        analyze.evaluate(hypes,
                         data,
                         out_dir,
                         model,
                         elements=[0, 1],
                         get_segmentation=get_segmentation,
                         verbose=True)
def train(data_path, config): with tf.Graph().as_default(), tf.Session() as session: word_to_id_path = os.path.join(data_path, config.vocab_file) with open(word_to_id_path, "rb") as f: word_to_id = pickle.load(f) vocab_size = len(word_to_id) print("Vocab size: %d" % vocab_size) sys.stdout.flush() train_pattern = config.data_pattern.replace("{-type-}", "train") + ".part*" valid_pattern = config.data_pattern.replace("{-type-}", "valid") + ".part*" train_files = get_file_list(config, data_path, train_pattern, "train") valid_files = get_file_list(config, data_path, valid_pattern, "valid") if config.copy_temp: temp_dir = tempfile.mkdtemp() print("Copying data files to %s" % temp_dir) train_files = copy_temp_files(train_files, temp_dir) valid_files = copy_temp_files(valid_files, temp_dir) config.vocab_size = vocab_size train_batcher = PreBatched(train_files, config.batch_size, description="train") if config.use_prebatched \ else QueuedSequenceBatcher(train_files, config.seq_length, config.batch_size, description="train", attns=config.attention) valid_batcher = PreBatched(valid_files, config.batch_size, description="valid") if config.use_prebatched \ else QueuedSequenceBatcher(valid_files, config.seq_length, config.batch_size, description="valid", attns=config.attention) t0 = datetime.datetime.now() initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale) with tf.variable_scope("model", reuse=None, initializer=initializer): m = create_model(config, True) with tf.variable_scope("model", reuse=True, initializer=initializer): mvalid = create_model(config, False) summary_writer = tf.train.SummaryWriter(config.events_path, graph=session.graph) valid_perplexity = PerplexityHook(summary_writer, mvalid, valid_batcher) hooks = [ SpeedHook(summary_writer, config.status_iterations, config.batch_size), LossHook(summary_writer, config.status_iterations), valid_perplexity, SaveModelHook(config.checkpoint_path, 1, config.__dict__, 5) ] t1 = datetime.datetime.now() print("Building models took: %s" % (t1 - t0)) def load_func(): if config.model_path is not None: load_model(session, config.model_path) print("Continuing training from model: %s" % config.model_path) if config.embedding_path is not None: load_variables(session, os.path.join(config.embedding_path, "embedding.tf"), [m.embedding_variable]) print("Loading embedding vectors from: %s" % config.embedding_path) trainer = Trainer(m.optimizer, config.epochs, hooks, m, m.train_op) trainer(train_batcher, m.loss, session, config.learning_rate, config.lr_decay, load_func) saver = tf.train.Saver(tf.trainable_variables()) embedding_saver = tf.train.Saver([m.embedding_variable]) print("Saving model...") out_path = save_model(saver, session, config.save_path, m.predict, config.__dict__) embedding_saver.save(session, os.path.join(out_path, "embedding.tf")) if config.copy_temp: shutil.rmtree(temp_dir)
def get_file_list(self): return get_file_list(self.server.server_directory)
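# Note: the examples above call get_file_list with project-specific signatures
# (a directory plus an optional extension filter, a commit hash, a hypes dict
# with a phase name, and so on), so no single implementation stands behind them.
# A minimal sketch of the common directory-listing variant, assuming it merely
# returns file names filtered by extension, might look like this:
import os

def get_file_list(directory, ext=None):
    """Minimal sketch (an assumption, not any project's actual helper): return
    the names of regular files in `directory`, optionally keeping only those
    whose name ends with `ext`."""
    names = [name for name in os.listdir(directory)
             if os.path.isfile(os.path.join(directory, name))]
    if ext is not None:
        names = [name for name in names if name.endswith(ext)]
    return sorted(names)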