def map_file(database: Database, table: str, key: str, input_format: Dict[str, Any],
             output_format: Dict[str, Any], offsets: List[int], params: Dict[str, Any]):
    prefix: str = util.key_prefix(key)
    if util.is_set(params, "ranges"):
        [bucket_name, key, ranges] = pivot.get_pivot_ranges(table, key)
        items: List[Entry] = database.get_entries(table, prefix)
        keys: List[str] = list(set(map(lambda item: item.key, items)))
    else:
        if "map_bucket_key_prefix" in params:
            items: List[Entry] = database.get_entries(params["map_bucket"],
                                                       prefix=params["map_bucket_key_prefix"])
            keys: List[str] = list(set(map(lambda item: item.key, items)))
        else:
            items: List[Entry] = database.get_entries(params["map_bucket"])
            if params["directories"]:
                # Map over top-level directories instead of individual files.
                items = list(filter(lambda item: "/" in item.key, items))
                keys: List[str] = list(set(map(lambda item: item.key.split("/")[0], items)))
            else:
                keys: List[str] = list(set(map(lambda item: item.key, items)))

    file_id = 0
    num_files = len(keys)
    keys.sort()
    for i in range(num_files):
        target_file = keys[i]
        file_id += 1
        # Build an S3-style event payload for each mapped file.
        payload = {
            "Records": [{
                "s3": {
                    "bucket": {
                        "name": table,
                    },
                    "object": {},
                    "extra_params": {
                        "target_bucket": params["map_bucket"],
                        "target_file": target_file,
                        "prefix": output_format["prefix"],
                        "file_id": file_id,
                        "num_files": num_files,
                    }
                }
            }]
        }

        # Either the input key or the map target becomes the event's object key;
        # the other is passed through extra_params.
        if params["input_key_value"] == "key":
            payload["Records"][0]["s3"]["object"]["key"] = key
            payload["Records"][0]["s3"]["extra_params"][params["bucket_key_value"]] = target_file
        elif params["bucket_key_value"] == "key":
            payload["Records"][0]["s3"]["object"]["key"] = target_file
            payload["Records"][0]["s3"]["extra_params"][params["input_key_value"]] = key
        else:
            raise Exception("Need to specify field for map key")

        if util.is_set(params, "ranges"):
            payload["Records"][0]["s3"]["extra_params"]["pivots"] = ranges
            payload["Records"][0]["s3"]["pivots"] = ranges

        database.invoke(params["output_function"], payload)
def test_list_dir_no_x(self):
    nox_name = self.ch.path_module.join(self.rtd, 'nox')
    notmine_name = self.ch.path_module.join(self.rtd, 'not_mine')
    nox_mode = self.ch.sftp.stat(nox_name).st_mode
    notmine_mode = self.ch.sftp.stat(notmine_name).st_mode
    self.assertTrue(stat.S_ISDIR(nox_mode))
    self.assertTrue(is_set(stat.S_IMODE(nox_mode), stat.S_IXGRP))
    self.assertTrue(stat.S_ISDIR(notmine_mode))
    self.assertTrue(is_set(stat.S_IMODE(notmine_mode), stat.S_IXOTH))
    logging.debug('listdir(nox)=' + str(self.ch.list_dir(nox_name)))
    logging.debug('listdir(notmine)=' + str(self.ch.list_dir(notmine_name)))
def combine(cls: Any, entries: List[Entry], f: BinaryIO, extra: Dict[str, Any]) -> Dict[str, str]:
    if not util.is_set(extra, "sort"):
        return new_line.Iterator.combine(entries, f, extra)

    # Merge neighbor lists, keeping at most k entries per key in a heap of negated scores.
    top_scores = {}
    for entry in entries:
        items = cls.to_array(entry.get_content())
        for item in items:
            [s, neighbors] = item
            if s not in top_scores:
                top_scores[s] = list(map(lambda n: (-1 * n[0], n[1]), neighbors))
                heapq.heapify(top_scores[s])
            else:
                for neighbor in neighbors:
                    [score, classification] = neighbor
                    if len(top_scores[s]) < extra["k"] or -1 * score > top_scores[s][0][0]:
                        heapq.heappush(top_scores[s], (-1 * score, classification))
                        if len(top_scores[s]) > extra["k"]:
                            heapq.heappop(top_scores[s])

    # Write one line per key: the key followed by ",score classification" pairs.
    keys = list(top_scores.keys())
    for i in range(len(keys)):
        if i > 0:
            f.write(b'\n')
        s = keys[i]
        line = s
        for [score, c] in top_scores[s]:
            line += str.encode(",{0:f} {1:d}".format(-1 * score, c))
        f.write(line)
    return {}
def get_loaders(traindir, valdir, sz, bs, fp16=True, val_bs=None, workers=8,
                rect_val=False, min_scale=0.08, distributed=False, synthetic=False):
    val_bs = val_bs or bs
    train_tfms = [
        transforms.RandomResizedCrop(sz, scale=(min_scale, 1.0)),
        transforms.RandomHorizontalFlip()
    ]
    train_dataset = datasets.ImageFolder(traindir, transforms.Compose(train_tfms))
    train_sampler = (DistributedSampler(train_dataset, num_replicas=env_world_size(), rank=env_rank())
                     if distributed else None)

    if synthetic:
        print("Using synthetic dataloader")
        train_loader = SyntheticDataLoader(bs, (3, sz, sz))
    elif util.is_set('PYTORCH_USE_SPAWN'):
        print("Using SPAWN method for dataloader")
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=bs, shuffle=(train_sampler is None),
            num_workers=workers, pin_memory=True, collate_fn=fast_collate,
            sampler=train_sampler, multiprocessing_context='spawn')
    else:
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=bs, shuffle=(train_sampler is None),
            num_workers=workers, pin_memory=True, collate_fn=fast_collate,
            sampler=train_sampler)

    val_dataset, val_sampler = create_validation_set(valdir, val_bs, sz,
                                                     rect_val=rect_val, distributed=distributed)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, num_workers=workers, pin_memory=True, collate_fn=fast_collate,
        batch_sampler=val_sampler)

    train_loader = BatchTransformDataLoader(train_loader, fp16=fp16)
    val_loader = BatchTransformDataLoader(val_loader, fp16=fp16)
    return train_loader, val_loader, train_sampler, val_sampler
def run_single(self, reads_ref, params):
    """
    Performs a single run of HISAT2 against a single reads reference. The rest of the
    info is taken from the params dict - see the spec for details.
    """
    # 1. Get hisat2 index from genome.
    #    a. If it exists in cache, use that.
    #    b. Otherwise, build it
    idx_prefix = self.build_index(params["genome_ref"])

    # 2. Fetch the reads file and make sure the input params are correct.
    reads = fetch_reads_from_reference(reads_ref["ref"], self.callback_url)
    # if the reads ref came from a different sample set, then we need to drop that
    # reference inside the reads info object so it can be linked in the alignment
    if reads_ref["ref"] != params["sampleset_ref"]:
        reads["sampleset_ref"] = params["sampleset_ref"]
    # make sure condition info carries over if we have it
    if "condition" in reads_ref:
        reads["condition"] = reads_ref["condition"]
    elif "condition" in params:
        reads["condition"] = params["condition"]
    reads["name"] = reads_ref["name"]

    output_file = "accepted_hits"

    # 3. Finally all set, do the alignment and upload the output.
    alignment_file = self.run_hisat2(idx_prefix, reads, params, output_file=output_file)
    alignment_name = reads["name"] + params["alignment_suffix"]
    output_ref = self.upload_alignment(params, reads, alignment_name, alignment_file)

    alignment_set_ref = None
    if is_set(params["sampleset_ref"], self.workspace_url):
        # alignment_items, alignmentset_name, ws_name
        set_name = get_object_names(
            [params["sampleset_ref"]], self.workspace_url)[params["sampleset_ref"]]
        alignment_set_name = set_name + params["alignmentset_suffix"]
        alignment_set_ref = self.upload_alignment_set(
            [{"ref": output_ref, "label": reads["condition"]}],
            alignment_set_name, params["ws_name"])

    alignments = dict()
    alignments[reads_ref["ref"]] = {
        "ref": output_ref,
        "name": alignment_name
    }
    os.remove(reads["file_fwd"])
    if "file_rev" in reads:
        os.remove(reads["file_rev"])
    return (alignments, output_ref, alignment_set_ref)
def combine(cls: Any, entries: List[Entry], f: BinaryIO, extra: Dict[str, Any]) -> Dict[str, str]:
    metadata: Dict[str, str] = {}
    if util.is_set(extra, "sort"):
        # Sort items from all entries by their identifier before writing.
        items = []
        for i in range(len(entries)):
            it = cls(entries[i], None)
            sub_items = it.get(it.get_start_index(), it.get_end_index())
            items += list(
                map(lambda item: (it.get_identifier_value(item, extra["identifier"]), item),
                    sub_items))
        items = sorted(items, key=lambda i: i[0])
        items = list(map(lambda i: i[1], items))
        content, metadata = cls.from_array(items, f, extra)
        f.write(content)
    else:
        # Concatenate entries, ensuring a delimiter separates consecutive entries
        # and that only the first entry keeps its header.
        count = 0
        for i in range(len(entries)):
            entry = entries[i]
            if entry.content_length() == 0:
                continue
            if count > 0 and cls.delimiter.position == DelimiterPosition.inbetween:
                f.seek(-1 * len(cls.delimiter.item_token), os.SEEK_END)
                end: str = f.read(len(cls.delimiter.item_token))
                if end != cls.delimiter.item_token:
                    f.write(cls.delimiter.item_token)
            if cls.options.has_header and count > 0:
                lines = entry.get_content().split(cls.delimiter.item_token)[1:]
                content = cls.delimiter.item_token.join(lines)
                f.write(content)
            else:
                # TODO: There seems to be a bug where if I do entry.download(f), it's not guaranteed
                # the entire file will write at the end. I need to figure out why because downloading,
                # loading into memory and then writing to disk is slower.
                f.write(entry.get_content())
            count += 1
    return metadata
def _get_flattener(self, obj):
    if util.is_primitive(obj):
        return lambda obj: obj

    list_recurse = self._list_recurse

    if util.is_list(obj):
        if self._mkref(obj):
            return list_recurse
        else:
            self._push()
            return self._getref

    # We handle tuples and sets by encoding them in a "(tuple|set)dict"
    if util.is_tuple(obj):
        if not self.unpicklable:
            return list_recurse
        return lambda obj: {tags.TUPLE: [self._flatten(v) for v in obj]}

    if util.is_set(obj):
        if not self.unpicklable:
            return list_recurse
        return lambda obj: {tags.SET: [self._flatten(v) for v in obj]}

    if util.is_dictionary(obj):
        return self._flatten_dict_obj

    if util.is_type(obj):
        return _mktyperef

    if util.is_object(obj):
        return self._ref_obj_instance

    # else, what else? (methods, functions, old style classes...)
    return None
def split_file(database: Database, bucket_name: str, key: str, input_format: Dict[str, Any],
               output_format: Dict[str, Any], offsets: List[int], params: Dict[str, Any]):
    split_size = params["split_size"]
    input_bucket = bucket_name
    if util.is_set(params, "ranges"):
        if "input_prefix" in params:
            obj = database.get_entries(bucket_name, params["input_prefix"])[0]
            input_key = obj.key
            pivot_key = database.get_entries(bucket_name, "4/")[0].key  # TODO: Unhardcode
            [_, _, ranges] = pivot.get_pivot_ranges(bucket_name, pivot_key, params)
        else:
            [input_bucket, input_key, ranges] = pivot.get_pivot_ranges(bucket_name, key, params)
            obj = database.get_entry(input_bucket, input_key)
    else:
        input_key = key
        obj = database.get_entry(input_bucket, input_key)

    output_format["ext"] = obj.key.split(".")[-1]
    assert ("ext" not in output_format or output_format["ext"] != "pivot")

    file_id = 1
    content_length: int = obj.content_length()
    # num_files = 10
    num_files = int((content_length + split_size - 1) / split_size)
    threads = []
    token = "{0:f}-{1:d}".format(output_format["timestamp"], output_format["nonce"])
    while file_id <= num_files:
        # Byte range handled by this split.
        offsets = [(file_id - 1) * split_size,
                   min(content_length, file_id * split_size) - 1]
        extra_params = {
            **output_format,
            **{
                "file_id": file_id,
                "num_files": num_files,
                "offsets": offsets,
            }
        }
        if util.is_set(params, "ranges"):
            extra_params["pivots"] = ranges
        payload = database.create_payload(params["bucket"], input_key, extra_params)
        payload["log"] = [
            token, output_format["prefix"], output_format["bin"],
            output_format["num_bins"], file_id, num_files
        ]
        # Invoke the output function for each split in its own thread.
        threads.append(
            threading.Thread(target=database.invoke,
                             args=(params["output_function"], payload)))
        threads[-1].start()
        file_id += 1

    for thread in threads:
        thread.join()
    return True