Example 1
def map_file(database: Database, table: str, key: str,
             input_format: Dict[str, Any], output_format: Dict[str, Any],
             offsets: List[int], params: Dict[str, Any]):
  prefix: str = util.key_prefix(key)

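  # With "ranges" set, fetch the pivot ranges for this key and list entries under the key prefix;
  # otherwise enumerate the configured map bucket.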
  if util.is_set(params, "ranges"):
    [bucket_name, key, ranges] = pivot.get_pivot_ranges(table, key)
    items: List[Entry] = database.get_entries(table, prefix)
    keys: List[str] = list(set(map(lambda item: item.key, items)))
  else:
    if "map_bucket_key_prefix" in params:
      items: List[Entry] = database.get_entries(params["map_bucket"], prefix=params["map_bucket_key_prefix"])
      keys: List[str] = list(set(map(lambda item: item.key, items)))
    else:
      items: List[Entry] = database.get_entries(params["map_bucket"])
      if params["directories"]:
        items = list(filter(lambda item: "/" in item.key, items))
        keys: List[str] = list(set(map(lambda item: item.key.split("/")[0], items)))
      else:
        keys: List[str] = list(set(map(lambda item: item.key, items)))

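  # Fan out: invoke the output function once for every distinct target key.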
  file_id = 0
  num_files = len(keys)
  keys.sort()
  for i in range(num_files):
    target_file = keys[i]
    file_id += 1

    payload = {
      "Records": [{
        "s3": {
          "bucket": {
            "name": table,
          },
          "object": {
          },
          "extra_params": {
            "target_bucket": params["map_bucket"],
            "target_file": target_file,
            "prefix": output_format["prefix"],
            "file_id": file_id,
            "num_files": num_files,
          }
        }
      }]
    }

    if params["input_key_value"] == "key":
      payload["Records"][0]["s3"]["object"]["key"] = key
      payload["Records"][0]["s3"]["extra_params"][params["bucket_key_value"]] = target_file
    elif params["bucket_key_value"] == "key":
      payload["Records"][0]["s3"]["object"]["key"] = target_file
      payload["Records"][0]["s3"]["extra_params"][params["input_key_value"]] = key
    else:
      raise Exception("Need to specify field for map key")

    if util.is_set(params, "ranges"):
      payload["Records"][0]["s3"]["extra_params"]["pivots"] = ranges
      payload["Records"][0]["s3"]["pivots"] = ranges

    database.invoke(params["output_function"], payload)
Example 2
    def test_list_dir_no_x(self):
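        # Inspect the permission bits of the fixture directories before listing them.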
        nox_name = self.ch.path_module.join(self.rtd, 'nox')
        notmine_name = self.ch.path_module.join(self.rtd, 'not_mine')
        nox_mode = self.ch.sftp.stat(nox_name).st_mode
        notmine_mode = self.ch.sftp.stat(notmine_name).st_mode

        self.assertTrue(stat.S_ISDIR(nox_mode))
        self.assertTrue(is_set(stat.S_IMODE(nox_mode), stat.S_IXGRP))
        
        self.assertTrue(stat.S_ISDIR(notmine_mode))
        self.assertTrue(is_set(stat.S_IMODE(notmine_mode), stat.S_IXOTH))

        logging.debug('listdir(nox)=%s', self.ch.list_dir(nox_name))
        logging.debug('listdir(notmine)=%s', self.ch.list_dir(notmine_name))
Example 3
    def combine(cls: Any, entries: List[Entry], f: BinaryIO,
                extra: Dict[str, Any]) -> Dict[str, str]:
        if not util.is_set(extra, "sort"):
            return new_line.Iterator.combine(entries, f, extra)

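        # For each sample key, keep only the extra["k"] lowest-scoring neighbours
        # in a min-heap of negated scores (the root is the current worst kept neighbour).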
        top_scores = {}
        for entry in entries:
            items = cls.to_array(entry.get_content())
            for item in items:
                [s, neighbors] = item
                if s not in top_scores:
                    top_scores[s] = list(
                        map(lambda n: (-1 * n[0], n[1]), neighbors))
                    heapq.heapify(top_scores[s])
                else:
                    for neighbor in neighbors:
                        [score, classification] = neighbor
                        if len(
                                top_scores[s]
                        ) < extra["k"] or -1 * score > top_scores[s][0][0]:
                            heapq.heappush(top_scores[s],
                                           (-1 * score, classification))
                        if len(top_scores[s]) > extra["k"]:
                            heapq.heappop(top_scores[s])

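        # Write one output line per sample key: the key followed by ",score class" pairs
        # (scores are un-negated on the way out).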
        keys = list(top_scores.keys())
        for i in range(len(keys)):
            if i > 0:
                f.write(b'\n')
            s = keys[i]
            line = s
            for [score, c] in top_scores[s]:
                line += str.encode(",{0:f} {1:d}".format(-1 * score, c))
            f.write(line)
        return {}
Example 4
def get_loaders(traindir, valdir, sz, bs, fp16=True, val_bs=None, workers=8, rect_val=False, min_scale=0.08, distributed=False, synthetic=False):
    val_bs = val_bs or bs
    train_tfms = [
            transforms.RandomResizedCrop(sz, scale=(min_scale, 1.0)),
            transforms.RandomHorizontalFlip()
    ]
    train_dataset = datasets.ImageFolder(traindir, transforms.Compose(train_tfms))
    train_sampler = (DistributedSampler(train_dataset, num_replicas=env_world_size(), rank=env_rank()) if distributed else None)

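    # Choose the training loader: synthetic batches, a spawn-context DataLoader
    # (when PYTORCH_USE_SPAWN is set), or the default DataLoader.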
    if synthetic:
        print("Using synthetic dataloader")
        train_loader = SyntheticDataLoader(bs, (3, sz, sz))
    elif util.is_set('PYTORCH_USE_SPAWN'):
        print("Using SPAWN method for dataloader")
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=bs, shuffle=(train_sampler is None),
            num_workers=workers, pin_memory=True, collate_fn=fast_collate, 
            sampler=train_sampler,
            multiprocessing_context='spawn')
    else:
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=bs, shuffle=(train_sampler is None),
            num_workers=workers, pin_memory=True, collate_fn=fast_collate, 
            sampler=train_sampler)

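    # Validation data comes with its own batch sampler built by create_validation_set.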
    val_dataset, val_sampler = create_validation_set(valdir, val_bs, sz, rect_val=rect_val, distributed=distributed)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        num_workers=workers, pin_memory=True, collate_fn=fast_collate, 
        batch_sampler=val_sampler)

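    # Wrap both loaders so per-batch transforms (e.g. casting to fp16) happen on the fly.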
    train_loader = BatchTransformDataLoader(train_loader, fp16=fp16)
    val_loader = BatchTransformDataLoader(val_loader, fp16=fp16)

    return train_loader, val_loader, train_sampler, val_sampler
Example 5
    def run_single(self, reads_ref, params):
        """
        Performs a single run of HISAT2 against a single reads reference. The rest of the info
        is taken from the params dict - see the spec for details.
        """
        # 1. Get hisat2 index from genome.
        #    a. If it exists in cache, use that.
        #    b. Otherwise, build it
        idx_prefix = self.build_index(params["genome_ref"])

        # 2. Fetch the reads file and make sure the input params are correct.
        reads = fetch_reads_from_reference(reads_ref["ref"], self.callback_url)
        # if the reads ref came from a different sample set, then we need to drop that
        # reference inside the reads info object so it can be linked in the alignment
        if reads_ref["ref"] != params["sampleset_ref"]:
            reads["sampleset_ref"] = params["sampleset_ref"]
        # make sure condition info carries over if we have it
        if "condition" in reads_ref:
            reads["condition"] = reads_ref["condition"]
        elif "condition" in params:
            reads["condition"] = params["condition"]
        reads["name"] = reads_ref["name"]
        output_file = "accepted_hits"

        # 3. With everything in place, do the alignment and upload the output.
        alignment_file = self.run_hisat2(idx_prefix,
                                         reads,
                                         params,
                                         output_file=output_file)
        alignment_name = reads["name"] + params["alignment_suffix"]
        output_ref = self.upload_alignment(params, reads, alignment_name,
                                           alignment_file)
        alignment_set_ref = None
        if is_set(params["sampleset_ref"], self.workspace_url):
            # alignment_items, alignmentset_name, ws_name
            set_name = get_object_names(
                [params["sampleset_ref"]],
                self.workspace_url)[params["sampleset_ref"]]
            alignment_set_name = set_name + params["alignmentset_suffix"]
            alignment_set_ref = self.upload_alignment_set(
                [{
                    "ref": output_ref,
                    "label": reads["condition"]
                }], alignment_set_name, params["ws_name"])
        alignments = dict()
        alignments[reads_ref["ref"]] = {
            "ref": output_ref,
            "name": alignment_name
        }
        os.remove(reads["file_fwd"])
        if "file_rev" in reads:
            os.remove(reads["file_rev"])
        return (alignments, output_ref, alignment_set_ref)
Example 6
    def combine(cls: Any, entries: List[Entry], f: BinaryIO,
                extra: Dict[str, Any]) -> Dict[str, str]:
        metadata: Dict[str, str] = {}

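        # Sorted merge: materialise every item, order by the identifier field,
        # then serialise the whole batch in one pass.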
        if util.is_set(extra, "sort"):
            items = []
            for i in range(len(entries)):
                it = cls(entries[i], None)
                sub_items = it.get(it.get_start_index(), it.get_end_index())
                items += list(
                    map(
                        lambda item: (it.get_identifier_value(
                            item, extra["identifier"]), item), sub_items))
            items = sorted(items, key=lambda i: i[0])
            items = list(map(lambda i: i[1], items))
            content, metadata = cls.from_array(items, f, extra)
            f.write(content)
        else:
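            # Plain concatenation: append each non-empty entry, adding the delimiter
            # when it is missing and dropping repeated header lines.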
            count = 0
            for i in range(len(entries)):
                entry = entries[i]
                if entry.content_length() == 0:
                    continue
                if count > 0 and cls.delimiter.position == DelimiterPosition.inbetween:
                    f.seek(-1 * len(cls.delimiter.item_token), os.SEEK_END)
                    end: bytes = f.read(len(cls.delimiter.item_token))
                    if end != cls.delimiter.item_token:
                        f.write(cls.delimiter.item_token)
                if cls.options.has_header and count > 0:
                    lines = entry.get_content().split(
                        cls.delimiter.item_token)[1:]
                    content = cls.delimiter.item_token.join(lines)
                    f.write(content)
                else:
                    # TODO: There seems to be a bug where if I do entry.download(f), it's not guaranteed
                    # the entire file will write at the end. I need to figure out why because downloading,
                    # loading into memory and then writing to disk is slower.
                    f.write(entry.get_content())
                count += 1

        return metadata
Example 7
    def _get_flattener(self, obj):

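        # Dispatch on the value's type; each branch returns the callable that knows
        # how to flatten it (or None for unsupported objects).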
        if util.is_primitive(obj):
            return lambda obj: obj

        list_recurse = self._list_recurse

        if util.is_list(obj):
            if self._mkref(obj):
                return list_recurse
            else:
                self._push()
                return self._getref

        # We handle tuples and sets by encoding them in a "(tuple|set)dict"
        if util.is_tuple(obj):
            if not self.unpicklable:
                return list_recurse
            return lambda obj: {tags.TUPLE: [self._flatten(v) for v in obj]}

        if util.is_set(obj):
            if not self.unpicklable:
                return list_recurse
            return lambda obj: {tags.SET: [self._flatten(v) for v in obj]}

        if util.is_dictionary(obj):
            return self._flatten_dict_obj

        if util.is_type(obj):
            return _mktyperef

        if util.is_object(obj):
            return self._ref_obj_instance

        # else, what else? (methods, functions, old style classes...)
        return None
Example 8
def split_file(database: Database, bucket_name: str, key: str,
               input_format: Dict[str, Any], output_format: Dict[str, Any],
               offsets: List[int], params: Dict[str, Any]):
    split_size = params["split_size"]

    input_bucket = bucket_name
    if util.is_set(params, "ranges"):
        if "input_prefix" in params:
            obj = database.get_entries(bucket_name, params["input_prefix"])[0]
            input_key = obj.key
            pivot_key = database.get_entries(bucket_name,
                                             "4/")[0].key  # TODO Unhardcode
            [_, _, ranges] = pivot.get_pivot_ranges(bucket_name, pivot_key,
                                                    params)
        else:
            [input_bucket, input_key,
             ranges] = pivot.get_pivot_ranges(bucket_name, key, params)
            obj = database.get_entry(input_bucket, input_key)
    else:
        input_key = key
        obj = database.get_entry(input_bucket, input_key)

    output_format["ext"] = obj.key.split(".")[-1]
    assert ("ext" not in output_format or output_format["ext"] != "pivot")
    file_id = 1
    content_length: int = obj.content_length()
    #num_files = 10
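    # Ceiling division: the number of split_size chunks needed to cover the object.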
    num_files = (content_length + split_size - 1) // split_size

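    # Invoke the output function once per chunk, each on its own thread, then join them all below.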
    threads = []
    token = "{0:f}-{1:d}".format(output_format["timestamp"],
                                 output_format["nonce"])
    while file_id <= num_files:
        offsets = [(file_id - 1) * split_size,
                   min(content_length, (file_id) * split_size) - 1]
        extra_params = {
            **output_format,
            **{
                "file_id": file_id,
                "num_files": num_files,
                "offsets": offsets,
            }
        }

        if util.is_set(params, "ranges"):
            extra_params["pivots"] = ranges

        payload = database.create_payload(params["bucket"], input_key,
                                          extra_params)
        payload["log"] = [
            token, output_format["prefix"], output_format["bin"],
            output_format["num_bins"], file_id, num_files
        ]

        threads.append(
            threading.Thread(target=database.invoke,
                             args=(params["output_function"], payload)))
        threads[-1].start()
        file_id += 1

    for thread in threads:
        thread.join()
    return True