def _run(*, bamfile: Tuple[str], gtffile: str, bcfile: str, outputfolder: str,
         sampleid: str, metadatatable: str, repmask: str, onefilepercell: bool,
         logic: str, without_umi: str, umi_extension: str, multimap: bool,
         test: bool, samtools_threads: int, samtools_memory: int,
         loom_numeric_dtype: str, dump: bool, verbose: int,
         additional_ca: dict = None) -> None:
    """Runs the velocity analysis outputing a loom file

    BAMFILE or [BAMFILES] one or several bam files with position-sorted

    GTFFILE annotation file

    NOTE: it is keyword only argument function

    Parameters
    ----------
    bamfile: Tuple[str]
        One or more position-sorted .bam/.sam input files.
    gtffile: str
        Path of the GTF annotation file.
    bcfile: str
        Path of the valid-barcodes file (optionally gzipped), or None to
        determine the barcodes while reading the bam file.
    outputfolder: str
        Where the .loom file is written; defaults to a ``velocyto`` folder
        next to the first bam file.
    sampleid: str
        Base name of the output file; auto-generated when None.
    metadatatable: str
        Sample sheet used to add per-sample column attributes (requires a
        valid sampleid).
    repmask: str
        Optional repeat-mask annotation to exclude.
    onefilepercell: bool
        Interpret each bam file as a different cell.
    logic: str
        Name of a ``vcy.Logic`` subclass used for the counting.
    without_umi: str
        The data has no UMIs; overrides ``umi_extension``.
    umi_extension: str
        UMI extension strategy passed to the counter.
    multimap: bool
        Also consider multi-mapped reads.
    test: bool
        Developer-only shortcut that pickles/reloads the counter state.
    samtools_threads, samtools_memory: int
        Upper bounds for the resources given to ``samtools sort``.
    loom_numeric_dtype: str
        dtype used for the loom layers.
    dump: bool
        Enable the counter's molecule dump option.
    verbose: int
        0-3, mapped to ERROR/WARNING/INFO/DEBUG.
    additional_ca: dict
        Extra column attributes merged into the loom file. Defaults to an
        empty dict (a fresh one per call — avoids the shared mutable-default
        pitfall of the previous ``additional_ca: dict = {}`` signature).
    """
    if additional_ca is None:
        additional_ca = {}

    ########################
    #    Resolve Inputs    #
    ########################
    logging.basicConfig(
        stream=sys.stdout,
        format='%(asctime)s - %(levelname)s - %(message)s',
        level=[logging.ERROR, logging.WARNING, logging.INFO,
               logging.DEBUG][verbose])

    # Distinguish "several bam files" from "a single bam file"; anything else
    # means the CLI layer handed us something unexpected.
    if isinstance(bamfile, tuple) and len(bamfile) > 1 and bamfile[-1][-4:] in [".bam", ".sam"]:
        multi = True
    elif isinstance(bamfile, tuple) and len(bamfile) == 1:
        multi = False
    else:
        raise IOError(f"Something went wrong in the argument parsing. You passed as bamfile: {bamfile}")

    if onefilepercell and multi:
        if bcfile is not None:
            raise ValueError("Inputs incompatibility. --bcfile/-b option was used together with --onefilepercell/-c option.")
        logging.warning("Each bam file will be interpreted as a DIFFERENT cell")
    elif not onefilepercell and multi:
        logging.warning("Several input files but --onefilepercell is False. Each bam file will be interpreted as containing a SET of cells!!!")

    if sampleid is None:
        assert metadatatable is None, "--metadatatable was specified but cannot fetch sample metadata without valid sampleid"
        if multi:
            logging.warning(f"When using mutliple files you may want to use --sampleid option to specify the name of the output file")
        # Build a sample id from the input file name(s) plus a short random
        # suffix so repeated runs do not overwrite each other.
        if multi and not onefilepercell:
            full_name = "_".join([os.path.basename(bamfile[i]).split(".")[0] for i in range(len(bamfile))])
            if len(full_name) > 50:
                sampleid = f'multi_input_{os.path.basename(bamfile[0]).split(".")[0]}_{id_generator(5)}'
            else:
                sampleid = f'multi_input_{full_name}_and_others_{id_generator(5)}'
        elif multi and onefilepercell:
            sampleid = f'onefilepercell_{os.path.basename(bamfile[0]).split(".")[0]}_and_others_{id_generator(5)}'
        else:
            sampleid = f'{os.path.basename(bamfile[0]).split(".")[0]}_{id_generator(5)}'
        logging.info(f"No SAMPLEID specified, the sample will be called {sampleid} (last 5 digits are a random-id to avoid overwriting some other file by mistake)")

    # Create an output folder inside the cell ranger output folder
    if outputfolder is None:
        outputfolder = os.path.join(os.path.split(bamfile[0])[0], "velocyto")
        logging.info(f"No OUTPUTFOLDER specified, find output files inside {outputfolder}")
    if not os.path.exists(outputfolder):
        os.mkdir(outputfolder)

    logic_class = getattr(vcy, logic)
    if not issubclass(logic_class, vcy.Logic):
        raise ValueError(f"{logic} is not a valid logic. Choose one among {', '.join([k for k, v in vcy.logic.__dict__.items() if issubclass(v, vcy.Logic)])}")
    else:
        logging.debug(f"Using logic: {logic}")
        logic_obj = logic_class()

    if bcfile is None:
        logging.debug("Cell barcodes will be determined while reading the .bam file")
        valid_bcset = None
    else:
        # Get valid cell barcodes (context managers added so the handle is
        # closed; behavior otherwise unchanged)
        if bcfile.endswith(".gz"):
            with gzip.open(bcfile) as bc_fh:
                bc_content = bc_fh.read().decode()
        else:
            with open(bcfile) as bc_fh:
                bc_content = bc_fh.read()
        valid_bcs_list = bc_content.rstrip().split()
        valid_cellid_list = np.array([f"{sampleid}:{v_bc}" for v_bc in valid_bcs_list])  # with sample id and with -1
        if len(set(bc.split('-')[0] for bc in valid_bcs_list)) == 1:
            gem_grp = f"-{valid_bcs_list[0].split('-')[-1]}"
        else:
            gem_grp = "x"
        valid_bcset = set(bc.split('-')[0] for bc in valid_bcs_list)  # without -1
        logging.info(f"Read {len(valid_bcs_list)} cell barcodes from {bcfile}")
        logging.debug(f"Example of barcode: {valid_bcs_list[0].split('-')[0]} and cell_id: {valid_cellid_list[0]}")

    # Get metadata from sample sheet
    if metadatatable:
        try:
            sample_metadata = vcy.MetadataCollection(metadatatable)
            sample = sample_metadata.where("SampleID", sampleid)
            if len(sample) == 0:
                logging.error(f"Sample ID {sampleid} not found in sample sheet")
                # schema = []  # type: List
                sample = {}
            elif len(sample) > 1:
                logging.error(f"Sample ID {sampleid} has multiple lines in sample sheet")
                sys.exit(1)
            else:
                # schema = sample[0].types
                sample = sample[0].dict
            logging.debug(f"Collecting column attributes from {metadatatable}")
        except (NameError, TypeError):
            # logging.warn is a deprecated alias; use logging.warning
            logging.warning("SAMPLEFILE was not specified. add -s SAMPLEFILE to add metadata.")
            sample = {}
    else:
        sample = {}

    ########################
    #     Start Analysis   #
    ########################

    # Initialize Exon-Intron Counter with the logic and valid barcodes (need to do it now to peek)
    if without_umi:
        if umi_extension != "no":
            logging.warning("--umi-extension was specified but incompatible with --without-umi, it will be ignored!")
        umi_extension = "without_umi"
    exincounter = vcy.ExInCounter(sampleid=sampleid, logic=logic_class, valid_bcset=valid_bcset,
                                  umi_extension=umi_extension, onefilepercell=onefilepercell,
                                  dump_option=dump, outputfolder=outputfolder)

    # Heuristic to chose the memory/cpu effort
    try:
        mb_available = int(subprocess.check_output('grep MemAvailable /proc/meminfo'.split()).split()[1]) / 1000
    except subprocess.CalledProcessError:
        logging.warning("Your system does not support calling `grep MemAvailable /proc/meminfo` so the memory effort for the samtools command could not be chosen appropriately. 32Gb will be assumed")
        mb_available = 32000  # 32Gb (comment previously said 64Gb — the value is 32000 MB)
    threads_to_use = min(samtools_threads, multiprocessing.cpu_count())
    mb_to_use = int(min(samtools_memory, mb_available / (len(bamfile) * threads_to_use)))
    compression = vcy.BAM_COMPRESSION

    # I need to peek into the bam file to know wich cell barcode flag should be used
    if onefilepercell and without_umi:
        tagname = "NOTAG"
    elif onefilepercell:
        logging.debug("The multi input option ")
        tagname = "NOTAG"
        exincounter.peek_umi_only(bamfile[0])
    else:
        exincounter.peek(bamfile[0])
        tagname = exincounter.cellbarcode_str

    if multi and onefilepercell:
        bamfile_cellsorted = list(bamfile)
    elif onefilepercell:
        bamfile_cellsorted = [bamfile[0]]
    else:
        bamfile_cellsorted = [f"{os.path.join(os.path.dirname(bmf), 'cellsorted_' + os.path.basename(bmf))}" for bmf in bamfile]

    # BUGFIX: check_end_process used to be (re)assigned on every loop iteration,
    # so it reflected only the LAST file. With a mix of already-sorted and
    # to-be-sorted files ending on an existing one, started subprocesses were
    # never awaited. It now becomes True as soon as any sort is started.
    check_end_process = False
    sorting_process: Dict[int, Any] = {}
    for ni, bmf_cellsorted in enumerate(bamfile_cellsorted):
        # Start a subprocess that sorts the bam file
        command = f"samtools sort -l {compression} -m {mb_to_use}M -t {tagname} -O BAM -@ {threads_to_use} -o {bmf_cellsorted} {bamfile[ni]}"
        if os.path.exists(bmf_cellsorted):
            # This should skip sorting in smartseq2
            logging.warning(f"The file {bmf_cellsorted} already exists. The sorting step will be skipped and the existing file will be used.")
        else:
            sorting_process[ni] = subprocess.Popen(command.split(), stdout=subprocess.PIPE)
            logging.info(f"Starting the sorting process of {bamfile[ni]} the output will be at: {bmf_cellsorted}")
            logging.info(f"Command being run is: {command}")
            logging.info(f"While the bam sorting happens do other things...")
            check_end_process = True

    # Load annotations
    logging.info(f"Load the annotation from {gtffile}")
    annotations_by_chrm_strand = exincounter.read_transcriptmodels(gtffile)
    chrs = list(v for k, v in annotations_by_chrm_strand.items())
    tms = list(itertools.chain.from_iterable((v.values() for v in chrs)))
    ivls = list(itertools.chain.from_iterable(tms))
    logging.debug(f"Generated {len(ivls)} features corresponding to {len(tms)} transcript models from {gtffile}")
    del chrs, tms, ivls

    # Load annotations
    if repmask is not None:
        logging.info(f"Load the repeat masking annotation from {repmask}")
        mask_ivls_by_chromstrand = exincounter.read_repeats(repmask)

    # Go through the bam files a first time to markup introns
    logging.info(f"Scan {' '.join(bamfile)} to validate intron intervals")
    if test:  # NOTE: Remove this after finishing testing, the only purpuso was to save 15min in the debugging process
        logging.warning("This place is for developer only!")
        import pickle
        if os.path.exists("exincounter_dump.pickle"):
            logging.debug("exincounter_dump.pickle is being loaded")
            with open("exincounter_dump.pickle", "rb") as pkl_fh:
                exincounter = pickle.load(pkl_fh)
        else:
            logging.debug("exincounter_dump.pickle was not found")
            logging.debug("Dumping exincounter_dump.pickle BEFORE markup")
            with open("exincounter_dump.pickle", "wb") as pkl_fh:
                pickle.dump(exincounter, pkl_fh)
            exincounter.mark_up_introns(bamfile=bamfile, multimap=multimap)
    else:
        exincounter.mark_up_introns(bamfile=bamfile, multimap=multimap)

    # Wait for child process to terminate
    if check_end_process:
        logging.info(f"Now just waiting that the bam sorting process terminates")
        for k in sorting_process.keys():
            returncode = sorting_process[k].wait()
            if returncode == 0:
                logging.info(f"bam file #{k} has been sorted")
            else:
                raise MemoryError(f"bam file #{k} could not be sorted by cells.\n\
This is probably related to an old version of samtools, please install samtools >= 1.6.\
In alternative this could be a memory error, try to set the --samtools_memory option to a value compatible with your system. \
Otherwise sort manually by samtools ``sort -l [compression] -m [mb_to_use]M -t [tagname] -O BAM -@ [threads_to_use] -o cellsorted_[bamfile] [bamfile]``")

    # Do the actual counting
    logging.debug("Start molecule counting!")
    results = exincounter.count(bamfile_cellsorted, multimap=multimap)  # NOTE: we would avoid some millions of if statements evaluations if we write two function count and count_with output
    dict_list_arrays, cell_bcs_order = results

    ########################
    #         Output       #
    ########################

    # Prepare the loom file output
    if not exincounter.filter_mode:
        valid_bcset = exincounter.valid_bcset  # without -1
        valid_bcs_list = list(valid_bcset)  # without -1
        gem_grp = ""
        valid_cellid_list = np.array([f"{sampleid}:{v_bc}" for v_bc in valid_bcs_list])  # with sampleid and with -1
        logging.debug(f"Example of barcode: {valid_bcs_list[0]} and cell_id: {valid_cellid_list[0]}")

    ca = {"CellID": np.array([f"{sampleid}:{v_bc}{gem_grp}" for v_bc in cell_bcs_order])}
    ca.update(additional_ca)

    for key, value in sample.items():
        ca[key] = np.full(len(cell_bcs_order), value)

    # Save to loom file
    outfile = os.path.join(outputfolder, f"{sampleid}.loom")
    logging.debug(f"Generating output file {outfile}")

    # row attributes: (loom column name, Gene attribute name, dtype)
    atr_table = (("Gene", "genename", str),
                 ("Accession", "geneid", str),
                 ("Chromosome", "chrom", str),
                 ("Strand", "strand", str),
                 ("Start", "start", int),
                 ("End", "end", int))

    logging.debug("Collecting row attributes")
    ra = {}
    for name_col_attr, name_obj_attr, dtyp in atr_table:
        tmp_array = np.zeros((len(exincounter.genes),), dtype=object)  # type: np.ndarray
        for gene_id, gene_info in exincounter.genes.items():
            tmp_array[exincounter.geneid2ix[gene_id]] = getattr(gene_info, name_obj_attr)
        ra[name_col_attr] = tmp_array.astype(dtyp)

    logging.debug("Generating data table")
    layers: Dict[str, np.ndarray] = {}
    for layer_name in logic_obj.layers:
        layers[layer_name] = np.concatenate(dict_list_arrays[layer_name], axis=1)
        del dict_list_arrays[layer_name]

    # Sum all layers into `total`. (Replaces the previous fragile idiom of
    # relying on an UnboundLocalError/NameError on the first iteration.)
    total = None  # type: np.ndarray
    for layer_name in logic_obj.layers:
        if total is None:
            total = np.array(layers[layer_name])
        else:
            total += layers[layer_name]

    logging.debug("Writing loom file")
    try:
        ds = loompy.create(filename=outfile, matrix=total, row_attrs=ra, col_attrs=ca, dtype="float32")
        for layer_name in logic_obj.layers:
            ds.set_layer(name=layer_name, matrix=layers[layer_name], dtype=loom_numeric_dtype)
        ds.attrs["velocyto.__version__"] = vcy.__version__
        ds.attrs["velocyto.logic"] = logic
        ds.close()
    except TypeError:
        # If user is using loompy2
        # NOTE maybe this is not super efficient if the type and order are already correct
        tmp_layers = {"": total.astype("float32", order="C", copy=False)}
        tmp_layers.update({layer_name: layers[layer_name].astype(loom_numeric_dtype, order="C", copy=False) for layer_name in logic_obj.layers})
        loompy.create(filename=outfile, layers=tmp_layers, row_attrs=ra, col_attrs=ca,
                      file_attrs={"velocyto.__version__": vcy.__version__, "velocyto.logic": logic})
    logging.debug("Terminated Succesfully!")
def _run(*, bamfile: str, gtffile: str, bcfile: str, outputfolder: str,
         sampleid: str, metadatatable: str, repmask: str, logic: str,
         molrep: bool, multimap: bool, test: bool, samtools_threads: int,
         samtools_memory: int, additional_ca: dict = None) -> None:
    """Runs the velocity analysis outputing a loom file

    BAMFILE bam file with sorted reads

    GTFFILE annotation file

    NOTE: it is keyword only argument function

    ``additional_ca`` holds extra column attributes merged into the loom file;
    it now defaults to None (→ fresh empty dict per call) instead of a shared
    mutable ``{}`` default.
    """
    if additional_ca is None:
        additional_ca = {}

    ########################
    #    Resolve Inputs    #
    ########################

    if sampleid is None:
        assert metadatatable is None, "Cannot fetch sample metadata without valid sampleid"
        # Random 5-char suffix avoids overwriting a previous run's output.
        sampleid = f'{os.path.basename(bamfile).split(".")[0]}_{id_generator(5)}'
        logging.debug(f"No SAMPLEID specified, the sample will be called {sampleid}")

    # Create an output folder inside the cell ranger output folder
    if outputfolder is None:
        outputfolder = os.path.join(os.path.split(bamfile)[0], "velocyto")
        logging.debug(f"No OUTPUTFOLDER specified, find output files inside {outputfolder}")
    if not os.path.exists(outputfolder):
        os.mkdir(outputfolder)

    # Here logic_obj is the Logic *subclass* itself (it is passed uninstantiated
    # to ExInCounter below).
    logic_obj = getattr(vcy, logic)
    if not issubclass(logic_obj, vcy.Logic):
        # "Chose" → "Choose": typo fix in the user-facing error message
        raise ValueError(f"{logic} is not a valid logic. Choose one among {', '.join([k for k, v in vcy.logic.__dict__.items() if issubclass(v, vcy.Logic)])}")
    else:
        logging.debug(f"Using logic: {logic}")

    if bcfile is None:
        logging.debug("Cell barcodes will be determined while reading the .bam file")
        valid_bcset = None
    else:
        # Get valid cell barcodes (with-block closes the handle)
        with open(bcfile) as bc_fh:
            valid_bcs_list = bc_fh.read().rstrip().split()
        valid_cellid_list = np.array([f"{sampleid}:{v_bc}" for v_bc in valid_bcs_list])  # with sample id and with -1
        if len(set(bc.split('-')[0] for bc in valid_bcs_list)) == 1:
            gem_grp = f"-{valid_bcs_list[0].split('-')[-1]}"
        else:
            gem_grp = "x"
        valid_bcset = set(bc.split('-')[0] for bc in valid_bcs_list)  # without -1
        logging.debug(f"Read {len(valid_bcs_list)} cell barcodes from {bcfile}")
        logging.debug(f"Example of barcode: {valid_bcs_list[0].split('-')[0]} and cell_id: {valid_cellid_list[0]}")

    # Get metadata from sample sheet
    if metadatatable:
        try:
            sample_metadata = vcy.MetadataCollection(metadatatable)
            sample = sample_metadata.where("SampleID", sampleid)
            if len(sample) == 0:
                logging.error(f"Sample ID {sampleid} not found in sample sheet")
                # schema = []  # type: List
                sample = {}
            elif len(sample) > 1:
                logging.error(f"Sample ID {sampleid} has multiple lines in sample sheet")
                sys.exit(1)
            else:
                # schema = sample[0].types
                sample = sample[0].dict
            logging.debug(f"Collecting column attributes from {metadatatable}")
        except (NameError, TypeError):
            # logging.warn is a deprecated alias; use logging.warning
            logging.warning("SAMPLEFILE was not specified. add -s SAMPLEFILE to add metadata.")
            sample = {}
    else:
        sample = {}

    ########################
    #     Start Analysis   #
    ########################

    # Initialize Exon-Intron Counter with the logic and valid barcodes (need to do it now to peek)
    exincounter = vcy.ExInCounter(logic_obj, valid_bcset)

    # Heuristic to chose the memory/cpu effort
    # NOTE(review): Linux-only; raises on systems without /proc/meminfo
    # (the newer _run variant guards this with try/except) — confirm targets.
    mb_available = int(subprocess.check_output('grep MemAvailable /proc/meminfo'.split()).split()[1]) / 1000
    threads_to_use = min(samtools_threads, multiprocessing.cpu_count())
    mb_to_use = int(min(samtools_memory, mb_available / threads_to_use))
    compression = vcy.BAM_COMPRESSION

    # I need to peek into the bam file to know wich cell barcode flag should be used
    exincounter.peek(bamfile)
    tagname = exincounter.cellbarcode_str

    bamfile_cellsorted = f"{os.path.join(os.path.dirname(bamfile), 'cellsorted_' + os.path.basename(bamfile))}"
    # Start a subprocess that sorts the bam file
    command = f"samtools sort -l {compression} -m {mb_to_use}M -t {tagname} -O BAM -@ {threads_to_use} -o {bamfile_cellsorted} {bamfile}"
    if os.path.exists(bamfile_cellsorted):
        logging.warning(f"The file {bamfile_cellsorted} already exists. The sorting step will be skipped and the existing file will be used.")
        check_end_process = False
    else:
        sorting_process = subprocess.Popen(command.split(), stdout=subprocess.PIPE)
        logging.info(f"Starting the sorting process of {bamfile} the output will be at: {bamfile_cellsorted}")
        logging.info(f"Command being run is: {command}")
        logging.info(f"While the bam sorting happens do other things...")
        check_end_process = True

    # Load annotations
    logging.info(f"Load the annotation from {gtffile}")
    annotations_by_chrm_strand = exincounter.read_transcriptmodels(gtffile)
    chrs = list(v for k, v in annotations_by_chrm_strand.items())
    tms = list(itertools.chain.from_iterable((v.values() for v in chrs)))
    ivls = list(itertools.chain.from_iterable(tms))
    logging.debug(f"Generated {len(ivls)} features corresponding to {len(tms)} transcript models from {gtffile}")
    del chrs, tms, ivls

    # Load annotations
    if repmask is not None:
        logging.info(f"Load the repeat masking annotation from {repmask}")
        mask_ivls_by_chromstrand = exincounter.read_repeats(repmask)

    # Go through the sam files a first time to markup introns
    logging.info(f"Scan {bamfile} to validate intron intervals")
    if test:  # NOTE: Remove this after finishing testing, the only purpuso was to save 15min in the debugging process
        import pickle
        if os.path.exists("exincounter_dump.pickle"):
            logging.debug("exincounter_dump.pickle is being loaded")
            with open("exincounter_dump.pickle", "rb") as pkl_fh:
                exincounter = pickle.load(pkl_fh)
        else:
            logging.debug("exincounter_dump.pickle was not found")
            logging.debug("Dumping exincounter_dump.pickle BEFORE markup")
            with open("exincounter_dump.pickle", "wb") as pkl_fh:
                pickle.dump(exincounter, pkl_fh)
            exincounter.mark_up_introns(bamfile=bamfile, multimap=multimap)
    else:
        exincounter.mark_up_introns(bamfile=bamfile, multimap=multimap)

    # Wait for child process to terminate
    if check_end_process:
        logging.info(f"Now just waiting that the bam sorting process terminates")
        sorting_process.wait()
        logging.info(f"bam file has been sorted")

    # Do the actual counting
    logging.debug("Start molecule counting!")
    results = exincounter.count(bamfile_cellsorted, multimap=multimap, molecules_report=molrep)  # NOTE: we would avoid some millions of if statements evalution if we write two function count and count_with output
    list_spliced_arrays, list_unspliced_arrays, list_ambiguous_arrays, cell_bcs_order = results

    ########################
    #         Output       #
    ########################

    # Prepare the loom file output
    if not exincounter.filter_mode:
        valid_bcset = exincounter.valid_bcset  # without -1
        valid_bcs_list = list(valid_bcset)  # without -1
        gem_grp = ""
        valid_cellid_list = np.array([f"{sampleid}:{v_bc}" for v_bc in valid_bcs_list])  # with sampleid and with -1
        logging.debug(f"Example of barcode: {valid_bcs_list[0]} and cell_id: {valid_cellid_list[0]}")

    ca = {"CellID": np.array([f"{sampleid}:{v_bc}{gem_grp}" for v_bc in cell_bcs_order])}
    ca.update(additional_ca)
    for key, value in sample.items():
        ca[key] = np.array([value] * len(cell_bcs_order))

    # Save to loom file
    outfile = os.path.join(outputfolder, f"{sampleid}.loom")
    logging.debug(f"Generating output file {outfile}")

    # row attributes: (loom column name, Gene attribute name, dtype)
    atr_table = (("Gene", "genename", str),
                 ("Accession", "geneid", str),
                 ("Chromosome", "chrom", str),
                 ("Strand", "strand", str),
                 ("Start", "start", int),
                 ("End", "end", int))

    logging.debug("Collecting row attributes")
    ra = {}
    for name_col_attr, name_obj_attr, dtyp in atr_table:
        tmp_array = np.zeros((len(exincounter.genes),), dtype=object)  # type: np.ndarray
        for gene_id, gene_info in exincounter.genes.items():
            tmp_array[exincounter.geneid2ix[gene_id]] = getattr(gene_info, name_obj_attr)
        ra[name_col_attr] = tmp_array.astype(dtyp)

    logging.debug("Generating data table")
    # Concatenate per-chunk arrays along cells; free the lists as soon as
    # each matrix is built to limit peak memory.
    spliced = np.concatenate(list_spliced_arrays, axis=1)
    del list_spliced_arrays
    unspliced = np.concatenate(list_unspliced_arrays, axis=1)
    del list_unspliced_arrays
    ambiguous = np.concatenate(list_ambiguous_arrays, axis=1)
    del list_ambiguous_arrays
    total = spliced + unspliced + ambiguous

    logging.debug("Writing loom file")
    try:
        ds = loompy.create(filename=outfile, matrix=total, row_attrs=ra, col_attrs=ca, dtype="float32")
        ds.set_layer(name="spliced", matrix=spliced, dtype=vcy.LOOM_NUMERIC_DTYPE)
        ds.set_layer(name="unspliced", matrix=unspliced, dtype=vcy.LOOM_NUMERIC_DTYPE)
        ds.set_layer(name="ambiguous", matrix=ambiguous, dtype=vcy.LOOM_NUMERIC_DTYPE)
        ds.attrs["velocyto.__version__"] = vcy.__version__
        ds.close()
    except TypeError:
        # If user is using loompy2
        loompy.create(filename=outfile,
                      layers={"": total.astype("float32", order="C", copy=False),
                              "spliced": spliced.astype(vcy.LOOM_NUMERIC_DTYPE, order="C", copy=False),
                              "unspliced": unspliced.astype(vcy.LOOM_NUMERIC_DTYPE, order="C", copy=False),
                              "ambiguous": ambiguous.astype(vcy.LOOM_NUMERIC_DTYPE, order="C", copy=False)},
                      row_attrs=ra, col_attrs=ca,
                      file_attrs={"velocyto.__version__": vcy.__version__})
    logging.debug("Terminated Succesfully!")
def _run(bamfile: str, ivlfile: str, bcfile: str, outputfolder: str,
         sampleid: str, metadatatable: str, repmask: str, debug: bool,
         additional_ca: dict = None) -> None:
    """Runs the velocity analysis outputing a loom file

    BAMFILE bam file with sorted reads

    IVLFILE text file generated by velocyto extract_intervals

    ``additional_ca`` holds extra column attributes merged into the loom file;
    it now defaults to None (→ fresh empty dict per call) instead of a shared
    mutable ``{}`` default.
    """
    if additional_ca is None:
        additional_ca = {}

    split_sam_flag = debug
    if sampleid is None:
        assert metadatatable is None, "Cannot fetch sample metadata without valid sampleid"
        # Random 5-char suffix avoids overwriting a previous run's output.
        sampleid = f'{os.path.basename(bamfile).split(".")[0]}_{id_generator(5)}'
        logging.debug(f"No SAMPLEID specified, the sample will be called {sampleid}")

    # Create an output folder inside the cell ranger output folder
    if outputfolder is None:
        outputfolder = os.path.join(os.path.split(bamfile)[0], "velocyto")
        logging.debug(f"No OUTPUTFOLDER specified, find output files inside {outputfolder}")
    if not os.path.exists(outputfolder):
        os.mkdir(outputfolder)

    if bcfile is None:
        logging.debug("Cell barcodes will be determined while reading the .bam file")
        # BUGFIX: valid_bcs2idx was left undefined on this path, so the
        # vcy.ExInCounter(valid_bcs2idx) call below crashed with NameError
        # whenever no barcode file was given. None signals "discover barcodes
        # while reading" — TODO confirm ExInCounter accepts None here.
        valid_bcs2idx = None
    else:
        # Get valid cell barcodes (with-block closes the handle)
        with open(bcfile) as bc_fh:
            valid_bcs_list = [line.strip() for line in bc_fh.readlines()]
        valid_cellid_list = np.array([f"{sampleid}:{v_bc}" for v_bc in valid_bcs_list])  # with sample id and with -1
        valid_bcs2idx = dict((bc.split('-')[0], n) for n, bc in enumerate(valid_bcs_list))  # without -1
        logging.debug(f"Read {len(valid_bcs_list)} cell barcodes from {bcfile}")
        logging.debug(f"Example of barcode: {valid_bcs_list[0].split('-')[0]} and cell_id: {valid_cellid_list[0]}")

    # Get metadata from sample sheet
    if metadatatable:
        try:
            sample_metadata = vcy.MetadataCollection(metadatatable)
            sample = sample_metadata.where("SampleID", sampleid)
            if len(sample) == 0:
                logging.error(f"Sample ID {sampleid} not found in sample sheet")
                # schema = []  # type: List
                sample = {}
            elif len(sample) > 1:
                logging.error(f"Sample ID {sampleid} has multiple lines in sample sheet")
                sys.exit(1)
            else:
                # schema = sample[0].types
                sample = sample[0].dict
            logging.debug(f"Collecting column attributes from {metadatatable}")
        except (NameError, TypeError):
            # logging.warn is a deprecated alias; use logging.warning
            logging.warning("SAMPLEFILE was not specified. add -s SAMPLEFILE to add metadata.")
            sample = {}
    else:
        sample = {}

    # Initialize Exon-Intron Counter with the valid barcodes
    exincounter = vcy.ExInCounter(valid_bcs2idx)

    # Load the Intervals definition from file
    n = exincounter.read_genes(ivlfile)
    logging.debug(f"Read {n} intervals for {len(exincounter.genes)} genes from {ivlfile}")
    if repmask is not None:
        m = exincounter.read_repeats(repmask)
        logging.debug(f"Read {m} repeat intervals to mask from {repmask}")

    # Go through the sam files a first time to markup introns
    logging.debug("Marking up introns...")
    exincounter.mark_up_introns(bamfile)

    # Do the actual counting
    if split_sam_flag:
        logging.debug("Counting molecules and writing sam outputs...")
        # NOTE: I should write bam file directly using pysam
        f_sure_introns = open(os.path.join(outputfolder, f"{sampleid}_sure_introns.sam"), "w")
        f_sure_exon = open(os.path.join(outputfolder, f"{sampleid}_sure_exon.sam"), "w")
        f_maybe_exon = open(os.path.join(outputfolder, f"{sampleid}_maybe_exon.sam"), "w")
        f_others = open(os.path.join(outputfolder, f"{sampleid}_not_exon_not_intron.sam"), "w")
        f_chimera = open(os.path.join(outputfolder, f"{sampleid}_chimeras.sam"), "w")
        exincounter.count(bamfile, sam_output=(f_sure_introns, f_sure_exon, f_maybe_exon, f_others, f_chimera))
        f_sure_introns.close()
        f_sure_exon.close()
        f_maybe_exon.close()
        f_others.close()
        f_chimera.close()  # BUGFIX: the chimeras file was opened but never closed
    else:
        logging.debug("Counting molecules...")
        exincounter.count(bamfile)  # NOTE: we would avoid some millions of if statements evalution if we write two function count and count_with output

    if not exincounter.filter_mode:
        valid_bcs2idx = exincounter.valid_bcs2idx  # without -1
        # Recover the barcode list in index order from the barcode->index map.
        valid_bcs_list = list(zip(*sorted([(v, k) for k, v in valid_bcs2idx.items()])))[1]  # without -1
        valid_cellid_list = np.array([f"{sampleid}:{v_bc}-1" for v_bc in valid_bcs_list])  # with sampleid and with -1
        logging.debug(f"Example of barcode: {valid_bcs_list[0]} and cell_id: {valid_cellid_list[0]}")

    ca = {"CellID": np.array(valid_cellid_list)}
    ca.update(additional_ca)
    for key, value in sample.items():
        ca[key] = np.array([value] * len(valid_cellid_list))

    # Save 3' junction/exon read counts
    # NOTE: Legacy code this should be added, where is not redunddant to the newer mapstats.hdf5 file
    logging.debug("Save 3' junction/exon read counts")
    olastexon_counts_file = os.path.join(outputfolder, "lastexon_counts.tab")
    with open(olastexon_counts_file, 'w') as ofd:
        # NOTE(review): "GeneMame" is a typo but kept verbatim — downstream
        # parsers may match this exact header.
        ofd.write("GeneMame\tGeneID\tAnnotatedTrEnd\tDeducedTrEnd\tLastExonLen\tLastJunctionCount\tLastExonCount\tFromEndReadProfile(3'=>5')...\n")
        for g in exincounter.genes:
            lastjunction_count, lastexon_count = g.get_lastexon_counts()
            lastexon_length = g.get_lastexon_length()
            profile = []
            for c in g.read_start_counts_from_locus_end[:lastexon_length]:
                profile.append(c)
            profile_str = "\t".join([str(c) for c in profile])
            ofd.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (g.genename, g.geneid, g.get_tr_end(), g.get_deduced_tr_end(),
                                                            lastexon_length, lastjunction_count, lastexon_count, profile_str))

    # Save some stats about exon and introns in a loom file
    logging.debug("Collecting genes structural info statistics")
    # Create hdf5 containg the structural stats
    statsfilename = os.path.join(outputfolder, f"{sampleid}_mapstats.hdf5")
    stats_hdf5 = h5py.File(statsfilename, 'w')
    for i, g in enumerate(exincounter.genes):
        # create a group with the Accession Name
        grp = stats_hdf5.create_group(g.geneid)
        type_intervals = np.zeros(len(g.ivls), dtype="|S3")  # not redundand because intron markup is library dependent
        len_intervals = np.zeros(len(g.ivls), dtype="uint32")  # NOTE having this entry to the file is redundant
        valid_intron = np.zeros(len(g.ivls), dtype="bool")
        for j, ivl in enumerate(g.ivls):
            type_intervals[j] = ivl.ivltype
            len_intervals[j] = np.abs(ivl.end - ivl.start)
            valid_intron[j] = ivl.is_sure_valid_intron
        grp.create_dataset("reads_per_ivl", data=np.row_stack((g.ivljunction5_read_counts,
                                                               g.ivlinside_read_counts,
                                                               g.ivljunction3_read_counts)))
        grp.create_dataset("ivls_type", data=type_intervals)
        grp.create_dataset("ivls_len", data=len_intervals)
        grp.create_dataset("valid_intron", data=valid_intron)
    stats_hdf5.close()
    logging.debug(f"Mapping statistics have been saved to {statsfilename}")

    # Save to loom file
    outfile = os.path.join(outputfolder, f"{sampleid}.loom")
    logging.debug(f"Generating output file {outfile}")

    # row attributes: (loom column name, Gene attribute name, dtype)
    atr_table = (("Gene", "genename", str), ("Accession", "geneid", str),
                 ("Chromosome", "chrom", str), ("Strand", "strand", str),
                 ("Start", "start", int), ("End", "end", int))

    logging.debug("Collecting row attributes")
    ra = {}
    for name_col_attr, name_obj_attr, dtyp in atr_table:
        tmp_array = np.zeros((len(exincounter.genes),), dtype=object)  # type: np.ndarray
        for i, g in enumerate(exincounter.genes):
            tmp_array[i] = getattr(g, name_obj_attr)
        ra[name_col_attr] = tmp_array.astype(dtyp)

    logging.debug("Generating data table")
    shape_loom = len(exincounter.genes), len(valid_bcs_list)
    spliced = np.zeros(shape_loom, dtype=vcy.LOOM_NUMERIC_DTYPE)
    unspliced = np.zeros(shape_loom, dtype=vcy.LOOM_NUMERIC_DTYPE)
    ambiguous = np.zeros(shape_loom, dtype=vcy.LOOM_NUMERIC_DTYPE)
    for i, g in enumerate(exincounter.genes):
        spliced[i, :] = g.spliced_mol_counts
        unspliced[i, :] = g.unspliced_mol_counts
        ambiguous[i, :] = g.ambiguous_mol_counts
    total = spliced + unspliced + ambiguous
    if not np.any(total):
        logging.error("The output file is empty check the input!")

    logging.debug("Writing loom file")
    ds = loompy.create(filename=outfile, matrix=total, row_attrs=ra, col_attrs=ca, dtype="float32")
    ds.set_layer(name="spliced", matrix=spliced, dtype=vcy.LOOM_NUMERIC_DTYPE)
    ds.set_layer(name="unspliced", matrix=unspliced, dtype=vcy.LOOM_NUMERIC_DTYPE)
    ds.set_layer(name="ambiguous", matrix=ambiguous, dtype=vcy.LOOM_NUMERIC_DTYPE)
    ds.close()
    logging.debug("Terminated Succesfully!")