Esempio n. 1
0
def main():
    """Entry point: write one FASTA record per BED12 line of the input.

    Reads tab-separated BED12 lines from ``args.bed_source`` (standard input
    when that argument is the literal string "stdin"), cuts the blocks of each
    record out of the 2bit genome selected by ``args.db``, concatenates them
    and writes ``>name`` followed by the sequence to standard output.

    Unless ``args.utr`` is set, every block is clipped to the
    thickStart..thickEnd interval and upper-cased; with ``args.utr`` the whole
    block is taken exactly as the 2bit file returns it. Records that yield no
    sequence are skipped. Records whose strand column is not '+' are passed
    through ``revert_compl`` before being written.

    Always finishes by calling ``sys.exit(0)``.
    """
    args = parse_args()
    read_stdin = args.bed_source == "stdin"
    source = sys.stdin if read_stdin else open(args.bed_source, "r")
    try:
        two_bit_data = TwoBitFile(get_2bit_path(args.db))
        for line in source:
            # Strip only the line terminator: slicing off the last character
            # would damage a final line that has no trailing newline.
            line = line.rstrip("\r\n")
            if not line:
                continue  # tolerate blank lines, e.g. at the end of the file
            bed_info = line.split("\t")
            # Columns 2 (chromEnd), 4 (score) and 8 (itemRgb) are not needed.
            chrom = bed_info[0]
            chrom_seq = two_bit_data[chrom]
            chromStart = int(bed_info[1])
            name = bed_info[3]  # gene name usually
            forward = bed_info[5] == '+'
            thickStart = int(bed_info[6])
            thickEnd = int(bed_info[7])
            blockCount = int(bed_info[9])
            blockSizes = [int(x) for x in bed_info[10].split(',') if x != '']
            blockStarts = [int(x) for x in bed_info[11].split(',') if x != '']
            # Block starts in the file are relative to chromStart; shift them
            # to absolute chromosome coordinates.
            blockAbsStarts = [
                blockStarts[i] + chromStart for i in range(blockCount)
            ]
            blockAbsEnds = [
                blockStarts[i] + blockSizes[i] + chromStart
                for i in range(blockCount)
            ]

            exons = []
            for blockStart, blockEnd in zip(blockAbsStarts, blockAbsEnds):
                if args.utr:
                    exons.append(chrom_seq[blockStart:blockEnd])
                    continue
                # skip the block if it is entirely UTR
                if blockEnd <= thickStart or blockStart >= thickEnd:
                    continue
                # otherwise keep only the part inside the thick interval
                clippedStart = max(blockStart, thickStart)
                clippedEnd = min(blockEnd, thickEnd)
                exons.append(chrom_seq[clippedStart:clippedEnd].upper())

            gene_seq = "".join(exons)
            if not gene_seq:
                continue
            if not forward:
                gene_seq = revert_compl(gene_seq)
            sys.stdout.write(">{}\n{}\n".format(name, gene_seq))
    finally:
        # Close the input even when a malformed record raises; never close
        # the interpreter's own stdin.
        if not read_stdin:
            source.close()
    sys.exit(0)
Esempio n. 2
0
def main():
    """Build a pileup for the requested range and render it.

    The pileup always gets a scale, a location and a reference track (the
    reference is read from the 2bit file named by ``options.twobit``). Every
    positional argument whose name ends in ``.vcf`` adds a divider track and
    a variant track holding all records of that file. Two closing divider
    tracks are appended before rendering.
    """
    (options, args) = _get_args()
    twoBitFile = TwoBitFile(options.twobit)

    pileup = Pileup(region=_parse_range(options.range))
    pileup.addTrack(ScaleTrack(name="Scale"))
    pileup.addTrack(LocationTrack(name="Location"))
    pileup.addTrack(ReferenceTrack(twoBitFile, name="Reference"))

    for filename in args:
        if filename.endswith('.vcf'):
            pileup.addTrack(DivTrack(divider='-', name="Div1"))
            # Materialise every record while the file is open so the handle
            # is released as soon as parsing is done.
            with open(filename, 'r') as vcf_file:
                variants = list(vcf.Reader(vcf_file))
            pileup.addTrack(VCFTrack(variants=variants, name="Variants"))

    pileup.addTrack(DivTrack(divider='~', name="Div2"))
    pileup.addTrack(DivTrack(divider='.', name="Div3"))

    pileup.render()
Esempio n. 3
0
def writeGuideRow(db, guideSeq, otRows, ofh):
    """Write a guide row, with all off-targets merged into a single field.

    Args:
        db: assembly name; used to locate ``<db>.2bit`` under /scratch/data,
            /gbdb or /cluster/data (tried in that order).
        guideSeq: guide sequence; compared with the first 20 characters of
            every off-target sequence and written as the first column.
        otRows: list of 6-field off-target rows
            (chrom, start, score, pam, sequence, annotation) with a 1-based
            start. NOTE: the list is sorted in place, highest score first.
        ofh: writable text file object that receives the output line.

    The line written is tab-separated: the guide sequence, five
    comma-separated counts of off-targets with 0..4 mismatches, and a
    "|"-joined list of ``chrom;start<strand>;score*100`` entries. The last
    field is left empty when it would be longer than 5000 characters.

    Raises:
        AssertionError: if an off-target sequence matches neither strand of
            the genome at its position.
    """
    otRows.sort(key=operator.itemgetter(2), reverse=True)

    filtOtRows = []
    mismCounts = [0] * 5
    for chrom, start, score, _pam, otSeq, _annot in otRows:
        mismCount = countMm(guideSeq, otSeq[:20])
        # very very rare >4: only when there are Ns in the off-target seq
        if mismCount <= 4:
            mismCounts[mismCount] += 1
        # crispor is 1-based; convert to a 0-based start
        filtOtRows.append((chrom, int(start) - 1, score, otSeq))

    # The strand has to be derived from the genome because old versions of
    # crispor did not report it. Fetch in chromosome order for better speed.
    otCoords = sorted(
        (chrom, start, otSeq) for chrom, start, _score, otSeq in filtOtRows)

    # Use the first 2bit file that exists; the last candidate is the fallback
    # even when it is missing, so TwoBitFile reports the problem.
    candidates = ('/scratch/data/%s/%s.2bit', '/gbdb/%s/%s.2bit',
                  '/cluster/data/%s/%s.2bit')
    for template in candidates:
        twoBitFname = template % (db, db)
        if isfile(twoBitFname):
            break
    genome = TwoBitFile(twoBitFname)

    strands = {}
    for chrom, start, otSeq in otCoords:
        forwSeq = genome[chrom][start:start + 23].upper()
        # two possible sequences, depending on strand
        revSeq = revComp(forwSeq).upper()
        # for palindromes, we can't decide, default to +
        if otSeq == forwSeq:
            strand = "+"
        elif otSeq == revSeq:
            strand = "-"
        else:
            # Raised explicitly (not via ``assert``) so the check survives
            # ``python -O`` and a stale strand can never be recorded.
            raise AssertionError(
                "off-target %s does not match the genome at %s:%d on either "
                "strand" % (otSeq, chrom, start))
        strands[(chrom, start)] = strand

    # now add the strand to the features, keeping the score order
    otStrings = [
        ";".join((chrom, str(start) + strands[(chrom, start)],
                  str(int(score * 100))))
        for chrom, start, score, _otSeq in filtOtRows
    ]

    otField = "|".join(otStrings)
    # mysql has trouble with very long blobs, and we also can save a lot of
    # space by ignoring too repetitive sequences
    if len(otField) > 5000:
        otField = ""
    row = (guideSeq, ",".join(str(x) for x in mismCounts), otField)
    ofh.write("\t".join(row))
    ofh.write("\n")
# Register the motif string for the "SB" and "HelR" transposase keys.
# NOTE(review): "PB" is accepted by --transposase below; presumably its motif
# is registered above this point - confirm.
transposaseMotifs["SB"] = "TA"
transposaseMotifs["HelR"] = "AT"

# Command line: a required transposase choice, an optional --filter flag and
# three positional arguments (input, reference, output).
parser = argparse.ArgumentParser()
parser.add_argument("-t",
                    "--transposase",
                    type=str,
                    required=True,
                    choices=["PB", "SB", "HelR"])
parser.add_argument("-f", "--filter", action="store_true")
parser.add_argument("input", type=str)
parser.add_argument("reference", type=str)
parser.add_argument("output", type=str)
# Parsed at module level, i.e. as soon as this file is executed or imported.
args = parser.parse_args()

# Open the reference given on the command line as a 2bit file; used by
# fetchGenomicSequence below.
ref = TwoBitFile(args.reference)


# Memoise the 1024 most recently requested genomic intervals so that repeated
# look-ups of the same locus do not query the reference again.
@lru_cache(maxsize=1024)
def fetchGenomicSequence(chromosome, start, end):
    """Return the upper-cased reference sequence of chromosome[start:end]."""
    chromosomeSequence = ref[chromosome]
    interval = chromosomeSequence[start:end]
    return interval.upper()


# This function returns a function specifically tailored to the transposase
# specified by the user. This practice is known as "currying" and prevents
# having to re-initialize the insertSiteLength parameter during each iteration
def makeInsertionSiteFunction(transposase):
    insertSiteLength = len(transposaseMotifs[transposase])
def _normalize_chromosome(chromosome):
    """Map a MAF chromosome label to the 'chr'-prefixed name used for lookup."""
    chromosome = {'23': 'X', '24': 'Y', 'MT': 'M'}.get(chromosome, chromosome)
    if not chromosome.startswith('chr'):
        chromosome = 'chr' + chromosome
    return chromosome


def _open_genome(hgfile):
    """Open `hgfile` as a 2bit file, reporting an unreadable file by name."""
    assert hgfile is not None, 'Please provide genome build file.'
    try:
        return TwoBitFile(hgfile)
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C / SystemExit pass through;
        # the original failure is kept as the cause.
        raise Exception("{} not a valid 2bit file.".format(hgfile)) from err


def _spectra_for_contexts(maf, word_column, contexts):
    """Count rows per (context word, sample), one row per entry of `contexts`."""
    spectra = maf.groupby([word_column, 'sample'
                           ]).size().unstack().fillna(0).astype(int)
    # Contexts that never occur still get an all-zero row
    for c in contexts:
        if c not in spectra.index:
            spectra.loc[c] = 0
    # Index with an explicit list of labels: `contexts` is also used for
    # word -> number lookup (presumably a dict), and pandas 2 no longer
    # accepts a dict as a .loc indexer.
    return spectra.loc[list(contexts)]


def get_spectra_from_maf(maf: pd.DataFrame,
                         hgfile: Union[str, None] = None,
                         cosmic: str = 'cosmic2',
                         real_snps: bool = False):
    """
    Attaches context categories to maf and gets counts of contexts for each sample
    ---------------------------
    Args:
        * maf: Pandas DataFrame of maf (not modified; a copy is annotated)
        * hgfile: path to 2bit genome build file for computing reference context.
            Required for 'cosmic3_ID', and for the SNP signatures when the maf
            has no 'ref_context' column.
        * cosmic: cosmic signatures to decompose to. 'cosmic2', 'cosmic3' and
            'cosmic3_exome' use the SNP contexts (context96), 'cosmic3_DBS'
            the doublet contexts (context78) and 'cosmic3_ID' the indel
            contexts (context83).
        * real_snps: SNP signatures only. If False, the SNP subset is
            additionally passed through get_true_snps_from_maf.

    Returns:
        * Pandas DataFrame of maf with context category attached
        * Pandas DataFrame of counts with samples as columns and context as rows

    Raises:
        * AssertionError: a genome file is needed but hgfile is None
        * Exception: hgfile could not be opened as a 2bit file
        * KeyError: a computed context word is not a known context
        * NotImplementedError: unsupported value of cosmic
    """
    maf = maf.copy()

    if 'Start_Position' in maf.columns:
        maf = maf.rename(columns={'Start_Position': 'Start_position'})

    maf['sample'] = maf['Tumor_Sample_Barcode']

    if cosmic in ['cosmic2', 'cosmic3', 'cosmic3_exome']:
        # Subset to SNPs
        if 'Variant_Type' in maf.columns:
            maf = maf.loc[maf['Variant_Type'] == 'SNP']
        else:
            maf = maf.loc[
                maf['Reference_Allele'].apply(
                    lambda k: len(k) == 1 and k != '-')
                & maf['Tumor_Seq_Allele2'].apply(
                    lambda k: len(k) == 1 and k != '-')]
        if not real_snps:
            maf = get_true_snps_from_maf(maf)

        ref = maf['Reference_Allele'].str.upper()
        alt = maf['Tumor_Seq_Allele2'].str.upper()

        if 'ref_context' in maf.columns:
            context = maf['ref_context'].str.upper()
        else:
            hg = _open_genome(hgfile)

            # Map contexts
            _contexts = list()
            maf_size = maf.shape[0]
            for idx, (pos, chromosome) in enumerate(
                    zip(maf["Start_position"].astype(int),
                        maf["Chromosome"].astype(str))):
                stdout.write("\r      * Mapping contexts: {} / {}".format(
                    idx, maf_size))

                chromosome = _normalize_chromosome(chromosome)
                # The mutated base (1-based pos) plus one base on each side
                _contexts.append(hg[chromosome][pos - 2:pos + 1].lower())

            maf['ref_context'] = _contexts
            stdout.write("\n")
            context = maf['ref_context'].str.upper()

        n_context = context.str.len()
        mid = n_context // 2

        # Word = ref + alt + left neighbour + right neighbour; when the ref
        # base is not A/C the complemented form (neighbours swapped) is used.
        contig = pd.Series(
            [r + a + c[m - 1] + c[m + 1] if r in 'AC'
             else compl(r + a + c[m + 1] + c[m - 1])
             for r, a, c, m in zip(ref, alt, context, mid)],
            index=maf.index)

        try:
            maf['context96.num'] = contig.apply(context96.__getitem__)
        except KeyError as e:
            raise KeyError('Unusual context: ' + str(e)) from e

        maf['context96.word'] = contig
        spectra = _spectra_for_contexts(maf, 'context96.word', context96)

    elif cosmic == 'cosmic3_DBS':
        # Subset to DNPs
        if 'Variant_Type' not in maf.columns:
            ref_alt = maf['Reference_Allele'] + '>' + maf['Tumor_Seq_Allele2']

            def get_variant_type(ra):
                r, a = ra.split('>')
                if len(r) == 1 and r != '-' and len(a) == 1 and a != '-':
                    return 'SNP'
                if len(r) == 2 and len(a) == 2:
                    return 'DNP'

            maf['Variant_Type'] = ref_alt.apply(get_variant_type)
        # Compare the VALUES: `'DNP' in series` would test the index labels
        # and therefore never find annotated DNPs.
        if (maf['Variant_Type'] == 'DNP').any():
            maf = maf.loc[maf['Variant_Type'] == 'DNP']
        else:
            maf = get_dnps_from_maf(maf)

        ref = maf['Reference_Allele'].str.upper()
        alt = maf['Tumor_Seq_Allele2'].str.upper()

        # Use ref>alt when it is a known context, else its reverse complement
        contig = pd.Series(
            [r + '>' + a if r + '>' + a in context78 else
             compl(r, reverse=True) + '>' + compl(a, reverse=True)
             for r, a in zip(ref, alt)],
            index=maf.index)

        try:
            maf['context78.num'] = contig.apply(context78.__getitem__)
        except KeyError as e:
            raise KeyError('Unusual context: ' + str(e)) from e

        maf['context78.word'] = contig
        spectra = _spectra_for_contexts(maf, 'context78.word', context78)

    elif cosmic == 'cosmic3_ID':

        # Keep rows where exactly one of the two alleles is '-'
        maf = maf.loc[(maf['Reference_Allele'] == '-')
                      ^ (maf['Tumor_Seq_Allele2'] == '-')]

        ref = maf['Reference_Allele'].str.upper()
        alt = maf['Tumor_Seq_Allele2'].str.upper()

        hg = _open_genome(hgfile)

        # Map contexts
        contig = list()
        maf_size = maf.shape[0]
        for idx, (pos, chromosome, r, a) in enumerate(
                zip(maf["Start_position"].astype(int),
                    maf["Chromosome"].astype(str), ref, alt)):
            stdout.write("\r      * Mapping contexts: {} / {}".format(
                idx, maf_size))

            chromosome = _normalize_chromosome(chromosome)

            if a == '-':
                # Deletion: look at the five unit-sized chunks that follow
                # the deleted sequence
                del_len = len(r)
                _context = hg[chromosome][pos - 1 + del_len:pos - 1 +
                                          del_len * 6].upper()
                _context_list = [
                    _context[n:n + del_len]
                    for n in range(0, 5 * del_len, del_len)
                ]
                # The deleted unit itself counts as the first repeat
                n_repeats = 1
                for c in _context_list:
                    if c == r:
                        n_repeats += 1
                    else:
                        break
                microhomology = 0
                if n_repeats == 1:
                    # No repeat: count matching bases against the sequence
                    # after the deletion, then (read backwards) against the
                    # sequence before it
                    for b1, b2 in zip(r, _context_list[0]):
                        if b1 == b2:
                            microhomology += 1
                        else:
                            break
                    prev_context = hg[chromosome][pos - 1 - del_len:pos -
                                                  1].upper()
                    for b1, b2 in zip(reversed(r), reversed(prev_context)):
                        if b1 == b2:
                            microhomology += 1
                        else:
                            break
                if del_len == 1:
                    pre = 'C' if r in 'CG' else 'T'
                elif del_len >= 5:
                    pre = '5+'
                else:
                    pre = str(del_len)
                if microhomology >= 5:
                    post = 'm5+'
                elif microhomology:
                    post = 'm' + str(microhomology)
                elif n_repeats == 6:
                    post = '6+'
                else:
                    post = str(n_repeats)
                contig.append(pre + 'del' + post)

            elif r == '-':
                # Insertion: count copies of the inserted unit that follow
                # the insertion point (at most five are inspected)
                ins_len = len(a)
                _context = hg[chromosome][pos:pos + ins_len * 5].upper()
                _context_list = [
                    _context[n:n + ins_len]
                    for n in range(0, 5 * ins_len, ins_len)
                ]
                n_repeats = 0
                for c in _context_list:
                    if c == a:
                        n_repeats += 1
                    else:
                        break
                if ins_len == 1:
                    pre = 'C' if a in 'CG' else 'T'
                elif ins_len >= 5:
                    pre = '5+'
                else:
                    pre = str(ins_len)
                if n_repeats == 5:
                    post = '5+'
                else:
                    post = str(n_repeats)
                contig.append(pre + 'ins' + post)

        maf['context83.word'] = contig
        try:
            maf['context83.num'] = maf['context83.word'].apply(
                context83.__getitem__)
        except KeyError as e:
            raise KeyError('Unusual context: ' + str(e)) from e

        spectra = _spectra_for_contexts(maf, 'context83.word', context83)

        stdout.write("\n")
    else:
        raise NotImplementedError()

    return maf, spectra
Esempio n. 6
0
                  dest='stats',
                  help="write orf stats to this csv file")
# -o/--outfile: destination of the output; per the help text stdout is used
# when the option is omitted.
parser.add_option('-o',
                  '--outfile',
                  dest='outfile',
                  help="write output to this file (default: stdout)")
# --minlength: minimum ORF length in nt, including the stop codon.
# NOTE(review): no type= is given, so a value passed on the command line
# arrives as a string while the default is the int 12 - confirm the consumer
# converts it.
parser.add_option('',
                  '--minlength',
                  dest='minlength',
                  default=12,
                  help="""minimum ORF length (in nt, including stop) [12]""")

options, args = parser.parse_args()

# Progress messages go to stderr (Python 2 print statement syntax).
print >> sys.stderr, 'reading genome from ' + options.genome
# Open the genome named on the command line as a 2bit file.
genome = TwoBitFile(options.genome)
print >> sys.stderr, 'reading bed file ' + options.bed

# Module-level accumulators, all starting empty.
# NOTE(review): presumably filled in parallel, one entry per ORF - the code
# that fills them is not visible here.
orfs = []
hash_values = []
orf_types = []
orf_length = []
utr5_length = []
utr3_length = []


def match_type(sig):
    if sig == 'annotated':
        return 'annotated'
    elif sig == 'utr5:cds:0':
        return "N-ext"
Esempio n. 7
0
def build_index(args, unknown_args):
    from pyfaidx import Fasta
    from twobitreader import TwoBitFile
    import gffutils
    import gffutils.merge_criteria as mc
    import atexit
    import shutil
    from tqdm import tqdm
    from collections import defaultdict
    from pprint import pprint
    from tempfile import NamedTemporaryFile
    from urllib.parse import urlparse
    from urllib.request import urlopen
    from shutil import copyfileobj
    from subprocess import Popen, PIPE, call

    if args.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%m-%d %H:%M')

    config = args.conf
    logging.info("PISCES version %s", __version__)

    for species, datasets in list(config.items()):
        indices = datasets["index"]
        download_dir = datasets["downloads"]
        index_dir_base = datasets["index_dir"]
        if not os.path.exists(download_dir):
            os.makedirs(download_dir)
        for index_name, dataset in list(indices.items()):
            if args.indices and index_name not in args.indices:
                continue
            pprint(dataset, indent=4)
            options = defaultdict(lambda: True)
            options.update(dataset["options"])
            index_dir_path = os.path.join(index_dir_base, species, index_name)
            if os.path.exists(index_dir_path):
                if args.overwrite:
                    logging.warn(
                        "index directory %s already exists! overwriting",
                        index_dir_path)
                    shutil.rmtree(
                        os.path.join(index_dir_path, "transcripts"))
                    shutil.rmtree(os.path.join(index_dir_path, "salmon"))
                else:
                    continue
            os.makedirs(os.path.join(index_dir_path, "transcripts"))
            os.makedirs(os.path.join(index_dir_path, "salmon"))

            transcripts_fasta_file = os.path.join(
                index_dir_path, "transcripts", "transcripts.fa")
                
            with open(transcripts_fasta_file, 'w') as transcripts_fasta:
                ## all of this URI handling should probably use an existing library like
                ## https://github.com/intake/filesystem_spec
                for fasta_loc in dataset["extra_fastas"]:
                    fasta = urlparse(fasta_loc)
                    if fasta.scheme == '':
                        reference = Fasta(fasta.path)
                    elif fasta.scheme.lower() in ('ftp', 'http', 'https'):
                        _fasta_local_path = os.path.join(
                            download_dir, os.path.basename(fasta.path))
                        logging.info("Downloading %s", fasta.geturl())
                        if not os.path.exists(_fasta_local_path):
                            with urlopen(fasta.geturl()) as _fasta:
                                with open(_fasta_local_path,
                                          'wb') as _fasta_local:
                                    copyfileobj(_fasta, _fasta_local)
                                    _fasta_local.flush()
                                    if fasta.path.endswith('gz'):
                                        logging.info("Decompressing %s",
                                                     fasta.geturl())
                                        call(
                                            ' '.join([
                                                'gzip -dc', _fasta_local_path,
                                                '>',
                                                _fasta_local_path.replace(
                                                    ".gz", "")
                                            ]),
                                            shell=True)
                        if _fasta_local_path.endswith("2bit"):
                            logging.info("Converting %s to FASTA format",
                                         fasta.geturl())
                            twobit = TwoBitFile(_fasta_local_path)
                            if not os.path.exists(
                                    _fasta_local_path.replace("2bit", "fa")):
                                with open(
                                        _fasta_local_path.replace(
                                            "2bit", "fa"), 'w') as fasta:
                                    for chrom in twobit.keys():
                                        fasta.write(">%s\n" % chrom)
                                        fasta.write(str(twobit[chrom]) + '\n')
                            reference = Fasta(
                                _fasta_local_path.replace("2bit", "fa"))
                    
                    with open(_fasta_local_path) as extra:
                        logging.info("Adding entries from %s", fasta)
                        for line in extra:
                            transcripts_fasta.write(line)
                            
                for gtf_loc, fasta_loc in zip(dataset["gtfs"],
                                              dataset["fastas"]):
                    gtf = urlparse(gtf_loc)
                    fasta = urlparse(fasta_loc)
                    assembly = os.path.basename(fasta.path)

                    if fasta.scheme == '':
                        reference = Fasta(fasta.path)
                    elif fasta.scheme.lower() in ('ftp', 'http', 'https'):
                        _fasta_local_path = os.path.join(
                            download_dir, os.path.basename(fasta.path))
                        logging.info("Downloading %s", fasta.geturl())
                        if not os.path.exists(_fasta_local_path):
                            with urlopen(fasta.geturl()) as _fasta:
                                with open(_fasta_local_path,
                                          'wb') as _fasta_local:
                                    copyfileobj(_fasta, _fasta_local)
                                    _fasta_local.flush()
                                    if fasta.path.endswith('gz'):
                                        logging.info("Decompressing %s",
                                                     fasta.geturl())
                                        call(
                                            ' '.join([
                                                'gzip -dc', _fasta_local_path,
                                                '>',
                                                _fasta_local_path.replace(
                                                    ".gz", "")
                                            ]),
                                            shell=True)
                        if _fasta_local_path.endswith("2bit"):
                            logging.info("Converting %s to FASTA format",
                                         fasta.geturl())
                            twobit = TwoBitFile(_fasta_local_path)
                            if not os.path.exists(
                                    _fasta_local_path.replace("2bit", "fa")):
                                with open(
                                        _fasta_local_path.replace(
                                            "2bit", "fa"), 'w') as fasta:
                                    for chrom in twobit.keys():
                                        fasta.write(">%s\n" % chrom)
                                        fasta.write(str(twobit[chrom]) + '\n')
                            reference = Fasta(
                                _fasta_local_path.replace("2bit", "fa"))
                        elif fasta.path.endswith('gz'):
                            reference = Fasta(
                                _fasta_local_path.replace(".gz", ""))
                        else:
                            reference = Fasta(_fasta_local_path)

                    if gtf.scheme == '':
                        database_filename = gtf.path + '.db'
                        if os.path.exists(database_filename):
                            logging.info("Loading existing GTF database file.")
                            db = gffutils.FeatureDB(database_filename)
                        else:
                            logging.info(
                                "Creating GTF database file. This will take some time..."
                            )
                            try:
                                db = gffutils.create_db(
                                    gtf.path,
                                    database_filename,
                                    disable_infer_genes=
                                    not options["infer_features"],
                                    disable_infer_transcripts=
                                    not options["infer_features"])
                            except:
                                tmp_db = os.path.join(download_dir, os.path.basename(gtf.path) + '.db')
                                logging.info(
                                    "Unable to create %s, so using %s",
                                    database_filename, tmp_db)
                                if os.path.exists(tmp_db):
                                    logging.info("Loading existing GTF database file.")
                                    db = gffutils.FeatureDB(tmp_db)
                                else:
                                    db = gffutils.create_db(
                                        gtf.path,
                                        tmp_db,
                                        disable_infer_genes=
                                        not options["infer_features"],
                                        disable_infer_transcripts=
                                        not options["infer_features"])
                    elif gtf.scheme.lower() in ('ftp', 'http', 'https'):
                        _gtf_local_path = os.path.join(download_dir,
                                                       os.path.basename(
                                                           gtf.path))
                        logging.info("Downloading %s", gtf.geturl())
                        if not os.path.exists(_gtf_local_path):
                            with urlopen(gtf.geturl()) as _gtf:
                                with open(_gtf_local_path, 'wb') as _gtf_local:
                                    copyfileobj(_gtf, _gtf_local)
                                    _gtf_local.flush()
                                    if gtf.path.endswith('gz'):
                                        logging.info("Decompressing %s",
                                                     gtf.geturl())
                                        call(
                                            ' '.join([
                                                'gzip -dc', _gtf_local_path,
                                                '>',
                                                _gtf_local_path.replace(
                                                    ".gz", "")
                                            ]),
                                            shell=True)
                                        logging.info(
                                            "Creating GTF database file. This will take some time..."
                                        )
                                        db = gffutils.create_db(
                                            _gtf_local_path.replace(".gz", ""),
                                            _gtf_local_path.replace(
                                                ".gz", "") + '.db',
                                            disable_infer_genes=
                                            not options["infer_features"],
                                            disable_infer_transcripts=
                                            not options["infer_features"])
                                    else:
                                        logging.info(
                                            "Creating GTF database file. This will take some time..."
                                        )
                                        db = gffutils.create_db(
                                            _gtf_local_path,
                                            _gtf_local_path + '.db',
                                            disable_infer_genes=
                                            not options["infer_features"],
                                            disable_infer_transcripts=
                                            not options["infer_features"])
                        elif gtf.path.endswith('gz'):
                            logging.info("Loading existing GTF database file.")
                            db = gffutils.FeatureDB(
                                _gtf_local_path.replace(".gz", "") + '.db')
                        else:
                            logging.info("Loading existing GTF database file.")
                            db = gffutils.FeatureDB(_gtf_local_path)

                    # https://github.com/daler/gffutils/issues/56
                    db.execute('ANALYZE features')
                    #if db.count_features_of_type('intron') == 0 and options["unprocessed_transcripts"]:
                        #logging.info("Inferring intronic sequences...")
                        #db.update(db.create_introns())
                    soft_chars = set(('a', 'c', 'g', 't'))

                    if not options["-k"]:
                        k = 31
                    else:
                        k = options["-k"]

                    gene_tx_file = os.path.join(
                        index_dir_path,
                        assembly + "_transcripts_to_genes.txt")
                    gene_annotation = os.path.join(
                        index_dir_path,
                        assembly + "_gene_annotation.txt")
                        
                    def features_to_string(features, fasta_in, masked=True, strand=True):
                        """Join the sequences of ``features`` into a single string.

                        Each feature contributes ``feature.sequence(fasta_in,
                        use_strand=strand)``.  If the last feature seen is on the
                        "-" strand, the pieces are joined in reverse order.

                        :param features: iterable of objects exposing ``strand`` and
                            ``sequence(fasta, use_strand=...)``.
                        :param fasta_in: passed through to ``feature.sequence``.
                        :param masked: when true, every lowercase a/c/g/t in the
                            joined sequence is replaced by ``N``.
                        :param strand: forwarded as ``use_strand``.
                        :return: ``(sequence, frac_masked)`` where ``frac_masked`` is
                            the number of characters found in the enclosing scope's
                            ``soft_chars`` divided by the sequence length (0 for an
                            empty sequence).
                        """
                        last_strand = "."
                        pieces = []
                        for feat in features:
                            last_strand = feat.strand
                            pieces.append(feat.sequence(fasta_in, use_strand=strand))
                        # Reverse-strand pieces arrive in ascending coordinate order,
                        # so flip them before concatenating.
                        if last_strand == "-":
                            pieces.reverse()
                        joined = ''.join(pieces)
                        # Count soft-masked bases before they are overwritten below.
                        soft_total = sum(joined.count(base) for base in soft_chars)
                        if masked and soft_total > 0:
                            for base in ('a', 't', 'c', 'g'):
                                joined = joined.replace(base, 'N')
                        # Replacement is one-for-one, so the length is unchanged.
                        frac_masked = soft_total / len(joined) if joined else 0
                        return (joined, frac_masked)

                    with open(gene_tx_file, 'w') as gene2tx, open(
                            gene_annotation, 'w') as annotation:
                        logging.info("Making transcripts_to_genes, annotation and FASTA file for %s",
                                     gtf.path)
                        with tqdm(
                                total=db.count_features_of_type('gene'),
                                unit='gene') as pbar:
                            for gene in db.features_of_type('gene'):
                                first_exon = next(
                                    db.children(
                                        gene,
                                        featuretype='exon',
                                        order_by='start'))
                                try:
                                    if options["gene_type"] == True:
                                        type_tag = "gene_type"
                                    else:
                                        type_tag = options["gene_type"]
                                    gene_type = first_exon[type_tag][0]
                                except KeyError:
                                    logging.info("No gene type tag found for %s", gene['gene_id'][0])
                                    gene_type = 'NA'
                                try:
                                    if options["gene_name"] == True:
                                        name_tag = "gene_name"
                                    else:
                                        name_tag = options["gene_name"]
                                    gene_name = first_exon[name_tag][0]
                                except KeyError:
                                    logging.info("No gene name tag found for %s", gene['gene_id'][0])
                                    gene_name = 'NA'
                                    
                                transcripts = db.children(gene, featuretype='transcript', order_by='start')
                                for transcript in transcripts:
                                    # Write entry in the transcripts to genes table
                                    gene2tx.write("{txp}\t{gene}\n".format(
                                        gene=gene['gene_id'][0],
                                        txp=transcript['transcript_id'][0]))
                                    # Construct the transcript sequences and write them to the FASTA
                                    fa_seq, frac_masked = features_to_string(db.children(transcript, 
                                                                                   featuretype='exon', 
                                                                                   order_by='start'), 
                                                                       reference, 
                                                                       masked=options["masked"])
                                    transcripts_fasta.write('>' + transcript['transcript_id'][0] + '\n')
                                    transcripts_fasta.write(fa_seq + '\n')
                                    
                                exons = db.children(gene, featuretype='exon', order_by='start') 
                                merged_exons = db.merge(exons, merge_criteria=(mc.seqid, mc.feature_type, mc.overlap_any_inclusive))
                                if options["unprocessed_transcripts"]:
                                    introns = db.interfeatures(merged_exons, new_featuretype='intron')                                                     
                                    transcripts_fasta.write('>' + "intronic_" + gene['gene_id'][0] + '\n')
                                    fa_seq, _ = features_to_string(introns, reference, masked=options["masked"])
                                    transcripts_fasta.write(fa_seq + '\n')
                                
                                annotation.write(
                                    "{gene}\t{type}\t{name}\t{chrom}\t{start}\t{stop}\t{length}\t{frac_masked}\n".
                                    format(
                                        gene=gene['gene_id'][0],
                                        type=gene_type,
                                        name=gene_name,
                                        start=gene.start,
                                        stop=gene.stop,
                                        chrom=gene.chrom,
                                        length=sum(len(exon) for exon in merged_exons),
                                        frac_masked=str(frac_masked)))
                                        
                                transcripts = db.children(
                                    gene,
                                    featuretype='transcript',
                                    order_by='start')
                                pbar.update(1)
                            
                    if options["intergenes"]:
                        for seqid in reference.keys():
                            logging.info("Merging overlapping genes on %s", seqid)
                            merged_genes = db.merge(db.region(seqid=seqid), merge_criteria=(mc.seqid, mc.feature_type, mc.overlap_any_inclusive))
                            with tqdm(unit='intergene features') as pbar:
                                for intergene in db.interfeatures(merged_genes, new_featuretype='intergene'):
                                    transcripts_fasta.write('>' + 'intergene_' + seqid + "_" + str(intergene.start) + ':' + str(intergene.end) + '\n')
                                    fa_seq, _ = features_to_string([intergene], reference, masked=options["masked"], strand=False)
                                    transcripts_fasta.write(fa_seq + '\n')
                                    pbar.update(1)


            # This must run outside the context manager so the FASTA file is closed (and flushed) first
            logging.info("Making salmon index files for %s",
                         species + '/' + index_name)
            cmd = [
                os.path.join(find_data_directory(), 'redist', 'salmon',
                             'bin', 'salmon'), 'index', '-p',
                str(args.threads), '-k',
                str(k), '-t', transcripts_fasta.name, '-i',
                os.path.join(index_dir_path, "salmon")
            ]
            logging.debug(' '.join(cmd))
            p = Popen(cmd, stderr=PIPE)
            for line in p.stderr:
                line = line.decode()
                if line.endswith('\n'):
                    logging.info(line.rstrip())
                else:
                    logging.info(line)
            logging.info(line)
Esempio n. 8
0
def __main__():
    """Translate BED transcript records into protein sequences.

    Parses the command line, reads BED lines from a file or stdin and, for
    every record that passes the biotype and region filters, writes protein
    translations as FASTA (to a file or stdout) and optionally as BED lines.

    With ``--cds`` only the annotated CDS is translated.  Otherwise the cDNA
    is translated at offsets 0, 1 and 2 and each frame is split at stop
    codons; pieces shorter than ``--min_length`` are dropped.

    Relies on names defined elsewhere in the module: ``digest``,
    ``TwoBitFile``, ``bed_from_line``, ``translate`` and ``get_cdna``.
    Processing stops at the first input line that raises an exception.
    """
    parser = argparse.ArgumentParser(
        description='Translate from BED')
    parser.add_argument(
        'input_bed', default=None,
        help="BED to translate,  '-' for stdin")
    pg_seq = parser.add_argument_group('Genomic sequence source')
    pg_seq.add_argument(
        '-t', '--twobit', default=None,
        help='Genome reference sequence in 2bit format')
    pg_seq.add_argument(
        '-c', '--column', type=int, default=None,
        # Trailing space added: the two fragments used to run together.
        help='Column offset containing genomic sequence ' +
             'between start and stop (-1) for last column')
    pg_out = parser.add_argument_group('Output options')
    pg_out.add_argument(
        '-f', '--fasta', default=None,
        help='Path to output translations.fasta')
    pg_out.add_argument(
        '-b', '--bed', default=None,
        help='Path to output translations.bed')
    pg_bed = parser.add_argument_group('BED filter options')
    pg_bed.add_argument(
        '-E', '--ensembl', action='store_true', default=False,
        help='Input BED is in 20 column Ensembl format')
    pg_bed.add_argument(
        '-R', '--regions', action='append', default=[],
        help='Filter input by regions e.g.:'
             + ' X,2:20000-25000,3:100-500+')
    pg_bed.add_argument(
        '-B', '--biotypes', action='append', default=[],
        help='For Ensembl BED restrict translations to Ensembl biotypes')
    pg_trans = parser.add_argument_group('Translation filter options')
    pg_trans.add_argument(
        '-m', '--min_length', type=int, default=10,
        help='Minimum length of protein translation to report')
    pg_trans.add_argument(
        '-e', '--enzyme', default=None,
        help='Digest translation with enzyme')
    pg_trans.add_argument(
        '-M', '--start_codon', action='store_true', default=False,
        help='Trim translations to methionine start_codon')
    pg_trans.add_argument(
        '-C', '--cds', action='store_true', default=False,
        help='Only translate CDS')
    pg_trans.add_argument(
        '-A', '--all', action='store_true',
        help='Include CDS protein translations ')
    pg_fmt = parser.add_argument_group('ID format options')
    pg_fmt.add_argument(
        '-r', '--reference', default='',
        help='Genome Reference Name')
    pg_fmt.add_argument(
        '-D', '--fa_db', dest='fa_db', default=None,
        help='Prefix DB identifier for fasta ID line, e.g. generic')
    pg_fmt.add_argument(
        '-s', '--fa_sep', dest='fa_sep', default='|',
        help='fasta ID separator defaults to pipe char, ' +
             'e.g. generic|ProtID|description')
    pg_fmt.add_argument(
        '-P', '--id_prefix', default='',
        help='prefix for the sequence ID')
    parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
    parser.add_argument('-d', '--debug', action='store_true', help='Debug')
    args = parser.parse_args()

    input_rdr = open(args.input_bed, 'r')\
        if args.input_bed != '-' else sys.stdin
    fa_wtr = open(args.fasta, 'w')\
        if args.fasta is not None and args.fasta != '-' else sys.stdout
    bed_wtr = open(args.bed, 'w') if args.bed is not None else None

    # Unknown enzyme names yield None, which disables digestion below.
    enzyme = digest.expasy_rules.get(args.enzyme, None)

    # -B may be repeated and each value may itself be comma separated.
    biotypea = [bt.strip() for biotype in args.biotypes
                for bt in biotype.split(',')]

    twobit = TwoBitFile(args.twobit) if args.twobit else None

    # chrom -> list of [start, end, strand]; the fields are the raw regex
    # groups (strings or None) and are converted in filter_by_regions().
    selected_regions = dict()
    # Raw string so that \d reaches the regex engine untouched.
    region_pat = r'^(?:chr)?([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?'
    if args.regions:
        for entry in args.regions:
            if not entry:
                continue
            regs = [x.strip() for x in entry.split(',') if x.strip()]
            for reg in regs:
                m = re.match(region_pat, reg)
                if m:
                    (chrom, start, end, strand) = m.groups()
                    if chrom:
                        selected_regions.setdefault(chrom, []).append(
                            [start, end, strand])
        if args.debug:
            print("selected_regions: %s" % selected_regions, file=sys.stderr)

    # An inline "(?i)" that is not at the very start of a pattern is an error
    # on Python 3.11+, so the flag is passed explicitly instead.
    chr_prefix = re.compile(r'^chr', re.IGNORECASE)

    def filter_by_regions(bed):
        """Return True when ``bed`` overlaps a selected region (or none are set)."""
        if not selected_regions:
            return True
        ref = chr_prefix.sub('', bed.chrom)
        if ref not in selected_regions:
            return False
        for (_start, _stop, _strand) in selected_regions[ref]:
            start = int(_start) if _start else 0
            # sys.maxint is gone in Python 3; sys.maxsize exists in 2 and 3.
            stop = int(_stop) if _stop else sys.maxsize
            if _strand and bed.strand != _strand:
                continue
            if bed.chromEnd >= start and bed.chromStart <= stop:
                return True
        return False

    translations = dict()  # chromStart -> chromEnd -> [seq, ...]

    def unique_prot(tbed, seq):
        """Record ``seq`` for the span of ``tbed``; False if already recorded."""
        seen = translations.setdefault(
            tbed.chromStart, dict()).setdefault(tbed.chromEnd, [])
        if seq in seen:
            return False
        seen.append(seq)
        return True

    def get_sequence(chrom, start, end):
        """Return ``twobit[contig][start:end]`` or None.

        The contig is looked up by ``chrom`` first, then by the name with the
        ``chr`` prefix removed or added.
        """
        if not twobit:
            return None
        alt = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom
        for contig in (chrom, alt):
            # ``end`` is used as an exclusive slice bound, so a span that
            # reaches the last base has end == len(contig) and is valid.
            if contig in twobit and 0 <= start < end <= len(twobit[contig]):
                return twobit[contig][start:end]
        return None

    def write_translation(tbed, accession, peptide):
        """Write one peptide to the FASTA output and, if enabled, the BED output.

        Mutates ``tbed.name`` when ``--id_prefix`` is given.
        """
        if args.id_prefix:
            tbed.name = "%s%s" % (args.id_prefix, tbed.name)
        probed = "%s\t%s\t%s\t%s%s" % (accession, peptide,
                                       'unique', args.reference,
                                       '\t.' * 9)
        if bed_wtr:
            bed_wtr.write("%s\t%s\n" % (str(tbed), probed))
            bed_wtr.flush()
        location = "chromosome:%s:%s:%s:%s:%s"\
            % (args.reference, tbed.chrom,
               tbed.thickStart, tbed.thickEnd, tbed.strand)
        fa_desc = '%s%s' % (args.fa_sep, location)
        fa_db = '%s%s' % (args.fa_db, args.fa_sep) if args.fa_db else ''
        fa_id = ">%s%s%s\n" % (fa_db, tbed.name, fa_desc)
        fa_wtr.write(fa_id)
        fa_wtr.write(peptide)
        fa_wtr.write("\n")
        fa_wtr.flush()

    def translate_bed(bed):
        """Translate one BED record; return the number of translations written."""
        translate_count = 0
        transcript_id = bed.name
        refprot = None
        if not bed.seq:
            if twobit:
                bed.seq = get_sequence(bed.chrom, bed.chromStart, bed.chromEnd)
            else:
                bed.cdna = get_cdna(transcript_id)
        cdna = bed.get_cdna()
        if cdna is None:
            return translate_count
        cdna_len = len(cdna)
        if args.cds or args.all:
            # Best effort: a record whose CDS cannot be extracted or
            # translated is simply treated as having no reference protein.
            try:
                cds = bed.get_cds()
                if cds:
                    if args.debug:
                        print("cdna:%s" % str(cdna), file=sys.stderr)
                        print("cds: %s" % str(cds), file=sys.stderr)
                    if len(cds) % 3 != 0:
                        cds = cds[:-(len(cds) % 3)]
                    refprot = translate(cds) if cds else None
            except Exception as e:
                if args.debug:
                    print("CDS translation failed: %s" % e, file=sys.stderr)
                refprot = None
            if args.cds:
                if refprot:
                    # NOTE(review): tbed is taken before bed.trim_cds() is
                    # called below; whether the trims are reflected in tbed
                    # depends on get_cds_bed(), which is not visible here.
                    tbed = bed.get_cds_bed()
                    if args.start_codon:
                        m = refprot.find('M')
                        if m < 0:
                            return 0
                        elif m > 0:
                            bed.trim_cds(m*3)
                            refprot = refprot[m:]
                    stop = refprot.find('*')
                    if stop >= 0:
                        bed.trim_cds((stop - len(refprot)) * 3)
                        refprot = refprot[:stop]
                    if len(refprot) >= args.min_length:
                        write_translation(tbed, bed.name, refprot)
                        return 1
                return 0
        if args.debug:
            print("%s\n" % (str(bed)), file=sys.stderr)
            print("CDS: %s %d %d" %
                  (bed.strand, bed.cdna_offset_of_pos(bed.thickStart),
                   bed.cdna_offset_of_pos(bed.thickEnd)),
                  file=sys.stderr)
            print("refprot: %s" % str(refprot), file=sys.stderr)
        for offset in range(3):
            # Drop trailing bases that do not complete a codon in this frame.
            seqend = cdna_len - (cdna_len - offset) % 3
            aaseq = translate(cdna[offset:seqend])
            aa_start = 0
            while aa_start < len(aaseq):
                aa_end = aaseq.find('*', aa_start)
                if aa_end < 0:
                    aa_end = len(aaseq)
                prot = aaseq[aa_start:aa_end]
                if args.start_codon:
                    m = prot.find('M')
                    if m >= 0:
                        aa_start += m
                    else:
                        # No methionine in this piece: collapse it to empty.
                        aa_start = aa_end
                    prot = aaseq[aa_start:aa_end]
                if enzyme and refprot:
                    # Strip trailing digest fragments that already occur in
                    # the reference protein.
                    frags = digest._cleave(prot, enzyme)
                    for frag in reversed(frags):
                        if frag in refprot:
                            prot = prot[:prot.rfind(frag)]
                        else:
                            break
                is_cds = refprot and prot in refprot
                if args.debug:
                    print("is_cds: %s %s" % (str(is_cds), str(prot)),
                          file=sys.stderr)
                if len(prot) < args.min_length:
                    pass
                elif not args.all and is_cds:
                    pass
                else:
                    tstart = aa_start*3+offset
                    tend = aa_end*3+offset
                    prot_acc = "%s_%d_%d" % (transcript_id, tstart, tend)
                    tbed = bed.trim(tstart, tend)
                    if args.all or unique_prot(tbed, prot):
                        translate_count += 1
                        tbed.name = prot_acc
                        write_translation(tbed, bed.name, prot)
                aa_start = aa_end + 1
        return translate_count

    translation_count = 0
    transcript_count = 0
    try:
        for i, bedline in enumerate(input_rdr):
            try:
                bed = bed_from_line(bedline, ensembl=args.ensembl,
                                    seq_column=args.column)
                if bed is None:
                    continue
                transcript_count += 1
                if bed.biotype and biotypea and bed.biotype not in biotypea:
                    continue
                if filter_by_regions(bed):
                    translation_count += translate_bed(bed)
            except Exception as e:
                print("BED format Error: line %d: %s\n%s"
                      % (i, bedline, e), file=sys.stderr)
                break
        if args.debug or args.verbose:
            print("transcripts: %d\ttranslations: %d"
                  % (transcript_count, translation_count), file=sys.stderr)
    finally:
        # Close only the files opened here, never the process's std streams.
        for handle in (input_rdr, fa_wtr, bed_wtr):
            if handle is None or handle is sys.stdin or handle is sys.stdout:
                continue
            handle.close()
def __main__():
    """Retrieve Ensembl cDNA transcripts (or read BED) and three-frame translate.

    Parses the command line and then either translates BED lines read from
    ``--input`` or retrieves transcripts region by region through
    ``get_transcripts_bed``.  Translations are written to the FASTA and/or BED
    outputs that were requested; retrieved transcripts can also be written to
    ``--transcripts``.

    Relies on names defined elsewhere in the module: ``digest``,
    ``TwoBitFile``, ``bed_from_line``, ``translate``, ``get_cdna``,
    ``get_cds``, ``get_transcripts_bed``, ``get_toplevel`` and ``max_region``.
    """
    parser = argparse.ArgumentParser(
        description='Retrieve Ensembl cDNAs and three frame translate')
    parser.add_argument('-s',
                        '--species',
                        default='human',
                        help='Ensembl Species to retrieve')
    parser.add_argument(
        '-R',
        '--regions',
        action='append',
        default=[],
        help=
        'Restrict Ensembl retrieval to regions e.g.: X,2:20000-25000,3:100-500+'
    )
    parser.add_argument('-B',
                        '--biotypes',
                        action='append',
                        default=[],
                        help='Restrict Ensembl biotypes to retrieve')
    parser.add_argument(
        '-i',
        '--input',
        default=None,
        help='Use BED instead of retrieving cDNA from ensembl (-) for stdin')
    parser.add_argument('-T',
                        '--twobit',
                        default=None,
                        help='Genome reference sequence in 2bit format')
    parser.add_argument(
        '-t',
        '--transcripts',
        default=None,
        help='Path to output cDNA transcripts.bed (-) for stdout')
    parser.add_argument(
        '-r',
        '--raw',
        action='store_true',
        help='Report transcript exacty as returned from Ensembl')
    parser.add_argument('-f',
                        '--fasta',
                        default=None,
                        help='Path to output translations.fasta')
    parser.add_argument('-b',
                        '--bed',
                        default=None,
                        help='Path to output translations.bed')
    parser.add_argument('-m',
                        '--min_length',
                        type=int,
                        default=7,
                        help='Minimum length of protein translation to report')
    parser.add_argument('-e',
                        '--enzyme',
                        default=None,
                        help='Digest translation with enzyme')
    parser.add_argument('-a',
                        '--all',
                        action='store_true',
                        help='Include reference protein translations')
    parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
    parser.add_argument('-d', '--debug', action='store_true', help='Debug')
    args = parser.parse_args()

    def log(msg):
        """Write ``msg`` and a newline to stderr (works on Python 2 and 3)."""
        sys.stderr.write(msg + "\n")

    species = args.species
    input_rdr = None
    if args.input is not None:
        input_rdr = open(args.input, 'r') if args.input != '-' else sys.stdin
    tx_wtr = None
    if args.transcripts is not None:
        tx_wtr = open(args.transcripts, 'w')\
            if args.transcripts != '-' else sys.stdout
    fa_wtr = open(args.fasta, 'w') if args.fasta is not None else None
    bed_wtr = open(args.bed, 'w') if args.bed is not None else None

    # An enzyme name not present in expasy_rules is passed through unchanged.
    enzyme = digest.expasy_rules.get(args.enzyme, args.enzyme)

    # NOTE(review): entries carry a "biotype=" prefix, so the
    # ``bed.biotype not in biotypea`` test further down only matches if
    # bed.biotype has the same form -- confirm against bed_from_line().
    biotypea = [
        'biotype=%s' % bt.strip() for biotype in args.biotypes
        for bt in biotype.split(',')
    ]
    # Same values joined with ';' and passed as ``params`` when retrieving.
    biotypes = ';'.join([
        'biotype=%s' % bt.strip() for biotype in args.biotypes
        for bt in biotype.split(',') if bt.strip()
    ])

    twobit = TwoBitFile(args.twobit) if args.twobit else None

    # chrom -> list of [start, end, strand] (raw regex groups: str or None)
    selected_regions = dict()
    # Raw string so that \d reaches the regex engine untouched.
    region_pat = r'^([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?'
    if args.regions:
        for entry in args.regions:
            if not entry:
                continue
            regs = [x.strip() for x in entry.split(',') if x.strip()]
            for reg in regs:
                m = re.match(region_pat, reg)
                if m:
                    (chrom, start, end, strand) = m.groups()
                    if chrom:
                        selected_regions.setdefault(chrom, []).append(
                            [start, end, strand])
        if args.debug:
            log("selected_regions: %s" % selected_regions)

    translations = dict()  # chromStart -> chromEnd -> [seq, ...]

    def unique_prot(tbed, seq):
        """Record ``seq`` for the span of ``tbed``; False if already recorded."""
        seen = translations.setdefault(
            tbed.chromStart, dict()).setdefault(tbed.chromEnd, [])
        if seq in seen:
            return False
        seen.append(seq)
        return True

    def get_sequence(chrom, start, end):
        """Return ``twobit[contig][start:end]`` or None.

        The contig is looked up by ``chrom`` first, then by the name with the
        ``chr`` prefix removed or added.
        """
        if twobit:
            if chrom in twobit:
                return twobit[chrom][start:end]
            contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom
            if contig in twobit:
                return twobit[contig][start:end]
        return None

    def translate_bed(bed):
        """Translate one BED record; return the number of translations written.

        Does nothing unless a FASTA or BED writer was requested.
        """
        translate_count = 0
        if not (fa_wtr or bed_wtr):
            return translate_count
        transcript_id = bed.name
        refprot = None
        if twobit:
            bed.seq = get_sequence(bed.chrom, bed.chromStart, bed.chromEnd)
        else:
            bed.cdna = get_cdna(transcript_id)
        cdna = bed.get_cdna()
        cdna_len = len(cdna)
        if not args.all:
            # Best effort: without a usable CDS there is no reference
            # protein and nothing is filtered out below.
            try:
                cds = bed.get_cds()
                if cds is None:
                    cds = get_cds(transcript_id)
                if len(cds) % 3 != 0:
                    cds = cds[:-(len(cds) % 3)]
                refprot = translate(cds) if cds else None
            except Exception:
                refprot = None
        for offset in range(3):
            # Drop trailing bases that do not complete a codon in this frame.
            seqend = cdna_len - (cdna_len - offset) % 3
            aaseq = translate(cdna[offset:seqend])
            aa_start = 0
            while aa_start < len(aaseq):
                aa_end = aaseq.find('*', aa_start)
                if aa_end < 0:
                    aa_end = len(aaseq)
                prot = aaseq[aa_start:aa_end]
                if enzyme and refprot:
                    # Strip trailing digest fragments that already occur in
                    # the reference protein.
                    frags = digest._cleave(prot, enzyme)
                    for frag in reversed(frags):
                        if frag in refprot:
                            prot = prot[:prot.rfind(frag)]
                        else:
                            break
                if len(prot) < args.min_length:
                    pass
                elif refprot and prot in refprot:
                    pass
                else:
                    tstart = aa_start * 3 + offset
                    tend = aa_end * 3 + offset
                    prot_acc = "%s_%d_%d" % (transcript_id, tstart, tend)
                    tbed = bed.trim(tstart, tend)
                    if args.all or unique_prot(tbed, prot):
                        translate_count += 1
                        tbed.name = prot_acc
                        # Either writer may be absent; previously both were
                        # written to unconditionally, failing on None.
                        if bed_wtr:
                            bed_wtr.write("%s\t%s\n" % (str(tbed), prot))
                            bed_wtr.flush()
                        if fa_wtr:
                            fa_id = ">%s\n" % (prot_acc)
                            fa_wtr.write(fa_id)
                            fa_wtr.write(prot)
                            fa_wtr.write("\n")
                            fa_wtr.flush()
                aa_start = aa_end + 1
        return translate_count

    def translate_region(species, ref, start, stop, strand):
        """Retrieve and translate transcripts of ``ref`` between start and stop.

        The span is walked in steps of ``max_region``; returns the number of
        translations written.
        """
        translation_count = 0
        # list() so that append() also works where range() is lazy (Python 3).
        regions = list(range(start, stop, max_region))
        if not regions or regions[-1] < stop:
            regions.append(stop)
        for end in regions[1:]:
            bedlines = get_transcripts_bed(species,
                                           ref,
                                           start,
                                           end,
                                           strand=strand,
                                           params=biotypes)
            if args.verbose or args.debug:
                log("%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d"
                    % (species, ref, start, end, len(bedlines)))
            for i, bedline in enumerate(bedlines):
                try:
                    # Parsing is skipped only when raw lines are merely being
                    # copied to the transcripts output.
                    bed = bed_from_line(bedline)\
                        if any([not args.raw, fa_wtr, bed_wtr])\
                        else None
                    if tx_wtr:
                        tx_wtr.write(bedline if args.raw else str(bed))
                        tx_wtr.write("\n")
                        tx_wtr.flush()
                    if bed:
                        translation_count += translate_bed(bed)
                except Exception as e:
                    log("BED error (%s) : %s\n" % (e, bedline))
            start = end + 1
        return translation_count

    if input_rdr:
        translation_count = 0
        for i, bedline in enumerate(input_rdr):
            try:
                bed = bed_from_line(bedline)
                if bed is None:
                    continue
                if bed.biotype and biotypea and bed.biotype not in biotypea:
                    continue
                translation_count += translate_bed(bed)
            except Exception:
                log("BED format error: %s\n" % bedline)
        if args.debug or (args.verbose and any([fa_wtr, bed_wtr])):
            log("%s\tcDNA translations:%d" % (species, translation_count))
    else:
        coord_systems = get_toplevel(species)
        if 'chromosome' in coord_systems:
            ref_lengths = dict()
            for ref in sorted(coord_systems['chromosome'].keys()):
                length = coord_systems['chromosome'][ref]
                ref_lengths[ref] = length
                # With no output requested, just list the references.
                if not any([tx_wtr, fa_wtr, bed_wtr]):
                    log("%s\t%s\tlength: %d" % (species, ref, length))
            if selected_regions:
                translation_count = 0
                for ref in sorted(selected_regions.keys()):
                    if ref in ref_lengths:
                        for reg in selected_regions[ref]:
                            (_start, _stop, _strand) = reg
                            start = int(_start) if _start else 0
                            stop = int(_stop) if _stop else ref_lengths[ref]
                            strand = '' if not _strand else ':1' if _strand == '+' else ':-1'
                            translation_count += translate_region(
                                species, ref, start, stop, strand)
            else:
                strand = ''
                start = 0
                for ref in sorted(ref_lengths.keys()):
                    length = ref_lengths[ref]
                    # The count is reported per reference, so reset it here.
                    translation_count = 0
                    if args.debug:
                        log("Retrieving transcripts: %s\t%s\tlength: %d"
                            % (species, ref, length))
                    translation_count += translate_region(
                        species, ref, start, length, strand)
                    if args.debug or (args.verbose and any([fa_wtr, bed_wtr])):
                        log("%s\t%s\tlength: %d\tcDNA translations:%d"
                            % (species, ref, length, translation_count))
Esempio n. 10
0
class EnsemblRef(object):
    """Gene-model lookups backed by a GTF annotation and a 2bit genome.

    The gene dictionary, the transcript index and the name index are each
    built on first request and cached on the instance.  Coordinates are
    passed straight through to the transcript objects and to ``TwoBitFile``
    slicing.
    """

    def __init__(self, gtf_file, twobitfile, read_now=True):
        """Open the 2bit file and optionally index the GTF right away.

        Args:
            gtf_file: GTF path handed to the parser when genes are needed.
            twobitfile: path handed to ``TwoBitFile``.
            read_now: when true, build the transcript index immediately
                instead of on first use.
        """
        self.gtf_file = gtf_file
        self.twobitfile = twobitfile
        self.twobit = TwoBitFile(self.twobitfile)
        self.gene_dict = None
        self.transcript_idx = None
        self.name_idx = None
        if read_now:
            self.get_transcript_idx()

    def get_gene_dict(self):
        """Return the parsed gene dictionary, parsing the GTF on first call."""
        if self.gene_dict is None:
            gene_structures = gene.t_parse_gtf('test')
            self.gene_dict = gene_structures.get_genes(self.gtf_file,
                                                       logger=logger)
        return self.gene_dict

    def get_transcript_idx(self):
        """Return the transcript index, building it on first call."""
        if self.transcript_idx is None:
            self.transcript_idx = gene_utilities.index_transcripts(
                self.get_gene_dict(), by_prot_id=False)
        return self.transcript_idx

    def get_name_idx(self):
        """Return a name -> object lookup, building it on first call.

        Gene names map to the gene object; transcript names and a
        transcript's ``prot_id`` map to the transcript.  When two entries
        share a key the one inserted last wins.
        """
        if self.name_idx is None:
            name_idx = dict()
            for t in self.get_transcript_idx().values():
                for name in t.gene.names:
                    name_idx[name] = t.gene
                for name in t.names:
                    name_idx[name] = t
                if t.prot_id:
                    name_idx[t.prot_id] = t
            self.name_idx = name_idx
        return self.name_idx

    def get_gtf_transcript(self, name):
        """Look ``name`` up in the transcript index, then in the name index.

        Returns:
            The matching object, or ``None`` when neither index knows the
            name.  A hit from the name index may be a gene rather than a
            transcript, because gene names are stored there as well.
        """
        idx = self.get_transcript_idx()
        if name in idx:
            return idx[name]
        return self.get_name_idx().get(name)

    def transcript_is_coding(self, transcript_id):
        """Return True when the transcript has at least one start codon."""
        tx = self.get_transcript_idx()[transcript_id]
        return len(tx.start_codons) > 0

    def get_transcript_start_codon(self, transcript_id):
        """Return the transcript's first start codon, or ``None`` if it has none."""
        tx = self.get_transcript_idx()[transcript_id]
        return tx.start_codons[0] if len(tx.start_codons) > 0 else None

    def get_bed_line(self,
                     transcript_id,
                     score=0,
                     itemRgb='0,0,0',
                     coding=False):
        """Format one transcript as a 12-column, tab-separated BED line.

        Args:
            transcript_id: key into the transcript index.
            score: value written to the score column.
            itemRgb: value written to the itemRgb column.
            coding: when true, the line spans only the coding region and
                uses the coding exons as blocks.

        Returns:
            The BED line without a trailing newline.

        Raises:
            KeyError: if ``transcript_id`` is not in the transcript index.
        """
        tx = self.get_transcript_idx()[transcript_id]
        chrom = tx.gene.contig
        chromStart = tx.coding_beg if coding else tx.beg
        chromEnd = tx.coding_end if coding else tx.end
        name = transcript_id
        # A falsy coding boundary falls back to the span of the line itself.
        thickStart = tx.coding_beg if tx.coding_beg else chromStart
        thickEnd = tx.coding_end if tx.coding_end else chromEnd
        exons = tx.get_coding_exons() if coding else tx.get_exons()
        blockCount = len(exons)
        if tx.gene.strand:
            strand = '+'
            ordered = exons
        else:
            strand = '-'
            # Reverse-strand exons are emitted in reversed order
            # (presumably they are stored in transcription order).
            ordered = list(reversed(exons))
        blockSizes = ','.join([str(abs(e - s)) for s, e in ordered])
        blockStarts = ','.join([str(s - chromStart) for s, e in ordered])
        return '%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d\t%s\t%d\t%s\t%s' % (
            chrom, chromStart, chromEnd, name, score, strand, thickStart,
            thickEnd, itemRgb, blockCount, blockSizes, blockStarts)

    def transcripts_in_range(self, chrom, startpos, endpos, strand=None):
        """Return transcripts on ``chrom`` overlapping the given interval.

        A falsy ``endpos`` turns the query into the single position
        ``startpos``.  A truthy ``strand`` additionally requires
        ``gene.strand == strand``; a falsy ``strand`` disables the filter.
        """
        spos = min(startpos, endpos) if endpos else startpos
        epos = max(startpos, endpos) if endpos else startpos
        transcripts = []
        for t in self.get_transcript_idx().values():
            if t.gene.contig == chrom and t.beg <= epos and spos <= t.end:
                if strand and t.gene.strand != strand:
                    continue
                transcripts.append(t)
        return transcripts

    def genes_in_range(self,
                       chrom,
                       startpos,
                       endpos,
                       strand=None,
                       gene_types=None):
        """Return genes on ``chrom`` overlapping the given interval.

        Args:
            chrom: contig name compared against ``gene.contig``.
            startpos, endpos: interval ends in either order; a falsy
                ``endpos`` turns the query into a single position.
            strand: when truthy, keep only genes whose ``strand`` equals it.
            gene_types: optional iterable restricting which keys of the gene
                dictionary are searched; unknown keys are ignored.
        """
        spos = min(startpos, endpos) if endpos else startpos
        epos = max(startpos, endpos) if endpos else startpos
        gene_dict = self.get_gene_dict()
        gtypes = set(gene_dict.keys())
        if gene_types:
            gtypes &= set(gene_types)
        genes = []
        for gt in gtypes:
            # ``g`` rather than ``gene`` so the ``gene`` module is not shadowed.
            for g in gene_dict[gt]:
                if g.contig == chrom and g.beg <= epos and spos <= g.end:
                    if strand and g.strand != strand:
                        continue
                    genes.append(g)
        return genes

    def get_sequence(self, chrom, start, end):
        """Return ``twobit[chrom][start:end]``, tolerating a ``chr`` prefix mismatch.

        Returns:
            The sliced sequence, or ``None`` when neither ``chrom`` nor its
            alternate spelling (prefix removed or added) is in the 2bit file.
        """
        if self.twobit:
            if chrom in self.twobit:
                return self.twobit[chrom][start:end]
            contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom
            if contig in self.twobit:
                return self.twobit[contig][start:end]
        return None

    def sequence_sizes(self):
        """Return the 2bit file's ``sequence_sizes()`` result."""
        return self.twobit.sequence_sizes()

    def get_transcript_seq(self, transcript_id, coding=False):
        """Return the concatenated exon sequence of a transcript.

        Exons are joined in the order the transcript reports them; on the
        reverse strand each exon is reverse-complemented first.

        Args:
            transcript_id: key into the transcript index.
            coding: use the coding exons instead of all exons.
        """
        tx = self.get_transcript_idx()[transcript_id]
        chrom = tx.gene.contig
        exonbnds = tx.get_coding_exons() if coding else tx.get_exons()
        seqs = [self.get_sequence(chrom, s, e) for s, e in exonbnds]
        if not tx.gene.strand:
            seqs = [reverse_complement(seq) for seq in seqs]
        return ''.join(seqs)

    def get_cdna(self, transcript_id):
        """Return the sequence of all exons of the transcript."""
        return self.get_transcript_seq(transcript_id, coding=False)

    def get_cds(self, transcript_id):
        """Return the sequence of the coding exons of the transcript."""
        return self.get_transcript_seq(transcript_id, coding=True)

    def genome_to_transcript_pos(self,
                                 transcript_id,
                                 genome_pos,
                                 coding=False):
        """Convert a genomic coordinate into an offset along the exons.

        Args:
            transcript_id: key into the transcript index.
            genome_pos: genomic coordinate to convert.
            coding: measure along the coding exons instead of all exons.

        Returns:
            The 0-based offset within the concatenated exons, or ``None``
            when the position is outside ``[tx.beg, tx.end)`` or lies in
            none of the selected exons (for example in an intron, or in a
            UTR when ``coding`` is true).
        """
        tx = self.get_transcript_idx()[transcript_id]
        if not tx.beg <= genome_pos < tx.end:
            return None
        exonbnds = tx.get_coding_exons() if coding else tx.get_exons()
        forward = tx.gene.strand
        preceding = 0  # total length of the exons already walked past
        for s, e in exonbnds:
            if s <= genome_pos < e:
                within = genome_pos - s if forward else e - genome_pos - 1
                return preceding + within
            preceding += e - s
        # No exon contains the position; the summed exon length is not a
        # meaningful offset, so report "no position" instead.
        return None

    def genome_to_cdna_pos(self, transcript_id, genome_pos):
        """Return the offset of ``genome_pos`` along all exons, or ``None``."""
        return self.genome_to_transcript_pos(transcript_id,
                                             genome_pos,
                                             coding=False)

    def genome_to_cds_pos(self, transcript_id, genome_pos):
        """Return the offset of ``genome_pos`` along the coding exons, or ``None``."""
        return self.genome_to_transcript_pos(transcript_id,
                                             genome_pos,
                                             coding=True)
def __main__():
    parser = argparse.ArgumentParser(
        description='Generate proBED and proBAM from mz.sqlite')
    parser.add_argument('mzsqlite', help="mz.sqlite converted from mzIdentML")
    parser.add_argument(
        'genomic_mapping_sqlite',
        help="genomic_mapping.sqlite with feature_cds_map table")
    parser.add_argument('-R',
                        '--genomeReference',
                        default='Unknown',
                        help='Genome reference sequence in 2bit format')
    parser.add_argument('-t',
                        '--twobit',
                        default=None,
                        help='Genome reference sequence in 2bit format')
    parser.add_argument('-r',
                        '--reads_bam',
                        default=None,
                        help='reads alignment bam path')
    parser.add_argument('-g',
                        '--gffutils_sqlite',
                        default=None,
                        help='gffutils GTF sqlite DB')
    parser.add_argument('-B', '--probed', default=None, help='proBed path')
    parser.add_argument('-s', '--prosam', default=None, help='proSAM path')
    parser.add_argument('-b', '--probam', default=None, help='proBAM path')
    parser.add_argument('-l',
                        '--limit',
                        type=int,
                        default=None,
                        help='limit numbers of PSMs for testing')
    parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
    parser.add_argument('-d', '--debug', action='store_true', help='Debug')
    args = parser.parse_args()

    def get_sequence(chrom, start, end):
        """Return reference bases ``[start, end)`` of ``chrom`` from the 2bit file.

        The contig is looked up under the given name first and then under
        the alternate spelling with the ``chr`` prefix removed or added.

        Returns:
            The sliced sequence; ``''`` when no matching contig can supply
            the requested interval; ``None`` when ``twobit`` is unset or
            empty.
        """
        if not twobit:
            return None
        alternate = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom
        for contig in (chrom, alternate):
            # ``end`` is an exclusive slice bound, so an interval may stop
            # exactly at the contig length.
            if contig in twobit and 0 <= start < end <= len(twobit[contig]):
                return twobit[contig][start:end]
        return ''

    twobit = TwoBitFile(args.twobit) if args.twobit else None
    samfile = pysam.AlignmentFile(args.reads_bam,
                                  "rb") if args.reads_bam else None
    seqlens = twobit.sequence_sizes()

    probed = open(args.probed, 'w') if args.probed else sys.stdout

    gff_cursor = get_connection(
        args.gffutils_sqlite).cursor() if args.gffutils_sqlite else None
    map_cursor = get_connection(args.genomic_mapping_sqlite).cursor()
    mz_cursor = get_connection(args.mzsqlite).cursor()

    unmapped_accs = set()
    timings = dict()

    def add_time(name, elapsed):
        """Add ``elapsed`` to the running total kept under ``name`` in ``timings``."""
        try:
            timings[name] += elapsed
        except KeyError:
            # First measurement recorded for this label.
            timings[name] = elapsed

    XG_TYPES = [
        'N', 'V', 'W', 'J', 'A', 'M', 'C', 'E', 'B', 'O', 'T', 'R', 'I', 'G',
        'D', 'U', 'X', '*'
    ]
    FT_TYPES = ['CDS', 'five_prime_utr', 'three_prime_utr', 'transcript']

    def get_peptide_type(exons):
        """Return a proBAM ``XG`` peptide-type code for a mapped peptide.

        Args:
            exons: sequence of 7-item entries unpacked as
                ``(acc, chrom, start, end, strand, cds_start, cds_end)``
                (presumably the list built by ``get_mapping`` - confirm
                against the caller).

        Returns:
            ``'*'`` when no gffutils cursor is configured.  Otherwise the
            code assigned to the LAST exon, because ``peptide_type`` is
            overwritten on every pass of the second loop.  The code below
            only ever assigns ``'N'`` or ``'O'``, or leaves ``'*'``.
        """
        ## XG classify peptide
        ##     N  Normal peptide. The peptide sequence is contained in the reference protein sequence.
        ##     V  Variant peptide. A single amino acid variation (SAV) is present as compared to the reference.
        ##     W  Indel peptide. An insertion or deletion is present as compared to the reference.
        ##     J  Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference.
        ##     A  Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference.
        ##     M  Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference.
        ##     C  Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic).
        ##     E  Extension peptide. A peptide that points to a non-canonical N-terminal protein extension.
        ##     B  3' UTR peptide. A peptide that maps to the 3' UTR region from the reference.
        ##     O  Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference.
        ##     T  Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation.
        ##     R  Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference.
        ##     I  Intron peptide. A peptide that is located in an intronic region of the reference isoform.
        ##     G  Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion.
        ##     D  Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy.
        ##     U  Unmapped peptide. A peptide that could not be mapped to a reference sequence.
        ##     X  Unknown.

        peptide_type = '*'
        if gff_cursor:
            ts = time()
            etypes = ['*'] * len(exons)
            efeatures = [None] * len(exons)
            if args.debug:
                print('exons:%d\t%s' % (len(exons), etypes), file=sys.stderr)
            # Pass 1: fetch the candidate annotation features for every exon.
            for i, exon in enumerate(exons):
                (acc, gc, gs, ge, st, cs, ce) = exon
                # Frame handed to the query: CDS offset modulo the codon size.
                fr = cs % 3
                if args.debug:
                    print('exon:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' %
                          (acc, gc, gs, ge, st, cs, ce, fr),
                          file=sys.stderr)
                ft_params = {
                    # The query is given the contig name with 'chr' removed.
                    "seqid": str(gc).replace('chr', ''),
                    "start": gs,
                    "end": ge,
                    'strand': st,
                    'frame': fr,
                    'ftype': 'CDS'
                }
                features = [
                    f for f in gff_cursor.execute(FEATURE_ANY_QUERY, ft_params)
                ]
                efeatures[i] = features
            # Pass 2: derive a type code per exon from its features.
            for i, exon in enumerate(exons):
                (acc, gc, gs, ge, st, cs, ce) = exon
                for f in efeatures[i]:
                    (id, seqid, start, end, featuretype, strand, frame,
                     in_frame) = f
                    if args.debug:
                        print('feat:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' %
                              (id, seqid, start, end, featuretype, strand,
                               frame, in_frame),
                              file=sys.stderr)
                    if strand == st:
                        # Same-strand feature that fully contains the exon.
                        if start <= gs and ge <= end:
                            if in_frame:
                                etypes[i] = 'N'
                                break
                            # A lower XG_TYPES index ranks higher, so 'O'
                            # only replaces a lower-ranked code such as '*'.
                            elif XG_TYPES.index('O') < XG_TYPES.index(
                                    etypes[i]):
                                etypes[i] = 'O'
                        # Unconditional: scanning stops at the first
                        # same-strand feature, whether or not it contained
                        # the exon.
                        break
                    else:
                        # Opposite-strand feature: mark 'O' unless a
                        # higher-ranked code is already set, and keep scanning.
                        if XG_TYPES.index('O') < XG_TYPES.index(etypes[i]):
                            etypes[i] = 'O'
                # Overwritten for each exon; the last exon's code is returned.
                peptide_type = etypes[i]
            te = time()
            add_time('pep_type', te - ts)
        return peptide_type

    def classify_exon(exon, exons, features):
        """Placeholder exon classifier: always returns ``'*'``.

        None of the arguments are read.  The comments below sketch, per XG
        code, the criteria an implementation would have to test.
        """
        ##     N  Normal peptide. The peptide sequence is contained in the reference protein sequence.
        # 1 exon, contained, in_frame
        # 2+ exons, contained, in_frame, on_exon_boundary
        ##     V  Variant peptide. A single amino acid variation (SAV) is present as compared to the reference.
        # 1 exon, contained, in_frame, AA_mismatch
        # 2+ exons, contained, in_frame, on_exon_boundary, AA_mismatch
        ##     W  Indel peptide. An insertion or deletion is present as compared to the reference.
        # 1 exon, contained, in_frame, AA_mismatch
        # 2+ exons, contained, in_frame, on_exon_boundary or off by 3, AA_mismatch
        ##     J  Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference.
        # 2+ exons, contained, on_exon_boundary, same transcript, non adjacent exons
        ##     A  Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference.
        # 2+ exons, contained, on_exon_boundary, same transcript, non adjacent exons
        ##     M  Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference.
        ##     C  Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic).
        # 1 exon overlaps but not contained
        ##     E  Extension peptide. A peptide that points to a non-canonical N-terminal protein extension.
        ##     B  3' UTR peptide. A peptide that maps to the 3' UTR region from the reference.
        # exon overlaps a three_prime_utr
        ##     O  Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference.
        # exon contained but not in_frame
        ##     T  Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation.
        ##     R  Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference.
        ##     I  Intron peptide. A peptide that is located in an intronic region of the reference isoform.
        # exon contained in transcript, but not overlapping any exon
        ##     G  Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion.
        # exons are from different seqs, strands, or transcripts
        ##     D  Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy.
        ##     U  Unmapped peptide. A peptide that could not be mapped to a reference sequence.
        ##     X  Unknown.
        return '*'

    def get_variant_cds(exons, ref_prot, peptide, pep_cds):
        """Patch ``pep_cds`` with read-supported codons where the peptide differs.

        Nothing is done unless ``ref_prot != peptide`` and a BAM file
        (``samfile``) is open.  For each exon whose slice of the two protein
        strings differs, the read pileup is fetched and every mismatching
        residue is offered to ``get_pep_codon``; a returned codon replaces
        three bases of ``pep_cds`` starting at that base offset.

        Args:
            exons: 7-item entries unpacked as
                ``(acc, chrom, start, end, strand, c_start, c_end)``.
            ref_prot: translation of the reference sequence.
            peptide: identified peptide sequence.
            pep_cds: coding sequence to patch.

        Returns:
            The patched (or untouched) ``pep_cds``.  Any exception is
            reported on stderr and swallowed, so a partially patched value
            may be returned.

        NOTE(review): ``scan_name`` is read from the enclosing scope and is
        only used in the diagnostic messages.
        """
        if ref_prot != peptide and samfile:
            try:
                if args.debug:
                    print('name: %s \nref: %s\npep: %s\n' %
                          (scan_name, ref_prot, peptide),
                          file=sys.stderr)
                ts = time()
                for exon in exons:
                    (acc, chrom, start, end, strand, c_start, c_end) = exon
                    # NOTE(review): ``/`` is integer division only under
                    # Python 2; under Python 3 these become floats and the
                    # slices below raise (which the except then swallows).
                    # NOTE(review): a_start/a_end are nucleotide offsets
                    # rounded down to a multiple of 3, yet they slice the
                    # amino-acid strings - confirm this is intended.
                    a_start = c_start / 3 * 3
                    a_end = c_end / 3 * 3
                    if ref_prot[a_start:a_end] != peptide[a_start:a_end]:
                        pileup = get_exon_pileup(chrom, start, end)
                        # bi: base offset in the CDS, ai: residue index,
                        # ao: position of the base inside its codon.  The
                        # enumerate index ``i`` counts bases from the start
                        # of this exon.
                        for i, (bi, ai, ao) in enumerate([
                            (i, i / 3, i % 3) for i in range(c_start, c_end)
                        ]):
                            # Visit each codon once: at its first base, or at
                            # the exon's first base when the exon begins
                            # mid-codon.
                            if ao == 0 or i == 0:
                                if ref_prot[ai] != peptide[ai]:
                                    codon = get_pep_codon(
                                        pileup, bi - c_start, peptide[ai], ao)
                                    if args.debug:
                                        print('%d %d %d   %s :  %s %s %s' %
                                              (bi, ai, ao, peptide[ai],
                                               str(pep_cds[:bi]), str(codon),
                                               str(pep_cds[bi + 3:])),
                                              file=sys.stderr)
                                    if codon:
                                        # Splice the codon over bases
                                        # [bi, bi + 3) of the CDS.
                                        pep_cds = pep_cds[:
                                                          bi] + codon + pep_cds[
                                                              bi + 3:]
                te = time()
                add_time('var_cds', te - ts)
            except Exception as e:
                # Best effort: report and fall through to return what we have.
                print('name: %s \nref: %s\npep: %s\n%s\n' %
                      (scan_name, ref_prot, peptide, e),
                      file=sys.stderr)
        return pep_cds

    def get_mapping(acc, pep_start, pep_end):
        """Map a peptide's residue range on protein ``acc`` to genomic exons.

        The residue range is converted to CDS base offsets
        (``p_start = (pep_start - 1) * 3``, ``p_end = pep_end * 3``;
        presumably ``pep_start`` is 1-based - confirm).  Each row returned by
        ``MAP_QUERY`` is then skipped, trimmed or copied according to how it
        overlaps that range.

        Returns:
            A list of ``[acc, chrom, start, end, strand, c_start, c_end]``
            entries, where ``c_start``/``c_end`` are running base offsets
            within the peptide's own coding sequence.  The list is empty
            when nothing overlaps.
        """
        ts = time()
        p_start = (pep_start - 1) * 3
        p_end = pep_end * 3
        map_params = {"acc": acc, "p_start": p_start, "p_end": p_end}
        if args.debug:
            print('%s' % map_params, file=sys.stderr)
        locs = [l for l in map_cursor.execute(MAP_QUERY, map_params)]
        exons = []
        ## How each mapped segment is treated relative to the peptide span:
        ##       =========      pep
        ##  ---                 continue
        ##      ---             trim
        ##          ---         copy
        ##              ---     trim
        ##                 ---  break
        c_end = 0
        for i, (acc, chrom, start, end, strand, cds_start,
                cds_end) in enumerate(locs):
            if args.debug:
                print('Prot: %s\t%s:%d-%d\t%s\t%d\t%d' %
                      (acc, chrom, start, end, strand, cds_start, cds_end),
                      file=sys.stderr)
            # The next kept segment starts where the previous one ended.
            c_start = c_end
            if cds_end < p_start:
                continue
            if cds_start >= p_end:
                break
            # Trim the genomic interval to the peptide span; on the reverse
            # strand the CDS start corresponds to the genomic ``end``.
            if strand == '+':
                if cds_start < p_start:
                    start += p_start - cds_start
                if cds_end > p_end:
                    end -= cds_end - p_end
            else:
                if cds_start < p_start:
                    end -= p_start - cds_start
                if cds_end > p_end:
                    start += cds_end - p_end
            c_end = c_start + abs(end - start)
            if args.debug:
                print('Pep:  %s\t%s:%d-%d\t%s\t%d\t%d' %
                      (acc, chrom, start, end, strand, cds_start, cds_end),
                      file=sys.stderr)
            exons.append([acc, chrom, start, end, strand, c_start, c_end])
        te = time()
        add_time('get_mapping', te - ts)
        return exons

    def get_cds(exons):
        """Return the concatenated reference sequence of the given exons.

        Each entry is unpacked as
        ``(acc, chrom, start, end, strand, cds_start, cds_end)``.  The bases
        between the smaller and larger coordinate are fetched with
        ``get_sequence`` and reverse-complemented when the strand is ``'-'``.
        An empty exon list yields ``''``.
        """
        began = time()
        pieces = []
        for exon in exons:
            (_acc, contig, left, right, orientation, _cds_from,
             _cds_to) = exon
            bases = get_sequence(contig, min(left, right), max(left, right))
            if orientation == '-':
                bases = reverse_complement(bases)
            pieces.append(bases)
        add_time('get_cds', time() - began)
        if args.debug:
            print('CDS:  %s' % str(pieces), file=sys.stderr)
        return ''.join(pieces)

    def genomic_mapping_count(peptide):
        """Count the distinct genomic locations a peptide sequence maps to.

        Protein hits for ``peptide`` come from ``PEPTIDE_ACC_QUERY``; each
        hit's start and end offsets are resolved to genomic positions with
        ``GENOMIC_POS_QUERY``.  Accessions that fail to resolve are added to
        ``unmapped_accs`` and skipped on later calls.

        Returns:
            ``-1`` when the peptide has no protein hit, ``1`` when it has
            exactly one hit (the hit is not resolved in that case),
            otherwise the number of distinct
            ``chrom:pos-chrom:pos`` locations found.
        """
        ts = time()
        params = {"sequence": peptide}
        acc_locs = list(mz_cursor.execute(PEPTIDE_ACC_QUERY, params))
        te = time()
        add_time('PEPTIDE_ACC_QUERY', te - ts)
        if not acc_locs:
            return -1
        if len(acc_locs) == 1:
            return 1
        locations = set()
        for (acc, pep_start, pep_end) in acc_locs:
            if acc in unmapped_accs:
                continue
            try:
                add_time('GENOMIC_POS_QUERY_COUNT', 1)
                ts = time()
                # NOTE(review): get_mapping uses (pep_start - 1) * 3 for the
                # start offset; confirm which convention is intended here.
                p_start = pep_start * 3
                p_end = pep_end * 3
                params = {"acc": acc, "cds_offset": p_start}
                (start_chrom,
                 start_pos) = map_cursor.execute(GENOMIC_POS_QUERY,
                                                 params).fetchone()
                params = {"acc": acc, "cds_offset": p_end}
                (end_chrom,
                 end_pos) = map_cursor.execute(GENOMIC_POS_QUERY,
                                               params).fetchone()
                locations.add('%s:%s-%s:%s' %
                              (start_chrom, start_pos, end_chrom, end_pos))
                te = time()
                add_time('GENOMIC_POS_QUERY', te - ts)
            except Exception:
                # Still best effort (e.g. fetchone() returned None), but no
                # longer swallows KeyboardInterrupt / SystemExit.
                unmapped_accs.add(acc)
                if args.debug:
                    print('Unmapped: %s' % acc, file=sys.stderr)
        return len(locations)

    def spectrum_peptide_count(spectrum_id):
        """Return the first column of the ``SPECTRUM_PEPTIDES_QUERY`` row for a spectrum.

        The query is run on ``mz_cursor`` with ``spectrum_id`` bound as
        ``sr_id``; the elapsed time is recorded under
        ``'SPECTRUM_PEPTIDES_QUERY'``.
        """
        began = time()
        row = mz_cursor.execute(SPECTRUM_PEPTIDES_QUERY,
                                {"sr_id": spectrum_id}).fetchone()
        count = row[0]
        add_time('SPECTRUM_PEPTIDES_QUERY', time() - began)
        return count

    def get_exon_pileup(chrom, chromStart, chromEnd):
        """Summarise the read pileup over a genomic interval, column by column.

        Only columns whose ``reference_pos`` lies within
        ``chromStart..chromEnd`` (both ends inclusive) are kept.

        Returns:
            A list of dicts with keys ``'depth'`` (reads that are neither a
            deletion nor a reference skip at the column), ``'cov'`` (the
            column's ``nsegments``), ``'pos'`` (reference position) and
            ``'bases'`` (base -> count over the reads counted in ``depth``).
        """
        columns = []
        for column in samfile.pileup(chrom, chromStart, chromEnd):
            position = column.reference_pos
            if not chromStart <= position <= chromEnd:
                continue
            coverage = column.nsegments
            base_counts = dict()
            depth = 0
            for read in column.pileups:
                if read.is_del or read.is_refskip:
                    continue
                depth += 1
                base = read.alignment.query_sequence[read.query_position]
                base_counts[base] = base_counts.get(base, 0) + 1
            columns.append({
                'depth': depth,
                'cov': coverage,
                'pos': position,
                'bases': base_counts
            })
        return columns

    codon_map = {
        "TTT": "F",
        "TTC": "F",
        "TTA": "L",
        "TTG": "L",
        "TCT": "S",
        "TCC": "S",
        "TCA": "S",
        "TCG": "S",
        "TAT": "Y",
        "TAC": "Y",
        "TAA": "*",
        "TAG": "*",
        "TGT": "C",
        "TGC": "C",
        "TGA": "*",
        "TGG": "W",
        "CTT": "L",
        "CTC": "L",
        "CTA": "L",
        "CTG": "L",
        "CCT": "P",
        "CCC": "P",
        "CCA": "P",
        "CCG": "P",
        "CAT": "H",
        "CAC": "H",
        "CAA": "Q",
        "CAG": "Q",
        "CGT": "R",
        "CGC": "R",
        "CGA": "R",
        "CGG": "R",
        "ATT": "I",
        "ATC": "I",
        "ATA": "I",
        "ATG": "M",
        "ACT": "T",
        "ACC": "T",
        "ACA": "T",
        "ACG": "T",
        "AAT": "N",
        "AAC": "N",
        "AAA": "K",
        "AAG": "K",
        "AGT": "S",
        "AGC": "S",
        "AGA": "R",
        "AGG": "R",
        "GTT": "V",
        "GTC": "V",
        "GTA": "V",
        "GTG": "V",
        "GCT": "A",
        "GCC": "A",
        "GCA": "A",
        "GCG": "A",
        "GAT": "D",
        "GAC": "D",
        "GAA": "E",
        "GAG": "E",
        "GGT": "G",
        "GGC": "G",
        "GGA": "G",
        "GGG": "G",
    }

    aa_codon_map = dict()
    for c, a in codon_map.items():
        aa_codon_map[a] = [
            c
        ] if a not in aa_codon_map else aa_codon_map[a] + [c]

    aa_na_map = dict()  # m[aa]{bo : {b1 : [b3]
    for c, a in codon_map.items():
        if a not in aa_na_map:
            aa_na_map[a] = dict()
        d = aa_na_map[a]
        for i in range(3):
            b = c[i]
            if i < 2:
                if b not in d:
                    d[b] = dict() if i < 1 else set()
                d = d[b]
            else:
                d.add(b)

    def get_pep_codon(pileup, idx, aa, ao):
        """Pick a codon for amino acid ``aa`` that agrees with the read pileup.

        For each of the three codon positions a list of candidate bases is
        built: positions below ``ao`` accept any base that occurs there in a
        codon of ``aa``; the other positions use the bases observed in
        ``pileup[idx + i]['bases']``, most frequent first (ties broken by
        the base letter, descending).  The first combination that encodes
        ``aa`` according to ``aa_na_map`` is returned.

        Args:
            pileup: list of column dicts that carry a ``'bases'`` count map
                (presumably the output of ``get_exon_pileup``).
            idx: index into ``pileup`` corresponding to codon position 0.
            aa: single-letter amino acid to encode.
            ao: number of leading codon positions not constrained by reads.

        Returns:
            The three-letter codon, or ``None`` when no combination fits.

        Raises:
            Exception: any lookup failure is logged to stderr and re-raised.
        """
        ts = time()
        try:
            bases = []
            for i in range(3):
                if i < ao:
                    bases.append(list(set([c[i] for c in aa_codon_map[aa]])))
                else:
                    # items() plus an index-based key works on Python 2 and
                    # 3; tuple-parameter lambdas were removed by PEP 3113.
                    # Dict keys are unique, so the (count, base) keys never
                    # tie and reverse=True equals reversed(sorted(...)).
                    observed = sorted(pileup[idx + i]['bases'].items(),
                                      key=lambda kv: (kv[1], kv[0]),
                                      reverse=True)
                    bases.append([b for b, cnt in observed])
                if args.debug:
                    print('%s' % bases, file=sys.stderr)
            for b0 in bases[0]:
                if b0 not in aa_na_map[aa]:
                    continue
                for b1 in bases[1]:
                    if b1 not in aa_na_map[aa][b0]:
                        continue
                    for b2 in bases[2]:
                        if b2 in aa_na_map[aa][b0][b1]:
                            return '%s%s%s' % (b0, b1, b2)
        except Exception:
            print("get_pep_codon: %s %s %s %s" % (aa, ao, idx, pileup),
                  file=sys.stderr)
            # Bare raise keeps the original traceback.
            raise
        finally:
            # Record the timing on every exit path, including the early
            # return of a codon, which previously went unrecorded.
            add_time('pep_codon', time() - ts)
        return None

    def write_probed(chrom,
                     chromStart,
                     chromEnd,
                     strand,
                     blockCount,
                     blockSizes,
                     blockStarts,
                     spectrum,
                     protacc,
                     peptide,
                     uniqueness,
                     genomeReference,
                     score=1000,
                     psmScore='.',
                     fdr='.',
                     mods='.',
                     charge='.',
                     expMassToCharge='.',
                     calcMassToCharge='.',
                     psmRank='.',
                     datasetID='.',
                     uri='.'):
        """Write one 25-column, tab-separated record to the ``probed`` handle.

        ``chromStart``/``chromEnd`` are written twice (the second pair fills
        the thick start/end columns), the ninth column is the literal
        ``'0'``, and ``blockSizes``/``blockStarts`` are written as
        comma-joined lists.  ``spectrum`` fills the name column.
        """
        sizes_field = ','.join([str(v) for v in blockSizes])
        starts_field = ','.join([str(v) for v in blockStarts])
        record = (
            chrom, chromStart, chromEnd, spectrum, score, strand,
            chromStart, chromEnd, '0', blockCount,
            sizes_field, starts_field,
            protacc, peptide, uniqueness, genomeReference,
            psmScore, fdr, mods, charge, expMassToCharge, calcMassToCharge,
            psmRank, datasetID, uri)
        line_format = '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'
        probed.write(line_format % record)

    def get_genomic_location(exons):
        """Derive BED-style span and block fields from a list of mapped exons.

        Each exon is indexed as ``[acc, chrom, start, end, strand, ...]``;
        ``start``/``end`` may come in either order.

        Returns:
            ``(chrom, chromStart, chromEnd, strand, blockCount, blockSizes,
            blockStarts)`` where chrom and strand are taken from the first
            exon, the span covers every exon coordinate, and block starts
            are relative to ``chromStart``.

        Raises:
            IndexError: if ``exons`` is empty.
        """
        leading = exons[0]
        chrom, strand = leading[1], leading[4]
        # Normalise every exon to (low, high) regardless of orientation.
        spans = [(min(ex[2], ex[3]), max(ex[2], ex[3])) for ex in exons]
        chromStart = min(low for low, high in spans)
        chromEnd = max(high for low, high in spans)
        blockSizes = [high - low for low, high in spans]
        blockStarts = [low - chromStart for low, high in spans]
        return (chrom, chromStart, chromEnd, strand, len(exons), blockSizes,
                blockStarts)

    def get_psm_modifications(peptide_ref):
        """Return the peptide's modifications as a ``;``-joined string.

        Rows of ``PEP_MODS_QUERY`` are unpacked as
        ``(location, residue, name, modType, unimod)``.  Each row becomes
        ``<location>-<unimod>`` or, when ``unimod`` is falsy,
        ``<location>-<name><residue>``.  No rows yields ``''``.
        """
        began = time()
        rows = list(mz_cursor.execute(PEP_MODS_QUERY,
                                      {"peptide_ref": peptide_ref}))
        labels = []
        for (location, residue, name, modType, unimod) in rows:
            label = unimod if unimod else '%s%s' % (name, residue)
            labels.append('%s-%s' % (location, label))
        add_time('PEP_MODS_QUERY', time() - began)
        return ';'.join(labels)

    """
    QNAME
    FLAG
    RNAME
    POS
    CIGAR
    SEQ
    'NH' : 'i', #number of genomic locations to which the peptide sequence maps
    'XO' : 'Z', #uniqueness of the peptide mapping
    'XL' : 'i', #number of peptides to which the spectrum maps
    'XP' : 'Z', #peptide sequence
    'YP' : 'Z', #Protein accession ID from the original search result
    'XF' : 'Z', #Reading frame of the peptide (0, 1, 2)
    'XI' : 'f', #Peptide intensity
    'XB' : 'Z', #massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;).
    'XR' : 'Z', #reference peptide sequence
    'YB' : 'Z', #Preceding amino acids (2 AA, B stands for before).
    'YA' : 'Z', #Following amino acids (2 AA, A stands for after).
    'XS' : 'f', #PSM score
    'XQ' : 'f', #PSM FDR (i.e. q-value or 1-PEP).
    'XC' : 'i', #peptide charge
    'XA' : 'i', #Whether the peptide is annotated 0:yes; 1:parially unknown; 2:totally unknown;
    'XM' : 'Z', #Modifications
    'XN' : 'i', #Number of missed cleavages in the peptide (XP)
    'XT' : 'i', #Enzyme specificity
    'XE' : 'i', #Enzyme used in the experiment
    'XG' : 'A', #Peptide type
    'XU' : 'Z', #URI
    """
    psm_cursor = get_connection(args.mzsqlite).cursor()
    ts = time()
    psms = psm_cursor.execute(PSM_QUERY)
    te = time()
    add_time('PSM_QUERY', te - ts)
    proBAM = ProBAM(species=None,
                    assembly=args.genomeReference,
                    seqlens=seqlens,
                    comments=[])
    proBED = ProBED(species=None, assembly=args.genomeReference, comments=[])
    for i, psm in enumerate(psms):
        probam_dict = PROBAM_DEFAULTS.copy()
        (acc, pep_start, pep_end, aa_pre, aa_post, peptide, spectrum_id,
         spectrum_title, rank, charge, calcmass, exprmass, pepref) = psm
        scan_name = spectrum_title if spectrum_title else spectrum_id
        if args.debug:
            print('\nPSM: %d\t%s' % (i, '\t'.join([
                str(v) for v in (acc, pep_start, pep_end, peptide, spectrum_id,
                                 scan_name, rank, charge, calcmass, exprmass)
            ])),
                  file=sys.stderr)
        exons = get_mapping(acc, pep_start, pep_end)
        if args.debug:
            print('%s' % exons, file=sys.stderr)
        if not exons:
            continue
        mods = get_psm_modifications(pepref)
        (chrom, chromStart, chromEnd, strand, blockCount, blockSizes,
         blockStarts) = get_genomic_location(exons)
        ref_cds = get_cds(exons)
        if args.debug:
            print('%s' % ref_cds, file=sys.stderr)
        ref_prot = translate(ref_cds)
        if args.debug:
            print('%s' % ref_prot, file=sys.stderr)
            print('%s' % peptide, file=sys.stderr)
        spectrum_peptides = spectrum_peptide_count(spectrum_id)
        peptide_locations = genomic_mapping_count(peptide)
        if args.debug:
            print('spectrum_peptide_count: %d\tpeptide_location_count: %d' %
                  (spectrum_peptides, peptide_locations),
                  file=sys.stderr)
        uniqueness = 'unique' if peptide_locations == 1 else 'not-unique[unknown]'
        ts = time()
        proBEDEntry = ProBEDEntry(chrom,
                                  chromStart,
                                  chromEnd,
                                  '%s_%s' % (acc, scan_name),
                                  1000,
                                  strand,
                                  blockCount,
                                  blockSizes,
                                  blockStarts,
                                  acc,
                                  peptide,
                                  uniqueness,
                                  args.genomeReference,
                                  charge=charge,
                                  expMassToCharge=exprmass,
                                  calcMassToCharge=calcmass,
                                  mods=mods if mods else '.',
                                  psmRank=rank)
        proBED.add_entry(proBEDEntry)
        te = time()
        add_time('add_probed', te - ts)
        if len(ref_prot) != len(peptide):
            continue
        ts = time()
        probam_dict['NH'] = peptide_locations
        probam_dict['XO'] = uniqueness
        probam_dict['XL'] = peptide_locations
        probam_dict['XP'] = peptide
        probam_dict['YP'] = acc
        probam_dict['XC'] = charge
        probam_dict['XB'] = '%f;%f;%f' % (exprmass - calcmass, exprmass,
                                          calcmass)
        probam_dict['XR'] = ref_prot  # ? dbSequence
        probam_dict['YA'] = aa_post
        probam_dict['YB'] = aa_pre
        probam_dict['XM'] = mods if mods else '*'
        flag = 16 if strand == '-' else 0
        if str(rank) != str(1) and rank != '*' and rank != [] and rank != "":
            flag += 256
        probam_dict['XF'] = ','.join([str(e[2] % 3) for e in exons])
        ## check for variation from ref_cds
        pep_cds = get_variant_cds(exons, ref_prot, peptide, ref_cds)
        peptide_type = '*'
        ## XG classify peptide
        probam_dict['XG'] = get_peptide_type(exons)
        ## probam_dict['MD'] = peptide

        ## FIX  SAM sequence is forward strand
        seq = pep_cds if strand == '+' else reverse_complement(pep_cds)
        ## cigar based on plus strand
        cigar = ''
        if strand == '+':
            blkStarts = blockStarts
            blkSizes = blockSizes
        else:
            blkStarts = [x for x in reversed(blockStarts)]
            blkSizes = [x for x in reversed(blockSizes)]
        for j in range(blockCount):
            if j > 0:
                intron = blkStarts[j] - (blkStarts[j - 1] + blkSizes[j - 1])
                if intron > 0:
                    cigar += '%dN' % intron
            cigar += '%dM' % blkSizes[j]
        ## Mods TODO
        proBAMEntry = ProBAMEntry(qname=scan_name,
                                  flag=flag,
                                  rname=chrom,
                                  pos=chromStart + 1,
                                  cigar=cigar,
                                  seq=seq,
                                  optional=probam_dict)
        proBAM.add_entry(proBAMEntry)
        te = time()
        add_time('add_probam', te - ts)

        if args.debug:
            print('%s' % probam_dict, file=sys.stderr)

        if args.limit and i >= args.limit:
            break
    if args.probed:
        ts = time()
        with open(args.probed, 'w') as fh:
            proBED.write(fh)
        te = time()
        add_time('write_probed', te - ts)
    if args.prosam or args.probam:
        samfile = args.prosam if args.prosam else 'temp.sam'
        ts = time()
        with open(samfile, 'w') as fh:
            proBAM.write(fh)
        te = time()
        add_time('write_prosam', te - ts)
        if args.probam:
            ts = time()
            bamfile = args.prosam.replace('.sam', '.bam')
            pysam.view(samfile, '-b', '-o', args.probam, catch_stdout=False)
            te = time()
            add_time('write_probam', te - ts)
            pysam.index(args.probam)

    print('\n%s\n' % str(timings), file=sys.stderr)