Beispiel #1
0
def Readanno(filename, annoglb, genome):
    """Load an annotation index and list the chromosomes of a known genome.

    Args:
        filename: Not used by this function; kept so existing callers that
            pass it positionally keep working.
        annoglb: Annotation index handed to ``glload``; returned unchanged.
        genome: Genome assembly name. Only ``'mm10'`` (chr1-chr19) and
            ``'hg38'`` (chr1-chr22) are supported.

    Returns:
        A tuple ``(allelement, chr_list, annoglb, glannot)`` where
        ``allelement`` is the set of values under ``glannot['annot']``,
        ``chr_list`` is the chromosome names for ``genome`` (autosomes plus
        chrX, chrY and chrM) and ``glannot`` is the object returned by
        ``glload``.

    Raises:
        ValueError: If ``genome`` is not one of the supported assemblies.
    """
    # Number of autosomes per supported assembly.
    autosome_count = {'mm10': 19, 'hg38': 22}

    # Fail early with a clear message: previously an unsupported genome left
    # ``chr_list`` unassigned and surfaced as an UnboundLocalError only after
    # the annotation index had already been loaded.
    if genome not in autosome_count:
        raise ValueError('Unsupported genome %r, expected one of: %s' %
                         (genome, ', '.join(sorted(autosome_count))))

    chr_list = ['chr' + str(i)
                for i in range(1, autosome_count[genome] + 1)
                ] + ['chrX', 'chrY', 'chrM']

    glannot = glload(annoglb)
    allelement = set(glannot['annot'])
    return (allelement, chr_list, annoglb, glannot)
Beispiel #2
0
def Readanno(filename, annoglb, genome):
    """Load an annotation index and collect the chromosomes it contains.

    Args:
        filename: Not used by this function.
        annoglb: Annotation index handed to ``glload``; returned unchanged.
        genome: Not used by this function (see the note in the body).

    Returns:
        A tuple ``(allelement, chr_list, annoglb, glannot)`` where
        ``allelement`` is the set of values under ``glannot['annot']``,
        ``chr_list`` holds the distinct ``'chr'`` values found under
        ``glannot['loc']`` (in no particular order) and ``glannot`` is the
        object returned by ``glload``.
    """
    glannot = glload(annoglb)
    allelement = set(glannot['annot'])

    # The chromosome list is taken from the annotation itself instead of a
    # fixed per-genome table (mm10: chr1-19, hg38: chr1-22, plus X/Y/M), so
    # annotations with custom chromosomes work as well.
    chromosomes = {location['chr'] for location in glannot['loc']}
    chr_list = list(chromosomes)

    return (allelement, chr_list, annoglb, glannot)
Beispiel #3
0
def getanno(filename, genefile, tefile, genome, mode):
    """Resolve the annotation index to use and load its element names.

    When both ``genefile`` and ``tefile`` are ``'default'`` a prebuilt index
    named ``<genome>.<mode>.glb`` is expected in the current working
    directory. Otherwise both files must exist and an index is built from
    them with ``annoGtf``.

    Args:
        filename: Passed through to ``annoGtf`` when a custom index is built.
        genefile: Gene annotation file, or ``'default'``.
        tefile: TE annotation file, or ``'default'``.
        genome: Genome assembly name, ``'mm10'`` or ``'hg38'``.
        mode: ``'exclusive'`` or ``'inclusive'`` when the default index is
            used; passed through to ``annoGtf`` unchecked otherwise.

    Returns:
        A tuple ``(allelement, chr_list, all_annot)`` where ``allelement`` is
        the set of values under the loaded index's ``'annot'`` key,
        ``chr_list`` is the chromosome names for ``genome`` (autosomes plus
        chrX, chrY and chrM) and ``all_annot`` is the index that was loaded
        (the default index file name, or the value returned by ``annoGtf``).

    Raises:
        SystemExit: With exit code 1, after logging an error, if the genome
            or mode is unsupported or a required file is missing.
    """
    # Number of autosomes per supported assembly.
    autosome_count = {'mm10': 19, 'hg38': 22}

    # Previously an unsupported genome (or mode, below) fell through every
    # branch and surfaced as an UnboundLocalError at the return statement.
    if genome not in autosome_count:
        logging.error("Unsupported genome: %s (expected one of: %s)\n", genome,
                      ', '.join(sorted(autosome_count)))
        sys.exit(1)

    chr_list = ['chr' + str(i)
                for i in range(1, autosome_count[genome] + 1)
                ] + ['chrX', 'chrY', 'chrM']

    if genefile == 'default' and tefile == 'default':
        if mode not in ('exclusive', 'inclusive'):
            logging.error(
                "Unsupported mode: %s (expected 'exclusive' or 'inclusive')\n",
                mode)
            sys.exit(1)

        all_annot = '%s.%s.glb' % (genome, mode)
        if not os.path.exists(all_annot):
            logging.error(
                "Did not find the annotation index %s, you can download it from scTE github (www....) or either give the annotation with -te and -gene option \n",
                all_annot)
            sys.exit(1)
    else:
        if not os.path.isfile(tefile):
            logging.error("No such file: %s !\n", tefile)
            sys.exit(1)

        if not os.path.isfile(genefile):
            logging.error("No such file: %s !\n", genefile)
            sys.exit(1)

        all_annot = annoGtf(filename,
                            genefile=genefile,
                            tefile=tefile,
                            mode=mode)

    allelement = set(glload(all_annot)['annot'])
    return (allelement, chr_list, all_annot)
Beispiel #4
0
def align(chr, filename, all_annot, glannot, whitelist, CB):
    '''
    **Purpose**
        For each read, align it to the index and assign a TE, gene.

    This is the speed critical part.

    **Arguments**
        chr
            Chromosome name without the 'chr' prefix (the prefix is added
            here to build the file names).

        filename
            Sample prefix. Reads are taken from
            '<filename>_scTEtmp/o2/<filename>.chr<chr>.bed.gz' and the counts
            are written to '<filename>_scTEtmp/o3/<filename>.chr<chr>.bed.gz'.

        all_annot
            Annotation index passed to glload(); only used when glannot is
            empty/None.

        glannot
            An already loaded annotation index, or a false value to load it
            here from all_annot.

        whitelist
            Container of barcodes to keep; reads with any other barcode are
            skipped.

        CB
            Not used by this function.

    **Returns**
        None. Writes one tab-separated line 'barcode, annotation, count' per
        (barcode, annotation) pair, sorted by barcode and then annotation.
    '''
    chr = 'chr' + chr

    # Create the output directory without going through a shell, so sample
    # names containing spaces or shell metacharacters are handled correctly.
    os.makedirs('%s_scTEtmp/o3' % filename, exist_ok=True)

    if not glannot:  # Load separately for the multicore pipeline, share the index for the single core pipeline
        glannot = glload(all_annot)

    # Only keep the glbase parts we need.
    buckets = glannot.buckets[chr.replace('chr', '')]
    annotations = glannot.linearData

    res = {}
    in_path = '%s_scTEtmp/o2/%s.%s.bed.gz' % (filename, filename, chr)
    with gzip.open(in_path, 'rt') as reads:
        for line in reads:
            t = line.strip().split('\t')
            barcode = t[3]
            if barcode not in whitelist:
                continue
            if barcode not in res:
                res[barcode] = defaultdict(int)

            # The chromosome column is not needed: each input file already
            # holds the reads of a single chromosome.
            left = int(t[1])
            rite = int(t[2])

            # Annotations are indexed in buckets of 10000 bp; collect the ids
            # stored in every bucket the read touches.
            left_buck = ((left - 1) // 10000) * 10000
            right_buck = (rite // 10000) * 10000

            loc_ids = set()
            for buck in range(left_buck, right_buck + 10000, 10000):
                if buck in buckets:
                    loc_ids.update(buckets[buck])

            # A bucket hit is only a candidate: keep the annotations whose
            # interval really overlaps the read.
            counts = res[barcode]
            for index in loc_ids:
                item = annotations[index]
                span = item['loc'].loc
                if rite >= span['left'] and left <= span['right']:
                    counts[item['annot']] += 1

    out_path = '%s_scTEtmp/o3/%s.%s.bed.gz' % (filename, filename, chr)
    with gzip.open(out_path, 'wt') as out:
        for bc in sorted(res):
            for gene in sorted(res[bc]):
                out.write('%s\t%s\t%s\n' % (bc, gene, res[bc][gene]))