def parse_sequences(sites, size, fasta_file):
    """Attach the extended binding-site sequence to every region in ``sites``.

    For each region the interval ``ext_start``..``ext_end`` on ``chr`` is read
    from ``fasta_file``, upper-cased, reverse-complemented when the site lies
    on the minus strand, and stored in the region under ``"ext_seq"``.

    Args:
        sites: iterable of region dicts with the keys ``chr``, ``ext_start``,
            ``ext_end`` and ``strand``; the dicts are modified in place.
        size: not used by this function; kept so existing callers still work.
        fasta_file: path of the genome FASTA file.

    Returns:
        The ``sites`` object that was passed in. (The function used to return
        ``regions``, a name that is not defined inside it.)
    """
    from pyfasta import Fasta  # Fasta package is needed to fetch sequences from genome fasta file

    print("INFO: Begin to fetch sequences....")

    # Records are keyed by the first whitespace-delimited token of the header.
    f = Fasta(fasta_file, key_fn=lambda key: key.split()[0])

    for reg in sites:
        on_minus_strand = reg["strand"] == '-'

        start = reg["ext_start"]
        end = reg["ext_end"]

        # if motif on negative strand, shift region by +1 to account for zero based half-open intervals
        if on_minus_strand:
            start += 1
            end += 1

        # Note, the 'strand' argument of f.sequence is deliberately not used
        # (the original author reported it as not working), so the reverse
        # complement is computed explicitly below.
        seq = f.sequence({"chr": reg["chr"], "start": start, "stop": end},
                         one_based=False).upper()

        if on_minus_strand:
            seq = reverse_complement(seq)

        # add sequence to region dict
        reg["ext_seq"] = seq

    print("INFO: Finished sequences.")
    return sites
def getSequence(genome):
    """Fetch the sequence of every peak in ``peak.csv`` and write ``RAD_seq.csv``.

    ``genome`` is handed to ``Fasta``. Every row of the peak table contributes
    one single-element list holding the sequence of its ``chrom``/``start``/
    ``end`` interval; the ``seq`` column is then recomputed row-wise with
    ``fuc`` before the table is saved.
    """
    fasta = Fasta(genome)
    peaks = pd.read_csv('../data/input_data/peak.csv')

    sequences = []
    for row in range(len(peaks)):
        region = {
            'chr': peaks['chrom'][row],
            'start': peaks['start'][row],
            'stop': peaks['end'][row],
        }
        # Each entry is wrapped in a one-element list.
        sequences.append([fasta.sequence(region)])

    peaks['seq'] = sequences
    peaks['seq'] = peaks.apply(fuc, axis=1)
    peaks.to_csv('../data/input_data/RAD_seq.csv', index=False)
    print('getSequence is over,RAD_seq.csv is bulit!')
Ejemplo n.º 3
0
class Reference(object):
    """Reference genome backed by a pyfasta ``Fasta`` object."""

    def __init__(self, genome_fasta):
        # @see: https://pypi.python.org/pypi/pyfasta
        # Records are keyed by the first value before whitespace in the header.
        self.fasta = Fasta(genome_fasta, key_fn=lambda name: name.split()[0])

    def get_sequence_from_iv(self, iv):
        """Return the sequence of interval ``iv``.

        ``iv`` must expose ``chrom``, ``start``, ``end`` and ``strand``; the
        coordinates are passed on with ``one_based=False``.
        """
        region = dict(chr=iv.chrom, start=iv.start, stop=iv.end,
                      strand=iv.strand)
        return self.fasta.sequence(region, one_based=False)
Ejemplo n.º 4
0
class GenomeSeq(object):
    """Genomic sequence access through a pyfasta ``Fasta`` object."""

    def __init__(self, filename):
        self.filename = filename
        self.fh = Fasta(filename, key_fn=get_chrom)

    def get_seq(self, chrom, start=0, end=False, strand="+"):
        """Return the sequence of ``chrom`` between ``start`` and ``end``.

        ``end=False`` (the default) means "up to the length of the record";
        any other value, including ``0``, is used as given. Coordinates are
        passed on with ``one_based=False``.
        """
        stop = len(self.fh[chrom]) if end is False else end
        region = {"chr": chrom, "start": start, "stop": stop, "strand": strand}
        return self.fh.sequence(region, one_based=False)
def extract_only_ref_variant_fasta():
    """Write ``<sample>_variants.fa`` with one base per listed position.

    Positions are read from ``Only_ref_variant_positions_for_closely`` in
    ``args.filter2_only_snp_vcf_dir``. For each position the ALT column of the
    sample's core VCF is looked up with a ``zcat | grep | awk`` pipeline; if
    nothing is found the base is taken from the first record of
    ``args.reference``. When ALT lists several alleles the first one is used.

    Reads the module-level ``args``; returns nothing.
    """
    f = Fasta(args.reference)

    positions_path = ("%s/Only_ref_variant_positions_for_closely" %
                      args.filter2_only_snp_vcf_dir)
    # Read all positions up front and close the handle right away.
    with open(positions_path) as ffp:
        positions = [line.strip() for line in ffp]

    core_vcf_file = args.filter2_only_snp_vcf_filename.replace(
        '_filter2_final.vcf_no_proximate_snp.vcf',
        '_filter2_final.vcf_core.vcf.gz')
    sample_name = os.path.basename(
        core_vcf_file.replace('_filter2_final.vcf_core.vcf.gz', ''))

    bases = []
    for position in positions:
        # NOTE(review): one shell pipeline per position; file name and
        # position are interpolated unquoted, so both must be trusted input.
        grep_position = "zcat %s | grep -v \'#\' | awk -F\'\\t\' \'{ if ($2 == %s) print $0 }\' | awk -F\'\\t\' \'{print $5}\'" % (
            core_vcf_file, position)
        proc = subprocess.Popen([grep_position],
                                stdout=subprocess.PIPE,
                                shell=True)
        out = proc.communicate()[0].strip()
        if not out:
            # No call in the VCF: fall back to the reference base.
            bases.append(str(
                f.sequence({
                    'chr': str(f.keys()[0]),
                    'start': int(position),
                    'stop': int(position)
                })))
        elif "," in out:
            first_alt = out.split(',')[0]
            bases.append(first_alt)
            print("HET SNP found: Position:%s; Taking the First SNP:%s" % (
                position, first_alt))
        else:
            bases.append(out)

    # Strip any whitespace that came along with the extracted bases.
    fasta_string = re.sub(r'\s+', '', ''.join(bases))
    final_fasta_string = ">%s\n" % sample_name + fasta_string

    out_path = "%s/%s_variants.fa" % (args.filter2_only_snp_vcf_dir,
                                      sample_name)
    with open(out_path, 'w+') as fp:
        fp.write(final_fasta_string + '\n')
Ejemplo n.º 6
0
def main(gff_file, outdir,
         fasta_file="/home/gturco/src/find_cns_gturco/pipeline/data/arabidopsis.fasta"):
    """Export the features of parents without any CDS, and their sequences.

    Lines of ``gff_file`` are grouped by the ``parent=`` value found in their
    last column. Groups that contain a CDS line are dropped. The remaining
    lines (except ``ChrC``/``ChrM`` and ``exon`` lines, de-duplicated on
    chromosome/start/end) are written to ``<outdir>/at_non_cds.gff`` with the
    chromosome upper-cased and stripped of ``CHR`` and the last column
    replaced by the parent name. That file is re-read with ``read_gff`` and
    the sequence of every feature is written to ``<outdir>/at_rnas.fasta``.

    Args:
        gff_file: path of the input GFF file.
        outdir: directory receiving both output files.
        fasta_file: genome FASTA to extract sequences from; defaults to the
            path that used to be hard-coded.
    """
    name = re.compile("parent=([^.;]+)", re.I)

    cds_parents = set()
    non_cds_feats = collections.defaultdict(list)
    with open(gff_file) as gff_in:
        for line in gff_in:
            line = line.split("\t")
            match = re.search(name, line[-1])
            if not match:
                continue
            fname = match.group(1)
            non_cds_feats[fname].append(line)
            if line[2].upper() == "CDS":
                cds_parents.add(fname)

    # Any parent with at least one CDS line is not of interest.
    for fname in cds_parents:
        del non_cds_feats[fname]

    seen = set()
    with open(outdir + "/at_non_cds.gff", "w") as rna_out:
        for k, feat_list in sorted(non_cds_feats.items()):
            for feat in feat_list:
                if feat[0] in ("ChrC", "ChrM") or feat[2] == "exon":
                    continue
                # De-duplicate on the original chromosome name and coordinates.
                key = (feat[0], feat[3], feat[4])
                if key in seen:
                    continue
                seen.add(key)
                feat[0] = feat[0].upper().replace("CHR", "")
                feat[-1] = k
                rna_out.write("\t".join(feat) + "\n")

    gff = read_gff(outdir + "/at_non_cds.gff")
    fasta = Fasta(fasta_file)
    with open(outdir + "/at_rnas.fasta", "w") as fa_out:
        for _seqid, feature_list in gff.iteritems():
            for _fname, feature in feature_list.iteritems():
                seq = fasta.sequence(feature)
                # Header keeps the historical "> name" form (space included).
                fa_out.write("> " + str(feature["name"]) + "\n")
                fa_out.write(str(seq) + "\n")
Ejemplo n.º 7
0
def main(gff_file, outdir,
         fasta_file='/home/gturco/src/find_cns_gturco/pipeline/data/arabidopsis.fasta'):
    """Write non-CDS features of ``gff_file`` and their sequences to ``outdir``.

    Every line whose last column carries a ``parent=`` attribute is grouped
    under that parent. Parents owning at least one CDS line are discarded.
    What remains is written to ``<outdir>/at_non_cds.gff`` (skipping
    ``ChrC``/``ChrM``, skipping ``exon`` lines, and keeping only the first line
    per chromosome/start/end). The sequences of the features read back from
    that file via ``read_gff`` go to ``<outdir>/at_rnas.fasta``.

    Args:
        gff_file: path of the input GFF file.
        outdir: directory receiving both output files.
        fasta_file: genome FASTA used for sequence extraction; the default is
            the previously hard-coded location.
    """
    name = re.compile("parent=([^.;]+)", re.I)

    parents_with_cds = set()
    non_cds_feats = collections.defaultdict(list)
    with open(gff_file) as handle:
        for raw in handle:
            fields = raw.split("\t")
            match = re.search(name, fields[-1])
            if not match:
                continue
            parent = match.group(1)
            non_cds_feats[parent].append(fields)
            if fields[2].upper() == 'CDS':
                parents_with_cds.add(parent)

    # Drop every group that has coding sequence.
    for parent in parents_with_cds:
        del non_cds_feats[parent]

    written = set()
    with open(outdir + '/at_non_cds.gff', 'w') as rna_out:
        for parent, feat_list in sorted(non_cds_feats.items()):
            for feat in feat_list:
                if feat[0] in ('ChrC', 'ChrM'):
                    continue
                if feat[2] == 'exon':
                    continue
                # The key uses the chromosome name as it appears in the input.
                key = (feat[0], feat[3], feat[4])
                if key in written:
                    continue
                written.add(key)
                feat[0] = feat[0].upper().replace('CHR', '')
                feat[-1] = parent
                rna_out.write("\t".join(feat) + "\n")

    gff = read_gff(outdir + '/at_non_cds.gff')
    fasta = Fasta(fasta_file)
    with open(outdir + '/at_rnas.fasta', 'w') as fa_out:
        for _seqid, feature_list in gff.iteritems():
            for _fname, feature in feature_list.iteritems():
                seq = fasta.sequence(feature)
                # "> name" (with the space) is the format emitted so far.
                fa_out.write("> " + str(feature['name']) + "\n")
                fa_out.write(str(seq) + "\n")
Ejemplo n.º 8
0
def run(args):
    """Extract flanked BED intervals from a genome as FASTA records.

    Reads ``args.bedfile`` (an open file; blank lines are ignored) and, for
    every row, fetches ``start - args.flank`` .. ``stop + args.flank`` on the
    row's chromosome, using column 4 as the strand. Records are named
    ``<args.seqname>_<n>`` with ``n`` counting from 1.

    The records are written to ``args.outfile`` when it is set, otherwise
    printed. Both outputs now separate records with a newline; previously the
    printed output joined them with an empty string, gluing each header to the
    preceding sequence.
    """
    genome = Fasta(args.genome)
    bed_list = [line.split() for line in args.bedfile.readlines()
                if line.strip()]

    records = []
    for number, row in enumerate(bed_list, 1):
        seq = genome.sequence({
            'chr': row[0],
            'start': int(row[1]) - args.flank,
            'stop': int(row[2]) + args.flank,
            'strand': row[3]
        })
        # NOTE: the whole record, header included, is upper-cased (unchanged).
        records.append('>{0}_{1}\n{2}'.format(args.seqname, number, seq).upper())

    output = '\n'.join(records)
    if args.outfile:
        args.outfile.write(output)
    else:
        print(output)
Ejemplo n.º 9
0
def intron(fa, ann):
    """Write the concatenated intron sequence of every gene to ``intron.fa``.

    CDS lines of the annotation file ``ann`` are grouped by gene, where the
    gene name is the ``Parent=`` attribute truncated at the first ``.`` and
    then at the first ``_``. For each gene, the gaps between consecutive CDS
    entries (in file order) are fetched from the genome ``fa`` and joined into
    one record named ``<gene>-intron``.

    Lines that are blank, have fewer than seven columns, or carry no
    ``Parent=`` attribute are skipped. Previously a CDS line without
    ``Parent=`` silently reused the attribute index of an earlier line.
    """
    f = Fasta(fa)
    mdict = {}  # gene -> [(cds_start, cds_stop), ...] in file order
    ndict = {}  # gene -> [chromosome, strand] of the gene's first CDS line

    with open(ann, 'r') as fh, open('intron.fa', 'w') as out1:
        for line in fh:
            if line.startswith('#'):
                continue
            new = line.strip().split('\t')
            if len(new) < 7 or new[2] != 'CDS':
                continue

            parent = None
            for attribute in new[-1].split(';'):
                if 'Parent=' in attribute:
                    parent = attribute  # the last match wins, as before
            if parent is None:
                continue

            t = parent.split('.')[0].replace('Parent=', '')
            gene = t.split('_')[0]
            if gene not in mdict:
                mdict[gene] = []
                ndict[gene] = [new[0], new[6]]
            mdict[gene].append((int(new[3]), int(new[4])))

        for gene in sorted(mdict):
            chrom, strand = ndict[gene]
            cds = mdict[gene]
            pieces = []
            # An intron spans from just after one CDS to just before the next.
            for (_, prev_stop), (next_start, _) in zip(cds, cds[1:]):
                pieces.append(f.sequence({
                    'chr': chrom,
                    'start': prev_stop + 1,
                    'stop': next_start - 1,
                    'strand': strand
                }))
            out1.write('>{0}-intron'.format(gene) + '\n')
            out1.write(''.join(pieces) + '\n')
def extract_reference_allele():
    """Write the reference base of every position in ``pos`` to the file ``REF``.

    The bases are read from the first record of ``args.reference``. The output
    starts with a ``Ref`` header line followed by one base per line. Reads the
    module-level ``args`` and ``pos``.

    ``ref_id`` is now always bound; it used to be assigned only when the
    reference held exactly one record, so a multi-record reference failed at
    the following print.
    """
    print("Extracting Reference Allele from Reference Fasta file - %s to REF\n" % args.reference)
    # Get reference genome ID from reference fasta file
    get_reference = Fasta(args.reference)
    ref_id = get_reference.keys()
    print("The reference genome ID from reference genome - %s" % ref_id)

    # Only the first record is queried; look its name up once.
    chrom = str(ref_id[0]) if ref_id else None

    with open("REF", 'w+') as fileObj:
        fileObj.write('Ref' + '\n')
        for item in pos:
            ref_allele = str(
                get_reference.sequence({
                    'chr': chrom,
                    'start': int(item),
                    'stop': int(item)
                }))
            fileObj.write(ref_allele + '\n')
Ejemplo n.º 11
0
def parse_sequences(sites, size, fasta_file):
    """Add the extended binding-site sequence to each region of ``sites``.

    The interval ``ext_start``..``ext_end`` of every region is fetched from
    ``fasta_file`` on chromosome ``chr``, converted to upper case and, for
    minus-strand regions, reverse-complemented. The result is stored in the
    region dict under ``"ext_seq"``.

    Args:
        sites: iterable of region dicts (keys ``chr``, ``ext_start``,
            ``ext_end``, ``strand``); modified in place.
        size: unused; retained for interface compatibility.
        fasta_file: path of the genome FASTA file.

    Returns:
        ``sites``, with ``"ext_seq"`` filled in. The previous
        ``return regions`` referred to a name the function never defined.
    """
    from pyfasta import Fasta  # Fasta package is needed to fetch sequences from genome fasta file

    print("INFO: Begin to fetch sequences....")

    # Key each FASTA record by the first token of its header line.
    f = Fasta(fasta_file, key_fn=lambda key: key.split()[0])

    for reg in sites:
        # if motif on negativ strand, shift region by +1 to account for zero based half-open intervals
        shift = 1 if reg["strand"] == '-' else 0
        interval = {
            "chr": reg["chr"],
            "start": reg["ext_start"] + shift,
            "stop": reg["ext_end"] + shift,
        }

        # Note, 'strand' is not passed to f.sequence on purpose (reported as
        # not working by the original author); see the explicit reverse
        # complement below.
        seq = f.sequence(interval, one_based=False).upper()

        # if motif on negative strand, convert seq to reverse complement
        if shift:
            seq = reverse_complement(seq)

        # add sequence to region dict
        reg["ext_seq"] = seq

    print("INFO: Finished sequences.")
    return sites
Ejemplo n.º 12
0
def main(gff_file, fasta_file, parents, children):
    """Print one FASTA record per parent feature, built from its children.

    ``parents`` and ``children`` are comma-separated feature types. For every
    feature of a parent type, the sequences of its direct children of the
    requested types are sorted by start, reversed for minus-strand parents,
    concatenated and printed under ``>feature_id``. Parents without matching
    children produce a warning on stderr. The GFF database ``<gff_file>.db``
    is created when it does not exist yet.
    """
    db_file = gff_file + ".db"
    if not op.exists(db_file):
        GFFutils.create_gffdb(gff_file, db_file)

    f = Fasta(fasta_file)
    g = GFFutils.GFFDB(db_file)

    parent_types = set(parents.split(','))
    parent_feats = itertools.chain(
        *[g.features_of_type(ftype) for ftype in parent_types])
    child_types = set(children.split(','))

    for feat in parent_feats:
        pieces = [
            (f.sequence(dict(chr=c.chrom, start=c.start, stop=c.stop,
                             strand=c.strand)), c)
            for c in g.children(feat.id, 1)
            if c.featuretype in child_types
        ]

        if not pieces:
            sys.stderr.write("[warning] %s has no children with type %s\n"
                             % (feat.id, ','.join(child_types)))
            continue

        # Ascending genomic order, flipped for minus-strand parents.
        pieces.sort(key=lambda piece: piece[1].start)
        if feat.strand == '-':
            pieces.reverse()

        print(">%s" % feat.id)
        print(''.join(seq for seq, _ in pieces))
Ejemplo n.º 13
0
def main(gff_file, fasta_file, parents, children):
    """Assemble and print the sequence of each parent feature from its children.

    ``parents`` / ``children`` are comma-separated lists of feature types.
    Children of level 1 whose type is requested are fetched strand-aware from
    ``fasta_file``, ordered by start coordinate (reverse order when the parent
    is on the minus strand) and joined into one record per parent. A parent
    with no matching child is reported on stderr and skipped. ``<gff_file>.db``
    is built first if it is missing.
    """
    db_file = gff_file + ".db"
    db_missing = not op.exists(db_file)
    if db_missing:
        GFFutils.create_gffdb(gff_file, db_file)

    f = Fasta(fasta_file)
    g = GFFutils.GFFDB(db_file)

    wanted_parents = set(parents.split(','))
    per_type = [g.features_of_type(kind) for kind in wanted_parents]
    all_parents = itertools.chain.from_iterable(per_type)
    wanted_children = set(children.split(','))

    for feat in all_parents:
        collected = []
        for c in g.children(feat.id, 1):
            if c.featuretype in wanted_children:
                region = dict(chr=c.chrom, start=c.start, stop=c.stop,
                              strand=c.strand)
                collected.append((f.sequence(region), c))

        if len(collected) == 0:
            message = "[warning] %s has no children with type %s" \
                % (feat.id, ','.join(wanted_children))
            sys.stderr.write(message + "\n")
            continue

        # sort children in incremental position, then flip for '-' parents
        collected.sort(key=lambda pair: pair[1].start)
        if feat.strand == '-':
            collected.reverse()
        feat_seq = ''.join(pair[0] for pair in collected)

        print(">%s" % feat.id)
        print(feat_seq)
Ejemplo n.º 14
0
# version 1.1: this version is implemented with pyfasta.
import sys, os
from pyfasta import Fasta

# Exactly two command-line arguments are expected: the input FASTA file and
# the output file. Anything else prints the usage line and exits.
if len(sys.argv) != 3:
    print('Usage: *.py inputFile outputFile')
    sys.exit(0)
inputFile, outputFile = sys.argv[1:]


def writeFile(text, files):
    """Append ``text`` to the file at path ``files``."""
    handle = open(files, 'a')
    try:
        handle.write(text)
    finally:
        handle.close()


# Write the minus-strand (reverse-complement) version of every record of the
# input FASTA to the output file, appending to whatever the file already holds.
if os.path.isfile(inputFile):
    f = Fasta(inputFile)
    # Open the output once instead of re-opening it for every write.
    with open(outputFile, 'a') as out:
        for key in f.keys():
            out.write(">" + key + os.linesep)
            # With one_based=False the interval is half-open, so the stop must
            # be the record length; `len(f[key]) - 1` dropped the last base.
            content = f.sequence(
                {
                    'chr': key,
                    'start': 0,
                    'stop': len(f[key]),
                    'strand': '-'
                },
                one_based=False)
            out.write(content + os.linesep)
else:
    # Message reads: "What you entered is not a file".
    print('您输入的不是一个文件')
Ejemplo n.º 15
0
def downstream(fa, ann, kb1, kb2):
    """Extract the 3' UTR and downstream sequence of every gene.

    CDS lines of the annotation file ``ann`` are grouped by gene (the
    ``Parent=`` attribute truncated at the first ``.`` and then at the first
    ``_``). For each gene two consecutive windows are cut from the genome
    ``fa`` beyond the end of its coding region: after the largest CDS
    coordinate for ``+`` genes, before the smallest one for ``-`` genes.

    Args:
        fa: genome assembly FASTA file.
        ann: annotation file (tab-separated, GFF-like).
        kb1: defined length of the 3' UTR window (converted with ``int``).
        kb2: defined length of the downstream window (converted with ``int``).

    Writes ``3-UTR.fa`` (records ``<gene>-3UTR``) and ``downstream.fa``
    (records ``<gene>-downstream``) in the current directory. Genes whose
    strand is neither ``+`` nor ``-`` are skipped.

    Lines that are blank, have fewer than seven columns, or carry no
    ``Parent=`` attribute are skipped. Previously a CDS line without
    ``Parent=`` silently reused the attribute index of an earlier line.

    NOTE(review): the windows are not clamped to the chromosome boundaries;
    for minus-strand genes close to position 1 the start can become
    non-positive.
    """
    f = Fasta(fa)
    mdict = {}  # gene -> all CDS start and end coordinates
    ndict = {}  # gene -> (chromosome, strand) of the gene's first CDS line

    with open(ann, 'r') as fh, \
            open('3-UTR.fa', 'w') as out1, \
            open('downstream.fa', 'w') as out2:
        for line in fh:
            # this is the demo line that we want to filter out
            # chr7    GLEAN   Gene    25420153        25421713        0.953889        -       .       Name=Pgl_GLEAN_10006696;
            if line.startswith('#'):
                continue
            new = line.strip().split('\t')
            if len(new) < 7 or new[2] != 'CDS':
                continue

            parent = None
            for attribute in new[-1].split(';'):
                if 'Parent=' in attribute:
                    parent = attribute  # the last match wins, as before
            if parent is None:
                continue

            t = parent.split('.')[0].replace('Parent=', '')
            gene = t.split('_')[0]
            if gene not in mdict:
                mdict[gene] = []
                ndict[gene] = (new[0], new[6])
            mdict[gene].append(int(new[3]))
            mdict[gene].append(int(new[4]))

        for gene in sorted(mdict):
            chrom, strand = ndict[gene]
            if strand == '+':
                stop = max(mdict[gene])
                start1 = stop + 1
                stop1 = stop + (int(kb1) + 1)
                start2 = stop1 + 1
                stop2 = stop1 + (int(kb2) + 1)
            elif strand == '-':
                start = min(mdict[gene])
                start1 = start - (int(kb1) + 1)
                stop1 = start - 1
                start2 = start1 - (int(kb2) + 1)
                stop2 = start1 - 1
            else:
                continue

            k1 = f.sequence({
                'chr': chrom,
                'start': start1,
                'stop': stop1,
                'strand': strand
            })
            out1.write('>{0}-3UTR'.format(gene) + '\n')
            out1.write(k1 + '\n')

            k2 = f.sequence({
                'chr': chrom,
                'start': start2,
                'stop': stop2,
                'strand': strand
            })
            out2.write('>{0}-downstream'.format(gene) + '\n')
            out2.write(k2 + '\n')
Ejemplo n.º 16
0
    # Fragment of a larger function whose header is not visible here.
    # When `out` is falsy, the base at position `j` is read from the first
    # record of `reference` and written to `f1`; otherwise the VCF file is
    # grepped for `j` and the REF/ALT columns are picked out of the match.
    if not out:
        f = Fasta(reference)
        if len(f.keys()) == 1:
            ref_id = str(f.keys())

        fasta_string = ""
        # NOTE(review): extract_base is built but not executed in the lines
        # shown; only the commented-out code below referred to it.
        extract_base = "tr -d \'\\n\' < %s | cut -b%s" % (reference, j)
        #print extract_base
        # proc = subprocess.Popen([extract_base], stdout=subprocess.PIPE, shell=True)
        # (out, err) = proc.communicate()
        # out = out.strip()
        # fasta_string = fasta_string + out
        # if not out:
        #     print "Error extracting reference allele"
        #out = str(f.sequence({'chr': str(f.keys()[0]), 'start': int(lines), 'stop': int(lines)}))
        # Single-position lookup: start and stop are both `j`.
        fasta_string = fasta_string + str(f.sequence({'chr': str(f.keys()[0]), 'start': int(j), 'stop': int(j)}))

        # Remove all whitespace from the extracted base.
        pattern = re.compile(r'\s+')
        fasta_string = re.sub(pattern, '', fasta_string)

        # The extracted base is written twice on one line.
        # NOTE(review): presumably one copy stands for REF and one for ALT - confirm.
        st = fasta_string + fasta_string + "\n"
        f1.write(st)
    else:
        # Search the VCF for a line containing `j` surrounded by whitespace.
        cmd2 = "grep -P \'\s+" + j + "\s+\' " + args.filter2_only_snp_vcf_file
        #cmd2 =  "grep -v \'^#\' %s | awk -F\'\t\' \'{print $2}\' | grep -w \'%s\'" % (final_file, j)
        proc = subprocess.Popen([cmd2], stdout=subprocess.PIPE, shell=True)
        (out2, err2) = proc.communicate()
        # Tab-split the grep output; fields 3 and 4 are taken as REF and ALT.
        line_string_array = out2.split('\t')
        print line_string_array
        ref_allele = line_string_array[3]
        alt_allel = line_string_array[4]
def extract_only_ref_variant_fasta_alternate():
    """Write ``<sample>_variants.fa`` with one base per listed position.

    Positions come from ``Only_ref_variant_positions_for_closely`` in
    ``args.filter2_only_snp_vcf_dir``. When ``args.functional_filter`` is
    ``"yes"``, positions listed in ``Functional_class_filter_positions.txt``
    are left out. For every remaining position the base is the first element
    of the ALT value recorded in the sample's core VCF, or - when the VCF has
    no record there - the base of the first record of ``args.reference``.

    Reads the module-level ``args``; prints the number of positions, the
    number of bases collected and the final sequence length.
    """
    # Reference genome used for positions without a VCF record.
    f = Fasta(args.reference)

    excluded = set()
    if args.functional_filter == "yes":
        functional_class_filter_positions = args.filter2_only_snp_vcf_dir + "/Functional_class_filter_positions.txt"
        with open(functional_class_filter_positions, 'rU') as f_functional:
            # A set keeps the membership test below constant-time.
            excluded = set(line_func.strip() for line_func in f_functional)

    positions_path = ("%s/Only_ref_variant_positions_for_closely" %
                      args.filter2_only_snp_vcf_dir)
    with open(positions_path, 'r') as ffp:
        only_ref_variant = [line.strip() for line in ffp]
    if excluded:
        only_ref_variant = [p for p in only_ref_variant if p not in excluded]
    print(len(only_ref_variant))

    core_vcf_file = args.filter2_only_snp_vcf_filename.replace(
        '_filter2_final.vcf_no_proximate_snp.vcf',
        '_filter2_final.vcf_core.vcf.gz')
    sample_name = os.path.basename(
        core_vcf_file.replace('_filter2_final.vcf_core.vcf.gz', ''))

    # POS -> ALT. With several ALT alleles only the first one is stored, so
    # the [0] lookup below yields its first character; with a single ALT the
    # list is stored and [0] yields that allele.
    core_vcf_pos_base = {}
    for variants in VCF("%s" % core_vcf_file):
        if len(variants.ALT) > 1:
            core_vcf_pos_base[variants.POS] = variants.ALT[0]
        else:
            core_vcf_pos_base[variants.POS] = variants.ALT

    bases = []
    for position in only_ref_variant:
        position = int(position.strip())
        if position in core_vcf_pos_base:
            bases.append(str(core_vcf_pos_base[position][0]))
        else:
            bases.append(str(
                f.sequence({
                    'chr': str(f.keys()[0]),
                    'start': position,
                    'stop': position
                })))
    count = len(bases)

    # Strip any whitespace that came along with the extracted bases.
    fasta_string = re.sub(r'\s+', '', ''.join(bases))
    final_fasta_string = ">%s\n" % sample_name + fasta_string

    out_path = "%s/%s_variants.fa" % (args.filter2_only_snp_vcf_dir,
                                      sample_name)
    with open(out_path, 'w+') as fp:
        fp.write(final_fasta_string + '\n')

    print("Count: %s " % count)
    print("Length: %s " % len(fasta_string))
Ejemplo n.º 18
0
class MutateFasta(object):
    """Build mutated or annotated sequences from a reference FASTA."""

    def __init__(self, fasta):
        # Records are keyed by the first whitespace-delimited header token.
        self.fasta = Fasta(fasta, key_fn=lambda key: key.split()[0])

    def generate_seq(self, records, offset=None):
        """Return the reference sequence with the variants in ``records`` applied.

        Args:
            records: sequence of dicts with the keys ``chrom``, ``pos``
                (1-based), ``ref`` and ``alt``. Records on another chromosome,
                with an empty ``ref``/``alt``, or whose first ``ref`` base
                does not match the reference are ignored.
                NOTE(review): records are applied in the given order, so they
                are presumably expected sorted by position - confirm.
            offset: optional ``(chrom, start, stop)``; the result then covers
                positions ``start + 1`` .. ``stop`` (1-based, inclusive).
                Without it the whole chromosome of the first record is used.

        Returns:
            The mutated sequence, or ``None`` when both ``records`` and
            ``offset`` are empty.

        Deletions now resume after the deleted bases
        (``r['pos'] + len(deleted)``). The former ``prev_pos += len(deleted)``
        advanced from the previous variant instead and emitted the deleted
        bases again.
        """
        if not records and not offset:
            return

        chrom = offset[0] if offset else records[0]['chrom']
        prev_pos = offset[1] if offset else 0
        last_pos = offset[2] if offset else len(self.fasta[chrom])

        parts = []
        for r in records:
            if r['chrom'] != chrom:
                continue
            if not (r['ref'] and r['alt']):
                continue
            # The record must agree with the reference at its position.
            if r['ref'][0] != self.slice_fasta(r['chrom'], r['pos'], r['pos']):
                continue

            # NOTE: _classify_mut returns None for equal-length multi-base
            # substitutions, which makes this unpacking raise TypeError
            # (unchanged behaviour).
            mut_type, sub_seq = self._classify_mut(r['ref'], r['alt'])

            if mut_type == 'snv':
                parts.append(self.slice_fasta(chrom, prev_pos + 1, r['pos'] - 1))
                parts.append(sub_seq)
                prev_pos = r['pos']

            elif mut_type == 'del':
                # Keep everything up to the anchor base, then skip the
                # deleted bases that follow it.
                parts.append(self.slice_fasta(chrom, prev_pos + 1, r['pos']))
                prev_pos = r['pos'] + len(sub_seq)

            elif mut_type == 'ins':
                parts.append(self.slice_fasta(chrom, prev_pos + 1, r['pos']))
                parts.append(sub_seq)
                prev_pos = r['pos']

        # Remainder after the last applied variant.
        if prev_pos + 1 <= last_pos:
            parts.append(self.slice_fasta(chrom, prev_pos + 1, last_pos))

        return ''.join(parts)

    def generate_contexted_seq(self, r):
        """Split a transcript into labelled ``[sequence, context]`` pieces.

        ``r`` is a refFlat-like record with ``chrom``, ``txStart``, ``txEnd``,
        ``cdsStart``, ``cdsEnd``, ``exonCount``, ``exonStarts`` and
        ``exonEnds``. The pieces are returned in genomic order and labelled
        ``'utr'``, ``'exon'`` or ``'intron'``.

        The code assumes the CDS starts in the first exon and ends in the
        last one. For single-exon transcripts the exon piece now ends at
        ``cdsEnd``; it used to run to the exon end, so the 3' UTR was emitted
        twice.
        """
        # TODO: support - strand genes. (currently only supports + strand genes...)

        # NOTE: refFlat is stored in 0-based coordinate

        chrom = r['chrom']
        exon_count = r['exonCount']
        starts = r['exonStarts']
        ends = r['exonEnds']
        cons = []

        # 5'UTR + 1st Exon
        first_exon_end = ends[0] if exon_count > 1 else r['cdsEnd']
        cons.append(
            [self.slice_fasta(chrom, r['txStart'] + 1, r['cdsStart']), 'utr'])
        cons.append(
            [self.slice_fasta(chrom, r['cdsStart'] + 1, first_exon_end), 'exon'])

        if exon_count > 1:
            cons.append(
                [self.slice_fasta(chrom, ends[0] + 1, starts[1]), 'intron'])

            # Inner exons, each followed by the intron after it.
            for i in range(1, exon_count - 1):
                cons.append(
                    [self.slice_fasta(chrom, starts[i] + 1, ends[i]), 'exon'])
                cons.append(
                    [self.slice_fasta(chrom, ends[i] + 1, starts[i + 1]),
                     'intron'])

            # last Exon (up to the CDS end)
            cons.append([
                self.slice_fasta(chrom, starts[exon_count - 1] + 1,
                                 r['cdsEnd']), 'exon'
            ])

        # 3'UTR
        cons.append(
            [self.slice_fasta(chrom, r['cdsEnd'] + 1, r['txEnd']), 'utr'])

        return cons

    def slice_fasta(self, chrom, start, stop):
        """Return bases ``start``..``stop`` of ``chrom`` (1-based coordinates)."""
        return self.fasta.sequence(
            {
                'chr': str(chrom),
                'start': int(start),
                'stop': int(stop)
            },
            one_based=True)

    def _classify_mut(self, ref, alt):
        """Classify a variant and return ``(type, affected_bases)``.

        >>> _classify_mut('A','G')
        ('snv', 'G')
        >>> _classify_mut('G','GAA')
        ('ins', 'AA')
        >>> _classify_mut('TTA','T')
        ('del', 'TA')

        Insertions and deletions must share their first base (asserted).
        Equal-length alleles longer than one base match no case and yield
        ``None``.
        """

        if len(ref) == len(alt) == 1:
            return 'snv', alt
        elif len(ref) < len(alt):
            assert ref[0] == alt[0], '{0} {1}'.format(ref, alt)
            return 'ins', alt[1:]
        elif len(ref) > len(alt):
            assert ref[0] == alt[0], '{0} {1}'.format(ref, alt)
            return 'del', ref[1:]
Ejemplo n.º 19
0
def extract_only_ref_variant_fasta_unique_positions():
    """Build per-sample allele FASTA/VCF files from the SNP allele matrix.

    Reads ``SNP_matrix_allele_new.csv`` (tab-separated) from
    ``args.filter2_only_snp_vcf_dir``. The first column holds the position
    labels, every further column one sample. For each sample column whose
    name, cut to the length of the VCF base name, is contained in that base
    name, the function:

    * writes ``<sample>_allele_variants.fa`` with the sample's alleles (first
      character only for calls containing ``/``);
    * writes ``<sample>_ref_allele_variants.vcf`` with one record per position
      (REF read from the first record of ``args.reference``);
    * runs bgzip, tabix, vcf-consensus and sed on the result and appends the
      same commands to ``consensus_ref_allele_variant.sh``.

    Other columns only produce a "does not match" message.

    Changes from the previous version: alleles are paired with positions by
    row instead of ``list.index`` (quadratic, and wrong for repeated
    positions); the CSV and shell-script handles are closed; the reference ID
    is available for references with more than one record.
    """
    # Get reference genome ID (the first record is the one queried below).
    get_reference = Fasta(args.reference)
    ref_id = list(get_reference.keys())

    out_dir = args.filter2_only_snp_vcf_dir

    # Read the matrix once; `columns` is its transpose.
    with open('%s/SNP_matrix_allele_new.csv' % out_dir, 'r') as matrix_file:
        rows = list(csv.reader(matrix_file, delimiter='\t'))
    columns = list(zip(*rows))
    ncol = len(rows[0])

    # The position is the 4th space-separated token of a label, or the 3rd
    # when the label starts with "None".
    unique_position_array = []
    for label in columns[0][1:]:
        replace_string = label.split(' ')
        if replace_string[0] != "None":
            unique_position_array.append(int(replace_string[3]))
        else:
            unique_position_array.append(int(replace_string[2]))

    vcf_basename = os.path.basename(args.filter2_only_snp_vcf_filename).replace('_filter2_final.vcf_no_proximate_snp.vcf', '')

    for i in range(1, ncol):
        sample_name_re = columns[i][0][:len(vcf_basename)]

        # Containment also covers the exact-match case.
        if sample_name_re not in vcf_basename:
            print("Sample name %s does not match with column name %s" % (vcf_basename, sample_name_re))
            continue

        vcf_header = "##fileformat=VCFv4.2\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n" % sample_name_re

        # One allele per matrix row; "A/G"-style calls keep their first character.
        alleles = [ntd[0] if "/" in ntd else ntd for ntd in columns[i][1:]]

        allele_fasta_path = "%s/%s_allele_variants.fa" % (out_dir, sample_name_re)
        with open(allele_fasta_path, 'w+') as allele_variant_fasta:
            allele_variant_fasta.write(">%s\n" % sample_name_re + "".join(alleles) + "\n")

        # The previous code opened this file for writing without ever writing
        # to it; the resulting create/truncate side effect is preserved.
        open("%s/%s_ref_allele_variants.fa" % (out_dir, sample_name_re), 'w+').close()

        vcf_filename = "%s/%s_ref_allele_variants.vcf" % (out_dir, sample_name_re)
        with open(vcf_filename, 'w+') as allele_ref_variant_vcf:
            allele_ref_variant_vcf.write(vcf_header)

            # NOTE(review): the object is not used afterwards; the call is kept
            # because constructing it presumably has indexing side effects -
            # confirm whether it is still needed.
            Fasta(allele_fasta_path)

            chrom_key = str(ref_id[0])
            vcf_chrom = ref_id[0].split(' ')[0]
            for positions, allele_var in zip(unique_position_array, alleles):
                ref_allele = str(get_reference.sequence({'chr': chrom_key, 'start': int(positions), 'stop': int(positions)}))
                generate_vcf_string = "%s\t%s\t.\t%s\t%s\t221.999\t.\t.\t.\n" % (vcf_chrom, positions, ref_allele, str(allele_var))
                allele_ref_variant_vcf.write(generate_vcf_string)

        filename = "%s/consensus_ref_allele_variant.sh" % out_dir
        binbase = ConfigSectionMap("bin_path", Config)['binbase']
        tabix_bin = ConfigSectionMap("vcftools", Config)['tabix_bin']
        base_vcftools_bin = binbase + "/" + ConfigSectionMap("vcftools", Config)['vcftools_bin']

        commands = [
            "%s/%s/bgzip -f %s\n" % (binbase, tabix_bin, vcf_filename),
            "%s/%s/tabix -f -p vcf %s.gz\n" % (binbase, tabix_bin, vcf_filename),
            "cat %s | %s/vcf-consensus %s.gz > %s_ref_allele_variants.fa\n" % (args.reference, base_vcftools_bin, vcf_filename, sample_name_re),
            "sed -i 's/>.*/>%s/g' %s_ref_allele_variants.fa\n" % (sample_name_re, sample_name_re),
        ]
        # Log every command to the shell script and run it, in order.
        with open(filename, 'a+') as f1:
            for command in commands:
                f1.write(command)
                subprocess.call([command], shell=True)
Ejemplo n.º 20
0
def extract_only_ref_variant_fasta_unique_positions_with_unmapped():
    """Build consensus FASTA files for the current sample, with and without its unmapped positions.

    Inputs are taken from module-level state: ``args.reference``,
    ``args.filter2_only_snp_vcf_dir``, ``args.filter2_only_snp_vcf_filename``
    and the tool locations returned by ``ConfigSectionMap(..., Config)``.

    The tab-delimited ``SNP_matrix_allele_new.csv`` in the output directory is
    read column-wise: column 0 holds the position labels, every further column
    holds one sample's allele calls. For each column whose (truncated) header
    is contained in the sample name derived from
    ``args.filter2_only_snp_vcf_filename`` the function

    * writes ``<sample>_allele_variants.fa`` (first character of every call);
    * writes ``<sample>_ref_allele_variants.vcf`` (reference base vs. called
      allele at every matrix position), compresses and indexes it, and pipes
      the reference through ``vcf-consensus`` into
      ``<sample>_ref_allele_variants.fa``;
    * writes ``<sample>_unmapped.vcf`` with ALT ``-`` for every position listed
      in ``<sample>_unmapped.bed_positions``, merges it with the variant VCF
      via ``bcftools merge`` and builds
      ``<sample>_ref_allele_unmapped_variants.fa`` from the merged file.

    The shell commands are also recorded in ``consensus_ref_allele_variant.sh``
    (appended) and ``<sample>_consensus_ref_allele_unmapped_variant.sh``
    (overwritten). Columns that do not match only produce a message on stdout.

    Only the first sequence of the reference FASTA is used, both for the base
    look-ups and for the CHROM column of the generated VCF files.
    """
    snp_dir = args.filter2_only_snp_vcf_dir
    expected_sample = os.path.basename(args.filter2_only_snp_vcf_filename).replace(
        '_filter2_final.vcf_no_proximate_snp.vcf', '')

    # The reference ID used to be bound only for single-sequence references,
    # which ended in a NameError otherwise. The first key is now used
    # unconditionally, matching the key already used for the base look-ups.
    get_reference = Fasta(args.reference)
    ref_key = str(list(get_reference.keys())[0])
    ref_chrom = ref_key.split(' ')[0]

    def reference_base(position):
        """Return the reference base at ``position`` (start == stop look-up)."""
        return str(get_reference.sequence(
            {'chr': ref_key, 'start': int(position), 'stop': int(position)}))

    # Read the SNP matrix once and transpose it into columns.
    with open('%s/SNP_matrix_allele_new.csv' % snp_dir, 'r') as matrix_file:
        rows = list(csv.reader(matrix_file, delimiter='\t'))
    columns = list(zip(*rows))
    ncol = len(rows[0])

    # Position labels are space separated; the coordinate is the fourth field,
    # or the third one when the label starts with "None".
    unique_position_array = []
    for label in columns[0][1:]:
        fields = label.split(' ')
        unique_position_array.append(int(fields[3] if fields[0] != "None" else fields[2]))

    for i in range(1, ncol):
        # Column headers are compared on their first len(expected_sample) characters.
        sample_name_re = columns[i][0][:len(expected_sample)]
        if sample_name_re not in expected_sample:
            print("Sample name %s does not match with column name %s" % (expected_sample, sample_name_re))
            continue

        vcf_header = "##fileformat=VCFv4.2\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n" % sample_name_re
        sample_calls = columns[i][1:]

        # Only the first character of a call is kept (e.g. for calls containing "/").
        variant_allele = "".join(call[:1] for call in sample_calls)
        with open("%s/%s_allele_variants.fa" % (snp_dir, sample_name_re), 'w+') as allele_variant_fasta:
            allele_variant_fasta.write(">%s\n%s\n" % (sample_name_re, variant_allele))

        # The original opened this file in 'w+' mode without writing to it;
        # keep the create/truncate side effect but release the handle.
        open("%s/%s_ref_allele_variants.fa" % (snp_dir, sample_name_re), 'w+').close()

        # Later duplicates of a position override earlier ones, as before.
        variant_allele_array_dict = dict(zip(unique_position_array, sample_calls))

        # NOTE(review): the object returned here was never used by the original
        # either; the call is kept in case callers rely on pyfasta having opened
        # (and presumably indexed) the sample FASTA - confirm and drop if not.
        Fasta("%s/%s_allele_variants.fa" % (snp_dir, sample_name_re))

        vcf_filename = "%s/%s_ref_allele_variants.vcf" % (snp_dir, sample_name_re)
        with open(vcf_filename, 'w+') as allele_ref_variant_vcf:
            allele_ref_variant_vcf.write(vcf_header)
            for positions in unique_position_array:
                allele_var = str(variant_allele_array_dict[positions][:1])
                allele_ref_variant_vcf.write(
                    "%s\t%s\t.\t%s\t%s\t221.999\t.\t.\t.\t.\n"
                    % (ref_chrom, positions, reference_base(positions), allele_var))

        binbase = ConfigSectionMap("bin_path", Config)['binbase']
        vcftools_section = ConfigSectionMap("vcftools", Config)
        tabix_dir = "%s/%s" % (binbase, vcftools_section['tabix_bin'])
        base_vcftools_bin = binbase + "/" + vcftools_section['vcftools_bin']

        # --- consensus from the variant-only VCF -------------------------------
        # The .fa targets of vcf-consensus/sed carry no directory prefix, so they
        # are resolved against the current working directory.
        bgzip_cmd = "%s/bgzip -f %s\n" % (tabix_dir, vcf_filename)
        tabix_cmd = "%s/tabix -f -p vcf %s.gz\n" % (tabix_dir, vcf_filename)
        fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s_ref_allele_variants.fa\n" % (
            args.reference, base_vcftools_bin, vcf_filename, sample_name_re)
        sed_command = "sed -i 's/>.*/>%s/g' %s_ref_allele_variants.fa\n" % (sample_name_re, sample_name_re)
        with open("%s/consensus_ref_allele_variant.sh" % snp_dir, 'a+') as f1:
            for command in (bgzip_cmd, tabix_cmd, fasta_cmd, sed_command):
                f1.write(command)
                subprocess.call([command], shell=True)

        # --- VCF of unmapped positions ------------------------------------------
        unmapped_positions_file = "%s/%s_unmapped.bed_positions" % (snp_dir, expected_sample)
        unmapped_vcf_file = "%s/%s_unmapped.vcf" % (snp_dir, sample_name_re)
        with open(unmapped_vcf_file, 'w+') as unmapped_vcf:
            unmapped_vcf.write(vcf_header)
            with open(unmapped_positions_file, 'r') as fpp:
                for lines in fpp:
                    position = lines.strip()
                    if not position:
                        # tolerate blank lines instead of failing in int('')
                        continue
                    unmapped_vcf.write(
                        "%s\t%s\t.\t%s\t-\t221.999\t.\t.\t.\t.\n"
                        % (ref_chrom, position, reference_base(position)))

        bgzip_cmd = "%s/bgzip -f %s\n" % (tabix_dir, unmapped_vcf_file)
        print(bgzip_cmd)
        tabix_cmd = "%s/tabix -f -p vcf %s.gz\n" % (tabix_dir, unmapped_vcf_file)
        print(tabix_cmd)
        subprocess.call([bgzip_cmd], shell=True)
        subprocess.call([tabix_cmd], shell=True)

        # --- merge unmapped + variant VCFs and build the second consensus -------
        vcf_filename_unmapped = "%s/%s_ref_allele_unmapped.vcf" % (snp_dir, sample_name_re)
        bcftools_merge_cmd = "%s/%s/bcftools merge --merge snps --force-samples %s.gz %s.gz -O v -o %s" % (
            binbase, ConfigSectionMap("bcftools", Config)['bcftools_bin'],
            unmapped_vcf_file, vcf_filename, vcf_filename_unmapped)
        subprocess.call([bcftools_merge_cmd], shell=True)

        bgzip_cmd = "%s/bgzip -f %s\n" % (tabix_dir, vcf_filename_unmapped)
        tabix_cmd = "%s/tabix -f -p vcf %s.gz\n" % (tabix_dir, vcf_filename_unmapped)
        fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s_ref_allele_unmapped_variants.fa\n" % (
            args.reference, base_vcftools_bin, vcf_filename_unmapped, sample_name_re)
        sed_command = "sed -i 's/>.*/>%s/g' %s_ref_allele_unmapped_variants.fa\n" % (sample_name_re, sample_name_re)

        filename = "%s/%s_consensus_ref_allele_unmapped_variant.sh" % (snp_dir, sample_name_re)
        with open(filename, 'w+') as f1:
            f1.write(bgzip_cmd)
            f1.write(tabix_cmd)
            f1.write(fasta_cmd)
            print("print here: %s" % filename)
            subprocess.call(['pwd'], shell=True)
            subprocess.call(bgzip_cmd, shell=True)
            subprocess.call(tabix_cmd, shell=True)
            subprocess.call(fasta_cmd, shell=True)
            subprocess.call([sed_command], shell=True)
            f1.write(sed_command)
Ejemplo n.º 21
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from pyfasta import Fasta

# Small pyfasta demo: list the records of test.txt and print parts of X80413.
f = Fasta('test.txt')
print(f.keys())

# Length of the record; also used as the end of the full-length request below.
record_length = len(f['X80413'])
print(record_length)
print(f['X80413'][0:5])

# Request the whole record on the minus strand with zero-based coordinates.
# The original passed len(f[key]) here, but no name `key` was ever defined.
print(f.sequence(
    {
        'chr': 'X80413',
        'start': 0,
        'stop': record_length,
        'strand': '-'
    },
    one_based=False))
Ejemplo n.º 22
0
def remove_reads_from_precursor(inbam, outbam, gr, minRlen, readlen_cutoff):
    """Copy reads from ``inbam`` to ``outbam``, dropping reads whose genomic flank matches the adapter.

    Reads whose sequence length is at most ``readlen_cutoff`` are always
    written. A longer read that is 1-5 bases shorter than ``minRlen`` is
    dropped when the ``minRlen - readlen`` genomic bases next to it equal

    * flag 0 or 256: the leading bases of ``TGGAA`` or a run of ``A``, taken
      immediately after the aligned end of the read;
    * flag 16 or 272: the trailing bases of ``TTCCA`` or a run of ``T``, taken
      immediately before the aligned start of the read.

    Every other read with one of these four flags is written. Reads with any
    other flag value are neither inspected nor written.

    Args:
        inbam: path of the BAM file to read.
        outbam: path of the BAM file to create (header copied from ``inbam``).
        gr: path of the genome FASTA opened with ``Fasta``.
        minRlen: read length the comparison window is sized against.
        readlen_cutoff: reads up to this length bypass the check.
    """
    # The original comments cite TGGAATTCTCGGGTGCCAAGG; only its first five
    # bases (and their reverse complement) are ever compared.
    forward_adapter = "TGGAA"
    reverse_adapter = "TTCCA"
    max_missing = len(forward_adapter)

    inbamPysamObj = pysam.Samfile(inbam, "rb")
    try:
        outbamPysamObj = pysam.Samfile(outbam, "wb", template=inbamPysamObj)
        try:
            gf = Fasta(gr)
            for read in inbamPysamObj:
                readchr = inbamPysamObj.getrname(read.rname)
                readstart = int(read.pos) + 1
                readend = read.aend
                flag = read.flag
                # actual read length (41M means readlen == 41)
                readlen = len(read.seq)

                if readlen <= readlen_cutoff:
                    outbamPysamObj.write(read)
                    continue

                missing = minRlen - readlen
                if flag == 0 or flag == 256:  # read maps to forward strand
                    bpwindow = gf.sequence({
                        'chr': readchr,
                        'start': readend + 1,
                        'stop': readend + missing
                    })
                    adapter_part = forward_adapter[:missing]
                    homopolymer = "A" * missing
                elif flag == 16 or flag == 272:  # read maps to reverse strand
                    bpwindow = gf.sequence({
                        'chr': readchr,
                        'start': readstart - missing,
                        'stop': readstart - 1
                    })
                    adapter_part = reverse_adapter[-missing:]
                    homopolymer = "T" * missing
                else:
                    # NOTE(review): flags are compared by exact value, so reads
                    # carrying any other bit are silently left out of the
                    # output - confirm this is intended.
                    continue

                # The adapter comparison only applies when 1-5 bases are missing.
                if 1 <= missing <= max_missing and bpwindow in (adapter_part, homopolymer):
                    continue
                outbamPysamObj.write(read)
        finally:
            # Always finalise the output so a failure does not leave it unclosed.
            outbamPysamObj.close()
    finally:
        # The input handle was previously never closed.
        inbamPysamObj.close()
Ejemplo n.º 23
0
	end_index = content.find("-----", start_index)
	required_each_content = content[start_index: end_index]
	each_motif_lines = required_each_content.strip().split("\n")
	
	for motif_line in each_motif_lines:		
		splitted = motif_line.split()
		chrom1 = splitted[0].split(":")[0]
		sequence1 = splitted[5]
		strand1 = splitted[1]
		
		if strand1 == "+":			
			#f.sequence({"chr": "chr18", "start" : (3603155 + 22 -1), "stop" : (3603155 + 22 -1) + len_3, "strand" : "+"}, one_based = False).upper()
			# u'TCAGGTCACCAGATAAAG'
			start1 = (int(splitted[0].split(":")[1]) + int(splitted[2])) - 1
			end1 = start1 + len(sequence1)
			seq_pyfasta = f.sequence({"chr": chrom1, "start" : start1, "stop" : end1, "strand" : strand1}, one_based = False).upper()
			
		if strand1 == "-":
			#f.sequence({"chr": "chr2", "start" : (112383865 + 59 -1), "stop" : (112383865 + 59 -1) + len_3, "strand" : "-"}, one_based = False).upper()
			# u'AATCTTTGTCAGATAATC'	
			start1 = (int(splitted[0].split(":")[1]) + int(splitted[2])) -1 
			end1 = start1 + len(sequence1)
			seq_pyfasta = f.sequence({"chr": chrom1, "start" : start1, "stop" : end1, "strand" : strand1}, one_based = False).upper()
			#convert unicode string to python string
			#str(seq_pyfasta)
			#seq_pyfasta.encode("ascii", "replace")
			#seq_pyfasta.encode("ascii", "ignore")

		req = [chrom1, start1, end1, strand1, sequence1, str(seq_pyfasta)]

		for i,item in enumerate(req):
Ejemplo n.º 24
0
class MutateFasta(object):
    def __init__(self, fasta):
        """Open ``fasta`` with pyfasta, keying records by the first whitespace-separated header token."""
        def first_token(header):
            return header.split()[0]

        self.fasta = Fasta(fasta, key_fn=first_token)

    def generate_seq(self, records, offset=None):
        if not records and not offset: return

        seq = ''
        chrom = offset[0] if offset else records[0]['chrom']
        prev_pos = offset[1] if offset else 0
        last_pos = offset[2] if offset else len(self.fasta[chrom])

        for r in records:
            ref = self.slice_fasta(r['chrom'], r['pos'], r['pos'])

            if not r['chrom'] == chrom: continue
            if not (r['ref'] and r['alt']): continue
            if not r['ref'][0] == ref: continue

            mut_type, sub_seq = self._classify_mut(r['ref'], r['alt'])

            if mut_type == 'snv':
                seq += self.slice_fasta(chrom, prev_pos + 1, r['pos'] - 1)
                seq += sub_seq
                prev_pos = r['pos']

            elif mut_type == 'del':
                seq += self.slice_fasta(chrom, prev_pos + 1, r['pos'])
                prev_pos += len(sub_seq)

            elif mut_type == 'ins':
                seq += self.slice_fasta(chrom, prev_pos + 1, r['pos'])
                seq += sub_seq
                prev_pos = r['pos']

        # Reminder
        if prev_pos + 1 <= last_pos:
            seq += self.slice_fasta(chrom, prev_pos + 1, last_pos)

        return seq

    def generate_contexted_seq(self, r):
        cons = []
        chrom = r['chrom']

        # TODO: support - strand genes. (currently only supports + strand genes...)

        # NOTE: refFlat is stored in 0-based coordinate

        # 5'UTR + 1st Exon
        cons.append([self.slice_fasta(chrom, r['txStart'] + 1, r['cdsStart']), 'utr'])
        cons.append([self.slice_fasta(chrom, r['cdsStart'] + 1, r['exonEnds'][0]), 'exon'])

        if r['exonCount'] > 1:
            cons.append([self.slice_fasta(chrom, r['exonEnds'][0] + 1, r['exonStarts'][1]), 'intron'])

            # Exons
            for i,con in enumerate(r['exonStarts']):
                if i == 0 or i+1 == r['exonCount']: continue

                cons.append([self.slice_fasta(chrom, r['exonStarts'][i] + 1, r['exonEnds'][i]), 'exon'])
                cons.append([self.slice_fasta(chrom, r['exonEnds'][i] + 1, r['exonStarts'][i+1]), 'intron'])

            # last Exon + 3'UTR
            cons.append([self.slice_fasta(chrom, r['exonStarts'][r['exonCount']-1] + 1, r['cdsEnd']), 'exon'])

        cons.append([self.slice_fasta(chrom, r['cdsEnd'] + 1, r['txEnd']), 'utr'])

        return cons

    def slice_fasta(self, chrom, start, stop):
        return self.fasta.sequence({'chr': str(chrom), 'start': int(start), 'stop': int(stop)}, one_based=True)

    def _classify_mut(self, ref, alt):
        """
        >>> _classify_mut('A','G')
        ('snv', 'G')
        >>> _classify_mut('G','GAA')
        ('ins', 'AA')
        >>> _classify_mut('TTA','T')
        ('del', 'TA')
        """

        if len(ref) == len(alt) == 1:
            return 'snv', alt
        elif len(ref) < len(alt):
            assert ref[0] == alt[0], '{0} {1}'.format(ref, alt)
            return 'ins', alt[1:]
        elif len(ref) > len(alt):
            assert ref[0] == alt[0], '{0} {1}'.format(ref, alt)
            return 'del', ref[1:]