def parse_sequences(sites, size, fasta_file):
    """Attach the extended binding-site sequence to every region in ``sites``.

    For each region the interval ``ext_start``..``ext_end`` on ``chr`` is
    fetched from the genome FASTA, upper-cased and, for regions on the
    negative strand, reverse-complemented. The result is stored under the
    ``"ext_seq"`` key of the region dict, i.e. regions are modified in place.

    Args:
        sites: iterable of region dicts providing the keys ``"chr"``,
            ``"ext_start"``, ``"ext_end"`` and ``"strand"``.
        size: not used by this function; kept so existing callers still work.
        fasta_file: path of the genome FASTA file handed to pyfasta.

    Returns:
        The ``sites`` object that was passed in, now carrying ``"ext_seq"``.
    """
    # Imported lazily: pyfasta is only needed to fetch genome sequences.
    from pyfasta import Fasta

    print("INFO: Begin to fetch sequences....")
    # Key records by the first whitespace-separated token of the header.
    f = Fasta(fasta_file, key_fn=lambda key: key.split()[0])

    for reg in sites:
        start = reg["ext_start"]
        end = reg["ext_end"]
        on_minus_strand = reg["strand"] == '-'

        # Motifs on the negative strand are shifted by +1 to account for
        # zero-based half-open intervals.
        if on_minus_strand:
            start += 1
            end += 1

        # The 'strand' key is deliberately not passed to f.sequence: the
        # original author noted it did not work, so the reverse complement
        # is computed explicitly below.
        seq = f.sequence(
            {"chr": reg["chr"], "start": start, "stop": end},
            one_based=False)
        seq = seq.upper()
        if on_minus_strand:
            seq = reverse_complement(seq)

        reg["ext_seq"] = seq

    print("INFO: Finished sequences.")
    # The previous code returned the undefined name `regions` (NameError).
    return sites
def getSequence(genome):
    """Fetch the genomic sequence of every peak and write ``RAD_seq.csv``.

    Reads ``../data/input_data/peak.csv``, looks up each row's
    ``chrom``/``start``/``end`` interval in the FASTA file ``genome``, stores
    the result in a ``seq`` column, post-processes the table row by row with
    ``fuc`` and saves it to ``../data/input_data/RAD_seq.csv``.
    """
    fasta = Fasta(genome)
    RAD_seq = pd.read_csv('../data/input_data/peak.csv')

    # Every entry is a one-element list wrapping the fetched sequence.
    fetched = []
    for row in range(len(RAD_seq)):
        region = {
            'chr': RAD_seq['chrom'][row],
            'start': RAD_seq['start'][row],
            'stop': RAD_seq['end'][row],
        }
        fetched.append([fasta.sequence(region)])

    RAD_seq['seq'] = fetched
    RAD_seq['seq'] = RAD_seq.apply(fuc, axis=1)
    RAD_seq.to_csv('../data/input_data/RAD_seq.csv', index=False)
    print('getSequence is over,RAD_seq.csv is bulit!')
class Reference(object):
    """Thin wrapper around a pyfasta ``Fasta`` genome for interval look-ups."""

    def __init__(self, genome_fasta):
        """Open ``genome_fasta``, keying records by their first header token."""
        # @see: https://pypi.python.org/pypi/pyfasta
        def first_token(key):
            # Use the first value before whitespace as the record key.
            return key.split()[0]

        self.fasta = Fasta(genome_fasta, key_fn=first_token)

    def get_sequence_from_iv(self, iv):
        """Return the sequence covered by interval ``iv``.

        ``iv`` must expose ``chrom``, ``start``, ``end`` and ``strand``; the
        look-up is issued with ``one_based=False``.
        """
        feature_hash = dict(
            chr=iv.chrom, start=iv.start, stop=iv.end, strand=iv.strand)
        return self.fasta.sequence(feature_hash, one_based=False)
class GenomeSeq(object):
    """Genomic sequence access backed by a pyfasta ``Fasta`` file."""

    def __init__(self, filename):
        """Remember ``filename`` and open it, keying records via ``get_chrom``."""
        self.filename = filename
        self.fh = Fasta(filename, key_fn=get_chrom)

    def get_seq(self, chrom, start=0, end=False, strand="+"):
        """Return the sequence of ``chrom`` between ``start`` and ``end``.

        When ``end`` is left at its default (exactly ``False``) the length of
        the chromosome record is used as the stop coordinate. The look-up is
        issued with ``one_based=False``.
        """
        stop = len(self.fh[chrom]) if end is False else end
        feature = {"chr": chrom, "start": start, "stop": stop, "strand": strand}
        return self.fh.sequence(feature, one_based=False)
def extract_only_ref_variant_fasta():
    """Write ``<sample>_variants.fa`` with one base per listed position.

    Positions are read from ``Only_ref_variant_positions_for_closely`` in
    ``args.filter2_only_snp_vcf_dir``. For each position the ALT column of
    the sample's core VCF is looked up through a ``zcat | grep | awk`` shell
    pipeline; if the VCF has no entry, the base is taken from the reference
    FASTA (``args.reference``) instead. When the ALT column lists several
    comma-separated alleles, only the first one is used.

    Reads the module-level ``args``; returns nothing.
    """
    f = Fasta(args.reference)

    positions_path = ("%s/Only_ref_variant_positions_for_closely"
                      % args.filter2_only_snp_vcf_dir)
    # `with` closes the handle; it used to be leaked by open().readlines().
    with open(positions_path) as positions_file:
        positions = [line.strip() for line in positions_file]

    core_vcf_file = args.filter2_only_snp_vcf_filename.replace(
        '_filter2_final.vcf_no_proximate_snp.vcf',
        '_filter2_final.vcf_core.vcf.gz')
    sample_name = os.path.basename(
        core_vcf_file.replace('_filter2_final.vcf_core.vcf.gz', ''))

    # Collected in a list and joined once instead of repeated string `+`.
    bases = []
    for position in positions:
        if not position:
            # A blank line would produce a broken awk program and then fail
            # on int('') in the reference fallback.
            continue
        grep_position = (
            "zcat %s | grep -v '#' | awk -F'\\t' '{ if ($2 == %s) print $0 }'"
            " | awk -F'\\t' '{print $5}'" % (core_vcf_file, position))
        proc = subprocess.Popen(
            [grep_position], stdout=subprocess.PIPE, shell=True)
        out = proc.communicate()[0].strip()
        if out:
            if "," in out:
                first_alt = out.split(',')[0]
                bases.append(first_alt)
                print("HET SNP found: Position:%s; Taking the First SNP:%s"
                      % (position, first_alt))
            else:
                bases.append(out)
        else:
            # Not present in the VCF: use the reference base.
            bases.append(str(f.sequence({
                'chr': str(f.keys()[0]),
                'start': int(position),
                'stop': int(position),
            })))

    fasta_string = re.sub(r'\s+', '', "".join(bases))
    final_fasta_string = ">%s\n" % sample_name + fasta_string

    out_path = "%s/%s_variants.fa" % (args.filter2_only_snp_vcf_dir,
                                      sample_name)
    with open(out_path, 'w+') as fp:
        fp.write(final_fasta_string + '\n')
def main(gff_file, outdir,
         fasta_path="/home/gturco/src/find_cns_gturco/pipeline/data/arabidopsis.fasta"):
    """Extract features of CDS-less parents and write their sequences.

    Lines of ``gff_file`` are grouped by the ``parent=`` value of their last
    column. Every parent that owns at least one CDS line is discarded. For
    the remaining parents all lines except ``exon`` rows and rows on
    ``ChrC``/``ChrM`` are written once per (seqid, start, end) to
    ``<outdir>/at_non_cds.gff`` with the ``CHR`` prefix stripped and the last
    column replaced by the parent name. That file is then re-read with
    ``read_gff`` and each feature's sequence is written to
    ``<outdir>/at_rnas.fasta``.

    Args:
        gff_file: path of the input GFF file.
        outdir: directory receiving ``at_non_cds.gff`` and ``at_rnas.fasta``.
        fasta_path: genome FASTA used for sequence extraction. Defaults to
            the path that was previously hard-coded in the function body.
    """
    name = re.compile("parent=([^.;]+)", re.I)
    feats = {}  # parent name -> True if it has a CDS line, else None
    non_cds_feats = collections.defaultdict(list)

    with open(gff_file) as gff_handle:
        for line in gff_handle:
            line = line.split("\t")
            match = name.search(line[-1])
            if not match:
                continue
            fname = match.group(1)
            non_cds_feats[fname].append(line)
            if line[2].upper() == "CDS":
                feats[fname] = True
            else:
                # Never downgrade a parent already known to have a CDS.
                feats.setdefault(fname, None)

    for fname, has_cds in feats.items():
        if has_cds is not None:
            del non_cds_feats[fname]

    seen = set()
    with open(outdir + "/at_non_cds.gff", "w") as RNA:
        for k in sorted(non_cds_feats):
            for feat in non_cds_feats[k]:
                if feat[0] in ("ChrC", "ChrM"):
                    continue
                if feat[2] == "exon":
                    continue
                # The key uses the seqid as it appears in the input, i.e.
                # before the CHR prefix is stripped.
                key = (feat[0], feat[3], feat[4])
                if key in seen:
                    continue
                seen.add(key)
                feat[0] = feat[0].upper().replace("CHR", "")
                feat[-1] = k
                RNA.write("\t".join(feat) + "\n")

    gff = read_gff(outdir + "/at_non_cds.gff")
    fasta = Fasta(fasta_path)
    with open(outdir + "/at_rnas.fasta", "w") as FA:
        for seqid, feature_list in gff.iteritems():
            for fname, feature in feature_list.iteritems():
                seq = fasta.sequence(feature)
                # Header keeps the historical "> name" form (with a space).
                FA.write("> %s\n" % (feature["name"],))
                FA.write("%s\n" % (seq,))
def main(gff_file, outdir,
         fasta_path='/home/gturco/src/find_cns_gturco/pipeline/data/arabidopsis.fasta'):
    """Write GFF rows and sequences for parents that have no CDS feature.

    Steps:
      1. Group the lines of ``gff_file`` by the ``parent=`` value found in
         their last column and drop every parent owning a CDS line.
      2. Write the surviving rows (skipping ``exon`` rows, ``ChrC``/``ChrM``
         and repeated (seqid, start, end) triples) to
         ``<outdir>/at_non_cds.gff``; the ``CHR`` prefix is removed from the
         seqid and the last column becomes the parent name.
      3. Re-read that file with ``read_gff`` and write each feature's
         sequence to ``<outdir>/at_rnas.fasta``.

    Args:
        gff_file: path of the input GFF file.
        outdir: output directory for both generated files.
        fasta_path: genome FASTA to extract sequences from; the default is
            the location that used to be hard-coded here.
    """
    name = re.compile('parent=([^.;]+)', re.I)
    parents_with_cds = set()
    non_cds_feats = collections.defaultdict(list)

    # The input handle used to be left open; `with` closes it.
    with open(gff_file) as handle:
        for raw in handle:
            cols = raw.split('\t')
            match = name.search(cols[-1])
            if match is None:
                continue
            fname = match.group(1)
            non_cds_feats[fname].append(cols)
            if cols[2].upper() == 'CDS':
                parents_with_cds.add(fname)

    for fname in parents_with_cds:
        del non_cds_feats[fname]

    written = set()
    with open(outdir + '/at_non_cds.gff', 'w') as RNA:
        for k in sorted(non_cds_feats):
            for feat in non_cds_feats[k]:
                if feat[0] in ('ChrC', 'ChrM') or feat[2] == 'exon':
                    continue
                # Deduplicate on the seqid exactly as given in the input.
                key = (feat[0], feat[3], feat[4])
                if key in written:
                    continue
                written.add(key)
                feat[0] = feat[0].upper().replace('CHR', '')
                feat[-1] = k
                RNA.write('\t'.join(feat) + '\n')

    gff = read_gff(outdir + '/at_non_cds.gff')
    fasta = Fasta(fasta_path)
    with open(outdir + '/at_rnas.fasta', 'w') as FA:
        for seqid, feature_list in gff.iteritems():
            for fname, feature in feature_list.iteritems():
                seq = fasta.sequence(feature)
                # "> name" (with the space) is the established header format.
                FA.write('> %s\n' % (feature['name'],))
                FA.write('%s\n' % (seq,))
def run(args):
    """Extract flanked BED intervals from a genome and emit them as FASTA.

    Every non-blank line of ``args.bedfile`` is split on whitespace into
    chromosome, start, stop and strand (columns 1-4). The interval is widened
    by ``args.flank`` on both sides and fetched from ``args.genome``. Records
    are named ``>{args.seqname}_{n}`` with ``n`` counting non-blank lines
    from 1.

    The records are written to ``args.outfile`` when it is set, otherwise
    printed to stdout. Both destinations now separate records with a
    newline; the stdout branch used to glue them together with no separator,
    which produced invalid FASTA.

    NOTE: ``.upper()`` is applied to the whole record, header included. This
    is kept as-is so existing outputs do not change.
    """
    genome = Fasta(args.genome)

    records = []
    for line in args.bedfile:
        fields = line.strip().split()
        if not fields:
            # Blank lines are ignored and do not consume a record number.
            continue
        sequence = genome.sequence({
            'chr': fields[0],
            'start': int(fields[1]) - args.flank,
            'stop': int(fields[2]) + args.flank,
            'strand': fields[3],
        })
        record = '>{0}_{1}\n{2}'.format(
            args.seqname, len(records) + 1, sequence)
        records.append(record.upper())

    output = '\n'.join(records)
    if args.outfile:
        args.outfile.write(output)
    else:
        print(output)
def intron(fa, ann):
    """Write the concatenated intron sequence of every gene to ``intron.fa``.

    CDS rows of the tab-separated annotation file ``ann`` are grouped per
    gene. The gene name is the ``Parent=`` attribute truncated at the first
    ``.`` and then at the first ``_``. For each gene the gaps between
    consecutive CDS rows (in file order) are fetched from the FASTA file
    ``fa`` using the chromosome and strand of the gene's first CDS row, and
    concatenated into one record named ``>{gene}-intron``.

    Args:
        fa: path of the genome FASTA file.
        ann: path of the GFF-like annotation file.
    """
    f = Fasta(fa)
    mdict = {}  # gene -> [(start, stop), ...] of its CDS rows, file order
    ndict = {}  # gene -> [chromosome, strand] from the gene's first CDS row

    with open(ann, 'r') as fh:
        for line in fh:
            if line.startswith('#'):
                continue
            new = line.strip().split('\t')
            # Malformed or blank lines lack the columns read below.
            if len(new) < 7 or new[2] != 'CDS':
                continue
            parent_fields = [a for a in new[-1].split(';') if 'Parent=' in a]
            if not parent_fields:
                # Previously the attribute index of an earlier line was
                # silently reused (or a NameError was raised on the first
                # such line); a CDS without a parent is now skipped.
                continue
            # As before, the last attribute containing 'Parent=' wins.
            t = parent_fields[-1].split('.')[0].replace('Parent=', '')
            gene = t.split('_')[0]
            if gene not in mdict:
                mdict[gene] = []
                ndict[gene] = [new[0], new[6]]
            mdict[gene].append((int(new[3]), int(new[4])))

    with open('intron.fa', 'w') as out1:
        for gene in sorted(mdict):
            cds = mdict[gene]
            chrom, strand = ndict[gene]
            pieces = []
            for j in range(len(cds) - 1):
                # Intron = region strictly between two consecutive CDS rows.
                pieces.append(f.sequence({
                    'chr': chrom,
                    'start': cds[j][1] + 1,
                    'stop': cds[j + 1][0] - 1,
                    'strand': strand,
                }))
            out1.write('>{0}-intron'.format(gene) + '\n')
            out1.write(''.join(pieces) + '\n')
def extract_reference_allele():
    """Write the reference base of every position in ``pos`` to file ``REF``.

    The output starts with a ``Ref`` header line followed by one line per
    position. Bases are read from the first record of the FASTA file
    ``args.reference``. Uses the module-level ``args`` and ``pos``.
    """
    print("Extracting Reference Allele from Reference Fasta file - %s to REF\n"
          % args.reference)

    # Get reference genome ID from reference fasta file
    get_reference = Fasta(args.reference)
    if len(get_reference.keys()) == 1:
        ref_id = get_reference.keys()
        print("The reference genome ID from reference genome - %s" % ref_id)

    # `with` guarantees REF is closed even if a look-up fails midway.
    with open("REF", 'w+') as fileObj:
        fileObj.write('Ref' + '\n')
        if pos:
            # The record name is the same for every position; resolve once.
            chrom = str(get_reference.keys()[0])
        for item in pos:
            ref_allele = str(get_reference.sequence({
                'chr': chrom,
                'start': int(item),
                'stop': int(item),
            }))
            fileObj.write(ref_allele + '\n')
def parse_sequences(sites, size, fasta_file):
    """Add the extended binding-site sequence to each input region.

    The interval ``ext_start``..``ext_end`` of every region is fetched from
    the genome FASTA, converted to upper case and, when the region lies on
    the negative strand, reverse-complemented. The sequence is stored in the
    region dict under ``"ext_seq"`` (in-place modification).

    Args:
        sites: iterable of region dicts with the keys ``"chr"``,
            ``"ext_start"``, ``"ext_end"`` and ``"strand"``.
        size: accepted for compatibility; not used by this function.
        fasta_file: path of the genome FASTA file opened with pyfasta.

    Returns:
        ``sites``, with ``"ext_seq"`` set on every region.
    """
    # Fasta package is needed to fetch sequences from the genome fasta file.
    from pyfasta import Fasta

    print("INFO: Begin to fetch sequences....")
    f = Fasta(fasta_file, key_fn=lambda key: key.split()[0])

    for reg in sites:
        is_negative = reg["strand"] == '-'
        # If the motif is on the negative strand, shift the region by +1 to
        # account for zero-based half-open intervals.
        shift = 1 if is_negative else 0
        query = {
            "chr": reg["chr"],
            "start": reg["ext_start"] + shift,
            "stop": reg["ext_end"] + shift,
        }
        # Note: 'strand' is not handed to f.sequence because the original
        # author found that argument unreliable; the strand is handled here.
        seq = f.sequence(query, one_based=False).upper()
        if is_negative:
            seq = reverse_complement(seq)
        reg["ext_seq"] = seq

    print("INFO: Finished sequences.")
    # Fix: the function used to return `regions`, a name that is never
    # defined, so every call ended in a NameError.
    return sites
def main(gff_file, fasta_file, parents, children):
    """Print each parent feature's concatenated child sequences as FASTA.

    ``parents`` and ``children`` are comma-separated feature types. A GFF
    database ``<gff_file>.db`` is created when missing. For every parent
    feature, the sequences of its level-1 children of the wanted types are
    joined in ascending start order (reversed for parents on the ``-``
    strand) and printed under a ``>{feature id}`` header. Parents without
    matching children produce a warning on stderr.
    """
    db_file = gff_file + ".db"
    if not op.exists(db_file):
        GFFutils.create_gffdb(gff_file, db_file)

    fasta = Fasta(fasta_file)
    db = GFFutils.GFFDB(db_file)

    parent_types = set(parents.split(','))
    wanted_types = set(children.split(','))
    per_type = [db.features_of_type(ftype) for ftype in parent_types]

    for feat in itertools.chain(*per_type):
        pieces = []
        for c in db.children(feat.id, 1):
            if c.featuretype in wanted_types:
                region = {'chr': c.chrom, 'start': c.start,
                          'stop': c.stop, 'strand': c.strand}
                pieces.append((fasta.sequence(region), c))

        if not pieces:
            sys.stderr.write(
                "[warning] %s has no children with type %s\n"
                % (feat.id, ','.join(wanted_types)))
            continue

        # sort children in incremental position
        pieces.sort(key=lambda pair: pair[1].start)
        # reverse children if negative strand
        if feat.strand == '-':
            pieces.reverse()

        print(">%s" % feat.id)
        print(''.join(seq for seq, _ in pieces))
def main(gff_file, fasta_file, parents, children):
    """Emit one FASTA record per parent feature, built from its children.

    Args:
        gff_file: GFF annotation; its database ``<gff_file>.db`` is created
            on first use.
        fasta_file: genome FASTA providing the sequences.
        parents: comma-separated feature types to report.
        children: comma-separated feature types whose sequences are joined.

    Children are taken at level 1, ordered by ascending start and reversed
    when the parent is on the ``-`` strand. A parent with no matching child
    only triggers a warning on stderr.
    """
    db_file = gff_file + ".db"
    database_missing = not op.exists(db_file)
    if database_missing:
        GFFutils.create_gffdb(gff_file, db_file)

    f = Fasta(fasta_file)
    g = GFFutils.GFFDB(db_file)

    parent_kinds = set(parents.split(','))
    child_kinds = set(children.split(','))
    parent_features = itertools.chain(
        *[g.features_of_type(kind) for kind in parent_kinds])

    def start_of(entry):
        return entry[1].start

    for feat in parent_features:
        selected = [
            (f.sequence(dict(chr=c.chrom, start=c.start,
                             stop=c.stop, strand=c.strand)), c)
            for c in g.children(feat.id, 1)
            if c.featuretype in child_kinds
        ]
        if len(selected) == 0:
            message = "[warning] %s has no children with type %s" \
                % (feat.id, ','.join(child_kinds))
            sys.stderr.write(message + "\n")
            continue

        # sort children in incremental position
        selected.sort(key=start_of)
        # reverse children if negative strand
        if feat.strand == '-':
            selected.reverse()

        feat_seq = ''.join(entry[0] for entry in selected)
        print(">%s" % feat.id)
        print(feat_seq)
# -*- coding: utf-8 -*-
# version 1.1: this version is implemented with pyfasta.
# Writes the reverse complement of every record of inputFile to outputFile.
import os
import sys

from pyfasta import Fasta

if len(sys.argv) != 3:
    print('Usage: *.py inputFile outputFile')
    sys.exit(0)

inputFile = sys.argv[1]
outputFile = sys.argv[2]


def writeFile(text, files):
    """Append ``text`` to the file at path ``files``."""
    with open(files, 'a') as f:
        f.write(text)


if os.path.isfile(inputFile):
    f = Fasta(inputFile)
    for key in f.keys():
        writeFile(">" + key + os.linesep, outputFile)
        # With one_based=False pyfasta slices [start:stop), so the stop must
        # be the full record length. The former `len(f[key]) - 1` dropped the
        # last base of every record before reverse-complementing it.
        content = f.sequence(
            {
                'chr': key,
                'start': 0,
                'stop': len(f[key]),
                'strand': '-'
            },
            one_based=False)
        writeFile(content + os.linesep, outputFile)
else:
    print('您输入的不是一个文件')
def downstream(fa, ann, kb1, kb2):
    '''
    Extract the 3' UTR and downstream sequence of every gene.

    fa is the genome assembly file, ann is the annotation file, kb1 is the
    defined length of the 3' UTR, kb2 is the defined length of downstream.

    Genes are built from the CDS rows of ann; the gene name is the Parent=
    attribute truncated at the first "." and then at the first "_". For a
    "+" gene the windows follow the largest CDS coordinate, for a "-" gene
    they precede the smallest one. Each window spans int(kb) + 1 positions,
    as in the original arithmetic. Genes on any other strand are skipped.
    Results are written to 3-UTR.fa and downstream.fa in the working
    directory.
    '''
    f = Fasta(fa)
    utr_span = int(kb1) + 1
    downstream_span = int(kb2) + 1

    mdict = {}  # gene -> all CDS start/stop coordinates
    ndict = {}  # gene -> [chromosome, strand] from the gene's first CDS row

    with open(ann, 'r') as fh:
        for line in fh:
            # this is the demo line that we want to filter out
            # chr7 GLEAN Gene 25420153 25421713 0.953889 - . Name=Pgl_GLEAN_10006696;
            if line.startswith('#'):
                continue
            new = line.strip().split('\t')
            # Malformed or blank lines lack the columns read below.
            if len(new) < 7 or new[2] != 'CDS':
                continue
            parent_fields = [a for a in new[-1].split(';') if 'Parent=' in a]
            if not parent_fields:
                # Previously the attribute index of an earlier line was
                # reused (or a NameError raised); such rows are now skipped.
                continue
            # As before, the last attribute containing 'Parent=' wins.
            t = parent_fields[-1].split('.')[0].replace('Parent=', '')
            gene = t.split('_')[0]
            if gene not in mdict:
                mdict[gene] = []
                ndict[gene] = [new[0], new[6]]
            mdict[gene].append(int(new[3]))
            mdict[gene].append(int(new[4]))

    with open('3-UTR.fa', 'w') as out1, open('downstream.fa', 'w') as out2:
        for gene in sorted(mdict):
            chrom, strand = ndict[gene][0], ndict[gene][1]
            if strand == '+':
                stop = max(mdict[gene])
                start1 = stop + 1
                stop1 = stop + utr_span
                start2 = stop1 + 1
                stop2 = stop1 + downstream_span
            elif strand == '-':
                start = min(mdict[gene])
                start1 = start - utr_span
                stop1 = start - 1
                start2 = start1 - downstream_span
                stop2 = start1 - 1
            else:
                continue

            k1 = f.sequence({
                'chr': chrom,
                'start': start1,
                'stop': stop1,
                'strand': strand
            })
            out1.write('>{0}-3UTR'.format(gene) + '\n')
            out1.write(k1 + '\n')

            k2 = f.sequence({
                'chr': chrom,
                'start': start2,
                'stop': stop2,
                'strand': strand
            })
            out2.write('>{0}-downstream'.format(gene) + '\n')
            out2.write(k2 + '\n')
# NOTE(review): this is the interior of a function whose header is not
# visible here; `out`, `j`, `reference`, `f1` and `args` come from the
# enclosing scope. Presumably `out` is the result of an earlier look-up for
# position `j` - confirm against the surrounding code.
if not out:
    # Nothing was found for this position: read the base at position j from
    # the first record of the reference FASTA.
    f = Fasta(reference)
    if len(f.keys()) == 1:
        ref_id = str(f.keys())
    fasta_string = ""
    # Shell alternative for the same look-up; the command string is built but
    # never executed (the code that ran it is disabled below).
    extract_base = "tr -d \'\\n\' < %s | cut -b%s" % (reference, j)
    #print extract_base
    # proc = subprocess.Popen([extract_base], stdout=subprocess.PIPE, shell=True)
    # (out, err) = proc.communicate()
    # out = out.strip()
    # fasta_string = fasta_string + out
    # if not out:
    # print "Error extracting reference allele"
    #out = str(f.sequence({'chr': str(f.keys()[0]), 'start': int(lines), 'stop': int(lines)}))
    fasta_string = fasta_string + str(f.sequence({'chr': str(f.keys()[0]), 'start': int(j), 'stop': int(j)}))
    # Strip all whitespace from the fetched base.
    pattern = re.compile(r'\s+')
    fasta_string = re.sub(pattern, '', fasta_string)
    # The same base is written twice on one line.
    # NOTE(review): presumably as REF and ALT - confirm the format of f1.
    st = fasta_string + fasta_string + "\n"
    f1.write(st)
else:
    # Grep the VCF for a line containing position j surrounded by whitespace
    # and split it on tabs; columns 4 and 5 are read as REF and ALT.
    cmd2 = "grep -P \'\s+" + j + "\s+\' " + args.filter2_only_snp_vcf_file
    #cmd2 = "grep -v \'^#\' %s | awk -F\'\t\' \'{print $2}\' | grep -w \'%s\'" % (final_file, j)
    proc = subprocess.Popen([cmd2], stdout=subprocess.PIPE, shell=True)
    (out2, err2) = proc.communicate()
    line_string_array = out2.split('\t')
    print line_string_array
    ref_allele = line_string_array[3]
    alt_allel = line_string_array[4]
def extract_only_ref_variant_fasta_alternate():
    """Write ``<sample>_variants.fa`` with one base per listed position.

    Positions come from ``Only_ref_variant_positions_for_closely`` in
    ``args.filter2_only_snp_vcf_dir``. When ``args.functional_filter`` is
    ``"yes"``, positions listed in ``Functional_class_filter_positions.txt``
    are removed first. For every remaining position the base is the first
    element of the stored ALT value from the sample's core VCF when the
    position is present there, otherwise the base read from the reference
    FASTA (``args.reference``).

    Prints the number of positions, the count of emitted bases and the final
    sequence length. Reads the module-level ``args``; returns nothing.
    """
    # Get reference genome ID
    f = Fasta(args.reference)

    positions_path = ("%s/Only_ref_variant_positions_for_closely"
                      % args.filter2_only_snp_vcf_dir)

    if args.functional_filter == "yes":
        functional_class_filter_positions = (
            args.filter2_only_snp_vcf_dir
            + "/Functional_class_filter_positions.txt")
        with open(functional_class_filter_positions, 'rU') as f_functional:
            # A set gives O(1) membership tests; the list scan used before
            # made the filtering quadratic.
            functional_filter_positions = set(
                line_func.strip() for line_func in f_functional)
    else:
        functional_filter_positions = set()

    # Opened read-only (was 'r+', which needlessly required write access).
    with open(positions_path, 'r') as ffp:
        only_ref_variant = [
            position for position in (line.strip() for line in ffp)
            if position not in functional_filter_positions
        ]
    print(len(only_ref_variant))

    core_vcf_file = args.filter2_only_snp_vcf_filename.replace(
        '_filter2_final.vcf_no_proximate_snp.vcf',
        '_filter2_final.vcf_core.vcf.gz')
    sample_name = os.path.basename(
        core_vcf_file.replace('_filter2_final.vcf_core.vcf.gz', ''))

    # POS -> ALT value. For multi-allelic records only the first ALT allele
    # is kept; otherwise the ALT list itself is stored. Either way `[0]` is
    # applied at look-up time below, exactly as before.
    core_vcf_pos_base = {}
    for variants in VCF("%s" % core_vcf_file):
        if len(variants.ALT) > 1:
            core_vcf_pos_base[variants.POS] = variants.ALT[0]
        else:
            core_vcf_pos_base[variants.POS] = variants.ALT

    bases = []
    for lines in only_ref_variant:
        position = int(lines.strip())
        # Direct dict membership instead of scanning `.keys()` (a list in
        # Python 2) for every position.
        if position in core_vcf_pos_base:
            bases.append(str(core_vcf_pos_base[position][0]))
        else:
            bases.append(str(f.sequence({
                'chr': str(f.keys()[0]),
                'start': position,
                'stop': position,
            })))
    count = len(bases)

    fasta_string = re.sub(r'\s+', '', "".join(bases))
    final_fasta_string = ">%s\n" % sample_name + fasta_string

    out_path = "%s/%s_variants.fa" % (args.filter2_only_snp_vcf_dir,
                                      sample_name)
    with open(out_path, 'w+') as fp:
        fp.write(final_fasta_string + '\n')

    print("Count: %s " % count)
    print("Length: %s " % len(fasta_string))
class MutateFasta(object):
    """Apply simple variants (SNV, insertion, deletion) to reference sequence.

    All coordinates handed to ``slice_fasta`` are 1-based and inclusive.
    """

    def __init__(self, fasta):
        """Open the FASTA file ``fasta``, keyed by the first header token."""
        self.fasta = Fasta(fasta, key_fn=lambda key: key.split()[0])
        # self.chroms = [str(i+1) for i in range(22)] + ['X', 'Y']  # , 'MT']

    def generate_seq(self, records, offset=None):
        """Return the reference sequence with ``records`` applied.

        Args:
            records: variant dicts with ``chrom``, ``pos``, ``ref`` and
                ``alt``. They are applied in the order given and are
                expected to be sorted by position.
            offset: optional ``(chrom, prev_pos, last_pos)``; the sequence
                covers positions ``prev_pos + 1`` .. ``last_pos``. Without
                it, the whole chromosome of the first record is used.

        Returns:
            The mutated sequence, or ``None`` when neither ``records`` nor
            ``offset`` is given.

        Records are skipped when they are on another chromosome, lack
        ``ref``/``alt``, do not match the reference base at ``pos``, or are
        of a kind ``_classify_mut`` cannot classify (equal-length multi-base
        substitutions).
        """
        if not records and not offset:
            return

        pieces = []
        chrom = offset[0] if offset else records[0]['chrom']
        prev_pos = offset[1] if offset else 0
        last_pos = offset[2] if offset else len(self.fasta[chrom])

        for r in records:
            # Filter first so no sequence is fetched for irrelevant records.
            if not r['chrom'] == chrom:
                continue
            if not (r['ref'] and r['alt']):
                continue
            ref = self.slice_fasta(r['chrom'], r['pos'], r['pos'])
            if not r['ref'][0] == ref:
                continue

            classified = self._classify_mut(r['ref'], r['alt'])
            if classified is None:
                # Previously this crashed with a TypeError while unpacking.
                continue
            mut_type, sub_seq = classified

            if mut_type == 'snv':
                pieces.append(
                    self.slice_fasta(chrom, prev_pos + 1, r['pos'] - 1))
                pieces.append(sub_seq)
                prev_pos = r['pos']
            elif mut_type == 'del':
                pieces.append(self.slice_fasta(chrom, prev_pos + 1, r['pos']))
                # Skip the deleted bases that follow the anchor base. The old
                # `prev_pos += len(sub_seq)` advanced from the stale position
                # and therefore re-emitted already written/deleted sequence.
                prev_pos = r['pos'] + len(sub_seq)
            elif mut_type == 'ins':
                pieces.append(self.slice_fasta(chrom, prev_pos + 1, r['pos']))
                pieces.append(sub_seq)
                prev_pos = r['pos']

        # Remainder after the last applied record.
        if prev_pos + 1 <= last_pos:
            pieces.append(self.slice_fasta(chrom, prev_pos + 1, last_pos))
        return ''.join(pieces)

    def generate_contexted_seq(self, r):
        """Split a refFlat-style record into labelled sequence segments.

        Returns a list of ``[sequence, label]`` pairs with labels ``'utr'``,
        ``'exon'`` and ``'intron'``, built from ``txStart``/``txEnd``,
        ``cdsStart``/``cdsEnd``, ``exonStarts``/``exonEnds`` and
        ``exonCount`` of ``r``.
        """
        cons = []
        chrom = r['chrom']

        # TODO: support - strand genes. (currently only supports + strand genes...)
        # NOTE: refFlat is stored in 0-based coordinate

        # 5'UTR + 1st Exon
        cons.append(
            [self.slice_fasta(chrom, r['txStart'] + 1, r['cdsStart']), 'utr'])
        cons.append([
            self.slice_fasta(chrom, r['cdsStart'] + 1, r['exonEnds'][0]),
            'exon'
        ])
        if r['exonCount'] > 1:
            cons.append([
                self.slice_fasta(chrom, r['exonEnds'][0] + 1,
                                 r['exonStarts'][1]), 'intron'
            ])

        # Exons (first and last exon are handled separately)
        for i, con in enumerate(r['exonStarts']):
            if i == 0 or i + 1 == r['exonCount']:
                continue
            cons.append([
                self.slice_fasta(chrom, r['exonStarts'][i] + 1,
                                 r['exonEnds'][i]), 'exon'
            ])
            cons.append([
                self.slice_fasta(chrom, r['exonEnds'][i] + 1,
                                 r['exonStarts'][i + 1]), 'intron'
            ])

        # last Exon + 3'UTR
        cons.append([
            self.slice_fasta(chrom, r['exonStarts'][r['exonCount'] - 1] + 1,
                             r['cdsEnd']), 'exon'
        ])
        cons.append(
            [self.slice_fasta(chrom, r['cdsEnd'] + 1, r['txEnd']), 'utr'])

        return cons

    def slice_fasta(self, chrom, start, stop):
        """Return ``chrom[start..stop]`` using 1-based inclusive coordinates."""
        return self.fasta.sequence(
            {
                'chr': str(chrom),
                'start': int(start),
                'stop': int(stop)
            },
            one_based=True)

    def _classify_mut(self, ref, alt):
        """Classify a variant and return ``(type, affected bases)``.

        >>> _classify_mut('A','G')
        ('snv', 'G')
        >>> _classify_mut('G','GAA')
        ('ins', 'AA')
        >>> _classify_mut('TTA','T')
        ('del', 'TA')

        Insertions and deletions must share their first base with the other
        allele (AssertionError otherwise). Returns ``None`` implicitly for
        equal-length alleles longer than one base.
        """
        if len(ref) == len(alt) == 1:
            return 'snv', alt
        elif len(ref) < len(alt):
            assert ref[0] == alt[0], '{0} {1}'.format(ref, alt)
            return 'ins', alt[1:]
        elif len(ref) > len(alt):
            assert ref[0] == alt[0], '{0} {1}'.format(ref, alt)
            return 'del', ref[1:]
def extract_only_ref_variant_fasta_unique_positions():
    """Build per-sample allele FASTA/VCF files from the SNP allele matrix.

    Reads ``SNP_matrix_allele_new.csv`` (tab-separated) from
    ``args.filter2_only_snp_vcf_dir``. For the matrix column whose header
    prefix matches the sample name derived from
    ``args.filter2_only_snp_vcf_filename`` it:

    * writes ``<sample>_allele_variants.fa`` with the first character of
      each ``/``-containing cell (whole cell otherwise),
    * writes ``<sample>_ref_allele_variants.vcf`` with one record per matrix
      position (REF from ``args.reference``, ALT from the matrix),
    * runs bgzip, tabix, vcf-consensus and sed through the shell, appending
      each command to ``consensus_ref_allele_variant.sh``.

    Non-matching columns only print a message. Uses the module-level
    ``args`` and ``Config``; returns nothing.
    """
    # Get reference genome ID
    get_reference = Fasta(args.reference)
    if len(get_reference.keys()) == 1:
        # NOTE(review): ref_id is only bound for single-record references but
        # is used unconditionally further down - confirm inputs always
        # contain exactly one record.
        ref_id = get_reference.keys()

    # The matrix is opened twice: once to transpose it into columns, once to
    # count the columns of the header row. Neither handle is closed.
    c_reader = csv.reader(open('%s/SNP_matrix_allele_new.csv' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t')
    c_reader_2 = csv.reader(open('%s/SNP_matrix_allele_new.csv' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t')
    columns = list(zip(*c_reader))
    ncol = len(next(c_reader_2))

    # First column (minus header): space-separated descriptors. The position
    # is field 3, or field 2 when the first field is the literal "None".
    unique_position_array = []
    for i in columns[0][1:]:
        replace_string = i.split(' ')
        if replace_string[0] != "None":
            unique_position_array.append(int(replace_string[3]))
        else:
            unique_position_array.append(int(replace_string[2]))

    counts = 1
    end = ncol
    # Walk over the sample columns (column 0 holds the positions).
    for i in xrange(1, end, 1):
        print_string = ""
        ref_print_string = ""
        # Length of the expected sample name; the column header is truncated
        # to this length before comparing.
        grab_vcf_filename = len(os.path.basename(args.filter2_only_snp_vcf_filename).replace('_filter2_final.vcf_no_proximate_snp.vcf', ''))
        sample_name_re = columns[i][0][:grab_vcf_filename]

        # (An earlier regex-based clean-up of the sample name was replaced by
        # the prefix comparison above.)
        if sample_name_re == os.path.basename(args.filter2_only_snp_vcf_filename).replace('_filter2_final.vcf_no_proximate_snp.vcf', '') or sample_name_re in os.path.basename(args.filter2_only_snp_vcf_filename).replace('_filter2_final.vcf_no_proximate_snp.vcf', ''):
            vcf_header = "##fileformat=VCFv4.2\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n" % sample_name_re
            print_string = print_string + ">%s\n" % sample_name_re
            ref_print_string = ref_print_string + ">%s\n" % sample_name_re

            # One character per matrix row: cells containing "/" contribute
            # only their first character.
            variant_allele = ""
            for ntd in columns[i][1:]:
                if "/" in ntd:
                    variant_allele = variant_allele + ntd[0]
                else:
                    variant_allele = variant_allele + ntd
            print_string = print_string + str(variant_allele) + "\n"

            allele_variant_fasta = open("%s/%s_allele_variants.fa" % (args.filter2_only_snp_vcf_dir, sample_name_re), 'w+')
            allele_variant_fasta.write(print_string)
            allele_variant_fasta.close()

            # NOTE(review): allele_ref_variant_fasta is opened (truncating
            # the file) but never written to or closed in this function.
            allele_ref_variant_fasta = open("%s/%s_ref_allele_variants.fa" % (args.filter2_only_snp_vcf_dir, sample_name_re), 'w+')
            allele_ref_variant_vcf = open("%s/%s_ref_allele_variants.vcf" % (args.filter2_only_snp_vcf_dir, sample_name_re), 'w+')
            allele_ref_variant_vcf.write(vcf_header)

            variant_allele_array = []
            variant_allele_array.append(columns[i][1:])

            get_sample_reference = Fasta("%s/%s_allele_variants.fa" % (args.filter2_only_snp_vcf_dir, sample_name_re))
            if len(get_sample_reference.keys()) == 1:
                sample_ref_id = get_sample_reference.keys()

            for positions in unique_position_array:
                # .index() returns the FIRST occurrence, so duplicated
                # positions all map to the same matrix row, and the look-up
                # is linear per position.
                pos_index = unique_position_array.index(positions)
                if "/" in str(variant_allele_array[0][pos_index]):
                    allele_var = str(variant_allele_array[0][pos_index][0])
                else:
                    allele_var = str(variant_allele_array[0][pos_index])
                ref_allele = str(get_reference.sequence({'chr': str(get_reference.keys()[0]), 'start': int(positions), 'stop': int(positions)}))
                # CHROM is the first space-separated token of the reference
                # record name; QUAL is a fixed placeholder value.
                generate_vcf_string = "%s\t%s\t.\t%s\t%s\t221.999\t.\t.\t.\n" % (ref_id[0].split(' ')[0], positions, ref_allele, allele_var)
                allele_ref_variant_vcf.write(generate_vcf_string)
            allele_ref_variant_vcf.close()

            # Each shell command is both appended to the .sh log and executed
            # immediately; f1 is never closed explicitly.
            filename = "%s/consensus_ref_allele_variant.sh" % args.filter2_only_snp_vcf_dir
            vcf_filename = "%s/%s_ref_allele_variants.vcf" % (args.filter2_only_snp_vcf_dir, sample_name_re)
            f1 = open(filename, 'a+')
            bgzip_cmd = "%s/%s/bgzip -f %s\n" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], vcf_filename)
            f1.write(bgzip_cmd)
            subprocess.call([bgzip_cmd], shell=True)
            tabix_cmd = "%s/%s/tabix -f -p vcf %s.gz\n" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], vcf_filename)
            f1.write(tabix_cmd)
            subprocess.call([tabix_cmd], shell=True)
            base_vcftools_bin = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("vcftools", Config)['vcftools_bin']
            # The consensus FASTA path is relative, so it is written to the
            # current working directory, not to filter2_only_snp_vcf_dir.
            fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s_ref_allele_variants.fa\n" % (args.reference, base_vcftools_bin, vcf_filename, sample_name_re)
            f1.write(fasta_cmd)
            subprocess.call([fasta_cmd], shell=True)
            # Rename the FASTA header to the sample name.
            sed_command = "sed -i 's/>.*/>%s/g' %s_ref_allele_variants.fa\n" % (sample_name_re, sample_name_re)
            subprocess.call([sed_command], shell=True)
            f1.write(sed_command)
        else:
            print "Sample name %s does not match with column name %s" % (os.path.basename(args.filter2_only_snp_vcf_filename).replace('_filter2_final.vcf_no_proximate_snp.vcf', ''), sample_name_re)
def extract_only_ref_variant_fasta_unique_positions_with_unmapped():
    """Build per-sample consensus inputs, including unmapped positions.

    Reads ``SNP_matrix_allele_new.csv`` (tab-separated) from
    ``args.filter2_only_snp_vcf_dir``. For the matrix column whose header
    prefix matches the sample name derived from
    ``args.filter2_only_snp_vcf_filename`` it:

    * writes ``<sample>_allele_variants.fa`` and
      ``<sample>_ref_allele_variants.vcf`` from the matrix, then runs
      bgzip/tabix/vcf-consensus/sed, appending the commands to
      ``consensus_ref_allele_variant.sh``;
    * writes ``<sample>_unmapped.vcf`` with ALT ``-`` for every position in
      ``<sample>_unmapped.bed_positions``, merges it with the variant VCF
      via ``bcftools merge`` and runs bgzip/tabix/vcf-consensus/sed on the
      merged file, recording those commands in
      ``<sample>_consensus_ref_allele_unmapped_variant.sh``.

    Non-matching columns only print a message. Uses the module-level
    ``args`` and ``Config``; returns nothing.
    """
    # Get reference genome ID from reference fasta file
    get_reference = Fasta(args.reference)
    if len(get_reference.keys()) == 1:
        # NOTE(review): ref_id is only bound for single-record references but
        # is used unconditionally further down - confirm inputs always
        # contain exactly one record.
        ref_id = get_reference.keys()

    # Read in the SNP matrix file and separate the columns. The file is
    # opened twice (transpose + header width); neither handle is closed.
    c_reader = csv.reader(open('%s/SNP_matrix_allele_new.csv' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t')
    c_reader_2 = csv.reader(open('%s/SNP_matrix_allele_new.csv' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t')
    columns = list(zip(*c_reader))
    ncol = len(next(c_reader_2))

    # Generate an array of all the unique variant positions that were called
    # in all the samples. The position is field 3 of the space-separated
    # descriptor, or field 2 when the first field is the literal "None".
    unique_position_array = []
    for i in columns[0][1:]:
        replace_string = i.split(' ')
        if replace_string[0] != "None":
            unique_position_array.append(int(replace_string[3]))
        else:
            unique_position_array.append(int(replace_string[2]))

    counts = 1
    end = ncol
    # Loop over each column, check if the column name matches the sample name
    # provided with argument args.filter2_only_snp_vcf_filename
    for i in xrange(1, end, 1):
        print_string = ""
        ref_print_string = ""
        # Length of the expected sample name; the column header is truncated
        # to this length before comparing.
        grab_vcf_filename = len(os.path.basename(args.filter2_only_snp_vcf_filename).replace('_filter2_final.vcf_no_proximate_snp.vcf', ''))
        sample_name_re = columns[i][0][:grab_vcf_filename]

        # (An earlier regex-based clean-up of the sample name was replaced by
        # the prefix comparison above.)
        if sample_name_re == os.path.basename(args.filter2_only_snp_vcf_filename).replace('_filter2_final.vcf_no_proximate_snp.vcf', '') or sample_name_re in os.path.basename(args.filter2_only_snp_vcf_filename).replace('_filter2_final.vcf_no_proximate_snp.vcf', ''):
            vcf_header = "##fileformat=VCFv4.2\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n" % sample_name_re
            print_string = print_string + ">%s\n" % sample_name_re
            ref_print_string = ref_print_string + ">%s\n" % sample_name_re

            # One character per matrix row: cells containing "/" or longer
            # than one character contribute only their first character.
            variant_allele = ""
            for ntd in columns[i][1:]:
                if "/" in ntd or len(ntd) > 1:
                    variant_allele = variant_allele + ntd[0]
                else:
                    variant_allele = variant_allele + ntd
            print_string = print_string + str(variant_allele) + "\n"

            allele_variant_fasta = open("%s/%s_allele_variants.fa" % (args.filter2_only_snp_vcf_dir, sample_name_re), 'w+')
            # NOTE(review): allele_ref_variant_fasta is opened (truncating
            # the file) but never written to or closed in this function.
            allele_ref_variant_fasta = open("%s/%s_ref_allele_variants.fa" % (args.filter2_only_snp_vcf_dir, sample_name_re), 'w+')
            allele_ref_variant_vcf = open("%s/%s_ref_allele_variants.vcf" % (args.filter2_only_snp_vcf_dir, sample_name_re), 'w+')
            allele_ref_variant_vcf.write(vcf_header)
            allele_variant_fasta.write(print_string)
            allele_variant_fasta.close()

            # Map every matrix position to this sample's cell. A repeated
            # position keeps the value of its last row.
            variant_allele_array = []
            variant_allele_array_dict = {}
            count_index = 0
            end_index = len(unique_position_array) + 1
            for start_count in xrange(1, end_index, 1):
                pos = columns[0][start_count]
                get_positions_string = pos.split(' ')
                if get_positions_string[0] != "None":
                    get_positions = int(get_positions_string[3])
                else:
                    get_positions = int(get_positions_string[2])
                variant_allele_array_dict[get_positions] = columns[i][start_count]

            get_sample_reference = Fasta("%s/%s_allele_variants.fa" % (args.filter2_only_snp_vcf_dir, sample_name_re))
            if len(get_sample_reference.keys()) == 1:
                sample_ref_id = get_sample_reference.keys()

            for positions in unique_position_array:
                if "/" in str(variant_allele_array_dict[positions]) or len(variant_allele_array_dict[positions]) > 1:
                    allele_var = str(variant_allele_array_dict[positions][0])
                else:
                    allele_var = str(variant_allele_array_dict[positions])
                ref_allele = str(get_reference.sequence({'chr': str(get_reference.keys()[0]), 'start': int(positions), 'stop': int(positions)}))
                # CHROM is the first space-separated token of the reference
                # record name; QUAL is a fixed placeholder value.
                generate_vcf_string = "%s\t%s\t.\t%s\t%s\t221.999\t.\t.\t.\t.\n" % (ref_id[0].split(' ')[0], positions, ref_allele, allele_var)
                allele_ref_variant_vcf.write(generate_vcf_string)
            allele_ref_variant_vcf.close()

            # Each shell command is both appended to the shared .sh log and
            # executed immediately. This first f1 handle is rebound below
            # without being closed.
            filename = "%s/consensus_ref_allele_variant.sh" % args.filter2_only_snp_vcf_dir
            vcf_filename = "%s/%s_ref_allele_variants.vcf" % (args.filter2_only_snp_vcf_dir, sample_name_re)
            f1 = open(filename, 'a+')
            bgzip_cmd = "%s/%s/bgzip -f %s\n" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], vcf_filename)
            f1.write(bgzip_cmd)
            subprocess.call([bgzip_cmd], shell=True)
            tabix_cmd = "%s/%s/tabix -f -p vcf %s.gz\n" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], vcf_filename)
            f1.write(tabix_cmd)
            subprocess.call([tabix_cmd], shell=True)
            base_vcftools_bin = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("vcftools", Config)['vcftools_bin']
            # The consensus FASTA path is relative, so it is written to the
            # current working directory.
            fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s_ref_allele_variants.fa\n" % (args.reference, base_vcftools_bin, vcf_filename, sample_name_re)
            f1.write(fasta_cmd)
            subprocess.call([fasta_cmd], shell=True)
            sed_command = "sed -i 's/>.*/>%s/g' %s_ref_allele_variants.fa\n" % (sample_name_re, sample_name_re)
            subprocess.call([sed_command], shell=True)
            f1.write(sed_command)

            # ---- Unmapped positions: one VCF record with ALT "-" each ----
            unmapped_positions_file = "%s/%s_unmapped.bed_positions" % (args.filter2_only_snp_vcf_dir, os.path.basename(args.filter2_only_snp_vcf_filename).replace('_filter2_final.vcf_no_proximate_snp.vcf', ''))
            unmapped_vcf_file = "%s/%s_unmapped.vcf" % (args.filter2_only_snp_vcf_dir, sample_name_re)
            unmapped_vcf = open("%s/%s_unmapped.vcf" % (args.filter2_only_snp_vcf_dir, sample_name_re), 'w+')
            unmapped_vcf.write(vcf_header)
            with open(unmapped_positions_file, 'r') as fpp:
                for lines in fpp:
                    lines = lines.strip()
                    ref_allele = str(get_reference.sequence({'chr': str(get_reference.keys()[0]), 'start': int(lines), 'stop': int(lines)}))
                    generate_vcf_string_unmapped = "%s\t%s\t.\t%s\t-\t221.999\t.\t.\t.\t.\n" % (ref_id[0].split(' ')[0], lines, ref_allele)
                    unmapped_vcf.write(generate_vcf_string_unmapped)
            unmapped_vcf.close()

            # Compress and index the unmapped VCF (commands are echoed).
            bgzip_cmd = "%s/%s/bgzip -f %s\n" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], unmapped_vcf_file)
            print bgzip_cmd
            tabix_cmd = "%s/%s/tabix -f -p vcf %s.gz\n" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], unmapped_vcf_file)
            print tabix_cmd
            subprocess.call([bgzip_cmd], shell=True)
            subprocess.call([tabix_cmd], shell=True)

            # Merge unmapped + variant VCFs. bgzip_cmd/tabix_cmd/fasta_cmd
            # are rebuilt for the merged file before being logged and run.
            vcf_filename_unmapped = "%s/%s_ref_allele_unmapped.vcf" % (args.filter2_only_snp_vcf_dir, sample_name_re)
            bcftools_merge_cmd = "%s/%s/bcftools merge --merge snps --force-samples %s.gz %s.gz -O v -o %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("bcftools", Config)['bcftools_bin'], unmapped_vcf_file, vcf_filename, vcf_filename_unmapped)
            bgzip_cmd = "%s/%s/bgzip -f %s\n" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], vcf_filename_unmapped)
            subprocess.call([bcftools_merge_cmd], shell=True)
            tabix_cmd = "%s/%s/tabix -f -p vcf %s.gz\n" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], vcf_filename_unmapped)
            fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s_ref_allele_unmapped_variants.fa\n" % (args.reference, base_vcftools_bin, vcf_filename_unmapped, sample_name_re)

            # Per-sample script (overwritten on each run) recording the
            # commands for the merged file.
            filename = "%s/%s_consensus_ref_allele_unmapped_variant.sh" % (args.filter2_only_snp_vcf_dir, sample_name_re)
            f1 = open(filename, 'w+')
            f1.write(bgzip_cmd)
            f1.write(tabix_cmd)
            f1.write(fasta_cmd)
            print "print here: %s" % filename
            subprocess.call(['pwd'], shell=True)
            subprocess.call(bgzip_cmd, shell=True)
            subprocess.call(tabix_cmd, shell=True)
            subprocess.call(fasta_cmd, shell=True)
            sed_command = "sed -i 's/>.*/>%s/g' %s_ref_allele_unmapped_variants.fa\n" % (sample_name_re, sample_name_re)
            subprocess.call([sed_command], shell=True)
            f1.write(sed_command)
            f1.close()
        else:
            print "Sample name %s does not match with column name %s" % (os.path.basename(args.filter2_only_snp_vcf_filename).replace('_filter2_final.vcf_no_proximate_snp.vcf', ''), sample_name_re)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Small pyfasta demo: inspect record X80413 of test.txt and print its
# reverse complement.
from pyfasta import Fasta

RECORD = 'X80413'

f = Fasta('test.txt')
print(f.keys())
print(len(f[RECORD]))
print(f[RECORD][0:5])
# The stop coordinate used `len(f[key])`, but `key` was never defined
# (NameError); the record queried is the one used throughout this script.
print(f.sequence(
    {
        'chr': RECORD,
        'start': 0,
        'stop': len(f[RECORD]),
        'strand': '-'
    },
    one_based=False))
# Genome windows that cause a read to be discarded, keyed by how many bases
# the read is shorter than minRlen.  Forward-strand reads are compared with
# the bases immediately after the read: prefixes of "TGGAA" (the start of the
# sequence TGGAATTCTCGGGTGCCAAGG noted in the original code) or a run of A's.
_FWD_DOWNSTREAM_WINDOWS = {
    1: ("T", "A"),
    2: ("TG", "AA"),
    3: ("TGG", "AAA"),
    4: ("TGGA", "AAAA"),
    5: ("TGGAA", "AAAAA"),
}

# Reverse-strand reads are compared with the bases immediately before the
# read: suffixes of "TTCCA" (reverse complement of "TGGAA") or a run of T's.
_REV_UPSTREAM_WINDOWS = {
    1: ("A", "T"),
    2: ("CA", "TT"),
    3: ("CCA", "TTT"),
    4: ("TCCA", "TTTT"),
    5: ("TTCCA", "TTTTT"),
}


def _copy_non_precursor_reads(bam_in, bam_out, gf, minRlen, readlen_cutoff):
    """Write to bam_out every read of bam_in that passes the precursor filter."""
    for read in bam_in:
        # Full read length including soft-clipped bases (e.g. 41M -> 41).
        readlen = len(read.seq)

        # Reads at or below the cutoff are always kept.
        if readlen <= readlen_cutoff:
            bam_out.write(read)
            continue

        flag = read.flag
        if flag in (0, 256):        # forward strand (primary / secondary)
            is_forward = True
            windows = _FWD_DOWNSTREAM_WINDOWS
        elif flag in (16, 272):     # reverse strand (primary / secondary)
            is_forward = False
            windows = _REV_UPSTREAM_WINDOWS
        else:
            # NOTE(review): as in the original code, reads with any other
            # flag value are neither tested nor written, i.e. they are
            # dropped from the output - confirm this is intended.
            continue

        # Number of bases the read is short of minRlen.  Only gaps of 1-5
        # bases can match a window, so other reads are kept without a
        # genome lookup.
        gap = minRlen - readlen
        candidates = windows.get(gap)
        if candidates is None:
            bam_out.write(read)
            continue

        readchr = bam_in.getrname(read.rname)
        if is_forward:
            # `gap` bases directly after the read's last aligned base.
            readend = read.aend
            start, stop = readend + 1, readend + gap
        else:
            # `gap` bases directly before the read's first aligned base
            # (read.pos is 0-based; +1 gives the 1-based start).
            readstart = int(read.pos) + 1
            start, stop = readstart - gap, readstart - 1

        bpwindow = gf.sequence({'chr': readchr, 'start': start, 'stop': stop})
        if bpwindow in candidates:
            continue  # genome continues like the listed window: discard
        bam_out.write(read)


def remove_reads_from_precursor(inbam, outbam, gr, minRlen, readlen_cutoff):
    """Copy reads from `inbam` to `outbam`, dropping likely precursor fragments.

    A read longer than `readlen_cutoff` and 1-5 bases shorter than `minRlen`
    is discarded when the genome bases adjacent to its 3' side equal one of
    the windows in _FWD_DOWNSTREAM_WINDOWS / _REV_UPSTREAM_WINDOWS.  All other
    forward/reverse reads are written unchanged.

    Args:
        inbam: path of the input BAM file.
        outbam: path of the BAM file to create; it reuses the input header.
        gr: path of the genome FASTA handed to pyfasta's Fasta.
        minRlen: reference read length used to compute the gap to inspect.
        readlen_cutoff: reads of this length or shorter are always kept.
    """
    inbamPysamObj = pysam.Samfile(inbam, "rb")
    try:
        outbamPysamObj = pysam.Samfile(outbam, "wb", template=inbamPysamObj)
        try:
            gf = Fasta(gr)  # genome fetch object
            _copy_non_precursor_reads(inbamPysamObj, outbamPysamObj, gf,
                                      minRlen, readlen_cutoff)
        finally:
            # Always flush/close the output, even if filtering fails midway.
            outbamPysamObj.close()
    finally:
        # The original never closed the input handle.
        inbamPysamObj.close()
end_index = content.find("-----", start_index) required_each_content = content[start_index: end_index] each_motif_lines = required_each_content.strip().split("\n") for motif_line in each_motif_lines: splitted = motif_line.split() chrom1 = splitted[0].split(":")[0] sequence1 = splitted[5] strand1 = splitted[1] if strand1 == "+": #f.sequence({"chr": "chr18", "start" : (3603155 + 22 -1), "stop" : (3603155 + 22 -1) + len_3, "strand" : "+"}, one_based = False).upper() # u'TCAGGTCACCAGATAAAG' start1 = (int(splitted[0].split(":")[1]) + int(splitted[2])) - 1 end1 = start1 + len(sequence1) seq_pyfasta = f.sequence({"chr": chrom1, "start" : start1, "stop" : end1, "strand" : strand1}, one_based = False).upper() if strand1 == "-": #f.sequence({"chr": "chr2", "start" : (112383865 + 59 -1), "stop" : (112383865 + 59 -1) + len_3, "strand" : "-"}, one_based = False).upper() # u'AATCTTTGTCAGATAATC' start1 = (int(splitted[0].split(":")[1]) + int(splitted[2])) -1 end1 = start1 + len(sequence1) seq_pyfasta = f.sequence({"chr": chrom1, "start" : start1, "stop" : end1, "strand" : strand1}, one_based = False).upper() #convert unicode string to python string #str(seq_pyfasta) #seq_pyfasta.encode("ascii", "replace") #seq_pyfasta.encode("ascii", "ignore") req = [chrom1, start1, end1, strand1, sequence1, str(seq_pyfasta)] for i,item in enumerate(req):
class MutateFasta(object):
    """Build mutated or context-annotated sequences from a reference FASTA."""

    def __init__(self, fasta):
        # Use the first whitespace-delimited token of each header as the key.
        self.fasta = Fasta(fasta, key_fn=lambda key: key.split()[0])
        # self.chroms = [str(i+1) for i in range(22)] + ['X', 'Y']  # , 'MT']

    def generate_seq(self, records, offset=None):
        """Return the reference sequence with the given variants applied.

        Args:
            records: list of dicts with keys 'chrom', 'pos' (1-based), 'ref'
                and 'alt'.  NOTE(review): records are applied in the order
                given; they appear to be expected sorted by position and
                non-overlapping - confirm with callers.
            offset: optional (chrom, prev_pos, last_pos) triple; the output
                then covers positions prev_pos + 1 .. last_pos of chrom.
                Without it the whole chromosome of the first record is used.

        Returns:
            The mutated sequence, or None when both `records` and `offset`
            are empty.  Records are skipped when they lie on another
            chromosome, lack ref/alt, disagree with the reference base at
            'pos', or cannot be classified as snv/ins/del.
        """
        if not records and not offset:
            return

        chrom = offset[0] if offset else records[0]['chrom']
        prev_pos = offset[1] if offset else 0
        last_pos = offset[2] if offset else len(self.fasta[chrom])

        parts = []
        for r in records:
            # Cheap filters first, so the reference is only read for usable
            # records.
            if not r['chrom'] == chrom:
                continue
            if not (r['ref'] and r['alt']):
                continue

            ref = self.slice_fasta(r['chrom'], r['pos'], r['pos'])
            if not r['ref'][0] == ref:
                continue

            classified = self._classify_mut(r['ref'], r['alt'])
            if classified is None:
                # Equal-length multi-base substitutions are not classified;
                # skip them instead of failing on the tuple unpacking below.
                continue
            mut_type, sub_seq = classified

            if mut_type == 'snv':
                parts.append(self.slice_fasta(chrom, prev_pos + 1, r['pos'] - 1))
                parts.append(sub_seq)
                prev_pos = r['pos']
            elif mut_type == 'del':
                parts.append(self.slice_fasta(chrom, prev_pos + 1, r['pos']))
                # Resume after the deleted bases.  The original advanced
                # prev_pos relative to its old value, which re-emitted the
                # deleted bases (and more) in the output.
                prev_pos = r['pos'] + len(sub_seq)
            elif mut_type == 'ins':
                parts.append(self.slice_fasta(chrom, prev_pos + 1, r['pos']))
                parts.append(sub_seq)
                prev_pos = r['pos']

        # Remainder of the reference after the last applied variant.
        if prev_pos + 1 <= last_pos:
            parts.append(self.slice_fasta(chrom, prev_pos + 1, last_pos))

        return ''.join(parts)

    def generate_contexted_seq(self, r):
        """Split a transcript into [sequence, context] pairs.

        `r` is a refFlat-style dict ('chrom', 'txStart', 'txEnd', 'cdsStart',
        'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds').  The context label
        is one of 'utr', 'exon' or 'intron', and the pairs are returned in
        genomic order.
        """
        cons = []
        chrom = r['chrom']
        exon_count = r['exonCount']

        # TODO: support - strand genes. (currently only supports + strand genes...)
        # NOTE: refFlat is stored in 0-based coordinate, hence the "+ 1" on
        # every start passed to the 1-based slice_fasta.

        # 5'UTR + 1st Exon
        cons.append([self.slice_fasta(chrom, r['txStart'] + 1, r['cdsStart']), 'utr'])
        cons.append([self.slice_fasta(chrom, r['cdsStart'] + 1, r['exonEnds'][0]), 'exon'])
        if exon_count > 1:
            cons.append([self.slice_fasta(chrom, r['exonEnds'][0] + 1, r['exonStarts'][1]), 'intron'])

        # Exons (all but the first and the last), each followed by its intron
        for i, _ in enumerate(r['exonStarts']):
            if i == 0 or i + 1 == exon_count:
                continue
            cons.append([self.slice_fasta(chrom, r['exonStarts'][i] + 1, r['exonEnds'][i]), 'exon'])
            cons.append([self.slice_fasta(chrom, r['exonEnds'][i] + 1, r['exonStarts'][i + 1]), 'intron'])

        # last Exon + 3'UTR
        cons.append([self.slice_fasta(chrom, r['exonStarts'][exon_count - 1] + 1, r['cdsEnd']), 'exon'])
        cons.append([self.slice_fasta(chrom, r['cdsEnd'] + 1, r['txEnd']), 'utr'])

        return cons

    def slice_fasta(self, chrom, start, stop):
        """Return reference bases start..stop (1-based, inclusive) of chrom."""
        return self.fasta.sequence(
            {'chr': str(chrom), 'start': int(start), 'stop': int(stop)},
            one_based=True)

    def _classify_mut(self, ref, alt):
        """Classify a ref/alt pair and return (type, affected bases).

        Examples (ref, alt -> result):
            'A',   'G'   -> ('snv', 'G')
            'G',   'GAA' -> ('ins', 'AA')
            'TTA', 'T'   -> ('del', 'TA')

        Returns None for equal-length alleles longer than one base.  Raises
        AssertionError when an indel's ref and alt do not share their first
        base.
        """
        if len(ref) == len(alt) == 1:
            return 'snv', alt
        elif len(ref) < len(alt):
            assert ref[0] == alt[0], '{0} {1}'.format(ref, alt)
            return 'ins', alt[1:]
        elif len(ref) > len(alt):
            assert ref[0] == alt[0], '{0} {1}'.format(ref, alt)
            return 'del', ref[1:]