def test_deletion_insertion_mismatch(self):
    """ The MD and NM tags must be computed correctly for a spliced
        transcript carrying an insertion, a deletion, and a mismatch. """
    sam_path = "input_files/sams/deletion_insertion_mismatch.sam"
    reference = Fasta("input_files/hg38_chr1.fa")

    # Only the first record of the SAM file is used.
    with open(sam_path, 'r') as handle:
        fields = handle.readline().strip().split('\t')

    transcript = t2.Transcript(fields, reference, {})

    expected_md = "MD:Z:475C0A0C0C0A1082^C1347G17C205"
    expected_nm = "NM:i:9"
    assert transcript.MD == expected_md
    assert transcript.NM == expected_nm
def test_perfect_match_with_introns(self):
    """ A transcript that matches the reference perfectly and contains
        introns must get an all-match MD tag and an NM tag of zero. """
    sam_path = "input_files/sams/perfectReferenceMatch_twoIntrons.sam"
    reference = Fasta("input_files/hg38_chr1.fa")

    # Only the first record of the SAM file is used.
    with open(sam_path, 'r') as handle:
        first_record = handle.readline()
    fields = first_record.strip().split('\t')

    transcript = t2.Transcript(fields, reference, {})

    assert transcript.MD == "MD:Z:3400"
    assert transcript.NM == "NM:i:0"
def test_insertion(self):
    """ The MD and NM tags must be computed correctly for a spliced
        transcript that contains an insertion. """
    sam_path = "input_files/sams/insertion.sam"
    reference = Fasta("input_files/hg38_chr1.fa")

    # Only the first record of the SAM file is used.
    with open(sam_path, 'r') as handle:
        fields = handle.readline().strip().split('\t')

    transcript = t2.Transcript(fields, reference, {})

    expected = {"MD": "MD:Z:3069", "NM": "NM:i:2"}
    assert transcript.MD == expected["MD"]
    assert transcript.NM == expected["NM"]
def test_insertion_deletion_mismatch_ncsj(self):
    """ The MD and NM tags must be computed correctly for a transcript with
        an insertion, a deletion, a mismatch, and a noncanonical splice
        junction. """
    sam_path = "input_files/sams/deletion_insertion_mismatch_nc.sam"
    reference = Fasta("input_files/hg38_chr1.fa")

    # Only the first record of the SAM file is used.
    with open(sam_path, 'r') as handle:
        raw_record = handle.readline()
    fields = raw_record.strip().split('\t')

    transcript = t2.Transcript(fields, reference, {})

    expected_md, expected_nm = "MD:Z:414G0A450^C405^C2435", "NM:i:5"
    assert transcript.MD == expected_md
    assert transcript.NM == expected_nm
def get_seq(random_cns_list, fasta_file, new_fasta_name):
    """Write the sequence of every CNS interval to a new FASTA file.

    Args:
        random_cns_list: iterable of (accn, seqid, start, end) tuples;
            start/end are used directly as slice bounds on the sequence.
        fasta_file: path of the FASTA file the intervals refer to.
        new_fasta_name: path of the FASTA file to create.

    Records are named ``cns1``, ``cns2``, ... in input order.  Intervals whose
    sequence contains "X" and intervals shorter than 15 bases (but not empty)
    are reported on stdout; they are still written to the output.
    """
    # Open the reference once rather than once per interval.
    f = Fasta(fasta_file)
    # The context manager guarantees the output is flushed and closed.
    with open("{0}".format(new_fasta_name), "wb") as new_fasta:
        for n, (accn, seqid, start, end) in enumerate(random_cns_list, 1):
            seq = f[seqid][start:end]
            if "X" in seq:
                print("{0} {1} {2} {3}".format(accn, seqid, start, end))
            if 0 < len(seq) < 15:
                print("OH NO!!!!!!")
            new_fasta.write(">cns{0}\n".format(n))
            new_fasta.write("{0}\n".format(seq))
def ExtractSg(str_refgem, str_chr, int_beg, int_end, str_drct):
    """Yield the items produced by __ParseSeq for the requested strand(s).

    str_drct selects what is yielded: 'f' gives the '+' strand results,
    'r' the '-' strand results, and 'b' the '+' results followed by the
    '-' results.  Any other value yields nothing.
    """
    reference = Fasta(str_refgem)
    g_run = re.compile('G+')
    forward_hits = __ParseSeq(str_chr, int_beg, int_end, '+', reference, g_run)
    reverse_hits = __ParseSeq(str_chr, int_beg, int_end, '-', reference, g_run)

    if str_drct in ('f', 'b'):
        for hit in forward_hits:
            yield hit
    if str_drct in ('r', 'b'):
        for hit in reverse_hits:
            yield hit
def find_repeats(fastafile, outfile):
    """Locate every run of lowercase letters in each sequence of a FASTA file.

    Args:
        fastafile: path of the FASTA file to scan.
        outfile: path of the tab-separated file to write; one line per run
            with columns seqid, start, end, "<seqid>_<start>_<end>".

    Returns:
        A defaultdict mapping each sequence id that has at least one run to
        the list of (start, end) spans, as returned by ``match.span()``.
    """
    f = Fasta(fastafile)
    d = defaultdict(list)
    # Runs of lowercase characters only; uppercase bases (including a
    # hard-mask character such as "X") are not matched.
    lowercase_run = re.compile("[a-z]{1,}")
    # The context manager closes the output even if scanning fails midway.
    with open(outfile, "wb") as out:
        for chrm in f:
            chrm_str = f[chrm][:]
            for repeat in lowercase_run.finditer(chrm_str):
                rep = repeat.span()
                d[chrm].append(rep)
                name = "{0}_{1}_{2}".format(chrm, rep[0], rep[1])
                out.write("{0}\t{1}\t{2}\t{3}\n".format(
                    chrm, rep[0], rep[1], name))
    return d
def search(self, gene, ref, pos, alt):
    """Genotype the probe set generated for one gene mutation.

    The mutation is named "<gene>_<ref><pos><alt>".  Its probe set is
    written to a temporary FASTA file, read back, and split into reference
    probes (name contains "ref") and alternate probes (all others), which
    are then passed to ``self.genotype_alleles``.

    Returns:
        dict with keys "query" (the mutation name) and "results".
    """
    gene_mut_name = "_".join([gene, "".join([ref, str(pos), alt])])
    fasta_string = self.create_variant_probe_set(var_name=gene_mut_name)

    refs, alts = [], []
    with tempfile.NamedTemporaryFile() as fp:
        fp.write(fasta_string)
        # Rewind before the file is re-opened by name (presumably so the
        # buffered content is on disk when Fasta reads it).
        fp.seek(0)
        for probe_name, probe_seq in Fasta(fp.name).items():
            bucket = refs if "ref" in probe_name else alts
            bucket.append(str(probe_seq))

    genotype = self.genotype_alleles(refs, alts)
    return {"query": gene_mut_name, "results": genotype}
def main(gff_file, outdir,
         fasta_path='/home/gturco/src/find_cns_gturco/pipeline/data/arabidopsis.fasta'):
    """Extract features whose parent has no CDS and write them as GFF and FASTA.

    Args:
        gff_file: path of the input GFF file.
        outdir: directory that receives ``at_non_cds.gff`` and
            ``at_rnas.fasta``.
        fasta_path: genome FASTA used to fetch the sequences (defaults to the
            path that used to be hard-coded).

    Lines are grouped by the ``parent=`` value of their last column (matched
    case-insensitively, up to the first "." or ";").  Every group that
    contains a CDS line is dropped.  From the remaining groups, features on
    ChrC/ChrM, ``exon`` features, and features whose (seqid, start, end) was
    already written are skipped.  Written features get their seqid
    upper-cased with "CHR" removed and their last column replaced by the
    parent name.
    """
    name = re.compile("parent=([^.;]+)", re.I)
    feats = {}  # parent name -> True if a CDS was seen, else None
    non_cds_feats = collections.defaultdict(list)
    with open(gff_file) as gff_in:
        for line in gff_in:
            line = line.split("\t")
            match = name.search(line[-1])
            if not match:
                continue
            fname = match.group(1)
            non_cds_feats[fname].append(line)
            if line[2].upper() == 'CDS':
                feats[fname] = True
                continue
            if fname in feats:
                continue
            feats[fname] = None

    # Drop every parent for which a CDS line was seen.
    for k, v in feats.items():
        if v is not None:
            del non_cds_feats[k]

    seen = set()
    rna_path = outdir + '/at_non_cds.gff'
    with open(rna_path, 'w') as RNA:
        for k, feat_list in sorted(non_cds_feats.items()):
            for feat in feat_list:
                if feat[0] in ('ChrC', 'ChrM'):
                    continue
                if feat[2] == 'exon':
                    continue
                # The key uses the seqid as it appears in the input, i.e.
                # before normalisation.
                key = (feat[0], feat[3], feat[4])
                if key in seen:
                    continue
                feat[0] = feat[0].upper().replace('CHR', '')
                seen.add(key)
                feat[-1] = k
                RNA.write("\t".join(feat) + "\n")

    gff = read_gff(rna_path)
    fasta = Fasta(fasta_path)
    with open(outdir + '/at_rnas.fasta', 'w') as FA:
        for feature_list in gff.values():
            for feature in feature_list.values():
                seq = fasta.sequence(feature)
                # The header keeps the historical "> name" layout (with a
                # space after ">").
                FA.write("> {0}\n".format(feature['name']))
                FA.write("{0}\n".format(seq))
def intron(fa, ann):
    """Write the concatenated inter-CDS sequence of every gene to ``intron.fa``.

    Args:
        fa: path of the genome FASTA file.
        ann: path of a tab-separated annotation file (GFF-like: seqid in
            column 1, feature type in column 3, start/stop in columns 4-5,
            strand in column 7, attributes in the last column).

    CDS lines are grouped by gene, where the gene name is the ``Parent=``
    value truncated at the first "." and then at the first "_".  For each
    gene, the gaps between consecutive CDS intervals (in file order) are
    fetched with the chromosome and strand of the gene's first CDS line and
    concatenated into one record named ``<gene>-intron``.

    CDS lines without a ``Parent=`` attribute are skipped; previously they
    silently reused the attribute index of an earlier line or raised
    NameError.  Blank lines are skipped as well.
    """
    f = Fasta(fa)
    mdict = {}  # gene -> list of (start, stop) CDS intervals, file order
    ndict = {}  # gene -> [seqid, strand] taken from the gene's first CDS
    with open(ann, 'r') as fh:
        for line in fh:
            if line.startswith('#') or not line.strip():
                continue
            new = line.strip().split('\t')
            if new[2] != 'CDS':
                continue
            parents = [attr for attr in new[-1].split(';')
                       if 'Parent=' in attr]
            if not parents:
                continue
            # When several attributes mention Parent=, the last one wins.
            t = parents[-1].split('.')[0].replace('Parent=', '')
            gene = t.split('_')[0]
            if gene not in mdict:
                mdict[gene] = []
                ndict[gene] = [new[0], new[6]]
            mdict[gene].append((int(new[3]), int(new[4])))

    with open('intron.fa', 'w') as out1:
        for gene in sorted(mdict):
            cds = mdict[gene]
            chrom, strand = ndict[gene]
            pieces = []
            # Each gap runs from one CDS end + 1 to the next CDS start - 1.
            for (_, prev_stop), (next_start, _) in zip(cds, cds[1:]):
                pieces.append(f.sequence({
                    'chr': chrom,
                    'start': prev_stop + 1,
                    'stop': next_start - 1,
                    'strand': strand
                }))
            out1.write('>{0}-intron'.format(gene) + '\n')
            out1.write(''.join(pieces) + '\n')
def exclude_genes_in_high_repeat_areas(merged_genes, bfasta):
    """Yield only the genes whose sequence is at least 85% unmasked.

    Args:
        merged_genes: iterable of mappings with 'seqid', 'start' and 'end'
            keys; start/end are used directly as slice bounds.
        bfasta: path of the FASTA file the genes refer to.

    A base counts as masked when it is "N" or "X" (case-insensitive).  Genes
    with a zero-length sequence are skipped too (they used to raise
    ZeroDivisionError).  The number of skipped genes is logged once the
    input is exhausted.
    """
    f = Fasta(bfasta)
    skipped = 0
    for gene in merged_genes:
        # Total length of the interval ...
        seq = str(f[gene['seqid']][gene['start']:gene['end']])
        tot = len(seq)
        # ... and the length of the unmasked part of it.
        good = len(seq.upper().replace('N', '').replace('X', ''))
        # Skip empty intervals and anything below 85% unmasked sequence.
        if not tot or float(good) / tot < .85:
            skipped += 1
            continue
        yield gene
    log.info("removed %i (otherwise) new genes in masked areas", skipped)
def run(args):
    """Extract the flanked sequence of every BED interval as FASTA records.

    Reads ``args.genome`` (FASTA path), ``args.bedfile`` (open file whose
    non-blank lines hold chrom, start, stop, strand as the first four
    whitespace-separated fields), ``args.flank`` (bases added on both
    sides), ``args.seqname`` (record name prefix) and ``args.outfile`` (open
    file, or a false value to print to stdout).

    Records are named ``<seqname>_<n>`` with n starting at 1 and are
    separated by newlines on both output paths (stdout used to glue them
    together with no separator).
    """
    genome = Fasta(args.genome)
    bed_list = [line.strip().split()
                for line in args.bedfile.readlines() if line.strip()]

    result = []
    for number, fields in enumerate(bed_list, 1):
        seq = genome.sequence({
            'chr': fields[0],
            'start': int(fields[1]) - args.flank,
            'stop': int(fields[2]) + args.flank,
            'strand': fields[3]
        })
        # NOTE(review): upper() is applied to the whole record, so the
        # header (seqname) is upper-cased as well as the sequence. Kept
        # as-is because consumers may rely on it -- confirm intent.
        result.append('>{0}_{1}\n{2}'.format(args.seqname, number, seq).upper())

    if args.outfile:
        args.outfile.write('\n'.join(result))
    else:
        print('\n'.join(result))
def main():
    """ Select specific contigs from a FASTA file.

    Expects exactly one command-line argument, <prefix>, and reads
    <prefix>_BspQI_key.txt (tab-separated, 4 header lines: contig id and
    FASTA id in the first two columns), <prefix>_list.txt (one contig id per
    line) and <prefix>.fasta.  The selected contigs are written, in sorted
    name order, to <prefix>_new.fasta.

    Returns 0 after printing the usage text when the argument count is
    wrong; exits with status -1 when a listed contig id is unknown.
    """
    if len(sys.argv) != 2:
        # The usage text now names the file that is actually written.
        print("Usage: python select.py <prefix>; assume that <prefix>_BspQI_key.txt <prefix>.fasta and <prefix>_list.txt exist; output will be <prefix>_new.fasta")
        return 0
    prefix = sys.argv[1]

    ren = ReadTable(prefix+'_BspQI_key.txt', 4, '\t')  # 4 lines of header
    print('renaming table {0}'.format(ren))
    select = ReadTable(prefix+'_list.txt', 0)  # no header, one contig number per line
    print('select list {0}'.format(select))

    # Map the integer contig id (column 0) to its FASTA id (column 1).
    renaming = dict((int(x[0]), x[1]) for x in ren)
    print('renaming dictionary {0}'.format(renaming))

    # Collect the FASTA names of the contigs to select.
    selected_list = []
    for x in select:
        index = int(x[0])  # integer form, so it matches the renaming keys
        if index in renaming:
            selected_list.append(renaming[index])
        else:
            print('Error: contig {0} does not exist'.format(index))
            sys.exit(-1)
    print('selected_list {0}'.format(selected_list))
    wanted = set(selected_list)  # constant-time membership test below

    fas = Fasta(prefix+'.fasta')
    with open(prefix+'_new.fasta', 'w') as ofa:
        print('writing new fasta')
        for x in sorted(fas.keys()):  # process all the contigs one by one
            if x in wanted:
                print('Selecting {0}'.format(x))
                ofa.write('>'+x+'\n')
                ofa.write(fas[x][:]+'\n')  # entire contig
            else:
                print('Not selecting {0}'.format(x))
def build_gc_array(fastafile="/mnt/ref/hg38.upper.fa", gcdir="gc", n=1000):
    """Write one GC-percentage array per sequence, one value per n bases.

    For every id in ``allsomes`` that is present in the FASTA file, the
    rolling sums of G/C bases and of non-"N" bases are sampled every n
    positions, their ratio is turned into a rounded percentage, and the
    result is stored as uint8 in ``<gcdir>/<seqid>.<n>.gc``.  Ids missing
    from the FASTA file are logged at debug level and skipped.
    """
    from pyfasta import Fasta

    genome = Fasta(fastafile)
    mkdir(gcdir)
    for seqid in allsomes:
        if seqid not in genome:
            logging.debug("Seq {} not found. Continue anyway.".format(seqid))
            continue

        bases = np.array(genome[seqid])
        is_gc = (bases == 'G') | (bases == 'C')  # base is G or C
        is_real = ~(bases == 'N')  # base is not N

        # Keep the rolling sum at the end of each consecutive n-base window.
        window_gc = pd.rolling_sum(is_gc, n, min_periods=n / 2)[n - 1::n]
        window_real = pd.rolling_sum(is_real, n, min_periods=n / 2)[n - 1::n]

        gc_pct = np.asarray(np.rint(window_gc * 100 / window_real),
                            dtype=np.uint8)
        arfile = op.join(gcdir, "{}.{}.gc".format(seqid, n))
        gc_pct.tofile(arfile)
        sys.stderr.write("{} {} {}\n".format(seqid, gc_pct, arfile))
def test_two_annotated_SJs(self):
    """ A transcript with two junctions, both present in the provided
        splice annotation, must be flagged as fully annotated and
        canonical. """
    sam_path = "input_files/sams/perfectReferenceMatch_twoIntrons.sam"
    reference = Fasta("input_files/hg38_chr1.fa")

    # Only the junction dictionary of the annotation is needed here.
    _, _, sj_dict = TC.processSpliceAnnotation(
        "input_files/GM12878_SJs_chr1.tab",
        "scratch/test_jIjM/TC_tmp/",
        set(["chr1"]))

    with open(sam_path, 'r') as handle:
        fields = handle.readline().strip().split('\t')

    transcript = t2.Transcript(fields, reference, sj_dict)

    assert transcript.allJnsAnnotated == True
    assert transcript.isCanonical == True
def test_plus_strand(self):
    """ Toy plus-strand transcript with sequence AAAGA: the FASTA output
        must carry the sequence exactly as given in the SAM fields. """
    fields = [
        "test_read", "0", "chr1", "202892094", "255", "5M", "*", "0", "0",
        "AAAGA", "*", "NM:i:0", "MD:Z:5", "jI:B:i,-1", "jM:B:c,-1",
    ]
    reference = Fasta("input_files/hg38_chr1.fa")

    # No splice annotation is supplied.
    transcript = ts.Transcript(fields, reference, None)

    expected = "\n".join([">test_read", "AAAGA"])
    assert transcript.printableFa() == expected
def test_pre_correction_dmel(self):
    """ This is a noisy Drosophila read, but prior to correction, the CIGAR
        and SEQ strings should definitely match, so
        t2.check_seq_and_cigar_length must return True. """
    sam = "input_files/drosophila_example/input_read.sam"
    genome = Fasta("input_files/drosophila_example/chr3R.fa")
    sjDict = set()
    with open(sam, 'r') as f:
        for sam_line in f:
            # Lines starting with "@" (SAM header) are skipped.
            if sam_line.startswith("@"):
                continue
            else:
                # Split the alignment record into its tab-separated fields.
                sam_line = sam_line.strip().split('\t')

    # NOTE(review): the transcript is built from whatever sam_line holds
    # after the loop, i.e. the last line read -- this assumes the fixture
    # ends with its alignment record; confirm.
    transcript = t2.Transcript(sam_line, genome, sjDict)
    assert t2.check_seq_and_cigar_length(transcript.SEQ, transcript.CIGAR) == True
def build(args):
    """Print the reference base for every position listed in a table.

    Args:
        args: argument list; ``args[1]`` is the path of a tab-separated file
            whose second column is a chromosome name (translated through
            ``key_map``) and whose third column is an integer position.

    For each non-comment, non-blank line, writes
    ``<chromosome>\\t<position>\\t<reference base>`` to stdout.  The line
    ending is stripped before splitting so that a three-column row no longer
    leaks its newline into the output.
    """
    reference = Fasta(
        '/pipeline/data/b37/VEP/homo_sapiens/75/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa'
    )
    with open(args[1], 'r') as ref_file:
        for line in ref_file:
            if line.startswith('#') or not line.strip():
                continue
            fields = line.rstrip('\r\n').split('\t')
            chrom = key_map[fields[1]]
            position = int(fields[2])
            # NOTE(review): the position is used directly as a sequence
            # index; confirm the input coordinates use the indexing
            # convention the Fasta record expects.
            ref_allele = reference[chrom][position]
            sys.stdout.write("{}\n".format("\t".join(
                [fields[1], fields[2], ref_allele])))
def bed_to_introns(bed_in, fasta_in, fasta_out):
    """Write the sequence of every interval of a BED-like file as FASTA.

    Args:
        bed_in: path of a whitespace-separated file with exactly four
            columns per line: reference, start, stop, gene name.
        fasta_in: path of the genome FASTA file; records are keyed by the
            first whitespace-separated token of their header.
        fasta_out: path of the FASTA file to write.

    Each record is named ``<ref>:<start>-<stop>``.  Intervals are skipped
    with a warning when the same key was already seen, when the reference
    is not in the FASTA file, when start > stop, or when the extracted
    sequence is empty.  The missing-reference check now runs first, so
    reporting an inverted interval no longer raises KeyError for an unknown
    reference.
    """
    logging.info("Opening FASTA: {0}".format(fasta_in))
    logging.info("Note: will take a while the first time it is opened.")
    fasta = Fasta(fasta_in, key_fn=lambda key: key.split()[0])

    all_keys = set()
    output_seq = []
    count = 0
    with open(bed_in, 'r') as bed_h:
        for line in bed_h:
            ref, start, stop, genename = line.split()
            key = ref + ':' + start + '-' + stop
            if key in all_keys:
                logging.warning("ERROR: {0} appears once already".format(key))
                continue
            all_keys.add(key)
            if ref not in fasta:
                logging.warning(
                    "Skipping {0} due to missing sequence in the FASTA".format(
                        key))
                continue
            if int(start) > int(stop):
                logging.warning(
                    "Intron coords greater than reference: {0}:{1}-{2}".format(
                        ref, start, stop))
                logging.warning("Reference length: {0}".format(len(fasta[ref])))
                continue
            seq = fasta[ref][int(start):int(stop)]
            if len(seq) == 0:
                logging.warning("Intron length is 0? {0}:{1}-{2}".format(
                    ref, start, stop))
                continue
            output_seq.append(SeqRecord(Seq(seq), key, '', ''))
            count += 1
            # Report progress once per 10000 collected introns (it used to
            # repeat for every skipped line while the count stood still).
            if count % 10000 == 0:
                logging.info("On intron: {0}".format(count))

    with open(fasta_out, 'w') as outf:
        logging.info("Writing intron sequences out to {0}".format(fasta_out))
        SeqIO.write(output_seq, outf, 'fasta')
def test_minus_strand(self):
    """ Toy minus-strand transcript with SAM sequence AAAGA: the FASTA
        output must carry the reverse complement. """
    fields = [
        "test_read", "16", "chr1", "202892094", "255", "5M", "*", "0", "0",
        "AAAGA", "*", "NM:i:0", "MD:Z:5", "jI:B:i,-1", "jM:B:c,-1",
    ]
    reference = Fasta("input_files/hg38_chr1.fa")

    # No splice annotation is supplied.
    transcript = ts.Transcript(fields, reference, None)

    expected = "\n".join([">test_read", "TCTTT"])
    assert transcript.printableFa() == expected
def get_sketch(fasta, n_kmers=100, k=15):
    """Return the first ``n_kmers`` distinct k-mers of a FASTA file, sorted.

    Args:
        fasta: path of the FASTA file.
        n_kmers: maximum number of k-mers in the sketch.
        k: k-mer length.

    Returns:
        A list of at most ``n_kmers`` k-mers in ascending sort order.

    Two off-by-one errors are fixed: the last k-mer of every sequence is now
    included, and the sketch holds at most ``n_kmers`` entries (it used to
    hold ``n_kmers + 1``).
    """
    kmers = set()
    f = Fasta(fasta)
    for chrom in f.keys():
        seq = f[chrom]
        # A sequence of length L has L - k + 1 k-mers (none when L < k).
        for i in range(len(seq) - k + 1):
            kmers.add(seq[i:i + k])
    # max() keeps a negative n_kmers from slicing from the end.
    return sorted(kmers)[:max(n_kmers, 0)]
def generate_corpusfile(fasta_fname, n, corpus_fname):
    '''
    Args:
        fasta_fname: path of the input FASTA file
        n: the number of chunks to split. In other words, "n" for "n-gram"
        corpus_fname: path of the corpus file to write

    Description:
        Protvec uses word2vec inside, and it requires a corpus file to
        generate the corpus. Every record of the FASTA file is passed to
        split_ngrams, and each n-gram pattern it returns is written as one
        line of space-separated tokens.
    '''
    fasta = Fasta(fasta_fname)
    # The context manager closes the corpus file even if a record fails.
    with open(corpus_fname, "w") as f:
        for record_id in tqdm(fasta.keys(), desc='corpus generation progress'):
            seq = str(fasta[record_id])
            for ngram_pattern in split_ngrams(seq, n):
                f.write(" ".join(ngram_pattern) + "\n")
def extract_reference_allele():
    """Write the reference base at every position in ``pos`` to the file REF.

    Uses the module-level ``args.reference`` (FASTA path) and ``pos``
    (iterable of positions).  The output starts with a ``Ref`` header line,
    followed by one line per position holding the sequence returned for
    start == stop == position on the first sequence of the reference.
    """
    print("Extracting Reference Allele from Reference Fasta file - %s to REF\n" % args.reference)
    # Get reference genome ID from reference fasta file
    get_reference = Fasta(args.reference)
    # Fetch the ids once; list() keeps them indexable on Python 3 as well.
    seq_ids = list(get_reference.keys())
    if len(seq_ids) == 1:
        print("The reference genome ID from reference genome - %s" % seq_ids)
    # The context manager closes REF even if a lookup fails.
    with open("REF", 'w+') as fileObj:
        fileObj.write('Ref' + '\n')
        for item in pos:
            ref_allele = str(
                get_reference.sequence({
                    'chr': str(seq_ids[0]),
                    'start': int(item),
                    'stop': int(item)
                }))
            fileObj.write(ref_allele + '\n')
def test_not_matching(self):
    """ This is the same read as in test_pre_correction_dmel, but after
        correction with TCv2.0.1. The read got flagged by samtools after
        correction as having inconsistent CIGAR and SEQ fields, so
        t2.check_seq_and_cigar_length must return False. """
    sam = "input_files/drosophila_example/bad_correction.sam"
    genome = Fasta("input_files/drosophila_example/chr3R.fa")
    sjDict = set()
    with open(sam, 'r') as f:
        for sam_line in f:
            # Lines starting with "@" (SAM header) are skipped.
            if sam_line.startswith("@"):
                continue
            else:
                # Split the alignment record into its tab-separated fields.
                sam_line = sam_line.strip().split('\t')

    # NOTE(review): the transcript is built from whatever sam_line holds
    # after the loop, i.e. the last line read -- this assumes the fixture
    # ends with its alignment record; confirm.
    transcript = t2.Transcript(sam_line, genome, sjDict)
    assert t2.check_seq_and_cigar_length(transcript.SEQ, transcript.CIGAR) == False
def __init__(self, cellline_trn_num,path='/home/liuqiao/software/DeepCAGE/data/encode'):
    """Load the genome and the data tables and split cell lines into train/test.

    Reads genome.fa, readscount_normalized_filtered.csv,
    preprocessed/tf_gexp.csv and preprocessed/tf_motif_score.csv from
    ``path``.  The openness and TF expression tables are stored as
    log(x + 1).  With the NumPy seed fixed to 123, ``cellline_trn_num``
    indices are drawn without replacement; they select the training columns
    of the openness table and the training rows of the TF expression table.
    Everything else goes to the corresponding test lists.
    """
    self.path = path
    self.genome = Fasta('%s/genome.fa'%path)

    openness_raw = pd.read_csv('%s/readscount_normalized_filtered.csv'%path,header=0,index_col=[0],sep='\t')
    self.pd_openness = np.log(openness_raw+1)

    tf_gexp_raw = pd.read_csv('%s/preprocessed/tf_gexp.csv'%path ,sep='\t',header=0,index_col=[0])
    self.pd_tf_gexp = np.log(tf_gexp_raw+1)

    self.pd_tf_bs = pd.read_csv('%s/preprocessed/tf_motif_score.csv'%path,sep='\t',header=0,index_col=[0])

    # Fixed seed so the train/test split is reproducible.
    np.random.seed(123)
    n_celllines = self.pd_openness.shape[1]
    train_idx = np.random.choice(np.arange(n_celllines),size=cellline_trn_num,replace=False)

    dseq_names = np.array(list(self.pd_openness.columns))
    self.train_dseq_celllines = dseq_names[train_idx]
    self.test_dseq_celllines = [item for item in self.pd_openness.columns if item not in self.train_dseq_celllines]

    # The same indices are applied to the rows of the expression table.
    rseq_names = np.array(list(self.pd_tf_gexp.index))
    self.train_rseq_celllines = rseq_names[train_idx]
    self.test_rseq_celllines = [item for item in self.pd_tf_gexp.index if item not in self.train_rseq_celllines]

    # Every cell line must end up in exactly one of the two partitions.
    n_dseq = len(self.train_dseq_celllines)+len(self.test_dseq_celllines)
    n_rseq = len(self.train_rseq_celllines)+len(self.test_rseq_celllines)
    assert self.pd_openness.shape[1] == n_dseq
    assert self.pd_tf_gexp.shape[0] == n_rseq
def __init__(self):
    """Set up data paths, the hg19 genome handle, and size constants.

    All paths are derived from ``self.datapath`` except
    ``self.dna_peak_c_path``, which is stored as a bare relative name.  The
    feature directory ``<datapath>/preprocess/features`` is created when it
    does not exist, including any missing parent directory (a plain mkdir
    failed when ``preprocess`` itself was missing).
    """
    self.datapath = '../data/'
    self.dna_peak_c_path = 'dnase_peaks_conservative/'
    self.label_path = os.path.join(self.datapath, 'chipseq_labels/')
    self.benchmark_path = os.path.join(self.datapath, 'benchmark_labels/')
    self.hg19 = Fasta(os.path.join(self.datapath, 'annotations/hg19.genome.fa'))

    self.bin_size = 200
    self.correction = 400
    self.train_length = 51676736
    self.ladder_length = 8843011
    self.test_length = 60519747
    self.chunk_size = 1000000
    self.num_channels = 4
    self.num_trans_fs = len(self.get_trans_fs())
    self.num_celltypes = len(self.get_celltypes())

    self.save_dir = os.path.join(self.datapath, 'preprocess/features')
    if not os.path.exists(self.save_dir):
        os.makedirs(self.save_dir)
def write_files(original_fasta, out_dir, counts, write_bin):
    """Write per-sequence methylation data as text, summary and binary files.

    Args:
        original_fasta: path of the reference FASTA file.
        out_dir: directory that receives ``summary.txt`` (and is passed to
            ``bin_paths_from_fasta`` to derive the other output paths).
        counts: mapping seqid -> {'c': array, 't': array}.
        write_bin: when true, the c, t and methylation-type arrays of each
            sequence are also dumped with ``tofile``.

    A dated ``methyl-data-<today>.txt`` file is written next to the
    methylation-type binaries.  Both text files are now closed on exit,
    including when an error interrupts the run.
    """
    fa = Fasta(original_fasta)
    fc, ft, fmethyltype = bin_paths_from_fasta(original_fasta, out_dir)
    text_path = op.dirname(fmethyltype) + "/methyl-data-%s.txt" \
        % datetime.date.today()
    f_pat = bin_paths_from_fasta(original_fasta, out_dir, pattern_only=True)

    with open(text_path, 'w') as out, \
            open(op.join(out_dir, "summary.txt"), "w") as f_summary:
        out.write("{0}\n".format(make_header()))
        out.write("#seqid\tmt\tbp\tc\tt\n")
        sys.stderr.write("#> writing: {0} {1}\n".format(f_pat, f_summary.name))

        # One independent c/t accumulator per methylation context.
        summary_counts = dict((ctx, {'cs': 0, 'ts': 0})
                              for ctx in ('CHG', 'CHH', 'CG'))

        for i, seqid in enumerate(sorted(counts.keys())):
            seq = str(fa[seqid])
            mtype = calc_methylation(seq)

            cs = counts[seqid]['c']
            ts = counts[seqid]['t']
            # The summary header is printed with the first sequence only.
            print_summary(seqid, cs, ts, mtype, summary_counts, f_summary,
                          print_header=(i == 0))

            if write_bin:
                cs.tofile(fc % seqid)
                ts.tofile(ft % seqid)
                mtype.tofile(fmethyltype % seqid)

            to_text_file(cs, ts, mtype, seqid, out)

            # Drop the per-sequence arrays before loading the next ones.
            del cs
            del ts
            del mtype

        print_genome_summary(summary_counts, f_summary)
def test_crash_correction(self):
    """ This case is supposed to crash the NCSJ correction process, so that
        no correction is made. The mapping has created a 7-bp micro-exon
        with a canonical but likely incorrect junction on its left and a
        non-canonical junction on its right. Post-correction we would end
        up with two adjacent introns around a zero-length exon, which is
        not valid. """

    # Process references
    tmp_dir = "scratch/test/TC_tmp/"
    os.system("mkdir -p %s" % tmp_dir)

    refs = dstruct.Struct()
    donors, acceptors, sj_annot = TC.processSpliceAnnotation(
        "input_files/chr11_sjs.txt", tmp_dir, set(["chr11"]))
    refs.donors = donors
    refs.acceptors = acceptors
    refs.sjAnnot = sj_annot
    refs.genome = Fasta("input_files/hg38_chr11.fa")

    # Only the first record of the SAM file is used.
    with open("input_files/sams/microexon.sam", 'r') as handle:
        fields = handle.readline().strip().split('\t')

    # Init transcript object
    transcript = t2.Transcript(fields, refs.genome, refs.sjAnnot)
    max_dist = 5
    log_info = TC.init_log_info(fields)

    assert transcript.isCanonical == False

    # Attempt to correct the splice junction
    transcript, TE_entries = TC.cleanNoncanonical(transcript, refs,
                                                  max_dist, log_info)

    # The CIGAR must come out of the failed correction unchanged.
    expected_cigar = ("1211M5612N57M464N30M2717N120M1097N23M2632N146M1225N"
                      "140M4770N72M5051N132M1513N87M567N142M3780N100M2160N"
                      "59M864N31M9891N69M1711N7M1341N47M13S")

    assert transcript.isCanonical == False
    assert transcript.MD == "MD:Z:2473"
    assert log_info.corrected_NC_SJs == 0
    assert log_info.uncorrected_NC_SJs == 1
    assert transcript.CIGAR == expected_cigar
def parse_map(name, reads):
    """Assemble a group of reads with fermikit and store the result as BAM.

    Args:
        name: base name (without extension) of the BAM file to write.
        reads: non-empty sequence of reads exposing ``pos``, ``qname``,
            ``seq`` and ``qual``; the first and last read define the chr1
            window, widened on both sides by the module-level ``padding``.

    The window sequence and the reads are written to temporary files,
    fermikit (unitig, make, run-calling) and ``bwa index`` are run on them
    through the shell, and the alignments in ``<tmp_fq>.srt.bam`` are copied
    to the output BAM with their position shifted by the window start.

    The temporary files are left in place, as before.  Their descriptors
    returned by mkstemp are now reused for writing instead of being leaked.
    """
    f = Fasta("/hive/groups/recon/projs/gorilla_eichler/pipeline_data/assemblies/susie_3_2/human.fa")
    start, stop = reads[0].pos - padding, reads[-1].pos + padding
    seq = f['chr1'][start:stop]

    ref_fd, tmp_ref = tempfile.mkstemp()
    with os.fdopen(ref_fd, "w") as outf:
        outf.write(">chr1_{}-{}\n{}\n".format(start, stop, seq))

    fq_fd, tmp_fq = tempfile.mkstemp()
    with os.fdopen(fq_fd, "w") as outf:
        for i, read in enumerate(reads):
            # The index suffix keeps read names unique.
            outf.write("@{}\n{}\n+\n{}\n".format(read.qname + str(i), read.seq, read.qual))

    # Floor division keeps the size an integer number of kilobases on both
    # Python 2 and Python 3.
    subprocess.call("/cluster/home/ifiddes/fermikit/fermi.kit/fermi2.pl unitig -s {}k -l100 -p {} {} > {}".format((stop - start) // 1000, tmp_fq, tmp_fq, tmp_fq + ".mak"), shell=True)
    subprocess.call("make -f {}".format(tmp_fq + ".mak"), shell=True)
    subprocess.call("bwa index {}".format(tmp_ref), shell=True)
    subprocess.call("/cluster/home/ifiddes/fermikit/fermi.kit/run-calling {} {} | sh".format(tmp_ref, tmp_fq + ".mag.gz"), shell=True)

    header = {"HD": {"VN": "1.3"}, "SQ": [{"LN": 248956422, "SN": "chr1"}]}
    out_path = os.path.join("/hive/users/ifiddes/longranger-1.2.0/separated_phased_bams/fermi-assembly_NA12878", name + ".bam")
    with pysam.Samfile(out_path, "wb", header=header) as outf:
        with pysam.Samfile(tmp_fq + ".srt.bam") as inf:
            for read in inf:
                # Shift from window-relative to chr1 coordinates.
                read.pos = read.pos + start
                outf.write(read)
def updownTSS_seq(type):
    """Print the sequence around the start site of every feature of a type.

    Args:
        type: feature type passed to ``db.features_of_type``.

    Uses the module-level ``db``, ``myFasta``, ``upRange`` and ``downRange``.
    For "-" strand features the window is taken around ``p.end`` and the
    reverse complement is printed; for all other features it is taken around
    ``p.start``.  Each record is printed as a ``>id_[start-end]`` header
    followed by the sequence in lines of 60 bases.

    The window is clamped to [0, chromosome length] on both strands.
    Previously a "-" strand start could become negative and wrap the slice,
    and a "+" strand end could exceed the chromosome.
    """
    # Open the genome once rather than once per feature.
    genomefa = Fasta(myFasta)
    for p in db.features_of_type(type):
        chrlen = len(genomefa[p.chrom])
        if p.strand == '-':
            seqstart = p.end - 1 - downRange
            seqend = p.end + upRange
        else:
            seqstart = p.start - 1 - upRange
            seqend = p.start + downRange
        seqstart = max(seqstart, 0)    # avoid a negative start coordinate
        seqend = min(seqend, chrlen)   # avoid running past the chromosome end

        # Get the sequence from the coordinates (start is 0-based).
        p_updown = genomefa[p.seqid][seqstart:seqend]
        if p.strand == '-':
            p_updown = Seq(p_updown).reverse_complement()

        # Print the coordinates actually used.
        print('>' + p.id + "_[" + str(seqstart) + "-" + str(seqend) + "]")
        for i in range(0, len(p_updown), 60):  # print 60 bases per line
            print(p_updown[i:i + 60])