def main(dna_file, protein_file=None, output_h=sys.stdout):
    """Write pairwise dS/dN estimates for consecutive sequence pairs as CSV.

    Records are consumed two at a time from the protein and DNA FASTA files
    (records 1+2 form the first pair, 3+4 the second, ...). Each pair is
    aligned with ``clustal_align_protein``, back-translated with
    ``run_mrtrans`` and scored with ``find_synonymous``; one CSV row is
    written for every pair that yields a result.

    Args:
        dna_file: path of the nucleotide FASTA file.
        protein_file: path of the matching protein FASTA file; when falsy it
            is generated from ``dna_file`` with ``translate_dna``.
        output_h: writable handle that receives the CSV rows.
    """
    output_h.write("name,dS-yn,dN-yn,dS-ng,dN-ng\n")
    work_dir = op.join(os.getcwd(), "syn_analysis")
    if not op.exists(work_dir):
        os.makedirs(work_dir)
    if not protein_file:
        protein_file = translate_dna(dna_file)
    # Context managers make sure both inputs are closed, even on failure.
    with open(protein_file) as prot_handle, open(dna_file) as dna_handle:
        prot_iterator = SeqIO.parse(prot_handle, "fasta")
        dna_iterator = SeqIO.parse(dna_handle, "fasta")
        # Passing the same iterator twice makes zip hand out consecutive
        # records as one pair.
        for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in zip(prot_iterator, prot_iterator,
                                                      dna_iterator, dna_iterator):
            sys.stderr.write("-------- %s %s\n" % (p_rec_1.name, p_rec_2.name))
            align_fasta = clustal_align_protein(p_rec_1, p_rec_2, work_dir)
            mrtrans_fasta = run_mrtrans(align_fasta, n_rec_1, n_rec_2, work_dir)
            if mrtrans_fasta:
                ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = find_synonymous(
                    mrtrans_fasta, work_dir)
                if ds_subs_yn is not None:
                    pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
                    output_h.write(
                        "%s\n" % (",".join(str(x) for x in (
                            pair_name, ds_subs_yn, dn_subs_yn,
                            ds_subs_ng, dn_subs_ng)))
                    )
                    output_h.flush()
    # Cleanup of scratch files left in the current directory.
    for f in ("2YN.dN", "2YN.dS", "2YN.t", "rst", "rst1", "rub"):
        if op.exists(f):
            os.remove(f)
def standard_test_procedure(self, cline):
    """Run a Clustal Omega command line and sanity-check what it produced."""
    # Let the tool overwrite output left behind by earlier runs.
    cline.force = True

    # Register everything the run creates so it is removed afterwards.
    self.add_file_to_clean(cline.outfile)
    if cline.guidetree_out:
        self.add_file_to_clean(cline.guidetree_out)

    input_records = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta"))
    self.assertEqual(str(eval(repr(cline))), str(cline))

    output, error = cline()
    self.assertTrue(not output or output.strip().startswith("CLUSTAL"))

    # A clean run leaves stderr empty or starting with one of these warnings.
    tolerated_warnings = (
        "WARNING: Sequence type is DNA.",
        "WARNING: DNA alignment is still experimental.",
    )
    self.assertTrue(error.strip() == "" or error.startswith(tolerated_warnings))

    # The alignment must hold as many records as the input did, and reading
    # it through AlignIO and SeqIO must give the same sequences.
    align = AlignIO.read(cline.outfile, "clustal")
    output_records = SeqIO.to_dict(SeqIO.parse(cline.outfile, "clustal"))
    self.assertEqual(len(set(input_records)), len(set(output_records)))
    for aligned in align:
        self.assertEqual(str(aligned.seq), str(output_records[aligned.id].seq))

    # TODO - Try and parse this with Bio.Nexus?
    if cline.guidetree_out:
        self.assertTrue(os.path.isfile(cline.guidetree_out))
def load_examples_from_fasta(signal, org, data_path):
    """
    Load shuffled positive and negative example sequences for *signal*.

    Reads ``<data_path>/<signal>_sig_pos_example.fa`` and the matching
    ``neg`` file, labels the sequences +1 / -1, shuffles examples and labels
    together with ``helper.coshuffle`` and returns a dict with numpy arrays
    under ``"examples"`` and ``"labels"``. *org* is only used in the
    progress message.
    """
    name_template = "%s/%s_sig_%s_example.fa"
    fn_pos = name_template % (data_path, signal, "pos")
    fn_neg = name_template % (data_path, signal, "neg")
    print("loading: \n %s \n %s" % (fn_pos, fn_neg))

    # parse both files into plain strings
    positives = [str(entry.seq) for entry in SeqIO.parse(fn_pos, "fasta")]
    negatives = [str(entry.seq) for entry in SeqIO.parse(fn_neg, "fasta")]

    examples = positives + negatives
    labels = [+1] * len(positives) + [-1] * len(negatives)

    print(
        "organism: %s, signal %s,\t num_labels: %i,\t num_examples %i,\t num_positives: %i,\t num_negatives: %i"
        % (org, signal, len(labels), len(examples), len(positives), len(negatives))
    )

    shuffled_examples, shuffled_labels = helper.coshuffle(examples, labels)
    return {
        "examples": numpy.array(shuffled_examples),
        "labels": numpy.array(shuffled_labels),
    }
def main(args):
    """Load sequences into a BioSQL sub-database and optionally retag them.

    Opens the database described by ``args``, creates the
    ``args.database_name`` sub-database when it is missing, and loads either
    a GFF+FASTA pair or a GenBank file. Any failure during loading rolls the
    transaction back and re-raises. When ``args.new_taxons`` is set, a new
    taxonomy entry is created and every record of the input file has its
    ``bioentry.taxon_id`` pointed at it. The transaction is committed at the
    end.

    Args:
        args: parsed command-line namespace providing the connection
            settings plus ``gff``, ``fasta``, ``genbank``, ``tax_lookup``,
            ``taxid`` and ``new_taxons``.
    """
    server = BioSeqDatabase.open_database(driver=args.driver, db=args.database,
                                          user=args.user, host=args.host,
                                          passwd=args.password)
    if args.database_name not in server.keys():
        server.new_database(args.database_name)
    db = server[args.database_name]

    try:
        if args.gff is not None and args.fasta is not None:
            load_gff(db, args.gff, args.fasta, args.tax_lookup, args.taxid)
        elif args.genbank is not None:
            load_genbank(db, args.genbank, args.tax_lookup, args.taxid)
    except BaseException:
        # Roll back on every kind of interruption, then let it propagate.
        server.adaptor.rollback()
        raise

    if args.new_taxons:
        taxon_id = add_new_taxonomy(server, args.new_taxons, args.taxid)
        # Default to "nothing to retag" so a missing input file cannot leave
        # the record source unbound.
        gen = []
        if args.fasta is not None:
            gen = SeqIO.parse(args.fasta, 'fasta')
        elif args.genbank is not None:
            gen = SeqIO.parse(args.genbank, 'genbank')
        for rec in gen:
            server.adaptor.execute(
                'update bioentry set taxon_id = %s where bioentry_id = %s',
                (taxon_id, db.adaptor.fetch_seqid_by_display_id(db.dbid, rec.name)))
    server.commit()
def loop(self, filename, format):
    """Round-trip the records of *filename* through BioSQL and GenBank text.

    The records are loaded into a new ``test_loop_<filename>`` namespace,
    looked up again by name, written out in GenBank format, re-parsed, and
    compared with the originals after each step.

    Args:
        filename: path of the sequence file to load.
        format: SeqIO format name of that file.
    """
    # Close the input as soon as it has been parsed.
    with open(filename, "rU") as in_handle:
        original_records = list(SeqIO.parse(in_handle, format))
    # now open a connection to load the database
    server = BioSeqDatabase.open_database(driver=DBDRIVER, user=DBUSER,
                                          passwd=DBPASSWD, host=DBHOST,
                                          db=TESTDB)
    try:
        db_name = "test_loop_%s" % filename  # new namespace!
        db = server.new_database(db_name)
        count = db.load(original_records)
        self.assertEqual(count, len(original_records))
        server.commit()
        # Now read them back...
        biosql_records = [db.lookup(name=rec.name) for rec in original_records]
        # And check they agree
        self.assertTrue(compare_records(original_records, biosql_records))
        # Now write to a handle...
        out_handle = StringIO()
        SeqIO.write(biosql_records, out_handle, "gb")
        # Now read them back...
        out_handle.seek(0)
        new_records = list(SeqIO.parse(out_handle, "gb"))
        # And check they still agree
        self.assertEqual(len(new_records), len(original_records))
        for old, new in zip(original_records, new_records):
            # TODO - remove this hack because we don't yet write these (yet):
            for key in ["comment", "references", "db_source"]:
                if key in old.annotations and key not in new.annotations:
                    del old.annotations[key]
            self.assertTrue(compare_record(old, new))
    finally:
        # Release the connection even when one of the checks above fails.
        server.close()
def setUp(self):
    """Build one codon alignment per configured test input.

    Every entry of ``self.aln_file`` is indexed as ``entry[0]`` (file paths:
    nucleotide FASTA, protein CLUSTAL alignment and, for mode ``'id'``, an
    id-correspondence file) and ``entry[1]`` (the mode: ``'parse'``,
    ``'index'`` or ``'id'``). The resulting alignments are stored in
    ``self.alns``.
    """
    self.aln_file = [TEST_ALIGN_FILE1, TEST_ALIGN_FILE2, TEST_ALIGN_FILE3,
                     TEST_ALIGN_FILE4, TEST_ALIGN_FILE5, TEST_ALIGN_FILE6]
    built = []
    for entry in self.aln_file:
        mode = entry[1]
        if mode == 'parse':
            paths = entry[0]
            nucl = SeqIO.parse(paths[0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl, alphabet=codonalign.default_codon_alphabet)
        elif mode == 'index':
            paths = entry[0]
            nucl = SeqIO.index(paths[0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl, alphabet=codonalign.default_codon_alphabet,
                    max_score=20)
        elif mode == 'id':
            paths = entry[0]
            nucl = SeqIO.parse(paths[0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
            # Each line maps its first whitespace-separated field to its second.
            with open(paths[2]) as handle:
                correspondence = dict(
                    (row.split()[0], row.split()[1]) for row in handle)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl, corr_dict=correspondence,
                    alphabet=codonalign.default_codon_alphabet)
        built.append(caln)
        nucl.close()  # Close the indexed FASTA file
    self.alns = built
def run_pal2nal(fname_aln, fname_nuc, fname_prot):
    """
    Generate a codon alignment via PAL2NAL.

    The nucleotide records are first rewritten to
    ``homologs_ordered.dna.fasta`` in the order in which the proteins appear
    in ``fname_aln``; the i-th nucleotide record is assumed to belong to the
    i-th protein record. ``pal2nal.pl`` is then run with ``-output paml`` and
    its standard output is stored in ``homologs.codon.aln``.

    @param fname_aln: MSA of protein sequences in CLUSTAL format (.aln)
    @param fname_nuc: Nucleotide sequences in FASTA format (.fasta)
    @param fname_prot: Protein sequences in FASTA format (.fasta)
    @return: Name of the codon alignment file written by PAL2NAL
    """
    # Local import: only this function needs subprocess.
    import subprocess

    sys.stderr.write("\nSTEP: run_pal2nal(%s, %s)\n" % (fname_aln, fname_nuc))

    # Reorder fname_nuc according to the order of the proteins in fname_aln,
    # which was reordered due to CLUSTALW2. Note that the first protein in
    # each of these files remains the same as at the start, however; this
    # first protein is our original query protein.
    nuc_records = list(SeqIO.parse(fname_nuc, "fasta"))
    prot_records = list(SeqIO.parse(fname_prot, "fasta"))
    records_map = dict((pr.id, nr) for pr, nr in zip(prot_records, nuc_records))

    fname_nuc2 = "homologs_ordered.dna.fasta"
    with open(fname_nuc2, "w") as f:
        for record in SeqIO.parse(fname_aln, "clustal"):
            SeqIO.write(records_map[record.id], f, "fasta")

    fname_codon = "homologs.codon.aln"
    command = ["%s/pal2nal.pl" % bin_dir(), fname_aln, fname_nuc2,
               "-output", "paml"]
    # An argument list avoids shell interpretation of the file names. As with
    # the shell, a failing tool is reported on stderr and the (possibly
    # empty) output file is still returned.
    with open(fname_codon, "w") as codon_handle:
        try:
            status = subprocess.call(command, stdout=codon_handle)
        except OSError as err:
            sys.stderr.write("pal2nal.pl could not be started: %s\n" % err)
        else:
            if status != 0:
                sys.stderr.write("pal2nal.pl exited with status %s\n" % status)
    return fname_codon
def main_build_markov(promotor_filename = "promotor.fa", genome_filename = "genom.fa", symbol_length = 2, load_cached = False, save_cache = True):
    '''Build a Markov model from promoter sequences and a genome.

    Args:
        promotor_filename: FASTA file with the promoter sequences.
        genome_filename: FASTA file with the genome.
        symbol_length: length of the symbols whose frequencies feed the model.
        load_cached: when true, symbol frequencies are loaded from the dump
            files instead of being recomputed.
        save_cache: when frequencies are recomputed, also dump them.

    Returns:
        The ``(markov, states)`` pair produced by ``build_markov``.
    '''
    # Honour the filename parameters; their defaults are the historic paths.
    promotor_sequences = list(SeqIO.parse(promotor_filename, "fasta"))
    genome = list(SeqIO.parse(genome_filename, "fasta"))

    if not load_cached:
        promotor_freqs = calc_symbol_freq(promotor_sequences)
        genome_freqs = calc_symbol_freq(genome)
        if save_cache:
            dump_obj(promotor_freqs, Dumpfiles.promotor_freq)
            dump_obj(genome_freqs, Dumpfiles.genome_freq)
    else:
        promotor_freqs = load_obj(Dumpfiles.promotor_freq)
        genome_freqs = load_obj(Dumpfiles.genome_freq)

    promotor_counts = calc_counts(promotor_sequences)
    genome_counts = calc_counts(genome)
    print(promotor_counts)

    promotor_freqs = fold_and_normalize(promotor_freqs[symbol_length],
                                        symbol_length,
                                        promotor_counts[symbol_length])
    genome_freqs = fold_and_normalize(genome_freqs[symbol_length],
                                      symbol_length,
                                      genome_counts[symbol_length])

    # Both tables must cover exactly the same symbols, checked both ways.
    for k in promotor_freqs:
        assert k in genome_freqs
    for k in genome_freqs:
        assert k in promotor_freqs
    print(promotor_freqs)

    (markov, states) = build_markov(genome_freqs, promotor_freqs)
    return (markov, states)
def cluster_pid(folder):
    """Summarise pairwise identity inside each gene cluster of *folder*.

    Reads ``<folder>/report/<name>_genes.csv`` (``<name>`` is the last path
    component of *folder*), keeps the rows that pass the cluster and species
    filter, and for every remaining cluster globally aligns all ordered pairs
    of its ``prot_acc`` sequences (self-pairs included). Sequences are taken
    from ``../cgpf_ncbi/all_seqs.fa``, matched on the third ``|``-separated
    field of the record name.

    Returns:
        A list of ``(cluster, n_pairs, mean_identity, std_identity)`` tuples,
        or ``None`` when a required file cannot be read (``OSError``).

    Raises:
        IndexError: if an accession has no sequence in the FASTA file.
    """
    result = []
    f_name = folder.split("/")[-1]
    try:
        genes = pd.read_csv(folder + "/report/" + f_name + "_genes.csv")
        # NOTE(review): the species literal looks mangled ('H**o sapiens');
        # confirm against the CSV before changing it.
        genes = genes.loc[~(genes['cluster'].isin(['na', '0', 0])) & (genes['species'] == 'H**o sapiens')]
        if genes.shape[0] > 0:
            # Read the FASTA file once and keep only the needed sequences,
            # instead of re-parsing it twice for every pair.
            wanted = set(genes['prot_acc'])
            seqs_by_acc = {}
            for rec in SeqIO.parse("../cgpf_ncbi/all_seqs.fa", 'fasta'):
                acc = rec.name.split("|")[2]
                if acc in wanted:
                    seqs_by_acc.setdefault(acc, []).append(rec.seq)

            for cluster in set(genes['cluster']):
                pids = []
                accs = genes.loc[genes['cluster'] == cluster, 'prot_acc'].values
                for seq1 in accs:
                    for seq2 in accs:
                        # First match wins; a missing accession is an IndexError.
                        first = seqs_by_acc.get(seq1, [])[0]
                        second = seqs_by_acc.get(seq2, [])[0]
                        aln = pairwise2.align.globalxx(first, second)[0]
                        mean_len = (len(aln[0]) + len(aln[1])) / 2
                        pids.append(aln[2] / mean_len)
                # The count is the number of aligned pairs.
                n_genes = len(pids)
                mean_pid = np.mean(pids)
                sd_id = np.std(pids)
                # list.append takes a single item, so store one tuple.
                result.append((cluster, n_genes, mean_pid, sd_id))
                print(cluster)
        return result
    except OSError:
        return None
def count_overlap(filename):
    """Print every ordered pair of record ids whose sequences overlap.

    Two records overlap when their sequences differ and the last three
    residues of the first equal the first three residues of the second.

    Args:
        filename: FASTA file to read.
    """
    # Parse once; re-reading the file for every record is quadratic in I/O.
    records = list(SeqIO.parse(filename, "fasta"))
    for seq_record in records:
        s1 = seq_record.seq
        for seq_record_1 in records:
            s2 = seq_record_1.seq
            if s1 != s2 and s1[-3:] == s2[0:3]:
                print(seq_record.id + " " + seq_record_1.id)
def _validate_fasta(self, text):
    """Return *text* if it parses as FASTA with at least one record.

    Raises:
        argparse.ArgumentTypeError: if the parser yields no record.
    """
    try:
        # The next() builtin works on Python 2 and 3 alike.
        next(SeqIO.parse(text, 'fasta'))
    except StopIteration:
        raise argparse.ArgumentTypeError(
            "{0} is not fasta file".format(text))
    return text
def filter_reads_by_length(fq1, fq2, quality_format, min_length=20):
    """
    Remove reads shorter than *min_length* from a pair of fastq files.

    Pairs in which both mates are at least *min_length* long are written, in
    input order, to the ``.fixed`` files. When only one mate is long enough
    it goes to that end's ``.singles`` file and the short mate is dropped.

    Args:
        fq1: first-end fastq file.
        fq2: second-end fastq file.
        quality_format: SeqIO format name used to read and write the records.
        min_length: minimum read length to keep.

    Returns:
        ``[fq1_out, fq2_out]``, the names of the two ``.fixed`` files.
    """
    logger.info("Removing reads in %s and %s that "
                "are less than %d bases." % (fq1, fq2, min_length))
    fq1_out = utils.append_stem(fq1, ".fixed")
    fq2_out = utils.append_stem(fq2, ".fixed")
    fq1_single = utils.append_stem(fq1, ".singles")
    fq2_single = utils.append_stem(fq2, ".singles")
    # All four outputs must exist before an earlier result is reused.
    if all(map(utils.file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]

    fq1_in = SeqIO.parse(fq1, quality_format)
    fq2_in = SeqIO.parse(fq2, quality_format)

    with open(fq1_out, 'w') as fq1_out_handle, \
            open(fq2_out, 'w') as fq2_out_handle, \
            open(fq1_single, 'w') as fq1_single_handle, \
            open(fq2_single, 'w') as fq2_single_handle:
        for fq1_record, fq2_record in izip(fq1_in, fq2_in):
            # The same threshold applies to pairs and singles, so a read of
            # exactly min_length is never lost.
            fq1_ok = len(fq1_record.seq) >= min_length
            fq2_ok = len(fq2_record.seq) >= min_length
            if fq1_ok and fq2_ok:
                fq1_out_handle.write(fq1_record.format(quality_format))
                fq2_out_handle.write(fq2_record.format(quality_format))
            elif fq1_ok:
                fq1_single_handle.write(fq1_record.format(quality_format))
            elif fq2_ok:
                fq2_single_handle.write(fq2_record.format(quality_format))

    return [fq1_out, fq2_out]
def main(args):
    """Set or clear the taxon of selected bioentries in a BioSQL database.

    The display names to update come from a FASTA file, a GenBank file, or a
    plain text file with one name per line (checked in that order). With
    ``args.remove`` the taxon is set to NULL; otherwise a taxonomy entry is
    created through ``add_new_taxonomy`` and assigned. The transaction is
    committed at the end.
    """
    server = BioSeqDatabase.open_database(driver=args.driver,
                                          db=args.database,
                                          user=args.user,
                                          host=args.host,
                                          passwd=args.password)
    if args.database_name not in server.keys():
        server.new_database(args.database_name)
    db = server[args.database_name]

    # Collect the record names from whichever input was supplied.
    if args.fasta is not None:
        names = [entry.name for entry in SeqIO.parse(args.fasta, 'fasta')]
    elif args.genbank is not None:
        names = [entry.name for entry in SeqIO.parse(args.genbank, 'genbank')]
    elif args.input is not None:
        with open(args.input) as fp:
            names = [line.rstrip() for line in fp]
    else:
        names = []

    taxon_id = None if args.remove else add_new_taxonomy(
        server, args.new_taxons, args.taxid)

    for display_name in names:
        bioentry_id = db.adaptor.fetch_seqid_by_display_id(db.dbid, display_name)
        server.adaptor.execute('update bioentry set taxon_id = %s where bioentry_id = %s',(taxon_id, bioentry_id))
    server.commit()
def _set_seqFormat(self, inFile, nlines=100):
    """Determine the format of the sequence file.

    The first *nlines* lines are parsed both as fasta and as fastq; the
    format that yields more records wins.

    Args:
    inFile -- file name
    nlines -- number of lines in file to check (starting from top)

    Attrib set (via set_readFileFormat):
    readFileFormat -- set to 'fasta' or 'fastq' or None

    Raises:
    IOError -- if both formats yield the same non-zero number of records
    """
    with open(inFile, 'r') as fd:
        head = ''.join([fd.readline() for _ in range(nlines)])

    def _count_records(fmt):
        # Count the records that parse cleanly. A parser that rejects the
        # text, or trips over a record cut off at the end of the head,
        # raises ValueError; what was read up to that point still counts.
        n_records = 0
        try:
            for _ in SeqIO.parse(StringIO(head), fmt):
                n_records += 1
        except ValueError:
            pass
        return n_records

    # format?
    nseqs_fasta = _count_records('fasta')
    nseqs_fastq = _count_records('fastq')

    if nseqs_fasta > 0 and nseqs_fastq > 0:
        if nseqs_fasta > nseqs_fastq:
            self.set_readFileFormat('fasta')
        elif nseqs_fasta < nseqs_fastq:
            self.set_readFileFormat('fastq')
        else:
            raise IOError(' The file appears to be both fasta and fastq\n')
    elif nseqs_fasta > 0:
        self.set_readFileFormat('fasta')
    elif nseqs_fastq > 0:
        self.set_readFileFormat('fastq')
    else:
        self.set_readFileFormat(None)
def standard_test_procedure(self, cline):
    """Run a ClustalW command line and verify its alignment and tree files."""
    self.assertEqual(str(eval(repr(cline))), str(cline))

    def output_style_id(rec):
        # Input ids are keyed with ':' replaced by '_' so they can be
        # compared with the ids read back from the output file.
        return rec.id.replace(":", "_")

    input_records = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta"),
                                  output_style_id)

    # Determine name of tree file; without an explicit name Clustalw bases
    # it on the input file.
    tree_file = cline.newtree or (os.path.splitext(cline.infile)[0] + ".dnd")

    # Mark generated files for later removal
    for generated in (cline.outfile, tree_file):
        self.add_file_to_clean(generated)

    output, error = cline()
    self.assertTrue(output.strip().startswith("CLUSTAL"))
    self.assertEqual(error.strip(), "")

    # Check the output...
    align = AlignIO.read(cline.outfile, "clustal")
    # The length of the alignment will depend on the version of clustalw
    # (clustalw 2.1 and clustalw 1.83 are certainly different).
    output_records = SeqIO.to_dict(SeqIO.parse(cline.outfile, "clustal"))
    self.assertEqual(set(input_records.keys()), set(output_records.keys()))
    for aligned in align:
        aligned_text = str(aligned.seq)
        self.assertEqual(aligned_text, str(output_records[aligned.id].seq))
        # Removing the gaps must give back the input sequence.
        self.assertEqual(aligned_text.replace("-", ""),
                         str(input_records[aligned.id].seq))

    # Check the DND file was created.
    # TODO - Try and parse this with Bio.Nexus?
    self.assertTrue(os.path.isfile(tree_file))
#We are going to normalise this sequence with the sklearn preprocessing algorithm to see what happens. queryarray = sklearn.preprocessing.scale(np.array(squiggle),axis=0,with_mean=True,with_std=True,copy=True) dist, cost, path = mlpy.dtw_subsequence(queryarray,kmerhash2[id][ref]['Fprime']) result.append((dist,id,"F",path[1][0],ref)) dist, cost, path = mlpy.dtw_subsequence(queryarray,kmerhash2[id][ref]['Rprime']) result.append((dist,id,"R",path[1][0],ref)) return sorted(result,key=lambda result: result[0])[0][1],sorted(result,key=lambda result: result[0])[0][0],sorted(result,key=lambda result: result[0])[0][2],sorted(result,key=lambda result: result[0])[0][3],sorted(result,key=lambda result: result[0])[0][4] ###################################################################### ####################################################### # Retrieve a model from the database rather than the # # expected data # ####################################################### def retrieve_model(): model_kmers = dict() db = MySQLdb.connect(host=dbhost, user=dbusername, passwd=dbpass, port=dbport) cursor = db.cursor() sql = "SELECT * FROM minion_LomanLabz_013731_11rx_v2_3135.model_data where model like '%template%'" cursor.execute(sql) kmerresults = cursor.fetchall() for line in kmerresults: kmer = line[2] mean = line[4] #print kmer,mean model_kmers[kmer]=mean return model_kmers
def extract(arguments):
    """
    Extract a reference alignment from a reference package

    Without ``arguments.use_mask`` the Stockholm alignment is simply
    converted to ``arguments.output_format``. Otherwise the package's mask
    is applied to every record when the package has a ``mask`` resource; if
    it has none (``KeyError``), all columns are written.
    """
    refpkg = arguments.refpkg

    # If not masking, just copy the sequences, reformatting if appropriate
    if not arguments.use_mask:
        with refpkg.open_resource('aln_sto') as input_fp:
            with arguments.output_file as output_fp:
                result = SeqIO.convert(input_fp, 'stockholm', output_fp,
                                       arguments.output_format)
        logging.info("Wrote %d sequences", result)
        return

    # Mask will be applied if available
    with refpkg.open_resource('aln_sto') as fp:
        # The first record gives the alignment length.
        alignment_length = len(next(SeqIO.parse(fp, 'stockholm')))

        # Rewind
        fp.seek(0)
        # Lazy parser: records are read from fp while they are written
        # below, so the writing has to happen before fp is closed.
        sequences = SeqIO.parse(fp, 'stockholm')

        try:
            # A separate name keeps the alignment handle from being rebound.
            with refpkg.open_resource('mask') as mask_fp:
                mask = AlignmentMask.from_csv_file(mask_fp, alignment_length)
            logging.info("Applying mask - keeping %d/%d positions",
                         mask.unmasked_count, len(mask))
            sequences = mask.mask_records(sequences)
        except KeyError:
            # Same module-level logger as the rest of this function.
            logging.warning("No mask found. Extracting all columns.")

        with arguments.output_file as output_fp:
            result = SeqIO.write(sequences, output_fp, arguments.output_format)
    logging.info("Wrote %d sequences.", result)
def extract_seq_from_file(seq_file, coords_file, output_file):
    """Cut the regions listed in *coords_file* out of a FASTA reference.

    Each line of *coords_file* is split on whitespace into
    ``name start end strand``; ``start``/``end`` are 1-based inclusive and a
    ``-`` strand yields the reverse complement. Lines with fewer than three
    fields are skipped with a warning. Extracted regions are written to
    *output_file* as FASTA records with the id ``name:start..end:strand``.

    Args:
        seq_file: FASTA file with the reference sequences.
        coords_file: whitespace-separated region list.
        output_file: path of the FASTA file to write.
    """
    # Names of the reference sequences (a set gives O(1) membership tests).
    chrs = set(seq_record.id for seq_record in SeqIO.parse(seq_file, 'fasta'))
    # Segments grouped by reference sequence name.
    chr_seg = {}
    # Number of coordinate lines read.
    cnt = 0

    with open(coords_file, 'r') as f:
        for line in f:
            cnt += 1
            regions = re.split(r'\s+', line.strip('\n'))
            if regions[0] not in chrs:
                # Only a warning: the segment is still stored and is never
                # matched during extraction.
                log.warning('{0} not in reference sequence'.format(regions[0]))
            if len(regions) < 3:
                log.warning('The numbers of this line are less than 3(required)')
                continue
            chr_seg.setdefault(regions[0], []).append(regions)
    log.info('Summary: {0} chromosomes, {1} segments processed'.format(len(chr_seg), cnt))

    # The context manager closes the output even if writing fails.
    with open(output_file, 'w') as res_file_handle:
        # Walk through the reference sequences.
        for seq_record in SeqIO.parse(seq_file, 'fasta'):
            if seq_record.id in chr_seg:
                for seg in chr_seg[seq_record.id]:
                    try:
                        # Build the SeqRecord for this segment.
                        tmp_seq = SeqRecord.SeqRecord(
                            seq=(seq_record.seq)[(int(seg[1])-1):int(seg[2])],
                            id='{0}:{1}..{2}:{3}'.format(seg[0], seg[1], seg[2], seg[3]))
                        # Reverse-complement segments on the minus strand.
                        if seg[3] == '-':
                            tmp_seq = tmp_seq.reverse_complement(
                                id=True, name=True,
                                description='reverse_complement')
                        SeqIO.write(tmp_seq, res_file_handle, 'fasta')
                    except Exception as e:
                        # Best effort: a bad segment (for example one with no
                        # strand column) is logged and skipped.
                        log.error(e)
            else:
                # NOTE(review): emitted for reference records that have no
                # requested segment; the wording suggests the opposite.
                log.warning(seq_record.id + ' not exists in reference sequences')
def stitch_scaffolds(fa,outFile,len_limit=200000000,dist=500):
    """
    This function merges multiple scaffolds together to form longer sequences.

    Scaffolds are concatenated in file order, separated by *dist* ``N``
    characters, until the running length reaches *len_limit*; the merged
    sequence is then written as ``chr<n>`` and a new one is started. After
    writing, the output is read back and the length of every merged
    sequence is printed.

    * fa: str. Reference fa file name
    * outFile: str. Filename output to the file.
    * len_limit: int. Maximum length of each merged scaffold.
    * dist: int. Distance between each scaffold.
    """
    spacer = 'N' * dist
    parts = []     # pieces of the sequence being assembled
    length = 0     # running length of those pieces
    n = 1

    def _write(handle, index, pieces):
        # Emit one merged sequence as record chr<index>.
        item = SeqRecord(Seq(''.join(pieces)), id='chr' + str(index), description="")
        SeqIO.write(item, handle, 'fasta')

    # Both files are closed, and the output flushed, before it is re-read.
    with open(fa, 'r') as in_handle, open(outFile, 'w') as out_handle:
        for record in SeqIO.parse(in_handle, 'fasta'):
            scaffold = str(record.seq)
            parts.append(scaffold)
            length += len(scaffold)
            if length >= len_limit:
                _write(out_handle, n, parts)
                parts = []
                length = 0
                n += 1
            else:
                parts.append(spacer)
                length += len(spacer)
        if parts:
            # output the last one, without its trailing spacer
            parts.pop()
            _write(out_handle, n, parts)

    with open(outFile) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            print(len(record.seq))
def Pairfold_Execute_Parallel(Input_sRNA_File,Input_Target_File):
    # Run the external `pairfold` program on sRNA/target pairs, keeping up to
    # args.cpu processes alive at once, and print one tab-separated line per
    # finished job: sRNA description, target description, and the last field
    # of pairfold's "MFE" output line.
    #
    # NOTE(review): Target_FASTA is consumed with pop(0) and never refilled,
    # so it looks like only the first sRNA is paired with targets -- confirm
    # whether that is intended.
    # NOTE(review): sequences are interpolated into a shell command line;
    # this assumes the FASTA input is trusted.
    sRNA_FASTA=list(SeqIO.parse(Input_sRNA_File,"fasta"))
    Target_FASTA=list(SeqIO.parse(Input_Target_File,"fasta"))
    for sRNA in sRNA_FASTA:
        RNAjobList=[]
        #for RNA in Target_FASTA:
        while True:
            try:
                if len(RNAjobList) < args.cpu:
                    # A free slot: start the next target. pop(0) raises
                    # IndexError once no targets are left, which hands
                    # control to the except branch below.
                    RNA=Target_FASTA.pop(0)
                    shell_command="""/home/suu13/misc_stuff/MultiRNAFold-2.0/pairfold "%s" "%s" -m RNA | grep MFE | awk '{print $NF}' """ % (str(sRNA.seq),str(RNA.seq)) #enter pairfold actual path
                    job_id=subprocess.Popen(shell_command,shell=True,stdout=subprocess.PIPE)
                    RNAjobList.append([str(sRNA.description),str(RNA.description),job_id])
                else:
                    time.sleep(0.1) #wait for 0.1s to check status of jobs
                    # NOTE(review): the list is modified while it is being
                    # iterated, so a job may be skipped until the next pass.
                    for job in RNAjobList:
                        if subprocess.Popen.poll(job[2])!=None: #check the status of job object
                            print "%s\t%s\t%s" % (job[0],job[1],job[2].communicate()[0].strip())
                            RNAjobList.remove(job)
            except:
                # Reached on any exception (normally the IndexError from the
                # empty target list): drain the running jobs, then leave the
                # while loop.
                while len(RNAjobList) != 0:
                    time.sleep(0.1) #wait for 0.1s to check status of jobs
                    for job in RNAjobList:
                        if subprocess.Popen.poll(job[2])!=None: #check the status of job object
                            print "%s\t%s\t%s" % (job[0],job[1],job[2].communicate()[0].strip())
                            RNAjobList.remove(job)
                break
    return
def load_multi_database(gb_filename_or_handle, gb_filename_or_handle2):
    """Load two GenBank inputs into separate sub-databases of a new BioSQL DB.

    The first input goes into ``biosql-test`` and the second into
    ``biosql-test2``. Handy for tests that need a freshly created database.

    Returns the total number of records loaded from both inputs.
    """
    TESTDB = create_database()
    # now open a connection to load the database
    server = BioSeqDatabase.open_database(driver=DBDRIVER, user=DBUSER,
                                          passwd=DBPASSWD, host=DBHOST,
                                          db=TESTDB)
    targets = (
        ("biosql-test", gb_filename_or_handle),
        ("biosql-test2", gb_filename_or_handle2),
    )
    loaded = []
    for sub_name, source in targets:
        sub_db = server.new_database(sub_name)
        # parse the GenBank input and put its records into the sub-database
        loaded.append(sub_db.load(SeqIO.parse(source, "gb")))
    server.commit()
    server.close()
    return loaded[0] + loaded[1]
def check_match(input, reference, output):
    """Append translated query ORFs that match a reference to ``<output>.fna``.

    Every query sequence is translated with ``trans``. For its first ORF of
    at least 140 residues that aligns to some reference with a local score
    equal to the reference length, one FASTA entry (query description plus
    ORF length, and the ORF itself) is collected. All collected entries are
    appended to ``<output>.fna``.

    Args:
        input: FASTA file with the query sequences.
        reference: FASTA file with the reference protein sequences.
        output: output path without the ``.fna`` extension.
    """
    # Read the references once instead of re-parsing them for every ORF.
    references = list(SeqIO.parse(reference, "fasta"))
    fasta_list = []
    for in_record in SeqIO.parse(input, "fasta"):
        # translate to peptide seq
        orf = trans(str(in_record.seq))
        written = False
        for aa_seq in orf:
            # 140 is the shortest length of a ref seq
            if not written and len(aa_seq) >= 140:
                for ref_record in references:
                    # pairwise alignment of input seq and each ref until a
                    # match is found: 1 point for match, -1 for mismatch,
                    # -.5 for gap, -.1 for gap extension. The scoring can be
                    # altered for looser alignments.
                    align = pairwise2.align.localms(aa_seq, ref_record.seq,
                                                    1, -1, -.5, -.1,
                                                    score_only=True)
                    # a score equal to the ref length means 100% alignment
                    if align == len(ref_record.seq):
                        fasta_list.append('>%s\n%s\n' % (in_record.description + " len:" + str(len(aa_seq)), aa_seq))
                        written = True
                        break
    # write query descriptions and seqs that match ref; the context manager
    # closes the file
    with open(output + ".fna", 'a') as file:
        file.write('\n'.join(fasta_list))
def fastaSubtract(fastaFiles):
    """
    Given a list of FASTA inputs, remove the reads found in the 2nd, 3rd,
    etc. inputs from the reads of the first input.

    The passed list is not modified.

    @param fastaFiles: a C{list} of FASTA inputs, each something
        C{Bio.SeqIO.parse} accepts.
    @raises IndexError: if passed an empty list.
    @return: An iterator producing C{Bio.SeqRecord} instances suitable for
        writing to a file using C{Bio.SeqIO.write}.
    """
    # Indexing (rather than pop) leaves the caller's list intact and still
    # raises IndexError on an empty list.
    firstFile = fastaFiles[0]
    reads = {}
    for seq in SeqIO.parse(firstFile, 'fasta'):
        reads[seq.id] = seq

    for fastaFile in fastaFiles[1:]:
        for seq in SeqIO.parse(fastaFile, 'fasta'):
            # Make sure that reads with the same id have the same sequence.
            if seq.id in reads:
                assert str(seq.seq) == str(reads[seq.id].seq)
            reads.pop(seq.id, None)

    return iter(reads.values())
def main():
    """Synchronise a second FASTQ file with an already modified first one.

    reads2 is filtered through ``next_matching_read`` against reads1, its
    names are adjusted with ``adjust_name``, spaces are removed from the
    headers of both files, and both are written back out as FASTQ.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("reads1",help='modified reads')
    parser.add_argument("reads2", help='reads to adjust')
    parser.add_argument('reads1_output', help='output folder and filename. Note that the folder should already exist')
    parser.add_argument('reads2_output', help='output folder and filename. Note that the folder should already exist')
    args = parser.parse_args()

    def fresh_reads1():
        # reads1 is walked several times and may be large, so each pass
        # gets its own generator instead of a list held in memory.
        return (entry for entry in SeqIO.parse(args.reads1, 'fastq'))

    reads2_stream = (entry for entry in SeqIO.parse(args.reads2, 'fastq'))

    kept_reads2 = (
        mate for mate in next_matching_read(fresh_reads1(), reads2_stream))
    renamed_reads2 = (
        mate for mate in adjust_name(fresh_reads1(), kept_reads2))

    cleaned_reads1 = (
        remove_space_from_sequence_header(entry) for entry in fresh_reads1())
    cleaned_reads2 = (
        remove_space_from_sequence_header(entry) for entry in renamed_reads2)

    SeqIO.write(cleaned_reads1, args.reads1_output, "fastq")
    SeqIO.write(cleaned_reads2, args.reads2_output, "fastq")
def write_joined(ffile, rfile, joined_file, length=None):
    """Join forward/reverse FASTQ reads pairwise and write them as FASTQ.

    Records are paired by position; each pair is combined with ``join_seqs``
    (which receives *length*) and written to *joined_file*.
    """
    read_pairs = itertools.izip(SeqIO.parse(ffile, 'fastq'),
                                SeqIO.parse(rfile, 'fastq'))
    with open(joined_file, 'w') as outfile:
        for forward, reverse in read_pairs:
            joined = join_seqs(forward, reverse, length=length)
            outfile.write(joined.format('fastq'))
def read_seqs(self, sequence_file):
    """
    Read sequences from uniprot files (.dat or .fasta) or from lists or
    dicts of BioPython SeqRecords and make them available for fast search.
    Calling this again appends to the records already held.

    Updates ``self.collection`` and rebuilds ``self.searchstring`` (all
    sequences joined by ``#``), ``self.accs`` (the collection keys) and
    ``self.idx`` (start offset of each sequence in the search string, plus
    one final end offset). If a file cannot be read, a ``UserWarning`` is
    issued and nothing changes.

    :param sequence_file: uniprot file name (.dat or .fasta), or a list or
        dict of SeqRecords
    :return: None
    """
    recs = sequence_file
    if not isinstance(sequence_file, dict) and not isinstance(sequence_file, list):
        try:
            if sequence_file.endswith('.fa') or sequence_file.endswith('.fasta'):
                with open(sequence_file, 'rb') as f:
                    recs = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
            else:
                # assume it is a dat file; opened in text mode and closed
                # again once parsed
                with open(sequence_file) as f:
                    recs = SeqIO.to_dict(SeqIO.parse(f, 'swiss'))
        except Exception:
            # Best effort: an unreadable or unparsable file is reported
            # and skipped.
            warnings.warn("Could not read file", UserWarning)
            return
    if isinstance(sequence_file, list):
        recs = SeqIO.to_dict(sequence_file)
    if recs:
        self.collection.update(recs)
        self.searchstring = '#'.join([str(x.seq) for x in self.collection.values()]).decode('ascii')
        self.accs = self.collection.keys()
        self.idx = [0]
        # Each offset is the previous one plus the sequence length plus one
        # for the '#' separator.
        for rec in self.collection.values():
            self.idx.append(1 + self.idx[-1] + len(rec.seq))
    return
def _open_seq(f): if os.path.splitext(f)[1].lower() in ['.gb', '.gbk', '.genbank', '.gen',]: return SeqIO.parse(f, 'genbank', alphabet=Alphabet.generic_dna) elif os.path.splitext(f)[1].lower() in ['.fas', '.fasta',]: return SeqIO.parse(f, 'fasta', alphabet=Alphabet.generic_dna) else: raise ValueError('Could not detect file type for \'{}\''.format(f))
def main(argv):
    """Attach UMTs to paired FASTQ reads and write the pairs that have stems.

    Both inputs are read in lockstep. Each pair goes through ``attach_umt``;
    when both returned reads are present they are written out and counted as
    trimmed, otherwise the pair is counted as having no stem. The counts are
    logged at the end.
    """
    args = parseArgs()
    logging.basicConfig(level=logging.INFO, format=log_format)
    setup_mismatches(args.mismatches)

    with open(args.r1_fastq) as r1_in, \
            open(args.r2_fastq) as r2_in, \
            open(args.out_r1, 'wt') as r1_out, \
            open(args.out_r2, 'wt') as r2_out:
        first_reads = SeqIO.parse(r1_in, "fastq")
        second_reads = SeqIO.parse(r2_in, "fastq")
        try:
            while True:
                r1, r2 = attach_umt(next(first_reads), next(second_reads))
                if r1 is None or r2 is None:
                    umtstats['no_stem'] += 1
                    continue
                # Only write Fastq records for which we find stems
                umtstats['trimmed'] += 1
                r1_out.write(r1.format("fastq"))
                r2_out.write(r2.format("fastq"))
        except StopIteration:
            # Either input running out ends the loop.
            logging.info("EOF reached")

    logging.info("Trimmed: %d" % umtstats['trimmed'])
    logging.info("No stem: %d" % umtstats['no_stem'])
def index_database_dbm( dbfiles, outfile, type='fasta' ):
    """Index one attribute of every FASTA record in a DBM file.

    Args:
        dbfiles: iterable of FASTA file names.
        outfile: DBM file to create or extend; keys are record ids.
        type: attribute to store -- ``'fasta'`` (the sequence),
            ``'description'`` (the text after the first space of the
            description), ``'annotation'``, ``'name'`` or ``'features'``.
            Any other value stores nothing.

    Returns:
        *outfile*.

    Raises:
        ValueError: for ``type='description'`` when a description contains
            no space.
    """
    # One extractor per supported type replaces five copies of the loop.
    # NOTE(review): annotations and features are not strings; confirm that
    # the DBM backend in use accepts them as values.
    extractors = {
        'fasta': lambda rec: str(rec.seq),
        'description': lambda rec: rec.description[rec.description.index(' ') + 1:],
        'annotation': lambda rec: rec.annotations,
        'name': lambda rec: rec.name,
        'features': lambda rec: rec.features,
    }
    extract = extractors.get(type)

    DBM = anydbm.open( outfile, 'c' )
    try:
        if extract is not None:
            for db in dbfiles:
                with open(db) as handle:
                    for seq_record in SeqIO.parse(handle, "fasta"):
                        DBM[ seq_record.id ] = extract(seq_record)
    finally:
        # Close the index even when a file cannot be read or parsed.
        DBM.close()
    return outfile
def make(reads,barcodes,saveAs,name=False,mismatch=0,report=False):
    # Tag the reads of a FASTA file with the barcode found for each of them
    # and write the result to `saveAs`.
    #
    # reads    -- FASTA file with the reads
    # barcodes -- FASTA file with the barcodes
    # saveAs   -- output FASTA file
    # name     -- index into (barcode.seq, barcode.id): False appends the
    #             barcode sequence to the read id, True appends its id
    # mismatch -- passed on to the external fastools command
    # report   -- when true, also write '<saveAs stem>.report.txt' with, per
    #             barcode, the counted read prefixes
    #
    # NOTE(review): the original indentation of this function was lost. The
    # placement of `remove(temp)` (after the barcode loop) and of the final
    # `SeqIO.write` (for every read, not only tagged ones) is a best guess
    # and must be confirmed against the upstream file.

    # fastools does not support pipe
    temp=tempnam(environ['HOME'])
    args=[config.get('paths','fastools'),reads,temp,mismatch]
    # read id -> list of barcodes reported for that read
    calls={seq.id: [] for seq in list(SeqIO.parse(reads,'fasta'))}
    # barcode -> read prefix -> number of reads
    counts=defaultdict(lambda: defaultdict(int))
    for barcode in SeqIO.parse(barcodes,'fasta'):
        # Fastools to check left of each sequence for barcode match
        call(cmd.format(*args+[-len(barcode),barcode.seq]).split())
        # Every read fastools wrote to the temp file is a hit for this barcode.
        for seq in SeqIO.parse(temp,'fasta'):
            calls[seq.id]+=[barcode]
    remove(temp)
    with open(saveAs,'w') as handle:
        for seq in SeqIO.parse(reads,'fasta'):
            if calls[seq.id]:
                # only use best barcode: the one with the smallest key, where
                # the key sums, over the barcode letters, how far after its
                # own position each letter is first found in the read
                barcode=sorted(calls[seq.id],key=lambda x: sum([ seq.seq.find(j,i)-i for i,j in enumerate(x)])-len(x))[0]
                seq.id+='|{0}'.format((barcode.seq,barcode.id)[name])
                # Count the read prefix of barcode length plus one.
                counts[barcode][str(seq.seq[:len(barcode)+1])]+=1
            SeqIO.write(seq,handle,'fasta')
    if report:
        # One paragraph per barcode: 'id: seq' followed by 'prefix x count'.
        with open(path.splitext(saveAs)[0]+'.report.txt','w') as handle:
            handle.write('\n\n'.join(['\n'.join( ['{0}: {1}'.format(k.id,k.seq)]+ ['{0} x {1}'.format(i,j) for i,j in sorted(v.items())]) for k,v in counts.items()]))
def profile_bam(bam, fasta, **kwargs):
    '''
    Profile coverage and SNPs of every scaffold in *fasta* from *bam*.

    For each scaffold the pileup is walked column by column, updating the
    coverage, ANI and SNP tables through the ``_update_*`` helpers.
    Scaffolds missing from the .bam file are reported and skipped.

    Keyword arguments:
        minP -- passed to SNP calling as ``minP`` (default .8)
        minC -- passed to SNP calling and the ANI table as ``minC``
            (default 5)
        lightRAM -- when true, the per-scaffold arrays are not attached to
            the returned profile (default False)

    Returns an ``SNVprofile`` holding the raw coverage, ANI and SNP tables;
    unless ``lightRAM`` is set it also carries ``scaff2covT``,
    ``scaff2basesCounted`` and ``scaff2snpsCounted``.
    '''
    # get arguments
    minP = kwargs.get('minP', .8)
    minC = kwargs.get('minC', 5)
    lightRAM = kwargs.get('lightRAM', False)

    # initialize
    table = defaultdict(list)   # set up coverage dataframe
    Atable = defaultdict(list)  # set up ANI dataframe
    Stable = defaultdict(list)  # Set up SNP table

    samfile = pysam.AlignmentFile(bam)  # set up .sam file
    scaff2sequence = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))  # set up .fasta file
    s2l = {s: len(record) for s, record in scaff2sequence.items()}  # Get scaffold2length

    # Per-scaffold extras, kept in one explicit mapping so they can be
    # attached to the profile by name without eval().
    extras = None
    if not lightRAM:
        extras = {
            'scaff2covT': {},
            'scaff2basesCounted': {},
            'scaff2snpsCounted': {},
        }

    # Iterate scaffolds
    for scaff in tqdm(s2l, desc='Scaffolds processed'):
        covT = defaultdict(lambda: np.zeros(s2l[scaff], dtype=int))  # Dictionary of mm -> positional coverage
        basesCounted = defaultdict(lambda: np.zeros(s2l[scaff], dtype=bool))  # Count of bases that got through to SNP calling
        snpsCounted = defaultdict(lambda: np.zeros(s2l[scaff], dtype=bool))  # Count of SNPs

        try:
            pileup_columns = samfile.pileup(scaff)
        except ValueError:
            print("scaffold {0} is not in the .bam file {1}!".format(
                scaff, bam))
            continue

        for pileupcolumn in pileup_columns:
            # Iterate reads at this position to figure out basecounts
            # note: pileupcolumn.pos is 0-based
            MMcounts = _get_base_counts_mm(pileupcolumn)
            _update_covT(covT, MMcounts, pileupcolumn.pos)

            # Call SNPs
            _update_snp_table_T(Stable, basesCounted,
                                snpsCounted, scaff2sequence[scaff][pileupcolumn.pos], MMcounts,
                                pileupcolumn.pos, scaff, minC=minC, minP=minP)

        # Update coverage table
        _update_covT_table(table, covT, s2l[scaff], scaff)

        # Update ANI table
        _update_snp_covT_table(Atable, snpsCounted, basesCounted, s2l[scaff],
                               scaff, covT, minC)

        # Add to dicts (as plain dicts, dropping the default factories)
        if extras is not None:
            extras['scaff2covT'][scaff] = dict(covT)
            extras['scaff2basesCounted'][scaff] = dict(basesCounted)
            extras['scaff2snpsCounted'][scaff] = dict(snpsCounted)

    # All pileups are done; release the alignment file.
    samfile.close()

    # Make the profile
    Sprofile = SNVprofile(
        fasta_loc=fasta,
        bam_loc=bam,
        minP=minP,
        minC=minC,
        scaffold2length=s2l,
        raw_coverage_table=pd.DataFrame(table),
        raw_ANI_table=pd.DataFrame(Atable),
        raw_snp_table=_make_snp_table(Stable),
    )

    if extras is not None:
        # Add the extra weight
        for att in ['scaff2covT', 'scaff2basesCounted', 'scaff2snpsCounted']:
            setattr(Sprofile, att, extras[att])

    # Make the tables
    Sprofile.make_cumulative_tables()

    return Sprofile
if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument("-i",'--input',type=str,help="input fasta") parser.add_argument("-k",'--kmer',type=str,help="exact match to search") args = parser.parse_args() from Bio.Seq import Seq from Bio.Alphabet import generic_dna target_seq = Seq(args.kmer, generic_dna) from Bio import SeqIO handle = open(args.input, "rU") for record in SeqIO.parse(handle, "fasta") : seq = record.seq #import time #time.sleep(3) for i in range(0,len(seq)): #if i%1000000 == 0: # print i #print seq[i:i+len(args.kmer)] subseq = seq[i:i+len(target_seq)] #print subseq #print args.kmer if str(subseq) == str(target_seq): print "MATCH:", i, i + len(target_seq), str(target_seq) elif str(subseq) == str(target_seq.reverse_complement()): print "MATCH:", i, i + len(target_seq), str(target_seq), "(rev. complement)" elif str(subseq) == str(target_seq.complement()):
def out():
    """Align the two selected sequences, save the result and display it.

    The two input FASTA paths and the output path are read from the entry
    widgets e1, e2 and e3. The last record of each input file is aligned
    with Bio.pairwise2 using the module-level settings matrix (substitution
    matrix label), type ("Local" selects a local alignment, anything else a
    global one), gapopen and gapextend. A header and every formatted
    alignment are written to the output file and echoed to stdout; then the
    input window is destroyed and the saved file is shown in a new Tk window.

    If the matrix label is not one of the sixteen BLOSUM labels, only the
    header is written.
    """
    infile1 = e1.get()
    infile2 = e2.get()
    outfile = e3.get()

    # Only the last record of each file is kept for the alignment.
    for record in SeqIO.parse(infile1, "fasta"):
        seq1 = record.seq
        seq1name = record.id
    for record in SeqIO.parse(infile2, "fasta"):
        seq2 = record.seq
        seq2name = record.id

    blomat = matrix
    log = type

    # One lookup table instead of one copy-pasted branch per matrix.
    matrices = {
        "BLOSUM 30": blosum30,
        "BLOSUM 35": blosum35,
        "BLOSUM 40": blosum40,
        "BLOSUM 45": blosum45,
        "BLOSUM 50": blosum50,
        "BLOSUM 55": blosum55,
        "BLOSUM 60": blosum60,
        "BLOSUM 62": blosum62,
        "BLOSUM 65": blosum65,
        "BLOSUM 70": blosum70,
        "BLOSUM 75": blosum75,
        "BLOSUM 80": blosum80,
        "BLOSUM 85": blosum85,
        "BLOSUM 90": blosum90,
        "BLOSUM 95": blosum95,
        "BLOSUM 100": blosum100,
    }

    metadata = "#########################################\n# First Sequence Length: %s\n" \
               "# First Sequence Name: %s\n# First Secuence File :%s\n" \
               "# Second Sequence Length: %s\n# Second Sequence Name: %s\n" \
               "# Second Sequence File: %s\n# Output Result File: %s\n# Program: Pairwise\n# Matrix: %s\n" \
               "# Type: %s\n# Gap Open: %s\n# Gap Extend: %s\n# Note: Given Result May Contain Different Alignment with" \
               "Different Scores or Different Alignment With The Same Score.\n#########################################\n\n" \
               "" % (len(seq1), seq1name, infile1, len(seq2), seq2name, infile2, outfile, blomat, log, gapopen, gapextend)

    # The with-block closes the result file even if the alignment fails.
    with open(outfile, "w") as result_file:
        result_file.write(metadata)
        print(blomat)
        print(log)

        scoring = matrices.get(blomat)
        if scoring is not None:
            if log == "Local":
                align = pairwise2.align.localds
            else:
                align = pairwise2.align.globalds
            for a in align(seq1, seq2, scoring, int(gapopen), int(gapextend)):
                alignments2 = pairwise2.format_alignment(*a)
                print(alignments2)
                result_file.write(alignments2)

    win.destroy()

    with open(outfile, 'r') as saved:
        r = saved.read()

    root = Tk()
    S = Scrollbar(root)
    T = Text(root, height=50, width=500)
    S.pack(side=RIGHT, fill=Y)
    T.pack(side=LEFT, fill=Y)
    # S is a vertical scrollbar, so it is linked to the vertical view only;
    # also binding it to T.xview replaced this link and broke scrolling.
    S.config(command=T.yview)
    T.config(yscrollcommand=S.set)
    T.insert(END, r)
    mainloop()
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Print the PstI recognition site, then one tab-separated line per PstI hit:
# record id, reported position, reported position + 1.
from Bio.Restriction import Restriction
from Bio import SeqIO

print(Restriction.PstI.site)

fasta_path = "C:\\Users\\user\\Downloads\\test_restriction.fa"
for record in SeqIO.parse(fasta_path, "fasta"):
    for cut in Restriction.PstI.search(record.seq):
        print("\t".join([record.id, str(cut), str(cut + 1)]))

## Ref
## https://github.com/daler/rdbio-scripts/blob/master/sequenceFiles/restriction-finder.py
## http://coco.sam.pitt.edu/~emeneses/python/lecture8.pdf Page 20
import os

from Bio import SeqIO

# Tab-separated list; the second column names the contig to delete.
list_file = "/srv/projects3/human_plasmids/georgina/16_Host_Linkage/1_Blast/Outv2/MAGS_plasmidA_list.txt"

# set mag directory
mag_dir = "/srv/projects3/human_plasmids/georgina/7_coverm/reformatted_mags/"

n = 0  # total number of contigs removed over all assemblies

# The with-block closes the list file once every line has been handled.
with open(list_file, 'r') as f:
    for line in f:
        fields = line.split("\t")
        if len(fields) < 2:
            # blank or single-column line: nothing to delete
            continue
        subject = fields[1].strip("\n\t")
        print("Subject:", subject)

        # The assembly file is "f." + the part of the contig id before the
        # first underscore.
        assemblyname = "f." + subject.split("_")[0]
        print("Assembly:", assemblyname)
        assembly_path = os.path.join(mag_dir, assemblyname)

        assembly = list(SeqIO.parse(assembly_path, 'fasta'))
        print("Opening Assembly {} of length {}".format(assemblyname, len(assembly)))

        # Keep every contig except the listed one, then overwrite the
        # assembly file in place.
        new = [seq_record for seq_record in assembly if seq_record.id != subject]
        n += len(assembly) - len(new)
        SeqIO.write(new, assembly_path, 'fasta')

print("Done! Deleted ", n, " contigs.")
def get_LFY_binding_sites(matScore, matRev, FastaFile, threshold, factorTranscription):
    """Scan every sequence of a FASTA file for windows scoring above threshold.

    Each window of lenMotif bases is scored twice, once with matScore and
    once with matRev. Both matrices are read as flat sequences indexed by
    ``position * 4 + offset`` with offsets A=0, C=1, G=2, T=3. lenMotif,
    dependencyFile and num are read from the enclosing module scope.

    Args:
        matScore: flat score matrix used for the "Pos" score.
        matRev: flat score matrix used for the "Neg" score.
        FastaFile: FASTA file to scan. Record ids are split on ':' and '-';
            presumably they look like ``chrN:start-end`` (confirm with caller).
        threshold: a window is reported when either score is strictly above it.
        factorTranscription: when equal to "LFY_scores_matrix_19nucl" the
            interdependent-position scores are added to both scores.

    Returns:
        list of ``[chromosome without 'chr', start, start + 19, first 19
        bases of the window]`` with ``start = int(pos[1]) + c + 1``.
    """
    # This line allows to retrieve all the sequences from the fasta file
    sequences = SeqIO.to_dict(SeqIO.parse(FastaFile, "fasta"))
    print(" There are %s sequence(s) to analyze" % (len(sequences)))

    list_of_the_LFY_binding_sites = []
    base_offset = {"A": 0, "C": 1, "G": 2, "T": 3}

    # We apply a loop on all the fasta sequences:
    for s in sequences:
        # This line allows to retrieve the DNA sequence
        seq = sequences[s].seq
        seq_id = sequences[s].id
        chrom = re.split(':', seq_id)
        pos = re.split(':|-', seq_id)

        # We look at each sub-sequence of the whole sequence. Each
        # sub-sequence has the same length as the matrix.
        for c in range(len(seq) - (lenMotif - 1)):
            strandPos = seq[c:c + lenMotif].upper()
            bases = str(strandPos)

            # A window holding anything but A/C/G/T has no score, so it must
            # not reach the threshold test with scores left over from the
            # previous window.
            if any(nu not in base_offset for nu in bases):
                continue

            # These lines calculate the two scores of one sub-sequence
            scoreStrandPos = 0
            scoreStrandNeg = 0
            for index, nu in enumerate(bases):
                cell = index * 4 + base_offset[nu]
                scoreStrandPos = scoreStrandPos + matScore[cell]
                scoreStrandNeg = scoreStrandNeg + matRev[cell]

            # Add the scores associated with interdependent positions
            if factorTranscription == "LFY_scores_matrix_19nucl":
                scoreStrandPos, scoreStrandNeg = add_scores_associated_with_interdependent_positions(
                    get_dependency_matrix(dependencyFile, num),
                    scoreStrandPos, scoreStrandNeg, strandPos)

            # Retrieve the chromosome and the positions of a predicted
            # binding site (score above the threshold fixed by the user).
            if scoreStrandPos > threshold or scoreStrandNeg > threshold:
                list_of_the_LFY_binding_sites.append([
                    chrom[0].replace('chr', ''),
                    int(pos[1]) + c + 1,
                    int(pos[1]) + c + 1 + 19,
                    str(strandPos[0:19])
                ])

    return (list_of_the_LFY_binding_sites)
import sys

from Bio import SeqIO
from Bio.SeqUtils import GC

# Argument one is a fasta file.
# Output: a header line, then "<record id>\t<GC / 100 with two decimals>"
# for every record. The parenthesised single-argument print form gives the
# same output on Python 2 and Python 3.
print("GC_content")
for rec in SeqIO.parse(sys.argv[1], "fasta"):
    print("%s\t%1.2f" % (rec.id, GC(rec.seq) / 100.0))
### Input
directory_tst_ffpe = '/media/partition/fastq_tst_ffpe'
directory_tst_cll = '/media/partition/fastq_tst_cll'
fastq_tst_ffpe = list(glob.iglob(directory_tst_ffpe + "/*R1_001.fastq.gz"))
fastq_tst_cll = list(glob.iglob(directory_tst_cll + "/*R1_001.fastq.gz"))

### Open file & parse over it
# For every FFPE R1 file: collect the phred qualities seen at each read
# position, then plot the mean quality per position as a red curve.
for fastq_path in fastq_tst_ffpe:
    print(fastq_path)

    d1 = defaultdict(list)  # read position -> qualities observed there
    # The with-block closes the gzip handle once the file has been parsed.
    # NOTE(review): on Python 3, SeqIO needs a text handle, i.e.
    # gzip.open(fastq_path, "rt") - confirm the target interpreter.
    with gzip.open(fastq_path) as handle:
        for rec in SeqIO.parse(handle, "fastq"):
            for pos, qual in enumerate(rec.letter_annotations['phred_quality']):
                d1[pos].append(qual)

    # Walk the positions in increasing order so that x matches the read
    # position whatever the dict iteration order is.
    means = [np.mean(d1[pos]) for pos in sorted(d1)]

    x = np.arange(len(means))
    plt.plot(x, means, color='red')
def read_fasta(filepath):
    """Return all records of a FASTA file as a list.

    Args:
        filepath: path of the FASTA file to read.

    Returns:
        list of the records yielded by SeqIO.parse, in file order.
    """
    # Plain text mode: the 'U' flag is deprecated and Python 3.11+ rejects it.
    with open(filepath, 'r') as fasta_file:
        return list(SeqIO.parse(fasta_file, 'fasta'))
def main(cpus, dun_make_bins=False, dun_use_partial=False, num_seqs_per_batch=100000, dun_cleanup_files=False):
    # Pre-cluster the sequences of isoseq_flnc.fasta (read from the current
    # working directory) and write the results next to it.
    #
    # cpus               -- forwarded to ar.run_minimap and add_batch
    # dun_make_bins      -- when true, no preCluster_out/<cid> directory is
    #                       written; the cluster info CSV is still produced
    # dun_use_partial    -- forwarded to the seed and batch processing
    # num_seqs_per_batch -- forwarded to create_seed_n_batch_files
    # dun_cleanup_files  -- when true, the seed*/batch* fasta files are kept
    #
    # Files written: seed1.S.fasta, seed1.orphans.fasta,
    # preCluster_out.orphans.fasta, preCluster_out.chimeras.fasta,
    # preCluster_out.tucked.fasta (only if something was tucked),
    # preCluster.output.csv and preCluster.cluster_info.csv.
    print "Indexing isoseq_flnc.fasta using LazyFastaReader..."
    d = LazyFastaReader('isoseq_flnc.fasta')

    print "Splitting input isoseq_flnc.fasta into seed/batches..."
    num_batchs = create_seed_n_batch_files(
        input='isoseq_flnc.fasta',
        fasta_d=d,
        seed_filename='seed0.fasta',
        batch_pre='batch',
        num_seqs_per_batch=num_seqs_per_batch)

    # step1. run minimap of seed0 against itself and process
    o = ar.run_minimap('seed0.fasta', 'seed0.fasta', cpus=cpus)
    seqids = set([r.id for r in SeqIO.parse(open('seed0.fasta'), 'fasta')])
    pCS, orphans = sp.process_self_align_into_seed(
        o, seqids, MiniReader, dun_use_partial=dun_use_partial)
    # keep stats ('T' is the status written out as "tucked" below)
    size_S, size_tucked, size_orphans = len(pCS.S), sum(
        v == 'T' for v in pCS.seq_stat.itervalues()), len(orphans)
    print "seed 0 initial: S {0}, tucked {1}, orphans {2}".format(
        size_S, size_tucked, size_orphans)

    # write out seed1.S.fasta and seed1.orphans.fasta
    FileIO.write_preClusterSet_to_fasta(pCS, 'seed1.S.fasta', d)
    FileIO.write_seqids_to_fasta(orphans, 'seed1.orphans.fasta', d)
    # step 2a. minimap batch1 against seed1.S and process
    # Every remaining batch is folded into the running (pCS, orphans) state;
    # the per-batch intermediate files are cleaned up right away, whatever
    # the value of dun_cleanup_files.
    for i in xrange(1, num_batchs):
        pCS, orphans = add_batch(i, pCS, orphans, d, cpus=cpus, dun_use_partial=dun_use_partial)
        cleanup_precluster_intermediate_files(i)

    # detect PCR chimeras from orphans
    chimeras = detect_PCR_chimeras(orphans, d)
    orphans = orphans.difference(chimeras)

    FileIO.write_seqids_to_fasta(orphans, "preCluster_out.orphans.fasta", d)
    FileIO.write_seqids_to_fasta(chimeras, "preCluster_out.chimeras.fasta", d)

    tucked_seqids = []
    # pCS, orphans and chimeras are not pickled: pCS is an object that
    # cannot be dumped yet.
    # write CSV file: one row per sequence with status 'T' (tucked) or 'M'
    # (mapped; the value of pCS.seq_map is written), then the orphans and
    # the chimeras. Sequences with any other status get no row.
    with open('preCluster.output.csv', 'w') as f:
        f.write("seqid,stat\n")
        for x, stat in pCS.seq_stat.iteritems():
            if stat == 'T':
                f.write("{0},tucked\n".format(x))
                tucked_seqids.append(x)
            elif stat == 'M':
                f.write("{0},{1}\n".format(x, pCS.seq_map[x]))
        for x in orphans:
            f.write("{0},orphan\n".format(x))
        for x in chimeras:
            f.write("{0},chimera\n".format(x))

    # Liz: currently not using tucked...
    if len(tucked_seqids) > 0:
        FileIO.write_seqids_to_fasta(tucked_seqids, "preCluster_out.tucked.fasta", d)

    infof = open('preCluster.cluster_info.csv', 'w')
    infof.write("cluster,size\n")
    # write out a directory per preCluster cid in preCluster_out/<cid>
    # Liz note: right now, write out even directories with just 1 sequence
    # (we know they have "tucked" support, so can run Partial/Arrow on it)
    for cid in pCS.S:
        # "if True" is what remains of a disabled special case for
        # single-sequence clusters.
        if True:
            if not dun_make_bins:
                dirname = os.path.join("preCluster_out", str(cid))
                os.makedirs(dirname)
                file = os.path.join(dirname, 'isoseq_flnc.fasta')
                FileIO.write_seqids_to_fasta(pCS.S[cid].members, file, d)
            infof.write("{0},{1}\n".format(cid, len(pCS.S[cid].members)))
    infof.close()

    if not dun_cleanup_files:
        # clean up all seed* and batch* files
        for file in glob.glob('batch*fasta*'):
            os.remove(file)
        for file in glob.glob('seed*fasta*'):
            os.remove(file)
def create_flats_and_lisp(run_folder, taxon_file):
    """
    Read Genbank/GFF/PF files and create Pathway Tools needed file.
    Create also a lisp file to create flat files from Pathway tools results.
    The name of the PGDB created by Pathway Tools will be the name of the species with '_' instead of space.

    Create organism-params.dat:
    ID  pgdb_id
    STORAGE FILE
    NCBI-TAXON-ID   taxon_id
    NAME    species_name

    Create genetic-elements.dats:
    NAME
    ANNOT-FILE  gbk_name
    //

    Create flat_files_creation.lisp:
    (in-package :ecocyc)
    (select-organism :org-id 'pgdb_id)
    (create-flat-files-for-current-kb)

    Args:
        run_folder (str): ID of a species of the input folder
        taxon_file (bool): Boolean indicating if a taxon_file must be used
    Returns:
        bool: True if the two dat files exist and the lisp script was created.
        None when the input is unusable (empty or malformed Genbank, missing
        organism, GFF without fasta, region feature or Dbxref, or an issue
        with the taxon ID); the reason is logged as critical.
    """
    # Look for a Genbank/GFF files in the run folder.
    # PGDB ID corresponds to the name of the species folder.
    pgdb_id = os.path.basename(run_folder)
    gbk_name = pgdb_id + ".gbk"
    gbk_pathname = os.path.join(run_folder, gbk_name)
    gbff_name = pgdb_id + ".gbff"
    gbff_pathname = os.path.join(run_folder, gbff_name)
    gff_name = pgdb_id + ".gff"
    gff_pathname = os.path.join(run_folder, gff_name)

    organism_dat = os.path.join(run_folder, 'organism-params.dat')
    genetic_dat = os.path.join(run_folder, 'genetic-elements.dat')
    lisp_pathname = os.path.join(run_folder, 'flat_files_creation.lisp')

    fasta_extensions = ['.fasta', '.fsa']

    taxon_id = ""
    taxon_error = False
    species_name = ""
    taxon_datas = {}
    # Only the GFF branch sets this. It is defined up front so that the
    # genetic-elements step can test it even when the Genbank branch ran
    # in a folder that also holds a GFF file.
    gff_fasta = None

    if os.path.isfile(gbk_pathname) or os.path.isfile(gbff_pathname):
        if os.path.isfile(gbk_pathname):
            input_name = gbk_name
            input_path = gbk_pathname
        else:
            input_name = gbff_name
            input_path = gbff_pathname
        # Take the species name and the taxon id from the genbank file.
        with open(input_path, "r") as gbk:
            # Take the first record of the genbank (first contig/chromosome) to retrieve the species name.
            try:
                first_seq_record = next(SeqIO.parse(gbk, "genbank"))
            except StopIteration:
                logger.critical(
                    'Issue with the genbank {0}, it can be empty or malformatted.'
                    .format(input_path))
                return None

            try:
                species_name = first_seq_record.annotations['organism']
            except KeyError:
                logger.critical(
                    'No organism in the Genbank {0} In the SOURCE you must have: ORGANISM Species name'
                    .format(pgdb_id))
                return None

            # Take the source feature of the first record.
            # This feature contains the taxon ID in the db_xref qualifier.
            src_features = [
                feature for feature in first_seq_record.features
                if feature.type == "source"
            ]
            for src_feature in src_features:
                if 'db_xref' in src_feature.qualifiers:
                    src_dbxref_qualifiers = src_feature.qualifiers['db_xref']
                    for src_dbxref_qualifier in src_dbxref_qualifiers:
                        if 'taxon:' in src_dbxref_qualifier:
                            taxon_id = src_dbxref_qualifier.replace(
                                'taxon:', '')
            if not taxon_id:
                logger.info(
                    'No taxon ID in the Genbank {0} In the FEATURES source you must have: /db_xref="taxon:taxonid" Where taxonid is the Id of your organism. You can find it on the NCBI.'
                    .format(gbk_pathname))
                logger.info('Try to look in the taxon_id.tsv file')
                taxon_error, taxon_id, taxon_datas = extract_taxon_id(
                    run_folder, pgdb_id, taxon_id, taxon_file)

        if taxon_file:
            taxon_error, taxon_id, taxon_datas = extract_taxon_id(
                run_folder, pgdb_id, taxon_id, taxon_file)

    elif os.path.isfile(gff_pathname):
        input_name = gff_name
        # Check if there is a fasta file.
        for fasta_extension in fasta_extensions:
            fasta_input_name = input_name.replace('.gff', fasta_extension)
            fasta_path = os.path.join(run_folder, fasta_input_name)
            if os.path.exists(fasta_path):
                gff_fasta = fasta_input_name
        if not gff_fasta:
            logger.critical(
                'No fasta file (.fasta or .fsa) with the GFF of {0}'.format(
                    pgdb_id))
            return None

        # Instead of parsing and creating a database from the GFF, parse the file and extract the first region feature.
        try:
            region_feature = [
                feature for feature in DataIterator(gff_pathname)
                if feature.featuretype == 'region'
            ][0]
        except IndexError:
            logger.critical(
                'No region feature in the GFF file of {0}, GFF file must have region features.'
                .format(pgdb_id))
            return None

        try:
            dbxrefs = region_feature.attributes['Dbxref']
        except KeyError:
            logger.critical(
                'No Dbxref in GFF file of {0} GFF file must have a ;Dbxref=taxon:taxonid; in the region feature.'
                .format(pgdb_id))
            # Stop like the other critical input errors do; carrying on
            # would only fail on the missing key in the loop below.
            return None

        for dbxref in dbxrefs:
            if 'taxon' in dbxref:
                taxon_id = dbxref.split('taxon:')[1]
        if not taxon_id or taxon_file:
            if not taxon_id:
                logger.info(
                    'Missing "taxon:" in GFF file of {0} GFF file must have a ;Dbxref=taxon:taxonid; in the region feature.'
                    .format(pgdb_id))
                logger.info('Try to look in the taxon_id.tsv file')
            taxon_error, taxon_id, taxon_datas = extract_taxon_id(
                run_folder, pgdb_id, taxon_id, taxon_file)

    # Look for PF files.
    # NOTE(review): all() over a list that only ever contains True is always
    # True (also for an empty list), so this branch is taken whenever no
    # Genbank/GFF file was found - confirm whether any() was intended.
    elif all([
            True for species_file in os.listdir(run_folder)
            if '.pf' in species_file or '.fasta' in species_file
            or '.fsa' in species_file
    ]):
        for species_file in os.listdir(run_folder):
            if '.pf' in species_file:
                # Check if there is a fasta file.
                pf_fasta = None
                for fasta_extension in fasta_extensions:
                    fasta_species_name = species_file.replace(
                        '.pf', fasta_extension)
                    fasta_path = os.path.join(run_folder, fasta_species_name)
                    if os.path.exists(fasta_path):
                        pf_fasta = fasta_species_name
                if not pf_fasta:
                    logger.critical(
                        'No fasta file (.fasta or .fsa) with the Pathologic file of {0}, this could lead to warnings in Pathway Tools.'
                        .format(pgdb_id))
        taxon_error, taxon_id, taxon_datas = extract_taxon_id(
            run_folder, pgdb_id, taxon_id, taxon_file)

    if taxon_error == True:
        logger.critical('Issue with taxon ID of {0}.'.format(run_folder))
        return None

    # Create the organism-params dat file.
    with open(organism_dat, 'w', encoding='utf-8') as organism_file:
        organism_writer = csv.writer(organism_file,
                                     delimiter='\t',
                                     lineterminator='\n')
        organism_writer.writerow(['ID', pgdb_id])
        organism_writer.writerow(['STORAGE', "FILE"])
        organism_writer.writerow(['NCBI-TAXON-ID', taxon_id])
        organism_writer.writerow(['NAME', species_name])
        if 'reference_pgdbs' in taxon_datas:
            for reference_pgdb in taxon_datas['reference_pgdbs']:
                organism_writer.writerow(['REF-ORGID', reference_pgdb])

    # Create the genetic-elements dat file.
    with open(genetic_dat, 'w', encoding='utf-8') as genetic_file:
        if os.path.isfile(gff_pathname) or os.path.isfile(
                gbk_pathname) or os.path.isfile(gbff_pathname):
            genetic_writer = csv.writer(genetic_file,
                                        delimiter='\t',
                                        lineterminator='\n')
            genetic_writer.writerow(['NAME', ''])
            genetic_writer.writerow(['ANNOT-FILE', input_name])
            # A SEQ-FILE row is only written when the GFF branch found the
            # accompanying fasta file.
            if gff_fasta is not None:
                genetic_writer.writerow(['SEQ-FILE', gff_fasta])
            if 'circular' in taxon_datas:
                circular = taxon_datas['circular']
                genetic_writer.writerow(['CIRCULAR?', circular])
            if 'element_type' in taxon_datas:
                element_type = taxon_datas['element_type']
                genetic_writer.writerow(['TYPE', element_type])
            if 'codon_table' in taxon_datas:
                codon_table = taxon_datas['codon_table']
                genetic_writer.writerow(['CODON-TABLE', codon_table])
            genetic_writer.writerow(['//'])
        elif all([
                True for species_file in os.listdir(run_folder)
                if '.pf' in species_file or '.fasta' in species_file
                or '.fsa' in species_file
        ]):
            genetic_writer = csv.writer(genetic_file,
                                        delimiter='\t',
                                        lineterminator='\n')
            # One genetic element per PathoLogic file of the folder.
            for species_file in os.listdir(run_folder):
                if '.pf' in species_file:
                    species_file_name = os.path.splitext(species_file)[0]
                    genetic_writer.writerow(
                        ['NAME', species_file.replace('.pf', '')])
                    genetic_writer.writerow(
                        ['ID', species_file.replace('.pf', '')])
                    genetic_writer.writerow(['ANNOT-FILE', species_file])
                    fasta_path = os.path.join(
                        run_folder, species_file.replace('.pf', '.fasta'))
                    fsa_path = os.path.join(
                        run_folder, species_file.replace('.pf', '.fsa'))
                    if os.path.exists(fasta_path):
                        genetic_writer.writerow([
                            'SEQ-FILE',
                            species_file.replace('.pf', '.fasta')
                        ])
                    elif os.path.exists(fsa_path):
                        genetic_writer.writerow(
                            ['SEQ-FILE',
                             species_file.replace('.pf', '.fsa')])

                    # Settings keyed by the file name take precedence over
                    # the folder-wide ones.
                    if species_file_name in taxon_datas:
                        if 'circular' in taxon_datas[species_file_name]:
                            circular = taxon_datas[species_file_name][
                                'circular']
                            genetic_writer.writerow(['CIRCULAR?', circular])
                        if 'element_type' in taxon_datas[species_file_name]:
                            element_type = taxon_datas[species_file_name][
                                'element_type']
                            genetic_writer.writerow(['TYPE', element_type])
                        if 'codon_table' in taxon_datas[species_file_name]:
                            codon_table = taxon_datas[species_file_name][
                                'codon_table']
                            genetic_writer.writerow(
                                ['CODON-TABLE', codon_table])
                    else:
                        if 'circular' in taxon_datas:
                            circular = taxon_datas['circular']
                            genetic_writer.writerow(['CIRCULAR?', circular])
                        if 'element_type' in taxon_datas:
                            element_type = taxon_datas['element_type']
                            genetic_writer.writerow(['TYPE', element_type])
                        if 'codon_table' in taxon_datas:
                            codon_table = taxon_datas['codon_table']
                            genetic_writer.writerow(
                                ['CODON-TABLE', codon_table])
                    genetic_writer.writerow(['//'])

    # Create the lisp script.
    check_lisp_file = create_flat_creation_script(pgdb_id, lisp_pathname)

    return all([
        os.path.isfile(organism_dat),
        os.path.isfile(genetic_dat), check_lisp_file
    ])
def Analyze(self):
    """Summarise the per-window kingdom calls as a CSV table and a stack plot.

    One HMMER output file '<name>_<i>.out' is loaded per window start i in
    range(AlLen - window) and reduced with Extract_kingdoms. For every window
    the number of 'B', 'E', 'A' (and 'O' when self.osk is set) entries and
    the remaining entries are counted, together with the share of non-gap
    characters in alignment column i. If '<name>_database.fasta' exists, a
    leading 'db' row holds the kingdom counts of the database itself.

    Writes '<name>.csv', '<name>.pdf' and '<name>.png'; returns nothing.
    """
    print("Loading results ...")
    bar = progressbar.ProgressBar()
    results = [
        self.Extract_kingdoms(
            self.Parse_HMMER_output('%s_%s.out' % (self.name, i)))
        for i in bar(range(self.AlLen - self.window))
    ]

    print("Compute ratio")
    rows = []  # rows of the CSV table

    # Optional first row: kingdom make-up of the database.
    database_path = self.name + '_database.fasta'
    if os.path.isfile(database_path):
        tally = {'B': 0, 'E': 0, 'A': 0, 'O': 0, 'Other': 0}
        for entry in SeqIO.parse(database_path, format='fasta'):
            name_fields = entry.name.split('|')
            specID = name_fields[2].split('_')[1]
            kg = self.Get_kingdom(specID, entry.description, name_fields[1])
            if kg in tally:
                tally[kg] += 1
            else:
                tally['Other'] += 1
        if self.osk:
            rows.append(['db', tally['B'], tally['E'], tally['A'],
                         tally['O'], tally['Other'], 0])
        else:
            rows.append(['db', tally['B'], tally['E'], tally['A'],
                         tally['Other'], 0])

    # One row per window: counts per kingdom, the rest, and the occupancy.
    counts = []
    for j, kingdoms in enumerate(results):
        tmp = [kingdoms.count('B'), kingdoms.count('E'), kingdoms.count('A')]
        if self.osk:
            tmp.append(kingdoms.count('O'))
        tmp = tmp + [len(kingdoms) - sum(tmp)]
        counts.append(tmp)

        column = self.alignment[:, j]
        p = 1 - column.count('-') / float(len(column))
        rows.append([j] + tmp + [p])
    counts = np.array(counts).T

    if self.osk:
        columns = ['Position', 'Bacteria', 'Eukaryotes', 'Arthropods',
                   'Oskar', 'Other', 'Occupancy']
    else:
        columns = ['Position', 'Bacteria', 'Eukaryotes', 'Arthropods',
                   'Other', 'Occupancy']
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(self.name + '.csv', index=False)

    # http://matplotlib.org/examples/pylab_examples/stackplot_demo.html
    print("Plotting results")
    fig, ax = plt.subplots()
    if self.osk:
        leg = ['Bacteria', 'Eukaryotes', 'Arthropoda', 'Oskar', 'Other']
    else:
        leg = ['Bacteria', 'Eukaryotes', 'Arthropoda', 'Other']
    positions = range(len(counts[0]))
    series = [counts[k] for k in range(len(leg))]
    t = ax.stackplot(positions, *series)

    # The legend is built by hand from the colour of each stacked layer.
    handles = [
        mpatches.Patch(color=layer.get_facecolor()[0], label=label)
        for layer, label in zip(t, leg)
    ]
    ax.legend(handles=handles)
    plt.title(self.name)
    fig.savefig("%s.pdf" % self.name)
    fig.savefig("%s.png" % self.name)
from Bio import SeqIO
from math import floor
from sys import argv

# read files
# Positional command-line arguments: region table, results table,
# reference FASTA and alignment FASTA.
region_file = argv[1]
results_file = argv[2]
ref_file = argv[3]
alignment_file = argv[4]
regions = pd.read_csv(region_file)
df = pd.read_csv(results_file)
# Empty table with one column each for sample, mutations and aa_mut.
new_df = pd.DataFrame(columns=["sample", "mutations", "aa_mut"])

# read sequences
# The reference is kept as a list of characters; every aligned sequence is
# replaced in the dict by an upper-cased list of characters.
reference = list(SeqIO.read(ref_file, 'fasta').seq)
alignment = SeqIO.to_dict(SeqIO.parse(alignment_file, 'fasta'))
for sample, record in alignment.items():
    alignment[sample] = list(str(record.seq).upper())

# Codon -> one-letter amino acid lookup.
# NOTE(review): the rest of this table lies outside the visible part of
# the file.
translate_table = {
    'ATA': 'I',
    'ATC': 'I',
    'ATT': 'I',
    'ATG': 'M',
    'ACA': 'T',
    'ACC': 'T',
    'ACG': 'T',
    'ACT': 'T',
    'AAC': 'N',
    'AAT': 'N',
    'AAA': 'K',
def read_taxon_id(run_folder):
    """
    Search for Taxon ID in genbank or GFF files.
    For GenBank file search for 'taxon:' key in 'db_xref' qualifier.
    For GFF file search for 'taxon' in dbxref feature.

    Every sub-folder of run_folder gets one entry. A folder in which no
    taxon ID could be read (PathoLogic input, no annotation file, or an
    annotation without taxon) is recorded with the value "missing".
    Entries of run_folder that are not directories are ignored.

    Args:
        run_folder (str): path to the input folder
    Returns:
        dict: sub-folder name as key, taxon ID (or "missing") as value
    Raises:
        IndexError: a GFF file has no region feature
        KeyError: the region feature of a GFF file has no Dbxref attribute
    """
    taxon_ids = {}

    for input_folder in os.listdir(run_folder):
        input_folder_path = os.path.join(run_folder, input_folder)
        # Only species folders are inspected; a stray file next to them
        # cannot be listed.
        if not os.path.isdir(input_folder_path):
            continue

        # Start from the "missing" marker for every folder so that a value
        # found in a previous folder is never reused.
        taxon_id = "missing"

        for input_file in os.listdir(input_folder_path):
            if '.gbk' in input_file:
                gbk_pathname = os.path.join(input_folder_path, input_file)
                # Take the species name and the taxon id from the genbank file.
                with open(gbk_pathname, "r") as gbk:
                    # Take the first record of the genbank (first contig/chromosome) to retrieve the species name.
                    first_seq_record = next(SeqIO.parse(gbk, "genbank"))

                # Take the source feature of the first record.
                # This feature contains the taxon ID in the db_xref qualifier.
                src_features = [
                    feature for feature in first_seq_record.features
                    if feature.type == "source"
                ]
                for src_feature in src_features:
                    try:
                        src_dbxref_qualifiers = src_feature.qualifiers[
                            'db_xref']
                    except KeyError:
                        logger.info(
                            'No taxon ID in the Genbank {0} In the FEATURES source you must have: /db_xref="taxon:taxonid" Where taxonid is the Id of your organism. You can find it on the NCBI.'
                            .format(gbk_pathname))
                        continue
                    for src_dbxref_qualifier in src_dbxref_qualifiers:
                        if 'taxon:' in src_dbxref_qualifier:
                            taxon_id = src_dbxref_qualifier.replace(
                                'taxon:', '')

            elif '.gff' in input_file:
                gff_pathname = os.path.join(input_folder_path, input_file)

                # Instead of parsing and creating a database from the GFF, parse the file and extract the first region feature.
                try:
                    region_feature = [
                        feature for feature in DataIterator(gff_pathname)
                        if feature.featuretype == 'region'
                    ][0]
                except IndexError:
                    raise IndexError(
                        'No region feature in the GFF file of {0}, GFF file must have region features.'
                        .format(input_folder))

                try:
                    dbxrefs = region_feature.attributes['Dbxref']
                except KeyError:
                    raise KeyError(
                        'No Dbxref in GFF file of {0} GFF file must have a ;Dbxref=taxon:taxonid; in the region feature.'
                        .format(input_folder))

                for dbxref in dbxrefs:
                    if 'taxon' in dbxref:
                        taxon_id = dbxref.split('taxon:')[1]

            elif '.pf' in input_file:
                logger.info(
                    'No taxon ID associated to a PathoLogic Format. {0} will have a missing taxon_id'
                    .format(input_folder))
                taxon_id = "missing"

        taxon_ids[input_folder] = taxon_id

    return taxon_ids
from Bio import SeqIO
from Bio.Alphabet import generic_dna
from Bio.SeqUtils import GC
from operator import itemgetter, attrgetter, methodcaller

# Report the record with the highest GC content: its id on one line, the GC
# value with three decimals on the next. Ties keep the earliest record;
# nothing beats the starting value unless its GC content is above 0.
high_id, high_gc = None, 0

records = SeqIO.parse("dataset/rosalind_gc.fasta", "fasta", generic_dna)
for rec in records:
    content = GC(rec.seq)
    if content > high_gc:
        high_id, high_gc = rec.id, content

print(high_id)
print(format(high_gc, '.3f'))
# Print, in FASTA format, every protein whose sequence matches the KLEEK
# pattern: "KL", then two or more E/I residues, then "K".
if __name__ == '__main__':
    import re
    from Bio import SeqIO
    from common import file_parser
    import os

    parser = file_parser(prog_desc='Find sequences that contain KLEEKS',
                         file_desc='A FASTA file containing protein sequences')
    args = parser.parse_args()

    kleek_filter = re.compile(r'KL[EI]{2,}K')

    hits = 0  # number of matching records (counted but not reported)
    # Plain text mode: the 'U' flag is deprecated and Python 3.11+ rejects it.
    with open(args.file, 'r') as fd:
        for record in SeqIO.parse(fd, 'fasta'):
            if kleek_filter.search(str(record.seq)):
                hits += 1
                print(record.format('fasta'))
from Bio import SeqIO
import gzip

# Walk the split FASTQ and the original gzipped FASTQ in lockstep: for every
# record of the split file, the next record of the original is pulled.
# Both handles are context-managed so the gzip file is closed as well.
# NOTE(review): the "rU" and "rb" modes are kept as written; Python 3.11+
# rejects "rU" and text parsers may need "rt" for the gzip handle - confirm
# the target interpreter before porting.
with open("tmpxcmM8P/splitted_temp.fastq", "rU") as splitted, \
        gzip.open("SRR027957_2.fastq.gz", "rb") as original_handle:
    original = SeqIO.parse(original_handle, "fastq")
    for record in SeqIO.parse(splitted, "fastq"):
        # Builtin next() works on both Python 2.6+ and Python 3 iterators.
        # It raises StopIteration if the original runs out first.
        orecord = next(original)
track.add_set(fs) diag.add_track(track, 1) pglen = float(clust[k]['end'] - clust[k]['start']) / float(1000) diag.draw( format = "linear", orientation = "landscape", pagesize = (pglen*cm, 5*cm), fragments = 1, start = clust[k]['start'], end= clust[k]['end'] ) diag.write('.'.join([pref, k, 'svg']), "SVG") ## Clustmine pipeline if(gbt == 'clustmine'): for rec in SeqIO.parse(gb, "genbank"): clust = {} cur = '' for feat in rec.features: if(feat.type == "cluster"): cur = feat.qualifiers['name'][0] clust[cur] = { 'start': feat.location.start, 'end': feat.location.end, 'gene': {} } elif(feat.type == "gene"): if(feat.location.start > clust[cur]['start'] and feat.location.end < clust[cur]['end']): clust[cur]['gene'][ feat.qualifiers['name'][0] ] = feat drawsvg(clust)
def get_hits_number_helper(self, fasta_file):
    """Return the number of sequence records in a FASTA file.

    Args:
        fasta_file: path or handle accepted by ``SeqIO.parse``.

    Returns:
        int: count of parsed records (0 for a file without records).
    """
    # Count while streaming; no need to keep every record id in memory.
    return sum(1 for _ in SeqIO.parse(fasta_file, 'fasta'))
if sequence == database['Sequence'][i]: index = i break return index database = pd.read_csv(sys.argv[1]) fasta_LAMP = sys.argv[2] path_output = sys.argv[3] #read doc sequences_data = [] for record in SeqIO.parse(fasta_LAMP, "fasta"): sequences_data.append(record.seq) sequences_data = list(set(sequences_data)) sequences_not_DB = [] for sequence in sequences_data: print("Process sequence: ", sequence) index = search_sequences_into_db(database, sequence) if index == -1: sequences_not_DB.append(sequence) #export sequence not DB to fasta file fasta_export = open(path_output + "sequences_not_DB.fasta", 'w') print(len(sequences_not_DB))
#!/home/incerta/Metagenomics/bin/Programs/Miniconda/envs/python2.7/bin/python
#!python
# Written for biopython under python2.7
# V0.01 Written by Gisle Vestergaard ([email protected])
from Bio import SeqIO
import argparse

parser = argparse.ArgumentParser(
    description="Split the fasta file into individual file with each gene seq")
parser.add_argument('-f',
                    action='store',
                    dest='fasta_file',
                    help='Input fasta file')
result = parser.parse_args()

# Write every record to its own "<record id>.fasta" file in the current
# directory. Context managers close both files even if parsing or writing
# fails part-way through.
# NOTE(review): "rU" is kept because the script targets Python 2.7; Python
# 3.11+ rejects that mode - switch to "r" when porting.
with open(result.fasta_file, "rU") as f_open:
    for rec in SeqIO.parse(f_open, "fasta"):
        # The record id is used verbatim as the file name.
        with open(rec.id + ".fasta", "w") as id_file:
            id_file.write(">" + str(rec.id) + "\n" + str(rec.seq))
def load_ref(ref_g):
    """Load a FASTA reference genome into a dictionary of records.

    Args:
        ref_g: path or handle of the FASTA file, as accepted by
            ``SeqIO.parse``.

    Returns:
        The mapping built by ``SeqIO.to_dict`` from the parsed records.
    """
    logging.info("Loading reference genome...")
    records = SeqIO.parse(ref_g, "fasta")
    genome = SeqIO.to_dict(records)
    return genome
searched_genes = [ "SAOUHSC_01852", "SAOUHSC_01482", "SAOUHSC_00832", "SAOUHSC_01699", "SAOUHSC_01635", "SAOUHSC_01481", "SAOUHSC_01483" ] input_file = "Aureus.gb" #Your GenBank file locataion. e.g C:\\Sequences\\my_genbank.gb output_file_name = "try.fasta" #The name out your fasta output accession_numbers = [ line.strip() for line in open('Aureus.gb') ] #the same as your input file, defines the headers for each sequence if not os.path.exists( output_file_name ): #checks for a pre-existing file with the same name as the output for rec in SeqIO.parse( input_file, "gb" ): #calls the record for the genbank file and SeqIO (BioPython module) to parse it acc = rec.annotations['accessions'][0] #Defines your accession numbers organism = rec.annotations['organism'] #defines your organism tax_line = ("| ").join( rec.annotations['taxonomy'] ) #defines your taxonomy and seperates entries with a |, remove this line, the 'tax_line', and the {2} in your save for a simpler output for i in range(len(searched_genes)): for feature in rec.features: #looks for features in the genbank if feature.type == 'CDS': for key, val in feature.qualifiers.items( ): #looks for val in the feature qualifiers if any( searched_genes[i] in s for s in val ): #Finds all the CDS which contain the word "protein" in the qualifiers. Change to 'if "Name" in val:' for protein called "name" exactly with open(
import sys, os from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord if len(sys.argv) < 4: print( "Usage: python combine_alignments.py exon.fasta intron.fasta[or any value if no intron] geneName" ) sys.exit(1) exon_fn = sys.argv[1] intron_fn = sys.argv[2] geneName = sys.argv[3] exon_dict = SeqIO.to_dict(SeqIO.parse(exon_fn, 'fasta')) exonLength = len(next(exon_dict.itervalues())) if os.path.isfile(intron_fn): with open("{}.combined.fasta".format(geneName), 'w') as outfile: for seq in SeqIO.parse(intron_fn, 'fasta'): intronLength = len(seq) sampleID = seq.id.split("-")[0] newseq = exon_dict[sampleID].seq + seq.seq outfile.write(">{}\n{}\n".format(sampleID, newseq)) partition = """ begin sets; charset codon1 = 1-{}\\3; charset codon2 = 2-{}\\3; charset codon3 = 3-{}\\3; charset intron = {}-{};
def removeAntiSense(input, readTuple, output):
    '''
    function will map reads to the input transcripts, determine strandedness, and then filter
    out transcripts that were assembled in antisense orientation. idea here is that the antisense
    transcripts, while potentially valid, aren't going to help update the gene models and perhaps
    could hurt the annotation effort?

    input:     path to the FASTA file of Trinity transcripts
    readTuple: read files; items 0 and 1 are passed to the aligner as -1/-2,
               item 2 (if set) is passed as -U
    output:    path of the filtered FASTA file that is written

    The BAM file is only (re)built when it does not exist yet in tmpdir.
    '''
    lib.log.info("Running anti-sense filtering of Trinity transcripts")
    # Half of the requested CPUs, rounded up, for the BAM compression threads.
    # Same value as the former "(args.cpus + 2 // 2) // 2", where "2 // 2"
    # was evaluated first.
    bamthreads = (args.cpus + 1) // 2
    aligner = choose_aligner()
    # NOTE(review): bowtie2bam is only assigned in the three branches below;
    # any other aligner name fails later with an unbound-variable error.
    if aligner == 'hisat2':
        bowtie2bam = os.path.join(tmpdir, 'hisat2.transcripts.coordSorted.bam')
        if not os.path.isfile(bowtie2bam):
            lib.log.info("Building Hisat2 index of " +
                         "{0:,}".format(lib.countfasta(input)) +
                         " trinity transcripts")
            cmd = [
                'hisat2-build', input,
                os.path.join(tmpdir, 'hisat2.transcripts')
            ]
            lib.runSubprocess4(cmd, '.', lib.log)

            # now launch the aligner
            lib.log.info("Aligning reads to trinity transcripts with Hisat2")
            hisat2cmd = [
                'hisat2', '-p',
                str(args.cpus), '-k', '50', '--max-intronlen',
                str(args.max_intronlen), '-x',
                os.path.join(tmpdir, 'hisat2.transcripts')
            ]
            if readTuple[2]:
                hisat2cmd = hisat2cmd + ['-U', readTuple[2]]
            if readTuple[0] and readTuple[1]:
                hisat2cmd = hisat2cmd + [
                    '-1', readTuple[0], '-2', readTuple[1]
                ]
            # sam2bam.sh receives the aligner command as one joined string.
            cmd = [
                os.path.join(parentdir, 'util', 'sam2bam.sh'),
                " ".join(hisat2cmd),
                str(bamthreads), bowtie2bam
            ]
            lib.runSubprocess4(cmd, '.', lib.log)
    elif aligner == 'bowtie2':
        # using bowtie2
        bowtie2bam = os.path.join(tmpdir,
                                  'bowtie2.transcripts.coordSorted.bam')
        if not os.path.isfile(bowtie2bam):
            lib.log.info("Building Bowtie2 index of " +
                         "{0:,}".format(lib.countfasta(input)) +
                         " trinity transcripts")
            cmd = [
                'bowtie2-build', input,
                os.path.join(tmpdir, 'bowtie2.transcripts')
            ]
            lib.runSubprocess4(cmd, '.', lib.log)

            # now launch the subprocess commands in order
            lib.log.info("Aligning reads to trinity transcripts with Bowtie2")
            bowtie2cmd = [
                'bowtie2', '-p',
                str(args.cpus), '-k', '50', '--local', '--no-unal', '-x',
                os.path.join(tmpdir, 'bowtie2.transcripts')
            ]
            if readTuple[2]:
                bowtie2cmd = bowtie2cmd + ['-U', readTuple[2]]
            if readTuple[0] and readTuple[1]:
                bowtie2cmd = bowtie2cmd + [
                    '-1', readTuple[0], '-2', readTuple[1]
                ]
            cmd = [
                os.path.join(parentdir, 'util', 'sam2bam.sh'),
                " ".join(bowtie2cmd),
                str(bamthreads), bowtie2bam
            ]
            lib.runSubprocess4(cmd, '.', lib.log)
    elif aligner == 'rapmap':
        # using rapmap (the variable keeps its historical "bowtie2bam" name)
        bowtie2bam = os.path.join(tmpdir,
                                  'rapmap.transcripts.coordSorted.bam')
        if not os.path.isfile(bowtie2bam):
            lib.log.info("Building RapMap index of " +
                         "{0:,}".format(lib.countfasta(input)) +
                         " trinity transcripts")
            cmd = [
                'rapmap', 'quasiindex', '-t', input, '-i',
                os.path.join(tmpdir, 'rapmap_index')
            ]
            lib.runSubprocess4(cmd, '.', lib.log)

            # now launch the subprocess commands in order
            lib.log.info("Aligning reads to trinity transcripts with RapMap")
            # Unlike the other aligners, the paired reads are always passed.
            rapmapcmd = [
                'rapmap', 'quasimap', '-t',
                str(args.cpus), '-i',
                os.path.join(tmpdir, 'rapmap_index'), '-1', readTuple[0],
                '-2', readTuple[1]
            ]
            cmd = [
                os.path.join(parentdir, 'util', 'sam2bam.sh'),
                " ".join(rapmapcmd),
                str(bamthreads), bowtie2bam
            ]
            lib.runSubprocess(cmd, '.', lib.log)

    # now run Trinity examine strandeness tool
    lib.log.info("Examining strand specificity")
    cmd = [
        os.path.join(TRINITY, 'util', 'misc',
                     'examine_strand_specificity.pl'), bowtie2bam,
        os.path.join(tmpdir, 'strand_specific')
    ]
    lib.runSubprocess(cmd, '.', lib.log)

    # parse output dat file and get list of transcripts to remove
    removeList = []
    # Plain 'r': the 'U' flag is rejected by open() on Python 3.11+.
    with open(os.path.join(tmpdir, 'strand_specific.dat'), 'r') as infile:
        for line in infile:
            line = line.replace('\n', '')
            if line.startswith('#'):
                continue
            cols = line.split('\t')
            if args.stranded == 'RF':
                # then we want to keep negative ratios in cols[4]
                if not cols[4].startswith('-'):
                    removeList.append(cols[0])
            elif args.stranded == 'FR':
                # keep + values
                if cols[4].startswith('-'):
                    removeList.append(cols[0])

    # Set for O(1) membership tests; the list is kept for the final count so
    # the logged number is unchanged.
    removeSet = set(removeList)

    # now parse the input fasta file removing records in list
    with open(output, 'w') as outfile:
        with open(input, 'r') as infile:
            for record in SeqIO.parse(infile, 'fasta'):
                if record.id not in removeSet:
                    outfile.write(">%s\n%s\n" %
                                  (record.description, str(record.seq)))
    lib.log.info("Removing %i antisense transcripts" % (len(removeList)))
from Bio import SeqIO
import sys

if len(sys.argv) != 5:
    print("Usage: python simple_trim.py <input_file> <output_file> <start_position> <end_position>")
else:
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    start = int(sys.argv[3])
    end = int(sys.argv[4])

    # Open the output once. Reopening it in "w" mode for every record
    # truncated the file each time, so only the last record survived.
    with open(output_file, "w") as out:
        for index, seq_record in enumerate(SeqIO.parse(input_file, "fasta")):
            # Separate records with a newline; nothing is appended after the
            # last one, so a single-record input gives the same bytes as before.
            if index:
                out.write("\n")
            out.write(">" + seq_record.id + "\n")
            # Python slice semantics: start is 0-based, end is exclusive.
            out.write(str(seq_record.seq[start:end]))
def fasta_id(fastafile):
    """Collect the record ids of a FASTA file.

    Args:
        fastafile: path or handle accepted by ``SeqIO.parse``.

    Returns:
        defaultdict: every record id found maps to 1. Because the default
        factory is ``str``, looking up an unknown id yields '' and inserts it.
    """
    seen_ids = defaultdict(str)
    seen_ids.update(
        (entry.id, 1) for entry in SeqIO.parse(fastafile, "fasta"))
    return seen_ids
def postprocess_chains(assembly, reads_real_coords):
    """Pick the best seed chain(s) of every read and write them to a file.

    Alignment header lines ("Aln ...") and the seed-position lines that follow
    them are read from ``assembly.chains_fname``. For each alignment the seeds
    are sorted by reference position and split into chains wherever the
    spacing on the read and on the reference disagree. For each read the
    alignment with the most chained k-mers is kept, and its chains are written
    as tab-separated lines to ``assembly.bed_fname``.

    Args:
        assembly: object whose ``kmers_fname``, ``solid_kmers_fname``,
            ``fname``, ``chains_fname``, ``bed_fname`` and ``real_coords``
            attributes are read here.
        reads_real_coords: indexed as ``reads_real_coords[read_name][pos]``;
            only used when ``assembly.real_coords`` is truthy.

    Returns:
        list: tuples ``(previous seed ref pos, seed ref pos, read_name,
        ref_diff - read_diff)`` recorded at the spacing disagreements of the
        selected alignments.
    """
    rare_kmers = get_kmers(assembly.kmers_fname)
    unique_kmers = get_kmers(assembly.solid_kmers_fname)
    rare_kmers_by_pos = []
    unique_kmers_by_pos = []
    assembly_id = ""
    # After the loop the assembly_* names describe the LAST record of the file.
    with open(assembly.fname) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            assembly_len = len(record.seq)
            assembly_seq = str(record.seq)
            assembly_id = str(record.id)
    # Disabled code kept as a bare string literal. With it disabled,
    # rare_kmers_by_pos and unique_kmers_by_pos stay empty lists, and
    # rare_kmers / unique_kmers are not used anywhere else in this function.
    '''rare_kmers_by_pos = [0] * assembly_len unique_kmers_by_pos = [0] * assembly_len for i in range(len(assembly_seq) - KMER_SIZE + 1): kmer = assembly_seq[i:i + KMER_SIZE] if kmer in rare_kmers or rev_comp(kmer) in rare_kmers: rare_kmers_by_pos[i] = 1 if kmer in unique_kmers or rev_comp(kmer) in unique_kmers: unique_kmers_by_pos[i] = 1'''

    # read name (without its first character) -> list of alignment tuples
    read_alignments = defaultdict(list)
    # read name (without its first character) -> read length
    read_lengths = dict()
    # read name -> alignment tuple -> list of (read_pos, ref_pos) seeds
    read_seeds = defaultdict(lambda: defaultdict(list))
    with open(assembly.chains_fname) as f:
        #Aln -a3aa51bb-4cZ 4 23715 24082 +cenX 1894557 1918322 3053970 -1894573 1135315 140 0.27543
        for line in f:
            fs = line.split()
            # NOTE(review): fs[8] is read below although only len(fs) >= 8 is
            # checked; a header with exactly 8 fields would raise IndexError.
            if "Aln" in line and len(fs) >= 8:
                read_name, align_start, align_end, read_len, \
                ref_name, ref_start, ref_end, ref_len = fs[1], fs[2], fs[3], fs[4], fs[5], fs[6], fs[7], fs[8]
                align_start, align_end, read_len, ref_start, ref_end, ref_len = map(
                    int, (align_start, align_end, read_len, ref_start,
                          ref_end, ref_len))
                seq_div = float(fs[-1])
                read_lengths[read_name[1:]] = read_len
                # A leading '-' on a name flips the coordinates to the other
                # end of the sequence.
                if read_name.startswith('-'):
                    align_start, align_end = read_len - align_end - 1, read_len - align_start - 1
                if ref_name.startswith('-'):
                    ref_start, ref_end = ref_len - ref_end - 1, ref_len - ref_start - 1
                read_alignments[read_name[1:]].append(
                    (ref_start, ref_end, align_start, align_end, seq_div))
            elif len(fs) >= 2:
                # Seed line: relies on read_name, ref_name and the coordinates
                # left over from the most recent "Aln" header line.
                read_pos, ref_pos = int(fs[0]), int(fs[1])
                if read_name.startswith('-'):
                    read_pos = read_len - read_pos - KMER_SIZE
                if ref_name.startswith('-'):
                    ref_pos = ref_len - ref_pos - KMER_SIZE
                read_seeds[read_name[1:]][(ref_start, ref_end, align_start,
                                           align_end, seq_div)].append(
                                               (read_pos, ref_pos))
    num_alignments = 0
    all_errors = []
    with open(assembly.bed_fname, "w") as f:
        for read_name, aligns in read_alignments.items():
            # Best alignment seen so far for this read.
            max_kmers = 0
            max_len = 0
            selected_chain = []
            selected_errors = []
            for align in aligns:
                seeds = read_seeds[read_name][align]
                # Order the seeds by reference position.
                seeds.sort(key=lambda x: x[1])
                best_chain = None
                best_kmers = 0
                best_len = 0
                best_errors = []
                cur_errors = []
                if len(seeds) >= MIN_CHAIN_KMERS:
                    prev_pos = 0
                    # unique_seeds is built but not used afterwards (the
                    # assignment back to seeds is commented out below).
                    unique_seeds = []
                    for seed in seeds:
                        read_pos, ref_pos = seed
                        if ref_pos - prev_pos >= KMER_SIZE or not unique_seeds:
                            unique_seeds.append((read_pos, ref_pos))
                            prev_pos = ref_pos
                    unique_seeds.append(seeds[-1])
                    # seeds = unique_seeds
                    new_chains = []
                    breakpoints = []
                    if assembly.real_coords:
                        # Translate both seed coordinates through the lookup
                        # tables.
                        for i, s in enumerate(seeds):
                            seeds[i] = (
                                reads_real_coords[read_name][s[0]],
                                assembly.real_coords[assembly_id][s[1]])
                    for i in range(1, len(seeds)):
                        ref_diff = abs(seeds[i][1] - seeds[i - 1][1])
                        read_diff = abs(seeds[i][0] - seeds[i - 1][0])
                        # Tolerated disagreement: at least 100, or 5% of the
                        # smaller of the two spacings.
                        max_diff = max(100, min(ref_diff, read_diff) * 0.05)
                        if abs(
                                ref_diff - read_diff
                        ) >= max_diff:  # and abs(ref_diff-read_diff) >= 5000:
                            # prev_bp is assigned here and below but never read
                            # in this function.
                            prev_bp = i - 1
                            breakpoints.append(i - 1)
                            cur_errors.append(
                                (seeds[i - 1][1], seeds[i][1], read_name,
                                 ref_diff - read_diff))
                        elif seeds[i][1] - seeds[i - 1][1] >= MAX_REF_GAP:
                            gap_s, gap_e = seeds[i - 1][1] + KMER_SIZE, seeds[i][1]
                            # rare_kmers_by_pos is an empty list here (see the
                            # disabled code above), so this sum is always 0.
                            if sum(rare_kmers_by_pos[gap_s:gap_e]
                                   ) > MAX_MISSED_KMERS:
                                breakpoints.append(i - 1)
                                prev_bp = i - 1
                    if breakpoints:
                        chain_start1, chain_end1, chain_start2, chain_end2 = seeds[
                            0][1], seeds[-1][1], seeds[0][0], seeds[-1][0]
                        start_n = 0
                        # Cut one chain per breakpoint; each chain tuple ends
                        # with its number of seeds.
                        for p in breakpoints:
                            chain_end1, chain_end2 = seeds[p][1], seeds[p][0]
                            new_chains.append(
                                (chain_start1, chain_end1, chain_start2,
                                 chain_end2, p - start_n + 1))
                            if p < len(seeds):
                                chain_start1, chain_start2, start_n = seeds[
                                    p + 1][1], seeds[p + 1][0], p + 1
                        # Remaining seeds after the last breakpoint.
                        chain_end1, chain_end2 = seeds[-1][1], seeds[-1][0]
                        if chain_end1 > chain_start1:
                            new_chains.append(
                                (chain_start1, chain_end1, chain_start2,
                                 chain_end2, len(seeds) - p))
                        chains = []
                        total_kmers = 0
                        total_len = 0
                        # Keep only chains that are long enough in both seed
                        # count and reference span.
                        for c in new_chains:
                            chain_start1, chain_end1, chain_start2, chain_end2, chain_kmers = c
                            if chain_kmers > MIN_CHAIN_KMERS and chain_end1 - chain_start1 >= MIN_CHAIN_LEN:
                                chains.append((chain_start1, chain_end1,
                                               chain_start2, chain_end2))
                                total_kmers += chain_kmers
                                total_len += chain_end1 - chain_start1
                        if total_kmers > best_kmers:
                            best_kmers = total_kmers
                            best_len = total_len
                            best_chain = chains
                            best_errors = cur_errors
                    else:
                        # No breakpoint: the whole alignment is one chain.
                        best_kmers = len(
                            seeds) if len(seeds) > MIN_CHAIN_KMERS / 2 else 0
                        best_len = seeds[-1][1] - seeds[0][1]
                        best_chain = [[
                            seeds[0][1], seeds[-1][1], seeds[0][0],
                            seeds[-1][0]
                        ]]
                        best_errors = []
                # Alignments spanning fewer than 100 reference positions are
                # ignored.
                if best_len < 100:
                    continue
                if best_kmers > max_kmers:
                    max_kmers = best_kmers
                    max_len = best_len
                    selected_chain = best_chain
                    selected_errors = best_errors
            for c in selected_chain:
                ref_start, ref_end, align_start, align_end = c
                if (ref_end - ref_start) < MIN_CHAIN_LEN:
                    continue
                num_alignments += 1
                # Columns: "seq", ref start, ref end, slugified read name,
                # read start, read end, read length.
                f.write("seq\t%d\t%d\t%s\t%d\t%d\t%d\n" %
                        (ref_start, ref_end, slugify(read_name), align_start,
                         align_end, read_lengths[read_name]))
            all_errors.extend(selected_errors)
    print(" Total %d alignments" % num_alignments)
    print(" Longest chains saved to %s" % assembly.bed_fname)
    return all_errors
"CO2" : 'COX2', 'ATP8': "ATP8", 'ATP6': "ATP6", "CO3" : 'COX3', 'ND3': "ND3", "ND4L" : 'ND4L', 'ND4': "ND4", 'ND5': "ND5", 'ND6': "ND6", "CYB" : 'CYTB'} position = {1 : 1, 2 : 2, 0 : 3} # REFERENCE # Parse genome reference and get gene refs refGenes = {"geneName" : [], "startEnd" : [], "sequence" : []} refGenbank = os.path.join(dirname, "../../Body/3Results/NC_012920.1.gb") refGenbank = open(refGenbank) for rec in SeqIO.parse(refGenbank, "genbank"): if rec.features: for feature in rec.features: if feature.type == "CDS": # Extract gene name and append to dict featureName = feature.qualifiers['gene'][0] refGenes["geneName"].append(featureName) # Extract gene location as range and append to dict featureLocation = [int(feature.location.start), int(feature.location.end)] refGenes["startEnd"].append(featureLocation) # Extract gene sequence string and append to dict featureSequence = str(feature.location.extract(rec).seq) refGenes["sequence"].append(featureSequence) # Turn into data frame refGenes = pd.DataFrame.from_dict(refGenes)
def open_genome():
    """Load the FASTA file named by the module-level ``genome_seq``.

    Returns:
        The mapping built by ``SeqIO.to_dict`` from the parsed records.
    """
    # The context manager closes the handle even if parsing fails. Plain 'r'
    # is used because open() rejects the 'U' flag on Python 3.11+.
    with open(genome_seq, "r") as handle:
        return SeqIO.to_dict(SeqIO.parse(handle, "fasta"))