def run_random_select(self):
    """Convert the merged FASTQ of this sample to FASTA and subsample it.

    The input is ``<sample_name>.merged.primer_trim.len_trim.fastq`` inside
    ``self.hulk_sample_dir_path``.  It is converted to FASTA and handed to
    the external ``java SeqRandomSampling`` tool with ``-c self.random_num``.

    The work happens from inside the sample directory because the Java
    tool does not seem to take absolute paths for its input files.  The
    previous working directory is restored even when a step fails.

    Returns:
        None.  If the merged FASTQ is missing, a message is printed and
        nothing is converted or executed.
    """
    merged_fq = '{}{}'.format(
        self.sample_info.sample_name,
        '.merged.primer_trim.len_trim.fastq'
    )
    merged_fa = os.path.splitext(merged_fq)[0] + '.fasta'

    cur_dir = os.getcwd()
    os.chdir(self.hulk_sample_dir_path)
    try:
        # Check for the input *before* converting; previously the check
        # came after SeqIO.convert and could never report a missing file.
        if not os.path.exists(merged_fq):
            print('{} is not found in {}'.format(
                merged_fq,
                self.hulk_sample_dir_path,
            ))
            return

        SeqIO.convert(merged_fq, 'fastq', merged_fa, 'fasta')

        jar_cmd = ['java SeqRandomSampling',
                   '-c ', self.random_num,
                   '-i', merged_fa,
                   ]
        final_jar_cmd = ' '.join(str(x) for x in jar_cmd)
        os.system(final_jar_cmd)
    finally:
        # Always return to the caller's working directory.
        os.chdir(cur_dir)
def main(argv):
    """Command-line entry point: convert a FASTQ file to FASTA.

    Args:
        argv: Argument list without the program name.  ``-i <inputfile>``
            selects the FASTQ input; ``-h`` prints usage and exits.

    The output is written next to the input with the extension replaced
    by ``.fasta``.  On a syntax error (unknown option or missing ``-i``)
    a message is printed, ``fastq2fasta_error-log.txt`` is written and the
    process exits with status 2.
    """
    def _syntax_error():
        # Shared failure path for bad options and a missing input file.
        print("Incorrect syntax: Use '-h' for help.")
        with open('fastq2fasta_error-log.txt', 'w') as error:
            error.write("Syntax Error\n")
        sys.exit(2)

    inputfile = ''
    outputfile = ''
    try:
        opts, args = getopt.getopt(argv, 'hi:')
    except getopt.GetoptError:
        _syntax_error()

    for opt, arg in opts:
        if opt == '-h':
            print('Use Syntax: fastq2fasta.py -i <inputfile>')
            sys.exit()
        elif opt == '-i':
            inputfile = arg
            filename, extension = os.path.splitext(arg)
            outputfile = '%s.fasta' % (filename)

    if inputfile == '':
        _syntax_error()

    # SeqIO.convert opens and closes both files itself.  The previous code
    # also held its own write handle on the output and wrote the output
    # *filename* into it, corrupting the start of the FASTA file.
    SeqIO.convert(inputfile, "fastq", outputfile, "fasta")
    print('Your FASTQ has been converted. See %s' % (outputfile))
def align(arguments):
    """
    Align sequences to a reference package alignment.

    The aligner is looked up in ``ALIGNERS`` by ``arguments.profile_version``
    (or, when that is unset, by ``refpkg.guess_align_method()``).  It writes
    into a temporary file created next to ``arguments.outfile``; the result
    is then either renamed into place or converted to the requested
    ``arguments.output_format``.

    Returns whatever the selected alignment function returns.
    """
    package = arguments.refpkg
    method = arguments.profile_version or package.guess_align_method()
    run_aligner = ALIGNERS[method]
    options = arguments.alignment_options or ALIGNMENT_DEFAULTS.get(method)

    out_dir = os.path.dirname(arguments.outfile)
    with _temp_file(prefix='.refpkg_align', dir=out_dir) as scratch:
        # Only the name is needed; the aligner writes the file itself.
        scratch.close()
        result = run_aligner(package,
                             arguments.seqfile,
                             scratch.name,
                             use_mask=arguments.use_mask,
                             use_mpi=arguments.use_mpi,
                             mpi_args=arguments.mpi_arguments,
                             mpi_program=arguments.mpi_run,
                             alignment_options=options,
                             stdout=arguments.stdout)

        if (arguments.output_format
                and arguments.output_format != DEFAULT_FORMAT[method]):
            # The caller asked for a format other than the aligner's own.
            SeqIO.convert(scratch.name, DEFAULT_FORMAT[method],
                          arguments.outfile, arguments.output_format)
        else:
            # No format conversion needed: move the file into place.
            os.rename(scratch.name, arguments.outfile)
    return result
def stockfastaconvert(options):
    '''
    Conversion of multiple alignment - Stockholm to Multifasta

    Worker loop: takes family names from the module-level queue ``q``
    until it stays empty for 0.1 s.  For each family whose Stockholm file
    exists under ``options.dbdir + "align/stockholm/"`` it writes a FASTA
    alignment, writes the record ids to ``align/gene_list/<FAM>.gene`` and
    deletes the Stockholm file.  Family names are echoed through the
    module-level queue ``n`` in groups of ten as progress output.
    '''
    align_dir = options.dbdir + "align/"
    while True:
        try:
            fam = q.get(block=True, timeout=0.1)
        except Empty:
            # Work queue drained: print whatever progress names remain.
            pending = n.qsize()
            if pending > 0:
                print(" ".join(str(n.get()) for _ in range(pending)))
            break

        n.put(fam)
        if n.qsize() >= 10:
            print(" ".join(str(n.get()) for _ in range(10)))

        stockholm_path = align_dir + "stockholm/" + fam + ".stockholm"
        fasta_path = align_dir + fam + ".fasta"
        if os.path.exists(stockholm_path):
            if os.path.exists(fasta_path):
                # os.remove takes a single path; the stray second
                # argument used to raise TypeError here.
                os.remove(fasta_path)
            SeqIO.convert(stockholm_path, "stockholm", fasta_path, "fasta")

            # NOTE(review): the FASTA is written as <fam>.fasta but read
            # back as <FAM>.fasta; this only matches when fam is already
            # upper-case or the filesystem ignores case - confirm.
            with open(align_dir + fam.upper() + ".fasta", "r") as handle:
                genes = set(rec.id for rec in SeqIO.parse(handle, "fasta"))

            gene_path = align_dir + "gene_list/" + fam.upper() + ".gene"
            with open(gene_path, "w") as handle:
                for gene in genes:
                    handle.write(gene + "\n")

            os.remove(stockholm_path)
        q.task_done()
def simple_check(self, base_name, in_variant):
    """Check FASTQ conversion of one example file into every variant.

    For each of the sanger, solexa and illumina output variants the
    converted text must equal the reference file
    ``Quality/<base_name>_as_<variant>.fastq``, both via ``SeqIO.convert``
    and via ``SeqIO.parse`` followed by ``SeqIO.write``.
    """
    source_format = "fastq-" + in_variant
    for out_variant in ["sanger", "solexa", "illumina"]:
        in_filename = "Quality/%s_original_%s.fastq" \
                      % (base_name, in_variant)
        self.assertTrue(os.path.isfile(in_filename))
        target_format = "fastq-" + out_variant

        # Load the reference output...
        reference_name = "Quality/%s_as_%s.fastq" % (base_name, out_variant)
        with open(reference_name, _universal_read_mode) as reference:
            expected = reference.read()

        with warnings.catch_warnings():
            if out_variant != "sanger":
                # Ignore data loss warnings from max qualities
                warnings.simplefilter("ignore", BiopythonWarning)
                warnings.simplefilter("ignore", UserWarning)

            # Check matches using convert...
            converted = StringIO()
            SeqIO.convert(in_filename, source_format,
                          converted, target_format)
            self.assertEqual(expected, converted.getvalue())

            # Check matches using parse/write
            rewritten = StringIO()
            SeqIO.write(SeqIO.parse(in_filename, source_format),
                        rewritten, target_format)
            self.assertEqual(expected, rewritten.getvalue())
def run(self, edit):
    """Convert each selected GenBank text to FASTA in a new file.

    Stops with an error dialog at the first selection that is empty or
    does not start with ``LOCUS``.
    """
    for region in self.view.sel():
        selection = self.view.substr(region).strip()

        if not selection:
            sublime.error_message("No selected text")
            return

        # SeqIO can handle GenBank text that does not end in '//', so only
        # the start of the selection is checked.
        if not selection.startswith('LOCUS'):
            sublime.error_message(
                "Selected text does not look like Genbank: no 'LOCUS'")
            return

        # Read from a string and write to a string
        fasta_buffer = io.StringIO()
        with io.StringIO(selection) as genbank_buffer:
            SeqIO.convert(genbank_buffer, 'genbank', fasta_buffer, 'fasta')

        # Write the fasta string to a new window at position 0
        new_view = self.view.window().new_file()
        new_view.insert(edit, 0, fasta_buffer.getvalue())
def main(gbdir, outdir):
    """BLAST every organism's spacers against its own genome.

    Args:
        gbdir: Directory holding ``<accession>.gb`` files; a missing file
            is fetched with ``fetch``.
        outdir: Directory receiving one ``<accession>.json`` blastn report
            (``-outfmt 15``) per organism.

    Both directories are created if needed.  The two scratch FASTA files
    are written to the current directory and are removed on exit, also
    when a step fails or when there was no organism to process.
    """
    os.makedirs(gbdir, exist_ok=True)
    os.makedirs(outdir, exist_ok=True)
    tempq = 'tempquery.fasta'
    tempdb = 'tempdb.fasta'
    try:
        for org in tqdm(Organism.objects.all()):
            # get genbank and convert to fasta
            fpath = os.path.join(gbdir, '{}.gb'.format(org.accession))
            if not os.path.isfile(fpath):
                print('\nFetching {} with accession {}'.format(
                    org.name,
                    org.accession
                ))
                fetch(fpath)
            SeqIO.convert(fpath, 'genbank', tempdb, 'fasta')

            # get spacers of organism and convert to fasta
            spacers = Spacer.objects.filter(loci__organism=org)
            fastatext = ''.join(['>{}\n{}\n'.format(spacer.id, spacer.sequence)
                                 for spacer in spacers])
            with open(tempq, 'w') as f:
                f.write(fastatext)

            # run blast and save output
            outpath = os.path.join(outdir, '{}.json'.format(org.accession))
            commandargs = ['blastn', '-query', tempq,
                           '-subject', tempdb, '-out', outpath,
                           '-outfmt', '15']
            subprocess.run(commandargs, stdout=subprocess.DEVNULL)
    finally:
        # Clean up only what exists: with no organisms the scratch files
        # are never created and an unconditional remove would raise.
        for scratch in (tempq, tempdb):
            if os.path.exists(scratch):
                os.remove(scratch)
def roundtrip_check(filepath):
    """Report how the size of a GenBank file changes on a round trip.

    Is there any odd data in the GenBank file that will get lost, bar the
    known things addressed with Botch()?  The file is rewritten as GenBank
    to ``test.gbk`` and both file sizes are printed; the scratch file is
    removed afterwards.

    Args:
        filepath: Path of the GenBank file to check.
    """
    import os
    roundtrip_path = "test.gbk"
    try:
        SeqIO.convert(filepath, "genbank", roundtrip_path, "genbank")
        # Compare against the file that was actually converted; the old
        # code always measured a hard-coded "TMO.gbk".
        print("went from " + str(os.stat(filepath).st_size) + " to " +
              str(os.stat(roundtrip_path).st_size))
    finally:
        if os.path.exists(roundtrip_path):
            os.remove(roundtrip_path)
def runSeqGen(workingFile, srp_hap_file, srp_tree_file, debug):
    """Run seq-gen until all sequences are unique, then export FASTA.

    seq-gen reads ``<workingFile>_seqgen.phylip`` with the ``-mHKY``,
    ``-t2`` and ``-k1`` options.  After every 100 runs without a unique
    result, ``runBSSC`` is called again before retrying.  The accepted
    output is written to ``<workingFile>_seqgen_out.phylip``, converted to
    ``<workingFile>.fasta`` and copied to ``srp_hap_file``.
    """
    phylip_in = workingFile + "_seqgen.phylip"
    phylip_out = workingFile + "_seqgen_out.phylip"
    fasta_out = workingFile + ".fasta"

    seqgen = runExtProg(seqgenDir + "./seq-gen", pdir=seqgenDir, length=3)
    for position, flag in enumerate(("-mHKY", "-t2", "-k1"), 1):
        seqgen.set_param_at(flag, position)
    seqgen.set_stdin(phylip_in)

    attempts = 0
    while True:
        if attempts == 100:
            runBSSC(workingFile, srp_tree_file, debug)
            print("==========rerun BSSC========")
            attempts = 0
        attempts += 1

        seqgen.run(0)
        if check_unique_sequences(seqgen):
            break

    with open(phylip_out, "w") as out_handle:
        out_handle.write(seqgen.output)

    SeqIO.convert(phylip_out, "phylip", fasta_out, "fasta")
    shutil.copy(fasta_out, srp_hap_file)
def convertQ2A(qd, qname):
    """Convert ``<qd>/<qname>.fastq`` to FASTA and return the FASTA path."""
    stem = qd + "/" + qname
    cpath = stem + ".fasta"
    SeqIO.convert(stem + ".fastq", "fastq", cpath, "fasta")
    print("converted file to fasta")
    return cpath
def illumina2sangerFq(inputfile):
    """Convert an Illumina-encoded FASTQ file to Sanger-encoded FASTQ.

    Args:
        inputfile: Path of the ``fastq-illumina`` file.  The output name
            is built by dropping the last three characters (for example
            ``.fq``) and appending ``.fastq``.

    Returns:
        The path of the file that was written.
    """
    # The leftover debugging call that dumped help(SeqIO.convert) on every
    # invocation has been removed.
    filename = inputfile[:-3] + '.fastq'
    SeqIO.convert(inputfile, "fastq-illumina", filename, "fastq")
    return filename
def toNexus(listOfFiles):
    """Convert FASTA files to NEXUS files in the current directory.

    Args:
        listOfFiles: Paths of FASTA files.  Each output name is the input
            name with ``.fasta`` replaced by ``.nex`` and its leading
            directories replaced by the current working directory.

    Returns:
        The current working directory, also for an empty input list.
    """
    cwd = os.getcwd()
    for fasta_path in listOfFiles:
        nexus_path = fasta_path.replace(".fasta", ".nex")
        # A function replacement keeps backslashes in cwd from being read
        # as regex escape sequences.
        nexus_path = re.sub(r".+/.+/", lambda _match: cwd + "/", nexus_path)
        SeqIO.convert(fasta_path, "fasta", nexus_path, "nexus", generic_dna)
    return cwd
def check_convert_fails(in_filename, in_format, out_format, alphabet=None):
    """Check that convert fails with the same error as parse/write.

    Both ``SeqIO.parse`` + ``SeqIO.write`` and ``SeqIO.convert`` must
    raise ``ValueError`` for this input, with the same message.

    ``UserWarning`` is silenced when ``truncation_expected(out_format)``
    is true.  ``warnings.catch_warnings`` is used so the filter list is
    restored even though an exception is the expected outcome; the old
    ``warnings.filters.pop()`` was skipped on that path and removed the
    last filter rather than the one just inserted at the front.
    """
    qual_truncate = truncation_expected(out_format)
    # We want the SAME error message from parse/write as convert!
    err1 = None
    try:
        records = list(SeqIO.parse(in_filename, in_format, alphabet))
        handle = StringIO()
        with warnings.catch_warnings():
            if qual_truncate:
                warnings.simplefilter('ignore', UserWarning)
            SeqIO.write(records, handle, out_format)
        handle.seek(0)
        assert False, "Parse or write should have failed!"
    except ValueError as err:
        err1 = err
    # Now do the conversion...
    try:
        handle2 = StringIO()
        with warnings.catch_warnings():
            if qual_truncate:
                warnings.simplefilter('ignore', UserWarning)
            SeqIO.convert(in_filename, in_format, handle2, out_format,
                          alphabet)
        assert False, "Convert should have failed!"
    except ValueError as err2:
        assert str(err1) == str(err2), \
            "Different failures, parse/write:\n%s\nconvert:\n%s" \
            % (err1, err2)
def convert(basename, genbank):
    """Write the given GenBank file as FASTA for BLAST.

    The output is ``<basename>.fasta.tmp``; its path is returned.
    """
    fasta_path = "{}.fasta.tmp".format(basename)
    SeqIO.convert(genbank, 'genbank', fasta_path, 'fasta')
    return fasta_path
def copy_sequence(input_file, output_file):
    '''Copy sequence files from staging area

    The input may be plain, gzip- or bzip2-compressed; this is detected
    from its first two bytes.  The output is always gzip-compressed.  The
    input is first treated as Illumina-encoded FASTQ and converted to
    Sanger encoding.  If that fails with "Invalid character in quality
    string" the data is copied through unchanged instead.  Any other
    ``ValueError`` is re-raised.
    '''
    import shutil

    GZIP_HEADER = b'\x1f\x8b'
    BZIP_HEADER = b'BZ'

    pmsg('Copying sequence files', input_file, output_file)

    # Read the magic bytes in binary mode (text mode can fail to decode
    # them) and close the probe handle instead of leaking it.
    with open(input_file, 'rb') as probe:
        header = probe.read(2)

    if header == GZIP_HEADER:
        input_file_handle = gzip.open(input_file, 'rb')
    elif header == BZIP_HEADER:
        input_file_handle = BZ2File(input_file, 'r')
    else:
        input_file_handle = open(input_file, 'rb')

    output_file_handle = gzip.open(output_file, 'wb')

    # NOTE(review): the handles are binary, as before; newer Biopython
    # releases may require text-mode handles for FASTQ - confirm.
    try:
        # check whether this is a illumina or sanger fastq file
        SeqIO.convert(input_file_handle, 'fastq-illumina',
                      output_file_handle, 'fastq-sanger')
    except ValueError as e:
        # check if this is a quality score problem
        if e.args != ('Invalid character in quality string',):
            raise
        input_file_handle.seek(0)
        # Reopen the output to discard anything already written; seeking
        # a gzip write handle back to 0 cannot undo written data.
        output_file_handle.close()
        output_file_handle = gzip.open(output_file, 'wb')
        shutil.copyfileobj(input_file_handle, output_file_handle)
    finally:
        input_file_handle.close()
        output_file_handle.close()
def main():
    """Convert a FASTQ file to FASTA and, optionally, to a QUAL file.

    Options: ``-i/--input`` (FASTQ input), ``-o/--output`` (FASTA output)
    and ``-q/--qual`` (QUAL output, optional).  Returns early with a
    message when the input or the output option is missing.
    """
    parser = OptionParser()
    option_specs = (
        ("-i", "--input", "input", "read INPUT fastq file", "INPUT"),
        ("-o", "--output", "output", "write OUTPUT fasta file", "OUTPUT"),
        ("-q", "--qual", "qual", "write OUTPUT qual file", "QUAL"),
    )
    for short_flag, long_flag, dest, help_text, metavar in option_specs:
        parser.add_option(short_flag, long_flag, dest=dest,
                          help=help_text, metavar=metavar)
    (opt, args) = parser.parse_args()

    if opt.input is None:
        print("Missing input file")
        return
    if opt.output is None:
        print("Missing output file")
        return

    print("Converting files...")
    print("Creating csfasta file")
    converted = SeqIO.convert(opt.input, "fastq", opt.output, "fasta")
    print("Converted %i records" % converted)

    if opt.qual is not None:
        print("Creating Qual file")
        converted = SeqIO.convert(opt.input, "fastq", opt.qual, "qual")
        print("Converted %i qual records" % converted)
def process_file(filepath, organism):
    """Process a single file given by the user on the command line.

    The format is taken from the text after the last dot of the
    lower-cased file name (``gb`` is mapped to ``genbank``).  Anything
    that is not already FASTA is converted to ``/tmp/<name>.fasta``
    before being passed on.

    Args:
        filepath: Path of the sequence file; it must hold one record.
        organism: Passed through to ``process_efm_cli``.

    Returns:
        The value returned by ``process_efm_cli``.
    """
    fasta_filepath = filepath

    # Determine file type
    fname = filepath.split('/')[-1].lower()
    fnamesplit = fname.split('.')
    ftype = fnamesplit[-1]
    if ftype == 'gb':
        ftype = 'genbank'
    check_features = (ftype == 'genbank')

    # Open the file and get metadata
    obj_file = SeqIO.read(filepath, ftype)
    features = get_genbank_features(obj_file)
    my_seq = str(obj_file.seq)

    # Create FASTA file if necessary
    if ftype != 'fasta':
        fasta_filepath = "/tmp/" + fnamesplit[0] + ".fasta"
        with open(fasta_filepath, 'w') as handle:
            # Convert from the detected format; this used to be
            # hard-coded to "genbank" regardless of the input type.
            SeqIO.convert(filepath, ftype, handle, "fasta")

    # Process the file
    output_dict = process_efm_cli(fasta_filepath, features, my_seq,
                                  organism, check_features, fname)
    return output_dict
def prepare_data(ionfile, index_length):
    '''
    * Changes quality format from Phred to Solexa (which is required by the
      fastx-toolkit).
    * Changes sequences id to incremental numbers.
    * Creates temporal FASTA file with the indexes removed from the
      sequences.

    Files generated will be written to folder ``data/modified/``

    * ``ionfile`` argument is FASTQ format file as produced by IonTorrent
    * ``index_length`` number of base pairs of your indexes. This is
      necessary to trim the indexes before blasting the FASTA file against
      the reference gene sequences.

    Raises ``subprocess.CalledProcessError`` when one of the fastx-toolkit
    commands exits with a non-zero status.

    Example:

    >>> from pyphylogenomics import NGS
    >>> ionfile = "ionrun.fastq";
    >>> index_length = 8;
    >>> NGS.prepare_data(ionfile, index_length);
    Your file has been saved using Solexa quality format as data/modified/wrk_ionfile.fastq
    Your sequence IDs have been changed to numbers.
    The FASTA format file data/modified/wrk_ionfile.fasta has been created.
    '''
    def _run(command):
        # check_call raises on failure, so the old "if p != 0" test after
        # it could never run; report the failing command here instead.
        try:
            subprocess.check_call(command, shell=True)
        except subprocess.CalledProcessError:
            print("\nError, couldn't execute " + command)
            raise

    # create folder to keep data
    folder = os.path.join("data", "modified")
    if not os.path.exists(folder):
        os.makedirs(folder)

    # change quality format from Phred to Solexa (required by fastx-toolkit)
    # write file to work on
    wrkfile = os.path.join(folder, "wrk_ionfile.fastq")
    SeqIO.convert(ionfile, "fastq", wrkfile, "fastq-solexa")
    print("Your file has been saved using Solexa quality format as " + wrkfile)

    # change sequences id to incremental numbers
    _run("fastx_renamer -n COUNT -i " + wrkfile + " -o tmp.fastq")
    print("Your sequence IDs have been changed to numbers.")

    # replace working file with temporal file
    os.rename("tmp.fastq", wrkfile)

    # create temporal FASTA file
    _run("fastq_to_fasta -i " + wrkfile + " -o tmp.fasta")

    # trim index region
    fasta_out = os.path.join(folder, "wrk_ionfile.fasta")
    first_base = int(index_length) + 1
    _run("fastx_trimmer -f " + str(first_base) + " -i tmp.fasta "
         + "-o " + fasta_out)

    if os.path.isfile("tmp.fasta"):
        os.remove("tmp.fasta")

    print("The FASTA format file " + fasta_out + " has been created.")
def seqio(in_fhands, out_fhands, out_format, copy_if_same_format=True):
    '''It converts sequence files between formats

    Args:
        in_fhands: Input file handles; their formats are guessed.
        out_fhands: Output file handles.
        out_format: Name of the output format.
        copy_if_same_format: When the single input already has the output
            format, copy it (True) or create a relative symlink (False).

    Three cases are handled: same format in and out (copy or symlink),
    one input to one output (``SeqIO.convert``), and one input to two
    outputs with ``out_format == 'fasta'`` (FASTA plus QUAL).  Any other
    combination does nothing.

    Raises:
        MalformedFile: If a ``ValueError`` from the conversion is
            recognised by ``error_quality_disagree``.
    '''
    in_formats = [guess_format(fhand) for fhand in in_fhands]

    if (len(in_formats) == 1 and in_formats[0] == out_format and
            hasattr(in_fhands[0], 'name')):
        if copy_if_same_format:
            copyfileobj(in_fhands[0], out_fhands[0])
        else:
            rel_symlink(in_fhands[0].name, out_fhands[0].name)

    elif len(in_fhands) == 1 and len(out_fhands) == 1:
        try:
            SeqIO.convert(in_fhands[0], in_formats[0], out_fhands[0],
                          out_format)
        except ValueError as error:
            if error_quality_disagree(error):
                raise MalformedFile(str(error))
            raise

    elif (len(in_fhands) == 1 and len(out_fhands) == 2 and
          out_format == 'fasta'):
        try:
            for seq in read_seqrecords([in_fhands[0]]):
                SeqIO.write([seq], out_fhands[0], out_format)
                SeqIO.write([seq], out_fhands[1], 'qual')
        # "except ValueError, error" was Python 2-only syntax; use the
        # same "as" form as the handler above.
        except ValueError as error:
            if error_quality_disagree(error):
                raise MalformedFile(str(error))
            raise
def sync_readdata( rawdir, ngsdata ):
    '''
    Ensures that ab1 files are symlinked from the RawData/Sanger/Run
    directory and that they are then converted to fastq

    Links are created under ``<ngsdata>/ReadData/Sanger/<run name>``,
    which is created if missing.  Existing links and existing fastq files
    are left untouched.

    @param rawdir - RawData/Sanger/Run path
    @param ngsdata - Path to root NGSData directory
    '''
    raw_reads = glob( join( rawdir, '*.ab1' ) )
    readd = join( ngsdata, 'ReadData', 'Sanger', basename(rawdir) )
    if not isdir( readd ):
        os.makedirs( readd )

    for read in raw_reads:
        lnk = relpath( read, readd )
        rdpath = join( readd, basename(read) )
        if not exists( rdpath ):
            logger.info( 'Symlinking {0} to {1}'.format(rdpath, lnk) )
            os.symlink( lnk, rdpath )
        else:
            logger.info( 'Skipping existing abi file {0}'.format(rdpath) )

        # Swap only the extension; str.replace would also rewrite a
        # '.ab1' that appears earlier in the path.
        fqpath = os.path.splitext( rdpath )[0] + '.fastq'
        if not exists( fqpath ):
            logger.info( 'Converting {0} to fastq {1}'.format(rdpath,fqpath) )
            SeqIO.convert( rdpath, 'abi', fqpath, 'fastq' )
        else:
            logger.info( 'Skipping existing fastq file {0}'.format(fqpath) )
def aln2hmm(task):
    """Convert a Clustal alignment to an HMM profile.

    The first dependency of ``task`` is rewritten in Stockholm format and
    ``hmmbuild`` is run on that file to produce ``task.target``.

    Cleans: .stk
    """
    clustal_source = task.depends[0]
    stockholm_path = ext(clustal_source, 'stk')
    SeqIO.convert(str(clustal_source), 'clustal', stockholm_path, 'stockholm')
    sh('hmmbuild %s %s' % (task.target, stockholm_path))
def cmalign(infile, outfile, cpu):
    """Align ``infile`` with cmalign and write the alignment as FASTA.

    The Stockholm output goes to a temporary ``.sto`` file, which is then
    converted into ``outfile``.  Returns the scores reported by
    ``wrap.cmalign_files``.
    """
    with util.ntf(suffix='.sto') as sto_tmp, \
            open(outfile, 'w') as fasta_out:
        scores = wrap.cmalign_files(infile, sto_tmp.name, cpu=cpu)
        SeqIO.convert(sto_tmp, 'stockholm', fasta_out, 'fasta')
        fasta_out.flush()
    return scores
def convert(in_file, in_format, out_format, treeid, seq_data_type):
    """Convert ``in_file`` to ``out_format`` and delete the input.

    The result is written to ``TMP_UTILS_PATH + treeid + "." + out_format``
    and that path is returned.  The alphabet is looked up in
    ``SEQUENCE_ALPHABET`` by ``seq_data_type``.
    """
    # SeqIO responds to clustal as identifier for the format but the
    # official extension is .aln
    # NOTE(review): this remaps seq_data_type (the alphabet key), not the
    # extension of the output file - confirm that is intended.
    alphabet_key = ".aln" if seq_data_type == "clustal" else seq_data_type

    out_file = "".join([TMP_UTILS_PATH, treeid, ".", out_format])
    SeqIO.convert(in_file, in_format, out_file, out_format,
                  alphabet=SEQUENCE_ALPHABET[alphabet_key])
    os.remove(in_file)
    return out_file
def formatFasta(inFile, outFile):
    """
    Rewrite a FASTA file through SeqIO to normalise its formatting.

    Sometimes there are format errors in FASTA files; reading and writing
    the records again should correct them.

    :param inFile: A FASTA file that might contain format errors
    :param outFile: Path of the re-written FASTA file
    :return: Nothing
    """
    fasta = "fasta"
    SeqIO.convert(inFile, fasta, outFile, fasta)
def parallel(self, names, fileType='fasta', nprocs=1, **kwargs):
    """Running simulator using apply_async

    Args:
    names -- NameFile class with iter_names() method
    fileType -- sequence file format
    nprocs -- max number of parallel simulation calls
    kwargs -- passed to simulator

    Attribs added to each name instance in names:
    simReadsFile -- file name of simulated reads
    simReadsFileType -- file type (eg., 'fasta' or 'fastq')
    simReadsFileCount -- number of simulated reads

    Return:
    0 on success; 1 if any simulated reads file is missing or empty
    """
    # making list of fasta file to provide simulator call
    fastaFiles = [name.get_fastaFile() for name in names.iter_names()]

    # setting kwargs
    new_simulator = partial(self, **kwargs)

    # calling simulator
    res = parmap.map(new_simulator, fastaFiles, processes=nprocs)

    # checking that simulated reads were created for all references
    for row in res:
        simFile = row['simReadsFile']
        if simFile is None or not os.path.isfile(simFile):
            return 1
        # Index 0 of a stat result is st_mode, not the size, so the old
        # "[0] == 0" test never detected an empty file.
        if os.stat(simFile).st_size == 0:
            return 1

    # converting reads to fasta if needed
    if fileType.lower() == 'fasta':
        for result in res:
            simFile = result['simReadsFile']
            simFileType = result['simReadsFileType'].lower()
            if simFileType != 'fasta':
                fastaFile = os.path.splitext(simFile)[0] + '.fna'
                SeqIO.convert(simFile, simFileType, fastaFile, 'fasta')
                result['simReadsFile'] = fastaFile
                result['simReadsFileType'] = 'fasta'

    # setting attribs in name instances
    for i, name in enumerate(names.iter_names()):
        # read file
        simReadsFile = res[i]['simReadsFile']
        name.set_simReadsFile(simReadsFile)
        # file type
        simReadsFileType = res[i]['simReadsFileType'].lower()
        name.set_simReadsFileType(simReadsFileType)
        # number of simulated reads (counted without building a list)
        num_reads = sum(1 for _ in SeqIO.parse(simReadsFile,
                                               simReadsFileType))
        name.set_simReadsCount(num_reads)

    return 0
def setUp(self):
    """Create temporary input directories holding FASTQ and FASTA data.

    The test FASTQ is copied into the FASTQ input directory and converted
    to ``R1.fasta`` in the FASTA input directory.  Output directory paths
    are set next to the inputs but are not created here.
    """
    self.fastaInputDir = tempfile.mkdtemp()
    self.fastqInputDir = tempfile.mkdtemp()
    self.fastaOutputDir = join(dirname(self.fastaInputDir), 'fastaout')
    self.fastqOutputDir = join(dirname(self.fastqInputDir), 'fastqout')

    source_fastq = here(inputFastq)
    shutil.copy(source_fastq, self.fastqInputDir)
    fasta_target = join(self.fastaInputDir, "R1.fasta")
    SeqIO.convert(source_fastq, 'fastq', fasta_target, 'fasta')
def main(fichier):
    """Convert a FASTQ file into FASTA.

    Args:
        fichier: Path of the FASTQ input file.

    The FASTA records are written to ``output.txt`` in the current
    directory.  Both files are closed even if the conversion fails.
    """
    from Bio import SeqIO
    with open(fichier, "r") as handle, open('output.txt', 'w') as g:
        SeqIO.convert(handle, 'fastq', g, 'fasta')
def fas2clus(inFile):
    """
    Convert the input file from fasta format to clustalw format.

    The result is written to ``inFile + 'cl'``; an open read handle on
    that file is returned and the caller is responsible for closing it.

    Modules required:
    - SeqIO (from Bio)
    Usage: <file>
    """
    clustal_path = inFile + 'cl'
    SeqIO.convert(inFile, 'fasta', clustal_path, 'clustal')
    return open(clustal_path)
def main():
    """Command-line entry point: write a GenBank file as FASTA.

    Both ``-gbk`` (input) and ``-o`` (output) are required.
    """
    parser = argparse.ArgumentParser(
        description="This program parses a .gbk file to a .fasta file")
    for flag, help_text in (('-gbk', ".gbk file"),
                            ('-o', "results file name")):
        parser.add_argument(flag, nargs='?', type=str, help=help_text,
                            required=True)
    options = parser.parse_args()
    SeqIO.convert(options.gbk, "genbank", options.o, "fasta")
def convert2fastq(input, output):
    """Convert every ``.ab1`` trace in a directory to FASTQ.

    Args:
        input: Directory that is scanned for ``*.ab1`` files.
        output: Directory in which a ``tmp`` sub-directory receives the
            FASTQ files; it is created when the first trace is converted.

    Returns:
        The path of the ``tmp`` directory, also when no trace was found
        (previously that case raised ``UnboundLocalError``).
    """
    tmp = output + os.sep + "tmp"
    for entry in os.listdir(input):
        if not entry.endswith(".ab1"):
            continue
        if not os.path.exists(tmp):
            os.makedirs(tmp)
        out = tmp + os.sep + str(entry).split('.')[0] + '.fastq'
        # Close each trace file after use instead of leaking one handle
        # per converted file.
        with open(input + os.sep + entry, 'rb') as abi:
            SeqIO.convert(abi, 'abi', out, 'fastq')
    return tmp
def check_convert(in_filename, in_format, out_format, alphabet=None):
    """Check that SeqIO.convert agrees with parse followed by write.

    The records are parsed and written to memory in ``out_format``, then
    read back and compared with ``compare_records``.  The same file is
    then run through ``SeqIO.convert`` and its output must be identical.

    ``UserWarning`` is silenced when ``truncation_expected(out_format)``
    is true.  ``warnings.catch_warnings`` restores the filter list
    afterwards; the old ``warnings.filters.pop()`` removed the last
    filter rather than the one just inserted at the front.
    """
    with open(in_filename) as in_handle:
        records = list(SeqIO.parse(in_handle, in_format, alphabet))
    # Write it out...
    handle = StringIO()
    qual_truncate = truncation_expected(out_format)
    with warnings.catch_warnings():
        if qual_truncate:
            warnings.simplefilter('ignore', UserWarning)
        SeqIO.write(records, handle, out_format)
    handle.seek(0)
    # Now load it back and check it agrees,
    records2 = list(SeqIO.parse(handle, out_format, alphabet))
    compare_records(records, records2, qual_truncate)
    # Finally, use the convert function, and check that agrees:
    handle2 = StringIO()
    with warnings.catch_warnings():
        if qual_truncate:
            warnings.simplefilter('ignore', UserWarning)
        SeqIO.convert(in_filename, in_format, handle2, out_format, alphabet)
    # We could re-parse this, but it is simpler and stricter:
    assert handle.getvalue() == handle2.getvalue()
def prep_for_beast():
    """Rename the aligned records and export them as a NEXUS file.

    Every record of ``final_ny_aligned.txt`` gets the second
    space-separated word of its description as both id and description.
    The result is saved as FASTA and then converted to
    ``final_ny_aligned.nex``; the number of converted records is printed.
    """
    renamed_fasta = 'final_ny_aligned_name_fixed.txt'

    records = list(SeqIO.parse('final_ny_aligned.txt', "fasta"))
    for rec in records:
        new_name = rec.description.split(" ")[1]
        rec.id = new_name
        rec.description = new_name
    SeqIO.write(records, renamed_fasta, "fasta")

    # print file to NEXUS file
    count = SeqIO.convert(renamed_fasta, "fasta",
                          "final_ny_aligned.nex", "nexus",
                          alphabet=IUPAC.ambiguous_dna)
    print("Converted %i records" % count)
def distmat_cmalign(
        sequence_file, prefix, cpu=wrap.CMALIGN_THREADS, min_bitscore=10):
    """Align sequences with cmalign and return FastTree distances.

    A warning is logged for sequences whose bit score is below
    ``min_bitscore``.  Returns the ``(taxa, distmat)`` pair produced by
    ``outliers.fasttree_dists``.
    """
    with util.ntf(prefix=prefix, suffix='.aln') as sto_tmp, \
            util.ntf(prefix=prefix, suffix='.fasta') as fasta_tmp:
        scores = wrap.cmalign_files(sequence_file, sto_tmp.name, cpu=cpu)

        below_threshold = scores['bit_sc'] < min_bitscore
        if below_threshold.any():
            msg = 'The following sequences aligned with bit score < {}: {}'
            log.warning(
                msg.format(min_bitscore, scores[below_threshold].index))

        # FastTree requires FASTA
        SeqIO.convert(sto_tmp, 'stockholm', fasta_tmp, 'fasta')
        fasta_tmp.flush()
        taxa, distmat = outliers.fasttree_dists(fasta_tmp.name)
    return taxa, distmat
def to_fasta(self, rmFile=False):
    """Converting the read file to fasta.

    Args:
    rmFile -- remove old version of file?

    Attrib edit:
    readFile name set to new file (*.fasta)
    readFileFormat set to fasta format

    Return:
    True on success; False if the conversion raised ValueError, in which
    case the read file is left under its original name.
    """
    # unpack
    readFile = self.get_readFile()
    readFileFormat = self.get_readFileFormat()

    # new file name
    basename, ext = os.path.splitext(readFile)
    newFile = basename + '.fasta'

    ## rename file to prevent overwrite
    originalFile = readFile
    renamed = (ext == '.fasta')
    if renamed:
        os.rename(readFile, basename + '.tmp')
        readFile = basename + '.tmp'

    # convert
    try:
        SeqIO.convert(readFile, readFileFormat, newFile, 'fasta')
    except ValueError:
        if renamed:
            # Roll back: drop the partial output and move the input back,
            # so the object does not point at a file that no longer exists.
            if os.path.exists(newFile):
                os.remove(newFile)
            os.rename(readFile, originalFile)
        return False

    # remove
    if rmFile:
        os.remove(readFile)

    # setting attributes
    self.set_readFile(newFile)
    self.set_readFileFormat('fasta')
    return True
def main():
    """Split every sequence of a FASTA file into overlapping windows.

    Options: ``--input`` (FASTA file), ``--step`` (distance between window
    starts), ``--window`` (window length) and ``--output`` (FASTA file).
    Each window is named after its 0-based start position.

    The windows are staged in a tab-separated scratch file ``out.tab`` in
    the current directory, which is always removed afterwards.
    """
    ap = GooeyParser(
        description=
        "splits a fasta file with user specified length and fragment overlap")
    ap.add_argument("-in", "--input", required=True, widget='FileChooser',
                    help="input fasta file")
    ap.add_argument("-step", "--step", required=True,
                    help="step size to split fasta, type = int")
    ap.add_argument("-win", "--window", required=True,
                    help="window size of splitted subsets, type = int")
    ap.add_argument("-out", "--output", required=True, widget='FileSaver',
                    help="output fasta file")
    args = vars(ap.parse_args())

    # Parse the numeric options once instead of on every iteration.
    window = int(args['window'])
    step = int(args['step'])

    tab_path = "out.tab"
    try:
        # Write mode, not append: a leftover out.tab from an earlier run
        # must not leak stale rows into this output.  Writing the rows
        # directly also avoids the DataFrame.to_csv "line_terminator"
        # keyword, which newer pandas releases no longer accept.
        with open(tab_path, 'w') as f:
            for record in SeqIO.parse(args['input'], "fasta"):
                for i in range(0, len(record.seq) - window + 1, step):
                    f.write("%d\t%s\n" % (i, record.seq[i:i + window]))
        # convert to fasta
        SeqIO.convert(tab_path, "tab", args['output'], "fasta")
    finally:
        # Portable clean-up; "del" only exists in the Windows shell.
        if os.path.exists(tab_path):
            os.remove(tab_path)
def simple_check(self, base_name, in_variant):
    """Check FASTQ conversion of one example file into every variant.

    For each of the sanger, solexa and illumina output variants the
    converted text must equal the reference file
    ``Quality/<base_name>_as_<variant>.fastq``, both via ``SeqIO.convert``
    and via ``SeqIO.parse`` followed by ``SeqIO.write``.

    ``BiopythonWarning`` is silenced for the non-sanger variants inside
    ``warnings.catch_warnings``, so the filter list is restored even when
    an assertion fails.  The old ``warnings.filters.pop()`` removed the
    last filter rather than the one just inserted at the front.
    """
    for out_variant in ["sanger", "solexa", "illumina"]:
        in_filename = "Quality/%s_original_%s.fastq" \
                      % (base_name, in_variant)
        self.assertTrue(os.path.isfile(in_filename))
        # Load the reference output...
        # NOTE(review): mode "rU" is kept as-is; recent Python 3 releases
        # reject it - confirm which interpreters this suite targets.
        with open("Quality/%s_as_%s.fastq" % (base_name, out_variant),
                  "rU") as handle:
            expected = handle.read()

        with warnings.catch_warnings():
            if out_variant != "sanger":
                # Ignore data loss warnings from max qualities
                warnings.simplefilter('ignore', BiopythonWarning)

            # Check matches using convert...
            handle = StringIO()
            SeqIO.convert(in_filename, "fastq-" + in_variant,
                          handle, "fastq-" + out_variant)
            self.assertEqual(expected, handle.getvalue())

            # Check matches using parse/write
            handle = StringIO()
            SeqIO.write(SeqIO.parse(in_filename, "fastq-" + in_variant),
                        handle, "fastq-" + out_variant)
            self.assertEqual(expected, handle.getvalue())
def read_sanger(infilepath, outfilepath):
    """Convert every ``.ab1`` file of a directory to FASTQ.

    The input directory path and, at the end, the number of converted
    files are printed.  Each output keeps the input's file name with
    ``.ab1`` replaced by ``.fastq`` and is written to ``outfilepath``.
    """
    converted = 0
    print(infilepath)
    for entry in os.listdir(infilepath):
        if not entry.endswith('.ab1'):
            continue
        trace_path = os.path.join(infilepath, entry)
        fastq_path = os.path.join(outfilepath,
                                  entry.replace('.ab1', '.fastq'))
        SeqIO.convert(trace_path, 'abi', fastq_path, 'fastq')
        converted += 1
    print("There are total %d sequences" % converted)
    return
def generate_dist(self):
    """Align the sequences, build a tree and export its distance matrix.

    MAFFT aligns ``self.fasta_seq`` with PHYLIP output.  The alignment is
    saved under ``testing/`` as PHYLIP and FASTA, RAxML builds a tree
    from it, and the pairwise distances of the tree in
    ``testing/RAxML_result.reversatest`` are written to ``distance.csv``.
    """
    phylip_path = 'testing/alignment.phy'
    fasta_path = 'testing/align.fasta'

    run_mafft = MafftCommandline(input=self.fasta_seq,
                                 maxiterate=1000,
                                 localpair=True,
                                 phylipout=True)
    stdout, stderr = run_mafft()

    # Save alignments into FASTA and PHYLIP format
    with open(phylip_path, 'w') as phylip_handle:
        phylip_handle.write(stdout)
    SeqIO.convert(phylip_path, 'phylip', fasta_path, 'fasta')

    # Create phylogenetic tree of the original sequences
    run_raxml = RaxmlCommandline(sequences=phylip_path,
                                 model='GTRGAMMA',
                                 name='reversatest',
                                 working_dir=self.cwPath)
    run_raxml()

    # Calculate the phylo distances between each branch of the tree
    tree = dendropy.Tree.get_from_path("testing/RAxML_result.reversatest",
                                       "newick")
    distances = tree.phylogenetic_distance_matrix()
    distances.write_csv('distance.csv')
def check_conversion(self, filename, in_format, out_format, alphabet):
    """Check that SeqIO.convert agrees with parse followed by write.

    The parsed records are written to memory, read back and compared one
    by one with ``compare_record``.  The output of ``SeqIO.convert`` must
    then equal the written text exactly.  ``BiopythonWarning`` is
    silenced when ``truncation_expected(out_format)`` is true.
    """
    msg = "Convert %s from %s to %s" % (filename, in_format, out_format)
    qual_truncate = truncation_expected(out_format)

    originals = list(SeqIO.parse(filename, in_format, alphabet))

    # Write it out...
    written = StringIO()
    with warnings.catch_warnings():
        if qual_truncate:
            warnings.simplefilter("ignore", BiopythonWarning)
        SeqIO.write(originals, written, out_format)
    written.seek(0)

    # Now load it back and check it agrees,
    reloaded = list(SeqIO.parse(written, out_format, alphabet))
    self.assertEqual(len(originals), len(reloaded), msg=msg)
    for before, after in zip(originals, reloaded):
        self.compare_record(before, after, qual_truncate, msg=msg)

    # Finally, use the convert function, and check that agrees:
    converted = StringIO()
    with warnings.catch_warnings():
        if qual_truncate:
            warnings.simplefilter("ignore", BiopythonWarning)
        SeqIO.convert(filename, in_format, converted, out_format, alphabet)

    # We could re-parse this, but it is simpler and stricter:
    self.assertEqual(written.getvalue(), converted.getvalue(), msg=msg)
def ficheirosProteinas(dicionario):
    """Download protein records and save them as GenBank and FASTA files.

    Each key of ``dicionario`` is fetched from the Entrez ``protein``
    database.  Record number ``i`` is stored as
    ``sequenceProtGenbank<i>.gb`` and also converted to
    ``sequenceProtF<i>.fasta``.

    Returns the list of GenBank file names that were written.
    """
    Entrez.email = "*****@*****.**"
    lista_ficheiros = []
    for i, key in enumerate(dicionario):
        handleGB = Entrez.efetch(db="protein", rettype="gb",
                                 retmode="text", id=key)
        seq_record = SeqIO.read(handleGB, "genbank")

        # Save in GenBank format
        nome_ficheiro = 'sequenceProtGenbank' + str(i) + '.gb'
        SeqIO.write(seq_record, nome_ficheiro, "genbank")
        lista_ficheiros.append(nome_ficheiro)
        handleGB.close()

        nome_fasta = 'sequenceProtF' + str(i) + '.fasta'
        SeqIO.convert(nome_ficheiro, "genbank", nome_fasta, "fasta")
    return lista_ficheiros
def groom(in_file, in_qual="fastq-sanger", out_dir=None, out_file=None):
    """
    Grooms a FASTQ file into sanger format, if it is not already in that
    format. Use fastq-illumina for Illumina 1.3-1.7 qualities and
    fastq-solexa for the original solexa qualities. When in doubt, your
    sequences are probably fastq-sanger.

    Returns ``out_file`` in every case.  ``out_dir`` is not used here.
    """
    if in_qual != "fastq-sanger":
        with file_transaction(out_file) as tmp_out_file:
            count = SeqIO.convert(in_file, in_qual, tmp_out_file,
                                  "fastq-sanger")
        logger.info("Converted %d reads in %s to %s." %
                    (count, in_file, out_file))
        return out_file

    logger.info("%s is already in Sanger format." % (in_file))
    return out_file
def check(self, sff_name, sff_format, out_name, format):
    """Check that converting an SFF file reproduces a reference file.

    Args:
        sff_name: Path of the SFF input file.
        sff_format: SeqIO format name used to read the SFF file.
        out_name: Reference file holding the expected records.
        format: Output format; also the format of the reference file.

    The record count, ids and names must match.  Sequences are compared
    unless the format is "qual", and PHRED qualities unless the format is
    "fasta".
    """
    wanted = list(SeqIO.parse(out_name, format))
    data = StringIO()
    count = SeqIO.convert(sff_name, sff_format, data, format)
    self.assertEqual(count, len(wanted))
    data.seek(0)
    converted = list(SeqIO.parse(data, format))
    self.assertEqual(len(wanted), len(converted))
    for old, new in zip(wanted, converted):
        self.assertEqual(old.id, new.id)
        self.assertEqual(old.name, new.name)
        if format != "qual":
            self.assertEqual(str(old.seq), str(new.seq))
        # Independent test rather than "elif": formats carrying both
        # sequence and quality previously skipped the quality comparison.
        if format != "fasta":
            self.assertEqual(old.letter_annotations["phred_quality"],
                             new.letter_annotations["phred_quality"])
def countKmerMatch(fas, kmer_list, outfile, is_fastq):
    """Flag the reads that contain at least one of the given markers.

    Args:
        fas: Path of the read file (FASTQ if ``is_fastq``, else FASTA).
        kmer_list: Text file with one marker sequence per line.
        outfile: Output prefix.  Matching reads are written to
            ``<outfile>.fasta`` and, for FASTQ input, also to
            ``<outfile>.fastq``.
        is_fastq: Whether ``fas`` is a FASTQ file.

    Returns:
        ``(match, total_count)``: ``match`` holds '1' or '0' per read in
        input order and ``total_count`` is the number of matching reads.
    """
    match = []
    total_count = 0

    with open(kmer_list, 'r') as fin:
        # Build the marker sequences once, not once per read.
        markers = [Seq(line) for line in fin.read().splitlines()]

    def _marker_hits(read):
        # Total number of marker occurrences in one read.
        return sum(int(read.count(marker)) for marker in markers)

    with open(outfile + '.fasta', "w") as output_fasta_handle:
        if is_fastq:
            # Count the reads for the progress bar by iterating.  The old
            # code converted the whole file to FASTA in memory, and did so
            # for FASTA input too, where the FASTQ parser fails.
            num_elem = sum(1 for _ in SeqIO.parse(fas, "fastq"))
            bar = progressbar.ProgressBar(redirect_stdout=True,
                                          max_value=num_elem)
            with open(outfile + '.fastq', "w") as output_fastq_handle:
                for i, record in enumerate(SeqIO.parse(fas, "fastq"), 1):
                    bar.update(i)
                    if _marker_hits(record.seq) > 0:
                        print("marker found at read number %d" % (i))
                        match.append('1')
                        SeqIO.write(record, output_fastq_handle, "fastq")
                        SeqIO.write(record, output_fasta_handle, "fasta")
                        total_count = total_count + 1
                    else:
                        match.append('0')
        else:
            for record in SeqIO.parse(fas, "fasta"):
                if _marker_hits(record.seq) > 0:
                    match.append('1')
                    # write to fasta file
                    SeqIO.write(record, output_fasta_handle, "fasta")
                    total_count = total_count + 1
                else:
                    match.append('0')
    return match, total_count
def main():
    """GUI entry point: convert a FASTA file to a tab-separated file.

    Each output line holds a record identifier and its sequence.
    """
    parser = GooeyParser(
        description=
        "converts a fasta file , into a tabular file with identifier and sequence"
    )
    parser.add_argument("-in", "--input", required=True,
                        widget='FileChooser',
                        help="input fasta file")
    parser.add_argument("-out", "--output", required=True,
                        widget='FileSaver',
                        help="output tab seperated file")
    options = vars(parser.parse_args())

    source, target = options['input'], options['output']
    count = SeqIO.convert(source, "fasta", target, "tab")
def gb_to_fasta(dirpath):
    """Convert every GenBank file of a directory to a FASTA file.

    Args:
        dirpath: Directory scanned for names ending in ``gb`` or ``gbk``.

    For each such file, ``concat-<filename>.fas`` is written to the
    current working directory and every FASTA header line in it is
    replaced by ``><filename>``.
    """
    for filename in os.listdir(dirpath):
        if not filename.endswith(("gb", "gbk")):
            continue

        outname = 'concat-' + filename + '.fas'
        # Read from inside dirpath; the bare file name only resolved when
        # the working directory happened to be dirpath itself.
        SeqIO.convert(os.path.join(dirpath, filename), "genbank",
                      outname, "fasta")
        print("Genbank file found - " + filename)
        print("Generating " + outname + " ...Done!")

        with open(outname) as f:
            lines = f.readlines()
        with open(outname, "w") as f:
            for line in lines:
                if line.startswith(">"):
                    line = ">" + filename + "\n"
                f.write(line)
def main():
    """GUI entry point: convert the ATOM records of a PDB file to FASTA.

    The input is read with the ``pdb-atom`` format, so it does not need a
    SEQRES header.
    """
    parser = GooeyParser(
        description=
        "converts a pdb file with only the atom coordinates section, into a fasta file"
    )
    parser.add_argument("-in", "--input", required=True,
                        widget='FileChooser',
                        help="input pdb file without SEQRES header")
    parser.add_argument("-out", "--output", required=True,
                        widget='FileSaver',
                        help="output fasta file")
    options = vars(parser.parse_args())

    source, target = options['input'], options['output']
    count = SeqIO.convert(source, "pdb-atom", target, "fasta")
def convertMod(args):
    """Convert a sequence file between formats, reporting problems on stderr.

    When the requested output format is GFF, the work is delegated to
    ``to_GFF`` and only GenBank or EMBL input is accepted. Every other output
    format goes through ``SeqIO.convert``.

    Args:
        args: Namespace providing ``input``, ``inFormat``, ``output``,
            ``outFormat`` and ``verbose``.

    Returns:
        Always ``None``; results and errors are reported through side effects.
    """
    if args.outFormat.lower() == 'gff':
        if args.inFormat.lower() in ('genbank', 'embl'):
            to_GFF(args)
        else:
            # The original wrote to the non-existent ``sys.err``, which raised
            # AttributeError instead of printing the message.
            sys.stderr.write("ERROR: ValueError, Could not convert file\n")
        return None

    try:
        count = SeqIO.convert(args.input, args.inFormat,
                              args.output, args.outFormat)
    except ValueError:
        sys.stderr.write("ERROR: ValueError, Could not convert file\n")
        return None

    if count == 0:
        sys.stderr.write('ERROR: No records converted. Possibly wrong input filetype\n')
    elif args.verbose:
        sys.stderr.write("Converted %i records\n" % count)
    return None
def main():
    """Run the FASTQ-to-IgBLAST pipeline rooted at the current working directory.

    Steps, each working in a numbered sub-directory of the project folder:
    quality-trim the raw FASTQ files, merge the read pair with ``pear``,
    convert the merged FASTQ to FASTA, de-duplicate it, split it into batches
    of 10000 records, and start one background ``igblastn`` job per batch.

    Relies on the module-level name ``project`` and on the helpers
    ``trim_fastq_by_quality``, ``unique_fasta`` and ``batch_iterator``.
    """
    print("Begin!")
    prj_folder = os.getcwd()

    print("Quality contorl...")
    # glob gives no ordering guarantee; sort so runs are reproducible.
    infiles = sorted(glob.glob("%s/1.0-origin/*.fastq" % (prj_folder)))
    if len(infiles) != 2:
        print("The %s be loaded in error, are they two?" % (infiles,))
    for the_file in infiles:
        trim_fastq_by_quality(the_file, prj_folder)

    print("Merging...")
    # Sorted so that the first file is consistently passed as the forward (-f)
    # input and the second as the reverse (-r) input.
    infiles = sorted(glob.glob("%s/1.1-trimed-fastq-file/*.fastq" % (prj_folder)))
    os.chdir("%s/1.2-merged-fastq-file/" % (prj_folder))
    subprocess.call(
        "pear -f %s -r %s -o %s" % (infiles[0], infiles[1], project),
        shell=True)

    print("Convert fastq to fasta...")
    merged_file = "%s/1.2-merged-fastq-file/%s.assembled.fastq" % (prj_folder, project)
    fname, suffix = os.path.splitext(merged_file)
    count = SeqIO.convert(merged_file, "fastq", "%s.fasta" % fname, "fasta")
    print(count)
    print("There are %i records have been Converted!" % (count))

    print("Unique fasta file...")
    unique_fasta(prj_folder, project)

    print("Split large file to small...")
    os.chdir("%s/1.3-splited-fasta-file/" % (prj_folder))
    # The input handle stays open for the whole loop because the batches are
    # produced lazily from it; it is closed once all batches are written.
    with open("%s_unique.fasta" % fname) as unique_handle:
        record_iter = SeqIO.parse(unique_handle, "fasta")
        for i, batch in enumerate(batch_iterator(record_iter, 10000)):
            filename = "%s-%i.fasta" % (project, i + 1)
            with open(filename, "w") as handle:
                count = SeqIO.write(batch, handle, "fasta")
            print("Wrote %i records to %s" % (count, filename))

    print("Begin IgBLAST...")
    os.chdir("%s/1.4-IgBLAST-output/" % (prj_folder))
    # Nothing is ever written to this file; it is still created (and now
    # closed) so the directory contents stay as before.
    # NOTE(review): the trailing space in the file name looks unintended.
    with open("%s-igblast-output.txt " % (project), "w"):
        pass
    subprocess.call(
        "cp -r /zzh_gpfs/home/zzhgroup/yanmingchen/IgBLAST_database/ ./",
        shell=True)

    igblast_cmd = (
        "igblastn"
        " -germline_db_V ./IgBLAST_database/20150429-human-gl-v"
        " -germline_db_J ./IgBLAST_database/20150429-human-gl-j"
        " -germline_db_D ./IgBLAST_database/20150429-human-gl-d"
        " -organism human -domain_system imgt"
        " -query %s"
        " -auxiliary_data optional_file/human_gl.aux"
        " -outfmt '7 qseqid sseqid pident length mismatch gapopen gaps"
        " qstart qend sstart send evalue bitscore qlen slen qseq sseq score"
        " frames qframe sframe positive ppos btop staxids stitle sstrand"
        " qcovs qcovhsp'"
        " -num_alignments_V 10 -num_alignments_D 10 -num_alignments_J 10"
        " -out IgBLAST_result_%i &"
    )
    # An earlier variant ran igblastn once on the merged file; the batches are
    # used instead, one job per split FASTA file.
    IGBLAST_infiles = sorted(glob.glob("%s/1.3-splited-fasta-file/*.fasta" % (prj_folder)))
    for index, the_file in enumerate(IGBLAST_infiles):
        print("%s %s" % (index, the_file))
        # The trailing "&" backgrounds the job, so the call returns at once.
        subprocess.call(igblast_cmd % (the_file, index + 1), shell=True)
def main():
    """Command-line entry point: convert a GenBank file into an EMBL file.

    Exits immediately when run under Python 2. The output directory is created
    when missing, and the number of converted records is printed at the end.
    Uses the module-level ``version`` string for ``--version``.
    """
    if sys.version_info[0] < 3:
        sys.exit(
            'Must be using Python 3. Try calling "python3 genbank_2_embl.py"')

    parser = argparse.ArgumentParser(
        prog='genbank_2_embl.py',
        description='Converts a genbank file into a embl file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', help='Version information',
                        action='version',
                        version=str('%(prog)s v' + version))

    parser_required = parser.add_argument_group('Required options')
    parser_required.add_argument('-g', '--genbank', nargs=1,
                                 type=argparse.FileType('r'), required=True,
                                 metavar='/path/to/genbank/file.gb',
                                 help='Path to the genbank file')
    parser_required.add_argument('-e', '--embl', nargs=1, type=str,
                                 metavar='/path/to/output/embl/file.embl',
                                 help='Path to the output embl file',
                                 required=True)

    args = parser.parse_args()

    # argparse.FileType opened the file only so that its existence is checked;
    # the conversion works from the path, so release the handle right away.
    genbank_handle = args.genbank[0]
    args.genbank = os.path.abspath(genbank_handle.name)
    if genbank_handle is not sys.stdin:
        genbank_handle.close()

    args.embl = os.path.abspath(args.embl[0])

    # exist_ok avoids the race between checking for the directory and making it.
    os.makedirs(os.path.dirname(args.embl), exist_ok=True)

    count = SeqIO.convert(args.genbank, 'genbank', args.embl, 'embl')
    print('Converted {} records'.format(count))
def readwrite_fasta(infilename, outfilename):
    """Convert a (possibly compressed) Sanger FASTQ file to FASTA.

    The module-level ``compression_type`` selects how the input is opened:
    gzip, bzip2, a zip archive holding exactly one member, or a plain file.

    Args:
        infilename: Path of the FASTQ input.
        outfilename: Path of the FASTA file to write.

    Returns:
        The number of converted records, or ``None`` when any step failed (the
        exception class and message are printed in that case).
    """
    try:
        opened = []
        try:
            if compression_type in ["gzip", "gz"]:
                infile = gzip.open(infilename, 'r')
            elif compression_type in ["bzip2", "bz2"]:
                infile = bz2.BZ2File(infilename, 'r')
            elif compression_type == "zip":
                myzipfile = zipfile.ZipFile(infilename, 'r')
                opened.append(myzipfile)
                if len(myzipfile.namelist()) > 1:
                    raise IOError("TOO MANY FILES IN ZIPFILE")
                infile = myzipfile.open(myzipfile.namelist()[0])
            else:
                infile = open(infilename, 'r')
            opened.append(infile)

            outfile = open(outfilename, "w")
            opened.append(outfile)

            return SeqIO.convert(infile, "fastq-sanger", outfile, "fasta")
        finally:
            # Close everything that was opened, newest first, whether or not
            # the conversion succeeded. A failure while closing is reported by
            # the handler below, like any other error.
            for handle in reversed(opened):
                handle.close()
    except Exception as ex:
        # Deliberate best-effort: report the problem and signal it with None.
        print(ex.__class__.__name__ + " : " + str(ex))
        return None
def convert_format(in_file, out_file, in_format=None, out_format=None,
                   data_type='dna', ambiguities=True):
    """Convert a sequence file from one format to another.

    Args:
        in_file: Input passed on to ``SeqIO.convert``.
        out_file: Output passed on to ``SeqIO.convert``.
        in_format: Input format name; looked up through ``FILE_FORMATS`` from
            ``in_file`` when omitted.
        out_format: Output format name; looked up through ``FILE_FORMATS``
            from ``out_file`` when omitted.
        data_type: Forwarded to ``get_state_alphabet``.
        ambiguities: Forwarded to ``get_state_alphabet``.

    Returns:
        The value returned by ``SeqIO.convert``.
    """
    # Identity test: only a missing format triggers the lookup.
    if in_format is None:
        in_format = FILE_FORMATS.get_format_from_file_object(in_file)
    if out_format is None:
        out_format = FILE_FORMATS.get_format_from_file_object(out_file)

    _LOG.debug("converting {in_format}-formatted file {in_file!r} to "
               "{out_format}-formatted file {out_file!r}.".format(
                   in_file=in_file,
                   in_format=in_format,
                   out_file=out_file,
                   out_format=out_format))

    return SeqIO.convert(in_file=in_file,
                         in_format=in_format,
                         out_file=out_file,
                         out_format=out_format,
                         alphabet=get_state_alphabet(data_type, ambiguities))
def test_qual_negative(self):
    """Check QUAL negative scores mapped to PHRED zero."""
    qual_text = """>1117_10_107_F3
23 31 -1 -1 -1 29 -1 -1 20 32 -1 18 25 7 -1 6 -1 -1 -1 30 -1 20 13 7 -1 -1 21 30 -1 24 -1 22 -1 -1 22 14 -1 12 26 21 -1 5 -1 -1 -1 20 -1 -1 12 28
>1117_10_146_F3
20 33 -1 -1 -1 29 -1 -1 28 28 -1 7 16 5 -1 30 -1 -1 -1 14 -1 4 13 4 -1 -1 11 13 -1 5 -1 7 -1 -1 10 16 -1 4 12 15 -1 8 -1 -1 -1 16 -1 -1 10 4
>1117_10_1017_F3
33 33 -1 -1 -1 27 -1 -1 17 16 -1 28 24 11 -1 6 -1 -1 -1 29 -1 8 29 24 -1 -1 8 8 -1 20 -1 13 -1 -1 8 13 -1 28 10 24 -1 10 -1 -1 -1 4 -1 -1 7 6
>1117_11_136_F3
16 22 -1 -1 -1 33 -1 -1 30 27 -1 27 28 32 -1 29 -1 -1 -1 27 -1 18 9 6 -1 -1 23 16 -1 26 -1 5 7 -1 22 7 -1 18 14 8 -1 8 -1 -1 -1 11 -1 -1 4 24"""
    # Expected FASTQ: every -1 in the input shows up as "!" in the quality line.
    expected_fastq = """\
@1117_10_107_F3
??????????????????????????????????????????????????
+
8@!!!>!!5A!3:(!'!!!?!5.(!!6?!9!7!!7/!-;6!&!!!5!!-=
@1117_10_146_F3
??????????????????????????????????????????????????
+
5B!!!>!!==!(1&!?!!!/!%.%!!,.!&!(!!+1!%-0!)!!!1!!+%
@1117_10_1017_F3
??????????????????????????????????????????????????
+
BB!!!<!!21!=9,!'!!!>!)>9!!))!5!.!!).!=+9!+!!!%!!('
@1117_11_136_F3
??????????????????????????????????????????????????
+
17!!!B!!?<!<=A!>!!!<!3*'!!81!;!&(!7(!3/)!)!!!,!!%9
"""
    source = StringIO(qual_text)
    target = StringIO()
    with warnings.catch_warnings():
        # Parser warnings are silenced for the duration of the conversion.
        warnings.simplefilter("ignore", BiopythonParserWarning)
        converted = SeqIO.convert(source, "qual", target, "fastq")
    self.assertEqual(4, converted)
    self.assertEqual(target.getvalue(), expected_fastq)
print >>out_handle, gtf_gene gtf_tx = '%s\t%s\t%s\t%s\t%s\t.\t%s\t.\t%s;' % ( acc, source, 'transcript', f.location.start.position+1, f.location.end.position, strand, comments ) print >>out_handle, gtf_tx comments += '; exon_number "1"' gtf_exon = '%s\t%s\t%s\t%s\t%s\t.\t%s\t.\t%s;' % ( acc, source, 'exon', f.location.start.position+1, f.location.end.position, strand, comments ) print >>out_handle, gtf_exon sys.stderr.write( "%s\tSkipped %s entries having types: %s.\n" % ( gb.id,skipped, ', '.join(skippedTypes) ) ) if __name__=='__main__': description = ("Convert GeneBank files to FASTA and GTF bcbio ready files.") parser = ArgumentParser(description=description) parser.add_argument("--gbk", required=True, help="GBK files") parser.add_argument("--prefix", required=True, help="prefix") args = parser.parse_args() t0=datetime.now() count = SeqIO.convert(args.gbk, "genbank", args.prefix + "_tmp.fa", "fasta") with open(args.prefix + ".fa", "w") as out_handle: with open(args.prefix + "_tmp.fa") as in_handle: header = next(in_handle) print >>out_handle, header.split()[0] for line in in_handle: print >>out_handle, line.strip() gb2gtf(args.gbk, args.prefix + ".gtf") dt=datetime.now() - t0 sys.stderr.write( "#Time elapsed: %s\n" % dt )
#!/usr/bin/env python
"""Convert a FASTQ file to FASTA: ``<script> <input.fastq> <output.fasta>``."""
import sys

from Bio import SeqIO


def main(argv):
    """Convert the FASTQ file ``argv[1]`` into the FASTA file ``argv[2]``.

    Args:
        argv: Full argument vector including the program name. Arguments past
            the second one are ignored.

    Returns:
        Process exit status: 0 on success, 2 when a path is missing.
    """
    if len(argv) < 3:
        # Print a usage line instead of failing with an IndexError traceback.
        sys.stderr.write("usage: %s <input.fastq> <output.fasta>\n" % argv[0])
        return 2
    SeqIO.convert(argv[1], "fastq", argv[2], "fasta")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
def converter(fq):
    """Write a FASTA copy of the FASTQ file *fq* next to it.

    The output path is *fq* with its extension replaced by ``.fa``.
    """
    stem, _extension = os.path.splitext(fq)
    fasta_path = stem + '.fa'
    SeqIO.convert(
        in_file=fq,
        in_format='fastq',
        out_file=fasta_path,
        out_format='fasta',
    )
len(genbankdf[genbankdf.locus_tag == 'none'])) genbankdf = genbankdf[genbankdf.locus_tag != "none"] genbankdf.reset_index(level=0, inplace=True) # we split with pipes "|" later on; this rmoves any from locus_tags , bd_xref's, and old_locus_tags try: genbankdf.db_xref = genbankdf.db_xref.str.replace("|", "_") genbankdf.locus_tag = genbankdf.locus_tag.str.replace("|", "_") genbankdf.old_locus_tag = genbankdf.old_locus_tag.str.replace("|", "_") except KeyError: pass #print(genbankdf.loc[genbankdf['locus_tag'] == 'QV15_00005']) ### just a test #%%#%%# make a fasta for genomic sequence(s), clean up header in resulting file output_genfasta_handle = open(os.path.join(subdirname, gb + "_genomic.fasta"), "w") SeqIO.convert(input_handle.name, "genbank", output_genfasta_handle, "fasta") output_genfasta_handle.close() #%% if rename_fa: print("After running this script, run the following output as a command" + ", starting with 'awk' and ending with _renamed.fasta:") import string input_handle = open(input_genome, "r") # input database names = [] for record in SeqIO.parse(input_handle, "genbank"): print(record.id) names.append(record.id) if len(names) > 1: starts_at = names[0][-1] # removes number, period, and version number
while_condition = False else: qwindow_average = int(qwindow_average) if qwindow_average > 40 or qwindow_average < 0: print 'You entered a wrong value.' print 'Try again!' else: while_condition = False #sff sequence extraction if file_extension == 'sff': # extract the sequences from the sff file fastq_file = sample_name + '.fastq' stars() print 'The sff file is now being converted into a fastq file.' print 'This could take a while...' sff_seq_count = SeqIO.convert(raw_file_name, "sff-trim", fastq_file, "fastq") print '%i sequences have been converted to fastq format.' % sff_seq_count print '%s file has been created' % fastq_file stars() if file_extension == 'fastq': fastq_file = raw_file_name #*** DNA sequence quality trimming *** stars() print 'Now we trim the sequences using the desired quality settings.' print 'Depending on the settings, this could take a while... (10-20 minutes)' print 'Please be patient!' stars() trim_fasta_file_name = sample_name + '.trim.fasta' good_reads = []
from Bio import SeqIO

# Read FASTQ records from rosalind_tfsq.txt and write them as FASTA to
# output.txt; both files are closed when the blocks exit.
with open('rosalind_tfsq.txt') as fastq_in:
    with open('output.txt', 'w') as fasta_out:
        SeqIO.convert(fastq_in, 'fastq', fasta_out, 'fasta')
def _method_biopython(self, *args, **kwargs):
    """Convert ``self.infile`` (GenBank) into ``self.outfile`` (FASTA) with Biopython.

    Extra positional and keyword arguments are accepted but not used.
    """
    # Imported here so Biopython is only needed when this method runs.
    from Bio import SeqIO

    SeqIO.convert(
        in_file=self.infile,
        in_format="genbank",
        out_file=self.outfile,
        out_format="fasta",
    )
help='''Generate qual file.''') parser.add_argument( '--out', '-o', type=str, default=False, help= '''Output prefix. Default: same as fastq. Use "stdout" or "-" to print to screen.''' ) args = parser.parse_args() assert args.fa or args.qual if not args.out: args.out = args.fastq.split("/")[-1].split(".")[0] if args.fastq in ('', '-', 'stdin'): args.fastq = sys.stdin if args.out and args.out in ("stdout", "-"): out = sys.stdout else: out = args.out + ".fasta" if args.fa else args.out + ".qual" if args.fa: SeqIO.convert(args.fastq, "fastq", out, "fasta") if args.qual: SeqIO.convert(args.fastq, "fastq", out, "qual")