Ejemplo n.º 1
0
    def run_random_select(self):
        """Randomly sub-sample the merged, trimmed reads of this sample.

        Works inside ``self.hulk_sample_dir_path``: converts
        ``<sample_name>.merged.primer_trim.len_trim.fastq`` to FASTA and
        runs ``java SeqRandomSampling -c <self.random_num> -i <fasta>``.
        The previous working directory is always restored.

        Raises:
            IOError: if the merged FASTQ file is not in the sample directory.
        """
        # java SeqRandomSampling does not seem to take the abs path for
        # fastq files, so everything is done relative to the sample dir.
        merged_fq = '{}{}'.format(
            self.sample_info.sample_name,
            '.merged.primer_trim.len_trim.fastq'
        )
        merged_fa = os.path.splitext(merged_fq)[0] + '.fasta'

        cur_dir = os.getcwd()
        os.chdir(self.hulk_sample_dir_path)
        try:
            # Check BEFORE converting: the conversion needs the file, so a
            # check placed after it could never report anything.
            if not os.path.exists(merged_fq):
                message = '{} is not found in {}'.format(
                    merged_fq,
                    self.hulk_sample_dir_path,
                )
                print(message)
                raise IOError(message)

            SeqIO.convert(merged_fq, 'fastq', merged_fa, 'fasta')

            jar_cmd = ['java SeqRandomSampling',
                       '-c',
                       self.random_num,
                       '-i',
                       merged_fa,
                       ]
            os.system(' '.join(str(x) for x in jar_cmd))
        finally:
            # Never leave the process in the sample directory on failure.
            os.chdir(cur_dir)
Ejemplo n.º 2
0
def main(argv):
    """Convert the FASTQ file given with ``-i`` to FASTA.

    The output path is the input path with its extension replaced by
    ``.fasta``.

    Args:
        argv: command-line arguments without the program name.

    Exits with status 2 (after writing ``fastq2fasta_error-log.txt``) on a
    syntax error or a missing ``-i``; exits normally after ``-h``.
    """
    inputfile = ''
    outputfile = ''

    def syntax_error():
        # Shared failure path: message, error log, exit status 2.
        print("Incorrect syntax: Use '-h' for help.")
        with open('fastq2fasta_error-log.txt', 'w') as error:
            error.write("Syntax Error\n")
        sys.exit(2)

    try:
        opts, args = getopt.getopt(argv, 'hi:')
    except getopt.GetoptError:
        syntax_error()

    for opt, arg in opts:
        if opt == '-h':
            print('Use Syntax: fastq2fasta.py -i <inputfile>')
            sys.exit()
        elif opt == '-i':
            inputfile = arg
            filename, extension = os.path.splitext(arg)
            outputfile = '%s.fasta' % (filename)

    if inputfile == '':
        syntax_error()

    # SeqIO.convert opens both files by name. Opening the output a second
    # time and writing the file name into it corrupted the FASTA.
    SeqIO.convert(inputfile, "fastq", outputfile, "fasta")
    print('Your FASTQ has been converted. See %s' % (outputfile))
Ejemplo n.º 3
0
def align(arguments):
    """
    Align ``arguments.seqfile`` against the reference package and store
    the result at ``arguments.outfile``.

    The aligner is looked up in ALIGNERS by ``arguments.profile_version``
    (or the method guessed by the refpkg). The value returned by the
    aligner function is passed back to the caller.
    """
    refpkg = arguments.refpkg
    profile = arguments.profile_version or refpkg.guess_align_method()
    run_aligner = ALIGNERS[profile]
    options = arguments.alignment_options or ALIGNMENT_DEFAULTS.get(profile)

    out_dir = os.path.dirname(arguments.outfile)
    with _temp_file(prefix='.refpkg_align', dir=out_dir) as scratch:
        # Only the name is needed; the aligner writes the file itself.
        scratch.close()
        result = run_aligner(
            refpkg,
            arguments.seqfile,
            scratch.name,
            use_mask=arguments.use_mask,
            use_mpi=arguments.use_mpi,
            mpi_args=arguments.mpi_arguments,
            mpi_program=arguments.mpi_run,
            alignment_options=options,
            stdout=arguments.stdout,
        )

        wanted = arguments.output_format
        if wanted and wanted != DEFAULT_FORMAT[profile]:
            # Aligner output is in its native format: convert it
            SeqIO.convert(scratch.name, DEFAULT_FORMAT[profile],
                          arguments.outfile, wanted)
        else:
            # No format conversion needed
            os.rename(scratch.name, arguments.outfile)

    return result
Ejemplo n.º 4
0
def stockfastaconvert(options):
    '''
    Conversion of multiple alignment - Stockholm to Multifasta

    Worker loop: takes family names from the queue ``q`` until it stays
    empty for 0.1 s. For every family whose Stockholm file exists under
    ``options.dbdir + "align/stockholm/"`` the alignment is rewritten as
    ``align/<fam>.fasta``, its sequence ids are listed in
    ``align/gene_list/<FAM>.gene`` and the Stockholm file is deleted.
    Family names are echoed through the queue ``n`` in batches of ten.
    '''
    def report(count):
        # Print `count` names from the progress queue on a single line.
        print(" ".join(str(n.get()) for _ in range(count)))

    while True:
        try:
            fam = q.get(block=True, timeout=0.1)
        except Empty:
            # Queue drained: flush whatever is still waiting to be echoed.
            pending = n.qsize()
            if pending > 0:
                report(pending)
            break

        n.put(fam)
        if n.qsize() >= 10:
            report(10)

        stockholm_path = options.dbdir + "align/stockholm/" + fam + ".stockholm"
        fasta_path = options.dbdir + "align/" + fam + ".fasta"
        if os.path.exists(stockholm_path):
            if os.path.exists(fasta_path):
                # os.remove takes a single path; the extra "fasta"
                # argument raised TypeError.
                os.remove(fasta_path)
            SeqIO.convert(stockholm_path, "stockholm", fasta_path, "fasta")
            # Read back the file that was just written (the original
            # upper-cased the name here, which only matched by accident).
            with open(fasta_path, "r") as handle:
                gene_ids = set(rec.id for rec in SeqIO.parse(handle, "fasta"))
            gene_path = options.dbdir + "align/gene_list/" + fam.upper() + ".gene"
            with open(gene_path, "w") as handle:
                for gene in gene_ids:
                    handle.write(gene + "\n")
            os.remove(stockholm_path)
        q.task_done()
Ejemplo n.º 5
0
    def simple_check(self, base_name, in_variant):
        """Check conversion of one FASTQ example into every FASTQ variant.

        ``Quality/<base_name>_original_<in_variant>.fastq`` is converted
        to sanger, solexa and illumina, once with SeqIO.convert and once
        with SeqIO.parse + SeqIO.write. Both results must equal the
        reference file ``Quality/<base_name>_as_<variant>.fastq``.
        """
        source_path = "Quality/%s_original_%s.fastq" % (base_name, in_variant)
        source_format = "fastq-" + in_variant

        for out_variant in ["sanger", "solexa", "illumina"]:
            self.assertTrue(os.path.isfile(source_path))
            target_format = "fastq-" + out_variant

            # Reference output for this target variant
            reference_path = "Quality/%s_as_%s.fastq" % (base_name, out_variant)
            with open(reference_path, _universal_read_mode) as reference:
                expected = reference.read()

            with warnings.catch_warnings():
                if out_variant != "sanger":
                    # Data loss warnings from max qualities are expected
                    for category in (BiopythonWarning, UserWarning):
                        warnings.simplefilter("ignore", category)

                # Route 1: SeqIO.convert
                converted = StringIO()
                SeqIO.convert(source_path, source_format,
                              converted, target_format)
                self.assertEqual(expected, converted.getvalue())

                # Route 2: SeqIO.parse piped into SeqIO.write
                rewritten = StringIO()
                SeqIO.write(SeqIO.parse(source_path, source_format),
                            rewritten, target_format)
                self.assertEqual(expected, rewritten.getvalue())
Ejemplo n.º 6
0
    def run(self, edit):
        """Convert each selected GenBank record to FASTA in a new file.

        Stops with an error dialog at the first selection that is empty
        or does not start with ``LOCUS``.
        """
        for region in self.view.sel():
            selected = self.view.substr(region).strip()

            if not selected:
                sublime.error_message("No selected text")
                return

            # Only the start of the record is checked: SeqIO copes with
            # GenBank text that lacks the closing '//'.
            if not selected.startswith('LOCUS'):
                sublime.error_message(
                    "Selected text does not look like Genbank: no 'LOCUS'")
                return

            # Convert string to string through in-memory buffers
            fasta_buffer = io.StringIO()
            with io.StringIO(selected) as genbank_buffer:
                SeqIO.convert(genbank_buffer, 'genbank', fasta_buffer, 'fasta')

            # Put the FASTA text at position 0 of a fresh file
            target = self.view.window().new_file()
            target.insert(edit, 0, fasta_buffer.getvalue())
Ejemplo n.º 7
0
def main(gbdir, outdir):
    """BLAST the spacers of every organism against its own GenBank record.

    For each ``Organism`` the GenBank file ``<gbdir>/<accession>.gb`` is
    converted to FASTA, the organism's spacers are written as a FASTA
    query, and ``blastn`` writes ``<outdir>/<accession>.json``
    (``-outfmt 15``). Both directories are created when missing.

    Args:
        gbdir: directory holding (or receiving) the GenBank files.
        outdir: directory receiving the BLAST output files.
    """
    os.makedirs(gbdir, exist_ok=True)
    os.makedirs(outdir, exist_ok=True)
    tempq = 'tempquery.fasta'
    tempdb = 'tempdb.fasta'
    try:
        for org in tqdm(Organism.objects.all()):
            # get genbank and convert to fasta
            fpath = os.path.join(gbdir, '{}.gb'.format(org.accession))
            if not os.path.isfile(fpath):
                print('\nFetching {} with accession {}'.format(
                    org.name,
                    org.accession
                ))
                # NOTE(review): fetch() only receives the target path;
                # presumably it derives the accession from it - confirm.
                fetch(fpath)
            SeqIO.convert(fpath, 'genbank', tempdb, 'fasta')
            # get spacers of organism and convert to fasta
            spacers = Spacer.objects.filter(loci__organism=org)
            fastatext = ''.join('>{}\n{}\n'.format(spacer.id, spacer.sequence)
                                for spacer in spacers)
            with open(tempq, 'w') as f:
                f.write(fastatext)
            # run blast and save output
            outpath = os.path.join(outdir, '{}.json'.format(org.accession))
            commandargs = ['blastn', '-query', tempq,
                           '-subject', tempdb, '-out', outpath, '-outfmt', '15']
            subprocess.run(commandargs, stdout=subprocess.DEVNULL)
    finally:
        # The temp files exist only if at least one organism was handled,
        # and must also go away when an iteration fails.
        for temp_path in (tempq, tempdb):
            if os.path.exists(temp_path):
                os.remove(temp_path)
Ejemplo n.º 8
0
def roundtrip_check(filepath):
    '''Is there any odd data in the genbank that will get lost? Bar for the known things addressed with Botch()

    Writes *filepath* back out as GenBank to a scratch file, prints the
    size of the original and of the round-tripped copy, then removes the
    scratch file.
    '''
    import os
    roundtrip_path = "test.gbk"
    try:
        SeqIO.convert(filepath, "genbank", roundtrip_path, "genbank")
        # Compare against the file that was actually converted, not a
        # hard-coded name.
        print("went from " + str(os.stat(filepath).st_size) + " to " + str(os.stat(roundtrip_path).st_size))
    finally:
        if os.path.exists(roundtrip_path):
            os.remove(roundtrip_path)
Ejemplo n.º 9
0
def runSeqGen(workingFile, srp_hap_file, srp_tree_file, debug):
    """Run seq-gen until all generated sequences are unique.

    seq-gen reads ``<workingFile>_seqgen.phylip`` on stdin. After every
    100 attempts without a unique set, ``runBSSC`` is run again before
    retrying. The accepted output is saved as
    ``<workingFile>_seqgen_out.phylip``, converted to
    ``<workingFile>.fasta`` and copied to ``srp_hap_file``.

    Args:
        workingFile: path prefix shared by all intermediate files.
        srp_hap_file: destination of the final FASTA copy.
        srp_tree_file: passed through to ``runBSSC``.
        debug: passed through to ``runBSSC``.
    """
    seqgen_infile = workingFile + "_seqgen.phylip"
    seqgen_outfile = workingFile + "_seqgen_out.phylip"
    fasta_file = workingFile + ".fasta"

    seqgen = runExtProg(seqgenDir + "./seq-gen", pdir=seqgenDir, length=3)
    seqgen.set_param_at("-mHKY", 1)
    seqgen.set_param_at("-t2", 2)
    seqgen.set_param_at("-k1", 3)
    seqgen.set_stdin(seqgen_infile)

    repeat = 0
    while True:
        if repeat == 100:
            # Too many failed attempts with the current input: regenerate it
            runBSSC(workingFile, srp_tree_file, debug)
            print("==========rerun BSSC========")
            repeat = 0
        repeat += 1
        seqgen.run(0)
        if check_unique_sequences(seqgen):
            break

    with open(seqgen_outfile, "w") as temp_handle:
        temp_handle.write(seqgen.output)

    SeqIO.convert(seqgen_outfile, "phylip", fasta_file, "fasta")
    shutil.copy(fasta_file, srp_hap_file)
Ejemplo n.º 10
0
def convertQ2A( qd, qname):
    """Convert ``<qd>/<qname>.fastq`` to ``<qd>/<qname>.fasta``.

    Returns the path of the FASTA file that was written.
    """
    qpath = qd + "/" + qname + ".fastq"
    cpath = qd + "/" + qname + ".fasta"
    SeqIO.convert(qpath, "fastq", cpath, "fasta")
    # Function-call form prints the same text on Python 2 and Python 3
    print("converted file to fasta")

    return cpath
Ejemplo n.º 11
0
def illumina2sangerFq(inputfile):
    """Convert an Illumina-encoded FASTQ file to standard (Sanger) FASTQ.

    The output name is *inputfile* with its last three characters
    replaced by ``.fastq`` (so ``reads.fq`` becomes ``reads.fastq``).

    Returns the name of the file that was written.
    """
    # The leftover debugging call that dumped help(SeqIO.convert) on
    # every invocation has been removed.
    filename = inputfile[:-3]+'.fastq'

    SeqIO.convert(inputfile, "fastq-illumina", filename, "fastq")
    return filename
def toNexus(listOfFiles):
    """Write a NEXUS copy of every FASTA file into the working directory.

    For each path, ``.fasta`` is replaced by ``.nex`` and the leading
    directory part matched by ``.+/.+/`` is replaced by the current
    working directory.

    Returns the current working directory.
    """
    for fasta_path in listOfFiles:
        nexus_path = re.sub(".+/.+/", os.getcwd() + "/",
                            fasta_path.replace(".fasta", ".nex"))
        SeqIO.convert(fasta_path, "fasta", nexus_path, "nexus", generic_dna)
    return os.getcwd()
Ejemplo n.º 13
0
def check_convert_fails(in_filename, in_format, out_format, alphabet=None):
    """Check that parse/write and convert fail with the same ValueError.

    Both the SeqIO.parse + SeqIO.write route and SeqIO.convert must raise
    ValueError for this input, with identical messages. An AssertionError
    is raised if either route succeeds or if the messages differ.
    """
    qual_truncate = truncation_expected(out_format)
    # We want the SAME error message from parse/write as convert!
    err1 = None
    try:
        records = list(SeqIO.parse(in_filename, in_format, alphabet))
        handle = StringIO()
        # catch_warnings restores the filter list even when write raises.
        # warnings.filters.pop() removed the LAST filter although
        # simplefilter inserts at the front, and was skipped on error.
        with warnings.catch_warnings():
            if qual_truncate:
                warnings.simplefilter('ignore', UserWarning)
            SeqIO.write(records, handle, out_format)
        handle.seek(0)
        assert False, "Parse or write should have failed!"
    except ValueError as err:
        err1 = err
    # Now do the conversion...
    try:
        handle2 = StringIO()
        with warnings.catch_warnings():
            if qual_truncate:
                warnings.simplefilter('ignore', UserWarning)
            SeqIO.convert(in_filename, in_format, handle2, out_format,
                          alphabet)
        assert False, "Convert should have failed!"
    except ValueError as err2:
        assert str(err1) == str(err2), \
               "Different failures, parse/write:\n%s\nconvert:\n%s" \
               % (err1, err2)
Ejemplo n.º 14
0
def convert(basename, genbank):
    '''Write the given genbank as ``<basename>.fasta.tmp`` for BLAST.

    Returns the name of the FASTA file.
    '''
    fasta_name = "{0}.fasta.tmp".format(basename)
    SeqIO.convert(genbank, 'genbank', fasta_name, 'fasta')
    return fasta_name
Ejemplo n.º 15
0
def copy_sequence(input_file, output_file):
    '''Copy sequence files from staging area

    The input may be plain, gzip or bzip2 compressed (detected from its
    first two bytes); the output is always gzip compressed. The data is
    converted from Illumina to Sanger FASTQ. If that fails with
    "Invalid character in quality string" the input is copied through
    unchanged instead. Any other ValueError is re-raised.
    '''
    GZIP_HEADER = b'\x1f\x8b'
    BZIP_HEADER = b'BZ'

    pmsg('Copying sequence files', input_file, output_file)
    # Sniff the magic bytes in binary mode (0x8b is not valid text) and
    # close the probe handle straight away.
    with open(input_file, 'rb') as probe:
        header = probe.read(2)
    if header == GZIP_HEADER:
        input_file_handle = gzip.open(input_file, 'rb')
    elif header == BZIP_HEADER:
        input_file_handle = BZ2File(input_file, 'r')
    else:
        input_file_handle = open(input_file, 'rb')

    try:
        output_file_handle = gzip.open(output_file, 'wb')
        try:
            # check whether this is a illumina or sanger fastq file
            try:
                SeqIO.convert(input_file_handle, 'fastq-illumina',
                              output_file_handle, 'fastq-sanger')
            except ValueError as e:
                # check if this is a quality score problem
                if e.args != ('Invalid character in quality string',):
                    raise
                input_file_handle.seek(0)
                # A write-mode gzip stream cannot seek backwards, so
                # reopen the output to discard any partial conversion.
                output_file_handle.close()
                output_file_handle = gzip.open(output_file, 'wb')
                for line in input_file_handle:
                    output_file_handle.write(line)
        finally:
            output_file_handle.close()
    finally:
        input_file_handle.close()
Ejemplo n.º 16
0
def main():
    """Convert a FASTQ file to FASTA and, optionally, to a QUAL file.

    Command-line options: ``-i/--input`` (FASTQ to read, required),
    ``-o/--output`` (FASTA to write, required) and ``-q/--qual``
    (QUAL file to write, optional). When a required option is missing a
    message is printed and the function returns without converting.
    """
    parser = OptionParser()
    parser.add_option("-i", "--input", dest="input",
                      help="read INPUT fastq file", metavar="INPUT")

    parser.add_option("-o", "--output", dest="output",
                      help="write OUTPUT fasta file", metavar="OUTPUT")

    parser.add_option("-q", "--qual", dest="qual",
                      help="write OUTPUT qual file", metavar="QUAL")

    (opt, args) = parser.parse_args()

    if opt.input is None:
        print("Missing input file")
        return

    if opt.output is None:
        print("Missing output file")
        return

    print("Converting files...")

    print("Creating csfasta file")
    count = SeqIO.convert(opt.input, "fastq", opt.output, "fasta")

    print("Converted %i records" % count)

    if opt.qual is not None:
        print("Creating Qual file")
        count = SeqIO.convert(opt.input, "fastq", opt.qual, "qual")

        print("Converted %i qual records" % count)
Ejemplo n.º 17
0
def process_file(filepath, organism):
    """Process a single file given by the user on the command line.

    The format is taken from the lower-cased file extension (``gb`` is
    treated as ``genbank``). Files that are not FASTA are converted to
    ``/tmp/<name>.fasta`` first, where ``<name>`` is the file name up to
    its first dot.

    Returns whatever ``process_efm_cli`` returns.
    """
    fasta_filepath = filepath

    # Determine file type
    fname = filepath.split('/')[-1].lower()
    fnamesplit = fname.split('.')
    ftype = fnamesplit[-1]
    if ftype == 'gb':
        ftype = 'genbank'
    check_features = (ftype == 'genbank')

    # Open the file and get metadata
    obj_file = SeqIO.read(filepath, ftype)
    features = get_genbank_features(obj_file)
    my_seq = str(obj_file.seq)

    # Create FASTA file if necessary
    if ftype != 'fasta':
        fasta_filepath = "/tmp/" + fnamesplit[0] + ".fasta"
        with open(fasta_filepath, 'w') as handle:
            # Use the detected format; a hard-coded "genbank" broke
            # every other non-FASTA input.
            SeqIO.convert(filepath, ftype, handle, "fasta")

    # Process the file
    return process_efm_cli(fasta_filepath, features, my_seq, organism,
                           check_features, fname)
Ejemplo n.º 18
0
def prepare_data(ionfile, index_length):
    '''
    * Changes quality format from Phred to Solexa (which is required by the fastx-toolkit).
    * Changes sequences id to incremental numbers.
    * Creates temporal FASTA file with the indexes removed from the sequences.

    Files generated will be written to folder ``data/modified/``

    * ``ionfile`` argument is FASTQ format file as produced by IonTorrent
    * ``index_length`` number of base pairs of your indexes. This is
      necessary to trim the indexes before blasting the FASTA file
      against the reference gene sequences.

    Raises ``subprocess.CalledProcessError`` when one of the fastx-toolkit
    commands exits with a non-zero status.

    Example:

    >>> from pyphylogenomics import NGS
    >>> ionfile = "ionrun.fastq";
    >>> index_length = 8;
    >>> NGS.prepare_data(ionfile, index_length);
    Your file has been saved using Solexa quality format as data/modified/wrk_ionfile.fastq
    Your sequence IDs have been changed to numbers.
    The FASTA format file data/modified/wrk_ionfile.fasta has been created.
    '''
    # create folder to keep data
    folder = os.path.join("data", "modified")
    if not os.path.exists(folder):
        os.makedirs(folder)

    wrkfile = os.path.join(folder, "wrk_ionfile.fastq")
    fasta_file = os.path.join(folder, "wrk_ionfile.fasta")

    # change quality format from Phred to Solexa (required by fastx-toolkit)
    SeqIO.convert(ionfile, "fastq", wrkfile, "fastq-solexa")
    print("Your file has been saved using Solexa quality format as " + wrkfile)

    # change sequences id to incremental numbers. check_call raises on
    # failure, so no return-code test is needed (the old one was dead code).
    subprocess.check_call(
        ["fastx_renamer", "-n", "COUNT", "-i", wrkfile, "-o", "tmp.fastq"])
    print("Your sequence IDs have been changed to numbers.")

    # replace working file with temporal file
    os.rename("tmp.fastq", wrkfile)

    # create temporal FASTA file
    subprocess.check_call(["fastq_to_fasta", "-i", wrkfile, "-o", "tmp.fasta"])

    # trim index region: keep bases starting right after the index
    first_base = int(index_length) + 1
    try:
        subprocess.check_call(
            ["fastx_trimmer", "-f", str(first_base),
             "-i", "tmp.fasta", "-o", fasta_file])
    finally:
        if os.path.isfile("tmp.fasta"):
            os.remove("tmp.fasta")

    print("The FASTA format file " + fasta_file + " has been created.")
Ejemplo n.º 19
0
def seqio(in_fhands, out_fhands, out_format, copy_if_same_format=True):
    '''It converts sequence files between formats

    Three cases are handled:

    * one named input already in ``out_format``: the content is copied,
      or symlinked when ``copy_if_same_format`` is false;
    * one input and one output: converted with ``SeqIO.convert``;
    * one input, two outputs and ``out_format == 'fasta'``: sequences go
      to the first output and qualities to the second.

    Any other combination is silently ignored.

    Raises MalformedFile when the ValueError raised by the conversion is
    recognised by ``error_quality_disagree``.
    '''

    in_formats = [guess_format(fhand) for fhand in in_fhands]

    if (len(in_formats) == 1 and in_formats[0] == out_format and
            hasattr(in_fhands[0], 'name')):
        if copy_if_same_format:
            copyfileobj(in_fhands[0], out_fhands[0])
        else:
            rel_symlink(in_fhands[0].name, out_fhands[0].name)

    elif len(in_fhands) == 1 and len(out_fhands) == 1:
        try:
            SeqIO.convert(in_fhands[0], in_formats[0], out_fhands[0],
                          out_format)
        except ValueError as error:
            if error_quality_disagree(error):
                raise MalformedFile(str(error))
            raise
    elif (len(in_fhands) == 1 and len(out_fhands) == 2 and
          out_format == 'fasta'):
        try:
            for seq in read_seqrecords([in_fhands[0]]):
                SeqIO.write([seq], out_fhands[0], out_format)
                SeqIO.write([seq], out_fhands[1], 'qual')
        # "as" form, consistent with the branch above and valid on
        # Python 2.6+ and Python 3
        except ValueError as error:
            if error_quality_disagree(error):
                raise MalformedFile(str(error))
            raise
Ejemplo n.º 20
0
def sync_readdata( rawdir, ngsdata ):
    '''
    Ensures that ab1 files are symlinked from the RawData/Sanger/Run directory
    and that they are then converted to fastq

    Links and fastq files are created under
    <ngsdata>/ReadData/Sanger/<name of rawdir>; existing ones are skipped.

    @param rawdir - RawData/Sanger/Run path
    @param ngsdata - Path to root NGSData directory
    '''
    raw_reads = glob( join( rawdir, '*.ab1' ) )
    readd = join( ngsdata, 'ReadData', 'Sanger', basename(rawdir) )
    if not isdir( readd ):
        os.makedirs( readd )
    for read in raw_reads:
        # Relative target so the link survives moving the whole tree
        lnk = relpath( read, readd )
        rdpath = join( readd, basename(read) )
        if not exists( rdpath ):
            logger.info( 'Symlinking {0} to {1}'.format(rdpath, lnk) )
            os.symlink( lnk, rdpath )
        else:
            logger.info( 'Skipping existing abi file {0}'.format(rdpath) )
        # Swap only the extension; str.replace would also rewrite a
        # '.ab1' occurring in a directory name.
        fqpath = os.path.splitext( rdpath )[0] + '.fastq'
        if not exists( fqpath ):
            logger.info( 'Converting {0} to fastq {1}'.format(rdpath,fqpath) )
            SeqIO.convert( rdpath, 'abi', fqpath, 'fastq' )
        else:
            logger.info( 'Skipping existing fastq file {0}'.format(fqpath) )
Ejemplo n.º 21
0
def aln2hmm(task):
    """Build an HMM profile from the Clustal alignment in task.depends[0].

    The alignment is first rewritten in Stockholm format, then passed to
    ``hmmbuild`` with ``task.target`` as the output.

    Cleans: .stk
    """
    clustal_source = task.depends[0]
    stockholm_path = ext(clustal_source, 'stk')
    SeqIO.convert(str(clustal_source), 'clustal', stockholm_path, 'stockholm')
    sh('hmmbuild %s %s' % (task.target, stockholm_path))
Ejemplo n.º 22
0
def cmalign(infile, outfile, cpu):
    """Align *infile* with cmalign and save the alignment as FASTA.

    The Stockholm output goes to a temporary file and is converted into
    *outfile*. Returns the scores from ``wrap.cmalign_files``.
    """
    with util.ntf(suffix='.sto') as stockholm_tmp:
        with open(outfile, 'w') as fasta_out:
            scores = wrap.cmalign_files(infile, stockholm_tmp.name, cpu=cpu)
            SeqIO.convert(stockholm_tmp, 'stockholm', fasta_out, 'fasta')
            fasta_out.flush()

    return scores
Ejemplo n.º 23
0
def convert(in_file, in_format, out_format, treeid, seq_data_type):
    """Convert *in_file* to *out_format* and delete the input file.

    The result is written to ``TMP_UTILS_PATH + treeid + "." + out_format``
    and that path is returned.
    """
    out_file = TMP_UTILS_PATH + treeid + "." + out_format
    # SeqIO responds to clustal as identifier for the format but the
    # official extension is .aln
    # NOTE(review): the remapped value is only used as the
    # SEQUENCE_ALPHABET key; the output name still uses out_format.
    # Confirm that this is the intended behaviour.
    alphabet_key = ".aln" if seq_data_type == "clustal" else seq_data_type
    SeqIO.convert(in_file, in_format, out_file, out_format,
                  alphabet=SEQUENCE_ALPHABET[alphabet_key])
    os.remove(in_file)
    return out_file
Ejemplo n.º 24
0
def formatFasta(inFile, outFile):
    """
    Rewrite a FASTA file through SeqIO so that formatting errors are
    normalised.

    :param inFile: FASTA file that might contain format errors
    :param outFile: path of the cleaned FASTA file to write
    :return: Nothing
    """
    fasta = "fasta"
    SeqIO.convert(inFile, fasta, outFile, fasta)
Ejemplo n.º 25
0
    def parallel(self, names, fileType='fasta', nprocs=1, **kwargs):
        """Running simulator in parallel with parmap.map

        Args:
        names -- NameFile class with iter_names() method
        fileType -- sequence file format
        nprocs -- max number of parallel simulation calls
        kwargs -- passed to simulator

        Attribs added to each name instance in names:
        simReadsFile -- file name of simulated reads
        simReadsFileType -- file type (eg., 'fasta' or 'fastq')
        simReadsFileCount -- number of simulated reads

        Return:
        0 on success; 1 if any read file is missing or empty
        """
        # making list of fasta file to provide simulator call
        fastaFiles = [name.get_fastaFile() for name in names.iter_names()]

        # setting kwargs
        new_simulator = partial(self, **kwargs)

        # calling simulator
        res = parmap.map(new_simulator, fastaFiles, processes=nprocs)

        # checking that simulated reads were created for all references
        for row in res:
            simFile = row['simReadsFile']
            if simFile is None or not os.path.isfile(simFile):
                return 1
            # os.stat(...)[0] is st_mode, never 0 for a regular file, so
            # empty files went unnoticed; compare the size instead.
            if os.path.getsize(simFile) == 0:
                return 1

        # converting reads to fasta if needed
        if fileType.lower() == 'fasta':
            for result in res:
                simFile = result['simReadsFile']
                simFileType = result['simReadsFileType'].lower()
                if simFileType != 'fasta':
                    fastaFile = os.path.splitext(simFile)[0] + '.fna'
                    SeqIO.convert(simFile, simFileType, fastaFile, 'fasta')
                    result['simReadsFile'] = fastaFile
                    result['simReadsFileType'] = 'fasta'

        # setting attribs in name instances (results are in input order)
        for name, result in zip(names.iter_names(), res):
            # read file
            simReadsFile = result['simReadsFile']
            name.set_simReadsFile(simReadsFile)
            # file type
            simReadsFileType = result['simReadsFileType'].lower()
            name.set_simReadsFileType(simReadsFileType)
            # number of simulated reads
            num_reads = sum(
                1 for _ in SeqIO.parse(simReadsFile, simReadsFileType))
            name.set_simReadsCount(num_reads)

        return 0
Ejemplo n.º 26
0
 def setUp(self):
     """Create temp input dirs holding the test reads as FASTQ and FASTA.

     Output directory names are derived as siblings of the temp
     directories; they are not created here.
     """
     fasta_dir = tempfile.mkdtemp()
     fastq_dir = tempfile.mkdtemp()
     self.fastaInputDir = fasta_dir
     self.fastqInputDir = fastq_dir
     self.fastaOutputDir = join(dirname(fasta_dir), 'fastaout')
     self.fastqOutputDir = join(dirname(fastq_dir), 'fastqout')

     fastq_source = here(inputFastq)
     shutil.copy(fastq_source, fastq_dir)
     SeqIO.convert(fastq_source, 'fastq',
                   join(fasta_dir, "R1.fasta"), 'fasta')
Ejemplo n.º 27
0
def main(fichier):
    """
    Convert the FASTQ file *fichier* into FASTA, written to ``output.txt``
    in the current directory.
    """
    from Bio import SeqIO
    # Context managers close both files even if the conversion raises
    with open(fichier, "r") as handle, open('output.txt', 'w') as g:
        SeqIO.convert(handle, 'fastq', g, 'fasta')
Ejemplo n.º 28
0
def fas2clus(inFile):
    """
    Write a ClustalW copy of the FASTA file *inFile* to ``<inFile>cl``.

    Modules required:
    - SeqIO (from Bio)
    Usage: <file>

    Returns the new file opened for reading; the caller must close it.
    """
    clustal_path = inFile + 'cl'
    SeqIO.convert(inFile, 'fasta', clustal_path, 'clustal')
    return open(clustal_path)
Ejemplo n.º 29
0
def main():
    """Convert the GenBank file given with ``-gbk`` to the FASTA file ``-o``.

    Both options are required.
    """
    parser = argparse.ArgumentParser(
        description="This program parses a .gbk file to a .fasta file")
    for flag, help_text in (('-gbk', ".gbk file"),
                            ('-o', "results file name")):
        parser.add_argument(flag, nargs='?', type=str, help=help_text,
                            required=True)

    options = parser.parse_args()

    SeqIO.convert(options.gbk, "genbank", options.o, "fasta")
Ejemplo n.º 30
0
def convert2fastq(input, output):
    """Convert every ``.ab1`` file in *input* to FASTQ under ``<output>/tmp``.

    Each output file is named after the part of the input file name
    before its first dot. The ``tmp`` directory is created if needed.

    Returns the path of the ``tmp`` directory.
    """
    # Define (and create) the target up front. It used to be bound only
    # inside the loop, so an input directory without .ab1 files made the
    # final return fail.
    tmp = output + os.sep + "tmp"
    if not os.path.exists(tmp):
        os.makedirs(tmp)
    for filename in os.listdir(input):
        if filename.endswith(".ab1"):
            out = tmp + os.sep + str(filename).split('.')[0] + '.fastq'
            # ABI traces are binary; close each one after conversion
            with open(input + os.sep + filename, 'rb') as abi:
                SeqIO.convert(abi, 'abi', out, 'fastq')
    return tmp
Ejemplo n.º 31
0
def check_convert(in_filename, in_format, out_format, alphabet=None):
    """Check that SeqIO.convert agrees with SeqIO.parse + SeqIO.write.

    The records are written in *out_format*, parsed back and compared
    with the originals via ``compare_records``. The text produced by
    ``SeqIO.convert`` must then equal the text produced by ``SeqIO.write``.
    """
    with open(in_filename) as in_handle:
        records = list(SeqIO.parse(in_handle, in_format, alphabet))
    # Write it out...
    handle = StringIO()
    qual_truncate = truncation_expected(out_format)
    # catch_warnings restores the filters reliably.
    # warnings.filters.pop() dropped the last filter, not the one that
    # simplefilter had just inserted at the front.
    with warnings.catch_warnings():
        if qual_truncate:
            warnings.simplefilter('ignore', UserWarning)
        SeqIO.write(records, handle, out_format)
    handle.seek(0)
    # Now load it back and check it agrees,
    records2 = list(SeqIO.parse(handle, out_format, alphabet))
    compare_records(records, records2, qual_truncate)
    # Finally, use the convert function, and check that agrees:
    handle2 = StringIO()
    with warnings.catch_warnings():
        if qual_truncate:
            warnings.simplefilter('ignore', UserWarning)
        SeqIO.convert(in_filename, in_format, handle2, out_format, alphabet)
    # We could re-parse this, but it is simpler and stricter:
    assert handle.getvalue() == handle2.getvalue()
Ejemplo n.º 32
0
def prep_for_beast():
    """Rename the aligned records and export them as a NEXUS file.

    Every record from ``final_ny_aligned.txt`` gets the second
    space-separated word of its description as both id and description.
    The renamed alignment is saved as FASTA and then converted to
    ``final_ny_aligned.nex``.
    """
    fixed_fasta = "final_ny_aligned_name_fixed.txt"

    records = list(SeqIO.parse('final_ny_aligned.txt', "fasta"))
    for rec in records:
        name = rec.description.split(" ")[1]
        rec.id = name
        rec.description = name
    SeqIO.write(records, fixed_fasta, "fasta")

    # print file to NEXUS file
    converted = SeqIO.convert(fixed_fasta, "fasta",
                              "final_ny_aligned.nex", "nexus",
                              alphabet=IUPAC.ambiguous_dna)
    print("Converted %i records" % converted)
Ejemplo n.º 33
0
def distmat_cmalign(
        sequence_file,
        prefix,
        cpu=wrap.CMALIGN_THREADS,
        min_bitscore=10):
    """Align with cmalign and return FastTree distances.

    Logs a warning for sequences whose ``bit_sc`` score is below
    *min_bitscore*. Returns the ``(taxa, distmat)`` pair produced by
    ``outliers.fasttree_dists``.
    """
    with util.ntf(prefix=prefix, suffix='.aln') as a_sto:
        with util.ntf(prefix=prefix, suffix='.fasta') as a_fasta:
            scores = wrap.cmalign_files(sequence_file, a_sto.name, cpu=cpu)

            below_cutoff = scores['bit_sc'] < min_bitscore
            if below_cutoff.any():
                log.warning(
                    'The following sequences aligned with bit score < {}: {}'
                    .format(min_bitscore, scores[below_cutoff].index))

            # FastTree requires FASTA
            SeqIO.convert(a_sto, 'stockholm', a_fasta, 'fasta')
            a_fasta.flush()

            taxa, distmat = outliers.fasttree_dists(a_fasta.name)

    return taxa, distmat
Ejemplo n.º 34
0
    def to_fasta(self, rmFile=False):
        """Converting the read file to fasta.

        Args:
        rmFile -- remove old version of file?

        Attrib edit:
        readFile name set to new file (*.fasta)
        readFileFormat set to fasta format

        Return:
        True on success; False if the conversion raised ValueError, in
        which case the read file and the attributes are left unchanged.
        """
        # unpack
        originalFile = self.get_readFile()
        readFileFormat = self.get_readFileFormat()

        # new file name
        basename, ext = os.path.splitext(originalFile)
        newFile = basename + '.fasta'
        ## rename file to prevent overwrite
        readFile = originalFile
        renamed = ext == '.fasta'
        if renamed:
            readFile = basename + '.tmp'
            os.rename(originalFile, readFile)

        # convert
        try:
            SeqIO.convert(readFile, readFileFormat, newFile, 'fasta')
        except ValueError:
            if renamed:
                # Roll back: the object still refers to the original
                # name, so put the input back where it was.
                if os.path.exists(newFile):
                    os.remove(newFile)
                os.rename(readFile, originalFile)
            return False

        # remove
        if rmFile:
            os.remove(readFile)

        # setting attributes
        self.set_readFile(newFile)
        self.set_readFileFormat('fasta')

        return True
Ejemplo n.º 35
0
def main():
    """Split the records of a FASTA file into fixed-size windows.

    Each input record is cut into fragments of ``--window`` bases that
    start every ``--step`` bases. Fragments are named after their start
    offset and written to the ``--output`` FASTA file. A temporary
    tab-separated file ``out.tab`` is used and removed afterwards.
    """
    ap = GooeyParser(
        description=
        "splits a fasta file with user specified length and fragment overlap")
    ap.add_argument("-in",
                    "--input",
                    required=True,
                    widget='FileChooser',
                    help="input fasta file")
    ap.add_argument("-step",
                    "--step",
                    required=True,
                    help="step size to split fasta, type = int")
    ap.add_argument("-win",
                    "--window",
                    required=True,
                    help="window size of splitted subsets, type = int")
    ap.add_argument("-out",
                    "--output",
                    required=True,
                    widget='FileSaver',
                    help="output fasta file")
    args = vars(ap.parse_args())

    window = int(args['window'])
    step = int(args['step'])
    tab_path = "out.tab"
    try:
        # Mode 'w': append mode picked up rows left over from a previous
        # failed run. Rows are written directly as "<offset>\t<sequence>".
        with open(tab_path, 'w') as f:
            for record in SeqIO.parse(args['input'], "fasta"):
                for i in range(0, len(record.seq) - window + 1, step):
                    f.write("%d\t%s\n" % (i, record.seq[i:i + window]))

        # convert to fasta
        SeqIO.convert(tab_path, "tab", args['output'], "fasta")
    finally:
        # Portable clean-up ("del" only exists in the Windows shell)
        if os.path.exists(tab_path):
            os.remove(tab_path)
Ejemplo n.º 36
0
 def simple_check(self, base_name, in_variant):
     """Check conversion of one FASTQ example into every FASTQ variant.

     ``Quality/<base_name>_original_<in_variant>.fastq`` is converted to
     sanger, solexa and illumina with SeqIO.convert and with
     SeqIO.parse + SeqIO.write. Both results must equal the reference
     file ``Quality/<base_name>_as_<variant>.fastq``.
     """
     for out_variant in ["sanger", "solexa", "illumina"]:
         in_filename = "Quality/%s_original_%s.fastq" \
                       % (base_name, in_variant)
         self.assertTrue(os.path.isfile(in_filename))
         # Load the reference output...
         # NOTE(review): mode "rU" is rejected by Python 3.11+; kept
         # here to preserve Python 2 newline handling.
         with open("Quality/%s_as_%s.fastq" % (base_name, out_variant),
                   "rU") as handle:
             expected = handle.read()
         # catch_warnings restores the filters even when an assertion
         # fails. warnings.filters.pop() removed the last filter, not
         # the one simplefilter had inserted at the front.
         with warnings.catch_warnings():
             if out_variant != "sanger":
                 # Ignore data loss warnings from max qualities
                 warnings.simplefilter('ignore', BiopythonWarning)
             # Check matches using convert...
             handle = StringIO()
             SeqIO.convert(in_filename, "fastq-" + in_variant, handle,
                           "fastq-" + out_variant)
             self.assertEqual(expected, handle.getvalue())
             # Check matches using parse/write
             handle = StringIO()
             SeqIO.write(SeqIO.parse(in_filename, "fastq-" + in_variant),
                         handle, "fastq-" + out_variant)
             self.assertEqual(expected, handle.getvalue())
Ejemplo n.º 37
0
def read_sanger(infilepath,outfilepath):
    """Convert every ``.ab1`` trace in ``infilepath`` to FASTQ in ``outfilepath``.

    Each ``<name>.ab1`` becomes ``<name>.fastq`` in ``outfilepath``.  The
    input directory and the number of converted files are printed.

    Args:
        infilepath: directory scanned (non-recursively) for ``.ab1`` files.
        outfilepath: directory that receives the FASTQ files.

    Returns:
        None.
    """
    count_infile = 0
    print(infilepath)
    for filename in os.listdir(infilepath):
        if not filename.endswith('.ab1'):
            continue
        abi_filename = os.path.join(infilepath, filename)
        # splitext swaps only the trailing extension; str.replace would also
        # rewrite any other '.ab1' occurring earlier in the file name.
        fastq_filename = os.path.join(
            outfilepath, os.path.splitext(filename)[0] + '.fastq')
        SeqIO.convert(abi_filename, 'abi', fastq_filename, 'fastq')
        count_infile += 1
    print("There are total %d sequences" % count_infile)

    return
Ejemplo n.º 38
0
 def generate_dist(self):
     """Align ``self.fasta_seq``, build a RAxML tree and export distances.

     Runs MAFFT with PHYLIP output, saves the alignment as
     ``testing/alignment.phy`` and ``testing/align.fasta``, runs RAxML
     (GTRGAMMA, run name ``reversatest``) in ``self.cwPath``, then writes
     the tree's pairwise distance matrix to ``distance.csv``.
     """
     mafft_cline = MafftCommandline(input=self.fasta_seq,
                                    maxiterate=1000,
                                    localpair=True,
                                    phylipout=True)
     stdout, stderr = mafft_cline()
     #Save alignments into  FASTA and PHYLIP format
     phyFile = 'testing/alignment.phy'
     # `with` guarantees the file is flushed and closed before it is
     # re-read below, even if the write raises.
     with open(phyFile, 'w') as outPhy:
         outPhy.write(stdout)
     fastaFile = 'testing/align.fasta'
     SeqIO.convert(phyFile, 'phylip', fastaFile, 'fasta')
     #Create phylogenetic tree of the original sequences
     raxml_cline = RaxmlCommandline(sequences=phyFile,
                                    model='GTRGAMMA',
                                    name='reversatest',
                                    working_dir=self.cwPath)
     raxml_cline()
     #Calculate the phylo distances between each branch of the tree
     # NOTE(review): the tree is read from "testing/" while RAxML runs in
     # self.cwPath -- confirm both point at the same directory.
     tree = dendropy.Tree.get_from_path("testing/RAxML_result.reversatest",
                                        "newick")
     pdm = tree.phylogenetic_distance_matrix()
     pdm.write_csv('distance.csv')
Ejemplo n.º 39
0
 def check_conversion(self, filename, in_format, out_format, alphabet):
     """Round-trip ``filename`` through ``out_format`` and cross-check convert.

     The records are written with ``SeqIO.write``, re-parsed and compared
     with the originals; ``SeqIO.convert`` must then produce exactly the
     same text as the write step did.
     """
     msg = "Convert %s from %s to %s" % (filename, in_format, out_format)
     original = list(SeqIO.parse(filename, in_format, alphabet))
     lossy = truncation_expected(out_format)

     def run_quietly(action):
         # Silence BiopythonWarning only when truncation is expected.
         with warnings.catch_warnings():
             if lossy:
                 warnings.simplefilter("ignore", BiopythonWarning)
             return action()

     # Write the records out, then load them back and check they agree.
     via_write = StringIO()
     run_quietly(lambda: SeqIO.write(original, via_write, out_format))
     via_write.seek(0)
     reparsed = list(SeqIO.parse(via_write, out_format, alphabet))
     self.assertEqual(len(original), len(reparsed), msg=msg)
     for before, after in zip(original, reparsed):
         self.compare_record(before, after, lossy, msg=msg)

     # The one-step convert function must agree; comparing the raw text is
     # simpler and stricter than re-parsing.
     via_convert = StringIO()
     run_quietly(lambda: SeqIO.convert(filename, in_format, via_convert,
                                       out_format, alphabet))
     self.assertEqual(via_write.getvalue(), via_convert.getvalue(), msg=msg)
Ejemplo n.º 40
0
def ficheirosProteinas(dicionario):
    """Download each protein in ``dicionario`` and save it as GenBank and FASTA.

    For the i-th key (in iteration order) the record is fetched from the
    Entrez ``protein`` database, written to ``sequenceProtGenbank<i>.gb``
    and converted to ``sequenceProtF<i>.fasta`` in the working directory.

    Args:
        dicionario: iterable whose items are used as Entrez protein ids.

    Returns:
        List of the GenBank file names written, in iteration order.
    """
    Entrez.email = "*****@*****.**"

    lista_ficheiros = []

    for i, key in enumerate(dicionario):
        handleGB = Entrez.efetch(db="protein",
                                 rettype="gb",
                                 retmode="text",
                                 id=key)
        try:
            seq_record = SeqIO.read(handleGB, "genbank")
        finally:
            # Release the network handle even when parsing fails.
            handleGB.close()

        nome_ficheiro = 'sequenceProtGenbank' + str(i) + '.gb'
        SeqIO.write(seq_record, nome_ficheiro,
                    "genbank")  # Save in GenBank format
        lista_ficheiros.append(nome_ficheiro)

        SeqIO.convert(nome_ficheiro, "genbank",
                      'sequenceProtF' + str(i) + '.fasta', "fasta")

    return lista_ficheiros
Ejemplo n.º 41
0
def groom(in_file, in_qual="fastq-sanger", out_dir=None, out_file=None):
    """
    Convert a FASTQ file to Sanger qualities unless it already uses them.

    Pass fastq-illumina for Illumina 1.3-1.7 qualities and fastq-solexa
    for the original solexa qualities. When in doubt, your sequences are
    probably fastq-sanger.

    Returns ``out_file`` in every case.

    NOTE(review): ``out_dir`` is not used, and for fastq-sanger input the
    returned ``out_file`` is whatever the caller passed (``None`` by
    default) -- confirm this is what callers expect.
    """
    needs_conversion = in_qual != "fastq-sanger"
    if needs_conversion:
        with file_transaction(out_file) as tx_out_file:
            n_reads = SeqIO.convert(in_file, in_qual, tx_out_file,
                                    "fastq-sanger")
        logger.info("Converted %d reads in %s to %s."
                    % (n_reads, in_file, out_file))
    else:
        logger.info("%s is already in Sanger format." % (in_file))
    return out_file
Ejemplo n.º 42
0
 def check(self, sff_name, sff_format, out_name, format):
     """Check converting ``sff_name`` gives the records stored in ``out_name``.

     Args:
         sff_name: input file passed to ``SeqIO.convert``.
         sff_format: format name used to read ``sff_name``.
         out_name: reference file holding the expected records.
         format: output format, also used to parse ``out_name``.
     """
     wanted = list(SeqIO.parse(out_name, format))
     data = StringIO()
     count = SeqIO.convert(sff_name, sff_format, data, format)
     self.assertEqual(count, len(wanted))
     data.seek(0)
     converted = list(SeqIO.parse(data, format))
     self.assertEqual(len(wanted), len(converted))
     for old, new in zip(wanted, converted):
         self.assertEqual(old.id, new.id)
         self.assertEqual(old.name, new.name)
         if format != "qual":
             self.assertEqual(str(old.seq), str(new.seq))
         # This used to be an `elif`, reachable only for "qual", so formats
         # carrying both sequence and quality never had their qualities
         # compared.  .get() keeps quality-less formats comparable.
         if format != "fasta":
             self.assertEqual(
                 old.letter_annotations.get("phred_quality"),
                 new.letter_annotations.get("phred_quality"))
Ejemplo n.º 43
0
def countKmerMatch(fas, kmer_list, outfile, is_fastq):
	"""Flag reads containing any marker sequence and save the matching reads.

	Args:
		fas: path of the read file, parsed as FASTQ when ``is_fastq`` is
			true and as FASTA otherwise.
		kmer_list: path of a text file with one marker sequence per line.
		outfile: output prefix; matching reads go to ``<outfile>.fasta``
			and, for FASTQ input, also to ``<outfile>.fastq``.
		is_fastq: whether ``fas`` is a FASTQ file.

	Returns:
		``(match, total_count)``: ``match`` holds ``'1'`` or ``'0'`` per
		read in input order; ``total_count`` is the number of matching reads.
	"""
	match = []
	total_count = 0
	with open(kmer_list, 'r') as fin:
		# Build each marker Seq once rather than once per read.  Blank lines
		# are skipped: an empty pattern would presumably match every read.
		markers = [Seq(line) for line in fin.read().splitlines() if line]

	bar = None
	if is_fastq:
		# Count by iterating the parser, not by converting the whole file
		# into memory.  Only done for FASTQ: the old unconditional FASTQ
		# pass was also applied to FASTA input.
		num_elem = sum(1 for _ in SeqIO.parse(fas, "fastq"))
		bar = progressbar.ProgressBar(redirect_stdout=True, max_value=num_elem)

	output_fastq_handle = open(outfile + '.fastq', "w") if is_fastq else None
	try:
		with open(outfile + '.fasta', "w") as output_fasta_handle:
			records = SeqIO.parse(fas, "fastq" if is_fastq else "fasta")
			for i, record in enumerate(records, start=1):
				if bar is not None:
					bar.update(i)
				num = sum(int(record.seq.count(marker)) for marker in markers)
				if num <= 0:
					match.append('0')
					continue
				match.append('1')
				if is_fastq:
					print("marker found at read number %d" %(i))
					SeqIO.write(record, output_fastq_handle, "fastq")
				SeqIO.write(record, output_fasta_handle, "fasta")
				total_count = total_count + 1
	finally:
		if output_fastq_handle is not None:
			output_fastq_handle.close()
	return match, total_count
def main():
    """Read --input/--output from a Gooey form and convert FASTA to tab."""
    parser = GooeyParser(
        description="converts a fasta file , into a tabular file with identifier and sequence"
    )
    # Both options are required; only the widget and help text differ.
    option_specs = (
        ("-in", "--input", 'FileChooser', "input fasta file"),
        ("-out", "--output", 'FileSaver', "output tab seperated file"),
    )
    for short_flag, long_flag, widget, help_text in option_specs:
        parser.add_argument(short_flag,
                            long_flag,
                            required=True,
                            widget=widget,
                            help=help_text)
    options = vars(parser.parse_args())
    SeqIO.convert(options['input'], "fasta", options['output'], "tab")
Ejemplo n.º 45
0
def gb_to_fasta(dirpath):
    """Convert each GenBank file in ``dirpath`` to FASTA with a renamed header.

    Every entry whose name ends in ``gb`` or ``gbk`` is converted to
    ``concat-<name>.fas`` in the current working directory.  Each header
    line of that FASTA is then replaced by ``><name>``.

    Args:
        dirpath: directory scanned (non-recursively) for GenBank files.
    """
    for filename in os.listdir(dirpath):
        if not filename.endswith(("gb", "gbk")):
            continue
        outname = 'concat-' + filename + '.fas'
        # os.listdir returns bare names, so join with dirpath; otherwise the
        # input is only found when dirpath is the working directory.
        SeqIO.convert(os.path.join(dirpath, filename), "genbank", outname,
                      "fasta")
        print("Genbank file found - " + filename)
        print("Generating " + outname + " ...Done!")
        with open(outname) as f:
            lines = f.readlines()
        with open(outname, "w") as f:
            for line in lines:
                if line.startswith(">"):
                    line = ">" + filename + "\n"
                f.write(line)
Ejemplo n.º 46
0
def main():
    """Read --input/--output from a Gooey form and convert PDB atoms to FASTA."""
    parser = GooeyParser(
        description="converts a pdb file with only the atom coordinates section, into a fasta file"
    )
    # Both options are required; only the widget and help text differ.
    option_specs = (
        ("-in", "--input", 'FileChooser',
         "input pdb file without SEQRES header"),
        ("-out", "--output", 'FileSaver', "output fasta file"),
    )
    for short_flag, long_flag, widget, help_text in option_specs:
        parser.add_argument(short_flag,
                            long_flag,
                            required=True,
                            widget=widget,
                            help=help_text)
    options = vars(parser.parse_args())
    SeqIO.convert(options['input'], "pdb-atom", options['output'], "fasta")
Ejemplo n.º 47
0
def convertMod(args):
    """Convert ``args.input`` from ``args.inFormat`` to ``args.outFormat``.

    GFF output is delegated to ``to_GFF`` and is only accepted for genbank
    or embl input.  Every other target goes through ``SeqIO.convert``.
    Problems are reported on standard error rather than raised.

    Args:
        args: namespace with ``input``, ``output``, ``inFormat``,
            ``outFormat`` and ``verbose`` attributes.

    Returns:
        None in every case.
    """
    if args.outFormat.lower() == 'gff':
        if args.inFormat.lower() in ('genbank', 'embl'):
            to_GFF(args)
        else:
            # sys.stderr: the former `sys.err` does not exist and raised
            # AttributeError instead of printing the message.
            sys.stderr.write("ERROR: ValueError, Could not convert file\n")
        return None

    try:
        count = SeqIO.convert(args.input, args.inFormat, args.output, args.outFormat)
    except ValueError:
        sys.stderr.write("ERROR: ValueError, Could not convert file\n")
        return None

    if count == 0:
        sys.stderr.write('ERROR: No records converted. Possibly wrong input filetype\n')
    elif args.verbose:
        sys.stderr.write("Converted %i records\n" % count)
    return None
Ejemplo n.º 48
0
def main():
	"""Run the pipeline: trim, merge, convert, de-duplicate, split, IgBLAST.

	Works inside numbered sub-folders of the current working directory
	(``1.0-origin`` to ``1.4-IgBLAST-output``) and relies on the module-level
	``project`` name plus the external ``pear``, ``cp`` and ``igblastn``
	commands.  One background ``igblastn`` job is started per split FASTA.
	"""
	print("Begin!")
	prj_folder = os.getcwd()
	print("Quality contorl...")
	infiles = glob.glob("%s/1.0-origin/*.fastq"%(prj_folder))
	if len(infiles) != 2:
		# NOTE(review): only reported, the run continues -- confirm intended.
		print("The %s be loaded in error, are they two?"%infiles)
	for the_file in infiles:
		trim_fastq_by_quality(the_file,prj_folder)
	print("Merging...")
	infiles = glob.glob("%s/1.1-trimed-fastq-file/*.fastq"%(prj_folder))

	os.chdir("%s/1.2-merged-fastq-file/"%(prj_folder))
	subprocess.call("pear -f %s -r %s -o %s"%(infiles[0],infiles[1],project),shell=True)

	print("Convert fastq to fasta...")
	merged_file = "%s/1.2-merged-fastq-file/%s.assembled.fastq"%(prj_folder, project)
	fname = os.path.splitext(merged_file)[0]
	count = SeqIO.convert(merged_file, "fastq","%s.fasta"%fname, "fasta")
	print(count)
	print("There are  %i records have been Converted!" %(count))

	print("Unique fasta file...")
	unique_fasta(prj_folder, project)

	print("Split large file to small...")
	os.chdir("%s/1.3-splited-fasta-file/"%(prj_folder))
	# `with` closes the de-duplicated input and every batch file even if
	# writing a batch fails.
	with open("%s_unique.fasta"%fname) as unique_handle:
		record_iter = SeqIO.parse(unique_handle, "fasta")
		for i, batch in enumerate(batch_iterator(record_iter, 10000)):
			filename = "%s-%i.fasta" % (project, i+1)
			with open(filename, "w") as handle:
				count = SeqIO.write(batch, handle, "fasta")
			print("Wrote %i records to %s" % (count, filename))
	print("Begin IgBLAST...")
	os.chdir("%s/1.4-IgBLAST-output/"%(prj_folder))
	# Still creates/truncates this file, but no longer leaks the handle.
	# NOTE(review): the file name ends with a space and nothing visible
	# writes to it -- confirm it is needed.
	open("%s-igblast-output.txt "%(project),"w").close()
	subprocess.call("cp -r /zzh_gpfs/home/zzhgroup/yanmingchen/IgBLAST_database/ ./",shell = True)
	IGBLAST_infiles = glob.glob("%s/1.3-splited-fasta-file/*.fasta"%(prj_folder))
	for index, the_file in enumerate(IGBLAST_infiles):
		print("%s %s" % (index, the_file))
		# The trailing '&' backgrounds each job, so call() does not wait.
		subprocess.call("igblastn -germline_db_V ./IgBLAST_database/20150429-human-gl-v -germline_db_J ./IgBLAST_database/20150429-human-gl-j -germline_db_D ./IgBLAST_database/20150429-human-gl-d -organism human -domain_system imgt -query %s -auxiliary_data optional_file/human_gl.aux -outfmt '7 qseqid sseqid pident length mismatch gapopen gaps qstart qend sstart send evalue bitscore qlen slen qseq sseq score frames qframe sframe positive ppos btop staxids stitle sstrand qcovs qcovhsp' -num_alignments_V 10 -num_alignments_D 10 -num_alignments_J 10 -out IgBLAST_result_%i &"%(the_file,index+1),shell=True)
Ejemplo n.º 49
0
def main():
    """Command-line entry point: convert a GenBank file to EMBL.

    Parses ``--genbank`` and ``--embl``, creates the output directory when
    missing, runs ``SeqIO.convert`` and prints the number of records
    converted.  Exits immediately under Python 2.
    """
    if sys.version_info[0] < 3:
        sys.exit(
            'Must be using Python 3. Try calling "python3 genbank_2_embl.py"')

    parser = argparse.ArgumentParser(
        prog='genbank_2_embl.py',
        description='Converts a genbank file into a embl file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version',
                        help='Version information',
                        action='version',
                        version=str('%(prog)s v' + version))

    parser_required = parser.add_argument_group('Required options')
    parser_required.add_argument('-g',
                                 '--genbank',
                                 nargs=1,
                                 type=argparse.FileType('r'),
                                 required=True,
                                 metavar='/path/to/genbank/file.gb',
                                 help='Path to the genbank file')
    parser_required.add_argument('-e',
                                 '--embl',
                                 nargs=1,
                                 type=str,
                                 metavar='/path/to/output/embl/file.embl',
                                 help='Path to the output embl file',
                                 required=True)

    args = parser.parse_args()

    # FileType('r') opened the input while parsing; only its name is used
    # below, so close the handle instead of leaking it.
    genbank_handle = args.genbank[0]
    args.genbank = os.path.abspath(genbank_handle.name)
    if genbank_handle is not sys.stdin:
        genbank_handle.close()
    args.embl = os.path.abspath(args.embl[0])

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(os.path.dirname(args.embl), exist_ok=True)

    count = SeqIO.convert(args.genbank, 'genbank', args.embl, 'embl')
    print('Converted {} records'.format(count))
Ejemplo n.º 50
0
def readwrite_fasta(infilename, outfilename):
    """Convert a (possibly compressed) Sanger FASTQ file to FASTA.

    The module-level ``compression_type`` selects how ``infilename`` is
    opened: gzip, bzip2, a zip archive with exactly one member, or a plain
    file for any other value.

    Args:
        infilename: path of the FASTQ input.
        outfilename: path of the FASTA file to write.

    Returns:
        The record count from ``SeqIO.convert``, or ``None`` after any
        error (the error is printed, not raised).
    """
    infile = outfile = myzipfile = None
    try:
        if compression_type in ["gzip", "gz"]:
            infile = gzip.open(infilename, 'r')
        elif compression_type in ["bzip2", "bz2"]:
            infile = bz2.BZ2File(infilename, 'r')
        elif compression_type == "zip":
            myzipfile = zipfile.ZipFile(infilename, 'r')
            if len(myzipfile.namelist()) > 1:
                raise IOError("TOO MANY FILES IN ZIPFILE")
            infile = myzipfile.open(myzipfile.namelist()[0])
        else:
            infile = open(infilename, 'r')
        # NOTE(review): on Python 3 the compressed handles above yield bytes
        # while the FASTQ parser presumably expects text -- confirm before
        # relying on compressed input there.
        outfile = open(outfilename, "w")
        outcounter = SeqIO.convert(infile, "fastq-sanger", outfile, "fasta")
        outfile.close()
        infile.close()
        return outcounter
    except Exception as ex:
        # Deliberate best-effort: report the failure and signal it with None.
        print(ex.__class__.__name__ + " : " + str(ex))
        return None
    finally:
        # Closing twice is harmless; this matters on the error path, where
        # the handles used to be leaked.
        for handle in (outfile, infile, myzipfile):
            if handle is not None:
                handle.close()
Ejemplo n.º 51
0
def convert_format(in_file,
                   out_file,
                   in_format=None,
                   out_format=None,
                   data_type='dna',
                   ambiguities=True):
    """Convert a sequence file between formats with ``SeqIO.convert``.

    Args:
        in_file: input file, passed through to ``SeqIO.convert``.
        out_file: output file, passed through to ``SeqIO.convert``.
        in_format: input format name; looked up from ``in_file`` via
            ``FILE_FORMATS`` when ``None``.
        out_format: output format name; looked up from ``out_file`` via
            ``FILE_FORMATS`` when ``None``.
        data_type: forwarded to ``get_state_alphabet``.
        ambiguities: forwarded to ``get_state_alphabet``.

    Returns:
        The value returned by ``SeqIO.convert``.
    """
    # Identity test: `== None` can be hijacked by a custom __eq__.
    if in_format is None:
        in_format = FILE_FORMATS.get_format_from_file_object(in_file)
    if out_format is None:
        out_format = FILE_FORMATS.get_format_from_file_object(out_file)
    _LOG.debug("converting {in_format}-formatted file {in_file!r} to "
               "{out_format}-formatted file {out_file!r}.".format(
                   in_file=in_file,
                   in_format=in_format,
                   out_file=out_file,
                   out_format=out_format))
    nseqs = SeqIO.convert(in_file=in_file,
                          in_format=in_format,
                          out_file=out_file,
                          out_format=out_format,
                          alphabet=get_state_alphabet(data_type, ambiguities))
    return nseqs
    def test_qual_negative(self):
        """Check QUAL negative scores mapped to PHRED zero."""
        # Four QUAL records in which many of the scores are -1.
        data = """>1117_10_107_F3
23 31 -1 -1 -1 29 -1 -1 20 32 -1 18 25 7 -1 6 -1 -1 -1 30 -1 20 13 7 -1 -1 21 30 -1 24 -1 22 -1 -1 22 14 -1 12 26 21 -1 5 -1 -1 -1 20 -1 -1 12 28
>1117_10_146_F3
20 33 -1 -1 -1 29 -1 -1 28 28 -1 7 16 5 -1 30 -1 -1 -1 14 -1 4 13 4 -1 -1 11 13 -1 5 -1 7 -1 -1 10 16 -1 4 12 15 -1 8 -1 -1 -1 16 -1 -1 10 4
>1117_10_1017_F3
33 33 -1 -1 -1 27 -1 -1 17 16 -1 28 24 11 -1 6 -1 -1 -1 29 -1 8 29 24 -1 -1 8 8 -1 20 -1 13 -1 -1 8 13 -1 28 10 24 -1 10 -1 -1 -1 4 -1 -1 7 6
>1117_11_136_F3
16 22 -1 -1 -1 33 -1 -1 30 27 -1 27 28 32 -1 29 -1 -1 -1 27 -1 18 9 6 -1 -1 23 16 -1 26 -1 5 7 -1 22 7 -1 18 14 8 -1 8 -1 -1 -1 11 -1 -1 4 24"""  # noqa : W291
        h = StringIO(data)
        h2 = StringIO()
        # Parser warnings are silenced during the conversion (presumably
        # they are raised for the negative scores -- TODO confirm).
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", BiopythonParserWarning)
            # convert() must report all four records.
            self.assertEqual(4, SeqIO.convert(h, "qual", h2, "fastq"))
        # Expected FASTQ: every base is '?', and each -1 score appears as
        # '!' in the quality line.
        self.assertEqual(
            h2.getvalue(),
            """\
@1117_10_107_F3
??????????????????????????????????????????????????
+
8@!!!>!!5A!3:(!'!!!?!5.(!!6?!9!7!!7/!-;6!&!!!5!!-=
@1117_10_146_F3
??????????????????????????????????????????????????
+
5B!!!>!!==!(1&!?!!!/!%.%!!,.!&!(!!+1!%-0!)!!!1!!+%
@1117_10_1017_F3
??????????????????????????????????????????????????
+
BB!!!<!!21!=9,!'!!!>!)>9!!))!5!.!!).!=+9!+!!!%!!('
@1117_11_136_F3
??????????????????????????????????????????????????
+
17!!!B!!?<!<=A!>!!!<!3*'!!81!;!&(!7(!3/)!)!!!,!!%9
""",
        )
Ejemplo n.º 53
0
                print >>out_handle, gtf_gene
                gtf_tx = '%s\t%s\t%s\t%s\t%s\t.\t%s\t.\t%s;' % ( acc, source, 'transcript', f.location.start.position+1, f.location.end.position, strand, comments )
                print >>out_handle, gtf_tx
                comments += '; exon_number "1"'
                gtf_exon = '%s\t%s\t%s\t%s\t%s\t.\t%s\t.\t%s;' % ( acc, source, 'exon', f.location.start.position+1, f.location.end.position, strand, comments )
                print >>out_handle, gtf_exon

            sys.stderr.write( "%s\tSkipped %s entries having types: %s.\n" % ( gb.id,skipped, ', '.join(skippedTypes) ) )

if __name__ == '__main__':
    # Script entry point: write <prefix>.fa (headers cut to their first
    # word) and <prefix>.gtf from one GenBank file, then report run time.
    description = "Convert GeneBank files to FASTA and GTF bcbio ready files."

    parser = ArgumentParser(description=description)
    for option, help_text in (("--gbk", "GBK files"), ("--prefix", "prefix")):
        parser.add_argument(option, required=True, help=help_text)
    args = parser.parse_args()

    t0 = datetime.now()

    # Convert to a temporary FASTA first, then copy it line by line.
    tmp_fasta = args.prefix + "_tmp.fa"
    count = SeqIO.convert(args.gbk, "genbank", tmp_fasta, "fasta")
    with open(args.prefix + ".fa", "w") as out_handle, \
            open(tmp_fasta) as in_handle:
        header = next(in_handle)
        out_handle.write(header.split()[0] + "\n")
        for line in in_handle:
            out_handle.write(line.strip() + "\n")

    gb2gtf(args.gbk, args.prefix + ".gtf")
    dt = datetime.now() - t0
    sys.stderr.write("#Time elapsed: %s\n" % dt)
Ejemplo n.º 54
0
#!/usr/bin/env python

from Bio import SeqIO
import sys

# Usage: <script> <input.fastq> <output.fasta>
fastq_path, fasta_path = sys.argv[1], sys.argv[2]
SeqIO.convert(fastq_path, "fastq", fasta_path, "fasta")
Ejemplo n.º 55
0
def converter(fq):
    """Write a FASTA copy of FASTQ file ``fq`` beside it, with extension ``.fa``."""
    stem, _extension = os.path.splitext(fq)
    SeqIO.convert(fq, 'fastq', stem + '.fa', 'fasta')
Ejemplo n.º 56
0
          len(genbankdf[genbankdf.locus_tag == 'none']))
    genbankdf = genbankdf[genbankdf.locus_tag != "none"]

genbankdf.reset_index(level=0, inplace=True)
# Fields are split on pipes "|" later on, so replace any pipe found in
# db_xref, locus_tag and old_locus_tag with an underscore.
# Each column is handled on its own: a missing column is skipped without
# skipping the others.  The previous `except KeyError` never caught the
# AttributeError raised by attribute access on a missing column.
# regex=False makes "|" a literal character on every pandas version.
for _column in ("db_xref", "locus_tag", "old_locus_tag"):
    if _column in genbankdf.columns:
        genbankdf[_column] = genbankdf[_column].str.replace(
            "|", "_", regex=False)
#print(genbankdf.loc[genbankdf['locus_tag'] == 'QV15_00005'])   ### just a test
#%%#%%#  make a fasta for genomic sequence(s), clean up header in resulting file
# `with` closes the output even if the conversion raises.
with open(os.path.join(subdirname, gb + "_genomic.fasta"),
          "w") as output_genfasta_handle:
    SeqIO.convert(input_handle.name, "genbank", output_genfasta_handle,
                  "fasta")
#%%

if rename_fa:
    print("After running this script, run the following output as a command" +
          ", starting with 'awk' and ending with _renamed.fasta:")
    import string
    input_handle = open(input_genome, "r")  # input database
    names = []
    for record in SeqIO.parse(input_handle, "genbank"):
        print(record.id)
        names.append(record.id)
    if len(names) > 1:
        starts_at = names[0][-1]
        # removes number, period, and version number
Ejemplo n.º 57
0
    while_condition = False
  else:
    qwindow_average = int(qwindow_average)
  if qwindow_average > 40 or qwindow_average < 0:
    print 'You entered a wrong value.'
    print 'Try again!'
  else:
    while_condition = False

# SFF sequence extraction: SFF input is first converted to FASTQ; FASTQ
# input is used as it is.  print() with one argument behaves the same on
# Python 2 and Python 3; the old print statements were Python 2 only.
if file_extension == 'sff':  # extract the sequences from the sff file
  fastq_file = sample_name + '.fastq'
  stars()
  print('The sff file is now being converted into a fastq file.')
  print('This could take a while...')
  sff_seq_count = SeqIO.convert(raw_file_name, "sff-trim", fastq_file, "fastq")
  print('%i sequences have been converted to fastq format.' % sff_seq_count)
  print('%s file has been created' % fastq_file)
  stars()

if file_extension == 'fastq':
  fastq_file = raw_file_name

#*** DNA sequence quality trimming ***
stars()
print('Now we trim the sequences using the desired quality settings.')
print('Depending on the settings, this could take a while... (10-20 minutes)')
print('Please be patient!')
stars()
trim_fasta_file_name = sample_name + '.trim.fasta'
good_reads = []
Ejemplo n.º 58
0
from Bio import SeqIO

# Convert the FASTQ records in rosalind_tfsq.txt to FASTA in output.txt.
with open('rosalind_tfsq.txt') as input_data:
    with open('output.txt', 'w') as output_data:
        SeqIO.convert(input_data, 'fastq', output_data, 'fasta')
Ejemplo n.º 59
0
 def _method_biopython(self, *args, **kwargs):
     """Convert ``self.infile`` (GenBank) to ``self.outfile`` (FASTA).

     Extra positional and keyword arguments are accepted but not used.
     """
     from Bio import SeqIO

     source, target = self.infile, self.outfile
     SeqIO.convert(source, "genbank", target, "fasta")
Ejemplo n.º 60
0
                    help='''Generate qual file.''')

parser.add_argument(
    '--out',
    '-o',
    type=str,
    default=False,
    help=
    '''Output prefix. Default: same as fastq. Use "stdout" or "-" to print to screen.'''
)

args = parser.parse_args()

# Validate with the parser rather than `assert`, which is stripped under
# `python -O` and would let a run with no output format through.
if not (args.fa or args.qual):
    parser.error("at least one output format (FASTA or QUAL) must be requested")

if not args.out:
    args.out = args.fastq.split("/")[-1].split(".")[0]

if args.fastq in ('', '-', 'stdin'):
    args.fastq = sys.stdin

# Give each format its own target.  A single shared `out` sent the QUAL
# records to the ".fasta" path, overwriting the FASTA, when both formats
# were requested.
to_stdout = args.out in ("stdout", "-")
fasta_out = sys.stdout if to_stdout else args.out + ".fasta"
qual_out = sys.stdout if to_stdout else args.out + ".qual"
# Kept with its previous value in case later code reads `out`.
out = fasta_out if args.fa else qual_out

# NOTE(review): with stdin input and both formats requested, the second
# conversion reads an already consumed stream -- confirm whether that
# combination needs to be supported.
if args.fa:
    SeqIO.convert(args.fastq, "fastq", fasta_out, "fasta")
if args.qual:
    SeqIO.convert(args.fastq, "fastq", qual_out, "qual")