Example #1
 def test_using_stdin(self):
     """Simple alignment using stdin"""
     input_file = "Fasta/f002"
     self.assertTrue(os.path.isfile(input_file))
     records = list(SeqIO.parse(input_file,"fasta"))
     #Prepare the command... use Clustal output (with a MUSCLE header)
     cline = MuscleCommandline(muscle_exe, clw=True)
     self.assertEqual(str(cline).rstrip(),
                      _escape_filename(muscle_exe) + " -clw")
     self.assertEqual(str(eval(repr(cline))), str(cline))
     child = subprocess.Popen(str(cline),
                              stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              universal_newlines=True,
                              shell=(sys.platform!="win32"))
     SeqIO.write(records, child.stdin, "fasta")
     child.stdin.close()
     #Alignment will now run...
     align = AlignIO.read(child.stdout, "clustal")
     align.sort()
     records.sort(key = lambda rec: rec.id)
     self.assertEqual(len(records),len(align))
     for old, new in zip(records, align):
         self.assertEqual(old.id, new.id)
         self.assertEqual(str(new.seq).replace("-",""), str(old.seq))
     self.assertEqual(0, child.wait())
     child.stdout.close()
     child.stderr.close()
     del child
Example #2
 def loop(self, filename, format):
     original_records = list(SeqIO.parse(filename, format))
     # now open a connection to load the database
     server = BioSeqDatabase.open_database(driver = DBDRIVER,
                                           user = DBUSER, passwd = DBPASSWD,
                                           host = DBHOST, db = TESTDB)
     db_name = "test_loop_%s" % filename  # new namespace!
     db = server.new_database(db_name)
     count = db.load(original_records)
     self.assertEqual(count, len(original_records))
     server.commit()
     #Now read them back...
     biosql_records = [db.lookup(name=rec.name)
                       for rec in original_records]
     #And check they agree
     self.assertTrue(compare_records(original_records, biosql_records))
     #Now write to a handle...
     handle = StringIO()
     SeqIO.write(biosql_records, handle, "gb")
     #Now read them back...
     handle.seek(0)
     new_records = list(SeqIO.parse(handle, "gb"))
     #And check they still agree
     self.assertEqual(len(new_records), len(original_records))
     for old, new in zip(original_records, new_records):
         #TODO - remove this hack because we don't yet write these (yet):
         for key in ["comment", "references", "db_source"]:
             if key in old.annotations and key not in new.annotations:
                 del old.annotations[key]
         self.assertTrue(compare_record(old, new))
     #Done
     server.close()
Example #3
 def test_fasta_out(self):
     """Check FASTQ to FASTA output"""
     records = SeqIO.parse("Quality/example.fastq", "fastq")
     h = StringIO()
     SeqIO.write(records, h, "fasta")
     with open("Quality/example.fasta") as expected:
         self.assertEqual(h.getvalue(), expected.read())
Example #4
def write_fasta(filename, data):
    fd = open(filename, "w")
    seq_list = []
    for i in data.keys():
        seq_list.append(SeqRecord(Seq(data.get(i)), id=i, description=""))
    SeqIO.write(seq_list, fd, "fasta")
    fd.close()
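A hedged usage sketch for write_fasta above; the filename and sequences are hypothetical placeholders, and each dictionary key becomes a FASTA record ID.

# Hypothetical usage: keys become record IDs, values the sequences.
write_fasta("toy.fasta", {"seq1": "ACGTACGT", "seq2": "GGCCTTAA"})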
Example #5
 def test_generated(self):
     """Write and read back odd SeqRecord objects"""
     record1 = SeqRecord(Seq("ACGT"*500, generic_dna),  id="Test", description="Long "*500,
                        letter_annotations={"phred_quality":[40,30,20,10]*500})
     record2 = SeqRecord(MutableSeq("NGGC"*1000),  id="Mut", description="very "*1000+"long",
                        letter_annotations={"phred_quality":[0,5,5,10]*1000})
     record3 = SeqRecord(UnknownSeq(2000,character="N"),  id="Unk", description="l"+("o"*1000)+"ng",
                        letter_annotations={"phred_quality":[0,1]*1000})
     record4 = SeqRecord(Seq("ACGT"*500),  id="no_descr", description="", name="",
                        letter_annotations={"phred_quality":[40,50,60,62]*500})
     record5 = SeqRecord(Seq("",generic_dna),  id="empty_p", description="(could have been trimmed lots)",
                        letter_annotations={"phred_quality":[]})
     record6 = SeqRecord(Seq(""),  id="empty_s", description="(could have been trimmed lots)",
                        letter_annotations={"solexa_quality":[]})
     record7 = SeqRecord(Seq("ACNN"*500),  id="Test_Sol", description="Long "*500,
                        letter_annotations={"solexa_quality":[40,30,0,-5]*500})
     record8 = SeqRecord(Seq("ACGT"),  id="HighQual", description="With very large qualities that even Sanger FASTQ can't hold!",
                        letter_annotations={"solexa_quality":[0,10,100,1000]})
     #TODO - Record with no identifier?
     records = [record1, record2, record3, record4, record5, record6, record7, record8]
     #TODO - Have a Biopython defined "DataLossWarning?"
     warnings.simplefilter('ignore', BiopythonWarning)
     #TODO - Include phd output?
     for format in ["fasta", "fastq", "fastq-solexa", "fastq-illumina", "qual"]:
         handle = StringIO()
         SeqIO.write(records, handle, format)
         handle.seek(0)
         compare_records(records,
                         list(SeqIO.parse(handle, format)),
                         truncation_expected(format))
     warnings.filters.pop()
Example #6
def blastclust_to_fasta(infname, seqfname, outdir):
    """Converts input BLASTCLUST output list to a subdirectory of FASTA files.


    Each individual FASTA file contains all sequences from a single cluster.
    The sequences matching the IDs listed in the BLASTCLUST output .lst file 
    should all be found in the same file.

    Returns the output directory and a list of the files, as a tuple.
    """
    outdirname = os.path.join(outdir, "blastclust_OTUs")
    if not os.path.exists(outdirname):
        os.makedirs(outdirname)
    seqdict = SeqIO.index(seqfname, 'fasta')
    outfnames = []
    with open(infname, 'r') as fh:
        otu_id = 0
        for line in fh:
            otu_id += 1
            outfname = os.path.join(outdirname,
                                    "blastclust_OTU_%06d.fasta" % otu_id)
            SeqIO.write((seqdict[key] for key in line.split()),
                        outfname, 'fasta')
            outfnames.append(outfname)
    return (outdirname, outfnames)
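A hedged usage sketch for blastclust_to_fasta; the file and directory names are hypothetical placeholders.

# Hypothetical usage: a BLASTCLUST .lst file plus the FASTA it was built from.
otu_dir, otu_files = blastclust_to_fasta("clusters.lst", "all_seqs.fasta", "out")
print("Wrote %d cluster FASTA files to %s" % (len(otu_files), otu_dir))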
Example #7
def run_pal2nal(fname_aln, fname_nuc, fname_prot):
    """
    Generate a codon alignment via PAL2NAL.

    @param fname_aln:
        MSA of protein sequences in CLUSTAL format (.aln)
    @param fname_nuc:
        Nucleotide sequences in FASTA format (.fasta)
    @param fname_prot:
        Protein sequences in FASTA format (.fasta)
    @return:
        Codon alignment in CLUSTAL format (.aln), suitable for codeml
    1"""
    sys.stderr.write("\nSTEP: run_pal2nal(%s, %s)\n" % (fname_aln, fname_nuc))

    # Reorder fname_nuc according to the order of the proteins in fname_aln, which
    # was reordered due to CLUSTALW2.  Note that the first protein in each of
    # these files remains the same as at the start, however; this first protein
    # is our original query protein.
    nuc_records = list(SeqIO.parse(fname_nuc, "fasta"))
    prot_records = list(SeqIO.parse(fname_prot, "fasta"))
    records_map = dict((pr.id, nr) for pr, nr in zip(prot_records, nuc_records))
    fname_nuc2 = "homologs_ordered.dna.fasta"
    with open(fname_nuc2, "w") as f:
        for record in SeqIO.parse(fname_aln, "clustal"):
            SeqIO.write(records_map[record.id], f, "fasta")
    fname_codon = "homologs.codon.aln"
    # TODO: use subprocess
    os.system("%s/pal2nal.pl %s %s -output paml > %s" % (bin_dir(), fname_aln, fname_nuc2, fname_codon))
    return fname_codon
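The TODO above asks for subprocess; a minimal shell-free sketch of the same pal2nal.pl call, assuming the bin_dir() helper from this example.

import subprocess

def run_pal2nal_cmd(fname_aln, fname_nuc2, fname_codon):
    # Same invocation as the os.system() line above, without a shell.
    with open(fname_codon, "w") as out:
        subprocess.run(["%s/pal2nal.pl" % bin_dir(), fname_aln, fname_nuc2,
                        "-output", "paml"], stdout=out, check=True)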
Example #8
def CutOutDomain(coords, filename, header=False, column_id=0, column_start=8, column_stop=9):
    """Cut domains out of the sequences in `filename`, using the
    coordinates listed per record ID in the CSV file `coords`."""
    from Bio import SeqIO
    fh = open(coords)
    seqfile = open(filename)
    Towrite = []
    CoordIDDic = {}
    if header:
        print('header set to True, first line of %s will be ignored' % coords)
        fh.readline()  # skip the header line
    else:
        print('header not set to True, first line of %s will be processed' % coords)

    for unformatedLine in fh:
        l = unformatedLine.replace('\xa0', '').strip().split(',')
        if l[column_id] not in CoordIDDic:
            CoordIDDic[l[column_id]] = l[column_start], l[column_stop]
    for s in SeqIO.parse(seqfile, 'fasta'):
        if s.id in CoordIDDic:
            start = int(CoordIDDic.get(s.id)[0]) - 1
            stop = int(CoordIDDic.get(s.id)[1])
            s.id = s.id + '_%s_%s' % (start + 1, stop)
            Towrite.append(s[start:stop])
    Output = open('CutOutdomain_%s' % filename, 'w')
    SeqIO.write(Towrite, Output, 'fasta')
    Output.close()
Example #9
def needle_score(seq1, seq2, verbose=False, keep=False):
    """
    get needlman-wunsch score for aligning two sequences
    """
    ntf = tempfile.NamedTemporaryFile
    with ntf(prefix='seq1', delete = not keep) as fh1, \
         ntf(prefix='seq2', delete = not keep) as fh2, \
         ntf(prefix='align_out') as outfile, \
         open(os.devnull) as dn:
        SeqIO.write(seq1, fh1, 'fasta')
        fh1.flush()
        SeqIO.write(seq2, fh2, 'fasta')
        fh2.flush()

        cmd = ['needle', '-gapopen', '0',
               '-gapextend', '0',
               '-outfile',  outfile.name,
               fh1.name, fh2.name]
        if verbose:
            print(' '.join(cmd))
        subprocess.check_call(cmd, stderr=dn)
        result = outfile.read()
        pattern = re.compile(r'# Score: (.*)')
        score = pattern.search(result)
        if score is not None:
            return float(score.group(1))
        return 0
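A hedged usage sketch for needle_score, assuming the EMBOSS needle binary is on the PATH; the two records are toy sequences.

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

a = SeqRecord(Seq("ACGTACGTACGT"), id="a")
b = SeqRecord(Seq("ACGTTCGTACGT"), id="b")
print(needle_score(a, b))  # prints the '# Score:' value parsed from the report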
Example #10
def frameshift_writer(contigs, file):
    sys.stderr.write("[predict] writing frameshifts...")
    seqs = [SeqRecord(seq=c.seq, id=c.id, description=c.description) for c in contigs.values()
            if c.annotation['majority_frameshift']]
    SeqIO.write(seqs, file, "fasta")
    file.close()
    sys.stderr.write("\tdone.\n")
Example #11
def no_relatives_writer(contigs, file):
    sys.stderr.write("[predict] writing contigs with no relatives...")
    seqs = [SeqRecord(seq=c.seq, id=c.id, description=c.description) for c in contigs.values() if
            c.annotation['num_relatives'] == 0]
    SeqIO.write(seqs, file, "fasta")
    file.close()
    sys.stderr.write("\tdone.\n")
Example #12
def gather_est2genome_seqs(refseq_obj, est2genome_handle, log_line, velvet_file):
	seq_dir = log_line.split("\t")[1]
	tmp_refseq = seq_dir.split("/")[3].replace(".","%2E")#hardcoded in this position
	gff_file = refseq_obj.id + ".velvet_contigs.maker.output/" + seq_dir + "/" + tmp_refseq + ".gff"
	gff_handle = open(gff_file,'r')
	for gff_line in gff_handle:
		if(re.search("est2gneome",gff_line) and \
		re.search("\texpressed_sequence_match\t",gff_line)):
			curr_start = int(gff_line.split("\t")[3])
			curr_stop = int(gff_line.split("\t")[4])
			curr_strand = gff_line.split("\t")[6]
			
			tmp_handle = open(velvet_file,'r')
			tmp_fasta = SeqIO.to_dict(SeqIO.parse(tmp_handle,"fasta"))
			tmp_handle.close()
		
			if seq_dir.split("/")[3] in tmp_fasta:
				curr_record = tmp_fasta[seq_dir.split("/")[3]]
			else:
				continue
			new_seq = curr_record.seq[curr_start - 1:curr_stop]
			if(curr_strand == "-"):
				new_seq = curr_record.seq[curr_start - 1:curr_stop].reverse_complement()
			seqname = curr_record.id  # record name; left undefined in the original
			new_record = SeqRecord(new_seq, id=seqname, name=seqname, description="")
				
			SeqIO.write(new_record, est2genome_handle, "fasta")
Example #13
    def __format__(self, format_spec):
        """Returns the record as a string in the specified file format.

        This method supports the python format() function added in
        Python 2.6/3.0.  The format_spec should be a lower case string
        supported by Bio.SeqIO as an output file format. See also the
        SeqRecord's format() method.
        """
        if not format_spec:
            #Follow python convention and default to using __str__
            return str(self)    
        from Bio import SeqIO
        if format_spec in SeqIO._BinaryFormats:
            #Return bytes on Python 3
            try:
                #This is in Python 2.6+, but we need it on Python 3
                from io import BytesIO
                handle = BytesIO()
            except ImportError:
                #Must be on Python 2.5 or older
                from StringIO import StringIO
                handle = StringIO()
        else:
            from StringIO import StringIO
            handle = StringIO()
        SeqIO.write(self, handle, format_spec)
        return handle.getvalue()
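A short usage sketch of the __format__ hook above; the record is a toy example.

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

record = SeqRecord(Seq("ACGT"), id="demo", description="toy record")
# format() dispatches to __format__, so this returns a FASTA-formatted string.
print(format(record, "fasta"))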
Example #14
def splitFastaFile(infile, informat, outdir):
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    for record in SeqIO.parse(infile, informat):
        f_out = os.path.join(outdir, record.id + '.fasta')
        with open(f_out, 'w') as handle:
            SeqIO.write([record], handle, "fasta")
Example #15
def check_convert_fails(in_filename, in_format, out_format, alphabet=None):
    qual_truncate = truncation_expected(out_format)
    #We want the SAME error message from parse/write as convert!
    err1 = None
    try:
        records = list(SeqIO.parse(in_filename,in_format, alphabet))
        handle = StringIO()
        if qual_truncate:
            warnings.simplefilter('ignore', UserWarning)
        SeqIO.write(records, handle, out_format)
        if qual_truncate:
            warnings.filters.pop()
        handle.seek(0)
        assert False, "Parse or write should have failed!"
    except ValueError as err:
        err1 = err
    #Now do the conversion...
    try:
        handle2 = StringIO()
        if qual_truncate:
            warnings.simplefilter('ignore', UserWarning)
        SeqIO.convert(in_filename, in_format, handle2, out_format, alphabet)
        if qual_truncate:
            warnings.filters.pop()
        assert False, "Convert should have failed!"
    except ValueError as err2:
        assert str(err1) == str(err2), \
               "Different failures, parse/write:\n%s\nconvert:\n%s" \
               % (err1, err2)
Example #16
def getFasta(sequences, header):
    with open('query.fa', 'w') as fasta:
        with open(sequences) as input_handle:
            SeqRecords = SeqIO.parse(input_handle, 'fasta')
            for rec in SeqRecords:
                if rec.id == header:
                    SeqIO.write(rec, fasta, 'fasta')
Example #17
 def compress(self,filename,cd,pos):
     filename.compdeep=cd-1
     filename.comptype=pos[0:len(pos)-1]
     if filename.ext=='.gb':
         rec=SeqIO.read(filename.get_name(),"genbank")
         ln=len(rec.seq)
     else:
         rec=SeqIO.read(filename.get_name(),"fasta")
         ln=len(rec.seq)
     filename.compdeep=cd
     filename.comptype=pos
     numpos=int(pos[len(pos)-1])
     compstep=self.compopt['compstep']
     resseq=Seq('',rec.seq.alphabet)
     res=open(filename.get_name(),'w')
     oligolist=[]
     self.complete_oligolist(oligolist,'',compstep)
      for i in range(0,ln-ln%compstep,compstep):
         if str(rec.seq[i:i+compstep]).lower() in oligolist:
             resseq+=rec.seq[i:i+compstep][numpos]
     rec.seq=resseq
     if filename.ext=='.gb':
         SeqIO.write(rec,res,"genbank")
     else:
         SeqIO.write(rec,res,"fasta")
     res.close()
     return resseq
Example #18
def gbk_to_fasta(genbank, fasta):
    '''
    Converts a genbank to a fasta using BioPython
    '''

    sequences = SeqIO.parse(genbank, "genbank")
    SeqIO.write(sequences, fasta, "fasta")
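For a simple format conversion like this, Biopython also offers SeqIO.convert, which returns the number of records converted; a hedged sketch with placeholder file names.

from Bio import SeqIO

count = SeqIO.convert("input.gbk", "genbank", "output.fasta", "fasta")
print("Converted %d records" % count)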
Example #19
def output(target, option):
    if option == '1':
        for record in target:
            gene = record[0].id.split(sep='|')[-1]
            output_file = ''.join([
                'output/',
                sys.argv[2], 
                '-',
                gene, 
                '.fasta'
            ])
            rename_seq = SeqRecord(
                seq=record[1].seq, 
                id='|'.join([
                    gene,
                    sys.argv[1],
                    record[1].id
                ]),
                description=''
            )
            SeqIO.write(rename_seq, output_file, 'fasta')
    else:
        output_file = open('output/' + sys.argv[1] + '-filtered.fasta', 'w' )
        contig_id = {i[0].id for i in target} 
        query_file = SeqIO.parse(sys.argv[1], 'fasta')
        for record in query_file:
            if record.id in contig_id:
                SeqIO.write(record, output_file, 'fasta')
Example #20
def write_selected_pfam_genes(options, annot_genes_all):
    '''
    For each Protein Family write a second multiple alignment file with just the
    annotated genes.
    '''
    global q 
    while 1:
        try:
            pf = q.get(block=True, timeout=0.1)
        except Empty: 
            break
        else:
            if options.type == "pf":
                shutil.copy(options.dbdir+"align/"+pf.upper()+".fasta", 
                            options.outdir+pf+"/"+pf.upper()+".fasta")
                q.task_done()
            else:
                handle = open(options.dbdir+"align/"+pf.upper()+".fasta","r")
                handle_out = open(options.outdir+pf+"/"+pf.upper()+".fasta", "w")
                for nuc_rec in SeqIO.parse(handle, "fasta"):
                    if nuc_rec.id[0:nuc_rec.id.find("/")] in annot_genes_all:
                        SeqIO.write(SeqRecord(seq = nuc_rec.seq, id = nuc_rec.id, 
                            description = ""), handle_out, "fasta")
                handle_out.close()
                handle.close()
                q.task_done()
Example #21
def check_seq_between(gb, insertion, start, end, name, temp):
    '''
    Check the sequence between two ends to see
    if it matches the IS query or not, and what
    the coverage and %ID to the query.
    '''

    genbank = SeqIO.read(gb, 'genbank')
    # Get sequence between left and right ends
    seq_between = genbank.seq[start:end]
    # Turn the sequence into a fasta file
    seq_between = SeqRecord(Seq(str(seq_between), generic_dna), id=name)
    SeqIO.write(seq_between, temp + name + '.fasta', 'fasta')
    # Perform the BLAST
    doBlast(temp + name + '.fasta', temp + name + '_out.txt', insertion)
    # Only want the top hit, so set count variable to 0
    first_result = 0
    # Open the BLAST output file 
    with open(temp + name + '_out.txt') as summary:
        for line in summary:
            # Get coverage and % ID for top hit
            if first_result == 0:
                info = line.strip().split('\t')
                coverage = float(info[4]) / float(info[5]) * 100
                hit = [info[3], coverage]
                first_result += 1
            return hit
    # If there is no hit, just return an empty list
    return []
Example #22
    def save_seqs_to_file(self):
        """Query sequences for each gene from database and save to local disk.

        Sets attribute `self.seq_file` containing necessary sequences from our
        database.

        """
        if self.blast_type == 'new':
            self.seq_file = os.path.join(self.cwd,
                                         'db',
                                         '_'.join(self.gene_codes) + "_seqs.fas",
                                         )
            if self.gene_codes:
                # Taken from http://stackoverflow.com/a/1239602
                Qr = None
                for gene_code in self.gene_codes:
                    q = Q(gene_code=gene_code)
                    if Qr:
                        Qr = Qr | q
                    else:
                        Qr = q
                queryset = Sequences.objects.filter(Qr)
            else:
                queryset = Sequences.objects.all()

            my_records = []
            for i in queryset:
                item_id = i.code_id + '|' + i.gene_code
                seq = self.strip_question_marks(i.sequences)
                if seq != '':
                    seq_record = SeqRecord(Seq(seq), id=item_id)
                    my_records.append(seq_record)
            SeqIO.write(my_records, self.seq_file, "fasta")
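The Qr accumulation above can be written more compactly; a sketch assuming Django's Q as used in this example and a non-empty gene_codes list (the helper name is hypothetical).

import operator
from functools import reduce

from django.db.models import Q

def build_gene_filter(gene_codes):
    # Collapses [Q(gene_code=a), Q(gene_code=b), ...] into Q(a) | Q(b) | ...
    return reduce(operator.or_, (Q(gene_code=gc) for gc in gene_codes))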
Example #23
def as_refpkg(sequences, name='temp.refpkg', threads=FASTTREE_THREADS):
    """Context manager yielding a temporary reference package for a
    collection of aligned sequences.

    Builds a tree with FastTree, creates a reference package, yields.

    """
    sequences = list(sequences)
    with ntf(prefix='fasttree-', suffix='.log') as log_fp, \
         ntf(prefix='fasttree-', suffix='.tre') as tree_fp, \
         tempdir(prefix='refpkg') as refpkg_dir:

        log_fp.close()

        fasttree(sequences, log_path=log_fp.name, output_fp=tree_fp, gtr=True,
                 threads=threads)
        tree_fp.close()

        rp = Refpkg(refpkg_dir(name), create=True)
        rp.update_metadata('locus', '')
        rp.update_phylo_model('FastTree', log_fp.name)
        rp.update_file('tree', tree_fp.name)

        # FASTA and Stockholm alignment
        with ntf(suffix='.fasta') as f:
            SeqIO.write(sequences, f, 'fasta')
            f.close()
            rp.update_file('aln_fasta', f.name)
        with ntf(suffix='.sto') as f:
            SeqIO.write(sequences, f, 'stockholm')
            f.close()
            rp.update_file('aln_sto', f.name)
        logging.debug("Reference package written to %s", rp.path)
        yield rp
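A hedged usage sketch: the yield implies as_refpkg is wrapped with contextlib.contextmanager, so it would be used roughly like this; the input alignment file is a placeholder.

from Bio import SeqIO

aligned = list(SeqIO.parse("aligned.fasta", "fasta"))  # hypothetical aligned input
with as_refpkg(aligned, name="demo.refpkg") as rp:
    print("reference package written to", rp.path)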
Example #24
def getTree(cdhitProc,rrnaFile):
    
    record_dict = SeqIO.to_dict(SeqIO.parse(open(rrnaFile,'r'), "fasta"))
    rrnas = []
    seen = set()
    for cluster in cdhitProc.clusters:
        members = cluster.seqs
        for mem in members:
            #Obtain accession IDs from cdhitProc 
            acc = acc_reg.findall(mem)[0]
            try:
                if acc not in seen:
                    record = record_dict[acc]
                    rrnas.append(record)
                    seen.add(acc)
            except Exception as k:
                print('Accession missing', k)
                
            #Obtain corresponding 16SRNAs
    #print "Number of rRNAs",len(rrnas)
    basename,_ = os.path.splitext(rrnaFile)
    tmp_rrna = "%s_filtered.fasta"%basename
    tree = "%s_filtered.tree"%basename
    SeqIO.write(rrnas, open(tmp_rrna,'w'), "fasta")
    
    #Run FastTree
    ft = UnAlignedFastTree(tmp_rrna,tree)
    ft.align() #Run multiple sequence alignment and spit out aligned fasta file
    ft.run() #Run fasttree on multiple alignment and spit out newick tree
    ft.cleanUp() #Clean up!
    return tree
Example #25
 def write_full_delta_files(self, deltaFileList):
     outFile = "./bstrap/bstrp_iteration_delta_"+str(self.p_iterN)+".fasta"
     ofile = open(outFile, "w")
     for dfile in deltaFileList: 
         for record in SeqIO.parse(dfile, "fasta"):
             SeqIO.write(record, ofile, "fasta")            
     ofile.close()
Example #26
 def test_long(self):
     """Simple muscle call using long file."""
     #Create a large input file by converting some of another example file
     temp_large_fasta_file = "temp_cw_prot.fasta"
     handle = open(temp_large_fasta_file, "w")
      records = list(SeqIO.parse("NBRF/Cw_prot.pir", "pir"))[:40]
     SeqIO.write(records, handle, "fasta")
     handle.close()
     #Prepare the command...
     cmdline = MuscleCommandline(muscle_exe)
     cmdline.set_parameter("in", temp_large_fasta_file)
     #Preserve input record order
     cmdline.set_parameter("stable", True) #Default None treated as False!
     #Use fast options
     cmdline.set_parameter("maxiters", 1)
     cmdline.set_parameter("diags", True) #Default None treated as False!
     #Use clustal output
     cmdline.set_parameter("clwstrict", True) #Default None treated as False!
      #Shouldn't need this, but just to make sure it is accepted
     cmdline.set_parameter("maxhours", 0.1)
     #No progress reports to stderr
     cmdline.set_parameter("quiet", True) #Default None treated as False!
     self.assertEqual(str(cmdline).rstrip(), muscle_exe + \
                      " -in temp_cw_prot.fasta -diags -maxhours 0.1" + \
                      " -maxiters 1 -clwstrict -stable -quiet")
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     result, out_handle, err_handle = generic_run(cmdline)
     align = AlignIO.read(out_handle, "clustal")
     self.assertEqual(len(records), len(align))
     for old, new in zip(records, align):
         self.assertEqual(old.id, new.id)
         self.assertEqual(str(new.seq).replace("-",""), str(old.seq))
     os.remove(temp_large_fasta_file)
     #See if quiet worked:
     self.assertEqual("", err_handle.read().strip())
Example #27
def extract_seq_from_file(seq_file, coords_file, output_file):
	# record the reference sequence names
	chrs = []

	# store the segments
	chr_seg = {}

	# count the segments
	cnt = 0

	seqio = SeqIO.parse(seq_file, 'fasta')
	for seq_record in seqio:
		chrs.append(seq_record.id)

	with open(coords_file, 'r') as f:
		for line in f:
			cnt += 1
			line = line.strip('\n')
			regions = re.split(r'\s+', line)

			if regions[0] not in chrs:
				log.warning('{0} not in reference sequence'.format(regions[0]))

			if len(regions) < 3:
				log.warning('This line has fewer than 3 fields (3 required)')
				continue

			chr_seg.setdefault(regions[0], []).append(regions)

	log.info('Summary: {0} chromosomes, {1} segments processed'.format(len(chr_seg), cnt))

	res_file_handle = open(output_file, 'w')

	# iterate over the reference sequences
	seqio = SeqIO.parse(seq_file, 'fasta')
	for seq_record in seqio:
		if seq_record.id in chr_seg:
			for seg in chr_seg[seq_record.id]:
				try:
					# create a SeqRecord object
					tmp_seq = SeqRecord.SeqRecord(seq=(seq_record.seq)[(int(seg[1])-1):int(seg[2])],
												  id='{0}:{1}..{2}:{3}'.format(seg[0], seg[1], seg[2], seg[3]))

					# reverse-complement when the strand is '-'
					if seg[3] == '-':
						tmp_seq = tmp_seq.reverse_complement(id=True,
															 name=True,
															 description='reverse_complement')

					SeqIO.write(tmp_seq, res_file_handle, 'fasta')
				except Exception as e:
					log.error(e)
		else:
			log.warning(seq_record.id + ' does not exist in the reference sequences')

	res_file_handle.close()
Example #28
def stitch_scaffolds(fa,outFile,len_limit=200000000,dist=500):
    """
    This function merge multiple scaffold together to form a longer sequence. 
    * fa: str. Reference fa file name
    * outFile: str. Filename output to the file.
    * len_limit: int. Maximum length of each merged scaffold.
    * dist: int. Distance between each scaffold.
    """
    in_handle = open(fa,'r')
    out_handle = open(outFile,'w')
    sequence = ''
    n = 1
    for record in SeqIO.parse(in_handle,'fasta'):
        sequence += str(record.seq)
        if len(sequence) >= len_limit:
            item = SeqRecord(Seq(sequence), id = 'chr'+str(n),description="")
            SeqIO.write(item,out_handle,'fasta')
            sequence = ''
            n += 1
        else:
            sequence += 'N' * dist
    # output the last (shorter) merged scaffold, dropping the trailing spacer
    if sequence != '':
        item = SeqRecord(Seq(sequence[:-dist]), id='chr'+str(n), description="")
        SeqIO.write(item, out_handle, 'fasta')
    out_handle.close()
    # sanity check: report the merged scaffold lengths
    handle = open(outFile)
    for record in SeqIO.parse(handle, 'fasta'):
        print(len(record.seq))
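A hedged usage sketch for stitch_scaffolds; the file names are placeholders.

# Hypothetical usage: merge scaffolds into ~200 Mb pseudo-chromosomes
# separated by 500 Ns.
stitch_scaffolds("ref.fa", "stitched.fa", len_limit=200000000, dist=500)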
Example #29
def generate_random_fasta(path):
    gt = None
    with open(__genome_table__) as f:
        gt = f.readlines()
    chr_map = {}
    for line in gt:
        line = line.strip()
        s = line.split('\t')
        chr_map[s[0]] = int(s[1])
    records = list(SeqIO.parse(open(__genome__,'r'), 'fasta'))
    seqs = []

    for i in range(0,1000):
        chr_index = randint(1, 22)
        chr_id = None
        for r in records:
            if r.id == 'chr{}'.format(chr_index):
                record = r  # the original mis-indexed records by chr_index
                chr_id = r.id
        ##We use limit from genome_table!
        ##I think it ignores N
        limit = chr_map[chr_id]
        start = randint(0, limit-200)
        end = start+200
        data = record.seq[start:end]
        seq = SeqRecord(data,'{}_{}_-10'.format(chr_id, start),'','')
        seqs.append(seq)

    random_fasta = os.path.join(path,'random.fa')
    output_handle = open(random_fasta, 'w')
    logging.info('###########GeneratingRandomFA Start########################')
    SeqIO.write(seqs, output_handle, 'fasta')
    logging.info('###########GeneratingRandomFA End########################')
    output_handle.close()
    return random_fasta
Example #30
def split_query_fasta(options, n_seq):
    '''
    Split "query.fasta" for multithreading use of PfamScan
    '''
    handle_in = open(options.outdir+"query.fasta","r")
    n_threads = int(options.threads/2)
    n_seq_split = (n_seq // n_threads) + 1
    i = 0
    n_seq_temp = 0
    handle_out = open(options.outdir+"query_temp"+str(i)+".fasta","w")
    for record in SeqIO.parse(handle_in, "fasta"):
        if n_seq_temp <= n_seq_split:
            SeqIO.write(SeqRecord(seq = record.seq, id = record.id, 
                description = record.description), handle_out, "fasta")
            n_seq_temp = n_seq_temp + 1
        else:
            handle_out.close()
            i = i + 1
            n_seq_temp = 0
            handle_out = open(options.outdir+"query_temp"+str(i)+".fasta","w")
            SeqIO.write(SeqRecord(seq = record.seq, id = record.id, 
                description = record.description), handle_out, "fasta")
            n_seq_temp = n_seq_temp + 1
    try:
        handle_out.close()
    except Exception:
        pass
    handle_in.close()
    return i, n_threads
Example #31
# read command-line arguments
files = sys.argv[1]
Npercent = float(sys.argv[2])
cutoff = int(sys.argv[3])  # quality cutoff used below; assumed third argument (left undefined in the original)

# prepare the output file
outname=files+".pN"+str(int(Npercent))+".fasta"
output_handle = open(outname, "w")


# iterate over the paired fasta and qual input files
countN=[]
records = PairedFastaQualIterator(open(files+".fasta"), open(files+".qual"))
for record in records:
	s=list(record)
	for i in range(len(record.letter_annotations['phred_quality'])):
		if record.letter_annotations['phred_quality'][i] < cutoff:
			s[i]="N"
	snew="".join(s).strip("N")
	if snew=="":
		 pass
	else:
		nbN=snew.count("N")
		if (float(nbN)/len(snew))< (Npercent/100):
			countN.append(nbN)
			newrecord = SeqRecord(Seq(snew,), id=record.id, description="length="+str(len(snew)))
			SeqIO.write(newrecord, output_handle, "fasta")
output_handle.close()
print "New fasta written in "+outname
print "This file contains "+ str(sum(countN)/len(countN))+ "N in average within the sequences"

Example #32
 phylome_dict = fasta2dict(os.path.join(phylomedb_path, phylo_ID + '.raw.fasta'))
 
 full_region_dict, match_list = first_round_alignments2(phylome_dict, phylo_ID, paralog_ID, alignment_dict, chain_dict)
 
 record_list = fasta2list(os.path.join(phylomedb_path, phylo_ID + '.raw.fasta'))
                           
 # Need to check if the paralog is in the phylome
 record_2 = phylome_dict.get(phylo_ID_2, -1)
 if not isinstance(record_2, SeqRecord):
     record_2 = proteome_dict[paralog_ID_2]
 
     record_list.append(record_2)
 
     # Write the records and align
     outpath = output_009_phylo_PDB_aln + '/' + P1 + '_' + P2 + '.fasta'
     SeqIO.write(record_list, outpath, 'fasta')
     pair_align_dict = align_seqs(outpath)
     
     final_pair_dict = OrderedDict()
     
     for key, value in pair_align_dict.items():
         if key in (phylo_ID, paralog_ID_2):
             final_pair_dict[key] = value
      
     out_files = save_final_sequences2(final_pair_dict, full_region_dict, paralog_ID_2, phylo_ID, aa2three_letter)
     
     same_phylome = 0
     
 else:
     final_pair_dict = OrderedDict()
 
Example #33
start_time = time.time()
print("\nEntering python script: 'parse_for_barcode_qual.py'")

# inputs
fastq_file = sys.argv[1]  # fastq file to parse
qual_threshold = int(sys.argv[2])  # int for the quality desired
output_file = sys.argv[3]  # file to be written

filtered_fastq = open(sys.argv[3], "a")  # open results file
# read your fastq file
for read in SeqIO.parse(fastq_file, 'fastq'):
    if read.letter_annotations["phred_quality"][
            0] < qual_threshold:  # check quality at position 0
        continue
    elif read.letter_annotations["phred_quality"][
            1] < qual_threshold:  # check quality at position 1
        continue
    elif read.letter_annotations["phred_quality"][
            2] < qual_threshold:  # check quality at position 2
        continue
    else:
        SeqIO.write(
            read, filtered_fastq, 'fastq'
        )  # write the record if all qualities are above qual_threshold

# Close file
filtered_fastq.close()

print("...time elapsed: " + str(time.time() - start_time) + " seconds")
print("...exiting python script: 'parse_for_barcode_qual.py'")
Example #34
filter_out = "/staton/projects/chestnut/psudochro/analysis_081718_annotation/12k_RNA_Qrobur_081718/8_fixInternalStops/3_renameGenes/genes_to_filter.txt"

final_annotation = "/staton/projects/chestnut/psudochro/analysis_081718_annotation/12k_RNA_Qrobur_081718/8_fixInternalStops/3_renameGenes/Castanea_mollissima_scaffolds_v3.4_HQcds.fna"

# Create a list with every gene you want to filter out.

merged_list = []
with open(filter_out) as m:
    for line in m:
        merged_gene = line.rstrip()
        merged_list.append(merged_gene)


# Use this list to filter out any sequences with a matching record ID:

inhandle = open(input_annotations)
outhandle = open(final_annotation, "w")
count = 0

for record in SeqIO.parse(inhandle, "fasta"):
    id = record.id
    if id not in merged_list:
        SeqIO.write(record, outhandle, "fasta")
    else:
        count += 1

inhandle.close()
outhandle.close()
print("%d genes were filtered out" % count)
Example #35
    rec = list(SeqIO.parse(StringIO(out), "fasta"))[0]
    print(rec)

    metadata = find_sample(sample['sample_id'])
    print(metadata)
    """
{u'pregnancy_week': u'', u'municipality': u'murici', u'patient_sex': u'male', u'host_species': u'human', u'lab_internal_sample_id': u'', u'sample_id': u'ZBRD103', u'minion_barcodes': u'', u'ct': u'29.09', u'lab_id_lacen': u'150101004197', u'collection_date': u'2015-08-20', u'amplicon_concentration_pool_1': u'', u'pregnancy_trimester': u'', u'sample_number': u'103', u'symptoms': u'', u'creation_persistent_id': u'9EDCA6E1F234B3A6E160D5E819D8918D', u'state': u'alagoas', u'extraction_date': u'2016-06-13', u'creation_host_timestamp': u'09/08/2016 21:06:44', u'rt_positive': u'1', u'patient_age': u'25', u'modification_account_name': u'Admin', u'modification_persistent_id': u'9EDCA6E1F234B3A6E160D5E819D8918D', u'lab': u'lacen_maceio', u'onset_date': u'2015-08-18', u'microcephaly': u'', u'sample_type': u'', u'creation_account_name': u'Admin', u'modification_host_timestamp': u'', u'country': u'brazil', u'notes': u'', u'pregnant': u''}
"""

    rec.id = "%s|%s|%s|%s|%s|%s" % (
        metadata['lab_id_lacen'], metadata['sample_id'], run_name,
        metadata['municipality'], metadata['state'],
        metadata['collection_date'])

    if rec.seq.count('N') < 3000:
        SeqIO.write([rec], goodfh, "fasta")
    elif rec.seq.count('N') < 5500:
        SeqIO.write([rec], partialfh, "fasta")
    else:
        SeqIO.write([rec], badfh, "fasta")
    """
    con = sqlite3.connect(sys.argv[1])
    con.row_factory = sqlite3.Row
    cur = con.cursor()

def lookup_sample(sample):
    cur.execute("select * from samples, runs where runs.Batch = ? and runs.sample_fk = samples.rowid", (sample,))
    row = cur.fetchone()
    return row

for rec in SeqIO.parse(sys.stdin, "fasta"):
Example #36
    recs = list(SeqIO.parse(gbk, "genbank"))
    return recs[0]


genomes = folder_list()

# read and modify gbk files
for genome in genomes:
    list_of_files = file_parser(genome)
    print('Parsing genome ', str(genome))
    for i in range(len(list_of_files)):
        record = read_gbk(list_of_files[i])
        record.id = genome
        path_to_save = os.path.join(path, genome,
                                    genome) + '.region00' + str(i + 1) + '.gbk'
        SeqIO.write(record, path_to_save, format='genbank')

# move created files into the right folder to be analysed
target_path = '/home/dani/Documents/MRC_postdoc/Pangenomic/phylo/original_data/bigscape_results/gbks'
for genome in genomes:
    list_of_files = file_parser_NT(genome)
    for file in list_of_files:
        file_n = file.split('/')[-1]
        if 'NT' in file_n and 'region' in file_n:  # condition to move files
            os.rename(file, os.path.join(target_path, file_n))
        else:
            pass

n = 0
for genome in genomes:
    if len(file_parser(genome)) > 0:
Example #37
import argparse
import os
import sys
from Bio import SeqIO


# Define a function to check files exist, as a type for the argparse.
def File(MyFile):
    if not os.path.isfile(MyFile):
        raise argparse.ArgumentTypeError(MyFile +
                                         ' does not exist or is not a file.')
    return MyFile


# Set up the arguments for this script
ExplanatoryMessage = ExplanatoryMessage.replace('\n', ' ').replace('  ', ' ')
parser = argparse.ArgumentParser(description=ExplanatoryMessage)
parser.add_argument('FastaFile', type=File)
args = parser.parse_args()

OutSeqs = []
for seq in SeqIO.parse(open(args.FastaFile), 'fasta'):
    empty = True
    for base in str(seq.seq):
        if not base in ["?", "-", "N"]:
            empty = False
            break
    if not empty:
        OutSeqs.append(seq)

SeqIO.write(OutSeqs, sys.stdout, "fasta")
Example #38
#Isolates a chromosome from an SGA file
import pandas as pd
from Bio import SeqIO

chromosome = 1
annotations_file = "data/human_complete.sga"
promoters_file = "data/human_complete.fa"

annotations = pd.read_csv(
    annotations_file,
    sep='\t',
    names=["Id", "Type", "Position", "Strand", "Chromosome", "Gene"])
annotations['Chromosome'] = annotations.Id.str[7:9].astype(int)

isolated_promoters = annotations[annotations['Chromosome'] ==
                                 chromosome].Gene.tolist()

record_list = []
with open(promoters_file, 'r') as handle:
    for record in SeqIO.parse(handle, "fasta"):
        if (record.description.split(' ')[1] in isolated_promoters):
            record_list.append(record)

output_file = "data/blast/genome_database/human_promoters_chr{0}.fa".format(
    chromosome)
with open(output_file, "w+") as output_handle:
    SeqIO.write(record_list, output_handle, "fasta")
Example #39
    ## present in summary_complete.  If it is not it is bad and should not
    ## be used.  Also add a reasonable taxonomic name for each reference sequence
    ## to summary_complete as tax_id.

    with open(ref_dir_domain + 'combined_18S.' + domain + '.tax.fasta',
              'w') as good_fasta_18S:
        for record in SeqIO.parse(ref_dir_domain + 'combined_18S.unique.fasta',
                                  'fasta'):
            tax_name = str(record.id)
            genome = tax_name.split('|')[0]

            if genome in summary_complete.index:
                summary_complete.loc[genome, 'tax_name'] = tax_name
                keep = True
                kept_genomes.append(genome)
                SeqIO.write(record, good_fasta_18S, 'fasta')

    summary_complete = summary_complete[summary_complete.index.isin(
        kept_genomes)]

    ## Write out summary_complete and exit.

    summary_complete.to_csv(ref_dir_domain + 'genome_data.csv.gz')
    quit()

## For bacteria and archaea, find 16S rRNA genes in fna files. Get some parameters on the genome; number of
## 16S genes, number of elements, size of genome, and add these to summary_complete.
## Generate two fasta files of the 16S rRNA genes.  One will be used later to build
## the reference tree and has sensible taxonomic names.  One is used to calculate
## the phi values and is named by assembly.
Example #40
def identify_link_subclass(input_fastq,
                           output_prefix,
                           ambiguous_ok=False,
                           find_prs_max=50):
    semi_good = []
    still_bad = []

    reader = SeqIO.parse(open(input_fastq), 'fastq')
    for r in reader:
        s = str(r.seq)
        s2 = str(r.seq.reverse_complement())
        i, j = -1, -1
        for x, name in prs_bcr.items():
            i = s[:find_prs_max].find(x)
            if i > 0:
                print("precise found for ", x, " on + strand at pos", i)
                break
            j = s2[:find_prs_max].find(x)
            if j > 0: break

        if i < 0 and j < 0 and ambiguous_ok:
            for mutseq in bcr_mutation_dict[name]:
                i = s[:find_prs_max].find(mutseq)
                if i > 0:
                    print("ambiguous found for ", mutseq,
                          " on + strand at pos", i)
                    break
        if i < 0 and j < 0 and ambiguous_ok:
            for mutseq in bcr_mutation_dict[name]:
                j = s2[:find_prs_max].find(mutseq)
                if j > 0:
                    print("ambiguous found for ", mutseq,
                          " on + strand at pos", j)
                    break
        if i > 0: semi_good.append((r, len(r.seq) - i - 6, name, 's1'))
        elif j > 0: semi_good.append((r, len(r.seq) - j - 6, name, 's2'))
        else: still_bad.append((r, i))
        #if len(semi_good)>=1000: break

    seen = defaultdict(lambda: [])  # (umi,type) --> list of CCS id
    seen_debug = defaultdict(
        lambda: Counter())  # (umi,type) --> (insert) --> count
    f_by_type = {}
    for rep_seq, rep_name in prs_bcr.items():
        f_by_type[rep_name] = open(
            "{o}_{n}.determined.fq".format(o=output_prefix, n=rep_name), 'w')
    f2 = open(output_prefix + '.info.csv', 'w')
    f3 = open(output_prefix + '.cluster_info.csv', 'w')
    f3.write("tag\tcount\tmembers\n")
    fu = open(output_prefix + '.undetermined.fq', 'w')
    writer = DictWriter(f2, fieldnames=INFO_FIELDNAMES, delimiter='\t')
    writer.writeheader()

    #
    #  [r2 should mostly be blank] -- [12bp UMI] -- [insert] --- [primer] --- [r1, actually "C" region]
    #

    # for debugging
    linker_pos = []

    for p in semi_good:
        info = {
            'id': p[0].id,
            'strand': p[-1],
            'type': p[2],
            'len': len(p[0].seq),
            'ilen': 'NA',
            'umi': 'NA',
            'primer': 'NA',
            'r2': 'NA',
            'insert': 'NA',
            'r1': 'NA'
        }
        if p[-1] == 's2':
            s = str(p[0].seq)
        else:
            s = str(p[0].seq.reverse_complement())
        i = s.find(LINKER)
        if i > 0: linker_pos.append(i)
        if UMI_LEN <= i < UMI_LEN * 3:
            insert = s[i:p[1]]
            ilen = p[1] - i
            info['umi'] = s[i - 12:i]
            info['r2'] = s[:i - 12]
            info['primer'] = s[p[1]:p[1] + 6]
            info['r1'] = s[p[1] + 6:]
            info['insert'] = insert
            info['ilen'] = ilen
            tag = info['umi'] + '-' + info['type']
            writer.writerow(info)
            seen[tag].append(p[0].id)
            seen_debug[tag][insert] += 1
        else:
            SeqIO.write(p[0], fu, 'fastq')

    # now for each (umi,type), output the most common sequence
    umi_index = 0
    for umi_type in seen_debug:
        umi_index += 1
        umi, type = umi_type.split('-')
        major_seq, major_count = seen_debug[umi_type].most_common(1)[0]
        total_count = sum(seen_debug[umi_type].values())
        f_by_type[type].write("@pacbio.{0} UMI:{1}:{2} type:{3} mcount:{4} count:{5}\n".format(\
            umi_index, umi, 'G'*len(umi), type, major_count, total_count))
        f_by_type[type].write("{0}\n+\n{1}\n".format(major_seq,
                                                     'I' * len(major_seq)))

    for f in f_by_type.values():
        f.close()
    f2.close()
    fu.close()

    for k, v in seen.items():
        f3.write("{0}\t{1}\t{2}\n".format(k, len(v), ",".join(v)))
    f3.close()

    linker_pos = np.array(linker_pos)
    print("DEBUG: # of linkers found", len(linker_pos))
    print("DEBUG: # of linkers found at 12bp:", sum(linker_pos == 12))
    print("DEBUG: # of linkers found > 12bp:", sum(linker_pos > 12))
    print("DEBUG: # of linkers found < 12bp:", sum(linker_pos < 12))
Example #41
 def create_fasta_file(file_address, corpus, label):
     seq_id_pairs=[('.'.join([str(idx+1),label[idx]]),x) for idx, x in enumerate(corpus)]
     seq_recs=[ SeqRecord(Seq(seq,generic_dna),id=id, description='') for id,seq in seq_id_pairs]
     SeqIO.write(seq_recs, file_address, "fasta")
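A hedged usage sketch for create_fasta_file; record IDs become '1.pos', '2.neg', and so on.

# Hypothetical usage: a sequence corpus plus one label per sequence.
create_fasta_file("corpus.fasta", ["ACGT", "GGTA"], ["pos", "neg"])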
Example #42
                    metavar='<ncRNA file>',
                    required=True)

#Getting arguments
args = parser.parse_args()
transcriptome_file = args.transcriptome_file
protein_file = args.protein_file
ncRNA_file = args.ncRNA_file

#Loading transcriptomes seq ID and proteins features
transcripts = list(SeqIO.parse(transcriptome_file, "fasta"))
dict_proteins = SeqIO.to_dict(SeqIO.parse(protein_file, "fasta"))

#Generate list with ncRNA sequences id
ncRNA_list = []
transcriptome_index = transcriptome_file[:-5] + "index"

for transcript in transcripts:
    if transcript.id not in dict_proteins:
        ncRNA_list.append(transcript.id)

#Generate index database for transcriptome (store sequence features)
transcriptome_db = SeqIO.index_db(transcriptome_index, transcriptome_file,
                                  "fasta")

#Generate ncRNA file
with open(ncRNA_file, "w") as ncRNA_output:
    for i in ncRNA_list:
        if i in transcriptome_db:
            SeqIO.write(transcriptome_db[i], ncRNA_output, "fasta")
Example #43
def multi_to_single(genbank, name, output):
    '''
    Converts a multi entry genbank (where each entry is a contig)
    into a single entry genbank, preserving all annotations.
    '''

    # total bases
    total = 0

    handle = open(genbank)
    records = list(SeqIO.parse(handle, "genbank"))
    feature_count = 0
    colour_count = 0

    # make header genbank format friendly
    if len(name) >= 10:
        name = name[:9]
    for r in records:
        length = len(r)
        id = r.name
        seq = r.seq
        seq.alphabet = generic_dna
        if total > 0:
            newrecord.seq = newrecord.seq + seq
        else:
            # first sequence, initialise seqrecord
            newrecord = SeqRecord(seq=r.seq, name=name, id=name)
            newrecord.seq.alphabet = generic_dna
        # create feature for contig
        if colour_count % 2 == 0:
            newrecord.features.append(
                SeqFeature(FeatureLocation(total, total + length),
                           type="fasta_record",
                           qualifiers={
                               'note': [r.name],
                               'colour': '11'
                           }))
            colour_count = colour_count + 1
        else:
            newrecord.features.append(
                SeqFeature(FeatureLocation(total, total + length),
                           type="fasta_record",
                           qualifiers={
                               'note': [r.name],
                               'colour': '10'
                           }))
            colour_count = colour_count + 1
        # copy CDS features
        for f in r.features:
            feature_count += 1
            f.qualifiers["locus_tag"] = str(feature_count)
            newrecord.features.append(
                SeqFeature(FeatureLocation(f.location.nofuzzy_start + total,
                                           f.location.nofuzzy_end + total),
                           strand=f.strand,
                           type=f.type,
                           qualifiers=f.qualifiers))
        total += length
    handle.close()
    #write out new single entry genbank
    SeqIO.write(newrecord, output, "genbank")
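A hedged usage sketch for multi_to_single; file names are placeholders, and names of 10 or more characters are truncated as in the code above.

# Hypothetical usage: collapse a multi-contig GenBank into one record.
multi_to_single("contigs.gbk", "genome1", "single_entry.gbk")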
Example #44
    action="store_true")

in_args = parser.parse_args()

in_file = os.path.abspath(in_args.in_file)

prot_seqs = []
with open(in_file, "r") as ifile:
    dna_seqs = SeqIO.parse(ifile, "fasta")
    for seq in dna_seqs:
        if in_args.strip_description:
            seq.description = ""
        seq.alphabet = IUPAC.protein
        seq.seq = seq.seq.translate(to_stop=True)
        prot_seqs.append(seq)

tmp_file = MyFuncs.TempFile()
with open(tmp_file.file, "w") as ofile:
    SeqIO.write(prot_seqs, ofile, "fasta")

if not in_args.out_file:
    with open(tmp_file.file, "r") as ifile:
        print(ifile.read())
else:
    out_file = os.path.abspath(in_args.out_file)
    if os.path.exists(out_file) and not in_args.over_write:
        print(
            "Error: The outfile you've specified already exists. Use the -ow flag if you want to over-write it."
        )
    else:
        shutil.move(tmp_file.file, out_file)
Example #45
    for i, arg in enumerate(sys.argv):
        if arg == "-f":
            filepath = sys.argv[i + 1]
        elif arg == "-w":
            window_size = int(sys.argv[i + 1])
        elif arg == "-c":
            cutoff = int(sys.argv[i + 1])
        elif arg == "-k":
            ksize = int(sys.argv[i + 1])
        elif arg == "-o":
            outpath = sys.argv[i + 1]

    if os.path.isdir(filepath):
        filepaths = [os.path.join(filepath, fn) for fn in os.listdir(filepath)]
    else:
        filepaths = list(filepath.split(","))

    haplotypes, haplo_freqs = predict_haplotypes(filepaths=filepaths,
                                                 window_size=window_size,
                                                 ksize=ksize,
                                                 cutoff=cutoff)

    sequences = [
        SeqRecord(Seq(haplo), str(i)) for i, haplo in enumerate(haplotypes)
    ]
    SeqIO.write(sequences, outpath, "fasta")

    freq_path = outpath + ".freqs.txt"
    with open(freq_path, "w") as freq_file:
        freq_file.writelines([",".join(haplo_freqs.astype(str))])
Example #46
def main():

    start_time = time.time()

    args = parse_args()

    samtools_runner = RunSamtools()

    # If the user gave an output directory and it doesn't already exist,
    # create it.
    if args.directory and not os.path.exists(args.directory):
        os.makedirs(args.directory)

    # Set up logfile
    if args.log is True:
        if args.output != '':
            logfile = os.path.join(args.directory, args.output + ".log")
        else:
            # come up with a different prefix
            logfile = os.path.join(
                args.directory,
                time.strftime("%d%m%y_%H%M", time.localtime()) + '.log')
    else:
        logfile = None
    logging.basicConfig(filename=logfile,
                        level=logging.DEBUG,
                        filemode='w',
                        format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logging.info('program started')
    logging.info('command line: {0}'.format(' '.join(sys.argv)))

    # Checks that the correct programs are installed
    check_command(['bwa'], 'bwa')
    #check_command(['samtools'], 'samtools')
    check_command(['makeblastdb'], 'blast')
    check_command(['bedtools'], 'bedtools')

    # Checks to make sure the runtype is valid and provides an error if not
    if args.runtype != "improvement" and args.runtype != "typing":
        logging.info('Invalid runtype selected: {}'.format(args.runtype))
        logging.info(
            'Runtype should be improvement or typing (see instructions for further details)'
        )
        exit(-1)

    # Get feature types in correct format
    args.cds = ' '.join(args.cds)
    args.trna = ' '.join(args.trna)
    args.rrna = ' '.join(args.rrna)

    # Gather together the reads in pairs with their corresponding
    # assemblies (if required)
    fileSets = read_file_sets(args)
    # Start analysing each read set specified
    for sample in fileSets:
        forward_read = fileSets[sample][0]
        reverse_read = fileSets[sample][1]
        try:
            assembly = fileSets[sample][2]
        except IndexError:
            pass

        # Read in the queries
        query_records = SeqIO.parse(args.queries, 'fasta')
        # Cycle through each query on its own before moving onto the next one
        for query in query_records:
            # get the name of the query to set up file names
            query_name = query.id

            # Create the output file and folder names,
            # make the folders where necessary
            if args.directory == '':
                current_dir = os.getcwd() + '/'
            else:
                current_dir = args.directory
            if current_dir[-1] != '/':
                current_dir = current_dir + '/'

            temp_folder = current_dir + sample + '_' + query_name + '_temp/'
            output_sam = temp_folder + sample + '_' + query_name + '.sam'
            left_bam = temp_folder + sample + '_' + query_name + '_left.bam'
            right_bam = temp_folder + sample + '_' + query_name + '_right.bam'
            left_reads = temp_folder + sample + '_' + query_name + '_left.fastq'
            right_reads = temp_folder + sample + '_' + query_name + '_right.fastq'
            left_clipped_reads = temp_folder + sample + '_' + query_name + '_left_clipped.fastq'
            right_clipped_reads = temp_folder + sample + '_' + query_name + '_right_clipped.fastq'
            final_left_reads = temp_folder + sample + '_' + query_name + '_LeftFinal.fastq'
            final_right_reads = temp_folder + sample + '_' + query_name + '_RightFinal.fastq'
            no_hits_table = current_dir + sample + '_' + query_name + '_table.txt'
            make_directories([temp_folder])

            # need to write out each query to a temp file
            # otherwise it can't be indexed etc
            query_tmp = temp_folder + query_name + '.fasta'
            SeqIO.write(query, query_tmp, 'fasta')

            # Index the IS query for BWA
            bwa_index(query_tmp)

            # Map to IS query
            run_command([
                'bwa', 'mem', '-t', args.t, query_tmp, forward_read,
                reverse_read, '>', output_sam
            ],
                        shell=True)
            # Pull unmapped reads flanking IS
            run_command(samtools_runner.view(left_bam, output_sam, smallF=36),
                        shell=True)
            run_command(samtools_runner.view(right_bam,
                                             output_sam,
                                             smallF=4,
                                             bigF=40),
                        shell=True)
            # Turn bams to reads for mapping
            run_command(
                ['bedtools', 'bamtofastq', '-i', left_bam, '-fq', left_reads],
                shell=True)
            run_command([
                'bedtools', 'bamtofastq', '-i', right_bam, '-fq', right_reads
            ],
                        shell=True)
            # Add corresponding clipped reads to their respective left and right ends
            print('Usage before extracting soft-clipped reads')
            print('Memory usage: %s (kb)' %
                  resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
            logging.info(
                'Extracting soft clipped reads, selecting reads that are <= ' +
                str(args.max_clip) + 'bp and >= ' + str(args.min_clip) + 'bp')
            extract_clipped_reads(output_sam, args.min_clip, args.max_clip,
                                  left_clipped_reads, right_clipped_reads)
            print('Usage after reads written out, before concatenation')
            print('Memory usage: %s (kb)' %
                  resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
            run_command(
                ['cat', left_clipped_reads, left_reads, '>', final_left_reads],
                shell=True)
            run_command([
                'cat', right_clipped_reads, right_reads, '>', final_right_reads
            ],
                        shell=True)
            print('Usage after reads concatenated onto previous reads')
            print('Memory usage: %s (kb)' %
                  resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

            # Create BLAST database for IS query
            check_blast_database(query_tmp)
            if os.stat(final_left_reads)[6] == 0 or os.stat(
                    final_right_reads)[6] == 0:
                logging.info(
                    'One or both read files are empty. This is probably due to no copies of the IS of interest being present in this sample. Program quitting.'
                )
                with open(no_hits_table, 'w') as f:
                    if args.runtype == 'typing':
                        header = [
                            "region", "orientation", "x", "y", "gap", "call",
                            "%ID", "%Cov", "left_gene", "left_strand",
                            "left_distance", "right_gene", "right_strand",
                            "right_distance", "functional_prediction"
                        ]
                        f.write('\t'.join(header) + '\nNo hits found')
                    else:
                        header = ['contig', 'end', 'x', 'y']
                        f.write('\t'.join(header) + '\nNo hits found')
                remove_temp_directory(args.temp, temp_folder)
                continue

            # Improvement mode
            if args.runtype == "improvement":

                # Get prefix for output filenames
                left_header = sample + '_left'
                right_header = sample + '_right'
                left_to_ref_sam = temp_folder + left_header + '_' + query_name + '.sam'
                right_to_ref_sam = temp_folder + right_header + '_' + query_name + '.sam'
                left_to_ref_bam = temp_folder + left_header + '_' + query_name + '.bam'
                right_to_ref_bam = temp_folder + right_header + '_' + query_name + '.bam'
                left_bam_sorted = current_dir + left_header + '_' + query_name + '.sorted'
                right_bam_sorted = current_dir + right_header + '_' + query_name + '.sorted'
                left_cov_bed = temp_folder + left_header + '_' + query_name + '_cov.bed'
                right_cov_bed = temp_folder + right_header + '_' + query_name + '_cov.bed'
                left_final_cov = current_dir + left_header + '_' + query_name + '_finalcov.bed'
                right_final_cov = current_dir + right_header + '_' + query_name + '_finalcov.bed'
                left_merged_bed = current_dir + left_header + '_' + query_name + '_merged.sorted.bed'
                right_merged_bed = current_dir + right_header + '_' + query_name + '_merged.sorted.bed'
                final_genbankSingle = current_dir + sample + '_' + query_name + '_annotatedSingle.gbk'

                # create fasta file from genbank if required
                if args.extension == '.gbk':
                    assembly_gbk = assembly
                    (file_path, file_name_before_ext,
                     full_ext) = get_readFile_components(assembly_gbk)
                    assembly_fasta = os.path.join(
                        temp_folder, file_name_before_ext) + '.fasta'
                    gbk_to_fasta(assembly, assembly_fasta)
                    assembly = assembly_fasta
                # Map ends back to contigs
                bwa_index(assembly)
                if args.a:
                    run_command([
                        'bwa', 'mem', '-a', '-T', args.T, '-t', args.t,
                        assembly, final_left_reads, '>', left_to_ref_sam
                    ],
                                shell=True)
                    run_command([
                        'bwa', 'mem', '-a', '-T', args.T, '-t', args.t,
                        assembly, final_right_reads, '>', right_to_ref_sam
                    ],
                                shell=True)
                else:
                    run_command([
                        'bwa', 'mem', '-t', args.t, assembly, final_left_reads,
                        '>', left_to_ref_sam
                    ],
                                shell=True)
                    run_command([
                        'bwa', 'mem', '-t', args.t, assembly,
                        final_right_reads, '>', right_to_ref_sam
                    ],
                                shell=True)

                run_command(samtools_runner.view(left_to_ref_bam,
                                                 left_to_ref_sam),
                            shell=True)
                run_command(samtools_runner.view(right_to_ref_bam,
                                                 right_to_ref_sam),
                            shell=True)
                run_command(samtools_runner.sort(left_bam_sorted,
                                                 left_to_ref_bam),
                            shell=True)
                run_command(samtools_runner.sort(right_bam_sorted,
                                                 right_to_ref_bam),
                            shell=True)
                run_command(samtools_runner.index(left_bam_sorted), shell=True)
                run_command(samtools_runner.index(right_bam_sorted),
                            shell=True)
                # Create BED file with coverage information
                run_command([
                    'bedtools', 'genomecov', '-ibam', left_bam_sorted + '.bam',
                    '-bg', '>', left_cov_bed
                ],
                            shell=True)
                run_command([
                    'bedtools', 'genomecov', '-ibam',
                    right_bam_sorted + '.bam', '-bg', '>', right_cov_bed
                ],
                            shell=True)
                filter_on_depth(left_cov_bed, left_final_cov, args.cutoff)
                filter_on_depth(right_cov_bed, right_final_cov, args.cutoff)
                run_command([
                    'bedtools', 'merge', '-i', left_final_cov, '-d',
                    args.merging, '>', left_merged_bed
                ],
                            shell=True)
                run_command([
                    'bedtools', 'merge', '-i', right_final_cov, '-d',
                    args.merging, '>', right_merged_bed
                ],
                            shell=True)
                # Create table and genbank
                if args.extension == '.fasta':
                    run_command([
                        args.path + 'create_genbank_table.py', '--left_bed',
                        left_merged_bed, '--right_bed', right_merged_bed,
                        '--assembly', assembly, '--type fasta', '--output',
                        current_dir + sample + '_' + query_name
                    ],
                                shell=True)
                elif args.extension == '.gbk':
                    run_command([
                        args.path + 'create_genbank_table.py', '--left_bed',
                        left_merged_bed, '--right_bed', right_merged_bed,
                        '--assembly', assembly_gbk, '--type genbank',
                        '--output', current_dir + sample + '_' + query_name
                    ],
                                shell=True)
                #create single entry genbank
                multi_to_single(sample + '_' + query_name + '_annotated.gbk',
                                sample, final_genbankSingle)

            # Typing mode
            if args.runtype == "typing":

                # Get prefix of typing reference for output filenames
                (file_path, file_name) = os.path.split(args.typingRef)
                typingName = file_name.split('.g')[0]
                typingRefFasta = temp_folder + typingName + '.fasta'
                # Create reference fasta from genbank
                gbk_to_fasta(args.typingRef, typingRefFasta)
                # Create bwa index file for typing reference
                bwa_index(typingRefFasta)
                # Set up file names for output files
                left_header = sample + '_left_' + typingName
                right_header = sample + '_right_' + typingName
                left_to_ref_sam = temp_folder + left_header + '_' + query_name + '.sam'
                right_to_ref_sam = temp_folder + right_header + '_' + query_name + '.sam'
                left_to_ref_bam = temp_folder + left_header + '_' + query_name + '.bam'
                right_to_ref_bam = temp_folder + right_header + '_' + query_name + '.bam'
                left_bam_sorted = current_dir + left_header + '_' + query_name + '.sorted'
                right_bam_sorted = current_dir + right_header + '_' + query_name + '.sorted'
                left_cov_bed = temp_folder + left_header + '_' + query_name + '_cov.bed'
                right_cov_bed = temp_folder + right_header + '_' + query_name + '_cov.bed'
                left_cov_merged = temp_folder + left_header + '_' + query_name + '_cov_merged.sorted.bed'
                right_cov_merged = temp_folder + right_header + '_' + query_name + '_cov_merged.sorted.bed'
                left_final_cov = current_dir + left_header + '_' + query_name + '_finalcov.bed'
                right_final_cov = current_dir + right_header + '_' + query_name + '_finalcov.bed'
                left_merged_bed = current_dir + left_header + '_' + query_name + '_merged.sorted.bed'
                right_merged_bed = current_dir + right_header + '_' + query_name + '_merged.sorted.bed'
                bed_intersect = current_dir + sample + '_' + typingName + '_' + query_name + '_intersect.bed'
                bed_closest = current_dir + sample + '_' + typingName + '_' + query_name + '_closest.bed'
                bed_unpaired_left = current_dir + sample + '_' + typingName + '_' + query_name + '_left_unpaired.bed'
                bed_unpaired_right = current_dir + sample + '_' + typingName + '_' + query_name + '_right_unpaired.bed'

                # Map reads to reference, sort
                if args.a:
                    run_command([
                        'bwa', 'mem', '-a', '-T', args.T, '-t', args.t,
                        typingRefFasta, final_left_reads, '>', left_to_ref_sam
                    ],
                                shell=True)
                    run_command([
                        'bwa', 'mem', '-a', '-T', args.T, '-t', args.t,
                        typingRefFasta, final_right_reads, '>',
                        right_to_ref_sam
                    ],
                                shell=True)
                else:
                    run_command([
                        'bwa', 'mem', '-t', args.t, typingRefFasta,
                        final_left_reads, '>', left_to_ref_sam
                    ],
                                shell=True)
                    run_command([
                        'bwa', 'mem', '-t', args.t, typingRefFasta,
                        final_right_reads, '>', right_to_ref_sam
                    ],
                                shell=True)

                run_command(samtools_runner.view(left_to_ref_bam,
                                                 left_to_ref_sam),
                            shell=True)
                run_command(samtools_runner.view(right_to_ref_bam,
                                                 right_to_ref_sam),
                            shell=True)
                run_command(samtools_runner.sort(left_bam_sorted,
                                                 left_to_ref_bam),
                            shell=True)
                run_command(samtools_runner.sort(right_bam_sorted,
                                                 right_to_ref_bam),
                            shell=True)
                run_command(samtools_runner.index(left_bam_sorted), shell=True)
                run_command(samtools_runner.index(right_bam_sorted),
                            shell=True)

                # Create BED files with coverage information
                run_command([
                    'bedtools', 'genomecov', '-ibam', left_bam_sorted + '.bam',
                    '-bg', '>', left_cov_bed
                ],
                            shell=True)
                run_command([
                    'bedtools', 'genomecov', '-ibam',
                    right_bam_sorted + '.bam', '-bg', '>', right_cov_bed
                ],
                            shell=True)
                run_command([
                    'bedtools', 'merge', '-d', args.merging, '-i',
                    left_cov_bed, '>', left_cov_merged
                ],
                            shell=True)
                run_command([
                    'bedtools', 'merge', '-d', args.merging, '-i',
                    right_cov_bed, '>', right_cov_merged
                ],
                            shell=True)
                # Filter coverage BED files on coverage cutoff (so only take
                # high coverage regions for further analysis)
                filter_on_depth(left_cov_bed, left_final_cov, args.cutoff)
                filter_on_depth(right_cov_bed, right_final_cov, args.cutoff)
                run_command([
                    'bedtools', 'merge', '-d', args.merging, '-i',
                    left_final_cov, '>', left_merged_bed
                ],
                            shell=True)
                run_command([
                    'bedtools', 'merge', '-d', args.merging, '-i',
                    right_final_cov, '>', right_merged_bed
                ],
                            shell=True)
                # Find intersects and closest points of regions
                run_command([
                    'bedtools', 'intersect', '-a', left_merged_bed, '-b',
                    right_merged_bed, '-wo', '>', bed_intersect
                ],
                            shell=True)
                # if one or more of the bed files are empty, then closestBed returns an error
                # that needs to be caught
                try:
                    run_command([
                        'closestBed', '-a', left_merged_bed, '-b',
                        right_merged_bed, '-d', '>', bed_closest
                    ],
                                shell=True)
                except BedtoolsError:
                    with open(no_hits_table, 'w') as f:
                        header = [
                            "region", "orientation", "x", "y", "gap", "call",
                            "%ID", "%Cov", "left_gene", "left_strand",
                            "left_distance", "right_gene", "right_strand",
                            "right_distance", "functional_prediction"
                        ]
                        f.write('\t'.join(header) + '\nNo hits found')
                    continue
                # Create all possible closest bed files for checking unpaired hits
                # If any of these fail, just make empty unpaired files to pass to create_typing_out
                try:
                    run_command([
                        'closestBed', '-a', left_merged_bed, '-b',
                        right_cov_merged, '-d', '>', bed_unpaired_left
                    ],
                                shell=True)
                except BedtoolsError:
                    if not os.path.isfile(bed_unpaired_left) or os.stat(
                            bed_unpaired_left).st_size == 0:
                        open(bed_unpaired_left, 'w').close()
                try:
                    run_command([
                        'closestBed', '-a', left_cov_merged, '-b',
                        right_merged_bed, '-d', '>', bed_unpaired_right
                    ],
                                shell=True)
                except BedtoolsError:
                    if not os.path.isfile(bed_unpaired_right) or os.stat(
                            bed_unpaired_right).st_size == 0:
                        open(bed_unpaired_right, 'w').close()
                # Create table and annotate genbank with hits
                if args.igv:
                    igv_flag = '1'
                else:
                    igv_flag = '0'
                run_command([
                    args.path + 'create_typing_out.py', '--intersect',
                    bed_intersect, '--closest', bed_closest, '--left_bed',
                    left_merged_bed, '--right_bed', right_merged_bed,
                    '--left_unpaired', bed_unpaired_left, '--right_unpaired',
                    bed_unpaired_right, '--seq', query_tmp, '--ref',
                    args.typingRef, '--temp', temp_folder, '--cds', args.cds,
                    '--trna', args.trna, '--rrna', args.rrna, '--min_range',
                    args.min_range, '--max_range', args.max_range, '--output',
                    current_dir + sample + '_' + query_name, '--igv', igv_flag,
                    '--chr_name', args.chr_name
                ],
                            shell=True)

            # remove temp folder if required
            remove_temp_directory(args.temp, temp_folder)
            remove_bams(args.bam, left_bam_sorted, right_bam_sorted)

    total_time = time.time() - start_time
    time_mins = float(total_time) / 60
    logging.info('ISMapper finished in ' + str(time_mins) + ' mins.')
	blast_out_cog = open("%s/cog_%s.blast" % (seq_record.id,seq_record.id))
	blast_lines_cog = blast_out_cog.readlines()
	blast_out_cog.close()
	
	j = 1
	best_blast_lines_cog = {}
	while j < len(blast_lines_cog):
		query_id = blast_lines_cog[j].split("\t")[0]
		best_blast_lines_cog[query_id] = blast_lines_cog[j].split("\t")[1].strip()
		# skip the remaining, lower-ranked hits for the same query
		while j < len(blast_lines_cog) and query_id == blast_lines_cog[j].split("\t")[0]:
			j += 1

	j=1
	for cogseq in cog_db_iterator:
		if cogseq.id in best_blast_lines_cog.values():
			SeqIO.write(cogseq, fasta_cog, "fasta")
	fasta_cog.close()
	fasta_cog_db.close()
	blastout =  'tmpbia/bbhcog_%s.blast' % seq_record.id
	
	if not os.path.exists("tempgenoma"):
		getoutput("cat  */orfs_*.faa > tempgenoma")
	
	print(getoutput("formatdb -i tempgenoma"))
	getoutput("blastall -p blastp -i '%s/cog_%s.faa' -d tempgenoma -e %s -a 1 -v 30 -b 30 -o %s" % (seq_record.id,seq_record.id,params["e"],blastout))

	
	
	
	# Parse the generated BLAST output file
	fileoutHandler = open(blastout)
Example #48
0
#! /usr/bin/env python

import sys
from Bio import SeqIO
name = sys.argv[1]
n = sys.argv[2]
if "fasta" not in name:
    sys.exit("bad name: expected a FASTA file name containing 'fasta'")
try:
    n = int(n)
except ValueError:
    sys.exit("bad number: the second argument must be an integer")
fcount = 0  # number of output files opened so far
scount = 0  # records written to the current output file
for record in SeqIO.parse(name, "fasta"):
    if fcount == 0 or scount == n:
        if fcount > 0:
            fh.close()
        scount = 0
        fcount += 1
        fh = open(name.replace(".fasta", "-%04d.fasta" % (fcount)), "w")
    scount += 1
    SeqIO.write(record, fh, "fasta")
fh.close()
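
# Example invocation (script and file names here are hypothetical):
#     python split_fasta.py reads.fasta 500
# would write reads-0001.fasta, reads-0002.fasta, ... with 500 records apiece.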
Example #49
0
# were specified).
if not args.match_start:
    SeqsNotFound = [seq for seq in args.SequenceName
                    if seq not in AllSeqNamesEncountered]
    if len(SeqsNotFound) != 0:
        print('The following sequences were not found in', args.FastaFile + ':',
              ' '.join(SeqsNotFound) + '\nQuitting.', file=sys.stderr)
        exit(1)

# Trim to the specified window and/or gap strip, if desired
if args.window is not None:
    LeftCoord, RightCoord = args.window
    for seq in SeqsWeWant:
        if RightCoord > len(seq.seq):
            print('A window', LeftCoord, '-', RightCoord, 'was specified but',
                  seq.id, 'is only', len(seq.seq), 'bases long. Quitting.',
                  file=sys.stderr)
            exit(1)
        seq.seq = seq.seq[LeftCoord - 1:RightCoord]
        if args.gap_strip:
            seq.seq = seq.seq.ungap("-").ungap("?")

# Skip blank sequences if desired
if args.skip_blanks:
    NewSeqsWeWant = []
    for seq in SeqsWeWant:
        if len(seq.seq.ungap("-").ungap("?")) != 0:
            NewSeqsWeWant.append(seq)
    SeqsWeWant = NewSeqsWeWant

SeqIO.write(SeqsWeWant, sys.stdout, "fasta")
Example #50
0
                            'Cond_B_CPM_media'
                        ])

df_final = pd.merge(df, df_media)

asc_A = pd.DataFrame(df_final.nlargest(5, 'Cond_A_CPM_media'))
asc_B = pd.DataFrame(df_final.nlargest(5, 'Cond_B_CPM_media'))
id_gene = asc_A['gene_id'].append(asc_B['gene_id'])
gene_cond = list(id_gene)

count = 1
for i in dd:
    for e in gene_cond:
        if i.id == e:
            arg = "gene_" + str(count)
            arquivo = SeqIO.write(i, arg, "fasta")  # returns the record count
            count += 1

arquivo_genes = [
    "gene_1", "gene_2", "gene_3", "gene_4", "gene_5", "gene_6", "gene_7",
    "gene_8", "gene_9", "gene_10"
]
for e in arquivo_genes:
    caminho = f"C:\\Users\\bia_g\\PycharmProjects\\pythonProject\\{e}"
    refArquivo = SeqIO.read(caminho, "fasta")
    # NcbiblastxCommandline expects a file path for query, not a SeqRecord,
    # so pass the FASTA path (caminho is a helper name added here)
    comand_line = NcbiblastxCommandline(cmd=blastx_path,
                                        query=caminho,
                                        subject=dm,
                                        out=meuOutput,
Example #51
0
 shasta = pipenv + "/" + shasta_os + " --input " + cible_shasta + " --Reads.minReadLength " + str(
     tailleread) + " --Align.maxTrim " + str(
         trim) + " --output " + tempo_out + "Shastarun"
 os.system(shasta)
 mv = "mv " + tempo_out + "Shastarun/Assembly.fasta " + tempo_out
 os.system(mv)
 rm = "rm" + " -r " + tempo_out + "Shastarun"
 os.system(rm)
 assembly = tempo_out + "Assembly.fasta"
 contigs = list(SeqIO.parse(assembly, "fasta"))  # parse once for both stats
 nbcontig = len(contigs)
 longueur_A = sum(len(x.seq) for x in contigs)
 if nbcontig == 2 and longueur_A > longueur_ref:
     num_contig = 1
     for rec in SeqIO.parse(assembly, "fasta"):
         nom = tempo_out + str(num_contig) + '.fasta'
         SeqIO.write(rec, nom, "fasta")
         num_contig = num_contig + 1
     os.system("""makeblastdb -in """ + un + """ -out """ + tempo_out +
               """target -dbtype 'nucl'""")
     os.system(
         """blastn -query """ + deux + """ -db """ + tempo_out +
         """target -out """ + tempo_out +
         """contig.fasta  -outfmt "10 sstrand sseqid"  -evalue 0.01""")
     tailletest = open(tempo_out + "contig.fasta", "r")
     tailletest2 = tailletest.readlines()
     tailletest.close()
     trim = trim + 1
     if len(tailletest2) == 0:
         ref_ok = True
         longueur_ref = longueur_A
         cp1 = "cp " + un + " " + tempo_out + "1f.fasta"
def direct_blast(infile, database, outfile=None, in_type="fasta", cores=1, patent_db=None, min_aa_size=0,
                 psi_blast=False, min_bitscore=50):
    """
    direct_blast is the worker function which takes an input filename and blasts it against a target database, then
    calculates identity against the query and against some other database

    Positional Arguments:

    :param infile:  str
        Input file path
    :param database:    str
        Input database path. Goes directly into command line blast, requires the database name as well.

    Keyword Arguments:

    :param outfile: str
        Output file path
    :param in_type: str
        Input file type. Takes anything SeqIO can take.
    :param cores: int
        Passed to blast command line as num_threads, and number of pool processes to spawn to analyze hits
    :param patent_db: str
        Screening database path. Goes directly into command line blast, requires the database name as well.
    :param min_aa_size: int
        Minimum size of the protein in amino acids
    :param psi_blast: bool
        Flag to use psi-blast instead of blastp
    :param min_bitscore: int
        The minimum required bitscore of a BLAST high scoring pair. HSPs with lower scores will be filtered

    """

    # Use a generic name for the outfile if it hasn't been explicitly set
    if outfile is None:
        outfile = infile + ".db.out"

    # Make sure the databases exist
    # Easier than catching it way down the line
    # protein BLAST indices carry .phr/.pin/.psq extensions
    if not (os.path.isfile(database + ".phr") or os.path.isfile(database + ".00.phr")):
        raise FileNotFoundError("BLAST database {} not located".format(database))
    if patent_db is not None and not (os.path.isfile(patent_db + ".phr") or os.path.isfile(patent_db + ".00.phr")):
        raise FileNotFoundError("BLAST database {} not located".format(patent_db))

    # Set the BLAST hsp minimums
    hsp_filter = lambda hsp: hsp.aln_span > 50 and hsp.bitscore > min_bitscore
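    # i.e. keep only HSPs that span more than 50 aligned columns and score
    # above the bitscore floor; shorter or weaker HSPs are dropped up front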

    # Open and read in the query file as SeqRecords
    with open(outfile, mode="w") as out_fh, open(infile, mode="rU") as in_fh:
        for query_sequence in SeqIO.parse(in_fh, format=in_type):

            # Count the DNA bases to do a lazy job determining if this is a protein or a DNA sequence
            dna = 0
            for base in ["A", "T", "G", "C", "N"]:
                dna += str(query_sequence.seq).upper().count(base)

            blast_out_file = tempfile.mkstemp(suffix=".blast.xml")

            # Decide which blast command line arguments to use for the query sequence
            # Also set sequence alphabet

            # If the sequence looks like protein
            if dna / len(query_sequence) < 0.95:
                query_sequence.seq.alphabet = generic_protein

            # If it's probably DNA
            else:
                # Translate the sequence if it looks like DNA and use the translated sequence for downstream
                try:
                    query_sequence.seq = query_sequence.seq.translate(cds=True)
                    query_sequence.seq.alphabet = generic_protein
                except TranslationError as trans_err:
                    print("Input Sequence {} Not a CDS: {}".format(infile, trans_err.args))
                    try:
                        query_sequence.seq = query_sequence.seq.translate()
                        query_sequence.seq.alphabet = generic_protein
                    except TranslationError:
                        print("Translation Error")
                        exit(1)

                # Write the translated protein to a file to use as the query
                blast_query_temp = tempfile.mkstemp(suffix=".fasta")
                with open(blast_query_temp[0], mode="w") as blast_temp_fh:
                    SeqIO.write(query_sequence, blast_temp_fh, format="fasta")
                infile = blast_query_temp[1]

            # Run psiblast if the psi flag is set otherwise blastp
            if psi_blast:
                blast_cmd = ["psiblast", "-db", database, "-query", infile, "-outfmt", "5", "-out",
                             blast_out_file[1],
                             "-num_threads", str(cores), "-max_target_seqs", str(5000)]
            else:
                blast_cmd = ["blastp", "-db", database, "-query", infile, "-outfmt", "5", "-out", blast_out_file[1],
                             "-num_threads", str(cores), "-task", "blastp", "-max_target_seqs", str(5000)]

            subprocess.call(blast_cmd)

            # Read in the blast output file as a QueryResult
            with open(blast_out_file[0], mode="rU") as blast_fh:
                try:
                    query = SearchIO.read(blast_fh, format='blast-xml')
                except ParseError:
                    print("BLAST Command Failed")
                    exit(1)

            # Preprocess the BLAST query result with the hsp filter object
            filter_query = query.hsp_filter(hsp_filter)
            print("{} BLAST results [{} filtered]".format(len(filter_query), len(query) - len(filter_query)))

            # Pass the control arguments to the hit processor and then multiprocess hits through mp.Pool
            heavy_hitter = HitProcess(database, query_sequence, min_aa=min_aa_size, patentdb=patent_db)
            blast_process_runner = multiprocessing.Pool(processes=cores).imap_unordered(heavy_hitter.process_hits,
                                                                                        (hit for hit in filter_query))

            print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format("Query ID",
                                                      "Hit ID",
                                                      "Hit Description",
                                                      "Hit Identity",
                                                      "Hit Similarity",
                                                      "Patent DB Identity",
                                                      "Hit Sequence"),
                  file=out_fh)

            # Iterate through the processing results and print them
            for hit_id, hit_seq, hit_ident, hit_simil, patent_ident in blast_process_runner:
                if hit_ident is not None:
                    print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(query_sequence.id,
                                                              hit_id,
                                                              hit_seq.description,
                                                              hit_ident,
                                                              hit_simil,
                                                              patent_ident,
                                                              str(hit_seq.seq)),
                          file=out_fh)

            os.remove(blast_out_file[1])
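
# A minimal usage sketch for direct_blast (paths and database names below are
# hypothetical, not taken from the original code):
#
#     direct_blast("queries.fasta", "blastdb/swissprot",
#                  outfile="queries.hits.tsv", cores=4,
#                  min_aa_size=100, min_bitscore=60)
#
# Each protein (or translatable CDS) record in queries.fasta is blasted against
# the database and a tab-separated hit table is written to the outfile.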
def write_unlabelled_seqs(seq_dict, outdir):
    # Write every remaining sequence to one FASTA file, popping each record so
    # the caller gets back the emptied dict (cf. write_clusters further down)
    with open('{}/silix/seqs.fasta'.format(outdir), 'w') as out:
        for seq in list(seq_dict.keys()):
            SeqIO.write(seq_dict.pop(seq), out, 'fasta')
    return seq_dict
						
			print("Low confidence viral predictions by VirFinder identified")
					
	return (HC_viral_predictions, LC_viral_predictions, prophage_predictions)


if __name__ == "__main__":
	parser = argparse.ArgumentParser(description="Write fasta files with predicted viral contigs sorted in categories and putative prophages")
	parser.add_argument("-a", "--assemb", dest="assemb", help="Metagenomic assembly fasta file", required=True)
	parser.add_argument("-f", "--vfout", dest="finder", help="Absolute or relative path to VirFinder output file",
						required=True)
	parser.add_argument("-s", "--vsdir", dest="sorter",
						help="Absolute or relative path to directory containing VirSorter output", required=True)
	parser.add_argument("-o", "--outdir", dest="outdir",
						help="Absolute or relative path of directory where output viral prediction files should be stored (default: cwd)",
						default=".")
	if len(sys.argv) == 1:
		parser.print_help()
	else:
		args = parser.parse_args()
		viral_predictions = virus_parser(assembly_file=args.assemb, VF_output=args.finder, VS_output=args.sorter)
		if sum([len(x) for x in viral_predictions]) > 0:
			if len(viral_predictions[0]) > 0:
				SeqIO.write(viral_predictions[0], os.path.join(args.outdir, "High_confidence_putative_viral_contigs.fna"), "fasta")
			if len(viral_predictions[1]) > 0:
				SeqIO.write(viral_predictions[1], os.path.join(args.outdir, "Low_confidence_putative_viral_contigs.fna"), "fasta")
			if len(viral_predictions[2]) > 0:
				SeqIO.write(viral_predictions[2], os.path.join(args.outdir, "Putative_prophages.fna"), "fasta")
		else:
			print("Overall, no putative viral contigs or prophages were detected in the analysed metagenomic assembly")
        help='Replace all invalid bases (not A,C,T,G,a,c,t,g,-) with "N"'
    )
parser = argparse.ArgumentParser()
add_arguments(parser)
args = parser.parse_args()



#_________________________________________________
# Insert a function here, which replaces all invalid characters with N's

def replace_bad_chars(alignment):
	# A minimal implementation per the help text above: any character that is
	# not A, C, T, G (either case) or '-' becomes 'N'. Assumes "from Bio.Seq
	# import Seq" is available from the truncated import section.
	valid = set('ACTGactg-')
	for record in alignment:
		record.seq = Seq(''.join(c if c in valid else 'N' for c in str(record.seq)))
	return alignment
#_________________________________________________



# !!!!!!!!!!!!NOTE:!!!!!!!!!!!!!! You can call the input variables in the following manner: args.name_of_variable,
# e.g. args.input, args.input_format, etc.

# 1. read the alignment, using the AlignIO.read() function
alignment = AlignIO.read(args.input, args.input_format)

# 2. apply the function you defined above, in case the fix_invalid_characters option is activated
if args.fix_invalid_characters:
	replace_bad_chars(alignment)

# 3. write the alignment to the defined output file (args.output) using the SeqIO.write() function
# (the output format is assumed to match the input format, since the truncated
# argument section above does not show a separate output-format option)
SeqIO.write(alignment, args.output, args.input_format)

print('\n\nNew alignment written to file %s\n\n' % args.output)
Example #56
0
    def handle(self, *args, **options):
        dbname = options['dbname']
        query = options['query']
        out_format = options['format']

        if options['start'] < 1:
            raise CommandError(f'start must be greater than 0, "{options["start"]}" passed')
        if options['end'] and options['end'] < 0:
            raise CommandError(f'end must be greater than 0, "{options["end"]}" passed')

        options['start'] = options['start'] - 1

        if out_format == "auto" and dbname:
            out_format = "fasta"
        else:
            out_format = "list"

        if query:
            try:
                query = json.loads(query)
            except JSONDecodeError:
                if dbname:
                    query = {"name": query}
                else:
                    query = {"accession": query}

        if dbname:
            query["biodatabase__name"] = dbname
            query_manager = Bioentry.objects
        else:
            query_manager = Biodatabase.objects
        
        if out_format == "list" and options['end'] is None:
            options['end'] = 10

        self.stderr.write(f"quering... {json.dumps(query)}")

        qs = query_manager.filter(**query)

        self.stderr.write(f"retreived sequences: {qs.count()}")

        seqstore = SeqStore.instance()
        seqtype = "genome"
        if dbname and dbname.endswith(Biodatabase.PROT_POSTFIX):
            dbname = dbname[:-len(Biodatabase.PROT_POSTFIX)]
            seqtype = "proteome"
        seq_qs = seqstore.qs(dbname, seqtype)
            
        qs2 = qs
        if options['start'] and options['end']:
            qs2 = qs2[options['start']:options['end']]
        elif options['start']:
            qs2 = qs2[options['start']:]
        elif options['end']:
            qs2 = qs2[:options['end']]

        for be in self.tqdm(qs2,  total=qs.count()):
            if out_format == "fasta":
                r = be.to_seq_record(seq_qs)
                bpio.write(r, self.stdout, out_format)
            else:
                self.stdout.write(be.name)

        if not dbname:
            if not options["end"]:
                options["end"] = qs.count()
            self.stderr.write(f'exported from {options["start"]} to {options["end"]} of {qs.count()}')

        self.stderr.write("finished!")
Example #57
0
    return args


if __name__ == '__main__':
    args = parse_args()

    file, seq_format, fh = args.infile, args.format, None
    if file:
        if not seq_format:
            found = re.search(r'(?i)(fasta|fa|fastq|fq)(\.gz)?$', file)
            if not found:
                print("invalid file name suffix.\nfile name should look like this: infile.[fasta|fa|fastq|fq][.gz]",
                      file=sys.stderr)
                sys.exit(1)
            seq_format, is_gz = found.groups()
            seq_format = seq_format.lower()  # the regex match is case-insensitive
            if seq_format == 'fa':
                seq_format = 'fasta'
            if seq_format == 'fq':
                seq_format = 'fastq'

        fh = gzip.open(file, 'rt') if file.endswith('.gz') else open(file, 'r')
    else:
        fh = sys.stdin
        seq_format = args.format

    for seq in SeqIO.parse(fh, seq_format):
        SeqIO.write([SeqRecord(seq.seq.translate(table=args.table), id=seq.id, description='')], sys.stdout, 'fasta')

    fh.close()
def write_clusters(eggnog_dict, seq_dict, outdir):
    # Write one FASTA per NOG cluster, popping each sequence so the leftovers
    # can be written out afterwards (cf. write_unlabelled_seqs above)
    for nog in eggnog_dict.keys():
        with open("{}/{}.fasta".format(outdir, nog), 'w') as out:
            for seq in eggnog_dict[nog]:
                SeqIO.write(seq_dict.pop(seq), out, 'fasta')
    return seq_dict
Example #59
0
        df = df.append({
            "id": name,
            "seq": sequence,
            "length": length
        },
                       ignore_index=True)

#sort table by sequence length (sort_values is not in-place, so assign the
#result; reset the index so iterrows below sees rank order)
df = df.sort_values(by=['length'], ascending=False).reset_index(drop=True)

# Save sequences
remaining = []
pooled = []
for index, row in df.iterrows():
    # each sequence above threshold into one file
    if row['length'] >= length_threshold and index + 1 <= max_sequences:
        print("write " + out_base + "." + str(index + 1) + ".fa")
        out = (SeqRecord(Seq(row['seq'], generic_dna), id=row['id']))
        SeqIO.write(out, out_base + "." + str(index + 1) + ".fa", "fasta")
    # contigs to retain
    elif row['length'] >= min_length_to_retain_contig:
        pooled.append(SeqRecord(Seq(row['seq'], generic_dna), id=row['id']))
    # remaining sequences
    else:
        remaining.append(SeqRecord(Seq(row['seq'], generic_dna), id=row['id']))

print("write " + out_base + ".pooled.fa")
SeqIO.write(pooled, out_base + ".pooled.fa", "fasta")
print("write " + out_base + ".remaining.fa")
SeqIO.write(remaining, out_base + ".remaining.fa", "fasta")
Example #60
0
#!/usr/bin/env python3

from Bio import SeqIO
import os

# IO
output_dir = 'output/fasta_chunks'
trinity_fasta = 'data/Trinity.fasta'
os.makedirs(output_dir, exist_ok=True)  # ensure the chunk directory exists

# build an index of the fasta file
record_index = SeqIO.index(trinity_fasta, 'fasta')
record_keys = list(record_index.keys())
number_of_records = len(record_index)

# write batch_size records to fasta file
batch_size = 2000
i = 0
for start in range(0, number_of_records, batch_size):
    i += 1
    end = min(number_of_records, start + batch_size)
    file_name = ('trinity_chunk%(num)03i.fasta' % {'num': i})
    file_path = os.path.join(output_dir, file_name)
    keys_to_write = record_keys[start:end]
    records_to_write = (record_index[x] for x in keys_to_write)
    SeqIO.write(sequences=records_to_write, handle=file_path, format='fasta')
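
# Note: SeqIO.write consumes the records_to_write generator lazily, so only one
# batch of records is pulled from the index into memory at a time.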