def translate_trsk_genes(gtf_file, fas_file, out_seq_fname):
    """
    Translate the coding sequence of each annotated TSkim gene to protein
    and write the proteins to a FASTA file.

    Genes whose first transcript has no CDS exons are dropped before
    translation. Exon coordinates are sliced as ``seq[start-1:end]``, i.e.
    they are treated as 1-based, end-inclusive. Minus-strand genes are
    reverse-complemented before translation. Progress and summary counts are
    written to stdout.

    @args gtf_file: genome annotation file
    @type gtf_file: str
    @args fas_file: genome sequence file
    @type fas_file: str
    @args out_seq_fname: output file in fasta format
    @type out_seq_fname: str
    """

    # guard against the same file being passed for both arguments
    if filecmp.cmp(gtf_file, fas_file):
        exit("Do the two files are exactly same? Please check that!")

    ## reading the TSkim file to get the features
    sys.stdout.write('reading genome features from %s\n' % gtf_file)
    anno_db = GFFParser.Parse(gtf_file)
    total_genes = len(anno_db)

    ## genome sequence file reading
    sys.stdout.write('reading genome sequence from %s\n' % fas_file)
    seqlab.chrom_name_consistency(fas_file, anno_db)

    # drop genes with an empty CDS; TSkim annotation expects only a single
    # transcript from a region, hence index 0
    empty_cds_idx = [idp for idp, feat in enumerate(anno_db)
                     if not feat['cds_exons'][0].any()]
    anno_db = np.delete(anno_db, empty_cds_idx)
    genes_with_cds = len(anno_db)

    fasFH = helper.open_file(fas_file)
    try:
        # both handles are released even when parsing or translation fails
        with open(out_seq_fname, "w") as out_seq_fh:
            for rec in SeqIO.parse(fasFH, "fasta"):
                for feature in anno_db:
                    if rec.id != feature['chr']:
                        continue

                    ## iterate over cds_exons
                    cds_seq = ''
                    for ex in feature['cds_exons'][0]:  ## single transcript by TSkim
                        cds_seq += rec.seq[ex[0]-1:ex[1]]

                    if feature['strand'] == '-':
                        cds_seq = cds_seq.reverse_complement()

                    ## fasta output
                    if cds_seq:
                        prt_seq = SeqRecord(cds_seq.translate(), id=feature['name'], description='protein sequence')
                        out_seq_fh.write(prt_seq.format("fasta"))

                # FIXME need an efficient way to translate multiple gene
                # iterate over chromosome
    finally:
        fasFH.close()

    sys.stdout.write('total genes fetched: %d\n' % total_genes)
    sys.stdout.write('total genes translated: %d\n' % genes_with_cds)
    sys.stdout.write('protein sequence stored at %s\n' % out_seq_fname)
Beispiel #2
0
def fastq_rev_transform(input_file, output_name):
    """Write the reverse complement of every FASTQ read as a FASTA record.

    Reads are parsed from ``input_file``; each output record keeps the read's
    id and name and gets the description "reverse complement".
    """
    with open(input_file) as fastq_in, open(output_name, 'w') as fasta_out:
        for read in SeqIO.parse(fastq_in, 'fastq'):
            flipped = SeqRecord(
                seq=read.seq.reverse_complement(),
                id=read.id,
                name=read.name,
                description="reverse complement",
            )
            fasta_out.write(flipped.format('fasta'))
Beispiel #3
0
 def _translate(self, seq_records, trans_table=1, force_trans_table=False):
     '''Translate given sequences using trans_table. If some sequence feature 
     contains a translation table number, use it instead. If force_trans_table is True, 
     always use trans_table.'''
     trans_seqs =  list()
     for record in seq_records:
         #set the alphabet to DNA
         try:
             record.seq.alphabet = IUPAC.ambiguous_dna
         except Exception,e:
             #provided sequences SHOULD be DNA ones
             raise ValueError('AlignmentUtils.translate: unable to set alphabet of the sequence:\n%s\n%s' \
                              % (record.format('fasta'), e.message))        
         #determine translation table
         translation_table = -1
         if force_trans_table:
             #force a translation table
             translation_table = trans_table
         else:
             #see if a translation table is defined in qualifiers
             for feature in record.features:
                 try:
                     translation_table = int(feature.qualifiers['transl_table'][0])
                     break
                 except: pass
             if translation_table < 0: translation_table = trans_table
         #do a translation
         trans_seq = record.seq.translate(table=translation_table, stop_symbol="X")
         trans_seq_rec = SeqRecord(trans_seq, id=record.id)
         trans_seq_rec.name = record.name
         trans_seq_rec.description = record.description
         trans_seqs.append(trans_seq_rec)
Beispiel #4
0
def getdropboxsequence(request,name):
	"""Download ``name`` from Dropbox and return it as a SeqRecord.

	The Dropbox client is created from ``request.session['access_token']``.
	Files ending in '.gbk' or '.gb' are parsed as GenBank; files ending in
	'.seq' are treated as raw sequence text with all whitespace removed, and
	the record id is set to the file name without its extension.

	Download and parse errors are printed to stderr and re-raised.
	Raises ValueError for any other file extension.
	"""
	fileName,fileExtension=os.path.splitext(name)
	try:
		client=dropbox.client.DropboxClient(request.session['access_token'])
		with client.get_file(name) as f:
			s=f.read()
	except Exception as e:
		print(str(e),file=sys.stderr)
		raise

	if fileExtension in ('.gbk','.gb'):
		try:
			return SeqIO.read(io.StringIO(s.decode("utf-8")), "genbank")
		except Exception as e:
			print(str(e),file=sys.stderr)
			raise

	if fileExtension=='.seq':
		simple_seq=Seq("".join(s.decode("utf-8").split()))
		seq=SeqRecord(simple_seq)
		seq.id=fileName
		return seq

	# previously this path fell through to `return seq` with `seq` unbound
	raise ValueError("unsupported sequence file extension %r for %r" % (fileExtension, name))
 def t_write_from_recs(self):
     """Check the GFF3 text that GFF.write produces for a hand-built SeqRecord.

     The record has one gene feature with two exon sub-features; the header
     lines, the gene line and the first exon line are compared field by field.
     """
     record = SeqRecord(Seq("GATCGATCGATCGATCGATC"), "ID1")
     gene_quals = {"source": "prediction", "score": 10.0, "other": ["Some", "annotations"],
                   "ID": "gene1"}
     exon_quals = {"source": "prediction"}
     gene = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                       qualifiers=gene_quals)
     # both exons share the same qualifier dict, as in a prediction run
     gene.sub_features = [
         SeqFeature(FeatureLocation(first, last), type="exon", strand=1,
                    qualifiers=exon_quals)
         for first, last in ((0, 5), (15, 20))
     ]
     record.features = [gene]

     buffer = StringIO.StringIO()
     GFF.write([record], buffer)
     lines = buffer.getvalue().split("\n")

     expected_gene = ['ID1', 'prediction', 'gene', '1',
                      '20', '10.0', '+', '.',
                      'other=Some,annotations;ID=gene1']
     expected_exon = ['ID1', 'prediction', 'exon', '1', '5',
                      '.', '+', '.', 'Parent=gene1']
     assert lines[0] == "##gff-version 3"
     assert lines[1] == "##sequence-region ID1 1 20"
     assert lines[2].split("\t") == expected_gene
     assert lines[3].split("\t") == expected_exon
Beispiel #6
0
def _gaps_between_cds(cds_list, strand, intergene_length):
    """Return a FeatureLocation for each gap of at least ``intergene_length``
    between consecutive entries of ``cds_list``."""
    gaps = []
    for previous, current in zip(cds_list, cds_list[1:]):
        # Compare current start position to previous end position
        last_end = previous[1]
        this_start = current[0]
        if this_start - last_end >= intergene_length:
            gaps.append(SeqFeature.FeatureLocation(last_end, this_start, strand=strand))
    return gaps


def get_interregions(genbank_path,intergene_length=1):
    """Locate intergenic regions in the first record of a GenBank file.

    CDS features are collected per strand in file order. A CDS with no
    strand is reported on stderr and treated as plus strand. For each
    strand, every gap between one CDS end and the next CDS start that is at
    least ``intergene_length`` long becomes a FeatureLocation.

    Returns the list of FeatureLocation objects, plus-strand gaps first,
    then minus-strand gaps.
    """
    # only the first record is used; the handle is closed as soon as it is read
    with open(genbank_path) as handle:
        seq_record = next(SeqIO.parse(handle, "genbank"))

    cds_list_plus = []
    cds_list_minus = []
    # Loop over the genome file, get the CDS features on each of the strands
    for feature in seq_record.features:
        if feature.type != 'CDS':
            continue
        mystart = feature.location.start.position
        myend = feature.location.end.position
        if feature.strand == -1:
            cds_list_minus.append((mystart,myend,-1))
        elif feature.strand == 1:
            cds_list_plus.append((mystart,myend,1))
        else:
            sys.stderr.write("No strand indicated %d-%d. Assuming +\n" %
                              (mystart, myend))
            cds_list_plus.append((mystart,myend,1))

    intergenic_features = _gaps_between_cds(cds_list_plus, +1, intergene_length)
    intergenic_features.extend(_gaps_between_cds(cds_list_minus, -1, intergene_length))
    return intergenic_features
Beispiel #7
0
def gene_to_fasta(gene_str,gene_id,gene_desc):
    '''Render a raw gene sequence string as one FASTA-formatted record.

    ``gene_id`` becomes the record id and ``gene_desc`` its description.
    '''
    record = SeqRecord(Seq(gene_str), id=gene_id, description=gene_desc)
    return record.format('fasta')
    def __init__(self, seq, id="<unknown id>", name="<unknown name>", description="<unknown description>",
                 dbxrefs=None, features=None, annotations=None, letter_annotations=None):
        """Build an exon record from a sequence and a '|'-delimited id.

        The id is split on '|' into, in order: GeneID, TranscriptID,
        GeneName, ExonRank, a constitutive-exon flag (empty means False),
        FPUTRend, TPUTRstart, exonStart and exonEnd. The four numeric fields
        are stored as int, or as '' when missing or not an integer.

        The stored sequence is cut using those coordinates: from FPUTRend to
        exonEnd when FPUTRend is set, otherwise from the exon start up to
        TPUTRstart when that is set, otherwise the whole input sequence.

        NOTE(review): dbxrefs, features, annotations and letter_annotations
        are accepted but always passed on as None — presumably because they
        would no longer match the trimmed sequence; confirm.
        """
        self.id = id
        self.tempseq = seq
        temp = self.id.split('|')
        self.GeneID = temp[0]
        self.TranscriptID = temp[1]
        self.GeneName = temp[2]
        self.ExonRank = temp[3]
        self.ConstExon = temp[4] != ''

        def _int_or_blank(index):
            # a missing field (IndexError) or a non-numeric one (ValueError)
            # is recorded as ''
            try:
                return int(temp[index])
            except (ValueError, IndexError):
                return ''

        self.FPUTRend = _int_or_blank(5)
        self.TPUTRstart = _int_or_blank(6)
        self.exonStart = _int_or_blank(7)
        self.exonEnd = _int_or_blank(8)

        if self.FPUTRend != '':
            sequence =  str(self.tempseq)[(self.FPUTRend-self.exonStart):(self.exonEnd-self.exonStart)]
        elif self.TPUTRstart != '':
            sequence = str(self.tempseq)[0:(self.TPUTRstart-self.exonStart)]
        else:
            sequence = str(self.tempseq)

        SeqRecord.__init__(self, Seq(sequence, IUPAC.unambiguous_dna), id, name, description, dbxrefs=None,
                           features=None, annotations=None, letter_annotations=None)
Beispiel #9
0
 def _record_formatter(self, temp):
     """Wrap ``temp`` in a SeqRecord and copy id, name and description onto it.

     NOTE(review): the metadata is read from ``sequence``, which is neither a
     parameter nor a local of this method — it must come from an outer scope;
     confirm where it is defined.
     """
     wrapped = SeqRecord(temp)
     for field in ("id", "name", "description"):
         setattr(wrapped, field, getattr(sequence, field))
     return wrapped
Beispiel #10
0
def IgIterator(handle, alphabet = single_letter_alphabet):
    """Iterate over IntelliGenetics records (as SeqRecord objects).

    handle - input file
    alphabet - optional alphabet

    The optional free format file header lines (which start with two
    semi-colons) are ignored.

    The free format commentary lines at the start of each record (which
    start with a semi-colon) are recorded as a single string with embedded
    new line characters in the SeqRecord's annotations dictionary under the
    key 'comment'.

    The first line after a record's comment lines is used as both the id and
    the name of the SeqRecord. Sequence lines have trailing whitespace and
    spaces removed, and a single trailing "1" terminator is dropped.

    Raises ValueError if a record does not start with ';', or if a digit one
    remains inside the sequence after the terminator has been removed.
    """
    #Skip any file header text before the first record (;; lines)
    while True:
        line = handle.readline()
        if not line : break #Premature end of file, or just empty?
        if not line.startswith(";;") : break

    #Here 'line' is either empty (end of file) or the first non-header line
    while line:
        #Now iterate over the records
        if line[0]!=";":
            raise ValueError( \
                  "Records should start with ';' and not:\n%s" % repr(line))

        #Try and agree with SeqRecord convention from the GenBank parser,
        #(and followed in the SwissProt parser) which stores the comments
        #as a long string with newlines under annotations key 'comment'.

        #Note some examples use "; ..." and others ";..."
        comment_lines = []
        while line.startswith(";"):
            #TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
            comment_lines.append(line[1:].strip())
            line = handle.readline()
        #The first line that is not a comment is taken as the record title
        title = line.rstrip()

        #Collect sequence lines until the next record (';') or end of file
        seq_lines = []
        while True:
            line = handle.readline()
            if not line : break
            if line[0] == ";": break
            #Remove trailing whitespace, and any internal spaces
            seq_lines.append(line.rstrip().replace(" ",""))
        seq_str = "".join(seq_lines)
        if seq_str.endswith("1"):
            #Remove the optional terminator (digit one)
            seq_str = seq_str[:-1]
        if "1" in seq_str:
            raise ValueError("Potential terminator digit one found within sequence.")

        #Return the record and then continue...
        record= SeqRecord(Seq(seq_str, alphabet),
                          id = title, name = title)
        record.annotations['comment'] = "\n".join(comment_lines)
        yield record

    #We should be at the end of the file now
    assert not line
Beispiel #11
0
def PhdIterator(handle):
    """Yield a SeqRecord for each record in a PHD file.

    Parsing is delegated to Bio.Sequencing.Phd; this function only repackages
    each parsed record as a SeqRecord.
    """
    for phd in Phd.parse(handle):
        # The PHD "filename" can contain spaces, e.g. 'HWI-EAS94_4_1_1_602_99 1'
        # (unit test example file phd_solexa). Using it as the identifier would
        # cause problems (e.g. output for FASTQ format), so only the first
        # whitespace-delimited token is used; the full text goes in the
        # description.
        full_name = phd.file_name
        short_name = full_name.split(None, 1)[0]
        record = SeqRecord(phd.seq,
                           id=short_name, name=short_name,
                           description=full_name)
        # The comments dictionary is re-used directly as the annotations
        record.annotations = phd.comments
        # Qualities and peak locations become per-letter annotations
        sites = phd.sites
        record.letter_annotations["phred_quality"] = [int(site[1]) for site in sites]
        try:
            peaks = [int(site[2]) for site in sites]
        except IndexError:
            # peak locations are not always there according to
            # David Gordon (the Consed author)
            pass
        else:
            record.letter_annotations["peak_location"] = peaks
        yield record
Beispiel #12
0
def sequence2SeqRecord(seq_obj):
	"""-> builds a SeqRecord from a Sequence object

	The sequence and id come from get_SEQUENCE() and get_ID(); the values
	from get_QUALITY() are stored as the 'phred_quality' letter annotation.
	"""
	record = SeqRecord(seq_obj.get_SEQUENCE(), id=seq_obj.get_ID())
	qualities = seq_obj.get_QUALITY()
	record.letter_annotations['phred_quality'] = qualities
	return record
Beispiel #13
0
def test():
    """Self-check of QualityChecker.trim_read on a hand-made read.

    A 54-base read with known quality scores is trimmed with window size 10,
    threshold 21 and minimum length 5; the result must be the 15-base stretch
    whose scores are all 30 or higher.
    """
    # Object #
    checker = QualityChecker(None, None)
    checker.window_size = 10
    checker.threshold = 21
    checker.min_length = 5
    checker.discard_N = True
    # Dummy test sequence #
    scores = "10 11 13 22 23 24 10 10 9 8 7 9 8 9 5 2 5 8 9 8 9 30 33 30 31 32 33 31 33 33 31 33 32 33 32 32 33 32 2 3 2 3 2 1 3 2 1 23 23 23 10 10 9 9"
    seq    = "A  T  C  G  T  T  G  A  C G G A G T G T A A C T C G  A  T  G  A  C  T  T  G  T  C  A  A  C  T  G  G  T A G G G T C A A C  T  G  A  T  C A"
    # a real list is needed: len() does not work on a Python 3 map object
    scores = [int(score) for score in scores.split()]
    seq    = ''.join(seq.split())
    assert len(seq) == len(scores)
    # Make into biopython object #
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    read = SeqRecord(Seq(seq), id="test", name="test", description="test")
    read.letter_annotations["phred_quality"] = scores
    # Trim it #
    trimmed = checker.trim_read(read)
    # Check result #
    # scores of the expected bases: 30 33 30 31 32 33 31 33 33 31 33 32 33 32 32
    correct = "G  A  T  G  A  C  T  T  G  T  C  A  A  C  T "
    correct = ''.join(correct.split())
    assert str(trimmed.seq) == correct
Beispiel #14
0
    def test_translate(self):
        """SeqRecord.translate drops metadata by default and copies it on request."""
        source = SeqRecord(
            Seq("ATGGTGTAA"),
            id="TestID",
            name="TestName",
            description="TestDescription",
            dbxrefs=["TestDbxrefs"],
            features=[SeqFeature(FeatureLocation(0, 3), type="Site")],
            annotations={'organism': 'bombyx'},
            letter_annotations={'test': 'abcdefghi'},
        )

        # Plain call: the stop codon is kept and all metadata falls back to defaults
        plain = source.translate()
        self.assertEqual(plain.seq, "MV*")
        self.assertEqual(plain.id, "<unknown id>")
        self.assertEqual(plain.name, "<unknown name>")
        self.assertEqual(plain.description, "<unknown description>")
        self.assertFalse(plain.dbxrefs)
        self.assertFalse(plain.features)
        self.assertFalse(plain.annotations)
        self.assertFalse(plain.letter_annotations)

        # CDS mode with every copy flag set: stop codon removed, metadata
        # carried over except features and letter annotations
        carried = source.translate(cds=True, id=True, name=True, description=True,
                                   dbxrefs=True, annotations=True)
        self.assertEqual(carried.seq, "MV")
        self.assertEqual(carried.id, "TestID")
        self.assertEqual(carried.name, "TestName")
        self.assertEqual(carried.description, "TestDescription")
        self.assertEqual(carried.dbxrefs, ["TestDbxrefs"])
        self.assertFalse(carried.features)
        self.assertEqual(carried.annotations, {'organism': 'bombyx'})
        self.assertFalse(carried.letter_annotations)
Beispiel #15
0
 def modify_seq(self, fasta_folder, mod_table_file, output_folder):
     """Apply the modifications listed in a table to reference FASTA files.

     Each entry returned by ``self._import_data(mod_table_file)`` names a
     reference (``ref_id``), a target (``target_id``) and a list of changes
     (``datas``). When ``<ref_id>.fa`` exists in ``fasta_folder`` its sequence
     lines are joined, the changes are applied through SeqModifier, and the
     result is written to ``<output_folder>/<target_id>.fa``. Entries whose
     reference file is missing are skipped.

     A change with ref_nt "-" is an insertion, one with tar_nt "-" is a
     removal of len(ref_nt) positions, anything else is a replacement.
     """
     datas = self._import_data(mod_table_file)
     for data in datas:
         fasta_name = data["ref_id"] + ".fa"
         if fasta_name not in os.listdir(fasta_folder):
             continue
         filename = os.path.join(fasta_folder, fasta_name)
         with open(filename, "r") as fasta:
             stripped = (line.strip() for line in fasta)
             # blank lines are skipped; indexing line[0] on them used to fail
             seq = "".join(line for line in stripped
                           if line and not line.startswith(">"))
         seq_modifier = SeqModifier(seq)
         for change in data["datas"]:
             if change["ref_nt"] == "-":
                 seq_modifier.insert(
                              int(change["position"]), change["tar_nt"])
             elif change["tar_nt"] == "-":
                 seq_modifier.remove(int(change["position"]),
                                     len(change["ref_nt"]))
             else:
                 seq_modifier.replace(
                              int(change["position"]), change["tar_nt"])
         record = SeqRecord(Seq(seq_modifier.seq()))
         record.id = data["target_id"]
         record.description = ""
         SeqIO.write(record, os.path.join(
                     output_folder, record.id + ".fa"), "fasta")
def fetchGene(GeneName):
    """Fetch a S. cerevisiae gene from YeastMine and return it as a SeqRecord.

    The 'Gene_GenomicDNA' template is queried with a LOOKUP on ``GeneName``.
    The service seems to return several similar genes, so only the first row
    is used. The record id is the row's secondaryIdentifier, its name is
    ``GeneName`` and its description is the row's description.

    Raises ValueError when the query returns no rows.
    """
    service = Service("http://yeastmine.yeastgenome.org/yeastmine/service")
    template = service.get_template('Gene_GenomicDNA')

    rows = template.rows(
        E = {"op": "LOOKUP", "value": GeneName, "extra_value": "S. cerevisiae"}
    )

    # only the first hit is wanted; an empty result used to surface as an
    # unbound-variable error further down
    first_row = next(iter(rows), None)
    if first_row is None:
        raise ValueError("no YeastMine record found for gene %r" % (GeneName,))

    descr = first_row["description"]
    GeneSeq = Seq(first_row["sequence.residues"])
    GeneSysName = first_row["secondaryIdentifier"]

    #let's create a record for the oldGene
    GeneRecord = SeqRecord(GeneSeq, id=GeneSysName)

    #now let's add some more information to make it useful
    GeneRecord.name=GeneName
    # NOTE(review): features is set to the systematic name string rather than
    # a list of SeqFeature objects; kept because callers may read it back
    GeneRecord.features=GeneSysName
    GeneRecord.description=descr

    return GeneRecord
    def test_genbank_date_list(self):
        """Check if date lists are handled correctly"""

        sequence_object = Seq("ATGC", generic_dna)

        def round_trip(dates):
            # write a record carrying the given date annotation as GenBank
            # and read it straight back
            record = SeqRecord(sequence_object,
                               id='123456789',
                               name='UnitTest',
                               description='Test case for date parsing')
            record.annotations["date"] = dates
            handle = StringIO()
            SeqIO.write(record, handle, 'genbank')
            handle.seek(0)
            return SeqIO.read(handle, "gb")

        # a one-element list is written as that date
        single = round_trip(["24-DEC-2015"])
        self.assertEqual(single.annotations["date"], "24-DEC-2015")

        # a list with two dates is expected to come back as 01-JAN-1980
        double = round_trip(["24-DEC-2015", "25-JAN-2016"])
        self.assertEqual(double.annotations["date"], "01-JAN-1980")
Beispiel #18
0
Datei: io.py Projekt: Chris7/edge
    def to_gff(self, filename):
        """
        Write every fragment of the genome, with its annotations, to
        ``filename`` in GFF format; the sequences are included as FASTA.
        """
        records = []

        for fragment in self.__genome.fragments.all():
            indexed = fragment.indexed_fragment()
            record = SeqRecord(Seq(indexed.sequence), "%s" % (indexed.name,))
            record.features = [
                SeqFeature(
                    # FeatureLocation first bp is AfterPosition, so -1
                    FeatureLocation(annotation.base_first-1, annotation.base_last),
                    type=annotation.feature.type,
                    strand=1,
                    qualifiers={'name': annotation.feature.name},
                )
                for annotation in indexed.annotations()
            ]
            records.append(record)

        with open(filename, "w") as out_handle:
            GFF.write(records, out_handle, include_fasta=True)
def translate_and_write_DNA_frames(seq, directionsToConsider="forward", tranlationTable=1):
    """
    Translate ``seq`` in three reading frames per requested strand and write
    every translation that is long enough as a FASTA record.

    directionToConsider:
        forward - normal DNA direction
        reverse - reverse complement
        both - both of the above

    Frames are numbered from 1 in the order they are produced (forward
    frames first). The frame number is appended to the record id and
    description. The record name and description, the minimum peptide length
    and the output handle come from the enclosing scope (``seq_record``,
    ``minimum_peptide_length``, ``output_handle``).
    """
    strands = []
    if directionsToConsider in ("forward","both"):
        strands.append(seq)
    if directionsToConsider in ("reverse","both"):
        # consider reverse complement DNA sequence as well
        strands.append(seq.reverse_complement())

    # start translation from 1, 2 and 3 nucleotide on every strand
    allPossibilities = [
        str(strand[frame:].translate(tranlationTable))
        for strand in strands
        for frame in range(3)
    ]

    for frame_number, translated in enumerate(allPossibilities, 1):
        protein = Seq(translated, alphabet=ProteinAlphabet)
        if len(protein) < minimum_peptide_length:
            continue

        protein_record = SeqRecord(protein, seq_record.name)
        protein_record.id = protein_record.id + "." + str(frame_number)
        protein_record.description = seq_record.description + "; frame " + str(frame_number)

        SeqIO.write(protein_record, output_handle, "fasta")

    return
Beispiel #20
0
    def test_trimming(self):
        'The sequences are trimmed according to the recommendations.'
        bases = 'gggtctcatcatcaggg'.upper()
        seq = SeqRecord(Seq(bases), annotations={TRIMMING_RECOMMENDATIONS: {}})

        trim_rec = seq.annotations[TRIMMING_RECOMMENDATIONS]
        seq_trimmer = TrimOrMask()

        def trim_with(**recommendations):
            # update the recommendations, re-attach them to the record and
            # run the trimmer on it
            trim_rec.update(recommendations)
            seq.annotations[TRIMMING_RECOMMENDATIONS] = trim_rec
            return seq_trimmer([seq])

        trimmed = trim_with(vector=[(0, 3), (8, 13)])
        assert str(trimmed[0].seq) == 'CTCA'

        trimmed = trim_with(vector=[(0, 0), (8, 13)])
        assert str(trimmed[0].seq) == 'GGTCTCA'

        # vector and quality segments together cover the whole read
        trimmed = trim_with(vector=[(0, 1), (8, 12)], quality=[(1, 8), (13, 17)])
        assert not trimmed

        trimmed = trim_with(vector=[(0, 0), (8, 13)], quality=[])
        assert str(trimmed[0].seq) == 'GGTCTCA'
        assert TRIMMING_RECOMMENDATIONS not in trimmed[0].annotations
def getPrimerRegions(primers, options, alignment):
    """Build consensus sequences for runs of consecutive primer starts.

    The start position of each primer is taken from ``primer[0][0]``. Each
    time two neighbouring starts are not consecutive, the region from the
    current slice start to the earlier start plus ``options.maxprimer`` is
    recorded, and a new slice begins at the later start. For every recorded
    region a consensus of the matching alignment columns is computed with
    ``options.degeneracy`` and returned as a SeqRecord whose id is
    "<begin>-<end>".

    NOTE(review): the run that is still open when the starts are exhausted is
    never recorded, so the last region (or the only one) is not returned —
    confirm whether that is intended.
    """
    # Pull out the start of every (start,end) tuple from primers
    starts = [primer[0][0] for primer in primers]
    primerLength = options.maxprimer

    # Walk neighbouring starts; a jump closes the current slice
    coordinates = []
    sliceStart = 0
    for start, nextStart in zip(starts, starts[1:]):
        if nextStart != start + 1:
            coordinates.append((sliceStart, start + primerLength))
            sliceStart = nextStart

    # Create the slices
    consensusSequences = []
    for begin, end in coordinates:
        slicedAlignment = alignment[:, begin:end]
        record = SeqRecord(Seq(calculateConsensusSequence(slicedAlignment, options.degeneracy)))
        record.id = str(begin) + "-" + str(end)
        consensusSequences.append(record)

    return consensusSequences
Beispiel #22
0
 def bioseq_to_bwa_seq(self):
   """Check bwa.bioseq_to_bwa_seq for every FASTQ flavour, with and without unknown bases.

   For each generated read the name, the sequence, the reverse-complement
   sequence and the Sanger-encoded quality string of the filled bwa sequence
   are compared with values derived from the SeqRecord.
   """
   nseq = 100
   for fmt in "fastq-sanger", "fastq-illumina", "fastq-solexa":
     is_solexa = fmt == "fastq-solexa"
     qkey = "phred_quality"
     if is_solexa:
       qkey = "solexa_quality"
     for unknown in False, True:
       reads = u.random_reads_generator(nseq, fmt=fmt, unknown=unknown)
       for i, (seq, q) in enumerate(reads):
         name = "foo-%d" % i
         bioseq = SeqRecord(Seq("".join(seq), single_letter_alphabet),
                            id=name, name=name, description=name)
         bioseq.letter_annotations[qkey] = q
         n = len(bioseq)
         m = len(bioseq.name)
         bwseq = bwa.alloc_seq(1, n, m)[0]
         bwa.bioseq_to_bwa_seq(bioseq, bwseq, n, m, fmt)
         self.assertEqual(bioseq.name, bwseq.get_name())
         self.assertEqual(bioseq.seq.data[::-1], bwseq.get_seq())
         self.assertEqual(reverse_complement(bioseq.seq.data),
                          bwseq.get_rseq())
         # check that quality has been converted to sanger
         exp_q = q
         if is_solexa:
           exp_q = [int(round(x+10*math.log10(1+10**(-x/10.)))) for x in q]
         exp_qstr = "".join(chr(x+sg.Q_OFFSET["fastq-sanger"]) for x in exp_q)
         self.assertEqual(bwseq.get_qual(), exp_qstr)
Beispiel #23
0
def main(fastq_name):

	MAX_Q = 100
	logQs = [10 ** (float(-Q) / 10) for Q in range(MAX_Q + 1)]
	for record in SeqIO.parse(fastq_name, "fastq"):
	
		#print _get_phred_quality(record)
			
	#for header, seq in fasta_iter(fasta_name):

		seq = list(record.seq)
		Q = record.letter_annotations["phred_quality"]
		P = [10 ** (float(-x) / 10) for x in Q]
		
		
	

		for i, s in enumerate(seq):
			val = random()
			mutation_freq = P[i]
			if val < mutation_freq:
				#choose a random nucleotide that's different.
				seq[i] = choice([x for x in "ACTG" if  x != s.upper()])

		#record.seq = "".join(seq)
		#print record.format("fastq")
		
		#print "@" + str(record.ide) + "\n" + "".join(seq) + "\n" + "+" + "\n" + Q
		mutated = SeqRecord( Seq("".join(seq), generic_dna), id = record.id, description = "" )
		mutated.letter_annotations["phred_quality"] = Q
		print mutated.format("fastq"),
Beispiel #24
0
 def _init_with_SeqRecord(self,record):
     """Initialise this object with the fields of an existing SeqRecord.

     The attribute values are handed to SeqRecord.__init__ as they are, so
     containers such as features and annotations are shared with ``record``.
     """
     copied_fields = ("seq", "id", "name", "description",
                      "dbxrefs", "features", "annotations",
                      "letter_annotations")
     init_args = {field: getattr(record, field) for field in copied_fields}
     SeqRecord.__init__(self, **init_args)
Beispiel #25
0
 def __init__(
     self,
     seq,
     id=UNKNOWN_ID,
     name=UNKNOWN_NAME,
     description=UNKNOWN_DESCRIPTION,
     dbxrefs=None,
     features=None,
     annotations=None,
     letter_annotations=None,
     qual=None,
 ):
     """Create the record, falling back to the name when no id is given.

     ``seq`` must be a Seq or UnknownSeq as visible in this module, otherwise
     ValueError is raised. ``qual`` is stored on ``self.qual`` only when it
     is not None.
     """
     # a record given only a name uses that name as its id too
     if name != UNKNOWN_NAME and id == UNKNOWN_ID:
         id = name
     # We don't want a Biopython Seq, we need our repr
     if not isinstance(seq, (Seq, UnknownSeq)):
         raise ValueError("seq should be a franklin Seq")
     base_kwargs = dict(
         id=id,
         name=name,
         description=description,
         dbxrefs=dbxrefs,
         features=features,
         annotations=annotations,
         letter_annotations=letter_annotations,
     )
     SeqRecord.__init__(self, seq, **base_kwargs)
     if qual is not None:
         self.qual = qual
Beispiel #26
0
def main(fastq):
	for record in SeqIO.parse(fastq, "fastq"):
		Q = record.letter_annotations["phred_quality"]
		
		upperseq = SeqRecord( Seq(str(record.seq).upper()), id = record.id, description = "" )
		upperseq.letter_annotations["phred_quality"] = Q
		print upperseq.format("fastq")
Beispiel #27
0
def CpGIslandsToGFF(island_location):
    """Output methylation regions (CpG Islands, namely) to a GFF3 compliant file.

    Each entry of ``island_location`` is read as (centre, width) and becomes
    a "CpG_island" sub-feature spanning half the width on either side of the
    centre. The sub-features hang off one "region" feature covering the whole
    of ``cur_record``. The file is written in the current working directory
    and named after ``base`` with a '.gff' extension; ``base`` and
    ``cur_record`` come from the enclosing scope.
    """
    out_file = "%s/%s.gff" % (os.getcwd(), os.path.splitext(base)[0])

    rec = SeqRecord(cur_record.seq, "ID1")

    region_quals = {"source": "bssimulation", "score": '.', "ID": cur_record.name}
    island_quals = {"source": "bssimulation"}
    top_feature = SeqFeature(FeatureLocation(0, len(cur_record)), type="region", strand=0,
                             qualifiers=region_quals)
    for island in island_location:
        centre, width = island[0], island[1]
        begin = int(centre - width/2)
        end = int(centre + width/2)

        island_feature = SeqFeature(FeatureLocation(begin, end),
                                    type="CpG_island",
                                    strand=0,
                                    qualifiers=island_quals)
        top_feature.sub_features.append(island_feature)

    rec.features = [top_feature]

    with open(out_file, "w") as out_handle:
        GFF.write([rec], out_handle)
Beispiel #28
0
def clip_seq_record( seqrecord ):
    '''
    Trim an sff Bio.SeqRecord.SeqRecord to its quality clip positions

    @seqrecord - Bio.SeqRecord.SeqRecord object whose sequence and phred_quality are cut to
        the 'clip_qual_left' / 'clip_qual_right' annotations. A right clip of 0 with a
        larger left clip is taken to mean "up to the end of the sequence".

    @returns a new trimmed seqrecord carrying the original id and description
    '''
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    annotations = seqrecord.annotations
    lefti = annotations['clip_qual_left']
    righti = annotations['clip_qual_right']
    raw = seqrecord.seq._data
    if righti == 0 and lefti > righti:
        righti = len(raw)
    clipped = SeqRecord(
        seq = Seq( raw[lefti:righti], seqrecord.seq.alphabet ),
        id = seqrecord.id,
        description = seqrecord.description
    )
    all_quals = seqrecord._per_letter_annotations['phred_quality']
    clipped._per_letter_annotations['phred_quality'] = all_quals[lefti:righti]
    return clipped
 def write_seqrecord(self, outh, fmt='fasta'):
     """Write ``self.count`` copies of this sequence to ``outh`` in format ``fmt``.

     For FASTQ formats every base gets a made-up phred quality of 10.
     """
     for _ in range(self.count):
         bases = Seq(self.get_sequence(), IUPACAmbiguousDNA())
         record = SeqRecord(bases)
         if fmt.startswith('fastq'):
             # Just make up quality scores for now
             placeholder = [10]*len(record)
             record.letter_annotations = {'phred_quality': placeholder}
         outh.write(record.format(fmt))
Beispiel #30
0
def proteins(cursor, experiment=None, filter_experiments=True, sequence_key=None):
    """
    Yield the selected proteins as SeqRecord objects.

    cursor - database cursor used to run the query
    experiment - one experiment id or an iterable of ids to select
    filter_experiments - when true, skip experiments whose taxonomy_id is 0
    sequence_key - iterable of sequence ids to select

    At least one of experiment and sequence_key must be given. Each yielded
    record has the sequence id as its id, the experiment short name as its
    description, and taxonomy_id / experiment_key / organism annotations.
    """
    assert experiment!= None or sequence_key != None
    query = """SELECT s.id,s.sequence, e.id, e.short_name, e.taxonomy_id
        from hpf.experiment e 
        join bddb.protein p on e.id=p.experiment_key
        join ddbCommon.sequence s on p.sequence_key=s.id 
        """
    if experiment != None or filter_experiments==True or sequence_key != None:
        query += " where "

    # Collect the individual conditions, then glue them together with " and "
    conditions = []
    if experiment:
        if not hasattr(experiment, "__iter__"):
            experiment = [experiment]
        experiment_ids = ",".join([str(key) for key in experiment])
        conditions.append(" e.id in (%s)" % (experiment_ids))
    if filter_experiments:
        conditions.append(" e.taxonomy_id!=0")
    if sequence_key:
        sequence_ids = ",".join([str(key) for key in sequence_key])
        conditions.append(" s.id in (%s)" % (sequence_ids))
    query += " and ".join(conditions)

    runtime().debug(query)
    cursor.execute(query)
    runtime().debug("Fetching")
    for seq_id, sequence, e_id, e_name, taxonomy_id in cursor.fetchall():
        record = SeqRecord(Seq(sequence), str(seq_id), description=e_name)
        record.annotations = {"taxonomy_id":taxonomy_id,
                              "experiment_key":e_id,
                              "organism":e_name}
        yield record
Beispiel #31
0
class SeqRecordMethods(unittest.TestCase):
    """Test SeqRecord methods.

    Every test works on the record built in setUp: a 26-letter protein
    sequence with id, name, description, one dbxref, one annotation, one
    per-letter annotation and four features.
    """
    def setUp(self):
        """Build the shared 26-letter test record and its four features."""
        # f0 spans the whole sequence; f1-f3 cover parts of it, using exact
        # and fuzzy (within/before/after/one-of) positions.
        f0 = SeqFeature(FeatureLocation(0, 26),
                        type="source",
                        qualifiers={"mol_type": ["fake protein"]})
        f1 = SeqFeature(FeatureLocation(0, ExactPosition(10)))
        f2 = SeqFeature(
            FeatureLocation(WithinPosition(12, left=12, right=15),
                            BeforePosition(22)))
        f3 = SeqFeature(
            FeatureLocation(
                AfterPosition(16),
                OneOfPosition(
                    26,
                    [ExactPosition(25), AfterPosition(26)])))
        self.record = SeqRecord(Seq("ABCDEFGHIJKLMNOPQRSTUVWZYX",
                                    generic_protein),
                                id="TestID",
                                name="TestName",
                                description="TestDescr",
                                dbxrefs=["TestXRef"],
                                annotations={"k": "v"},
                                letter_annotations={"fake": "X" * 26},
                                features=[f0, f1, f2, f3])

    def test_iter(self):
        """Iterating the record yields its letters, the first being "A"."""
        for amino in self.record:
            self.assertEqual("A", amino)
            break

    def test_contains(self):
        """A Seq holding a sub-sequence is found in the record with ``in``."""
        self.assertIn(Seq("ABC", generic_protein), self.record)

    def test_str(self):
        """str() of the record gives the expected multi-line summary."""
        expected = """
ID: TestID
Name: TestName
Description: TestDescr
Database cross-references: TestXRef
Number of features: 4
/k=v
Per letter annotation for: fake
Seq('ABCDEFGHIJKLMNOPQRSTUVWZYX', ProteinAlphabet())"""
        # The literal starts with a newline for readability; strip it before comparing
        self.assertEqual(expected.lstrip(), str(self.record))

    def test_repr(self):
        """repr() of the record shows seq, id, name, description and dbxrefs."""
        expected = "SeqRecord(seq=Seq('ABCDEFGHIJKLMNOPQRSTUVWZYX', ProteinAlphabet()), " \
                   "id='TestID', name='TestName', description='TestDescr', dbxrefs=['TestXRef'])"
        self.assertEqual(expected, repr(self.record))

    def test_format(self):
        """format("fasta") gives a FASTA entry titled with id and description."""
        expected = ">TestID TestDescr\nABCDEFGHIJKLMNOPQRSTUVWZYX\n"
        self.assertEqual(expected, self.record.format("fasta"))

    def test_format_str(self):
        """The "{:fasta}" format spec gives the same text as format("fasta")."""
        expected = ">TestID TestDescr\nABCDEFGHIJKLMNOPQRSTUVWZYX\n"
        self.assertEqual(expected, "{:fasta}".format(self.record))

    # This test is only defined when running on Python 3 or later
    if sys.version_info[0] >= 3:

        def test_format_str_binary(self):
            """Using the binary sff format in a format spec raises ValueError."""
            with self.assertRaisesRegex(
                    ValueError,
                    "Binary format sff cannot be used with SeqRecord format method"
            ):
                "{:sff}".format(self.record)

    def test_format_spaces(self):
        """A run of spaces inside the description is kept in FASTA output."""
        rec = SeqRecord(Seq("ABCDEFGHIJKLMNOPQRSTUVWZYX", generic_protein),
                        id="TestID",
                        name="TestName",
                        description="TestDescr")
        rec.description = "TestDescr     with5spaces"
        expected = ">TestID TestDescr     with5spaces\nABCDEFGHIJKLMNOPQRSTUVWZYX\n"
        self.assertEqual(expected, rec.format("fasta"))

    def test_upper(self):
        """lower() followed by upper() gives back the upper-case sequence."""
        self.assertEqual("ABCDEFGHIJKLMNOPQRSTUVWZYX",
                         str(self.record.lower().upper().seq))

    def test_lower(self):
        """lower() gives a record with the lower-case sequence."""
        self.assertEqual("abcdefghijklmnopqrstuvwzyx",
                         str(self.record.lower().seq))

    def test_slicing(self):
        """An int index gives a letter, a slice a record; a str index raises ValueError."""
        self.assertEqual("B", self.record[1])
        self.assertEqual("BC", self.record[1:3].seq)
        with self.assertRaises(ValueError):
            c = self.record["a"].seq

    def test_slice_variants(self):
        """Simple slices using different start/end values."""
        # Covers negative, in-range, out-of-range and open-ended bounds
        for start in list(range(-30, 30)) + [None]:
            for end in list(range(-30, 30)) + [None]:
                if start is None and end is None:
                    continue
                rec = self.record[start:end]
                seq = self.record.seq[start:end]
                seq_str = str(self.record.seq)[start:end]
                self.assertEqual(seq_str, str(seq))
                self.assertEqual(seq_str, str(rec.seq))
                self.assertEqual("X" * len(seq_str),
                                 rec.letter_annotations["fake"])

    def test_slice_simple(self):
        """Simple slice."""
        rec = self.record
        self.assertEqual(len(rec), 26)
        left = rec[:10]
        self.assertEqual(str(left.seq), str(rec.seq[:10]))
        right = rec[-10:]
        self.assertEqual(str(right.seq), str(rec.seq[-10:]))
        mid = rec[12:22]
        self.assertEqual(str(mid.seq), str(rec.seq[12:22]))
        for sub in [left, right, mid]:
            self.assertEqual(len(sub), 10)
            self.assertEqual(sub.id, "TestID")
            self.assertEqual(sub.name, "TestName")
            self.assertEqual(sub.description, "TestDescr")
            self.assertEqual(sub.letter_annotations, {"fake": "X" * 10})
            self.assertEqual(sub.dbxrefs, [])  # May change this...
            self.assertEqual(sub.annotations, {})  # May change this...
            self.assertEqual(len(sub.features), 1)
            # By construction, each feature matches the full sliced region:
            self.assertEqual(str(sub.features[0].extract(sub.seq)),
                             str(sub.seq))
            self.assertEqual(sub.features[0].extract(str(sub.seq)),
                             str(sub.seq))

    def test_slice_zero(self):
        """Zero slice."""
        rec = self.record
        self.assertEqual(len(rec), 26)
        self.assertEqual(len(rec[2:-2]), 22)
        self.assertEqual(len(rec[5:2]), 0)
        self.assertEqual(len(rec[5:2][2:-2]), 0)

    def test_add_simple(self):
        """Simple addition."""
        rec = self.record + self.record
        self.assertEqual(len(rec), 52)
        self.assertEqual(rec.id, "TestID")
        self.assertEqual(rec.name, "TestName")
        self.assertEqual(rec.description, "TestDescr")
        self.assertEqual(rec.dbxrefs, ["TestXRef"])
        self.assertEqual(rec.annotations, {"k": "v"})
        self.assertEqual(rec.letter_annotations, {"fake": "X" * 52})
        self.assertEqual(len(rec.features), 2 * len(self.record.features))

    def test_add_seq(self):
        """Simple addition of Seq or string."""
        for other in [Seq("BIO"), "BIO"]:
            rec = self.record + other  # will use SeqRecord's __add__ method
            self.assertEqual(len(rec), 26 + 3)
            self.assertEqual(str(rec.seq), str(self.record.seq) + "BIO")
            self.assertEqual(rec.id, "TestID")
            self.assertEqual(rec.name, "TestName")
            self.assertEqual(rec.description, "TestDescr")
            self.assertEqual(rec.dbxrefs, ["TestXRef"])
            self.assertEqual(rec.annotations, {"k": "v"})
            self.assertEqual(rec.letter_annotations, {})
            self.assertEqual(len(rec.features), len(self.record.features))
            self.assertEqual(rec.features[0].type, "source")
            self.assertEqual(rec.features[0].location.nofuzzy_start, 0)
            self.assertEqual(rec.features[0].location.nofuzzy_end,
                             26)  # not +3

    def test_add_seqrecord(self):
        """Simple left addition of SeqRecord from genbank file."""
        other = SeqIO.read("GenBank/dbsource_wrap.gb", "gb")
        other.dbxrefs = ["dummy"]
        rec = self.record + other
        self.assertEqual(len(rec), len(self.record) + len(other))
        self.assertEqual(str(rec.seq), str(self.record.seq) + str(other.seq))
        self.assertEqual(rec.id, "<unknown id>")
        self.assertEqual(rec.name, "<unknown name>")
        self.assertEqual(rec.description, "<unknown description>")
        self.assertEqual(rec.dbxrefs, ["TestXRef", "dummy"])
        self.assertEqual(len(rec.annotations), 0)
        self.assertEqual(len(rec.letter_annotations), 0)
        self.assertEqual(len(rec.features),
                         len(self.record.features) + len(other.features))
        self.assertEqual(rec.features[0].type, "source")
        self.assertEqual(rec.features[0].location.nofuzzy_start, 0)
        self.assertEqual(rec.features[0].location.nofuzzy_end,
                         len(self.record))  # not +3
        # The first feature of the appended record follows those of self.record
        i = len(self.record.features)
        self.assertEqual(rec.features[i].type, "source")
        self.assertEqual(rec.features[i].location.nofuzzy_start,
                         len(self.record))
        self.assertEqual(rec.features[i].location.nofuzzy_end, len(rec))

    def test_add_seq_left(self):
        """Simple left addition of Seq or string."""
        for other in [Seq("BIO"), "BIO"]:
            rec = other + self.record  # will use SeqRecord's __radd__ method
            self.assertEqual(len(rec), 26 + 3)
            self.assertEqual(str(rec.seq), "BIO" + str(self.record.seq))
            self.assertEqual(rec.id, "TestID")
            self.assertEqual(rec.name, "TestName")
            self.assertEqual(rec.description, "TestDescr")
            self.assertEqual(rec.dbxrefs, ["TestXRef"])
            self.assertEqual(rec.annotations, {"k": "v"})
            self.assertEqual(rec.letter_annotations, {})
            self.assertEqual(len(rec.features), len(self.record.features))
            self.assertEqual(rec.features[0].type, "source")
            self.assertEqual(rec.features[0].location.nofuzzy_start, 3)
            self.assertEqual(rec.features[0].location.nofuzzy_end, 26 + 3)

    def test_slice_add_simple(self):
        """Simple slice and add."""
        # Cut at every possible position and glue the halves back in order
        for cut in range(27):
            rec = self.record[:cut] + self.record[cut:]
            self.assertEqual(str(rec.seq), str(self.record.seq))
            self.assertEqual(len(rec), 26)
            self.assertEqual(rec.id, "TestID")
            self.assertEqual(rec.name, "TestName")
            self.assertEqual(rec.description, "TestDescr")
            self.assertEqual(rec.dbxrefs, [])  # May change this...
            self.assertEqual(rec.annotations, {})  # May change this...
            self.assertEqual(rec.letter_annotations, {"fake": "X" * 26})
            self.assertTrue(len(rec.features) <= len(self.record.features))

    def test_slice_add_shift(self):
        """Simple slice and add to shift."""
        # Cut at every possible position and glue the halves back swapped
        for cut in range(27):
            rec = self.record[cut:] + self.record[:cut]
            self.assertEqual(
                str(rec.seq),
                str(self.record.seq[cut:] + self.record.seq[:cut]))
            self.assertEqual(len(rec), 26)
            self.assertEqual(rec.id, "TestID")
            self.assertEqual(rec.name, "TestName")
            self.assertEqual(rec.description, "TestDescr")
            self.assertEqual(rec.dbxrefs, [])  # May change this...
            self.assertEqual(rec.annotations, {})  # May change this...
            self.assertEqual(rec.letter_annotations, {"fake": "X" * 26})
            self.assertTrue(len(rec.features) <= len(self.record.features))
Beispiel #32
0
 def gt():
     """Evaluate ``>`` between two one-letter SeqRecords; the result is discarded."""
     left = SeqRecord(Seq("A"))
     right = SeqRecord(Seq("A"))
     left > right
Beispiel #33
0
 def notequality():
     """Evaluate ``!=`` between two one-letter SeqRecords; the result is discarded."""
     left = SeqRecord(Seq("A"))
     right = SeqRecord(Seq("A"))
     left != right
Beispiel #34
0
 def equality():
     """Evaluate ``==`` between two one-letter SeqRecords; the result is discarded."""
     left = SeqRecord(Seq("A"))
     right = SeqRecord(Seq("A"))
     left == right
Beispiel #35
0
 def le():
     """Evaluate ``<=`` between two one-letter SeqRecords; the result is discarded."""
     left = SeqRecord(Seq("A"))
     right = SeqRecord(Seq("A"))
     left <= right
Beispiel #36
0
 def lt():
     """Evaluate ``<`` between two one-letter SeqRecords; the result is discarded."""
     left = SeqRecord(Seq("A"))
     right = SeqRecord(Seq("A"))
     left < right
Beispiel #37
0
 def test_reverse_complement_mutable_seq(self):
     """Reverse-complementing a record built on a MutableSeq gives "CAGT" for "ACTG"."""
     record = SeqRecord(MutableSeq("ACTG"))
     flipped = record.reverse_complement()
     self.assertEqual("CAGT", str(flipped.seq))
Beispiel #38
0
 def test_valid_annotations(self):
     """Passing a list as ``annotations`` must raise TypeError."""
     with self.assertRaises(TypeError):
         dna = Seq("ACGT", generic_dna)
         SeqRecord(dna, annotations=[])
Beispiel #39
0
 def test_valid_description(self):
     """Passing a dict as ``description`` must raise TypeError."""
     with self.assertRaises(TypeError):
         dna = Seq("ACGT", generic_dna)
         SeqRecord(dna, description={})
Beispiel #40
0
 def test_valid_features(self):
     """Passing a dict as ``features`` must raise TypeError."""
     with self.assertRaises(TypeError):
         dna = Seq("ACGT", generic_dna)
         SeqRecord(dna, features={})
Beispiel #41
0
def write_out_informative_fasta(compress_seq, alignment, stripFile=None):
    """Write the informative variable sites of a compressed alignment as FASTA.

    compress_seq - dict with 'sequences' (per-sequence mapping of position to
        base), 'reference' (position to reference base) and 'positions'
    alignment - path whose directory receives 'informative_sites.fasta'
    stripFile - optional mask file; positions it lists are left out

    A position is kept when, ignoring '-' and 'N', it shows at least two
    different bases and is not just two bases with one of them seen once.
    Sequences without an entry at a position take the reference base.

    Returns the path of the FASTA file written.
    """
    from Bio import SeqIO
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq

    sequences = compress_seq['sequences']
    ref = compress_seq['reference']
    positions = compress_seq['positions']

    # Sites to exclude from the initial tree build, if a mask file was given
    masked_sites = load_mask_sites(stripFile) if stripFile else []

    sample_names = list(sequences.keys())

    # If true, also writes a file mapping FASTA position to real position
    write_position_map = False
    kept_columns = []
    position_lines = []

    for site in positions:
        if site in masked_sites:
            continue

        column = []
        for calls in sequences.values():
            # looping try/except is faster than list comprehension
            try:
                column.append(calls[site])
            except KeyError:
                column.append(ref[site])

        # Drop gaps/Ns before judging whether the site is informative
        called = [base for base in column if base != '-' and base != 'N']
        bases, counts = np.unique(called, return_counts=True)
        n_bases = len(bases)
        # Keep unless: nothing called, a single base, or two bases one of
        # which occurs only once
        if n_bases > 2 or (n_bases == 2 and min(counts) != 1):
            kept_columns.append(column)
            position_lines.append(
                str(len(position_lines) + 1) + "\t" + str(site))

    # Rotate so each row is one sequence; rot90 reverses the row order, so
    # the names are reversed to match
    rotated = np.rot90(np.asarray(kept_columns))
    names_for_rows = sample_names[::-1]
    records = []
    for row_index in range(len(sequences.keys())):
        records.append(
            SeqRecord(id=names_for_rows[row_index],
                      seq=Seq("".join(rotated[row_index])),
                      description=''))

    out_dir = os.path.dirname(alignment)
    fasta_file = os.path.join(out_dir, 'informative_sites.fasta')

    # now output this as fasta to read into raxml or iqtree
    SeqIO.write(records, fasta_file, 'fasta')

    if write_position_map:
        with open(fasta_file + ".positions.txt", 'w',
                  encoding='utf-8') as map_handle:
            map_handle.write("\n".join(position_lines))

    return fasta_file
Beispiel #42
0
 def test_valid_dbxrefs(self):
     """Passing a dict as ``dbxrefs`` must raise TypeError."""
     with self.assertRaises(TypeError):
         dna = Seq("ACGT", generic_dna)
         SeqRecord(dna, dbxrefs={})
Beispiel #43
0
def run(ifile, cleand_sequence_file, agp_file = 'scaffold.agp', locus_tag = 'XYX', sequence_description = ''):
    """
    Split the scaffolds of a FASTA file into contigs and describe them in AGP.

    Every run of A/C/G/T letters shorter than 200 is masked with N's, leading
    and trailing N's are trimmed, and each scaffold is cut at every run of two
    or more N's (a single N is left alone). The contigs are written as FASTA
    records named Seq1, Seq2, ... and, unless agp_file is False, one contig
    line and one gap line per cut are written to the AGP file.

    ifile - path of the input scaffold FASTA file
    cleand_sequence_file - path of the FASTA file that receives the contigs
    agp_file - path of the AGP file to write, or False to write none
    locus_tag - prefix of the scaffold names used in the AGP file
    sequence_description - description given to every output record

    Reviewer suggestion for sequence_description:
    Seq1 [organism=Glarea lozoyensis 74030] [strain=74030]

    Raises AssertionError if a contig shorter than 200 letters is produced.
    """

    #sequence_description = '[organism=Glarea lozoyensis 74030] [strain=74030]'

    def mask_match(match):
        # Replace the whole match with N's of the same length
        return 'N'*len(match.group())

    def mask_short_run(match):
        # Replace the match with N's only when it is shorter than 200 letters
        if len(match.group()) < 200:
            return 'N'*len(match.group())
        return match.group()

    agp_handle = open(agp_file, 'w+') if agp_file != False else None

    new_sequence_array = []
    scaffold_counter = 1
    scaffold_name = locus_tag + '_scaffold%(scaffold_counter)s'
    agp_contig_string = scaffold_name + '\t%(start)s\t%(end)s\t%(linecount)s\tW\t%(sequence_iid)s\t1\t%(end_part_sequence)s\t+\n'
    agp_gap_string    = scaffold_name + '\t%(start)s\t%(end)s\t%(linecount)s\tN\t%(gap_len)s\tfragment\tyes\t\n'
    sequence_counter = 0
    try:
        # Both handles are closed even when parsing or an assertion fails
        with open(ifile) as scaffold_handle:
            for record in SeqIO.parse(scaffold_handle, 'fasta'):
                sequence_start = 0
                sequence_end = 0
                linecount = 0

                # if the whole sequence has a length under 200, we can skip it
                if len(record.seq) < 200:
                    continue

                # Filter out all contigs from a scaffold that are shorter
                # than 200 nucleotides.
                record_seq = str(record.seq)

                # NOTE(review): the pattern is a zero-width lookahead, so
                # match.group() is empty and this substitution changes
                # nothing; short contigs are masked by mask_short_run below.
                record_seq = re.sub(r'(?=([nN][ACGTactg]{1,199}[nN]))', mask_match, record_seq)

                # Check if the sequence starts with a contig shorter than 200
                # replace it with N's and remove it
                record_seq = re.sub(r'^[ACGTactg]{1,199}[nN]', mask_match, record_seq)

                record_seq = re.sub(r'[ACGTactg]+', mask_short_run, record_seq)

                # remove first occurences of N's
                record_seq = re.sub('^[nN]+', '', record_seq)
                # remove last occurences of N's
                record_seq = re.sub('[nN]+$', '', record_seq)
                if len(record_seq) < 200:
                    continue
                # find all N-runs with at least two N's
                # a single N letter will not be touched
                for match in re.finditer('[nN]{2,}', record_seq):
                    seq_before_match = match.string[sequence_start : match.start()]

                    linecount += 1
                    sequence_counter += 1
                    gap_len = match.end() - match.start()
                    sequence_end += len(seq_before_match)

                    data = {'start': sequence_start + 1, # start relative to the scaffold
                            'end': sequence_end, # end relative to the scaffold
                            'linecount': linecount, # linecount in the scaffold
                            'sequence_iid': 'Seq%s' % sequence_counter, # unique contig iid
                            'scaffold_counter': scaffold_counter, # counting the scaffold
                            'end_part_sequence': len(seq_before_match), # end position and len of the contig
                            'gap_len': gap_len, # Gap length
                            }
                    if agp_handle is not None: agp_handle.write(agp_contig_string % data)
                    linecount += 1
                    sequence_start += len(seq_before_match)
                    sequence_end += gap_len
                    data.update({'linecount': linecount,
                                'start': sequence_start + 1,
                                'end': sequence_end,
                    })

                    if len(record_seq) < sequence_end:
                        # GAP is at the end of the scaffold, do not track that, in theory that should never happen
                        continue
                    # write to AGP file
                    if agp_handle is not None: agp_handle.write(agp_gap_string % data)
                    sequence_start = sequence_end

                    seq_obj = Seq(seq_before_match, IUPAC.IUPACUnambiguousDNA)
                    temp = SeqRecord(seq_obj, id = 'Seq%s' % sequence_counter, description=sequence_description)
                    assert len( temp.seq ) > 199
                    new_sequence_array.append(temp)

                # sequence after the last gap
                if len(record_seq) > sequence_start:
                    seq_after_last_match = record_seq[sequence_start : len(record_seq)]
                    if len(seq_after_last_match) >= 200:
                        linecount += 1
                        sequence_end = len(record_seq)
                        sequence_counter += 1
                        data = {'linecount': linecount,
                                'start': sequence_start + 1,
                                'end': sequence_end,
                                'sequence_iid': 'Seq%s' % sequence_counter,
                                'scaffold_counter': scaffold_counter, # counting the scaffold
                                'end_part_sequence': sequence_end - sequence_start,
                        }
                        if agp_handle is not None: agp_handle.write(agp_contig_string % data)
                        seq_obj = Seq(seq_after_last_match, IUPAC.IUPACUnambiguousDNA)
                        temp = SeqRecord(seq_obj, id = 'Seq%s' % sequence_counter, name=record.name, description=sequence_description)
                        assert len( temp.seq ) > 199
                        new_sequence_array.append(temp)

                scaffold_counter += 1
    finally:
        if agp_handle is not None:
            agp_handle.close()

    with open(cleand_sequence_file,"w+") as output_file:
        SeqIO.write(new_sequence_array, output_file, "fasta")

    # Test if all contigs bigger than 199
    with open(cleand_sequence_file) as check_handle:
        for record in SeqIO.parse(check_handle, 'fasta'):
            assert len( record.seq ) > 199
Beispiel #44
0
 def test_valid_name(self):
     """Passing a dict as ``name`` must raise TypeError."""
     with self.assertRaises(TypeError):
         dna = Seq("ACGT", generic_dna)
         SeqRecord(dna, name={})
Beispiel #45
0
def PirIterator(handle):
    """Iterate over PIR records as SeqRecord objects.

    handle - input file

    Each record consists of a header line ``>XX;identifier`` (XX being a
    sequence type that is a key of ``_pir_alphabets``), one description line,
    and sequence lines that end with a ``*`` terminator. The identifier is
    used as both id and name of the record, and the sequence type is stored
    in the "PIR-type" annotation. Text before the first ``>`` line is skipped.

    Raises ValueError for a malformed header or a sequence (including an
    empty one) that lacks the ``*`` terminator.

    Examples
    --------
    >>> with open("NBRF/DMB_prot.pir") as handle:
    ...    for record in PirIterator(handle):
    ...        print("%s length %i" % (record.id, len(record)))
    HLA:HLA00489 length 263
    HLA:HLA00490 length 94
    HLA:HLA00491 length 94
    HLA:HLA00492 length 80
    HLA:HLA00493 length 175
    HLA:HLA01083 length 188

    """
    # Skip any text before the first record (e.g. blank lines, comments)
    while True:
        line = handle.readline()
        if line == "":
            return  # Premature end of file, or just empty?
        if line[0] == ">":
            break

    while True:
        if line[0] != ">":
            raise ValueError(
                "Records in PIR files should start with '>' character")
        pir_type = line[1:3]
        # Slice rather than index so that a truncated header line is reported
        # as a ValueError instead of an IndexError
        if pir_type not in _pir_alphabets or line[3:4] != ";":
            raise ValueError("Records should start with '>XX;' "
                             "where XX is a valid sequence type")
        identifier = line[4:].strip()
        description = handle.readline().strip()

        # Collect sequence lines up to the next header or the end of file
        lines = []
        line = handle.readline()
        while line and line[0] != ">":
            # Remove trailing whitespace, and any internal spaces
            lines.append(line.rstrip().replace(" ", ""))
            line = handle.readline()
        seq = "".join(lines)
        # endswith also covers a record without any sequence lines
        if not seq.endswith("*"):
            # Note the * terminator is present on nucleotide sequences too,
            # it is not a stop codon!
            raise ValueError(
                "Sequences in PIR files should include a * terminator!")

        # Return the record and then continue...
        record = SeqRecord(Seq(seq[:-1], _pir_alphabets[pir_type]),
                           id=identifier,
                           name=identifier,
                           description=description)
        record.annotations["PIR-type"] = pir_type
        yield record

        if not line:
            return  # StopIteration
Beispiel #46
0
def ConSeqMaker(SeqDict, SeqList):
    """Build a consensus sequence from the aligned contigs of one individual.

    SeqDict - mapping of contig name to aligned sequence string
    SeqList - names of the contigs to combine

    Gaps ('-') are ignored column by column. A column with one distinct
    symbol contributes that symbol, two distinct symbols are merged into a
    lowercase IUPAC code (or 'n'), more than two give 'n'. A column with only
    gaps contributes nothing.

    Returns (consensus string, number of ambiguous columns, list of their
    positions, number of sequences, number of columns covered by more than
    one sequence).
    """
    # Two-base IUPAC codes in the order they are checked: (code, its two bases)
    ambiguity_codes = (('m', 'ac'), ('r', 'ag'), ('w', 'at'),
                       ('s', 'cg'), ('y', 'ct'), ('k', 'gt'))
    # Plain bases in the order they are checked: (base, ((partner, code), ...))
    base_pairs = (('a', (('c', 'm'), ('g', 'r'), ('t', 'w'))),
                  ('c', (('g', 's'), ('t', 'y'))),
                  ('g', (('t', 'k'),)))

    def merge_pair(pair):
        """Return the text to add to the consensus for two distinct symbols."""
        if 'n' in pair:
            return 'n'
        for code, members in ambiguity_codes:
            if code in pair:
                # An existing code survives only next to one of its own bases
                if members[0] in pair or members[1] in pair:
                    return code
                return 'n'
        for base, partners in base_pairs:
            if base in pair:
                for partner, code in partners:
                    if partner in pair:
                        return code
                print(
                    "ERROR!!! We have strange bases in our sequence: %s\n"
                    % (" ".join(pair)))
                return 'n'
        # None of the symbols checked above is present: nothing is added
        return ''

    #find the sequences for that individual
    records = []
    for contig_name in SeqList:
        records.append(SeqRecord(seq=(Seq(SeqDict[contig_name])), id=contig_name))
    #put them in an alignment
    aligned = MultipleSeqAlignment(records)
    n_seqs = len(aligned)

    #make the consensus of the alignment
    #dumb_consensus works well as long as there are no ambiguities.  If I want to be able to notice/count them, I need something more sophisticated.
    consensus = ""
    ambiguous_count = 0
    overlap_count = 0
    ambiguous_positions = []
    # The column count is taken from the last contig named in SeqList
    for column in range(0, len(SeqDict[contig_name])):
        present = [rec[column] for rec in aligned if rec[column] != '-']
        if len(present) > 1:
            overlap_count += 1
        distinct = list(set(present))
        n_distinct = len(distinct)
        if n_distinct == 1:
            consensus += distinct[0]
        elif n_distinct >= 2:
            if n_distinct > 2:
                consensus += 'n'
            else:
                consensus += merge_pair(distinct)
            ambiguous_count += 1
            ambiguous_positions.append(column)
    return (consensus, ambiguous_count, ambiguous_positions, n_seqs, overlap_count)
Beispiel #47
0
    def __next__(self):
        """Parse the next alignment from the handle.

        Reads lines up to the next line starting with "=" (or the end of the
        file). Header lines starting with ">" are matched against
        XMFA_HEADER_REGEX_BIOPYTHON, then XMFA_HEADER_REGEX; all other lines
        are appended to the sequence of the most recent header. Identifiers
        seen so far are remembered in self._ids.

        Returns a MultipleSeqAlignment with one record per identifier that
        has a header in this block; an identifier with a header but no
        sequence gets an all-gap sequence.

        Raises StopIteration at the end of the file or when the block holds
        no sequences, and ValueError for a malformed header, sequence text
        before the first header, or sequences of differing length.
        """
        handle = self.handle
        line = handle.readline()

        if not line:
            raise StopIteration

        # Strip out header comments
        while line and line.strip().startswith("#"):
            line = handle.readline()

        seqs = {}
        seq_regions = {}

        latest_id = None
        while True:
            if not line:
                break  # end of file
            line = line.strip()

            if line.startswith("="):
                # There may be more data, but we've reached the end of this
                # alignment
                break
            elif line.startswith(">"):
                m = XMFA_HEADER_REGEX_BIOPYTHON.match(line)
                if not m:
                    m = XMFA_HEADER_REGEX.match(line)
                    if not m:
                        # Interpolate the line into the message; passing it
                        # as a second argument left "%s" unformatted
                        raise ValueError("Malformed header line: %s" % line)

                parsed_id = m.group("id")
                parsed_data = {}
                for key in ("start", "end", "id", "strand", "name",
                            "realname"):
                    try:
                        value = m.group(key)
                        if key == "start":
                            value = int(value)
                            # Convert to zero based counting
                            if value > 0:
                                value -= 1

                        if key == "end":
                            value = int(value)
                        parsed_data[key] = value
                    except IndexError:
                        # This will occur if we're asking for a group that
                        # doesn't exist. It's fine.
                        pass
                seq_regions[parsed_id] = parsed_data

                if parsed_id not in self._ids:
                    self._ids.append(parsed_id)

                seqs.setdefault(parsed_id, "")
                latest_id = parsed_id
            else:
                if latest_id is None:
                    raise ValueError("Saw sequence before definition line")
                seqs[latest_id] += line
            line = handle.readline()

        assert len(seqs) <= len(self._ids)

        self.ids = self._ids
        self.sequences = seqs

        if self._ids and seqs:
            alignment_length = max(map(len, list(seqs.values())))
            records = []
            for seq_id in self._ids:
                if seq_id not in seqs or len(seqs[seq_id]) == 0:
                    seq = "-" * alignment_length
                else:
                    seq = seqs[seq_id]

                if alignment_length != len(seq):
                    raise ValueError(
                        "Sequences have different lengths, or repeated identifier"
                    )

                # Sometimes we don't see a particular sequence in the
                # alignment, so we skip that record since it isn't present in
                # that LCB/alignment
                if seq_id not in seq_regions:
                    continue

                region = seq_regions[seq_id]
                if "realname" in region:
                    corrected_id = region["realname"]
                else:
                    corrected_id = region["name"]
                # Add the "/start-end" suffix unless both coordinates are 0
                # or the name already carries it
                if region["start"] != 0 or region["end"] != 0:
                    suffix = "/{start}-{end}".format(**region)
                    if corrected_id.count(suffix) == 0:
                        corrected_id += suffix

                record = SeqRecord(Seq(seq), id=corrected_id, name=seq_id)

                record.annotations["start"] = region["start"]
                record.annotations["end"] = region["end"]
                record.annotations["strand"] = (1 if region["strand"]
                                                == "+" else -1)

                records.append(record)
            return MultipleSeqAlignment(records)
        else:
            raise StopIteration
Beispiel #48
0
 def add_str(self, seq, name=None, size=1, desc=""):
     """Wrap a sequence string in a SeqRecord and add it to this fasta.

     The record id is ``name`` followed by ``;size=<size>;`` and ``desc``
     becomes the record description.
     """
     sequence = Seq(seq)
     record_id = name + ';size=%i;' % size
     record = SeqRecord(sequence, id=record_id, description=desc)
     self.add_seq(record)
Beispiel #49
0
def mugration_inference(tree=None, seq_meta=None, field='country', confidence=True,
                        infer_gtr=True, root_state=None, missing='?'):
    """Infer a discrete trait (e.g. country) on every node of a tree.

    Each distinct value of ``field`` found in ``seq_meta`` is encoded as one
    character (starting at ``'A'``); every tip becomes a one-character pseudo
    sequence, and treetime's ancestral reconstruction is run on that
    single-column alignment with a flat custom GTR model.

    Parameters:
        tree: tree passed through to ``TreeAnc``.
        seq_meta: mapping of sequence name -> metadata dict. Entries lacking
            ``field`` are encoded with the "missing" character.
        field: metadata key to reconstruct; also the attribute name set on
            every node.
        confidence: if true, additionally set ``<field>_entropy`` and
            ``<field>_confidence`` on every node.
        infer_gtr: forwarded to ``infer_ancestral_sequences``.
        root_state: optional extra state added to the alphabet.
        missing: value reported for the "missing data" character.

    Returns:
        ``(tt, alphabet)`` where ``tt`` is the ``TreeAnc`` instance and
        ``alphabet`` maps each character to its state, or ``None`` when there
        are zero, exactly one, or more than 180 distinct states.
    """
    # Determine alphabet
    places = {meta[field] for meta in seq_meta.values() if field in meta}
    if root_state is not None:
        places.add(root_state)

    places = sorted(places)
    nc = len(places)
    if nc > 180:
        print("geo_inference: can't have more than 180 places!")
        return None
    if nc == 1:
        print("geo_inference: only one place found -- set every internal node to %s!"%places[0])
        return None
    if nc == 0:
        print("geo_inference: list of places is empty!")
        return None

    # Imported only once the inputs are known to be usable, so the cheap
    # early exits above do not require treetime/Biopython.
    from treetime import GTR
    from treetime import TreeAnc
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq

    # construct GTR (flat for now): uniform equilibrium frequencies and rates
    alphabet = {chr(65+i): place for i, place in enumerate(places)}
    myGeoGTR = GTR.custom(pi=np.ones(nc, dtype=float)/nc, W=np.ones((nc, nc)),
                          alphabet=np.array(sorted(alphabet.keys())))
    # The character right after the last real state stands for missing data
    # and is given a flat profile over all states.
    missing_char = chr(65+nc)
    alphabet[missing_char] = missing
    myGeoGTR.profile_map[missing_char] = np.ones(nc)
    # .items() instead of the Python-2-only .iteritems()
    alphabet_rev = {v: k for k, v in alphabet.items()}

    pseudo_seqs = []
    for name, meta in seq_meta.items():
        s = alphabet_rev[meta[field]] if field in meta else missing_char
        pseudo_seqs.append(SeqRecord(Seq(s), name=name, id=name))
    aln = MultipleSeqAlignment(pseudo_seqs)

    tt = TreeAnc(tree=tree, aln=aln, gtr=myGeoGTR, convert_upper=False)
    tt.use_mutation_length = False
    tt.infer_ancestral_sequences(infer_gtr=infer_gtr, store_compressed=False, pc=5.0,
                                 marginal=True, normalized_rate=False)

    # Decode the reconstructed character of every node back to its state.
    for node in tt.tree.find_clades():
        setattr(node, field, alphabet[node.sequence[0]])

    if confidence:
        for node in tt.tree.find_clades():
            pdis = node.marginal_profile[0]
            # entropy of the marginal distribution; TINY keeps log() finite
            S = -np.sum(pdis*np.log(pdis+TINY))

            marginal = [(alphabet[tt.gtr.alphabet[i]], pdis[i]) for i in range(len(tt.gtr.alphabet))]
            marginal.sort(key=lambda x: x[1], reverse=True)  # sort on likelihoods
            # only take stuff over 1% and the top 4 elements
            marginal = [(a, b) for a, b in marginal if b > 0.01][:4]
            conf = {a: b for a, b in marginal}
            setattr(node, field + "_entropy", S)
            setattr(node, field + "_confidence", conf)

    return tt, alphabet
Beispiel #50
0
def fullContigs(prot, sequence_dict, assembly_dict, protein_dict, prefix):
    """Generate a contig from all hits to a protein.

    If there is more than one hit, the hit contigs are stitched together and
    a second exonerate search is run on the stitched sequence.

    Parameters:
        prot: dict describing the protein; reads the keys "assemblyHits",
            "hit_start", "hit_end", "percentid", "hit_strand" (parallel
            lists indexed by hit) and "name".
        sequence_dict: mapping from a full hit identifier to a record; used
            only when there is exactly one hit.
        assembly_dict: mapping from contig name (the part of a hit
            identifier before the first comma) to a record.
        protein_dict: mapping from protein name to the protein passed to
            ``supercontig_exonerate``.
        prefix: directory/prefix for exonerate statistics and the
            ``supercontig_exonerate.fasta`` output.

    Returns:
        The resulting sequence as a string.
    """
    logger = logging.getLogger("pipeline")
    hits = prot["assemblyHits"]

    logger.debug("All hits:")
    logger.debug(hits)
    write_exonerate_stats(hits, prefix)

    if len(hits) == 1:
        # If only one hit to this protein.
        return str(sequence_dict[hits[0]].seq)

    sequence_list = []
    seen_contigs = set()
    for hit, hit_name in enumerate(hits):
        assembly_seq_name = hit_name.split(",")[0]
        logger.debug(
            "Protein hit {} from {} to {} with {}% id on strand {}".format(
                assembly_seq_name, prot["hit_start"][hit],
                prot["hit_end"][hit], prot["percentid"][hit],
                prot["hit_strand"][hit]))
        if assembly_seq_name not in seen_contigs:  # Only add each contig once.
            if prot["hit_strand"][hit] == "+":
                sequence_list.append(assembly_dict[assembly_seq_name])
            else:
                sequence_list.append(
                    assembly_dict[assembly_seq_name].reverse_complement())
        seen_contigs.add(assembly_seq_name)

    supercontig = SeqRecord(Seq("".join(str(b.seq) for b in sequence_list)),
                            id=prot["name"])

    # Need to remove contigs if they have the same basename
    supercontig_cds = supercontig_exonerate(supercontig,
                                            protein_dict[prot["name"]], prefix)

    # Sort the supercontigs by hit location to the protein.
    joined_supercontig_cds = list(supercontig_cds)
    joined_supercontig_cds.sort(key=sort_byhitloc, reverse=True)

    # Get rid of supercontig sequences that are subsumed by longer sequences
    # on the same stretch.
    joined_supercontig_cds = subsume_supercontigs(joined_supercontig_cds)

    SeqIO.write(joined_supercontig_cds,
                '%s/supercontig_exonerate.fasta' % prefix, 'fasta')
    if len(joined_supercontig_cds) == 1:
        logger.debug("One sequence remaining")
        return str(joined_supercontig_cds[0].seq)

    # One more Exonerate, just to be sure.
    superdupercontig = SeqRecord(
        Seq("".join(str(b.seq) for b in joined_supercontig_cds)),
        id=prot["name"])
    final_supercontig = list(
        supercontig_exonerate(superdupercontig, protein_dict[prot["name"]],
                              prefix))
    final_supercontig.sort(key=sort_byhitloc, reverse=True)
    final_supercontig = subsume_supercontigs(final_supercontig)

    return str(Seq("".join(str(b.seq) for b in final_supercontig)))
Beispiel #51
0
 def Task(self):
     """Build a SeqRecord from this mapping's "SEQ", "TYPE" and "NAME" entries.

     "TYPE" selects the alphabet through ``MAP_ALPHABET``; "NAME" becomes the
     record name.
     """
     raw_sequence = self["SEQ"]
     alphabet = MAP_ALPHABET[self["TYPE"]]
     sequence = Seq(raw_sequence, alphabet=alphabet)
     return SeqRecord(sequence, name=self["NAME"])
        list_of_dictionary.append(mydict)
        df = dataframe(sort_tuple_list)
        dataframe_list.append(df)
        count = count + 1


# Prompt for the path of the FASTA file to de-duplicate.
fname = input('Enter filename__')

# Group record ids by their exact sequence text: identical sequences end up
# under the same key.
dedup_records = defaultdict(list)
for record in SeqIO.parse(fname, "fasta"):
    # Use the sequence as the key and keep a list of ids as the value
    dedup_records[str(record.seq)].append(record.id)

# Build one record per distinct sequence; its id is the "|"-joined list of the
# ids that shared that sequence. This is a generator (consumed once by the
# write below); to get a physical list, replace the outer "(", ")" by "[" and
# "]", respectively.
final_seq = (SeqRecord(Seq(seqi, IUPAC.protein),
                       id="|".join(gi),
                       name='',
                       description='') for seqi, gi in dedup_records.items())

# Write the de-duplicated records to 'fuzzy.fasta' in FASTA format.
SeqIO.write(final_seq, 'fuzzy.fasta', 'fasta')

#parsing the file
with open('fuzzy.fasta', 'r') as fasta_file:  # Will close handle cleanly
    identifiers = []
    lengths = []
    sequences = []
    A = []
    C = []
    D = []
    E = []
    F = []
Beispiel #53
0
        genome2summary[genome] = genome + '\t' + str(scaffold_nb) + '\t' + str(
            cluster_nb) + '\t' + str(nb) + '\t' + str(
                best) + '\t' + contamination + '\t' + result

    ##########################################################
    # extracting the fasta sequences and performing the MSAs #
    ##########################################################

    rp2seq = defaultdict(list)
    for genome in genome2scaffold:
        scaffold, cluster = genome2scaffold[genome]
        for orf in genome2scaffold2cluster2orf[genome][scaffold][cluster]:
            pfam = orf2hmm[orf][0]
            rp = pfam2rp[pfam]
            rp2seq[rp].append(
                SeqRecord(seq=orf2seq[orf].seq, id=genome, description=""))

    print('performing MSA...')
    for rp, seqList in rp2seq.items():
        print(rp + '\t' + str(len(seqList)))

        output_filename = folder + '/' + rp + '.fa'
        SeqIO.write(seqList, output_filename, 'fasta')
        mafft_filename = output_filename.replace('.fa', '.mafft')
        cmd = 'mafft --auto --thread ' + str(
            cpu
        ) + ' ' + output_filename + ' > ' + mafft_filename + ' 2>/dev/null'
        print(cmd)
        os.system(cmd)

        trimal_filename = mafft_filename.replace('.mafft', '.trimal')
def test_unit_pipeline_default(tmpdir, mocker):
    """Run the ``pipeline`` command with all collaborators mocked.

    Checks that the output directories are created, that annotator, detector
    and classifiers are constructed with the values given on the command
    line, and that each step and each writer is invoked once per parsed
    record (two records are fed in).
    """
    tmpdir = str(tmpdir)
    mocker.patch('os.mkdir')
    mocker.patch('deepbgc.command.pipeline.logging.FileHandler')
    parser_cls = mocker.patch(
        'deepbgc.command.pipeline.deepbgc.util.SequenceParser')

    # The parser is used as a context manager that yields itself and
    # produces two records.
    records = [SeqRecord('ABC'), SeqRecord('DEF')]
    parser = parser_cls.return_value
    parser.__enter__.return_value = parser
    parser.parse.return_value = records

    mock_annotator = mocker.patch('deepbgc.command.pipeline.DeepBGCAnnotator')
    mock_classifier = mocker.patch(
        'deepbgc.command.pipeline.DeepBGCClassifier')
    mock_detector = mocker.patch('deepbgc.command.pipeline.DeepBGCDetector')

    # Note: We are mocking classes imported in deepbgc.command.pipeline, not
    # at their original location!
    writers = []
    for writer_path in (
            'deepbgc.command.pipeline.BGCRegionPlotWriter',
            'deepbgc.command.pipeline.ClusterTSVWriter',
            'deepbgc.command.pipeline.PfamScorePlotWriter',
            'deepbgc.command.pipeline.PfamTSVWriter',
            'deepbgc.command.pipeline.GenbankWriter',
            'deepbgc.command.pipeline.BGCGenbankWriter',
            'deepbgc.command.pipeline.ReadmeWriter',
    ):
        writers.append(mocker.patch(writer_path))

    report_dir = os.path.join(tmpdir, 'report')
    report_tmp_dir = os.path.join(report_dir, 'tmp')
    cli_args = [
        'pipeline',
        '--output', report_dir,
        '--detector', 'mydetector',
        '--label', 'mylabel',
        '--score', '0.1',
        '--merge-max-protein-gap', '8',
        '--merge-max-nucl-gap', '9',
        '--min-nucl', '10',
        '--min-proteins', '20',
        '--min-domains', '30',
        '--min-bio-domains', '40',
        '--classifier', 'myclassifier1',
        '--classifier', 'myclassifier2',
        '--classifier-score', '0.2',
        'mySequence.gbk',
    ]
    run(cli_args)

    for expected_dir in (report_dir, report_tmp_dir):
        os.mkdir.assert_any_call(expected_dir)

    mock_annotator.assert_called_with(tmp_dir_path=report_tmp_dir)
    for classifier_name in ('myclassifier1', 'myclassifier2'):
        mock_classifier.assert_any_call(classifier=classifier_name,
                                        score_threshold=0.2)
    expected_detector_kwargs = dict(
        detector='mydetector',
        label='mylabel',
        score_threshold=0.1,
        merge_max_protein_gap=8,
        merge_max_nucl_gap=9,
        min_nucl=10,
        min_proteins=20,
        min_domains=30,
        min_bio_domains=40,
    )
    mock_detector.assert_called_with(**expected_detector_kwargs)

    annotator = mock_annotator.return_value
    detector = mock_detector.return_value
    classifier = mock_classifier.return_value

    assert annotator.run.call_count == 2  # Two records
    assert detector.run.call_count == 2  # Two records
    # Two records for each of the two classifiers
    assert classifier.run.call_count == 4

    annotator.print_summary.assert_called_once_with()
    detector.print_summary.assert_called_once_with()
    # Once for each of the two classifiers
    assert classifier.print_summary.call_count == 2

    for writer_cls in writers:
        writer = writer_cls.return_value
        assert writer.write.call_count == 2  # Two records
        writer.close.assert_called_once_with()

    # Remove logging handlers to avoid affecting other tests
    root_logger = logging.getLogger('')
    for handler in list(root_logger.handlers):
        root_logger.removeHandler(handler)
            adj_str = ";".join(protein2adj[protein])

            merged.write(acco + "_" + protein + "\t" + protein2acc[protein] +
                         "\t" + hit + "\t" + protein2dups[item] + "\t" +
                         str(protein2length[protein]) + "\t" +
                         str(protein2score[protein]) + "\t" +
                         str(protein2align_length[item]) + "\t" +
                         str(num_proteins[item]) + "\t" + prot_str + "\t" +
                         loc_str + "\t" + adj_str + "\n")
            #print sorted_prot_list

            if len(sorted_prot_list) > 1:
                tally = tally + len(sorted_prot_list)

                newrecord = SeqRecord(Seq("", IUPAC.protein),
                                      id=acco + "_" + protein + "_JOINED",
                                      name=acco + "_" + protein + "_JOINED",
                                      description=protein2acc[protein])
                for fragment in sorted_prot_list:
                    #print fragment
                    subrecord = seq_dict[fragment]
                    subseq = subrecord.seq
                    subseq = re.sub("\*", "", str(subseq))
                    #print subseq
                    #print record.seq
                    #print type(subseq)
                    newrecord.seq = newrecord.seq + "" + subseq

                #if len(newrecord.seq) > 900 :# and protein2score[protein] > 500:
                final_proteins.append(newrecord)

            else:
    for isolate in taxa:
        for position in region:
            recomb_regions[isolate].append(position)

# Mask indices/positions of recombinant regions identified by gubbins: every
# masked position of a record's sequence is replaced by 'N'.
print('Masking recombinant positions in whole genome alignment.')
sample_masked_indices = defaultdict(list)
new_aln = list()

for record in aln:
    seq_str = list(str(record.seq))
    # records without an entry in recomb_regions are copied unmasked
    masked_indices = recomb_regions.get(record.id, [])
    for index in masked_indices:
        seq_str[index] = 'N'
    seq_str = ''.join(seq_str)
    new_record = SeqRecord(Seq(seq_str), id=record.id, description='')
    sample_masked_indices[record.id] = masked_indices
    new_aln.append(new_record)

# Output names are derived from the second-to-last token of the alignment
# path when split on '/' and '.'. The pattern is a raw string so that '\.' is
# not an invalid string escape, and the split is done once for all three names.
aln_basename = re.split(r'/|\.', aln_path)[-2]
fasta_outfile = outdir + '/' + aln_basename + '_gubbins_masked.fa'
text_outfile = outdir + '/' + aln_basename + '_masked_recomb_positions.txt'
var_site_outfile = outdir + '/' + aln_basename + \
                '_gubbins_masked_var_sites.fa'

# write new FASTA file with recombinant regions masked
print('Writing', fasta_outfile)
with open(fasta_outfile, 'w') as handle:
    SeqIO.write(new_aln, handle, 'fasta')
def cutSequence(sequence, motif, cut_mark=""):
    """
    Takes a Biopython sequence and a motif and return the sequences cut at
    this motif.

    Takes
    sequence: the sequence as a string, a Seq or a SeqRecord
    motif: a string containing a '-' at the cut site; e.g. 'g-aattc' for EcoRI.
           Without a '-', the cut is placed at the end of the motif.
    cut_mark: mark appended to the cutting site (added on both sides of every
              cut, i.e. at the end of the fragment before the cut and at the
              start of the fragment after it)

    Returns
    a list of Biopython sequences SeqRecord, with names corresponding to the
    input sequence, the cut and the location in the input sequence
    (1-based, inclusive, not counting any cut_mark characters). An empty
    motif produces no cut.

    # NOTE: the cutSequence function proceeds by reading the sequence and cutting
    # the next occurrence of the restriction site. In a case such as a "CCC-CC"
    # restriction site and a "ACCCCCCG" sequence, which could produce in reality
    # both "ACCC CCCG" and "ACCCC CCG" cuts, only the first one will be produced
    # by the function.

    """

    # clean the cutting motif
    pattern = motif.replace('-', '').upper()
    cutSite = motif.find('-')
    if cutSite == -1:
        # if no '-' is found, the cut is at the end of the motif
        cutSite = len(motif)

    # extract the sequence string
    try:
        sequenceString = sequence.seq.upper()  # if sequence is a SeqRecord
        name = sequence.name + '_' + motif + '_'
    except AttributeError:
        try:
            # Seq objects from old Biopython releases
            sequenceString = sequence.tostring().upper()
        except AttributeError:
            # plain string (or a Seq without tostring())
            sequenceString = sequence.upper()
        name = motif + '_'

    fragments = []
    lastPosition = len(sequenceString)
    fragmentStart = 0  # 0-based start of the current fragment in the input

    # find() signals "not found" with -1, and 0 is a valid hit at the very
    # start. When the cut lies before the motif (cutSite == 0) a hit exactly
    # at the current fragment start would give an empty fragment, so the
    # search then starts one position further on.
    searchOffset = 0 if cutSite > 0 else 1

    nextSite = sequenceString.find(pattern, searchOffset) if pattern else -1
    while nextSite != -1:
        cutPosition = nextSite + cutSite
        piece = sequenceString[fragmentStart:cutPosition] + cut_mark
        if fragments:
            # this fragment starts at a previous cut
            piece = cut_mark + piece
        newFragment = SeqRecord(piece)
        newFragment.name = (name + str(fragmentStart + 1) + '_' +
                            str(cutPosition))
        fragments.append(newFragment)
        fragmentStart = cutPosition
        nextSite = sequenceString.find(pattern, cutPosition + searchOffset)

    # add the remaining sequence
    remainder = sequenceString[fragmentStart:]
    if len(remainder) > 0:
        if fragments:
            remainder = cut_mark + remainder
        lastFragment = SeqRecord(remainder)
        lastFragment.name = (name + str(fragmentStart + 1) + '_' +
                             str(lastPosition))
        fragments.append(lastFragment)

    return fragments
Beispiel #58
0
def make_record(string, trio, parent):
    """Return a new SeqRecord wrapping ``string`` as its sequence.

    The sequence is used exactly as given. The record id is
    ``<trio>_<parent>`` and the description is left empty.
    """
    return SeqRecord(seq=Seq(string),
                     id=trio + "_" + parent,
                     description="")
Beispiel #59
0
    def next(self):
        """Parse the next Stockholm alignment from ``self.handle``.

        Reads lines until the next '# STOCKHOLM 1.0' header (which is kept in
        ``self._header`` for the following call) or the end of the file,
        collecting sequences and the #=GF / #=GS / #=GR annotation lines.
        The parsed ids, sequences, per-sequence and per-column annotations
        are stored on ``self.ids``, ``self.sequences``,
        ``self.seq_annotation`` and ``self.seq_col_annotation``.

        Returns:
            A MultipleSeqAlignment with one SeqRecord per sequence id.

        Raises:
            StopIteration: at end of file, or when no sequences were found.
            ValueError: if the header is missing, a sequence line cannot be
                split into identifier and sequence, the number of records
                differs from ``self.records_per_alignment``, or the sequences
                have different lengths.
        """
        try:
            # header line already consumed by the previous call, if any
            line = self._header
            del self._header
        except AttributeError:
            line = self.handle.readline()
        if not line:
            #Empty file - just give up.
            raise StopIteration
        if not line.strip() == '# STOCKHOLM 1.0':
            raise ValueError("Did not find STOCKHOLM header")
            #import sys
            #print >> sys.stderr, 'Warning file does not start with STOCKHOLM 1.0'

        # Note: If this file follows the PFAM conventions, there should be
        # a line containing the number of sequences, e.g. "#=GF SQ 67"
        # We do not check for this - perhaps we should, and verify that
        # if present it agrees with our parsing.

        seqs = {}
        ids = []
        gs = {}
        gr = {}
        gf = {}
        passed_end_alignment = False
        while True:
            line = self.handle.readline()
            if not line: break  #end of file
            line = line.strip()  #remove trailing \n
            if line == '# STOCKHOLM 1.0':
                self._header = line
                break
            elif line == "//":
                #The "//" line indicates the end of the alignment.
                #There may still be more meta-data
                passed_end_alignment = True
            elif line == "":
                #blank line, ignore
                pass
            elif line[0] != "#":
                #Sequence
                #Format: "<seqname> <sequence>"
                assert not passed_end_alignment
                parts = [x.strip() for x in line.split(" ", 1)]
                if len(parts) != 2:
                    #This might be someone attempting to store a zero length sequence?
                    raise ValueError("Could not split line into identifier " \
                                      + "and sequence:\n" + line)
                id, seq = parts
                if id not in ids:
                    ids.append(id)
                # Interleaved files repeat the identifier; the pieces are
                # concatenated, with '.' gaps normalised to '-'.
                seqs.setdefault(id, '')
                seqs[id] += seq.replace(".", "-")
            elif len(line) >= 5:
                #Comment line or meta-data
                if line[:5] == "#=GF ":
                    #Generic per-File annotation, free text
                    #Format: #=GF <feature> <free text>
                    feature, text = line[5:].strip().split(None, 1)
                    #Each feature key could be used more than once,
                    #so store the entries as a list of strings.
                    if feature not in gf:
                        gf[feature] = [text]
                    else:
                        gf[feature].append(text)
                elif line[:5] == '#=GC ':
                    #Generic per-Column annotation, exactly 1 char per column
                    #Format: "#=GC <feature> <exactly 1 char per column>"
                    pass
                elif line[:5] == '#=GS ':
                    #Generic per-Sequence annotation, free text
                    #Format: "#=GS <seqname> <feature> <free text>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    #if id not in ids:
                    #    ids.append(id)
                    if id not in gs:
                        gs[id] = {}
                    if feature not in gs[id]:
                        gs[id][feature] = [text]
                    else:
                        gs[id][feature].append(text)
                elif line[:5] == "#=GR ":
                    #Generic per-Sequence AND per-Column markup
                    #Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    #if id not in ids:
                    #    ids.append(id)
                    if id not in gr:
                        gr[id] = {}
                    if feature not in gr[id]:
                        gr[id][feature] = ""
                    gr[id][feature] += text.strip(
                    )  # append to any previous entry
                    #TODO - Should we check the length matches the alignment length?
                    #       For interlaced sequences the GR data can be split over
                    #       multiple lines
            #Next line...

        assert len(seqs) <= len(ids)
        #assert len(gs)   <= len(ids)
        #assert len(gr)   <= len(ids)

        self.ids = ids
        self.sequences = seqs
        self.seq_annotation = gs
        self.seq_col_annotation = gr

        if ids and seqs:

            if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
                raise ValueError("Found %i records in this alignment, told to expect %i" \
                                 % (len(ids), self.records_per_alignment))

            # Reference length: the first sequence seen. Every id in ``ids``
            # has an entry in ``seqs``; dict views cannot be indexed on
            # Python 3, so look the sequence up by its id instead.
            alignment_length = len(seqs[ids[0]])
            records = []  #Alignment obj will put them all in a list anyway
            for id in ids:
                seq = seqs[id]
                if alignment_length != len(seq):
                    raise ValueError(
                        "Sequences have different lengths, or repeated identifier"
                    )
                name, start, end = self._identifier_split(id)
                record = SeqRecord(Seq(seq, self.alphabet),
                                   id=id,
                                   name=name,
                                   description=id,
                                   annotations={"accession": name})
                #Accession will be overridden by _populate_meta_data if an explicit
                #accession is provided:
                record.annotations["accession"] = name

                if start is not None:
                    record.annotations["start"] = start
                if end is not None:
                    record.annotations["end"] = end

                self._populate_meta_data(id, record)
                records.append(record)
            alignment = MultipleSeqAlignment(records, self.alphabet)

            #TODO - Introduce an annotated alignment class?
            #For now, store the annotation a new private property:
            alignment._annotations = gr

            return alignment
        else:
            raise StopIteration
Beispiel #60
0
def makeSeqRecord(id):
    """Create a fixed three-letter record ("AAA") carrying the given id.

    The record has an empty name and description, and a "phred_quality"
    letter annotation of 32 for each of its three letters.
    """
    sequence = Seq("AAA", Reduced.Alphabet)
    qualities = {"phred_quality": [32, 32, 32]}
    return SeqRecord(sequence,
                     id=id,
                     name="",
                     description="",
                     letter_annotations=qualities)