Example #1
0
def load_fasta_file(input_file: str) -> Tuple[str, List]:
    """
    Read a FASTA file (gzip-compressed or plain) and detect its sequence type.

    Every record is upper-cased and validated with ``_verify_alphabet``
    after being parsed with the IUPAC ambiguous DNA alphabet.  If any record
    fails, the handle is rewound and the file is parsed again with the
    extended protein alphabet (stop codons allowed).

    :param input_file: The path to the input fasta file.
    :returns: A tuple of the sequence type ('protein' or 'dna'), and the list of SeqRecords.
    :raises ValueError: If the records validate under neither alphabet.
    """
    compressed = _is_gzipped(input_file)
    opener = gzip.open if compressed else open
    mode = 'rt' if compressed else 'r'

    with opener(input_file, mode) as handle:
        dna_records = [
            record.upper()
            for record in SeqIO.parse(
                handle=handle, format='fasta', alphabet=IUPAC.ambiguous_dna)
        ]
        if all(_verify_alphabet(record.seq) for record in dna_records):
            return 'dna', dna_records

        # Not DNA: rewind and give the protein alphabet a try.
        handle.seek(0)
        protein_records = [
            record.upper()
            for record in SeqIO.parse(
                handle=handle,
                format='fasta',
                alphabet=HasStopCodon(IUPAC.extended_protein))
        ]
        if all(_verify_alphabet(record.seq) for record in protein_records):
            return 'protein', protein_records

        raise ValueError(
            'Invalid input file (neither DNA nor protein FASTA).')
Example #2
0
    def __init__(self, strand1, strand2=None, align=(0, 0)):
        """
        Constructs a duplex from two strands.

        Parameters
        ----------
        strand1 : Biopython Seq object or string
            First strand
        strand2 : Biopython Seq object or string, optional
            Second strand.  If not specified, the reverse
            complement of the first strand is used.
        align : tuple of int, optional
            Intended to specify the beginning and end position of the
            duplex: the first number is the position on strand 1 where
            the duplex begins, and the second number is the position on
            strand 2 where the duplex ends. Default is (0,0), meaning
            that the 5' end of the first strand is the beginning of the
            duplex, and the 5' end of the second strand is the end.
            NOTE: the values are read but not applied yet (dangling-end
            handling is still a TODO), so the full strands are stored.

        Raises
        ------
        ValueError
            If either strand fails the ``_verify_alphabet`` check after
            disambiguation.

        Notes
        -----
        If a hybrid DNA-RNA duplex is specified, the seq attribute
        will be the DNA strand and the cseq attribute will be the RNA
        strand, regardless of the order in which the strands were
        given.
        """
        s1 = _disambiguate(strand1)
        if _verify_alphabet(s1) is False:
            raise ValueError("Couldn't identify strand 1 %s as "
                             "unambiguous DNA or RNA" % str(strand1))
        if strand2 is None:
            s2 = s1.reverse_complement()
        else:
            s2 = _disambiguate(strand2)
            if _verify_alphabet(s2) is False:
                # message previously read "as as unambiguous" (duplicated word)
                raise ValueError("Couldn't identify strand 2 %s as "
                                 "unambiguous DNA or RNA" % str(strand2))

        # calculate dangling ends (TODO: implement)
        # The alignment offsets are unpacked here but not used yet.
        i = align[0]
        m = align[1]

        # For DNA-RNA hybrids, use the DNA strand as seq and the
        # RNA strand as cseq
        if ((s1.alphabet == IUPAC.unambiguous_rna)
                and (s2.alphabet == IUPAC.unambiguous_dna)):
            self.seq = s2
            self.cseq = s1
        else:
            self.seq = s1
            self.cseq = s2
Example #3
0
    def read(self, input_handle):
        """Read patterns from the specified handle.

        One pattern is read per line (trailing whitespace removed).  A line
        containing ``self.separator`` is split on it and stored as a tuple
        (a signature); any other line is stored as a plain string.  When
        ``self._alphabet`` is set, every piece is validated with
        ``_verify_alphabet`` and a ValueError is raised on the first failure.

        Returns the list of patterns in file order.
        """
        patterns = []

        line = input_handle.readline()
        while line:
            entry = line.rstrip()
            # a line holding the separator is a signature: keep it as a tuple
            if self.separator in entry:
                entry = tuple(entry.split(self.separator))

            if self._alphabet is not None:
                # treat single patterns like one-element signatures so both
                # go through the same check
                pieces = entry if isinstance(entry, tuple) else [entry]
                for piece in pieces:
                    if not _verify_alphabet(Seq(piece, self._alphabet)):
                        raise ValueError("Pattern %s not matching alphabet %s"
                                         % (entry, self._alphabet))

            patterns.append(entry)
            line = input_handle.readline()

        return patterns
Example #4
0
def similarity_score(matrix, ref, query):
    """
    Normalized substitution-matrix similarity between two pseudosequences.

    Positions are compared pairwise (up to the shorter sequence); a position
    where either side is a gap ('-') is ignored.  The cross score is divided
    by the square root of the product of the two self-scores.

    Args:
        matrix: subs matrix as a nested dictionary, indexed matrix[a][b]
        ref: reference sequence
        query: query sequence
    Returns:
        the normalized similarity value, or None when either input is not
        exactly a str or the query fails the protein alphabet check
    """
    if not (type(ref) is str and type(query) is str):
        return None

    # only the query is checked against the protein alphabet
    if _verify_alphabet(Seq(query, alphabet=IUPAC.IUPACProtein)) is False:
        return None

    def paired_score(left, right):
        # sum of matrix entries over aligned, gap-free positions
        total = 0
        for a, b in zip(left, right):
            if a != '-' and b != '-':
                total += matrix[a][b]
        return total

    cross = paired_score(ref, query)
    ref_self = paired_score(ref, ref)
    query_self = paired_score(query, query)
    # normalise the score
    return cross / np.sqrt(ref_self * query_self)
Example #5
0
    def read(self, input_handle):
        """Read patterns from the specified handle.

        Lines are consumed until ``readline`` returns something falsy.  Each
        line is right-stripped; if it contains ``self.separator`` it becomes
        a tuple of its parts, otherwise it stays a string.  With an alphabet
        configured (``self._alphabet``), each part must pass
        ``_verify_alphabet`` or a ValueError is raised.
        """
        collected = []

        raw = input_handle.readline()
        while raw:
            pattern = raw.rstrip()
            if self.separator in pattern:
                # split up signatures
                pattern = tuple(pattern.split(self.separator))

            if self._alphabet is not None:
                # wrap a lone pattern in a list so signatures and single
                # patterns are checked by the same loop
                if isinstance(pattern, tuple):
                    to_check = pattern
                else:
                    to_check = [pattern]
                for item in to_check:
                    candidate = Seq(item, self._alphabet)
                    if _verify_alphabet(candidate):
                        continue
                    raise ValueError("Pattern %s not matching alphabet %s"
                                     % (pattern, self._alphabet))

            collected.append(pattern)
            raw = input_handle.readline()

        return collected
Example #6
0
    def _get_signature_dict(self, seq_records, sig_size, max_gap):
        """Return a dictionary with all signatures and their counts.

        This internal function does all of the hard work for the
        find_signatures function.

        A signature is a pair ``(first, second)`` of ``sig_size``-long
        windows; ``second`` starts 1..``max_gap`` positions after the start
        of ``first``.  In strict mode (``self._alphabet_strict``) the
        alphabet of the first record is the reference: every record must
        carry it (checked with ``assert``), and a signature is only counted
        when both halves pass ``_verify_alphabet``.

        Counting is delegated to ``self._add_sig``, whose return value
        becomes the running dictionary.
        """
        if self._alphabet_strict:
            alphabet = seq_records[0].seq.alphabet
        else:
            alphabet = None

        # widest span a signature can cover; does not depend on the record
        largest_sig_size = sig_size * 2 + max_gap

        # loop through all records to find signatures
        all_sigs = {}
        for seq_record in seq_records:
            # if we are working with alphabets, make sure we are consistent
            if alphabet is not None:
                assert seq_record.seq.alphabet == alphabet, \
                       "Working with alphabet %s and got %s" % \
                       (alphabet, seq_record.seq.alphabet)

            # now start finding signatures in the sequence
            for start in range(len(seq_record.seq) - (largest_sig_size - 1)):
                # find the first part of the signature
                # (str() replaces the deprecated Seq.tostring())
                first_sig = str(seq_record.seq[start:start + sig_size])

                # now find all of the second parts of the signature
                for second in range(start + 1, (start + 1) + max_gap):
                    second_sig = str(seq_record.seq[second:second + sig_size])

                    # if we are being alphabet strict, make sure both parts
                    # of the sig fall within the specified alphabet
                    if alphabet is not None:
                        first_seq = Seq(first_sig, alphabet)
                        second_seq = Seq(second_sig, alphabet)
                        if _verify_alphabet(first_seq) \
                        and _verify_alphabet(second_seq):
                            all_sigs = self._add_sig(all_sigs,
                                                     (first_sig, second_sig))

                    # if we are not being strict, just add the motif
                    else:
                        all_sigs = self._add_sig(all_sigs,
                                                 (first_sig, second_sig))

        return all_sigs
Example #7
0
    def _get_signature_dict(self, seq_records, sig_size, max_gap):
        """Return a dictionary with all signatures and their counts.

        Workhorse behind find_signatures.  For every record, each window of
        ``sig_size`` letters is paired with the ``sig_size``-letter windows
        starting 1..``max_gap`` positions later; each pair is handed to
        ``self._add_sig`` as a ``(first, second)`` tuple of strings.

        In strict mode (``self._alphabet_strict``) the first record's
        alphabet is the reference: all records must share it (asserted) and
        a pair is only counted if both halves pass ``_verify_alphabet``.
        """
        # the reference alphabet, or None when we are not being strict
        alphabet = seq_records[0].seq.alphabet if self._alphabet_strict else None

        # widest span a signature can cover
        span = sig_size * 2 + max_gap

        signatures = {}
        for record in seq_records:
            # if we are working with alphabets, make sure we are consistent
            if alphabet is not None:
                assert record.seq.alphabet == alphabet, \
                       "Working with alphabet %s and got %s" % \
                       (alphabet, record.seq.alphabet)

            for first_start in range(len(record.seq) - (span - 1)):
                first_half = str(record.seq[first_start:first_start + sig_size])

                second_starts = range(first_start + 1,
                                      (first_start + 1) + max_gap)
                for second_start in second_starts:
                    second_half = str(
                        record.seq[second_start: second_start + sig_size])

                    # strict mode: skip pairs with letters outside the alphabet
                    if alphabet is not None and not (
                            _verify_alphabet(Seq(first_half, alphabet))
                            and _verify_alphabet(Seq(second_half, alphabet))):
                        continue

                    signatures = self._add_sig(signatures,
                                               (first_half, second_half))

        return signatures
Example #8
0
def clean_dna_sequence(sequence, strict=False, alphabet=None):
  """Remove all whitespace from a sequence, optionally validating it.

  With strict=True the upper-cased result is checked with
  _verify_alphabet against `alphabet` (IUPAC.unambiguous_dna when None)
  and an Exception is raised on failure.  The returned sequence keeps its
  original letter case.
  """
  cleaned = re.sub(r'\s+', '', sequence.strip())
  if not strict:
    return cleaned

  expected = IUPAC.unambiguous_dna if alphabet is None else alphabet
  if _verify_alphabet(Seq(cleaned.upper(), expected)):
    return cleaned
  # throws exception if DNA is not valid
  raise Exception("Sequence %s contains illegal character. Expecting %s only." % (cleaned, expected.letters))
Example #9
0
 def checkProtein(self):
     """Check that every FASTA record in self.proteomeList is a protein.

     self.checkExtension() runs first.  For each record, '*' characters
     are removed and the remainder is checked with _verify_alphabet
     against IUPAC.extended_protein.

     Raises TypeError naming the file and record id of the first record
     that fails the check.
     """
     self.checkExtension()
     for i in self.proteomeList:
         for seq in SeqIO.parse(i, "fasta"):
             # str.translate(None, '*') only works on Python 2 byte strings;
             # replace() removes the '*' characters on Python 2 and 3 alike.
             protSeq = str(seq.seq).replace('*', '')
             seqObj = Seq(protSeq, IUPAC.extended_protein)
             my_seq = _verify_alphabet(seqObj)
             if my_seq is False:
                 raise TypeError(
                     "In file %s, sequence %s is not a protein" %
                     (i, seq.id))
Example #10
0
 def setSequenceRaw_slot(self, rawseq):
     """Set the current sequence to the given one

     The input is upper-cased and stripped of all whitespace, then checked
     with _verify_alphabet against IUPAC.unambiguous_dna.  On success it is
     stored in self.currentSequence and emitted through
     self.window.gotSequence; on a ValueError (including a failed check)
     self.errorMessage is shown with the first 64 characters of the input.
     """
     # raw string: '\s' in a plain literal is an invalid escape sequence
     rawseq = re.compile(r'[\s]').sub('', rawseq.upper())
     try:
         seq = Seq(rawseq, IUPAC.unambiguous_dna)
         if not _verify_alphabet(seq):
             raise ValueError("Alphabet Verification Failed!")
         self.currentSequence = seq
         self.window.gotSequence.emit(str(self.currentSequence))
     except ValueError:
         self.errorMessage("This doesn't look like DNA!\n\n" + str(rawseq[0:64]) + "...\n")
Example #11
0
def single_prediction(seq):
    """Predict the optimal temperature and pH for one protein sequence.

    The sequence must be longer than 20 characters and pass
    ``_verify_alphabet`` against ``IUPAC.protein``; otherwise a pair of
    error strings is returned instead of predictions.

    Returns a ``(temperature, pH)`` tuple taken from the first prediction
    of ``tempredict`` and ``phpredict`` respectively.
    """
    invalid = 'Not a Valid Protein Sequence!'
    if len(seq) <= 20 or not _verify_alphabet(Seq(seq, IUPAC.protein)):
        return invalid, invalid

    repaired = seq_repair(seq)
    features = np.array(feature_extraction(repaired))

    # each model gets its own feature selection over the shared features
    opt_ph = phpredict(ph_f_selection(features))[0]
    opt_temp = tempredict(temp_f_selection(features))[0]
    return opt_temp[0], opt_ph[0]
Example #12
0
def clean_sequence(sequence, strict=False, alphabet=None, exception=True):
  """Remove all whitespace from a sequence, optionally validating it.

  With strict=True the upper-cased result is checked with
  _verify_alphabet against `alphabet` (IUPAC.unambiguous_dna when None).
  On failure an Exception is raised when `exception` is True, otherwise
  None is returned.  A valid (or unchecked) sequence is returned with its
  original letter case.
  """
  cleaned = re.sub(r'\s+', '', sequence.strip())
  if strict:
    expected = alphabet if alphabet is not None else IUPAC.unambiguous_dna
    is_valid = _verify_alphabet(Seq(cleaned.upper(), expected))
    if not is_valid:
      if exception is not True:
        return None
      # throws exception if DNA is not valid
      raise Exception("Sequence %s contains illegal character. Expecting %s only." %\
                      (cleaned, expected.letters))
  return cleaned
Example #13
0
    def __validate(self, seqdata):
        """Parse FASTA input and check that every entry is a protein sequence.

        The input must be either a file-like object (has both ``read`` and
        ``name``) or a string holding the path of an existing file.

        Args:
            seqdata: a string (path to file) or file-like object.

        Returns:
            ``self.sequences``, after appending one ``(name, data)``
            namedtuple per parsed record.

        Raises:
            TypeError: if the input is not a string or a file-like object.
            OSError: if the input is a string that is not an existing file.
            ParseError: if a record fails the protein alphabet check.
        """

        _Sequence = namedtuple('Seq', ['name', 'data'])

        # file-like object
        # isinstance(obj, file) does not hold in Py3
        if hasattr(seqdata, 'read') and hasattr(seqdata, 'name'):
            fname = seqdata.name
            self.logger.debug('Reading data from file-like object {}'.format(fname))
        elif not isinstance(seqdata, basestring):
            raise TypeError('Sequence input format not recognized: {}'.format(seqdata))
        else:
            self.logger.debug('Reading data from file path {}'.format(seqdata))
            fname = seqdata
            # the string has to point at an existing file
            if not os.path.isfile(fname):
                raise OSError('Sequence file not found: {}'.format(seqdata))

        # parse and validate sequences; alphabet and format are kept in one
        # place in case more input types are supported later
        records = SeqIO.parse(seqdata, 'fasta', alphabet=IUPACProtein())
        for position, record in enumerate(records, start=1):
            if not _verify_alphabet(record.seq):
                msg = 'Entry #{} ({}) in {} is not a valid protein sequence'
                raise ParseError(msg.format(position, record.name, fname))

            self.sequences.append(_Sequence(record.name, str(record.seq)))

        return self.sequences
Example #14
0
def obtain_seq_from_ent(ent):
    """Fetch the protein sequence for an entry identifier.

    The entry is first requested from the Entrez "protein" database and
    parsed as a GenBank record.  If that fails for any reason, the
    sequence is obtained through ``pypro.GetProteinSequence`` instead.

    A message is printed when the resulting sequence fails the
    ``_verify_alphabet`` check against ``IUPAC.protein``; the sequence is
    returned either way.

    :param ent: entry identifier passed to both lookups
    :returns: the sequence as a string
    """
    from Bio import Entrez, SeqIO
    Entrez.email = "*****@*****.**"

    try:
        handle = Entrez.efetch(db="protein", id=ent, rettype="gp")
        try:
            record = SeqIO.read(handle, "gb")
        finally:
            # always release the handle, even when parsing fails
            handle.close()
        seq = str(record.seq)

    except Exception:
        # Deliberate best-effort fallback.  Catching Exception rather than
        # using a bare except lets KeyboardInterrupt/SystemExit propagate.
        from pydpi import pypro
        seq = str(pypro.GetProteinSequence(ent))

    if not _verify_alphabet(Seq(seq, IUPAC.protein)):
        print('Inserted Entry Is Not Valid!')
    return seq
Example #15
0
def site2dna(site):
    """Convert "site" to a DNA sequence

    site is a str
    returns a Bio.Seq.Seq

    If site is a key of enzymedict, the enzyme's "site" entry is used as
    the sequence; otherwise site itself is taken as the sequence.  Either
    way the result must pass the strict (unambiguous) DNA alphabet check,
    else ValueError is raised.
    """
    letters = enzymedict[site]["site"] if site in enzymedict else site
    dna = Seq(letters, unambiguous_dna)
    if _verify_alphabet(dna):
        return dna
    raise ValueError("site is not recognized enzyme and not strict DNA")
Example #16
0
    def _user_submits(
            self, seq_str, island_size_str, min_gc_ratio_str,
            min_obs_exp_cpg_ratio_str, algo_index):
        """Validate the submitted form values and start the island search.

        The sequence is upper-cased and checked against the unambiguous DNA
        alphabet, then the three numeric fields are parsed in order.  The
        first failing check is reported via ``self.view.show_error`` and the
        method returns without computing anything.

        :param seq_str: the sequence as a string
        :type seq_str: :class:`str`
        :param island_size_str: number of bases which an island may contain
        :type island_size_str: :class:`str`
        :param min_gc_ratio_str: the ratio of GC to other bases
        :type min_gc_ratio_str: :class:`str`
        :param min_obs_exp_cpg_ratio_str: minimum observed/expected CpG ratio
        :type min_obs_exp_cpg_ratio_str: :class:`str`
        :param algo_index: the algorithm chosen
        :type algo_index: :class:`int`
        """
        seq = Seq(seq_str, IUPAC.unambiguous_dna).upper()
        # `_verify_alphabet' is marked private, which makes relying on it
        # questionable, but there is no other documented way to verify
        # the sequence.
        if not _verify_alphabet(seq):
            self.view.show_error(
                'Sequence letters not within alphabet:\n'
                '  Alphabet: {0}\n'
                '  Sequence: {1}'.format(seq.alphabet.letters, str(seq)))
            return

        # (converter, raw text, error template), checked in this order
        numeric_fields = (
            (int, island_size_str,
             'Invalid integer for island size: {0}'),
            (float, min_gc_ratio_str,
             'Invalid ratio for GC: {0}'),
            (float, min_obs_exp_cpg_ratio_str,
             'Invalid ratio for minimum observed/expected '
             'CpG ratio: {0}'),
        )
        parsed = []
        for convert, raw, template in numeric_fields:
            try:
                parsed.append(convert(raw))
            except ValueError:
                self.view.show_error(template.format(raw))
                return

        island_size, min_gc_ratio, min_obs_exp_cpg_ratio = parsed
        self.model.compute_islands(
            SeqRecord(seq), island_size, min_gc_ratio,
            min_obs_exp_cpg_ratio, algo_index)
Example #17
0
def get_Short(genesList):
    """Creates a short version of each fasta file with only the 1st allele

    For every path in genesList a "short" sub-directory is created next to
    the file (if missing) and the first record, upper-cased, is written
    there under the same file name with ".fasta" replaced by
    "_short.fasta".

    Raises Exception when genesList is empty or when a first allele fails
    the _verify_alphabet check.  Returns True once every file is written.
    """
    if not genesList:
        raise Exception("An empty list was provided. Stopping execution...")

    for gene in genesList:
        short_dir = os.path.join(os.path.dirname(gene), "short")
        if not os.path.exists(short_dir):
            os.makedirs(short_dir)

        short_path = os.path.join(short_dir, os.path.basename(gene))
        short_path = short_path.replace(".fasta", "_short.fasta")

        # only the first record of the file is needed
        first_allele = next(
            SeqIO.parse(gene, "fasta", IUPAC.unambiguous_dna))
        allele_seq = first_allele.seq.upper()

        if not _verify_alphabet(allele_seq):
            print(
                "The DNA sequence has invalid nucleotides. Execution will not be stopped."
            )
            raise Exception(
                "The DNA sequence has invalid nucleotides. Execution will not be stopped."
            )

        with open(short_path, "w") as short_file:
            short_file.write('>' + str(first_allele.id) + '\n' +
                             str(allele_seq) + '\n')

    return True
Example #18
0
    def clean(self):
        """Validate the uploaded reference fasta file.

        All records are parsed and the following are checked: at least one
        record exists, every reference is within 500 nt of the first
        (primary) reference in length, there are at most 100 references,
        and every sequence passes ``_verify_alphabet`` for the form's
        alphabet.

        Returns the cleaned data.  Raises ValidationError carrying every
        failed check; an empty file additionally records an error on the
        'fasta' field.
        """
        cleaned_data = super(JobForm, self).clean()
        # A field that failed its own validation is missing from
        # cleaned_data.  Its error is already recorded, so skip the
        # cross-checks instead of failing with a KeyError.
        fasta = cleaned_data.get('fasta')
        if fasta is None:
            return cleaned_data

        non_field_errors = []
        alphabet = AlphabetEncoder(IUPAC.unambiguous_dna, 'N')
        references = list(SeqIO.parse(fasta, 'fasta', alphabet=alphabet))

        if not references:
            e = ValidationError(
                "File does not contain any valid fasta records. Descriptor line "
                "should start with >"
            )
            self.add_error('fasta', e)
            non_field_errors.append(e)
            # nothing else can be checked without records
            raise ValidationError(non_field_errors)

        primary_ref = references[0]
        primary_ref_len = len(primary_ref)

        if any(abs(len(r) - primary_ref_len) > 500 for r in references):
            e = ValidationError(
                "One or more of your references is too different in length to "
                "the primary (first) reference. The maximum difference is 500 nt",
                code='invalid')
            non_field_errors.append(e)

        if not 1 <= len(references) <= 100:
            e = ValidationError(
                "Between 1 and 100 reference genomes are required in your fasta file. "
                "We recommend selecting a candidate reference from each lineage "
                "of interest, rather than many similar references.", code='invalid')
            non_field_errors.append(e)

        if any(not _verify_alphabet(r.seq) for r in references):
            e = ValidationError(
                "One or more of your fasta sequences contain invalid nucleotide codes. "
                "The supported alphabet is '{}'. Ambiguity codes and gaps are not "
                "currently supported.".format(alphabet.letters), code='invalid')
            non_field_errors.append(e)

        if non_field_errors:
            raise ValidationError(non_field_errors)

        return cleaned_data
Example #19
0
    def _user_submits(self, seq_str, island_size_str, min_gc_ratio_str,
                      min_obs_exp_cpg_ratio_str, algo_index):
        """Called when the user submits the form.

        Checks the upper-cased sequence against the unambiguous DNA
        alphabet, parses the numeric fields in order, and hands everything
        to ``self.model.compute_islands``.  The first problem found is shown
        with ``self.view.show_error`` and the submission is dropped.

        :param seq_str: the sequence as a string
        :type seq_str: :class:`str`
        :param island_size_str: number of bases which an island may contain
        :type island_size_str: :class:`str`
        :param min_gc_ratio_str: the ratio of GC to other bases
        :type min_gc_ratio_str: :class:`str`
        :param min_obs_exp_cpg_ratio_str: minimum observed/expected CpG ratio
        :type min_obs_exp_cpg_ratio_str: :class:`str`
        :param algo_index: the algorithm chosen
        :type algo_index: :class:`int`
        """
        show_error = self.view.show_error

        seq = Seq(seq_str, IUPAC.unambiguous_dna).upper()
        # Relying on the private `_verify_alphabet' is questionable, but no
        # other documented way to verify the sequence exists.
        if not _verify_alphabet(seq):
            show_error(
                'Sequence letters not within alphabet:\n'
                '  Alphabet: {0}\n'
                '  Sequence: {1}'.format(seq.alphabet.letters, str(seq)))
            return

        # parse the numeric inputs in order, stopping at the first bad one
        checks = [
            (int, island_size_str, 'Invalid integer for island size: {0}'),
            (float, min_gc_ratio_str, 'Invalid ratio for GC: {0}'),
            (float, min_obs_exp_cpg_ratio_str,
             'Invalid ratio for minimum observed/expected '
             'CpG ratio: {0}'),
        ]
        numbers = []
        for caster, text, complaint in checks:
            try:
                number = caster(text)
            except ValueError:
                show_error(complaint.format(text))
                return
            numbers.append(number)

        island_size, min_gc_ratio, min_obs_exp_cpg_ratio = numbers
        self.model.compute_islands(SeqRecord(seq), island_size, min_gc_ratio,
                                   min_obs_exp_cpg_ratio, algo_index)
Example #20
0
def process_fasta(file_path):
    """
    Parse and validate the fasta file.

    Each record is rebuilt with gaps ('-') removed, letters upper-cased and
    the record id used as both id and description.  The result must hold
    between 1 and 100 references, each within 500 nt of the first one in
    length, and each passing ``_verify_alphabet``.

    Returns the list of rebuilt records; raises ValueError on the first
    failed check.
    """
    alphabet = AlphabetEncoder(IUPAC.unambiguous_dna, 'N')

    # Remove gaps, set alphabet
    references = [
        SeqRecord(
            Seq(str(record.seq).replace('-', '').upper(), alphabet),
            id=record.id,
            description=record.id)
        for record in SeqIO.parse(file_path, 'fasta')  # may raise
    ]

    # Check for too few or too many references
    reference_count = len(references)
    if reference_count < 1 or reference_count > 100:
        raise ValueError('Between 1 and 100 reference genomes are required.')

    # Check for max difference in length between references
    primary_length = len(references[0])
    for reference in references:
        if abs(len(reference) - primary_length) > 500:
            raise ValueError(
                'One or more of your references is too different in length to '
                'the primary (first) reference. The maximum difference is '
                '500 nt.')

    # Check for a valid alphabet
    if not all(_verify_alphabet(reference.seq) for reference in references):
        raise ValueError(
            'One or more of your fasta sequences contain invalid '
            "nucleotide codes. The supported alphabet is '{}'. "
            'Ambiguity codes and gaps are not currently supported.'.format(
                alphabet.letters))

    return references
Example #21
0
def verify_alphabet(records):
    """Report records containing letters outside the extended protein alphabet.

    For every record that does not pass ``_verify_alphabet`` against
    ``IUPAC.extended_protein``, one line is printed: the record name, a
    tab, and the offending characters (in order of occurrence, repeats
    included) joined by '-'.  Nothing is returned.
    """
    from Bio.Seq import Seq
    from Bio.Data import IUPACData
    from Bio.Alphabet import _verify_alphabet
    from Bio.Alphabet import IUPAC

    # set for constant-time membership tests
    letters = set(IUPACData.extended_protein_letters)

    for record in records:
        raw_sequence = str(record.seq)
        my_seq = Seq(raw_sequence, IUPAC.extended_protein)
        if _verify_alphabet(my_seq) is True:
            continue

        illegal_char = [
            letter for letter in raw_sequence if letter not in letters
        ]
        # print as a call with one argument: valid on Python 2 and 3
        print('%s\t%s' % (record.name, '-'.join(illegal_char)))
Example #22
0
    def _get_motif_dict(self, seq_records, motif_size):
        """Return a dictionary with information on motifs.

        Does the real work of motif finding: every window of ``motif_size``
        letters in every record is handed to ``self._add_motif``, whose
        return value becomes the running dictionary.  Kept internal so that
        find_motif_differences can reuse it.

        In strict mode (``self.alphabet_strict``) the first record's
        alphabet is the reference: all records must share it (asserted) and
        a motif is only counted if it passes ``_verify_alphabet``.
        """
        # the reference alphabet, or None when we are not being strict
        alphabet = seq_records[0].seq.alphabet if self.alphabet_strict else None

        motif_counts = {}
        for record in seq_records:
            # if we are working with alphabets, make sure we are consistent
            if alphabet is not None:
                assert record.seq.alphabet == alphabet, "Working with alphabet %s and got %s" % (
                    alphabet,
                    record.seq.alphabet,
                )

            window_count = len(record.seq) - (motif_size - 1)
            for offset in range(window_count):
                motif = str(record.seq[offset : offset + motif_size])

                # strict mode: skip motifs with letters outside the alphabet
                if alphabet is not None and not _verify_alphabet(
                        Seq(motif, alphabet)):
                    continue

                motif_counts = self._add_motif(motif_counts, motif)

        return motif_counts
Example #23
0
    def _get_motif_dict(self, seq_records, motif_size):
        """Return a dictionary with information on motifs.

        Slides a window of ``motif_size`` letters over each record and
        passes every window to ``self._add_motif``; the dictionary it
        returns is carried forward and finally returned.  Internal so that
        find_motif_differences can share it.

        When ``self.alphabet_strict`` is set, the alphabet of the first
        record is taken as the reference: each record is asserted to carry
        it, and only windows passing ``_verify_alphabet`` are counted.
        """
        if self.alphabet_strict:
            reference_alphabet = seq_records[0].seq.alphabet
        else:
            reference_alphabet = None

        found = {}
        for record in seq_records:
            # if we are working with alphabets, make sure we are consistent
            if reference_alphabet is not None:
                assert record.seq.alphabet == reference_alphabet, \
                       "Working with alphabet %s and got %s" % \
                       (reference_alphabet, record.seq.alphabet)

            last_start = len(record.seq) - (motif_size - 1)
            for begin in range(last_start):
                end = begin + motif_size
                window = str(record.seq[begin:end])

                if reference_alphabet is None:
                    # not strict: every window counts
                    found = self._add_motif(found, window)
                elif _verify_alphabet(Seq(window, reference_alphabet)):
                    # strict: only windows inside the alphabet count
                    found = self._add_motif(found, window)

        return found
Example #24
0
def count_alignment(alignment, columns=None, refidx=None, limit=100):
    """Count letters per alignment column, detecting the alphabet first.

    Up to ``limit + 1`` records (the record at ``refidx`` excluded) are
    sampled to decide whether the alignment is DNA, RNA or amino acids:
    each candidate alphabet is assigned to the sampled sequences and kept
    if all of them pass ``_verify_alphabet`` (upper-cased).  Note that
    this overwrites ``seq.alphabet`` on the sampled records.

    Every record except the one at ``refidx`` is then streamed,
    upper-cased, into ``_count`` together with a weight matrix in which a
    plain letter counts 1 for itself and an ambiguity code is spread
    evenly over the letters it stands for.

    Args:
        alignment: iterable of records exposing ``.seq``
        columns: optional column indices, converted to an int array
        refidx: index of a record to leave out, or None
        limit: sampling bound for alphabet detection

    Returns:
        ``(counts.transpose(), (letters, colors))`` where ``counts`` is
        whatever ``_count`` returns (presumably an array -- it is
        transposed here).

    Raises:
        RuntimeError: if no candidate alphabet fits the sampled records.
    """
    # One enumerator is shared by the sampling loop and the streaming
    # generator, so indices keep counting correctly for the refidx check
    # and no record is consumed without being used.
    indexed = enumerate(alignment)
    records = []

    while len(records) <= limit:
        item = next(indexed, None)
        if item is None:
            break
        if item[0] == refidx:
            continue
        records.append(item)

    alph = None
    for alph_ in (DNA_ALPHABET, RNA_ALPHABET, AMINO_ALPHABET):
        for _, r in records:
            r.seq.alphabet = alph_
        if all(_verify_alphabet(r.seq.upper()) for _, r in records):
            alph = alph_
            break

    if alph is None:
        raise RuntimeError('unknown alphabet')

    skips = _GAP
    if alph in (DNA_ALPHABET, RNA_ALPHABET):
        T = 'T' if alph == DNA_ALPHABET else 'U'
        letters = 'ACG' + T
        ambigs = {
            'M': 'AC',
            'R': 'AG',
            'W': 'A' + T,
            'S': 'CG',
            'Y': 'C' + T,
            'K': 'G' + T,
            'V': 'ACG',
            'H': 'AC' + T,
            'D': 'AG' + T,
            'B': 'CG' + T,
        }
        skips += 'N'
        colors = DNA_COLORS
    elif alph == AMINO_ALPHABET:
        letters = 'ACDEFGHIKLMNPQRSTVWY'
        ambigs = {
            'B': 'DN',
            'J': 'IL',
            'Z': 'EQ',
        }
        skips += _STOP + 'X' + 'OU'
        colors = AMINO_COLORS
    else:
        raise RuntimeError("sequences with indeterminable alphabet provided")

    def b(r):
        return r.upper()

    def allrecords():
        # first the sampled records, then whatever is left in the shared
        # enumerator (which carries on with the correct indices)
        for _, r in records:
            yield b(str(r.seq))
        for i, r in indexed:
            if i == refidx:
                continue
            yield b(str(r.seq))

    alphabet = letters + ''.join(sorted(ambigs.keys()))
    nchar = len(alphabet)
    # row: observed character; column: plain letter it contributes to
    values = np.zeros((nchar, len(letters)), dtype=float)

    for i, c in enumerate(alphabet):
        if i < len(letters):
            values[i, i] = 1.
        else:
            v = 1. / len(ambigs[c])
            for d in ambigs[c]:
                values[i, alphabet.index(d)] = v

    if columns is not None:
        columns = np.array(columns, dtype=int)

    counts = _count(
        allrecords(),
        columns,
        b(alphabet),
        values
        )

    return counts.transpose(), (letters, colors)
Example #25
0
    def test_all_even_numbered_lines_are_dna_sequences(self):
        """Every second line of self.lines (starting with the second) must
        consist only of the letters G, A, T, C and N."""
        class DNAdict():
            # minimal alphabet stand-in: only `letters` is provided
            letters = 'GATCN'

        sequences = []
        for line in self.lines[1::2]:
            sequences.append(Seq(line, DNAdict))

        every_line_valid = all(
            _verify_alphabet(sequence) for sequence in sequences)
        self.assertTrue(every_line_valid)
    filename = infl.split("\t")[-1]
    name = filename.split(".")[0]
    dirpath = os.path.join(cudir + "/" + name)

    if os.path.exists(dirpath) and os.path.isdir(dirpath):
        shutil.rmtree(dirpath)
    os.mkdir(cudir + "/" + name)
    subprocess.call(["cp", infl, cudir + "/input_file/"])
    fasta_sequence = SeqIO.parse(open(cudir + "/input_file/" + filename),
                                 "fasta")
    for fasta in fasta_sequence:

        name1, sequence = fasta.id, str(fasta.seq)

        my_prot = Seq(sequence, IUPAC.protein)
        if _verify_alphabet(my_prot) == False:
            with open(cudir + "/false_sequence444.fasta", "a") as handle:
                count = SeqIO.write(fasta, handle, "fasta")
            continue
        else:
            ##print(my_prot, _verify_alphabet(my_prot))
            with open(cudir + "/" + name + "/" + name + ".fasta",
                      "a") as handle:
                count = SeqIO.write(fasta, handle, "fasta")
    fasta_sequence = SeqIO.parse(
        open(cudir + "/" + name + "/" + name + ".fasta"), "fasta")

    for fasta in fasta_sequence:
        with open(name + "_Final_Resultant.txt", "a") as final_fl:
            with open(name + "_Final_Resultant_values.txt", "a") as inter_fl:
                #print fasta
def global_align(seq_record1, seq_record2):
    """Globally align two sequence records with the EMBOSS ``needle`` program.

    Both records are upper-cased and written to temporary FASTA files, which
    are passed to ``NeedleCommandline``.  The sequences are first checked
    against ``IUPAC.ambiguous_dna`` and then against ``IUPAC.protein`` using
    ``_verify_alphabet``; the alignment is run for the first alphabet that
    both sequences satisfy.

    Side effects on the arguments: ``.seq`` of each record is replaced by its
    upper-cased version and ``.seq.alphabet`` is overwritten with the alphabet
    that was tried last.

    :param seq_record1: first record (needs ``.seq`` and must be writable by
        ``SeqIO.write`` in FASTA format).
    :param seq_record2: second record, same requirements.
    :returns: the alignment parsed from needle's ``srspair`` output by
        ``AlignIO.read(..., "emboss")``.
    :raises ValueError: if the sequences are valid neither as ambiguous DNA
        nor as protein.
    """

    from Bio.Alphabet import IUPAC
    from Bio.Alphabet import _verify_alphabet

    seq_record1.seq = seq_record1.seq.upper()
    seq_record2.seq = seq_record2.seq.upper()

    # The context manager closes both temporary files on every exit path
    # (return or exception) instead of leaving that to garbage collection.
    with NamedTemporaryFile() as seq1_file, NamedTemporaryFile() as seq2_file:
        SeqIO.write(seq_record1, seq1_file, "fasta")
        seq1_file.flush()
        SeqIO.write(seq_record2, seq2_file, "fasta")
        seq2_file.flush()

        # DNA is tried first; needle is invoked with identical options for
        # both alphabets, so a single code path serves both cases.
        for alphabet in (IUPAC.ambiguous_dna, IUPAC.protein):
            seq_record1.seq.alphabet = alphabet
            seq_record2.seq.alphabet = alphabet

            if not (_verify_alphabet(seq_record1.seq)
                    and _verify_alphabet(seq_record2.seq)):
                continue

            needle_cline = NeedleCommandline(asequence=seq1_file.name,
                                             bsequence=seq2_file.name,
                                             stdout=True,
                                             gapopen=10,
                                             gapextend=0.5,
                                             auto=True,
                                             aformat="srspair")
            stdout, _ = needle_cline()
            return AlignIO.read(StringIO.StringIO(stdout), "emboss")

    # Raising a plain string is not a valid exception; use a real exception
    # class so callers receive the intended message.
    raise ValueError("unknown alphabet!")
Example #28
0
 def test_all_sequences_are_legitimate_dna_sequences(self):
     """Every value stored in ``self.seqs`` must use only G, A, T, C or N."""
     # Minimal stand-in for an alphabet: only ``letters`` is supplied, and the
     # class itself (not an instance) is handed to Seq.
     class DNAdict:
         letters = 'GATCN'

     candidates = [Seq(raw, DNAdict) for raw in self.seqs.values()]
     verdicts = (_verify_alphabet(candidate) for candidate in candidates)
     self.assertTrue(all(verdicts))
#import Bio
import Bio.Seq
from Bio.Alphabet import _verify_alphabet
#help(Bio.SeqIO)
#help(Bio.SeqIO.FastaIO)
#from Bio.Align.Applications import ClustalwCommandline
#help(Bio.Align)
# Collect every FASTA record in ``part_name`` whose sequence is a valid IUPAC
# protein sequence but is NOT valid as unambiguous RNA or unambiguous DNA,
# then print how many such records were found.
part_name = '/home/nastia/Desktop/fasta/'
list_fasta = os.listdir(part_name)
list_finish = []
for entry in list_fasta:
    # Open each file in a context manager so its handle is closed as soon as
    # the file has been parsed (previously one handle leaked per file).
    with open(part_name + entry) as handle:
        data = Bio.SeqIO.FastaIO.SimpleFastaParser(handle)
        # ``val`` is a (title, sequence) tuple; the parser is lazy, so it must
        # be consumed while the file is still open.
        for val in data:
            my_seq = Bio.Seq.Seq(val[1], Bio.Alphabet.IUPAC.unambiguous_rna)
            my_seq_1 = Bio.Seq.Seq(val[1], Bio.Alphabet.IUPAC.unambiguous_dna)
            my_seq_2 = Bio.Seq.Seq(val[1], Bio.Alphabet.IUPAC.protein)
            if (not _verify_alphabet(my_seq)
                    and not _verify_alphabet(my_seq_1)
                    and _verify_alphabet(my_seq_2)):
                list_finish.append(val)
print(len(list_finish))
#for record in list_finish:
    #print(record.alphabet)
    #print(list_finish)
    #cline = ClustalwCommandline('clustalo', infile=list_finish)
#for entry in list_finish:
#cline = MuscleCommandline(input=list_finish, output='/home/nastia/Desktop/fasta.txt')
    #from Bio import AlignIO
    #align = AlignIO.read('/home/nastia/fasta/opuntia.aln', 'clustal')
#/home/nastia/Desktop/fasta
Example #30
0
    def build_fasta(self, organism, process_type, output_file):
        """Write a FASTA file of validated IEDB epitopes for one organism.

        Reads the CSV at ``self.input_file`` (skipping its first row), keeps
        the rows that are positive, class I, linear peptides of the requested
        process type and whose ``Name`` matches ``organism``, discards
        peptides that are not valid ``IUPAC.protein`` sequences, removes
        duplicated peptide sequences (keeping the last occurrence) and writes
        the remainder to ``output_file``.

        Each record is written as::

            >epitope_id|Antigen Name|antigen_id|Organism Name|organism_id
            PEPTIDESEQUENCE

        :param organism: pattern matched against the ``Name`` column with
            ``Series.str.contains``.
        :param process_type: exact value required in the ``Process Type``
            column.
        :param output_file: path of the FASTA file to create/overwrite.
        """
        # read IEDB input file
        iedb = pd.read_csv(self.input_file, skiprows=1)

        # filter entries; take a copy so that the column assignments below
        # act on an independent frame instead of a slice of ``iedb``
        filtered_iedb = iedb[(iedb["Name"].str.contains(organism))
                             & (iedb["Object Type"] == "Linear peptide")
                             & (iedb["Process Type"] == process_type)
                             & (iedb["Qualitative Measure"] == "Positive")
                             & (iedb["Class"] == "I")].copy()

        # parses peptides and validates them, non-valid peptides are filtered out
        filtered_iedb["seq"] = filtered_iedb["Description"].transform(
            lambda x: x.strip())
        filtered_iedb["valid_peptide"] = filtered_iedb["seq"].transform(
            lambda x: _verify_alphabet(Seq(x, IUPAC.protein)))
        filtered_iedb = filtered_iedb[filtered_iedb.valid_peptide].copy()

        # Strip the IRI prefixes to obtain bare identifiers.  Series.replace
        # is called directly on the column: the ``regex=True`` keyword is only
        # understood by the Series method, not by ``str.replace``.
        filtered_iedb["epitope_id"] = filtered_iedb["Epitope IRI"].replace(
            "http://www.iedb.org/epitope/", "", regex=True)
        filtered_iedb["antigen_id"] = filtered_iedb["Antigen IRI"].replace(
            "http://www.ncbi.nlm.nih.gov/protein/", "", regex=True).replace(
                "https://ontology.iedb.org/ontology/", "", regex=True)
        filtered_iedb["organism_id"] = filtered_iedb["Organism IRI"].replace(
            "http://purl.obolibrary.org/obo/NCBITaxon_", "", regex=True)

        filtered_iedb = filtered_iedb.drop_duplicates(subset="seq", keep="last")

        # writes output FASTA file
        # header: 449|FL-160-2 protein - Trypanosoma cruzi|JH0823|Trypanosoma cruzi|5693
        # i.e. epitope id|Antigen Name|antigen_id|Organism Name|organism_id
        # The leading ">" is emitted exactly once, here.
        with open(output_file, "w") as fasta:
            for _, row in filtered_iedb.iterrows():
                fasta.write(
                    ">{epitope_id}|{antigen_name}|{antigen_id}|{organism_name}|{organism_id}\n"
                    .format(
                        epitope_id=str(row["epitope_id"]),
                        antigen_name=row["Antigen Name"],
                        antigen_id=str(row["antigen_id"]),
                        organism_name=row["Organism Name"],
                        organism_id=str(row["organism_id"]),
                    ))
                fasta.write("{sequence}\n".format(sequence=str(row["seq"])))
Example #31
0
from Bio.Alphabet import _verify_alphabet
#help(Bio.SeqIO)
#help(Bio.SeqIO.FastaIO)
#from Bio.Align.Applications import ClustalwCommandline
#help(Bio.Align)
# Collect every FASTA record in ``part_name`` whose sequence is a valid IUPAC
# protein sequence but is NOT valid as unambiguous RNA or unambiguous DNA,
# then print how many such records were found.
part_name = '/home/nastia/Desktop/fasta/'
list_fasta = os.listdir(part_name)
list_finish = []
for entry in list_fasta:
    # Open each file in a context manager so its handle is closed as soon as
    # the file has been parsed (previously one handle leaked per file).
    with open(part_name + entry) as handle:
        data = Bio.SeqIO.FastaIO.SimpleFastaParser(handle)
        # ``val`` is a (title, sequence) tuple; the parser is lazy, so it must
        # be consumed while the file is still open.
        for val in data:
            my_seq = Bio.Seq.Seq(val[1], Bio.Alphabet.IUPAC.unambiguous_rna)
            my_seq_1 = Bio.Seq.Seq(val[1], Bio.Alphabet.IUPAC.unambiguous_dna)
            my_seq_2 = Bio.Seq.Seq(val[1], Bio.Alphabet.IUPAC.protein)
            if (not _verify_alphabet(my_seq)
                    and not _verify_alphabet(my_seq_1)
                    and _verify_alphabet(my_seq_2)):
                list_finish.append(val)
print(len(list_finish))
#for record in list_finish:
#print(record.alphabet)
#print(list_finish)
#cline = ClustalwCommandline('clustalo', infile=list_finish)
#for entry in list_finish:
#cline = MuscleCommandline(input=list_finish, output='/home/nastia/Desktop/fasta.txt')
#from Bio import AlignIO
#align = AlignIO.read('/home/nastia/fasta/opuntia.aln', 'clustal')
#/home/nastia/Desktop/fasta
Example #32
0
def graph_logo(
    alignment,
    columns,
    filename=None,
    dpi=None, edgecolor='k', figsize=None, format='pdf', labels=None, linewidth=0., transparent=True,
    refidx=-1
):
    """Draw a logo-style plot for selected alignment columns and save it.

    For every requested column the letter frequencies are turned into stacked
    letter glyphs whose total height is the column's information content in
    bits; the figure is written to ``filename``.

    Parameters
    ----------
    alignment :
        Alignment object.  It is sliced (``msa[:refidx]``), extended,
        iterated for records exposing ``.seq`` and indexed by column
        (``alignment[:, i]``) -- presumably a Biopython
        ``MultipleSeqAlignment``; TODO confirm.
    columns :
        Sequence of 0-based column indices to draw.
    filename : optional
        Output path.  When ``None`` a temporary file is created with
        ``mkstemp`` and its path is used.
    dpi, figsize : optional
        Passed to ``plt.figure``; ``figsize`` defaults to ``(3, 3)``.
    edgecolor, linewidth : optional
        Outline colour and width applied to every letter glyph.
    format : optional
        Output format passed to ``fig.savefig``.
    labels : optional
        One x-axis label per entry of ``columns``; defaults to the 1-based
        column numbers.
    transparent : optional
        When true the figure and axes patches get alpha 0; the flag is also
        passed to ``fig.savefig``.
    refidx : optional
        When ``>= 0`` the row at this index is removed from the alignment
        before anything is counted.

    Returns
    -------
    The path the figure was saved to (``filename``).

    Raises
    ------
    RuntimeError
        If the records validate against none of the DNA, RNA or amino-acid
        alphabets.

    Notes
    -----
    ``r.seq.alphabet`` of every record in the (possibly reduced) alignment is
    overwritten while the alphabet is being detected.
    """
    if filename is None:
        # only the path is needed; the descriptor returned by mkstemp is closed right away
        fd, filename = mkstemp(); close(fd)

    if figsize is None:
        figsize = (3, 3)

    if labels is None:
        labels = ['%d' % (idx + 1) for idx in columns]

    if refidx >= 0:
        # rebuild the alignment without the reference row
        msa = alignment
        alignment = msa[:refidx]
        alignment.extend(msa[refidx + 1:])

    # M: number of sequences counted, N: number of columns drawn
    M = len(alignment)
    N = len(columns)

    # try DNA, then RNA, then amino acids; the first alphabet that every
    # (upper-cased) record validates against wins
    alph = None
    for _alph in (_DNA_ALPHABET, _RNA_ALPHABET, _AMINO_ALPHABET):
        for r in alignment:
            r.seq.alphabet = _alph
        if all([_verify_alphabet(r.seq.upper()) for r in alignment]):
            alph = _alph
            break
    if alph is None:
        raise RuntimeError("sequences with indeterminable alphabet provided")

    motif = Motif(alphabet=alph)

    # one instance per record: that record's letters at the selected columns, upper-cased
    instances = (''.join(z).upper() for z in zip(*[alignment[:, i] for i in columns]))
    for instance in instances:
        motif.add_instance(Seq(instance, alph))

    # set laplace = True to include the backgrounds
    pwm = _fix_ambigs(motif.pwm(laplace=False), alph)

    # heuristic to determine whether nucleotide or protein alphabet
    # need to use either base 4 or 20 depending
    # (the column with the most PWM entries supplies the letter set)
    alphlen, _alphkeys = max(((len(pwm[i]), pwm[i].keys()) for i in range(N)), key=itemgetter(0))
    s, colors = (4, _DNA_COLORS) if alphlen < 20 else (20, _AMINO_COLORS)
    # index 0 is reserved for "no letter" so that a zero in `identities` means "draw nothing"
    alphkeys = ['']
    alphkeys.extend(_alphkeys)
    alphmap = dict(zip(alphkeys, range(len(alphkeys))))

    # compute the information content at each position:
    # R[j] = maxbits - H[j] - e_n, where H[j] is the entropy (in bits) of
    # column j's frequencies and e_n is a correction term that shrinks as the
    # number of sequences M grows
    maxbits = np.log2(s)
    e_n = (s - 1) / (2. * np.log(2) * M)
    R = maxbits * np.ones((N,), dtype=float)
    R -= [-sum(v * np.log2(v) for _, v in pwm[i].items() if v > 0.) for i in range(N)]
    R -= e_n

    # heights[i, j]: height of the i-th stacked letter in column j
    # identities[i, j]: index into alphkeys of that letter (0 = none)
    heights = np.zeros((alphlen, N), dtype=float)
    identities = np.zeros((alphlen, N), dtype=int)

    for j in range(N):
        i = 0
        # ascending frequency: the most frequent letter is stacked last, i.e. on top
        for k, v in sorted(pwm[j].items(), key=itemgetter(1)):
            heights[i, j] = R[j] * v
            identities[i, j] = alphmap[k]
            i += 1

    font = Basefont(join(_HY454_FONT_PATHS[0], 'Roboto-Black.ttf'))

    fig = plt.figure(figsize=figsize, dpi=dpi)

    # make each column a vertical golden rect
    rect = 0.2, 0.2, 0.382 * N, 0.618
    ax = fig.add_axes(rect)

    _adjust_spines_outward(ax, ('left',), 9)

    ax.set_ylabel('bits', fontproperties=_ROBOTO_REGULAR)

    # NOTE(review): figsize was already defaulted to (3, 3) above, so this
    # branch can never run -- confirm which behaviour is intended
    if figsize is None:
        fig.set_figwidth(N)

    if transparent:
        fig.patch.set_alpha(0.)
        ax.patch.set_alpha(0.)

    # remove the top and right ticks
    for tick in ax.xaxis.get_major_ticks() + ax.yaxis.get_major_ticks():
        tick.tick2On = False

    # remove the bottom ticks
    for tick in ax.xaxis.get_major_ticks():
        tick.tick1On = False

    # rotate the x-axis labels by 45 degrees to enhance packing
    for label in ax.xaxis.get_ticklabels():
        label.set_rotation(45)

    # set font properties
    for label in ax.xaxis.get_ticklabels() + ax.yaxis.get_ticklabels():
        label.set_fontproperties(_ROBOTO_REGULAR)

    # disable top and right spines, we don't need them
    # (the bottom spine is hidden as well)
    ax.spines['bottom'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    def format_xlabel(x, pos=None):
        # map a tick position to its label, clamping to the valid label range
        idx = np.clip(int(x)-1, 0, N-1)
        return labels[idx]

    ax.xaxis.set_major_formatter(FuncFormatter(format_xlabel))
    # avoid too much precision
    ax.yaxis.set_major_formatter(FormatStrFormatter('%1.1f'))

    # set the ticks
    ysep = 0.5 if alphlen < 20 else 1.0
    yticks = np.arange(0, maxbits, ysep, dtype=float)
    # make maxbits the topmost tick: replace the last tick when it is closer
    # than one step to maxbits, otherwise append maxbits
    if maxbits - yticks[-1] < ysep:
        yticks[-1] = maxbits
    else:
        yticks = np.append(yticks, maxbits)
    ax.set_yticks(yticks)
    # x ticks sit in the middle of each unit-wide column
    ax.set_xticks(np.arange(1, N+1, dtype=float) + 0.5)

    # set the axes limits here AFTER the ticks, otherwise borkage
    ax.set_xlim((1, N+1))
    ax.set_ylim((0, maxbits))

    idxs = np.arange(1, N+1)
    bottoms = np.zeros((N,), dtype=float)
    # draw one layer of bars per stacking level; each bar that carries a
    # letter is hidden and replaced by that letter's glyph, which reuses the
    # bar's transform
    for i in range(alphlen):
        bars = ax.bar(idxs, heights[i, :], width=1., bottom=bottoms)
        bottoms += heights[i, :]
        for j, bar in enumerate(bars):
            if identities[i, j]:
                l = alphkeys[identities[i, j]]
                glyph = font[l]
                ax.add_patch(glyph)
                glyph.set_transform(bar.get_transform())
                bar.set_visible(False)
                glyph.set_edgecolor(edgecolor)
                glyph.set_facecolor(colors[l])
                glyph.set_linewidth(linewidth)
                glyph.set_zorder(-1)

    # set the remaining spine to show the maximum value
    ax.spines['left'].set_bounds(0, max(bottoms))

    fig.savefig(filename, format=format, transparent=transparent, bbox_inches='tight', pad_inches=0.25)

    return filename
Example #33
0
def count_alignment(alignment, columns='all', refidx=None, limit=100, embedded_counts = None):
    """Count (optionally weighted) letter occurrences per alignment column.

    The alphabet (DNA, RNA or amino acid) is detected from the first
    ``limit + 1`` usable records, which are buffered; all records are then
    tallied.  Ambiguity codes spread their weight evenly over the letters
    they stand for; gaps and the "unknown" symbols of the detected alphabet
    are ignored.

    :param alignment: iterable of records; each record needs ``.seq`` (with a
        settable ``alphabet`` and ``upper()``), ``.name`` when
        ``embedded_counts`` is used, and integer indexing that returns one
        letter.  It is consumed through a single iterator, so both one-shot
        iterators and re-iterable containers are read exactly once.
    :param columns: 0-based column indices to count, or ``'all'``/``None``
        for every column of the first record.
    :param refidx: index of a record to leave out of the counts (e.g. a
        reference sequence), or ``None``.
    :param limit: alphabet detection stops buffering once more than this many
        records have been collected.
    :param embedded_counts: optional compiled pattern (anything with a
        ``search`` method); when it matches a record's name, ``group(1)`` is
        used as that record's weight.  Records that do not match weigh 1.
    :returns: ``(counts, (letters, colors))`` where ``counts`` is a float
        array of shape ``(len(letters), len(columns))``.
    :raises RuntimeError: if no alphabet fits the buffered records.
    :raises ValueError: if a letter outside the detected alphabet is met.
    """
    # One shared enumerator provides every record exactly once together with
    # its true position, so ``refidx`` is compared against real indices both
    # while buffering and afterwards.
    indexed = enumerate(alignment)
    records = []

    if columns is None or columns == 'all':
        # The first record determines the number of columns; it is only
        # counted when it is not the reference itself.
        first_idx, first = next(indexed)
        columns = list(range(len(first)))
        if first_idx != refidx:
            records.append((first_idx, first))

    N = len(columns)

    # Buffer records for alphabet detection.  The size check happens BEFORE
    # the next record is fetched so no record is pulled and then thrown away.
    while len(records) <= limit:
        item = next(indexed, None)
        if item is None:
            break
        if item[0] != refidx:
            records.append(item)

    alph = None
    for alph_ in (DNA_ALPHABET, RNA_ALPHABET, AMINO_ALPHABET):
        # NOTE: this overwrites the alphabet of the buffered records.
        for _, r in records:
            r.seq.alphabet = alph_
        if all(_verify_alphabet(r.seq.upper()) for _, r in records):
            alph = alph_
            break

    if alph is None:
        raise RuntimeError('unknown alphabet')

    skips = _GAP
    if alph in (DNA_ALPHABET, RNA_ALPHABET):
        T = 'T' if alph == DNA_ALPHABET else 'U'
        letters = 'ACG' + T
        ambigs = {
            'M': 'AC',
            'R': 'AG',
            'W': 'A' + T,
            'S': 'CG',
            'Y': 'C' + T,
            'K': 'G' + T,
            'V': 'ACG',
            'H': 'AC' + T,
            'D': 'AG' + T,
            'B': 'CG' + T,
        }
        skips += 'N'
        colors = DNA_COLORS
    elif alph == AMINO_ALPHABET:
        letters = 'ACDEFGHIKLMNPQRSTVWY'
        ambigs = {
            'B': 'DN',
            'J': 'IL',
            'Z': 'EQ',
        }
        skips += _STOP + 'X' + 'OU'
        colors = AMINO_COLORS
    else:
        raise RuntimeError("sequences with indeterminable alphabet provided")

    s = len(letters)
    counts = np.zeros((s, N), dtype=float)

    def allrecords():
        # buffered records first, then whatever is still left in the stream
        for _, r in records:
            yield r
        for i, r in indexed:
            if i == refidx:
                continue
            yield r

    for r in allrecords():
        # The weight depends only on the record, so it is resolved once per
        # record and always starts from the default of 1.
        weight = 1.
        if embedded_counts is not None:
            m = embedded_counts.search(r.name)
            if m is not None:
                weight = float(m.group(1))

        for j, c in enumerate(columns):
            ltr = r[c].upper()

            if ltr in skips:
                continue
            elif ltr in ambigs:
                # split the weight evenly over the letters the code stands for
                frac = weight / len(ambigs[ltr])
                for ltr_ in ambigs[ltr]:
                    i = letters.index(ltr_)
                    counts[i, j] += frac
            elif ltr in letters:
                i = letters.index(ltr)
                counts[i, j] += weight
            else:
                raise ValueError('unknown letter: {0}'.format(ltr))

    return counts, (letters, colors)