Ejemplo n.º 1
0
    def setUp(self):
        """Create the Protein fixtures used by the tests of this class."""
        # a single Protein for tests that only need one sequence
        self.single_protein = Protein("ASDERWQTGHKILPMNVFCY", gene_id='gene 1', transcript_id='someID')

        # (sequence, transcript id) of every Protein in the fixture set;
        # all of them are assigned to the same gene
        set_entries = [
            ("IIIVRC", 'set entry 1'),
            ("VRCVR", 'set entry 2'),
            ("IIVRCIT", 'set entry 3'),
            ("IVRC", 'set entry 4'),
        ]
        self.prot_set = [
            Protein(sequence, gene_id='gene 1', transcript_id=entry_id)
            for sequence, entry_id in set_entries
        ]
Ejemplo n.º 2
0
 def setUp(self):
     """Create the sequences and fragments used by the tests of this class."""
     # one Peptide and one Protein (built with the positional arguments
     # "ID-01" and "FOXP3") make up the sequence fixtures
     peptide_seq = Peptide("SYFPEISYFP")
     protein_seq = Protein(
         "IHTIEPFYSIHTIEPFYSIHTIEPFYSIHTIEPFYSIHTIEPFYS", "ID-01", "FOXP3")
     self.seqs = [peptide_seq, protein_seq]

     # fragment Peptides, kept in this order
     fragment_strings = ("FSYFPEITHIR", "FIHTIEPFYSR")
     self.fragments = [Peptide(s) for s in fragment_strings]
Ejemplo n.º 3
0
    def setUp(self):
        """Create the peptide fixtures used by the tests of this class.

        Sets ``self.peptides`` (two Peptide objects), ``self.selfpeptides``
        (string form of every peptide generated from the proteins of the
        packaged ``testSequences.fasta``) and ``self.fewselfpeptides`` (the
        same, but generated from two hard-coded proteins only).
        """
        self.peptides = [Peptide("SYFPEITHI"), Peptide("IHTIEPFYS")]

        # locate the example FASTA file shipped inside the Fred2 package
        testsequences_file = pkg_resources.resource_filename(
            'Fred2', path.join('Data', 'examples', 'testSequences.fasta'))
        # Plain "r" instead of "rU": the 'U' flag is deprecated since
        # Python 3.4 and rejected with a ValueError from Python 3.11 on.
        with open(testsequences_file, "r") as handle:
            records = list(SeqIO.parse(handle, "fasta"))
        prot_set = [Protein(str(r.seq)) for r in records]
        # second argument 9 is presumably the peptide length -- TODO confirm
        unique_test_pep_set = generate_peptides_from_proteins(prot_set, 9)
        self.selfpeptides = [str(x) for x in unique_test_pep_set]

        # a much smaller protein set for tests that need fewer peptides
        small_prot_set = [
            Protein(
                "MKERRIDMKEKNVKAKAPNKKVLGLTTKIFIALLAGAILGIVLCYLVPDSSFKKDVIVEGILYVIGQGFIRLMKMLVVPLVFCSLVCGSMAIGDTKKLGTVGVRTLAFYLATTALAVVALGVGNLINPGVGLDMSAIQSSAASVETMEATSLTDTILNIIPDNPINSLASGSMLQVIVFALIVGVILAKMGERAETVANFFSQFNDIMMEMTMMIMSLAPIGVFCLISRTFANIGFSAFIPLAKYMIGVLLALAIQCFGVYQILLKIFTGLNPIRFIKKFFPVMAFAFSTATSNATIPMSIDTLSKKVGVSKKISSFTIPLGATINMDGTSIMQGVAVVFAAQAFGIHLTPMDYVTVIGTATLASVGTAGVPSVGLVTLTMVFNSVGLPVEAIGLIMGIDRILDMTRTAVNITGDAVCTTIVAHQNGALDKKVFNETE"
            ),
            Protein(
                "MLKVWIAGASGQIGRALNDVLDPMQIEALNTDLDELDITDTDEVINFGTVNRPDVIINCTGITDTDECEANPEHAYRVNALGARNLSIVARKCGSKIVQLSTDDVFDGQSKKPYTEFDDTNPLTVYGRSKRAGENYVKEFTHKHFVIRSNWVYGHGGHNFVNRVLAAAEAGNGLSVASDQFGSPTSAKDLAKMIMYLISTNEYGTYHVTCRGVCSRYEFAQEILKLAGKDIELRAVPTEQSDLSAVRPPYAVLDNFILRIIEVYDMPDWKESLKEYMDERTED"
            )
        ]
        small_unique_test_pep_set = generate_peptides_from_proteins(
            small_prot_set, 9)
        self.fewselfpeptides = [str(x) for x in small_unique_test_pep_set]
Ejemplo n.º 4
0
def generate_proteins_from_transcripts(transcripts, table='Standard', stop_symbol='*', to_stop=True, cds=False):
    """
    Enables the translation from a :class:`~Fred2.Core.Transcript.Transcript` to a
    :class:`~Fred2.Core.Protein.Protein` instance.

    The result is a generator, so type checking and translation only happen while it is being consumed.

    :param transcripts: A list of transcripts or a single transcript to translate
    :type transcripts: list(:class:`~Fred2.Core.Transcript.Transcript`) or :class:`~Fred2.Core.Transcript.Transcript`
    :param str table: Which codon table to use? This can be either a name (string), an NCBI identifier (integer),
                      or a CodonTable object (useful for non-standard genetic codes). Defaults to the 'Standard'
                      table
    :param str stop_symbol: Single character string, what to use for any terminators, defaults to the asterisk, '*'
    :param bool to_stop: If True (the default), translation is terminated at the first in-frame stop codon (and
                         the stop_symbol is not appended to the returned protein sequence). If False, the whole
                         sequence is translated and stop codons are translated as the specified stop_symbol
    :param bool cds: Boolean, indicates this is a complete CDS. If True, this checks the sequence starts with a
                     valid alternative start codon (which will be translated as methionine, M), that the sequence
                     length is a multiple of three, and that there is a single in-frame stop codon at the end
                     (this will be excluded from the protein sequence, regardless of the to_stop option).
                     If these tests fail, an exception is raised
    :returns: The proteins that correspond to the transcripts
    :rtype: Generator(:class:`~Fred2.Core.Protein.Protein`)
    :raises ValueError: If an element of the input is not a :class:`~Fred2.Core.Transcript.Transcript`, or if an
                        incorrect table argument is passed
    :raises TranslationError: If the sequence is not a multiple of three, or the first codon is not a start codon,
                              or the last codon is not a stop codon, or an extra stop codon was found in frame, or
                              a codon is not valid
    """
    # accept a single transcript as well as an iterable of transcripts
    if isinstance(transcripts, Transcript):
        transcripts = [transcripts]

    for t in transcripts:
        if not isinstance(t, Transcript):
            raise ValueError("An element of specified input is not of type Transcript")

        # TODO warn if intrasequence stops - biopython warns if  % 3 != 0
        prot_seq = str(t.translate(table=table, stop_symbol=stop_symbol, to_stop=to_stop, cds=cds))

        # carry over the non-synonymous variants only, keyed by the
        # transcript position integer-divided by 3 (one codon = 3 bases)
        new_vars = dict()
        # .items() instead of the Python-2-only .iteritems()
        for pos, var in t.vars.items():
            if not var.isSynonymous:
                new_vars.setdefault(pos // 3, []).append(var)

        yield Protein(prot_seq, t.gene_id, t.transcript_id, t, new_vars)
Ejemplo n.º 5
0
def extractEpitopeInformationFromMSA(input, epitope_length):
    """Parse an MSA file, build its consensus and extract epitope conservation.

    The file is read line by line. Blank lines and lines starting with '#'
    are skipped. A line starting with '>' is a header; every other line is
    sequence data and is appended to the sequence currently being read.
    A header must be followed by at least one sequence line before the next
    header, and the file must end with sequence data.

    Every header overwrites the antigen name (the header text with its first
    two characters dropped), and all sequences are pooled into one list, so
    the consensus is stored under the name taken from the last header.

    :param input: path of the MSA file to read
    :param epitope_length: passed through unchanged to
        extractEpitopesAndConservationFromConsensus
    :returns: tuple ``(error, conservation, consensuses)``. ``error`` is ''
        on success and a message otherwise. ``conservation`` is whatever
        extractEpitopesAndConservationFromConsensus returns as its second
        element, or an empty dict if an error occurred before that call.
        ``consensuses`` maps the antigen name to the upper-cased consensus
        sequence (empty dict if no consensus was determined).
    """
    format_error = "Input format does not comply with required MSA format."

    # parser states: the kind of line that was consumed last
    NONE = 0
    SEQUENCE_HEADER = 2
    SEQUENCE = 3

    error = ''
    last_read = NONE
    antigen = ''
    current_sequence = ''
    sequences = []

    consensus_info = {}
    consensuses = {}
    conservation = {}

    with open(input, "r") as f:
        for line in f:
            line = line.strip()
            if line == '' or line[0] == '#':
                continue

            if line.startswith('>'):
                # a header is only legal at the start or after sequence data
                if last_read != NONE and last_read != SEQUENCE:
                    error = format_error
                    break

                if last_read == SEQUENCE:
                    # the previous sequence is complete, store it
                    sequences.append(current_sequence)
                    current_sequence = ''

                # NOTE(review): drops the first two characters of the header,
                # i.e. presumably expects '>>name' or '> name' -- confirm
                antigen = line[2:].strip()
                last_read = SEQUENCE_HEADER
                continue

            # anything else is sequence data
            if not isValidMSASequence(line) and antigen != '':
                error = "Invalid amino acid sequence given for antigen " + antigen + ". "
                break

            if last_read == SEQUENCE_HEADER or last_read == SEQUENCE:
                current_sequence += line
                last_read = SEQUENCE
            else:
                # sequence data before any header
                error = format_error
                break

    if error == '':
        if last_read == SEQUENCE:
            sequences.append(current_sequence)
            # consensus[0] is the consensus string ('' signals sequences of
            # different lengths); consensus[1] is passed along untouched
            consensus = determineConsensusFromMSA(sequences)
            if consensus[0] != '':
                consensuses[antigen] = consensus[0].upper()
                consensus_info.setdefault(antigen, []).append(
                    (Protein(consensus[0].upper(), antigen, antigen), consensus[1]))
            else:  # different sequence lengths
                error = "MSA sequences of antigen " + antigen + " are of different lengths."
        else:
            # empty input, or the file ended right after a header
            error = format_error

        if error == '':
            (error, conservation) = extractEpitopesAndConservationFromConsensus(consensus_info, epitope_length)

    return (error, conservation, consensuses)