Esempio n. 1
0
def parse_uniprot(entries):
    """Query UniProt for a collection of identifiers.

    Each identifier is turned into an ``id:<identifier>`` query term and the
    whole list is handed to ``UniProt.get_df``.

    :param entries: iterable of identifier strings.
    :return: the object returned by ``UniProt.get_df`` for the query terms
        (presumably a pandas DataFrame -- confirm against bioservices).
    """
    ent = ['id:' + s for s in entries]
    # Debug output inherited from the original code. It is guarded so that
    # inputs with fewer than three entries no longer raise IndexError.
    if len(ent) > 2:
        print(ent[2])
    u = UniProt(verbose=True)
    return u.get_df(ent)
Esempio n. 2
0
def retrieve_label_from_uniprot_df(ID):
    """Return the phylum label UniProt reports for ``ID``.

    The label is read from the first row (index label ``0``) of the
    ``"Taxonomic lineage (PHYLUM)"`` column of the dataframe returned by
    ``UniProt.get_df``.

    :param ID: query passed unchanged to ``UniProt.get_df``.
    :return: the value stored in the phylum column.
    :raises ValueError: if that value is NaN.
    """
    uniprot = UniProt()
    df = uniprot.get_df(ID)
    label = df["Taxonomic lineage (PHYLUM)"][0]

    # isinstance (rather than an exact type comparison) also catches plain
    # Python floats and other numpy float widths holding NaN.
    if isinstance(label, (float, np.floating)) and np.isnan(label):
        raise ValueError("Label was NaN")
    return label
Esempio n. 3
0
def obtained(liste, output_file):
    """Look up identifiers in UniProt and write a small text report.

    For every item of ``liste`` a quick search restricted to one hit is run
    with the organism name "Escherichia coli (strain K12)" appended. The
    keys of each search result are collected, a dataframe is fetched for
    them, and the input identifiers, the collected keys and the content of
    the "Gene ontology (GO)" column are written to ``output_file``.

    :param liste: sequence of identifiers (iterated twice).
    :param output_file: path of the report; it is overwritten.
    :return: the "Gene ontology (GO)" column of the fetched dataframe.
    """
    u = UniProt()
    organism = "Escherichia coli (strain K12)"

    uprot = []
    for ids in liste:
        query = str(ids) + " " + organism
        hits = u.quick_search(query, limit=1)
        uprot.extend(hits.keys())

    dd = u.get_df(uprot)
    GO = dd["Gene ontology (GO)"]

    # The context manager guarantees the report is flushed and closed, even
    # if one of the writes fails (the original never closed the handle).
    with open(output_file, 'w') as file2:
        file2.write("numero uniprot est le suivant \n \n")
        for item in liste:
            # convert first, then append the separator, so that non-string
            # identifiers do not raise TypeError
            file2.write(str(item) + " ")

        file2.write("\n")
        file2.write("\n")
        file2.write("ID accession uniprot est le suivant \n")
        for accession in uprot:
            file2.write(str(accession) + "  ")

        file2.write("\n")
        file2.write("\n")
        file2.write(
            "L'enrichissement ontologique pour ces termes est la suivante \n\n")
        for terms in GO:
            for term in terms:
                file2.write(term)
            file2.write("\n \n")

    return GO
Esempio n. 4
0
class Annotations(MassSpecReader):
    """Create/store/read annotations from uniprot and figure out entry names

    The Annotations classes allows one to populate the dataframe attribute
    :attr:`df` with the **Entry** and **Entry_name**  columns (UniProt entries).
    This is not strictly speaking required columns but provide more tools if
    available. The annotations also creates a new dataframe called :attr:`annotations`
    that stores in particular the protein sequence and the GO terms. The former
    being used to check the peptide sequence and the latter to plot relevant
    histogram about GO terms.

    This class inherits from :class:`msdas.readers.MassSpecReader`. Consequently,
    input can be a MassSpecReader instance, or a filename or even nothing (data
    can be read at a later stage). The dataframe must contain the Protein column.

    One reason to fetch the entries from UniProt is that the protein column name
    may contain typos or non-uniprot entries, therefore it is quite useful to
    fetch all entries from uniprot based on the protein name provided. This can
    be done thanks to the :meth:`get_uniprot_entries`. This method fills a dictionary called
    :attr:`_mapping` (note the underscore), which is used to populate a new column
    in the dataframe called **Entry**.

    If your initial dataframe contains the columns "Entry" with all valid UniProt
    entries (e.g., P23300) then the :attr:`_mapping` attribute is populated during the
    initialisation and the call to :meth:`get_uniprot_entries` can be skipped.
    If called, it will also be faster but will overwrite the content of the Entry column.
    You can also fill/correct/complete the :attr:`_mapping` attribute before calling
    :meth:`get_uniprot_entries`

    .. doctest::

        >>> from msdas import annotations
        >>> import pandas as pd
        >>> df = pd.DataFrame({
            'Protein':['DIG1'],
            'Sequence_Phospho':['SS(Phospho)T'],
            'Psite':['S2']})
        >>> a = annotations.Annotations(df, "YEAST")
        >>> a._mapping
        {}
        >>> a.get_uniprot_entries()
        {'DIG1_YEAST': ['Q03063']}
        >>> a.df.Entry
        0    Q03063
        Name: Entry, dtype: object

    Then, call :meth:`set_annotations`, which will fetch all annotations from
    uniprot and store them in a new dataframe in the :attr:`annotations` attribute

    ::

        a.set_annotations()
        a.annotations

    A new field called **Entry_name** is also added to the dataframe itself::

        a.df.Entry_name

    On a big data set, it may take a few minutes to fetch all information from uniprot.
    So, we also provide tools to save and read back the relevant information (
    :meth:`read_annotations`, :meth:`to_pickle`, :meth:`to_csv`  ) ::

        from msdas import *
        r = readers.MassSpecReader(get_yeast_raw_data())
        # this takes about 10 minutes depending on the connection for 1600 unique protein names
        r.get_uniprot_entries()
        r.set_annotations()
        r.to_pickle(tag="test") # creates a file called YEAST_annotations_test.pkl
        r.to_csv("data.csv")

    Next time, just type::

        from msdas import *
        a = annotations.Annotations("data.csv", "YEAST")
        a.read_annotations("YEAST_annotations_test.pkl")

    To check that the entries are correct, one thing that can be done is to
    look for the peptide sequence into the FASTA sequence found in the annotations::

        a.check_entries_versus_sequence()

    This is a very good sanity check to verify that the entry names found
    correspond to the peptide provided. If not, the protein name was probably wrong
    or was a gene name that could not be mapped correctly to the correct protein.

    If some entries are not found or mapping was not found, you need to
    manually check the issues and update the :attr:`_mapping` attribute,
    update the uniprot entries and annotations::

        a._mapping[entry] = ['entry name']
        a.get_uniprot_entries()
        a.set_annotations()
        a.check_entries_versus_sequence()

    if you cannot find a mapping, we would recommend to delete the item from
    the dataframe :attr:`df`.
    """
    def __init__(self, data, organism=None, verbose=True, annotations=None, **kargs):
        """.. rubric:: Constructor

        :param data: anything accepted by :class:`msdas.readers.MassSpecReader`
            (it is forwarded unchanged to the parent constructor).
        :param organism: organism tag appended to protein names, e.g. HUMAN
            or YEAST. Mandatory: ``None`` raises :class:`ValueError`.
        :param verbose: forwarded to the parent constructor.
        :param annotations: optional filename of pickled annotations; when
            given it is loaded with :meth:`read_pickle`.
        :param kargs: extra keyword arguments forwarded to the parent
            constructor.

        """
        super(Annotations, self).__init__(data=data, verbose=verbose, **kargs)

        if organism is None:
            raise ValueError("organism must be provided e.g. YEAST, HUMAN")
        self.organism = organism

        #: dataframe holding the uniprot annotations (None until loaded).
        self.annotations = None

        # Seed the protein-name -> entries mapping from an Entry column,
        # when the dataframe already has one.
        self._mapping = {}
        self.build_mapping_from_df()

        self._init_uniprot()

        # Restore previously saved annotations if a file was supplied.
        if annotations:
            self.read_pickle(annotations)

    def _init_uniprot(self):
        """Create the UniProt service handle unless one is already attached."""
        if hasattr(self, "_uniprot"):
            return
        self._uniprot = UniProt(verbose=self.debugLevel)

    def _update_species_to_find(self):
        """Refresh :attr:`_species_to_find`.

        The attribute is set to the list of unique ``<Protein>_<organism>``
        names built from the Protein column that are not yet keys of
        :attr:`_mapping`. The order of the list is unspecified.
        """
        suffix = "_" + self.organism
        entry_names = (x + suffix for x in self.df.Protein)
        # Membership is tested on the dict itself (a hash lookup); one set
        # is enough to remove duplicates.
        self._species_to_find = list(
            set(k for k in entry_names if k not in self._mapping))

    def build_mapping_from_df(self):
        """Populate the _mapping dictionary using the Uniprot Entry column

        For every row, the protein name (suffixed with ``_<organism>`` unless
        it already ends with it) becomes a key of :attr:`_mapping` whose
        value is a one-element list holding the row's Entry. If the
        dataframe has no Entry column a warning is emitted and nothing else
        happens.
        """
        if "Entry" not in self.df.columns:
            self.warning("Entry column not found in the dataframe. call get_uniprot_entries")
            return

        suffix = "_" + self.organism
        # Walk the two columns in lockstep instead of looking rows up through
        # the removed ``.ix`` indexer; this also works when the dataframe
        # index contains duplicated labels.
        for protein, entry in zip(self.df.Protein, self.df.Entry):
            key = protein if protein.endswith(suffix) else protein + suffix
            self._mapping[key] = [entry]

    def get_uniprot_entries(self, Nmax=50):
        """Search for the uniprot entries and entry names given protein column.

        Protein names from the dataframe are first used to feed uniprot mapping tool.
        Some protein names won't be found as a uniprot entry because they are
        not uniprot entry names but gene names.  We therefore also scan
        missing entries by looking for gene names. Once found, the proposed
        items that contain the gene names and organism are candidates for the
        entry names. There may be several solutions though, which explains why
        the values in the :attr:`_mapping` dictionary are made of lists.

        Results are stored in :attr:`_mapping` and in the dataframe itself.

        :param Nmax: forwarded as ``Nmax`` to the uniprot ``multi_mapping`` calls.

        Let us show one example with 3 protein names that cover all cases:

            * DIG1, is a valid uniprot entry
            * ASC1 is not a uniprot entry. It is a gene name from which the entry
              may be retrieved automatically.
            * LEU1 is a gene name AND a uniprot entry. This is an ambiguous
              case. The default is to use the uniprot entry but if you call
              :meth:`check_entries_versus_sequence` (after :meth:`set_annotations`)
              you will see that there is a mismatch meaning that LEU1_YEAST provided
              in the protein column is actually not the protein name but the gene name


        ::

            >>> import pandas as pd
            >>> from msdas import *
            >>> df = pd.DataFrame({'Protein':['DIG1', 'LEU1', 'ASC1'],
                'Sequence_Phospho':['S(Phospho)APAQVTQHSK', 'VEVTS(Phospho)EDEK',
                                    'DS(Phospho)VTIISAGNDK'],
                'Psite':['S142','S495', 'S166']})
            >>> a = Annotations(df, "YEAST")
            >>> a.get_uniprot_entries()
            >>> a._mapping
            {'ASC1_YEAST': ['P38011', 'P01120'],
             'DIG1_YEAST': ['Q03063'],
             'LEU1_YEAST': ['P06208-1', 'P06208']}

        Here, DIG1 has one unique entry. This is expected because DIG1 is in fact
        an entry name (unique by definition). ASC1 is a gene name. This method
        figures out that it corresponds to either P38011 or P01120. There are
        several entries because mapping from gene to protein is not unique.
        By default, the entry with highest score appears first. There is no 100% guarantee
        that this mapping is correct and :meth:`check_entries_versus_sequence`
        should be called to check that the peptide sequence is contained in
        this entry sequence. The last case (LEU1) is even more problematic because
        it is a valid entry name even though the protein name provided is actually
        a gene name... again call :meth:`check_entries_versus_sequence`.


            >>> a.set_annotations()
            >>> a.check_entries_versus_sequence()
            P06208-1 not found in the annotations index

        So, here we are told that amongst the 3 entries, P06208-1 is not found.
        This is the LEU1 case. If you were to use batch tool, you would figure out
        given the peptide sequence that this is actually LEUC_YEAST entry with
        uniprot entry LEUC_YEAST/P07264.

        So, you need to manually update the mapping:

            >>> a._mapping['LEU1_YEAST'] = ['P07264']
            >>> a.get_uniprot_entries() # to update the main df with new entries
            >>> a.set_annotations() # to retrieve the sequence of LEUC_YEAST
            >>> a.check_entries_versus_sequence()

        .. seealso:: :meth:`set_annotations`

        """
        # Pass 1: map the <Protein>_<organism> names with bioservices.uniprot.
        self._update_species_to_find()
        if self._species_to_find:
            self.logging.info("Fetching uniprot accession numbers for %s entries" % len(self.df.Protein))
            self.logging.info("Fetching uniprot accession numbers for %s unique entries" % len(self.df.Protein.unique()))
            mapping = self._uniprot.multi_mapping(fr="ID", to="ACC", Nmax=Nmax,
                                                  query=self._species_to_find)
            # .items() works on both Python 2 and Python 3 dictionaries
            for k, v in mapping.items():
                # never overwrite a mapping that is already known
                if k not in self._mapping:
                    self._mapping[k] = v

        # Pass 2: some species may not be found (secondary accession number) if
        # the organism was appended, so query again without the appended
        # organism string.
        self._update_species_to_find()
        if self._species_to_find:
            self.logging.info("Some species were not found ({}). Using secondary accession:".format(len(self._species_to_find)))
            self.logging.info("Fetching uniprot without trailing species")
            self.logging.info("Fetching  %s new ones " % len(self._species_to_find))
            mapping = self._uniprot.multi_mapping(fr="ID", to="ACC",
                    query=[x.split("_")[0] for x in self._species_to_find], Nmax=Nmax)
            for k, v in mapping.items():
                self._mapping[k + "_" + self.organism] = v

        # Pass 3: some are not yet found. This could be because the provided
        # protein name is actually a gene name...
        def has_gene(x, tag):
            # True when tag is one of the whitespace-separated names found in
            # the first element of x
            if len(x) == 0:
                return False
            return tag in x[0].split()

        self._genes = {}
        self._update_species_to_find()
        if self._species_to_find:

            self.logging.info("Some species are still not found {}. Trying to use gene names".format(len(self._species_to_find)))
            self.logging.info("Fetching uniprot accession numbers for those without _species appended")
            self.logging.info("Fetching  %s new ones " % len(self._species_to_find))
            for i, this in enumerate(self._species_to_find):
                if " " in this:
                    continue
                self.logging.info("Searching for entry {}/{} for gene names".format(i+1, len(self._species_to_find)))
                gene = this.split("_")[0]
                df = self._uniprot.get_df(gene, organism=self.organism)
                l1 = df['Gene names'].apply(lambda x: has_gene(x, gene))
                l2 = df['Entry name'].apply(lambda x: x.endswith(self.organism))
                selected = l1 & l2
                n_hits = sum(selected)

                if n_hits >= 1:
                    k = list(df.loc[selected, 'Entry name'])
                    v = list(df.loc[selected, 'Entry'])
                    self.logging.debug("%s %s %s" % (k, v, this))
                    # The original tested the whole list k against the mapping
                    # keys, which can never match (and is a TypeError on
                    # Python 3). Report each clashing entry name instead; a
                    # warning is used rather than an exception because the
                    # original check never actually stopped the search.
                    clashes = [name for name in k if name in self._mapping]
                    if clashes:
                        self.logging.warning("%s already in the mapping dictionary" % clashes)
                    self._mapping[this] = v
                else:
                    print("skipping %s... sum=%s" % (this, n_hits))

        # Final report on whatever is still unresolved.
        self._update_species_to_find()
        if self._species_to_find:
            self.logging.info("Some species were not found. Using gene names")
            self.logging.info("Fetching uniprot accession numbers for those without _species appended")
            self.logging.info("Fetching  %s new ones " % len(self._species_to_find))

        self._append_uniprot_entries_to_df()

    def _append_uniprot_entries_to_df(self):
        """Write the first mapped uniprot entry of each protein into ``df.Entry``.

        Each protein name is suffixed with ``_<organism>`` and looked up in
        :attr:`_mapping`. The first candidate is kept; an empty string is
        stored (and a message printed) when there is no candidate. An
        existing Entry column is replaced.
        """
        if "Entry" in self.df.columns:
            self.logging.warning("Overwritting column called Entry in the dataframe")

        suffix = "_" + self.organism
        # Entries are collected in dataframe row order so they line up with
        # the index used for the join below.
        uniprot_entries = []
        for protein in self.df.Protein:
            name = protein + suffix
            candidates = self._mapping.get(name)
            # A missing key and an empty candidate list are both "not found";
            # the original raised IndexError on an empty list.
            if not candidates:
                uniprot_entries.append("")
                print("!! ", name, " not found")
                continue
            if len(candidates) > 1:
                self.logging.info("Found entry with several matches: %s %s . Only first one is selected (highest uniprot score)" % (name,
                                  candidates))
            uniprot_entries.append(candidates[0])

        # index=df.index is important to use the join afterwards
        thisdf = pd.DataFrame({'Entry': uniprot_entries}, index=self.df.index)
        if "Entry" in self.df.columns:
            del self.df['Entry']
        self.df = self.df.join(thisdf)

    def _append_uniprot_entry_names_to_df(self):
        """Add an ``Entry_name`` column to :attr:`df` from the annotations.

        For each value of ``df.Entry`` the 'Entry name' found in
        :attr:`annotations` is used; entries absent from the annotations
        index get an empty string. If the annotations are not set, an error
        is reported and the dataframe is left untouched.
        """
        # ``is None`` replaces the types.NoneType check, which is missing
        # from Python 3.0-3.9.
        if self.annotations is None:
            self.error("must call set_annotations first")
            return

        known = self.annotations.index
        # ``.loc`` is the label-based replacement of the removed ``.ix``.
        self.df['Entry_name'] = [
            self.annotations.loc[e]['Entry name'] if e in known else ""
            for e in self.df.Entry]

    def plot_goid_histogram(self, drop_duplicates=True):
        """Histogram of the number of GO terms per peptide or protein

        Two figures are drawn: figure 1 counts the GO identifiers once per
        row of :attr:`df` (per peptide), figure 2 once per row of
        :attr:`annotations` (per protein). Rows whose Entry is absent from
        the annotations index are left out of figure 1.

        :param drop_duplicates: if True, the GO counts are computed once per
            unique entry and then expanded to every row; if False they are
            computed directly for every row. Both give the same figure.
        :raises AttributeError: if the annotations are not set.

        .. plot::
            :width: 80%
            :include-source:

            from msdas import *
            m = Annotations(get_yeast_small_data(), "YEAST", verbose=False)
            m.set_annotations()
            m.plot_goid_histogram()

        .. todo:: is this functional process or not

        """
        # annotations defaults to None, so the original ``is False`` test
        # could never trigger
        if self.annotations is None:
            raise AttributeError(self._error_messages['annotations'])

        # Unresolved proteins carry an Entry that is not in the annotations
        # index (e.g. the empty string); they have no GO terms to count.
        known = self.df.Entry[self.df.Entry.isin(self.annotations.index)]
        entries = known.drop_duplicates() if drop_duplicates else known
        counter = self.annotations.loc[list(entries)]['Gene ontology IDs'].apply(len)

        M = counter.max()
        # For the GO per peptide we need one value per row of the original
        # dataframe, which holds several psites per protein.
        if drop_duplicates:
            duplicated_counter = [counter[x] for x in known]
        else:
            # counter already has one value per row
            duplicated_counter = list(counter)
        pylab.figure(1)
        pylab.clf()
        pylab.hist(duplicated_counter, bins=[x+.5 for x in range(0, M+1)])
        pylab.title("Distribution of number of GO id terms per peptide")
        pylab.grid()

        # annotations contains the unique protein entry, so here we get the number of GO terms per protein
        counter = self.annotations['Gene ontology IDs'].apply(len)
        M = counter.max()
        pylab.figure(2)
        pylab.clf()
        pylab.hist(counter, bins=[x+.5 for x in range(0, M+1)])
        pylab.title("Distribution of number of GO id terms per protein")
        pylab.grid()

    def set_annotations(self, Nmax=100):
        """Fetch information from uniprot and set :attr:`annotations`
        as a pandas dataframe.

        Look into the dataframe Entry column and update the annotations dataframe
        to populate missing entries. The Entry column in the :attr:`df` should have been
        populated by :meth:`get_uniprot_entries` with valid entries from Uniprot.
        The Entry_name column of :attr:`df` is refreshed afterwards.

        If you have thousands of entries, this takes a few minutes. You can
        save the annotations with :meth:`to_pickle` and read them back with
        :meth:`read_pickle`.

        :param Nmax: forwarded as ``nChunk`` to the uniprot ``get_df`` call.
        :raises ValueError: if uniprot returns no row for the requested entries.
        """
        self.logging.info("Fectching information from uniprot. Takes some time")

        # unique, non-empty entries (could be split if too long)
        entries = [this for this in set(self.df.Entry) if this]

        # no need to search again for entries already in the annotations
        if self.annotations is not None:
            known = set(self.annotations.index)
            entries = [x for x in entries if x not in known]

        if len(entries) == 0:
            self.warning("No new entries found. Your annotations dataframe is already up-to-date")
            # annotations may still be None here (e.g. every Entry is empty);
            # the original crashed with AttributeError in that case
            if self.annotations is not None:
                self.annotations.drop_duplicates(subset="Entry name", inplace=True)
            self._append_uniprot_entry_names_to_df()
            return

        annotations = self._uniprot.get_df(entries, nChunk=Nmax)
        # keep only the rows that were actually requested
        annotations = annotations[annotations.Entry.isin(entries)]

        if len(annotations) == 0:
            raise ValueError("your list of protein is empty")
        self.logging.info("Fectching {}".format(len(annotations)))
        # non-inplace: the frame above is a filtered slice
        annotations = annotations.set_index("Entry")
        if self.annotations is None:
            self.annotations = annotations
        else:
            # DataFrame.append was removed in pandas 2.0
            self.annotations = pd.concat([self.annotations, annotations])
        self.logging.info("Annotations have been loaded. You can save the annotations" +
            " dataframe attribute using x.to_pickle('annotations.pkl') " +
            " Next time, you could just load if using \n\n" +
            "     >>> m = readers.MassSpecReader(filename, mode='yeast')\n" +
            "     >>>  m.read_annotations('annotations.pkl')")

        # indices are the uniprot entry. Some may be identical with slightly different columns
        # but the entry name should be unique. Here, we keep the first instance of each entry
        self.annotations.drop_duplicates(subset="Entry name", inplace=True)
        self._append_uniprot_entry_names_to_df()

    def to_pickle(self, tag=None, overwrite=False):
        """Pickle the annotations dataframe to ``<organism>_annotations[_<tag>].pkl``

        :param tag: optional string appended to the file name; any value that
            is not a ``str`` is ignored.
        :param overwrite: when equal to False, an existing file is not
            replaced and :class:`IOError` is raised instead.
        """
        name_parts = [self.organism + "_annotations"]
        if isinstance(tag, str):
            name_parts.append(tag)
        filename = "_".join(name_parts) + ".pkl"

        # The comparison with False is kept on purpose: overwrite=None skips
        # the existence check exactly as it always did.
        if overwrite == False and os.path.exists(filename):
            raise IOError("file %s already exists" % filename)
        self.annotations.to_pickle(filename)

    def read_pickle(self, filename):
        """Read annotations in pickled format as saved by :meth:`to_pickle`

        The dataframe is stored in :attr:`annotations` and every
        'Entry name' it contains that is not yet a key of :attr:`_mapping`
        is added, mapped to a one-element list holding its index value.
        Any failure is logged as an error and otherwise ignored.

        :param str filename: filename to read
        """
        # NOTE(security): unpickling can execute arbitrary code; only load
        # files that come from a trusted source.
        try:
            self.annotations = pd.read_pickle(filename)

            # Update the mapping dictionary. The keys of _mapping are entry
            # names, so the guard must test the entry name; the original
            # tested the index value, which made the guard always true.
            for entry, entry_name in self.annotations['Entry name'].items():
                if entry_name not in self._mapping:
                    self._mapping[entry_name] = [entry]
        except Exception:
            # best effort: report and carry on, but let KeyboardInterrupt and
            # SystemExit propagate
            self.logging.error("Could not read your file. Expected a pkl \
            containing a dataframe with Entry name and index being uniprot \
            indices. ")

    def hist_most_relevant_goids(self, N=10, tight_layout=True, wrap_length=40,
                                 drop_duplicates=True, **kargs):
        """Plot histogram of the GO identifiers found in all proteins.

        :param int N: restrict histogram to terms that appear more than N
            times. A false value (0, None) disables the filter.
        :param bool tight_layout: call ``pylab.tight_layout`` after plotting
        :param int wrap_length:  wrap text on the y-axis by wrap_length (defaults to 40)
        :param drop_duplicates: if True each entry is counted once, otherwise
            once per row of the dataframe
        :param kargs: pandas.plot arguments accepted.
        :raises AttributeError: if the annotations are not set.

        .. plot::
            :include-source:
            :width: 80%

            from msdas import *
            m = Annotations(get_yeast_small_data(), "YEAST", verbose=False)
            m.set_annotations()
            m.hist_most_relevant_goids(N=5)

        """
        from collections import Counter

        # annotations defaults to None, so the original ``is False`` test
        # could never trigger
        if self.annotations is None:
            raise AttributeError(self._error_messages['annotations'])
        kargs.setdefault("legend", False)

        # Entries missing from the annotations (e.g. the empty string used
        # for unresolved proteins) have no GO terms to contribute.
        entries = self.df.Entry[self.df.Entry.isin(self.annotations.index)]
        if drop_duplicates:
            entries = entries.drop_duplicates()

        go_lists = self.annotations.loc[list(entries)]['Gene ontology (GO)']
        # single pass instead of one list.count() per unique term
        counts = Counter(term for terms in go_lists for term in terms)

        # wrap the labels to avoid long strings in the figure
        names = ["\n".join(textwrap.wrap(name, width=wrap_length)) for name in counts]
        sizes = [counts[name] for name in counts]
        df = pd.DataFrame({'name': names, 'size': sizes}, index=range(0, len(counts)))

        # without a threshold every term is plotted (the original raised
        # NameError because subdf was only defined when N was truthy)
        subdf = df[df['size'] > N] if N else df
        subdf = subdf.set_index("name")
        # DataFrame.sort was removed from pandas; sort_values replaces it
        subdf.sort_values("size").plot(kind="barh", **kargs)
        if tight_layout:
            pylab.tight_layout()

    def check_entries_versus_sequence(self):
        """Check that peptide sequences are contained in uniprot sequences

        This is a very good sanity check on the validity of the uniprot entry names
        found by :meth:`get_uniprot_entries` method

        For every row of :attr:`df`, the row's Sequence is searched in the
        Sequence stored in :attr:`annotations` for the row's Entry. Rows whose
        entry is missing from the annotations, or whose peptide is not part
        of the annotated sequence, are printed. A peptide that is not found
        suggests that the protein name is not correct.

        See AnnotationsYeast class where the :meth:`AnnotationsYeast.update_mapping` is used to
        update the incorrect mapping.

        :raises Exception: if the annotations are not set.

        .. seealso:: :meth:`find_sequence_blast`

        """

        self.logging.info("Comparing peptide sequence in the attribute df with sequences in the annotations")
        self.logging.info("row index, protein name, uniprot entry")
        # ``is None`` replaces the types.NoneType check, which is missing
        # from Python 3.0-3.9
        if self.annotations is None:
            raise Exception("annotations not set. call set_annotations")

        header_printed = False
        # iterrows gives each row once, replacing the repeated lookups
        # through the removed ``.ix`` indexer
        for i, row in self.df.iterrows():
            entry = row['Entry']
            if entry not in self.annotations.index:
                print("{} not found in the annotations index".format(entry))
                continue
            if row['Sequence'] not in self.annotations.loc[entry]['Sequence']:
                if not header_printed:
                    print("Found unknown entries\nindex, protein name, uniprot entry ")
                    header_printed = True
                print(i, row['Protein'], entry)

    def find_sequence_blast(self, seq, email):
        """Submit a protein sequence to BLAST (blastp on uniprotkb) and return the output

        :param str seq: the sequence
        :param email: a valid email address
        :return: what ``NCBIblast.getResult`` returns for the "out" result
            type of the submitted job

        .. note:: This is using the NCBIblast web service via
            `BioServices <https://pypi.python.org/pypi/bioservices>`_.
        """
        from bioservices import NCBIblast

        blast = NCBIblast(verbose=self.level)
        job_options = dict(program="blastp", sequence=seq, stype="protein",
                           database="uniprotkb", email=email)
        jobid = blast.run(**job_options)
        result = blast.getResult(jobid, "out")
        return result

    def to_csv(self, filename):
        """Export the dataframe with data and annotations into a CSV file

        :meth:`set_annotations` and :meth:`get_uniprot_entries` must have been called.
        The Identifier column is (re)built as ``<Protein>_<Psite>`` before
        the comma-separated file is written without the index.

        :param filename: destination of the CSV file
        :raises ValueError: if the Entry or Entry_name column is missing
        """
        if "Entry" not in self.df.columns or "Entry_name" not in self.df.columns:
            # message corrected: the method is called get_uniprot_entries
            raise ValueError("Entry or Entry_name missing in dataframe. You must call get_uniprot_entries and set_annotations methods")
        # Item assignment creates the column when it does not exist yet;
        # attribute assignment would only set a Python attribute on the
        # DataFrame object and the column would be missing from the CSV.
        self.df["Identifier"] = self.df.Protein + "_" + self.df.Psite
        self.df.to_csv(filename, index=False, sep=",")
Esempio n. 5
0
def process_data(current_directory='.'):
    """Label the BLAT_ECOLX alignment with UniProt phyla and realign it.

    Steps, in order:

    1. download ``BLAT_ECOLX_1_b0.5.a2m`` into the working directory with
       ``wget`` unless it is already there;
    2. read it as FASTA, renaming the record ``BLAT_ECOLX/24-286`` to
       ``QUERY_P62593/24-286``;
    3. query UniProt in batches of about 100 identifiers for the entry,
       sequence, phylum and protein families of each record;
    4. write ``BLAT_ECOLX_1_b0.5_labeled.fasta`` (original records, phylum
       as description) and ``BLAT_ECOLX_1_b0.5_full_sequences_labeled.fasta``
       (full UniProt sequences) into ``current_directory``;
    5. run ``clustalo`` and then ``maxalign.pl`` through the shell.

    :param current_directory: directory receiving the two labeled FASTA
        files. NOTE(review): the shell commands use bare file names, so they
        only find those files when this is the working directory -- confirm
        this is intended.
    """

    def accession_of(record_id):
        # "<name>_<accession>/<range>" -> "<accession>"
        return record_id.split('/')[0].split('_')[1]

    # Fetch original data (from DeepSequence paper)
    if not os.path.exists('BLAT_ECOLX_1_b0.5.a2m'):
        os.system(
            'wget https://sid.erda.dk/share_redirect/a5PTfl88w0/BLAT_ECOLX_1_b0.5.a2m'
        )

    filename = "BLAT_ECOLX_1_b0.5.a2m"

    record_map = {}
    for record in SeqIO.parse(filename, "fasta"):
        if record.id == "BLAT_ECOLX/24-286":
            record.id = 'QUERY_P62593/24-286'
        key = record.id
        assert key not in record_map
        record_map[key] = record

    ids = list(record_map.keys())

    # Loop-invariant setup, done once instead of once per batch.
    u = UniProt(verbose=True)
    u.settings.TIMEOUT = 240
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', None)

    info_map = {}
    for batch in tqdm(np.array_split(ids, math.ceil(len(ids) / 100))):
        df = u.get_df(['id:' + accession_of(record_id) for record_id in batch],
                      limit=None)
        result = df[[
            'Entry', 'Sequence', 'Taxonomic lineage (PHYLUM)',
            'Protein families'
        ]]

        for _, row in result.iterrows():
            if len(row['Sequence']) > 0:
                # missing values are labeled 'Other'
                info_map[row['Entry']] = row.fillna('Other').to_dict()

    records = []
    records_full = []
    uniprot_ids_processed = set()
    for record_id in record_map:
        info_map_key = accession_of(record_id)
        if info_map_key not in info_map:
            print("Not found: ", record_id, info_map_key)
            continue

        if info_map_key in uniprot_ids_processed:
            print("WARNING - duplicate entry:", info_map_key)
            # NOTE(review): skip_duplicate_uniprot_ids is not defined in this
            # function; presumably a module-level flag -- confirm.
            if skip_duplicate_uniprot_ids:
                continue
        uniprot_ids_processed.add(info_map_key)

        info = info_map[info_map_key]
        phylum_tag = "[" + info['Taxonomic lineage (PHYLUM)'] + "]"

        record = record_map[record_id]
        record.description = phylum_tag
        records.append(record)

        records_full.append(
            SeqRecord(Seq(info['Sequence']),
                      id=info['Entry'],
                      description=phylum_tag))

    with open(
            os.path.join(current_directory, "BLAT_ECOLX_1_b0.5_labeled.fasta"),
            "w") as output_handle:
        SeqIO.write(records, output_handle, "fasta")
    with open(
            os.path.join(current_directory,
                         "BLAT_ECOLX_1_b0.5_full_sequences_labeled.fasta"),
            "w") as output_handle:
        SeqIO.write(records_full, output_handle, "fasta")

    # Alignment using clustal omega. The output name must match the input of
    # the maxalign step below; the original wrote "...clustal0.fasta" (digit
    # zero), a file that maxalign never read.
    os.system(
        'clustalo --verbose -i BLAT_ECOLX_1_b0.5_full_sequences_labeled.fasta -o BLAT_ECOLX_alignment_clustalo.fasta'
    )

    # Refinement using maxalign
    os.system(
        'perl maxalign.pl -v=1 -a BLAT_ECOLX_alignment_clustalo.fasta; mv heuristic.fsa BLAT_ECOLX_alignment_clustalo_maxalign.fasta'
    )
Esempio n. 6
0
def _first_row_field(table, column):
    """Return the *column* value from the first data row of a tab-separated reply.

    *table* must consist of exactly one header line and one data line once its
    final character is stripped; anything else raises ``ValueError`` on
    unpacking. A missing *column* raises ``ValueError`` from ``list.index``.
    """
    header, values = table[:-1].split("\n")
    return values.split("\t")[header.split("\t").index(column)]


def _phylum_from_uniprot_df(uniprot, accession):
    """Read the "Taxonomic lineage (PHYLUM)" cell of the first ``get_df`` row.

    Raises ``LookupError`` when that cell is a float NaN, so that the caller
    moves on to the next lookup strategy.
    """
    label = uniprot.get_df(accession)["Taxonomic lineage (PHYLUM)"][0]
    # isinstance also matches np.float64, which subclasses float.
    if isinstance(label, float) and np.isnan(label):
        raise LookupError(f"No phylum in the UniProt data frame for {accession}")
    return label


def _phylum_via_uniparc(uniprot, accession, first_word_only=False):
    """Resolve a label through a "uniparc" search followed by a "taxonomy" search.

    The first "; "-separated entry of the "Organisms" column is used as the
    taxonomy query (only its first space-separated word when *first_word_only*
    is true). The label is the second "; "-separated entry of the "Lineage"
    column, or the only entry when the lineage has a single one.
    """
    organism = _first_row_field(
        uniprot.search(accession, database="uniparc", limit=1),
        "Organisms").split("; ")[0]
    if first_word_only:
        organism = organism.split(" ")[0]
    lineage = _first_row_field(
        uniprot.search(organism, database="taxonomy", limit=1), "Lineage")
    return lineage.split("; ")[:2][-1]


def _phylum_via_uniparc_first_word(uniprot, accession):
    """Like ``_phylum_via_uniparc`` but query taxonomy with the first word only."""
    return _phylum_via_uniparc(uniprot, accession, first_word_only=True)


def retrieve_labels(infile, outfile):
    """Append one ``"<record id>: <label>"`` line to *outfile* per FASTA record.

    The function is resumable: if *outfile* already exists, its line count is
    taken as the number of records already processed and that many leading
    records of *infile* are skipped.

    For each remaining record the accession is the second ``|``-separated
    field of the record id when the id contains ``|``; otherwise it is the
    part before the first ``/`` with ``"UniRef100_"`` removed. The label is
    then looked up with three strategies, tried in order until one succeeds:

    1. the "Taxonomic lineage (PHYLUM)" column of ``UniProt.get_df``;
    2. a "uniparc" search for the organism name, then a "taxonomy" search;
    3. the same as 2, using only the first word of the organism name.

    Parameters
    ----------
    infile
        FASTA source handed to ``SeqIO.parse``.
    outfile : str or pathlib.Path
        File the labels are appended to.

    Raises
    ------
    LookupError
        If every strategy fails and no label was supplied from the debugger.
        Lines already written are kept, so a later call resumes at the
        failing record.
    """
    from pathlib import Path

    outfile = Path(outfile)
    uniprot = UniProt()

    lookups = (
        _phylum_from_uniprot_df,
        _phylum_via_uniparc,
        _phylum_via_uniparc_first_word,
    )

    # One output line per processed record, so the line count is the resume index.
    already_done = 0
    if outfile.exists():
        with open(outfile, "r") as out:
            already_done = sum(1 for _ in out)

    with open(outfile, "a") as out:
        for i, seq in enumerate(SeqIO.parse(infile, "fasta")):
            if i < already_done:
                continue

            if "|" in seq.id:
                ID = seq.id.split("|")[1]
            else:
                ID = seq.id.split("/")[0].replace("UniRef100_", "")
            print(f"Doing ID {i:6}, {ID + ':':15} ", end="")

            for lookup in lookups:
                try:
                    label = lookup(uniprot, ID)
                except Exception:
                    # Any failure (service error, missing column, unexpected
                    # reply shape) just means "try the next strategy".
                    # KeyboardInterrupt / SystemExit still propagate.
                    continue
                break
            else:
                print("Couldn't handle it!")
                # Reset so a label from the previous record can never be
                # written for this one; a label may be assigned by hand in
                # the debugger before continuing.
                label = None
                breakpoint()
                if label is None:
                    raise LookupError(
                        f"Could not determine a label for {seq.id}")

            print(f"{label}")
            out.write(f"{seq.id}: {label}\n")
Esempio n. 7
0
    an exact sub-string match to the target peptide sequences.

    Parameters
    ----------
    spectrumScoreDatabase : DataFrame
        A pandas DataFrame containing target and decoy PSM scores for all spectra.
        Obtained by calling matchAllSpectra().

    Returns
    -------
    accepted : DataFrame
        The filtered pandas DataFrame containing only those target PSM's
        with a score higher than the specified FDR threshold.
    """
    spectrumScoreDatabase.loc[:,'Inferred Proteins'] = spectrumScoreDatabase.apply(lambda row: proteinData.loc[proteinData.Sequence.str.contains(row['Sequence'])].Identifier.tolist() ,axis=1)
    return spectrumScoreDatabase

print('\nFinding protein identifiers associated with the matched peptides...\n')
PSM = retrieveProteins(PSM)
print(PSM)
input("\n\nPress Enter to continue...")

print('\nLooking up identifiers on UniProt...\n')

# Flatten the per-row identifier lists of the 'Inferred Proteins' column into
# one list. The Series is iterated directly because Series.iteritems() was
# removed in pandas 2.0 and the index it yielded was never used.
all_identifiers = []
for identifierlist in PSM['Inferred Proteins']:
    all_identifiers.extend(''.join(identifier) for identifier in identifierlist)
#print(u.search('Q12797',frmt="tab", columns="id,protein names,families,genes,organism"))
# A set is passed so that each identifier is requested only once.
print(u.get_df(set(all_identifiers)))
Esempio n. 8
0
    protlist: list of proteins
    idnm: type of protein names, such as AC.
    output::
    dict of protein information::
    Entry name; Gene names; Length; Organism; Protein names; Status.
    """
    u = UniProt(verbose=False)
    return u.quick_search(protlist)


# if "__name__" == "__main__":
# Demo queries against the UniProt service.
u = UniProt(verbose=False)
data = u.search("zap70+taxonomy:9606",
                frmt="tab",
                limit=3,
                columns="entry name, length, id, genes")
# print() call form: the statement form `print data` is a SyntaxError on Python 3.
print(data)

res = u.search("DNMT1_HUMAN",
               frmt="tab",
               columns="entry name, protein names, pathway, comments")
print(res)

res = u.quick_search("DNMT1_HUMAN")
# The result is indexed one key at a time; using the whole keys() collection
# as a single key (as the code previously did) raises TypeError.
getreskey = list(res.keys())
for key in getreskey:
    print(res[key]['Gene names'])

df = u.get_df("GALK1_HUMAN")
df['Length'].hist()
plt.show()
Esempio n. 9
0
    Returns
    -------
    accepted : DataFrame
        The filtered pandas DataFrame containing only those target PSM's
        with a score higher than the specified FDR threshold.
    """
    spectrumScoreDatabase.loc[:,
                              'Inferred Proteins'] = spectrumScoreDatabase.apply(
                                  lambda row: proteinData.loc[
                                      proteinData.Sequence.str.contains(row[
                                          'Sequence'])].Identifier.tolist(),
                                  axis=1)
    return spectrumScoreDatabase


print(
    '\nFinding protein identifiers associated with the matched peptides...\n')
PSM = retrieveProteins(PSM)
print(PSM)
input("\n\nPress Enter to continue...")

print('\nLooking up identifiers on UniProt...\n')

# Flatten the per-row identifier lists of the 'Inferred Proteins' column into
# one list. The Series is iterated directly because Series.iteritems() was
# removed in pandas 2.0 and the index it yielded was never used.
all_identifiers = []
for identifierlist in PSM['Inferred Proteins']:
    all_identifiers.extend(''.join(identifier) for identifier in identifierlist)
#print(u.search('Q12797',frmt="tab", columns="id,protein names,families,genes,organism"))
# A set is passed so that each identifier is requested only once.
print(u.get_df(set(all_identifiers)))
# Register the required -i option that names the input CRISPR screen file
parser.add_argument(
    "-i",
    help="Input CRISPR screen file",
    required=True,
    type=argparse.FileType('r'),
)

args = parser.parse_args()

# read_excel below is given the file's path rather than the handle argparse opened
input_file = args.i.name

df_input = pd.read_excel(input_file)

# Short alias for the UniProt service
u = UniProt()

# Data frame for the UniProt query "organism:9606+and+reviewed:yes"
df_uniprot = u.get_df("organism:9606+and+reviewed:yes")

# Rename columns so that the gene-symbol column matches the input sheet
df_uniprot = df_uniprot.rename(
    columns={
        'Gene names  (primary )': 'Gene Symbol',
        'Entry': 'UniProt Symbol',
        'Proteomes': 'Chromosome Number',
        'Length': 'Protein Length',
    })

# Keep only the columns of interest
df1_uniprot = df_uniprot[[
    'UniProt Symbol',
    'Gene names',
    'Gene Symbol',
    'Protein names',
    'Chromosome Number',
    'Sequence',
    'Protein Length',
    'Function [CC]',
    'Gene ontology (GO)',
    'Gene ontology (biological process)',
    'Gene ontology (molecular function)',
    'Gene ontology (cellular component)',
    'Protein families',
]]

# Load the NCBI gene list into a pandas data frame
df_ncbi = pd.read_table('NCBI_GeneID_File.txt')

#Merge the data frames on the UniProt