Exemple #1
0
    def __init__(self, name, annotation='annotation.pklz', new=True):
        """Set up the gene and transcript networks.

        Args:
            name: Passed to the network constructors when ``new`` is true;
                otherwise written to ``_name`` on both existing networks.
            annotation: When ``new`` is true, one of ``"ucsc"``,
                ``"gencode"`` or ``"ensembl"``. Otherwise it is forwarded
                to ``method.Method.__init__`` (presumably the location of
                a previously stored annotation -- TODO confirm).
            new: Whether empty networks are created instead of reusing
                existing ones.

        Raises:
            SpadaError: If ``new`` is true and ``annotation`` is not one of
                the recognized sources. Note that this includes the default
                value of ``annotation``.
        """
        if not new:
            # Reuse path: this relies on Method.__init__ having populated
            # self._genes and self._txs; only their names are updated here.
            method.Method.__init__(self, __name__, annotation)
            self._genes._name = name
            self._txs._name = name
            self._new = new
            return

        method.Method.__init__(self, __name__, None)

        # Pick the network classes matching the annotation source.
        if annotation == "ucsc":
            self._genes = ucsc_gene_network.UCSCGeneNetwork(name)
            self._txs = ucsc_transcript_network.UCSCTranscriptNetwork(name)
        elif annotation == "gencode":
            self._genes = gencode_gene_network.GENCODEGeneNetwork(name)
            self._txs = gencode_transcript_network.GENCODETranscriptNetwork(
                name)
        elif annotation == "ensembl":
            self._genes = ensembl_gene_network.ENSEMBLGeneNetwork(name)
            self._txs = ensembl_transcript_network.ENSEMBLTranscriptNetwork(
                name)
        else:
            raise SpadaError(
                "Unrecognized annotation: {}.".format(annotation))

        self._new = new
Exemple #2
0
    def getDomainInteractions(self, ddi):
        """Add transcript edges supported by domain-domain interactions.

        For every edge of the gene network, each pair of transcripts of the
        two genes is examined. When at least one pair of their ``"Pfam"``
        entries appears in the table read from ``ddi``, an edge carrying the
        matching pairs (``ddi=`` keyword) is added to the transcript network.

        Args:
            ddi: Handed to ``io.readTable``; its rows must provide the
                ``Pfam1`` and ``Pfam2`` keys. May be falsy when an existing
                network is reused, in which case nothing is changed.

        Raises:
            SpadaError: If a new network is being built and ``ddi`` is falsy.
        """
        if not ddi:
            if self._new:
                raise SpadaError(
                    "A file containing the domain-domain interactions must be provided."
                )
            self.logger.info(
                "Domain-domain interactions from the provided network will be used."
            )
            return

        self.logger.info("Building isoform-isoform interaction network.")

        # Unordered pairs: frozensets make (a, b) and (b, a) compare equal.
        knownPairs = set()
        for row in io.readTable(ddi, keys=['Pfam1', 'Pfam2']):
            knownPairs.add(frozenset([row['Pfam1'], row['Pfam2']]))

        gene2tx = io.getGene2Tx(self._txs)

        for geneA, geneB in self._genes._net.edges():
            txsA = gene2tx.get(geneA, set())
            txsB = gene2tx.get(geneB, set())

            for txA, txB in product(txsA, txsB):
                domainsA = self._txs[txA]["Pfam"]
                domainsB = self._txs[txB]["Pfam"]

                candidates = {
                    frozenset(pair) for pair in product(domainsA, domainsB)
                }
                shared = candidates & knownPairs

                if shared:
                    self._txs.add_edge(txA, txB, ddi=shared)
Exemple #3
0
def parseExpression(ctrlFile, caseFile, genes, txs):
    """Yield per-gene expression read in lockstep from two files.

    Both files are opened, their sample ids are read with ``readSamples``
    and the remaining content is consumed pairwise through
    ``parseExpressionLine``. Transcripts are accumulated into one
    ``GeneExpression`` per gene; a gene is yielded (and dropped from the
    internal buffer) as soon as its ``isComplete`` attribute is truthy.

    Args:
        ctrlFile: Path of the control expression file.
        caseFile: Path of the case expression file.
        genes: Not used by this function; kept so existing callers keep
            working.
        txs: Mapping-like object where ``txs[tx]["gene_id"]`` gives the
            gene of a transcript. Transcripts raising ``KeyError`` there
            are skipped.

    Yields:
        ``(gene, GeneExpression)`` tuples.

    Raises:
        SpadaError: If the two files list different transcripts at the
            same position.
    """
    gene2tx = getGene2Tx(txs)

    with open(ctrlFile, "r") as CTRL, open(caseFile, "r") as CASE:

        idsCtrl = readSamples(CTRL)
        idsCase = readSamples(CASE)

        # gene -> partially filled GeneExpression
        expression = {}

        for (tx, ctrl), (tx2, case) in zip(parseExpressionLine(CTRL),
                                           parseExpressionLine(CASE)):

            if tx != tx2:
                raise SpadaError(
                    "Case and control expresion files mismatch: {} vs. {}.".
                    format(tx, tx2))

            try:
                gene = txs[tx]["gene_id"]
            except KeyError:
                continue

            # Build the container only for the first transcript of a gene;
            # dict.setdefault would construct (and discard) one per line.
            if gene not in expression:
                expression[gene] = GeneExpression(gene2tx[gene], idsCtrl,
                                                  idsCase)
            expression[gene].addTx(tx, ctrl, case)

            if expression[gene].isComplete:
                yield gene, expression.pop(gene)
Exemple #4
0
    def computePSI(self, expression, nan_rm=False):
        """Normalize an expression array by its column sums.

        Args:
            expression: Array whose entries are divided by the sum taken
                along axis 0 (presumably transcripts x samples -- TODO
                confirm with callers).
            nan_rm: When true, every column containing at least one NaN
                after the division is dropped.

        Returns:
            The normalized array, optionally without the NaN columns.

        Raises:
            SpadaError: If ``expression`` has shape ``(0,)``.
        """
        if expression.shape == (0, ):
            raise SpadaError("Expression empty.")

        totals = expression.sum(axis=0)
        psi = expression / totals

        if not nan_rm:
            return psi

        # Second element of the index tuple holds the column of each NaN.
        nan_columns = np.where(np.isnan(psi))[1]
        return np.delete(psi, nan_columns, axis=1)
Exemple #5
0
    def createNetworks(self, gtf):
        """Populate the gene and transcript networks from a GTF file.

        Records yielded by ``io.readGTF`` are dispatched on their
        ``"feature"`` field: accepted genes become gene nodes, accepted
        transcript-level records create or update transcript nodes.

        Args:
            gtf: Handed to ``io.readGTF``. Must be falsy when an existing
                network is reused, in which case nothing is done.

        Raises:
            SpadaError: If a GTF is given although a previous network is
                to be used.
        """
        if not self._new:
            if gtf:
                raise SpadaError(
                    "gtf provided when previous network is to be used.")
            return

        self.logger.info("Importing genes and transcripts from GTF.")

        txLines = ['transcript', 'exon', 'CDS', 'start_codon', 'stop_codon']

        for record in io.readGTF(gtf):
            kind = record["feature"]

            if kind == "gene":
                if self._genes.accept(record):
                    self._genes.add_node(gene_id=record["gene_id"],
                                         gene_symbol=record["gene_name"])
                continue

            if kind not in txLines or not self._txs.accept(record):
                continue

            if kind == "transcript":
                txId = record["transcript_id"]
                self._txs.add_node(txId, record["gene_id"])
                span = [int(record["start"]), int(record["end"])]
                self._txs.update_node(txId, "txCoords", span)
                self._txs.update_node(txId, "strand", record["strand"])
                self._txs.update_node(txId, "chr", record["chromosome"])
                self._txs.update_node(txId, "main", self._txs.isMain(record))
            elif kind == "exon":
                self._txs.update_node(
                    record["transcript_id"], "exons",
                    [int(record["start"]),
                     int(record["end"])])
            elif kind in ("start_codon", "stop_codon"):
                # The codon position is the strand-dependent first base;
                # it is stored as read, without integer conversion.
                if record['strand'] == '+':
                    pos = record["start"]
                else:
                    pos = record['end']
                self._txs.update_node(record["transcript_id"], kind, pos)
            elif kind == "CDS" and self._txs.acceptCDS(record):
                self._txs.update_node(
                    record["transcript_id"], "CDS",
                    [int(record["start"]),
                     int(record["end"])])
Exemple #6
0
    def getIsoformSequences(self, proteins):
        """Attach protein sequences to the transcript network.

        Args:
            proteins: Handed to ``io.readFasta``, which yields
                ``(transcript, sequence)`` pairs. May be falsy when an
                existing network is reused, in which case nothing changes.

        Raises:
            SpadaError: If a new network is being built and ``proteins``
                is falsy.
        """
        if not proteins:
            if self._new:
                raise SpadaError(
                    "A FASTA file with protein sequences file must be provided.")
            self.logger.info(
                "Protein sequences from the provided network will be used.")
            return

        self.logger.info("Reading protein sequences.")

        for entry in io.readFasta(proteins):
            txId, residues = entry
            self._txs.update_node(txId, "proteinSequence", residues)
Exemple #7
0
    def getInteractions(self, ppi):
        """Add gene-gene edges from a PSI-MI TAB interaction file.

        Only lines where the first organism entry of both interactors has
        id ``"9606"`` are used. For each line the identifiers typed
        ``'entrez gene/locuslink'`` are tried first; if no edge could be
        added, aliases flagged as gene names or synonyms that match a
        symbol already in the gene network are appended and all pairs are
        tried again. At most one edge is added per line.

        Args:
            ppi: Handed to ``io.readPSIMITAB``. May be falsy when an
                existing network is reused, in which case nothing changes.

        Raises:
            SpadaError: If a new network is being built and ``ppi`` is
                falsy.
        """
        if self._new and not ppi:
            raise SpadaError(
                "A file containing the protein-protein interactions must be provided."
            )
        elif not ppi:
            self.logger.info(
                "Protein-protein interactions from the provided network will be used."
            )
            return

        self.logger.info("Building protein-protein interaction network.")

        # A set, because membership is tested for every alias of every line.
        symbols = {y["symbol"] for x, y in self._genes.nodes(data=True)}
        aliasKinds = ("gene name", "gene name synonym")

        def linkFirstPair(candidatesA, candidatesB):
            # Stop at the first pair the gene network accepts as an edge.
            for A, B in product(candidatesA, candidatesB):
                if self._genes.add_edge(symbol1=A, symbol2=B):
                    return True
            return False

        for line in io.readPSIMITAB(ppi):

            if line["organismA"][0]["id"] != "9606" or line["organismB"][0][
                    "id"] != "9606":
                continue

            symbolA = [
                x["id"] for x in line["symbolA"]
                if x["type"] == 'entrez gene/locuslink'
            ]
            symbolB = [
                x["id"] for x in line["symbolB"]
                if x["type"] == 'entrez gene/locuslink'
            ]

            if linkFirstPair(symbolA, symbolB):
                continue

            # Fallback: widen both candidate lists with known aliases and
            # retry every pair once, in the same order as before.
            symbolA.extend(
                x["id"] for x in line["aliasA"]
                if x.get("extra", None) in aliasKinds and x["id"] in symbols)
            symbolB.extend(
                x["id"] for x in line["aliasB"]
                if x.get("extra", None) in aliasKinds and x["id"] in symbols)

            linkFirstPair(symbolA, symbolB)
Exemple #8
0
    def getIsoformFeatures(self, features):
        """Attach protein features to the transcript network.

        Each row of the table becomes one ``update_node`` call with the
        transcript, the feature type, the ``(start, end)`` integer tuple
        and the feature name.

        Args:
            features: Handed to ``io.readTable``; rows must provide the
                ``Transcript``, ``Feature_type``, ``Feature``, ``Start``
                and ``End`` keys. May be falsy when an existing network is
                reused, in which case nothing changes.

        Raises:
            SpadaError: If a new network is being built and ``features``
                is falsy.
        """
        if not features:
            if self._new:
                raise SpadaError(
                    "A file with the protein features must be provided.")
            self.logger.info(
                "Protein features from the provided network will be used.")
            return

        self.logger.info("Reading isoform features.")

        columns = ['Transcript', 'Feature_type', 'Feature', 'Start', 'End']

        for row in io.readTable(features, keys=columns):
            txId, kind, label = (row['Transcript'], row['Feature_type'],
                                 row['Feature'])
            span = (int(row['Start']), int(row['End']))

            self._txs.update_node(txId, kind, span, label)
Exemple #9
0
    def analyzeDomains(self, featureType):

        featureInfo = []
        features = set()

        if not featureType in ["Pfam", "Prosite"]:
            raise SpadaError(featureType +
                             ' not recognized. Use Pfam or Prosite.')

        for isoform in [self.ctrlIsoform, self.caseIsoform]:
            if isoform:
                f = isoform._prosite if featureType == "Prosite" else isoform._pfam
                [features.add(x) for x in f]

        for feature in features:

            featInfo = {self.ctrl: [], self.case: []}

            for isoform in [self.ctrlIsoform, self.caseIsoform]:
                if not isoform:
                    continue

                featureRegions = isoform.getFeature(featureType, feature)
                specificRegions = isoform.getSegments("isoform-specific")

                for region in featureRegions:

                    thisIsosp = polypeptide.Polypeptide([])
                    if None not in [self.ctrlIsoform, self.caseIsoform]:
                        for x in specificRegions:
                            if len(x & region):
                                thisIsosp = thisIsosp | x
                    else:
                        thisIsosp = region
                    intersection = float(len(region & thisIsosp))
                    domainLength = float(len(region))
                    specificLength = float(len(thisIsosp))

                    macroScore = intersection / domainLength
                    microScore = float(
                        "nan"
                    ) if specificLength == 0 else intersection / specificLength
                    jaccard = intersection / len(region | thisIsosp)

                    start = min([x.num for x in region])
                    end = max([x.num for x in region])

                    featInfo[isoform.tx].append({
                        'macro': macroScore,
                        'micro': microScore,
                        'jaccard': jaccard,
                        'start': start,
                        'end': end
                    })

                featInfo[isoform.tx] = sorted(featInfo[isoform.tx],
                                              key=operator.itemgetter("macro"))

            i = 1
            emptyDict = {
                'macro': float('nan'),
                'micro': float('nan'),
                'jaccard': float('nan'),
                'start': float('nan'),
                'end': float('nan')
            }
            for nDict, tDict in zip_longest(featInfo[self.ctrl],
                                            featInfo[self.case],
                                            fillvalue=emptyDict):