Ejemplo n.º 1
0
    def dfSummary(cls, thisObservation, mc=10):
        """Collect the output of ``cls.summarizeKmers`` into a DataFrame.

        :param thisObservation: forwarded unchanged to ``cls.summarizeKmers``.
        :param mc: forwarded unchanged to ``cls.summarizeKmers`` (default 10);
            its meaning is defined there.
        :return: a DataFrame holding one row per record yielded by
            ``cls.summarizeKmers``. The columns are whatever iterating the
            first record yields (presumably its dict keys, since every record
            is also handed to ``DataRow.fromDict``). If no record is produced,
            the returned frame has neither columns nor rows.
        """
        summary = DataFrame()

        headerPending = True

        for record in cls.summarizeKmers(thisObservation, mc):

            if headerPending:
                # Only the very first record defines the column layout.
                summary.addColumns(list(record))
                headerPending = False

            summary.addRow(DataRow.fromDict(record))

        return summary
Ejemplo n.º 2
0
          "genes")

    indf = DataFrame.parseFromFile(args.de.name,
                                   skipChar='#',
                                   replacements={
                                       "None": None,
                                       "": None,
                                       "NA": None
                                   })

    inHeaders = indf.getHeader()

    outdf = DataFrame()
    outdf.addColumns([
        'elem_id', 'population_size', 'success_population', 'sample_size',
        'success_samples', 'sample_success_fraction', 'pval', 'adj_pval',
        'direction', 'genes'
    ])

    for direction in ["UP", "DOWN", "ANY"]:

        significantGenes = set()
        measuredGenes = set()

        for row in indf:

            geneID = row[args.gene_symbol].upper()

            if geneID in sym2approvSym:
                geneID = sym2approvSym[geneID]
Ejemplo n.º 3
0
    def fetch(self,
              fromEntity,
              elements,
              toEntities=None,
              error_on_empty_result=True):
        """Request a conversion of ``elements`` and return it as a DataFrame.

        A single POST request is issued through ``self._request`` and the
        tab-separated text of the response is parsed line by line.

        :param fromEntity: identifier type of ``elements``; checked with
            ``self._must_accept`` and used as the last column of the result.
        :param elements: iterable of identifiers; sorted before the request
            parameters are built.
        :param toEntities: list of identifier types to convert into; checked
            with ``self._must_provide``. ``None`` (the default) selects
            ``[GeneIdentity.UNIPROT, GeneIdentity.GENE_SYMBOL,
            GeneIdentity.ORDERED_LOCUS]``.
        :param error_on_empty_result: if true, an empty response text raises
            instead of yielding an empty frame.
        :return: DataFrame with the columns ``toEntities + [fromEntity]``.
            Empty fields are stored as ``None``. A line whose last kept field
            is a comma-separated list is expanded into one row per list entry.
        :raises StoreException: if the response has no text, or if the text
            is empty and ``error_on_empty_result`` is true.
        """
        if toEntities is None:
            # Built per call so no list object is shared between invocations.
            toEntities = [
                GeneIdentity.UNIPROT, GeneIdentity.GENE_SYMBOL,
                GeneIdentity.ORDERED_LOCUS
            ]

        self._must_accept(fromEntity)
        self._must_provide(toEntities)

        elements = sorted(elements)

        reqParams = self._make_params(fromEntity, elements, toEntities)

        # Echo the request parameters; values of length >= 100 are only
        # reported by their length to keep the output readable.
        for x in reqParams:

            lenReqParams = len(reqParams[x])

            if lenReqParams < 100:
                print(str(x) + " " + str(reqParams[x]))
            else:
                print(str(x) + " " + str(lenReqParams) + " elements")

        resp = self._request(RequestMethod.POST, "", reqParams)

        if resp.text is None:
            print(json.dumps(reqParams))
            raise StoreException("Could not retrieve elements")

        if len(resp.text) == 0 and error_on_empty_result:
            raise StoreException("Empty result")

        convData = DataFrame()
        dfCols = toEntities + [fromEntity]
        convData.addColumns(dfCols)

        def addLineToReturn(lineData):
            # Pair the values with the columns in order; '' becomes None.
            # zip() stops at the shorter of the two sequences.
            modLine = {}
            for c, x in zip(dfCols, lineData):
                modLine[c] = None if x == '' else x

            convData.addRow(DataRow.fromDict(modLine))

        # The first line of the response is always skipped
        # (presumably a header row - TODO confirm against the service).
        for line in resp.text.split('\n')[1:]:

            if len(line) == 0:
                continue

            # The last tab-separated field is discarded
            # (NOTE(review): presumably a trailing/extra column - confirm).
            aline = line.split('\t')[:-1]

            if len(aline) == 0:
                # A line without any tab has nothing left after trimming.
                # The previous check ran before trimming, could never be
                # true, and such a line crashed with IndexError below.
                continue

            if ',' not in aline[-1]:
                addLineToReturn(aline)
                continue

            # The last kept field lists several values: emit one row each.
            elems = aline[-1].split(',')
            elemCount = len(elems)

            # Split the leading fields once per line instead of once per row.
            leading = [(elem, elem.split(' ')) for elem in aline[:-1]]

            for i in range(elemCount):

                # A leading field with exactly elemCount space-separated
                # parts contributes its i-th part; any other field is
                # repeated unchanged on every expanded row.
                modLine = [
                    parts[i] if len(parts) == elemCount else elem
                    for elem, parts in leading
                ]
                modLine.append(elems[i])

                addLineToReturn(modLine)

        return convData
Ejemplo n.º 4
0
        for mirna in fInts:

            mirnaFound = False

            for defMirna in defInts:
                if defMirna.accept(mirna):
                    mirnaFound = True
                    break

            if mirnaFound == False:
                additionalInteractions[x].add(miRNA(mirna))

    missingDF = DataFrame()
    missingDF.addColumns([
        'chemokine', 'miRNA Group', 'miRNA', 'Original Network', 'PubMed',
        'MIRECORD', 'MIRTARBASE', 'MIRWALK'
    ])

    linkedDF = DataFrame()
    linkedDF.addColumns([
        'chemokine', 'miRNA Group', 'miRNA', 'Original Network', 'PubMed',
        'MIRECORD', 'MIRTARBASE', 'MIRWALK'
    ])

    totalMissing = 0
    print("Missing miRNAs")
    for x in missingInteractions:
        print(x, len(missingInteractions[x]), len(interactions[x]),
              missingInteractions[x])

        totalMissing += len(missingInteractions[x])
Ejemplo n.º 5
0
                [x[1] - pseudoCount for x in sample2genecount[sample]])
            print(sample, totalCount)

            sample2stats[sample] = {"sample": sample, "totalCount": totalCount}

        sample2stats = makeplot(sample2genecount, defile.name, sample2stats,
                                args.output[fidx])

        columns = list()
        for sample in sample2stats:
            for x in sample2stats[sample]:
                if not x in columns:
                    columns.append(x)

        outdf = DataFrame()
        outdf.addColumns(columns)

        for sample in sample2stats:

            dr = DataRow.fromDict(sample2stats[sample])
            outdf.addRow(dr)

        print(outdf)

        if dfGroups != None:

            allGenes = set()
            for sample in sample2genecount:
                for x in sample2genecount[sample]:
                    allGenes.add(x[0])
Ejemplo n.º 6
0
    for x in df2SpecialCols:
        xn = x.split("_")
        xn.insert(1, args.prefix2)
        xn = "_".join(xn)
        df2Col2New[x] = xn
        print("Sp2", x, xn)

    df1NewCols = [df1Col2New[x] for x in df1Col2New]
    df2NewCols = [df2Col2New[x] for x in df2Col2New]

    outdf = DataFrame()

    if args.prefix_counts:
        outdf.addColumns(
            df12CommonCols +
            [args.prefix1 + "_" + x for x in df1UniqueCols + df1SampleCols] +
            [args.prefix2 + "_" + x
             for x in df2UniqueCols + df2SampleCols] + df1NewCols + df2NewCols)
    else:
        outdf.addColumns(df12CommonCols + df1UniqueCols + df2UniqueCols +
                         df1NewCols + df2NewCols)

    for x in outdf.getHeader():
        print("O", x)

    id2dataDf = {}

    for row in indf1:
        data = {}
        for x in df12CommonCols:
            data[x] = row[x]
Ejemplo n.º 7
0
            if args.fpkm:

                #print(curGeneID, geneLength)

                fpkmValue = row[sample] / (sample2total[sample] *
                                           geneLength) * pow(10, 9)
                rowDict[sample + ".FPKM"] = fpkmValue

            if args.tpm:

                tpmValue = row[sample] / (geneLength *
                                          sample2ratio[sample]) * pow(10, 6)
                rowDict[sample + ".TPM"] = tpmValue

        allRowUpdates.append(rowDict)

    allCols = set()
    for x in allRowUpdates:
        for y in x:
            if not y in featureCountsColumns:
                allCols.add(y)

    outdf.addColumns(featureCountsColumns)
    outdf.addColumns(sorted(allCols), default=0, ignoreDuplicates=True)
    outdf.updateRowIndexed("Geneid",
                           allRowUpdates,
                           ignoreMissingCols=True,
                           addIfNotFound=True)

    outdf.export(args.output.name, exType=ExportTYPE.TSV)
Ejemplo n.º 8
0
                print(stage, mirna, cellpair[0], cellpair[1], mirnaCellPairs[mirna][cellpair], stageMirnaCellPairs[cellpair], stageMir2CellEvidence[stage][mirna].get(cellpair[0]),stageMir2CellEvidence[stage][mirna].get(cellpair[1]) )

    cellgraph = networkx.Graph()

    allnodes = set()
    for edge in edge2support:
        allnodes.add(edge[0])
        allnodes.add(edge[1])

    for node in allnodes:
        cellgraph.add_node(node[1] + " ("+node[0]+")", size=20 + stageCellCount[node])


    cellCommunicatorDF = DataFrame()
    cellCommunicatorDF.addColumns(["miRNA", "cells"])

    mirna2cells = defaultdict(set)

    for edge in edge2support:
        cellgraph.add_edge(
            edge[0][1] + " (" + edge[0][0] + ")",
            edge[1][1] + " (" + edge[1][0] + ")",
            label=", ".join(edge2support.get(edge, [])))

        mirnas = edge2support.get(edge, [])

        for mirna in mirnas:
            mirna2cells[mirna].add(edge[0][1] + " (" + edge[0][0] + ")")
            mirna2cells[mirna].add(edge[1][1] + " (" + edge[1][0] + ")")
Ejemplo n.º 9
0
        pubmedID = article['PubmedData']['ArticleIdList'][0] if len(
            article['PubmedData']['ArticleIdList']) > 0 else "-1"
        pubID = int(pubmedID)

        artInfo = article['MedlineCitation']['Article']
        articleTitle = artInfo['ArticleTitle']
        articleJournal = artInfo['Journal'][
            'Title'] if 'Journal' in artInfo else ''

        pmid2title[pubID] = articleTitle

    return pmid2title


res = DataFrame()
res.addColumns(["SET", "PMID_ID", "PMID_TITLE", 'Common'])

print(ntd)
print("NTD", len(ntd))

pmidt = getPMIDTitles(ntd)
for x in sorted([x for x in pmidt]):

    dataDict = {
        'SET':
        'NTinfect',
        'PMID_ID':
        "<a href='https://www.ncbi.nlm.nih.gov/pubmed/" + str(x) +
        "' target='_blank'>" + str(x) + "</a>",
        'PMID_TITLE':
        pmidt[x],
Ejemplo n.º 10
0
                foundAcceptedInteractions[x].add(mirna)

        for mirna in fInts:

            mirnaFound = False

            for defMirna in defInts:
                if defMirna.accept(mirna):
                    mirnaFound = True
                    break

            if mirnaFound == False:
                additionalInteractions[x].add(miRNA(mirna))

    missingDF = DataFrame()
    missingDF.addColumns(
        ['chemokine', 'miRNA Group', 'miRNA', 'Weber', 'PubMed', 'MIRTARBASE'])

    linkedDF = DataFrame()
    linkedDF.addColumns(
        ['chemokine', 'miRNA Group', 'miRNA', 'Weber', 'PubMed', 'MIRTARBASE'])

    totalMissing = 0
    print("Missing miRNAs")
    for x in missingInteractions:
        print(x, len(missingInteractions[x]), len(interactions[x]),
              missingInteractions[x])

        totalMissing += len(missingInteractions[x])

        selInts = missingInteractions[x]
Ejemplo n.º 11
0

        print(datetime.datetime.now(), "Loading ncit")
        ncitPMIDs = easyPMIDFinder(args.pmidBase + "/ncit.pmid")
        dbs2pmids["NCIT"] = ncitPMIDs

        with open("/mnt/d/pmidsindims.pickle", 'wb') as fout:
            pickle.dump(dbs2pmids, fout)
    else:

        with open("/mnt/d/pmidsindims.pickle", 'rb') as fout:
            dbs2pmids = pickle.load(fout)


    outdf = DataFrame()
    outdf.addColumns(["Subset", "Number of PMIDs"])

    allDims = [x for x in dbs2pmids]

    allPowerSets = powerset(sorted(allDims))

    allPMIDs = set()
    for x in dbs2pmids:
        allPMIDs = allPMIDs.union(dbs2pmids[x])


    for pset in allPowerSets:

        if len(pset) == 0:
            continue
Ejemplo n.º 12
0
    ]

    networks['targetMirsCholEfflux'] = targetMirsCholEfflux

    # SMC proliferation / migration
    targetMirsSMCProlif = [
        'miR-24', 'miR-26a', 'miR-31', 'miR-146a', 'miR-155', 'miR-208',
        'miR-221', 'miR-222', 'miR-7d', 'let-7d', 'miR-1', 'miR-10a', 'miR-21',
        'miR-29', 'miR-100', 'miR-132', 'miR-133', 'miR-143', 'miR-145',
        'miR-195', 'miR-204', 'miR-424', 'miR-638', 'miR-663'
    ]

    networks['targetMirsSMCProlif'] = targetMirsSMCProlif

    summaryDF = DataFrame()
    summaryDF.addColumns(
        ["Network", "Accepted miRNAs", 'Additional miRNAs', "Missing miRNAs"])

    networkGraphs = {}
    makeStory = []

    allNetworks = [x for x in networks]
    print(allNetworks)

    #exit()

    ignoreNetworks = []

    networkRestrictions = {
        'targetMirsECA': {
            "cells": [{
                "group": "cells",
Ejemplo n.º 13
0
                'name': 'atherosclerosis'
            }]
        },
        'andreou_table1_athero': {
            'sentences':
            "false",
            "disease": [{
                'group': 'disease',
                'termid': 'DOID:1936',
                'name': 'atherosclerosis'
            }]
        },
    }

    restrictDF = DataFrame()
    restrictDF.addColumns(["Network", "Cells", "Disease", "Other"], "")

    for x in networkRestrictions:

        restricts = networkRestrictions[x]

        networkDRdict = defaultdict(str)
        networkDRdict["Network"] = x.replace("_", " ")

        diseaseElems = []
        cellElems = []
        otherElems = []

        for restrictType in restricts:

            if restrictType == "sentences":
Ejemplo n.º 14
0
        'miR-29',
        'miR-100',
        'miR-132',
        'miR-133',
        'miR-143',
        'miR-145',
        'miR-195',
        'miR-204',
        'miR-424',
        'miR-638',
        'miR-663']

    networks['targetMirsSMCProlif'] = targetMirsSMCProlif

    summaryDF = DataFrame()
    summaryDF.addColumns(["Network", "miRNAs", 'Target Genes'])

    networkGraphs = {}
    makeStory = [

    ]

    allNetworks = [x for x in networks]
    print(allNetworks)

    #exit()

    ignoreNetworks = []

    networkRestrictions = {
        'targetMirsECA': {
Ejemplo n.º 15
0
    homDB.finalize()
    homDB.save_to_file(fileLocation + "combed")
    """

    for orgname in homDB.get_all_organisms():
        genomDB.loadGenome(orgname)
    allorgs = list(homDB.get_all_organisms())

    mc = ['4_N1-031C1', '2_N1-025A2', '14_1-20A_UB64', '13_N5-004A1', '3_N1-029C1', '11_N4-029C2', '10_N2-085C2', '1_N1-024A1']
    nmc = [x for x in allorgs if not x in mc] # and not x.startswith("15")


    allData = DataFrame()


    allData.addColumns(allorgs)
    homClusterIDs = []


    for homid in homDB.homologies:

        val = homDB.get_homology_cluster(homid)

        maxlength = 0
        for org in val:

            geneid = val[org]
            seq = genomDB.get_sequence(org, geneid)

            if len(seq) > maxlength:
                maxlength = len(seq)
Ejemplo n.º 16
0
            xn = "_".join(xn)
            df2Col2New[x] = xn
            print("S2", x, xn)

        df1NewCols = [df1Col2New[x] for x in df1Col2New]
        df2NewCols = [df2Col2New[x] for x in df2Col2New]

        outdf = DataFrame()

        if args.prefix_counts:

            if len(curPrefix) > 0:
                curPrefix += "_"

            outdf.addColumns(
                df12CommonCols + [curPrefix + x for x in df1UniqueCols] +
                [args.prefixes[didx] + "_" + x
                 for x in df2UniqueCols] + df1NewCols + df2NewCols)
        else:
            outdf.addColumns(df12CommonCols + df1UniqueCols + df2UniqueCols +
                             df1NewCols + df2NewCols)

        for x in outdf.getHeader():
            print("O", x)

        id2dataDf = {}

        for row in curDF:
            data = {}
            for x in df12CommonCols:
                data[x] = row[x]
Ejemplo n.º 17
0
    allorgs = list(homDB.get_all_organisms())

    extra = ['AE001439', 'CP009259']

    mc = [
        '4_N1-031C1', '2_N1-025A2', '14_1-20A_UB64', '13_N5-004A1',
        '3_N1-029C1', '11_N4-029C2', '10_N2-085C2', '1_N1-024A1'
    ]
    nmc = [
        x for x in allorgs
        if not x in mc and not x in extra and not x.startswith("6_")
    ]  # and not x.startswith("15")

    allData = DataFrame()

    allData.addColumns(allorgs)
    homClusterIDs = []

    for homid in homDB.homologies:

        val = homDB.get_homology_cluster(homid)

        maxlength = 0
        for org in val:

            geneid = val[org]
            seq = genomDB.get_sequence(org, geneid)

            if len(seq) > maxlength:
                maxlength = len(seq)