Example #1
def funcIDNormPCL(target, source, env, iMaxLines=100000):
    strT, astrSs = sfle.ts(target, source)
    strS = astrSs[1] if (
        len(sfle.readcomment(astrSs[1])) > c_iSkip) else astrSs[0]
    iLC = sfle.lc(strS)
    return (sfle.ex("Normalizer -t pcl -T medmult < " + strS, strT) if
            (iLC < iMaxLines) else sfle.ex("head -n 3 < " + strS, strT))
Example #2
def get_mappingfile(strTaxID, fApprox=True, strDir=c_strDirMapping):

    if not strTaxID:
        return None

    astrMapOut = []
    if not sfle.isempty(c_strFileManualMapping):
        pHash = {
            k: v
            for k, v in [
                a.split('\t')
                for a in sfle.readcomment(open(c_strFileManualMapping))
            ]
        }
        astrMapOutTmp = list(
            filter(bool, [
                pHash.get(item)
                for item in [" ".join(taxid2org(strTaxID).split(" ")[:2])]
            ]))
        astrMapOut = [sfle.d(c_strDirMapping, x) for x in astrMapOutTmp]
    if not astrMapOut:
        # No manual mapping found: fall back to an un-prioritized glob in strDir
        astrIDs = [strTaxID] if not fApprox else org2taxid(
            taxid2org(strTaxID), True)
        for strID in astrIDs:
            astrGlob = glob.glob(sfle.d(strDir, strID + "_*"))
            if astrGlob:
                astrMapOut = astrGlob
                break
    return astrMapOut[0] if astrMapOut else None
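The lookup above leans on the sfle helpers (readcomment, isempty, d). As a rough, self-contained sketch of the same tab-delimited manual-mapping parse, assuming a file of "organism<TAB>mapping file" lines (the helper name load_manual_mapping and the plain-file I/O are illustrative, not part of ARepA):

def load_manual_mapping(strPath):
    # Build {organism: mapping file}, skipping blank lines and '#' comments,
    # which is roughly what sfle.readcomment plus the dict comprehension do above.
    pHash = {}
    with open(strPath) as fileIn:
        for strLine in fileIn:
            strLine = strLine.rstrip("\n")
            if (not strLine) or strLine.startswith("#"):
                continue
            strOrg, strMap = strLine.split("\t", 1)
            pHash[strOrg] = strMap
    return pHash

The lookup key is then the first two words of the organism name, e.g. pHash.get(" ".join(taxid2org(strTaxID).split(" ")[:2])), as in the function above.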
Example #3
def funcPCL2DAB(pE, fileIDRawPCL, fileGPLTXTGZ, fileProgAnnot2Map,
                fileProgGPL2TXT, fileProgMergeMapping, fileTaxa, filePlatform):

    astrSleipnir = sfle.readcomment(c_fileFlagSleipnir)
    bSleipnir = astrSleipnir[0] == "True" if astrSleipnir else False

    print("sleipnir", ("On" if bSleipnir else "Off"))

    #Produce raw mapping file for gene mapping
    astrMapRaw = pE.Command(
        c_fileIDMapRaw,
        [fileGPLTXTGZ, filePlatform, fileProgAnnot2Map, fileProgGPL2TXT],
        funcRawMap)

    #Produce merged mapping file
    astrMap = pE.Command(c_fileIDMap,
                         [fileTaxa, fileProgMergeMapping, astrMapRaw[0]],
                         funcMergeMap)

    #Perform Gene Mapping
    astrMapped = funcGeneIDMapping(pE, fileIDRawPCL, arepa.genemap_probeids(),
                                   c_fileStatus, astrMap[0], c_aiCOL, c_iSkip)

    #Get rid of duplicate identifiers
    astrUnique = funcMakeUnique(pE, astrMapped[0], c_iSkip, c_iCOL)

    if bSleipnir:
        pE.Command(c_fileIDNormPCL, [c_fileIDRawPCL, astrUnique[0]],
                   funcIDNormPCL)
        pE.Command(c_fileIDPCL, c_fileIDNormPCL, funcIDKNNPCL)
        pE.Command(c_fileIDDAB, c_fileIDPCL, funcIDDAB)
        pE.Command(c_fileIDQUANT, c_fileIDPCL, funcIDQUANT)
    else:
        sfle.sop(pE, "cp", [[astrUnique[0]], [True, c_fileIDPCL]])
Example #4
def funcMergeMap(target, source, env):
    strT, astrSs = sfle.ts(target, source)
    fileTaxa, fileMerge, fileIDRaw = astrSs[:3]
    astrTaxa = sfle.readcomment(fileTaxa)
    strMap = arepa.get_mappingfile(astrTaxa[0]) if astrTaxa else ""
    return (sfle.ex([fileMerge, fileIDRaw, strMap, strT])
            if strMap else sfle.ex(["cp", fileIDRaw, strT]))
Example #5
def funcRawMap(target, source, env):
    strT, astrSs = sfle.ts(target, source)
    strGPLTXTGZ, strPlatformTXT, strProgAnnot2Map, strProgGPL2TXT = astrSs[:4]
    strGPLID = (sfle.readcomment(open(strPlatformTXT)) or [""])[0]

    return (sfle.ex([strProgAnnot2Map, strGPLTXTGZ, strT])
            if not sfle.isempty(str(strGPLTXTGZ)) else sfle.ex(
                [strProgGPL2TXT, c_strGPLPath + strGPLID, strT]))
Example #6
def funcDAB(pE, fileOutDAB, afileInDAT):

    astrSleipnir = sfle.readcomment(c_fileFlagSleipnir)
    bSleipnir = astrSleipnir[0] == "True" if astrSleipnir else False
    print("sleipnir", ("On" if bSleipnir else "Off"))

    def _funcDAB(target, source, env):
        strT, astrSs = sfle.ts(target, source)
        strOut, strMap = astrSs[:2]
        return sfle.ex(("Dat2Dab", "-o", strT, "-i",
                        (strOut if sfle.isempty(strMap) else strMap)))

    if bSleipnir:
        return pE.Command(fileOutDAB, afileInDAT, _funcDAB)
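For orientation, the nested builder action reduces to a single Sleipnir Dat2Dab invocation; a stand-alone equivalent using subprocess (placeholder file names, Dat2Dab assumed to be on PATH):

import subprocess

# Roughly what sfle.ex(("Dat2Dab", "-o", strT, "-i", ...)) issues for one dataset;
# "GDS1234.pcl" and "GDS1234.dab" are invented names for illustration.
subprocess.check_call(["Dat2Dab", "-o", "GDS1234.dab", "-i", "GDS1234.pcl"])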
Example #7
def getGPL(target, source, env):
    astrTs, astrSs = ([f.get_abspath() for f in a] for a in (target, source))
    strAnnot, strPlatform = astrTs[:2]
    strRMeta = astrSs[0]
    pid = [row for row in csv.DictReader(open(strRMeta))][0]["platform_id"]
    strGPLID = c_strID.split("-")[1] if len(c_strID.split("-")) == 2 else pid
    listGPL = [
        v.replace(".annot.gz", "") for v in sfle.readcomment(c_fileAnnot)
    ]
    if strGPLID in listGPL:
        #Annotation file exist, download
        sfle.ex([
            "wget",
            sfle.d(c_strURLGPL, strGPLID + ".annot.gz"), "-O", strAnnot
        ])
    else:
        #Annotation file does not exist, skip download
        sfle.ex(["touch", strAnnot])
    #Make platform file containing gpl identifier
    with open(strPlatform, "w") as outputf:
        outputf.write(strGPLID)
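The only branching here is the GPL-identifier fallback; a small, self-contained illustration with invented directory names and platform_id values:

# If the working directory is named "GDS####-GPL####", take the GPL part;
# otherwise fall back to the platform_id column of the R metadata file.
for strID, strPlatformID in [("GDS1234-GPL96", "GPL570"), ("GDS1234", "GPL570")]:
    astrParts = strID.split("-")
    strGPLID = astrParts[1] if len(astrParts) == 2 else strPlatformID
    print(strID, "->", strGPLID)  # GPL96, then GPL570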
Example #8
import pickle
import itertools
import re
import time
import arepa
import sfle

g_iterCounter = itertools.count(0)

c_strID = arepa.cwd()
c_strPathRepo = arepa.name_repo()
c_strSufMap = ".map"
c_strMapped = "_mapped"
c_strDirData = sfle.d(arepa.path_repo(), sfle.c_strDirData)
c_strDirManMap = sfle.d(arepa.path_repo(), sfle.c_strDirEtc, "manual_mapping")
c_astrGeneTo = sfle.readcomment(
    sfle.d(arepa.path_arepa(), sfle.c_strDirEtc,
           "geneid")) or [arepa.genemap_genename()]
c_strPathGeneMapper = sfle.d(arepa.path_arepa(), "GeneMapper")
c_strFileUnzipLog = sfle.d(c_strPathGeneMapper, sfle.c_strDirTmp, "unzip.log")
c_strFileCompileLog = sfle.d(c_strPathGeneMapper, sfle.c_strDirTmp,
                             "compile.log")
c_strPathTopMapping = sfle.d(c_strPathGeneMapper, sfle.c_strDirEtc,
                             "manual_mapping")
c_strPathUniprotKO = sfle.d(c_strPathGeneMapper, sfle.c_strDirEtc, "uniprotko")
c_fileProgMakeUnique = sfle.d(arepa.path_arepa(), sfle.c_strDirSrc,
                              "makeunique.py")
c_funcGeneMapper = sfle.d(c_strPathGeneMapper, sfle.c_strDirSrc,
                          "bridgemapper.py")

c_strManualGeneIDs = sfle.d(arepa.path_repo(), sfle.c_strDirEtc,
                            "manual_geneid")
Example #9
"""
parse mapping files
begins with !platform_table_begin
ends with !platform_table_end
"""

import sfle
import glob
import csv
import sys
import re
import arepa
import gzip

c_fileMapping = sfle.d(arepa.path_repo(), sfle.c_strDirEtc, "mapping")
c_hashHead = {
    k: v
    for (k, v) in [[y.strip() for y in x.split("%")]
                   for x in sfle.readcomment(open(c_fileMapping))]
} if sfle.readcomment(open(c_fileMapping)) else {
    "^ID .*? platform": "Affy",
    "Entrez Gene Symbol": "HGNC",
    "Uniprot .*? Symbol": "Uniprot/TrEMBL",
    "^(Entrez)? UniGene Symbol": "UniGene",
    "Entrez Unigene Identifier": "UniGene_ID",
    "GenBank Accession": "GB_ACC",
    "Entrez Gene identifier": "Entrez Gene",
    "GenBank Identifier": "GenBank"
}

iArg = len(sys.argv)
strFileAnnotGZ = sys.argv[1] if iArg > 1 else None
strFileOut = sys.argv[2] if iArg > 2 else None
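c_hashHead maps header-name patterns from the GPL annotation table to identifier types; how the patterns are applied is outside this excerpt, but a plausible sketch (the helper name and the use of re.search are assumptions, not taken from the script) is:

import re  # already imported above

# Hypothetical helper: return the identifier type of the first pattern in
# hashHead that matches a column header from the !platform_table_begin block.
def classify_header(strHeader, hashHead):
    for strPattern, strType in hashHead.items():
        if re.search(strPattern, strHeader, re.IGNORECASE):
            return strType
    return None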
Example #10
if "testing" in locals():
    sys.exit()

pE = DefaultEnvironment()

c_strID = arepa.cwd()
c_strGDS, c_strGPL = c_strID.split("-")[:2]

c_fileInputSConscript = sfle.d(pE, arepa.path_arepa(), sfle.c_strDirSrc,
                               "SConscript_pcl-dab.py")
c_fileRSConscript = sfle.d(pE, arepa.path_arepa(), sfle.c_strDirSrc,
                           "SConscript_rpackage.py")
c_fileInputSOFTGZ = sfle.d(pE, "../" + c_strGDS + ".soft.gz")
c_fileInputManCurTXT = sfle.d(pE, arepa.path_repo(), sfle.c_strDirEtc,
                              "manual_curation/", c_strID + ".txt")
c_filePPfun = sfle.d(pE, arepa.path_repo(), sfle.c_strDirEtc, "preprocess")
c_strPPfun = sfle.readcomment(c_filePPfun)[0]

c_fileTaxa = sfle.d(pE, "taxa.txt")
c_fileStatus = sfle.d(pE, "status.txt")
c_filePlatform = sfle.d(pE, "platform.txt")
c_fileIDMap = sfle.d(pE, c_strID + ".map")
c_fileIDMapRaw = sfle.d(pE, c_strID + "_raw.map")
c_fileIDPKL = sfle.d(pE, c_strID + ".pkl")
c_fileGPLTXTGZ = sfle.d(pE, c_strGPL + ".annot.gz")
c_fileIDRawPCL = sfle.d(pE, c_strID + "_00raw.pcl")
c_fileLogPackage = sfle.d(pE, "package")
c_fileConfigPacakge = sfle.d(pE, arepa.path_repo(), sfle.c_strDirEtc,
                             "rpackage")
c_fileExpTable = sfle.d(pE, c_strID + "_exp_metadata.txt")

c_strDirR = "R"
Example #11
c_strType = "type"
c_strCount = "count"
c_strQuery = "query_key"
c_strWebEnv = "WebEnv"
c_iIncrement = 10000
c_iRetMax = 1000000
c_strSufXML = ".xml"
c_strURLSum = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gds&retstart=%s&retmax=1000000&query_key=%s&WebEnv=%s"

if len(sys.argv[1:]) != 2:
    raise Exception("Usage: xmlmerge.py <metadata> <id>")

inputf, geo_id = sys.argv[1:]
hashRet = {
    k: v
    for k, v in [s.split("\t") for s in sfle.readcomment(open(inputf))]
}
id_count, query_key, web_env = [
    hashRet.get(i) for i in [c_strCount, c_strQuery, c_strWebEnv]
]

#===========================================================================
# Iteratively download temporary xml files
#===========================================================================


def discrete_list(num, increment):
    # retstart offsets 1, 1+increment, 1+2*increment, ... covering `num` records
    iTries = (int(num) // increment) + 1
    return [str(1 + (a * increment)) for a in range(iTries)]
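
Given c_iIncrement above, discrete_list simply enumerates the retstart offsets for the paged esummary URL; a quick usage example (the record count of 25000 is made up):

# 25,000 records paged in steps of 10,000 -> offsets '1', '10001', '20001'
print(discrete_list(25000, 10000))  # ['1', '10001', '20001']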

Example #12
                              "preprocessRaw.R")
c_fileProgAnnot2Map = sfle.d(pE, arepa.path_repo(), sfle.c_strDirSrc,
                             "annot2map.py")
c_fileProgGPL2TXT = sfle.d(pE, arepa.path_repo(), sfle.c_strDirSrc,
                           "gpl2txt.py")
c_fileProgMergeMapping = sfle.d(pE, arepa.path_arepa(), sfle.c_strDirSrc,
                                "merge_genemapping.py")
c_fileProgGetInfo = sfle.d(pE, arepa.path_repo(), sfle.c_strDirSrc,
                           "getinfo.py")
c_fileProgEset2Help = sfle.d(pE, arepa.path_repo(), sfle.c_strDirSrc,
                             "eset2help.R")
c_fileProgArrayQualMetrics = sfle.d(pE, arepa.path_repo(), sfle.c_strDirSrc,
                                    "ArrayQualityMetrics.R")

m_strGPLID = None
m_strPPfun = (sfle.readcomment(c_filePPfun) or ["affy::rma"])[0]
m_boolRunRaw = sfle.readcomment(c_fileRunRaw) == ["True"] or False
m_boolRPackage = sfle.readcomment(c_fileConfigPacakge) == ["True"] or False

Import("hashArgs")

#===============================================================================
# Download series matrix file, Convert SERIES file with
# platform info to PKL and PCL
#===============================================================================

#Run gse.R
sfle.ssink(pE, str(c_fileInputGSER), "R --no-save --args",
           [[True, c_fileIDSeriesTXTGZ], [True, c_fileRPlatformTXT],
            [True, c_fileRMetadataTXT], [True, c_fileRDataTXT]])
Example #13
    return (iLevel == 3) and (strID.find("GSE") == 0)


if "testing" in locals():
    sys.exit()

pE = DefaultEnvironment()

c_strID = arepa.cwd().replace("-RAW", "")
c_strURLGEO = 'ftp.ncbi.nih.gov'
c_strURLGEOsupp = 'pub/geo/DATA/supplementary/samples/'
c_strURLSupp = 'ftp://' + c_strURLGEO + '/' + c_strURLGEOsupp
c_strFileGSM = "../GSM.txt"
c_strFilePCL = "../" + c_strID + ".pcl"

c_listTs = sfle.readcomment(c_strFileGSM)
c_fileProgReadCel = sfle.d(pE, arepa.path_repo(), sfle.c_strDirSrc,
                           "readCel.R")
c_fileProgProcessRaw = sfle.d(pE, arepa.path_repo(), sfle.c_strDirSrc,
                              "preprocessRaw.R")
c_strInputRData = arepa.cwd() + ".RData"
c_strOutputRData = c_strInputRData.replace("-RAW", "")

c_filePPfun = sfle.d(pE, arepa.path_repo(), sfle.c_strDirEtc, "preprocess")
c_strPPfun = (sfle.readcomment(c_filePPfun) or ["affy::rma"])[0]

c_fileExpTable = sfle.d(pE, "../" + c_strID + "_exp_metadata.txt")
c_fileCondTable = sfle.d(pE, "../" + c_strID + "_cond_metadata.txt")

Import("hashArgs")
Example #14
pSOFT.open(sys.stdin)

for pDS in list(pSOFT.get("DATASET").values()):
    pMetadata.pmid(pDS.get_attribute("dataset_pubmed_id"))
    pMetadata.title(pDS.get_attribute("dataset_title"))
    pMetadata.gloss(pDS.get_attribute("dataset_description"))
    pMetadata.type(
        re.sub(r' by .+$', "", (pDS.get_attribute("dataset_type")
                                or "").lower()))
    pMetadata.channels(pDS.get_attribute("dataset_channel_count"))
    pMetadata.conditions(pDS.get_attribute("dataset_sample_count"))
    pMetadata.platform(pDS.get_attribute("dataset_platform"))
    pMetadata.taxid(
        arepa.org2taxid(pDS.get_attribute("dataset_sample_organism")))

# Auxiliary Metadata
if strMetadata:
    astrHeaders = None
    for astrLine in csv.reader(open(strMetadata), csv.excel_tab):
        if astrHeaders:
            for i in range(len(astrLine)):
                pMetadata.setdefault(astrHeaders[i], []).append(astrLine[i])
        else:
            pMetadata[c_hashkeyCurated] = astrLine
            astrHeaders = astrLine

# Add Mapping Status and Save
k, v = sfle.readcomment(open(strStatus))[0].split("\t")
pMetadata.update({k: v})
pMetadata.save(sys.stdout)
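The curated-metadata loop above takes the first tab-delimited row as headers and appends each later row column by column; a self-contained illustration with made-up data:

import csv
import io

# Two-column example: the header row seeds the keys, later rows append per column
# (the original additionally stores the header row under c_hashkeyCurated).
strFake = "condition\ttissue\ncontrol\tliver\ntreated\tliver\n"
hashMeta, astrHeaders = {}, None
for astrLine in csv.reader(io.StringIO(strFake), csv.excel_tab):
    if astrHeaders:
        for i in range(len(astrLine)):
            hashMeta.setdefault(astrHeaders[i], []).append(astrLine[i])
    else:
        astrHeaders = astrLine
print(hashMeta)  # {'condition': ['control', 'treated'], 'tissue': ['liver', 'liver']}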