def funcIDNormPCL(target, source, env, iMaxLines=100000):
    """Normalize the mapped PCL, or stub it out if it is too large."""
    strT, astrSs = sfle.ts(target, source)
    # Prefer the deduplicated PCL when it has content beyond the header rows.
    strS = astrSs[1] if (len(sfle.readcomment(astrSs[1])) > c_iSkip) else astrSs[0]
    iLC = sfle.lc(strS)
    return (sfle.ex("Normalizer -t pcl -T medmult < " + strS, strT)
            if (iLC < iMaxLines) else
            sfle.ex("head -n 3 < " + strS, strT))
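# Sketch of the two shell commands funcIDNormPCL dispatches (file names are
# illustrative; Sleipnir's Normalizer is assumed to be on the PATH):
#   Normalizer -t pcl -T medmult < GDS1234_01unique.pcl > GDS1234_02norm.pcl
#   head -n 3 < GDS1234_01unique.pcl > GDS1234_02norm.pcl   # oversized input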
def get_mappingfile(strTaxID, fApprox=True, strDir=c_strDirMapping):
    """Return the path to the best gene-mapping file for a taxon ID, or None."""
    if not strTaxID:
        return None
    astrMapOut = []
    # First consult the manual-mapping file: tab-delimited organism/file pairs.
    if not sfle.isempty(c_strFileManualMapping):
        pHash = {k: v for k, v in
                 [a.split("\t") for a in sfle.readcomment(open(c_strFileManualMapping))]}
        strOrg = " ".join(taxid2org(strTaxID).split(" ")[:2])
        astrMapOutTmp = list(filter(bool, [pHash.get(strOrg)]))
        astrMapOut = [sfle.d(c_strDirMapping, x) for x in astrMapOutTmp]
    if not astrMapOut:
        # Give an un-prioritized list of candidate taxon IDs and glob for each.
        astrIDs = [strTaxID] if not fApprox else org2taxid(taxid2org(strTaxID), True)
        for strID in astrIDs:
            astrGlob = glob.glob(sfle.d(strDir, strID + "_*"))
            if astrGlob:
                astrMapOut = astrGlob
                break
    return astrMapOut[0] if astrMapOut else None
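# Usage sketch (hypothetical taxon and file names): for human, taxon "9606",
# a manual-mapping entry "Homo sapiens<TAB>9606_human.map" would resolve to
# <mapping dir>/9606_human.map; otherwise the glob "9606_*" is tried.
#   strMap = get_mappingfile("9606")   # -> a path, or None if nothing matches
#   strMap = get_mappingfile(None)     # -> None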
def funcPCL2DAB(pE, fileIDRawPCL, fileGPLTXTGZ, fileProgAnnot2Map, fileProgGPL2TXT,
                fileProgMergeMapping, fileTaxa, filePlatform):
    astrSleipnir = sfle.readcomment(c_fileFlagSleipnir)
    bSleipnir = astrSleipnir[0] == "True" if astrSleipnir else False
    print("sleipnir", ("On" if bSleipnir else "Off"))

    # Produce raw mapping file for gene mapping
    astrMapRaw = pE.Command(c_fileIDMapRaw,
                            [fileGPLTXTGZ, filePlatform, fileProgAnnot2Map,
                             fileProgGPL2TXT],
                            funcRawMap)
    # Produce merged mapping file
    astrMap = pE.Command(c_fileIDMap,
                         [fileTaxa, fileProgMergeMapping, astrMapRaw[0]],
                         funcMergeMap)
    # Perform gene mapping
    astrMapped = funcGeneIDMapping(pE, fileIDRawPCL, arepa.genemap_probeids(),
                                   c_fileStatus, astrMap[0], c_aiCOL, c_iSkip)
    # Get rid of duplicate identifiers
    astrUnique = funcMakeUnique(pE, astrMapped[0], c_iSkip, c_iCOL)

    if bSleipnir:
        pE.Command(c_fileIDNormPCL, [c_fileIDRawPCL, astrUnique[0]], funcIDNormPCL)
        pE.Command(c_fileIDPCL, c_fileIDNormPCL, funcIDKNNPCL)
        pE.Command(c_fileIDDAB, c_fileIDPCL, funcIDDAB)
        pE.Command(c_fileIDQUANT, c_fileIDPCL, funcIDQUANT)
    else:
        sfle.sop(pE, "cp", [[astrUnique[0]], [True, c_fileIDPCL]])
def funcMergeMap(target, source, env):
    strT, astrSs = sfle.ts(target, source)
    fileTaxa, fileMerge, fileIDRaw = astrSs[:3]
    astrTaxa = sfle.readcomment(fileTaxa)
    strMap = arepa.get_mappingfile(astrTaxa[0]) if astrTaxa else ""
    # Merge the raw map with the organism's mapping file when one is available;
    # otherwise pass the raw map through unchanged.
    return (sfle.ex([fileMerge, fileIDRaw, strMap, strT]) if strMap
            else sfle.ex(["cp", fileIDRaw, strT]))
def funcRawMap(target, source, env):
    strT, astrSs = sfle.ts(target, source)
    strGPLTXTGZ, strPlatformTXT, strProgAnnot2Map, strProgGPL2TXT = astrSs[:4]
    strGPLID = (sfle.readcomment(open(strPlatformTXT)) or [""])[0]
    # Build the raw map from the downloaded annotation when present; otherwise
    # fall back to converting the GPL record directly.
    return (sfle.ex([strProgAnnot2Map, strGPLTXTGZ, strT])
            if not sfle.isempty(str(strGPLTXTGZ)) else
            sfle.ex([strProgGPL2TXT, c_strGPLPath + strGPLID, strT]))
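# These builders follow the SCons action convention: pE.Command(target,
# sources, func) later invokes func(target, source, env), and sfle.ts() is
# assumed to flatten the node lists into plain path strings. For example:
#   pE.Command(c_fileIDMapRaw,
#              [c_fileGPLTXTGZ, c_filePlatform, c_fileProgAnnot2Map,
#               c_fileProgGPL2TXT],
#              funcRawMap)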
def funcDAB(pE, fileOutDAB, afileInDAT):
    astrSleipnir = sfle.readcomment(c_fileFlagSleipnir)
    bSleipnir = astrSleipnir[0] == "True" if astrSleipnir else False
    print("sleipnir", ("On" if bSleipnir else "Off"))

    def _funcDAB(target, source, env):
        strT, astrSs = sfle.ts(target, source)
        strOut, strMap = astrSs[:2]
        # Convert whichever input is populated, preferring the mapped file.
        return sfle.ex(("Dat2Dab", "-o", strT,
                        "-i", (strOut if sfle.isempty(strMap) else strMap)))

    if bSleipnir:
        return pE.Command(fileOutDAB, afileInDAT, _funcDAB)
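# Usage sketch (illustrative file names): the Dat2Dab conversion is attached
# to the build graph only when the Sleipnir flag file reads "True".
#   funcDAB(pE, "GDS1234.dab", ["GDS1234.pcl", "GDS1234.map"])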
def getGPL(target, source, env):
    astrTs, astrSs = ([f.get_abspath() for f in a] for a in (target, source))
    strAnnot, strPlatform = astrTs[:2]
    strRMeta = astrSs[0]
    pid = [row for row in csv.DictReader(open(strRMeta))][0]["platform_id"]
    strGPLID = c_strID.split("-")[1] if len(c_strID.split("-")) == 2 else pid
    listGPL = [v.replace(".annot.gz", "") for v in sfle.readcomment(c_fileAnnot)]
    if strGPLID in listGPL:
        # Annotation file exists; download it.
        sfle.ex(["wget", sfle.d(c_strURLGPL, strGPLID + ".annot.gz"),
                 "-O", strAnnot])
    else:
        # Annotation file does not exist; skip the download.
        sfle.ex(["touch", strAnnot])
    # Make the platform file containing the GPL identifier.
    with open(strPlatform, "w") as outputf:
        outputf.write(strGPLID)
import itertools
import pickle
import re
import time

import arepa
import sfle

g_iterCounter = itertools.count(0)

c_strID = arepa.cwd()
c_strPathRepo = arepa.name_repo()
c_strSufMap = ".map"
c_strMapped = "_mapped"
c_strDirData = sfle.d(arepa.path_repo(), sfle.c_strDirData)
c_strDirManMap = sfle.d(arepa.path_repo(), sfle.c_strDirEtc, "manual_mapping")
# Target gene-ID namespaces: the etc/geneid config, or the default name map.
c_astrGeneTo = sfle.readcomment(
    sfle.d(arepa.path_arepa(), sfle.c_strDirEtc, "geneid")) or \
    [arepa.genemap_genename()]
c_strPathGeneMapper = sfle.d(arepa.path_arepa(), "GeneMapper")
c_strFileUnzipLog = sfle.d(c_strPathGeneMapper, sfle.c_strDirTmp, "unzip.log")
c_strFileCompileLog = sfle.d(c_strPathGeneMapper, sfle.c_strDirTmp, "compile.log")
c_strPathTopMapping = sfle.d(c_strPathGeneMapper, sfle.c_strDirEtc,
                             "manual_mapping")
c_strPathUniprotKO = sfle.d(c_strPathGeneMapper, sfle.c_strDirEtc, "uniprotko")
c_fileProgMakeUnique = sfle.d(arepa.path_arepa(), sfle.c_strDirSrc,
                              "makeunique.py")
c_funcGeneMapper = sfle.d(c_strPathGeneMapper, sfle.c_strDirSrc,
                          "bridgemapper.py")
c_strManualGeneIDs = sfle.d(arepa.path_repo(), sfle.c_strDirEtc,
                            "manual_geneid")
parse mapping files
begins with !platform_table_begin
ends with !platform_table_end
"""

import csv
import glob
import gzip
import re
import sys

import arepa
import sfle

c_fileMapping = sfle.d(arepa.path_repo(), sfle.c_strDirEtc, "mapping")
# Header-regex -> identifier-type map, read from the repo's "mapping" config
# (one "regex % type" pair per line) or falling back to the defaults below.
astrMapLines = sfle.readcomment(open(c_fileMapping))
c_hashHead = (
    {k: v for (k, v) in
     [[y.strip() for y in x.split("%")] for x in astrMapLines]}
    if astrMapLines else {
        "^ID .*? platform": "Affy",
        "Entrez Gene Symbol": "HGNC",
        "Uniprot .*? Symbol": "Uniprot/TrEMBL",
        "^(Entrez)? UniGene Symbol": "UniGene",
        "Entrez Unigene Identifier": "UniGene_ID",
        "GenBank Accession": "GB_ACC",
        "Entrez Gene identifier": "Entrez Gene",
        "GenBank Identifier": "GenBank",
    })

iArg = len(sys.argv)
strFileAnnotGZ = sys.argv[1] if iArg > 1 else None
strFileOut = sys.argv[2] if iArg > 2 else None
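# Worked example: matching a (hypothetical) annotation column header against
# the regex keys above picks out its identifier type.
#   [v for k, v in c_hashHead.items() if re.search(k, "GenBank Accession")]
#   # -> ["GB_ACC"]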
sys.exit()

pE = DefaultEnvironment()

c_strID = arepa.cwd()
c_strGDS, c_strGPL = c_strID.split("-")[:2]
c_fileInputSConscript = sfle.d(pE, arepa.path_arepa(), sfle.c_strDirSrc,
                               "SConscript_pcl-dab.py")
c_fileRSConscript = sfle.d(pE, arepa.path_arepa(), sfle.c_strDirSrc,
                           "SConscript_rpackage.py")
c_fileInputSOFTGZ = sfle.d(pE, "../" + c_strGDS + ".soft.gz")
c_fileInputManCurTXT = sfle.d(pE, arepa.path_repo(), sfle.c_strDirEtc,
                              "manual_curation/", c_strID + ".txt")
c_filePPfun = sfle.d(pE, arepa.path_repo(), sfle.c_strDirEtc, "preprocess")
c_strPPfun = sfle.readcomment(c_filePPfun)[0]
c_fileTaxa = sfle.d(pE, "taxa.txt")
c_fileStatus = sfle.d(pE, "status.txt")
c_filePlatform = sfle.d(pE, "platform.txt")
c_fileIDMap = sfle.d(pE, c_strID + ".map")
c_fileIDMapRaw = sfle.d(pE, c_strID + "_raw.map")
c_fileIDPKL = sfle.d(pE, c_strID + ".pkl")
c_fileGPLTXTGZ = sfle.d(pE, c_strGPL + ".annot.gz")
c_fileIDRawPCL = sfle.d(pE, c_strID + "_00raw.pcl")
c_fileLogPackage = sfle.d(pE, "package")
c_fileConfigPackage = sfle.d(pE, arepa.path_repo(), sfle.c_strDirEtc,
                             "rpackage")
c_fileExpTable = sfle.d(pE, c_strID + "_exp_metadata.txt")
c_strDirR = "R"
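# Example (hypothetical IDs): a dataset directory named "GDS1234-GPL96" gives
# c_strGDS == "GDS1234" and c_strGPL == "GPL96", so the inputs resolve to
# ../GDS1234.soft.gz and GPL96.annot.gz, and the outputs to GDS1234-GPL96.map,
# GDS1234-GPL96.pkl, GDS1234-GPL96_00raw.pcl, and so on.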
c_strType = "type"
c_strCount = "count"
c_strQuery = "query_key"
c_strWebEnv = "WebEnv"
c_iIncrement = 10000
c_iRetMax = 1000000
c_strSufXML = ".xml"
c_strURLSum = ("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
               "?db=gds&retstart=%s&retmax=1000000&query_key=%s&WebEnv=%s")

if len(sys.argv[1:]) != 2:
    raise Exception("Usage: xmlmerge.py <metadata> <id>")
inputf, geo_id = sys.argv[1:]

hashRet = {k: v for k, v in
           [s.split("\t") for s in sfle.readcomment(open(inputf))]}
id_count, query_key, web_env = [hashRet.get(i) for i in
                                [c_strCount, c_strQuery, c_strWebEnv]]

#===========================================================================
# Iteratively download temporary xml files
#===========================================================================
def discrete_list(num, increment):
    # Return the 1-based retstart offsets covering num records, increment at
    # a time; num arrives as a string read from the metadata file.
    iTries = (int(num) // increment) + 1
    return [str(1 + (a * increment)) for a in range(iTries)]
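# Worked example (hypothetical count): 25,000 GDS records paged by the
# 10,000-record increment need three esummary requests.
#   discrete_list("25000", c_iIncrement)  # -> ["1", "10001", "20001"]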
"preprocessRaw.R") c_fileProgAnnot2Map = sfle.d(pE, arepa.path_repo(), sfle.c_strDirSrc, "annot2map.py") c_fileProgGPL2TXT = sfle.d(pE, arepa.path_repo(), sfle.c_strDirSrc, "gpl2txt.py") c_fileProgMergeMapping = sfle.d(pE, arepa.path_arepa(), sfle.c_strDirSrc, "merge_genemapping.py") c_fileProgGetInfo = sfle.d(pE, arepa.path_repo(), sfle.c_strDirSrc, "getinfo.py") c_fileProgEset2Help = sfle.d(pE, arepa.path_repo(), sfle.c_strDirSrc, "eset2help.R") c_fileProgArrayQualMetrics = sfle.d(pE, arepa.path_repo(), sfle.c_strDirSrc, "ArrayQualityMetrics.R") m_strGPLID = None m_strPPfun = (sfle.readcomment(c_filePPfun) or ["affy::rma"])[0] m_boolRunRaw = sfle.readcomment(c_fileRunRaw) == ["True"] or False m_boolRPackage = sfle.readcomment(c_fileConfigPacakge) == ["True"] or False Import("hashArgs") #=============================================================================== # Download series matrix file, Convert SERIES file with # platform info to PKL and PCL #=============================================================================== #Run gse.R sfle.ssink(pE, str(c_fileInputGSER), "R --no-save --args", [[True, c_fileIDSeriesTXTGZ], [True, c_fileRPlatformTXT], [True, c_fileRMetadataTXT], [True, c_fileRDataTXT]])
    return (iLevel == 3) and (strID.find("GSE") == 0)

if "testing" in locals():
    sys.exit()

pE = DefaultEnvironment()

c_strID = arepa.cwd().replace("-RAW", "")
c_strURLGEO = "ftp.ncbi.nih.gov"
c_strURLGEOsupp = "pub/geo/DATA/supplementary/samples/"
c_strURLSupp = "ftp://" + c_strURLGEO + "/" + c_strURLGEOsupp
c_strFileGSM = "../GSM.txt"
c_strFilePCL = "../" + c_strID + ".pcl"
c_listTs = sfle.readcomment(c_strFileGSM)
c_fileProgReadCel = sfle.d(pE, arepa.path_repo(), sfle.c_strDirSrc, "readCel.R")
c_fileProgProcessRaw = sfle.d(pE, arepa.path_repo(), sfle.c_strDirSrc,
                              "preprocessRaw.R")
c_strInputRData = arepa.cwd() + ".RData"
c_strOutputRData = c_strInputRData.replace("-RAW", "")
c_filePPfun = sfle.d(pE, arepa.path_repo(), sfle.c_strDirEtc, "preprocess")
c_strPPfun = (sfle.readcomment(c_filePPfun) or ["affy::rma"])[0]
c_fileExpTable = sfle.d(pE, "../" + c_strID + "_exp_metadata.txt")
c_fileCondTable = sfle.d(pE, "../" + c_strID + "_cond_metadata.txt")

Import("hashArgs")
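# Example etc/preprocess contents: a single optional line naming a namespaced
# R preprocessing call, e.g.
#   affy::rma
# or (hypothetically) affy::mas5; an empty or missing file falls back to the
# "affy::rma" default above.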
pSOFT.open(sys.stdin)
for pDS in list(pSOFT.get("DATASET").values()):
    pMetadata.pmid(pDS.get_attribute("dataset_pubmed_id"))
    pMetadata.title(pDS.get_attribute("dataset_title"))
    pMetadata.gloss(pDS.get_attribute("dataset_description"))
    pMetadata.type(re.sub(r' by .+$', "",
                          (pDS.get_attribute("dataset_type") or "").lower()))
    pMetadata.channels(pDS.get_attribute("dataset_channel_count"))
    pMetadata.conditions(pDS.get_attribute("dataset_sample_count"))
    pMetadata.platform(pDS.get_attribute("dataset_platform"))
    pMetadata.taxid(
        arepa.org2taxid(pDS.get_attribute("dataset_sample_organism")))

# Auxiliary metadata: the first row of the curated table is the header; each
# subsequent row is appended column by column under its header key.
if strMetadata:
    astrHeaders = None
    for astrLine in csv.reader(open(strMetadata), csv.excel_tab):
        if astrHeaders:
            for i in range(len(astrLine)):
                pMetadata.setdefault(astrHeaders[i], []).append(astrLine[i])
        else:
            pMetadata[c_hashkeyCurated] = astrLine
            astrHeaders = astrLine

# Add mapping status and save.
k, v = sfle.readcomment(open(strStatus))[0].split("\t")
pMetadata.update({k: v})
pMetadata.save(sys.stdout)