othrType = sys.argv[3] outFile = sys.argv[4] corrThresh = abs(float(sys.argv[5])) if (not othrType.startswith("N:")): othrType = "N:" + othrType ## corrThresh = 0.40 ## corrThresh = 0.20 print " " print " " print " ************* " print " in methCorr : " print " ***************************************************************** " print " calling readTSV ... ", methFile methD = tsvIO.readTSV(methFile) tsvIO.lookAtDataD(methD) print " calling readTSV ... ", othrFile othrD = tsvIO.readTSV(othrFile) tsvIO.lookAtDataD(othrD) try: methRowLabels = methD['rowLabels'] methColLabels = methD['colLabels'] methDataMatrix = methD['dataMatrix'] except: print " no valid METH feature matrix " sys.exit(-1) numMethRow = len(methRowLabels)
outFile = sys.argv[2] do_summaryMeth = 1 else: print " " print " Usage: %s <input TSV file> <output TSV file> " print " " print " ERROR -- bad command line arguments " sys.exit(-1) print " " print " Running : %s %s %s " % (sys.argv[0], sys.argv[1], sys.argv[2]) print " " # read in the input feature matrix first, just in case there # actually isn't one yet available ... testD = tsvIO.readTSV(inFile) try: print len(testD['rowLabels']), len(testD['colLabels']) except: print " --> invalid / missing input feature matrix " sys.exit(-1) # we want to "check" for "deleted" METH probes if (do_summaryMeth): newD = summaryMeth(testD) ## tsvIO.writeTSV_dataMatrix ( newD, 0, 0, outFile ) testD = newD # and finally write it out ... tsvIO.writeTSV_dataMatrix(testD, 0, 0, outFile)
infFilename = bioinformaticsReferencesDir + "/ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info" print " " print " Running : %s %s %s %s " % (sys.argv[0], sys.argv[1], sys.argv[2], sys.argv[3]) print " %s " % gafFilename print " %s " % gencodeFilename print " %s " % refGeneFilename print " %s " % cybFilename print " %s " % infFilename print " " print " " # read in the input feature matrix first, just in case there # actually isn't one yet available ... print " --> calling tsvIO.readTSV ... " testD = tsvIO.readTSV(inFile) try: print len(testD['rowLabels']), len(testD['colLabels']) if (len(testD['rowLabels']) == 0 or len(testD['colLabels']) == 0): print " EXITING ... no data " sys.exit(-1) except: print " --> invalid / missing input feature matrix " sys.exit(-1) # read in the gene_info file ... # this was turned off ... looking into turning it back on (1/7/13) # turning it back off (1/17/14) if (0): print " --> calling readGeneInfoFile ... " (geneInfoDict, synMapDict) = refData.readGeneInfoFile(infFilename)
fhA.close() fhB = file(inFileB, 'r') fhB.close() except: print " one or the other file does not exist ??? " print inFileA print inFileB sys.exit(-1) print " " print " reading input files ... : " print " file A : ", inFileA print " file B : ", inFileB print " " dataA = tsvIO.readTSV(inFileA) dataB = tsvIO.readTSV(inFileB) if (len(dataA) == 0): print " input file A does not exist ??? " print inFileA sys.exit(-1) if (len(dataB) == 0): print inFileB print " input file B does not exist ??? " sys.exit(-1) # first take a look at the feature (row) labels ... rowLabelsA = makeBetterLabels(dataA['rowLabels']) rowLabelsB = makeBetterLabels(dataB['rowLabels'])
if (1): if (len(sys.argv) != 2): print " Usage : %s <input tsv file> " % sys.argv[0] print " ERROR -- bad command line arguments " sys.exit(-1) tsvFile = sys.argv[1] print " " print " " print " ************** " print " in methCorr2 : " print " ***************************************************************** " print " calling readTSV ... ", tsvFile methD = tsvIO.readTSV(tsvFile) tsvIO.lookAtDataD(methD) try: methRowLabels = methD['rowLabels'] methColLabels = methD['colLabels'] methDataMatrix = methD['dataMatrix'] except: print " no valid METH feature matrix " sys.exit(-1) numMethRow = len(methRowLabels) numMethCol = len(methColLabels) dThresh = 10000 minCount = 30
ii += 3 if (len(listInfo) < 1): print " ERROR ??? no sample-list information provided ??? " sys.exit(-1) print " " print " in filterTSVbySampList.py ... " print " input file : ", inFile print " output file : ", outFile print " list info : ", listInfo print " " # print " " # print " ***************************************************************** " # print " calling readTSV ... ", inFile dataD = tsvIO.readTSV(inFile) if (dataD == {}): sys.exit(-1) tsvIO.lookAtDataD(dataD) # print " " # print " reading sample list ... " numLists = len(listInfo) listDetails = [0] * numLists listBW = [0] * numLists listLS = [0] * numLists for iList in range(numLists): print " --> loading sample list #%d from <%s> " % ((iList + 1), listInfo[iList][0]) listDetails[iList] = readSampleListFromFile(listInfo[iList][0]) listBW[iList] = listInfo[iList][1]
0] print " ", sys.argv print " ERROR -- bad command line arguments " sys.exit(-1) tsvName1 = sys.argv[1] tsvName2 = sys.argv[2] tsvName3 = sys.argv[3] # test out readTSV ... ## tsvName = "coad_read_clinical.27jan.tsv" print " " print " ****************************************************************** " print " IN add2clinTSV.py ... " print " reading input file <%s> " % tsvName1 allClinDict = tsvIO.readTSV(tsvName1) # take a look ... (naCounts, otherCounts) = miscClin.lookAtClinDict(allClinDict) bestKeyOrder = miscClin.getBestKeyOrder(allClinDict, naCounts) # now we want to read in a new tsv file ... print " " print " ****************************************************************** " print " reading input file <%s> " % tsvName2 tmpDict = tsvIO.readTSV(tsvName2) # check to make sure that we actually got something back ... if (len(tmpDict) == 0): print " WARNING ... no information found ... "
if (1): if (len(sys.argv) != 4): print " Usage : %s <input feature matrix> <output feature matrix> <featType> " % sys.argv[ 0] print " to avoid filtering on featType, use ANY " print " ERROR -- bad command line arguments " sys.exit(-1) inFile = sys.argv[1] outFile = sys.argv[2] featType = sys.argv[3] print " " print " " print " ******************** " print " in filterIdentFeat " print " ******************** " inD = tsvIO.readTSV(inFile) rowLabels = inD['rowLabels'] numRow = len(rowLabels) outD = removeIdenticalFeatures(inD, featType) tsvIO.writeTSV_dataMatrix(outD, 0, 0, outFile) print " FINISHED " print " " # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
# -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#

if __name__ == "__main__":

    # command line: <input TSV> <output TSV> -- anything else is a usage error
    if not (len(sys.argv) == 3):
        print("\n".join([
            " Usage : %s <input TSV> <output TSV> " % sys.argv[0],
            " ERROR -- bad command line arguments ",
        ]))
        sys.exit(-1)

    (tsvNameIn, tsvNameOut) = (sys.argv[1], sys.argv[2])

    print("\n".join([
        " ",
        " ****************************************************************** ",
        " reading input file <%s> " % tsvNameIn,
    ]))
    tmpD = tsvIO.readTSV(tsvNameIn)

    # other callers in this code base treat an empty result from readTSV as
    # "nothing was read", so bail out before trying to add features
    if (len(tmpD) < 1):
        print(" in addIndicators ... no input data ... nothing to do here ... ")
        sys.exit(-1)

    # automatically generate indicator features for categorical features,
    # then write the augmented matrix to the requested output file
    tmpD = addIndicatorFeatures(tmpD)
    tsvIO.writeTSV_dataMatrix(tmpD, 0, 0, tsvNameOut)

    print("\n".join([
        " ",
        " ",
        " FINISHED ",
        " ",
        " ",
    ]))
print " Usage : %s <old input TSV> <new input TSV> <output merged TSV> " % sys.argv[0] print " ", sys.argv print " ERROR -- bad command line arguments " sys.exit(-1) tsvName1 = sys.argv[1] tsvName2 = sys.argv[2] tsvName3 = sys.argv[3] # test out readTSV ... ## tsvName = "coad_read_clinical.27jan.tsv" print " " print " ****************************************************************** " print " IN add2clinTSV.py ... " print " reading input file <%s> " % tsvName1 allClinDict = tsvIO.readTSV(tsvName1) # take a look ... (naCounts, otherCounts) = miscClin.lookAtClinDict(allClinDict) bestKeyOrder = miscClin.getBestKeyOrder(allClinDict, naCounts) # now we want to read in a new tsv file ... print " " print " ****************************************************************** " print " reading input file <%s> " % tsvName2 tmpDict = tsvIO.readTSV(tsvName2) # check to make sure that we actually got something back ... if (len(tmpDict) == 0): print " WARNING ... no information found ... "
if __name__ == "__main__": if (1): if (len(sys.argv) != 2): print " Usage : %s <input tsv file> " % sys.argv[0] sys.exit(-1) tsvFile = sys.argv[1] print " " print " " print " ************** " print " in methCorr2 : " print " ***************************************************************** " print " calling readTSV ... ", tsvFile methD = tsvIO.readTSV(tsvFile) tsvIO.lookAtDataD(methD) try: methRowLabels = methD['rowLabels'] methColLabels = methD['colLabels'] methDataMatrix = methD['dataMatrix'] except: print " no valid METH feature matrix " sys.exit(-1) numMethRow = len(methRowLabels) numMethCol = len(methColLabels) dThresh = 10000 minCount = 30
fhA.close() fhB = file(inFileB, 'r') fhB.close() except: print " one or the other file does not exist ??? " print inFileA print inFileB sys.exit(-1) print " " print " reading input files ... : " print " file A : ", inFileA print " file B : ", inFileB print " " dataA = tsvIO.readTSV(inFileA) dataB = tsvIO.readTSV(inFileB) if (len(dataA) == 0): print " input file A does not exist ??? " print inFileA sys.exit(-1) if (len(dataB) == 0): print inFileB print " input file B does not exist ??? " sys.exit(-1) # first take a look at the feature (row) labels ... rowLabelsA = makeBetterLabels ( dataA['rowLabels'] ) rowLabelsB = makeBetterLabels ( dataB['rowLabels'] )
sys.exit(-1) tsvFile = sys.argv[1] outFile = sys.argv[2] dThresh = int ( sys.argv[3] ) minCount = int ( sys.argv[4] ) corrThresh = float ( sys.argv[5] ) print " " print " " print " ************** " print " in methCorr3 : ", dThresh, minCount, corrThresh print " ***************************************************************** " print ' (a) TIME ', time.asctime(time.localtime(time.time())) print " calling readTSV ... ", tsvFile tsvD = tsvIO.readTSV(tsvFile) tsvIO.lookAtDataD(tsvD) try: tsvRowLabels = tsvD['rowLabels'] tsvColLabels = tsvD['colLabels'] tsvDataMatrix = tsvD['dataMatrix'] except: print " no valid METH feature matrix " sys.exit(-1) numRow = len(tsvRowLabels) numCol = len(tsvColLabels) keepMeth = {} keepGexp = {}
# handleOneFeature -- drive the pairwise and pathway-ranking child scripts for
# the single feature whose row label starts with featString.
#
# NOTE(review): the line breaks / indentation of this function have been lost
# in this copy of the file, so the nesting described below is inferred from
# the statement order -- confirm against a properly formatted original.
#
# Parameters:
#   featString       : prefix matched (str.startswith) against every row label
#                      read from the TSV; a "B:" / "N:" prefix sets the
#                      isBinaryFeat / isNumericFeat flags, which are not read
#                      again anywhere in this function
#   typeString       : only echoed in the banner message
#   tsvFilename      : feature matrix passed to tsvIO.readTSV and to both child
#                      scripts; its mtime (tTSV) is handed to prAlreadyDone
#   pathwaysFilename : forwarded to runPR.py as --pathways
#   numRandFactor    : forwarded to runPR.py as --nRand (default 200)
#   dirName          : optional output sub-directory; a trailing '/' is added
#                      if missing and the directory is created with a shell
#                      'mkdir' under the root directory returned by splitPath
#
# Behaviour visible in the code:
#   * if zero rows, or more than one row, match featString a message is
#     printed and an empty tuple is returned (the sys.exit calls are
#     commented out)
#   * sys.exit(-1) is called if the output directory still does not exist
#     after the 'mkdir' command
#   * the output base name is rootDirName + dirName + justFileName[:-3]
#     (presumably dropping a "tsv" extension -- TODO confirm)
#   * for the matching row, the row is skipped ("already done") when
#     prAlreadyDone(...) holds for the .pxP, .pxN and .pxA outputs in turn;
#     otherwise run-pairwise-v2.py is run via commands.getstatusoutput
#   * runPR.py is then run once per (sign, tag) pair -- ('+','pxP'),
#     ('-','pxN'), ('x','pxA') -- or only ('x','pxA') when checkMultiCat(...)
#     is true; output that is not already done is first removed with 'rm -fr'
#     and the new output is produced by redirecting runPR.py's stdout
#   * time.sleep(10) follows the runPR.py invocation
#   * the 'if ( 1 ): if ( 0 ): ... else: os.system(...)' construct leaves only
#     the commands.getstatusoutput branch reachable
#
# NOTE(review): file names and the feature label are interpolated into shell
# command strings without escaping; paths containing spaces or shell
# metacharacters will not behave as intended.
def handleOneFeature ( featString, typeString, tsvFilename, \ pathwaysFilename, numRandFactor=200, dirName='' ): print " " print " ***************************************************************** " print " " print " in handleOneFeature <%s> <%s> <%s> <%s> " % \ ( featString, typeString, tsvFilename, dirName ) print " " isBinaryFeat = 0 isNumericFeat = 0 if ( featString.startswith("B:") ): isBinaryFeat = 1 elif ( featString.startswith("N:") ): isNumericFeat = 1 ## get the time-stamp for the TSV file tTSV = os.path.getmtime ( tsvFilename ) ## first we need to read in the feature matrix ... dataD = tsvIO.readTSV ( tsvFilename ) rowLabels = dataD['rowLabels'] dataMatrix = dataD['dataMatrix'] foundRows = [] exactMatch = [] for iRow in range(len(rowLabels)): if ( rowLabels[iRow].startswith(featString) ): exactMatch += [ iRow ] foundRows = exactMatch if ( len(foundRows) < 1 ): print " ERROR ... no features found ", featString, len(rowLabels) print tsvFilename print " --> SKIPPING ... " return() ## sys.exit(-1) elif ( len(foundRows) > 1 ): print " ERROR ... more than one feature found ", featString, len(rowLabels) print tsvFilename print foundRows print " --> SKIPPING ... " return () ## sys.exit(-1) print " " print " " ## figure out the root directory name and then the base output file name ( rootDirName, justFileName ) = splitPath ( tsvFilename ) print " rootDirName = <%s> " % rootDirName print " justFileName = <%s> " % justFileName if ( dirName != '' ): if ( dirName[-1] != '/' ): dirName += '/' ## test whether output directory already exists ... if ( not os.path.exists(rootDirName+dirName) ): ## create output directory cmdString = 'mkdir %s%s' % ( rootDirName, dirName ) ## print cmdString ( status, output ) = commands.getstatusoutput ( cmdString ) if ( not os.path.exists(rootDirName+dirName) ): print " ERROR ??? failed to create output directory ??? 
" print cmdString print status sys.exit(-1) else: print " output directory created %s " % (rootDirName+dirName) else: print " output directory already exists %s " % (rootDirName+dirName) prBaseName = rootDirName + dirName + justFileName[:-3] print " prBaseName = <%s> " % prBaseName for iRow in foundRows: print " " print " " curLabel = rowLabels[iRow] print iRow, curLabel multiCatFlag = checkMultiCat ( curLabel, dataMatrix[iRow] ) ## print dataMatrix[iRow] ## maybe we actually don't have to redo anything ... prOutFile = prBaseName + "%d.pxP" % iRow if ( prAlreadyDone ( prOutFile, tTSV ) ): prOutFile = prBaseName + "%d.pxN" % iRow if ( prAlreadyDone ( prOutFile, tTSV ) ): prOutFile = prBaseName + "%d.pxA" % iRow if ( prAlreadyDone ( prOutFile, tTSV ) ): print " --> already done !!! <%s> " % curLabel continue print " " print " " print " ************************************************************* " print " TIME : ", time.asctime(time.localtime(time.time())) print " " cmdString = 'python %s/main/run-pairwise-v2.py ' % ( gidgetConfigVars['TCGAFMP_ROOT_DIR'] ) cmdString += '--pvalue 2. --one "%s" --tsvFile %s' % ( curLabel, tsvFilename ) print cmdString ( status, output ) = commands.getstatusoutput ( cmdString ) print " status : ", status print " output : ", output print " " print " " print " ************************************************************* " print " " pwpvFile = tsvFilename[:-3] + "%d.all.pwpv.sort" % iRow ## ok, now we are going to loop over 3 different scoring options ... signList = [ '+', '-', 'x' ] tagList = [ 'pxP', 'pxN', 'pxA' ] ## if this feature is a categorical feature with more than 2 categories, ## then we cannot test the sign of the correlation ... if ( multiCatFlag ): print " WARNING: will only create the pxA output file for this feature " signList = [ 'x' ] tagList = [ 'pxA' ] for iTest in range(len(signList)): ## first do the pathway-ranking looking for positive correlations ... 
prOutFile = prBaseName + "%d.%s" % ( iRow, tagList[iTest] ) if ( not prAlreadyDone ( prOutFile, tTSV ) ): cmdString = 'rm -fr %s' % prOutFile print cmdString ( status, output ) = commands.getstatusoutput ( cmdString ) print " " print " " cmdString = 'python %s/main/runPR.py ' % ( gidgetConfigVars['TCGAFMP_ROOT_DIR'] ) cmdString += ' --tsvFile %s ' % tsvFilename cmdString += ' --pwpvFile %s ' % pwpvFile cmdString += ' --pathways %s ' % pathwaysFilename cmdString += ' --featName "%s" ' % curLabel cmdString += ' --sign "%s" ' % signList[iTest] cmdString += ' --nRand %d ' % numRandFactor cmdString += ' > %s ' % prOutFile print cmdString print " " if ( 1 ): if ( 0 ): print " just pretending ... " else: ( status, output ) = commands.getstatusoutput ( cmdString ) print " status : ", status print " output : ", output else: os.system ( cmdString ) time.sleep ( 10 ) print " " print " " print " ************************************************************* " print " "
if __name__ == "__main__": if (1): if (len(sys.argv) != 4): print " Usage : %s <input feature matrix> <output feature matrix> <featType> " % sys.argv[0] print " to avoid filtering on featType, use ANY " sys.exit(-1) inFile = sys.argv[1] outFile = sys.argv[2] featType = sys.argv[3] print " " print " " print " ******************** " print " in filterIdentFeat " print " ******************** " inD = tsvIO.readTSV(inFile) rowLabels = inD['rowLabels'] numRow = len(rowLabels) outD = removeIdenticalFeatures(inD, featType) tsvIO.writeTSV_dataMatrix(outD, 0, 0, outFile) print " FINISHED " print " " # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
dTypeList = [] fTypeList = [] # if trying to prune LOTS of rows and columns, then set the *MaxNAfrac # values to small values ... if trying not to prune ANYTHING, then set # the values near 1 ... ## colMaxNAfrac = 0.90 ## rowMaxNAfrac = 0.90 for aFile in inFileList: print " " print " ***************************************************************** " print " calling readTSV ... ", aFile testD = tsvIO.readTSV(aFile) if (len(testD) == 0): print " --> nothing found ??? continuing ... " continue tsvIO.lookAtDataD(testD) if (1): # TCGA-CJ-4635-01A-02R # the first 12 characters identify the patient # the first 15 characters identify the sample # now we check for duplicates at the sample level ... print " "
# for the input barcode lengths ... getBarcodeLength(inFileList) # if trying to prune LOTS of rows and columns, then set the *MaxNAfrac # values to small values ... if trying not to prune ANYTHING, then set # the values near 1 ... ## colMaxNAfrac = 0.90 ## rowMaxNAfrac = 0.90 for aFile in inFileList: print " " print " ***************************************************************** " print " calling readTSV ... ", aFile testD = tsvIO.readTSV(aFile) ## check to see if we actually have any data ... skipFile = 0 try: if (len(testD) == 0): skipFile = 1 if (len(testD['rowLabels']) == 0): skipFile = 1 if (len(testD['colLabels']) == 0): skipFile = 1 except: print " ERROR in looking at data from <%s> ??? " % (aFile) skipFile = 1 if (skipFile): print " --> nothing found ??? continuing ... " continue
# -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# if __name__ == "__main__": if (len(sys.argv) != 3): print " Usage : %s <input TSV> <output TSV> " % sys.argv[0] sys.exit(-1) tsvNameIn = sys.argv[1] tsvNameOut = sys.argv[2] print " " print " ****************************************************************** " print " reading input file <%s> " % tsvNameIn tmpD = tsvIO.readTSV(tsvNameIn) if (len(tmpD) == 0): print " in addIndicators ... no input data ... nothing to do here ... " sys.exit(-1) # automatically generate indicator features for categorical features tmpD = addIndicatorFeatures(tmpD) tsvIO.writeTSV_dataMatrix(tmpD, 0, 0, tsvNameOut) print " " print " " print " FINISHED " print " " print " "
# -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# if __name__ == "__main__": if (1): if (len(sys.argv) == 3): inFile = sys.argv[1] outFile = sys.argv[2] else: print " " print " Usage: %s <input TSV file> <output TSV file> " print " " print " ERROR -- bad command line arguments " sys.exit(-1) print " " print " Running : %s %s %s " % (sys.argv[0], sys.argv[1], sys.argv[2]) print " " print " " # now read in the input feature matrix ... dataD = tsvIO.readTSV(inFile) # add new custom features ... dataD = addCustomFeatures(dataD) # and write the matrix back out tsvIO.writeTSV_dataMatrix(dataD, 0, 0, outFile) # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
print " " print " ERROR -- bad command line arguments " sys.exit(-1) print " " print " Running : %s %s %s %s " % (sys.argv[0], sys.argv[1], sys.argv[2], sys.argv[3]) print " " print " " listDict = {} # read in the current clinical file ... topDir = "%s/%s/%s" % (gidgetConfigVars['TCGAFMP_DATA_DIR'], tumorString, dateString) clin1name = topDir + "/" + "%s.clinical.%s.tsv" % ( tumorString, dateString ) print clin1name allClinDict = tsvIO.readTSV ( clin1name ) # find out which features are interesting ... # BUT IS THIS REALLY COMPLETELY NOT NECESSARY ??? # was this just for debugging purposes ??? fList = getFeatList ( featureList ) for aF in fList: print aF for aKey in allClinDict.keys(): if ( aKey[1] == ":" ): aTokens = aKey.split(':') tKey = aTokens[2]