def aligning(self):
    """Align reads with HISAT2, then convert the SAM to BAM and sort it.

    Loads the experiment configuration named by ``self.queryStr`` and, for
    every entry of its "conditionsList" and every group/replication pair,
    issues up to three commands through ``libPrint.timer``:

    1. the "binHISAT2-RUN" command ("command-PE" or "command-SE", chosen by
       the configured "mode"), producing the ``.sam`` file;
    2. the "binSAMtools-CONVERT" command, producing the ``.bam`` file;
    3. the "binSAMtools-SORT" command, producing the ``-sorted.bam`` file.

    A step is skipped when its own output, or the output of a later step,
    already exists.  The "commandRM" command is issued for the SAM file once
    the BAM exists and for the BAM file once the sorted BAM exists.

    Side effects: sets ``self.testingBool`` from the "testing" entry of the
    experiment configuration, creates the formatted "[hisat2]outputFolder"
    and writes one log per condition under ``log/``.

    Raises:
        KeyError: if a required configuration entry is missing.
    """

    def loadConfig(queryStr):
        # Every configuration used here is read from "config/" in "UPDATE"
        # mode.
        Config = libConfig.config()
        Config.queryStr = queryStr
        Config.folderStr = "config/"
        Config.modeStr = "UPDATE"
        Config.load()
        return Config

    def runCommand(Print, commandStr):
        Print.phraseStr = commandStr
        Print.runCommand()

    def report(Print, messageStr):
        Print.phraseStr = messageStr
        Print.printTimeStamp()

    BinHISAT2 = loadConfig("binHISAT2-RUN")
    SAMconvert = loadConfig("binSAMtools-CONVERT")
    SAMsort = loadConfig("binSAMtools-SORT")
    Remove = loadConfig("commandRM")
    expRep = loadConfig(self.queryStr)

    branchStr = expRep.storeDict["branch"]
    pairPostfixStr = expRep.storeDict["pairPostfix"]
    unpairPostfixStr = expRep.storeDict["unpairPostfix"]
    groupList = expRep.storeDict["group"]
    modeStr = expRep.storeDict["mode"]
    replicationList = expRep.storeDict["replication"]
    conditionList = expRep.storeDict["conditionsList"]

    for conditionDict in conditionList:
        annotateConditionStr = conditionDict["genome"]
        trimConditionStr = conditionDict["trim"]
        hisat2ConditionStr = conditionDict["map"]
        directionDict = expRep.storeDict["[hisat2]direction"]
        fileTypeStr = expRep.storeDict["[trim]fileType"]
        inputFileNameStr = expRep.storeDict["[hisat2]inputFileName"]
        outputFolderStr = expRep.storeDict["[hisat2]outputFolder"]
        outputFileNameStr = expRep.storeDict["[hisat2]outputFileName"]
        # A missing "testing" entry counts as testing mode.
        self.testingBool = bool(expRep.storeDict.get("testing", True))

        finalOutputFolderStr = outputFolderStr.format(
            annotate=annotateConditionStr, trim=trimConditionStr)
        pathlib.Path(finalOutputFolderStr).mkdir(parents=True, exist_ok=True)

        Print = libPrint.timer()
        Print.logFilenameStr = (
            "04-hs1-hisat2-{branch}-{hisat2cond}-{annotateCon}-{trimCon}"
        ).format(
            branch=branchStr,
            hisat2cond=hisat2ConditionStr,
            annotateCon=annotateConditionStr,
            trimCon=trimConditionStr,
        )
        Print.folderStr = "log/"
        Print.testingBool = self.testingBool
        Print.startLog()

        # The mapping parameters and the index header depend only on the
        # condition, so they are loaded once here instead of once per sample.
        Para = loadConfig(hisat2ConditionStr)
        Spec = loadConfig(annotateConditionStr)
        baseDict = dict(Para.storeDict)
        baseDict["indexHeader"] = Spec.storeDict["indexHeader"]

        for groupStr in groupList:
            for replicationStr in replicationList:
                # Fresh copy per sample: the steps below add their own keys.
                finalDict = dict(baseDict)

                sampleDict = {
                    "trim": trimConditionStr,
                    "group": groupStr,
                    "replication": replicationStr,
                    "fileType": fileTypeStr,
                }
                # Map each FASTQ placeholder of the HISAT2 command to the
                # arguments that format its input file name.
                if modeStr == "pairEnd":
                    hisat2KeyStr = "command-PE"
                    fastqDict = {
                        "pairForwardFASTQ": dict(
                            sampleDict,
                            direction=directionDict['1'],
                            pairType=pairPostfixStr),
                        "pairReverseFASTQ": dict(
                            sampleDict,
                            direction=directionDict['2'],
                            pairType=pairPostfixStr),
                        "unpairForwardFASTQ": dict(
                            sampleDict,
                            direction=directionDict['1'],
                            pairType=unpairPostfixStr),
                        "unpairReverseFASTQ": dict(
                            sampleDict,
                            direction=directionDict['2'],
                            pairType=unpairPostfixStr),
                    }
                elif modeStr == "singleEnd":
                    hisat2KeyStr = "command-SE"
                    fastqDict = {"unpairFASTQ": sampleDict}
                else:
                    # Unknown mode: no HISAT2 command is issued.
                    hisat2KeyStr = None
                    fastqDict = {}

                outputDict = {
                    "annotate": annotateConditionStr,
                    "trim": trimConditionStr,
                    "hisat2Condition": hisat2ConditionStr,
                    "group": groupStr,
                    "replication": replicationStr,
                }
                samFileStr = outputFileNameStr.format(
                    fileType=".sam", **outputDict)
                bamFileStr = outputFileNameStr.format(
                    fileType=".bam", **outputDict)
                sortedBAMFileStr = outputFileNameStr.format(
                    fileType="-sorted.bam", **outputDict)
                samPath = pathlib.Path(samFileStr)
                bamPath = pathlib.Path(bamFileStr)
                sortedBAMPath = pathlib.Path(sortedBAMFileStr)

                # Step 1: FASTQ -> SAM.
                if samPath.exists():
                    report(Print, "SAM File existed: " + samFileStr)
                elif not bamPath.exists() and not sortedBAMPath.exists():
                    if hisat2KeyStr is not None:
                        commandStr = BinHISAT2.storeDict.get(hisat2KeyStr, "")
                        finalDict.update({
                            keyStr: inputFileNameStr.format(**argDict)
                            for keyStr, argDict in fastqDict.items()
                        })
                        finalDict["outputSAM"] = samFileStr
                        runCommand(Print, commandStr.format(**finalDict))

                # Step 2: SAM -> BAM.
                if bamPath.exists():
                    report(Print, "BAM File existed: " + bamFileStr)
                elif not sortedBAMPath.exists():
                    commandStr = SAMconvert.storeDict.get("command", "")
                    finalDict.update({
                        "outputBAM": bamFileStr,
                        "inputSAM": samFileStr,
                    })
                    runCommand(Print, commandStr.format(**finalDict))

                # The SAM file is only dropped once the BAM is on disk.
                if samPath.exists() and bamPath.exists():
                    commandStr = Remove.storeDict.get("command", "")
                    runCommand(Print, commandStr.format(target=samFileStr))

                # Step 3: BAM -> sorted BAM.
                if sortedBAMPath.exists():
                    report(Print,
                           "Sorted BAM File existed: " + sortedBAMFileStr)
                else:
                    commandStr = SAMsort.storeDict.get("command", "")
                    finalDict.update({
                        "outputBAM": sortedBAMFileStr,
                        "inputBAM": bamFileStr,
                    })
                    runCommand(Print, commandStr.format(**finalDict))

                # The unsorted BAM is only dropped once the sorted one exists.
                if bamPath.exists() and sortedBAMPath.exists():
                    commandStr = Remove.storeDict.get("command", "")
                    runCommand(Print, commandStr.format(target=bamFileStr))

        Print.stopLog()
def trimming(self):
    """Issue the "binTrimmomatic" command for every trimming condition.

    Loads the experiment configuration named by ``self.queryStr``.  After
    creating the folders listed under "checkFolder", one log is opened per
    entry of "conditionsList" and one command is issued per
    group/replication pair:

    * "pairEnd" mode: the input file of every direction, followed by the
      paired and unpaired output file of every direction, with mode "PE";
    * "singleEnd" mode: one input and one output file, with mode "SE".

    Any other mode issues no command.  Nothing beyond the folder creation
    happens when "conditionsList" is not a non-empty list.

    Raises:
        KeyError: if the "binTrimmomatic" configuration has no "command"
            entry or a condition has no "trim" entry.
    """

    def loadConfig(queryStr):
        # Every configuration used here is read from "config/" in "UPDATE"
        # mode.
        Config = libConfig.config()
        Config.queryStr = queryStr
        Config.folderStr = "config/"
        Config.modeStr = "UPDATE"
        Config.load()
        return Config

    # ---- Parameter ----
    BinTrim = loadConfig("binTrimmomatic")
    ExpRep = loadConfig(self.queryStr)

    # ---- Initialization ----
    commandStr = BinTrim.storeDict["command"]
    conditionList = ExpRep.storeDict.get("conditionsList", [])
    groupList = ExpRep.storeDict.get("group", [])
    replicationList = ExpRep.storeDict.get("replication", [])
    directionList = ExpRep.storeDict.get("direction", [])
    branchStr = ExpRep.storeDict.get("branch", "")
    pairStr = ExpRep.storeDict.get("pairPostfix", "")
    unpairStr = ExpRep.storeDict.get("unpairPostfix", "")
    modeStr = ExpRep.storeDict.get("mode", "")
    inputFileNameStr = ExpRep.storeDict.get("[trim]inputFileName", "")
    outputFileNameStr = ExpRep.storeDict.get("[trim]outputFileName", "")
    fileTypeStr = ExpRep.storeDict.get("[trim]fileType", "")
    checkFolderList = ExpRep.storeDict.get("checkFolder", [])
    # A missing "testing" entry counts as testing mode.
    testingBool = bool(ExpRep.storeDict.get("testing", True))

    # ---- Action ----
    for folderStr in checkFolderList:
        pathlib.Path(folderStr).mkdir(parents=True, exist_ok=True)

    if not (isinstance(conditionList, list) and conditionList):
        return

    for conditionDict in conditionList:
        conditionStr = conditionDict['trim']

        Print = libPrint.timer()
        Print.logFilenameStr = "03-trim-{branch}-{cond}".format(
            branch=branchStr, cond=conditionStr)
        Print.folderStr = "log/"
        Print.testingBool = testingBool
        Print.startLog()

        TrimPara = loadConfig(conditionStr)
        headerStr = TrimPara.storeDict.get('header', "")

        for groupStr in groupList:
            for replicationStr in replicationList:
                if modeStr == "pairEnd":
                    inputFileList = []
                    outputFileList = []
                    for directionStr in directionList:
                        inputFileList.append(
                            inputFileNameStr.format(
                                group=groupStr,
                                replication=replicationStr,
                                direction=directionStr,
                                fileType=fileTypeStr))
                        # Paired output first, then unpaired, per direction.
                        for pairTypeStr in (pairStr, unpairStr):
                            outputFileList.append(
                                outputFileNameStr.format(
                                    condition=headerStr,
                                    direction=directionStr,
                                    group=groupStr,
                                    replication=replicationStr,
                                    pairType=pairTypeStr,
                                    fileType=fileTypeStr,
                                ))
                    fileStr = " ".join(inputFileList + outputFileList)
                    trimModeStr = "PE"
                elif modeStr == "singleEnd":
                    inputStr = inputFileNameStr.format(
                        group=groupStr,
                        replication=replicationStr,
                        fileType=fileTypeStr)
                    outputStr = outputFileNameStr.format(
                        condition=headerStr,
                        group=groupStr,
                        replication=replicationStr,
                        fileType=fileTypeStr,
                    )
                    fileStr = "{} {}".format(inputStr, outputStr)
                    trimModeStr = "SE"
                else:
                    # Unknown mode: nothing to run for this sample.
                    continue

                commandDict = dict(TrimPara.storeDict)
                commandDict.update({
                    'files': fileStr,
                    'mode': trimModeStr,
                })
                Print.phraseStr = commandStr.format(**commandDict)
                Print.runCommand()

        Print.stopLog()
def importingStringtie(self):
    """Load StringTie expression tables into SQLite and build summaries.

    Loads the configuration named by ``self.branchStr``.  For every method
    in "methodList" and every (annotation, trim) pair in "conditionList",
    the gene and transcript TSV file of each sample is read with pandas and
    handed to ``self.exportingExpression`` through
    ``self.expressionInputDict``.  Afterwards the tables
    ``GeneExpressionSummary`` and ``TranscriptExpressionSummary`` are created
    in the database at the formatted "sqlPathStr": the control sample
    supplies the identity columns and every sample contributes its FPKM
    (and, for genes, TPM) column, matched on UUID.

    While importing, the leading fields of each file (7 for genes, 10 for
    transcripts) are compared with those of the first non-empty file of the
    same kind for the current method, and the outcome ("Empty", "Same" or
    "Different") is logged.

    Reads ``self.branchStr`` and ``self.testingBool``.

    Raises:
        KeyError: if a sample table holds a UUID that the control table
            lacks, or lacks one that the control table holds.
    """
    geneCreateStr = (
        "CREATE TABLE GeneExpression_{} ("
        "'UUID' TEXT PRIMARY KEY NOT NULL, "
        "'GeneID' TEXT NOT NULL, "
        "'GeneName' TEXT NOT NULL, "
        "'Reference' TEXT NOT NULL, "
        "'Strand' TEXT NOT NULL, "
        "'Start' INTEGER NOT NULL, "
        "'End' INTEGER NOT NULL, "
        "'Coverage' REAL NOT NULL, "
        "'FPKM' REAL NOT NULL, "
        "'TPM' REAL NOT NULL);")
    geneInsertStr = (
        "INSERT INTO GeneExpression_{} "
        "('UUID','GeneID','GeneName','Reference','Strand','Start','End',"
        "'Coverage','FPKM','TPM') "
        "VALUES (?,?,?,?,?,?,?,?,?,?)")
    transcriptCreateStr = (
        "CREATE TABLE TranscriptExpression_{} ("
        "'UUID' TEXT PRIMARY KEY NOT NULL, "
        "'TranscriptID' INTEGER NOT NULL, "
        "'Chromosome' TEXT, "
        "'Strand' TEXT NOT NULL, "
        "'Start' INTEGER NOT NULL, "
        "'End' INTEGER NOT NULL, "
        "'TranscriptName' TEXT NOT NULL, "
        "'ExonCount' INTEGER NOT NULL, "
        "'Length' INTEGER NOT NULL, "
        "'GeneID' TEXT NOT NULL, "
        "'GeneName' TEXT NOT NULL, "
        "'Coverage' REAL NOT NULL, "
        "'FPKM' REAL NOT NULL);")
    transcriptInsertStr = (
        "INSERT INTO TranscriptExpression_{} "
        "('UUID','TranscriptID','Chromosome','Strand','Start','End',"
        "'TranscriptName','ExonCount','Length','GeneID','GeneName',"
        "'Coverage','FPKM') "
        "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)")

    # ---- Initialization for Converting ----
    Target = libConfig.config()
    Target.queryStr = self.branchStr
    Target.folderStr = "config/"
    Target.modeStr = "UPDATE"
    Target.load()

    branchStr = self.branchStr
    controlStr = Target.storeDict.get("controlSample", "")
    # Table and column names are built with "_" in place of "-".
    controlSafeStr = controlStr.replace("-", "_")
    groupList = Target.storeDict.get("group", [])
    replicationList = Target.storeDict.get("replication", [])
    patternStr = Target.storeDict.get("samplePattern", "")
    sampleList = [
        patternStr.format(group=groupStr, replication=replicationStr)
        for groupStr in groupList
        for replicationStr in replicationList
    ]

    conditionList = Target.storeDict.get("conditionList", [])
    methodList = Target.storeDict.get("methodList", [])
    geneSourceDict = Target.storeDict.get("[sqlite]geneSourceDict", dict())
    transcriptSourceDict = Target.storeDict.get(
        "[sqlite]transcriptSourceDict", dict())
    geneExpPathStr = Target.storeDict.get("[sqlite]geneSourcePathStr", "")
    transcriptExpPathStr = Target.storeDict.get(
        "[sqlite]transcriptSourcePathStr", "")
    sqlFolderStr = Target.storeDict.get("sqlFolderStr", "")
    sqlPathStr = Target.storeDict.get("sqlPathStr", "")
    sqlLogStr = Target.storeDict.get("[sqlite]logFilename", "")

    def importTable(Print, titleStr, sampleStr, samplePath, keyWidthInt,
                    baselineSet, sqlPath, createTemplateStr,
                    insertTemplateStr):
        # Read one TSV file, log how its leading fields compare with the
        # baseline, export it, and return the baseline to use next.
        Print.phraseStr = (
            "-- Data format conversion for {} Expression in {} --".format(
                titleStr, sampleStr))
        Print.printTimeStamp()

        sampleDF = pd.read_csv(samplePath, delimiter="\t", header=0)
        Print.printing("[Pandas:Read] " + samplePath)
        rowList = sampleDF.values.tolist()

        currentSet = {
            "\t".join(["UUID." + str(rowInt)] +
                      [str(x) for x in sourceList[0:keyWidthInt]])
            for rowInt, sourceList in enumerate(rowList)
        }
        if not baselineSet:
            Print.printing(" " + sampleStr + ": Empty")
            baselineSet = currentSet
        elif baselineSet == currentSet:
            Print.printing(" " + sampleStr + ": Same")
        else:
            Print.printing(" " + sampleStr + ": Different")

        sampleSafeStr = sampleStr.replace("-", "_")
        self.expressionInputDict = {
            "sqlPath": sqlPath,
            "count": len(rowList),
            "rowList": rowList,
            "createCommand": createTemplateStr.format(sampleSafeStr),
            "insertCommand": insertTemplateStr.format(sampleSafeStr),
        }
        self.exportingExpression(Print)
        return baselineSet

    def summarise(Print, sqlPath, sourceStr, idColumnList, targetList):
        # Create "<sourceStr>Summary" from the per-sample "<sourceStr>_*"
        # tables.  idColumnList holds (name, SQL type) pairs read from the
        # control sample; targetList names the per-sample value columns.
        idNameList = ["UUID"] + [nameStr for nameStr, _ in idColumnList]
        createColumnList = ["'UUID' TEXT PRIMARY KEY NOT NULL"]
        createColumnList.extend(
            "'{}' {}".format(nameStr, typeStr)
            for nameStr, typeStr in idColumnList)
        insertColumnList = list(idNameList)
        for targetStr in targetList:
            for sampleStr in sampleList:
                columnStr = "{target}_{sample}".format(
                    target=targetStr, sample=sampleStr.replace("-", "_"))
                createColumnList.append(columnStr + " REAL")
                insertColumnList.append(columnStr)

        Connect = sqlite3.connect(sqlPath)
        try:
            Cursor = Connect.cursor()
            Cursor.execute("CREATE TABLE {}Summary ({})".format(
                sourceStr, ",".join(createColumnList)))
            Connect.commit()
            Print.printing("[SQLite3:CreateTable] " + sqlPath)

            # The control sample defines the rows and identity columns.
            resultDict = dict()
            controlExc = Cursor.execute("SELECT {} from {}_{}".format(
                ", ".join(idNameList), sourceStr, controlSafeStr))
            for rowTup in controlExc:
                resultDict[rowTup[0]] = dict(zip(idNameList, rowTup))

            for sampleStr in sampleList:
                sampleSafeStr = sampleStr.replace("-", "_")
                sampleExc = Cursor.execute("SELECT UUID, {} from {}_{}".format(
                    ", ".join(targetList), sourceStr, sampleSafeStr))
                for rowTup in sampleExc:
                    subDict = resultDict[rowTup[0]]
                    for targetStr, valueObj in zip(targetList, rowTup[1:]):
                        subDict["{}_{}".format(targetStr,
                                               sampleSafeStr)] = valueObj

            insertCommand = (
                "INSERT INTO {}Summary ({column}) VALUES ({value})".format(
                    sourceStr,
                    column=",".join(insertColumnList),
                    value=",".join("?" * len(insertColumnList))))
            valueRowList = [[subDict[columnStr]
                             for columnStr in insertColumnList]
                            for subDict in resultDict.values()]
            Cursor.executemany(insertCommand, valueRowList)
            Connect.commit()
            Print.printing("[SQLite3:Insert] " + sqlPath)
        finally:
            # Release the database even when a statement above fails.
            Connect.close()
        Print.printing("[SQLite3:Close]\n")

    for methodStr in methodList:
        pathlib.Path(sqlFolderStr.format(
            branch=branchStr, method=methodStr)).mkdir(parents=True,
                                                       exist_ok=True)
        geneFolderStr = geneSourceDict.get(methodStr, "")
        transcriptFolderStr = transcriptSourceDict.get(methodStr, "")
        # Gene and transcript rows have different layouts, so each kind is
        # compared against its own baseline.
        geneBaselineSet = set()
        transcriptBaselineSet = set()

        for conditionTup in conditionList:
            antStr = conditionTup[0]
            trimStr = conditionTup[1]

            Print = libPrint.timer()
            Print.logFilenameStr = sqlLogStr.format(ant=antStr, trim=trimStr)
            Print.folderStr = sqlFolderStr.format(branch=branchStr,
                                                  method=methodStr)
            Print.testingBool = self.testingBool
            Print.startLog()

            # One database per method/condition; resolved before the sample
            # loop so the summaries below never depend on a loop variable.
            sqlPath = sqlPathStr.format(branch=branchStr,
                                        method=methodStr,
                                        ant=antStr,
                                        trim=trimStr)

            for sampleStr in sampleList:
                geneBaselineSet = importTable(
                    Print, "Gene", sampleStr,
                    geneExpPathStr.format(folder=geneFolderStr,
                                          branch=branchStr,
                                          ant=antStr,
                                          trim=trimStr,
                                          sample=sampleStr),
                    7, geneBaselineSet, sqlPath,
                    geneCreateStr, geneInsertStr)

                # Transcript
                transcriptBaselineSet = importTable(
                    Print, "Transcript", sampleStr,
                    transcriptExpPathStr.format(folder=transcriptFolderStr,
                                                branch=branchStr,
                                                ant=antStr,
                                                trim=trimStr,
                                                sample=sampleStr),
                    10, transcriptBaselineSet, sqlPath,
                    transcriptCreateStr, transcriptInsertStr)

            Print.phraseStr = "-- Summarising for Gene Expression --"
            Print.printTimeStamp()
            summarise(Print, sqlPath, "GeneExpression",
                      [("GeneID", "TEXT"), ("GeneName", "TEXT")],
                      ["FPKM", "TPM"])

            # Transcript
            Print.phraseStr = "-- Summarising for Transcript Expression --"
            Print.printTimeStamp()
            summarise(Print, sqlPath, "TranscriptExpression",
                      [("TranscriptID", "INTEGER"),
                       ("TranscriptName", "TEXT"),
                       ("GeneID", "TEXT"),
                       ("GeneName", "TEXT")],
                      ["FPKM"])

            Print.stopLog()
def summaring(self):
    """Issue the "binSAMtools-FLAGSTAT" command for every sorted BAM file.

    Loads the configuration named by ``self.branchStr`` and walks every
    combination of trim condition, annotation condition (first element of
    each "conditionList" entry) and HISAT2 condition.  One log is opened per
    combination, and the command is issued for each group/replication whose
    ``-sorted.bam`` file exists.

    Side effects: sets ``self.testingBool`` and creates the formatted
    "[hisat2]outputFolder" of every trim/annotation combination.
    """

    def loadConfig(queryStr):
        Config = libConfig.config()
        Config.queryStr = queryStr
        Config.folderStr = "config/"
        Config.modeStr = "UPDATE"
        Config.load()
        return Config

    FLAGstat = loadConfig("binSAMtools-FLAGSTAT")
    expRep = loadConfig(self.branchStr)
    storeDict = expRep.storeDict

    trimConditionList = storeDict.get("[trim]condition", [])
    hisat2ConditionList = storeDict.get("[hisat2]Condition", [])
    annotateConditionList = storeDict.get("conditionList", [])
    groupList = storeDict.get("group", [])
    replicationList = storeDict.get("replication", [])
    outputFolderStr = storeDict.get("[hisat2]outputFolder", "")
    outputFileNameStr = storeDict.get("[hisat2]outputFileName", "")

    # A missing "testing" entry counts as testing mode.
    self.testingBool = True if storeDict.get("testing", True) else False

    for trimConditionStr in trimConditionList:
        for annotateEntry in annotateConditionList:
            annotateConditionStr = annotateEntry[0]

            folderPath = pathlib.Path(
                outputFolderStr.format(
                    annotateCondition=annotateConditionStr,
                    trimCondition=trimConditionStr))
            folderPath.mkdir(parents=True, exist_ok=True)

            for hisat2ConditionStr in hisat2ConditionList:
                Print = libPrint.timer()
                Print.logFilenameStr = (
                    "04-hs2-hisat2-{hisat2cond}-{annotateCon}-{trimCon}"
                ).format(
                    hisat2cond=hisat2ConditionStr,
                    annotateCon=annotateConditionStr,
                    trimCon=trimConditionStr,
                )
                Print.folderStr = "log/"
                Print.testingBool = self.testingBool
                Print.startLog()

                for groupStr in groupList:
                    for replicationStr in replicationList:
                        sortedBAMFileStr = outputFileNameStr.format(
                            annotateCondition=annotateConditionStr,
                            trimCondition=trimConditionStr,
                            hisat2Condition=hisat2ConditionStr,
                            group=groupStr,
                            replication=replicationStr,
                            fileType="-sorted.bam",
                        )
                        # Samples without a sorted BAM are skipped silently.
                        if not pathlib.Path(sortedBAMFileStr).exists():
                            continue
                        flagstatStr = FLAGstat.storeDict.get("command", "")
                        Print.phraseStr = flagstatStr.format(
                            BAMfile=sortedBAMFileStr)
                        Print.runCommand()

                Print.stopLog()
def assembling(self):
    """Issue the StringTie assembling command for every sample.

    Uses the "binStringTie-RUN-withoutAnnotation" command when
    ``self.withoutAnnotation`` is true and "binStringTie-RUN" otherwise.
    The configuration named by ``self.branchStr`` supplies the groups,
    replications and (annotation, trim) pairs, and its
    "[<self.headerStr>]..." entries supply the file-name templates.  One log
    is opened per condition pair and one command is issued per
    group/replication.

    Side effects: sets ``self.testingBool`` and creates the formatted
    output folder of every condition pair.

    Raises:
        KeyError: if the selected StringTie configuration has no "command"
            entry.
    """

    def loadConfig(queryStr):
        # Every configuration used here is read from "config/" in "UPDATE"
        # mode.
        Config = libConfig.config()
        Config.queryStr = queryStr
        Config.folderStr = "config/"
        Config.modeStr = "UPDATE"
        Config.load()
        return Config

    # ---- Parameter for Assembling ----
    if self.withoutAnnotation:
        BinMap = loadConfig("binStringTie-RUN-withoutAnnotation")
    else:
        BinMap = loadConfig("binStringTie-RUN")
    commandStr = BinMap.storeDict["command"]

    # ---- Initialization for Assembling ----
    Target = loadConfig(self.branchStr)

    branchStr = Target.storeDict.get("branch", "")
    groupList = Target.storeDict.get("group", [])
    replicationList = Target.storeDict.get("replication", [])
    hisat2ConditionStr = Target.storeDict.get("[hisat2]Condition", "")
    conditionList = Target.storeDict.get("conditionList", [])
    inputFileNameStr = Target.storeDict.get(
        "[{}]inputFileName".format(self.headerStr), "")
    outputFileNameStr = Target.storeDict.get(
        "[{}]outputFileName".format(self.headerStr), "")
    outputFolderStr = Target.storeDict.get(
        "[{}]outputFolder".format(self.headerStr), "")

    # A missing "testing" entry counts as testing mode.
    self.testingBool = bool(Target.storeDict.get("testing", True))

    for conditionTup in conditionList:
        antCondStr = conditionTup[0]
        trimCondStr = conditionTup[1]

        Annotate = loadConfig(antCondStr)
        threadStr = Annotate.storeDict.get("thread", "")
        antPathStr = Annotate.storeDict.get("antPath", "")

        # ---- Action ----
        Print = libPrint.timer()
        Print.logFilenameStr = (
            "05-{stringtie}-assembling-{branch}-{annotate}-{trim}".format(
                stringtie=self.headerStr,
                branch=branchStr,
                annotate=antCondStr,
                trim=trimCondStr,
            ))
        Print.folderStr = "log/"
        Print.testingBool = self.testingBool
        Print.startLog()

        # Format into a separate name: writing the result back into
        # outputFolderStr would consume the template, and every later
        # condition would then reuse the first condition's folder.
        conditionFolderStr = outputFolderStr.format(
            annotateCondition=antCondStr, trimCondition=trimCondStr)
        pathlib.Path(conditionFolderStr).mkdir(parents=True, exist_ok=True)

        for groupStr in groupList:
            for repliStr in replicationList:
                outputFilenameStr = outputFileNameStr.format(
                    annotateCondition=antCondStr,
                    trimCondition=trimCondStr,
                    group=groupStr,
                    replication=repliStr)
                inputFilenameStr = inputFileNameStr.format(
                    annotateCondition=antCondStr,
                    hisat2Condition=hisat2ConditionStr,
                    trimCondition=trimCondStr,
                    group=groupStr,
                    replication=repliStr)

                Print.phraseStr = commandStr.format(
                    bamfile=inputFilenameStr,
                    outputfile=outputFilenameStr,
                    thread=threadStr,
                    antPath=antPathStr)
                Print.runCommand()

        Print.stopLog()
def estimating(self):
    """Issue the "binStringTie-ESTIMATE" command for every sample.

    The configuration named by ``self.branchStr`` supplies the groups,
    replications and (annotation, trim) pairs; its "[<self.headerStr>]..."
    entries supply the file-name templates.  One log is opened per condition
    pair and one command is issued per group/replication.  The command's
    ``mergePath`` is the annotation's "antPath" when
    ``self.directEstimating`` is true, and the formatted merged file name
    otherwise.

    Side effects: sets ``self.testingBool`` and creates the formatted
    ballgown folder of every sample.

    Raises:
        KeyError: if the "binStringTie-ESTIMATE" configuration has no
            "command" entry.
    """

    def loadConfig(queryStr):
        Config = libConfig.config()
        Config.queryStr = queryStr
        Config.folderStr = "config/"
        Config.modeStr = "UPDATE"
        Config.load()
        return Config

    # ---- Parameter for Estimating ----
    estimateStr = loadConfig("binStringTie-ESTIMATE").storeDict["command"]

    # ---- Initialization for Estimating ----
    Target = loadConfig(self.branchStr)
    storeDict = Target.storeDict

    def headerEntry(nameStr):
        # Look up "[<header>]<name>", falling back to an empty template.
        return storeDict.get("[{}]{}".format(self.headerStr, nameStr), "")

    branchStr = storeDict.get("branch", "")
    groupList = storeDict.get("group", [])
    replicationList = storeDict.get("replication", [])
    hisat2ConditionStr = storeDict.get("[hisat2]Condition", "")
    conditionList = storeDict.get("conditionList", [])
    bamTemplateStr = headerEntry("inputFileName")
    mergedTemplateStr = headerEntry("mergedFileName")
    ballgownTemplateStr = headerEntry("ballgownFolder")
    gtfTemplateStr = headerEntry("gtfFileName")
    tsvTemplateStr = headerEntry("tsvFileName")

    # A missing "testing" entry counts as testing mode.
    self.testingBool = True if storeDict.get("testing", True) else False

    for conditionTup in conditionList:
        antCondStr = conditionTup[0]
        trimCondStr = conditionTup[1]

        Annotate = loadConfig(antCondStr)
        threadStr = Annotate.storeDict.get("thread", "")
        antPathStr = Annotate.storeDict.get("antPath", "")

        # ---- Action ----
        Print = libPrint.timer()
        Print.logFilenameStr = (
            "05-{stringtie}-estimating-{branch}-{annotate}-{trim}".format(
                stringtie=self.headerStr,
                branch=branchStr,
                annotate=antCondStr,
                trim=trimCondStr,
            ))
        Print.folderStr = "log/"
        Print.testingBool = self.testingBool
        Print.startLog()

        for groupStr in groupList:
            for repliStr in replicationList:
                sampleDict = {
                    "annotateCondition": antCondStr,
                    "trimCondition": trimCondStr,
                    "group": groupStr,
                    "replication": repliStr,
                }

                ballgownPathStr = ballgownTemplateStr.format(**sampleDict)
                pathlib.Path(ballgownPathStr).mkdir(parents=True,
                                                    exist_ok=True)

                bamPathStr = bamTemplateStr.format(
                    hisat2Condition=hisat2ConditionStr, **sampleDict)
                # The merged file is named per condition, not per sample.
                mergedPathStr = mergedTemplateStr.format(
                    annotateCondition=antCondStr,
                    trimCondition=trimCondStr)
                gtfPathStr = gtfTemplateStr.format(**sampleDict)
                tsvPathStr = tsvTemplateStr.format(**sampleDict)

                if self.directEstimating:
                    mergePathStr = antPathStr
                else:
                    mergePathStr = mergedPathStr

                Print.phraseStr = estimateStr.format(
                    thread=threadStr,
                    mergePath=mergePathStr,
                    bamfile=bamPathStr,
                    ballgownPath=ballgownPathStr,
                    gtffile=gtfPathStr,
                    tsvfile=tsvPathStr)
                Print.runCommand()

        Print.stopLog()
def converting(self):
    """Convert annotations with gffread and copy the GTF into place.

    Handles the "conditionsList" entries of the configuration named by
    ``self.branchStr`` whose "transcriptome" is "gffRead".  For each entry
    the "binCufflinks-gffread" command is issued when the genome's "gtfPath"
    file does not exist yet.  The formatted "transcriptomeFolder" is then
    created and the "commandCP" command is issued to copy the GTF to the
    formatted "transcriptomeGTF" path.

    Side effects: sets ``self.testingBool`` and writes one log per entry
    into the genome's "dbgaPath" folder.

    Raises:
        KeyError: if a required configuration entry is missing.
    """

    def loadConfig(queryStr):
        Config = libConfig.config()
        Config.queryStr = queryStr
        Config.folderStr = "config/"
        Config.modeStr = "UPDATE"
        Config.load()
        return Config

    # ---- Parameter ----
    BinGFF = loadConfig("binCufflinks-gffread")
    Copying = loadConfig("commandCP")

    # ---- Initialization for Converting ----
    Target = loadConfig(self.branchStr)
    storeDict = Target.storeDict

    # A missing "testing" entry counts as testing mode.
    self.testingBool = True if storeDict.get("testing", True) else False

    gffreadStr = BinGFF.storeDict["command"]
    copyStr = Copying.storeDict["command"]
    branchStr = self.branchStr

    # Only the conditions whose transcriptome comes from gffread apply.
    gffReadList = []
    for entryDict in storeDict["conditionsList"]:
        if entryDict["transcriptome"] == "gffRead":
            gffReadList.append(entryDict)
    gtfDict = storeDict["gtfDict"]

    for conditionDict in gffReadList:
        genomeStr = conditionDict["genome"]
        trimStr = conditionDict["trim"]
        transcriptomeStr = conditionDict["transcriptome"]
        infoDict = {
            "branch": self.branchStr,
            "annotate": genomeStr,
            "trim": trimStr,
            "folder": gtfDict[transcriptomeStr]['folder'],
        }

        targetFolderStr = storeDict["transcriptomeFolder"]
        targetStr = storeDict["transcriptomeGTF"]

        Spec = loadConfig(genomeStr)
        inputStr = Spec.storeDict["antPath"]
        outputStr = Spec.storeDict["gtfPath"]
        outputFolderStr = Spec.storeDict["dbgaPath"]

        Print = libPrint.timer()
        Print.logFilenameStr = "05-gffConversion-{branch}-{annotate}".format(
            branch=branchStr,
            annotate=genomeStr,
        )
        Print.folderStr = outputFolderStr
        Print.testingBool = self.testingBool
        Print.startLog()

        targetPath = targetStr.format(**infoDict)

        gtfMissingBool = not pathlib.Path(outputStr).exists()
        if gtfMissingBool:
            Print.phraseStr = gffreadStr.format(inputFile=inputStr,
                                                outputFile=outputStr)
            Print.runCommand()

        folderPath = targetFolderStr.format(**infoDict)
        pathlib.Path(folderPath).mkdir(parents=True, exist_ok=True)

        Print.phraseStr = copyStr.format(output=outputStr, target=targetPath)
        Print.runCommand()

        Print.stopLog()
def diffing(self):
    """Issue the "binCuffDiff-RUN" command once per condition.

    For every entry of "conditionsList" in the configuration named by
    ``self.branchStr``, the sorted BAM file names of all replications are
    joined with "," inside a group and with " " between groups, and passed
    to the command together with the formatted GTF path, the result folder,
    the thread count and the comma-joined group labels.

    Side effects: sets ``self.testingBool``, creates the formatted
    "[CuffDiff]resultFolder" and writes one log per condition under
    ``log/``.

    Raises:
        KeyError: if a required configuration entry is missing.
    """

    def loadConfig(queryStr):
        Config = libConfig.config()
        Config.queryStr = queryStr
        Config.folderStr = "config/"
        Config.modeStr = "UPDATE"
        Config.load()
        return Config

    # ---- Parameter for CuffDiff ----
    cuffdiffStr = loadConfig("binCuffDiff-RUN").storeDict["command"]

    # ---- Initialization for CuffDiff ----
    Target = loadConfig(self.branchStr)
    storeDict = Target.storeDict

    groupList = storeDict["group"]
    replicationList = storeDict["replication"]
    threadStr = storeDict["thread"]
    bamFileNameStr = storeDict["[hisat2]outputFileName"]
    gtfFileNameStr = storeDict["transcriptomeGTF"]
    gtfDict = storeDict["gtfDict"]
    resultFolderStr = storeDict["[CuffDiff]resultFolder"]
    conditionsList = storeDict["conditionsList"]

    for conditionDict in conditionsList:
        genomeStr = conditionDict["genome"]
        trimStr = conditionDict["trim"]
        transcriptomeStr = conditionDict["transcriptome"]
        folderStr = gtfDict[transcriptomeStr]['folder']
        hisat2ConditionStr = conditionDict["map"]

        # A missing "testing" entry counts as testing mode.
        self.testingBool = True if storeDict.get("testing", True) else False

        infoDict = {
            "branch": self.branchStr,
            "method": transcriptomeStr,
            "annotate": genomeStr,
            "trim": trimStr,
            "folder": folderStr,
            "hisat2Condition": hisat2ConditionStr,
            "fileType": "-sorted.bam",
        }

        # ---- Action ----
        Print = libPrint.timer()
        Print.logFilenameStr = (
            "07-CuffDiff-{branch}-from({method})-{annotate}-{trim}".format(
                **infoDict))
        Print.folderStr = "log/"
        Print.testingBool = self.testingBool
        Print.startLog()

        resultPathStr = resultFolderStr.format(**infoDict)
        pathlib.Path(resultPathStr).mkdir(parents=True, exist_ok=True)
        gtfFileStr = gtfFileNameStr.format(**infoDict)

        # One comma-joined entry per group, space-separated between groups.
        bamSampleStr = " ".join(
            ",".join(
                bamFileNameStr.format(
                    **dict(infoDict, group=groupStr, replication=repliStr))
                for repliStr in replicationList)
            for groupStr in groupList)

        infoDict["thread"] = threadStr
        infoDict["outputFolder"] = resultPathStr
        infoDict["labelList"] = ",".join(groupList)
        infoDict["mergedGTF"] = gtfFileStr
        infoDict["bamFiles"] = bamSampleStr

        Print.phraseStr = cuffdiffStr.format(**infoDict)
        Print.runCommand()

        Print.stopLog()
#!/usr/bin/env python3 import libConfig # ---- Configuration of Indexing Conditions ---- SpeDataBase = libConfig.config() SpeDataBase.queryStr = "speciesDatabase" SpeDataBase.folderStr = "config/" SpeDataBase.queryDict = { "from": "binHISAT2-BUILD", "dbgaPath": "userData/dbga-GenomeAnnotation/speciesDatabase/", "seqPath": "userData/dbgs-GenomeSequence/speciesDatabase/speciesDatabase.fn", "antPath": "userData/dbga-GenomeAnnotation/speciesDatabase/speciesDatabase.gff3", "gtfPath": "userData/dbga-GenomeAnnotation/speciesDatabase/speciesDatabase.gtf", "indexHeader": "largeData/02-hisat2Index/speciesDatabase", "checkFolder": "largeData/02-hisat2Index/", "thread": "6", "testing": False, } SpeDataBase.modeStr = "UPDATE" SpeDataBase.save() SpeDataBaseTwo = libConfig.config() SpeDataBaseTwo.queryStr = "speciesDatabase2" SpeDataBaseTwo.folderStr = "config/" SpeDataBaseTwo.queryDict = { "from": "binHISAT2-BUILD", "dbgaPath": "userData/dbga-GenomeAnnotation/speciesDatabase2/", "seqPath": "userData/dbgs-GenomeSequence/speciesDatabase2/speciesDatabase2.fn",
hisat2-build [options]* <reference_in> <ht2_base> Main arguments <reference_in> A comma-separated list of FASTA files containing the reference sequences to be aligned to, or, if -c is specified, the sequences themselves. E.g., <reference_in> might be chr1.fa,chr2.fa,chrX.fa,chrY.fa, or, if -c is specified, this might be GGTCATCCT,ACGGGTCGT,CCGTTCTATGCGGCTTA. <ht2_base> The basename of the index files to write. By default, hisat2-build writes files named NAME.1.ht2, NAME.2.ht2, NAME.3.ht2, NAME.4.ht2, NAME.5.ht2, NAME.6.ht2, NAME.7.ht2, and NAME.8.ht2 where NAME is <ht2_base>. """ HISAT = libConfig.config() HISAT.queryStr = "binHISAT2-BUILD" HISAT.folderStr = "config/" HISAT.queryDict = { "command": "bin/hisat2/hisat2-build -p {thread} {seqPath} {indexHeader}" } HISAT.modeStr = "OVERWRITE" HISAT.save() # ---- Configuration of Trimming Command ---- """ FROM: http://www.usadellab.org/cms/?page=trimmomatic java -jar <path to trimmomatic.jar> PE [-threads <threads] [-phred33 | -phred64] [-trimlog <logFile>] \ <input 1> <input 2> <paired output 1> <unpaired output 1> <paired output 2> <unpaired output 2> \ <step 1> ...
#!/usr/bin/env python3 import libConfig # ---- Configuration of Experiment Design ---- expRep = libConfig.config() expRep.queryStr = "speciesTreatment" expRep.folderStr = "config/" # Arguments/Parameters below need to modify depend on # your experiment design and naming style expRep.queryDict = { "branch" : "speciesTreatment", "group" : ["control","treat1","treat2","treat3","treat4"], "direction" : ["R1","R2"], "replication" : ["1","2","3"], "pairPostfix" : "pair", "unpairPostfix" : "unpair", "mode" : "pairEnd", "thread" : "6", "conditionsList" : [ { "genome" : "speciesDatabase", "trim" : "trimQ30", "transcriptome" : "dsStringtie", "map" : "hisat2ForStringtie", }, { "genome" : "speciesDatabase", "trim" : "trimQ30", "transcriptome" : "gffRead", "map" : "hisat2ForCufflinks", } ],