def MakeReference(self):
    """Write the FASTA-style reference file from the barcode, target and reference lists.

    Nothing happens when ``self.strRefFile`` already exists.  Otherwise the
    barcode, target and reference-sequence files are read in parallel; record
    ``i`` of the output is ``>BARCODE:TARGET`` on one line followed by the
    upper-cased reference sequence on the next.

    Raises:
        AssertionError: when the three input lists differ in length.
    """
    if os.path.isfile(self.strRefFile):
        return

    def _CleanSeq(strLine):
        ## Same clean-up chain for barcode and target rows.
        return strLine.replace('\n', '').replace('\r', '').strip().upper()

    with open(self.strBarcodeFile) as Barcode, \
            open(self.strTargetSeqFile) as Target, \
            open(self.strReferenceSeqFile) as Ref, \
            open(self.strRefFile, 'w') as Output:

        listBarcode = Helper.RemoveNullAndBadKeyword(Barcode)
        listTarget = Helper.RemoveNullAndBadKeyword(Target)
        listRef = Helper.RemoveNullAndBadKeyword(Ref)

        ## defensive
        assert len(listBarcode) == len(listTarget) == len(listRef), 'Barcode, Target and Reference must be a same row number.'

        ## First pass: validate every barcode/target pair and build the headers.
        listHeader = []
        for strBarRow, strTarRow in zip(listBarcode, listTarget):
            strBarSeq = _CleanSeq(strBarRow)
            strTarSeq = _CleanSeq(strTarRow)

            Helper.CheckIntegrity(self.strBarcodeFile, strBarSeq)  ## defensive
            Helper.CheckIntegrity(self.strBarcodeFile, strTarSeq)  ## defensive

            listHeader.append('%s:%s\n' % (strBarSeq, strTarSeq))

        ## Second pass: emit one header + sequence record per reference row.
        for intIdx, strRefRow in enumerate(listRef):
            strRefSeq = strRefRow.replace('\r', '').strip().upper()
            Output.write('>%s%s\n' % (listHeader[intIdx], strRefSeq))
def __init__(self, strSample, strRef, options, InstInitFolder):
    """Prepare a BaseEdit run for one sample.

    Initialises the ``UserFolderAdmin`` base, removes any previous output of
    this sample, recreates the sample folder, copies the relevant command-line
    options onto the instance and makes sure the alignment temp folder exists.

    Args:
        strSample: sample name.
        strRef: reference name.
        options: parsed command-line options (``multicore``, ``gap_open``,
            ``gap_extend``, ``target_window``, ``indel_check_pos``,
            ``target_ref_alt``, ``PAM_seq``, ``PAM_pos``, ``Guide_pos``).
        InstInitFolder: object providing ``strLogPath``.
    """
    UserFolderAdmin.__init__(self, strSample, strRef, options, InstInitFolder.strLogPath)

    ## The sample name must be set before the clean-up, which builds its path from it.
    self.strSample = strSample
    self._RemoveTmpBeforStart()
    self.MakeSampleFolder()  ## inheritance

    self.strRef = strRef

    ## Reference input/output files.
    ## NOTE(review): self.strRefDir is not set here - presumably by the base class; confirm.
    strRefDir = self.strRefDir
    self.strBarcodeFile = os.path.join(strRefDir, 'Barcode.txt')
    self.strReferenceSeqFile = os.path.join(strRefDir, 'Reference.txt')
    self.strRefFile = os.path.join(strRefDir, 'Reference.fa')

    ## Alignment settings.
    self.intCore = options.multicore
    self.strGapOpen = options.gap_open
    self.strGapExtend = options.gap_extend

    ## Target / PAM / guide coordinates, kept as given on the command line.
    self.strTargetWindow = options.target_window
    self.strIndelCheckPos = options.indel_check_pos
    self.strTargetRefAlt = options.target_ref_alt
    self.strPamSeq = options.PAM_seq
    self.strPamPos = options.PAM_pos
    self.strGuidePos = options.Guide_pos

    strAlignmentDir = './Output/{user}/{project}/{sample}/Tmp/Alignment'.format(
        user=self.strUser, project=self.strProject, sample=self.strSample)
    Helper.MakeFolderIfNot(strAlignmentDir)
def RunPipeline(**kwargs):
    """Run every indel-searcher step for each sample, then normalise by group.

    ``listSamples``, ``options`` and ``InstInitFolder`` are read from the
    enclosing scope; ``kwargs`` is accepted but not read here.

    Raises:
        Exception: when the collected group labels are neither exactly
            {'EXP', 'CTRL'} nor empty/blank.
    """
    setGroup = set()

    ## listSamples rows look like:
    ##   190819_Nahye_12K_D4_D0_1-Cas9D7         Cas9D7  Ctrl
    ##   190819_Nahye_12K_D4_eCas9_Rep1-Cas9D7   Cas9D7  Exp
    for strSampleRow in listSamples:
        ## tupSampleInfo, e.g. (190819_Nahye_12K_D4_D0_1-Cas9D7, Cas9D7, Ctrl)
        tupSampleInfo = Helper.SplitSampleInfo(strSampleRow)
        if not tupSampleInfo:
            continue

        ## strSample  : sample name     (190819_Nahye_12K_D4_D0_1-Cas9D7)
        ## strRef     : reference name  (Cas9D7)
        ## strExpCtrl : Ctrl / Exp / ""
        strSample, strRef, strExpCtrl = tupSampleInfo
        setGroup.add(strExpCtrl)

        ## options carries, among others, the PAM type (Cas9/Cpf1) and
        ## the PAM position (Forward/Reverse).
        InstRunner = clsIndelSearcherRunner(strSample, strRef, options, InstInitFolder)

        logging.info('SplitFile')
        InstRunner.SplitFile()

        logging.info('MakeReference')
        InstRunner.MakeReference()

        ## Commands for Indel_searcher_crispresso_hash.py
        logging.info('MakeIndelSearcherCmd')
        listCmd = InstRunner.MakeIndelSearcherCmd()

        logging.info('RunMulticore')
        RunMulticore(listCmd, options.multicore)  ## from CoreSystem.py

        logging.info('MakeOutput')
        InstRunner.MakeOutput()

        logging.info('RunIndelFreqCalculator')
        InstRunner.RunIndelFreqCalculator()

    if setGroup == {'EXP', 'CTRL'}:
        ## Uses the runner of the last processed sample.
        InstRunner.IndelNormalization()
        return

    if setGroup in (set(), {''}, {' '}):
        ## No group labels at all: nothing to normalise.
        return

    logging.error('The group category is not appropriate. : %s' % setGroup)
    logging.error('Please make sure your project file is correct.')
    logging.error('The group category must be Exp or Ctrl')
    raise Exception
def SubtractIndelWithD0(dictD0IndelMerge, dictExpIndel):
    """Write, per sample, the indel proportions after subtracting the D0 background.

    For every indel of every barcode in ``dictExpIndel`` the experimental
    proportion (IndelCount / Total, rounded to 6 digits) is reduced by the D0
    proportion of the same barcode/indel; negative results are clamped to 0.
    When the barcode or the indel is missing from ``dictD0IndelMerge`` the D0
    columns are written as 'None' and the experimental proportion is kept.

    Args:
        dictD0IndelMerge: {barcode: {'Total': int, indel_seq: {'IndelCount': int}}}
        dictExpIndel: {sample: {barcode: {'Total': int, indel_seq: {'IndelCount': int}}}}
    """
    strD0SubResultDir = './Output/{user}/{project}/All_results/D0SubResult'.format(
        user=strUserName, project=strProjectName)
    Helper.MakeFolderIfNot(strD0SubResultDir)

    for strSample, dictBarcode in dictExpIndel.items():
        strOutputPath = os.path.join(
            strD0SubResultDir, '{sample}_D0SubResult.txt').format(sample=strSample)

        with open(strOutputPath, 'w') as Output:
            Output.write(
                'Barcode_indel_seq\tD0_total\tD0_indel_prop\tExp_total\tExp_indel_prop\tD0_sub_indel_prop\n')

            for strBarcode, dictCountTotalAndIndel in dictBarcode.items():
                intExpTotal = dictCountTotalAndIndel['Total']

                for strIndelSeq, dictCount in dictCountTotalAndIndel.items():
                    if strIndelSeq == 'Total':
                        continue

                    ## D0 side first: a missing barcode/indel means no background.
                    try:
                        dictD0Barcode = dictD0IndelMerge[strBarcode]
                        intD0Total = dictD0Barcode['Total']
                        intD0Count = dictD0Barcode[strIndelSeq]['IndelCount']
                    except KeyError:
                        floD0Prop = None
                        listD0Col = ['None', 'None']
                    else:
                        floD0Prop = round(intD0Count / float(intD0Total), 6)
                        listD0Col = [intD0Total, floD0Prop]

                    floExpProp = round(dictCount['IndelCount'] / float(intExpTotal), 6)

                    if floD0Prop is None:
                        floSubExpIndel = floExpProp
                    else:
                        floSubExpIndel = floExpProp - floD0Prop
                        if floSubExpIndel < 0:
                            floSubExpIndel = 0

                    listOutCol = [strIndelSeq] + listD0Col + [intExpTotal, floExpProp, floSubExpIndel]
                    Output.write('\t'.join(map(str, listOutCol)) + '\n')
def MakeReference(self):
    """Join the barcode and reference lists into the tab-separated reference file.

    Both input files hold ``NAME:SEQUENCE`` rows.  For every reference row
    whose NAME also appears in the barcode file, one line
    ``NAME<TAB>BARCODE<TAB>REFERENCE`` is written to ``self.strRefFile``.
    Reference rows without a matching barcode are logged and skipped.

    Raises:
        AssertionError: when the two lists differ in length.
        IndexError: when a row contains no ':' separator.
    """
    with open(self.strBarcodeFile) as Barcode, \
            open(self.strReferenceSeqFile) as Ref, \
            open(self.strRefFile, 'w') as Output:

        listBarcode = Helper.RemoveNullAndBadKeyword(Barcode)
        listRef = Helper.RemoveNullAndBadKeyword(Ref)

        ## defensive
        assert len(listBarcode) == len(listRef), 'Barcode and Reference must be a same row number.'

        ## name -> barcode sequence
        dictBarcode = {}
        for strBarcodeRow in listBarcode:
            strBarcodeRow = strBarcodeRow.replace('\n', '').replace('\r', '').upper()
            Helper.CheckIntegrity(self.strBarcodeFile, strBarcodeRow)  ## defensive

            ## Distinct local names: the original rebound listBarcode/listRef
            ## inside the loops iterating over them.
            listBarcodeCol = strBarcodeRow.split(':')
            dictBarcode[listBarcodeCol[0]] = listBarcodeCol[1]

        for strRefRow in listRef:
            strRefRow = strRefRow.replace('\n', '').replace('\r', '').upper()
            ## NOTE(review): the barcode file path is passed here although the row
            ## comes from the reference file - kept as in the original; confirm.
            Helper.CheckIntegrity(self.strBarcodeFile, strRefRow)  ## defensive

            listRefCol = strRefRow.split(':')
            strRefSample = listRefCol[0]
            strRefSeq = listRefCol[1]

            try:
                strBarcodeSeq = dictBarcode[strRefSample]
            except KeyError:
                logging.error('no matching')
                ## The original passed the values as (format, arg), which makes
                ## logging report a formatting error instead of the message.
                logging.error('%s %s', strRefSample, strRefSeq)
                continue

            Output.write('%s\t%s\t%s\n' % (strRefSample, strBarcodeSeq, strRefSeq))
def _RemoveTmpBeforStart(self):
    """Delete the output folder of this sample, if it exists, before a new run.

    The folder is ``./Output/<user>/<project>/<sample>``.  The removal command
    is still handed to ``Helper.PreventFromRmMistake`` as a string, but ``rm``
    itself is run with an argument list rather than through a shell, so a
    name containing spaces or shell metacharacters cannot be split or
    interpreted.  The original also called ``.format()`` a second time on the
    already-interpolated command, which fails for names containing braces.
    """
    strFolderPath = './Output/{user}/{project}/{sample}'.format(
        user=self.strUser, project=self.strProject, sample=self.strSample)

    if not os.path.isdir(strFolderPath):
        return

    strCmd = 'rm -r %s' % strFolderPath
    Helper.PreventFromRmMistake(strCmd)  ## defensive

    logging.info(
        'Delete the %s folder before starting if these were existed.' % self.strSample)

    ## As before, the exit status of rm is not inspected.
    sp.call(['rm', '-r', strFolderPath])
def RunPipeline(**kwargs):
    """Run the BaseEdit steps for every sample row in ``listSamples``.

    Rows starting with '#' and rows that ``Helper.SplitSampleInfo`` rejects
    are skipped.  ``listSamples``, ``options`` and ``InstInitFolder`` are read
    from the enclosing scope; ``kwargs`` is accepted but not read here.
    """
    for strSampleRow in listSamples:
        ## Commented-out sample.
        if strSampleRow[0] == '#':
            continue

        tupSampleInfo = Helper.SplitSampleInfo(strSampleRow)
        if not tupSampleInfo:
            continue

        ## The group label (third field) is not used by this pipeline.
        strSample, strRef, _strExpCtrl = tupSampleInfo

        InstBaseEdit = clsBaseEditRunner(strSample, strRef, options, InstInitFolder)
        InstBaseEdit.MakeReference()

        listCmd = InstBaseEdit.MakeIndelSearcherCmd()
        RunMulticore(listCmd, options.multicore)  ## from CoreSystem.py

        InstBaseEdit.MakeMergeTarget()
        InstBaseEdit.CopyToAllResultFolder()
def Main():
    """Count sequence frequencies for every sample of the project in parallel.

    Reads ``./User/<user>/<project>.txt``, builds one parameter list
    ``[sample, alignment_dir, total_readcount_dir, reference_dict]`` per
    usable row and maps ``Count_seq_freq`` over them with ``iCore`` worker
    processes.  The pool is now always shut down: closed and joined on
    success, terminated and joined when the map raises.
    """
    logging.info('Program Start')

    logging.info('Make commands for a multiple processing')
    lPara = []
    with open('./User/{user}/{project}.txt'.format(
            user=strUser, project=strProject)) as Project_list:

        for strSample in Project_list:
            ## Commented-out sample.
            if strSample[0] == '#':
                continue

            tupSampleInfo = Helper.SplitSampleInfo(strSample)
            if not tupSampleInfo:
                continue

            strSample, strRef, strExpCtrl = tupSampleInfo
            strSample = strSample.replace('\n', '').replace('\r', '')

            sFile_path = './Output/{user}/{project}/{sample}/Tmp/Alignment'.format(
                user=strUser, project=strProject, sample=strSample)
            sTotal_readcnt_path = './Output/{user}/{project}/{sample}/Tmp/All'.format(
                user=strUser, project=strProject, sample=strSample)
            dInput_fa = Make_ref_dict(strRef)

            lPara.append(
                [strSample, sFile_path, sTotal_readcnt_path, dInput_fa])

    logging.info('Multiple processing Start')
    p = mp.Pool(iCore)
    try:
        ## .get() blocks until every task finished and re-raises worker errors.
        p.map_async(Count_seq_freq, lPara).get()
    except BaseException:
        ## Do not leave workers running behind a failed or interrupted run.
        p.terminate()
        raise
    else:
        p.close()
    finally:
        p.join()
    logging.info('Multiple processing End')

    #logging.info('Count group Start')
    #Count_group()
    #logging.info('Count group End')

    #logging.info('Trim data Start')
    #Trim_data()
    #logging.info('Trim data End')

    logging.info('Program End')
def RunPipeline(**kwargs):
    """Process every sample with the indel searcher and normalise Exp against Ctrl.

    ``listSamples``, ``options`` and ``InstInitFolder`` come from the
    enclosing scope; ``kwargs`` is accepted but not read here.

    Raises:
        Exception: when the set of group labels is neither exactly
            {'EXP', 'CTRL'} nor empty/blank.
    """
    setGroup = set()

    for strSampleRow in listSamples:
        tupSampleInfo = Helper.SplitSampleInfo(strSampleRow)
        if not tupSampleInfo:
            continue

        strSample, strRef, strExpCtrl = tupSampleInfo
        setGroup.add(strExpCtrl)

        InstRunner = clsIndelSearcherRunner(strSample, strRef, options, InstInitFolder)

        logging.info('SplitFile')
        InstRunner.SplitFile()

        logging.info('MakeReference')
        InstRunner.MakeReference()

        logging.info('MakeIndelSearcherCmd')
        listCmd = InstRunner.MakeIndelSearcherCmd()

        logging.info('RunMulticore')
        RunMulticore(listCmd, options.multicore)  ## from CoreSystem.py

        logging.info('MakeOutput')
        InstRunner.MakeOutput()

        logging.info('RunIndelFreqCalculator')
        InstRunner.RunIndelFreqCalculator()

    ## Samples without any group label need no normalisation.
    boolNoGroup = setGroup in (set(), {''}, {' '})

    if setGroup == {'EXP', 'CTRL'}:
        ## Called on the runner of the last processed sample.
        InstRunner.IndelNormalization()
    elif not boolNoGroup:
        logging.error('The group category is not appropriate. : %s' % setGroup)
        logging.error('Please make sure your project file is correct.')
        logging.error('The group category must be Exp or Ctrl')
        raise Exception
def CountGroup(InstParameters):
    """Merge the random-barcode counts of all samples belonging to the same group.

    Input rows of ``<project_dir>/<sample>_all_random_barcode.txt``:

        Sorting_barcode  Unique_RandomBarcodeNumber_In_SortingBarcode  RandomBarcode  Each_RandomBarcode_read_count
        TATATCATAGCGTACTCATC    8    TGCGTTTG    3
        TATATCATAGCGTACTCATC    8    CGCGTTTG    3
        TATATCATAGCGTACTCATC    8    TAGTTTTG    1
        TATATCATAGCGTACTCATC    8    ATAGTTTG    1

    For every group of the sample list except 'CTRL', the read counts of
    identical (sorting barcode, random barcode) pairs are summed over the
    group's samples, the unique-random-barcode number per sorting barcode is
    recomputed from the merged data, and two summary files are written to
    ``./Output/Group_result/<GROUP>``.

    Fixes relative to the original:
      * ``strGroupOfSample`` was compared before being assigned, so the first
        sample row raised UnboundLocalError; rows are now parsed first.
      * group names are compared upper-cased on both sides (the group set was
        upper-cased, the per-row value was not).
      * the merge dictionary lives per group, so counts really accumulate
        across the samples of a group, as the '+=' merge logic implies.
      * the sorting barcode is taken from the record itself rather than from
        ``key.split('_')[0]``, which truncated names containing '_'.

    Args:
        InstParameters: object providing ``strSampleList`` (path of the
            tab-separated sample/reference/group list), ``strUser`` and
            ``strProject``.
    """
    sHeader = ''

    with open(InstParameters.strSampleList) as Sample:  ## tmp input
        listSample = Sample.readlines()

    ## [sample, reference, group] per non-blank row.
    listSampleCol = [
        strRow.replace('\n', '').replace('\r', '').split('\t')
        for strRow in listSample if strRow.strip()
    ]
    setGroup = set(listCol[2].upper() for listCol in listSampleCol)

    strProjectDir = './Output/{user}/{project}'.format(
        user=InstParameters.strUser, project=InstParameters.strProject)

    for strGroup in setGroup:
        if strGroup == 'CTRL':
            continue

        ## NOTE(review): this project-level folder is created but the summaries
        ## below go to './Output/Group_result/<group>' as in the original.
        strGroupDir = os.path.join(strProjectDir, 'Group_result')
        Helper.MakeFolderIfNot(strGroupDir)

        ## 'SortingBarcode_RandomBarcode' -> [sorting, unique_cnt, random, read_cnt]
        ## e.g. ('GECKO_6367_GATCTGCTC', ['GECKO_6367', '2', 'GATCTGCTC', 156])
        dTotal_RandomBarcode_cnt_in_SortingBarcode = OrderedDict()

        for listCol in listSampleCol:
            strSample = listCol[0]
            strGroupOfSample = listCol[2].upper()
            if strGroupOfSample != strGroup:
                continue

            ## matched group names -> Sum the counts
            with open('{project_dir}/{sample}_all_random_barcode.txt'.format(
                    project_dir=strProjectDir,
                    sample=strSample)) as RandomBarcode_SeqFreq:

                sHeader = RandomBarcode_SeqFreq.readline()

                for sRow in RandomBarcode_SeqFreq:
                    lCol = sRow.replace('\n', '').split('\t')

                    ## Unique name : Doench2014_1000_CTCTGGGGT
                    sSorting_and_Random_barcode_seq = lCol[0] + '_' + lCol[2]
                    iRandomBarcode_count = int(lCol[3])
                    lCol[3] = iRandomBarcode_count

                    if sSorting_and_Random_barcode_seq in dTotal_RandomBarcode_cnt_in_SortingBarcode:
                        dTotal_RandomBarcode_cnt_in_SortingBarcode[
                            sSorting_and_Random_barcode_seq][3] += iRandomBarcode_count
                    else:
                        ## initial assignment
                        dTotal_RandomBarcode_cnt_in_SortingBarcode[
                            sSorting_and_Random_barcode_seq] = lCol

        ## Regroup the merged records by sorting barcode (column 0 of the record).
        dGrouped_by_SortingBarcode = OrderedDict()
        for lRecord in dTotal_RandomBarcode_cnt_in_SortingBarcode.values():
            dGrouped_by_SortingBarcode.setdefault(lRecord[0], []).append(lRecord)

        ## sKey: TATATCATAGCGTACTCATC
        ## llValue: [[TATATCATAGCGTACTCATC, 8, TGCGTTTG, 3], ...]
        dRecal_total_kind_of_RandomBarcode = OrderedDict()
        for sKey, llValue in dGrouped_by_SortingBarcode.items():
            iKind_of_RandomBarcode = len(llValue)
            for lValue in llValue:
                lValue[1] = iKind_of_RandomBarcode  ## Recal using group total cnt.

            dRecal_total_kind_of_RandomBarcode[sKey] = sorted(
                llValue, key=lambda x: x[3], reverse=True)

        strEachGroup = './Output/Group_result/%s' % strGroup
        Helper.MakeFolderIfNot(strEachGroup)

        with open(os.path.join(strEachGroup, 'Summary_all_random_barcode_in_group.txt'), 'w') as Sort_Random_cnt, \
                open(os.path.join(strEachGroup, 'Summary_Unique_RandomBarcodeNumber_in_group.txt'), 'w') as Uniq_random_cnt:

            Sort_Random_cnt.write(sHeader)
            Uniq_random_cnt.write(
                'Sorting_barcode\tUnique_RandomBarcodeNumber_In_SortingBarcode\n')

            for sSortBarcode, llCol in dRecal_total_kind_of_RandomBarcode.items():
                Uniq_random_cnt.write(
                    '\t'.join(map(str, [sSortBarcode, len(llCol)])) + '\n')
                for lCol in llCol:
                    Sort_Random_cnt.write('\t'.join(map(str, lCol)) + '\n')
def Main():
    """Command-line entry point of the BaseEdit frequency analyzer.

    Parses the options, prepares the user/project folders and logging, reads
    the project's sample list and runs the BaseEdit pipeline for every sample
    through the ``CheckProcessedFiles``-decorated ``RunPipeline``.

    The two marker rows of the usage diagram are built with explicit padding
    so that <3>, <4> and <5> sit under the positions quoted in the same usage
    text (indel check 39-40, PAM 43-45, guide 23-42); as found, the rows were
    not aligned with the sequence above them.
    """
    print('BaseEdit program start: %s' % datetime.now())

    sCmd = ''.join([
        "BaseEdit frequency analyzer\n\n./Run_BaseEdit_freq.py -t 15 -w 16-48 --indel_check_pos 39-40 --target_ref_alt A,T --PAM_seq NGG --PAM_pos 43-45 --Guide_pos 23-42",
        " --gap_open -10 --gap_extend 1\n\n",
        "The sequence position is the one base position (start:1)\n",
        "1: Barcode\n",
        "2: Base target window (end pos = PAM pos +3)\n",
        "3: Indel check pos\n",
        "4: PAM pos\n",
        "5: Guide pos (without PAM)\n\n",
        "TATCTCTATCAGCACACAAGCATGCAATCACCTTGGGTCCAAAGGTCC\n",
        "<------1------><----------------2--------------->\n",
        ## <3> ends on base 40, <4> covers bases 43-45.
        " " * 37 + "<3>  <4>   \n",
        ## <5> covers bases 23-42.
        " " * 22 + "<---------5-------->      \n\n",
    ])

    parser = OptionParser(sCmd)

    parser.add_option("-t", "--thread", default="1", type="int", dest="multicore",
                      help="multiprocessing number")
    parser.add_option('--gap_open', default='-10', type='float', dest='gap_open',
                      help='gap open: -100~0')
    parser.add_option('--gap_extend', default='1', type='float', dest='gap_extend',
                      help='gap extend: 1~100')
    parser.add_option("-w", "--target_window", type="str", dest="target_window",
                      help="a window size for target sequence : 20-48")
    parser.add_option("--indel_check_pos", type="str", dest="indel_check_pos",
                      help="indel check position to filter : 39-40; insertion 39, deletion 39 & 40")
    parser.add_option("--target_ref_alt", type="str", dest="target_ref_alt",
                      help="Ref 'A' is changed to Alt 'T': A,T")
    parser.add_option("--PAM_seq", type="str", dest="PAM_seq",
                      help="PAM sequence: NGG, NGC ...")
    parser.add_option("--PAM_pos", type="str", dest="PAM_pos",
                      help="PAM position range in the reference seqeunce : 43-45")
    parser.add_option("--Guide_pos", type="str", dest="Guide_pos",
                      help="Guide position range in the reference seqeunce : 23-42")
    parser.add_option('--python', dest='python',
                      help='The python path including the CRISPResso2')
    parser.add_option('--user', dest='user_name',
                      help='The user name with no space')
    parser.add_option('--project', dest='project_name',
                      help='The project name with no space')

    options, args = parser.parse_args()

    InstInitFolder = InitialFolder(options.user_name, options.project_name,
                                   os.path.basename(__file__))
    InstInitFolder.MakeDefaultFolder()
    InstInitFolder.MakeInputFolder()
    InstInitFolder.MakeOutputFolder()

    ## Log to the project log file and mirror everything on stdout.
    logging.basicConfig(
        format='%(process)d %(levelname)s %(asctime)s : %(message)s',
        level=logging.DEBUG,
        filename=InstInitFolder.strLogPath,
        filemode='a')
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    logging.info('Program start')
    if options.multicore > 15:
        logging.warning('Optimal treads <= 15')
    logging.info(str(options))

    with open(InstInitFolder.strProjectFile) as Sample_list:
        listSamples = Helper.RemoveNullAndBadKeyword(Sample_list)

        strInputProject = './Input/{user}/Query/{project}'.format(
            user=options.user_name, project=options.project_name)

        @CheckProcessedFiles
        def RunPipeline(**kwargs):
            ## One BaseEdit run per usable sample row; '#' rows are skipped.
            for strSample in listSamples:
                if strSample[0] == '#':
                    continue

                tupSampleInfo = Helper.SplitSampleInfo(strSample)
                if not tupSampleInfo:
                    continue
                strSample, strRef, strExpCtrl = tupSampleInfo

                InstBaseEdit = clsBaseEditRunner(strSample, strRef, options, InstInitFolder)
                InstBaseEdit.MakeReference()

                listCmd = InstBaseEdit.MakeIndelSearcherCmd()
                RunMulticore(listCmd, options.multicore)  ## from CoreSystem.py

                InstBaseEdit.MakeMergeTarget()
                InstBaseEdit.CopyToAllResultFolder()

        RunPipeline(InstInitFolder=InstInitFolder,
                    strInputProject=strInputProject,
                    listSamples=listSamples,
                    logging=logging)

    print('BaseEdit program end: %s' % datetime.now())
def Main():
    """Command-line entry point of the indel searcher.

    Parses the options, prepares the user/project folders and logging, reads
    the project's sample list and runs the indel-search pipeline for every
    sample through the ``CheckProcessedFiles``-decorated ``RunPipeline``.
    """
    parser = OptionParser('Indel search program for CRISPR CAS9 & CPF1\n<All default option> python2.7 Run_indel_searcher.py --pam_type Cas9 --pam_pos Forward')

    ## (flags, keyword arguments) in the order they appear in --help.
    listOptionSpec = [
        (('-t', '--thread'), dict(default='1', type='int', dest='multicore',
                                  help='multiprocessing number, recommendation:t<16')),
        (('-c', '--chunk_number'), dict(default='400000', type='int', dest='chunk_number',
                                        help='split FASTQ, must be multiples of 4. file size < 1G recommendation:40000, size > 1G recommendation:400000')),
        (('-q', '--base_quality'), dict(default='20', dest='base_quality',
                                        help='NGS read base quality')),
        (('--gap_open',), dict(default='-10', type='float', dest='gap_open',
                               help='gap open: -100~0')),
        (('--gap_extend',), dict(default='1', type='float', dest='gap_extend',
                                 help='gap extend: 1~100')),
        (('-i', '--insertion_window'), dict(default='4', type='int', dest='insertion_window',
                                            help='a window size for insertions')),
        (('-d', '--deletion_window'), dict(default='4', type='int', dest='deletion_window',
                                           help='a window size for deletions')),
        (('--pam_type',), dict(dest='pam_type', help='PAM type: Cas9 Cpf1')),
        (('--pam_pos',), dict(dest='pam_pos', help='PAM position: Forward Reverse')),
        (('--python',), dict(dest='python', help='The python path including the CRISPResso2')),
        (('--user',), dict(dest='user_name', help='The user name with no space')),
        (('--project',), dict(dest='project_name', help='The project name with no space')),
        (('--pickle',), dict(dest='pickle', default='False',
                             help='Dont remove the pickles in the tmp folder : True, False')),
        (('--split',), dict(dest='split', default='False',
                            help='Dont remove the split files in the input folder : True, False')),
        (('--classfied_FASTQ',), dict(dest='class_fastq', default='True',
                                      help='Dont remove the ClassfiedFASTQ in the tmp folder : True, False')),
        (('--ednafull',), dict(dest='ednafull', help='The nucleotide alignment matrix')),
    ]
    for tupFlags, dictSpec in listOptionSpec:
        parser.add_option(*tupFlags, **dictSpec)

    options, args = parser.parse_args()

    InstInitFolder = InitialFolder(options.user_name, options.project_name, os.path.basename(__file__))
    InstInitFolder.MakeDefaultFolder()
    InstInitFolder.MakeInputFolder()
    InstInitFolder.MakeOutputFolder()

    ## Log to the project log file and mirror everything on stdout.
    logging.basicConfig(format='%(process)d %(levelname)s %(asctime)s : %(message)s',
                        level=logging.DEBUG,
                        filename=InstInitFolder.strLogPath,
                        filemode='a')
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    logging.info('Program start')
    if options.multicore > 15:
        logging.warning('Optimal treads <= 15')
    logging.info(str(options))

    ## InstInitFolder.strProjectFile is e.g. ./User/Nahye/2019_Nahye_Cas9D7_samples.txt
    ## with rows such as:
    ##   190819_Nahye_12K_D4_D0_1-Cas9D7         Cas9D7  Ctrl
    ##   190819_Nahye_12K_D4_eCas9_Rep1-Cas9D7   Cas9D7  Exp
    ##   190819_Nahye_12K_D4_eCas9_Rep2-Cas9D7   Cas9D7  Exp
    ##   190819_Nahye_12K_D4_evo_Rep1-Cas9D7     Cas9D7  Exp
    with open(InstInitFolder.strProjectFile) as Sample_list:

        listSamples = Helper.RemoveNullAndBadKeyword(Sample_list)
        intProjectNumInTxt = len(listSamples)

        strInputProject = './Input/{user}/FASTQ/{project}'.format(user=options.user_name,
                                                                  project=options.project_name)

        @CheckProcessedFiles
        def RunPipeline(**kwargs):
            setGroup = set()

            for strSampleRow in listSamples:
                ## e.g. (190819_Nahye_12K_D4_D0_1-Cas9D7, Cas9D7, Ctrl)
                tupSampleInfo = Helper.SplitSampleInfo(strSampleRow)
                if not tupSampleInfo:
                    continue

                ## sample name, reference name, Ctrl/Exp/""
                strSample, strRef, strExpCtrl = tupSampleInfo
                setGroup.add(strExpCtrl)

                ## options carries the PAM type (Cas9/Cpf1), PAM position (Forward/Reverse), ...
                InstRunner = clsIndelSearcherRunner(strSample, strRef, options, InstInitFolder)

                logging.info('SplitFile')
                InstRunner.SplitFile()

                logging.info('MakeReference')
                InstRunner.MakeReference()

                ## Commands for Indel_searcher_crispresso_hash.py
                logging.info('MakeIndelSearcherCmd')
                listCmd = InstRunner.MakeIndelSearcherCmd()

                logging.info('RunMulticore')
                RunMulticore(listCmd, options.multicore)  ## from CoreSystem.py

                logging.info('MakeOutput')
                InstRunner.MakeOutput()

                logging.info('RunIndelFreqCalculator')
                InstRunner.RunIndelFreqCalculator()

            if setGroup == {'EXP', 'CTRL'}:
                ## Called on the runner of the last processed sample.
                InstRunner.IndelNormalization()
                return

            if setGroup in (set(), {''}, {' '}):
                ## No group labels: nothing to normalise.
                return

            logging.error('The group category is not appropriate. : %s' % setGroup)
            logging.error('Please make sure your project file is correct.')
            logging.error('The group category must be Exp or Ctrl')
            raise Exception

        RunPipeline(InstInitFolder=InstInitFolder,
                    strInputProject=strInputProject,
                    intProjectNumInTxt=intProjectNumInTxt,
                    listSamples=listSamples,
                    logging=logging)

    logging.info('Program end')
def __init__(self, strSample, strRef, options, InstInitFolder):
    """Prepare an indel-searcher run for one sample.

    Initialises the ``UserFolderAdmin`` base, creates the sample folder,
    copies the relevant options onto the instance, resolves the reference
    input files and locates the sample's FASTQ file.

    The lower-case fallback names ('barcode.txt', ...) are now joined to
    ``self.strRefDir`` with ``os.path.join``, like the primary names; the
    original concatenated them without a separator, so the fallback only
    worked when ``strRefDir`` happened to end with a slash.

    Args:
        strSample: sample name.
        strRef: reference name.
        options: parsed command-line options (``chunk_number``,
            ``base_quality``, ``insertion_window``, ``deletion_window``,
            ``pam_type``, ``pam_pos``, ``pickle``, ``class_fastq``, ``split``).
        InstInitFolder: object providing ``strLogPath`` and ``strProjectFile``.
    """
    UserFolderAdmin.__init__(self, strSample, strRef, options, InstInitFolder.strLogPath)
    self.MakeSampleFolder()

    self.strProjectFile = InstInitFolder.strProjectFile
    self.intChunkSize = options.chunk_number
    self.strQualCutoff = options.base_quality
    self.intInsertionWin = options.insertion_window  # Insertion window 0,1,2,3,4
    self.intDeletionWin = options.deletion_window  # Deletion window 0,1,2,3,4
    self.strPamType = options.pam_type  # CRISPR type : Cpf1(2 cleavages), Cas9(1 cleavage)
    self.strPamPos = options.pam_pos  # Barcode target position : Forward (barcode + target), Reverse (target + barcode)
    self.strPickle = options.pickle
    self.strClassFASTQ = options.class_fastq
    self.strSplit = options.split
    self.strLogPath = InstInitFolder.strLogPath

    self.strBarcodeFile = os.path.join(self.strRefDir, 'Barcode.txt')
    self.strReferenceSeqFile = os.path.join(self.strRefDir, 'Reference_sequence.txt')
    self.strTargetSeqFile = os.path.join(self.strRefDir, 'Target_region.txt')
    self.strRefFile = os.path.join(self.strRefDir, 'Reference.fa')

    ## The file name required for the user is 'B'arcode.txt but it may be written as 'b'arcode.txt by mistake.
    ## This part is to fix the situation as mentioned above.
    tupFallback = (
        ('strBarcodeFile', 'barcode.txt', 'Barcode'),
        ('strReferenceSeqFile', 'reference_sequence.txt', 'Reference'),
        ('strTargetSeqFile', 'target_region.txt', 'Target'),
    )
    for strAttr, strLowerName, strLabel in tupFallback:
        if os.path.isfile(getattr(self, strAttr)):
            continue

        strLowerPath = os.path.join(self.strRefDir, strLowerName)
        if os.path.isfile(strLowerPath):
            setattr(self, strAttr, strLowerPath)
        else:
            logging.error('%s path is not correct, please make sure the path correctly.' % strLabel)

    self.strFastqDir = './Input/{user}/FASTQ/{project}'.format(user=self.strUser,
                                                               project=self.strProject)
    ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1'
    self.strSampleDir = os.path.join(self.strFastqDir, self.strSample)

    ## Base name (without '.fastq') of the FASTQ file in the sample folder.
    ## When several exist, the last one listed wins; when none, it stays ''.
    self.strFastq_name = ''
    for strFile in os.listdir(self.strSampleDir):
        if os.path.isfile(self.strSampleDir + '/' + strFile) and strFile.split('.')[-1] == 'fastq':
            self.strFastq_name = '.'.join(strFile.split('.')[:-1])

    logging.info('File name : %s' % self.strFastq_name)

    ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1/Fastq_file.fastq'
    self.strInputFile = os.path.join(self.strSampleDir, self.strFastq_name + '.fastq')
    ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1/Fastq_file.txt'
    self.strInputList = os.path.join(self.strSampleDir, self.strFastq_name + '.txt')

    ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1/Split_files'
    self.strSplitPath = os.path.join(self.strSampleDir, 'Split_files')
    Helper.MakeFolderIfNot(self.strSplitPath)

    self.strPair = 'False'  # FASTQ pair: True, False
def Convert_Indelsearcher_output(strSampleRefGroup):
    """Convert indel-searcher reference and read files into BaseEdit input files.

    For the sample/reference named in the tab-separated row this
      * writes ``Barcode.txt`` ('barcode:barcode') and ``Reference.txt``
        ('barcode:reference') into the BaseEdit reference folder, and
      * splits ``<sample>_Classified_Indel_barcode.fastq`` into one
        ``<barcode>.txt`` query file per barcode, holding the sequence line of
        every read whose header names that barcode.

    All files are handled with ``with`` blocks; the original never closed the
    BaseEdit barcode output and closed nothing when an error occurred.

    Args:
        strSampleRefGroup: one row of the project file,
            'sample<TAB>reference[<TAB>group]'.

    Raises:
        IndexError: when a FASTQ header line does not contain 'Barcode_'.
        KeyError: when a FASTQ header names a barcode absent from Barcode.txt.
    """
    listSampleRefGroup = strSampleRefGroup.replace('\n', '').replace('\r', '').split('\t')

    strSample = listSampleRefGroup[0]
    strRef = listSampleRefGroup[1]

    print('Processing: %s, %s' % (strSample, strRef))

    strBaseEditRefFolder = '../Base_edit_2/Input/{user}/Reference/{project}/{ref}'.format(
        user=strUser, project=strProject, ref=strRef)
    strBaseEditQueryFolder = '../Base_edit_2/Input/{user}/Query/{project}/{sample}'.format(
        user=strUser, project=strProject, sample=strSample)

    try:
        Helper.MakeFolderIfNot(strBaseEditRefFolder)
        Helper.MakeFolderIfNot(strBaseEditQueryFolder)
    except OSError as e:
        ## Best effort, as before: report and carry on.
        print(e)

    ## BaseEdit refer format : filename, barcode, reference
    strIndelRefFile = './Input/{user}/Reference/{project}/{ref}/Reference_sequence.txt'.format(
        user=strUser, project=strProject, ref=strRef)
    strIndelBarcodeFile = './Input/{user}/Reference/{project}/{ref}/Barcode.txt'.format(
        user=strUser, project=strProject, ref=strRef)

    ## conversion target to barcode:refseq
    dictBarcodeSeq = {}

    with open(strIndelRefFile) as ReferenceFile_in_IndelSearcher, \
            open(strIndelBarcodeFile) as BarcodeFile_in_IndelSearcher, \
            open(strBaseEditRefFolder + '/Barcode.txt', 'w') as BarcodeFile_for_BaseEdit, \
            open(strBaseEditRefFolder + '/Reference.txt', 'w') as Reference_for_BaseEdit:

        for strBarcodeIndelSearcher, strReferenceIndelSearcher in zip(
                BarcodeFile_in_IndelSearcher, ReferenceFile_in_IndelSearcher):

            strBarcodeIndelSearcher = strBarcodeIndelSearcher.replace('\n', '').strip()
            strReferenceIndelSearcher = strReferenceIndelSearcher.replace('\n', '').strip()

            dictBarcodeSeq[strBarcodeIndelSearcher] = []

            ## first is filename, second is barcode. BaseEdit barcode format
            BarcodeFile_for_BaseEdit.write(
                strBarcodeIndelSearcher + ':' + strBarcodeIndelSearcher + '\n')
            Reference_for_BaseEdit.write(
                strBarcodeIndelSearcher + ':' + strReferenceIndelSearcher + '\n')

    strClassifiedFastq = './Output/{user}/{project}/{sample}/Tmp/{sample}_Classified_Indel_barcode.fastq'.format(
        user=strUser, project=strProject, sample=strSample)

    ## Classified_Indel_barcode has all total sequence.
    ## FASTQ records are 4 lines: line 0 is the header carrying the barcode,
    ## line 1 is the read sequence; lines 2 and 3 are ignored.
    with open(strClassifiedFastq) as Total_result_file:
        strBarcode = None
        for i, strRow in enumerate(Total_result_file):  ## for query reads
            intLineInRecord = i % 4
            if intLineInRecord == 0:
                strBarcode = strRow.split('Barcode_')[1].split(':')[0]
            elif intLineInRecord == 1:
                dictBarcodeSeq[strBarcode].append(strRow)

    ## One query file per barcode, including barcodes without any read.
    for strBarcode, listSeq in dictBarcodeSeq.items():
        with open(
                '../Base_edit_2/Input/{user}/Query/{project}/{sample}/{barcode}.txt'.format(
                    user=strUser, project=strProject, sample=strSample, barcode=strBarcode),
                'w') as Output:
            Output.write(''.join(listSeq))