def MakeReference(self):
        """
        Build the FASTA reference file from the barcode, target and reference lists.

        Nothing happens when ``self.strRefFile`` already exists. Otherwise every
        output record is a ``>BARCODE:TARGET`` header line followed by the
        upper-cased reference sequence found on the same row index.
        """
        if os.path.isfile(self.strRefFile):
            return

        def _Normalize(strRaw):
            ## Drop line breaks and surrounding blanks, then upper-case.
            return strRaw.replace('\n', '').replace('\r', '').strip().upper()

        with open(self.strBarcodeFile) as fhBarcode, \
                open(self.strTargetSeqFile) as fhTarget, \
                open(self.strReferenceSeqFile) as fhRef, \
                open(self.strRefFile, 'w') as fhOutput:

            rowsBarcode = Helper.RemoveNullAndBadKeyword(fhBarcode)
            rowsTarget = Helper.RemoveNullAndBadKeyword(fhTarget)
            rowsRef = Helper.RemoveNullAndBadKeyword(fhRef)

            ## defensive
            assert len(rowsBarcode) == len(rowsTarget) == len(rowsRef), 'Barcode, Target and Reference must be a same row number.'

            ## First pass: validate every barcode/target pair and prepare the
            ## record names before anything is written.
            listHeader = []
            for strRawBar, strRawTar in zip(rowsBarcode, rowsTarget):
                strCleanBar = _Normalize(strRawBar)
                strCleanTar = _Normalize(strRawTar)

                Helper.CheckIntegrity(self.strBarcodeFile, strCleanBar)  ## defensive
                Helper.CheckIntegrity(self.strBarcodeFile, strCleanTar)  ## defensive

                listHeader.append(strCleanBar + ':' + strCleanTar + '\n')

            ## Second pass: one FASTA record per reference row.
            for intIdx, strRawRef in enumerate(rowsRef):
                strSeq = strRawRef.replace('\r', '').strip().upper()
                fhOutput.write('>' + listHeader[intIdx] + strSeq + '\n')
    def __init__(self, strSample, strRef, options, InstInitFolder):
        """
        Prepare one sample of a base-edit run.

        Runs the UserFolderAdmin initializer, clears the sample's previous
        output via ``_RemoveTmpBeforStart``, creates the sample folder, copies
        the analysis options onto the instance, resolves the reference file
        paths and creates the ``Tmp/Alignment`` output folder.
        """
        UserFolderAdmin.__init__(
            self, strSample, strRef, options, InstInitFolder.strLogPath)

        ## The sample name has to be in place before the clean-up call below.
        self.strSample = strSample
        self._RemoveTmpBeforStart()
        self.MakeSampleFolder()  ## inheritance

        ## Run parameters taken from the parsed command line options.
        self.strRef = strRef
        self.intCore = options.multicore
        self.strGapOpen = options.gap_open
        self.strGapExtend = options.gap_extend
        self.strTargetWindow = options.target_window
        self.strIndelCheckPos = options.indel_check_pos
        self.strTargetRefAlt = options.target_ref_alt

        ## Reference inputs and the generated reference file.
        strRefDir = self.strRefDir
        self.strBarcodeFile = os.path.join(strRefDir, 'Barcode.txt')
        self.strReferenceSeqFile = os.path.join(strRefDir, 'Reference.txt')
        self.strRefFile = os.path.join(strRefDir, 'Reference.fa')

        ## PAM / guide layout.
        self.strPamSeq = options.PAM_seq
        self.strPamPos = options.PAM_pos
        self.strGuidePos = options.Guide_pos

        strAlignmentDir = './Output/{user}/{project}/{sample}/Tmp/Alignment'.format(
            user=self.strUser,
            project=self.strProject,
            sample=self.strSample)
        Helper.MakeFolderIfNot(strAlignmentDir)
        def RunPipeline(**kwargs):
            """
            Run the indel searcher steps for every sample row in listSamples.

            Each row looks like
                190819_Nahye_12K_D4_D0_1-Cas9D7 Cas9D7  Ctrl
            and is split into (sample name, reference name, Ctrl/Exp/"").
            After all samples are processed, indel normalization runs only
            when the collected groups are exactly EXP and CTRL.
            """
            setGroup = set()

            for strSampleRow in listSamples:
                ## e.g. (190819_Nahye_12K_D4_D0_1-Cas9D7, Cas9D7, Ctrl)
                tupSampleInfo = Helper.SplitSampleInfo(strSampleRow)
                if not tupSampleInfo:
                    continue

                strSample, strRef, strExpCtrl = tupSampleInfo
                setGroup.add(strExpCtrl)

                ## options ... has PAM type: Cas9 Cpf1, PAM position: Forward Reverse ...
                InstRunner = clsIndelSearcherRunner(
                    strSample, strRef, options, InstInitFolder)

                logging.info('SplitFile')
                InstRunner.SplitFile()

                logging.info('MakeReference')
                InstRunner.MakeReference()

                ## Indel_searcher_crispresso_hash.py
                logging.info('MakeIndelSearcherCmd')
                listCmd = InstRunner.MakeIndelSearcherCmd()

                logging.info('RunMulticore')
                RunMulticore(listCmd, options.multicore)  ## from CoreSystem.py

                logging.info('MakeOutput')
                InstRunner.MakeOutput()

                logging.info('RunIndelFreqCalculator')
                InstRunner.RunIndelFreqCalculator()

            if setGroup == {'EXP', 'CTRL'}:
                InstRunner.IndelNormalization()
                return

            ## No group column at all (or only blanks): nothing to normalize.
            if setGroup in (set(), {''}, {' '}):
                return

            logging.error('The group category is not appropriate. : %s' % setGroup)
            logging.error('Please make sure your project file is correct.')
            logging.error('The group category must be Exp or Ctrl')
            raise Exception
def SubtractIndelWithD0(dictD0IndelMerge, dictExpIndel):
    """
    Write one '<sample>_D0SubResult.txt' per experimental sample.

    For every indel of every barcode the experimental indel proportion is
    reduced by the merged D0 proportion of the same barcode/indel (floored at
    zero). When the D0 data has no entry for that barcode/indel, the D0
    columns are written as 'None' and the experimental proportion is kept.

    dictD0IndelMerge: {barcode: {'Total': n, indel_seq: {'IndelCount': n}}}
    dictExpIndel:     {sample: {barcode: {'Total': n, indel_seq: {'IndelCount': n}}}}
    """
    strD0SubResultDir = './Output/{user}/{project}/All_results/D0SubResult'.format(
        user=strUserName, project=strProjectName)
    Helper.MakeFolderIfNot(strD0SubResultDir)

    strHeader = 'Barcode_indel_seq\tD0_total\tD0_indel_prop\tExp_total\tExp_indel_prop\tD0_sub_indel_prop\n'

    for strSample, dictBarcode in dictExpIndel.items():
        strOutPath = os.path.join(
            strD0SubResultDir,
            '{sample}_D0SubResult.txt').format(sample=strSample)

        with open(strOutPath, 'w') as Output:
            Output.write(strHeader)

            for strBarcode, dictCountTotalAndIndel in dictBarcode.items():
                intExpTotal = dictCountTotalAndIndel['Total']

                for strIndelSeq, dictCount in dictCountTotalAndIndel.items():
                    ## 'Total' is the read count of the barcode, not an indel.
                    if strIndelSeq == 'Total':
                        continue

                    try:
                        dictD0Barcode = dictD0IndelMerge[strBarcode]
                        intD0Total = dictD0Barcode['Total']
                        intD0Count = dictD0Barcode[strIndelSeq]['IndelCount']
                        floD0Prop = round(intD0Count / float(intD0Total), 6)

                        floExpProp = round(
                            dictCount['IndelCount'] / float(intExpTotal), 6)

                        floDiff = floExpProp - floD0Prop
                        floSubExpIndel = 0 if floDiff < 0 else floDiff

                        listField = [
                            strIndelSeq, intD0Total, floD0Prop,
                            intExpTotal, floExpProp, floSubExpIndel
                        ]
                    except KeyError:
                        ## No D0 background for this barcode/indel.
                        floExpProp = round(
                            dictCount['IndelCount'] / float(intExpTotal), 6)
                        listField = [
                            strIndelSeq, 'None', 'None', intExpTotal,
                            floExpProp, floExpProp
                        ]

                    Output.write('\t'.join(map(str, listField)) + '\n')
    def MakeReference(self):
        """
        Write the tab-separated reference table used by the base-edit run.

        Both input files hold 'NAME:SEQUENCE' rows. Every reference row whose
        NAME also appears in the barcode file is written to ``self.strRefFile``
        as 'NAME<TAB>BARCODE<TAB>REFERENCE'; rows without a matching barcode
        are reported through ``logging.error`` and skipped.

        Raises:
            AssertionError: the two files have a different number of rows.
            IndexError: a row has no ':' separator.
        """
        with open(self.strBarcodeFile) as Barcode, \
                open(self.strReferenceSeqFile) as Ref, \
                open(self.strRefFile, 'w') as Output:

            listBarcode = Helper.RemoveNullAndBadKeyword(Barcode)
            listRef = Helper.RemoveNullAndBadKeyword(Ref)

            ## defensive
            assert len(listBarcode) == len(
                listRef), 'Barcode and Reference must be a same row number.'

            dictBarcode = {}

            for strBarcodeRow in listBarcode:
                strBarcodeRow = strBarcodeRow.replace('\n', '').replace(
                    '\r', '').upper()
                Helper.CheckIntegrity(self.strBarcodeFile,
                                      strBarcodeRow)  ## defensive
                ## Separate names so the list being iterated is not rebound.
                listBarcodeCol = strBarcodeRow.split(':')
                dictBarcode[listBarcodeCol[0]] = listBarcodeCol[1]

            for strRefRow in listRef:
                strRefRow = strRefRow.replace('\n', '').replace('\r',
                                                                '').upper()
                ## NOTE(review): the barcode file name is passed for reference
                ## rows too; confirm whether strReferenceSeqFile was intended.
                Helper.CheckIntegrity(self.strBarcodeFile,
                                      strRefRow)  ## defensive
                listRefCol = strRefRow.split(':')
                strRefSample = listRefCol[0]
                strRefSeq = listRefCol[1]

                ## Only the dictionary lookup can signal "no matching".
                try:
                    strBarcodeSeq = dictBarcode[strRefSample]
                except KeyError:
                    logging.error('no matching')
                    ## Values go in as logging arguments, not as the format string.
                    logging.error('%s %s', strRefSample, strRefSeq)
                    continue

                Output.write('%s\t%s\t%s\n' %
                             (strRefSample, strBarcodeSeq, strRefSeq))
    def _RemoveTmpBeforStart(self):
        """
        Delete this sample's output folder from a previous run, if it exists.

        The folder is './Output/<user>/<project>/<sample>'. The equivalent
        'rm -r <folder>' command string is still handed to
        ``Helper.PreventFromRmMistake`` before anything is removed.
        """
        strFolderPath = './Output/{user}/{project}/{sample}'.format(
            user=self.strUser, project=self.strProject, sample=self.strSample)

        if not os.path.isdir(strFolderPath):
            return

        strCmd = 'rm -r %s' % strFolderPath

        Helper.PreventFromRmMistake(strCmd)  ## defensive

        logging.info(
            'Delete the %s folder before starting if these were existed.' %
            self.strSample)

        ## Pass an argument list instead of a shell string: the path is built
        ## from user-supplied names, so blanks or shell metacharacters must not
        ## be interpreted. The former second .format() pass on the finished
        ## command is dropped; it failed on names containing braces.
        sp.call(['rm', '-r', strFolderPath])
        def RunPipeline(**kwargs):
            """
            Run the base-edit steps for every sample row in listSamples.

            Rows starting with '#' and rows that Helper.SplitSampleInfo rejects
            are skipped. Once all samples are done, the results are copied to
            the shared result folder through the last runner instance.
            """
            ## Stays None when every row is skipped, so the final copy step can
            ## be guarded instead of failing on an unbound name.
            InstBaseEdit = None

            for strSampleRow in listSamples:
                ## startswith() also copes with an empty row.
                if strSampleRow.startswith('#'): continue

                tupSampleInfo = Helper.SplitSampleInfo(strSampleRow)
                if not tupSampleInfo: continue
                strSample, strRef, strExpCtrl = tupSampleInfo

                InstBaseEdit = clsBaseEditRunner(strSample, strRef, options,
                                                 InstInitFolder)
                InstBaseEdit.MakeReference()

                listCmd = InstBaseEdit.MakeIndelSearcherCmd()
                RunMulticore(listCmd, options.multicore)  ## from CoreSystem.py

                InstBaseEdit.MakeMergeTarget()

            if InstBaseEdit is None:
                logging.warning('No sample was processed; nothing to copy.')
                return

            InstBaseEdit.CopyToAllResultFolder()
def Main():
    """
    Count sequence frequencies for every sample of the project in parallel.

    Reads './User/<user>/<project>.txt', builds one parameter list per sample
    (sample name, alignment folder, total read count folder, reference dict)
    and maps ``Count_seq_freq`` over them with a pool of ``iCore`` workers.
    ``strUser``, ``strProject`` and ``iCore`` are not defined in this function;
    presumably they are module-level settings.
    """
    logging.info('Program Start')

    logging.info('Make commands for a multiple processing')
    lPara = []
    with open('./User/{user}/{project}.txt'.format(
            user=strUser, project=strProject)) as Project_list:

        for strSampleRow in Project_list:
            if strSampleRow[0] == '#': continue

            tupSampleInfo = Helper.SplitSampleInfo(strSampleRow)
            if not tupSampleInfo: continue
            strSample, strRef, strExpCtrl = tupSampleInfo

            strSample = strSample.replace('\n', '').replace('\r', '')
            sFile_path = './Output/{user}/{project}/{sample}/Tmp/Alignment'.format(
                user=strUser, project=strProject, sample=strSample)
            sTotal_readcnt_path = './Output/{user}/{project}/{sample}/Tmp/All'.format(
                user=strUser, project=strProject, sample=strSample)
            dInput_fa = Make_ref_dict(strRef)

            lPara.append(
                [strSample, sFile_path, sTotal_readcnt_path, dInput_fa])

    logging.info('Multiple processing Start')
    p = mp.Pool(iCore)
    try:
        ## .get() blocks until every task finished and re-raises worker errors.
        p.map_async(Count_seq_freq, lPara).get()
    finally:
        ## Always release the worker processes, also when a task failed.
        p.terminate()
        p.join()
    logging.info('Multiple processing End')

    #logging.info('Count group Start')
    #Count_group()
    #logging.info('Count group End')

    #logging.info('Trim data Start')
    #Trim_data()
    #logging.info('Trim data End')

    logging.info('Program End')
        def RunPipeline(**kwargs):
            """
            Process every sample of listSamples with the indel searcher.

            The Exp/Ctrl label of each accepted row is collected; afterwards
            normalization runs only for projects that contain both EXP and
            CTRL samples. Any other non-empty label set is an error.
            """
            setGroup = set()

            for strSampleRow in listSamples:
                tupSampleInfo = Helper.SplitSampleInfo(strSampleRow)
                if not tupSampleInfo:
                    continue

                strSample, strRef, strExpCtrl = tupSampleInfo
                setGroup.add(strExpCtrl)

                InstRunner = clsIndelSearcherRunner(
                    strSample, strRef, options, InstInitFolder)

                logging.info('SplitFile')
                InstRunner.SplitFile()

                logging.info('MakeReference')
                InstRunner.MakeReference()

                logging.info('MakeIndelSearcherCmd')
                listCmd = InstRunner.MakeIndelSearcherCmd()

                logging.info('RunMulticore')
                RunMulticore(listCmd, options.multicore)  ## from CoreSystem.py

                logging.info('MakeOutput')
                InstRunner.MakeOutput()

                logging.info('RunIndelFreqCalculator')
                InstRunner.RunIndelFreqCalculator()

            if setGroup == {'EXP', 'CTRL'}:
                InstRunner.IndelNormalization()
                return

            ## Projects without a usable group column need no normalization.
            if setGroup in (set(), {''}, {' '}):
                return

            logging.error('The group category is not appropriate. : %s' %
                          setGroup)
            logging.error('Please make sure your project file is correct.')
            logging.error('The group category must be Exp or Ctrl')
            raise Exception
Exemple #10
0
def CountGroup(InstParameters):
    """
    Summarize random-barcode read counts for each non-control sample group.

    ``InstParameters.strSampleList`` is a tab-separated file with the columns
    sample, reference, group. For every row whose group is not CTRL, the file
    './Output/<user>/<project>/<sample>_all_random_barcode.txt' is read:

    Sorting_barcode Unique_RandomBarcodeNumber_In_SortingBarcode    RandomBarcode   Each_RandomBarcode_read_count
    TATATCATAGCGTACTCATC    8       TGCGTTTG        3
    TATATCATAGCGTACTCATC    8       CGCGTTTG        3
    TATATCATAGCGTACTCATC    8       TAGTTTTG        1
    TATATCATAGCGTACTCATC    8       ATAGTTTG        1

    Read counts of identical (sorting barcode, random barcode) pairs are
    summed, the number of distinct random barcodes per sorting barcode is
    recomputed, and two summary files are written to
    './Output/Group_result/<GROUP>'.

    Group names are compared case-insensitively. Sample-list rows with fewer
    than three columns (e.g. blank lines) are ignored.
    """

    sHeader = ''

    with open(InstParameters.strSampleList) as Sample:  ## tmp input
        listSampleCol = []
        for strRow in Sample:
            listCol = strRow.replace('\n', '').replace('\r', '').split('\t')
            if len(listCol) < 3:
                continue  ## blank or incomplete row
            listSampleCol.append(listCol)

    setGroup = set(listCol[2].upper() for listCol in listSampleCol)

    for strGroup in setGroup:
        if strGroup == 'CTRL': continue

        for listCol in listSampleCol:
            strSample = listCol[0]
            ## The row has to be parsed before its group can be compared, and
            ## it must be upper-cased like the entries of setGroup.
            strGroupOfSample = listCol[2].upper()
            if strGroup != strGroupOfSample:
                continue

            strProjectDir = './Output/{user}/{project}'.format(
                user=InstParameters.strUser,
                project=InstParameters.strProject)
            strGroupDir = os.path.join(strProjectDir, 'Group_result')
            Helper.MakeFolderIfNot(strGroupDir)

            ## Key: 'Doench2014_1000_CTCTGGGGT' (sorting barcode + '_' + random barcode)
            ## Value: the row columns, with column 3 holding the summed read count.
            ## NOTE(review): this dict is rebuilt for every sample, so a later
            ## sample of the same group overwrites the summary files of an
            ## earlier one - confirm whether a per-group sum was intended.
            dTotal_RandomBarcode_cnt_in_SortingBarcode = OrderedDict()

            with open('{project_dir}/{sample}_all_random_barcode.txt'.format(
                    project_dir=strProjectDir,
                    sample=strSample)) as RandomBarcode_SeqFreq:
                sHeader = RandomBarcode_SeqFreq.readline()

                for sRow in RandomBarcode_SeqFreq:
                    lCol = sRow.replace('\n', '').split('\t')

                    sSorting_and_Random_barcode_seq = lCol[0] + '_' + lCol[2]
                    iRandomBarcode_count = int(lCol[3])
                    lCol[3] = iRandomBarcode_count

                    if sSorting_and_Random_barcode_seq in dTotal_RandomBarcode_cnt_in_SortingBarcode:
                        dTotal_RandomBarcode_cnt_in_SortingBarcode[
                            sSorting_and_Random_barcode_seq][
                                3] += iRandomBarcode_count
                    else:
                        ## initial assignment
                        dTotal_RandomBarcode_cnt_in_SortingBarcode[
                            sSorting_and_Random_barcode_seq] = lCol

            ## Regroup the merged rows by sorting barcode. Column 0 of the row
            ## is used directly; splitting the combined key on '_' would
            ## truncate sorting barcodes that contain '_' themselves.
            dRecal_total_kind_of_RandomBarcode = OrderedDict()
            for lValue in dTotal_RandomBarcode_cnt_in_SortingBarcode.values():
                dRecal_total_kind_of_RandomBarcode.setdefault(
                    lValue[0], []).append(lValue)

            for llValue in dRecal_total_kind_of_RandomBarcode.values():
                ## llValue : [[TATATCATAGCGTACTCATC, 8, TGCGTTTG, 3], [], [] ...
                iKind_of_RandomBarcode = len(llValue)
                for lValue in llValue:
                    lValue[1] = iKind_of_RandomBarcode  ## Recal using group total cnt.

                ## Highest read count first (stable for equal counts).
                llValue.sort(key=lambda x: x[3], reverse=True)

            ## NOTE(review): this folder is not below strProjectDir/strGroupDir
            ## created above - confirm the intended output location.
            strEachGroup = './Output/Group_result/%s' % strGroup
            Helper.MakeFolderIfNot(strEachGroup)

            with open(os.path.join(strEachGroup, 'Summary_all_random_barcode_in_group.txt'), 'w') as Sort_Random_cnt,\
                open(os.path.join(strEachGroup, 'Summary_Unique_RandomBarcodeNumber_in_group.txt'), 'w') as Uniq_random_cnt:

                Sort_Random_cnt.write(sHeader)
                Uniq_random_cnt.write(
                    'Sorting_barcode\tUnique_RandomBarcodeNumber_In_SortingBarcode\n'
                )

                for sSortBarcode, llCol in dRecal_total_kind_of_RandomBarcode.items():
                    Uniq_random_cnt.write(
                        '\t'.join(map(str, [sSortBarcode,
                                            len(llCol)])) + '\n')
                    for lCol in llCol:
                        Sort_Random_cnt.write(
                            '\t'.join(map(str, lCol)) + '\n')
def Main():
    """
    Command line entry point of the BaseEdit frequency analyzer.

    Parses the options, prepares the default/input/output folders through
    ``InitialFolder``, configures logging to the project log file and to
    stdout, then runs the base-edit pipeline for every sample listed in the
    project file.
    """
    print('BaseEdit program start: %s' % datetime.now())

    sCmd = (
        "BaseEdit frequency analyzer\n\n./Run_BaseEdit_freq.py -t 15 -w 16-48 --indel_check_pos 39-40 --target_ref_alt A,T --PAM_seq NGG --PAM_pos 43-45 --Guide_pos 23-42"
        " --gap_open -10 --gap_extend 1\n\n"
        "The sequence position is the one base position (start:1)\n"
        "1: Barcode\n"
        "2: Base target window (end pos = PAM pos +3)\n"
        "3: Indel check pos\n"
        "4: PAM pos\n"
        "5: Guide pos (without PAM)\n\n"
        "TATCTCTATCAGCACACAAGCATGCAATCACCTTGGGTCCAAAGGTCC\n"
        "<------1------><----------------2--------------->\n"
        "                                     <3>  <4>   \n"
        "                      <---------5-------->      \n\n")

    parser = OptionParser(sCmd)

    parser.add_option("-t",
                      "--thread",
                      default="1",
                      type="int",
                      dest="multicore",
                      help="multiprocessing number")
    parser.add_option('--gap_open',
                      default='-10',
                      type='float',
                      dest='gap_open',
                      help='gap open: -100~0')
    parser.add_option('--gap_extend',
                      default='1',
                      type='float',
                      dest='gap_extend',
                      help='gap extend: 1~100')
    parser.add_option("-w",
                      "--target_window",
                      type="str",
                      dest="target_window",
                      help="a window size for target sequence : 20-48")
    parser.add_option(
        "--indel_check_pos",
        type="str",
        dest="indel_check_pos",
        help=
        "indel check position to filter : 39-40; insertion 39, deletion 39 & 40"
    )
    parser.add_option("--target_ref_alt",
                      type="str",
                      dest="target_ref_alt",
                      help="Ref 'A' is changed to Alt 'T': A,T")
    parser.add_option("--PAM_seq",
                      type="str",
                      dest="PAM_seq",
                      help="PAM sequence: NGG, NGC ...")
    parser.add_option(
        "--PAM_pos",
        type="str",
        dest="PAM_pos",
        help="PAM position range in the reference seqeunce : 43-45")
    parser.add_option(
        "--Guide_pos",
        type="str",
        dest="Guide_pos",
        help="Guide position range in the reference seqeunce : 23-42")
    parser.add_option('--python',
                      dest='python',
                      help='The python path including the CRISPResso2')
    parser.add_option('--user',
                      dest='user_name',
                      help='The user name with no space')
    parser.add_option('--project',
                      dest='project_name',
                      help='The project name with no space')

    options, args = parser.parse_args()

    InstInitFolder = InitialFolder(options.user_name, options.project_name,
                                   os.path.basename(__file__))
    InstInitFolder.MakeDefaultFolder()
    InstInitFolder.MakeInputFolder()
    InstInitFolder.MakeOutputFolder()

    ## Log to the project log file and mirror every record on stdout.
    logging.basicConfig(
        format='%(process)d %(levelname)s %(asctime)s : %(message)s',
        level=logging.DEBUG,
        filename=InstInitFolder.strLogPath,
        filemode='a')
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    logging.info('Program start')
    if options.multicore > 15:
        logging.warning('Optimal treads <= 15')
    logging.info(str(options))

    with open(InstInitFolder.strProjectFile) as Sample_list:

        listSamples = Helper.RemoveNullAndBadKeyword(Sample_list)

        strInputProject = './Input/{user}/Query/{project}'.format(
            user=options.user_name, project=options.project_name)

        @CheckProcessedFiles
        def RunPipeline(**kwargs):
            """Run the base-edit steps for every sample, then copy the results."""
            ## Stays None when every row is skipped, so the final copy step can
            ## be guarded instead of failing on an unbound name.
            InstBaseEdit = None

            for strSampleRow in listSamples:
                ## startswith() also copes with an empty row.
                if strSampleRow.startswith('#'): continue

                tupSampleInfo = Helper.SplitSampleInfo(strSampleRow)
                if not tupSampleInfo: continue
                strSample, strRef, strExpCtrl = tupSampleInfo

                InstBaseEdit = clsBaseEditRunner(strSample, strRef, options,
                                                 InstInitFolder)
                InstBaseEdit.MakeReference()

                listCmd = InstBaseEdit.MakeIndelSearcherCmd()
                RunMulticore(listCmd, options.multicore)  ## from CoreSystem.py

                InstBaseEdit.MakeMergeTarget()

            if InstBaseEdit is None:
                logging.warning('No sample was processed; nothing to copy.')
                return

            InstBaseEdit.CopyToAllResultFolder()

        RunPipeline(InstInitFolder=InstInitFolder,
                    strInputProject=strInputProject,
                    listSamples=listSamples,
                    logging=logging)

    print('BaseEdit program end: %s' % datetime.now())
def Main():
    """
    Command line entry point of the indel searcher.

    Parses the options, prepares the default/input/output folders through
    ``InitialFolder``, configures logging to the project log file and to
    stdout, then runs the indel pipeline for every sample of the project file.

    The project file (InstInitFolder.strProjectFile, e.g.
    ./User/Nahye/2019_Nahye_Cas9D7_samples.txt) holds rows such as
        190819_Nahye_12K_D4_D0_1-Cas9D7 Cas9D7  Ctrl
        190819_Nahye_12K_D4_eCas9_Rep1-Cas9D7   Cas9D7  Exp
    """
    parser = OptionParser('Indel search program for CRISPR CAS9 & CPF1\n<All default option> python2.7 Run_indel_searcher.py --pam_type Cas9 --pam_pos Forward')

    ## (flags, keyword arguments) for every option, in registration order.
    listOptionSpec = [
        (('-t', '--thread'),
         dict(default='1', type='int', dest='multicore',
              help='multiprocessing number, recommendation:t<16')),
        (('-c', '--chunk_number'),
         dict(default='400000', type='int', dest='chunk_number',
              help='split FASTQ, must be multiples of 4. file size < 1G recommendation:40000, size > 1G recommendation:400000')),
        (('-q', '--base_quality'),
         dict(default='20', dest='base_quality',
              help='NGS read base quality')),
        (('--gap_open',),
         dict(default='-10', type='float', dest='gap_open',
              help='gap open: -100~0')),
        (('--gap_extend',),
         dict(default='1', type='float', dest='gap_extend',
              help='gap extend: 1~100')),
        (('-i', '--insertion_window'),
         dict(default='4', type='int', dest='insertion_window',
              help='a window size for insertions')),
        (('-d', '--deletion_window'),
         dict(default='4', type='int', dest='deletion_window',
              help='a window size for deletions')),
        (('--pam_type',),
         dict(dest='pam_type', help='PAM type: Cas9 Cpf1')),
        (('--pam_pos',),
         dict(dest='pam_pos', help='PAM position: Forward Reverse')),
        (('--python',),
         dict(dest='python',
              help='The python path including the CRISPResso2')),
        (('--user',),
         dict(dest='user_name', help='The user name with no space')),
        (('--project',),
         dict(dest='project_name', help='The project name with no space')),
        (('--pickle',),
         dict(dest='pickle', default='False',
              help='Dont remove the pickles in the tmp folder : True, False')),
        (('--split',),
         dict(dest='split', default='False',
              help='Dont remove the split files in the input folder : True, False')),
        (('--classfied_FASTQ',),
         dict(dest='class_fastq', default='True',
              help='Dont remove the ClassfiedFASTQ in the tmp folder : True, False')),
        (('--ednafull',),
         dict(dest='ednafull', help='The nucleotide alignment matrix')),
    ]
    for tupFlags, dictSpec in listOptionSpec:
        parser.add_option(*tupFlags, **dictSpec)

    options, args = parser.parse_args()

    InstInitFolder = InitialFolder(
        options.user_name, options.project_name, os.path.basename(__file__))
    InstInitFolder.MakeDefaultFolder()
    InstInitFolder.MakeInputFolder()
    InstInitFolder.MakeOutputFolder()

    ## Log to the project log file and mirror every record on stdout.
    logging.basicConfig(
        format='%(process)d %(levelname)s %(asctime)s : %(message)s',
        level=logging.DEBUG,
        filename=InstInitFolder.strLogPath,
        filemode='a')
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    logging.info('Program start')
    if options.multicore > 15:
        logging.warning('Optimal treads <= 15')
    logging.info(str(options))

    with open(InstInitFolder.strProjectFile) as Sample_list:

        listSamples = Helper.RemoveNullAndBadKeyword(Sample_list)
        intProjectNumInTxt = len(listSamples)

        strInputProject = './Input/{user}/FASTQ/{project}'.format(
            user=options.user_name, project=options.project_name)

        @CheckProcessedFiles
        def RunPipeline(**kwargs):

            setGroup = set()

            for strSampleRow in listSamples:
                ## e.g. (190819_Nahye_12K_D4_D0_1-Cas9D7, Cas9D7, Ctrl)
                tupSampleInfo = Helper.SplitSampleInfo(strSampleRow)
                if not tupSampleInfo:
                    continue

                ## sample name, reference name, Ctrl/Exp/""
                strSample, strRef, strExpCtrl = tupSampleInfo
                setGroup.add(strExpCtrl)

                ## options ... has PAM type: Cas9 Cpf1, PAM position: Forward Reverse ...
                InstRunner = clsIndelSearcherRunner(
                    strSample, strRef, options, InstInitFolder)

                logging.info('SplitFile')
                InstRunner.SplitFile()

                logging.info('MakeReference')
                InstRunner.MakeReference()

                ## Indel_searcher_crispresso_hash.py
                logging.info('MakeIndelSearcherCmd')
                listCmd = InstRunner.MakeIndelSearcherCmd()

                logging.info('RunMulticore')
                RunMulticore(listCmd, options.multicore)  ## from CoreSystem.py

                logging.info('MakeOutput')
                InstRunner.MakeOutput()

                logging.info('RunIndelFreqCalculator')
                InstRunner.RunIndelFreqCalculator()

            if setGroup == {'EXP', 'CTRL'}:
                InstRunner.IndelNormalization()
                return

            ## No group column at all (or only blanks): nothing to normalize.
            if setGroup in (set(), {''}, {' '}):
                return

            logging.error('The group category is not appropriate. : %s' % setGroup)
            logging.error('Please make sure your project file is correct.')
            logging.error('The group category must be Exp or Ctrl')
            raise Exception

        RunPipeline(InstInitFolder=InstInitFolder,
                    strInputProject=strInputProject,
                    intProjectNumInTxt=intProjectNumInTxt,
                    listSamples=listSamples,
                    logging=logging)

    logging.info('Program end')
    def __init__(self, strSample, strRef, options, InstInitFolder):
        """
        Prepare one sample of an indel searcher run.

        Runs the UserFolderAdmin initializer, creates the sample folder, copies
        the analysis options onto the instance, resolves the reference input
        files (accepting a lower-case first letter in their names), locates
        the sample's FASTQ file and creates the 'Split_files' folder.
        """
        UserFolderAdmin.__init__(self, strSample, strRef, options, InstInitFolder.strLogPath)
        self.MakeSampleFolder()

        self.strProjectFile    = InstInitFolder.strProjectFile
        self.intChunkSize      = options.chunk_number
        self.strQualCutoff     = options.base_quality
        self.intInsertionWin   = options.insertion_window  # Insertion window 0,1,2,3,4
        self.intDeletionWin    = options.deletion_window  # Deletion window 0,1,2,3,4
        self.strPamType        = options.pam_type  # CRISPR type : Cpf1(2 cleavages), Cas9(1 cleavage)
        self.strPamPos         = options.pam_pos  # Barcode target position : Forward (barcode + target), Reverse (target + barcode)
        self.strPickle         = options.pickle
        self.strClassFASTQ     = options.class_fastq
        self.strSplit          = options.split
        self.strLogPath        = InstInitFolder.strLogPath

        self.strBarcodeFile      = os.path.join(self.strRefDir, 'Barcode.txt')
        self.strReferenceSeqFile = os.path.join(self.strRefDir, 'Reference_sequence.txt')
        self.strTargetSeqFile    = os.path.join(self.strRefDir, 'Target_region.txt')
        self.strRefFile          = os.path.join(self.strRefDir, 'Reference.fa')

        ## The file name required for the user is 'B'arcode.txt but it may be written as 'b'arcode.txt by mistake.
        ## This part is to fix the situation as mentioned above.
        ## The fallback path is built with os.path.join, like the primary path,
        ## so it is also correct when strRefDir has no trailing separator.
        for strAttr, strLowerName, strErrorMsg in (
                ('strBarcodeFile', 'barcode.txt',
                 'Barcode path is not correct, please make sure the path correctly.'),
                ('strReferenceSeqFile', 'reference_sequence.txt',
                 'Reference path is not correct, please make sure the path correctly.'),
                ('strTargetSeqFile', 'target_region.txt',
                 'Target path is not correct, please make sure the path correctly.')):
            if os.path.isfile(getattr(self, strAttr)):
                continue

            strLowerPath = os.path.join(self.strRefDir, strLowerName)
            if os.path.isfile(strLowerPath):
                setattr(self, strAttr, strLowerPath)
            else:
                logging.error(strErrorMsg)

        self.strFastqDir = './Input/{user}/FASTQ/{project}'.format(user=self.strUser,
                                                                     project=self.strProject)
        ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1'
        self.strSampleDir  = os.path.join(self.strFastqDir, self.strSample)

        ## Name (without extension) of the '.fastq' file in the sample folder.
        ## When several exist, the last one returned by os.listdir() wins.
        self.strFastq_name = ''
        for strFile in os.listdir(self.strSampleDir):
            listNamePart = strFile.split('.')
            if os.path.isfile(os.path.join(self.strSampleDir, strFile)) and listNamePart[-1] == 'fastq':
                self.strFastq_name = '.'.join(listNamePart[:-1])
        logging.info('File name : %s' % self.strFastq_name)

        ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1/Fastq_file.fastq'
        self.strInputFile = os.path.join(self.strSampleDir, self.strFastq_name+'.fastq')
        ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1/Fastq_file.txt'
        self.strInputList = os.path.join(self.strSampleDir, self.strFastq_name+'.txt')

        ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1/Split_files'
        self.strSplitPath = os.path.join(self.strSampleDir, 'Split_files')
        Helper.MakeFolderIfNot(self.strSplitPath)

        self.strPair = 'False'  # FASTQ pair: True, False
Exemple #14
0
def Convert_Indelsearcher_output(strSampleRefGroup):
    """Convert Indel searcher files of one sample into Base_edit_2 input files.

    Reads the Indel searcher ``Barcode.txt`` / ``Reference_sequence.txt`` pair
    of the given reference and the sample's ``*_Classified_Indel_barcode.fastq``
    result, then writes under ``../Base_edit_2/Input``:

    * ``Reference/.../Barcode.txt``   -- one ``barcode:barcode`` row per barcode
    * ``Reference/.../Reference.txt`` -- one ``barcode:reference`` row per barcode
    * ``Query/.../<barcode>.txt``     -- the sequence lines of every FASTQ
      record whose header names that barcode (an empty file if there are none)

    The module-level names ``strUser`` and ``strProject`` are used to build
    every path.

    Args:
        strSampleRefGroup: one tab-separated row; the first field is the
            sample name and the second field is the reference name.

    Raises:
        IndexError: if the row has fewer than two fields, or a FASTQ header
            line does not contain ``Barcode_``.
        KeyError: if a FASTQ header names a barcode that is not listed in the
            Indel searcher ``Barcode.txt``.
    """
    listSampleRefGroup = strSampleRefGroup.replace('\n', '').replace('\r', '').split('\t')

    strSample = listSampleRefGroup[0]
    strRef = listSampleRefGroup[1]

    print('Processing: %s, %s' % (strSample, strRef))

    strIndelSearcherRefFolder = './Input/{user}/Reference/{project}/{ref}'.format(
        user=strUser, project=strProject, ref=strRef)
    strBaseEditRefFolder = '../Base_edit_2/Input/{user}/Reference/{project}/{ref}'.format(
        user=strUser, project=strProject, ref=strRef)
    strBaseEditQueryFolder = '../Base_edit_2/Input/{user}/Query/{project}/{sample}'.format(
        user=strUser, project=strProject, sample=strSample)

    ## Best effort: a failure here is only reported, the open() calls below
    ## will raise if the folders are really unusable.
    try:
        Helper.MakeFolderIfNot(strBaseEditRefFolder)
        Helper.MakeFolderIfNot(strBaseEditQueryFolder)
    except OSError as e:
        print(e)

    dictBarcodeSeq = {}

    ## BaseEdit refer format : filename, barcode, reference
    ## All four handles are managed by 'with' so that every file, including
    ## the BaseEdit Barcode.txt, is flushed and closed even on error.
    with open(strIndelSearcherRefFolder + '/Reference_sequence.txt') as ReferenceFile_in_IndelSearcher, \
            open(strIndelSearcherRefFolder + '/Barcode.txt') as BarcodeFile_in_IndelSearcher, \
            open(strBaseEditRefFolder + '/Barcode.txt', 'w') as BarcodeFile_for_BaseEdit, \
            open(strBaseEditRefFolder + '/Reference.txt', 'w') as Reference_for_BaseEdit:  ## conversion target to barcode:refseq

        for strBarcodeIndelSearcher, strReferenceIndelSearcher in zip(
                BarcodeFile_in_IndelSearcher, ReferenceFile_in_IndelSearcher):

            ## strip() also removes the trailing line break.
            strBarcodeIndelSearcher = strBarcodeIndelSearcher.strip()
            strReferenceIndelSearcher = strReferenceIndelSearcher.strip()

            dictBarcodeSeq[strBarcodeIndelSearcher] = []
            ## first is filename, second is barcode. BaseEdit barcode format
            BarcodeFile_for_BaseEdit.write(
                strBarcodeIndelSearcher + ':' + strBarcodeIndelSearcher + '\n')
            Reference_for_BaseEdit.write(
                strBarcodeIndelSearcher + ':' + strReferenceIndelSearcher + '\n')

    strTotalResultPath = './Output/{user}/{project}/{sample}/Tmp/{sample}_Classified_Indel_barcode.fastq'.format(
        user=strUser, project=strProject, sample=strSample)

    with open(strTotalResultPath) as Total_result_file:
        for i, strRow in enumerate(Total_result_file):  ## for query reads
            intRecordLine = i % 4

            if intRecordLine == 0:  ## Classified_Indel_barcode has all total sequence.
                ## header line: take the text between 'Barcode_' and the next ':'
                strBarcode = strRow.split('Barcode_')[1].split(':')[0]

            elif intRecordLine == 1:
                ## the line right after the header is the read sequence;
                ## the remaining two lines of the record are ignored.
                dictBarcodeSeq[strBarcode].append(strRow)

    for strBarcode, listSeq in dictBarcodeSeq.items():
        with open(strBaseEditQueryFolder + '/{barcode}.txt'.format(barcode=strBarcode),
                  'w') as Output:
            ## rows still carry their own line breaks, so join without a separator
            Output.write(''.join(listSeq))