Ejemplo n.º 1
0
def getRepeatForKnownGene(commonOptions, specifiedOptions, moreOptions=None):
    """Run the configured repeat detectors for a named repeat/gene.

    The repeat name is taken from ``moreOptions['repeatName']`` when present,
    otherwise from ``commonOptions['repeatName']``; it is lower-cased either
    way. The location options returned by ``get_gLoc`` are merged into
    ``moreOptions`` before ``myHMM.produce_for_repPat`` is called.

    Args:
        commonOptions: dict; this function reads 'repeatName', 'outlog' and
            'SplitAndReAlign' and passes the dict on to the helpers.
        specifiedOptions: dict; 'SepbamfileTemp' is read and, when it is not
            None, 'bamfile' is overwritten with the template filled in with
            ``moreOptions['chr'][3:]``. Progress messages are suppressed when
            a 'thread' key is present.
        moreOptions: optional dict of per-call options. It is modified in
            place ('repeatName', the ``get_gLoc`` results, 'fafqfile',
            'fafqtype'). When omitted, a fresh dict is created per call.

    Returns:
        ``[myret, myretdetail]``: two dicts filled by ``addSumForAGene`` for
        every detector that returned a non-None result.
    """
    # The default used to be a shared ``{}``. Because this function writes
    # into moreOptions, a defaulted call leaked 'repeatName' (and everything
    # else stored here) into every later defaulted call.
    if moreOptions is None:
        moreOptions = {}

    if "repeatName" in moreOptions:
        repeatName = moreOptions['repeatName'].lower()
    else:
        repeatName = commonOptions['repeatName'].lower()
        moreOptions['repeatName'] = repeatName

    retoptions = get_gLoc(repeatName, commonOptions)
    mgloc = retoptions['mgloc']
    if commonOptions['outlog'] <= M_INFO:
        print('mgloc', mgloc)

    moreOptions.update(retoptions)
    myHMM.produce_for_repPat(commonOptions, moreOptions)

    # Per-chromosome BAM files: fill the template with the chromosome name
    # minus its first three characters.
    if specifiedOptions["SepbamfileTemp"] is not None:
        specifiedOptions["bamfile"] = (specifiedOptions["SepbamfileTemp"] % moreOptions['chr'][3:])

    myret = {}
    myretdetail = {}

    # SplitAndReAlign 0 or 2 (or the module-level testall flag): BAM/HMM detector.
    if (commonOptions['SplitAndReAlign'] in [0, 2]) or testall:
        start_time = time.time()
        if commonOptions['outlog'] <= M_INFO and 'thread' not in specifiedOptions:
            print('p2bamhmm start')
        p2bamhmm = getRepeatForGivenGene(commonOptions, specifiedOptions, moreOptions)
        memres = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
        if p2bamhmm is None:
            print('ERROR None detection', moreOptions['repeatName'], mgloc)
            logging.error('ERROR None detection: ' +
                          str(moreOptions['repeatName']) + ' ' + str(mgloc))
        else:
            addSumForAGene(p2bamhmm, myret, myretdetail, 'p2bamhmm', 2)
        end_time = time.time()
        if commonOptions['outlog'] <= M_WARNING and 'thread' not in specifiedOptions:
            print('p2bamhmm end---running time%.0f mem%d' % (end_time - start_time, memres))
            sys.stdout.flush()

    # SplitAndReAlign 1 or 2 (or testall): re-alignment detector fed with the BAM.
    if (commonOptions['SplitAndReAlign'] in [1, 2]) or testall:
        start_time = time.time()
        if commonOptions['outlog'] <= M_INFO and 'thread' not in specifiedOptions:
            print('p2sp start')
        moreOptions['fafqfile'] = specifiedOptions["bamfile"]
        moreOptions['fafqtype'] = 'bam'
        p2sp = myRepeatReAlignment.getRepeatCounts(commonOptions, specifiedOptions, moreOptions)
        memres = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
        if p2sp is None:
            print('ERROR None detection (sp)', moreOptions['repeatName'], mgloc)
            logging.error('ERROR None detection (sp): ' +
                          str(moreOptions['repeatName']) + ' ' + str(mgloc))
        else:
            addSumForAGene(p2sp, myret, myretdetail, 'p2sp', 2)
        end_time = time.time()
        if commonOptions['outlog'] <= M_WARNING and 'thread' not in specifiedOptions:
            print('p2sp end---running time%.0f mem%d' % (end_time - start_time, memres))
            sys.stdout.flush()

    return [myret, myretdetail]
Ejemplo n.º 2
0
def detectRepCounts(commonOptions, specifiedOptions, moreOptions):
    """Run the configured repeat detectors for the locus in ``moreOptions['mgloc']``.

    The location options returned by ``myBAMhandler.get_Loc1`` are merged into
    ``moreOptions`` before ``myHMM.produce_for_repPat`` is called. Results are
    handed to ``addSumForAGene``; this function itself returns nothing.

    Args:
        commonOptions: dict; 'outlog' and 'SplitAndReAlign' are read here and
            the dict is passed on to the helpers.
        specifiedOptions: dict; 'SepbamfileTemp' is read and, when it is not
            None, 'bamfile' is overwritten with the template filled in with
            ``moreOptions['chr'][3:]``. Progress messages are suppressed when
            a 'thread' key is present.
        moreOptions: dict, modified in place (``get_Loc1`` results,
            'fafqfile', 'fafqtype'); must already hold 'mgloc' and
            'repeatName'.
    """
    # Written to run unchanged on Python 2 and 3: every multi-value print is
    # one pre-formatted string, so the output is the same on either interpreter.
    retoptions = myBAMhandler.get_Loc1(moreOptions['mgloc'], commonOptions)
    if commonOptions['outlog'] <= M_INFO:
        print('mgloc %s' % (moreOptions['mgloc'],))

    moreOptions.update(retoptions)
    myHMM.produce_for_repPat(commonOptions, moreOptions)

    # Per-chromosome BAM files: fill the template with the chromosome name
    # minus its first three characters.
    if specifiedOptions["SepbamfileTemp"] is not None:
        specifiedOptions["bamfile"] = (specifiedOptions["SepbamfileTemp"] % moreOptions['chr'][3:])

    # SplitAndReAlign 0 or 2 (or the module-level testall flag): BAM/HMM detector.
    if (commonOptions['SplitAndReAlign'] in [0, 2]) or testall:
        start_time = time.time()
        if commonOptions['outlog'] <= M_INFO and 'thread' not in specifiedOptions:
            print('p2bamhmm start')
        p2bamhmm = myBAMhandler.getRepeatForGivenGene(commonOptions, specifiedOptions, moreOptions)
        memres = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
        if p2bamhmm is None:
            print('ERROR None detection %s %s' % (moreOptions['repeatName'], moreOptions['mgloc']))
            logging.error('ERROR None detection: ' + str(moreOptions['repeatName']) + ' ' + str(moreOptions['mgloc']))
        else:
            addSumForAGene(p2bamhmm, 'p2bamhmm', 2, commonOptions, specifiedOptions, moreOptions)
        end_time = time.time()
        if commonOptions['outlog'] <= M_WARNING and 'thread' not in specifiedOptions:
            print('p2bamhmm end---running time%.0f mem%d' % (end_time - start_time, memres))
            sys.stdout.flush()

    # SplitAndReAlign 1 or 2 (or testall): re-alignment detector fed with the BAM.
    if (commonOptions['SplitAndReAlign'] in [1, 2]) or testall:
        start_time = time.time()
        if commonOptions['outlog'] <= M_INFO and 'thread' not in specifiedOptions:
            print('p2sp start')
        moreOptions['fafqfile'] = specifiedOptions["bamfile"]
        moreOptions['fafqtype'] = 'bam'
        p2sp = myRepeatReAlignment.getRepeatCounts(commonOptions, specifiedOptions, moreOptions)
        memres = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
        if p2sp is None:
            print('ERROR None detection (sp) %s %s' % (moreOptions['repeatName'], moreOptions['mgloc']))
            logging.error('ERROR None detection (sp): ' + str(moreOptions['repeatName']) + ' ' + str(moreOptions['mgloc']))
        else:
            addSumForAGene(p2sp, 'p2sp', 2, commonOptions, specifiedOptions, moreOptions)
        end_time = time.time()
        if commonOptions['outlog'] <= M_WARNING and 'thread' not in specifiedOptions:
            print('p2sp end---running time%.0f mem%d' % (end_time - start_time, memres))
            sys.stdout.flush()
Ejemplo n.º 3
0
def getSCA3ForGivenGene(commonOptions, specifiedOptions, moreOptions):
    """Align a FASTA file with ``bwa mem`` and run the configured repeat detectors.

    Steps, in order:
      1. Call ``myHMM.produce_for_repPat`` and obtain the repeat-pattern
         length from ``printHMMmatrix.get_len_repPat``.
      2. Fetch the upstream / repeat / downstream strings via ``get3part``;
         exit the process with status 1 if the repeat region is empty.
      3. Align ``specifiedOptions['fastafile']`` against
         ``commonOptions['hgfile']`` with ``bwa mem`` piped through
         ``samtools view`` and ``samtools sort`` into a temporary BAM file,
         then index it.
      4. Run the detector(s) selected by ``commonOptions['SplitAndReAlign']``
         (0/2: ``myBAMhandler.getRepeatForGivenGene``; 1/2:
         ``myRepeatReAlignment.getRepeatCounts``), or both when the
         module-level ``testall`` flag is set.
      5. Remove the temporary BAM file and its ``.bai`` index.

    Args:
        commonOptions: dict; 'hgfile', 'SplitAndReAlign' and 'outlog' are
            read here and the dict is passed on to the helpers.
        specifiedOptions: dict; 'fastafile', 'unique_file_id' and
            'analysis_file_id' are read, 'bamfile' is set to the temporary
            BAM path. Progress messages are suppressed when a 'thread' key is
            present.
        moreOptions: dict; 'mgloc', 'repeatName', 'gene_start_end',
            'repeat_start_end' and 'repPat' are read, 'fafqfile' and
            'fafqtype' are set for the re-alignment detector.

    Returns:
        ``[myret, myretdetail]``: two dicts filled by
        ``myBAMhandler.addSumForAGene`` for every detector that returned a
        non-None result.
    """
    # Written to run unchanged on Python 2 and 3: every multi-value print is
    # one pre-formatted string, so the output is the same on either interpreter.
    mgloc = moreOptions['mgloc']
    repeatName = moreOptions['repeatName']
    gene_start_end = moreOptions['gene_start_end']
    repeat_start_end = moreOptions['repeat_start_end']

    fastafile = specifiedOptions['fastafile']
    unique_file_id = specifiedOptions['unique_file_id']
    analysis_file_id = specifiedOptions['analysis_file_id']

    hgfile = commonOptions['hgfile']

    # The pattern is read *before* produce_for_repPat runs, and that value is
    # the one handed to get_len_repPat, as in the original code.
    repPat = moreOptions['repPat']
    myHMM.produce_for_repPat(commonOptions, moreOptions)
    len_repPat = printHMMmatrix.get_len_repPat(repPat, commonOptions)
    logging.info("len_repPat=" + str(len_repPat))

    upstreamstr, repregion, downstreamstr = get3part(
        mgloc, gene_start_end, repeat_start_end, repeatName, unique_file_id,
        analysis_file_id, hgfile, specifiedOptions)

    if len(repregion) == 0:
        logging.error("Not repeat region! please check!!" + repeatName +
                      (' gene_location=[%d, %d], repeat_location=[%d, %d]' %
                       (gene_start_end[0], gene_start_end[1],
                        repeat_start_end[0], repeat_start_end[1])))
        sys.exit(1)

    logging.info("Test " + repeatName + (
        ' gene_location=[%d, %d], repeat_location=[%d, %d]; upstreamsize=%d, downstreamsize=%d'
        % (gene_start_end[0], gene_start_end[1], repeat_start_end[0],
           repeat_start_end[1], repeat_start_end[0] - gene_start_end[0],
           gene_start_end[1] - repeat_start_end[1])))
    logging.info("Normal/Pathogenic repeats: %s" % mgloc[5])

    # Repeat count of the reference region: region length over pattern length.
    orirepeat = int(len(repregion) / float(len_repPat))

    logging.info("Orignal Test read=" + '<<<' + repregion + '>>>' +
                 (" #repeat=%d; #len=%d" % (orirepeat, len(repregion))))

    # Value for bwa's -w option: start from 90 * 4 clamped to [100, 500], add
    # 40% of the total up+repeat+down length, and cap at 500 again.
    max_w_option, min_w_option = 500, 100
    bwamem_w_option = min(max(90 * 4, min_w_option), max_w_option)
    bwamem_w_option = min(
        bwamem_w_option + int(len(upstreamstr + repregion + downstreamstr) * 0.4),
        max_w_option)

    bamfile = fastafile + unique_file_id + '.bam'
    specifiedOptions['bamfile'] = bamfile

    myret = {}
    myretdetail = {}

    try:
        # NOTE(review): these command lines are built by string concatenation
        # and run through the shell; paths containing spaces or shell
        # metacharacters will break them. `mthreads` is a module-level name.
        cmd = ('bwa mem -k17 -w' + str(bwamem_w_option) +
               ' -W40 -r10 -A1 -B1 -O1 -E1 -L1 -t ' + mthreads + ' -v 2 ' +
               hgfile + ' ' + fastafile +
               ' | samtools view -S -b | samtools sort > ' + bamfile)
        logging.info(cmd)
        status = os.system(cmd)
        if status != 0:
            # Previously ignored; record it so a missing or truncated BAM can be traced.
            logging.error('Command exited with status %s: %s' % (status, cmd))

        cmd = 'samtools index ' + bamfile
        logging.info(cmd)
        status = os.system(cmd)
        if status != 0:
            logging.error('Command exited with status %s: %s' % (status, cmd))

        # SplitAndReAlign 0 or 2 (or testall): BAM/HMM detector.
        if (commonOptions['SplitAndReAlign'] in [0, 2]) or testall:
            start_time = time.time()
            if commonOptions['outlog'] <= M_INFO and 'thread' not in specifiedOptions:
                print('p2bamhmm start')
                sys.stdout.flush()
            p2bamhmm = myBAMhandler.getRepeatForGivenGene(commonOptions,
                                                          specifiedOptions,
                                                          moreOptions)
            memres = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
            if p2bamhmm is None:
                print('ERROR None detection %s %s' % (moreOptions['repeatName'], mgloc))
                logging.error('ERROR None detection: ' +
                              str(moreOptions['repeatName']) + ' ' + str(mgloc))
            else:
                myBAMhandler.addSumForAGene(p2bamhmm, myret, myretdetail,
                                            'p2bamhmm', 2)
            end_time = time.time()
            if commonOptions['outlog'] <= M_WARNING and 'thread' not in specifiedOptions:
                print('p2bamhmm end---running time%.0f mem%d' %
                      (end_time - start_time, memres))
                sys.stdout.flush()

        # SplitAndReAlign 1 or 2 (or testall): re-alignment detector fed with the BAM.
        if (commonOptions['SplitAndReAlign'] in [1, 2]) or testall:
            start_time = time.time()
            if commonOptions['outlog'] <= M_INFO and 'thread' not in specifiedOptions:
                print('start p2sp')
                sys.stdout.flush()

            moreOptions['fafqfile'] = bamfile
            moreOptions['fafqtype'] = 'bam'

            p2sp = myRepeatReAlignment.getRepeatCounts(commonOptions,
                                                       specifiedOptions,
                                                       moreOptions)
            memres = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
            if p2sp is None:
                print('ERROR None detection (sp) %s %s' % (moreOptions['repeatName'], mgloc))
                logging.error('ERROR None detection (sp): ' +
                              str(moreOptions['repeatName']) + ' ' + str(mgloc))
            else:
                myBAMhandler.addSumForAGene(p2sp, myret, myretdetail, 'p2sp', 2)
            end_time = time.time()
            if commonOptions['outlog'] <= M_WARNING and 'thread' not in specifiedOptions:
                print('p2sp end---running time%.0f mem%d' %
                      (end_time - start_time, memres))
                sys.stdout.flush()
    finally:
        # Remove the temporary alignment files even when a detector raises,
        # so failed runs do not leave BAM/BAI files behind.
        os.system('rm ' + bamfile)
        os.system('rm ' + bamfile + '.bai')

    return [myret, myretdetail]