Beispiel #1
0
def _first_unfinished_output(d_chromosome_lengths, bps_per_interval):
    """Return (path, reason) for the first unfinished index file, or None.

    The chromosomes are scanned in the order X, Y, 1..22 and, within each
    chromosome, the interval numbers from 1 upwards.  `reason` is
    'missing' if the path is not a regular file and 'empty' if its size
    is zero bytes.
    """

    for chromosome in ['X', 'Y'] + [str(i) for i in range(1, 22 + 1)]:
        ## NOTE(review): under Python 2 the division below floors if both
        ## operands are ints, which would make math.ceil a no-op --
        ## confirm that bps_per_interval is a float.
        intervals = int(
            math.ceil(d_chromosome_lengths[chromosome] / bps_per_interval))
        for array in range(1, intervals + 1):
            ## NOTE(review): the previous version looped over the suffixes
            ## '' and '.idx' without using the loop variable, so only this
            ## .idx path was ever checked (twice).  The duplicate check is
            ## gone; confirm whether the VCF itself should be checked too.
            fn = 'out_GATK/UnifiedGenotyper.%s.vcf.%s.idx' % (
                chromosome,
                array,
            )
            if not os.path.isfile(fn):
                return fn, 'missing'
            if os.path.getsize(fn) == 0:
                return fn, 'empty'

    return None


def main():
    """Wait until all UnifiedGenotyper index files are present and non-empty.

    The expected files are re-checked every 15 minutes.  The first file
    found missing or empty is reported on stdout before each sleep.  After
    72 hours of sleeping the wait is abandoned and the function carries on
    regardless, so the final "PROCEED" banner is printed in either case.
    """

    instance_main = GATK_pipeline.main()
    d_chromosome_lengths = instance_main.parse_chromosome_ranges()

    hours = 0

    while hours < 72:
        unfinished = _first_unfinished_output(
            d_chromosome_lengths, instance_main.bps_per_interval)
        if unfinished is None:
            break
        ## same output as the statement form: "<path> missing" / "<path> empty"
        print('%s %s' % unfinished)
        print('I have slept for %.2f hours, and now I will sleep some more.' % (
            hours))
        time.sleep(15 * 60)
        hours += .25

    print('--------------------PROCEED TO NEXT STEP--------------------')

    return
Beispiel #2
0
def main():
    """Poll for the UnifiedGenotyper index files until all are complete.

    Every 15 minutes the expected 'out_GATK/UnifiedGenotyper.*.idx' files
    are checked in order (chromosomes X, Y, 1..22; interval 1 upwards).
    The scan stops at the first file that is missing or has size zero,
    prints it and sleeps.  Polling gives up after 72 hours of sleeping;
    the "PROCEED" banner is printed whether or not the files are complete.
    """

    instance_main = GATK_pipeline.main()
    d_chromosome_lengths = instance_main.parse_chromosome_ranges()

    def expected_index_files():
        """Yield the path of every index file that should exist."""
        for chromosome in ['X', 'Y'] + [str(i) for i in range(1, 22 + 1)]:
            ## NOTE(review): floor division under Python 2 if both values
            ## are ints, in which case math.ceil has no effect -- confirm
            ## that bps_per_interval is a float.
            intervals = int(
                math.ceil(
                    d_chromosome_lengths[chromosome]
                    / instance_main.bps_per_interval
                    )
                )
            for array in range(1, intervals + 1):
                ## NOTE(review): an unused loop over the suffixes '' and
                ## '.idx' used to wrap this, checking the same path twice;
                ## only the .idx file is checked -- confirm that the VCF
                ## itself does not need checking.
                yield 'out_GATK/UnifiedGenotyper.%s.vcf.%s.idx' % (
                    chromosome, array,
                    )

    hours = 0

    while hours < 72:
        for fn in expected_index_files():
            ## file missing
            if not os.path.isfile(fn):
                print('%s missing' % fn)
                break
            ## file empty
            if os.path.getsize(fn) == 0:
                print('%s empty' % fn)
                break
        else:
            ## the scan finished without finding a problem; stop waiting
            break

        print('I have slept for %.2f hours, and now I will sleep some more.' % (hours))
        time.sleep(15 * 60)
        hours += .25

    print('--------------------PROCEED TO NEXT STEP--------------------')

    return
Beispiel #3
0
def main():
    """Run transpose, IMPUTE2_tped and BEAGLE_tped on one hard-coded data set.

    The chromosome lengths come from a GATK_pipeline.main() instance and
    the centromere ranges from parse_centromere_ranges(); both are handed,
    together with the data set name and the 'sep' mode string, to the two
    *_tped helpers.  The helpers are called in the order transpose,
    IMPUTE2_tped, BEAGLE_tped.
    """

    pipeline = GATK_pipeline.main()
    chromosome_lengths = pipeline.parse_chromosome_ranges()
    centromere_ranges = parse_centromere_ranges()

    ## presumably the prefix of the PED-format input -- verify against transpose()
    dataset = (
        'omni2.5-8_20120516_gwa_ugand_gtu_autosomes_postsampqc_postsnpqc_flipped'
    )
    mode = 'sep'

    transpose(dataset)

    IMPUTE2_tped(dataset, chromosome_lengths, mode, centromere_ranges)

    BEAGLE_tped(dataset, chromosome_lengths, mode, centromere_ranges)
Beispiel #4
0
    def main(self,):
        """Summarise CPU time and peak memory from the farm logs in ./stdout.

        Every regular file in the 'stdout' directory is parsed for its
        'CPU time' and 'Max Memory' lines.  The values are grouped by
        pipeline step (file name up to the first '.') and chromosome (the
        text between the first and second '.').  Per-step totals/averages
        are printed, and for each step with more than three chromosome
        keys a scatter plot of usage against chromosome length is drawn
        with gnuplot.scatter_plot_2d.

        Log files with less than 60 seconds of CPU time are left out of
        the statistics.

        NOTE(review): the bare name `stop` is evaluated in three places.
        Unless it is defined elsewhere, this raises NameError and aborts
        the run at that point -- presumably a deliberate debugging halt.
        """

        instance_GATK = GATK_pipeline.main()
        d_chromosome_lengths = instance_GATK.parse_chromosome_ranges()

        l_fn = os.listdir('stdout')

        ## disabled code below: renaming of log files in 'stdout'
##        for fn in l_fn:
##            print fn
##            if (
##                fn[:len('UnifiedGenotyper')] == 'UnifiedGenotyper'
##                and
##                fn[len('UnifiedGenotyper')] != '.'
##                ):
##                old = os.path.join('stdout',fn)
##                new = os.path.join('stdout',fn.replace('UnifiedGenotyper','UnifiedGenotyper.'))
##                os.rename(old,new)
##            if '99' in fn:
##                fn_new = '.'.join([fn.split('.')[0],fn.split('.')[1],fn.split('.')[3],fn.split('.')[2],])
##                old = os.path.join('stdout',fn)
##                new = os.path.join('stdout',fn.replace(fn_new,''))
##                print old
##                print new
##                stop
##                os.rename(old,new)
##                continue
##            if (
##                fn[:len('UnifiedGenotyper')] == 'UnifiedGenotyper'
##                and
##                fn[-1] != 't'
##                ):
##                old = os.path.join('stdout',fn)
##                fn_new = '.'.join([fn.split('.')[0],fn.split('.')[1],fn.split('.')[3],fn.split('.')[2],])
##                new = os.path.join('stdout',fn_new)
##                os.rename(old,new)
##        stop

        ## d_resources[resource][step][chromosome] = list of values,
        ## one value per parsed log file
        d_resources = {'CPU':{},'Memory':{},}
        l_fn.sort()
        for fn in l_fn:
            if os.path.isdir(os.path.join('stdout',fn)):
                continue
##            print fn
            ## step = name up to the first dot;
            ## fn.index raises ValueError if the name contains no dot
            index1 = fn.index('.')
            step = fn[:index1]
            ## chromosome = text between the first and the second dot,
            ## or the empty string if there is no second dot
            if '.' in fn[index1+1:]:
                index2 = index1+fn[index1+1:].index('.')+1
                chromosome = fn[index1+1:index2]
            else:
                chromosome = ''

            ## numeric labels 23/24 are mapped to X/Y
            if chromosome == '23': chromosome = 'X'
            if chromosome == '24': chromosome = 'Y'

            fd = open('stdout/%s' %(fn),'r')
            lines = fd.readlines()
            fd.close()

            ## it would be faster to do rindex instead of regex,
            ## but in a few cases rubbish was appended to the farm log files
            keyword1 = re.compile(r'    Max Memory :')
            keyword2 = re.compile(r'    CPU time   :')
            l_mem = []
            l_cpu = []
            for line in lines:
                result1 = keyword1.search(line)
                result2 = keyword2.search(line)
                if result1 or result2:
                    ## the number is the text between the first and second
                    ## colon, with the unit ('sec.' or 'MB') stripped
                    v = float(line.split(':')[1].replace('sec.','').replace('MB',''))
                    if result1: l_mem += [v]
                    else: l_cpu += [v]

            ## use the report with the highest CPU time and the memory value
            ## at the same position in its list;
            ## max() raises ValueError if no 'CPU time' line was found, and
            ## the lookup raises IndexError if l_mem is shorter than l_cpu
            cpu = max(l_cpu)
            mem = l_mem[l_cpu.index(cpu)]
##            if 'ApplyRecalibration' in fn and cpu > 1:
            if 'VariantRecalibrator' in fn and cpu > 1:
                print '%4i %4i %s' %(int(mem), int(cpu), chromosome), fn

            ## ignore if took less than a minute
            if cpu < 60:
                if os.path.getsize(os.path.join('stdout',fn)) < 2200:
                    print os.path.getsize(os.path.join('stdout',fn)), fn
                    ## NOTE(review): `stop` raises NameError unless defined
                    ## elsewhere, so the os.remove below is not reached
                    stop
                    os.remove(os.path.join('stdout',fn))
                    continue
                continue

            for k_resource, v_resource in [
                ['CPU',cpu,],
                ['Memory',mem,],
                ]:
                if not step in d_resources[k_resource].keys():
                    d_resources[k_resource][step] = {}
                if not chromosome in d_resources[k_resource][step].keys():
                    d_resources[k_resource][step][chromosome] = []
                ## a second log for the same step and chromosome is reported,
                ## except for the steps listed here
                elif step not in ['UnifiedGenotyper','IMPUTE2',]:
                    print step, chromosome, k_resource, v_resource
                d_resources[k_resource][step][chromosome] += [v_resource]

        ## print one summary line per resource and step:
        ## CPU = sum over chromosomes of the hours used,
        ## Memory = mean over chromosomes of the per-chromosome mean
        for k_resource in d_resources.keys():
            for step in d_resources[k_resource].keys():
                if 'Downsample' in step: continue
                if 'samtools' in step: continue
                l_y = []
                for chromosome in d_resources[k_resource][step].keys():
                    y = usage = d_resources[k_resource][step][chromosome]
                    if k_resource == 'CPU':
                        ## seconds to hours
                        y = sum(y)/3600.
                    elif k_resource == 'Memory':
                        y = (sum(y)/len(y))
                    else:
                        ## unexpected resource name; halt via NameError
                        print k_resource
                        stop
##                    lines += ['%s %s\n' %(x,y,)]
                    l_y += [y]
                if k_resource == 'CPU':
                    print k_resource, step, sum(l_y), len(l_y)
                else:
                    print k_resource, step, sum(l_y)/len(l_y), len(l_y)

        ## plot usage against chromosome length, one plot per resource and step
        d_labels = {'Memory':'Mb','CPU':'hours'}
        for k_resource in d_resources.keys():
            for step in d_resources[k_resource].keys():
                l_x = []
                l_y = []
                ## skip steps with three or fewer chromosome keys
                if len(d_resources[k_resource][step].keys()) <= 3:
                    continue
                for chromosome in d_resources[k_resource][step].keys():
                    ## skip logs without a chromosome in the file name
                    if chromosome == '':
                        continue
                    if chromosome[0] == '_':
                        continue
                    ## NOTE(review): floor division under Python 2 if the
                    ## lengths are ints, giving whole Mbp -- confirm intended
                    x = chromosome_length = d_chromosome_lengths[chromosome]/(10**6)
                    y = usage = d_resources[k_resource][step][chromosome]
                    if k_resource == 'CPU':
                        y = sum(y)/3600.
                    elif k_resource == 'Memory':
                        y = (sum(y)/len(y))
                    else:
                        ## unexpected resource name; halt via NameError
                        print k_resource
                        stop
##                    lines += ['%s %s\n' %(x,y,)]
                    l_x += [x]
                    l_y += [y]
                ## nothing left to plot after the filtering above
                if len(l_x) == 0:
                    print step
                    continue
##                fd = open('gnuplot_%s_%s.data' %(k_resource,step,),'w')
##                fd.writelines(lines)
##                fd.close()
                prefix = '%s_%s' %(k_resource,step,)
                print 'plotting', k_resource, step
                if k_resource == 'CPU':
                    print k_resource, step, sum(l_y), len(l_y)
                else:
                    print k_resource, step, sum(l_y)/len(l_y), len(l_y)
                gnuplot.scatter_plot_2d(
                    prefix,l1=l_x,l2=l_y,
                    ylabel='%s (%s)' %(k_resource,d_labels[k_resource]),
                    xlabel='chromosome length (Mbp)',
                    title=prefix.replace('_',' '),
                    )

        return