def mapping(runCFG,param_paths,outDir,threads='1'):
    """Map each sample's reads against its reference with bowtie2 in docker.

    Args:
        runCFG: parsed run-configuration dict; reads exec.outdir and exec.logfile.
        param_paths: iterable of per-sample tuples (id, read1, read2, reference);
            an empty read2 entry selects bowtie2's --interleaved mode.
        outDir: host directory that receives the sorted BAM files.
        threads: thread budget, split by cpu_count() into jobs x threads-per-job.

    Returns:
        List of paths to the sorted BAM files, one per entry in param_paths.
    """
    logfile = os.path.join(runCFG['exec']['outdir'],runCFG['exec']['logfile'])
    #split the thread budget into number of parallel jobs and threads per job
    num_jobs,num_threads = cpu_count(threads)
    cmds = []
    read_path = ''
    ref_path = ''
    output_bam_list = []
    for param_path in param_paths:
        sample_id = param_path[0]
        read1 = os.path.basename(param_path[1])
        read2 = os.path.basename(param_path[2])
        #host-side directories that get bind-mounted into the container below
        read_path = os.path.dirname(os.path.abspath(param_path[1]))
        ref_path = runCFG['exec']['outdir'] + '/ref_sequence'
        reference_sequence_name = os.path.basename(param_path[3])
        #check output folder exists
        checkexists(os.path.join(outDir))
        if read2 != '':
            #generate command for paired end
            cmd = f"bash -c \'bowtie2 -x {reference_sequence_name} -1 /reads/{read1} -2 /reads/{read2} -p {num_threads} --local | samtools view -bS | samtools sort -o /output/{sample_id}.bam\'"
        else:
            #generate command for interleaved
            cmd = f"bash -c \'bowtie2 -x {reference_sequence_name} --interleaved /reads/{read1} -p {num_threads} --local | samtools view -bS | samtools sort -o /output/{sample_id}.bam\'"
        cmds.append(cmd)
        #data for next stage
        output_bam_list.append(os.path.join(outDir,f'{sample_id}.bam'))
    #set up multiprocessing
    #start multiprocessing
    pool = mp.Pool(processes=num_jobs)
    #notify starting mapping
    procTitle('Mapping Reads')
    print('\nSniffles: Started mapping')
    #get start time
    start = time.time()
    #denote start of mapping in logs
    with open(logfile,'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Mapping\n')
        #start multiprocessing: one container per sample, with the reference,
        #reads, and output directories bind-mounted
        results = pool.starmap_async(cd.call,[[cmd,'/reads',{ref_path:"/reference",read_path:"/reads",outDir:"/output"}] for cmd in cmds])
        #fix: close and join the pool so the worker processes are reaped
        #(previously the pool was leaked; sibling functions already do this)
        pool.close()
        pool.join()
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        #denote end of logs
        outlog.write('***********\n')
    #get end time
    end = time.time()
    #get total runtime
    runtime = round(end - start,2)
    print(f'\nSniffles: Finished mapping in {runtime} seconds')
    return output_bam_list
def removeDuplicates(runCFG, bam_files, threads='1'):
    """Remove PCR duplicates from each BAM with Picard MarkDuplicates.

    Args:
        runCFG: parsed run-configuration dict; reads exec.outdir and exec.logfile.
        bam_files: iterable of BAM paths; all are assumed to share one directory
            (the container mount uses the directory of the last file).
        threads: number of parallel Picard jobs to run.

    Returns:
        List of de-duplicated BAM paths under <outdir>/rm_dups.
    """
    #initial parameters
    outDir = runCFG['exec']['outdir']
    logfile = runCFG['exec']['logfile']
    checkexists(os.path.join(outDir, 'rm_dups'))
    outDir = os.path.join(outDir, 'rm_dups')
    #notify starting to remove duplicates
    procTitle('Remove Duplicates', runCFG)
    print('\nSniffles: Removing duplicate reads')
    #get time at start
    start = time.time()
    #generate commands
    cmds = []
    output_list = []
    for path in bam_files:
        full_path = os.path.abspath(path)
        file_name = os.path.basename(full_path)
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]
        #remove duplicate reads command; metrics file kept alongside the BAM
        cmd = f'java -Xmx2g -jar /tools/picard.jar MarkDuplicates I=/in_dir/{id}.bam O=/out_dir/{id}.bam REMOVE_DUPLICATES=true M=/out_dir/{id}.removeDupMetrics.txt'
        cmds.append(cmd)
        #add id to finished list
        output_list.append(os.path.join(outDir, f'{id}.bam'))
    #set up multiprocessing
    #fix: threads may arrive as a string (default '1'); Pool requires an int
    pool = mp.Pool(processes=int(threads))
    #denote start of remove duplicate reads in logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Removing Duplicates\n')
        #start multiprocessing
        results = pool.starmap_async(
            cd.call,
            [[cmd, '/reads', {
                path: "/in_dir",
                outDir: "/out_dir"
            }] for cmd in cmds])
        pool.close()
        pool.join()
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        #denote end of logs
        outlog.write('***********\n')
    #get time at end
    end = time.time()
    #determine runtime of processes
    runtime = round(end - start, 2)
    print(f'\nSniffles: Finished removing duplicates in {runtime} seconds')
    return output_list
def average_depth(runCFG,bam_list,inDir,outDir):
    """Compute per-sample coverage with BBMap pileup.sh and filter BAMs.

    For each BAM, converts to SAM, runs pileup.sh against the reference, and
    parses "Average coverage" and "Percent of reference bases covered" from
    the tool output. Writes a Pass/Fail line per sample to average_depth.log.

    Returns:
        The subset of bam_list meeting the configured minimumAverageDepth and
        percentRefCovered thresholds.
    """
    #check that output folder exists
    checkexists(os.path.join(outDir))
    #setup inital parameters
    ref_path = runCFG['exec']['outdir'] + '/ref_sequence'
    reference_sequence_name = os.path.basename(runCFG['exec']['referenceSequence'])
    logfile = os.path.join(runCFG['exec']['outdir'],runCFG['exec']['logfile'])
    #bam file list that will meet threashold
    filtered_bam_list = []
    #create logfile that will hold all average depths
    with open(os.path.join(outDir,'average_depth.log'),'w') as outdepth:
        #loop through each bam
        for bam in bam_list:
            filename = os.path.basename(bam)
            id = filename.split('.')[0]
            #open the log to log output
            with open(logfile,'a') as outlog:
                outlog.write('***********\n')
                outlog.write('Coverage\n')
                #generate command for the current bam file
                #fix: the input path contained a garbled placeholder; view the
                #current BAM, which is bind-mounted under /indata
                cmd = f'bash -c "samtools view /indata/{filename} > {id}.tmp.sam && /tools/bbmap/pileup.sh in={id}.tmp.sam out={id}_coverage.csv ref=/reference/{reference_sequence_name} && rm {id}.tmp.sam"'
                #use docker to run the command
                output = cd.call(cmd,'/outdata',{inDir:"/indata",outDir:"/outdata",ref_path:"/reference"})
                #record the output in the log
                outlog.write(output)
                #denote end of logs
                outlog.write('***********\n')
            #only add isolates that pass average depth and percent of reference covered
            percent_cov = 0
            avg_cov = 0
            #parse lines of stdout for info we need
            for line in output.splitlines():
                if "Percent of reference bases covered:" in line:
                    match = re.search('[0-9,.]+',line)
                    if match:
                        percent_cov = float(match[0])
                if "Average coverage:" in line:
                    match = re.search('[0-9,.]+',line)
                    if match:
                        avg_cov = float(match[0])
            #check against the config for min thresholds
            if avg_cov >= runCFG['exec']['minimumAverageDepth'] and percent_cov >= runCFG['exec']['percentRefCovered']:
                #record
                outdepth.write(f'{id},{avg_cov},{percent_cov},Pass\n')
                filtered_bam_list.append(bam)
            else:
                outdepth.write(f'{id},{avg_cov},{percent_cov},Fail\n')
    #return only bam files that meet coverage requirements
    return filtered_bam_list
def indexing(runCFG,*paths):
    """Build a bowtie2 index for each reference fasta.

    Every fasta in *paths is copied into <outdir>/ref_sequence and indexed
    there by a dockerised bowtie2-build call; tool output goes to the run log.
    """
    logfile = os.path.join(runCFG['exec']['outdir'],runCFG['exec']['logfile'])
    outDir = runCFG['exec']['outdir'] + '/ref_sequence'
    checkexists(outDir)
    procTitle('Indexing Reference Genome')
    for reference in paths:
        ref_abspath = os.path.abspath(reference)
        ref_name = os.path.basename(ref_abspath)
        #index reference (index files share the fasta's basename)
        cmd = f'bowtie2-build {ref_name} {ref_name}'
        with open(logfile,'a') as outlog:
            outlog.write("*************************\n")
            outlog.write("Bowtie2 indexing the reference\n")
            #stage the fasta next to where bowtie2-build writes its index files
            copyfile(ref_abspath,os.path.join(outDir,ref_name))
            outlog.write(cd.call(cmd,'/data',{outDir:"/data"}))
            outlog.write("*************************\n")
def indexing(runCFG, *paths):
    """Build a bowtie2 index for each reference fasta (quiet variant).

    Every fasta in *paths is copied into <outdir>/ref_sequence and indexed
    there by a dockerised bowtie2-build --quiet call; output goes to the run
    log. Note: runCFG['exec']['logfile'] is expected to already be an
    absolute path (the driver script rewrites it at startup).
    """
    logfile = runCFG['exec']['logfile']
    outDir = runCFG['exec']['outdir'] + '/ref_sequence'
    checkexists(outDir)
    procTitle("Indexing reference sequence", runCFG)
    for path in paths:
        reference_sequence_abspath = os.path.abspath(path)
        reference_sequence_name = os.path.basename(reference_sequence_abspath)
        #fix: removed a stray debug print(path) left over from development
        #index reference
        cmd = f'bowtie2-build {reference_sequence_name} {reference_sequence_name} --quiet'
        with open(logfile, 'a') as outlog:
            outlog.write("***********\n")
            outlog.write("Bowtie2 indexing the reference\n")
            #stage the fasta next to where bowtie2-build writes its index files
            copyfile(reference_sequence_abspath,
                     os.path.join(outDir, reference_sequence_name))
            outlog.write(cd.call(cmd, '/data', {outDir: "/data"}))
            outlog.write("***********\n")
#NOTE(review): this fragment is cut at both ends by the chunk boundary — the
#print_help/exit pair is presumably inside a no-arguments guard defined above,
#and the final except clause's body continues in a later chunk.
parser.print_help()
parser.exit()
args = parser.parse_args()
numThreads = args.t
configFile = args.c
#get start time
start = time.time()
#get input and output paths
try:
    outDir = os.path.abspath(args.o)
except (AttributeError, TypeError) as err:
    #no/invalid -o argument: fall back to the current working directory
    outDir = os.getcwd()
#check if output dir exists if not create it
sc.checkexists(outDir)
try:
    inDir = os.path.abspath(args.i)
except (AttributeError, TypeError) as err:
    #no/invalid -i argument: fall back to the current working directory
    inDir = os.getcwd()
#open config file and store configuation
#NOTE(review): yaml.load without an explicit Loader is deprecated and unsafe
#on untrusted input — consider yaml.safe_load for this config file
with open(configFile, 'r') as ymlFile:
    cfg = yaml.load(ymlFile)
#create outdir
cfg['exec']['outdir'] = os.path.join(outDir, cfg['exec']['outdir'])
try:
    os.mkdir(cfg['exec']['outdir'])
except FileExistsError:  # handler body continues in a later chunk of the file
def snpcaller(runCFG, bam_files, threads='1'):
    """Call SNPs across all samples at once with samtools mpileup + VarScan.

    Builds one multi-sample pileup from every BAM in bam_files, then runs
    VarScan mpileup2cns on it, writing all_snps.vcf into <outdir>/snp_calls.
    `threads` is currently unused (everything runs as one docker call).
    Returns None; results are on disk and in the run log.
    """
    #set parameters
    outDir = runCFG['exec']['outdir']
    logfile = os.path.join(outDir, runCFG['exec']['logfile'])
    outDir = os.path.join(outDir, 'snp_calls')
    checkexists(outDir)
    #set reference sequence
    reference_sequence_path = runCFG['exec']['outdir'] + '/ref_sequence'
    reference_sequence_name = os.path.basename(
        runCFG['exec']['referenceSequence'])
    #starting time point
    start = time.time()
    procTitle('SNP Calling')
    print(f'\nSniffles: Started calling SNPs')
    bams = []
    sample_list = []
    for path in bam_files:
        full_path = os.path.abspath(path)
        file_name = os.path.basename(full_path)
        #NOTE: `path` is rebound to the directory of the *last* BAM; the
        #container mount below therefore assumes all BAMs share one directory
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]
        sample_list.append(id)
        bams.append('/infile/' + file_name)
    #generate mpileup (a single pileup containing one column per sample)
    cmd1 = 'bash -c \'samtools mpileup -ABR -d 1000000 {bams} -f /ref/{reference_sequence_name} > all.mpileup &&'.format(
        bams=' '.join(bams),
        reference_sequence_name=reference_sequence_name)
    #call snps
    snp_frequency = runCFG['snpcalling']['snpFrequency']
    min_cov = runCFG['snpcalling']['minCoverage']
    snp_qual_threshold = runCFG['snpcalling']['snpQualityThreshold']
    #the <(echo -e ...) process substitution supplies the VCF sample names;
    #it requires bash, hence the surrounding bash -c wrapper
    cmd2 = 'java -jar /tools/varscan.jar mpileup2cns all.mpileup --min-coverage {min_cov} --min-avg-qual {snp_qual_threshold} --min-var-freq {snp_frequency} --strand-filter 1 --output-vcf 1 --variants --vcf-sample-list <(echo -e "{samples}") > all_snps.vcf\''.format(
        min_cov=min_cov,
        snp_qual_threshold=snp_qual_threshold,
        snp_frequency=snp_frequency,
        samples='\n'.join(sample_list))
    #add commands to list for multiprocessing
    cmd = cmd1 + cmd2
    #future code block for annotating aa changes
    #if runCFG['exec']['annotateAAChanges']:
    #pass
    #TODO add annotater for annotating aa changes
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Calling SNPs\n')
        #single dockerised run: reference, BAM directory, and output mounted
        results = cd.call(cmd, '/outfile', {
            reference_sequence_path: "/ref",
            path: "/infile",
            outDir: "/outfile"
        })
        outlog.write('-----------\n')
        outlog.write(results)
        #denote end of logs
        outlog.write('***********\n')
    #get end time
    end = time.time()
    #get total runtime
    runtime = round(end - start, 2)
    print(f'\nSniffles: Finished calling snps in {runtime} seconds')
def consensus(runCFG,bam_list,threads='1'):
    """Generate a consensus VCF and consensus fasta per sample.

    Stage 1: samtools mpileup + VarScan mpileup2cns produce a consensus VCF
    per BAM. Empty VCFs are discarded. Stage 2: each surviving VCF is
    bgzipped, tabix-indexed, and applied to the reference with bcftools
    consensus to yield a per-sample fasta in <outdir>/consensus.

    Args:
        runCFG: parsed run-configuration dict.
        bam_list: BAM paths; assumed to share one directory (last wins for
            the container mount).
        threads: number of parallel jobs.

    Returns:
        List of consensus fasta paths.
    """
    #inital parameters
    outDir = runCFG['exec']['outdir']
    logfile = os.path.join(outDir,runCFG['exec']['logfile'])
    outDir = os.path.join(outDir,'consensus')
    checkexists(outDir)
    #notify starting mapping
    procTitle('Generate Consensus')
    print('\nSniffles: Started generating consensus vcf')
    #get start time
    overall_start = time.time()
    start = time.time()
    #set reference sequence
    reference_sequence_abspath = os.path.abspath(runCFG['exec']['referenceSequence'])
    reference_sequence_name = os.path.basename(reference_sequence_abspath)
    reference_sequence_dir = runCFG['exec']['outdir'] + '/ref_sequence'
    #command list
    cmds = []
    vcf_list = []
    for path in bam_list:
        full_path = os.path.abspath(path)
        file_name = os.path.basename(full_path)
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]
        #run varscan mpileup2cns to generate vcf with consensus information
        minCov = runCFG['snpcalling']['minCoverage']
        quality = runCFG['snpcalling']['snpQualityThreshold']
        freq = runCFG['snpcalling']['consensusFrequency']
        #make multiway pileup using samtools
        cmd1 = f'bash -c \'samtools mpileup -ABd 1000000 /infile/{file_name} -f /ref/{reference_sequence_name} -o {id}.pileup && '
        cmd2 = f'java -jar /tools/varscan.jar mpileup2cns {id}.pileup --min-coverage {minCov} --min-avg-qual {quality} --min-var-freq {freq} --strand-filter 1 --output-vcf 1 > {id}.vcf\''
        cmds.append(cmd1 + cmd2)
        vcf_list.append(os.path.join(outDir,f'{id}.vcf'))
    #setup multiprocessing
    #fix: threads may arrive as a string (default '1'); Pool requires an int
    pool = mp.Pool(processes=int(threads))
    #start multiprocessing
    with open(logfile,'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Generating Consensus\n')
        #start multiprocessing
        results = pool.starmap_async(cd.call,[[cmd,'/outfile',{reference_sequence_dir:"/ref",path:"/infile",outDir:"/outfile"}] for cmd in cmds])
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        #denote end of logs
        outlog.write('***********\n')
    #check if vcf file is empty, if it is skip id and remove vcf file
    filtered_vcf_list = []
    for path in vcf_list:
        try:
            if os.path.getsize(path)>0:
                filtered_vcf_list.append(path)
            else:
                os.remove(path)
        except OSError:
            #fix: narrowed from a bare except — getsize/remove only raise
            #OSError (e.g. the vcf was never produced); skip those quietly
            pass
    end = time.time()
    runtime = round(end - start,2)
    print(f'\nSniffles: Finished generating the consensus vcf in {runtime} seconds')
    start = time.time()
    print(f'\nSniffles: Generating consensus fasta')
    #command list for compressing files
    cmds = []
    out_fasta = []
    for vcf in filtered_vcf_list:
        full_path = os.path.abspath(vcf)
        file_name = os.path.basename(full_path)
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]
        #compress vcf file with bgzip, index it, and apply it to the reference
        cmd = f'bash -c \'bgzip {id}.vcf && tabix {id}.vcf.gz && bcftools consensus -f /ref/{reference_sequence_name} {id}.vcf.gz -o {id}.fasta\''
        out_fasta.append(os.path.join(outDir,f'{id}.fasta'))
        cmds.append(cmd)
    #start multiprocessing
    with open(logfile,'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Creating consensus Fasta\n')
        #start multiprocessing
        results = pool.starmap_async(cd.call,[[cmd,'/outfile',{reference_sequence_dir:"/ref",outDir:"/outfile"}] for cmd in cmds])
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        #denote end of logs
        outlog.write('***********\n')
    #fix: close and join the pool so worker processes are reaped (was leaked)
    pool.close()
    pool.join()
    end = time.time()
    runtime = round(end - start,2)
    print(f'\nSniffles: Finished generating consensus fasta in {runtime} seconds')
    #determine runtime of processes
    end = time.time()
    runtime = round(end - overall_start,2)
    print(f'\nSniffles: Finished generating consensus sequence in {runtime} seconds')
    return out_fasta
#get input path try: inDir = os.path.abspath(args.i) except (AttributeError, TypeError) as err: inDir = os.getcwd() print(f"Raw reads directory {args.i} cannot be found. Sniffles will look for fasta files in the current working directory {inDir}.\n") #create outdir try: outDir = os.path.abspath(args.o) except (AttributeError, TypeError) as err: outDir = os.getcwd() print(f"Output directory {args.o} cannot be found. Output will be placed in a separate folder in the current working directory {outDir}.\n") sc.checkexists(outDir) cfg['exec']['outdir'] = os.path.join(outDir,cfg['exec']['outdir']) try: os.mkdir(cfg['exec']['outdir']) except FileExistsError: cfg['exec']['outdir'] = cfg['exec']['outdir']+'_'+str(int(time.time())) os.mkdir(cfg['exec']['outdir']) outDir = cfg['exec']['outdir'] logfile=os.path.join(outDir,cfg['exec']['logfile']) cfg['exec']['logfile'] = logfile startRunMessage = f"Beginning run at {strftime('%a, %d %b %Y %I:%M:%S %p', time.localtime())}" sc.procTitle(startRunMessage, cfg)
def normCoverage(runCFG,bam_files,threads='1'):
    """Downsample mapped reads to a fixed total with seqtk.

    For each BAM, the mapped pairs are extracted with samtools fastq and
    subsampled to runCFG['exec']['totalReads'] reads per mate; intermediate
    *_mapped_* fastq files are removed afterwards.

    NOTE: normalizing with bbnorm uses all available memory, thus can only
    be run serially; `threads` is accepted for interface parity but unused.

    Returns:
        List of [R1, R2] fastq path pairs under <outdir>/normalized.
    """
    run_out = runCFG['exec']['outdir']
    checkexists(os.path.join(run_out,'normalized'))
    logfile = os.path.join(run_out,runCFG['exec']['logfile'])
    outDir = os.path.join(run_out,'normalized')
    #announce the stage
    procTitle('Normalize Coverage')
    print('\nSniffles: Normalizing read coverage')
    start = time.time()
    output_list = []
    with open(logfile,'a') as outlog:
        outlog.write('********************\n')
        outlog.write('Normalizing coverage\n')
        #one serial docker call per bam file
        for bam in bam_files:
            bam_abspath = os.path.abspath(bam)
            bam_name = os.path.basename(bam_abspath)
            bam_dir = os.path.dirname(bam_abspath)
            sample = bam_name.split('.')[0]
            #extract the mapped read pairs from the bam...
            extract_cmd = f'bash -c \'samtools fastq /bam_files/{sample}.bam -1 /out_dir/{sample}_mapped_1.fastq -2 /out_dir/{sample}_mapped_2.fastq && '
            total_reads = runCFG['exec']['totalReads']
            #...then subsample both mates with the same seed (-s100) so the
            #pairing stays in sync
            subsample_cmd = f'seqtk sample -s100 /out_dir/{sample}_mapped_1.fastq {total_reads} > {sample}_1.fastq && seqtk sample -s100 /out_dir/{sample}_mapped_2.fastq {total_reads} > {sample}_2.fastq\''
            outlog.write(f'{sample}-----------\n')
            stdout=cd.call(extract_cmd+subsample_cmd,'/out_dir',{bam_dir:"/bam_files",outDir:"/out_dir"})
            outlog.write(stdout)
            outlog.write(f'-----------\n')
            output_list.append([os.path.join(outDir,f'{sample}_1.fastq'),os.path.join(outDir,f'{sample}_2.fastq')])
            #best-effort cleanup of the intermediate fastq files
            for leftover in (f'{outDir}/{sample}_mapped_1.fastq',f'{outDir}/{sample}_mapped_2.fastq'):
                try:
                    os.remove(leftover)
                except:
                    pass
        outlog.write('********************\n')
    end = time.time()
    runtime = round(end - start,2)
    print(f'\nSniffles: Finished normalizing read coverage in {runtime} seconds')
    return output_list
def normCoverage(runCFG, bam_files, threads='1'):
    """Downsample mapped reads with seqtk to normalize coverage (v2).

    Extracts mapped pairs (and, when exec.unpaired is set, singletons) from
    each BAM with samtools fastq and subsamples them to exec.totalReads reads
    per file. `threads` is unused; each sample runs as one serial docker call.

    Returns:
        List of fastq path lists per sample: [R1, R2] or [R1, R2, U].
    """
    #initial parameters
    outDir = runCFG['exec']['outdir']
    checkexists(os.path.join(outDir, 'normalized'))
    logfile = runCFG['exec']['logfile']
    outDir = os.path.join(outDir, 'normalized')
    #notify starting to remove duplicates
    procTitle("Downsampling with seqtk to normalize coverage", runCFG)
    #print('\n-----------------------Sniffles: Downsampling with seqtk to normalize coverage-----------------------')
    #get time at start
    start = time.time()
    #denote start of remove duplicate reads in logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Downsampling with seqtk to normalize coverage\n')
        #run normalization
        output_list = []
        for path in bam_files:
            full_path = os.path.abspath(path)
            file_name = os.path.basename(full_path)
            path = os.path.dirname(full_path)
            id = file_name.split('.')[0]
            #get reads from mapped bamfile
            #NOTE(review): samtools collate writes to the "collating" prefix
            #but the following samtools fastq reads the *original* bam, so the
            #collated output appears unused — confirm intended input
            cmd_get_reads = f'bash -c \'samtools collate /bam_files/{id}.bam collating && samtools fastq -n /bam_files/{id}.bam -1 /out_dir/{id}_mapped_1.fastq.gz -2 /out_dir/{id}_mapped_2.fastq.gz'
            #run seqtk to subsample reads
            total_reads = runCFG['exec']['totalReads']
            #NOTE(review): seqtk writes plain fastq to stdout; redirecting to
            #*.fastq.gz names produces uncompressed files with a .gz suffix.
            #Also, the first redirect lacks the /out_dir/ prefix and relies on
            #the container working directory being /out_dir.
            cmd_normalization = f' && seqtk sample -s100 /out_dir/{id}_mapped_1.fastq.gz {total_reads} > {id}_1.fastq.gz && seqtk sample -s100 /out_dir/{id}_mapped_2.fastq.gz {total_reads} > /out_dir/{id}_2.fastq.gz\''
            if runCFG['exec']['unpaired']:
                #also dump and subsample the unpaired/singleton reads (-0)
                cmd_get_reads += f' -0 /out_dir/{id}_mapped_U.fastq.gz && seqtk sample -s100 /out_dir/{id}_mapped_U.fastq.gz {total_reads} > /out_dir/{id}_U.fastq.gz'
                output_list.append([
                    os.path.join(outDir, f'{id}_1.fastq.gz'),
                    os.path.join(outDir, f'{id}_2.fastq.gz'),
                    os.path.join(outDir, f'{id}_U.fastq.gz')
                ])
            else:
                output_list.append([
                    os.path.join(outDir, f'{id}_1.fastq.gz'),
                    os.path.join(outDir, f'{id}_2.fastq.gz')
                ])
            #start docker containers and run
            outlog.write(f'{id}\n-----------\n')
            stdout = cd.call(cmd_get_reads + cmd_normalization, '/out_dir', {
                path: "/bam_files",
                outDir: "/out_dir"
            })
            outlog.write(stdout)
            #cleanup of intermediate pre-subsampling fastq files (best effort)
            try:
                os.remove(f'{outDir}/{id}_mapped_1.fastq.gz')
            except:
                pass
            try:
                os.remove(f'{outDir}/{id}_mapped_2.fastq.gz')
            except:
                pass
            try:
                os.remove(f'{outDir}/{id}_mapped_U.fastq.gz')
            except:
                pass
        outlog.write('***********\n')
    #get time at end
    end = time.time()
    #determine runtime of processes
    runtime = round(end - start, 2)
    runtime = str(datetime.timedelta(seconds=runtime))
    print(f'\nSniffles: Finished normalizing read coverage in {runtime}')
    return output_list
def VCFannotator(runCFG, vcffiles):
    """Annotate amino-acid changes onto VarScan VCFs using a GTF + reference.

    Parses CDS coordinates from the configured GTF, rebuilds each gene's
    transcript from the reference fasta, then for every variant in every VCF
    computes the reference/variant codon and amino-acid change. Each VCF is
    written back out as <sample>.annotated_vcf (tab-separated) in
    <outdir>/vcf_annotations.

    Args:
        runCFG: parsed run-configuration dict (reads referenceSequence,
            mapToConsensus, and postprocessing.gtfFileName).
        vcffiles: either a directory containing .vcf files or a list of paths.

    Returns:
        The (mutated in place) vcffiles list, entries replaced with the
        annotated output filenames.
    """
    # read in reference sequences and gtfs and store information about protein sequences
    # create a dictionary of gene names and start/stop sites, allowing for more than one start/stop site.
    #import file location parameters from config file
    outDir = os.path.join(runCFG['exec']['outdir'], 'vcf_annotations')
    checkexists(outDir)
    logfile = os.path.join(outDir, runCFG['exec']['logfile'])
    refseqfasta = runCFG['exec']['referenceSequence']
    refseqname = refseqfasta.split(".")[0]
    if runCFG['exec']['mapToConsensus']:
        #when mapping to consensus, the working reference lives in the run's
        #ref_sequence directory rather than at the configured path
        refseqfasta = os.path.join(runCFG['exec']['outdir'], 'ref_sequence',
                                   refseqfasta)
    #get start time
    start1 = time.time()
    procTitle('Annotating SNPs', runCFG)
    #Extract coding sequence coordinates from gtf files:
    coding_regions = {
    }  #will be dictionary of dictionaryies (format segment:gene:[[startExon1, stopExon1], [startExon2, stopExon2]])
    with open(runCFG['postprocessing']['gtfFileName'], "r") as gtf:
        for line in gtf:
            if line.strip(
                    "\n") != "":  # ignore blank lines (otherwise throws an index error)
                #"/" is replaced everywhere to match the same replacement done
                #on fasta ids and VCF chromosome names below
                line = line.replace("/", "_")
                lineitems = line.split("\t")
                segment_name = lineitems[0]
                annotation_type = lineitems[2]
                start = int(
                    lineitems[3]) - 1  # adding the -1 here for 0 indexing
                stop = int(
                    lineitems[4]) - 1  # adding the -1 here for 0 indexing
                #strip the gene id out of the attributes column (column 9)
                gene_name = lineitems[8]
                gene_name = gene_name.split(";")[0]
                gene_name = gene_name.replace("gene_id ", "")
                gene_name = gene_name.replace("\"", "")
                if annotation_type.lower() == "cds":
                    #first exon of the first gene on a new segment
                    if segment_name not in coding_regions:
                        coding_regions[segment_name] = {}
                        coding_regions[segment_name][gene_name] = [[
                            start, stop
                        ]]
                    #first exon of a new gene on a known segment
                    elif segment_name in coding_regions and gene_name not in coding_regions[
                            segment_name]:
                        coding_regions[segment_name][gene_name] = [[
                            start, stop
                        ]]
                    #additional exon of a known gene
                    elif gene_name in coding_regions[segment_name]:
                        coding_regions[segment_name][gene_name].append(
                            [start, stop])
    # pull in reference fasta file, separate gene segments into a dictionary
    ref_segments = {}
    for seq in SeqIO.parse(refseqfasta, "fasta"):
        refseqname = str(seq.id).replace("/", "_")
        sequence = str(seq.seq).lower()
        ref_segments[refseqname] = sequence
    # use gene coordinates to create coding sequences from reference sequences
    transcripts = {}
    #Reminder of current data structures:
    #coding_regions[segment][gene]:coordinates of genes
    #ref_segments[nameofsegment]:sequence
    for segment in coding_regions:
        for gene in coding_regions[segment]:
            transcripts[gene] = ""
            coordinates = coding_regions[segment][
                gene]  # define the coding regions for each gene
            for start, stop in coordinates:  # loop through start/stop sites in coding regions
                sequence_chunk = ref_segments[segment][start:stop + 1]
                transcripts[gene] = transcripts[
                    gene] + sequence_chunk  # append each piece of the transcript together
    # loop through each transcript to make sure that it begins with a start codon and ends with a stop codon
    #for t in transcripts:
    #if transcripts[t][0:3] != start_codon:
    #print("WARNING! " + refseqname + " " + t + " does not contain a start codon! The first three nucleotides are " + transcripts[t][0:3])
    #if transcripts[t][-3:] not in stop_codons:
    #print("WARNING! " + refseqname + " " + t + " does not contain a stop codon! These are the last 3 nucleotides: " + transcripts[t][-3:])
    print(vcffiles)
    #accept either a directory of vcfs or an explicit list of vcf paths
    if os.path.isdir(vcffiles):
        vcffiles = glob.glob(vcffiles + "/*.vcf")
    elif type(vcffiles) == list:
        if vcffiles[0].split(".")[-1] == "vcf":
            pass
        else:
            print("vcffiles has no vcf files!")
    listofmutstoExport = []
    ##Loop through each vcf file and annotate amino acid changes
    print(vcffiles)
    for i, vcfname in tqdm(enumerate(vcffiles)):
        #find the header row (#CHROM) so pandas can skip the VCF metadata
        with open(vcfname, "r") as TextVCF:
            for index, line in enumerate(TextVCF, 0):
                if "#CHROM" in line:
                    rowstoskip = index
        #Reads the vcf file into a pandas DataFrame
        print(vcfname)
        try:
            vcfDF = pd.read_csv(vcfname, sep='\t', skiprows=rowstoskip)
        except OSError as inst:
            print("\n" + vcfname +
                  " did not open appropriately. Please check file.\n")
        #fix the / in chrome bug
        vcfDF["#CHROM"] = vcfDF["#CHROM"].str.replace("/", "_")
        #extract frequencies for list of muts to export:
        #In order to make this easier, I'm going to assume each VCF has only one sample. This code DOES NOT WORK for more than one sample per VCF.
        #print(vcfDF.iloc[:,-1].str.split(":").str[6].str.rstrip('%').astype('float')/100)
        freqlocation = vcfDF.loc[0, "FORMAT"].split(":").index("FREQ")
        try:
            #FREQ may already be numeric...
            vcfDF["FREQ"] = vcfDF.iloc[:, -1].str.split(
                ":").str[freqlocation].astype('float')
        except ValueError:
            #...or be a percentage string like "12.5%"
            vcfDF["FREQ"] = vcfDF.iloc[:, -1].str.split(
                ":").str[freqlocation].str.rstrip('%').astype('float') / 100
        except:
            raise
        listofmuts = []
        #loop through each line in vcfDF, extract chrom, pos, reference nucleotide, alternate nucleotide
        for chrom, pos, ref, alt, freq in zip(vcfDF['#CHROM'], vcfDF["POS"],
                                              vcfDF["REF"].str.lower(),
                                              vcfDF["ALT"].str.lower(),
                                              vcfDF["FREQ"]):
            pos -= 1  #subtract one from position to convert from VCF's 1 indexing to python's 0
            for gene in coding_regions[chrom].keys(
            ):  #loop through each gene potentially applicable to that position (i.e., all on chromosome)
                priorExonLength = 0
                #print (gene)
                for start, stop in coding_regions[chrom][
                        gene]:  #loop through each exon of gene
                    #if pos in exon, calculate codon, reference aa, and variant aa
                    #print (f"pos: {pos} start: {start} stop: {stop}")
                    #NOTE(review): range(start, stop) excludes `stop`, yet the
                    #transcript slice above uses stop+1 — a variant exactly at
                    #the exon's last base may be skipped; confirm intent
                    if pos in range(start, stop):
                        #print ('is in range, annotating.')
                        within_gene_position = pos - start + priorExonLength  #within gene position is the position in this exon (pos-startOfExon), plus the length of any prior exons (exonstart)
                        codon_pos = (within_gene_position % 3)
                        #splice the ALT base into the transcript, then cut the
                        #surrounding codon out of both versions
                        alternatetranscript = transcripts[
                            gene][:within_gene_position] + alt + transcripts[
                                gene][within_gene_position + 1:]
                        codon = transcripts[gene][(
                            within_gene_position - codon_pos):(
                                within_gene_position + (3 - codon_pos))]
                        variantcodon = alternatetranscript[(
                            within_gene_position - codon_pos):(
                                within_gene_position + (3 - codon_pos))]
                        ref_aa = Seq(codon).translate()
                        variant_aa = Seq(variantcodon).translate()
                        aa_num = str(int(within_gene_position / 3) + 1)
                        #Catch errors in annotation calculations where the math results in an incorrect codon
                        if codon[codon_pos] != ref:
                            print(
                                "Something's quite wrong here. The reference SNP is not what it should be."
                            )
                            print(f"\n\nchrom: {chrom}, gene: {gene}")
                            print(
                                f"\npos: {pos} within_gene_position: {within_gene_position}\ncodon_pos: {codon_pos} codon: {codon} variantcodon: {variantcodon}\n\n"
                            )
                            print(
                                f"ref: {ref} alt: {alt} ref_aa: {ref_aa} \nvariant_aa: {variant_aa}\n aa_num: {aa_num}\n"
                            )
                            print(transcripts[gene])
                        ref_aa = Seq(codon).translate()
                        variant_aa = Seq(variantcodon).translate()
                        aa_num = str(int(within_gene_position / 3) + 1)
                        if ref_aa != variant_aa:
                            #non-synonymous change, e.g. "D123N"
                            listofmuts.append([
                                chrom, gene, pos + 1,
                                str(ref_aa + aa_num + variant_aa)
                            ])
                            #only export intermediate-frequency variants
                            if freq > 0.01 and freq < 0.99:
                                listofmutstoExport.append({
                                    "segment": chrom,
                                    'gene': gene,
                                    'position': pos + 1,
                                    'frequency': float(freq),
                                    'AAchange':
                                    str(ref_aa + aa_num + variant_aa)
                                })
                        elif ref_aa == variant_aa:
                            #synonymous change
                            listofmuts.append([chrom, gene, pos + 1, "."])
                        break  #if pos is in exon, stop looping though exons
                    else:
                        priorExonLength += (
                            stop + 1 - start
                        )  #The next exon will begin after the length of this exon, i.e., after the stop point minus the start point
                #else statement only executed if for loop finishes without breaking (ie if pos is never within the gene being examined)
                else:
                    listofmuts.append([chrom, gene, pos + 1, "not in ORF"])
                    continue  #continue onto next gene
        #merge the computed annotations back onto the VCF rows by position
        AAchange = pd.DataFrame(listofmuts,
                                columns=['#CHROM', 'gene', 'POS', 'AAchange'])
        vcfDF = vcfDF.merge(AAchange, how='left', on=['#CHROM', 'POS'])
        vcfDF['gene'] = vcfDF['gene'].astype(str)
        vcfDF['gene'] = vcfDF['gene'].replace("NA", "NA gene")
        annotatedVCFname = os.path.basename(vcfname).split(
            ".")[0] + ".annotated_vcf"
        outputfile = vcfDF.to_csv(os.path.join(outDir, annotatedVCFname),
                                  sep="\t",
                                  index=None,
                                  header=True)
        vcffiles[i] = annotatedVCFname
    #importantMuts = pd.DataFrame(listofmutstoExport)
    #print (importantMuts)
    #importantMuts = importantMuts.groupby(['segment', 'gene', 'position', 'AAchange'], as_index=False).mean()
    #importantMuts = importantMuts.loc[importantMuts['freq']>0.02 & importantMuts['freq']<0.98]
    #importantMuts['freq'] = importantMuts['freq']/len(vcffiles)
    #importantMutsexport = importantMuts.to_csv(os.path.join(outDir, "allMutationsPresent.tsv"), sep = '\t', index=None, header=True)
    #get end time
    end = time.time()
    #get total runtime
    runtimeSeconds = end - start1
    runtime = datetime.timedelta(seconds=runtimeSeconds)
    print(f'\nSniffles: Finished annotating snps in {str(runtime)}')
    return (vcffiles)
def snpcaller(runCFG,bam_files,threads='1'):
    """Call per-sample SNPs with samtools mpileup + VarScan, in parallel (v2).

    Each BAM gets its own mpileup -> VarScan mpileup2snp -> bcftools norm
    pipeline run inside docker. When exec.replicates is set, replicate VCFs
    are grouped by a key parsed from the filename (via exec.replicateNotation)
    and merged/averaged by VCFaverager.

    Returns:
        List of per-sample VCF paths (or the VCFaverager output when
        replicates are enabled).
    """
    #set parameters
    outDir = runCFG['exec']['outdir']
    logfile = runCFG['exec']['logfile']
    outDir = os.path.join(outDir,'snp_calls')
    checkexists(outDir)
    #set reference sequence
    reference_sequence_path = os.path.dirname(runCFG['exec']['referenceSequence'])
    reference_sequence_name = os.path.basename(runCFG['exec']['referenceSequence'])
    #starting time point
    start = time.time()
    if runCFG['exec']['replicates']:
        message = 'Calling replicate SNPs with Varscan'
    else:
        message = 'Calling SNPs with Varscan'
    procTitle(message, runCFG)
    bams = []
    sample_list = []
    listofVCFs = []
    repDict = {}
    #Create list of bam files to call SNPs on
    for path in bam_files:
        full_path = os.path.abspath(path)
        file_name = os.path.basename(full_path)
        #NOTE: `path` ends up as the directory of the last BAM; the container
        #mount below assumes all BAMs share one directory
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]
        sample_list.append(id)
        bams.append('/infile/'+file_name)
        #if processing replicate runs, create dictionary samplename:[list of replicate vcf files for that sample]
        #this dictionary will be used later to merge and average replicate vcfs
        if runCFG['exec']['replicates']:
            #derive the literal text surrounding the sample number from the
            #replicateNotation pattern, then slice the sample key out of the
            #filename between those two anchors
            repBreakdown = runCFG['exec']['replicateNotation'].split("_")
            repBreakdown = "_".join(repBreakdown[:-1])
            repBreakdown = repBreakdown[:repBreakdown.find(r"\d")]
            repBreakdown = repBreakdown.split("Sample")
            repKey = file_name[file_name.find(repBreakdown[0])+len(repBreakdown[0]):file_name.find(repBreakdown[1])]
            vcf_name = (id+".vcf")
            listofVCFs.append(vcf_name)
            if repKey not in repDict.keys():
                repDict[repKey] = [vcf_name]
            else:
                repDict[repKey].append(vcf_name)
    #import SNP calling quality parameters from config file
    snp_frequency=runCFG['snpcalling']['snpFrequency']
    min_cov=runCFG['snpcalling']['minCoverage']
    snp_qual_threshold=runCFG['snpcalling']['snpQualityThreshold']
    #generate commands to call variants
    cmds=[]
    for bam, sample in zip(bams, sample_list):
        #mpileup command (prefixed by a printf that tags this sample's section
        #in the shared logfile, since parallel jobs interleave their output)
        outlogHeader = f"{bam.split('/')[-1].split('.')[0]}\n-----------\n"
        cmd1 = f'printf \"{outlogHeader}\" >> {os.path.join("/logfile", os.path.basename(logfile))} && samtools mpileup -ABR -d 1000000 {bam} -f /ref/{reference_sequence_name} > {sample}.mpileup'
        #varscan command (<(echo -e ...) requires bash, hence bash -c below)
        cmd2 = f'java -jar /tools/varscan.jar mpileup2snp {sample}.mpileup --min-coverage {min_cov} --min-avg-qual {snp_qual_threshold} --min-var-freq {snp_frequency} --strand-filter 1 --output-vcf 1 --variants --vcf-sample-list <(echo -e "{sample}") > {sample}_temp.vcf'
        #compress and normalize vcf
        cmd3 = f'bcftools norm -c sw -m - -f /ref/{reference_sequence_name} -o {sample}.vcf {sample}_temp.vcf && rm {sample}_temp.vcf'
        if not runCFG['exec']['replicates']:
            listofVCFs.append(os.path.join(outDir, f"{sample}.vcf"))
        #add commands to list for multiprocessing
        cmds.append("bash -c \'" + cmd1 + " && " + cmd2 + " && " + cmd3 + "\'")
    #initialize multiprocessing pool
    #NOTE(review): `threads` defaults to the string '1'; mp.Pool needs an int —
    #callers appear to pass an int, but the default would raise. Confirm.
    pool = mp.Pool(processes=threads)
    #open logfile
    with open(logfile,'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Calling SNPs\n')
        #run commands with mutliprocessing
        results = pool.starmap_async(cd.call,[[cmd, '/outfile',{reference_sequence_path:"/ref",path:"/infile",outDir:"/outfile",os.path.dirname(logfile):"/logfile"}] for cmd in cmds])
        pool.close()
        pool.join()
        stdouts = results.get()
        print ('finished all results')
        for stdout in stdouts:
            #outlog.write('-----------\n')
            outlog.write(stdout)
        #if processing duplicate runs, merge and average SNP calls
        if runCFG['exec']['replicates']:
            listofVCFs = (VCFaverager(runCFG, repDict, listofVCFs))
            outlog.write(str(listofVCFs))
            outlog.write('-----------\n')
        #Combine sample vcfs into one master VCF:
        #allSNPs = VCFcombiner(runCFG, listofVCFs, "allVarscanSNVs.vcf")
        #outlog.write(f"\nCombined all vcf files into master vcf file allVarscanSNVs.vcf\n")
        outlog.write('-----------\n')
        #denote end of logs
        outlog.write('***********\n')
    #get end time
    end = time.time()
    #get total runtime
    runtime = round(end - start,2)
    runtime = str(datetime.timedelta(seconds=runtime))
    print(f'\nSniffles: Finished calling snps in {runtime}')
    return (listofVCFs)
def mapping(runCFG, param_paths, outDir, threads='8'):
    """Map reads with bowtie2 in docker, with optional unpaired handling (v2).

    For each (id, read1, read2, reference[, unpaired]) tuple, builds a shell
    pipeline that maps the paired (or interleaved) reads and, when
    exec.unpaired is set, additionally maps the unpaired reads and merges
    both BAMs into <id>.bam. Per-sample bowtie2 stats go to
    /logfile/<id>mappingstats.log.

    Returns:
        List of merged/sorted BAM paths, one per entry in param_paths.
    """
    seed = 0  # fixed bowtie2 --seed for reproducible mapping
    start = time.time()
    #assert len(param_paths) > 0, "Cannot map reads: No reads provided"
    print(param_paths)
    logfile = runCFG['exec']['logfile']
    #split the thread budget into number of parallel jobs and threads per job
    num_jobs, num_threads = cpu_count(threads)
    cmds = []
    read_path = ''
    ref_path = ''
    output_bam_list = []
    for param_path in param_paths:
        id = param_path[0]
        read1 = os.path.basename(param_path[1])
        read2 = os.path.basename(param_path[2])
        #optional 5th tuple element: a pre-existing unpaired-reads file
        try:
            unpairedReads = os.path.basename(param_path[4])
        except:
            unpairedReads = ""
        print(f"unpairedReads for id {id}: " + unpairedReads)
        #fallback unpaired filenames derived from the mate names, e.g.
        #"<sample>_1.fastq.gz" -> "<sample>_U.fastq.gz"
        read1un = read1.split(".")[0][0:-1] + "U.fastq.gz"
        read2un = read2.split(".")[0][0:-1] + "U.fastq.gz"
        read_path = os.path.dirname(os.path.abspath(param_path[1]))
        ref_path = runCFG['exec']['outdir'] + '/ref_sequence'
        reference_sequence_name = os.path.basename(param_path[3])
        #quoted header that printf writes to tag this sample's log section
        outlogHeader = f"\"{id}\n-----------\n\""
        #check output folder exists
        checkexists(os.path.join(outDir))
        logfilepath = os.path.join("/logfile", os.path.basename(logfile))
        checkexists(os.path.join(outDir, "unmapped"))
        #NOTE: cmd is left WITHOUT its output suffix/closing quote here; the
        #branches below append either "_paired.bam && ..." or ".bam'"
        if read2 != '':
            #generate command for paired end
            cmd = f"bash -c \'printf {outlogHeader} >> {logfilepath} && bowtie2 -x {reference_sequence_name} -1 /reads/{read1} -2 /reads/{read2} --seed {seed} --no-unal --local 2>> /logfile/{id}mappingstats.log | samtools view -bS | samtools sort -o /output/{id}"
        else:
            #generate command for interleaved
            cmd = f"bash -c \'printf {outlogHeader} >> {logfilepath} && bowtie2 -x {reference_sequence_name} --interleaved /reads/{read1} --seed {seed} --no-unal --local 2>> /logfile/{id}mappingstats.log | samtools view -bS | samtools sort -o /output/{id}"
        if runCFG['exec']['unpaired']:
            #map the unpaired reads separately, then merge with the paired BAM
            if unpairedReads != "":
                cmdUn = f"_paired.bam && printf {outlogHeader} >> {logfilepath} && bowtie2 -x {reference_sequence_name} -U /reads/{unpairedReads} --un /output/unmapped/U_{read1} --seed {seed} --no-unal --local 2>> /logfile/{id}mappingstats.log | samtools view -bS | samtools sort -o /output/{id}_unpaired.bam"
            elif read2 != '':
                #generate command for unpaired
                cmdUn = f"_paired.bam && printf {outlogHeader} >> {logfilepath} && bowtie2 -x {reference_sequence_name} -U /reads/{read1un},/reads/{read2un} --un /output/unmapped/U_{read1} --seed {seed} --no-unal --local 2>> /logfile/{id}mappingstats.log | samtools view -bS | samtools sort -o /output/{id}_unpaired.bam"
            else:
                #generate command for interleaved
                cmdUn = f"_paired.bam && printf {outlogHeader} >> {logfilepath} && bowtie2 -x {reference_sequence_name} -U /reads/{read1un} --un /output/unmapped/U_{read1} --seed {seed} --no-unal --local 2>> /logfile/{id}mappingstats.log | samtools view -bS | samtools sort -o /output/{id}_unpaired.bam"
            mergecmd = f" && samtools merge /output/{id}.bam /output/{id}_unpaired.bam /output/{id}_paired.bam && samtools index /output/{id}.bam\'"
            cmds.append(cmd + cmdUn + mergecmd)
        else:
            cmds.append(cmd + ".bam\'")
        #data for next stage
        output_bam_list.append(os.path.join(outDir, f'{id}.bam'))
    #set up multiprocessing
    pool = mp.Pool(processes=num_jobs)
    #get start time
    start = time.time()
    #denote start of mapping in logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Mapping\n')
        #start multiprocessing: one container per sample with reference,
        #reads, output, and log directories bind-mounted
        results = pool.starmap_async(cd.call, [[
            cmd, '/reads', {
                ref_path: "/reference",
                read_path: "/reads",
                outDir: "/output",
                os.path.dirname(logfile): "/logfile"
            }
        ] for cmd in cmds])
        pool.close()
        pool.join()
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        #denote end of logs
        outlog.write('***********\n')
    #get end time
    end = time.time()
    #get total runtime
    runtime = round(end - start, 2)
    runtime = str(datetime.timedelta(seconds=runtime))
    print(f'\nSniffles: Finished mapping in {runtime}')
    return output_bam_list