Example 1
def prepare_genome(genome_file,color_space=False):
    run_fine=True
    pipeline_param=utils_param.get_pipeline_parameters()
    BWA_dir=pipeline_param.get_bwa_dir()
    BWA_bin=os.path.join(BWA_dir,'bwa')
    genome_loader = GenomeLoader(genome_file=genome_file)
    length=0
    for fasta_rec in genome_loader:
        header, sequence = fasta_rec
        length+=len(sequence)
        if length>1000000000:
            break
    genome_loader.close()
    # Following BWA's recommendation, use the 'is' indexing algorithm for genomes shorter than 1 Gbp and 'bwtsw' for larger ones
    if length>1000000000:
        a_option='bwtsw'
    else:
        a_option='is'
    
    #Create the indexes
    if color_space:
        command='%s index -c -a %s %s'%(BWA_bin, a_option, genome_file)
    else: 
        command='%s index -a %s %s'%(BWA_bin, a_option, genome_file)
    command_runner.run_command(command)
    return run_fine
Example 2
def prepare_genome(genome_file, color_space=False):
    run_fine = True
    pipeline_param = utils_param.get_pipeline_parameters()
    BWA_dir = pipeline_param.get_bwa_dir()
    BWA_bin = os.path.join(BWA_dir, 'bwa')
    genome_loader = GenomeLoader(genome_file=genome_file)
    length = 0
    for fasta_rec in genome_loader:
        header, sequence = fasta_rec
        length += len(sequence)
        if length > 1000000000:
            break
    genome_loader.close()
    # Following BWA's recommendation, use the 'is' indexing algorithm for genomes shorter than 1 Gbp and 'bwtsw' for larger ones
    if length > 1000000000:
        a_option = 'bwtsw'
    else:
        a_option = 'is'

    #Create the indexes
    if color_space:
        command = '%s index -c -a %s %s' % (BWA_bin, a_option, genome_file)
    else:
        command = '%s index -a %s %s' % (BWA_bin, a_option, genome_file)
    command_runner.run_command(command)
    return run_fine
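
A minimal usage sketch for the function above; the reference path is illustrative, and the pipeline modules prepare_genome() relies on (utils_param, GenomeLoader, command_runner) are assumed to be configured:

# Hypothetical call; the path below is a placeholder.
run_fine = prepare_genome('/path/to/reference.fa', color_space=False)
# For a reference shorter than 1 Gbp this runs roughly:
#   <BWA_dir>/bwa index -a is /path/to/reference.fa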
Example 3
def bin_coordinates_through_genome(input_file, output_file, genome_file, bin_size):
    open_file = utils_logging.open_input_file(input_file)
    open_output = utils_logging.open_output_file(output_file)
    all_coordinates_per_chr = {}
    genome_loader = GenomeLoader(genome_file)
    previous_bin = 0
    all_chr = []
    # Gather the coordinates from the input file, grouped by chromosome.
    for line in open_file:
        sp_line = line.split()
        all_coordinates = all_coordinates_per_chr.get(sp_line[0])
        if all_coordinates is None:
            all_chr.append(sp_line[0])
            all_coordinates = []
            all_coordinates_per_chr[sp_line[0]] = all_coordinates
        all_coordinates.append(int(sp_line[1]))
    open_file.close()
    all_chr.sort()
    for chr in all_chr:
        # Look up the coordinates with the same name that was used as the
        # dictionary key; the fasta header is only used for the output.
        all_coordinates = all_coordinates_per_chr.get(chr)
        header, sequence = genome_loader.get_chr(chr)
        chr = header.strip()
        chr_len = len(sequence)

        all_bins = bin_value_from_array(all_coordinates, bin_size, chr_len)
        # Write the per-chromosome bin start, the cumulative genome-wide
        # position of the bin, and the binned value.
        for bin, value in enumerate(all_bins):
            open_output.write('%s\t%s\t%s\t%s\n' % (chr, bin * bin_size,
                                                    (bin * bin_size) + previous_bin,
                                                    value))
        previous_bin += len(all_bins) * bin_size
    genome_loader.close()
    open_output.close()
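
The bin_value_from_array() helper called above is not shown in this example; the sketch below is an assumption of what it might do, counting how many coordinates fall into each bin_size-wide window along a chromosome of length chr_len (the behaviour is inferred, not the pipeline's actual implementation):

def bin_value_from_array(all_coordinates, bin_size, chr_len):
    # Hypothetical sketch: one count per bin_size-wide window covering the chromosome.
    nb_bin = (chr_len // bin_size) + 1
    all_bins = [0] * nb_bin
    for coordinate in all_coordinates:
        # Assumes coordinates are 0-based positions within the chromosome.
        all_bins[coordinate // bin_size] += 1
    return all_bins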
Example 4
def extract_reads_from_all_bam_files_set_of_consensus_old(bam_files, list_consensus, output_dir, genome_loader=None,
                                                          all_read1_consensus_file=None):
    if genome_loader is None:
        genome_loader = GenomeLoader(all_read1_consensus_file, keep_until_done=True)
    for consensus_name in list_consensus:
        logging.info("Extract reads from %s " % consensus_name)
        consensus_name, consensus_sequence = genome_loader.get_chr(consensus_name)
        extract_reads_from_one_consensus(bam_files, output_dir, consensus_name, consensus_sequence)
Example 5
def extract_reads_from_all_bam_files_set_of_consensus_old(
        bam_files,
        list_consensus,
        output_dir,
        genome_loader=None,
        all_read1_consensus_file=None):
    if genome_loader is None:
        genome_loader = GenomeLoader(all_read1_consensus_file,
                                     keep_until_done=True)
    for consensus_name in list_consensus:
        logging.info("Extract reads from %s " % consensus_name)
        consensus_name, consensus_sequence = genome_loader.get_chr(
            consensus_name)
        extract_reads_from_one_consensus(bam_files, output_dir, consensus_name,
                                         consensus_sequence)
Example 6
def extract_reads_from_all_bam_files_set_of_consensus(
        bam_files,
        list_consensus,
        output_dir,
        genome_loader=None,
        all_read1_consensus_file=None):
    all_previous_dir = glob.glob(os.path.join(output_dir, '*_dir'))
    if len(all_previous_dir):
        logging.info("cleanup previous run in %s" % output_dir)
        for dir in all_previous_dir:
            shutil.rmtree(dir)
    if genome_loader is None:
        genome_loader = GenomeLoader(all_read1_consensus_file,
                                     keep_until_done=True)
    for bam_file in bam_files:
        extract_reads_from_one_bam_file(bam_file, output_dir, list_consensus,
                                        genome_loader)
    # All the reads have been extracted, now close the fastq files
    close_fastq_files()
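
A hedged usage sketch for the function above; the paths and consensus names are illustrative, and the helpers it calls (extract_reads_from_one_bam_file, close_fastq_files) are assumed to be defined in the same module:

# Hypothetical call extracting reads for two consensus sequences from two BAM files.
bam_files = ['/path/to/sample1.bam', '/path/to/sample2.bam']
list_consensus = ['consensus_1', 'consensus_2']
extract_reads_from_all_bam_files_set_of_consensus(
    bam_files,
    list_consensus,
    output_dir='/path/to/output_dir',
    all_read1_consensus_file='/path/to/read1_consensus.fa')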