Ejemplo n.º 1
0
def merge_snps_files(directory):
    """Merge the phased VCF files found under one directory into one VCF.

    Every ``<directory>/*_dir/*_phased.vcf`` file is collected. Their
    non-header lines are concatenated, the header (lines starting with ``#``)
    is taken from the first file, and both parts are joined into
    ``<directory>/<N>_phased_snps_files.vcf`` where ``N`` is the number of
    input files. The intermediate header/body files are removed at the end.

    Returns 0 on success, or a non-zero code as soon as one step fails
    (including when no input file is found).
    """
    all_vcf_files = glob(os.path.join(directory, '*_dir', '*_phased.vcf'))
    if not all_vcf_files:
        # Nothing to merge; the header would be read from all_vcf_files[0].
        return 1

    nb_files = len(all_vcf_files)
    output_file_body = os.path.join(
        directory, '%s_snps_files.vcf.body' % nb_files)
    output_file_header = os.path.join(
        directory, '%s_snps_files.vcf.header' % nb_files)
    output_file = os.path.join(
        directory, '%s_phased_snps_files.vcf' % nb_files)

    # concatenate_file hands back a falsy value when the merge failed.
    output_file_body = concatenate_file(all_vcf_files,
                                        output_file_body,
                                        filter="^#")
    if not output_file_body:
        return 1

    commands = [
        'grep  "^#" %s > %s ' % (all_vcf_files[0], output_file_header),
        'cat %s %s > %s ' % (output_file_header, output_file_body,
                             output_file),
        'rm %s %s' % (output_file_header, output_file_body),
    ]
    return_code = 0
    for command in commands:
        return_code = command_runner.run_command(command)
        if return_code != 0:
            # Stop at the first failing step and report its code.
            break
    return return_code
Ejemplo n.º 2
0
def prepare_genome(genome_file, color_space=False):
    """Index a genome fasta file with ``bwa index``.

    The fasta records are read (until the cumulated sequence length exceeds
    1,000,000,000) to choose the indexing algorithm: ``bwtsw`` above that
    length, ``is`` otherwise. When ``color_space`` is true the ``-c`` flag is
    added to the command.

    Returns True when the indexing command exits with code 0, False otherwise.
    """
    pipeline_param = utils_param.get_pipeline_parameters()
    BWA_dir = pipeline_param.get_bwa_dir()
    BWA_bin = os.path.join(BWA_dir, 'bwa')

    length_threshold = 1000000000
    length = 0
    genome_loader = GenomeLoader(genome_file=genome_file)
    try:
        for header, sequence in genome_loader:
            length += len(sequence)
            if length > length_threshold:
                # The algorithm choice is settled, no need to read further.
                break
    finally:
        # Release the loader even if reading the genome fails.
        genome_loader.close()

    # NOTE(review): the previous comment quoted a 10M limit while the code
    # compares against 1e9 -- confirm which threshold is intended.
    a_option = 'bwtsw' if length > length_threshold else 'is'

    # Create the indexes
    if color_space:
        command = '%s index -c -a %s %s' % (BWA_bin, a_option, genome_file)
    else:
        command = '%s index -a %s %s' % (BWA_bin, a_option, genome_file)
    # Report the real outcome instead of always claiming success.
    return command_runner.run_command(command) == 0
Ejemplo n.º 3
0
def run_assembly(assembly_function, fastq_file, output_dir=None, estimated_size=600, subsample_nb_read=None, rg_ids=[], name=None,
                 adapter_file=None):
    """Clean a fastq file, assemble it and post-process the contigs.

    assembly_function -- callable taking the cleaned fastq file and an
                         ``estimated_size`` keyword, returning the contig
                         file path (or a falsy value when nothing was built).
    fastq_file        -- reads to assemble.
    output_dir        -- when it exists, the work is done from inside it.
    name              -- label passed to ``correct_contig_file``; defaults to
                         the fastq base name without its extension.
    rg_ids, subsample_nb_read, adapter_file -- forwarded to ``clean_fastq``
                         (``rg_ids`` is only passed through, never modified
                         here).

    Returns the tuple ``(corrected_contig_file, nb_seq, max_len)``; it is
    ``(None, 0, 0)`` when the assembler produced no contig file.
    """
    if name is None:
        name, ext = os.path.splitext(os.path.basename(fastq_file))
    current_dir = None
    if output_dir and os.path.exists(output_dir):
        logging.debug('change directory to %s'%output_dir)
        current_dir = os.getcwd()
        os.chdir(output_dir)
    try:
        fastq_file = clean_fastq(fastq_file, adapter_file=adapter_file, rg_ids=rg_ids, subsample_nb_read=subsample_nb_read)
        contig_file = assembly_function(fastq_file, estimated_size=estimated_size)
        if contig_file:
            # Resolve the path while still inside output_dir.
            contig_file = os.path.abspath(contig_file)
            merged_consensus = os.path.join(os.path.dirname(contig_file), 'merged_consensus.fa')
            if os.path.exists(merged_consensus):
                logging.debug('remove the merged_consensus.fa that already exists before assembling')
                command = 'rm -f %s'%(merged_consensus)
                command_runner.run_command(command)
    finally:
        # Always go back, otherwise a failing assembler would leave the
        # whole process inside output_dir.
        if current_dir:
            logging.debug('change directory back to %s'%current_dir)
            os.chdir(current_dir)

    nb_seq = max_len = 0
    corrected_contig_file = None
    if contig_file:
        corrected_contig_file, nb_seq, max_len = correct_contig_file(contig_file, name)
    return (corrected_contig_file, nb_seq, max_len)
Ejemplo n.º 4
0
def align_short_reads_se(fastq_file1, genome_file, output_dir, sample_name,
                         thread, BWA_bin, samtools_bin, picard_dir,
                         read_group_command, files_and_dir, illumina, fifo):
    """Align single-end reads with ``bwa aln``/``bwa samse`` and sort them.

    ``bwa aln`` writes ``<output_dir>/<fastq base name>.sai`` (with ``-I``
    when ``illumina`` is true); that path is appended to ``files_and_dir``.
    ``bwa samse`` output is then piped through ``samtools view -bS`` and
    ``samtools sort``.

    ``picard_dir`` and ``fifo`` are accepted but not used here.

    Returns ``<output_dir>/<sample_name>_sorted``, the name handed to
    ``samtools sort``.

    NOTE(review): the exit codes of both commands are not acted upon -- the
    same value is returned whether or not the alignment succeeded, exactly as
    before. The previous code compared them with ``is not 0`` into a flag that
    was never read; that dead (and identity-based) check has been removed.
    """
    fastq_name, ext = os.path.splitext(os.path.basename(fastq_file1))
    sai_file1 = '%s.sai' % os.path.join(output_dir, fastq_name)
    illumina_str = " -I " if illumina else ""
    command = '%s aln %s -t %s %s %s > %s' % (
        BWA_bin, illumina_str, thread, genome_file, fastq_file1, sai_file1)
    command_runner.run_command(command)
    files_and_dir.append(sai_file1)

    # only one end so just run get the sorted bam file
    bam_file = os.path.join(output_dir, sample_name + "_sorted")
    command = """%s samse %s %s %s %s | %s view -bS - | %s sort - %s""" % (
        BWA_bin, read_group_command, genome_file, sai_file1, fastq_file1,
        samtools_bin, samtools_bin, bam_file)
    command_runner.run_command(command)
    return bam_file
def run_smalt_paired(consensus_file, read1_fastq, read2_fastq, **kwarg):
    """Index a consensus with smalt and map a read pair onto it.

    Existing ``.sma``, ``.smi`` and ``.fai`` files next to the consensus are
    removed first, then ``smalt index`` is run. Read 2 is passed through
    ``reverse_complement`` before ``smalt map``.

    Returns the sam file name: the common prefix of both fastq names
    (trailing ``_`` stripped) followed by ``.sam``. Exit codes are ignored.
    """
    # Drop index files left over from a previous run.
    for suffix in ('sma', 'smi', 'fai'):
        stale_index = '%s.%s' % (consensus_file, suffix)
        if os.path.exists(stale_index):
            command_runner.run_command('rm -rf %s' % stale_index)

    command_runner.run_command(
        "smalt index %s %s" % (consensus_file, consensus_file))

    prefix = longest_common_substr_from_start(read1_fastq, read2_fastq)
    prefix = prefix.rstrip('_')
    read2_fastq_rev_comp = reverse_complement(read2_fastq)

    sam_file = prefix + '.sam'
    map_command = "smalt map -f samsoft -o %(sam)s %(ref)s %(r1)s %(r2)s" % {
        'sam': sam_file,
        'ref': consensus_file,
        'r1': read1_fastq,
        'r2': read2_fastq_rev_comp,
    }
    command_runner.run_command(map_command)
    return sam_file
Ejemplo n.º 6
0
def prepare_genome(genome_file, color_space=False):
    """Index a genome fasta file with ``bwa index``.

    The fasta records are read (until the cumulated sequence length exceeds
    1,000,000,000) to choose the indexing algorithm: ``bwtsw`` above that
    length, ``is`` otherwise. When ``color_space`` is true the ``-c`` flag is
    added to the command.

    Returns True when the indexing command exits with code 0, False otherwise.
    """
    pipeline_param = utils_param.get_pipeline_parameters()
    BWA_dir = pipeline_param.get_bwa_dir()
    BWA_bin = os.path.join(BWA_dir, 'bwa')

    length_threshold = 1000000000
    length = 0
    genome_loader = GenomeLoader(genome_file=genome_file)
    try:
        for header, sequence in genome_loader:
            length += len(sequence)
            if length > length_threshold:
                # The algorithm choice is settled, no need to read further.
                break
    finally:
        # Release the loader even if reading the genome fails.
        genome_loader.close()

    # NOTE(review): the previous comment quoted a 10M limit while the code
    # compares against 1e9 -- confirm which threshold is intended.
    a_option = 'bwtsw' if length > length_threshold else 'is'

    # Create the indexes
    if color_space:
        command = '%s index -c -a %s %s' % (BWA_bin, a_option, genome_file)
    else:
        command = '%s index -a %s %s' % (BWA_bin, a_option, genome_file)
    # Report the real outcome instead of always claiming success.
    return command_runner.run_command(command) == 0
def align_bwa_long(fastq_file1, fastq_file2, genome_file, sample_name,  read_group, analysis_type, output_dir,
                    BWA_bin, samtools_bin, picard_dir,  thread, sort, illumina, files_and_dir):
    """Align long reads with bwa and post-process the bam with Picard.

    ``bwa mem`` is used when ``fastq_file2`` is given, ``bwa bwasw``
    otherwise. The resulting bam goes through Picard FixMateInformation, then
    either AddOrReplaceReadGroups (when ``read_group`` is given; missing
    ID/LB/PL/PU/SM/CN values are set to 0) or a plain ``mv``. Intermediate
    files are appended to ``files_and_dir``.

    ``illumina`` and a non-None ``analysis_type`` are not supported: an error
    is logged and False is returned.

    Returns the final bam path, or False as soon as a step failed (the
    remaining commands are then skipped).
    """
    if illumina:
        logging.error("long read alignment do not support illumina format")
        return False
    if analysis_type is not None:
        logging.error("long read alignment do not support %s analsyis"%(analysis_type))
        return False

    tmp_bam_file = os.path.join(output_dir, sample_name + '_tmp.bam')
    files_and_dir.append(tmp_bam_file)
    if fastq_file2:
        command = '%s mem -t %s %s %s %s | %s view -bS - > %s'%(BWA_bin, thread, genome_file, fastq_file1, fastq_file2, samtools_bin, tmp_bam_file)
    else:
        command = '%s bwasw -t %s %s %s | %s view -bS - > %s'%(BWA_bin, thread, genome_file, fastq_file1, samtools_bin, tmp_bam_file)
    # Exit codes are compared by value (the former "is not 0" relied on
    # integer object identity).
    run_fine = command_runner.run_command(command) == 0

    if sort:
        bam_file = os.path.join(output_dir, sample_name + '_sorted.bam')
        sort_order = 'coordinate'
    else:
        bam_file = os.path.join(output_dir, sample_name + '.bam')
        sort_order = 'queryname'

    # bwa screw up the mate information
    fixmate_jar = os.path.join(picard_dir, 'FixMateInformation.jar')
    fixed_bam_file = os.path.join(output_dir, sample_name + '_fixed.bam')
    command = 'java -jar -Xmx2G %s I=%s O=%s SO=%s VALIDATION_STRINGENCY=LENIENT'%(fixmate_jar, tmp_bam_file, fixed_bam_file, sort_order)
    if run_fine:
        run_fine = command_runner.run_command(command) == 0

    if read_group:
        files_and_dir.append(fixed_bam_file)
        read_group_param = []
        read_group_elements = extract_read_group(read_group)
        replace_readgroup_jar = os.path.join(picard_dir, 'AddOrReplaceReadGroups.jar')
        for key in ['ID', 'LB', 'PL', 'PU', 'SM', 'CN']:
            # "in" works on Python 2 and 3, unlike dict.has_key.
            if key in read_group_elements:
                read_group_param.append('%s="%s"'%(key, read_group_elements.get(key)))
            else:
                read_group_param.append('%s=0'%(key))

        command = 'java -jar -Xmx2G %s I=%s O=%s SO=%s %s VALIDATION_STRINGENCY=LENIENT'%(replace_readgroup_jar, fixed_bam_file, bam_file, sort_order, ' '.join(read_group_param))
    else:
        # Without read group the fixed bam simply becomes <sample_name>.bam,
        # whatever the sort flag.
        bam_file = os.path.join(output_dir, sample_name + '.bam')
        command = 'mv %s %s'%(fixed_bam_file, bam_file)
    if run_fine:
        run_fine = command_runner.run_command(command) == 0

    if run_fine:
        return bam_file
    return False
Ejemplo n.º 8
0
def run_velvet(fastq_file_name, kmer_length=29, output_dir= 'velvet', **kwarg):
    """Run velveth then velvetg and return the resulting contig file.

    Both commands redirect to ``<output_dir>.log`` (the second run overwrites
    what the first one wrote). Exit codes are ignored.

    Returns ``<output_dir>/contigs.fa`` when exactly one such file is found,
    None otherwise.

    NOTE(review): with ``2>&1 >log`` only stdout ends up in the log file;
    stderr follows the caller's stdout -- confirm this ordering is intended.
    """
    log_file = '%s.log' % (output_dir)

    velveth_command = "%(bin)s %(dir)s %(kmer)s -fastq -short %(reads)s 2>&1 >%(log)s" % {
        'bin': velveth_bin,
        'dir': output_dir,
        'kmer': kmer_length,
        'reads': fastq_file_name,
        'log': log_file,
    }
    command_runner.run_command(velveth_command)

    velvetg_command = "%(bin)s %(dir)s 2>&1 >%(log)s" % {
        'bin': velvetg_bin,
        'dir': output_dir,
        'log': log_file,
    }
    command_runner.run_command(velvetg_command)

    matches = glob('%s/contigs.fa' % output_dir)
    return matches[0] if len(matches) == 1 else None
def SNP_call_with_samtools(samtools_dir, name, bam_file, ref_file):
    """Call variants with samtools mpileup/bcftools, then filter them.

    Writes ``<name>_sorted_mrk_dup_fixed_samtools.vcf`` and a filtered
    ``<name>_sorted_mrk_dup_fixed_samtools_filterd20q60.vcf`` (vcfutils.pl
    varFilter followed by an awk filter keeping header lines and records
    whose 6th column is above 60).

    Returns None; the exit codes of both commands are ignored.
    """
    samtools_bin = os.path.join(samtools_dir, "samtools")
    # bcftools is expected in a "bcftools" sub-directory, otherwise directly
    # in the samtools directory.
    nested_bcftools = os.path.join(samtools_dir, "bcftools/bcftools")
    if os.path.exists(nested_bcftools):
        bcftools_bin = nested_bcftools
    else:
        bcftools_bin = os.path.join(samtools_dir, "bcftools")

    samtools_raw_vcf = os.path.join(name + '_sorted_mrk_dup_fixed_samtools.vcf')
    pileup_part = "%s mpileup -d 50000 -ADESuf %s %s" % (samtools_bin, ref_file, bam_file)
    calling_part = "%s view -gv - > %s" % (bcftools_bin, samtools_raw_vcf)
    command_runner.run_command(pileup_part + " | " + calling_part)

    samtools_raw_filtered = os.path.join(name + '_sorted_mrk_dup_fixed_samtools_filterd20q60.vcf')
    depth_part = "vcfutils.pl varFilter -d 20 %s" % (samtools_raw_vcf,)
    quality_part = "awk '{if (/^#/ || $6>60){print}}' > %s" % (samtools_raw_filtered,)
    command_runner.run_command(depth_part + " | " + quality_part)
Ejemplo n.º 10
0
def run_velvet(fastq_file_name, kmer_length=29, output_dir='velvet', **kwarg):
    """Run velveth then velvetg and return the resulting contig file.

    Both commands redirect to ``<output_dir>.log`` (the second run overwrites
    what the first one wrote). Exit codes are ignored.

    Returns ``<output_dir>/contigs.fa`` when exactly one such file is found,
    None otherwise.

    NOTE(review): with ``2>&1 >log`` only stdout ends up in the log file;
    stderr follows the caller's stdout -- confirm this ordering is intended.
    """
    log_file = '%s.log' % (output_dir)

    # Step 1: hash the reads.
    hashing = "%(bin)s %(dir)s %(kmer)s -fastq -short %(reads)s 2>&1 >%(log)s" % {
        'bin': velveth_bin,
        'dir': output_dir,
        'kmer': kmer_length,
        'reads': fastq_file_name,
        'log': log_file,
    }
    command_runner.run_command(hashing)

    # Step 2: build the graph and the contigs.
    graphing = "%(bin)s %(dir)s 2>&1 >%(log)s" % {
        'bin': velvetg_bin,
        'dir': output_dir,
        'log': log_file,
    }
    command_runner.run_command(graphing)

    found = glob('%s/contigs.fa' % output_dir)
    if len(found) != 1:
        return None
    return found[0]
def run_smalt_single(consensus_file, read1_fastq, **kwarg):
    """Map single reads onto a consensus with ``smalt map``.

    Returns the sam file name: the fastq path without its extension followed
    by ``_single.sam``. The exit code of smalt is ignored.
    """
    sam_file = os.path.splitext(read1_fastq)[0] + '_single.sam'
    map_command = "smalt map -f samsoft -o %(sam)s %(ref)s %(reads)s" % {
        'sam': sam_file,
        'ref': consensus_file,
        'reads': read1_fastq,
    }
    command_runner.run_command(map_command)
    return sam_file
Ejemplo n.º 12
0
def build_jerry(repo, coverage):
    """Run ``tools/build.py`` inside ``repo`` with the debug/sanitizer flags.

    When ``coverage`` is truthy the gcov compile flags and ``--link-lib gcov``
    are appended to the command.

    Raises Exception when ``run_command`` returns a truthy value.
    """
    compile_flags = (
        '-fsanitize=address',
        '-m32',
        '-fno-omit-frame-pointer',
        '-fno-common',
        '-g',
    )

    build_command = ['tools/build.py', '--clean', '--debug']
    build_command.extend(f'--compile-flag={flag}' for flag in compile_flags)
    build_command.extend([
        '--strip=off',
        '--system-allocator=on',
        '--logging=on',
        '--linker-flag=-fuse-ld=gold',
        '--error-messages=on',
        '--profile=es2015-subset',
    ])

    if coverage:
        build_command.extend([
            '--compile-flag=-fprofile-arcs',
            '--compile-flag=-ftest-coverage',
            '--link-lib',
            'gcov',
        ])

    status = run_command(build_command, cwd=repo, debug=True)
    if status:
        raise Exception(f'{build_command} failed!')
Ejemplo n.º 13
0
def run_velvetOptimiser(fastq_file_name, low_k=59, high_k=99, outputdir='velvetopt', **kwarg):
    """Run VelvetOptimiser over a k-mer range and return the contig file.

    An existing ``outputdir`` is removed first. The output of the run is
    redirected to ``<outputdir>.log``; exit codes are ignored.

    Returns ``<outputdir>/contigs.fa`` when exactly one such file is found,
    None otherwise.
    """
    if os.path.exists(outputdir):
        command_runner.run_command('rm -rf %s' % outputdir)

    log_file = '%s.log' % outputdir
    optimiser_command = (
        "%(bin)s -f '-fastq -short %(reads)s' --s %(low)s --e %(high)s"
        " --k max --c max --d %(out)s 2>&1 >%(log)s"
    ) % {
        'bin': velvetOptimiser_bin,
        'reads': fastq_file_name,
        'low': low_k,
        'high': high_k,
        'out': outputdir,
        'log': log_file,
    }
    command_runner.run_command(optimiser_command)

    # A single contig file is expected after a successful run.
    found = glob('%s/contigs.fa' % outputdir)
    return found[0] if len(found) == 1 else None
Ejemplo n.º 14
0
def run_cap3(fastq_file_name, output_dir="cap3", **kwarg):
    """Convert a fastq file to fasta with seqtk and assemble it with cap3.

    The fasta file is written to ``<output_dir>/<fastq base name>.fa``
    (``output_dir`` is created when missing) and the cap3 output is
    redirected to ``<output_dir>.log``. Exit codes are ignored.

    Returns ``<fasta file>.cap.contigs`` when exactly one such file is found,
    None otherwise.
    """
    log_file = '%s.log' % output_dir
    fasta_name = os.path.basename(fastq_file_name) + '.fa'
    fasta_file = os.path.join(output_dir, fasta_name)

    if not os.path.exists(output_dir):
        command_runner.run_command('mkdir %s' % output_dir)

    conversion = "seqtk seq -A %(fastq)s > %(fasta)s" % {
        'fastq': fastq_file_name,
        'fasta': fasta_file,
    }
    command_runner.run_command(conversion)

    assembly = "%(bin)s %(fasta)s 2>&1 >%(log)s" % {
        'bin': cap3_bin,
        'fasta': fasta_file,
        'log': log_file,
    }
    command_runner.run_command(assembly)

    found = glob('%s.cap.contigs' % (fasta_file))
    return found[0] if len(found) == 1 else None
Ejemplo n.º 15
0
def run_all_fastq_files(directory):
    """Compute base frequencies per ``*_dir`` sub-directory, then merge them.

    ``calculate_base_frequency_for_snps`` is called for each
    ``<directory>/*_dir`` with the directory name minus its ``_dir`` suffix.
    For every sample it reported, the ``*_<sample>.allelefreq`` files of all
    sub-directories are concatenated into
    ``<directory>/samtools_snps_<sample>.allelefreq``.

    Returns None.
    """
    directory = os.path.abspath(directory)
    all_dirs = glob(os.path.join(directory, '*_dir'))
    all_samples = set()
    for sub_dir in all_dirs:
        # Parenthesised form: same output on Python 2, valid on Python 3.
        print(sub_dir)
        name = os.path.basename(sub_dir)[:-len("_dir")]
        samples = calculate_base_frequency_for_snps(sub_dir, name)
        all_samples.update(set(samples))

    for sample in all_samples:
        # concatenate the allele frequency file per samples
        merged_file = os.path.join(directory, 'samtools_snps_%s.allelefreq'%sample)
        command = 'cat %s/*_dir/*_%s.allelefreq > %s'%(directory, sample, merged_file)
        command_runner.run_command(command)
    return
Ejemplo n.º 16
0
def concatenate_file(list_of_file, output_file=None, **kwargs):
    """Concatenate text files into one, optionally dropping some lines.

    list_of_file -- paths of the files to concatenate.
    output_file  -- destination; when not given, the first unused
                    ``tmp_concatenate_<i>`` name is picked in the
                    ``output_dir`` keyword argument, or in the current
                    working directory.
    filter       -- optional keyword: lines matching this ``egrep`` pattern
                    are removed from the output.

    Returns the output file path when the command exits with code 0, None
    otherwise (also when ``list_of_file`` is empty).
    """
    if not list_of_file:
        # With no file name on its command line ``cat`` would read stdin.
        return None

    if not output_file:
        # Create a generic name and put it in the current working directory
        if 'output_dir' in kwargs:
            working_directory = kwargs.get('output_dir')
        else:
            working_directory = os.getcwd()
        i = 1
        output_file_template = os.path.join(working_directory, 'tmp_concatenate_%s')
        output_file = output_file_template%i
        while os.path.exists(output_file):
            i += 1
            output_file = output_file_template%i

    # "in" replaces dict.has_key, which no longer exists on Python 3.
    if 'filter' in kwargs:
        filter_on = kwargs.get('filter')
        command = 'cat %s | egrep -v %s > %s '%(' '.join(list_of_file), filter_on, output_file)
    else:
        command = 'cat %s > %s '%(' '.join(list_of_file), output_file)
    return_code = command_runner.run_command(command)
    if return_code == 0:
        return output_file
    return None
def run_smalt_single(consensus_file, read1_fastq, **kwarg):
    """Map single reads onto a consensus with ``smalt map``.

    Returns the sam file name: the fastq path without its extension followed
    by ``_single.sam``. The exit code of smalt is ignored.
    """
    base_name = os.path.splitext(read1_fastq)[0]
    sam_file = base_name + '_single.sam'
    mapping = "smalt map -f samsoft -o %(sam)s %(ref)s %(reads)s" % {
        'sam': sam_file,
        'ref': consensus_file,
        'reads': read1_fastq,
    }
    command_runner.run_command(mapping)
    return sam_file
Ejemplo n.º 18
0
def run_blast(contig_file, genome_file):
    """Blast contigs against a genome database with blastn.

    The tabular result (``-outfmt 6``, one target per query) is written to
    ``<contig_file>.blast6out``.

    Returns that path when blastn exits with code 0, None otherwise.
    """
    blastn_plus_bin = '/ifs/software/linux_x86_64/blast+/current/bin/blastn'
    output_file = '%s.blast6out' % contig_file
    blast_command = (
        '%(bin)s -query %(query)s -db %(db)s -max_target_seqs 1'
        ' -outfmt 6 -out %(out)s'
    ) % {
        'bin': blastn_plus_bin,
        'query': contig_file,
        'db': genome_file,
        'out': output_file,
    }
    succeeded = command_runner.run_command(blast_command) == 0
    return output_file if succeeded else None
Ejemplo n.º 19
0
def clone_jerry(repo):
    """Clone the jerryscript GitHub repository into ``repo``.

    The ``git clone`` command is run from ``ROOT_DIR``.

    Raises Exception when ``run_command`` returns a truthy value.
    """
    repo_url = 'https://github.com/jerryscript-project/jerryscript.git'
    command = ['git', 'clone', repo_url, repo]

    failed = run_command(command, cwd=ROOT_DIR, debug=True)
    if failed:
        raise Exception(f'{command} failed!')
Ejemplo n.º 20
0
def run_cap3(fastq_file_name, output_dir="cap3", **kwarg):
    """Convert a fastq file to fasta with seqtk and assemble it with cap3.

    The fasta file is written to ``<output_dir>/<fastq base name>.fa``
    (``output_dir`` is created when missing) and the cap3 output is
    redirected to ``<output_dir>.log``. Exit codes are ignored.

    Returns ``<fasta file>.cap.contigs`` when exactly one such file is found,
    None otherwise.
    """
    log_file = '%s.log' % output_dir
    fasta_file = os.path.join(
        output_dir, os.path.basename(fastq_file_name) + '.fa')

    output_dir_missing = not os.path.exists(output_dir)
    if output_dir_missing:
        command_runner.run_command('mkdir %s' % output_dir)

    # fastq -> fasta
    command_runner.run_command(
        "seqtk seq -A %(fastq)s > %(fasta)s" % {'fastq': fastq_file_name,
                                               'fasta': fasta_file})
    # assembly
    command_runner.run_command(
        "%(bin)s %(fasta)s 2>&1 >%(log)s" % {'bin': cap3_bin,
                                            'fasta': fasta_file,
                                            'log': log_file})

    candidates = glob('%s.cap.contigs' % (fasta_file))
    if len(candidates) != 1:
        return None
    return candidates[0]
Ejemplo n.º 21
0
def run_clc_assemble(fastq_file_name, word_size=None, output_dir='clc_bio', **kwarg):
    """Assemble reads with the CLC de novo assembler.

    ``output_dir`` is created when missing; the contigs are written to
    ``<output_dir>/contigs.fa`` and the command output is redirected to
    ``<output_dir>.log``. ``word_size`` adds the ``-w`` option when given.
    Exit codes are ignored.

    Returns ``<output_dir>/contigs.fa`` when exactly one such file is found,
    None otherwise.
    """
    log_file = '%s.log'%output_dir
    # The assembler output used to be hard-coded to clc_bio/contigs.fa, so
    # any other output_dir was never populated. Same path for the default.
    contig_path = '%s/contigs.fa'%output_dir

    if not os.path.exists(output_dir):
        command_runner.run_command('mkdir %s'%output_dir)

    if word_size:
        command = "%s -v -w %s -q %s -o %s -b 200 -m 100 2>&1 >%s " % (
            clc_novo_bin, word_size, fastq_file_name, contig_path, log_file)
    else:
        command = "%s -v -q %s -o %s -b 200 -m 100 2>&1 >%s " % (
            clc_novo_bin, fastq_file_name, contig_path, log_file)
    command_runner.run_command(command)

    contig_files = glob(contig_path)
    contig_file_name = None
    if len(contig_files) == 1:
        contig_file_name = contig_files[0]
    return contig_file_name
def SNP_call_with_samtools(samtools_dir, name, bam_file, ref_file):
    """Call variants with samtools mpileup/bcftools, then filter them.

    Writes ``<name>_sorted_mrk_dup_fixed_samtools.vcf`` and a filtered
    ``<name>_sorted_mrk_dup_fixed_samtools_filterd20q60.vcf`` (vcfutils.pl
    varFilter followed by an awk filter keeping header lines and records
    whose 6th column is above 60).

    Returns None; the exit codes of both commands are ignored.
    """
    samtools_bin = os.path.join(samtools_dir, "samtools")
    # Prefer <samtools_dir>/bcftools/bcftools, fall back to
    # <samtools_dir>/bcftools.
    bcftools_bin = os.path.join(samtools_dir, "bcftools/bcftools")
    bcftools_bin = (bcftools_bin if os.path.exists(bcftools_bin)
                    else os.path.join(samtools_dir, "bcftools"))

    samtools_raw_vcf = os.path.join(name +
                                    '_sorted_mrk_dup_fixed_samtools.vcf')
    raw_calling = " | ".join([
        "%s mpileup -d 50000 -ADESuf %s %s" % (samtools_bin, ref_file,
                                               bam_file),
        "%s view -gv - > %s" % (bcftools_bin, samtools_raw_vcf),
    ])
    command_runner.run_command(raw_calling)

    samtools_raw_filtered = os.path.join(
        name + '_sorted_mrk_dup_fixed_samtools_filterd20q60.vcf')
    filtering = " | ".join([
        "vcfutils.pl varFilter -d 20 %s" % (samtools_raw_vcf,),
        "awk '{if (/^#/ || $6>60){print}}' > %s" % (samtools_raw_filtered,),
    ])
    command_runner.run_command(filtering)
Ejemplo n.º 23
0
def run_blast(contig_file, genome_file):
    """Blast contigs against a genome database with blastn.

    The tabular result (``-outfmt 6``, one target per query) is written to
    ``<contig_file>.blast6out``.

    Returns that path when blastn exits with code 0, None otherwise.
    """
    blastn_plus_bin = '/ifs/software/linux_x86_64/blast+/current/bin/blastn'
    output_file = '%s.blast6out' % contig_file
    arguments = {
        'bin': blastn_plus_bin,
        'query': contig_file,
        'db': genome_file,
        'out': output_file,
    }
    command = ('%(bin)s -query %(query)s -db %(db)s -max_target_seqs 1 '
               '-outfmt 6 -out %(out)s') % arguments
    if command_runner.run_command(command) == 0:
        return output_file
    return None
Ejemplo n.º 24
0
def apply_patch(repo, patch_file):
    """Apply a patch file to the repository with ``git apply``.

    repo       -- directory the git command is run from.
    patch_file -- path of the patch; it is made absolute before being handed
                  to git, since the command runs from ``repo``.

    Raises Exception when the patch file does not exist or when
    ``run_command`` returns a truthy value.
    """
    if not isfile(patch_file):
        # The message used to mention a "hash file" (copy/paste leftover).
        raise Exception(f'Cannot find patch file: {patch_file}')

    patch_file = abspath(patch_file)
    command = ['git', 'apply', patch_file]

    if run_command(command, cwd=repo, debug=True):
        raise Exception(f'{command} failed!')
def trim_fastq_to_length(fastq_file, output_file,  length):
    """Write a copy of a fastq file with its reads cut to ``length``.

    Lines 2 and 4 of every 4-line record are truncated to their first
    ``length`` characters with awk; the other lines are copied unchanged.
    Files whose name ends with ``.gz`` are read with ``zcat``, others with
    ``cat``. The result is written to ``output_file``.

    Returns the exit code reported by ``command_runner.run_command``.
    """
    opener = 'zcat' if fastq_file.endswith('.gz') else 'cat'

    # The awk modulo operators must be written "%%": a bare "%4=" is read by
    # Python's % formatting as a conversion and raises ValueError.
    command = '''%s %s | awk '{if (NR%%4==2 || NR%%4==0){print substr($0, 1,%s)}else{print $0}}' > %s'''%(opener, fastq_file, length, output_file)
    return_code = command_runner.run_command(command)
    return return_code
Ejemplo n.º 26
0
def run_all_fastq_files(directory):
    """Compute base frequencies per ``*_dir`` sub-directory, then merge them.

    ``calculate_base_frequency_for_snps`` is called for each
    ``<directory>/*_dir`` with the directory name minus its ``_dir`` suffix.
    For every sample it reported, the ``*_<sample>.allelefreq`` files of all
    sub-directories are concatenated into
    ``<directory>/samtools_snps_<sample>.allelefreq``.

    Returns None.
    """
    directory = os.path.abspath(directory)
    all_dirs = glob(os.path.join(directory, '*_dir'))
    all_samples = set()
    for sub_dir in all_dirs:
        # Parenthesised form: same output on Python 2, valid on Python 3.
        print(sub_dir)
        name = os.path.basename(sub_dir)[:-len("_dir")]
        samples = calculate_base_frequency_for_snps(sub_dir, name)
        all_samples.update(set(samples))

    for sample in all_samples:
        # concatenate the allele frequency file per samples
        merged_file = os.path.join(directory,
                                   'samtools_snps_%s.allelefreq' % sample)
        command = 'cat %s/*_dir/*_%s.allelefreq > %s' % (directory, sample,
                                                         merged_file)
        command_runner.run_command(command)
    return
Ejemplo n.º 27
0
def run_soapdenovo(fastq_file_name, max_read_len=101, **kwarg):
    """Assemble reads with SOAPdenovo (pregraph then contig steps).

    A ``soapdenovo`` directory is created in the current working directory
    when missing, and a config file declaring ``max_read_len`` and the fastq
    file is written into it. The output of both steps goes to
    ``soapdenovo.log``. Exit codes are ignored.

    Returns ``soapdenovo/graph.contig`` when that file exists, None otherwise.
    """
    log_file = 'soapdenovo.log'
    if not os.path.exists('soapdenovo'):
        command_runner.run_command('mkdir soapdenovo')

    config_file = 'soapdenovo/config_file'
    # "with" closes the handle even when the write fails.
    with open(config_file, 'w') as open_file:
        open_file.write("max_rd_len=%s\n[LIB]\nq=%s\n"%(max_read_len, fastq_file_name))

    command = '%s pregraph -K 29 -s %s -o soapdenovo/graph -p 1 2>&1 >%s'%(SOAPdenovo_bin, config_file, log_file)
    command_runner.run_command(command)
    # The second step appends (>>) to the log of the first one.
    command = '%s contig -g soapdenovo/graph 2>&1 >>%s'%(SOAPdenovo_bin, log_file)
    command_runner.run_command(command)

    contig_files = glob('soapdenovo/graph.contig')
    contig_file_name = None
    if len(contig_files) == 1:
        contig_file_name = contig_files[0]
    return contig_file_name
Ejemplo n.º 28
0
def run_idba(fastq_file_name, max_read_len=200, **kwarg):
    """Assemble reads with idba_ud into the ``idba_ud`` directory.

    ``max_read_len`` is the value given to ``--min_contig``. The command
    output is redirected to ``idba_ud.log`` and the exit code is ignored.

    Returns ``idba_ud/contig.fa`` when that file exists, None otherwise.
    """
    log_file = 'idba_ud.log'
    assembly_command = "%(bin)s -r %(reads)s -o idba_ud --min_contig %(min_contig)s --num_threads 1 --mink 40  --min_count 8 --min_support 4   2>&1 >%(log)s" % {
        'bin': idba_ud_bin,
        'reads': fastq_file_name,
        'min_contig': max_read_len,
        'log': log_file,
    }
    command_runner.run_command(assembly_command)

    found = glob('idba_ud/contig.fa')
    return found[0] if len(found) == 1 else None
def fastq_2_bam(fastq_file, rgid, qual, files_and_dir, fifo):
    """Build and run a Picard-style fastq to sam conversion command.

    NOTE(review): this function looks unfinished. It reads ``output_dir``,
    ``fastq_name`` and ``sam_file2``, none of which is a parameter or a local
    of this function, the jar and output paths are empty strings, and nothing
    is returned -- confirm the intended behaviour before relying on it.
    """
    # Placeholders: both the jar path and the output sam path are empty.
    fastqToSam_jar=''
    sam_file=''
    run_fine=True
    # rgid is used for the RG, LB and SM fields alike.
    # NOTE(review): the comma after SM=%s ends up in the command -- confirm.
    command="java -Xmx2G -jar %s F1=%s O=%s RG=%s LB=%s SM=%s, QUALITY_FORMAT=%s"%(fastqToSam_jar,fastq_file,sam_file,rgid,rgid,rgid,qual)
    if fifo:
        # With a fifo the conversion is started in the background.
        command+=" &"
    return_code = command_runner.run_command(command)
    # NOTE(review): "is not 0" compares identity, not value; run_fine is
    # never read afterwards.
    if return_code is not 0:
        run_fine = False 
    files_and_dir.append(sam_file)
    # NOTE(review): output_dir and fastq_name are not defined in this function.
    sam_file2_tmp='%s.sam.tmp'%os.path.join(output_dir, fastq_name)
    if fifo:
        # Recreate the named pipe from scratch.
        if os.path.exists(sam_file2_tmp):
            os.remove(sam_file2_tmp)
        command="mkfifo %s"%sam_file2_tmp
        return_code = command_runner.run_command( command)
    command = fix_read_name_in_sam_command(129, header=False)
    # NOTE(review): sam_file2 is not defined in this function, and the command
    # built here is never executed.
    command+=""" %s > %s"""%(sam_file2,sam_file2_tmp)
Ejemplo n.º 30
0
def run_idba(fastq_file_name, max_read_len=200, **kwarg):
    """Assemble reads with idba_ud into the ``idba_ud`` directory.

    ``max_read_len`` is the value given to ``--min_contig``. The command
    output is redirected to ``idba_ud.log`` and the exit code is ignored.

    Returns ``idba_ud/contig.fa`` when that file exists, None otherwise.
    """
    log_file = 'idba_ud.log'
    substitutions = {
        'bin': idba_ud_bin,
        'reads': fastq_file_name,
        'min_contig': max_read_len,
        'log': log_file,
    }
    template = "%(bin)s -r %(reads)s -o idba_ud --min_contig %(min_contig)s --num_threads 1 --mink 40  --min_count 8 --min_support 4   2>&1 >%(log)s"
    command_runner.run_command(template % substitutions)

    candidates = glob('idba_ud/contig.fa')
    if len(candidates) != 1:
        return None
    return candidates[0]
Ejemplo n.º 31
0
def run_flash(output_dir, fastq_1, fastq_2, overlap):
    """Merge overlapping read pairs with flash.

    ``overlap`` is given to ``-m`` and ``output_dir`` to ``-d``.

    Returns ``<output_dir>/out.extendedFrags.fastq``, or None when flash
    exits with a non-zero code or when that file is empty.
    """
    flash_command = "flash -m %(overlap)s -d %(out)s %(r1)s %(r2)s" % {
        'overlap': overlap,
        'out': output_dir,
        'r1': fastq_1,
        'r2': fastq_2,
    }
    if command_runner.run_command(flash_command) != 0:
        return None

    out_extended = os.path.join(output_dir, "out.extendedFrags.fastq")
    # flash can finish successfully without merging anything
    nothing_merged = os.path.getsize(out_extended) == 0
    return None if nothing_merged else out_extended
Ejemplo n.º 32
0
def trim_fastq_to_length(fastq_file, output_file, length):
    """Write a copy of a fastq file with its reads cut to ``length``.

    Lines 2 and 4 of every 4-line record are truncated to their first
    ``length`` characters with awk; the other lines are copied unchanged.
    Files whose name ends with ``.gz`` are read with ``zcat``, others with
    ``cat``. The result is written to ``output_file``.

    Returns the exit code reported by ``command_runner.run_command``.
    """
    opener = 'zcat' if fastq_file.endswith('.gz') else 'cat'

    # The awk modulo operators must be written "%%": a bare "%4=" is read by
    # Python's % formatting as a conversion and raises ValueError.
    command = '''%s %s | awk '{if (NR%%4==2 || NR%%4==0){print substr($0, 1,%s)}else{print $0}}' > %s''' % (
        opener, fastq_file, length, output_file)
    return_code = command_runner.run_command(command)
    return return_code
Ejemplo n.º 33
0
def run_one_fastq_file(fastq_file,
                       output_dir,
                       assembly_function_list,
                       estimated_size=600,
                       subsample_nb_read=None,
                       rg_ids=[],
                       read1_fasta=None,
                       name=None,
                       force_merge=False,
                       adapter_file=None):
    """Assemble one fastq file with several assemblers and keep the best.

    ``output_dir`` is created when missing. Each function of
    ``assembly_function_list`` is run through ``run_assembly``; when it
    yields a contig file, the read 1 contig and the new contigs are merged
    next to that file. ``get_best_assembly_merged`` then picks the assembly
    that is copied to ``<output_dir>/best_assembly.fa``.

    Returns ``<output_dir>/best_assembly.fa`` (the exit code of the copy is
    not checked).
    """
    fastq_file = os.path.abspath(fastq_file)
    best_assembly_copy = os.path.join(output_dir, "best_assembly.fa")

    if not os.path.exists(output_dir):
        command_runner.run_command('mkdir %s' % (output_dir))

    # Options shared by every assembler run.
    assembly_options = dict(estimated_size=estimated_size,
                            subsample_nb_read=subsample_nb_read,
                            rg_ids=rg_ids,
                            name=name,
                            adapter_file=adapter_file)
    for assembly_function in assembly_function_list:
        # Assemble with provided assembler
        contig_file, _nb_seq, _max_len = run_assembly(
            assembly_function, fastq_file, output_dir, **assembly_options)
        if not contig_file:
            continue
        # Merge read one and read2 contig
        # TODO: This function gets run twice need to change that as the second run is not useful
        merge_read1_and_read2_contigs(
            name,
            read1_contig=read1_fasta,
            read2_contigs=contig_file,
            output_dir=os.path.dirname(contig_file))

    selection = get_best_assembly_merged(output_dir, read1_fasta, name,
                                         force_merge)
    _best_assembler_name, best_assembly_file = selection

    command_runner.run_command(
        "cp %s %s" % (best_assembly_file, best_assembly_copy))
    return best_assembly_copy
Ejemplo n.º 34
0
def create_sequence_dictionary(picard_dir, genome_file):
    """Create the Picard sequence dictionary of a genome file when needed.

    The dictionary is ``<genome file without extension>.dict``. It is
    (re)built with ``CreateSequenceDictionary.jar`` from ``picard_dir`` when
    it does not exist or is older than the genome file.

    Returns the dictionary path, or None when the command exits with a
    non-zero code.
    """
    genome_dict = os.path.splitext(genome_file)[0] + '.dict'

    up_to_date = (
        os.path.exists(genome_dict)
        and os.path.getmtime(genome_file) <= os.path.getmtime(genome_dict)
    )
    if up_to_date:
        return genome_dict

    jar_path = os.path.join(picard_dir, 'CreateSequenceDictionary.jar')
    build_command = 'java -jar %(jar)s REFERENCE=%(ref)s O=%(out)s' % {
        'jar': jar_path,
        'ref': genome_file,
        'out': genome_dict,
    }
    if command_runner.run_command(build_command) != 0:
        return None
    return genome_dict
Ejemplo n.º 35
0
def checkout_to_hash(repo, hash_file):
    """Check out, in ``repo``, the commit whose hash is stored in a file.

    repo      -- directory the git command is run from.
    hash_file -- text file containing the commit hash.

    Raises Exception when the hash file does not exist, when it holds no
    hash, or when ``run_command`` returns a truthy value.
    """
    if not isfile(hash_file):
        raise Exception('Cannot find hash file in the given directory.')

    with open(hash_file, 'r') as hash_f:
        # Surrounding whitespace (typically the final newline) would be sent
        # to git as part of the revision name.
        git_hash = hash_f.read().strip()

    if not git_hash:
        raise Exception(f'Hash file {hash_file} is empty.')

    command = ['git', 'checkout', git_hash]

    if run_command(command, cwd=repo, debug=True):
        raise Exception(f'{command} failed!')
Ejemplo n.º 36
0
def merge_snps_files(directory):
    """Merge the phased VCF files found under one directory into one VCF.

    Every ``<directory>/*_dir/*_phased.vcf`` file is collected. Their
    non-header lines are concatenated, the header (lines starting with ``#``)
    is taken from the first file, and both parts are joined into
    ``<directory>/<N>_phased_snps_files.vcf`` where ``N`` is the number of
    input files. The intermediate header/body files are removed at the end.

    Returns 0 on success, or a non-zero code as soon as one step fails
    (including when no input file is found).
    """
    all_vcf_files = glob(os.path.join(directory, '*_dir', '*_phased.vcf'))
    if not all_vcf_files:
        # Nothing to merge; the header would be read from all_vcf_files[0].
        return 1

    nb_files = len(all_vcf_files)
    output_file_body = os.path.join(directory, '%s_snps_files.vcf.body' % nb_files)
    output_file_header = os.path.join(directory, '%s_snps_files.vcf.header' % nb_files)
    output_file = os.path.join(directory, '%s_phased_snps_files.vcf' % nb_files)

    # concatenate_file hands back a falsy value when the merge failed.
    output_file_body = concatenate_file(all_vcf_files, output_file_body, filter="^#")
    if not output_file_body:
        return 1

    commands = [
        'grep  "^#" %s > %s ' % (all_vcf_files[0], output_file_header),
        'cat %s %s > %s ' % (output_file_header, output_file_body, output_file),
        'rm %s %s' % (output_file_header, output_file_body),
    ]
    return_code = 0
    for command in commands:
        return_code = command_runner.run_command(command)
        if return_code != 0:
            # Stop at the first failing step and report its code.
            break
    return return_code
Ejemplo n.º 37
0
def fastq_2_bam(fastq_file, rgid, qual, files_and_dir, fifo):
    """Build and run a Picard-style fastq to sam conversion command.

    NOTE(review): this function looks unfinished. It reads ``output_dir``,
    ``fastq_name`` and ``sam_file2``, none of which is a parameter or a local
    of this function, the jar and output paths are empty strings, and nothing
    is returned -- confirm the intended behaviour before relying on it.
    """
    # Placeholders: both the jar path and the output sam path are empty.
    fastqToSam_jar = ''
    sam_file = ''
    run_fine = True
    # rgid is used for the RG, LB and SM fields alike.
    # NOTE(review): the comma after SM=%s ends up in the command -- confirm.
    command = "java -Xmx2G -jar %s F1=%s O=%s RG=%s LB=%s SM=%s, QUALITY_FORMAT=%s" % (
        fastqToSam_jar, fastq_file, sam_file, rgid, rgid, rgid, qual)
    if fifo:
        # With a fifo the conversion is started in the background.
        command += " &"
    return_code = command_runner.run_command(command)
    # NOTE(review): "is not 0" compares identity, not value; run_fine is
    # never read afterwards.
    if return_code is not 0:
        run_fine = False
    files_and_dir.append(sam_file)
    # NOTE(review): output_dir and fastq_name are not defined in this function.
    sam_file2_tmp = '%s.sam.tmp' % os.path.join(output_dir, fastq_name)
    if fifo:
        # Recreate the named pipe from scratch.
        if os.path.exists(sam_file2_tmp):
            os.remove(sam_file2_tmp)
        command = "mkfifo %s" % sam_file2_tmp
        return_code = command_runner.run_command(command)
    command = fix_read_name_in_sam_command(129, header=False)
    # NOTE(review): sam_file2 is not defined in this function, and the command
    # built here is never executed.
    command += """ %s > %s""" % (sam_file2, sam_file2_tmp)
Ejemplo n.º 38
0
def run_clc_assemble(fastq_file_name,
                     word_size=None,
                     output_dir='clc_bio',
                     **kwarg):
    """Assemble reads with the CLC de novo assembler.

    ``output_dir`` is created when missing; the contigs are written to
    ``<output_dir>/contigs.fa`` and the command output is redirected to
    ``<output_dir>.log``. ``word_size`` adds the ``-w`` option when given.
    Exit codes are ignored.

    Returns ``<output_dir>/contigs.fa`` when exactly one such file is found,
    None otherwise.
    """
    log_file = '%s.log' % output_dir
    # The assembler output used to be hard-coded to clc_bio/contigs.fa, so
    # any other output_dir was never populated. Same path for the default.
    contig_path = '%s/contigs.fa' % output_dir

    if not os.path.exists(output_dir):
        command_runner.run_command('mkdir %s' % output_dir)

    if word_size:
        command = "%s -v -w %s -q %s -o %s -b 200 -m 100 2>&1 >%s " % (
            clc_novo_bin, word_size, fastq_file_name, contig_path, log_file)
    else:
        command = "%s -v -q %s -o %s -b 200 -m 100 2>&1 >%s " % (
            clc_novo_bin, fastq_file_name, contig_path, log_file)
    command_runner.run_command(command)

    contig_files = glob(contig_path)
    contig_file_name = None
    if len(contig_files) == 1:
        contig_file_name = contig_files[0]
    return contig_file_name
Ejemplo n.º 39
0
def merge_all_summary_files_from_directories(directory):
    """Merge the summary files of all sub-directories into one file.

    Every ``<directory>/*_dir/*summary_stat.txt`` file is collected. The
    lines not starting with ``name`` are merged through ``merge_by_chunck``,
    the header is the first line of the first file, and both parts are joined
    into ``<directory>/all_summary_stat.txt``. The intermediate header/body
    files are removed at the end.

    Returns 0 on success, or a non-zero code as soon as one step fails
    (including when no summary file is found).
    """
    all_summary_files = glob(os.path.join(directory, '*_dir', '*summary_stat.txt'))
    if not all_summary_files:
        # Nothing to merge; the header would be read from all_summary_files[0].
        return 1

    output_file_body = os.path.join(directory, 'all_summary_stat.txt.body')
    output_file_header = os.path.join(directory, 'all_summary_stat.txt.header')
    output_file = os.path.join(directory, 'all_summary_stat.txt')

    output_file_body = merge_by_chunck(all_summary_files, concatenate_file, output_file_body, filter="^name")
    if not output_file_body:
        # A falsy result is treated as a failed merge (this check used to be
        # a no-op, so the failure went unnoticed).
        return 1

    commands = [
        'head -n 1 %s > %s '%(all_summary_files[0], output_file_header),
        'cat %s %s > %s '%(output_file_header, output_file_body, output_file),
        'rm %s %s'%(output_file_header, output_file_body),
    ]
    return_code = 0
    for command in commands:
        return_code = command_runner.run_command(command)
        if return_code != 0:
            # Stop at the first failing step and report its code.
            break
    return return_code
Ejemplo n.º 40
0
def merge_all_snps_files_from_directories(directory):
    """This function will merge the snps files across all the directories.

    Every ``*_snps_files.vcf`` file found in the ``*_dir`` sub-directories of
    ``directory`` is merged (lines matching ``^#`` are passed as the filter to
    the merging helper) into a temporary body file. The ``#`` lines of the
    first vcf file are used as the header of ``all_consensus_snps_files.vcf``
    and the temporary header and body files are removed afterwards.

    :param directory: directory containing the ``*_dir`` sub-directories
    :return: 0 on success; 1 when there is nothing to merge or the merge
        produced no body file; otherwise the return code of the first shell
        command that failed
    """
    import logging

    all_vcf_files = glob(os.path.join(directory, '*_dir', '*_snps_files.vcf'))
    if not all_vcf_files:
        # Without this guard all_vcf_files[0] below raises an IndexError.
        logging.warning('No snps file found in %s: nothing to merge' % directory)
        return 1
    output_file_body = os.path.join(directory, 'all_consensus_snps_files.vcf.body')
    output_file_body = merge_by_chunck(all_vcf_files, concatenate_file, output_file_body, filter="^#")
    if not output_file_body:
        # The merge did not return a body file, so there is nothing to cat.
        return 1
    output_file_header = os.path.join(directory, 'all_consensus_snps_files.vcf.header')
    output_file = os.path.join(directory, 'all_consensus_snps_files.vcf')
    commands = [
        'grep  "^#" %s > %s ' % (all_vcf_files[0], output_file_header),
        'cat %s %s > %s ' % (output_file_header, output_file_body, output_file),
        'rm %s %s' % (output_file_header, output_file_body),
    ]
    return_code = 0
    # Stop at the first failing command and report its return code.
    for command in commands:
        return_code = command_runner.run_command(command)
        if return_code != 0:
            break
    return return_code
Ejemplo n.º 41
0
def run_flash(output_dir, fastq_1, fastq_2, overlap):
    """Run flash on two fastq files and return the extended fragments file.

    :param output_dir: directory passed to flash with ``-d``
    :param fastq_1: first fastq file
    :param fastq_2: second fastq file
    :param overlap: value passed to flash with ``-m``
    :return: path to ``out.extendedFrags.fastq`` in output_dir, or None when
        flash failed or the file is empty
    """
    flash_command = "flash -m %s -d %s %s %s" % (overlap, output_dir, fastq_1,
                                                 fastq_2)
    if command_runner.run_command(flash_command) != 0:
        return None
    merged_reads = os.path.join(output_dir, "out.extendedFrags.fastq")
    # Flash can finish successfully without merging anything, which leaves
    # an empty file behind.
    merged_size = os.stat(merged_reads).st_size
    return merged_reads if merged_size != 0 else None
Ejemplo n.º 42
0
def run_velvetOptimiser(fastq_file_name,
                        low_k=59,
                        high_k=99,
                        outputdir='velvetopt',
                        **kwarg):
    """Assemble a fastq file with VelvetOptimiser and return the contig file.

    Any existing output directory is removed first. The optimiser output is
    redirected to ``<outputdir>.log``.

    :param fastq_file_name: fastq file given to velvet as short reads
    :param low_k: value passed with ``--s``
    :param high_k: value passed with ``--e``
    :param outputdir: directory passed with ``--d``
    :param kwarg: accepted and ignored
    :return: path to ``<outputdir>/contigs.fa``, or None when it is missing
    """
    if os.path.exists(outputdir):
        command_runner.run_command('rm -rf %s' % outputdir)

    optimiser_arguments = (velvetOptimiser_bin, fastq_file_name, low_k,
                           high_k, outputdir, '%s.log' % outputdir)
    command_runner.run_command(
        "%s -f '-fastq -short %s' --s %s --e %s --k max --c max --d %s 2>&1 >%s"
        % optimiser_arguments)

    # Exactly one contig file is expected when VelvetOptimiser ran
    # successfully; anything else is reported as no assembly.
    found_contigs = glob('%s/contigs.fa' % outputdir)
    return found_contigs[0] if len(found_contigs) == 1 else None
Ejemplo n.º 43
0
def run_soapdenovo(fastq_file_name, max_read_len=101, **kwarg):
    """Assemble a fastq file with SOAPdenovo and return the contig file.

    Works in a ``soapdenovo`` directory relative to the current directory:
    writes a config file there, then runs the ``pregraph`` and ``contig``
    steps, redirecting their output to ``soapdenovo.log``.

    :param fastq_file_name: fastq file written as ``q=`` in the config file
    :param max_read_len: value written as ``max_rd_len`` in the config file
    :param kwarg: accepted and ignored
    :return: ``soapdenovo/graph.contig`` when it exists, None otherwise
    """
    log_file = 'soapdenovo.log'
    if not os.path.exists('soapdenovo'):
        command_runner.run_command('mkdir soapdenovo')
    config_file = 'soapdenovo/config_file'
    # The context manager closes the file even if the write fails.
    with open(config_file, 'w') as open_file:
        open_file.write("max_rd_len=%s\n[LIB]\nq=%s\n" %
                        (max_read_len, fastq_file_name))
    # The return codes are ignored: success is judged on the contig file only.
    command = '%s pregraph -K 29 -s %s -o soapdenovo/graph -p 1 2>&1 >%s' % (
        SOAPdenovo_bin, config_file, log_file)
    command_runner.run_command(command)
    command = '%s contig -g soapdenovo/graph 2>&1 >>%s' % (SOAPdenovo_bin,
                                                           log_file)
    command_runner.run_command(command)
    contig_files = glob('soapdenovo/graph.contig')
    contig_file_name = None
    if len(contig_files) == 1:
        contig_file_name = contig_files[0]
    return contig_file_name
Ejemplo n.º 44
0
def copy_file_across(source, destination, server_source=None, server_destination=None, overwrite=False):
    """Copy source to destination with scp, optionally between servers.

    When source is a file and destination is a directory, the existence check
    is done on the file inside that directory. Nothing is copied (a warning
    is logged instead) when the target exists and overwrite is false.

    :param source: path to copy
    :param destination: path to copy to
    :param server_source: server holding source, or None for a local path
    :param server_destination: server holding destination, or None for local
    :param overwrite: copy even when the target already exists
    """
    source_is_file = check_file_or_dir(source, server_source) == 'file'
    if source_is_file and check_file_or_dir(destination, server_destination) == 'dir':
        destination_file = os.path.join(destination, os.path.basename(source))
    else:
        destination_file = destination

    if checkFile(destination_file, server_destination) and not overwrite:
        if server_destination:
            logging.warning('%s exist on %s use force to overwrite'%(destination_file, server_destination))
        else:
            logging.warning('%s exist use force to overwrite'%destination_file)
        return

    if server_source and server_destination:
        # Both ends are remote: stage the file locally in /tmp.
        tmp_file = '/tmp/%s'%(os.path.basename(source))
        command_runner.run_command('scp %s:%s %s'%(server_source, source, tmp_file))
        command_runner.run_command('scp %s %s:%s'%(tmp_file, server_destination, destination))
        os.remove(tmp_file)
        return

    if server_source:
        scp_command = 'scp %s:%s %s'%(server_source, source, destination)
    elif server_destination:
        scp_command = 'scp %s %s:%s'%(source, server_destination, destination)
    else:
        scp_command = 'scp %s %s'%(source, destination)
    command_runner.run_command(scp_command)
Ejemplo n.º 45
0
def clean_fastq(fastq_file,
                adapter_file=None,
                rg_ids=[],
                subsample_nb_read=None):
    """Filter, trim and optionally subsample a fastq file.

    Each step writes a new file next to its input and is skipped when that
    file already exists; the next step always continues from the step's
    output file.

    :param fastq_file: fastq file to clean
    :param adapter_file: adapter file for scythe; no adapter trimming if falsy
    :param rg_ids: when non-empty, passed with the fastq file to
        keep_read_from_samples before trimming
    :param subsample_nb_read: number passed to ``seqtk sample``; no
        subsampling if falsy
    :return: path to the last file produced by the enabled steps
    """
    if rg_ids:
        fastq_file = keep_read_from_samples(fastq_file, rg_ids)
    if adapter_file:
        adapter_trim = fastq_file + '.adapter_trimmed'
        if not os.path.exists(adapter_trim):
            command = "scythe -q sanger -a %s -o %s %s" % (
                adapter_file, adapter_trim, fastq_file)
            command_runner.run_command(command)
        fastq_file = adapter_trim
    qual_trim = fastq_file + ".qual_trimmed"
    if not os.path.exists(qual_trim):
        command = "sickle se -f %s -t sanger -o %s" % (fastq_file, qual_trim)
        command_runner.run_command(command)
    # Continue from the quality trimmed file even when it already existed;
    # otherwise a rerun would return or subsample the untrimmed reads.
    fastq_file = qual_trim
    if subsample_nb_read:
        sub_sampled = qual_trim + ".%s" % subsample_nb_read
        if not os.path.exists(sub_sampled):
            command = "seqtk sample %s %s > %s" % (
                fastq_file, subsample_nb_read, sub_sampled)
            command_runner.run_command(command)
        return sub_sampled
    return fastq_file
Ejemplo n.º 46
0
def run_assembly(assembly_function,
                 fastq_file,
                 output_dir=None,
                 estimated_size=600,
                 subsample_nb_read=None,
                 rg_ids=[],
                 name=None,
                 adapter_file=None):
    """Clean a fastq file, assemble it and correct the resulting contigs.

    When output_dir exists, the cleaning and the assembly run with it as the
    current working directory; the previous working directory is restored
    afterwards, including when one of these steps raises.

    :param assembly_function: callable taking the cleaned fastq file and an
        ``estimated_size`` keyword and returning a contig file (or a falsy
        value when no assembly was produced)
    :param fastq_file: fastq file to assemble
    :param output_dir: directory to work in; ignored if falsy or missing
    :param estimated_size: forwarded to assembly_function
    :param subsample_nb_read: forwarded to clean_fastq
    :param rg_ids: forwarded to clean_fastq
    :param name: name passed to correct_contig_file; defaults to the fastq
        file name without its extension
    :param adapter_file: forwarded to clean_fastq
    :return: tuple (corrected contig file, nb_seq, max_len) as returned by
        correct_contig_file, or (None, 0, 0) when there is no contig file
    """
    if name is None:
        name, ext = os.path.splitext(os.path.basename(fastq_file))
    current_dir = None
    if output_dir and os.path.exists(output_dir):
        logging.debug('change directory to %s' % output_dir)
        current_dir = os.getcwd()
        os.chdir(output_dir)
    try:
        fastq_file = clean_fastq(fastq_file,
                                 adapter_file=adapter_file,
                                 rg_ids=rg_ids,
                                 subsample_nb_read=subsample_nb_read)
        contig_file = assembly_function(fastq_file,
                                        estimated_size=estimated_size)
        if contig_file:
            # Make the path absolute while still in the working directory.
            contig_file = os.path.abspath(contig_file)
            merged_consensus = os.path.join(os.path.dirname(contig_file),
                                            'merged_consensus.fa')
            if os.path.exists(merged_consensus):
                logging.debug(
                    'remove the merged_consensus.fa that already exists before assembling'
                )
                command = 'rm -f %s' % (merged_consensus)
                command_runner.run_command(command)
    finally:
        # Always go back, so a failing step does not leave the process in
        # output_dir.
        if current_dir:
            logging.debug('change directory back to %s' % current_dir)
            os.chdir(current_dir)
    nb_seq = max_len = 0
    corrected_contig_file = None
    if contig_file:
        corrected_contig_file, nb_seq, max_len = correct_contig_file(
            contig_file, name)
    return (corrected_contig_file, nb_seq, max_len)
Ejemplo n.º 47
0
def convert_untrimmed_read(fastq_file, output_dir, rgid, libid, smid,
                           picard_dir, files_and_dir, illumina, fifo):
    """Convert a fastq file to SAM with Picard and fix the read names.

    FastqToSam writes ``<output_dir>/<fastq base name>.sam``; the command
    built by fix_read_name_in_sam_command then reads that file and writes
    ``<sam file>.tmp``. When fifo is true both outputs are created as named
    pipes with ``mkfifo`` and both commands are started in the background.
    Failing commands are logged; the function still returns the output path.

    :param fastq_file: fastq file to convert
    :param output_dir: directory receiving the SAM outputs
    :param rgid: value of the ``RG=`` argument
    :param libid: value of the ``LB=`` argument
    :param smid: value of the ``SM=`` argument
    :param picard_dir: directory containing ``FastqToSam.jar``
    :param files_and_dir: list to which both output paths are appended
    :param illumina: use ``QUALITY_FORMAT=Illumina`` instead of ``Standard``
    :param fifo: use named pipes and background commands
    :return: path of the read-name-fixed output (``<sam file>.tmp``)
    """
    import logging

    fastq_name, ext = os.path.splitext(os.path.basename(fastq_file))
    sam_file = '%s.sam' % os.path.join(output_dir, fastq_name)
    if fifo:
        if os.path.exists(sam_file):
            os.remove(sam_file)
        command_runner.run_command("mkfifo %s" % sam_file)

    fastqToSam_jar = os.path.join(picard_dir, "FastqToSam.jar")
    if illumina:
        qual = "Illumina"
    else:
        qual = "Standard"
    # NOTE(review): the comma after SM=%s becomes part of that argument;
    # confirm whether it is intended.
    command = "java -Xmx2G -jar %s F1=%s O=%s RG=%s LB=%s SM=%s, QUALITY_FORMAT=%s" % (
        fastqToSam_jar, fastq_file, sam_file, rgid, libid, smid, qual)
    if fifo:
        command += " &"
    return_code = command_runner.run_command(command)
    # Compare by value: "is not 0" tests object identity, not equality.
    if return_code != 0:
        logging.error('FastqToSam command failed with return code %s' %
                      return_code)
    files_and_dir.append(sam_file)

    read_sam_tmp = '%s.tmp' % (sam_file)
    if fifo:
        if os.path.exists(read_sam_tmp):
            os.remove(read_sam_tmp)
        command_runner.run_command("mkfifo %s" % read_sam_tmp)
    command = fix_read_name_in_sam_command(65, header=False)
    command += """ %s > %s""" % (sam_file, read_sam_tmp)
    if fifo:
        command += " &"
    return_code = command_runner.run_command(command)
    if return_code != 0:
        logging.error('Read name fixing command failed with return code %s' %
                      return_code)
    files_and_dir.append(read_sam_tmp)
    return read_sam_tmp
Ejemplo n.º 48
0
def run_one_fastq_file(fastq_file, output_dir, assembly_function_list, estimated_size=600, subsample_nb_read=None, rg_ids=[],
                       read1_fasta=None, name=None, force_merge=False, adapter_file=None):
    """Assemble one fastq file with every assembler and keep the best result.

    Each function of assembly_function_list is run through run_assembly; the
    contigs it produces are merged with read1_fasta. The assembly selected by
    get_best_assembly_merged is then copied to
    ``<output_dir>/best_assembly.fa``.

    :return: path to ``<output_dir>/best_assembly.fa``
    """
    fastq_file = os.path.abspath(fastq_file)
    if not os.path.exists(output_dir):
        command_runner.run_command('mkdir %s'%(output_dir))

    assembly_options = dict(estimated_size=estimated_size,
                            subsample_nb_read=subsample_nb_read,
                            rg_ids=rg_ids,
                            name=name,
                            adapter_file=adapter_file)
    for assembler in assembly_function_list:
        # Assemble with the provided assembler
        contig_file, nb_seq, max_len = run_assembly(assembler, fastq_file, output_dir, **assembly_options)
        if not contig_file:
            continue
        # Merge read one and read2 contig
        #TODO: This function gets run twice need to change that as the second run is not useful
        merge_read1_and_read2_contigs(name,
                                      read1_contig=read1_fasta,
                                      read2_contigs=contig_file,
                                      output_dir=os.path.dirname(contig_file))

    best_assembler_name, best_assembly_file = get_best_assembly_merged(output_dir, read1_fasta, name, force_merge)

    best_assembly_copy = os.path.join(output_dir, "best_assembly.fa")
    command_runner.run_command("cp %s %s"%(best_assembly_file, best_assembly_copy))
    return best_assembly_copy
def align_short_reads_se(fastq_file1,  genome_file, output_dir, sample_name, thread, 
                      BWA_bin, samtools_bin, picard_dir, read_group_command, files_and_dir, illumina, fifo):
    """Align single end reads with ``bwa aln`` / ``bwa samse`` and sort them.

    ``bwa aln`` writes ``<output_dir>/<fastq base name>.sai``; the output of
    ``bwa samse`` is then piped through ``samtools view -bS`` into
    ``samtools sort``. Failing commands are logged; the function still
    returns the output name.

    :param fastq_file1: fastq file to align
    :param genome_file: genome given to bwa
    :param output_dir: directory receiving the sai file and sorted output
    :param sample_name: used to name the sorted output
    :param thread: value passed to ``bwa aln`` with ``-t``
    :param BWA_bin: bwa executable
    :param samtools_bin: samtools executable
    :param picard_dir: not used by this function
    :param read_group_command: inserted right after ``bwa samse``
    :param files_and_dir: list to which the sai file is appended
    :param illumina: add ``-I`` to the ``bwa aln`` command
    :param fifo: not used by this function
    :return: ``<output_dir>/<sample_name>_sorted``, the output name given to
        ``samtools sort``
    """
    import logging

    fastq_name, ext = os.path.splitext(os.path.basename(fastq_file1))
    sai_file1 = '%s.sai'%os.path.join(output_dir, fastq_name)
    illumina_str = ""
    if illumina:
        illumina_str = " -I "
    command = '%s aln %s -t %s %s %s > %s'%(BWA_bin, illumina_str,thread, genome_file, fastq_file1, sai_file1)

    return_code = command_runner.run_command(command)
    # Compare by value: "is not 0" tests object identity, not equality.
    if return_code != 0:
        logging.error('bwa aln failed with return code %s'%return_code)
    files_and_dir.append(sai_file1)

    #only one end so just run get the sorted bam file
    bam_file = os.path.join(output_dir, sample_name+"_sorted")
    command = """%s samse %s %s %s %s | %s view -bS - | %s sort - %s"""%(BWA_bin, read_group_command, genome_file, sai_file1,
                                                                         fastq_file1, samtools_bin, samtools_bin, bam_file )
    return_code = command_runner.run_command(command)
    if return_code != 0:
        logging.error('bwa samse / samtools sort failed with return code %s'%return_code)
    return bam_file
def run_smalt_paired(consensus_file, read1_fastq, read2_fastq, **kwarg):
    """Map read1 and the reverse complement of read2 to a consensus with smalt.

    Existing ``.sma``, ``.smi`` and ``.fai`` files of the consensus are
    removed before ``smalt index`` is run. The SAM file is named after the
    common start of the two fastq file names.

    :param consensus_file: file to index and map against
    :param read1_fastq: first fastq file
    :param read2_fastq: second fastq file; passed to reverse_complement first
    :param kwarg: accepted and ignored
    :return: name of the SAM file given to ``smalt map``
    """
    for extension in ('sma', 'smi', 'fai'):
        stale_index = '%s.%s' % (consensus_file, extension)
        if os.path.exists(stale_index):
            command_runner.run_command('rm -rf %s' % stale_index)

    command_runner.run_command("smalt index %s %s" % (consensus_file, consensus_file))

    common_prefix = longest_common_substr_from_start(read1_fastq, read2_fastq)
    read2_fastq_rev_comp = reverse_complement(read2_fastq)
    sam_file = common_prefix.rstrip('_') + '.sam'
    map_arguments = (sam_file, consensus_file, read1_fastq, read2_fastq_rev_comp)
    command_runner.run_command("smalt map -f samsoft -o %s %s %s %s" % map_arguments)
    return sam_file
Ejemplo n.º 51
0
def create_sequence_dictionary(picard_dir, genome_file):
    """Create a sequence dictionary from the genome file provided.

    The dictionary is named after the genome file with its extension replaced
    by ``.dict``. It is only (re)built when it is missing or when the genome
    file has a more recent modification time.

    :param picard_dir: directory containing ``CreateSequenceDictionary.jar``
    :param genome_file: genome file to create the dictionary for
    :return: path to the dictionary, or None when the command failed
    """
    genome_dict = os.path.splitext(genome_file)[0] + '.dict'
    up_to_date = (os.path.exists(genome_dict) and
                  os.path.getmtime(genome_file) <= os.path.getmtime(genome_dict))
    if up_to_date:
        return genome_dict

    dictionary_jar = os.path.join(picard_dir, 'CreateSequenceDictionary.jar')
    command = 'java -jar %s REFERENCE=%s O=%s' % (dictionary_jar, genome_file,
                                                  genome_dict)
    if command_runner.run_command(command) != 0:
        return None
    return genome_dict
def convert_untrimmed_read(fastq_file, output_dir, rgid,libid,smid, picard_dir, files_and_dir, illumina, fifo):
    """Convert a fastq file to SAM with Picard and fix the read names.

    FastqToSam writes ``<output_dir>/<fastq base name>.sam``; the command
    built by fix_read_name_in_sam_command then reads that file and writes
    ``<sam file>.tmp``. When fifo is true both outputs are created as named
    pipes with ``mkfifo`` and both commands are started in the background.
    Failing commands are logged; the function still returns the output path.

    :param fastq_file: fastq file to convert
    :param output_dir: directory receiving the SAM outputs
    :param rgid: value of the ``RG=`` argument
    :param libid: value of the ``LB=`` argument
    :param smid: value of the ``SM=`` argument
    :param picard_dir: directory containing ``FastqToSam.jar``
    :param files_and_dir: list to which both output paths are appended
    :param illumina: use ``QUALITY_FORMAT=Illumina`` instead of ``Standard``
    :param fifo: use named pipes and background commands
    :return: path of the read-name-fixed output (``<sam file>.tmp``)
    """
    import logging

    fastq_name, ext = os.path.splitext(os.path.basename(fastq_file))
    sam_file = '%s.sam'%os.path.join(output_dir, fastq_name)
    if fifo:
        if os.path.exists(sam_file):
            os.remove(sam_file)
        command_runner.run_command("mkfifo %s"%sam_file)

    fastqToSam_jar = os.path.join(picard_dir, "FastqToSam.jar")
    if illumina:
        qual = "Illumina"
    else:
        qual = "Standard"
    # NOTE(review): the comma after SM=%s becomes part of that argument;
    # confirm whether it is intended.
    command = "java -Xmx2G -jar %s F1=%s O=%s RG=%s LB=%s SM=%s, QUALITY_FORMAT=%s"%(fastqToSam_jar,fastq_file,sam_file,rgid,libid,smid,qual)
    if fifo:
        command += " &"
    return_code = command_runner.run_command(command)
    # Compare by value: "is not 0" tests object identity, not equality.
    if return_code != 0:
        logging.error('FastqToSam command failed with return code %s'%return_code)
    files_and_dir.append(sam_file)

    read_sam_tmp = '%s.tmp'%(sam_file)
    if fifo:
        if os.path.exists(read_sam_tmp):
            os.remove(read_sam_tmp)
        command_runner.run_command("mkfifo %s"%read_sam_tmp)
    command = fix_read_name_in_sam_command(65, header=False)
    command += """ %s > %s"""%(sam_file, read_sam_tmp)
    if fifo:
        command += " &"
    return_code = command_runner.run_command(command)
    if return_code != 0:
        logging.error('Read name fixing command failed with return code %s'%return_code)
    files_and_dir.append(read_sam_tmp)
    return read_sam_tmp
Ejemplo n.º 53
0
def sort_bam_file_per_coordinate(picard_dir, input_bam, output_bam, overwrite=False,validation_stringency="LENIENT",**kwargs):
    """Sort a bam file by coordinate with Picard SortSam.

    :param picard_dir: directory containing ``SortSam.jar``; nothing is run
        when it is falsy
    :param input_bam: bam file to sort
    :param output_bam: sorted bam file to write
    :param overwrite: run the sort even when output_bam already exists
    :param validation_stringency: value of ``VALIDATION_STRINGENCY``
    :param kwargs: appended to the command line as ``key=value`` options
    :return: return code of the sort command, or 1 when nothing was run
    """
    if not picard_dir:
        return 1

    extra_options = ' '.join("%s=%s"%(key, value) for key, value in kwargs.items())
    sort_jar = os.path.join(picard_dir, 'SortSam.jar')
    command = "java -Xmx4G -jar %s I=%s O=%s SO=coordinate VALIDATION_STRINGENCY=%s %s"%(
        sort_jar, input_bam, output_bam, validation_stringency, extra_options)

    if os.path.exists(output_bam) and not overwrite:
        logging.warning('The file %s exists, use overwrite option to overwrite if applicable.'%output_bam)
        return 1
    return command_runner.run_command(command)
Ejemplo n.º 54
0
def extend_read1_consensus(fastq_1, fastq_2, extended_sequence_name, extended_sequence_file):
    """Extend the read1 consensus using the reads merged by flash.

    Flash is run in the directory of fastq_1 with an overlap of 20, then of
    10 if the first run gave nothing. The sequences of the merged reads are
    counted and sorted by decreasing count; the sequence kept by the awk
    filter is written to extended_sequence_file after a first line holding
    extended_sequence_name.

    :param fastq_1: first fastq file
    :param fastq_2: second fastq file
    :param extended_sequence_name: first line written to the output file
    :param extended_sequence_file: file to write
    :return: extended_sequence_file, or None when flash produced nothing or
        the shell command failed
    """
    output_dir = os.path.dirname(fastq_1)
    # Fall back to a shorter overlap when the first attempt merges nothing.
    out_extended = (run_flash(output_dir, fastq_1, fastq_2, 20) or
                    run_flash(output_dir, fastq_1, fastq_2, 10))
    if not out_extended:
        return None

    count_sequences = "cat %s | paste - - - - | cut -f 2  | sort | uniq -c | sort -nr |"%out_extended
    pick_sequence = " awk '{if($1>best){best=$1;if (length($2)>length(longest)){longest=$2}}} END{print longest}' |"
    add_name = """awk 'BEGIN{print "%s"} {print $0}'"""%extended_sequence_name
    redirect = " > %s"%extended_sequence_file
    pipeline = ' '.join([count_sequences, pick_sequence, add_name, redirect])

    if command_runner.run_command(pipeline) != 0:
        return None
    return extended_sequence_file
Ejemplo n.º 55
0
def merge_bam_files_with_picard(list_of_file, output_file=None, **kwargs):
    """This is a generic merging function for bam files.
    It assumes that all the bam file comes from mapping to independent contigs.

    :param list_of_file: bam files to merge, each passed as an ``I=`` argument
    :param output_file: merged bam file; when not provided, the first unused
        ``tmp_merge_bam_<n>.bam`` name in the current working directory
    :param kwargs: accepted and ignored
    :return: the merged bam file, or None when the command failed
    """
    if not output_file:
        # Create a generic name and put it in the current working directory
        name_template = os.path.join(os.getcwd(), 'tmp_merge_bam_%s.bam')
        attempt = 0
        while True:
            attempt += 1
            output_file = name_template%attempt
            if not os.path.exists(output_file):
                break

    input_arguments = ' '.join(['I=%s'%bam_file for bam_file in list_of_file])
    command = 'java -jar -Xmx2G %s VALIDATION_STRINGENCY=SILENT CAT_SEQUENCE_DICTIONARIES=True USE_THREADING=True O=%s '%(mergeSamFilesWithCat_jar,output_file)
    command += input_arguments
    if command_runner.run_command(command) == 0:
        return output_file
    return None
Ejemplo n.º 56
0
def clean_fastq(fastq_file, adapter_file=None, rg_ids=[], subsample_nb_read=None):
    """Filter, trim and optionally subsample a fastq file.

    Each step writes a new file next to its input and is skipped when that
    file already exists; the next step always continues from the step's
    output file.

    :param fastq_file: fastq file to clean
    :param adapter_file: adapter file for scythe; no adapter trimming if falsy
    :param rg_ids: when non-empty, passed with the fastq file to
        keep_read_from_samples before trimming
    :param subsample_nb_read: number passed to ``seqtk sample``; no
        subsampling if falsy
    :return: path to the last file produced by the enabled steps
    """
    if rg_ids:
        fastq_file = keep_read_from_samples(fastq_file, rg_ids)
    if adapter_file:
        adapter_trim = fastq_file + '.adapter_trimmed'
        if not os.path.exists(adapter_trim):
            command = "scythe -q sanger -a %s -o %s %s" % (adapter_file, adapter_trim, fastq_file)
            command_runner.run_command(command)
        fastq_file = adapter_trim
    qual_trim = fastq_file + ".qual_trimmed"
    if not os.path.exists(qual_trim):
        command = "sickle se -f %s -t sanger -o %s" % (fastq_file, qual_trim)
        command_runner.run_command(command)
    # Continue from the quality trimmed file even when it already existed;
    # otherwise a rerun would return or subsample the untrimmed reads.
    fastq_file = qual_trim
    if subsample_nb_read:
        sub_sampled = qual_trim + ".%s" % subsample_nb_read
        if not os.path.exists(sub_sampled):
            command = "seqtk sample %s %s > %s" % (fastq_file, subsample_nb_read, sub_sampled)
            command_runner.run_command(command)
        return sub_sampled
    return fastq_file