Example #1
0
def aln_to_Newick(path, folders_dict, raxml_timelimit, raxml_path, threads):
    """ function: build core gene SNP tree using SNP alignment
        input: SNP_whole_matrix.aln
        output: strain_tree.nwk
    """
    cluster_seq_path=folders_dict['cluster_seq_path']
    log_path=folders_dict['log_path']
    output_path = '_'.join([cluster_seq_path+'temp_coretree', time.strftime('%Y%m%d-%H%M%S',time.gmtime()), str(random.randint(0,1000000))])
    os.system('mkdir %s'%output_path)
    SNP_matrix_path=cluster_seq_path+'SNP_whole_matrix.aln'
    cwd = os.getcwd()
    os.chdir(output_path)

    ## run fasttree
    start = time.time();

    fasttree_program= 'fasttree' if check_dependency('fasttree') else 'FastTree'
    os.system(fasttree_program+' -gtr -nt -gamma -nosupport -mlacc 2 -slownni '+SNP_matrix_path+' > initial_tree.newick0 2> '+log_path+'fasttree.log') ;
    print ' fasttree time-cost:', times(start)

    resolve_polytomies('initial_tree.newick0','initial_tree.newick')

    ## run raxml
    start = time.time();
    out_fname = "tree_infer.newick"
    if raxml_timelimit>0:
        print '%s%d%s'%('RAxML tree optimization within the timelimit of ',raxml_timelimit, ' minutes')
        # exec for killing process
        end_time = time.time() + int(raxml_timelimit*60) #

        raxml_program= 'raxml' if check_dependency('raxml') else 'raxmlHPC'
        process = subprocess.Popen('exec '+raxml_program+' -f d -T '+str(threads)+' -j -s '+SNP_matrix_path+' -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick > '+log_path+'raxml.log', shell=True)
        while (time.time() < end_time):
            if os.path.isfile('RAxML_result.topology'):
                break
            time.sleep(10)
        process.terminate()

        checkpoint_files = glob.glob('RAxML_checkpoint*')
        if os.path.isfile('RAxML_result.topology'):
            checkpoint_files.append('RAxML_result.topology')
        if len(checkpoint_files) > 0:
            last_tree_file = checkpoint_files[-1]
            shutil.copy(last_tree_file, 'raxml_tree.newick')
        else:
            shutil.copy('initial_tree.newick', 'raxml_tree.newick')
    else:
        shutil.copy('initial_tree.newick', 'raxml_tree.newick')

    print 'RAxML branch length optimization and rooting'
    os.system(raxml_program+' -f e -T '+str(threads)+' -s '+SNP_matrix_path+' -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick > '+log_path+'raxml.log')
    shutil.copy('RAxML_result.branches', out_fname)

    print ' raxml time-cost:', times(start)
    midpointRooting(out_fname,'strain_tree.nwk')
    shutil.copy('strain_tree.nwk', cluster_seq_path+'strain_tree.nwk')
    os.chdir(cwd)
    os.system('rm -r %s'%output_path)
def diamond_run(output_path,
                dmd_ref_file,
                threads,
                diamond_evalue,
                diamond_max_target_seqs,
                diamond_identity,
                diamond_query_cover,
                diamond_subject_cover,
                diamond_path,
                diamond_no_self_hits=0):
    """ run diamond using sensitive alignment mode """
    if diamond_path == '':
        diam = ''.join(
            [os.path.dirname(os.path.realpath(__file__)), '/../tools/diamond'])
    else:
        diam = diamond_path
    print 'diamond inputfile:', dmd_ref_file
    input_prefix = dmd_ref_file.split('.faa')[0]
    if input_prefix == 'reference':
        output_m8_filename = 'query_matches.m8'
    else:
        output_m8_filename = '%s%s' % (input_prefix, '.m8')

    makedb_command = ''.join([
        diam, ' makedb -p ', threads, ' --in ', output_path, dmd_ref_file,
        ' -d ', output_path, 'nr_', input_prefix, '> ', output_path,
        'diamond_makedb_', input_prefix, '.log 2>&1'
    ])
    start = time.time()
    os.system(makedb_command)
    print 'diamond build index (makedb):', times(start)
    print 'command line record:', makedb_command
    ## option to enable --no-self-hits
    if diamond_no_self_hits == 0:
        option_no_self_hits = ''
    else:
        option_no_self_hits = ' --no-self-hits'

    blastp_command = ''.join([
        diam, ' blastp --sensitive -p ', threads, ' -e ', diamond_evalue,
        ' --id ', diamond_identity, ' --query-cover ', diamond_query_cover,
        ' --subject-cover ', diamond_subject_cover, option_no_self_hits,
        ' -k ', diamond_max_target_seqs, ' -d ', output_path, 'nr_',
        input_prefix, ' -f 6 qseqid sseqid bitscore', ' -q ', output_path,
        dmd_ref_file, ' -o ', output_path, output_m8_filename, ' -t ./  > ',
        output_path, 'diamond_blastp_', input_prefix, '.log  2>&1'
    ])
    start = time.time()
    os.system(blastp_command)
    print 'diamond alignment (blastp):', times(start)
    print 'diamond_max_target_seqs used: %s' % diamond_max_target_seqs
    print 'command line record:', blastp_command

    ## remove diamond binary database (dmnd) file
    os.system(''.join(['rm ', output_path, 'nr_', input_prefix, '.dmnd']))
def diamond_run(output_path, dmd_ref_file, threads,
    diamond_evalue, diamond_max_target_seqs, diamond_identity,
    diamond_query_cover, diamond_subject_cover, diamond_path, diamond_no_self_hits=0):
    """ run diamond using sensitive alignment mode """
    if diamond_path=='':
        diamond_path=find_executable('diamond')
    diam=diamond_path
    print 'diamond inputfile:', dmd_ref_file
    input_prefix= dmd_ref_file.split('.faa')[0]
    if input_prefix=='reference':
        output_m8_filename='query_matches.m8'
    else:
        output_m8_filename= '%s%s'%(input_prefix,'.m8')

    makedb_command= ''.join([diam,' makedb -p ',threads,
                        ' --in ', output_path, dmd_ref_file,
                        ' -d ',output_path,'nr_',input_prefix,
                        '> ',output_path,'diamond_makedb_',input_prefix,'.log 2>&1'
                        ])
    start = time.time()
    os.system(makedb_command)
    print 'diamond build index (makedb):', times(start)
    print 'command line record:', makedb_command
    ## option to enable --no-self-hits
    if diamond_no_self_hits==0:
        option_no_self_hits=''
    else:
        option_no_self_hits=' --no-self-hits'

    blastp_command= ''.join([diam,' blastp --sensitive -p ',threads,
                        ' -e ', diamond_evalue,
                        ' --id ', diamond_identity,
                        ' --query-cover ', diamond_query_cover,
                        ' --subject-cover ', diamond_subject_cover,
                        option_no_self_hits,
                        ' -k ', diamond_max_target_seqs,
                        ' -d ',output_path,'nr_', input_prefix,
                        ' -f 6 qseqid sseqid bitscore',
                        ' -q ',output_path,dmd_ref_file,
                        ' -o ',output_path,output_m8_filename,
                        ' -t ./  > ',output_path,'diamond_blastp_',input_prefix,'.log  2>&1'
                        ])
    start = time.time()
    os.system(blastp_command)
    print 'diamond alignment (blastp):', times(start)
    print 'diamond_max_target_seqs used: %s'%diamond_max_target_seqs
    print 'command line record:', blastp_command

    ## remove diamond binary database (dmnd) file
    os.system(''.join(['rm ',output_path,'nr_', input_prefix,'.dmnd']))
def build_representative_cluster(clustering_path, threads, input_prefix):
    """ build representative cluster """
    start = time.time()
    cluster_file= ''.join([clustering_path,input_prefix,'_cluster.output'])
    representative_outputfile= ''.join([clustering_path,input_prefix,'_representative','.faa'])
    subproblem_seqs_path= '%ssubproblem_cluster_seqs/'%clustering_path
    subproblem_merged_faa= ''.join([clustering_path,input_prefix,'.faa'])
    subproblem_faa_dict= read_fasta(subproblem_merged_faa)
    with open(cluster_file, 'rb') as cluster_input:
        subproblem_geneCluster_dt= defaultdict(list)
        cluster_input_lines= [iline for iline in cluster_input]
        subproblem_geneCluster_dt= {}
        subproblem_run_number= input_prefix.split('subproblem_')[1]
        for gid, iline in enumerate(cluster_input_lines):#cluster_input
            ## use time to avoid clusterID conflict
            clusterID= "GCs%s_%07d%s"%(subproblem_run_number, gid, time.strftime('%M%S',time.gmtime()))
            gene_ids= iline.rstrip().split('\t')
            subproblem_geneCluster_dt[clusterID]= gene_ids
            ## representative_seq
            representative_seq=subproblem_faa_dict[gene_ids[0]]
            ## write in representative strain
            with open(representative_outputfile, 'a') as representative_output:
                write_in_fa(representative_output, clusterID, representative_seq)
        ## write subproblem_geneCluster_dt
        write_pickle(''.join([clustering_path,input_prefix,'_dicts.cpk']), subproblem_geneCluster_dt)
    print 'build representative clusters for', input_prefix,': ', times(start), '\n'
def build_representative_cluster(clustering_path, threads, input_prefix):
    """ build representative cluster """
    start = time.time()
    cluster_file = ''.join([clustering_path, input_prefix, '_cluster.output'])
    representative_outputfile = ''.join(
        [clustering_path, input_prefix, '_representative', '.faa'])
    subproblem_seqs_path = '%ssubproblem_cluster_seqs/' % clustering_path
    subproblem_merged_faa = ''.join([clustering_path, input_prefix, '.faa'])
    subproblem_faa_dict = read_fasta(subproblem_merged_faa)
    with open(cluster_file, 'rb') as cluster_input:
        subproblem_geneCluster_dt = defaultdict(list)
        cluster_input_lines = [iline for iline in cluster_input]
        subproblem_geneCluster_dt = {}
        subproblem_run_number = input_prefix.split('subproblem_')[1]
        for gid, iline in enumerate(cluster_input_lines):  #cluster_input
            ## use time to avoid clusterID conflict
            clusterID = "GCs%s_%07d%s" % (subproblem_run_number, gid,
                                          time.strftime('%M%S', time.gmtime()))
            gene_ids = iline.rstrip().split('\t')
            subproblem_geneCluster_dt[clusterID] = gene_ids
            ## representative_seq
            representative_seq = subproblem_faa_dict[gene_ids[0]]
            ## write in representative strain
            with open(representative_outputfile, 'a') as representative_output:
                write_in_fa(representative_output, clusterID,
                            representative_seq)
        ## write subproblem_geneCluster_dt
        write_pickle(''.join([clustering_path, input_prefix, '_dicts.cpk']),
                     subproblem_geneCluster_dt)
    print 'build representative clusters for', input_prefix, ': ', times(
        start), '\n'
def filter_hits_single(output_path,
                       threads,
                       input_prefix='',
                       gather_seq_length_flag=0,
                       no_filtering=True):
    """ provide the abc file (query, reference, score) for clustering

    The m8 file is 'query_matches.m8' (empty input_prefix) or
    '<input_prefix>.m8'; the abc file is 'filtered_hits.abc' or
    '<input_prefix>_filtered_hits.abc'.  Both are located by putting
    output_path directly in front of the file name.

    params:
        output_path: prefix of the m8 and abc files
        threads: not used by this function
        input_prefix: '' for the all-vs.-all m8 file, otherwise the name of
            the subproblem
        gather_seq_length_flag: not used by this function
        no_filtering: if equal to True, the abc file is a symbolic link to
            the m8 file (an already existing abc file or link is replaced);
            otherwise the first, second and last column of each m8 line are
            copied into the abc file
    """
    start = time.time()
    if input_prefix == '':
        ## default empty for all-vs.-all m8 file
        m8_filename = 'query_matches.m8'
        filtered_hits_filename = 'filtered_hits.abc'
    else:
        ## add input prefix for sub_problem abc files in divide-and-conquer method
        m8_filename = '%s%s' % (input_prefix, '.m8')
        filtered_hits_filename = '%s%s' % (input_prefix, '_filtered_hits.abc')

    ## comparison kept as '== True' so that only True/1 select the link mode
    if no_filtering == True:
        m8_fpath = os.path.abspath(output_path + m8_filename)
        filtered_hits_fpath = os.path.abspath(output_path +
                                              filtered_hits_filename)
        ## drop a link/file left behind by a previous run; lexists also
        ## detects dangling links
        if os.path.lexists(filtered_hits_fpath):
            os.remove(filtered_hits_fpath)
        os.symlink(m8_fpath, filtered_hits_fpath)
    else:
        with open(output_path+m8_filename,'rb') as m8_file,\
            open(output_path+filtered_hits_filename,'wb') as abc_file:
            print('filter hits:')
            for iline in m8_file:
                cols_list = iline.rstrip().split('\t')
                #query, ref, bit_score from column (0,1,-1)
                ## (the alternative bitscore/alignment-length score is not used)
                abc_file.write(
                    '%s\n' % '\t'.join([cols_list[ind] for ind in (0, 1, -1)]))
        ## single-argument print: same output as the former print statements
        if input_prefix == '':
            print('filter hits runtime: %s \n' % (times(start), ))
        else:
            print('filter hits runtime for  %s : %s \n' % (input_prefix,
                                                          times(start)))
def mcl_run(clustering_path, threads, input_prefix, mcl_inflation):
    """ """
    start = time.time()
    cwd = os.getcwd()
    os.chdir(clustering_path)
    command_mcl=''.join(['mcl ',input_prefix,'_filtered_hits.abc --abc ',\
                        '-o ',input_prefix,'_cluster.output -I ',str(mcl_inflation),\
                        ' -te ',str(threads),' > ','mcl-',input_prefix,'.log 2>&1'])
    os.system(command_mcl)
    print 'run command line mcl in ',clustering_path,': \n', command_mcl
    print 'mcl runtime for ', input_prefix,': ', times(start), '\n'
    os.chdir(cwd)
def mcl_run(clustering_path, threads, input_prefix, mcl_inflation):
    """ """
    start = time.time()
    cwd = os.getcwd()
    os.chdir(clustering_path)
    command_mcl=''.join(['mcl ',input_prefix,'_filtered_hits.abc --abc ',\
                        '-o ',input_prefix,'_cluster.output -I ',str(mcl_inflation),\
                        ' -te ',str(threads),' > ','mcl-',input_prefix,'.log 2>&1'])
    os.system(command_mcl)
    print 'run command line mcl in ', clustering_path, ': \n', command_mcl
    print 'mcl runtime for ', input_prefix, ': ', times(start), '\n'
    os.chdir(cwd)
def mcl_run(output_path, threads, mcl_inflation, input_prefix=''):
    """ """
    start = time.time()
    if input_prefix=='': ## default empty for abc file from all-vs.-all m8 file
        filtered_hits_filename='filtered_hits.abc'
    else:
        filtered_hits_filename='%s%s'%(input_prefix,'_filtered_hits.abc')

    command_mcl=''.join(['mcl ',output_path,filtered_hits_filename,' --abc ',\
                        '-o ',output_path,'allclusters.tsv -I ',str(mcl_inflation),\
                        ' -te ',str(threads),' > ',output_path,'mcl.log 2>&1'])
    os.system(command_mcl)
    print 'command line mcl:', command_mcl
    print 'mcl runtime:', times(start),'\n'
def mcl_run(output_path, threads, mcl_inflation, input_prefix=''):
    """ """
    start = time.time()
    if input_prefix == '':  ## default empty for abc file from all-vs.-all m8 file
        filtered_hits_filename = 'filtered_hits.abc'
    else:
        filtered_hits_filename = '%s%s' % (input_prefix, '_filtered_hits.abc')

    command_mcl=''.join(['mcl ',output_path,filtered_hits_filename,' --abc ',\
                        '-o ',output_path,'allclusters.tsv -I ',str(mcl_inflation),\
                        ' -te ',str(threads),' > ',output_path,'mcl.log 2>&1'])
    os.system(command_mcl)
    print 'command line mcl:', command_mcl
    print 'mcl runtime:', times(start), '\n'
def filter_hits_single(output_path, threads,
    input_prefix='', gather_seq_length_flag=0, no_filtering=True):
    """ create the abc file used as clustering input

    files (output_path is put directly in front of each name):
        m8 file:  'query_matches.m8' or '<input_prefix>.m8'
        abc file: 'filtered_hits.abc' or '<input_prefix>_filtered_hits.abc'

    params:
        output_path: prefix of both files
        threads: not used by this function
        input_prefix: '' (all-vs.-all m8 file) or the subproblem name
        gather_seq_length_flag: not used by this function
        no_filtering: when equal to True the abc file becomes a symbolic
            link to the m8 file, replacing an abc file/link that is already
            there; otherwise columns 0, 1 and -1 of every m8 line are
            written to the abc file
    """
    start = time.time()
    if input_prefix=='':
        ## default empty for all-vs.-all m8 file
        m8_filename='query_matches.m8'
        filtered_hits_filename='filtered_hits.abc'
    else:
        ## add input prefix for sub_problem abc files in divide-and-conquer method
        m8_filename='%s%s'%(input_prefix,'.m8')
        filtered_hits_filename='%s%s'%(input_prefix,'_filtered_hits.abc')

    ## '==True' is kept on purpose: only True/1 activate the link mode
    if no_filtering==True:
        m8_fpath=os.path.abspath(output_path+m8_filename)
        filtered_hits_fpath=os.path.abspath(output_path+filtered_hits_filename)
        ## a leftover from an earlier run would block the link creation;
        ## lexists is also true for dangling links
        if os.path.lexists(filtered_hits_fpath):
            os.remove(filtered_hits_fpath)
        os.symlink(m8_fpath,filtered_hits_fpath)
    else:
        with open(output_path+m8_filename,'rb') as m8_file,\
            open(output_path+filtered_hits_filename,'wb') as abc_file:
            print('filter hits:')
            for iline in m8_file:
                cols_list= iline.rstrip().split('\t')
                #query, ref, bit_score from column (0,1,-1)
                ## (scoring by bitscore/alignment length is not used)
                abc_file.write('%s\n'%'\t'.join([cols_list[ind] for ind in (0,1,-1)]))
        ## print with one argument gives the same text as the print statement
        if input_prefix=='':
            print('filter hits runtime: %s \n'%(times(start),))
        else:
            print('filter hits runtime for  %s : %s \n'%(input_prefix,times(start)))
Example #12
0
    max_strain_fraction_presence_association=params.max_strain_fraction_presence_association,
    optional_table_column=params.optional_table_column,
    clean_temporary_files=params.clean_temporary_files
    )

## Pipeline driver: a step is executed only if its number is contained in
## params.steps. Steps 02-04 print a start message, run, and then print
## the value returned by times(start).
if 1 in params.steps:# step01: load the strain list
    myPangenome.make_strain_list()
    print '======  step01: strain list successfully loaded'

## deactivated  (activation:'2' -> 2)
## The test below looks for the string '2', whereas the other steps test for
## integers. NOTE(review): presumably params.steps holds integers, so this
## branch is never entered -- confirm.
## NOTE(review): `path` and `strain_list` are not defined in the visible
## code -- verify that they exist before re-activating this step.
if '2' in params.steps:# step02: download GenBank files for the strain list
    print '======  starting step02: download NCBI RefSeq GenBank file from strain list'
    start = time.time()
    fetch_refseq(path, strain_list)
    print '======  time for step02:'
    print times(start),'\n'

if 3 in params.steps:# step03: extract sequences from the GenBank files
    print '======  starting step03: extract sequences from GenBank file'
    start = time.time()
    myPangenome.extract_gbk_sequences()
    print '======  time for step03:'
    print times(start),'\n'

if 4 in params.steps:# step04: extract metadata from the GenBank files
    print '======  starting step04: extract metadata from GenBank file'
    start = time.time()
    myPangenome.extract_gbk_metadata()
    print '======  time for step04:'
    print times(start),'\n'