Example #1
def extract_para_from_trees(filename):
    
    # prepare list
    out_para = list()

    # load the pickled ortho@para strings for this file
    with open(path_tmp / filename, 'rb') as tmp:
        content_pickle = pickle.load(tmp)

    # extract and encode para pairs from each ortho@para string
    for st in content_pickle:
        ortho, para = st.split('@')
        l_ortho = ortho.split(' ')
        l_para = para.split(' ')
        for name1 in l_para:
            for name2 in l_ortho:
                if name1 < name2:
                    combined_name = str(len(name1)) + name1 + name2
                else:
                    combined_name = str(len(name2)) + name2 + name1
                
                if len(combined_name) > 2:
                    # check if present in filter and save it
                    reduced = int(combined_name[2:])
                    if reduced in set_filter:
                        out_para.append(int(combined_name))
    
    # dump out_para 
    utils.save_pickle(path_tmp_para / filename, out_para)

    return [0,0]
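For illustration only, a minimal sketch of the length-prefix pair encoding used above; encode_pair and the 10-character protein IDs are made up, chosen so that stripping the first 2 digits (as done with set_filter) removes exactly the length prefix:

# illustrative sketch, not part of the pipeline
def encode_pair(name1, name2):
    # order the names so a given pair always maps to the same integer
    if name1 > name2:
        name1, name2 = name2, name1
    return int(str(len(name1)) + name1 + name2)

pair = encode_pair('1200000178', '1200000033')   # -> 1012000000331200000178
reduced = int(str(pair)[2:])                     # same stripping as the set_filter check
print(pair, reduced)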
Example #2
def save_prot_2_sp(d):
    # create new dicts keeping only the species index
    d_str = {k:t[0] for k,t in d.items()} 
    d_int = {int(k):t[0] for k,t in d.items()} 
    # save them on disk
    utils.save_pickle(out_dir / 'prot_str_2_species.pic', d_str)
    utils.save_pickle(out_dir / 'prot_int_2_species.pic', d_int)
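A hypothetical usage sketch, assuming (from the comprehension above) that each value of the input dict is a tuple whose first element is the species index; it would need the pipeline's utils and out_dir to actually run:

# made-up input: protein IDs as strings mapped to (species_index, ...) tuples
d = {'1200000001': (12, 'seq data'), '1300000007': (13, 'seq data')}
# writes prot_str_2_species.pic ({'1200000001': 12, ...}) and
# prot_int_2_species.pic ({1200000001: 12, ...}) into out_dir
save_prot_2_sp(d)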
Example #3
def extract_para(l_trees):

    global set_filter
    with open(out_dir / 's_filter.pic', 'rb') as f:
        set_filter = pickle.load(f)
    
    global path_tmp_para
    path_tmp_para  = utils.create_out_dir('./dir_step3/tmp_para')
    
    # extract para from tree files
    print(' extract para from trees')   
    pool = Pool(nb_threads) 
    tmp_res = pool.map_async(extract_para_from_trees, l_trees, chunksize=1)
    pool.close() 
    pool.join()

    # combine the lists of para pairs (only pairs already seen as ortho are counted)
    d_para = utils.get_pickle(out_dir / 'd_ortho.pic')
    d_para = {x:0 for x in d_para}
    for filename in l_trees:
        # load pickle file with the list of para pairs
        with open(path_tmp_para / filename, 'rb') as f:
            content_pickle = pickle.load(f)
        for pair in content_pickle:
            try:
                d_para[pair] += 1
            except KeyError:
                # ignore pairs that are not already in the ortho dict
                pass
                
    # save it to file
    utils.save_pickle(out_dir / 'd_para.pic', d_para)
    
    # free memory
    set_filter = None
    shutil.rmtree(path_tmp_para)
    shutil.rmtree(path_tmp)
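The same worker-pool pattern appears in several steps; here is a minimal standalone sketch of it (the worker function and file names are made up):

# minimal multiprocessing sketch, independent of the pipeline
from multiprocessing import Pool

def worker(filename):
    # a real worker would load, process and re-pickle the file
    return 0

if __name__ == '__main__':
    with Pool(4) as pool:
        results = pool.map_async(worker, ['a.pic', 'b.pic'], chunksize=1).get()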
Example #4
def extract_ortho(l_trees):
        
    print(' extract ortho from similarity')
    # load pickle files
    tmp_d = utils.get_multi_pickle(Path('dir_step2') / 'dict_similarity_ortho', '_similarity_ortho.pic')

    # extract ortho
    d_ortho = collections.defaultdict(int)
    for l in tmp_d.values():
        for sub in itertools.combinations(l, 2):
            pair_int = int(str(len(sub[0])) + sub[0] + sub[1])
            d_ortho[pair_int] += 1
    
    global path_tmp, path_tmp_ortho
    path_tmp = utils.create_out_dir('./dir_step3/tmp')
    path_tmp_ortho = utils.create_out_dir('./dir_step3/tmp_ortho')
    
    # extract ortho from tree files
    print(' extract ortho from trees')    
    pool = Pool(nb_threads) 
    tmp_res = pool.map_async(extract_ortho_from_trees, l_trees, chunksize=1)
    pool.close() 
    pool.join()            
    
    # unpack ortho pairs from each tree file and add them to the counts
    for filename in l_trees:
        with open(path_tmp_ortho / filename, 'rb') as f:
            content_pickle = pickle.load(f)
        for pair in content_pickle:
            d_ortho[pair] += 1

    # free memory
    content_pickle = None
    shutil.rmtree(path_tmp_ortho)

    # remove ortho found only once
    print(' remove ortho found only once')
    d_ortho = {k:v for k,v in d_ortho.items() if v > 1}
    
    # save it to file
    utils.save_pickle(out_dir / 'd_ortho.pic', d_ortho)

    # save a simplified version (without the first 2 digits) as a set to file
    s_filter = set()
    for k in d_ortho:
        k2 = str(k)
        s_filter.add(int(k2[2:]))
    utils.save_pickle(out_dir / 's_filter.pic', s_filter)
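For context, a small illustration of how the filter built above is meant to be used; the pair integer is made up, but the stripping matches the loop:

# illustrative only: keep each ortho pair without its leading 2 digits so that
# extract_para_from_trees can screen candidate para pairs cheaply
d_ortho = {1012000000331200000178: 3}            # hypothetical pair -> count
s_filter = {int(str(k)[2:]) for k in d_ortho}
print(12000000331200000178 in s_filter)          # True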
Example #5
def extract_ortho_from_trees(filename):

    # prepare output variables
    l_ortho = list()
    l_ortho_para = list()
    
    # load dict of trees
    tmp_d = utils.get_pickle(Path('dir_step2') / 'dict_trees' / filename)
    
    # analyse trees 1 by 1
    for ref_leaf, newick in tmp_d.items():
        
        # load tree and get all leaves
        tree = PhyloTree(newick)
        all_leaves = {leaf.name for leaf in tree}
                                   
        # get all leaves from last interesting nodes      
        ref_node = tree.search_nodes(name = ref_leaf)[0]
        ortho = custom_species_overlap(ref_node)
        
        # add ref_leaf to ortho in case no good node selected
        if len(ortho) == 0:
            ortho.add(ref_leaf)
        
        # get para
        para = all_leaves - ortho
    
        # save ortho
        for sub in itertools.combinations(sorted(ortho), 2):
            pair_int = int(str(len(sub[0])) + sub[0] + sub[1])
            l_ortho.append(pair_int)
        
        # save ortho@para if there is a paralogous group
        if para:
            l_ortho_para.append(' '.join(ortho) + '@' + ' '.join(para))

    # save ortho @ para
    utils.save_pickle(path_tmp / filename, l_ortho_para)

    # save ortho
    utils.save_pickle(path_tmp_ortho / filename, l_ortho)
    
    return [0,0]
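A self-contained ete3 sketch of the tree handling used above; the newick string and leaf names are invented, and the ortho/para split shown is plain set arithmetic standing in for custom_species_overlap:

# requires ete3; everything here is illustrative
from ete3 import PhyloTree

newick = '((A_1:1,B_1:1):1,(A_2:1,B_2:1):1);'
tree = PhyloTree(newick)
all_leaves = {leaf.name for leaf in tree}        # iterating a tree yields its leaves
ref_node = tree.search_nodes(name='A_1')[0]      # locate the reference leaf
ortho = {leaf.name for leaf in ref_node.up}      # leaves under its parent node
para = all_leaves - ortho                        # remaining leaves, as in the function above
print(sorted(ortho), sorted(para))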
Example #6
def step1_kmer_clustering(dir, ext, lk, ma, nt):
   
    # convert the parameters to global variables (horrible hack)
    global directory, extension, length_kmer, min_aa, nb_threads
    directory, extension, length_kmer, min_aa, nb_threads = Path(dir), ext, lk, ma, nt

    print('\n --- STEP 1: kmer clustering\n')
    print(' # parameters')
    print(' input dir     : ' + str(directory))
    print(' kmer size     : ' + str(length_kmer))
    print(' kmer nb aa    : ' + str(min_aa))

    ## create output directory (delete it first if it already exists)
    global out_dir
    out_dir = utils.create_out_dir('dir_step1')
    
    ## check directory and files
    print('\n # check input files')
    global dict_files, list_files, list_start
    dict_files, list_files, list_start = pre_checking(directory, extension)
    
    ## analyse each fasta file (multithreading)
    print('\n # kmer clustering\n ' + str(len(list_files)) + ' proteomes on ' + str(nb_threads) + ' threads')
    pool = ThreadPool(nb_threads) 
    tmp_res = pool.map_async(process_file, list_files, chunksize=1)
    results_2 = tmp_res.get()
    pool.close() 
    pool.join()
    
    ## create log files
    log_file = open(out_dir / 'log_step1.txt', 'w+')
    log_file.write('#index	file_name	nb_initial	nb_final\n')
    
    ## save log file and combine other info
    combined = dict()
    names    = dict()
    nb_final = 0
    for l in results_2:
        log_file.write('	'.join(l[:4]) + '\n')
        names.update(l[4])
        combined.update(l[5])
        nb_final += int(l[3])
    log_file.close()
    
    ## save pickle files
    utils.save_pickle(out_dir / 'combined_names.pic', combined)
    utils.save_pickle(out_dir / 'original_names.pic', names)
    utils.save_pickle(out_dir / 'species_index.pic', dict_files)
    
    print(' -> ' + str(nb_final) + ' proteins saved for the next step')
    print('')
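A hypothetical invocation; every value is made up, with the parameter meanings taken from the signature and print statements above (input directory, file extension, kmer length lk, min_aa value ma, number of threads nt):

# illustrative call only; requires the rest of the step 1 module
step1_kmer_clustering('proteomes', 'fasta', 6, 5, 4)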
Example #7
def process_file(file):       
    ## extract index
    index = file.split('.')[0]
    
    ## create output directory
    index_dir = out_dir / index
    index_dir.mkdir(parents=True, exist_ok=True)
    
    ## perform local search against each database
    for file_db in list_files:
        search_output = index + '_' + file_db.replace('.fas','.gz')
        subprocess.check_output(path_diamond + ' blastp --quiet --threads 1 --db ' + str(db_dir / file_db.replace('.fas','.db')) + ' --max-target-seqs ' + str(max_per_species) + ' --query ' + str(Path('dir_step1') / file) + ' \
                --compress 1 --more-sensitive -e ' + str(evalue) + ' -o ' + str(index_dir / search_output) + ' --outfmt 6 qseqid sseqid qstart qend sstart cigar 2>&1', shell=True)
    
    ## get all DIAMOND output files
    p = index_dir.glob('*.gz')
    tmp_l = [x for x in p if x.is_file()]

    ## get all hits in a dict of list
    all_output = collections.defaultdict(list)
    for out_file in tmp_l:
        with gzip.open(out_file, mode="rt") as f:
            file_content = csv.reader(f, delimiter='	')
            for line in file_content:
                # save output
                all_output[line[0]].append(line[1:])     
    
    ## analyse BLAST hits
    nb_phylo    = 0
    nb_NO_phylo = 0
    nb_empty_ali  = 0
    all_alis = dict()
    no_phylo = dict()
    
    for prot in all_output:
        ## variable for reduced list of output
        reduced = list()
        
        ## get all species hits (initialise with query prot)
        ref_species = dict(all_species)
        ref_species[name_2_sp_phylip_seq[prot][0]] += 1
        all_hits = {prot}
        for ll in all_output[prot]:
            target = ll[0]
            target_sp = name_2_sp_phylip_seq[target][0]
            if target not in all_hits:
                ref_species[target_sp] += 1
                all_hits.add(target)
            # reduce output for pickle (convert all element to integers)
            reduced.append(tuple(int(x) for x in ll[:3]))    
        
        ## analyse species content
        nb_present, nb_dupli = analyse_species(ref_species)
        
        ## case phylogenetic analysis
        if nb_present > 1 and nb_dupli > 0:
            nb_phylo += 1
            min_start = math.inf
            max_end   = 0
            all_hits = dict()
            # get all hits for this prot
            for ll in all_output[prot]:
                target = ll[0]
                species = name_2_sp_phylip_seq[target][0]
                qu_start = int(ll[1]) -1
                qu_end   = int(ll[2]) -1
                ta_start = int(ll[3]) -1
                cigar    = ll[4]

                if target in all_hits:
                    # extract HSP and add to target seq
                    HSP = extract_HSP(name_2_sp_phylip_seq[target][2], ta_start, cigar)
                    all_hits[target] = all_hits[target][:qu_start] + HSP + all_hits[target][qu_end + 1:]
                    min_start, max_end = process_location(qu_start, qu_end, min_start, max_end) 
                
                else:
                    ref_species[species] += 1
                    # create target seq
                    all_hits[target] = '-' * len(name_2_sp_phylip_seq[prot][2])
                    # extract HSP
                    HSP = extract_HSP(name_2_sp_phylip_seq[target][2], ta_start, cigar)
                    all_hits[target] = all_hits[target][:qu_start] + HSP + all_hits[target][qu_end + 1:]
                    min_start, max_end = process_location(qu_start, qu_end, min_start, max_end)              
            
            # add query to hits if not there (this can happen when there are many similar sequences from the same species)
            if prot not in all_hits:
                all_hits[prot] = name_2_sp_phylip_seq[prot][2]            
            
            # find good positions
            good_positions = get_positions(prot, all_hits, trim_thres)
            # save alignment
            if len(good_positions) == 0:
                nb_empty_ali += 1
            elif len(good_positions) < 4988:  ## FastTree 2 limitation (5000 per line)
                new_ali = [str(len(all_hits)) + '	' + str(len(good_positions))]
                for name, seq in all_hits.items():
                    trimed_seq = [seq[n] for n in range(len(seq)) if n in good_positions]
                    new_ali.append(name_2_sp_phylip_seq[name][1] + ''.join(trimed_seq))
                all_alis[prot] = '\n'.join(new_ali)
            else:
                # keep only the first 4988 positions (longer alignments are very rare anyway)
                new_ali = [str(len(all_hits)) + '	4988']
                for name, seq in all_hits.items():
                    trimed_seq = [seq[n] for n in range(len(seq)) if n in good_positions][:4988]
                    new_ali.append(name_2_sp_phylip_seq[name][1] + ''.join(trimed_seq)) 
                all_alis[prot] = '\n'.join(new_ali)
                               
        ## case NO phylogenetic analysis
        else:
            nb_NO_phylo += 1
            # sort the list of names for further processing
            no_phylo[prot] = sorted(all_hits)
    
        ## save reduced output
        all_output[prot] = tuple(reduced)
                
    ## convert DIAMOND output (keys to integers) and save it to file
    all_output = {int(x):t for x,t in all_output.items()}
    output_file = index + '_output.pic'
    utils.save_pickle(out_dir / 'dict_output' / output_file, all_output)

    ## save similarity_ortho groups to file
    blast_ortho_file = index + '_similarity_ortho.pic'
    utils.save_pickle(out_dir / 'dict_similarity_ortho' / blast_ortho_file, no_phylo)
    
    ## save all alignments to file
    name_ali_file = 'alis_' + index + '.phy'
    write_ali = open(out_dir / name_ali_file, 'w+')
    all_ref_prot = list()
    for ref_prot, ali in all_alis.items():
        write_ali.write(ali + '\n')
        all_ref_prot.append(ref_prot)
    write_ali.close()
    
    # free memory
    nb_alis = len(all_alis)
    all_alis   = None
    all_output = None
    
    ## deal with the phylogenetic method
    if phylo_method == 'nj':
        insert = '-noml -nome'
    elif phylo_method == 'me':
        insert = '-noml'
    elif phylo_method == 'ml':
        insert = ''
    else:
        sys.exit("\n            ERROR STEP 2: unknown phylogenetic method '" + phylo_method + "'\n\n")
    
    ## perform phylogenetic analyses and root trees
    all_trees  = dict()
    nb_pbm_tree = 0
    a = subprocess.check_output(path_fasttree + ' -quiet -nosupport -fastest -bionj -pseudo ' + insert + ' -n ' + str(nb_alis) + ' ' + str(Path(out_dir / name_ali_file)) + ' 2>&1', shell=True)
    a2 = a.strip().decode("utf-8")
    a3 = a2.split('\n')
    c = -1
    for line in a3:
        # case the line is in the form 'Ignored unknown character ...'
        if line.startswith('Ign'):
            pass
        else:
            c += 1
            if not line.startswith('('):
                nb_pbm_tree += 1            
                # security
                if nb_pbm_tree > 100:
                    sys.exit("\n            ERROR STEP 2: too many errors in phylogenetic analyses -> stopped\n\n")
            else:
                # import tree in ete3 and root it
                ete_tree = PhyloTree(line)
                mid = ete_tree.get_midpoint_outgroup()
                try:
                    ete_tree.set_outgroup(mid)
                except Exception:
                    # keep the tree unrooted if midpoint rooting fails
                    pass
                # get reference protein name
                prot = all_ref_prot[c]
                # save rooted tree
                all_trees[prot] = ete_tree.write()
    
    ## save trees to file
    tree_file = index + '_trees.pic'
    utils.save_pickle(out_dir / 'dict_trees' / tree_file, all_trees)
    
    ## clean directory
    # delete ali file
    Path.unlink(out_dir / name_ali_file)
    # delete Diamond outputs
    shutil.rmtree(index_dir)
    
    return [index, str(nb_phylo), str(nb_NO_phylo), str(nb_empty_ali), str(nb_pbm_tree)]
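A standalone sketch of the DIAMOND output parsing used above: '--outfmt 6' with '--compress 1' produces gzipped tab-separated rows, here read from a made-up file name:

# illustrative only
import collections, csv, gzip

all_output = collections.defaultdict(list)
with gzip.open('12_34.gz', mode='rt') as f:
    for line in csv.reader(f, delimiter='\t'):
        all_output[line[0]].append(line[1:])     # group hits by query sequence ID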
Example #8
def save_outputs(l_com, d_chimeric, d_species):    
    
    ## load original and combined names
    original_name = utils.get_pickle(Path('dir_step1') / 'original_names.pic')
    combined_prot = utils.get_pickle(Path('dir_step1') / 'combined_names.pic')
            
    # create vector nb_species as index and nb_OG as value
    vector_sp = [0] * (len(d_species) + 1)
    
    # create dict for species counts
    nb_per_sp = collections.defaultdict(int)

    # create dict for table OGs
    table_og = dict()
    
    ## save the lists of OGs and get fusion info and get OG info
    d_OG = collections.defaultdict(list)
    c = 0
    OGs_in_network = dict()
    file_list_OGs = open(out_dir / 'orthologous_groups.txt', 'w+')
    file_list_OGs.write('#OG_name	protein_names\n')
    for l in l_com:
        nb_species = count_species(l)
        # keep OG if more than 1 species
        if nb_species > 1:
            vector_sp[nb_species] += 1
            # create name OG
            c += 1
            name_OG = 'OG_' + str(c)
            # create OG vector
            #table_og[name_OG] = {x:0 for x in d_species}
            table_og[name_OG] = {x:list() for x in d_species}
            # save old names
            OGs_in_network[name_OG] = l
            # prepare full OG with combined proteins and save it
            l2 = list()
            for k in l: 
                if k in combined_prot:
                    for k2 in combined_prot[k]:
                        l2.append(original_name[k2])
                else:
                    l2.append(original_name[k])
            file_list_OGs.write(name_OG + '	' + ' '.join(l2) + '\n')
            # update dict per species and vector OG
            for k in l: 
                sp = prot_2_sp[k]
                if k in combined_prot:
                    nb_per_sp[sp] += len(combined_prot[k])
                    #table_og[name_OG][sp] += len(combined_prot[k])
                    for k2 in combined_prot[k]:                    
                        table_og[name_OG][sp].append(original_name[k2])
                else:
                    nb_per_sp[sp] += 1
                    #table_og[name_OG][sp] += 1
                    table_og[name_OG][sp].append(original_name[k])
            # check if gene fusion in OG -> save name OG for each gene-fusion
            for k in l:
                if k in d_chimeric:
                    d_chimeric[k].append(name_OG)
            # count number of edges corresponding to this OG and calculate the clustering coefficient
            nb_edges = 0
            s = set(l)
            for node in l:
                for node2 in all_edges[node]:
                    if node2 in s:
                        nb_edges += 1
            clustering_coefficient = nb_edges / (len(s) * (len(s) - 1))                   
            # save OG info
            d_OG[name_OG] = [str(nb_species), str(len(l)), str(len(l2)), str(round(clustering_coefficient,4))]
       
    ## close the OG list file and save the dict of old names
    file_list_OGs.close()
    utils.save_pickle(out_dir / 'OGs_in_network.pic', OGs_in_network)
          
    ## save gene-fusions
    file_fusions = open(out_dir / 'chimeric_proteins.txt', 'w+')
    file_fusions.write('#species_file	protein_name	nb_OG_fused	list_fused_OGs\n')
    for k,l in d_chimeric.items():
        file_fusions.write(d_species[str(prot_2_sp[k])] + '\t' + original_name[k] + '\t' + str(len(l)) + '\t' + ' '.join(l) + '\n')
    file_fusions.close()
        
    ## save statistics for each OG
    file_stats_each_OG = open(out_dir / 'statistics_per_OG.txt', 'w+')
    file_stats_each_OG.write('#OG_name\tnb_species\tnb_reduced_prot\tnb_all_prot\tclustering_coefficient\n')
    for k,l in d_OG.items():
        file_stats_each_OG.write(k + '\t' + '\t'.join(l) + '\n')
    file_stats_each_OG.close()

    ## save table of OG protein counts
    file_table_counts = open(out_dir / 'table_OGs_protein_counts.txt', 'w+')
    file_table_counts.write('#OG_name\t' + '\t'.join(x for x in d_species.values()) + '\n')
    for og_name, d in table_og.items():
        file_table_counts.write(og_name + '\t' + '\t'.join(str(len(x)) for x in d.values()) + '\n')
    file_table_counts.close()

    ## save table of OG protein names
    file_table_names = open(out_dir / 'table_OGs_protein_names.txt', 'w+')
    file_table_names.write('#OG_name\t' + '\t'.join(x for x in d_species.values()) + '\n')
    for og_name, d in table_og.items():
        file_table_names.write(og_name + '\t' + '\t'.join(' '.join(x) for x in d.values()) + '\n')
    file_table_names.close()
    
    ## calculate nb total prot per species, and then % assigned
    total_per_sp = collections.defaultdict(int)
    for sp in prot_2_sp.values():
        total_per_sp[sp] += 1
    perc_per_sp = {sp: (100 * nb_per_sp[sp] / total) for sp, total in total_per_sp.items()}
    
    ## save statistics for each species
    file_stats_each_species = open(out_dir / 'statistics_per_species.txt', 'w+')
    file_stats_each_species.write('#species\tperc_prot_assigned\tnb_prot_assigned\n')
    for sp, perc in perc_per_sp.items():
        file_stats_each_species.write(d_species[sp] + '\t' + str(round(perc,1)) + '\t' + str(nb_per_sp[sp]) + '\n')
    file_stats_each_species.close()

    ## save OG stats: nb species VS nb OGs
    file_stats_OGs_sp = open(out_dir / 'statistics_nb_OGs_VS_nb_species.txt', 'w+')
    file_stats_OGs_sp.write('#nb_species\tnb_OGs\n')
    for i,v in enumerate(vector_sp):
        file_stats_OGs_sp.write(str(i) + '\t' + str(v) + '\n')
    file_stats_OGs_sp.close()
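A worked example of the edge counting and clustering coefficient computed above, with a made-up adjacency dict all_edges (node -> set of neighbours) and a three-protein OG:

# illustrative only
all_edges = {'a': {'b', 'c'}, 'b': {'a'}, 'c': {'a'}}
l = ['a', 'b', 'c']
s = set(l)
nb_edges = sum(1 for node in l for node2 in all_edges[node] if node2 in s)   # 4 directed edges
clustering_coefficient = nb_edges / (len(s) * (len(s) - 1))                  # 4 / 6
print(round(clustering_coefficient, 4))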