def gtf_to_iadhore_list(infile, outfolder):
    """
    Convert GTF file to i-ADHoRe list

    Every record whose ``feature`` is ``gene`` is collected, grouped by
    chromosome and sorted by start coordinate. One ``<chromosome>.lst`` file
    is written per chromosome; each line is the gene id immediately followed
    by its strand.

    :param infile: GTF file
    :param outfolder: Folder that receives a sub-folder named after the part
        of the GTF file name before the first dot
    :return:
    """
    # NOTE(review): check_folder_path is defined elsewhere; the string
    # concatenations below assume it returns the path with a trailing
    # separator - confirm.
    outfolder = check_folder_path(outfolder, True)
    gtf_dict = gtf_to_dict(infile)
    suboutfolder = outfolder + os.path.basename(infile).split('.')[0]
    suboutfolder = check_folder_path(suboutfolder, True)

    # Create pandas dataframe
    gene_location_list = []
    for key, values in gtf_dict.items():
        for value in values:
            if value['feature'] == 'gene':
                gene_location_list.append(
                    (value['attribute']['gene_id'], value['strand'], key,
                     int(value['start']), int(value['end'])))
    labels = ['gene', 'strand', 'chromosome', 'start', 'end']
    df = pd.DataFrame.from_records(gene_location_list, columns=labels)

    # Group by a scalar key rather than a one-element list: with a list,
    # recent pandas versions hand back the group name as a 1-tuple, which
    # cannot be concatenated into the output path.
    for chromosome, group in df.groupby('chromosome'):
        outfile = suboutfolder + chromosome + '.lst'
        group_sort = group.sort_values(['start'])
        # Context manager closes the file even if a write fails.
        with open(outfile, 'w') as fout:
            for gene, strand in zip(group_sort['gene'],
                                    group_sort['strand']):
                print(gene + strand, file=fout)
def storm(args):
    """
    Forward parsed arguments to ``cs.run_storm``

    :param args: Object exposing ``input``, ``pwm``, ``output``, ``opt`` and
        ``bcomp`` attributes
    :return:
    """
    genome_dir = check_folder_path(args.input)
    matrices_dir = check_folder_path(args.pwm)
    result_dir = check_folder_path(args.output)
    cs.run_storm(genome_dir, matrices_dir, result_dir, args.opt, args.bcomp)
def run_iadhore(orthogroups_file, gtf_folder, id_conversion_folder,
                iadhore_parameter_file, outfolder, protein_column=0,
                gene_column=1, column_sep="\t"):
    """
    Prepare all i-ADHoRe inputs inside a working directory and run i-ADHoRe

    :param orthogroups_file: Orthogroups file
    :param gtf_folder: GTF folder
    :param id_conversion_folder: ID conversion folder
    :param iadhore_parameter_file: i-ADHoRe parameter file
    :param outfolder: i-ADHoRe output folder
    :param protein_column: Protein column
    :param gene_column: Gene column
    :param column_sep: Column separator
    :return:
    """
    outfolder = check_folder_path(outfolder, True)
    workdir = check_folder_path(outfolder + 'working_directory/', True)

    # Orthogroups expressed with gene ids instead of protein ids
    gene_orthogroups = workdir + 'Orthogroups.txt'
    orthogroups_protein_to_gene(orthogroups_file, id_conversion_folder,
                                gene_orthogroups, protein_column,
                                gene_column, column_sep)

    # Family table consumed by i-ADHoRe
    family_table = workdir + 'iadhore_family.tsv'
    orthogroups_to_iadhore_family_file(gene_orthogroups, family_table)

    # Unfiltered gene lists are only an intermediate product
    unfiltered_lists = check_folder_path(workdir + 'temporary_genes_list/',
                                         True)
    gtf_to_iadhore_list_folder(gtf_folder, unfiltered_lists)

    kept_lists = check_folder_path(workdir + 'genes_list', True)
    iadhore_list_family_filtering(unfiltered_lists, family_table, kept_lists)

    # remove temporary genes list
    subprocess.call(['rm', '-r', unfiltered_lists])

    config_path = workdir + 'iadhore_config.ini'
    create_iadhore_config(kept_lists, family_table, iadhore_parameter_file,
                          outfolder, config_path)

    # run i-ADHoRe
    subprocess.call(['i-adhore', config_path])
def gtf_to_iadhore_list_folder(infolder, outfolder):
    """
    Convert every GTF file of a folder to i-ADHoRe lists

    :param infolder: GTF folder
    :param outfolder: i-ADHoRe list folder
    :return:
    """
    source_dir = check_folder_path(infolder)
    target_dir = check_folder_path(outfolder, True)
    for gtf_path in glob.glob(source_dir + '*'):
        gtf_to_iadhore_list(gtf_path, target_dir)
def run_mosyn(iadhore_output_folder, storm_output_folder, gtf_folder,
              outfolder, window=0.1, complete=True, binding_site_id="BS",
              id_start_index=0, motif_name="MOTIF"):
    """
    Run MoSyn and dump the aligned motifs as ``synteny.txt`` and
    ``synteny.yaml`` in the result folder

    :param iadhore_output_folder: i-ADHoRe output folder
    :param storm_output_folder: CREAD STORM output folder
    :param gtf_folder: GTF folder
    :param outfolder: MoSyn result folder
    :param window: Window size of alignment
    :param complete: Complete alignment
    :param binding_site_id: The binding site id
    :param id_start_index: The binding site id start index
    :param motif_name: The name of the motif
    :return:
    """
    outfolder = check_folder_path(outfolder, True)
    workdir = check_folder_path(outfolder + 'working_directory/', True)

    # STORM hits are first converted to GTF
    motif_gtf_dir = check_folder_path(workdir + 'motifs_gtf/', True)
    ps.transfac_to_gtf_folder(storm_output_folder, motif_gtf_dir,
                              binding_site_id, id_start_index, motif_name)

    synteny = pi.iadhore_result_folder_to_dict(iadhore_output_folder)
    if complete:
        synteny = pi.get_complete_synteny_dict(synteny)

    # Each stage consumes the result of the previous one
    located = add_location_to_iadhore_synteny(synteny, gtf_folder)
    with_motifs = add_motifs_into_synteny(located, motif_gtf_dir)
    by_position = restructure_iadhore_dict_to_position(with_motifs)
    aligned = align_motifs_in_synteny(by_position, window)
    if complete:
        aligned = get_complete_motifs_synteny(aligned)

    dump_aligned_motifs_to_flat_synteny(aligned, outfolder + "synteny.txt")
    dump_aligned_motifs_to_serial(aligned, outfolder + "synteny.yaml")
def create_storm_bash(script_file, genome_dir, pwm_dir, outdir, bash_outfile,
                      min_threshold, max_threshold, increment):
    """
    Write a bash file with one ``storm`` command line per threshold

    Thresholds run from ``min_threshold`` to ``max_threshold`` inclusive in
    steps of ``increment``. Each command writes to a sub-folder of ``outdir``
    named after the threshold and passes ``'-t <threshold>'`` as ``--opt``.

    :param script_file: Python script invoked by every generated line
    :param genome_dir: Value written after ``--input``
    :param pwm_dir: Value written after ``--pwm``
    :param outdir: Parent folder of the per-threshold output folders
    :param bash_outfile: Bash file to create
    :param min_threshold: First threshold (integer)
    :param max_threshold: Last threshold, inclusive (integer)
    :param increment: Step between thresholds (integer)
    :return:
    """
    outdir = check_folder_path(outdir)
    # Context manager guarantees the script is flushed and closed even if
    # formatting one of the lines raises.
    with open(bash_outfile, 'w') as fout:
        for i in range(min_threshold, max_threshold + 1, increment):
            # Single quotes keep "-t N" together as one shell argument
            storm_opt = '\'-t ' + str(i) + '\''
            sub_outdir = outdir + str(i)
            print('python', script_file, 'storm', '--input', genome_dir,
                  '--pwm', pwm_dir, '--output', sub_outdir, '--opt',
                  storm_opt, '--bcomp', file=fout)
def clean_header_fasta_folder(infolder, outfolder):
    """
    Run ``clean_header_fasta`` on every entry of a folder, keeping file names

    :param infolder: FASTA folder
    :param outfolder: FASTA with clean header folder
    :return:
    """
    source_dir = check_folder_path(infolder)
    target_dir = check_folder_path(outfolder, True)
    for fasta_path in glob.glob(source_dir + '*'):
        clean_header_fasta(fasta_path,
                           target_dir + os.path.basename(fasta_path))
def iadhore_result_folder_to_dict(infolder):
    """
    Load ``multiplicons.txt``, ``segments.txt`` and ``list_elements.txt``
    from an i-ADHoRe output folder and nest them into one dictionary

    :param infolder: i-ADHoRe output folder
    :return: Multiplicons dictionary merged with segments, which are in turn
        merged with elements
    """
    base = check_folder_path(infolder)

    # Files are parsed in this order: multiplicons, segments, elements
    multiplicons, segments, elements = [
        iadhore_result_file_to_dict(base + name)
        for name in ('multiplicons.txt', 'segments.txt', 'list_elements.txt')
    ]

    # elements go into segments first, then segments into multiplicons
    segments_with_elements = merge_iadhore_file_dict(
        segments, elements, "segment", "elements")
    return merge_iadhore_file_dict(
        multiplicons, segments_with_elements, "multiplicon", "segments")
def gtf_to_json_folder(infolder, outfolder):
    """
    Convert every GTF file of a folder to JSON

    The output name is the input name up to its first dot plus ``.json``.

    :param infolder: GTF folder
    :param outfolder: JSON folder
    :return:
    """
    source_dir = check_folder_path(infolder)
    target_dir = check_folder_path(outfolder, True)
    for gtf_path in glob.glob(source_dir + '*'):
        stem = os.path.basename(gtf_path).partition(".")[0]
        gtf_to_json(gtf_path, target_dir + stem + ".json")
def fasta_to_conversion_folder(infolder, outfolder, db_name):
    """
    Convert every FASTA file of a folder to an ID Conversion file containing
    Protein ID -> Gene ID

    The output name is the input name up to its first dot plus ``.tsv``.

    :param infolder: FASTA folder
    :param outfolder: ID Conversion folder
    :param db_name: DataBase source, e.g., FlyBase, WormBase
    :return:
    """
    source_dir = check_folder_path(infolder)
    target_dir = check_folder_path(outfolder, True)
    for fasta_path in glob.glob(source_dir + '*'):
        stem = os.path.basename(fasta_path).partition(".")[0]
        fasta_to_conversion_file(fasta_path, target_dir + stem + ".tsv",
                                 db_name)
def fasta_to_json_folder(infolder, outfolder, db_name):
    """
    Convert every FASTA file of a folder to JSON

    The output name is the input name up to its first dot plus ``.json``.

    :param infolder: FASTA folder
    :param outfolder: JSON folder
    :param db_name: DataBase source, e.g., FlyBase, WormBase
    :return:
    """
    source_dir = check_folder_path(infolder)
    target_dir = check_folder_path(outfolder, True)
    for fasta_path in glob.glob(source_dir + '*'):
        stem = os.path.basename(fasta_path).partition(".")[0]
        fasta_to_json(fasta_path, target_dir + stem + ".json", db_name)
def create_loop_bash(result_folder, output_folder, pwm_names, list_genus,
                     list_alignment, range_score, script_file, bash_outfile,
                     min_length=80000, max_length=800000):
    """
    Write a bash file with one ``idloop`` command line per
    genus / alignment / score / PWM combination

    The input of every command is ``synteny.yaml`` below
    ``result_folder/<genus>/<alignment>/<score>/<pwm>``; the output goes to
    the same relative path below ``output_folder``.

    :param result_folder: Root folder holding the synteny results
    :param output_folder: Root folder for the loop results
    :param pwm_names: Iterable of PWM names
    :param list_genus: Iterable of genus names
    :param list_alignment: Iterable of alignment names
    :param range_score: Iterable of scores
    :param script_file: Python script invoked by every generated line
    :param bash_outfile: Bash file to create
    :param min_length: Value written after ``--min``
    :param max_length: Value written after ``--max``
    :return:
    """
    result_folder = check_folder_path(result_folder)
    output_folder = check_folder_path(output_folder)
    # Context manager closes the script even if a path check raises midway.
    with open(bash_outfile, 'w') as fout:
        for genus in list_genus:
            for alignment in list_alignment:
                for score in range_score:
                    for pwm in pwm_names:
                        # Input and output share the same relative layout
                        relative = "/".join(
                            [genus, alignment, str(score), pwm])
                        this_input = check_folder_path(
                            result_folder + relative)
                        this_input += "synteny.yaml"
                        this_outdir = check_folder_path(
                            output_folder + relative)
                        print("python", script_file, "idloop", "--input",
                              this_input, "--output", this_outdir, "--min",
                              min_length, "--max", max_length, file=fout)
def iadhore_list_family_filtering(iadhore_genes_list, iadhore_family_file,
                                  outfolder):
    """
    Select only genes that are in the family file

    Every list file found one folder level below ``iadhore_genes_list`` is
    copied to the same sub-folder name under ``outfolder``, keeping only the
    lines whose gene occurs in the family file. A list with no such gene
    produces no output file.

    :param iadhore_genes_list: i-ADHoRe list
    :param iadhore_family_file: i-ADHoRe family file
    :param outfolder: Output folder
    :return:
    """
    iadhore_genes_list = check_folder_path(iadhore_genes_list)
    outfolder = check_folder_path(outfolder, True)
    family_genes = set(iadhore_family_to_dict(iadhore_family_file).keys())
    # glob is called without recursive=True, so '**' behaves like '*'
    for p in glob.glob(iadhore_genes_list + '**/*'):
        if not os.path.isfile(p):
            continue
        # Read once; the context manager closes the handle on any error
        with open(p, 'r') as fin:
            lines = fin.readlines()
        # The last character of each stripped line is dropped to obtain the
        # gene id before the family lookup.
        kept_lines = [line for line in lines
                      if line.strip()[:-1] in family_genes]
        if not kept_lines:
            # nothing of this list belongs to a family: write no file
            continue
        sub_outfolder = outfolder + os.path.split(os.path.dirname(p))[-1]
        sub_outfolder = check_folder_path(sub_outfolder, True)
        outfile = sub_outfolder + os.path.basename(p)
        with open(outfile, 'w') as fout:
            fout.writelines(kept_lines)
def run_orthofinder(infolder, outfolder, python2env, orthofinder_options=None):
    """
    Run OrthoFinder

    FASTA headers are cleaned into a working directory, a bash script that
    activates ``python2env`` and calls ``orthofinder -f`` is written and
    executed, and afterwards the contents of every directory found inside
    the cleaned FASTA folder are moved to ``outfolder``.

    :param infolder: Input folder
    :param outfolder: Output folder
    :param python2env: Python 2 environment
    :param orthofinder_options: Other options for OrthoFinder, as one
        space-separated string
    :return:
    """
    outfolder = check_folder_path(outfolder, True)
    working_directory = outfolder + 'working_directory/'
    working_directory = check_folder_path(working_directory, True)
    cleaned_fasta = working_directory + 'cleaned_fasta/'
    cleaned_fasta = check_folder_path(cleaned_fasta, True)
    pf.clean_header_fasta_folder(infolder, cleaned_fasta)

    list_command = ['orthofinder', '-f', cleaned_fasta]
    if orthofinder_options:
        list_command += orthofinder_options.split(' ')

    orthofinder_script = working_directory + 'orthofinder.bash'
    # The script must be closed (hence flushed) before bash reads it; the
    # context manager guarantees that even if a print fails.
    with open(orthofinder_script, 'w') as fout:
        print('source', 'activate', python2env, file=fout)
        print(' '.join(list_command), file=fout)
    subprocess.call(['bash', orthofinder_script])

    # Move the contents of each directory next to the cleaned FASTA files
    # into the output folder, then delete that directory.
    for p in glob.glob(cleaned_fasta + '*'):
        if os.path.isdir(p):
            p = check_folder_path(p)
            for r in glob.glob(p + '*'):
                subprocess.call(['mv', r, outfolder])
            subprocess.call(['rm', '-r', p])
def transfac_to_gtf_folder(infolder, outfolder, binding_site_id="BS",
                           id_start_index=0, motif_name="MOTIF"):
    """
    Convert a TRANSFAC folder / STORM output to GTF

    Files are processed in sorted order, and the index returned by
    ``transfac_to_gtf`` for one file is passed as start index to the next.

    :param infolder: TRANSFAC / STORM Output folder
    :param outfolder: GTF folder
    :param binding_site_id: The ID of the binding sites, e.g., BS. It will be
        BS0, BS1, .., BSx in the output
    :param id_start_index: the start index of the motif ID, e.g., 0,1,2,...,x
    :param motif_name: The motif name, e.g., CTCF.
    :return:
    """
    source_dir = check_folder_path(infolder)
    target_dir = check_folder_path(outfolder, True)
    next_index = id_start_index
    for transfac_path in sorted(glob.glob(source_dir + '*')):
        stem = os.path.basename(transfac_path).partition(".")[0]
        next_index = transfac_to_gtf(transfac_path,
                                     target_dir + stem + ".gtf",
                                     binding_site_id, next_index, motif_name,
                                     True)
def id_conversion_folder_to_dict(infolder, protein_column=0, gene_column=1,
                                 column_sep="\t"):
    """
    Merge the ID conversion dictionaries of all files of a folder

    A key that occurs in several files keeps the value of the file that is
    processed last.

    :param infolder: ID conversion folder
    :param protein_column: Protein column
    :param gene_column: Gene column
    :param column_sep: Column separator
    :return: Python dictionary
    """
    merged = {}
    source_dir = check_folder_path(infolder)
    for conversion_path in glob.glob(source_dir + '*'):
        per_file = id_conversion_file_to_dict(conversion_path, protein_column,
                                              gene_column, column_sep)
        merged.update(per_file.items())
    return merged
def idloop(args):
    """
    Identify loops in a synteny file and store them as ``loops.csv`` and
    ``loops.yaml`` in the output folder

    :param args: Object exposing ``input``, ``output``, ``min`` and ``max``
        attributes
    :return:
    """
    result_dir = check_folder_path(args.output, True)
    csv_path = result_dir + "loops.csv"
    yaml_path = result_dir + "loops.yaml"
    loops = alo.identify_loops_in_synteny(args.input, args.min, args.max)
    alo.print_loops_to_csv(loops, csv_path)
    with open(yaml_path, 'w') as stream:
        yaml.dump(loops, stream)
def generate_orthofinder_summary(infolder, genus_index=-3, alignment_index=-2):
    """
    Generate OrthoFinder summary

    Writes ``orthofinder_summary.csv`` into the current working directory
    with one row per ``Statistics_Overall.csv`` found recursively below
    ``infolder``.

    :param infolder: Input folder containing result
    :param genus_index: Genus index folder relative to result file
    :param alignment_index: Alignment index folder relative to result file
    :return:
    """
    outfile = "orthofinder_summary.csv"
    summary_keywords = [
        "Number of genes", "Number of genes in orthogroups",
        "Number of unassigned genes", "Number of orthogroups"
    ]
    # Context managers close both the summary and every statistics file
    # even when a value cannot be parsed.
    with open(outfile, 'w') as fout:
        print("Genus", "Alignment", "Number_of_Genes",
              "Number_of_Genes_in_Orthogroups", "Number_of_Unassigned_Genes",
              "Number_of_Orthogroups", sep=",", file=fout)
        infolder = check_folder_path(infolder)
        for summary in glob.glob(infolder + '**/Statistics_Overall.csv',
                                 recursive=True):
            path_elem = summary.split('/')
            genus = path_elem[genus_index]
            alignment = path_elem[alignment_index]
            with open(summary, 'r') as fin:
                # split each tab-separated line once instead of once per
                # keyword
                rows = [line.strip().split('\t') for line in fin]
            # Values are ordered by keyword; a keyword that is absent from
            # the file contributes no column.
            summary_write = [
                str(int(row[-1]))
                for keyword in summary_keywords
                for row in rows
                if row[0] == keyword
            ]
            print(genus, alignment, ",".join(summary_write), sep=",",
                  file=fout)
def create_mosyn_bash(result_folder, material_folder, output_folder,
                      pwm_names, list_genus, list_alignment, range_score,
                      script_file, bash_outfile):
    """
    Write a bash file with one ``mosyn`` command line per
    genus / alignment / score / PWM combination

    Every generated line uses ``--complete --mid CTCF --idx 0 --name CTCF``.

    :param result_folder: Root folder with ``IADHORE/<genus>/<alignment>``
        and ``STORM/<genus>/<score>/<pwm>`` sub-folders
    :param material_folder: Root folder with ``<genus>/GTF`` sub-folders
    :param output_folder: Root folder for the MoSyn results
    :param pwm_names: Iterable of PWM names
    :param list_genus: Iterable of genus names
    :param list_alignment: Iterable of alignment names
    :param range_score: Iterable of scores
    :param script_file: Python script invoked by every generated line
    :param bash_outfile: Bash file to create
    :return:
    """
    result_folder = check_folder_path(result_folder)
    material_folder = check_folder_path(material_folder)
    output_folder = check_folder_path(output_folder)
    # Context manager closes the script even if a path check raises midway.
    with open(bash_outfile, 'w') as fout:
        for genus in list_genus:
            for alignment in list_alignment:
                for score in range_score:
                    for pwm in pwm_names:
                        # The folder checks stay in the innermost loop so
                        # that check_folder_path is called exactly as often,
                        # and in the same order, as before.
                        iadhore_output = check_folder_path(
                            result_folder + "/".join(
                                ["IADHORE", genus, alignment]))
                        storm_output = check_folder_path(
                            result_folder + "/".join(
                                ["STORM", genus, str(score), pwm]))
                        gtf_folder = check_folder_path(
                            material_folder + "/".join([genus, "GTF"]))
                        this_outdir = check_folder_path(
                            output_folder + "/".join(
                                [genus, alignment, str(score), pwm]))
                        print("python", script_file, "mosyn", "--iadhore",
                              iadhore_output, "--storm", storm_output,
                              "--gtf", gtf_folder, "--output", this_outdir,
                              "--complete", "--mid", "CTCF", "--idx", 0,
                              "--name", "CTCF", file=fout)
def gtf_to_dict_folder(infolder):
    """
    Convert all GTF files of a folder to one dictionary

    :param infolder: GTF folder
    :return: Python dictionary mapping each file name (up to its first dot)
        to the result of ``gtf_to_dict`` for that file
    """
    collected = {}
    source_dir = check_folder_path(infolder)
    for gtf_path in glob.glob(source_dir + '*'):
        stem = os.path.basename(gtf_path).partition(".")[0]
        collected[stem] = gtf_to_dict(gtf_path)
    return collected
def runall(args):
    """
    Run the whole pipeline: OrthoFinder, i-ADHoRe, STORM and finally MoSyn
    once per PWM file

    :param args: Object exposing the attributes read below (``output``,
        ``proteome``, ``python2``, ``ofopt``, ``gtf``, ``idconv``, ``param``,
        ``pcol``, ``gcol``, ``csep``, ``genome``, ``pwm``, ``stopt``,
        ``bcomp``, ``window``, ``complete``, ``mid``, ``idx``, ``name``)
    :return:
    """
    outfolder = check_folder_path(args.output, True)
    workdir = check_folder_path(outfolder + "working_directory/", True)

    # Step 1: OrthoFinder
    orthofinder_dir = check_folder_path(workdir + "orthofinder_output/", True)
    co.run_orthofinder(args.proteome, orthofinder_dir,
                       python2env=args.python2,
                       orthofinder_options=args.ofopt)

    # Step 2: i-ADHoRe on the orthogroups
    gtf_folder = args.gtf
    orthogroups_file = orthofinder_dir + "Orthogroups.txt"
    iadhore_dir = check_folder_path(workdir + "iadhore_output/", True)
    ci.run_iadhore(orthogroups_file, gtf_folder, args.idconv, args.param,
                   iadhore_dir, args.pcol, args.gcol, args.csep)

    # Step 3: STORM for all genomes and PWMs
    pwm_folder = check_folder_path(args.pwm)
    storm_dir = check_folder_path(workdir + "storm_output/", True)
    cs.run_storm(args.genome, pwm_folder, storm_dir,
                 storm_options=args.stopt, calculate_base_comp=args.bcomp)

    # Step 4: MoSyn, one result folder per PWM
    for pwm_path in glob.glob(pwm_folder + '*'):
        pwm_stem = os.path.basename(pwm_path).split('.')[0]
        pwm_storm_dir = check_folder_path(storm_dir + pwm_stem)
        pwm_result_dir = check_folder_path(outfolder + pwm_stem, True)
        cm.run_mosyn(iadhore_dir, pwm_storm_dir, gtf_folder, pwm_result_dir,
                     args.window, args.complete, args.mid, args.idx,
                     args.name)
def run_storm(infolder, pwm_folder, outfolder, storm_options=None,
              calculate_base_comp=False):
    """
    Run CREAD STORM for every genome / PWM pair

    Results are written to ``<outfolder>/<pwm name>/<genome name>.storm``,
    where both names are the file names up to their first dot.

    :param infolder: Folder containing genome files in .fasta format
    :param pwm_folder: Folder containing position weight matrices files in
        TRANSFAC format
    :param outfolder: Output folder
    :param storm_options: Space-separated string of extra options appended to
        the ``storm`` command
    :param calculate_base_comp: Calculate base composition
    :return:
    """
    infolder = check_folder_path(infolder)
    pwm_folder = check_folder_path(pwm_folder)
    outfolder = check_folder_path(outfolder, True)
    workdir = check_folder_path(outfolder + 'working_directory/', True)
    cleaned_dir = check_folder_path(workdir + 'cleaned_fasta/', True)
    pf.clean_header_fasta_folder(infolder, cleaned_dir)

    for genome_fasta in glob.glob(cleaned_dir + '*'):
        # Arguments that follow the fixed part of the command; they are the
        # same for every PWM of this genome.
        trailing_args = []
        if calculate_base_comp:
            trailing_args.append(
                '--base-comp=' + get_base_composition(genome_fasta))
        if storm_options:
            trailing_args.extend(storm_options.split(' '))

        for matrix in glob.glob(pwm_folder + '*'):
            matrix_dir = check_folder_path(
                outfolder + os.path.basename(matrix).split('.')[0], True)
            result_file = (matrix_dir
                           + os.path.basename(genome_fasta).split('.')[0]
                           + '.storm')
            subprocess.call(
                ['storm', '-s', genome_fasta, matrix, '-o', result_file]
                + trailing_args)
def create_iadhore_config(iadhore_genes_list, iadhore_family_file,
                          iadhore_parameter_file, iadhore_result_folder,
                          outfile):
    """
    Create i-ADHoRe configuration file

    The file lists one ``genome=`` section per sub-folder of the genes list
    folder (one ``<chromosome> <path>`` line per list file), followed by
    ``output_path``, ``blast_table``, ``table_type=family`` and a verbatim
    copy of the parameter file.

    :param iadhore_genes_list: i-ADHoRe genes list
    :param iadhore_family_file: i-ADHoRe family file
    :param iadhore_parameter_file: i-ADHoRe parameter file
    :param iadhore_result_folder: i-ADHoRe result folder
    :param outfile: i-ADHoRe configuration file
    :return:
    """
    # species (name of the parent folder) -> list files of that species
    chromosome_dict = dict()
    # glob is called without recursive=True, so '**' behaves like '*'
    for g in glob.glob(iadhore_genes_list + '**/*'):
        if os.path.isfile(g):
            species = g.split('/')[-2]
            chromosome_dict.setdefault(species, []).append(g)

    # Context managers close both files even if a write fails.
    with open(outfile, 'w') as fout:
        for key, list_files in chromosome_dict.items():
            fout.write('genome=' + key + '\n')
            for val in list_files:
                chromosome = os.path.basename(val).split('.')[0]
                fout.write(chromosome + ' ' + val + '\n')
            fout.write('\n')
        iadhore_result_folder = check_folder_path(iadhore_result_folder, True)
        fout.write('output_path=' + iadhore_result_folder + '\n')
        fout.write('blast_table=' + iadhore_family_file + '\n')
        fout.write('table_type=family\n')
        with open(iadhore_parameter_file, 'r') as fin:
            for line in fin:
                fout.write(line)
def generate_mosyn_summary(infolder, genus_index=-5, alignment_index=-4,
                           score_index=-3, pwm_index=-2):
    """
    Generate MoSyn summary

    Writes ``mosyn_detail_summary.csv`` (one row per synteny) and
    ``mosyn_short_summary.csv`` (one row per ``synteny.yaml`` file) into the
    current working directory. Averages whose denominator is zero are
    reported as 0.

    :param infolder: Input folder containing result
    :param genus_index: Genus index folder relative to result file
    :param alignment_index: Alignment index folder relative to result file
    :param score_index: Score index folder relative to result file
    :param pwm_index: PWM index folder relative to result file
    :return:
    """
    outfile = "mosyn_detail_summary.csv"
    outfile0 = "mosyn_short_summary.csv"
    # Context manager closes both summaries even if one input is malformed.
    with open(outfile, 'w') as fout, open(outfile0, 'w') as fout0:
        print("Genus", "Alignment", "Score", "PWM", "Synteny_ID",
              "Number_of_Segments", "Number_of_Genes", "Number_of_CTCF",
              "Total_Length", sep=",", file=fout)
        print("Genus", "Alignment", "Score", "PWM", "Number_of_Synteny",
              "Number_of_Genes_in_Synteny",
              "Number_of_Synteny_containing_Motifs",
              "Number_of_Motifs_in_Synteny",
              "Average_Number_of_Genes_per_Synteny_per_Species",
              "Average_Length_per_Synteny_per_Species",
              "Average_Number_of_Motifs_per_Synteny_per_Species",
              sep=",", file=fout0)
        infolder = check_folder_path(infolder)
        for synt in glob.glob(infolder + '**/synteny.yaml', recursive=True):
            path_elem = synt.split('/')
            genus = path_elem[genus_index]
            alignment = path_elem[alignment_index]
            score = path_elem[score_index]
            pwm = path_elem[pwm_index]
            with open(synt, 'r') as stream:
                # PyYAML >= 6 requires an explicit Loader.
                # NOTE(review): FullLoader still builds some Python-specific
                # objects; only feed it files written by this pipeline.
                iadhore_dict_with_motifs = yaml.load(
                    stream, Loader=yaml.FullLoader) or {}

            synteny_length = 0
            synteny_genes = 0
            num_of_mult = 0
            synteny_motifs = 0
            synteny_contain = 0
            set_of_genes = set()
            set_of_motifs = set()
            for key, value in sorted(iadhore_dict_with_motifs.items()):
                num_of_mult += 1
                # (genome, chromosome) -> [smallest start, largest end]
                mult_size = dict()
                mult_motifs = 0
                mult_genes = 0
                check_contain = False
                for ke, val in value.items():
                    if ke == "loops":
                        continue
                    for k, v in val.items():
                        if k == "motifs":
                            check_contain = True
                            for pair in v:
                                for motif in pair:
                                    mult_motifs += 1
                                    set_of_motifs.add(motif["motif"])
                        else:
                            mult_genes += 1
                            v_gc = (v["genome"], v["chromosome"])
                            span = mult_size.setdefault(
                                v_gc, [v["start"], v["end"]])
                            span[0] = min(span[0], v["start"])
                            span[-1] = max(span[-1], v["end"])
                            set_of_genes.add(v["gene"])
                if check_contain:
                    synteny_contain += 1
                num_of_segments = len(mult_size)
                mult_length = sum(abs(y - x) for x, y in mult_size.values())
                # A synteny without any gene has no segment: contribute 0
                # instead of dividing by zero.
                if num_of_segments:
                    synteny_length += mult_length / num_of_segments
                    synteny_genes += mult_genes / num_of_segments
                    synteny_motifs += mult_motifs / num_of_segments
                print(genus, alignment, score, pwm, key, num_of_segments,
                      mult_genes, mult_motifs, mult_length, sep=",",
                      file=fout)

            nr_genes = len(set_of_genes)
            nr_motifs = len(set_of_motifs)
            # An empty synteny file yields a row of zeros, not a crash
            avg_synteny_length = 0
            avg_synteny_genes = 0
            if num_of_mult:
                avg_synteny_length = synteny_length / num_of_mult
                avg_synteny_genes = synteny_genes / num_of_mult
            # Motif average is taken over motif-containing synteny only
            avg_synteny_motifs = 0
            if synteny_contain:
                avg_synteny_motifs = synteny_motifs / synteny_contain
            print(genus, alignment, score, pwm, num_of_mult, nr_genes,
                  synteny_contain, nr_motifs, avg_synteny_genes,
                  avg_synteny_length, avg_synteny_motifs, sep=",",
                  file=fout0)
def generate_loop_synteny_gtf(infolder, outfolder, genus_index=-5,
                              alignment_index=-4, score_index=-3,
                              pwm_index=-2):
    """
    Generate GTF files describing synteny regions and loops

    For every ``loops.yaml`` found recursively below ``infolder``, one GTF
    file per genome is written to
    ``<outfolder>/synteny/<genus>/<alignment>/<score>/<pwm>/`` (feature
    ``synteny``) and to ``<outfolder>/loop/...`` (feature ``loop_like``).

    :param infolder: Input folder containing result
    :param outfolder: Output folder
    :param genus_index: Genus index folder relative to result file
    :param alignment_index: Alignment index folder relative to result file
    :param score_index: Score index folder relative to result file
    :param pwm_index: PWM index folder relative to result file
    :return:
    """
    infolder = check_folder_path(infolder)
    outfolder = check_folder_path(outfolder, True)
    for loop_file in glob.glob(infolder + '**/loops.yaml', recursive=True):
        path_elem = loop_file.split('/')
        genus = path_elem[genus_index]
        alignment = path_elem[alignment_index]
        score = path_elem[score_index]
        pwm = path_elem[pwm_index]
        with open(loop_file, 'r') as stream:
            # PyYAML >= 6 requires an explicit Loader.
            # NOTE(review): FullLoader still builds some Python-specific
            # objects; only feed it files written by this pipeline.
            iadhore_dict_with_loops = yaml.load(
                stream, Loader=yaml.FullLoader) or {}

        # genome -> list of GTF rows
        loops_dict = dict()
        synteny_dict = dict()
        loop_index = 1
        for key, value in sorted(iadhore_dict_with_loops.items()):
            # (genome, chromosome) -> flat list of start/end coordinates in
            # position order
            mult_size = dict()
            position_keys = sorted([k for k in value.keys() if k != "loops"])
            for pk in position_keys:
                segment_keys = sorted(
                    [k for k in value[pk].keys() if k != "motifs"])
                for sk in segment_keys:
                    gene = value[pk][sk]
                    gene_gc = (gene["genome"], gene["chromosome"])
                    mult_size.setdefault(gene_gc, []).extend(
                        [gene["start"], gene["end"]])

            for k, v in mult_size.items():
                # Strand compares the first stored start with the last
                # stored end
                synteny_strand = '+' if v[0] < v[-1] else '-'
                synteny_attribute = "synteny_id \"" + str(key) + "\";"
                synteny_dict.setdefault(k[0], []).append([
                    k[1], "iADHoRe", "synteny", min(v), max(v), ".",
                    synteny_strand, ".", synteny_attribute
                ])

            if "loops" not in value.keys():
                continue
            for loop in value["loops"]:
                first_motif = loop["first"]
                last_motif = loop["last"]
                loop_size = {
                    (f["genome"], f["chromosome"]):
                        (f["start"], f["end"], l["start"], l["end"])
                    for f, l in zip(first_motif, last_motif)
                }
                for k0, v0 in loop_size.items():
                    loop_strand = '+' if v0[0] < v0[-1] else '-'
                    loop_attribute = "loop_id \"" + str(loop_index) + "\";"
                    loops_dict.setdefault(k0[0], []).append([
                        k0[1], "MoSyn", "loop_like", min(v0), max(v0), ".",
                        loop_strand, ".", loop_attribute
                    ])
                # one id per loop, shared by all genomes of that loop
                loop_index += 1

        relative = [genus, alignment, score, pwm]
        loop_outdir = outfolder + "/".join(["loop"] + relative)
        loop_outdir = check_folder_path(loop_outdir, True)
        synteny_outdir = outfolder + "/".join(["synteny"] + relative)
        synteny_outdir = check_folder_path(synteny_outdir, True)

        # Context managers close each GTF file even if a write fails
        for target_dir, rows_by_genome in ((loop_outdir, loops_dict),
                                           (synteny_outdir, synteny_dict)):
            for genome, rows in rows_by_genome.items():
                with open(target_dir + genome + ".gtf", 'w') as fout:
                    for row in rows:
                        print("\t".join([str(cell) for cell in row]),
                              file=fout)
def _align_to_segments(gc_keys, records, label_key):
    """
    Build one output cell per (genome, chromosome) key

    :param gc_keys: Ordered (genome, chromosome) tuples
    :param records: Dictionaries with ``genome``, ``chromosome``, ``strand``
        and ``label_key`` entries; if several match a key the last one wins
    :param label_key: Entry used as label, e.g. ``gene`` or ``motif``
    :return: Tuple (cells, names): cells holds label + strand or ``-`` for
        each key, names holds the labels of the matched records
    """
    cells = []
    names = []
    for gk in gc_keys:
        match = None
        for record in records:
            if (record["genome"], record["chromosome"]) == gk:
                match = record
        if match:
            names.append(match[label_key])
            cells.append(match[label_key] + match["strand"])
        else:
            cells.append("-")
    return cells, names


def generate_loop_summary(infolder, outfolder, genus_index=-5,
                          alignment_index=-4, score_index=-3, pwm_index=-2):
    """
    Generate Loop summary

    Writes ``loop_detail_summary.csv`` (one row per loop) and
    ``loop_short_summary.csv`` (one row per ``loops.yaml`` that contains a
    ``loops`` entry) into ``outfolder``, plus ``genes_and_motifs.txt``,
    ``genes.txt`` and ``motifs.txt`` below
    ``<outfolder>/<genus>/<alignment>/<score>/<pwm>/``. Averages whose
    denominator is zero are reported as 0.

    :param infolder: Input folder containing result
    :param outfolder: Output folder
    :param genus_index: Genus index folder relative to result file
    :param alignment_index: Alignment index folder relative to result file
    :param score_index: Score index folder relative to result file
    :param pwm_index: PWM index folder relative to result file
    :return:
    """
    outfolder = check_folder_path(outfolder, True)
    outfile = outfolder + "loop_detail_summary.csv"
    outfile0 = outfolder + "loop_short_summary.csv"
    # Context managers close every output even if one input is malformed.
    with open(outfile, 'w') as fout, open(outfile0, 'w') as fout0:
        print("Genus", "Alignment", "Score", "PWM", "Loop_ID",
              "Number_of_Segments", "Number_of_Genes", "Number_of_CTCF",
              "Total_Length", sep=",", file=fout)
        print("Genus", "Alignment", "Score", "PWM", "Number_of_Loops",
              "Number_of_Genes_in_Loops", "Number_of_Motifs_in_Loops",
              "Average_Number_of_Genes_per_Loops_per_Species",
              "Average_Length_per_Loops_per_Species",
              "Average_Number_of_Motifs_per_Loops_per_Species",
              sep=",", file=fout0)
        infolder = check_folder_path(infolder)
        for loop_file in glob.glob(infolder + '**/loops.yaml',
                                   recursive=True):
            path_elem = loop_file.split('/')
            genus = path_elem[genus_index]
            alignment = path_elem[alignment_index]
            score = path_elem[score_index]
            pwm = path_elem[pwm_index]
            with open(loop_file, 'r') as stream:
                # PyYAML >= 6 requires an explicit Loader.
                # NOTE(review): FullLoader still builds some Python-specific
                # objects; only feed it files written by this pipeline.
                iadhore_dict_with_loops = yaml.load(
                    stream, Loader=yaml.FullLoader) or {}

            # Files without any "loops" entry produce no output at all
            if not any("loops" in value.keys()
                       for value in iadhore_dict_with_loops.values()):
                continue

            this_outdir = outfolder + "/".join(
                [genus, alignment, score, pwm])
            this_outdir = check_folder_path(this_outdir, True)
            genes_and_motifs = this_outdir + "genes_and_motifs.txt"
            genes_only = this_outdir + "genes.txt"
            motifs_only = this_outdir + "motifs.txt"

            overall_length = 0
            overall_genes = 0
            num_of_loops = 0
            overall_motifs = 0
            set_of_genes = set()
            set_of_motifs = set()
            loop_index = 1
            with open(genes_and_motifs, 'w') as f_all, \
                    open(genes_only, 'w') as f_gene, \
                    open(motifs_only, 'w') as f_mot:
                for key, value in sorted(iadhore_dict_with_loops.items()):
                    if "loops" not in value.keys():
                        continue
                    position_keys = sorted(
                        [k for k in value.keys() if k != "loops"])
                    for loop in value["loops"]:
                        num_of_loops += 1
                        first_motif = loop["first"]
                        last_motif = loop["last"]
                        first_pos = loop["first_pos"]
                        last_pos = loop["last_pos"]
                        gc_keys = sorted([(m["genome"], m["chromosome"])
                                          for m in first_motif])
                        num_of_segments = len(gc_keys)
                        genome_string = ",".join(
                            [str(g[0]) for g in gc_keys])
                        chromosome_string = ",".join(
                            [str(g[-1]) for g in gc_keys])
                        # identical header line in all three text files
                        for handle in (f_all, f_gene, f_mot):
                            print("#loop_id=" + str(loop_index) + ";",
                                  "genome=" + genome_string + ";",
                                  "chromosome=" + chromosome_string + ";",
                                  file=handle)
                        loop_loc = [(f["start"], f["end"],
                                     l["start"], l["end"])
                                    for f, l in zip(first_motif, last_motif)]
                        loop_length = sum(
                            [abs(max(l) - min(l)) for l in loop_loc])

                        # check_start is True between the first and the last
                        # motif pair of the loop
                        check_start = False
                        loop_genes = 0
                        loop_motifs = 0
                        for pk in position_keys:
                            if not first_pos <= pk <= last_pos:
                                continue
                            val = value[pk]
                            # genes of a position are handled before its
                            # motifs
                            gene_records = [val[s] for s in val.keys()
                                            if s != "motifs"]
                            gene_cells, gene_names = _align_to_segments(
                                gc_keys, gene_records, "gene")
                            if gene_cells and check_start:
                                loop_genes += len(gene_names)
                                set_of_genes.update(gene_names)
                                print("\t".join(gene_cells), file=f_all)
                                print("\t".join(gene_cells), file=f_gene)
                            if "motifs" not in val.keys():
                                continue
                            for pair in val["motifs"]:
                                if pair == first_motif:
                                    check_start = True
                                motif_cells, motif_names = \
                                    _align_to_segments(gc_keys, pair, "motif")
                                if motif_cells and check_start:
                                    loop_motifs += len(motif_names)
                                    set_of_motifs.update(motif_names)
                                    print("\t".join(motif_cells), file=f_all)
                                    print("\t".join(motif_cells), file=f_mot)
                                if pair == last_motif:
                                    check_start = False

                        # A loop without segments contributes 0 instead of
                        # dividing by zero.
                        if num_of_segments:
                            overall_length += loop_length / num_of_segments
                            overall_genes += loop_genes / num_of_segments
                            overall_motifs += loop_motifs / num_of_segments
                        print(genus, alignment, score, pwm, loop_index,
                              num_of_segments, loop_genes, loop_motifs,
                              loop_length, sep=",", file=fout)
                        loop_index += 1

            nr_genes = len(set_of_genes)
            nr_motifs = len(set_of_motifs)
            # "loops" entries may all be empty lists: report zeros then
            avg_synteny_length = 0
            avg_synteny_genes = 0
            avg_synteny_motifs = 0
            if num_of_loops:
                avg_synteny_length = overall_length / num_of_loops
                avg_synteny_genes = overall_genes / num_of_loops
                avg_synteny_motifs = overall_motifs / num_of_loops
            print(genus, alignment, score, pwm, num_of_loops, nr_genes,
                  nr_motifs, avg_synteny_genes, avg_synteny_length,
                  avg_synteny_motifs, sep=",", file=fout0)
def orthofinder(args):
    """
    Forward parsed arguments to ``co.run_orthofinder``

    :param args: Object exposing ``input``, ``output``, ``python2`` and
        ``opt`` attributes
    :return:
    """
    proteome_dir = check_folder_path(args.input)
    result_dir = check_folder_path(args.output)
    co.run_orthofinder(proteome_dir, result_dir, python2env=args.python2,
                       orthofinder_options=args.opt)