Ejemplo n.º 1
0
def gtf_to_iadhore_list(infile, outfolder):
    """
    Convert GTF file to i-ADHoRe list

    Every record whose feature is ``gene`` is collected, grouped by
    chromosome and sorted by start coordinate. One ``<chromosome>.lst`` file
    is written per chromosome; each line is the gene ID immediately followed
    by the strand. Output paths are built by string concatenation on the
    values returned by ``check_folder_path``, inside a sub-folder named after
    the part of the input file name that precedes the first dot.

    :param infile: GTF file
    :param outfolder: Folder that receives the sub-folder of i-ADHoRe lists
    :return: None
    """

    outfolder = check_folder_path(outfolder, True)
    gtf_dict = gtf_to_dict(infile)
    suboutfolder = outfolder + os.path.basename(infile).split('.')[0]
    suboutfolder = check_folder_path(suboutfolder, True)

    # One (gene, strand, chromosome, start, end) record per gene feature
    gene_location_list = [
        (value['attribute']['gene_id'], value['strand'], key,
         int(value['start']), int(value['end']))
        for key, values in gtf_dict.items()
        for value in values
        if value['feature'] == 'gene'
    ]
    labels = ['gene', 'strand', 'chromosome', 'start', 'end']
    df = pd.DataFrame.from_records(gene_location_list, columns=labels)

    # Group by a scalar key: with a one-element list, recent pandas versions
    # hand back 1-tuples as group names, which cannot be appended to a path.
    for chromosome, group in df.groupby('chromosome'):
        outfile = suboutfolder + chromosome + '.lst'
        group_sort = group.sort_values(['start'])

        # The context manager closes the list even if a write fails
        with open(outfile, 'w') as fout:
            for gene, strand in zip(group_sort['gene'], group_sort['strand']):
                print(gene + strand, file=fout)
Ejemplo n.º 2
0
def storm(args):
    """
    Run CREAD STORM with the settings carried by ``args``.

    The ``input``, ``pwm`` and ``output`` attributes are passed through
    ``check_folder_path``; ``opt`` and ``bcomp`` are forwarded unchanged to
    ``cs.run_storm``.

    :param args: Object exposing ``input``, ``pwm``, ``output``, ``opt`` and
                 ``bcomp`` attributes
    :return: None
    """
    genome_dir = check_folder_path(args.input)
    matrix_dir = check_folder_path(args.pwm)
    result_dir = check_folder_path(args.output)

    extra_options, with_base_comp = args.opt, args.bcomp

    cs.run_storm(genome_dir, matrix_dir, result_dir, extra_options,
                 with_base_comp)
Ejemplo n.º 3
0
def run_iadhore(orthogroups_file,
                gtf_folder,
                id_conversion_folder,
                iadhore_parameter_file,
                outfolder,
                protein_column=0,
                gene_column=1,
                column_sep="\t"):
    """
    Prepare the i-ADHoRe inputs and run i-ADHoRe

    All intermediate files live in ``<outfolder>working_directory/``. The
    helpers are called in this order: ``orthogroups_protein_to_gene``,
    ``orthogroups_to_iadhore_family_file``, ``gtf_to_iadhore_list_folder``,
    ``iadhore_list_family_filtering``, ``create_iadhore_config``; finally the
    ``i-adhore`` executable is invoked on the generated configuration file.

    :param orthogroups_file: Orthogroups file
    :param gtf_folder: GTF folder
    :param id_conversion_folder: ID conversion folder
    :param iadhore_parameter_file: i-ADHoRe parameter file
    :param outfolder: i-ADHoRe output folder
    :param protein_column: Protein column
    :param gene_column: Gene column
    :param column_sep: Column separator
    :return: None
    """

    outfolder = check_folder_path(outfolder, True)
    work_dir = check_folder_path(outfolder + 'working_directory/', True)

    # Orthogroups file rewritten by orthogroups_protein_to_gene
    gene_orthogroups = work_dir + 'Orthogroups.txt'
    orthogroups_protein_to_gene(orthogroups_file, id_conversion_folder,
                                gene_orthogroups, protein_column, gene_column,
                                column_sep)

    family_file = work_dir + 'iadhore_family.tsv'
    orthogroups_to_iadhore_family_file(gene_orthogroups, family_file)

    # Unfiltered gene lists, only needed until the filtering step is done
    raw_lists = check_folder_path(work_dir + 'temporary_genes_list/', True)
    gtf_to_iadhore_list_folder(gtf_folder, raw_lists)

    kept_lists = check_folder_path(work_dir + 'genes_list', True)
    iadhore_list_family_filtering(raw_lists, family_file, kept_lists)

    subprocess.call(['rm', '-r', raw_lists])

    config_file = work_dir + 'iadhore_config.ini'
    create_iadhore_config(kept_lists, family_file, iadhore_parameter_file,
                          outfolder, config_file)

    # Launch the external i-ADHoRe executable
    subprocess.call(['i-adhore', config_file])
Ejemplo n.º 4
0
def gtf_to_iadhore_list_folder(infolder, outfolder):
    """
    Convert every entry of a GTF folder to i-ADHoRe lists

    Each path matched by ``infolder + '*'`` is handed to
    ``gtf_to_iadhore_list`` together with the output folder.

    :param infolder: GTF folder
    :param outfolder: i-ADHoRe list folder
    :return: None
    """

    source_dir = check_folder_path(infolder)
    target_dir = check_folder_path(outfolder, True)

    for gtf_path in glob.glob(source_dir + '*'):
        gtf_to_iadhore_list(gtf_path, target_dir)
Ejemplo n.º 5
0
def run_mosyn(iadhore_output_folder,
              storm_output_folder,
              gtf_folder,
              outfolder,
              window=0.1,
              complete=True,
              binding_site_id="BS",
              id_start_index=0,
              motif_name="MOTIF"):
    """
    Run MoSyn

    The STORM output is first converted to GTF inside
    ``<outfolder>working_directory/motifs_gtf/``. The i-ADHoRe result is then
    loaded and passed through the location, motif, restructuring and
    alignment helpers, and the final structure is dumped to
    ``<outfolder>synteny.txt`` and ``<outfolder>synteny.yaml``.

    :param iadhore_output_folder: i-ADHoRe output folder
    :param storm_output_folder: CREAD STORM output folder
    :param gtf_folder: GTF folder
    :param outfolder: MoSyn result folder
    :param window: Window size of alignment
    :param complete: Complete alignment; when true the ``get_complete_*``
                     helpers are applied before and after the alignment
    :param binding_site_id: The binding site id
    :param id_start_index: The binding site id start index
    :param motif_name: The name of the motif
    :return: None
    """

    outfolder = check_folder_path(outfolder, True)
    work_dir = check_folder_path(outfolder + 'working_directory/', True)

    motif_gtf_dir = check_folder_path(work_dir + 'motifs_gtf/', True)
    ps.transfac_to_gtf_folder(storm_output_folder, motif_gtf_dir,
                              binding_site_id, id_start_index, motif_name)

    synteny = pi.iadhore_result_folder_to_dict(iadhore_output_folder)
    if complete:
        synteny = pi.get_complete_synteny_dict(synteny)

    # Each helper consumes the structure produced by the previous one
    synteny = add_location_to_iadhore_synteny(synteny, gtf_folder)
    synteny = add_motifs_into_synteny(synteny, motif_gtf_dir)
    synteny = restructure_iadhore_dict_to_position(synteny)

    aligned = align_motifs_in_synteny(synteny, window)
    if complete:
        aligned = get_complete_motifs_synteny(aligned)

    dump_aligned_motifs_to_flat_synteny(aligned, outfolder + "synteny.txt")
    dump_aligned_motifs_to_serial(aligned, outfolder + "synteny.yaml")
Ejemplo n.º 6
0
def create_storm_bash(script_file, genome_dir, pwm_dir, outdir, bash_outfile,
                      min_threshold, max_threshold, increment):
    """
    Write a bash script with one ``storm`` invocation per threshold

    For every ``i`` in ``range(min_threshold, max_threshold + 1, increment)``
    one line is written: ``python <script_file> storm --input <genome_dir>
    --pwm <pwm_dir> --output <outdir><i> --opt '-t <i>' --bcomp``.

    :param script_file: Script placed after ``python`` on each line
    :param genome_dir: Value written after ``--input``
    :param pwm_dir: Value written after ``--pwm``
    :param outdir: Base output folder; the threshold is appended to it
    :param bash_outfile: Bash script to create (overwritten if it exists)
    :param min_threshold: First threshold (integer)
    :param max_threshold: Last threshold, inclusive (integer)
    :param increment: Step between thresholds (integer)
    :return: None
    """

    outdir = check_folder_path(outdir)

    # The context manager closes the script even if a write fails
    with open(bash_outfile, 'w') as fout:
        for i in range(min_threshold, max_threshold + 1, increment):

            # Single quotes keep "-t <i>" together as one shell word
            storm_opt = '\'-t ' + str(i) + '\''
            sub_outdir = outdir + str(i)

            print('python',
                  script_file,
                  'storm',
                  '--input',
                  genome_dir,
                  '--pwm',
                  pwm_dir,
                  '--output',
                  sub_outdir,
                  '--opt',
                  storm_opt,
                  '--bcomp',
                  file=fout)
Ejemplo n.º 7
0
def clean_header_fasta_folder(infolder, outfolder):
    """
    Clean the FASTA headers of every entry in a folder

    Each path matched by ``infolder + '*'`` is passed to
    ``clean_header_fasta``; the result keeps the same base name inside the
    output folder.

    :param infolder: FASTA folder
    :param outfolder: FASTA with clean header folder
    :return: None
    """

    source_dir = check_folder_path(infolder)
    target_dir = check_folder_path(outfolder, True)

    for fasta_path in glob.glob(source_dir + '*'):
        clean_header_fasta(fasta_path,
                           target_dir + os.path.basename(fasta_path))
Ejemplo n.º 8
0
def iadhore_result_folder_to_dict(infolder):
    """
    Load an i-ADHoRe output folder into one nested Python dictionary

    ``multiplicons.txt``, ``segments.txt`` and ``list_elements.txt`` are
    parsed with ``iadhore_result_file_to_dict``. The elements are merged into
    the segments, and the segments into the multiplicons, with
    ``merge_iadhore_file_dict``.

    :param infolder: i-ADHoRe output folder
    :return: Multiplicons dictionary with segments and elements merged in
    """

    result_dir = check_folder_path(infolder)

    # Parse the three result tables in a fixed order
    multiplicons, segments, elements = [
        iadhore_result_file_to_dict(result_dir + table_name)
        for table_name in ('multiplicons.txt', 'segments.txt',
                           'list_elements.txt')
    ]

    # Elements go into their segment, then segments into their multiplicon
    segments = merge_iadhore_file_dict(segments, elements, "segment", "elements")
    return merge_iadhore_file_dict(multiplicons, segments, "multiplicon", "segments")
Ejemplo n.º 9
0
def gtf_to_json_folder(infolder, outfolder):
    """
    Convert every GTF entry of a folder to JSON

    The output name is the part of the input base name that precedes the
    first dot, with ``.json`` appended.

    :param infolder: GTF folder
    :param outfolder: JSON folder
    :return: None
    """

    source_dir = check_folder_path(infolder)
    target_dir = check_folder_path(outfolder, True)

    for gtf_path in glob.glob(source_dir + '*'):
        stem = os.path.basename(gtf_path).partition(".")[0]
        gtf_to_json(gtf_path, target_dir + stem + ".json")
Ejemplo n.º 10
0
def fasta_to_conversion_folder(infolder, outfolder, db_name):
    """
    Convert every FASTA entry of a folder to an ID Conversion file
    (Protein ID -> Gene ID)

    The output name is the part of the input base name that precedes the
    first dot, with ``.tsv`` appended.

    :param infolder: FASTA folder
    :param outfolder: ID Conversion folder
    :param db_name: DataBase source, e.g., FlyBase, WormBase
    :return: None
    """

    source_dir = check_folder_path(infolder)
    target_dir = check_folder_path(outfolder, True)

    for fasta_path in glob.glob(source_dir + '*'):
        stem = os.path.basename(fasta_path).partition(".")[0]
        fasta_to_conversion_file(fasta_path, target_dir + stem + ".tsv",
                                 db_name)
Ejemplo n.º 11
0
def fasta_to_json_folder(infolder, outfolder, db_name):
    """
    Convert every FASTA entry of a folder to JSON

    The output name is the part of the input base name that precedes the
    first dot, with ``.json`` appended.

    :param infolder: FASTA folder
    :param outfolder: JSON folder
    :param db_name: DataBase source, e.g., FlyBase, WormBase
    :return: None
    """

    source_dir = check_folder_path(infolder)
    target_dir = check_folder_path(outfolder, True)

    for fasta_path in glob.glob(source_dir + '*'):
        stem = os.path.basename(fasta_path).partition(".")[0]
        fasta_to_json(fasta_path, target_dir + stem + ".json", db_name)
Ejemplo n.º 12
0
def create_loop_bash(result_folder,
                     output_folder,
                     pwm_names,
                     list_genus,
                     list_alignment,
                     range_score,
                     script_file,
                     bash_outfile,
                     min_length=80000,
                     max_length=800000):
    """
    Write a bash script with one ``idloop`` invocation per parameter combination

    For every (genus, alignment, score, pwm) combination one line is written:
    ``python <script_file> idloop --input <input> --output <outdir>
    --min <min_length> --max <max_length>``. ``<input>`` is the
    ``synteny.yaml`` below ``result_folder`` and ``<outdir>`` the matching
    folder below ``output_folder``; both use the relative path
    ``genus/alignment/score/pwm``.

    :param result_folder: Base folder of the inputs
    :param output_folder: Base folder of the outputs
    :param pwm_names: PWM names (innermost loop)
    :param list_genus: Genus names (outermost loop)
    :param list_alignment: Alignment names
    :param range_score: Scores; each one is converted with ``str``
    :param script_file: Script placed after ``python`` on each line
    :param bash_outfile: Bash script to create (overwritten if it exists)
    :param min_length: Value written after ``--min``
    :param max_length: Value written after ``--max``
    :return: None
    """

    result_folder = check_folder_path(result_folder)
    output_folder = check_folder_path(output_folder)

    # The context manager closes the script even if a write fails
    with open(bash_outfile, 'w') as fout:

        for genus in list_genus:
            for alignment in list_alignment:
                for score in range_score:
                    for pwm in pwm_names:

                        # Same relative layout on the input and output side
                        relative = "/".join(
                            [genus, alignment, str(score), pwm])

                        this_input = check_folder_path(
                            result_folder + relative)
                        this_input += "synteny.yaml"

                        this_outdir = check_folder_path(
                            output_folder + relative)

                        print("python",
                              script_file,
                              "idloop",
                              "--input",
                              this_input,
                              "--output",
                              this_outdir,
                              "--min",
                              min_length,
                              "--max",
                              max_length,
                              file=fout)
Ejemplo n.º 13
0
def iadhore_list_family_filtering(iadhore_genes_list, iadhore_family_file, outfolder):
    """
    Select only genes that are in the family file

    Every file matched by ``iadhore_genes_list + '**/*'`` is read once. The
    gene ID of a line is the stripped line without its last character
    (presumably the strand sign appended when the lists were written). Lines
    whose gene ID is a key of the family dictionary are copied, unchanged
    and in order, to a file of the same name inside
    ``<outfolder><parent folder name>``. A file without any such line
    produces no output.

    :param iadhore_genes_list: i-ADHoRe list
    :param iadhore_family_file: i-ADHoRe family file
    :param outfolder: Output folder
    :return: None
    """

    iadhore_genes_list = check_folder_path(iadhore_genes_list)
    outfolder = check_folder_path(outfolder, True)

    family_genes = set(iadhore_family_to_dict(iadhore_family_file).keys())

    for p in glob.glob(iadhore_genes_list + '**/*'):
        if not os.path.isfile(p):
            continue

        # Read the list once; the handle is released as soon as we are done
        with open(p, 'r') as fin:
            lines = fin.readlines()

        kept_lines = [
            line for line in lines if line.strip()[:-1] in family_genes
        ]

        # Nothing to keep: do not create an empty output file or folder
        if not kept_lines:
            continue

        sub_outfolder = outfolder + os.path.split(os.path.dirname(p))[-1]
        sub_outfolder = check_folder_path(sub_outfolder, True)

        with open(sub_outfolder + os.path.basename(p), 'w') as fout:
            fout.writelines(kept_lines)
Ejemplo n.º 14
0
def run_orthofinder(infolder, outfolder, python2env, orthofinder_options=None):
    """
    Run OrthoFinder

    The FASTA headers are cleaned into
    ``<outfolder>working_directory/cleaned_fasta/``. A bash script that
    activates ``python2env`` and calls ``orthofinder -f`` on that folder is
    written and executed. Afterwards the content of every directory found
    directly inside the cleaned FASTA folder is moved to ``outfolder`` and
    the directory itself is removed.

    :param infolder: Input folder
    :param outfolder: Output folder
    :param python2env: Python 2 environment
    :param orthofinder_options: Other options for OrthoFinder, as one
                                space-separated string
    :return: None
    """
    outfolder = check_folder_path(outfolder, True)

    working_directory = outfolder + 'working_directory/'
    working_directory = check_folder_path(working_directory, True)

    cleaned_fasta = working_directory + 'cleaned_fasta/'
    cleaned_fasta = check_folder_path(cleaned_fasta, True)

    pf.clean_header_fasta_folder(infolder, cleaned_fasta)

    list_command = ['orthofinder', '-f', cleaned_fasta]
    if orthofinder_options:
        list_command += orthofinder_options.split(' ')

    orthofinder_script = working_directory + 'orthofinder.bash'

    # The script must be closed (and flushed) before bash reads it
    with open(orthofinder_script, 'w') as fout:
        print('source', 'activate', python2env, file=fout)
        print(' '.join(list_command), file=fout)

    subprocess.call(['bash', orthofinder_script])

    # Move the content of each result directory up into outfolder
    for p in glob.glob(cleaned_fasta + '*'):
        if os.path.isdir(p):

            p = check_folder_path(p)

            for r in glob.glob(p + '*'):
                subprocess.call(['mv', r, outfolder])

            subprocess.call(['rm', '-r', p])
Ejemplo n.º 15
0
def transfac_to_gtf_folder(infolder, outfolder, binding_site_id="BS", id_start_index=0, motif_name="MOTIF"):
    """
    Convert a TRANSFAC folder / STORM output to GTF

    Entries are processed in sorted path order. The value returned by
    ``transfac_to_gtf`` for one entry is used as the start index of the next
    one. The output name is the part of the input base name that precedes
    the first dot, with ``.gtf`` appended.

    :param infolder: TRANSFAC / STORM Output folder
    :param outfolder: GTF folder
    :param binding_site_id: The ID of the binding sites, e.g., BS. It will be BS0, BS1, .., BSx in the output
    :param id_start_index: the start index of the motif ID, e.g., 0,1,2,...,x
    :param motif_name: The motif name, e.g., CTCF.
    :return: None
    """

    source_dir = check_folder_path(infolder)
    target_dir = check_folder_path(outfolder, True)

    next_index = id_start_index
    for transfac_path in sorted(glob.glob(source_dir + '*')):
        stem = os.path.basename(transfac_path).partition(".")[0]
        next_index = transfac_to_gtf(transfac_path, target_dir + stem + ".gtf",
                                     binding_site_id, next_index, motif_name,
                                     True)
Ejemplo n.º 16
0
def id_conversion_folder_to_dict(infolder, protein_column=0, gene_column=1, column_sep="\t"):
    """
    Merge every ID conversion file of a folder into one dictionary

    Each entry of the folder is parsed with ``id_conversion_file_to_dict``.
    When the same key occurs in several files, the file visited last wins.

    :param infolder: ID conversion folder
    :param protein_column: Protein column
    :param gene_column: Gene column
    :param column_sep: Column separator
    :return: Merged dictionary
    """

    merged = {}

    source_dir = check_folder_path(infolder)

    for conversion_path in glob.glob(source_dir + '*'):
        merged.update(
            id_conversion_file_to_dict(conversion_path, protein_column,
                                       gene_column, column_sep))

    return merged
Ejemplo n.º 17
0
def idloop(args):
    """
    Identify loops in a synteny file and write them as CSV and YAML

    The loops returned by ``alo.identify_loops_in_synteny`` are written to
    ``loops.csv`` and ``loops.yaml`` inside the output folder.

    :param args: Object exposing ``input``, ``output``, ``min`` and ``max``
                 attributes
    :return: None
    """

    result_dir = check_folder_path(args.output, True)
    csv_path = result_dir + "loops.csv"
    yaml_path = result_dir + "loops.yaml"

    loops = alo.identify_loops_in_synteny(args.input, args.min, args.max)
    alo.print_loops_to_csv(loops, csv_path)

    with open(yaml_path, 'w') as yaml_out:
        yaml.dump(loops, yaml_out)
Ejemplo n.º 18
0
def generate_orthofinder_summary(infolder, genus_index=-3, alignment_index=-2):
    """
    Generate OrthoFinder summary

    Every ``Statistics_Overall.csv`` found recursively under ``infolder``
    contributes one row to ``orthofinder_summary.csv``, which is created in
    the current working directory. Genus and alignment are taken from the
    components of the ``/``-separated file path at the given indices. The
    numeric columns are the last tab-separated field of the lines whose
    first field equals one of the summary keywords, in keyword order.

    :param infolder: Input folder containing result
    :param genus_index: Genus index folder relative to result file
    :param alignment_index: Alignment index folder relative to result file
    :return: None
    """

    outfile = "orthofinder_summary.csv"

    summary_keywords = [
        "Number of genes", "Number of genes in orthogroups",
        "Number of unassigned genes", "Number of orthogroups"
    ]

    # The context manager closes the summary even if a result file is malformed
    with open(outfile, 'w') as fout:

        print("Genus",
              "Alignment",
              "Number_of_Genes",
              "Number_of_Genes_in_Orthogroups",
              "Number_of_Unassigned_Genes",
              "Number_of_Orthogroups",
              sep=",",
              file=fout)

        infolder = check_folder_path(infolder)
        for summary in glob.glob(infolder + '**/Statistics_Overall.csv',
                                 recursive=True):

            path_elem = summary.split('/')
            genus = path_elem[genus_index]
            alignment = path_elem[alignment_index]

            # Split every line once instead of once per keyword
            with open(summary, 'r') as fin:
                rows = [line.strip().split('\t') for line in fin]

            summary_values = [
                int(row[-1])
                for keyword in summary_keywords
                for row in rows
                if row[0] == keyword
            ]

            summary_write = [str(s) for s in summary_values]
            print(genus, alignment, ",".join(summary_write), sep=",",
                  file=fout)
Ejemplo n.º 19
0
def create_mosyn_bash(result_folder, material_folder, output_folder, pwm_names,
                      list_genus, list_alignment, range_score, script_file,
                      bash_outfile):
    """
    Write a bash script with one ``mosyn`` invocation per parameter combination

    For every (genus, alignment, score, pwm) combination one line is written
    that calls ``python <script_file> mosyn`` with ``--iadhore``,
    ``--storm``, ``--gtf`` and ``--output`` folders derived from the base
    folders, followed by the fixed options
    ``--complete --mid CTCF --idx 0 --name CTCF``.

    :param result_folder: Base folder holding ``IADHORE/`` and ``STORM/``
    :param material_folder: Base folder holding ``<genus>/GTF``
    :param output_folder: Base folder of the outputs
    :param pwm_names: PWM names (innermost loop)
    :param list_genus: Genus names (outermost loop)
    :param list_alignment: Alignment names
    :param range_score: Scores; each one is converted with ``str``
    :param script_file: Script placed after ``python`` on each line
    :param bash_outfile: Bash script to create (overwritten if it exists)
    :return: None
    """

    result_folder = check_folder_path(result_folder)
    material_folder = check_folder_path(material_folder)
    output_folder = check_folder_path(output_folder)

    # The context manager closes the script even if a write fails
    with open(bash_outfile, 'w') as fout:

        for genus in list_genus:
            for alignment in list_alignment:
                for score in range_score:
                    for pwm in pwm_names:

                        iadhore_output = result_folder + "/".join(
                            ["IADHORE", genus, alignment])
                        iadhore_output = check_folder_path(iadhore_output)

                        storm_output = result_folder + "/".join(
                            ["STORM", genus, str(score), pwm])
                        storm_output = check_folder_path(storm_output)

                        gtf_folder = material_folder + "/".join(
                            [genus, "GTF"])
                        gtf_folder = check_folder_path(gtf_folder)

                        this_outdir = output_folder + "/".join(
                            [genus, alignment, str(score), pwm])
                        this_outdir = check_folder_path(this_outdir)

                        print("python",
                              script_file,
                              "mosyn",
                              "--iadhore",
                              iadhore_output,
                              "--storm",
                              storm_output,
                              "--gtf",
                              gtf_folder,
                              "--output",
                              this_outdir,
                              "--complete",
                              "--mid",
                              "CTCF",
                              "--idx",
                              0,
                              "--name",
                              "CTCF",
                              file=fout)
Ejemplo n.º 20
0
def gtf_to_dict_folder(infolder):
    """
    Convert every GTF entry of a folder to a dictionary

    The result maps the part of each base name that precedes the first dot
    to the value returned by ``gtf_to_dict`` for that entry.

    :param infolder: GTF folder
    :return: Python dictionary
    """

    source_dir = check_folder_path(infolder)

    return {
        os.path.basename(gtf_path).partition(".")[0]: gtf_to_dict(gtf_path)
        for gtf_path in glob.glob(source_dir + '*')
    }
Ejemplo n.º 21
0
def runall(args):
    """
    Run the whole pipeline: OrthoFinder, i-ADHoRe, STORM, then MoSyn per PWM

    Intermediate results go to ``<output>working_directory/``. The MoSyn
    result of each PWM goes to a folder of ``<output>`` named after the part
    of the PWM base name that precedes the first dot.

    :param args: Object exposing the attributes read below (``output``,
                 ``proteome``, ``python2``, ``ofopt``, ``gtf``, ``idconv``,
                 ``param``, ``pcol``, ``gcol``, ``csep``, ``genome``,
                 ``pwm``, ``stopt``, ``bcomp``, ``window``, ``complete``,
                 ``mid``, ``idx``, ``name``)
    :return: None
    """

    outfolder = check_folder_path(args.output, True)
    work_dir = check_folder_path(outfolder + "working_directory/", True)

    # Step 1: OrthoFinder
    proteome = args.proteome
    orthofinder_dir = check_folder_path(work_dir + "orthofinder_output/", True)

    co.run_orthofinder(proteome, orthofinder_dir,
                       python2env=args.python2, orthofinder_options=args.ofopt)

    # Step 2: i-ADHoRe, fed with the orthogroups produced by step 1
    gtf_folder = args.gtf
    id_conversion_folder = args.idconv
    parameter_file = args.param
    orthogroups_file = orthofinder_dir + "Orthogroups.txt"
    iadhore_dir = check_folder_path(work_dir + "iadhore_output/", True)

    ci.run_iadhore(orthogroups_file, gtf_folder, id_conversion_folder, parameter_file, iadhore_dir,
                   args.pcol, args.gcol, args.csep)

    # Step 3: STORM
    genome_folder = args.genome
    pwm_folder = check_folder_path(args.pwm)
    storm_dir = check_folder_path(work_dir + "storm_output/", True)

    cs.run_storm(genome_folder, pwm_folder, storm_dir,
                 storm_options=args.stopt, calculate_base_comp=args.bcomp)

    # Step 4: MoSyn, once per PWM
    for pwm in glob.glob(pwm_folder + '*'):
        pwm_stem = os.path.basename(pwm).split('.')[0]

        pwm_storm_dir = check_folder_path(storm_dir + pwm_stem)
        pwm_out_dir = check_folder_path(outfolder + pwm_stem, True)

        cm.run_mosyn(iadhore_dir, pwm_storm_dir, gtf_folder, pwm_out_dir,
                     args.window, args.complete, args.mid, args.idx, args.name)
Ejemplo n.º 22
0
def run_storm(infolder,
              pwm_folder,
              outfolder,
              storm_options=None,
              calculate_base_comp=False):
    """
    Run CREAD STORM

    The genome FASTA headers are cleaned into
    ``<outfolder>working_directory/cleaned_fasta/``. ``storm`` is then called
    once per (genome, PWM) pair; its output is written to
    ``<outfolder><pwm stem>/<genome stem>.storm``, where a stem is the part
    of a base name that precedes the first dot.

    :param infolder: Folder containing genome files in .fasta format
    :param pwm_folder: Folder containing position weight matrices files in TRANSFAC format
    :param outfolder: Output folder
    :param storm_options: Extra STORM options as one space-separated string
    :param calculate_base_comp: Calculate base composition and pass it to
                                ``storm`` as ``--base-comp=...``
    :return: None
    """

    infolder = check_folder_path(infolder)
    pwm_folder = check_folder_path(pwm_folder)
    outfolder = check_folder_path(outfolder, True)

    work_dir = check_folder_path(outfolder + 'working_directory/', True)
    cleaned_fasta = check_folder_path(work_dir + 'cleaned_fasta/', True)

    pf.clean_header_fasta_folder(infolder, cleaned_fasta)

    for genome_fasta in glob.glob(cleaned_fasta + '*'):

        # The base composition depends on the genome only, not on the PWM
        if calculate_base_comp:
            base_comp_flag = '--base-comp=' + get_base_composition(genome_fasta)
        else:
            base_comp_flag = None

        genome_stem = os.path.basename(genome_fasta).split('.')[0]

        for pwm in glob.glob(pwm_folder + '*'):

            pwm_outfolder = check_folder_path(
                outfolder + os.path.basename(pwm).split('.')[0], True)

            command = ['storm', '-s', genome_fasta, pwm, '-o',
                       pwm_outfolder + genome_stem + '.storm']

            if base_comp_flag:
                command.append(base_comp_flag)

            if storm_options:
                command.extend(storm_options.split(' '))

            subprocess.call(command)
Ejemplo n.º 23
0
def create_iadhore_config(iadhore_genes_list, iadhore_family_file, iadhore_parameter_file, iadhore_result_folder,
                          outfile):
    """
    Create i-ADHoRe configuration file

    The file starts with one ``genome=<name>`` section per sub-folder of the
    genes list, each followed by ``<chromosome> <list file>`` lines. It then
    contains the ``output_path``, ``blast_table`` and ``table_type=family``
    settings, followed by a verbatim copy of the parameter file.

    :param iadhore_genes_list: i-ADHoRe genes list folder; it is used as a
                               glob prefix, so it must end with a path
                               separator
    :param iadhore_family_file: i-ADHoRe family file
    :param iadhore_parameter_file: i-ADHoRe parameter file
    :param iadhore_result_folder: i-ADHoRe result folder
    :param outfile: i-ADHoRe configuration file
    :return: None
    """

    # Group the list files by the name of their parent folder
    chromosome_dict = dict()
    for g in glob.glob(iadhore_genes_list + '**/*'):
        if os.path.isfile(g):
            chromosome_dict.setdefault(g.split('/')[-2], []).append(g)

    # The context manager closes the configuration even if a step below fails
    with open(outfile, 'w') as fout:
        for species, list_files in chromosome_dict.items():
            fout.write('genome=' + species + '\n')
            for list_file in list_files:
                chromosome = os.path.basename(list_file).split('.')[0]
                fout.write(chromosome + ' ' + list_file + '\n')
            fout.write('\n')

        iadhore_result_folder = check_folder_path(iadhore_result_folder, True)

        fout.write('output_path=' + iadhore_result_folder + '\n')
        fout.write('blast_table=' + iadhore_family_file + '\n')
        fout.write('table_type=family\n')

        # Append the user-supplied parameters unchanged
        with open(iadhore_parameter_file, 'r') as fin:
            fout.writelines(fin)
Ejemplo n.º 24
0
def generate_mosyn_summary(infolder,
                           genus_index=-5,
                           alignment_index=-4,
                           score_index=-3,
                           pwm_index=-2):
    """
    Generate MoSyn summary

    Every ``synteny.yaml`` found recursively under ``infolder`` is summarised
    into two CSV files created in the current working directory:
    ``mosyn_detail_summary.csv`` (one row per top-level entry of the YAML
    document) and ``mosyn_short_summary.csv`` (one row per YAML file).
    Genus, alignment, score and PWM are taken from the components of the
    ``/``-separated file path at the given indices.

    An empty YAML document, or an entry that contains no gene, yields zero
    averages instead of raising ``ZeroDivisionError``.

    :param infolder: Input folder containing result
    :param genus_index: Genus index folder relative to result file
    :param alignment_index: Alignment index folder relative to result file
    :param score_index: Score index folder relative to result file
    :param pwm_index: PWM index folder relative to result file
    :return: None
    """

    outfile = "mosyn_detail_summary.csv"
    outfile0 = "mosyn_short_summary.csv"

    # yaml.load() requires an explicit Loader on current PyYAML releases.
    # FullLoader is preferred; releases that predate it fall back to Loader.
    # NOTE(review): neither is meant for untrusted input - only feed this
    # function files written by the pipeline itself.
    yaml_loader = getattr(yaml, "FullLoader", yaml.Loader)

    # Both summaries are closed even if a result file is malformed
    with open(outfile, 'w') as fout, open(outfile0, 'w') as fout0:

        print("Genus",
              "Alignment",
              "Score",
              "PWM",
              "Synteny_ID",
              "Number_of_Segments",
              "Number_of_Genes",
              "Number_of_CTCF",
              "Total_Length",
              sep=",",
              file=fout)

        print("Genus",
              "Alignment",
              "Score",
              "PWM",
              "Number_of_Synteny",
              "Number_of_Genes_in_Synteny",
              "Number_of_Synteny_containing_Motifs",
              "Number_of_Motifs_in_Synteny",
              "Average_Number_of_Genes_per_Synteny_per_Species",
              "Average_Length_per_Synteny_per_Species",
              "Average_Number_of_Motifs_per_Synteny_per_Species",
              sep=",",
              file=fout0)

        infolder = check_folder_path(infolder)
        for synt in glob.glob(infolder + '**/synteny.yaml', recursive=True):

            path_elem = synt.split('/')
            genus = path_elem[genus_index]
            alignment = path_elem[alignment_index]
            score = path_elem[score_index]
            pwm = path_elem[pwm_index]

            with open(synt, 'r') as stream:
                # An empty document loads as None; treat it as "no synteny"
                iadhore_dict_with_motifs = yaml.load(
                    stream, Loader=yaml_loader) or {}

            # Per-file accumulators (sums of the per-entry averages)
            synteny_length = 0
            synteny_genes = 0
            num_of_mult = 0

            synteny_motifs = 0
            synteny_contain = 0

            set_of_genes = set()
            set_of_motifs = set()

            for key, value in sorted(iadhore_dict_with_motifs.items()):

                num_of_mult += 1

                # (genome, chromosome) -> [smallest start, largest end]
                mult_size = dict()

                mult_motifs = 0
                mult_genes = 0

                check_contain = False

                for ke, val in value.items():

                    if ke == "loops":
                        continue

                    for k, v in val.items():

                        if k == "motifs":

                            check_contain = True

                            for pair in v:
                                for motif in pair:
                                    mult_motifs += 1
                                    set_of_motifs.add(motif["motif"])

                        else:
                            mult_genes += 1
                            v_gc = (v["genome"], v["chromosome"])

                            # Widen the span covered on this chromosome
                            span = mult_size.setdefault(
                                v_gc, [v["start"], v["end"]])
                            if span[0] > v["start"]:
                                span[0] = v["start"]
                            if span[-1] < v["end"]:
                                span[-1] = v["end"]

                            set_of_genes.add(v["gene"])

                if check_contain:
                    synteny_contain += 1

                num_of_segments = len(mult_size)
                mult_length = sum(abs(y - x) for x, y in mult_size.values())

                # An entry without genes has no segment: contribute zeros
                if num_of_segments:
                    synteny_length += mult_length / num_of_segments
                    synteny_genes += mult_genes / num_of_segments
                    synteny_motifs += mult_motifs / num_of_segments

                print(genus,
                      alignment,
                      score,
                      pwm,
                      key,
                      num_of_segments,
                      mult_genes,
                      mult_motifs,
                      mult_length,
                      sep=",",
                      file=fout)

            nr_genes = len(set_of_genes)
            nr_motifs = len(set_of_motifs)

            avg_synteny_length = 0
            avg_synteny_genes = 0
            if num_of_mult:
                avg_synteny_length = synteny_length / num_of_mult
                avg_synteny_genes = synteny_genes / num_of_mult

            avg_synteny_motifs = 0
            if synteny_contain:
                avg_synteny_motifs = synteny_motifs / synteny_contain

            print(genus,
                  alignment,
                  score,
                  pwm,
                  num_of_mult,
                  nr_genes,
                  synteny_contain,
                  nr_motifs,
                  avg_synteny_genes,
                  avg_synteny_length,
                  avg_synteny_motifs,
                  sep=",",
                  file=fout0)
Ejemplo n.º 25
0
def _write_gtf_records(outdir, records_by_genome):
    """
    Write one tab-separated GTF file per genome
    :param outdir: Output folder, used as a plain string prefix for file names
    :param records_by_genome: Dict mapping a genome name to a list of GTF rows
    :return:
    """

    for genome, records in records_by_genome.items():

        # The context manager closes the handle even if a row fails to format
        with open(outdir + genome + ".gtf", 'w') as fout:
            for record in records:
                print("\t".join(str(field) for field in record), file=fout)


def generate_loop_synteny_gtf(infolder,
                              outfolder,
                              genus_index=-5,
                              alignment_index=-4,
                              score_index=-3,
                              pwm_index=-2):
    """
    Generate GTF files describing loops and synteny blocks

    Every loops.yaml found recursively below infolder is converted into one
    GTF file per genome, written to
    outfolder/loop/genus/alignment/score/pwm/ (feature "loop_like") and
    outfolder/synteny/genus/alignment/score/pwm/ (feature "synteny").
    :param infolder: Input folder containing result
    :param outfolder: Output folder
    :param genus_index: Genus index folder relative to result file
    :param alignment_index: Alignment index folder relative to result file
    :param score_index: Score index folder relative to result file
    :param pwm_index: PWM index folder relative to result file
    :return:
    """

    infolder = check_folder_path(infolder)
    outfolder = check_folder_path(outfolder, True)

    for loop_file in glob.glob(infolder + '**/loops.yaml', recursive=True):

        # The *_index arguments pick components of the '/'-split file path
        path_elem = loop_file.split('/')
        genus = path_elem[genus_index]
        alignment = path_elem[alignment_index]
        score = path_elem[score_index]
        pwm = path_elem[pwm_index]

        with open(loop_file, 'r') as stream:
            # PyYAML >= 6 rejects yaml.load() without an explicit Loader.
            # NOTE(review): yaml.Loader can construct arbitrary Python
            # objects, so only pipeline-generated files should be read here;
            # switch to yaml.safe_load once loops.yaml is confirmed to hold
            # plain types only.
            # An empty file loads as None; treat it as "no multiplicons".
            iadhore_dict_with_loops = yaml.load(
                stream, Loader=yaml.Loader) or {}

        loops_dict = dict()
        synteny_dict = dict()

        # Loop ids are numbered consecutively across the whole loops.yaml
        loop_index = 1

        for key, value in sorted(iadhore_dict_with_loops.items()):

            # (genome, chromosome) -> flat list of start/end coordinates
            mult_size = dict()
            position_keys = sorted(k for k in value.keys() if k != "loops")

            for pk in position_keys:

                segment_keys = sorted(
                    k for k in value[pk].keys() if k != "motifs")

                for sk in segment_keys:

                    segment = value[pk][sk]
                    gene_gc = (segment["genome"], segment["chromosome"])
                    mult_size.setdefault(gene_gc, []).extend(
                        [segment["start"], segment["end"]])

            for (genome, chromosome), coords in mult_size.items():

                # Orientation: first start seen versus last end seen
                synteny_strand = '+' if coords[0] < coords[-1] else '-'
                synteny_attribute = "synteny_id \"" + str(key) + "\";"

                synteny_dict.setdefault(genome, []).append([
                    chromosome, "iADHoRe", "synteny",
                    min(coords), max(coords), ".", synteny_strand, ".",
                    synteny_attribute
                ])

            if "loops" not in value.keys():
                continue

            for loop in value["loops"]:

                first_motif = loop["first"]
                last_motif = loop["last"]

                # Pair the i-th first motif with the i-th last motif; the
                # genome/chromosome key comes from the first motif
                loop_size = {(first["genome"], first["chromosome"]):
                             (first["start"], first["end"],
                              last["start"], last["end"])
                             for first, last in zip(first_motif, last_motif)}

                for (genome, chromosome), coords in loop_size.items():

                    loop_strand = '+' if coords[0] < coords[-1] else '-'
                    loop_attribute = "loop_id \"" + str(loop_index) + "\";"

                    loops_dict.setdefault(genome, []).append([
                        chromosome, "MoSyn", "loop_like",
                        min(coords), max(coords), ".", loop_strand, ".",
                        loop_attribute
                    ])

                loop_index += 1

        loop_outdir = outfolder + "/".join(
            ["loop", genus, alignment, score, pwm])
        loop_outdir = check_folder_path(loop_outdir, True)

        synteny_outdir = outfolder + "/".join(
            ["synteny", genus, alignment, score, pwm])
        synteny_outdir = check_folder_path(synteny_outdir, True)

        _write_gtf_records(loop_outdir, loops_dict)
        _write_gtf_records(synteny_outdir, synteny_dict)
Ejemplo n.º 26
0
def _align_to_segments(gc_keys, entries, name_field):
    """
    Line entries up with the (genome, chromosome) columns of a loop
    :param gc_keys: Ordered list of (genome, chromosome) tuples
    :param entries: Iterable of dicts with genome, chromosome and strand keys
    :param name_field: Key holding the entry name ("gene" or "motif")
    :return: (names found, one label per column; "-" where nothing matched)
    """

    names = []
    labels = []

    for gk in gc_keys:

        match = None
        for entry in entries:
            # No break on purpose: the last matching entry wins
            if (entry["genome"], entry["chromosome"]) == gk:
                match = entry

        if match:
            names.append(match[name_field])
            labels.append(match[name_field] + match["strand"])
        else:
            labels.append("-")

    return names, labels


def generate_loop_summary(infolder,
                          outfolder,
                          genus_index=-5,
                          alignment_index=-4,
                          score_index=-3,
                          pwm_index=-2):
    """
    Generate Loop summary

    Writes loop_detail_summary.csv (one row per loop) and
    loop_short_summary.csv (one row per loops.yaml that contains loops) into
    outfolder. For every such loops.yaml, genes_and_motifs.txt, genes.txt and
    motifs.txt are written to outfolder/genus/alignment/score/pwm/.
    Files whose "loops" lists are all missing or empty are skipped.
    :param infolder: Input folder containing result
    :param outfolder: Output folder
    :param genus_index: Genus index folder relative to result file
    :param alignment_index: Alignment index folder relative to result file
    :param score_index: Score index folder relative to result file
    :param pwm_index: PWM index folder relative to result file
    :return:
    """

    outfolder = check_folder_path(outfolder, True)

    outfile = outfolder + "loop_detail_summary.csv"
    outfile0 = outfolder + "loop_short_summary.csv"

    # Context managers close both summaries even if a malformed loops.yaml
    # raises part-way through the run
    with open(outfile, 'w') as fout, open(outfile0, 'w') as fout0:

        print("Genus",
              "Alignment",
              "Score",
              "PWM",
              "Loop_ID",
              "Number_of_Segments",
              "Number_of_Genes",
              "Number_of_CTCF",
              "Total_Length",
              sep=",",
              file=fout)

        print("Genus",
              "Alignment",
              "Score",
              "PWM",
              "Number_of_Loops",
              "Number_of_Genes_in_Loops",
              "Number_of_Motifs_in_Loops",
              "Average_Number_of_Genes_per_Loops_per_Species",
              "Average_Length_per_Loops_per_Species",
              "Average_Number_of_Motifs_per_Loops_per_Species",
              sep=",",
              file=fout0)

        infolder = check_folder_path(infolder)
        for loop_file in glob.glob(infolder + '**/loops.yaml',
                                   recursive=True):

            # The *_index arguments pick components of the '/'-split path
            path_elem = loop_file.split('/')
            genus = path_elem[genus_index]
            alignment = path_elem[alignment_index]
            score = path_elem[score_index]
            pwm = path_elem[pwm_index]

            with open(loop_file, 'r') as stream:
                # PyYAML >= 6 rejects yaml.load() without an explicit Loader.
                # NOTE(review): yaml.Loader can construct arbitrary Python
                # objects, so only pipeline-generated files should be read
                # here; switch to yaml.safe_load once loops.yaml is confirmed
                # to hold plain types only.
                # An empty file loads as None; treat it as "no multiplicons".
                iadhore_dict_with_loops = yaml.load(
                    stream, Loader=yaml.Loader) or {}

            # Skip files without a single loop. Requiring a non-empty list
            # keeps num_of_loops > 0, so the averages below cannot divide
            # by zero.
            if not any(value.get("loops")
                       for value in iadhore_dict_with_loops.values()):
                continue

            this_outdir = outfolder + "/".join([genus, alignment, score, pwm])
            this_outdir = check_folder_path(this_outdir, True)

            genes_and_motifs = this_outdir + "genes_and_motifs.txt"
            genes_only = this_outdir + "genes.txt"
            motifs_only = this_outdir + "motifs.txt"

            overall_length = 0
            overall_genes = 0
            overall_motifs = 0
            num_of_loops = 0

            set_of_genes = set()
            set_of_motifs = set()

            loop_index = 1

            with open(genes_and_motifs, 'w') as f_all, \
                    open(genes_only, 'w') as f_gene, \
                    open(motifs_only, 'w') as f_mot:

                for _, value in sorted(iadhore_dict_with_loops.items()):

                    loops = value.get("loops")
                    if not loops:
                        continue

                    position_keys = sorted(
                        k for k in value.keys() if k != "loops")

                    for loop in loops:

                        num_of_loops += 1

                        first_motif = loop["first"]
                        last_motif = loop["last"]

                        first_pos = loop["first_pos"]
                        last_pos = loop["last_pos"]

                        # One output column per (genome, chromosome)
                        gc_keys = sorted((m["genome"], m["chromosome"])
                                         for m in first_motif)
                        num_of_segments = len(gc_keys)

                        genome_string = ",".join(
                            str(g[0]) for g in gc_keys)
                        chromosome_string = ",".join(
                            str(g[-1]) for g in gc_keys)

                        header = " ".join([
                            "#loop_id=" + str(loop_index) + ";",
                            "genome=" + genome_string + ";",
                            "chromosome=" + chromosome_string + ";"
                        ])
                        for handle in (f_all, f_gene, f_mot):
                            print(header, file=handle)

                        loop_loc = [(first["start"], first["end"],
                                     last["start"], last["end"])
                                    for first, last in zip(first_motif,
                                                           last_motif)]
                        loop_length = sum(
                            abs(max(loc) - min(loc)) for loc in loop_loc)
                        avg_loop_length = loop_length / num_of_segments

                        # True between the loop's first and last motif pair
                        check_start = False

                        loop_genes = 0
                        loop_motifs = 0

                        for pk in position_keys:

                            if not first_pos <= pk <= last_pos:
                                continue

                            val = value[pk]

                            if check_start:
                                segments = [val[s] for s in val.keys()
                                            if s != "motifs"]
                                gene_names, gene_labels = _align_to_segments(
                                    gc_keys, segments, "gene")

                                if gene_labels:
                                    loop_genes += len(gene_names)
                                    set_of_genes.update(gene_names)
                                    gene_line = "\t".join(gene_labels)
                                    print(gene_line, file=f_all)
                                    print(gene_line, file=f_gene)

                            if "motifs" not in val.keys():
                                continue

                            for pair in val["motifs"]:

                                # The first pair itself is part of the loop
                                if pair == first_motif:
                                    check_start = True

                                if check_start:
                                    motif_names, motif_labels = \
                                        _align_to_segments(
                                            gc_keys, pair, "motif")

                                    if motif_labels:
                                        loop_motifs += len(motif_names)
                                        set_of_motifs.update(motif_names)
                                        motif_line = "\t".join(motif_labels)
                                        print(motif_line, file=f_all)
                                        print(motif_line, file=f_mot)

                                # The last pair is included, then the window
                                # closes
                                if pair == last_motif:
                                    check_start = False

                        # Totals are averaged over the loop's segments
                        overall_length += avg_loop_length
                        overall_genes += loop_genes / num_of_segments
                        overall_motifs += loop_motifs / num_of_segments

                        print(genus,
                              alignment,
                              score,
                              pwm,
                              loop_index,
                              num_of_segments,
                              loop_genes,
                              loop_motifs,
                              loop_length,
                              sep=",",
                              file=fout)

                        loop_index += 1

            nr_genes = len(set_of_genes)
            nr_motifs = len(set_of_motifs)

            mean_length = overall_length / num_of_loops
            mean_genes = overall_genes / num_of_loops
            mean_motifs = overall_motifs / num_of_loops

            print(genus,
                  alignment,
                  score,
                  pwm,
                  num_of_loops,
                  nr_genes,
                  nr_motifs,
                  mean_genes,
                  mean_length,
                  mean_motifs,
                  sep=",",
                  file=fout0)
Ejemplo n.º 27
0
def orthofinder(args):
    """
    Run OrthoFinder with the settings carried by args
    :param args: Object exposing input, output, python2 and opt attributes
                 (presumably an argparse namespace; confirm against caller)
    :return:
    """

    # Both folders go through check_folder_path before being handed over
    source_dir = check_folder_path(args.input)
    target_dir = check_folder_path(args.output)

    co.run_orthofinder(
        source_dir,
        target_dir,
        python2env=args.python2,
        orthofinder_options=args.opt,
    )