help="File with the genome list. Format GenomeID, FullName, ShortName", required=True)
    # Required command-line options visible in this excerpt: the annotation
    # folder, the cluster file and the output folder.
    parser.add_argument("-a", "--annotation_folder", type=str,
                        help="Folder with the annotation files from JGI", required=True)
    parser.add_argument("-c", "--cluster_file", type=str,
                        help="Cluster file", required=True)
    parser.add_argument("-o", "--output_directory", type=str,
                        help="Output folder", required=True)

    args = parser.parse_args()

    # Create the output directory if it does not exist yet.
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)

    # Read the genome list.
    # NOTE(review): args.genome_list_index is presumably produced by an option
    # declared above this excerpt -- confirm the option name matches.
    genome_id_dictionary, genome_count = ClusterTools.read_genome_list(args.genome_list_index)

    # Read the annotation information, passing the keys of
    # genome_id_dictionary together with the annotation folder.
    protein_annotation, function_definitions = AnnotationTools.parse_annotation_folder(genome_id_dictionary.keys(), args.annotation_folder)

    # Read the cluster information from the cluster file.
    total_clusters = ClusterTools.get_cluster_information(args.cluster_file)

    # Open the log file inside the output directory (truncates any existing
    # logfile.txt because of mode 'w').
    # NOTE(review): the handle is opened without a `with` block; confirm it is
    # closed further down, outside this excerpt.
    logfile = open(args.output_directory + "/logfile.txt", 'w')

    # Record the total number of clusters. `%` binds tighter than `+`, so the
    # count is formatted first and the newline is appended afterwards.
    logfile.write("Total number of analyzed clusters: %d" % len(total_clusters) + "\n")

    # Annotation feature names; how they are consumed is not visible in this
    # excerpt.
    features_to_annotate = ["COG", "KO", "PFAM", "Product"]
    # Command-line options visible in this excerpt. The cluster file, the
    # fasta directory and the output directory are required; the group file
    # is optional.
    parser.add_argument("-c", "--cluster_file", type=str, help="Ortholog file, generated by OrthoMCL", required=True)
    parser.add_argument("-f", "--fasta_aa_directory", type=str, help="Directory with the fasta files", required=True)
    parser.add_argument("-g", "--group_information", type=str, help="Group file")
    parser.add_argument("-o", "--output_directory", type=str, help="Output directory", required=True)

    args = parser.parse_args()

    # Create the output directory if it does not exist yet.
    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)

    # Create the log file inside the output directory (mode 'w' truncates an
    # existing logfile.txt).
    # NOTE(review): opened without a `with` block; confirm it is closed
    # further down, outside this excerpt.
    run_summary = open(args.output_directory + "/logfile.txt", 'w')

    # Read the genome list.
    # NOTE(review): args.genome_list_index is presumably produced by an option
    # declared above this excerpt -- confirm.
    genome_id_dictionary, genome_count = ClusterTools.read_genome_list(args.genome_list_index)

    run_summary.write("Genomes in the genome list: %d" % genome_count + "\n")

    # Read the cluster information and check that everything is OK.
    # Earlier variant, kept for reference: it passed the dictionary *values*
    # instead of the keys.
    #cluster_information, set_of_proteins_in_clusters, unique_cluster_count, total_clusters, removed_clusters = \
    #    get_orthomcl_results(args.cluster_file, [i for i in genome_id_dictionary.itervalues()])

    # Current variant: pass the keys of genome_id_dictionary.
    cluster_information, set_of_proteins_in_clusters, unique_cluster_count, total_clusters, removed_clusters = \
        get_orthomcl_results(args.cluster_file, genome_id_dictionary.keys())

    # Summary counts. In each line `%` is applied before `+`, so the number is
    # formatted first and the newline appended afterwards.
    run_summary.write("Total number of clusters: %d" % len(cluster_information) + "\n")
    run_summary.write("Total number of protein in clusters: %d" % len(set_of_proteins_in_clusters) + "\n")
    run_summary.write("Total number of removed clusters (not present in the genome file): %d" % removed_clusters + "\n")

    # Check the counts to see whether everything is consistent (the check
    # itself lies outside this excerpt).
Esempio n. 3
0
    # Output sub-folders for aligned DNA and for the protein/DNA trees.
    # NOTE(review): protein_unaligned_folder, protein_alignment_folder and
    # dna_unaligned_folder are used below but defined above this excerpt.
    dna_aligned_folder = args.output_directory + "/dna_aligned"
    protein_tree_folder = args.output_directory + "/protein_trees"
    dna_tree_folder = args.output_directory + "/dna_trees"

    # The output directory is listed first, so it is created before the
    # sub-folders are checked.
    folder_list = [
        args.output_directory, protein_unaligned_folder,
        protein_alignment_folder, dna_unaligned_folder, dna_aligned_folder,
        protein_tree_folder, dna_tree_folder
    ]

    # Create every folder that does not exist yet.
    for folder in folder_list:
        if not os.path.exists(folder):
            os.makedirs(folder)

    # Get the cluster information from the cluster file.
    cluster_information = ClusterTools.get_cluster_information(
        args.cluster_file)

    # Create the sequence dictionary from the nucleotide fasta directory.
    dna_sequence_dic = create_sequence_dictionary(args.fasta_nuc_directory)

    # Iterate over each cluster and generate the alignments.

    # Accumulators initialised empty before the loop; presumably they collect
    # the problem cases their names describe -- the code that fills them is
    # outside this excerpt.
    frameshift_cases = []
    inframe_stops = []
    clusters_too_short = []
    nucleotide_not_found = []

    for cluster in cluster_information:

        # Each cluster value is a comma-separated string; split it into a
        # list of entries.
        protein_list = cluster_information[cluster].split(",")
        # Despite its name this is a dict, not a list.
        curated_protein_list = {}
Esempio n. 4
0
    # Output sub-folders, all placed directly under the output directory:
    # unaligned/aligned protein sequences, unaligned/aligned DNA sequences,
    # and protein/DNA trees.
    protein_unaligned_folder = args.output_directory + "/protein_unaligned"
    protein_alignment_folder = args.output_directory + "/protein_alignment"
    dna_unaligned_folder = args.output_directory + "/dna_unaligned"
    dna_aligned_folder = args.output_directory + "/dna_aligned"
    protein_tree_folder = args.output_directory + "/protein_trees"
    dna_tree_folder = args.output_directory + "/dna_trees"

    # The output directory is listed first, so it is created before the
    # sub-folders are checked.
    folder_list = [args.output_directory, protein_unaligned_folder, protein_alignment_folder, dna_unaligned_folder,
                   dna_aligned_folder, protein_tree_folder, dna_tree_folder]

    # Create every folder that does not exist yet.
    for folder in folder_list:
        if not os.path.exists(folder):
            os.makedirs(folder)

    # Get the cluster information from the cluster file.
    cluster_information = ClusterTools.get_cluster_information(args.cluster_file)

    # Create the sequence dictionary from the nucleotide fasta directory.
    dna_sequence_dic = create_sequence_dictionary(args.fasta_nuc_directory)

    # Iterate over each cluster and generate the alignments.

    # Accumulators initialised empty before the loop; presumably they collect
    # the problem cases their names describe -- the code that fills them is
    # outside this excerpt.
    frameshift_cases = []
    inframe_stops = []
    clusters_too_short = []
    nucleotide_not_found = []

    for cluster in cluster_information:

        # Each cluster value is a comma-separated string; split it into a
        # list of entries.
        protein_list = cluster_information[cluster].split(",")
        # Despite its name this is a dict, not a list.
        curated_protein_list = {}