Beispiel #1
0
    def get_codon_alignment_from_files(self,
                                       protein_aln_file,
                                       nucleotide_seq_file,
                                       codon_alignment_file,
                                       cds2protein_accordance_file=None,
                                       alignment_format="fasta",
                                       nucleotide_sequence_format="fasta",
                                       cds_index_file=None,
                                       retain_cds_index=False):
        """
        Build a codon alignment from a protein alignment and the matching CDS file.

        protein_aln_file            -- protein alignment readable by AlignIO
        nucleotide_seq_file         -- CDS sequences, indexed via SeqIO.index_db
        codon_alignment_file        -- output path for the codon alignment
        cds2protein_accordance_file -- optional two-column table (cds, protein);
                                       read inverted so keys are protein ids
        alignment_format            -- format of the protein alignment
        nucleotide_sequence_format  -- format of the CDS file
        cds_index_file              -- reuse/keep this SeqIO index instead of a
                                       temporary "nuc_tmp.idx"
        retain_cds_index            -- keep the temporary index on disk
        """
        index_path = cds_index_file if cds_index_file else "nuc_tmp.idx"
        alignment = AlignIO.read(protein_aln_file, format=alignment_format)
        cds_records = SeqIO.index_db(index_path,
                                     nucleotide_seq_file,
                                     format=nucleotide_sequence_format)

        accordance = None
        if cds2protein_accordance_file:
            # Invert the (cds, protein) table: key on protein id.
            accordance = SynDict()
            accordance.read(cds2protein_accordance_file,
                            key_index=1,
                            value_index=0)

        self.get_codon_alignment(alignment,
                                 cds_records,
                                 codon_alignment_file,
                                 protein2cds_accordance_dict=accordance)

        # Remove the temporary index unless the caller supplied or retained one.
        if not (cds_index_file or retain_cds_index):
            os.remove("nuc_tmp.idx")
Beispiel #2
0
 def extract_dom_names_hits_from_domtblout(domtblout_file, output_file):
     """
     Collect query names per domain from a domtblout-style table.

     Column 3 (0-based) is the key and column 0 the value; repeated keys
     accumulate into one entry.  If *output_file* is truthy the mapping is
     also written there.  Returns the resulting SynDict.
     """
     domain_hits = SynDict()
     domain_hits.read(domtblout_file,
                      header=False,
                      separator=None,
                      allow_repeats_of_key=True,
                      key_index=3,
                      value_index=0,
                      comments_prefix="#")
     if output_file:
         domain_hits.write(output_file, splited_values=True)
     return domain_hits
Beispiel #3
0
    def get_families_from_top_hits(top_hits_file, fam_file):
        """
        Group a top-hits table into families and write them to *fam_file*.

        Keys come from column 1 and values from column 0 (repeated keys are
        merged; lines starting with '#' are skipped).  Returns the SynDict
        that was written.
        """
        families = SynDict()
        families.read(top_hits_file,
                      allow_repeats_of_key=True,
                      key_index=1,
                      value_index=0,
                      comments_prefix="#")
        families.write(fam_file, splited_values=True)

        return families
Beispiel #4
0
    def extract_sequences_from_selected_clusters(
            self,
            clusters_id_file,
            cluster_file,
            seq_file,
            output_dir="./",
            seq_format="fasta",
            out_prefix=None,
            create_dir_for_each_cluster=False,
            skip_cluster_if_no_sequence_for_element=True):
        """
        Extract member sequences of selected clusters into per-cluster files.

        clusters_id_file -- file with ids of clusters to extract; if falsy,
                            all clusters from cluster_file are processed
        cluster_file     -- cluster file readable by SynDict
                            (comma-separated member lists)
        seq_file         -- sequence file(s); indexed via SeqIO.index_db
        output_dir       -- directory for the output files
        seq_format       -- sequence format used for input and output
        out_prefix       -- output file name prefix; when set, one directory
                            per cluster is forced so files do not collide
        create_dir_for_each_cluster -- create a subdirectory per cluster
        skip_cluster_if_no_sequence_for_element -- skip clusters that have
                            members absent from the sequence index

        Returns the number of skipped clusters.
        """
        from Routines import SequenceRoutines, FileRoutines
        cluster_id_list = IdList()
        cluster_dict = SynDict()
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        # An explicit prefix forces per-cluster directories, otherwise every
        # cluster would write to the same "<prefix>.fasta" file.
        create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
        if clusters_id_file:
            cluster_id_list.read(clusters_id_file)
        cluster_dict.read(cluster_file,
                          split_values=True,
                          values_separator=",")
        protein_dict = SeqIO.index_db(
            "tmp.idx",
            FileRoutines.make_list_of_path_to_files(seq_file),
            format=seq_format)

        number_of_skipped_clusters = 0
        for fam_id in cluster_id_list if clusters_id_file else cluster_dict:

            if skip_cluster_if_no_sequence_for_element:
                absent_elements = self.check_absence_of_cluster_elements(
                    cluster_dict[fam_id], protein_dict)
                if absent_elements:
                    # print() call form (valid in Python 2 and 3 alike);
                    # the original used a Python-2-only print statement.
                    print("Skipping cluster %s due to absent element(%s)" % (
                        fam_id, ",".join(absent_elements)))
                    number_of_skipped_clusters += 1
                    continue

            if fam_id in cluster_dict:
                if create_directory_for_each_cluster:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.fasta" % (fam_dir, out_prefix
                                               if out_prefix else fam_id)
                else:
                    out_file = "%s/%s.fasta" % (out_dir, out_prefix
                                                if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, cluster_dict[fam_id], verbose=True),
                            out_file,
                            format=seq_format)

        os.remove("tmp.idx")
        print("%i of %i clusters were skipped due to absent elements" % (
            number_of_skipped_clusters, len(cluster_dict)))

        return number_of_skipped_clusters
Beispiel #5
0
 def extract_hits_from_tbl_output(blast_hits, output_file):
     """
     Collapse a tab-separated BLAST hit table into one line per query.

     Keys come from column 0, values from column 1; repeated keys are merged.
     The mapping is written to *output_file* (values comma-separated) and
     returned.
     """
     hit_map = SynDict()
     hit_map.read(blast_hits,
                  allow_repeats_of_key=True,
                  key_index=0,
                  value_index=1,
                  separator="\t")
     hit_map.write(output_file,
                   splited_values=True,
                   separator="\t",
                   values_separator=",")
     return hit_map
Beispiel #6
0
    def rename_elements_in_clusters(
            clusters_file,
            syn_file,
            output_clusters_file,
            remove_clusters_with_not_renamed_elements=False,
            elements_with_absent_synonyms_file=None,
            syn_file_key_column_index=0,
            syn_file_value_column_index=1,
            syn_file_column_separator='\t'):
        """
        Rename cluster members using a synonym table.

        clusters_file        -- cluster file (comma-separated member lists)
        syn_file             -- table mapping old names to new names
        output_clusters_file -- where the renamed clusters are written
        remove_clusters_with_not_renamed_elements -- drop clusters that
                                contain any member without a synonym
        elements_with_absent_synonyms_file -- optional file listing, per
                                cluster, the members that had no synonym

        Returns a SynDict: cluster id -> members lacking a synonym.
        """
        synonyms = SynDict()
        synonyms.read(syn_file,
                      comments_prefix="#",
                      key_index=syn_file_key_column_index,
                      value_index=syn_file_value_column_index,
                      separator=syn_file_column_separator)

        clusters = SynDict()
        clusters.read(clusters_file,
                      split_values=True,
                      values_separator=",",
                      comments_prefix="#")

        renamed_clusters = SynDict()
        missing_synonyms = SynDict()

        for cluster_id in clusters:
            new_members = []
            fully_renamed = True
            for member in clusters[cluster_id]:
                if member in synonyms:
                    new_members.append(synonyms[member])
                    continue
                # No synonym: keep the original name and record the miss.
                if cluster_id in missing_synonyms:
                    missing_synonyms[cluster_id].append(member)
                else:
                    missing_synonyms[cluster_id] = [member]
                fully_renamed = False
                new_members.append(member)

            # Equivalent to: (not remove) or (remove and fully_renamed)
            if fully_renamed or not remove_clusters_with_not_renamed_elements:
                renamed_clusters[cluster_id] = new_members

        renamed_clusters.write(output_clusters_file, splited_values=True)

        if elements_with_absent_synonyms_file:
            missing_synonyms.write(elements_with_absent_synonyms_file,
                                   splited_values=True)

        return missing_synonyms
Beispiel #7
0
 def add_len_to_simple_output(top_hits_simple, len_file, out_file):
     """
     Append query length and relative hit span to a simple top-hits table.

     Assumed input columns (tab-separated): query id, start, end, hit id —
     TODO confirm against the producer of the table.  Output columns:
     query id, query length, hit id, start, end,
     (end - start + 1) / query length.
     """
     lengths = SynDict()
     lengths.read(len_file)
     with open(top_hits_simple, "r") as in_fd, open(out_file, "w") as out_fd:
         for line in in_fd:
             fields = line.strip().split("\t")
             query_id = fields[0]
             query_len = lengths[query_id]
             span_fraction = ((float(fields[2]) - float(fields[1]) + 1)
                              / float(query_len))
             out_fd.write("%s\t%s\t%s\t%s\t%s\t%f\n"
                          % (query_id, query_len, fields[3],
                             fields[1], fields[2], span_fraction))
Beispiel #8
0
 def replace_region_names_in_gff(input_gff, synonyms_file, output_gff):
     """
     Rewrite the region (first) column of a GFF file using a synonym table.

     Comment lines (first character '#') and lines whose region has no
     synonym are copied through unchanged.
     """
     synonyms = SynDict()
     synonyms.read(synonyms_file, comments_prefix="#")
     with open(input_gff, "r") as in_fd, open(output_gff, "w") as out_fd:
         for line in in_fd:
             if line[0] == "#":
                 out_fd.write(line)
                 continue
             fields = line.split("\t")
             region = fields[0]
             if region in synonyms:
                 fields[0] = synonyms[region]
                 out_fd.write("\t".join(fields))
             else:
                 out_fd.write(line)
Beispiel #9
0
    def label_cluster_elements_from_file(self,
                                         input_file,
                                         label,
                                         output_file,
                                         separator="@",
                                         label_position="first"):
        """
        Read clusters from *input_file*, attach *label* to every element and
        write the relabelled clusters to *output_file*.

        Delegates the labelling itself to self.label_cluster_elements and
        returns its result (a SynDict).
        """
        clusters = SynDict()
        clusters.read(input_file, split_values=True, comments_prefix="#")

        labeled_clusters = self.label_cluster_elements(
            clusters,
            label,
            separator=separator,
            label_position=label_position)
        labeled_clusters.write(output_file, splited_values=True)

        return labeled_clusters
Beispiel #10
0
    def extract_clusters_by_element_ids_from_file(self,
                                                  cluster_file,
                                                  element_file,
                                                  output_file,
                                                  mode="w"):
        """
        Extract clusters containing the listed element ids and write them out.

        mode: "w" - if elements from element_id_list are present in cluster extracts only that elements
              "a" - if elements from element_id_list are present in cluster extracts all elements
        """
        # (Fixed a stray fourth quote that used to start the docstring.)
        cluster_dict = SynDict()
        cluster_dict.read(cluster_file, split_values=True, comments_prefix="#")

        element_id_list = IdList()
        element_id_list.read(element_file, comments_prefix="#")
        extracted_clusters = self.extract_clusters_by_element_ids(
            cluster_dict, element_id_list, mode=mode)
        extracted_clusters.write(output_file, splited_values=True)
Beispiel #11
0
    def extract_proteins_from_selected_families(
            families_id_file,
            fam_file,
            pep_file,
            output_dir="./",
            pep_format="fasta",
            out_prefix=None,
            create_dir_for_each_family=False):
        """
        Write the protein sequences of selected families to per-family files.

        families_id_file -- file with ids of families to extract; if falsy,
                            all families from fam_file are processed
        fam_file         -- family file (comma-separated member lists)
        pep_file         -- protein sequence file, indexed via SeqIO.index_db
        output_dir       -- directory for the ".pep" output files
        pep_format       -- sequence format for input and output
        out_prefix       -- output file name prefix; when set, per-family
                            directories are forced so files do not collide
        create_dir_for_each_family -- create a subdirectory per family
        """
        from Routines import SequenceRoutines, FileRoutines
        fam_id_list = IdList()
        fam_dict = SynDict()
        #print(pep_file)
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        # An explicit prefix forces one directory per family; otherwise every
        # family would write to the same "<prefix>.pep" file.
        create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
        if families_id_file:
            fam_id_list.read(families_id_file)
        fam_dict.read(fam_file, split_values=True, values_separator=",")
        # Temporary on-disk index; removed at the end.
        protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

        for fam_id in fam_id_list if families_id_file else fam_dict:
            if fam_id in fam_dict:
                if create_directory_for_each_family:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.pep" % (fam_dir, out_prefix
                                             if out_prefix else fam_id)
                else:
                    out_file = "%s/%s.pep" % (out_dir, out_prefix
                                              if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, fam_dict[fam_id], verbose=True),
                            out_file,
                            format=pep_format)
            else:
                # Requested family id missing from the family file.
                print("%s was not found" % fam_id)

        os.remove("tmp.idx")
Beispiel #12
0
    def rename_elements_in_clusters(
            clusters_file,
            syn_file,
            output_clusters_file,
            remove_clusters_with_not_renamed_elements=False,
            syn_file_key_column_index=0,
            syn_file_value_column_index=1,
            syn_file_column_separator='\t'):
        """
        Rename cluster members using a synonym table and write the result.

        clusters_file        -- cluster file (comma-separated member lists)
        syn_file             -- table mapping old names to new names
        output_clusters_file -- where the renamed clusters are written
        remove_clusters_with_not_renamed_elements -- drop any cluster that
                                contains a member without a synonym
        """
        syn_dict = SynDict()
        syn_dict.read(syn_file,
                      comments_prefix="#",
                      key_index=syn_file_key_column_index,
                      value_index=syn_file_value_column_index,
                      separator=syn_file_column_separator)

        clusters_dict = SynDict()
        clusters_dict.read(clusters_file,
                           split_values=True,
                           values_separator=",",
                           comments_prefix="#")

        output_clusters_dict = SynDict()

        for cluster in clusters_dict:
            renamed_element_list = []
            for element in clusters_dict[cluster]:
                if element in syn_dict:
                    renamed_element_list.append(syn_dict[element])
                else:
                    # No synonym: keep the original name; if removal was
                    # requested, abandon this cluster via break.
                    renamed_element_list.append(element)
                    if remove_clusters_with_not_renamed_elements:
                        break
            # for/else: runs only when the inner loop did NOT break, i.e.
            # either every member was renamed or removal is disabled.
            else:
                output_clusters_dict[cluster] = renamed_element_list

        output_clusters_dict.write(output_clusters_file, splited_values=True)
Beispiel #13
0
    def add_length_to_fam_file(fam_file,
                               len_file,
                               out_file,
                               close_after_if_file_object=False):
        """
        Annotate a family file with the length of every member.

        Writes one line per family: "<family>\t<members>\t<lengths>", both
        lists comma-separated; a member absent from the length table gets
        "NA" (joining a None would raise TypeError).

        fam_file -- family file readable by SynDict (split values)
        len_file -- two-column table: member id -> length
        out_file -- output path, or an already opened writable file object
        close_after_if_file_object -- close the output handle at the end
        """
        fam_dict = SynDict()
        fam_dict.read(fam_file, split_values=True, comments_prefix="#")
        len_dict = SynDict()
        len_dict.read(len_file, comments_prefix="#")

        # Accept a path or a file-like object.  Fixes two bugs: the path was
        # previously opened in "r" mode (every write failed), and the check
        # used the Python-2-only builtin `file`.
        out_fd = out_file if hasattr(out_file, "write") else open(
            out_file, "w")

        for family in fam_dict:
            len_list = ["NA" if member not in len_dict else len_dict[member]
                        for member in fam_dict[family]]

            out_fd.write(
                "%s\t%s\t%s\n" %
                (family, ",".join(fam_dict[family]), ",".join(len_list)))

        if close_after_if_file_object:
            out_fd.close()
Beispiel #14
0
# Command-line options (fragment: `parser` is created earlier in the file).
parser.add_argument("-f", "--families_with_errors", action="store", dest="fam_error", default="error.fam.ids",
                    help="File to write ids of families with errors")
parser.add_argument("-s", "--species_set", action="store", dest="species_set",
                    help="Comma-separated list of species.")

parser.add_argument("-l", "--name_last", action="store_false", dest="name_first", default=True,
                    help="Position of name of species in gene_id. Default: name first")
parser.add_argument("-e", "--name_separator", action="store", dest="name_separator", default=".",
                    help="Separator between species name and gene name. Default: '.'")
args = parser.parse_args()


# Turn the comma-separated CLI string into a set of species names.
args.species_set = set(args.species_set.split(","))

pep_fam_dict = SynDict()
pep_fam_dict.read(args.input, split_values=True)

cds_fam_dict = SynDict()

cds_dict = {}
accordance_dict = {}

# Load one protein->CDS accordance table per species
# (read inverted: key_index=1 makes the protein id the key).
for species in args.species_set:
    #cds_file = "%s/%s.cds" % (args.cds_dir, species)
    #cds_dict[species] = SeqIO.index(cds_file, format="fasta")

    accordance_file = "%s/%s.accordance" % (args.accordance_dir, species)
    accordance_dict[species] = SynDict()
    accordance_dict[species].read(accordance_file, key_index=1, value_index=0)

Beispiel #15
0
                    type=FileRoutines.check_path,
                    help="Directory to write fam files named by species names")
# Command-line options (fragment: `parser` is created earlier in the file).
parser.add_argument("-d", "--syn_file", action="store", dest="syn_file", required=True,
                    help="File with taxa ids and species names")
parser.add_argument("-k", "--key_index", action="store", dest="key_index", type=int, default=0,
                    help="Key column in file with synonyms(0-based)")
parser.add_argument("-v", "--value_index", action="store", dest="value_index", type=int, default=1,
                    help="Value column in file with synonyms(0-based)")
parser.add_argument("-c", "--comments_prefix", action="store", dest="comments_prefix", default="#",
                    help="Prefix of comments in synonyms file")
parser.add_argument("-m", "--columns_separator", action="store", dest="separator", default="\t",
                    help="Column separator in file with synonyms")
parser.add_argument("-e", "--header", action="store_true", dest="header", default=False,
                    help="Header is present in synonyms file. Default - False")

args = parser.parse_args()

# taxon id -> species name mapping.
syn_dict = SynDict()
syn_dict.read(args.syn_file, header=args.header, separator=args.separator, key_index=args.key_index,
              value_index=args.value_index, comments_prefix=args.comments_prefix)

# Copy every input file, renaming it from its taxon id to the species name.
# NOTE(review): assumes the directory options already end with a path
# separator (FileRoutines.check_path presumably ensures this) -- confirm.
FileRoutines.safe_mkdir(args.output_files_dir)
input_files = os.listdir(args.input_files_dir)
for filename in input_files:
    directory, taxon_id, extension = FileRoutines.split_filename(filename)
    if taxon_id not in syn_dict:
        print("Species name was not found for taxon %s" % taxon_id)
        continue
    shutil.copy("%s%s" % (args.input_files_dir, filename),
                "%s%s%s" % (args.output_files_dir, syn_dict[taxon_id], extension))
Beispiel #16
0
    "--input",
    action="store",
    dest="input",
    required=True,
    type=make_list_of_path_to_files_from_comma_sep_string,
    help="Comma-separated list of fam files or directories with them")
parser.add_argument("-o",
                    "--output_file",
                    action="store",
                    dest="output",
                    default="stdout",
                    help="Output file")

# Merge several family files into one mapping and write the union.
args = parser.parse_args()

# BUG FIX: the guard used to test args.input (a list of paths), which can
# never equal "stdout"; with the default output that opened a stray file
# literally named "stdout".  Test the output option instead, mirroring the
# close() guard below.
out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")
family_dict = SynDict()

for filename in args.input:
    fam_dict = SynDict()
    fam_dict.read(filename, split_values=True)
    for family in fam_dict:
        if family not in family_dict:
            family_dict[family] = fam_dict[family]
        else:
            # Same family in several files: concatenate the member lists.
            family_dict[family] += fam_dict[family]

family_dict.write(args.output, splited_values=True)

if args.output != "stdout":
    out_fd.close()
                    help="Suffix of fam files")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")
# Build the list of species (and the per-species file suffix) either from
# the files present in the input directory or from the CLI options.
species_list = []
suffix_list = []
if args.use_basename:
    for filename in sorted(os.listdir(args.input)):
        dir, basename, ext = split_filename(filename)
        species_list.append(basename)
        suffix_list.append("%s" % ext)
else:
    species_list = sorted(args.species_set)
    suffix_list = [args.suffix for i in range(0, len(species_list))]

# One row per species: family count and total protein (member) count.
out_fd.write("#species\tnumber_of_families\tnumber_of_proteins\n")
for species, suffix in zip(species_list, suffix_list):
    fam_dict = SynDict()
    fam_dict.read("%s%s%s" % (args.input, species, suffix),
                  separator="\t",
                  split_values=True,
                  values_separator=",",
                  key_index=0,
                  value_index=1)
    out_fd.write("%s\t%i\t%i\n" %
                 (species, len(fam_dict), fam_dict.count_all_synonyms()))

if args.output != "stdout":
    out_fd.close()
                    required=True,
                    help="Reference family file")
parser.add_argument("-o",
                    "--output_prefix",
                    action="store",
                    dest="output_prefix",
                    default="stdout",
                    help="Prefix of output file")
args = parser.parse_args()

out_fd = sys.stdout if args.output_prefix == "stdout" else open(
    "%s_reference_random_genes.ids" % args.output_prefix, "w")

# Reference families: family id -> comma-separated member gene ids.
reference_families = SynDict()
reference_families.read(args.reference_fam,
                        separator="\t",
                        split_values=True,
                        values_separator=",")

# Family ids present at the node of interest (column 0 of the input table).
node_family_ids = IdList()
node_family_ids.read(args.input,
                     header=True,
                     column_number=0,
                     column_separator="\t")

# family id -> randomly chosen reference gene ("." when absent).
reference_random_genes = SynDict()
for family_id in node_family_ids:
    if family_id not in reference_families:
        reference_random_genes[family_id] = "."
    else:
        reference_random_genes[family_id] = choice(
Beispiel #19
0
    def replace_augustus_ids_by_syn(augustus_gff, output_gff, genes_syn_file, transcripts_syn_file,
                                    cds_syn_file=None):
        """
        Rewrite gene/transcript/CDS ids in an AUGUSTUS gff using synonym tables.

        Scans the gff for "# start gene" / "# end gene" blocks, replaces the
        ID/Parent attributes of gene, transcript, CDS, start_codon and
        stop_codon features with their synonyms, and copies everything else
        through unchanged.  Raises ValueError when a feature's Parent does not
        match the enclosing gene/transcript id.

        genes_syn_file / transcripts_syn_file -- two-column synonym tables
        cds_syn_file -- optional CDS synonym table; if absent, CDS ids are
                        derived from the transcript synonyms

        NOTE(review): uses file.next() (Python 2 iterator protocol; Python 3
        would need next(in_fd)).  A gene block truncated at EOF would raise
        StopIteration -- presumably input files are well-formed; confirm.
        """

        genes_syn_dict = SynDict()
        genes_syn_dict.read(genes_syn_file, comments_prefix="#")
        transcripts_syn_dict = SynDict()
        transcripts_syn_dict.read(transcripts_syn_file, comments_prefix="#")
        cds_syn_dict = SynDict()
        if cds_syn_file:
            cds_syn_dict.read(cds_syn_file, comments_prefix="#")
        with open(augustus_gff, "r") as in_fd:
            with open(output_gff, "w") as out_fd:
                for line in in_fd:
                    tmp = line.strip()
                    # Too short to be a "# start gene <id>" marker (12 chars
                    # of prefix plus at least one id character).
                    if len(tmp) < 13:
                        out_fd.write(line)
                        continue
                    if tmp[:12] != "# start gene":
                        out_fd.write(line)
                        continue
                    # Entered a gene block; translate the gene id.
                    augustus_gene_id = tmp.split(" ")[-1]
                    gene_syn_id = genes_syn_dict[augustus_gene_id]
                    augustus_transcript_id = ""
                    augustus_transcript_parent = ""
                    out_fd.write("# start gene %s\n" % gene_syn_id)
                    tmp = in_fd.next().strip()
                    while True:
                        # Feature lines (non-comment) of the current gene.
                        while tmp[0] != "#":
                            tmp_list = tmp.split("\t")
                            feature_type = tmp_list[2]
                            # Keep all columns except the attributes field.
                            edited_str = "\t".join(tmp_list[:-1])
                            info_field_list = tmp_list[-1].split(";")
                            if feature_type == "gene":
                                edited_str += "\tID=%s\n" % gene_syn_id
                            elif feature_type == "transcript":
                                for entry in info_field_list:
                                    if "ID" in entry:
                                        augustus_transcript_id = entry.split("=")[-1]
                                        transcript_syn_id = transcripts_syn_dict[augustus_transcript_id]
                                    if "Parent" in entry:
                                        augustus_transcript_parent = entry.split("=")[-1]
                                        if augustus_transcript_parent != augustus_gene_id:
                                            raise ValueError("Transcript parent id and gene id are not same!")
                                edited_str += "\tID=%s;Parent=%s\n" % (transcript_syn_id, gene_syn_id)
                            elif feature_type == "CDS":
                                for entry in info_field_list:
                                    if "ID" in entry:
                                        augustus_cds_id = entry.split("=")[-1]
                                        # Without an explicit CDS table, derive the CDS synonym
                                        # from the transcript id (".cds" suffix stripped via [:-4]).
                                        cds_syn_id = cds_syn_dict[augustus_cds_id] if cds_syn_dict else "%s.cds" % transcripts_syn_dict[augustus_cds_id[:-4]]
                                    if "Parent" in entry:
                                        augustus_cds_parent = entry.split("=")[-1]
                                        if augustus_cds_parent != augustus_transcript_id:
                                            raise ValueError("CDS parent id and transcript id are not same!")
                                edited_str += "\tID=%s;Parent=%s\n" % (cds_syn_id, transcript_syn_id)
                            elif (feature_type == "stop_codon") or (feature_type == "start_codon"):
                                for entry in info_field_list:
                                    if "Parent" in entry:
                                        augustus_feature_parent = entry.split("=")[-1]
                                        if augustus_feature_parent != augustus_transcript_id:
                                            raise ValueError("Feature parent id and transcript id are not same!")
                                edited_str += "\tParent=%s\n" % (transcript_syn_id)
                            else:
                                # Unknown feature type: pass the line through unmodified.
                                edited_str = tmp

                            out_fd.write(edited_str)
                            tmp = in_fd.next().strip()
                        # Comment lines inside the gene block.
                        while tmp[0] == "#":
                            if "# end gene" in tmp:
                                break
                            out_fd.write(tmp + "\n")
                            tmp = in_fd.next().strip()
                        if "# end gene" in tmp:
                                break
                    out_fd.write("# end gene %s\n" % gene_syn_id)
Beispiel #20
0
                 dont_add_read_groups else rmdup_sorted_filtered_alignment)
GenomeCov.get_coverage(
    rmdup_sorted_filtered_alignment_with_groups
    if not args.dont_add_read_groups else rmdup_sorted_filtered_alignment,
    args.coverage_bed)
if not args.retain_temp:
    os.remove(raw_alignment)
    os.remove(filtered_alignment)
    os.remove(sorted_filtered_alignment)

if args.calculate_median_coverage or args.calculate_mean_coverage:
    coverage_dict = SynDict()
    coverage_dict.read(args.coverage_bed,
                       header=False,
                       separator="\t",
                       allow_repeats_of_key=True,
                       values_separator=",",
                       key_index=0,
                       value_index=2,
                       expression=int)
    if args.calculate_median_coverage:
        with open("%s_median_coverage.tab" % args.prefix, "w") as out_fd:
            for region in coverage_dict:
                mediana = median(
                    array(coverage_dict[region] if args.flanks_size ==
                          0 else coverage_dict[region]
                          [args.flanks_size:-args.flanks_size]))
                out_fd.write("%s\t%f\n" % (region, mediana))
    if args.calculate_mean_coverage:
        with open("%s_mean_coverage.tab" % args.prefix, "w") as out_fd:
            for region in coverage_dict:
                meana = mean(
                    help="Output file with collapsed strings")
parser.add_argument("-c", "--column_separator", action="store", dest="column_separator", default="\t",
                    help="Column separator. Default: '\\t'")
parser.add_argument("-v", "--value_separator", action="store", dest="value_separator", default=",",
                    help="Value separator. Default: ','")
parser.add_argument("-k", "--key_column", action="store", dest="key_column", default=0, type=int,
                    help="Column to be used as key(0-based). Default: 0")
parser.add_argument("-a", "--value_column", action="store", dest="value_column", default=1, type=int,
                    help="Column to be used as value(0-based). Default: 1")
parser.add_argument("-m", "--comments_prefix", action="store", dest="comments_prefix", default="#",
                    help="Prefix of strings(comments) to be ignored. Default: #")
parser.add_argument("-r", "--remove_value_repeats", action="store_true", dest="remove_value_repeats",
                    help="Remove repeats of values")
args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

# Collapse rows sharing a key into a single line of joined values.
syn_dict = SynDict()
syn_dict.read(args.input, header=False, separator=args.column_separator, allow_repeats_of_key=True,
              split_values=True, values_separator=args.value_separator,
              key_index=args.key_column, value_index=args.value_column,
              comments_prefix=args.comments_prefix)

if args.remove_value_repeats:
    # Deduplicate values within every key before writing.
    collapsed_dict = syn_dict.remove_value_repeats()
    collapsed_dict.write(out_fd, splited_values=True, values_separator=args.value_separator,
                         close_after_if_file_object=True)
else:
    syn_dict.write(out_fd, splited_values=True, values_separator=args.value_separator,
                   close_after_if_file_object=True)
#out_fd.close()
                    type=FileRoutines.make_list_of_path_to_files_from_string,
                    help="Comma-separated list of input files/directories with sequences")
parser.add_argument("-o", "--output_directory", action="store", dest="output", type=FileRoutines.check_path,
                    help="Directory to output groups_of sequences")
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="Format of input and output files. Allowed formats genbank, fasta(default)")
parser.add_argument("-e", "--extension", action="store", dest="extension",
                    help="Extension of output files. Default: equal to -f")
parser.add_argument("-d", "--id_file", action="store", dest="id_file",
                    help="File with groups of sequences to extract(.fam file).")

args = parser.parse_args()

FileRoutines.safe_mkdir(args.output)
# Output extension defaults to the sequence format name.
args.extension = args.extension if args.extension else args.format
tmp_index_file = "temp.idx"

#id_list = read_ids(args.id_file)
id_list = IdSet(filename=args.id_file)

# group id -> member sequence ids.
sequence_groups_id = SynDict()
sequence_groups_id.read(args.id_file, split_values=True)
#print("Parsing %s..." % args.input_file)
sequence_dict = SeqIO.index_db(tmp_index_file, args.input, format=args.format)
# One output file per group, named "<group>.<extension>".
for group in sequence_groups_id:
    SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_dict, sequence_groups_id[group],
                                                        verbose=True),
                "%s%s.%s" % (args.output, group, args.extension), format=args.format)

os.remove(tmp_index_file)
parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_file", action="store", dest="input", required=True,
                    help="Input file with families")
parser.add_argument("-d", "--id_file", action="store", dest="id_file", default=None,
                    help="File with ids of families. If absent genes from all families will be extracted(default).")
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file")
parser.add_argument("-s", "--separate_families", action="store_true", dest="separate_families",
                    help="Separate families to different files. If set option -o/--output_file is ignored")
args = parser.parse_args()

# family id -> list of gene ids, read from the tab-separated .fam file.
families = SynDict()
families.read(args.input, separator="\t", split_values=True, values_separator=",")

# Restrict the extraction to the requested family ids when an id file was given.
if args.id_file:
    id_list = IdList()
    id_list = id_list.read(args.id_file)

family_ids = id_list if args.id_file else families

if args.separate_families:
    # One <family_id>.ids file per family; -o/--output_file is ignored here.
    for fam_id in family_ids:
        with open("%s.ids" % fam_id, "w") as fam_fd:
            for gene_id in families[fam_id]:
                fam_fd.write(gene_id + "\n")
else:
    # BUGFIX: the original unconditionally re-opened args.output in this branch,
    # which created a file literally named "stdout" when no -o was given and
    # leaked the descriptor opened earlier. Open the sink once here instead,
    # and close it only when it is a real file.
    out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")
    try:
        for fam_id in family_ids:
            for gene_id in families[fam_id]:
                out_fd.write(gene_id + "\n")
    finally:
        if out_fd is not sys.stdout:
            out_fd.close()
Beispiel #24
0
    def star_and_htseq(self,
                       genome_dir,
                       samples_directory,
                       output_directory,
                       gff_for_htseq,
                       count_table_file,
                       genome_fasta=None,
                       samples_to_handle=None,
                       genome_size=None,
                       annotation_gtf=None,
                       feature_from_gtf_to_use_as_exon=None,
                       exon_tag_to_use_as_transcript_id=None,
                       exon_tag_to_use_as_gene_id=None,
                       length_of_sequences_flanking_junction=None,
                       junction_tab_file_list=None,
                       three_prime_trim=None,
                       five_prime_trim=None,
                       adapter_seq_for_three_prime_clip=None,
                       max_mismatch_percent_for_adapter_trimming=None,
                       three_prime_trim_after_adapter_clip=None,
                       output_type="BAM",
                       sort_bam=True,
                       max_memory_for_bam_sorting=None,
                       include_unmapped_reads_in_bam=True,
                       output_unmapped_reads=True,
                       two_pass_mode=False,
                       star_dir=None,
                       threads=1,
                       max_intron_length=None,
                       stranded_rnaseq="yes",
                       min_alignment_quality=10,
                       feature_type_for_htseq="exon",
                       feature_id_attribute_for_htseq="gene_id",
                       htseq_mode="union"):
        """Run an RNA-seq quantification pipeline: STAR alignment + HTSeq counting.

        For every sample found under samples_directory (or listed in
        samples_to_handle), reads are aligned with STAR into
        <output_directory>/alignment/<sample>/, the resulting coordinate-sorted
        BAM is indexed with samtools, and reads per feature are counted with
        HTSeq against gff_for_htseq. Per-sample counts are merged into a
        TwoLvlDict (sample -> feature -> count) and written to count_table_file.

        If genome_fasta is given, a STAR genome index is built into genome_dir
        first; otherwise genome_dir is assumed to already contain an index.

        Side effects: creates directories and files under output_directory and
        shells out to samtools via os.system. Uses Python 2 print statements.
        """

        # Configure the STAR wrapper globally for this run.
        STAR.threads = threads
        STAR.path = star_dir

        if genome_fasta:
            # Build the genome index before aligning.
            # NOTE(review): annotation_gtf and the junction/overhang options are
            # hard-coded to None here instead of forwarding the method's
            # parameters -- confirm this is intentional (index without
            # annotation, annotation applied only at the align step).
            STAR.index(genome_dir,
                       genome_fasta,
                       annotation_gtf=None,
                       junction_tab_file=None,
                       sjdboverhang=None,
                       genomeSAindexNbases=None,
                       genomeChrBinNbits=None,
                       genome_size=genome_size)

        # Explicit sample list wins; otherwise discover samples on disk.
        sample_list = samples_to_handle if samples_to_handle else self.get_sample_list(
            samples_directory)
        self.prepare_diff_expression_directories(output_directory, sample_list)

        alignment_dir = "%s/alignment/" % output_directory

        # sample -> (feature id -> read count), accumulated over all samples.
        count_table = TwoLvlDict()
        for sample in sample_list:
            print("Handling %s" % sample)
            sample_dir = "%s/%s/" % (samples_directory, sample)
            alignment_sample_dir = "%s/%s/" % (alignment_dir, sample)
            # Split the sample's read files into forward/reverse lists.
            filetypes, forward_files, reverse_files = self.make_lists_forward_and_reverse_files(
                sample_dir)

            print "\tAligning reads..."

            STAR.align(
                genome_dir,
                forward_files,
                reverse_read_list=reverse_files,
                annotation_gtf=annotation_gtf,
                feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
                exon_tag_to_use_as_transcript_id=
                exon_tag_to_use_as_transcript_id,
                exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                length_of_sequences_flanking_junction=
                length_of_sequences_flanking_junction,
                junction_tab_file_list=junction_tab_file_list,
                three_prime_trim=three_prime_trim,
                five_prime_trim=five_prime_trim,
                adapter_seq_for_three_prime_clip=
                adapter_seq_for_three_prime_clip,
                max_mismatch_percent_for_adapter_trimming=
                max_mismatch_percent_for_adapter_trimming,
                three_prime_trim_after_adapter_clip=
                three_prime_trim_after_adapter_clip,
                output_type=output_type,
                sort_bam=sort_bam,
                max_memory_for_bam_sorting=max_memory_for_bam_sorting,
                include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                output_unmapped_reads=output_unmapped_reads,
                output_dir=alignment_sample_dir,
                two_pass_mode=two_pass_mode,
                max_intron_length=max_intron_length)

            # Default STAR output name for a coordinate-sorted BAM.
            alignment_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_dir

            print "\tIndexing alignment file..."
            # NOTE(review): shells out without checking the exit status --
            # a failed samtools index goes unnoticed.
            os.system("samtools index %s" % alignment_file)

            print "\tCounting reads aligned to features..."
            count_file = "%s/%s.htseq.count" % (alignment_sample_dir, sample)

            HTSeq.count(alignment_file,
                        gff_for_htseq,
                        count_file,
                        samtype="bam",
                        order="pos",
                        stranded_rnaseq=stranded_rnaseq,
                        min_alignment_quality=min_alignment_quality,
                        feature_type=feature_type_for_htseq,
                        feature_id_attribute=feature_id_attribute_for_htseq,
                        mode=htseq_mode,
                        suppress_progres_report=False)

            # Parse the two-column htseq-count output (feature id \t count).
            # comments_prefix="__" presumably skips htseq's summary rows such
            # as "__no_feature" -- confirm against SynDict.read semantics.
            sample_counts = SynDict()
            sample_counts.read(count_file,
                               header=False,
                               separator="\t",
                               allow_repeats_of_key=False,
                               split_values=False,
                               values_separator=",",
                               key_index=0,
                               value_index=1,
                               close_after_if_file_object=False,
                               expression=None,
                               comments_prefix="__")
            count_table[sample] = sample_counts

        count_table.write(count_table_file)
parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="Input fam file")
parser.add_argument("-d", "--id_file", action="store", dest="id_file", required=True,
                    help="File with ids of families to extract")
parser.add_argument("-o", "--output", action="store", dest="output", default="stdout",
                    help="File to write extracted families. Default - stdout")
parser.add_argument("-v", "--verbose", action="store_true", dest="verbose",
                    help="Print not found ids. Default - no")

args = parser.parse_args()

# Sink for the extracted families: stdout by default, otherwise a real file.
if args.output == "stdout":
    out_file = sys.stdout
else:
    out_file = open(args.output, "w")

# Full family table and the subset of family ids requested by the caller.
fam_dict = SynDict()
fam_dict.read(args.input)

id_set = IdSet()
id_set.read(args.id_file)

# Copy over only the requested families; optionally report the misses.
extracted_dict = SynDict()
for family_id in id_set:
    if family_id not in fam_dict:
        if args.verbose:
            print("%s was not found" % family_id)
        continue
    extracted_dict[family_id] = fam_dict[family_id]

extracted_dict.write(out_file, close_after_if_file_object=True)

Beispiel #26
0
    help=
    "Remove nucleotide substitutions from output(preserve only AA substitutions)"
)
parser.add_argument("-c",
                    "--convert_aa_to_single_letter",
                    action="store_true",
                    dest="convert_to_single_letter",
                    help="Convert aminoacids to single letters")

args = parser.parse_args()

# Expand the comma-separated input argument into a flat list of file paths.
args.input = make_list_of_path_to_files(args.input)

# Optional gene id -> alias mapping; stays empty when no alias file is given.
gene_alias_dict = SynDict()
if args.gene_alias_file:
    gene_alias_dict.read(args.gene_alias_file, split_values=False)
out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

# file name -> per-file results, filled by the loop that follows this excerpt.
summary_dict = TwoLvlDict()
for filename in args.input:
    directory, prefix, extension = split_filename(filename)

    if args.write_dir_path and args.write_ext:
        name = filename
    elif args.write_dir_path:
        name = (directory + prefix) if directory else prefix
    elif args.write_ext:
        name = prefix + extension
    else:
        name = prefix
        if args.suffix_to_remove in name:
Beispiel #27
0
    for node in nodes_list:
        if node not in id_list:
            return False
    return True


def check_edge_soft(nodes_list, id_list):
    """Return True if at least one node from nodes_list is present in id_list.

    This is the "soft" membership test (any node matches), as opposed to the
    strict variant that requires every node to be present.

    :param nodes_list: iterable of node identifiers to test
    :param id_list: container supporting the ``in`` operator
    :return: bool
    """
    # Idiomatic form of the original manual loop; short-circuits identically.
    return any(node in id_list for node in nodes_list)


# family id -> list of member ids, loaded from the tab-separated .fam file.
families_dict = SynDict()
families_dict.read(args.fam_file, separator="\t", split_values=True, values_separator=",")

# Create the output directory; ignore the error if it already exists.
try:
    os.mkdir(args.output_dir)
except OSError:
    pass

# Each line of the hclust input becomes one tab-split record.
with open(args.hclust_input, "r") as in_fd:
    graph_list = [line.strip().split("\t") for line in in_fd]


def extract_fam_graph(family_name):
    print("Started extraction for family %s" % family_name)
Beispiel #28
0
                    action="store",
                    dest="output",
                    default="stdout",
                    help="Output file")
parser.add_argument("-k", "--family_column", action="store", dest="fam_col",
                    default=1, type=int,
                    help="Family column position(0-based). Default: 1")
parser.add_argument("-a", "--genes_column", action="store", dest="gen_col",
                    default=0, type=int,
                    help="Genes column position(0-based). Default: 0")

args = parser.parse_args()

# family id (column args.fam_col) -> all gene ids (column args.gen_col)
# that share it; repeated family ids accumulate their genes.
hit_dict = SynDict()

hit_dict.read(args.input,
              header=args.header,
              key_index=args.fam_col,
              value_index=args.gen_col,
              allow_repeats_of_key=True)

hit_dict.write(args.output, splited_values=True)
Beispiel #29
0
                                                   pep_uniq_description_file)
# Shell one-liner: strip trailing "isoform ..." text from each description so
# that all isoforms of the same gene collapse onto one description line.
remove_isoform_versions_str = "sed s/isoform.*// %s > %s" % (
    pep_uniq_description_file, pep_uniq_description_no_isoform_versions)

# Run the preparation commands (the first two are defined above this excerpt).
for exe_string in get_pep_decription_str, get_uniq_descriptions_str, remove_isoform_versions_str:
    print(exe_string)
    os.system(exe_string)

# Extract the ids of the unique descriptions (awk template defined earlier).
os.system(awk_extract_ids_string % (pep_uniq_description_file, pep_uniq_ids))

# description (column 1) -> comma-separated protein ids (column 0) sharing it.
syn_dict = SynDict()
syn_dict.read(pep_uniq_description_no_isoform_versions,
              header=False,
              separator="\t",
              allow_repeats_of_key=True,
              split_values=True,
              values_separator=",",
              key_index=1,
              value_index=0,
              comments_prefix="#")
syn_dict.write(pep_description_collapsed_isoforms,
               splited_values=True,
               values_separator=",")

# Per-sequence lengths -- presumably used below (outside this excerpt) to pick
# the longest isoform per description; also persisted to len_file.
length_dict = SequenceRoutines.get_lengths_from_seq_file(args.input,
                                                         format="fasta",
                                                         out_file=len_file)

# Output handles for the collapsed/longest-isoform tables; written and closed
# by code beyond the visible end of this excerpt.
descr_with_len_fd = open(pep_description_collapsed_isoforms_with_len, "w")
descr_longest_isoform_fd = open(pep_description_longest_isoform, "w")
descr_longest_isoform_ids_fd = open(pep_description_longest_isoform_ids, "w")