Example 1
    def calculate_p_distance(self):
        from RouToolPa.Routines import SequenceRoutines
        if self.parsing_mode == "generator":
            raise ValueError(
                "ERROR!!! P distance calculation was not implemented for generator mode!"
            )

        if self.seq_lengths is None:
            self.get_stats_and_features(count_gaps=False, sort=False)
        seq_len = self.seq_lengths["length"].unique()
        if len(seq_len) > 1:
            raise ValueError("ERROR!!! Some sequences have different length!")
        else:
            seq_len = seq_len[0]

        distance_df = pd.DataFrame(0,
                                   index=self.scaffolds,
                                   columns=self.scaffolds)
        for record_id_a in self.scaffolds:
            for record_id_b in self.scaffolds:
                if record_id_a == record_id_b:
                    continue

                distance_df.loc[record_id_a, record_id_b] = distance_df.loc[
                    record_id_b, record_id_a] = SequenceRoutines.p_distance(
                        self.records[record_id_a], self.records[record_id_b],
                        seq_len)

        return distance_df
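The method above delegates the per-pair computation to SequenceRoutines.p_distance. A minimal sketch of what such a function presumably does, assuming it simply counts mismatching positions between two equally long aligned records (the real routine may treat gaps and ambiguous bases differently):

def p_distance_sketch(record_a, record_b, seq_len):
    # Proportion of sites that differ between two aligned sequences of length seq_len.
    mismatches = sum(1 for a, b in zip(str(record_a.seq), str(record_b.seq)) if a != b)
    return mismatches / seq_len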
Example 2
    def correct_coordinates(self, sequence_dict):
        for primer_pair in self.primer_pair_list:
            primer_pair.left_primer.start = sequence_dict[self.seq_id].find(
                primer_pair.left_primer.seq)
            primer_pair.right_primer.start = sequence_dict[self.seq_id].find(
                SequenceRoutines.reverse_complement(
                    primer_pair.right_primer.seq)
            ) + primer_pair.right_primer.length - 1
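SequenceRoutines.reverse_complement is used here because the right primer anneals to the reverse strand, so its position has to be located via the reverse complement of its sequence; adding length - 1 then converts the match start into the forward-strand coordinate of the primer's last base. A minimal sketch of a reverse-complement helper, assuming plain A/C/G/T/N input (the real routine may also handle IUPAC ambiguity codes):

COMPLEMENT = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}

def reverse_complement_sketch(seq):
    # Complement each base and reverse the order; unknown characters fall back to N.
    return "".join(COMPLEMENT.get(base, "N") for base in reversed(seq.upper()))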
Example 3
    def extract_proteins_from_selected_families(
            families_id_file,
            fam_file,
            pep_file,
            output_dir="./",
            pep_format="fasta",
            out_prefix=None,
            create_dir_for_each_family=False):
        from RouToolPa.Routines import SequenceRoutines

        fam_id_list = IdList()
        fam_dict = SynDict()
        #print(pep_file)
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
        if families_id_file:
            fam_id_list.read(families_id_file)
        fam_dict.read(fam_file, split_values=True, values_separator=",")
        protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

        for fam_id in fam_id_list if families_id_file else fam_dict:
            if fam_id in fam_dict:
                if create_directory_for_each_family:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.pep" % (fam_dir, out_prefix
                                             if out_prefix else fam_id)
                else:
                    out_file = "%s/%s.pep" % (out_dir, out_prefix
                                              if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, fam_dict[fam_id], verbose=True),
                            out_file,
                            format=pep_format)
            else:
                print("%s was not found" % fam_id)

        os.remove("tmp.idx")
Example 4
    help=
    "Format of input and output file. Allowed formats genbank, fasta(default)")
parser.add_argument("-p",
                    "--parsing_mode",
                    action="store",
                    dest="parsing_mode",
                    default="index_db",
                    help="Parsing mode for input sequence file. "
                    "Possible variants: 'index_db'(default), 'index', 'parse'")

args = parser.parse_args()
"""
example of usage

~/Soft/MAVR/scripts/sequence/filter_sequences_by_id_expression.py -i GSS_BOH_BAC_end.fa \
                                                                  -a GSS_BOH_BAC_end.forward.fa \
                                                                  -b GSS_BOH_BAC_end.reverse.fa \
                                                                  -e "\.F$" -p parse

"""
SequenceRoutines.filter_seq_by_reg_expression_from_file(
    args.input,
    args.regular_expression,
    args.filtered_file,
    args.filtered_out_file,
    parsing_mode=args.parsing_mode,
    format=args.format,
    index_file="tmp.idx",
    retain_index=False,
    reg_exp_flags=0)
Example 5
    def extract_proteins_from_output(self,
                                     augustus_output,
                                     protein_output,
                                     evidence_stats_file=None,
                                     supported_by_hints_file=None,
                                     complete_proteins_id_file=None,
                                     id_prefix="p."):
        if evidence_stats_file:
            ev_fd = open(evidence_stats_file, "w")
            ev_fd.write(
                "#gene_id\ttranscript_id\tsupported_fraction\tcds_support\tintron_support\t"
            )
            ev_fd.write(
                "5'UTR_support\t3'UTR_support\tincompatible_hints_groups\tprotein_length\n"
            )

        if supported_by_hints_file:
            sup_fd = open(supported_by_hints_file, "w")
            sup_fd.write(
                "#gene_id\ttranscript_id\tsupported_fraction\tcds_support\tintron_support\t"
            )
            sup_fd.write(
                "5'UTR_support\t3'UTR_support\tincompatible_hints_groups\tprotein_length\n"
            )

        if complete_proteins_id_file:
            complete_fd = open(complete_proteins_id_file, "w")

        with open(protein_output, "w") as out_fd:
            with open(augustus_output, "r") as in_fd:
                for line in in_fd:
                    if line[:12] == "# start gene":
                        gene = line.strip().split()[-1]
                    elif "\ttranscript\t" in line:
                        transcript_id = line.split("\t")[8].split(
                            ";")[0].split("=")[1]
                        start_presence = False
                        stop_presence = False
                        #out_fd.write(">%s%s\t gene=%s\n" % (id_prefix, transcript_id, gene))
                    elif "\tstart_codon\t" in line:
                        start_presence = True
                    elif "\tstop_codon\t" in line:
                        stop_presence = True
                    elif "# protein sequence" in line:
                        protein = line.strip().split("[")[-1]
                        if "]" in protein:
                            protein = protein.split("]")[0]
                        else:
                            while True:
                                part = in_fd.readline().split()[-1]
                                if "]" in part:
                                    protein += part.split("]")[0]
                                    break
                                else:
                                    protein += part
                        if complete_proteins_id_file:
                            #print "AAAAA"
                            #print (start_presence, stop_presence)
                            if start_presence and stop_presence:
                                complete_fd.write("%s%s\n" %
                                                  (id_prefix, transcript_id))

                        out_fd.write(
                            ">%s%s\t gene=%s start_presence=%s stop_presence=%s\n"
                            % (id_prefix, transcript_id, gene,
                               str(start_presence), str(stop_presence)))
                        out_fd.write(protein)
                        protein_len = len(protein)
                        out_fd.write("\n")

                    elif evidence_stats_file or supported_by_hints_file:
                        if line[:17] == "# % of transcript":
                            supported_fraction = line.strip().split()[-1]
                            while True:
                                tmp_line = in_fd.readline()
                                if tmp_line[:12] == "# CDS exons:":
                                    cds_support = tmp_line.strip().split()[-1]
                                elif tmp_line[:14] == "# CDS introns:":
                                    introns_support = tmp_line.strip().split()[-1]
                                elif tmp_line[:13] == "# 5'UTR exons":
                                    five_utr_support = tmp_line.strip().split()[-1]
                                elif tmp_line[:13] == "# 3'UTR exons":
                                    three_introns_support = tmp_line.strip().split()[-1]
                                elif tmp_line[:27] == "# incompatible hint groups:":
                                    incompatible_hint_groups = tmp_line.strip().split()[-1]
                                    if evidence_stats_file:
                                        ev_fd.write("%s\t%s\t%s\t" %
                                                    (gene, transcript_id,
                                                     supported_fraction))
                                        ev_fd.write(
                                            "%s\t%s\t%s\t%s\t%s\t%i\n" %
                                            (cds_support, introns_support,
                                             five_utr_support,
                                             three_introns_support,
                                             incompatible_hint_groups,
                                             protein_len))
                                    if supported_by_hints_file and (
                                            float(supported_fraction) > 0):
                                        sup_fd.write("%s\t%s\t%s\t" %
                                                     (gene, transcript_id,
                                                      supported_fraction))
                                        sup_fd.write(
                                            "%s\t%s\t%s\t%s\t%s\t%i\n" %
                                            (cds_support, introns_support,
                                             five_utr_support,
                                             three_introns_support,
                                             incompatible_hint_groups,
                                             protein_len))

                                    break

        if evidence_stats_file:
            ev_fd.close()
        if supported_by_hints_file:
            sup_fd.close()
        if complete_proteins_id_file:
            complete_fd.close()

        self.extract_longest_isoforms(evidence_stats_file,
                                      "%s.longest_pep" % evidence_stats_file,
                                      minimum_supported_fraction=0)
        SequenceRoutines.extract_sequence_by_ids(
            protein_output, "%s.longest_pep.ids" % evidence_stats_file,
            "%s.longest_pep.pep" % evidence_stats_file)

        if supported_by_hints_file:
            supported_by_hints_longest_pep_evidence = "%s.longest_pep" % supported_by_hints_file
            supported_by_hints_longest_pep = "%s.longest_pep.pep" % supported_by_hints_file
            supported_by_hints_longest_pep_ids = "%s.longest_pep.ids" % supported_by_hints_file
            self.extract_longest_isoforms(
                evidence_stats_file,
                supported_by_hints_longest_pep_evidence,
                minimum_supported_fraction=0.00001)
            SequenceRoutines.extract_sequence_by_ids(
                protein_output, supported_by_hints_longest_pep_ids,
                supported_by_hints_longest_pep)

        evidence_files = (evidence_stats_file,
                          "%s.longest_pep" % evidence_stats_file,
                          "%s.longest_pep" % supported_by_hints_file) if supported_by_hints_file else \
                          (evidence_stats_file,)
        for evidence_file in evidence_files:
            print("Drawing transcript support distribution for %s" %
                  evidence_file)
            MatplotlibRoutines.percent_histogram_from_file(
                evidence_file,
                evidence_file,
                column_list=(2, ),
                separator=None,
                comments="#",
                n_bins=20,
                title="Transcript support by hints",
                xlabel="%%",
                ylabel="Number",
                extensions=["svg", "png"],
                legend_location="upper center",
                stats_as_legend=True)
Example 6
import argparse
from RouToolPa.Routines import SequenceRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--input",
                    action="store",
                    dest="input",
                    required=True,
                    help="Input fasta with raw cDS")
parser.add_argument(
    "-s",
    "--stop_codons",
    action="store",
    dest="stop_codons_list",
    default=["TGA", "TAA", "TAG"],
    type=lambda s: s.split(","),
    help=
    "Comma-separated list of stop codons. Can be set using any case and both RNA and DNA alphabet."
    "Default: TGA, TAA, TAG")
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    required=True,
                    help="Output file to write trimmed CDS")

args = parser.parse_args()
print("Using %s as stop codons" % ",".join(args.stop_codons_list))
SequenceRoutines.trim_cds_and_remove_terminal_stop_codons(
    args.input, args.output, args.stop_codons_list)
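A hedged sketch of what trimming a CDS and removing a terminal stop codon involves, under the assumption that the routine drops trailing bases of an incomplete codon and then cuts one final stop codon if present (trim_cds_and_remove_terminal_stop_codons itself works on files and its exact behaviour may differ):

def trim_cds_sketch(cds, stop_codons=("TGA", "TAA", "TAG")):
    cds = cds.upper().replace("U", "T")        # accept both DNA and RNA alphabets
    cds = cds[:len(cds) - (len(cds) % 3)]      # keep only complete codons
    if cds[-3:] in stop_codons:
        cds = cds[:-3]                         # drop the terminal stop codon
    return cds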
Example 7
import argparse

from RouToolPa.Routines import SequenceRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-r",
                    "--reference",
                    action="store",
                    dest="reference",
                    required=True,
                    help="File with reference genome")
parser.add_argument("-s",
                    "--samtools_directory",
                    action="store",
                    dest="samtools_dir",
                    default="",
                    help="Directory with samtools binaries")
parser.add_argument("-p",
                    "--picard_directory",
                    action="store",
                    dest="picard_dir",
                    default="",
                    help="Directory with PICARD jar")

args = parser.parse_args()

SequenceRoutines.prepare_reference_for_GATK(args.reference,
                                            picard_dir=args.picard_dir,
                                            samtools_dir=args.samtools_dir)
Example 8
syn_dict = SynDict()
syn_dict.read(pep_uniq_description_no_isoform_versions,
              header=False,
              separator="\t",
              allow_repeats_of_key=True,
              split_values=True,
              values_separator=",",
              key_index=1,
              value_index=0,
              comments_prefix="#")
syn_dict.write(pep_description_collapsed_isoforms,
               splited_values=True,
               values_separator=",")

length_dict = SequenceRoutines.get_lengths_from_seq_file(args.input,
                                                         format="fasta",
                                                         out_file=len_file)

descr_with_len_fd = open(pep_description_collapsed_isoforms_with_len, "w")
descr_longest_isoform_fd = open(pep_description_longest_isoform, "w")
descr_longest_isoform_ids_fd = open(pep_description_longest_isoform_ids, "w")

for gene in syn_dict:
    len_list = []
    longest_isoform = None
    max_len = 0
    for isoform_id in syn_dict[gene]:
        length = length_dict[isoform_id]
        len_list.append(length)
        if length > max_len:
            max_len = length
            longest_isoform = isoform_id
Example 9
in_fd = open(args.in_file, "r")
max_length_list = []
number_poly_list = []
number_of_UTRs = 0
with open(args.out_file, "w") as out_fd:
    out_fd.write(
        "#main_id\tUTR_length\tmax_homopolymer_length\tnumber_homopolymers\tCoordinates_list\tOther_ids\n"
    )
    for line in in_fd:
        name_line = line.strip()
        sequence = in_fd.readline().strip()
        number_of_UTRs += 1
        coords_list, length_list = SequenceRoutines.find_homopolymers(
            sequence,
            args.nucleotide,
            min_size=args.min_size,
            search_type=args.search_type,
            max_single_insert_size=args.max_single_insert_size,
            max_total_insert_length=args.max_total_insert_length,
            max_number_of_insertions=args.max_number_of_insertions)
        if not coords_list:
            continue
        id_list = name_line.split("|")[1].split(",")
        max_length = max(length_list)
        number_of_homopolymers = len(length_list)
        max_length_list.append(max_length)
        number_poly_list.append(number_of_homopolymers)
        coords_str_list = map(lambda x: "(%i,%i)" % (x[0], x[1]), coords_list)
        out_fd.write(
            "%s\t%i\t%i\t%i\t%s\t%s\n" %
            (id_list[0], len(sequence), max_length, number_of_homopolymers,
             ",".join(coords_str_list), ",".join(id_list)))
Example 10
import argparse
from RouToolPa.Routines import SequenceRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--input",
                    action="store",
                    dest="input",
                    required=True,
                    help="Input genbank file")
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    required=True,
                    help="Output file with species_counts")
parser.add_argument("-f",
                    "--format",
                    action="store",
                    dest="format",
                    default="genbank",
                    help="Format of input file. Default - genbank ")

args = parser.parse_args()

SequenceRoutines.get_id_to_species_accordance_from_file(args.input,
                                                        format=args.format,
                                                        output=args.output)
Example 11
                    dest="prefix",
                    help="Prefix of output files")
parser.add_argument("-o",
                    "--output_directory",
                    action="store",
                    dest="output_dir",
                    help="Directory to write output files")
parser.add_argument("-n",
                    "--num_of_records_per_file",
                    action="store",
                    dest="num_of_records_per_file",
                    type=int,
                    help="Number of sequences per output file")
parser.add_argument("-f",
                    "--num_of_out_files",
                    action="store",
                    dest="num_of_out_files",
                    type=int,
                    help="Number of output files")

args = parser.parse_args()

if args.num_of_records_per_file and args.num_of_out_files:
    raise ValueError("Options -n and -f can't be set simultaneously")

SequenceRoutines.split_fasta(args.input,
                             args.output_dir,
                             num_of_recs_per_file=args.num_of_records_per_file,
                             num_of_files=args.num_of_out_files,
                             output_prefix=args.prefix)
Example 12
                    "--black_list",
                    action="store",
                    dest="black_list",
                    help="File with record ids from black list")
parser.add_argument("-w",
                    "--white_list",
                    action="store",
                    dest="white_list",
                    help="File with record ids from white list")
parser.add_argument("-m",
                    "--masking",
                    action="store",
                    dest="masking",
                    help="0-based BED file with regions to mask")
parser.add_argument("-t",
                    "--trimming",
                    action="store",
                    dest="trimming",
                    help="0-based BED file with regions to trim")
args = parser.parse_args()

SequenceRoutines.correct_sequences_from_file(
    args.input,
    args.output,
    black_list_file=args.black_list,
    white_list_file=args.white_list,
    regions_to_trim_file=args.trimming,
    regions_to_mask_file=args.masking,
    parsing_mode="parse",
    format=args.format)
Example 13
    "-i",
    "--input",
    action="store",
    dest="input",
    required=True,
    type=lambda s: FileRoutines.make_list_of_path_to_files(s.split(",")),
    help=
    "Comma-separated list of  genbank files/directories with transcript annotations"
)
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    required=True,
                    help="Output file")
parser.add_argument("-d",
                    "--id_file",
                    action="store",
                    dest="id_file",
                    help="File with id of transcripts to deal with")

args = parser.parse_args()

if args.id_file:
    id_list = IdList()
    id_list.read(args.id_file)
else:
    id_list = None
SequenceRoutines.extract_introns_from_transcripts_from_genbank_files(
    args.input, args.output, transcript_id_white_list=id_list)
Example 14
import argparse
from RouToolPa.Routines import SequenceRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--input",
                    action="store",
                    dest="input",
                    required=True,
                    help="Input genbank file")
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    required=True,
                    help="Output file with species counts")
parser.add_argument("-f",
                    "--format",
                    action="store",
                    dest="format",
                    default="genbank",
                    help="Format of input file. Default - genbank ")

args = parser.parse_args()

SequenceRoutines.count_species_from_file(args.input,
                                         format=args.format,
                                         output_filename=args.output)
Example 15
import argparse

from RouToolPa.Routines import SequenceRoutines


parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_file_list", action="store", dest="input", required=True,
                    type=lambda s: make_list_of_path_to_files(s.split(",")),
                    help="Comma-separated list of input files/directories with sequences")
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file with renamed sequences")
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="Format of input and output files. Allowed formats genbank, fasta(default)")
parser.add_argument("-s", "--syn_file", action="store", dest="syn_file", required=True,
                    help="File to write correspondences between new and old ids")
parser.add_argument("-n", "--numerical_part_of_id_length", action="store", dest="numerical_part_of_id_length",
                    default=8, type=int,
                    help="Length of numerical part of id. Default: 8")
parser.add_argument("-d", "--id_prefix", action="store", dest="id_prefix", required=True,
                    help="Prefix of new sequence ids")
parser.add_argument("-l", "--clear_description", action="store_true", dest="clear_description", default=False,
                    help="Clear description. Default - False")

args = parser.parse_args()

SequenceRoutines.rename_records_by_sequential_ids_from_files(args.input, args.output, args.syn_file, format=args.format,
                                                             clear_description=args.clear_description,
                                                             record_id_prefix=args.id_prefix,
                                                             length_of_numerical_part=args.numerical_part_of_id_length,
                                                             parse_mode="parse", index_file="temp.idx")

Example 16
import argparse
from RouToolPa.Routines import SequenceRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-a",
                    "--seq_file_a",
                    action="store",
                    dest="seq_file_a",
                    required=True,
                    help="Sequence file A")
parser.add_argument("-b",
                    "--seq_file_b",
                    action="store",
                    dest="seq_file_b",
                    required=True,
                    help="Sequence file B")
parser.add_argument("-o",
                    "--output_prefix",
                    action="store",
                    dest="output_prefix",
                    required=True,
                    help="Prefix of output files")

args = parser.parse_args()

SequenceRoutines.compare_sequences_by_length_from_file(args.seq_file_a,
                                                       args.seq_file_b,
                                                       args.output_prefix)
Example 17
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse
from RouToolPa.Routines import SequenceRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_file", action="store", dest="input", required=True,
                    help="Input file with protein sequences")
parser.add_argument("-o", "--output_file_prefix", action="store", dest="output_prefix", required=True,
                    help="Prefix of output files")
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="Format of input and output files. Allowed formats genbank, fasta(default)")
parser.add_argument("-s", "--stop_codons", action="store", dest="stop_codons",
                    default=("*", "."),
                    type=lambda s: set(s.split(",")),
                    help="Comma-separated list of stop codon symbols. Default - '.', '*'")
args = parser.parse_args()

SequenceRoutines.check_proteins_for_stop_codons_from_file(args.input, args.output_prefix,
                                                          stop_codon_symbol_set=args.stop_codons,
                                                          format=args.format)
Example 18
parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    type=lambda s: FileRoutines.make_list_of_path_to_files(s.split(",")),
                    help="Comma-separated list of files/directories with sequences")
parser.add_argument("-o", "--output_bed_file", action="store", dest="output", required=True,
                    help="Output bed file")
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="Format of input files. Allowed formats genbank, fasta(default)")
parser.add_argument("-w", "--white_list_ids", action="store", dest="white_list_ids",
                    help="File with ids of regions from white list")
parser.add_argument("-b", "--black_list_ids", action="store", dest="black_list_ids",
                    help="File with ids of regions from black list")
parser.add_argument("-e", "--bed_format", action="store", dest="bed_format", default="0-based",
                    help="Format of output bed format. Allowed: 0-based(default), 1-based")
parser.add_argument("-m", "--min_len", action="store", dest="min_len",
                    help="Minimum length of sequence to count. Default: not set")
parser.add_argument("-x", "--max_len", action="store", dest="max_len",
                    help="Maximum length of sequence to count. Default: not set")
parser.add_argument("-p", "--parsing_mode", action="store", dest="parsing_mode", default="index_db",
                    help="Parsing mode for input sequence file. "
                         "Possible variants: 'index_db'(default), 'index', 'parse'")

args = parser.parse_args()

SequenceRoutines.make_region_bed_file_from_file(args.input, args.output, white_id_file=args.white_list_ids,
                                                black_id_file=args.black_list_ids,
                                                output_format=args.bed_format, input_format=args.format,
                                                min_len=args.min_len, max_len=args.max_len,
                                                parsing_mode=args.parsing_mode,
                                                index_file="tmp.idx", retain_index=False)
Example 19
    "-e",
    "--end_column_id",
    action="store",
    dest="end_column_id",
    type=int,
    default=2,
    help="0-based index of column with feature end. Default: 2")
parser.add_argument(
    "-n",
    "--coordinates_type",
    action="store",
    dest="coordinates_type",
    default="1-based",
    help="Type of coordinates. Allowed: 0-based, 1-based(default)")

args = parser.parse_args()

SequenceRoutines.split_sequence_by_regions_from_file(
    args.input,
    args.regions,
    args.output_prefix,
    retain_description=False,
    min_length=args.min_length,
    parsing_mode="parse",
    scaffold_column_index=args.scaffold_column_id,
    start_column_index=args.start_column_id,
    end_column_index=args.end_column_id,
    coordinates_type=args.coordinates_type,
    input_separator="\t",
    sequence_format="fasta")
Example 20
try:
    os.mkdir(args.output_directory)
except OSError:
    pass

split_index = 1
records_written = 0
record_ids_list = list(sequence_dict.keys())
number_of_records = len(record_ids_list)

while (records_written + args.number_of_sequences) <= number_of_records:

    SeqIO.write(SequenceRoutines.record_by_id_generator(
        sequence_dict,
        record_ids_list[records_written:records_written +
                        args.number_of_sequences],
        verbose=True),
                "%s/%s_%i.fasta" %
                (args.splited_directory, args.output_prefix, split_index),
                format="fasta")
    split_index += 1
    records_written += args.number_of_sequences

if records_written != number_of_records:
    SeqIO.write(SequenceRoutines.record_by_id_generator(
        sequence_dict, record_ids_list[records_written:], verbose=True),
                "%s/%s_%i.fasta" %
                (args.splited_directory, args.output_prefix, split_index),
                format="fasta")
Example 21
    "--parsing_mode",
    action="store",
    dest="parsing_mode",
    default="parse",
    help="Parsing mode of sequence files. Allowed: parse, index, index_db."
    "Default: parse")
parser.add_argument(
    "-g",
    "--genetic_code_table",
    action="store",
    dest="genetic_code_table",
    default=1,
    type=int,
    help="Genetic code to use for translation of transcript. "
    "Allowed: table number from http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi"
    "Default: 1(The standard code)")

args = parser.parse_args()

SequenceRoutines.find_cds_coordinates_in_transcript_by_pep_from_file(
    args.transcript_file,
    args.pep_file,
    args.correspondence_file,
    args.output_prefix,
    parsing_mode=args.parsing_mode,
    verbose=args.verbose,
    format=args.format,
    transcript_index_file=None,
    protein_index_file=None,
    genetic_code_table=args.genetic_code_table)
Example 22
                    default="selenocystein_proteins",
                    help="Prefix of output files")
parser.add_argument(
    "-f",
    "--format",
    action="store",
    dest="format",
    default="fasta",
    help=
    "Format of input and output files. Allowed formats genbank, fasta(default)"
)

args = parser.parse_args()

SequenceRoutines.check_selenocystein_presence_from_file(args.input,
                                                        args.output_prefix,
                                                        format=args.format)
"""
tmp_index_file = "temp.idx"

print("Parsing %s..." % args.input_file)
sequence_dict = SeqIO.index_db(tmp_index_file, args.input_file, format=args.format)
selenocystein_ids = []
with open(args.out_prefix + ".ids", "w") as out_fd:
    for record_id in sequence_dict:
        if "U" in sequence_dict[record_id].seq:
            selenocystein_ids.append(record_id)
            out_fd.write(record_id + "\n")
SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_dict,
                                                    selenocystein_ids), "%s_seq.fasta" % args.out_prefix, args.format)
os.remove(tmp_index_file)
Example 23
    "-w",
    "-transcript_with_no_pep_idfile",
    action="store",
    dest="transcript_with_no_pep_idfile",
    help=
    "File to write ids of transcripts with no protein hit. Default: not set")
parser.add_argument(
    "-s",
    "-transcript_with_several_pep_idfile",
    action="store",
    dest="transcript_with_several_pep_idfile",
    help=
    "File to write ids of transcripts with several protein. Default: not set")

args = parser.parse_args()

SequenceRoutines.get_transcript_to_pep_accordance_from_files(
    args.transcript_file,
    args.pep_file,
    args.out,
    verbose=args.verbose,
    parsing_mode=args.parsing_mode,
    genetic_code_table=args.genetic_code_table,
    include_id_check=args.id_check,
    transcript_with_no_pep_idfile=args.transcript_with_no_pep_idfile,
    transcript_with_several_proteins_idfile=args.transcript_with_several_pep_idfile)
if args.parsing_mode == "index_db":
    os.remove("transcript_tmp.idx")
    os.remove("pep_tmp.idx")
Example 24
import argparse
from RouToolPa.Routines import SequenceRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--input_file",
                    action="store",
                    dest="input",
                    required=True,
                    help="Comma separated list of genbank files/directories")
parser.add_argument("-o",
                    "--output_file_prefix",
                    action="store",
                    dest="output_prefix",
                    required=True,
                    help="Prefix of output files")
parser.add_argument(
    "-f",
    "--format",
    action="store",
    dest="format",
    default="genbank",
    help=
    "Format of input and output file. Allowed formats genbank(default), fasta")

args = parser.parse_args()

SequenceRoutines.get_random_species_genomes_from_genbank_file(
    args.input, args.output_prefix, output_type=args.format)
Example 25
parser.add_argument("-i",
                    "--input",
                    action="store",
                    dest="input",
                    help="Genbank file with annotations")
parser.add_argument(
    "--fast_parsing",
    action="store_true",
    dest="fast_parsing",
    help="Fast parsing mode - high memory consumption. Default: false")
parser.add_argument("-o",
                    "--output_prefix",
                    action="store",
                    dest="output_prefix",
                    default="output",
                    help="Prefix of output files")

args = parser.parse_args()

record_dict = SeqIO.to_dict(SeqIO.parse(
    args.input, format="genbank")) if args.fast_parsing else SeqIO.index_db(
        "temp_index.idx", [args.input], format="genbank")

SequenceRoutines.get_protein_marking_by_exons_from_genbank(
    record_dict,
    args.output_prefix,
    protein_id_field_in_cds_feature="protein_id")

#os.remove("temp_index.idx")
Example 26
parser.add_argument("-o",
                    "--output_file_prefix",
                    action="store",
                    dest="output_prefix",
                    required=True,
                    help="Prefix of output files")
parser.add_argument(
    "-f",
    "--format",
    action="store",
    dest="format",
    default="fasta",
    help=
    "Format of input and output files. Allowed formats genbank, fasta(default)"
)
parser.add_argument("-p",
                    "--print_stats",
                    action="store_true",
                    dest="print_stats",
                    help="Print stats. Default: False")

args = parser.parse_args()

SequenceRoutines.count_softmasked_nucleotides_from_file(
    args.input,
    args.output_prefix,
    verbose=args.print_stats,
    parsing_mode="parse",
    format=args.format,
    index_file=None)
Example 27
args = parser.parse_args()

if args.labels_list is not None:
    if len(args.labels_list) != len(args.input_file_list):
        raise ValueError(
            "Length of labels list is not equal to number of files with assemblies"
        )

assemblies_dict = OrderedDict()
for i in range(0, len(args.input_file_list)):
    assembly_label = args.labels_list[i] if args.labels_list else "A%i" % (i + 1)
    tmp_index = "%s.tmp.idx" % assembly_label
    assemblies_dict[assembly_label] = SequenceRoutines.parse_seq_file(
        args.input_file_list[i],
        args.parsing_mode,
        format=args.format,
        index_file=tmp_index)
    #SeqIO.index_db(tmp_index, args.input_file_list[i],format=args.format)

assembly_N50_dict = TwoLvlDict()
assembly_L50 = TwoLvlDict()
assembly_bins = []
assembly_contig_cumulative_length = OrderedDict()
assembly_contig_number_values = OrderedDict()
assembly_general_stats = TwoLvlDict()
assembly_length_array = OrderedDict()
assembly_lengths = TwoLvlDict()
for assembly in assemblies_dict:
    lengths_array, N50_dict, L50_dict, length_dict, total_length, longest_contig, Ns_number, bins, contig_cumulative_length_values, \
        contig_number_values = SequenceRoutines.calculate_assembly_stats(assemblies_dict[assembly],
Example 28
                    default=1,
                    help="Maximum number of sequences per region. Default: 1")
parser.add_argument("-b",
                    "--scaffold_black_list_file",
                    action="store",
                    dest="scaffold_black_list_file",
                    type=lambda s: IdList(filename=s),
                    help="File with scaffolds from black list")
parser.add_argument(
    "-x",
    "--min_scaffold_len",
    action="store",
    dest="min_scaffold_len",
    type=int,
    default=None,
    help=
    "Minimum length of scaffold to be included in regions. Default: not set")

args = parser.parse_args()

SequenceRoutines.prepare_region_list_by_length(
    max_length=args.max_length,
    max_seq_number=args.max_seq_number,
    length_dict=None,
    reference=args.reference,
    parsing_mode="parse",
    output_dir=args.output_dir,
    split_scaffolds=args.split_scaffolds,
    min_scaffold_length=args.min_scaffold_len,
    black_list_scaffolds=args.scaffold_black_list_file)
Example 29
                          use_softmasking=args.softmasking,
                          hints_file=args.hintsfile,
                          extrinsicCfgFile=args.extrinsicCfgFile,
                          predict_UTR=args.predict_UTR,
                          parsing_mode="parse")

AUGUSTUS.replace_augustus_ids(output_raw_gff,
                              args.output,
                              species_prefix=args.species_prefix,
                              number_of_digits_in_id=8)

Gffread.extract_transcript_sequences(output_gff, args.input, args.output)

SequenceRoutines.trim_cds_and_remove_terminal_stop_codons(
    "%s.cds" % args.output,
    "%s.trimmed.cds" % args.output,
    stop_codons_list=("TGA", "TAA", "TAG")
)  # default stop codons from the universal genetic code; note that this will affect mtDNA proteins

SequenceRoutines.translate_sequences_from_file(
    "%s.trimmed.cds" % args.output,
    "%s.trimmed.pep" % args.output,
    format="fasta",
    id_expression=None,
    genetic_code_table=1,
    translate_to_stop=False,
    prefix_of_file_inframe_stop_codons_seqs=prefix_of_file_inframe_stop_codons_seqs)  # Universal code !!!

AUGUSTUS.extract_gene_ids_from_output(output_gff, all_annotated_genes_ids)
AUGUSTUS.extract_CDS_annotations_from_output(output_gff, CDS_gff)
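The translation step above relies on the standard genetic code (table 1). A hedged sketch of the core operation with Biopython, assuming simple FASTA in and out and ignoring the in-frame stop codon reporting that translate_sequences_from_file provides:

from Bio import SeqIO

def translate_fasta_sketch(cds_fasta, pep_fasta):
    # Translate every trimmed CDS record with NCBI table 1 and write the proteins.
    records = []
    for record in SeqIO.parse(cds_fasta, "fasta"):
        record.seq = record.seq.translate(table=1, to_stop=False)
        records.append(record)
    SeqIO.write(records, pep_fasta, "fasta")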
Example 30
    help="File to write ids of extracted sequences. Default - don't write")
parser.add_argument(
    "-p",
    "--parsing_mode",
    action="store",
    dest="parsing_mode",
    default='parse',
    help="Parsing mode. Allowed: parse(default), index, index_db")

args = parser.parse_args()

SequenceRoutines.extract_sequences_by_length_from_file(
    args.input_file,
    args.output_file,
    min_len=args.min_length,
    max_len=args.max_length,
    format=args.format,
    tmp_index_file="tmp.idx",
    id_file=args.id_file,
    parsing_mode=args.parsing_mode)
"""
if (args.min_length is None) and (args.max_length is None):
    raise ValueError("Neither minimum nor maximum length was set")
elif (args.min_length is not None) and (args.max_length is not None) and (args.min_length > args.max_length):
    raise ValueError("Minimum length is greater than maximum length")

tmp_index_file = "temp.idx"

print("Parsing %s..." % args.input_file)
sequence_dict = SeqIO.index_db(tmp_index_file, args.input_file, format=args.format)