Beispiel #1
0
    def extract_proteins_from_selected_families(
            families_id_file,
            fam_file,
            pep_file,
            output_dir="./",
            pep_format="fasta",
            out_prefix=None,
            create_dir_for_each_family=False):
        """
        Write the proteins of selected families to per-family sequence files.

        :param families_id_file: file with ids of families to extract; if it is
                                 empty/None, every family from fam_file is extracted
        :param fam_file: family file; read with comma-separated values per family
        :param pep_file: file with protein sequences, indexed via SeqIO.index_db
        :param output_dir: directory for the output (created if necessary)
        :param pep_format: format of pep_file and of the written files
        :param out_prefix: if set, a directory is created for each family and the
                           file inside it is named "<out_prefix>.pep"
        :param create_dir_for_each_family: create a directory per family even
                                           when out_prefix is not set
        :return: None

        Families that are requested but absent from fam_file are reported on
        stdout and skipped. The temporary index "tmp.idx" is created in the
        current working directory and removed on exit, including on failure.
        """
        from Routines import SequenceRoutines, FileRoutines
        fam_id_list = IdList()
        fam_dict = SynDict()
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        # A shared prefix would make all families collide in one file,
        # so a prefix always forces one directory per family.
        create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
        if families_id_file:
            fam_id_list.read(families_id_file)
        fam_dict.read(fam_file, split_values=True, values_separator=",")

        index_file = "tmp.idx"
        try:
            protein_dict = SeqIO.index_db(index_file, pep_file, format=pep_format)

            for fam_id in fam_id_list if families_id_file else fam_dict:
                if fam_id not in fam_dict:
                    print("%s was not found" % fam_id)
                    continue

                file_name = out_prefix if out_prefix else fam_id
                if create_directory_for_each_family:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.pep" % (fam_dir, file_name)
                else:
                    out_file = "%s/%s.pep" % (out_dir, file_name)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, fam_dict[fam_id], verbose=True),
                            out_file,
                            format=pep_format)
        finally:
            # Never leave the index behind: a stale index would be picked up
            # by the next index_db call that uses the same file name.
            if os.path.exists(index_file):
                os.remove(index_file)
Beispiel #2
0
                    "--columns_separator",
                    action="store",
                    dest="separator",
                    default="\t",
                    help="Column separator in file with synonyms")
# Boolean switches: both are off unless given on the command line.
parser.add_argument("-e", "--header", action="store_true", dest="header", default=False,
                    help="Header is present in synonyms file. Default - False")
parser.add_argument("-l", "--clear_description", action="store_true", dest="clear_description",
                    default=False,
                    help="Clear description. Default - False")
args = parser.parse_args()

# Input, output and synonym file go positionally; every other option is
# forwarded by keyword under the name the routine expects.
SequenceRoutines.rename_records_from_files(args.input, args.output, args.syn_file,
                                           format=args.format,
                                           header=args.header,
                                           separator=args.separator,
                                           key_index=args.key_index,
                                           value_index=args.value_index,
                                           clear_description=args.clear_description,
                                           comments_prefix=args.comments_prefix)
Beispiel #3
0
# Command-line interface: CDS file, protein file and output file are mandatory.
parser.add_argument("-c", "--cds_file", action="store", dest="cds_file", required=True,
                    help="Input file CDS sequences")
parser.add_argument("-p", "--pep_file", action="store", dest="pep_file", required=True,
                    help="Input file protein sequences")
parser.add_argument("-o", "--output_file", action="store", dest="out", required=True,
                    help="Output file")
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="Format of input files. Allowed: fasta, genbank. Default: fasta")
parser.add_argument("-v", "--verbose", action="store_true", dest="verbose",
                    help="Print warning if no protein was found for CDS")
# Adjacent string literals are concatenated verbatim, so each piece must carry
# its own trailing space to keep the rendered help text readable.
parser.add_argument("-m", "--parsing_mode", action="store", dest="parsing_mode", default="parse",
                    help="Parsing mode of sequence files. Allowed: parse, index, index_db. "
                         "Default: parse")
parser.add_argument("-t", "--genetic_code_table", action="store", dest="genetic_code_table", default=1, type=int,
                    help="Genetic code to use for translation of CDS. "
                         "Allowed: table number from http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi "
                         "Default: 1(The standard code)")
parser.add_argument("-d", "--id_check", action="store_true", dest="id_check",
                    help="Also use id check - if there is id present in both files consider them as accordance")
args = parser.parse_args()

SequenceRoutines.get_cds_to_pep_accordance_from_files(args.cds_file, args.pep_file, args.out, verbose=args.verbose,
                                                      parsing_mode=args.parsing_mode,
                                                      genetic_code_table=args.genetic_code_table,
                                                      include_id_check=args.id_check)
# NOTE(review): these index files are presumably created by the routine above
# when parsing_mode is "index_db" - confirm the names against SequenceRoutines.
if args.parsing_mode == "index_db":
    os.remove("cds_tmp.idx")
    os.remove("pep_tmp.idx")


                    type=FileRoutines.make_list_of_path_to_files_from_string,
                    help="Comma-separated list of input files/directories with sequences")
parser.add_argument("-o", "--output_directory", action="store", dest="output", type=FileRoutines.check_path,
                    help="Directory to output groups_of sequences")
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="Format of input and output files. Allowed formats genbank, fasta(default)")
parser.add_argument("-e", "--extension", action="store", dest="extension",
                    help="Extension of output files. Default: equal to -f")
parser.add_argument("-d", "--id_file", action="store", dest="id_file",
                    help="File with groups of sequences to extract(.fam file).")

args = parser.parse_args()

FileRoutines.safe_mkdir(args.output)
# The output extension falls back to the sequence format name.
args.extension = args.extension if args.extension else args.format
tmp_index_file = "temp.idx"

# Mapping: group id -> ids of the sequences that belong to the group.
sequence_groups_id = SynDict()
sequence_groups_id.read(args.id_file, split_values=True)

try:
    sequence_dict = SeqIO.index_db(tmp_index_file, args.input, format=args.format)
    # One output file per group, named "<output dir><group>.<extension>".
    for group in sequence_groups_id:
        SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_dict, sequence_groups_id[group],
                                                            verbose=True),
                    "%s%s.%s" % (args.output, group, args.extension), format=args.format)
finally:
    # Remove the temporary index even if indexing or writing failed, so that
    # a stale index is never reused by a later run.
    if os.path.exists(tmp_index_file):
        os.remove(tmp_index_file)
Beispiel #5
0
    dest="min_len",
    help="Minimum length of sequence to count. Default: not set")
parser.add_argument("-x", "--max_len", action="store", dest="max_len",
                    help="Maximum length of sequence to count. Default: not set")
parser.add_argument("-p", "--parsing_mode", action="store", dest="parsing_mode", default="index_db",
                    help="Parsing mode for input sequence file. "
                         "Possible variants: 'index_db'(default), 'index', 'parse'")

args = parser.parse_args()

# Input and output paths are positional; filters and formats go by keyword.
# The index file name is fixed and the routine is asked not to retain it.
SequenceRoutines.make_region_bed_file_from_file(args.input, args.output,
                                                white_id_file=args.white_list_ids,
                                                black_id_file=args.black_list_ids,
                                                output_format=args.bed_format,
                                                input_format=args.format,
                                                min_len=args.min_len,
                                                max_len=args.max_len,
                                                parsing_mode=args.parsing_mode,
                                                index_file="tmp.idx",
                                                retain_index=False)
Beispiel #6
0
from Routines import SequenceRoutines

parser = argparse.ArgumentParser()

# Both the input and the output prefix are mandatory; the format is optional.
parser.add_argument("-i", "--input_file", action="store", dest="input", required=True,
                    help="Comma separated list of genbank files/directories")
parser.add_argument("-o", "--output_file_prefix", action="store", dest="output_prefix", required=True,
                    help="Prefix of output files")
parser.add_argument("-f", "--format", action="store", dest="format", default="genbank",
                    help="Format of input and output file. Allowed formats genbank(default), fasta")

args = parser.parse_args()

# The -f value is handed to the routine as its output type.
SequenceRoutines.get_random_species_genomes_from_genbank_file(args.input,
                                                              args.output_prefix,
                                                              output_type=args.format)
Beispiel #7
0
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from Routines import SequenceRoutines, FileRoutines

parser = argparse.ArgumentParser()

# The -i value is split on commas and expanded to a list of file paths
# while the arguments are being parsed.
parser.add_argument("-i", "--input", action="store", dest="input",
                    type=lambda path_string: FileRoutines.make_list_of_path_to_files(path_string.split(",")),
                    help="Comma-separated list of genbank files/directories")
parser.add_argument("-o", "--output", action="store", dest="output",
                    help="Output file")

args = parser.parse_args()

SequenceRoutines.extract_exon_lengths_from_genbank_file(args.input, args.output)
Beispiel #8
0
import argparse

from Routines import SequenceRoutines

parser = argparse.ArgumentParser()

# Input and output are mandatory; the input format defaults to genbank.
parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="Input genbank file")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="Output file with species_counts")
parser.add_argument("-f", "--format", action="store", dest="format", default="genbank",
                    help="Format of input file. Default - genbank ")

args = parser.parse_args()

SequenceRoutines.get_id_to_species_accordance_from_file(args.input, format=args.format, output=args.output)
Beispiel #9
0
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import argparse

from CustomCollections.GeneralCollections import IdList
from Routines import SequenceRoutines, FileRoutines


parser = argparse.ArgumentParser()

# The -i value is split on commas and expanded to a list of file paths
# while the arguments are being parsed.
parser.add_argument(
    "-i",
    "--input",
    action="store",
    dest="input",
    required=True,
    type=lambda s: FileRoutines.make_list_of_path_to_files(s.split(",")),
    help="Comma-separated list of  genbank files/directories with transcript annotations")
parser.add_argument(
    "-o",
    "--output",
    action="store",
    dest="output",
    required=True,
    help="Output file")
parser.add_argument(
    "-d",
    "--id_file",
    action="store",
    dest="id_file",
    help="File with id of transcripts to deal with")

args = parser.parse_args()

# Without -d no white list is passed (None); otherwise the ids are read
# from the given file.
id_list = None
if args.id_file:
    id_list = IdList()
    id_list.read(args.id_file)

SequenceRoutines.extract_introns_from_transcripts_from_genbank_files(
    args.input,
    args.output,
    transcript_id_white_list=id_list)
Beispiel #10
0
    "-s",
    "--separator",
    action="store",
    dest="separator",
    default="@",
    help="Separator between species name and sequence id. Default - '@'")
parser.add_argument("-r",
                    "--label_last",
                    action="store_false",
                    dest="label_first",
                    default=True,
                    help="Species label is at the end of id")

args = parser.parse_args()

input_file_list = sorted(os.listdir(args.input_dir))
FileRoutines.safe_mkdir(args.output_dir)

# Position of the species label among the separator-delimited fields:
# the first field by default, the LAST field with -r. Index -1 (rather
# than 1) keeps this correct for ids that contain the separator more than once.
label_index = 0 if args.label_first else -1


def id_expression(record_id):
    """Return the species label cut out of a record id."""
    return record_id.split(args.separator)[label_index]


for filename in input_file_list:
    # os.path.join works whether or not the directory arguments were given
    # with a trailing path separator.
    input_file = os.path.join(args.input_dir, filename)
    output_file = os.path.join(args.output_dir, filename)
    SequenceRoutines.rename_records_from_files(
        input_file,
        output_file,
        record_id_expression=id_expression,
        clear_description=True)
Beispiel #11
0
                    "--output_file",
                    action="store",
                    dest="output",
                    required=True,
                    help="Output file with translated sequences")
parser.add_argument(
    "-g",
    "--genetic_code",
    action="store",
    dest="code",
    type=int,
    default=1,
    help=
    "Genetic code to use. Set by number of ncbi code. Default - 1 (universal)")
parser.add_argument(
    "-s",
    "--translate_to_stop",
    action="store_true",
    dest="translate_to_stop",
    help=
    "Translate to first in-frame stop codon. Default - False(translate whole sequence)"
)
args = parser.parse_args()

# translate_to_stop must follow the -s flag: the documented default is to
# translate the whole sequence, and stopping at the first in-frame stop
# codon is opt-in.
SequenceRoutines.translate_sequences_from_file(args.input,
                                               args.output,
                                               format="fasta",
                                               id_expression=None,
                                               genetic_code_table=args.code,
                                               translate_to_stop=args.translate_to_stop)
Beispiel #12
0
    def extract_proteins_from_output(self, augustus_output, protein_output, evidence_stats_file=None,
                                     supported_by_hints_file=None, complete_proteins_id_file=None, id_prefix="p."):
        """
        Extract proteins and hint-support statistics from a gene prediction output file.

        The input is scanned line by line. "# start gene" lines set the current
        gene, "transcript" feature lines set the current transcript id,
        "start_codon"/"stop_codon" feature lines mark the transcript as having
        the corresponding codon, and "# protein sequence" comments (possibly
        spanning several lines, terminated by "]") give the protein itself.

        :param augustus_output: input file to parse
        :param protein_output: fasta-like file to write proteins to; each header
                               carries gene id and start/stop codon presence
        :param evidence_stats_file: if set, per-transcript support statistics are
                                    written here and post-processed (longest
                                    isoforms, histograms)
        :param supported_by_hints_file: if set, statistics of transcripts with a
                                        supported fraction > 0 are written here
        :param complete_proteins_id_file: if set, ids of proteins having both a
                                          start and a stop codon are written here
        :param id_prefix: prefix prepended to transcript ids to build protein ids
        :return: None
        """
        stats_header = "#gene_id\ttranscript_id\tsupported_fraction\tcds_support\tintron_support\t" \
                       "5'UTR_support\t3'UTR_support\tincompatible_hints_groups\tprotein_length\n"
        stats_template = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%i\n"

        ev_fd = None
        sup_fd = None
        complete_fd = None

        try:
            if evidence_stats_file:
                ev_fd = open(evidence_stats_file, "w")
                ev_fd.write(stats_header)

            # Each optional output is guarded by its own argument, so any
            # combination of the three files may be requested.
            if supported_by_hints_file:
                sup_fd = open(supported_by_hints_file, "w")
                sup_fd.write(stats_header)

            if complete_proteins_id_file:
                complete_fd = open(complete_proteins_id_file, "w")

            with open(protein_output, "w") as out_fd:
                with open(augustus_output, "r") as in_fd:
                    for line in in_fd:
                        if line[:12] == "# start gene":
                            gene = line.strip().split()[-1]
                        elif "\ttranscript\t" in line:
                            # Id is the value of the first key=value pair in column 9.
                            transcript_id = line.split("\t")[8].split(";")[0].split("=")[1]
                            start_presence = False
                            stop_presence = False
                        elif "\tstart_codon\t" in line:
                            start_presence = True
                        elif "\tstop_codon\t" in line:
                            stop_presence = True
                        elif "# protein sequence" in line:
                            protein = line.strip().split("[")[-1]
                            if "]" in protein:
                                protein = protein.split("]")[0]
                            else:
                                # The sequence continues on following comment lines
                                # until the closing bracket; next() consumes them from
                                # the same iterator the outer loop uses.
                                while True:
                                    part = next(in_fd).split()[-1]
                                    if "]" in part:
                                        protein += part.split("]")[0]
                                        break
                                    else:
                                        protein += part

                            if complete_fd is not None and start_presence and stop_presence:
                                complete_fd.write("%s%s\n" % (id_prefix, transcript_id))

                            out_fd.write(">%s%s\t gene=%s start_presence=%s stop_presence=%s\n" % (id_prefix,
                                                                                                   transcript_id,
                                                                                                   gene,
                                                                                                   str(start_presence),
                                                                                                   str(stop_presence)))
                            out_fd.write(protein)
                            protein_len = len(protein)
                            out_fd.write("\n")

                        elif evidence_stats_file or supported_by_hints_file:
                            if line[:17] == "# % of transcript":
                                supported_fraction = line.strip().split()[-1]
                                # Read the evidence block up to and including the
                                # "incompatible hint groups" line, which closes it.
                                while True:
                                    tmp_line = next(in_fd)
                                    if tmp_line[:12] == "# CDS exons:":
                                        cds_support = tmp_line.strip().split()[-1]
                                    elif tmp_line[:14] == "# CDS introns:":
                                        introns_support = tmp_line.strip().split()[-1]
                                    elif tmp_line[:13] == "# 5'UTR exons":
                                        five_utr_support = tmp_line.strip().split()[-1]
                                    elif tmp_line[:13] == "# 3'UTR exons":
                                        three_utr_support = tmp_line.strip().split()[-1]
                                    elif tmp_line[:27] == "# incompatible hint groups:":
                                        incompatible_hint_groups = tmp_line.strip().split()[-1]
                                        write_evidence = ev_fd is not None
                                        write_supported = (sup_fd is not None) and (float(supported_fraction) > 0)
                                        if write_evidence or write_supported:
                                            stats_line = stats_template % (gene, transcript_id, supported_fraction,
                                                                           cds_support, introns_support,
                                                                           five_utr_support, three_utr_support,
                                                                           incompatible_hint_groups, protein_len)
                                            if write_evidence:
                                                ev_fd.write(stats_line)
                                            if write_supported:
                                                sup_fd.write(stats_line)
                                        break
        finally:
            # Flush and release every optional output before post-processing
            # reads the evidence file back from disk.
            for fd in (ev_fd, sup_fd, complete_fd):
                if fd is not None:
                    fd.close()

        if not evidence_stats_file:
            # Every post-processing step below takes the evidence file as its
            # input, so there is nothing to do without it.
            return

        self.extract_longest_isoforms(evidence_stats_file, "%s.longest_pep" % evidence_stats_file,
                                      minimum_supported_fraction=0)
        SequenceRoutines.extract_sequence_by_ids(protein_output,
                                                 "%s.longest_pep.ids" % evidence_stats_file,
                                                 "%s.longest_pep.pep" % evidence_stats_file)

        if supported_by_hints_file:
            supported_by_hints_longest_pep_evidence = "%s.longest_pep" % supported_by_hints_file
            supported_by_hints_longest_pep = "%s.longest_pep.pep" % supported_by_hints_file
            supported_by_hints_longest_pep_ids = "%s.longest_pep.ids" % supported_by_hints_file
            # Longest isoforms are picked from the full evidence file, restricted
            # to transcripts with a non-zero supported fraction.
            self.extract_longest_isoforms(evidence_stats_file, supported_by_hints_longest_pep_evidence,
                                          minimum_supported_fraction=0.00001)
            SequenceRoutines.extract_sequence_by_ids(protein_output,
                                                     supported_by_hints_longest_pep_ids,
                                                     supported_by_hints_longest_pep)

        evidence_files = (evidence_stats_file,
                          "%s.longest_pep" % evidence_stats_file,
                          "%s.longest_pep" % supported_by_hints_file) if supported_by_hints_file else \
                          (evidence_stats_file,)
        for evidence_file in evidence_files:
            print("Drawing transcript support distribution for %s" % evidence_file)
            MatplotlibRoutines.percent_histogram_from_file(evidence_file,
                                                           evidence_file,
                                                           column_list=(2,), separator=None,
                                                           comments="#", n_bins=20,
                                                           title="Transcript support by hints",
                                                           xlabel="%%", ylabel="Number",
                                                           extensions=["svg", "png"],
                                                           legend_location="upper center",
                                                           stats_as_legend=True)
Beispiel #13
0
parser = argparse.ArgumentParser()

parser.add_argument(
    "-i",
    "--input_file",
    action="store",
    dest="input_file",
    help="Input genbank file with sequences")
parser.add_argument(
    "-o",
    "--output_file",
    action="store",
    dest="out_file",
    help="Output file")

args = parser.parse_args()

# On-disk index used by SeqIO.index_db for the input file.
tmp_index_file = "temp.idx"

print("Parsing %s..." % args.input_file)
sequence_dict = SeqIO.index_db(tmp_index_file,
                               args.input_file,
                               format="genbank")
gaps_dict = SequenceRoutines.find_gaps(sequence_dict)

with open(args.out_file, "w") as out_fd:
    out_fd.write("#sequence_id\tspecies\tlength\tlineage\treferences\tgenes\trRNA\n")
    for record_id in sequence_dict:
        species = sequence_dict[record_id].annotations["organism"]
        lineage = sequence_dict[record_id].annotations["taxonomy"]
        length = len(sequence_dict[record_id].seq)
        references = ",".join(map(lambda x: x.title, sequence_dict[record_id].annotations['references']))

        protein_genes_list = []
        rRNA_genes_list = []
        for feature in sequence_dict[record_id].features:
            if feature.type == "gene":
                if "gene" in feature.qualifiers:
                    gene_name = feature.qualifiers["gene"][0]
Beispiel #14
0
                    help="Input file with sequences")
parser.add_argument("-o",
                    "--output_file",
                    action="store",
                    dest="out_file",
                    help="Output gff with gaps")
parser.add_argument(
    "-f",
    "--format",
    action="store",
    dest="format",
    default="fasta",
    help="Format of input. Allowed formats genbank, fasta(default)")

args = parser.parse_args()

# On-disk index used by SeqIO.index_db for the input file.
tmp_index_file = "temp.idx"

print("Parsing %s..." % args.input_file)
try:
    sequence_dict = SeqIO.index_db(tmp_index_file,
                                   args.input_file,
                                   format=args.format)
    gaps_dict = SequenceRoutines.find_gaps(sequence_dict)

    with open(args.out_file, "w") as out_fd:
        GFF.write(
            SequenceRoutines.record_by_id_generator(gaps_dict,
                                                    gaps_dict.keys(),
                                                    verbose=True), out_fd)
finally:
    # Remove the temporary index even if parsing or writing failed, so that
    # a stale index is never reused by a later run.
    if os.path.exists(tmp_index_file):
        os.remove(tmp_index_file)
    default=0,
    help="Number of column with ids in id file (0-based). Default: 0")
parser.add_argument(
    "-e",
    "--extraction_mode",
    action="store",
    dest="coincidence_mode",
    default="exact",
    help="Coincidence mode for id: exact(full, default), partial")
# Adjacent string literals are concatenated verbatim, so the first piece
# needs a trailing space to keep the rendered help text readable.
parser.add_argument(
    "-a",
    "--allow_multiple_coincidence_report",
    action="store_true",
    dest="allow_multiple_coincidence_report",
    default=False,
    help=
    "Allow multiple coincidence report of sequences for partial coincidence mode. "
    "By default an error is raised")

args = parser.parse_args()

# Sequence file, id file and output go positionally; matching options by keyword.
SequenceRoutines.extract_sequence_by_ids(
    args.input,
    args.id_file,
    args.output,
    format=args.format,
    verbose=True,
    id_column_number=args.id_column,
    coincidence_mode=args.coincidence_mode,
    allow_multiple_coincidence_report=args.allow_multiple_coincidence_report)
Beispiel #16
0
    assemblies_dict[assembly_label] = SeqIO.index_db(tmp_index,
                                                     args.input_file_list[i],
                                                     format=args.format)

# Per-assembly accumulators, each keyed by assembly label and filled by the
# loop below from the values returned by calculate_assembly_stats.
assembly_N50_dict = TwoLvlDict()
assembly_L50 = TwoLvlDict()
# Longest list of bins returned for any single assembly so far.
assembly_bins = []
assembly_contig_cumulative_length = OrderedDict()
assembly_contig_number_values = OrderedDict()
assembly_general_stats = TwoLvlDict()
assembly_length_array = OrderedDict()
assembly_lengths = TwoLvlDict()
# Compute statistics for every indexed assembly; sequence lengths are also
# written by the routine to "<output_prefix>.<assembly>.len".
for assembly in assemblies_dict:
    lengths_array, N50_dict, L50_dict, length_dict, total_length, longest_contig, Ns_number, bins, contig_cumulative_length_values, \
        contig_number_values = SequenceRoutines.calculate_assembly_stats(assemblies_dict[assembly],
                                                                         thresholds_list=args.thresholds,
                                                                         seq_len_file="%s.%s.len" % (args.output_prefix, assembly))
    assembly_N50_dict[assembly] = N50_dict
    assembly_L50[assembly] = L50_dict
    assembly_contig_cumulative_length[
        assembly] = contig_cumulative_length_values
    assembly_contig_number_values[assembly] = contig_number_values
    # General statistics are stored in a fixed key order: Ns, longest contig, total length.
    assembly_general_stats[assembly] = OrderedDict()
    assembly_general_stats[assembly]["Ns"] = Ns_number
    assembly_general_stats[assembly]["Longest contig"] = longest_contig
    assembly_general_stats[assembly]["Total length"] = total_length
    assembly_length_array[assembly] = lengths_array
    assembly_lengths[assembly] = length_dict
    # Keep the bins of whichever assembly produced the most of them.
    if len(assembly_bins) < len(bins):
        assembly_bins = bins
Beispiel #17
0
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from Routines import SequenceRoutines

parser = argparse.ArgumentParser()

# Input file and output prefix are mandatory; format and genetic code are optional.
parser.add_argument(
    "-i",
    "--input_file",
    action="store",
    dest="input",
    required=True,
    help="Input file with cds sequences")
parser.add_argument(
    "-o",
    "--output_file_prefix",
    action="store",
    dest="output_prefix",
    required=True,
    help="Prefix of output files")
parser.add_argument(
    "-f",
    "--format",
    action="store",
    dest="format",
    default="fasta",
    help="Format of input and output files. Allowed formats genbank, fasta(default)")
parser.add_argument(
    "-g",
    "--genetic_code",
    action="store",
    dest="genetic_code",
    default=1,
    type=int,
    help="Number of NCBI genetic code table to use. Default - 1(universal) ")
args = parser.parse_args()

SequenceRoutines.check_cds_for_stop_codons_from_file(
    args.input,
    args.output_prefix,
    genetic_code_table=args.genetic_code,
    format=args.format)
parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--input",
                    action="store",
                    dest="input",
                    required=True,
                    help="Input fasta with raw cDS")
# A user-supplied value is split on commas into a list; the default is
# already a list and is used as is.
parser.add_argument(
    "-s",
    "--stop_codons",
    action="store",
    dest="stop_codons_list",
    default=["TGA", "TAA", "TAG"],
    type=lambda s: s.split(","),
    help=
    "Comma-separated list of stop codons. Can be set using any case and both RNA and DNA alphabet"
)
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    required=True,
                    help="Output file to write trimmed CDS")

args = parser.parse_args()
# print is called with a single pre-formatted string, which produces the
# same output under Python 2 and Python 3.
print("Using %s as stop codons" % ",".join(args.stop_codons_list))
SequenceRoutines.trim_cds_and_remove_terminal_stop_codons(
    args.input, args.output, args.stop_codons_list)
Beispiel #19
0
                    help="Input file A with sequences")
parser.add_argument("-b",
                    "--input_file_B",
                    action="store",
                    dest="input_file_b",
                    required=True,
                    help="Input file B with sequences")
parser.add_argument("-o",
                    "--output_file_prefix",
                    action="store",
                    dest="output_prefix",
                    required=True,
                    help="Prefix of output files")
parser.add_argument(
    "-f",
    "--format",
    action="store",
    dest="format",
    default="fasta",
    help=
    "Format of input and output files. Allowed formats genbank, fasta(default)"
)

args = parser.parse_args()

# Files A and B and the output prefix go positionally; verbose is always on.
SequenceRoutines.compare_sequences_from_files(args.input_file_a,
                                              args.input_file_b,
                                              args.output_prefix,
                                              format=args.format,
                                              verbose=True)
Beispiel #20
0
out_right_se = "%s_2.se.fastq" % args.out_prefix

left_input_reads_dict = SeqIO.index_db("left_in_reads.idx", args.input_left, "fastq")
right_input_reads_dict = SeqIO.index_db("right_in_reads.idx", args.input_right, "fastq")

left_input_set = set(left_input_reads_dict.keys())
right_input_set = set(right_input_reads_dict.keys())

# Reads present in both inputs; sorted once so that the left and right
# paired files list them in the same order.
paired_ids = sorted(left_input_set & right_input_set)

# SeqIO.write is given file names and opens/closes the files itself, so no
# separate output handles are needed.
SeqIO.write(SequenceRoutines.record_by_id_generator(left_input_reads_dict,
                                                    paired_ids,
                                                    verbose=True), out_left, "fastq")
SeqIO.write(SequenceRoutines.record_by_id_generator(right_input_reads_dict,
                                                    paired_ids,
                                                    verbose=True), out_right, "fastq")
# Reads found in only one of the inputs go to the single-end files.
SeqIO.write(SequenceRoutines.record_by_id_generator(left_input_reads_dict,
                                                    left_input_set - right_input_set,
                                                    verbose=True), out_left_se, "fastq")
SeqIO.write(SequenceRoutines.record_by_id_generator(right_input_reads_dict,
                                                    right_input_set - left_input_set,
                                                    verbose=True), out_right_se, "fastq")
# NOTE(review): only the left index is removed here; confirm that
# "right_in_reads.idx" is cleaned up elsewhere.
os.remove("left_in_reads.idx")
Beispiel #21
0
parser.add_argument("-o",
                    "--output_file_prefix",
                    action="store",
                    dest="output_prefix",
                    required=True,
                    help="Prefix of output files")
parser.add_argument(
    "-f",
    "--format",
    action="store",
    dest="format",
    default="fasta",
    help=
    "Format of input and output files. Allowed formats genbank, fasta(default)"
)
# argparse applies `type` only to string values, so a non-string default is
# passed through unchanged; the default is therefore given as a set to match
# what a user-supplied value is converted to.
parser.add_argument(
    "-s",
    "--stop_codons",
    action="store",
    dest="stop_codons",
    default={"*", "."},
    type=lambda s: set(s.split(",")),
    help="Comma-separated list of stop codon symbols. Default - '.', '*'")
args = parser.parse_args()

SequenceRoutines.check_proteins_for_stop_codons_from_file(
    args.input,
    args.output_prefix,
    stop_codon_symbol_set=args.stop_codons,
    format=args.format)
Beispiel #22
0
import argparse

from Routines import SequenceRoutines

# Command-line interface: input sequence file, output file for the species
# counts and the format of the input file.
parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--input",
                    action="store",
                    dest="input",
                    required=True,
                    help="Input genbank file")
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    required=True,
                    help="Output file with species counts")
parser.add_argument("-f",
                    "--format",
                    action="store",
                    dest="format",
                    default="genbank",
                    help="Format of input file. Default - genbank ")

args = parser.parse_args()

# Write the counts to the file requested with -o/--output. The required option
# used to be parsed and then ignored in favour of a hard-coded
# "count_species.count".
SequenceRoutines.count_species_from_file(args.input,
                                         format=args.format,
                                         output_filename=args.output)
# Create the output directory. An already existing directory is fine, but any
# other failure (missing parent, no permissions, ...) is re-raised instead of
# being silently ignored.
# NOTE(review): the directory is created from args.output_directory while the
# files below are written into args.splited_directory - confirm that the
# parser defines both options and that they are meant to differ.
try:
    os.mkdir(args.output_directory)
except OSError:
    if not os.path.isdir(args.output_directory):
        raise

# A non-positive chunk size can never consume the records (the previous
# while-loop would spin forever on it), so reject it explicitly.
if args.number_of_sequences < 1:
    raise ValueError("Number of sequences per file must be positive")

record_ids_list = list(sequence_dict.keys())
number_of_records = len(record_ids_list)

# Write consecutive chunks of args.number_of_sequences records; the last chunk
# holds the remainder. Output files are numbered starting from 1.
for split_index, records_written in enumerate(
        range(0, number_of_records, args.number_of_sequences), start=1):
    SeqIO.write(SequenceRoutines.record_by_id_generator(
        sequence_dict,
        record_ids_list[records_written:records_written +
                        args.number_of_sequences],
        verbose=True),
                "%s/%s_%i.fasta" %
                (args.splited_directory, args.output_prefix, split_index),
                format="fasta")
Beispiel #24
0
    action="store",
    dest="max_length",
    type=int,
    help="Maximum length of sequence to store. Default: filter not set")
# Optional file that receives the ids of the extracted sequences.
parser.add_argument("-d", "--id_file",
                    action="store",
                    dest="id_file",
                    help="File to write ids of extracted sequences. Default - don't write")
args = parser.parse_args()

# Forward the parsed options to the length filter; the temporary index is
# always written to "tmp.idx".
SequenceRoutines.extract_sequences_by_length_from_file(args.input_file,
                                                       args.output_file,
                                                       format=args.format,
                                                       min_len=args.min_length,
                                                       max_len=args.max_length,
                                                       id_file=args.id_file,
                                                       tmp_index_file="tmp.idx")
"""
if (args.min_length is None) and (args.max_length is None):
    raise ValueError("Both minimum and maximum lengths were not set")
elif (args.min_length is not None) and (args.max_length is not None) and (args.min_length > args.max_length):
    raise ValueError("Minimum length is greater then maximum lengths")

tmp_index_file = "temp.idx"

print("Parsing %s..." % args.input_file)
sequence_dict = SeqIO.index_db(tmp_index_file, args.input_file, format=args.format)

if (args.min_length is not None) and (args.max_length is not None):
Beispiel #25
0
"""
AUGUSTUS.parallel_predict(args.species, args.input, output_raw_gff, strand=args.strand, gene_model=args.gene_model,
                          output_gff3=True, other_options=args.other_options, config_dir=args.config_dir,
                          use_softmasking=args.softmasking, hints_file=args.hintsfile,
                          extrinsicCfgFile=args.extrinsicCfgFile, predict_UTR=args.predict_UTR)
"""
# Post-processing of the raw AUGUSTUS output: replace ids, extract transcript
# sequences, trim and translate the CDS, then collect gene ids and CDS
# annotations.
AUGUSTUS.replace_augustus_ids(output_raw_gff, args.output,
                              species_prefix=args.species_prefix,
                              number_of_digits_in_id=8)

Gffread.extract_transcript_sequences(output_gff, args.input, args.output)

# Files derived from the output prefix.
trimmed_cds_path = "%s.trimmed.cds" % args.output
trimmed_pep_path = "%s.trimmed.pep" % args.output

# Stop codons of the universal genetic code are used here; note that this
# will affect mtDNA proteins.
SequenceRoutines.trim_cds_and_remove_terminal_stop_codons(
    "%s.cds" % args.output, trimmed_cds_path,
    stop_codons_list=("TGA", "TAA", "TAG"))

# Translation uses genetic code table 1 (universal code) as well.
SequenceRoutines.translate_sequences_from_file(
    trimmed_cds_path, trimmed_pep_path,
    format="fasta",
    id_expression=None,
    genetic_code_table=1,
    translate_to_stop=False,
    prefix_of_file_inframe_stop_codons_seqs=prefix_of_file_inframe_stop_codons_seqs)

AUGUSTUS.extract_gene_ids_from_output(output_gff, all_annotated_genes_ids)
AUGUSTUS.extract_CDS_annotations_from_output(output_gff, CDS_gff)
if args.masking:
Beispiel #26
0
    help=
    "Format of input and output file. Allowed formats genbank, fasta(default)")
# How the input sequence file is loaded before filtering.
parser.add_argument("-p",
                    "--parsing_mode",
                    action="store",
                    dest="parsing_mode",
                    default="index_db",
                    help="Parsing mode for input sequence file. "
                    "Possible variants: 'index_db'(default), 'index', 'parse'")

args = parser.parse_args()

# Example of usage (kept as a comment: inside a non-raw string literal the
# "\." of the regular expression is an invalid escape sequence and triggers a
# warning on recent Python versions):
#
# ~/Soft/MAVR/scripts/sequence/filter_sequences_by_id_expression.py -i GSS_BOH_BAC_end.fa \
#                                                                   -a GSS_BOH_BAC_end.forward.fa \
#                                                                   -b GSS_BOH_BAC_end.reverse.fa \
#                                                                   -e "\.F$" -p parse

# Filter the input by the regular expression, passing both output files
# (args.filtered_file and args.filtered_out_file); the temporary index
# "tmp.idx" is not retained.
SequenceRoutines.filter_seq_by_reg_expression_from_file(
    args.input,
    args.regular_expression,
    args.filtered_file,
    args.filtered_out_file,
    parsing_mode=args.parsing_mode,
    format=args.format,
    index_file="tmp.idx",
    retain_index=False,
    reg_exp_flags=0)
Beispiel #27
0
# Read the tab-separated description table with key_index=1 / value_index=0,
# allowing repeated keys and comma-separated values, then write the result
# back out as the collapsed-isoform table.
syn_dict = SynDict()
syn_dict.read(pep_uniq_description_no_isoform_versions,
              key_index=1, value_index=0,
              separator="\t", header=False, comments_prefix="#",
              split_values=True, values_separator=",",
              allow_repeats_of_key=True)
syn_dict.write(pep_description_collapsed_isoforms,
               splited_values=True, values_separator=",")

# Sequence lengths of the input fasta; they are also written to len_file.
length_dict = SequenceRoutines.get_lengths_from_seq_file(
    args.input, format="fasta", out_file=len_file)

# Open the three report files for writing, in this order; the handles stay
# open for the code that follows (presumably the per-gene loop fills them).
(descr_with_len_fd,
 descr_longest_isoform_fd,
 descr_longest_isoform_ids_fd) = (
    open(report_path, "w")
    for report_path in (pep_description_collapsed_isoforms_with_len,
                        pep_description_longest_isoform,
                        pep_description_longest_isoform_ids))

for gene in syn_dict:
    len_list = []
    longest_isoform = None
    max_len = 0
    for isoform_id in syn_dict[gene]:
        length = length_dict[isoform_id]
        len_list.append(length)
        if length > max_len:
            max_len = length