Example #1
    def parallel_predict(self, species, genome_file, output, strand="both", gene_model=None, output_gff3=True,
                         other_options="", split_dir="splited_input", splited_output_dir="splited_output_dir",
                         config_dir=None, combine_output_to_single_file=True, use_softmasking=None, hints_file=None,
                         extrinsicCfgFile=None, predict_UTR=None, external_process_pool=None,
                         async_run=False, min_intron_len=None, parsing_mode="parse"):
        common_options = self.parse_options(species, genome_file="", strand=strand, gene_model=gene_model,
                                            output_gff3=output_gff3, other_options=other_options,
                                            config_dir=config_dir, use_softmasking=use_softmasking,
                                            hints_file=hints_file, extrinsicCfgFile=extrinsicCfgFile,
                                            predict_UTR=predict_UTR, min_intron_len=min_intron_len)

        splited_dir = FileRoutines.check_path(split_dir)
        splited_out_dir = FileRoutines.check_path(splited_output_dir)
        FileRoutines.safe_mkdir(splited_dir)
        FileRoutines.safe_mkdir(splited_out_dir)

        self.split_fasta_by_seq_len(genome_file, splited_dir, parsing_mode=parsing_mode)

        input_list_of_files = sorted(os.listdir(splited_dir))
        list_of_output_files = []
        options_list = []
        for filename in input_list_of_files:
            input_file = "%s%s" % (splited_dir, filename)
            output_file = "%s%s.gff" % (splited_out_dir, filename)
            list_of_output_files.append(output_file)
            options = common_options

            options += " %s" % input_file
            options += " > %s" % output_file
            options_list.append(options)

        self.parallel_execute(options_list, external_process_pool=external_process_pool, async_run=async_run)

        if combine_output_to_single_file:
            CGAS.cat(list_of_output_files, output=output)
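A minimal usage sketch of parallel_predict. The AUGUSTUS wrapper class, its import path, and the threads attribute are assumptions for illustration; only the method itself is shown above, and the file names are placeholders.

from Tools.Annotation import AUGUSTUS  # hypothetical import path

augustus = AUGUSTUS()
augustus.threads = 8  # assumed attribute consumed by parallel_execute

# Split the genome per sequence, run one AUGUSTUS job per chunk in
# parallel, then concatenate the per-chunk GFFs into a single file.
augustus.parallel_predict("human", "genome.fasta", "predictions.gff",
                          strand="both", output_gff3=True,
                          combine_output_to_single_file=True)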
Example #2
    def parallel_blast(self,
                       blast_command,
                       seqfile,
                       database,
                       outfile=None,
                       blast_options=None,
                       split_dir="splited_fasta",
                       splited_output_dir="splited_output_dir",
                       evalue=None,
                       output_format=None,
                       threads=None,
                       num_of_seqs_per_scan=None,
                       combine_output_to_single_file=True,
                       async_run=False,
                       external_process_pool=None):

        splited_dir = FileRoutines.check_path(split_dir)
        splited_out_dir = FileRoutines.check_path(splited_output_dir)
        self.safe_mkdir(splited_dir)
        self.safe_mkdir(splited_out_dir)

        number_of_files = (num_of_seqs_per_scan if num_of_seqs_per_scan
                           else 5 * (threads if threads else self.threads))
        self.split_fasta(seqfile, splited_dir, num_of_files=number_of_files)
        input_list_of_files = sorted(os.listdir(splited_dir))
        list_of_files = []

        for filename in input_list_of_files:
            filename_prefix = FileRoutines.split_filename(filename)[1]

            input_file = "%s%s" % (splited_dir, filename)
            output_file = "%s%s.hits" % (splited_out_dir, filename_prefix)

            list_of_files.append((input_file, output_file))

        options_list = []
        out_files = []

        for in_file, out_filename in list_of_files:

            options = " -out %s" % out_filename

            options += " -db %s" % database
            options += " -query %s" % in_file
            options += " %s" % blast_options if blast_options else ""
            options += " -evalue %s" % evalue if evalue else ""
            options += " -outfmt %i" % output_format if output_format else ""
            options_list.append(options)
            out_files.append(out_filename)

        self.parallel_execute(options_list,
                              cmd=blast_command,
                              threads=threads,
                              async_run=async_run,
                              external_process_pool=external_process_pool)

        if combine_output_to_single_file:
            CGAS.cat(out_files, output=outfile)
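A minimal usage sketch of parallel_blast. The BLASTPlus wrapper class and its import path are assumptions; the database and file names are placeholders.

from Tools.BLAST import BLASTPlus  # hypothetical import path

blast = BLASTPlus()
blast.threads = 16  # assumed attribute; used when threads is None

# Split the query FASTA, run one blastp job per chunk, and merge the
# tabular hits into a single output file.
blast.parallel_blast("blastp", "proteins.fasta", "swissprot_db",
                     outfile="all_hits.tab", evalue=1e-5,
                     output_format=6, threads=16)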
Example #3
    def extract_sequences_from_selected_clusters(
            self,
            clusters_id_file,
            cluster_file,
            seq_file,
            output_dir="./",
            seq_format="fasta",
            out_prefix=None,
            create_dir_for_each_cluster=False,
            skip_cluster_if_no_sequence_for_element=True):
        from Routines import SequenceRoutines, FileRoutines
        cluster_id_list = IdList()
        cluster_dict = SynDict()
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
        if clusters_id_file:
            cluster_id_list.read(clusters_id_file)
        cluster_dict.read(cluster_file,
                          split_values=True,
                          values_separator=",")
        protein_dict = SeqIO.index_db(
            "tmp.idx",
            FileRoutines.make_list_of_path_to_files(seq_file),
            format=seq_format)

        number_of_skipped_clusters = 0
        for fam_id in cluster_id_list if clusters_id_file else cluster_dict:

            if skip_cluster_if_no_sequence_for_element:
                absent_elements = self.check_absence_of_cluster_elements(
                    cluster_dict[fam_id], protein_dict)
                if absent_elements:
                    print "Skipping cluster %s due to absent element(%s)" % (
                        fam_id, ",".join(absent_elements))
                    number_of_skipped_clusters += 1
                    continue

            if fam_id in cluster_dict:
                if create_directory_for_each_cluster:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.fasta" % (fam_dir, out_prefix
                                               if out_prefix else fam_id)
                else:
                    out_file = "%s/%s.fasta" % (out_dir, out_prefix
                                                if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, cluster_dict[fam_id], verbose=True),
                            out_file,
                            format=seq_format)

        os.remove("tmp.idx")
        print "%i of %i clusters were skipped due to absent elements" % (
            number_of_skipped_clusters, len(cluster_dict))

        return number_of_skipped_clusters
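A minimal usage sketch. The owning class name and import path are assumptions, and the input file names are placeholders. The cluster file is expected in cluster_id<TAB>id1,id2,... form, as implied by values_separator=",".

from Routines import SequenceClusterRoutines  # hypothetical import

routines = SequenceClusterRoutines()
skipped = routines.extract_sequences_from_selected_clusters(
    "cluster_ids.txt",      # ids of clusters to extract, one per line
    "clusters.tab",         # cluster_id<TAB>seq_id1,seq_id2,...
    "proteins.fasta",
    output_dir="clusters_out/",
    skip_cluster_if_no_sequence_for_element=True)
print("%i clusters skipped" % skipped)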
Example #4
    def extract_proteins_from_alignments(dir_with_alignments, output_dir):
        out_dir = FileRoutines.check_path(output_dir)

        input_files = FileRoutines.make_list_of_path_to_files(
            [dir_with_alignments] if isinstance(dir_with_alignments, str)
            else dir_with_alignments)

        FileRoutines.safe_mkdir(out_dir)
        from Routines import MultipleAlignmentRoutines
        for filename in input_files:
            filename_list = FileRoutines.split_filename(filename)
            output_file = "%s%s%s" % (out_dir, filename_list[1],
                                      filename_list[2])
            MultipleAlignmentRoutines.extract_sequences_from_alignment(
                filename, output_file)
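A minimal usage sketch. The method takes either a directory or a list of alignment files; the owner class shown here is an assumption, and the directory names are placeholders.

from Routines import MultipleAlignmentRoutines  # hypothetical owner class

MultipleAlignmentRoutines.extract_proteins_from_alignments(
    "alignments/",           # directory with alignment files
    "proteins_from_aln/")    # one output file per input alignment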
Example #5
    def read_cluster_files_from_dir(dir_with_cluster_files):
        cluster_files_list = sorted(os.listdir(dir_with_cluster_files))
        clusters_dict = OrderedDict()
        for filename in cluster_files_list:
            filepath = "%s%s" % (
                FileRoutines.check_path(dir_with_cluster_files), filename)
            filename_list = FileRoutines.split_filename(filepath)
            clusters_dict[filename_list[1]] = SynDict()
            clusters_dict[filename_list[1]].read(filepath,
                                                 header=False,
                                                 separator="\t",
                                                 allow_repeats_of_key=False,
                                                 split_values=True,
                                                 values_separator=",",
                                                 key_index=0,
                                                 value_index=1,
                                                 comments_prefix="#")
        return clusters_dict
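A minimal usage sketch of the return value: an OrderedDict mapping each file's basename to a SynDict of cluster id to a list of sequence ids (lists because split_values=True). The owner class and import path are assumptions.

from Routines import SequenceClusterRoutines  # hypothetical import

clusters = SequenceClusterRoutines.read_cluster_files_from_dir("fam_files/")
for name, cluster_dict in clusters.items():
    print("%s: %i clusters" % (name, len(cluster_dict)))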
Example #6
    def extract_proteins_from_selected_families(
            families_id_file,
            fam_file,
            pep_file,
            output_dir="./",
            pep_format="fasta",
            out_prefix=None,
            create_dir_for_each_family=False):
        from Routines import SequenceRoutines, FileRoutines
        fam_id_list = IdList()
        fam_dict = SynDict()
        #print(pep_file)
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
        if families_id_file:
            fam_id_list.read(families_id_file)
        fam_dict.read(fam_file, split_values=True, values_separator=",")
        protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

        for fam_id in fam_id_list if families_id_file else fam_dict:
            if fam_id in fam_dict:
                if create_directory_for_each_family:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.pep" % (fam_dir, out_prefix
                                             if out_prefix else fam_id)
                else:
                    out_file = "%s/%s.pep" % (out_dir, out_prefix
                                              if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, fam_dict[fam_id], verbose=True),
                            out_file,
                            format=pep_format)
            else:
                print("%s was not found" % fam_id)

        os.remove("tmp.idx")
Example #7
    def split_proteins_per_species(dir_with_proteins,
                                   output_dir,
                                   input_format="fasta",
                                   output_format="fasta"):
        input_files = FileRoutines.make_list_of_path_to_files(
            [dir_with_proteins] if isinstance(dir_with_proteins, str
                                              ) else dir_with_proteins)

        out_dir = FileRoutines.check_path(output_dir)
        FileRoutines.safe_mkdir(out_dir)

        protein_dict = SeqIO.index_db("temp.idx",
                                      input_files,
                                      format=input_format)

        syn_dict = SynDict()

        for protein_id in protein_dict:
            taxa_id = protein_id.split(".")[0]
            # pep_id = ".".join(tmp_list[1:])
            if taxa_id not in syn_dict:
                syn_dict[taxa_id] = []
            syn_dict[taxa_id].append(protein_id)

        def renamed_records_generator(record_dict, taxa_id):
            for record_id in syn_dict[taxa_id]:
                record = deepcopy(record_dict[record_id])
                #print(record)
                record.id = ".".join(record_id.split(".")[1:])
                yield record

        for taxa_id in syn_dict:
            out_file = "%s%s.pep" % (out_dir, taxa_id)
            SeqIO.write(renamed_records_generator(protein_dict, taxa_id),
                        out_file,
                        format=output_format)
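A minimal usage sketch; the owner class is an assumption. Record ids are expected to look like <taxon>.<protein_id>, since the taxon is taken from the part before the first dot.

from Routines import SequenceRoutines  # hypothetical import

SequenceRoutines.split_proteins_per_species(
    "all_proteins/",    # directory with FASTA files of mixed species
    "per_species/")     # writes one <taxon>.pep file per taxon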
Example #8
parser.add_argument("-i",
                    "--input_vcf",
                    action="store",
                    dest="input_vcf",
                    required=True,
                    help="Input vcf file")
parser.add_argument("-o",
                    "--output_vcf",
                    action="store",
                    dest="output_vcf",
                    required=True,
                    help="Output vcf file")
parser.add_argument("-r",
                    "--reference",
                    action="store",
                    dest="reference",
                    required=True,
                    help="Fasta with reference genome")
parser.add_argument("-g",
                    "--gatk_directory",
                    action="store",
                    dest="gatk_dir",
                    default="",
                    help="Directory with GATK jar")
args = parser.parse_args()

SelectVariants.jar_path = FileRoutines.check_path(args.gatk_dir)
SelectVariants.remove_entries_with_filters(args.reference, args.input_vcf,
                                           args.output_vcf)
Example #9
                    action="store",
                    dest="max_memory_per_thread",
                    default="1G",
                    help="Maximum memory per thread. Default - 1G")
args = parser.parse_args()

if args.prepare_bam and ((not args.prepared_bam_prefix) or
                         (not args.temp_dir)):
    raise ValueError(
        "Options -e/--prepared_bam_prefix and -m/--temp_dir must be set if -p/--prepare_bam option is used"
    )

SamtoolsV1.threads = args.threads

if args.prepare_bam or args.mix_ends:
    FileRoutines.safe_mkdir(FileRoutines.check_path(args.temp_dir))
    prepared_pe_bam_file = "%s.bam" % args.prepared_bam_prefix
    prepared_unpaired_bam_file = (
        "%s.unpaired.bam" %
        args.prepared_bam_prefix) if args.mix_ends else None
    """
    SamtoolsV1.prepare_bam_for_read_extraction(args.input, args.prepared_bam, temp_file_prefix=args.temp_dir,
                                               max_memory_per_thread=args.max_memory_per_thread)
    """
    SamtoolsV1.prepare_bam_for_read_extraction(
        args.input,
        prepared_pe_bam_file,
        temp_file_prefix=args.temp_dir,
        max_memory_per_thread=args.max_memory_per_thread,
        bam_file_to_write_unpaired_reads=prepared_unpaired_bam_file)
if args.paired:
Example #10
    def extract_sequences_by_clusters(self,
                                      dir_with_cluster_files,
                                      dir_with_sequence_files,
                                      output_dir,
                                      file_with_white_list_cluster_ids=None,
                                      mode="families",
                                      sequence_file_extension="fasta",
                                      sequence_file_format="fasta",
                                      label_species=False,
                                      separator_for_labeling="@",
                                      species_label_first=True):
        """
        basenames of cluster and sequence files must be same

        mode:
            clusters - extract sequences from clusters in separate files,
            species - extract sequences from species to separate files
        """
        white_list_ids = None
        if file_with_white_list_cluster_ids:
            white_list_ids = IdSet()
            white_list_ids.read(file_with_white_list_cluster_ids)

        clusters_dict = self.read_cluster_files_from_dir(
            dir_with_cluster_files)
        cluster_names = self.get_cluster_names(clusters_dict,
                                               white_list_ids=white_list_ids)

        sequence_super_dict = OrderedDict()
        out_dir = FileRoutines.check_path(output_dir)

        for species in clusters_dict:
            idx_file = "%s_tmp.idx" % species
            sequence_file = "%s%s.%s" % (FileRoutines.check_path(
                dir_with_sequence_files), species, sequence_file_extension)
            sequence_super_dict[species] = SeqIO.index_db(
                idx_file, sequence_file, format=sequence_file_format)

        if mode == "species":
            sequence_names = self.get_sequence_names(
                clusters_dict,
                write_ids=False,
                out_prefix=None,
                white_list_ids=white_list_ids)
            for species in sequence_names:
                out_file = "%s%s.%s" % (out_dir, species,
                                        sequence_file_extension)
                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    sequence_super_dict[species], sequence_names[species]),
                            out_file,
                            format=sequence_file_format)
        elif mode == "families":

            def per_family_record_generator(seq_super_dict, clust_dict,
                                            cluster_id):
                if species_label_first:
                    label_sequence = lambda label, name: "%s%s%s" % (
                        label, separator_for_labeling, name)
                else:
                    label_sequence = lambda label, name: "%s%s%s" % (
                        name, separator_for_labeling, label)

                for species in seq_super_dict:
                    #print species, cluster_id
                    for record_id in clust_dict[species][cluster_id]:
                        if label_species:
                            record = deepcopy(
                                seq_super_dict[species][record_id])
                            record.id = label_sequence(species, record_id)
                            yield record
                        else:
                            yield seq_super_dict[species][record_id]

            for cluster_name in cluster_names:
                out_file = "%s%s.%s" % (out_dir, cluster_name,
                                        sequence_file_extension)
                SeqIO.write(per_family_record_generator(
                    sequence_super_dict, clusters_dict, cluster_name),
                            out_file,
                            format=sequence_file_format)

        for species in clusters_dict:
            os.remove("%s_tmp.idx" % species)
Example #11
parser.add_argument("--indel_InbreedingCoeff",
                    action="store",
                    dest="indel_InbreedingCoeff",
                    type=float,
                    default=-0.8,
                    help="Indel InbreedingCoeff threshold. Default -   -0.8")
parser.add_argument("--indel_FS",
                    action="store",
                    dest="indel_FS",
                    type=float,
                    default=200.0,
                    help="Indel FS threshold. Default - 200.0")

args = parser.parse_args()

VariantFiltration.jar_path = FileRoutines.check_path(args.gatk_dir)

VariantFiltration.filter_bad_variants(
    args.reference,
    args.input_vcf,
    args.output_prefix,
    snp_filter_name=args.snp_filter_name,
    snp_QD=args.snp_QD,
    snp_FS=args.snp_FS,
    snp_MQ=args.snp_MQ,
    snp_HaplotypeScore=args.snp_HaplotypeScore,
    snp_MappingQualityRankSum=args.snp_MappingQualityRankSum,
    snp_ReadPosRankSum=args.snp_ReadPosRankSum,
    indel_filter_name=args.indel_filter_name,
    indel_QD=args.indel_QD,
    indel_ReadPosRankSum=args.indel_ReadPosRankSum,