Beispiel #1
0
 def generate_backbone(self):
     _LOG.info("Reading input sequences: %s" %(self.options.sequence_file))
     sequences = MutableAlignment()
     sequences.read_file_object(self.options.sequence_file)
     if (options().backbone_size is None):            
         options().backbone_size = min(100,int(.20*sequences.get_num_taxa()))
         _LOG.info("Backbone size set to: %d" %(options().backbone_size))
     backbone_sequences = sequences.get_hard_sub_alignment(random.sample(sequences.keys(), options().backbone_size))        
     [sequences.pop(i) for i in backbone_sequences.keys()]
     
     _LOG.info("Writing query and backbone set. ")
     query = get_temp_file("query", "backbone", ".fas")
     backbone = get_temp_file("backbone", "backbone", ".fas")
     _write_fasta(sequences, query)
     _write_fasta(backbone_sequences, backbone)
             
     _LOG.info("Generating sate backbone alignment and tree. ")
     satealignJob = SateAlignJob()
     moleculeType = options().molecule
     if (options().molecule == 'amino'):
         moleculeType =  'protein'
     satealignJob.setup(backbone,options().backbone_size,self.options.outdir,moleculeType,options().cpu)
     satealignJob.run()
     satealignJob.read_results()
     
     options().placement_size = self.options.backbone_size
     options().alignment_file = open(self.options.outdir + "/sate.fasta")
     options().tree_file = open(self.options.outdir + "/sate.fasttree")
     _LOG.info("Backbone alignment written to %s.\nBackbone tree written to %s" % (options().alignment_file, options().tree_file))
     options().fragment_file = query
Beispiel #2
0
def hmmer_to_markers(input, temp_dir):
    global refpkg

    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = dict([(name + '_rev', reverse_sequence(seq))
                    for (name, seq) in fragments.items()])
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Now bin the fragments
    frag_scores = dict([(name, [-10000, 'NA', 'NA'])
                        for name in fragments.keys()])

    for gene in refpkg["genes"]:
        # Now run HMMER search
        hmmer_output = temp_dir + '/' + gene + ".out"
        hmmer_search(frag_file, refpkg[gene]["hmm"], hmmer_output)
        results = read_hmmsearch_results(hmmer_output)

        # Now select best direction for each frag
        for name, value in results.items():
            bitscore = value[1]
            direction = 'forward'
            true_name = name
            if (name.find('_rev') != -1):
                true_name = true_name.replace('_rev', '')
                direction = 'reverse'
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Now bin the fragments
    genes = dict([])
    for name, val in frag_scores.items():
        if (val[1] not in genes):
            genes[val[1]] = {}
        if (val[2] == 'forward'):
            genes[val[1]][name] = fragments[name]
        else:
            genes[val[1]][name] = reverse_sequence(fragments[name])

    genes.pop("NA", None)

    for gene, seq in genes.items():
        gene_file = temp_dir + '/' + gene + ".frags.fas.fixed"
        _write_fasta(seq, gene_file)

    binned_fragments = {}
    for gene, seq in genes.items():
        binned_fragments[gene] = {}
        binned_fragments[gene]["file"] = temp_dir + '/' + gene \
            + ".frags.fas.fixed"
        binned_fragments[gene]["nfrags"] = len(seq.keys())

    return binned_fragments
Beispiel #3
0
def blast_to_markers(input, temp_dir):
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    if (options().gene is None):
        # First blast sequences against all markers
        blast_results = temp_dir + "/blast.out"
        if (options().blast_file is None):
            print("Blasting fragments against marker dataset\n")
            blast_fragments(input, blast_results)
        else:
            blast_results = options().blast_file
        # Next bin the blast hits to the best gene
        gene_binning = bin_blast_results(blast_results)
    else:
        gene_binning = {options().gene: list(fragments.keys())}
    # Now figure out direction of fragments
    binned_fragments = dict([
        (gene, dict([(seq_name, fragments[seq_name])
                     for seq_name in gene_binning[gene]]))
        for gene in gene_binning])
    print("Finding best orientation of reads\n")
    align_name = 'sate'
    if (options().genes == 'cogs'):
        align_name = 'pasta'
    for (gene, frags) in binned_fragments.items():
        # Add reverse complement sequence
        frags_rev = dict([(name + '_rev', reverse_sequence(seq))
                          for (name, seq) in frags.items()])
        gene_frags = MutableAlignment()
        gene_frags.set_alignment(frags)
        gene_frags.set_alignment(frags_rev)
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(gene_frags, gene_file)

        # Now run HMMER search
        hmmer_search(
            gene_file,
            os.path.join(
                options().__getattribute__('reference').path,
                'refpkg/%s.refpkg/%s.hmm' % (gene, align_name)),
            temp_dir + "/%s.out" % gene)
        results = read_hmmsearch_results(temp_dir + "/%s.out" % gene)

        # Now select best direction for each frag
        for key in frags:
            forward_score = -10000
            backward_score = -10000
            if (key in results):
                forward_score = results[key][1]
            if (key+"_rev" in results):
                backward_score = results[key + "_rev"][1]
            if (backward_score > forward_score):
                frags[key] = gene_frags[key + "_rev"]

        # Now write to file
        _write_fasta(frags, gene_file + ".fixed")
        binned_fragments[gene] = frags
    return binned_fragments
Beispiel #4
0
def blast_to_markers(input, temp_dir):
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    if (options().gene is None):
        # First blast sequences against all markers
        blast_results = temp_dir + "/blast.out"
        if (options().blast_file is None):
            print("Blasting fragments against marker dataset\n")
            blast_fragments(input, blast_results)
        else:
            blast_results = options().blast_file
        # Next bin the blast hits to the best gene
        gene_binning = bin_blast_results(blast_results)
    else:
        gene_binning = {options().gene: list(fragments.keys())}
    # Now figure out direction of fragments
    binned_fragments = dict([(gene,
                              dict([(seq_name, fragments[seq_name])
                                    for seq_name in gene_binning[gene]]))
                             for gene in gene_binning])
    print("Finding best orientation of reads\n")
    align_name = 'sate'
    if (options().genes == 'cogs'):
        align_name = 'pasta'
    for (gene, frags) in binned_fragments.items():
        # Add reverse complement sequence
        frags_rev = dict([(name + '_rev', reverse_sequence(seq))
                          for (name, seq) in frags.items()])
        gene_frags = MutableAlignment()
        gene_frags.set_alignment(frags)
        gene_frags.set_alignment(frags_rev)
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(gene_frags, gene_file)

        # Now run HMMER search
        hmmer_search(
            gene_file,
            os.path.join(options().__getattribute__('reference').path,
                         'refpkg/%s.refpkg/%s.hmm' % (gene, align_name)),
            temp_dir + "/%s.out" % gene)
        results = read_hmmsearch_results(temp_dir + "/%s.out" % gene)

        # Now select best direction for each frag
        for key in frags:
            forward_score = -10000
            backward_score = -10000
            if (key in results):
                forward_score = results[key][1]
            if (key + "_rev" in results):
                backward_score = results[key + "_rev"][1]
            if (backward_score > forward_score):
                frags[key] = gene_frags[key + "_rev"]

        # Now write to file
        _write_fasta(frags, gene_file + ".fixed")
        binned_fragments[gene] = frags
    return binned_fragments
Beispiel #5
0
def hmmer_to_markers(input, temp_dir):
    global marker_genes
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = dict([(name+'_rev', reverse_sequence(seq))
                    for (name, seq) in fragments.items()])
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir+"/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Now bin the fragments
    frag_scores = dict([(name, [-10000, 'NA', 'NA'])
                        for name in fragments.keys()])
    gene_set = marker_genes
    align_name = 'sate'
    if (options().genes == 'cogs'):
        gene_set = cog_genes
        align_name = 'pasta'
    for gene in gene_set:
        # Now run HMMER search
        hmmer_search(
            frag_file,
            os.path.join(
                options().__getattribute__('reference').path,
                'refpkg/%s.refpkg/%.profile' % (gene, align_name)),
            temp_dir + "/%s.out" % gene)
        results = read_hmmsearch_results(temp_dir + "/%s.out" % gene)

        # Now select best direction for each frag
        for name, value in results.items():
            bitscore = value[1]
            direction = 'forward'
            true_name = name
            if (name.find('_rev') != -1):
                true_name = true_name.replace('_rev', '')
                direction = 'reverse'
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Now bin the fragments
    genes = dict([])
    for name, val in frag_scores.items():
        if (val[1] not in genes):
            genes[val[1]] = {}
        if (val[2] == 'forward'):
            genes[val[1]][name] = fragments[name]
        else:
            genes[val[1]][name] = reverse_sequence(fragments[name])
    genes.pop("NA", None)
    for gene, seq in genes.items():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(seq, gene_file + ".fixed")
    return genes
Beispiel #6
0
def hmmer_to_markers(input, temp_dir):
    global marker_genes
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = dict([(name + '_rev', reverse_sequence(seq))
                    for (name, seq) in fragments.items()])
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    #Now bin the fragments
    frag_scores = dict([(name, [-10000, 'NA', 'NA'])
                        for name in fragments.keys()])
    gene_set = marker_genes
    align_name = 'sate'
    if (options().genes == 'cogs'):
        gene_set = cog_genes
        align_name = 'pasta'
    for gene in gene_set:
        #Now run HMMER search
        hmmer_search(
            frag_file,
            os.path.join(options().__getattribute__('reference').path,
                         'refpkg/%s.refpkg/%.profile' % (gene, align_name)),
            temp_dir + "/%s.out" % gene)
        results = read_hmmsearch_results(temp_dir + "/%s.out" % gene)

        #Now select best direction for each frag
        for name in results.keys():
            bitscore = results[name][1]
            direction = 'forward'
            true_name = name
            if (name.find('_rev') != -1):
                true_name = true_name.replace('_rev', '')
                direction = 'reverse'
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    #Now bin the fragments
    genes = dict([])
    for name in frag_scores.keys():
        if (frag_scores[name][1] not in genes):
            genes[frag_scores[name][1]] = {}
        if (frag_scores[name][2] == 'forward'):
            genes[frag_scores[name][1]][name] = fragments[name]
        else:
            genes[frag_scores[name][1]][name] = reverse_sequence(
                fragments[name])
    genes.pop("NA", None)
    for gene in genes.keys():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(genes[gene], gene_file + ".fixed")
    return genes
def hmmer_to_markers(input, temp_dir):
    global marker_genes
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = dict([(name + "_rev", reverse_sequence(seq)) for (name, seq) in fragments.items()])
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Now bin the fragments
    frag_scores = dict([(name, [-10000, "NA", "NA"]) for name in fragments.keys()])
    gene_set = marker_genes
    align_name = "sate"
    if options().genes == "cogs":
        gene_set = cog_genes
        align_name = "pasta"
    for gene in gene_set:
        # Now run HMMER search
        hmmer_search(
            frag_file,
            os.path.join(
                options().__getattribute__("reference").path, "refpkg/%s.refpkg/%.profile" % (gene, align_name)
            ),
            temp_dir + "/%s.out" % gene,
        )
        results = read_hmmsearch_results(temp_dir + "/%s.out" % gene)

        # Now select best direction for each frag
        for name in results.keys():
            bitscore = results[name][1]
            direction = "forward"
            true_name = name
            if name.find("_rev") != -1:
                true_name = true_name.replace("_rev", "")
                direction = "reverse"
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Now bin the fragments
    genes = dict([])
    for name in frag_scores.keys():
        if frag_scores[name][1] not in genes:
            genes[frag_scores[name][1]] = {}
        if frag_scores[name][2] == "forward":
            genes[frag_scores[name][1]][name] = fragments[name]
        else:
            genes[frag_scores[name][1]][name] = reverse_sequence(fragments[name])
    genes.pop("NA", None)
    for gene in genes.keys():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(genes[gene], gene_file + ".fixed")
    return genes
Beispiel #8
0
 def generate_backbone(self):
     _LOG.info("Reading input sequences: %s" %(self.options.sequence_file))
     sequences = MutableAlignment()
     sequences.read_file_object(self.options.sequence_file)
     fragments = MutableAlignment()
     if (options().median_full_length is not None):
       if (options().median_full_length == -1):
         seq_lengths = sorted([len(seq) for seq in sequences.values()])              
         lengths = len(seq_lengths)
         if lengths % 2:
           options().median_full_length = (seq_lengths[lengths / 2] + seq_lengths[lengths / 2 - 1]) / 2.0
         else:
           options().median_full_length = seq_lengths[lengths / 2]              
         
       (min_length,max_length) = (int(options().median_full_length*(1-options().backbone_threshold)),int(options().median_full_length*(1+options().backbone_threshold)))
       frag_names = [name for name in sequences if len(sequences[name]) > max_length or len(sequences[name]) < min_length]
       if (len(frag_names) > 0):
           fragments = sequences.get_hard_sub_alignment(frag_names)        
           [sequences.pop(i) for i in fragments.keys()]        
     if (options().backbone_size is None):            
         options().backbone_size = min(1000,int(sequences.get_num_taxa()))
         _LOG.info("Backbone size set to: %d" %(options().backbone_size))
     if (options().backbone_size > len(sequences.keys())):
       options().backbone_size = len(sequences.keys())
     backbone_sequences = sequences.get_hard_sub_alignment(random.sample(sequences.keys(), options().backbone_size))        
     [sequences.pop(i) for i in backbone_sequences.keys()]
             
     _LOG.info("Writing backbone set. ")
     backbone = get_temp_file("backbone", "backbone", ".fas")
     _write_fasta(backbone_sequences, backbone)
      
     _LOG.info("Generating pasta backbone alignment and tree. ")
     pastaalignJob = PastaAlignJob()
     moleculeType = options().molecule
     if (options().molecule == 'amino'):
         moleculeType =  'protein'
     pastaalignJob.setup(backbone,options().backbone_size,self.options.outdir,moleculeType,options().cpu)
     pastaalignJob.run()
     pastaalignJob.read_results()
     
     options().placement_size = self.options.backbone_size
     options().alignment_file = open(self.options.outdir + "/pasta.fasta")
     options().tree_file = open(self.options.outdir + "/pasta.fasttree")
     _LOG.info("Backbone alignment written to %s.\nBackbone tree written to %s" % (options().alignment_file, options().tree_file))
     sequences.set_alignment(fragments)        
     if (len(sequences) == 0):
       _LOG.info("No query sequences to align.  Final alignment saved as %s" % self.get_output_filename("alignment.fasta"))   
       shutil.copyfile(self.options.outdir + "/pasta.fasta", self.get_output_filename("alignment.fasta"))
       sys.exit(0)
     else:
       query = get_temp_file("query", "backbone", ".fas")
       options().fragment_file = query          
       _write_fasta(sequences, query)               
Beispiel #9
0
    def generate_backbone(self):
        _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
        sequences = MutableAlignment()
        sequences.read_file_object(self.options.sequence_file)
        if (options().backbone_size is None):
            options().backbone_size = min(100,
                                          int(.20 * sequences.get_num_taxa()))
            _LOG.info("Backbone size set to: %d" % (options().backbone_size))
        backbone_sequences = sequences.get_hard_sub_alignment(
            random.sample(sequences.keys(),
                          options().backbone_size))
        [sequences.pop(i) for i in backbone_sequences.keys()]

        _LOG.info("Writing query and backbone set. ")
        query = get_temp_file("query", "backbone", ".fas")
        backbone = get_temp_file("backbone", "backbone", ".fas")
        _write_fasta(sequences, query)
        _write_fasta(backbone_sequences, backbone)

        _LOG.info("Generating sate backbone alignment and tree. ")
        satealignJob = SateAlignJob()
        moleculeType = options().molecule
        if (options().molecule == 'amino'):
            moleculeType = 'protein'
        satealignJob.setup(backbone,
                           options().backbone_size, self.options.outdir,
                           moleculeType,
                           options().cpu)
        satealignJob.run()
        satealignJob.read_results()

        options().placement_size = self.options.backbone_size
        options().alignment_file = open(self.options.outdir + "/sate.fasta")
        options().tree_file = open(self.options.outdir + "/sate.fasttree")
        _LOG.info(
            "Backbone alignment written to %s.\nBackbone tree written to %s" %
            (options().alignment_file, options().tree_file))
        options().fragment_file = query
Beispiel #10
0
    def generate_backbone(self):
        _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
        sequences = MutableAlignment()
        sequences.read_file_object(self.options.sequence_file)
        sequences.degap()
        fragments = MutableAlignment()
        if (options().median_full_length is not None
                or options().full_length_range is not None):
            if (options().median_full_length == -1):
                seq_lengths = sorted(
                    [len(seq) for seq in list(sequences.values())])
                lengths = len(seq_lengths)
                l2 = int(lengths / 2)
                if lengths % 2:
                    options().median_full_length = (seq_lengths[l2] +
                                                    seq_lengths[l2 + 1]) / 2.0
                else:
                    options().median_full_length = seq_lengths[l2]
            if options().full_length_range is not None:
                L = sorted(int(x) for x in options().full_length_range.split())
                min_length = L[0]
                max_length = L[1]
            else:
                (min_length,
                 max_length) = (int(options().median_full_length *
                                    (1 - options().backbone_threshold)),
                                int(options().median_full_length *
                                    (1 + options().backbone_threshold)))
            _LOG.info(
                "Full length sequences are set to be from %d to %d character long"
                % (min_length, max_length))
            frag_names = [
                name for name in sequences if len(sequences[name]) > max_length
                or len(sequences[name]) < min_length
            ]
            if (len(frag_names) > 0):
                _LOG.info("Detected %d fragmentary sequences" %
                          len(frag_names))
                fragments = sequences.get_hard_sub_alignment(frag_names)
                [sequences.pop(i) for i in list(fragments.keys())]
        if (options().backbone_size is None):
            options().backbone_size = min(1000, int(sequences.get_num_taxa()))
            _LOG.info("Backbone size set to: %d" % (options().backbone_size))
        if (options().backbone_size > len(list(sequences.keys()))):
            options().backbone_size = len(list(sequences.keys()))
        sample = sorted(
            random.sample(sorted(list(sequences.keys())),
                          options().backbone_size))
        backbone_sequences = sequences.get_hard_sub_alignment(sample)
        _LOG.debug("Backbone: %s" % (sorted(list(backbone_sequences.keys()))))
        [sequences.pop(i) for i in list(backbone_sequences.keys())]

        _LOG.info("Writing backbone set. ")
        backbone = get_temp_file("backbone", "backbone", ".fas")
        _write_fasta(backbone_sequences, backbone)

        _LOG.info("Generating pasta backbone alignment and tree. ")
        pastaalignJob = PastaAlignJob()
        moleculeType = options().molecule
        if (options().molecule == 'amino'):
            moleculeType = 'protein'
        pastaalignJob.setup(backbone,
                            options().backbone_size, moleculeType,
                            options().cpu, **vars(options().pasta))
        pastaalignJob.run()
        (a_file, t_file) = pastaalignJob.read_results()

        shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
        shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

        options().placement_size = self.options.backbone_size
        options().alignment_file = open(
            self.get_output_filename("pasta.fasta"))
        options().tree_file = open(self.get_output_filename("pasta.fasttree"))
        _LOG.info(
            "Backbone alignment written to %s.\nBackbone tree written to %s" %
            (options().alignment_file, options().tree_file))
        sequences.set_alignment(fragments)
        if (len(sequences) == 0):
            sequences = MutableAlignment()
            sequences.read_file_object(open(self.options.alignment_file.name))
            self.results = ExtendedAlignment(fragment_names=[])
            self.results.set_alignment(sequences)
            _LOG.info(
                "No query sequences to align.  Final alignment saved as %s" %
                self.get_output_filename("alignment.fasta"))
            self.output_results()
            sys.exit(0)
        else:
            query = get_temp_file("query", "backbone", ".fas")
            options().fragment_file = query
            _write_fasta(sequences, query)