Beispiel #1
0
    def testAlignmentReadFasta(self):
        """Read the mock FASTA alignment and sanity-check what was loaded.

        The alignment must hold 65 sequences and must not contain a column
        for which ``is_all_gap`` is true.
        """
        alg = MutableAlignment()
        fasta_path = get_data_path("mock/pyrg/sate.fasta")
        alg.read_filepath(fasta_path)

        n_seqs = len(alg)
        assert n_seqs == 65, "MutableAlignment length is %s" % n_seqs

        # Evaluate every column first, then require that none is gap-only.
        n_sites = alg.get_length()
        gap_only = [alg.is_all_gap(site) for site in range(n_sites)]
        assert not any(gap_only)
Beispiel #2
0
    def check_options(self):
        """Validate the combination of input files and derive size defaults.

        Two input combinations are accepted: tree, alignment and sequence
        file together (the sequence file is then used as the fragment file),
        or the sequence file alone (``self.generate_backbone()`` is called).
        Any other combination is logged as an error and the process exits
        with status -1.

        The alignment is then re-read to count its taxa.  That count is the
        default for ``backbone_size`` and must match it when the option was
        given explicitly.  ``placement_size`` defaults to the backbone size.

        Returns:
            The result of ``ExhaustiveAlgorithm.check_options(self)``.

        Raises:
            AssertionError: if ``backbone_size`` differs from the number of
                taxa found in the alignment.
        """
        self.check_outputprefix()
        options().info_file = "A_dummy_value"

        has_tree = options().tree_file is not None
        has_alignment = options().alignment_file is not None
        has_sequences = options().sequence_file is not None

        if has_tree and has_alignment and has_sequences:
            options().fragment_file = options().sequence_file
        elif not has_tree and not has_alignment and has_sequences:
            self.generate_backbone()
        else:
            _LOG.error(
                ("Either specify the backbone alignment and tree and query "
                 "sequences or only the query sequences.  Any other "
                 "combination is invalid"))
            exit(-1)

        # The alignment is opened a second time by name; use a context
        # manager so this extra handle is closed as soon as it is parsed.
        sequences = MutableAlignment()
        with open(self.options.alignment_file.name) as alignment_handle:
            sequences.read_file_object(alignment_handle)
        backbone_size = sequences.get_num_taxa()

        if options().backbone_size is None:
            options().backbone_size = backbone_size
        assert options().backbone_size == backbone_size, (
            ("Backbone parameter needs to match actual size of backbone; "
             "backbone parameter:%s backbone_size:%s") %
            (options().backbone_size, backbone_size))
        if options().placement_size is None:
            options().placement_size = options().backbone_size
        return ExhaustiveAlgorithm.check_options(self)
Beispiel #3
0
    def check_options(self):
        """Require a backbone tree and alignment, then run the base check.

        Logs an error and exits with status -1 when either the tree file or
        the alignment file option is missing.

        Returns:
            The result of ``ExhaustiveAlgorithm.check_options(self)``.
        """
        options().info_file = "A_dummy_value"

        if options().tree_file is None or options().alignment_file is None:
            _LOG.error("Specify the backbone alignment and tree and query sequences")
            exit(-1)

        # The parsed alignment is not used afterwards; presumably the read
        # only checks that the file can be parsed (TODO confirm).  The extra
        # handle opened here is closed right after parsing.
        sequences = MutableAlignment()
        with open(self.options.alignment_file.name) as alignment_handle:
            sequences.read_file_object(alignment_handle)
        return ExhaustiveAlgorithm.check_options(self)
Beispiel #4
0
    def check_options(self):
        """Check that tree and alignment were supplied, then defer to the base.

        When the tree file or the alignment file option is ``None`` an error
        is logged and the process exits with status -1.

        Returns:
            The result of ``ExhaustiveAlgorithm.check_options(self)``.
        """
        options().info_file = "A_dummy_value"

        if options().tree_file is None or options().alignment_file is None:
            _LOG.error("Specify the backbone alignment and tree and query sequences")
            exit(-1)

        # Re-open the alignment by name and parse it; the context manager
        # closes this second handle instead of leaking it.  The parsed
        # result itself is not used further in this method.
        sequences = MutableAlignment()
        with open(self.options.alignment_file.name) as alignment_handle:
            sequences.read_file_object(alignment_handle)
        return ExhaustiveAlgorithm.check_options(self)
Beispiel #5
0
def hmmer_to_markers(input, temp_dir):
    """Bin fragments by their best-scoring marker gene and orientation.

    Every fragment is written to ``<temp_dir>/frags.fas`` twice: as read and
    as returned by ``reverse_sequence`` under the name ``<name>_rev``.  Each
    gene profile is searched against that file, and for every fragment the
    gene/orientation with the highest score (``value[1]`` of the search
    result) wins.

    Args:
        input: path of the fragment file, read with ``read_filepath``.
        temp_dir: directory receiving the intermediate and output files.

    Returns:
        dict mapping gene -> {fragment name: sequence}; the sequence is
        passed through ``reverse_sequence`` when the reverse orientation
        scored best.  Fragments without any hit are omitted.  Each gene's
        fragments are also written to
        ``<temp_dir>/<gene>.frags.fas.fixed``.
    """
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = {name + '_rev': reverse_sequence(seq)
               for (name, seq) in fragments.items()}
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Best hit so far per fragment: [score, gene, direction].
    frag_scores = {name: [-10000, 'NA', 'NA'] for name in fragments.keys()}
    gene_set = marker_genes
    align_name = 'sate'
    if options().genes == 'cogs':
        gene_set = cog_genes
        align_name = 'pasta'

    for gene in gene_set:
        # Now run HMMER search
        out_file = temp_dir + "/%s.out" % gene
        hmmer_search(
            frag_file,
            os.path.join(
                options().__getattribute__('reference').path,
                # Both placeholders must be %s; "%.profile" is not a valid
                # conversion and raises ValueError.
                'refpkg/%s.refpkg/%s.profile' % (gene, align_name)),
            out_file)
        results = read_hmmsearch_results(out_file)

        # Now select best direction for each frag
        for name, value in results.items():
            bitscore = value[1]
            true_name = name
            direction = 'forward'
            # Only a trailing "_rev" marks the reversed copy; a fragment
            # whose own name merely contains "_rev" stays a forward hit.
            if name not in frag_scores and name.endswith('_rev'):
                true_name = name[:-len('_rev')]
                direction = 'reverse'
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Now bin the fragments
    genes = {}
    for name, (_, gene, direction) in frag_scores.items():
        if gene == 'NA':
            # No hit for this fragment: it is not part of the result.
            continue
        seq = fragments[name]
        if direction != 'forward':
            seq = reverse_sequence(seq)
        genes.setdefault(gene, {})[name] = seq

    for gene, seqs in genes.items():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(seqs, gene_file + ".fixed")
    return genes
 def testAlignmentReadFasta(self):
     """Read the mock FASTA alignment, print it and sanity-check it.

     The alignment must hold 65 sequences and no column for which
     ``is_all_gap`` is true.
     """
     print("====== starting testAlignmentReadFasta ==========")
     alg = MutableAlignment()
     alg.read_filepath("data/mock/pyrg/sate.fasta")

     # A single pre-formatted argument prints the same text whether
     # ``print`` is the Python 2 statement or the Python 3 function.
     print("Maing alignment is:\n\n%s" % (alg,))

     assert len(alg) == 65, "MutableAlignment length is %s" % len(alg)

     # ``range`` exists on both Python 2 and 3 (``xrange`` does not).
     assert all([not alg.is_all_gap(i) for i in range(0, alg.get_length())])
def hmmer_to_markers(input, temp_dir):
    """Assign each fragment to the marker gene and strand that scores best.

    The fragments and their ``reverse_sequence`` counterparts (named
    ``<name>_rev``) are written to ``<temp_dir>/frags.fas`` and searched
    against every gene profile with ``hmmer_search``.  Per fragment, the hit
    with the highest score (element 1 of the search result) is kept.

    Args:
        input: path of the fragment file, read with ``read_filepath``.
        temp_dir: directory receiving the intermediate and output files.

    Returns:
        dict mapping gene -> {fragment name: sequence}, where the sequence
        went through ``reverse_sequence`` if the reverse hit was best.
        Fragments that were never hit are left out.  The per-gene fragments
        are also written to ``<temp_dir>/<gene>.frags.fas.fixed``.
    """
    rev_suffix = "_rev"

    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = {name + rev_suffix: reverse_sequence(seq) for (name, seq) in fragments.items()}
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Best hit seen so far for each fragment: [score, gene, direction].
    frag_scores = {name: [-10000, "NA", "NA"] for name in fragments.keys()}
    gene_set = marker_genes
    align_name = "sate"
    if options().genes == "cogs":
        gene_set = cog_genes
        align_name = "pasta"

    for gene in gene_set:
        # Now run HMMER search
        search_output = temp_dir + "/%s.out" % gene
        # "%s.profile" (not "%.profile"): the latter is an invalid
        # conversion specifier and raises ValueError.
        profile = os.path.join(
            options().__getattribute__("reference").path, "refpkg/%s.refpkg/%s.profile" % (gene, align_name)
        )
        hmmer_search(frag_file, profile, search_output)
        results = read_hmmsearch_results(search_output)

        # Now select best direction for each frag
        for name, hit in results.items():
            bitscore = hit[1]
            direction = "forward"
            true_name = name
            # Treat a hit as reversed only when the name is not an original
            # fragment name and carries the "_rev" suffix; strip just that
            # suffix rather than every "_rev" occurrence.
            if name not in frag_scores and name.endswith(rev_suffix):
                true_name = name[: -len(rev_suffix)]
                direction = "reverse"
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Now bin the fragments
    genes = {}
    for name, (_, gene, direction) in frag_scores.items():
        if gene == "NA":
            # Never hit by any profile: excluded from the result.
            continue
        if direction == "forward":
            sequence = fragments[name]
        else:
            sequence = reverse_sequence(fragments[name])
        genes.setdefault(gene, {})[name] = sequence

    for gene, gene_fragments in genes.items():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(gene_fragments, gene_file + ".fixed")
    return genes
Beispiel #8
0
def blast_to_markers(input, temp_dir):
    """Bin fragments to marker genes, then pick each fragment's orientation.

    Binning comes from ``options().gene`` when set (all fragments go to that
    gene); otherwise from BLAST results, which are either produced here or
    taken from ``options().blast_file``.  For every gene the fragments and
    their ``reverse_sequence`` copies are searched with HMMER and the
    better-scoring orientation is kept.

    Args:
        input: path of the fragment file.
        temp_dir: directory receiving intermediate and output files.

    Returns:
        dict mapping gene -> {fragment name: sequence in the chosen
        orientation}.  Each gene's result is also written to
        ``<temp_dir>/<gene>.frags.fas.fixed``.
    """
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    if options().gene is not None:
        # A single gene was requested: every fragment is binned to it.
        gene_binning = {options().gene: list(fragments.keys())}
    else:
        # First blast sequences against all markers, unless results exist.
        blast_results = options().blast_file
        if blast_results is None:
            blast_results = temp_dir + "/blast.out"
            print("Blasting fragments against marker dataset\n")
            blast_fragments(input, blast_results)
        # Next bin the blast hits to the best gene
        gene_binning = bin_blast_results(blast_results)

    # Collect the sequences belonging to each gene.
    binned_fragments = {}
    for gene in gene_binning:
        members = {}
        for seq_name in gene_binning[gene]:
            members[seq_name] = fragments[seq_name]
        binned_fragments[gene] = members

    print("Finding best orientation of reads\n")
    align_name = 'pasta' if options().genes == 'cogs' else 'sate'

    for gene, frags in binned_fragments.items():
        # Forward fragments plus their reversed counterparts in one file.
        reversed_frags = {}
        for name, seq in frags.items():
            reversed_frags[name + '_rev'] = reverse_sequence(seq)
        gene_frags = MutableAlignment()
        gene_frags.set_alignment(frags)
        gene_frags.set_alignment(reversed_frags)
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(gene_frags, gene_file)

        # HMMER search of both orientations against the gene's model.
        search_output = temp_dir + "/%s.out" % gene
        model_path = os.path.join(
            options().__getattribute__('reference').path,
            'refpkg/%s.refpkg/%s.hmm' % (gene, align_name))
        hmmer_search(gene_file, model_path, search_output)
        results = read_hmmsearch_results(search_output)

        # Keep the reversed sequence only when it scores strictly higher.
        for key in frags:
            rev_key = key + "_rev"
            forward_score = results[key][1] if key in results else -10000
            backward_score = (
                results[rev_key][1] if rev_key in results else -10000)
            if backward_score > forward_score:
                frags[key] = gene_frags[rev_key]

        # Now write to file
        _write_fasta(frags, gene_file + ".fixed")
        binned_fragments[gene] = frags
    return binned_fragments
Beispiel #9
0
 def read_alignment_and_tree(self):
     """Read the input alignment and tree given in ``self.options``.

     Returns:
         An ``(alignment, tree)`` tuple: a ``MutableAlignment`` filled from
         ``self.options.alignment_file`` and a ``PhylogeneticTree`` wrapping
         the newick tree read from ``self.options.tree_file``.
     """
     _LOG.info("Reading input alignment: %s" %(self.options.alignment_file))
     alignment = MutableAlignment()
     alignment.read_file_object(self.options.alignment_file)

     _LOG.info("Reading input tree: %s" %(self.options.tree_file))
     # Use the factory method rather than the ``Tree(stream=...)``
     # constructor form, which newer DendroPy releases no longer accept.
     tree = PhylogeneticTree(
         dendropy.Tree.get_from_stream(self.options.tree_file,
                                       schema="newick",
                                       preserve_underscores=True))

     return (alignment, tree)
Beispiel #10
0
    def testReadOnlySubAlignment(self):
        """Exercise ``ReadonlySubalignment`` on three named sequences.

        Checks size, key set, sequence content, immutability, alignment
        length, gap detection, writing to ``self.fp_dummy1`` and conversion
        to a mutable alignment with all-gap columns removed.
        """
        alg = MutableAlignment()
        alg.read_filepath(get_data_path("mock/pyrg/sate.fasta"))

        subset = [
            'NC_008701_720717_722309', 'NC_013156_149033_150643',
            'NC_013887_802739_801129'
        ]
        readonly_subalignment = ReadonlySubalignment(subset, alg)

        assert len(readonly_subalignment) == 3, len(readonly_subalignment)

        assert set(readonly_subalignment.keys()) == set(
            readonly_subalignment.get_sequence_names()) == set(subset), \
            "Subalignment keys not matching given keys %s vs %s" % (
            list(readonly_subalignment.keys()), subset)

        for (k, s) in list(readonly_subalignment.items()):
            assert k in subset, \
                "%s not found in subset but returned by subalignment" % k
            assert s == alg[k], \
                "sequence associated with %s not matching parent alignment" % k

        # Item assignment must be rejected with TypeError.  The failure is
        # raised explicitly so the check also holds when asserts are
        # stripped with ``python -O``.
        try:
            readonly_subalignment[2] = "ACGT"
        except TypeError:
            pass
        else:
            raise AssertionError(
                "Readonly alignment is successfully modified. ")

        assert readonly_subalignment.get_length() == alg.get_length(), \
            "alignment length should not change"

        assert readonly_subalignment.is_aligned() is True

        assert readonly_subalignment.is_all_gap(2) is True, \
            "Site 2 should be all gaps"
        # The message names the site that is actually tested (150).
        assert readonly_subalignment.is_all_gap(150) is False, \
            "Site 150 should not be all gaps"

        readonly_subalignment.write_to_path(
            self.fp_dummy1)  # "mock/pyrg/sate.sub.fasta"

        mutable_subalignment = readonly_subalignment.get_mutable_alignment()
        mutable_subalignment.delete_all_gap()

        assert all([
            not mutable_subalignment.is_all_gap(i)
            for i in range(0, mutable_subalignment.get_length())
        ])
Beispiel #11
0
    def read_alignment_and_tree(self):
        """Load the input alignment and tree referenced by ``self.options``.

        Returns:
            An ``(alignment, tree)`` tuple: the ``MutableAlignment`` read
            from ``self.options.alignment_file`` and a ``PhylogeneticTree``
            built around the newick tree read from
            ``self.options.tree_file``.
        """
        alignment_source = self.options.alignment_file
        _LOG.info("Reading input alignment: %s" % (alignment_source))
        alignment = MutableAlignment()
        alignment.read_file_object(alignment_source)

        tree_source = self.options.tree_file
        _LOG.info("Reading input tree: %s" % tree_source)
        newick_tree = dendropy.Tree.get_from_stream(
            tree_source, schema="newick", preserve_underscores=True)

        return (alignment, PhylogeneticTree(newick_tree))
def main():
    """Split the input sequences into fragment and full-length files.

    Sequences no longer than ``args.threshold`` are written to
    ``<output>.frag.fas``; all others go to ``<output>.full.fas``.
    """
    args = parse_args()
    sequences = MutableAlignment()
    readable = os.path.isfile(args.input) and os.access(args.input, os.R_OK)
    assert readable, "Input file %s does not exist\n" % args.input
    sequences.read_file_object(args.input)

    frag = MutableAlignment()
    full = MutableAlignment()
    for key, seq in sequences.items():
        # Route each sequence to exactly one of the two output sets.
        target = frag if len(seq) <= args.threshold else full
        target[key] = seq

    frag.write_to_path("%s.frag.fas" % args.output)
    full.write_to_path("%s.full.fas" % args.output)
Beispiel #13
0
    def read_and_divide_fragments(self, chunks, extra_frags=None):
        """Read the fragment file, add extra fragments and write chunk files.

        Args:
            chunks: chunk count handed to ``divide_to_equal_chunks``
                together with ``self.options.max_chunk_size``.
            extra_frags: optional mapping of name -> sequence; each sequence
                is added to the fragments with its "-" characters removed.
                ``None`` (the default) means no extra fragments.

        Returns:
            A list with one entry per chunk: the path of the temporary FASTA
            file written for that chunk, or ``None`` for a falsy chunk.

        Raises:
            ValueError: if a fragment name also occurs in the reference
                subalignment.
        """
        # A ``None`` default avoids sharing one dict object between calls.
        if extra_frags is None:
            extra_frags = {}

        max_chunk_size = self.options.max_chunk_size
        _LOG.debug(
            "start reading fragment files and breaking to at least %s chunks but at most %s sequences "
            % (str(chunks), str(max_chunk_size)))
        self.root_problem.fragments = MutableAlignment()
        self.root_problem.fragments.read_file_object(
            self.options.fragment_file)

        # test if input fragment names might collide with reference names.
        # code contribution by Stefan Janssen (June 13th, 2018)
        ids_reference = set(self.root_problem.subalignment.keys())
        ids_inputfragments = set(self.root_problem.fragments.keys())
        ids_overlap = ids_reference & ids_inputfragments
        if ids_overlap:
            # Sorted so that the message is the same on every run.
            raise ValueError(
                ("Your input fragment file contains %i sequences, whose names "
                 "overlap with names in your reference. Please rename your inp"
                 "ut fragments and re-start. Duplicate names are:\n  '%s'") %
                (len(ids_overlap), "'\n  '".join(sorted(ids_overlap))))

        for (k, v) in extra_frags.items():
            self.root_problem.fragments[k] = v.replace("-", "")
        alg_chunks = self.root_problem.fragments.divide_to_equal_chunks(
            chunks, max_chunk_size)
        ret = []
        for i, chunk in enumerate(alg_chunks):
            temp_file = None
            if chunk:
                temp_file = get_temp_file("fragment_chunk_%d" % i,
                                          "fragment_chunks", ".fasta")
                chunk.write_to_path(temp_file)
            ret.append(temp_file)
        _LOG.debug("fragment files read and divided.")
        return ret
Beispiel #14
0
    def figureout_fragment_subset(self):
        ''' Figure out which fragment should go to which subproblem.

        Collects, for every fragment, the "hmmsearch" result score obtained
        under each alignment subproblem, turns the scores into weights,
        selects the top-weighted subproblems and stores the fragment in each
        selected subproblem's ``fragments`` under a name of the form
        ``<fragment>_<subproblem label>_<postfix>``.

        Does nothing when the root problem is already annotated with
        "fragments.distribution.done"; sets that annotation at the end.

        NOTE(review): ``has_key``, ``iteritems`` and the bare ``reduce``
        used below are available as written only on Python 2.
        '''
        # We need to keep and check the following flag because of checkpointing scenarios (join already done before!)
        if self.root_problem.annotations.has_key(
                "fragments.distribution.done"):
            return
        # fragment name -> list of (score, alignment subproblem) pairs
        bitscores = dict([(name, [])
                          for name in self.root_problem.fragments.keys()])
        for fragment_chunk_problem in self.root_problem.iter_leaves():
            # The parent of each leaf is the alignment subproblem it belongs to.
            align_problem = fragment_chunk_problem.get_parent()
            assert isinstance(align_problem, SeppProblem)
            '''For each subproblem start with an empty set of fragments,
            and add to them as we encounter new best hits for that subproblem'''
            if align_problem.fragments is None:
                align_problem.fragments = MutableAlignment()
            search_res = fragment_chunk_problem.get_job_result_by_name(
                "hmmsearch")
            for key in search_res.keys():
                ''' keep a list of all hits, and their bit scores'''
                # Element 1 of a search result is used as the score.
                bitscores[key].append((search_res[key][1], align_problem))

        for frag, tuplelist in bitscores.iteritems():
            ''' TODO: what to do with those that are not? For now, only output warning message'''
            #TODO:  Need to double check and fix the math
            if len(tuplelist) == 0:
                _LOG.warning("Fragment %s is not scored against any subset" %
                             str(frag))
                continue
            ''' convert bit scores to probabilities '''
            # Weight = 2**score, normalized over all hits and scaled to
            # millionths.  The score is capped at 1022 before exponentiation
            # (presumably to stay within float range -- TODO confirm).
            denum = sum(math.pow(2, min(x[0], 1022)) for x in tuplelist)
            tuplelist = [((math.pow(2, min(x[0], 1022)) / denum * 1000000),
                          x[1]) for x in tuplelist]
            ''' Sort subsets by their probability'''
            # Tuples are compared, so equal weights fall through to comparing
            # the subproblem objects themselves.
            tuplelist.sort(reverse=True)
            ''' Find enough subsets to reach the threshold '''
            # The fold walks (index, weight) pairs carrying
            # (index of the pair just seen, running weight sum).  Once the
            # running sum is no longer below
            # int(1000000 * self.alignment_threshold) the sum slot turns to
            # None and the index stays fixed.  The final index, but at least
            # 1, is the number of leading entries selected.
            # NOTE(review): if the running sum never reaches the threshold
            # before the last pair is folded in, the final index is
            # len(tuplelist) - 1, so the last subset is left out -- confirm
            # whether that is intended (see the TODO above).
            selected = tuplelist[0:max(
                1,
                reduce(
                    lambda x, y: (x[0], None) if x[1] is None else
                    (y[0], x[1] + y[1])
                    if x[1] < int(1000000 * self.alignment_threshold) else
                    (y[0], None), enumerate([x[0] for x in tuplelist]))[0])]
            ''' Renormalize the selected list to add up to 1'''
            # renorm ends up as 1 / (sum of the selected weights as fractions).
            renorm = 0
            for (prob, align_problem) in selected:
                renorm = renorm + prob / 1000000
            renorm = 1 / renorm

            _LOG.debug("Fragment %s assigned to %d subsets" %
                       (frag, len(selected)))
            ''' Rename the fragment and assign it to the respective subsets'''
            for (prob, align_problem) in selected:
                # Postfix is the renormalized weight in millionths when
                # weighting is switched on, else the constant 1000000; "%d"
                # below truncates it to an integer.
                postfix = prob * renorm if options(
                ).exhaustive.weight_placement_by_alignment.lower(
                ) == "true" else 1000000
                frag_rename = "%s_%s_%d" % (frag, align_problem.label, postfix)
                align_problem.fragments[
                    frag_rename] = self.root_problem.fragments[frag]

        # Mark the distribution as done so a later call returns immediately.
        self.root_problem.annotations["fragments.distribution.done"] = 1
Beispiel #15
0
    def read_and_divide_fragments(self, chunks, extra_frags=None):
        """Read and validate the fragment file, then write it out in chunks.

        Args:
            chunks: chunk count handed to ``divide_to_equal_chunks``
                together with ``self.options.max_chunk_size``.
            extra_frags: optional mapping of name -> sequence; each sequence
                is added to the fragments with its "-" characters removed.
                ``None`` (the default) means no extra fragments.

        Returns:
            A list with one entry per chunk: the path of the temporary FASTA
            file written for that chunk, or ``None`` for a falsy chunk.

        Raises:
            ValueError: if fragment names overlap with the reference
                subalignment and ``self.options.ignore_overlap`` is falsy,
                or if any fragment name contains a space or a tab.
        """
        # A ``None`` default avoids sharing one dict object between calls.
        if extra_frags is None:
            extra_frags = {}

        max_chunk_size = self.options.max_chunk_size
        _LOG.debug(
            ("start reading fragment files and breaking to at least %s chunks"
             " but at most %s sequences ") %
            (str(chunks), str(max_chunk_size)))
        self.root_problem.fragments = MutableAlignment()
        self.root_problem.fragments.read_file_object(
            self.options.fragment_file)

        # test if input fragment names might collide with reference names.
        # code contribution by Stefan Janssen (June 13th, 2018)
        ids_reference = set(self.root_problem.subalignment.keys())
        ids_inputfragments = set(self.root_problem.fragments.keys())
        ids_overlap = ids_reference & ids_inputfragments
        if ids_overlap:
            # Names are sorted so messages are identical on every run.
            if not self.options.ignore_overlap:
                raise ValueError(
                    ("Your input fragment file contains %i sequences, whose "
                     "names overlap with names in your reference. Please "
                     "rename your input fragments and re-start. Duplicate "
                     "names are:\n  '%s'") %
                    (len(ids_overlap), "'\n  '".join(sorted(ids_overlap))))
            _LOG.debug("Ignoring following %i query sequences present "
                       "in the backbone: \n '%s'" %
                       (len(ids_overlap), "' , '".join(sorted(ids_overlap))))

            # Keep only the fragments whose names are not in the reference.
            self.root_problem.fragments = self.root_problem.fragments.\
                get_soft_sub_alignment(ids_inputfragments - ids_reference)

        # test if input fragment names contain whitespaces / tabs which would
        # cause hmmsearch to fail.
        # code contribution by Stefan Janssen (June 22nd, 2018)
        ids_inputfragments_spaces = sorted(
            id_ for id_ in ids_inputfragments if (' ' in id_) or ('\t' in id_))
        if ids_inputfragments_spaces:
            raise ValueError(
                ("Your input fragment file contains %i sequences, whose names "
                 "contain either whitespaces: ' ' or tabulator '\\t' symbols. "
                 "Please rename your input fragments and re-start. Affected "
                 "names are:\n  '%s'") %
                (len(ids_inputfragments_spaces),
                 "'\n  '".join(ids_inputfragments_spaces)))

        for (k, v) in extra_frags.items():
            self.root_problem.fragments[k] = v.replace("-", "")
        alg_chunks = self.root_problem.fragments.divide_to_equal_chunks(
            chunks, max_chunk_size)
        ret = []
        for i, chunk in enumerate(alg_chunks):
            temp_file = None
            if chunk:
                temp_file = get_temp_file("fragment_chunk_%d" % i,
                                          "fragment_chunks", ".fasta")
                chunk.write_to_path(temp_file)
            ret.append(temp_file)
        _LOG.debug("fragment files read and divided.")
        return ret
Beispiel #16
0
 def generate_backbone(self):
     """Split the input sequences into backbone and query, align the backbone.

     A random sample of ``options().backbone_size`` sequences (default:
     the smaller of 100 and 20% of the input, rounded down) becomes the
     backbone; the remaining sequences become the query set.  Both sets are
     written to temporary FASTA files and a ``SateAlignJob`` is run on the
     backbone.  Afterwards the alignment, tree, fragment-file and
     placement-size options are updated to refer to the generated files.
     """
     _LOG.info("Reading input sequences: %s" %(self.options.sequence_file))
     sequences = MutableAlignment()
     sequences.read_file_object(self.options.sequence_file)
     if (options().backbone_size is None):
         options().backbone_size = min(100,int(.20*sequences.get_num_taxa()))
         _LOG.info("Backbone size set to: %d" %(options().backbone_size))

     # random.sample needs a real sequence: a dict keys view is rejected on
     # Python 3.11+, so materialize the names first.
     backbone_names = random.sample(list(sequences.keys()),
                                    options().backbone_size)
     backbone_sequences = sequences.get_hard_sub_alignment(backbone_names)
     # Whatever is left in ``sequences`` afterwards is the query set.
     for name in list(backbone_sequences.keys()):
         sequences.pop(name)

     _LOG.info("Writing query and backbone set. ")
     query = get_temp_file("query", "backbone", ".fas")
     backbone = get_temp_file("backbone", "backbone", ".fas")
     _write_fasta(sequences, query)
     _write_fasta(backbone_sequences, backbone)

     _LOG.info("Generating sate backbone alignment and tree. ")
     satealignJob = SateAlignJob()
     moleculeType = options().molecule
     if (options().molecule == 'amino'):
         moleculeType =  'protein'
     satealignJob.setup(backbone,options().backbone_size,self.options.outdir,moleculeType,options().cpu)
     satealignJob.run()
     satealignJob.read_results()

     options().placement_size = self.options.backbone_size
     # These handles are stored on the options object and deliberately left
     # open for whoever reads the alignment and tree later.
     options().alignment_file = open(self.options.outdir + "/sate.fasta")
     options().tree_file = open(self.options.outdir + "/sate.fasttree")
     _LOG.info("Backbone alignment written to %s.\nBackbone tree written to %s" % (options().alignment_file, options().tree_file))
     options().fragment_file = query
Beispiel #17
0
    def testReadOnlySubAlignment(self):
        """Exercise ``ReadonlySubalignment`` on three named sequences.

        Checks size, key set, sequence content, immutability, alignment
        length, gap detection, writing to ``self.fp_dummy1`` and conversion
        to a mutable alignment with all-gap columns removed.
        """
        alg = MutableAlignment()
        alg.read_filepath(get_data_path("mock/pyrg/sate.fasta"))

        subset = ['NC_008701_720717_722309', 'NC_013156_149033_150643',
                  'NC_013887_802739_801129']
        readonly_subalignment = ReadonlySubalignment(subset, alg)

        assert len(readonly_subalignment) == 3, len(readonly_subalignment)

        assert set(readonly_subalignment.keys()) == set(
            readonly_subalignment.get_sequence_names()) == set(subset), \
            "Subalignment keys not matching given keys %s vs %s" % (
            list(readonly_subalignment.keys()), subset)

        for (k, s) in list(readonly_subalignment.items()):
            assert k in subset, \
                "%s not found in subset but returned by subalignment" % k
            # "%s", not "%k": the latter is an invalid conversion and would
            # raise ValueError instead of reporting the mismatch.
            assert s == alg[k], \
                "sequence associated with %s not matching parent alignment" % k

        # Item assignment must be rejected with TypeError.  The failure is
        # raised explicitly so the check also holds when asserts are
        # stripped with ``python -O``.
        try:
            readonly_subalignment[2] = "ACGT"
        except TypeError:
            pass
        else:
            raise AssertionError(
                "Readonly alignment is successfully modified. ")

        assert readonly_subalignment.get_length() == alg.get_length(), \
            "alignment length should not change"

        assert readonly_subalignment.is_aligned() is True

        assert readonly_subalignment.is_all_gap(2) is True, \
            "Site 2 should be all gaps"
        # The message names the site that is actually tested (150).
        assert readonly_subalignment.is_all_gap(150) is False, \
            "Site 150 should not be all gaps"

        readonly_subalignment.write_to_path(
            self.fp_dummy1)  # "mock/pyrg/sate.sub.fasta"

        mutable_subalignment = readonly_subalignment.get_mutable_alignment()
        mutable_subalignment.delete_all_gap()

        assert all([not mutable_subalignment.is_all_gap(i)
                    for i in range(0, mutable_subalignment.get_length())])
Beispiel #18
0
    def testAlignmentReadFasta(self):
        """Check that the mock FASTA file loads as a gap-column-free alignment.

        Expects exactly 65 sequences and ``is_all_gap`` to be false for
        every column index.
        """
        alg = MutableAlignment()
        source = get_data_path("mock/pyrg/sate.fasta")
        alg.read_filepath(source)

        loaded = len(alg)
        assert loaded == 65, "MutableAlignment length is %s" % loaded

        # Collect the gap flag of every column, then require none to be set.
        column_count = alg.get_length()
        flags = [alg.is_all_gap(column) for column in range(column_count)]
        assert not any(flags)
 def testReadOnlySubAlignment(self):
     """Exercise ``ReadonlySubalignment`` on sequences 9-11 of the mock file.

     Checks size, key order, sequence content, immutability, alignment
     length, gap detection, writing to "data/mock/pyrg/sate.sub.fasta" and
     conversion to a mutable alignment with all-gap columns removed.
     Written so that it runs unchanged on Python 2 and Python 3.
     """
     print("======= starting testReadOnlySubAlignment =========")
     alg = MutableAlignment()
     alg.read_filepath("data/mock/pyrg/sate.fasta")

     # ``keys()`` is not sliceable on Python 3; materialize it first.
     subset = list(alg.keys())[9:12]
     readonly_subalignment = ReadonlySubalignment(subset, alg)

     # A single pre-formatted argument prints the same text whether
     # ``print`` is the Python 2 statement or the Python 3 function.
     print("subalignment is:\n\n%s" % (readonly_subalignment,))

     assert len(readonly_subalignment) == 3, len(readonly_subalignment)

     keys = list(readonly_subalignment.keys())
     names = list(readonly_subalignment.get_sequence_names())
     assert keys == names == subset, "Subalignment keys not matching given keys %s vs %s" % (keys, subset)

     for (k, s) in readonly_subalignment.items():
         assert k in subset, "%s not found in subset but returned by subalignment" % k
         # "%s", not "%k": the latter is an invalid conversion and would
         # raise ValueError instead of reporting the mismatch.
         assert s == alg[k], "sequence associated with %s not matching parent alignment" % k

     try:
         readonly_subalignment[2] = "ACGT"
         assert False, "Readony alignment is successfully modified. "
     except TypeError:
         pass

     assert readonly_subalignment.get_length() == alg.get_length(), "alignment length should not change"

     assert readonly_subalignment.is_aligned() == True

     assert readonly_subalignment.is_all_gap(2) == True, "Site 2 should be all gaps"
     # The message names the site that is actually tested (150).
     assert readonly_subalignment.is_all_gap(150) == False, "Site 150 should not be all gaps"

     readonly_subalignment.write_to_path("data/mock/pyrg/sate.sub.fasta")

     mutable_subalignment = readonly_subalignment.get_mutable_alignment()
     mutable_subalignment.delete_all_gap()

     assert all([not mutable_subalignment.is_all_gap(i) for i in range(0, mutable_subalignment.get_length())])

     print("======= finishing testReadOnlySubAlignment =========")
Beispiel #20
0
    def check_options(self):
        """Validate the combination of input files and derive size defaults.

        Accepted inputs are tree, alignment and sequence file together (the
        sequence file is then used as the fragment file) or the sequence
        file alone (``self.generate_backbone()`` is called).  Any other
        combination is logged as an error and the process exits with
        status -1.

        The number of taxa in the alignment is the default for
        ``backbone_size`` and must match an explicitly given value;
        ``placement_size`` defaults to the backbone size.

        Returns:
            The result of ``ExhaustiveAlgorithm.check_options(self)``.

        Raises:
            AssertionError: if ``backbone_size`` differs from the number of
                taxa found in the alignment.
        """
        options().info_file = "A_dummy_value"

        tree_given = options().tree_file is not None
        alignment_given = options().alignment_file is not None
        sequences_given = options().sequence_file is not None

        # Check to see if tree/alignment/fragment file provided, if not,
        # generate it from sequence file
        if tree_given and alignment_given and sequences_given:
            options().fragment_file = options().sequence_file
        elif not tree_given and not alignment_given and sequences_given:
            self.generate_backbone()
        else:
            _LOG.error("Either specify the backbone alignment and tree and query sequences or only the query sequences.  Any other combination is invalid")
            exit(-1)

        # The alignment is opened a second time by name; the context manager
        # closes that extra handle once the file has been parsed.
        sequences = MutableAlignment()
        with open(self.options.alignment_file.name) as alignment_handle:
            sequences.read_file_object(alignment_handle)
        backbone_size = sequences.get_num_taxa()

        if options().backbone_size is None:
            options().backbone_size = backbone_size
        assert options().backbone_size == backbone_size, ("Backbone parameter needs to match actual size of backbone; backbone parameter:%s backbone_size:%s"
                %(options().backbone_size, backbone_size))
        if options().placement_size is None:
            options().placement_size = options().backbone_size
        return ExhaustiveAlgorithm.check_options(self)
Beispiel #21
0
    def output_results(self):
        """Write the result alignment in its unmasked and masked forms.

        Writes, in this order: the unmasked alignment, the insertion column
        indexes, optionally the backtranslated alignment (unmasked, then
        masked) when ``self.options.backtranslation_sequence_file`` is set,
        and finally the masked alignment.  A failing backtranslation is
        logged as a warning and its two files are skipped.
        """
        result_alignment = self.results
        _LOG.info("Generating output. ")

        unmasked_path = self.get_output_filename("alignment.fasta")
        result_alignment.write_to_path(unmasked_path)
        _LOG.info("Unmasked alignment written to %s" % unmasked_path)

        columns_path = self.get_output_filename("insertion_columns.txt")
        result_alignment.write_insertion_column_indexes(columns_path)
        _LOG.info("The index of insertion columns written to %s" % columns_path)

        if self.options.backtranslation_sequence_file:
            bt_path = self.get_output_filename(
                "backtranslated_alignment.fasta")
            dna_seqs = MutableAlignment()
            dna_seqs.read_file_object(
                self.options.backtranslation_sequence_file)
            try:
                # Must run before insertion columns are removed below.
                bt_alignment = backtranslate(self.results, dna_seqs)
            except Exception as e:
                _LOG.warning(
                    "Backtranslation failed due to following error: %s.\n"
                    "No translated DNA sequence will be written to a file."
                    % str(e))
            else:
                bt_alignment.write_to_path(bt_path)
                _LOG.info("Backtranslated alignment written to %s" % bt_path)

                bt_alignment.remove_insertion_columns()
                bt_masked_path = self.get_output_filename(
                    "backtranslated_alignment_masked.fasta")
                bt_alignment.write_to_path(bt_masked_path)
                _LOG.info(
                    "Backtranslated masked alignment written to %s"
                    % bt_masked_path)

        result_alignment.remove_insertion_columns()
        masked_path = self.get_output_filename("alignment_masked.fasta")
        result_alignment.write_to_path(masked_path)
        _LOG.info("Masked alignment written to %s" % masked_path)
Beispiel #22
0
    def testAlignmentReadFasta(self):
        """Load the mock FASTA alignment, print it and sanity-check it.

        Expects 65 sequences and ``is_all_gap`` to be false for every
        column index.
        """
        print("====== starting testAlignmentReadFasta ==========")
        alg = MutableAlignment()
        alg.read_filepath("data/mock/pyrg/sate.fasta")

        print("Maing alignment is:\n\n", alg)

        seq_count = len(alg)
        assert seq_count == 65, "MutableAlignment length is %s" % seq_count

        # Gather the gap flag of each column, then require none to be set.
        site_count = alg.get_length()
        gap_flags = [alg.is_all_gap(site) for site in range(site_count)]
        assert not any(gap_flags)
Beispiel #23
0
 def read_and_divide_fragments(self, chunks, extra_frags=None):
     """Read the fragment file, add extra fragments and write chunk files.

     Args:
         chunks: number of chunks requested from ``divide_to_equal_chunks``;
             exactly this many chunk files are written.
         extra_frags: optional mapping of name -> sequence; each sequence
             is added to the fragments with its "-" characters removed.
             ``None`` (the default) means no extra fragments.

     Returns:
         A list of ``chunks`` temporary FASTA file paths, one per chunk.
     """
     # A ``None`` default avoids sharing one dict object between calls.
     if extra_frags is None:
         extra_frags = {}

     _LOG.debug("start reading fragment files and breaking to chunks: %d" %
                chunks)
     self.root_problem.fragments = MutableAlignment()
     self.root_problem.fragments.read_file_object(
         self.options.fragment_file)
     # ``items``/``range`` work on Python 2 and 3 alike, unlike
     # ``iteritems``/``xrange``.
     for (k, v) in extra_frags.items():
         self.root_problem.fragments[k] = v.replace("-", "")
     alg_chunks = self.root_problem.fragments.divide_to_equal_chunks(chunks)
     ret = []
     for i in range(0, chunks):
         temp_file = get_temp_file("fragment_chunk_%d" % i,
                                   "fragment_chunks", ".fasta")
         alg_chunks[i].write_to_path(temp_file)
         ret.append(temp_file)
     _LOG.debug("fragment files read and divided.")
     return ret
Beispiel #24
0
    def check_options(self):
        """Validate the input combination and fill in unset size options.

        Accepts either tree + alignment + sequence files (the sequence file
        then becomes the fragment file) or a sequence file alone (in which
        case ``generate_backbone`` is called).  Any other combination logs
        an error and exits with status -1.

        ``backbone_size`` defaults to the number of taxa in the alignment
        file and must match it; ``placement_size`` defaults to
        ``backbone_size``.  An unset ``alignment_size`` becomes 10 for
        'amino' input; otherwise it starts at 10 and, when the average
        p-distance exceeds .60, is doubled while the doubled value stays
        below the number of taxa.

        Returns:
            Whatever ``ExhaustiveAlgorithm.check_options`` returns.
        """
        options().info_file = "A_dummy_value"

        # Check to see if tree/alignment/fragment file provided, if not,
        # generate it from sequence file
        has_tree = options().tree_file is not None
        has_alignment = options().alignment_file is not None
        has_sequences = options().sequence_file is not None
        if has_tree and has_alignment and has_sequences:
            options().fragment_file = options().sequence_file
        elif not has_tree and not has_alignment and has_sequences:
            self.generate_backbone()
        else:
            _LOG.error(
                "Either specify the backbone alignment and tree and query "
                "sequences or only the query sequences.  Any other "
                "combination is invalid")
            exit(-1)

        # Read inside a context manager so the file handle is closed as
        # soon as the alignment has been loaded.
        sequences = MutableAlignment()
        with open(self.options.alignment_file.name) as backbone_handle:
            sequences.read_file_object(backbone_handle)
        backbone_size = sequences.get_num_taxa()
        if options().backbone_size is None:
            options().backbone_size = backbone_size
        assert options().backbone_size == backbone_size, (
            ("Backbone parameter needs to match actual size of backbone; "
             "backbone parameter:%s backbone_size:%s") %
            (options().backbone_size, backbone_size))
        if options().placement_size is None:
            options().placement_size = options().backbone_size
        if options().alignment_size is None:
            _LOG.info(
                "Alignment subset size not given.  Calculating subset size. ")
            alignment = MutableAlignment()
            with open(self.options.alignment_file.name) as alignment_handle:
                alignment.read_file_object(alignment_handle)
            if (options().molecule == 'amino'):
                _LOG.warning(
                    "Automated alignment subset selection not implemented "
                    "for protein alignment.  Setting to 10.")
                options().alignment_size = 10
            else:
                (averagep, maxp) = alignment.get_p_distance()
                align_size = 10
                if (averagep > .60):
                    while (align_size * 2 < alignment.get_num_taxa()):
                        align_size = align_size * 2
                # "%0.2f" formats to two decimals; "%f0.2" would print the
                # full float followed by the literal text "0.2".
                _LOG.info(
                    "Average p-distance of backbone is %0.2f.  Alignment "
                    "subset size set to %d. " % (averagep, align_size))
                options().alignment_size = align_size
        return ExhaustiveAlgorithm.check_options(self)
Beispiel #25
0
    def testReadOnlySubAlignment(self):
        """Exercise ReadonlySubalignment on a three-sequence slice.

        Builds a read-only view over keys 9..11 of the mock alignment and
        checks its size, keys, sequences, rejection of item assignment
        (TypeError expected), length, gap queries, writing to disk, and
        that ``delete_all_gap`` on a mutable copy leaves no all-gap column.
        """
        print("======= starting testReadOnlySubAlignment =========")
        alg = MutableAlignment()
        alg.read_filepath("data/mock/pyrg/sate.fasta")

        subset = list(alg.keys())[9:12]
        readonly_subalignment = ReadonlySubalignment(subset, alg)

        print("subalignment is:\n\n", readonly_subalignment)

        assert len(readonly_subalignment) == 3, len(readonly_subalignment)

        assert set(readonly_subalignment.keys()) == set(
            readonly_subalignment.get_sequence_names()) == set(subset), (
            "Subalignment keys not matching given keys %s vs %s" % (
                list(readonly_subalignment.keys()), subset))

        for (k, s) in list(readonly_subalignment.items()):
            assert k in subset, (
                "%s not found in subset but returned by subalignment" % k)
            # The message must use a valid conversion (%s): an invalid one
            # raises ValueError while the message is built and hides the
            # real assertion failure.
            assert s == alg[k], (
                "sequence associated with %s not matching parent alignment"
                % k)

        try:
            readonly_subalignment[2] = "ACGT"
            # Not reached when the assignment raises TypeError; an
            # AssertionError raised here is not caught below.
            assert False, "Readony alignment is successfully modified. "
        except TypeError:
            pass

        assert readonly_subalignment.get_length() == alg.get_length(), (
            "alignment length should not change")

        assert readonly_subalignment.is_aligned() == True

        assert readonly_subalignment.is_all_gap(2) == True, (
            "Site 2 should be all gaps")
        # Message names the column that is actually tested (150).
        assert readonly_subalignment.is_all_gap(150) == False, (
            "Site 150 should not be all gaps")

        readonly_subalignment.write_to_path("data/mock/pyrg/sate.sub.fasta")

        mutable_subalignment = readonly_subalignment.get_mutable_alignment()
        mutable_subalignment.delete_all_gap()

        assert all([
            not mutable_subalignment.is_all_gap(i)
            for i in range(0, mutable_subalignment.get_length())
        ])

        print("======= finishing testReadOnlySubAlignment =========")
Beispiel #26
0
    def generate_backbone(self):
        """Sample a backbone from the input and align it with SateAlignJob.

        Reads ``self.options.sequence_file``, picks ``backbone_size``
        sequences at random (default: 20% of the input, capped at 100) and
        writes them and the remaining (query) sequences to temp FASTA
        files.  After the SATe job has run, ``options()`` is updated with
        ``placement_size``, opened ``alignment_file`` / ``tree_file``
        handles under ``self.options.outdir``, and ``fragment_file`` set
        to the query temp file.
        """
        _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
        sequences = MutableAlignment()
        sequences.read_file_object(self.options.sequence_file)
        if (options().backbone_size is None):
            options().backbone_size = min(100,
                                          int(.20 * sequences.get_num_taxa()))
            _LOG.info("Backbone size set to: %d" % (options().backbone_size))
        # random.sample() needs a real sequence; a dict keys view is
        # rejected on Python 3.11+, so materialise the names first.
        backbone_sequences = sequences.get_hard_sub_alignment(
            random.sample(list(sequences.keys()),
                          options().backbone_size))
        # Whatever was not sampled into the backbone stays as the query set.
        for name in list(backbone_sequences.keys()):
            sequences.pop(name)

        _LOG.info("Writing query and backbone set. ")
        query = get_temp_file("query", "backbone", ".fas")
        backbone = get_temp_file("backbone", "backbone", ".fas")
        _write_fasta(sequences, query)
        _write_fasta(backbone_sequences, backbone)

        _LOG.info("Generating sate backbone alignment and tree. ")
        satealignJob = SateAlignJob()
        moleculeType = options().molecule
        if (options().molecule == 'amino'):
            # the align job is handed 'protein' where the option says 'amino'
            moleculeType = 'protein'
        satealignJob.setup(backbone,
                           options().backbone_size, self.options.outdir,
                           moleculeType,
                           options().cpu)
        satealignJob.run()
        satealignJob.read_results()

        options().placement_size = self.options.backbone_size
        # These handles are stored on the options object, so they are
        # deliberately left open here.
        options().alignment_file = open(self.options.outdir + "/sate.fasta")
        options().tree_file = open(self.options.outdir + "/sate.fasttree")
        _LOG.info(
            "Backbone alignment written to %s.\nBackbone tree written to %s" %
            (options().alignment_file, options().tree_file))
        options().fragment_file = query
Beispiel #27
0
def hmmer_to_markers(input, temp_dir):
    """Bin fragments by their best-scoring gene and orientation.

    Every fragment in ``input`` and its ``reverse_sequence`` (stored under
    ``name + '_rev'``) is searched against the HMM of each gene in
    ``refpkg["genes"]``.  Each fragment goes to the gene/direction with
    the highest bitscore; fragments without any hit are dropped.

    Args:
        input: path of the fragment file to read.
        temp_dir: directory for the intermediate and per-gene files.

    Returns:
        dict mapping gene -> {"file": path of the written
        ``<gene>.frags.fas.fixed`` file, "nfrags": number of fragments}.
    """
    global refpkg

    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = {name + '_rev': reverse_sequence(seq)
               for (name, seq) in fragments.items()}
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Best hit seen so far per fragment: [bitscore, gene, direction]
    frag_scores = {name: [-10000, 'NA', 'NA'] for name in fragments.keys()}

    for gene in refpkg["genes"]:
        # Now run HMMER search
        hmmer_output = temp_dir + '/' + gene + ".out"
        hmmer_search(frag_file, refpkg[gene]["hmm"], hmmer_output)
        results = read_hmmsearch_results(hmmer_output)

        # Now select best direction for each frag
        for name, value in results.items():
            bitscore = value[1]
            # Only a trailing '_rev' on a name that is not itself a
            # fragment marks a reverse hit; a substring test would
            # misread fragment names that merely contain '_rev'.
            if name not in frag_scores and name.endswith('_rev'):
                true_name = name[:-len('_rev')]
                direction = 'reverse'
            else:
                true_name = name
                direction = 'forward'
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Now bin the fragments
    genes = {}
    for name, val in frag_scores.items():
        gene_bin = genes.setdefault(val[1], {})
        if (val[2] == 'forward'):
            gene_bin[name] = fragments[name]
        else:
            gene_bin[name] = reverse_sequence(fragments[name])

    # 'NA' collects the fragments that never scored against any gene
    genes.pop("NA", None)

    binned_fragments = {}
    for gene, seq in genes.items():
        gene_file = temp_dir + '/' + gene + ".frags.fas.fixed"
        _write_fasta(seq, gene_file)
        binned_fragments[gene] = {"file": gene_file, "nfrags": len(seq)}

    return binned_fragments
Beispiel #28
0
    def generate_backbone(self):
        """Build a PASTA backbone alignment and tree from the input.

        Reads and degaps ``self.options.sequence_file``.  When
        ``median_full_length`` is set, sequences whose length lies outside
        ``median_full_length * (1 -/+ backbone_threshold)`` are set aside
        as fragments; a value of -1 means "use the median of the input
        sequence lengths".  A random sample of ``backbone_size`` of the
        remaining sequences is aligned with ``PastaAlignJob`` and the
        results are copied to the ``pasta.fasta`` / ``pasta.fasttree``
        output files, which are opened and stored on ``options()``.

        All sequences outside the backbone (fragments included) are
        written to a temp file that becomes ``options().fragment_file``.
        If none are left, the backbone alignment is output as the final
        result and the process exits with status 0.
        """
        _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
        sequences = MutableAlignment()
        sequences.read_file_object(self.options.sequence_file)
        sequences.degap()
        fragments = MutableAlignment()
        if (options().median_full_length is not None):
            if (options().median_full_length == -1):
                seq_lengths = sorted(
                    [len(seq) for seq in list(sequences.values())])
                lengths = len(seq_lengths)
                mid = lengths // 2
                if lengths % 2:
                    # odd count: the median is the single middle element
                    options().median_full_length = seq_lengths[mid]
                else:
                    # even count: average the two middle elements
                    options().median_full_length = (
                        seq_lengths[mid - 1] + seq_lengths[mid]) / 2.0

            (min_length, max_length) = (
                int(options().median_full_length * (
                    1 - options().backbone_threshold)),
                int(options().median_full_length * (
                    1 + options().backbone_threshold)))
            frag_names = [
                name for name in sequences
                if len(sequences[name]) > max_length or
                len(sequences[name]) < min_length]
            if (len(frag_names) > 0):
                _LOG.info(
                    "Detected %d fragmentary sequences" % len(frag_names))
                fragments = sequences.get_hard_sub_alignment(frag_names)
                for name in list(fragments.keys()):
                    sequences.pop(name)
        if (options().backbone_size is None):
            options().backbone_size = min(1000, int(sequences.get_num_taxa()))
            _LOG.info("Backbone size set to: %d" % (options().backbone_size))
        # Never ask for more backbone sequences than are available.
        available = len(list(sequences.keys()))
        if (options().backbone_size > available):
            options().backbone_size = available
        # Sampling from a sorted list makes the draw depend only on the
        # random state, not on the key iteration order.
        sample = sorted(random.sample(
            sorted(list(sequences.keys())), options().backbone_size))
        backbone_sequences = sequences.get_hard_sub_alignment(sample)
        _LOG.debug("Backbone: %s" % (sorted(list(backbone_sequences.keys()))))
        for name in list(backbone_sequences.keys()):
            sequences.pop(name)

        _LOG.info("Writing backbone set. ")
        backbone = get_temp_file("backbone", "backbone", ".fas")
        _write_fasta(backbone_sequences, backbone)

        _LOG.info("Generating pasta backbone alignment and tree. ")
        pastaalignJob = PastaAlignJob()
        moleculeType = options().molecule
        if (options().molecule == 'amino'):
            # the align job is handed 'protein' where the option says 'amino'
            moleculeType = 'protein'
        pastaalignJob.setup(backbone, options().backbone_size,
                            moleculeType, options().cpu)
        pastaalignJob.run()
        (a_file, t_file) = pastaalignJob.read_results()

        shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
        shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

        options().placement_size = self.options.backbone_size
        # These handles are stored on the options object, so they are
        # deliberately left open here.
        options().alignment_file = open(
            self.get_output_filename("pasta.fasta"))
        options().tree_file = open(self.get_output_filename("pasta.fasttree"))
        _LOG.info(
            "Backbone alignment written to %s.\nBackbone tree written to %s"
            % (options().alignment_file, options().tree_file))
        # The query set is everything outside the backbone, fragments
        # included.
        sequences.set_alignment(fragments)
        if (len(sequences) == 0):
            sequences = MutableAlignment()
            # Context manager so this extra read does not leak a handle.
            with open(self.options.alignment_file.name) as backbone_handle:
                sequences.read_file_object(backbone_handle)
            self.results = ExtendedAlignment(fragment_names=[])
            self.results.set_alignment(sequences)
            _LOG.info(
                "No query sequences to align.  Final alignment saved as %s"
                % self.get_output_filename("alignment.fasta"))
            self.output_results()
            sys.exit(0)
        else:
            query = get_temp_file("query", "backbone", ".fas")
            options().fragment_file = query
            _write_fasta(sequences, query)
Beispiel #29
0
def main():
    """Split the input sequences into fragment and full-length files.

    Sequences whose length is at most ``args.threshold`` are written to
    ``<output>.frag.fas``; all others go to ``<output>.full.fas``.
    """
    args = parse_args()
    sequences = MutableAlignment()
    input_is_readable = os.path.isfile(args.input) and os.access(
        args.input, os.R_OK)
    assert input_is_readable, "Input file %s does not exist\n" % args.input
    sequences.read_file_object(args.input)

    short_seqs = MutableAlignment()
    long_seqs = MutableAlignment()
    for (name, residues) in sequences.items():
        # the threshold itself still counts as a fragment
        bucket = short_seqs if len(residues) <= args.threshold else long_seqs
        bucket[name] = residues

    short_seqs.write_to_path("%s.frag.fas" % args.output)
    long_seqs.write_to_path("%s.full.fas" % args.output)
Beispiel #30
0
    def check_options(self):
        """Validate the input combination and fill in unset size options.

        Accepts either tree + alignment + sequence files (the sequence file
        then becomes the fragment file) or a sequence file alone (in which
        case ``generate_backbone`` is called).  Any other combination logs
        an error and exits with status -1.

        ``backbone_size`` defaults to the number of taxa in the alignment
        file and must match it; ``placement_size`` defaults to
        ``backbone_size``.  An unset ``alignment_size`` becomes 10 for
        'amino' input; otherwise it starts at 10 and, when the average
        p-distance exceeds .60, is doubled while the doubled value stays
        below the number of taxa.

        Returns:
            Whatever ``ExhaustiveAlgorithm.check_options`` returns.
        """
        options().info_file = "A_dummy_value"

        # Check to see if tree/alignment/fragment file provided, if not,
        # generate it from sequence file
        has_tree = options().tree_file is not None
        has_alignment = options().alignment_file is not None
        has_sequences = options().sequence_file is not None
        if has_tree and has_alignment and has_sequences:
            options().fragment_file = options().sequence_file
        elif not has_tree and not has_alignment and has_sequences:
            self.generate_backbone()
        else:
            _LOG.error(
                "Either specify the backbone alignment and tree and query "
                "sequences or only the query sequences.  Any other "
                "combination is invalid")
            exit(-1)

        # Read inside a context manager so the file handle is closed as
        # soon as the alignment has been loaded.
        sequences = MutableAlignment()
        with open(self.options.alignment_file.name) as backbone_handle:
            sequences.read_file_object(backbone_handle)
        backbone_size = sequences.get_num_taxa()
        if options().backbone_size is None:
            options().backbone_size = backbone_size
        assert options().backbone_size == backbone_size, (
            ("Backbone parameter needs to match actual size of backbone; "
             "backbone parameter:%s backbone_size:%s") %
            (options().backbone_size, backbone_size))
        if options().placement_size is None:
            options().placement_size = options().backbone_size
        if options().alignment_size is None:
            _LOG.info(
                "Alignment subset size not given.  Calculating subset size. ")
            alignment = MutableAlignment()
            with open(self.options.alignment_file.name) as alignment_handle:
                alignment.read_file_object(alignment_handle)
            if (options().molecule == 'amino'):
                _LOG.warning(
                    "Automated alignment subset selection not implemented "
                    "for protein alignment.  Setting to 10.")
                options().alignment_size = 10
            else:
                (averagep, maxp) = alignment.get_p_distance()
                align_size = 10
                if (averagep > .60):
                    while (align_size * 2 < alignment.get_num_taxa()):
                        align_size = align_size * 2
                # "%0.2f" formats to two decimals; "%f0.2" would print the
                # full float followed by the literal text "0.2".
                _LOG.info(
                    "Average p-distance of backbone is %0.2f.  Alignment "
                    "subset size set to %d. " % (averagep, align_size))
                options().alignment_size = align_size
        return ExhaustiveAlgorithm.check_options(self)
# Make sure to do this before the last line relabeling columns, since that's
# usually the line that errors.

# Backbone alignment: copy it to a local file, upper-casing only the FASTA
# header lines (those starting with ">") and leaving all other lines as-is.
original_backbone_file = (
    '/Users/gillianchu/warnow/bin/gitrepos/smirarab-sepp-17a33aa/trial/orig_backbone.txt'
)
new_backbone_file = "new_backbone_file.txt"

with open(original_backbone_file, "r") as reader, \
        open(new_backbone_file, "w+") as writer:
    for line in reader:
        writer.write(line.upper() if line.startswith(">") else line)

# The backbone is loaded from the rewritten copy, not from the original.
original_backbone = MutableAlignment()
done = original_backbone.read_filepath(new_backbone_file)

# All query sequences
original_frag_file = (
    '/Users/gillianchu/warnow/bin/gitrepos/smirarab-sepp-17a33aa/trial/all_query.txt'
)
original_frag = MutableAlignment()
done = original_frag.read_filepath(original_frag_file)

# First build extended alignment on entire fragment set
extendedAlignment = ExtendedAlignment(original_frag.get_sequence_names())
dir = '/Users/gillianchu/warnow/bin/gitrepos/smirarab-sepp-17a33aa/trial/'

for a in [1, 2]:
    a = str(a)
Beispiel #32
0
def blast_to_markers(input, temp_dir):
    """Bin fragments to marker genes and fix each fragment's orientation.

    With ``options().gene`` set, all fragments go to that gene.  Otherwise
    they are binned from BLAST results: ``options().blast_file`` when
    given, else a fresh BLAST run written to ``temp_dir``.  For every gene
    the fragments and their reverse sequences are scored with an HMMER
    search, and a fragment is replaced by its reverse when the reverse
    scores strictly higher.

    Args:
        input: path of the fragment file to read.
        temp_dir: directory for the intermediate and per-gene files.

    Returns:
        dict mapping gene -> {fragment name: oriented sequence}.
    """
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    if options().gene is not None:
        gene_binning = {options().gene: list(fragments.keys())}
    else:
        # First blast sequences against all markers
        if options().blast_file is None:
            blast_results = temp_dir + "/blast.out"
            print("Blasting fragments against marker dataset\n")
            blast_fragments(input, blast_results)
        else:
            blast_results = options().blast_file
        # Next bin the blast hits to the best gene
        gene_binning = bin_blast_results(blast_results)

    # Now figure out direction of fragments
    binned_fragments = {
        gene: {seq_name: fragments[seq_name]
               for seq_name in gene_binning[gene]}
        for gene in gene_binning
    }
    print("Finding best orientation of reads\n")
    align_name = 'pasta' if options().genes == 'cogs' else 'sate'

    for (gene, frags) in binned_fragments.items():
        # Add reverse complement sequence
        frags_rev = {name + '_rev': reverse_sequence(seq)
                     for (name, seq) in frags.items()}
        gene_frags = MutableAlignment()
        gene_frags.set_alignment(frags)
        gene_frags.set_alignment(frags_rev)
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(gene_frags, gene_file)

        # Now run HMMER search
        hmmer_output = temp_dir + "/%s.out" % gene
        profile_path = os.path.join(
            options().__getattribute__('reference').path,
            'refpkg/%s.refpkg/%s.hmm' % (gene, align_name))
        hmmer_search(gene_file, profile_path, hmmer_output)
        results = read_hmmsearch_results(hmmer_output)

        # Now select best direction for each frag; a missing hit counts
        # as a score of -10000.
        for key in frags:
            rev_key = key + "_rev"
            forward_score = results[key][1] if key in results else -10000
            backward_score = (
                results[rev_key][1] if rev_key in results else -10000)
            if backward_score > forward_score:
                frags[key] = gene_frags[rev_key]

        # Now write to file
        _write_fasta(frags, gene_file + ".fixed")
        binned_fragments[gene] = frags
    return binned_fragments
Beispiel #33
0
def hmmer_to_markers(input, temp_dir):
    """Bin fragments by their best-scoring gene and orientation.

    Every fragment in ``input`` and its ``reverse_sequence`` (stored under
    ``name + '_rev'``) is searched against the profile of each gene in
    ``marker_genes`` (or ``cog_genes`` when ``options().genes`` is
    'cogs').  Each fragment goes to the gene/direction with the highest
    bitscore; fragments without any hit are dropped.  Each bin is written
    to ``<temp_dir>/<gene>.frags.fas.fixed``.

    Args:
        input: path of the fragment file to read.
        temp_dir: directory for the intermediate and per-gene files.

    Returns:
        dict mapping gene -> {fragment name: oriented sequence}.
    """
    global marker_genes
    fragments = MutableAlignment()
    fragments.read_filepath(input)

    reverse = {name + '_rev': reverse_sequence(seq)
               for (name, seq) in fragments.items()}
    all_frags = MutableAlignment()
    all_frags.set_alignment(fragments)
    all_frags.set_alignment(reverse)
    frag_file = temp_dir + "/frags.fas"
    _write_fasta(all_frags, frag_file)

    # Best hit seen so far per fragment: [bitscore, gene, direction]
    frag_scores = {name: [-10000, 'NA', 'NA'] for name in fragments.keys()}
    gene_set = marker_genes
    align_name = 'sate'
    if (options().genes == 'cogs'):
        gene_set = cog_genes
        align_name = 'pasta'
    for gene in gene_set:
        # Now run HMMER search
        hmmer_output = temp_dir + "/%s.out" % gene
        # Both placeholders must be %s: a bare "%." followed by "profile"
        # is not a valid conversion and raises ValueError.
        hmmer_search(
            frag_file,
            os.path.join(options().__getattribute__('reference').path,
                         'refpkg/%s.refpkg/%s.profile' % (gene, align_name)),
            hmmer_output)
        results = read_hmmsearch_results(hmmer_output)

        # Now select best direction for each frag
        for name, value in results.items():
            bitscore = value[1]
            # Only a trailing '_rev' on a name that is not itself a
            # fragment marks a reverse hit; a substring test would
            # misread fragment names that merely contain '_rev'.
            if name not in frag_scores and name.endswith('_rev'):
                true_name = name[:-len('_rev')]
                direction = 'reverse'
            else:
                true_name = name
                direction = 'forward'
            if frag_scores[true_name][0] < bitscore:
                frag_scores[true_name] = [bitscore, gene, direction]

    # Now bin the fragments
    genes = {}
    for name, val in frag_scores.items():
        gene_bin = genes.setdefault(val[1], {})
        if (val[2] == 'forward'):
            gene_bin[name] = fragments[name]
        else:
            gene_bin[name] = reverse_sequence(fragments[name])
    # 'NA' collects the fragments that never scored against any gene
    genes.pop("NA", None)
    for gene, seq in genes.items():
        gene_file = temp_dir + "/%s.frags.fas" % gene
        _write_fasta(seq, gene_file + ".fixed")
    return genes
Beispiel #34
0
    def testExtendedAlignment(self):
        """Merge two extended alignments and check the merged result.

        The simulated backbone is split into two sub-problems (``subset``
        and its complement), each with its own fragments.  An
        ExtendedAlignment is built per sub-problem from previously
        produced ``*.extended.sto`` files, both are merged, and the test
        checks that the merged alignment is aligned, that its length and
        insertion-column count add up, and that its base alignment equals
        the original backbone.
        """
        print("======= starting testExtendedAlignment =========")

        subset = [
            "SFIF", "SFII", "SCFC", "SGHD", "SDCC", "SBGE", "SFBB", "SDI",
            "SCGB", "SJGF", "SGBI", "SCJA", "SGAD", "SHEB", "SFHB", "SDJI",
            "SHED", "SJJJ", "SBBE", "SCCH", "SDJB", "SDAC", "SHEH", "SFDC",
            "SFEI", "SHHB", "SC", "SIAB", "SDDI", "SBCB", "SJB", "SEBD",
            "SFGD", "SHA", "SIDA", "SGHI", "SGIB", "SBFJ", "SFIE", "SCJF",
            "SJHJ", "SJBG", "SEJI", "SFFF", "SJ", "SIII", "SJHH", "SEIH",
            "SBDC", "SHDJ", "SJDD", "SGDB", "SIHA", "SIBB", "SECC", "SCAD",
            "SGBB", "SGIF", "SJHC", "SFCD", "SEAA", "SEFF", "SDFG", "SDJE",
            "SCFG", "SFH", "SCJ", "SDDD", "SEGD", "SCIH", "SDAG", "SCJE",
            "SFAJ", "SIDJ", "SE", "SHBC", "SJFF", "SCHD", "SBHA", "SEDF",
            "SFAF", "SEDD", "SDHD", "SGJD", "SIBH", "SGDF", "SIFA", "SJGA",
            "SIJB", "SFI", "SGA", "SBFC", "SBJA", "SFFC", "SFDH", "SFEE",
            "SBDF", "SGBJ", "SDHE", "SJIB", "SHHI", "SIDE", "SJII"
        ]

        alg = MutableAlignment()
        alg.read_filepath("data/simulated/test.fasta")
        alg.delete_all_gap()
        tlen = alg.get_length()

        frg = MutableAlignment()
        frg.read_filepath("data/simulated/test.fas")

        pp = SeppProblem(alg.keys())
        pp.fragments = frg
        pp.subalignment = alg

        cp1 = SeppProblem(subset, pp)
        cp2 = SeppProblem(list(set(alg.keys()) - set(subset)), pp)
        # Fragments are assigned by the last digit of their name.
        cp1.fragments = ReadonlySubalignment(
            [k for k in frg.keys() if int(k[-1]) >= 9], frg)
        cp2.fragments = ReadonlySubalignment(
            [k for k in frg.keys() if int(k[-1]) <= 1], frg)

        cp1labels = cp1.write_subalignment_without_allgap_columns(
            "data/tmp/cp1.fasta")
        cp2labels = cp2.write_subalignment_without_allgap_columns(
            "data/tmp/cp2.fasta")
        # range() (not the Python-2-only xrange()) so the test runs on
        # both Python 2 and 3.
        tmp = MutableAlignment().read_filepath("data/tmp/cp1.fasta")
        assert all(
            [not tmp.is_all_gap(pos) for pos in range(0, tmp.get_length())])
        tmp = MutableAlignment().read_filepath("data/tmp/cp2.fasta")
        assert all(
            [not tmp.is_all_gap(pos) for pos in range(0, tmp.get_length())])

        cp1.fragments.write_to_path("data/tmp/cp1.frags.fas")
        cp2.fragments.write_to_path("data/tmp/cp2.frags.fas")
        # We have done the hmmalign before. don't worry about that right now

        ext1 = ExtendedAlignment(cp1.fragments)
        ext1.build_extended_alignment("data/tmp/cp1.fasta",
                                      "data/tmp/cp1.extended.sto")
        ext1.relabel_original_columns(cp1labels)
        ext2 = ExtendedAlignment(cp2.fragments)
        ext2.build_extended_alignment("data/tmp/cp2.fasta",
                                      "data/tmp/cp2.extended.sto")
        ext2.relabel_original_columns(cp2labels)

        extmerger = ExtendedAlignment([])
        extmerger.merge_in(ext1)
        mixed = extmerger.merge_in(ext2)

        extmerger.write_to_path("data/tmp/extended.merged.fasta")

        assert extmerger.is_aligned(), "Merged alignment is not aligned"
        # Negative column labels are counted as insertion columns.
        in1 = len([x for x in ext1._col_labels if x < 0])
        in2 = len([x for x in ext2._col_labels if x < 0])
        print("Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d" % (
            extmerger.get_length(), in1, in2, tlen))
        assert (in1 + in2 + tlen - mixed) == extmerger.get_length(), (
            "Lengths don't match up after merging. Merged:%d. "
            "Insertion1:%d Insertion2:%d BaseLen:%d Mixed-insertion: %d" % (
                extmerger.get_length(), in1, in2, tlen, mixed))
        # The message reports in1 and in2, one value per placeholder.
        assert (in1 + in2 - mixed) == len(
            list(extmerger.iter_insertion_columns())), (
            "Columns are not correctly labeled after merging. Merged "
            "insertion count:%d. Insertion1:%d Insertion2:%d "
            "Mixed-insertion: %d" % (
                len(list(extmerger.iter_insertion_columns())),
                in1, in2, mixed))

        tmp = extmerger.get_base_readonly_alignment().get_mutable_alignment()
        tmp.delete_all_gap()
        assert tmp.is_aligned(), "merged alignment should be aligned!"
        assert tmp.get_length() == tlen, "merged alignment has wrong length"
        assert all([alg[k] == s for (k, s) in tmp.items()
                    ]), "merged alignment should match original alignment"

        print("======= finished testExtendedAlignment =========")
Beispiel #35
0
from sepp.alignment import MutableAlignment, ExtendedAlignment,_write_fasta
from sepp.exhaustive import JoinAlignJobs, ExhaustiveAlgorithm
from sepp.jobs import PplacerJob,MafftAlignJob,FastTreeJob,PastaAlignJob
from sepp.filemgr import get_temp_file
from sepp.config import options
import sepp.config
from sepp.math_utils import lcm
from sepp.problem import SeppProblem
from sepp.scheduler import JobPool
from multiprocessing import Pool, Manager
from sepp.alignment import ExtendedAlignment
import glob

job_joiner = JoinAlignJobs

# Backbone alignment
original_backbone_file = '/projects/sate8/namphuon/ultra_large/1000000/sate.fasta'
original_backbone = MutableAlignment()
done = original_backbone.read_filepath(original_backbone_file)

# Query fragments
original_frag_file = '/projects/sate8/namphuon/ultra_large/1000000/initial.fas.100'
original_frag = MutableAlignment()
done = original_frag.read_filepath(original_frag_file)

# First build extended alignment on entire fragment set
extendedAlignment = ExtendedAlignment(original_frag.get_sequence_names())

# Matching A_0_* directories are visited in the reverse of the order glob
# returns them.
dirs = glob.glob('/projects/sate8/namphuon/ultra_large/1000000/upp_100_10_new/temp/upp.1_HNlM/root/P_0/A_0_*/')[::-1]
for dir in dirs:
  # Call form prints the same text on Python 2 and is valid on Python 3.
  print("Working on %s\n" % dir)
  aligned_files = glob.glob('%sFC_*/hmmalign.results.*' % dir)
Beispiel #36
0
 def generate_backbone(self):
     """Split the input sequences into backbone and query sets and align
     the backbone with PASTA.

     Reads all sequences from ``self.options.sequence_file``.  When
     ``options().median_full_length`` is set, sequences whose length falls
     outside ``median * (1 -/+ backbone_threshold)`` are set aside as
     fragments; a value of ``-1`` means the median is computed from the
     input itself.  A random sample of ``options().backbone_size``
     remaining sequences (defaulting to at most 1000) becomes the backbone,
     which is written to a temp file and handed to a ``PastaAlignJob``.

     Side effects on ``options()``: ``median_full_length``,
     ``backbone_size``, ``placement_size``, ``alignment_file``,
     ``tree_file`` and ``fragment_file`` may be assigned.  The alignment
     and tree options are set to open file objects.

     If no query sequences remain, the PASTA alignment is copied to the
     final output name and the process exits with status 0.
     """
     _LOG.info("Reading input sequences: %s" %(self.options.sequence_file))
     sequences = MutableAlignment()
     sequences.read_file_object(self.options.sequence_file)
     fragments = MutableAlignment()
     if options().median_full_length is not None:
         if options().median_full_length == -1:
             # -1 requests the median of the observed sequence lengths.
             seq_lengths = sorted(len(seq) for seq in sequences.values())
             lengths = len(seq_lengths)
             # Floor division keeps the index an int on Python 2 and 3.
             mid = lengths // 2
             if lengths % 2:
                 # Odd count: the single middle element is the median.
                 options().median_full_length = seq_lengths[mid]
             else:
                 # Even count: average the two middle elements.
                 options().median_full_length = (
                     seq_lengths[mid - 1] + seq_lengths[mid]) / 2.0

         min_length = int(options().median_full_length
                          * (1 - options().backbone_threshold))
         max_length = int(options().median_full_length
                          * (1 + options().backbone_threshold))
         frag_names = [name for name in sequences
                       if len(sequences[name]) > max_length
                       or len(sequences[name]) < min_length]
         if frag_names:
             fragments = sequences.get_hard_sub_alignment(frag_names)
             # Fragments must not be eligible for the backbone sample.
             for name in list(fragments.keys()):
                 sequences.pop(name)
     if options().backbone_size is None:
         options().backbone_size = min(1000, int(sequences.get_num_taxa()))
         _LOG.info("Backbone size set to: %d" %(options().backbone_size))
     if options().backbone_size > len(sequences.keys()):
         # Cannot sample more sequences than are available.
         options().backbone_size = len(sequences.keys())
     # random.sample needs a real sequence; a dict keys view is not one on
     # Python 3.
     backbone_sequences = sequences.get_hard_sub_alignment(
         random.sample(list(sequences.keys()), options().backbone_size))
     # Whatever is left in `sequences` afterwards is the full-length part of
     # the query set.
     for name in list(backbone_sequences.keys()):
         sequences.pop(name)

     _LOG.info("Writing backbone set. ")
     backbone = get_temp_file("backbone", "backbone", ".fas")
     _write_fasta(backbone_sequences, backbone)

     _LOG.info("Generating pasta backbone alignment and tree. ")
     pastaalignJob = PastaAlignJob()
     moleculeType = options().molecule
     if options().molecule == 'amino':
         # The job is given 'protein' where the option value is 'amino'.
         moleculeType = 'protein'
     pastaalignJob.setup(backbone, options().backbone_size,
                         self.options.outdir, moleculeType, options().cpu)
     pastaalignJob.run()
     pastaalignJob.read_results()

     options().placement_size = self.options.backbone_size
     # These handles are stored on the options object and left open on
     # purpose; they are not closed here.
     options().alignment_file = open(self.options.outdir + "/pasta.fasta")
     options().tree_file = open(self.options.outdir + "/pasta.fasttree")
     _LOG.info("Backbone alignment written to %s.\nBackbone tree written to %s" % (options().alignment_file, options().tree_file))
     # Put the fragments set aside earlier into the query set.
     sequences.set_alignment(fragments)
     if len(sequences) == 0:
         _LOG.info("No query sequences to align.  Final alignment saved as %s" % self.get_output_filename("alignment.fasta"))
         shutil.copyfile(self.options.outdir + "/pasta.fasta", self.get_output_filename("alignment.fasta"))
         sys.exit(0)
     else:
         query = get_temp_file("query", "backbone", ".fas")
         options().fragment_file = query
         _write_fasta(sequences, query)
Beispiel #37
0
    def generate_backbone(self):
        """Split the input sequences into backbone and query sets and align
        the backbone with PASTA.

        Reads and degaps all sequences from ``self.options.sequence_file``.
        When ``options().median_full_length`` or
        ``options().full_length_range`` is set, sequences outside the
        full-length range are set aside as fragments.  The range is either
        the two whitespace-separated integers in ``full_length_range`` or
        ``median * (1 -/+ backbone_threshold)``; a median of ``-1`` means it
        is computed from the input itself.  A random sample of
        ``options().backbone_size`` remaining sequences (defaulting to at
        most 1000) becomes the backbone and is handed to a ``PastaAlignJob``,
        whose alignment and tree are copied to the ``pasta.fasta`` and
        ``pasta.fasttree`` output files.

        Side effects on ``options()``: ``median_full_length``,
        ``backbone_size``, ``placement_size``, ``alignment_file``,
        ``tree_file`` and ``fragment_file`` may be assigned.  The alignment
        and tree options are set to open file objects.

        If no query sequences remain, the backbone alignment is stored in
        ``self.results``, written via ``self.output_results()``, and the
        process exits with status 0.
        """
        _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
        sequences = MutableAlignment()
        sequences.read_file_object(self.options.sequence_file)
        sequences.degap()
        fragments = MutableAlignment()
        if (options().median_full_length is not None
                or options().full_length_range is not None):
            if (options().median_full_length == -1):
                # -1 requests the median of the observed sequence lengths.
                seq_lengths = sorted(len(seq) for seq in sequences.values())
                lengths = len(seq_lengths)
                mid = lengths // 2
                if lengths % 2:
                    # Odd count: the single middle element is the median.
                    # (Indexing mid + 1 here raised IndexError for a lone
                    # sequence and averaged the wrong pair otherwise.)
                    options().median_full_length = seq_lengths[mid]
                else:
                    # Even count: average the two middle elements.
                    options().median_full_length = (
                        seq_lengths[mid - 1] + seq_lengths[mid]) / 2.0
            if options().full_length_range is not None:
                # An explicit range takes precedence over the median.
                bounds = sorted(
                    int(x) for x in options().full_length_range.split())
                min_length = bounds[0]
                max_length = bounds[1]
            else:
                min_length = int(options().median_full_length *
                                 (1 - options().backbone_threshold))
                max_length = int(options().median_full_length *
                                 (1 + options().backbone_threshold))
            _LOG.info(
                "Full length sequences are set to be from %d to %d character long"
                % (min_length, max_length))
            frag_names = [
                name for name in sequences if len(sequences[name]) > max_length
                or len(sequences[name]) < min_length
            ]
            if frag_names:
                _LOG.info("Detected %d fragmentary sequences" %
                          len(frag_names))
                fragments = sequences.get_hard_sub_alignment(frag_names)
                # Fragments must not be eligible for the backbone sample.
                for name in list(fragments.keys()):
                    sequences.pop(name)
        if (options().backbone_size is None):
            options().backbone_size = min(1000, int(sequences.get_num_taxa()))
            _LOG.info("Backbone size set to: %d" % (options().backbone_size))
        if (options().backbone_size > len(list(sequences.keys()))):
            # Cannot sample more sequences than are available.
            options().backbone_size = len(list(sequences.keys()))
        # Sampling from a sorted list makes the draw depend only on the RNG
        # state, not on the container's iteration order.
        sample = sorted(
            random.sample(sorted(sequences.keys()),
                          options().backbone_size))
        backbone_sequences = sequences.get_hard_sub_alignment(sample)
        _LOG.debug("Backbone: %s" % (sorted(list(backbone_sequences.keys()))))
        # Whatever is left in `sequences` afterwards is the full-length part
        # of the query set.
        for name in list(backbone_sequences.keys()):
            sequences.pop(name)

        _LOG.info("Writing backbone set. ")
        backbone = get_temp_file("backbone", "backbone", ".fas")
        _write_fasta(backbone_sequences, backbone)

        _LOG.info("Generating pasta backbone alignment and tree. ")
        pastaalignJob = PastaAlignJob()
        moleculeType = options().molecule
        if (options().molecule == 'amino'):
            # The job is given 'protein' where the option value is 'amino'.
            moleculeType = 'protein'
        pastaalignJob.setup(backbone,
                            options().backbone_size, moleculeType,
                            options().cpu, **vars(options().pasta))
        pastaalignJob.run()
        (a_file, t_file) = pastaalignJob.read_results()

        shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
        shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

        options().placement_size = self.options.backbone_size
        # These handles are stored on the options object and left open on
        # purpose; they are not closed here.
        options().alignment_file = open(
            self.get_output_filename("pasta.fasta"))
        options().tree_file = open(self.get_output_filename("pasta.fasttree"))
        _LOG.info(
            "Backbone alignment written to %s.\nBackbone tree written to %s" %
            (options().alignment_file, options().tree_file))
        # Put the fragments set aside earlier into the query set.
        sequences.set_alignment(fragments)
        if (len(sequences) == 0):
            # Nothing to place: the backbone alignment is the final result.
            sequences = MutableAlignment()
            sequences.read_file_object(open(self.options.alignment_file.name))
            self.results = ExtendedAlignment(fragment_names=[])
            self.results.set_alignment(sequences)
            _LOG.info(
                "No query sequences to align.  Final alignment saved as %s" %
                self.get_output_filename("alignment.fasta"))
            self.output_results()
            sys.exit(0)
        else:
            query = get_temp_file("query", "backbone", ".fas")
            options().fragment_file = query
            _write_fasta(sequences, query)
    def testExtendedAlignment(self):
        """Build two extended alignments over disjoint subsets and merge them.

        The backbone alignment is split into ``subset`` and its complement,
        each with its own group of fragments.  Each half is written without
        all-gap columns, wrapped in an ``ExtendedAlignment`` built from a
        pre-computed ``.sto`` file, and relabelled to the original column
        indices.  After merging both into an empty ``ExtendedAlignment`` the
        test checks that:

        * the merged alignment is aligned,
        * its length equals base length + insertions of both halves minus
          the value returned by the second ``merge_in`` call,
        * the number of insertion columns is consistent with that count, and
        * the base (non-fragment) part, with all-gap columns removed,
          matches the original backbone alignment.

        Requires the fixture files under ``data/simulated`` and ``data/tmp``
        relative to the working directory.
        """
        print("======= starting testExtendedAlignment =========")

        subset = ["SFIF","SFII","SCFC","SGHD","SDCC","SBGE","SFBB","SDI","SCGB","SJGF","SGBI","SCJA","SGAD","SHEB","SFHB","SDJI","SHED","SJJJ","SBBE","SCCH","SDJB","SDAC","SHEH","SFDC","SFEI","SHHB","SC","SIAB","SDDI","SBCB","SJB","SEBD","SFGD","SHA","SIDA","SGHI","SGIB","SBFJ","SFIE","SCJF","SJHJ","SJBG","SEJI","SFFF","SJ","SIII","SJHH","SEIH","SBDC","SHDJ","SJDD","SGDB","SIHA","SIBB","SECC","SCAD","SGBB","SGIF","SJHC","SFCD","SEAA","SEFF","SDFG","SDJE","SCFG","SFH","SCJ","SDDD","SEGD","SCIH","SDAG","SCJE","SFAJ","SIDJ","SE","SHBC","SJFF","SCHD","SBHA","SEDF","SFAF","SEDD","SDHD","SGJD","SIBH","SGDF","SIFA","SJGA","SIJB","SFI","SGA","SBFC","SBJA","SFFC","SFDH","SFEE","SBDF","SGBJ","SDHE","SJIB","SHHI","SIDE","SJII"]

        alg = MutableAlignment()
        alg.read_filepath("data/simulated/test.fasta")
        alg.delete_all_gap()
        # Reference length of the backbone once all-gap columns are removed.
        tlen = alg.get_length()

        frg = MutableAlignment()
        frg.read_filepath("data/simulated/test.fas")

        pp = SeppProblem(alg.keys())
        pp.fragments = frg
        pp.subalignment = alg

        cp1 = SeppProblem(subset, pp)
        cp2 = SeppProblem(list(set(alg.keys()) - set(subset)), pp)
        # Fragments are assigned to a child by the last character of their
        # name, read as a digit: >= 9 goes to cp1, <= 1 goes to cp2.
        cp1.fragments = ReadonlySubalignment(
            [k for k in frg.keys() if int(k[-1]) >= 9], frg)
        cp2.fragments = ReadonlySubalignment(
            [k for k in frg.keys() if int(k[-1]) <= 1], frg)

        cp1labels = cp1.write_subalignment_without_allgap_columns("data/tmp/cp1.fasta")
        cp2labels = cp2.write_subalignment_without_allgap_columns("data/tmp/cp2.fasta")
        tmp = MutableAlignment().read_filepath("data/tmp/cp1.fasta")
        assert all([not tmp.is_all_gap(pos)
                    for pos in range(0, tmp.get_length())])
        tmp = MutableAlignment().read_filepath("data/tmp/cp2.fasta")
        assert all([not tmp.is_all_gap(pos)
                    for pos in range(0, tmp.get_length())])

        cp1.fragments.write_to_path("data/tmp/cp1.frags.fas")
        cp2.fragments.write_to_path("data/tmp/cp2.frags.fas")

        # We have done the hmmalign before; don't worry about that right now.

        ext1 = ExtendedAlignment(cp1.fragments)
        ext1.build_extended_alignment("data/tmp/cp1.fasta", "data/tmp/cp1.extended.sto")
        ext1.relabel_original_columns(cp1labels)
        ext2 = ExtendedAlignment(cp2.fragments)
        ext2.build_extended_alignment("data/tmp/cp2.fasta", "data/tmp/cp2.extended.sto")
        ext2.relabel_original_columns(cp2labels)

        extmerger = ExtendedAlignment([])
        extmerger.merge_in(ext1)
        # The second merge's return value is subtracted in the length and
        # insertion-count checks below.
        mixed = extmerger.merge_in(ext2)

        extmerger.write_to_path("data/tmp/extended.merged.fasta")

        assert extmerger.is_aligned(), "Merged alignment is not aligned"
        # Negative column labels are counted as insertion columns.
        in1 = len([x for x in ext1._col_labels if x < 0])
        in2 = len([x for x in ext2._col_labels if x < 0])
        print("Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d" % (
            extmerger.get_length(), in1, in2, tlen))
        assert (in1 + in2 + tlen - mixed) == extmerger.get_length(), "Lengths don't match up after merging. Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d Mixed-insertion: %d" % (
            extmerger.get_length(), in1, in2, tlen, mixed)
        merged_insertions = len(list(extmerger.iter_insertion_columns()))
        # The failure message previously reported in1 twice; it now shows in2.
        assert (in1 + in2 - mixed) == merged_insertions, "Columns are not correctly labeled after merging. Merged insertion count:%d. Insertion1:%d Insertion2:%d Mixed-insertion: %d" % (
            merged_insertions, in1, in2, mixed)

        tmp = extmerger.get_base_readonly_alignment().get_mutable_alignment()
        tmp.delete_all_gap()
        assert tmp.is_aligned(), "merged alignment should be aligned!"
        assert tmp.get_length() == tlen, "merged alignment has wrong length"
        assert all([alg[k] == s for (k, s) in tmp.items()
                    ]), "merged alignment should match original alignment"

        print("======= finished testExtendedAlignment =========")