Example #1
0
def backtranslate(faa, fna):
    """Back-translate an amino-acid alignment onto nucleotide sequences.

    For every sequence name present in both alignments, each aligned
    amino-acid residue is replaced by the corresponding codon from the
    (unaligned) nucleotide sequence; gap residues become ``---``.

    Parameters:
        faa: aligned amino-acid alignment (ExtendedAlignment-like mapping).
        fna: mapping of sequence name -> unaligned nucleotide sequence.

    Returns:
        A new ExtendedAlignment of codon-aligned nucleotide sequences whose
        column labels are the amino-acid column labels, each repeated 3x.

    Raises:
        ValueError: when a codon does not translate to the aligned residue.
    """
    newfna = ExtendedAlignment(faa.fragments)
    for name, nuc_seq in fna.items():
        if name not in faa:
            continue
        aa = faa[name].upper()
        codons = []
        pos = 0
        for residue in aa:
            codon = nuc_seq[pos:pos + 3]
            if residue == '-':
                codons.append('---')
            elif is_compatible(codon, residue):
                codons.append(codon)
                pos += 3
            elif pos == 0 and codon in ('GTG', 'TTG'):
                # Alternative start codons are accepted at the first position
                # even though they do not translate to the aligned residue.
                codons.append(codon)
                pos += 3
            else:
                raise ValueError('%s at position %d of %s '
                                 'does not translate to %s' %
                                 (codon, pos, name, residue))
        newfna[name] = ''.join(codons)
    # Each amino-acid column expands to three nucleotide columns.  Build the
    # tripled label list in one pass instead of repeatedly concatenating
    # (the old loop was O(n^2) in the number of columns).
    tripled = [lab for lab in faa.col_labels for _ in range(3)]
    newfna._col_labels = newfna._col_labels + tripled
    return newfna
Example #2
0
 def merge_results(self):
     """Merge per-chunk placement results into a single extended alignment
     (stored in self.results) and run the JSON merge job over the per-subset
     placement outputs."""
     assert isinstance(self.root_problem, SeppProblem)

     '''Generate single extended alignment'''
     fullExtendedAlignment = ExtendedAlignment(
         self.root_problem.fragments.keys())
     for pp in self.root_problem.get_children():
         for i in range(0, self.root_problem.fragment_chunks):
             # Use a context manager so the pickle file is closed even if
             # pickle.load raises.
             with open(pp.jobs[get_placement_job_name(i)]
                       .full_extended_alignment_file, 'rb') as align_input:
                 extended_alignment = pickle.load(align_input)
             fullExtendedAlignment.merge_in(
                 extended_alignment, convert_to_string=True)
     self.results = fullExtendedAlignment

     mergeinput = []
     '''Append main tree to merge input'''
     mergeinput.append("%s;" % (
         self.root_problem.subtree.compose_newick(labels=True)))
     for pp in self.root_problem.get_children():
         assert isinstance(pp, SeppProblem)
         for i in range(0, self.root_problem.fragment_chunks):
             if pp.get_job_result_by_name(
                     get_placement_job_name(i)) is None:
                 continue
             '''Append subset trees and json locations to merge input'''
             mergeinput.append("%s;\n%s" % (
                 pp.subtree.compose_newick(labels=True),
                 pp.get_job_result_by_name(get_placement_job_name(i))))
     # Two empty entries terminate the merge input (two trailing newlines);
     # presumably required by the merge tool's input format — TODO confirm.
     mergeinput.append("")
     mergeinput.append("")
     merge_input_string = "\n".join(mergeinput)
     mergeJsonJob = self.get_merge_job(merge_input_string)
     mergeJsonJob.run()
 def merge_subalignments(self):
     '''
     Merge alignment subset extended alignments to get one extended alignment
     for current placement subset.
     '''
     pp = self.placement_problem
     _LOG.info("Merging sub-alignments for placement problem : %s." %(pp.label))
     ''' First assign fragments to the placement problem'''
     # Start from an empty soft sub-alignment, then claim every fragment
     # name assigned to any child alignment subset.
     pp.fragments = pp.parent.fragments.get_soft_sub_alignment([])
     for ap in pp.get_children():
         pp.fragments.seq_names |= set(ap.fragments)
     ''' Then Build an extended alignment by merging all hmmalign results'''
     extendedAlignment = ExtendedAlignment(pp.fragments.seq_names)
     for ap in pp.children:
         assert isinstance(ap, SeppProblem)
         ''' Get all fragment chunk alignments for this alignment subset'''
         # Only chunks that produced an hmmalign result are collected;
         # chunks with no result (e.g. no fragments assigned) are skipped.
         aligned_files = [fp.get_job_result_by_name('hmmalign') for
                             fp in ap.children if
                             fp.get_job_result_by_name('hmmalign') is not None]
         _LOG.info("Merging fragment chunks for subalignment : %s." %(ap.label))
         # Re-read this subset's backbone alignment and the fragment chunk
         # alignments, relabeling columns with the original labels.
         ap_alg = ap.read_extendend_alignment_and_relabel_columns\
                     (ap.jobs["hmmbuild"].infile , aligned_files)
         _LOG.info("Merging alignment subset into placement subset: %s." %(ap.label))

         extendedAlignment.merge_in(ap_alg,convert_to_string=False)
         # Free the per-subset alignment promptly; these can be large.
         del ap_alg
     extendedAlignment.from_bytearray_to_string()
     return extendedAlignment
Example #4
0
    def read_extendend_alignment_and_relabel_columns(
            self, orig_path, extension_path, convert_to_string=True):
        '''
        Counterpart of write_subalignment_without_allgap_columns: re-reads an
        alignment that was previously written to disk and restores its
        original column labels.  extension_path may be one .sto file path or
        a list of paths; alignments from those files are read as well and
        merged with the original (base) alignment.
        '''
        orig_columns = self.annotations["ref.alignment.columns"]
        assert orig_columns is not None and len(orig_columns) != 0, (
            "Subproblem needs to have a proper list of alignment columns "
            "associated with it")

        _LOG.debug(
            "Reading %s %s and relabeling it based on %d orig column labels." %
            (orig_path, extension_path, len(orig_columns)))

        extended = ExtendedAlignment(list(self.fragments.keys()))
        extended.build_extended_alignment(
            orig_path, extension_path, convert_to_string)
        extended.relabel_original_columns(orig_columns)
        return extended
Example #5
0
    def merge_subalignments(self):
        '''
        Combine the extended alignments of all alignment subsets into one
        extended alignment for the current placement subset.
        '''
        placement = self.placement_problem
        _LOG.info("Merging sub-alignments for placement problem : %s." %
                  (placement.label))
        # First claim every fragment assigned to any child alignment subset.
        placement.fragments = \
            placement.parent.fragments.get_soft_sub_alignment([])
        for subset in placement.get_children():
            placement.fragments.seq_names |= set(subset.fragments)
        # Then fold every hmmalign result into one extended alignment.
        merged = ExtendedAlignment(placement.fragments.seq_names)
        for subset in placement.children:
            assert isinstance(subset, SeppProblem)
            # Gather all fragment-chunk alignments produced for this subset.
            chunk_files = []
            for frag_chunk in subset.children:
                result = frag_chunk.get_job_result_by_name('hmmalign')
                if result is not None:
                    chunk_files.append(result)
            _LOG.info("Merging fragment chunks for subalignment : %s." %
                      (subset.label))
            subset_alignment = \
                subset.read_extendend_alignment_and_relabel_columns(
                    subset.jobs["hmmbuild"].infile, chunk_files)
            _LOG.info("Merging alignment subset into placement subset: %s." %
                      (subset.label))

            merged.merge_in(subset_alignment, convert_to_string=False)
            del subset_alignment
        merged.from_bytearray_to_string()
        return merged
Example #6
0
    def read_extendend_alignment_and_relabel_columns(
            self, orig_path, extension_path, convert_to_string=True):
        """
        Companion to write_subalignment_without_allgap_columns: reads back an
        alignment previously written to disk and relabels its columns with
        the original labels.  extension_path is a single .sto path or a list
        of such paths; alignments from those files are read too and merged
        with the original (base) alignment.
        """
        ref_cols = self.annotations["ref.alignment.columns"]
        assert ref_cols is not None and len(ref_cols) != 0, (
            "Subproblem needs to have a proper list of alignment columns "
            "associated with it")

        _LOG.debug(
            "Reading %s %s and relabeling it based on %d orig column labels." %
            (orig_path, extension_path, len(ref_cols)))

        rebuilt = ExtendedAlignment(list(self.fragments.keys()))
        rebuilt.build_extended_alignment(
            orig_path, extension_path, convert_to_string)
        rebuilt.relabel_original_columns(ref_cols)
        return rebuilt
Example #7
0
    def merge_results(self):
        """Merge per-chunk placement results into one extended alignment
        (stored in self.results) and run the JSON merge job over the
        per-subset placement outputs."""
        assert isinstance(self.root_problem, SeppProblem)

        '''Generate single extended alignment'''
        fullExtendedAlignment = ExtendedAlignment(
            self.root_problem.fragments.keys())
        for pp in self.root_problem.get_children():
            for i in range(0, self.root_problem.fragment_chunks):
                # Context manager guarantees the pickle file is closed even
                # if pickle.load raises.
                with open(
                        pp.jobs[get_placement_job_name(i)]
                        .full_extended_alignment_file, 'rb') as align_input:
                    extended_alignment = pickle.load(align_input)
                fullExtendedAlignment.merge_in(
                    extended_alignment, convert_to_string=True)
        self.results = fullExtendedAlignment

        mergeinput = []
        '''Append main tree to merge input'''
        mergeinput.append("%s;" % (
            self.root_problem.subtree.compose_newick(labels=True)))
        for pp in self.root_problem.get_children():
            assert isinstance(pp, SeppProblem)
            for i in range(0, self.root_problem.fragment_chunks):
                if (pp.get_job_result_by_name(
                       get_placement_job_name(i)) is None):
                    continue
                '''Append subset trees and json locations to merge input'''
                mergeinput.append(
                    "%s;\n%s" % (
                        pp.subtree.compose_newick(labels=True),
                        pp.get_job_result_by_name(get_placement_job_name(i))))
        # Two empty entries terminate the merge input (two trailing
        # newlines); presumably required by the merge tool — TODO confirm.
        mergeinput.append("")
        mergeinput.append("")
        merge_input_string = "\n".join(mergeinput)
        mergeJsonJob = self.get_merge_job(merge_input_string)
        mergeJsonJob.run()
Example #8
0
 def merge_subalignments(self):
     '''
     Merge alignment subset extended alignments to get one extended
     alignment for current placement subset.
     '''
     pp = self.placement_problem
     _LOG.info("Merging sub-alignments for placement problem : %s." %
               (pp.label))
     ''' First find fragments assigned to this placement problem'''
     # Start from an empty soft sub-alignment, then claim every fragment
     # name assigned to any child alignment subset.
     pp.fragments = pp.parent.fragments.get_soft_sub_alignment([])
     for ap in pp.get_children():
         pp.fragments.seq_names |= set(ap.fragments)
     ''' Then, gather a list of all alignments relevant to this placement
     subset'''
     # fragfilesperap maps each alignment subset to its per-chunk hmmalign
     # result files, in child order.  Entries may be None for chunks that
     # produced no result; the per-chunk loop below handles that case.
     fragfilesperap = dict()
     for ap in pp.children:
         assert isinstance(ap, SeppProblem)
         ''' Get all fragment chunk alignments for this alignment subset'''
         aligned_files = [
             fp.get_job_result_by_name('hmmalign') for fp in ap.children
         ]
         fragfilesperap[ap] = aligned_files
     ''' Now, build an extended alignment *per each fragment chunk*.
         Simply merge all hmmalign results for fragment chunk numbered i'''
     # NOTE(review): this indexes aligned_files by chunk number i, so it
     # assumes each subset has exactly fragment_chunks children in chunk
     # order — TODO confirm against how the children are created.
     extendedAlignments = []
     for i in range(0, self.root_problem.fragment_chunks):
         extendedAlignment = ExtendedAlignment(pp.fragments.seq_names)
         for ap in pp.children:
             # _LOG.debug("Merging fragment chunks for subalignment : %s."
             # %(ap.label))
             if fragfilesperap[ap][i]:
                 ap_alg = ap.read_extendend_alignment_and_relabel_columns(
                     ap.jobs["hmmbuild"].infile, [fragfilesperap[ap][i]])
             else:
                 # No hmmalign result for this chunk: read only the backbone.
                 ap_alg = ap.read_extendend_alignment_and_relabel_columns(
                     ap.jobs["hmmbuild"].infile, [])
             _LOG.debug(
                 ("Merging alignment subset into placement subset for "
                  "chunk %d: %s.") % (i, ap.label))
             extendedAlignment.merge_in(ap_alg, convert_to_string=False)
         '''Extended alignmnts have all fragments. remove the ones that
            don't belong to thsi chunk'''
         extendedAlignment.remove_missing_fragments()
         extendedAlignment.from_bytearray_to_string()
         extendedAlignments.append(extendedAlignment)
     return extendedAlignments
Example #9
0
    def merge_subalignments(self):
        '''
        Merge alignment subset extended alignments to get one extended
        alignment for current placement subset.
        '''
        pp = self.placement_problem
        _LOG.info("Merging sub-alignments for placement problem : %s." %
                  (pp.label))
        ''' First find fragments assigned to this placement problem'''
        # Start from an empty soft sub-alignment, then claim every fragment
        # name assigned to any child alignment subset.
        pp.fragments = pp.parent.fragments.get_soft_sub_alignment([])
        for ap in pp.get_children():
            pp.fragments.seq_names |= set(ap.fragments)

        ''' Then, gather a list of all alignments relevant to this placement
        subset'''
        # fragfilesperap maps each alignment subset to its per-chunk hmmalign
        # result files, in child order.  Entries may be None for chunks that
        # produced no result; the per-chunk loop below handles that case.
        fragfilesperap = dict()
        for ap in pp.children:
            assert isinstance(ap, SeppProblem)
            ''' Get all fragment chunk alignments for this alignment subset'''
            aligned_files = [fp.get_job_result_by_name('hmmalign') for
                             fp in ap.children]
            fragfilesperap[ap] = aligned_files

        ''' Now, build an extended alignment *per each fragment chunk*.
            Simply merge all hmmalign results for fragment chunk numbered i'''
        # NOTE(review): indexing aligned_files by chunk number i assumes each
        # subset has exactly fragment_chunks children in chunk order — TODO
        # confirm against how the children are created.
        extendedAlignments = []
        for i in range(0, self.root_problem.fragment_chunks):
            extendedAlignment = ExtendedAlignment(pp.fragments.seq_names)
            for ap in pp.children:
                # _LOG.debug("Merging fragment chunks for subalignment : %s."
                # %(ap.label))
                if fragfilesperap[ap][i]:
                    ap_alg = ap.read_extendend_alignment_and_relabel_columns(
                        ap.jobs["hmmbuild"].infile, [fragfilesperap[ap][i]])
                else:
                    # No hmmalign result for this chunk: read the backbone
                    # alignment only.
                    ap_alg = ap.read_extendend_alignment_and_relabel_columns(
                        ap.jobs["hmmbuild"].infile, [])
                _LOG.debug(
                    ("Merging alignment subset into placement subset for "
                     "chunk %d: %s.") % (i, ap.label))
                extendedAlignment.merge_in(ap_alg, convert_to_string=False)
            '''Extended alignmnts have all fragments. remove the ones that
               don't belong to thsi chunk'''
            extendedAlignment.remove_missing_fragments()
            extendedAlignment.from_bytearray_to_string()
            extendedAlignments.append(extendedAlignment)
        return extendedAlignments
Example #10
0
    def merge_results(self):
        '''
        Build one extended alignment for the (single) placement subset by
        merging every alignment subset's hmmalign results, and store it in
        self.results.
        '''
        assert \
            len(self.root_problem.get_children()) == 1, \
            "Currently UPP works with only one placement subset."
        placement = self.root_problem.get_children()[0]
        _LOG.info("Merging sub-alignments for placement problem : %s." %
                  placement.label)
        # First claim every fragment assigned to any child alignment subset.
        placement.fragments = \
            placement.parent.fragments.get_soft_sub_alignment([])
        for subset in placement.get_children():
            placement.fragments.seq_names |= set(subset.fragments)
        # Then fold every hmmalign result into one extended alignment.
        _LOG.debug("fragments are %d:\n %s" %
                   (len(placement.fragments.seq_names),
                    placement.fragments.seq_names))
        merged = ExtendedAlignment(placement.fragments.seq_names)
        for subset in placement.children:
            assert isinstance(subset, SeppProblem)
            # Gather all fragment-chunk alignments produced for this subset.
            chunk_files = []
            for frag_chunk in subset.children:
                result = frag_chunk.get_job_result_by_name('hmmalign')
                if result is not None:
                    chunk_files.append(result)
            _LOG.debug("Merging fragment chunks for subalignment : %s." %
                       subset.label)
            subset_alignment = \
                subset.read_extendend_alignment_and_relabel_columns(
                    subset.jobs["hmmbuild"].infile, chunk_files)
            _LOG.debug("Merging alignment subset into placement subset: %s." %
                       subset.label)
            merged.merge_in(subset_alignment, convert_to_string=False)

        merged.from_bytearray_to_string()
        self.results = merged
Example #11
0
class UPPExhaustiveAlgorithm(ExhaustiveAlgorithm):
    '''
    This implements the exhaustive algorithm where all alignments subsets
    are searched for every fragment. This is for UPP, meaning that no placement
    is performed, and that there is always only one placement subset
    (currently).
    '''
    def __init__(self):
        ExhaustiveAlgorithm.__init__(self)
        # When True only the pasta backbone is generated; no fragment
        # alignment is performed.
        self.pasta_only = False

    def generate_backbone(self):
        """
        Build a backbone alignment and tree from the raw input sequences.

        Splits the degapped input into full-length (backbone) and
        fragmentary (query) sequences, aligns a random sample of the
        backbone with pasta, copies the resulting alignment/tree to the
        output files, and points options() at them.  Exits the process when
        no query sequences remain to be aligned.
        """
        _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
        sequences = MutableAlignment()
        sequences.read_file_object(self.options.sequence_file)
        sequences.degap()
        fragments = MutableAlignment()
        if (options().median_full_length is not None
                or options().full_length_range is not None):
            if (options().median_full_length == -1):
                # -1 means: estimate the median length from the data.
                seq_lengths = sorted(
                    [len(seq) for seq in list(sequences.values())])
                lengths = len(seq_lengths)
                l2 = lengths // 2
                if lengths % 2:
                    # Odd count: the middle element IS the median.  (The old
                    # code averaged l2 and l2+1 — not the median, and an
                    # IndexError for a single sequence.)
                    options().median_full_length = seq_lengths[l2]
                else:
                    # Even count: average the two middle elements.
                    options().median_full_length = (
                        seq_lengths[l2 - 1] + seq_lengths[l2]) / 2.0
            if options().full_length_range is not None:
                # An explicit "min max" range overrides the threshold rule.
                bounds = sorted(
                    int(x) for x in options().full_length_range.split())
                min_length = bounds[0]
                max_length = bounds[1]
            else:
                (min_length,
                 max_length) = (int(options().median_full_length *
                                    (1 - options().backbone_threshold)),
                                int(options().median_full_length *
                                    (1 + options().backbone_threshold)))
            _LOG.info(
                "Full length sequences are set to be from %d to %d character long"
                % (min_length, max_length))
            frag_names = [
                name for name in sequences if len(sequences[name]) > max_length
                or len(sequences[name]) < min_length
            ]
            if (len(frag_names) > 0):
                _LOG.info("Detected %d fragmentary sequences" %
                          len(frag_names))
                fragments = sequences.get_hard_sub_alignment(frag_names)
                for name in list(fragments.keys()):
                    sequences.pop(name)
        if (options().backbone_size is None):
            options().backbone_size = min(1000, int(sequences.get_num_taxa()))
            _LOG.info("Backbone size set to: %d" % (options().backbone_size))
        if (options().backbone_size > len(list(sequences.keys()))):
            options().backbone_size = len(list(sequences.keys()))
        sample = sorted(
            random.sample(sorted(list(sequences.keys())),
                          options().backbone_size))
        backbone_sequences = sequences.get_hard_sub_alignment(sample)
        _LOG.debug("Backbone: %s" % (sorted(list(backbone_sequences.keys()))))
        for name in list(backbone_sequences.keys()):
            sequences.pop(name)

        _LOG.info("Writing backbone set. ")
        backbone = get_temp_file("backbone", "backbone", ".fas")
        _write_fasta(backbone_sequences, backbone)

        _LOG.info("Generating pasta backbone alignment and tree. ")
        pastaalignJob = PastaAlignJob()
        moleculeType = options().molecule
        if (options().molecule == 'amino'):
            # pasta calls the amino-acid molecule type 'protein'.
            moleculeType = 'protein'
        pastaalignJob.setup(backbone,
                            options().backbone_size, moleculeType,
                            options().cpu, **vars(options().pasta))
        pastaalignJob.run()
        (a_file, t_file) = pastaalignJob.read_results()

        shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
        shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

        options().placement_size = self.options.backbone_size
        options().alignment_file = open(
            self.get_output_filename("pasta.fasta"))
        options().tree_file = open(self.get_output_filename("pasta.fasttree"))
        _LOG.info(
            "Backbone alignment written to %s.\nBackbone tree written to %s" %
            (options().alignment_file, options().tree_file))
        # Remaining (non-backbone) sequences plus detected fragments form
        # the query set.
        sequences.set_alignment(fragments)
        if (len(sequences) == 0):
            sequences = MutableAlignment()
            sequences.read_file_object(open(self.options.alignment_file.name))
            self.results = ExtendedAlignment(fragment_names=[])
            self.results.set_alignment(sequences)
            _LOG.info(
                "No query sequences to align.  Final alignment saved as %s" %
                self.get_output_filename("alignment.fasta"))
            self.output_results()
            sys.exit(0)
        else:
            query = get_temp_file("query", "backbone", ".fas")
            options().fragment_file = query
            _write_fasta(sequences, query)

    def check_options(self):
        """
        Validate the option combination; generate a backbone when only query
        sequences were provided.  Returns the parent class's check result.
        """
        self.check_outputprefix()
        options().info_file = "A_dummy_value"

        # Check to see if tree/alignment/fragment file provided, if not,
        # generate it from sequence file
        if (options().tree_file is not None
                and options().alignment_file is not None
                and options().sequence_file is not None):
            options().fragment_file = options().sequence_file
        elif (options().tree_file is None
              and options().alignment_file is None
              and options().sequence_file is not None):
            self.generate_backbone()
        else:
            _LOG.error(
                ("Either specify the backbone alignment and tree and query "
                 "sequences or only the query sequences.  Any other "
                 "combination is invalid"))
            exit(-1)
        sequences = MutableAlignment()
        sequences.read_file_object(open(self.options.alignment_file.name))
        backbone_size = sequences.get_num_taxa()
        if options().backbone_size is None:
            options().backbone_size = backbone_size
        assert options().backbone_size == backbone_size, (
            ("Backbone parameter needs to match actual size of backbone; "
             "backbone parameter:%s backbone_size:%s") %
            (options().backbone_size, backbone_size))
        if options().placement_size is None:
            options().placement_size = options().backbone_size

        if options().backtranslation_sequence_file and \
                options().molecule != "amino":
            _LOG.error(("Backtranslation can be performed only when "
                        "input sequences are amino acid. "))
            exit(-1)

        return ExhaustiveAlgorithm.check_options(self)

    def merge_results(self):
        """
        Build one extended alignment for the (single) placement subset by
        merging every alignment subset's hmmalign results; stores the result
        in self.results.
        """
        assert \
            len(self.root_problem.get_children()) == 1, \
            "Currently UPP works with only one placement subset."
        pp = self.root_problem.get_children()[0]
        _LOG.info("Merging sub-alignments for placement problem : %s." %
                  (pp.label))
        ''' First assign fragments to the placement problem'''
        pp.fragments = pp.parent.fragments.get_soft_sub_alignment([])
        for ap in pp.get_children():
            pp.fragments.seq_names |= set(ap.fragments)
        ''' Then Build an extended alignment by merging all hmmalign results'''
        _LOG.debug("fragments are %d:\n %s" %
                   (len(pp.fragments.seq_names), pp.fragments.seq_names))
        extendedAlignment = ExtendedAlignment(pp.fragments.seq_names)
        for ap in pp.children:
            assert isinstance(ap, SeppProblem)
            ''' Get all fragment chunk alignments for this alignment subset'''
            aligned_files = [
                fp.get_job_result_by_name('hmmalign') for fp in ap.children
                if fp.get_job_result_by_name('hmmalign') is not None
            ]
            _LOG.debug("Merging fragment chunks for subalignment : %s." %
                       (ap.label))
            ap_alg = ap.read_extendend_alignment_and_relabel_columns(
                ap.jobs["hmmbuild"].infile, aligned_files)
            _LOG.debug("Merging alignment subset into placement subset: %s." %
                       (ap.label))
            extendedAlignment.merge_in(ap_alg, convert_to_string=False)

        extendedAlignment.from_bytearray_to_string()
        self.results = extendedAlignment


# Useful for multi-core merging if ever needed
#    def parallel_merge_results(self):
#        assert len(self.root_problem.get_children()) == 1, "Currently UPP
#        works with only one placement subset."
#        '''
#        Merge alignment subset extended alignments to get one extended
#        alignment
#        for current placement subset.
#        '''
#        pp = self.root_problem.get_children()[0]
#        _LOG.info("Merging sub-alignments for placement problem : %s."
#        %(pp.label))
#        ''' Then Build an extended alignment by merging all hmmalign
#            results'''
#        manager = Manager()
#        extendedAlignments = manager.list()
#        for ap in pp.children:
#            assert isinstance(ap, SeppProblem)
#            ''' Get all fragment chunk alignments for this alignment subset'''
#            aligned_files = [fp.get_job_result_by_name('hmmalign') for
#                             fp in ap.children if
#                             fp.get_job_result_by_name('hmmalign')
#                               is not None]
#            _LOG.info("Merging fragment chunks for subalignment : %s."
#                       %(ap.label))
#            ap_alg = ap.read_extendend_alignment_and_relabel_columns\
#                        (ap.jobs["hmmbuild"].infile , aligned_files)
#            _LOG.info("Merging alignment subset into placement subset: %s."
#                          %(ap.label))
#            extendedAlignments.append(ap_alg)
#
#        while len(extendedAlignments)>1:
#            a=range(0,len(extendedAlignments))
#            #print [len(x) for x in extendedAlignments]
#            x = zip(a[0::2],a[1::2])
#            mapin = zip (x,[extendedAlignments]*len(x))
#            _LOG.debug("One round of merging started. Currently have %d
#                        alignments left. " %len(extendedAlignments))
#            Pool(max(12,len(extendedAlignments))).map(mergetwo,mapin)
#            #print [len(x) if x is not None else "None" for x in
#                    extendedAlignments]
#            extendedAlignments = manager.list([x for x in
#                             extendedAlignments if x is not None])
#            extendedAlignments.reverse()
#            _LOG.debug("One round of merging finished. Still have %d
#                       alignments left. " %len(extendedAlignments))
#        extendedAlignment = extendedAlignments[0]
#        extendedAlignment.from_bytearray_to_string()
#        self.results = extendedAlignment

    def output_results(self):
        """
        Write the final alignments: the unmasked extended alignment, the
        insertion-column index file, the masked alignment, and (when a
        backtranslation sequence file was given) the backtranslated
        alignments.  Backtranslation failures are logged, not fatal.
        """
        extended_alignment = self.results
        _LOG.info("Generating output. ")
        outfilename = self.get_output_filename("alignment.fasta")
        extended_alignment.write_to_path(outfilename)
        _LOG.info("Unmasked alignment written to %s" % outfilename)
        outfilename = self.get_output_filename("insertion_columns.txt")
        extended_alignment.write_insertion_column_indexes(outfilename)
        _LOG.info("The index of insertion columns written to %s" % outfilename)
        if self.options.backtranslation_sequence_file:
            outfilename = self.get_output_filename(
                "backtranslated_alignment.fasta")
            backtranslation_seqs = MutableAlignment()
            backtranslation_seqs.read_file_object(
                self.options.backtranslation_sequence_file)
            try:
                extended_backtranslated_alignment = backtranslate(
                    self.results, backtranslation_seqs)
            except Exception as e:
                # Best-effort: a failed backtranslation must not abort the
                # run; the amino-acid outputs below are still written.
                _LOG.warning("Backtranslation failed due "
                             "to following error: " + str(e) + ".\n"
                             "No translated DNA sequence will be "
                             "written to a file.")
            else:
                extended_backtranslated_alignment.write_to_path(outfilename)
                _LOG.info("Backtranslated alignment written to %s" %
                          outfilename)
                extended_backtranslated_alignment.remove_insertion_columns()
                outfilename = self.get_output_filename(
                    "backtranslated_alignment_masked.fasta")
                extended_backtranslated_alignment.write_to_path(outfilename)
                _LOG.info("Backtranslated masked alignment written "
                          "to %s" % outfilename)

        extended_alignment.remove_insertion_columns()
        outfilename = self.get_output_filename("alignment_masked.fasta")
        extended_alignment.write_to_path(outfilename)
        _LOG.info("Masked alignment written to %s" % outfilename)

    def check_and_set_sizes(self, total):
        """Force the placement subset to contain all taxa (UPP uses exactly
        one placement subset)."""
        assert (self.options.placement_size is None) or (
                self.options.placement_size >= total), \
                ("currently UPP works with only one placement subset."
                 " Please leave placement subset size option blank.")
        ExhaustiveAlgorithm.check_and_set_sizes(self, total)
        self.options.placement_size = total

    def _get_new_Join_Align_Job(self):
        # UPP uses its own join job for alignment results.
        return UPPJoinAlignJobs()

    def modify_tree(self, a_tree):
        ''' Filter out taxa on long branches '''
        self.filtered_taxa = []
        if self.options.long_branch_filter is not None:
            tr = a_tree.get_tree()
            elen = {}
            for e in tr.leaf_edge_iter():
                elen[e] = e.length
            elensort = sorted(elen.values())
            # Integer division is required for list indexing on Python 3;
            # the previous "/ 2" produced a float index (TypeError).
            mid = elensort[len(elensort) // 2]
            torem = []
            for k, v in list(elen.items()):
                # Prune leaves whose branch is longer than the filter factor
                # times the mid-point branch length.
                if v > mid * self.options.long_branch_filter:
                    self.filtered_taxa.append(k.head_node.taxon.label)
                    torem.append(k.head_node.taxon)
            tr.prune_taxa(torem)

    def create_fragment_files(self):
        """Split the fragment file into chunks (one per worker round),
        including any taxa pruned from the backbone by modify_tree."""
        alg_subset_count = len(list(self.root_problem.iter_leaves()))
        frag_chunk_count = lcm(alg_subset_count,
                               self.options.cpu) // alg_subset_count
        _LOG.info("%d taxa pruned from backbone and added to fragments: %s" %
                  (len(self.filtered_taxa), " , ".join(self.filtered_taxa)))
        return self.read_and_divide_fragments(
            frag_chunk_count,
            extra_frags=self.root_problem.subalignment.get_soft_sub_alignment(
                self.filtered_taxa))
Example #12
0
    def generate_backbone(self):
        """
        Build a backbone alignment and tree from the raw input sequences.

        Splits the degapped input into full-length (backbone) and
        fragmentary (query) sequences, aligns a random sample of the
        backbone with pasta, copies the resulting alignment/tree to the
        output files, and points options() at them.  Exits the process when
        no query sequences remain to be aligned.
        """
        _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
        sequences = MutableAlignment()
        sequences.read_file_object(self.options.sequence_file)
        sequences.degap()
        fragments = MutableAlignment()
        if (options().median_full_length is not None):
            if (options().median_full_length == -1):
                # -1 means: estimate the median length from the data.
                seq_lengths = sorted(
                    [len(seq) for seq in list(sequences.values())])
                lengths = len(seq_lengths)
                l2 = lengths // 2
                if lengths % 2:
                    # Odd count: the middle element IS the median.  (The old
                    # code averaged l2 and l2+1 — not the median, and an
                    # IndexError for a single sequence.)
                    options().median_full_length = seq_lengths[l2]
                else:
                    # Even count: average the two middle elements.
                    options().median_full_length = (
                        seq_lengths[l2 - 1] + seq_lengths[l2]) / 2.0

            (min_length, max_length) = (
                int(options().median_full_length * (
                    1 - options().backbone_threshold)),
                int(options().median_full_length*(
                    1 + options().backbone_threshold)))
            frag_names = [
                name for name in sequences
                if len(sequences[name]) > max_length or
                len(sequences[name]) < min_length]
            if (len(frag_names) > 0):
                _LOG.info(
                    "Detected %d fragmentary sequences" % len(frag_names))
                fragments = sequences.get_hard_sub_alignment(frag_names)
                for name in list(fragments.keys()):
                    sequences.pop(name)
        if (options().backbone_size is None):
            options().backbone_size = min(1000, int(sequences.get_num_taxa()))
            _LOG.info("Backbone size set to: %d" % (options().backbone_size))
        if (options().backbone_size > len(list(sequences.keys()))):
            options().backbone_size = len(list(sequences.keys()))
        sample = sorted(random.sample(
            sorted(list(sequences.keys())), options().backbone_size))
        backbone_sequences = sequences.get_hard_sub_alignment(sample)
        _LOG.debug("Backbone: %s" % (sorted(list(backbone_sequences.keys()))))
        for name in list(backbone_sequences.keys()):
            sequences.pop(name)

        _LOG.info("Writing backbone set. ")
        backbone = get_temp_file("backbone", "backbone", ".fas")
        _write_fasta(backbone_sequences, backbone)

        _LOG.info("Generating pasta backbone alignment and tree. ")
        pastaalignJob = PastaAlignJob()
        moleculeType = options().molecule
        if (options().molecule == 'amino'):
            # pasta calls the amino-acid molecule type 'protein'.
            moleculeType = 'protein'
        pastaalignJob.setup(backbone, options().backbone_size,
                            moleculeType, options().cpu)
        pastaalignJob.run()
        (a_file, t_file) = pastaalignJob.read_results()

        shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
        shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

        options().placement_size = self.options.backbone_size
        options().alignment_file = open(
            self.get_output_filename("pasta.fasta"))
        options().tree_file = open(self.get_output_filename("pasta.fasttree"))
        _LOG.info(
            "Backbone alignment written to %s.\nBackbone tree written to %s"
            % (options().alignment_file, options().tree_file))
        # Remaining (non-backbone) sequences plus detected fragments form
        # the query set.
        sequences.set_alignment(fragments)
        if (len(sequences) == 0):
            sequences = MutableAlignment()
            sequences.read_file_object(open(self.options.alignment_file.name))
            self.results = ExtendedAlignment(fragment_names=[])
            self.results.set_alignment(sequences)
            _LOG.info(
                "No query sequences to align.  Final alignment saved as %s"
                % self.get_output_filename("alignment.fasta"))
            self.output_results()
            sys.exit(0)
        else:
            query = get_temp_file("query", "backbone", ".fas")
            options().fragment_file = query
            _write_fasta(sequences, query)
Example #13
0
class UPPExhaustiveAlgorithm(ExhaustiveAlgorithm):
    '''
    This implements the exhaustive algorithm where all alignments subsets
    are searched for every fragment. This is for UPP, meaning that no placement
    is performed, and that there is always only one placement subset
    (currently).
    '''
    def __init__(self):
        ExhaustiveAlgorithm.__init__(self)
        # presumably a switch to stop after the PASTA backbone step —
        # not read anywhere in this excerpt; TODO confirm against callers.
        self.pasta_only = False

    def generate_backbone(self):
        """Build a backbone alignment and tree from the raw sequence file.

        Splits the input into full-length (backbone candidate) and
        fragmentary (query) sequences, runs PASTA on a random sample of
        full-length sequences, and records the resulting alignment/tree
        plus the query fragment file in the global ``options()``.

        Side effects: rewrites ``options().median_full_length``,
        ``backbone_size``, ``placement_size``, ``alignment_file``,
        ``tree_file`` and ``fragment_file``; calls ``sys.exit(0)`` when
        no query sequences remain to align.
        """
        _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
        sequences = MutableAlignment()
        sequences.read_file_object(self.options.sequence_file)
        sequences.degap()
        fragments = MutableAlignment()
        if (options().median_full_length is not None):
            if (options().median_full_length == -1):
                # -1 means: compute the median sequence length from the data.
                seq_lengths = sorted(
                    [len(seq) for seq in list(sequences.values())])
                lengths = len(seq_lengths)
                l2 = lengths // 2
                if lengths % 2:
                    # Odd count: the middle element is the median.
                    # (Bug fix: the original swapped the two branches and
                    # averaged seq_lengths[l2] with seq_lengths[l2 + 1],
                    # which is not the median for either parity.)
                    options().median_full_length = seq_lengths[l2]
                else:
                    # Even count: average the two middle elements.
                    options().median_full_length = (
                        seq_lengths[l2 - 1] + seq_lengths[l2]) / 2.0

            # Sequences outside [median*(1-t), median*(1+t)] are fragments.
            (min_length, max_length) = (
                int(options().median_full_length * (
                    1 - options().backbone_threshold)),
                int(options().median_full_length * (
                    1 + options().backbone_threshold)))
            frag_names = [
                name for name in sequences
                if len(sequences[name]) > max_length or
                len(sequences[name]) < min_length]
            if (len(frag_names) > 0):
                _LOG.info(
                    "Detected %d fragmentary sequences" % len(frag_names))
                fragments = sequences.get_hard_sub_alignment(frag_names)
                for name in list(fragments.keys()):
                    sequences.pop(name)
        if (options().backbone_size is None):
            options().backbone_size = min(1000, int(sequences.get_num_taxa()))
            _LOG.info("Backbone size set to: %d" % (options().backbone_size))
        if (options().backbone_size > len(list(sequences.keys()))):
            options().backbone_size = len(list(sequences.keys()))
        # Sample the backbone; sorting makes the choice reproducible for a
        # fixed random seed.
        sample = sorted(random.sample(
            sorted(list(sequences.keys())), options().backbone_size))
        backbone_sequences = sequences.get_hard_sub_alignment(sample)
        _LOG.debug("Backbone: %s" % (sorted(list(backbone_sequences.keys()))))
        for name in list(backbone_sequences.keys()):
            sequences.pop(name)

        _LOG.info("Writing backbone set. ")
        backbone = get_temp_file("backbone", "backbone", ".fas")
        _write_fasta(backbone_sequences, backbone)

        _LOG.info("Generating pasta backbone alignment and tree. ")
        pastaalignJob = PastaAlignJob()
        moleculeType = options().molecule
        if (options().molecule == 'amino'):
            # PASTA expects 'protein' where sepp uses 'amino'.
            moleculeType = 'protein'
        pastaalignJob.setup(backbone, options().backbone_size,
                            moleculeType, options().cpu)
        pastaalignJob.run()
        (a_file, t_file) = pastaalignJob.read_results()

        shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
        shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

        options().placement_size = self.options.backbone_size
        options().alignment_file = open(
            self.get_output_filename("pasta.fasta"))
        options().tree_file = open(self.get_output_filename("pasta.fasttree"))
        _LOG.info(
            "Backbone alignment written to %s.\nBackbone tree written to %s"
            % (options().alignment_file, options().tree_file))
        # Remaining non-backbone full-length sequences join the fragments
        # as the query set.
        sequences.set_alignment(fragments)
        if (len(sequences) == 0):
            # Nothing left to align: the backbone alignment is the result.
            sequences = MutableAlignment()
            sequences.read_file_object(open(self.options.alignment_file.name))
            self.results = ExtendedAlignment(fragment_names=[])
            self.results.set_alignment(sequences)
            _LOG.info(
                "No query sequences to align.  Final alignment saved as %s"
                % self.get_output_filename("alignment.fasta"))
            self.output_results()
            sys.exit(0)
        else:
            query = get_temp_file("query", "backbone", ".fas")
            options().fragment_file = query
            _write_fasta(sequences, query)

    def check_options(self):
        """Validate UPP option combinations, generating the backbone when
        only raw sequences are supplied, then delegate to the base class.

        Exits with -1 on an invalid combination of tree/alignment/sequence
        files; raises AssertionError if a user-supplied backbone size does
        not match the actual backbone alignment.
        """
        self.check_outputprefix()
        options().info_file = "A_dummy_value"

        # Either all of tree + alignment + sequences are given (sequences
        # become the query fragments), or only sequences are given (the
        # backbone is generated from them).  Anything else is an error.
        if (
            (options().tree_file is not None) and
            (options().alignment_file is not None) and
            (options().sequence_file is not None)
           ):
            options().fragment_file = options().sequence_file
        elif (
              (options().tree_file is None) and
              (options().alignment_file is None) and
              (options().sequence_file is not None)
             ):
            self.generate_backbone()
        else:
            _LOG.error(
                ("Either specify the backbone alignment and tree and query "
                 "sequences or only the query sequences.  Any other "
                 "combination is invalid"))
            exit(-1)
        # The backbone-size option must agree with the actual backbone.
        sequences = MutableAlignment()
        sequences.read_file_object(open(self.options.alignment_file.name))
        backbone_size = sequences.get_num_taxa()
        if options().backbone_size is None:
            options().backbone_size = backbone_size
        assert options().backbone_size == backbone_size, (
            ("Backbone parameter needs to match actual size of backbone; "
             "backbone parameter:%s backbone_size:%s")
            % (options().backbone_size, backbone_size))
        if options().placement_size is None:
            options().placement_size = options().backbone_size
        return ExhaustiveAlgorithm.check_options(self)

    def merge_results(self):
        """Merge the per-alignment-subset extended alignments into one
        extended alignment for the single placement subset and store it
        in ``self.results``.
        """
        # UPP invariant: exactly one placement subset.
        assert \
            len(self.root_problem.get_children()) == 1, \
            "Currently UPP works with only one placement subset."
        '''
        Merge alignment subset extended alignments to get one extended
        alignment for current placement subset.
        '''
        pp = self.root_problem.get_children()[0]
        _LOG.info(
            "Merging sub-alignments for placement problem : %s." % (pp.label))
        ''' First assign fragments to the placement problem'''
        # Start from an empty soft sub-alignment and union in the fragment
        # names of every alignment subset.
        pp.fragments = pp.parent.fragments.get_soft_sub_alignment([])
        for ap in pp.get_children():
            pp.fragments.seq_names |= set(ap.fragments)

        ''' Then Build an extended alignment by merging all hmmalign results'''
        _LOG.debug(
            "fragments are %d:\n %s" % (
                len(pp.fragments.seq_names), pp.fragments.seq_names))
        extendedAlignment = ExtendedAlignment(pp.fragments.seq_names)
        for ap in pp.children:
            assert isinstance(ap, SeppProblem)
            ''' Get all fragment chunk alignments for this alignment subset'''
            # Fragment chunks with no hmmalign result are skipped.
            aligned_files = [fp.get_job_result_by_name('hmmalign') for
                             fp in ap.children if
                             fp.get_job_result_by_name('hmmalign') is not None]
            _LOG.debug(
                "Merging fragment chunks for subalignment : %s." % (ap.label))
            ap_alg = ap.read_extendend_alignment_and_relabel_columns(
                ap.jobs["hmmbuild"].infile, aligned_files)
            _LOG.debug(
                "Merging alignment subset into placement subset: %s." %
                (ap.label))
            # convert_to_string=False: keep the bytearray form until all
            # subsets are merged, then convert once below.
            extendedAlignment.merge_in(ap_alg, convert_to_string=False)

        extendedAlignment.from_bytearray_to_string()
        self.results = extendedAlignment

# Useful for multi-core merging if ever needed
#    def parallel_merge_results(self):
#        assert len(self.root_problem.get_children()) == 1, "Currently UPP
#        works with only one placement subset."
#        '''
#        Merge alignment subset extended alignments to get one extended
#        alignment
#        for current placement subset.
#        '''
#        pp = self.root_problem.get_children()[0]
#        _LOG.info("Merging sub-alignments for placement problem : %s."
#        %(pp.label))
#        ''' Then Build an extended alignment by merging all hmmalign
#            results'''
#        manager = Manager()
#        extendedAlignments = manager.list()
#        for ap in pp.children:
#            assert isinstance(ap, SeppProblem)
#            ''' Get all fragment chunk alignments for this alignment subset'''
#            aligned_files = [fp.get_job_result_by_name('hmmalign') for
#                             fp in ap.children if
#                             fp.get_job_result_by_name('hmmalign')
#                               is not None]
#            _LOG.info("Merging fragment chunks for subalignment : %s."
#                       %(ap.label))
#            ap_alg = ap.read_extendend_alignment_and_relabel_columns\
#                        (ap.jobs["hmmbuild"].infile , aligned_files)
#            _LOG.info("Merging alignment subset into placement subset: %s."
#                          %(ap.label))
#            extendedAlignments.append(ap_alg)
#
#        while len(extendedAlignments)>1:
#            a=range(0,len(extendedAlignments))
#            #print [len(x) for x in extendedAlignments]
#            x = zip(a[0::2],a[1::2])
#            mapin = zip (x,[extendedAlignments]*len(x))
#            _LOG.debug("One round of merging started. Currently have %d
#                        alignments left. " %len(extendedAlignments))
#            Pool(max(12,len(extendedAlignments))).map(mergetwo,mapin)
#            #print [len(x) if x is not None else "None" for x in
#                    extendedAlignments]
#            extendedAlignments = manager.list([x for x in
#                             extendedAlignments if x is not None])
#            extendedAlignments.reverse()
#            _LOG.debug("One round of merging finished. Still have %d
#                       alignments left. " %len(extendedAlignments))
#        extendedAlignment = extendedAlignments[0]
#        extendedAlignment.from_bytearray_to_string()
#        self.results = extendedAlignment

    def output_results(self):
        """Write the merged extended alignment to disk: the unmasked
        alignment, the insertion-column index file, and finally the
        masked (insertion-columns-removed) alignment."""
        alignment = self.results
        _LOG.info("Generating output. ")

        unmasked_path = self.get_output_filename("alignment.fasta")
        alignment.write_to_path(unmasked_path)
        _LOG.info("Unmasked alignment written to %s" % unmasked_path)

        columns_path = self.get_output_filename("insertion_columns.txt")
        alignment.write_insertion_column_indexes(columns_path)
        _LOG.info("The index of insertion columns written to %s" % columns_path)

        # Removing insertion columns mutates the alignment in place, so the
        # masked output must be written last.
        alignment.remove_insertion_columns()
        masked_path = self.get_output_filename("alignment_masked.fasta")
        alignment.write_to_path(masked_path)
        _LOG.info("Masked alignment written to %s" % masked_path)

    def check_and_set_sizes(self, total):
        """Force a single placement subset containing all *total* taxa.

        Rejects any user-supplied placement size smaller than the full
        set, since UPP supports exactly one placement subset.
        """
        size_ok = (self.options.placement_size is None
                   or self.options.placement_size >= total)
        assert size_ok, \
            ("currently UPP works with only one placement subset."
             " Please leave placement subset size option blank.")
        ExhaustiveAlgorithm.check_and_set_sizes(self, total)
        self.options.placement_size = total

    def _get_new_Join_Align_Job(self):
        # Factory hook: UPP substitutes its own join-align job type.
        return UPPJoinAlignJobs()

    def modify_tree(self, a_tree):
        """Filter out taxa on long branches.

        Prunes from *a_tree* every leaf whose terminal edge is longer than
        ``long_branch_filter`` times the (upper) median leaf-edge length,
        recording the removed taxon labels in ``self.filtered_taxa``.
        """
        self.filtered_taxa = []
        if self.options.long_branch_filter is not None:
            tr = a_tree.get_tree()
            elen = {}
            for e in tr.leaf_edge_iter():
                elen[e] = e.length
            elensort = sorted(elen.values())
            # Bug fix: use integer division. `len(...) / 2` is a float on
            # Python 3 and raises TypeError when used as a list index.
            mid = elensort[len(elensort) // 2]
            torem = []
            for k, v in list(elen.items()):
                if v > mid * self.options.long_branch_filter:
                    self.filtered_taxa.append(k.head_node.taxon.label)
                    torem.append(k.head_node.taxon)
            tr.prune_taxa(torem)

    def create_fragment_files(self):
        """Split the fragments into chunks balanced across CPUs and
        alignment subsets, adding back the taxa pruned from the backbone
        by the long-branch filter."""
        subset_count = len(list(self.root_problem.iter_leaves()))
        # Number of chunks per subset so that subsets * chunks is a
        # multiple of the CPU count.
        chunk_count = lcm(subset_count, self.options.cpu) // subset_count
        _LOG.info(
            "%d taxa pruned from backbone and added to fragments: %s"
            % (len(self.filtered_taxa), " , ".join(self.filtered_taxa)))
        pruned = self.root_problem.subalignment.get_soft_sub_alignment(
            self.filtered_taxa)
        return self.read_and_divide_fragments(chunk_count, extra_frags=pruned)
                writer.write(line.upper())
            else:
                writer.write(line)

# Read the backbone alignment (new_backbone_file is defined earlier,
# outside this excerpt).
original_backbone = MutableAlignment()
done = original_backbone.read_filepath(new_backbone_file)

# all query sequences
# NOTE(review): hard-coded absolute path to a local checkout — parameterize
# before reuse.
original_frag_file = (
    '/Users/gillianchu/warnow/bin/gitrepos/smirarab-sepp-17a33aa/trial/all_query.txt'
)
original_frag = MutableAlignment()
done = original_frag.read_filepath(original_frag_file)

# First build extended alignment on entire fragment set
extendedAlignment = ExtendedAlignment(original_frag.get_sequence_names())
# Working directory holding the per-HMM alignment/query files.
# NOTE(review): `dir` shadows the builtin of the same name.
dir = '/Users/gillianchu/warnow/bin/gitrepos/smirarab-sepp-17a33aa/trial/'

for a in [1, 2]:
    a = str(a)
    print("Working on HMM %s\n" % a)

    # query sequences
    aligned_files = glob.glob(str(dir) + 's' + str(a) + '_query.aln')
    if a == '1':
        sequence_files = glob.glob(str(dir) + 'query_x.txt')
    elif a == '2':
        sequence_files = glob.glob(str(dir) + 'query_y.txt')

    # sequences your hmm was trained on. Ensure you didn't just take the backbone alignment and
    # restrict the subset of sequences. This file must not have any gaps in it.
Example #15
0
    def testExtendedAlignment(self):
        """Exercise ExtendedAlignment build/relabel/merge on two subproblems
        of a simulated alignment, checking lengths, insertion-column counts,
        and that the merged base alignment matches the original.

        NOTE(review): Python 2 syntax (print statements, xrange) — this
        example predates the Python 3 port.
        """
        print "======= starting testExtendedAlignment ========="

        # Taxon names forming the first alignment subset.
        subset = [
            "SFIF", "SFII", "SCFC", "SGHD", "SDCC", "SBGE", "SFBB", "SDI",
            "SCGB", "SJGF", "SGBI", "SCJA", "SGAD", "SHEB", "SFHB", "SDJI",
            "SHED", "SJJJ", "SBBE", "SCCH", "SDJB", "SDAC", "SHEH", "SFDC",
            "SFEI", "SHHB", "SC", "SIAB", "SDDI", "SBCB", "SJB", "SEBD",
            "SFGD", "SHA", "SIDA", "SGHI", "SGIB", "SBFJ", "SFIE", "SCJF",
            "SJHJ", "SJBG", "SEJI", "SFFF", "SJ", "SIII", "SJHH", "SEIH",
            "SBDC", "SHDJ", "SJDD", "SGDB", "SIHA", "SIBB", "SECC", "SCAD",
            "SGBB", "SGIF", "SJHC", "SFCD", "SEAA", "SEFF", "SDFG", "SDJE",
            "SCFG", "SFH", "SCJ", "SDDD", "SEGD", "SCIH", "SDAG", "SCJE",
            "SFAJ", "SIDJ", "SE", "SHBC", "SJFF", "SCHD", "SBHA", "SEDF",
            "SFAF", "SEDD", "SDHD", "SGJD", "SIBH", "SGDF", "SIFA", "SJGA",
            "SIJB", "SFI", "SGA", "SBFC", "SBJA", "SFFC", "SFDH", "SFEE",
            "SBDF", "SGBJ", "SDHE", "SJIB", "SHHI", "SIDE", "SJII"
        ]

        # Base (backbone) alignment, with all-gap columns removed.
        alg = MutableAlignment()
        alg.read_filepath("data/simulated/test.fasta")
        alg.delete_all_gap()
        tlen = alg.get_length()

        # Fragment (query) sequences.
        frg = MutableAlignment()
        frg.read_filepath("data/simulated/test.fas")
        #print frg.get_num_taxa()

        pp = SeppProblem(alg.keys())
        pp.fragments = frg
        pp.subalignment = alg

        # Two child subproblems partitioning the taxa; fragments are split
        # by the last character of their name.
        cp1 = SeppProblem(subset, pp)
        cp2 = SeppProblem(list(set(alg.keys()) - set(subset)), pp)
        cp1.fragments = ReadonlySubalignment(
            [k for k in frg.keys() if int(k[-1]) >= 9], frg)
        cp2.fragments = ReadonlySubalignment(
            [k for k in frg.keys() if int(k[-1]) <= 1], frg)

        cp1labels = cp1.write_subalignment_without_allgap_columns(
            "data/tmp/cp1.fasta")
        cp2labels = cp2.write_subalignment_without_allgap_columns(
            "data/tmp/cp2.fasta")
        # Written subalignments must contain no all-gap columns.
        tmp = MutableAlignment().read_filepath("data/tmp/cp1.fasta")
        assert all(
            [not tmp.is_all_gap(pos) for pos in xrange(0, tmp.get_length())])
        tmp = MutableAlignment().read_filepath("data/tmp/cp2.fasta")
        assert all(
            [not tmp.is_all_gap(pos) for pos in xrange(0, tmp.get_length())])

        cp1.fragments.write_to_path("data/tmp/cp1.frags.fas")
        cp2.fragments.write_to_path("data/tmp/cp2.frags.fas")
        '''We have done the hmmalign before. don't worry about that right now'''

        # Build extended alignments from the precomputed hmmalign outputs
        # and map their columns back to the original column labels.
        ext1 = ExtendedAlignment(cp1.fragments)
        ext1.build_extended_alignment("data/tmp/cp1.fasta",
                                      "data/tmp/cp1.extended.sto")
        ext1.relabel_original_columns(cp1labels)
        ext2 = ExtendedAlignment(cp2.fragments)
        ext2.build_extended_alignment("data/tmp/cp2.fasta",
                                      "data/tmp/cp2.extended.sto")
        ext2.relabel_original_columns(cp2labels)

        # merge_in returns the number of shared ("mixed") insertion columns.
        extmerger = ExtendedAlignment([])
        extmerger.merge_in(ext1)
        mixed = extmerger.merge_in(ext2)

        extmerger.write_to_path("data/tmp/extended.merged.fasta")

        assert extmerger.is_aligned(), "Merged alignment is not aligned"
        # Insertion columns are labeled with negative indices.
        in1 = len([x for x in ext1._col_labels if x < 0])
        in2 = len([x for x in ext2._col_labels if x < 0])
        print "Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d" % (
            extmerger.get_length(), in1, in2, tlen)
        assert (in1 + in2 + tlen - mixed) == extmerger.get_length(
        ), "Lengths don't match up after merging. Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d Mixed-insertion: %d" % (
            extmerger.get_length(), in1, in2, tlen, mixed)
        # NOTE(review): the message below interpolates `in1` twice; the
        # second was presumably meant to be `in2`.
        assert (in1 + in2 - mixed) == len(
            list(extmerger.iter_insertion_columns())
        ), "Columns are not correctly labeled after merging. Merged insertion count:%d. Insertion1:%d Insertion2:%d Mixed-insertion: %d" % (
            len(list(extmerger.iter_insertion_columns())), in1, in1, mixed)

        # After dropping gaps, the merged base alignment must equal the
        # original alignment exactly.
        tmp = extmerger.get_base_readonly_alignment().get_mutable_alignment()
        tmp.delete_all_gap()
        assert tmp.is_aligned(), "merged alignment should be aligned!"
        assert tmp.get_length() == tlen, "merged alignment has wrong length"
        assert all([alg[k] == s for (k, s) in tmp.items()
                    ]), "merged alignment should match original alignment"

        print "======= finished testExtendedAlignment ========="
Example #16
0
from sepp.scheduler import JobPool
from multiprocessing import Pool, Manager
from sepp.alignment import ExtendedAlignment
import glob

# NOTE(review): JoinAlignJobs is referenced but not imported in this
# excerpt; presumably imported elsewhere — verify.
job_joiner = JoinAlignJobs
# NOTE(review): hard-coded cluster paths — parameterize before reuse.
original_backbone_file = '/projects/sate8/namphuon/ultra_large/1000000/sate.fasta'
original_backbone = MutableAlignment()
done = original_backbone.read_filepath(original_backbone_file)

original_frag_file = '/projects/sate8/namphuon/ultra_large/1000000/initial.fas.100'
original_frag = MutableAlignment()
done = original_frag.read_filepath(original_frag_file)

#First build extended alignment on entire fragment set
extendedAlignment = ExtendedAlignment(original_frag.get_sequence_names())

# One directory per alignment subset (A_0_*) of placement subset P_0.
dirs = glob.glob('/projects/sate8/namphuon/ultra_large/1000000/upp_100_10_new/temp/upp.1_HNlM/root/P_0/A_0_*/')

dirs.reverse()
for dir in dirs:
  print "Working on %s\n" % dir
  aligned_files = glob.glob('%sFC_*/hmmalign.results.*' % dir)
  sequence_files = glob.glob('%sFC_*/hmmalign.frag.*' % dir)
  base_alignment_file = glob.glob('%s/*.fasta' % dir)
  base_alignment = MutableAlignment()
  done = base_alignment.read_filepath(base_alignment_file[0])
  subbackbone = original_backbone.get_soft_sub_alignment(base_alignment.get_sequence_names())
  frags = MutableAlignment()
  sequence_names = []
  for file in sequence_files:
    def testExtendedAlignment(self):
        """Exercise ExtendedAlignment build/relabel/merge on two subproblems
        of a simulated alignment (unformatted Python 2 variant of the same
        test that appears earlier in this file).
        """
        print "======= starting testExtendedAlignment ========="

        # Taxon names forming the first alignment subset.
        subset = ["SFIF","SFII","SCFC","SGHD","SDCC","SBGE","SFBB","SDI","SCGB","SJGF","SGBI","SCJA","SGAD","SHEB","SFHB","SDJI","SHED","SJJJ","SBBE","SCCH","SDJB","SDAC","SHEH","SFDC","SFEI","SHHB","SC","SIAB","SDDI","SBCB","SJB","SEBD","SFGD","SHA","SIDA","SGHI","SGIB","SBFJ","SFIE","SCJF","SJHJ","SJBG","SEJI","SFFF","SJ","SIII","SJHH","SEIH","SBDC","SHDJ","SJDD","SGDB","SIHA","SIBB","SECC","SCAD","SGBB","SGIF","SJHC","SFCD","SEAA","SEFF","SDFG","SDJE","SCFG","SFH","SCJ","SDDD","SEGD","SCIH","SDAG","SCJE","SFAJ","SIDJ","SE","SHBC","SJFF","SCHD","SBHA","SEDF","SFAF","SEDD","SDHD","SGJD","SIBH","SGDF","SIFA","SJGA","SIJB","SFI","SGA","SBFC","SBJA","SFFC","SFDH","SFEE","SBDF","SGBJ","SDHE","SJIB","SHHI","SIDE","SJII"]

        # Base (backbone) alignment with all-gap columns removed.
        alg = MutableAlignment()
        alg.read_filepath("data/simulated/test.fasta")
        alg.delete_all_gap()
        tlen = alg.get_length()

        # Fragment (query) sequences.
        frg = MutableAlignment()
        frg.read_filepath("data/simulated/test.fas")
        #print frg.get_num_taxa()

        pp = SeppProblem(alg.keys())
        pp.fragments = frg
        pp.subalignment = alg

        # Two child subproblems; fragments are split by name suffix digit.
        cp1 = SeppProblem(subset, pp)
        cp2 = SeppProblem(list(set(alg.keys()) -set(subset)), pp)
        cp1.fragments = ReadonlySubalignment([k for k in frg.keys() if int(k[-1]) >= 9], frg)
        cp2.fragments = ReadonlySubalignment([k for k in frg.keys() if int(k[-1]) <= 1], frg)

        cp1labels = cp1.write_subalignment_without_allgap_columns("data/tmp/cp1.fasta")
        cp2labels = cp2.write_subalignment_without_allgap_columns("data/tmp/cp2.fasta")
        # Written subalignments must contain no all-gap columns.
        tmp = MutableAlignment().read_filepath("data/tmp/cp1.fasta")
        assert all([not tmp.is_all_gap(pos) for pos in xrange(0,tmp.get_length())])
        tmp = MutableAlignment().read_filepath("data/tmp/cp2.fasta")
        assert all([not tmp.is_all_gap(pos) for pos in xrange(0,tmp.get_length())])

        cp1.fragments.write_to_path("data/tmp/cp1.frags.fas")
        cp2.fragments.write_to_path("data/tmp/cp2.frags.fas")

        '''We have done the hmmalign before. don't worry about that right now'''

        # Build extended alignments from the precomputed hmmalign outputs.
        ext1 = ExtendedAlignment(cp1.fragments)
        ext1.build_extended_alignment("data/tmp/cp1.fasta", "data/tmp/cp1.extended.sto")
        ext1.relabel_original_columns(cp1labels)
        ext2 = ExtendedAlignment(cp2.fragments)
        ext2.build_extended_alignment("data/tmp/cp2.fasta", "data/tmp/cp2.extended.sto")
        ext2.relabel_original_columns(cp2labels)

        # merge_in returns the count of shared ("mixed") insertion columns.
        extmerger = ExtendedAlignment([])
        extmerger.merge_in(ext1)
        mixed = extmerger.merge_in(ext2)

        extmerger.write_to_path("data/tmp/extended.merged.fasta")

        assert extmerger.is_aligned(), "Merged alignment is not aligned"
        # Insertion columns are labeled with negative indices.
        in1 = len([x for x in ext1._col_labels if x<0])
        in2 = len([x for x in ext2._col_labels if x<0])
        print "Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d" %(extmerger.get_length(),in1 , in2 , tlen)
        assert ( in1 + in2 + tlen - mixed) == extmerger.get_length(), "Lengths don't match up after merging. Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d Mixed-insertion: %d"  %(extmerger.get_length(),in1, in2 , tlen, mixed)
        # NOTE(review): the message below interpolates `in1` twice; the
        # second was presumably meant to be `in2`.
        assert ( in1 + in2 - mixed) == len(list(extmerger.iter_insertion_columns())), "Columns are not correctly labeled after merging. Merged insertion count:%d. Insertion1:%d Insertion2:%d Mixed-insertion: %d"  %(len(list(extmerger.iter_insertion_columns())),in1 , in1, mixed)


        # After dropping gaps, the merged base alignment must equal the
        # original alignment exactly.
        tmp = extmerger.get_base_readonly_alignment().get_mutable_alignment()
        tmp.delete_all_gap()
        assert tmp.is_aligned(), "merged alignment should be aligned!"
        assert tmp.get_length() == tlen, "merged alignment has wrong length"
        assert all([alg[k] == s for (k,s) in tmp.items()]), "merged alignment should match original alignment"


        print "======= finished testExtendedAlignment ========="
Example #18
0
    def generate_backbone(self):
        """Build a backbone alignment and tree from the raw sequence file
        (variant supporting an explicit full-length range).

        Splits the input into full-length (backbone candidate) and
        fragmentary (query) sequences — using either an explicit
        ``full_length_range`` or a band around the median length — runs
        PASTA on a random sample of full-length sequences, and records the
        resulting alignment/tree plus the query fragment file in the
        global ``options()``.  Calls ``sys.exit(0)`` when no query
        sequences remain to align.
        """
        _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
        sequences = MutableAlignment()
        sequences.read_file_object(self.options.sequence_file)
        sequences.degap()
        fragments = MutableAlignment()
        if (options().median_full_length is not None
                or options().full_length_range is not None):
            if (options().median_full_length == -1):
                # -1 means: compute the median sequence length from the data.
                seq_lengths = sorted(
                    [len(seq) for seq in list(sequences.values())])
                lengths = len(seq_lengths)
                l2 = lengths // 2
                if lengths % 2:
                    # Odd count: the middle element is the median.
                    # (Bug fix: the original swapped the two branches and
                    # averaged seq_lengths[l2] with seq_lengths[l2 + 1],
                    # which is not the median for either parity.)
                    options().median_full_length = seq_lengths[l2]
                else:
                    # Even count: average the two middle elements.
                    options().median_full_length = (seq_lengths[l2 - 1] +
                                                    seq_lengths[l2]) / 2.0
            if options().full_length_range is not None:
                # An explicit "min max" range overrides the median band.
                L = sorted(int(x) for x in options().full_length_range.split())
                min_length = L[0]
                max_length = L[1]
            else:
                (min_length,
                 max_length) = (int(options().median_full_length *
                                    (1 - options().backbone_threshold)),
                                int(options().median_full_length *
                                    (1 + options().backbone_threshold)))
            _LOG.info(
                "Full length sequences are set to be from %d to %d character long"
                % (min_length, max_length))
            frag_names = [
                name for name in sequences if len(sequences[name]) > max_length
                or len(sequences[name]) < min_length
            ]
            if (len(frag_names) > 0):
                _LOG.info("Detected %d fragmentary sequences" %
                          len(frag_names))
                fragments = sequences.get_hard_sub_alignment(frag_names)
                for name in list(fragments.keys()):
                    sequences.pop(name)
        if (options().backbone_size is None):
            options().backbone_size = min(1000, int(sequences.get_num_taxa()))
            _LOG.info("Backbone size set to: %d" % (options().backbone_size))
        if (options().backbone_size > len(list(sequences.keys()))):
            options().backbone_size = len(list(sequences.keys()))
        # Sample the backbone; sorting makes the choice reproducible for a
        # fixed random seed.
        sample = sorted(
            random.sample(sorted(list(sequences.keys())),
                          options().backbone_size))
        backbone_sequences = sequences.get_hard_sub_alignment(sample)
        _LOG.debug("Backbone: %s" % (sorted(list(backbone_sequences.keys()))))
        for name in list(backbone_sequences.keys()):
            sequences.pop(name)

        _LOG.info("Writing backbone set. ")
        backbone = get_temp_file("backbone", "backbone", ".fas")
        _write_fasta(backbone_sequences, backbone)

        _LOG.info("Generating pasta backbone alignment and tree. ")
        pastaalignJob = PastaAlignJob()
        moleculeType = options().molecule
        if (options().molecule == 'amino'):
            # PASTA expects 'protein' where sepp uses 'amino'.
            moleculeType = 'protein'
        pastaalignJob.setup(backbone,
                            options().backbone_size, moleculeType,
                            options().cpu, **vars(options().pasta))
        pastaalignJob.run()
        (a_file, t_file) = pastaalignJob.read_results()

        shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
        shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

        options().placement_size = self.options.backbone_size
        options().alignment_file = open(
            self.get_output_filename("pasta.fasta"))
        options().tree_file = open(self.get_output_filename("pasta.fasttree"))
        _LOG.info(
            "Backbone alignment written to %s.\nBackbone tree written to %s" %
            (options().alignment_file, options().tree_file))
        # Remaining non-backbone full-length sequences join the fragments
        # as the query set.
        sequences.set_alignment(fragments)
        if (len(sequences) == 0):
            # Nothing left to align: the backbone alignment is the result.
            sequences = MutableAlignment()
            sequences.read_file_object(open(self.options.alignment_file.name))
            self.results = ExtendedAlignment(fragment_names=[])
            self.results.set_alignment(sequences)
            _LOG.info(
                "No query sequences to align.  Final alignment saved as %s" %
                self.get_output_filename("alignment.fasta"))
            self.output_results()
            sys.exit(0)
        else:
            query = get_temp_file("query", "backbone", ".fas")
            options().fragment_file = query
            _write_fasta(sequences, query)
Example #19
0
    def merge_results(self):
        """Merge per-chunk extended alignments into ``self.results`` and
        produce the final placement.json — concatenating placements
        directly when there is a single placement subset, otherwise
        delegating to the external merge job.
        """
        assert isinstance(self.root_problem, SeppProblem)
        '''Generate single extended alignment'''
        fullExtendedAlignment = ExtendedAlignment(
            self.root_problem.fragments.keys())
        # self.root_problem.get_children()[0].jobs[get_placement_job_name(0)]\
        # .get_attribute("full_extended_alignment_object")
        for pp in self.root_problem.get_children():
            for i in range(0, self.root_problem.fragment_chunks):
                extended_alignment = pp.jobs[get_placement_job_name(
                    i)].get_attribute("full_extended_alignment_object")
                fullExtendedAlignment.merge_in(extended_alignment,
                                               convert_to_string=True)
        self.results = fullExtendedAlignment

        # IF only one placement subset, no need to go to java
        if len(self.root_problem.get_children()) == 1:
            import json
            mergeinput = []
            for pp in self.root_problem.get_children():
                assert isinstance(pp, SeppProblem)
                for i in range(0, self.root_problem.fragment_chunks):
                    # Chunks without a placement result are skipped.
                    if (pp.get_job_result_by_name(get_placement_job_name(i)) is
                            None):
                        continue
                    '''Append subset trees and json locations to merge input'''
                    with open(
                            pp.get_job_result_by_name(
                                get_placement_job_name(i))) as f:
                        mergeinput.append(json.load(f))
                _LOG.info(
                    "There are %d fragment chunks on a single placement subset"
                    % len(mergeinput))
            # Concatenate all chunk placements into the first json document.
            result = mergeinput[0]
            for i in range(1, len(mergeinput)):
                result["placements"] = result["placements"] + mergeinput[i][
                    "placements"]
            with open(self.get_output_filename("placement.json"), 'w') as f:
                json.dump(result, f, sort_keys=True, indent=4)
        else:
            mergeinput = []
            '''Append main tree to merge input'''
            mergeinput.append(
                "%s;" %
                (self.root_problem.subtree.compose_newick(labels=True)))
            for pp in self.root_problem.get_children():
                assert isinstance(pp, SeppProblem)
                for i in range(0, self.root_problem.fragment_chunks):
                    # Chunks without a placement result are skipped.
                    if (pp.get_job_result_by_name(get_placement_job_name(i)) is
                            None):
                        continue
                    '''Append subset trees and json locations to merge input'''
                    mergeinput.append(
                        "%s;\n%s" %
                        (pp.subtree.compose_newick(labels=True),
                         pp.get_job_result_by_name(get_placement_job_name(i))))
            # Two trailing blank entries terminate the merge-job input.
            mergeinput.append("")
            mergeinput.append("")
            # NOTE(review): local name has a typo ("merege"); harmless.
            meregeinputstring = "\n".join(mergeinput)
            _LOG.debug(mergeinput)
            mergeJsonJob = MergeJsonJob()
            mergeJsonJob.setup(meregeinputstring,
                               self.get_output_filename("placement.json"))
            mergeJsonJob.run()