Beispiel #1
0
def convertGUIDANCELog(arr, nams, trimfile, logfile, outfile):
    '''
    Convert the GUIDANCE output to resemble the CIAlign "removed" file

    Parameters
    ----------
    arr: np.array
        Array containing the original alignment
    nams: list
        List of sequence names in the original alignment
    trimfile: tuple
        Three paths - the column trimmed alignment, the row trimmed
        alignment and the path the combined trimmed alignment is written to
    logfile: tuple
        Two paths - the file listing the removed columns and the file
        containing the removed rows
    outfile: str
        Path of the "removed" style file to write

    Raises
    ------
    AssertionError
        If the GUIDANCE files are not consistent with arr and nams
    '''

    trimfile_cols, trimfile_rows, out_trimmed = trimfile
    logfile_cols, logfile_rows = logfile

    t_arr_cols, _ = utilityFunctions.FastaToArray(trimfile_cols)

    # The column number is the last space separated token of the first
    # tab separated field of each line
    with open(logfile_cols) as handle:
        removed_cols = [
            int(line.strip().split("\t")[0].split(" ")[-1])
            for line in handle
        ]
    # Shift by one - presumably the log is 1-based; the assertion against
    # t_arr_cols below checks that the shifted indices fit arr
    removed_cols = np.array(removed_cols) - 1
    all_ints = set(np.arange(0, np.shape(arr)[1]))
    keep = sorted(all_ints - set(removed_cols))

    # Use the ".With_Names" versions of the row files where they exist
    trim_named = "%s.With_Names" % trimfile_rows
    log_named = "%s.With_Names" % logfile_rows
    if os.path.exists(trim_named):
        _, t_nams_rows = utilityFunctions.FastaToArray(trim_named)
        if os.path.exists(log_named):
            removed_rows_path = log_named
        else:
            removed_rows_path = logfile_rows
    else:
        # No row trimmed alignment - treat every row as removed
        t_nams_rows = list()
        removed_rows_path = log_named
    _, t_nams_rows_rm = utilityFunctions.FastaToArray(removed_rows_path)

    # Sanity checks - kept and removed parts must add up to the original
    assert len(t_nams_rows) + len(t_nams_rows_rm) == len(nams)
    assert len(removed_cols) + np.shape(t_arr_cols)[1] == np.shape(arr)[1]

    assert (arr[:, keep] == t_arr_cols).all()

    allnams = sorted(t_nams_rows + t_nams_rows_rm)

    assert allnams == sorted(nams)

    with open(outfile, "w") as out:
        out.write("other\t%s\n" % (",".join([str(x)
                                             for x in sorted(removed_cols)])))
        out.write("otherc\t%s\n" %
                  (",".join([str(x) for x in sorted(t_nams_rows_rm)])))

    # Keep only the rows and columns which GUIDANCE did not remove
    which_nams = np.where(np.isin(nams, t_nams_rows))[0]

    new_arr = arr[which_nams, ]
    new_arr = new_arr[:, keep]
    utilityFunctions.writeOutfile(out_trimmed, new_arr, nams, t_nams_rows_rm)
Beispiel #2
0
def convertZorroLog(arr, nams, trimfile, logfile, outfile, thresh):
    '''
    Convert the Zorro output to resemble the CIAlign "removed" file

    Parameters
    ----------
    arr: np.array
        Array containing the original alignment
    nams: list
        List of sequence names in the original alignment
    trimfile: str
        Path the trimmed alignment is written to
    logfile: str
        Path to the Zorro output, one score per line
    outfile: str
        Path of the "removed" style file to write
    thresh: float
        Columns with a score below this value are removed
    '''
    with open(logfile) as handle:
        scores = np.array([float(x.strip()) for x in handle])

    which = np.where(scores < thresh)[0]
    keeps = np.where(scores >= thresh)[0]
    removed = set(which)
    new_arr = arr[:, keeps]
    utilityFunctions.writeOutfile(trimfile, new_arr, nams, removed)

    # Sort the indices so the output does not depend on set iteration order
    with open(outfile, "w") as out:
        out.write("other\t%s\n" % (",".join([str(x)
                                             for x in sorted(removed)])))
Beispiel #3
0
def runConsensus(args, log, orig_arr, orig_nams, arr, nams, removed_seqs):
    '''
    Make consensus sequences

    Writes two files if make_consensus, all_options or interpret is set -
    one containing only the consensus sequence and one containing the
    cleaned alignment plus the consensus. Otherwise does nothing.

    Parameters
    ----------
    args: configargparse.ArgumentParser
        ArgumentParser object containing the specified parameters
    log: logging.Logger
        Open log file
    orig_arr: np.array
        Array containing the original alignment (not used here)
    orig_nams:
        List of sequence names in the original alignment (not used here)
    arr: np.array
        Array containing the cleaned alignment
    nams: list
        List of sequence names in the cleaned alignment
    removed_seqs: set
        Set of sequence names which have been removed
    '''
    if not (args.make_consensus or args.all_options or args.interpret):
        return

    log.info("Building consensus sequence")
    if not args.silent:
        print("Building consensus sequence")
    cons, coverage = consensusSeq.findConsensus(arr, log,
                                                args.consensus_type)
    consarr = np.array(cons)
    # Combine the consensus with the alignment (np.row_stack was only a
    # deprecated alias of np.vstack)
    arr_plus_cons = np.vstack((arr, consarr))
    cons = "".join(cons)
    # Remove the gaps from the consensus if this option is specified
    if not args.consensus_keep_gaps:
        cons = cons.replace("-", "")
    # Output file of just the consensus sequence
    with open("%s_consensus.fasta" % args.outfile_stem, "w") as out:
        out.write(">%s\n%s\n" % (args.consensus_name, cons))
    # Output file of the consensus and the alignment
    outf = "%s_with_consensus.fasta" % args.outfile_stem
    utilityFunctions.writeOutfile(outf, arr_plus_cons,
                                  nams + [args.consensus_name],
                                  removed_seqs)
Beispiel #4
0
def runTtoU(args, log, orig_arr, orig_nams, arr, nams, removed_seqs):
    '''
    Make a copy of the alignment with U replaced by T

    Parameters
    ----------
    args: configargparse.ArgumentParser
        ArgumentParser object containing the specified parameters
    log: logging.Logger
        Open log file
    orig_arr: np.array
        Array containing the original alignment
    orig_nams:
        List of sequence names in the original alignment
    arr: np.array
        Array containing the cleaned alignment
    nams: list
        List of sequence names in the cleaned alignment
    removed_seqs: set
        Set of sequence names which have been removed
    '''
    # Replace U with T in the input
    if args.replace_input:
        msg = "Generating a T instead of U version of the input alignment"
        log.info(msg)
        if not args.silent:
            print(msg)
        outf = "%s_T_input.fasta" % (args.outfile_stem)
        T_arr = utilityFunctions.replaceUbyT(orig_arr)
        # Write to file
        utilityFunctions.writeOutfile(outf, T_arr, orig_nams, removed_seqs)
    # Replace U with T in the output
    if args.replace_output:
        # Built from adjacent literals so that no indentation whitespace
        # ends up inside the message
        msg = ("Generating a T instead of U version of "
               "the output alignment")
        log.info(msg)
        if not args.silent:
            print(msg)
        outf = "%s_T_output.fasta" % (args.outfile_stem)
        T_arr = utilityFunctions.replaceUbyT(arr)
        # Write to file
        # NOTE(review): runUnalign passes nams (not orig_nams) when writing
        # the cleaned alignment - confirm which writeOutfile expects
        utilityFunctions.writeOutfile(outf, T_arr, orig_nams, removed_seqs)
Beispiel #5
0
def runUnalign(args, log, orig_arr, orig_nams, arr, nams, removed_seqs):
    '''
    Write gap free copies of the input and / or output alignment

    Parameters
    ----------
    args: configargparse.ArgumentParser
        ArgumentParser object containing the specified parameters
    log: logging.Logger
        Open log file
    orig_arr: np.array
        Array containing the original alignment
    orig_nams:
        List of sequence names in the original alignment
    arr: np.array
        Array containing the cleaned alignment
    nams: list
        List of sequence names in the cleaned alignment
    removed_seqs: set
        Set of sequence names which have been removed
    '''
    # The input job is always handled before the output job
    jobs = (
        ("input", orig_arr, orig_nams),
        ("output", arr, nams),
    )
    for label, alignment, names in jobs:
        # Look the flag up only when its job is reached
        if not getattr(args, "unalign_%s" % label):
            continue
        message = "Generating a gap free version of the %s alignment" % label
        log.info(message)
        if not args.silent:
            print(message)
        outf = "%s_unaligned_%s.fasta" % (args.outfile_stem, label)
        unaligned_arr = utilityFunctions.unAlign(alignment)
        # Write to file
        utilityFunctions.writeOutfile(outf, unaligned_arr, names,
                                      removed_seqs)
Beispiel #6
0
def _removeGapOnlyStep(args, log, arr, relativePositions, rmfile,
                       markupdict, removed_cols):
    '''
    Remove gap only columns and record which columns were removed

    Parameters
    ----------
    args: configargparse.ArgumentParser
        ArgumentParser object containing the specified parameters
    log: logging.Logger
        Open log file
    arr: np.array
        Array containing the current alignment
    relativePositions:
        Position tracker passed to and returned by removeGapOnly
    rmfile:
        Passed through to removeGapOnly
    markupdict: dict
        Updated in place - the removed columns are added under the key
        'remove_gaponly'
    removed_cols: set
        Columns removed so far

    Returns
    -------
    arr: np.array
        The alignment returned by removeGapOnly
    relativePositions:
        The updated position tracker
    removed_cols: set
        removed_cols plus the columns removed in this step
    '''
    log.info("Removing gap only columns")
    if not args.silent:
        print("Removing gap only columns")
    arr, r, relativePositions = parsingFunctions.removeGapOnly(
        arr, relativePositions, rmfile, log)
    # Track what has been removed
    if 'remove_gaponly' in markupdict:
        markupdict['remove_gaponly'].update(r)
    else:
        markupdict['remove_gaponly'] = r
    removed_cols = removed_cols | r
    # Check there is still something left
    utilityFunctions.checkArrLength(arr, log)
    return arr, relativePositions, removed_cols


def runCleaning(args, log, arr, nams):
    '''
    Run the cleaning functions

    The steps run in a fixed order - remove divergent, remove insertions,
    crop ends, remove short - and each may be followed by a gap only
    column removal.

    Parameters
    ----------
    args: configargparse.ArgumentParser
        ArgumentParser object containing the specified parameters
    log: logging.Logger
        Open log file
    arr: np.array
        Array containing the original alignment
    nams:
        List of sequence names in the original alignment

    Returns
    -------
    arr: np.array
        Array containing the cleaned alignment
    nams: list
        List of sequence names remaining in the cleaned alignment
    markupdict: dict
        Dictionary where the keys are function names and the values are
        lists of columns, rows or positions which have been removed
    removed_seqs: set
        set of the names of sequences which have been removed
    '''
    # Set everything up
    orig_nams = copy.copy(nams)
    markupdict, relativePositions, R = setupTrackers(args, arr)
    outfile, rmfile = setupOutfiles(args)
    removed_seqs, removed_cols, removed_positions = R

    # all_options and clean each switch on every step below
    run_all = args.all_options or args.clean

    # Remove divergent sequences
    if args.remove_divergent or run_all:
        log.info("Removing divergent sequences")
        if not args.silent:
            print("Removing divergent sequences")
        minperc = args.remove_divergent_minperc
        arr, r = parsingFunctions.removeDivergent(arr, nams, rmfile, log,
                                                  minperc)
        # Track what has been removed
        markupdict['remove_divergent'] = r
        removed_seqs = removed_seqs | r
        nams = utilityFunctions.updateNams(nams, r)

        # Check there are some sequences left
        utilityFunctions.checkArrLength(arr, log)

    # Remove gaps created by remove divergent
    if (args.remove_divergent and args.remove_gaponly) or run_all:
        arr, relativePositions, removed_cols = _removeGapOnlyStep(
            args, log, arr, relativePositions, rmfile, markupdict,
            removed_cols)

    # Remove insertions
    if args.remove_insertions or run_all:
        log.info("Removing insertions")
        if not args.silent:
            print("Removing insertions")
        assert args.insertion_min_size < args.insertion_max_size, (
            "insertion_min_size must be less than insertion_max_size")

        A = parsingFunctions.removeInsertions(arr, relativePositions, rmfile,
                                              log, args.insertion_min_size,
                                              args.insertion_max_size,
                                              args.insertion_min_flank)

        # Track what has been removed
        arr, r, relativePositions = A
        markupdict['remove_insertions'] = r
        removed_cols = removed_cols | r
        # Check there are some columns left
        utilityFunctions.checkArrLength(arr, log)

    # Remove gaps created by remove insertions
    if (args.remove_insertions and args.remove_gaponly) or run_all:
        arr, relativePositions, removed_cols = _removeGapOnlyStep(
            args, log, arr, relativePositions, rmfile, markupdict,
            removed_cols)

    # Crop Ends
    if args.crop_ends or run_all:
        # doesn't remove any whole columns or rows
        log.info("Cropping ends")
        if not args.silent:
            print("Cropping ends")
        arr, r = parsingFunctions.cropEnds(arr, nams, relativePositions,
                                           rmfile, log,
                                           args.crop_ends_mingap_perc,
                                           args.crop_ends_redefine_perc)
        # Track what has been removed
        markupdict['crop_ends'] = r
        removed_positions.update(r)
        # Check there are still some positions left
        utilityFunctions.checkArrLength(arr, log)

    # Remove empty columns created by crop ends
    if (args.crop_ends and args.remove_gaponly) or run_all:
        arr, relativePositions, removed_cols = _removeGapOnlyStep(
            args, log, arr, relativePositions, rmfile, markupdict,
            removed_cols)

    # Remove short
    if args.remove_short or run_all:
        log.info("Removing short sequences")
        if not args.silent:
            print("Removing short sequences")
        arr, r = parsingFunctions.removeTooShort(arr, nams, rmfile, log,
                                                 args.remove_min_length)
        # Track what has been removed
        markupdict['remove_short'] = r
        removed_seqs = removed_seqs | r
        nams = utilityFunctions.updateNams(nams, r)
        # Check there are still some sequences left
        utilityFunctions.checkArrLength(arr, log)

    # Remove empty columns created by remove short
    if (args.remove_short and args.remove_gaponly) or run_all:
        arr, relativePositions, removed_cols = _removeGapOnlyStep(
            args, log, arr, relativePositions, rmfile, markupdict,
            removed_cols)

    # Gap only removal requested on its own, with no other cleaning step
    if args.remove_gaponly and not (run_all or args.remove_divergent
                                    or args.remove_insertions
                                    or args.crop_ends
                                    or args.remove_short):
        arr, relativePositions, removed_cols = _removeGapOnlyStep(
            args, log, arr, relativePositions, rmfile, markupdict,
            removed_cols)

    # Write the output file
    utilityFunctions.writeOutfile(outfile, arr, orig_nams, removed_seqs,
                                  rmfile)

    return (arr, nams, markupdict, removed_seqs)
    def testWriteOutfile(self):
        '''
        Check that writeOutfile creates a file at the requested path
        '''
        target = self.outfile
        utilityFunctions.writeOutfile(target, self.in_array, self.nams,
                                      self.removed)

        created = os.path.isfile(target)
        self.assertTrue(created)