Beispiel #1
0
def convertGBlocksLog(arr, nams, trimfile, logfile, outfile):
    '''
    Convert the GBlocks txt output to resemble the CIAlign
    "removed" file.

    Parameters
    ----------
    arr: np.array
        The untrimmed alignment, indexed as [row, column]
    nams: list
        Sequence names for arr (accepted for a signature parallel to the
        other converters; not used here)
    trimfile: str
        Path to the GBlocks trimmed alignment in FASTA format
    logfile: str
        Path to the GBlocks text output; the last space-separated token of
        every line starting with "Gblocks" (except the "Results" line) is
        concatenated into a per-column mask in which "#" marks a kept column
    outfile: str
        Path of the file to write; receives one line, "other" followed by
        a tab and the comma-separated indices of the removed columns

    Raises
    ------
    AssertionError
        If the number of removed columns does not match the difference in
        width between arr and the trimmed alignment, or if keeping only the
        "#" columns of arr does not reproduce the trimmed alignment
    '''
    t_arr, t_nams = utilityFunctions.FastaToArray(trimfile, "")
    width = np.shape(arr)[1]
    t_width = np.shape(t_arr)[1]

    # Collect the mask fragments and join once rather than growing a string
    fragments = []
    with open(logfile) as infile:
        for line in infile:
            if line.startswith("Gblocks") and "Results" not in line:
                fragments.append(line.split(" ")[-1].strip())

    # One character per alignment column
    mask = np.array(list("".join(fragments)))
    removed = sorted(set(np.where(mask != "#")[0]))
    kept = np.where(mask == "#")[0]
    assert len(removed) == width - t_width

    # Check removing these columns gives the gblocks output
    new_arr = arr[:, kept]
    assert (new_arr == t_arr).all()

    # Context manager so the handle is closed even if the write fails
    with open(outfile, "w") as out:
        out.write("other\t%s\n" % (",".join([str(x) for x in removed])))
Beispiel #2
0
def setupArrays(args, log):
    '''
    Read the alignment into an array, check there are enough sequences in
    the array and the names are not duplicated, detect if the alignment
    is nucleotides or amino acids.

    Parameters
    ----------
    args: configargparse.ArgumentParser
        ArgumentParser object containing the specified parameters
    log: logging.Logger
        Open log file

    Returns
    -------
    arr: np.array
        The alignment stored in a numpy array
    nams: list
        The names of the sequences in the alignment
    typ: str
        Either 'aa' - amino acid - or 'nt' - nucleotide

    Raises
    ------
    SystemExit
        With a non-zero status, after printing an explanation, if the
        sequence names are duplicated or there are too few sequences
    '''

    # convert the input fasta file into an array and make a list of
    # sequence names so the order can be maintained
    arr, nams = utilityFunctions.FastaToArray(args.infile, log,
                                              args.outfile_stem)
    # check if names are unique
    if len(nams) > len(set(nams)):
        print("Error! Your input alignment has duplicate names!")
        # raise SystemExit directly: the exit() helper is injected by the
        # site module and is not guaranteed to exist, and a failed run
        # should not report success (status 0) to the calling shell
        raise SystemExit(1)

    # Check the alignment array isn't empty
    utilityFunctions.checkArrLength(arr, log)

    # Check which cleaning functions are requested
    cleaningArgs = [
        args.remove_insertions, args.crop_ends, args.remove_divergent
    ]

    # Check there are enough sequences for the requested functions
    if len(arr) < 3 and any(cleaningArgs):
        # when less than three sequences, stop
        # (adjacent literals rather than a backslash continuation, which
        # would embed the source indentation in the printed message)
        print("You need at least three sequences in your MSA to run "
              "remove_insertions, crop_ends or remove_divergent")
        raise SystemExit(1)
    elif len(arr) < 2:
        print("You need at least two sequences in your MSA")
        raise SystemExit(1)

    # detect if the sequence is amino acids or nucleotides
    typ = utilityFunctions.seqType(arr)

    if typ == 'aa':
        log.info("Amino acid alignment detected")
    else:
        log.info("Nucleotide alignment detected")

    return (arr, nams, typ)
    def testFastaToArray(self):
        '''
        FastaToArray run on self.input should give back an array equal to
        self.in_array and a name list equal to self.nams.
        '''
        test_logger = logging.getLogger('path.to.module.under.test')
        # The logger's debug method is patched out while the file is parsed
        with mock.patch.object(test_logger, 'debug'):
            parsed_arr, parsed_nams = utilityFunctions.FastaToArray(
                self.input, test_logger)

        # Same number of columns
        self.assertEqual(parsed_arr[0, :].size, self.in_array[0, :].size)
        # Same number of rows
        self.assertEqual(len(self.in_array), len(parsed_arr))
        # Same number of names
        self.assertEqual(len(parsed_nams), len(self.nams))
        # Identical content
        self.assertTrue((parsed_arr == self.in_array).all())
        self.assertTrue(parsed_nams == self.nams)
Beispiel #4
0
def format_alignment(ali, cleaned=False, cialign_removed=None):
    '''
    Load the alignment at the path ali and turn it into the integer matrix
    produced by alignment_to_matrix - described as the cumulative number of
    non-gap residues prior to each position in each sequence, with
    characters removed by CIAlign excluded.

    The alignment is read with FastaToArray from utilityFunctions,
    converted to upper case and sorted by sequence name. If cleaned is
    True it is then passed through find_removed_cialign before
    alignment_to_matrix is applied.

    Parameters
    ----------
    ali: str
        path to multiple sequence alignment in FASTA format. If the alignment
        has been cleaned with CIAlign this should be the CIAlign input, not
        the output
    cleaned: bool
        True if the alignment has been cleaned with CIAlign, otherwise False
    cialign_removed: str
        path to CIAlign _removed.txt file for the alignment

    Returns
    -------
    arr: np.array
        The output of alignment_to_matrix for the (optionally marked up)
        alignment
    nams: np.array
        Sequence names, sorted, in the same order as the rows of arr
    removed_count_total: int
        Third value returned by find_removed_cialign; 0 if cleaned is False
    removed_count_nongap: int
        Fourth value returned by find_removed_cialign; 0 if cleaned is False
    '''
    # Read the sequences; case is normalised so it cannot affect the score
    arr, nams = utilityFunctions.FastaToArray(ali)
    arr = np.char.upper(arr)

    # Put rows and names into name order
    order = np.argsort(nams)
    nams = np.array(nams)[order]
    arr = arr[order, :]

    if cleaned:
        # Update the array to contain !s for positions removed by CIAlign
        (arr, nams,
         removed_count_total,
         removed_count_nongap) = find_removed_cialign(cialign_removed,
                                                      arr, nams)
    else:
        # Nothing was removed from an uncleaned alignment
        removed_count_total = 0
        removed_count_nongap = 0

    matrix = alignment_to_matrix(arr)
    return (matrix, nams, removed_count_total, removed_count_nongap)
Beispiel #5
0
def convertGUIDANCELog(arr, nams, trimfile, logfile, outfile):
    '''
    Convert the GUIDANCE output to resemble the CIAlign "removed" file

    Parameters
    ----------
    arr: np.array
        The untrimmed alignment, indexed as [row, column]
    nams: list
        Names of the sequences in arr, in row order
    trimfile: tuple
        Three paths: the column-trimmed alignment, the row-trimmed
        alignment and the path the combined trimmed alignment is written to
    logfile: tuple
        Two paths: the file listing removed columns (1-based, converted to
        0-based here) and the FASTA file of removed rows
    outfile: str
        Path of the file to write; receives an "other" line with the
        removed column indices and an "otherc" line with the removed
        sequence names

    Raises
    ------
    AssertionError
        If the kept and removed rows / columns do not add up to the input
        alignment, or the kept columns of arr differ from the
        column-trimmed alignment
    '''

    trimfile_cols, trimfile_rows, out_trimmed = trimfile
    logfile_cols, logfile_rows = logfile

    t_arr_cols, t_nams_cols = utilityFunctions.FastaToArray(trimfile_cols)
    # Context manager so the log handle is closed once it has been parsed
    with open(logfile_cols) as infile:
        removed_cols = [
            int(line.strip().split("\t")[0].split(" ")[-1])
            for line in infile
        ]
    # Shift the reported column numbers down by one
    removed_cols = np.array(removed_cols) - 1
    all_ints = set(np.arange(0, np.shape(arr)[1]))
    keep = sorted(list(all_ints - set(removed_cols)))

    # The row files may or may not have a ".With_Names" counterpart
    trimmed_named = "%s.With_Names" % trimfile_rows
    removed_named = "%s.With_Names" % logfile_rows
    if os.path.exists(trimmed_named):
        t_arr_rows, t_nams_rows = utilityFunctions.FastaToArray(
            trimmed_named)
        # Fall back to the plain removed-rows file if there is no
        # ".With_Names" version of it
        if os.path.exists(removed_named):
            removed_path = removed_named
        else:
            removed_path = logfile_rows
        t_arr_rows_rm, t_nams_rows_rm = utilityFunctions.FastaToArray(
            removed_path)
    else:
        # No kept-rows file: treat every row as removed
        t_arr_rows, t_nams_rows = np.array([]), list()
        t_arr_rows_rm, t_nams_rows_rm = utilityFunctions.FastaToArray(
            removed_named)

    assert len(t_nams_rows) + len(t_nams_rows_rm) == len(nams)
    assert len(removed_cols) + np.shape(t_arr_cols)[1] == np.shape(arr)[1]

    assert (arr[:, keep] == t_arr_cols).all()

    allnams = sorted(t_nams_rows + t_nams_rows_rm)

    assert allnams == sorted(nams)

    # Context manager so the handle is closed even if a write fails
    with open(outfile, "w") as out:
        out.write("other\t%s\n" % (",".join(
            [str(x) for x in sorted(removed_cols)])))
        out.write("otherc\t%s\n" % (",".join(
            [str(x) for x in sorted(t_nams_rows_rm)])))

    # Rows of the input alignment which survived row trimming
    which_nams = np.where(np.isin(nams, t_nams_rows))[0]

    new_arr = arr[which_nams, ]
    new_arr = new_arr[:, keep]
    utilityFunctions.writeOutfile(out_trimmed, new_arr, nams, t_nams_rows_rm)
Beispiel #6
0
def convertTrimalLog(arr, nams, trimfile, logfile, outfile):
    '''
    Convert the trimal --colnumbering output to resemble the CIAlign
    "removed" file.

    Parameters
    ----------
    arr: np.array
        The untrimmed alignment, indexed as [row, column]
    nams: list
        Sequence names for arr (accepted for a signature parallel to the
        other converters; not used here)
    trimfile: str
        Path to the trimal trimmed alignment in FASTA format
    logfile: str
        Path to the column numbering output; only the first line is read,
        and the second tab-separated field of that line is parsed as a
        comma-separated list of the column indices which remain
    outfile: str
        Path of the file to write; receives one line, "other" followed by
        a tab and the comma-separated indices of the removed columns

    Raises
    ------
    AssertionError
        If the number of removed columns does not match the difference in
        width between arr and the trimmed alignment, or if the remaining
        columns of arr do not reproduce the trimmed alignment
    '''
    t_arr, t_nams = utilityFunctions.FastaToArray(trimfile, "")
    width = np.shape(arr)[1]
    t_width = np.shape(t_arr)[1]

    # Only the first line is needed; close the handle straight away
    with open(logfile) as infile:
        first_line = infile.readline()
    remaining = {
        int(x.strip())
        for x in first_line.strip().split("\t")[1].split(",")
    }
    all_ints = set(range(width))
    removed = sorted(all_ints - remaining)
    assert len(removed) == width - t_width

    # Check removing these columns gives the trimal output.
    # The indices must be sorted explicitly: iterating a set does not
    # guarantee ascending order, and out of order columns would make a
    # correct trim fail the comparison below.
    kept = np.array(sorted(remaining))
    new_arr = arr[:, kept]
    assert (new_arr == t_arr).all()

    # Context manager so the handle is closed even if the write fails
    with open(outfile, "w") as out:
        out.write("other\t%s\n" % (",".join([str(x) for x in removed])))
Beispiel #7
0
def getParser():
    '''
    Builds a configargparse.ArgumentParser object with the CIAlign parameters

    The default, minimum and maximum values for the cleaning parameters are
    read from the tab-separated file ranges.txt in the directory containing
    utilityFunctions. If --infile is present on the command line, the
    alignment is read once up front to find its number of columns, which
    is passed to the integer range validators.

    Returns
    -------
    parser: configargparse.ArgumentParser
        ArgumentParser with the CIAlign parameters
    '''

    parser = configargparse.ArgumentParser(
             description='Clean and interpret a multiple sequence \
                          alignment', add_help=False)
    ci_dir = os.path.dirname(utilityFunctions.__file__)

    # Looks up the default values and minimum and maximum values for the
    # parameters associated with the cleaning functions in the text file
    # ranges.txt provided in the CIAlign code directory.
    # os.path.join keeps the path relative when ci_dir is empty, and the
    # context manager closes the handle once the table has been read.
    with open(os.path.join(ci_dir, "ranges.txt")) as ranges_file:
        ranges = [line.strip().split("\t") for line in ranges_file]
    # Defaults
    defs = {x[0]: x[1] for x in ranges}
    # Minima
    minis = {x[0]: x[2] for x in ranges}
    # Maxima
    maxis = {x[0]: x[3] for x in ranges}

    # Separate the required and optional parameters
    required = parser.add_argument_group('Required Arguments')
    optional = parser.add_argument_group('Optional Arguments')

    # Files
    # not to confuse with inifile
    required.add("--infile", dest='infile', type=str,
                 help='Path to input alignment file in FASTA format')
    optional.add("--inifile", dest='inifile', type=str,
                 default=None,
                 help='Path to config file. Default: %(default)s',
                 is_config_file=True)
    optional.add("--outfile_stem", dest='outfile_stem', type=str,
                 default="CIAlign",
                 help="Prefix for output files, including the path to the \
                     output directory. Default: %(default)s")

    # Initial setup
    # Read the alignment temporarily just to find out how many columns there
    # are as for several of the cleaning functions the range of valid
    # parameters depends on this.

    tempargs = parser.parse_known_args()[0]
    if tempargs.infile:
        # Read the FASTA file into an array
        arr, nams = utilityFunctions.FastaToArray(tempargs.infile, None,
                                                  tempargs.outfile_stem)

        # Find the number of columns in the input alignment
        n_col = np.shape(arr)[1]
        # Remove the array from memory
        del arr
    else:
        # Gives a valid int value just for generating the --help text
        n_col = 100
    # parameter to run all functions without having to type them in
    optional.add("--all", dest="all_options",
                 action="store_true",
                 help="Use all available functions with default parameters.")

    # parameter to run all cleaning functions without having to type them in
    optional.add("--clean", dest="clean",
                 action="store_true",
                 help="Use all cleaning functions with default parameters.")

    # parameter to create all mini alignments without having to type them in
    optional.add("--visualise", dest="visualise",
                 action="store_true",
                 help="Plot all mini alignments with default parameters.")

    # parameter to run all interpretation functions except creating
    # sequence logos without having to type them in
    optional.add("--interpret", dest="interpret",
                 action="store_true",
                 help="Use all interpreting functions with default parameters.")

    # Runtime
    optional.add("--silent", dest='silent',
                 help="Do not print progress to the screen. \
                       Default: %(default)s",
                 action='store_true')

    # Crop Ends
    optional.add("--crop_ends", dest="crop_ends",
                 action="store_true",
                 help="Crop the ends of sequences if they are poorly aligned. \
                 Default: %(default)s")

    optional.add("--crop_ends_mingap_perc", dest='crop_ends_mingap_perc',
                 type=float_range(minis['crop_ends_mingap_perc'],
                                  maxis['crop_ends_mingap_perc']),
                 default=defs['crop_ends_mingap_perc'],
                 help="Minimum proportion of the sequence length (excluding \
                     gaps) that is the threshold for change in gap numbers. \
                     Default: %(default)s.",
                 metavar="(float, %s..%s)" % (minis['crop_ends_mingap_perc'],
                                              maxis['crop_ends_mingap_perc']))

    optional.add("--crop_ends_redefine_perc", dest='crop_ends_redefine_perc',
                 type=float_range(minis['crop_ends_redefine_perc'],
                                  maxis['crop_ends_redefine_perc']),
                 default=defs['crop_ends_redefine_perc'],
                 help="Proportion of the sequence length (excluding gaps) \
                       that is being checked for change in gap numbers to \
                       redefine start/end. Default: %(default)s",
                 metavar="(float, %s..%s)" % (
                     minis['crop_ends_redefine_perc'],
                     maxis['crop_ends_redefine_perc']))

    # Remove divergent sequences
    optional.add("--remove_divergent", dest="remove_divergent",
                 action="store_true",
                 help="Remove sequences with <= N proportion of positions at \
                       which the most common base / amino acid in the \
                       alignment is present. Default: %(default)s")

    optional.add("--remove_divergent_minperc", dest="remove_divergent_minperc",
                 default=defs['remove_divergent_minperc'],
                 type=float_range(minis['remove_divergent_minperc'],
                                  maxis['remove_divergent_minperc']),
                 help="Minimum proportion of positions which should be \
                       identical to the most common base / amino acid in \
                       order to be preserved. \
                       Default: %(default)s)",
                 metavar="(float, %s..%s)" % (
                     minis['remove_divergent_minperc'],
                     maxis['remove_divergent_minperc']))

    # # Remove Insertions
    optional.add("--remove_insertions", dest="remove_insertions",
                 action="store_true",
                 help="Remove insertions found in <= 50 percent of sequences \
                       from the alignment. Default: %(default)s")

    # Validate against this parameter's own maximum so that the accepted
    # range is the one advertised in the metavar below
    optional.add("--insertion_min_size", dest="insertion_min_size",
                 type=int_range(minis['insertion_min_size'],
                                maxis['insertion_min_size'],
                                n_col),
                 default=defs['insertion_min_size'],
                 help="Only remove insertions >= this number of residues. \
                       Default: %(default)s.",
                 metavar="(int, %s..%s)" % (
                     minis['insertion_min_size'],
                     maxis['insertion_min_size']))

    optional.add("--insertion_max_size", dest="insertion_max_size",
                 type=int_range(minis['insertion_max_size'],
                                maxis['insertion_max_size'],
                                n_col),
                 default=defs['insertion_max_size'],
                 help="Only remove insertions <= this number of residues. \
                       Default: %(default)s",
                 metavar="(int, %s..%s)" % (
                     minis['insertion_max_size'],
                     maxis['insertion_max_size']))

    optional.add("--insertion_min_flank", dest="insertion_min_flank",
                 type=int_range(minis['insertion_min_flank'],
                                maxis['insertion_min_flank'],
                                n_col),
                 default=defs['insertion_min_flank'],
                 help="Minimum number of bases on either side of an insertion \
                       to classify it as an insertion.\
                       Default: %(default)s",
                 metavar="(int, %s..%s)" % (
                     minis['insertion_min_flank'],
                     maxis['insertion_min_flank']))

    # Remove Short
    optional.add("--remove_short", dest="remove_short",
                 help="Remove sequences <= N bases / amino acids from the \
                       alignment. Default: %(default)s",
                 action="store_true")

    optional.add("--remove_min_length", dest="remove_min_length",
                 type=int_range(minis['remove_min_length'],
                                maxis['remove_min_length'],
                                n_col),
                 default=defs['remove_min_length'],
                 help="Sequences are removed if they are shorter than this \
                       minimum length, excluding gaps. Default: %(default)s",
                 metavar="(int, %s..%s)" % (
                     minis['remove_min_length'],
                     maxis['remove_min_length']))

    # keep gap only
    # store_false: passing --keep_gaponly sets remove_gaponly to False
    optional.add("--keep_gaponly", dest="remove_gaponly",
                 action="store_false",
                 help="Keep gap only columns in the alignment. Default: \
                       %(default)s")

    # Consensus
    optional.add("--make_consensus", dest="make_consensus",
                 action="store_true",
                 help="Make a consensus sequence based on the cleaned \
                       alignment. Default: %(default)s")
    optional.add("--consensus_type", dest="consensus_type", type=str,
                 default="majority",
                 help="Type of consensus sequence to make - can be majority, \
                       to use the most common character at each position in \
                       the consensus, even if this is a gap, or \
                       majority_nongap, to use the most common non-gap \
                       character at each position. Default: %(default)s")
    optional.add("--consensus_keep_gaps", dest="consensus_keep_gaps",
                 action="store_true",
                 help="If there are gaps in the consensus (if majority_nongap \
                       is used as consensus_type), should these be included \
                       in the consensus (True) or should this position in \
                      the consensus be deleted (False). Default: %(default)s")
    optional.add("--consensus_name", dest="consensus_name",
                 type=str, default="consensus",
                 help="Name to use for the consensus sequence in the output \
                       fasta file. Default: %(default)s")

    # Mini Alignments
    optional.add("--plot_input", dest="plot_input",
                 action="store_true",
                 help="Plot a mini alignment - an image representing the \
                       input alignment. Default: %(default)s")
    optional.add("--plot_output", dest="plot_output",
                 action="store_true",
                 help="Plot a mini alignment, an image representing the \
                       output alignment. Default: %(default)s")
    optional.add("--plot_markup", dest="plot_markup",
                 action="store_true",
                 help="Draws the input alignment but with the columns and \
                       rows which have been removed by each function marked \
                       up in corresponding colours. Default: %(default)s")
    optional.add("--plot_dpi", dest="plot_dpi",
                 type=int, default=300,
                 help="DPI for mini alignments. Default: %(default)s")
    optional.add("--plot_format", dest="plot_format",
                 type=str, default='png',
                 help="Image format for mini alignments - can be png, svg, \
                       tiff or jpg. Default: %(default)s")
    optional.add("--plot_width", dest="plot_width",
                 type=int, default=5,
                 help="Mini alignment width in inches. Default: %(default)s")
    optional.add("--plot_height", dest="plot_height",
                 type=int, default=3,
                 help="Mini alignment height in inches. Default: %(default)s")
    optional.add("--plot_keep_numbers", dest="plot_keep_numbers",
                 action="store_true",
                 help="If specified, for mini alignments based on CIAlign \
                       output with <10 sequences (or if force_numbers \
                       is switched on) the rows will be labelled \
                       based on the input alignment, rather \
                       than renumbered")
    optional.add("--plot_force_numbers", dest="plot_force_numbers",
                 action="store_true",
                 help="Force all rows to be numbered on the mini alignments \
                 rather than labelling e.g. every 10th row for larger plots. \
                 Will cause labels to overlap on large plots")

    # Sequence logos
    optional.add("--make_sequence_logo", dest="make_sequence_logo",
                 action="store_true",
                 help="Draw a sequence logo. Default: %(default)s")
    optional.add("--sequence_logo_type", dest="sequence_logo_type",
                 type=str, default='bar',
                 help="Type of sequence logo - bar/text/both. \
                       Default: %(default)s")
    optional.add("--sequence_logo_dpi", dest="sequence_logo_dpi",
                 type=int, default=300,
                 help="DPI for sequence logo image. Default: %(default)s")
    optional.add("--sequence_logo_font", dest="sequence_logo_font",
                 type=str, default='monospace',
                 help="Font for text sequence logo. Default: %(default)s")
    optional.add("--sequence_logo_nt_per_row", dest='sequence_logo_nt_per_row',
                 type=int, default=50,
                 help="Number of bases / amino acids to show per row in the \
                       sequence logo, where the logo is too large to show on \
                       a single line. Default: %(default)s")
    optional.add("--sequence_logo_filetype", dest='sequence_logo_filetype',
                 type=str, default='png',
                 help="Image file type to use for the sequence logo - can be \
                       png, svg, tiff or jpg. Default: %(default)s")
    optional.add("--logo_start", dest="logo_start",
                 type=int, default=0,
                 help="Start position of sequence logo. Default: %(default)s")
    optional.add("--logo_end", dest="logo_end",
                 type=int, default=0,
                 help="End position of sequence logo. Default: %(default)s")
    optional.add("--list_fonts_only", dest='list_fonts_only',
                 action="store_true",
                 help="Make a swatch showing available fonts. \
                       Default: %(default)s")

    # Coverage
    optional.add("--plot_coverage_input", dest="plot_coverage_input",
                 action="store_true",
                 help="Plot the coverage of the input MSA. Default: \
                       %(default)s")
    optional.add("--plot_coverage_output", dest="plot_coverage_output",
                 action="store_true",
                 help="Plot the coverage of the output MSA. Default: \
                       %(default)s")
    optional.add("--plot_coverage_dpi", dest="plot_coverage_dpi",
                 type=int, default=300,
                 help="DPI for coverage plot. Default: %(default)s")
    optional.add("--plot_coverage_height", dest="plot_coverage_height",
                 type=int, default=3,
                 help="Height for coverage plot (inches). Default: \
                       %(default)s")
    optional.add("--plot_coverage_width", dest="plot_coverage_width",
                 type=int, default=5,
                 help="Width for coverage plot (inches). Default: \
                       %(default)s")
    optional.add("--plot_coverage_colour", dest="plot_coverage_colour",
                 type=str, default='#007bf5',
                 help="Colour for coverage plot (hex code or name). \
                       Default: %(default)s")
    optional.add("--plot_coverage_filetype", dest="plot_coverage_filetype",
                 type=str, default='png',
                 help="File type for coverage plot (png, svg, tiff, jpg). \
                       Default: %(default)s")

    # Similarity Matrix
    optional.add("--make_similarity_matrix_input", dest="make_simmatrix_input",
                 action="store_true",
                 help="Make a similarity matrix for the input alignment. \
                       Default: %(default)s")
    optional.add("--make_similarity_matrix_output",
                 dest="make_simmatrix_output",
                 action="store_true",
                 help="Make a similarity matrix for the output alignment. \
                       Default: %(default)s")
    optional.add("--make_simmatrix_dp", dest="make_simmatrix_dp",
                 type=int, default=4,
                 help="Number of decimal places to display in the similarity \
                       matrix output file. Default: %(default)s")
    optional.add("--make_simmatrix_minoverlap",
                 dest="make_simmatrix_minoverlap",
                 type=int, default=1,
                 help="Minimum overlap between two sequences to have non-zero \
                       similarity in the similarity matrix. \
                       Default: %(default)s")
    optional.add("--make_simmatrix_keepgaps", dest="make_simmatrix_keepgaps",
                 type=int, default=0,
                 help="Include positions with gaps in either or both \
                       sequences in the similarity matrix calculation. \
                       Can be 0 - exclude positions which are gaps in either \
                       or both sequences, 1 - exclude positions which are \
                       gaps in both sequences, 2 - consider all positions \
                       regardless of gaps. Default: %(default)s")

    # Unalign function
    optional.add("--unalign_input", dest="unalign_input",
                 action="store_true", default=False,
                 help="Generate a copy of the input alignment with no gaps. \
                       Default: %(default)s")
    optional.add("--unalign_output", dest="unalign_output",
                 action="store_true", default=False,
                 help="Generate a copy of the cleaned alignment with no \
                     gaps. Default: %(default)s")

    # Replace Us by Ts function
    optional.add("--replace_input", dest="replace_input", action="store_true",
                 default=False,
                 help="Replaces all Us by Ts in input alignment. \
                     Default: %(default)s")
    optional.add("--replace_output", dest="replace_output",
                 action="store_true", default=False,
                 help="Replaces all Us by Ts in output alignment. \
                     Default: %(default)s")

    # Help function
    # (added by hand because the parser was created with add_help=False)
    optional.add('-h', '--help', action='help',
                 default=configargparse.SUPPRESS,
                 help='Show all available parameters with an explanation.')

    # Version function
    optional.add('-v', '--version', action='version',
                 version=__version__,
                 default=configargparse.SUPPRESS,
                 help='Show the current version.')
    return (parser)