def convertGBlocksLog(arr, nams, trimfile, logfile, outfile):
    '''
    Convert the GBlocks txt output to resemble the CIAlign "removed" file.

    Parameters
    ----------
    arr: np.array
        The untrimmed alignment, indexed as arr[:, columns]
    nams: list
        Names of the sequences in arr (not used by this function)
    trimfile: str
        Path to the GBlocks-trimmed alignment, read with
        utilityFunctions.FastaToArray
    logfile: str
        Path to the GBlocks txt output
    outfile: str
        Path to write the "removed" style output to

    Raises
    ------
    AssertionError
        If the number of removed columns does not equal the difference in
        width between arr and the trimmed alignment, or if dropping those
        columns from arr does not reproduce the trimmed alignment.
    '''
    t_arr, _ = utilityFunctions.FastaToArray(trimfile, "")
    width = np.shape(arr)[1]
    t_width = np.shape(t_arr)[1]

    # Collect the last space-separated token of every line starting with
    # "Gblocks" (skipping any line mentioning "Results") and join the
    # pieces into a single per-column mask string
    pieces = []
    with open(logfile) as infile:
        for line in infile:
            if line.startswith("Gblocks") and "Results" not in line:
                pieces.append(line.split(" ")[-1].strip())
    mask = np.array(list("".join(pieces)))

    # Positions marked "#" are treated as kept, everything else as removed.
    # np.where returns the indices in ascending order.
    kept = np.where(mask == "#")[0]
    removed = np.where(mask != "#")[0]
    assert len(removed) == width - t_width

    # Check removing these columns gives the gblocks output
    new_arr = arr[:, kept]
    assert (new_arr == t_arr).all()

    # The context manager closes the file even if the write fails
    with open(outfile, "w") as out:
        out.write("other\t%s\n" % (",".join([str(x) for x in removed])))
def setupArrays(args, log):
    '''
    Read the alignment into an array, check there are enough sequences in
    the array and the names are not duplicated, detect if the alignment is
    nucleotides or amino acids.

    Parameters
    ----------
    args: configargparse.ArgumentParser
        ArgumentParser object containing the specified parameters
    log: logging.Logger
        Open log file

    Returns
    -------
    arr: np.array
        The alignment stored in a numpy array
    nams: list
        The names of the sequences in the alignment
    typ: str
        Either 'aa' - amino acid - or 'nt' - nucleotide

    Raises
    ------
    SystemExit
        With exit status 1, after printing a message, if the sequence
        names are duplicated or there are too few sequences for the
        requested functions.
    '''
    # Imported locally so this function does not rely on the interactive
    # exit() helper, which is only present when the site module is loaded
    import sys

    # convert the input fasta file into an array and make a list of
    # sequence names so the order can be maintained
    arr, nams = utilityFunctions.FastaToArray(args.infile, log,
                                              args.outfile_stem)

    # check if names are unique
    if len(nams) > len(set(nams)):
        print("Error! Your input alignmnent has duplicate names!")
        sys.exit(1)

    # Check the alignment array isn't empty
    utilityFunctions.checkArrLength(arr, log)

    # Check which cleaning functions are requested
    cleaningArgs = [args.remove_insertions,
                    args.crop_ends,
                    args.remove_divergent]

    # Check there are enough sequences for the requested functions
    if len(arr) < 3 and any(cleaningArgs):
        # when less than three sequences, stop
        # (adjacent literals are used rather than a backslash continuation
        # so no source indentation ends up inside the printed message)
        print("You need at least three sequences in your MSA to run "
              "remove_insertions, crop_ends or remove_divergent")
        sys.exit(1)
    elif len(arr) < 2:
        print("You need at least two sequences in your MSA")
        sys.exit(1)

    # detect if the sequence is amino acids or nucleotides
    typ = utilityFunctions.seqType(arr)

    if typ == 'aa':
        log.info("Amino acid alignment detected")
    else:
        log.info("Nucleotide alignment detected")

    return (arr, nams, typ)
def testFastaToArray(self):
    '''
    Check that FastaToArray, run on self.input, returns an array and a
    list of names matching self.in_array and self.nams.
    '''
    log = logging.getLogger('path.to.module.under.test')

    # The logger's debug method is replaced with a mock while parsing
    with mock.patch.object(log, 'debug'):
        parsed = utilityFunctions.FastaToArray(self.input, log)
    parsed_arr, parsed_nams = parsed

    expected_arr = self.in_array
    expected_nams = self.nams

    # Same number of columns and rows as the expected array
    self.assertEqual(parsed_arr[0, :].size, expected_arr[0, :].size)
    self.assertEqual(len(expected_arr), len(parsed_arr))
    # Same number of names
    self.assertEqual(len(parsed_nams), len(expected_nams))
    # Identical content
    self.assertTrue((parsed_arr == expected_arr).all())
    self.assertTrue(parsed_nams == expected_nams)
def format_alignment(ali, cleaned=False, cialign_removed=None):
    '''
    Load the alignment at the path ali and convert it to the integer
    matrix produced by alignment_to_matrix: a numpy array showing the
    cumulative number of non-gap residues prior to the residue at each
    position in the sequence, with characters removed by CIAlign excluded.

    The alignment is read with utilityFunctions.FastaToArray, converted to
    upper case and sorted by sequence name. If cleaned is True it is then
    passed through find_removed_cialign before alignment_to_matrix is run.

    Parameters
    ----------
    ali: str
        path to multiple sequence alignment in FASTA format. If the
        alignment has been cleaned with CIAlign this should be the CIAlign
        input, not the output
    cleaned: bool
        True if the alignment has been cleaned with CIAlign, otherwise
        False
    cialign_removed: str
        path to CIAlign _removed.txt file for the alignment, only used
        when cleaned is True

    Returns
    -------
    arr: np.array
        Numpy array of integers returned by alignment_to_matrix
    nams: np.array
        Sequence names in the same (sorted) order as the rows of arr
    removed_count_total: int
        Third value returned by find_removed_cialign, or 0 if cleaned is
        False
    removed_count_nongap: int
        Fourth value returned by find_removed_cialign, or 0 if cleaned is
        False
    '''
    # Read the alignment; case should not affect the score so upper-case
    # every character
    seqs, names = utilityFunctions.FastaToArray(ali)
    seqs = np.char.upper(seqs)

    # Put names and rows into the same name-sorted order
    order = np.argsort(names)
    names = np.array(names)[order]
    seqs = seqs[order, :]

    if cleaned:
        # update the array to contain !s for positions which have been
        # removed by CIAlign
        seqs, names, n_removed, n_removed_nongap = find_removed_cialign(
            cialign_removed, seqs, names)
    else:
        n_removed = 0
        n_removed_nongap = 0

    matrix = alignment_to_matrix(seqs)
    return (matrix, names, n_removed, n_removed_nongap)
def convertGUIDANCELog(arr, nams, trimfile, logfile, outfile):
    '''
    Convert the GUIDANCE output to resemble the CIAlign "removed" file

    Parameters
    ----------
    arr: np.array
        The untrimmed alignment
    nams: list
        Names of the sequences in arr
    trimfile: tuple
        Three paths: (column-trimmed alignment, row-trimmed alignment,
        output path passed to utilityFunctions.writeOutfile)
    logfile: tuple
        Two paths: (log of removed columns, FASTA of removed rows)
    outfile: str
        Path to write the "removed" style output to

    Raises
    ------
    AssertionError
        If the kept and removed rows / columns do not add up to the input
        alignment, or the kept columns of arr differ from the
        column-trimmed alignment.
    '''
    trimfile_cols, trimfile_rows, out_trimmed = trimfile
    logfile_cols, logfile_rows = logfile

    t_arr_cols, _ = utilityFunctions.FastaToArray(trimfile_cols)

    # On each log line the column number is the last space-separated token
    # of the first tab-separated field. One is subtracted from each number
    # (presumably 1-based to 0-based - confirm against the GUIDANCE format)
    with open(logfile_cols) as infile:
        removed_cols = [
            int(line.strip().split("\t")[0].split(" ")[-1])
            for line in infile]
    removed_cols = np.array(removed_cols) - 1

    all_ints = set(np.arange(0, np.shape(arr)[1]))
    keep = sorted(all_ints - set(removed_cols))

    # Prefer the ".With_Names" variants of the row files where they exist
    kept_rows_path = "%s.With_Names" % trimfile_rows
    removed_rows_path = "%s.With_Names" % logfile_rows
    if os.path.exists(kept_rows_path):
        _, t_nams_rows = utilityFunctions.FastaToArray(kept_rows_path)
        if os.path.exists(removed_rows_path):
            _, t_nams_rows_rm = utilityFunctions.FastaToArray(
                removed_rows_path)
        else:
            _, t_nams_rows_rm = utilityFunctions.FastaToArray(logfile_rows)
    else:
        # No file of kept rows - every name comes from the removed file
        t_nams_rows = list()
        _, t_nams_rows_rm = utilityFunctions.FastaToArray(removed_rows_path)

    # Sanity checks: rows and columns must add up to the input alignment
    assert len(t_nams_rows) + len(t_nams_rows_rm) == len(nams)
    assert len(removed_cols) + np.shape(t_arr_cols)[1] == np.shape(arr)[1]
    assert (arr[:, keep] == t_arr_cols).all()

    allnams = sorted(t_nams_rows + t_nams_rows_rm)
    assert allnams == sorted(nams)

    # The context manager closes the file even if a write fails
    with open(outfile, "w") as out:
        out.write("other\t%s\n" % (
            ",".join([str(x) for x in sorted(removed_cols)])))
        out.write("otherc\t%s\n" % (
            ",".join([str(x) for x in sorted(t_nams_rows_rm)])))

    # Restrict the input alignment to the kept rows, then the kept columns
    which_nams = np.where(np.isin(nams, t_nams_rows))[0]
    new_arr = arr[which_nams, ]
    new_arr = new_arr[:, keep]
    utilityFunctions.writeOutfile(out_trimmed, new_arr, nams,
                                  t_nams_rows_rm)
def convertTrimalLog(arr, nams, trimfile, logfile, outfile):
    '''
    Convert the trimal --colnumbering output to resemble the CIAlign
    "removed" file.

    Parameters
    ----------
    arr: np.array
        The untrimmed alignment, indexed as arr[:, columns]
    nams: list
        Names of the sequences in arr (not used by this function)
    trimfile: str
        Path to the trimal-trimmed alignment, read with
        utilityFunctions.FastaToArray
    logfile: str
        Path to the trimal --colnumbering output
    outfile: str
        Path to write the "removed" style output to

    Raises
    ------
    AssertionError
        If the number of removed columns does not equal the difference in
        width between arr and the trimmed alignment, or if keeping only
        the listed columns of arr does not reproduce the trimmed alignment.
    '''
    t_arr, _ = utilityFunctions.FastaToArray(trimfile, "")
    width = np.shape(arr)[1]
    t_width = np.shape(t_arr)[1]

    # Only the first line of the log is used: its second tab-separated
    # field is a comma-separated list of the column indices which remain
    with open(logfile) as infile:
        first_line = infile.readline()
    remaining = {
        int(x.strip())
        for x in first_line.strip().split("\t")[1].split(",")}

    removed = sorted(set(range(width)) - remaining)
    assert len(removed) == width - t_width

    # Check removing these columns gives the trimal output.
    # A set has no guaranteed iteration order, so the indices must be
    # sorted explicitly or the columns may be selected out of order.
    new_arr = arr[:, sorted(remaining)]
    assert (new_arr == t_arr).all()

    # The context manager closes the file even if the write fails
    with open(outfile, "w") as out:
        out.write("other\t%s\n" % (",".join([str(x) for x in removed])))
def getParser():
    '''
    Builds a configargparse.ArgumentParser object with the CIAlign parameters

    The defaults, minima and maxima for the cleaning parameters are read
    from ranges.txt in the directory containing utilityFunctions. If
    --infile has been given on the command line, the alignment is read once
    to find its number of columns, which is passed to int_range.

    Returns
    -------
    parser: configargparse.ArgumentParser
        ArgumentParser with the CIAlign parameters
    '''

    parser = configargparse.ArgumentParser(
        description='Clean and interpret a multiple sequence \
                     alignment',
        add_help=False)
    ci_dir = os.path.dirname(utilityFunctions.__file__)

    # Looks up the default values and minimum and maximum values for the
    # parameters associated with the cleaning functions in the text file
    # ranges.txt provided in the CIAlign code directory.
    # Each line is tab-separated: name, default, minimum, maximum.
    # The context manager makes sure the file handle is closed.
    with open("%s/ranges.txt" % ci_dir) as ranges_file:
        ranges = [line.strip().split("\t") for line in ranges_file]
    # Defaults
    defs = {x[0]: x[1] for x in ranges}
    # Minima
    minis = {x[0]: x[2] for x in ranges}
    # Maxima
    maxis = {x[0]: x[3] for x in ranges}

    # Separate the required and optional parameters
    required = parser.add_argument_group('Required Arguments')
    optional = parser.add_argument_group('Optional Arguments')

    # Files
    # not to confuse with inifile
    required.add("--infile", dest='infile', type=str,
                 help='Path to input alignment file in FASTA format')
    optional.add("--inifile", dest='inifile', type=str,
                 default=None,
                 help='Path to config file. Default: %(default)s',
                 is_config_file=True)
    optional.add("--outfile_stem", dest='outfile_stem', type=str,
                 default="CIAlign",
                 help="Prefix for output files, including the path to the \
                       output directory. Default: %(default)s")

    # Initial setup
    # Read the alignment temporarily just to find out how many columns there
    # are as for several of the cleaning functions the range of valid
    # parameters depends on this.
    tempargs = parser.parse_known_args()[0]
    if tempargs.infile:
        # Read the FASTA file into an array (the names are not needed here)
        arr = utilityFunctions.FastaToArray(tempargs.infile, None,
                                            tempargs.outfile_stem)[0]
        # Find the number of columns in the input alignment
        n_col = np.shape(arr)[1]
        # Remove the array from memory
        del arr
    else:
        # Gives a valid int value just for generating the --help text
        n_col = 100

    # parameter to run all functions without having to type them in
    optional.add("--all", dest="all_options",
                 action="store_true",
                 help="Use all available functions with default parameters.")

    # parameter to run all cleaning functions without having to type them in
    optional.add("--clean", dest="clean",
                 action="store_true",
                 help="Use all cleaning functions with default parameters.")

    # parameter to create all mini alignments without having to type them in
    optional.add("--visualise", dest="visualise",
                 action="store_true",
                 help="Plot all mini alignments with default parameters.")

    # parameter to run all interpretation functions except creating
    # sequence logos without having to type them in
    optional.add("--interpret", dest="interpret",
                 action="store_true",
                 help="Use all interpreting functions with default parameters.")

    # Runtime
    optional.add("--silent", dest='silent',
                 help="Do not print progress to the screen. \
                       Default: %(default)s",
                 action='store_true')

    # Crop Ends
    optional.add("--crop_ends", dest="crop_ends",
                 action="store_true",
                 help="Crop the ends of sequences if they are poorly aligned. \
                       Default: %(default)s")

    optional.add("--crop_ends_mingap_perc", dest='crop_ends_mingap_perc',
                 type=float_range(minis['crop_ends_mingap_perc'],
                                  maxis['crop_ends_mingap_perc']),
                 default=defs['crop_ends_mingap_perc'],
                 help="Minimum proportion of the sequence length (excluding \
                       gaps) that is the threshold for change in gap numbers. \
                       Default: %(default)s.",
                 metavar="(float, %s..%s)" % (minis['crop_ends_mingap_perc'],
                                              maxis['crop_ends_mingap_perc']))

    optional.add("--crop_ends_redefine_perc", dest='crop_ends_redefine_perc',
                 type=float_range(minis['crop_ends_redefine_perc'],
                                  maxis['crop_ends_redefine_perc']),
                 default=defs['crop_ends_redefine_perc'],
                 help="Proportion of the sequence length (excluding gaps) \
                       that is being checked for change in gap numbers to \
                       redefine start/end. Default: %(default)s",
                 metavar="(float, %s..%s)" % (
                     minis['crop_ends_redefine_perc'],
                     maxis['crop_ends_redefine_perc']))

    # Remove divergent sequences
    optional.add("--remove_divergent", dest="remove_divergent",
                 action="store_true",
                 help="Remove sequences with <= N proportion of positions at \
                       which the most common base / amino acid in the \
                       alignment is present. Default: %(default)s")

    optional.add("--remove_divergent_minperc", dest="remove_divergent_minperc",
                 default=defs['remove_divergent_minperc'],
                 type=float_range(minis['remove_divergent_minperc'],
                                  maxis['remove_divergent_minperc']),
                 help="Minimum proportion of positions which should be \
                       identical to the most common base / amino acid in \
                       order to be preserved. \
                       Default: %(default)s)",
                 metavar="(float, %s..%s)" % (
                     minis['remove_divergent_minperc'],
                     maxis['remove_divergent_minperc']))

    # # Remove Insertions
    optional.add("--remove_insertions", dest="remove_insertions",
                 action="store_true",
                 help="Remove insertions found in <= 50 percent of sequences \
                       from the alignment. Default: %(default)s")

    # NOTE(review): the upper bound enforced below is the maximum for
    # insertion_max_size, whereas the metavar advertises the maximum for
    # insertion_min_size - confirm which of the two is intended
    optional.add("--insertion_min_size", dest="insertion_min_size",
                 type=int_range(minis['insertion_min_size'],
                                maxis['insertion_max_size'],
                                n_col),
                 default=defs['insertion_min_size'],
                 help="Only remove insertions >= this number of residues. \
                       Default: %(default)s.",
                 metavar="(int, %s..%s)" % (
                     minis['insertion_min_size'],
                     maxis['insertion_min_size']))

    optional.add("--insertion_max_size", dest="insertion_max_size",
                 type=int_range(minis['insertion_max_size'],
                                maxis['insertion_max_size'],
                                n_col),
                 default=defs['insertion_max_size'],
                 help="Only remove insertions <= this number of residues. \
                       Default: %(default)s",
                 metavar="(int, %s..%s)" % (
                     minis['insertion_max_size'],
                     maxis['insertion_max_size']))

    optional.add("--insertion_min_flank", dest="insertion_min_flank",
                 type=int_range(minis['insertion_min_flank'],
                                maxis['insertion_min_flank'],
                                n_col),
                 default=defs['insertion_min_flank'],
                 help="Minimum number of bases on either side of an insertion \
                       to classify it as an insertion.\
                       Default: %(default)s",
                 metavar="(int, %s..%s)" % (
                     minis['insertion_min_flank'],
                     maxis['insertion_min_flank']))

    # Remove Short
    optional.add("--remove_short", dest="remove_short",
                 help="Remove sequences <= N bases / amino acids from the \
                       alignment. Default: %(default)s",
                 action="store_true")

    optional.add("--remove_min_length", dest="remove_min_length",
                 type=int_range(minis['remove_min_length'],
                                maxis['remove_min_length'],
                                n_col),
                 default=defs['remove_min_length'],
                 help="Sequences are removed if they are shorter than this \
                       minimum length, excluding gaps. Default: %(default)s",
                 metavar="(int, %s..%s)" % (
                     minis['remove_min_length'],
                     maxis['remove_min_length']))

    # keep gap only
    # (stored inverted: giving --keep_gaponly sets remove_gaponly to False)
    optional.add("--keep_gaponly", dest="remove_gaponly",
                 action="store_false",
                 help="Keep gap only columns in the alignment. Default: \
                       %(default)s")

    # Consensus
    optional.add("--make_consensus", dest="make_consensus",
                 action="store_true",
                 help="Make a consensus sequence based on the cleaned \
                       alignment. Default: %(default)s")

    optional.add("--consensus_type", dest="consensus_type", type=str,
                 default="majority",
                 help="Type of consensus sequence to make - can be majority, \
                       to use the most common character at each position in \
                       the consensus, even if this is a gap, or \
                       majority_nongap, to use the most common non-gap \
                       character at each position. Default: %(default)s")

    optional.add("--consensus_keep_gaps", dest="consensus_keep_gaps",
                 action="store_true",
                 help="If there are gaps in the consensus (if majority_nongap \
                       is used as consensus_type), should these be included \
                       in the consensus (True) or should this position in \
                       the consensus be deleted (False). Default: %(default)s")

    optional.add("--consensus_name", dest="consensus_name",
                 type=str, default="consensus",
                 help="Name to use for the consensus sequence in the output \
                       fasta file. Default: %(default)s")

    # Mini Alignments
    optional.add("--plot_input", dest="plot_input",
                 action="store_true",
                 help="Plot a mini alignment - an image representing the \
                       input alignment. Default: %(default)s")

    optional.add("--plot_output", dest="plot_output",
                 action="store_true",
                 help="Plot a mini alignment, an image representing the \
                       output alignment. Default: %(default)s")

    optional.add("--plot_markup", dest="plot_markup",
                 action="store_true",
                 help="Draws the input alignment but with the columns and \
                       rows which have been removed by each function marked \
                       up in corresponding colours. Default: %(default)s")

    optional.add("--plot_dpi", dest="plot_dpi",
                 type=int, default=300,
                 help="DPI for mini alignments. Default: %(default)s")

    optional.add("--plot_format", dest="plot_format",
                 type=str, default='png',
                 help="Image format for mini alignments - can be png, svg, \
                       tiff or jpg. Default: %(default)s")

    optional.add("--plot_width", dest="plot_width",
                 type=int, default=5,
                 help="Mini alignment width in inches. Default: %(default)s")

    optional.add("--plot_height", dest="plot_height",
                 type=int, default=3,
                 help="Mini alignment height in inches. Default: %(default)s")

    optional.add("--plot_keep_numbers", dest="plot_keep_numbers",
                 action="store_true",
                 help="If specified, for mini alignments based on CIAlign \
                       output with <10 sequences (or if force_numbers \
                       is switched on) the rows will be labelled \
                       based on the input alignment, rather \
                       than renumbered")

    optional.add("--plot_force_numbers", dest="plot_force_numbers",
                 action="store_true",
                 help="Force all rows to be numbered on the mini alignments \
                       rather than labelling e.g. every 10th row for larger plots. \
                       Will cause labels to overlap on large plots")

    # Sequence logos
    optional.add("--make_sequence_logo", dest="make_sequence_logo",
                 action="store_true",
                 help="Draw a sequence logo. Default: %(default)s")

    optional.add("--sequence_logo_type", dest="sequence_logo_type",
                 type=str, default='bar',
                 help="Type of sequence logo - bar/text/both. \
                       Default: %(default)s")

    optional.add("--sequence_logo_dpi", dest="sequence_logo_dpi",
                 type=int, default=300,
                 help="DPI for sequence logo image. Default: %(default)s")

    optional.add("--sequence_logo_font", dest="sequence_logo_font",
                 type=str, default='monospace',
                 help="Font for text sequence logo. Default: %(default)s")

    optional.add("--sequence_logo_nt_per_row", dest='sequence_logo_nt_per_row',
                 type=int, default=50,
                 help="Number of bases / amino acids to show per row in the \
                       sequence logo, where the logo is too large to show on \
                       a single line. Default: %(default)s")

    optional.add("--sequence_logo_filetype", dest='sequence_logo_filetype',
                 type=str, default='png',
                 help="Image file type to use for the sequence logo - can be \
                       png, svg, tiff or jpg. Default: %(default)s")

    optional.add("--logo_start", dest="logo_start",
                 type=int, default=0,
                 help="Start position of sequence logo. Default: %(default)s")

    optional.add("--logo_end", dest="logo_end",
                 type=int, default=0,
                 help="End position of sequence logo. Default: %(default)s")

    optional.add("--list_fonts_only", dest='list_fonts_only',
                 action="store_true",
                 help="Make a swatch showing available fonts. \
                       Default: %(default)s")

    # Coverage
    optional.add("--plot_coverage_input", dest="plot_coverage_input",
                 action="store_true",
                 help="Plot the coverage of the input MSA. Default: \
                       %(default)s")

    optional.add("--plot_coverage_output", dest="plot_coverage_output",
                 action="store_true",
                 help="Plot the coverage of the output MSA. Default: \
                       %(default)s")

    optional.add("--plot_coverage_dpi", dest="plot_coverage_dpi",
                 type=int, default=300,
                 help="DPI for coverage plot. Default: %(default)s")

    optional.add("--plot_coverage_height", dest="plot_coverage_height",
                 type=int, default=3,
                 help="Height for coverage plot (inches). Default: \
                       %(default)s")

    optional.add("--plot_coverage_width", dest="plot_coverage_width",
                 type=int, default=5,
                 help="Width for coverage plot (inches). Default: \
                       %(default)s")

    optional.add("--plot_coverage_colour", dest="plot_coverage_colour",
                 type=str, default='#007bf5',
                 help="Colour for coverage plot (hex code or name). \
                       Default: %(default)s")

    optional.add("--plot_coverage_filetype", dest="plot_coverage_filetype",
                 type=str, default='png',
                 help="File type for coverage plot (png, svg, tiff, jpg). \
                       Default: %(default)s")

    # Similarity Matrix
    optional.add("--make_similarity_matrix_input",
                 dest="make_simmatrix_input",
                 action="store_true",
                 help="Make a similarity matrix for the input alignment. \
                       Default: %(default)s")

    optional.add("--make_similarity_matrix_output",
                 dest="make_simmatrix_output",
                 action="store_true",
                 help="Make a similarity matrix for the output alignment. \
                       Default: %(default)s")

    optional.add("--make_simmatrix_dp", dest="make_simmatrix_dp",
                 type=int, default=4,
                 help="Number of decimal places to display in the similarity \
                       matrix output file. Default: %(default)s")

    optional.add("--make_simmatrix_minoverlap",
                 dest="make_simmatrix_minoverlap",
                 type=int, default=1,
                 help="Minimum overlap between two sequences to have non-zero \
                       similarity in the similarity matrix. \
                       Default: %(default)s")

    optional.add("--make_simmatrix_keepgaps", dest="make_simmatrix_keepgaps",
                 type=int, default=0,
                 help="Include positions with gaps in either or both \
                       sequences in the similarity matrix calculation. \
                       Can be 0 - exclude positions which are gaps in either \
                       or both sequences, 1 - exclude positions which are \
                       gaps in both sequences, 2 - consider all positions \
                       regardless of gaps. Default: %(default)s")

    # Unalign function
    optional.add("--unalign_input", dest="unalign_input",
                 action="store_true", default=False,
                 help="Generate a copy of the input alignment with no gaps. \
                       Default: %(default)s")

    optional.add("--unalign_output", dest="unalign_output",
                 action="store_true", default=False,
                 help="Generate a copy of the cleaned alignment with no \
                       gaps. Default: %(default)s")

    # Replace Us by Ts function
    optional.add("--replace_input", dest="replace_input",
                 action="store_true", default=False,
                 help="Replaces all Us by Ts in input alignment. \
                       Default: %(default)s")

    optional.add("--replace_output", dest="replace_output",
                 action="store_true", default=False,
                 help="Replaces all Us by Ts in output alignment. \
                       Default: %(default)s")

    # Help function
    optional.add('-h', '--help', action='help',
                 default=configargparse.SUPPRESS,
                 help='Show all available parameters with an explanation.')

    # Version function
    optional.add('-v', '--version', action='version',
                 version=__version__,
                 default=configargparse.SUPPRESS,
                 help='Show the current version.')

    return (parser)