def process_deltadir(delta_dir, org_lengths):
    """Returns a tuple of ANIm results for .deltas in passed directory.

    - delta_dir - path to the directory containing .delta files
    - org_lengths - dictionary of total sequence lengths, keyed by sequence

    Returns the following pandas dataframes in a tuple; every dataframe
    uses the keys of org_lengths as both its row and column labels:

    - alignment_lengths - symmetrical: total length of alignment
    - percentage_identity - symmetrical: identity of the alignment,
      computed as the fraction 1 - (similarity errors / aligned length)
    - alignment_coverage - non-symmetrical: coverage of query and subject
    - similarity_errors - symmetrical: count of similarity errors

    Cells for which no .delta file is found keep their initial values:
    NaN for alignment_lengths (except the diagonal, which holds the
    sequence's own length), 0 for similarity_errors, and 1.0 for
    percentage_identity and alignment_coverage.

    May throw a ZeroDivisionError if one or more NUCmer runs failed, or a
    very distant sequence was included in the analysis (the total aligned
    length is then zero).
    """
    # Process directory to identify input files
    deltafiles = pyani_files.get_input_files(delta_dir, '.delta')
    # Materialise the labels once so all dataframes share the same axes
    labels = list(org_lengths)

    # Hold data in pandas dataframes
    alignment_lengths = pd.DataFrame(index=labels, columns=labels,
                                     dtype=float)
    similarity_errors = pd.DataFrame(0.0, index=labels, columns=labels,
                                     dtype=float)
    percentage_identity = pd.DataFrame(1.0, index=labels, columns=labels,
                                       dtype=float)
    alignment_coverage = pd.DataFrame(1.0, index=labels, columns=labels,
                                      dtype=float)

    # Fill diagonal NA values for alignment_length with org_lengths.
    # A single .loc call is used rather than chained df[col][row]
    # indexing, which may write to a temporary copy instead of the frame.
    for org, length in org_lengths.items():
        alignment_lengths.loc[org, org] = length

    # Process .delta files assuming that the filename format holds:
    # org1_vs_org2.delta
    for deltafile in deltafiles:
        stem = os.path.splitext(os.path.basename(deltafile))[0]
        qname, sname = stem.split('_vs_')
        tot_length, tot_sim_error = parse_delta(deltafile)
        query_cover = float(tot_length) / org_lengths[qname]
        sbjct_cover = float(tot_length) / org_lengths[sname]

        # Calculate percentage ID of aligned length. This raises
        # ZeroDivisionError if the total aligned length is zero; it is
        # deliberately left to propagate to the caller (see docstring).
        # Common causes are that a NUCmer run failed, or that a very
        # distant sequence was included in the analysis.
        perc_id = 1 - float(tot_sim_error) / tot_length

        # Populate dataframes: .loc is indexed as [row, column]. The
        # order only matters for the asymmetrical coverage data.
        alignment_lengths.loc[qname, sname] = tot_length
        alignment_lengths.loc[sname, qname] = tot_length
        similarity_errors.loc[qname, sname] = tot_sim_error
        similarity_errors.loc[sname, qname] = tot_sim_error
        percentage_identity.loc[qname, sname] = perc_id
        percentage_identity.loc[sname, qname] = perc_id
        # Query coverage goes to row sname/column qname, subject coverage
        # to row qname/column sname.
        # NOTE(review): this looks transposed relative to "query
        # sequences are rows" - confirm the intended orientation.
        alignment_coverage.loc[sname, qname] = query_cover
        alignment_coverage.loc[qname, sname] = sbjct_cover

    return (alignment_lengths, percentage_identity, alignment_coverage,
            similarity_errors)
def process_deltadir(delta_dir, org_lengths):
    """Returns a tuple of ANIm results for .deltas in passed directory.

    - delta_dir - path to the directory containing .delta files
    - org_lengths - dictionary of total sequence lengths, keyed by sequence

    Returns the following pandas dataframes in a tuple; the keys of
    org_lengths label both the rows and the columns of each dataframe:

    - alignment_lengths - symmetrical: total length of alignment
    - percentage_identity - symmetrical: identity of the alignment, as
      the fraction 1 - (similarity errors / aligned length)
    - alignment_coverage - non-symmetrical: coverage of query and subject
    - similarity_errors - symmetrical: count of similarity errors

    Pairs without a .delta file retain the initial cell values (NaN
    alignment length off the diagonal, 0 similarity errors, identity and
    coverage of 1.0). The diagonal of alignment_lengths is set to each
    sequence's own length.

    May throw a ZeroDivisionError if one or more NUCmer runs failed, or a
    very distant sequence was included in the analysis, because the total
    aligned length is then zero.
    """
    # Process directory to identify input files
    deltafiles = pyani_files.get_input_files(delta_dir, '.delta')
    labels = list(org_lengths)  # shared row/column labels

    # Hold data in pandas dataframes
    alignment_lengths = pd.DataFrame(index=labels, columns=labels,
                                     dtype=float)
    similarity_errors = pd.DataFrame(0.0, index=labels, columns=labels,
                                     dtype=float)
    percentage_identity = pd.DataFrame(1.0, index=labels, columns=labels,
                                       dtype=float)
    alignment_coverage = pd.DataFrame(1.0, index=labels, columns=labels,
                                      dtype=float)

    # Fill diagonal NA values for alignment_length with org_lengths.
    # Use one .loc assignment per cell: chained df[col][row] assignment
    # is not guaranteed to modify the original dataframe.
    for org, length in org_lengths.items():
        alignment_lengths.loc[org, org] = length

    # Process .delta files assuming that the filename format holds:
    # org1_vs_org2.delta
    for deltafile in deltafiles:
        stem = os.path.splitext(os.path.basename(deltafile))[0]
        qname, sname = stem.split('_vs_')
        tot_length, tot_sim_error = parse_delta(deltafile)
        query_cover = float(tot_length) / org_lengths[qname]
        sbjct_cover = float(tot_length) / org_lengths[sname]

        # Calculate percentage ID of aligned length. A zero total length
        # raises ZeroDivisionError here, which is intentionally not
        # caught (callers are told to expect it in the docstring).
        # Common causes are that a NUCmer run failed, or that a very
        # distant sequence was included in the analysis.
        perc_id = 1 - float(tot_sim_error) / tot_length

        # Populate dataframes: .loc takes [row, column]; the symmetrical
        # tables are written in both orientations.
        for row, col in ((qname, sname), (sname, qname)):
            alignment_lengths.loc[row, col] = tot_length
            similarity_errors.loc[row, col] = tot_sim_error
            percentage_identity.loc[row, col] = perc_id
        # Asymmetrical data: query coverage is stored at row sname,
        # column qname; subject coverage at row qname, column sname.
        # NOTE(review): appears to conflict with "query sequences are
        # rows" - confirm before changing.
        alignment_coverage.loc[sname, qname] = query_cover
        alignment_coverage.loc[qname, sname] = sbjct_cover

    return (alignment_lengths, percentage_identity, alignment_coverage,
            similarity_errors)
def process_blast(blast_dir, org_lengths, fraglengths=None, mode="ANIb"):
    """Returns a tuple of ANIb results for .blast_tab files in the output dir.

    - blast_dir - path to the directory containing .blast_tab files
    - org_lengths - the base count for each input sequence
    - fraglengths - dictionary of query sequence fragment lengths, only
      needed for BLASTALL output
    - mode - parsing BLASTN+ or BLASTALL output?

    Returns the following pandas dataframes in a tuple; rows are query
    sequences and columns are subject sequences, both labelled with the
    keys of org_lengths:

    - alignment_lengths - non-symmetrical: total length of alignment
    - percentage_identity - non-symmetrical: ANIb (Goris) percentage
      identity, stored as 0.01 * the value reported by parse_blast_tab
    - alignment_coverage - non-symmetrical: coverage of query
    - similarity_errors - non-symmetrical: count of similarity errors

    Cells with no corresponding .blast_tab file keep their initial
    values: NaN for alignment_lengths (the diagonal holds the sequence's
    own length), 0 for similarity_errors, and 1.0 for percentage_identity
    and alignment_coverage.

    May throw a ZeroDivisionError if one or more BLAST runs failed, or a
    very distant sequence was included in the analysis.
    """
    # Process directory to identify input files
    blastfiles = pyani_files.get_input_files(blast_dir, '.blast_tab')
    # Materialise the labels once so all dataframes share the same axes
    labels = list(org_lengths)

    # Hold data in pandas dataframes
    alignment_lengths = pd.DataFrame(index=labels, columns=labels,
                                     dtype=float)
    similarity_errors = pd.DataFrame(0.0, index=labels, columns=labels,
                                     dtype=float)
    percentage_identity = pd.DataFrame(1.0, index=labels, columns=labels,
                                       dtype=float)
    alignment_coverage = pd.DataFrame(1.0, index=labels, columns=labels,
                                      dtype=float)

    # Fill diagonal NA values for alignment_length with org_lengths.
    # A single .loc call is used rather than chained df[col][row]
    # indexing, which may write to a temporary copy instead of the frame.
    for org, length in org_lengths.items():
        alignment_lengths.loc[org, org] = length

    # Process .blast_tab files assuming that the filename format holds:
    # org1_vs_org2.blast_tab
    for blastfile in blastfiles:
        stem = os.path.splitext(os.path.basename(blastfile))[0]
        qname, sname = stem.split('_vs_')
        tot_length, tot_sim_error, ani_pid = parse_blast_tab(
            blastfile, fraglengths, mode)
        query_cover = float(tot_length) / org_lengths[qname]

        # Populate dataframes: .loc is indexed as [row, column], so the
        # query is the row and the subject is the column. Only this one
        # direction is written per file because the data is asymmetrical.
        alignment_lengths.loc[qname, sname] = tot_length
        similarity_errors.loc[qname, sname] = tot_sim_error
        percentage_identity.loc[qname, sname] = 0.01 * ani_pid
        alignment_coverage.loc[qname, sname] = query_cover

    return (alignment_lengths, percentage_identity, alignment_coverage,
            similarity_errors)
def process_blast(blast_dir, org_lengths, fraglengths=None, mode="ANIb"):
    """Returns a tuple of ANIb results for .blast_tab files in the output dir.

    - blast_dir - path to the directory containing .blast_tab files
    - org_lengths - the base count for each input sequence
    - fraglengths - dictionary of query sequence fragment lengths, only
      needed for BLASTALL output
    - mode - parsing BLASTN+ or BLASTALL output?

    Returns the following pandas dataframes in a tuple; the keys of
    org_lengths label both axes, with the query sequence as the row and
    the subject sequence as the column:

    - alignment_lengths - non-symmetrical: total length of alignment
    - percentage_identity - non-symmetrical: ANIb (Goris) percentage
      identity, scaled by 0.01 from the parse_blast_tab value
    - alignment_coverage - non-symmetrical: coverage of query
    - similarity_errors - non-symmetrical: count of similarity errors

    Query/subject pairs without a .blast_tab file retain the initial
    cell values (NaN alignment length off the diagonal, 0 similarity
    errors, identity and coverage of 1.0). The diagonal of
    alignment_lengths is set to each sequence's own length.

    May throw a ZeroDivisionError if one or more BLAST runs failed, or a
    very distant sequence was included in the analysis.
    """
    # Process directory to identify input files
    blastfiles = pyani_files.get_input_files(blast_dir, '.blast_tab')
    labels = list(org_lengths)  # shared row/column labels

    # Hold data in pandas dataframes
    alignment_lengths = pd.DataFrame(index=labels, columns=labels,
                                     dtype=float)
    similarity_errors = pd.DataFrame(0.0, index=labels, columns=labels,
                                     dtype=float)
    percentage_identity = pd.DataFrame(1.0, index=labels, columns=labels,
                                       dtype=float)
    alignment_coverage = pd.DataFrame(1.0, index=labels, columns=labels,
                                      dtype=float)

    # Fill diagonal NA values for alignment_length with org_lengths.
    # Use one .loc assignment per cell: chained df[col][row] assignment
    # is not guaranteed to modify the original dataframe.
    for org, length in org_lengths.items():
        alignment_lengths.loc[org, org] = length

    # Process .blast_tab files assuming that the filename format holds:
    # org1_vs_org2.blast_tab
    for blastfile in blastfiles:
        stem = os.path.splitext(os.path.basename(blastfile))[0]
        qname, sname = stem.split('_vs_')
        tot_length, tot_sim_error, ani_pid = parse_blast_tab(blastfile,
                                                             fraglengths,
                                                             mode)
        query_cover = float(tot_length) / org_lengths[qname]

        # Populate dataframes: .loc takes [row, column]. Each file fills
        # only the (query row, subject column) cell of every table.
        cell = (qname, sname)
        alignment_lengths.loc[cell] = tot_length
        similarity_errors.loc[cell] = tot_sim_error
        percentage_identity.loc[cell] = 0.01 * ani_pid
        alignment_coverage.loc[cell] = query_cover

    return (alignment_lengths, percentage_identity, alignment_coverage,
            similarity_errors)
def process_deltadir(delta_dir, org_lengths, logger=None):
    """Returns a tuple of ANIm results for .deltas in passed directory.

    - delta_dir - path to the directory containing .delta files
    - org_lengths - dictionary of total sequence lengths, keyed by sequence
    - logger - optional logger used to report zero-length alignments; when
      None, nothing is logged

    Returns a tuple of four pandas dataframes followed by a boolean flag.
    Every dataframe uses the keys of org_lengths as both its row and
    column labels:

    - alignment_lengths - symmetrical: total length of alignment
    - percentage_identity - symmetrical: identity of the alignment,
      computed as the fraction 1 - (similarity errors / aligned length)
    - alignment_coverage - non-symmetrical: coverage of query and subject
    - similarity_errors - symmetrical: count of similarity errors
    - zero_error - True if at least one .delta file reported a total
      alignment length of zero

    When the total alignment length of a comparison is zero (a NUCmer run
    failed, or a very distant sequence was included in the analysis), the
    resulting ZeroDivisionError is caught, the identity for that pair is
    set to 0 and zero_error is set to True.

    A ZeroDivisionError can still propagate from the coverage
    calculation if an entry of org_lengths is zero.
    """
    # Process directory to identify input files
    deltafiles = pyani_files.get_input_files(delta_dir, '.delta')
    # Materialise the labels once so all dataframes share the same axes
    labels = list(org_lengths)

    # Hold data in pandas dataframes
    alignment_lengths = pd.DataFrame(index=labels, columns=labels,
                                     dtype=float)
    similarity_errors = pd.DataFrame(0.0, index=labels, columns=labels,
                                     dtype=float)
    percentage_identity = pd.DataFrame(1.0, index=labels, columns=labels,
                                       dtype=float)
    alignment_coverage = pd.DataFrame(1.0, index=labels, columns=labels,
                                      dtype=float)

    # Fill diagonal NA values for alignment_length with org_lengths.
    # A single .loc call is used rather than chained df[col][row]
    # indexing, which may write to a temporary copy instead of the frame.
    for org, length in org_lengths.items():
        alignment_lengths.loc[org, org] = length

    # Process .delta files assuming that the filename format holds:
    # org1_vs_org2.delta
    zero_error = False  # flag to register a divide-by-zero error
    for deltafile in deltafiles:
        stem = os.path.splitext(os.path.basename(deltafile))[0]
        qname, sname = stem.split('_vs_')
        tot_length, tot_sim_error = parse_delta(deltafile)
        if tot_length == 0 and logger is not None:
            logger.warning("Total alignment length reported in %s is zero!",
                           deltafile)
        query_cover = float(tot_length) / org_lengths[qname]
        sbjct_cover = float(tot_length) / org_lengths[sname]

        # Calculate percentage ID of aligned length. This fails if the
        # total length is zero. Common causes are that a NUCmer run
        # failed, or that a very distant sequence was included in the
        # analysis.
        try:
            perc_id = 1 - float(tot_sim_error) / tot_length
        except ZeroDivisionError:
            # logger is optional: only report when one was supplied, so
            # that the fallback below is reached either way.
            if logger is not None:
                logger.error("One or more NUCmer output files has a problem.")
                logger.error("This is possibly due to a NUCmer comparison "
                             "being too distant for use. If so, please "
                             "consider using the --maxmatch option.")
                logger.error("Alternatively, this may be due to NUCmer run "
                             "failure: analysis may continue, but please "
                             "investigate.")
            perc_id = 0  # set arbitrary value of zero identity
            zero_error = True

        # Populate dataframes: .loc is indexed as [row, column]. The
        # order only matters for the asymmetrical coverage data.
        alignment_lengths.loc[qname, sname] = tot_length
        alignment_lengths.loc[sname, qname] = tot_length
        similarity_errors.loc[qname, sname] = tot_sim_error
        similarity_errors.loc[sname, qname] = tot_sim_error
        percentage_identity.loc[qname, sname] = perc_id
        percentage_identity.loc[sname, qname] = perc_id
        # Query coverage goes to row sname/column qname, subject coverage
        # to row qname/column sname.
        # NOTE(review): confirm this orientation is the intended one.
        alignment_coverage.loc[sname, qname] = query_cover
        alignment_coverage.loc[qname, sname] = sbjct_cover

    return (alignment_lengths, percentage_identity, alignment_coverage,
            similarity_errors, zero_error)