def read_sqce_lengths(fasta_file):
    """Read the length of the sequences from an input FASTA file.

    A line starting with `>` opens a new record; its identifier is the whole header line without the
    leading `>` and without surrounding whitespace. The length of a record is the number of characters
    of its (stripped) sequence lines. Every record that has a non-empty identifier is reported, including
    records without residues (length 0). Text found before the first header is ignored, and an empty file
    yields an empty dictionary.

    `fasta_file` is the path to the FASTA file to read.
    Return the lengths as a sequence_id:length dictionary.
    """
    print_log_msg(
        log_str='Retrieving sequence lengths from input FASTA file (%s)' % fasta_file)
    sqce_lengths = {}
    with open(fasta_file, 'r') as in_file:
        sqce_id = ""
        sqce_len = 0
        for line in in_file:
            if line.startswith(">"):
                # Store the previous record (if any) before starting a new one
                if sqce_id:
                    sqce_lengths[sqce_id] = sqce_len
                sqce_id = line.strip()[1:]
                # Always reset the counter so residues never leak from one record into the next
                sqce_len = 0
            else:
                sqce_len += len(line.strip())
        # Also add the last sequence (only if a header was actually read)
        if sqce_id:
            sqce_lengths[sqce_id] = sqce_len
    print_log_msg(
        log_str="Length retrieved for all input sequences (%d sequences). " % len(sqce_lengths),
        color="cyan")
    return sqce_lengths
def core_gf_json_report(all_gfs_dict, output_dir, file_name, is_represented, pretty=False):
    """Write a JSON report describing core GFs to `file_name` in `output_dir`.

    `is_represented` (boolean) selects which core GFs are reported: the represented ones (`True`) or the
    missing ones (`False`). For each selected GF the report holds its weight, number of species and number
    of member genes. Set `pretty` to `True` to get an indented, multi-line JSON file.
    """
    # Nothing is generated when the selector is not an actual boolean
    if not isinstance(is_represented, bool):
        print_log_msg(
            log_str='Error: invalid value for `is_represented` (boolean). Cannot generate JSON report',
            color="red")
        return
    print_log_msg(log_str='Generating JSON report (represented: %s)' % is_represented)
    # Output dictionary (subset of `all_gfs_dict`)
    report = {}
    for gf_id, gf_info in all_gfs_dict.items():
        if gf_info['represented'] != is_represented or not gf_info['is_core_gf']:
            continue
        report[gf_id] = {
            'weight': float(gf_info['weight']),
            'n_species': int(gf_info['n_species']),
            'n_genes': len(gf_info['members'])
        }
    dump_options = {'sort_keys': True}
    if pretty:
        dump_options['indent'] = 4
    with open(os.path.join(output_dir, file_name), 'w') as out_file:
        out_file.write(json.dumps(report, **dump_options))
def read_trapid_data(db_conn, experiment_id, top_hits, transcript_label=None):
    """Retrieve similarity search results of TRAPID experiment `experiment_id` (optionally only for
    transcripts labelled with `transcript_label`) from the TRAPID database, through `db_conn`.

    Each record's similarity data is read as `;`-separated hits, each hit being `subject,e-value,...`.
    Only the first `top_hits` hits of each transcript are kept, to avoid handling more results than
    necessary. E-values are converted to log10 (values not greater than the smallest positive float are
    clamped to it first).

    Return the data as a pandas dataframe indexed by `query_gene`, with `subject` and `log_e_value`
    columns, sorted by query and log e-value.
    """
    print_log_msg(log_str='Retrieving similarity search data from TRAPID database.')
    # Values are bound by the database driver (`%s` placeholders) instead of being formatted into the SQL
    # text, so a label containing quotes cannot break or alter the query.
    # NOTE(review): assumes `MS` is a MySQLdb-compatible driver ('format' paramstyle) -- confirm.
    if transcript_label not in [None, "None"]:  # Quickfix (ambiguity between None, and 'None' str).
        get_sim_data_query = (
            "SELECT sim.transcript_id, UNCOMPRESS(sim.similarity_data) as `similarity_data` "
            "FROM similarities sim INNER JOIN transcripts_labels tl "
            "ON sim.transcript_id = tl.transcript_id "
            "WHERE sim.experiment_id = %s "
            "AND tl.label = %s")
        query_params = (experiment_id, transcript_label)
    else:
        get_sim_data_query = (
            "SELECT sim.transcript_id, UNCOMPRESS(sim.similarity_data) as `similarity_data` "
            "FROM similarities sim "
            "WHERE sim.experiment_id = %s")
        query_params = (experiment_id,)
    # Execute query
    cursor = db_conn.cursor(MS.cursors.DictCursor)
    sim_list = []
    try:
        cursor.execute(get_sim_data_query, query_params)
        # Process output to get only the information we want: a list of rows formatted like
        # [[query, subject_1, e-value_1], [query, subject_2, e-value_2], ...]
        for record in ResultIter(db_cursor=cursor):
            similarity_data = record['similarity_data']
            if not similarity_data:
                continue  # No similarity data for this transcript (NULL or empty)
            for hit in similarity_data.split(';')[0:top_hits]:
                if not hit:
                    continue  # Empty entry (e.g. trailing separator)
                hit_fields = hit.split(',')
                sim_list.append([record['transcript_id'], hit_fields[0], float(hit_fields[1])])
    finally:
        cursor.close()
    trapid_df = pd.DataFrame(sim_list, columns=["query_gene", "subject", "log_e_value"])
    # Convert e-values to log10 (they are stored as raw e-values in TRAPID's `similarities` table)
    min_e_val = sys.float_info.min
    trapid_df['log_e_value'] = [
        math.log10(e_val) if e_val > min_e_val else math.log10(min_e_val)
        for e_val in trapid_df['log_e_value'].tolist()
    ]
    # Sort, index and return
    trapid_df = trapid_df.sort_values(by=['query_gene', 'log_e_value'], ascending=[True, True])
    trapid_df = trapid_df.set_index(['query_gene'])
    return trapid_df
def get_gene_gf_map(all_gfs_dict):
    """Build the gene identifier -> GF identifier lookup table.

    Every gene listed in the `members` of a GF is mapped to that GF. When a gene is a member of several
    GFs, the GF iterated last wins. Return the gene_id:gf dictionary.
    """
    gene_gf_map = {
        gene_id: gf_id
        for gf_id, gf_info in all_gfs_dict.items()
        for gene_id in gf_info['members']
    }
    n_elements = len(gene_gf_map)
    print_log_msg(log_str=str(n_elements) + ' elements in Gene-GF map.')
    return gene_gf_map
def export_results_to_db(db_conn, output_dict):
    """Export core GF completeness analysis results to the TRAPID database.

    One row is inserted in the `completeness_results` table: the keys of `output_dict` are used as column
    names (in sorted order) and the corresponding values, converted to text, as column values. The
    transaction is committed on `db_conn` and the cursor is closed afterwards.
    """
    print_log_msg(log_str='Export core GF completeness results to TRAPID database.')
    column_names = sorted(output_dict)
    # Column names cannot be bound as parameters, but values can: let the driver quote/escape them so that
    # a value containing a quote cannot break (or alter) the statement.
    # NOTE(review): assumes a driver using `%s` placeholders (e.g. MySQLdb) -- confirm.
    placeholders = ', '.join(['%s'] * len(column_names))
    export_query = "INSERT INTO completeness_results ({columns}) VALUES ({values})".format(
        columns=', '.join(column_names), values=placeholders)
    # Values used to be inserted as quoted text: keep sending their string representation
    insert_values = ["{0}".format(output_dict[column]) for column in column_names]
    cursor = db_conn.cursor()
    try:
        cursor.execute(export_query, insert_values)
        db_conn.commit()
    finally:
        cursor.close()
def get_completeness_score(all_gfs_dict):
    """Compute the core GF completeness score.

    The score is the summed weight of the represented core GFs divided by the summed weight of all core
    GFs. Non-core GFs are ignored. When there is nothing to divide by (no core GF, or a total weight of
    zero), 0.0 is returned instead of raising `ZeroDivisionError`.
    """
    print_log_msg(log_str='Calculating weighted core GF score.')
    total_weight = 0.0
    current_weight = 0.0
    for gf_info in all_gfs_dict.values():
        if not gf_info['is_core_gf']:
            continue
        weight = float(gf_info["weight"])
        total_weight += weight
        if gf_info["represented"]:
            current_weight += weight
    if total_weight == 0:
        return 0.0
    return current_weight / total_weight
def completeness_report(all_gfs_dict, output_dir, file_name):
    """Generate a core GF completeness report. Output to `file_name` in `output_dir`.

    The report starts with `#`-prefixed summary lines (completeness score, number of represented and
    missing core GFs, number of GFs flagged as `not_chosen`), followed by an empty line and a tabulated
    list of the missing core GFs (identifier, number of genes, number of species, weight).
    """
    print_log_msg(log_str='Generating core GF completeness report.')
    core_gfs = [gf for gf in all_gfs_dict if all_gfs_dict[gf]['is_core_gf']]
    represented_gfs = [gf for gf in core_gfs if all_gfs_dict[gf]['represented']]
    missing_gfs = [gf for gf in core_gfs if not all_gfs_dict[gf]['represented']]
    # `not_chosen` GFs are counted among all GFs (core or not)
    not_chosen_gfs = [gf for gf in all_gfs_dict if all_gfs_dict[gf]['not_chosen']]
    not_chosen_missing_gfs = [gf for gf in missing_gfs if all_gfs_dict[gf]['not_chosen']]
    completeness_score = get_completeness_score(all_gfs_dict=all_gfs_dict)
    n_core_gfs = str(len(core_gfs))
    n_not_chosen = str(len(not_chosen_gfs))
    # Text mode: the report is made of `str` lines (writing them to a binary-mode file fails on Python 3)
    with open(os.path.join(output_dir, file_name), 'w') as out_file:
        out_file.write('# Core GF completeness score:\t' + "{:.5f}".format(completeness_score) + '\n')
        out_file.write('# Represented core gene families:\t' + str(len(represented_gfs)) + '/' +
                       n_core_gfs + '\n')
        out_file.write('# Missing core gene families:\t' + str(len(missing_gfs)) + '/' +
                       n_core_gfs + '\n')
        out_file.write(
            '# In ' + n_not_chosen +
            ' cases, the core gene family associated with the top hit was not the one chosen as \'best\' gene family. \n'
        )
        out_file.write(
            '# ' + str(len(not_chosen_missing_gfs)) + ' of these ' + n_not_chosen +
            ' were core gene families, now considered as missing. ' +
            ', '.join(not_chosen_missing_gfs) + '\n')
        out_file.write('\n')
        out_file.write('missing_gf\tn_genes\tn_species\tgf_weight\n')
        for gf in missing_gfs:
            gf_info = all_gfs_dict[gf]
            out_file.write('\t'.join([
                gf,
                str(len(gf_info['members'])),
                str(gf_info['n_species']),
                str(gf_info['weight'])
            ]) + '\n')
def represented_core_gf_report(all_gfs_dict, output_dir, file_name):
    """Generate a tabulated file reporting represented core GFs and the list of corresponding similarity
    search queries. Output to `file_name` in `output_dir`.

    After a header line, one line is written per represented core GF: identifier, number of genes, number
    of species, weight and the comma-separated content of its `query_list`.
    """
    print_log_msg(log_str='Generating represented core GF report.')
    represented_gfs = [
        gf for gf in all_gfs_dict
        if all_gfs_dict[gf]['represented'] and all_gfs_dict[gf]['is_core_gf']
    ]
    # Text mode: the report is made of `str` lines (writing them to a binary-mode file fails on Python 3)
    with open(os.path.join(output_dir, file_name), 'w') as out_file:
        out_file.write('represented_gf\tn_genes\tn_species\tgf_weight\tquery_list\n')
        for gf in represented_gfs:
            gf_info = all_gfs_dict[gf]
            out_file.write('\t'.join([
                gf,
                str(len(gf_info['members'])),
                str(gf_info['n_species']),
                str(gf_info['weight']),
                ','.join(gf_info['query_list'])
            ]) + '\n')
def read_all_gfs(core_gfs_file, gf_len):
    """Read a core GF tabulated file, return a dictionary of GFs.

    The first line of the file is a header and is skipped; blank lines are ignored. For each other line,
    the first column is the GF identifier, the third the number of species, the fourth the weight and the
    fifth the core GF flag (a GF is a core GF only if this column is exactly `True`).

    If `gf_len` is true, the last three columns are read as average, median and standard deviation of the
    GF member lengths, and the `|`-separated members are read from the fourth column from the end.
    Otherwise the members are read from the last column.

    Return a gf_id:information dictionary. Each GF starts as not represented, not flagged as `not_chosen`,
    and with an empty `query_list`.
    """
    print_log_msg(log_str='Reading all GFs from core GFs file (%s). ' % core_gfs_file)
    all_gfs_dict = {}
    members_column = -4 if gf_len else -1
    with open(core_gfs_file, 'r') as in_file:
        next(in_file, None)  # Skip header line (an empty file simply yields no GF)
        for line in in_file:
            stripped_line = line.strip()
            if not stripped_line:
                continue  # Blank line (e.g. at the end of the file)
            fields = stripped_line.split('\t')
            gf_info = {
                'members': fields[members_column].split('|'),
                'n_species': fields[2],
                'weight': fields[3],
                'not_chosen': False,
                # Exact comparison: a substring test (`in 'True'`) would also accept '', 'T', 'rue', ...
                'is_core_gf': fields[4] == 'True',
                'represented': False,
                'query_list': []
            }
            if gf_len:
                gf_info['len_avg'] = float(fields[-3])
                gf_info['len_med'] = float(fields[-2])
                gf_info['len_stdev'] = float(fields[-1])
            all_gfs_dict[fields[0]] = gf_info
    n_gfs = len(all_gfs_dict)
    n_core_gfs = sum(1 for gf_info in all_gfs_dict.values() if gf_info['is_core_gf'])
    print_log_msg(log_str=str(n_gfs) + ' gene families and ' + str(n_core_gfs) +
                  ' core gene families found.',
                  color="cyan")
    return all_gfs_dict
def read_blast_output(blast_output, raw_evalues=False, remove_self_hits=False):
    """Load a tabulated similarity search output file (`.m8` file) as a pandas dataframe.

    Only columns 0, 1, 3 and 10 of the file are kept, as `query_gene`, `subject`, `length` and
    `log_e_value`. If `raw_evalues` is true, the e-value column is converted to log10 (values not greater
    than the smallest positive float are clamped to it first). If `remove_self_hits` is true, every row
    whose query equals its subject is dropped. The result is sorted by query and log e-value, and indexed
    by `query_gene`.
    """
    print_log_msg(log_str='Reading similarity search output file (%s)' % blast_output)
    # File column position -> dataframe column name (only these columns are loaded)
    kept_columns = {0: "query_gene", 1: "subject", 3: "length", 10: "log_e_value"}
    blast_df = pd.read_csv(blast_output,
                           sep='\t',
                           header=None,
                           comment='#',
                           usecols=sorted(kept_columns))
    blast_df = blast_df.rename(columns=kept_columns)

    # Convert e-values to log10 if --raw flag is provided (RapSearch2 takes care of that!)
    if raw_evalues:
        print_log_msg(
            log_str='Converting e-values to log10 (\'--raw\' flag was provided).')
        smallest_e_val = sys.float_info.min
        converted = []
        for e_val in blast_df['log_e_value'].tolist():
            if e_val > smallest_e_val:
                converted.append(math.log10(e_val))
            else:
                converted.append(math.log10(smallest_e_val))
        blast_df['log_e_value'] = converted

    # Remove self hits if --no_self_hits flag is provided: all of them are dropped, not only those
    # that are the top hit of a query
    if remove_self_hits:
        print_log_msg(
            log_str='Removing self-hits (\'--no_self_hits\' flag was provided).'
        )
        is_self_hit = blast_df['query_gene'] == blast_df['subject']
        blast_df = blast_df[~is_self_hit]

    # Sort and index (fast lookup by query)
    sorted_df = blast_df.sort_values(by=['query_gene', 'log_e_value'],
                                     ascending=[True, True])
    return sorted_df.set_index(['query_gene'])
def main(core_gfs_file, blast_output, fasta_input, output_dir, top_hits, min_len, raw_evalues, naive_scoring, remove_self_hits, gf_len):
    """Run the whole core GF completeness analysis.

    Create `output_dir` if needed, read the inputs (core GFs file, FASTA file, similarity search output),
    process the similarity search results to flag represented GFs, then write the reports (completeness
    report, represented core GF report, and the two JSON reports) to `output_dir`.
    """
    if os.path.exists(output_dir):
        print_log_msg(log_str='Output directory \'%s\' already exists.' % output_dir)
    else:
        print_log_msg(log_str='Creating output directory \'%s\'.' % output_dir)
        os.makedirs(output_dir)

    # Read inputs
    all_gfs = read_all_gfs(core_gfs_file=core_gfs_file, gf_len=gf_len)
    sqce_lengths = read_sqce_lengths(fasta_file=fasta_input)
    gene_gf_map = get_gene_gf_map(all_gfs_dict=all_gfs)
    blast_df = read_blast_output(blast_output=blast_output,
                                 raw_evalues=raw_evalues,
                                 remove_self_hits=remove_self_hits)

    # Flag represented GFs (updates `all_gfs` in place)
    process_blast_output(blast_df=blast_df,
                         n_hits=top_hits,
                         gene_gf_map=gene_gf_map,
                         output_dir=output_dir,
                         all_gfs_dict=all_gfs,
                         sqce_lengths=sqce_lengths,
                         gf_len=gf_len,
                         min_len=min_len,
                         naive_scoring=naive_scoring)

    # Write reports
    completeness_report(all_gfs_dict=all_gfs,
                        output_dir=output_dir,
                        file_name="core_gf_completeness_report.tsv")
    represented_core_gf_report(all_gfs_dict=all_gfs,
                               output_dir=output_dir,
                               file_name="represented_core_gf_report.tsv")
    json_reports = (
        ("core_gf_report.represented.json", True),
        ("core_gf_report.missing.json", False),
    )
    for json_file_name, represented in json_reports:
        core_gf_json_report(all_gfs_dict=all_gfs,
                            output_dir=output_dir,
                            file_name=json_file_name,
                            is_represented=represented)
    print_log_msg(log_str='Core GF completeness analysis finished!', color="green")
def _score_gfs(gfs, log_e_values, all_gfs_dict, naive_scoring):
    """Sum the absolute log e-values of the hits of each GF (hits without GF are ignored), and multiply
    each sum by the GF weight unless `naive_scoring` is set. Return a gf:score dictionary."""
    scored_gfs = {}
    for gf, log_e_value in zip(gfs, log_e_values):
        # Ignore hits without GF BEFORE taking the 'max' (otherwise we underestimate the completeness...)
        if gf is None:
            continue
        scored_gfs[gf] = scored_gfs.get(gf, 0) + abs(log_e_value)
    if not naive_scoring:
        for gf in scored_gfs:
            scored_gfs[gf] = scored_gfs[gf] * float(all_gfs_dict[gf]['weight'])
    return scored_gfs


def process_blast_output(blast_df, n_hits, gene_gf_map, all_gfs_dict, sqce_lengths, gf_len, min_len, output_dir, naive_scoring=False):
    """Process the whole BLAST/RapSearch2 output.

    Will modify the dictionary passed as parameter (`all_gfs_dict`) to set to `True` families that are
    represented. For a query, we take the top `n_hits` from the results (`blast_df`, indexed by query) and
    look up the GF of each hit subject in `gene_gf_map`. Each GF is scored with the sum of the absolute
    log e-values of its hits, multiplied by the GF weight unless `naive_scoring` is set. The best scoring
    GF is then considered represented, and the query is added to its `query_list`, unless the query is
    flagged as partial:

    * if `gf_len` is set, a query whose length (from `sqce_lengths`) has a z-score <= -2 compared to the
      lengths of the GF members is partial;
    * if `min_len` > 0, a query whose mean alignment length with the best GF hits is shorter than
      `min_len` * median GF member length is partial.

    Queries flagged as partial are logged (appended) to text files in `output_dir`. If neither `gf_len`
    nor `min_len` is provided, partial query sequences are not taken into account. When the GF of the top
    hit is not the best scoring GF, a warning is logged and the top hit GF is flagged as `not_chosen`.
    """
    filtered_log_file = {
        "gf_len": "gf_z_score_filtered_queries.txt",
        "min_len": "minimum_alignment_length_filtered_queries.txt"
    }
    # Get list of all queries
    all_queries = list(blast_df.index.unique())
    print_log_msg(log_str=str(len(all_queries)) + ' queries in similarity search output.')
    if not naive_scoring:
        print_log_msg(
            log_str='Warning: GF scoring will be normalized with GF weight (no `naive_scoring` flag provided).',
            color="orange")
    for query in all_queries:
        # Selecting with a list always returns a dataframe, even when the query has a single hit
        top_hits = blast_df.loc[[query]].iloc[0:n_hits]
        gfs = [
            gf_lookup(gene_id=subject, gene_gf_map=gene_gf_map)
            for subject in top_hits['subject']
        ]
        if not gfs:
            continue  # No hit to examine (e.g. `n_hits` is 0)
        top_hit_gf = gfs[0]
        scored_gfs = _score_gfs(gfs=gfs,
                                log_e_values=top_hits['log_e_value'].tolist(),
                                all_gfs_dict=all_gfs_dict,
                                naive_scoring=naive_scoring)
        if not scored_gfs:
            continue  # None of the top hits belongs to a GF
        best_gf = max(scored_gfs, key=scored_gfs.get)
        best_gf_info = all_gfs_dict[best_gf]

        partial = False
        if gf_len:
            # First, compare the query length to the length of GF members. If z-score is <= -2, the GF is
            # not represented. We can only do this when it makes sense: i.e. if the cutoff value is at
            # least 15 AA and stdev != 0
            gf_len_cutoff = best_gf_info['len_avg'] - 2 * best_gf_info['len_stdev']
            if gf_len_cutoff >= 15 and best_gf_info['len_stdev'] != 0:
                len_z_score = (sqce_lengths[query] - best_gf_info['len_avg']) / best_gf_info['len_stdev']
                if len_z_score <= -2:
                    partial = True
                    with open(os.path.join(output_dir, filtered_log_file['gf_len']), 'a') as out_file:
                        out_file.write(
                            "%s flagged as partial (Z = %f, threshold = %f AA)\n" %
                            (query, len_z_score, gf_len_cutoff))
        # NOTE(review): this check needs the median GF length, which is only available when the GFs were
        # read with length statistics; it is skipped otherwise -- confirm this matches the intended
        # interaction between `gf_len` and `min_len`.
        if min_len > 0 and 'len_med' in best_gf_info:
            # Also use alignment length (mean of best GF alignments)
            query_len_cutoff = best_gf_info['len_med'] * min_len  # Minimum required length in AA
            # Length of all alignments corresponding to best GF hits
            query_lens = [
                hit_len for hit_len, gf in zip(top_hits['length'].tolist(), gfs)
                if gf == best_gf
            ]
            query_len_avg = float(sum(query_lens)) / max(len(query_lens), 1)  # Average length in AA
            if query_len_avg < query_len_cutoff:
                partial = True
                with open(os.path.join(output_dir, filtered_log_file['min_len']), 'a') as out_file:
                    out_file.write(
                        "%s flagged as partial (alignment length shorter than %f AA)\n" %
                        (query, query_len_cutoff))
        if not partial:
            best_gf_info['represented'] = True
            best_gf_info['query_list'].append(query)
        # NOTE(review): the top hit GF is flagged whether or not the query was flagged as partial --
        # confirm this is the intended nesting.
        if best_gf != top_hit_gf and top_hit_gf is not None:
            print_log_msg(
                log_str='Warning: the GF of the top result for query ' + query +
                ' (' + str(top_hit_gf) + '|is_core_gf=' +
                str(all_gfs_dict[top_hit_gf]['is_core_gf']) + ', score=' +
                str(scored_gfs[top_hit_gf]) + ') was not the one assigned (' +
                str(best_gf) + '|is_core_gf=' +
                str(best_gf_info['is_core_gf']) + ', score=' +
                str(scored_gfs[best_gf]) + ').')
            all_gfs_dict[top_hit_gf]['not_chosen'] = True