def select_NCBI_record(ODB_fasta_fpath, NCBI_fasta_fpath, taxid_dict,
                       ODB_final_input_df, compare_taxids):
    """Select the best NCBI record by max identity to chosen OrthoDB records.

    The NCBI records are loaded with load_NCBI_fasta_df. If more than one
    record is present, all of them are written together with the accepted
    OrthoDB records to a temporary fasta, an identity distance matrix is
    built with SSfasta.construct_id_dm, and min_dist_spec_record picks the
    NCBI row closest to the OrthoDB records whose 'organism_taxid' is in
    compare_taxids. If there is at most one NCBI record, it is kept as is.

    :param ODB_fasta_fpath: Fasta path for ODB records
    :param NCBI_fasta_fpath: Fasta path to NCBI records (should only contain
        records from one species)
    :param taxid_dict: Maps species names to taxids, used by
        load_NCBI_fasta_df
    :param ODB_final_input_df: DataFrame containing accepted records from
        OrthoDB data, returned from ODBfilter.process_input
    :param (collection) compare_taxids: tax_ids against which distance should
        be calculated to determine the minimum distance NCBI record
    :return: DataFrame containing the rows from ODB_final_input_df followed
        by the selected NCBI row(s); its index is named "record_id"
    """
    # Function-scope imports so this block does not depend on which aliases
    # the module header happens to define.
    import os

    import pandas as pd

    ncbi_df = load_NCBI_fasta_df(NCBI_fasta_fpath, taxid_dict)
    selected_ncbi = ncbi_df
    if len(ncbi_df) > 1:
        # Align all unfiltered NCBI records against ODB_final_input records
        combined_unaln_fpath = "tmp/ODB_NCBI_unaln.fasta"
        combined_aln_fpath = "tmp/ODB_NCBI_aln.fasta"
        # The scratch paths are hard-coded; make sure their directory exists
        # so the fasta write below cannot fail on a fresh working directory.
        os.makedirs("tmp", exist_ok=True)
        unaln_generator = SSfasta.ODB_NCBI_generator(
            ODB_fasta_fpath, NCBI_fasta_fpath,
            odb_subset=ODB_final_input_df.index)
        SeqIO.write(unaln_generator, combined_unaln_fpath, "fasta")

        # DataFrame.append was removed in pandas 2.0; pd.concat with
        # sort=False is the call it delegated to.
        combined_df = pd.concat([ODB_final_input_df, ncbi_df], sort=False)
        id_dm, align_srs = SSfasta.construct_id_dm(
            combined_df, combined_unaln_fpath,
            align_outpath=combined_aln_fpath)

        spec_record_ids = ncbi_df.index
        in_compare_set = ODB_final_input_df['organism_taxid'].isin(
            compare_taxids)
        compare_record_ids = ODB_final_input_df.loc[in_compare_set].index
        # The minimum distance itself is not needed here, only the row.
        md_row, _min_dist = min_dist_spec_record(
            id_dm, align_srs.index, spec_record_ids, compare_record_ids,
            combined_df)
        if isinstance(md_row, pd.Series):
            # NOTE(review): min_dist_spec_record is defined elsewhere and may
            # return a Series; append() treated a Series as a single row
            # labelled by its name, which concat does not do on its own.
            md_row = md_row.to_frame().T.infer_objects()
        selected_ncbi = md_row

    final_combined_df = pd.concat([ODB_final_input_df, selected_ncbi],
                                  sort=False)
    final_combined_df.index.name = "record_id"
    return final_combined_df
def combined_records_processing(config, am_df, em_df, combined_df, symbol,
                                odb_fasta="", ncbi_fasta="",
                                out_unaln_fasta="", out_aln_fasta="",
                                out_tsv_fpath=""):
    """Build and write the final annotated record set for one gene symbol.

    Writes the combined OrthoDB + NCBI records to an unaligned fasta, builds
    the identity distance matrix (which also produces the aligned fasta at
    out_aln_fasta), adds a per-record 'dist' column, annotates the records
    via annotate_source_and_filter, and writes the result as a tab-separated
    table.

    :param config: configparser object; config['RUN']['RunName'] and
        config['NCBI']['NCBITaxID'] are read
    :param am_df: alias_match DataFrame (see ODBfilter); records of
        combined_df whose index is in am_df.index are treated as OrthoDB
        records, all others as NCBI records
    :param em_df: exact match DataFrame
    :param combined_df: ODB and NCBI combined record DataFrame as returned by
        select_NCBI_record
    :param symbol: Gene symbol
    :param odb_fasta,ncbi_fasta: Source fasta paths for the record sequences.
        If empty, default to "<run_name>/input/ODB/<symbol>.fasta" and
        "<run_name>/input/NCBI/<ncbi_taxid>/<symbol>.fasta"
    :param out_unaln_fasta,out_aln_fasta: Output paths for the unaligned and
        aligned final sequence sets. If empty, default to files under
        "<run_name>/output/<symbol>/"
    :param out_tsv_fpath: Output path for the records table. If empty,
        defaults to "<run_name>/output/<symbol>/<symbol>_records.tsv"
    :return: processed_df: copy of combined_df with the 'dist' column added
        and the annotations applied by annotate_source_and_filter
    """
    run_name, ncbi_taxid = config['RUN']['RunName'], config['NCBI'][
        'NCBITaxID']

    # Fall back to the run's conventional locations for any path left empty.
    odb_fasta = odb_fasta or "{0}/input/ODB/{1}.fasta".format(
        run_name, symbol)
    ncbi_fasta = ncbi_fasta or "{0}/input/NCBI/{1}/{2}.fasta".format(
        run_name, ncbi_taxid, symbol)
    out_unaln_fasta = out_unaln_fasta or "{0}/output/{1}/{1}.fasta".format(
        run_name, symbol)
    out_aln_fasta = out_aln_fasta or "{0}/output/{1}/{1}_msa.fasta".format(
        run_name, symbol)
    out_tsv_fpath = out_tsv_fpath or "{0}/output/{1}/{1}_records.tsv".format(
        run_name, symbol)

    SSdirectory.create_directory("{0}/output/{1}".format(run_name, symbol))
    manual_selections_fpath = "{0}/manual_record_selections.tsv".format(
        run_name)

    # Split the record ids by source: ids present in am_df are OrthoDB
    # records, the remainder are NCBI records.
    record_ids = combined_df.index
    from_odb = record_ids.isin(am_df.index)
    odb_ids = record_ids[from_odb]
    ncbi_ids = record_ids[~from_odb]

    # Final unaligned fasta writing
    record_seqs = SSfasta.ODB_NCBI_generator(odb_fasta, ncbi_fasta,
                                             odb_subset=odb_ids,
                                             ncbi_subset=ncbi_ids,
                                             ordered=True)
    SeqIO.write(record_seqs, out_unaln_fasta, 'fasta')

    # Internal distance calculation; the alignment goes to out_aln_fasta.
    id_dm, _aligned = SSfasta.construct_id_dm(combined_df, out_unaln_fasta,
                                              align_outpath=out_aln_fasta)
    avg_dists = SSfasta.avg_dist_srs(record_ids, id_dm)

    # Work on a copy so the caller's combined_df is left untouched.
    processed_df = combined_df.copy()
    processed_df.loc[:, 'dist'] = avg_dists

    # Add source_db and selection_type information. Per the original note,
    # the helper also drops redundant columns (default is level_taxid,
    # pub_og_id) -- it is defined elsewhere, so confirm there.
    processed_df = annotate_source_and_filter(config, symbol, processed_df,
                                              am_df, em_df,
                                              manual_selections_fpath)

    # Write records table to file
    processed_df.to_csv(out_tsv_fpath, sep='\t')
    return processed_df