Ejemplo n.º 1
0
def select_NCBI_record(ODB_fasta_fpath, NCBI_fasta_fpath, taxid_dict,
                       ODB_final_input_df, compare_taxids):
    """Selects best NCBI record from NCBI_fasta_fpath by max identity to the OrthoDB records in compare_taxids.

    If the NCBI fasta yields more than one record, all NCBI records are aligned together with the accepted
    OrthoDB records and only the row returned by min_dist_spec_record is kept. Otherwise whatever
    load_NCBI_fasta_df returned is appended unchanged.

    :param ODB_fasta_fpath: Fasta path for ODB records
    :param NCBI_fasta_fpath: Fasta path to NCBI records (should only contain records from one species)
    :param taxid_dict: Maps species names to taxids, used by load_NCBI_fasta_df
    :param ODB_final_input_df: DataFrame containing accepted records from OrthoDB data, returned from
    ODBfilter.process_input
    :param (collection) compare_taxids: tax_ids against which distance should be calculated to determine minimum
    distance NCBI record
    :return: combined_df, DataFrame containing rows from ODB_final_input_df and the minimum distance row from
    NCBI_fasta_fpath; its index is named "record_id"
    :raises TypeError: if the selected row is a Series without a name (same condition DataFrame.append rejected)
    """
    # Function-scope import: DataFrame.append was removed in pandas 2.0, so rows are combined with pd.concat.
    import pandas as pd

    def _odb_plus(rows):
        # Returns ODB_final_input_df with `rows` (DataFrame or single-row Series) stacked underneath.
        if isinstance(rows, pd.Series):
            if rows.name is None:
                raise TypeError(
                    "Can only append a Series if it has a name")
            # Mirrors what DataFrame.append did for a Series: one row labelled by the Series name.
            rows = rows.to_frame().T.infer_objects()
        return pd.concat([ODB_final_input_df, rows], sort=False)

    ncbi_df = load_NCBI_fasta_df(NCBI_fasta_fpath, taxid_dict)

    # Zero or one NCBI record: nothing to choose between, so no alignment is needed.
    if len(ncbi_df) <= 1:
        combined_df = _odb_plus(ncbi_df)
        combined_df.index.name = "record_id"
        return combined_df

    # Align all unfiltered NCBI records against ODB_final_input records
    # NOTE(review): paths are relative to the working directory; presumably tmp/ must already exist - confirm.
    combined_unaln_fpath, combined_aln_fpath = "tmp/ODB_NCBI_unaln.fasta", "tmp/ODB_NCBI_aln.fasta"
    unaln_generator = SSfasta.ODB_NCBI_generator(
        ODB_fasta_fpath,
        NCBI_fasta_fpath,
        odb_subset=ODB_final_input_df.index)
    SeqIO.write(unaln_generator, combined_unaln_fpath, "fasta")

    combined_df = _odb_plus(ncbi_df)
    id_dm, align_srs = SSfasta.construct_id_dm(
        combined_df,
        combined_unaln_fpath,
        align_outpath=combined_aln_fpath)

    spec_record_ids = ncbi_df.index
    compare_record_ids = ODB_final_input_df.loc[
        ODB_final_input_df['organism_taxid'].isin(compare_taxids)].index
    # The second return value (the distance itself) is not needed here.
    md_row, _ = min_dist_spec_record(id_dm, align_srs.index,
                                     spec_record_ids,
                                     compare_record_ids,
                                     combined_df)

    final_combined_df = _odb_plus(md_row)
    final_combined_df.index.name = "record_id"
    return final_combined_df
Ejemplo n.º 2
0
def combined_records_processing(config,
                                am_df,
                                em_df,
                                combined_df,
                                symbol,
                                odb_fasta="",
                                ncbi_fasta="",
                                out_unaln_fasta="",
                                out_aln_fasta="",
                                out_tsv_fpath=""):
    """Builds the final record table for symbol and writes the final dataset to disk.

    Writes the combined OrthoDB + NCBI records to 1) an unaligned fasta, 2) an aligned fasta and 3) a
    tab-separated records table. The returned DataFrame is a copy of combined_df with a 'dist' column added and
    then passed through annotate_source_and_filter.

    :param config: configparser object; reads ['RUN']['RunName'] and ['NCBI']['NCBITaxID']
    :param am_df: alias_match DataFrame (see ODBfilter); records of combined_df whose index appears in
    am_df.index are treated as OrthoDB records, all others as NCBI records
    :param em_df: exact match DataFrame, forwarded to annotate_source_and_filter
    :param combined_df: ODB and NCBI combined record DataFrame as returned by select_NCBI_record
    :param symbol: Gene symbol
    :param odb_fasta,ncbi_fasta: If provided, used as sources for the sequences written to the final dataset. If
    not provided, default to the input fasta paths derived from run_name, NCBI taxid and symbol
    :param out_unaln_fasta,out_aln_fasta: If provided, unaligned/aligned final sequence sets are written to
    these paths. If not provided, the default output directory path for symbol is used.
    :param out_tsv_fpath: If provided, path for the records table; otherwise a default under the output
    directory for symbol
    :return: processed_df: the DataFrame returned by annotate_source_and_filter (also written to out_tsv_fpath)
    """
    run_name = config['RUN']['RunName']
    ncbi_taxid = config['NCBI']['NCBITaxID']

    # Any falsy path argument falls back to the run's conventional location.
    odb_fasta = odb_fasta or "{0}/input/ODB/{1}.fasta".format(
        run_name, symbol)
    ncbi_fasta = ncbi_fasta or "{0}/input/NCBI/{1}/{2}.fasta".format(
        run_name, ncbi_taxid, symbol)
    out_unaln_fasta = out_unaln_fasta or "{0}/output/{1}/{1}.fasta".format(
        run_name, symbol)
    out_aln_fasta = out_aln_fasta or "{0}/output/{1}/{1}_msa.fasta".format(
        run_name, symbol)
    out_tsv_fpath = out_tsv_fpath or "{0}/output/{1}/{1}_records.tsv".format(
        run_name, symbol)
    manual_selections_fpath = "{0}/manual_record_selections.tsv".format(
        run_name)

    SSdirectory.create_directory("{0}/output/{1}".format(run_name, symbol))

    # Split record ids by source: ids present in am_df are OrthoDB, the remainder NCBI.
    all_ids = combined_df.index
    in_am = all_ids.isin(am_df.index)
    odb_records_idx = all_ids[in_am]
    ncbi_records_idx = all_ids[~in_am]

    # Write the unaligned fasta for the final record set.
    record_stream = SSfasta.ODB_NCBI_generator(odb_fasta,
                                               ncbi_fasta,
                                               odb_subset=odb_records_idx,
                                               ncbi_subset=ncbi_records_idx,
                                               ordered=True)
    SeqIO.write(record_stream, out_unaln_fasta, 'fasta')

    # Internal distances; construct_id_dm is also handed the aligned-fasta output path.
    id_dm, aln_srs = SSfasta.construct_id_dm(combined_df,
                                             out_unaln_fasta,
                                             align_outpath=out_aln_fasta)
    processed_df = combined_df.copy()
    processed_df.loc[:, 'dist'] = SSfasta.avg_dist_srs(all_ids, id_dm)

    # Source-database / selection-type annotation (and, per the original note, dropping of redundant columns).
    processed_df = annotate_source_and_filter(config, symbol, processed_df,
                                              am_df, em_df,
                                              manual_selections_fpath)

    # Persist the records table.
    processed_df.to_csv(out_tsv_fpath, sep='\t')
    return processed_df