Example #1
def read_sqce_lengths(fasta_file):
    """Read the length of the sequences from an input FASTA file. Return them as sequence_id:length dictionary. """
    print_log_msg(
        log_str='Retrieving sequence lengths from input FASTA file (%s)' %
        fasta_file)
    sqce_lengths = {}
    with open(fasta_file, 'r') as in_file:
        sqce_id = ""
        sqce = []
        for line in in_file:
            if line[0] == ">":
                # If there was a previous sequence, add its information to `sqce_lengths` and reset `sqce`
                if sqce and sqce_id:
                    sqce_lengths[sqce_id] = len("".join(sqce))
                    sqce_id = ""
                    sqce = []
                sqce_id = line.strip()[1:]
            else:
                sqce.append(line.strip())
        # Also add the last sequence (guard against an empty input file)
        if sqce_id:
            sqce_lengths[sqce_id] = len("".join(sqce))
    # Some tests
    # print sqce_lengths["AT1G01010"]  # 430
    # print sqce_lengths["AT1G01020"]  # 246
    # print sqce_lengths["AT1G01030"]  # 359
    # print sqce_lengths["ATMG01410"]  # 205 aa
    # print len(sqce_lengths)
    print_log_msg(
        log_str="Length retrieved for all input sequences (%d sequences). " %
        len(sqce_lengths),
        color="cyan")
    return sqce_lengths
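A minimal usage sketch for `read_sqce_lengths`, assuming `print_log_msg` is defined as in the rest of the module; the file name and sequence IDs are hypothetical:

# Hypothetical two-record FASTA file ('toy.fasta' is an assumption)
with open("toy.fasta", "w") as toy:
    toy.write(">seq_a\nMKTAYIAKQR\nQISFVKSHFS\n>seq_b\nMSLV\n")
lengths = read_sqce_lengths(fasta_file="toy.fasta")
assert lengths == {"seq_a": 20, "seq_b": 4}  # Multi-line records are concatenated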
Example #2
def core_gf_json_report(all_gfs_dict,
                        output_dir,
                        file_name,
                        is_represented,
                        pretty=False):
    """Generate JSON report, for either represented or missing core GFs (set with `is_represented` boolean).
    Produced file can be made prettier (indentation, new lines) by setting `pretty` to True. """
    # Don't generate anything if `is_represented` is not a boolean
    if not isinstance(is_represented, bool):
        print_log_msg(
            log_str=
            'Error: invalid value for `is_represented` (boolean). Cannot generate JSON report',
            color="red")
    else:
        print_log_msg(log_str='Generating JSON report (represented: %s)' %
                      is_represented)
        core_gfs = {
            gf for gf in all_gfs_dict
            if all_gfs_dict[gf]['represented'] == is_represented
            and all_gfs_dict[gf]['is_core_gf']
        }
        out_dict = {}  # Output dictionary (subset of `all_gfs_dict`)
        for gf in core_gfs:
            out_dict[gf] = {
                'weight': float(all_gfs_dict[gf]['weight']),
                'n_species': int(all_gfs_dict[gf]['n_species']),
                'n_genes': len(all_gfs_dict[gf]['members'])
            }
        with open(os.path.join(output_dir, file_name), 'w') as out_file:
            if pretty:
                out_file.write(json.dumps(out_dict, sort_keys=True, indent=4))
            else:
                out_file.write(json.dumps(out_dict, sort_keys=True))
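A minimal sketch of the input shape `core_gf_json_report` expects (hypothetical GF identifiers and values; assumes `os`, `json` and `print_log_msg` are available at module level):

toy_gfs = {
    "GF000001": {"represented": True, "is_core_gf": True, "weight": "0.95",
                 "n_species": "12", "members": ["gene_a", "gene_b"]},
    "GF000002": {"represented": False, "is_core_gf": True, "weight": "0.80",
                 "n_species": "9", "members": ["gene_c"]},
}
core_gf_json_report(all_gfs_dict=toy_gfs, output_dir=".",
                    file_name="represented.json", is_represented=True, pretty=True)
# 'represented.json' now contains only GF000001, with weight/n_species/n_genes fields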
Example #3
def read_trapid_data(db_conn, experiment_id, top_hits, transcript_label=None):
    """Retrieve similarity search results of TRAPID experiment `experiment_id`, (optionally for transcripts labelled
    with `transcript_label`label), from the TRAPID database (through `db_conn`). Return data as a sorted and indexed
    pandas dataframe (keep only query, subject and e-value). `top_hits` is used to avoid retrieving more results than
    necessary. """
    print_log_msg(log_str='Retrieving similarity search data from TRAPID database.')
    sim_list = []
    if transcript_label not in [None, "None"]:  # Quick fix (ambiguity between None and the 'None' string).
        get_sim_data_query = "SELECT sim.transcript_id, UNCOMPRESS(sim.similarity_data) as `similarity_data`" \
                             "FROM similarities sim INNER JOIN transcripts_labels tl " \
                             "ON sim.transcript_id = tl.transcript_id " \
                             "WHERE sim.experiment_id = {experiment_id} " \
                             "AND tl.label = '{transcript_label}'".format(experiment_id=str(experiment_id),
                                                                          transcript_label=transcript_label)
    else:
        get_sim_data_query = "SELECT sim.transcript_id, UNCOMPRESS(sim.similarity_data) as `similarity_data` " \
                             "FROM similarities sim " \
                             "WHERE sim.experiment_id = {experiment_id}".format(experiment_id=str(experiment_id))
    # Execute query
    cursor = db_conn.cursor(MS.cursors.DictCursor)
    cursor.execute(get_sim_data_query)
    # Process output to get only information we want.
    for record in ResultIter(db_cursor=cursor):
        # Get a list of similarity search results for current query, formatted like:
        # [[query, subject_1, e-value_1], [query, subject_2, e-value_2], ...]
        sim_data = [[record['transcript_id'], data.split(',')[0], float(data.split(',')[1])] for data in record['similarity_data'].split(';')[0:top_hits]]
        sim_list.extend(sim_data)
    trapid_df = pd.DataFrame(sim_list, columns=["query_gene", "subject", "log_e_value"])
    # Convert e-values to log10 (they are stored as raw e-values in TRAPID's `similarities` table)
    trapid_df['log_e_value'] = [math.log10(e_val) if e_val > sys.float_info.min else math.log10(sys.float_info.min)
                                for e_val in trapid_df['log_e_value'].tolist()]
    # Sort, index and return
    trapid_df = trapid_df.sort_values(by=['query_gene', 'log_e_value'], ascending=[True, True])
    trapid_df = trapid_df.set_index(['query_gene'])
    return trapid_df
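A hedged sketch of calling `read_trapid_data`; it assumes `MS` is the MySQLdb module (inferred from `MS.cursors.DictCursor` above), that the module's `ResultIter` and `print_log_msg` helpers are available, and that the connection parameters are placeholders:

import MySQLdb as MS  # Assumed alias, based on the cursor class used above
conn = MS.connect(host="localhost", user="trapid_user", passwd="secret", db="trapid_db")
sim_df = read_trapid_data(db_conn=conn, experiment_id=42, top_hits=5)
print(sim_df.head())
conn.close()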
Example #4
def get_gene_gf_map(all_gfs_dict):
    """Map each gene identifier to a GF. Return a gene_id-gf dict, used to perform lookup after. """
    gene_gf_map = {}
    for gf in all_gfs_dict:
        for gene in all_gfs_dict[gf]['members']:
            gene_gf_map[gene] = gf
    # Return gene_gf_map
    print_log_msg(log_str=str(len(gene_gf_map)) + ' elements in Gene-GF map.')
    return gene_gf_map
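A worked example of `get_gene_gf_map` (only the 'members' key of each GF record is read; `print_log_msg` is assumed to be defined):

toy_gfs = {"GF000001": {"members": ["gene_a", "gene_b"]},
           "GF000002": {"members": ["gene_c"]}}
gene_gf_map = get_gene_gf_map(all_gfs_dict=toy_gfs)
assert gene_gf_map["gene_c"] == "GF000002"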
Example #5
def export_results_to_db(db_conn, output_dict):
    """Export core GF completeness analysis results to the TRAPID database. """
    print_log_msg(log_str='Exporting core GF completeness results to the TRAPID database.')
    cursor = db_conn.cursor()
    # Naive way to build the query (plain string interpolation); a parameterized variant is sketched below.
    columns = ', '.join(sorted(output_dict))
    values = ', '.join(["\'{insert_value}\'".format(insert_value=output_dict[k]) for k in sorted(output_dict)])
    export_query = "INSERT INTO completeness_results ({columns}) VALUES ({values})".format(columns=columns, values=values)
    cursor.execute(export_query)
    db_conn.commit()
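String-built INSERT statements break on values containing quotes. A safer variant under the same assumptions (a MySQLdb-style connection) would let the driver escape the values; this is a sketch, not the script's actual code, and column names still come from the (trusted) dictionary keys since identifiers cannot be parameterized:

def export_results_to_db_safe(db_conn, output_dict):
    """Same export, but with driver-side parameter escaping (sketch)."""
    keys = sorted(output_dict)
    columns = ', '.join(keys)
    placeholders = ', '.join(['%s'] * len(keys))
    export_query = "INSERT INTO completeness_results ({columns}) VALUES ({placeholders})".format(
        columns=columns, placeholders=placeholders)
    cursor = db_conn.cursor()
    cursor.execute(export_query, [output_dict[k] for k in keys])
    db_conn.commit()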
Example #6
def get_completeness_score(all_gfs_dict):
    """Compute the core GF completeness score. """
    print_log_msg(log_str='Calculating weighted core GF score.')
    total_weight = sum(
        float(all_gfs_dict[a]["weight"]) for a in all_gfs_dict
        if all_gfs_dict[a]['is_core_gf'])
    current_weight = sum(
        float(all_gfs_dict[a]["weight"]) for a in all_gfs_dict
        if all_gfs_dict[a]["represented"] and all_gfs_dict[a]['is_core_gf'])
    # print current_weight
    # print total_weight
    return current_weight / total_weight
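A worked example with two core GFs (weights 0.9 and 0.6, one represented) and one non-core GF that is ignored; `print_log_msg` is assumed to be defined:

toy_gfs = {
    "GF1": {"weight": "0.9", "is_core_gf": True, "represented": True},
    "GF2": {"weight": "0.6", "is_core_gf": True, "represented": False},
    "GF3": {"weight": "0.5", "is_core_gf": False, "represented": True},
}
score = get_completeness_score(all_gfs_dict=toy_gfs)
assert abs(score - 0.9 / 1.5) < 1e-9  # 0.6: GF1 is represented, GF1 + GF2 is the total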
Example #7
def completeness_report(all_gfs_dict, output_dir, file_name):
    """Generate a core GF completeness report. Output to `file_name` in `output_dir`. """
    print_log_msg(log_str='Generating core GF completeness report.')
    represented_gfs = [
        gf for gf in all_gfs_dict
        if all_gfs_dict[gf]['represented'] and all_gfs_dict[gf]['is_core_gf']
    ]
    missing_gfs = [
        gf for gf in all_gfs_dict if not all_gfs_dict[gf]['represented']
        and all_gfs_dict[gf]['is_core_gf']
    ]
    core_gfs = [gf for gf in all_gfs_dict if all_gfs_dict[gf]['is_core_gf']]
    # core_gfs_dict["HOM004486"]['not_chosen']=True  # Debug
    not_chosen_gfs = [
        gf for gf in all_gfs_dict if all_gfs_dict[gf]['not_chosen']
    ]
    not_chosen_missing_gfs = [
        gf for gf in all_gfs_dict if not all_gfs_dict[gf]['represented']
        and all_gfs_dict[gf]['not_chosen'] and all_gfs_dict[gf]['is_core_gf']
    ]
    completeness_score = get_completeness_score(all_gfs_dict=all_gfs_dict)
    # sys.exit()
    with open(os.path.join(output_dir, file_name), 'wb') as out_file:
        out_file.write('# Core GF completeness score:\t' +
                       "{:.5f}".format(completeness_score) + '\n')
        out_file.write('# Represented core gene families:\t' +
                       str(len(represented_gfs)) + '/' + str(len(core_gfs)) +
                       '\n')
        out_file.write('# Missing core gene families:\t' +
                       str(len(missing_gfs)) + '/' + str(len(core_gfs)) + '\n')
        out_file.write(
            '# In ' + str(len(not_chosen_gfs)) +
            ' cases, the core gene family associated with the top hit was not the one chosen as \'best\' gene family. \n'
        )
        out_file.write(
            '# ' + str(len(not_chosen_missing_gfs)) + ' of these ' +
            str(len(not_chosen_gfs)) +
            ' were core gene families, now considered missing: ' +
            ', '.join(not_chosen_missing_gfs) + '\n')
        out_file.write('\n')
        # out_file.write('missing_gf\tn_genes\n')
        out_file.write('missing_gf\tn_genes\tn_species\tgf_weight\n')
        for gf in missing_gfs:
            out_file.write('\t'.join([
                gf,
                str(len(all_gfs_dict[gf]['members'])),
                str(all_gfs_dict[gf]['n_species']),
                str(all_gfs_dict[gf]['weight'])
            ]) + '\n')
Example #8
def represented_core_gf_report(all_gfs_dict, output_dir, file_name):
    """Generate a tabulated file reporting represented core GFs and the list of corresponding similarity search queries. """
    print_log_msg(log_str='Generating represented core GF report.')
    represented_gfs = [
        gf for gf in all_gfs_dict
        if all_gfs_dict[gf]['represented'] and all_gfs_dict[gf]['is_core_gf']
    ]
    with open(os.path.join(output_dir, file_name), 'wb') as out_file:
        out_file.write(
            'represented_gf\tn_genes\tn_species\tgf_weight\tquery_list\n')
        for gf in represented_gfs:
            out_file.write('\t'.join([
                gf,
                str(len(all_gfs_dict[gf]['members'])),
                str(all_gfs_dict[gf]['n_species']),
                str(all_gfs_dict[gf]['weight']),
                ','.join(all_gfs_dict[gf]['query_list'])
            ]) + '\n')
Example #9
def read_all_gfs(core_gfs_file, gf_len):
    """Read a core GF tabulated file, return a dictionary of GFs. """
    print_log_msg(log_str='Reading all GFs from core GFs file (%s). ' %
                  core_gfs_file)
    all_gfs_dict = {}
    with open(core_gfs_file, 'r') as in_file:
        next(in_file)  # Skip header line
        if gf_len:
            for line in in_file:
                fields = line.strip().split('\t')
                all_gfs_dict[fields[0]] = {
                    'members': fields[-4].split('|'),
                    'n_species': fields[2],
                    'weight': fields[3],
                    'not_chosen': False,
                    'is_core_gf': fields[4] == 'True',
                    'represented': False,
                    'len_avg': float(fields[-3]),
                    'len_med': float(fields[-2]),
                    'len_stdev': float(fields[-1]),
                    'query_list': []
                }
        else:
            for line in in_file:
                fields = line.strip().split('\t')
                all_gfs_dict[fields[0]] = {
                    'members': fields[-1].split('|'),
                    'n_species': fields[2],
                    'weight': fields[3],
                    'not_chosen': False,
                    'is_core_gf': fields[4] == 'True',
                    'represented': False,
                    'query_list': []
                }
    n_gfs = len(all_gfs_dict)
    n_core_gfs = len(
        [gf for gf in all_gfs_dict if all_gfs_dict[gf]['is_core_gf']])
    print_log_msg(log_str=str(n_gfs) + ' gene families and ' +
                  str(n_core_gfs) + ' core gene families found.',
                  color="cyan")
    return all_gfs_dict
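The column layout of the core GFs file is implicit in the field indices above. This toy file reconstructs it for the `gf_len` mode (the second column is not read by this function, and the column names are inferred from the code rather than from a documented spec):

header = "gf_id\tcol2\tn_species\tweight\tis_core_gf\tmembers\tlen_avg\tlen_med\tlen_stdev\n"
row = "GF000001\tx\t12\t0.95\tTrue\tgene_a|gene_b\t350.0\t340.0\t25.0\n"
with open("toy_core_gfs.tsv", "w") as toy:
    toy.write(header + row)
toy_gfs = read_all_gfs(core_gfs_file="toy_core_gfs.tsv", gf_len=True)
assert toy_gfs["GF000001"]["members"] == ["gene_a", "gene_b"]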
Example #10
def read_blast_output(blast_output, raw_evalues=False, remove_self_hits=False):
    """Read a similarity search output file (`.m8` file), keeping only query, subject and e-value columns.
    Return it as sorted and indexed dataframe."""
    print_log_msg(log_str='Reading similarity search output file (%s)' %
                  blast_output)
    # Read the BLAST output and only keep the columns we're interested in. Then rename columns + set index (fast lookup)
    blast_df = pd.read_csv(blast_output,
                           sep='\t',
                           header=None,
                           comment='#',
                           usecols=[0, 1, 3, 10])
    blast_df = blast_df.rename(columns={
        0: "query_gene",
        1: "subject",
        3: "length",
        10: "log_e_value"
    })
    # Convert e-values to log10 if the '--raw' flag was provided (RapSearch2 already outputs log10 e-values)
    if raw_evalues:
        print_log_msg(
            log_str=
            'Converting e-values to log10 (\'--raw\' flag was provided).')
        blast_df['log_e_value'] = [
            math.log10(e_val)
            if e_val > sys.float_info.min else math.log10(sys.float_info.min)
            for e_val in blast_df['log_e_value'].tolist()
        ]
        # print min(blast_df['log_e_value'])  # Debug
    # Remove self hits if --no_self_hits flag is provided
    if remove_self_hits:
        print_log_msg(
            log_str='Removing self-hits (\'--no_self_hits\' flag was provided).'
        )
        # If we wanted to remove only self-hits that are the top hits... Does it really make sense?
        # first_hit_indices = {}
        # to_remove = []  # List of row indexes to drop
        # # Get 'top hits' indices
        # for index, query in enumerate(blast_df['query_gene'].tolist()):
        #     if query not in first_hit_indices:
        #         first_hit_indices[query] = index
        # # Check if the top hit is a self hit, add it to the list of indices to drop
        # for query,idx in first_hit_indices.items():
        #     if blast_df.iloc[idx]['query_gene'] == blast_df.iloc[idx]['subject']:
        #         to_remove.append(idx)
        # # Drop rows
        # blast_df = blast_df.drop(blast_df.index[to_remove])
        # Removing all self-hits
        blast_df = blast_df[blast_df['query_gene'] != blast_df['subject']]
    # Sort and index
    blast_df = blast_df.sort_values(by=['query_gene', 'log_e_value'],
                                    ascending=[True, True])
    blast_df = blast_df.set_index(['query_gene'])
    return blast_df
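A usage sketch with a hypothetical single-line `.m8` file (standard 12-column tabular BLAST output; only columns 0, 1, 3 and 10 are used); `pd`, `math` and `sys` are assumed to be imported at module level:

m8_line = "\t".join(["q1", "s1", "98.5", "120", "2", "0", "1", "120", "10", "129",
                     "1e-50", "200.0"]) + "\n"
with open("toy.m8", "w") as toy:
    toy.write(m8_line)
toy_df = read_blast_output(blast_output="toy.m8", raw_evalues=True)
assert abs(toy_df.loc["q1", "log_e_value"] + 50.0) < 1e-6  # log10(1e-50)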
Example #11
def main(core_gfs_file, blast_output, fasta_input, output_dir, top_hits,
         min_len, raw_evalues, naive_scoring, remove_self_hits, gf_len):
    """Script execution. """
    if not os.path.exists(output_dir):
        print_log_msg(log_str='Creating output directory \'%s\'.' % output_dir)
        os.makedirs(output_dir)
    else:
        print_log_msg(log_str='Output directory \'%s\' already exists.' %
                      output_dir)
    all_gfs = read_all_gfs(core_gfs_file=core_gfs_file, gf_len=gf_len)
    sqce_lengths = read_sqce_lengths(fasta_file=fasta_input)
    gene_gf_map = get_gene_gf_map(all_gfs_dict=all_gfs)
    blast_df = read_blast_output(blast_output=blast_output,
                                 raw_evalues=raw_evalues,
                                 remove_self_hits=remove_self_hits)
    process_blast_output(blast_df=blast_df,
                         n_hits=top_hits,
                         gene_gf_map=gene_gf_map,
                         output_dir=output_dir,
                         all_gfs_dict=all_gfs,
                         sqce_lengths=sqce_lengths,
                         gf_len=gf_len,
                         min_len=min_len,
                         naive_scoring=naive_scoring)
    completeness_report(all_gfs_dict=all_gfs,
                        output_dir=output_dir,
                        file_name="core_gf_completeness_report.tsv")
    represented_core_gf_report(all_gfs_dict=all_gfs,
                               output_dir=output_dir,
                               file_name="represented_core_gf_report.tsv")
    core_gf_json_report(all_gfs_dict=all_gfs,
                        output_dir=output_dir,
                        file_name="core_gf_report.represented.json",
                        is_represented=True)
    core_gf_json_report(all_gfs_dict=all_gfs,
                        output_dir=output_dir,
                        file_name="core_gf_report.missing.json",
                        is_represented=False)
    print_log_msg(log_str='Core GF completeness analysis finished!',
                  color="green")
Example #12
def process_blast_output(blast_df,
                         n_hits,
                         gene_gf_map,
                         all_gfs_dict,
                         sqce_lengths,
                         gf_len,
                         min_len,
                         output_dir,
                         naive_scoring=False):
    """Process the whole BLAST/RapSearch2 output.
    Will modify the dictionary passed as parameter to set to `True` families that are represented.
    For a query, we take the top `n_hits` from the results and check for GF members to assign a GF to each top hit. The
    best scoring GF is then considered represented. """
    query_coverage = {}
    filtered_log_file = {
        "gf_len": "gf_z_score_filtered_queries.txt",
        "min_len": "minimum_alignment_length_filtered_queries.txt"
    }
    # Get list of all queries
    all_queries = list(blast_df.index.unique())
    print_log_msg(log_str=str(len(all_queries)) +
                  ' queries in similarity search output.')
    if not naive_scoring:
        print_log_msg(
            log_str=
            'Warning: GF scoring will be normalized with GF weight (no `naive_scoring` flag provided).',
            color="orange")
    for query in all_queries:
        scored_gfs = {}  # Dict to store found GFs and associated score
        top_hits = blast_df.loc[query][0:n_hits]
        if not isinstance(top_hits, pd.DataFrame):
            # sys.stderr.write("Not a dataframe.\n")  # Debug
            top_hits = blast_df.loc[[query]][0:n_hits]
        # print top_hits.columns
        gfs = [
            gf_lookup(gene_id=subject, gene_gf_map=gene_gf_map)
            for subject in top_hits['subject']
        ]
        top_hit_gf = gfs[0]
        log_e_values = top_hits['log_e_value'].tolist()
        for gf, log_e_value in zip(gfs, log_e_values):
            if gf not in scored_gfs:
                scored_gfs[gf] = abs(log_e_value)
            else:
                scored_gfs[gf] += abs(log_e_value)
        # TODO: CLEAN THIS MESS OTHERWISE IT IS UNREADABLE
        # Remove 'None' BEFORE taking the 'max' (otherwise we underestimate the completeness...)
        if None in scored_gfs:
            # This should never happen: every subject should belong to a known GF
            print_log_msg(log_str='Warning: query %s has hits without GF assignment.' % query, color="red")
            del scored_gfs[None]
        # print scored_gfs

        # Weight scores by GF weight, unless `naive_scoring` was requested
        if not naive_scoring:
            for gf in scored_gfs:
                scored_gfs[gf] = scored_gfs[gf] * float(
                    all_gfs_dict[gf]['weight'])

        if scored_gfs:
            best_gf = max(scored_gfs, key=scored_gfs.get)

            # If the `--gf_len` flag is provided, determine if the query is partial.
            # If neither `gf_len` nor `min_len` was provided, this is equivalent to the legacy method (not caring about partial query sequences).
            partial = False
            if gf_len:
                # First, compare its length to the length of GF members. If the z-score is <= -2, the GF is not represented.
                # We can only do this when it makes sense: i.e. if the cutoff value is at least 15 AA and stdev != 0
                gf_len_cutoff = all_gfs_dict[best_gf][
                    'len_avg'] - 2 * all_gfs_dict[best_gf]['len_stdev']
                if gf_len_cutoff >= 15 and all_gfs_dict[best_gf][
                        'len_stdev'] != 0:
                    len_z_score = (sqce_lengths[top_hits.index[0]] -
                                   all_gfs_dict[best_gf]['len_avg']
                                   ) / all_gfs_dict[best_gf]['len_stdev']
                    if len_z_score <= -2:
                        partial = True
                        with open(
                                os.path.join(output_dir,
                                             filtered_log_file['gf_len']),
                                'a') as out_file:
                            out_file.write(
                                "%s flagged as partial (Z = %f, threshold = %f AA)\n"
                                % (top_hits.index[0], len_z_score,
                                   gf_len_cutoff))
            if min_len > 0:
                # Also use alignment length (mean of best GF alignments)
                query_len_cutoff = all_gfs_dict[best_gf][
                    'len_med'] * min_len  # Minimum required length in AA
                # print query_len_cutoff  # Debug
                # print all_gfs_dict[best_gf]['len_med']  # Debug
                # Length of all alignments corresponding to best GF hits
                query_lens = top_hits.iloc[[
                    idx for idx, gf in enumerate(gfs) if gf == best_gf
                ]]['length'].tolist()
                query_len_avg = float(sum(query_lens)) / max(
                    len(query_lens), 1)  # Average length in AA
                query_coverage[top_hits.index[
                    0]] = query_len_avg / all_gfs_dict[best_gf]['len_med']
                if query_len_avg < query_len_cutoff:
                    partial = True
                    with open(
                            os.path.join(output_dir,
                                         filtered_log_file['min_len']),
                            'a') as out_file:
                        out_file.write(
                            "%s flagged as partial (alignment length shorter than %f AA)\n"
                            % (top_hits.index[0], query_len_cutoff))

            if not partial:
                all_gfs_dict[best_gf]['represented'] = True
                all_gfs_dict[best_gf]['query_list'].append(query)

            if best_gf != top_hit_gf and top_hit_gf is not None:
                print_log_msg(
                    log_str='Warning: the GF of the top result for query ' +
                    query + ' (' + str(top_hit_gf) + '|is_core_gf=' +
                    str(all_gfs_dict[top_hit_gf]['is_core_gf']) + ', score=' +
                    str(scored_gfs[top_hit_gf]) +
                    ') was not the one assigned (' + str(best_gf) +
                    '|is_core_gf=' + str(all_gfs_dict[best_gf]['is_core_gf']) +
                    ', score=' + str(scored_gfs[best_gf]) + ').')
                # sys.stderr.write(str(scored_gfs)+'\n')
                all_gfs_dict[top_hit_gf]['not_chosen'] = True
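A self-contained worked example of the per-query scoring above: each GF accumulates the absolute log10 e-values of its hits, and weighting can change which GF wins (all numbers are hypothetical):

gfs = ["GF_A", "GF_B", "GF_A"]  # GFs of the three top hits for one query
log_e_values = [-80.0, -100.0, -20.0]
scored_gfs = {}
for gf, log_e_value in zip(gfs, log_e_values):
    scored_gfs[gf] = scored_gfs.get(gf, 0.0) + abs(log_e_value)
assert scored_gfs == {"GF_A": 100.0, "GF_B": 100.0}
# Unweighted, this is a tie; with GF weights (say GF_A: 0.9, GF_B: 1.0),
# GF_B wins: 100 * 0.9 = 90 < 100 * 1.0 = 100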