def generic_merge(splicegraph_gff_fname, gff_fname, output_gff_fname,
                  genome, exon_id_suffix,
                  coords_diff_cutoff=10,
                  SE_merge=False):
    """
    Merge SpliceGraph events into a new GFF annotation.

    Exons whose ID ends with 'exon_id_suffix' are taken from both
    GFF files and intersected (bedtools intersect with wao=True,
    s=True). The last field of every intersected record is read as
    the overlap size. A SpliceGraph trio is kept for the merged
    annotation when the largest overlap of one of its exons is
    below 'coords_diff_cutoff'.

    Parameters:
      - splicegraph_gff_fname: SpliceGraph GFF filename
      - gff_fname: GFF filename of the new annotation
      - output_gff_fname: filename passed on to
        output_combined_gff_events
      - genome: passed through to output_combined_gff_events
      - exon_id_suffix: ID suffix selecting the exons to compare
      - coords_diff_cutoff: overlaps below this value do not count
        as redundant
      - SE_merge: accepted but currently not used by this function

    Returns nothing; the merged events are written out by
    output_combined_gff_events.
    """
    def has_exon_suffix(feature):
        # Select only the exons with the requested ID suffix
        return feature.attrs["ID"].endswith(exon_id_suffix)

    # Load SpliceGraph exons
    splicegraph_in = pybedtools.BedTool(splicegraph_gff_fname)
    splicegraph_exons = splicegraph_in.filter(has_exon_suffix)
    # New annotation's exons
    new_in = pybedtools.BedTool(gff_fname)
    new_exons = new_in.filter(has_exon_suffix)
    # Intersect splicegraph exons with new exons
    intersected_gff = splicegraph_exons.intersect(new_exons,
                                                  wao=True,
                                                  s=True)
    # Compile, for each exon ID, the (overlap, record) pairs
    # reported by the intersection
    exons_to_overlaps = defaultdict(list)
    for exon in intersected_gff:
        curr_overlap = int(exon.fields[-1])
        exons_to_overlaps[exon.attrs["ID"]].append((curr_overlap, exon))
    # Names of SpliceGraph event trios to include in merged annotation
    splicegraph_trios_to_add = []
    # Mapping from potentially redundant SpliceGraph trios to the
    # trios they overlap with.
    # NOTE(review): this mapping is collected but not consumed
    # anywhere in this function (no SE-specific merge is done yet).
    sg_redundant_trios = defaultdict(list)
    for exon_id, overlap_records in exons_to_overlaps.items():
        # Trio ID is the exon ID minus its last two "."-separated parts
        trio_id = exon_id.rsplit(".", 2)[0]
        overlaps = [record[0] for record in overlap_records]
        overlapping_trios = \
            [record[1].attrs["ID"].rsplit(".", 2)[0]
             for record in overlap_records]
        # Get maximum overlap
        max_overlap = utils.max_item(overlaps)[1]
        if max_overlap < coords_diff_cutoff:
            splicegraph_trios_to_add.append(trio_id)
            continue
        # Collect potentially redundant trios, skipping identical
        # ones. Each overlapping trio is recorded exactly once per
        # record: the previous code appended the last one a second
        # time after the loop, even when it was the identical trio.
        for new_overlapping_trio_id in overlapping_trios:
            if new_overlapping_trio_id != trio_id:
                sg_redundant_trios[trio_id].append(new_overlapping_trio_id)
    num_sg_trios = len(splicegraph_trios_to_add)
    # Single parenthesized argument: prints the same text under
    # Python 2 and Python 3
    print("Added %d trios from SpliceGraph" % (num_sg_trios))
    output_combined_gff_events(splicegraph_gff_fname,
                               splicegraph_trios_to_add,
                               gff_fname,
                               output_gff_fname,
                               genome)
def generic_merge(splicegraph_gff_fname,
                  gff_fname,
                  output_gff_fname,
                  genome,
                  exon_id_suffix,
                  coords_diff_cutoff=10,
                  SE_merge=False):
    """
    Add non-redundant SpliceGraph trios to a new GFF annotation.

    Only exons whose ID ends with 'exon_id_suffix' are compared.
    SpliceGraph exons are intersected with the new annotation's
    exons (bedtools intersect, wao=True and s=True) and the final
    field of each resulting record is used as the overlap size.
    When the maximum overlap found for a SpliceGraph exon is less
    than 'coords_diff_cutoff', its trio is added to the merged
    annotation.

    'SE_merge' is accepted for interface compatibility but is not
    used here. 'genome' and 'output_gff_fname' are handed to
    output_combined_gff_events, which produces the output; this
    function itself returns nothing.
    """
    suffix_filter = lambda x: x.attrs["ID"].endswith(exon_id_suffix)
    # Load SpliceGraph exons
    splicegraph_in = pybedtools.BedTool(splicegraph_gff_fname)
    splicegraph_exons = splicegraph_in.filter(suffix_filter)
    # New annotation's exons
    new_in = pybedtools.BedTool(gff_fname)
    new_exons = new_in.filter(suffix_filter)
    # Intersect splicegraph exons with new exons
    intersected_gff = splicegraph_exons.intersect(new_exons, wao=True, s=True)
    # Group (overlap size, intersected record) pairs by exon ID
    exons_to_overlaps = defaultdict(list)
    for exon in intersected_gff:
        exons_to_overlaps[exon.attrs["ID"]].append(
            (int(exon.fields[-1]), exon))
    # Names of SpliceGraph event trios to include in merged annotation
    splicegraph_trios_to_add = []
    # Potentially redundant SpliceGraph trios mapped to the trios
    # they overlap with.
    # NOTE(review): nothing below reads this mapping; the
    # SE-specific merge it was meant for is not implemented here.
    sg_redundant_trios = defaultdict(list)
    for exon_id in exons_to_overlaps:
        curr_records = exons_to_overlaps[exon_id]
        # Strip the last two "."-separated fields to get the trio ID
        trio_id = exon_id.rsplit(".", 2)[0]
        overlaps = [overlap for overlap, _ in curr_records]
        max_ind, max_overlap = utils.max_item(overlaps)
        # Computed for every exon (as before), so a record without
        # an "ID" attribute still fails the same way
        overlapping_trios = \
            [record.attrs["ID"].rsplit(".", 2)[0]
             for _, record in curr_records]
        if max_overlap < coords_diff_cutoff:
            splicegraph_trios_to_add.append(trio_id)
        else:
            # Record every overlapping trio that differs from the
            # current one. The old trailing append after this loop
            # has been dropped: it duplicated the last trio seen
            # and could add the identical trio that the loop skips.
            sg_redundant_trios[trio_id].extend(
                [other_trio_id for other_trio_id in overlapping_trios
                 if other_trio_id != trio_id])
    num_sg_trios = len(splicegraph_trios_to_add)
    # Parenthesized single argument works on Python 2 and 3 alike
    print("Added %d trios from SpliceGraph" % (num_sg_trios))
    output_combined_gff_events(splicegraph_gff_fname, splicegraph_trios_to_add,
                               gff_fname, output_gff_fname, genome)
Ejemplo n.º 3
0
 def obs_over_exp_counts_dinuc(self, subseqs, max_seqs=100):
     """
     Get observed over expected ratio of counts (non-log!) of
     subsequences in all sequences.

     Each item of self.seqs is read as a (header, sequence) pair:
     the header's first character is dropped to form the sequence
     name, and the sequence is passed to self.count_subseqs
     together with 'subseqs'.

     Parameters:
       - subseqs: subsequences handed to self.count_subseqs
       - max_seqs: stop after this many sequences have been
         processed (default 100, the previously hard-coded
         limit). The limit is checked after each sequence. Pass
         None to process every sequence.

     Returns a pandas DataFrame indexed by "header" with columns
     max_ratio, max_ratio_obs_count, obs_counts, exp_counts and
     ratios (the last three as comma-joined strings), sorted by
     max_ratio in descending order.
     """
     entries = []
     t1 = time.time()
     num_seqs = 0
     for curr_seq in self.seqs:
         # Sequence name is FASTA header without leading '>'
         seq_name = curr_seq[0][1:]
         # Get observed and expected counts for all subseqs
         # in the current sequence
         obs_counts, exp_counts = self.count_subseqs(curr_seq[1], subseqs)
         # Calculate ratios
         ratios = obs_counts / exp_counts
         # The maximum ratio and its position
         max_ratio_indx, max_ratio = utils.max_item(ratios)
         # Collect raw counts and ratio in order:
         # sequence name, max ratio, observed count of the kmer
         # with the highest ratio, obs counts, exp counts,
         # obs / exp ratios
         entries.append([
             seq_name,
             max_ratio,
             int(obs_counts[max_ratio_indx]),
             ",".join(["%d" % (int(oc)) for oc in obs_counts]),
             ",".join(["%.2f" % (ec) for ec in exp_counts]),
             ",".join(["%.3f" % (r) for r in ratios]),
         ])
         num_seqs += 1
         if max_seqs is not None and num_seqs >= max_seqs:
             print("Quitting early!")
             print("=" * 10)
             break
     t2 = time.time()
     print("Counting occurrences in %d sequences took %.2f seconds"
           % (num_seqs, (t2 - t1)))
     col_names = [
         "header", "max_ratio", "max_ratio_obs_count", "obs_counts",
         "exp_counts", "ratios"
     ]
     # Build the frame straight from the row lists. Going through
     # np.array() first turned every cell (including max_ratio)
     # into a string, which made the sort below lexicographic
     # instead of numeric.
     entries = pandas.DataFrame(entries,
                                columns=col_names).set_index("header")
     # Sort in descending order
     if hasattr(entries, "sort_values"):
         entries.sort_values(by=["max_ratio"], ascending=False,
                             inplace=True)
     else:
         # Older pandas releases only provide DataFrame.sort
         entries.sort(column=["max_ratio"], ascending=False, inplace=True)
     return entries
Ejemplo n.º 4
0
 def obs_over_exp_counts_dinuc(self, subseqs, max_seqs=100):
     """
     Get observed over expected ratio of counts (non-log!) of
     subsequences in all sequences.

     Iterates over self.seqs, treating item [0] as the header
     (its first character is stripped to get the sequence name)
     and item [1] as the sequence given to self.count_subseqs.

     Parameters:
       - subseqs: subsequences to count, forwarded to
         self.count_subseqs
       - max_seqs: number of processed sequences after which the
         loop quits early; defaults to 100 (the former fixed
         cutoff), None disables the cutoff

     Returns a pandas DataFrame indexed by "header", sorted by
     the "max_ratio" column from highest to lowest. The
     "obs_counts", "exp_counts" and "ratios" columns hold
     comma-joined strings.
     """
     col_names = ["header", "max_ratio", "max_ratio_obs_count",
                  "obs_counts", "exp_counts", "ratios"]
     rows = []
     num_seqs = 0
     t1 = time.time()
     for curr_seq in self.seqs:
         # Sequence name is FASTA header without leading '>'
         seq_name = curr_seq[0][1:]
         # Observed and expected counts for all subseqs in the
         # current sequence
         obs_counts, exp_counts = self.count_subseqs(curr_seq[1], subseqs)
         ratios = obs_counts / exp_counts
         ratios_str = ",".join(["%.3f" % (r) for r in ratios])
         # Position and value of the maximum ratio
         max_ratio_indx, max_ratio = utils.max_item(ratios)
         # Observed count of the kmer with the highest ratio
         max_ratio_obs_count = int(obs_counts[max_ratio_indx])
         obs_counts_str = ",".join(["%d" % (int(oc)) for oc in obs_counts])
         exp_counts_str = ",".join(["%.2f" % (ec) for ec in exp_counts])
         # Same order as col_names
         rows.append([seq_name, max_ratio, max_ratio_obs_count,
                      obs_counts_str, exp_counts_str, ratios_str])
         num_seqs += 1
         if max_seqs is not None and num_seqs >= max_seqs:
             print("Quitting early!")
             print("=" * 10)
             break
     t2 = time.time()
     print("Counting occurrences in %d sequences took %.2f seconds"
           % (num_seqs, (t2 - t1)))
     # Construct the DataFrame from the list of rows rather than
     # from np.array(rows): the array conversion coerces mixed
     # string/number rows to strings, so max_ratio was being
     # sorted as text.
     entries = pandas.DataFrame(rows, columns=col_names)
     entries = entries.set_index("header")
     # Sort in descending order; DataFrame.sort only exists in
     # old pandas releases, so prefer sort_values when available
     if hasattr(entries, "sort_values"):
         entries.sort_values(by=["max_ratio"], ascending=False,
                             inplace=True)
     else:
         entries.sort(column=["max_ratio"], ascending=False, inplace=True)
     return entries
Ejemplo n.º 5
0
def output_utr_table(tables_dir,
                     utr_gff_fname,
                     output_dir,
                     choice_rule="longest"):
    """
    Output a UTR table (one UTR per gene) given a
    UTR GFF file. Rule for choosing the UTR ('choice_rule'):

      - longest, uses longest UTR

    Only "longest" is implemented; any other rule raises an
    Exception as soon as a gene has to be resolved.

    Parameters:
      - tables_dir: directory containing
        ensGene.kgXref.combined.txt, whose "name" / "name2"
        columns are used to map transcripts to genes
      - utr_gff_fname: input UTR GFF; each entry's "Parent"
        attribute is looked up as a transcript in that table
      - output_dir: directory for the output (created through
        utils.make_dir)
      - choice_rule: see above

    Outputs a GFF file named after the input's basename and
    returns its filename. Selected entries get "gene_id" and
    "region_len" attributes added.

    Raises Exception if the input GFF does not exist or the
    choice rule is unsupported, and KeyError if a UTR's parent
    transcript is missing from the table.
    """
    print("Outputting UTR table from %s" % (utr_gff_fname))
    output_basename = os.path.basename(utr_gff_fname).rsplit(".", 1)[0]
    utils.make_dir(output_dir)
    output_fname = os.path.join(output_dir, "%s.gff" % (output_basename))
    print("  - Output file: %s" % (output_fname))
    # NOTE(review): if utr_gff_fname already lives in output_dir
    # with a .gff extension, output_fname is the input file itself.
    if not os.path.isfile(utr_gff_fname):
        raise Exception("Cannot find %s" % (utr_gff_fname))
    # Load table and map transcripts to genes (for a repeated
    # transcript the last row wins, as with row-by-row assignment)
    table_fname = os.path.join(tables_dir, "ensGene.kgXref.combined.txt")
    table_df = pandas.read_table(table_fname, sep="\t")
    trans_to_gene = dict(zip(table_df["name"], table_df["name2"]))
    # Mapping from gene ID to a dictionary mapping each
    # UTR to its length
    genes_to_utr_lens = defaultdict(dict)
    print("Computing lengths of UTRs..")
    # Compute lengths of UTRs for each gene
    for entry in pybedtools.BedTool(utr_gff_fname):
        # Gene of the transcript that the UTR belongs to
        gene_id = trans_to_gene[entry.attrs["Parent"]]
        genes_to_utr_lens[gene_id][entry.attrs["ID"]] = len(entry)
    # Select one (UTR id, length) pair for each gene
    gene_to_chosen_utr = {}
    for gene, utr_lens_by_id in genes_to_utr_lens.items():
        if choice_rule != "longest":
            raise Exception("Unsupported choice rule %s" % (choice_rule))
        # list() so the pairs can be indexed on Python 3 as well
        all_utrs = list(utr_lens_by_id.items())
        utr_lens = [curr_utr[1] for curr_utr in all_utrs]
        utr_indx = utils.max_item(utr_lens)[0]
        gene_to_chosen_utr[gene] = all_utrs[utr_indx]
    # Now select the relevant entries for outputting. Also
    # add relevant information about genes/length
    gff_utrs = pybedtools.BedTool(utr_gff_fname)
    # 'with' closes the output file even if a lookup below fails
    with open(output_fname, "w") as gff_out:
        for entry in gff_utrs:
            # Gene of the current UTR's transcript
            curr_utr_gene = trans_to_gene[entry.attrs["Parent"]]
            chosen_utr_id, chosen_utr_len = gene_to_chosen_utr[curr_utr_gene]
            # If this UTR is the chosen UTR, output it
            if chosen_utr_id == entry.attrs["ID"]:
                entry.attrs["gene_id"] = curr_utr_gene
                entry.attrs["region_len"] = str(chosen_utr_len)
                gff_out.write(str(entry))
    return output_fname
Ejemplo n.º 6
0
def output_utr_table(tables_dir,
                     utr_gff_fname,
                     output_dir,
                     choice_rule="longest"):
    """
    Output a UTR table (one UTR per gene) given a
    UTR GFF file.

    'choice_rule' selects which UTR represents a gene. The only
    rule implemented is:

      - longest, uses longest UTR

    Other values raise an Exception when the first gene is
    resolved.

    The transcript-to-gene mapping is read from the "name" and
    "name2" columns of ensGene.kgXref.combined.txt in
    'tables_dir'. Each GFF entry's "Parent" attribute must be a
    transcript present in that table, otherwise a KeyError is
    raised. An Exception is also raised when 'utr_gff_fname'
    does not exist.

    Outputs a GFF file into 'output_dir' (created through
    utils.make_dir), named after the input file's basename, in
    which the chosen entries carry extra "gene_id" and
    "region_len" attributes. Returns the output filename.
    """
    print("Outputting UTR table from %s" % (utr_gff_fname))
    output_basename = os.path.basename(utr_gff_fname).rsplit(".", 1)[0]
    utils.make_dir(output_dir)
    output_fname = os.path.join(output_dir, "%s.gff" % (output_basename))
    print("  - Output file: %s" % (output_fname))
    if not os.path.isfile(utr_gff_fname):
        raise Exception("Cannot find %s" % (utr_gff_fname))
    # Load table
    table_fname = os.path.join(tables_dir, "ensGene.kgXref.combined.txt")
    table_df = pandas.read_table(table_fname, sep="\t")
    # Map transcripts to genes in one pass over the two columns;
    # later rows override earlier ones for a repeated transcript
    trans_to_gene = dict(zip(table_df["name"], table_df["name2"]))
    # Mapping from gene ID to a dictionary mapping each
    # UTR to its length
    genes_to_utr_lens = defaultdict(dict)
    print("Computing lengths of UTRs..")
    gff_utrs = pybedtools.BedTool(utr_gff_fname)
    # Compute lengths of UTRs for each gene
    for entry in gff_utrs:
        # Transcript that the UTR belongs to
        trans_id = entry.attrs["Parent"]
        # UTR id
        utr_id = entry.attrs["ID"]
        # Gene the transcript corresponds to
        gene_id = trans_to_gene[trans_id]
        genes_to_utr_lens[gene_id][utr_id] = len(entry)
    # Select UTR for each gene; values are (UTR id, length) pairs
    gene_to_chosen_utr = {}
    for gene in genes_to_utr_lens:
        # Materialize the items so they can be indexed on both
        # Python 2 and Python 3
        all_utrs = list(genes_to_utr_lens[gene].items())
        utr_lens = [curr_utr[1] for curr_utr in all_utrs]
        if choice_rule == "longest":
            utr_indx = utils.max_item(utr_lens)[0]
            gene_to_chosen_utr[gene] = all_utrs[utr_indx]
        else:
            raise Exception("Unsupported choice rule %s" % (choice_rule))
    # Now select the relevant entries for outputting. Also
    # add relevant information about genes/length
    gff_utrs = pybedtools.BedTool(utr_gff_fname)
    # Context manager guarantees the output handle is closed even
    # when an entry lookup raises part-way through
    with open(output_fname, "w") as gff_out:
        for entry in gff_utrs:
            # Current UTR id
            curr_utr_id = entry.attrs["ID"]
            # Gene of the current UTR's transcript
            curr_utr_gene = trans_to_gene[entry.attrs["Parent"]]
            chosen_utr = gene_to_chosen_utr[curr_utr_gene]
            # Skip UTRs that were not chosen for their gene
            if chosen_utr[0] != curr_utr_id:
                continue
            entry.attrs["gene_id"] = curr_utr_gene
            entry.attrs["region_len"] = str(chosen_utr[1])
            gff_out.write(str(entry))
    return output_fname