Beispiel #1
0
def plot_motif(motif_name, figsize, ylab='bits', information_content=True):
    """
    Draw a single motif from the loaded encode motifs.

    The motif called `motif_name` is looked up in `loaded_motifs`; the rows
    of its PWM are handed to `plot_pwm` together with `figsize`, `ylab` and
    `information_content`, and whatever `plot_pwm` returns is returned
    unchanged.
    """
    pwm = loaded_motifs.getPwm(motif_name)
    letter_heights = pwm.getRows()
    return plot_pwm(
        letter_heights,
        figsize,
        ylab=ylab,
        information_content=information_content,
    )
Beispiel #2
0
def get_motif_scores(encoded_sequences,
                     motif_names,
                     max_scores=None,
                     return_positions=False,
                     GC_fraction=0.4):
    """
    Score sequences against motifs as PWM log odds over a GC background.

    Parameters
    ----------
    encoded_sequences : 4darray
        Its last two axes are swapped before scoring.
    motif_names : list of strings
        Names looked up in `loaded_motifs`.
    max_scores : int, optional
        When given, only the `max_scores` highest scores per sample and
        motif are kept, in descending order.
    return_positions : boolean, optional
        Only used together with `max_scores`; appends the positions of the
        kept scores to the output.
    GC_fraction : float, optional
        Combined G+C frequency of the background model.

    Returns
    -------
    (num_samples, num_motifs, seq_length) complete score array by default.
    If max_scores, (num_samples, num_motifs*max_scores) max score array.
    If max_scores and return_positions, (num_samples, 2*num_motifs*max_scores)
    array with max scores and their positions.
    """
    sequences = np.transpose(encoded_sequences, (0, 1, 3, 2))
    n_samples, _, _, n_positions = sequences.shape
    n_motifs = len(motif_names)
    scores = np.ones((n_samples, n_motifs, n_positions))

    # Background column: (1 - GC) on the outer rows, GC on the inner rows,
    # halved below so that each column sums to one.
    background_column = [
        1 - GC_fraction, GC_fraction, GC_fraction, 1 - GC_fraction
    ]

    for motif_idx, name in enumerate(motif_names):
        motif_pwm = loaded_motifs.getPwm(name).getRows().T
        motif_log_pwm = np.log(motif_pwm)
        motif_width = len(motif_pwm[0])
        background_pwm = 0.5 * np.array(
            [background_column for _ in range(motif_width)]).T
        background_log_pwm = np.log(background_pwm)
        motif_part = get_pssm_scores(sequences, motif_log_pwm)
        background_part = get_pssm_scores(sequences, background_log_pwm)
        scores[:, motif_idx, :] = motif_part - background_part

    if max_scores is None:
        return scores

    flat_shape = (n_samples, n_motifs * max_scores)
    # Ascending sort along the last axis, reversed, then truncated.
    top_scores = np.sort(scores)[..., ::-1][..., :max_scores]
    if not return_positions:
        return top_scores.reshape(flat_shape)

    top_positions = scores.argsort()[..., ::-1][..., :max_scores]
    return np.concatenate(
        (top_scores.reshape(flat_shape), top_positions.reshape(flat_shape)),
        axis=1)
Beispiel #3
0
def get_motif_scores(encoded_sequences,
                     motif_names,
                     max_scores=None,
                     return_positions=False,
                     GC_fraction=0.4):
  """
  Score sequences against motifs as PWM log odds over a GC background.

  Parameters
  ----------
  encoded_sequences : 4darray
      The size of its last axis is used as the sequence length.
  motif_names : list of strings
      Names looked up in `loaded_motifs`.
  max_scores : int, optional
      When given, only the `max_scores` highest scores per sample and
      motif are kept, in descending order.
  return_positions : boolean, optional
      Only used together with `max_scores`; appends the positions of the
      kept scores to the output.
  GC_fraction : float, optional
      Combined G+C frequency of the background model.

  Returns
  -------
  (num_samples, num_motifs, seq_length) complete score array by default.
  If max_scores, (num_samples, num_motifs*max_scores) max score array.
  If max_scores and return_positions, (num_samples, 2*num_motifs*max_scores)
  array with max scores and their positions.
  """
  n_samples, _, _, n_positions = encoded_sequences.shape
  n_motifs = len(motif_names)
  scores = np.ones((n_samples, n_motifs, n_positions))

  # Background column: (1 - GC) on the outer rows, GC on the inner rows,
  # halved below so that each column sums to one.
  background_column = [
      1 - GC_fraction, GC_fraction, GC_fraction, 1 - GC_fraction
  ]

  for motif_idx, name in enumerate(motif_names):
    motif_pwm = loaded_motifs.getPwm(name).getRows().T
    motif_log_pwm = np.log(motif_pwm)
    motif_width = len(motif_pwm[0])
    background_pwm = 0.5 * np.array(
        [background_column for _ in range(motif_width)]).T
    background_log_pwm = np.log(background_pwm)
    motif_part = get_pssm_scores(encoded_sequences, motif_log_pwm)
    background_part = get_pssm_scores(encoded_sequences, background_log_pwm)
    scores[:, motif_idx, :] = motif_part - background_part

  if max_scores is None:
    return scores

  flat_shape = (n_samples, n_motifs * max_scores)
  # Ascending sort along the last axis, reversed, then truncated.
  top_scores = np.sort(scores)[..., ::-1][..., :max_scores]
  if not return_positions:
    return top_scores.reshape(flat_shape)

  top_positions = scores.argsort()[..., ::-1][..., :max_scores]
  return np.concatenate(
      (top_scores.reshape(flat_shape), top_positions.reshape(flat_shape)),
      axis=1)
Beispiel #4
0
def get_motif_scores(encoded_sequences,
                     motif_names,
                     max_scores=None,
                     return_positions=False,
                     GC_fraction=0.4,
                     pfm=None,
                     log_pfm=None,
                     include_rc=True):
    """
    Computes pfm log odds.

    Each motif is scored against every sequence with `get_pssm_scores`, and
    the score of a GC-content background model of the same width is
    subtracted.

    Parameters
    ----------
    encoded_sequences : 4darray
        Its last two axes are swapped before scoring.
    motif_names : list of strings
        One output row is produced per name. When neither `pfm` nor
        `log_pfm` is given, each name is looked up in `loaded_motifs`.
    max_scores : int, optional
        When given, only the `max_scores` highest scores per sample and
        motif are kept, in descending order.
    return_positions : boolean, optional
        Only used together with `max_scores`; appends the positions of the
        kept scores to the output.
    GC_fraction : float, optional
        Combined G+C frequency of the background model.
    pfm : position frequency matrix for the motif, optional
        Used (after taking its log) for every name in `motif_names` instead
        of the loaded motifs. Ignored when `log_pfm` is given.
    log_pfm : log(pfm), optional
        This is the format that HOCOMOCO provides in their PFM download
        links. Used as-is for every name in `motif_names`.
    include_rc : boolean, optional
        Indicates whether both the forward strand and the reverse
        complement of the motif should be used (default True); forwarded to
        `get_pssm_scores`.

    Returns
    -------
    (num_samples, num_motifs, seq_length) complete score array by default.
    If max_scores, (num_samples, num_motifs*max_scores) max score array.
    If max_scores and return_positions, (num_samples, 2*num_motifs*max_scores)
    array with max scores and their positions.
    """
    encoded_sequences = np.transpose(encoded_sequences, (0, 1, 3, 2))
    num_samples, _, _, seq_length = encoded_sequences.shape
    num_motifs = len(motif_names)
    scores = np.ones((num_samples, num_motifs, seq_length))

    # A caller-supplied matrix applies to every motif name; log_pfm takes
    # precedence over pfm. It is kept in its own variable so the parameters
    # are never overwritten: assigning to them inside the loop would make
    # every motif after the first silently reuse the first motif's matrix.
    shared_log_pfm = log_pfm
    if shared_log_pfm is None and pfm is not None:
        shared_log_pfm = np.log(pfm)

    background_column = [
        1 - GC_fraction, GC_fraction, GC_fraction, 1 - GC_fraction
    ]

    for j, motif_name in enumerate(motif_names):
        if shared_log_pfm is None:
            # No matrix supplied: load this particular motif.
            motif_pfm = loaded_motifs.getPwm(motif_name).getRows().T
            motif_log_pfm = np.log(motif_pfm)
        else:
            motif_log_pfm = shared_log_pfm

        # Background pfm based on the GC fraction, as wide as the motif.
        # NOTE(review): the column order presumably matches an A, C, G, T
        # row layout -- confirm against get_pssm_scores.
        background_pfm = 0.5 * np.array(
            [background_column] * len(motif_log_pfm[0])).T
        background_log_pfm = np.log(background_pfm)

        motif_scores = get_pssm_scores(
            encoded_sequences, motif_log_pfm, include_rc=include_rc)
        background_scores = get_pssm_scores(
            encoded_sequences, background_log_pfm, include_rc=include_rc)
        scores[:, j, :] = motif_scores - background_scores

    if max_scores is None:
        return scores

    flat_shape = (num_samples, num_motifs * max_scores)
    # Ascending sort along the last axis, reversed, then truncated.
    sorted_scores = np.sort(scores)[:, :, ::-1][:, :, :max_scores]
    if not return_positions:
        return sorted_scores.reshape(flat_shape)

    sorted_positions = scores.argsort()[:, :, ::-1][:, :, :max_scores]
    return np.concatenate(
        (sorted_scores.reshape(flat_shape),
         sorted_positions.reshape(flat_shape)),
        axis=1)