Example #1
def dataSubset(fittingData, numDatapoints, seed=345, maxNumIndepParams=None):
    """
    By default, add one timepoint for each independent parameter first,
    then increase the number of timepoints per independent parameter.
    Timepoints are added randomly for each independent parameter.
    Independent parameters are added in the order of indepParamsList.
    """
    scipy.random.seed(seed)
    subset = []
    numIndepParams = len(fittingData)
    if maxNumIndepParams is None: maxNumIndepParams = numIndepParams
    numDatapoints = int(numDatapoints)
    for i in range(min(numDatapoints, maxNumIndepParams)):
        varNames = scipy.sort(fittingData[i].keys())
        allTimes = scipy.sort(fittingData[i][varNames[0]].keys())

        possibleIndices = range(len(allTimes))
        scipy.random.shuffle(possibleIndices)

        N = numDatapoints / maxNumIndepParams
        if i < numDatapoints % maxNumIndepParams: N += 1
        timeIndices = possibleIndices[:N]
        times = scipy.array(allTimes)[timeIndices]

        s = {}
        for var in varNames:
            s[var] = dict([(t, fittingData[i][var][t]) for t in times])
        subset.append(s)

    return subset
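A minimal usage sketch (hypothetical data; assumes Python 2, where dict.keys() returns a sortable list as the code above relies on): fittingData maps each condition to a dict of variable -> {time: value}, and requesting 4 datapoints over 2 conditions gives each condition 2 randomly chosen timepoints.

import scipy

fittingData = [
    {'x': {0.0: 1.2, 1.0: 1.5, 2.0: 1.9},
     'y': {0.0: 0.3, 1.0: 0.4, 2.0: 0.6}},
    {'x': {0.0: 2.2, 1.0: 2.4, 2.0: 2.8},
     'y': {0.0: 0.9, 1.0: 1.1, 2.0: 1.3}},
]
subset = dataSubset(fittingData, 4, seed=0)  # each condition keeps 2 of its 3 timepoints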
Example #2
    def check( x ):
        y = sl.canonicalise( x )
        yr = y[0,:]
        yc = y[:,0]

        assert all( yr == sc.sort( yr ) )
        assert all( yc == sc.sort( yc ) )
def thresholdFromNumNonzero(mat,
                            numNonzero,
                            sym=False,
                            useAbs=True,
                            aboveDiagOnly=False):
    """
    Things get complicated if the matrix elements are not all distinct...
    
    sym:        If True, treat the matrix as symmetric, and count only 
                nonzero elements at or below the diagonal.
    """
    if sym:
        mat = scipy.tri(len(mat)) * mat
    if useAbs: absMat = abs(mat)
    else: absMat = mat
    if not aboveDiagOnly:
        flatAbsMat = scipy.sort(arrayFlatten(absMat))[::-1]
    else:
        flatAbsMat = scipy.sort(aboveDiagFlat(absMat))[::-1]
    if numNonzero < 1:
        return scipy.inf
    elif numNonzero == len(flatAbsMat):
        if useAbs: return 0.
        else: return flatAbsMat[-1]
    elif numNonzero > len(flatAbsMat):
        raise Exception("Desired numNonzero > number of matrix elements.")
    return scipy.mean([flatAbsMat[numNonzero], flatAbsMat[numNonzero - 1]])
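For orientation, a self-contained sketch of the thresholding idea in plain numpy (not the original arrayFlatten/aboveDiagFlat helpers): the cutoff sits halfway between the numNonzero-th and (numNonzero+1)-th largest absolute values, so exactly numNonzero entries exceed it.

import numpy as np

mat = np.array([[0.1, -2.0, 0.5],
                [1.5,  0.0, -0.3],
                [0.7,  0.2,  3.0]])
numNonzero = 4
flat = np.sort(np.abs(mat).ravel())[::-1]              # absolute values, descending
threshold = 0.5 * (flat[numNonzero] + flat[numNonzero - 1])
print((np.abs(mat) > threshold).sum())                 # -> 4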
Example #5
def quantify_intron_retention(event, gene, counts_segments, counts_edges, counts_seg_pos):

    cov = sp.zeros((2, ), dtype='float')
    sg = gene.splicegraph
    segs = gene.segmentgraph

    seg_lens = segs.segments[1, :] - segs.segments[0, :]
    seg_shape = segs.seg_edges.shape
    order = 'C'
    offset = 0

    ### find exons corresponding to event
    idx_exon1  = sp.where((sg.vertices[0, :] == event.exons1[0, 0]) & (sg.vertices[1, :] == event.exons1[0, 1]))[0]
    idx_exon2  = sp.where((sg.vertices[0, :] == event.exons1[1, 0]) & (sg.vertices[1, :] == event.exons1[1, 1]))[0]

    ### find segments corresponding to exons
    seg_exon1 = sp.sort(sp.where(segs.seg_match[idx_exon1, :])[1])
    seg_exon2 = sp.sort(sp.where(segs.seg_match[idx_exon2, :])[1])
    seg_all = sp.arange(seg_exon1[0], seg_exon2[-1])

    seg_intron = sp.setdiff1d(seg_all, seg_exon1)
    seg_intron = sp.setdiff1d(seg_intron, seg_exon2)
    assert(seg_intron.shape[0] > 0)

    ### compute exon coverages as mean of position wise coverage
    # intron_cov
    cov[0] = sp.sum(counts_segments[seg_intron] * seg_lens[seg_intron]) / sp.sum(seg_lens[seg_intron])

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # intron conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon1[-1], seg_exon2[0]], seg_shape, order=order) + offset)[0]
    cov[1] = counts_edges[idx, 1]

    return cov
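The intron coverage above is a length-weighted mean of per-segment coverages; a tiny numeric sketch with made-up numbers:

import numpy as np

counts_segments = np.array([10.0, 2.0, 4.0])   # mean per-position coverage of each segment
seg_lens        = np.array([100,   50,  150])  # segment lengths
seg_intron      = np.array([1, 2])             # indices of the intronic segments
cov0 = np.sum(counts_segments[seg_intron] * seg_lens[seg_intron]) / np.sum(seg_lens[seg_intron])
# (2*50 + 4*150) / (50 + 150) = 3.5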
def plot_tuning_curves(data_flags):

    # First entries are for mu_dSs, second are for tuning_width
    #plot_vars = [[0, 9, 18], [7, 11, 19]]
    plot_vars = [[0, 1, 2], [0, 1, 2]]
    cmaps = [[cm.Greys, cm.Purples, cm.Blues],
             [cm.Greys, cm.Purples, cm.Blues]]

    for data_idx, data_flag in enumerate(data_flags):

        list_dict = read_specs_file(data_flag)
        for key in list_dict:
            exec("%s = list_dict[key]" % key)

        tuning_curve_data = load_tuning_curve(data_flag)
        tuning_curve = tuning_curve_data['tuning_curve']
        epsilons = tuning_curve_data['epsilons']

        if data_idx == 0:
            fig, plot_dims, axes_tuning, axes_eps, axes_signal = \
              tuning_curve_plot_epsilon(plot_vars, iter_vars, params)

        for idx, idx_var in enumerate(plot_vars[0]):
            for idy, idy_var in enumerate(plot_vars[1]):

                colors = cmaps[data_idx][idx](sp.linspace(
                    0.75, 0.3, params['Mm']))

                for iM in range(params['Mm']):
                    axes_tuning[idx,
                                idy].plot(sp.arange(params['Nn'] / 2),
                                          sp.sort(tuning_curve[idx_var,
                                                               idy_var, ::2,
                                                               iM]),
                                          color=colors[iM],
                                          linewidth=0.7,
                                          zorder=params['Mm'] - iM)
                    axes_tuning[idx,
                                idy].plot(sp.arange(params['Nn'] / 2 - 1,
                                                    params['Nn'] - 1),
                                          sp.sort(tuning_curve[idx_var,
                                                               idy_var, 1::2,
                                                               iM])[::-1],
                                          color=colors[iM],
                                          linewidth=0.7,
                                          zorder=params['Mm'] - iM)

                axes_eps[idy].plot(range(params['Mm']),
                                   epsilons[idx_var, idy_var],
                                   color=colors[4],
                                   linewidth=1.5,
                                   zorder=0)
                for iM in range(params['Mm']):
                    axes_eps[idy].scatter(iM,
                                          epsilons[idx_var, idy_var][iM],
                                          c=colors[iM],
                                          s=3)

    save_tuning_curve_fig(fig, data_flag)
Example #7
def quantify_mutex_exons(event, gene, counts_segments, counts_edges):

    cov = sp.zeros((2, ), dtype='float')  # the two coverage values filled in below
    sg = gene.splicegraph
    segs = gene.segmentgraph

    seg_lens = segs.segments[1, :] - segs.segments[0, :]
    seg_shape = segs.seg_edges.shape[0]
    order = 'C'
    offset = 0

    ### find exons corresponding to event
    idx_exon_pre = sp.where((sg.vertices[0, :] == event.exons1[0, 0])
                            & (sg.vertices[1, :] == event.exons1[0, 1]))[0]
    idx_exon_aft = sp.where((sg.vertices[0, :] == event.exons1[-1, 0])
                            & (sg.vertices[1, :] == event.exons1[-1, 1]))[0]
    idx_exon1 = sp.where((sg.vertices[0, :] == event.exons1[1, 0])
                         & (sg.vertices[1, :] == event.exons1[1, 1]))[0]
    idx_exon2 = sp.where((sg.vertices[0, :] == event.exons2[1, 0])
                         & (sg.vertices[1, :] == event.exons2[1, 1]))[0]

    ### find segments corresponding to exons
    seg_exon_pre = sp.sort(sp.where(segs.seg_match[idx_exon_pre, :])[1])
    seg_exon_aft = sp.sort(sp.where(segs.seg_match[idx_exon_aft, :])[1])
    seg_exon1 = sp.sort(sp.where(segs.seg_match[idx_exon1, :])[1])
    seg_exon2 = sp.sort(sp.where(segs.seg_match[idx_exon2, :])[1])

    # exon1 cov
    cov[0] = sp.sum(counts_segments[seg_exon1] * seg_lens[seg_exon1]) / sp.sum(
        seg_lens[seg_exon1])
    # exon2 cov
    cov[1] = sp.sum(counts_segments[seg_exon2] * seg_lens[seg_exon2]) / sp.sum(
        seg_lens[seg_exon2])

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # exon_pre_exon1_conf
    idx1 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [seg_exon_pre[-1], seg_exon1[0]], seg_shape, order=order) + offset)[0]
    if len(idx1.shape) > 0 and idx1.shape[0] > 0:
        cov[0] += counts_edges[idx1[0], 1]
    # exon_pre_exon2_conf
    idx2 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [seg_exon_pre[-1], seg_exon2[0]], seg_shape, order=order) + offset)[0]
    if len(idx2.shape) > 0 and idx2.shape[0] > 0:
        cov[1] += counts_edges[idx2[0], 1]
    # exon1_exon_aft_conf
    idx3 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [seg_exon1[-1], seg_exon_aft[0]], seg_shape, order=order) + offset)[0]
    if len(idx3.shape) > 0 and idx3.shape[0] > 0:
        cov[0] += counts_edges[idx3[0], 1]
    # exon2_exon_aft_conf
    idx4 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [seg_exon2[-1], seg_exon_aft[0]], seg_shape, order=order) + offset)[0]
    if len(idx4.shape) > 0 and idx4.shape[0] > 0:
        cov[1] += counts_edges[idx4[0], 1]

    return cov
Example #8
def _remdup(a, amax=None):
    """Return 1 if vector a contains duplicate values, 0 otherwise.
    """
    a = scipy.sort(a)  # scipy.sort returns a sorted copy rather than sorting in place
    flag = 0
    for x in range(1, len(a)):
        if a[x-1] == a[x]:
            flag = 1
    return flag
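The same check can be written without the explicit loop; a one-line sketch using the old scipy-as-numpy aliases these snippets rely on:

import scipy

a = scipy.array([3, 1, 4, 1, 5])
flag = int(scipy.any(scipy.diff(scipy.sort(a)) == 0))   # 1 -> a contains duplicates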
def plot_tuning_curves(data_flags):

    # First entries are for mu_dSs, second are for Kk2 diversity
    plot_vars = [[0, 9, 19], [8, 15, 19]]
    cmaps = [[cm.Greys, cm.Purples, cm.Blues],
             [cm.Greys, cm.Purples, cm.Blues]]

    for data_idx, data_flag in enumerate(data_flags):

        list_dict = read_specs_file(data_flag)
        for key in list_dict:
            exec("%s = list_dict[key]" % key)

        tuning_curve_data = load_tuning_curve(data_flag)
        tuning_curve = tuning_curve_data['tuning_curve']
        epsilons = tuning_curve_data['epsilons']
        Kk2s = tuning_curve_data['Kk2s']

        if data_idx == 0:
            fig, plot_dims, axes_tuning, axes_Kk2, axes_signal = \
              tuning_curve_plot_Kk2(plot_vars, iter_vars, params)

        for idx, idx_var in enumerate(plot_vars[0]):
            for idy, idy_var in enumerate(plot_vars[1]):

                colors = cmaps[data_idx][idx](sp.linspace(
                    0.75, 0.3, params['Mm']))

                for iM in range(params['Mm']):
                    axes_tuning[idx,
                                idy].plot(sp.arange(params['Nn'] / 2),
                                          sp.sort(tuning_curve[idx_var,
                                                               idy_var, ::2,
                                                               iM]),
                                          color=colors[iM],
                                          linewidth=0.7,
                                          zorder=params['Mm'] - iM)
                    axes_tuning[idx,
                                idy].plot(sp.arange(params['Nn'] / 2 - 1,
                                                    params['Nn'] - 1),
                                          sp.sort(tuning_curve[idx_var,
                                                               idy_var, 1::2,
                                                               iM])[::-1],
                                          color=colors[iM],
                                          linewidth=0.7,
                                          zorder=params['Mm'] - iM)

                if idx == 0:
                    sorted_idxs = sp.argsort(
                        sp.std(Kk2s[0, idy_var, :, :], axis=1))
                    axes_Kk2[idy].imshow(Kk2s[0, idy_var, sorted_idxs, :].T,
                                         aspect=0.3,
                                         cmap='bone',
                                         rasterized=True)

    save_tuning_curve_fig(fig, data_flag)
Example #10
def quantify_intron_retention(event, gene, counts_segments, counts_edges,
                              counts_seg_pos, CFG):

    cov = sp.zeros((2, ), dtype='float')
    sg = gene.splicegraph
    segs = gene.segmentgraph

    if CFG['is_matlab']:
        seg_lens = segs[0, 0][1, :] - segs[0, 0][0, :]
        seg_shape = segs[0, 2].shape
        order = 'F'
        offset = 1

        ### find exons corresponding to event
        idx_exon1 = sp.where((sg[0, 0][0, :] == event.exon1[0])
                             & (sg[0, 0][1, :] == event.exon1[1]))[0]
        idx_exon2 = sp.where((sg[0, 0][0, :] == event.exon2[0])
                             & (sg[0, 0][1, :] == event.exon2[1]))[0]

        ### find segments corresponding to exons
        seg_exon1 = sp.sort(sp.where(segs[0, 1][idx_exon1, :])[1])
        seg_exon2 = sp.sort(sp.where(segs[0, 1][idx_exon2, :])[1])
    else:
        seg_lens = segs.segments[1, :] - segs.segments[0, :]
        seg_shape = segs.seg_edges.shape
        order = 'C'
        offset = 0

        ### find exons corresponding to event
        idx_exon1 = sp.where((sg.vertices[0, :] == event.exons1[0, 0])
                             & (sg.vertices[1, :] == event.exons1[0, 1]))[0]
        idx_exon2 = sp.where((sg.vertices[0, :] == event.exons1[1, 0])
                             & (sg.vertices[1, :] == event.exons1[1, 1]))[0]

        ### find segments corresponding to exons
        seg_exon1 = sp.sort(sp.where(segs.seg_match[idx_exon1, :])[1])
        seg_exon2 = sp.sort(sp.where(segs.seg_match[idx_exon2, :])[1])
    seg_all = sp.arange(seg_exon1[0], seg_exon2[-1])

    seg_intron = sp.setdiff1d(seg_all, seg_exon1)
    seg_intron = sp.setdiff1d(seg_intron, seg_exon2)
    assert (seg_intron.shape[0] > 0)

    ### compute exon coverages as mean of position wise coverage
    # intron_cov
    cov[0] = sp.sum(counts_segments[seg_intron] *
                    seg_lens[seg_intron]) / sp.sum(seg_lens[seg_intron])

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # intron conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [seg_exon1[-1], seg_exon2[0]], seg_shape, order=order) + offset)[0]
    cov[1] = counts_edges[idx, 1]

    return cov
Example #11
def make_unique_by_strain(event_list):
    # event_list = make_unique_by_strain(event_list)

    rm_idx = []
    for i in range(1, event_list.shape[0]):
        if i % 1000 == 0:
            print '.',
            if i % 10000 == 0:
                print '%i' % i

        old_coords = event_list[i - 1].get_coords(trafo=True)
        curr_coords = event_list[i].get_coords(trafo=True)

        if old_coords.shape[0] == curr_coords.shape[0] and sp.all(
                old_coords == curr_coords):

            ### assertion that we did everything right
            if event_list[i - 1].chr == event_list[i].chr:
                assert (event_list[i - 1].strand == event_list[i].strand)
                assert (event_list[i].strain.shape[0] == 1)
            else:
                assert (event_list[i - 1].gene_name != event_list[i].gene_name)

            idx = sp.where(event_list[i -
                                      1].strain == event_list[i].strain[0])[0]
            if idx.shape[0] > 0:
                assert (idx.shape[0] == 1)
                assert (sp.all(event_list[i].get_coords(
                    trafo=True) == event_list[i - 1].get_coords(trafo=True)))
                if not event_list[i].gene_name[0] in event_list[i -
                                                                1].gene_name:
                    event_list[
                        i - 1].gene_name = sp.r_[event_list[i - 1].gene_name,
                                                 [event_list[i].gene_name[0]]]
                event_list[i] = event_list[i - 1]
            else:
                event_list[i].strain = sp.r_[[event_list[i - 1].strain[0]],
                                             event_list[i].strain]
                assert (sp.all(
                    sp.sort(event_list[i].strain) == sp.sort(
                        sp.unique(event_list[i].strain))))
                ### TODO !!!!!!!!!!!!! make sure that we keep different coordinates if the strains differ ...
                if not event_list[i].gene_name[0] in event_list[i -
                                                                1].gene_name:
                    event_list[i].gene_name = sp.r_[
                        event_list[i - 1].gene_name,
                        [event_list[i].gene_name[0]]]
            rm_idx.append(i - 1)

    print 'events dropped: %i' % len(rm_idx)
    keep_idx = sp.where(~sp.in1d(sp.arange(event_list.shape[0]), rm_idx))[0]
    event_list = event_list[keep_idx]

    return event_list
Example #12
 def _test_cg(self, tname, cg, A, b, expect):
     logging.info(f'{tname} with {cg.__name__}')
     expect = sp.sort(expect)
     actual = sp.sort(cg(A, b))
     logging.debug(f'{tname} A\n{str(A)}')
     logging.debug(f'{tname} b\n{str(b)}')
     logging.debug(f'{tname} expect\n{str(expect)}')
     logging.debug(f'{tname} actual\n{str(actual)}')
     elapsed = timeit(lambda: cg(A, b), number=1000)
     logging.info(f'{tname} {elapsed:.3}ms')
     abserrorsum = sp.sum(sp.absolute(expect - actual))
     logging.info(f'{tname} sum of absolute errors = {abserrorsum:.3}')
Example #13
def average_EF(structure, distance_function, feature=True, pca=False, percent=[0.25], name=False):
    """
    Compute the average enrichment factor of a given structure over the set of actives.
    Runs faster than the old version (below) thanks to vectorisation.
    :param structure: the structure to access, in data/
    :param name: the name of the embedding csv to use
    :return: the average EF for this structure as a dict percent : EF
    """
    # Not essential: just pass the name of the embedding csv; otherwise the baseline embeddings are used.
    if not name:
        name = '_feature={}_pca={}.csv'.format(feature, pca)

    # META Get the Path of the embeddings
    csv_actives_dir = os.path.join('../data/embeddings', structure)
    csv_decoys_dir = os.path.join('../data/embeddings', structure)
    csv_actives_path = os.path.join(csv_actives_dir, 'csv_actives' + name)
    csv_decoys_path = os.path.join(csv_decoys_dir, 'csv_decoys' + name)

    features = (4 * feature + 1) * 12
    actives_values = pd.read_csv(csv_actives_path, usecols=range(2, features + 1), dtype=float)
    actives_values = np.array(actives_values)
    decoys_values = pd.read_csv(csv_decoys_path, usecols=range(2, features + 1), dtype=float)
    decoys_values = np.array(decoys_values)

    # Compute the DM (bottleneck)
    actives_dist = distance_matrix(actives_values, actives_values, distance_function)
    decoys_dist = distance_matrix(actives_values, decoys_values,distance_function)

    # Sort and compute the EF
    enrichments = {}
    for perc in percent:
        enrichments[perc] = []
    for i in range(actives_dist.shape[1]):
        list_actives = np.sort(actives_dist[i])[:-1]
        list_decoys = np.sort(decoys_dist[i])
        total = np.sort(np.append(list_decoys, list_actives))
        a, d, tot = len(list_actives), len(list_decoys), len(total)

        # avoid to recompute this for all percent
        for perc in percent:
            threshold_index = tot - int(perc / 100 * tot) - 1
            threshold = total[threshold_index]
            selected_actives = [x for x in list_actives if x >= threshold]
            selected_decoys = [x for x in list_decoys if x >= threshold]
            sa, sd = len(selected_actives), len(selected_decoys)
            stot = sa + sd
            numerator = sa / stot
            denominator = a / tot
            enrichments[perc].append(numerator / denominator)

    for perc in percent:
        enrichments[perc] = np.mean(enrichments[perc])
    return enrichments
    def set_ordered_temporal_adaptation_rate(self):
        """
		Set a spread of adaptation rates, possibly ordered by activity levels.
		The spread is incorporated when temporal_adaptation_rate_sigma is 
		nonzero, and this spread gives a factor change, i.e. beta -->
		beta*10^{-sigma, sigma}. Various ordering schemes are given.
		"""

        try:
            self.dYy
            self.Yy
            self.Yy0
        except AttributeError:
            print 'Must run set_measured_activity(...) before calling '\
             'set_ordered_temporal_adaptation_rate(...)'

        sp.random.seed(self.temporal_adaptation_rate_seed)
        exp_spread = sp.random.normal(-self.temporal_adaptation_rate_sigma,
                                      self.temporal_adaptation_rate_sigma,
                                      self.Mm)
        self.temporal_adaptation_rate_vector = self.temporal_adaptation_rate*\
                 10.**exp_spread

        # Order the adaptation rates by activity levels
        if self.temporal_adaptation_rate_ordering == 'random':
            pass
        elif self.temporal_adaptation_rate_ordering == 'increasing_Yy':
            sorted_idxs = self.Yy.argsort()
            idx_ranks = sorted_idxs.argsort()
            self.temporal_adaptation_rate_vector = \
             sp.sort(self.temporal_adaptation_rate_vector)[idx_ranks]
        elif self.temporal_adaptation_rate_ordering == 'increasing_dYy':
            sorted_idxs = self.dYy.argsort()
            idx_ranks = sorted_idxs.argsort()
            self.temporal_adaptation_rate_vector = \
             sp.sort(self.temporal_adaptation_rate_vector)[idx_ranks]
        elif self.temporal_adaptation_rate_ordering == 'decreasing_Yy':
            sorted_idxs = self.Yy.argsort()[::-1]
            idx_ranks = sorted_idxs.argsort()
            self.temporal_adaptation_rate_vector = \
             sp.sort(self.temporal_adaptation_rate_vector)[idx_ranks]
        elif self.temporal_adaptation_rate_ordering == 'decreasing_dYy':
            sorted_idxs = self.dYy.argsort()[::-1]
            idx_ranks = sorted_idxs.argsort()
            self.temporal_adaptation_rate_vector = \
             sp.sort(self.temporal_adaptation_rate_vector)[idx_ranks]
        else:
            print "\ntemporal_adaptation_rate_ordering not set to "\
              "a valid string; use 'random', 'increasing_Yy', "\
              "'increasing_dYy', 'decreasing_Yy', or 'decreasing_dYy'"
            quit()
Example #15
def QQPlot(arguments,pv,unique_pv,fname):
    font_size = 18
    mpl.rcParams['font.family']="sans-serif"
    mpl.rcParams['font.sans-serif']="Arial"
    mpl.rcParams['font.size']=font_size
    #mpl.rcParams['figure.dpi'] = 300
    mpl.rcParams['font.weight']='medium'
    mpl.rcParams['figure.facecolor'] = 'white'
    mpl.rcParams['lines.linewidth'] = 1
    mpl.rcParams['axes.facecolor'] = 'white'
    mpl.rcParams['patch.edgecolor'] = 'white'
    mpl.rcParams['grid.linestyle'] = '-'
    mpl.rcParams['grid.color'] = 'LightGray'
    if arguments.ignore!=None:
        if arguments.ignore in fname:
            return 
    
    if arguments.distinct:
        pv = unique_pv

    pl.figure(figsize=(5,5))
    pv_uni = (sp.arange(1.0/float(pv.shape[0]),1,1.0/float(pv.shape[0]+1)))
    pl.plot(-sp.log10(pv_uni),-sp.log10(sp.sort(pv_uni)),'b--')
    pl.ylim(0,(-sp.log10(pv[:])).max()+1)
    pl.plot(-sp.log10(pv_uni),-sp.log10(sp.sort(pv[:],axis=0)),'.',color="#F68E55",markersize=12,markeredgewidth=0,alpha=1)
    #plot theoretical expectations
    if arguments.estpv:
        datapoints=10**(sp.arange(sp.log10(0.5),sp.log10(pv.shape[0]-0.5)+0.1,0.1))
        beta_alpha=sp.zeros(datapoints.shape[0])
        beta_nalpha=sp.zeros(datapoints.shape[0])
        beta_tmp=sp.zeros(datapoints.shape[0])
        for n in xrange(datapoints.shape[0]):
            m=datapoints[n]
            beta_tmp[n]=stats.beta.ppf(0.5,m,pv.shape[0]-m)
            beta_alpha[n]=stats.beta.ppf(0.05,m,pv.shape[0]-m)
            beta_nalpha[n]=stats.beta.ppf(1-0.05,m,pv.shape[0]-m)
        estimated_pvals=datapoints/pv.shape[0]
        lower_bound = -sp.log10(estimated_pvals-(beta_tmp-beta_alpha))
        upper_bound = -sp.log10(estimated_pvals+(beta_nalpha-beta_tmp))
        pl.fill_between(-sp.log10(estimated_pvals),lower_bound,upper_bound,color='#00BFF3',alpha=0.4,linewidth=0)
    if arguments.title:
        pl.title("Phenotype: %s"%(fname))
    pl.xlabel('Expected $-log10(p-value)$')
    pl.ylabel('Observed $-log10(p-value)$')
    if arguments.gc:
        gc = sp.median(stats.chi2.isf(pv,1))/0.456
        pl.text(4,1,"$\hat \lambda=%.2f$"%(gc))
    remove_border()
    pl.subplots_adjust(left=0.14,bottom=0.13,right=0.97,top=0.95,wspace=0.45)
    pl.savefig(os.path.join(arguments.out,'qqplot_' + fname + '.' + arguments.iformat) )
    pl.close()
Example #16
 def get_coords(self, trafo=False):
     
     if self.event_type != 'mult_exon_skip':
         if trafo:
             #return sp.sort(sp.unique(sp.c_[self.exons1_col.ravel(), self.exons2_col.ravel()]))
             return sp.sort(sp.r_[self.exons1_col.ravel(), self.exons2_col.ravel()])
         else:
             #return sp.sort(sp.unique(sp.c_[self.exons1.ravel(), self.exons2.ravel()]))
             return sp.sort(sp.r_[self.exons1.ravel(), self.exons2.ravel()])
     else:
         if trafo:
             return sp.sort(sp.r_[self.exons1_col.ravel()[:4], self.exons2_col.ravel()[-4:]])
         else:
             return sp.sort(sp.r_[self.exons1.ravel()[:4], self.exons2.ravel()[-4:]])
Example #17
def quantify_mult_exon_skip(event, gene, counts_segments, counts_edges):

    cov = sp.zeros((2, ), dtype='float')

    sg = gene.splicegraph
    segs = gene.segmentgraph

    seg_lens = segs.segments[1, :] - segs.segments[0, :]
    seg_shape = segs.seg_edges.shape[0]
    order = 'C'
    offset = 0

    ### find exons corresponding to event
    idx_exon_pre  = sp.where((sg.vertices[0, :] == event.exons2[0, 0]) & (sg.vertices[1, :] == event.exons2[0, 1]))[0]
    idx_exon_aft  = sp.where((sg.vertices[0, :] == event.exons2[-1, 0]) & (sg.vertices[1, :] == event.exons2[-1, 1]))[0]
    seg_exons = []
    for i in range(1, event.exons2.shape[0] - 1):
        tmp = sp.where((sg.vertices[0, :] == event.exons2[i, 0]) & (sg.vertices[1, :] == event.exons2[i, 1]))[0]
        seg_exons.append(sp.where(segs.seg_match[tmp, :])[1])
    
    ### find segments corresponding to exons
    seg_exon_pre = sp.sort(sp.where(segs.seg_match[idx_exon_pre, :])[1])
    seg_exon_aft = sp.sort(sp.where(segs.seg_match[idx_exon_aft, :])[1])

    seg_exons_u = sp.sort(sp.unique([x for sublist in seg_exons for x in sublist]))

    ### inner exons_cov
    cov[0] = sp.sum(counts_segments[seg_exons_u] * seg_lens[seg_exons_u]) / sp.sum(seg_lens[seg_exons_u])

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # exon_pre_exon_conf
    idx1 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon_pre[-1], seg_exons[0][0]], seg_shape, order=order) + offset)[0]
    if len(idx1.shape) > 0 and idx1.shape[0] > 0:
        cov[0] += counts_edges[idx1[0], 1]
    # exon_exon_aft_conf
    idx2 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exons[-1][-1], seg_exon_aft[0]], seg_shape, order=order) + offset)[0]
    if len(idx2.shape) > 0 and idx2.shape[0] > 0:
        cov[0] += counts_edges[idx2[0], 1]
    # exon_pre_exon_aft_conf
    idx3 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon_pre[-1], seg_exon_aft[0]], seg_shape, order=order) + offset)[0]
    if len(idx3.shape) > 0 and idx3.shape[0] > 0:
        cov[1] = counts_edges[idx3[0], 1]
    for i in range(len(seg_exons) - 1):
        # sum_inner_exon_conf
        idx4 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exons[i][-1], seg_exons[i+1][0]], seg_shape, order=order) + offset)[0]
        if len(idx4.shape) > 0 and idx4.shape[0] > 0:
            cov[0] += counts_edges[idx4[0], 1]

    return cov
Example #18
def find_holes(data):
    sample = data.copy()
    size = sample.size

    # Here's a little hack to "flatten" star boxes
    tmp = scipy.sort(sample)
    star_cutoff = scipy.median(tmp[-30:-10]) * 0.6
    sample = scipy.where(sample > star_cutoff, star_cutoff, sample)

    derivative = deriv_1d(sample)
    derivative = ndimage.gaussian_filter1d(derivative, 3)
    derivative = abs(derivative)

    tmp = scipy.sort(derivative)
    avg = scipy.median(tmp[size / 8:size * 3 / 8])
    sigma = tmp[size / 8:size * 3 / 8].std()

    threshold = avg + sigma * 100.

    edge = []

    count = 0
    while derivative.max() > threshold:
        start = derivative.argmax() - 7
        end = derivative.argmax() + 8

        if start < 0:
            start = 0
        if end > derivative.size:
            end = derivative.size

        fit = find_peak(derivative[start:end])

        if start > 7 and end < derivative.size - 7:
            edge.append(float(start) + fit[2])

        start -= 3
        end += 3

        if start < 0:
            start = 0
        if end > derivative.size:
            end = derivative.size

        derivative[start:end] = 0.

    edge.sort()
    return edge, threshold, star_cutoff
Example #19
def NGorN50(file_path='contigs.txt', genomesize=None):
    contigs, num_contig = file_parser(file_path)
    print( "Total number of contigs: %d " %(num_contig) ) # Expect 20

    # Sort the contigs in reverse order in an array e.g. 
    # array([79, 23, 10])
    contigs = scipy.sort(contigs)[::-1]
    #print(contigs)

    # Calculate sum to compare against for N50s or NG50
    if genomesize == None:
        contig_sum = contigs.sum()/2
        print( "50 Contig Sum is: %d" % (contig_sum) )
    else:
        contig_sum = int(genomesize)/2
        print ("50 Genome Size specified: %d" %(contig_sum))

    for counter in range(1, num_contig+1):
        # TODO: Consider memoizing this if you need to reuse this script for large contigs for performance gains.

        # Check the accumulated sum against the comparison
        if contigs[0:counter].sum() > contig_sum:
            print( "Partial Contig Sum is: %d, with counter: %d, and contig length %d" 
                % (contigs[0:counter].sum(), counter, contigs[counter-1]) )
            # Only need to find the first case
            break
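A compact sketch of the same N50 computation, using the toy contig lengths from the comment above:

import scipy

contigs = scipy.sort(scipy.array([79, 23, 10]))[::-1]  # descending lengths
half = contigs.sum() / 2.0                             # 56
n50 = contigs[scipy.cumsum(contigs) > half][0]         # 79: first length at which the running sum exceeds half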
Example #20
def traj_ensemble_quantiles(traj_set, quantiles=(0.025, 0.5, 0.975)):
    """
    Return a list of trajectories, each one corresponding the a given passed-in
    quantile.
    """
    all_values = scipy.array([traj.values for traj in traj_set])
    sorted_values = scipy.sort(all_values, 0)

    q_trajs = []
    for q in quantiles:
        # Calculate the index corresponding to this quantile. The q is because
        #  Python arrays are 0 indexed
        index = q * (len(sorted_values) - 1)
        below = int(scipy.floor(index))
        above = int(scipy.ceil(index))
        if above == below:
            q_values = sorted_values[below]
        else:
            # Linearly interpolate...
            q_below = (1.0 * below) / (len(sorted_values) - 1)
            q_above = (1.0 * above) / (len(sorted_values) - 1)
            q_values = sorted_values[below] + (q - q_below) * (
                sorted_values[above] - sorted_values[below]) / (q_above -
                                                                q_below)
        q_traj = copy.deepcopy(traj_set[0])
        q_traj.values = q_values
        q_trajs.append(q_traj)

    return q_trajs
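To see the interpolation step in isolation, a scalar sketch with hypothetical values: the 0.5 quantile of four sorted values lands halfway between the 2nd and 3rd entries.

import scipy

sorted_values = scipy.sort(scipy.array([1.0, 2.0, 4.0, 8.0]))
q = 0.5
index = q * (len(sorted_values) - 1)            # 1.5
below, above = int(scipy.floor(index)), int(scipy.ceil(index))
q_below = (1.0 * below) / (len(sorted_values) - 1)
q_above = (1.0 * above) / (len(sorted_values) - 1)
q_value = sorted_values[below] + (q - q_below) * (sorted_values[above] - sorted_values[below]) / (q_above - q_below)
# -> 3.0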
Example #21
 def bulk_bands_calculator(self, s, sub, kx, ky, kz):
     ''' Calculate the band energies for the specified kx, ky, and kz values.
     The 3x3 Hamiltonian for wurtzite crystals is used for the valence,
     while a 1x1 Hamiltonian is used for the conduction band. The model is
     from the chapter by Vurgaftman and Meyer in the book by Piprek. 
 '''
     E = scipy.zeros((4, len(s.Eg0)))
     E[0,:] = s.Eg0+s.delcr+s.delso/3+\
                 hbar**2/(2*s.mepara)*(kx**2+ky**2)+\
                 hbar**2/(2*s.meperp)*(kz**2)+\
                 (s.a1+s.D1)*s.epszz+(s.a2+s.D2)*(s.epsxx+s.epsyy)
     L = hbar**2/(2*m0)*(s.A1*kz**2+s.A2*(kx+ky)**2)+\
         s.D1*s.epszz+s.D2*(s.epsxx+s.epsyy)
     T = hbar**2/(2*m0)*(s.A3*kz**2+s.A4*(kx+ky)**2)+\
         s.D3*s.epszz+s.D4*(s.epsxx+s.epsyy)
     F = s.delcr + s.delso / 3 + L + T
     G = s.delcr - s.delso / 3 + L + T
     K = hbar**2 / (2 * m0) * s.A5 * (kx + 1j * ky)**2 + s.D5 * (s.epsxx -
                                                                 s.epsyy)
     H = hbar**2 / (2 * m0) * s.A6 * (kx + 1j * ky) * kz + s.D6 * (s.epsxz)
     d = scipy.sqrt(2) * s.delso / 3
     for ii in range(len(s.Eg0)):
         mat = scipy.matrix([[F[ii], K[ii], -1j * H[ii]],
                             [K[ii], G[ii], -1j * H[ii] + d[ii]],
                             [-1j * H[ii], -1j * H[ii] + d[ii], L[ii]]])
         w, v = scipy.linalg.eig(mat)
         E[1:, ii] = scipy.flipud(scipy.sort(scipy.real(w)))
     return E
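The valence-band energies come from sorting the eigenvalues of the 3x3 Hamiltonian in descending order; the core step on a toy 2x2 matrix:

import scipy
import scipy.linalg

mat = scipy.array([[2.0, 1.0],
                   [1.0, 3.0]])
w, v = scipy.linalg.eig(mat)
bands = scipy.flipud(scipy.sort(scipy.real(w)))   # largest eigenvalue first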
Example #22
def _get_Voronoi_edges(vor):
    r"""
    Given a Voronoi object as produced by the scipy.spatial.Voronoi class,
    this function calculates the start and end points of each edge in the
    Voronoi diagram, in terms of the vertex indices used by the received
    Voronoi object.

    Parameters
    ----------
    vor : scipy.spatial.Voronoi object

    Returns
    -------
    An N-by-2 array of vertex indices, indicating the start and end points of
    each edge in the Voronoi diagram.  These vertex indices can be used to
    index straight into the ``vor.vertices`` array to get spatial positions.
    """
    edges = [[], []]
    for facet in vor.ridge_vertices:
        # Create a closed cycle of vertices that define the facet
        edges[0].extend(facet[:-1]+[facet[-1]])
        edges[1].extend(facet[1:]+[facet[0]])
    edges = sp.vstack(edges).T  # Convert to scipy-friendly format
    mask = sp.any(edges == -1, axis=1)  # Identify edges at infinity
    edges = edges[~mask]  # Remove edges at infinity
    edges = sp.sort(edges, axis=1)  # Move all points to upper triangle
    # Remove duplicate pairs
    edges = edges[:, 0] + 1j*edges[:, 1]  # Convert to imaginary
    edges = sp.unique(edges)  # Remove duplicates
    edges = sp.vstack((sp.real(edges), sp.imag(edges))).T  # Back to real
    edges = sp.array(edges, dtype=int)
    return edges
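A hypothetical usage sketch with a small random point set (the function only needs the object returned by scipy.spatial.Voronoi):

import scipy as sp
from scipy.spatial import Voronoi

pts = sp.random.rand(20, 2)            # 20 random points in 2D
vor = Voronoi(pts)
edges = _get_Voronoi_edges(vor)
# each row is a [start, end] pair of vertex indices into vor.vertices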
Example #23
 def bulk_bands_calculator(self,s,sub,kx,ky,kz):
   ''' Calculate the band energies for the specified kx, ky, and kz values.
       The 3x3 Hamiltonian for wurtzite crystals is used for the valence,
       while a 1x1 Hamiltonian is used for the conduction band. The model is
       from the chapter by Vurgaftman and Meyer in the book by Piprek. 
   '''
   E = scipy.zeros((4,len(s.Eg0)))   
   E[0,:] = s.Eg0+s.delcr+s.delso/3+\
               hbar**2/(2*s.mepara)*(kx**2+ky**2)+\
               hbar**2/(2*s.meperp)*(kz**2)+\
               (s.a1+s.D1)*s.epszz+(s.a2+s.D2)*(s.epsxx+s.epsyy)
   L = hbar**2/(2*m0)*(s.A1*kz**2+s.A2*(kx+ky)**2)+\
       s.D1*s.epszz+s.D2*(s.epsxx+s.epsyy)
   T = hbar**2/(2*m0)*(s.A3*kz**2+s.A4*(kx+ky)**2)+\
       s.D3*s.epszz+s.D4*(s.epsxx+s.epsyy)
   F = s.delcr+s.delso/3+L+T
   G = s.delcr-s.delso/3+L+T
   K = hbar**2/(2*m0)*s.A5*(kx+1j*ky)**2+s.D5*(s.epsxx-s.epsyy)
   H = hbar**2/(2*m0)*s.A6*(kx+1j*ky)*kz+s.D6*(s.epsxz)
   d = scipy.sqrt(2)*s.delso/3
   for ii in range(len(s.Eg0)):
     mat = scipy.matrix([[    F[ii],     K[ii],       -1j*H[ii]      ],
                         [    K[ii],     G[ii],       -1j*H[ii]+d[ii]],
                         [-1j*H[ii], -1j*H[ii]+d[ii],     L[ii]      ]])
     w,v = scipy.linalg.eig(mat)
     E[1:,ii] = scipy.flipud(scipy.sort(scipy.real(w)))
   return E
Example #24
def import_data(size=128):

    files = []
    orients = ["00F", "30L", "30R", "45L", "45R", "60L", "60R", "90L", "90R"]
    for orient in orients:
        _files = glob.glob(os.path.join(data_dir, "*/*_%s.jpg" % orient))
        files = files + _files
    files = sp.sort(files)

    D1id = []
    D2id = []
    Did = []
    Rid = []
    Y = sp.zeros([len(files), size, size, 3], dtype=sp.uint8)
    for _i, _file in enumerate(files):
        y = imread(_file)
        y = imresize(y, size=[size, size], interp="bilinear")
        Y[_i] = y
        fn = _file.split(".jpg")[0]
        fn = fn.split("/")[-1]
        did1, did2, rid = fn.split("_")
        Did.append(did1 + "_" + did2)
        Rid.append(rid)
    Did = sp.array(Did, dtype="|S100")
    Rid = sp.array(Rid, dtype="|S100")

    RV = {"Y": Y, "Did": Did, "Rid": Rid}
    return RV
def subsetsWithFits(fileNumString,onlyNew=False):
    """
    Find data subsets (N) that have models that have been fit to
    all conditions.
    
    onlyNew (False)         : Optionally include only subsets that have
                              fits that are not included in the current
                              combined fitProbs.
    """
    fpd = loadFitProbData(fileNumString)
    saveFilename = fpd.values()[0]['saveFilename']
    
    Nlist = []
    for N in scipy.sort(fpd.keys()):
        # find models that have been fit to all conditions
        if len(fpd[N]['fitProbDataList']) == 1:
            fitModels = fpd[N]['fitProbDataList'][0]['logLikelihoodDict'].keys()
        else:
            fitModels = scipy.intersect1d([ fp['logLikelihoodDict'].keys() \
                                            for fp in fpd[N]['fittingProblemList'] ])
        if onlyNew:
            Nfilename = directoryPrefixNonly(fileNumString,N)+'/'+saveFilename
            fileExists = os.path.exists(Nfilename)
            if not fileExists: # no combined file exists
                if len(fitModels) > 0:
                    Nlist.append(N)
            else: # check which fit models are currently included in the saved file
                fpMultiple = load(Nfilename)
                fitModelsSaved = fpMultiple.logLikelihoodDict.keys()
                if len(scipy.intersect1d(fitModels,fitModelsSaved)) < len(fitModels):
                    Nlist.append(N)
        else:
            if len(fitModels) > 0:
                Nlist.append(N)
    return Nlist
Example #26
def neo_p0(setup, *args):
    ntemps, nwalkers, nsteps = setup
    t = args[0]
    ndim = args[1]
    C = args[2]

    pos = sp.zeros((ntemps, nwalkers, ndim))
    for temp in range(ntemps):
        for j in range(ndim):
            boundaries = t[C[j]].lims
            fact = sp.absolute(boundaries[0] - boundaries[1]) / nwalkers
            rnd = sp.random.uniform(0.9, 0.9999)
            dif = sp.arange(nwalkers) * fact * sp.random.uniform(0.9, 0.9999)
            if (t[C[j]].cv and t[C[j]].tag() != 'Period'):
                for i in range(nwalkers):
                    pos[temp][i][j] = (boundaries[1] + 3 * boundaries[0]) / \
                        4 + (dif[i] * 2. / 5. + fact / 2.0)
            elif t[C[j]].tag() == 'Jitter':
                jitt_ini = sp.sort(sp.fabs(sp.random.normal(0, 1,
                                                            nwalkers))) * 0.1
                dif = jitt_ini * sp.random.uniform(0.9, 0.9999)
                for i in range(nwalkers):
                    pos[temp][i][j] = boundaries[0] + (dif[i] + fact / 2.0)
                    pos[temp][i][j] *= 0.1

            else:
                for i in range(nwalkers):
                    pos[temp][i][j] = boundaries[0] + (dif[i] + fact / 2.0)

    return pos
Example #27
def draw_graph(graph):

    # create networkx graph
    G=nx.Graph()
    
    ordered_node_list = scipy.sort([int(i[1::]) for i in graph])

    # add nodes
    #for node in graph:
    #    G.add_node(node)
    for num in ordered_node_list:
        G.add_node('n'+str(num))
    

    # add edges
    for i in graph:
        for j in graph[i][1::]:
            G.add_edge(i,j)
            
    colors = ['b','r','g','c','w','k']
    
    node_color = [colors[graph[node][0]] for node in graph]

    # draw graph
    #pos = nx.shell_layout(G)
    pos = nx.spring_layout(G,iterations=100)
    nx.draw(G, pos, node_color = node_color)

    # show graph
    plt.axis('off')
    plt.show()
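A hypothetical toy input, inferred from how the function indexes its argument: keys are node names 'n<k>', value[0] is a colour index into colors, and value[1:] lists neighbour names.

graph = {'n1': [0, 'n2', 'n3'],
         'n2': [1, 'n1'],
         'n3': [2, 'n1']}
draw_graph(graph)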
Example #28
    def compute_accuracy(self):
        """Computes accuracy across the range in `self.date_range`.

        Returns: a pandas DataFrame with three columns corresponding to each
            kind of prediction method (PredPol, perfect prediction (god), and
            the baseline (naive_count)). The entries of each column are an array
            where the ith entry is the average accuracy over `self.date_range`
            when visiting i number of grid cells
        """
        accuracy = {
            method: sp.zeros((len(self.results), len(self.lambda_columns)))
            for method in ['predpol', 'god', 'naive_count']
        }
        naive_count = count_seen(self.pred_obj, self.pred_obj.train)['num_observed']

        for i, (lambda_col, actual_col) in self._iterator():
            actual_vals = self.results[actual_col].values
            accuracy['god'][:, i] = sp.sort(actual_vals)[::-1]

            sorted_idx = sp.argsort(self.results[lambda_col])[::-1]
            accuracy['predpol'][:, i] = actual_vals[sorted_idx]

            sorted_idx = sp.argsort(naive_count)[::-1]
            accuracy['naive_count'][:, i] = actual_vals[sorted_idx]

            naive_count += self.results[actual_col]

        # Compute CI and p-values here
        for k, v in accuracy.items():
            accuracy[k] = sp.sum(v, axis=1)
            accuracy[k] = sp.cumsum(accuracy[k] / sp.sum(accuracy[k]))
        return pd.DataFrame(accuracy)
Example #29
def traj_ensemble_quantiles(traj_set, quantiles=(0.025, 0.5, 0.975)):
    """
    Return a list of trajectories, each one corresponding the a given passed-in
    quantile.
    """
    all_values = scipy.array([traj.values for traj in traj_set])
    sorted_values = scipy.sort(all_values, 0)
                   
    q_trajs = []
    for q in quantiles:
        # Calculate the index corresponding to this quantile. The q is because
        #  Python arrays are 0 indexed
        index = q * (len(sorted_values) - 1)
        below = int(scipy.floor(index))
        above = int(scipy.ceil(index))
        if above == below:
            q_values = sorted_values[below]
        else:
            # Linearly interpolate...
            q_below = (1.0*below)/(len(sorted_values)-1)
            q_above = (1.0*above)/(len(sorted_values)-1)
            q_values = sorted_values[below] + (q - q_below)*(sorted_values[above] - sorted_values[below])/(q_above - q_below)
        q_traj = copy.deepcopy(traj_set[0])
        q_traj.values = q_values
        q_trajs.append(q_traj)

    return q_trajs
    def set_step_stim(self):
        """
		Generate a random step stimulus with given density of steps 
		and given discrete stimulus values.
		"""

        assert self.step_stim_density < self.nT, "step_stim_density must be "\
         "less than number of time points, but nT = %s, density = %s" \
         % (self.nT, self.step_stim_density)

        self.stim = sp.zeros(self.nT)

        # Get points at which to switch the step stimulus
        sp.random.seed(self.step_stim_seed)
        switch_pts = sp.random.choice(self.nT, self.step_stim_density)
        switch_pts = sp.sort(switch_pts)

        # Set values in each inter-switch interval from step_stim_vals array
        sp.random.seed(self.step_stim_seed)
        for iT in range(self.step_stim_density - 1):
            stim_val = sp.random.choice(self.step_stim_vals)
            self.stim[switch_pts[iT]:switch_pts[iT + 1]] = stim_val

        # Fill in ends
        edge_vals = sp.random.choice(self.step_stim_vals, 2)
        self.stim[:switch_pts[0]] = edge_vals[0]
        self.stim[switch_pts[self.step_stim_density - 1]:] = edge_vals[1]
        self.Tt = sp.arange(0, self.dt * self.nT, self.dt)
Example #31
def termFrequencyMatrix(directory,stopwords,termlist):
	""" The student must code this. """
	filenames = sp.sort(os.listdir(directory))
	frequencyMatrix = sp.zeros((len(termlist),len(filenames)))
	for i in xrange(len(filenames)):
		frequencyMatrix[:,i] = termVector(directory + filenames[i],stopwords,termlist)
	return frequencyMatrix.astype(float)
Example #32
  def generateNoteLength(self):
    length = (60. / self.wavetempo) * self.time_freq_fs
    note_length = sp.array([2**i for i in range(5)]) / 4.
    note_length *= length
    note_huten = sp.array(
        [note_length[i-1]+note_length[i] for i in range(1, 5)])
    note_length = sp.r_[note_length, note_huten]
    note_length = sp.sort(note_length)

    note_length_pair = []
    for i in range(note_length.size):
      try:
        upper = (note_length[i+1] - note_length[i])/2
        upper += note_length[i]
      except IndexError:
        upper = note_length[i] * 2
      try:
        lower = note_length_pair[-1][1]
      except IndexError:
        lower = 0
      note_length_pair.append((lower, upper))
        
    if(self.output_form == 'MML'):
      note_name = ['16', '16.', '8', '8.', '4', '4.', '2', '2.', '1']
    elif(self.output_form == 'PMX'):
      note_name = ['1', '1d', '8', '8d', '4', '4d', '2', '2d', '0']
    return (note_name, note_length_pair)
Example #33
def quantify_mutex_exons(event, gene, counts_segments, counts_edges):

    cov = sp.zeros((2, ), dtype='float')  # the two coverage values filled in below
    sg = gene.splicegraph
    segs = gene.segmentgraph

    seg_lens = segs.segments[1, :] - segs.segments[0, :]
    seg_shape = segs.seg_edges.shape[0]
    order = 'C'
    offset = 0

    ### find exons corresponding to event
    idx_exon_pre  = sp.where((sg.vertices[0, :] == event.exons1[0, 0]) & (sg.vertices[1, :] == event.exons1[0, 1]))[0]
    idx_exon_aft  = sp.where((sg.vertices[0, :] == event.exons1[-1, 0]) & (sg.vertices[1, :] == event.exons1[-1, 1]))[0]
    idx_exon1  = sp.where((sg.vertices[0, :] == event.exons1[1, 0]) & (sg.vertices[1, :] == event.exons1[1, 1]))[0]
    idx_exon2  = sp.where((sg.vertices[0, :] == event.exons2[1, 0]) & (sg.vertices[1, :] == event.exons2[1, 1]))[0]
    
    ### find segments corresponding to exons
    seg_exon_pre = sp.sort(sp.where(segs.seg_match[idx_exon_pre, :])[1])
    seg_exon_aft = sp.sort(sp.where(segs.seg_match[idx_exon_aft, :])[1])
    seg_exon1 = sp.sort(sp.where(segs.seg_match[idx_exon1, :])[1])
    seg_exon2 = sp.sort(sp.where(segs.seg_match[idx_exon2, :])[1])

    # exon1 cov
    cov[0] = sp.sum(counts_segments[seg_exon1] * seg_lens[seg_exon1]) / sp.sum(seg_lens[seg_exon1])
    # exon2 cov
    cov[1] = sp.sum(counts_segments[seg_exon2] * seg_lens[seg_exon2]) / sp.sum(seg_lens[seg_exon2])

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # exon_pre_exon1_conf
    idx1 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon_pre[-1], seg_exon1[0]], seg_shape, order=order) + offset)[0]
    if len(idx1.shape) > 0 and idx1.shape[0] > 0:
        cov[0] += counts_edges[idx1[0], 1]
    # exon_pre_exon2_conf
    idx2 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon_pre[-1], seg_exon2[0]], seg_shape, order=order) + offset)[0]
    if len(idx2.shape) > 0 and idx2.shape[0] > 0:
        cov[1] += counts_edges[idx2[0], 1]
    # exon1_exon_aft_conf
    idx3 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon1[-1], seg_exon_aft[0]], seg_shape, order=order) + offset)[0]
    if len(idx3.shape) > 0 and idx3.shape[0] > 0:
        cov[0] += counts_edges[idx3[0], 1]
    # exon2_exon_aft_conf
    idx4 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon2[-1], seg_exon_aft[0]], seg_shape, order=order) + offset)[0]
    if len(idx4.shape) > 0 and idx4.shape[0] > 0:
        cov[1] += counts_edges[idx4[0], 1]

    return cov
def _verify_eqrm_flags(eqrm_flags):
    """
    Check that the values in eqrm_flags are consistent with how EQRM works.

    Args:
      eqrm_flags: A DictKeyAsAttributes instance.
    """
    if not allclose(eqrm_flags.atten_periods,
                    sort(eqrm_flags.atten_periods)):
        raise AttributeSyntaxError(
            "Syntax Error: Period values are not ascending")

    if eqrm_flags.save_hazard_map == True and eqrm_flags.is_scenario == True:
        raise AttributeSyntaxError(
            'Cannot save the hazard map for a scenario.')

    if eqrm_flags.atten_variability_method == 1 and \
            eqrm_flags.run_type == 'risk_csm':
        raise AttributeSyntaxError(
            'Cannot use spawning when doing a risk_csm simulation.')

    if eqrm_flags.amp_variability_method == 1:
        raise AttributeSyntaxError(
            'Cannot spawn on amplification.')

    if eqrm_flags.event_set_handler == 'load' and \
            eqrm_flags.event_set_load_dir is None:
        raise AttributeSyntaxError(
            'event_set_load_dir must be set if event_set_handler is load.')

    if eqrm_flags.event_set_handler == 'load' and \
            not os.path.exists(eqrm_flags.event_set_load_dir):
        raise AttributeSyntaxError(
            'event_set_load_dir %s must exist if event_set_handler is load.' %
            eqrm_flags.event_set_load_dir)

    # Only do these checks if different from output_dir
    # (output_dir gets created if not exists)
    if eqrm_flags.data_array_storage != eqrm_flags.output_dir and \
            not os.path.exists(eqrm_flags.data_array_storage):
        raise AttributeSyntaxError(
            'data_array_storage %s must exist and be accessible from %s' %
            (eqrm_flags.data_array_storage, socket.gethostname()))

    if eqrm_flags.fault_source_tag is None and \
            eqrm_flags.zone_source_tag is None:
        raise AttributeSyntaxError(
            'Either fault_source_tag or zone_source_tag must be set.')

    # Check to see if a parameter is defined that is incompatible with the
    # defined run_type
    # Note: _add_default_values should have already dealt with adding
    # incompatible defaults
    for param in CONV_NEW:
        if not is_param_compatible(param, eqrm_flags):
            raise AttributeSyntaxError(
                "Attribute " + param['new_para'] +
                " not compatible with run_type=" + eqrm_flags['run_type'] +
                " - compatible run_type values are " + str(param['run_type']))
Example #35
 def __init__(self, name, data):
     assert isinstance(name, six.string_types)
     assert isinstance(data, (list, tuple))
     self.name = name
     self.data = data
     self.by_date = dict([(datetime_from_date(dateutil.parser.parse(d)), v)
                          for (d, v) in self.data])
     self.sorted = sort(array(list(self.by_date.keys())))
Example #36
def calculateEV_montecarlo(fFn, zRV, nDraws=10000):
	global g_montecarloDraws	
	if (not (zRV, nDraws) in g_montecarloDraws):
		g_montecarloDraws[(zRV, nDraws)] = scipy.sort(zRV.rvs(size=nDraws))
	draws = g_montecarloDraws[(zRV, nDraws)]		
	vals = map(fFn, draws)
	EV = scipy.mean(vals)
	return EV
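A self-contained sketch of the same idea, assuming a standard-normal random variable from scipy.stats: sorted draws are used to estimate an expectation.

import scipy
import scipy.stats

zRV = scipy.stats.norm(loc=0.0, scale=1.0)
draws = scipy.sort(zRV.rvs(size=10000))
EV = scipy.mean([x**2 for x in draws])   # estimate of E[z^2], close to 1 for a standard normal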
Example #37
def calculateEV_montecarlo2(grid, fArray, zRV, nDraws=10000):
	global g_montecarloDraws	
	if (not (zRV, nDraws) in g_montecarloDraws):
		g_montecarloDraws[(zRV, nDraws)] = scipy.sort(zRV.rvs(size=nDraws))
	draws = g_montecarloDraws[(zRV, nDraws)]		
	fn = linterp.LinInterp1D(grid, fArray)
	EV = fn.applySorted(draws) / nDraws
	return EV
Example #38
def termFrequencyMatrix(directory, stopwords, termlist):
    """ The student must code this. """
    filenames = sp.sort(os.listdir(directory))
    frequencyMatrix = sp.zeros((len(termlist), len(filenames)))
    for i in xrange(len(filenames)):
        frequencyMatrix[:, i] = termVector(directory + filenames[i], stopwords,
                                           termlist)
    return frequencyMatrix.astype(float)
    def __init__(self, N, vectors, coverage_ratio=0.2):
        """
        Performs exact nearest neighbour search on the data set.

        vectors can either be a numpy matrix with all the vectors
        as columns OR a python array containing the individual
        numpy vectors.
        """
        # We need a dict from vector string representation to index
        self.vector_dict = {}
        self.N = N
        self.coverage_ratio = coverage_ratio

        # Get numpy array representation of input
        self.vectors = numpy_array_from_list_or_numpy_array(vectors)

        # Build map from vector string representation to vector
        for index in range(self.vectors.shape[1]):
            self.vector_dict[self.__vector_to_string(
                self.vectors[:, index])] = index

        # Get transposed version of vector matrix, so that the rows
        # are the vectors (needed by cdist)
        vectors_t = numpy.transpose(self.vectors)

        # Determine the indices of query vectors used for comparance
        # with approximated search.
        query_count = numpy.floor(self.coverage_ratio *
                                  self.vectors.shape[1])
        self.query_indices = []
        for k in range(int(query_count)):
            index = numpy.floor(k * (self.vectors.shape[1] / query_count))
            index = min(index, self.vectors.shape[1] - 1)
            self.query_indices.append(int(index))

        print('\nStarting exact search (query set size=%d)...\n' % query_count)

        # For each query vector get radius of closest N neighbours
        self.nearest_radius = {}
        self.exact_search_time_per_vector = 0.0

        for index in self.query_indices:

            v = vectors_t[index, :].reshape(1, self.vectors.shape[0])
            exact_search_start_time = time.time()
            D = cdist(v, vectors_t, 'euclidean')

            # Get radius of closest N neighbours
            self.nearest_radius[index] = scipy.sort(D)[0, N]

            # Save time needed for exact search
            exact_search_time = time.time() - exact_search_start_time
            self.exact_search_time_per_vector += exact_search_time

        print('\nDone with exact search...\n')

        # Normalize search time
        self.exact_search_time_per_vector /= float(len(self.query_indices))
Beispiel #41
0
def quantify_exon_skip(event, gene, counts_segments, counts_edges):

    # cov[0] collects evidence for exon inclusion (inner exon coverage plus the
    # two flanking intron confirmations); cov[1] is the skipping junction count
    cov = sp.zeros((2, ), dtype='float')
    sg = gene.splicegraph
    segs = gene.segmentgraph

    seg_lens = segs.segments[1, :] - segs.segments[0, :]
    seg_shape = segs.seg_edges.shape
    order = 'C'
    offset = 0

    ### find exons corresponding to event
    idx_exon_pre = sp.where((sg.vertices[0, :] == event.exons2[0, 0])
                            & (sg.vertices[1, :] == event.exons2[0, 1]))[0]
    idx_exon = sp.where((sg.vertices[0, :] == event.exons2[1, 0])
                        & (sg.vertices[1, :] == event.exons2[1, 1]))[0]
    idx_exon_aft = sp.where((sg.vertices[0, :] == event.exons2[2, 0])
                            & (sg.vertices[1, :] == event.exons2[2, 1]))[0]

    ### find segments corresponding to exons
    seg_exon_pre = sp.sort(sp.where(segs.seg_match[idx_exon_pre, :])[1])
    seg_exon_aft = sp.sort(sp.where(segs.seg_match[idx_exon_aft, :])[1])
    seg_exon = sp.sort(sp.where(segs.seg_match[idx_exon, :])[1])

    # get inner exon cov
    cov[0] = sp.sum(counts_segments[seg_exon] * seg_lens[seg_exon]) / sp.sum(
        seg_lens[seg_exon])

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # exon_pre_exon_conf
    idx1 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [seg_exon_pre[-1], seg_exon[0]], seg_shape, order=order) + offset)[0]
    cov[0] += counts_edges[idx1, 1]
    # exon_exon_aft_conf
    idx2 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [seg_exon[-1], seg_exon_aft[0]], seg_shape, order=order) + offset)[0]
    cov[0] += counts_edges[idx2, 1]
    # exon_pre_exon_aft_conf
    idx3 = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [seg_exon_pre[-1], seg_exon_aft[0]], seg_shape, order=order) +
                    offset)[0]
    cov[1] = counts_edges[idx3, 1]

    return cov
def entropy2(values):
    """Calculate the entropy of vector values.
    
    values will be flattened to a 1d ndarray."""
    
    values = sp.asarray(values).flatten()
    # run-length trick: the positions (+1) where the sorted vector changes value
    # bound the runs of equal values; differences of these boundaries are the
    # counts of each distinct value
    edges = sp.diff(sp.sort(values)).nonzero()[0] + 1
    p = sp.diff(sp.c_[0, edges, values.size]) / float(values.size)
    H = (p * sp.log2(p)).sum()
    return -H
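
A quick sanity check of the function above (sketch; sp is assumed to be scipy, as in the other examples):

import scipy as sp

print(entropy2(sp.array([0, 0, 1, 1])))  # -> 1.0 (two equally likely values)
print(entropy2(sp.array([0, 0, 0, 1])))  # -> ~0.811 (p = [0.75, 0.25])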
Beispiel #43
0
def dict_arrsort(in_dict):
    """sort all arrays in a dictionary ["""

    try:
        for k in in_dict.keys():
            if isinstance(in_dict[k], sp.ndarray):
                in_dict[k] = sp.sort(in_dict[k])
    finally:
        return in_dict
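
Usage sketch; only ndarray values get sorted, everything else is returned untouched:

import scipy as sp

d = dict_arrsort({'a': sp.array([3, 1, 2]), 'b': 'not an array'})
# d['a'] -> array([1, 2, 3]); d['b'] is unchanged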
Beispiel #44
0
 def __init__(self, name, data):
     assert isinstance(name, six.string_types)
     assert isinstance(data, (list, tuple))
     self.name = name
     self.data = data
     self.by_date = dict(
         [(datetime_from_date(dateutil.parser.parse(d)), v) for (d, v) in self.data]
     )
     self.sorted = sort(array(list(self.by_date.keys())))
Beispiel #45
0
def filterBid(allids, sbids):
    '''
    Takes a superset ID list (allids) and a subset ID list (sbids);
    returns the indices of allids whose entries appear in sbids.
    '''
    if sp.unique(sbids).shape[0] != sbids.shape[0]:
        warnings.warn("Subset ids are not unique: making them unique")
        sbids = sp.unique(sbids)
    if sp.unique(allids).shape[0] != allids.shape[0]:
        warnings.warn("Superset ids are not unique: making them unique")
        allids = sp.unique(allids)
    if sp.sum(sp.sort(allids) == allids) != allids.shape[0]:
        warnings.warn("Superset ids are not sorted: sorting them")
        allids = sp.sort(allids)
    if sp.sum(sp.sort(sbids) == sbids) != sbids.shape[0]:
        warnings.warn("Subset ids are not sorted: sorting them")
        sbids = sp.sort(sbids)
    return sp.where(sp.in1d(allids, sbids))[0]
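
A small sketch of the matching behaviour (the IDs are made up):

import scipy as sp

allids = sp.array([10, 20, 30, 40])
sbids = sp.array([20, 40])
idx = filterBid(allids, sbids)
# idx -> array([1, 3]); allids[idx] recovers the entries shared with sbids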
Beispiel #46
0
 def generateNoteLength(self, tempo):
   length = 60 / tempo * self.fs * 4
   note_length = sp.array([2**i for i in range(5)]) / 4
   note_length *= length
   note_huten = sp.array(
       [note_length[i-1]+note_length[i] for i in range(1, 4)])
   note_length = sp.r_[note_length, note_huten]
   self.note_length = sp.sort(note_length)
   self.note_name = ['16', '16.', '8', '8.', '4', '4.', '2', '2.', '1']
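
Worked numbers, assuming tempo=120 BPM and self.fs=44100 Hz (both values illustrative): a whole note is 60/120*44100*4 = 88200 samples, so the sorted note lengths come out as [22050, 44100, 66150, 88200, 132300, 176400, 264600, 352800] samples.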
def ECDF2(seq):
    """
    Calculate the Empirical Cumulative Distribution Function (ECDF) of a sequence 'seq'.
    """
    N=len(seq)
    sseq=sp.sort(seq)
    ranks = sp.stats.rankdata(sseq)
    ecdf=ranks/(N+1)
    return ecdf
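
A quick check (sketch); sp.stats.rankdata requires an explicit import of scipy.stats:

import scipy as sp
import scipy.stats

print(ECDF2([4.0, 1.0, 3.0, 2.0]))  # -> [0.2, 0.4, 0.6, 0.8], i.e. ranks 1..4 divided by N+1=5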
Beispiel #48
0
def dict_arrsort(in_dict):
    """sort all arrays in a dictionary ["""

    try:
        for k in in_dict.keys():
            if isinstance(in_dict[k], sp.ndarray):
                in_dict[k] = sp.sort(in_dict[k])
    finally:
        return in_dict
Beispiel #49
0
    def __init__(self, data, sky, wave, wave2, skymodel):
        """ Plot data """
        self.data = plt.plot(wave, data, c='b')[0]
        self.sky = plt.plot(wave2, sky, c='gray')[0]
        self.canvas = self.data.get_figure().canvas
        self.ax = self.data.get_axes()
        self.xdata = self.data.get_xdata().copy()
        self.start = [
            data.copy(),
            sky.copy(),
            wave.copy(),
            wave2.copy(), skymodel
        ]
        self.skymodel = skymodel
        """ Get metadata (ie line locations) for arcs """
        data = self.data.get_ydata()
        self.datapeaks = ndimage.maximum_filter(data, 9)
        tmp = scipy.sort(data)
        thresh = tmp[int(tmp.size * 0.95)]
        cond = (data == self.datapeaks) & (data > thresh)
        self.datapeaks = scipy.where(cond)[0]
        self.datasel = self.datapeaks * 0
        self.datalines = []
        for peak in self.datapeaks:
            l = plt.axvline(self.xdata[peak], c='k')
            self.datalines.append(l)

        self.spec = self.data
        self.peaks = self.datapeaks
        self.selected = self.datasel
        self.lines = self.datalines
        """ Set useful flags """
        self.domotion = False
        self.origx = None
        self.soln = None
        self.pick = False
        self.fitlines = None
        self.keyid = self.canvas.mpl_connect('key_press_event', self.key_press)

        self.connect()

        print """
Mouse Controls:
    - left button drags single lines (rescales spectrum!)
    - middle button drags all lines (or exits from pan/zoom modes)
    - right button selects/deselects lines
Keyboard Commands:
    a - add new line (use mouse to select the line)
    m - fit a polynomial to the blue `solution'
    d - optimize the blue fit to the gray model (like m, but optimizes too)
    w - write the current state to disk
    r - read a saved state
    n - reset to the initial state
    q - quit (performs an `m' fit if no fit has been applied yet)
"""
        plt.show()
def runFitAllParallelWorker(fileNumString,endTime=None,verbose=True):
    """
    Each worker node runs this function to look for and perform work.
    
    endTime (None)      : Stop work if endTime hours (wall time) 
                          have elapsed when completing a work unit.  
                          If None, continue indefinitely.
    """

    # check that the fitProbData file exists
    if not fileNumString+"_fitProbData.dat" in os.listdir('.'):
        raise Exception, "fitProbData database file not found: "+str(fitProbDatFilename)

    # 9.24.2013 make sure SloppyCell C compiling is working
    if not testCcompiling():
        raise Exception, "SloppyCell C compiling not working."

    if endTime is None: endTime = scipy.inf
    startWallTime = time.time()
    elapsedTimeHours = 0

    while elapsedTimeHours < endTime:
      fitProbData = loadFitProbData(fileNumString)
      saveFilename = fitProbData.values()[0]['saveFilename']

      numTimepointsList = scipy.sort(fitProbData.keys())

      # () find a (condition,Np,model) triplet to work on
      conditioni,numTimepointsi,modelj = assignWork(fileNumString)
      numTimepoints = numTimepointsList[numTimepointsi]
      fitProb = loadFitProb(saveFilename,fileNumString,conditioni,numTimepoints)
      
      if verbose:
          print "runFitAllParallelWorker: Assigned work: condition",conditioni,\
            ", numTimepoints",numTimepoints,", model index",modelj
      
      # set up smallerBestSeenParams
      if (numTimepointsi > 0) and \
         (getState(fitProbData,conditioni,numTimepointsi-1,modelj) == 'finished'):
        smallerFitProb = loadFitProb(saveFilename,fileNumString,conditioni,
                                     numTimepointsList[numTimepointsi-1])
        fitProb.smallerBestParamsDict = paramsDict(smallerFitProb)
      
      # fit the single model
      fitProb.fitAll(maxNumFit=modelj+1)
      
      # save the result in the individual fitProbDict file
      saveFitProb(fitProb,saveFilename,fileNumString,conditioni,numTimepoints)

      # save the result in the more general fitProbData file
      updateFitProbData(fitProb,fileNumString,conditioni,numTimepoints,modelj)

      if verbose:
          print "runFitAllParallelWorker: Finished work."

      elapsedTimeHours = (time.time() - startWallTime)/3600.
Beispiel #51
0
def measure(file,num=25):
	f = pyfits.open(file)

	data = f[0].data.astype(scipy.float64)/1000.
	size = data.size
	wave = spectools.wavelength(file)
	sorted = scipy.sort(data)
	zero = sorted[size/20:size/10].mean()
	sigma = sorted[size/20:size/10].std()

	thresh = zero+100*sigma

	count = 0
	search = data.copy()

	vals = scipy.zeros(num)
	place = scipy.zeros(num)
	while count<num:
		max = search.max()
		if max<thresh:
			break

		pos = search.argmax()


		search[pos-5:pos+6] = 0.

		if pos<5 or pos+6>size:
			continue
		fitdata = scipy.zeros((11,2))
		fitdata[:,0] = wave[pos-5:pos+6]
		fitdata[:,1] = data[pos-5:pos+6]

		par = scipy.zeros(4)
		par[1] = max
		par[2] = wave[pos]
		par[3] = wave[pos]-wave[pos-1]

		fit,chi2 = special_functions.ngaussfit(fitdata,par)
		if chi2>4:
			continue
		model = special_functions.ngauss(fitdata[:,0],fit)
		pylab.plot(wave[pos-5:pos+6],model)

		feature = wave[pos]
		width = fit[3]*299800/feature

		vals[count] = width
		place[count] = feature

		count += 1
	args = place.argsort()
	vals = vals[args]
	place = place[args]

	return scipy.median(vals)
Beispiel #52
0
def g2ig(g):
    """
    Converts our graph representation to an igraph graph for plotting.
    """
    t = scipy.where(CG2adj(g) == 1)
    l = zip(t[0], t[1])
    ig = igraph.Graph(l, directed=True)
    ig.vs["name"] = scipy.sort([u for u in g])
    ig.vs["label"] = ig.vs["name"]
    return ig
def makeFpdLean(fpd):
    """
    Modify in place to create a stripped-down version of fpd 
    that doesn't include the models.
    """
    for N in scipy.sort(fpd.keys()):
        fp = fpd[N]
        for f in fp.fittingProblemList:
            f.fittingModelDict = {}
            f.fittingModelList = []
def ECDF(seq):
    """
    Calculate the Empirical Cumulative Distribution Function (ECDF) of a sequence 'seq'.

    A scipy interpolation object is returned.
    """
    N=len(seq)
    sseq=sp.sort(seq)
    ecdf=sp.linspace(1./N,1,N)
    return interp1d(sseq,ecdf,bounds_error=False)
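
A short usage sketch; interp1d is assumed to come from scipy.interpolate:

import scipy as sp
from scipy.interpolate import interp1d

F = ECDF([1.0, 2.0, 3.0, 4.0])
print(F(2.5))  # -> 0.625, interpolated between F(2)=0.5 and F(3)=0.75
print(F(0.5))  # -> nan; outside the observed range, since bounds_error=False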
def entropy2(values):
    """Calculate the entropy of vector values.
    
    values will be flattened to a 1d ndarray."""
    
    values = sp.asarray(values).flatten()
    # boundaries (+1) of runs of equal values in the sorted vector; their
    # differences give the count of each distinct value
    edges = sp.diff(sp.sort(values)).nonzero()[0] + 1
    p = sp.diff(sp.c_[0, edges, values.size]) / float(values.size)
    H = -((p * sp.log2(p)).sum())
    return H