def non_iter_ls_inv_stft(stft_object):
    stft_data = stft_object['stft']
    origSigSize = stft_object['origSigSize']
    num_rows, _, _ = origSigSize
    shift_length = stft_object['shift_length']
    len_each_section, num_rows_overlap, _, _ = stft_data.shape
    # TODO: Isn't this just num_rows in the very beginning?
    # total_new_elements = (num_rows_overlap - 1) * shift_length + len_each_section
    win_info = stft_object['win_info']
    wVec = win_info(len_each_section)
    wVecSq = wVec**2
    vecC = np_arange(1, num_rows_overlap * shift_length, step=shift_length)
    # vecC = range(0, num_rows_overlap*shift_length-1, shift_length)
    DlsArr = np_zeros((num_rows, ))
    for j in vecC:
        tmpArr = np_arange(j - 1, len_each_section + j - 1)
        # tmpArr = np_arange(j, len_each_section+j)
        DlsArr[tmpArr] += wVecSq
    # DlsArrInv = 1/DlsArr
    invFT = math_sqrt(len_each_section) * np_ifft(stft_data, axis=0)
    invFT_real = invFT.real
    invFT *= wVec[:, np_newaxis, np_newaxis, np_newaxis]
    yEst = np_zeros(origSigSize)
    for index, j in enumerate(vecC):
        tmpArr = np_arange(j - 1, len_each_section + j - 1)
        yEst[tmpArr, :] += invFT_real[:, index, :]
    # sigOut = yEst * DlsArrInv[:, np_newaxis, np_newaxis]
    sigOut = yEst / DlsArr[:, np_newaxis, np_newaxis]
    return sigOut
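# Hedged note (added): the aliased names above are assumed to be one-to-one
# bindings onto NumPy and the standard library (the original module header is
# not shown), e.g.:
from math import sqrt as math_sqrt
from numpy import arange as np_arange, newaxis as np_newaxis, zeros as np_zeros
from numpy.fft import ifft as np_ifft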
Example #2
def _modeCheck(Ser1):
    c = np_bincount(Ser1)
    # index of the mode (most frequent value)
    i = np_argmax(c)
    rule = (i - 2 > Ser1) | (i + 2 < Ser1)
    index = np_arange(Ser1.shape[0])[rule]
    return index
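# Usage sketch (added for illustration), assuming Ser1 is a non-negative
# integer array as np_bincount requires:
from numpy import arange as np_arange, argmax as np_argmax, array as np_array, bincount as np_bincount

data = np_array([5, 5, 6, 5, 4, 12, 5, 0])  # mode is 5
print(_modeCheck(data))                     # -> [5 7]: positions of 12 and 0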
Example #3
def parse_matrix_part(matrix, szSub, ovSub):
    assert matrix.ndim == 3
    assert np_ndim(szSub) == 1
    assert len(szSub) == 3
    assert np_ndim(ovSub) == 1
    assert len(ovSub) == 3

    matrix_shape = np_asarray(matrix.shape, dtype=int)
    len_each_section, _, _ = szSub
    shift_length, _, _ = ovSub

    len_each_section_range = np_arange(len_each_section)

    num_windows_shape = np_ceil((matrix_shape - szSub + 1) / ovSub).astype(int)
    num_rows_overlap, num_elements, num_beams = num_windows_shape
    result_matrix = np_zeros((np_prod(szSub), np_prod(num_windows_shape)))
    cnt = 0
    for i in range(num_beams):
        for j in range(num_elements):
            for k in range(num_rows_overlap):
                index_1 = len_each_section_range + k * shift_length
                index_2 = j
                index_3 = i
                tmp = matrix[index_1, index_2, index_3]
                result_matrix[:, cnt] = tmp
                cnt += 1

    return result_matrix
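# Small illustration (added; the np_* aliases inside parse_matrix_part are
# assumed bound to the matching NumPy functions). Each output column is one
# length-4 window shifted by 2 samples along the first axis:
import numpy as np

signal = np.arange(24, dtype=float).reshape(8, 3, 1)  # (samples, elements, beams)
windows = parse_matrix_part(signal, [4, 1, 1], [2, 1, 1])
print(windows.shape)  # (4, 9): ceil((8-4+1)/2) = 3 windows x 3 elements x 1 beam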
Example #4
def _two_sigma(Ser1):
    '''
    Ser1: a single column of the input DataFrame.
    '''
    rule = (Ser1.mean() - 2 * Ser1.std() >
            Ser1) | (Ser1.mean() + 2 * Ser1.std() < Ser1)
    index = np_arange(Ser1.shape[0])[rule]
    return index
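# Usage sketch (added), assuming Ser1 is a pandas Series so that .mean(),
# .std() and positional boolean indexing behave as used above:
import pandas as pd
from numpy import arange as np_arange

s = pd.Series([10, 11, 9, 10, 50, 10, 11])
print(_two_sigma(s))  # -> [4]: position 4 lies outside mean +/- 2*std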
Example #5
def invert_index_list(indexes, length):
    '''
    Inverts indexes list
    indexes: List[Int] of Ndarray flat numpy array
    length: Int. Length of the base list
    '''
    mask = np_ones(length, dtype='bool')
    mask[indexes] = False
    inverted_indexes = np_arange(length)[mask]
    return inverted_indexes
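# Usage sketch (added for illustration):
from numpy import arange as np_arange, ones as np_ones

print(invert_index_list([0, 2, 3], 6))  # -> [1 4 5]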
Example #6
def fan_regular_pols(np_verts,
                     np_pols,
                     np_distances,
                     np_faces_id,
                     custom_normals,
                     index_offset=0,
                     use_custom_normals=False,
                     output_old_v_id=True,
                     output_old_face_id=True,
                     output_pols_groups=True):

    pols_number = np_pols.shape[0]
    pol_sides = np_pols.shape[1]
    v_pols = np_verts[np_pols]  #shape [num_pols, num_corners, 3]

    # the scalar comparison is guarded so a multi-element all-zero distances
    # array does not raise on its ambiguous truth value
    if (len(np_distances) > 1 and np.any(np_distances != 0)) or (
            len(np_distances) == 1 and np_distances[0] != 0):
        if use_custom_normals:
            normals = custom_normals
        else:
            normals = np_faces_normals(v_pols)
        average = (np.sum(v_pols, axis=1) / pol_sides +
                   normals * np_distances[:, np_newaxis])  # shape [num_pols, 3]
    else:
        average = np.sum(v_pols, axis=1) / pol_sides

    idx_offset = len(np_verts) + index_offset
    new_idx = np_arange(idx_offset, pols_number + idx_offset)
    new_pols = np.zeros([pols_number, pol_sides, 3], dtype=int)
    new_pols[:, :, 0] = np_pols
    new_pols[:, :, 1] = np_roll(np_pols, -1, axis=1)
    new_pols[:, :, 2] = new_idx[:, np_newaxis]

    old_vert_id = np_pols[:, 0].tolist() if output_old_v_id else []

    if output_old_face_id:
        old_face_id = np_repeat(np_faces_id[:, np_newaxis], pol_sides,
                                axis=1).tolist()
    else:
        old_face_id = []

    if output_pols_groups:
        pols_groups = np_repeat(1, len(new_pols) * pol_sides).tolist()
    else:
        pols_groups = []

    return (
        average.tolist(),
        new_pols.reshape(-1, 3).tolist(),
        old_vert_id,
        old_face_id,
        pols_groups,
    )
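# Illustrative call (added; zero distances keep np_faces_normals, which is not
# shown in this snippet, out of the code path). One quad is fanned into four
# triangles around its average point:
import numpy as np
from numpy import arange as np_arange, newaxis as np_newaxis, repeat as np_repeat, roll as np_roll

verts = np.array([[0, 0, 0], [1, 0, 0], [1, 1, 0], [0, 1, 0]], dtype=float)
quads = np.array([[0, 1, 2, 3]])
new_verts, new_pols, _, _, _ = fan_regular_pols(
    verts, quads, np.zeros(1), np.array([0]), custom_normals=None)
print(new_verts)  # [[0.5, 0.5, 0.0]] -> the quad's centre becomes vertex 4
print(new_pols)   # [[0, 1, 4], [1, 2, 4], [2, 3, 4], [3, 0, 4]]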
Example #7
    def getBreakpointsByCardinality(self, cardinality):

        if cardinality not in self.breakpointsByCardinality:
            frac = 1.0 / cardinality
            list_percent = []
            for i_fl in np_arange(frac, 1.0, frac):
                list_percent.append(i_fl)
            self.breakpointsByCardinality[cardinality] = (
                np_array(norm.ppf(list_percent)) * self.std + self.mean)

        return self.breakpointsByCardinality[cardinality]
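# Standalone illustration (added): for cardinality 4 on a standard normal
# (self.mean = 0, self.std = 1) the cached breakpoints are the quartile
# boundaries of N(0, 1):
from numpy import arange as np_arange, array as np_array
from scipy.stats import norm

percents = np_arange(0.25, 1.0, 0.25)  # [0.25, 0.5, 0.75]
print(np_array(norm.ppf(percents)))    # approx [-0.674  0.     0.674]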
Example #8
def stft(signal, len_each_section, frac_overlap, padding, win_info=boxcar):
    shift_length = round(len_each_section * (1. - frac_overlap))

    _, num_elements, num_beams = signal.shape

    zeroCrct = 0

    wVec_rectwin = win_info(len_each_section + zeroCrct)
    wVec = wVec_rectwin[zeroCrct // 2:len(wVec_rectwin) - zeroCrct // 2]

    allOvrlp = parse_matrix_part(signal, [len_each_section, 1, 1],
                                 [shift_length, 1, 1])

    num_rows_overlap = allOvrlp.shape[1] // (num_elements * num_beams)

    newShape = [len_each_section, num_rows_overlap, num_elements, num_beams]

    subOvrlp = allOvrlp.reshape(newShape,
                                order="F")  # Matlab defaults to Fortran

    startLocs = np_arange(num_rows_overlap * shift_length, step=shift_length)

    winOvrlp = subOvrlp * wVec[:, np_newaxis, np_newaxis, np_newaxis]

    stft_array = np_fft(winOvrlp, padding, axis=0)

    freq = np_arange(padding) / padding

    out = {
        'stft': stft_array,
        'freqs': freq,
        'startOffsets': startLocs,
        'len_each_section': len_each_section,
        'padding': padding,
        'win_info': win_info,
        'frac_overlap': frac_overlap,
        'shift_length': shift_length,
    }

    return out
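# Round-trip sketch (added; an illustration under stated assumptions, not part
# of the original source). non_iter_ls_inv_stft above expects an 'origSigSize'
# key that this forward transform does not store, so it is added by hand; with
# numpy's unscaled fft/ifft pair the reconstruction carries the
# sqrt(len_each_section) factor applied inside the inverse:
import numpy as np
from math import sqrt as math_sqrt
from numpy import arange as np_arange, newaxis as np_newaxis, zeros as np_zeros
from numpy.fft import fft as np_fft, ifft as np_ifft
from scipy.signal.windows import boxcar  # older SciPy exposed scipy.signal.boxcar

signal = np.random.randn(8, 2, 1)  # (samples, elements, beams)
out = stft(signal, len_each_section=4, frac_overlap=0.5, padding=4)
out['origSigSize'] = signal.shape  # required by non_iter_ls_inv_stft
recon = non_iter_ls_inv_stft(out)
print(np.allclose(recon, math_sqrt(4) * signal))  # True under these assumptions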
Example #9
def naive_search_with_np(n):
    prime_numbers = np_array([2])
    i = prime_numbers[0]
    while len(prime_numbers) != n:
        i += 1
        i_is_prime = True
        for d in np_arange(2, i):
            if i % d == 0:
                i_is_prime = False
                break
        if i_is_prime:
            prime_numbers = np_append(prime_numbers, i)

    print('Bytes:', getsizeof(prime_numbers))
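# Usage (added for illustration):
from sys import getsizeof
from numpy import append as np_append, arange as np_arange, array as np_array

naive_search_with_np(5)  # finds [2 3 5 7 11] and prints their size in bytes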
def centers(c, side, edges, catan):
    """
    Input: (center,length of the inner side, number of edges, type of CATAN)
    Output: numpy array of "many" points
    """
    alpha = 2.0 * pi / edges

    if catan == "CATAN_ext":
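        # small_side is assumed to be defined at module level (not shown here)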
        i_y = int(1 * small_side / (3 * sqrt(3) - 1.0) + 0.5) * 1.3
        i_x = int(i_y * cos(alpha) + 0.5) * 2.3
        #i_y = int(1*small_side/(3*sqrt(3)-1.0)+0.5)*1.4
        #i_x = int(i_y*cos(alpha)+0.5)*2.4
        many, j, j_x = 30, 7, 3
    else:
        i_y = int(1.5 * small_side / (3 * sqrt(3) - 1.0) + 0.5) * 1.05
        i_x = int(i_y * cos(alpha) + 0.5) * 2.3
        many, j, j_x = 19, 5, 2

    centers = np_zeros((many, 2))
    raw_centers = np_zeros((many, 3))
    k = 0
    for j_y in np_arange(j) - j_x:
        if j == 5:
            j += 1
        for j_j in np_arange(j - abs(j_y) - 1):
            g = j - abs(j_y) - 1
            f = 0.5 * (g % 2 == 0)
            j_k = j_j - g / 2 + f
            #print j-abs(j_y)-1,(j_j,j_y),j_k
            centers[k, :] = [
                int(c[0] + i_x * j_k + 0.5),
                int(c[1] + i_y * j_y + 0.5)
            ]
            raw_centers[k, :] = [j - abs(j_y) - 1, j_j, j_y]
            k += 1
    #print centers
    return centers, raw_centers
def process_impropers(atom,
                      dihe_list,
                      func="imp",
                      verbose=False,
                      forzmatrix=False,
                      dihed_count=None):

    alist = [
        atom,
        atom.get_atom_list()[0],
        atom.get_atom_list()[1],
        atom.get_atom_list()[2]
    ]
    looked_up_param, centeratom_name, dihed_identifier = return_params(
        func, alist)
    if looked_up_param is None:  # dihed was not specified
        pass
    else:  # was specified. Now, check whether it was given double
        params = looked_up_param.split(",")
        if len(params) == 1:
            d = Dihedral(*alist,
                         func=func,
                         param=params[0],
                         was_input=False,
                         is_multiparam=False,
                         has_siblings=False,
                         dihed_idx=dihed_count)
            dihed_count += 1
            dihe_list.append(d)
        elif len(params) > 1:
            siblings = np_arange(start=dihed_count,
                                 stop=dihed_count + len(params))
            if verbose:
                print_verbose_dihedral_message(func, alist, params)
            for param in params:
                d = Dihedral(
                    *alist,
                    func=func,
                    param=param,
                    was_input=True,
                    is_multiparam=True,
                    around_center_atom=atom.get_atomtype() == centeratom_name,
                    siblings=siblings,
                    dihed_idx=dihed_count)
                dihed_count += 1
                dihe_list.append(d)
        if not forzmatrix:
            pass
    return dihe_list, dihed_count
Example #12
def Skew(x,y,dat,noise=3):
    interp=sp_interp2d(x,y,dat)
    dx=x[1]-x[0]
    ySkew=np_arange(np_amin(y)-dx*x.size,np_amax(y),dx)
    DAT=np_empty((ySkew.size,x.size))

    yMax=np_amax(y); yMin=np_amin(y)
    for i in range(ySkew.size):
        for j in range(x.size):
            if ySkew[i]+j*dx > yMax or ySkew[i]+j*dx < yMin:
                DAT[i,j]=(np_rand(1)-0.5)*noise
            else:
                DAT[i,j]=interp(x[j],ySkew[i]+j*dx)

    return ySkew,DAT
Example #13
    def plot(self, unchanged, passive, active, xticklabels, ylabel):
        """Create stacked bar plot."""

        self.fig.clear()
        self.fig.set_size_inches(self.options.width, self.options.height)
        axis = self.fig.add_subplot(111)

        ind = np_arange(len(unchanged))
        width = 0.7

        unchanged = np_array(unchanged)
        passive = np_array(passive)
        active = np_array(active)

        p1 = axis.bar(ind, unchanged, width, color='#80b1d3')
        p2 = axis.bar(ind, passive, width, bottom=unchanged, color='#fdae6b')
        p3 = axis.bar(ind,
                      active,
                      width,
                      bottom=unchanged + passive,
                      color='#b3de69')

        axis.set_ylim([0, 100])
        axis.set_yticks(range(0, 101, 10))
        axis.set_ylabel(ylabel)

        axis.set_xticks(ind)
        axis.set_xticklabels(xticklabels)

        axis.yaxis.grid(True,
                        linestyle='-',
                        which='major',
                        color='lightgrey',
                        alpha=0.7,
                        zorder=1)
        axis.set_axisbelow(True)

        self.prettify(axis)

        axis.legend((p3[0], p2[0], p1[0]),
                    ('Active change', 'Passive change', 'Unchanged'),
                    fontsize=self.options.tick_font_size,
                    loc='upper left',
                    bbox_to_anchor=(1, 1),
                    frameon=False)

        #self.fig.tight_layout(pad=1.0, w_pad=0.1, h_pad=0.1)
        self.draw()
    def __init_matches(self):

        for match_type, var in [['qm', 'qualification_matches'], ['qf', 'quarter_final_matches'],
                                ['sf', 'semi_final_matches'], ['f', 'final_matches']]:
            num_matches = self.__count_matches(self.raw_matches, match_type)
            if num_matches != 0:
                # zero = range(num_matches)
                red_teams = np_zeros((num_matches,), np_object)
                blue_teams = np_zeros((num_matches,), np_object)
                blue_scores = np_zeros((num_matches,), np_object)
                red_scores = np_zeros((num_matches,), np_object)
                match_code = np_zeros((num_matches,), np_object)
                match_numbers = np_arange(1, num_matches + 1, 1)

                for match in self.raw_matches:
                    if match['comp_level'] == match_type:
                        match_num = match['match_number'] - 1

                        red_teams[match_num] = [np_int(match['alliances']['red']['teams'][0][3:]),
                                                np_int(match['alliances']['red']['teams'][1][3:]),
                                                np_int(match['alliances']['red']['teams'][2][3:])]

                        red_scores[match_num] = [-1 if match['alliances']['red']['score'] is None
                                                 else match['alliances']['red']['score'],
                                                 -1 if match['score_breakdown']['red']['auto'] is None
                                                 else match['score_breakdown']['red']['auto'],
                                                 -1 if match['score_breakdown']['red']['foul'] is None
                                                 else match['score_breakdown']['red']['foul']]

                        blue_teams[match_num] = [np_int(match['alliances']['blue']['teams'][0][3:]),
                                                 np_int(match['alliances']['blue']['teams'][1][3:]),
                                                 np_int(match['alliances']['blue']['teams'][2][3:])]

                        blue_scores[match_num] = [-1 if match['alliances']['blue']['score'] is None
                                                  else match['alliances']['blue']['score'],
                                                  -1 if match['score_breakdown']['blue']['auto'] is None
                                                  else match['score_breakdown']['blue']['auto'],
                                                  -1 if match['score_breakdown']['blue']['foul'] is None
                                                  else match['score_breakdown']['blue']['foul']]
                        match_code[match_num] = match['key']

                red_win = np_array(red_scores.tolist())[:, 0] > np_array(blue_scores.tolist())[:, 0]
                winner = np_array(['blue'] * len(red_win))
                winner[red_win] = 'red'

                self.__setattr__(var,
                                 np_rot90(np_array([[match_type] * num_matches, match_numbers, red_teams, blue_teams,
                                                    red_scores, blue_scores, winner, match_code], np_object))[::-1])
    def plot(self, both, isolate, env, xticklabels):
        """Create stacked bar plot."""
        
        self.fig.clear()
        self.fig.set_size_inches(self.options.width, self.options.height)
        axis = self.fig.add_subplot(111)
        
        ind = np_arange(len(both))
        width = 0.7
        
        both = np_array(both)
        isolate = np_array(isolate)
        env = np_array(env)

        p1 = axis.bar(ind, both, width, color='#80b1d3')
        p2 = axis.bar(ind, isolate, width, bottom=both, color='#fdae6b')
        p3 = axis.bar(ind, env, width, bottom=both+isolate, color='#b3de69')

        axis.set_ylim([0, 100])
        axis.set_yticks(range(0, 101, 10))
        axis.set_ylabel('Taxa (%)')
        
        axis.set_xticks(ind)
        axis.set_xticklabels(xticklabels)
        
        axis.yaxis.grid(True, 
                        linestyle='-', 
                        which='major', 
                        color='lightgrey', 
                        alpha=0.7, 
                        zorder=1)
        axis.set_axisbelow(True)

        self.prettify(axis)
        
        axis.legend((p3[0], p2[0], p1[0]), ('Exclusively MAGs and/or SAGs', 
                                            'Exclusively isolates', 
                                            'Isolate and environmental genomes'),
                                            fontsize=self.options.tick_font_size,
                                            loc='upper left', 
                                            bbox_to_anchor=(1, 1),
                                            frameon=False)

        #self.fig.tight_layout(pad=1.0, w_pad=0.1, h_pad=0.1)
        self.draw()
    def plot(self, plot_latinized, plot_placeholder, xticklabels):
        """Create stacked bar plot."""

        self.fig.clear()
        self.fig.set_size_inches(self.options.width, self.options.height)
        axis = self.fig.add_subplot(111)

        ind = np_arange(len(plot_latinized))
        width = 0.7

        plot_latinized = np_array(plot_latinized)
        plot_placeholder = np_array(plot_placeholder)

        p1 = axis.bar(ind, plot_latinized, width, color='#80b1d3')
        p2 = axis.bar(ind,
                      plot_placeholder,
                      width,
                      bottom=plot_latinized,
                      color='#fdae6b')

        axis.set_ylim([0, 100])
        axis.set_yticks(range(0, 101, 10))
        axis.set_ylabel('Taxa (%)')

        axis.set_xticks(ind)
        axis.set_xticklabels(xticklabels)

        axis.yaxis.grid(True,
                        linestyle='-',
                        which='major',
                        color='lightgrey',
                        alpha=0.7,
                        zorder=1)
        axis.set_axisbelow(True)

        self.prettify(axis)

        axis.legend((p2[0], p1[0]), ('Placeholder', 'Latinized'),
                    fontsize=self.options.tick_font_size,
                    loc='upper left',
                    bbox_to_anchor=(1, 1),
                    frameon=False)

        #self.fig.tight_layout(pad=1.0, w_pad=0.1, h_pad=0.1)
        self.draw()
def split_examples(X, y, percent_valid=PERCENT_VALID, percent_test=PERCENT_TEST):
    '''
    Split by target after selecting a single frequency, so X.shape is
    probably [24, 2377, 65, 2].
    '''
    # TODO: prove that the split indices for X match those for y.
    # ## Train, Valid, Test split
    if X.ndim != 4:
        raise ValueError('X.ndim should be 4, but X.shape is {}'.format(X.shape))
    num_examples_all = len(X)
    assert len(y) == num_examples_all
    nums_splits = [int((1 - percent_valid - percent_test) * num_examples_all),
                   int((1 - percent_test) * num_examples_all)]

    indices_original = np_arange(num_examples_all, dtype=int)
    indices_shuffled = np_random_permutation(indices_original)
    indices_train, indices_valid, indices_test = np_split(indices_shuffled, nums_splits) # pylint: disable=W0632
    X_train, X_valid, X_test = X[indices_train, :, :], X[indices_valid, :, :], X[indices_test, :, :]
    y_train, y_valid, y_test = y[indices_train, :, :], y[indices_valid, :, :], y[indices_test, :, :]
    return (X_train, X_valid, X_test), (y_train, y_valid, y_test)
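# Usage sketch (added; assumes the module-level PERCENT_VALID/PERCENT_TEST
# defaults exist). With 10 examples and 20%/20% splits, the shuffled indices
# divide 6/2/2:
import numpy as np
from numpy import arange as np_arange, split as np_split
from numpy.random import permutation as np_random_permutation

X = np.zeros((10, 6, 5, 2))
y = np.zeros((10, 6, 5))
(X_tr, X_va, X_te), (y_tr, y_va, y_te) = split_examples(
    X, y, percent_valid=0.2, percent_test=0.2)
print(len(X_tr), len(X_va), len(X_te))  # 6 2 2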
Example #18
    def setUpClass(cls):
        super(TestTsManagement, cls).setUpClass()

        # ts_one=
        # [[ 0  1]
        #  [ 2  3]
        #  [ 4  5]
        #   ...
        #  [92 93]
        #  [94 95]
        #  [96 97]
        #  [98 99]]
        cls.ts_one = np_arange(100).reshape(50, 2)

        # ts_two=
        # [[198 199]
        #  [196 197]
        #  [194 195]
        #  [192 193]
        #    ...
        #  [104 105]
        #  [102 103]
        #  [100 101]]
        #
        cls.ts_two = cls.ts_one[50::-1] + 100

        # reverse sort of ts_two:
        #
        # ts_inverse=
        # [[100 101]
        #  [102 103]
        #  [104 105]
        #    ...
        #  [194 195]
        #  [196 197]
        #  [198 199]]
        #
        cls.ts_inverse = cls.ts_two[::-1]
Example #19
    def deConvolve(self,G_w,noise_dT=3,noise_avg=3,fMax=2.4):
        self.reGrid(noise_dT=noise_dT,noise_avg=noise_avg)
        self.tPumpDeconv=np_arange(np_amin(self.tPump),np_amax(self.tPump),
                                   self.tTHz[1]-self.tTHz[0])
        loc=np_amin(np_where(self.f >= fMax))
        for i in range(self.tPumpSkew.size):
            self.dTSkewFFT[i,:loc]=self.dTSkewFFT[i,:loc]/G_w[:loc]
            self.avgSkewFFT[i,:loc]=self.avgSkewFFT[i,:loc]/G_w[:loc]

        self.dTskew=np_irfft(self.dTSkewFFT,axis=1)
        self.avgSkewFFT=np_irfft(self.avgSkewFFT,axis=1)

        self.dTdeconv=unSkew(self.tTHz,self.tPump,self.tPumpSkew,self.dTskew)
        self.avgDeconv=unSkew(self.tTHz,self.tPump
                              ,self.tPumpSkew,self.avgSkewFFT)
        self.refDeconv=self.avgDeconv-self.dTdeconv
        self.pumpDeconv=self.avgDeconv+self.dTdeconv

        self.refFFTdeconv=np_rfft(self.refDeconv,axis=1)
        self.pumpFFTdeconv=np_rfft(self.pumpDeconv,axis=1)
        self.transDeconv=self.pumpFFTdeconv/self.refFFTdeconv
                                
        return
Example #20
    def run(self, rank, input_tree_dir, full_tree_file, derep_tree_file,
            taxonomy_file, output_prefix, min_children, title):

        # determine named clades in full tree
        named_clades = set()
        tree = dendropy.Tree.get_from_path(full_tree_file,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        for node in tree.preorder_node_iter():
            if node.label:
                taxonomy = node.label.split(';')
                named_clades.add(taxonomy[-1].strip().split(':')[-1])

        print('Identified %d named clades in full tree.' % len(named_clades))

        # determine named groups with at least the specified number of children
        print('Determining taxa with sufficient named children lineages.')
        taxon_children = defaultdict(set)
        groups = defaultdict(list)
        print(taxonomy_file)
        for line in open(taxonomy_file):
            line_split = line.replace('; ', ';').split()
            genome_id = line_split[0]
            taxonomy = [x.strip() for x in line_split[1].split(';')]

            if len(taxonomy) > rank + 1:
                taxon_children[taxonomy[rank]].add(taxonomy[rank + 1])

            if len(taxonomy) > rank:
                groups[taxonomy[rank]].append(genome_id)

        groups_to_consider = set()
        for taxon, children_taxa in taxon_children.items():
            if len(children_taxa) >= min_children and taxon in named_clades:
                groups_to_consider.add(taxon)

        print('Assessing distribution over %d groups.' % len(
            groups_to_consider))

        # calculate RED for full tree
        print('')
        print('Calculating RED over full tree.')
        tree = dendropy.Tree.get_from_path(full_tree_file,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)
        full_rel_dist, _full_dist_components, polyphyletic = self.rel_dist_to_specified_groups(
            tree, groups_to_consider, groups)
        if len(polyphyletic) > 0:
            print('')
            print('[Warning] Full tree contains polyphyletic groups.')

        # calculate RED for dereplicated tree
        print('')
        print('Calculating RED over dereplicated tree.')
        tree = dendropy.Tree.get_from_path(derep_tree_file,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        derep_rel_dist, derep_dist_components, polyphyletic = self.rel_dist_to_specified_groups(
            tree, groups_to_consider, groups)

        groups_to_consider = groups_to_consider - polyphyletic
        print('Assessing distribution over %d groups after removing polyphyletic groups in original trees.' % len(
            groups_to_consider))

        # calculate RED to each group in each tree
        print('')
        rel_dists = defaultdict(list)
        dist_components = defaultdict(list)
        for f in os.listdir(input_tree_dir):
            if not f.endswith('.rooted.tree'):
                continue

            print(f)

            tree_file = os.path.join(input_tree_dir, f)
            tree = dendropy.Tree.get_from_path(tree_file,
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)

            # calculate relative distance to named taxa
            rel_dist, components, _polyphyletic = self.rel_dist_to_specified_groups(
                tree, groups_to_consider, groups)

            for taxon, dist in rel_dist.items():
                rel_dists[taxon].append(dist)
                dist_components[taxon].append(components[taxon])

        # create scatter plot
        x = []
        y = []
        xDerep = []
        yDerep = []
        xFull = []
        yFull = []
        perc10 = []
        perc90 = []
        labels = []
        fout = open(output_prefix + '.tsv', 'w')
        fout.write(
            'Taxon\tP10\tP90\tP90-P10\tMean RED\tMean dist to parent\tMean dist to leaves\tOriginal RED\tOriginal dist to parent\tOriginal dist to leaves\n'
        )
        for i, taxon in enumerate(sorted(rel_dists.keys(), reverse=True)):
            labels.append(taxon + ' (%d)' % (len(rel_dists[taxon])))

            rd = rel_dists[taxon]
            for d in rd:
                x.append(d)
                y.append(i + 0.2)

            p10, p90 = np_percentile(rd, [10, 90])
            perc10.append(p10)
            perc90.append(p90)

            print(taxon, p90 - p10)
            mean_x, mean_a, mean_b = np_mean(dist_components[taxon], axis=0)
            derep_x, derep_a, derep_b = derep_dist_components[taxon]
            fout.write(
                '%s\t%.2f\t%.2f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n' %
                (taxon, p10, p90, p90 - p10, mean_x, mean_a, mean_b, derep_x,
                 derep_a, derep_b))

            xDerep.append(derep_rel_dist[taxon])
            yDerep.append(i)

            xFull.append(full_rel_dist[taxon])
            yFull.append(i)
        fout.close()

        self.fig.clear()
        self.fig.set_size_inches(8, len(rel_dists) * 0.4)
        ax = self.fig.add_subplot(111)

        ax.scatter(x, y, alpha=0.5, s=24, c=(0.5, 0.5, 0.5), marker='s')
        ax.scatter(xDerep,
                   yDerep,
                   alpha=1.0,
                   s=24,
                   c=(1.0, 0.0, 0.0),
                   marker='s')
        ax.scatter(xFull,
                   yFull,
                   alpha=1.0,
                   s=24,
                   c=(0.0, 0.0, 1.0),
                   marker='*')

        for i in range(len(labels)):
            ax.plot((perc10[i], perc10[i]), (i, i + 0.4), 'r-')
            ax.plot((perc90[i], perc90[i]), (i, i + 0.4), 'r-')

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')
        if title:
            ax.set_title(title, size=12)

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('taxa')
        ax.set_yticks(range(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(labels)

        self.prettify(ax)

        # make plot interactive
        # mpld3.plugins.connect(fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        # mpld3.plugins.connect(fig, mpld3.plugins.MousePosition(fontsize=12))

        # mpld3.save_html(fig, output_prefix + '.html')
        self.fig.tight_layout(pad=1)
        self.fig.savefig(output_prefix + '.png', dpi=300)
    def __init_alliances(self):
        alliances = [[team[3:] for team in alliance['picks']] for alliance in self.raw_event['alliances']]
        alliances = np_array(alliances, np_int)
        numbers = np_vstack(np_arange(1, 9, 1))
        self.alliances = np_concatenate((numbers, alliances), 1)
Example #22
    def _distribution_summary_plot(self, phylum_rel_dists,
                                   taxa_for_dist_inference, plot_file):
        """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

        Parameters
        ----------
        phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
            Relative divergence of taxon at each rank for different phylum-level rootings.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        # determine median relative distance for each taxa
        medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

        # create percentile and classification boundary lines
        percentiles = {}
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            v = [
                np_median(dists)
                for taxon, dists in medians_for_taxa[rank].items()
                if taxon in taxa_for_dist_inference
            ]
            if not v:
                # no taxa at rank suitable for creating classification
                # boundaries
                continue

            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.25),
                    c=(0.3, 0.3, 0.3),
                    lw=2,
                    zorder=2)
            ax.plot((p50, p50), (i, i + 0.5),
                    c=(0.3, 0.3, 0.3),
                    lw=2,
                    zorder=2)
            ax.plot((p90, p90), (i, i + 0.25),
                    c=(0.3, 0.3, 0.3),
                    lw=2,
                    zorder=2)

            for b in [-0.2, -0.1, 0.1, 0.2]:
                boundary = p50 + b
                if 1.0 > boundary > 0.0:
                    if abs(b) == 0.1:
                        c = (1.0, 0.65, 0.0)  # orange
                    else:
                        c = (1.0, 0.0, 0.0)
                    ax.plot((boundary, boundary), (i, i + 0.5),
                            c=c,
                            lw=2,
                            zorder=2)

            percentiles[i] = [p10, p50, p90]

        # create scatter plot and results table
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label +
                               ' (%d)' % len(medians_for_taxa[rank]))

            mono = []
            poly = []
            no_inference = []
            for clade_label, dists in medians_for_taxa[rank].items():
                md = np_median(dists)
                x.append(md)
                y.append(i)
                labels.append(clade_label)

                if self._is_integer(clade_label.split('^')[-1]):
                    # taxa with a numerical suffix after a caret indicate
                    # polyphyletic groups when decorated with tax2tree
                    c.append((1.0, 0.0, 0.0))
                    poly.append(md)
                elif clade_label not in taxa_for_dist_inference:
                    c.append((0.3, 0.3, 0.3))
                    no_inference.append(md)
                else:
                    c.append((0.0, 0.0, 1.0))
                    mono.append(md)

            # histogram for each rank; bins and the weight factor are hoisted
            # out of the conditional so the no_inference and poly histograms
            # below do not hit a NameError when mono is empty
            mono = np_array(mono)
            no_inference = np_array(no_inference)
            poly = np_array(poly)
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)

            num_taxa = len(mono) + len(poly) + len(no_inference)
            w = float(len(mono)) / num_taxa if num_taxa > 0 else 0.0

            n = 0
            if len(mono) > 0:
                mono_max_count = max(np_histogram(mono, bins=bins)[0])
                mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)
                n, b, p = ax.hist(mono,
                                  bins=bins,
                                  color=(0.0, 0.0, 1.0),
                                  alpha=0.25,
                                  weights=0.9 * w * mono_weights,
                                  bottom=i,
                                  lw=0,
                                  zorder=0)

            if len(no_inference) > 0:
                no_inference_max_count = max(
                    np_histogram(no_inference, bins=bins)[0])
                no_inference_weights = np_ones_like(no_inference) * (
                    1.0 / no_inference_max_count)

                ax.hist(no_inference,
                        bins=bins,
                        color=(0.3, 0.3, 0.3),
                        alpha=0.25,
                        weights=0.9 * (1.0 - w) * no_inference_weights,
                        bottom=i + n,
                        lw=0,
                        zorder=0)

            if len(poly) > 0:
                poly_max_count = max(np_histogram(poly, bins=bins)[0])
                poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

                ax.hist(poly,
                        bins=bins,
                        color=(1.0, 0.0, 0.0),
                        alpha=0.25,
                        weights=0.9 * (1.0 - w) * poly_weights,
                        bottom=i + n,
                        lw=0,
                        zorder=0)

        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.01, 1.01])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(list(range(0, len(medians_for_taxa))))
        ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(
            self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig,
                              mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
Example #23
def clean_meshes(vertices,
                 edges,
                 faces,
                 remove_unreferenced_edges=False,
                 remove_unreferenced_faces=False,
                 remove_duplicated_edges=False,
                 remove_duplicated_faces=False,
                 remove_degenerated_edges=False,
                 remove_degenerated_faces=False,
                 remove_loose_verts=False,
                 calc_verts_idx=False,
                 calc_edges_idx=False,
                 calc_faces_idx=False):
    '''
    Cleans a group of meshes using different routines.
    Returns cleaned meshes and removed item indexes
    '''
    verts_out, edges_out, faces_out = [], [], []
    verts_removed_out, edges_removed_out, faces_removed_out = [], [], []

    for verts_original, edges_original, faces_original in zip(
            vertices, edges, faces):
        verts_changed, edges_changed, faces_changed = False, False, False

        preserved_edges_idx = []
        preserved_faces_idx = []
        if remove_unreferenced_edges:
            edges, preserved_edges_mask = remove_unreferenced_topology(
                edges_original, len(verts_original))
            preserved_edges_idx = np_arange(
                len(edges_original))[preserved_edges_mask]
            edges_changed = True

        if remove_unreferenced_faces:
            faces, preserved_faces_mask = remove_unreferenced_topology(
                faces_original, len(verts_original))
            preserved_faces_idx = np_arange(
                len(faces_original))[preserved_faces_mask]
            faces_changed = True

        if remove_duplicated_edges:
            if edges_changed:
                edges, unique_edges_mask = get_unique_topology(edges)
                preserved_edges_idx = preserved_edges_idx[unique_edges_mask]
            else:
                edges, unique_edges_mask = get_unique_topology(edges_original)
                preserved_edges_idx = np_arange(
                    len(edges_original))[unique_edges_mask]
            edges_changed = True

        if remove_duplicated_faces:
            if faces_changed:
                faces, unique_faces_mask = get_unique_topology(faces)
                preserved_faces_idx = preserved_faces_idx[unique_faces_mask]
            else:
                faces, unique_faces_mask = get_unique_topology(faces_original)
                preserved_faces_idx = np_arange(
                    len(faces_original))[unique_faces_mask]
            faces_changed = True

        if remove_degenerated_edges:
            if edges_changed:
                edges, non_coincident_mask = non_coincident_edges(edges)
                preserved_edges_idx = preserved_edges_idx[non_coincident_mask]
            else:
                edges, non_coincident_mask = non_coincident_edges(
                    edges_original)
                preserved_edges_idx = np_arange(
                    len(edges_original))[non_coincident_mask]
            edges_changed = True
        if remove_degenerated_faces:
            if faces_changed:
                faces, non_redundant_mask = non_redundant_faces_indices(faces)
                preserved_faces_idx = preserved_faces_idx[non_redundant_mask]
            else:
                faces, non_redundant_mask = non_redundant_faces_indices(
                    faces_original)
                preserved_faces_idx = np_arange(
                    len(faces_original))[non_redundant_mask]
            faces_changed = True

        if remove_loose_verts:
            verts, edges, faces, removed_verts_idx = remove_unreferenced_verts(
                verts_original, edges if edges_changed else edges_original,
                faces if faces_changed else faces_original)
            verts_changed = True
            edges_changed = True
            faces_changed = True

        if verts_changed:
            verts_out.append(verts)
            if calc_verts_idx:
                verts_removed_out.append(removed_verts_idx)
            else:
                verts_removed_out.append([])

        else:
            verts_out.append(verts_original)
            verts_removed_out.append([])

        if edges_changed:
            edges_out.append(edges)
            if calc_edges_idx and len(preserved_edges_idx) > 0:
                edges_removed_out.append(
                    invert_index_list(preserved_edges_idx,
                                      len(edges_original)).tolist())

            else:
                edges_removed_out.append([])

        else:
            edges_out.append(edges_original)
            edges_removed_out.append([])

        if faces_changed:
            faces_out.append(faces)
            if calc_faces_idx and len(preserved_faces_idx) > 0:
                faces_removed_out.append(
                    invert_index_list(preserved_faces_idx,
                                      len(faces_original)).tolist())

            else:
                faces_removed_out.append([])

        else:
            faces_out.append(faces_original)
            faces_removed_out.append([])

    return verts_out, edges_out, faces_out, verts_removed_out, edges_removed_out, faces_removed_out
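# Pass-through sketch (added): with every flag left at False the function
# returns the input meshes untouched, so none of the helper routines
# (remove_unreferenced_topology, get_unique_topology, ...) are exercised:
verts = [[(0, 0, 0), (1, 0, 0), (1, 1, 0)]]
edges = [[(0, 1), (1, 2)]]
faces = [[(0, 1, 2)]]
v, e, f, _, _, _ = clean_meshes(verts, edges, faces)
print(v == verts and e == edges and f == faces)  # True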
def process_torsionals(atom,
                       dihe_list,
                       func="prop",
                       verbose=False,
                       forzmatrix=False,
                       dihed_count=None):
    vizinho1 = atom.get_atom_list()[0]
    vizinho2 = atom.get_atom_list()[1]
    for vizinhoTemp in vizinho1.get_atom_list():
        alist = [vizinhoTemp, vizinho1, atom, vizinho2]
        if not (vizinhoTemp is None) and (vizinhoTemp != atom):
            looked_up_param, centeratom_name, dihed_identifier = return_params(
                func, alist)
            if looked_up_param is None:
                if forzmatrix:  # dihed was not specified
                    d = Dihedral(*alist, func, None, dihed_idx=dihed_count)
                    dihed_count += 1
                    dihe_list.append(d)
            else:  # was specified. Now, check whether it was given double
                params = looked_up_param.split(",")
                if len(params) == 1:
                    d = Dihedral(*alist,
                                 func=func,
                                 param=params[0],
                                 was_input=False,
                                 is_multiparam=False,
                                 has_siblings=False,
                                 dihed_idx=dihed_count)
                    dihed_count += 1
                    dihe_list.append(d)
                    if not forzmatrix:
                        break
                elif len(params) > 1:
                    # dihedrals who belong to the same parameter set:
                    siblings = np_arange(start=dihed_count,
                                         stop=dihed_count + len(params))
                    if verbose:
                        print_verbose_dihedral_message(func, alist, params)
                    for param in params:
                        if atom.get_was_central_atom():
                            if not (dihed_identifier
                                    in atom.get_was_central_atom_for()):
                                d = Dihedral(
                                    *alist,
                                    func=func,
                                    param=param,
                                    was_input=True,
                                    isDetermined=True,
                                    is_multiparam=True,
                                    around_center_atom=atom.get_atomtype() ==
                                    centeratom_name,
                                    has_siblings=True,
                                    siblings=siblings,
                                    dihed_idx=dihed_count)
                                dihed_count += 1
                                dihe_list.append(d)
                        else:
                            d = Dihedral(
                                *alist,
                                func=func,
                                param=param,
                                was_input=True,
                                isDetermined=True,
                                is_multiparam=True,
                                around_center_atom=atom.get_atomtype() ==
                                centeratom_name,
                                has_siblings=True,
                                siblings=siblings,
                                dihed_idx=dihed_count)
                            dihed_count += 1
                            dihe_list.append(d)
                    if not forzmatrix:
                        atom.set_was_central_atom(
                            True
                        )  # was used as central atom for multiparam dihed
                        atom.append_was_central_atom_for(dihed_identifier)
                        break

    for vizinhoTemp in vizinho2.get_atom_list():
        alist = [vizinhoTemp, vizinho2, atom, vizinho1]
        if not (vizinhoTemp is None) and vizinhoTemp != atom:
            looked_up_param, centeratom_name, dihed_identifier = return_params(
                func, alist)
            if looked_up_param is None:
                if forzmatrix:  # dihed was not specified
                    d = Dihedral(*alist, func, None, dihed_idx=dihed_count)
                    dihed_count += 1
                    dihe_list.append(d)
            else:  # was specified. Now, check whether it was given double
                params = looked_up_param.split(",")
                if len(params) == 1:
                    d = Dihedral(*alist,
                                 func=func,
                                 param=params[0],
                                 was_input=False,
                                 is_multiparam=False,
                                 has_siblings=False,
                                 dihed_idx=dihed_count)
                    dihed_count += 1
                    dihe_list.append(d)
                    if not forzmatrix:
                        break
                elif len(params) > 1:
                    # dihedrals who belong to the same parameter set:
                    siblings = np_arange(start=dihed_count,
                                         stop=dihed_count + len(params))
                    if verbose:
                        print_verbose_dihedral_message(func, alist, params)
                    for param in params:
                        if atom.get_was_central_atom():
                            if not (dihed_identifier
                                    in atom.get_was_central_atom_for()):
                                d = Dihedral(
                                    *alist,
                                    func=func,
                                    param=param,
                                    was_input=True,
                                    isDetermined=True,
                                    is_multiparam=True,
                                    around_center_atom=atom.get_atomtype() ==
                                    centeratom_name,
                                    has_siblings=True,
                                    siblings=siblings,
                                    dihed_idx=dihed_count)
                                dihed_count += 1
                                dihe_list.append(d)
                        else:
                            d = Dihedral(
                                *alist,
                                func=func,
                                param=param,
                                was_input=True,
                                isDetermined=False,
                                is_multiparam=True,
                                around_center_atom=atom.get_atomtype() ==
                                centeratom_name,
                                has_siblings=True,
                                siblings=siblings,
                                dihed_idx=dihed_count)
                            dihed_count += 1
                            dihe_list.append(d)
                    if not forzmatrix:
                        atom.set_was_central_atom(
                            True
                        )  # was used as central atom for multiparam dihed
                        atom.append_was_central_atom_for(dihed_identifier)
                        break
    return dihe_list, dihed_count
Example #25
    def _distribution_plot(self, rel_dists, taxa_for_dist_inference, distribution_table, plot_file):
        """Create plot showing the distribution of taxa at each taxonomic rank.

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        distribution_table : str
            Desired name of output table with distribution information.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)
        
        
        # create normal distributions
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].items() if taxa in taxa_for_dist_inference]
            if len(v) < 2:
                continue
                
            u = np_mean(v)
            rv = norm(loc=u, scale=np_std(v))
            x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
            nd = rv.pdf(x)
            # ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2)
            # ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2)

        # create percentile and classification boundary lines
        percentiles = {}
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].items() if taxa in taxa_for_dist_inference]
            if len(v) == 0:
                continue
                
            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

            for b in [-0.2, -0.1, 0.1, 0.2]:
                boundary = p50 + b
                if boundary < 1.0 and boundary > 0.0:
                    if abs(b) == 0.1:
                        c = (1.0, 0.65, 0.0)  # orange
                    else:
                        c = (1.0, 0.0, 0.0)
                    ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

            percentiles[i] = [p10, p50, p90]

    
        # create scatter plot and results table
        fout = open(distribution_table, 'w')
        fout.write('Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n')
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(sorted(rel_dists.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))
            
            mono = []
            poly = []
            no_inference = []
            for clade_label, dist in rel_dists[rank].items():
                x.append(dist)
                y.append(i)
                labels.append(clade_label)

                if is_integer(clade_label.split('^')[-1]):
                    # taxa with a numerical suffix after a caret indicate 
                    # polyphyletic groups when decorated with tax2tree
                    c.append((1.0, 0.0, 0.0))
                    poly.append(dist)
                elif clade_label not in taxa_for_dist_inference:
                    c.append((0.3, 0.3, 0.3))
                    no_inference.append(dist)
                else:
                    c.append((0.0, 0.0, 1.0))
                    mono.append(dist)
            
                # report results
                v = [clade_label, dist]
                if i in percentiles:
                    p10, p50, p90 = percentiles[i]
                    percentile_outlier = not (dist >= p10 and dist <= p90)
                    v += percentiles[i] + [str(percentile_outlier)]
                else:
                    percentile_outlier = 'Insufficient data to calculate percentiles'
                    v += [-1,-1,-1] + [str(percentile_outlier)]
                
                fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v))
        
            # histogram for each rank
            mono = np_array(mono)
            no_inference = np_array(no_inference)
            poly = np_array(poly)
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)

            d = len(mono) + len(poly) + len(no_inference)
            if d == 0:
                break

            w = float(len(mono)) / d
            n = 0
            if len(mono) > 0:
                mono_max_count = max(np_histogram(mono, bins=bins)[0])
                mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

                n, b, p = ax.hist(mono, bins=bins,
                          color=(0.0, 0.0, 1.0),
                          alpha=0.25,
                          weights=0.9 * w * mono_weights,
                          bottom=i,
                          lw=0,
                          zorder=0)
                      
            if len(no_inference) > 0:
                no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
                no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)

                ax.hist(no_inference, bins=bins,
                          color=(0.3, 0.3, 0.3),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * no_inference_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)

            if len(poly) > 0:
                poly_max_count = max(np_histogram(poly, bins=bins)[0])
                poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

                ax.hist(poly, bins=bins,
                          color=(1.0, 0.0, 0.0),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * poly_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)
                          
        fout.close()

    
        # overlay scatter plot elements
        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(range(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
Example #26
    def _distribution_plot(self, rel_dists, taxa_for_dist_inference, distribution_table, plot_file):
        """Create plot showing the distribution of taxa at each taxonomic rank.

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        distribution_table : str
            Desired name of output table with distribution information.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)
        
        
        # create normal distributions
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].items() if taxa in taxa_for_dist_inference]
            if len(v) < 2:
                continue
                
            u = np_mean(v)
            rv = norm(loc=u, scale=np_std(v))
            x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
            nd = rv.pdf(x)
            # ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2)
            # ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2)

        # create percentile and classification boundary lines
        percentiles = {}
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].items() if taxa in taxa_for_dist_inference]
            if len(v) == 0:
                continue
                
            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

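            # Boundary lines appear to mark classification cutoffs around the
            # median: offsets of +/-0.1 are drawn in orange and +/-0.2 in red,
            # clipped to the (0, 1) relative-divergence interval.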
            for b in [-0.2, -0.1, 0.1, 0.2]:
                boundary = p50 + b
                if boundary < 1.0 and boundary > 0.0:
                    if abs(b) == 0.1:
                        c = (1.0, 0.65, 0.0)  # orange
                    else:
                        c = (1.0, 0.0, 0.0)
                    ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

            percentiles[i] = [p10, p50, p90]

    
        # create scatter plot and results table
        fout = open(distribution_table, 'w')
        fout.write('Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n')
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(sorted(rel_dists.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))
            
            mono = []
            poly = []
            no_inference = []
            for clade_label, dist in rel_dists[rank].items():
                x.append(dist)
                y.append(i)
                labels.append(clade_label)

                if is_integer(clade_label.split('^')[-1]):
                    # taxa with a numerical suffix after a caret indicate 
                    # polyphyletic groups when decorated with tax2tree
                    c.append((1.0, 0.0, 0.0))
                    poly.append(dist)
                elif clade_label not in taxa_for_dist_inference:
                    c.append((0.3, 0.3, 0.3))
                    no_inference.append(dist)
                else:
                    c.append((0.0, 0.0, 1.0))
                    mono.append(dist)
            
                # report results
                v = [clade_label, dist]
                if i in percentiles:
                    p10, p50, p90 = percentiles[i]
                    percentile_outlier = not (dist >= p10 and dist <= p90)
                    v += percentiles[i] + [str(percentile_outlier)]
                else:
                    percentile_outlier = 'Insufficient data to calculate percentiles'
                    v += [-1,-1,-1] + [str(percentile_outlier)]
                
                fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v))
        
            # histogram for each rank
            mono = np_array(mono)
            no_inference = np_array(no_inference)
            poly = np_array(poly)
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)

            d = len(mono) + len(poly) + len(no_inference)
            if d == 0:
                break
                
            w = float(len(mono)) / d
            n = 0
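            # Each category's histogram is normalised to its tallest bar, then
            # scaled by 0.9 * w (mono) or 0.9 * (1 - w) (grey/red) so the stack
            # stays within this rank's row; `n` records the mono bar heights so
            # the other histograms are drawn on top of them.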
            if len(mono) > 0:
                mono_max_count = max(np_histogram(mono, bins=bins)[0])
                mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

                n, b, p = ax.hist(mono, bins=bins,
                          color=(0.0, 0.0, 1.0),
                          alpha=0.25,
                          weights=0.9 * w * mono_weights,
                          bottom=i,
                          lw=0,
                          zorder=0)
                      
            if len(no_inference) > 0:
                no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
                no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)

                ax.hist(no_inference, bins=bins,
                          color=(0.3, 0.3, 0.3),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * no_inference_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)

            if len(poly) > 0:
                poly_max_count = max(np_histogram(poly, bins=bins)[0])
                poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

                ax.hist(poly, bins=bins,
                          color=(1.0, 0.0, 0.0),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * poly_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)
                          
        fout.close()

    
        # overlay scatter plot elements
        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(range(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
Exemple #27
    def table(self, input_tree, taxon_category_file, bl_step_size,
              output_table):
        """Produce table with number of lineage for increasing mean branch lengths

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        taxon_category_file : str
            File indicating category for each taxon in the tree.
        bl_step_size : float
            Step size in table for mean branch length criterion.
        output_table : str
            Name of output table.
        """

        # get category for each taxon
        taxon_category = {}
        for line in open(taxon_category_file):
            line_split = line.strip().split('\t')
            taxon_category[line_split[0]] = line_split[1]

        # read tree
        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # determine mean distance to leaves and taxon categories for each node
        all_categories = set()
        node_info = {}
        parent_mean_dist_to_leafs = {}
        max_bl_threshold = 0.0  # must be numeric: comparing a float against None raises TypeError in Python 3
        for i, node in enumerate(tree.seed_node.preorder_iter()):
            node.id = i

            if node.is_leaf():
                mean_dist_to_leafs = 0.0
                categories = set()
                for c in taxon_category[node.taxon.label].split('/'):
                    categories.add(c)
            else:
                dist_to_leafs = []
                categories = set()
                for t in node.leaf_iter():
                    dist_to_leafs.append(self._dist_to_ancestor(t, node))

                    for c in taxon_category[t.taxon.label].split('/'):
                        categories.add(c)

                mean_dist_to_leafs = np_mean(dist_to_leafs)

            if node.parent_node:
                p = parent_mean_dist_to_leafs[node.parent_node.id]
            else:
                p = mean_dist_to_leafs + 1e-6

            category = '/'.join(sorted(list(categories), reverse=True))
            all_categories.add(category)
            node_info[node.id] = [mean_dist_to_leafs, p, category]
            parent_mean_dist_to_leafs[node.id] = mean_dist_to_leafs

            if mean_dist_to_leafs > max_bl_threshold:
                max_bl_threshold = mean_dist_to_leafs

        # write table
        fout = open(output_table, 'w')
        fout.write('Threshold')
        for c in all_categories:
            fout.write('\t%s' % c)
        fout.write('\n')

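        # For each threshold, walk the tree from the root: nodes whose mean
        # distance to leaves exceeds the threshold are expanded further, while
        # all other nodes are counted as collapsed lineages under their category.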
        for bl_threshold in np_arange(0, max_bl_threshold + bl_step_size,
                                      bl_step_size):
            category_count = defaultdict(int)

            stack = [tree.seed_node]
            while stack:
                node = stack.pop()

                mean_dist_to_leafs, _, category = node_info[node.id]
                if mean_dist_to_leafs > bl_threshold:
                    for c in node.child_node_iter():
                        stack.append(c)
                else:
                    category_count[category] += 1

            # check if node meets mean branch length criterion
            if sum(category_count.values()) > 0:
                fout.write('%.3f' % bl_threshold)
                for c in all_categories:
                    fout.write('\t%d' % category_count[c])
                fout.write('\n')

        fout.close()

        if False:
            node_info.sort()
            for bl_threshold in np_arange(0, node_info[-1][0] + bl_step_size,
                                          bl_step_size):
                category_count = defaultdict(int)
                for mean_bl_dist, parent_mean_bl_dist, category in node_info:
                    if bl_threshold >= mean_bl_dist and bl_threshold < parent_mean_bl_dist:
                        category_count[category] += 1

                if sum(category_count.values()) > 0:
                    fout.write('%.3f' % bl_threshold)
                    for c in all_categories:
                        fout.write('\t%d' % category_count[c])
                    fout.write('\n')
Exemple #28
    def optimal(self, input_tree, rank, min_dist, max_dist, step_size,
                output_table):
        """Determine branch length for best congruency with existing taxonomy.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        rank : int
            Taxonomic rank to consider (1=Phylum, ..., 6=Species).
        output_table : str
            Name of output table.
        """

        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # get mean distance to terminal taxa for each node along with
        # other stats needed to determine classification
        self.logger.info('Determining MDTT for each node.')
        rank_prefix = Taxonomy.rank_prefixes[rank]
        child_rank_prefix = Taxonomy.rank_prefixes[rank + 1]
        rank_info = []
        rank_dists = set()
        for node in tree.seed_node.preorder_internal_node_iter():
            if node == tree.seed_node:
                continue

            # check if node is at the specified rank
            node_taxon = None
            if node.label:
                support, taxon_name, _auxiliary_info = parse_label(node.label)

                if taxon_name:
                    for taxon in [x.strip() for x in taxon_name.split(';')]:
                        if taxon.startswith(rank_prefix):
                            node_taxon = taxon

            if not node_taxon:
                continue

            # check that node has two descendants at the next rank
            child_rank_taxa = []
            for c in node.levelorder_iter():
                if c.label:
                    support, taxon_name, _auxiliary_info = parse_label(c.label)

                    if taxon_name:
                        for taxon in [
                                x.strip() for x in taxon_name.split(';')
                        ]:
                            if taxon.startswith(child_rank_prefix):
                                child_rank_taxa.append(taxon)

                if len(child_rank_taxa) >= 2:
                    break

            if len(child_rank_taxa) < 2:
                continue

            # get mean branch length to terminal taxa
            dists_to_tips = []
            for t in node.leaf_iter():
                dists_to_tips.append(self._dist_to_ancestor(t, node))

            node_dist = np_mean(dists_to_tips)

            # get mean branch length to terminal taxa for first ancestor spanning multiple phyla
            ancestor = self._ancestor_multiple_taxa_at_rank(node, rank_prefix)

            ancestor_dists_to_tips = []
            for t in ancestor.leaf_iter():
                ancestor_dists_to_tips.append(
                    self._dist_to_ancestor(t, ancestor))

            ancestor_dist = np_mean(ancestor_dists_to_tips)

            rank_info.append([node_dist, ancestor_dist, node_taxon])
            rank_dists.add(node_dist)

        self.logger.info(
            'Calculating threshold from %d taxa with specified rank resolution.'
            % len(rank_info))

        fout = open('bl_optimal_taxa_dists.tsv', 'w')
        fout.write('Taxon\tNode MDTT\tMulti-phyla Ancestor MDTT\n')
        for node_dist, ancestor_dist, node_taxon in rank_info:
            fout.write('%s\t%.3f\t%.3f\n' %
                       (node_taxon, node_dist, ancestor_dist))
        fout.close()

        # report number of correct and incorrect taxa for each threshold
        fout = open(output_table, 'w')
        header = 'Threshold\tCorrect\tIncorrect\tPrecision\tNo. Lineages\tNo. Multiple Taxa Lineages\tNo. Terminal Lineages'
        fout.write(header + '\n')
        print(header)

        top_correct = 0
        top_incorrect = 0
        top_precision = 0
        top_threshold = None  # avoid an unbound name if no threshold ever improves precision
        for d in np_arange(min_dist, max_dist + step_size, step_size):
            rank_dists.add(d)

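        # Scan thresholds from largest to smallest; a taxon is "correct" when
        # its node would be collapsed at the threshold while its first ancestor
        # spanning multiple taxa would not, i.e. the threshold cleanly separates
        # the named lineage from its parent.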
        for dist_threshold in sorted(rank_dists, reverse=True):
            correct = 0
            incorrect = 0
            for node_dist, ancestor_dist, node_taxon in rank_info:
                # check if node/edge would be collapsed at the given threshold
                if node_dist <= dist_threshold and ancestor_dist > dist_threshold:
                    correct += 1
                elif node_dist > dist_threshold:
                    incorrect += 1
                else:
                    incorrect += 1  # above ancestor with multiple taxa

            denominator = correct + incorrect
            if denominator:
                precision = float(correct) / denominator
            else:
                precision = 0

            num_lineages, num_terminal_lineages = self._num_lineages(
                tree, dist_threshold)

            row = '%f\t%d\t%d\t%.3f\t%d\t%d\t%d' % (
                dist_threshold, correct, incorrect, precision, num_lineages +
                num_terminal_lineages, num_lineages, num_terminal_lineages)

            fout.write(row + '\n')
            print(row)

            if precision > top_precision:
                top_correct = correct
                top_incorrect = incorrect
                top_precision = precision
                top_threshold = dist_threshold

        return top_threshold, top_correct, top_incorrect
Exemple #29
    def optimal(self, input_tree, 
                        rank,
                        min_dist, 
                        max_dist, 
                        step_size,
                        output_table):
        """Determine branch length for best congruency with existing taxonomy.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        rank : int
            Taxonomic rank to consider (1=Phylum, ..., 6=Species).
        output_table : str
            Name of output table.
        """
    
        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)
        
        # get mean distance to terminal taxa for each node along with
        # other stats needed to determine classification
        self.logger.info('Determining MDTT for each node.')
        rank_prefix = Taxonomy.rank_prefixes[rank]
        child_rank_prefix = Taxonomy.rank_prefixes[rank+1]
        rank_info = []
        rank_dists = set()                                
        for node in tree.seed_node.preorder_internal_node_iter():
            if node == tree.seed_node:
                continue
                
            # check if node is at the specified rank
            node_taxon = None
            if node.label:
                support, taxon_name, _auxiliary_info = parse_label(node.label)
                
                if taxon_name:
                    for taxon in [x.strip() for x in taxon_name.split(';')]:
                        if taxon.startswith(rank_prefix):
                            node_taxon = taxon
                        
            if not node_taxon:
                continue
                
            # check that node has two descendants at the next rank
            child_rank_taxa = []
            for c in node.levelorder_iter():
                if c.label:
                    support, taxon_name, _auxiliary_info = parse_label(c.label)
                    
                    if taxon_name:
                        for taxon in [x.strip() for x in taxon_name.split(';')]:
                            if taxon.startswith(child_rank_prefix):
                                child_rank_taxa.append(taxon)
                            
                if len(child_rank_taxa) >= 2:
                    break
                    
            if len(child_rank_taxa) < 2:
                continue
                
            # get mean branch length to terminal taxa
            dists_to_tips = []
            for t in node.leaf_iter():
                dists_to_tips.append(self._dist_to_ancestor(t, node))
                
            node_dist = np_mean(dists_to_tips)
            
            # get mean branch length to terminal taxa for first ancestor spanning multiple phyla
            ancestor = self._ancestor_multiple_taxa_at_rank(node, rank_prefix)
            
            ancestor_dists_to_tips = []
            for t in ancestor.leaf_iter():
                ancestor_dists_to_tips.append(self._dist_to_ancestor(t, ancestor))
                
            ancestor_dist = np_mean(ancestor_dists_to_tips)
                    
            rank_info.append([node_dist, ancestor_dist, node_taxon])
            rank_dists.add(node_dist)
            
        self.logger.info('Calculating threshold from %d taxa with specified rank resolution.' % len(rank_info))
            
        fout = open('bl_optimal_taxa_dists.tsv', 'w')
        fout.write('Taxon\tNode MDTT\tMulti-phyla Ancestor MDTT\n')
        for node_dist, ancestor_dist, node_taxon in rank_info:
            fout.write('%s\t%.3f\t%.3f\n' % (node_taxon, node_dist, ancestor_dist))
        fout.close()
                    
        # report number of correct and incorrect taxa for each threshold
        fout = open(output_table, 'w')
        header = 'Threshold\tCorrect\tIncorrect\tPrecision\tNo. Lineages\tNo. Multiple Taxa Lineages\tNo. Terminal Lineages'
        fout.write(header + '\n')
        print(header)
        
        top_correct = 0
        top_incorrect = 0
        top_precision = 0
        top_threshold = None  # avoid an unbound name if no threshold ever improves precision
        for d in np_arange(min_dist, max_dist+step_size, step_size):
            rank_dists.add(d)
            
        for dist_threshold in sorted(rank_dists, reverse=True):
            correct = 0
            incorrect = 0
            for node_dist, ancestor_dist, node_taxon in rank_info:
                # check if node/edge would be collapsed at the given threshold
                if node_dist <= dist_threshold and ancestor_dist > dist_threshold:
                    correct += 1
                elif node_dist > dist_threshold:
                    incorrect += 1
                else:
                    incorrect += 1 # above ancestor with multiple taxa
         
            denominator = correct + incorrect
            if denominator:
                precision = float(correct) / denominator
            else:
                precision = 0
                
            num_lineages, num_terminal_lineages = self._num_lineages(tree, dist_threshold)
                    
            row = '%f\t%d\t%d\t%.3f\t%d\t%d\t%d' % (dist_threshold, 
                                                            correct, 
                                                            incorrect, 
                                                            precision,
                                                            num_lineages + num_terminal_lineages,
                                                            num_lineages, 
                                                            num_terminal_lineages)
                                                            
            fout.write(row + '\n')
            print(row)
            
            if precision > top_precision:
                top_correct = correct
                top_incorrect = incorrect
                top_precision = precision
                top_threshold = dist_threshold
                
        return top_threshold, top_correct, top_incorrect
Exemple #30
from mpi4py import MPI
from numpy import \
    arange as np_arange, \
    zeros as np_zeros
comm = MPI.COMM_WORLD
myrank = comm.Get_rank()
nproc = comm.Get_size()
if myrank == 0:
    fulldata = np_arange(3 * nproc, dtype='i')
    print("I'm {0} fulldata is: {1}".format(myrank, fulldata))
else:
    fulldata = None
count = 3
mydata = np_zeros(count, dtype='i')
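# Scatter carves fulldata on the root into nproc contiguous chunks of
# `count` ints and delivers one chunk to every rank, including the root.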
comm.Scatter([fulldata, count, MPI.INT], [mydata, count, MPI.INT], root=0)
print("After Scatter, I'm {0} and mydata is: {1}".format(myrank, mydata))
Exemple #31
    def shuffleBAMs(self):
        """Make the data transformation deterministic by reordering the bams"""
        # first we should make a subset of the total data
        # we'd like to take it down to about 1500 or so RI's
        # but we'd like to do this in a repeatable way
        ideal_contig_num = 1500
        sub_cons = range(len(self.indices))
        while len(sub_cons) > ideal_contig_num:
            # select every second contig when sorted by norm cov
            cov_sorted = np_argsort(self.normCoverages[sub_cons])
            sub_cons = np_array([sub_cons[cov_sorted[i*2]] for i in np_arange(int(len(sub_cons)/2))])

            if len(sub_cons) > ideal_contig_num:
                # select every second contig when sorted by mer PC1
                mer_sorted = np_argsort(self.kmerNormPC1[sub_cons])
                sub_cons = np_array([sub_cons[mer_sorted[i*2]] for i in np_arange(int(len(sub_cons)/2))])

        # now that we have a subset, calculate the distance between each of the untransformed vectors
        num_sc = len(sub_cons)

        # log shift the coverages towards the origin
        sub_covs = np_transpose([self.covProfiles[i]*(np_log10(self.normCoverages[i])/self.normCoverages[i]) for i in sub_cons])
        sq_dists = cdist(sub_covs,sub_covs,'cityblock')
        dists = squareform(sq_dists)

        # initialise a list of left, right neighbours
        lr_dict = {}
        for i in range(self.numStoits):
            lr_dict[i] = []

        too_big = 10000
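        # Greedy chaining: repeatedly link the two closest stoits, capping each
        # stoit at two neighbours by pricing its rows and columns out of the
        # distance matrix; the surviving links are read back as a single chain.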
        while True:
            closest = np_argmin(dists)
            if dists[closest] == too_big:
                break
            (i,j) = self.small2indices(closest, self.numStoits-1)
            lr_dict[j].append(i)
            lr_dict[i].append(j)

            # mark these guys as neighbours
            if len(lr_dict[i]) == 2:
                # no more than 2 neighbours
                sq_dists[i,:] = too_big
                sq_dists[:,i] = too_big
                sq_dists[i,i] = 0.0
            if len(lr_dict[j]) == 2:
                # no more than 2 neighbours
                sq_dists[j,:] = too_big
                sq_dists[:,j] = too_big
                sq_dists[j,j] = 0.0

            # fix the dist matrix
            sq_dists[j,i] = too_big
            sq_dists[i,j] = too_big
            dists = squareform(sq_dists)

        # now make the ordering
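        # Walk the neighbour chain starting at stoit 0, always stepping to the
        # neighbour that was not just visited, to recover a left-to-right order.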
        ordering = [0, lr_dict[0][0]]
        done = 2
        while done < self.numStoits:
            last = ordering[done-1]
            if lr_dict[last][0] == ordering[done-2]:
                ordering.append(lr_dict[last][1])
                last = lr_dict[last][1]
            else:
                ordering.append(lr_dict[last][0])
                last = lr_dict[last][0]
            done+=1

        # reshuffle the contig order!
        # yay for bubble sort!
        working = np_arange(self.numStoits)
        for i in range(1, self.numStoits):
            # where is this guy in the list
            loc = list(working).index(ordering[i])
            if loc != i:
                # swap the columns
                self.covProfiles[:,[i,loc]] = self.covProfiles[:,[loc,i]]
                self.stoitColNames[[i,loc]] = self.stoitColNames[[loc,i]]
                working[[i,loc]] = working[[loc,i]]
Exemple #32
    def shuffleBAMs(self):
        """Make the data transformation deterministic by reordering the bams"""
        # first we should make a subset of the total data
        # we'd like to take it down to about 1500 or so RI's
        # but we'd like to do this in a repeatable way
        ideal_contig_num = 1500
        sub_cons = range(len(self.indices))
        while len(sub_cons) > ideal_contig_num:
            # select every second contig when sorted by norm cov
            cov_sorted = np_argsort(self.normCoverages[sub_cons])
            sub_cons = np_array([
                sub_cons[cov_sorted[i * 2]]
                for i in np_arange(int(len(sub_cons) / 2))
            ])

            if len(sub_cons) > ideal_contig_num:
                # select every second contig when sorted by mer PC1
                mer_sorted = np_argsort(self.kmerNormPC1[sub_cons])
                sub_cons = np_array([
                    sub_cons[mer_sorted[i * 2]]
                    for i in np_arange(int(len(sub_cons) / 2))
                ])

        # now that we have a subset, calculate the distance between each of the untransformed vectors
        num_sc = len(sub_cons)

        # log shift the coverages towards the origin
        sub_covs = np_transpose([
            self.covProfiles[i] *
            (np_log10(self.normCoverages[i]) / self.normCoverages[i])
            for i in sub_cons
        ])
        sq_dists = cdist(sub_covs, sub_covs, 'cityblock')
        dists = squareform(sq_dists)

        # initialise a list of left, right neighbours
        lr_dict = {}
        for i in range(self.numStoits):
            lr_dict[i] = []

        too_big = 10000
        while True:
            closest = np_argmin(dists)
            if dists[closest] == too_big:
                break
            (i, j) = self.small2indices(closest, self.numStoits - 1)
            lr_dict[j].append(i)
            lr_dict[i].append(j)

            # mark these guys as neighbours
            if len(lr_dict[i]) == 2:
                # no more than 2 neighbours
                sq_dists[i, :] = too_big
                sq_dists[:, i] = too_big
                sq_dists[i, i] = 0.0
            if len(lr_dict[j]) == 2:
                # no more than 2 neighbours
                sq_dists[j, :] = too_big
                sq_dists[:, j] = too_big
                sq_dists[j, j] = 0.0

            # fix the dist matrix
            sq_dists[j, i] = too_big
            sq_dists[i, j] = too_big
            dists = squareform(sq_dists)

        # now make the ordering
        ordering = [0, lr_dict[0][0]]
        done = 2
        while done < self.numStoits:
            last = ordering[done - 1]
            if lr_dict[last][0] == ordering[done - 2]:
                ordering.append(lr_dict[last][1])
                last = lr_dict[last][1]
            else:
                ordering.append(lr_dict[last][0])
                last = lr_dict[last][0]
            done += 1

        # reshuffle the contig order!
        # yay for bubble sort!
        working = np_arange(self.numStoits)
        for i in range(1, self.numStoits):
            # where is this guy in the list
            loc = list(working).index(ordering[i])
            if loc != i:
                # swap the columns
                self.covProfiles[:, [i, loc]] = self.covProfiles[:, [loc, i]]
                self.stoitColNames[[i, loc]] = self.stoitColNames[[loc, i]]
                working[[i, loc]] = working[[loc, i]]
Exemple #33
    def run(self, rank, input_tree_dir, full_tree_file, derep_tree_file, taxonomy_file, output_prefix, min_children, title):

        # determine named clades in full tree
        named_clades = set()
        tree = dendropy.Tree.get_from_path(full_tree_file, 
                                            schema='newick', 
                                            rooting='force-rooted', 
                                            preserve_underscores=True)
        
        for node in tree.preorder_node_iter():
            if node.label:
                taxonomy = node.label.split(';')
                named_clades.add(taxonomy[-1].strip().split(':')[-1])

        print('Identified %d named clades in full tree.' % len(named_clades))

        # determine named groups with at least the specified number of children
        print('Determining taxa with sufficient named children lineages.')
        taxon_children = defaultdict(set)
        groups = defaultdict(list)
        print(taxonomy_file)
        for line in open(taxonomy_file):
            line_split = line.replace('; ', ';').split()
            genome_id = line_split[0]
            taxonomy = [x.strip() for x in line_split[1].split(';')]

            if len(taxonomy) > rank + 1:
                taxon_children[taxonomy[rank]].add(taxonomy[rank + 1])

            if len(taxonomy) > rank:
                groups[taxonomy[rank]].append(genome_id)

        groups_to_consider = set()
        for taxon, children_taxa in taxon_children.items():
            if len(children_taxa) >= min_children and taxon in named_clades:
                groups_to_consider.add(taxon)

        print('Assessing distribution over %d groups.' % len(groups_to_consider))

        # calculate relative distance for full tree
        print('')
        print('Calculating relative distance over full tree.')
        tree = dendropy.Tree.get_from_path(full_tree_file, 
                                            schema='newick', 
                                            rooting='force-rooted', 
                                            preserve_underscores=True)
        full_rel_dist, _full_dist_components, polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups)
        if len(polyphyletic) > 0:
            print('')
            print('[Warning] Full tree contains polyphyletic groups.')

        # calculate relative distance for dereplicated tree
        print('')
        print('Calculating relative distance over dereplicated tree.')
        tree = dendropy.Tree.get_from_path(derep_tree_file, 
                                            schema='newick', 
                                            rooting='force-rooted', 
                                            preserve_underscores=True)
        
        derep_rel_dist, derep_dist_components, polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups)

        groups_to_consider = groups_to_consider - polyphyletic
        print('Assessing distribution over %d groups after removing polyphyletic groups in original trees.' % len(groups_to_consider))

        # calculate relative distance to each group in each tree
        print('')
        rel_dists = defaultdict(list)
        dist_components = defaultdict(list)
        for f in os.listdir(input_tree_dir):
            if not f.endswith('.rooted.tree'):
                continue

            print(f)

            tree_file = os.path.join(input_tree_dir, f)
            tree = dendropy.Tree.get_from_path(tree_file, 
                                            schema='newick', 
                                            rooting='force-rooted', 
                                            preserve_underscores=True)

            # calculate relative distance to named taxa
            rel_dist, components, _polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups)

            for taxon, dist in rel_dist.items():
                rel_dists[taxon].append(dist)
                dist_components[taxon].append(components[taxon])

        # create scatter plot
        x = []
        y = []
        xDerep = []
        yDerep = []
        xFull = []
        yFull = []
        perc10 = []
        perc90 = []
        labels = []
        fout = open(output_prefix + '.tsv', 'w')
        fout.write('Taxon\tP10\tP90\tP90-P10\tMean rel. dist\tMean dist to parent\tMean dist to leaves\tOriginal rel. dist.\tOriginal dist to parent\tOriginal dist to leaves\n')
        for i, taxon in enumerate(sorted(rel_dists.keys(), reverse=True)):
            labels.append(taxon + ' (%d)' % (len(rel_dists[taxon])))

            rd = rel_dists[taxon]
            for d in rd:
                x.append(d)
                y.append(i + 0.2)

            p10, p90 = np_percentile(rd, [10, 90])
            perc10.append(p10)
            perc90.append(p90)

            print(taxon, p90 - p10)
            mean_x, mean_a, mean_b = np_mean(dist_components[taxon], axis=0)
            derep_x, derep_a, derep_b = derep_dist_components[taxon]
            fout.write('%s\t%.2f\t%.2f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n' % (taxon, p10, p90, p90 - p10, mean_x, mean_a, mean_b, derep_x, derep_a, derep_b))

            xDerep.append(derep_rel_dist[taxon])
            yDerep.append(i)

            xFull.append(full_rel_dist[taxon])
            yFull.append(i)
        fout.close()

        self.fig.clear()
        self.fig.set_size_inches(8, len(rel_dists) * 0.4)
        ax = self.fig.add_subplot(111)

        ax.scatter(x, y, alpha=0.5, s=24, c=(0.5, 0.5, 0.5), marker='s')
        ax.scatter(xDerep, yDerep, alpha=1.0, s=24, c=(1.0, 0.0, 0.0), marker='s')
        ax.scatter(xFull, yFull, alpha=1.0, s=24, c=(0.0, 0.0, 1.0), marker='*')

        for i in range(len(labels)):
            ax.plot((perc10[i], perc10[i]), (i, i + 0.4), 'r-')
            ax.plot((perc90[i], perc90[i]), (i, i + 0.4), 'r-')

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')
        if title:
            ax.set_title(title, size=12)

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('taxa')
        ax.set_yticks(range(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(labels)

        self.prettify(ax)

        # make plot interactive
        # mpld3.plugins.connect(fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        # mpld3.plugins.connect(fig, mpld3.plugins.MousePosition(fontsize=12))

        # mpld3.save_html(fig, output_prefix + '.html')
        self.fig.tight_layout(pad=1)
        self.fig.savefig(output_prefix + '.png', dpi=300)
Exemple #34
    def _distribution_summary_plot(self, phylum_rel_dists, taxa_for_dist_inference, plot_file):
        """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

        Parameters
        ----------
        phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
            Relative divergence of taxon at each rank for different phylum-level rootings.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        # determine median relative distance for each taxa
        medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

        # create percentile and classification boundary lines
        percentiles = {}
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            v = [np_median(dists) for taxon, dists in medians_for_taxa[rank].items() if taxon in taxa_for_dist_inference]
            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

            for b in [-0.2, -0.1, 0.1, 0.2]:
                boundary = p50 + b
                if boundary < 1.0 and boundary > 0.0:
                    if abs(b) == 0.1:
                        c = (1.0, 0.65, 0.0)  # orange
                    else:
                        c = (1.0, 0.0, 0.0)
                    ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

            percentiles[i] = [p10, p50, p90]

        # create scatter plot and results table
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label + ' (%d)' % len(medians_for_taxa[rank]))

            mono = []
            poly = []
            no_inference = []
            for clade_label, dists in medians_for_taxa[rank].items():
                md = np_median(dists)
                x.append(md)
                y.append(i)
                labels.append(clade_label)

                if is_integer(clade_label.split('^')[-1]):
                    # taxa with a numerical suffix after a caret indicate 
                    # polyphyletic groups when decorated with tax2tree
                    c.append((1.0, 0.0, 0.0))
                    poly.append(md)
                elif clade_label not in taxa_for_dist_inference:
                    c.append((0.3, 0.3, 0.3))
                    no_inference.append(md)
                else:
                    c.append((0.0, 0.0, 1.0))
                    mono.append(md)

            # histogram for each rank
            mono = np_array(mono)
            no_inference = np_array(no_inference)
            poly = np_array(poly)
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)

            w = float(len(mono)) / (len(mono) + len(poly) + len(no_inference))
            n = 0
            if len(mono) > 0:  # guard against an empty histogram, as done for the other categories
                mono_max_count = max(np_histogram(mono, bins=bins)[0])
                mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

                n, b, p = ax.hist(mono, bins=bins,
                          color=(0.0, 0.0, 1.0),
                          alpha=0.25,
                          weights=0.9 * w * mono_weights,
                          bottom=i,
                          lw=0,
                          zorder=0)
                      
            if len(no_inference) > 0:
                no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
                no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)

                ax.hist(no_inference, bins=bins,
                          color=(0.3, 0.3, 0.3),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * no_inference_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)

            if len(poly) > 0:
                poly_max_count = max(np_histogram(poly, bins=bins)[0])
                poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

                ax.hist(poly, bins=bins,
                          color=(1.0, 0.0, 0.0),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * poly_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)

        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.01, 1.01])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(range(0, len(medians_for_taxa)))
        ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
Exemple #35
def inset_regular_pols(np_verts,
                       np_pols,
                       np_distances,
                       np_inset_rate,
                       np_make_inners,
                       np_faces_id,
                       custom_normals,
                       matrices,
                       offset_mode='CENTER',
                       proportional=False,
                       concave_support=True,
                       index_offset=0,
                       use_custom_normals=False,
                       output_old_face_id=True,
                       output_old_v_id=True,
                       output_pols_groups=True):

    pols_number = np_pols.shape[0]
    pol_sides = np_pols.shape[1]
    v_pols = np_verts[np_pols]  #shape [num_pols, num_corners, 3]
    if offset_mode == 'SIDES':
        inner_points = sides_mode_inset(v_pols, np_inset_rate, np_distances,
                                        concave_support, proportional,
                                        use_custom_normals, custom_normals)
    elif offset_mode == 'MATRIX':
        inner_points = matrix_mode_inset(v_pols, matrices, use_custom_normals,
                                         custom_normals)
    else:
        if any(np_distances != 0):
            if use_custom_normals:
                normals = custom_normals
            else:
                normals = np_faces_normals(v_pols)
            average = np.sum(v_pols, axis=1) / pol_sides  # shape [num_pols, 3]
            inner_points = (average[:, np_newaxis, :]
                            + (v_pols - average[:, np_newaxis, :]) * np_inset_rate[:, np_newaxis, np_newaxis]
                            + normals[:, np_newaxis, :] * np_distances[:, np_newaxis, np_newaxis])
        else:
            average = np.sum(v_pols, axis=1) / pol_sides  # shape [num_pols, 3]
            inner_points = (average[:, np_newaxis, :]
                            + (v_pols - average[:, np_newaxis, :]) * np_inset_rate[:, np_newaxis, np_newaxis])

    idx_offset = len(np_verts) + index_offset

    new_v_idx = np_arange(idx_offset,
                          pols_number * pol_sides + idx_offset).reshape(
                              pols_number, pol_sides)

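    # Each side face is a quad joining an original edge (v0 -> v1) to the
    # matching edge of the inset ring (new_v1 -> new_v0); np_roll pairs each
    # vertex with its successor around the polygon.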
    side_pols = np.zeros([pols_number, pol_sides, 4], dtype=int)
    side_pols[:, :, 0] = np_pols
    side_pols[:, :, 1] = np_roll(np_pols, -1, axis=1)
    side_pols[:, :, 2] = np_roll(new_v_idx, -1, axis=1)
    side_pols[:, :, 3] = new_v_idx

    side_faces = side_pols.reshape(-1, 4)

    new_insets = new_v_idx[np_make_inners]

    if pol_sides == 4:
        new_faces = np_concatenate([side_faces, new_insets]).tolist()
    else:
        new_faces = side_faces.tolist() + new_insets.tolist()

    old_v_id = np_pols.flatten().tolist() if output_old_v_id else []
    if output_old_face_id:
        side_ids = np.repeat(np_faces_id[:, np_newaxis], pol_sides, axis=1)
        inset_ids = np_faces_id[np_make_inners]
        old_face_id = np.concatenate((side_ids.flatten(), inset_ids)).tolist()
    else:
        old_face_id = []

    if output_pols_groups:
        pols_groups = np_repeat(
            [1, 2], [len(side_faces), len(new_insets)]).tolist()
    else:
        pols_groups = []

    return (inner_points.reshape(-1, 3).tolist(), new_faces,
            new_insets.tolist(), old_v_id, old_face_id, pols_groups)
Exemple #36
from mpi4py import MPI
import sys
from numpy import \
    arange as np_arange, \
    zeros as np_zeros, \
    uint32 as np_uint32
comm = MPI.COMM_WORLD
myrank = comm.Get_rank()
nproc = comm.Get_size()
size = int(sys.argv[1])
partial_sum = np_arange(size, dtype=np_uint32)  # use a 32-bit dtype so the buffer matches MPI.INT (np_arange defaults to 64-bit ints)
if (myrank != 0):
    comm.Send([partial_sum, size, MPI.INT], dest=0, tag=7)
else:
    tmp_sum = np_zeros(size, dtype=np_uint32)
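    # Note: each Recv overwrites tmp_sum rather than accumulating it; an
    # actual reduction would need e.g. a running total (or comm.Reduce)
    # inside the loop.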
    for i in range(1, nproc):
        comm.Recv([tmp_sum, size, MPI.INT], source=i, tag=7)
    print("received data")
Exemple #37
    def table(self, input_tree, taxon_category_file, bl_step_size, output_table):
        """Produce table with number of lineage for increasing mean branch lengths

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        taxon_category_file : str
            File indicating category for each taxon in the tree.
        bl_step_size : float
            Step size in table for mean branch length criterion.
        output_table : str
            Name of output table.
        """
        
        # get category for each taxon
        taxon_category = {}
        for line in open(taxon_category_file):
            line_split = line.strip().split('\t')
            taxon_category[line_split[0]] = line_split[1]

        # read tree
        tree = dendropy.Tree.get_from_path(input_tree,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)
        
        # determine mean distance to leaves and taxon categories for each node
        all_categories = set()
        node_info = {}
        parent_mean_dist_to_leafs = {}
        max_bl_threshold = 0.0  # must be numeric: comparing a float against None raises TypeError in Python 3
        for i, node in enumerate(tree.seed_node.preorder_iter()):
            node.id = i
            
            if node.is_leaf():
                mean_dist_to_leafs = 0.0
                categories = set()
                for c in taxon_category[node.taxon.label].split('/'):
                    categories.add(c)
            else:
                dist_to_leafs = []
                categories = set()
                for t in node.leaf_iter():
                    dist_to_leafs.append(self._dist_to_ancestor(t, node))
                    
                    for c in taxon_category[t.taxon.label].split('/'):
                        categories.add(c)

                mean_dist_to_leafs = np_mean(dist_to_leafs)
                
            if node.parent_node:
                p = parent_mean_dist_to_leafs[node.parent_node.id]
            else:
                p = mean_dist_to_leafs + 1e-6

            category = '/'.join(sorted(list(categories), reverse=True))
            all_categories.add(category)
            node_info[node.id] = [mean_dist_to_leafs, p, category] 
            parent_mean_dist_to_leafs[node.id] = mean_dist_to_leafs
            
            if mean_dist_to_leafs > max_bl_threshold:
                max_bl_threshold = mean_dist_to_leafs
            
        # write table
        fout = open(output_table, 'w')
        fout.write('Threshold')
        for c in all_categories:
            fout.write('\t%s' % c)
        fout.write('\n')
        
        for bl_threshold in np_arange(0, max_bl_threshold + bl_step_size, bl_step_size):
            category_count = defaultdict(int)
            
            stack = [tree.seed_node]
            while stack:
                node = stack.pop()
                
                mean_dist_to_leafs, _, category = node_info[node.id]
                if mean_dist_to_leafs > bl_threshold:
                    for c in node.child_node_iter():
                        stack.append(c)
                else:
                    category_count[category] += 1
                                  
            # check if node meets mean branch length criterion
            if sum(category_count.values()) > 0:
                fout.write('%.3f' % bl_threshold)
                for c in all_categories:
                    fout.write('\t%d' % category_count[c])
                fout.write('\n')
                
        fout.close()
   
        if False:
            node_info.sort()
            for bl_threshold in np_arange(0, node_info[-1][0] + bl_step_size, bl_step_size):
                category_count = defaultdict(int)
                for mean_bl_dist, parent_mean_bl_dist, category in node_info:
                    if bl_threshold >= mean_bl_dist and bl_threshold < parent_mean_bl_dist:
                        category_count[category] += 1
                        
                if sum(category_count.values()) > 0:
                    fout.write('%.3f' % bl_threshold)
                    for c in all_categories:
                        fout.write('\t%d' % category_count[c])
                    fout.write('\n')
Exemple #38
    def _distribution_plot(self, rel_dists, rel_dist_thresholds,
                           taxa_for_dist_inference, distribution_table,
                           plot_file):
        """Create plot showing the distribution of taxa at each taxonomic rank.

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        rel_dist_thresholds: list
            Relative distances cutoffs for defining ranks.
        taxa_for_dist_inference : iterable
            Taxa to considered when inferring distributions.
        distribution_table : str
            Desired name of output table with distribution information.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        # create normal distributions
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [
                dist for taxa, dist in rel_dists[rank].items()
                if taxa in taxa_for_dist_inference
            ]
            u = np_mean(v)
            rv = norm(loc=u, scale=np_std(v))
            x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
            nd = rv.pdf(x)
            ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2)
            ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2)

        # create percentile lines
        percentiles = {}
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [
                dist for taxa, dist in rel_dists[rank].items()
                if taxa in taxa_for_dist_inference
            ]
            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.5), 'r-', zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), 'r-', zorder=2)
            ax.plot((p90, p90), (i, i + 0.5), 'r-', zorder=2)

            percentiles[i] = [p10, p50, p90]

        # create scatter plot and results table
        fout = open(distribution_table, 'w')
        fout.write(
            'Taxa\tRelative Distance\tRank cutoff\tRank outlier\tP10\tMedian\tP90\tPercentile outlier\n'
        )
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        rel_dist_thresholds += [1.0]  # append boundary for species
        for i, rank in enumerate(sorted(rel_dists.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))

            for clade_label, dist in rel_dists[rank].items():
                x.append(dist)
                y.append(i)
                labels.append(clade_label)

                if clade_label in taxa_for_dist_inference:
                    c.append((0.0, 0.0, 0.5))
                else:
                    c.append((0.5, 0.5, 0.5))

                p10, p50, p90 = percentiles[i]
                percentile_outlier = not (dist >= p10 and dist <= p90)

                if i == 0:
                    rank_cutoff = rel_dist_thresholds[i]
                    rank_outlier = dist > rank_cutoff
                else:
                    rank_cutoff = rel_dist_thresholds[i]
                    upper_rank_cutoff = rel_dist_thresholds[i - 1]
                    rank_outlier = not (dist >= upper_rank_cutoff
                                        and dist <= rank_cutoff)

                v = [clade_label, dist, rank_cutoff, str(rank_outlier)]
                v += percentiles[i] + [str(percentile_outlier)]
                fout.write('%s\t%.2f\t%.2f\t%s\t%.2f\t%.2f\t%.2f\t%s\n' %
                           tuple(v))
        fout.close()

        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(range(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # plot relative divergence threshold lines
        y_min, y_max = ax.get_ylim()
        for threshold in rel_dist_thresholds[0:-1]:  # don't draw species boundary
            ax.plot((threshold, threshold), (y_min, y_max), color='r', ls='--')
            ax.text(threshold + 0.001,
                    y_max,
                    '%.3f' % threshold,
                    horizontalalignment='center')

        # make plot interactive
        mpld3.plugins.connect(
            self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig,
                              mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=96)