# Assumes RaDec2XYZ and PairwiseKDTreeCore are helpers defined elsewhere
# in the same module.
import numpy as np
from scipy.spatial import cKDTree as KDTree


def CalculatePairwiseKDTree(ra, dec, com_dists, field, sep_min=0,
                            sep_max=300, nbins=20, flip_sign=False):
    assert len(ra) == len(com_dists)
    nclusts = len(ra)
    delta_sep = (sep_max - sep_min) / nbins
    bins = np.arange(sep_min, sep_max + delta_sep, delta_sep)
    vec_unit = RaDec2XYZ(ra, dec)  # Get unit vectors pointing to the clusters
    vec_dist = (vec_unit.T * com_dists).T  # Mpc
    tree = KDTree(vec_dist)
    # All pairs closer than sep_max, as a record array with fields i, j, v
    D = tree.sparse_distance_matrix(tree, sep_max, p=2.0,
                                    output_type='ndarray')
    # Keep each unordered pair exactly once
    DU = D[D['i'] < D['j']]
    pw = PairwiseKDTreeCore(DU, vec_unit, com_dists, field, bins, flip_sign)
    return pw
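
# Usage sketch (illustrative data, not from the original source): with
# output_type='ndarray', sparse_distance_matrix returns a record array
# with fields 'i', 'j', 'v'; pairing a tree with itself yields self-pairs
# and both (i, j)/(j, i) orderings, so keeping D['i'] < D['j'] retains
# each unordered pair exactly once, as the function above relies on.
import numpy as np
from scipy.spatial import cKDTree as KDTree

pts = np.random.rand(100, 3) * 300.0  # stand-in cluster positions, Mpc
tree = KDTree(pts)
D = tree.sparse_distance_matrix(tree, 50.0, p=2.0, output_type='ndarray')
DU = D[D['i'] < D['j']]
print(DU['i'][:5], DU['j'][:5], DU['v'][:5])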

# Method excerpt; assumes module-level imports such as:
#   import numpy as np
#   from scipy.sparse import dok_matrix
#   from scipy.spatial import cKDTree as KDTree
def guess_bonds(self, tolerance=0.40):
    """
    Use geometric distances and covalent radii to determine bonding
    information for this molecule.

    Bonding is determined by the distance between sites being closer
    than the sum of covalent radii + `tolerance`. Will set the `bonds`
    member.

    If the `graph_tool` library is available, this will call the
    `bond_graph` method to populate the connectivity graph.

    Args:
        tolerance (float, optional): Additional tolerance for attributing
            two sites as 'bonded'. The default is 0.4 angstroms, which is
            recommended by the CCDC.
    """
    tree = KDTree(self.positions)
    covalent_radii = np.array([x.cov for x in self.elements])
    max_cov = np.max(covalent_radii)
    # Per-pair bonding thresholds via broadcasting: r_i + r_j + tolerance
    thresholds = (covalent_radii[:, np.newaxis]
                  + covalent_radii[np.newaxis, :] + tolerance)
    max_distance = max_cov * 2 + tolerance
    dist = tree.sparse_distance_matrix(
        tree, max_distance=max_distance).toarray()
    mask = (dist > 0) & (dist < thresholds)
    self.bonds = np.zeros(dist.shape)
    self.bonds[mask] = dist[mask]
    self.bonds = dok_matrix(self.bonds)
    try:
        self.bond_graph()
    except Exception:
        pass
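
# Minimal standalone sketch of the same thresholding idea (hypothetical
# water-molecule coordinates and covalent radii, not from the original
# source): pairwise distances below the per-pair radius sum plus
# tolerance count as bonds.
import numpy as np
from scipy.spatial import cKDTree as KDTree

positions = np.array([[0.000, 0.000, 0.000],    # O
                      [0.757, 0.586, 0.000],    # H
                      [-0.757, 0.586, 0.000]])  # H
cov = np.array([0.66, 0.31, 0.31])              # covalent radii, angstroms
tolerance = 0.40
tree = KDTree(positions)
dist = tree.sparse_distance_matrix(
    tree, max_distance=2 * cov.max() + tolerance).toarray()
thresholds = cov[:, np.newaxis] + cov[np.newaxis, :] + tolerance
bonded = (dist > 0) & (dist < thresholds)
print(np.argwhere(bonded))  # expect O-H pairs only, not H-H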

import numpy as np
from scipy.spatial import cKDTree as KDTree


def get_rdf(pos, inside, Nbins=250, maxdist=30.0):
    """Radial distribution function, not normalised.

    For each particle tagged as inside, count the particles around and
    bin them with respect to distance. Needs to be normalised by
    inside.sum() and by density x volume of the spherical shell between
    r and r + maxdist/Nbins.

    - pos is a Nxd array of coordinates, with d the dimension of space
    - inside is a N array of booleans, e.g. all particles further away
      than maxdist from any edge of the box
    - Nbins is the number of bins along r
    - maxdist is the maximum distance considered"""
    # An additional bin for the case where the distance is exactly maxdist
    g = np.zeros(Nbins + 1, int)
    # Conversion factor between distances and bin indices
    l2r = Nbins / maxdist
    # Spatial indexing
    tree = KDTree(pos, 12)
    centertree = KDTree(pos[inside], 12)
    centerindex = np.where(inside)[0]
    # All pairs of points closer than maxdist, with their distances,
    # in a record array
    query = centertree.sparse_distance_matrix(tree, maxdist,
                                              output_type='ndarray')
    # Keep only pairs where the points are distinct
    query['i'] = centerindex[query['i']]
    good = query['i'] != query['j']
    query = query[good]
    # Binning
    rs = (query['v'] * l2r).astype(int)
    np.add.at(g, rs, 1)
    return g[:-1]
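
# Usage sketch (assumed setup, not from the original source): random
# points in a box of side L, with "inside" particles at least maxdist
# from every face, then normalisation by the number of centers and the
# ideal-gas count in each spherical shell, as the docstring prescribes.
import numpy as np

L, maxdist, Nbins = 100.0, 10.0, 250
pos = np.random.uniform(0, L, size=(5000, 3))
inside = np.all((pos > maxdist) & (pos < L - maxdist), axis=1)
g = get_rdf(pos, inside, Nbins=Nbins, maxdist=maxdist)
dr = maxdist / Nbins
r = (np.arange(Nbins) + 0.5) * dr
density = len(pos) / L**3
shell = 4 * np.pi * r**2 * dr * density
rdf = g / (inside.sum() * shell)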

# boo_product is a helper defined elsewhere in the same module.
import numpy as np
from scipy.spatial import cKDTree as KDTree


def gG_l(pos, qlms, is_center, Nbins, maxdist):
    """Spatial correlation of the qlms (non normalized).

    For each particle i tagged as is_center, for each particle j closer
    than maxdist, do the cross product between their qlm and count, then
    bin each quantity with respect to distance. The first output (sums of
    cross products) needs to be normalised by the second (pair counts).

    - pos is a Nxd array of coordinates, with d the dimension of space
    - qlms is a list of Nx(2l+1) arrays of boo coordinates for l-fold
      symmetry. l can be different for each item.
    - is_center is a N array of booleans, e.g. all particles further away
      than maxdist from any edge of the box
    - Nbins is the number of bins along r
    - maxdist is the maximum distance considered"""
    for qlm in qlms:
        assert len(pos) == len(qlm)
    assert len(is_center) == len(pos)
    # Conversion factor between distances and bin indices
    l2r = Nbins / maxdist
    # Result containers
    hqQ = np.zeros((Nbins, len(qlms)))
    g = np.zeros(Nbins, int)
    # Spatial indexing
    tree = KDTree(pos, 12)
    centertree = KDTree(pos[is_center], 12)
    # All pairs of points closer than maxdist, with their distances,
    # in a record array
    query = centertree.sparse_distance_matrix(tree, maxdist,
                                              output_type='ndarray')
    # Keep only pairs where the points are distinct
    centerindex = np.where(is_center)[0]
    query['i'] = centerindex[query['i']]
    good = query['i'] != query['j']
    query = query[good]
    # Binning of distances
    rs = (query['v'] * l2r).astype(int)
    np.add.at(g, rs, 1)
    # Binning of boo cross products
    pqQs = np.empty((len(rs), len(qlms)))
    for it, qlm in enumerate(qlms):
        pqQs[:, it] = boo_product(qlm[query['i']], qlm[query['j']])
    np.add.at(hqQ, rs, pqQs)
    return hqQ, g
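
# The binning pattern used above, in isolation (illustrative values
# only): distances map to integer bins via Nbins/maxdist, and np.add.at
# accumulates correctly even when a bin index repeats, which plain fancy
# indexing (g[rs] += 1) would not.
import numpy as np

Nbins, maxdist = 5, 10.0
v = np.array([1.2, 1.3, 9.9, 4.0])      # pair distances
rs = (v * Nbins / maxdist).astype(int)  # -> bins [0, 0, 4, 2]
g = np.zeros(Nbins, int)
np.add.at(g, rs, 1)                     # g == [2, 0, 1, 0, 1]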

# A later variant of gG_l that normalises each cross product and skips
# particles with vanishing ql. ql and product are helpers defined
# elsewhere in the same module.
import numpy as np
from scipy.spatial import cKDTree as KDTree


def gG_l(pos, qlms, is_center, Nbins, maxdist):
    """
    Spatial correlation of the qlms (non normalized).

    For each particle i tagged as is_center, for each particle j closer
    than maxdist, do the cross product between their qlm and count, then
    bin each quantity with respect to distance. The first output needs to
    be normalised by the second one. Periodic boundary conditions are not
    supported.

    Parameters
    ----------
    pos : (N, 3) array of floats
        Spatial coordinates.
    qlms : list
        A list of M (N, 2l+1) arrays of boo coordinates for l-fold
        symmetry. l can be different for each item.
    is_center : (N) array of bool
        For example all particles further away than maxdist from any
        edge of the box.
    Nbins : int
        The number of bins along r.
    maxdist : float
        The maximum distance considered.

    Returns
    -------
    hqQ : (Nbins, M) array of floats
        The sum of cross products for each distance and each qlm.
    g : (Nbins) array of ints
        The number of pairs for each distance.
    """
    for qlm in qlms:
        assert len(pos) == len(qlm)
    assert len(is_center) == len(pos)
    # Conversion factor between distances and bin indices
    l2r = Nbins / maxdist
    # Result containers, with an additional bin for the case where the
    # distance is exactly equal to maxdist
    hqQ = np.zeros((Nbins + 1, len(qlms)))
    g = np.zeros(Nbins + 1, int)
    # Compute ql for all particles; keep only particles where every ql
    # is strictly positive (floating-point-safe test)
    qQ = np.array([ql(qlm) for qlm in qlms])
    nonzero = qQ.min(0) + 1.0 > 1.0
    # Spatial indexing
    tree = KDTree(pos[nonzero], 12)
    centertree = KDTree(pos[is_center & nonzero], 12)
    # All pairs of points closer than maxdist, with their distances,
    # in a record array
    query = centertree.sparse_distance_matrix(tree, maxdist,
                                              output_type='ndarray')
    # Convert to original indices
    nonzeroindex = np.where(nonzero)[0]
    centerindex = np.where(is_center & nonzero)[0]
    query['i'] = centerindex[query['i']]
    query['j'] = nonzeroindex[query['j']]
    # Keep only pairs where the points are distinct
    good = query['i'] != query['j']
    query = query[good]
    # Binning of distances
    rs = (query['v'] * l2r).astype(int)
    np.add.at(g, rs, 1)
    # Binning of normalised boo cross products
    pqQs = np.empty((len(rs), len(qlms)))
    for it, qlm in enumerate(qlms):
        pqQs[:, it] = product(qlm[query['i']], qlm[query['j']])
        prodnorm = qQ[it, query['i']] * qQ[it, query['j']]
        pqQs[:, it] /= prodnorm
    np.add.at(hqQ, rs, pqQs)
    return hqQ[:-1], g[:-1]
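
# Hedged usage sketch: turning the sums returned by gG_l into a
# normalised correlation per distance bin. The qlm input (q6m here) is
# assumed to have been computed elsewhere with this module's helpers;
# the names are illustrative, not from the original source.
import numpy as np

hqQ, g = gG_l(pos, [q6m], is_center, Nbins=200, maxdist=30.0)
valid = g > 0
g6 = np.zeros(len(g))
g6[valid] = hqQ[valid, 0] / g[valid]  # correlation of q6m vs distance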

# Method excerpt; assumes module-level imports such as:
#   import numpy as np
#   from astropy import units as u
#   from astropy import wcs
#   from astropy.table import Table, MaskedColumn, join
#   from scipy.spatial import cKDTree as KDT
# crossmatch_cartesian is a helper defined elsewhere in the package.
def crossmatch_gaia(self, plate_solution=None, star_catalog=None):
    """
    Crossmatch sources with Gaia objects, considering multiple solutions.

    Parameters
    ----------
    plate_solution : :class:`solve.PlateSolution`
        Plate solution with one or more astrometric solutions
    star_catalog : :class:`catalog.StarCatalog`
        External star catalog with Gaia data
    """
    from .solve import PlateSolution
    from .catalog import StarCatalog

    self.log.write('Crossmatching sources with Gaia objects', level=3,
                   event=44)

    if plate_solution is None or plate_solution.num_solutions == 0:
        self.log.write('Cannot crossmatch sources with Gaia objects '
                       'due to missing astrometric solutions!',
                       level=2, event=44)
        return

    if star_catalog is None:
        self.log.write('Cannot crossmatch sources with Gaia objects '
                       'due to missing Gaia catalog data!',
                       level=2, event=44)
        return

    assert isinstance(plate_solution, PlateSolution)
    assert isinstance(star_catalog, StarCatalog)

    # Take parameters from plate_solution
    num_solutions = plate_solution.num_solutions
    solutions = plate_solution.solutions
    mean_pixscale = plate_solution.mean_pixel_scale

    # Number of Gaia stars
    num_gaia = len(star_catalog)
    self.log.write('Number of Gaia stars: {:d}'.format(num_gaia),
                   level=4, event=44, double_newline=False)

    # Calculate RA and Dec for the plate epoch
    ra_ref = (star_catalog['ra']
              + (self.plate_epoch - star_catalog['ref_epoch'])
              * star_catalog['pmra']
              / np.cos(star_catalog['dec'] * np.pi / 180.)
              / 3600000.)
    dec_ref = (star_catalog['dec']
               + (self.plate_epoch - star_catalog['ref_epoch'])
               * star_catalog['pmdec'] / 3600000.)
    #catalog = SkyCoord(ra_ref, dec_ref, frame='icrs')

    xy_ref = np.empty((0, 2))
    sol_ref = np.empty((0,), dtype=np.int8)
    index_ref = np.empty((0,), dtype=np.int32)

    # Build a list of Gaia stars in image coordinates
    for i in np.arange(plate_solution.num_solutions):
        solution = solutions[i]

        # If there is a column named 'solution_num', then take only
        # reference stars with the current solution number
        if 'solution_num' in star_catalog.columns:
            mask_sol = star_catalog['solution_num'] == i + 1
        else:
            mask_sol = np.full(num_gaia, True)

        w = wcs.WCS(solution['header_wcs'])

        try:
            xr, yr = w.all_world2pix(ra_ref[mask_sol], dec_ref[mask_sol], 1)
        except wcs.NoConvergence as e:
            self.log.write('Failed to convert sky coordinates to '
                           'pixel coordinates for solution {:d}: {}'
                           .format(i + 1, e))
            continue

        mask_inside = ((xr > 0.5) & (xr < plate_solution.imwidth)
                       & (yr > 0.5) & (yr < plate_solution.imheight))
        num_inside = mask_inside.sum()
        xyr = np.vstack((xr[mask_inside], yr[mask_inside])).T
        xy_ref = np.vstack((xy_ref, xyr))
        sol_ref = np.hstack((sol_ref, np.full(num_inside, i + 1)))
        index_ref = np.hstack((index_ref,
                               np.arange(num_gaia)[mask_sol][mask_inside]))

    # Calculate mean astrometric error
    sigma1 = u.Quantity([sol['scamp_sigma_1'] for sol in solutions
                         if sol['scamp_sigma_1'] is not None])
    sigma2 = u.Quantity([sol['scamp_sigma_2'] for sol in solutions
                         if sol['scamp_sigma_2'] is not None])

    if len(sigma1) > 0 and len(sigma2) > 0:
        mean_scamp_sigma = np.sqrt(sigma1.mean()**2 + sigma2.mean()**2)
    else:
        mean_scamp_sigma = 2. * u.arcsec

    # Crossmatch sources and Gaia stars
    coords_plate = np.vstack((self['x_source'], self['y_source'])).T
    tolerance = ((5. * mean_scamp_sigma / mean_pixscale)
                 .to(u.pixel).value)

    #if (5. * mean_scamp_sigma) < 2 * u.arcsec:
    #    tolerance = ((2 * u.arcsec / mean_pixscale)
    #                 .to(u.pixel).value)

    tolerance_arcsec = (5. * mean_scamp_sigma).to(u.arcsec).value
    self.log.write('Crossmatch tolerance: {:.2f} arcsec ({:.2f} pixels)'
                   .format(tolerance_arcsec, tolerance),
                   level=4, event=44, double_newline=False)
    ind_plate, ind_ref, ds = crossmatch_cartesian(coords_plate, xy_ref,
                                                  tolerance=tolerance)
    dist_arcsec = (ds * u.pixel * mean_pixscale).to(u.arcsec).value
    ind_gaia = index_ref[ind_ref]
    self['solution_num'][ind_plate] = sol_ref[ind_ref]
    self['match_radius'][ind_plate] = tolerance_arcsec
    self['gaiaedr3_id'][ind_plate] = star_catalog['source_id'][ind_gaia]
    self['gaiaedr3_gmag'][ind_plate] = star_catalog['mag'][ind_gaia]
    self['gaiaedr3_bpmag'][ind_plate] = star_catalog['mag1'][ind_gaia]
    self['gaiaedr3_rpmag'][ind_plate] = star_catalog['mag2'][ind_gaia]
    self['gaiaedr3_bp_rp'][ind_plate] = star_catalog['color_index'][ind_gaia]
    self['gaiaedr3_dist'][ind_plate] = dist_arcsec
    self.num_crossmatch_gaia = len(ind_plate)

    # Mask nan values in listed columns
    for col in ['gaiaedr3_gmag', 'gaiaedr3_bpmag', 'gaiaedr3_rpmag',
                'gaiaedr3_bp_rp', 'gaiaedr3_dist']:
        self[col] = MaskedColumn(self[col], mask=np.isnan(self[col]))

    # Mask zeros in the ID column
    col = 'gaiaedr3_id'
    self[col] = MaskedColumn(self[col], mask=(self[col] == 0))

    # Store number of crossmatched sources for each solution
    grp = self.group_by('solution_num').groups
    tab_grp = Table(grp.aggregate(len)['solution_num', 'source_num'])
    tab_grp.rename_column('source_num', 'num_gaia_edr3')

    for i in np.arange(plate_solution.num_solutions):
        solution = solutions[i]
        m = tab_grp['solution_num'] == i + 1

        if m.sum() > 0:
            num_gaia_edr3 = tab_grp['num_gaia_edr3'][m].data[0]
            solution['num_gaia_edr3'] = num_gaia_edr3
        else:
            solution['num_gaia_edr3'] = 0

    # Crossmatch: find all neighbours for sources
    kdt_ref = KDT(xy_ref)
    kdt_plate = KDT(coords_plate)
    max_distance = ((20. * mean_scamp_sigma / mean_pixscale)
                    .to(u.pixel).value)
    if (20. * mean_scamp_sigma) < 5 * u.arcsec:
        max_distance = (5 * u.arcsec / mean_pixscale).to(u.pixel).value

    max_dist_arcsec = (max_distance * u.pixel
                       * mean_pixscale).to(u.arcsec).value
    self.log.write('Finding all reference stars around sources within '
                   'the radius of {:.2f} arcsec ({:.2f} pixels)'
                   .format(max_dist_arcsec, max_distance),
                   level=4, event=44)
    mtrx = kdt_plate.sparse_distance_matrix(kdt_ref, max_distance)
    mtrx_keys = np.array([a for a in mtrx.keys()])

    # Check if there are neighbors at all
    if len(mtrx_keys) > 0:
        k_plate = mtrx_keys[:, 0]
        k_ref = mtrx_keys[:, 1]
        dist = np.fromiter(mtrx.values(), dtype=float) * u.pixel

        # Construct neighbors table
        nbs = Table()
        nbs['source_num'] = self['source_num'][k_plate]
        nbs['gaiaedr3_id'] = star_catalog['source_id'][index_ref[k_ref]]
        nbs['dist'] = dist
        nbs['solution_num'] = sol_ref[k_ref]
        nbs['x_gaia'] = xy_ref[k_ref, 0]
        nbs['y_gaia'] = xy_ref[k_ref, 1]

        # Create the flag_xmatch column by joining the neighbors table
        # with the source table
        tab = Table()
        tab['source_num'] = self['source_num']
        tab['gaiaedr3_id'] = MaskedColumn(self['gaiaedr3_id']).filled(0)
        tab['flag_xmatch'] = np.int8(1)
        jtab = join(nbs, tab, keys=('source_num', 'gaiaedr3_id'),
                    join_type='left')
        jtab['flag_xmatch'] = MaskedColumn(jtab['flag_xmatch']).filled(0)
        self.neighbors_gaia = jtab

        # Calculate neighbor counts
        source_num, cnt = np.unique(nbs['source_num'].data,
                                    return_counts=True)
        mask = np.isin(self['source_num'], source_num)
        ind_mask = np.where(mask)[0]
        self['gaiaedr3_neighbors'][ind_mask] = cnt
    else:
        # Create empty neighbors table
        nbs = Table(names=('source_num', 'gaiaedr3_id', 'dist',
                           'solution_num', 'x_gaia', 'y_gaia',
                           'flag_xmatch'),
                    dtype=('i4', 'i8', 'f4', 'i2', 'f8', 'f8', 'i1'))
        self.neighbors_gaia = nbs

    # Process coordinates again, because solution_num assignments
    # may have changed
    self.process_coordinates(plate_solution=plate_solution)
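
# Isolated sketch of the neighbour-search pattern used above (synthetic
# coordinates, not from the original source): querying one tree against
# another with a pixel radius and unpacking the default dok-matrix
# result into index pairs and distances.
import numpy as np
from scipy.spatial import cKDTree as KDT

xy_ref = np.random.rand(1000, 2) * 2000.0      # reference-star pixels
coords_plate = np.random.rand(500, 2) * 2000.0  # source pixels
mtrx = KDT(coords_plate).sparse_distance_matrix(KDT(xy_ref), 10.0)
mtrx_keys = np.array([a for a in mtrx.keys()])
if len(mtrx_keys) > 0:
    k_plate, k_ref = mtrx_keys[:, 0], mtrx_keys[:, 1]
    dist = np.fromiter(mtrx.values(), dtype=float)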

# Class excerpt; assumes module-level definitions such as _LOGGER,
# RADIUS_CUTOFF, sphere() (reference unit-sphere points) and Tree
# (e.g. scipy.spatial.cKDTree), plus `import numpy as np`.
class SolventAccessibleSurface:
    """Calculate Lee-Richards solvent-accessible surface area for folded
    proteins."""

    def __init__(self, atoms, probe_radius, num_points=1000,
                 xyz_path="surface.xyz"):
        """Initialize the object.

        :param list(Atom) atoms: list of atoms from which to construct
            surface
        :param float probe_radius: radius of probe atom (solvent) in
            Angstroms
        :param int num_points: number of points to use for reference
            sphere
        :param str xyz_path: path to xyz file (if None, no file is
            written)
        """
        self.atoms = atoms
        self.probe_radius = probe_radius
        self.num_points = num_points
        self.sphere = sphere(num_points)
        self.max_radius = max([atom.radius for atom in self.atoms])
        self.max_search = 2 * self.max_radius + 2 * probe_radius
        _LOGGER.debug("max_search = %g", self.max_search)
        # Set up atom surface reference spheres
        self.surfaces = []
        for atom in self.atoms:
            if atom.radius < RADIUS_CUTOFF:
                self.surfaces.append(None)
            else:
                atom_sphere = ((atom.radius + self.probe_radius)
                               * self.sphere + atom.position)
                self.surfaces.append(atom_sphere)
        # Set up tree structure for distance lookup
        self.tree = Tree([atom.position for atom in self.atoms])
        matrix = self.tree.sparse_distance_matrix(
            self.tree, self.max_search, output_type="coo_matrix")
        # Test individual surfaces
        _LOGGER.debug(matrix)
        for i, j, distance in zip(matrix.row, matrix.col, matrix.data):
            if i != j:
                if np.isclose(distance, 0) and np.isclose(
                        self.atoms[i].radius, self.atoms[j].radius):
                    errstr = f"Overlapping atoms ({i}, {j}) of equal radius"
                    _LOGGER.warning(errstr)
                    # Split the shared surface between the two atoms
                    if i < j:
                        self.surfaces[i] = self.surfaces[i][0::2]
                    else:
                        self.surfaces[i] = self.surfaces[i][1::2]
                else:
                    self.prune_surface(i, j)
        # Dump surface
        if xyz_path is not None:
            self.dump_xyz(xyz_path)

    def dump_xyz(self, xyz_path):
        """Dump surface in XYZ format.

        :param str xyz_path: path for XYZ-format data
        """
        _LOGGER.debug("Writing debug coordinates to %s", xyz_path)
        fmt = "{name} {x:>8.3f} {y:>8.3f} {z:>8.3f}"
        with open(xyz_path, "wt") as xyz_file:
            for surface in self.surfaces:
                if surface is not None:
                    for point in surface:
                        xyz_file.write("%s\n" % fmt.format(
                            name="P", x=point[0], y=point[1], z=point[2]))

    def atom_surface_area(self, iatom):
        """Calculate surface area for this atom.

        :param int iatom: index of the atom in the atom list
        :returns: total surface area (Angstroms^2)
        :rtype: float
        """
        num_ref = np.shape(self.sphere)[0]
        surf = self.surfaces[iatom]
        if surf is not None:
            num_surf = np.shape(surf)[0]
        else:
            num_surf = 0
        atom = self.atoms[iatom]
        tot_radius = atom.radius + self.probe_radius
        area = 4 * np.pi * tot_radius * tot_radius
        # The fraction of reference points that survived pruning scales
        # the area of the solvent-expanded sphere
        return area * float(num_surf) / float(num_ref)

    def surface_area_dictionary(self):
        """Calculate surface area, indexed by chain/residue.

        :returns: surface area (Angstroms^2) for each residue
        :rtype: dict
        """
        area_dict = {}
        for iatom, atom in enumerate(self.atoms):
            if atom.chain_id:
                chain_id = f"{atom.chain_id}:"
            else:
                chain_id = ""
            key = f"{chain_id}{atom.res_num}:{atom.res_name}"
            area = self.atom_surface_area(iatom)
            if key in area_dict:
                area_dict[key] = area_dict[key] + area
            else:
                area_dict[key] = area
        return area_dict

    def prune_surface(self, iatom1, iatom2):
        """Prune the surface of atom1 based on the presence of atom2.

        :param int iatom1: index of first atom
        :param int iatom2: index of second atom
        """
        atom2 = self.atoms[iatom2]
        if self.surfaces[iatom1] is not None:
            disp12 = self.surfaces[iatom1] - atom2.position
            dist12 = np.sum(disp12**2, axis=1)
            max12 = np.square(atom2.radius + self.probe_radius)
            # Keep only points outside atom2's solvent-expanded sphere
            self.surfaces[iatom1] = self.surfaces[iatom1][dist12 > max12]
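
# Isolated sketch of the pruning loop's input (synthetic data, not from
# the original source): with output_type="coo_matrix", pairs within
# range arrive as parallel row/col/data arrays, so each neighbour pair
# can be visited directly.
import numpy as np
from scipy.spatial import cKDTree

points = np.random.rand(50, 3) * 20.0
tree = cKDTree(points)
matrix = tree.sparse_distance_matrix(tree, 5.0, output_type="coo_matrix")
for i, j, distance in zip(matrix.row, matrix.col, matrix.data):
    if i != j:
        pass  # e.g. prune surface points of atom i occluded by atom j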

import networkx as nx
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree as KDTree


def basecallingMaxFlowMinCost(G, Dvar, Tvar, max_df, final_norm, DNN_model,
                              num_hybs):
    barcodes = []
    T_Costs = []
    D_Costs = []
    signals_df = []

    for c in nx.connected_components(G):
        c = np.array(list(c))
        c = c[c <= Dvar.X_idx.max()]

        if len(Dvar[Dvar.X_idx.isin(c)]) == num_hybs:
            barcodes.append(
                np.array(
                    np.hstack((
                        np.matrix(Dvar[Dvar.X_idx.isin(c)]
                                  .iloc[0, :][['x', 'y', 'z']].values),
                        np.matrix(Dvar[Dvar.X_idx.isin(c)]['ch'])
                    ))).flatten().tolist())
            s_df = pd.DataFrame(
                data=Dvar[Dvar.X_idx.isin(c)][['ch', 'hyb', 'x', 'y', 'z']])
            signals_df.append(s_df)

            # Transition cost: the largest pairwise distance (diameter)
            # within the group of signals
            k1 = KDTree(s_df[['x', 'y', 'z']].values)
            T_Costs.append(
                np.amax(list(k1.sparse_distance_matrix(k1, np.inf).values())))

            prob_s = []
            I_s = []

            for s, row in s_df.iterrows():
                ch = row.ch - 2
                if ch == 0:    # T
                    max_df_tmp = max_df[(max_df.x_T == row.x)
                                        & (max_df.y_T == row.y)
                                        & (max_df.z_T == row.z)
                                        & (max_df.cycle == row.hyb - 1)]
                elif ch == 1:  # G
                    max_df_tmp = max_df[(max_df.x_G == row.x)
                                        & (max_df.y_G == row.y)
                                        & (max_df.z_G == row.z)
                                        & (max_df.cycle == row.hyb - 1)]
                elif ch == 2:  # C
                    max_df_tmp = max_df[(max_df.x_C == row.x)
                                        & (max_df.y_C == row.y)
                                        & (max_df.z_C == row.z)
                                        & (max_df.cycle == row.hyb - 1)]
                elif ch == 3:  # A
                    max_df_tmp = max_df[(max_df.x_A == row.x)
                                        & (max_df.y_A == row.y)
                                        & (max_df.z_A == row.z)
                                        & (max_df.cycle == row.hyb - 1)]

                I = np.array([max_df_tmp.I_T.values[0],
                              max_df_tmp.I_G.values[0],
                              max_df_tmp.I_C.values[0],
                              max_df_tmp.I_A.values[0]])
                noNaN = np.argwhere(~np.isnan(I)).flatten()
                prob_r = []

                for n in range(4):
                    if n in noNaN:
                        x_ch = max_df_tmp.iloc[:, n * 3 + 4].values[0]
                        y_ch = max_df_tmp.iloc[:, n * 3 + 5].values[0]
                        z_ch = max_df_tmp.iloc[:, n * 3 + 6].values[0]
                    else:
                        x_ch = row.x
                        y_ch = row.y
                        z_ch = row.z
                        I[n] = final_norm[int(max_df_tmp.cycle), int(n + 2),
                                          int(z_ch), int(y_ch), int(x_ch)]

                    rect = final_norm[int(max_df_tmp.cycle), int(n + 2),
                                      int(z_ch),
                                      int(y_ch - 2):int(y_ch + 3),
                                      int(x_ch - 2):int(x_ch + 3)]

                    if rect.size == 25:
                        rect = (rect - np.amin(rect)) / (np.amax(rect)
                                                         - np.amin(rect))
                        rect = rect - np.mean(rect)
                        X_data = rect.reshape(1, 5, 5, 1)
                        # Getting probabilities from the DNN
                        prob_r.append(DNN_model.predict_proba(X_data)[:, 1][0])
                    else:
                        prob_r.append(0)

                prob_r = np.array(prob_r).reshape(-1)
                prob_r_max = prob_r[int(row.ch) - 2]
                I_max = I[int(row.ch) - 2]
                I_ch = np.delete(I, int(row.ch) - 2)
                prob_ch = np.delete(prob_r, int(row.ch) - 2)
                I_ch = I_ch[prob_ch > 0.5]
                prob_ch = prob_ch[prob_ch > 0.5]

                if len(prob_ch):
                    prob_s.append((I_max * prob_r_max)
                                  / (I_max * prob_r_max
                                     + np.amax(I_ch * prob_ch)))
                else:
                    prob_s.append(prob_r_max)

                I_s.append(I_max)

            prob_s = np.array(prob_s)
            I_s = np.array(I_s)
            D_Costs.append(np.sum(prob_s))

    barcodes = pd.DataFrame(barcodes)

    for i in range(3, len(barcodes.columns)):
        barcodes.iloc[:, i].replace([2, 3, 4, 5], ['T', 'G', 'C', 'A'],
                                    inplace=True)

    return {'barcodes': barcodes, 'T_Costs': T_Costs, 'D_Costs': D_Costs,
            'signals_df': signals_df}
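
# The group-diameter trick used for T_Costs above, in isolation
# (synthetic points): with an infinite cutoff, sparse_distance_matrix
# holds every pairwise distance, so its maximum is the diameter of the
# point set. Fine for the handful of signals in a barcode group, but
# quadratic in the number of points.
import numpy as np
from scipy.spatial import cKDTree as KDTree

pts = np.random.rand(8, 3)
k1 = KDTree(pts)
diameter = np.amax(list(k1.sparse_distance_matrix(k1, np.inf).values()))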