def output_multiple_chains(chains, filename, file_type="pdb"):
    '''
    Dump multiple chains to an output file. Remove the hydrogen atoms.

    :param chains: An iterable of Bio.PDB.Chain to dump.
    :param filename: The place to dump it.
    :param file_type: "pdb" writes via PDBIO; any other value writes
                      via MMCIFIO.
    :raises ValueError: If file_type is "pdb" and a chain id is not
                        exactly one character long.
    '''
    class HSelect(bpdb.Select):
        def accept_atom(self, atom):
            # Reject every atom whose name contains an 'H'.
            return atom.name.find('H') < 0

    m = bpdb.Model.Model(0)
    s = bpdb.Structure.Structure('stru')
    for chain in chains:
        log.debug("Adding chain %s with %s residues", chain.id, len(chain))
        m.add(chain)
        if file_type == "pdb" and len(chain.id) != 1:
            # The chain id is interpolated via a 1-tuple so that the
            # message actually names the offending chain.
            raise ValueError(
                "Cannot save chain with name %s (not a single character) "
                "in PDB format. Use cif format instead!" % (chain.id,))
    s.add(m)

    if file_type == "pdb":
        io = bpdb.PDBIO()
    else:
        io = bpdb.MMCIFIO()
    io.set_structure(s)
    try:
        io.save(filename, HSelect())
    except Exception as e:
        # Attach the chain/residue listing to the exception, then re-raise.
        with log_to_exception(log, e):
            log.error("Could not output PDB with chains and residues:")
            for chain in s[0]:
                log.error("%s: %s", chain.id, [r.id for r in chain])
        raise
def parse_base_pair_id(base_pair_id):
    """
    Split the identifier of an interaction from an MC-Annotate output file
    into the chain/base identifiers of its two nucleotides.

    @param base_pair_id: The identifier string for the interacting
                         nucleotides (i.e. 'A33-B45')
    @return: 4-tuple (chain1, res1, chain2, res2), where each
             (chain, res) pair is the result of parse_chain_base for the
             corresponding half of the identifier.
    @raise ValueError: If the identifier does not consist of exactly two
                       residue descriptions joined by a single '-'.
    """
    # One residue: a digit in single quotes or a letter (the chain),
    # a possibly negative number and an optional ".X" insertion code.
    residue_pattern = r"(?:'\d'|[A-Za-z])-?\d+(?:\.[A-Za-z])?"
    parts = re.findall(residue_pattern, base_pair_id)

    if len(parts) != 2:
        e = ValueError("Invalid interaction in the MC-Annotate file: %s" % base_pair_id)
        with log_to_exception(log, e):
            log.error("Regex matched the following parts: %s", parts)
        raise e

    # The two matches must account for the complete identifier.
    reassembled = "-".join(parts)
    if reassembled != base_pair_id:
        raise ValueError("Invalid interaction in the MC-Annotate file: %s" % base_pair_id)

    log.debug("Parts are '{}'".format(parts))

    first, second = parts
    (from_chain, from_base) = parse_chain_base(first.strip())
    (to_chain, to_base) = parse_chain_base(second.strip())
    return (from_chain, from_base, to_chain, to_base)
def _validate_pdb_to_stem(target_stem, chains, cg, elem_name):
    """
    Check that the stem built from the pdb chains matches target_stem.

    :param target_stem: A StemModel to which the pdb chain should be aligned
    :param chains: A dict {chain_id: Chain}
    :param cg: The original coarse-grained representation of the pdb chains
    :param elem_name: The elem_name in cg.
    :returns: True. Twist deviations above 0.01 only trigger warnings.
    :raises AssertionError: If the start or the end of the two stems are
                            0.1 or more apart.
    """
    try:
        pdb_stem = _define_to_stem_model(cg, chains, elem_name)
    except Exception as e:
        # Attach the residue ids of all chains to the exception.
        with log_to_exception(log, e):
            for chain in chains.values():
                log.error([r.id for r in chain.get_residues()])
        raise

    d_start = ftuv.magnitude(pdb_stem.mids[0] - target_stem.mids[0])
    d_end = ftuv.magnitude(pdb_stem.mids[1] - target_stem.mids[1])
    assert d_start < 0.1, "Distance between stem starts {} and {} is too big: {}".format(
        pdb_stem.mids[0], target_stem.mids[0], d_start)
    # The end of the stem is validated against d_end, not d_start.
    assert d_end < 0.1, "Distance between stem ends {} and {} is too big: {}".format(
        pdb_stem.mids[1], target_stem.mids[1], d_end)

    tw1_polar_pdb = ftuv.spherical_cartesian_to_polar(pdb_stem.twists[0])
    tw1_polar_target = ftuv.spherical_cartesian_to_polar(target_stem.twists[0])
    d_twist_u = abs(tw1_polar_pdb[1] - tw1_polar_target[1])
    d_twist_v = abs(tw1_polar_pdb[2] - tw1_polar_target[2])
    if d_twist_u > 0.01:
        log.warning("Deviation of twist angle u too big for %s: %s",
                    elem_name, d_twist_u)
    if d_twist_v > 0.01:
        log.warning("Deviation of twist angle v too big for %s: %s",
                    elem_name, d_twist_v)
    return True
def _getitem(self, key, include_missing=False, show_modifications=False):
    """
    Look up a nucleotide (or a slice) of the sequence.

    :param key: An int (converted with to_0_based before indexing), a
                fgr.RESID, or a slice (delegated to self._getslice).
    :param include_missing: If False, a RESID that is only found in
                self._missing_nts raises an IndexError.
    :param show_modifications: If True, return the entry of
                self._modifications for the residue, if there is one.
    :raises IndexError: If a RESID is not part of the RNA, or is a missing
                residue while include_missing is False.
    :raises TypeError: For any other key type.
    """
    log.debug("_getitem called for %s, include_missing=%s, show_modifications=%s",
              key, include_missing, show_modifications)
    if isinstance(key, int):
        key = to_0_based(key)
        if show_modifications and self._seqids[key] in self._modifications:
            return self._modifications[self._seqids[key]]
        return self._seq[key]
    elif isinstance(key, fgr.RESID):
        try:
            i = self._seqids.index(key)
        except ValueError:
            if key in self._missing_nts:
                nt = self._missing_nts[key]
                # Kept as an equality test so that only False-equal values
                # reject the look-up, exactly as before.
                if include_missing == False:
                    # The pieces of the message are separated by spaces so
                    # that the sentences do not run into each other.
                    raise IndexError(
                        "No structure available for nucleotide '{}'. "
                        "For look-up including missing residues, use "
                        "`.with_missing[key]`".format(key))
                if show_modifications and key in self._modifications:
                    return self._modifications[key]
                return nt
            error = IndexError(
                "Nucleotide {} is not part of this RNA".format(key))
            with log_to_exception(log, error):
                log.error("self._missing_nts = %s", self._missing_nts)
            raise error
        else:
            if show_modifications and key in self._modifications:
                return self._modifications[key]
            return self._seq[i]
    elif isinstance(key, slice):
        return self._getslice(key, include_missing, show_modifications)
    else:
        raise TypeError("Wrong index type: {}".format(type(key).__name__))
def cg_stem(self, dssr_stem):
    """
    Return the name of the stem in the CoarseGrainRNA that corresponds to
    the stem with the given index in the DSSR annotation.

    :param dssr_stem: INT the stem in the DSSR Annotation.
    :raises DSSRLookupError: If the DSSR data holds no stems or no stem
                             with this index.
    :raises WrongChain: If a paired residue is on a chain that is not in
                        self._cg.chains.
    :raises RuntimeError: If no cg stem matches the DSSR stem.
    """
    log.debug("Mapping DSSR stem %s to forgi", dssr_stem)
    if "stems" not in self._dssr:
        raise DSSRLookupError("The DSSR object does not contain any stem!")

    # First DSSR stem record with the requested index.
    stem_obj = next((candidate for candidate in self._dssr["stems"]
                     if candidate["index"] == dssr_stem), None)
    if stem_obj is None:
        raise DSSRLookupError("No stem with index {}".format(dssr_stem))
    log.debug("Found stem %s&%s", stem_obj["strand1"], stem_obj["strand2"])

    # Count the cg elements hit by the base-pairs, to see whether the
    # DSSR stem maps to more than one cg element.
    cg_stems = Counter()
    for pair in stem_obj["pairs"]:
        res1 = dssr_to_pdb_resid(pair["nt1"])
        res2 = dssr_to_pdb_resid(pair["nt2"])
        log.debug("Contains pair %s-%s", res1, res2)
        if self._cg.chains and (res1.chain not in self._cg.chains
                                or res2.chain not in self._cg.chains):
            e = WrongChain()
            with log_to_exception(log, e):
                log.error("Wrong chain: res1={}, res2={}, cg.chains={}".format(
                    res1, res2, self._cg.chains))
            raise e
        i1 = self._cg.seq.to_integer(res1)
        i2 = self._cg.seq.to_integer(res2)
        for node in self._cg.nucleotides_to_elements([i1, i2]):
            cg_stems[node] += 1

    if not cg_stems:
        raise RuntimeError(
            "No stem matching dssr_stem {}.".format(dssr_stem))

    most_common = cg_stems.most_common()
    if len(most_common) > 1:
        # Build a human-readable description of the involved interior
        # loops: sequence, dot-bracket string and a row of '^' markers
        # below the nucleotides of the loop.
        extra_info = ""
        for d in cg_stems:
            if d[0] != "i":
                continue
            extra_info += "\n{} is {}:".format(d, self._cg.get_define_seq_str(d))
            extra_info += "\n\t" + self._cg.seq + "\n\t" + \
                self._cg.to_dotbracket_string() + "\n\t"
            resnums = list(self._cg.define_residue_num_iterator(d))
            extra_info += "".join(
                "^" if pos in resnums else " "
                for pos in range(1, len(self._cg.seq) + 1))
        warnings.warn("dssr_stem {} maps to more than one cg element: {} {}".format(
            dssr_stem, list(cg_stems.keys()), extra_info))

    # The most frequently hit element that is a stem wins.
    for elem, _count in most_common:
        if elem[0] == "s":
            return elem
    raise RuntimeError("No stem matching dssr_stem {}, only single stranded region: {}.".format(
        dssr_stem, list(cg_stems.keys())))
def read_stats_file(filename):
    """
    Open the stats file at filename and return what parse_stats_file
    produces for it.

    :param filename: Path of the stats file.
    :raises Exception: Whatever parse_stats_file raises; the filename is
                       attached to the exception via log_to_exception.
    """
    log.info("Reading stats-file %s", filename)
    with open(filename) as stats_handle:
        try:
            parsed = parse_stats_file(stats_handle)
        except Exception as err:
            with log_to_exception(log, err):
                log.error("Failed to parse file %s", filename)
            raise
        return parsed
def parse_stats_file(file_handle):
    """
    Parse the lines of a stats file into a dictionary of stats.

    :param file_handle: An iterable of lines. Everything after a '#' and
                        empty lines are ignored.
    :returns: A dict with the keys "stem", "angle", "loop", "3prime" and
              "5prime", each holding a defaultdict(list). Stem and loop
              type stats are keyed by their bp_length, angle stats by
              (dim1, dim2, ang_type).
    :raises ValueError: If a line starts with an unknown keyword.
    """
    stats = {
        kind: defaultdict(list)
        for kind in ("stem", "angle", "loop", "3prime", "5prime")
    }
    for raw_line in file_handle:
        # Drop surrounding whitespace, then everything after a '#'.
        line = raw_line.strip().split('#')[0]
        if not line:
            continue

        if line.startswith("stem"):
            stem_stat = ftmstats.StemStat(line)
            stats["stem"][stem_stat.bp_length].append(stem_stat)
            continue

        if line.startswith(("angle", "open", "pseudo")):
            angle_stat = ftmstats.AngleStat()
            try:
                angle_stat.parse_line(line)
            except Exception as e:
                with log_to_exception(log, e):
                    log.error(
                        "Could not parse file due to error parsing line '{}'".
                        format(line))
                raise
            if len(angle_stat.define) > 0 and angle_stat.define[0] == 1:
                # An angle at the beginning of a structure.
                # I guess this should never happen, if the stats do not
                # stem from faulty bulge graphs.
                log.error(
                    "Ignoring angle stat {} because it is at the beginning of a structure."
                    " Does the stat come from a faulty BulgeGraph?".format(
                        angle_stat.pdb_name))
                continue
            angle_stat.ang_type = patch_angtype(angle_stat.ang_type)
            log.debug(
                "Reading angle_stat with dimensions %s and %s, and type %s. With define %s",
                angle_stat.dim1, angle_stat.dim2, angle_stat.ang_type,
                angle_stat.define)
            angle_key = (angle_stat.dim1, angle_stat.dim2, angle_stat.ang_type)
            stats["angle"][angle_key].append(angle_stat)
            # Adding the reverse does not work as intended and produces a
            # lot of structures that do not fulfill the constraint energy.
            # Note that CoarseGrainRNA.get_stats extracts two angle stats
            # per angle.
            continue

        # Everything else has to be a loop-like stat.
        key = line.split()[0]
        if key not in ["3prime", "5prime", "loop"]:
            raise ValueError(
                "Illegal line in stats file: '{}'".format(line))
        stat = ftmstats.LoopStat(line)
        stats[key][stat.bp_length].append(stat)
    return stats
def _safe_resid_from_chain_res(chain, residue):
    """
    Build a resid from "chain:residue" via fgr.resid_from_str.

    :param chain: The chain part of the resid string.
    :param residue: The residue part of the resid string.
    :returns: The result of fgr.resid_from_str, or None (after a warning)
              if conversion fails and residue is not purely digits.
    :raises ValueError: If conversion fails although residue consists of
                        digits only.
    """
    try:
        resid_string = str("{}:{}".format(chain, residue))
        return fgr.resid_from_str(resid_string)
    except ValueError as e:
        if not residue.isdigit():
            # A non-numeric residue is tolerated with a warning.
            warnings.warn("Illegal residue number: '{}'.".format(residue))
            return None
        with log_to_exception(log, e):
            log.error("Chain is '{}', res is '{}'".format(chain, residue))
        raise
def get_dotplot(lines):
    """
    Build a bpseq string and the list of seq_ids from MC-Annotate output.

    Only Ww/Ww pairs of A-U, U-A, C-G, G-C and the wobble pairs Ws/Ww U-G
    and Ww/Ws G-U are used. A pair involving an already paired residue
    (base-triple) is ignored with a warning.

    :param lines: The lines handed to iterate_over_residue_list and
                  iterate_over_interactions.
    :returns: A tuple (bpseq_str, seq_ids).
    """
    residues = []
    residue_types = []
    # Position of the pairing partner in `residues`; -1 means unpaired.
    bps = defaultdict(lambda: -1)

    for line in iterate_over_residue_list(lines):
        parts = line.split(' ')
        residues.append(parse_chain_base(parts[0]))  # A tuple chain, id
        residue_types.append(parts[2])

    paired = set()
    for line in iterate_over_interactions(lines):
        parts = line.split(' ')
        watson_crick = 'Ww/Ww' in line and any(
            pair in line for pair in ('A-U', 'U-A', 'C-G', 'G-C'))
        wobble = (('Ws/Ww' in line and 'U-G' in line)
                  or ('Ww/Ws' in line and 'G-U' in line))
        if not (watson_crick or wobble):
            continue

        chain1, base1, chain2, base2 = parse_base_pair_id(parts[0])
        res1 = (chain1, base1)
        res2 = (chain2, base2)

        if res1 in paired or res2 in paired:
            if log.isEnabledFor(logging.WARNING):
                if res1 in bps:
                    existing = "{} - {}".format(res1, residues[bps[res1]])
                else:
                    existing = "{} - {}".format(res2, residues[bps[res2]])
                log.warning(
                    "Base-triple encountered: Ignoring basepair %s - %s, because basepair %s exists",
                    res1, res2, existing)
            continue

        paired.update((res1, res2))
        try:
            bps[res1] = residues.index(res2)
            bps[res2] = residues.index(res1)
        except ValueError as e:
            with log_to_exception(log, e):
                log.error("bps = %s, residues = %s, res1 = %s, res2 = %s",
                          bps, residues, res1, res2)
            raise

    # One bpseq line per residue: 1-based index, type, 1-based partner
    # (0 for unpaired residues).
    bpseq_str = "".join(
        "%d %s %s\n" % (i + 1, res_type, bps[residues[i]] + 1)
        for i, res_type in enumerate(residue_types))
    seq_ids = _seqids_from_residue_map(residues)
    return bpseq_str, seq_ids
def raise_error_contextmngr2(value):
    """
    Raise a ValueError from inside a log_to_exception context.

    One message per log level (debug to critical) and a final info
    message are emitted on the logger "main.inside_ctxt2" inside the
    context before the exception is raised.

    :param value: The value interpolated into the log messages.
    :raises ValueError: Always.
    """
    log = logging.getLogger("main.inside_ctxt2")
    e = ValueError("Another ValueError")
    log.info("Before with-context. This is logged directly")
    with logging_exceptions.log_to_exception(log, e):
        messages = (
            (log.debug, "This is DEBUG ... %s"),
            (log.info, "This is an INFO ... %s"),
            (log.warning, "This is a WARNING ... %s"),
            (log.error, "This is an ERROR ... %s"),
            (log.critical, "This is CRITICAL ... %s"),
        )
        for emit, message in messages:
            emit(message, value)
        log.info("Raising inside with context")
        raise e
def _get_fragment(self, stat, sm):
    """
    Return the pdb fragment for a stat, using the stored copy in
    self.LIBRARY_DIRECTORY if it can be loaded and creating (and storing)
    it otherwise.

    :param stat: The stat; its pdb_name and define identify the fragment.
    :param sm: Passed on to self._get_source_cg_and_chain.
    :returns: A tuple (cg, elem, fragment), where fragment is a dict
              {chain_id: chain}.
    :raises ValueError: If stat.define differs from the define of the
                        element in the cg.
    """
    key = stat.pdb_name + "__def_" + "-".join(map(str, stat.define))
    new_fragment = False
    try:
        fragment, _, _ = ftup.get_all_chains(
            op.join(self.LIBRARY_DIRECTORY, key[2:4], key + ".cif"),
            no_annotation=True)
    except Exception:
        # Deliberate best-effort: any failure to load the stored fragment
        # means that it is extracted from the source structure instead.
        cg, chains = self._get_source_cg_and_chain(stat, sm)
        new_fragment = True
    else:
        fragment = {c.id: c for c in fragment}
        log.debug("Used stored fragment for %s", key)
        pdb_basename = stat.pdb_name.split(":")[0]
        cg_filename = op.expanduser(
            op.join(self.cg_library_path, pdb_basename + ".cg"))
        cg = self.get_cg(cg_filename)  # The cg with the template
    try:
        elem = cg.get_node_from_residue_num(stat.define[0])
    except Exception:
        log.error("stat %s with define %s", stat, stat.define)
        raise
    if stat.define != cg.defines[elem]:
        err = ValueError(
            "The CG files where the stats where extracted and "
            "the cg file used for reconstruction are not consistent!")
        with log_to_exception(log, err):
            log.error("%s != %s for element %s (%s)", stat.define,
                      cg.defines[elem], elem, stat.pdb_name)
        raise err
    if new_fragment:
        fragment = ftup.extract_subchains_from_seq_ids(
            chains,
            cg.define_residue_num_iterator(elem, seq_ids=True,
                                           adjacent=(elem[0] != "s")))
        if self.LIBRARY_DIRECTORY is not None:
            log.debug("Storing newly-created fragment for %s", key)
            # distutils was removed from the standard library in
            # Python 3.12, so the directory is created with os.makedirs.
            import os
            fragment_dir = op.join(self.LIBRARY_DIRECTORY, key[2:4])
            try:
                os.makedirs(fragment_dir)
            except OSError:
                # An already existing directory is fine.
                if not os.path.isdir(fragment_dir):
                    raise
            ftup.output_multiple_chains(
                fragment.values(),
                op.join(fragment_dir, key + ".cif"),
                "cif")
    return cg, elem, fragment
def _enumerate_background_geometries(all_cgs, cutoff_dist, aminor_geometries):
    """
    Collect the geometries of all loop-stem pairs within cutoff_dist that
    are not contained in aminor_geometries.

    Stems, incomplete elements and interacting elements are skipped, as
    are loops that are adjacent to the stem.

    :param all_cgs: A dictionary {PDBID: [ cg1, cg2, ...]}
    :param cutoff_dist: Maximal distance for a pair to be recorded.
    :param aminor_geometries: Container of geometries to exclude.
    :returns: A set of AMGeometry objects.
    """
    non_ame_geometries = set()
    for _pdb_id, curr_cgs in all_cgs.items():
        for cg in curr_cgs:
            try:
                for loop in cg.defines:
                    is_stem = loop[0] == "s"
                    if is_stem:
                        continue
                    if loop in cg.incomplete_elements or loop in cg.interacting_elements:
                        continue
                    for stem in cg.stem_iterator():
                        if loop in cg.edges[stem]:
                            continue
                        if stem in cg.incomplete_elements or stem in cg.interacting_elements:
                            continue
                        dist, angle1, angle2 = ftca.get_relative_orientation(
                            cg, loop, stem)
                        flexibility = (ftca.get_loop_flexibility(cg, loop)
                                       if loop[0] == "i" else 1)
                        # Skip undefined orientations and distant pairs.
                        if np.isnan(dist + angle1 + angle2) or not (dist <= cutoff_dist):
                            continue
                        loop_sequence = "&".join(cg.get_define_seq_str(loop))
                        geometry = AMGeometry(cg.name, loop, stem, dist,
                                              angle1, angle2, loop_sequence,
                                              1000, "no_interaction",
                                              flexibility)
                        if geometry in aminor_geometries:
                            log.info("Geometry %s is in aminor_geometries",
                                     geometry)
                        else:
                            non_ame_geometries.add(geometry)
            except BaseException as e:
                with log_to_exception(log, e):
                    log.error("An Error occurred during processing of cg: %s",
                              cg.name)
                raise
    log.error("%s non_ame geometries found", len(non_ame_geometries))
    return non_ame_geometries
def _enumerate_background_geometries(all_cgs, cutoff_dist, aminor_geometries):
    """
    Gather every loop-stem geometry within cutoff_dist that is not part
    of aminor_geometries.

    Loops that are stems, incomplete, interacting or adjacent to the stem
    are ignored, as are incomplete or interacting stems.

    :param all_cgs: A dictionary {PDBID: [ cg1, cg2, ...]}
    :param cutoff_dist: Maximal distance for a pair to be recorded.
    :param aminor_geometries: Container of geometries to exclude.
    :returns: A set of AMGeometry objects.
    """
    non_ame_geometries = set()
    for _pdb_id, curr_cgs in all_cgs.items():
        for cg in curr_cgs:
            try:
                for loop in cg.defines:
                    if loop[0] == "s":
                        continue
                    if loop in cg.incomplete_elements or loop in cg.interacting_elements:
                        continue
                    for stem in cg.stem_iterator():
                        if loop in cg.edges[stem]:
                            continue
                        if stem in cg.incomplete_elements or stem in cg.interacting_elements:
                            continue
                        orientation = ftca.get_relative_orientation(
                            cg, loop, stem)
                        dist, angle1, angle2 = orientation
                        if loop[0] == "i":
                            flexibility = ftca.get_loop_flexibility(cg, loop)
                        else:
                            flexibility = 1
                        # Only defined orientations within the cutoff count.
                        usable = (not np.isnan(dist + angle1 + angle2)
                                  and dist <= cutoff_dist)
                        if not usable:
                            continue
                        geometry = AMGeometry(
                            cg.name, loop, stem, dist, angle1, angle2,
                            "&".join(cg.get_define_seq_str(loop)), 1000,
                            "no_interaction", flexibility)
                        if geometry in aminor_geometries:
                            log.info("Geometry %s is in aminor_geometries",
                                     geometry)
                        else:
                            non_ame_geometries.add(geometry)
            except BaseException as e:
                with log_to_exception(log, e):
                    log.error("An Error occurred during processing of cg: %s",
                              cg.name)
                raise
    log.error("%s non_ame geometries found", len(non_ame_geometries))
    return non_ame_geometries
def split_at_cofold_cutpoints(bg, cutpoints):
    """
    Multiple sequences should not be connected along the backbone.

    We have constructed the bulge graph, as if they were connected along
    the backbone, so now we have to split it.

    :param bg: The bulge graph, modified in place.
    :param cutpoints: Residue numbers; the backbone is cut between each
                      number and its successor.
    :raises GraphConstructionError: If a sequence is not connected to the
                      rest by any base-pair.
    """
    for splitpoint in cutpoints:
        element_left = bg.get_node_from_residue_num(splitpoint)
        element_right = bg.get_node_from_residue_num(splitpoint + 1)
        if element_left[0] in "ft" or element_right[0] in "ft":
            # A 't' element on the left followed by a different kind of
            # element, or an 'f' element on the right preceded by a
            # different kind of element, means the backbone is already cut.
            if element_left[0] == "t" and element_right[0] != "t":
                continue  # Splitpoint already implemented
            elif element_right[0] == "f" and element_left[0] != "f":
                continue  # Splitpoint already implemented
            else:
                # No cofold structure. First sequence is disconnected from rest
                e = GraphConstructionError(
                    "Cannot create BulgeGraph. Found two sequences not "
                    "connected by any base-pair.")
                with log_to_exception(log, e):
                    log.error("Trying to split between %s and %s",
                              element_left, element_right)
                raise e
        elif element_left[0] == "i" or element_right[0] == "i":
            _split_interior_loop(bg, splitpoint, element_left, element_right)
        elif element_left != element_right:
            _split_between_elements(bg, splitpoint, element_left,
                                    element_right)
        elif element_left[0] == "s":
            _split_inside_stem(bg, splitpoint, element_left)
        else:
            _split_inside_loop(bg, splitpoint, element_left)
        # Reset the cached node lookup after the graph was modified.
        bg._node_to_resnum = {}

    if not _is_connected(bg):
        raise GraphConstructionError(
            "Cannot create BulgeGraph. Found two sequences not connected by any "
            " base-pair.")
def _get_source_cg_and_chain(self, stat, sm):
    """
    Load the fragment defined in the stat from the fragment library as pdb and cg.

    :param stat: The forgi.threedee.model.stats.StemStat or ftms.AngleStat
                 or ftms.LoopStat object.
    :param sm: The SpatialModel to reconstruct. Used, if it contains stats
               not sampled but loaded directly.
    :returns: A tuple (cg, chains).
    :raises Exception: If neither the .pdb nor the .cif file, or the cg
                       file, can be opened.
    """
    stat_name = stat.pdb_name
    if stat_name == sm.bg.name and sm.bg.chains:
        # The stat comes from the structure that is being reconstructed.
        return sm.bg, sm.bg.chains

    pdb_basename = stat_name.split(":")[0]
    pdb_filename = op.expanduser(
        op.join(self.pdb_library_path,
                "_".join(pdb_basename.split("_")[:-1]) + ".pdb"))
    cg_filename = op.expanduser(
        op.join(self.cg_library_path, pdb_basename + ".cg"))

    # Make sure the files exist.
    try:
        try:
            with open(pdb_filename):
                pass
        except IOError:
            # Fall back to the mmCIF file. The ".pdb" suffix is cut off by
            # length, because str.rstrip removes a set of characters and
            # would also eat trailing 'p', 'd', 'b' or '.' of the name.
            pdb_filename = pdb_filename[:-len(".pdb")] + ".cif"
            with open(pdb_filename):
                pass
        with open(cg_filename):
            pass
    except Exception as e:
        with log_to_exception(log, e):
            log.error("Failed to open files for stat %s", stat.pdb_name)
        raise

    log.debug("Opening cg-file %s to extract stat %s", cg_filename,
              stat.pdb_name)
    cg = self.get_cg(cg_filename)  # The cg with the template
    chains = self.get_pdb(pdb_filename, store=self.store)
    return cg, chains
def split_at_cofold_cutpoints(bg, cutpoints):
    """
    Multiple sequences should not be connected along the backbone.

    We have constructed the bulge graph, as if they were connected along
    the backbone, so now we have to split it.

    :param bg: The bulge graph, modified in place.
    :param cutpoints: Residue numbers; the backbone is cut between each
                      number and its successor.
    :raises GraphConstructionError: If a sequence is not connected to the
                      rest by any base-pair.
    """
    for splitpoint in cutpoints:
        element_left = bg.get_node_from_residue_num(splitpoint)
        element_right = bg.get_node_from_residue_num(splitpoint + 1)
        if element_left[0] in "ft" or element_right[0] in "ft":
            # The backbone is already cut if a 't' element on the left is
            # followed by another kind of element, or if an 'f' element on
            # the right is preceded by another kind of element.
            if element_left[0] == "t" and element_right[0] != "t":
                continue  # Splitpoint already implemented
            elif element_right[0] == "f" and element_left[0] != "f":
                continue  # Splitpoint already implemented
            else:
                # No cofold structure. First sequence is disconnected from rest
                e = GraphConstructionError("Cannot create BulgeGraph. Found two sequences not "
                                           "connected by any base-pair.")
                with log_to_exception(log, e):
                    log.error("Trying to split between %s and %s",
                              element_left, element_right)
                raise e
        elif element_left[0] == "i" or element_right[0] == "i":
            _split_interior_loop(bg, splitpoint, element_left, element_right)
        elif element_left != element_right:
            _split_between_elements(
                bg, splitpoint, element_left, element_right)
        elif element_left[0] == "s":
            _split_inside_stem(bg, splitpoint, element_left)
        else:
            _split_inside_loop(bg, splitpoint, element_left)
        # Reset the cached node lookup after the graph was modified.
        bg._node_to_resnum = {}

    if not _is_connected(bg):
        raise GraphConstructionError("Cannot create BulgeGraph. Found two sequences not connected by any "
                                     " base-pair.")
def output_multiple_chains(chains, filename):
    '''
    Write several chains into one PDB file, leaving out hydrogen atoms.

    :param chains: An iterable of Bio.PDB.Chain to dump.
    :param filename: The place to dump it.
    '''
    class HSelect(bpdb.Select):
        def accept_atom(self, atom):
            # Atoms with an 'H' anywhere in their name are not written.
            has_h = atom.name.find('H') >= 0
            return not has_h

    model = bpdb.Model.Model(' ')
    structure = bpdb.Structure.Structure(' ')
    for chain in chains:
        log.debug("Adding chain %s with %s residues", chain.id, len(chain))
        model.add(chain)
    structure.add(model)

    writer = bpdb.PDBIO()
    writer.set_structure(structure)
    try:
        writer.save(filename, HSelect())
    except Exception as e:
        # Attach the residue and chain ids to the exception, then re-raise.
        with log_to_exception(log, e):
            log.error("Could not output PDB with residues:")
            log.error(
                [r.get_id() for r in bpdb.Selection.unfold_entities(model, 'R')])
            log.error(" in chains:")
            log.error(
                [c.get_id() for c in bpdb.Selection.unfold_entities(model, 'C')])
        raise
key = {"name": cg.name, "filename": filenames[i]} if args.per_ml: new_data = describe_ml_segments(cg) for i in range(len(new_data["segment"])): for k, v in key.items(): data[k].append(v) for k, v in new_data.items(): data[k].append(v[i]) else: new_data = describe_rna(cg, file_num, dist_pairs, angle_pairs) for k, v in key.items(): data[k].append(v) for k, v in new_data.items(): data[k].append(v) except Exception as e: with log_to_exception(log, e): log.error( "Error occurred during describing %d%s cg %s", file_num, { 1: "st", 2: "nd", 3: "rd" }.get(file_num % 10 * (file_num % 100 not in [11, 12, 13]), "th"), cg.name) raise if args.keys: allowed_keys = args.keys.split(",") + ["name"] for key in list(data.keys()): if key not in allowed_keys: del data[key] df = pd.DataFrame(data) df.set_index("name", append=True, inplace=True)
def load_rna(filename, rna_type="any", allow_many=True, pdb_chain=None,
             pdb_remove_pk=True, pdb_dotbracket="",
             dissolve_length_one_stems=True, pdb_annotation_tool=None,
             pdb_allow_www_query=False):
    """
    Load RNA structures from a file (or from a dotbracket string).

    :param filename: The file to load. If it only consists of the
                     characters ``.()[]{}&`` and rna_type is "any", it is
                     treated as a dotbracket string instead.
    :param rna_type: One of "any", "only_cg", "3d" and "pdb"

                     *  "any": Return either BulgeGraph or CoarseGrainRNA
                        object, depending on the input format
                     *  "only_cg": Only accept cg-files.
                     *  "3d": Return CoarseGrainRNA objects, if the file
                        contains 3D information, raise an error otherwise
                     *  "pdb": only accept pdb files
    :param allow_many: If True, return a list. If False, return a single
                       object or raise a WrongFileFormat, if more than
                       one RNA is present.
    :param pdb_chain: Extract the given chain from the file.
                      Only applicable if filename corresponds to a pdb file
    :param pdb_remove_pk: Detect pseudoknot-free structures from the pdb.
    :param pdb_dotbracket: Only applicable, if filename corresponds to a
                           pdb file and pdb_chain is given.
    :param dissolve_length_one_stems: Ignored if input is in forgi bg/cg
                                      format.
    :param pdb_annotation_tool: Use DSSR, MC-Annotate or forgi heuristic
                                for basepair-detection in PDB/MMCIF files
                                (None for auto-detect). Ignored for other
                                file-types.
    :param pdb_allow_www_query: Passed to CoarseGrainRNA.from_pdb as
                                query_PDBeChem.
    :returns: A list of RNAs or a single RNA
    :raises WrongFileFormat: If the file type does not fit rna_type, or
                             more than one RNA is found while allow_many
                             is False.
    """
    # Is filename a dotbracket string and not a filename?
    looks_like_dotbracket = all(c in ".()[]{}&" for c in filename)
    if looks_like_dotbracket:
        # A dotbracket-string was provided via the commandline
        if rna_type != "any":
            warnings.warn(
                "Cannot treat '{}' as dotbracket string, since we need a sequence. "
                "Trying to treat it as a filename instead...".format(filename))
        else:
            log.info("Assuming RNA %s is a dotbracketstring and not a file.",
                     filename)
            bg = fgb.BulgeGraph.from_dotbracket(
                filename, dissolve_length_one_stems=dissolve_length_one_stems)
            return [bg] if allow_many else bg

    with open(filename) as rnafile:
        filetype = sniff_filetype(rnafile)

    if rna_type == "pdb" and filetype not in ["pdb", "cif"]:
        raise WrongFileFormat(
            "Only PDB files (*.pdb/.cif) are accepted, but file {} has type {}."
            .format(filename, filetype))
    if rna_type == "only_cg" and filetype != "forgi":
        raise WrongFileFormat(
            "Only forgi cg files are accepted, but file {} has type {}.".
            format(filename, filetype))

    if filetype == "forgi":
        cg = ftmc.CoarseGrainRNA.from_bg_file(filename)
        needs_coords = rna_type in ["3d", "only_cg"]
        if needs_coords and not cg.coords.is_filled:  # pylint: disable=E1101
            raise WrongFileFormat(
                "File {} does not contain all 3D coordinates!".format(
                    filename))
        return [cg] if allow_many else cg

    if filetype == "pdb" or filetype == "cif":
        # Keyword arguments shared by both ways of calling from_pdb.
        common_kwargs = dict(
            dissolve_length_one_stems=dissolve_length_one_stems,
            filetype=filetype,
            annotation_tool=pdb_annotation_tool,
            query_PDBeChem=pdb_allow_www_query)
        if pdb_chain:
            cgs = ftmc.CoarseGrainRNA.from_pdb(
                filename,
                load_chains=pdb_chain,
                remove_pseudoknots=pdb_remove_pk and not pdb_dotbracket,
                secondary_structure=pdb_dotbracket,
                **common_kwargs)
        else:
            if pdb_dotbracket:
                raise ValueError(
                    "pdb_dotbracket requires a chain to be given to avoid ambiguity."
                )
            cgs = ftmc.CoarseGrainRNA.from_pdb(
                filename,
                remove_pseudoknots=pdb_remove_pk,
                **common_kwargs)
        if allow_many:
            return cgs
        if len(cgs) > 1:
            raise WrongFileFormat(
                "More than one connected RNA component in pdb file {}: {}".
                format(filename, [cg.name for cg in cgs]))
        return cgs[0]

    if filetype == "bpseq":
        if rna_type == "3d":
            raise WrongFileFormat(
                "bpseq file {} is not supported. We need 3D coordinates!".
                format(filename))
        with open(filename, 'r') as f:
            text = f.read()
        try:
            int(text[0])
        except ValueError:
            # The text does not start with a number: continue behind the
            # first newline that is followed by "1 ".
            i = text.find("\n1 ")
            text = text[i + 1:]
        bg = ftmc.CoarseGrainRNA.from_bpseq_str(
            text, dissolve_length_one_stems=dissolve_length_one_stems)
        return [bg] if allow_many else bg

    if filetype == "fasta" or filetype == "other":
        if rna_type == "3d":
            raise WrongFileFormat(
                "Fasta(like) file {} is not supported. We need 3D coordinates!"
                .format(filename))
        try:
            bgs = ftmc.CoarseGrainRNA.from_fasta(
                filename, dissolve_length_one_stems=dissolve_length_one_stems)
        except Exception as e:
            with log_to_exception(log, e):
                log.critical("Could not parse file %r.", filename)
                if filetype == "other":
                    log.critical(
                        "We assumed file %r to be some fasta-variant or dotbracket file, but an error occurred during parsing.",
                        filename)
            raise
        if allow_many:
            return bgs
        if len(bgs) > 1:
            raise WrongFileFormat(
                "More than one RNA found in fasta/ dotbracket file {}.".
                format(filename))
        return bgs[0]
def _parse(self, filepath):
    """
    Parse a tab-separated output file into a dictionary of columns.

    Lines starting with "#" are comments; the random seed, command and
    version comments are read into a local ``meta`` dict. The first other
    non-empty line is the header, all following lines are data rows.

    NOTE(review): the source of this method was whitespace-collapsed; the
    nesting of the per-field statements was reconstructed - confirm
    against the original file.

    :param filepath: The path of the file to parse.
    :returns: A dict mapping column names to lists of values, plus the
              keys "move_type", "accepted", "delta_E" and "stats_moved"
              derived from the last column.
    """
    # NOTE(review): meta is filled below but not returned or stored here.
    meta = {}
    with open(filepath) as file:
        headers = None
        data = None
        for line_no, line in enumerate(file):
            try:
                line = line.strip()
                if not line:
                    continue
                elif line.startswith("# Random Seed:"):
                    meta["seed"] = int(line.split()[-1])
                elif line.startswith("# Command"):
                    # The command is enclosed in backticks.
                    meta["command"] = line.split('`')[1]
                elif line.startswith("# Version"):
                    fields = line.split()
                    meta["ernwin_version"] = fields[3].rstrip(",")
                    meta["forgi_version"] = fields[5]
                elif line.startswith("#"):
                    continue
                elif headers is None:
                    # First non-comment line: the column headers.
                    headers = line.split("\t")
                    self._init_collector_lookup(headers)
                    # One list of values per column.
                    data = []
                    for i in range(len(headers)):
                        data.append([])
                else:
                    fields = line.split('\t')
                    for i, field in enumerate(fields):
                        if i == 0:  # Step
                            data[i].append(int(field))
                        elif i == 1:  # Sampling_Energy
                            data[i].append(float(field))
                        # Other columns are parsed by their collector;
                        # presumably the collector of the first two
                        # columns is None - TODO confirm.
                        cls = self._collectors[i]
                        if cls == "Sampling Move":
                            data[i].append(field)
                        elif cls is not None:
                            data[i].append(cls.parse_value(field))
            except Exception as e:
                # Attach the offending line to the exception and re-raise.
                with log_to_exception(log, e):
                    log.error(
                        "Exception occurred during parsing of line %d '%s'",
                        line_no, line)
                raise
    data_dic = {}
    for i, header in enumerate(headers):
        if data[i]:
            if isinstance(data[i][0], tuple):
                # Tuple values: the first item of the first tuple becomes
                # part of the key, the second item of each tuple the value.
                data_dic["{}_{}".format(
                    header, data[i][0][0])] = [x[1] for x in data[i]]
            else:
                data_dic[header] = data[i]
    data_dic["move_type"] = []
    data_dic["accepted"] = []
    data_dic["delta_E"] = []
    data_dic["stats_moved"] = []
    # Split the entries of the last column into type, field and the
    # part behind the last ";".
    for d in data[-1]:
        field, _, accepted = d.rpartition(";")
        typ, _, field = field.partition(":")
        data_dic["accepted"].append(accepted)
        if typ == "RE":
            data_dic["delta_E"].append(float("nan"))
            data_dic["move_type"].append("RE")
            data_dic["stats_moved"].append(float("nan"))
        else:
            self.update_data_move(data_dic, typ, field)
    return data_dic
def mend_breakpoints(chains, gap):
    """
    Mend gaps in the backbone of the chains using ModeRNA.

    :param chains: A dict {chain_id: chain}.
    :param gap: A list of res_ids, which can be moved to mend the gap.
                Each entry is indexed as a pair (g[0], g[1]).
    :returns: A dict {chain_id: chain} with the mended chains. If ModeRNA
              cannot be imported, chains is returned unchanged after a
              warning.
    """
    try:
        import moderna
    except ImportError:
        warnings.warn(
            "Cannot mend gaps in sequence, because ModeRNA is not installed!")
        return chains
    mod_models = {}
    with fus.make_temp_directory() as tmpdir:
        log.info("Writing chains %s", chains.values())
        for g in gap:
            if g[0].chain != g[1].chain:
                log.warning(
                    "Not mending gap between multiple chains: %s and %s",
                    g[0], g[1])
                continue
            if g[0].chain not in mod_models:
                # Load every chain only once into ModeRNA.
                try:
                    mod_models[g[0].chain] = moderna.load_model(
                        chains[g[0].chain], data_type="chain")
                except Exception as e:
                    with log_to_exception(log, e):
                        log.error("g is %s, g[0] is %s, g[0].chain is %s",
                                  g, g[0], g[0].chain)
                        log.error("chains is %s", chains)
                    raise
            moderna.fix_backbone(mod_models[g[0].chain],
                                 resid_to_moderna(g[0]),
                                 resid_to_moderna(g[1]))
        # Load back to Biopython
        mended_chains = {}
        for chain_id in chains.keys():
            if chain_id in mod_models:
                # Mod models are chain subclasses anyway
                mended_chains[chain_id] = mod_models[chain_id]
                # The dict is passed through a %s placeholder so that
                # logging can format the message.
                log.info("Mended: %s", mended_chains)
                mended_chains[chain_id].id = chain_id
            else:
                mended_chains[chain_id] = chains[chain_id]
        log.info("mended_chains: %s", mended_chains)

    # Moderna may replace modified residues with "UNK" for unknown or
    # otherwise change the code. We have to replace them back.
    for chain_id in chains:
        for res in mended_chains[chain_id]:
            changed = False
            for o_res in chains[chain_id]:
                if o_res.id[1:] == res.id[1:]:
                    log.debug("Changing Moderna residue %s to %s", res, o_res)
                    assert not changed  # Only one residue per number+icode
                    res.id = o_res.id
                    res.resname = o_res.resname
                    log.debug("Moderna residue now %s", res)
                    changed = True

    # Convert back from ModeRNA to Biopython
    # NOTE(review): chains that were not mended are the original objects
    # and pass through get_structure() here as well - confirm that they
    # provide it.
    out_chains = {}
    for k, v in mended_chains.items():
        s = v.get_structure()[0]
        log.error("%s, %s %s", k, s, s.child_dict)
        assert len(s.child_list) == 1
        out_chains[k] = s.child_list[0]
        out_chains[k].id = k
    return out_chains
def _fragment_strand_resids(chains_from, bp_start, bp_end, start_in_define,
                            end_in_define):
    """
    Collect the resids of one fragment strand, ordered from bp_start to bp_end.

    iter_resids_between is always called with the smaller resid first; if the
    strand runs from the larger to the smaller resid, the result is reversed.
    """
    chain = chains_from[bp_start.chain]
    if bp_start.resid < bp_end.resid:
        return list(iter_resids_between(chain, bp_start.resid, bp_end.resid,
                                        start_in_define, end_in_define))
    resids = list(iter_resids_between(chain, bp_end.resid, bp_start.resid,
                                      end_in_define, start_in_define))
    # The whole list has to be reversed. (For the first strand the code used
    # to call .reverse() on the first element of the list instead.)
    resids.reverse()
    return resids


def insert_element(cg_to, cg_from, elem_to, elem_from, chains_to, chains_from,
                   angle_type):
    '''
    Take an element (elem_from) from one dict of chains (chains_from, cg_from)
    and insert it on the new chain while aligning on the adjoining elements.

    The neighboring elements need to be present in chain_to in order
    for the next element to be aligned to their starting and ending
    positions.

    The dimensions and type of elem_to and elem_from need to be identical.

    This method aligns the flanking base pairs on both ends
    (except for 3' and 5' elements) of the fragment with the respective
    base-pairs in the stem-scaffold. This means that there will be
    equally big breaks in the chain on both sides of the fragment.

    :param cg_to: The coarse-grain representation of the target chain
    :param cg_from: The coarse-grain representation of the source chain
    :param elem_to: The element to replace
    :param elem_from: The source element
    :param chains_to: A dict chainid:chain. The chains to graft onto.
                      Modified in place: residues are added and missing
                      chains are created.
    :param chains_from: A dict chainid:chain. The chains to excise from
    :param angle_type: Only used in log messages by this function.
    :returns: a list of two-element lists of res_ids, describing the gaps
              to mend
    '''
    log.info("Inserting element %s", elem_to)
    assert elem_from[0] == elem_to[0], "{}[0]!={}[0]".format(
        elem_from, elem_to)

    # The define of the loop with adjacent nucleotides (if present) in both cgs
    define_a_to = cg_to.define_a(elem_to)
    define_a_from = cg_from.define_a(elem_from)
    assert len(define_a_to) == len(define_a_from)
    # For every position of define_a_from: is it part of the element itself
    # (True) or an adjacent nucleotide (False)?
    nt_in_define_from = [
        x in cg_from.defines[elem_from] for x in define_a_from
    ]

    # The defines translated to seq_ids.
    log.debug("Angle type is %s", angle_type)
    closing_bps_to = [cg_to.seq.to_resid(nt) for nt in define_a_to]
    closing_bps_from = [cg_from.seq.to_resid(nt) for nt in define_a_from]

    # Seq_ids of all nucleotides in the loop that will be inserted
    seq_ids_a_from = []
    for i in range(0, len(define_a_from), 2):
        seq_ids_a_from.extend(
            cg_from.seq.to_resid(nt)
            for nt in range(define_a_from[i], define_a_from[i + 1] + 1))
    log.debug("seqids_a from %s", seq_ids_a_from)

    # The loop fragment to insert in a dict {chain_id:chain}
    try:
        chains_from = ftup.extract_subchains_from_seq_ids(
            chains_from, seq_ids_a_from)
    except Exception as e:
        with log_to_exception(log, e):
            log.error(
                "Could not extract fragment %s from pdb: "
                " At least one of the seq_ids %s not found."
                " Chains are %s", elem_from, seq_ids_a_from,
                chains_from.keys())
        raise

    # A list of tuples (seq_id_from, seq_id_to) for the nucleotides
    # that will be used for alignment.
    log.debug("Closing_bps _from are %s", closing_bps_from)
    alignment_positions = []
    assert elem_from[0] != "s", "No stems allowed in insert_element"
    if elem_from[0] == "f":
        alignment_positions.append((closing_bps_from[1], closing_bps_to[1]))
    elif elem_from[0] == "t":
        alignment_positions.append((closing_bps_from[0], closing_bps_to[0]))
    else:
        # Both lists have the same length (see the assertion on define_a).
        alignment_positions.extend(zip(closing_bps_from, closing_bps_to))
    log.debug("Calling align_on_nucleotides for %s", elem_to)
    align_on_nucleotides(chains_from, chains_to, alignment_positions)

    # The defines and seq_ids WITHOUT adjacent elements
    define_to = cg_to.defines[elem_to]
    define_from = cg_from.defines[elem_from]
    # Set if a ModeRNA fragment was needed but could not be created. In that
    # case the length checks further down are skipped.
    no_moderna = False
    if len(define_from) != len(define_to):
        log.warning(
            "Inconsistent defines: {} and {} for {}. Using ModeRNA fragment instead."
            .format(define_from, define_to, elem_to))
        target_seqs = cg_to.get_define_seq_str(elem_to)  # One or two strands
        for i, target_seq in enumerate(target_seqs):
            strand_start = closing_bps_from[2 * i]
            strand_end = closing_bps_from[2 * i + 1]
            if strand_start.chain != strand_end.chain:
                raise NotImplementedError("TODO")
            try:
                mod_chain = use_moderna_fragment(
                    chains_from[strand_start.chain], target_seq,
                    strand_start, strand_end)
            except Exception:
                # Deliberately best-effort, but no longer silent and no
                # longer swallowing KeyboardInterrupt/SystemExit.
                log.warning("ModeRNA fragment could not be used for %s",
                            elem_to, exc_info=True)
                no_moderna = True
            else:
                # NOTE(review): the result is always stored under the chain
                # of the first nucleotide, also for the second strand, while
                # the input is taken from the strand's own chain -- confirm
                # this is intended if the strands are on different chains.
                chains_from[seq_ids_a_from[0].chain] = mod_chain
    elif cg_to.element_length(elem_to) != cg_from.element_length(elem_from):
        log.warning("%s not consistent with %s: Missing residues",
                    define_from, define_to)
        log.warning("%s has different len than %s for angle type %s",
                    define_from, define_to, angle_type)
        if define_to[1] - define_to[0] > define_from[1] - define_from[0]:
            # Apply an indel on the left side
            if closing_bps_from[0].chain != closing_bps_from[1].chain:
                raise NotImplementedError("TODO")
            target_seq = cg_to.get_define_seq_str(elem_to)[0]  # Forward strand
            try:
                mod_chain = use_moderna_fragment(
                    chains_from[closing_bps_from[0].chain], target_seq,
                    closing_bps_from[0], closing_bps_from[1])
            except Exception:
                log.warning("ModeRNA fragment could not be used for %s",
                            elem_to, exc_info=True)
                no_moderna = True
            else:
                chains_from[seq_ids_a_from[0].chain] = mod_chain
        else:
            raise NotImplementedError("TODO")

    # Target resids: one list per strand.
    seq_ids_to = []
    for i in range(0, len(define_to), 2):
        seq_ids_to.append([
            cg_to.seq.to_resid(nt)
            for nt in range(define_to[i], define_to[i + 1] + 1)
        ])

    # Fragment resids: one list per (non-empty) strand.
    seq_ids_from = []
    # Now append first strand to seq_ids_from
    assert closing_bps_from[0].chain == closing_bps_from[1].chain
    log.debug("nt_in_define=%s", nt_in_define_from)
    strand = _fragment_strand_resids(chains_from,
                                     closing_bps_from[0], closing_bps_from[1],
                                     nt_in_define_from[0],
                                     nt_in_define_from[1])
    if strand:
        seq_ids_from.append(strand)
    if len(closing_bps_from) > 2:
        assert closing_bps_from[2].chain == closing_bps_from[3].chain
        strand = _fragment_strand_resids(chains_from,
                                         closing_bps_from[2],
                                         closing_bps_from[3],
                                         nt_in_define_from[2],
                                         nt_in_define_from[3])
        if strand:
            seq_ids_from.append(strand)

    log.info("Fragment %s", seq_ids_from)
    log.info("Target %s", seq_ids_to)
    if not no_moderna:
        assert len(seq_ids_from[0]) == len(
            seq_ids_to[0]), "Unequal length for {}: {} {}".format(
                elem_to, seq_ids_from, seq_ids_to)
        if len(seq_ids_to) > 1:
            assert len(seq_ids_from[1]) == len(
                seq_ids_to[1]), "Unequal length for {}: {} {}".format(
                    elem_to, seq_ids_from, seq_ids_to)
    log.debug("Copying %s to %s for %s", seq_ids_from, seq_ids_to, elem_to)

    # Now copy the residues from the fragment chain to the scaffold chain.
    # lastres[a] remembers the last target resid that received a residue on
    # strand a, if the fragment ran out of residues before the target did.
    lastres = [None, None]
    for a, strand_to in enumerate(seq_ids_to):
        for i, resid_to in enumerate(strand_to):
            try:
                resid_from = seq_ids_from[a][i]
            except IndexError:
                lastres[a] = strand_to[i - 1]
                break
            residue = chains_from[resid_from.chain][resid_from.resid]
            # Change the resid to the target
            residue.parent = None
            residue.id = resid_to.resid
            if resid_to.chain not in chains_to:
                log.info("Adding chain with id %r for residue %r",
                         resid_to.chain, resid_to)
                chains_to[resid_to.chain] = bpdb.Chain.Chain(resid_to.chain)
            # Now, add the residue to the target chain
            chains_to[resid_to.chain].add(residue)

    # Now we need to mend gaps created by imperfect alignment.
    gaps_to_mend = []
    if elem_from[0] != "f":
        gap_start = cg_to.seq.to_resid(define_a_to[0])
        gap_end = cg_to.seq.to_resid(define_a_to[0] + 1)
        log.debug("To mend: %s %s ", gap_start, gap_end)
        gaps_to_mend.append([gap_start, gap_end])
        d = gap_length(chains_to, gap_start, gap_end)
        log.debug("Elem {}: dist {} - {} is {}".format(
            elem_to, define_a_to[0], define_a_to[0] + 1, d))
    if elem_from[0] != "t":
        if lastres[0] is not None:
            r = lastres[0]
        else:
            r = cg_to.seq.to_resid(define_a_to[1] - 1)
        gap_end = cg_to.seq.to_resid(define_a_to[1])
        gaps_to_mend.append([r, gap_end])
        d = gap_length(chains_to, r, gap_end)
        log.debug("Elem {}: dist {} - {} is {}".format(
            elem_to, define_a_to[1], define_a_to[1] - 1, d))
    if elem_from[0] == "i":
        # Interior loops have a second strand with two more gaps.
        gaps_to_mend.append([
            cg_to.seq.to_resid(define_a_to[2]),
            cg_to.seq.to_resid(define_a_to[2] + 1)
        ])
        if lastres[1] is not None:
            r = lastres[1]
        else:
            r = cg_to.seq.to_resid(define_a_to[3] - 1)
        gaps_to_mend.append([r, cg_to.seq.to_resid(define_a_to[3])])
    log.debug("To mend %s", gaps_to_mend)
    return gaps_to_mend
logging_exceptions.log_exception(e, logging.WARNING) try: raise_error_contextmngr(555) except Exception as e: logging_exceptions.log_exception(e) log = logging.getLogger("another.logger.name") log.info( "The following should log from the main module level (doesn't work with root logger)" ) helper_function(log) fltr = Filter1("CTMNGR2") log = logging.getLogger("main.inside_ctxt2") log.addFilter(fltr) log = logging.getLogger() log.handlers[0].addFilter(Filter1("RootHandler")) try: raise_error_contextmngr2(12345) except Exception as e: logging_exceptions.log_exception(e, with_stacktrace=False) log.info("Almost there") try: raise_error_contextmngr2(-1) except Exception as e: with logging_exceptions.log_to_exception(log, e): log.critical("using a BARE RAISE works as intended.") raise
def load_rna(filename, rna_type="any", allow_many=True, pdb_chain=None,
             pbd_remove_pk=True, pdb_dotbracket="",
             dissolve_length_one_stems=True):
    """
    Load one or more RNAs from a file or from a dotbracket string.

    :param filename: The file to load. If rna_type is "any" and the string
                     consists only of the characters ``.()[]{}&``, it is
                     treated as a dotbracket string instead of a filename.
    :param rna_type: One of "any", "cg", "only_cg", "3d" and "pdb"

                     *  "any": Return either BulgeGraph or CoarseGrainRNA
                        objects, depending on the input format
                     *  "cg":  Always convert to CoarseGrainRNA objects,
                        even if they have no 3D information
                     *  "only_cg": Only accept cg-files.
                     *  "3d":  Return CoarseGrainRNA objects, if the file
                        contains 3D information, raise an error otherwise
                     *  "pdb": only accept pdb files
    :param allow_many: If True, return a list. If False raise an error, if
                       more than one RNA is present.
    :param pdb_chain: Extract the given chain from the file.
                      Only applicable if filename corresponds to a pdb file
    :param pbd_remove_pk: Detect pseudoknot-free structures from the pdb.
                          (The keyword really is spelled ``pbd_remove_pk``;
                          the name is kept for backward compatibility.)
    :param pdb_dotbracket: Only applicable, if filename corresponds to a pdb
                           file and pdb_chain is given.
    :param dissolve_length_one_stems: Ignored if input is in forgi bg/cg
                                      format.

    :returns: A list of RNAs or a single RNA
    :raises WrongFileFormat: If the detected file type is not compatible with
                             rna_type, or if allow_many is False and the file
                             contains more than one RNA.
    :raises ValueError: If pdb_dotbracket is given without pdb_chain.
    """
    # Is filename a dotbracket string and not a filename?
    if all(c in ".()[]{}&" for c in filename):
        # A dotbracket-string was provided via the commandline
        if rna_type != "any":
            warnings.warn("Cannot treat '{}' as dotbracket string, since we need a sequence. "
                          "Trying to treat it as a filename instead...".format(filename))
        else:
            log.info("Assuming RNA %s is a dotbracketstring and not a file.", filename)
            bg = fgb.from_fasta_text(filename, dissolve_length_one_stems=dissolve_length_one_stems)
            if allow_many:
                return [bg]
            else:
                return bg

    with open(filename) as rnafile:
        filetype = sniff_filetype(rnafile)

    if rna_type == "pdb" and filetype != "pdb":
        raise WrongFileFormat("Only PDB files are accepted, but file {} has type {}.".format(filename, filetype))
    if rna_type == "only_cg" and filetype != "forgi":
        raise WrongFileFormat("Only forgi cg files are accepted, but file {} has type {}.".format(filename, filetype))

    if filetype == "forgi":
        cg = ftmc.CoarseGrainRNA(filename)
        if rna_type in ["3d", "only_cg"] and not cg.coords.is_filled:
            raise WrongFileFormat("File {} does not contain all 3D coordinates!".format(filename))
        if allow_many:
            return [cg]
        else:
            return cg
    elif filetype == "pdb":
        if pdb_chain:
            # A user-supplied secondary structure takes precedence over
            # pseudoknot removal.
            cgs = [ftmc.load_cg_from_pdb(filename, chain_id=pdb_chain,
                                         remove_pseudoknots=pbd_remove_pk and not pdb_dotbracket,
                                         secondary_structure=pdb_dotbracket,
                                         dissolve_length_one_stems=dissolve_length_one_stems)]
            if dissolve_length_one_stems:
                for cg in cgs:
                    cg.dissolve_length_one_stems()
        else:
            if pdb_dotbracket:
                # Typos in this user-facing message were corrected.
                raise ValueError("pdb_dotbracket requires a chain to be given to avoid ambiguity.")
            cgs = ftmc.connected_cgs_from_pdb(filename, remove_pseudoknots=pbd_remove_pk,
                                              dissolve_length_one_stems=dissolve_length_one_stems)
        if allow_many:
            return cgs
        else:
            if len(cgs) > 1:
                raise WrongFileFormat("More than one connected RNA component in pdb file {}.".format(filename))
            return cgs[0]
    elif filetype == "mmcif":
        raise WrongFileFormat("MMCIF files are not yet supported.")
    elif filetype == "bpseq":
        if rna_type == "3d":
            raise WrongFileFormat("bpseq file {} is not supported. We need 3D coordinates!".format(filename))
        bg = fgb.BulgeGraph()
        with open(filename, 'r') as f:
            text = f.read()
        try:
            int(text[0])
        except ValueError:
            # The file does not start with a digit: skip everything before
            # the first line that starts with "1 ".
            i = text.find("\n1 ")
            text = text[i + 1:]
        bg.from_bpseq_str(text, dissolve_length_one_stems=dissolve_length_one_stems)
        if rna_type == "cg":
            bg = ftmc.from_bulge_graph(bg)
        if allow_many:
            return [bg]
        else:
            return bg
    elif filetype == "fasta" or filetype == "other":
        if rna_type == "3d":
            raise WrongFileFormat("Fasta(like) file {} is not supported. We need 3D coordinates!".format(filename))
        try:
            bgs = fgb.from_fasta(filename, dissolve_length_one_stems=dissolve_length_one_stems)
        except Exception as e:
            with log_to_exception(log, e):
                log.critical("Could not parse file %r.", filename)
                if filetype == "other":
                    log.critical("We assumed file %r to be some fasta-variant or dotbracket file, but an error occurred during parsing.", filename)
            raise
        # from_fasta may hand back a single graph; normalize to a list.
        if isinstance(bgs, fgb.BulgeGraph):
            bgs = [bgs]
        if dissolve_length_one_stems:
            for bg in bgs:
                bg.dissolve_length_one_stems()
        if rna_type == "cg":
            bgs = list(map(ftmc.from_bulge_graph, bgs))
        if allow_many:
            return bgs
        else:
            if len(bgs) > 1:
                raise WrongFileFormat("More than one RNA found in fasta/ dotbracket file {}.".format(filename))
            return bgs[0]
def load_rna(filename, rna_type="any", allow_many=True, pdb_chain=None,
             pdb_remove_pk=True, pdb_dotbracket="",
             dissolve_length_one_stems=True, pdb_annotation_tool=None,
             pdb_allow_www_query=False):
    """
    Load one or more RNAs from a file or from a dotbracket string.

    :param filename: The file to load. If rna_type is "any" and the string
                     consists only of the characters ``.()[]{}&``, it is
                     treated as a dotbracket string instead of a filename.
    :param rna_type: One of "any", "only_cg", "3d" and "pdb"

                     *  "any": Return either BulgeGraph or CoarseGrainRNA
                        object, depending on the input format
                     *  "only_cg": Only accept cg-files.
                     *  "3d":  Return CoarseGrainRNA objects, if the file
                        contains 3D information, raise an error otherwise
                     *  "pdb": only accept pdb files
    :param allow_many: If True, return a list. If False, return a single
                       CoarseGrainRNA object or raise a WrongFileFormat, if
                       more than one RNA is present.
    :param pdb_chain: Extract the given chain from the file.
                      Only applicable if filename corresponds to a pdb file
    :param pdb_remove_pk: Detect pseudoknot-free structures from the pdb.
    :param pdb_dotbracket: Only applicable, if filename corresponds to a pdb
                           file and pdb_chain is given.
    :param dissolve_length_one_stems: Ignored if input is in forgi bg/cg
                                      format.
    :param pdb_annotation_tool: Use DSSR, MC-Annotate or forgi heuristic for
                                basepair-detection in PDB/MMCIF files (None
                                for auto-detect). Ignored for other
                                file-types.
    :param pdb_allow_www_query: Passed on to CoarseGrainRNA.from_pdb as
                                ``query_PDBeChem``.

    :returns: A list of RNAs or a single RNA
    """
    # Is filename a dotbracket string and not a filename?
    dotbracket_alphabet = ".()[]{}&"
    if all(char in dotbracket_alphabet for char in filename):
        # A dotbracket-string was provided via the commandline
        if rna_type == "any":
            log.info(
                "Assuming RNA %s is a dotbracketstring and not a file.", filename)
            graph = fgb.BulgeGraph.from_dotbracket(
                filename, dissolve_length_one_stems=dissolve_length_one_stems)
            return [graph] if allow_many else graph
        warnings.warn("Cannot treat '{}' as dotbracket string, since we need a sequence. "
                      "Trying to treat it as a filename instead...".format(filename))

    with open(filename) as rnafile:
        filetype = sniff_filetype(rnafile)

    is_pdb_like = filetype in ["pdb", "cif"]
    if rna_type == "pdb" and not is_pdb_like:
        raise WrongFileFormat(
            "Only PDB files (*.pdb/.cif) are accepted, but file {} has type {}.".format(filename, filetype))
    if rna_type == "only_cg" and filetype != "forgi":
        raise WrongFileFormat(
            "Only forgi cg files are accepted, but file {} has type {}.".format(filename, filetype))

    if filetype == "forgi":
        cg = ftmc.CoarseGrainRNA.from_bg_file(filename)
        needs_coords = rna_type in ["3d", "only_cg"]
        if needs_coords and not cg.coords.is_filled:  # pylint: disable=E1101
            raise WrongFileFormat(
                "File {} does not contain all 3D coordinates!".format(filename))
        return [cg] if allow_many else cg

    if is_pdb_like:
        # Keyword arguments common to both ways of loading the structure.
        shared_kwargs = dict(
            dissolve_length_one_stems=dissolve_length_one_stems,
            filetype=filetype,
            annotation_tool=pdb_annotation_tool,
            query_PDBeChem=pdb_allow_www_query)
        if pdb_chain:
            # A user-supplied secondary structure takes precedence over
            # pseudoknot removal.
            cgs = ftmc.CoarseGrainRNA.from_pdb(
                filename, load_chains=pdb_chain,
                remove_pseudoknots=pdb_remove_pk and not pdb_dotbracket,
                secondary_structure=pdb_dotbracket,
                **shared_kwargs)
        else:
            if pdb_dotbracket:
                raise ValueError(
                    "pdb_dotbracket requires a chain to be given to avoid ambiguity.")
            cgs = ftmc.CoarseGrainRNA.from_pdb(
                filename, remove_pseudoknots=pdb_remove_pk, **shared_kwargs)
        if allow_many:
            return cgs
        if len(cgs) > 1:
            raise WrongFileFormat("More than one connected RNA component in pdb file {}: {}".format(
                filename, [cg.name for cg in cgs]))
        return cgs[0]

    if filetype == "bpseq":
        if rna_type == "3d":
            raise WrongFileFormat(
                "bpseq file {} is not supported. We need 3D coordinates!".format(filename))
        with open(filename, 'r') as bpseq_file:
            text = bpseq_file.read()
        try:
            int(text[0])
        except ValueError:
            # The file does not start with a digit: skip everything before
            # the first line that starts with "1 ".
            first_record = text.find("\n1 ")
            text = text[first_record + 1:]
        bg = ftmc.CoarseGrainRNA.from_bpseq_str(
            text, dissolve_length_one_stems=dissolve_length_one_stems)
        return [bg] if allow_many else bg

    if filetype in ("fasta", "other"):
        if rna_type == "3d":
            raise WrongFileFormat(
                "Fasta(like) file {} is not supported. We need 3D coordinates!".format(filename))
        try:
            bgs = ftmc.CoarseGrainRNA.from_fasta(
                filename, dissolve_length_one_stems=dissolve_length_one_stems)
        except Exception as e:
            with log_to_exception(log, e):
                log.critical("Could not parse file %r.", filename)
                if filetype == "other":
                    log.critical(
                        "We assumed file %r to be some fasta-variant or dotbracket file, but an error occurred during parsing.", filename)
            raise
        if allow_many:
            return bgs
        if len(bgs) > 1:
            raise WrongFileFormat(
                "More than one RNA found in fasta/ dotbracket file {}.".format(filename))
        return bgs[0]

    # Any other file type is not handled here.
    return None
key = {"name": cg.name, "filename": filenames[i]} if args.per_ml: new_data = describe_ml_segments(cg) for i in range(len(new_data["segment"])): for k, v in key.items(): data[k].append(v) for k, v in new_data.items(): data[k].append(v[i]) else: new_data = describe_rna(cg, file_num, dist_pairs, angle_pairs) for k, v in key.items(): data[k].append(v) for k, v in new_data.items(): data[k].append(v) except Exception as e: with log_to_exception(log, e): log.error("Error occurred during describing %d%s cg %s", file_num, {1: "st", 2: "nd", 3: "rd"}.get( file_num % 10 * (file_num % 100 not in [11, 12, 13]), "th"), cg.name) raise if args.keys: allowed_keys = args.keys.split(",") + ["name"] for key in list(data.keys()): if key not in allowed_keys: del data[key] df = pd.DataFrame(data) df.set_index("name", append=True, inplace=True) if args.csv: if not args.mode and os.path.isfile(args.csv): raise RuntimeError("File {} exists already.".format(args.csv)) if not args.mode or args.mode == 'o': df.to_csv(args.csv)
def cg_stem(self, dssr_stem):
    """
    Look up the CoarseGrainRNA stem that corresponds to a DSSR stem.

    Every base pair of the DSSR stem is mapped to the coarse-grained
    elements that contain its two nucleotides. Of all elements found, the
    stem that was hit most often is returned. A warning is issued if more
    than one element was hit.

    :param dssr_stem: INT the index of the stem in the DSSR annotation.
    :returns: The name of the matching stem in the CoarseGrainRNA.
    :raises DSSRLookupError: If the DSSR data has no stems or no stem with
                             the given index.
    :raises WrongChain: If a chain of a paired residue is not among the
                        chains of the CoarseGrainRNA.
    :raises RuntimeError: If no coarse-grained stem matches.
    """
    log.debug("Mapping DSSR stem %s to forgi", dssr_stem)
    if "stems" not in self._dssr:
        raise DSSRLookupError("The DSSR object does not contain any stem!")

    # The first stem entry with the requested index, if there is one.
    candidates = (entry for entry in self._dssr["stems"]
                  if entry["index"] == dssr_stem)
    stem_obj = next(candidates, None)
    if stem_obj is None:
        raise DSSRLookupError("No stem with index {}".format(dssr_stem))
    log.debug("Found stem %s&%s", stem_obj["strand1"], stem_obj["strand2"])

    # See, if the dssr_stems maps to more than 1 cg-stem
    element_hits = Counter()
    for pair in stem_obj["pairs"]:
        res1 = dssr_to_pdb_resid(pair["nt1"])
        res2 = dssr_to_pdb_resid(pair["nt2"])
        log.debug("Contains pair %s-%s", res1, res2)
        # Chains are only checked if the cg knows about chains at all.
        if self._cg.chains and not (res1.chain in self._cg.chains
                                    and res2.chain in self._cg.chains):
            e = WrongChain()
            with log_to_exception(log, e):
                log.error(
                    "Wrong chain: res1={}, res2={}, cg.chains={}".format(
                        res1, res2, self._cg.chains))
            raise e
        positions = [self._cg.seq.to_integer(res1),
                     self._cg.seq.to_integer(res2)]
        element_hits.update(self._cg.nucleotides_to_elements(positions))

    if not element_hits:
        raise RuntimeError(
            "No stem matching dssr_stem {}.".format(dssr_stem))

    ranked = element_hits.most_common()
    if len(ranked) > 1:
        # For every interior loop among the hits, add its sequence and a
        # line with "^" under the positions that belong to it.
        extra_info = ""
        for elem in element_hits:
            if elem[0] != "i":
                continue
            extra_info += "\n{} is {}:".format(
                elem, self._cg.get_define_seq_str(elem))
            extra_info += "\n\t" + self._cg.seq + "\n\t" + \
                self._cg.to_dotbracket_string() + "\n\t"
            resnums = list(self._cg.define_residue_num_iterator(elem))
            extra_info += "".join(
                "^" if pos in resnums else " "
                for pos in range(1, len(self._cg.seq) + 1))
        warnings.warn(
            "dssr_stem {} maps to more than one cg element: {} {}".format(
                dssr_stem, list(element_hits), extra_info))

    # Return the most frequently hit element that is a stem.
    for elem, _count in ranked:
        if elem[0] == "s":
            return elem
    raise RuntimeError(
        "No stem matching dssr_stem {}, only single stranded region: {}.".
        format(dssr_stem, list(element_hits)))