def retrive_relevant_poses() -> (dict, dict):
    """
    Reduce every cohesin and dockerin entry of the two alignment files to a fixed
    set of positions.

    The positions listed in coh_poses_1ohz / doc_poses_1ohz are first passed through
    non_aligned_position_at_aligned() of the '1OHZ' entry of the matching alignment;
    the resulting positions are then pulled from every entry with
    get_aligned_positions().

    :return: seq dicts for cohs and docs, holding only the relevant positions, determined by 1OHZ
    """
    coh_alignment = read_multi_fastas(root_path + 'cohesins_from_rachel.fasta_aln', suffix_to_remove='/')
    doc_alignment = read_multi_fastas(root_path + 'dockerins_from_rachel.fasta_aln', suffix_to_remove='/')

    # translate the 1OHZ-based position lists using the 1OHZ row of each alignment
    coh_reference = coh_alignment['1OHZ']
    coh_columns = [coh_reference.non_aligned_position_at_aligned(pos) for pos in coh_poses_1ohz]
    doc_reference = doc_alignment['1OHZ']
    doc_columns = [doc_reference.non_aligned_position_at_aligned(pos) for pos in doc_poses_1ohz]

    trimmed_cohs = {
        label: AASeq(string=''.join(entry.get_aligned_positions(coh_columns)), name=label)
        for label, entry in coh_alignment.items()
    }
    trimmed_docs = {
        label: AASeq(string=''.join(entry.get_aligned_positions(doc_columns)), name=label)
        for label, entry in doc_alignment.items()
    }
    return trimmed_cohs, trimmed_docs
def __init__(self, name: str, coh_AASeq: AASeq, doc_AASeq: AASeq, purples: int, j=False, originals=None,
             doc_wt=False):
    """
    Store a cohesin/dockerin pair and derive its wild-type names and switch strings.

    :param name: design name; when j is false it is split on '_' and the tokens
                 around the 'A' token give the cohesin / dockerin wild-type names
    :param coh_AASeq: cohesin sequence
    :param doc_AASeq: dockerin sequence
    :param purples: stored unchanged on the instance
    :param j: when true, the cohesin wild-type is fixed to '1ohz'
    :param originals: used only when j is true and doc_wt is falsy; maps
                      name[:-3] + '.pdb.gz' to a string from which the dockerin
                      wild-type is cut out (the part after '_A_', up to the next '_')
    :param doc_wt: explicit dockerin wild-type name, used when j is true
    """
    self.name = name
    self.coh_AASeq = coh_AASeq
    self.doc_AASeq = doc_AASeq
    self.purples = purples

    if j:
        self.coh_wt = '1ohz'
        if doc_wt:
            self.doc_wt = doc_wt
        else:
            original_name = originals[name[:-3] + '.pdb.gz']
            self.doc_wt = original_name.split('_A_')[1].split('_')[0]
    else:
        tokens = name.split('_')
        a_at = tokens.index('A')
        self.coh_wt = tokens[a_at - 1]
        self.doc_wt = tokens[a_at + 1]

    # residues without an entry in type_dict are encoded as 'c'
    coh_residues = coh_AASeq.get_positions(positions_dict['1ohz'])
    self.coh_switch = ''.join(type_dict.get(res, 'c') for res in coh_residues)

    doc_residues = doc_AASeq.get_positions(list(doc21ohz[self.doc_wt].keys()))
    self.doc_switch = ''.join(type_dict.get(res, 'c') for res in doc_residues)
def __init__(self, chain_id: str = None, residues: dict = None, non_residues: dict = None):
    """
    Initialise a chain from optional residue / non-residue dictionaries.

    :param chain_id: chain identifier, also used as the name of the sequences
    :param residues: residues to start with; their res_type values, in dict order,
                     form self.seq. None gives an empty chain
    :param non_residues: non-residue entries to start with; their res_type values form
                         self.non_residues_seq. None gives an empty dict
    """
    self.chain_id = chain_id

    self.residues = residues if residues is not None else {}
    if residues is not None:
        # NOTE(review): this sequence is created without name=chain_id, unlike the
        # empty case below - kept as in the original, confirm whether intended
        self.seq = AASeq(''.join(a.res_type for a in residues.values()))
    else:
        self.seq = AASeq('', name=chain_id)

    self.non_residues = non_residues if non_residues is not None else {}
    if non_residues is not None:
        # built from non_residues; the previous code read `residues` here, which gave
        # the wrong sequence and raised AttributeError when residues was None
        self.non_residues_seq = AASeq(''.join(a.res_type for a in non_residues.values()), name=chain_id)
    else:
        self.non_residues_seq = AASeq('', name=chain_id)
def write_multi_seqs_to_file(seqs: dict, out_file: str, query: AASeq = None, no_dups: bool = True):
    """
    Write the given sequences to out_file, one `write()` result per entry.

    :param seqs: {name: AASeq}
    :param out_file: file to write
    :param query: query AASeq; when given it is written first and any entry of seqs
                  equal to it is skipped
    :param no_dups: whether to skip entries equal to one that was already written
    :return: None. writes the fasta file
    """
    emitted = []
    with open(out_file, 'w+') as handle:
        if query is not None:
            handle.write('%s\n' % query.write())
            emitted.append(query)

        for record in seqs.values():
            same_as_query = query is not None and query == record
            if same_as_query or (no_dups and record in emitted):
                continue
            handle.write('%s\n' % record.write())
            emitted.append(record)
def extract_seq(pdb: MyPDB) -> dict:
    """
    Collect the sequence of every chain in a PDB.

    :param pdb: structure to read; iterating it yields (chain id, chain) pairs and
                iterating a chain yields (residue id, residue) pairs
    :return: {chain id: AASeq}, each named '<pdb name>.<chain id>' and holding the
             concatenated res_type of the chain's residues
    """
    per_chain = {}
    for chain_id, chain in pdb:
        chain_seq = AASeq(name='%s.%s' % (pdb.name, chain_id))
        per_chain[chain_id] = chain_seq
        chain_seq.set_seq(''.join(residue.res_type for _, residue in chain))
    return per_chain
def aln_identity(aln1: AASeq, aln2: AASeq) -> float:
    """
    Fraction of aln1's residues that are identical in aln2 at the same alignment column.

    Both aligned strings are compared column by column (over their shared length);
    a column counts as an identity when both hold the same character and it is not
    a gap.

    :param aln1: alignment sequence (with gaps)
    :param aln2: alignment sequence (with gaps)
    :return: the identity calculated by: (# identities)/(aln1 length, no gaps).
             0.0 when aln1 has no non-gap positions
    >>> a = AASeq(aligned='-ABC-D')
    >>> b = AASeq(aligned='-ABCED')
    >>> aln_identity(a, b)
    1.0
    >>> b = AASeq(aligned='-BBCED')
    >>> aln_identity(a, b)
    0.75
    """
    row1 = aln1.get_aligned()
    row2 = aln2.get_aligned()

    # denominator: residues of aln1, i.e. its aligned string without gaps
    length = sum(1 for aa in row1 if aa != '-')
    if length == 0:
        return 0.0

    # walk the aligned strings themselves so every alignment column is visited
    identities = sum(1 for aa1, aa2 in zip(row1, row2) if aa1 == aa2 != '-')
    return identities / float(length)
def aln_identity(aln1: AASeq, aln2: AASeq) -> float:
    """
    Fraction of aln1's residues that are identical in aln2 at the same alignment column.

    Both aligned strings are compared column by column (over their shared length);
    a column counts as an identity when both hold the same character and it is not
    a gap.

    :param aln1: alignment sequence (with gaps)
    :param aln2: alignment sequence (with gaps)
    :return: the identity calculated by: (# identities)/(aln1 length, no gaps).
             0.0 when aln1 has no non-gap positions
    >>> a = AASeq(aligned='-ABC-D')
    >>> b = AASeq(aligned='-ABCED')
    >>> aln_identity(a, b)
    1.0
    >>> b = AASeq(aligned='-BBCED')
    >>> aln_identity(a, b)
    0.75
    """
    aligned1 = aln1.get_aligned()
    aligned2 = aln2.get_aligned()

    # the previous len(len(aln1)) always raised TypeError; count aln1's non-gap
    # characters directly instead
    residue_count = len(aligned1) - aligned1.count('-')
    if residue_count == 0:
        return 0.0

    matches = 0
    for aa1, aa2 in zip(aligned1, aligned2):
        if aa1 == aa2 != '-':
            matches += 1
    return matches / float(residue_count)
def setup_db(args):
    """
    Prepare one working directory per entry of the Rost database.

    For every entry returned by parse_rost_db() a directory named after the entry key
    is created and filled with:
      * the downloaded PDB and a '<key>_<chain>.pdb' file holding only the requested chain
      * '<pdb>_AddMembrane.xml', built from the topology spans after mapping their
        boundaries with aligned_position_at_non_aligned() of the chain sequence (+1)
      * 'embed.flags', the flags file pointing at the two files above
    Progress is logged to './db_setup.log'.

    The working directory is restored after every entry, also when the entry fails.

    :param args: unused; kept for interface compatibility
    :return: None
    """
    rost_db = parse_rost_db()
    logger = Logger('./db_setup.log')
    start_dir = os.getcwd()

    for k, v in rost_db.items():
        logger.create_header('working on %s' % k)
        logger.log('seq: %s' % v['seq'])
        logger.log('pdb: %s' % v['pdb'])
        logger.log('chain: %s' % v['chain'])
        logger.log('ts: %s' % v['ts'])

        chain_id = v['chain']
        chain_pdb_file = '%s_%s.pdb' % (k, chain_id)
        membrane_xml_file = '%s_AddMembrane.xml' % v['pdb']

        os.mkdir(k)
        os.chdir(k)
        try:
            # get pdb and extract chain
            download_pdb({'name': v['pdb'], 'path': './'})
            empty_pdb = MyPDB(name=v['pdb'])
            pdb = parse_PDB('pdb%s.ent' % v['pdb'])
            empty_pdb.add_chain(pdb.chains[chain_id])
            write_PDB(chain_pdb_file, empty_pdb)

            chain_seq = extract_seq(empty_pdb)[chain_id]
            rdb_seq = AASeq(v['seq'])
            # return value is not needed; presumably align() fills the `.aligned`
            # attributes that are logged right below - confirm against AASeq
            chain_seq.align(rdb_seq)
            logger.log('pdb seq: %s' % chain_seq.aligned)
            logger.log('rst seq: %s' % rdb_seq.aligned)

            # get spans and print xml
            new_spans = []
            for sp in find_topo(v['ts']):
                start = chain_seq.aligned_position_at_non_aligned(sp[0]) + 1
                end = chain_seq.aligned_position_at_non_aligned(sp[1]) + 1
                logger.log('span %i->%i %s moving to %i->%i' % (sp[0], sp[1], sp[2], start, end))
                new_spans.append([start, end, sp[2]])
            create_AddMembrane_xml(new_spans, membrane_xml_file)

            # create flags file
            with open('embed.flags', 'w+') as fout:
                fout.write(
                    '-parser:protocol /home/labs/fleishman/jonathaw/elazaridis/protocols/embed_in_membrane.xml\n'
                )
                fout.write('-s %s\n' % chain_pdb_file)
                fout.write('-parser:script_vars add_memb_xml=%s\n' % membrane_xml_file)
                fout.write('-overwrite\n')
                fout.write('-score::elec_memb_sig_die\n')
                fout.write('-corrections::beta_nov15\n')
                fout.write('-score::memb_fa_sol\n')
        finally:
            # always leave the entry directory, so a failing entry cannot strand the
            # process (and later relative paths) inside it
            os.chdir(start_dir)
def translate(seq: str, name=None) -> AASeq:
    """
    Translate a nucleotide sequence codon by codon using genetic_code.

    Only complete triplets are translated; up to two trailing nucleotides are ignored.

    :param seq: a nucleotide seq
    :param name: name given to the resulting AASeq
    :return: amino acid seq
    >>> translate('TTTCATAAG').get_seq()
    'FHK'
    """
    codon_starts = range(0, len(seq) - 2, 3)
    residues = [genetic_code[seq[start:start + 3]] for start in codon_starts]
    return AASeq(string=''.join(residues), name=name)
def highest_seq_similarity(crys_seqs: list, query: AASeq) -> (AASeq, float):
    """
    Pick the crystallised sequence with the highest query.aligned_identity() score.

    :param crys_seqs: list of AASeq instances of crystalised seqs
    :param query: a query AASeq
    :return: (the most sequence-similar sequence, its identity). On ties the first
             one wins; when no score exceeds 0.0 an empty AASeq and 0.0 are returned
    """
    winner, winner_identity = AASeq(), 0.0
    for candidate in crys_seqs:
        identity = query.aligned_identity(candidate)
        if identity > winner_identity:
            winner, winner_identity = candidate, identity
    return winner, winner_identity
def __init__(self, name: str, coh_AASeq: AASeq, doc_AASeq: AASeq, purples: int, j=False, originals=None,
             doc_wt=False):
    """
    Keep a cohesin/dockerin pair together with its wild-type names and switch strings.

    :param name: design name. When j is false the '_'-separated tokens directly before
                 and after the 'A' token are taken as cohesin / dockerin wild-type
    :param coh_AASeq: cohesin sequence
    :param doc_AASeq: dockerin sequence
    :param purples: kept on the instance as given
    :param j: when true the cohesin wild-type is set to '1ohz'
    :param originals: consulted only when j is true and doc_wt is falsy: the value
                      under name[:-3] + '.pdb.gz' is split on '_A_' and then on '_'
                      to obtain the dockerin wild-type
    :param doc_wt: dockerin wild-type to use when j is true
    """
    self.name = name
    self.coh_AASeq = coh_AASeq
    self.doc_AASeq = doc_AASeq
    self.purples = purples

    if not j:
        pieces = name.split('_')
        pivot = pieces.index('A')
        self.coh_wt, self.doc_wt = pieces[pivot - 1], pieces[pivot + 1]
    else:
        self.coh_wt = '1ohz'
        if doc_wt:
            self.doc_wt = doc_wt
        else:
            source_name = originals[name[:-3] + '.pdb.gz']
            self.doc_wt = source_name.split('_A_')[1].split('_')[0]

    # any residue missing from type_dict is written as 'c'
    coh_picked = coh_AASeq.get_positions(positions_dict['1ohz'])
    self.coh_switch = ''.join([type_dict.get(aa, 'c') for aa in coh_picked])

    doc_picked = doc_AASeq.get_positions(list(doc21ohz[self.doc_wt].keys()))
    self.doc_switch = ''.join([type_dict.get(aa, 'c') for aa in doc_picked])
def read_multi_fastas(fastas_file: str, suffix_to_remove: str = None, lower=False, add_aligned=False) -> dict:
    """
    Read a (multi) fasta file into a dictionary of AASeq.

    :param fastas_file: file address
    :param suffix_to_remove: when given, the entry name is cut at the first occurrence
                             of this string
    :param lower: lower-case the entry names (both dict key and AASeq name)
    :param add_aligned: store the sequence as `aligned` even if it holds no '-'
    :return: {name: AASeq}. Entries containing '-' (or all entries, with add_aligned)
             get the raw sequence as `aligned` and the gap-free one as `string`
    """
    with open(fastas_file, 'r') as fasta:
        entries = fasta.read().split('>')

    result = {}
    for entry in entries:
        header, *body_lines = entry.split('\n')
        if not body_lines:
            continue

        # whitespace inside the header line is replaced by '_'
        name = '_'.join(header.rstrip().split())
        if name == '':
            continue
        if suffix_to_remove is not None:
            name = name.split(suffix_to_remove)[0]
        key = name.lower() if lower else name

        seq = ''.join(line.rstrip() for line in body_lines)
        if '-' in seq or add_aligned:
            result[key] = AASeq(string=seq.replace('-', ''), name=key, aligned=seq)
        else:
            result[key] = AASeq(string=seq, name=key)
    return result
def parse_input_data(in_file: str) -> OrderedDict:
    """
    Read the comma-separated input table.

    Rows whose first field is 'name' (the header) and rows with fewer than four
    fields are skipped.

    :param in_file: input table. use the template
    :return: dict of the CSV, keyed by the first column, each value holding
             'name', 'seq' (AASeq), 'dilution_factor' (float) and 'absorbance' (float)
    """
    with open(in_file, 'r') as fin:
        rows = fin.read().split('\n')

    table = OrderedDict()
    for row in rows:
        fields = row.split(',')
        if fields[0] == 'name' or len(fields) < 4:
            continue
        name, seq, dilution, absorbance = fields[:4]
        table[name] = {
            'name': name,
            'seq': AASeq(seq, name=name),
            'dilution_factor': float(dilution),
            'absorbance': float(absorbance),
        }
    return table
def write_multi_seqs_to_file(seqs: dict, out_file: str, query: AASeq = None, no_dups: bool = True):
    """
    Dump sequences to a fasta file, query first.

    :param seqs: {name: AASeq}
    :param out_file: file to write
    :param query: query AASeq; written at the top of the file when given, and not
                  repeated if it also appears in seqs
    :param no_dups: whether to refrain from duplicates or not
    :return: None. writes the fasta file
    """
    already_written = []

    with open(out_file, 'w+') as fasta_out:
        if query is not None:
            fasta_out.write('%s\n' % query.write())
            already_written.append(query)

        for candidate in seqs.values():
            if query is not None and query == candidate:
                continue
            if no_dups and candidate in already_written:
                continue
            fasta_out.write('%s\n' % candidate.write())
            already_written.append(candidate)
def main():
    """
    Command-line entry point: compute and print protein parameters for one or more
    sequences (extinction coefficient, pI, molar concentration, concentration after a
    two-fold dilution, molecular weight and g/L).

    Modes (``-mode``):
        csv   - records are read from the table given by ``-in_file``
        line  - one record from ``-name``, ``-seq``, ``-dilution_factor``, ``-absorbance``
        Js    - like ``line``, but the sequence is looked up by name in ``J_data()``
        excel - hard-coded names and absorbances, sequences from ``J_data()``
        just  - the hard-coded dockerin names with dilution 1 and absorbance 0.0
    Any other mode prints 'no mode found' and exits with a non-zero status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-mode', default='csv', type=str)
    parser.add_argument('-in_file', type=str)
    parser.add_argument('-name', type=str)
    parser.add_argument('-seq', type=str)
    parser.add_argument('-dilution_factor', type=float)
    parser.add_argument('-absorbance', type=float)
    args = vars(parser.parse_args())
    mode = args['mode']

    if mode == 'csv':
        input_dict = parse_input_data(args['in_file'])

    elif mode == 'line':
        input_dict = {
            args['name']: {
                'name': args['name'],
                'seq': AASeq(args['seq'], args['name']),
                'dilution_factor': args['dilution_factor'],
                'absorbance': args['absorbance']
            }
        }

    elif mode == 'Js':
        j_data = J_data()
        input_dict = {
            args['name']: {
                'name': args['name'],
                'seq': AASeq(j_data[args['name']], args['name']),
                'dilution_factor': args['dilution_factor'],
                'absorbance': args['absorbance']
            }
        }

    elif mode == 'excel':
        absorbances = [
            0, 1.2205, 0, 0, 0, 0.7145, 0, 0, 1.3835, 1.859, 0, 0, 1.83875,
            3.18925
        ]
        names = [
            'j829.A', 'j5711.A', 'j5517.A', 'j5106.A', 'j5093.A', 'j4286.A',
            'j3622.A', 'j1647.A', 'j4398.A', 'j3983.A', 'j3626.A', 'j4518.A',
            'j4653.A', '1ohz.A'
        ]
        j_data = J_data()
        input_dict = OrderedDict()
        for name, absorbance in zip(names, absorbances):
            input_dict[name] = {
                'name': name,
                'seq': AASeq(j_data[name], name),
                'dilution_factor': args['dilution_factor'],
                'absorbance': absorbance
            }

    elif mode == 'just':
        j_data = J_data()
        # not used at the moment: only the dockerin names are processed below;
        # kept so the two lists can be swapped easily
        coh_names = [
            'j829.A', 'j5711.A', 'j5517.A', 'j5106.A', 'j5093.A', 'j4286.A',
            'j3622.A', 'j1647.A', 'j4398.A', 'j3983.A', 'j3626.A', 'j4518.A',
            'j4653.A', '1ohz.A'
        ]
        doc_names = [
            'j829.B', 'j5711.B', 'j5517.B', 'j5106.B', 'j5093.B', 'j1526.B',
            'j3622.B', 'j1647.B', 'j4398.B', 'j3983.B', 'j3626.B', 'j4518.B',
            'j4653.B', '1ohz.B'
        ]
        all_names = doc_names
        input_dict = OrderedDict()
        for name in all_names:
            input_dict[name] = {
                'name': name,
                'seq': AASeq(j_data[name], name),
                'dilution_factor': 1,
                'absorbance': 0.0
            }

    else:
        print('no mode found')
        # an unknown mode is an error, so report it through the exit status as well
        sys.exit(1)

    pd.set_option('display.float_format', '{:.2g}'.format)
    columns = [
        'name', 'seq', 'dilution_factor', 'absorbance', 'molecular_weight',
        'pI', 'extinction_coefficient', 'conc', 'glycerol_conc', 'g/l'
    ]
    rows = []
    for v in input_dict.values():
        # calculate extinction coefficient
        v['extinction_coefficient'] = v['seq'].calc_extinction_coefficient(reduced=False)
        # calculate isoelectric point
        v['pI'] = v['seq'].calc_isoelectric_point()
        # calculate molar concentration
        v['conc'] = v['dilution_factor'] * v['absorbance'] / v['extinction_coefficient']
        # calculate concentration if diluted by half
        v['glycerol_conc'] = v['conc'] / 2
        # calculate molecular weight
        v['molecular_weight'] = v['seq'].calc_molecular_weight()
        # calculate g/L: (mol/L) * (g/mol). The previous code divided by the
        # molecular weight, which does not yield g/L
        v['g/l'] = v['conc'] * v['molecular_weight']
        print_evernote_format(v)
        v['seq'] = v['seq'].get_seq()
        rows.append(v)

    # build the frame in one go; DataFrame.append was removed in pandas 2.0
    df = pd.DataFrame(rows, columns=columns)
    print(df)
    print(', '.join(["%i" % i for i in df['extinction_coefficient']]))

    # print conc row for excel
    print('conc row for excel')
    print('\t'.join('%.2f' % (a * 10**6) for a in df['conc'].values))
class Chain:
    """
    A single chain: residues and non-residue entries keyed by residue number, each
    with a matching sequence. Iteration, len(), keys() and values() cover the
    residues only.
    """

    def __init__(self, chain_id: str = None, residues: dict = None, non_residues: dict = None):
        """
        :param chain_id: chain identifier, also used as the name of the sequences
        :param residues: residues to start with; their res_type values form self.seq
        :param non_residues: non-residue entries to start with; their res_type values
                             form self.non_residues_seq
        """
        self.chain_id = chain_id

        self.residues = residues if residues is not None else {}
        if residues is not None:
            # NOTE(review): created without name=chain_id, unlike the empty case -
            # kept as in the original, confirm whether intended
            self.seq = AASeq(''.join(a.res_type for a in residues.values()))
        else:
            self.seq = AASeq('', name=chain_id)

        self.non_residues = non_residues if non_residues is not None else {}
        if non_residues is not None:
            # built from non_residues; the previous code read `residues` here, which
            # gave the wrong sequence and failed when residues was None
            self.non_residues_seq = AASeq(''.join(a.res_type for a in non_residues.values()), name=chain_id)
        else:
            self.non_residues_seq = AASeq('', name=chain_id)

    def __repr__(self) -> str:
        return "chain %s has %i residues" % (self.chain_id, len(self.residues))

    def __getitem__(self, item: int) -> Residue:
        """
        :param item: residue number
        :return: the residue with that number, falling back to the non-residue entries
        :raises KeyError: when the number is in neither dictionary
        """
        try:
            return self.residues[item]
        except KeyError:
            return self.non_residues[item]

    def __iter__(self):
        """Yield (residue number, residue) pairs of the residues."""
        for k, v in self.residues.items():
            yield k, v

    def __len__(self):
        return len(self.residues)

    def add_residue(self, residue: Residue) -> None:
        """
        Add a residue to the chain.

        :param residue: entry to add. If its res_type_3 is a key of three_2_one it is
                        stored as a residue and appended to seq, otherwise it is stored
                        as a non-residue and appended to non_residues_seq
        """
        if residue.res_type_3 in three_2_one:
            self.seq.add_aa(residue.res_type)
            self.residues[residue.res_num] = residue
        else:
            self.non_residues_seq.add_aa(residue.res_type)
            self.non_residues[residue.res_num] = residue

    def min_distance_chain(self, other: 'Chain') -> float:
        """
        :param other: another chain (anything yielding (number, residue) pairs)
        :return: the smallest min_distance_res() over all residue pairs of the two chains
        :raises ValueError: when there are no residue pairs to compare
        """
        distances = []
        for mrid, mres in self:
            for orid, ores in other:
                distances.append(mres.min_distance_res(ores))
        return min(distances)

    def keys(self):
        return self.residues.keys()

    def values(self):
        return self.residues.values()

    def COM(self) -> XYZ:
        """
        :return:the Center Of Mass of the chain as calculated by the averages over Xs, Ys and Zs of all CAs
        """
        Xs = []
        Ys = []
        Zs = []
        for res in self.values():
            # residues lacking a CA atom are left out of the average
            if 'CA' in res.keys():
                Xs.append(res['CA'].xyz.x)
                Ys.append(res['CA'].xyz.y)
                Zs.append(res['CA'].xyz.z)
        return XYZ(np.mean(Xs), np.mean(Ys), np.mean(Zs))

    def change_chain_name(self, new: str) -> None:
        """
        :param new: new chain id, applied to the chain and to each of its residues
        :return: None
        """
        self.chain_id = new
        for rid, r in self:
            r.change_chain_name(new)

    def translate_xyz(self, xyz: XYZ) -> None:
        """
        :param xyz: an xyz point
        :return: None. translate all chain atoms by xyz
        """
        for rid, r in self:
            r.translate_xyz(xyz)
def extract_charge_configuration(seq: AASeq, positions: list):
    """
    Encode the residues at the given positions with res2charge.

    :param seq: sequence to read
    :param positions: positions handed to seq.get_positions()
    :return: list with one res2charge value per residue; residues that are not in
             res2charge are given as "c"
    """
    return [res2charge.get(residue, "c") for residue in seq.get_positions(positions)]
def extract_charge_configuration(seq: AASeq, positions: list):
    """
    Look up the res2charge code of each residue found at the given positions.

    :param seq: sequence to read
    :param positions: positions handed to seq.get_positions()
    :return: list of codes in position order; 'c' is used for any residue that has
             no entry in res2charge
    """
    codes = []
    for residue in seq.get_positions(positions):
        codes.append(res2charge.get(residue, 'c'))
    return codes