''' #print 'subspace:' #subSpace.display() #print 'Measurements in set' #print expandedSpace.listMeasurementsInSet() # loop over measurements and load them into Measurements fileName = 'None' for msrmtInSet in expandedSpace.listMeasurementsInSet(): if fileName != allMeasurements[msrmtInSet][0]: fileName = allMeasurements[msrmtInSet][0] rdatFileName = 'ETERNA'+fileName[3:12]+'.rdat' rdat = RDATFile() rdat.load(open('/home/qmac/projects/testdir/'+rdatFileName)) offset=0 constructs = rdat.constructs.values()[0] # pdb.set_trace() dsection = constructs.data[allMeasurements[msrmtInSet][1]] if dsection.annotations['sequence'][0] != expandedSpace.sequences[msrmtInSet]: print 'Error, sequences not the same!' sys.exit() seq=dsection.annotations['sequence'][0] countZeros = 0 rdatLength = len(dsection.values) for j in range(1,rdatLength): if dsection.values[-j] != 0.0: break
def get_restricted_RDATFile_and_plot_data(constructs, numresults, qdata, searchid, ssdict, check_structure_balance):
    """Build an RDATFile restricted to the queried positions, plus the data needed to plot it.

    For every construct section the queried sequence positions are resolved
    (whole construct, regex hits on sequence/structure, or secondary-structure
    elements), one RDAT section is created per non-empty position list, and
    every stored data section that has values at those positions contributes
    one row to the heatmap table.

    NOTE(review): the original layout of this function was lost (the code was
    collapsed onto a few lines); the nesting below is reconstructed from the
    data flow and should be confirmed against version control.

    :param constructs: iterable of construct-section records; the code reads
        ``seqpos`` (a string such as ``"[1,2,3]"``), ``offset``, ``sequence``,
        ``structure`` and ``id`` from each
    :param numresults: maximum number of rows/labels/ids returned
    :param qdata: query mapping; the keys ``'all'``, ``'sequence'``,
        ``'structure'`` and ``'secstructelems'`` are recognised
    :param searchid: identifier interpolated into ``rdat.comments``
    :param ssdict: mapping ``construct id -> element name -> list of position lists``
    :param check_structure_balance: when true, a structure match is kept only
        if ``check_balance`` accepts the matched structure substring
    :returns: 15-tuple ``(rdat, table, cell_labels, values_min, values_max,
        values_min_heatmap, values_max_heatmap, unpaired_bins, paired_bins,
        unpaired_bin_anchors, paired_bin_anchors, rmdb_ids, messages,
        numallresults, render)`` where ``table`` is a ``['Position', '1', ...]``
        header row followed by at most ``numresults`` value rows
    """
    rdat = RDATFile()
    all_values = []
    rmdb_ids = []
    values_min = float('Inf')
    values_max = float('-Inf')
    values_min_heatmap = float('Inf')
    values_max_heatmap = float('-Inf')
    messages = []
    paired_bins = []
    unpaired_bins = []
    paired_bin_anchors = []
    unpaired_bin_anchors = []
    paired_merged_data = []
    unpaired_merged_data = []
    cell_labels = []
    for k, c in enumerate(constructs):
        entry = RMDBEntry.objects.get(constructsection=c)
        # seqpos is stored as text ("[a,b,c]"); parse it into a list of ints
        seqpos = [int(i) for i in c.seqpos.strip('[]').split(',')]
        offset = int(c.offset)
        rseqpos_byquery = {}
        searchable_fields = {}
        # Shift that turns an index into the clipped search strings below into
        # an index into the full c.sequence / c.structure strings
        seqpos_offset = min(seqpos) - offset - 1
        # Only the characters whose position (index + offset + 1) is in seqpos are searchable
        searchable_fields['sequence'] = ''.join([s for i, s in enumerate(c.sequence) if i + offset + 1 in seqpos])
        searchable_fields['structure'] = ''.join([s for i, s in enumerate(c.structure) if i + offset + 1 in seqpos])
        if 'all' in qdata:
            rseqposes = [seqpos]
        else:
            for field in qdata:
                if field in ('sequence', 'structure'):
                    # qdata[field] is used as a regular expression against the upper-cased clipped string
                    matches = [range(m.start() + seqpos_offset, m.end() + seqpos_offset) for m in re.finditer(qdata[field], searchable_fields[field].upper())]
                    if check_structure_balance and field == 'structure':
                        field_seqpos = []
                        for match in matches:
                            if check_balance(''.join([c.structure[i] for i in match])):
                                field_seqpos.append([i + offset + 1 for i in match])
                    else:
                        field_seqpos = [[i + offset + 1 for i in match] for match in matches]
                    # Sequence and structure hits share the 'motif' entry: a second
                    # field keeps only position lists that both fields produced
                    if 'motif' in rseqpos_byquery:
                        rseqpos_byquery['motif'] = [match for match in rseqpos_byquery['motif'] if match in field_seqpos]
                    else:
                        rseqpos_byquery['motif'] = field_seqpos
                if field == 'secstructelems':
                    rseqpos_byquery[field] = []
                    for elem in qdata[field]:
                        for poslist in ssdict[c.id][elem]:
                            rseqpos_byquery[field].append([i + offset + 1 for i in poslist])
            # NOTE(review): raises IndexError when qdata holds none of the keys
            # handled above; list indexing of .values() also requires Python 2.
            rseqposes = rseqpos_byquery.values()[0]
            # NOTE(review): this loop rebinds k, shadowing the enumerate index
            # of the outer loop (k is not read anywhere else in this function).
            for k, v in rseqpos_byquery.items():
                tmp = []
                for poslist1 in v:
                    for poslist2 in rseqposes:
                        poslist = []
                        for i in poslist1:
                            if i in poslist2:
                                # NOTE(review): parsed as (len(poslist) > 0 and i-1 in poslist) or (i+1 in poslist).
                                # poslist starts empty, so neither side can become true and nothing is
                                # ever appended to poslist or tmp - confirm the intended condition.
                                if len(poslist) > 0 and i-1 in poslist or i+1 in poslist:
                                    poslist.append(i)
                                else:
                                    if len(poslist) > 0:
                                        tmp.append(poslist)
                                    poslist = []
                        if len(poslist) > 0:
                            tmp.append(poslist)
                # rseqposes is the very list stored in rseqpos_byquery, so this extends that entry in place
                rseqposes += tmp
        for secnum, rseqpos in enumerate(rseqposes):
            if len(rseqpos) > 0:
                rseqpos.sort()
                section = RDATSection()
                # Section name is "<rmdb id>:<first position>:<last position>"
                section.name = '%s:%s:%s' % (entry.rmdb_id, rseqpos[0], rseqpos[-1])
                section.offset = c.offset
                section.sequence = c.sequence
                section.structure = c.structure
                section.annotations = {}
                section.xsel = []
                section.data = []
                section.mutpos = []
                section.data_types = []
                section.seqpos = rseqpos
                rdat.traces[section.name] = []
                rdat.xsels[section.name] = []
                rdat.values[section.name] = []
                rdat.errors[section.name] = []
                # NOTE(review): rdat.reads[section.name] is appended to below without being
                # initialised here - presumably RDATFile supplies a default; verify.
                append_to_rdat = False
                for idx, datasection in enumerate(DataSection.objects.filter(construct_section=c)):
                    dsection = RDATSection()
                    parsedvalues = datasection.values.split(',')
                    # Stored values are parallel to seqpos; pick the ones at the requested positions
                    dsection.values = [float(parsedvalues[seqpos.index(i)]) for i in rseqpos if i in seqpos]
                    valarray = array([float(p) for p in parsedvalues])
                    normvalarray = valarray#(valarray - valarray.mean())/valarray.std()
                    if len(dsection.values) == 0:
                        # No data on the required rseqpos, continue with next data
                        continue
                    else:
                        # We found at least one data section that has the required data, append the construct section to our rdat file
                        append_to_rdat = True
                    if len(datasection.errors) > 0:
                        parsederrors = datasection.errors.split(',')
                        dsection.errors = [float(parsederrors[seqpos.index(i)]) for i in rseqpos if i in seqpos]
                    else:
                        dsection.errors = []
                    if len(datasection.xsel) > 0:
                        parsedxsels = datasection.xsel.split(',')
                        dsection.xsel = [float(parsedxsels[seqpos.index(i)]) for i in rseqpos if i in seqpos]
                    else:
                        dsection.xsel = []
                    # One table row per data section: a "<section name>:<1-based index>" label followed by the values
                    all_values.append([section.name + ':' + str(idx + 1)] + [normvalarray[seqpos.index(i)] for i in rseqpos if i in seqpos])
                    # Matching row of cell labels: nucleotide character + structure character per position
                    cell_labels.append([c.sequence[i - offset - 1] + c.structure[i - offset - 1] for i in rseqpos if i in seqpos])
                    # NOTE(review): a length is never negative, so this test is always true
                    if len(c.structure.strip()) > -1:
                        # Pool values by pairing state ('(' / ')' versus '.') for the histograms built later
                        paired_merged_data += [normvalarray[seqpos.index(i)] for i in rseqpos if c.structure[i - offset -1] in ('(', ')') and i in seqpos]
                        unpaired_merged_data += [normvalarray[seqpos.index(i)] for i in rseqpos if c.structure[i - offset -1] == '.' and i in seqpos]
                    #values_min = min(values_min, min(dsection.values))
                    #values_max = max(values_max, max(dsection.values))
                    if datasection.trace:
                        dsection.traces = [float(d) for d in datasection.trace.split(',')]
                    else:
                        dsection.traces = []
                    if datasection.reads:
                        dsection.reads = [float(d) for d in datasection.reads.split(',')]
                    else:
                        dsection.reads = []
                    if append_to_rdat:
                        section.data.append(dsection)
                        rdat.traces[section.name].append(dsection.traces)
                        rdat.reads[section.name].append(dsection.reads)
                        rdat.values[section.name].append(dsection.values)
                        rdat.xsels[section.name].append(dsection.xsel)
                        rdat.errors[section.name].append(dsection.errors)
                        dsection.annotations = dict([(a.name, a.value) for a in DataAnnotation.objects.filter(section=datasection)])
                        # NOTE(review): nesting of the next two statements is inferred. rmdb_ids is
                        # sliced in parallel with all_values in the return statement, which suggests
                        # one id per appended row - confirm against the original layout.
                        rdat.constructs[section.name] = section
                        rmdb_ids.append(entry.rmdb_id)
    numallresults = len(all_values)
    rdat.loaded = True
    rdat.comments = 'Query results for %s in the Stanford RMDB on %s. Search id %s' % (qdata, datetime.datetime.now(), searchid)
    if len(rmdb_ids) > numresults:
        messages.append('Your query exceeded %s results, showing just the first %s' % (numresults, numresults))
    # Heatmap colour range spans only the rows that will actually be returned (v[0] is the row label)
    for v in all_values[:numresults]:
        values_min_heatmap = min(values_min_heatmap, min(v[1:]))
        values_max_heatmap = max(values_max_heatmap, max(v[1:]))
    if len(all_values) > 0:
        maxlen = max((len(row) for row in all_values[:numresults]))
    else:
        maxlen = 0
    # Pad shorter rows with NaN so that every row reaches maxlen
    for i in range(len(all_values)):
        if len(all_values[i]) < maxlen:
            all_values[i] += [float('NaN')]*(maxlen - len(all_values[i]))
    if len(rmdb_ids) > 0:
        # Fixed histogram window; values outside [-1, 2] are left out of the histograms
        values_max = 2
        values_min = -1
        paired_merged_data = array(paired_merged_data)
        unpaired_merged_data = array(unpaired_merged_data)
        paired_indices = logical_and(paired_merged_data >= values_min, paired_merged_data <= values_max)
        unpaired_indices = logical_and(unpaired_merged_data >= values_min, unpaired_merged_data <= values_max)
        # hist(...)[:2] is unpacked as (bins, anchors); 100 bins are requested
        if len(unpaired_merged_data) > 0:
            unpaired_bins, unpaired_bin_anchors = (x.tolist() for x in hist(unpaired_merged_data[unpaired_indices], 100)[:2])
        if len(paired_merged_data) > 0:
            paired_bins, paired_bin_anchors = (x.tolist() for x in hist(paired_merged_data[paired_indices], 100)[:2])
        row_length = len(all_values[0])
        render = True
    else:
        render = False
        row_length = 0
    # The header row is 'Position' followed by 1..row_length-1 (each row's first cell is its label)
    return rdat, [['Position'] + [str(i+1) for i in range(row_length-1)]] + all_values[:numresults], cell_labels[:numresults], values_min, values_max, values_min_heatmap, values_max_heatmap, unpaired_bins, paired_bins, unpaired_bin_anchors, paired_bin_anchors, rmdb_ids[:numresults], messages, numallresults, render
from matplotlib.pylab import *
from rdatkit.datahandlers import RDATFile
from rdatkit.view import VARNA
from rdatkit.secondary_structure import fold
from rdatkit.mapping import MappingData, normalize
from analysis import eigen_reactivities
import sys

# Script: load the RDAT file named on the command line, normalise every data
# row of its first value set, show the data and its eigen-reactivities as
# matrices, then fold the construct sequence once per eigen-reactivity and
# render each predicted structure with VARNA.
#
# Usage: <script> <file.rdat>

rdat = RDATFile()
# Close the input file as soon as it has been parsed instead of leaking the handle.
with open(sys.argv[1]) as rdat_handle:
    rdat.load(rdat_handle)

# list(...) keeps the "first entry" lookup valid whether values() returns a
# list (Python 2) or a view.
vals = array(list(rdat.values.values())[0])
for i in range(shape(vals)[0]):
    vals[i, :] = normalize(vals[i, :])
eigenrs = eigen_reactivities(vals)

matshow(vals)
#mshow(vals, cmap=get_cmap('Greys'), vmin=0, vmax=vals.mean(), aspect='auto', interpolation='nearest')
matshow(eigenrs)
#imshow(eigenrs, cmap=get_cmap('Greys'), vmin=eigenrs.min(), vmax=eigenrs.mean(), aspect='auto', interpolation='nearest')
show()

construct = list(rdat.constructs.values())[0]
# The sequence does not change between iterations, so look it up once.
sequence = construct.sequence
# Only the first 35 eigen-reactivities are folded and drawn.
for i, e in enumerate(eigenrs[:35]):
    md = MappingData(
        data=e,
        seqpos=[s - construct.offset - 1 for s in construct.seqpos])
    # Fold once and reuse the result for both the printout and the drawing;
    # previously the same fold was computed twice per iteration.
    folded = fold(sequence, mapping_data=md)
    # Single-argument print(...) emits the same text under Python 2 and 3.
    print(folded)
    structure = folded[0].dbn
    VARNA.cmd(sequence, structure, 'test_results/eigen_struct%s.png' % i)
from matplotlib.pylab import *
from rdatkit.datahandlers import RDATFile
from rdatkit.view import VARNA
from rdatkit.secondary_structure import fold
from rdatkit.mapping import MappingData, normalize
from analysis import eigen_reactivities
import sys

# Script: load the RDAT file named on the command line, normalise every data
# row of its first value set, show the data and its eigen-reactivities as
# matrices, then fold the construct sequence once per eigen-reactivity and
# render each predicted structure with VARNA.
#
# Usage: <script> <file.rdat>

rdat = RDATFile()
# Close the input file as soon as it has been parsed instead of leaking the handle.
with open(sys.argv[1]) as rdat_handle:
    rdat.load(rdat_handle)

# list(...) keeps the "first entry" lookup valid whether values() returns a
# list (Python 2) or a view.
vals = array(list(rdat.values.values())[0])
for i in range(shape(vals)[0]):
    vals[i, :] = normalize(vals[i, :])
eigenrs = eigen_reactivities(vals)

matshow(vals)
#mshow(vals, cmap=get_cmap('Greys'), vmin=0, vmax=vals.mean(), aspect='auto', interpolation='nearest')
matshow(eigenrs)
#imshow(eigenrs, cmap=get_cmap('Greys'), vmin=eigenrs.min(), vmax=eigenrs.mean(), aspect='auto', interpolation='nearest')
show()

construct = list(rdat.constructs.values())[0]
# The sequence does not change between iterations, so look it up once.
sequence = construct.sequence
# Only the first 35 eigen-reactivities are folded and drawn.
for i, e in enumerate(eigenrs[:35]):
    md = MappingData(
        data=e,
        seqpos=[s - construct.offset - 1 for s in construct.seqpos])
    # Fold once and reuse the result for both the printout and the drawing;
    # previously the same fold was computed twice per iteration.
    folded = fold(sequence, mapping_data=md)
    # Single-argument print(...) emits the same text under Python 2 and 3.
    print(folded)
    structure = folded[0].dbn
    VARNA.cmd(sequence, structure, 'test_results/eigen_struct%s.png' % i)
# Script fragment: walk every RDAT file in args.rdatdir and, for each wild-type
# data row that carries a 'modifier' annotation, compute the spread of its
# (optionally normalised) values. `parser` is defined outside this excerpt.
args = parser.parse_args()
# Names of the structural fragment categories tracked below
fragtypes = ['all', 'helices', 'interiorloops', 'hairpins', 'dangles', 'bulges',\
             '2wayjunctions', '3wayjunctions', '4wayjunctions', '5wayjunctions', 'unpaired', 'edgepairs', 'internalpairs']
# One empty accumulator per fragment type: lists in db/dberrors, a dict in dbidx.
# What gets stored in them is not visible in this excerpt.
db = {}
dberrors = {}
dbidx = {}
for t in fragtypes:
    db[t] = []
    dberrors[t] = []
    dbidx[t] = {}
for filename in os.listdir(args.rdatdir):
    # Sub-directories are skipped; every other directory entry is parsed as an RDAT file
    if not os.path.isdir(args.rdatdir+'/'+filename):
        print filename
        rdat = RDATFile()
        # NOTE(review): the file object passed to load() is never closed explicitly
        rdat.load(open(args.rdatdir+'/'+filename))
        for cname in rdat.constructs:
            construct = rdat.constructs[cname]
            struct = SecondaryStructure(construct.structure)
            # frags is not used in the visible part of the loop
            frags = struct.explode()
            for data in construct.data:
                # Keep wild-type rows only: no 'mutation' annotation at all, or 'WT' listed in it
                if (('mutation' not in data.annotations) or \
                    ('mutation' in data.annotations and \
                     'WT' in data.annotations['mutation'])):
                    if 'modifier' in data.annotations:
                        if args.normalize:
                            normvals = normalize(data.values)
                        else:
                            normvals = data.values
                        # 75th minus 25th percentile score of the values
                        # (presumably scipy.stats.scoreatpercentile - confirm the import)
                        iqr = scoreatpercentile(normvals, 75) - scoreatpercentile(normvals, 25)
def parse_rdat_data(request, is_get_file):
    """Extract sequences, structures and reactivity bonuses from an RDAT file.

    The RDAT file is either the uploaded ``request.FILES['rdatfile']``
    (``is_get_file`` true) or the current version of the RMDB entry named by
    ``request.POST['rmdbid']``. Every data row that has a modifier (file-level
    or per-row) contributes one sequence, one title, one list of stringified
    values, one list of zero-based positions and one modifier string.

    :param request: request object exposing ``POST`` and ``FILES`` mappings;
        ``POST['sequences']`` is always read, ``POST['rmdbid']`` when
        ``is_get_file`` is false, and the presence of ``POST['clipsequence']``
        switches on clipping of sequence/structure to the SEQPOS range
    :param is_get_file: true to read the uploaded file, false to read from
        the RMDB data directory
    :returns: tuple ``(messages, valerrors, bonuses_1d, bonuses_2d, titles,
        modifiers, offset_seqpos, temperature, sequences, refstruct)``.
        ``bonuses_1d`` and ``bonuses_2d`` are reset for every construct, so
        they describe the last construct only (empty lists when the file has
        no constructs); ``valerrors`` is always returned empty.
    """
    sequences, titles, modifiers, messages, valerrors, offset_seqpos = (
        [], [], [], [], [], [])
    temperature = 37
    # Bound up front so the return statement also works for a file without
    # constructs (previously a NameError).
    bonuses_1d = []
    bonuses_2d = []
    rdatfile = RDATFile()
    refstruct = secondary_structure.SecondaryStructure()

    if len(request.POST['sequences']):
        messages.append(
            'WARNING: Using sequences and/or structures from received RDAT file content. Original input in fields were overwritten.'
        )

    if is_get_file:
        uploadfile = request.FILES['rdatfile']
        rf = write_temp_file('/tmp/%s' % uploadfile.name)
    else:
        rmdbid = request.POST['rmdbid'].strip()
        version = RMDBEntry.get_current_version(rmdbid)
        rf = open(
            PATH.DATA_DIR['FILE_DIR'] + '/%s/%s_%s.rdat' %
            (rmdbid, rmdbid, version), 'r')
    # Release the handle even when parsing fails.
    try:
        rdatfile.load(rf)
    finally:
        rf.close()

    # A file-level modifier applies to every data row and takes precedence
    # over per-row modifier annotations.
    is_modified = 'modifier' in rdatfile.annotations
    if is_modified:
        modifier = ','.join(rdatfile.annotations['modifier'])
    clip_to_seqpos = 'clipsequence' in request.POST

    for cname in rdatfile.constructs:
        c = rdatfile.constructs[cname]
        if 'temperature' in c.annotations:
            temperature = c.annotations['temperature']
        bonuses_1d = []
        bonuses_2d = []
        # Taken before c.seqpos may be replaced by the fallback below.
        seqpos_min = min(c.seqpos)

        if clip_to_seqpos:
            # Position p lives at string index p - offset - 1, so the string
            # must be strictly longer than the largest index. The previous
            # guard (len >= max index) let len == max index through and then
            # raised IndexError instead of emitting the warning.
            if len(c.sequence) > max(c.seqpos) - c.offset - 1:
                seq_clipped = ''.join(
                    [c.sequence[i - c.offset - 1] for i in sorted(c.seqpos)])
            else:
                messages.append(
                    'WARNING: SEQUENCE and SEQPOS mismatch for construct %s in RDAT file. SEQPOS ignored.'
                    % c.name)
                c.seqpos = [(i + 1) for i in range(len(c.sequence))]
                seq_clipped = c.sequence
            if len(c.structure) > max(c.seqpos) - c.offset - 1:
                struct_clipped = ''.join(
                    [c.structure[i - c.offset - 1] for i in sorted(c.seqpos)])
            else:
                messages.append(
                    'WARNING: STRUCTURE and SEQPOS mismatch for construct %s in RDAT file. STRUCTURE ignored.'
                    % c.name)
                struct_clipped = '.' * (max(c.seqpos) - c.offset - 1)
                # NOTE(review): whether this assignment belonged to the
                # fallback branch only cannot be told from the collapsed
                # source; c.structure is not read again in this function.
                c.structure = struct_clipped
            seq = seq_clipped
            struct = struct_clipped
        else:
            seq = c.sequence
            struct = c.structure

        # The first construct seen provides the reference structure.
        if len(refstruct) == 0:
            refstruct = secondary_structure.SecondaryStructure(dbn=struct)

        for d in c.data:
            if not (is_modified or ('modifier' in d.annotations)):
                continue
            s = seq
            is_2d = False
            if 'mutation' in d.annotations:
                for mut in d.annotations['mutation']:
                    # A wild-type marker ends mutation processing for this row.
                    if 'WT' == mut.strip():
                        break
                    is_2d = True
                    # The code takes the position from between the first and
                    # last character and the new base from the last character.
                    idx = int(mut.strip()[1:-1])
                    base = mut[-1]
                    # NOTE(review): positions are indexed as p - offset - 1
                    # elsewhere in this function but as idx - offset here;
                    # confirm which is intended.
                    s = s[:idx - c.offset] + base + s[idx - c.offset + 1:]
                titles.append(';'.join(d.annotations['mutation']))
            else:
                titles.append(cname)
            sequences.append(s)
            bonuses_1d.append([str(x) for x in d.values])

            # Shift that makes SEQPOS zero-based: relative to the clipped
            # sequence when clipping, otherwise to the full sequence.
            if clip_to_seqpos:
                offset = seqpos_min
            else:
                offset = c.offset + 1
            offset_seqpos.append([i - offset for i in c.seqpos])

            if is_2d:
                if len(bonuses_2d) == 0:
                    bonuses_2d = zeros([len(seq), len(seq)])
                # Column = position of the row's last mutation,
                # row = probed position.
                for i, pos in enumerate(c.seqpos):
                    bonuses_2d[pos - offset, idx - offset] = d.values[i]

            if is_modified:
                modifiers.append(modifier)
            else:
                modifiers.append(','.join(d.annotations['modifier']))

    return (messages, valerrors, bonuses_1d, bonuses_2d, titles, modifiers,
            offset_seqpos, temperature, sequences, refstruct)
def get_constructs_from_rdats(dir):
    """
    using rdatkit parse all RDAT files in the directory specified and
    parse each construct's sequence, structure and score into construct
    objects. ONLY files with .rdat extension will be recognized as RDAT
    files other files will be skipped

    Only the first construct of every RDAT file is read. A data entry is
    skipped when it has no ``signal_to_noise`` annotation, when that
    annotation starts with ``weak``, or when either ``MAPseq`` field
    mentions "Mutate and Map".

    :params dir: directory with rdat files
    :type dir: str
    :returns: List of Construct Objects
    :raises ValueError: if the directory holds no file ending in ``.rdat``
    """
    # Match the documented ".rdat" extension exactly; the previous test
    # (last four characters == "rdat") also accepted names such as "foordat".
    rdat_files = [path for path in glob.glob(dir + "/*")
                  if path.endswith(".rdat")]
    if not rdat_files:
        raise ValueError("no rdat files in directory "+dir+" files must have rdat extension to be recognized")

    construct_objs = []
    mm = re.compile("Mutate and Map")
    # Iterate the filtered list. The previous loop walked every directory
    # entry, so non-RDAT files were handed to the RDAT parser.
    for path in rdat_files:
        r = RDATFile()
        # Close each file after parsing instead of leaking the handle.
        with open(path) as handle:
            r.load(handle)
        # list(...) keeps the "first construct" lookup valid whether values()
        # returns a list (Python 2) or a view.
        first_construct = list(r.constructs.values())[0]
        for entry in first_construct.data:
            annotations = entry.annotations
            # some data entries dont have signal_to_noise variable, skip over
            # them
            if 'signal_to_noise' not in annotations:
                continue
            # dont want to include weak data
            if annotations['signal_to_noise'][0].split(":")[0] == "weak":
                continue
            name = annotations['MAPseq'][0]
            project_name = annotations['MAPseq'][1]
            # mutate and map data wont be useful since target structure is not
            # correct with the mutation
            if mm.search(name) or mm.search(project_name):
                continue
            # The score is the third ':'-separated field of the EteRNA annotation.
            score = annotations['EteRNA'][0].split(":")[2]
            construct_objs.append(
                Construct(seq=annotations['sequence'][0],
                          ss=annotations['structure'][0],
                          score=score))
    return construct_objs
# Script fragment: read one RDAT file and, for every data section of its first
# construct, list the distinct suboptimal structures of that section's sequence
# ordered by energy. The `import matplotlib` and RDATFile imports are outside
# this excerpt.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import rdatkit.secondary_structure as ss
import sys
import pickle
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("infile",help="input file name, please end with .rdat")
parser.add_argument("outfile",help="output file name no extition needed")
args = parser.parse_args()
#import rdat data
rdat = RDATFile()
# NOTE(review): the input directory is hard-coded and the file object is never closed explicitly
rdat.load(open('/home/qmac/projects/testdir/'+args.infile))
# offset and the three lists below are not used in the visible part of the script
offset=0
# Despite the plural name this is a single construct: the first entry of
# rdat.constructs (list indexing of .values() requires Python 2)
constructs = rdat.constructs.values()[0]
competing_pairs = []
sequences_included=[]
msrmtsNumbers=[]
print 'lenth of constructs.data', len(constructs.data)
for count in range(0,len(constructs.data)):
    dsection = constructs.data[count]
    # Each data section carries its own sequence annotation; its first entry is used
    seq=dsection.annotations['sequence'][0]
    #structs=ss.fold(seq,nstructs=2)
    # zip(*...) pairs each returned structure with its energy - assumes subopt
    # returns (structures, energies) when energies=True; confirm against rdatkit
    struct_energy_list =[(struct.dbn, energy) for struct, energy in zip(*ss.subopt(seq,nstructs=100,fraction=0.075,energies=True))]
    # Drop duplicate (dot-bracket, energy) pairs, then sort by ascending energy
    struct_energy_list_unique = list(set(struct_energy_list))
    struct_energy_list_unique = sorted(struct_energy_list_unique, key=lambda x: x[1])
args = parser.parse_args() fragtypes = ['all', 'helices', 'interiorloops', 'hairpins', 'dangles', 'bulges',\ '2wayjunctions', '3wayjunctions', '4wayjunctions', '5wayjunctions', 'unpaired', 'edgepairs', 'internalpairs'] db = {} dberrors = {} dbidx = {} for t in fragtypes: db[t] = [] dberrors[t] = [] dbidx[t] = {} for filename in os.listdir(args.rdatdir): if not os.path.isdir(args.rdatdir + '/' + filename): print filename rdat = RDATFile() rdat.load(open(args.rdatdir + '/' + filename)) for cname in rdat.constructs: construct = rdat.constructs[cname] struct = SecondaryStructure(construct.structure) frags = struct.explode() for data in construct.data: if (('mutation' not in data.annotations) or \ ('mutation' in data.annotations and \ 'WT' in data.annotations['mutation'])): if 'modifier' in data.annotations: if args.normalize: normvals = normalize(data.values) else: normvals = data.values iqr = scoreatpercentile(