def import_MSStudio(infile, state=None):
    """ Convert a MS Studio file into a Dataset object.

    @param infile - path to the MS Studio file
    @param state - unused placeholder kept for interface compatibility
    @return the Dataset referencing the input file

    NOTE(review): parsing of the MS Studio format does not appear to be
    implemented yet - the function only wraps the file path in a Dataset
    with default Conditions.
    """
    conditions = Conditions()
    data = Dataset(conditions=conditions, input_file=infile)
    # BUGFIX: the original built the Dataset but implicitly returned None;
    # return it for consistency with the other import_* functions.
    return data
def import_HXcolumns(infile, sequence, name=None, percentD=False, conditions=None,
                     error_estimate=5.0, n_fastamides=2, offset=0):
    """ Convert a set of HX columns into a Dataset object.

    The csv file must contain the columns peptide_seq, start_res, time,
    D_inc and score.

    @param infile - path to the csv file
    @param sequence - full macromolecule sequence string
    @param name - name for the resulting Dataset
    @param percentD - True if D_inc values are already %D; otherwise they are
           converted using the number of observable amides
    @param conditions - a Conditions object; a default one is created if absent
    @param error_estimate - estimated experimental error passed to the Dataset
    @param n_fastamides - number of fast-exchanging N-terminal amides
    @param offset - residue-numbering offset applied to start_res
    @return the populated Dataset
    """
    if type(conditions) is not Conditions:
        print("Standard Conditions used. Please modify these in the script if you are not at 283K and pH=7")
        conditions = Conditions()

    # BUGFIX: use a context manager so the file handle is always closed
    # (the original left it open).
    with open(infile, "r") as f:
        line = f.readline()
        dataset = Dataset(name=name, sequence=sequence, conditions=conditions,
                          error_estimate=error_estimate, offset=offset,
                          input_file=infile)
        column_headers = line.rstrip().split(",")

        for line in f.readlines():
            fields = line.split(",")
            # Renamed from `sequence` so the macromolecule-sequence parameter
            # is not shadowed.
            pep_sequence = fields[column_headers.index("peptide_seq")]
            start_res = int(fields[column_headers.index("start_res")]) + offset
            time = float(fields[column_headers.index("time")])
            deut = float(fields[column_headers.index("D_inc")])
            if not percentD:
                # Convert absolute incorporation into %D over observable amides.
                deut = deut / tools.calculate_number_of_observable_amides(pep_sequence, n_fastamides) * 100
            score = float(fields[column_headers.index("score")])

            new_peptide = dataset.create_peptide(pep_sequence, start_res)
            if new_peptide is not None:
                # A timepoint of 0.0 carries no exchange information; skip it.
                if time == 0.0:
                    continue
                if deut < 105:  # XXX Hard coded cap on %D value. Not ideal.
                    # Reuse the timepoint if this time already exists,
                    # otherwise create a new one.
                    if time in [tp.time for tp in new_peptide.get_timepoints()]:
                        tp = new_peptide.get_timepoint_by_time(time)
                    else:
                        tp = new_peptide.add_timepoint(time)
                    # Add the deuteration value as a replicate.
                    # Any other replicate information from the file should be
                    # added at this step.
                    tp.add_replicate(deut, score=score)
                    # BUGFIX: removed a stray second call to
                    # new_peptide.add_timepoint(time) here, which created a
                    # duplicate (empty) timepoint for every replicate.

    dataset.calculate_observable_rate_bounds()
    dataset.calculate_observable_protection_factors()
    return dataset
def import_json(infile, name=""):
    """ Import a Dataset from a json file written by Dataset.write_to_file().

    @param infile - path to the json file
    @param name - unused; the dataset name is taken from the file contents
    @return the reconstructed Dataset
    """
    import json
    with open(infile) as json_data:
        d = json.load(json_data)

    # Restore the experimental conditions.
    cond_dict = d["conditions"]
    conditions = Conditions()
    for a in cond_dict:
        # BUGFIX: the original did `conditions.a = cond_dict[a]`, which
        # repeatedly overwrote a literal attribute named "a" instead of
        # restoring each condition attribute by name.
        setattr(conditions, a, cond_dict[a])

    dataset = Dataset(d["name"], conditions, d["sequence"],
                      input_file=d["raw_data_file"],
                      error_estimate=d["error_estimate"],
                      offset=d["offset"],
                      number_fast_exchanging_amides=d["num_fast_exchanging_amides"],
                      percent_deuterium=d["percent_deuterium"])

    # Rebuild each peptide, its timepoints and their replicates.
    pep_dict = d['peptides']
    for pep in pep_dict:
        p = pep_dict[str(pep)]
        peptide = dataset.create_peptide(p["sequence"], p["start_residue"],
                                         peptide_id=pep,
                                         charge_state=p["charge_state"],
                                         retention_time=p["retention_time"],
                                         sigma=p["sigma"])
        tp_dict = p["timepoints"]
        for tp in tp_dict:
            timepoint = peptide.add_timepoint(tp_dict[str(tp)]["time"],
                                              tp_dict[str(tp)]["sigma"])
            reps = tp_dict[str(tp)]["replicates"]
            for rep in reps:
                timepoint.add_replicate(reps[str(rep)]["deut"],
                                        experiment_id=reps[str(rep)]["experiment_id"],
                                        score=reps[str(rep)]["reliability"],
                                        rt=reps[str(rep)]["rt"])
    return dataset
def __init__(self, system, infile, sigma0=1.0):
    """Parse an HDXWorkbench-style csv file and populate model states.

    Reads header "param,value" pairs, then per-line consolidated-fragment
    and replicate rows, creating states/fragments and attaching timepoints.

    @param system - stored on self.system
    @param infile - path of the csv file to parse
    @param sigma0 - default sigma forwarded to add_timepoint_to_frag

    NOTE(review): the body references a name `model` that is not defined in
    this method or visible in this file - presumably it should be an
    attribute of `system` or `self`; confirm before relying on this code.
    """
    self.system = system
    self.temp=0
    self.theta=0.1
    self.file=infile
    self.conditions = Conditions()
    f=open(infile,"r")  # NOTE(review): this file handle is never closed
    line=f.readline()
    column_headers=None
    # Get Header values
    # All are 2 value lines
    while len(line.split(','))==2:
        self.get_header_value(line)
        line=f.readline()
        #print line
    for line in f:
        # NOTE(review): str.split always returns at least one element, so
        # this guard can never trigger.
        if len(line.split(','))==0:
            continue
        # The first line beginning with "peptide" is the column-header row.
        if line.split(',')[0]=="peptide" and column_headers is None:
            column_headers=self.get_column_headers(line)
            continue
        if len(line.split(',')) >= 1 and column_headers != None and line.split(',')[column_headers.index("percentd_replicate")]=="NA":
            state=[]
            #This is a consolidated fragment entry. Just grab the state names for now (only two states in current format) and add to model if not already present.
            state.append(line.split(',')[column_headers.index("sample1_name")].replace(" ","_"))
            state.append(line.split(',')[column_headers.index("sample2_name")].replace(" ","_"))
            for s_data in state:
                add_state_to_model=True
                for s_mod in model.states:
                    if s_mod.state_name==s_data:
                        add_state_to_model=False
                if add_state_to_model==True:
                    model.add_state(s_data)
            # Need some mechanism to add mole_fraction_liganded if weak binding ligand. Current default is 100% binding
            continue
        if len(line.split(',')) >= 1 and column_headers != None and line.split(',')[column_headers.index("percentd_replicate")]!="NA":
            #This is replicate data. First see if it was discarded:
            discarded=line.split(',')[column_headers.index("discarded_replicate")]
            # NOTE(review): `discarded` is a string, so the ==True comparison
            # can never be true; the string comparisons do the real work.
            if discarded==True or discarded=="T" or discarded == "true" or discarded == "True" or discarded == "TRUE":
                continue
            #Get fragment seq / start / end
            frag_seq=line.split(',')[column_headers.index("peptide")]
            start_res=int(line.split(',')[column_headers.index("start")])
            end_res=int(line.split(',')[column_headers.index("end")])
            state_name=line.split(',')[column_headers.index("sample")].replace(" ","_")
            #First, see if this fragment is already in the model states
            for s in model.states:
                if s.state_name==state_name:
                    if self.is_this_fragment_in_state(s, frag_seq, start_res)==False:
                        state_frag=s.create_fragment(frag_seq, start_res, end_res)
                        if state_frag is not None:
                            print("Fragment ", frag_seq, "created for state", s.state_name)
                        else:
                            print("Skipping this fragment")
                    else:
                        #if it is not,
                        state_frag=next((f for f in s.frags if f.seq==frag_seq and f.start_res==start_res), None)
                    if state_frag is not None:
                        self.add_timepoint_to_frag(state_frag, column_headers, line, default_sig = sigma0, empirical_sig=True)
                        #print state_frag.timepoints[-1].time, state_frag.timepoints[-1].replicates[-1].deut
    # Finalize sector and coverage bookkeeping for every state.
    for s in model.states:
        s.get_sectors(s.frags)
        s.get_coverage(s.frags)
def import_HDXWorkbench(infile, macromolecule=None, name="Data", sequence=None,
                        error_estimate=5.0, n_fastamides=2, offset=0, max_t=36000):
    '''
    Import an HDXWorkbench csv file.

    HDXWorkbench files are different because they contain information from
    two experiments. They receive a macromolecule rather than a single state
    and will create each state within that macromolecule. Will also create a
    list of two Dataset objects if no macromolecule is passed.

    @param infile - path to the HDXWorkbench csv file
    @param macromolecule - optional macromolecule that receives one state
           per dataset found in the file
    @param name - default experiment name (overridden by "Experiment name")
    @param sequence - macromolecule sequence; taken from the macromolecule or
           the file header if not given
    @param error_estimate - estimated experimental error for the Datasets
    @param n_fastamides - number of fast-exchanging N-terminal amides
    @param offset - residue numbering offset (currently unused here)
    @param max_t - timepoints later than this (seconds) are ignored
    @return list of Dataset objects, one per state found in the file
    '''
    # Set up the dataset conditions.
    conditions = Conditions()

    if macromolecule is not None and sequence is None:
        sequence = macromolecule.get_sequence()

    column_headers = None  # Set when the "peptide" header row is found.
    states = set()
    # BUGFIX: pre-bind so a malformed file whose replicate rows precede the
    # consolidated row cannot raise NameError on `datasets`.
    datasets = []

    # BUGFIX: context manager ensures the file handle is closed
    # (the original leaked it).
    with open(infile, "r") as f:
        line = f.readline()

        # Header section: all lines are "param,value" pairs.
        while len(line.split(',')) == 2:
            param = line.split(',')[0]
            # BUGFIX: strip the trailing newline; without this the string
            # comparisons below (e.g. "true"/"false") could never match.
            value = line.split(',')[1].strip()
            if param == "Used fully deuterated controls":
                if value == "true" or value == "True" or value == "T":
                    conditions.fully_deuterated = True
                elif value == "false" or value == "False" or value == "F":
                    conditions.fully_deuterated = False
            if param == "Temperature":
                conditions.temperature = float(value) + 273.15
            if param == "Offset":
                conditions.offset = int(value)
            if param == "Deuterium solution concentration":
                conditions.saturation = float(value)
            if param == "Recovery estimate":
                conditions.recovery = float(value)
            if param == "Experiment Protein Sequence":
                file_sequence = str(value)
                if file_sequence != sequence and sequence is not None:
                    print("WARNING: Sequence in HDXWorkbench file does not match inputted sequence")
                sequence = file_sequence.strip()
            if param == "Experiment name":
                name = str(value).replace(" ", "_")
            line = f.readline()

        for line in f:
            fields = line.split(',')
            # Skip empty rows.
            if len(fields) == 0:
                continue

            # The first line beginning with "peptide" is the column-header row.
            if fields[0] == "peptide" and column_headers is None:
                column_headers = line.split(",")
                continue

            # Consolidated fragment entry: grab the two state names
            # (only two states in the current format) and build one Dataset
            # for each; runs only once because of the len(states)==0 guard.
            if len(fields) >= 1 and column_headers is not None \
                    and fields[column_headers.index("percentd_replicate")] == "NA" \
                    and len(states) == 0:
                states.add(fields[column_headers.index("sample1_name")].replace(" ", "_"))
                states.add(fields[column_headers.index("sample2_name")].replace(" ", "_"))
                for s in states:
                    d = Dataset(name=s, sequence=sequence, conditions=conditions,
                                error_estimate=error_estimate, input_file=infile,
                                percent_deuterium=True)
                    datasets.append(d)

            #############################################################
            # This is replicate data.
            #############################################################
            if len(fields) >= 1 and column_headers is not None \
                    and fields[column_headers.index("percentd_replicate")] != "NA":
                # First see if it was discarded.
                discarded = fields[column_headers.index("discarded_replicate")]
                if discarded == True or discarded == "T" or discarded == "true" \
                        or discarded == "True" or discarded == "TRUE":
                    continue

                # If not, get the peptide seq / start and other parameters.
                charge_state = int(fields[column_headers.index("charge")])
                peptide_sequence = fields[column_headers.index("peptide")]
                start_residue = int(fields[column_headers.index("start")])
                state_name = fields[column_headers.index("sample")].replace(" ", "_")
                replicate_id = int(fields[column_headers.index("replicate")])
                try:
                    replicate_score = float(fields[column_headers.index("score_replicate")])
                except ValueError:
                    # Some rows shift the score one column to the right.
                    try:
                        replicate_score = float(fields[column_headers.index("score_replicate") + 1])
                    except ValueError:
                        replicate_score = 0

                # Retention time is a tuple with the start and end.
                replicate_rt = (float(fields[column_headers.index("rt_start_replicate")]),
                                float(fields[column_headers.index("rt_end_replicate")]))

                for data in datasets:
                    if state_name == data.name:
                        # If the peptide is not there, create the peptide.
                        is_pep_in, new_peptide = data.is_this_peptide_in_dataset(
                            peptide_sequence, start_residue, charge_state)
                        if new_peptide is None:
                            new_peptide = data.create_peptide(sequence=peptide_sequence,
                                                              start_residue=start_residue,
                                                              charge_state=charge_state)
                        # Once the peptide is there (or just created), add data.
                        if new_peptide is not None:
                            time = float(fields[column_headers.index("timepoint")].replace("s", ""))
                            # Zero-time rows carry no information; rows past
                            # max_t are out of range. Skip both.
                            if time == 0.0 or time > max_t:
                                continue
                            deut = float(fields[column_headers.index("percentd_replicate")])
                            if deut < 155:  # XXX Hard coded cap on %D value. Not ideal.
                                # Reuse an existing timepoint at this time,
                                # or create a new one.
                                if time in [tp.time for tp in new_peptide.get_timepoints()]:
                                    tp = new_peptide.get_timepoint_by_time(time)
                                else:
                                    tp = new_peptide.add_timepoint(time)
                                # Add the deuteration value as a replicate.
                                # Any other replicate information from the file
                                # should be added at this step.
                                tp.add_replicate(deut, experiment_id=replicate_id,
                                                 score=replicate_score, rt=replicate_rt)
                                # BUGFIX: removed a stray second call to
                                # new_peptide.add_timepoint(time), which created
                                # a duplicate (empty) timepoint per replicate.

    if macromolecule is not None:
        for d in datasets:
            d.calculate_observable_rate_bounds()
            d.calculate_observable_protection_factors()
            s = macromolecule.add_state(d.name)
            s.add_dataset(d)
    return datasets