Example #1
def import_MSStudio(infile, state=None):
    """ Function for converting a MS Studio file
    into a Dataset object
    """
    conditions = Conditions()

    data = Dataset(conditions=conditions, input_file=infile)

    # Return the newly constructed Dataset
    return data
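A minimal call sketch for the importer above; the file path is a placeholder and the call simply exercises the signature shown:

# Hypothetical usage ("msstudio_export.csv" is a placeholder path)
dataset = import_MSStudio("msstudio_export.csv")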
Example #2
def import_HXcolumns(infile, sequence, name=None, percentD=False, conditions=None, error_estimate=5.0, n_fastamides=2, offset=0):
    """ Function for converting a set of HX columns intoa Dataset object
    """

    if not isinstance(conditions, Conditions):
        print("Standard Conditions used.  Please modify these in the script if you are not at 283K and pH=7")
        conditions = Conditions()

    f = open(infile,"r")
    line = f.readline()

    dataset = Dataset(name=name, sequence=sequence, conditions=conditions, error_estimate=error_estimate, offset=offset, input_file=infile)

    column_headers = line.rstrip().split(",")  # First line of the file holds the column headers

    for line in f.readlines():

        fields = line.split(",")
        peptide_seq = fields[column_headers.index("peptide_seq")]
        start_res = int(fields[column_headers.index("start_res")]) + offset
        time = float(fields[column_headers.index("time")])
        deut = float(fields[column_headers.index("D_inc")])
        if not percentD:
            deut = deut / tools.calculate_number_of_observable_amides(peptide_seq, n_fastamides) * 100
        score = float(fields[column_headers.index("score")])

        new_peptide = dataset.create_peptide(peptide_seq, start_res)

        if new_peptide is not None:
            # If the time is 0.0, that's weird.  Ignore that.
            if time==0.0:
                continue

            if deut < 105:  # XXX Hard coded cap on %D value.  Not ideal.

                # If timepoint is already in fragment, grab that timepoint
                if time in [tp.time for tp in new_peptide.get_timepoints()]:
                    tp = new_peptide.get_timepoint_by_time(time)
                #If not, add a new timepoint at this time.  What to put for sigma??
                else:
                    tp = new_peptide.add_timepoint(time)

                # add the deuteration value as a replicate.
                # Any other replicate information from the file should be added at this step.
                tp.add_replicate(deut, score=score)

            new_peptide.add_timepoint(time)

    dataset.calculate_observable_rate_bounds()
    dataset.calculate_observable_protection_factors()

    return dataset
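A usage sketch for import_HXcolumns, assuming a CSV with the peptide_seq, start_res, time, D_inc, and score columns read above; the file name and protein sequence are invented placeholders:

# Hypothetical usage sketch (file name and sequence are placeholders)
conditions = Conditions()   # default conditions (283 K, pH 7), per the warning above
dataset = import_HXcolumns(
    "hx_timecourse.csv",
    sequence="MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ",
    name="apo_state",
    percentD=True,          # set False if D_inc holds deuteron counts rather than %D
    conditions=conditions,
    error_estimate=5.0,
    n_fastamides=2,
    offset=0,
)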
Example #3
def import_json(infile, name=""):
    """Import a JSON file written by Dataset.write_to_file()."""
    import json
    with open(infile) as json_data:
        d = json.load(json_data)

    cond_dict = d["conditions"]
    conditions = Conditions()
    for attr in cond_dict:
        # Copy each stored condition onto the Conditions object by attribute name
        setattr(conditions, attr, cond_dict[attr])

    dataset = Dataset(d["name"], conditions, d["sequence"],
        input_file = d["raw_data_file"],
        error_estimate = d["error_estimate"],
        offset = d["offset"],
        number_fast_exchanging_amides = d["num_fast_exchanging_amides"],
        percent_deuterium = d["percent_deuterium"])

    pep_dict = d['peptides']

    for pep in pep_dict:
        p = pep_dict[str(pep)]
        peptide = dataset.create_peptide(p["sequence"], p["start_residue"], 
                            peptide_id=pep, 
                            charge_state=p["charge_state"], 
                            retention_time=p["retention_time"],
                            sigma=p["sigma"])
        tp_dict = p["timepoints"]
        for tp in tp_dict:
            timepoint = peptide.add_timepoint(tp_dict[str(tp)]["time"], tp_dict[str(tp)]["sigma"])
            reps = tp_dict[str(tp)]["replicates"]
            for rep in reps:
                timepoint.add_replicate(reps[str(rep)]["deut"], experiment_id=reps[str(rep)]["experiment_id"], 
                                score=reps[str(rep)]["reliability"], rt=reps[str(rep)]["rt"])

    return dataset
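The loader above is the counterpart of Dataset.write_to_file() referenced at the top of the function. A round-trip sketch, assuming a Dataset built by one of the other importers; the path is a placeholder and write_to_file() accepting a path is an assumption:

# Hypothetical round trip ("my_dataset.json" is a placeholder path;
# write_to_file() taking a path argument is an assumption)
dataset.write_to_file("my_dataset.json")   # dataset built elsewhere, e.g. by import_HXcolumns
reloaded = import_json("my_dataset.json")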
Example #4
    def __init__(self, system, infile, sigma0=1.0):
        self.system = system
        self.temp=0
        self.theta=0.1
        self.file=infile
        self.conditions = Conditions()

        f=open(infile,"r")
        line=f.readline()

        column_headers=None
        # Get Header values
        # All are 2 value lines
        while len(line.split(','))==2:
            self.get_header_value(line)
            line=f.readline()
            #print line

        for line in f:

            # Skip blank lines
            if len(line.strip()) == 0:
                continue

            if line.split(',')[0]=="peptide" and column_headers is None:
                column_headers=self.get_column_headers(line)
                continue

            if len(line.split(',')) >= 1 and column_headers is not None and line.split(',')[column_headers.index("percentd_replicate")]=="NA":
                state=[]
                #This is a consolidated fragment entry. Just grab the state names for now (only two states in current format) and add to model if not already present.
                state.append(line.split(',')[column_headers.index("sample1_name")].replace(" ","_"))
                state.append(line.split(',')[column_headers.index("sample2_name")].replace(" ","_"))

                for s_data in state:
                    add_state_to_model=True
                    for s_mod in model.states:
                        if s_mod.state_name==s_data:
                            add_state_to_model=False
                    if add_state_to_model==True:
                        model.add_state(s_data) # Need some mechanism to add mole_fraction_liganded if weak binding ligand. Current default is 100% binding
                continue

            if len(line.split(',')) >= 1 and column_headers is not None and line.split(',')[column_headers.index("percentd_replicate")]!="NA":
                #This is replicate data. First see if it was discarded:
                discarded=line.split(',')[column_headers.index("discarded_replicate")]
                if discarded.strip().lower() in ("t", "true"):
                    continue
                #Get fragment seq / start / end
                frag_seq=line.split(',')[column_headers.index("peptide")]
                start_res=int(line.split(',')[column_headers.index("start")])
                end_res=int(line.split(',')[column_headers.index("end")])
                state_name=line.split(',')[column_headers.index("sample")].replace(" ","_")

                #First, see if this fragment is already in the model states
                state_frag = None
                for s in model.states:
                    if s.state_name==state_name:
                        if not self.is_this_fragment_in_state(s, frag_seq, start_res):
                            state_frag=s.create_fragment(frag_seq, start_res, end_res)
                            if state_frag is not None:
                                print("Fragment", frag_seq, "created for state", s.state_name)
                            else:
                                print("Skipping this fragment")
                        else:  # If it is, grab the existing fragment
                            state_frag=next((f for f in s.frags if f.seq==frag_seq and f.start_res==start_res), None)

                if state_frag is not None:
                    self.add_timepoint_to_frag(state_frag, column_headers, line, default_sig = sigma0, empirical_sig=True)
                    #print state_frag.timepoints[-1].time, state_frag.timepoints[-1].replicates[-1].deut
        for s in model.states:
            s.get_sectors(s.frags)
            s.get_coverage(s.frags)
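Pieced together from the column names and header parameters referenced in this constructor and in import_HDXWorkbench below, the CSV layout these two parsers expect looks roughly like the sketch here; every value is an invented placeholder, and only the columns the code actually indexes are shown:

# Schematic of the HDX Workbench CSV layout assumed by these importers
# (all values are invented; column names come from the column_headers.index(...) calls)
WORKBENCH_SKETCH = """Experiment name,Example run
Experiment Protein Sequence,MKTAYIAK
Temperature,25

peptide,start,end,sample,sample1_name,sample2_name,charge,replicate,timepoint,percentd_replicate,discarded_replicate,score_replicate,rt_start_replicate,rt_end_replicate
MKTAYIAK,1,8,apo,apo,bound,,,,NA,,,,
MKTAYIAK,1,8,apo,,,2,1,30s,45.2,FALSE,0.98,10.1,10.6
"""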
Example #5
def import_HDXWorkbench(infile, macromolecule=None, name="Data", sequence=None, error_estimate=5.0, n_fastamides=2, offset=0,
                    max_t=36000):
    '''
    HDXWorkbench files are different because they contain information from two experiments.
    They receive a macromolecule rather than a single state and will create
    each state within that macromolecule.

    If no macromolecule is passed, a list of two Dataset objects is returned instead.
    '''
    # set up the dataset conditions
    conditions = Conditions()

    if macromolecule is not None and sequence is None:
        sequence = macromolecule.get_sequence()

    column_headers = None  # Pre-set to None.  See #XXXX

    f = open(infile,"r")
    line = f.readline()
    # First, let's get the header information from the workbench file
    # All lines are param, value pairs separated by a comma
    while len(line.split(','))==2:
        param=line.split(',')[0]
        value=line.split(',')[1].strip()  # strip the trailing newline so the comparisons below work
        if param=="Used fully deuterated controls":
            if value=="true" or value=="True" or value=="T":
                conditions.fully_deuterated = True
            elif value=="false" or value=="False" or value=="F":
                conditions.fully_deuterated=False
        if param=="Temperature":
            conditions.temperature = float(value)+273.15
        if param=="Offset":
            conditions.offset = int(value)
        if param=="Deuterium solution concentration":
            conditions.saturation = float(value)
        if param=="Recovery estimate":
            conditions.recovery = float(value)
        if param=="Experiment Protein Sequence":
            file_sequence = str(value)
            if file_sequence != sequence and sequence is not None:
                print("WARNING: Sequence in HDXWorkbench file does not match inputted sequence")
            sequence = file_sequence.strip()
        if param=="Experiment name":
            name = str(value).replace(" ","_")
        line=f.readline()
    states = set()
    datasets = []  # filled once the state names are known; stays empty if none are found
    for line in f:

        # Skip blank lines
        if len(line.strip()) == 0:
            continue

        #XXXX - the first line beginning with "peptide" is the list of column headers.
        if line.split(',')[0]=="peptide" and column_headers is None:
            column_headers = line.rstrip().split(",")
            continue

        # This is a consolidated fragment entry. Just grab the state names for now 
        # (only two states in current format).
        if len(line.split(',')) >= 1 and column_headers is not None and line.split(',')[column_headers.index("percentd_replicate")]=="NA" and len(states)==0:

            states.add(line.split(',')[column_headers.index("sample1_name")].replace(" ","_"))
            states.add(line.split(',')[column_headers.index("sample2_name")].replace(" ","_"))

            # Now create a Dataset for each state
            datasets=[]
            for s in states:
                d = Dataset(name=s, sequence=sequence, conditions=conditions, error_estimate=error_estimate, input_file=infile, percent_deuterium=True)
                datasets.append(d)

        #############################################################
        #  This is replicate data. 
        #############################################################
        if len(line.split(',')) >= 1 and column_headers is not None and line.split(',')[column_headers.index("percentd_replicate")]!="NA":
            # First see if it was discarded:
            discarded = line.split(',')[column_headers.index("discarded_replicate")]
            if discarded.strip().lower() in ("t", "true"):
                continue

            # If not, get the peptide seq / start 
            # ********** Also, potentially other parameters *************
            fields = line.split(",")
            charge_state = int(fields[column_headers.index("charge")])
            peptide_sequence = fields[column_headers.index("peptide")]
            start_residue = int(fields[column_headers.index("start")])
            state_name = fields[column_headers.index("sample")].replace(" ","_")
            replicate_id = int(fields[column_headers.index("replicate")])

            try:
                replicate_score = float(fields[column_headers.index("score_replicate")])
            except ValueError:
                # If the score column cannot be parsed, try the adjacent column,
                # then fall back to 0.
                try:
                    replicate_score = float(fields[column_headers.index("score_replicate")+1])
                except ValueError:
                    replicate_score = 0

            # retention time is a tuple with the start and end
            replicate_rt = (float(fields[column_headers.index("rt_start_replicate")]),
                            float(fields[column_headers.index("rt_end_replicate")]))

            new_peptide = None
            for data in datasets:
                if state_name == data.name:
                    # If the peptide is not there...create the peptide
                    is_pep_in, new_peptide = data.is_this_peptide_in_dataset(peptide_sequence, start_residue, charge_state)

                    if new_peptide is None:
                        new_peptide = data.create_peptide(sequence=peptide_sequence, start_residue=start_residue, charge_state=charge_state)

            # Now that we know the peptide is there (or has just been created), add the data
            if new_peptide is not None:
                time = float(line.split(',')[column_headers.index("timepoint")].replace("s",""))

                # Skip zero timepoints and anything longer than max_t
                if time==0.0 or time > max_t:
                    continue

                deut=float(line.split(',')[column_headers.index("percentd_replicate")])

                if deut < 155:  # XXX Hard coded cap on %D value.  Not ideal.

                    # If timepoint is already in fragment, grab that timepoint
                    if time in [tp.time for tp in new_peptide.get_timepoints()]:
                        tp = new_peptide.get_timepoint_by_time(time)
                    #If not, add a new timepoint at this time.  What to put for sigma??
                    else:
                        tp = new_peptide.add_timepoint(time)
         
                    # add the deuteration value as a replicate.
                    # Any other replicate information from the file should be added at this step.
                    tp.add_replicate(deut, experiment_id=replicate_id, score=replicate_score, rt=replicate_rt)

                new_peptide.add_timepoint(time)

    if macromolecule is not None:
        for d in datasets:
            d.calculate_observable_rate_bounds()
            d.calculate_observable_protection_factors()
            s = macromolecule.add_state(d.name)
            s.add_dataset(d)

    return datasets
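A usage sketch: when no macromolecule is passed, import_HDXWorkbench returns the per-state Dataset objects directly (the file path is a placeholder):

# Hypothetical call ("workbench_export.csv" is a placeholder path)
datasets = import_HDXWorkbench("workbench_export.csv", error_estimate=5.0, max_t=36000)
for d in datasets:
    print(d.name)   # one Dataset per sample state found in the file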