def __init__(self, filename, delimiter='\t', skip_bad=True):
    """
    Read a protein attributes file and store the parsed result in self.data.

    Expect files of the following format:

    Unique_ID, key1:value1, key2:value2, ..., keyn:valuen

    After construction, self.data maps each unique ID to a list with one
    entry per line on which that ID appeared. Each entry is whatever
    interface_tools.parse_key_value_pairs() returned for the key:value
    columns of that line (presumably a dictionary - confirm against
    interface_tools).

    Lines that hold nothing beyond the unique ID (this includes blank
    lines) carry no attributes and are ignored without a warning.

    Parameters
    ----------
    filename : str
        Path of the protein attributes file to read.

    delimiter : str
        String that separates the columns on each line. Cannot be ':'
        because that character separates keys from values.
        Default is a tab character.

    skip_bad : bool
        If True, a line from which no unique ID can be extracted triggers
        a warning and is skipped. If False, such a line raises an
        InterfaceException. Default = True.

    Raises
    ------
    InterfaceException
        If delimiter is ':', or if a line cannot be parsed while
        skip_bad is False.
    """

    if delimiter == ':':
        raise InterfaceException(
            'When parsing protein attributes file cannot use ":" as a delimeter because this is used to delimit key/value pairs (if provided)'
        )

    with open(filename, 'r') as fh:
        content = fh.readlines()

    ID2ADs = {}

    # line numbers are 1-based so they match what a text editor shows
    for linecount, line in enumerate(content, start=1):

        sline = line.strip().split(delimiter)

        try:
            unique_ID = sline[0].strip()
        except Exception as e:
            msg = 'Failed parsing file [%s] on line [%i].\n\nException raised: %s\n\nline printed below:\n%s' % (
                filename, linecount, str(e), line)

            if skip_bad:
                shephard_exceptions.print_warning(
                    msg + "\nSkipping this line...")
                continue

            raise InterfaceException(msg)

        # nothing but an ID on this line - there are no attributes to record
        if len(sline) < 2:
            continue

        # every remaining column is expected to be a key:value pair
        attributes = interface_tools.parse_key_value_pairs(
            sline[1:], filename, linecount, line)

        # a unique ID may appear on several lines; keep one entry per line
        ID2ADs.setdefault(unique_ID, []).append(attributes)

    self.data = ID2ADs
def add_protein_attributes_from_dictionary(proteome, protein_attribute_dictionary, safe=True, verbose=True):
    """
    Attach the attributes held in a protein attribute dictionary to the
    matching proteins of a Proteome.

    A protein attribute dictionary maps a unique protein ID to a list of
    dictionaries. Every key/value pair of every one of those dictionaries
    is added as an attribute to the protein with that unique ID. Entries
    whose unique ID does not belong to a protein in the Proteome are never
    looked at.

    Parameters
    ----------
    proteome : Proteome Object
        Proteome object whose proteins receive the attributes.

    protein_attribute_dictionary : dict
        Maps unique protein IDs to a list of dictionaries, each of which
        holds one or more attribute key:value pairs.

    safe : boolean
        Forwarded to protein.add_attribute(). If a ProteinException is
        raised while adding an attribute and safe is True, the error is
        passed to shephard_exceptions.print_and_raise_error() (presumably
        this re-raises - confirm against shephard_exceptions). If safe is
        False the attribute in question is skipped instead.
        Default = True.

    verbose : boolean
        Only relevant when safe is False: if True a warning is printed
        for every attribute that was skipped. Default = True.

    Returns
    -----------
    None
        No return value, but attributes are added to proteins in the
        Proteome object passed as the first argument.
    """

    # refuse anything that is not a Proteome before touching it
    interface_tools.check_proteome(
        proteome, 'add_protein_attributes (si_protein_attributes)')

    for protein in proteome:

        # proteins the dictionary says nothing about are left untouched
        if protein.unique_ID not in protein_attribute_dictionary:
            continue

        # each entry in the list is its own attribute dictionary
        for attribute_set in protein_attribute_dictionary[protein.unique_ID]:
            for key, value in attribute_set.items():
                try:
                    protein.add_attribute(key, value, safe=safe)
                except ProteinException as e:
                    msg = '- skipping attribute entry on protein %s (key: %s) ' % (
                        protein.unique_ID, key)

                    if safe:
                        shephard_exceptions.print_and_raise_error(msg, e)
                    elif verbose:
                        shephard_exceptions.print_warning(msg)
def add_tracks_from_dictionary(proteome, tracks_dictionary, mode, safe=True, verbose=True):
    """
    Attach the tracks held in a tracks dictionary to the matching proteins
    of a Proteome.

    A tracks dictionary maps a unique protein ID to a list of dictionaries,
    each of which describes one track through two keys:

    'track_name' : name of the track (str)

    'track_data' : parsed list of floats (if expecting values) or strings
    (if expecting symbols) that should equal the length of the associated
    protein.

    Parameters
    ----------
    proteome : Proteome Object
        Proteome object whose proteins receive the tracks.

    tracks_dictionary : dict
        Maps unique protein IDs to a list of track dictionaries as
        described above.

    mode : string {'symbols','values'}
        Selects how 'track_data' is handed to protein.add_track(): via the
        values keyword when mode is 'values', via the symbols keyword when
        mode is 'symbols'. Checked with general_utilities.valid_keyword().

    safe : bool (default = True)
        Forwarded to protein.add_track(). If a ProteinException or
        TrackException is raised while adding a track and safe is True,
        the error is passed to shephard_exceptions.print_and_raise_error()
        (presumably this re-raises - confirm against shephard_exceptions).
        If safe is False the track in question is skipped instead.

    verbose : boolean
        Only relevant when safe is False: if True a warning is printed for
        every track that was skipped. Default = True.

    Returns
    -----------
    None
        No return value, but tracks are added to the Proteome object passed
        as the first argument.
    """

    # validate the two arguments we can check up front
    interface_tools.check_proteome(proteome,
                                   'add_tracks_from_dictionary (si_tracks)')
    general_utilities.valid_keyword('mode', mode, ['symbols', 'values'])

    # after validation anything that is not 'values' is handled as symbols
    data_keyword = 'values' if mode == 'values' else 'symbols'

    for protein in proteome:

        # proteins without an entry in the dictionary are left untouched
        if protein.unique_ID not in tracks_dictionary:
            continue

        for entry in tracks_dictionary[protein.unique_ID]:

            track_name = entry['track_name']
            track_data = entry['track_data']

            try:
                protein.add_track(track_name,
                                  safe=safe,
                                  **{data_keyword: track_data})

            # one of the anticipated failures occurred while adding the track
            except (ProteinException, TrackException) as e:
                msg = '- skipping track at %s on %s' % (track_name, protein)

                if safe:
                    shephard_exceptions.print_and_raise_error(msg, e)
                elif verbose:
                    shephard_exceptions.print_warning(msg)
def __init__(self, filename, delimiter='\t', skip_bad=True):
    """
    Read a proteins file and store the parsed result in self.data.

    Expect files of the following format:

    Unique_ID, name, sequence, key1:value1, key2:value2, ..., keyn:valuen

    NOTE that each unique_ID can ONLY appear once!

    After construction, self.data maps each unique ID to a dictionary with
    the keys:

    'name'       : protein name (str)

    'sequence'   : protein sequence (str, taken from the file as-is)

    'attributes' : whatever interface_tools.parse_key_value_pairs()
    returned for the key:value columns, or an empty dictionary if the
    line had none.

    Parameters
    ----------
    filename : str
        Path of the proteins file to read.

    delimiter : str
        String that separates the columns on each line. Cannot be ':'
        because that character separates keys from values.
        Default is a tab character.

    skip_bad : bool
        If True, a line without the three required columns triggers a
        warning and is skipped. If False, such a line raises an
        InterfaceException. Default = True.

    Raises
    ------
    InterfaceException
        If delimiter is ':', if a line cannot be parsed while skip_bad is
        False, or if a unique ID appears more than once (duplicates are
        never skipped, regardless of skip_bad).
    """

    if delimiter == ':':
        raise InterfaceException(
            'When parsing protein file cannot use ":" as a delimeter because this is used to delimit key/value pairs (if provided)'
        )

    with open(filename, 'r') as fh:
        content = fh.readlines()

    ID2protein = {}

    # line numbers are 1-based so they match what a text editor shows
    for linecount, line in enumerate(content, start=1):

        sline = line.strip().split(delimiter)

        # the first three columns are mandatory
        try:
            unique_ID = sline[0].strip()
            name = sline[1].strip()
            sequence = sline[2].strip()
        except Exception as e:
            msg = 'Failed parsing file [%s] on line [%i].\n\nException raised: %s\n\nline printed below:\n%s' % (
                filename, linecount, str(e), line)

            if skip_bad:
                shephard_exceptions.print_warning(
                    msg + "\nSkipping this line...")
                continue

            raise InterfaceException(msg)

        # if some key/value pairs were included then parse these out
        if len(sline) > 3:
            attributes = interface_tools.parse_key_value_pairs(
                sline[3:], filename, linecount, line)
        else:
            attributes = {}

        # a second line for the same ID is ambiguous, so always stop here
        if unique_ID in ID2protein:
            raise InterfaceException(
                "Duplicate protein found in the file %s (offending UID=%s). This cannot be skipped"
                % (filename, unique_ID))

        ID2protein[unique_ID] = {
            'name': name,
            'sequence': sequence,
            'attributes': attributes
        }

    self.data = ID2protein
def __init__(self, filename, delimiter='\t', mode='values', skip_bad=True):
    """
    Class for reading in correctly formatted tracks files for parsing into
    a Proteome object. Tracks files must adhere to the following
    specification

    unique_ID, track_name, val_1, val_2, ...., val_n

    where n = length of protein.

    This class allows a tracksfile to be read in and defined as either a
    values track file, or a symbols track file. After construction,
    self.data maps each unique ID to a list of dictionaries, one per line
    on which that ID appeared, each with the keys:

    'track_name' : name of the track (str)

    'track_data' : list of floats (mode = 'values') or list of strings
    (mode = 'symbols') built from every column after the track name.

    Parameters
    ----------
    filename : str
        Path of the tracks file to read.

    delimiter : str
        String that separates the columns on each line.
        Default is a tab character.

    mode : string {'symbols','values'}
        Defines how the track columns are interpreted. Validated once,
        before the file is opened, so an invalid mode always raises
        instead of being reported as a bad line. Default = 'values'.

    skip_bad : bool
        If True, a line that cannot be parsed (too few columns, or a
        column that is not a valid float in 'values' mode) triggers a
        warning and is skipped. If False, such a line raises an
        InterfaceException. Default = True.

    Raises
    ------
    InterfaceException
        If mode is neither 'symbols' nor 'values', or if a line cannot be
        parsed while skip_bad is False.
    """

    # an invalid mode is a caller error, not a bad line - fail immediately
    if mode not in ('values', 'symbols'):
        raise InterfaceException(
            "Error: mode passed = %s, yet this does not match 'symbols' or 'values'"
            % (mode, ))

    with open(filename, 'r') as fh:
        content = fh.readlines()

    ID2track = {}

    # cycle over every line in the file (1-based line numbers)
    for linecount, line in enumerate(content, start=1):

        # chop off flanking whitespace and divide up using the delimiter
        sline = line.strip().split(delimiter)

        try:
            # extract unique_id and track name
            unique_ID = sline[0].strip()
            track_name = sline[1].strip()

            # every remaining column is one position of the track
            if mode == 'values':
                track_data = [float(i.strip()) for i in sline[2:]]
            else:
                track_data = [i.strip() for i in sline[2:]]

        except Exception as e:
            msg = 'Failed parsing file [%s] on line [%i].\n\nException raised: %s\n\nline printed below:\n%s' % (
                filename, linecount, str(e), line)

            if skip_bad:
                shephard_exceptions.print_warning(
                    msg + "\nSkipping this line...")
                continue

            raise InterfaceException(msg)

        # a unique ID may carry several tracks; keep one entry per line
        ID2track.setdefault(unique_ID, []).append({
            'track_name': track_name,
            'track_data': track_data
        })

    self.data = ID2track
def add_proteins_from_dictionary(proteome, protein_dictionary, safe=True, verbose=True):
    """
    Function that takes a correctly formatted protein dictionary and will
    add those proteins to the Proteome.

    Protein dictionaries are key-value pairs, where the key is a unique ID
    and the value is itself a dictionary which has the following keys

    'name' = Protein name (uncontrolled vocabulary, but should be a string)

    'sequence' = Amino acid sequence for the protein (note that no sanity
    checking is done here)

    'attributes' = dictionary of arbitrary key:value pairings (optional)

    Parameters
    ----------
    proteome : Proteome Object
        Proteome object to which proteins will be added

    protein_dictionary : dict
        Dictionary that defines proteins. The keys of this dictionary are
        unique protein IDs and each value is a dictionary with the keys
        described above.

    safe : boolean
        If False, proteins are added with force_overwrite=True; for any
        other value force_overwrite=False is used. If a ProteinException
        or ProteomeException is raised while adding a protein and safe is
        True, the error is passed to
        shephard_exceptions.print_and_raise_error() (presumably this
        re-raises - confirm against shephard_exceptions). If safe is
        False the protein in question is skipped instead.
        Default = True.

    verbose : boolean
        Only relevant when safe is False: if True a warning is printed for
        every protein that was skipped. Default = True.

    Returns
    -----------
    None
        No return value, but proteins are added to the Proteome object
        passed as the first argument
    """

    # check first argument is a Proteome
    interface_tools.check_proteome(
        proteome, 'add_protein_from_dictionary (si_proteins)')

    # only an explicit False switches overwriting on
    force_overwrite = safe is False

    # for each entry in the overall dictionary
    for UID, entry in protein_dictionary.items():

        # attributes are optional; a missing key means "no attributes"
        ats = entry.get('attributes')

        s = entry['sequence']
        n = entry['name']

        try:
            proteome.add_protein(s,
                                 n,
                                 UID,
                                 attributes=ats,
                                 force_overwrite=force_overwrite)

        except (ProteinException, ProteomeException) as e:
            msg = '- skipping protein %s (name = %s, len=%i)' % (UID, n,
                                                                 len(s))

            if safe:
                shephard_exceptions.print_and_raise_error(msg, e)
            elif verbose:
                shephard_exceptions.print_warning(msg)
def __init__(self, filename, delimiter='\t', skip_bad=True):
    """
    Read a domains file and store the parsed result in self.data.

    Expect files of the following format:

    Unique_ID, start, stop, domain_type, key1:value1, key2:value2, ..., keyn:valuen

    The first four columns are required, while all of the key:value pairs
    are optional. Key and value must be separated by a ':', but any
    delimiter (other than ':') is allowed between columns.

    After construction, self.data maps each unique ID to a list of domain
    dictionaries, one per line on which that ID appeared. Domain
    dictionaries have the following key-value pairs

    start       : int (domain start position)

    end         : int (domain end position)

    domain_type : string (domain type)

    attributes  : whatever interface_tools.parse_key_value_pairs()
    returned for the key:value columns, or an empty dictionary if the
    line had none.

    Parameters
    ----------
    filename : str
        Path of the domains file to read.

    delimiter : str
        String that separates the columns on each line. Cannot be ':'.
        Default is a tab character.

    skip_bad : bool
        If True, a line whose four required columns cannot be parsed
        triggers a warning and is skipped. If False, such a line raises
        an InterfaceException. Default = True.

    Raises
    ------
    InterfaceException
        If delimiter is ':', or if a line cannot be parsed while skip_bad
        is False.
    """

    if delimiter == ':':
        raise InterfaceException(
            'When parsing domain file cannot use ":" as a delimiter because this is used to delimit key/value pairs (if provided)'
        )

    with open(filename, 'r') as handle:
        lines = handle.readlines()

    domains_by_id = {}

    # line numbers are 1-based so they match what a text editor shows
    for line_number, raw_line in enumerate(lines, start=1):

        fields = raw_line.strip().split(delimiter)

        # the four mandatory columns; start and end have to be integers
        try:
            uid = fields[0].strip()
            first = int(fields[1].strip())
            last = int(fields[2].strip())
            kind = fields[3].strip()

        except Exception as e:
            msg = 'Failed parsing file [%s] on line [%i].\n\nException raised: %s\n\nline printed below:\n%s' % (
                filename, line_number, str(e), raw_line)

            # either complain loudly and move on, or give up right here
            if not skip_bad:
                raise InterfaceException(msg)

            shephard_exceptions.print_warning(msg + "\nSkipping this line...")
            continue

        # anything beyond the fourth column is treated as key:value pairs
        if len(fields) > 4:
            extras = interface_tools.parse_key_value_pairs(
                fields[4:], filename, line_number, raw_line)
        else:
            extras = {}

        record = {
            'start': first,
            'end': last,
            'domain_type': kind,
            'attributes': extras
        }

        # a unique ID may carry several domains; keep one record per line
        domains_by_id.setdefault(uid, []).append(record)

    self.data = domains_by_id
def add_domains_from_dictionary(proteome, domain_dictionary, autoname=False, safe=True, verbose=True):
    """
    Function that takes a correctly formatted Domains dictionary and will
    add those domains to the proteins in the Proteome.

    Domains dictionaries are key-value pairs, where the key is a unique_ID
    associated with a given protein, and the value is a list of
    dictionaries. Each subdictionary has four key-value pairs:

    'start' = start position (int showing start of the domain, starting at 1)

    'end' = end position (int showing end of the domain, inclusive)

    'domain_type' = domain type (string that names the domain)

    'attributes' = dictionary of arbitrary key:value pairings (optional;
    an empty dictionary is used when the key is absent)

    Every domain listed for a unique_ID that belongs to a protein in the
    Proteome is added to that protein. Entries whose unique_ID does not
    belong to a protein in the Proteome are never looked at.

    Parameters
    ----------
    proteome : Proteome object
        Proteome object to which domains will be added

    domain_dictionary : dict
        Dictionary that maps unique_IDs to a list of one or more domain
        dictionaries

    autoname : bool
        Forwarded to protein.add_domain(). If autoname is set to True each
        domain is meant to ALWAYS get a unique name, which allows multiple
        domains to be perfectly overlapping in position and type. In
        general we want to avoid this as it makes it easy to include
        duplicates, which by default are prevented when autoname = False.
        Default = False.

    safe : bool
        Forwarded to protein.add_domain(). If a ProteinException or
        DomainException is raised while adding a domain and safe is True,
        the error is passed to shephard_exceptions.print_and_raise_error()
        (presumably this re-raises - confirm against shephard_exceptions).
        If safe is False the domain in question is skipped instead.
        It is highly recommended that if you choose to use safe=False you
        also set verbose=True. Default = True.

    verbose : bool
        Only relevant when safe is False: if True a warning is printed for
        every domain that was skipped. Default = True.

    Returns
    -----------
    None
        No return value, but domains are added to the Proteome object
        passed as the first argument
    """

    # Note - the safe keyword is used twice: it is forwarded unchanged to
    # add_domain(), and it decides (together with verbose) what happens to
    # an exception caught in this function.

    # check first argument is a proteome
    interface_tools.check_proteome(proteome, 'add_domains (si_domains)')

    for protein in proteome:

        # proteins without an entry in the dictionary are left untouched
        if protein.unique_ID not in domain_dictionary:
            continue

        for domain in domain_dictionary[protein.unique_ID]:

            start = domain['start']
            end = domain['end']
            domain_type = domain['domain_type']

            # attributes are optional
            ad = domain.get('attributes', {})

            # try and add the domain...
            try:
                protein.add_domain(start,
                                   end,
                                   domain_type,
                                   attributes=ad,
                                   safe=safe,
                                   autoname=autoname)

            except (ProteinException, DomainException) as e:
                # %s rather than %i so that building the message cannot
                # itself fail if start/end are not numbers
                msg = '- skipping domain at %s-%s on %s' % (start, end,
                                                            protein)

                if safe:
                    shephard_exceptions.print_and_raise_error(msg, e)
                elif verbose:
                    shephard_exceptions.print_warning(msg)
def add_sites_from_dictionary(proteome, sites_dictionary, safe=True, verbose=False):
    """
    Function that takes a correctly formatted Sites dictionary and will add
    those sites to the proteins in the Proteome.

    Sites dictionaries are key-value pairs, where the key is a unique_ID
    associated with a given protein, and the value is a list of
    dictionaries. Each subdictionary has the following elements

    'position' = site position

    'site_type' = site type

    'symbol' = site symbol

    'value' = site value

    'attributes' = site attribute dictionary (optional; an empty
    dictionary is used when the key is absent)

    In this way, each site that maps to a given unique_ID will be added to
    the associated protein.

    Parameters
    -------------
    proteome : Proteome
        Proteome object to which we're adding sites. Note that ONLY sites
        for which a protein is found will be used. Protein-Site
        cross-referencing is done using the protein's unique_ID which
        should be the key used in the sites_dictionary

    sites_dictionary : dict
        Maps a unique_ID to a list of site dictionaries as described
        above. Position and value are handed to protein.add_site() exactly
        as found (any casting is left to the Protein object). Extra
        key-value pairs in each sub-dictionary are ignored.

    safe : boolean
        If a ProteinException is raised while adding a site and safe is
        True, the error is passed to
        shephard_exceptions.print_and_raise_error() (presumably this
        re-raises - confirm against shephard_exceptions). If safe is
        False the site in question is skipped instead.
        It is highly recommended that if you choose to use safe=False you
        also set verbose=True. Default = True.

    verbose : boolean
        Only relevant when safe is False: if True a warning is printed for
        every site that was skipped. Default = False.

    Returns
    ---------
    None
        No return value, but adds all of the passed sites to the protein

    Raises
    ------
    InterfaceException
        If one of the required elements cannot be read from a site entry.
    """

    for protein in proteome:

        # proteins without an entry in the dictionary are left untouched
        if protein.unique_ID not in sites_dictionary:
            continue

        for site in sites_dictionary[protein.unique_ID]:

            try:
                position = site['position']
                site_type = site['site_type']
                symbol = site['symbol']
                value = site['value']

                # attributes are optional
                ad = site.get('attributes', {})

            except Exception as err:
                raise InterfaceException(
                    'When sites dictionary for key [%s] was unable to extract five distinct parametes. Entry is:\n%s\n'
                    % (protein.unique_ID, site)) from err

            # assuming we can read all five params try and add the site
            try:
                protein.add_site(position,
                                 site_type,
                                 symbol,
                                 value,
                                 attributes=ad)

            except ProteinException as e:
                # position is not cast in this function and may be a
                # string, so it is formatted with %s rather than %i
                msg = '- skipping site %s at %s on %s' % (site_type, position,
                                                          protein)

                if safe:
                    shephard_exceptions.print_and_raise_error(msg, e)
                elif verbose:
                    shephard_exceptions.print_warning(msg)
def __init__(self, filename, delimiter='\t', skip_bad=True):
    """
    Read a sites file and store the parsed result in self.data.

    Expect files of the following format:

    Unique_ID, position, site type, symbol, value, key1:value1, key2:value2, ..., keyn:valuen

    The first five columns are required, while all of the key:value pairs
    are optional. Key and value must be separated by a ':', but any
    delimiter (other than ':') is allowed between columns.

    After construction, self.data maps each unique ID to a list of site
    dictionaries, one per line on which that ID appeared. Site
    dictionaries have the following key-value pairs

    position   : int (site position)

    site_type  : string (site type)

    symbol     : string (site symbol)

    value      : float (site value)

    attributes : whatever interface_tools.parse_key_value_pairs()
    returned for the key:value columns, or an empty dictionary if the
    line had none.

    Parameters
    ----------
    filename : str
        Path of the sites file to read.

    delimiter : str
        String that separates the columns on each line. Cannot be ':'.
        Default is a tab character.

    skip_bad : bool
        If True, a line whose five required columns cannot be parsed
        triggers a warning and is skipped. If False, such a line raises
        an InterfaceException. Default = True.

    Raises
    ------
    InterfaceException
        If delimiter is ':', or if a line cannot be parsed while skip_bad
        is False.
    """

    if delimiter == ':':
        raise InterfaceException(
            'When parsing site file cannot use ":" as a delimeter because this is used to delimit key/value pairs (if provided)'
        )

    with open(filename, 'r') as handle:
        lines = handle.readlines()

    sites_by_id = {}

    # line numbers are 1-based so they match what a text editor shows
    for line_number, raw_line in enumerate(lines, start=1):

        fields = raw_line.strip().split(delimiter)

        # the five mandatory columns; position must be an integer and
        # value must be a float
        try:
            uid = fields[0].strip()
            where = int(fields[1].strip())
            kind = fields[2].strip()
            letter = fields[3].strip()
            score = float(fields[4].strip())

        except Exception as e:
            msg = 'Failed parsing file [%s] on line [%i].\n\nException raised: %s\n\nline printed below:\n%s' % (
                filename, line_number, str(e), raw_line)

            # either complain loudly and move on, or give up right here
            if not skip_bad:
                raise InterfaceException(msg)

            shephard_exceptions.print_warning(msg + "\nSkipping this line...")
            continue

        # anything beyond the fifth column is treated as key:value pairs
        if len(fields) > 5:
            extras = interface_tools.parse_key_value_pairs(
                fields[5:], filename, line_number, raw_line)
        else:
            extras = {}

        record = {
            'position': where,
            'site_type': kind,
            'symbol': letter,
            'value': score,
            'attributes': extras
        }

        # a unique ID may carry several sites; keep one record per line
        sites_by_id.setdefault(uid, []).append(record)

    self.data = sites_by_id