def test_return_list():
    """Check that ``return_list`` switches the ``read_fasta`` return type.

    Also checks that, in list mode, a file containing two identical records
    can be read in full ('ignore') or de-duplicated ('remove').
    """
    data_root = protfasta._get_data('test_data')
    duplicate_record = '{}/testset_duplicate.fasta'.format(data_root)
    simple_filename = '{}/testset_1.fasta'.format(data_root)

    # default return type is a dictionary
    as_dict = protfasta.read_fasta(simple_filename,
                                   duplicate_sequence_action='fail')
    assert type(as_dict) is dict

    # return_list=True yields a list instead
    as_list = protfasta.read_fasta(simple_filename,
                                   duplicate_sequence_action='fail',
                                   return_list=True)
    assert type(as_list) is list

    # With return_list=True every record in a file holding two identical
    # records is kept when duplicates are ignored; a dictionary return would
    # only hold 2 entries because identical headers share one key.
    everything = protfasta.read_fasta(duplicate_record,
                                      duplicate_record_action='ignore',
                                      return_list=True,
                                      expect_unique_header=False)
    assert len(everything) == 3

    # ...and asking for removal drops the repeated record
    deduplicated = protfasta.read_fasta(duplicate_record,
                                        duplicate_record_action='remove',
                                        return_list=True,
                                        expect_unique_header=False)
    assert len(deduplicated) == 2
def test_duplicate_sequence_action():
    """Exercise the ``duplicate_sequence_action`` options of ``read_fasta``."""
    data_root = protfasta._get_data('test_data')
    dup_seq_path = '{}/testset_duplicate_seqs.fasta'.format(data_root)
    plain_path = '{}/testset_1.fasta'.format(data_root)

    # the simple file has no duplicate sequences, so 'fail' reads it cleanly
    plain_records = protfasta.read_fasta(plain_path,
                                         duplicate_sequence_action='fail')
    assert len(plain_records) == 9

    # 'fail' must raise on the file that contains duplicate sequences
    with pytest.raises(ProtfastaException):
        assert protfasta.read_fasta(dup_seq_path,
                                    duplicate_sequence_action='fail')

    # 'ignore' keeps every entry, duplicates included
    kept = protfasta.read_fasta(dup_seq_path,
                                duplicate_sequence_action='ignore')
    assert len(kept) == 3

    # 'remove' drops the duplicate sequence
    pruned = protfasta.read_fasta(dup_seq_path,
                                  duplicate_sequence_action='remove',
                                  verbose=True)
    assert len(pruned) == 2

    # only the sequences are duplicated here, not the full records, so
    # removing duplicate *records* leaves all three entries in place
    by_record = protfasta.read_fasta(dup_seq_path,
                                     duplicate_record_action='remove',
                                     verbose=True)
    assert len(by_record) == 3
def test_read_fasta_standard():
    """Read the simple test file with default options and spot-check one entry."""
    fasta_path = '{}/testset_1.fasta'.format(protfasta._get_data('test_data'))

    records = protfasta.read_fasta(fasta_path)
    assert len(records) == 9

    # the reference entry must be read back with the expected sequence
    reference = test_data['test1']
    assert records[reference[0]] == reference[1]
def test_expect_unique_header_toggle():
    """Check ``expect_unique_header`` accepts both booleans and rejects non-bools."""
    fasta_path = '{}/testset_1.fasta'.format(protfasta._get_data('test_data'))

    # both boolean settings read the (duplicate-free) file identically
    for flag in (False, True):
        records = protfasta.read_fasta(fasta_path, expect_unique_header=flag)
        assert len(records) == 9
        assert records[test_data['test1'][0]] == test_data['test1'][1]

    # only real booleans are accepted - strings and ints must be rejected
    for not_a_bool in ('dog', 1):
        with pytest.raises(ProtfastaException):
            assert protfasta.read_fasta(fasta_path,
                                        expect_unique_header=not_a_bool)
def test_duplicate_record_action():
    """Exercise ``duplicate_record_action`` together with ``expect_unique_header``."""
    data_root = protfasta._get_data('test_data')
    dup_path = '{}/testset_duplicate.fasta'.format(data_root)
    plain_path = '{}/testset_1.fasta'.format(data_root)

    # the simple file has no duplicate records, so 'fail' reads it cleanly
    plain_records = protfasta.read_fasta(plain_path,
                                         duplicate_record_action='fail')
    assert len(plain_records) == 9

    # With the default expect_unique_header every action raises on the
    # duplicate file:
    #   'fail'   - duplicates are present
    #   'ignore' - unique headers are still (implicitly) expected
    #   'remove' - even though removal was requested, unique headers are
    #              still expected
    for action in ('fail', 'ignore', 'remove'):
        with pytest.raises(ProtfastaException):
            assert protfasta.read_fasta(dup_path,
                                        duplicate_record_action=action)

    # once unique headers are no longer expected, 'remove' drops the repeat
    records = protfasta.read_fasta(dup_path,
                                   duplicate_record_action='remove',
                                   expect_unique_header=False)
    assert len(records) == 2

    # as a list, 'ignore' keeps all three records
    records = protfasta.read_fasta(dup_path,
                                   duplicate_record_action='ignore',
                                   expect_unique_header=False,
                                   return_list=True)
    assert len(records) == 3

    # Not ideal, but legal: ignoring duplicates while returning a dictionary
    # collapses the repeated header onto a single key, leaving 2 entries.
    records = protfasta.read_fasta(dup_path,
                                   duplicate_record_action='ignore',
                                   expect_unique_header=False)
    assert len(records) == 2

    # so for a dictionary return, 'ignore' and 'remove' give the same count
    records = protfasta.read_fasta(dup_path,
                                   duplicate_record_action='remove',
                                   expect_unique_header=False)
    assert len(records) == 2
def test_header_parser():
    """Check custom ``header_parser`` callables, both valid and invalid ones."""
    fasta_path = '{}/testset_1.fasta'.format(protfasta._get_data('test_data'))

    def first_ten(header):
        # valid parser: keep the first ten characters of the header
        return header[0:10]

    def constant(header):
        # valid signature, but maps every header onto the same string
        return "asas"

    def takes_nothing():
        # invalid parser: does not accept the header argument
        return "asas"

    def returns_int(header):
        # invalid parser: does not return a string
        return 1

    parsed = protfasta.read_fasta(fasta_path, header_parser=first_ten)
    assert len(parsed) == 9
    assert parsed[test_data['test1'][0][0:10]] == test_data['test1'][1]

    # with this combination of settings every record lands on the same
    # dictionary key, so only one entry survives
    collapsed = protfasta.read_fasta(fasta_path,
                                     header_parser=constant,
                                     duplicate_sequence_action='ignore',
                                     expect_unique_header=False)
    assert len(collapsed) == 1

    # returning a list avoids the overwriting
    collapsed = protfasta.read_fasta(fasta_path,
                                     header_parser=constant,
                                     duplicate_sequence_action='ignore',
                                     expect_unique_header=False,
                                     return_list=True)
    assert len(collapsed) == 9

    # should fail because the parsed headers are all duplicates
    with pytest.raises(ProtfastaException):
        assert protfasta.read_fasta(fasta_path, header_parser=constant)

    # parsers with the wrong signature or return type must be rejected
    with pytest.raises(ProtfastaException):
        assert protfasta.read_fasta(fasta_path, header_parser=takes_nothing)

    with pytest.raises(ProtfastaException):
        assert protfasta.read_fasta(fasta_path, header_parser=returns_int)
import os
import pytest
import protfasta
import metapredict as meta
from metapredict.meta import MetapredictError

# Module-level fixture: the test sequence is read once, at import time, from
# input_data/testing.fasta relative to the current working directory, so these
# tests depend on the directory they are launched from.
current_filepath = os.getcwd()
fasta_filepath = "{}/input_data/testing.fasta".format(current_filepath)
test_sequence = protfasta.read_fasta(fasta_filepath)['Q8N6T3']


def test_graph_disorder_png():
    """Check that ``meta.graph_disorder`` can write PNG files under ``output/``.

    Only the first call is verified by checking that the file exists; the
    remaining calls just have to complete without raising.
    """

    # can make PNGs
    fn = 'demo1.png'
    full_fn = 'output/%s' % (fn)
    meta.graph_disorder(test_sequence, output_file=full_fn)
    assert os.path.isfile(full_fn) is True

    # same plot with a custom title
    full_fn = 'output/demo1_custom_title.png'
    meta.graph_disorder(test_sequence, output_file=full_fn, title='Custom title')

    # same plot with an explicit disorder threshold
    full_fn = 'output/demo1_disorder_thresh0p5.png'
    meta.graph_disorder(test_sequence,
                        output_file=full_fn,
                        disorder_threshold=0.5)

    # NOTE(review): nothing in the visible code uses this path - the function
    # may continue beyond this excerpt; confirm against the full file.
    full_fn = 'output/demo1_shaded_1_20.png'
def shephard_fasta_to_proteome(filename,
                               proteome=None,
                               force_overwrite=False,
                               invalid_sequence_action='fail'):
    """
    Stand alone function that allows the user to build a Proteome from a
    FASTA file generated by SHEPHARD (using the proteome_to_fasta()
    function).

    Every FASTA header in the file is expected to follow the format::

        SHPRD|<unique_ID>|<name>[<SHEPHARD_ATTRIBUTE_SPLITTER><attributes>]

    where ``<attributes>`` is an optional tab-separated list of
    ``key=value`` pairs. The name may itself contain ``|`` characters;
    everything after the second ``|`` and before the attribute splitter is
    used as the protein name. An attribute value may contain ``=``
    characters; only the first ``=`` separates the key from the value.

    Parameters
    ------------
    filename : string
        Name of the FASTA file we're going to parse in.

    proteome : Proteome
        If a Proteome object is provided the FASTA file will be read and
        added to the existing proteome, whereas if set to None a new
        Proteome will be generated.

    force_overwrite : bool
        [**Default = False**] Passed through to the Proteome when the
        proteins are added. Flag that if set to True and we encounter a
        unique_ID that is already in the proteome the newer value
        overwrites the older one without prejudice. This is mostly useful
        if you are adding in a file with known duplicate entries OR
        combining multiple FASTA files where you know there are some
        duplications.

    invalid_sequence_action : ``'ignore'``, ``'fail'``, ``'remove'``, ``'convert'``, ``'convert-ignore'``
        [**Default = 'fail'**] Selector that determines how to deal with
        sequences that contain invalid/non-standard amino acids. The value
        is passed unchanged to ``protfasta.read_fasta``. Options are as
        follows:

        * ``ignore``  - invalid sequences are completely ignored
        * ``fail``    - invalid sequences cause parsing to fail and throw an exception
        * ``remove``  - invalid sequences are removed
        * ``convert`` - invalid sequences are converted
        * ``convert-ignore`` - invalid sequences are converted to valid sequences and any remaining invalid residues are ignored

    Returns
    --------
    Proteome
        The Proteome passed in (with the new proteins added) or, if no
        Proteome was passed, a newly built Proteome.

    Raises
    --------
    APIException
        If a header does not start with ``SHPRD`` or has no unique_ID
        field.
    """

    # read in the fasta file using protfasta
    fasta_dictionary = protfasta.read_fasta(
        filename, invalid_sequence_action=invalid_sequence_action)

    # initialize the empty list
    proteome_list = []

    # for each entry
    for k in fasta_dictionary:

        # because we know what the header format will be we can be
        # definitive about extracting the relevant information
        fasta_split = k.split('|')

        # ENSURE EVERY header is valid: it must carry the SHPRD tag and at
        # least a unique_ID field
        if fasta_split[0] != "SHPRD" or len(fasta_split) < 2:
            raise APIException(
                'Trying to parse a FASTA file that is expected to be SHEPHARD generated but formatting does not comply [on entry %s in file %s]'
                % (k, filename))

        # get the unique ID
        unique_ID = fasta_split[1]

        # then take everything after the unique_ID; re-join on '|' so that
        # names which themselves contain '|' survive intact
        remainder = "|".join(fasta_split[2:])
        attributes_string = remainder.split(SHEPHARD_ATTRIBUTE_SPLITTER)
        name = attributes_string[0]

        attributes_dict = {}
        if len(attributes_string) > 1:
            for chunk in attributes_string[1].split('\t'):
                chunk = chunk.strip()

                # skip empty chunks (e.g. produced by a trailing tab)
                if not chunk:
                    continue

                # split on the FIRST '=' only, so the value is everything
                # after it (values may legitimately contain '=')
                local_k, _, local_v = chunk.partition('=')
                attributes_dict[local_k.strip()] = local_v.strip()

        # now create a protein dictionary object and populate!
        proteome_list.append({
            'sequence': str(fasta_dictionary[k]),
            'name': name,
            'unique_ID': unique_ID,
            'attributes': attributes_dict,
        })

    # finally, if a proteome was provided then add to it...
    if proteome is not None:
        proteome.add_proteins(proteome_list, force_overwrite=force_overwrite)
        return proteome

    # ...otherwise build a new proteome and return it
    return Proteome(proteome_list, force_overwrite=force_overwrite)
def fasta_to_proteome(filename,
                      proteome=None,
                      build_unique_ID=None,
                      build_attributes=None,
                      use_header_as_unique_ID=False,
                      force_overwrite=False,
                      invalid_sequence_action='fail'):
    """
    Stand alone function that allows the user to build a Proteome from a
    standard FASTA file, or add sequences in a FASTA file to an existing
    Proteome.

    The input filename must be a FASTA file without duplicate headers. If
    the file has duplicate headers and these have to be further processed
    we suggest using the protfasta (https://protfasta.readthedocs.io/)
    package to parse through the FASTA file first, creating a sanitized
    input FASTA.

    Each protein in a Proteome must have a unique_ID associated with it.
    There are three ways a unique ID can be generated from a FASTA file:

    1. By parsing the FASTA header with a user-provided function
       (``build_unique_ID``).

    2. By using the full FASTA header (``use_header_as_unique_ID``).

    3. By incrementing an automatically generated integer. This is what
       happens if neither of the two options above is requested.

    Parameters
    ------------
    filename : string
        Name of the FASTA file we're going to parse in. Note the protein
        name is always set to the full FASTA header of each entry.

    proteome : Proteome
        If a Proteome object is provided the FASTA file will be read and
        added to the existing proteome, whereas if set to None a new
        Proteome will be generated.

    build_unique_ID : function
        [**Default = None**] ``build_unique_ID`` allows a user-defined
        function that is used to convert the FASTA header to a (hopefully)
        unique string. This can be useful if the FASTA header is well
        structured and includes a specific, useful unique string that can
        be used as the unique_ID.

    build_attributes : function
        [**Default = None**] ``build_attributes`` allows a user-defined
        function that converts meta-information from the FASTA header into
        protein attributes. Specifically, build_attributes should be a
        function which takes in the FASTA header as a string and returns a
        dictionary where key:value pairs are assigned as protein
        attributes. If not provided, each protein gets an empty attribute
        dictionary.

    use_header_as_unique_ID : bool
        [**Default = False**] ``use_header_as_unique_ID`` is a boolean flag
        which, if set to True, means the unique_ID is set to the FASTA
        header. NOTE that the combination of this parameter being set to
        True and ``build_unique_ID`` not being None will trigger an
        exception, as this means there are two conflicting definitions of
        how the unique_ID should be defined.

    force_overwrite : bool
        [**Default = False**] Passed through to the Proteome when the
        proteins are added. Flag that if set to True and we encounter a
        unique_ID that is already in the proteome the newer value
        overwrites the older one without prejudice. This is mostly useful
        if you are adding in a file with known duplicate entries OR
        combining multiple FASTA files where you know there are some
        duplications. When unique_IDs are generated automatically they
        start after the largest integer-like unique_ID already in the
        proteome, so they are not expected to collide with existing ones.

    invalid_sequence_action : ``'ignore'``, ``'fail'``, ``'remove'``, ``'convert'``, ``'convert-ignore'``
        [**Default = 'fail'**] Selector that determines how to deal with
        invalid sequences. The value is passed unchanged to
        ``protfasta.read_fasta``. Options are as follows:

        * ``ignore``  - invalid sequences are completely ignored
        * ``fail``    - invalid sequences cause parsing to fail and throw an exception
        * ``remove``  - invalid sequences are removed
        * ``convert`` - invalid sequences are converted
        * ``convert-ignore`` - invalid sequences are converted to valid sequences and any remaining invalid residues are ignored

    Returns
    --------
    Proteome
        The Proteome passed in (with the new proteins added) or, if no
        Proteome was passed, a newly built Proteome.

    Raises
    --------
    APIException
        If ``use_header_as_unique_ID`` is True while ``build_unique_ID`` is
        also provided.
    """

    # parameter sanity checking
    if use_header_as_unique_ID is True and build_unique_ID is not None:
        raise APIException(
            'Cannot simultaneously set use_header_as_unique_ID = True and build_unique_ID to not None'
        )

    # read in the fasta file using protfasta
    fasta_dictionary = protfasta.read_fasta(
        filename, invalid_sequence_action=invalid_sequence_action)

    # The automatically incremented record_index is only used as the
    # unique_ID when neither of the other two mechanisms is selected. This
    # mirrors the branch order used for each entry in the loop below.
    uses_record_index = (not build_unique_ID
                         and use_header_as_unique_ID is not True)

    record_index = 0

    # IF we're adding to an existing proteome, start numbering after the
    # largest integer-like unique_ID already present, such that we can add
    # multiple FASTA files in succession and get numerically contiguous
    # unique_IDs. We only do this if the record_index will actually be used.
    if proteome is not None and uses_record_index:
        numeric_record_ids = []
        for uid in proteome.proteins:
            try:
                numeric_record_ids.append(int(uid))
            except ValueError:
                # non-numeric unique_IDs cannot clash with the counter
                pass

        if numeric_record_ids:
            record_index = max(numeric_record_ids) + 1

    # initialize the empty list of protein dictionaries passed to Proteome
    proteome_list = []

    # for each entry build a dictionary with the following keys
    #   'sequence'   = amino acid sequence
    #   'name'       = the full FASTA header
    #   'unique_ID'  = a unique identifier that can be used to
    #                  cross-reference this entry to other data
    #   'attributes' = attribute dictionary (empty unless build_attributes
    #                  is provided)
    for k in fasta_dictionary:

        # get unique_ID
        if build_unique_ID:
            unique_ID = build_unique_ID(k)
        elif use_header_as_unique_ID is True:
            unique_ID = k
        else:
            unique_ID = record_index

        # build an attributes dictionary using the user-provided function
        if build_attributes:
            attributes = build_attributes(k)
        else:
            attributes = {}

        # now create an input dictionary object
        proteome_list.append({
            'sequence': str(fasta_dictionary[k]),
            'name': k,
            'unique_ID': unique_ID,
            'attributes': attributes,
        })

        record_index = record_index + 1

    # finally, if a proteome was provided then add to it...
    if proteome is not None:
        proteome.add_proteins(proteome_list, force_overwrite=force_overwrite)
        return proteome

    # ...otherwise build a new proteome and return it
    return Proteome(proteome_list, force_overwrite=force_overwrite)
def test_alignment_files():
    """Check how ``alignment=True`` interacts with ``invalid_sequence_action``."""
    data_root = protfasta._get_data('test_data')
    valid_aln = '{}/aligned_seq_all_valid.fasta'.format(data_root)
    convertable_aln = '{}/aligned_seq_all_valid_convertable.fasta'.format(data_root)
    unconvertable_aln = '{}/aligned_seq_all_valid_unconvertable.fasta'.format(data_root)

    # in alignment mode the dashes are kept in the sequence
    records = protfasta.read_fasta(valid_aln, alignment=True)
    assert records['Seq1'] == 'A-----CDEFGHIKLMNPQRSTVWY'

    # by default dashes are invalid and invalid sequences cause a failure
    with pytest.raises(ProtfastaException):
        protfasta.read_fasta(valid_aln)

    # alignment=1 is rejected
    with pytest.raises(ProtfastaException):
        protfasta.read_fasta(valid_aln, alignment=1)

    # the convertable file fails until conversion is requested
    with pytest.raises(ProtfastaException):
        protfasta.read_fasta(convertable_aln, alignment=True)

    records = protfasta.read_fasta(convertable_aln,
                                   alignment=True,
                                   invalid_sequence_action='convert')
    assert records['Seq1'] == 'A-----CDEFGHIKLMNPQRSTVWY'

    # the unconvertable file fails with 'convert' and with the default
    with pytest.raises(ProtfastaException):
        protfasta.read_fasta(unconvertable_aln,
                             alignment=True,
                             invalid_sequence_action='convert')

    with pytest.raises(ProtfastaException):
        protfasta.read_fasta(unconvertable_aln, alignment=True)

    # 'convert-ignore' reads the unconvertable file
    records = protfasta.read_fasta(unconvertable_aln,
                                   alignment=True,
                                   invalid_sequence_action='convert-ignore')
    assert records['Seq2'] == 'ACDEFGHIKL-----MNPQRSTVWYN'

    # 'remove' discards every sequence of the unconvertable file...
    records = protfasta.read_fasta(unconvertable_aln,
                                   alignment=True,
                                   invalid_sequence_action='remove')
    assert len(records) == 0

    # ...keeps all of the valid alignment when in alignment mode...
    records = protfasta.read_fasta(valid_aln,
                                   alignment=True,
                                   invalid_sequence_action='remove')
    assert len(records) == 3

    # ...and discards all of it when not in alignment mode
    records = protfasta.read_fasta(valid_aln,
                                   invalid_sequence_action='remove')
    assert len(records) == 0
def test_sequences_with_bad_chars():
    """Exercise every ``invalid_sequence_action`` on files with invalid residues.

    Two files are used: one with non-standard characters and one with
    outright bad characters. The expected outcomes asserted below show the
    former converts with the default table while the latter does not.
    """
    data_root = protfasta._get_data('test_data')
    badchar_filename = '{}/test_data_with_bad_chars.fa'.format(data_root)
    nonstandard_filename = '{}/test_data_with_nonstandard_chars.fa'.format(data_root)

    # Both files must fail when read with the default settings and when
    # 'fail' is passed explicitly.
    for extra in ({}, {'invalid_sequence_action': 'fail'}):
        for path in (badchar_filename, nonstandard_filename):
            with pytest.raises(ProtfastaException):
                protfasta.read_fasta(path, **extra)

    # make sure we can ignore bad chars regardless of if they're
    # convertable or not
    for path in (nonstandard_filename, badchar_filename):
        ignored = protfasta.read_fasta(path, invalid_sequence_action='ignore')
        assert len(ignored) == 4

    # make sure we can convert non-standard characters
    converted = protfasta.read_fasta(nonstandard_filename,
                                     invalid_sequence_action='convert')
    assert len(converted) == 4

    # make sure we can't convert the bad-character file
    with pytest.raises(ProtfastaException):
        protfasta.read_fasta(badchar_filename,
                             invalid_sequence_action='convert')

    # make sure we can convert and ignore, both when the ignore step is not
    # needed (non-standard file) and when it is needed (bad-character file)
    for path in (nonstandard_filename, badchar_filename):
        converted = protfasta.read_fasta(
            path, invalid_sequence_action='convert-ignore')
        assert len(converted) == 4

    # make sure we can remove sequences with bad chars regardless of if
    # they're convertable or not
    for path in (nonstandard_filename, badchar_filename):
        remaining = protfasta.read_fasta(path,
                                         invalid_sequence_action='remove')
        assert len(remaining) == 0

    # custom correction dictionary
    correction = {'.': 'A'}

    # this should fail because no conversion has been requested
    with pytest.raises(ProtfastaException):
        protfasta.read_fasta(nonstandard_filename,
                             correction_dictionary=correction)

    # THIS should fail because we've overwritten the default dictionary
    with pytest.raises(ProtfastaException):
        protfasta.read_fasta(nonstandard_filename,
                             correction_dictionary=correction,
                             invalid_sequence_action='convert')

    # this should fail because the dictionary does not explain all chars
    # that must be corrected
    with pytest.raises(ProtfastaException):
        protfasta.read_fasta(badchar_filename,
                             correction_dictionary=correction,
                             invalid_sequence_action='convert')

    # a dictionary that covers both characters converts cleanly
    correction = {'.': 'A', '-': 'C'}
    protfasta.read_fasta(badchar_filename,
                         correction_dictionary=correction,
                         invalid_sequence_action='convert')

    # a partial dictionary is fine as long as the remainder is ignored
    correction = {'.': 'A'}
    protfasta.read_fasta(badchar_filename,
                         correction_dictionary=correction,
                         invalid_sequence_action='convert-ignore')