def test_join(self):
    """
    Test BioSeqs.join() method.

    A FASTA file and a PHYLIP file are loaded into two BioSeqs objects, the
    second one is joined into the first, and the result is compared against
    the sequences parsed directly with SeqIO. The first two report entries
    are then checked against the two source files.
    """
    fasta_path = 'Fasta/f001.fasta'
    phylip_path = 'Phylip/f003.phylip'
    for path in (fasta_path, phylip_path):
        self.assertTrue(os.path.isfile(path))
    seq_db = BioSeqs.from_seqfile(fasta_path, 'fasta')
    extra_db = BioSeqs.from_seqfile(phylip_path, 'phylip')
    seq_db.join(extra_db)
    # Check the sequence data
    fasta_records = SeqIO.to_dict(SeqIO.parse(fasta_path, 'fasta'))
    phylip_records = SeqIO.to_dict(SeqIO.parse(phylip_path, 'phylip'))
    expected_total = len(fasta_records) + len(phylip_records)
    self.assertEqual(expected_total, len(seq_db))
    for records in (fasta_records, phylip_records):
        for seq_id, record in viewitems(records):
            self.assertEqual(str(record.seq), str(seq_db.data[seq_id].seq))
    # Check the report information: fields 1-3 of each entry must contain
    # the source type, the source path and the file format respectively
    expected_report = (
        ('local', 'Tests/Fasta/f001.fasta', 'fasta'),
        ('local', 'Tests/Phylip/f003.phylip', 'phylip'),
    )
    for index, expected_fields in enumerate(expected_report):
        entry = seq_db._report[index]
        for position, expected in enumerate(expected_fields, 1):
            self.assertIn(expected, entry[position])
def test_join(self):
    """
    Test BioSeqs.join() method.

    Joins a PHYLIP-based BioSeqs object into a FASTA-based one and verifies
    both the merged sequence data and the first two report entries.
    """
    sources = (
        ('Fasta/f001.fasta', 'fasta'),
        ('Phylip/f003.phylip', 'phylip'),
    )
    for path, _ in sources:
        self.assertTrue(os.path.isfile(path))
    seq_db, extra_db = [BioSeqs.from_seqfile(path, fmt)
                        for path, fmt in sources]
    seq_db.join(extra_db)
    # Check the sequence data
    parsed = [SeqIO.to_dict(SeqIO.parse(path, fmt)) for path, fmt in sources]
    self.assertEqual(len(parsed[0]) + len(parsed[1]), len(seq_db))
    for record_dict in parsed:
        for key, value in viewitems(record_dict):
            self.assertEqual(str(value.seq), str(seq_db.data[key].seq))
    # Check the report information
    for index, (path, fmt) in enumerate(sources):
        report_entry = seq_db._report[index]
        self.assertIn('local', report_entry[1])
        self.assertIn('Tests/' + path, report_entry[2])
        self.assertIn(fmt, report_entry[3])
def get_keywords(tool):
    """
    Return the keywords available for an alignment tool.

    Arguments :
        tool  ( string )
            Name of the alignment tool (compared in lower case).

    Returns :
        dict
            Dictionary containing the keywords and their corresponding
            arguments (each argument list joined into a single string).

    Raises :
        ValueError
            If the tool introduced isn't included in MEvoLib.Align.
    """
    tool_name = tool.lower()
    if tool_name in _TOOL_TO_LIB:
        tool_keywords = _TOOL_TO_LIB[tool_name].KEYWORDS
        return {keyword: ' '.join(arguments)
                for keyword, arguments in viewitems(tool_keywords)}
    raise ValueError('The alignment tool "{}" isn\'t included in '
                     'MEvoLib.Align'.format(tool_name))
def test_alignment(self):
    """
    Testing default procedure for the genes method with alignment assistance
    for sequences without biological information (FASTA input instead of
    GENBANK).
    """
    infile, informat = "Fasta/f006.fasta", "fasta"
    # Check the input
    self.assertTrue(os.path.isfile(infile))
    input_records = list(SeqIO.parse(infile, informat))
    self.assertEqual(len(input_records), 5)
    # Without metadata every sequence ends up in the "unprocessable" subset
    plain_subsets = Cluster.get_subsets("genes", infile, informat)
    self.assertEqual(len(plain_subsets), 1)
    self.assertEqual(len(plain_subsets["unprocessable"]), 5)
    # Generate the gene clustering with external metadata (from a reference
    # sequence)
    reference_options = {"ref_seq": "rCRS", "alignment_bin": mafft_exe}
    aligned_subsets = Cluster.get_subsets("genes", infile, informat,
                                          **reference_options)
    # Check the output
    self.assertEqual(len(aligned_subsets), 98)
    self.assertNotIn("unprocessable", aligned_subsets)
    for _, subset in viewitems(aligned_subsets):
        subset_size = len(subset)
        self.assertNotEqual(subset_size, 0)
        self.assertTrue(subset_size % 5 == 0)
def standard_test(self, informat, outformat, params):
    """
    Standard testing procedure used by all tests.

    Aligns the 50-sequence input file with the binary referenced by
    'mafft_exe' and compares every aligned sequence with the previously
    stored alignment for the same output format and parameters.

    Arguments :
        informat  ( string )
            Input file format.
        outformat  ( string )
            Output file format.
        params  ( string )
            Arguments passed to the alignment tool.
    """
    infile = '{}/f001.{}'.format(informat.capitalize(), informat)
    outfile = 'tmp_test.aln'
    self.add_file_to_clean(outfile)
    # Check the input
    self.assertTrue(os.path.isfile(infile))
    input_records = list(SeqIO.parse(infile, informat))
    self.assertEqual(len(input_records), 50)
    # Generate the alignment
    alignment_options = {'args': params, 'outfile': outfile,
                         'outfile_format': outformat}
    Align.get_alignment(mafft_exe, infile, informat, **alignment_options)
    # Check the output against the stored reference alignment
    self.assertTrue(os.path.isfile(outfile))
    new_alignment = SeqIO.to_dict(SeqIO.parse(outfile, outformat))
    reference_file = '{}/f001.mafft_{}.aln'.format(outformat.capitalize(),
                                                   params)
    self.assertTrue(os.path.isfile(reference_file))
    reference_alignment = SeqIO.to_dict(SeqIO.parse(reference_file,
                                                    outformat))
    self.assertEqual(len(viewkeys(new_alignment)),
                     len(viewkeys(reference_alignment)))
    for seq_id, record in viewitems(new_alignment):
        self.assertEqual(str(record.seq),
                         str(reference_alignment[seq_id].seq))
def get_keywords(tool):
    """
    Return the keywords available for a consensus tool.

    Arguments :
        tool  ( string )
            Name of the supertree or consensus tool (compared in lower
            case).

    Returns :
        dict
            Dictionary containing the keywords and their corresponding
            arguments (each argument list joined into a single string).

    Raises :
        ValueError
            If the tool introduced isn't included in MEvoLib.PhyloAssemble.
    """
    tool_name = tool.lower()
    # Only the consensus table is consulted: the supertree table lookup
    # (_STREE_TOOL_TO_LIB) is disabled in this version of the function
    if tool_name not in viewkeys(_CONS_TOOL_TO_LIB):
        raise ValueError('The tool "{}" isn\'t included in '
                         'MEvoLib.PhyloAssemble'.format(tool_name))
    tool_keywords = _CONS_TOOL_TO_LIB[tool_name].KEYWORDS
    return {keyword: ' '.join(arguments)
            for keyword, arguments in viewitems(tool_keywords)}
def test_alignment(self):
    """
    Testing default procedure for the genes method with alignment assistance
    for sequences without biological information (FASTA input instead of
    GENBANK).
    """
    infile = 'Fasta/f006.fasta'
    informat = 'fasta'
    expected_num_seqs = 5
    # Check the input
    self.assertTrue(os.path.isfile(infile))
    self.assertEqual(len(list(SeqIO.parse(infile, informat))),
                     expected_num_seqs)
    # Generate the gene clustering without metadata: everything is expected
    # to fall into the single "unprocessable" subset
    subsets = Cluster.get_subsets('genes', infile, informat)
    self.assertEqual(len(subsets), 1)
    self.assertEqual(len(subsets['unprocessable']), expected_num_seqs)
    # Generate the gene clustering with external metadata (from a reference
    # sequence)
    subsets = Cluster.get_subsets('genes', infile, informat, ref_seq='rCRS',
                                  alignment_bin=mafft_exe)
    # Check the output
    self.assertEqual(len(subsets), 98)
    self.assertNotIn('unprocessable', subsets)
    for _, fragments in viewitems(subsets):
        num_fragments = len(fragments)
        self.assertNotEqual(num_fragments, 0)
        self.assertTrue(num_fragments % 5 == 0)
def test_seqfile_source(self):
    """
    Test BioSeqs.from_seqfile() and BioSeqs.write() methods.

    Writes a FASTA-loaded BioSeqs object to disk and checks both the
    GenBank output and every line of the accompanying report file.
    """
    infile = 'Fasta/f001.fasta'
    self.assertTrue(os.path.isfile(infile))
    seq_db = BioSeqs.from_seqfile(infile, 'fasta')
    outfile = 'tmp_test.gb'
    outrepfile = 'tmp_test.rep'
    for tmp_path in (outfile, outrepfile):
        self.files_to_clean.add(tmp_path)
    seq_db.write(outfile)
    self.assertTrue(os.path.isfile(outfile))
    # Check the content of both sequence files
    original = SeqIO.to_dict(SeqIO.parse(infile, 'fasta'))
    written = SeqIO.to_dict(SeqIO.parse(outfile, 'gb'))
    self.assertEqual(len(original), len(written))
    for seq_id, record in viewitems(original):
        self.assertEqual(str(record.seq), str(written[seq_id].seq))
    # Check the content of the report file: every line must be the sequence
    # count, the history header or a history entry for the local FASTA file
    # (verbose regex: only escaped or bracketed spaces are significant)
    history_entry = re.compile(
        r"""\d\d\d\d/\d\d/\d\d\ \d\d:\d\d:\d\d[ ]+ local[ ]+.*Tests/Fasta/f001\.fasta [ ]+fasta""",
        re.VERBOSE)
    with open(outrepfile, 'r') as repfile:
        for line in repfile:
            is_expected = ('Num. sequences: 50' in line
                           or 'History:' in line
                           or history_entry.match(line) is not None)
            self.assertTrue(is_expected)
def test_seqfile_source(self):
    """
    Test BioSeqs.from_seqfile() and BioSeqs.write() methods.

    The object loaded from a FASTA file is written in GenBank format; the
    written sequences and the generated report file are then validated.
    """
    infile = 'Fasta/f001.fasta'
    self.assertTrue(os.path.isfile(infile))
    seq_db = BioSeqs.from_seqfile(infile, 'fasta')
    outfile, outrepfile = 'tmp_test.gb', 'tmp_test.rep'
    self.files_to_clean.add(outfile)
    self.files_to_clean.add(outrepfile)
    seq_db.write(outfile)
    self.assertTrue(os.path.isfile(outfile))
    # Check the content of both sequence files
    indict = SeqIO.to_dict(SeqIO.parse(infile, 'fasta'))
    outdict = SeqIO.to_dict(SeqIO.parse(outfile, 'gb'))
    self.assertEqual(len(indict), len(outdict))
    for key, value in viewitems(indict):
        self.assertEqual(str(value.seq), str(outdict[key].seq))

    def is_report_line(line):
        # A valid line is the sequence count, the history header or a
        # history entry (verbose regex: unescaped spaces are ignored)
        if 'Num. sequences: 50' in line:
            return True
        if 'History:' in line:
            return True
        return bool(re.match(
            r"""\d\d\d\d/\d\d/\d\d\ \d\d:\d\d:\d\d[ ]+ local[ ]+.*Tests/Fasta/f001\.fasta [ ]+fasta""",
            line, re.VERBOSE))

    # Check the content of the report file
    with open(outrepfile, 'r') as repfile:
        for report_line in repfile:
            self.assertTrue(is_report_line(report_line))
def get_keywords(tool):
    """
    Return the keywords available for an inference or bootstrapping tool.

    Arguments :
        tool  ( string )
            Name of the phylogenetic inference or bootstrapping tool
            (compared in lower case).

    Returns :
        dict
            Dictionary containing the keywords and their corresponding
            arguments (each argument list joined into a single string).

    Raises :
        ValueError
            If the tool introduced isn't included in MEvoLib.Inference.
    """
    tool_name = tool.lower()
    # The inference table takes precedence over the bootstrapping one when
    # a tool name is present in both
    for tool_lib_dict in (_PHYLO_TOOL_TO_LIB, _BOOTS_TOOL_TO_LIB):
        if tool_name in tool_lib_dict:
            tool_keywords = tool_lib_dict[tool_name].KEYWORDS
            return {keyword: ' '.join(arguments)
                    for keyword, arguments in viewitems(tool_keywords)}
    raise ValueError('The tool "{}" isn\'t included in '
                     'MEvoLib.Inference'.format(tool_name))
def standard_test(self, informat, outformat, params):
    """
    Standard testing procedure used by all tests.

    Aligns the 50-sequence input file with the binary referenced by
    'muscle_exe' and compares every aligned sequence with the previously
    stored alignment for the same output format and parameters.

    Arguments :
        informat  ( string )
            Input file format.
        outformat  ( string )
            Output file format.
        params  ( string )
            Arguments passed to the alignment tool.
    """
    infile = '{}/f001.{}'.format(informat.capitalize(), informat)
    outfile = 'tmp_test.aln'
    self.add_file_to_clean(outfile)
    # Check the input
    self.assertTrue(os.path.isfile(infile))
    num_input_seqs = len(list(SeqIO.parse(infile, informat)))
    self.assertEqual(num_input_seqs, 50)
    # Generate the alignment
    Align.get_alignment(muscle_exe, infile, informat, args=params,
                        outfile=outfile, outfile_format=outformat)
    # Check the output against the stored reference alignment
    self.assertTrue(os.path.isfile(outfile))
    produced = SeqIO.to_dict(SeqIO.parse(outfile, outformat))
    prevfile = '{}/f001.muscle_{}.aln'.format(outformat.capitalize(), params)
    self.assertTrue(os.path.isfile(prevfile))
    expected = SeqIO.to_dict(SeqIO.parse(prevfile, outformat))
    self.assertEqual(len(viewkeys(produced)), len(viewkeys(expected)))
    for seq_id, record in viewitems(produced):
        self.assertEqual(str(record.seq), str(expected[seq_id].seq))
def get_keywords(tool):
    """
    Return the keywords available for an alignment tool.

    Arguments :
        tool  ( string )
            Name of the alignment tool (compared in lower case).

    Returns :
        dict
            Dictionary containing the keywords and their corresponding
            arguments (each argument list joined into a single string).

    Raises :
        ValueError
            If the tool introduced isn't included in MEvoLib.Align.
    """
    name = tool.lower()
    if name not in _TOOL_TO_LIB:
        raise ValueError('The alignment tool "{}" isn\'t included in '
                         'MEvoLib.Align'.format(name))
    # name in _TOOL_TO_LIB
    return dict((keyword, ' '.join(arguments))
                for keyword, arguments
                in viewitems(_TOOL_TO_LIB[name].KEYWORDS))
def get_keywords(tool):
    """
    Return the keywords available for a consensus tool.

    Arguments :
        tool  ( string )
            Name of the supertree or consensus tool (compared in lower
            case).

    Returns :
        dict
            Dictionary containing the keywords and their corresponding
            arguments (each argument list joined into a single string).

    Raises :
        ValueError
            If the tool introduced isn't included in MEvoLib.PhyloAssemble.
    """
    name = tool.lower()
    # The supertree table (_STREE_TOOL_TO_LIB) is not consulted in this
    # version of the function; only consensus tools are recognised
    known_tools = viewkeys(_CONS_TOOL_TO_LIB)
    if name in known_tools:
        keywords = _CONS_TOOL_TO_LIB[name].KEYWORDS
        return dict((keyword, ' '.join(arguments))
                    for keyword, arguments in viewitems(keywords))
    raise ValueError('The tool "{}" isn\'t included in '
                     'MEvoLib.PhyloAssemble'.format(name))
def test_feature_filter(self):
    """
    Testing the genes method with a feature filter.

    Only "CDS" features are requested; every resulting subset must hold one
    fragment per input sequence, except "unprocessable", which must be
    empty.
    """
    infile, informat = "Genbank/f006.genbank", "genbank"
    # Check the input
    self.assertTrue(os.path.isfile(infile))
    input_records = list(SeqIO.parse(infile, informat))
    self.assertEqual(len(input_records), 5)
    # Generate the gene clustering
    subset_dict = Cluster.get_subsets("genes", infile, informat,
                                      feature_filter=["CDS"])
    # Check the output
    self.assertEqual(len(subset_dict), 14)
    for name, subset in viewitems(subset_dict):
        expected_size = 0 if name == "unprocessable" else 5
        self.assertEqual(len(subset), expected_size)
def test_feature_filter(self):
    """
    Testing the genes method with a feature filter.

    Restricts the clustering to "CDS" features and checks the number of
    subsets returned and the size of each one.
    """
    infile = 'Genbank/f006.genbank'
    informat = 'genbank'
    # Expected subset size per key; any key not listed must hold 5 records
    special_sizes = {'unprocessable': 0}
    # Check the input
    self.assertTrue(os.path.isfile(infile))
    self.assertEqual(len(list(SeqIO.parse(infile, informat))), 5)
    # Generate the gene clustering
    clustering = Cluster.get_subsets('genes', infile, informat,
                                     feature_filter=['CDS'])
    # Check the output
    self.assertEqual(len(clustering), 14)
    for subset_name, records in viewitems(clustering):
        self.assertEqual(len(records), special_sizes.get(subset_name, 5))
def test_bioseqs_source(self):
    """
    Test BioSeqs.from_bioseqs() method and len() property.

    Loads a stored BioSeqs object and compares it with the GenBank file
    parsed directly through SeqIO and with the first lines of its report
    file.
    """
    infile = 'BioSeqs/f001.gb'
    inrepfile = 'BioSeqs/f001.rep'
    for path in (infile, inrepfile):
        self.assertTrue(os.path.isfile(path))
    seq_db = BioSeqs.from_bioseqs(infile)
    # Check the content of the BioSeqs' object
    expected_records = SeqIO.to_dict(SeqIO.parse(infile, 'gb'))
    self.assertEqual(len(seq_db), len(expected_records))
    for seq_id, record in viewitems(expected_records):
        self.assertEqual(str(record.seq), str(seq_db.data[seq_id].seq))
    # Check the content of the BioSeqs' report
    with open(inrepfile, 'r') as repfile:
        # First line, e.g. "Num. sequences: 50": the count is read from the
        # last two characters
        count_line = repfile.readline().strip()
        self.assertEqual(len(seq_db), int(count_line[-2:]))
        # Second line is the "History:" header and is skipped
        repfile.readline()
        # Third line holds the first source information
        # NOTE(review): the separator literal is reproduced as it appears
        # in this view; confirm it matches the report writer's separator
        first_source = repfile.readline().strip().split(' ')
        self.assertEqual(seq_db._report[0], tuple(first_source))
def test_bioseqs_source(self):
    """
    Test BioSeqs.from_bioseqs() method and len() property.

    The object built from the stored sequence file is checked against the
    same file parsed with SeqIO, and its first report entry against the
    report file on disk.
    """
    infile, inrepfile = 'BioSeqs/f001.gb', 'BioSeqs/f001.rep'
    self.assertTrue(os.path.isfile(infile))
    self.assertTrue(os.path.isfile(inrepfile))
    seq_db = BioSeqs.from_bioseqs(infile)
    # Check the content of the BioSeqs' object
    indict = SeqIO.to_dict(SeqIO.parse(infile, 'gb'))
    num_stored = len(seq_db)
    self.assertEqual(num_stored, len(indict))
    for key, value in viewitems(indict):
        self.assertEqual(str(value.seq), str(seq_db.data[key].seq))
    # Check the content of the BioSeqs' report
    with open(inrepfile, 'r') as repfile:
        header = repfile.readline().strip()  # Num. sequences: 50
        reported_count = int(header[-2:])
        self.assertEqual(len(seq_db), reported_count)
        repfile.readline()  # History:
        entry_line = repfile.readline().strip()  # [First source information]
        # NOTE(review): the separator literal is reproduced as it appears
        # in this view; confirm it matches the report writer's separator
        source_info = tuple(entry_line.split(' '))
        self.assertEqual(seq_db._report[0], source_info)
def map_seqs ( record_list, feature_filter = None, ref_seq = None,
               alignment_bin = None, log_file = None ) :
    """
    Gene splicing of the sequences at 'record_list'. By default, the gene
    location is extracted from the feature list of each sequence. If there is
    no list, that sequence is classified as "unprocessable" or, if a reference
    sequence is given, the reference features are used to extract the
    different genes (through a normalization process using an alignment tool).
    All the features are returned unless a list of feature keywords are passed
    through 'feature_filter' parameter. If a log file path is given and any
    file exists with that name, the file will be overwritten without any
    warning. If no log file path is given, the log is written to a named
    temporary file that is not deleted.

    Arguments :
        record_list  ( list )
            List of SeqRecord objects (from Biopython).
        feature_filter  ( Optional[list] )
            List of feature keywords the user wants to be returned (from all
            the possible ones).
        ref_seq  ( Optional[string] )
            Keyword (from MEvoLib.Data) or file path (GENBANK format) of the
            reference sequence.
        alignment_bin  ( Optional[string] )
            Binary path of the alignment tool (only required if a reference
            sequence is passed).
        log_file  ( Optional[string] )
            Absolute path for the log file.

    Returns :
        dict
            Dictionary with the set identifiers as keys and the corresponding
            sequence fragments as values in lists of SeqRecord objects. When
            no reference sequence is given, the "unprocessable" key holds the
            records that had no gene information.

    Raises :
        IOError
            If the reference sequence's file path doesn't exist.
        RuntimeError
            If the call to the alignment tool command raises an exception.

    * Reference sequence's file must be in GENBANK format.
    """
    # Load the desired feature keywords as keys of the gene dictionary and a
    # term dictionary with a list of sequences for each qualifier of any
    # selected feature:
    #   gene_dict[feature type][qualifier id string] -> list of SeqRecord
    #   term_dict[feature type][(term, term) pair]   -> set of record ids
    if ( feature_filter ) :
        gene_dict = dict((key, {}) for key in feature_filter)
        term_dict = dict((key, {}) for key in feature_filter)
    else : # feature_filter is None (or empty)
        gene_dict = dict((key, {}) for key in viewkeys(_FEAT_QUAL_DICT))
        term_dict = dict((key, {}) for key in viewkeys(_FEAT_QUAL_DICT))
    # Get the reference sequence's SeqRecord object or create an unprocessable
    # list for those sequences without gene information. Note that
    # 'refseq_record' and 'unprocessable' are mutually exclusive: each one is
    # only defined (and only used below) under its own 'ref_seq' condition
    if ( ref_seq in _REF_SEQ_DICT ) :
        refseq_record = _REF_SEQ_DICT[ref_seq].RECORD
    elif ( ref_seq ) : # ref_seq != None
        refseq_record = SeqIO.read(ref_seq, 'gb')
    else : # ref_seq is None
        unprocessable = []
    num_seqs = 0
    # Iterate over all the records to get their gene division
    for record in record_list :
        num_seqs += 1
        if ( len(record.features) <= 1 ) :
            # GenBank's "source" feature key is mandatory, so a single
            # feature means there is no gene information in the record
            if ( ref_seq ) :
                # The record itself is modified in place with the sequence
                # and features returned by the normalization step
                record.seq, record.features = _normalization(record,
                                                  refseq_record, alignment_bin)
            else : # ref_seq is None
                unprocessable.append(record)
                continue
        # else : # len(record.features) > 1
        # Skip the first feature and keep only the requested feature types
        record_features = (feat for feat in record.features[1:]
                               if feat.type in gene_dict)
        for feature in record_features :
            # Create a set of qualifiers of the record from the main fields of
            # GenBank (pre-saved in _FEAT_QUAL_DICT)
            # NOTE(review): a 'feature_filter' keyword missing from
            # _FEAT_QUAL_DICT would raise KeyError here - confirm callers
            # validate the filter
            record_qualifiers = set()
            for qualifier_key in iter(_FEAT_QUAL_DICT[feature.type]) :
                if ( qualifier_key in feature.qualifiers ) :
                    record_qualifiers.update((_string_filter(x)
                                 for x in feature.qualifiers[qualifier_key]))
            if ( not record_qualifiers ) : # 'record_qualifiers' is empty
                # Fall back to the feature type as the only term
                record_qualifiers.add(feature.type)
            # Generate a string of the qualifiers' set to store it as a
            # description of the gene SeqRecord object (terms sorted by
            # length first and alphabetically second)
            qualifier_id = ':'.join(sorted(record_qualifiers,
                                           key=lambda item: (len(item), item)))
            feature_record = SeqRecord(feature.extract(record).seq,
                                       id=record.id, name=record.id,
                                       description=qualifier_id)
            # Add new terms to the corresponding entry of the dictionary for
            # the given feature, or add the sequence record id to the existing
            # entry
            for pair in itertools.combinations(qualifier_id.split(':'), 2) :
                if ( pair not in term_dict[feature.type] ) :
                    term_dict[feature.type][pair] = set([record.id])
                else : # pair in term_dict[feature.type]
                    term_dict[feature.type][pair].add(record.id)
            # Merge possible matching qualifiers for the same type of feature.
            # 'record_qualifiers' may grow while the existing keys are
            # scanned, so later keys are compared against the enlarged set
            qualifiers_to_merge = []
            for key in viewkeys(gene_dict[feature.type]) :
                key_set = set(key.split(':'))
                if ( not record_qualifiers.isdisjoint(key_set) ) :
                    if ( record_qualifiers <= key_set ) :
                        # Existing key already covers this record's terms
                        record_qualifiers.update(key_set)
                    elif ( record_qualifiers > key_set ) :
                        # This record's terms strictly extend the key
                        qualifiers_to_merge.append(key)
                    else :
                        # 'record_qualifiers' and 'key_set' differ but their
                        # intersection is not empty
                        record_qualifiers.update(key_set)
                        qualifiers_to_merge.append(key)
            # Generate new qualifier string
            qualifier_id = ':'.join(sorted(record_qualifiers,
                                           key=lambda item: (len(item), item)))
            # Add the new gene SeqRecord object to the dictionary
            if ( qualifier_id not in gene_dict[feature.type] ) :
                gene_dict[feature.type][qualifier_id] = [feature_record]
            else : # qualifier_id in gene_dict[feature.type]
                gene_dict[feature.type][qualifier_id].append(feature_record)
            # Merge those qualifiers that belong to the same gene, moving the
            # records of each absorbed key under the new qualifier string
            for qualifier_key in qualifiers_to_merge :
                if ( qualifier_key != qualifier_id ) :
                    gene_dict[feature.type][qualifier_id].extend(
                                        gene_dict[feature.type][qualifier_key])
                    del gene_dict[feature.type][qualifier_key]
    # The error calculation has been extracted from the following sampling
    # statistics equation:
    #
    #                   N * Z^2 * p * (1-p)
    #          n = -------------------------------
    #               (N-1) * e^2 + Z^2 * p * (1-p)
    #
    # where N is the number of sequences, n is the minimum sampling size
    # (threshold), e is the error fixed to 0.01, Z is fixed to get a 0.99
    # confidence interval and p is assumed to be 0.5.
    e_value = 0.01
    z_value = 2.58
    p_value = 0.5
    coef = math.pow(z_value, 2) * p_value * (1.0 - p_value)
    threshold = math.ceil((num_seqs * coef) / \
                          ((num_seqs - 1.0) * math.pow(e_value, 2) + coef))
    # If no log file path is provided, save log content in a named temporary
    # file that won't be deleted after the function ends
    if ( not log_file ) :
        log_file = (tempfile.NamedTemporaryFile(delete=False)).name
    # Clean-up empty features and merge qualifiers dict keys with features dict
    # keys to get a {str: list} dict for all the genes
    set_dict = {}
    with open(log_file, 'w') as log :
        for feat_key, feat_value in iter(viewitems(gene_dict)) :
            if ( feat_value ) :
                log.write('> {}\n'.format(feat_key))
                for qual_key, qual_value in iter(viewitems(feat_value)) :
                    # Generation of the content of the set dictionary that will
                    # be returned. Only the first term of the qualifier string
                    # is used in the key, so qualifier strings sharing that
                    # first term end up in the same list
                    new_key = '{}.{}'.format(feat_key, qual_key.split(':')[0])
                    set_dict.setdefault(new_key, []).extend(qual_value)
                    # For every existing pair of qualifiers, if the number of
                    # records that hold both is below the calculated threshold,
                    # it might be the result of a typo in those records'
                    # information (further review of the log file is advisable)
                    for pair in itertools.combinations(qual_key.split(':'), 2) :
                        if ( pair in term_dict[feat_key] ) :
                            sampling_size = len(term_dict[feat_key][pair])
                            if ( sampling_size < threshold ) :
                                seq_list = list(term_dict[feat_key][pair])
                                text = '\t{}\n'.format(' || '.join(pair))
                                # Record ids are listed six per line; when the
                                # size is a multiple of six the last iteration
                                # writes a line without ids
                                for i in range(0, sampling_size // 6 + 1):
                                    text += '\t\t{}\n'.format(
                                               ' '.join(seq_list[i*6:(i+1)*6]))
                                if ( (sampling_size % 6) != 0 ) :
                                    text += '\n'
                                log.write(text)
                log.write('\n')
    # If no reference sequence has been introduced, include in the gene dict
    # those sequences that couldn't be processed due to lack of information
    if ( not ref_seq ) :
        set_dict['unprocessable'] = unprocessable
    return ( set_dict )
def map_seqs(record_list,
             feature_filter=None,
             ref_seq=None,
             alignment_bin=None,
             log_file=None):
    """
    Gene splicing of the sequences at 'record_list'. By default, the gene
    location is extracted from the feature list of each sequence. If there is
    no list, that sequence is classified as "unprocessable" or, if a reference
    sequence is given, the reference features are used to extract the
    different genes (through a normalization process using an alignment tool).
    All the features are returned unless a list of feature keywords are passed
    through 'feature_filter' parameter. If a log file path is given and any
    file exists with that name, the file will be overwritten without any
    warning. If no log file path is given, the log is written to a named
    temporary file that is not deleted.

    Arguments :
        record_list  ( list )
            List of SeqRecord objects (from Biopython).
        feature_filter  ( Optional[list] )
            List of feature keywords the user wants to be returned (from all
            the possible ones).
        ref_seq  ( Optional[string] )
            Keyword (from MEvoLib.Data) or file path (GENBANK format) of the
            reference sequence.
        alignment_bin  ( Optional[string] )
            Binary path of the alignment tool (only required if a reference
            sequence is passed).
        log_file  ( Optional[string] )
            Absolute path for the log file.

    Returns :
        dict
            Dictionary with the set identifiers as keys and the corresponding
            sequence fragments as values in lists of SeqRecord objects. When
            no reference sequence is given, the "unprocessable" key holds the
            records that had no gene information.

    Raises :
        IOError
            If the reference sequence's file path doesn't exist.
        RuntimeError
            If the call to the alignment tool command raises an exception.

    * Reference sequence's file must be in GENBANK format.
    """
    # Load the desired feature keywords as keys of the gene dictionary and a
    # term dictionary with a list of sequences for each qualifier of any
    # selected feature:
    #   gene_dict[feature type][qualifier id string] -> list of SeqRecord
    #   term_dict[feature type][(term, term) pair]   -> set of record ids
    if (feature_filter):
        gene_dict = dict((key, {}) for key in feature_filter)
        term_dict = dict((key, {}) for key in feature_filter)
    else:  # feature_filter is None (or empty)
        gene_dict = dict((key, {}) for key in viewkeys(_FEAT_QUAL_DICT))
        term_dict = dict((key, {}) for key in viewkeys(_FEAT_QUAL_DICT))
    # Get the reference sequence's SeqRecord object or create an unprocessable
    # list for those sequences without gene information. Note that
    # 'refseq_record' and 'unprocessable' are mutually exclusive: each one is
    # only defined (and only used below) under its own 'ref_seq' condition
    if (ref_seq in _REF_SEQ_DICT):
        refseq_record = _REF_SEQ_DICT[ref_seq].RECORD
    elif (ref_seq):  # ref_seq != None
        refseq_record = SeqIO.read(ref_seq, 'gb')
    else:  # ref_seq is None
        unprocessable = []
    num_seqs = 0
    # Iterate over all the records to get their gene division
    for record in record_list:
        num_seqs += 1
        if (len(record.features) <= 1):
            # GenBank's "source" feature key is mandatory, so a single
            # feature means there is no gene information in the record
            if (ref_seq):
                # The record itself is modified in place with the sequence
                # and features returned by the normalization step
                record.seq, record.features = _normalization(
                    record, refseq_record, alignment_bin)
            else:  # ref_seq is None
                unprocessable.append(record)
                continue
        # else : # len(record.features) > 1
        # Skip the first feature and keep only the requested feature types
        record_features = (feat for feat in record.features[1:]
                           if feat.type in gene_dict)
        for feature in record_features:
            # Create a set of qualifiers of the record from the main fields of
            # GenBank (pre-saved in _FEAT_QUAL_DICT)
            # NOTE(review): a 'feature_filter' keyword missing from
            # _FEAT_QUAL_DICT would raise KeyError here - confirm callers
            # validate the filter
            record_qualifiers = set()
            for qualifier_key in iter(_FEAT_QUAL_DICT[feature.type]):
                if (qualifier_key in feature.qualifiers):
                    record_qualifiers.update(
                        (_string_filter(x)
                         for x in feature.qualifiers[qualifier_key]))
            if (not record_qualifiers):  # 'record_qualifiers' is empty
                # Fall back to the feature type as the only term
                record_qualifiers.add(feature.type)
            # Generate a string of the qualifiers' set to store it as a
            # description of the gene SeqRecord object (terms sorted by
            # length first and alphabetically second)
            qualifier_id = ':'.join(
                sorted(record_qualifiers, key=lambda item: (len(item), item)))
            feature_record = SeqRecord(feature.extract(record).seq,
                                       id=record.id,
                                       name=record.id,
                                       description=qualifier_id)
            # Add new terms to the corresponding entry of the dictionary for
            # the given feature, or add the sequence record id to the existing
            # entry
            for pair in itertools.combinations(qualifier_id.split(':'), 2):
                if (pair not in term_dict[feature.type]):
                    term_dict[feature.type][pair] = set([record.id])
                else:  # pair in term_dict[feature.type]
                    term_dict[feature.type][pair].add(record.id)
            # Merge possible matching qualifiers for the same type of feature.
            # 'record_qualifiers' may grow while the existing keys are
            # scanned, so later keys are compared against the enlarged set
            qualifiers_to_merge = []
            for key in viewkeys(gene_dict[feature.type]):
                key_set = set(key.split(':'))
                if (not record_qualifiers.isdisjoint(key_set)):
                    if (record_qualifiers <= key_set):
                        # Existing key already covers this record's terms
                        record_qualifiers.update(key_set)
                    elif (record_qualifiers > key_set):
                        # This record's terms strictly extend the key
                        qualifiers_to_merge.append(key)
                    else:
                        # 'record_qualifiers' and 'key_set' differ but their
                        # intersection is not empty
                        record_qualifiers.update(key_set)
                        qualifiers_to_merge.append(key)
            # Generate new qualifier string
            qualifier_id = ':'.join(
                sorted(record_qualifiers, key=lambda item: (len(item), item)))
            # Add the new gene SeqRecord object to the dictionary
            if (qualifier_id not in gene_dict[feature.type]):
                gene_dict[feature.type][qualifier_id] = [feature_record]
            else:  # qualifier_id in gene_dict[feature.type]
                gene_dict[feature.type][qualifier_id].append(feature_record)
            # Merge those qualifiers that belong to the same gene, moving the
            # records of each absorbed key under the new qualifier string
            for qualifier_key in qualifiers_to_merge:
                if (qualifier_key != qualifier_id):
                    gene_dict[feature.type][qualifier_id].extend(
                        gene_dict[feature.type][qualifier_key])
                    del gene_dict[feature.type][qualifier_key]
    # The error calculation has been extracted from the following sampling
    # statistics equation:
    #
    #                   N * Z^2 * p * (1-p)
    #          n = -------------------------------
    #               (N-1) * e^2 + Z^2 * p * (1-p)
    #
    # where N is the number of sequences, n is the minimum sampling size
    # (threshold), e is the error fixed to 0.01, Z is fixed to get a 0.99
    # confidence interval and p is assumed to be 0.5.
    e_value = 0.01
    z_value = 2.58
    p_value = 0.5
    coef = math.pow(z_value, 2) * p_value * (1.0 - p_value)
    threshold = math.ceil((num_seqs * coef) / \
                          ((num_seqs - 1.0) * math.pow(e_value, 2) + coef))
    # If no log file path is provided, save log content in a named temporary
    # file that won't be deleted after the function ends
    if (not log_file):
        log_file = (tempfile.NamedTemporaryFile(delete=False)).name
    # Clean-up empty features and merge qualifiers dict keys with features dict
    # keys to get a {str: list} dict for all the genes
    set_dict = {}
    with open(log_file, 'w') as log:
        for feat_key, feat_value in iter(viewitems(gene_dict)):
            if (feat_value):
                log.write('> {}\n'.format(feat_key))
                for qual_key, qual_value in iter(viewitems(feat_value)):
                    # Generation of the content of the set dictionary that will
                    # be returned. Only the first term of the qualifier string
                    # is used in the key, so qualifier strings sharing that
                    # first term end up in the same list
                    new_key = '{}.{}'.format(feat_key, qual_key.split(':')[0])
                    set_dict.setdefault(new_key, []).extend(qual_value)
                    # For every existing pair of qualifiers, if the number of
                    # records that hold both is below the calculated threshold,
                    # it might be the result of a typo in those records'
                    # information (further review of the log file is advisable)
                    for pair in itertools.combinations(qual_key.split(':'), 2):
                        if (pair in term_dict[feat_key]):
                            sampling_size = len(term_dict[feat_key][pair])
                            if (sampling_size < threshold):
                                seq_list = list(term_dict[feat_key][pair])
                                text = '\t{}\n'.format(' || '.join(pair))
                                # Record ids are listed six per line; when the
                                # size is a multiple of six the last iteration
                                # writes a line without ids
                                for i in range(0, sampling_size // 6 + 1):
                                    text += '\t\t{}\n'.format(' '.join(
                                        seq_list[i * 6:(i + 1) * 6]))
                                if ((sampling_size % 6) != 0):
                                    text += '\n'
                                log.write(text)
                log.write('\n')
    # If no reference sequence has been introduced, include in the gene dict
    # those sequences that couldn't be processed due to lack of information
    if (not ref_seq):
        set_dict['unprocessable'] = unprocessable
    return (set_dict)
def update(self, email):
    """
    Update the BioSeqs object from the last NCBI's Entrez database and query
    values stored in the report list. All the sequences stored must have
    their genbank identifier information in the annotations property. The
    deleted sequences from the database will be deleted in the object and
    the new sequences will be fetched and stored.

    If fetching a batch of new sequences fails, the same batch is retried
    once after waiting a minute; a second consecutive failure stops the
    fetching and keeps the sequences retrieved so far.

    Arguments:
        email  (string)
            E-mail required by Bio.Entrez.

    Raises:
        ValueError
            If there is no entrez entry in the report list.
        ValueError
            If any sequence hasn't its GenBank identifier information in the
            annotations property.

    * The e-mail information is considered sensitive information and it
    won't be saved in any public or private variable of the object.
    """
    # Get the last entrez entry of the report (only the database and the
    # query are needed from it)
    for record in reversed(self._report):
        if record[1] == 'entrez':
            _, _, entrez_db, query = record
            break
    else:
        message = 'No entrez entry found in object\'s report'
        raise ValueError(message)
    # Perform the update process in a copy of the dictionary to avoid
    # incomplete updates due to unexpected HTTP exceptions
    seq_dict = copy.copy(self.data)
    Entrez.email = email
    db_rettype = _get_entrez_db_rettype(entrez_db)
    # Execute Entrez.esearch() to get the total number of sequences that
    # matches the query in the Entrez database
    handle = Entrez.esearch(db=entrez_db, term=query, rettype='count')
    num_seqs = int(Entrez.read(handle)['Count'])
    handle.close()
    if num_seqs == 0:
        warnings.warn('The query stored didn\'t return any sequence')
        return
    # Execute again Entrez.esearch() giving the total number of sequences
    # to get the complete list of Entrez database's sequence identifiers.
    # The paging offset must be sent as 'retstart': an unknown keyword
    # would leave every request starting at the first identifier
    updated_seq_ids = set()
    for index in range(0, num_seqs, MAX_NUM_SEQS):
        handle = Entrez.esearch(db=entrez_db, term=query, retstart=index,
                                retmax=num_seqs)
        record = Entrez.read(handle)
        handle.close()
        updated_seq_ids.update(record['IdList'])
    # Get an "entrez identifier: accession" dictionary of the stored
    # sequences
    gi_acc_dict = {}
    try:
        for key, value in viewitems(seq_dict):
            gi_acc_dict[value.annotations['gi']] = key
    except KeyError:
        message = 'Missing genbank identifier'
        raise ValueError(message)
    # Use that dictionary to check which of the stored identifiers have
    # been removed from the Entrez database. 'deprecated_seq_ids' is a live
    # view, so it shrinks as entries are deleted below
    deprecated_seq_ids = viewkeys(gi_acc_dict)
    ids_to_remove = deprecated_seq_ids - updated_seq_ids
    # Remove all the deprecated sequences
    for gi_value in ids_to_remove:
        accession = gi_acc_dict[gi_value]
        del seq_dict[accession]
        del gi_acc_dict[gi_value]
    # Finally, get the list of new identifiers to fetch
    ids_to_fetch = list(updated_seq_ids.difference(deprecated_seq_ids))
    num_to_fetch = len(ids_to_fetch)
    if num_to_fetch > 0:
        # Fetch the first sequence and estimate the batch size
        fetch_handle = Entrez.efetch(db=entrez_db, id=ids_to_fetch[0],
                                     retmode='text', rettype=db_rettype)
        record_str = fetch_handle.read()
        fetch_handle.close()
        record = SeqIO.read(StringIO(record_str), 'genbank')
        seq_dict[record.id] = record
        batch_size = _estimate_batch_size(record_str)
        # In batches of 'batch_size', fetch the Entrez database information
        # of each new sequence in text format through Entrez.efetch()
        start = 1
        except_raised = False
        while start < num_to_fetch:
            end = start + batch_size
            try:
                fetch_handle = Entrez.efetch(db=entrez_db,
                                             id=ids_to_fetch[start:end],
                                             retmode='text',
                                             rettype=db_rettype)
            except Exception:
                # Only regular errors are retried; KeyboardInterrupt and
                # SystemExit are allowed to propagate immediately
                if except_raised:
                    warnings.warn(("Exception raised for second "
                                   "time. Saving current progress "
                                   "and exiting."))
                    break
                # If it is the first time for this batch, wait for a minute
                # to see if we can recover from the exception
                warnings.warn(("Exception raised durig fetching"
                               ". Trying to recover..."))
                except_raised = True
                sleep(60)
            else:
                except_raised = False
                for record in SeqIO.parse(fetch_handle, 'genbank'):
                    seq_dict[record.id] = record
                fetch_handle.close()
                start += batch_size
    # The process has ended so we can replace the old dictionary with the
    # new one. This is done even when there was nothing new to fetch, so
    # the removal of deprecated sequences is always applied
    self.data = seq_dict
    # Generate the corresponding report tuple
    date_time = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
    self._report.append((date_time, 'entrez', entrez_db, query))
def update(self, email):
    """
    Update the BioSeqs object using the last NCBI's Entrez database and
    query values stored in the report list.

    The query is executed again. Stored sequences whose identifier is no
    longer returned are deleted from the object, and sequences that are
    returned but not yet stored are fetched and added. All the work is done
    on a copy of the sequence dictionary, which only replaces the object's
    data once every step has finished, so an exception raised half-way
    leaves the object untouched. If the query returns no sequence at all, a
    warning is issued and the object is left unchanged.

    All the sequences stored must have their GenBank identifier ('gi')
    information in the annotations property.

    Arguments :
        email  ( string )
            E-mail required by Bio.Entrez.

    Raises :
        ValueError
            If there is no entrez entry in the report list.
        ValueError
            If any stored sequence lacks its GenBank identifier information
            in the annotations property.

    * The e-mail is considered sensitive information and it won't be saved
    in any public or private variable of the object.
    """
    # Get the last entrez entry of the report. Only the database and the
    # query are reused; the date and source type of the old entry are not.
    for record in reversed(self._report):
        if record[1] == "entrez":
            _, _, entrez_db, query = record
            break
    else:
        raise ValueError("No entrez entry found in object's report")
    # Perform the update process in a copy of the dictionary to avoid
    # incomplete updates due to unexpected HTTP exceptions
    seq_dict = copy.copy(self.data)
    Entrez.email = email
    db_rettype = _get_entrez_db_rettype(entrez_db)
    # Execute Entrez.esearch() to get the total number of sequences that
    # match the query in the Entrez database
    handle = Entrez.esearch(db=entrez_db, term=query, rettype="count")
    try:
        num_seqs = int(Entrez.read(handle)["Count"])
    finally:
        handle.close()
    if num_seqs == 0:
        warnings.warn("The query stored didn't return any sequence")
        return
    # Page through Entrez.esearch() to get the complete list of sequence
    # identifiers. 'retstart' is the E-utilities paging offset. The offset
    # is advanced by the number of identifiers actually returned, because
    # the server may return fewer identifiers per request than 'retmax'.
    page_size = 100000
    updated_seq_ids = set()
    offset = 0
    while offset < num_seqs:
        handle = Entrez.esearch(db=entrez_db, term=query, retstart=offset,
                                retmax=page_size)
        try:
            record = Entrez.read(handle)
        finally:
            handle.close()
        id_list = record["IdList"]
        if not id_list:
            # Nothing else to read: avoid looping forever if the server
            # returns fewer identifiers than the announced count
            break
        updated_seq_ids.update(id_list)
        offset += len(id_list)
    # Get an "entrez identifier: accession" dictionary of the stored
    # sequences
    gi_acc_dict = {}
    try:
        for key, value in viewitems(seq_dict):
            gi_acc_dict[value.annotations["gi"]] = key
    except KeyError:
        raise ValueError("Missing genbank identifier")
    # Live view of the stored identifiers: it shrinks as the deprecated
    # entries are deleted from 'gi_acc_dict' below
    stored_seq_ids = viewkeys(gi_acc_dict)
    # The set difference is materialized before the loop starts, so it is
    # safe to delete from the dictionaries while iterating over it
    ids_to_remove = stored_seq_ids - updated_seq_ids
    # Remove all the sequences that are no longer in the Entrez database
    for gi_value in ids_to_remove:
        accession = gi_acc_dict[gi_value]
        del seq_dict[accession]
        del gi_acc_dict[gi_value]
    # Finally, get the list of new identifiers to fetch
    ids_to_fetch = list(updated_seq_ids.difference(stored_seq_ids))
    num_to_fetch = len(ids_to_fetch)
    if num_to_fetch > 0:
        # Fetch the first sequence on its own and use its raw text to
        # estimate the batch size for the remaining ones
        fetch_handle = Entrez.efetch(db=entrez_db, id=ids_to_fetch[0],
                                     retmode="text", rettype=db_rettype)
        try:
            record_str = fetch_handle.read()
        finally:
            fetch_handle.close()
        record = SeqIO.read(StringIO(record_str), "genbank")
        seq_dict[record.id] = record
        batch_size = _estimate_batch_size(record_str)
        # In batches of 'batch_size', fetch the Entrez database
        # information of each remaining new sequence in text format
        # through Entrez.efetch() (index 0 has already been fetched)
        for start in range(1, num_to_fetch, batch_size):
            end = start + batch_size
            fetch_handle = Entrez.efetch(db=entrez_db,
                                         id=ids_to_fetch[start:end],
                                         retmode="text",
                                         rettype=db_rettype)
            try:
                for record in SeqIO.parse(fetch_handle, "genbank"):
                    seq_dict[record.id] = record
            finally:
                fetch_handle.close()
    # The process has ended correctly so we can replace the old dictionary
    # with the new one
    self.data = seq_dict
    # Generate the corresponding report tuple
    date_time = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
    self._report.append((date_time, "entrez", entrez_db, query))