def test_extract_by_reference_positions(self):
    """ Checks signature extraction for an alignment with gaps and for one
        without gaps, using the same reference positions in both cases.
    """
    # (aligned query, aligned reference, expected signature)
    cases = [
        # gaps in the reference are expected to be skipped when counting
        # positions, while a gap in the query shows up in the signature
        ("ABC-DE-F", "A-BC-DEF", "ACE-"),
        # without gaps, the positions are expected to index the query directly
        ("ABCDF", "ABCDE", "ABDF"),
    ]
    for query, reference, expected in cases:
        sig = utils.extract_by_reference_positions(query, reference, [0, 1, 3, 4])
        assert sig == expected
def run_kr_analysis(
        queries: Dict[str, str]
) -> Tuple[Dict[str, Prediction], Dict[str, Prediction]]:
    """ Aligns each KR domain query against the KR reference set and turns the
        extracted activity and stereochemistry signatures into predictions.

        Arguments:
            queries: a mapping of query CDS name to sequence

        Returns:
            a pair of dicts, the first mapping every query name to a
            "kr_activity" prediction (either "active" or "inactive"),
            the second mapping query name to a "kr_stereochem" prediction,
            omitting queries for which no stereochemistry was predicted
    """
    reference_name = "MAPSI|PKS|CAM00062.1|Erythromycin_synthase_modules_1_and_2|Sacc_KR1"
    active_site_positions = [110, 134, 147, 151]
    stereochem_positions = [90, 91, 92, 139, 144, 147, 149, 151]

    # first pass: collect both signatures for every query, in sorted name order
    signatures = []
    for query_name, sequence in sorted(queries.items()):
        alignment = subprocessing.run_muscle_single(query_name, sequence, _KR_DOMAINS_FILENAME)
        aligned_reference = alignment[reference_name]
        aligned_query = alignment[query_name]
        activity_sig = utils.extract_by_reference_positions(
            aligned_query, aligned_reference, active_site_positions)
        stereochem_sig = utils.extract_by_reference_positions(
            aligned_query, aligned_reference, stereochem_positions)
        signatures.append((query_name, activity_sig, stereochem_sig))

    # second pass: every query gets an activity prediction
    activity = {}  # type: Dict[str, Prediction]
    for query_name, activity_sig, _ in signatures:
        state = "active" if is_active(activity_sig) else "inactive"
        activity[query_name] = SimplePrediction("kr_activity", state)

    # third pass: stereochemistry is only recorded when a prediction exists
    stereochemistry = {}  # type: Dict[str, Prediction]
    for query_name, _, stereochem_sig in signatures:
        chem = predict_stereochemistry(stereochem_sig)
        if not chem:
            continue
        stereochemistry[query_name] = SimplePrediction("kr_stereochem", chem)

    return activity, stereochemistry
def extract_cterminus(data_dir: str, cds_features: List[CDSFeature],
                      end_cds: Optional[CDSFeature]) -> Dict[str, str]:
    """ Takes the C-terminal tail (at most the last 100 residues) of every CDS
        other than the ending one, aligns it against the C-terminal reference
        file and extracts the residues matching two fixed positions of the
        "EryAII_ref" reference sequence.

        Arguments:
            data_dir: the directory containing the C-terminal reference files
            cds_features: the list of CDSFeatures to extract terminals from
            end_cds: if not None, this CDS is skipped

        Returns:
            a dictionary mapping CDS name to the residues extracted for it
    """
    reference_path = os.path.join(data_dir, 'cterm.fasta')

    # collect the tails first, keyed by CDS name
    tails = {}  # type: Dict[str, str]
    for cds in cds_features:
        if cds is end_cds:
            continue
        translation = str(cds.translation)
        tails[cds.get_name()] = translation[-100:]

    # then align each tail and pull out the residues of interest
    residues = {}
    for cds_name, tail in tails.items():
        aligned = subprocessing.run_muscle_single(cds_name, tail, reference_path)
        aligned_query = aligned[cds_name]
        aligned_reference = aligned["EryAII_ref"]
        residues[cds_name] = utils.extract_by_reference_positions(
            aligned_query, aligned_reference, [55, 64])
    return residues
def run_kr_analysis(
        queries: Dict[str, str]) -> Tuple[Dict[str, bool], Dict[str, str]]:
    """ Aligns each KR domain query against the KR reference set, extracts
        activity and stereochemistry signatures and evaluates them.

        Arguments:
            queries: a mapping of query name to sequence

        Returns:
            a pair of dicts, the first mapping every query name to the result
            of the activity check, the second mapping query name to
            stereochemistry (e.g. A2), omitting queries for which no
            stereochemistry was predicted
    """
    reference_name = "MAPSI|PKS|CAM00062.1|Erythromycin_synthase_modules_1_and_2|Sacc_KR1"
    active_site_positions = [110, 134, 147, 151]
    stereochem_positions = [90, 91, 92, 139, 144, 147, 149, 151]

    # gather (name, activity signature, stereochemistry signature) per query,
    # processing the queries in sorted name order
    collected = []
    for query_name, sequence in sorted(queries.items()):
        alignment = subprocessing.run_muscle_single(query_name, sequence, _KR_DOMAINS_FILENAME)
        aligned_reference = alignment[reference_name]
        aligned_query = alignment[query_name]
        collected.append((
            query_name,
            utils.extract_by_reference_positions(
                aligned_query, aligned_reference, active_site_positions),
            utils.extract_by_reference_positions(
                aligned_query, aligned_reference, stereochem_positions),
        ))

    # activity is reported for every query
    activity = {query_name: is_active(activity_sig)
                for query_name, activity_sig, _ in collected}

    # stereochemistry only for those with a prediction
    stereochemistry = {}
    for query_name, _, stereochem_sig in collected:
        chem = predict_stereochemistry(stereochem_sig)
        if not chem:
            continue
        stereochemistry[query_name] = chem

    return activity, stereochemistry
def run_minowa(sequence_info: Dict[str, str], startpos: int, muscle_ref: str,
               ref_sequence: str, positions_file: str, data_dir: str,
               hmm_names: List[str]) -> Dict[str, Prediction]:
    """ Scores query sequences against a set of provided HMM profiles.

        Each query is aligned against the reference set, a signature is
        extracted from the alignment at the configured positions, and that
        signature is then scored with hmmsearch against every named profile.

        Arguments:
            sequence_info: a dict mapping sequence id to sequence
            startpos: the start offset handed to get_positions along with positions_file
            muscle_ref: the path of a file containing reference sequence to align against
            ref_sequence: the name of the reference sequence to base extractions on
            positions_file: the path of a file containing signature extraction positions
            data_dir: the directory containing HMM profiles for the current method
            hmm_names: the names of the HMM profiles for the current method

        Returns:
            a dict mapping query sequence id to a MinowaPrediction built from
            (profile name, score) pairs ordered by decreasing score
    """
    signature_positions = get_positions(positions_file, startpos)

    predictions = {}  # type: Dict[str, Prediction]
    for query_id, query_seq in sequence_info.items():
        alignment = subprocessing.run_muscle_single(query_id, query_seq, muscle_ref)
        aligned_query = alignment[query_id]
        aligned_reference = alignment[ref_sequence]
        signature = utils.extract_by_reference_positions(
            aligned_query, aligned_reference, signature_positions)

        # the signature is given to hmmsearch as a single fasta record,
        # with gap characters replaced by X
        fasta_record = ">%s\n%s\n" % (query_id, signature.replace("-", "X"))

        scores_by_profile = {
            profile: hmmsearch(fasta_record, path.join(data_dir, profile + ".hmm"))
            for profile in hmm_names
        }

        # best score first, with the profile name as the tie-breaker
        ranked = sorted(scores_by_profile.items(),
                        key=lambda pair: (pair[1], pair[0]),
                        reverse=True)
        predictions[query_id] = MinowaPrediction(ranked)

    return predictions
def run_at_domain_analysis(domains: Dict[str, str]) -> ATSignatureResults:
    """ Analyses PKS signature of AT domains

        Every domain is aligned against the AT reference set, a signature is
        extracted at the AT positions of the reference sequence, and the
        signatures are then scored against the reference PKS signatures.

        Arguments:
            domains: a dictionary mapping domain identifier (e.g. 'locus_AT2')
                     to domain sequence

        Returns:
            the result of scoring the query signatures against the reference
            signatures read from the signatures file
    """
    positions = get_at_positions(startpos=7)

    # build the signature of each query, processing domains in sorted name order
    signatures_by_domain = {}
    for domain_name, sequence in sorted(domains.items()):
        aligned = subprocessing.run_muscle_single(domain_name, sequence, _AT_DOMAINS_FILENAME)
        aligned_query = aligned[domain_name]
        aligned_reference = aligned[_REF_SEQUENCE]
        signatures_by_domain[domain_name] = utils.extract_by_reference_positions(
            aligned_query, aligned_reference, positions)

    # load reference PKS signatures and score queries against them
    reference_signatures = fasta.read_fasta(_SIGNATURES_FILENAME)
    return score_signatures(signatures_by_domain, reference_signatures)
def get_signature(query: str, hmm: str, positions: List[int]) -> str:
    """ Retrieves a signature from an aligned pair based on 1-indexed positions given.

        Arguments:
            query: the sequence of the query that the signature will be extracted from
            hmm: the sequence of the hit, used to adjust positions to account for
                 introduced gaps
            positions: a list of 1-indexed positions to use for the signature,
                       positions are relative to hit start

        Returns:
            an empty string if no positions were given or if any of the
            positions would be out of bounds of the hit, otherwise the
            signature extracted from the query for those positions
    """
    if not positions:
        # nothing to extract, and max() below would raise ValueError on an empty list
        return ""
    # '.' characters in the hit are not counted towards its length
    ungapped_length = len(str(hmm).replace('.', ''))
    if max(positions) > ungapped_length:
        # the hit was too small and a correct signature can't be generated
        return ""
    # the extraction helper is given 0-indexed positions
    return utils.extract_by_reference_positions(query, hmm, [pos - 1 for pos in positions])
def extract_nterminus(data_dir: str, cds_features: List[CDSFeature],
                      start_cds: Optional[CDSFeature]) -> Dict[str, str]:
    """ Takes the N-terminal head (at most the first 50 residues) of every CDS
        other than the starting one, aligns it against the N-terminal
        reference file and extracts the residues matching two fixed positions
        of the "EryAIII_5_6_ref" reference sequence.

        Arguments:
            data_dir: the directory containing the 'nterm.fasta' reference file
            cds_features: the list of CDSFeatures to extract terminals from
            start_cds: if not None, this CDS is skipped

        Returns:
            a dictionary mapping CDS name to the residues extracted for it
    """
    reference_path = os.path.join(data_dir, 'nterm.fasta')

    # collect the heads first, keyed by CDS name
    heads = {}
    for cds in cds_features:
        if cds is start_cds:
            continue
        translation = str(cds.translation)
        heads[cds.get_name()] = translation[:50]

    # then align each head and pull out the residues of interest
    residues = {}
    for cds_name, head in heads.items():
        aligned = subprocessing.run_muscle_single(cds_name, head, reference_path)
        aligned_query = aligned[cds_name]
        aligned_reference = aligned["EryAIII_5_6_ref"]
        residues[cds_name] = utils.extract_by_reference_positions(
            aligned_query, aligned_reference, [2, 15])
    return residues
def extract_nterminus(data_dir, genes, start_gene):
    """ Takes the N-terminal head (at most the first 50 residues) of every
        gene whose name differs from start_gene, aligns it against the
        N-terminal reference file and extracts the residues matching two
        fixed positions of the "EryAIII_5_6_ref" reference sequence.

        Arguments:
            data_dir: the directory containing the 'nterm.fasta' reference file
            genes: the genes to extract terminals from
            start_gene: the name of the gene to skip

        Returns:
            a dictionary mapping gene name to the residues extracted for it
    """
    reference_path = os.path.join(data_dir, 'nterm.fasta')

    # collect the heads first, keyed by gene name
    heads = {}
    for gene in genes:
        name = gene.get_name()
        if name == start_gene:
            continue
        translation = str(gene.translation)
        heads[name] = translation[:50]

    # then align each head and pull out the residues of interest
    residues = {}
    for name, head in heads.items():
        aligned = subprocessing.run_muscle_single(name, head, reference_path)
        aligned_query = aligned[name]
        aligned_reference = aligned["EryAIII_5_6_ref"]
        residues[name] = utils.extract_by_reference_positions(
            aligned_query, aligned_reference, [2, 15])
    return residues