def get_sample_kmers(self, sample):
    """Build the set of kmers to use for a sample.

    Kmers covering each motif in `sample.motifs` are always included (when motifs are present).
    In addition, either the kmers seen in the sample's reference fasta (both strands) or every
    kmer of length `self.k` over the canonical alphabet is added.

    :param sample: AbstractSamples object; `motifs`, `kmers_from_reference` and `bwa_reference` are read
    :return: set of desired kmers
    """
    wanted = set()
    # an empty / missing motif list simply contributes nothing
    for motif in sample.motifs or ():
        wanted |= get_motif_kmers(motif, self.k, alphabet=self.canonical)

    if not sample.kmers_from_reference:
        # no restriction requested: take every possible kmer
        wanted |= set(all_string_permutations(self.canonical, length=self.k))
        return wanted

    # restrict to kmers that actually occur in the reference sequence
    for _header, _comment, sequence in read_fasta(sample.bwa_reference):
        wanted |= get_sequence_kmers(sequence, k=self.k, rev_comp=True)
    return wanted
def make_positions_file(reference, output_path, motifs, overlap=False):
    """Creates a tsv file with the following format ("contig", "position", "strand", "change_from", "change_to").
    Given a reference sequence and sets of sequence motif changes we report the location of each change.

    NOTE: the motifs cannot create new characters on the opposite strand!

    :param reference: path to reference sequence
    :param output_path: output path of positions file
    :param motifs: list of lists of find replace motifs ex: [("CCAGG","CFAGG"), ("CCTGG","CFTGG")]
    :param overlap: if the motif can overlap with its self, find index of overlap if set to true
    :return: output_path
    """
    # the "-" strand is searched on the complemented (not reversed) sequence,
    # so every motif string is reversed to line up with it
    rev_motifs = [[x[::-1] for x in motif] for motif in motifs]

    with open(output_path, "w") as outfile:
        for header, _comment, sequence in read_fasta(reference):
            fwd_seq = sequence
            bwd_seq = reverse_complement(fwd_seq, reverse=False, complement=True).upper()
            # all forward-strand rows are written before the backward-strand rows of a contig
            for strand, strand_seq, strand_motifs in (("+", fwd_seq, motifs),
                                                      ("-", bwd_seq, rev_motifs)):
                for index, old_char, substitution_char in find_motifs_sequence_positions(
                        strand_seq, strand_motifs, overlap=overlap):
                    # builtin str: the np.str alias no longer exists in current NumPy releases
                    outfile.write("\t".join(
                        (header, str(index), strand, old_char, substitution_char)) + "\n")
    return output_path
def replace_motif_reference_positions(reference_location, sub_fasta_path, motifs, overlap=False):
    """Write a copy of a reference fasta with motifs substituted in every sequence.

    note: if sub_fasta_path already exists it is returned as-is and nothing is written

    :param reference_location: input reference
    :param sub_fasta_path: location of edited reference
    :param motifs: list of motifs which need to be replaced: eg [[find, replace]], [["CCAGG", "CEAGG"]]
    :param overlap: forwarded to replace_motifs_sequence_positions; if overlap is possible,
                    replace with overlap: eg [["AAA", "AAT"]] : AAAA -> AATT
    :return: sub_fasta_path
    """
    if os.path.isfile(sub_fasta_path):
        # never overwrite an existing substituted reference
        exists_msg = "[substitute_reference_positions] Substituted reference fasta file exists: {}"
        print(exists_msg.format(sub_fasta_path))
        return sub_fasta_path

    create_msg = "[substitute_reference_positions] Creating substituted reference fasta file: {}"
    print(create_msg.format(sub_fasta_path))
    with open(sub_fasta_path, 'w') as out_handle:
        for header, _comment, sequence in read_fasta(reference_location):
            edited = replace_motifs_sequence_positions(sequence, motifs, overlap)
            # the record comment notes which motifs were substituted
            record = ">%s %s\n%s" % (header, "substituted:{}".format(motifs), edited)
            print(record, file=out_handle)
    return sub_fasta_path
def replace_periodic_reference_positions(reference_location, sub_fasta_path, step, offset, substitution_char='X'):
    """Write a copy of a reference fasta with periodic characters replaced in every sequence.

    note: if sub_fasta_path exists it will return the path without creating a new file

    :param reference_location: input reference
    :param sub_fasta_path: location of edited reference
    :param step: size of gap between substitution characters
    :param offset: offset of when to start creating substitutions
    :param substitution_char: character to replace original character
    :return: sub_fasta_path
    """
    if os.path.isfile(sub_fasta_path):
        # never overwrite an existing substituted reference
        exists_msg = "[substitute_reference_positions] Substituted reference fasta file exists: {}"
        print(exists_msg.format(sub_fasta_path))
        return sub_fasta_path

    create_msg = "[substitute_reference_positions] Creating substituted reference fasta file: {}"
    print(create_msg.format(sub_fasta_path))
    with open(sub_fasta_path, 'w') as out_handle:
        for header, _comment, sequence in read_fasta(reference_location):
            edited = replace_periodic_sequence_positions(sequence, step, offset, substitution_char)
            # the record comment notes the substitution settings used
            settings = "substituted:{},step:{},offset:{}".format(substitution_char, step, offset)
            print(">%s %s\n%s" % (header, settings, edited), file=out_handle)
    return sub_fasta_path
def get_first_seq(fasta_file):
    """Return the sequence of the first record in a fasta file.

    :param fasta_file: path to a fasta file; asserted to be an existing file
    :return: sequence of the first record yielded by read_fasta, or "" if there are no records
    """
    assert os.path.isfile(fasta_file)
    for _header, _comment, first_sequence in read_fasta(fasta_file):
        # only the first record is wanted, so stop reading immediately
        return "" + first_sequence
    return ""
def get_sequence(path_to_fasta):
    """Return the first sequence in a fasta file.

    Every record is read; a message is printed when more than one sequence is found.

    :param path_to_fasta: path to fasta file
    :return: sequence of the first record
    """
    all_sequences = [sequence for _header, _comment, sequence in read_fasta(path_to_fasta)]
    n_found = len(all_sequences)
    assert n_found > 0, "ERROR parsing sequence {}".format(n_found)
    if n_found > 1:
        print("Taking first sequence of {}".format(n_found))
    return all_sequences[0]
def get_reference_sequence(path_to_fasta):
    """Return the first sequence in a reference fasta file.

    Every record is read; a notice is printed when more than one sequence is found.

    :param path_to_fasta: path to reference fasta file
    :return: sequence of the first record
    """
    all_sequences = [sequence for _header, _comment, sequence in read_fasta(path_to_fasta)]
    n_found = len(all_sequences)
    assert n_found > 0, "Didn't find any sequences in the reference file"
    if n_found > 1:
        print("[NOTICE] Found more than one sequence in the reference file, using the first one")
    return all_sequences[0]
def processReferenceFasta(fasta, work_folder, motif_key=None, sub_char=None, positions_file=None):
    """Loops over all of the contigs in the reference file and writes the forward and backward
    sequences to two fasta files (one record per contig) inside the work folder.

    When no motif key, substitution character or positions file is given, nothing is written and
    the original fasta path is returned.

    :param fasta: path to reference fasta file
    :param work_folder: object whose add_file_path(file_name) returns the output path to use
    :param motif_key: key handed to getMotif to select the motif to substitute
    :param sub_char: substitution character handed to the motif object
    :param positions_file: ambiguity positions file, loaded via CustomAmbiguityPositions
    :return: (forward fasta path, backward fasta path), or (fasta, None) if no processing is requested
    :raises RuntimeError: if a positions file is combined with a motif key or substitution character,
                          if the positions file does not exist, or if getMotif rejects the motif key
    """
    # if no processing needs to happen
    if motif_key is None and sub_char is None and positions_file is None:
        return fasta, None

    positions = None
    if positions_file is not None:
        if motif_key is not None:
            raise RuntimeError(
                "[processReferenceFasta]Cannot specify motif key and ambiguity position file")
        if sub_char is not None:
            raise RuntimeError(
                "[processReferenceFasta]Cannot specify a substitution character and a ambiguity position file")
        if not os.path.exists(positions_file):
            raise RuntimeError(
                "[processReferenceFasta]Did not find ambiguity position file here: %s" % positions_file)
        positions = CustomAmbiguityPositions(positions_file)

    fasta_name = os.path.basename(fasta)
    fw_fasta_path = work_folder.add_file_path("forward.{}".format(fasta_name))
    bw_fasta_path = work_folder.add_file_path("backward.{}".format(fasta_name))

    # Regenerate when EITHER output is missing: skipping the write when only one file exists
    # would hand back a path to a file that was never created.
    if not (os.path.exists(fw_fasta_path) and os.path.exists(bw_fasta_path)):
        print("[SignalALignment.run]NOTICE: Creating forward and backward fasta files.")
        with open(bw_fasta_path, 'w') as bw_outfasta, open(fw_fasta_path, 'w') as fw_outfasta:
            for header, _comment, sequence in read_fasta(fasta):
                # signalAlign likes uppercase
                if motif_key is not None:
                    motif, ok = getMotif(motif_key, sequence)
                    if not ok:
                        raise RuntimeError(
                            "[processReferenceFasta]Illegal motif key %s" % motif_key)
                    fw_sequence = motif.forwardSubstitutedSequence(sub_char)
                    bw_sequence = motif.complementSubstitutedSequence(sub_char)
                elif positions is not None:
                    fw_sequence = positions.getForwardSequence(
                        contig=header, raw_sequence=sequence.upper())
                    bw_sequence = positions.getBackwardSequence(
                        contig=header, raw_sequence=sequence.upper())
                else:
                    # only a substitution character was given: plain upper-cased strands
                    fw_sequence = sequence.upper()
                    bw_sequence = reverse_complement(fw_sequence, reverse=False, complement=True)

                print(">%s %s\n%s" % (header, "backward", bw_sequence), file=bw_outfasta)
                print(">%s %s\n%s" % (header, "forward", fw_sequence), file=fw_outfasta)

    return fw_fasta_path, bw_fasta_path
def processReferenceFasta(fasta, work_folder, motif_key=None, sub_char=None, positions_file=None):
    """Loops over all of the contigs in the reference file and writes the forward and backward
    sequences as flat files (a single sequence line, no header) for signalMachine.

    :param fasta: path to reference fasta file
    :param work_folder: object whose add_file_path(file_name) returns the output path to use
    :param motif_key: key handed to getMotif to select the motif to substitute
    :param sub_char: substitution character handed to the motif object (also part of the file names)
    :param positions_file: ambiguity positions file, loaded via CustomAmbiguityPositions
    :return: dict with the sequence names as keys and {"forward": path, "backward": path} as values
    :raises RuntimeError: if a positions file is combined with a motif key or substitution character,
                          if the positions file does not exist, or if getMotif rejects the motif key
    """
    using_positions = positions_file is not None
    if using_positions and motif_key is not None:
        raise RuntimeError(
            "[processReferenceFasta]Cannot specify motif key and ambiguity position file")
    if using_positions and sub_char is not None:
        raise RuntimeError(
            "[processReferenceFasta]Cannot specify a substitution character and a ambiguity position file")

    positions = None
    if using_positions:
        if not os.path.exists(positions_file):
            raise RuntimeError(
                "[processReferenceFasta]Did not find ambiguity position file here: %s" % positions_file)
        positions = CustomAmbiguityPositions(positions_file)

    ref_sequence_map = {}
    for header, _comment, sequence in read_fasta(fasta):
        # the motif label allows us to make multiple copies of the reference with unique file names
        if motif_key is None:
            motif_lab = ""
        else:
            motif_lab = "%s." % motif_key

        # these are the paths to the flat files that have the references
        fw_path = work_folder.add_file_path("%s%s.%s.forward.txt" % (motif_lab, header, sub_char))
        bw_path = work_folder.add_file_path("%s%s.%s.backward.txt" % (motif_lab, header, sub_char))

        # signalAlign likes uppercase
        if motif_key is not None:
            motif, ok = getMotif(motif_key, sequence)
            if not ok:
                raise RuntimeError("[processReferenceFasta]Illegal motif key %s" % motif_key)
            fw_sequence = motif.forwardSubstitutedSequence(sub_char)
            bw_sequence = motif.complementSubstitutedSequence(sub_char)
        elif positions is not None:
            fw_sequence = positions.getForwardSequence(contig=header, raw_sequence=sequence.upper())
            bw_sequence = positions.getBackwardSequence(contig=header, raw_sequence=sequence.upper())
        else:
            fw_sequence = sequence.upper()
            bw_sequence = reverse_complement(fw_sequence, reverse=False, complement=True)

        # forward file first, then backward; each holds exactly one line
        for flat_path, flat_sequence in ((fw_path, fw_sequence), (bw_path, bw_sequence)):
            with open(flat_path, 'w') as flat_handle:
                flat_handle.write("%s\n" % flat_sequence)

        ref_sequence_map[header] = {"forward": fw_path, "backward": bw_path}

    return ref_sequence_map
def processReferenceFasta(fasta, work_folder, name, motifs=None, positions_file=None):
    """Loops over all of the contigs in the reference file and writes the forward and backward
    sequences to two fasta files (one record per contig) inside the work folder.

    :param fasta: path to un-edited fasta file
    :param work_folder: FolderHandler object
    :param name: label inserted into the output file names ("forward.<name>.<fasta basename>")
    :param motifs: list of tuple pairs for motif edits. ex [["CCAGG", "CEAGG"]]
    :param positions_file: ambiguous positions file which can be processed via CustomAmbiguityPositions
    :return: paths to possibly edited forward reference sequence and backward reference sequence;
             (fasta, None) when neither motifs nor a positions file is given
    :raises RuntimeError: if both motifs and a positions file are given, or the positions file is missing
    """
    have_positions_file = positions_file is not None
    have_motifs = motifs is not None

    # if no processing needs to happen
    if not have_positions_file and not have_motifs:
        return fasta, None
    # Cant pass positions file and motifs
    if have_positions_file and have_motifs:
        raise RuntimeError(
            "[processReferenceFasta] Cannot specify motif key and ambiguity position file")

    # get positions object (if appropriate)
    positions = None
    if positions_file:
        if not os.path.exists(positions_file):
            raise RuntimeError(
                "[processReferenceFasta] Did not find ambiguity position file here: %s" % positions_file)
        positions = CustomAmbiguityPositions(positions_file)

    # process fasta
    fasta_name = os.path.basename(fasta)
    fw_fasta_path = work_folder.add_file_path("forward.{}.{}".format(name, fasta_name))
    bw_fasta_path = work_folder.add_file_path("backward.{}.{}".format(name, fasta_name))

    print("[SignalAlignment.run] NOTICE: Creating forward and backward fasta files.")
    with open(bw_fasta_path, 'w') as bw_outfasta, open(fw_fasta_path, 'w') as fw_outfasta:
        for header, _comment, sequence in read_fasta(fasta):
            # signalAlign likes uppercase
            upper_sequence = sequence.upper()
            if positions is None:
                fw_sequence = upper_sequence
                bw_sequence = reverse_complement(fw_sequence, reverse=False, complement=True).upper()
            else:
                fw_sequence = positions.getForwardSequence(contig=header, raw_sequence=upper_sequence)
                bw_sequence = positions.getBackwardSequence(contig=header, raw_sequence=upper_sequence)

            # motifs and positions are mutually exclusive (checked above), so this only
            # ever edits the plain upper-cased strands
            if motifs:
                fw_sequence = replace_motifs_sequence_positions(fw_sequence, motifs, True)
                bw_sequence = replace_motifs_sequence_positions(bw_sequence, motifs, True)

            print(">%s %s\n%s" % (header, "backward", bw_sequence), file=bw_outfasta)
            print(">%s %s\n%s" % (header, "forward", fw_sequence), file=fw_outfasta)

    return fw_fasta_path, bw_fasta_path