Esempio n. 1
0
    def get_sample_kmers(self, sample):
        """Collect the set of kmers of interest for a sample.

        Kmers produced by get_motif_kmers for each of the sample's motifs are always included
        (when the sample has motifs). In addition, either the kmers of every sequence in the
        sample's reference fasta or every possible kmer of length k over the canonical alphabet
        is added, depending on ``sample.kmers_from_reference``.

        :param sample: AbstractSamples object
        :return: set of desired kmers
        """
        collected = set()

        # kmers generated from each motif that carries a modified base
        if sample.motifs:
            for motif in sample.motifs:
                collected.update(
                    get_motif_kmers(motif, self.k, alphabet=self.canonical))

        # restrict the remaining kmers to those seen in the reference sequence
        if sample.kmers_from_reference:
            for _header, _comment, sequence in read_fasta(sample.bwa_reference):
                collected.update(
                    get_sequence_kmers(sequence, k=self.k, rev_comp=True))
            return collected

        # otherwise fall back to every possible kmer of the canonical alphabet
        collected.update(
            all_string_permutations(self.canonical, length=self.k))
        return collected
Esempio n. 2
0
def make_positions_file(reference, output_path, motifs, overlap=False):
    """Creates a tsv file with the following format ("contig", "position", "strand", "change_from", "change_to").
    Given a reference sequence and sets of sequence motif changes we report the location of each change.

    For every contig, all forward strand ("+") rows are written first, followed by all backward
    strand ("-") rows.

    NOTE: the motifs cannot create new characters on the opposite strand!

    :param reference: path to reference sequence
    :param output_path: output path of positions file
    :param motifs: list of lists of find replace motifs ex: [("CCAGG","CFAGG"), ("CCTGG","CFTGG")]
    :param overlap: if the motif can overlap with its self, find index of overlap if set to true
    :return: output_path
    """
    # the backward sequence below is complemented but NOT reversed, so the motifs
    # themselves are reversed before being searched for on that strand
    rev_motifs = [[x[::-1] for x in motif] for motif in motifs]

    with open(output_path, "w") as outfile:
        for header, _comment, sequence in read_fasta(reference):
            fwd_seq = sequence
            bwd_seq = reverse_complement(fwd_seq,
                                         reverse=False,
                                         complement=True).upper()
            strands = (("+", fwd_seq, motifs), ("-", bwd_seq, rev_motifs))
            for strand, strand_seq, strand_motifs in strands:
                for index, old_char, substitution_char in find_motifs_sequence_positions(
                        strand_seq, strand_motifs, overlap=overlap):
                    # builtin str() instead of np.str: the numpy alias was removed in NumPy 1.24
                    outfile.write("\t".join(
                        (header, str(index), strand, old_char,
                         substitution_char)) + "\n")

    return output_path
Esempio n. 3
0
def replace_motif_reference_positions(reference_location,
                                      sub_fasta_path,
                                      motifs,
                                      overlap=False):
    """Write a copy of a reference fasta in which the given motifs have been replaced.

    note: if sub_fasta_path already exists, nothing is written and the path is returned as is

    :param reference_location: input reference
    :param sub_fasta_path: location of edited reference
    :param motifs: list of motifs which need to be replaced: eg [[find, replace]], [["CCAGG", "CEAGG"]]
    :param overlap: if overlap is possible, replace with overlap: eg [["AAA", "AAT"]] :  AAAA -> AATT
    :return: sub_fasta_path
    """
    # an existing file is reused rather than regenerated
    if os.path.isfile(sub_fasta_path):
        print(
            "[substitute_reference_positions] Substituted reference fasta file exists: {}"
            .format(sub_fasta_path))
        return sub_fasta_path

    print(
        "[substitute_reference_positions] Creating substituted reference fasta file: {}"
        .format(sub_fasta_path))
    with open(sub_fasta_path, 'w') as outfasta:
        for header, _comment, sequence in read_fasta(reference_location):
            edited_sequence = replace_motifs_sequence_positions(
                sequence, motifs, overlap)
            # the fasta comment records which motifs were substituted
            description = "substituted:{}".format(motifs)
            record = ">%s %s\n%s" % (header, description, edited_sequence)
            print(record, file=outfasta)
    return sub_fasta_path
Esempio n. 4
0
def replace_periodic_reference_positions(reference_location,
                                         sub_fasta_path,
                                         step,
                                         offset,
                                         substitution_char='X'):
    """Write a copy of a reference fasta with periodically substituted characters.

    note: if sub_fasta_path already exists, nothing is written and the path is returned as is

    :param reference_location: input reference
    :param sub_fasta_path: location of edited reference
    :param step: size of gap between substitution characters
    :param offset: offset of when to start creating substitutions
    :param substitution_char: character to replace original character
    :return: sub_fasta_path
    """
    # an existing file is reused rather than regenerated
    if os.path.isfile(sub_fasta_path):
        print(
            "[substitute_reference_positions] Substituted reference fasta file exists: {}"
            .format(sub_fasta_path))
        return sub_fasta_path

    print(
        "[substitute_reference_positions] Creating substituted reference fasta file: {}"
        .format(sub_fasta_path))
    with open(sub_fasta_path, 'w') as outfasta:
        for header, _comment, sequence in read_fasta(reference_location):
            edited_sequence = replace_periodic_sequence_positions(
                sequence, step, offset, substitution_char)
            # the fasta comment records the substitution settings used
            description = "substituted:{},step:{},offset:{}".format(
                substitution_char, step, offset)
            record = ">%s %s\n%s" % (header, description, edited_sequence)
            print(record, file=outfasta)

    return sub_fasta_path
Esempio n. 5
0
def get_first_seq(fasta_file):
    """Return the sequence of the first record in a fasta file.

    Only the first record yielded by read_fasta is consumed.

    :param fasta_file: path to an existing fasta file (checked with an assert)
    :return: the first sequence, or an empty string if the file holds no records
    """
    assert os.path.isfile(fasta_file)
    first_record = next(iter(read_fasta(fasta_file)), None)
    if first_record is None:
        return ""
    _title, _comment, first_sequence = first_record
    return first_sequence
def get_sequence(path_to_fasta):
    """Return the first sequence found in a fasta file.

    Every record is read; a notice is printed when the file holds more than one sequence.

    :param path_to_fasta: path to fasta file
    :return: the sequence of the first record
    """
    sequences = [
        sequence for _header, _comment, sequence in read_fasta(path_to_fasta)
    ]
    n_sequences = len(sequences)

    assert n_sequences > 0, "ERROR parsing sequence {}".format(n_sequences)
    if n_sequences > 1:
        print("Taking first sequence of {}".format(n_sequences))

    return sequences[0]
Esempio n. 7
0
def get_reference_sequence(path_to_fasta):
    """Return the first sequence found in a reference fasta file.

    Every record is read; a notice is printed when the file holds more than one sequence.

    :param path_to_fasta: path to the reference fasta file
    :return: the sequence of the first record
    """
    sequences = [
        sequence for _header, _comment, sequence in read_fasta(path_to_fasta)
    ]
    n_sequences = len(sequences)

    assert n_sequences > 0, "Didn't find any sequences in the reference file"

    if n_sequences > 1:
        print("[NOTICE] Found more than one sequence in the reference file, using the first one")

    return sequences[0]
Esempio n. 8
0
def processReferenceFasta(fasta,
                          work_folder,
                          motif_key=None,
                          sub_char=None,
                          positions_file=None):
    """Write forward and backward versions of a reference fasta for signalMachine.

    Loops over all of the contigs in the reference file and writes one record per contig to a
    "forward.<basename>" and a "backward.<basename>" fasta file whose paths are obtained from
    work_folder. If both files already exist they are reused; if either one is missing both
    are (re)generated so the returned paths always point at existing files.

    :param fasta: path to the un-edited reference fasta file
    :param work_folder: object providing add_file_path(name) for the output file locations
    :param motif_key: key passed to getMotif to substitute a motif in every contig
    :param sub_char: substitution character handed to the motif substitution methods
    :param positions_file: ambiguity positions file processed via CustomAmbiguityPositions
    :return: (fasta, None) if motif_key, sub_char and positions_file are all None, otherwise
             (forward fasta path, backward fasta path)
    :raises RuntimeError: if positions_file is combined with motif_key or sub_char, if
                          positions_file does not exist, or if getMotif rejects motif_key
    """
    if positions_file is not None and motif_key is not None:
        raise RuntimeError(
            "[processReferenceFasta]Cannot specify motif key and ambiguity position file"
        )
    if positions_file is not None and sub_char is not None:
        raise RuntimeError(
            "[processReferenceFasta]Cannot specify a substitution character and a ambiguity position file"
        )

    positions = None
    if positions_file is not None:
        if not os.path.exists(positions_file):
            raise RuntimeError(
                "[processReferenceFasta]Did not find ambiguity position file here: %s"
                % positions_file)
        positions = CustomAmbiguityPositions(positions_file)

    # nothing to edit: hand back the original reference untouched
    if motif_key is None and sub_char is None and positions_file is None:
        return fasta, None

    fw_fasta_path = work_folder.add_file_path("forward.{}".format(
        os.path.basename(fasta)))
    bw_fasta_path = work_folder.add_file_path("backward.{}".format(
        os.path.basename(fasta)))

    # reuse previous output only when BOTH files are present; a lone surviving file
    # would otherwise cause a path to a missing file to be returned
    if os.path.exists(fw_fasta_path) and os.path.exists(bw_fasta_path):
        return fw_fasta_path, bw_fasta_path

    print(
        "[SignalALignment.run]NOTICE: Creating forward and backward fasta files."
    )
    with open(bw_fasta_path, 'w') as bw_outfasta, \
            open(fw_fasta_path, 'w') as fw_outfasta:
        for header, _comment, sequence in read_fasta(fasta):
            if motif_key is not None:
                motif, ok = getMotif(motif_key, sequence)
                if not ok:
                    raise RuntimeError(
                        "[processReferenceFasta]Illegal motif key %s" %
                        motif_key)
                fw_sequence = motif.forwardSubstitutedSequence(sub_char)
                bw_sequence = motif.complementSubstitutedSequence(sub_char)
            elif positions is not None:
                # signalAlign likes uppercase
                fw_sequence = positions.getForwardSequence(
                    contig=header, raw_sequence=sequence.upper())
                bw_sequence = positions.getBackwardSequence(
                    contig=header, raw_sequence=sequence.upper())
            else:
                # signalAlign likes uppercase
                fw_sequence = sequence.upper()
                # backward strand is the complement only, it is not reversed
                bw_sequence = reverse_complement(fw_sequence,
                                                 reverse=False,
                                                 complement=True)

            print(">%s %s\n%s" % (header, "backward", bw_sequence),
                  file=bw_outfasta)
            print(">%s %s\n%s" % (header, "forward", fw_sequence),
                  file=fw_outfasta)

    return fw_fasta_path, bw_fasta_path
Esempio n. 9
0
def processReferenceFasta(fasta,
                          work_folder,
                          motif_key=None,
                          sub_char=None,
                          positions_file=None):
    """Write per-contig forward and backward flat sequence files for signalMachine.

    Loops over all of the contigs in the reference file and writes the forward and backward
    sequences as flat files (one line holding the sequence, no header) at paths obtained from
    work_folder.

    :param fasta: path to the reference fasta file
    :param work_folder: object providing add_file_path(name) for the output file locations
    :param motif_key: key passed to getMotif to substitute a motif in every contig
    :param sub_char: substitution character handed to the motif substitution methods
    :param positions_file: ambiguity positions file processed via CustomAmbiguityPositions
    :return: dict with the contig names as keys and, as values, a dict with the "forward" and
             "backward" paths of the processed sequence
    :raises RuntimeError: if positions_file is combined with motif_key or sub_char, if
                          positions_file does not exist, or if getMotif rejects motif_key
    """
    has_positions_file = positions_file is not None
    if has_positions_file and motif_key is not None:
        raise RuntimeError(
            "[processReferenceFasta]Cannot specify motif key and ambiguity position file"
        )
    if has_positions_file and sub_char is not None:
        raise RuntimeError(
            "[processReferenceFasta]Cannot specify a substitution character and a ambiguity position file"
        )

    positions = None
    if has_positions_file:
        if not os.path.exists(positions_file):
            raise RuntimeError(
                "[processReferenceFasta]Did not find ambiguity position file here: %s"
                % positions_file)
        positions = CustomAmbiguityPositions(positions_file)

    # the motif label allows us to make multiple copies of the reference with unique file names
    motif_lab = "%s." % motif_key if motif_key is not None else ""

    ref_sequence_map = {}
    for header, _comment, sequence in read_fasta(fasta):
        # these are the paths to the flat files that have the references
        file_stem = "%s%s.%s" % (motif_lab, header, sub_char)
        fw_path = work_folder.add_file_path(file_stem + ".forward.txt")
        bw_path = work_folder.add_file_path(file_stem + ".backward.txt")

        if motif_key is not None:
            motif, ok = getMotif(motif_key, sequence)
            if not ok:
                raise RuntimeError(
                    "[processReferenceFasta]Illegal motif key %s" % motif_key)
            fw_sequence = motif.forwardSubstitutedSequence(sub_char)
            bw_sequence = motif.complementSubstitutedSequence(sub_char)
        elif positions is not None:
            # signalAlign likes uppercase
            upper_sequence = sequence.upper()
            fw_sequence = positions.getForwardSequence(
                contig=header, raw_sequence=upper_sequence)
            bw_sequence = positions.getBackwardSequence(
                contig=header, raw_sequence=upper_sequence)
        else:
            # signalAlign likes uppercase
            fw_sequence = sequence.upper()
            # backward strand is the complement only, it is not reversed
            bw_sequence = reverse_complement(fw_sequence,
                                             reverse=False,
                                             complement=True)

        # forward file first, then backward
        for flat_path, flat_sequence in ((fw_path, fw_sequence),
                                         (bw_path, bw_sequence)):
            with open(flat_path, 'w') as flat_file:
                flat_file.write("%s\n" % flat_sequence)

        ref_sequence_map[header] = {"forward": fw_path, "backward": bw_path}

    return ref_sequence_map
Esempio n. 10
0
def processReferenceFasta(fasta,
                          work_folder,
                          name,
                          motifs=None,
                          positions_file=None):
    """Write forward and backward versions of a reference fasta for signalMachine.

    Loops over all of the contigs in the reference file and writes one record per contig to a
    "forward.<name>.<basename>" and a "backward.<name>.<basename>" fasta file whose paths are
    obtained from work_folder. Both files are (re)written on every call that requires processing.

    :param fasta: path to un-edited fasta file
    :param work_folder: FolderHandler object
    :param name: label inserted into the output file names
    :param motifs: list of tuple pairs for motif edits. ex [["CCAGG", "CEAGG"]]
    :param positions_file: ambiguous positions file which can be processed via CustomAmbiguityPositions
    :return: (fasta, None) if neither motifs nor positions_file is given, otherwise the paths to
             the edited forward reference sequence and backward reference sequence
    :raises RuntimeError: if both motifs and positions_file are given or positions_file is missing
    """
    wants_positions = positions_file is not None
    wants_motifs = motifs is not None

    # if no processing needs to happen
    if not wants_positions and not wants_motifs:
        return fasta, None
    # Cant pass positions file and motifs
    if wants_positions and wants_motifs:
        raise RuntimeError(
            "[processReferenceFasta] Cannot specify motif key and ambiguity position file"
        )

    # get positions object (if appropriate)
    positions = None
    if positions_file:
        if not os.path.exists(positions_file):
            raise RuntimeError(
                "[processReferenceFasta] Did not find ambiguity position file here: %s"
                % positions_file)
        positions = CustomAmbiguityPositions(positions_file)

    # process fasta
    fasta_basename = os.path.basename(fasta)
    fw_fasta_path = work_folder.add_file_path("forward.{}.{}".format(
        name, fasta_basename))
    bw_fasta_path = work_folder.add_file_path("backward.{}.{}".format(
        name, fasta_basename))
    print(
        "[SignalAlignment.run] NOTICE: Creating forward and backward fasta files."
    )
    with open(bw_fasta_path, 'w') as bw_outfasta, \
            open(fw_fasta_path, 'w') as fw_outfasta:
        for header, _comment, sequence in read_fasta(fasta):
            # signalAlign likes uppercase
            upper_sequence = sequence.upper()
            if positions is None:
                fw_sequence = upper_sequence
                # backward strand is the complement only, it is not reversed
                bw_sequence = reverse_complement(fw_sequence,
                                                 reverse=False,
                                                 complement=True).upper()
                if motifs:
                    # NOTE(review): the same (un-reversed) motifs are applied to the
                    # complement strand - confirm this is intended
                    fw_sequence = replace_motifs_sequence_positions(
                        fw_sequence, motifs, True)
                    bw_sequence = replace_motifs_sequence_positions(
                        bw_sequence, motifs, True)
            else:
                fw_sequence = positions.getForwardSequence(
                    contig=header, raw_sequence=upper_sequence)
                bw_sequence = positions.getBackwardSequence(
                    contig=header, raw_sequence=upper_sequence)

            print(">%s %s\n%s" % (header, "backward", bw_sequence),
                  file=bw_outfasta)
            print(">%s %s\n%s" % (header, "forward", fw_sequence),
                  file=fw_outfasta)

    return fw_fasta_path, bw_fasta_path