def multi_align_samples(self, retree=1):
    """Run MAFFT on each transposed FASTA built from the reference and sample genomes.

    Side effects:
      * ``self.sequence_order`` gains, for every sample in ``self.genomeFastas``
        whose FASTA contains at least one record, the list of record ids in
        file order: ``{sample_name: [sequence_id1, sequence_id2, ...]}``.
      * ``self.alignedFastas`` gains one entry per transposed file: whatever
        ``MafftTool.execute`` returns for that alignment.

    Args:
        retree: forwarded unchanged to ``MafftTool.execute`` as ``retree``.
    """
    # items() is available on both Python 2 and Python 3 mappings, so no
    # iteritems()/items() switch is needed just to iterate key/value pairs.
    for sampleName, fastaFile in self.genomeFastas.items():
        with util.file.open_or_gzopen(fastaFile, 'r') as inf:
            for seq in Bio.SeqIO.parse(inf, 'fasta'):
                # The default is passed positionally: the builtin
                # dict.setdefault() raises TypeError on a ``default=`` keyword.
                # The entry is created lazily so that a sample with no
                # records gets no key at all.
                self.sequence_order.setdefault(sampleName, []).append(seq.id)

    # The reference goes first, followed by the sample genomes in mapping order.
    inputFastas = [self.ref]
    inputFastas.extend(self.genomeFastas.values())
    # NOTE(review): presumably this regroups the per-sample files into one
    # FASTA per chromosome/segment -- confirm against interhost.
    transposedFiles = interhost.transposeChromosomeFiles(inputFastas)

    # Each transposed file is aligned on its own.
    for idx, filePath in enumerate(transposedFiles):
        # Only the directory of the generated temp name is used; the output
        # itself is written next to it as "aligned<idx>.fasta".
        outFile = util.file.mkstempfname('.fasta')
        outFilePath = os.path.dirname(outFile)

        alignedOutFile = tools.mafft.MafftTool().execute(
            inFastas=[os.path.abspath(filePath)],
            outFile=os.path.join(outFilePath,
                                 "{}{}.fasta".format("aligned", idx)),
            localpair=False,
            globalpair=True,
            preservecase=True,
            reorder=None,
            gapOpeningPenalty=None,
            offset=None,
            verbose=False,
            outputAsClustal=None,
            maxiters=1000,
            threads=-1,
            retree=retree)
        self.alignedFastas.append(alignedOutFile)
Beispiel #2
0
    def multi_align_samples(self, retree=1):
        """Run MAFFT on each transposed FASTA built from the reference and sample genomes.

        Side effects:
          * ``self.sequence_order`` gains, for every sample in
            ``self.genomeFastas`` whose FASTA contains at least one record,
            the list of record ids in file order:
            ``{sample_name: [sequence_id1, sequence_id2, ...]}``.
          * ``self.alignedFastas`` gains one entry per transposed file:
            whatever ``MafftTool.execute`` returns for that alignment.

        Args:
            retree: forwarded unchanged to ``MafftTool.execute`` as ``retree``.
        """
        # items() is available on both Python 2 and Python 3 mappings, so no
        # iteritems()/items() switch is needed just to iterate the pairs.
        for sampleName, fastaFile in self.genomeFastas.items():
            with util.file.open_or_gzopen(fastaFile, 'r') as inf:
                for seq in Bio.SeqIO.parse(inf, 'fasta'):
                    # Positional default: the builtin dict.setdefault() raises
                    # TypeError when given a ``default=`` keyword. Created
                    # lazily so a sample without records gets no key.
                    ids_for_sample = self.sequence_order.setdefault(sampleName, [])
                    ids_for_sample.append(seq.id)

        # Reference first, then the sample genomes in mapping order.
        inputFastas = [self.ref] + list(self.genomeFastas.values())
        # NOTE(review): presumably yields one FASTA per chromosome/segment
        # holding that chromosome from every input -- confirm in interhost.
        transposedFiles = interhost.transposeChromosomeFiles(inputFastas)

        # Align every transposed file independently.
        for idx, filePath in enumerate(transposedFiles):
            # The temp name is generated only to learn the temp directory;
            # the alignment is written there as "aligned<idx>.fasta".
            outFile = util.file.mkstempfname('.fasta')
            outFilePath = os.path.dirname(outFile)

            alignedOutFile = tools.mafft.MafftTool().execute(
                inFastas=[os.path.abspath(filePath)],
                outFile=os.path.join(outFilePath, "{}{}.fasta".format("aligned", idx)),
                localpair=False,
                globalpair=True,
                preservecase=True,
                reorder=None,
                gapOpeningPenalty=None,
                offset=None,
                verbose=False,
                outputAsClustal=None,
                maxiters=1000,
                threads=-1,
                retree=retree)
            self.alignedFastas.append(alignedOutFile)
Beispiel #3
0
def alignment_summary(inFastaFileOne, inFastaFileTwo, outfileName=None, printCounts=False):
    """ Write or print pairwise alignment summary information for sequences in two FASTA
        files, including SNPs, ambiguous bases, and indels.

        Each file produced by interhost.transposeChromosomeFiles() for the two
        inputs is aligned with MUSCLE (clustal output) and the first two rows of
        the alignment are compared column by column. Counters:

          same_unambig   both bases unambiguous and equal
          snp_unambig    both bases unambiguous and different
          indel_unambig  a gap opposite an unambiguous base
          indel_ambig    a gap opposite an 'N'
          ambig_one      only the first sequence has 'N'
          ambig_two      only the second sequence has 'N'
          ambig_both     both sequences have 'N'
          unambig_both   both bases unambiguous

        Args:
            inFastaFileOne: path of the first FASTA file.
            inFastaFileTwo: path of the second FASTA file.
            outfileName: if truthy, the totals are written to this path as a
                tab-delimited header row followed by one value row.
            printCounts: if truthy, per-alignment counts and the overall
                totals are printed.
    """
    gap = '-'
    ambiguous = 'N'
    aligner = tools.muscle.MuscleTool()

    per_chr_fastas = interhost.transposeChromosomeFiles([inFastaFileOne, inFastaFileTwo])

    # Built once: the alphabet is constant, so there is no reason to
    # instantiate it again for every column of every alignment.
    unambiguous = IUPACUnambiguousDNA().letters

    # Single source of truth for counter names and their output order.
    counter_names = (
        "same_unambig",
        "snp_unambig",
        "indel_unambig",
        "indel_ambig",
        "ambig_one",
        "ambig_two",
        "ambig_both",
        "unambig_both",
    )
    # Labels are padded to the longest name ("indel_unambig") so values line up.
    label_width = 13

    def new_counts():
        return OrderedDict((name, 0) for name in counter_names)

    def print_counts(header, counts):
        print(header)
        for name in counter_names:
            print(name.ljust(label_width), counts[name])

    results = new_counts()

    for chr_fasta in per_chr_fastas:
        counts = new_counts()

        alignOutFileName = util.file.mkstempfname('.fasta')
        aligner.execute(chr_fasta, alignOutFileName, fmt="clw")

        with open(alignOutFileName, "r") as f:
            alignment = Bio.AlignIO.read(f, "clustal")

            for col_idx in range(0, alignment.get_alignment_length()):
                col = alignment[:, col_idx]
                c1 = col[0]
                c2 = col[1]

                c1_ambig = c1 in ambiguous
                c2_ambig = c2 in ambiguous
                c1_unambig = c1 in unambiguous
                c2_unambig = c2 in unambiguous

                # Ambiguity categories are mutually exclusive.
                if c1_ambig and c2_ambig:
                    counts["ambig_both"] += 1
                elif c1_ambig:
                    counts["ambig_one"] += 1
                elif c2_ambig:
                    counts["ambig_two"] += 1

                if c1_unambig and c2_unambig:
                    counts["unambig_both"] += 1
                    if c1 == c2:
                        counts["same_unambig"] += 1
                    else:
                        counts["snp_unambig"] += 1

                if (c1 == gap and c2_unambig) or (c2 == gap and c1_unambig):
                    counts["indel_unambig"] += 1

                # A gap opposite 'N' is counted here in addition to
                # ambig_one/ambig_two above.
                if (c1 == gap and c2_ambig) or (c2 == gap and c1_ambig):
                    counts["indel_ambig"] += 1

        if printCounts:
            print_counts("Counts for this segment/chromosome:", counts)

        for name in counter_names:
            results[name] += counts[name]

    if printCounts:
        print_counts("\nCounts for this sample:", results)

    if outfileName:
        with open(outfileName, "wt") as of:
            csvout = csv.writer(of, delimiter='\t')
            csvout.writerow(list(results.keys()))
            csvout.writerow(list(results.values()))
Beispiel #4
0
def alignment_summary(inFastaFileOne,
                      inFastaFileTwo,
                      outfileName=None,
                      printCounts=False):
    """ Write or print pairwise alignment summary information for sequences in two FASTA
        files, including SNPs, ambiguous bases, and indels.

        The two inputs are passed through interhost.transposeChromosomeFiles();
        each resulting file is aligned with MUSCLE (clustal output) and the
        first two rows of the alignment are compared column by column.

        Args:
            inFastaFileOne: path of the first FASTA file.
            inFastaFileTwo: path of the second FASTA file.
            outfileName: if truthy, totals over all alignments are written to
                this path as a tab-delimited header row plus one value row.
            printCounts: if truthy, print the counts of each alignment and the
                overall totals.
    """
    gap = '-'
    ambiguous = 'N'
    aligner = tools.muscle.MuscleTool()

    per_chr_fastas = interhost.transposeChromosomeFiles(
        [inFastaFileOne, inFastaFileTwo])

    # The unambiguous alphabet never changes, so look it up once instead of
    # constructing IUPACUnambiguousDNA() several times for every column.
    unambiguous = IUPACUnambiguousDNA().letters

    # Totals across all alignments; insertion order is the output column order.
    results = OrderedDict()
    results["same_unambig"] = 0
    results["snp_unambig"] = 0
    results["indel_unambig"] = 0
    results["indel_ambig"] = 0
    results["ambig_one"] = 0
    results["ambig_two"] = 0
    results["ambig_both"] = 0
    results["unambig_both"] = 0

    for chr_fasta in per_chr_fastas:
        # Counters for this alignment only.
        same_unambig = 0
        snp_unambig = 0
        indel_unambig = 0
        indel_ambig = 0
        ambig_one = 0
        ambig_two = 0
        ambig_both = 0
        unambig_both = 0

        alignOutFileName = util.file.mkstempfname('.fasta')
        aligner.execute(chr_fasta, alignOutFileName, fmt="clw")

        with open(alignOutFileName, "r") as f:
            alignment = Bio.AlignIO.read(f, "clustal")

            for col_idx in range(0, alignment.get_alignment_length()):
                col = alignment[:, col_idx]
                c1 = col[0]
                c2 = col[1]

                # Evaluate each membership test once per column.
                c1_is_n = c1 in ambiguous
                c2_is_n = c2 in ambiguous
                c1_is_base = c1 in unambiguous
                c2_is_base = c2 in unambiguous
                c1_is_gap = c1 == gap
                c2_is_gap = c2 == gap

                # 'N' on both sides, only the first, or only the second.
                if c1_is_n and c2_is_n:
                    ambig_both += 1
                elif c1_is_n:
                    ambig_one += 1
                elif c2_is_n:
                    ambig_two += 1

                # Two called bases: either a match or a SNP.
                if c1_is_base and c2_is_base:
                    unambig_both += 1
                    if c1 == c2:
                        same_unambig += 1
                    else:
                        snp_unambig += 1

                # Gap opposite a called base.
                if (c1_is_gap and c2_is_base) or (c2_is_gap and c1_is_base):
                    indel_unambig += 1

                # Gap opposite 'N' (also counted as ambig_one/ambig_two above).
                if (c1_is_gap and c2_is_n) or (c2_is_gap and c1_is_n):
                    indel_ambig += 1

        if printCounts:
            print("Counts for this segment/chromosome:")
            print("same_unambig ", same_unambig)
            print("snp_unambig  ", snp_unambig)
            print("indel_unambig", indel_unambig)
            print("indel_ambig  ", indel_ambig)
            print("ambig_one    ", ambig_one)
            print("ambig_two    ", ambig_two)
            print("ambig_both   ", ambig_both)
            print("unambig_both ", unambig_both)

        results["same_unambig"] += same_unambig
        results["snp_unambig"] += snp_unambig
        results["indel_unambig"] += indel_unambig
        results["indel_ambig"] += indel_ambig
        results["ambig_one"] += ambig_one
        results["ambig_two"] += ambig_two
        results["ambig_both"] += ambig_both
        results["unambig_both"] += unambig_both

    if printCounts:
        print("\nCounts for this sample:")
        print("same_unambig ", results["same_unambig"])
        print("snp_unambig  ", results["snp_unambig"])
        print("indel_unambig", results["indel_unambig"])
        print("indel_ambig  ", results["indel_ambig"])
        print("ambig_one    ", results["ambig_one"])
        print("ambig_two    ", results["ambig_two"])
        print("ambig_both   ", results["ambig_both"])
        print("unambig_both ", results["unambig_both"])

    if outfileName:
        with open(outfileName, "wt") as of:
            csvout = csv.writer(of, delimiter='\t')
            csvout.writerow(list(results.keys()))
            csvout.writerow(list(results.values()))