コード例 #1
0
ファイル: __init__.py プロジェクト: Honglongwu/biopython
def parse(handle, format):
    """Parse the output file of a motif finding program.

    The format name is matched case-insensitively. Supported formats:
     - AlignAce:      AlignAce output file format
     - MEME:          MEME output file motif
     - MAST:          MAST output file motif
     - TRANSFAC:      TRANSFAC database file format
     - pfm:           JASPAR-style position-frequency matrix
     - jaspar:        JASPAR-style multiple PFM format
     - sites:         JASPAR-style sites file

    Files in the pfm and sites formats hold only a single motif, so
    Bio.motifs.read() is usually more convenient than Bio.motifs.parse()
    for those.

    Arguments:
     - handle: passed unchanged to the read() function of the module
       that handles the requested format.
     - format: name of the file format (see the list above).

    Returns whatever the selected module's read() function returns.

    Raises ValueError if the format name is not in the list above; the
    message contains the lower-cased name.

    For example:

    >>> from Bio import motifs
    >>> with open("Motif/alignace.out") as handle:
    ...     for m in motifs.parse(handle, "AlignAce"):
    ...         print(m.consensus)
    ...
    TCTACGATTGAG
    CTGCAGCTAGCTACGAGTGAG
    GTGCTCTAAGCATAGTAGGCG
    GCCACTAGCAGAGCAGGGGGC
    CGACTCAGAGGTT
    CCACGCTAAGAGAGGTGCCGGAG
    GCGCGTCGCTGAGCA
    GTCCATCGCAAAGCGTGGGGC
    GGGATCAGAGGGCCG
    TGGAGGCGGGG
    GACCAGAGCTTCGCATGGGGG
    GGCGTGCGTG
    GCTGGTTGCTGTTCATTAGG
    GCCGGCGGCAGCTAAAAGGG
    GAGGCCGGGGAT
    CGACTCGTGCTTAGAAGG
    """
    fmt = format.lower()

    # Each reader module is imported lazily, only when its format is requested.
    if fmt in ("pfm", "sites", "jaspar"):
        # The JASPAR reader is told which of its three flavours to expect.
        from Bio.motifs import jaspar
        return jaspar.read(handle, fmt)
    if fmt == "alignace":
        from Bio.motifs import alignace
        return alignace.read(handle)
    if fmt == "meme":
        from Bio.motifs import meme
        return meme.read(handle)
    if fmt == "mast":
        from Bio.motifs import mast
        return mast.read(handle)
    if fmt == "transfac":
        from Bio.motifs import transfac
        return transfac.read(handle)

    raise ValueError("Unknown format %s" % fmt)
コード例 #2
0
def transfac2ic(args):
    """Compute information content for each filter motif (.transfac).

    For every motif read from ``args.in_file`` one tab-separated line
    (motif ID, mean information content) is appended to ``args.out_file``.

    Attributes read from ``args``:
     - train: path handed to ``np.load`` (memory-mapped, read-only); the
       background frequencies are the mean over axis 1 and then axis 0,
       with the four resulting values taken as A, C, G, T in that order.
       NOTE(review): presumably one-hot encoded samples shaped
       (samples, positions, 4) - confirm against the caller.
     - in_file: TRANSFAC file with the motifs.
     - out_file: output path; missing parent directories are created.
       The file is opened in append mode, so existing content is kept.
    """
    # background nucleotide frequencies
    train_samples = np.load(args.train, mmap_mode='r')
    probs = np.mean(np.mean(train_samples, axis=1), axis=0)
    bg = {'A': probs[0], 'C': probs[1], 'G': probs[2], 'T': probs[3]}

    # create output directory; exist_ok avoids the check-then-create race
    out_dir = os.path.dirname(args.out_file) or "."
    os.makedirs(out_dir, exist_ok=True)

    # load all filter motifs
    with open(args.in_file) as handle:
        records = transfac.read(handle)

    # Open the output once instead of re-opening it for every motif.
    # Append mode is kept so results of earlier runs are not overwritten.
    with open(args.out_file, "a") as out:
        for m in records:
            pwm = m.counts.normalize(pseudocounts=bg)
            pwm.background = bg
            ic = compute_mean_ic(pwm)
            out.write(m.get("ID") + "\t" + str(ic) + "\n")
コード例 #3
0
ファイル: __init__.py プロジェクト: mmokrejs/biopython
def parse(handle, format):
    """Parse the output file of a motif finding program.

    Supported formats (the name is compared case-insensitively):
     - AlignAce:      AlignAce output file format
     - MEME:          MEME output file motif
     - MAST:          MAST output file motif
     - TRANSFAC:      TRANSFAC database file format
     - pfm:           JASPAR-style position-frequency matrix
     - jaspar:        JASPAR-style multiple PFM format
     - sites:         JASPAR-style sites file

    Because files in the pfm and sites formats contain only a single
    motif, Bio.motifs.read() is easier to use than Bio.motifs.parse()
    for those.

    Arguments:
     - handle: handed as-is to the read() function of the reader module.
     - format: one of the format names listed above.

    The return value is the result of the reader module's read() call.
    A ValueError is raised for any other format name.

    For example:

    >>> from Bio import motifs
    >>> with open("Motif/alignace.out") as handle:
    ...     for m in motifs.parse(handle, "AlignAce"):
    ...         print(m.consensus)
    ...
    TCTACGATTGAG
    CTGCAGCTAGCTACGAGTGAG
    GTGCTCTAAGCATAGTAGGCG
    GCCACTAGCAGAGCAGGGGGC
    CGACTCAGAGGTT
    CCACGCTAAGAGAGGTGCCGGAG
    GCGCGTCGCTGAGCA
    GTCCATCGCAAAGCGTGGGGC
    GGGATCAGAGGGCCG
    TGGAGGCGGGG
    GACCAGAGCTTCGCATGGGGG
    GGCGTGCGTG
    GCTGGTTGCTGTTCATTAGG
    GCCGGCGGCAGCTAAAAGGG
    GAGGCCGGGGAT
    CGACTCGTGCTTAGAAGG
    """
    name = format.lower()

    # Readers whose read() takes only the handle.
    if name == "alignace":
        from Bio.motifs import alignace
        return alignace.read(handle)
    if name == "meme":
        from Bio.motifs import meme
        return meme.read(handle)
    if name == "mast":
        from Bio.motifs import mast
        return mast.read(handle)
    if name == "transfac":
        from Bio.motifs import transfac
        return transfac.read(handle)

    # The JASPAR reader additionally receives the lower-cased format name.
    if name in ("pfm", "sites", "jaspar"):
        from Bio.motifs import jaspar
        return jaspar.read(handle, name)

    raise ValueError("Unknown format %s" % name)
コード例 #4
0
def parse(handle, format, strict=True):
    """Parse the output file of a motif finding program.

    The format name is matched case-insensitively. Supported formats:
     - AlignAce:         AlignAce output file format
     - ClusterBuster:    Cluster Buster position frequency matrix format
     - XMS:              XMS matrix format
     - MEME:             MEME output file motif
     - MINIMAL:          MINIMAL MEME output file motif
     - MAST:             MAST output file motif
     - TRANSFAC:         TRANSFAC database file format
     - pfm-four-columns: Generic position-frequency matrix format with four columns. (cisbp, homer, hocomoco, neph, tiffin)
     - pfm-four-rows:    Generic position-frequency matrix format with four rows. (scertf, yetfasco, hdpi, idmmpmm, flyfactor survey)
     - pfm:              JASPAR-style position-frequency matrix
     - jaspar:           JASPAR-style multiple PFM format
     - sites:            JASPAR-style sites file

    Files in the pfm and sites formats hold only a single motif, so
    Bio.motifs.read() is usually more convenient than Bio.motifs.parse()
    for those.

    Arguments:
     - handle: passed unchanged to the read() function of the module
       that handles the requested format.
     - format: name of the file format (see the list above).
     - strict: forwarded to the TRANSFAC reader only; every other format
       ignores it. When true (the default), the parser is expected to
       raise a ValueError if the file contents do not strictly comply
       with the specified file format.

    Returns whatever the selected module's read() function returns.

    Raises ValueError if the format name is not in the list above.

    For example:

    >>> from Bio import motifs
    >>> with open("motifs/alignace.out") as handle:
    ...     for m in motifs.parse(handle, "AlignAce"):
    ...         print(m.consensus)
    ...
    TCTACGATTGAG
    CTGCACCTAGCTACGAGTGAG
    GTGCCCTAAGCATACTAGGCG
    GCCACTAGCAGAGCAGGGGGC
    CGACTCAGAGGTT
    CCACGCTAAGAGAAGTGCCGGAG
    GCACGTCCCTGAGCA
    GTCCATCGCAAAGCGTGGGGC
    GAGATCAGAGGGCCG
    TGGACGCGGGG
    GACCAGAGCCTCGCATGGGGG
    AGCGCGCGTG
    GCCGGTTGCTGTTCATTAGG
    ACCGACGGCAGCTAAAAGGG
    GACGCCGGGGAT
    CGACTCGCGCTTACAAGG
    """
    fmt = format.lower()

    # Readers that are also told which flavour of their format to expect.
    if fmt in ("pfm", "sites", "jaspar"):
        from Bio.motifs import jaspar
        return jaspar.read(handle, fmt)
    if fmt in ("pfm-four-columns", "pfm-four-rows"):
        from Bio.motifs import pfm
        return pfm.read(handle, fmt)

    # The only reader that receives the strict flag.
    if fmt == "transfac":
        from Bio.motifs import transfac
        return transfac.read(handle, strict)

    # Readers whose read() takes only the handle.
    if fmt == "alignace":
        from Bio.motifs import alignace
        return alignace.read(handle)
    if fmt == "meme":
        from Bio.motifs import meme
        return meme.read(handle)
    if fmt == "minimal":
        from Bio.motifs import minimal
        return minimal.read(handle)
    if fmt == "clusterbuster":
        from Bio.motifs import clusterbuster
        return clusterbuster.read(handle)
    if fmt == "xms":
        from Bio.motifs import xms
        return xms.read(handle)
    if fmt == "mast":
        from Bio.motifs import mast
        return mast.read(handle)

    raise ValueError("Unknown format %s" % fmt)
コード例 #5
0
def parse(handle, format, strict=True):
    """Parse the output file of a motif finding program.

    Supported formats (the name is compared case-insensitively):
     - AlignAce:      AlignAce output file format
     - MEME:          MEME output file motif
     - MINIMAL:       MINIMAL MEME output file motif
     - MAST:          MAST output file motif
     - TRANSFAC:      TRANSFAC database file format
     - pfm:           JASPAR-style position-frequency matrix
     - jaspar:        JASPAR-style multiple PFM format
     - sites:         JASPAR-style sites file

    Because files in the pfm and sites formats contain only a single
    motif, Bio.motifs.read() is easier to use than Bio.motifs.parse()
    for those.

    Arguments:
     - handle: handed as-is to the read() function of the reader module.
     - format: one of the format names listed above.
     - strict: passed on to the TRANSFAC reader and unused for all other
       formats. When true (the default), the parser is expected to raise
       a ValueError if the file contents do not strictly comply with the
       specified file format.

    The return value is the result of the reader module's read() call.
    A ValueError is raised for any other format name.

    For example:

    >>> from Bio import motifs
    >>> with open("Motif/alignace.out") as handle:
    ...     for m in motifs.parse(handle, "AlignAce"):
    ...         print(m.consensus)
    ...
    TCTACGATTGAG
    CTGCAGCTAGCTACGAGTGAG
    GTGCTCTAAGCATAGTAGGCG
    GCCACTAGCAGAGCAGGGGGC
    CGACTCAGAGGTT
    CCACGCTAAGAGAGGTGCCGGAG
    GCGCGTCGCTGAGCA
    GTCCATCGCAAAGCGTGGGGC
    GGGATCAGAGGGCCG
    TGGAGGCGGGG
    GACCAGAGCTTCGCATGGGGG
    GGCGTGCGTG
    GCTGGTTGCTGTTCATTAGG
    GCCGGCGGCAGCTAAAAGGG
    GAGGCCGGGGAT
    CGACTCGTGCTTAGAAGG
    """
    fmt = format.lower()

    # Reader modules are imported on demand, one guard per format.
    if fmt == "transfac":
        # Only this reader receives the strict flag.
        from Bio.motifs import transfac
        return transfac.read(handle, strict)
    if fmt in ("pfm", "sites", "jaspar"):
        # The JASPAR reader is told which of its three flavours to expect.
        from Bio.motifs import jaspar
        return jaspar.read(handle, fmt)
    if fmt == "alignace":
        from Bio.motifs import alignace
        return alignace.read(handle)
    if fmt == "meme":
        from Bio.motifs import meme
        return meme.read(handle)
    if fmt == "minimal":
        from Bio.motifs import minimal
        return minimal.read(handle)
    if fmt == "mast":
        from Bio.motifs import mast
        return mast.read(handle)

    raise ValueError("Unknown format %s" % fmt)
コード例 #6
0
ファイル: __init__.py プロジェクト: HuttonICS/biopython
def parse(handle, format, strict=True):
    """Parse the output file of a motif finding program.

    The format name is matched case-insensitively. Supported formats:
     - AlignAce:      AlignAce output file format
     - MEME:          MEME output file motif
     - MINIMAL:       MINIMAL MEME output file motif
     - MAST:          MAST output file motif
     - TRANSFAC:      TRANSFAC database file format
     - pfm:           JASPAR-style position-frequency matrix
     - jaspar:        JASPAR-style multiple PFM format
     - sites:         JASPAR-style sites file

    Files in the pfm and sites formats hold only a single motif, so
    Bio.motifs.read() is usually more convenient than Bio.motifs.parse()
    for those.

    Arguments:
     - handle: passed unchanged to the read() function of the module
       that handles the requested format.
     - format: name of the file format (see the list above).
     - strict: forwarded to the TRANSFAC reader only; every other format
       ignores it. When true (the default), the parser is expected to
       raise a ValueError if the file contents do not strictly comply
       with the specified file format.

    Returns whatever the selected module's read() function returns.

    Raises ValueError if the format name is not in the list above.

    For example:

    >>> from Bio import motifs
    >>> with open("Motif/alignace.out") as handle:
    ...     for m in motifs.parse(handle, "AlignAce"):
    ...         print(m.consensus)
    ...
    TCTACGATTGAG
    CTGCAGCTAGCTACGAGTGAG
    GTGCTCTAAGCATAGTAGGCG
    GCCACTAGCAGAGCAGGGGGC
    CGACTCAGAGGTT
    CCACGCTAAGAGAGGTGCCGGAG
    GCGCGTCGCTGAGCA
    GTCCATCGCAAAGCGTGGGGC
    GGGATCAGAGGGCCG
    TGGAGGCGGGG
    GACCAGAGCTTCGCATGGGGG
    GGCGTGCGTG
    GCTGGTTGCTGTTCATTAGG
    GCCGGCGGCAGCTAAAAGGG
    GAGGCCGGGGAT
    CGACTCGTGCTTAGAAGG
    """
    requested = format.lower()

    # Readers whose read() takes only the handle.
    if requested == "alignace":
        from Bio.motifs import alignace
        return alignace.read(handle)
    if requested == "meme":
        from Bio.motifs import meme
        return meme.read(handle)
    if requested == "minimal":
        from Bio.motifs import minimal
        return minimal.read(handle)
    if requested == "mast":
        from Bio.motifs import mast
        return mast.read(handle)

    # Readers that take a second argument.
    if requested == "transfac":
        from Bio.motifs import transfac
        return transfac.read(handle, strict)
    if requested in ("pfm", "sites", "jaspar"):
        from Bio.motifs import jaspar
        return jaspar.read(handle, requested)

    raise ValueError("Unknown format %s" % requested)
コード例 #7
0
def parse(handle, format, strict=True):
    """Parse the output file of a motif finding program.

    Supported formats (the name is compared case-insensitively):
     - AlignAce:         AlignAce output file format
     - ClusterBuster:    Cluster Buster position frequency matrix format
     - XMS:              XMS matrix format
     - MEME:             MEME output file motif
     - MINIMAL:          MINIMAL MEME output file motif
     - MAST:             MAST output file motif
     - TRANSFAC:         TRANSFAC database file format
     - pfm-four-columns: Generic position-frequency matrix format with four columns. (cisbp, homer, hocomoco, neph, tiffin)
     - pfm-four-rows:    Generic position-frequency matrix format with four rows. (scertf, yetfasco, hdpi, idmmpmm, flyfactor survey)
     - pfm:              JASPAR-style position-frequency matrix
     - jaspar:           JASPAR-style multiple PFM format
     - sites:            JASPAR-style sites file

    Because files in the pfm and sites formats contain only a single
    motif, Bio.motifs.read() is easier to use than Bio.motifs.parse()
    for those.

    Arguments:
     - handle: handed as-is to the read() function of the reader module.
     - format: one of the format names listed above.
     - strict: passed on to the TRANSFAC reader and unused for all other
       formats. When true (the default), the parser is expected to raise
       a ValueError if the file contents do not strictly comply with the
       specified file format.

    The return value is the result of the reader module's read() call.
    A ValueError is raised for any other format name.

    For example:

    >>> from Bio import motifs
    >>> with open("motifs/alignace.out") as handle:
    ...     for m in motifs.parse(handle, "AlignAce"):
    ...         print(m.consensus)
    ...
    TCTACGATTGAG
    CTGCACCTAGCTACGAGTGAG
    GTGCCCTAAGCATACTAGGCG
    GCCACTAGCAGAGCAGGGGGC
    CGACTCAGAGGTT
    CCACGCTAAGAGAAGTGCCGGAG
    GCACGTCCCTGAGCA
    GTCCATCGCAAAGCGTGGGGC
    GAGATCAGAGGGCCG
    TGGACGCGGGG
    GACCAGAGCCTCGCATGGGGG
    AGCGCGCGTG
    GCCGGTTGCTGTTCATTAGG
    ACCGACGGCAGCTAAAAGGG
    GACGCCGGGGAT
    CGACTCGCGCTTACAAGG
    """
    kind = format.lower()

    # Readers whose read() takes only the handle.
    if kind == "alignace":
        from Bio.motifs import alignace
        return alignace.read(handle)
    if kind == "clusterbuster":
        from Bio.motifs import clusterbuster
        return clusterbuster.read(handle)
    if kind == "xms":
        from Bio.motifs import xms
        return xms.read(handle)
    if kind == "meme":
        from Bio.motifs import meme
        return meme.read(handle)
    if kind == "minimal":
        from Bio.motifs import minimal
        return minimal.read(handle)
    if kind == "mast":
        from Bio.motifs import mast
        return mast.read(handle)

    # The only reader that receives the strict flag.
    if kind == "transfac":
        from Bio.motifs import transfac
        return transfac.read(handle, strict)

    # Readers that are also told which flavour of their format to expect.
    if kind in ("pfm-four-columns", "pfm-four-rows"):
        from Bio.motifs import pfm
        return pfm.read(handle, kind)
    if kind in ("pfm", "sites", "jaspar"):
        from Bio.motifs import jaspar
        return jaspar.read(handle, kind)

    raise ValueError("Unknown format %s" % kind)
コード例 #8
0
def motif_compare(args):
    """Compare PSSMs of filter motifs.

    Motifs are read from two TRANSFAC files, turned into PSSMs using a
    background estimated from the training data, and compared with
    ``get_motif_similarity`` once per correlation measure (``pearsonr``
    and ``spearmanr``). The results are written as a tab-separated table
    to a file inside ``args.out_dir``.

    Attributes read from ``args``:
     - out_dir: output directory; created if it does not exist.
     - train_data: path handed to ``np.load`` (memory-mapped, read-only);
       the background frequencies are the mean over axis 1 and then
       axis 0, with the four resulting values taken as A, C, G, T.
       NOTE(review): presumably one-hot encoded samples shaped
       (samples, positions, 4) - confirm against the caller.
     - in_file1, in_file2: TRANSFAC files with the motifs to compare.
     - extensively: if true, every motif of the first file is compared
       with every motif of the second file; otherwise only motifs with
       the same ID are compared.
     - rc: if true, the reverse complement of the second motif is
       compared as well and the orientation with the smaller p-value
       is reported ("+" forward, "-" reverse complement).
     - shift: if true, ``args.min_overlap`` is passed to
       ``get_motif_similarity``; otherwise the length of the first PSSM.
     - min_overlap: only read when ``shift`` is true.

    Motifs are stored by their "ID" field, so a later motif replaces an
    earlier one with the same ID from the same file.
    """
    # create output directory; exist_ok avoids the check-then-create race
    os.makedirs(args.out_dir, exist_ok=True)

    # load training data to determine background nucleotide content
    train_samples = np.load(args.train_data, mmap_mode='r')
    probs = np.mean(np.mean(train_samples, axis=1), axis=0)
    bg = {'A': probs[0], 'C': probs[1], 'G': probs[2], 'T': probs[3]}

    # load all filter motifs from first file
    with open(args.in_file1) as handle:
        records1 = transfac.read(handle)

    # load all filter motifs from second file
    with open(args.in_file2) as handle:
        records2 = transfac.read(handle)

    # convert motifs to pssm's, keyed by motif ID
    pssms1 = {}
    for m1 in records1:
        pwm1 = m1.counts.normalize(pseudocounts=bg)
        pssms1[m1.get("ID")] = pwm1.log_odds(background=bg)

    pssms2 = {}
    rc_pssms2 = {}
    for m2 in records2:
        motif_id = m2.get("ID")
        pwm2 = m2.counts.normalize(pseudocounts=bg)
        pssm2 = pwm2.log_odds(background=bg)
        pssms2[motif_id] = pssm2
        if args.rc:
            # Keyed by motif ID, exactly like pssms2: the comparison loop
            # below looks the reverse complement up by ID, so keying it by
            # position in the file made that lookup fail.
            rc_pssms2[motif_id] = pssm2.reverse_complement()

    result_table = []
    # compare motifs
    for id1, pssm1 in pssms1.items():
        # Without shifting, the motifs must overlap over the full length of
        # the first PSSM; this depends only on pssm1.
        min_overlap = args.min_overlap if args.shift else pssm1.length

        for id2, pssm2 in pssms2.items():
            if not (args.extensively or id1 == id2):
                continue

            row = [id1, id2]
            for measure in (pearsonr, spearmanr):
                cor, p_value, offset = get_motif_similarity(measure, pssm1, pssm2, min_overlap)
                orientation = "+"
                if args.rc:
                    cor_rc, p_value_rc, offset_rc = get_motif_similarity(measure, pssm1, rc_pssms2[id2],
                                                                         min_overlap)
                    # The orientation is picked by p-value, not by correlation.
                    if p_value > p_value_rc:
                        cor, p_value, offset, orientation = cor_rc, p_value_rc, offset_rc, "-"
                row.extend([cor, p_value, offset, orientation])

            result_table.append(row)

    # write results to output file; the name encodes the options used
    out_file_name = args.out_dir + "/correlation_motifs" + ("_extensively" if args.extensively else "") + (
        "_rc" if args.rc else "") + ("_shift_min_overlap=" + str(args.min_overlap) if args.shift else "") + ".txt"

    # newline="" lets the csv module control line endings itself
    with open(out_file_name, 'w', newline="") as csv_file:
        file_writer = csv.writer(csv_file, delimiter="\t")
        file_writer.writerow(["ID1", "ID2", "cor_pearson", "p_value_pearson", "offset_pearson", "orientation_pearson",
                              "cor_spearman", "p_value_spearman", "offset_spearman", "orientation_spearman"])
        file_writer.writerows(result_table)