Exemple #1
0
def MafMotifScorer(species, maf, motifs):
    """Score one MAF block against a single motif or a list of motifs.

    The block's rows are rearranged to follow ``species``. A species is
    matched against the part of each component's ``src`` that precedes the
    first '.'; a species with no component in the block contributes a row of
    ``NaN`` values as wide as the block.

    Args:
        species: iterable of species names giving the row order.
        maf: alignment block exposing ``components``; each component needs
            ``src``, ``start``, ``end`` and ``text``.
        motifs: a single motif, or a list of motifs.

    Yields:
        Exactly one tuple ``(scoremax, width, headers)``. ``scoremax`` is a
        dict keyed by motif when ``motifs`` is a list, otherwise the bare
        result of ``pwmx.score_align_motif``. ``width`` is the length of the
        first component's text, and ``headers`` holds ``(src, start, end)``
        for every component of the block (in block order, not species order).
    """
    components = maf.components
    width = len(components[0].text)
    headers = [(comp.src, comp.start, comp.end) for comp in components]

    # species prefix of every component, in block order
    present = [comp.src.split('.')[0] for comp in components]

    # expand block rows to the full species list
    rows = []
    for name in species:
        try:
            position = present.index(name)
        except ValueError:
            # species absent from this block: placeholder row
            rows.append([NaN for _ in range(width)])
        else:
            rows.append(components[position].text)

    alignment = pwmx.Align(rows, headers)
    # record gap positions
    gap_filter = pwmx.score_align_gaps(alignment)

    # score motif(s)
    if isinstance(motifs, list):
        scoremax = {
            motif: pwmx.score_align_motif(alignment, motif, gap_filter)
            for motif in motifs
        }
    else:
        scoremax = pwmx.score_align_motif(alignment, motifs, gap_filter)
    yield scoremax, width, headers
Exemple #2
0
def MafMotifSelect(mafblock, pwm, motif=None, threshold=0):
    """Yield slices of ``mafblock`` whose motif scores reach ``threshold``.

    Every alignment column where the first row is not a gap is tried as a
    window start. For each row, the window text is stripped of gaps and
    scored on both strands against ``pwm`` and, when given, against the
    consensus ``motif``; the better strand is kept per row.

    Args:
        mafblock: alignment block providing ``components`` (each with a
            ``text`` attribute) and ``slice(col_start, col_end)``.
        pwm: weight matrix providing ``len()`` and ``score_seq(seq)``.
        motif: optional consensus pattern; must be as long as ``pwm``.
        threshold: minimum value the best per-row score must reach, for the
            pwm scores and (if ``motif`` is given) for the consensus scores.

    Yields:
        ``(motifmaf, pwm_score_vec, motif_score_vec)``: the sliced block and
        the per-row score lists. ``motif_score_vec`` is empty when ``motif``
        is None.

    Raises:
        ValueError: if ``motif`` is given and its length differs from
            ``len(pwm)``.
    """
    if motif is not None and len(motif) != len(pwm):
        raise ValueError("pwm and motif must be the same length")

    # generic alignment
    align = pwmx.Align([c.text for c in mafblock.components])
    nrows, ncols = align.dims
    # Required ungapped sequence length. Taken from the pwm so the default
    # motif=None is usable; a supplied motif is guaranteed above to have the
    # same length.
    min_seq_len = len(pwm)

    for start in range(ncols - min_seq_len):
        if align.rows[0][start] == '-':
            continue
        subseq = ""
        pwm_score_vec = []
        motif_score_vec = []
        max_cols = 0
        for ir in range(nrows):
            row = align.rows[ir]
            # Widen the window by the number of gaps in its first
            # min_seq_len columns. Both bounds are relative to `start`;
            # absolute bounds would give an empty window as soon as
            # start >= min_seq_len.
            # NOTE(review): the added columns may themselves contain gaps,
            # so subseq can still come out shorter than min_seq_len.
            expanded = row.count('-', start, start + min_seq_len)
            subtext = row[start:start + min_seq_len + expanded]
            max_cols = max(len(subtext), max_cols)
            subseq = subtext.replace('-', '')
            revseq = pwmx.reverse_complement(subseq)
            # pwm score: score_seq is called the same way for both strands,
            # so the same tuple slot has to be read from each result for the
            # max() below to compare like with like.
            # NOTE(review): slot 1 is presumed to be the scaled score --
            # confirm against pwm.score_seq.
            _, f_score = pwm.score_seq(subseq)[0]
            _, r_score = pwm.score_seq(revseq)[0]
            pwm_score_vec.append(max(f_score, r_score))
            # consensus score
            if motif is not None:
                for_score = int(pwmx.match_consensus(subseq, motif))
                rev_score = int(pwmx.match_consensus(revseq, motif))
                motif_score_vec.append(max(for_score, rev_score))

        best_pwm = max(pwm_score_vec)
        # No consensus scores exist when motif is None.
        best_motif = max(motif_score_vec) if motif_score_vec else None

        # Diagnostics only: a NaN best score is reported but does not by
        # itself reject the window (explicit test so it survives `-O`).
        if isnan(best_pwm) or (best_motif is not None and isnan(best_motif)):
            print(pwm_score_vec, motif_score_vec, file=sys.stderr)
            print(len(subseq), len(pwm), file=sys.stderr)

        # check threshold
        if best_pwm < threshold:
            continue
        if best_motif is not None and best_motif < threshold:
            continue

        # chop block: the end column is relative to `start`, like the window.
        # NOTE(review): the trailing "+ 1" is kept from the original; if
        # slice() is end-exclusive it adds one column past the widest
        # window -- confirm this is intended.
        col_start = start
        col_end = start + max_cols + 1
        motifmaf = mafblock.slice(col_start, col_end)
        yield motifmaf, pwm_score_vec, motif_score_vec
    """
def MafBlockScorer(pwm,species,maf):
    width = len(maf.components[0].text)
    headers = [ (c.src,c.start,c.end) for c in maf.components]

    # expand block rows to full
    mafBlockSpecies = [specName.src.split('.')[0] for specName in maf.components]
    alignlist = []
    for sp in species:
        try:
            i = mafBlockSpecies.index( sp )
            alignlist.append( maf.components[i].text )
        except ValueError:
            alignlist.append( [ NaN for n in range( width ) ] )
    alignrows = pwmx.Align( alignlist )
    scoremax = {}
    # record gap positions
    filter = pwmx.score_align_gaps( alignrows )
    # score pwm models
    for model in pwm.keys():
        #print >>sys.stderr,"%s_%d_%d" % headers[0],width,model
        scoremax[model] = pwm[model].score_align( alignrows, filter )
    yield scoremax,width,headers