Python normform Exemples, produce_contexts.normform Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : saved_strip_contexts_v3.py Projet : TurkuNLP/BioCreativeVI_BioID_assignment

def all_process(s):
    """Print the DB matches found in ``s``, then normalize every stored symbol.

    First opens the databases via ``init()`` and prints, for the probe string
    ``s``, every symbol-DB match and every organism-DB match.  Then pages
    through the ``tmp_new_unique_strings`` table in batches, strips each
    symbol with ``stripSymbol`` and prints ``<normalized form>\\t<original>``
    to stdout, one line per row.

    Symbols for which ``stripSymbol`` fails are reported on stderr and are
    normalized unstripped.
    """
    init()  # opens the various DBs

    for (b, bX, e), dbName in simstringMatches(s, symbDBs).items():
        print("%s :  %s" % (dbName, s[b:bX]))
    for (b, bX, e) in simstringMatches(s, (("-org-", orgDB), )):
        print("ORG :  " + s[b:bX])

    OFFSET = 0
    BATCH = 100
    conn = dbconnect()
    try:
        c = conn.cursor()
        try:
            while True:
                # NOTE(review): paging with LIMIT but no ORDER BY relies on
                # the server returning rows in a stable order -- confirm.
                c.execute(
                    "SELECT string FROM tmp_new_unique_strings LIMIT %d,%d" %
                    (OFFSET, BATCH))
                rows = c.fetchall()
                OFFSET += BATCH
                if not rows:
                    break
                for (symbol, ) in rows:
                    try:
                        # stripSymbol returns a 4-tuple (prefix, core, suffix,
                        # span) plus a form label; unpacking only three
                        # elements always raised and disabled the stripping.
                        (pref, symb, suff, span), form = stripSymbol(
                            symbol, symbDBs, orgDB, prefixDB, suffixDB,
                            prefixCounts, suffixCounts)
                    except Exception:
                        # Best effort: fall back to the unstripped symbol.
                        symb = symbol
                        sys.stderr.write("??? -> '" + symb + "'\n")
                    # Drop the organism placeholder before normalizing.
                    symb = symb.replace("xxxorgxxx", "")
                    norm = normform(symb)
                    if norm == "":
                        # Nothing left after stripping: normalize the
                        # original string instead.
                        norm = normform(symbol)
                    print(norm + "\t" + symbol)
        finally:
            c.close()
    finally:
        conn.close()

Exemple #2

0

Afficher le fichier

Fichier : saved_strip_contexts_v3.py Projet : TurkuNLP/BioCreativeVI_BioID_assignment

def readAffixFile(fName, reverse):
    """Read an affix-count file into a ``{normalized affix: count}`` dict.

    Each non-blank line is expected to look like ``<count> <affix text>``.
    ``-org-`` inside a line is replaced by the ``xxxorgxxx`` placeholder
    before the affix is normalized with ``normform``.  Lines are skipped when
    they have no affix text, when the text does not match
    ``atLeastOneCharRe``, or when the normalized affix is shorter than three
    characters.  Counts of affixes that normalize to the same key are summed.

    Args:
        fName: path of the file to read.
        reverse: if true, the normalized affix is stored reversed.

    Returns:
        dict mapping the (possibly reversed) normalized affix to its summed
        integer count.

    Raises:
        ValueError: if the count field of a kept line is not an integer.
    """
    res = {}  # normform: count
    # "with" guarantees the handle is closed even if parsing raises.
    with open(fName, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            line = line.replace("-org-", "xxxorgxxx")
            # partition() never fails on a count-only line, unlike the
            # two-target unpacking of split(" ", 1).
            count, _, s = line.partition(" ")
            if not s:
                continue
            if not atLeastOneCharRe.match(s):
                continue
            s_n = normform(s)
            if len(s_n) < 3:
                continue
            if reverse:
                s_n = s_n[::-1]
            res[s_n] = res.get(s_n, 0) + int(count)
    return res

Exemple #3

0

Afficher le fichier

Fichier : saved_strip_contexts_v3.py Projet : TurkuNLP/BioCreativeVI_BioID_assignment

def stripSymbol(symbol, symbDBs, orgDB, prefixDB, suffixDB, prefixCounts,
                suffixCounts):
    """Strip known context prefixes/suffixes from ``symbol``.

    Returns a pair ``((prefix, core, suffix, (begin, end)), form)`` where
    ``form`` labels the path that produced the result:

    * ``"NOAFFIX"``  -- no symbol match and no prefix/suffix found; ``core``
      is the whole ``symbol``.
    * ``"PRSUFF"``   -- no symbol match; a prefix and/or suffix was removed
      and something remains between them.
    * ``"OVERLAP"``  -- no symbol match; prefix and suffix together cover the
      string, so candidate strips are removed greedily from either end by
      comparing their counts in ``prefixCounts`` / ``suffixCounts``.  ``core``
      is the organism-formatted string; only the span is narrowed.
    * ``"match"``    -- a symbol match with nothing (after ``normform``) left
      around it, either directly or after removing a known prefix/suffix.
    * ``"contains"`` -- a symbol match with leftover context; the shortest
      leftover-plus-symbol candidate is returned as ``core``.

    Raises:
        AssertionError: if symbol matches exist but none produced a result.
    """
    symbolMatches = hitsByLen(symbol, symbDBs)
    orgMatches = list(simstringMatches(symbol, (("-org-", orgDB), )))
    if not symbolMatches:  # No known symbol here, find the longest prefix/suffix and strip
        orgs = bestIntervals(orgMatches)
        repl, mapBack = formatORGContext(symbol, orgs)
        # Suffixes are looked up as prefixes of the reversed string.
        replReversed = repl[::-1]
        pref, pref_remain, prefix_matches, prefix_matches_subset = longest_prefix(
            repl, prefixDB)
        suff, suff_remain, suffix_matches, suffix_matches_subset = longest_prefix(
            replReversed, suffixDB)
        affixLen = len(normform(pref)) + len(normform(suff))
        if affixLen == 0:
            return ("", symbol, "", (0, len(symbol))), "NOAFFIX"
        if affixLen < len(normform(repl)):  # Something remains, yay!
            B, E = len(pref), len(repl) - len(suff)
            # Translate offsets in the formatted string back to ``symbol``.
            B, E = mapBack[B], mapBack[E - 1] + 1
            # Trim whitespace at both ends of the remaining span.
            while B < E and symbol[B].isspace():
                B += 1
            while B < E - 1 and symbol[E - 1].isspace():
                E -= 1
            return (pref, symbol[B:E], suff[::-1], (B, E)), "PRSUFF"

        # We have an overlap, one way or another, revert to the original
        # count-based algorithm.
        strips = []  # (begin, end, count), prefix candidates first
        for idx in prefix_matches_subset:
            b, eX, e = prefix_matches[idx]
            try:
                count = prefixCounts[normform(repl[b:eX])]
            except KeyError:
                count = 0  # affix not present in the count table
            strips.append((b, e, count))
        for idx in suffix_matches_subset[::-1]:
            b, eX, e = suffix_matches[idx]
            try:
                count = suffixCounts[normform(replReversed[b:eX])]
            except KeyError:
                count = 0  # affix not present in the count table
            # Convert offsets in the reversed string to forward offsets.
            strips.append((len(repl) - e, len(repl) - b, count))
        B = 0
        E = len(repl)
        while strips:
            # Remove the candidate with the higher count: from the front if
            # the first one wins, otherwise from the back.
            if strips[0][2] > strips[-1][2]:
                newB = strips[0][1]
                newE = E
                strips.pop(0)
            else:
                newE = strips[-1][0] - 1
                newB = B
                strips.pop(-1)
            if newB >= newE:
                # Stripping further would leave nothing; keep the last span.
                break
            B = newB
            E = newE
        return ("", repl, "", (mapBack[B], mapBack[E - 1] + 1)), "OVERLAP"

    # We have a symbol match
    stripped_versions = []
    # In order, longest to shortest single GGP match in the string
    for (b, eX, e), symbType in symbolMatches:
        pref = symbol[:b]
        suff = symbol[e:]
        if normform(pref) == "" and normform(suff) == "":  # Perfect match
            return ("", symbol[b:eX], "", (b, eX)), "match"
        # so these are all organism matches compatible with this symbol
        orgP, orgS = subsetMatches(orgMatches, (b, e))
        # widest-spanning subset of orgs in prefix
        orgPMatches_resolved = bestIntervals(orgP)
        # widest-spanning subset of orgs in suffix
        orgSMatches_resolved = bestIntervals(orgS)
        fullPrefix, mapBackPref = formatORGContext(symbol[:b],
                                                   orgPMatches_resolved)
        fullSuffix, mapBackSuff = formatORGContext(symbol[e:],
                                                   orgSMatches_resolved)
        pref, pref_remain, delme, delme2 = longest_prefix(fullPrefix, prefixDB)
        suff, suff_remain, delme, delme2 = longest_prefix(
            fullSuffix[::-1], suffixDB)
        suff, suff_remain = suff[::-1], suff_remain[::-1]
        if normform(pref_remain) == "" and normform(suff_remain) == "":
            # stripped down to a perfect match
            return ("", symbol[b:eX], "", (b, eX)), "match"
        # NOTE(review): the span is widened by the lengths of the stripped
        # affixes (pref/suff), not of the remainders that are kept, and the
        # lengths are measured in the organism-formatted text without using
        # mapBackPref/mapBackSuff -- confirm this is the intended span.
        stripped_versions.append((pref_remain, symbol[b:e], suff_remain,
                                  (b - len(pref), e + len(suff))))

    if not stripped_versions:
        # Every iteration above either returns or appends, so this is only
        # reachable on an internal inconsistency.  Raised explicitly so the
        # check survives ``python -O``.
        raise AssertionError("no stripped candidate for a matched symbol")
    # Shortest candidate wins; min() keeps the first one on ties, like a
    # stable sort followed by taking element 0.
    pref, sym, suff, be = min(
        stripped_versions,
        key=lambda cand: len(cand[0]) + len(cand[1]) + len(cand[2]))
    return ("", (pref + sym + suff), "", be), "contains"

Exemple #4

0

Afficher le fichier

Fichier : saved_strip_contexts_v3.py Projet : TurkuNLP/BioCreativeVI_BioID_assignment

                  (OFFSET, BATCH))
        rows = c.fetchall()
        OFFSET += BATCH
        if len(rows) == 0:
            break
        for (symbol, ) in rows:
            try:
                (pref, symb, suff), form = stripSymbol(symbol, symbDBs, orgDB,
                                                       prefixDB, suffixDB,
                                                       prefixCounts,
                                                       suffixCounts)
            except:
                symb = symbol
                print >> sys.stderr, "??? -> '" + symb + "'"
            symb = symb.replace("xxxorgxxx", "")
            norm = normform(symb)
            if norm == "":
                norm = normform(symbol)
            print norm + "\t" + symbol

    c.close()
    conn.close()


def all_process(s):
    init()  #opens the various DBs

    for (b, bX, e), dbName in simstringMatches(s, symbDBs).iteritems():
        print dbName, ": ", s[b:bX]
    for (b, bX, e) in simstringMatches(s, (("-org-", orgDB), )):
        print "ORG : ", s[b:bX]