def longest_prefix(symbol, db):
    """Find the best chain of `db` matches that starts at the front of `symbol`.

    Matches come from simstringMatches as (b, eX, e) triples and are sorted.
    A chain links a match to a later one when the text between them
    (symbol[e:b2]) is accepted by tokBoundaryRe. Among chains sharing a start
    offset, the one with the largest summed (e - b) wins.

    Returns a 4-tuple:
        (prefix, remainder, matches, chain)
    where `prefix` is symbol up to the end of the last match in the chain,
    `remainder` is the rest of symbol, `matches` is the sorted match list and
    `chain` lists the indices into `matches` that make up the winning chain.
    When there are no matches, or the first match neither starts at offset 0
    nor is preceded by text accepted by tokBoundaryRe, the result is
    ("", symbol, [], []).
    """
    matches = sorted(simstringMatches(symbol, (("-xxx-", db), )))
    if not matches:
        return "", symbol, [], []

    # successor[i]: start offset of the match that may follow match i across
    # a token boundary, or None if there is no such match.
    successor = [None] * len(matches)
    for idx, (_, _, end) in enumerate(matches):
        for nextBegin, _, _ in matches[idx + 1:]:
            if nextBegin <= end:
                continue
            # Only the first match starting beyond `end` is ever considered.
            if tokBoundaryRe.match(symbol[end:nextBegin]):
                successor[idx] = nextBegin
            break

    # best[offset]: (score, chain) of the best chain starting at that offset.
    best = [(None, None)] * len(symbol)
    # Walk right-to-left so a successor's entry exists before it is needed.
    for pos in reversed(range(len(matches))):
        begin, _, end = matches[pos]
        score = end - begin
        chain = [pos]
        follower = successor[pos]
        if follower:
            assert best[follower] != (None, None)
            tailScore, tailChain = best[follower]
            score += tailScore
            chain = chain + tailChain
        if best[begin] == (None, None) or score > best[begin][0]:
            best[begin] = score, chain

    #assemble the result
    firstB = matches[0][0]
    if firstB != 0 and not tokBoundaryRe.match(symbol[:firstB]):
        return "", symbol, [], []
    winner = best[firstB][1]
    cut = matches[winner[-1]][2]
    return symbol[:cut], symbol[cut:], matches, winner
def all_process(s):
    init()  #opens the various DBs

    for (b, bX, e), dbName in simstringMatches(s, symbDBs).iteritems():
        print dbName, ": ", s[b:bX]
    for (b, bX, e) in simstringMatches(s, (("-org-", orgDB), )):
        print "ORG : ", s[b:bX]
    # sys.exit()

    #print simstringMatches("Rontgen Ray upon Cancer",symbDBs)
    #print stripSymbol("activated human cyclin dependent kinases from mouse",symbDBs,orgDB,prefixDB,suffixDB,prefixCounts,suffixCounts)
    #print stripSymbol("human sphingolipid activator proteins from mouse",symbDBs,orgDB,prefixDB,suffixDB,prefixCounts,suffixCounts)
    #sys.exit()

    OFFSET = 0
    BATCH = 100
    conn = dbconnect()
    c = conn.cursor()
    while True:
        c.execute("SELECT string FROM tmp_new_unique_strings LIMIT %d,%d" %
                  (OFFSET, BATCH))
        rows = c.fetchall()
        OFFSET += BATCH
        if len(rows) == 0:
            break
        for (symbol, ) in rows:
            try:
                (pref, symb, suff), form = stripSymbol(symbol, symbDBs, orgDB,
                                                       prefixDB, suffixDB,
                                                       prefixCounts,
                                                       suffixCounts)
            except:
                symb = symbol
                print >> sys.stderr, "??? -> '" + symb + "'"
            symb = symb.replace("xxxorgxxx", "")
            norm = normform(symb)
            if norm == "":
                norm = normform(symbol)
            print norm + "\t" + symbol

    c.close()
    conn.close()
def hitsByLen(symbol, dbs):
    """Return the simstringMatches hits for `symbol` as sorted item pairs.

    The pairs are the items() of the mapping returned by simstringMatches,
    ordered descending under the matchSort comparator (presumably by match
    length, as the function name suggests -- matchSort is defined elsewhere).
    """
    hitPairs = simstringMatches(symbol, dbs).items()
    return sorted(hitPairs, cmp=matchSort, reverse=True)
def stripSymbol(symbol, symbDBs, orgDB, prefixDB, suffixDB, prefixCounts,
                suffixCounts):
    """Strip known prefixes/suffixes from `symbol` and report how it resolved.

    Returns ((prefix, core, suffix, (begin, end)), form), where `form` is:
        "NOAFFIX"  - no symbol match and no affix found; core is `symbol`.
        "PRSUFF"   - no symbol match; affixes stripped and something remains;
                     core is symbol[begin:end] with surrounding whitespace
                     trimmed.
        "OVERLAP"  - no symbol match and the affixes cover everything; the
                     affix counts decide which side gives way. core is the
                     whole organism-formatted string, only the span narrows.
        "match"    - a symbol match that is the whole string, or becomes so
                     once the affixes around it are stripped.
        "contains" - a symbol match with unstrippable text around it; core is
                     the shortest remainder + symbol + remainder candidate.

    prefixCounts / suffixCounts are indexed by the normform of an affix; a
    missing entry counts as 0.
    """
    symbolMatches = hitsByLen(symbol, symbDBs)
    orgMatches = list(simstringMatches(symbol, (("-org-", orgDB), )))
    if not symbolMatches:  #No known symbol here, find the longest prefix/suffix and strip
        orgs = bestIntervals(orgMatches)
        repl, mapBack = formatORGContext(symbol, orgs)
        # Suffixes are found by running the prefix search on the reversed text.
        reversedRepl = repl[::-1]
        pref, pref_remain, prefix_matches, prefix_matches_subset = longest_prefix(
            repl, prefixDB)
        suff, suff_remain, suffix_matches, suffix_matches_subset = longest_prefix(
            reversedRepl, suffixDB)
        affixLen = len(normform(pref)) + len(normform(suff))
        if affixLen == 0:
            return ("", symbol, "", (0, len(symbol))), "NOAFFIX"
        if affixLen < len(normform(repl)):  #Something remains, yay!
            B, E = len(pref), len(repl) - len(suff)
            # Translate offsets in the formatted string back to `symbol`.
            B, E = mapBack[B], mapBack[E - 1] + 1
            while B < E and symbol[B].isspace():
                B += 1
            while B < E - 1 and symbol[E - 1].isspace():
                E -= 1
            return (pref, symbol[B:E], suff[::-1], (B, E)), "PRSUFF"

        #We have an overlap, one way or another, revert to the original count-based algorithm
        strips = []  # (begin, end, count) in `repl` coordinates, left to right
        for idx in prefix_matches_subset:
            b, eX, e = prefix_matches[idx]
            strips.append((b, e, _affixCount(prefixCounts, repl[b:eX])))
        for idx in suffix_matches_subset[::-1]:
            b, eX, e = suffix_matches[idx]
            count = _affixCount(suffixCounts, reversedRepl[b:eX])
            # Mirror the offsets from the reversed string back onto `repl`.
            strips.append((len(repl) - e, len(repl) - b, count))
        B = 0
        E = len(repl)
        # Peel affixes off whichever end has the higher count until the two
        # ends would meet.
        while strips:
            if strips[0][2] > strips[-1][2]:
                newB, newE = strips.pop(0)[1], E
            else:
                # NOTE(review): the "- 1" moves the end one character before
                # the affix start; confirm this is intended and not an
                # off-by-one.
                newB, newE = B, strips.pop(-1)[0] - 1
            if newB >= newE:
                break
            B, E = newB, newE
        return ("", repl, "", (mapBack[B], mapBack[E - 1] + 1)), "OVERLAP"

    #We have a symbol match
    stripped_versions = []
    for (b, eX, e), symbType in symbolMatches:  #In order, longest to shortest single GGP match in the string
        if normform(symbol[:b]) == "" and normform(symbol[e:]) == "":  #Perfect match
            return ("", symbol[b:eX], "", (b, eX)), "match"
        #so these are all organism matches compatible with this symbol
        orgP, orgS = subsetMatches(orgMatches, (b, e))
        orgPMatches_resolved = bestIntervals(
            orgP)  #widest-spanning subset of orgs in prefix
        orgSMatches_resolved = bestIntervals(
            orgS)  #widest-spanning subset of orgs in suffix
        fullPrefix, mapBackPref = formatORGContext(symbol[:b],
                                                   orgPMatches_resolved)
        fullSuffix, mapBackSuff = formatORGContext(symbol[e:],
                                                   orgSMatches_resolved)
        pref, pref_remain = longest_prefix(fullPrefix, prefixDB)[:2]
        suff, suff_remain = longest_prefix(fullSuffix[::-1], suffixDB)[:2]
        suff, suff_remain = suff[::-1], suff_remain[::-1]
        if normform(pref_remain) == "" and normform(suff_remain) == "":
            #stripped down to a perfect match
            return ("", symbol[b:eX], "", (b, eX)), "match"
        # NOTE(review): the span is widened by the lengths of the *stripped*
        # affixes (pref/suff) rather than the remainders, and mapBackPref /
        # mapBackSuff are never used -- confirm the span is what callers expect.
        stripped_versions.append((pref_remain, symbol[b:e], suff_remain,
                                  (b - len(pref), e + len(suff))))

    # The loop has no break and ran at least once, so every iteration either
    # returned or recorded a candidate.
    assert stripped_versions
    # min() returns the first of equally short candidates, i.e. the one from
    # the highest-ranked symbol match.
    pref, sym, suff, be = min(
        stripped_versions, key=lambda v: len(v[0]) + len(v[1]) + len(v[2]))
    return ("", (pref + sym + suff), "", be), "contains"


def _affixCount(counts, affix):
    """Return counts[normform(affix)], or 0 when that key is absent."""
    try:
        return counts[normform(affix)]
    except KeyError:
        return 0


if __name__ == "__main__":
    init()  #opens the various DBs

    s = "activated human cyclin dependent kinases from p53-defficient NaDpH-enriched mouse embryo"
    print stripSymbolLIB(s)
    print stripSymbolLIB('dependent kinases from p53-defficient')
    print stripSymbolLIB('T(3)')
    s = 'T(3)'
    for (b, bX, e), dbName in simstringMatches(s, symbDBs).iteritems():
        print dbName, ": ", s[b:bX]
    for (b, bX, e) in simstringMatches(s, (("-org-", orgDB), )):
        print "ORG : ", s[b:bX]
    sys.exit()

    #print simstringMatches("Rontgen Ray upon Cancer",symbDBs)
    #print stripSymbol("activated human cyclin dependent kinases from mouse",symbDBs,orgDB,prefixDB,suffixDB,prefixCounts,suffixCounts)
    #print stripSymbol("human sphingolipid activator proteins from mouse",symbDBs,orgDB,prefixDB,suffixDB,prefixCounts,suffixCounts)
    #sys.exit()

    OFFSET = 0
    BATCH = 100
    conn = dbconnect()
    c = conn.cursor()
    while True: