def longest_prefix(symbol, db):
    """Strip the best chain of `db` matches from the front of `symbol`.

    Hits are fetched with simstringMatches under the "-xxx-" label and sorted.
    Each hit is a 3-tuple; only its first item (start offset) and last item
    (end offset) are used here. Hits are chained when the text between the end
    of one and the start of a later one matches tokBoundaryRe, and the chain
    covering the most characters wins.

    Returns a 4-tuple (stripped_prefix, remainder, all_hits, chain_indices),
    where chain_indices index into all_hits. When there are no hits, or the
    first hit neither starts at offset 0 nor is preceded only by text matching
    tokBoundaryRe, returns ("", symbol, [], []).
    """
    hits = sorted(simstringMatches(symbol, (("-xxx-", db), )))
    if not hits:
        return "", symbol, [], []
    hitCount = len(hits)

    # successor[i]: start offset of the first later hit that begins after
    # hit i ends and is separated from it only by a token boundary.
    successor = [None] * hitCount
    for i, (start, _, end) in enumerate(hits):
        for laterStart, _, _ in hits[i + 1:]:
            if laterStart > end and tokBoundaryRe.match(symbol[end:laterStart]):
                successor[i] = laterStart
                break

    # best[pos]: (covered length, hit indices) of the best chain whose first
    # hit starts at pos. Filled back to front so successors are ready first.
    unset = (None, None)
    best = [unset] * len(symbol)
    for i in reversed(range(hitCount)):
        start, _, end = hits[i]
        score = end - start
        chain = [i]
        follower = successor[i]
        if follower:
            assert best[follower] != unset
            score += best[follower][0]
            chain = chain + best[follower][1]
        current = best[start]
        if current == unset or score > current[0]:
            best[start] = (score, chain)

    #assemble the result
    firstStart = hits[0][0]
    if firstStart != 0 and not tokBoundaryRe.match(symbol[:firstStart]):
        return "", symbol, [], []
    winningChain = best[firstStart][1]
    cut = hits[winningChain[-1]][2]
    return symbol[:cut], symbol[cut:], hits, winningChain
def all_process(s): init() #opens the various DBs for (b, bX, e), dbName in simstringMatches(s, symbDBs).iteritems(): print dbName, ": ", s[b:bX] for (b, bX, e) in simstringMatches(s, (("-org-", orgDB), )): print "ORG : ", s[b:bX] # sys.exit() #print simstringMatches("Rontgen Ray upon Cancer",symbDBs) #print stripSymbol("activated human cyclin dependent kinases from mouse",symbDBs,orgDB,prefixDB,suffixDB,prefixCounts,suffixCounts) #print stripSymbol("human sphingolipid activator proteins from mouse",symbDBs,orgDB,prefixDB,suffixDB,prefixCounts,suffixCounts) #sys.exit() OFFSET = 0 BATCH = 100 conn = dbconnect() c = conn.cursor() while True: c.execute("SELECT string FROM tmp_new_unique_strings LIMIT %d,%d" % (OFFSET, BATCH)) rows = c.fetchall() OFFSET += BATCH if len(rows) == 0: break for (symbol, ) in rows: try: (pref, symb, suff), form = stripSymbol(symbol, symbDBs, orgDB, prefixDB, suffixDB, prefixCounts, suffixCounts) except: symb = symbol print >> sys.stderr, "??? -> '" + symb + "'" symb = symb.replace("xxxorgxxx", "") norm = normform(symb) if norm == "": norm = normform(symbol) print norm + "\t" + symbol c.close() conn.close()
def hitsByLen(symbol, dbs):
    """Return the simstringMatches hits for `symbol` as an ordered list.

    The result is the list of (key, value) items of the mapping returned by
    simstringMatches(symbol, dbs), sorted with the matchSort comparison in
    reverse order (presumably longest hit first, going by the function
    name -- confirm against matchSort).
    """
    hitItems = simstringMatches(symbol, dbs).items()
    return sorted(hitItems, cmp=matchSort, reverse=True)
def _affixCount(counts, text):
    """Return counts[normform(text)], or 0 when that affix has no entry."""
    key = normform(text)
    try:
        return counts[key]
    except KeyError:
        return 0


def stripSymbol(symbol, symbDBs, orgDB, prefixDB, suffixDB, prefixCounts, suffixCounts):
    """Strip known prefixes/suffixes (and organism mentions) around a symbol.

    Parameters:
        symbol: the string to process.
        symbDBs: databases passed to hitsByLen to find known symbols.
        orgDB: database searched under the "-org-" label.
        prefixDB, suffixDB: databases handed to longest_prefix; the suffix
            search runs on the reversed string.
        prefixCounts, suffixCounts: mappings from normform(affix) to a count,
            used only when the prefix and suffix overlap.

    Returns:
        ((prefix, text, suffix, (begin, end)), label) where label is one of
        "NOAFFIX"  - no symbol hit and no affix found; text is `symbol`,
        "PRSUFF"   - no symbol hit; affixes stripped and something remains,
        "OVERLAP"  - no symbol hit; affixes cover everything, so they are
                     peeled off one by one based on their counts,
        "match"    - a symbol hit that is, or strips down to, the whole string,
        "contains" - a symbol hit with leftover context; the shortest
                     leftover version is returned.
    """
    symbolMatches = hitsByLen(symbol, symbDBs)
    orgMatches = list(simstringMatches(symbol, (("-org-", orgDB), )))

    if not symbolMatches:
        #No known symbol here, find the longest prefix/suffix and strip
        orgs = bestIntervals(orgMatches)
        repl, mapBack = formatORGContext(symbol, orgs)
        reversedRepl = repl[::-1]
        pref, _, prefix_matches, prefix_matches_subset = longest_prefix(
            repl, prefixDB)
        suff, _, suffix_matches, suffix_matches_subset = longest_prefix(
            reversedRepl, suffixDB)
        affixLen = len(normform(pref)) + len(normform(suff))

        if affixLen == 0:
            return ("", symbol, "", (0, len(symbol))), "NOAFFIX"

        if affixLen < len(normform(repl)):
            #Something remains, yay!
            B, E = len(pref), len(repl) - len(suff)
            # Translate offsets in the org-replaced string back to `symbol`.
            B, E = mapBack[B], mapBack[E - 1] + 1
            while B < E and symbol[B].isspace():
                B += 1
            while B < E - 1 and symbol[E - 1].isspace():
                E -= 1
            return (pref, symbol[B:E], suff[::-1], (B, E)), "PRSUFF"

        #We have an overlap, one way or another, revert to the original count-based algorithm
        # strips holds (begin, end, count) in left-to-right order: prefix
        # pieces first, then suffix pieces converted to forward offsets.
        strips = []
        for idx in prefix_matches_subset:
            b, eX, e = prefix_matches[idx]
            strips.append((b, e, _affixCount(prefixCounts, repl[b:eX])))
        for idx in suffix_matches_subset[::-1]:
            b, eX, e = suffix_matches[idx]
            count = _affixCount(suffixCounts, reversedRepl[b:eX])
            strips.append((len(repl) - e, len(repl) - b, count))

        B = 0
        E = len(repl)
        while strips:
            # Peel off whichever outermost piece has the higher count; ties
            # (including a single remaining piece) go to the right-hand side.
            if strips[0][2] > strips[-1][2]:
                newB = strips[0][1]
                newE = E
                strips.pop(0)
            else:
                newE = strips[-1][0] - 1
                newB = B
                strips.pop(-1)
            if newB >= newE:
                break  # stripping further would leave nothing
            B = newB
            E = newE
        # NOTE(review): the text returned is the whole `repl`; only the span
        # reflects B/E -- confirm this is intended.
        return ("", repl, "", (mapBack[B], mapBack[E - 1] + 1)), "OVERLAP"

    #We have a symbol match
    stripped_versions = []
    for (b, eX, e), symbType in symbolMatches:
        #In order, longest to shortest single GGP match in the string
        if normform(symbol[:b]) == "" and normform(symbol[e:]) == "":
            #Perfect match
            return ("", symbol[b:eX], "", (b, eX)), "match"

        #so these are all organism matches compatible with this symbol
        orgP, orgS = subsetMatches(orgMatches, (b, e))
        orgPMatches_resolved = bestIntervals(orgP)  #widest-spanning subset of orgs in prefix
        orgSMatches_resolved = bestIntervals(orgS)  #widest-spanning subset of orgs in suffix
        fullPrefix, _ = formatORGContext(symbol[:b], orgPMatches_resolved)
        fullSuffix, _ = formatORGContext(symbol[e:], orgSMatches_resolved)

        pref, pref_remain, _, _ = longest_prefix(fullPrefix, prefixDB)
        suff, suff_remain, _, _ = longest_prefix(fullSuffix[::-1], suffixDB)
        # The suffix search ran on reversed text; flip the pieces back.
        suff, suff_remain = suff[::-1], suff_remain[::-1]

        if normform(pref_remain) == "" and normform(suff_remain) == "":
            #stripped down to a perfect match
            return ("", symbol[b:eX], "", (b, eX)), "match"
        stripped_versions.append(
            (pref_remain, symbol[b:e], suff_remain,
             (b - len(pref), e + len(suff))))

    if not stripped_versions:
        # Every iteration above either returns or appends, so this should be
        # unreachable; raise explicitly so it is not stripped under -O.
        raise AssertionError("stripSymbol: no stripped version produced")

    # Shortest leftover wins; min() keeps the earliest entry on ties, like a
    # stable sort followed by taking the first element.
    pref, sym, suff, be = min(
        stripped_versions,
        key=lambda version: len(version[0]) + len(version[1]) + len(version[2]))
    return ("", (pref + sym + suff), "", be), "contains"
prefixCounts = readAffixFile("src_data/prefixes2.txt", False) suffixCounts = readAffixFile("src_data/suffixes2.txt", True) print >> sys.stderr, "INIT DONE" return symbDBs, orgDB if __name__ == "__main__": init() #opens the various DBs s = "activated human cyclin dependent kinases from p53-defficient NaDpH-enriched mouse embryo" print stripSymbolLIB(s) print stripSymbolLIB('dependent kinases from p53-defficient') print stripSymbolLIB('T(3)') s = 'T(3)' for (b, bX, e), dbName in simstringMatches(s, symbDBs).iteritems(): print dbName, ": ", s[b:bX] for (b, bX, e) in simstringMatches(s, (("-org-", orgDB), )): print "ORG : ", s[b:bX] sys.exit() #print simstringMatches("Rontgen Ray upon Cancer",symbDBs) #print stripSymbol("activated human cyclin dependent kinases from mouse",symbDBs,orgDB,prefixDB,suffixDB,prefixCounts,suffixCounts) #print stripSymbol("human sphingolipid activator proteins from mouse",symbDBs,orgDB,prefixDB,suffixDB,prefixCounts,suffixCounts) #sys.exit() OFFSET = 0 BATCH = 100 conn = dbconnect() c = conn.cursor() while True: