def all_process(s):
    """Print debug matches for ``s``, then normalise every string in the DB.

    First opens the databases via ``init()`` and prints, for the sample
    string ``s``, every match found in ``symbDBs`` and every organism match
    found in ``orgDB``.

    Then pages through ``tmp_new_unique_strings`` in batches, runs each string
    through ``stripSymbol``, removes the ``xxxorgxxx`` placeholder, and prints
    ``<normalised form>\\t<original string>`` to stdout.  If stripping fails,
    the original string is used instead and a ``??? -> '...'`` line is written
    to stderr.  If the stripped string normalises to the empty string, the
    normal form of the original string is printed instead.

    Output uses single-argument ``print(...)``, which behaves the same under
    the Python 2 print statement used by this file.

    :param s: sample string whose symbol/organism matches are printed first.
    """
    init()  # opens the various DBs

    for (b, bX, e), dbName in simstringMatches(s, symbDBs).items():
        print("%s %s %s" % (dbName, ": ", s[b:bX]))
    for (b, bX, e) in simstringMatches(s, (("-org-", orgDB), )):
        print("%s %s" % ("ORG : ", s[b:bX]))

    offset = 0
    batch = 100
    conn = dbconnect()
    try:
        c = conn.cursor()
        try:
            while True:
                # NOTE(review): LIMIT paging without ORDER BY relies on the
                # server returning rows in a stable order -- confirm, or add
                # an ORDER BY.
                c.execute(
                    "SELECT string FROM tmp_new_unique_strings LIMIT %d,%d" %
                    (offset, batch))
                rows = c.fetchall()
                offset += batch
                if not rows:
                    break
                for (symbol, ) in rows:
                    try:
                        # stripSymbol returns a 4-tuple
                        # (prefix, core, suffix, span) plus a form label;
                        # unpacking only three names always raised ValueError
                        # and silently disabled the stripping.
                        (_pref, symb, _suff, _span), _form = stripSymbol(
                            symbol, symbDBs, orgDB, prefixDB, suffixDB,
                            prefixCounts, suffixCounts)
                    except Exception:
                        # Best effort: fall back to the raw string, but let
                        # KeyboardInterrupt/SystemExit propagate.
                        symb = symbol
                        sys.stderr.write("??? -> '" + symb + "'\n")
                    symb = symb.replace("xxxorgxxx", "")
                    norm = normform(symb)
                    if norm == "":
                        norm = normform(symbol)
                    print(norm + "\t" + symbol)
        finally:
            c.close()
    finally:
        conn.close()
def readAffixFile(fName, reverse):
    """Read an affix count file into a ``{normalised affix: count}`` dict.

    Each non-blank line is expected to look like ``<count> <affix text>``
    (split on the first space).  ``-org-`` in the line is replaced by the
    ``xxxorgxxx`` placeholder before parsing.  Affixes that do not match
    ``atLeastOneCharRe`` or whose normal form is shorter than 3 characters are
    skipped.  Counts of affixes sharing the same normal form are summed.

    :param fName: path of the affix file to read.
    :param reverse: if true, keys are the reversed normal forms.
    :returns: dict mapping (possibly reversed) normal form to total count.
    :raises ValueError: if a non-blank line has no space separator or its
        count is not an integer.
    """
    res = {}  # normform -> count
    # ``with`` guarantees the handle is closed, also when a line fails to parse.
    with open(fName, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            line = line.replace("-org-", "xxxorgxxx")
            count, s = line.split(" ", 1)
            if not atLeastOneCharRe.match(s):
                continue
            s_n = normform(s)
            if len(s_n) < 3:
                continue
            if reverse:
                s_n = s_n[::-1]
            res[s_n] = res.get(s_n, 0) + int(count)
    return res
def stripSymbol(symbol, symbDBs, orgDB, prefixDB, suffixDB, prefixCounts,
                suffixCounts):
    """Strip known prefixes/suffixes around the core of ``symbol``.

    Returns ``((prefix, core, suffix, (begin, end)), form)`` where ``form``
    names the branch that produced the result:

    * ``"NOAFFIX"`` -- no symbol match and no prefix/suffix found; ``core`` is
      the whole ``symbol``.
    * ``"PRSUFF"``  -- no symbol match; a prefix and/or suffix was found and
      something remains between them (surrounding whitespace trimmed).
    * ``"OVERLAP"`` -- no symbol match and the affixes cover the whole string;
      affixes are peeled off by count, and ``core`` is the whole
      organism-substituted string with the surviving span in ``(begin, end)``.
    * ``"match"``   -- a symbol match covers the string, either directly or
      once the surrounding affixes are stripped.
    * ``"contains"`` -- a symbol match plus leftover context; the shortest
      candidate is returned.

    :param symbol: the string to strip.
    :param symbDBs: symbol databases passed to ``hitsByLen``.
    :param orgDB: organism database passed to ``simstringMatches``.
    :param prefixDB: database passed to ``longest_prefix`` for prefixes.
    :param suffixDB: database passed to ``longest_prefix`` for (reversed)
        suffixes.
    :param prefixCounts: mapping from normalised prefix to count.
    :param suffixCounts: mapping from normalised (reversed) suffix to count.
    :raises AssertionError: if a symbol match yields no candidate at all
        (not expected to happen).
    """
    symbolMatches = hitsByLen(symbol, symbDBs)
    orgMatches = list(simstringMatches(symbol, (("-org-", orgDB), )))

    if not symbolMatches:
        # No known symbol here, find the longest prefix/suffix and strip.
        orgs = bestIntervals(orgMatches)
        # presumably mapBack maps positions in repl back to positions in
        # symbol -- verify against formatORGContext
        repl, mapBack = formatORGContext(symbol, orgs)
        reversed_repl = repl[::-1]
        pref, pref_remain, prefix_matches, prefix_matches_subset = \
            longest_prefix(repl, prefixDB)
        suff, suff_remain, suffix_matches, suffix_matches_subset = \
            longest_prefix(reversed_repl, suffixDB)

        affix_len = len(normform(pref)) + len(normform(suff))
        if affix_len == 0:
            return ("", symbol, "", (0, len(symbol))), "NOAFFIX"

        if affix_len < len(normform(repl)):
            # Something remains between prefix and suffix.
            B, E = len(pref), len(repl) - len(suff)
            B, E = mapBack[B], mapBack[E - 1] + 1
            while B < E and symbol[B].isspace():
                B += 1
            while B < E - 1 and symbol[E - 1].isspace():
                E -= 1
            return (pref, symbol[B:E], suff[::-1], (B, E)), "PRSUFF"

        # We have an overlap, one way or another; revert to the original
        # count-based algorithm.  ``strips`` holds (begin, end, count)
        # triples: prefixes first, then suffixes.
        strips = []
        for idx in prefix_matches_subset:
            b, eX, e = prefix_matches[idx]
            try:
                count = prefixCounts[normform(repl[b:eX])]
            except KeyError:  # unknown affix counts as zero
                count = 0
            strips.append((b, e, count))
        for idx in suffix_matches_subset[::-1]:
            b, eX, e = suffix_matches[idx]
            try:
                count = suffixCounts[normform(reversed_repl[b:eX])]
            except KeyError:  # unknown affix counts as zero
                count = 0
            strips.append((len(repl) - e, len(repl) - b, count))

        B, E = 0, len(repl)
        while strips:
            # Peel from whichever end has the higher count (ties go to the
            # back) until the remaining span would become empty.
            if strips[0][2] > strips[-1][2]:
                newB, newE = strips.pop(0)[1], E
            else:
                newB, newE = B, strips.pop(-1)[0] - 1
            if newB >= newE:
                break
            B, E = newB, newE
        return ("", repl, "", (mapBack[B], mapBack[E - 1] + 1)), "OVERLAP"

    # We have a symbol match.
    stripped_versions = []
    # In order, longest to shortest single GGP match in the string.
    for (b, eX, e), symbType in symbolMatches:
        if normform(symbol[:b]) == "" and normform(symbol[e:]) == "":
            # Perfect match
            return ("", symbol[b:eX], "", (b, eX)), "match"

        # These are all organism matches compatible with this symbol.
        orgP, orgS = subsetMatches(orgMatches, (b, e))
        # widest-spanning subset of orgs in prefix / suffix
        orgPMatches_resolved = bestIntervals(orgP)
        orgSMatches_resolved = bestIntervals(orgS)
        fullPrefix, _ = formatORGContext(symbol[:b], orgPMatches_resolved)
        fullSuffix, _ = formatORGContext(symbol[e:], orgSMatches_resolved)

        pref, pref_remain, _, _ = longest_prefix(fullPrefix, prefixDB)
        # Suffixes are matched on the reversed text, then flipped back.
        suff, suff_remain, _, _ = longest_prefix(fullSuffix[::-1], suffixDB)
        suff, suff_remain = suff[::-1], suff_remain[::-1]

        if normform(pref_remain) == "" and normform(suff_remain) == "":
            # stripped down to a perfect match
            return ("", symbol[b:eX], "", (b, eX)), "match"

        # NOTE(review): the span is widened by the length of the *stripped*
        # affixes rather than of what remains -- confirm this is intended.
        stripped_versions.append(
            (pref_remain, symbol[b:e], suff_remain,
             (b - len(pref), e + len(suff))))

    if not stripped_versions:
        # Explicit raise so the check survives ``python -O``.
        raise AssertionError("symbol match produced no stripped candidate")

    # Shortest candidate wins; min() keeps the first one on ties, exactly like
    # a stable sort followed by taking element 0.
    pref, sym, suff, be = min(
        stripped_versions,
        key=lambda v: len(v[0]) + len(v[1]) + len(v[2]))
    return ("", (pref + sym + suff), "", be), "contains"
(OFFSET, BATCH)) rows = c.fetchall() OFFSET += BATCH if len(rows) == 0: break for (symbol, ) in rows: try: (pref, symb, suff), form = stripSymbol(symbol, symbDBs, orgDB, prefixDB, suffixDB, prefixCounts, suffixCounts) except: symb = symbol print >> sys.stderr, "??? -> '" + symb + "'" symb = symb.replace("xxxorgxxx", "") norm = normform(symb) if norm == "": norm = normform(symbol) print norm + "\t" + symbol c.close() conn.close() def all_process(s): init() #opens the various DBs for (b, bX, e), dbName in simstringMatches(s, symbDBs).iteritems(): print dbName, ": ", s[b:bX] for (b, bX, e) in simstringMatches(s, (("-org-", orgDB), )): print "ORG : ", s[b:bX]