def keydb_add(freqDict, dbName='keydb.data', dbFolder='./keydb/'):
    '''
    Merge freqDict into the key-frequency database and persist it.

    Checks the current directory first and then dbFolder for the database
    file.  If found, the stored dict is loaded and freqDict is merged into
    it; if not found, a new empty database is started and written to dbName
    in the current directory.

    Args:
        freqDict: the processed frequency dictionary to merge in
        dbName:   the physical storage file for the database
        dbFolder: the folder where the db may alternatively be placed
    Returns:
        dictionary of the keydb after the merge
    '''
    import os.path, cPickle as pickle
    dbPath = dbName
    # Pickle streams are binary data: open with 'rb'/'wb' (text mode corrupts
    # them on Windows), and use `with` so handles are closed instead of leaked.
    if os.path.exists(dbName):
        with open(dbName, 'rb') as f:
            db = pickle.load(f)
    elif os.path.exists(dbFolder + dbName):
        dbPath = dbFolder + dbName
        with open(dbPath, 'rb') as f:
            db = pickle.load(f)
    else:
        # no database yet: start from scratch; it will be written to dbPath
        db = {}
    # dict_add (defined elsewhere in this module) merges the frequency dicts
    db = dict_add(db, freqDict)
    with open(dbPath, 'wb') as f:
        pickle.dump(db, f)
    # print 'added to db: ',dbPath
    return db
def valdb_add(dbVal, dbName='Valdb.data'):
    '''
    Merge dbVal into the value database stored in dbName and persist it.

    Args:
        dbVal:  dict of new values to merge into the database
        dbName: the physical storage file for the database
    Returns:
        dictionary of the value db after the merge
    '''
    import os.path, pickle
    # Pickle streams are binary data: open with 'rb'/'wb' (text mode corrupts
    # them on Windows), and use `with` so handles are closed instead of leaked.
    if os.path.exists(dbName):
        with open(dbName, 'rb') as f:
            db = pickle.load(f)
    else:
        db = {}
    # dict_add (defined elsewhere in this module) merges the two dicts
    db = dict_add(db, dbVal)
    with open(dbName, 'wb') as f:
        pickle.dump(db, f)
    # print 'added to db: ',dbName
    return db
def valdb_add(dbVal, dbName="Valdb.data"):
    # add dbVal to the database dbName
    # NOTE(review): this is a duplicate definition — an identical valdb_add
    # appears earlier in this file; since this one is defined later, it is the
    # definition that takes effect at runtime.  One of the two should be removed.
    # NOTE(review): the pickle files are opened in text mode ('r'/'w') and the
    # handles are never closed; pickle data should use 'rb'/'wb' — confirm and fix.
    import os.path, pickle
    # load the existing database if present, otherwise start from an empty dict
    if os.path.exists(dbName):
        db = pickle.load(open(dbName, "r"))
    else:
        db = {}
    # merge the new values into the database (dict_add is defined elsewhere)
    db = dict_add(db, dbVal)
    # persist the merged database back to disk
    pickle.dump(db, open(dbName, "w"))
    # print 'added to db: ',dbName
    return db
def keydb_add_result(result, dbName='keydb.data'):
    '''
    Accumulate key frequencies from every record in result and persist them.

    Args:
        result: dict whose values are the previous/stored result records,
                one per cancer type; each record is processed by keydb_core.
        dbName: database file to persist the accumulated frequencies to.
                (Bug fix: this parameter was previously accepted but ignored —
                the hard-coded default file was always used.)
    Returns:
        the db dictionary for all key frequencies
    '''
    db = {}
    # the result stores all keys for different cancers; fold each record's
    # key frequencies into a single accumulator.
    for record in result.values():
        db = dict_add(db, keydb_core(record))
    # forward dbName so callers can target a non-default database file
    db = keydb_add(db, dbName)
    return db
def getScore(key, value, valdb=None, add=True):
    """
    This function calculates the value score

    # score_type : [0,1] among three types => (1) num, (2) text, (3) num_text
    #   calculate proportion of a particular type of v with respect to the total frequency
    #   The larger, the higher confidence
    #
    # score_length : only apply to num_text type (let score of other types to be 1)
    #   calculate proportaion of long or short text with respect to total number of num_text type
    #   The larger, the higher confidence
    #
    # score_wordcount : only apply to text type
    #   calculate absolute value of (word count for v - med of c and then divided by std.dev of c)
    #   where c is a vector of wordcount
    #   The smaller, the higher confidence
    #
    # score_token : only apply to text type (Note: the value can be negative)
    #   calculate the difference between frequency of particular token and equal portion
    #   (1/total number of tokens) where token is value
    #   The larger, the higher confidence

    Sample use:
        input : key = 'tumor grade', value = '3'
        output: 'type 1.0 length 1 wordcount NA token NA'
    """
    # NOTE(review): Python 2 only — relies on dict.iteritems(), list-returning
    # dict.keys()/.values(), and integer '/' division (median index below).
    # NOTE(review): getCount is expected to return a frequency dict containing
    # the keys 'num', 'text', 'num_text', 'num_text_short', 'num_text_long' and
    # 'total' (all are indexed below) — confirm against getCount's definition.

    # load the "Valdb.data" database if the database is not specified
    if valdb is None:
        valdb = keydb_marginal_load("Valdb.data")

    # add new data to the database
    # default is to add a new value to the database
    if add == True:
        dictInput = {key: [value]}
        valdb = valdb_add(dictInput)

    score = {}
    dbVal = {}                 # accumulated frequency counts for all values under `key`
    dbVal_wordcount = []       # word counts of the text-type values under `key`

    # get frequency for current value
    countdict = getCount(value)

    # get frequency for all value in valdb
    # NOTE(review): if `key` is absent from valdb (possible when add=False),
    # dbVal stays empty and the dbVal["total"] lookup below raises KeyError.
    for k, v in valdb.iteritems():
        if k == key:
            for v2 in valdb[k]:
                countdict_current = getCount(v2)
                dbVal = dict_add(countdict_current, dbVal)
                if countdict_current["text"] == 1:
                    dbVal_wordcount.append(len(v2.split(" ")))

    # Type feature: calculate proportion of a particular type of v with respect
    # to the total frequency.  countdict's type flags are 0/1, so exactly one of
    # the three products selects the matching dbVal count.
    score["Type"] = float(
        (
            countdict["num"] * dbVal["num"]
            + countdict["num_text"] * dbVal["num_text"]
            + countdict["text"] * dbVal["text"]
        )
    ) / float(dbVal["total"])

    # Length feature: only apply to num_text type (let score of other types be 1)
    # calculate proportaion of long or short text with respect to total number of num_text type
    if countdict["num_text"] == 1:
        score["Length"] = float(
            (
                countdict["num_text_long"] * dbVal["num_text_long"]
                + countdict["num_text_short"] * dbVal["num_text_short"]
            )
        ) / float(dbVal["num_text"])
    else:
        score["Length"] = 1

    # Wordcount feature and Token feature : only apply to text type
    if countdict["text"] == 1:
        # Wordcount feature
        c = np.array(dbVal_wordcount)  # c is a vector of word count
        dbVal_wordcount.sort()
        # Python 2 integer division selects the upper-middle element as the median
        med = dbVal_wordcount[len(dbVal_wordcount) / 2]
        # If std.dev(c) which is the denominator is not 0, calculate the score
        # score = absolute value of (word count for value - med, divided by std.dev of c)
        if c.std() != 0:
            score["Wordcount"] = abs(float((len(value.split(" ")) - med)) / float(c.std()))
        # If std.dev(c) is 0:
        #   check if word count of value is equal to med then set score to 0 (good case)
        #   otherwise, set score to be 100 (bad case)
        else:
            # print 'value',value
            if len(value.split(" ")) == med:
                score["Wordcount"] = 0
            else:
                score["Wordcount"] = 100

        # Token feature
        # `label` lists the bookkeeping keys of dbVal; everything else is a token
        label = ["total", "num", "num_text", "text", "num_text_short", "num_text_long"]
        # token of value
        token = list(set(dbVal.keys()) - set(label))
        token_combine = {}
        # combine synonym and antonym with original word
        antonym = defaultdict(list)
        remove_item = {}  # maps absorbed token -> token it was merged into
        for k in token:
            # if k has already been included as synonym or antonym of another
            # token, k does not get processed again
            if k in remove_item.keys():
                continue
            flag = 0
            # collect synonym and antonym in syn and an, respectively
            # NOTE(review): Syn_Ant appears to return (synonym-iterable,
            # antonym-dict) — confirm against its definition.
            syn, an = Syn_Ant(k)
            if an != {}:
                antonym[an.keys()[0]] = an.values()[0]
            # If any item in token is synonym or antonym of k, combine frequency
            # and remove that word from the token list.
            # Collect the new frequency in a token_combine dictionary
            for s in syn:
                if str(s) in token and str(s) != k:
                    remove_item[s] = k
                    token_combine[k] = dbVal[k] + dbVal[str(s)]
                    flag = 1
            # NOTE(review): this loop variable shadows the function parameter
            # `key` — after a non-empty antonym dict, the original `key`
            # argument is clobbered for the rest of the call.  Rename on fix.
            for key, val in an.iteritems():
                if str(val) in token:
                    remove_item[val] = k
                    token_combine[k] = dbVal[k] + dbVal[str(val)]
                    flag = 1
            # If there is no synonym or antonym of k contained in the token
            # list, collect the frequency from dbVal
            if flag == 0:
                token_combine[k] = dbVal[k]

        # Calculate the score for each element in the original token list
        for k in list(set(dbVal.keys()) - set(label)):
            # making sure k is always equal to v (only score the current value)
            if k != value:
                continue
            if token_combine.get(k) is not None:
                num_token = token_combine[k]
            else:
                # k was absorbed into another token; use the combined count
                num_token = token_combine[remove_item[k]]
            eq_portion = float(1) / float(len(token_combine))
            percentage = float(num_token) / float(dbVal["total"])
            score["Token"] = float(percentage - eq_portion)
    # if value is not text type, Wordcount and Token features are "NA"
    else:
        score["Wordcount"] = "NA"
        score["Token"] = "NA"

    # for a new data format (to be combined with Abstractor)
    score_type = score["Type"]
    score_length = score["Length"]
    score_wordcount = score["Wordcount"]
    score_token = score["Token"]
    # return {'type':score_type,'length':score_length,'wordcount':score_wordcount,'token':score_token}
    return " ".join(
        [
            str(item)
            for item in ["type", score_type, "length", score_length, "wordcount", score_wordcount, "token", score_token]
        ]
    )