# Example #1
# 0
def keydb_add(freqDict,dbName='keydb.data',dbFolder='./keydb/'):
    '''
    Merge freqDict into the pickled key database on disk.

    This function checks the current directory first and then dbFolder for
    the database specified.  If found, freqDict is merged into it (via
    dict_add) and the result is written back to the same path.  If not
    found, an empty dict is used and the result is written to dbName in
    the current directory.

    Args:
        freqDict: the processed frequency dictionary to merge in
        dbName: file name of the pickled database
        dbFolder: the folder searched second for the database file
    Returns:
        the merged keydb dictionary
    '''
    import os.path,cPickle as pickle
    dbPath = dbName
    if os.path.exists(dbName):
        # pickle streams are binary: open 'rb'/'wb' (text mode corrupts data
        # on Windows); 'with' guarantees the handle is closed.
        with open(dbName,'rb') as f:
            db = pickle.load(f)
    elif os.path.exists(dbFolder+dbName):
        with open(dbFolder+dbName,'rb') as f:
            db = pickle.load(f)
        dbPath = dbFolder + dbName
    else:
        db = {}
    db = dict_add(db,freqDict)
    with open(dbPath,'wb') as f:
        pickle.dump(db,f)
    #print 'added to db: ',dbPath
    return db
# Example #2
# 0
def valdb_add(dbVal,dbName = 'Valdb.data'):
    '''
    Merge dbVal into the pickled value database stored in dbName.

    Args:
        dbVal: dict of new values to merge in via dict_add
        dbName: path of the pickled database file; created if missing
    Returns:
        the merged database dictionary
    '''
    import os.path,pickle
    if os.path.exists(dbName):
        # binary mode is required for pickle data; 'with' closes the handle
        with open(dbName,'rb') as f:
            db = pickle.load(f)
    else:
        db = {}

    db = dict_add(db,dbVal)

    with open(dbName,'wb') as f:
        pickle.dump(db,f)
#    print 'added to db: ',dbName
    return db
def valdb_add(dbVal, dbName="Valdb.data"):
    """
    Merge dbVal into the pickled value database stored in dbName.

    NOTE(review): this re-definition shadows the valdb_add defined earlier
    in the file; only this one is effective at import time.

    Args:
        dbVal: dict of new values to merge in via dict_add
        dbName: path of the pickled database file; created if missing
    Returns:
        the merged database dictionary
    """
    import os.path, pickle

    if os.path.exists(dbName):
        # binary mode is required for pickle data; 'with' closes the handle
        with open(dbName, "rb") as f:
            db = pickle.load(f)
    else:
        db = {}

    db = dict_add(db, dbVal)

    with open(dbName, "wb") as f:
        pickle.dump(db, f)
    #    print 'added to db: ',dbName
    return db
# Example #4
# 0
def keydb_add_result(result,dbName='keydb.data'):
    '''
    Merge the key frequencies of every record in result into the key database.

    Args:
        result: mapping (e.g. cancer type -> record); each record's key
            frequencies are extracted with keydb_core
        dbName: file name of the pickled key database to update
    Returns:
        the db dictionary for all key frequencies
    '''
    db = {}
    # the result stores all keys for different cancers
    for record in result.values():
        db = dict_add(db,keydb_core(record))
    # bug fix: dbName was previously ignored (keydb_add was called with its
    # own default); forward it so the caller-specified database is updated
    db = keydb_add(db,dbName)
    return db
def getScore(key, value, valdb=None, add=True):
    """
    Score how well *value* fits the values historically seen for *key*.

    Four sub-scores are computed from the value database:

    - Type : proportion of stored values sharing value's type among the
      three types (num, text, num_text), relative to the total frequency.
      The larger, the higher confidence.
    - Length : only applies to num_text values (other types score 1);
      proportion of long or short num_text entries matching value's length
      class, relative to the number of num_text entries.  Larger is better.
    - Wordcount : only applies to text values; absolute value of
      (word count of value - median word count) / std.dev of the stored
      word counts.  The smaller, the higher confidence.
    - Token : only applies to text values (can be negative); difference
      between the (synonym/antonym-combined) frequency of value's token and
      the equal portion 1/len(tokens).  Larger is better.

    Args:
        key: the key whose historical values are consulted
        value: the candidate value to score
        valdb: value database dict; loaded from "Valdb.data" when None
        add: when True (default), {key: [value]} is merged into the
            database before scoring

    Sample use:
        input : key = 'tumor grade', value = '3'
        output: 'type 1.0 length 1 wordcount NA token NA'
    """
    # load the "Valdb.data" database if the database is not specified
    if valdb is None:
        valdb = keydb_marginal_load("Valdb.data")

    # add new data to the database
    # default is to add a new value to the database
    # NOTE(review): `add == True` matches only the exact value True, not any
    # truthy value — presumably `if add:` was intended; confirm with callers.
    if add == True:
        dictInput = {key: [value]}
        valdb = valdb_add(dictInput)

    score = {}
    dbVal = {}  # combined frequency counts over all stored values for key
    dbVal_wordcount = []  # word counts of the stored text-type values

    # get frequency for current value
    countdict = getCount(value)

    # get frequency for all value in valdb
    # NOTE: iteritems() is Python 2 only
    for k, v in valdb.iteritems():
        if k == key:
            for v2 in valdb[k]:
                countdict_current = getCount(v2)
                dbVal = dict_add(countdict_current, dbVal)
                if countdict_current["text"] == 1:
                    dbVal_wordcount.append(len(v2.split(" ")))

    # Type feature: calculate proportion of a particular type of v with respect to the total frequency
    # (countdict is a one-hot indicator over the types, so exactly one term survives)
    score["Type"] = float(
        (
            countdict["num"] * dbVal["num"]
            + countdict["num_text"] * dbVal["num_text"]
            + countdict["text"] * dbVal["text"]
        )
    ) / float(dbVal["total"])

    # Length feature: only apply to num_text type (let score of other types to be 1)
    # calculate proportaion of long or short text with respect to total number of num_text type
    if countdict["num_text"] == 1:
        score["Length"] = float(
            (
                countdict["num_text_long"] * dbVal["num_text_long"]
                + countdict["num_text_short"] * dbVal["num_text_short"]
            )
        ) / float(dbVal["num_text"])
    else:
        score["Length"] = 1

    # Wordcount feature and Token feature : only apply to text type
    if countdict["text"] == 1:
        # Wordcount feature

        c = np.array(dbVal_wordcount)  # c is a vector of word count
        dbVal_wordcount.sort()
        # NOTE(review): `/` here is Python 2 integer division; under Python 3
        # this index would be a float and raise TypeError — use // if porting.
        med = dbVal_wordcount[len(dbVal_wordcount) / 2]  # median of a vector containing word count
        # If std.dev(c) which is the denominator is not 0, calculate the score
        # score = absolute value of (word count for value - med and then divided by std.dev of c)
        if c.std() != 0:
            score["Wordcount"] = abs(float((len(value.split(" ")) - med)) / float(c.std()))
        # If std.dev(c) is 0:
        #   check if word count of value is equal to med then set score to 0 (good case)
        #   otherwise, set score to be 100 (bad case)
        else:
            #            print 'value',value
            if len(value.split(" ")) == med:
                score["Wordcount"] = 0
            else:
                score["Wordcount"] = 100  # sentinel for "all stored counts equal, value differs"

        # Token feature
        label = ["total", "num", "num_text", "text", "num_text_short", "num_text_long"]
        # token of value: every dbVal key that is not one of the bookkeeping labels
        token = list(set(dbVal.keys()) - set(label))
        token_combine = {}  # combine synonym and antonym with original word
        antonym = defaultdict(list)
        remove_item = {}  # maps an absorbed token -> the token it was merged into

        for k in token:
            # if k has already included as synonym or antonym of other token, k is not get processed
            if k in remove_item.keys():
                continue
            flag = 0
            # collect synonym and antonym in syn and an, respectively
            syn, an = Syn_Ant(k)
            if an != {}:
                # NOTE: dict.keys()[0]/.values()[0] is Python 2 only
                antonym[an.keys()[0]] = an.values()[0]
            # If any item in token is synonym or antonym of k, combine frequency and remove that word from the token list.
            # Collect the new frequency in a token_combine dictionary
            for s in syn:
                if str(s) in token and str(s) != k:
                    remove_item[s] = k
                    token_combine[k] = dbVal[k] + dbVal[str(s)]
                    flag = 1

            # NOTE(review): the loop variable `key` shadows the function
            # parameter `key`; the parameter is not read again afterwards,
            # but rename this if the code is ever extended.
            for key, val in an.iteritems():
                if str(val) in token:
                    remove_item[val] = k
                    token_combine[k] = dbVal[k] + dbVal[str(val)]
                    flag = 1

            # If there is no synonym or antonym of k contained in token list, collect the frequency from dbVal
            if flag == 0:
                token_combine[k] = dbVal[k]

        # Calculate the score for each element in the original token list
        for k in list(set(dbVal.keys()) - set(label)):
            # making sure k is always equal to v
            if k != value:
                continue
            if token_combine.get(k) is not None:
                num_token = token_combine[k]
            else:
                # k was absorbed into another token; use the combined count
                num_token = token_combine[remove_item[k]]

            eq_portion = float(1) / float(len(token_combine))
            percentage = float(num_token) / float(dbVal["total"])
            score["Token"] = float(percentage - eq_portion)

    # if value is not text type, Wordcount and Token features are "NA"
    else:
        score["Wordcount"] = "NA"
        score["Token"] = "NA"

    # for a new data format (to be combined with Abstractor)
    score_type = score["Type"]
    score_length = score["Length"]
    score_wordcount = score["Wordcount"]
    score_token = score["Token"]

    # return {'type':score_type,'length':score_length,'wordcount':score_wordcount,'token':score_token}
    return " ".join(
        [
            str(item)
            for item in ["type", score_type, "length", score_length, "wordcount", score_wordcount, "token", score_token]
        ]
    )