def readIn(filename, fileVersion, verbose=True):
    """Load a dataset file that has already been converted to the project format.

    Args:
        filename: Path handed to ``readMyFormat``.
        fileVersion: Accepted for call compatibility but not used; the call
            that forwarded it to ``readMyFormat`` is disabled.
        verbose: When true, a progress message is printed; the flag is also
            forwarded to ``readMyFormat`` as its second positional argument.

    Returns:
        Whatever ``readMyFormat`` returns for ``filename``.
    """
    if verbose:
        print("Reading information of file already transformed into MY FORMAT! Filename: ", filename)

    # Per the original author's note the loaded data is already sorted, so no
    # extra sorting happens here.
    # Disabled variant: readMyFormat(filename, fileVersion, verbose)
    return readMyFormat(filename, verbose)
def createFV(filename, label, minNumberOfQueries, maxNumberOfQueries):
    """Build the per-user dictionary for one dataset file.

    The file is read with ``readMyFormat``, run through ``preProcessData``,
    filtered by ``keepUsersInsideLimiteOfQueires`` and finally turned into a
    dictionary by ``createDictOfUsers``.

    Relies on the module-level names ``formatVersion`` and
    ``removeStopWords``; neither is defined inside this function.

    Args:
        filename: Path forwarded to ``readMyFormat``.
        label: Forwarded unchanged to ``createDictOfUsers``.
        minNumberOfQueries: Lower bound forwarded to
            ``keepUsersInsideLimiteOfQueires``.
        maxNumberOfQueries: Upper bound forwarded to
            ``keepUsersInsideLimiteOfQueires``.

    Returns:
        The object returned by ``createDictOfUsers``; its length is printed
        before returning.
    """
    # A single pre-formatted argument prints the same text under Python 2 and
    # Python 3 (the old multi-item print statement is a SyntaxError on 3).
    print("min =  %s  max =  %s" % (minNumberOfQueries, maxNumberOfQueries))

    data = readMyFormat(filename, formatVersion)
    # Original author's note: this step sorts the data by user and date.
    data = preProcessData(data, removeStopWords)
    data = keepUsersInsideLimiteOfQueires(data, minNumberOfQueries, maxNumberOfQueries)

    userDict = createDictOfUsers(data, label)

    print(len(userDict))

    return userDict
Exemple #3
0
# Root directory of every dataset; all other dataset paths are built from it.
PATH_TO_DATASETS = "/data/palotti/logAnalysisDataSets/"

# AOL query-log files, derived from the root so that relocating the datasets
# only requires changing PATH_TO_DATASETS.
AOLH_DATASET = PATH_TO_DATASETS + "aolData/AOL-user-ct-collection/healthq.fixed.gz"
AOLNH_DATASET = PATH_TO_DATASETS + "aolData/AOL-user-ct-collection/nhealthq.fixed.gz"

# Forwarded to calculateStatistics() by the script entry point.
usingScoop = False

# Per-dataset switches. useGM guards the loading of gm/gm.gz in the script
# entry point; the others presumably guard their namesake datasets in the
# same way (their use is not visible here - confirm).
useHON = True
useGM = True
useTRIP = True
useAOLH = True
useAOLNH = False
useLAY = False
useEXP = False

# NOTE(review): "__main__A" is not the value __name__ takes when the file is
# run as a script, so this single-dataset run stays disabled. Presumably the
# trailing "A" is a manual on/off switch - confirm before removing.
if __name__ == "__main__A":
    data = readMyFormat(PATH_TO_DATASETS + "trip/trip1.gz")
    # One [data, label] pair, in the shape calculateStatistics receives below.
    datasets = [[data, "TEST"]]

    calculateStatistics(datasets, usingScoop)

if __name__ == "__main__":
    
    datasets = []

    #GoldMiner
    if useGM:
        gm = readMyFormat(PATH_TO_DATASETS + "gm/gm.gz")
        datasets.append([gm, "GM"])
   
    #HON
    if useHON:
def calculateDCohen(values, idx1, idx2):
    """Call ``DCohen`` on the summary statistics of two entries of ``values``.

    Args:
        values: Indexable collection whose items expose ``mean`` and ``std``
            attributes.
        idx1: Index (or key) of the first entry.
        idx2: Index (or key) of the second entry.

    Returns:
        The result of ``DCohen(mean1, mean2, std1, std2)``.
    """
    first, second = values[idx1], values[idx2]
    return DCohen(first.mean, second.mean, first.std, second.std)

# Collects one analyseData() result per group of input files.
values = []
# Earlier variant, kept for reference: analyse every command-line file
# separately and print the sorted counts of each one.
#for file in sys.argv[1:]:
#    data = readMyFormat(file, "v5")
#    npCombo, countingCombo = analyseData(data)
#    for k, c in sorted(countingCombo.items(), key= lambda x:x[0]  ):
#       print "%f,%d" % (k, c)
#
#    values.append(npCombo)

# First group: the files given as command-line arguments 1 and 2, merged
# with += into a single dataset.
# NOTE(review): `file` shadows the Python 2 builtin of the same name.
file = sys.argv[1]
data = readMyFormat(file, "v5")
file = sys.argv[2]
data += readMyFormat(file, "v5")

# Second group: command-line arguments 3 and 4, merged the same way.
file = sys.argv[3]
data2 = readMyFormat(file, "v5")
file = sys.argv[4]
data2 += readMyFormat(file, "v5")


# values[0] holds the first element returned for the first group.
npCombo, countingCombo = analyseData(data)
values.append(npCombo)

# values[1] holds the first element returned for the second group.
# countingCombo is rebound here, so the first group's counts are discarded.
npCombo2, countingCombo = analyseData(data2)
values.append(npCombo2)
Exemple #5
0
    Implementation decisions:
        1) If the combo value for a CHV entry is -1, it is replaced by the mean combo value of all entries.
        2) When no CHV entry is found in the query, the combo value for that entry is assumed to be the mean combo value (around 0.28).
"""

# When True, `futures` is imported from the scoop package.
usingScoop = True
if usingScoop:
    from scoop import futures

# Positional command-line arguments, in order.
chvfile = sys.argv[1]
v4datasetFile = sys.argv[2]
outfilename = sys.argv[3]

popularNames = []

# NOTE(review): the variable is named v4datasetFile but the file is read with
# the "v5" format tag - confirm which one is intended.
data = readMyFormat(v4datasetFile, "v5")
queries = []

from collections import defaultdict
# Counter whose missing keys start at 0.
popCounter = defaultdict(int)

class CHV(object):
    """Plain record describing one CHV entry.

    Attributes:
        text: The ``text`` constructor argument, stored unchanged.
        isCHV: The ``isCHV`` constructor argument, stored unchanged.
        isUMLS: The ``isUMLS`` constructor argument, stored unchanged.
        misspelled: The ``misspelled`` constructor argument, stored unchanged.
        comboScore: The ``combo`` constructor argument (note the different
            attribute name).
    """

    def __init__(self, text, isCHV, isUMLS, misspelled, combo):
        self.text = text
        # The three flag arguments are stored as given, without conversion.
        self.isCHV, self.isUMLS, self.misspelled = isCHV, isUMLS, misspelled
        # Exposed as `comboScore`, not `combo`.
        self.comboScore = combo

for member in data:        
    query = tokenize(member.keywords)