def main(argv=None): if argv is None: argv = sys.argv if len(argv) < 3: print "arguments: <input dir> <output>" return 1 indir = argv[1]+"/" activityFactorFiles=[] personFactorFiles=[] friendsFiles = [] outdir = argv[2]+"/" random.seed(SEED) for file in os.listdir(indir): if file.endswith("activityFactors.txt"): activityFactorFiles.append(indir+file) if file.endswith("personFactors.txt"): personFactorFiles.append(indir+file) if file.startswith("m0friendList"): friendsFiles.append(indir+file) # read precomputed counts from files (personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors, givenNames, ts, postHisto) = readfactors.load(personFactorFiles, activityFactorFiles, friendsFiles) # find person parameters print "find parameter bindings for Persons" selectedPersonParams = {} for i in range(1, 15): factors = readfactors.getFactorsForQuery(i, personFactors) selectedPersonParams[i] = discoverparams.generate(factors) # Queries 13 and 14 take two person parameters each. Generate pairs secondPerson = {} for i in [13, 14]: secondPerson[i] = [] for person in selectedPersonParams[i]: j = 0 while True: j = random.randint(0, len(selectedPersonParams[i])-1) if selectedPersonParams[i][j] != person: break secondPerson[i].append(selectedPersonParams[i][j]) # find country parameters for Query 3 and 11 print "find parameter bindings for Countries" selectedCountryParams = {} for i in [3, 11]: factors = readfactors.getCountryFactorsForQuery(i, countryFactors) selectedCountryParams[i] = discoverparams.generate(factors, portion=0.1) # make sure there are as many country parameters as person parameters oldlen = len(selectedCountryParams[i]) newlen = len(selectedPersonParams[i]) selectedCountryParams[i].extend([selectedCountryParams[i][random.randint(0, oldlen-1)] for j in range(newlen-oldlen)]) # Query 3 needs two countries as parameters. Generate the second one: secondCountry = [] for c in selectedCountryParams[3]: i=0 while True: i = random.randint(0, len(selectedCountryParams[3])-1) if selectedCountryParams[3][i]!=c: break secondCountry.append(selectedCountryParams[3][i]) #find tag parameters for Query 6 #print "find parameter bindings for Tags" # old tag selection #selectedTagParams = {} #for i in [6]: # selectedTagParams[i] = discoverparams.generate(tagFactors, portion=0.1) # # make sure there are as many tag paramters as person parameters # oldlen = len(selectedTagParams[i]) # newlen = len(selectedPersonParams[i]) # selectedTagParams[i].extend([selectedTagParams[i][random.randint(0, oldlen-1)] for j in range(newlen-oldlen)]) #print "find parameter bindings for Tags" (leftTagFactors, rightTagFactors) = discoverparams.divideFactors(tagFactors, 0.7) leftSize = len(leftTagFactors) rightSize = len(rightTagFactors) leftPortion = 0.1*(leftSize+rightSize) / (2.0*leftSize) rightPortion = 0.1*(leftSize+rightSize) / (2.0*rightSize) selectedTagParams = {} for i in [6]: selectedTagParams[i] = discoverparams.generate(leftTagFactors, portion=leftPortion) selectedTagParams[i].extend(discoverparams.generate(rightTagFactors, portion=rightPortion)) oldlen = len(selectedTagParams[i]) newlen = len(selectedPersonParams[i]) selectedTagParams[i].extend([selectedTagParams[i][random.randint(0, oldlen-1)] for j in range(newlen-oldlen)]) # generate tag type parameters for Query 12 selectedTagTypeParams = {} for i in [12]: selectedTagTypeParams[i] = discoverparams.generate(tagClassFactors, portion=0.1) # make sure there are as many tag paramters as person parameters oldlen = len(selectedTagTypeParams[i]) newlen = len(selectedPersonParams[i]) selectedTagTypeParams[i].extend([selectedTagTypeParams[i][random.randint(0, oldlen-1)] for j in range(newlen-oldlen)]) # find time parameters for Queries 2,3,4,5,9 selectedPersons = selectedPersonParams[2] + selectedPersonParams[3]+selectedPersonParams[4] selectedPersons += selectedPersonParams[5] + selectedPersonParams[9] selectedTimeParams = {} timeSelectionInput = { 2: (selectedPersonParams[2], "f", getTimeParamsBeforeMedian), 3: (selectedPersonParams[3], "ff", getTimeParamsWithMedian), 4: (selectedPersonParams[4], "f", getTimeParamsWithMedian), 5: (selectedPersonParams[5], "ffg", getTimeParamsAfterMedian), 9: (selectedPersonParams[9], "ff", getTimeParamsBeforeMedian) #11: (selectedPersonParams[11], "w", getTimeParamsBeforeMedian) # friends of friends work } print "find parameter bindings for Timestamps" selectedTimeParams = findTimeParams(timeSelectionInput, personFactorFiles, activityFactorFiles, friendsFiles, ts[1]) # Query 11 takes WorksFrom timestamp selectedTimeParams[11] = [random.randint(ts[2], ts[3]) for j in range(len(selectedPersonParams[11]))] # Query 10 additionally needs the HS parameter HS = [] for person in selectedPersonParams[10]: HS0 = random.randint(1, 12) if HS0 == 12: HS1 = 1 else: HS1 = HS0 + 1 HS.append((HS0, HS1)) # Query 1 takes first name as a parameter #nameParams = findNameParameters(nameFactors)# discoverparams.generate(nameFactors) ## if there are fewer first names than person parameters, repeat some of the names #if len(nameParams) < len(selectedPersonParams[2]): # oldlen = len(nameParams) # newlen = len(selectedPersonParams[2]) # nameParams.extend([nameParams[random.randint(0, oldlen-1)] for j in range(newlen-oldlen)]) nameParams = [] for person in selectedPersonParams[1]: n = givenNames.getValue(person) nameParams.append(n) # serialize all the parameters as CSV csvWriters = {} # all the queries have Person as parameter for i in range(1,15): csvWriter = CSVSerializer() csvWriter.setOutputFile(outdir+"interactive_%d_param.txt"%(i)) if i != 13 and i != 14: # these three queries take two Persons as parameters csvWriter.registerHandler(handlePersonParam, selectedPersonParams[i], "Person") csvWriters[i] = csvWriter # add output for Time parameter for i in timeSelectionInput: if i==3 or i==4: csvWriters[i].registerHandler(handleTimeDurationParam, selectedTimeParams[i], "Date0|Duration") else: csvWriters[i].registerHandler(handleTimeParam, selectedTimeParams[i], "Date0") # other, query-specific parameters csvWriters[1].registerHandler(handleFirstNameParam, nameParams, "Name") csvWriters[3].registerHandler(handlePairCountryParam, zip(selectedCountryParams[3],secondCountry),"Country1|Country2") csvWriters[6].registerHandler(handleTagParam, selectedTagParams[6],"Tag") csvWriters[10].registerHandler(handleHSParam, HS, "HS0") csvWriters[11].registerHandler(handleCountryParam, selectedCountryParams[11],"Country") csvWriters[11].registerHandler(handleWorkYearParam, selectedTimeParams[11],"Year") csvWriters[12].registerHandler(handleTagTypeParam, selectedTagTypeParams[12],"TagType") csvWriters[13].registerHandler(handlePairPersonParam, zip(selectedPersonParams[13], secondPerson[13]),"Person1|Person2") csvWriters[14].registerHandler(handlePairPersonParam, zip(selectedPersonParams[14], secondPerson[14]),"Person1|Person2") for j in csvWriters: csvWriters[j].writeCSV()
def main(argv=None): if argv is None: argv = sys.argv if len(argv) < 3: print "arguments: <input dir> <output>" return 1 indir = argv[1]+"/" activityFactorFiles=[] personFactorFiles=[] friendsFiles = [] outdir = argv[2]+"/" random.seed(SEED) for file in os.listdir(indir): if file.endswith("activityFactors.txt"): activityFactorFiles.append(indir+file) if file.endswith("personFactors.txt"): personFactorFiles.append(indir+file) if file.startswith("m0friendList"): friendsFiles.append(indir+file) # read precomputed counts from files (personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors, givenNames, ts, postHisto) = readfactors.load(personFactorFiles, activityFactorFiles, friendsFiles) # find person parameters print "find parameter bindings for Persons" selectedPersonParams = {} for i in range(1, 15): factors = readfactors.getFactorsForQuery(i, personFactors) selectedPersonParams[i] = discoverparams.generate(factors) # Queries 13 and 14 take two person parameters each. Generate pairs # secondPerson = {} # for i in [13, 14]: # secondPerson[i] = [] # for person in selectedPersonParams[i]: # j = 0 # while True: # j = random.randint(0, len(selectedPersonParams[i])-1) # if selectedPersonParams[i][j] != person: # break # secondPerson[i].append(selectedPersonParams[i][j]) # find country parameters for Query 3 and 11 print "find parameter bindings for Countries" selectedCountryParams = {} for i in [3, 11]: factors = readfactors.getCountryFactorsForQuery(i, countryFactors) selectedCountryParams[i] = discoverparams.generate(factors, portion=0.1) # make sure there are as many country parameters as person parameters oldlen = len(selectedCountryParams[i]) newlen = len(selectedPersonParams[i]) selectedCountryParams[i].extend([selectedCountryParams[i][random.randint(0, oldlen-1)] for j in range(newlen-oldlen)]) # Query 3 needs two countries as parameters. Generate the second one: secondCountry = [] for c in selectedCountryParams[3]: i=0 while True: i = random.randint(0, len(selectedCountryParams[3])-1) if selectedCountryParams[3][i]!=c: break secondCountry.append(selectedCountryParams[3][i]) #find tag parameters for Query 6 #print "find parameter bindings for Tags" # old tag selection #selectedTagParams = {} #for i in [6]: # selectedTagParams[i] = discoverparams.generate(tagFactors, portion=0.1) # # make sure there are as many tag paramters as person parameters # oldlen = len(selectedTagParams[i]) # newlen = len(selectedPersonParams[i]) # selectedTagParams[i].extend([selectedTagParams[i][random.randint(0, oldlen-1)] for j in range(newlen-oldlen)]) #print "find parameter bindings for Tags" (leftTagFactors, rightTagFactors) = discoverparams.divideFactors(tagFactors, 0.7) leftSize = len(leftTagFactors) rightSize = len(rightTagFactors) leftPortion = 0.1*(leftSize+rightSize) / (2.0*leftSize) rightPortion = 0.1*(leftSize+rightSize) / (2.0*rightSize) selectedTagParams = {} for i in [6]: selectedTagParams[i] = discoverparams.generate(leftTagFactors, portion=leftPortion) selectedTagParams[i].extend(discoverparams.generate(rightTagFactors, portion=rightPortion)) oldlen = len(selectedTagParams[i]) newlen = len(selectedPersonParams[i]) selectedTagParams[i].extend([selectedTagParams[i][random.randint(0, oldlen-1)] for j in range(newlen-oldlen)]) # generate tag type parameters for Query 12 selectedTagTypeParams = {} for i in [12]: selectedTagTypeParams[i] = discoverparams.generate(tagClassFactors, portion=0.1) # make sure there are as many tag paramters as person parameters oldlen = len(selectedTagTypeParams[i]) newlen = len(selectedPersonParams[i]) selectedTagTypeParams[i].extend([selectedTagTypeParams[i][random.randint(0, oldlen-1)] for j in range(newlen-oldlen)]) # find time parameters for Queries 2,3,4,5,9,13,14 selectedPersons = selectedPersonParams[2] + selectedPersonParams[3]+selectedPersonParams[4] selectedPersons += selectedPersonParams[5] + selectedPersonParams[9] selectedPersons += selectedPersonParams[13] + selectedPersonParams[14] selectedTimeParams = {} timeSelectionInput = { 2: (selectedPersonParams[2], "f", getTimeParamsBeforeMedian), 3: (selectedPersonParams[3], "ff", getTimeParamsWithMedian), 4: (selectedPersonParams[4], "f", getTimeParamsWithMedian), 5: (selectedPersonParams[5], "ffg", getTimeParamsAfterMedian), 9: (selectedPersonParams[9], "ff", getTimeParamsBeforeMedian), 13: (selectedPersonParams[13], "f", getTimeParamsAfterMedian), 14: (selectedPersonParams[14], "f", getTimeParamsAfterMedian) #11: (selectedPersonParams[11], "w", getTimeParamsBeforeMedian) # friends of friends work } print "find parameter bindings for Timestamps" selectedTimeParams = findTimeParams(timeSelectionInput, personFactorFiles, activityFactorFiles, friendsFiles, ts[1]) # Query 11 takes WorksFrom timestamp selectedTimeParams[11] = [random.randint(ts[2], ts[3]) for j in range(len(selectedPersonParams[11]))] # Query 10 additionally needs the HS parameter HS = [] for person in selectedPersonParams[10]: HS0 = random.randint(1, 12) if HS0 == 12: HS1 = 1 else: HS1 = HS0 + 1 HS.append((HS0, HS1)) # Query 1 takes first name as a parameter #nameParams = findNameParameters(nameFactors)# discoverparams.generate(nameFactors) ## if there are fewer first names than person parameters, repeat some of the names #if len(nameParams) < len(selectedPersonParams[2]): # oldlen = len(nameParams) # newlen = len(selectedPersonParams[2]) # nameParams.extend([nameParams[random.randint(0, oldlen-1)] for j in range(newlen-oldlen)]) nameParams = [] for person in selectedPersonParams[1]: n = givenNames.getValue(person) nameParams.append(n) # serialize all the parameters as CSV csvWriters = {} # all the queries have Person as parameter for i in range(1,15): csvWriter = CSVSerializer() csvWriter.setOutputFile(outdir+"query_%d_param.txt"%(i)) # if i != 13 and i != 14: # these three queries take two Persons as parameters csvWriter.registerHandler(handlePersonParam, selectedPersonParams[i], "Person") csvWriters[i] = csvWriter # add output for Time parameter for i in timeSelectionInput: if i==3 or i==4: csvWriters[i].registerHandler(handleTimeDurationParam, selectedTimeParams[i], "Date0|Duration") else: csvWriters[i].registerHandler(handleTimeParam, selectedTimeParams[i], "Date0") # other, query-specific parameters csvWriters[1].registerHandler(handleFirstNameParam, nameParams, "Name") csvWriters[3].registerHandler(handlePairCountryParam, zip(selectedCountryParams[3],secondCountry),"Country1|Country2") csvWriters[6].registerHandler(handleTagParam, selectedTagParams[6],"Tag") csvWriters[10].registerHandler(handleHSParam, HS, "HS0") csvWriters[11].registerHandler(handleCountryParam, selectedCountryParams[11],"Country") csvWriters[11].registerHandler(handleWorkYearParam, selectedTimeParams[11],"Year") csvWriters[12].registerHandler(handleTagTypeParam, selectedTagTypeParams[12],"TagType") # csvWriters[13].registerHandler(handlePairPersonParam, zip(selectedPersonParams[13], secondPerson[13]),"Person1|Person2") # csvWriters[14].registerHandler(handlePairPersonParam, zip(selectedPersonParams[14], secondPerson[14]),"Person1|Person2") for j in csvWriters: csvWriters[j].writeCSV()
def main(argv=None): if argv is None: argv = sys.argv if len(argv) < 3: print("arguments: <input dir> <output>") return 1 indir = argv[1]+"/" activityFactorFiles=[] personFactorFiles=[] friendsFiles = [] outdir = argv[2]+"/" random.seed(SEED) for file in os.listdir(indir): if file.endswith("activityFactors.txt"): activityFactorFiles.append(indir+file) if file.endswith("personFactors.txt"): personFactorFiles.append(indir+file) if file.startswith("m0friendList"): friendsFiles.append(indir+file) # read precomputed counts from files (personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors, givenNames, ts, postsHisto) = readfactors.load(personFactorFiles, activityFactorFiles, friendsFiles) # find person parameters selectedPersonParams = {} for i in range(1, 15): factors = readfactors.getFactorsForQuery(i, personFactors) selectedPersonParams[i] = discoverparams.generate(factors) # Queries 13 and 14 take two person parameters each. Generate pairs secondPerson = {} for i in [13, 14]: secondPerson[i] = [] for person in selectedPersonParams[i]: j = 0 while True: j = random.randint(0, len(selectedPersonParams[i])-1) if selectedPersonParams[i][j] != person: break secondPerson[i].append(selectedPersonParams[i][j]) # find country parameters for Query 3 and 11 selectedCountryParams = {} for i in [3, 11]: selectedCountryParams[i] = discoverparams.generate(countryFactors, portion=0.1) # make sure there are as many country parameters as person parameters oldlen = len(selectedCountryParams[i]) newlen = len(selectedPersonParams[i]) selectedCountryParams[i].extend([selectedCountryParams[i][random.randint(0, oldlen-1)] for j in range(newlen-oldlen)]) # Query 3 needs two countries as parameters. Generate the second one: secondCountry = [] for c in selectedCountryParams[3]: i=0 while True: i = random.randint(0, len(selectedCountryParams[3])-1) if selectedCountryParams[3][i]!=c: break secondCountry.append(selectedCountryParams[3][i]) (leftTagFactors, rightTagFactors) = discoverparams.divideFactors(tagFactors, 0.7) leftSize = len(leftTagFactors) rightSize = len(rightTagFactors) leftPortion = 0.1*(leftSize+rightSize) / (2.0*leftSize) rightPortion = 0.1*(leftSize+rightSize) / (2.0*rightSize) selectedTagParams = {} for i in [6]: selectedTagParams[i] = discoverparams.generate(leftTagFactors, portion=leftPortion) selectedTagParams[i].extend(discoverparams.generate(rightTagFactors, portion=rightPortion)) oldlen = len(selectedTagParams[i]) newlen = len(selectedPersonParams[i]) selectedTagParams[i].extend([selectedTagParams[i][random.randint(0, oldlen-1)] for j in range(newlen-oldlen)]) # generate tag type parameters for Query 12 selectedTagTypeParams = {} for i in [12]: selectedTagTypeParams[i] = discoverparams.generate(tagClassFactors, portion=0.1) # make sure there are as many tag paramters as person parameters oldlen = len(selectedTagTypeParams[i]) newlen = len(selectedPersonParams[i]) selectedTagTypeParams[i].extend([selectedTagTypeParams[i][random.randint(0, oldlen-1)] for j in range(newlen-oldlen)]) # find time parameters for Queries 2,3,4,5,9 selectedPersons = selectedPersonParams[2] + selectedPersonParams[3]+selectedPersonParams[4] selectedPersons += selectedPersonParams[5] + selectedPersonParams[9] timeSelectionInput = { 2: (selectedPersonParams[2], "f", getTimeParamsBeforeMedian), 3: (selectedPersonParams[3], "ff", getTimeParamsWithMedian), 4: (selectedPersonParams[4], "f", getTimeParamsWithMedian), 5: (selectedPersonParams[5], "ffg", getTimeParamsAfterMedian), 9: (selectedPersonParams[9], "ff", getTimeParamsBeforeMedian) } selectedTimeParams = findTimeParams(timeSelectionInput, personFactorFiles, activityFactorFiles, friendsFiles, ts[1]) # Query 11 takes WorksFrom timestamp selectedTimeParams[11] = [random.randint(ts[2], ts[3]) for j in range(len(selectedPersonParams[11]))] # Query 10 additionally needs the month parameter months = [] for person in selectedPersonParams[10]: month = random.randint(1, 12) months.append(month) nameParams = [] for person in selectedPersonParams[1]: n = givenNames.getValue(person) nameParams.append(n) # serialize all the parameters as CSV csvWriters = {} # all the queries have Person as parameter for i in range(1,15): csvWriter = CSVSerializer() csvWriter.setOutputFile(outdir+"interactive_%d_param.txt"%(i)) if i != 13 and i != 14: # these two queries take two Persons as parameters csvWriter.registerHandler(handlePersonParam, selectedPersonParams[i], "personId") csvWriters[i] = csvWriter # add output for Time parameter for i in timeSelectionInput: if i==3 or i==4: csvWriters[i].registerHandler(handleTimeDurationParam, selectedTimeParams[i], "startDate|durationDays") elif i==2 or i==9: csvWriters[i].registerHandler(handleMaxTimeParam, selectedTimeParams[i], "maxDate") elif i==5: csvWriters[i].registerHandler(handleMinTimeParam, selectedTimeParams[i], "minDate") # other, query-specific parameters csvWriters[1].registerHandler(handleFirstNameParam, nameParams, "firstName") csvWriters[3].registerHandler(handlePairCountryParam, list(zip(selectedCountryParams[3],secondCountry)), "countryXName|countryYName") csvWriters[6].registerHandler(handleTagParam, selectedTagParams[6], "tagName") csvWriters[10].registerHandler(handleMonthParam, months, "month") csvWriters[11].registerHandler(handleCountryParam, selectedCountryParams[11], "countryName") csvWriters[11].registerHandler(handleWorkYearParam, selectedTimeParams[11], "workFromYear") csvWriters[12].registerHandler(handleTagTypeParam, selectedTagTypeParams[12], "tagClassName") csvWriters[13].registerHandler(handlePairPersonParam, list(zip(selectedPersonParams[13], secondPerson[13])), "person1Id|person2Id") csvWriters[14].registerHandler(handlePairPersonParam, list(zip(selectedPersonParams[14], secondPerson[14])), "person1Id|person2Id") for j in csvWriters: csvWriters[j].writeCSV()