Beispiel #1
0
def main(argv=None):
    """Select parameter bindings for queries 1-14 and write one CSV per query.

    Factor files are picked from the input directory by file name
    (``*activityFactors.txt``, ``*personFactors.txt``, ``m0friendList*``) and
    loaded with ``readfactors.load``.  Person, Country, Tag, TagType,
    timestamp, HS and first-name parameters are derived from them and written
    to ``<output dir>/query_<n>_param.txt``.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name, input
            directory, output directory).  Defaults to ``sys.argv``.

    Returns:
        1 when fewer than two arguments follow the program name, otherwise
        None.

    Raises:
        ValueError: If the Query 3 country parameters do not contain two
            different values, so no second country can be paired with them.
    """

    def pad_with_repeats(params, target_len):
        # Grow params in place up to target_len by repeating randomly chosen
        # entries of the list as it was before padding.
        original_len = len(params)
        params.extend([params[random.randint(0, original_len - 1)]
                       for _ in range(target_len - original_len)])

    def pick_other(values, current):
        # Rejection sampling: draw until an element different from current
        # turns up.  Callers must ensure such an element exists.
        while True:
            candidate = values[random.randint(0, len(values) - 1)]
            if candidate != current:
                return candidate

    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        print("arguments: <input dir> <output>")
        return 1

    indir = argv[1] + "/"
    outdir = argv[2] + "/"
    # Seed before any random draw below.
    random.seed(SEED)

    activityFactorFiles = []
    personFactorFiles = []
    friendsFiles = []
    for name in os.listdir(indir):
        if name.endswith("activityFactors.txt"):
            activityFactorFiles.append(indir + name)
        if name.endswith("personFactors.txt"):
            personFactorFiles.append(indir + name)
        if name.startswith("m0friendList"):
            friendsFiles.append(indir + name)

    # read precomputed counts from files
    (personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors,
     givenNames, ts, postHisto) = readfactors.load(
        personFactorFiles, activityFactorFiles, friendsFiles)

    # find person parameters
    print("find parameter bindings for Persons")
    selectedPersonParams = {}
    for query in range(1, 15):
        factors = readfactors.getFactorsForQuery(query, personFactors)
        selectedPersonParams[query] = discoverparams.generate(factors)

    # find country parameters for Query 3 and 11
    print("find parameter bindings for Countries")
    selectedCountryParams = {}
    for query in [3, 11]:
        factors = readfactors.getCountryFactorsForQuery(query, countryFactors)
        selectedCountryParams[query] = discoverparams.generate(factors, portion=0.1)
        # make sure there are as many country parameters as person parameters
        pad_with_repeats(selectedCountryParams[query],
                         len(selectedPersonParams[query]))

    # Query 3 needs two countries as parameters. Generate the second one.
    countries = selectedCountryParams[3]
    if countries and all(c == countries[0] for c in countries):
        # Without this guard the rejection sampling below would never end.
        raise ValueError(
            "Query 3 needs at least two distinct country parameters")
    secondCountry = [pick_other(countries, c) for c in countries]

    # find tag parameters for Query 6: sample both parts of the split factors
    (leftTagFactors, rightTagFactors) = discoverparams.divideFactors(tagFactors, 0.7)
    leftSize = len(leftTagFactors)
    rightSize = len(rightTagFactors)
    leftPortion = 0.1 * (leftSize + rightSize) / (2.0 * leftSize)
    rightPortion = 0.1 * (leftSize + rightSize) / (2.0 * rightSize)
    tags = discoverparams.generate(leftTagFactors, portion=leftPortion)
    tags.extend(discoverparams.generate(rightTagFactors, portion=rightPortion))
    pad_with_repeats(tags, len(selectedPersonParams[6]))
    selectedTagParams = {6: tags}

    # generate tag type parameters for Query 12
    tagTypes = discoverparams.generate(tagClassFactors, portion=0.1)
    # make sure there are as many tag type parameters as person parameters
    pad_with_repeats(tagTypes, len(selectedPersonParams[12]))
    selectedTagTypeParams = {12: tagTypes}

    # find time parameters for Queries 2,3,4,5,9,13,14
    timeSelectionInput = {
        2: (selectedPersonParams[2], "f", getTimeParamsBeforeMedian),
        3: (selectedPersonParams[3], "ff", getTimeParamsWithMedian),
        4: (selectedPersonParams[4], "f", getTimeParamsWithMedian),
        5: (selectedPersonParams[5], "ffg", getTimeParamsAfterMedian),
        9: (selectedPersonParams[9], "ff", getTimeParamsBeforeMedian),
        13: (selectedPersonParams[13], "f", getTimeParamsAfterMedian),
        14: (selectedPersonParams[14], "f", getTimeParamsAfterMedian),
    }

    print("find parameter bindings for Timestamps")
    selectedTimeParams = findTimeParams(
        timeSelectionInput, personFactorFiles, activityFactorFiles,
        friendsFiles, ts[1])
    # Query 11 takes WorksFrom timestamp
    selectedTimeParams[11] = [random.randint(ts[2], ts[3])
                              for _ in selectedPersonParams[11]]

    # Query 10 additionally needs the HS parameter: a value in 1..12 paired
    # with its successor, wrapping from 12 back to 1.
    HS = []
    for _ in selectedPersonParams[10]:
        HS0 = random.randint(1, 12)
        HS1 = 1 if HS0 == 12 else HS0 + 1
        HS.append((HS0, HS1))

    # Query 1 takes the first name of each selected person as a parameter
    nameParams = [givenNames.getValue(person)
                  for person in selectedPersonParams[1]]

    # serialize all the parameters as CSV
    csvWriters = {}
    # all the queries have Person as parameter
    for query in range(1, 15):
        csvWriter = CSVSerializer()
        csvWriter.setOutputFile(outdir + "query_%d_param.txt" % query)
        csvWriter.registerHandler(handlePersonParam, selectedPersonParams[query], "Person")
        csvWriters[query] = csvWriter

    # add output for Time parameter
    for query in timeSelectionInput:
        if query in (3, 4):
            csvWriters[query].registerHandler(handleTimeDurationParam, selectedTimeParams[query], "Date0|Duration")
        else:
            csvWriters[query].registerHandler(handleTimeParam, selectedTimeParams[query], "Date0")

    # other, query-specific parameters
    csvWriters[1].registerHandler(handleFirstNameParam, nameParams, "Name")
    csvWriters[3].registerHandler(handlePairCountryParam, list(zip(selectedCountryParams[3], secondCountry)), "Country1|Country2")
    csvWriters[6].registerHandler(handleTagParam, selectedTagParams[6], "Tag")
    csvWriters[10].registerHandler(handleHSParam, HS, "HS0")
    csvWriters[11].registerHandler(handleCountryParam, selectedCountryParams[11], "Country")
    csvWriters[11].registerHandler(handleWorkYearParam, selectedTimeParams[11], "Year")
    csvWriters[12].registerHandler(handleTagTypeParam, selectedTagTypeParams[12], "TagType")

    for query in csvWriters:
        csvWriters[query].writeCSV()
Beispiel #2
0
def main(argv=None):
    """Derive query parameters from precomputed factors and serialize them.

    Factor files are picked from the input directory by file name, loaded with
    ``readfactors.load`` and reduced to sorted (key, count) samples.  Each
    ``serialize_q*`` helper is then called with the output directory and the
    parameters selected for that query.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name, input
            directory, output directory).  Defaults to ``sys.argv``.

    Returns:
        1 when fewer than two arguments follow the program name, otherwise
        None.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        print("arguments: <input dir> <output dir>")
        return 1

    indir = argv[1] + "/"
    outdir = argv[2] + "/"
    activityFactorFiles = []
    personFactorFiles = []
    friendsFiles = []

    for name in os.listdir(indir):
        if name.endswith("activityFactors.txt"):
            activityFactorFiles.append(indir + name)
        if name.endswith("personFactors.txt"):
            personFactorFiles.append(indir + name)
        if name.startswith("m0friendList"):
            friendsFiles.append(indir + name)

    # read precomputed counts from files
    (personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors,
     givenNames, ts, postsHisto) = readfactors.load(
        personFactorFiles, activityFactorFiles, friendsFiles)
    week_posts = convert_posts_histo(postsHisto)

    # Shuffle the person keys with a fixed seed.
    persons = [key for key, _ in personFactors.values.items()]
    random.seed(1988)
    random.shuffle(persons)

    # Each sample below is ordered by its second field, largest first.
    country_sample = [[key, value.getValue("p")]
                      for key, value in countryFactors.values.items()]
    country_sample.sort(key=lambda x: x[1], reverse=True)

    tagclass_posts = tagClassFactors
    tagclass_posts.sort(key=lambda x: x[1], reverse=True)

    tag_posts = tagFactors
    tag_posts.sort(key=lambda x: x[1], reverse=True)

    total_posts = sum(count for _, count in tag_posts)

    # Day ranges are targeted at 10% of all posts, +/- 10%.
    post_day_ranges = post_date_range_params(week_posts,
                                             0.1 * total_posts * 0.9,
                                             0.1 * total_posts * 1.1)

    bad_words = [
        'Augustine', 'William', 'James', 'with', 'Henry', 'Robert', 'from',
        'Pope', 'Hippo', 'album', 'David', 'has', 'one', 'also', 'Green',
        'which', 'that'
    ]

    # Four non-empty weeks are counted as one month.  Fewer than four would
    # make the divisor zero, so it is clamped to one month.
    non_empty_weeks = sum(1 for week in week_posts if week[1] != 0)
    months = max(1, non_empty_weeks // 4)
    # Floor division reproduces what Python 2 `/` yields for integers
    # (assumes post counts are integers -- TODO confirm against readfactors).
    posts_per_month = total_posts // months
    post_months = post_month_params(week_posts, posts_per_month * 0.8,
                                    posts_per_month * 1.2)

    # the lower bound is inclusive and the upper bound is exclusive
    path_bounds = enumerate_path_bounds(3, 6, 2)
    language_codes = prob_language_codes()
    post_lengths = prob_post_lengths()

    def by_share(sample, lower_divisor, upper_divisor):
        # key_params with bounds given as integer fractions of total_posts.
        return key_params(sample, total_posts // lower_divisor,
                          total_posts // upper_divisor)

    def open_date_ranges():
        # Right-open date ranges between 30% and 60% of all posts; computed
        # anew for every query that uses them, as before.
        return post_date_right_open_range_params(week_posts,
                                                 0.3 * total_posts,
                                                 0.6 * total_posts)

    serialize_q2(outdir, by_share(country_sample, 200, 100),
                 post_day_ranges)  # TODO determine constants
    serialize_q3(outdir, post_months)
    serialize_q14(outdir, post_months)

    serialize_q1(outdir, open_date_ranges())
    serialize_q12(outdir, open_date_ranges())
    serialize_q18(outdir, open_date_ranges(), post_lengths, language_codes)
    serialize_q10(outdir, by_share(tag_posts, 900, 600), open_date_ranges())

    serialize_q4(outdir, by_share(tagclass_posts, 20, 10),
                 by_share(country_sample, 150, 50))
    serialize_q5(outdir, by_share(country_sample, 200, 100))
    serialize_q6(outdir, by_share(tag_posts, 1300, 900))
    serialize_q7(outdir, by_share(tag_posts, 900, 600))
    serialize_q8(outdir, by_share(tag_posts, 600, 300))
    serialize_q9(outdir, key_params(tagclass_posts, 6000, 25000))
    serialize_q13(outdir, by_share(country_sample, 200, 100))
    serialize_q15(outdir, by_share(country_sample, 200, 100))
    serialize_q16(outdir, persons,
                  by_share(tagclass_posts, 30, 10),
                  by_share(country_sample, 80, 20),
                  path_bounds)
    serialize_q17(outdir, by_share(country_sample, 200, 100))
    serialize_q19(outdir, by_share(tagclass_posts, 60, 10))
    serialize_q21(outdir, by_share(country_sample, 200, 100))
    serialize_q22(outdir, by_share(country_sample, 120, 40))
    serialize_q23(outdir, by_share(country_sample, 200, 100))
    serialize_q24(outdir, by_share(tagclass_posts, 140, 5))
    serialize_q25(outdir, persons, post_months)

    # TODO: Refine
    serialize_q20(outdir, by_share(tagclass_posts, 20, 2))
    serialize_q11(outdir, by_share(country_sample, 80, 20), bad_words)
def main(argv=None):
    """Select parameter bindings for queries 1-14 and write one CSV per query.

    Factor files are picked from the input directory by file name
    (``*activityFactors.txt``, ``*personFactors.txt``, ``m0friendList*``) and
    loaded with ``readfactors.load``.  Person, Country, Tag, TagType,
    timestamp, HS and first-name parameters are derived from them and written
    to ``<output dir>/interactive_<n>_param.txt``.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name, input
            directory, output directory).  Defaults to ``sys.argv``.

    Returns:
        1 when fewer than two arguments follow the program name, otherwise
        None.

    Raises:
        ValueError: If the person parameters of Query 13 or 14, or the
            country parameters of Query 3, do not contain two different
            values, so no partner can be paired with them.
    """

    def pad_with_repeats(params, target_len):
        # Grow params in place up to target_len by repeating randomly chosen
        # entries of the list as it was before padding.
        original_len = len(params)
        params.extend([params[random.randint(0, original_len - 1)]
                       for _ in range(target_len - original_len)])

    def pick_other(values, current):
        # Rejection sampling: draw until an element different from current
        # turns up.  Callers must ensure such an element exists.
        while True:
            candidate = values[random.randint(0, len(values) - 1)]
            if candidate != current:
                return candidate

    def all_same(values):
        # True for a non-empty list whose entries are all equal.
        return bool(values) and all(v == values[0] for v in values)

    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        print("arguments: <input dir> <output>")
        return 1

    indir = argv[1] + "/"
    outdir = argv[2] + "/"
    # Seed before any random draw below.
    random.seed(SEED)

    activityFactorFiles = []
    personFactorFiles = []
    friendsFiles = []
    for name in os.listdir(indir):
        if name.endswith("activityFactors.txt"):
            activityFactorFiles.append(indir + name)
        if name.endswith("personFactors.txt"):
            personFactorFiles.append(indir + name)
        if name.startswith("m0friendList"):
            friendsFiles.append(indir + name)

    # read precomputed counts from files
    (personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors,
     givenNames, ts, postHisto) = readfactors.load(
        personFactorFiles, activityFactorFiles, friendsFiles)

    # find person parameters
    print("find parameter bindings for Persons")
    selectedPersonParams = {}
    for query in range(1, 15):
        factors = readfactors.getFactorsForQuery(query, personFactors)
        selectedPersonParams[query] = discoverparams.generate(factors)

    # Queries 13 and 14 take two person parameters each. Generate pairs.
    secondPerson = {}
    for query in [13, 14]:
        candidates = selectedPersonParams[query]
        if all_same(candidates):
            # Without this guard the rejection sampling would never end.
            raise ValueError(
                "Query %d needs at least two distinct person parameters" % query)
        secondPerson[query] = [pick_other(candidates, person)
                               for person in candidates]

    # find country parameters for Query 3 and 11
    print("find parameter bindings for Countries")
    selectedCountryParams = {}
    for query in [3, 11]:
        factors = readfactors.getCountryFactorsForQuery(query, countryFactors)
        selectedCountryParams[query] = discoverparams.generate(factors, portion=0.1)
        # make sure there are as many country parameters as person parameters
        pad_with_repeats(selectedCountryParams[query],
                         len(selectedPersonParams[query]))

    # Query 3 needs two countries as parameters. Generate the second one.
    countries = selectedCountryParams[3]
    if all_same(countries):
        raise ValueError(
            "Query 3 needs at least two distinct country parameters")
    secondCountry = [pick_other(countries, c) for c in countries]

    # find tag parameters for Query 6: sample both parts of the split factors
    (leftTagFactors, rightTagFactors) = discoverparams.divideFactors(tagFactors, 0.7)
    leftSize = len(leftTagFactors)
    rightSize = len(rightTagFactors)
    leftPortion = 0.1 * (leftSize + rightSize) / (2.0 * leftSize)
    rightPortion = 0.1 * (leftSize + rightSize) / (2.0 * rightSize)
    tags = discoverparams.generate(leftTagFactors, portion=leftPortion)
    tags.extend(discoverparams.generate(rightTagFactors, portion=rightPortion))
    pad_with_repeats(tags, len(selectedPersonParams[6]))
    selectedTagParams = {6: tags}

    # generate tag type parameters for Query 12
    tagTypes = discoverparams.generate(tagClassFactors, portion=0.1)
    # make sure there are as many tag type parameters as person parameters
    pad_with_repeats(tagTypes, len(selectedPersonParams[12]))
    selectedTagTypeParams = {12: tagTypes}

    # find time parameters for Queries 2,3,4,5,9
    timeSelectionInput = {
        2: (selectedPersonParams[2], "f", getTimeParamsBeforeMedian),
        3: (selectedPersonParams[3], "ff", getTimeParamsWithMedian),
        4: (selectedPersonParams[4], "f", getTimeParamsWithMedian),
        5: (selectedPersonParams[5], "ffg", getTimeParamsAfterMedian),
        9: (selectedPersonParams[9], "ff", getTimeParamsBeforeMedian),
    }

    print("find parameter bindings for Timestamps")
    selectedTimeParams = findTimeParams(
        timeSelectionInput, personFactorFiles, activityFactorFiles,
        friendsFiles, ts[1])
    # Query 11 takes WorksFrom timestamp
    selectedTimeParams[11] = [random.randint(ts[2], ts[3])
                              for _ in selectedPersonParams[11]]

    # Query 10 additionally needs the HS parameter: a value in 1..12 paired
    # with its successor, wrapping from 12 back to 1.
    HS = []
    for _ in selectedPersonParams[10]:
        HS0 = random.randint(1, 12)
        HS1 = 1 if HS0 == 12 else HS0 + 1
        HS.append((HS0, HS1))

    # Query 1 takes the first name of each selected person as a parameter
    nameParams = [givenNames.getValue(person)
                  for person in selectedPersonParams[1]]

    # serialize all the parameters as CSV
    csvWriters = {}
    for query in range(1, 15):
        csvWriter = CSVSerializer()
        csvWriter.setOutputFile(outdir + "interactive_%d_param.txt" % query)
        # Queries 13 and 14 take two Persons; their pair handler is
        # registered further down instead.
        if query not in (13, 14):
            csvWriter.registerHandler(handlePersonParam, selectedPersonParams[query], "Person")
        csvWriters[query] = csvWriter

    # add output for Time parameter
    for query in timeSelectionInput:
        if query in (3, 4):
            csvWriters[query].registerHandler(handleTimeDurationParam, selectedTimeParams[query], "Date0|Duration")
        else:
            csvWriters[query].registerHandler(handleTimeParam, selectedTimeParams[query], "Date0")

    # other, query-specific parameters
    csvWriters[1].registerHandler(handleFirstNameParam, nameParams, "Name")
    csvWriters[3].registerHandler(handlePairCountryParam, list(zip(selectedCountryParams[3], secondCountry)), "Country1|Country2")
    csvWriters[6].registerHandler(handleTagParam, selectedTagParams[6], "Tag")
    csvWriters[10].registerHandler(handleHSParam, HS, "HS0")
    csvWriters[11].registerHandler(handleCountryParam, selectedCountryParams[11], "Country")
    csvWriters[11].registerHandler(handleWorkYearParam, selectedTimeParams[11], "Year")
    csvWriters[12].registerHandler(handleTagTypeParam, selectedTagTypeParams[12], "TagType")
    csvWriters[13].registerHandler(handlePairPersonParam, list(zip(selectedPersonParams[13], secondPerson[13])), "Person1|Person2")
    csvWriters[14].registerHandler(handlePairPersonParam, list(zip(selectedPersonParams[14], secondPerson[14])), "Person1|Person2")

    for query in csvWriters:
        csvWriters[query].writeCSV()
Beispiel #4
0
def main(argv=None):
    """Derive query parameters from precomputed factors and serialize them.

    Factor files are picked from the input directory by file name, loaded with
    ``readfactors.load`` and sorted by count.  Each ``serialize_q*`` helper is
    then called with the output directory and the parameters selected for
    that query.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name, input
            directory, output directory).  Defaults to ``sys.argv``.

    Returns:
        1 when fewer than two arguments follow the program name, otherwise
        None.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        print("arguments: <input dir> <output dir>")
        return 1

    indir = argv[1] + "/"
    outdir = argv[2] + "/"
    activityFactorFiles = []
    personFactorFiles = []
    friendsFiles = []

    for name in os.listdir(indir):
        if name.endswith("activityFactors.txt"):
            activityFactorFiles.append(indir + name)
        if name.endswith("personFactors.txt"):
            personFactorFiles.append(indir + name)
        if name.startswith("m0friendList"):
            friendsFiles.append(indir + name)

    # read precomputed counts from files
    (personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors,
     givenNames, ts, postsHisto) = readfactors.load(
        personFactorFiles, activityFactorFiles, friendsFiles)
    week_posts = convert_posts_histo(postsHisto)

    # Shuffle the person keys with a fixed seed.
    persons = [key for key, _ in personFactors.values.items()]
    random.seed(1988)
    random.shuffle(persons)

    # Each sample below is sorted in place by its second field, largest first.
    country_sample = countryFactors
    country_sample.sort(key=lambda x: x[1], reverse=True)

    tagclass_posts = tagClassFactors
    tagclass_posts.sort(key=lambda x: x[1], reverse=True)

    tag_posts = tagFactors
    tag_posts.sort(key=lambda x: x[1], reverse=True)

    total_posts = sum(count for _, count in tag_posts)

    # Four non-empty weeks are counted as one month.  Fewer than four would
    # make the divisor zero, so it is clamped to one month.
    non_empty_weeks = sum(1 for week in week_posts if week[1] != 0)
    months = max(1, non_empty_weeks // 4)
    posts_per_month = total_posts // months
    post_months = post_month_params(week_posts, posts_per_month * 0.8,
                                    posts_per_month * 1.2)

    # the lower bound is inclusive and the upper bound is exclusive
    path_bounds = enumerate_path_bounds(3, 6, 2)
    language_codes = prob_language_codes()
    post_lengths = prob_post_lengths()

    def by_share(sample, lower_divisor, upper_divisor):
        # key_params with bounds given as integer fractions of total_posts.
        return key_params(sample, total_posts // lower_divisor,
                          total_posts // upper_divisor)

    def open_date_ranges():
        # Right-open date ranges between 30% and 60% of all posts; computed
        # anew for every query that uses them, as before.
        return post_date_right_open_range_params(week_posts,
                                                 0.3 * total_posts,
                                                 0.6 * total_posts)

    # The trailing "new: N" markers are kept from the original code.
    serialize_q3(outdir, post_months)  # new: 2
    serialize_q14(outdir, post_months)  # new: 9

    serialize_q1(outdir, open_date_ranges())
    serialize_q18(outdir, open_date_ranges(), post_lengths,
                  language_codes)  # new: 12
    serialize_q10(outdir, by_share(tag_posts, 900, 600),
                  open_date_ranges())  # new: 8

    serialize_q4(outdir, by_share(tagclass_posts, 20, 10),
                 by_share(country_sample, 150, 50))  # new: 3
    serialize_q5(outdir, by_share(country_sample, 200, 100))  # new: 4
    serialize_q6(outdir, by_share(tag_posts, 1300, 900))  # new: 5
    serialize_q7(outdir, by_share(tag_posts, 900, 600))  # new: 6
    serialize_q8(outdir, by_share(tag_posts, 600, 300))  # new: 7
    serialize_q16(outdir, persons,
                  by_share(tagclass_posts, 30, 10),
                  by_share(country_sample, 80, 20),
                  path_bounds)  # new: 10
    serialize_q17(outdir, by_share(country_sample, 200, 100))  # new: 11
    serialize_q21(outdir, by_share(country_sample, 200, 100))  # new: 13
    serialize_q22(outdir, by_share(country_sample, 120, 40))  # new: 14
    serialize_q25(outdir, persons, post_months)  # new: 15
def main(argv=None):
    """Derive query parameters from precomputed factors and serialize them.

    Factor files (``*factors.txt``) and friend lists (``m0friendList*``) are
    picked from the input directory, loaded with ``readfactors.load`` and
    reduced to sorted (key, count) samples.  Each ``serialize_q*`` helper is
    then called with the parameters selected for that query.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name, input
            directory, output).  Defaults to ``sys.argv``.  The output
            argument is required but is not read by this function.

    Returns:
        1 when fewer than two arguments follow the program name, otherwise
        None.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        print("arguments: <input dir> <output>")
        return 1

    indir = argv[1] + "/"
    factorFiles = []
    friendsFiles = []

    for name in os.listdir(indir):
        if name.endswith("factors.txt"):
            factorFiles.append(indir + name)
        if name.startswith("m0friendList"):
            friendsFiles.append(indir + name)

    # read precomputed counts from files
    (personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors,
     givenNames, ts, postsHisto) = readfactors.load(factorFiles, friendsFiles)
    week_posts = convert_posts_histo(postsHisto)

    # Each sample below is ordered by its second field, largest first.
    country_sample = [[key, value.getValue("p")]
                      for key, value in countryFactors.values.items()]
    country_sample.sort(key=lambda x: x[1], reverse=True)

    tagclass_posts = tagClassFactors
    tagclass_posts.sort(key=lambda x: x[1], reverse=True)

    tag_posts = tagFactors
    tag_posts.sort(key=lambda x: x[1], reverse=True)

    total_posts = sum(count for _, count in tag_posts)

    # Country sets and day ranges are both targeted at 10% of all posts,
    # +/- 10%.
    country_sets = country_sets_params(country_sample,
                                       0.1 * total_posts * 0.9,
                                       0.1 * total_posts * 1.1, 4)
    post_day_ranges = post_date_range_params(week_posts,
                                             0.1 * total_posts * 0.9,
                                             0.1 * total_posts * 1.1)

    # The first field of the last week_posts entry is divided by 7 and then
    # by 4 to get a month count.  A value below 28 would make the divisor
    # zero, so it is clamped to one month.
    # Floor division reproduces what Python 2 `/` yields for integers
    # (assumes integer day offsets and post counts -- TODO confirm).
    months = max(1, week_posts[-1][0] // 7 // 4)
    posts_per_month = total_posts // months
    post_lower_threshold = posts_per_month * 0.8
    post_upper_threshold = posts_per_month * 1.2
    post_months = post_month_params(week_posts, post_lower_threshold,
                                    post_upper_threshold)

    def by_share(sample, lower_divisor, upper_divisor):
        # key_params with bounds given as integer fractions of total_posts.
        return key_params(sample, total_posts // lower_divisor,
                          total_posts // upper_divisor)

    def open_date_ranges():
        # Right-open date ranges between 30% and 60% of all posts; computed
        # anew for every query that uses them, as before.
        return post_date_right_open_range_params(week_posts,
                                                 0.3 * total_posts,
                                                 0.6 * total_posts)

    serialize_q2(country_sets, post_day_ranges)
    serialize_q3(post_months)
    # Query 14 uses month thresholds twice as large as Query 3.
    serialize_q14(post_month_params(week_posts, post_lower_threshold * 2,
                                    post_upper_threshold * 2))

    serialize_q1(open_date_ranges())
    serialize_q12(open_date_ranges())
    serialize_q18(open_date_ranges())

    serialize_q4(by_share(tagclass_posts, 20, 10),
                 by_share(country_sample, 120, 70))
    serialize_q5(by_share(country_sample, 200, 100))
    serialize_q6(by_share(tag_posts, 1300, 900))
    serialize_q7(by_share(tag_posts, 900, 600))
    serialize_q8(by_share(tag_posts, 600, 300))
    serialize_q9(key_params(tagclass_posts, 6000, 25000))
    serialize_q10(by_share(tag_posts, 900, 600))
    serialize_q13(by_share(country_sample, 200, 100))
    serialize_q15(by_share(country_sample, 200, 100))
    serialize_q16(by_share(tagclass_posts, 30, 10),
                  by_share(country_sample, 110, 70))
    serialize_q17(by_share(country_sample, 200, 100))
    serialize_q19(by_share(tagclass_posts, 60, 10))
    serialize_q21(by_share(country_sample, 200, 100))
    serialize_q22(by_share(country_sample, 120, 40))
    serialize_q23(by_share(country_sample, 200, 100))
    serialize_q24(by_share(tagclass_posts, 140, 5))
def main(argv=None):
    """Select parameter bindings for queries 1-14 and write one CSV per query.

    Factor files are picked from the input directory by file name
    (``*activityFactors.txt``, ``*personFactors.txt``, ``m0friendList*``) and
    loaded with ``readfactors.load``.  Person, country, tag, tag class,
    timestamp, month and first-name parameters are derived from them and
    written to ``<output dir>/interactive_<n>_param.txt``.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name, input
            directory, output directory).  Defaults to ``sys.argv``.

    Returns:
        1 when fewer than two arguments follow the program name, otherwise
        None.

    Raises:
        ValueError: If the person parameters of Query 13 or 14, or the
            country parameters of Query 3, do not contain two different
            values, so no partner can be paired with them.
    """

    def pad_with_repeats(params, target_len):
        # Grow params in place up to target_len by repeating randomly chosen
        # entries of the list as it was before padding.
        original_len = len(params)
        params.extend([params[random.randint(0, original_len - 1)]
                       for _ in range(target_len - original_len)])

    def pick_other(values, current):
        # Rejection sampling: draw until an element different from current
        # turns up.  Callers must ensure such an element exists.
        while True:
            candidate = values[random.randint(0, len(values) - 1)]
            if candidate != current:
                return candidate

    def all_same(values):
        # True for a non-empty list whose entries are all equal.
        return bool(values) and all(v == values[0] for v in values)

    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        print("arguments: <input dir> <output>")
        return 1

    indir = argv[1] + "/"
    outdir = argv[2] + "/"
    # Seed before any random draw below.
    random.seed(SEED)

    activityFactorFiles = []
    personFactorFiles = []
    friendsFiles = []
    for name in os.listdir(indir):
        if name.endswith("activityFactors.txt"):
            activityFactorFiles.append(indir + name)
        if name.endswith("personFactors.txt"):
            personFactorFiles.append(indir + name)
        if name.startswith("m0friendList"):
            friendsFiles.append(indir + name)

    # read precomputed counts from files
    (personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors,
     givenNames, ts, postsHisto) = readfactors.load(
        personFactorFiles, activityFactorFiles, friendsFiles)

    # find person parameters
    selectedPersonParams = {}
    for query in range(1, 15):
        factors = readfactors.getFactorsForQuery(query, personFactors)
        selectedPersonParams[query] = discoverparams.generate(factors)

    # Queries 13 and 14 take two person parameters each. Generate pairs.
    secondPerson = {}
    for query in [13, 14]:
        candidates = selectedPersonParams[query]
        if all_same(candidates):
            # Without this guard the rejection sampling would never end.
            raise ValueError(
                "Query %d needs at least two distinct person parameters" % query)
        secondPerson[query] = [pick_other(candidates, person)
                               for person in candidates]

    # find country parameters for Query 3 and 11
    selectedCountryParams = {}
    for query in [3, 11]:
        selectedCountryParams[query] = discoverparams.generate(countryFactors, portion=0.1)
        # make sure there are as many country parameters as person parameters
        pad_with_repeats(selectedCountryParams[query],
                         len(selectedPersonParams[query]))

    # Query 3 needs two countries as parameters. Generate the second one.
    countries = selectedCountryParams[3]
    if all_same(countries):
        raise ValueError(
            "Query 3 needs at least two distinct country parameters")
    secondCountry = [pick_other(countries, c) for c in countries]

    # find tag parameters for Query 6: sample both parts of the split factors
    (leftTagFactors, rightTagFactors) = discoverparams.divideFactors(tagFactors, 0.7)
    leftSize = len(leftTagFactors)
    rightSize = len(rightTagFactors)
    leftPortion = 0.1 * (leftSize + rightSize) / (2.0 * leftSize)
    rightPortion = 0.1 * (leftSize + rightSize) / (2.0 * rightSize)
    tags = discoverparams.generate(leftTagFactors, portion=leftPortion)
    tags.extend(discoverparams.generate(rightTagFactors, portion=rightPortion))
    pad_with_repeats(tags, len(selectedPersonParams[6]))
    selectedTagParams = {6: tags}

    # generate tag type parameters for Query 12
    tagTypes = discoverparams.generate(tagClassFactors, portion=0.1)
    # make sure there are as many tag type parameters as person parameters
    pad_with_repeats(tagTypes, len(selectedPersonParams[12]))
    selectedTagTypeParams = {12: tagTypes}

    # find time parameters for Queries 2,3,4,5,9
    timeSelectionInput = {
        2: (selectedPersonParams[2], "f", getTimeParamsBeforeMedian),
        3: (selectedPersonParams[3], "ff", getTimeParamsWithMedian),
        4: (selectedPersonParams[4], "f", getTimeParamsWithMedian),
        5: (selectedPersonParams[5], "ffg", getTimeParamsAfterMedian),
        9: (selectedPersonParams[9], "ff", getTimeParamsBeforeMedian),
    }

    selectedTimeParams = findTimeParams(
        timeSelectionInput, personFactorFiles, activityFactorFiles,
        friendsFiles, ts[1])
    # Query 11 takes WorksFrom timestamp
    selectedTimeParams[11] = [random.randint(ts[2], ts[3])
                              for _ in selectedPersonParams[11]]

    # Query 10 additionally needs the month parameter (1..12), one per person
    months = [random.randint(1, 12) for _ in selectedPersonParams[10]]

    # Query 1 takes the first name of each selected person as a parameter
    nameParams = [givenNames.getValue(person)
                  for person in selectedPersonParams[1]]

    # serialize all the parameters as CSV
    csvWriters = {}
    for query in range(1, 15):
        csvWriter = CSVSerializer()
        csvWriter.setOutputFile(outdir + "interactive_%d_param.txt" % query)
        # Queries 13 and 14 take two Persons; their pair handler is
        # registered further down instead.
        if query not in (13, 14):
            csvWriter.registerHandler(handlePersonParam, selectedPersonParams[query], "personId")
        csvWriters[query] = csvWriter

    # add output for Time parameter
    for query in timeSelectionInput:
        if query in (3, 4):
            csvWriters[query].registerHandler(handleTimeDurationParam, selectedTimeParams[query], "startDate|durationDays")
        elif query in (2, 9):
            csvWriters[query].registerHandler(handleMaxTimeParam, selectedTimeParams[query], "maxDate")
        elif query == 5:
            csvWriters[query].registerHandler(handleMinTimeParam, selectedTimeParams[query], "minDate")

    # other, query-specific parameters
    csvWriters[1].registerHandler(handleFirstNameParam, nameParams, "firstName")
    csvWriters[3].registerHandler(handlePairCountryParam, list(zip(selectedCountryParams[3], secondCountry)), "countryXName|countryYName")
    csvWriters[6].registerHandler(handleTagParam, selectedTagParams[6], "tagName")
    csvWriters[10].registerHandler(handleMonthParam, months, "month")
    csvWriters[11].registerHandler(handleCountryParam, selectedCountryParams[11], "countryName")
    csvWriters[11].registerHandler(handleWorkYearParam, selectedTimeParams[11], "workFromYear")
    csvWriters[12].registerHandler(handleTagTypeParam, selectedTagTypeParams[12], "tagClassName")
    csvWriters[13].registerHandler(handlePairPersonParam, list(zip(selectedPersonParams[13], secondPerson[13])), "person1Id|person2Id")
    csvWriters[14].registerHandler(handlePairPersonParam, list(zip(selectedPersonParams[14], secondPerson[14])), "person1Id|person2Id")

    for query in csvWriters:
        csvWriters[query].writeCSV()
def main(argv=None):
   """Generate parameter files from precomputed factor files.

   Scans the input directory for factor files, loads them through
   ``readfactors.load``, derives post-count thresholds from the loaded
   counts and then calls every ``serialize_q*`` function with the output
   directory and the candidate values selected for it.

   Args:
      argv: Argument list laid out like ``sys.argv``: program name, input
         directory, output directory. Defaults to ``sys.argv``.

   Returns:
      1 when ``argv`` holds fewer than three items (usage is printed);
      otherwise None, because the function falls off its end.
   """
   if argv is None:
      argv = sys.argv

   if len(argv) < 3:
      # A single parenthesised argument prints identically with the Python 2
      # print statement and the Python 3 print function.
      print("arguments: <input dir> <output dir>")
      return 1

   indir = argv[1]+"/"
   outdir = argv[2]+"/"
   activityFactorFiles = []
   personFactorFiles = []
   friendsFiles = []

   # Classify the directory entries by file-name pattern; the three checks
   # are independent, so they are not chained with elif.
   for entry in os.listdir(indir):
      if entry.endswith("activityFactors.txt"):
         activityFactorFiles.append(indir+entry)
      if entry.endswith("personFactors.txt"):
         personFactorFiles.append(indir+entry)
      if entry.startswith("m0friendList"):
         friendsFiles.append(indir+entry)

   # read precomputed counts from files
   (personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors, givenNames, ts, postsHisto) = \
      readfactors.load(personFactorFiles, activityFactorFiles, friendsFiles)
   week_posts = convert_posts_histo(postsHisto)

   # Fixed seed: the shuffle (and any later use of the random module) starts
   # from the same generator state on every run.
   persons = [key for key, _ in personFactors.values.iteritems()]
   random.seed(1988)
   random.shuffle(persons)

   # [country key, value of factor "p"] pairs, largest value first.
   country_sample = [[key, value.getValue("p")]
                     for key, value in countryFactors.values.iteritems()]
   country_sample.sort(key=lambda x: x[1], reverse=True)

   # NOTE: these are aliases, so the sorts reorder the loaded lists in place.
   tagclass_posts = tagClassFactors
   tagclass_posts.sort(key=lambda x: x[1], reverse=True)

   tag_posts = tagFactors
   tag_posts.sort(key=lambda x: x[1], reverse=True)

   # Sum of the second field over all tag entries.
   total_posts = sum(count for _, count in tag_posts)

   # Day ranges: accept between 90% and 110% of one tenth of total_posts.
   day_range_lower = 0.1*total_posts*0.9
   day_range_upper = 0.1*total_posts*1.1
   post_day_ranges = post_date_range_params(week_posts, day_range_lower, day_range_upper)

   bad_words = ['Augustine','William','James','with','Henry','Robert','from','Pope','Hippo','album','David','has','one','also','Green','which','that']

   # Weeks whose count (second field) is non-zero.
   non_empty_weeks = sum(1 for week in week_posts if week[1] != 0)

   # Four weeks are treated as one month. Clamp to 1 so that fewer than four
   # non-empty weeks no longer raises ZeroDivisionError below.
   non_empty_months = max(1, non_empty_weeks // 4)

   # Months: accept between 80% and 120% of total_posts per non-empty month.
   month_lower = (total_posts/non_empty_months)*0.8
   month_upper = (total_posts/non_empty_months)*1.2
   post_months = post_month_params(week_posts, month_lower, month_upper)

   path_bounds = enumerate_path_bounds(3, 9, 2)
   language_codes = prob_language_codes()
   post_lengths = prob_post_lengths()

   # The call order below is kept as is: the helpers may draw from the shared
   # random generator, so reordering could change the generated values.
   serialize_q2(outdir, key_params(country_sample, total_posts/200, total_posts/100), post_day_ranges) # TODO determine constants
   serialize_q3(outdir, post_months)
   serialize_q14(outdir, post_month_params(week_posts, month_lower*2, month_upper*2))

   serialize_q1(outdir, post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
   serialize_q12(outdir, post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
   serialize_q18(outdir, post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts), post_lengths, language_codes)
   serialize_q10(outdir, key_params(tag_posts, total_posts/900, total_posts/600), post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))

   serialize_q4(outdir, key_params(tagclass_posts, total_posts/20, total_posts/10), key_params(country_sample, total_posts/120, total_posts/70))
   serialize_q5(outdir, key_params(country_sample, total_posts/200, total_posts/100))
   serialize_q6(outdir, key_params(tag_posts, total_posts/1300, total_posts/900))
   serialize_q7(outdir, key_params(tag_posts, total_posts/900, total_posts/600))
   serialize_q8(outdir, key_params(tag_posts, total_posts/600, total_posts/300))
   serialize_q9(outdir, key_params(tagclass_posts, 6000, 25000))
   serialize_q13(outdir, key_params(country_sample, total_posts/200, total_posts/100))
   serialize_q15(outdir, key_params(country_sample, total_posts/200, total_posts/100))
   serialize_q16(outdir, persons, key_params(tagclass_posts, total_posts/30, total_posts/10), key_params(country_sample, total_posts/80, total_posts/20), path_bounds)
   serialize_q17(outdir, key_params(country_sample, total_posts/200, total_posts/100))
   serialize_q19(outdir, key_params(tagclass_posts, total_posts/60, total_posts/10))
   serialize_q21(outdir, key_params(country_sample, total_posts/200, total_posts/100))
   serialize_q22(outdir, key_params(country_sample, total_posts/120, total_posts/40))
   serialize_q23(outdir, key_params(country_sample, total_posts/200, total_posts/100))
   serialize_q24(outdir, key_params(tagclass_posts, total_posts/140, total_posts/5))
   serialize_q25(outdir, persons, post_months)

   # TODO: Refine
   serialize_q20(outdir, key_params(tagclass_posts, total_posts/20, total_posts/2))
   serialize_q11(outdir, key_params(country_sample, total_posts/80, total_posts/20), bad_words)