Ejemplo n.º 1
0
def run_ml_analysis(log_file="log.txt", splitfrac=0.1, nfolds=10,
		feat_choice="ads", nfeat=5, verbose=False):
	"""Build feature vectors from a log file and run the ML analysis on them.

	The log is read with ``converter.get_ads_from_log``, converted with
	``converter.get_feature_vectors`` and the result is handed to
	``ml.run_ml_analysis``.

	Args:
		log_file: path passed to ``converter.get_ads_from_log``.
		splitfrac: forwarded unchanged to ``ml.run_ml_analysis``.
		nfolds: forwarded to ``ml.run_ml_analysis``; the log must contain
			at least this many blocks or the analysis is skipped.
		feat_choice: feature type, either "ads" or "words".
		nfeat: forwarded unchanged to ``ml.run_ml_analysis``.
		verbose: when true, print the time spent building the feature
			vectors and call ``stat.print_counts``; also forwarded.

	Returns:
		None. An illegal ``feat_choice`` or a log with too few blocks is
		reported on stdout and the function returns early.
	"""
	if feat_choice not in ("ads", "words"):
		print("Illegal feat_choice %s" % (feat_choice,))
		return
	collection, names = converter.get_ads_from_log(log_file)

	n_blocks = len(collection)
	if n_blocks < nfolds:
		print("Too few blocks (%s). Analysis requires at least as many blocks as nfolds (%s)." % (n_blocks, nfolds))
		return

	started = datetime.now()
	# Forward the validated feat_choice. It used to be hard-coded to 'ads',
	# so asking for "words" silently produced ad features instead.
	X, y, feat = converter.get_feature_vectors(collection, feat_choice=feat_choice)
	elapsed = datetime.now() - started
	if verbose:
		# The double space mirrors what the original two-argument print
		# statement emitted (trailing space in the literal + separator).
		print("Time for constructing feature vectors:  %s" % elapsed)
		stat.print_counts(X, y)
	ml.run_ml_analysis(X, y, feat, names, feat_choice, nfeat, splitfrac=splitfrac,
		nfolds=nfolds, verbose=verbose)
Ejemplo n.º 2
0
def run_ml_analysis(log_file="log.txt", splitfrac=0.1, nfolds=10,
		feat_choice="ads", nfeat=5, verbose=False):
	"""Build feature vectors from a log file and run the ML analysis on them.

	The log is read with ``converter.read_log``, converted with
	``converter.get_feature_vectors`` and the result is handed to
	``ml.run_ml_analysis``.

	Args:
		log_file: path passed to ``converter.read_log``.
		splitfrac: forwarded unchanged to ``ml.run_ml_analysis``.
		nfolds: forwarded to ``ml.run_ml_analysis``; the log must contain
			at least this many blocks or the analysis is skipped.
		feat_choice: feature type, either "ads" or "words".
		nfeat: forwarded unchanged to ``ml.run_ml_analysis``.
		verbose: when true, print the time spent building the feature
			vectors and call ``stat.print_counts``; also forwarded.

	Returns:
		None. An illegal ``feat_choice`` or a log with too few blocks is
		reported on stdout and the function returns early.
	"""
	if feat_choice not in ("ads", "words"):
		print("Illegal feat_choice %s" % (feat_choice,))
		return
	collection, names = converter.read_log(log_file)

	n_blocks = len(collection)
	if n_blocks < nfolds:
		print("Too few blocks (%s). Analysis requires at least as many blocks as nfolds (%s)." % (n_blocks, nfolds))
		return

	started = datetime.now()
	# Forward the validated feat_choice. It used to be hard-coded to 'ads',
	# so asking for "words" silently produced ad features instead.
	X, y, feat = converter.get_feature_vectors(collection, feat_choice=feat_choice)
	# The shapes are always printed, independent of `verbose`.
	print(X.shape)
	print(y.shape)
	elapsed = datetime.now() - started
	if verbose:
		# The double space mirrors what the original two-argument print
		# statement emitted (trailing space in the literal + separator).
		print("Time for constructing feature vectors:  %s" % elapsed)
		stat.print_counts(X, y)
	ml.run_ml_analysis(X, y, feat, names, feat_choice, nfeat, splitfrac=splitfrac,
		nfolds=nfolds, verbose=verbose)
Ejemplo n.º 3
0
def run_ml_analysis(log_file="log.txt",
                    splitfrac=0.1,
                    nfolds=10,
                    feat_choice="ads",
                    nfeat=5,
                    verbose=False):
    """Build feature vectors from a log file and run the ML analysis on them.

    The log is read with ``converter.get_ads_from_log``, converted with
    ``converter.get_feature_vectors`` and the result is handed to
    ``ml.run_ml_analysis``.

    Args:
        log_file: path passed to ``converter.get_ads_from_log``.
        splitfrac: forwarded unchanged to ``ml.run_ml_analysis``.
        nfolds: forwarded to ``ml.run_ml_analysis``; the log must contain
            at least this many blocks or the analysis is skipped.
        feat_choice: feature type, either "ads" or "words".
        nfeat: forwarded unchanged to ``ml.run_ml_analysis``.
        verbose: when true, print the time spent building the feature
            vectors and call ``stat.print_counts``; also forwarded.

    Returns:
        None. An illegal ``feat_choice`` or a log with too few blocks is
        reported on stdout and the function returns early.
    """
    if feat_choice not in ("ads", "words"):
        print("Illegal feat_choice %s" % (feat_choice,))
        return
    collection, names = converter.get_ads_from_log(log_file)

    n_blocks = len(collection)
    if n_blocks < nfolds:
        print("Too few blocks (%s). Analysis requires at least as many blocks as nfolds (%s)." % (
            n_blocks, nfolds))
        return

    started = datetime.now()
    # Forward the validated feat_choice. It used to be hard-coded to 'ads',
    # so asking for "words" silently produced ad features instead.
    X, y, feat = converter.get_feature_vectors(collection,
                                               feat_choice=feat_choice)
    elapsed = datetime.now() - started
    if verbose:
        # The double space mirrors what the original two-argument print
        # statement emitted (trailing space in the literal + separator).
        print("Time for constructing feature vectors:  %s" % elapsed)
        stat.print_counts(X, y)
    ml.run_ml_analysis(X,
                       y,
                       feat,
                       names,
                       feat_choice,
                       nfeat,
                       splitfrac=splitfrac,
                       nfolds=nfolds,
                       verbose=verbose)
Ejemplo n.º 4
0
def run_ml_analysis(log_file="log.txt",
                    splitfrac=0.1,
                    nfolds=10,
                    feat_choice="ads",
                    nfeat=5,
                    verbose=False):
    """Build feature vectors from a log file and run the ML analysis on them.

    The log is read with ``converter.read_log``, converted with
    ``converter.get_feature_vectors`` and the result is handed to
    ``ml.run_ml_analysis``.

    Args:
        log_file: path passed to ``converter.read_log``.
        splitfrac: forwarded unchanged to ``ml.run_ml_analysis``.
        nfolds: forwarded to ``ml.run_ml_analysis``; the log must contain
            at least this many blocks or the analysis is skipped.
        feat_choice: feature type, either "ads" or "words".
        nfeat: forwarded unchanged to ``ml.run_ml_analysis``.
        verbose: when true, print the time spent building the feature
            vectors and call ``stat.print_counts``; also forwarded.

    Returns:
        None. An illegal ``feat_choice`` or a log with too few blocks is
        reported on stdout and the function returns early.
    """
    if feat_choice not in ("ads", "words"):
        print("Illegal feat_choice %s" % (feat_choice,))
        return
    collection, names = converter.read_log(log_file)

    n_blocks = len(collection)
    if n_blocks < nfolds:
        print("Too few blocks (%s). Analysis requires at least as many blocks as nfolds (%s)." % (
            n_blocks, nfolds))
        return

    started = datetime.now()
    # Forward the validated feat_choice. It used to be hard-coded to 'ads',
    # so asking for "words" silently produced ad features instead.
    X, y, feat = converter.get_feature_vectors(collection,
                                               feat_choice=feat_choice)
    # The shapes are always printed, independent of `verbose`.
    print(X.shape)
    print(y.shape)
    elapsed = datetime.now() - started
    if verbose:
        # The double space mirrors what the original two-argument print
        # statement emitted (trailing space in the literal + separator).
        print("Time for constructing feature vectors:  %s" % elapsed)
        stat.print_counts(X, y)
    ml.run_ml_analysis(X,
                       y,
                       feat,
                       names,
                       feat_choice,
                       nfeat,
                       splitfrac=splitfrac,
                       nfolds=nfolds,
                       verbose=verbose)
Ejemplo n.º 5
0
def compute_influence(log_file="log.txt"):  ## eventually move it to analysis
	"""Print per-feature influence scores, then run an ML analysis on two treatments.

	Reads ``log_file`` with ``converter.read_log``, builds 'ads' feature
	vectors and accumulates, for every treatment label, the feature rows of
	the units carrying that label.  From these per-label totals it prints a
	"gender" score (rows 0-2 against rows 3-5) and an "age" score (pairwise
	differences inside rows 0-2 and inside rows 3-5), each divided by the
	sum over the six rows, and then lists the features with the highest
	gender score.  Finally it keeps, in every block, only the units whose
	label is a multiple of 4 and passes them to ``ml.run_ml_analysis``
	under the names 'm18' and 'f35'.

	NOTE(review): the code assumes exactly six treatments and, in every
	block, exactly one unit per label -- confirm against the experiment
	layout.  The function blocks on ``raw_input`` after printing the
	totals, so it needs an interactive Python 2 session.

	Args:
		log_file: path passed to ``converter.read_log``.

	Returns:
		None; every result is printed.
	"""
	collection, names = converter.read_log(log_file)
	print(names)
	X, y, feat = converter.get_feature_vectors(collection, feat_choice='ads')
	print("%s %s" % (X.shape, y.shape))
	# One row of per-feature totals for every treatment name.
	out = np.zeros((len(names), X.shape[2]))
	print(out.shape)

	for block in range(X.shape[0]):
		for label in range(X.shape[1]):
			# The selection has shape (k, n_features); assigning it to a
			# single row only works while k == 1 (one unit per label).
			out[label] = out[label] + X[block][np.where(y[block] == label)]

	total = out[0] + out[1] + out[2] + out[3] + out[4] + out[5]
	print(total)
	raw_input("wait")
	print("Computing gender influence")
	diff = (abs(out[0] - out[3]) + abs(out[1] - out[4]) + abs(out[2] - out[5])) / total
	print(diff)

	print("Computing age influence")
	diff2 = (abs(out[0] - out[1]) + abs(out[1] - out[2]) + abs(out[2] - out[0])
		+ abs(out[3] - out[4]) + abs(out[4] - out[5]) + abs(out[5] - out[3])) / total
	print(diff2)

	male = out[0] + out[1] + out[2]
	female = out[3] + out[4] + out[5]
	print("-------")
	print(male)
	print(female)
	print("-------")
	print("total ads: %s" % out.sum())

	print(np.sort(diff)[::-1])
	shown = 0
	# Walk the distinct scores from highest to lowest.  Iterating the raw
	# sorted array visited a tied score once per tied feature and re-printed
	# the whole tied group each time, which also inflated the counter.
	for score in np.unique(diff)[::-1]:
		print("out:----- %s" % score)
		for j in np.where(diff == score)[0]:
			shown += 1
			print("index: %s infl: %s --- m: %s f: %s" % (j, score, male[j], female[j]))
			print("%s %s %s %s %s %s" % (out[0][j], out[1][j], out[2][j],
				out[3][j], out[4][j], out[5][j]))
			feat.choose_by_index(j).display()
		# Checked after a whole tied group, so slightly more than 20
		# features may be listed.
		if shown > 20:
			break

	X2 = np.zeros((X.shape[0], 2, X.shape[2]))
	y2 = np.zeros((y.shape[0], 2), dtype=int)
	print("%s %s" % (X.shape, X2.shape))
	names2 = ['m18', 'f35']

	for block in range(X.shape[0]):
		# Keep the units whose label is a multiple of 4; the two-slot
		# targets require exactly two such units per block.
		keep = np.where(y[block] % 4 == 0)
		X2[block] = X[block][keep]
		# Floor division makes the Python 2 integer-division intent
		# explicit (label 0 -> 0, label 4 -> 1).
		y2[block] = y[block][keep] // 4

	ml.run_ml_analysis(X2, y2, feat, names2, feat_choice="ads", nfeat=5, splitfrac=0.1,
		nfolds=10, verbose=False)
Ejemplo n.º 6
0
def compute_influence(log_file="log.txt"):  ## eventually move it to analysis
    """Print per-feature influence scores, then run an ML analysis on two treatments.

    Reads ``log_file`` with ``converter.read_log``, builds 'ads' feature
    vectors and accumulates, for every treatment label, the feature rows of
    the units carrying that label.  From these per-label totals it prints a
    "gender" score (rows 0-2 against rows 3-5) and an "age" score (pairwise
    differences inside rows 0-2 and inside rows 3-5), each divided by the
    sum over the six rows, and then lists the features with the highest
    gender score.  Finally it keeps, in every block, only the units whose
    label is a multiple of 4 and passes them to ``ml.run_ml_analysis``
    under the names 'm18' and 'f35'.

    NOTE(review): the code assumes exactly six treatments and, in every
    block, exactly one unit per label -- confirm against the experiment
    layout.  The function blocks on ``raw_input`` after printing the
    totals, so it needs an interactive Python 2 session.

    Args:
        log_file: path passed to ``converter.read_log``.

    Returns:
        None; every result is printed.
    """
    collection, names = converter.read_log(log_file)
    print(names)
    X, y, feat = converter.get_feature_vectors(collection, feat_choice='ads')
    print("%s %s" % (X.shape, y.shape))
    # One row of per-feature totals for every treatment name.
    out = np.zeros((len(names), X.shape[2]))
    print(out.shape)

    for block in range(X.shape[0]):
        for label in range(X.shape[1]):
            # The selection has shape (k, n_features); assigning it to a
            # single row only works while k == 1 (one unit per label).
            out[label] = out[label] + X[block][np.where(y[block] == label)]

    total = out[0] + out[1] + out[2] + out[3] + out[4] + out[5]
    print(total)
    raw_input("wait")
    print("Computing gender influence")
    diff = (abs(out[0] - out[3]) + abs(out[1] - out[4]) +
            abs(out[2] - out[5])) / total
    print(diff)

    print("Computing age influence")
    diff2 = (abs(out[0] - out[1]) + abs(out[1] - out[2]) +
             abs(out[2] - out[0]) + abs(out[3] - out[4]) +
             abs(out[4] - out[5]) + abs(out[5] - out[3])) / total
    print(diff2)

    male = out[0] + out[1] + out[2]
    female = out[3] + out[4] + out[5]
    print("-------")
    print(male)
    print(female)
    print("-------")
    print("total ads: %s" % out.sum())

    print(np.sort(diff)[::-1])
    shown = 0
    # Walk the distinct scores from highest to lowest.  Iterating the raw
    # sorted array visited a tied score once per tied feature and re-printed
    # the whole tied group each time, which also inflated the counter.
    for score in np.unique(diff)[::-1]:
        print("out:----- %s" % score)
        for j in np.where(diff == score)[0]:
            shown += 1
            print("index: %s infl: %s --- m: %s f: %s" %
                  (j, score, male[j], female[j]))
            print("%s %s %s %s %s %s" % (out[0][j], out[1][j], out[2][j],
                                         out[3][j], out[4][j], out[5][j]))
            feat.choose_by_index(j).display()
        # Checked after a whole tied group, so slightly more than 20
        # features may be listed.
        if shown > 20:
            break

    X2 = np.zeros((X.shape[0], 2, X.shape[2]))
    y2 = np.zeros((y.shape[0], 2), dtype=int)
    print("%s %s" % (X.shape, X2.shape))
    names2 = ['m18', 'f35']

    for block in range(X.shape[0]):
        # Keep the units whose label is a multiple of 4; the two-slot
        # targets require exactly two such units per block.
        keep = np.where(y[block] % 4 == 0)
        X2[block] = X[block][keep]
        # Floor division makes the Python 2 integer-division intent
        # explicit (label 0 -> 0, label 4 -> 1).
        y2[block] = y[block][keep] // 4

    ml.run_ml_analysis(X2,
                       y2,
                       feat,
                       names2,
                       feat_choice="ads",
                       nfeat=5,
                       splitfrac=0.1,
                       nfolds=10,
                       verbose=False)