Example #1
def set_libodbc_path(path):
    """
    Set the first path that GraphLab Create will search for libodbc.so.

    Since ODBC requires a driver manager to be installed system-wide, we
    provide this to help you if it is installed in a non-standard location.
    GraphLab Create will also search on the system's default library paths, so
    if you installed your driver manager in a standard way, you shouldn't need
    to worry about this function.
    """
    gl.set_runtime_config('GRAPHLAB_LIBODBC_PREFIX', path)
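# A minimal usage sketch of the helper above. The path is hypothetical; point it
# at whatever directory actually contains libodbc.so if your driver manager is
# installed in a non-standard location.
import graphlab as gl
set_libodbc_path('/opt/unixodbc/lib')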
Example #3
def load_graphlab():
    if sys.version_info >= (3, 0):
        raise VersionError("Graphlab is only available in Python 2")
    start = time.clock()  # noqa
    import graphlab
    gl_product_key = os.getenv('GLCREATE_PRODUCT_KEY', False)
    if not gl_product_key:
        print("Please set GLCREATE_PRODUCT_KEY")
        return

    graphlab.product_key.set_product_key(gl_product_key)
    # Display graphlab canvas in notebook
    graphlab.canvas.set_target('ipynb')
    # Number of workers
    graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 16)
    since = time.clock() - start
    print("Graphlab loaded in {:.3f} seconds".format(since))
    return graphlab
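# A hedged usage sketch of load_graphlab(). The key below is a placeholder, not a
# real product key; normally GLCREATE_PRODUCT_KEY would be exported in the shell.
import os
os.environ.setdefault('GLCREATE_PRODUCT_KEY', '<your-product-key>')
graphlab = load_graphlab()
if graphlab is not None:
    sf = graphlab.SFrame({'x': [1, 2, 3]})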
Example #4
import graphlab as gl
import numpy as np
from scipy.spatial.distance import cosine,euclidean
import time
import datetime
from operator import itemgetter
import itertools
import math
import multiprocessing as mp
gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 32)
gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_GRAPH_LAMBDA_WORKERS', 32)
gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY',100000000000)
gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE',100000000000)


tfidf = False
n_iter = 50
random_sample_size=5000000
k_range = np.arange(285,301,5)
filename = 'tf'
np.random.seed(99)
n_cores = 16


docs = gl.SArray("doc_array")
if tfidf:
    docs = gl.text_analytics.tf_idf(docs)
    # round the tf-idf weights up to positive integer counts
    docs = docs.apply(lambda row: {k: round(v) + 1 for k, v in row.iteritems()})
train,test = gl.text_analytics.random_split(docs,0.1)
train.save("train_data_"+filename)
test.save("test_data_"+filename)
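# The k_range and n_iter settings above suggest a topic-model sweep over the saved
# splits. A sketch of how that might continue -- the loop and the evaluation are
# assumptions, while gl.topic_model.create and its perplexity metric are standard
# GraphLab Create APIs.
train = gl.SArray("train_data_" + filename)
test = gl.SArray("test_data_" + filename)
for k in k_range:
    model = gl.topic_model.create(train, num_topics=int(k), num_iterations=n_iter)
    print k, model.evaluate(train, test)['perplexity']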
Example #5
import graphlab as gl
import time
gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS', '/mnt/data/tmp')
gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_GRAPH_LAMBDA_WORKERS', 30)

dpath = '/mnt/data/'


def load_data(dpath, maxn=None):
    cites = gl.SFrame.read_csv(dpath + 'cites.csv',
                               column_type_hints=[int, int])
    paper = gl.SFrame.read_csv(dpath + 'papers.csv',
                               column_type_hints=[int, int])
    if maxn is not None:
        paper = paper[paper['id'] < maxn]
        cites = cites[cites.apply(lambda x: x['p1'] < maxn and x['p2'] < maxn)]
    sg = gl.SGraph(vertices=paper,
                   edges=cites,
                   vid_field='id',
                   src_field='p1',
                   dst_field='p2')
    return sg


def findp_update_fn(src, edge, dst):
    pdst = dst['parent']
    psrc = src['parent']
    for pid, d in pdst.iteritems():
        if pid not in psrc:
            psrc[pid] = d + 1
            src['changed'] = True
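    # The snippet is truncated here. A triple_apply update function has to write the
    # mutated fields back and return the (src, edge, dst) triple, so the missing tail
    # presumably mirrors childs_update_fn further down this page -- a sketch:
    src['parent'] = psrc
    return (src, edge, dst)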
Example #6
	return merged_sf

def eval_model(model, test, col):
	'''Evaluate a trained model using Kaggle scoring.'''
	return log_loss_raw(test[col], model.predict(test, output_type='probability'))
	
def log_loss_raw(target, predicted):
	'''Calculate log_loss between target and predicted and return.'''
	p = predicted.apply(lambda x: min(0.99999, max(1e-5, x)))
	logp = p.apply(lambda x: math.log(x))
	logmp = p.apply(lambda x: (math.log(1-x)))
	return -(target * logp + (1-target) * logmp).mean()


if __name__=='__main__':
	gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS','/home/mraza/tmp/')	
	gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY', 200*1024*1024*1024)
	gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE', 100*1024*1024*1024)

	parser=argparse.ArgumentParser(description='Spectral Features Preprocessing')
	parser.add_argument('-cf','--clusteral_features',help='Input File containing clusteral features', required=True)
	parser.add_argument('-lf','--labels_file', help='Ground Truth labels file', required=True)
	parser.add_argument('-of','--output_file', help='Output file', required=True)
	parser.add_argument('-cfk','--clusteral_key_column', required=True)
	parser.add_argument('-lfk','--labels_key_column', required=True)
	parser.add_argument('-lfv','--labels_value_column', required=True)
	parser.add_argument('-i','--interaction', required=True)
	parser.add_argument('-j','--join_type', required=False)
	parser.add_argument('-e','--encode', required=False)

	args=parser.parse_args()
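	# A hedged sketch of what presumably follows the argument parsing: load both inputs
	# as SFrames and join them on the given key columns. The join below is an assumption
	# based on the argument names, not the author's confirmed logic.
	clusteral = gl.SFrame.read_csv(args.clusteral_features)
	labels = gl.SFrame.read_csv(args.labels_file)
	merged = clusteral.join(labels, on={args.clusteral_key_column: args.labels_key_column}, how=args.join_type or 'inner')
	merged.save(args.output_file)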
Example #7
# processed/ directories -- where you ran the prep_image.sh.  It will
# put a image-sframes/ directory with train and test SFrames in the
# save_path location below. 

# os.chdir('/home/pablo/Kaggle/kaggle-train')

# preprocessed_image_path = "processed/"
preprocessed_image_path = "processed/"
save_train = False
save_test = True

print "current working directory = %s" % os.getcwd()

save_path = "./"

gl.set_runtime_config("GRAPHLAB_CACHE_FILE_LOCATIONS", os.path.expanduser("~/data/tmp/"))
#gl.set_runtime_config("GRAPHLAB_CACHE_FILE_LOCATIONS", "/media/pablo/OS/Users/Pablo/Downloads/Kaggle/graphlab-cache/")

print "loading images" # las siguientes sentencias tarda en Pablo's notebook 400 segs aprox.
# shuffle the training images1
X = gl.image_analysis.load_images(preprocessed_image_path)
X["is_train"] = X["path"].apply(lambda p: "train" in p)

# Add in all the relevant information in places
source_f = lambda p: re.search("run-(?P<source>[^/]+)", p).group("source")
X["source"] = X["path"].apply(source_f)

extract_name = lambda p: re.search("[0-9]+_(right|left)", p).group(0)
X["name"] = X["path"].apply(extract_name)

X_train = X[X["is_train"] == True]
Example #8
import random
from copy import copy
import os
import graphlab.aggregate as agg
import array

import sys

model_name = "pooling-2"
which_model = 0

print "Running model %d, %s" % (which_model, model_name)

alt_path = os.path.expanduser("~/data/tmp/")
if os.path.exists(alt_path):
    gl.set_runtime_config("GRAPHLAB_CACHE_FILE_LOCATIONS", alt_path)

model_path = "nn_256x256/models/model-%d-%s/" % (which_model, model_name)
model_filename = model_path + "nn_model" 

X_train = gl.SFrame("image-sframes/train-%d/" % which_model)
X_valid = gl.SFrame("image-sframes/validation-%d/" % which_model)
X_test = gl.SFrame("image-sframes/test/")

################################################################################

# init_random vs random_type in ConvolutionLayer. 

dll = gl.deeplearning.layers

nn = gl.deeplearning.NeuralNet()
Example #9
parser.add_argument('-i',
                    '--train',
                    help='Input training matrix',
                    required=True)
parser.add_argument('-t', '--test', help='test data matrix', required=True)
parser.add_argument('-d',
                    '--modeldir',
                    help='Directory to save the model',
                    default='svmmodel')
parser.add_argument('-r',
                    '--report',
                    help='report file',
                    default='report.txt')
args = parser.parse_args()

gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS', '/scratch')
test = gl.SFrame.read_csv(args.test, delimiter='\t', header=False)
train = gl.SFrame.read_csv(args.train, delimiter='\t', header=False)
file_report = open(args.report, 'w')

test.save('test_sframe')
train.save('train_sframe')

model = gl.svm_classifier.create(train, target='X1')
predictions = model.predict(test)
print predictions
#file_report.write(predictions)
results = model.evaluate(test)
print results
#file_report.write(results)
model.save(args.modeldir)
Example #10
def eval_model(model, test, col):
    '''Evaluate a trained model using Kaggle scoring.'''
    return log_loss_raw(test[col],
                        model.predict(test, output_type='probability'))


def log_loss_raw(target, predicted):
    '''Calculate log_loss between target and predicted and return.'''
    p = predicted.apply(lambda x: min(0.99999, max(1e-5, x)))
    logp = p.apply(lambda x: math.log(x))
    logmp = p.apply(lambda x: (math.log(1 - x)))
    return -(target * logp + (1 - target) * logmp).mean()


if __name__ == '__main__':
    gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS', '/home/mraza/tmp/')
    gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY',
                          200 * 1024 * 1024 * 1024)
    gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE',
                          100 * 1024 * 1024 * 1024)

    parser = argparse.ArgumentParser(
        description='Spectral Features Preprocessing')
    parser.add_argument('-cf',
                        '--clusteral_features',
                        help='Input File containing clusteral features',
                        required=True)
    parser.add_argument('-lf',
                        '--labels_file',
                        help='Ground Truth labels file',
                        required=True)
Example #11
import graphlab as gl
import re
import random
from copy import copy
import os
import graphlab.aggregate as agg
import array

import sys

# Change cache file directory to avoid overloading /var
my_graphlab_cache_file_locations = '/home/zak/tmp_graphlab'
gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS',my_graphlab_cache_file_locations)

# gl.set_runtime_config("GRAPHLAB_CACHE_FILE_LOCATIONS", os.path.expanduser("~/data/tmp/"))

base_path = os.getcwd()

model_path = base_path + "/nn_96x96/models/"

train_sf = []
test_sf = []
feature_names = []

for n in [0,1,2,3,4]:
    
    try: 
        Xf_train = gl.SFrame(model_path + "/scores_train_%d" % n)
        Xf_test = gl.SFrame(model_path + "/scores_test_%d" % n)

        train_sf.append(Xf_train)
Example #12
# processed/ directories -- where you ran the prep_image.sh.  It will
# put a image-sframes/ directory with train and test SFrames in the
# save_path location below.

# os.chdir('/home/pablo/Kaggle/kaggle-train')

# preprocessed_image_path = "processed/"
preprocessed_image_path = "processed/"
save_train = False
save_test = True

print "current working directory = %s" % os.getcwd()

save_path = "./"

gl.set_runtime_config("GRAPHLAB_CACHE_FILE_LOCATIONS",
                      os.path.expanduser("~/data/tmp/"))
#gl.set_runtime_config("GRAPHLAB_CACHE_FILE_LOCATIONS", "/media/pablo/OS/Users/Pablo/Downloads/Kaggle/graphlab-cache/")

print "loading images"  # las siguientes sentencias tarda en Pablo's notebook 400 segs aprox.
# shuffle the training images1
X = gl.image_analysis.load_images(preprocessed_image_path)
X["is_train"] = X["path"].apply(lambda p: "train" in p)

# Add in all the relevant information in places
source_f = lambda p: re.search("run-(?P<source>[^/]+)", p).group("source")
X["source"] = X["path"].apply(source_f)

extract_name = lambda p: re.search("[0-9]+_(right|left)", p).group(0)
X["name"] = X["path"].apply(extract_name)

X_train = X[X["is_train"] == True]
Example #13
import graphlab as gl
import loadgraph as load
import time

gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS', '/mnt/data/tmp')

dpath = '/mnt/data/'


def childs_update_fn(src, edge, dst):
    pdst = dst['childs']
    psrc = src['childs']
    for pid, d in psrc.iteritems():
        if pid not in pdst:
            pdst[pid] = d + 1
            dst['changed'] = True
        else:
            if pdst[pid] > d + 1:
                pdst[pid] = d + 1
                dst['changed'] = True
    dst['childs'] = pdst
    return (src, edge, dst)


def find_childs(g, maxn):
    start = time.time()
    num_changed = len(g.vertices)
    it = 0
    g.vertices['childs'] = g.vertices['__id'].apply(lambda x: {x: 0}
                                                    if x < maxn else {})
    while (num_changed > 0):
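        # The loop body is truncated. It presumably resets the change flags, runs
        # triple_apply with childs_update_fn, and counts how many vertices changed.
        # A sketch under those assumptions:
        g.vertices['changed'] = 0
        g = g.triple_apply(childs_update_fn, ['childs', 'changed'])
        num_changed = g.vertices['changed'].sum()
        it += 1
    print 'find_childs finished %d iterations in %f secs' % (it, time.time() - start)
    return g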
Example #14
import graphlab as gl
import loadgraph as load
import time

gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS', '/mnt/data/tmp')

dpath = '/mnt/data/'


def node_update_fn(src, edge, dst):
    src['out_edges'] += 1 
    dst['in_edges'] += 1
    return (src, edge, dst)
    
def find_stats(g):
    start = time.time()
    g.vertices['in_edges'] = 0
    g.vertices['out_edges'] = 0
    g = g.triple_apply(node_update_fn, ['in_edges', 'out_edges'])
    print 'Triple apply all finished in: %f secs' % (time.time() - start)
    return g


def cnt_update_fn(src, edge, dst):
    if dst['out_edges'] == dst['counter']:
        src['counter'] += 1
        src['parent-cnt'] += dst['parent-cnt'] + 1
    return (src, edge, dst)

def find_cnt(g):
    start = time.time()
Example #15
import random
from copy import copy
import os
import graphlab.aggregate as agg
import array
import numpy as np
import sys

# Run this script in the same directory as the

train_path = "image-sframes/train-%d/"
valid_path = "image-sframes/validation-%d/"

# Change cache file directory to avoid overloading /var
my_graphlab_cache_file_locations = '/home/zak/tmp_graphlab'
gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS',
                      my_graphlab_cache_file_locations)

X_data = gl.SFrame("image-sframes/train/")


def save_as_train_and_test(X, train_loc, valid_loc):

    # Can't just randomly sample the indices
    all_names = list(X["name"].unique())

    n_valid = (2 * len(all_names)) / 100

    random.shuffle(all_names)

    tr_names = gl.SArray(all_names[n_valid:])
    valid_names = gl.SArray(all_names[:n_valid])
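    # The function is truncated here. Splitting by name (so all rows sharing a name
    # land in the same split) would presumably use SFrame.filter_by and then save
    # each part -- a sketch:
    X.filter_by(tr_names, "name").save(train_loc)
    X.filter_by(valid_names, "name").save(valid_loc)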
Example #16
import numpy as np
import itertools
from collections import defaultdict
from datetime import datetime
import math
import sys
import os
from scipy.stats import percentileofscore
import graphlab as gl
import graphlab.aggregate as agg
gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS', '/home/mraza/tmp/')
gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 48)

# X1,X2,X3,X4,X5,X6,X7
# 2015-10-01 12:08:41,1046885725705,1046910448494,GSM,ITN006,,1.5
# 2015-10-01 16:55:32,1046885725705,1046910448494,GSM,ITN010,,1.5
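# Given rows like the samples above, loading with header=False reproduces the
# X1..X7 column names listed in the first comment line. A sketch (the file name
# is hypothetical):
cdr = gl.SFrame.read_csv('cdr_records.csv', header=False)
cdr['X1'] = cdr['X1'].str_to_datetime('%Y-%m-%d %H:%M:%S')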


def distance(l1_lat, l1_lng, l2_lat, l2_lng):
    R = 6371
    # Radius of the earth in km
    d = 0.0
    try:
        l1_lat, l1_lng, l2_lat, l2_lng = float(l1_lat), float(l1_lng), float(
            l2_lat), float(l2_lng)
    except (TypeError, ValueError):
        l1_lat, l1_lng, l2_lat, l2_lng = 0.0, 0.0, 0.0, 0.0
    dLat = (l1_lat - l2_lat) * math.pi / 180
    dLon = (l1_lng - l2_lng) * math.pi / 180
    a = math.sin(dLat / 2) * math.sin(dLat / 2) + math.cos(
        (l1_lat) * math.pi / 180) * math.cos(
Example #17
    tic = time.time()

    if is_reachable:
        print("Vertex {} is reachable from vertex {} - Distance: {}".format(target_vertex, source_vertex, int(distance)))
    else:
        print("Vertex {} cannot be reached from vertex {} - Distance: {}".format(target_vertex, source_vertex, int(distance)))

    return "Total runtime: {} seconds".format(tic-toc)

if __name__ == '__main__':

    if len(sys.argv) == 1:
        print("Please add number of workers, dataset path, source vertex, target vertex, and max recursion depth as arguments when loading script")
        sys.exit()
    else:
        workers = int(sys.argv[1])
        path = sys.argv[2]
        source_vertex = long(sys.argv[3])
        target_vertex = long(sys.argv[4])
        # max_depth = int(sys.argv[5])
        # assert max_depth >= 1
    
    # Configure GraphLab to utilize a specific number of workers (cores)
    gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', workers)

    r_job = gl.deploy.job.create(run_reachability_job, path_to_file = path, source_vertex=source_vertex, target_vertex=target_vertex)

    # Collect job status, result, and metrics
    print("Job status\n{}".format(r_job.get_status())) 
    print("Job results\n{}".format(r_job.get_results())) 
    print("Job metrics\n{}".format(r_job.get_metrics())) 
Example #18
# coding: utf-8

# # Predicting sentiment from product reviews
# 
# # Fire up GraphLab Create
# (See [Getting Started with SFrames](/notebooks/Week%201/Getting%20Started%20with%20SFrames.ipynb) for setup instructions)

# In[ ]:

import graphlab


# In[ ]:

# Limit number of worker processes. This preserves system memory, which prevents hosted notebooks from crashing.
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 4)


# # Read some product review data
# 
# Loading reviews for a set of baby products. 

# In[ ]:

products = graphlab.SFrame('amazon_baby.gl/')


# # Let's explore this data together
# 
# Data includes the product name, the review text and the rating of the review. 
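# The notebook is truncated here; exploring the columns described above would
# presumably start with something like:

# In[ ]:

products.head()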
Example #19
import graphlab as gl
from graphlab.toolkits._main import ToolkitError
import numpy as np
import time
import datetime

gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 8)
gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_GRAPH_LAMBDA_WORKERS', 8)
#gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY',100000000000)
#gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE',100000000000)

raw_docs = gl.SArray("LDA_vectors/")
vocab_idx = {}
for line in open('vocab_idx'):
    line = line.strip().split('\t')
    vocab_idx[int(line[1])] = line[0]

def formatter(row):
    row = eval(row)
    row = dict(zip([vocab_idx[term] for term in row[1][1]],row[1][2]))
    return row

docs = raw_docs.filter(lambda x: x!="").apply(formatter)
docs.save("doc_array",format='binary')

docs = gl.SArray("doc_array")

train,test = gl.SFrame(docs).random_split(0.9,seed=99)
train = gl.text_analytics.tf_idf(train['X1'])
test = gl.text_analytics.tf_idf(test['X1'])
#data = {'tfidf':(train_tfidf,test_tfidf),'tf':(train,test)}
Example #20
import numpy
from graphlab import feature_engineering as fe
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

from util import Util
from evals import Eval
from spectral_training import SpectralTraining
DEBUG=1




if __name__=='__main__':
	gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS','/home/mraza/tmp/')	
	gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY', 200*1024*1024*1024)
	gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE', 100*1024*1024*1024)
	gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 20)

	parser=argparse.ArgumentParser(description='Spectral Features Preprocessing')
	parser.add_argument('-cf','--clusteral_features',help='Input File containing clusteral features', required=True)
	parser.add_argument('-lf','--labels_file', help='Ground Truth labels file', required=True)
	parser.add_argument('-of','--output_file', help='Output file', required=True)
	parser.add_argument('-cfk','--clusteral_key_column', required=True)
	parser.add_argument('-lfk','--labels_key_column', required=True)
	parser.add_argument('-lfv','--labels_value_column', required=True)
	parser.add_argument('-i','--interaction', required=True)
	parser.add_argument('-j','--join_type', required=False)
	parser.add_argument('-e','--encode', required=False)
	parser.add_argument('-ex','--exclude', required=False)
Example #21
# coding: utf-8

# # Fire up GraphLab Create
#
# We always start with this line before using any part of GraphLab Create. It can take up to 30 seconds to load the GraphLab library - be patient!
#
# The first time you use GraphLab Create, you must enter a product key to license the software for non-commercial academic use. To register for a free one-year academic license and obtain your key, go to [dato.com](https://dato.com/download/academic.html).

# In[2]:

import graphlab
# Set product key on this computer. After running this cell, you will not need to re-enter your product key.
graphlab.product_key.set_product_key('C7E4-BB1D-0150-A1E6-645C-66D9-D454-CC8D')

# Limit number of worker processes. This preserves system memory, which prevents hosted notebooks from crashing.
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 4)

# Output active product key.
graphlab.product_key.get_product_key()

# # Load a tabular data set

# In[3]:

sf = graphlab.SFrame('people-example.csv')

# # SFrame basics

# In[6]:

sf.head()  # we can view first few lines of table
Example #22
import csv
import sys
import math
import numpy
from graphlab import feature_engineering as fe
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

from util import Util
from evals import Eval
from spectral_training import SpectralTraining
DEBUG = 1

if __name__ == '__main__':
    gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS', '/home/mraza/tmp/')
    gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY',
                          200 * 1024 * 1024 * 1024)
    gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE',
                          100 * 1024 * 1024 * 1024)
    gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 20)

    parser = argparse.ArgumentParser(
        description='Spectral Features Preprocessing')
    parser.add_argument('-cf',
                        '--clusteral_features',
                        help='Input File containing clusteral features',
                        required=True)
    parser.add_argument('-lf',
                        '--labels_file',
                        help='Ground Truth labels file',
Example #23
#!/usr/bin/env python

import graphlab as gl
from sklearn.metrics import precision_recall_curve
import numpy as np
#import matplotlib.pyplot as plt
import argparse
import glob

gl.set_runtime_config('GRAPHLAB_CACHE_FILE_LOCATIONS','/scratch')
gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY',40000000000)
gl.set_runtime_config('GRAPHLAB_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE', 40000000000)
gl.set_runtime_config('GRAPHLAB_SFRAME_SORT_BUFFER_SIZE',40000000000)

parser = argparse.ArgumentParser(description='A script to get P-R curve for a model')
parser.add_argument('-d','--directory', help='directory to test', required=True )
parser.add_argument('-t','--test', help = 'test data matrix', required=True)
#parser.add_argument('-r,','--report',help= 'data for PR curve', required = True )
#parser.add_argument('-f,','--figure',help= 'figure name for plotting', required = True )

args = parser.parse_args()


#test =  gl.SFrame.read_csv('/global/projectb/scratch/arrivers/geneleanrntest/20150818/test.twoclass.txt', delimiter='\t', header=False)
#train =  gl.SFrame.read_csv('/global/projectb/scratch/arrivers/geneleanrntest/20150818/train.twoclass.txt', delimiter='\t', header=False)
#test.save('test_twoclass_sframe')
#train.save('train_twoclass_sframe')


test =  gl.SFrame.read_csv(args.test, delimiter='\t', header=False)
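# The snippet is truncated here. Judging from the imports, the remainder presumably
# loads a saved model from --directory and computes a precision-recall curve. A sketch;
# the probabilistic output type and the 'X1' target column are assumptions.
model = gl.load_model(args.directory)
probs = model.predict(test, output_type='probability')
precision, recall, thresholds = precision_recall_curve(
    np.array(list(test['X1'])), np.array(list(probs)))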
Example #24
    #rmses_cf = run_cf(min_lambduh, min_k, min_lambduh_w)
    #rmses_cf2 = run_cf2(min_lambduh, min_k, min_lambduh_w)


# In[64]:

def main(argv):
    # pylint: disable=W0612
    try:
        argv = FLAGS(argv)  # parse flags
    except gflags.FlagsError, e:
        print '%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS)
        sys.exit(1)

    gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_GRAPH_LAMBDA_WORKERS', 16)
    for flag_name in sorted(FLAGS.RegisteredFlags()):
        if flag_name not in ["?", "help", "helpshort", "helpxml"]:
            fl = FLAGS.FlagDict()[flag_name]
            with open('output/main.out', 'a') as f:
                f.write(
                    "# " + fl.help + " (" + flag_name + "): " + str(fl.value) + '\n')

    X_train, X_test = load(FLAGS.dataset)
    g = get_graph(X_train, FLAGS.rank)

    rmse_train, rmse_test, L, R, wu, wm, bu, bm = \
        sgd_gl_edge(g, X_train, X_test,
                    FLAGS.lamb, FLAGS.rank, FLAGS.eta, Niter=FLAGS.maxit,
                    unified=FLAGS.unified, lambduh_w=FLAGS.lamb_w, output="main")