def showPath(self, highlight=None):
        # with open(self.verticesFn,'a') as Vwr:
        #     with open(self.edgesFn,'a') as Ewr:
        #         for i in range(8):
        #             Vwr.write('\nc0_' + `i` + ', ')
        #             Ewr.write('\np8_0_t,' + 'c0_' + `i` + ',c')
        #             highlight['c0_'+`i`] = [0.69, 0.0, 0.498]
        # start = datetime.datetime.now()
        edge_data = SFrame.read_csv(self.edgesFn)
        vertex_data = SFrame.read_csv(self.verticesFn)
        g = SGraph(vertices=vertex_data,
                   edges=edge_data,
                   vid_field='name',
                   src_field='src',
                   dst_field='dst')
        # end = datetime.datetime.now()
        # print (end - start)
        # g.show(vlabel='attributes', elabel='relation', h_offset=0.3,v_offset=-0.025, highlight=highlight, arrows=True)  # highLight

        g.show(vlabel='attributes',
               vlabel_hover=False,
               elabel='relation',
               highlight=highlight,
               arrows=True)  # highLight
        sleep(20)
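These snippets use GraphLab Create's Python 2 API. A minimal preamble for running them, assuming the graphlab package is installed, would look like this:

# Common imports assumed by the snippets on this page (GraphLab Create, Python 2)
import datetime
import json
from time import sleep

import graphlab
from graphlab import SFrame, SArray, SGraph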
Example no. 2
def create_frame_from_file(file_name):
    n_total_lines = 220000
    sf = SFrame()
    with open(file_name) as data:
        dt = []
        ip = []
        py = []
        script = []
        id = []
        for i, line in enumerate(data):
            jo = json.loads(line)
            dt += jo['dt']
            ip += jo['ip']
            py += jo['py']
            id += [i]
            script += jo['user_script']

            if i % 100 == 0:
                print float(i) / n_total_lines

        sf = sf.add_column(SArray(id), name='id')
        sf.add_column(SArray(dt), name='dt')
        sf.add_column(SArray(ip), name='ip')
        sf.add_column(SArray(py, dtype=str), name='py')
        sf.add_column(SArray(script), name='user_script')

        sf.save('python_tutor')
    return sf
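The += concatenations above rely on each JSON value being a one-element list, so the input is expected to be one JSON object per line in that shape. A sketch with a hypothetical line and file name:

# Hypothetical input line:
# {"dt": ["2014-01-01 10:00:00"], "ip": ["1.2.3.4"], "py": [2], "user_script": ["print 1"]}
sf = create_frame_from_file('python_tutor_log.jsonl')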
Example no. 3
 def get_rating_sf(self, samples, save_to=None):
     sf = SFrame(self.ratings.ix[samples])
     sf['userId'] = sf['userId'].apply(lambda uid: self.user_dict[uid])
     sf['movieId'] = sf['movieId'].apply(lambda mid: self.movie_dict[mid])
     if save_to is not None:
         print "saving sframe to", save_to
         sf.save(save_to)
     return sf
Example no. 4
 def get_rating_sf(self, samples, save_to=None):
     sf = SFrame(self.ratings.ix[samples])
     sf['userId'] = sf['userId'].apply(lambda uid: self.user_dict[uid])
     sf['movieId'] = sf['movieId'].apply(lambda mid: self.movie_dict[mid])
     if save_to is not None:
         print "saving sframe to", save_to
         sf.save(save_to)
     return sf
Example no. 5
 def showPath(self, highlight=None):
     edge_data = SFrame.read_csv(self.edgesFn)
     vertex_data = SFrame.read_csv(self.verticesFn)
     g = SGraph(vertices=vertex_data,
                edges=edge_data,
                vid_field='name',
                src_field='src',
                dst_field='dst')
     g.show(vlabel='id',
            elabel='relation',
            highlight=highlight,
            arrows=True)  # highLight
     sleep(10)
Example no. 6
 def showPath(self, highlight=None):
     start = datetime.datetime.now()
     edge_data = SFrame.read_csv(self.edgesFn)
     vertex_data = SFrame.read_csv(self.verticesFn)
     g = SGraph(vertices=vertex_data,
                edges=edge_data,
                vid_field='name',
                src_field='src',
                dst_field='dst')
     end = datetime.datetime.now()
     print(end - start)
     # g.show(vlabel='attributes', elabel='relation', highlight=highlight, arrows=True)  # highLight
     # sleep(40)
Example no. 7
    def download(symbol, start_date, end_date):
        stock = Share(symbol)
        # ^GSPC is the Yahoo Finance symbol for the S&P 500 index
        # we gather historical quotes between start_date and end_date
        hist_quotes = stock.get_historical(start_date, end_date)
        l_date = []
        l_open = []
        l_high = []
        l_low = []
        l_close = []
        l_volume = []
        # reverse the list
        hist_quotes.reverse()
        for quotes in hist_quotes:
            l_date.append(quotes['Date'])
            l_open.append(float(quotes['Open']))
            l_high.append(float(quotes['High']))
            l_low.append(float(quotes['Low']))
            l_close.append(float(quotes['Adj_Close']))
            l_volume.append(int(quotes['Volume']))

        sf = SFrame({
            'datetime': l_date,
            'open': l_open,
            'high': l_high,
            'low': l_low,
            'close': l_close,
            'volume': l_volume
        })
        # datetime is a string, so convert into datetime object
        sf['datetime'] = sf['datetime'].apply(
            lambda x: datetime.strptime(x, '%Y-%m-%d'))
        return sf
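A usage sketch, treating download as directly callable (it takes no self) and assuming the yahoo_finance package provides Share; the symbol follows the comment above, and the dates are illustrative:

from yahoo_finance import Share  # assumed dependency

sp500 = download('^GSPC', '2001-01-01', '2016-01-01')
print(sp500.head())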
Example no. 8
    def __init__(self):
        rec_data = "../data/train.csv"
        data = ProcessData()
        df_rec = data.get_data(rec_data)
        df_rec = data.clean_data(df_rec)
        df_rec = df_rec[df_rec.record_type == 1]
        sf = SFrame(data=df_rec)
        del df_rec  # memory optimization

        self.modelA = recommender.create(sf,
                                         user_column="customer_ID",
                                         item_column="A")
        self.modelB = recommender.create(sf,
                                         user_column="customer_ID",
                                         item_column="B")
        self.modelC = recommender.create(sf,
                                         user_column="customer_ID",
                                         item_column="C")
        self.modelD = recommender.create(sf,
                                         user_column="customer_ID",
                                         item_column="D")
        self.modelE = recommender.create(sf,
                                         user_column="customer_ID",
                                         item_column="E")
        self.modelF = recommender.create(sf,
                                         user_column="customer_ID",
                                         item_column="F")
Example no. 9
def resize_images(filename):
    images = graphlab.image_analysis.load_images(filename,
                                                 format='auto',
                                                 with_path=False,
                                                 recursive=False,
                                                 ignore_failure=True,
                                                 random_order=True)
    # firstImages = images[0:9]['image']
    resized = graphlab.image_analysis.resize(images['image'],
                                             32,
                                             32,
                                             channels=4,
                                             decode=True)
    frame = SFrame({'image': resized})
    frame.save('mini')
Example no. 10
 def load_graph(self,
                graph_path,
                direction=1,
                start_line=0,
                limit=None,
                blacklist=set(),
                delimiter=','):
     json_object = utils.is_json(graph_path)
     if json_object is not False:
         # print json_object
         graph_path = SFrame(SArray(json_object).unpack())
         graph_path.rename({'X.0': 'X1', 'X.1': 'X2', 'X.2': 'Weight'})
     else:
         # load_sgraph()
         graph_path = SFrame.read_csv(graph_path,
                                      delimiter=delimiter,
                                      header=False,
                                      column_type_hints={
                                          'X1': str,
                                          'X2': str
                                      },
                                      nrows=limit,
                                      skiprows=start_line)
         if self._weight_field != "":
             graph_path.rename({'X3': 'Weight'})
     # print graph_data
     self._graph = self._graph.add_edges(graph_path,
                                         src_field='X1',
                                         dst_field='X2')
     if not self.is_directed:
         self.to_undirected()
Example no. 11
def query_model(dogo, model, images):
    neighbours = get_images_from_ids(model.query(dogo, k=20), images)

    image_list = SFrame(data=None)

    shown_dogs = {dogo['images'][0][0]}

    for i in range(0, len(neighbours)):
        if len(shown_dogs) < 6:
            if neighbours[i]['images'][0] not in shown_dogs:
                # neighbours[i]['image'].show()
                dogo_clone = neighbours[i:i + 1].copy()
                image_list = image_list.append(SFrame(dogo_clone))
                shown_dogs.add(neighbours[i]['images'][0])
        else:
            break

    return image_list
Example no. 12
def CC():
    url = '/home/gengl/Datasets/CC/BerkStan/edge.txt'
    data = SFrame.read_csv(url,
                           delimiter='\t',
                           header=False,
                           column_type_hints=[int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    cc_model = connected_components.create(graph, verbose=True)
    cc_model.summary()
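A hedged follow-up, assuming the component_id field that the connected components model exposes (it is also used in Example no. 42 below):

cc_ids = cc_model['component_id']  # SFrame with one component id per vertex
print(cc_ids.groupby('component_id', {'size': graphlab.aggregate.COUNT}))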
Example no. 13
def SSSP():
    url = '/home/gengl/Datasets/SSSP/BerkStan/edge.txt'
    data = SFrame.read_csv(url,
                           delimiter='\t',
                           header=False,
                           column_type_hints=[int, int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    sp_model = shortest_path.create(graph, source_vid=0, weight_field='X3')
    sp_model.summary()
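A hedged follow-up, assuming the distance field and the get_path method of the shortest-path model (get_path is also used in Example no. 22 below):

print(sp_model['distance'].head())  # per-vertex distance from the source
print(sp_model.get_path(42))        # hop-by-hop path to vertex 42 (illustrative id)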
Example no. 14
def process_frame(filename):
    sf = gl.load_sframe(filename)
    
    output_frame = SFrame()
    
    #Setup our output frame
    id = []
    ip = []
    sub_count = []
    error_count = []
    time_count = []
    error_sequence_raw = []
    error_sequence = []
    
    #How many session IDs do we have?
    sa = sf['session_id']
    test = sa.unique()

    limit = len(test)
    
    #Start grabbing each session
    for i in range(1,limit):
        
        #Test output
        if (i % 100 == 0):   
            break 
        
        #Get the session and sort it by the date time
        session_frame = sf.filter_by(i, "session_id")
        #sorted_session = session_frame.sort("dt")

        row = session_frame[0]  #first row of this session, not of the whole frame

        id += [i]
        ip += [row['ip']]
        sub_count += [session_frame.num_rows()]  #number of submissions in the session
        #time_count += [fn_time_count(sorted_session)]
        #error_count += [fn_error_count(sorted_session)]
        #error_sequence_raw += [fn_error_sequence_raw(sorted_session)]
    
    print len(id)
    print len(ip)
    print len(sub_count)
    #print len(time_count)
    
    output_frame = output_frame.add_column(SArray(id), name='id')
    output_frame.add_column(SArray(ip), name='ip')
    output_frame.add_column(SArray(sub_count),name='sub_count')
    #output_frame.add_column(SArray(time_count),name='sub_length')
    #output_frame.add_column(SArray(error_count),name='error_count')
    #output_frame.add_column(SArray(error_sequence_raw,dtype=str),name='err_seq_raw')

    output_frame.save('py2_session_analysis')
Example no. 15
 def showPath(self, highlight=None):
     # start = datetime.datetime.now()
     edge_data = SFrame.read_csv(self.edgesFn)
     vertex_data = SFrame.read_csv(self.verticesFn)
     g = SGraph(vertices=vertex_data,
                edges=edge_data,
                vid_field='name',
                src_field='src',
                dst_field='dst')
     # end = datetime.datetime.now()
     # print (end - start)
     # g.show(vlabel='attributes',vlabel_hover=True, elabel='relation', h_offset=0.3,v_offset=-0.025, highlight=highlight, arrows=True)  # highLight
     g.show(vlabel='id',
            elabel='relation',
            vlabel_hover=True,
            highlight=highlight,
            arrows=True)  # highLight
     sleep(30)
Example no. 16
def get_vertices(k, factor0=1):
    return SFrame({
        # Movies
        '__id': mids,
        'factors': map(lambda _: rand(k) * factor0, movie_ids),
        'w': map(lambda _: np.zeros(ng + nht), movie_ids),
        'b': map(lambda _: 0, movie_ids),
        'features': movies_features,
        'user': map(lambda _: 0, movie_ids)
    }).append(
        SFrame({
            # User
            '__id': uids,
            'factors': map(lambda _: rand(k) * factor0, user_ids),
            'w': map(lambda _: np.zeros(ng + nht), user_ids),
            'b': map(lambda _: 0, user_ids),
            'features': map(lambda _: {}, user_ids),
            'user': map(lambda _: 1, user_ids)
        }))
Example no. 17
def PageRank():
    url = '/clueweb/PageRank/clueweb_20M/edge_pair.txt'
    data = SFrame.read_csv(url,
                           delimiter='\t',
                           header=False,
                           column_type_hints=[int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    pr_model = pagerank.create(graph,
                               reset_probability=0.2,
                               threshold=0.000000000001,
                               max_iterations=42,
                               _distributed=True)
    pr_model.summary()
Example no. 18
def PageRank():
    url = '/home/gengl/Datasets/PageRank/BerkStan/edge.txt'
    data = SFrame.read_csv(url,
                           delimiter='\t',
                           header=False,
                           column_type_hints=[int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    pr_model = pagerank.create(graph,
                               reset_probability=0.2,
                               threshold=0.0001,
                               max_iterations=1000,
                               _distributed=True)
    pr_model.summary()
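Either model exposes its ranks as an SFrame; a sketch for listing the top pages, assuming the pagerank model field and SFrame.topk:

ranks = pr_model['pagerank']  # SFrame with one rank per vertex
print(ranks.topk('pagerank', k=10))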
Example no. 19
def fn_time_count(frame):
    sa = frame['dt']

    num = frame.num_rows()
    
    fmt = '%Y-%m-%d %H:%M:%S'
    
    #Convert to date time objects
    start_dt = datetime.strptime(sa[0],fmt)
    end_dt = datetime.strptime(sa[num - 1],fmt)
    
    #Convert to Unix time stamps
    start_timestamp = time.mktime(start_dt.timetuple())
    end_timestamp = time.mktime(end_dt.timetuple())
    
    #Calculate the difference in minutes
    difference = (end_timestamp - start_timestamp) / 60
    
    return difference
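The timetuple round-trip is equivalent to subtracting the two datetime objects directly; the same calculation in one line (Python 2.7+):

difference = (end_dt - start_dt).total_seconds() / 60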
Example no. 20
def train_model(filename):
    # load already prepared data in form of an SFrame
    image_train = graphlab.SFrame(filename)
    # load the pre-trained model
    loaded_model = graphlab.load_model('model/')
    # extract features of the model on the given pictures
    image_train['deep_features'] = loaded_model.extract_features(image_train)
    # add ids to the SFrame to be able to find the closest images
    ids = SArray(list(range(0,len(image_train))))
    image_train.add_column(ids, name='id')
    # print image_train.head()
    # train the NN model on the extracted features
    knn_model = graphlab.nearest_neighbors.create(image_train, features=['deep_features'], label='id')
    return knn_model, image_train
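A usage sketch with an illustrative path; query is the same nearest-neighbors call used in Example no. 11 above:

knn_model, image_train = train_model('image_train_data/')
print(knn_model.query(image_train[0:1], k=5))  # the five closest images to the first one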
Example no. 21
def fit_model(play_matrix):
    """Take matrix of play counts and return GraphLab recommender model
    based on these data
    INPUT: play_matrix (numpy array): Array of features to fit to
    OUTPUT: mod (GraphLab model): GraphLab matrix factorisation recommender
    model
    """
    plays_df = pd.DataFrame(play_matrix)
    plays_sf = SFrame(plays_df)
    agg_sf = plays_sf.groupby('user_id',
                  operations={'mean_plays': agg.MEAN('play_count'),
                              'sd_plays': agg.STDV('play_count'),
                              'play_quantile': agg.QUANTILE('play_count',
                                               [0.2, 0.4, 0.6, 0.8, 1])})
    plays_sf = plays_sf.join(agg_sf, on='user_id', how='inner')
    play_quantiles = np.array(plays_sf['play_quantile'])
    play_counts = np.array(plays_sf['play_count'])
    play_counts = play_counts.reshape(play_counts.shape[0], 1)
    plays_sf['rating'] = np.sum(play_counts <= play_quantiles, axis=1)
    mod = graphlab.recommender.create(plays_sf, user_id='user_id',
                                  item_id='song_title', target='rating',
                                  ranking=True)
    return mod
Example no. 22
def SSSP():
    url = '/home/gengl/Datasets/SSSP/Google/edge.txt'
    data = SFrame.read_csv(url,
                           delimiter='\t',
                           header=False,
                           column_type_hints=[int, int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    sp_model = shortest_path.create(graph, source_vid=0, weight_field='X3')
    sp_model.summary()
    with open('/home/gengl/sssp_graphlab', 'w') as fo:
        for vid in range(0, 875713):
            try:
                result_pair = sp_model.get_path(vid)
                fo.write(str(result_pair[-1]) + '\n')
            except Exception:
                # skip vertices that are unreachable from the source
                pass
Example no. 23
def graph_lab(url, format = 'auto', flip_img = False, zoom = False):
	"""Extracts the graphlab features"""
	if format == 'auto':
		extension = url.split('.')[-1]  # note: extension is computed but never used

	img = preprocess(url)
	if flip_img:
		img = flip(img)
	if zoom:
		img = middle(img)

	h,w,_ = img.shape
	img_bytes = bytearray(img)
	image_data_size = len(img_bytes)
	img = graphlab.Image(_image_data=img_bytes, _width=w, _height=h, _channels=3, _format_enum=2, _image_data_size=image_data_size)

	return SFrame({'image': [img]})
Example no. 24
def test_graphlab(num_factors=10, reg=0.01, niter=50):
    ''' test the graphlab install '''
    url = 'http://s3.amazonaws.com/GraphLab-Datasets/movie_ratings/training_data.csv'
    data = SFrame(url)
    mfac = recommender.matrix_factorization.create(data,
                                                   'user',
                                                   'movie',
                                                   'rating',
                                                   num_factors,
                                                   reg=reg,
                                                   nmf=True,
                                                   use_bias=True,
                                                   holdout_probability=0.2,
                                                   niter=niter,
                                                   random_seed=42)
    mfac.summary()
    return mfac
Example no. 25
def test_graphlab2(num_factors=10, reg=0.1, niter=50):
    ''' test the graphlab install with our data'''
    infile = PARS['data_dir'] + 'subset_partial.csv'
    data = SFrame(infile)
    mfac = recommender.matrix_factorization.create(data,
                                                   'id',
                                                   'brand',
                                                   'purchasequantity',
                                                   num_factors,
                                                   reg=reg,
                                                   nmf=True,
                                                   use_bias=True,
                                                   holdout_probability=0.2,
                                                   niter=niter,
                                                   random_seed=42)
    mfac.summary()
    return mfac
Example no. 26
def append_images(json_file):

    # we fill an SFrame with all the given metadata of the dogs
    meta = SFrame.read_json(json_file, orient='records')
    # this is the SFrame that we will fill with the data plus the image, which will be saved in the final file
    image_list = SFrame(data=None)
    # for each image in the images column in the meta SFrame, we add one line in the final SF with one image per line
    for i in range(0, len(meta)):
        dogo = meta[i:i + 1]
        for image in dogo['images'][0]:
            # print image
            dogo_clone = dogo.copy()
            dogo_clone.add_column(SArray([graphlab.Image(images_path + image)]),
                                  name='image')
            dogo_clone.add_column(SArray([image]), name='image_filename')
            image_list = image_list.append(SFrame(dogo_clone))

    image_list.save(filename='prepared_data/')
Example no. 27
    def data_frame_with_target(self, data_frame):
        """

        :param data_frame:
        :type data_frame: DataFrame
        :return:
        :rtype: SFrame
        """
        data_sframe = SFrame(data_frame.toPandas())
        sentiment_array = data_sframe.select_column('sentiment')
        target_array = []
        for x in sentiment_array:
            try:
                target_array.append(self.convert_func(x))
            except Exception as ex:
                print len(target_array), 'get_sentiments', x
                target_array.append(3)
                print ex

        data_sframe.add_column(SArray(target_array, dtype=int), name='target')
        print data_sframe
        return data_sframe.dropna()
Example no. 28
import graphlab as gl
import datetime

# Create cluster
c = gl.deploy.hadoop_cluster.create(
    name='test-cluster',
    dato_dist_path='hdfs://ec2-54-215-136-187.us-west-1.compute.amazonaws.com:9000/dato/tmp',
    hadoop_conf_dir='/usr/local/hadoop/etc/hadoop',
    num_containers=3)
print c

from graphlab import SFrame, SGraph
url = 'hdfs://ec2-54-215-136-187.us-west-1.compute.amazonaws.com:9000/data/pokec.txt'
data = SFrame.read_csv(url, delimiter='\t',header=False)
g = SGraph().add_edges(data, src_field='X2', dst_field='X1')


# triangle counting
from graphlab import triangle_counting
tc = triangle_counting.create(g)
tc_out = tc['triangle_count']


# pagerank (timed)
from graphlab import pagerank
start = datetime.datetime.now()
pr = pagerank.create(g, threshold=0.001)
print(datetime.datetime.now() - start)


# Connected Components (timed)
from graphlab import connected_components
start = datetime.datetime.now()
cc = connected_components.create(g)
print(datetime.datetime.now() - start)
Example no. 29
#g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name',
#           src_field='src', dst_field='dst')

#targets = ['James Bond', 'Moneypenny']
#subgraph = g.get_neighborhood(ids=targets, radius=1, full_subgraph=True)
#subgraph.show(vlabel='id', highlight=['James Bond', 'Moneypenny'], arrows=True)

#from graphlab import SGraph, Vertex, Edge

#g = SGraph()
#verts = [Vertex(0, attr={'breed': 'labrador'}),
#         Vertex(1, attr={'breed': 'labrador'}),
#         Vertex(2, attr={'breed': 'vizsla'})]

#g = g.add_vertices(verts)
#g = g.add_edges(Edge(1, 2))

#print g

from graphlab import SFrame, SGraph
edge_data = SFrame.read_csv('http://s3.amazonaws.com/dato-datasets/bond/bond_edges.csv')
vertex_data = SFrame.read_csv('http://s3.amazonaws.com/dato-datasets/bond/bond_vertices.csv')

g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name',
           src_field='src', dst_field='dst')
#print g

g.show()


Example no. 30
def create_sessions(sf=SFrame()):
    assert isinstance(sf, SFrame)
    ip = []
    user_script = []
    err_msg = []
    compile_err = []
    of_interest = []
    ignored = 0

    for i in xrange(len(sf)):
        count = sf['count'][i]
        if count != len(sf['id'][i]):
            ignored += 1
            continue

        tip = sf['ip'][i]
        chunk_user_script = cut_dict_by_dt(sf['user_script'][i])
        user_script += chunk_user_script
        ip += [tip,] * len(chunk_user_script)

        err_msg += cut_dict_by_dt(sf['err_msg'][i])

        chunk_compile_err = cut_dict_by_dt(sf['compile_err'][i])
        compile_err += chunk_compile_err

        of_interest += set_of_interest_bit(chunk_compile_err)

    print "DEBUG:", "ignored:", ignored
    rst = SFrame()
    rst.add_column(SArray(ip, dtype=str), name='ip')
    rst.add_column(SArray(user_script, dtype=dict), name='user_script')
    rst.add_column(SArray(err_msg, dtype=dict), name='err_msg')
    rst.add_column(SArray(compile_err, dtype=dict), name='compile_err')
    rst.add_column(SArray(of_interest, dtype=int), name='of_interest')

    return rst
Example no. 31
def main():
    with open('../../Data/data_file_modified.txt') as data:
        sf = SFrame()
        
        # Data model format
        # RecordID | Date/Time | IP Address | Python Version |
        # User Script | Compile Flag | Compile Message
        id = []
        dt = []
        ip = []
        py = []
        script = []
        error = []       
        error_msg = [] 
        
        for i, line in enumerate(data):
            jo = json.loads(line)
            
            # Two different versions of Python scripts
            # need to be compiled on different interpreters
            if(jo['py'][0] == 3):
            
                # Setup the data model we're using
                id += [i]
                dt += jo['dt']
                ip += jo['ip']
                py += jo['py']            
                script += jo['user_script']  
                
                # Run the script on the compile method
                # and obtain any error message
                flag = False
                msg = ""
                
                pattern = "is local and global"
                
                try:
                    compile(jo['user_script'][0],'<string>','exec')
                except SyntaxError, e:
                    if(re.search(pattern, str(e))):
                        msg = "Variable is Local and Global"
                    else:
                        msg = str(e)
                    flag = True
                
                
                if(flag):
                    error += [1]
                else:
                    error += [0] 
                
                # We need to chop off the error type
                # and remove the (filename line number)
                # to have any meaning here.
                fix_msg = msg.partition('(')[0]
                error_msg += [fix_msg.strip()]        
       
        sf = sf.add_column(SArray(id), name='id')
        sf.add_column(SArray(dt), name='dt')
        sf.add_column(SArray(ip), name='ip')
        sf.add_column(SArray(py, dtype=str), name='py')
        sf.add_column(SArray(script), name='user_script')
        sf.add_column(SArray(error), name='compile_err')
        sf.add_column(SArray(error_msg), name='err_msg')

        sf.save('py3_error_frame_clean')
Example no. 32
verbose = False
vertexFiles = [
    "City", "Country", "Region", "Advisor", "Category", "Founder",
    "FundingRound", "HQ", "keywords", "Member", "Office", "organizations",
    "PrimaryImage", "TeamMember", "Website", "companies_acquired_by_sap"
]
edgesFiles = [
    "GeoInformation", "acquisitions", "categories_keywords_edges",
    "investments", "keywords_descriptions_edges", "keywords_webpages_edges",
    "relationships", "companies_acquired_by_sap_edges"
]
g = SGraph()

for f in vertexFiles:
    content = SFrame.read_csv(path + f + '.csv',
                              na_values='null',
                              verbose=verbose)
    if 'path' in content.column_names():
        g = g.add_vertices(content, vid_field='path')
    elif 'url' in content.column_names():
        g = g.add_vertices(content, vid_field='url')
    else:
        print "Unknown vid field: ", content.column_names()
        sys.exit()

for f in edgesFiles:
    content = SFrame.read_csv(path + f + '.csv',
                              na_values='null',
                              verbose=verbose)
    if 'src' in content.column_names() and 'dst' in content.column_names():
        g = g.add_edges(content, src_field='src', dst_field='dst')
Example no. 33
# -------

import pandas as pd
from graphlab import SFrame
from graphlab import popularity_recommender as pr


# -----------
# Data Import
# -----------

col_names = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_table('u.data', names=col_names)
data = data.drop('timestamp', axis=1)

# ----------
# Data Split
# ----------

sf = SFrame(data=data, format='auto')
train, test = sf.random_split(.7)
print(len(train))
print(len(test))

# ----------------------
# Popularity Recommender
# ----------------------

recommender = pr.create(train, target='rating')
evaluation = recommender.evaluate(test)  # ('\nOverall RMSE: ', 1.0262364153594763)
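A hedged follow-up, assuming the recommend method of GraphLab recommender models:

print(recommender.recommend(users=[1, 2, 3], k=5))  # user ids are illustrative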
Example no. 34
 def read_columns_to_sframe(self, column_map):
     from graphlab import SFrame
     sf = SFrame.read_csv(csv_datasource_path(self.name))
     return sf.rename(invert_dict(column_map))
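invert_dict is not shown in this snippet; a plausible helper, assuming column_map maps desired names to CSV names and rename therefore wants the inverse:

# Hypothetical helper: swap keys and values for SFrame.rename
def invert_dict(d):
    return dict((v, k) for k, v in d.items())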
Example no. 35
def get_sf_from_coo(coo, save_to):
    sf = SFrame({'userId': coo.row, 'movieId': coo.col, 'rating': coo.data})
    if save_to is not None:
        print "saving sframe to", save_to
        sf.save(save_to)
    return sf
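An input sketch, assuming coo is a scipy.sparse COO matrix whose rows, columns, and values carry the user ids, movie ids, and ratings:

from scipy.sparse import coo_matrix
import numpy as np

ratings = coo_matrix((np.array([5.0, 3.0]),    # rating values
                      (np.array([0, 1]),       # user ids (rows)
                       np.array([10, 20]))))   # movie ids (columns)
sf = get_sf_from_coo(ratings, save_to=None)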
Example no. 36
                song_3a=feat_mat['title'][feat_mat['song_id'] == song_pairs[2][0]][0],
                song_3b=feat_mat['title'][feat_mat['song_id'] == song_pairs[2][1]][0],
                artist_1a=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[0][0]][0],
                artist_1b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[0][1]][0],
                artist_2a=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[1][0]][0],
                artist_2b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[1][1]][0],
                artist_3a=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[2][0]][0],
                artist_3b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[2][1]][0])
    # Pass song names/titles to choose from to HTML template

if __name__ == '__main__':
    play_count_path = 'train_triplets.txt'
    feat_path = 'trimmed_tempos.csv'
    model_path = 'full_two_hour_mod_directory'

    feat_mat = SFrame.read_csv(feat_path)
    model = graphlab.load_model(model_path)
    song_pairs = generate_pairs(model, np.array(feat_mat['song_id']), k=3)
    pref_songs = get_user_prefs(feat_mat, song_pairs)
    playlist = get_playlist(model, pref_songs, feat_mat,
                            desired_tempo=160,
                            tempo_margin=10,
                            playlist_length=10)

    songs = playlist['song_name'].values
    artists = playlist['artist'].values
    tempoxs = playlist['tempo_multiplier'].values
    cadences = playlist['effective_tempo'].values
    pl_length = 10
    timeline_obj = ""
Example no. 37
def get_sf_from_coo(coo, save_to):
    sf = SFrame({'userId': coo.row, 'movieId': coo.col, 'rating': coo.data})
    if save_to is not None:
        print "saving sframe to", save_to
        sf.save(save_to)
    return sf
Example no. 38
# import pandas as pd
import numpy as np
import os
import graphlab as gl
from graphlab import SFrame
import pandas as pd

def merge_data(df):
    return ''.join([str(df["store_nbr"]), "_", str(df["item_nbr"]), "_", df["date"]])



ind = True

weather = SFrame.read_csv(os.path.join('..', "data", "weather_modified_3.csv"))

if ind:
  test = SFrame.read_csv(os.path.join('..', "data", "test.csv"))


train = SFrame.read_csv(os.path.join('..', "data", "train.csv"))
key = SFrame.read_csv(os.path.join('..', "data", "key.csv"))

zero_items = SFrame.read_csv(os.path.join('..', 'data', 'zero_items_solid_new.csv'))


train_new = train.join(zero_items)

if ind:
  test_new = test.join(zero_items)
Example no. 39
                                          [0]][0],
        artist_2b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[1]
                                          [1]][0],
        artist_3a=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[2]
                                          [0]][0],
        artist_3b=feat_mat['artist_name'][feat_mat['song_id'] == song_pairs[2]
                                          [1]][0])
    # Pass song names/titles to choose from to HTML template


if __name__ == '__main__':
    play_count_path = 'train_triplets.txt'
    feat_path = 'trimmed_tempos.csv'
    model_path = 'full_two_hour_mod_directory'

    feat_mat = SFrame.read_csv(feat_path)
    model = graphlab.load_model(model_path)
    song_pairs = generate_pairs(model, np.array(feat_mat['song_id']), k=3)
    pref_songs = get_user_prefs(feat_mat, song_pairs)
    playlist = get_playlist(model,
                            pref_songs,
                            feat_mat,
                            desired_tempo=160,
                            tempo_margin=10,
                            playlist_length=10)

    songs = playlist['song_name'].values
    artists = playlist['artist'].values
    tempoxs = playlist['tempo_multiplier'].values
    cadences = playlist['effective_tempo'].values
    pl_length = 10
Example no. 40
def build_data_graph():
  file_path = "/Users/blahiri/healthcare/documents/recommendation_system/"
  beneficiaries = SFrame.read_csv(file_path + "beneficiary_summary_2008_2009.csv")
  bene_packed = beneficiaries.pack_columns(column_prefix='chron_', dtype=dict,
                                           new_column_name='chronic_conditions',
                                           remove_prefix=False)
  
  #x is a row of bene_packed in the following lambda. We insert the desynpuf_id into the (key, value) tuple, convert the tuple to a list by calling list(), 
  #and the outer [] makes sure we emit a list of lists.
  bene_chrons = bene_packed.flat_map(["chronic_condition_name", "chronic_condition_value", "desynpuf_id"], 
                                     lambda x:[list(k + (x['desynpuf_id'], )) for k in x['chronic_conditions'].iteritems()])
 

  bene_chrons = bene_chrons[bene_chrons['chronic_condition_value'] == 1]
  del bene_chrons['chronic_condition_value']
  bene_chrons.rename({'chronic_condition_name': 'chronic_condition'})

  g = SGraph()
  bene_chrons['relation'] = 'had_chronic'
  g = g.add_edges(bene_chrons, src_field = 'desynpuf_id', dst_field = 'chronic_condition')
  print g.summary()
 
  #Take out the distinct IDs of patients with chronic conditions to avoid repetition in query
  bene_with_chrons = SFrame(None)
  bene_with_chrons.add_column(bene_chrons['desynpuf_id'].unique(), 'desynpuf_id')
  
  #Add edges to the graph indicating which patient had which diagnosed condition
  tcdc = SFrame.read_csv(file_path + "transformed_claim_diagnosis_codes.csv")
  cols_to_drop = ['clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year']
  for column in cols_to_drop:
     del tcdc[column]
  #Same patient can be diagnosed with same condition multiple times a year, so take distinct
  tcdc = tcdc.unique()
  #Take diagnosed conditions for only those patients who had some chronic condition in 2008 or 2009. It is possible that 
  #such a patient had no diagnosed condition, however.
  bene_chrons_tcdc = bene_with_chrons.join(tcdc)
  
  bene_chrons_tcdc['relation'] = 'diagnosed_with'
  g = g.add_edges(bene_chrons_tcdc, src_field = 'desynpuf_id', dst_field = 'dgns_cd')
  print g.summary()

  
  #Add edges to the graph indicating which patient had which procedure
  tcpc = SFrame.read_csv(file_path + "transformed_claim_prcdr_codes.csv", column_type_hints = {'prcdr_cd' : str})
  cols_to_drop = ['clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year']
  for column in cols_to_drop:
     del tcpc[column]
  tcpc = tcpc.unique()
  #Take procedures for only those patients who had some chronic condition in 2008 or 2009. It is possible that 
  #such a patient had no procedure, however.
  bene_chrons_tcpc = bene_with_chrons.join(tcpc)
  bene_chrons_tcpc['relation'] = 'underwent'
  g = g.add_edges(bene_chrons_tcpc, src_field = 'desynpuf_id', dst_field = 'prcdr_cd')
  print g.summary()

  #Add edges to the graph indicating which patient had which medicine
  pde = SFrame.read_csv(file_path + "prescribed_drugs.csv")
  pde = pde.unique()
  #Take medicines for only those patients who had some chronic condition in 2008 or 2009. It is possible that 
  #such a patient had no medicine, however.
  bene_chrons_pde = bene_with_chrons.join(pde)
  bene_chrons_pde['relation'] = 'had_drug'
  g = g.add_edges(bene_chrons_pde, src_field = 'desynpuf_id', dst_field = 'substancename')
  print g.summary()
   
  return g
Example no. 41
def build_data_graph():
    file_path = "/Users/blahiri/healthcare/documents/recommendation_system/"
    beneficiaries = SFrame.read_csv(file_path +
                                    "beneficiary_summary_2008_2009.csv")
    bene_packed = beneficiaries.pack_columns(
        column_prefix='chron_',
        dtype=dict,
        new_column_name='chronic_conditions',
        remove_prefix=False)

    #x is a row of bene_packed in the following lambda. We insert the desynpuf_id into the (key, value) tuple, convert the tuple to a list by calling list(),
    #and the outer [] makes sure we emit a list of lists.
    bene_chrons = bene_packed.flat_map(
        ["chronic_condition_name", "chronic_condition_value", "desynpuf_id"],
        lambda x: [
            list(k + (x['desynpuf_id'], ))
            for k in x['chronic_conditions'].iteritems()
        ])

    bene_chrons = bene_chrons[bene_chrons['chronic_condition_value'] == 1]
    del bene_chrons['chronic_condition_value']
    bene_chrons.rename({'chronic_condition_name': 'chronic_condition'})

    g = SGraph()
    bene_chrons['relation'] = 'had_chronic'
    g = g.add_edges(bene_chrons,
                    src_field='desynpuf_id',
                    dst_field='chronic_condition')
    print g.summary()

    #Take out the distinct IDs of patients with chronic conditions to avoid repetition in query
    bene_with_chrons = SFrame(None)
    bene_with_chrons.add_column(bene_chrons['desynpuf_id'].unique(),
                                'desynpuf_id')

    #Add edges to the graph indicating which patient had which diagnosed condition
    tcdc = SFrame.read_csv(file_path + "transformed_claim_diagnosis_codes.csv")
    cols_to_drop = [
        'clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year'
    ]
    for column in cols_to_drop:
        del tcdc[column]
    #Same patient can be diagnosed with same condition multiple times a year, so take distinct
    tcdc = tcdc.unique()
    #Take diagnosed conditions for only those patients who had some chronic condition in 2008 or 2009. It is possible that
    #such a patient had no diagnosed condition, however.
    bene_chrons_tcdc = bene_with_chrons.join(tcdc)

    bene_chrons_tcdc['relation'] = 'diagnosed_with'
    g = g.add_edges(bene_chrons_tcdc,
                    src_field='desynpuf_id',
                    dst_field='dgns_cd')
    print g.summary()

    #Add edges to the graph indicating which patient had which procedure
    tcpc = SFrame.read_csv(file_path + "transformed_claim_prcdr_codes.csv",
                           column_type_hints={'prcdr_cd': str})
    cols_to_drop = [
        'clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year'
    ]
    for column in cols_to_drop:
        del tcpc[column]
    tcpc = tcpc.unique()
    #Take procedures for only those patients who had some chronic condition in 2008 or 2009. It is possible that
    #such a patient had no procedure, however.
    bene_chrons_tcpc = bene_with_chrons.join(tcpc)
    bene_chrons_tcpc['relation'] = 'underwent'
    g = g.add_edges(bene_chrons_tcpc,
                    src_field='desynpuf_id',
                    dst_field='prcdr_cd')
    print g.summary()

    #Add edges to the graph indicating which patient had which medicine
    pde = SFrame.read_csv(file_path + "prescribed_drugs.csv")
    pde = pde.unique()
    #Take medicines for only those patients who had some chronic condition in 2008 or 2009. It is possible that
    #such a patient had no medicine, however.
    bene_chrons_pde = bene_with_chrons.join(pde)
    bene_chrons_pde['relation'] = 'had_drug'
    g = g.add_edges(bene_chrons_pde,
                    src_field='desynpuf_id',
                    dst_field='substancename')
    print g.summary()

    return g
Example no. 42
outputPath = os.environ.get("OUTPUT_PATH")
startScale = int(os.environ.get("START_SCALE"))

tagFile = './tmp'
with open(tagFile, 'r') as f:
    infor = f.readline().strip().split(",")
    maxScale = int(infor[1])
    realEndScale = int(infor[2])

scaleRange = range(startScale, realEndScale + 1)

for scale in scaleRange:

    inputPath = os.path.join(outputPath, 'tmp', 'AdjacentRelationships',
                             str(scale))
    url = inputPath
    data = SFrame.read_csv(url, header=False)
    if (data.num_rows() == 0):
        cc_ids = SFrame({"__id": [], "component_id": []})
    else:
        g = SGraph().add_edges(data,
                               src_field=data.column_names()[0],
                               dst_field=data.column_names()[1])
        cc = connected_components.create(g)
        cc_ids = cc.get('component_id')
    path = os.path.join(outputPath, 'tmp', 'ConnectedComponents', str(scale))
    if not os.path.exists(path):
        os.makedirs(path)

    cc_ids.export_csv(path)
Example no. 43
import graphlab
from graphlab import SFrame

train_input = graphlab.image_analysis.load_images('train_images/', "auto",
                                                  with_path=False, random_order=False)
train_output = SFrame.read_csv('train_outputs.csv', delimiter=',', header=True,
                               column_type_hints=[int, int])
train_output.rename({'Prediction':'label'})
train_output.remove_column('Id')
train_output.add_column(train_input.select_column('image'),name='image')
training_data, validation_data = train_output.random_split(0.8)

training_data['image'] = graphlab.image_analysis.resize(training_data['image'], 28, 28, 1)
validation_data['image'] = graphlab.image_analysis.resize(validation_data['image'], 28, 28, 1)

mnist_net = graphlab.deeplearning.get_builtin_neuralnet('mnist')

#net = graphlab.deeplearning.create(sf, target='Prediction')

m = graphlab.neuralnet_classifier.create(training_data, target='label',
                                         network=mnist_net,
                                         validation_set=validation_data,
                                         max_iterations=200)

#test_data = graphlab.image_analysis.load_images('test_images/', "auto", with_path=False, random_order=False)

#pred = m.classify(test_data)
Example no. 44
def process_frame(frame_name):
    
    #Setup columns for the new frame
    session_id = []
    ip_address = []
    python_version = []
    interest = []
    submissions = []
    
    #Load in the frame we're processing
    frame = gl.load_sframe(frame_name)
    
    #Sort the frame by IP and then DT ASC
    sorted_frame = frame.sort(['ip','dt'])
    
    #Previous IP to see if we're looking at a new IP address
    previous_ip = 0
    previous_py = 0
    
    #Counters (for keys)
    record_counter = 1
    submission_counter = 1
    
    #Dictionary to hold submissions
    submissions_collection = {}
    
    #Looping through all records to break this up into
    #ip address and then 'session' chunks
    for i in xrange(len(sorted_frame)):
        if(i == 1):
            print sorted_frame['ip'][i]
            break
        if(i % 100 == 0):
            print "processing record:" + str(i)
        if((sorted_frame['ip'][i] != previous_ip)):
            if(previous_ip != 0):
                #Add in the record to the frame
                #(append whole values; '+= str' would extend the lists character by character)
                session_id.append(str(record_counter))
                ip_address.append(str(previous_ip))
                python_version.append(str(previous_py))
                interest.append(str(is_interesting(submissions_collection)))
                submissions.append(submissions_collection)
                
                #Reset all values
                submissions_collection = {}
                previous_ip = sorted_frame['ip'][i]
                previous_py = sorted_frame['py'][i]
                record_counter = record_counter + 1
                submission_counter = 1
                
                #Create and append the submission
                d = {}
                d['date-time'] = sorted_frame['dt'][i]
                d['code_segment'] = sorted_frame['user_script'][i]
                d['error_message'] = sorted_frame['err_msg'][i]
                d['error_flag'] = sorted_frame['compile_err'][i]
                
                submissions_collection[str(submission_counter)] = d
                submission_counter = submission_counter + 1
                
            else:
                #Handling the very first record
                previous_ip = sorted_frame['ip'][i]
                previous_py = sorted_frame['py'][i]
                
                #Create and append the submission
                d = {}
                d['date-time'] = sorted_frame['dt'][i]
                d['code_segment'] = sorted_frame['user_script'][i]
                d['error_message'] = sorted_frame['err_msg'][i]
                d['error_flag'] = sorted_frame['compile_err'][i]
                
                submissions_collection[str(submission_counter)] = d
                submission_counter = submission_counter + 1
        else:
            #Create and append the submission
            d = {}
            d['date-time'] = sorted_frame['dt'][i]
            d['code_segment'] = sorted_frame['user_script'][i]
            d['error_message'] = sorted_frame['err_msg'][i]
            d['error_flag'] = sorted_frame['compile_err'][i]
            
            submissions_collection[str(submission_counter)] = d
            submission_counter = submission_counter + 1
            
    #Finally, create the frame and save it!
    
    print ip_address
    print len(session_id)
    print len(ip_address)
    print len(python_version)
    print len(submissions)
    
    rst = SFrame()
    rst.add_column(SArray(session_id, dtype=str), name='session_id')
    rst.add_column(SArray(ip_address, dtype=str), name='ip_address')
    rst.add_column(SArray(python_version, dtype=str), name='python_version')
    rst.add_column(SArray(interest, dtype=str), name='interest')  #interest flag per session
    rst.add_column(SArray(submissions, dtype=dict), name='submissions')

    rst.save("test_frame")          
Example no. 45
#targets = ['James Bond', 'Moneypenny']
#subgraph = g.get_neighborhood(ids=targets, radius=1, full_subgraph=True)
#subgraph.show(vlabel='id', highlight=['James Bond', 'Moneypenny'], arrows=True)

#from graphlab import SGraph, Vertex, Edge

#g = SGraph()
#verts = [Vertex(0, attr={'breed': 'labrador'}),
#         Vertex(1, attr={'breed': 'labrador'}),
#         Vertex(2, attr={'breed': 'vizsla'})]

#g = g.add_vertices(verts)
#g = g.add_edges(Edge(1, 2))

#print g

from graphlab import SFrame, SGraph
edge_data = SFrame.read_csv(
    'http://s3.amazonaws.com/dato-datasets/bond/bond_edges.csv')
vertex_data = SFrame.read_csv(
    'http://s3.amazonaws.com/dato-datasets/bond/bond_vertices.csv')

g = SGraph(vertices=vertex_data,
           edges=edge_data,
           vid_field='name',
           src_field='src',
           dst_field='dst')
#print g

g.show()