Example no. 1
    def get_subgraph(self, ids, radius=1, full_subgraph=True):
        verts = ids

        # find the vertices within radius (and the path edges)
        for i in range(radius):
            edges_out = self._graph.get_edges(src_ids=verts)
            # edges_in = self._graph.get_edges(dst_ids=verts)

            verts = list(edges_out['__src_id']) + list(edges_out['__dst_id'])
            verts = list(set(verts))

        # make a new graph to return and add the vertices
        g = SGraph()
        g = g.add_vertices(self._graph.get_vertices(verts), vid_field='__id')

        # add the requested edge set
        if full_subgraph:
            df_induced = self._graph.get_edges(src_ids=verts)
            # induced_edge_in = self._graph.get_edges(dst_ids=verts)
            # df_induced = induced_edge_out.append(induced_edge_in)
            df_induced = df_induced.groupby(df_induced.column_names(), {})

            verts_sa = SArray(list(ids))
            edges = df_induced.filter_by(verts_sa, "__src_id")
            # append() returns a new SFrame; keep the result
            edges = edges.append(df_induced.filter_by(verts_sa, "__dst_id"))

            g = g.add_edges(edges, src_field='__src_id', dst_field='__dst_id')
        return GlGraph(is_directed=self.is_directed, graph_obj=g)
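For reference, the groupby(column_names(), {}) call above is a GraphLab idiom for de-duplicating rows: grouping on every column with no aggregation operations keeps one row per unique combination. A minimal sketch, assuming graphlab is importable (the toy edges are hypothetical):

from graphlab import SFrame

# toy edge list with one duplicated edge
edges = SFrame({'__src_id': [1, 1, 2], '__dst_id': [2, 2, 3]})

# grouping on all columns with an empty aggregation dict keeps exactly
# one row per unique (__src_id, __dst_id) pair
deduped = edges.groupby(edges.column_names(), {})
print(deduped.num_rows())  # 2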
Example no. 2
 def load_graph(self,
                graph_path,
                direction=1,
                start_line=0,
                limit=None,
                blacklist=set(),
                delimiter=','):
     json_object = utils.is_json(graph_path)
     if json_object is not False:
         graph_data = SFrame(SArray(json_object).unpack())
         graph_data.rename({'X.0': 'X1', 'X.1': 'X2', 'X.2': 'Weight'})
     else:
         # plain-text edge list; a saved SGraph would use load_sgraph() instead
         graph_data = SFrame.read_csv(graph_path,
                                      delimiter=delimiter,
                                      header=False,
                                      column_type_hints={
                                          'X1': str,
                                          'X2': str
                                      },
                                      nrows=limit,
                                      skiprows=start_line)
         if self._weight_field != "":
             graph_data.rename({'X3': 'Weight'})
     self._graph = self._graph.add_edges(graph_data,
                                         src_field='X1',
                                         dst_field='X2')
     if not self.is_directed:
         self.to_undirected()
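The loader above assumes a headerless edge list with the source in column X1, the destination in X2, and an optional weight in X3. A minimal sketch of loading such a file directly (the file name is hypothetical):

from graphlab import SFrame, SGraph

# headerless CSV, one "src,dst" pair per line; read_csv names the
# columns X1, X2, ... automatically when header=False
edges = SFrame.read_csv('edges.csv', header=False,
                        column_type_hints={'X1': str, 'X2': str})
g = SGraph().add_edges(edges, src_field='X1', dst_field='X2')
print(g.summary())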
Example no. 3
def append_images(json_file):

    # we fill an SFrame with all the given metadata of the dogs
    meta = SFrame.read_json(json_file, orient='records')
    # this is the SFrame that we will fill with the data plus the image, which will be saved in the final file
    image_list = SFrame(data=None)
    # for each image listed in each row's 'images' column, emit one row
    # in the output SFrame with that single image attached
    for i in range(len(meta)):
        dogo = meta[i:i + 1]
        for image in dogo['images'][0]:
            # print image
            dogo_clone = dogo.copy()
            dogo_clone.add_column(SArray([graphlab.Image(images_path + image)]),
                                  name='image')
            dogo_clone.add_column(SArray([image]), name='image_filename')
            image_list = image_list.append(SFrame(dogo_clone))

    image_list.save(filename='prepared_data/')
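Saving to a directory path like this writes the SFrame in its native binary format, which can be reloaded directly. A minimal sketch of reading it back (same path as above):

import graphlab

image_list = graphlab.SFrame('prepared_data/')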
Example no. 4
def initVertex(g):
    g.vertices['dist'] = 8888

    g.vertices['sent'] = 0
    #g.vertices['from_last_art'] = 0
    #g.vertices['count'] = 0
    g.vertices['isDead'] = 0
    #g.vertices['vid_set'] = SArray.from_const({}, g.summary()['num_vertices'])
    # 'seen' serves two purposes: for category vertices it remembers the
    # articles, and for article vertices it acts as the vid_set; it is a
    # dict of the form {'id': [dist, from_last_art]}
    g.vertices['seen'] = SArray.from_const({}, g.summary()['num_vertices'])
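SArray.from_const builds a column that repeats a single value; here every vertex starts out with an empty dict for bookkeeping. A minimal sketch, assuming graphlab is importable:

from graphlab import SArray

# a length-3 column whose every element is an empty dict
seen = SArray.from_const({}, 3)
print(seen)  # [{}, {}, {}]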
Example no. 5
def train_model(filename):
    # load already prepared data in form of an SFrame
    image_train = graphlab.SFrame(filename)
    # load the pre-trained model
    loaded_model = graphlab.load_model('model/')
    # extract features of the model on the given pictures
    image_train['deep_features'] = loaded_model.extract_features(SFrame(image_train))
    # add ids to the SFrame to be able to find the closest images
    ids = SArray(list(range(len(image_train))))
    image_train.add_column(ids, name='id')
    # print image_train.head()
    # train the NN model on the extracted features
    knn_model = graphlab.nearest_neighbors.create(image_train, features=['deep_features'], label='id')
    return knn_model, image_train
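Once trained, the model can be queried for the images closest to a given one. A minimal sketch reusing the pair returned above (the input path is hypothetical):

knn_model, image_train = train_model('prepared_data/')

# the 4 nearest neighbours of the first image; the query image itself
# comes back as the top match at distance 0
neighbours = knn_model.query(image_train[0:1], k=4)
print(neighbours)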
Example no. 7
 def generate(self, sf):
     Feature.validateData(sf, self.price)
     sma1 = sf[self.price].rolling_mean(-1 * self.MA1, 0)
     sma2 = sf[self.price].rolling_mean(-1 * self.MA2, 0)
     diff = sma1 - sma2
     signals = [None] * len(sma1)
     for i in xrange(1, len(diff)):
         # rolling_mean yields None until the window has enough data;
         # skip comparisons involving those missing values
         if diff[i] is not None and diff[i - 1] is not None:
             if diff[i] > 0 and diff[i - 1] < 0:
                 signals[i] = 1
             elif diff[i] < 0 and diff[i - 1] > 0:
                 signals[i] = -1
             else:
                 signals[i] = 0
     sf.add_column(SArray(signals), name=self.name)
     return self.name
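The generated signal is 1 when the fast average crosses above the slow one and -1 on the opposite cross. A minimal sketch of the rolling_mean windowing used above (toy prices are hypothetical); note that rolling_mean(-n, 0) spans n + 1 samples:

from graphlab import SArray

prices = SArray([1., 2., 3., 4., 5.])
# trailing window [i - 2, i]; entries with an incomplete window are None
sma = prices.rolling_mean(-2, 0)
print(sma)  # [None, None, 2.0, 3.0, 4.0]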
Example no. 8
def crossproduct(d):
    """
    Create an SFrame containing the crossproduct of all provided options.

    Parameters
    ----------
    d : dict
        Each key is the name of an option, and each value is a list
        of the possible values for that option.

    Returns
    -------
    out : SFrame
        There will be a column for each key in the provided dictionary,
        and a row for each unique combination of all values.

    Example
    -------
    settings = {'argument_1':[0, 1],
                'argument_2':['a', 'b', 'c']}
    print crossproduct(settings)
    +------------+------------+
    | argument_2 | argument_1 |
    +------------+------------+
    |     a      |     0      |
    |     a      |     1      |
    |     b      |     0      |
    |     b      |     1      |
    |     c      |     0      |
    |     c      |     1      |
    +------------+------------+
    [6 rows x 2 columns]
    """

    _mt._get_metric_tracker().track('util.crossproduct')
    from graphlab import SArray
    combos = [zip(d.keys(), x) for x in _itertools.product(*d.values())]
    sa = [{k: v for (k, v) in x} for x in combos]
    return SArray(sa).unpack(column_name_prefix='')
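The heavy lifting is itertools.product; a minimal equivalent sketch in plain Python (the toy dict mirrors the docstring):

import itertools

d = {'argument_1': [0, 1], 'argument_2': ['a', 'b', 'c']}
rows = [dict(zip(d.keys(), combo)) for combo in itertools.product(*d.values())]
print(len(rows))  # 6 unique combinations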
Example no. 9
 def compute(cls, sf, period, column):
     # sliding-window RSI over `period` values of `column`, which holds
     # period-over-period price changes; `gain` and `loss` accumulate the
     # up and down moves inside the window [j, i]
     gain, loss = 0, 0
     rsi = [None] * len(sf[column])
     j = 0
     for i in xrange(len(sf[column])):
         v = sf[column][i]
         if v > 0:
             gain += v
         else:
             loss += -1 * v
         if i >= period - 1:
             if loss == 0 and gain == 0:
                 rsi[i] = 50.
             elif loss == 0:
                 rsi[i] = 100.
             else:
                 # RSI = 100 - 100 / (1 + RS), with RS = gain / loss
                 rsi[i] = 100. - 100. / (1. + gain / float(loss))
             u = sf[column][j]
             gain -= u if u > 0 else 0
             loss += u if u < 0 else 0
             j += 1
     return SArray(rsi)
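compute expects column to hold price changes, not raw prices. A minimal sketch of preparing that input, assuming the method above is a classmethod of a class named RSI (the class name and toy prices are hypothetical):

from graphlab import SFrame

prices = [10., 11., 10.5, 11.5, 12., 11.]
# period-over-period change; one fewer element than prices
changes = [b - a for a, b in zip(prices, prices[1:])]
sf = SFrame({'change': changes})
print(RSI.compute(sf, period=3, column='change'))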
Example no. 10
    def data_frame_with_target(self, data_frame):
        """

        :param data_frame:
        :type data_frame: DataFrame
        :return:
        :rtype: SFrame
        """
        data_sframe = SFrame(data_frame.toPandas())
        sentiment_array = data_sframe.select_column('sentiment')
        target_array = []
        for x in sentiment_array:
            try:
                target_array.append(self.convert_func(x))
            except Exception as ex:
                print len(target_array), 'get_sentiments', x
                # fall back to a fixed label when conversion fails
                target_array.append(3)
                print ex

        data_sframe.add_column(SArray(target_array, dtype=int), name='target')
        print data_sframe
        return data_sframe.dropna()
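A minimal sketch of a convert_func consistent with the integer targets above (the mapping and label values are hypothetical; only the fallback value 3 appears in the original):

def convert_func(sentiment):
    # hypothetical string-to-label mapping; unknown values raise
    # KeyError, which the caller catches and maps to 3
    mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
    return mapping[sentiment]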
Example no. 11
def createSArray(arr):
    return SArray(arr)
# for each customer, get his personal recommendations and the item-item
# similarity of what he is buying
for x in customerIDs:

    cld2 = products_2[products_2['customerID'] == x]
    cld2['rating'] = cld2.groupby('product_code')['product_code'].transform('count')
    sf = graphlab.SFrame(data=cld2)
    # recommender trained on his own purchase history
    mf = graphlab.factorization_recommender.create(sf, user_id='customerID', item_id='product_code', target='rating')
    # take the top 3 from his personal recommendations and from the
    # overall item-item similarities
    personalRecommdation = mf.recommend_from_interactions([], k=3)

    cluster_Personal = personalRecommdation['product_code']
    sa = SArray(data=cluster_Personal)
    totalRecommdation = m.get_similar_items(items=sa, k=3, verbose=False)
    #totalRecommdation = m.recommend([x], k=3)
    cluster_Recommdation = totalRecommdation['similar']
    print(cluster_Personal)
    print(cluster_Recommdation)
    empty = []
    empty.append(cluster_Recommdation)
    empty.append(cluster_Personal)
    for s in range(len(empty)):
        defaultValueList[s] = empty[s]
    makeitastring = ",".join(map(str, defaultValueList))
    print(makeitastring)
    d1 = {'customerID': x, 'products': makeitastring}
Example no. 13
    aggregatedData = aggregateData(data_matrix)

    # print(aggregatedData)

    aggregatedData.save('../data/musicbrainz/aggregatedData')

    aggregatedData.save('../data/musicbrainz/aggregatedData.csv', format='csv')

    # sed 's/[^a-zA-Z0-9,-.]/ /g' aggregatedData.csv > removePunctuation.csv

    # temp1 = pd.read_csv("data/musicbrainz/removePunctuation.csv", skipinitialspace=True)

    # temp1 = np.genfromtxt("../data/musicbrainz/removePunctuation.csv", delimiter=",", filling_values=None)

    temp = SArray('../data/musicbrainz/removePunctuation.csv')

    docs_tfidf = gl.text_analytics.tf_idf(temp)

    print(docs_tfidf)

    docs_tfidf.save('../data/musicbrainz/docs_tfidf')

    docs_tfidf.save('../data/musicbrainz/docs_tfidf.csv', format='csv')

########################################################################################################################

# print ("starting to load the data...\n")
#
# df1 = pd.read_csv("data/musicbrainz/track_artist_tags.csv", sep=';')
#
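For reference, text_analytics.tf_idf scores bag-of-words dictionaries, and count_words turns an SArray of raw strings into that form. A minimal sketch (the toy documents are hypothetical):

import graphlab as gl
from graphlab import SArray

docs = SArray(['the quick brown fox', 'the lazy dog'])
# count_words -> one {word: count} dict per document
bow = gl.text_analytics.count_words(docs)
docs_tfidf = gl.text_analytics.tf_idf(bow)
print(docs_tfidf)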
Example no. 14
import pickle
from os.path import isfile

import numpy as np
from graphlab import SArray
from sklearn.preprocessing import LabelEncoder

import loader  # project-local module providing create_list()


class KMergeCorpus(object):
    def __init__(self, groups, reference, hashf, ks, ke, filter_file=None):
        self.groups = groups
        self.reference = reference
        self.hash = hashf
        self.ks = ks
        self.ke = ke
        self.selection = None
        self.filter = None
        # fall back to a previously dumped hash file when no explicit
        # filter file is given
        if filter_file is None and isfile("hashes.k%s.npz" % ks):
            filter_file = "hashes.k%s.npz" % ks
        if filter_file:
            self.filter = LabelEncoder()
            self.filter.fit(np.load(filter_file)["hashes"])
            pickle.dump(self.filter, open("encoder.k%s.pkl" % self.ks, "wb"))
            #self.filter = {h:i for i,h in enumerate(np.load(filter_file)["hashes"])}

    def __len__(self):
        return len(self.selection)

    def set_selection(self, s):
        self.selection = s


    def store(self):
        # accumulate one sparse {hash: count} dict per group
        self.sa = SArray(dtype=dict)
        for i, group in enumerate(self.selection):
            #if self.selection and i not in self.selection:
            #if self.selection and group not in self.selection:
            #    continue
            counts_file = "%s/%s_%s_%s/%s.counts.bin" % (self.reference, self.hash, self.ks, self.ke, group)
            hashes_file = "%s/%s_%s_%s/%s.hashes.bin" % (self.reference, self.hash, self.ks, self.ke, group)
            c = loader.create_list(counts_file, False)
            h = loader.create_list(hashes_file, True)
            print "%s, %s, %s" % (group, len(h), len(c))
            values = {}
            if self.filter is not None:
                mask = np.in1d(h, self.filter.classes_)
                values = dict(zip(self.filter.transform(np.array(h, dtype=np.uint32)[mask]), np.array(c, dtype=np.uint32)[mask]))
            else:
                values = dict(zip(h, c))
            if len(values) == 0:
                print "No data for %s" % group
                continue
            sa = SArray([values])
            self.sa = self.sa.append(sa)
        
    def __iter__(self):
        for i, group in enumerate(self.selection):
            #if self.selection and i not in self.selection:
            #if self.selection and group not in self.selection:
            #    continue
            counts_file = "%s/%s_%s_%s/%s.counts.bin" % (self.reference, self.hash, self.ks, self.ke, group)
            hashes_file = "%s/%s_%s_%s/%s.hashes.bin" % (self.reference, self.hash, self.ks, self.ke, group)
            c = loader.create_list(counts_file, False)
            h = loader.create_list(hashes_file, True)
            values = []
            if self.filter is not None:
                mask = np.in1d(h, self.filter.classes_)
                values = zip(self.filter.transform(np.array(h)[mask]), np.array(c)[mask])
            else:
                values = zip(h, c)
            if len(values) == 0:
                print "No data for %s" % group
            yield values
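A minimal usage sketch of the class above (the group names, reference directory, and hash parameters are hypothetical):

corpus = KMergeCorpus(groups=['sampleA', 'sampleB'],
                      reference='ref_dir', hashf='murmur',
                      ks=21, ke=31)
corpus.set_selection(corpus.groups)

# stream the (hash, count) pairs group by group
for pairs in corpus:
    print(len(pairs))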