Example #1
def save_data(image_test):
    """Save image_train data to csv"""

    # inspect the images in the data set
    print "*********************"
    print "expecting 4000 images:", len(image_test)
    print "*********************"

    # [column_names]
    # 0000=id
    # 0001=image
    # 0002=label
    # 0003=deep_features
    # 0004=image_array

    print "*********************"
    print "save image labels to csv"
    all_image_labels = image_test['label']
    image_labels = gl.SArray(all_image_labels)
    all_image_labels.save('all_image_labels.csv')
    print "*********************"
    all_image_ids = image_test['id'].astype(int)
    image_id = gl.SArray(all_image_ids)
    print "*********************"
    print "create a csv file concatenated with the id and label"
    image_id_and_label = gl.SFrame({'ids': image_id, 'label': image_labels})
    image_id_and_label.save('image_id_and_label.csv')
    print "*********************"
Example #2
    def evaluate_recommendation(self,
                                recom,
                                selected_authors=None,
                                n_recommendations=10):
        all_ratios = []
        if not selected_authors:
            selected_authors = self.selected_authors

        existing_collabs_list = []
        for a in selected_authors:
            n_existing_collabs = 0
            a_papers = self.topic_model.corpus.documents_by_author(a[0])
            a_collabs = [self.topic_model.corpus.authors(b) for b in a_papers]
            try:
                a_recoms = recom.get_similar_users(gl.SArray(
                    [a[0]]), 1000).sort("distance")[:n_recommendations]
            except Exception:
                a_recoms = recom.get_similar_items(gl.SArray([a[0]]),
                                                   n_recommendations)
            a_similar = a_recoms["similar"]

            for sim in a_similar:
                for collabs in a_collabs:
                    if sim in collabs:
                        n_existing_collabs += 1
                        pair = sorted([a[0], sim])
                        if pair not in existing_collabs_list:
                            existing_collabs_list.append(pair)
                        break
            all_ratios.append(
                float(n_existing_collabs) / (len(a_recoms) + .00001))
        return all_ratios, existing_collabs_list
Example #3
    def _array_to_sframe(self, data, targets=None):
        d = dict()
        for i in xrange(data.shape[1]):
            d['feat_%d' % (i + 1)] = gl.SArray(data[:, i])
        if targets is not None:
            d['target'] = gl.SArray(targets)
        return gl.SFrame(d)
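A quick usage sketch for the helper above. The numpy array and targets are invented, and the call goes through whichever class defines the method:

    # X = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]])   # 2 rows, 3 features
    # y = np.array([0, 1])
    # sf = self._array_to_sframe(X, targets=y)
    # sf now holds columns feat_1, feat_2, feat_3 and target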
Example #4
def predict_options(options):
    """
    Run predictions on potential options
    :param options: array of dictionaries, expected format [{"user": __, "content_id": __}]
    :return: an array with predicted scores for each option; None if invalid
    """
    # TODO - Need to format option in a way that makes sense for the predictor
    if os.path.exists(MODEL_LOCATION):
        model = gl.load_model(MODEL_LOCATION)
    else:
        logger.warn("couldn't load model, re-training", exc_info=True)
        model = train()

    if "user" in options[0] and "content_id" in options[0]:
        temp_users = []
        temp_content = []
        for option in options:
            temp_users.append(option["user"])
            temp_content.append(option["content_id"])
        users = gl.SArray(temp_users)
        content = gl.SArray(temp_content)
        frame = gl.SFrame({
            "user": users,
            "content_id": content
        },
                          format="dict")
        prediction = model.predict(frame)
        logger.info("prediction: %s", prediction)
    else:
        logger.error(
            "options not in the correct format, expected key 'user' and key 'content_id'"
        )
        prediction = None

    return list(prediction) if prediction is not None else None
Example #5
    def classify(self, path, h=48, w=48, channels=1):
        """Classify the image

        :param path: path to image file
        :param h: image height
        :param w: image width
        :param channels: number of channels for the image
        :return: tuple of (processed image, classifier-ready image,
            classifications), or None if no face is detected
        """
        image = gl.Image(path)
        data = image.pixel_data.copy()
        image, face = self._image_processor.process_image(data)
        if face is None:
            return None

        face = face.flatten()
        face = face - np.mean(face)
        face /= np.std(face)

        fmin = np.min(face)
        fmax = np.max(face)

        face = np.floor(255 * (face - fmin) / (fmax - fmin))

        face_arr = gl.SArray([face.tolist()])
        clf_image = face_arr.pixel_array_to_image(h,
                                                  w,
                                                  channels,
                                                  allow_rounding=True)
        x = gl.SFrame({'images': clf_image})
        classifications = self._classifier(x)
        return image, clf_image[0], classifications
Example #6
    def _convert_to_SArray(cls, value):
        ''' Convert an input value to SArray, the logic is:

                list => an SArray of len(list) rows
                other => an SArray of one row

            Parameters
            ----------
            value : any type
                The value to be converted

            Returns
            -------
            (success, converted) : pair(bool, SArray | value)
                 'success' indicates if the conversion is successful,
                 if successful, 'converted' contains the converted value
                 otherwise, 'converted' is original value
        '''
        converted = value
        if not isinstance(converted, list):
            converted = [converted]

        # create an SArray now
        try:
            return (True, graphlab.SArray(converted))
        except Exception as e:
            logging.info(
                "Hit exception trying to convert input %s to SArray. Error: %s"
                % (value, e.message))
            return (False, value)
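For illustration, a sketch of how the conversion behaves on a list versus a single value (hypothetical calls; `MyClass` stands in for whatever class defines this classmethod):

    # ok, sa = MyClass._convert_to_SArray([1, 2, 3])   # (True, SArray with 3 rows)
    # ok, sa = MyClass._convert_to_SArray("hello")     # (True, SArray with 1 row)
    # if the SArray constructor raises, the original value comes back: (False, value)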
Example #7
    def clean_up(self):
        """Clean up after video image capturing

        This is where the actual classification is done
        """
        if self._capture:
            self._capture.release()

        self.transformed_images = None
        self.original_images = None

        if self.images is not None:
            count = 0

            self.transformed_images = []
            self.original_images = []

            images_to_gl = []
            for im in self.images:
                if im[0] is None or im[1] is None:
                    continue

                self.original_images.append(im[0])
                self.transformed_images.append(im[1])
                images_to_gl.append(im[1].flatten().tolist())
                count += 1

            x = gl.SArray(images_to_gl)
            # pixel_array_to_image returns a new SArray of images; keep the result
            x = x.pixel_array_to_image(self._w, self._h, self._d)
            x = gl.SFrame({'images': x})

            if self.classifier is not None:
                self._classifications = self.classifier(x)
Example #8
def between(row):
    # `n_dimensions` is expected to be defined in the enclosing scope
    if len(row['distances']) != n_dimensions:
        return None
    x = gl.SArray(row['distances'])
    if x.std() > .15:
        return None
    return x.mean() + x.std()
Example #9
def bipartition(cluster, maxiter=400, num_runs=4, seed=None):
    '''cluster: should be a dictionary containing the following keys
                * dataframe: original dataframe
                * matrix:    same data, in matrix format
                * centroid:  centroid for this particular cluster'''
    
    data_matrix = cluster['matrix']
    dataframe   = cluster['dataframe']
    
    # Run k-means on the data matrix with k=2. We use scikit-learn here to simplify workflow.
    kmeans_model = KMeans(n_clusters=2, max_iter=maxiter, n_init=num_runs, random_state=seed, n_jobs=1)
    kmeans_model.fit(data_matrix)
    centroids, cluster_assignment = kmeans_model.cluster_centers_, kmeans_model.labels_
    
    # Divide the data matrix into two parts using the cluster assignments.
    data_matrix_left_child = data_matrix[cluster_assignment == 0]
    data_matrix_right_child = data_matrix[cluster_assignment == 1]

    # Divide the dataframe into two parts, again using the cluster assignments.
    cluster_assignment_sa = graphlab.SArray(cluster_assignment)  # minor format conversion
    dataframe_left_child = dataframe[cluster_assignment_sa == 0]
    dataframe_right_child = dataframe[cluster_assignment_sa == 1]

    # Package relevant variables for the child clusters
    cluster_left_child  = {'matrix': data_matrix_left_child,
                           'dataframe': dataframe_left_child,
                           'centroid': centroids[0]}
    cluster_right_child = {'matrix': data_matrix_right_child,
                           'dataframe': dataframe_right_child,
                           'centroid': centroids[1]}
    
    return (cluster_left_child, cluster_right_child)
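A sketch of how bipartition might be driven to grow a small cluster hierarchy. The names `wiki` and `tf_idf_matrix` are hypothetical stand-ins for an SFrame and the matching scipy/numpy matrix built from it:

root = {'dataframe': wiki, 'matrix': tf_idf_matrix, 'centroid': None}
left_child, right_child = bipartition(root, maxiter=100, num_runs=6, seed=1)
# each child carries the same keys, so it can be split again
left_left, left_right = bipartition(left_child, maxiter=100, num_runs=6, seed=1)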
Example #10
def triple_apply_knn(features):
    # Assumes a vector `norm` (e.g. numpy.linalg.norm), `entropy` (scipy.stats.entropy)
    # and `itertools` are imported at module level, and that N (the neighbourhood
    # size) is defined globally.

    def graph_JSD(src,edge,dst):
        P = src['X1']
        Q = dst['X1']
        _P = P / norm(P, ord=1)
        _Q = Q / norm(Q, ord=1)
        _M = 0.5 * (_P + _Q)
        edge['distance'] = 0.5 * (entropy(_P, _M) + entropy(_Q, _M))
        return (src, edge, dst)

    n = len(features)
    sf = gl.SFrame(features)
    sf = sf.add_row_number('row_id')

    sg = gl.SGraph().add_vertices(sf, vid_field='row_id')

    edges = [gl.Edge(u, v, attr={'distance': None}) for (u, v) in itertools.combinations(range(n), 2)]
    sg = sg.add_edges(edges)

    sg_dist = sg.triple_apply(graph_JSD, mutated_fields=['distance'])

    #knn = sg_dist.edges.groupby("__src_id", {"knn" : gl.aggregate.CONCAT("__dst_id","distance")}).sort("__src_id")
    #top_neighbors = knn.apply(lambda row: sorted(row['knn'],key=row['knn'].get)[:N])

    top_neighbors = []
    for idx in xrange(n):
        topN_sf = sg_dist.get_edges(src_ids=[idx,None],dst_ids=[None,idx]).topk('distance',k=N,reverse=True)
        topN = topN_sf.apply(lambda row: row['__src_id'] if row['__dst_id']==idx else row['__dst_id'])
        top_neighbors.append(topN)

    return gl.SArray(top_neighbors)
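A minimal call sketch with invented values, assuming `N` has been set and `features` is a list of non-negative vectors (e.g. per-document topic proportions):

# N = 2
# features = [[0.2, 0.5, 0.3], [0.1, 0.8, 0.1], [0.4, 0.4, 0.2], [0.3, 0.3, 0.4]]
# neighbors = triple_apply_knn(features)   # per row, the ids of its N closest rows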
Example #11
    def _detect_array(self, arr, class_score_threshold=0.5, nms_threshold=0.3):
        # Fall back to the standalone sframe package if GraphLab Create is absent.
        try:
            import graphlab as _gl
        except ImportError:
            try:
                import sframe as _gl
            except ImportError:
                raise ImportError('Require GraphLab Create or SFrame')
        ret = _gl.SFrame()
        det_cnt = [0] * len(arr)
        for i in range(len(arr)):
            tmp = self._postprocess(self._detect(arr[i]),
                                    class_score_threshold, nms_threshold)
            det_cnt[i] = len(tmp)
            # if nothing detected, append a single placeholder row
            if len(tmp) == 0:
                if self._executor_with_feature:
                    tmp = _gl.SFrame({"box": [[0., 0., 0., 0.]],
                                      "class": ["nothing"],
                                      "score": [0.],
                                      "feature": [[0.]]})
                else:
                    tmp = _gl.SFrame({"box": [[0., 0., 0., 0.]],
                                      "class": ["nothing"],
                                      "score": [0.]})
                det_cnt[i] = 1
            ret = ret.append(tmp)
        image_id = []
        for i in range(len(arr)):
            image_id.extend([i] * det_cnt[i])
        ret.add_column(_gl.SArray(image_id), "id")
        return ret
Example #12
    def _create_images(self, x):
        sarray = gl.SArray(x)
        images = sarray.pixel_array_to_image(self._w,
                                             self._h,
                                             self._d,
                                             allow_rounding=True)
        return images
Example #13
def _wrap_function_return(val):
    """
    Recursively walks each thing in val, opening lists and dictionaries,
    converting all occurrences of UnityGraphProxy to an SGraph,
    UnitySFrameProxy to SFrame, and UnitySArrayProxy to SArray.
    """
    if type(val) == _UnityGraphProxy:
        return _gl.SGraph(_proxy=val)
    elif type(val) == _UnitySFrameProxy:
        return _gl.SFrame(_proxy=val)
    elif type(val) == _UnitySArrayProxy:
        return _gl.SArray(_proxy=val)
    elif type(val) == _UnityModel:
        # we need to cast it up to the appropriate type
        try:
            if '__uid__' in val.list_fields():
                uid = val.get('__uid__')
                if uid in class_uid_to_class:
                    return class_uid_to_class[uid](_proxy=val)
        except:
            pass
        return val
    elif type(val) == list:
        return [_wrap_function_return(i) for i in val]
    elif type(val) == dict:
        return {i: _wrap_function_return(val[i]) for i in val}
    else:
        return val
Example #14
def _get_gl_object_from_persistent_id(type_tag, gl_archive_abs_path):
    """
    Internal util to get a GLC object from a persistent ID in the pickle file.

    Parameters
    ----------
    type_tag : The name of the glc class as saved in the GLC pickler.

    gl_archive_abs_path: An absolute path to the GLC archive where the 
                          object was saved.

    Returns
    -------
    The GLC object.

    """
    if type_tag == "SFrame":
        obj = _gl.SFrame(gl_archive_abs_path)
    elif type_tag == "SGraph":
        obj = _gl.load_graph(gl_archive_abs_path)
    elif type_tag == "SArray":
        obj = _gl.SArray(gl_archive_abs_path)
    elif type_tag == "Model":
        obj = _gl.load_model(gl_archive_abs_path)
    else:
        raise _pickle.UnpicklingError(
            "GraphLab pickling Error: Unspported object."
            " Only SFrames, SGraphs, SArrays, and Models are supported.")
    return obj
Example #15
def uniform_numeric_column(n, col_type=float, range=(0, 1), missingness=0.):
    """
    Return an SArray of uniformly random numeric values.

    Parameters
    ----------
    n : int
        Number of entries in the output SArray.

    col_type : type, optional
        Type of the output SArray. Default is floats.

    range : tuple[int, int], optional
        Minimum and maximum of the uniform distribution from which values are
        chosen.

    missingness : float, optional
        Probability that a given entry in the output is missing.

    Returns
    -------
    out : SArray
    """
    if col_type == int:
        v = np.random.randint(low=range[0], high=range[1], size=n).astype(float)
    else:
        v = np.random.rand(n)
        v = v * (range[1] - range[0]) + range[0]

    idx_na = np.random.rand(n) < missingness
    v[idx_na] = None
    v = np.where(np.isnan(v), None, v)

    return gl.SArray(v, dtype=col_type)
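Example calls (a sketch only; the values are random, so exact output will differ):

heights = uniform_numeric_column(1000, range=(150, 200), missingness=0.05)
counts = uniform_numeric_column(1000, col_type=int, range=(0, 10))
# heights: float SArray with roughly 5% None entries
# counts: int SArray drawn from [0, 10), since numpy's randint excludes the high end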
Example #16
def parse_speakers(filename):
    """
    Clean up the description for each speaker.
    """
    speakers = gl.SArray(filename)[0]
    speakers = gl.SFrame({
        'url_suffix': speakers.keys(),
        'data': speakers.values()
    })

    def clean_bio(y):
        x = y['bio']
        x = x.replace('Website', '')
        x = x.replace('| Attendee Directory', '')
        x = x.replace('Profile', '')
        x = x.replace('\n', ' ')
        x = x.replace('\r', '')
        y['bio'] = x.strip()
        return y

    speakers['data'] = speakers['data'].apply(clean_bio)

    sdict = {}
    for s in speakers:
        k = s['url_suffix']
        sdict[k] = s['data']

    return sdict, speakers
Example #17
def polynomial_sframe(sf, feature, degree):
    sf_feature_power = graphlab.SFrame()
    sf_feature_power[feature] = sf[feature]
    if degree > 1:
        for i in range(2, degree + 1):
            sf_feature_power[feature + '_power_' + str(i)] = graphlab.SArray(
                sf[feature]).apply(lambda x: x**i)
    return sf_feature_power
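A short usage sketch in the style of the regression exercises this helper comes from; `sales` and its values are invented:

import graphlab
sales = graphlab.SFrame({'sqft_living': [1000., 1500., 2000.]})
poly3 = polynomial_sframe(sales, 'sqft_living', 3)
# poly3 columns: 'sqft_living', 'sqft_living_power_2', 'sqft_living_power_3'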
Example #18
def loadData(filename, type=None):
    data = {}
    data['label'] = []
    data['image'] = []
    input = []
    output = []
    size = 128, 128
    # print filename
    with open(filename, 'r') as f:
        for line in f:
            train = line.strip().split()
            if type == 'train':
                if len(train) < 50:
                    continue
            classID = int(train[0])
            for j in range(3):
                output.append(classID)
                #105*122 features
                image = []
                for i in range(12810):
                    image.append(0.0)  #initialize
                for i in range(1, len(train)):
                    pixel = train[i].split(':')
                    k = random.random()
                    if j == 0:
                        k = 1
                    if float(pixel[1]) < 0.1:
                        pixel[1] = 0
                    elif float(pixel[1]) < 0.4 and k < 0.3:
                        pixel[1] = 0
                    else:
                        pixel[1] = 1
                    image[int(pixel[0]) - 1] = float(pixel[1]) * 255
                input.append(image)
                if type == 'test':
                    break

    data['label'] = gl.SArray(output)
    scaled_input = gl.SArray(input)
    img_sarray = gl.SArray.pixel_array_to_image(scaled_input,
                                                105,
                                                122,
                                                1,
                                                allow_rounding=True)
    data['image'] = img_sarray
    return data
Example #19
    def test_rebuild_bitset(self):
        signal = gl.SFrame(gen_signal())
        nb_layers = signal['layer'].max() + 1  # starts at 0
        res = sptgraph_fast.aggregate_layers(signal, 'baseID', 'layer', nb_layers)
        l1 = res['layers'].apply(utils.reform_layer_int_from_blocks)
        l2 = map(lambda x: int(x, 2),
                 res['layers'].apply(sptgraph_fast.flex_bitset_to_flex_string))
        m = l1 == gl.SArray(l2)
        self.assertTrue(m.all(), 'Layers should be equal')
Example #20
def save_as_train_and_test(X, train_loc, valid_loc):

    # Can't just randomly sample the indices
    all_names = list(X["name"].unique())

    n_valid = (2 * len(all_names)) / 100

    random.shuffle(all_names)

    tr_names = gl.SArray(all_names[n_valid:])
    valid_names = gl.SArray(all_names[:n_valid])

    X_train = X.filter_by(tr_names, 'name')
    X_valid = X.filter_by(valid_names, 'name')

    X_train.save(train_loc)
    X_valid.save(valid_loc)
Example #21
def find_communities(sgraph, threads=4):
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    timestamp = timestamp + str(random.randint(1, 1000))
    input_f = _cache_dir + '/input/%s.txt' % timestamp
    if not os.path.exists(os.path.dirname(input_f)):
        os.makedirs(os.path.dirname(input_f))
    output_dir = _cache_dir + '/output/'
    if not os.path.exists(os.path.dirname(output_dir)):
        os.makedirs(os.path.dirname(output_dir))

    #input file
    vertices = sgraph.get_vertices()
    vertices = vertices.sort('__id')  # SFrame.sort returns a new, sorted SFrame
    v_idx_map = dict([(x['__id'], i) for i, x in enumerate(vertices)])
    with open(input_f, 'w') as f:
        for row in sgraph.edges:
            f.write('%s %s %s \n' %
                    (v_idx_map[row['__src_id']], v_idx_map[row['__dst_id']],
                     row['weight']))

    #run relaxmap
    command = "%(_this_dir)s/ompRelaxmap %(seed)s %(network_data)s %(threads)s %(attempts)s "+\
        "%(threshold)s %(vThresh)s %(maxIter)s %(outDir)s %(prior)s >/dev/null 2>&1"
    params = {
        '_this_dir': _this_dir,
        'seed': 1,
        'network_data': input_f,
        'threads': threads,
        'attempts': 1,
        'threshold': 1e-4,
        'vThresh': 0.0,
        'maxIter': 15,
        'outDir': output_dir,
        'prior': 'prior'
    }
    command = command % params
    os.system(command)

    #output file
    mdl = 0
    node_comm_map = {}
    with open(output_dir + '%s.tree' % timestamp, 'r') as f:
        for i, line in enumerate(f):
            if i == 0:
                mdl = float(line.split(' ')[3])
                continue
            comm_id = line.split(':')[0]
            node_id = line.split()[-1].strip('"')
            node_comm_map[node_id] = comm_id
    comm_ids = []
    for i in range(vertices.num_rows()):
        comm_id = node_comm_map.get(str(i),
                                    random.choice(node_comm_map.values()))
        comm_ids.append(comm_id)

    #vertices sframe
    vertices['community_id'] = gl.SArray(comm_ids)
    return vertices, mdl
Example #22
def tf_idf(dataset):
    """
    Compute the TF-IDF scores for each word in each document. The collection
    of documents must be in bag-of-words format.

    .. math::
        \mbox{TF-IDF}(w, d) = tf(w, d) * log(N / f(w))

    where :math:`tf(w, d)` is the number of times word :math:`w` appeared in
    document :math:`d`, :math:`f(w)` is the number of documents word :math:`w`
    appeared in, :math:`N` is the number of documents, and we use the
    natural logarithm.

    This function is implemented using
    :py:class:`~graphlab.toolkits.feature_engineering._tf_idf.TFIDF`.

    Parameters
    ----------
    dataset : SArray[str | dict | list]
        Input text data. See :py:class:`~graphlab.toolkits.feature_engineering._tf_idf.TFIDF`
        documentation for details on how string, dict, and list inputs are handled.

    Returns
    -------
    out : SArray[dict]
        The same document corpus where each score has been replaced by the
        TF-IDF transformation.

    See Also
    --------
    count_words, count_ngrams, tokenize,
    graphlab.toolkits.feature_engineering._tf_idf.TFIDF

    References
    ----------
    - `Wikipedia - TF-IDF <https://en.wikipedia.org/wiki/TFIDF>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        >>> docs = graphlab.SArray('https://static.turi.com/datasets/nips-text')
        >>> docs_tfidf = graphlab.text_analytics.tf_idf(docs)
    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.tf_idf')

    _raise_error_if_not_sarray(dataset, "dataset")

    if len(dataset) == 0:
        return _graphlab.SArray()

    dataset = _graphlab.SFrame({'docs': dataset})
    scores = _graphlab.feature_engineering.TFIDF('docs').fit_transform(dataset)

    return scores['docs']
Example #23
def adaboost_with_tree_stumps(data, features, target, num_tree_stumps):
    # start with unweighted data
    alpha = graphlab.SArray([1.] * len(data))
    weights = []
    tree_stumps = []
    target_values = data[target]

    for t in xrange(num_tree_stumps):
        print '====================================================='
        print 'Adaboost Iteration %d' % t
        print '====================================================='
        # Carry the current weights on the data so that the weighted error and
        # the weight adjustment below use this round's weights
        data['weights'] = alpha
        # Learn a weighted decision tree (note: max_depth=6 here, not a depth-1 stump)
        tree_stump = weighted_decision_tree_create(data,
                                                   features,
                                                   target,
                                                   data_weights=alpha,
                                                   max_depth=6)
        tree_stumps.append(tree_stump)

        # Make predictions
        predictions = data.apply(lambda x: classify_weighted(tree_stump, x))

        # Produce a Boolean array indicating whether
        # each data point was correctly classified
        is_correct = predictions == target_values
        is_wrong = predictions != target_values

        temp = data['weights'][is_correct == 0]

        # Compute weighted error
        #split_feature =tree_stump['splitting_feature']
        #left_split = data[data[split_feature] == 0]
        #right_split = data[data[split_feature] == 1]

        #left_data_weights = left_split['weights']
        #right_data_weights = right_split['weights']

        #left_weighted_mistakes, left_class = intermediate_node_weighted_mistakes(left_split[target], left_data_weights)
        #right_weighted_mistakes, right_class = intermediate_node_weighted_mistakes(right_split[target], right_data_weights)

        weighted_error = temp.sum() / data['weights'].sum()
        # Compute model coefficient using weighted error
        weight = math.log((1 - weighted_error) / weighted_error) / 2.0
        weights.append(weight)

        # Adjust weights on data point
        adjustment = is_correct.apply(lambda is_correct: math.exp(-weight)
                                      if is_correct else math.exp(weight))

        # Scale alpha by multiplying by adjustment
        # Then normalize data points weights
        data_weights = data['weights'] * adjustment
        su = data_weights.sum()
        data_weights = data_weights.apply(lambda x: x / su)
        alpha = data_weights

    return weights, tree_stumps
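A training sketch. `train_data`, `features` and the 'safe_loans' target are hypothetical, and weighted_decision_tree_create / classify_weighted are assumed to be defined alongside this function:

stump_weights, tree_stumps = adaboost_with_tree_stumps(
    train_data, features, target='safe_loans', num_tree_stumps=10)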
Example #24
    def getData(self, reco):
        pn = self.queryAmazon(reco['ProductId'])
        pn = [
            x.encode('utf-8') if len(x) != 0 else "Some Awesome Product"
            for x in pn
        ]
        reco.add_column(gl.SArray(pn), name='ProductName')
        rn = self.queryAmazon(reco['ProductId'], 'Images')
        rn = [
            x.encode('utf-8') if len(x) != 0 else "static/img/default.jpeg"
            for x in rn
        ]
        reco.add_column(gl.SArray(rn), name='ProductURL')
        reco = reco.pack_columns(
            columns=['score', 'rank', 'ProductName', 'ProductURL'],
            new_column_name='Details')
        df = reco.to_dataframe().set_index('ProductId')
        recommendations = df.to_dict(orient='dict')['Details']
        return recommendations
Example #25
def save_as_train_and_test(X, train_loc, valid_loc, valid_split_percent=2):

    # Can't just randomly sample the indices
    all_names = list(X["name"].unique())

    n_valid = (valid_split_percent * len(all_names)) / 100

    random.shuffle(all_names)

    tr_names = gl.SArray(all_names[n_valid:])
    valid_names = gl.SArray(all_names[:n_valid])

    X_train = X.filter_by(tr_names, 'name')
    X_valid = X.filter_by(valid_names, 'name')

    print "Saving %d images in training set" % X_train.num_rows()
    X_train.save(train_loc)
    print "Saving %d images in validation set" % X_valid.num_rows()
    X_valid.save(valid_loc)
Example #26
def loadData(filename, type=None, ignore_indices=[]):
    data = {}
    data['label'] = []
    data['image'] = []
    input = []
    output = []
    size = 128, 128

    idx = 0

    with open(filename, 'r') as f:
        for line in f:
            if idx in ignore_indices:
                idx += 1
                continue
            else:
                idx += 1
                train = line.strip().split()
                if type == 'train':
                    if len(train) < 10:
                        continue
                classID = int(train[0])
                output.append(classID)
                #105*122 features
                image = []
                for i in range(12810):
                    image.append(0.0)
                for i in range(1, len(train)):
                    pixel = train[i].split(':')
                    image[int(pixel[0]) - 1] = float(pixel[1]) * 255

                input.append(image)

    data['label'] = gl.SArray(output)
    scaled_input = gl.SArray(input)
    img_sarray = gl.SArray.pixel_array_to_image(scaled_input,
                                                105,
                                                122,
                                                1,
                                                allow_rounding=True)
    data['image'] = img_sarray
    return data
Example #27
    def _get_density_estimate(self, predictions, bandwidth=0.1):
        '''
        compute 200 numbers to represent the distribution of the predicted probability
        The output x and y are scaled to the range of [0, 1]
        '''
        output_size = 200.0

        def epanechnikov(u):
            if u > 1 or u < -1:
                return 0.0
            else:
                return 0.75 - 0.75 * u * u

        # use binned data to approximate density
        probability = predictions['mean_probability']
        weight = predictions['count']
        total_weight = predictions['count'].sum()

        x_val = _gl.SArray(range(int(output_size)))
        x_val = x_val / output_size  # [0.0, 0.005, ..., 0.995]

        estimate = []
        for i, x in enumerate(x_val):
            est = 0
            for j, prob in enumerate(probability):
                est += weight[j] * epanechnikov((x - prob) / bandwidth)
            est = est / total_weight / bandwidth
            estimate.append(est)

        # then scale the estimate to [0, 1] range
        estimate = _gl.SArray(estimate)
        est_max = estimate.max()
        est_min = min(estimate.min(), 0)
        if (est_max - est_min) > 0:
            estimate = (estimate - float(est_min)) / float(est_max - est_min)
        else:
            # min == max, it is a constant signal
            estimate = estimate * 0 + 0.5  # make it [0.5, ..., 0.5]
        ret = _gl.SFrame()
        ret['x'] = x_val
        ret['density'] = estimate
        return ret
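A sketch of the expected input and output, with invented binned predictions; `self` is whatever evaluator object owns the method:

# binned = _gl.SFrame({'mean_probability': [0.1, 0.5, 0.9], 'count': [30, 50, 20]})
# density = self._get_density_estimate(binned, bandwidth=0.1)
# density is an SFrame with 200 rows: 'x' runs from 0.0 to 0.995, 'density' is scaled to [0, 1]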
Example #28
def predict_adaboost(stump_weights, tree_stumps, data):
    scores = graphlab.SArray([0.] * len(data))

    for i, tree_stump in enumerate(tree_stumps):
        predictions = data.apply(lambda x: classify(tree_stump, x))

        # Accumulate predictions on the scores array
        # YOUR CODE HERE
        scores += stump_weights[i] * predictions

    return scores.apply(lambda score: +1 if score > 0 else -1)
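A scoring sketch that pairs this with the AdaBoost trainer from Example #23; `test_data` and the 'safe_loans' column are hypothetical:

predictions = predict_adaboost(stump_weights, tree_stumps, test_data)
accuracy = (predictions == test_data['safe_loans']).sum() / float(len(test_data))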
Example #29
    def _get_instance_and_data(cls):
        from PIL import Image as _PIL_Image
        import random
        _format = {'JPG': 0, 'PNG': 1, 'RAW': 2, 'UNDEFINED': 3}

        # Note: This needs to be added to the OSS repo as an exposed function.
        def from_pil_image(pil_img):
            height = pil_img.size[1]
            width = pil_img.size[0]
            if pil_img.mode == 'L':
                image_data = bytearray([z for z in pil_img.getdata()])
                channels = 1
            elif pil_img.mode == 'RGB':
                image_data = bytearray(
                    [z for l in pil_img.getdata() for z in l])
                channels = 3
            else:
                image_data = bytearray(
                    [z for l in pil_img.getdata() for z in l])
                channels = 4
            format_enum = _format['RAW']
            image_data_size = len(image_data)
            img = _gl.Image(_image_data=image_data,
                            _width=width,
                            _height=height,
                            _channels=channels,
                            _format_enum=format_enum,
                            _image_data_size=image_data_size)
            return img

        num_examples = 100
        dims = (28, 28)
        images = []
        random_labels = []
        for i in range(num_examples):

            def rand_image():
                return [random.randint(0, 255)] * (28 * 28)

            pil_img = _PIL_Image.new('RGB', dims)
            pil_img.putdata(list(zip(rand_image(), rand_image(),
                                     rand_image())))
            images.append(from_pil_image(pil_img))
            # collect one random binary label per example
            random_labels.append(random.randint(0, 1))

        data = _gl.SFrame({'image': _gl.SArray(images)})
        data['label'] = _gl.SArray(random_labels)
        nn_model = _gl.neuralnet_classifier.create(data, 'label')
        data.remove_column('label')
        extractor = _gl.feature_engineering.DeepFeatureExtractor(
            features=['image'], model=nn_model)
        extractor = extractor.fit(data)
        return extractor, data
Example #30
    def load_edges(cls, edge_csv, header=False):
        sf = gl.SFrame.read_csv(edge_csv,
                                header=header,
                                column_type_hints=str,
                                verbose=False)
        assert sf.num_cols() in [2, 3], "edge_csv must be 2 or 3 columns"
        if sf.num_cols() == 2:
            sa = gl.SArray([1] * sf.num_rows())
            sf.add_column(sa, 'weight')
        col_names = ['__src_id', '__dst_id', 'weight']
        rename = dict(zip(sf.column_names(), col_names))
        sf.rename(rename)
        return sf
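A usage sketch: `edges.csv` is a hypothetical src,dst,weight file and `GraphLoader` an invented name for the class that owns this classmethod:

edge_sf = GraphLoader.load_edges('edges.csv', header=True)
g = gl.SGraph().add_edges(edge_sf, src_field='__src_id', dst_field='__dst_id')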