def __init__(self):
        """Fit one recommender model per item column ``A`` .. ``F``.

        The training data is read from ``../data/train.csv`` through
        ``ProcessData`` (``get_data`` followed by ``clean_data``), restricted
        to rows whose ``record_type`` equals 1, and converted to an SFrame.
        A separate ``recommender.create`` model keyed by ``customer_ID`` is
        then trained for each item column and stored on the instance as
        ``self.modelA`` .. ``self.modelF``.
        """
        loader = ProcessData()
        frame = loader.clean_data(loader.get_data("../data/train.csv"))
        frame = frame[frame.record_type == 1]
        observations = SFrame(data=frame)
        # Release the intermediate frame once the SFrame exists.
        del frame

        # Same call for every item column; only the column name changes.
        for item_column in ("A", "B", "C", "D", "E", "F"):
            setattr(self, "model" + item_column,
                    recommender.create(observations,
                                       user_column="customer_ID",
                                       item_column=item_column))
Exemple #2
0
 def load_graph(self,
                graph_path,
                direction=1,
                start_line=0,
                limit=None,
                blacklist=set(),
                delimiter=','):
     """Add edges to ``self._graph`` from a JSON payload or a CSV file.

     ``utils.is_json(graph_path)`` decides the input kind: when it returns
     anything other than ``False`` the returned object is unpacked into an
     SFrame, otherwise ``graph_path`` is read with ``SFrame.read_csv``.
     Edges are added with ``X1`` as source and ``X2`` as destination, and
     ``self.to_undirected()`` is called when ``self.is_directed`` is falsy.

     :param graph_path: value handed to ``utils.is_json`` and, when that
         returns ``False``, to ``SFrame.read_csv``.
     :param direction: not used by this method.
     :param start_line: forwarded to ``read_csv`` as ``skiprows``.
     :param limit: forwarded to ``read_csv`` as ``nrows``.
     :param blacklist: not used by this method.
     :param delimiter: field delimiter for the CSV branch.
     """
     parsed = utils.is_json(graph_path)
     if parsed is False:
         # CSV input: headerless file, first two columns read as strings.
         edges = SFrame.read_csv(graph_path,
                                 delimiter=delimiter,
                                 header=False,
                                 column_type_hints={'X1': str, 'X2': str},
                                 nrows=limit,
                                 skiprows=start_line)
         if self._weight_field != "":
             # The return value of rename is not used here.
             edges.rename({'X3': 'Weight'})
     else:
         # JSON input: unpack the parsed object and align the column names
         # with the ones the CSV branch produces.
         edges = SFrame(SArray(parsed).unpack())
         edges.rename({'X.0': 'X1', 'X.1': 'X2', 'X.2': 'Weight'})

     self._graph = self._graph.add_edges(edges,
                                         src_field='X1',
                                         dst_field='X2')
     if not self.is_directed:
         self.to_undirected()
Exemple #3
0
    def download(symbol, start_date, end_date):
        """Fetch historical quotes for ``symbol`` and return them as an SFrame.

        The quotes come from ``Share(symbol).get_historical(start_date,
        end_date)``; the returned list is reversed in place before it is read.
        The resulting SFrame has the columns ``datetime``, ``open``, ``high``,
        ``low``, ``close`` (taken from ``Adj_Close``) and ``volume``, with
        ``datetime`` parsed from ``'%Y-%m-%d'`` strings.
        """
        history = Share(symbol).get_historical(start_date, end_date)
        # Flip the order of the returned quotes before extracting the fields.
        history.reverse()

        # One tuple per quote; fields are converted in the same order for
        # every row: date, open, high, low, adjusted close, volume.
        rows = [(quote['Date'],
                 float(quote['Open']),
                 float(quote['High']),
                 float(quote['Low']),
                 float(quote['Adj_Close']),
                 int(quote['Volume'])) for quote in history]

        if rows:
            # Transpose the row tuples into six column lists.
            dates, opens, highs, lows, closes, volumes = map(list, zip(*rows))
        else:
            dates, opens, highs, lows, closes, volumes = [], [], [], [], [], []

        sf = SFrame({
            'datetime': dates,
            'open': opens,
            'high': highs,
            'low': lows,
            'close': closes,
            'volume': volumes
        })
        # The dates are strings at this point; turn them into datetime objects.
        sf['datetime'] = sf['datetime'].apply(
            lambda text: datetime.strptime(text, '%Y-%m-%d'))
        return sf
 def get_rating_sf(self, samples, save_to=None):
     sf = SFrame(self.ratings.ix[samples])
     sf['userId'] = sf['userId'].apply(lambda uid: self.user_dict[uid])
     sf['movieId'] = sf['movieId'].apply(lambda mid: self.movie_dict[mid])
     if save_to is not None:
         print "saving sframe to", save_to
         sf.save(save_to)
     return sf
Exemple #5
0
def query_model(dogo, model, images):
    """Collect neighbours of ``dogo`` whose first image has not been seen yet.

    ``model.query(dogo, k=20)`` is resolved to rows via
    ``get_images_from_ids``. Neighbours are visited in order; a neighbour is
    kept when the first entry of its ``images`` value is not already in the
    set of seen entries, which starts with the first image of ``dogo``
    itself. Scanning stops as soon as that set holds six entries.

    Returns an SFrame with one row per kept neighbour.
    """
    neighbours = get_images_from_ids(model.query(dogo, k=20), images)
    image_list = SFrame(data=None)
    # Seeded with the query's own first image so it is never returned.
    seen_first_images = {dogo['images'][0][0]}

    for idx in range(len(neighbours)):
        if len(seen_first_images) >= 6:
            break
        first_image = neighbours[idx]['images'][0]
        if first_image in seen_first_images:
            continue
        # Slice (rather than index) to keep a one-row SFrame for appending.
        single_row = neighbours[idx:idx + 1].copy()
        image_list = image_list.append(SFrame(single_row))
        seen_first_images.add(first_image)

    return image_list
Exemple #6
0
def append_images(json_file):
    """Expand dog metadata into one row per image and save the result.

    Reads ``json_file`` (records orientation) into an SFrame. For every
    metadata row, and for every file name listed in that row's ``images``
    value, a copy of the row is extended with an ``image`` column (the image
    loaded from ``images_path + file name``) and an ``image_filename``
    column. All expanded rows are appended into a single SFrame, which is
    saved to ``prepared_data/``.

    :param json_file: path handed to ``SFrame.read_json``.
    """
    # Metadata for all dogs, one record per row.
    meta = SFrame.read_json(json_file, orient='records')
    # Accumulator for the expanded rows (metadata plus one image each).
    image_list = SFrame(data=None)

    # Visit every metadata row. The previous bound of ``len(meta) - 1``
    # silently dropped the last record.
    for i in range(len(meta)):
        # Slice to keep a one-row SFrame that can be copied and extended.
        dogo = meta[i:i + 1]
        for image in dogo['images'][0]:
            dogo_clone = dogo.copy()
            dogo_clone.add_column(SArray([(graphlab.Image(images_path + image))
                                          ]),
                                  name='image')
            dogo_clone.add_column(SArray([image]), name='image_filename')
            image_list = image_list.append(SFrame(dogo_clone))

    image_list.save(filename='prepared_data/')
Exemple #7
0
def get_vertices(k, factor0=1):
    """Build the vertex SFrame: movie vertices followed by user vertices.

    Every vertex gets a ``factors`` value of ``rand(k) * factor0``, a zero
    ``w`` vector of length ``ng + nht``, a bias ``b`` of 0 and a ``user``
    flag (0 for movies, 1 for users). Movies take their ``features`` from
    ``movies_features``; users get an empty dict.

    Relies on the module-level names ``mids``, ``movie_ids``,
    ``movies_features``, ``uids``, ``user_ids``, ``ng`` and ``nht``.
    """
    movie_vertices = SFrame({
        '__id': mids,
        'factors': [rand(k) * factor0 for _ in movie_ids],
        'w': [np.zeros(ng + nht) for _ in movie_ids],
        'b': [0 for _ in movie_ids],
        'features': movies_features,
        'user': [0 for _ in movie_ids]
    })
    user_vertices = SFrame({
        '__id': uids,
        'factors': [rand(k) * factor0 for _ in user_ids],
        'w': [np.zeros(ng + nht) for _ in user_ids],
        'b': [0 for _ in user_ids],
        'features': [{} for _ in user_ids],
        'user': [1 for _ in user_ids]
    })
    return movie_vertices.append(user_vertices)
Exemple #8
0
def train_model(filename):
    """Train a nearest-neighbours model on deep features of saved images.

    Loads the SFrame stored at ``filename`` and the model stored under
    ``model/``, adds a ``deep_features`` column produced by the loaded
    model's ``extract_features``, adds an ``id`` column numbered from 0, and
    fits ``graphlab.nearest_neighbors`` on ``deep_features`` with ``id`` as
    the label.

    :return: tuple ``(knn_model, image_train)``.
    """
    image_train = graphlab.SFrame(filename)
    feature_extractor = graphlab.load_model('model/')
    image_train['deep_features'] = feature_extractor.extract_features(
        SFrame(image_train))

    # Sequential ids let query results be mapped back to rows.
    row_ids = SArray(list(range(len(image_train))))
    image_train.add_column(row_ids, name='id')

    knn_model = graphlab.nearest_neighbors.create(
        image_train,
        features=['deep_features'],
        label='id')
    return knn_model, image_train
Exemple #9
0
def test_graphlab(num_factors=10, reg=0.01, niter=50):
    ''' test the graphlab install '''
    url = 'http://s3.amazonaws.com/GraphLab-Datasets/movie_ratings/training_data.csv'
    data = SFrame(url)
    mfac = recommender.matrix_factorization.create(data,
                                                   'user',
                                                   'movie',
                                                   'rating',
                                                   num_factors,
                                                   reg=reg,
                                                   nmf=True,
                                                   use_bias=True,
                                                   holdout_probability=0.2,
                                                   niter=niter,
                                                   random_seed=42)
    print mfac.summary
    return mfac
Exemple #10
0
def graph_lab(url, format = 'auto', flip_img = False, zoom = False):
	"""Wrap the preprocessed image at ``url`` in a one-row SFrame.

	The image is produced by ``preprocess(url)``, optionally passed through
	``flip`` (when ``flip_img``) and ``middle`` (when ``zoom``), and then
	packed into a ``graphlab.Image`` built from its raw bytes with three
	channels. Returns ``SFrame({'image': [<that image>]})``.
	"""
	if format == 'auto':
		# NOTE: the extension is extracted but not used further in this function.
		extension = url.split('.')[-1]

	pixels = preprocess(url)
	if flip_img:
		pixels = flip(pixels)
	if zoom:
		pixels = middle(pixels)

	height, width, _ = pixels.shape
	raw_bytes = bytearray(pixels)
	gl_image = graphlab.Image(_image_data=raw_bytes,
	                          _width=width,
	                          _height=height,
	                          _channels=3,
	                          _format_enum=2,
	                          _image_data_size=len(raw_bytes))

	return SFrame({'image': [gl_image]})
Exemple #11
0
def resize_images(filename):
    """Load images from ``filename``, resize them to 32x32 and save as 'mini'.

    Images are loaded non-recursively, without paths, in random order, and
    load failures are ignored. The ``image`` column is resized to 32x32 with
    four channels, wrapped in a single-element list, turned into an SFrame
    and saved under the name ``mini``.
    """
    loaded = graphlab.image_analysis.load_images(filename,
                                                 format='auto',
                                                 with_path=False,
                                                 recursive=False,
                                                 ignore_failure=True,
                                                 random_order=True)
    thumbnails = graphlab.image_analysis.resize(loaded['image'],
                                                32,
                                                32,
                                                channels=4,
                                                decode=True)
    # The SFrame is built from a one-element list holding the resized column.
    SFrame([thumbnails]).save('mini')
Exemple #12
0
def test_graphlab2(num_factors=10, reg=0.1, niter=50):
    ''' test the graphlab install with our data'''
    infile = PARS['data_dir'] + 'subset_partial.csv'
    data = SFrame(infile)
    mfac = recommender.matrix_factorization.create(data,
                                                   'id',
                                                   'brand',
                                                   'purchasequantity',
                                                   num_factors,
                                                   reg=reg,
                                                   nmf=True,
                                                   use_bias=True,
                                                   holdout_probability=0.2,
                                                   niter=niter,
                                                   random_seed=42)
    print mfac.summary
    return mfac
Exemple #13
0
    def data_frame_with_target(self, data_frame):
        """

        :param data_frame:
        :type data_frame: DataFrame
        :return:
        :rtype: SFrame
        """
        data_sframe = SFrame(data_frame.toPandas())
        sentiment_array = data_sframe.select_column('sentiment')
        target_array = []
        for x in sentiment_array:
            try:
                target_array.append(self.convert_func(x))
            except Exception as ex:
                print len(target_array), 'get_sentiments', x
                target_array.append(3)
                print ex

        data_sframe.add_column(SArray(target_array, dtype=int), name='target')
        print data_sframe
        return data_sframe.dropna()
# -------

import pandas as pd
from graphlab import SFrame
from graphlab import popularity_recommender as pr


# -----------
# Data Import
# -----------

# Reads the tab-separated ratings file 'u.data', drops the timestamp column,
# splits the rows 70/30 at random and trains/evaluates a popularity
# recommender on the 'rating' target.
col_names = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_table('u.data', names=col_names)
# Pass the axis by keyword: the positional form is rejected by pandas >= 2.0.
data = data.drop('timestamp', axis=1)

# ----------
# Data Split
# ----------

sf = SFrame(data=data, format='auto')
train, test = sf.random_split(.7)
print(len(train))
print(len(test))

# ----------------------
# Popularity Recommender
# ----------------------

recommender = pr.create(train, target='rating')
eval = recommender.evaluate(test)  # ('\nOverall RMSE: ', 1.0262364153594763)
Exemple #15
0
# For every scale in [START_SCALE, realEndScale], read the adjacency CSV under
# <OUTPUT_PATH>/tmp/AdjacentRelationships/<scale>, compute connected
# components, and export the component ids to
# <OUTPUT_PATH>/tmp/ConnectedComponents/<scale>.
outputPath = os.environ.get("OUTPUT_PATH")
startScale = int(os.environ.get("START_SCALE"))

# './tmp' holds one comma-separated line; fields 1 and 2 are read as integers.
tagFile = './tmp'
with open(tagFile, 'r') as f:
    infor = f.readline().strip().split(",")
    maxScale = int(infor[1])
    realEndScale = int(infor[2])

scaleRange = range(startScale, realEndScale + 1)

for scale in scaleRange:

    inputPath = os.path.join(outputPath, 'tmp', 'AdjacentRelationships',
                             str(scale))
    url = inputPath
    data = SFrame.read_csv(url, header=False)
    if (data.num_rows() == 0):
        # No edges at this scale: export an empty result with the same columns.
        cc_ids = SFrame({"__id": [], "component_id": []})
    else:
        # The first two columns of the headerless CSV are the edge endpoints.
        g = SGraph().add_edges(data,
                               src_field=data.column_names()[0],
                               dst_field=data.column_names()[1])
        cc = connected_components.create(g)
        cc_ids = cc.get('component_id')
    path = os.path.join(outputPath, 'tmp', 'ConnectedComponents', str(scale))
    # Use logical ``not``: bitwise ``~`` on a bool yields -1 or -2, both
    # truthy, so makedirs used to run (and fail) for existing directories.
    if not os.path.exists(path):
        os.makedirs(path)

    SFrame.export_csv(cc_ids, os.path.join(path))
Exemple #16
0
def build_data_graph():
    """Build an SGraph linking beneficiaries to conditions, procedures and drugs.

    Reads four CSV files from the hard-coded ``file_path`` directory and adds
    four batches of edges, each with ``desynpuf_id`` as the source field:

    * ``had_chronic``    -> ``chronic_condition`` (from the ``chron_*`` columns)
    * ``diagnosed_with`` -> ``dgns_cd``
    * ``underwent``      -> ``prcdr_cd``
    * ``had_drug``       -> ``substancename``

    The last three batches are restricted, via a join, to beneficiaries that
    have at least one chronic condition with value 1. The graph summary is
    printed after each batch.

    :return: the populated SGraph.
    """
    file_path = "/Users/blahiri/healthcare/documents/recommendation_system/"
    beneficiaries = SFrame.read_csv(file_path +
                                    "beneficiary_summary_2008_2009.csv")
    # Collapse all columns prefixed 'chron_' into one dict-valued column.
    bene_packed = beneficiaries.pack_columns(
        column_prefix='chron_',
        dtype=dict,
        new_column_name='chronic_conditions',
        remove_prefix=False)

    #x is a row of bene_packed in the following lambda. We insert the desynpuf_id into the (key, value) tuple, convert the tuple to a list by calling list(),
    #and the outer [] makes sure we emit a list of lists.
    bene_chrons = bene_packed.flat_map(
        ["chronic_condition_name", "chronic_condition_value", "desynpuf_id"],
        lambda x: [
            list(k + (x['desynpuf_id'], ))
            for k in x['chronic_conditions'].iteritems()
        ])

    # Keep only the (condition, patient) pairs whose value is 1, then drop the
    # value column since it is constant afterwards.
    bene_chrons = bene_chrons[bene_chrons['chronic_condition_value'] == 1]
    del bene_chrons['chronic_condition_value']
    bene_chrons.rename({'chronic_condition_name': 'chronic_condition'})

    g = SGraph()
    bene_chrons['relation'] = 'had_chronic'
    g = g.add_edges(bene_chrons,
                    src_field='desynpuf_id',
                    dst_field='chronic_condition')
    print g.summary()

    #Take out the distinct IDs of patients with chronic conditions to avoid repetition in query
    bene_with_chrons = SFrame(None)
    bene_with_chrons.add_column(bene_chrons['desynpuf_id'].unique(),
                                'desynpuf_id')

    #Add edges to the graph indicating which patient had which diagnosed condition
    tcdc = SFrame.read_csv(file_path + "transformed_claim_diagnosis_codes.csv")
    cols_to_drop = [
        'clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year'
    ]
    for column in cols_to_drop:
        del tcdc[column]
    #Same patient can be diagnosed with same condition multiple times a year, so take distinct
    tcdc = tcdc.unique()
    #Take diagnosed conditions for only those patients who had some chronic condition in 2008 or 2009. It is possible that
    #such a patient had no diagnosed condition, however.
    bene_chrons_tcdc = bene_with_chrons.join(tcdc)

    bene_chrons_tcdc['relation'] = 'diagnosed_with'
    g = g.add_edges(bene_chrons_tcdc,
                    src_field='desynpuf_id',
                    dst_field='dgns_cd')
    print g.summary()

    #Add edges to the graph indicating which patient had which procedure
    # 'prcdr_cd' is forced to str via the type hint below.
    tcpc = SFrame.read_csv(file_path + "transformed_claim_prcdr_codes.csv",
                           column_type_hints={'prcdr_cd': str})
    cols_to_drop = [
        'clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year'
    ]
    for column in cols_to_drop:
        del tcpc[column]
    tcpc = tcpc.unique()
    #Take procedures for only those patients who had some chronic condition in 2008 or 2009. It is possible that
    #such a patient had no procedure, however.
    bene_chrons_tcpc = bene_with_chrons.join(tcpc)
    bene_chrons_tcpc['relation'] = 'underwent'
    g = g.add_edges(bene_chrons_tcpc,
                    src_field='desynpuf_id',
                    dst_field='prcdr_cd')
    print g.summary()

    #Add edges to the graph indicating which patient had which medicine
    pde = SFrame.read_csv(file_path + "prescribed_drugs.csv")
    pde = pde.unique()
    #Take medicines for only those patients who had some chronic condition in 2008 or 2009. It is possible that
    #such a patient had no medicine, however.
    bene_chrons_pde = bene_with_chrons.join(pde)
    bene_chrons_pde['relation'] = 'had_drug'
    g = g.add_edges(bene_chrons_pde,
                    src_field='desynpuf_id',
                    dst_field='substancename')
    print g.summary()

    return g
def get_sf_from_coo(coo, save_to):
    sf = SFrame({'userId': coo.row, 'movieId': coo.col, 'rating': coo.data})
    if save_to is not None:
        print "saving sframe to", save_to
        sf.save(save_to)
    return sf