Example #1
    def train(self, model_cont, user_data_path=None):

        #get training status of model container
        train_status = model_cont['train_status']

        if train_status != "trained":

            #load the data to train
            data_loader = DataLoader()
            dataset = data_loader.load_user_data(user_data_path)
            features, labels = dataset['features'], dataset['labels']

            #load model specific parameters (assumed to be stored on the
            #container, as in the regression variant in Example #2)
            params = model_cont.get('parameters', {})

            #optionally restrict training to the train split
            if params.get("train_test_split"):
                data_split = params['train_test_split']
                #get_trainset is assumed to return a dict with 'features'/'labels'
                trainset = DataProcessor().get_trainset(features, labels, data_split)
                features, labels = trainset['features'], trainset['labels']

            #train the model
            clf = svc()
            clf.fit(features, labels)
            pkl_file = pdumps(clf)

            #update the model object with the results of training
            model_cont['learned_model'] = Binary(pkl_file)
            model_cont['train_status'] = "trained"

            return model_cont

        else:
            print("Already trained")
            return False
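
To use the trained container later, the stored blob can be unpickled back into a working estimator. A minimal standalone sketch (the predict helper below is hypothetical; it relies on bson.Binary being a bytes subclass and on pickle.loads imported as ploads):

from pickle import loads as ploads

def predict(model_cont, features):
    #hypothetical helper: restore the classifier pickled by train() above
    if model_cont['train_status'] != "trained":
        raise ValueError("model has not been trained yet")
    clf = ploads(model_cont['learned_model'])  #Binary behaves like bytes
    return clf.predict(features)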
Example #2
    def train(self, model_cont, user_data=None):

        train_status = model_cont['train_status']

        if train_status != "trained":

            #load the data to train
            data_loader = DataLoader()
            dataset = data_loader.load_user_data(user_data)

            #load the model specific parameters
            alpha = model_cont['parameters']['alpha']

            #train the model (the regressor is assumed to accept the
            #regularization strength as its first argument)
            clf = LinearRegression(alpha)
            clf.fit(dataset['features'], dataset['labels'])
            pkl_file = pdumps(clf)

            #update the model object with the results of training
            model_cont['learned_model'] = Binary(pkl_file)
            model_cont['train_status'] = "trained"

            return model_cont

        else:
            print("Already trained")
            return True
Example #3
def build_product_model(host, port, **kwargs):
    prod_model_data = 'prod_model_data.pickle'
    print("Loading products from database:")
    prod_filt = {'comodegenic': {'$type': 'int'}}  # Only return entries with comodegenic score
    prod_prjctn = {
        'ingredient_list': True,
        'comodegenic': True}
    db_objects = PRODUCTS_DB.read(prod_filt, projection=prod_prjctn)
    products = [DB_Object.build_from_dict(p) for p in db_objects]

    # The tfidf_vect will ignore the following words
    stop_words = [
        '',
        'water',
        'glycerin',
        'titanium dioxide',
        'iron oxides',
        'beeswax',
        'methylparaben',
        'propylparaben',
        'propylene glycol',
        'panthenol',
        'mica']

    # Tokenizer for product ingredient lists
    def get_prod_ings_as_list(product):
        '''
        Queries the ingredients DB for a given product's ingredient list
        and returns the ingredient list as a list of ingredient strings
        Note: The DB query is performed once using all ingredient object
        IDs simultaneously.
        '''
        fltr = {'_id': {'$in': product.get('ingredient_list', [])}}
        ing_prjctn = {'_id': False, 'ingredient_name': True}
        db_objects = INGREDIENTS_DB.read(fltr, projection=ing_prjctn)
        return [DB_Object.build_from_dict(i).get('ingredient_name', '') for i in db_objects]

    print('Vectorizing product ingredient lists')
    tfidf_vect = TfidfVectorizer(
        tokenizer=get_prod_ings_as_list,
        lowercase=False,
        stop_words=stop_words)
    X = tfidf_vect.fit_transform(products)
    y = [p['comodegenic'] for p in products]

    print('Storing vectorized data and training labels')
    # Bundle the vectorized features (CSR sparse matrix) and training labels
    model = {
        'X': X,
        'y': y
    }

    print("Saving model data to disk for next time")
    # Insert the model into the model database
    MODEL_DB.create_file(pdumps(model, protocol=2), filename="ml_product_data")
    # Save model data to disk
    with open(prod_model_data, "wb") as pickle_out:
        pdump(model, pickle_out)
    print('[SUCCESS] Product model data post-processed and stored')
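
Reading the pickled file back is the mirror of the save step; a minimal reload sketch, assuming the prod_model_data.pickle layout written above (load_product_model is a hypothetical helper):

from pickle import load as pload

def load_product_model(path='prod_model_data.pickle'):
    # hypothetical helper: restore the sparse feature matrix X and labels y
    with open(path, 'rb') as pickle_in:
        model = pload(pickle_in)
    return model['X'], model['y']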
Example #4
    def dump_pending_points(self):
        result = [(name, pdumps({
            'xyz': xyz,
            'rgb': rgb
        }), len(xyz)) for name, xyz, rgb in self._get_pending_points()]

        self.pending_xyz = []
        self.pending_rgb = []
        return result
Example #5
    def save_to_bytes(self):
        sub_pickle = {}
        if self.children is not None:
            sub_pickle['children'] = self.children
            sub_pickle['grid'] = self.grid
        else:
            sub_pickle['points'] = self.points

        return pdumps(sub_pickle)

    def dump(self, name, max_depth):
        """Serialize the stored nodes to a bytes list"""
        node = self.nodes[name]
        if node.dirty:
            self.node_bytes[name] = node.save_to_bytes()

        if node.children is not None and max_depth > 0:
            for n in node.children:
                self.dump(n, max_depth - 1)

        return pdumps(self.node_bytes)
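
The inverse of save_to_bytes() is not shown in this listing; a minimal sketch under the assumption that nodes keep the same fields (load_from_bytes is hypothetical):

from pickle import loads as ploads

def load_from_bytes(node, data):
    # hypothetical inverse of save_to_bytes(): a node stores either
    # children + grid or a points payload
    sub_pickle = ploads(data)
    if 'children' in sub_pickle:
        node.children = sub_pickle['children']
        node.grid = sub_pickle['grid']
    else:
        node.points = sub_pickle['points']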
Example #7
def analyze_url(self, parsed):
    setup_task_logger(parsed)
    log_string("Start analyzing", task=parsed['task'])
    temp_container = None
    try:
        parsed["domain"] = ""
        try:
            extracted = textract(parsed['buffer'])
            parsed["domain"] = "{}.{}".format(extracted.domain, extracted.suffix)
        except Exception:
            pass
        log_string(parsed["domain"], task=parsed['task'])
        parsed['locations'] = json_settings[environ["project_env"]]["task_logs"]
        if parsed['use_proxy']:
            log_string("Proxy detected", task=parsed['task'])
        else:
            log_string("No proxy, running privileged for custom tor config", task=parsed['task'])
        temp_container = DOCKER_CLIENT.containers.run(
            "url-sandbox_box",
            command=[hexlify(pdumps(parsed)).decode()],
            volumes={json_settings[environ["project_env"]]["output_folder"]: {
                'bind': json_settings[environ["project_env"]]["task_logs"]["box_output"], 'mode': 'rw'}},
            detach=True,
            network="url-sandbox_frontend_box",
            privileged=not parsed['use_proxy'])
        temp_logs = ""
        for _ in range(1, parsed['analyzer_timeout']):
            temp_logs = temp_container.logs()
            if len(temp_logs) > 1:
                if temp_logs.endswith(b"Done!!\n"):
                    break
            sleep(1)
        temp_container.stop()
        if len(temp_logs) > 0:
            for item in temp_logs.split(b"\n"):
                with ignore_excpetion():
                    if len(item) > 0:
                        log_string(item.decode("utf-8"), task=parsed['task'])
        log_string("Parsing output", task=parsed['task'])
        parsed['locations']['box_output'] = json_settings[environ["project_env"]]["output_folder"]
    except Exception as e:
        log_string("Error -> {}".format(e), task=parsed['task'])
    #clean_up()
    try:
        if temp_container is not None:
            temp_container.stop()
            temp_container.remove()
    except Exception as e:
        log_string("Error -> {}".format(e), task=parsed['task'])
    make_report(parsed)
    cancel_task_logger(parsed['task'])
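
The sandbox container receives the whole parsed dict as a hex-encoded pickle on its command line; a minimal sketch of the receiving side, assuming the box entrypoint gets that hex string as its first argument (read_task is hypothetical):

from binascii import unhexlify
from pickle import loads as ploads
from sys import argv

def read_task():
    # hypothetical box-side decoder for hexlify(pdumps(parsed)).decode()
    return ploads(unhexlify(argv[1]))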
Example #8
def run(_id, filename, offset_scale, portion, queue, projection, verbose):
    """
    Reads points from a xyz file
    """
    try:
        f = open(filename, "r")

        point_count = portion[1] - portion[0]

        step = min(point_count, max((point_count) // 10, 100000))

        f.seek(portion[2])

        for i in range(0, point_count, step):
            points = np.zeros((step, 3), dtype=np.float32)

            for j in range(0, step):
                line = f.readline()
                if not line:
                    points = np.resize(points, (j, 3))
                    break
                points[j] = [float(s) for s in line.split(" ")]

            x, y, z = [points[:, c] for c in [0, 1, 2]]

            if projection:
                x, y, z = pyproj.transform(projection[0], projection[1], x, y,
                                           z)

            x = (x + offset_scale[0][0]) * offset_scale[1][0]
            y = (y + offset_scale[0][1]) * offset_scale[1][1]
            z = (z + offset_scale[0][2]) * offset_scale[1][2]

            coords = np.vstack((x, y, z)).transpose()

            if offset_scale[2] is not None:
                # Apply transformation matrix (because the tile's transform will contain
                # the inverse of this matrix)
                coords = np.dot(coords, offset_scale[2])

            coords = np.ascontiguousarray(coords.astype(np.float32))

            # Read colors (this xyz reader has no color data, so default to
            # white, sized to match the coords actually read in this chunk)
            colors = np.full((len(coords), 3), 255, dtype=np.uint8)
            queue.send_multipart(
                [
                    "".encode("ascii"),
                    pdumps({
                        "xyz": coords,
                        "rgb": colors
                    }),
                    struct.pack(">I", len(coords)),
                ],
                copy=False,
            )

        queue.send_multipart([pdumps({"name": _id, "total": 0})])
        # notify we're idle
        queue.send_multipart([b""])

        f.close()
    except Exception as e:
        print("Exception while reading points from xyz file")
        print(e)
        traceback.print_exc()
def run(_id, filename, offset_scale, portion, queue, transformer, verbose):
    """
    Reads points from a xyz file

    Consider XYZIRGB format following FME documentation(*). If the number of
    features does not correspond (i.e. does not equal to 7), we do the
    following hypothesis:
    - 3 features mean XYZ
    - 4 features mean XYZI
    - 6 features mean XYZRGB

    (*) See: https://docs.safe.com/fme/html/FME_Desktop_Documentation/FME_ReadersWriters/pointcloudxyz/pointcloudxyz.htm
    """
    try:
        f = open(filename, "r")

        point_count = portion[1] - portion[0]

        step = min(point_count, max((point_count) // 10, 100000))

        f.seek(portion[2])

        feature_nb = 7

        for i in range(0, point_count, step):
            points = np.zeros((step, feature_nb), dtype=np.float32)

            for j in range(0, step):
                line = f.readline()
                if not line:
                    points = np.resize(points, (j, feature_nb))
                    break
                line_features = [float(s) for s in line.split(" ")]
                if len(line_features) == 3:
                    line_features += [None] * 4  # Insert intensity and RGB
                elif len(line_features) == 4:
                    line_features += [None] * 3  # Insert RGB
                elif len(line_features) == 6:
                    line_features.insert(3, None)  # Insert intensity
                points[j] = line_features

            x, y, z = [points[:, c] for c in [0, 1, 2]]

            if transformer:
                x, y, z = transformer.transform(x, y, z)

            x = (x + offset_scale[0][0]) * offset_scale[1][0]
            y = (y + offset_scale[0][1]) * offset_scale[1][1]
            z = (z + offset_scale[0][2]) * offset_scale[1][2]

            coords = np.vstack((x, y, z)).transpose()

            if offset_scale[2] is not None:
                # Apply transformation matrix (because the tile's transform will contain
                # the inverse of this matrix)
                coords = np.dot(coords, offset_scale[2])

            coords = np.ascontiguousarray(coords.astype(np.float32))

            # Read colors: 3 last columns of the point cloud
            colors = points[:, -3:].astype(np.uint8)

            queue.send_multipart(
                [
                    "".encode("ascii"),
                    pdumps({
                        "xyz": coords,
                        "rgb": colors
                    }),
                    struct.pack(">I", len(coords)),
                ],
                copy=False,
            )

        queue.send_multipart([pdumps({"name": _id, "total": 0})])
        # notify we're idle
        queue.send_multipart([b""])

        f.close()
    except Exception as e:
        print("Exception while reading points from xyz file")
        print(e)
        traceback.print_exc()
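
On the other side of the queue, the frames sent above have to be unpacked in the same order; a minimal consumer sketch, assuming a pyzmq socket connected to the same endpoint (handle_message is hypothetical):

import struct
from pickle import loads as ploads

def handle_message(socket):
    # hypothetical consumer for the worker messages sent above
    frames = socket.recv_multipart()
    if frames == [b""]:
        return None                   # worker signalled it is idle
    if len(frames) == 1:
        return ploads(frames[0])      # e.g. {'name': _id, 'total': 0}
    _, payload, packed_len = frames
    data = ploads(payload)            # {'xyz': coords, 'rgb': colors}
    count = struct.unpack(">I", packed_len)[0]
    assert count == len(data["xyz"])
    return data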
Example #10
def test_py_pickle():
   """ Tests: test_py_pickle frompickle
   """
   print('::: TEST: test_py_pickle()')
   edict_with_all = _get_orig__edict_with_all()
   new_reobj_all__pdumps = pdumps(edict_with_all)
   new_reobj_all = ploads(new_reobj_all__pdumps)

   ok_(edict_with_all == new_reobj_all, msg=None)
   ok_(isinstance(new_reobj_all, Edict), msg=None)
   ok_(edict_with_all.extra_data == new_reobj_all.extra_data, msg=None)
   ok_(new_reobj_all.extra_data['edict extra2'] == 'edict extra_value2', msg=None)

   ok_(edict_with_all['edict1'] == new_reobj_all['edict1'], msg=None)
   ok_(isinstance(new_reobj_all['edict1'], Edict), msg=None)
   ok_(edict_with_all['edict1'].extra_data == new_reobj_all['edict1'].extra_data, msg=None)
   ok_(new_reobj_all['edict1'].extra_data['edict_obj.edict1 extra2'] == 'edict_obj.edict1 extra_value2', msg=None)

   ok_(edict_with_all['rdict1'] == new_reobj_all['rdict1'], msg=None)
   ok_(isinstance(new_reobj_all['rdict1'], Rdict), msg=None)
   ok_(edict_with_all['rdict1'].extra_data == new_reobj_all['rdict1'].extra_data, msg=None)
   ok_(new_reobj_all['rdict1'].extra_data['edict_obj.rdict1 extra2'] == 'edict_obj.rdict1 extra_value2', msg=None)

   ok_(edict_with_all['edictf1'] == new_reobj_all['edictf1'], msg=None)
   ok_(isinstance(new_reobj_all['edictf1'], RdictF), msg=None)
   ok_(edict_with_all['edictf1'].extra_data == new_reobj_all['edictf1'].extra_data, msg=None)
   ok_(new_reobj_all['edictf1'].extra_data['edict_obj.edictf1 extra2'] == 'edict_obj.edictf1 extra_value2', msg=None)

   ok_(edict_with_all['edictio1'] == new_reobj_all['edictio1'], msg=None)
   ok_(isinstance(new_reobj_all['edictio1'], RdictIO), msg=None)
   ok_(edict_with_all['edictio1'].extra_data == new_reobj_all['edictio1'].extra_data, msg=None)
   ok_(new_reobj_all['edictio1'].extra_data['edict_obj.edictio1 extra2'] == 'edict_obj.edictio1 extra_value2', msg=None)
   ok_(edict_with_all['edictio1'].key_order == new_reobj_all['edictio1'].key_order, msg=None)
   ok_(new_reobj_all['edictio1'].key_order == ['edictio_inner1', 'edictio_inner2', 'edictio_inner3'], msg=None)
   ok_(edict_with_all['edictio1'].extra_key_order == new_reobj_all['edictio1'].extra_key_order, msg=None)
   ok_(new_reobj_all['edictio1'].extra_key_order == ['edictio_inner2', 'edictio_inner3', 'edictio_inner1'], msg=None)

   ok_(edict_with_all['edictfo1'] == new_reobj_all['edictfo1'], msg=None)
   ok_(isinstance(new_reobj_all['edictfo1'], RdictFO), msg=None)
   ok_(edict_with_all['edictfo1'].extra_data == new_reobj_all['edictfo1'].extra_data, msg=None)
   ok_(new_reobj_all['edictfo1'].extra_data['edict_obj.edictfo1 extra2'] == 'edict_obj.edictfo1 extra_value2', msg=None)
   ok_(edict_with_all['edictfo1'].key_order == new_reobj_all['edictfo1'].key_order, msg=None)
   ok_(new_reobj_all['edictfo1'].key_order == ['edictfo_inner1', 'edictfo_inner2', 'edictfo_inner3'], msg=None)
   ok_(edict_with_all['edictfo1'].extra_key_order == new_reobj_all['edictfo1'].extra_key_order, msg=None)
   ok_(new_reobj_all['edictfo1'].extra_key_order == ['edictfo_inner2', 'edictfo_inner3', 'edictfo_inner1'], msg=None)

   ok_(edict_with_all['edictfo2_1'] == new_reobj_all['edictfo2_1'], msg=None)
   ok_(isinstance(new_reobj_all['edictfo2_1'], RdictFO2), msg=None)
   ok_(edict_with_all['edictfo2_1'].extra_data == new_reobj_all['edictfo2_1'].extra_data, msg=None)
   ok_(new_reobj_all['edictfo2_1'].extra_data['edict_obj.edictfo2_1 extra2'] == 'edict_obj.edictfo2_1 extra_value2', msg=None)
   ok_(edict_with_all['edictfo2_1'].key_order == new_reobj_all['edictfo2_1'].key_order, msg=None)
   ok_(new_reobj_all['edictfo2_1'].key_order == ['edictfo2_inner1', 'edictfo2_inner2', 'edictfo2_inner3'], msg=None)
   ok_(edict_with_all['edictfo2_1'].extra_key_order == new_reobj_all['edictfo2_1'].extra_key_order, msg=None)
   ok_(new_reobj_all['edictfo2_1'].extra_key_order == ['edictfo2_inner2', 'edictfo2_inner3', 'edictfo2_inner1'], msg=None)

   ok_(edict_with_all['elist1'] == new_reobj_all['elist1'], msg=None)
   ok_(isinstance(new_reobj_all['elist1'], Elist), msg=None)
   ok_(edict_with_all['elist1'].extra_data == new_reobj_all['elist1'].extra_data, msg=None)
   ok_(new_reobj_all['elist1'].extra_data['edict_obj.elist1 extra2'] == 'edict_obj.elist1 extra_value2', msg=None)

   ok_(edict_with_all['rlist1'] == new_reobj_all['rlist1'], msg=None)
   ok_(isinstance(new_reobj_all['rlist1'], Rlist), msg=None)
   ok_(edict_with_all['rlist1'].extra_data == new_reobj_all['rlist1'].extra_data, msg=None)
   ok_(new_reobj_all['rlist1'].extra_data['edict_obj.rlist1 extra2'] == 'edict_obj.rlist1 extra_value2', msg=None)

   ok_(edict_with_all['rlistf1'] == new_reobj_all['rlistf1'], msg=None)
   ok_(isinstance(new_reobj_all['rlistf1'], RlistF), msg=None)
   ok_(edict_with_all['rlistf1'].extra_data == new_reobj_all['rlistf1'].extra_data, msg=None)
   ok_(new_reobj_all['rlistf1'].extra_data['edict_obj.rlistf1 extra2'] == 'edict_obj.rlistf1 extra_value2', msg=None)

   ok_(edict_with_all['etuple1'] == new_reobj_all['etuple1'], msg=None)
   ok_(isinstance(new_reobj_all['etuple1'], Etuple), msg=None)
   ok_(edict_with_all['etuple1'].extra_data == new_reobj_all['etuple1'].extra_data, msg=None)
   ok_(new_reobj_all['etuple1'].extra_data['edict_obj.etuple1 extra2'] == 'edict_obj.etuple1 extra_value2', msg=None)

   ok_(edict_with_all['lmatrix1'] == new_reobj_all['lmatrix1'], msg=None)
   ok_(isinstance(new_reobj_all['lmatrix1'], Lmatrix), msg=None)
   ok_(edict_with_all['lmatrix1'].extra_data == new_reobj_all['lmatrix1'].extra_data, msg=None)
   ok_(new_reobj_all['lmatrix1'].extra_data['edict_obj.lmatrix1 extra2'] == 'edict_obj.lmatrix1 extra_value2', msg=None)

   ok_(edict_with_all['lmatrixf1'] == new_reobj_all['lmatrixf1'], msg=None)
   ok_(isinstance(new_reobj_all['lmatrixf1'], LmatrixF), msg=None)
   ok_(edict_with_all['lmatrixf1'].extra_data == new_reobj_all['lmatrixf1'].extra_data, msg=None)
   ok_(new_reobj_all['lmatrixf1'].extra_data['edict_obj.lmatrixf1 extra2'] == 'edict_obj.lmatrixf1 extra_value2', msg=None)

   # some data checks
   ok_(edict_with_all['edictfo1']['edictfo_inner2'] == new_reobj_all['edictfo1']['edictfo_inner2'] and new_reobj_all['edictfo1']['edictfo_inner2'] == 'edictfo_inner2 value', msg=None)
   ok_(edict_with_all['rlist1'][1] == new_reobj_all['rlist1'][1] and new_reobj_all['rlist1'][1] == 'rlist_inner value2', msg=None)

   ok_(edict_with_all['lmatrixf1'].this_column_values('name') == new_reobj_all['lmatrixf1'].this_column_values('name') and new_reobj_all['lmatrixf1'].this_column_values('name') == ['darkorange', 'flesh', 'firebrick 3'], msg=None)
   ok_(edict_with_all['lmatrixf1'][1][2] == new_reobj_all['lmatrixf1'][1][new_reobj_all['lmatrixf1'].column_names_idx_lookup['green']] and new_reobj_all['lmatrixf1'][1][2] == 125, msg=None)

   # Change original
   edict_with_all['etuple1'].replace_extra_data({'edict_obj.etuple1 UPDATED': 'UPDATED'})

   ok_(edict_with_all['etuple1'] == new_reobj_all['etuple1'], msg=None)
   ok_(isinstance(new_reobj_all['etuple1'], Etuple), msg=None)
   ok_(edict_with_all['etuple1'].extra_data != new_reobj_all['etuple1'].extra_data, msg=None)
   ok_(new_reobj_all['etuple1'].extra_data['INFO'] == 'edict_obj.etuple1 inner', msg=None)
   edict_with_all = {}
   ok_(isinstance(new_reobj_all['etuple1'], Etuple), msg=None)
   ok_(new_reobj_all['etuple1'].extra_data['INFO'] == 'edict_obj.etuple1 inner', msg=None)

   ok_(isinstance(new_reobj_all['lmatrix1'], Lmatrix), msg=None)
   ok_(new_reobj_all['lmatrix1'].extra_data['INFO'] == 'edict_obj.lmatrix1 inner', msg=None)
   ok_(new_reobj_all['lmatrix1'].column_names == ('name', 'red', 'green', 'blue'), msg=None)
Example #11
def run(_id, filename, offset_scale, portion, queue, projection, verbose):
    '''
    Reads points from a las file
    '''
    try:
        f = laspy.file.File(filename, mode='r')

        point_count = portion[1] - portion[0]

        step = min(point_count, max((point_count) // 10, 100000))

        indices = [i for i in range(math.ceil((point_count) / step))]

        color_scale = offset_scale[3]

        file_points = f.get_points()['point']
        X = file_points['X']
        Y = file_points['Y']
        Z = file_points['Z']
        # todo: attributes
        if 'red' in f.point_format.lookup:
            RED = file_points['red']
            GREEN = file_points['green']
            BLUE = file_points['blue']
        else:
            RED = file_points['intensity']
            GREEN = file_points['intensity']
            BLUE = file_points['intensity']

        for index in indices:
            start_offset = portion[0] + index * step
            num = min(step, portion[1] - start_offset)

            # read scaled values and apply offset
            x = X[start_offset:start_offset +
                  num] * f.header.scale[0] + f.header.offset[0]
            y = Y[start_offset:start_offset +
                  num] * f.header.scale[1] + f.header.offset[1]
            z = Z[start_offset:start_offset +
                  num] * f.header.scale[2] + f.header.offset[2]

            if projection:
                x, y, z = pyproj.transform(projection[0], projection[1], x, y,
                                           z)

            x = (x + offset_scale[0][0]) * offset_scale[1][0]
            y = (y + offset_scale[0][1]) * offset_scale[1][1]
            z = (z + offset_scale[0][2]) * offset_scale[1][2]

            coords = np.vstack((x, y, z)).transpose()

            if offset_scale[2] is not None:
                # Apply transformation matrix (because the tile's transform will contain
                # the inverse of this matrix)
                coords = np.dot(coords, offset_scale[2])

            coords = np.ascontiguousarray(coords.astype(np.float32))

            # Read colors
            red = RED[start_offset:start_offset + num]
            green = GREEN[start_offset:start_offset + num]
            blue = BLUE[start_offset:start_offset + num]

            if color_scale is None:
                red = red.astype(np.uint8)
                green = green.astype(np.uint8)
                blue = blue.astype(np.uint8)
            else:
                red = (red * color_scale).astype(np.uint8)
                green = (green * color_scale).astype(np.uint8)
                blue = (blue * color_scale).astype(np.uint8)

            colors = np.vstack((red, green, blue)).transpose()

            queue.send_multipart([
                ''.encode('ascii'),
                pdumps({
                    'xyz': coords,
                    'rgb': colors
                }),
                struct.pack('>I', len(coords))
            ], copy=False)

        queue.send_multipart([pdumps({'name': _id, 'total': 0})])
        # notify we're idle
        queue.send_multipart([b''])

        f.close()
    except Exception as e:
        print('Exception while reading points from las file')
        print(e)
        traceback.print_exc()
Example #12
def build_people_model(host, port, **kwargs):
    global PROD_COMO
    ppl_model_data = 'ppl_model_data.pickle'
    batch_size = kwargs.get('batch_size', 10000)
    vocabulary = get_ingredient_vocabulary(host, port)

    # The tfidf_vect will ignore the following words
    stop_words = [
        '',
        'water',
        'glycerin',
        'titanium dioxide',
        'iron oxides',
        'beeswax',
        'methylparaben',
        'propylparaben',
        'propylene glycol',
        'panthenol',
        'mica']

    # Create vectorizers
    d_vect = DictVectorizer(sparse=False)
    tfidf_vect = TfidfVectorizer(
        tokenizer=get_ingredients_as_list,
        lowercase=False,
        stop_words=stop_words,
        vocabulary=vocabulary)

    print("Loading people from database, batch_size:", str(batch_size))
    ppl_filt = {}
    ppl_prjctn = {
        '_id': False,
        'race': True,
        'birth_sex': True,
        'age': True,
        'acne': True,
        'skin': True,
        'acne_products': True}  # Don't include any PII
    db_objects = PEOPLE_DB.read(ppl_filt, projection=ppl_prjctn)

    y, demo_mult = [], []
    batch_num, pulled = 0, 0
    X = None

    # Work in batches to build dataset
    while pulled <= db_objects.count(with_limit_and_skip=True):
        # Initialize
        X_demo_lst, X_prod_lst = [], []
        people = []

        print('Parsing batch:', batch_num)

        try:
            # Build a batch
            for i in range(batch_size):
                people.append(DB_Object.build_from_dict(db_objects.next()))
                pulled += 1
        except StopIteration:
            # End of available data
            break

        # Extract features
        for person in people:
            # Create new entry for each product
            # Note: Model is only applicable to entries with products
            for product_id in person.pop('acne_products'):
                # Pull product ingredients info
                X_prod_lst.append([product_id])

                # Pull demographic info
                X_demo_lst.append(person)

                # Generate demographic multiplier
                mult = get_multiplier(person)
                demo_mult.append(mult)

        # Vectorize data
        X_demo = d_vect.fit_transform(X_demo_lst)  # X_demo is now a numpy array
        X_prod = tfidf_vect.fit_transform(X_prod_lst)  # X_prod is now a CSR sparse matrix

        # Add batch result to output matrix
        if X is not None:
            X_t = hstack([csr_matrix(X_demo), X_prod], format="csr")
            try:
                X = vstack([X, X_t], format="csr")
            except ValueError:
                break
        else:
            # Initialize X
            X = hstack([csr_matrix(X_demo), X_prod], format="csr")

        batch_num += 1

    for como, mult in zip(PROD_COMO, demo_mult):
        val = como * mult
        if val < 6:
            y.append(0)
        elif val < 12:
            y.append(1)
        else:
            y.append(2)

    print('Storing vectorized data and training labels')
    # Bundle the sparse feature matrix, labels, and fitted vectorizers
    model = {
        'X': X,
        'y': y,
        'd_vect': d_vect,
        'tfidf_vect': tfidf_vect,
        'vocabulary': vocabulary
    }

    print("Saving model data to disk for next time")
    # Insert the model into the model database
    MODEL_DB.create_file(pdumps(model, protocol=2), filename="ml_people_data")
    # Save model data to disk
    with open(ppl_model_data, "wb") as pickle_out:
        pdump(model, pickle_out)
    print('[SUCCESS] People model data post-processed and stored')
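
At prediction time the stored vectorizers turn a new (person, product) pair into a feature row shaped like X; a minimal sketch, assuming a model dict as saved above and some estimator clf already fitted on X and y (predict_for is hypothetical):

from scipy.sparse import csr_matrix, hstack

def predict_for(person, product_id, model, clf):
    # hypothetical helper: person is a demographic dict, product_id a product ID
    x_demo = model['d_vect'].transform([person])
    x_prod = model['tfidf_vect'].transform([[product_id]])
    x_row = hstack([csr_matrix(x_demo), x_prod], format="csr")
    return clf.predict(x_row)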
Example #13
                        if day.attrs['class'] == ['ProductionCalendar_holiday']:
                            self.days[date_day] = 'holiday'
                        elif day.attrs['class'] == ['ProductionCalendar_preholiday']:
                            self.days[date_day] = 'short'
                        else:
                            self.days[date_day] = 'work'

    def serialize(self):
        return dumps({day.isoformat(): kind for day, kind in self.days.items()})


# example:

if __name__ == '__main__':
    s = SuperjobCalendarParser('http://www.superjob.ru/proizvodstvennyj_kalendar/', debug=True)

    all_days = s.days
    print(all_days.get(date(2012, 2, 2)))
    print(all_days.get(date(2008, 1, 23)))
    print(all_days.get(date(2014, 1, 7)))
    print(all_days.get(date(2014, 2, 2)))

    # save as json
    json = s.serialize()
    with open('days.json', 'w') as f:
        f.write(json)

    # save as pickle (pickle.dumps returns bytes, so open the file in binary mode)
    with open('days.pickle', 'wb') as f:
        f.write(pdumps(all_days))
Example #14
def remote_action():
    vbox, x, y, s, key = request.json
    t = (int(x), int(y), 0, 0, s, key)
    vm_name_action = "{}_action".format(all_boxes[vbox]["vm"])
    r.set(vm_name_action, pdumps(t, protocol=2))
    return jsonify(status="sent")
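
The VM-side agent is expected to read the same key back from Redis and unpickle the tuple; a minimal sketch, assuming a redis client r and the key naming used above (poll_action is hypothetical):

from pickle import loads as ploads

def poll_action(r, vm_name):
    # hypothetical consumer: returns the (x, y, 0, 0, s, key) tuple or None
    raw = r.get("{}_action".format(vm_name))
    if raw is None:
        return None
    return ploads(raw)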
Example #15
def do_task_and_print(rdd, partition_number):
    '''Do a task, serialize the result'''
    result = do_task(rdd, partition_number)
    # Note: json.dumps needs a text-safe payload, so this relies on an
    # ASCII (protocol 0) pickle rather than a binary one.
    result = {'payload': pdumps(list(result)),
              'partition_number': partition_number}
    printer(jdumps(result))


def hash_obj(*args):
    return '_{:x}'.format(abs(hash(pdumps(args))))
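
Note that hash() on the pickle bytes is salted per interpreter run, so hash_obj is only stable within one process; a sketch of a cross-process alternative using a hashlib digest of the same pickle (stable_hash_obj is hypothetical, and it still assumes the arguments pickle deterministically):

from hashlib import sha1
from pickle import dumps as pdumps

def stable_hash_obj(*args):
    # digest of the pickled arguments; stable across interpreter runs
    return '_' + sha1(pdumps(args)).hexdigest()[:16]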