Code Example #1
from pymldb import Connection
mldb = Connection("http://localhost/")

mldb.put(
    '/v1/procedures/import_bench_train_1m', {
        "type": "import.text",
        "params": {
            "dataFileUrl":
            "https://s3.amazonaws.com/benchm-ml--main/train-1m.csv",
            "outputDataset": "bench_train_1m",
            "runOnCreation": True
        }
    })

mldb.put(
    '/v1/procedures/import_bench_test', {
        "type": "import.text",
        "params": {
            "dataFileUrl": "https://s3.amazonaws.com/benchm-ml--main/test.csv",
            "outputDataset": "bench_test",
            "runOnCreation": True
        }
    })

mldb.put(
    '/v1/procedures/benchmark', {
        "type": "classifier.experiment",
        "params": {
            "experimentName": "benchm_ml",
            "inputData": """
            select
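The experiment configuration above is truncated at the source. Before training, the two imports can be sanity-checked with pymldb's query method (a minimal sketch, assuming the import procedures above completed):

# Confirm both datasets were actually populated by the import procedures.
print(mldb.query("SELECT count(*) AS rows FROM bench_train_1m"))
print(mldb.query("SELECT count(*) AS rows FROM bench_test"))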
Code Example #2
File: query.py Project: BHChoEE/Dbms_mldb.ai_Demo
import sys
from random import randint
from pymldb import Connection

mldb = Connection("http://localhost:8080")

url_list = open('./url_list.txt', 'r').read().splitlines()

# url = url_list[randint(0, len(url_list) - 1)]
url = url_list[int(sys.argv[1])]

result = mldb.query("""
    SELECT scores.pred as score
    NAMED imagenet_labels.label
    FROM transpose(
        (
            SELECT flatten(inception({url: '%s'})[softmax]) as *
            NAMED 'pred'
        )
    ) AS scores

    LEFT JOIN imagenet_labels ON
        imagenet_labels.rowName() = scores.rowName()

    ORDER BY score DESC
    LIMIT 10
""" % url)

print(url)
print(str(result.index[0]))
print(str(result['score'][0]))
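mldb.query() returns a pandas DataFrame, indexed here by the NAMED label column, so the script above prints only the single best match. A minimal sketch that lists all ten rows of the same result object:

# Print every label/score pair from the top-10 result.
for label, score in result['score'].items():
    print('{0}: {1:.4f}'.format(label, score))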
Code Example #3
import sys
from pprint import pprint

from pandas import DataFrame  # used to preview dataset heads below
from pymldb import Connection
from pymldb.resource import ResourceError
from rec.settings import HOST, PREFIX


def title(title, linechar='-'):
    return '\n' + ' {} '.format(title).center(80, linechar) + '\n'


if __name__ == '__main__':
    if len(sys.argv) > 1:
        whats = sys.argv[1:]
    else:
        whats = ['datasets', 'procedures', 'functions']

    mldb = Connection(HOST)
    prefix = PREFIX
    for what in whats:
        print(title(what.upper(), '='))
        for x in getattr(mldb, what).get():
            if x.startswith(prefix):
                print(title(x))
                print('CONFIG')
                pprint(getattr(mldb, what)(x).get_query())
                if what == 'functions':
                    print('INFO')
                    pprint(getattr(mldb, what)(x).info.get_query())
                if what == 'datasets':
                    try:
                        head = DataFrame(
                            getattr(mldb,
Code Example #4
File: score.py Project: wuyanxing/mldb
from pprint import pprint
from pymldb import Connection
import rec.settings as _


def run_score_pipeline(mldb):
    # mldb.procedures(_.DATASET_MANAGER_TEST).runs.post_json({})
    # mldb.procedures(_.FEATURE_GENERATION_TEST).runs.post_json({})
    # r = mldb.datasets(_.TEST_FEATS_DATASET).query.get_query(
    #     select='APPLY FUNCTION {} WITH(object(select *) as features) EXTRACT(*)'.format(_.SCORE),
    #     limit=100,
    #     format='aos')
    # pprint(r)

    r = mldb.datasets(_.ACTIONS_DATASET).query.get_query(
        select='user_id, apply function {} with(*) extract(*)'.format(_.SCORE),
        limit=10,
        format='aos')
    pprint(r)


if __name__ == '__main__':
    mldb = Connection(_.HOST)
    run_score_pipeline(mldb)
Code Example #5
import sys
from pymldb import Connection
import rec.settings as _
from pprint import pprint

if __name__ == '__main__':
    mldb = Connection(_.HOST)
    if len(sys.argv) <= 1:
        pprint(mldb.datasets.get())
    else:
        dataset_name = sys.argv[1]
        res = mldb.datasets(dataset_name).query.get_query(limit=1,
                                                          format='table')
        pprint(res[0])
Code Example #6
File: train_matrix.py Project: MeltingCake/twodate
from pymldb import Connection
mldb = Connection()  # connect to MLDB on the default host

mldb.put('/v1/datasets/raw_data', {  # load the raw data
    "type": "text.csv.tabular",
    "params": {
        "dataFileUrl": "file:///mldb_data/sample.csv",
        # empty delimiter/quotechar: each line is kept whole as one column
        "delimiter": '', "quotechar": ''
    }
})

mldb.put('/v1/procedures/sparse_matrix', {
    "type": "transform",
    "params": {
        "inputData": "select tokenize(lineText,{offset:1, value:1}) as * from raw_data",
        "outputDataset": "sparse_matrix",
        "runOnCreation": True
    }
})

mldb.put('/v1/procedures/svd_matrix', {
    "type": "svd.train",
    "params": {
        "trainingData": """
        SELECT
            COLUMN EXPR (AS columnName() ORDER BY rowCount() DESC, columnName() LIMIT 4000)
        FROM sparse_matrix
        """,
        "columnOutputDataset": "location_svd_embedding",
        "modelFileUrl": "file://svd/svd_matrix.svd",
        "functionName": "location_svd_embedder",
Code Example #7
File: load_browsing.py Project: wuyanxing/mldb
                     dataFileUrl='file://' + filename, encoding='us-ascii'))

if __name__ == '__main__':
    parser = argparse.ArgumentParser('load data from files into mldb')

    tsv_gz_help = '.tsv.gz file, path relative to mldb data dir'
    parser.add_argument('--sdb-dump', required=True, help='the big .gz file')
                        # '(path relative to mldb data dir)')
    parser.add_argument('--users', required=True, help=tsv_gz_help)
    parser.add_argument('--events', required=True, help=tsv_gz_help)
    parser.add_argument('--purchases', required=True, help=tsv_gz_help)
    args = parser.parse_args()

    # logging.basicConfig(level=logging.INFO)

    mldb = Connection(_.HOST)

    dataset = mldb.create_dataset(dataset_conf(_.ACTION_DATASET, 'beh.mutable'))

    from multiprocessing import Pool, Process
    import signal
    signal.signal(signal.SIGINT, signal.SIG_DFL)

    n = 8

    def make_keep_if(i,n):
        def foo(row):
            return row % n == i
        return foo

    # load_sdb(mldb, args.sdb_dump, dataset, None, None)
Code Example #8
from pymldb import Connection

mldb = Connection("http://localhost:8080")

inceptionUrl = 'http://public.mldb.ai/models/inception_dec_2015.zip'

mldb.put('/v1/functions/fetch', {"type": 'fetcher', "params": {}})
print("done")

mldb.put(
    '/v1/functions/inception', {
        "type": 'tensorflow.graph',
        "params": {
            "modelFileUrl":
            'archive+' + inceptionUrl + '#tensorflow_inception_graph.pb',
            "inputs": 'fetch({url})[content] AS "DecodeJpeg/contents"',
            "outputs": "softmax"
        }
    })
print("done")

mldb.put(
    "/v1/procedures/imagenet_labels_importer", {
        "type": "import.text",
        "params": {
            "dataFileUrl": 'archive+' + inceptionUrl +
            '#imagenet_comp_graph_label_strings.txt',
            "outputDataset": {
                "id": "imagenet_labels",
                "type": "sparse.mutable"
            },
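This snippet is also truncated inside the import.text parameters; Code Example #2 shows a full scoring query against the same setup. Once the inception function exists, it can be called inline from SQL (a minimal sketch; the image URL is a placeholder):

# Fetch and score one image; the result is a single row of softmax outputs.
scores = mldb.query(
    "SELECT flatten(inception({url: 'https://example.com/cat.jpg'})[softmax]) AS *")
print(scores.shape)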
Code Example #9
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--date',
                        '-d',
                        required=True,
                        help='date YYYY-MM-DD that separates'
                        ' training and testing')
    parser.add_argument('--nb-train',
                        '-n',
                        default=2000,
                        type=int,
                        help='nb training examples')
    args = parser.parse_args()

    mldb = Connection(_.HOST)
    # missing bit of datasets, the user/item sparse matrix
    where_commun = ["verb='rate'"]

    params = dict(select='scatter(item_id, compl) AS *',
                  groupBy='user_id',
                  rowName='user_id',
                  inputDataset=dataset_conf(_.ACTION_DATASET))

    # the one for training the SVD, only on the train set
    mldb.create_procedure(
        _.USER_ITEM_DATASET_MAKER,
        'transform',
        where=' AND '.join(where_commun + get_where_train(args.date)),
        outputDataset=dataset_conf(_.USER_ITEM_DATASET, 'beh.mutable'),
        **params)
Code Example #10
File: train.py Project: hexavi42/McHacks
def __init__(self, port=8080, pool=all_candidates, depth=False):
    self.mldb = Connection(host="http://localhost:{0}".format(port))
    self.set_wordnet()
    self.candidates = pool
    self.depth = depth
Code Example #11
File: train.py Project: hexavi42/McHacks
import csv

import numpy as np
from pymldb import Connection

# all_candidates, normalize_state_name and state_code are defined elsewhere
# in this project's train.py and are assumed to be in scope here.

class Candidate_Predictor:
    candidates = []
    candidate_favor = {}
    mldb = None
    theta = []
    depth = False

    def __init__(self, port=8080, pool=all_candidates, depth=False):
        self.mldb = Connection(host="http://localhost:{0}".format(port))
        self.set_wordnet()
        self.candidates = pool
        self.depth = depth

    # Tickles SentiWordnet, removing POS data
    def set_wordnet(self):
        self.mldb.put('/v1/procedures/sentiwordneter', {
            "type": "import.sentiwordnet",
            "params": {
                "dataFileUrl": "file:///mldb_data/SentiWordNet_3.0.0_20130122.txt",
                "outputDataset": "sentiwordnet",
                "runOnCreation": True
            }
        })
        self.mldb.put("/v1/procedures/baseWorder", {
            "type": "transform",
            "params": {
                "inputData": """
                    select *, jseval('
                        return x.split("#")[0];
                    ', 'x', rowName()) as baseWord
                    from sentiwordnet
                """,
                "outputDataset": "senti_clean",
                "runOnCreation": True
            }
        })
        self.mldb.put("/v1/procedures/baseWorder", {
            "type": "transform",
            "params": {
                "inputData": """
                       select avg({* EXCLUDING(baseWord)}) as avg,
                              min({* EXCLUDING(baseWord)}) as min,
                              max({* EXCLUDING(baseWord)}) as max,
                              count(*) as cnt
                        NAMED baseWord
                        from senti_clean
                        group by baseWord
                        order by cnt desc
                """,
                "outputDataset": "senti_clean2",
                "runOnCreation": True
            }
        })

    # check sentiment of sentence
    def return_sent(self, sentence):
        # remove quotes because it messes with query
        no_quote = sentence.replace("'", '')
        split = list(set(no_quote.split(' ')))
        join = "','".join(split)
        sent_sent = self.mldb.query("select avg* from senti_clean2 where rowName() in ('{0}')".format(join))
        overall_senti = 0
        if 'avg.NegSenti' in sent_sent.keys():
            for word in sent_sent['avg.NegSenti'].keys():
                overall_senti += sent_sent['avg.PosSenti'][word]-sent_sent['avg.NegSenti'][word]
        return overall_senti

    # run tweet csvs about candidates through sentiment analysis
    def run_candidates(self):
        for candidate in self.candidates:
            states = {}
            counter = 0
            with open('data/{0}.csv'.format(candidate), 'r', newline='') as csvfile:
                spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
                for row in spamreader:
                    if self.depth and counter > self.depth:
                        break
                    elif len(row) == 2 and row[1]:
                        state = normalize_state_name(row[1])
                        if state is not None:
                            overall_senti = self.return_sent(row[0])
                            states[state] = states.get(state, 0) + overall_senti
                            counter += 1
            self.candidate_favor[candidate] = states

    # Returns the sentiment value stored in the database
    def get_sentiment_value(self, candidate, state):
        return self.candidate_favor[all_candidates[candidate]][state]

    # Gets all the results stored in the database
    def get_results(self):
        # Format: [[CANDIDATE_ID, STATE_ID, VOTE_PERCENTAGE], ...]
        pass

    # Calculates the parameters using normal equations
    def calculate_params(self):
        inp = []
        out = []
        results = self.get_results()
        for r in results:
            candidate = r[0]
            state = r[1]
            # TODO: More inputs
            inp.append(self.get_sentiment_value(candidate, state))
            out.append(r[2])  # VOTE_PERCENTAGE is the third element
        # Linear regression via the normal equation: theta = (X^T X)^-1 X^T y
        x = np.array(inp).reshape(-1, 1)
        y = np.array(out)
        self.theta = np.linalg.inv(x.T @ x) @ x.T @ y

    # Trains all the sentiment values based on the expected results
    def train(self):
        self.run_candidates()
        self.calculate_params()

    # Predicts the situation for a given list of candidates for a specific state
    # Returns a map of the percentage each candidate is predicted to have
    def predict(self, state):
        results = {}
        for candidate in self.candidates:
            inp = [self.get_sentiment_value(candidate, state)]
            # TODO: this is a raw linear score, not a calibrated percentage
            results[candidate] = float(np.dot(self.theta, inp))
        return results

    def save(self, file='sentiment.csv'):
        with open(file, 'w', newline='') as csvfile:
            fieldnames = ['candidate'] + list(state_code.values())
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for candidate in self.candidates:
                self.candidate_favor[candidate]['candidate'] = candidate
                writer.writerow(self.candidate_favor[candidate])
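Putting the class to work end to end might look like the following (hypothetical usage: it assumes MLDB is listening on port 8080, the data/<candidate>.csv files exist, get_results() has been implemented, and 'CA' is a placeholder state key):

predictor = Candidate_Predictor(port=8080, depth=100)
predictor.train()  # run_candidates() followed by calculate_params()
print(predictor.predict('CA'))
predictor.save('sentiment.csv')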
Code Example #12
## https://github.com/szilard/benchm-ml/issues/25

## from @nicolaskruchten, thanks :)

## This code gives an AUC of 0.7431 in 19.1s for the 1M training set on an r3.8xlarge EC2 instance
## with the latest release of Datacratic's Machine Learning Database (MLDB), available at http://mldb.ai/

from pymldb import Connection
mldb = Connection("http://localhost/")

mldb.v1.datasets("bench-train-1m").put({
    "type": "text.csv.tabular",
    "params": {
        "dataFileUrl": "https://s3.amazonaws.com/benchm-ml--main/train-1m.csv"
    }
})

mldb.v1.datasets("bench-test").put({
    "type": "text.csv.tabular",
    "params": {
        "dataFileUrl": "https://s3.amazonaws.com/benchm-ml--main/test.csv"
    }
})

mldb.v1.procedures("benchmark").put({
    "type": "classifier.experiment",
    "params": {
        "experimentName": "benchm_ml",
        "training_dataset": {
            "id": "bench-train-1m"
        },