Ejemplo n.º 1
0
def predict(service, chart, image_filenames):
    """Run a DeepDetect prediction and push the results through a bulk template.

    Args:
        service: Key into ``config['REPO']`` used to look up the service name.
        chart: Chart label written into every rendered document.
        image_filenames: Image paths/URIs handed to ``post_predict``.

    Returns:
        The response returned by ``DD.post_predict``.
    """
    import json  # local import: only needed to escape ``chart`` below

    # setting up DD client
    host = 'localhost'
    sname = config['REPO'][service]['NAME']
    dd = DD(host)
    dd.set_return_format(dd.RETURN_PYTHON)

    # json.dumps produces a quoted and escaped JSON string, so a chart label
    # containing quotes or backslashes can no longer corrupt the payload.
    chart_json = json.dumps(chart)

    # NOTE(review): the template looks like an Elasticsearch bulk body (one
    # "index" action line followed by one document line per prediction) that
    # is POSTed to the "network" URL -- confirm against the DeepDetect docs.
    template = (
        "{{#body}}{{#predictions}} "
        "{ \"index\": {\"_index\": \"objects-10\", \"_type\": \"img\" } }\n "
        "{ \"uri\": \"{{uri}}\", "
        "\"chart\": " + chart_json + ", "
        "\"categories\": [ {{#classes}} "
        "{ \"category\": \"{{cat}}\", "
        "\"score\":{{prob}} } "
        "{{^last}},{{/last}}{{/classes}} ] }\n "
        "{{/predictions}}{{/body}} \n"
    )

    parameters_input = {}
    parameters_mllib = {}
    parameters_output = {
        "best": 10,
        "template": template,
        "network": {
            "url": "host.docker.internal:9200/objects-10/_bulk",
            "http_method": "POST"
        }
    }

    # Returned (rather than dropped) so callers can inspect the status; the
    # local is no longer named like the function itself.
    response = dd.post_predict(sname, image_filenames, parameters_input,
                               parameters_mllib, parameters_output)
    return response
Ejemplo n.º 2
0
    batch_sizes.append(l)
    if l < 32:
        l = l * 2
    else:
        l += 16

# Parameter dictionaries reused by every post_predict call below.
parameters_input = {}
parameters_mllib = {'gpu': args.gpu}
parameters_output = {}
if args.detection:
    # Detection runs additionally request bounding boxes and set a 0.1
    # confidence threshold.
    parameters_output['bbox'] = True
    parameters_output['confidence_threshold'] = 0.1

# First call to load model
# Only the first bench file is sent; the response is not inspected here.
data = list_bench_files[:1]
classif = dd.post_predict(args.sname, data, parameters_input, parameters_mllib,
                          parameters_output)

for b in batch_sizes:
    data = list_bench_files[:b]
    #print data
    fail = False
    mean_ptime = 0
    mean_ptime_per_img = 0
    for i in range(0, args.npasses + 1):
        #print 'testing batch size =',len(data)
        classif = dd.post_predict(args.sname, data, parameters_input,
                                  parameters_mllib, parameters_output)
        if classif['status']['code'] == 200:
            if i == 0:
                continue  # skipping first pass so that the batch resize does not affect timing
            ptime = classif['head']['time']
Ejemplo n.º 3
0
def main():
    """Benchmark a DeepDetect service over a range of batch sizes.

    Parses the command line, optionally creates the service (``--create``),
    then times ``post_predict`` calls for every batch size up to
    ``--max-batch-size``. For each batch size the mean processing time, the
    mean time per image and the derived fps are printed; results can also be
    written to a CSV file (``--csv-output``) and a JSON file
    (``--json-output``).
    """
    parser = argparse.ArgumentParser(description="DeepDetect benchmark tool")
    parser.add_argument("--host", help="server host", default="localhost")
    parser.add_argument("--port", help="server port", type=int, default=8080)
    parser.add_argument("--sname", help="service name")
    parser.add_argument("--img-width",
                        help="image width",
                        type=int,
                        default=224)
    parser.add_argument("--img-height",
                        help="image height",
                        type=int,
                        default=224)
    parser.add_argument("--bw",
                        help="whether images are bw",
                        action="store_true")
    parser.add_argument(
        "--histogram-equalization",
        "--eqhist",
        help="whether we apply an histogram equalization to images",
        action="store_true",
    )
    parser.add_argument("--gpu",
                        help="whether to bench GPU",
                        action="store_true")
    parser.add_argument("--gpuid", help="gpu id to use", type=int, default=0)
    parser.add_argument("--cpu",
                        help="whether to bench CPU",
                        action="store_true")
    parser.add_argument(
        "--remote-bench-data-dir",
        help="when bench data directory, when available remotely on the server",
    )
    parser.add_argument("--max-batch-size",
                        help="max batch size to be tested",
                        type=int,
                        default=256)
    parser.add_argument(
        "--max-workspace-size",
        help="max workspace size for tensort bench",
        type=int,
        default=1024,
    )
    parser.add_argument(
        "--list-bench-files",
        help="file holding the list of bench files",
        default="list_bench_files.txt",
    )
    parser.add_argument("--npasses",
                        help="number of passes for every batch size",
                        type=int,
                        default=5)
    parser.add_argument("--detection",
                        help="whether benching a detection model",
                        action="store_true")
    parser.add_argument(
        "--segmentation",
        help="whether benching a segmentation model",
        action="store_true",
    )
    parser.add_argument(
        "--regression",
        help="whether benching a regression model",
        action="store_true",
    )
    parser.add_argument(
        "--search",
        help="whether benching a similarity search service",
        action="store_true",
    )
    parser.add_argument(
        "--search-multibox",
        help="whether benching a multibox similarity search service",
        action="store_true",
    )
    parser.add_argument("--create",
                        help="model's folder name to create a service")
    parser.add_argument(
        "--nclasses",
        help="number of classes for service creation",
        type=int,
        default=1000,
    )
    parser.add_argument(
        "--auto-kill",
        help="auto kill the service after benchmarking",
        action="store_true",
    )
    parser.add_argument("--csv-output", help="CSV file output")
    parser.add_argument("--json-output", help="JSON file output")
    parser.add_argument("--mllib",
                        help="mllib to bench, ie [tensorrt|ncnn|caffe]",
                        default="caffe")
    parser.add_argument("--datatype",
                        help="datatype for tensorrt [fp16|fp32]",
                        default="fp32")
    parser.add_argument(
        "--recreate",
        help=
        "recreate service between every batchsize, useful for batch_size dependent precompiling backends (ie tensorRT)",
        action="store_true",
        default=False,
    )
    parser.add_argument("--dla",
                        help="use dla",
                        action="store_true",
                        default=False)
    parser.add_argument("--gpu-resize",
                        help="image resizing on gpu",
                        action="store_true",
                        default=False)
    parser.add_argument(
        "--image-interp",
        help="image interpolation method (nearest, linear, cubic, ...)",
    )
    args = parser.parse_args()

    dd = DD(args.host, args.port)
    dd.set_return_format(dd.RETURN_PYTHON)
    autokill = args.auto_kill

    def service_create(bs):
        """Create the service with max batch size ``bs``; no-op without --create."""
        if not args.create:
            return
        description = "image classification service"
        model = {"repository": args.create}
        create_input = {
            "connector": "image",
            "width": args.img_width,
            "height": args.img_height,
            "bw": args.bw,
            "histogram_equalization": args.histogram_equalization,
        }
        if args.segmentation:
            create_input["segmentation"] = True
        if args.regression:
            create_input["regression"] = True
        create_mllib = {
            "nclasses": args.nclasses,
            "datatype": args.datatype,
            "readEngine": True,
            "writeEngine": True,
            "maxBatchSize": bs,
            "maxWorkspaceSize": args.max_workspace_size,
        }
        if args.dla:
            create_mllib["dla"] = 0
        dd.put_service(
            args.sname,
            model,
            description,
            args.mllib,
            create_input,
            create_mllib,
            {},
        )

    def run_predict(data):
        """Issue one predict call with the shared benchmark parameters."""
        return dd.post_predict(args.sname, data, parameters_input,
                               parameters_mllib, parameters_output)

    # Bench file list: one path per line, optionally prefixed with the
    # remote data directory. The option defaults to None, in which case the
    # listed paths are used as they are instead of raising a TypeError.
    data_dir = args.remote_bench_data_dir
    list_bench_files = []
    with open(args.list_bench_files) as f:
        for line in f:
            name = line.rstrip()
            if not name:
                continue  # a blank line is not a bench file
            if data_dir is not None:
                name = data_dir + "/" + name
            list_bench_files.append(name)

    # Batch sizes double up to 32, then grow by steps of 16.
    batch_sizes = []
    batch_size = 1
    while batch_size <= args.max_batch_size:
        batch_sizes.append(batch_size)
        if batch_size < 32:
            batch_size = batch_size * 2
        else:
            batch_size += 16

    parameters_input = {}
    # --image-interp has no default, so only forward it when actually given.
    if args.image_interp:
        parameters_input["interp"] = args.image_interp
    if args.gpu_resize:
        parameters_input["cuda"] = args.gpu_resize
    parameters_mllib = {"gpu": args.gpu, "gpuid": args.gpuid}
    parameters_output = {}
    if args.detection:
        parameters_output["confidence_threshold"] = 0.1
        if args.search or args.search_multibox:
            parameters_output["search"] = True
            parameters_output["rois"] = "rois"
            parameters_output["bbox"] = False
        else:
            parameters_output["bbox"] = True
        if args.search_multibox:
            parameters_output["multibox_rois"] = True
    elif args.segmentation:
        parameters_input["segmentation"] = True
    elif args.regression:
        parameters_output["regression"] = True
    elif args.search:
        parameters_output["search"] = True

    out_json = []
    out_csv = None
    csv_writer = None
    if args.csv_output:
        out_csv = open(args.csv_output, "w+")
        csv_writer = csv.writer(out_csv)
        csv_writer.writerow(
            ["batch_size", "mean processing time", "mean time per img"])

    try:
        # First call to load model
        if not args.recreate:
            if args.mllib == "tensorrt":
                service_create(args.max_batch_size)
            else:
                service_create(1)
            run_predict(list_bench_files[:1])

        for b in batch_sizes:
            data = list_bench_files[:b]
            fail = False
            if args.recreate:
                service_create(b)
                for _ in range(5):  # warm-up calls on the fresh service
                    run_predict(data)
            total_ptime = 0
            total_ptime_per_img = 0
            timed_passes = 0
            for i in range(0, args.npasses + 1):
                print("testing batch size = %s" % len(data))
                classif = run_predict(data)
                if classif["status"]["code"] != 200:
                    print(classif["status"])
                    # reload model
                    run_predict(list_bench_files[:1])
                    fail = True
                    break
                if i == 0:
                    continue  # skipping first pass so that the batch resize does not affect timing
                ptime = classif["head"]["time"]
                ptime_per_img = ptime / b
                total_ptime += ptime
                total_ptime_per_img += ptime_per_img
                timed_passes += 1
                print(
                    "pass %s batch size = %s / processing time = %s / time per image = %s"
                    % (i, b, ptime, ptime_per_img))

            # Average over the passes that were actually timed so that an
            # early failure (or --npasses 0) neither skews the mean nor
            # triggers a division by zero.
            if timed_passes:
                mean_processing_time = total_ptime / timed_passes
                mean_time_per_img = total_ptime_per_img / timed_passes
            else:
                mean_processing_time = 0
                mean_time_per_img = 0
            fps = 1000 / mean_time_per_img if mean_time_per_img else 0
            print(
                ">>> batch size = %s / mean processing time = %s / mean time per image = %s / fps = %s / fail = %s"
                % (
                    b,
                    mean_processing_time,
                    mean_time_per_img,
                    fps,
                    fail,
                ), )
            out_json.append({
                "batch_size": b,
                "mean_processing_time": mean_processing_time,
                "mean_time_per_img": mean_time_per_img,
            })
            if csv_writer is not None:
                csv_writer.writerow(
                    [b, mean_processing_time, mean_time_per_img])
            if args.recreate:
                dd.delete_service(args.sname)
    finally:
        # Close the CSV output even when a predict call raises.
        if out_csv is not None:
            out_csv.close()

    if args.json_output:
        with open(args.json_output, "w") as outfile:
            json.dump(out_json, outfile)

    if autokill:
        dd.delete_service(args.sname)
Ejemplo n.º 4
0
                    os.system(cmd)
                    i = i + 1

            # jpg_unannotated -> json, jpg_annotated
            print('  Processing unannotated frames through DeepDetect...')
            for subroot, subdirs, subfiles in os.walk(output_jpg_unannotated):
                for frame in sorted(subfiles):
                    parameters_input = {}
                    parameters_mllib = {'gpu': True}
                    parameters_output = {
                        'bbox': True,
                        'confidence_threshold': args.confidence_threshold
                    }
                    data = [os.path.join(output_jpg_unannotated, frame)]
                    detect = dd.post_predict('ssd', data, parameters_input,
                                             parameters_mllib,
                                             parameters_output)
                    #print detect
                    if detect['status']['code'] != 200:
                        print '  error', detect['status']['code'], 'on', frame
                        break
                    predictions = detect['body']['predictions']
                    with open(os.path.join(output_json, frame[:-4] + '.json'),
                              'w') as f:
                        json.dump(detect, f)
                        f.close()
                    for p in predictions:
                        img = cv2.imread(p['uri'])
                        # white image background, comment line below to see image behind boxes
                        cv2.rectangle(img, (0, 9999), (9999, 0),
                                      (255, 255, 255), -1)
Ejemplo n.º 5
0
# Service creation is best-effort: a failure is reported as "already exists"
# and the script carries on with the inference call.
try:
    jout = dd.put_service('testggan', model, 'gan generator inference test',
                          'torch', parameters_input, parameters_mllib,
                          parameters_output)
except Exception:
    # Narrowed from a bare ``except`` so that KeyboardInterrupt and
    # SystemExit are no longer swallowed.
    # NOTE(review): assumes any error here means the service exists -- confirm.
    print('model already exists')

# inference call
data = [args.img_in]
parameters_input = {
    'rgb': True,
    'scale': 0.00392,
    "mean": [0.5, 0.5, 0.5],
    "std": [0.5, 0.5, 0.5]
}
parameters_mllib = {'extract_layer': 'last'}
parameters_output = {}
jout = dd.post_predict('testggan', data, parameters_input, parameters_mllib,
                       parameters_output)

# The first prediction carries the flat list of output values.
vals = jout['body']['predictions'][0]['vals']
np_vals = np.array(vals)
# Flat values -> (3, size, size), then moved to (size, size, 3).
np_vals = np_vals.reshape((3, args.img_size, args.img_size))
# (x + 1) / 2 * 255 maps -1 to 0 and +1 to 255.
out_img = (np.transpose(np_vals, (1, 2, 0)) + 1) / 2.0 * 255.0
out_img = cv2.cvtColor(out_img.astype('float32'), cv2.COLOR_RGB2BGR)
cv2.imwrite(args.img_out, out_img)
print("Successfully generated image " + args.img_out)
Ejemplo n.º 6
0
if args.index:
    parameters_output['index'] = True

    # list files in image repository
    c = 0
    onlyfiles = []
    for (dirpath, dirnames, filenames) in walk(args.index):
        nfilenames = [dirpath + '/' + f for f in filenames]
        onlyfiles.extend(nfilenames)

    # Send the files in batches, showing progress on a single console line.
    for x in batch(onlyfiles, args.index_batch_size):
        sys.stdout.write('\r' + str(c) + '/' + str(len(onlyfiles)))
        sys.stdout.flush()
        classif = dd.post_predict(sname, x, parameters_input, parameters_mllib,
                                  parameters_output)
        c += len(classif['body']['predictions'])
        # NOTE(review): indexing stops once 100 predictions were counted;
        # this looks like a cap left over from testing -- confirm.
        if c >= 100:
            break

    # one last dumb predict call to build the index
    print('building index...\n')
    parameters_output['index'] = False
    parameters_output['build_index'] = True
    # Use the first file found overall: the per-directory list of the last
    # directory walked may be empty (or undefined when nothing was walked).
    if onlyfiles:
        classif = dd.post_predict(sname, [onlyfiles[0]], parameters_input,
                                  parameters_mllib, parameters_output)

if args.search:
    parameters_output['search'] = True
    parameters_output['search_nn'] = args.search_size
Ejemplo n.º 7
0
        os.remove('data.bin')
    except:
        pass
    s = shelve.open('data.bin')

    # list files in image repository
    c = 0
    d = 1
    onlyfiles = []
    for (dirpath, dirnames, filenames) in walk(args.index):
        nfilenames = []
        for f in filenames:
            nfilenames.append(dirpath + '/' + f)
        onlyfiles.extend(nfilenames)
    for x in batch(onlyfiles,args.index_batch_size):
        classif = dd.post_predict(sname,x,parameters_input,parameters_mllib,parameters_output)
        
        for p in classif['body']['predictions']:
            uri =  p['uri']
            rois = p['rois']
            sys.stdout.write('\rIndexing image '+str(d)+'/'+str(len(onlyfiles)) + ' : ' + str(len(rois)) + ' rois  total:' + str(c) + '   ')
            sys.stdout.flush()

            for roi in rois:
                bbox = roi['bbox']
                cat = roi['cat']
                prob = roi['prob']
                vals = roi['vals']
                if c == 0:
                    layer_size = len(vals)
                    s['layer_size'] = layer_size
Ejemplo n.º 8
0
def segment(image, nclasses=150, port=8080, host="localhost"):
    """Segment an image through DeepDetect and mask it with one label colour.

    The image is read locally, a 'segserv' caffe service is created (best
    effort), a segmentation prediction is requested, every class id is mapped
    to a seeded random colour, and the raw image is multiplied by a mask that
    is 1 where a colour channel equals the matching channel of (47, 197, 233).
    The result is also saved to ``result.png``.

    Args:
        image: File name of the image, relative to the model directories.
        nclasses: Number of segmentation classes of the model.
        port: DeepDetect server port.
        host: DeepDetect server host.

    Returns:
        The masked image (``body_mask * raw_img``).
    """
    random.seed(134124)  # fixed seed: the label colours are reproducible
    model_dir = '/home/model'
    sname = 'segserv'
    description = 'image segmentation'
    mllib = 'caffe'
    mltype = 'unsupervised'
    dd = DD(host, port)
    dd.set_return_format(dd.RETURN_PYTHON)

    def random_color():
        ''' generate rgb using a list comprehension '''
        return [random.randint(0, 255) for _ in range(3)]

    # NOTE(review): the image is read from /home/ubuntu/model/ but handed to
    # the server as /home/model/ -- presumably the same directory as seen
    # from the server side; confirm.
    raw_img = plt.imread("/home/ubuntu/model/" + image).astype("float32") / 255
    # Array shape is (rows, cols[, channels]), i.e. height comes first.
    height, width = raw_img.shape[:2]

    # creating ML service
    model = {'repository': model_dir}
    parameters_input = {'connector': 'image', 'width': width, 'height': height}
    parameters_mllib = {'nclasses': nclasses}
    parameters_output = {}
    try:
        dd.put_service(sname, model, description, mllib, parameters_input,
                       parameters_mllib, parameters_output, mltype)
    except Exception:  # most likely the service already exists
        pass

    # prediction call
    parameters_input = {'segmentation': True}
    parameters_mllib = {'gpu': True, 'gpuid': 0}
    parameters_output = {}
    data = ["/home/model/" + image]
    detect = dd.post_predict(sname, data, parameters_input, parameters_mllib,
                             parameters_output)

    prediction = detect['body']['predictions'][0]
    # Materialise the values first: on Python 3 ``map`` is lazy and
    # ``np.array(map(...))`` would wrap the iterator instead of the values.
    pixels = np.array([int(v) for v in prediction['vals']])
    imgsize = prediction['imgsize']

    # visual output
    label_colours = np.array([random_color() for _ in range(nclasses)])

    r = pixels.copy()
    g = pixels.copy()
    b = pixels.copy()
    for l in range(0, nclasses):
        selected = pixels == l
        r[selected] = label_colours[l, 0]
        g[selected] = label_colours[l, 1]
        b[selected] = label_colours[l, 2]

    out_shape = (imgsize['height'], imgsize['width'])
    # Integer colour image of shape (height, width, 3).
    label_rgb = np.dstack((np.reshape(r, out_shape),
                           np.reshape(g, out_shape),
                           np.reshape(b, out_shape)))
    rgb = label_rgb / 255.0
    print(rgb[0, 0])
    # Compare the integer colours directly instead of going through a
    # /255 then *255 float round trip, which is not guaranteed to be exact.
    body_mask = np.where(label_rgb == np.array([47, 197, 233]), 1, 0)

    result = body_mask * raw_img
    plt.imsave("result.png", result)
    return result
Ejemplo n.º 9
0
class DNNFeatureExtractor(FeatureGenerator):
    """Feature generator that indexes and searches images through DeepDetect.

    The service is 'unsupervised' when the model defines an ``extract_layer``
    (layer values are indexed) and 'supervised' otherwise (predicted classes
    are indexed as tags and kept in a shelve under the index repository).
    """

    def __init__(self,
                 dnnmodel,
                 image_files,
                 index_repo,
                 batch_size=32,
                 dd_host='localhost',
                 dd_port=8080,
                 dd_description='image classification',
                 meta_in='',
                 meta_out='',
                 captions_in='',
                 captions_out='',
                 mapi_in='',
                 mapi_out=''):
        """Store the configuration, connect to DeepDetect, prepare storage.

        Creates ``index_repo/<model name>`` when missing, opens the tags
        shelve for supervised models, and deletes any service that carries
        the model's name.
        """
        self.dd_host = dd_host
        self.dd_port = dd_port
        self.dd_description = dd_description
        self.dd_mllib = 'caffe'
        self.meta_in = meta_in
        self.meta_out = meta_out
        self.captions_in = captions_in
        self.captions_out = captions_out
        self.mapi_in = mapi_in
        self.mapi_out = mapi_out
        self.gpuid = 0
        self.dnnmodel = dnnmodel
        if self.dnnmodel.extract_layer:
            self.dd_mltype = 'unsupervised'
        else:
            self.dd_mltype = 'supervised'
        self.image_files = image_files
        self.batch_size = batch_size
        self.binarized = False
        self.dd = DD(self.dd_host, self.dd_port)
        self.dd.set_return_format(self.dd.RETURN_PYTHON)
        self.index_repo = index_repo + '/' + self.dnnmodel.name
        try:
            os.mkdir(self.index_repo)
        except OSError:
            # The directory may already exist; a genuine failure surfaces
            # when the repository is first written to.
            pass
        self.st = {}  # shelve used for full tags storage
        self.stm = {}  # in memory tmp storage
        if self.dd_mltype == 'supervised':
            self.st = shelve.open(self.index_repo + '/tags.bin')
        self.delete_dd_service()

    def __del__(self):
        """Flush the in-memory tags into the shelve and close it."""
        if self.dd_mltype == 'supervised':
            # items() exists on both Python 2 and 3, unlike iteritems().
            for i, t in self.stm.items():
                self.st[i] = t
            self.st.close()

    def _predict_parameters(self):
        """Return the (input, mllib, output) parameters of a predict call."""
        parameters_input = {}
        parameters_mllib = {
            'gpu': True,
            'gpuid': self.gpuid,
            'extract_layer': self.dnnmodel.extract_layer
        }
        if self.dd_mltype == 'unsupervised':
            parameters_output = {'binarized': self.binarized}
        else:
            parameters_output = {'best': self.dnnmodel.best}
        return parameters_input, parameters_mllib, parameters_output

    def create_dd_service(self):
        """Create the DeepDetect service for the model.

        Raises:
            Exception: When the creation status code is neither 201 nor 403.
        """
        model = {'repository': self.dnnmodel.model_repo}
        parameters_input = {
            'connector': 'image',
            'width': self.dnnmodel.img_width,
            'height': self.dnnmodel.img_height
        }
        parameters_mllib = {
            'nclasses': self.dnnmodel.nclasses,
            'gpu': True,
            'gpuid': self.gpuid
        }
        parameters_output = {}
        screate = self.dd.put_service(self.dnnmodel.name, model,
                                      self.dd_description, self.dd_mllib,
                                      parameters_input, parameters_mllib,
                                      parameters_output, self.dd_mltype)
        outcode = screate['status']['code']
        # NOTE(review): 403 is tolerated, presumably because it signals that
        # the service already exists -- confirm against the server API.
        if outcode != 201 and outcode != 403:
            logger.error('failed creation of DNN service ' +
                         self.dnnmodel.name)
            raise Exception('failed creating DNN service ' +
                            self.dnnmodel.name)

    def delete_dd_service(self):
        """Delete the model's DeepDetect service."""
        self.dd.delete_service(self.dnnmodel.name, clear='')

    def preproc(self):
        """No preprocessing is needed with dd at the moment."""
        return

    def index(self):
        """Predict features for every image and store them in the index.

        Returns None; when the initial probing call of an unsupervised model
        fails, the service is deleted and indexing is abandoned.
        """
        ## feature generation, to be indexed or searched for
        self.create_dd_service()
        parameters_input, parameters_mllib, parameters_output = (
            self._predict_parameters())

        if self.dd_mltype == 'unsupervised':
            # pass one image to get the size of the output layer
            classif = self.dd.post_predict(self.dnnmodel.name,
                                           [self.image_files[0]],
                                           parameters_input, parameters_mllib,
                                           parameters_output)
            response_code = classif['status']['code']
            if response_code != 200:
                print('response= %s' % (classif,))
                logger.error(
                    'failed (index) initial prediction call to model ' +
                    self.dnnmodel.name + ' via dd')
                self.delete_dd_service()
                return
            dim = len(classif['body']['predictions']['vals'])
        else:
            dim = self.dnnmodel.nclasses

        c = 0
        logger.info('dnn feature prediction and indexing for service ' +
                    self.dnnmodel.name + ' with layer of size ' + str(dim))
        with Indexer(dim, self.index_repo) as indexer:
            for x in batch(self.image_files, self.batch_size):
                classif = self.dd.post_predict(self.dnnmodel.name, x,
                                               parameters_input,
                                               parameters_mllib,
                                               parameters_output)
                response_code = classif['status']['code']
                if response_code != 200:
                    print('response= %s' % (classif,))
                    logger.error(
                        'failed (index) batch prediction call to model ' +
                        self.dnnmodel.name + ' via dd')
                    continue  # a failed batch is skipped, not fatal
                predictions = classif['body']['predictions']
                # A single-image call yields one prediction, not a list.
                if self.batch_size == 1 or len(self.image_files) == 1:
                    predictions = [predictions]
                for p in predictions:
                    if self.dd_mltype == 'unsupervised':
                        indexer.index_single(c, p['vals'], p['uri'])
                        if c > 0 and c % self.batch_size == 0:
                            logger.info('indexed ' + str(c) + ' images')
                    else:
                        puri = str(p['uri'])
                        indexer.index_tags_single(p['classes'], p['uri'])
                        self.stm[puri] = [pc['cat'] for pc in p['classes']]
                    c = c + 1

            indexer.build_index()
            indexer.save_index()
        logger.info('indexed a total of ' + str(c) + ' images')
        self.delete_dd_service()

    def search(self, jdataout=None):
        """Search the index for neighbours of every image.

        Args:
            jdataout: Object forwarded to ``to_json``; a fresh dict is used
                when omitted.

        Returns:
            Whatever ``to_json`` returns for the collected results.

        Raises:
            Exception: When a batch prediction call does not return 200.
        """
        # A fresh dict per call: a mutable default would be shared by every
        # call that leaves the argument out.
        if jdataout is None:
            jdataout = {}
        self.create_dd_service()
        parameters_input, parameters_mllib, parameters_output = (
            self._predict_parameters())

        logger.info('dnn feature prediction and searching for service ' +
                    self.dnnmodel.name)
        results = {}
        with Searcher(self.index_repo, search_size=500) as searcher:
            searcher.load_index()
            for x in batch(self.image_files, self.batch_size):
                classif = self.dd.post_predict(self.dnnmodel.name, x,
                                               parameters_input,
                                               parameters_mllib,
                                               parameters_output)
                response_code = classif['status']['code']
                if response_code != 200:
                    print('response= %s' % (classif,))
                    logger.error(
                        'failed batch (search) prediction call to model ' +
                        self.dnnmodel.name + ' via dd')
                    self.delete_dd_service()
                    print(classif)
                    raise Exception(
                        'failed batch (search) prediction call to model ' +
                        self.dnnmodel.name)
                predictions = classif['body']['predictions']
                # A single-image call yields one prediction, not a list.
                if self.batch_size == 1 or len(self.image_files) == 1:
                    predictions = [predictions]
                for p in predictions:
                    if self.dd_mltype == 'unsupervised':
                        nns = searcher.search_single(p['vals'], p['uri'])
                    else:
                        puri = str(p['uri'])
                        nns = searcher.search_tags_single(p['classes'], puri)
                        # Attach the stored tags of every neighbour.
                        nns['tags_out_all'] = [
                            self.st[str(nn)] for nn in nns['nns_uris']
                        ]
                    results[p['uri']] = nns

        self.delete_dd_service()
        return self.to_json(results, '/img/reuters/', '/img/tate/',
                            self.dnnmodel.name, self.dnnmodel.description,
                            jdataout, self.meta_in, self.meta_out,
                            self.captions_in, self.captions_out, self.mapi_in,
                            self.mapi_out)
Ejemplo n.º 10
0
# Service creation is best-effort; the prediction call below runs either way.
model = {'repository': model_repo}
parameters_input = {'connector': 'image', 'width': width, 'height': height}
parameters_mllib = {'nclasses': nclasses}
parameters_output = {}
try:
    servput = dd.put_service(sname, model, description, mllib,
                             parameters_input, parameters_mllib,
                             parameters_output, mltype)
except Exception:  # most likely the service already exists
    # Narrowed from a bare ``except`` so Ctrl-C is not swallowed.
    pass

# prediction call
parameters_input = {'segmentation': True}
parameters_mllib = {'gpu': True, 'gpuid': 0}
parameters_output = {}
data = [args.image]
detect = dd.post_predict(sname, data, parameters_input, parameters_mllib,
                         parameters_output)

# Materialise the values first: on Python 3 ``map`` is lazy and
# ``np.array(map(...))`` would wrap the iterator instead of the values.
pixels = np.array([int(v) for v in detect['body']['predictions'][0]['vals']])
imgsize = detect['body']['predictions'][0]['imgsize']

# visual output
# One colour per class id, produced by random_color().
label_colours = []
for c in range(nclasses):
    label_colours.append(random_color())
label_colours = np.array(label_colours)

# Each channel starts as a copy of the class ids and is then overwritten
# class by class with that class's colour component.
r = pixels.copy()
g = pixels.copy()
b = pixels.copy()
for l in range(0, nclasses):
    r[pixels == l] = label_colours[l, 0]
Ejemplo n.º 11
0
    info = dd.info()
    print(info)
    sys.exit()

# Delete the service and stop.
if args.delete:
    delete_service = dd.delete_service(args.model_name, clear='')
    print(delete_service)
    sys.exit()

# creating ML service
if args.create_service:
    model = {'repository': model_config['path'] + args.model_name}
    parameters_input = {
        'connector': 'image',
        'width': model_config['width'],
        'height': model_config['height'],
    }
    parameters_mllib = {'nclasses': model_config['nclasses'], 'gpu': True}
    parameters_output = {}
    # The model name doubles as the service description.
    creation = dd.put_service(args.model_name, model, args.model_name,
                              model_config['backend'], parameters_input,
                              parameters_mllib, parameters_output,
                              'supervised')
    print(creation)

# Classify a single image URL and print the raw response.
if args.img_url:
    parameters_input = {}
    parameters_mllib = {}
    parameters_output = {'best': 3}
    if args.model_name == 'voc0712':
        # This model additionally gets bounding boxes and a low threshold.
        parameters_output['bbox'] = True
        parameters_output['confidence_threshold'] = 0.01
    data = [args.img_url]
    classify = dd.post_predict(args.model_name, data, parameters_input,
                               parameters_mllib, parameters_output)
    # Function-call form, like the other prints in this script; the output
    # is the same on Python 2 and it also parses on Python 3.
    print(classify)
Ejemplo n.º 12
0
class InformationExtractor(object):
    """ Module with functions for information Extraction """
    wordnet_lemmatizer = WordNetLemmatizer()

    #External service URLs
    google_service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    probase_service_url = "https://concept.research.microsoft.com/api/Concept/ScoreByProb"
    #DD constants
    height = width = 224
    nclasses_clothing = 304
    nclasses_bags = 37
    nclasses_footwear = 51
    nclasses_fabric = 233

    #setting up DD client
    mllib = 'caffe'

    def __init__(self, word_vectors, companies, styles, materials, items,
                 probase_brands, probase_materials, patterns,
                 top_category_items, deep_detectStartup, confFilePath, tfidf):
        """Load configuration and word vectors and store the domain lists.

        Args:
            word_vectors: passed to
                ``gensim.models.KeyedVectors.load_word2vec_format`` with
                ``binary=False``.
            companies, styles, materials, items, patterns: domain lists,
                stored as-is (styles, materials and items are lemmatized by
                :meth:`lemmatize`).
            probase_brands, probase_materials: concept names looked up in
                Probase results by the ``rank_probase_result_*`` methods.
            top_category_items: stored for ``map_candidates_to_ontology``.
            deep_detectStartup: when truthy, a DeepDetect client is created
                from ``deep_detect_host`` / ``deep_detect_port`` in the
                configuration and its services are started.
            confFilePath: path of the JSON configuration file.
            tfidf: mapping indexed by document id in the ``find_closest_*``
                methods; each value is queried by token.
        """
        # Use a context manager so the configuration file is closed
        # deterministically instead of leaking the handle.
        with open(confFilePath) as conf_file:
            self.conf = json.load(conf_file)
        self.tfidf = tfidf
        self.api_key = self.conf["google_api_key_path"]
        self.deep_detect_models = self.conf["deep_detect_models"]
        self.CAPTION_FACTOR = self.conf["caption_factor"]
        self.COMMENTS_FACTOR = self.conf["comments_factor"]
        self.USERTAG_FACTOR = self.conf["usertag_factor"]
        self.HASHTAG_FACTOR = self.conf["hashtag_factor"]
        if deep_detectStartup:
            self.dd = DD(self.conf["deep_detect_host"],
                         port=self.conf["deep_detect_port"])
            self.startup_deep_detect()
        self.wordvec_model = gensim.models.KeyedVectors.load_word2vec_format(
            word_vectors, binary=False)
        self.companies = companies
        self.styles = styles
        self.materials = materials
        self.items = items
        # Keyword lists used by the Google/Wikipedia ranking helpers; they
        # start empty here.
        self.brands_keywords_google = []
        self.materials_keywords_google = []
        self.probase_brands = probase_brands
        self.probase_materials = probase_materials
        self.colors = []
        self.patterns = patterns
        self.top_category_items = top_category_items
        self.lemmatize()

    def lemmatize(self):
        """ Lemmatize domain lists"""
        self.styles_lemmas = {
            self.wordnet_lemmatizer.lemmatize(style): style
            for style in self.styles
        }
        self.materials_lemmas = {
            self.wordnet_lemmatizer.lemmatize(material): material
            for material in self.materials
        }
        self.items_lemmas = {
            self.wordnet_lemmatizer.lemmatize(item): item
            for item in self.items
        }

    def find_closest_semantic(self, caption, comments, tags, hashtags,
                              segmented_hashtags, num, topic, id):
        """Find the num topic entries closest to a post's tokens.

        Every token of every text source is compared with every topic entry
        through :meth:`token_similarity`.  For each token only its ``num``
        best-matching entries contribute, and their similarities are added to
        a running total per entry.

        Args:
            caption, comments, tags, hashtags, segmented_hashtags: iterables
                of tokens, weighted by CAPTION_FACTOR, COMMENTS_FACTOR,
                USERTAG_FACTOR, HASHTAG_FACTOR and HASHTAG_FACTOR
                respectively.
            num: number of entries kept per token and in the final result.
            topic: iterable of candidate strings.
            id: key into ``self.tfidf`` selecting the weights passed to
                :meth:`token_similarity`.

        Returns:
            List of at most ``num`` ``(topic entry, total score)`` tuples,
            highest score first.
        """
        topic = [x.decode('utf-8', 'ignore').encode("utf-8") for x in topic]
        freq_scores = {}
        for x in topic:
            freq_scores[x] = 0.0

        # Lower-casing and lemmatising a topic entry does not depend on the
        # token it is compared with, so do it once per entry.
        lemmatize = self.wordnet_lemmatizer.lemmatize
        candidates = []
        for x in topic:
            lowered = x.lower()
            candidates.append((x, lowered, lemmatize(lowered)))

        # Same processing order as before, so totals accumulate identically.
        sources = (
            (caption, self.CAPTION_FACTOR),
            (comments, self.COMMENTS_FACTOR),
            (hashtags, self.HASHTAG_FACTOR),
            (segmented_hashtags, self.HASHTAG_FACTOR),
            (tags, self.USERTAG_FACTOR),
        )
        for tokens, factor in sources:
            for token in tokens:
                scores = [(entry,
                           self.token_similarity(token, lowered, lemma,
                                                 factor, self.tfidf[id]))
                          for entry, lowered, lemma in candidates]
                best = sorted(scores, reverse=True,
                              key=lambda pair: pair[1])[:num]
                for entry, score in best:
                    freq_scores[entry] = freq_scores[entry] + score

        return sorted(freq_scores.items(), reverse=True,
                      key=lambda pair: pair[1])[:num]

    def token_similarity(self, token, token2, token2Lemma, factor, tfidf):
        """ Returns similarity between two tokens using cosine similarity between embeddings, edit distance and TFIDF weighting

        When both lemmas are in the word-vector vocabulary the score is
        ``factor`` (times 10 when the lemmas have edit distance 0) times the
        squared embedding similarity; otherwise it is
        ``1 / (1 + (factor * edit_distance) ** 2)``.  The result is multiplied
        by the token's tf-idf weight, floored at 0.0001.
        """
        if isinstance(token, str):
            token = token.decode("utf-8", "ignore")
        lemma = self.wordnet_lemmatizer.lemmatize(token)

        vectors = self.wordvec_model.wv
        if lemma in vectors.vocab and token2Lemma in vectors.vocab:
            if edit_distance(lemma, token2Lemma) == 0:
                factor = factor * 10
            cosine = float(vectors.similarity(lemma, token2Lemma))
            score = factor * math.pow(cosine, 2)
        else:
            scaled_distance = factor * edit_distance(lemma, token2Lemma)
            score = float(1) / float(1 + math.pow(scaled_distance, 2))

        # The UTF-8 encoded key is checked last and therefore wins when both
        # forms of the token are present.
        weight = 0.0
        for key in (token, token.encode("utf-8")):
            if key in tfidf:
                weight = tfidf[key]
        return score * max(weight, 0.0001)

    def find_closest_syntactic(self, caption, comments, tags, hashtags,
                               segmented_hashtags, num, topic, id):
        """Find the num topic entries syntactically closest to a post's tokens.

        Every token of every text source is compared with every topic entry
        through :meth:`token_similarity_syntactic_only`.  For each token only
        its ``num`` best-matching entries contribute, and their similarities
        are added to a running total per entry.

        Args:
            caption, comments, tags, hashtags, segmented_hashtags: iterables
                of tokens, weighted by CAPTION_FACTOR, COMMENTS_FACTOR,
                USERTAG_FACTOR, HASHTAG_FACTOR and HASHTAG_FACTOR
                respectively.
            num: number of entries kept per token and in the final result.
            topic: iterable of candidate strings.
            id: key into ``self.tfidf`` selecting the weights passed to the
                similarity function.

        Returns:
            List of at most ``num`` ``(topic entry, total score)`` tuples,
            highest score first.
        """
        topic = [x.decode('utf-8', 'ignore').encode("utf-8") for x in topic]
        freq_scores = {}
        for x in topic:
            freq_scores[x] = 0.0

        # Lower-casing and lemmatising a topic entry does not depend on the
        # token it is compared with, so do it once per entry.
        lemmatize = self.wordnet_lemmatizer.lemmatize
        candidates = []
        for x in topic:
            lowered = x.lower()
            candidates.append((x, lowered, lemmatize(lowered)))

        # Same processing order as before, so totals accumulate identically.
        sources = (
            (caption, self.CAPTION_FACTOR),
            (comments, self.COMMENTS_FACTOR),
            (hashtags, self.HASHTAG_FACTOR),
            (segmented_hashtags, self.HASHTAG_FACTOR),
            (tags, self.USERTAG_FACTOR),
        )
        for tokens, factor in sources:
            for token in tokens:
                scores = [(entry,
                           self.token_similarity_syntactic_only(
                               token, lowered, lemma, factor,
                               self.tfidf[id]))
                          for entry, lowered, lemma in candidates]
                best = sorted(scores, reverse=True,
                              key=lambda pair: pair[1])[:num]
                for entry, score in best:
                    freq_scores[entry] = freq_scores[entry] + score

        return sorted(freq_scores.items(), reverse=True,
                      key=lambda pair: pair[1])[:num]

    def token_similarity_syntactic_only(self, token, token2, token2Lemma,
                                        factor, tfidf):
        """Return similarity of two tokens from edit distance and TF-IDF.

        The score is ``factor / (1 + edit_distance)`` between the lemma of
        ``token`` and ``token2Lemma``, with ``factor`` multiplied by 10 when
        the lemmas are identical, then multiplied by the token's tf-idf
        weight (floored at 0.0001).  ``token2`` is accepted for signature
        parity with :meth:`token_similarity` but is not used.
        """
        tokenLemma = self.wordnet_lemmatizer.lemmatize(token)
        # Compute the edit distance once; it was previously evaluated twice.
        dist = edit_distance(tokenLemma, token2Lemma)
        if dist == 0:
            factor = factor * 10
        similarity = factor * (float(1) / float(1 + dist))

        # The UTF-8 encoded key is checked last and therefore wins when both
        # forms of the token are present.
        tfidf_score = 0.0
        if token in tfidf:
            tfidf_score = tfidf[token]
        encoded = token.encode("utf-8")
        if encoded in tfidf:
            tfidf_score = tfidf[encoded]
        tfidf_score = max(tfidf_score, 0.0001)
        return similarity * tfidf_score

    def lookup_google(self, params):
        """Query the Google Knowledge Graph search service.

        Args:
            params: mapping of query-string parameters, URL-encoded and
                appended to ``google_service_url``.

        Returns:
            One dict per element of the response's ``itemListElement``; each
            dict holds whichever of ``resultScore``, ``detailedDescription``,
            ``description`` and ``url`` the element provided.  Empty list
            when the response has no ``itemListElement``.
        """
        #curl "https://kgsearch.googleapis.com/v1/entities:search?query=bebe&key=<key>&limit=2&indent=True&types=Organization"
        url = self.google_service_url + '?' + urllib.urlencode(params)
        # Close the HTTP response explicitly instead of leaking the socket.
        connection = urllib.urlopen(url)
        try:
            response = json.loads(connection.read())
        finally:
            connection.close()

        results = []
        if "itemListElement" in response:
            for element in response['itemListElement']:
                dict_result = {}
                #result score = an indicator of how well the entity matched the request constraints.
                if "resultScore" in element:
                    dict_result["resultScore"] = element['resultScore']
                if "result" in element:
                    result = element["result"]
                    for field in ("detailedDescription", "description",
                                  "url"):
                        if field in result:
                            dict_result[field] = result[field]
                results.append(dict_result)
        return results

    def rank_google_result_company(self, results):
        """ Binary rank  of google search results"""
        for result in results:
            for keyword in self.brands_keywords_google:
                if "detailedDescription" in result:
                    if keyword in result["detailedDescription"]:
                        return 1
                if "description" in result:
                    if keyword in result["description"]:
                        return 1
        return 0.0

    def rank_google_result_material(self, results):
        """ Binary rank  of google search results"""
        for result in results:
            for keyword in self.materials_keywords_google:
                if keyword in result[
                        "detailedDescription"] or keyword in result[
                            "description"]:
                    return 1
        return 0.0

    def rank_probase_result_company(self, result):
        """Probase probability ranking [0,1]"""
        keywords = filter(lambda x: x in result, self.probase_brands)
        keywords = map(lambda x: result[x], keywords)
        if len(keywords) > 0:
            return 1 + max(keywords)
        else:
            return 0.5

    def rank_probase_result_material(self, result):
        """Probase probability ranking [0,1]"""
        keywords = filter(lambda x: x in result, self.probase_materials)
        keywords = map(lambda x: result[x], keywords)
        if len(keywords) > 0:
            return 1 + max(keywords)
        else:
            return 0.5

    def lookup_probase(self, params):
        """Query the Probase concept service.

        Args:
            params: mapping of query-string parameters, URL-encoded and
                appended to ``probase_service_url``.

        Returns:
            The decoded JSON response.
        """
        #curl "https://concept.research.microsoft.com/api/Concept/ScoreByProb?instance=adidas&topK=10"
        url = self.probase_service_url + '?' + urllib.urlencode(params)
        # Close the HTTP response explicitly instead of leaking the socket.
        connection = urllib.urlopen(url)
        try:
            return json.loads(connection.read())
        finally:
            connection.close()

    def get_liketoknowitlinks(self, tokens):
        """Extract liketoknow.it links from tokens.

        Args:
            tokens: iterable of strings.

        Returns:
            For every token containing one, the first
            ``http://liketk.it/...`` link found in it (up to the next
            whitespace), in token order.
        """
        # Raw string avoids the invalid "\s" escape; the dot is escaped so it
        # only matches a literal "." instead of any character.
        pattern = re.compile(r"http://liketk\.it/(\S+)")
        links = []
        for token in tokens:
            match = pattern.search(token)
            if match is not None:
                links.append(match.group(0))
        return links

    def lda_topic_models(self, num_topics, num_iter, min_occ, docs):
        """Fit an LDA topic model and summarise each topic.

        Args:
            num_topics: number of topics (``n_topics`` of ``lda.LDA``).
            num_iter: number of iterations (``n_iter`` of ``lda.LDA``).
            min_occ: ``min_df`` of the ``CountVectorizer``.
            docs: documents to vectorise (English stop words are removed).

        Returns:
            One string per topic: its 5 highest-weighted words joined by
            spaces, strongest first.
        """
        cvectorizer = CountVectorizer(min_df=min_occ, stop_words="english")
        cvz = cvectorizer.fit_transform(docs)
        lda_model = lda.LDA(n_topics=num_topics, n_iter=num_iter)
        # Only the fitted model is needed; the per-document topic
        # assignments that used to be collected here were never read.
        lda_model.fit_transform(cvz)

        n_top_words = 5
        # Convert the vocabulary once instead of once per topic.
        vocab = np.array(cvectorizer.get_feature_names())
        topic_summaries = []
        for topic_dist in lda_model.topic_word_:
            # argsort is ascending, so walk backwards from the end to take
            # the n_top_words heaviest words.
            topic_words = vocab[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
            topic_summaries.append(' '.join(topic_words))
        return topic_summaries

    def get_top_num(self, coll, num):
        """Return the ``num`` most frequent items of ``coll``.

        Items are ordered as ``Counter.most_common`` orders them (most
        frequent first).  An empty collection yields an empty list instead of
        failing on the unpacking of an empty ``zip``.
        """
        return [item for item, _count in Counter(coll).most_common(num)]

    def get_wikipedia_vote(self, query):
        """Binary Wikipedia vote for a query.

        Searches Wikipedia for ``query`` and returns 1 as soon as the
        lower-cased content of a result page contains one of
        ``brands_keywords_google``.  Returns 0.0 when no page matches or when
        loading a page fails.
        """
        pages = wikipedia.search(query)
        for pageName in pages:
            try:
                page = wikipedia.page(pageName)
                content = page.content.lower()
                for keyword in self.brands_keywords_google:
                    if keyword in content:
                        return 1
            except Exception:
                # Catch Exception only, so KeyboardInterrupt/SystemExit are
                # no longer swallowed.
                # NOTE(review): the first failing page ends the whole lookup;
                # confirm whether remaining pages should still be tried.
                return 0.0
        return 0.0

    def get_google_search_vote(self, query):
        """Binary Google-search vote for a query.

        Returns 1 as soon as the lower-cased text or title of a search result
        contains one of ``brands_keywords_google``; returns 0 when nothing
        matches or when the search fails.
        """
        try:
            response = GoogleSearch().search(query)
            for result in response.results:
                text = result.getText().lower()
                title = result.title.lower()
                for keyword in self.brands_keywords_google:
                    if keyword in text or keyword in title:
                        return 1
        except Exception:
            # Best effort: a failed search counts as "no vote".  Exception
            # (not a bare except) keeps Ctrl-C and sys.exit() working.
            return 0
        return 0

    def emoji_classification(self, emojis, num):
        """ Emoji classification """
        items = {}
        for item in self.items_lemmas.keys():
            items[item] = 0.0
        for emoji in emojis:
            item_matches = self.emoji_to_item(emoji)
            for item_m in item_matches:
                items[item_m] = items[item_m] + 1
        top = sorted([(k, v) for k, v in items.iteritems()],
                     reverse=True,
                     key=lambda x: x[1])[:num]
        return top

    def emoji_to_item(self, token):
        """Map a clothing emoji to the item names it stands for.

        Args:
            token: a single emoji token.

        Returns:
            List of item names for a known emoji, otherwise an empty list.
        """
        # (emojis, items) pairs, checked in order.  The backpack and sandal
        # emojis were previously only matched with a trailing space; both
        # spellings are accepted now.
        emoji_items = (
            ((u"👕",), ["shirt", "top"]),
            ((u"👖",), ["jean", "trouser", "legging", "jogger"]),
            ((u"👗",), ["dress"]),
            ((u"👚",), ["blouse", "shirt"]),
            # This entry used to lack its ``return`` and fell through to [].
            ((u"👛",), ["purse", "bag", "handbag"]),
            ((u"👜",), ["bag", "handbag"]),
            ((u"👝", u"🎒", u"🎒 "), ["bag"]),
            ((u"👞",), ["shoe", "boot"]),
            ((u"👟",), ["trainer", "shoe", "boot"]),
            ((u"👠", u"👡", u"👡 ", u"👢"), ["heel", "shoe"]),
            ((u"👒", u"🎩"), ["hat"]),
        )
        for emojis, items in emoji_items:
            if token in emojis:
                return items
        return []

    def map_candidates_to_ontology(self, candidates):
        """Map candidates from external APIs to our classes.

        Each entry of ``top_category_items`` has the form
        ``"label,word word ..."``.  For every candidate and every entry, the
        best similarity between the candidate text and any of the entry's
        words (scaled by the squared candidate score) is added to the
        entry's label.

        Args:
            candidates: iterable of ``(text, score)`` pairs.

        Returns:
            Dict mapping each label to its accumulated score.
        """
        topic = [x.decode('utf-8', 'ignore').encode("utf-8")
                 for x in self.top_category_items]
        freq_scores = {}
        for x in topic:
            freq_scores[x.split(",")[0]] = 0.0

        # Parse and lemmatise the class descriptions once, not per candidate.
        lemmatize = self.wordnet_lemmatizer.lemmatize
        classes = []
        for x in topic:
            parts = x.split(",")
            lowered = [word.lower() for word in parts[1].split(" ")]
            classes.append((parts[0],
                            [(word, lemmatize(word)) for word in lowered]))

        for token in candidates:
            text, confidence = token[0], token[1]
            # token_similarity requires a tf-idf mapping, which was missing
            # from this call (TypeError).  Candidates come from external APIs
            # and have no document tf-idf, so give the candidate a neutral
            # weight of 1.0.
            neutral_tfidf = {text: 1.0}
            for label, words in classes:
                scores = [
                    self.token_similarity(text, word, lemma,
                                          self.CAPTION_FACTOR, neutral_tfidf)
                    * math.pow(confidence, 2)
                    for word, lemma in words
                ]
                freq_scores[label] = freq_scores[label] + max(scores)
        return freq_scores

    def liketkit_classification(self, url):
        """Scrape the text of the product pages linked from a liketk.it page.

        Opens ``url`` in a PhantomJS driver, collects the ``href`` of every
        element below the ``ltk-products`` element, loads each linked page
        and gathers all of its text nodes.

        Returns:
            List of text nodes collected so far; on any error the message
            "error in liketkit classification" is printed and whatever was
            gathered up to that point is returned.
        """
        text = []
        driver = None
        try:
            driver = webdriver.PhantomJS()
            driver.get(url)
            p_element = driver.find_element_by_class_name("ltk-products")
            products = p_element.find_elements_by_xpath(".//*")
            product_urls = [prod.get_attribute("href") for prod in products]
            for product_url in product_urls:
                # Descendants without an href yield nothing to load.
                if not product_url:
                    continue
                driver.get(product_url)
                soup = BeautifulSoup(driver.page_source, "lxml")
                text.extend(list(soup.findAll(text=True, recursive=True)))
        except Exception:
            print("error in liketkit classification")
        finally:
            # Always shut the browser process down; it used to be leaked.
            if driver is not None:
                try:
                    driver.quit()
                except Exception:
                    pass
        # Returned after the loop: previously the function returned after the
        # first product page and returned None when there were no products.
        return text

    def google_vision_lookup(self, imagePath):
        """Label an image with the Google Vision API.

        Args:
            imagePath: image path, joined onto the directory of this module.

        Returns:
            List of ``(label description, label score)`` tuples; on any error
            "error in google_vision_LF" is printed and the labels gathered so
            far (normally none) are returned.
        """
        item_candidates = []
        try:
            # Instantiates a client
            client = vision.ImageAnnotatorClient()

            # The name of the image file to annotate
            file_name = os.path.join(os.path.dirname(__file__), imagePath)

            # Loads the image into memory
            with io.open(file_name, 'rb') as image_file:
                content = image_file.read()

            image = types.Image(content=content)

            # Performs label detection on the image file
            response = client.label_detection(image=image)
            for label in response.label_annotations:
                item_candidates.append((label.description, label.score))
        except Exception:
            # Best effort; Exception (not a bare except) keeps Ctrl-C and
            # sys.exit() working.
            print("error in google_vision_LF")
        return item_candidates

    def deep_detect_lookup(self, link):
        """ Deep detect local lookup"""
        items_and_fabrics = {}
        items_and_fabrics["items"] = []
        items_and_fabrics["fabrics"] = []
        try:
            parameters_input = {}
            parameters_mllib = {}
            parameters_output = {'best': 10}
            data = [link]
            clothing_res = self.dd.post_predict(self.sname_clothing, data,
                                                parameters_input,
                                                parameters_mllib,
                                                parameters_output)
            body = clothing_res[u"body"]
            predictions = body[u"predictions"]
            classes = predictions[0][u"classes"]
            for c in classes:
                items = c[u"cat"].strip(" ").split(",")
                prob = c[u"prob"]
                for item in items:
                    items_and_fabrics["items"].append((item, prob))

            bags_res = self.dd.post_predict(self.sname_bags, data,
                                            parameters_input, parameters_mllib,
                                            parameters_output)
            body = bags_res[u"body"]
            predictions = body[u"predictions"]
            classes = predictions[0][u"classes"]
            for c in classes:
                items = c[u"cat"].strip(" ").split(",")
                prob = c[u"prob"]
                for item in items:
                    items_and_fabrics["items"].append((item, 0.5 * prob))

            footwear_res = self.dd.post_predict(self.sname_footwear, data,
                                                parameters_input,
                                                parameters_mllib,
                                                parameters_output)
            body = footwear_res[u"body"]
            predictions = body[u"predictions"]
            classes = predictions[0][u"classes"]
            for c in classes:
                items = c[u"cat"].strip(" ").split(",")
                prob = c[u"prob"]
                for item in items:
                    items_and_fabrics["items"].append((item, 0.5 * prob))

            fabric_res = self.dd.post_predict(self.sname_fabric, data,
                                              parameters_input,
                                              parameters_mllib,
                                              parameters_output)
            body = fabric_res[u"body"]
            predictions = body[u"predictions"]
            classes = predictions[0][u"classes"]
            for c in classes:
                items = c[u"cat"].strip(" ").split(",")
                prob = c[u"prob"]
                for item in items:
                    items_and_fabrics["fabrics"].append((item, prob))
            return items_and_fabrics
        except:
            print("error in deep_detect_LF")
            return items_and_fabrics

    def startup_deep_detect(self):
        """ Startup services for deep detect classification """
        self.dd.set_return_format(self.dd.RETURN_PYTHON)
        for model in self.deep_detect_models:
            m = {"repository": model["path"]}
            parameters_input = {
                'connector': 'image',
                'width': self.width,
                'height': self.height
            }
            parameters_mllib = {'nclasses': self.nclasses_clothing}
            parameters_output = {}
            self.dd.put_service(model["name"], model, model["description"],
                                self.mllib, parameters_input, parameters_mllib,
                                parameters_output)

    def deepomatic_lookup(self, link):
        """Detect fashion items in an image with the Deepomatic API.

        Submits ``link`` for fashion detection and polls the task up to 10
        times, 100 ms apart.  When the task succeeds, every detected box
        label is returned with the sum of its ``proba`` values.

        Returns:
            List of ``(label, summed probability)`` tuples; empty when the
            task did not succeed within the polling window.  On any error
            "error in deepomaticLF" is printed and the candidates gathered so
            far are returned.
        """
        item_candidates = []
        try:
            client = Client(529372386976, self.conf["deepomatic_api_key"])
            task = client.helper.get("/detect/fashion/?url=" + link)
            taskid = task[u"task_id"]
            for _attempt in range(10):
                sleep(0.1)  #100ms
                res = client.helper.get("/tasks/" + str(taskid) + "/")
                task = res[u"task"]
                if task[u"status"] != u"success":
                    continue
                boxes = task[u"data"][u"boxes"]
                for item in boxes.keys():
                    probability = sum(
                        (inf[u"proba"] for inf in boxes[item]), 0.0)
                    item_candidates.append(
                        (item.encode("utf-8"), probability))
                # Result received: stop polling.
                break
        except Exception:
            # Best effort; Exception (not a bare except) keeps Ctrl-C and
            # sys.exit() working.
            print("error in deepomaticLF")
        return item_candidates

    def clarifai_lookup(self, link):
        """Detect apparel concepts in an image with the Clarifai API.

        Runs the ``apparel`` model on the image at ``link``.  Each concept
        name is split on spaces and every part is returned with the
        concept's value.

        Returns:
            List of ``(name part, value)`` tuples.  On any error
            "error in clarifai LF" is printed and the candidates gathered so
            far are returned.
        """
        item_candidates = []
        try:
            app = ClarifaiApp(api_key=self.conf["clarifai_api_key"])
            model = app.models.get('apparel')
            image = ClImage(url=link)
            res = model.predict([image])
            for output in res[u"outputs"]:
                for concept in output[u"data"][u"concepts"]:
                    concept_parts = concept[u"name"].encode("utf-8").split(" ")
                    val = concept[u"value"]
                    for part in concept_parts:
                        item_candidates.append((part, val))
        except Exception:
            # Best effort; Exception (not a bare except) keeps Ctrl-C and
            # sys.exit() working.
            print("error in clarifai LF")
        return item_candidates

    def find_closest_semantic_hierarchy(self, caption, comments, tags,
                                        hashtags, topic, id, num):
        """Find the num closest topic labels, with several words per topic.

        Each topic entry has the form ``"label,word word ..."``.  For every
        token of every text source and every entry, the best
        :meth:`token_similarity` between the token and any of the entry's
        words is added to the entry's label.

        Args:
            caption, comments, tags, hashtags: iterables of tokens, weighted
                by CAPTION_FACTOR, COMMENTS_FACTOR, USERTAG_FACTOR and
                HASHTAG_FACTOR respectively.
            topic: iterable of ``"label,word word ..."`` strings.
            id: key into ``self.tfidf`` selecting the weights passed to
                :meth:`token_similarity`.
            num: maximum number of labels to return.

        Returns:
            List of at most ``num`` ``(label, total score)`` tuples, highest
            score first.
        """
        topic = [x.decode('utf-8', 'ignore').encode("utf-8") for x in topic]
        freq_scores = {}
        for x in topic:
            freq_scores[x.split(",")[0]] = 0.0

        # Parse and lemmatise the topic entries once instead of once per
        # token.
        lemmatize = self.wordnet_lemmatizer.lemmatize
        classes = []
        for x in topic:
            parts = x.split(",")
            lowered = [word.lower() for word in parts[1].split(" ")]
            classes.append((parts[0],
                            [(word, lemmatize(word)) for word in lowered]))

        # Same processing order as before.  The user-tag pass used to add
        # every word similarity on top of the maximum, unlike the other three
        # passes; all sources now score by the best-matching word only.
        sources = (
            (caption, self.CAPTION_FACTOR),
            (comments, self.COMMENTS_FACTOR),
            (hashtags, self.HASHTAG_FACTOR),
            (tags, self.USERTAG_FACTOR),
        )
        for tokens, factor in sources:
            for token in tokens:
                for label, words in classes:
                    scores = [
                        self.token_similarity(token, word, lemma, factor,
                                              self.tfidf[id])
                        for word, lemma in words
                    ]
                    freq_scores[label] = freq_scores[label] + max(scores)

        return sorted(freq_scores.items(), reverse=True,
                      key=lambda pair: pair[1])[:num]

    def find_closest_syntactic_hierarchy(self, caption, comments, tags,
                                         hashtags, topic, id, num):
        """Find the num syntactically closest candidates for a given topic.

        Every topic entry is a string shaped like "label,word word ...": the
        first comma-separated field is the label and the second field holds
        the space-separated words that stand for that label.

        Each token from caption, comments, hashtags and tags (processed in
        that order) is compared against every word of every entry with
        self.token_similarity_syntactic_only, using the matching
        *_FACTOR attribute and self.tfidf[id]. For each (token, entry) pair
        the best word similarity is added to the entry's label score.

        Args:
            caption: iterable of caption tokens.
            comments: iterable of comment tokens.
            tags: iterable of user-tag tokens.
            hashtags: iterable of hashtag tokens.
            topic: iterable of "label,words" entries (text or UTF-8 bytes;
                undecodable bytes are dropped).
            id: key used to look up self.tfidf.
            num: maximum number of results to return.

        Returns:
            A list of at most num (label, score) tuples, highest score first.

        Raises:
            IndexError: if a token has to be scored against an entry that
                contains no comma.
        """

        def clean(entry):
            # Python 3 text has no .decode and needs no clean-up.
            if not hasattr(entry, 'decode'):
                return entry
            text = entry.decode('utf-8', 'ignore')
            # On Python 2 (where str is bytes) return a byte string exactly
            # as before; on Python 3 keep text so that split(",") works.
            return text.encode('utf-8') if str is bytes else text

        # A real list, not a lazy map object: the entries are walked once
        # per token below.
        entries = [clean(x) for x in topic]

        labels = []
        freq_scores = {}
        for entry in entries:
            label = entry.split(",")[0]
            labels.append(label)
            freq_scores[label] = 0.0

        # position in entries -> [(lowercased word, lemma), ...].  Filled on
        # first use so each word is lemmatized once rather than once per
        # token, and so malformed entries only fail when actually scored.
        words_by_entry = {}

        def topic_words(position):
            if position not in words_by_entry:
                pairs = []
                for word in entries[position].split(",")[1].split(" "):
                    token2 = word.lower()
                    pairs.append(
                        (token2, self.wordnet_lemmatizer.lemmatize(token2)))
                words_by_entry[position] = pairs
            return words_by_entry[position]

        # (tokens, weighting factor, also add every per-word similarity?)
        # NOTE(review): user tags add each word's similarity *and* the best
        # one, unlike the other sources. Kept as in the original code --
        # confirm whether this extra weighting is intended.
        sources = (
            (caption, self.CAPTION_FACTOR, False),
            (comments, self.COMMENTS_FACTOR, False),
            (hashtags, self.HASHTAG_FACTOR, False),
            (tags, self.USERTAG_FACTOR, True),
        )

        for tokens, factor, add_every_word in sources:
            for token in tokens:
                for position, label in enumerate(labels):
                    acc_sim = 0
                    scores = []
                    for token2, token2Lemma in topic_words(position):
                        similarity = self.token_similarity_syntactic_only(
                            token, token2, token2Lemma, factor,
                            self.tfidf[id])
                        scores.append(similarity)
                        if add_every_word:
                            acc_sim = acc_sim + similarity
                    acc_sim = acc_sim + max(scores)
                    freq_scores[label] = freq_scores[label] + acc_sim

        # items() exists on both Python 2 and 3 (iteritems() does not).
        ranked = sorted(freq_scores.items(),
                        key=lambda pair: pair[1],
                        reverse=True)
        return ranked[:num]
Ejemplo n.º 13
0
    'width': args.input_size
}
# Service-creation settings: request the 'gpt2' template with GPU enabled.
parameters_mllib = {'template': 'gpt2', 'gpu': True}
parameters_output = {}
# Create the service through the dd client with the parameters built above.
dd.put_service(sname, model, description, mllib, parameters_input,
               parameters_mllib, parameters_output)

# generating text
# The text typed by the user seeds the generation loop that follows.
prompt = input("Enter beginning of sentence >>> ")

for i in range(0, 256):
    data = [prompt]
    parameters_input = {'word_start': "Ġ", 'suffix_start': ""}
    parameters_mllib = {}
    parameters_output = {'best': args.topk}
    result = dd.post_predict(sname, data, parameters_input, parameters_mllib,
                             parameters_output)

    # Select result from the returned tokens
    word_probs = list()
    total_probs = 0

    for cls in result['body']['predictions'][0]['classes']:
        word = cls['cat'].replace("Ġ", " ")
        # dede does not support \n character well, so we don't select tokens containing a new line
        if 'Ċ' in word:
            continue

        prob = pow(cls['prob'], args.temperature)
        total_probs += prob
        word_probs.append((word, prob))