コード例 #1
0
def convert_batch(data, output=None):
    """ Converts every row of a batch and queues the converted rows

        args:
            data: an iterable of dictionaries; each value of a row is
                  passed through pyrdf
            output: object with a 'put' method (for example a queue). It
                    receives one single-item list per converted row.
                    NOTE(review): the default of None cannot be used as
                    soon as data holds a row - confirm callers always
                    supply it
    """
    print("starting")
    for row in data:
        converted_row = {}
        for key, value in row.items():
            converted_row[key] = pyrdf(value)
        # every row is wrapped in its own list before it is queued
        output.put([converted_row])
    print("converted")
コード例 #2
0
def convert_batch(data, output=None):
    """ Converts every row of a batch and queues the converted rows

        args:
            data: an iterable of dictionaries; each value of a row is
                  passed through pyrdf
            output: object with a 'put' method (for example a queue). It
                    receives one single-item list per converted row.
                    NOTE(review): the default of None cannot be used as
                    soon as data holds a row - confirm callers always
                    supply it
    """
    print("starting")
    for row in data:
        converted_row = {}
        for key, value in row.items():
            converted_row[key] = pyrdf(value)
        # every row is wrapped in its own list before it is queued
        output.put([converted_row])
    print("converted")
コード例 #3
0
    def _convert_results(self, data, **kwargs):
        """ converts the results of a query to RdfDatatype instances

            args:
                data: a list of triples
        """

        if kwargs.get("multiprocessing", False):
            m = mp.Manager()
            output = m.Queue()
            pdb.set_trace()
            # processes = [mp.Process(target=convert_row_main,
            #                         args=(row, output,))
            #              for row in data]
            # # Run processes
            # for p in processes:
            #     p.start()

            # # Exit the completed processes
            # for p in processes:
            #     p.join()
            # # Get process results from the output queue
            # return [output.get() for p in processes]

            pool = mp.Pool(processes=pool_size)
            for i, row in enumerate(data):
                for key, val in row.items():
                    try:
                        pool.apply(convert_row_main,
                                   args=(
                                       val,
                                       i,
                                       key,
                                       output,
                                   ))
                    except:
                        pass  #
            # run = [pool.apply(convert_row_main, args=(row, i, output))
            #        for i, row in enumerate(data)]
            for item in output:
                pdb.set_trace()
            return output
            # with multiprocessing.Pool(processes=pool_size) as pool:
            #     results = [convert_row_main, (row,))
            #                for row in data]
            #     converted = [r.get() for r in results]
            # return converted #pool_outputs
        else:
            return [{key: pyrdf(value)
                     for key, value in row.items()} for row in data]
コード例 #4
0
    def add_triple(self, sub, pred=None, obj=None, **kwargs):
        """ Adds a triple to the dataset

            args:
                sub: the subject of the triple, or a dictionary holding the
                     whole triple
                pred: predicate of the triple; replaced by the dictionary
                      entry when sub is a dictionary
                obj: object of the triple; replaced by the dictionary entry
                     when sub is a dictionary

            kwargs:
                map: Optional, a dictionary mapping for a supplied
                     dictionary; all kwargs are handed to __set_map__
                strip_orphans: Optional, when true the triple is skipped if
                               its object is of a related type and the
                               dataset has no entry for it
                obj_method: not read by this method itself; it travels on
                            with the other kwargs
        """
        self.__set_map__(**kwargs)
        skip_orphans = kwargs.get("strip_orphans", False)
        if isinstance(sub, (DictClass, dict)):
            # pull the three parts out of the dictionary; the subject is
            # read last because it replaces the dictionary reference
            triple = sub
            pred = triple[self.pmap]
            obj = triple[self.omap]
            sub = triple[self.smap]

        pred, obj, sub = pyrdf(pred), pyrdf(obj), pyrdf(sub)
        if obj.type in self.relate_obj_types:
            if skip_orphans and not self.get(obj):
                return
            # use the instance already stored in the dataset when present
            obj = self.get(obj, obj)
        try:
            self[sub].add_property(pred, obj)
        except KeyError:
            # subject not stored yet: create it, then attach the property
            self[sub] = RdfClassBase(sub, self, **kwargs)
            self[sub].add_property(pred, obj)
コード例 #5
0
    def add_triple(self, sub, pred=None, obj=None, **kwargs):
        """ Adds a triple to the dataset

            args:
                sub: the subject of the triple, or a dictionary holding the
                     whole triple
                pred: predicate of the triple; replaced by the dictionary
                      entry when sub is a dictionary
                obj: object of the triple; replaced by the dictionary entry
                     when sub is a dictionary

            kwargs:
                map: Optional, a dictionary mapping for a supplied
                     dictionary; all kwargs are handed to __set_map__
                strip_orphans: Optional, when true the triple is skipped if
                               its object is of a related type and the
                               dataset has no entry for it
                obj_method: not read by this method itself; it travels on
                            with the other kwargs
        """
        self.__set_map__(**kwargs)
        skip_orphans = kwargs.get("strip_orphans", False)
        if isinstance(sub, (DictClass, dict)):
            # pull the three parts out of the dictionary; the subject is
            # read last because it replaces the dictionary reference
            triple = sub
            pred = triple[self.pmap]
            obj = triple[self.omap]
            sub = triple[self.smap]

        pred, obj, sub = pyrdf(pred), pyrdf(obj), pyrdf(sub)
        if obj.type in self.relate_obj_types:
            if skip_orphans and not self.get(obj):
                return
            # use the instance already stored in the dataset when present
            obj = self.get(obj, obj)
        try:
            self[sub].add_property(pred, obj)
        except KeyError:
            # subject not stored yet: create it, then attach the property
            self[sub] = RdfClassBase(sub, self, **kwargs)
            self[sub].add_property(pred, obj)
コード例 #6
0
def convert_row_main(val, i, key, output):
    """ Converts one value and queues it together with its position

        args:
            val: the raw value handed to pyrdf
            i: position marker stored first in the queued tuple
            key: key stored second in the queued tuple
            output: object with a 'put' method that receives the tuple
                    (i, key, converted value)
    """
    converted = pyrdf(val)
    output.put((i, key, converted))
コード例 #7
0
def convert_results(data, **kwargs):
    """ converts the results of a query to RdfDatatype instances

        args:
            data: a list of triples

        kwargs:
            multiprocessing: Optional, when True the rows are split into at
                             most pool_size groups and every group is
                             converted by its own worker process running
                             convert_batch

        returns:
            a list of dictionaries, one per row, whose values are the
            results of pyrdf. In multiprocessing mode the rows are
            collected in the order in which they arrive on the queue.
            Empty data gives an empty list in both modes.
    """
    if not kwargs.get("multiprocessing", False):
        return [{key: pyrdf(value)
                 for key, value in row.items()} for row in data]

    data_l = len(data)
    if not data_l:
        # Without this guard group_size would be 0 and the range() call
        # below would raise ValueError because of its zero step.
        return []

    manager = SharedManager()
    manager.register("BaseRdfDataType", BaseRdfDataType)
    manager.register("Uri", Uri)

    # ceiling division: round the group size up when rows are left over
    group_size = data_l // pool_size
    if data_l % pool_size:
        group_size += 1
    split_data = [data[i:i + group_size]
                  for i in range(0, data_l, group_size)]
    output = manager.Queue()
    workers = [mp.Process(target=convert_batch, args=(item, output))
               for item in split_data]
    for worker in workers:
        worker.start()
    results = []
    while True:
        # liveness is sampled before the queue is drained, so the loop only
        # stops after a drain that started with every worker finished
        running = any(worker.is_alive() for worker in workers)
        while not output.empty():
            # each queue item is a list of converted rows
            results += output.get()
        if not running:
            break
    print("Finished - workers not stoped")
    for worker in workers:
        worker.join()
    # collect anything that is still left on the queue
    for _ in range(output.qsize()):
        results += output.get()
    return results
コード例 #8
0
    def _convert_results(self, data, **kwargs):
        """ converts the results of a query to RdfDatatype instances

            args:
                data: a list of triples
        """

        if kwargs.get("multiprocessing", False):
            m = mp.Manager()
            output = m.Queue()
            pdb.set_trace()
            # processes = [mp.Process(target=convert_row_main,
            #                         args=(row, output,))
            #              for row in data]
            # # Run processes
            # for p in processes:
            #     p.start()

            # # Exit the completed processes
            # for p in processes:
            #     p.join()
            # # Get process results from the output queue
            # return [output.get() for p in processes]

            pool = mp.Pool(processes=pool_size)
            for i, row in enumerate(data):
                for key, val in row.items():
                    try:
                        pool.apply(convert_row_main, args=(val, i, key, output,))
                    except:
                        pass #
            # run = [pool.apply(convert_row_main, args=(row, i, output))
            #        for i, row in enumerate(data)]
            for item in output:
                pdb.set_trace()
            return output
            # with multiprocessing.Pool(processes=pool_size) as pool:
            #     results = [convert_row_main, (row,))
            #                for row in data]
            #     converted = [r.get() for r in results]
            # return converted #pool_outputs
        else:
            return [{key:pyrdf(value) for key, value in row.items()}
                    for row in data]
コード例 #9
0
def convert_results(data, **kwargs):
    """ converts the results of a query to RdfDatatype instances

        args:
            data: a list of triples

        kwargs:
            multiprocessing: Optional, when True the rows are split into at
                             most pool_size groups and every group is
                             converted by its own worker process running
                             convert_batch

        returns:
            a list of dictionaries, one per row, whose values are the
            results of pyrdf. In multiprocessing mode the rows are
            collected in the order in which they arrive on the queue.
            Empty data gives an empty list in both modes.
    """
    if not kwargs.get("multiprocessing", False):
        return [{key: pyrdf(value)
                 for key, value in row.items()} for row in data]

    data_l = len(data)
    if not data_l:
        # Without this guard group_size would be 0 and the range() call
        # below would raise ValueError because of its zero step.
        return []

    manager = SharedManager()
    manager.register("BaseRdfDataType", BaseRdfDataType)
    manager.register("Uri", Uri)

    # ceiling division: round the group size up when rows are left over
    group_size = data_l // pool_size
    if data_l % pool_size:
        group_size += 1
    split_data = [data[i:i + group_size]
                  for i in range(0, data_l, group_size)]
    output = manager.Queue()
    workers = [mp.Process(target=convert_batch, args=(item, output))
               for item in split_data]
    for worker in workers:
        worker.start()
    results = []
    while True:
        # liveness is sampled before the queue is drained, so the loop only
        # stops after a drain that started with every worker finished
        running = any(worker.is_alive() for worker in workers)
        while not output.empty():
            # each queue item is a list of converted rows
            results += output.get()
        if not running:
            break
    print("Finished - workers not stoped")
    for worker in workers:
        worker.join()
    # collect anything that is still left on the queue
    for _ in range(output.qsize()):
        results += output.get()
    return results
コード例 #10
0
def convert_row_main(val, i, key, output):
    """ Converts one value and queues it together with its position

        args:
            val: the raw value handed to pyrdf
            i: position marker stored first in the queued tuple
            key: key stored second in the queued tuple
            output: object with a 'put' method that receives the tuple
                    (i, key, converted value)
    """
    converted = pyrdf(val)
    output.put((i, key, converted))
コード例 #11
0
def get_json_qry_item(dataset, param, no_key=False):
    """ Reads the parameter and returns the selected element(s)

    args:
        dataset: the dataset to search
        param: the parameter to search by. "*" returns the dataset as is;
               a string starting with "=" yields a one-item list with the
               converted remainder of the string (or an empty list when
               the dataset is empty)
        no_key: whether to skip using the selector of 'param' as a key
                into the dataset. This is passed as True on the recursive
                call, after the key has already been used to select a
                subset of the dataset
    """

    def unwrap_single(value):
        """ returns the only item of a one-item list, else the value """
        if isinstance(value, list) and len(value) == 1:
            return value[0]
        return value

    def flatten_unique(value):
        """ flattens one level of nested lists and removes duplicates when
            all items are hashable; non-lists are returned unchanged """
        if not isinstance(value, list):
            return value
        flat = []
        for item in value:
            if isinstance(item, list):
                flat.extend(item)
            else:
                flat.append(item)
        try:
            return list(set(flat))
        except TypeError:
            # unhashable items: keep the flattened list as it is
            return flat

    def matches(elem, filter_tup):
        """ tests an element against an (attribute, value, operator)
            filter; only the '=' operator is evaluated here, every other
            operator gives None """
        if isinstance(elem, dict):
            candidates = elem.get(filter_tup[0], [])
        else:
            candidates = elem
        if filter_tup[2] == '=':
            try:
                if elem.subject == filter_tup[1]:
                    return True
            except AttributeError:
                pass
            hits = [item for item in candidates
                    if (isinstance(item, dict)
                        and item.subject == filter_tup[1])
                    or item == filter_tup[1]]
            return bool(hits)
        return None

    def select_from_list(items, key, filter_tup):
        """ picks the 'key' values out of a list of elements and applies
            the optional filter """
        selected = items
        if key:
            picked = []
            for elem in items:
                reduced = unwrap_single(elem)
                if isinstance(reduced, dict) and reduced.get(key):
                    picked.append(unwrap_single(reduced[key]))
            selected = flatten_unique(picked)
        if not filter_tup:
            return selected
        return [elem for elem in selected if matches(elem, filter_tup)]

    def select_from_dict(mapping, key, filter_tup):
        """ picks the 'key' entry out of a dictionary and applies the
            optional '=' / '!=' filter to its elements """
        source = mapping.get(key, []) if key else mapping
        if not filter_tup:
            return flatten_unique(source)
        found = []
        for elem in source:
            operator = filter_tup[2]
            if operator == "=":
                if filter_tup[1] in elem.get(filter_tup[0], []):
                    if isinstance(elem, list):
                        found += elem
                    else:
                        found.append(elem)
            elif operator == "!=":
                if filter_tup[1] not in elem.get(filter_tup[0], []):
                    found.append(elem)
        return found

    def get_dataset_vals(ds, key, filter_tup=tuple()):
        """ dispatches on the container type; anything that is neither a
            list nor a dict gives None """
        if isinstance(ds, list):
            return select_from_list(ds, key, filter_tup)
        if isinstance(ds, dict):
            return select_from_dict(ds, key, filter_tup)
        return None

    if param == "*":
        return dataset
    try:
        if param.startswith("="):
            # an empty dataset is considered a false match
            return [pyrdf(param[1:])] if dataset else []
    except AttributeError:
        # param is not a string; continue with the parsed forms
        pass
    param = getattr(param, 'parsed_tree', param)

    if hasattr(param, 'selector'):
        key = None if no_key else get_element(param.selector)
        rtn_obj = None
        if hasattr(param, 'ident'):
            if key:
                rtn_obj = get_dataset_vals(
                    dataset, key, ('rdf_type', param.ident, "="))
            elif param.ident in dataset.get('rdf_type', []):
                rtn_obj = dataset
            else:
                rtn_obj = [value for value in dataset.values()
                           if param.ident in value.get('rdf_type', [])]
        elif hasattr(param, 'attrib'):
            rtn_obj = get_dataset_vals(
                dataset, key, (param.attrib, param.value, param.operator))
        if rtn_obj is None:
            return dataset[key] if key else dataset
        if hasattr(param.selector, 'selector') and rtn_obj:
            # nested selector: continue on the subset, the key is used up
            rtn_obj = get_json_qry_item(rtn_obj, param.selector, True)
        return rtn_obj
    if hasattr(param, 'element'):
        return get_dataset_vals(dataset, param.element)
    return None
コード例 #12
0
def get_json_qry_item(dataset, param, no_key=False):
    """ Reads the parameter and returns the selected element(s)

    args:
        dataset: the dataset to search
        param: the parameter to search by. "*" returns the dataset as is;
               a string starting with "=" yields a one-item list with the
               converted remainder of the string (or an empty list when
               the dataset is empty)
        no_key: whether to skip using the selector of 'param' as a key
                into the dataset. This is passed as True on the recursive
                call, after the key has already been used to select a
                subset of the dataset
    """

    def unwrap_single(value):
        """ returns the only item of a one-item list, else the value """
        if isinstance(value, list) and len(value) == 1:
            return value[0]
        return value

    def flatten_unique(value):
        """ flattens one level of nested lists and removes duplicates when
            all items are hashable; non-lists are returned unchanged """
        if not isinstance(value, list):
            return value
        flat = []
        for item in value:
            if isinstance(item, list):
                flat.extend(item)
            else:
                flat.append(item)
        try:
            return list(set(flat))
        except TypeError:
            # unhashable items: keep the flattened list as it is
            return flat

    def matches(elem, filter_tup):
        """ tests an element against an (attribute, value, operator)
            filter; only the '=' operator is evaluated here, every other
            operator gives None """
        if isinstance(elem, dict):
            candidates = elem.get(filter_tup[0], [])
        else:
            candidates = elem
        if filter_tup[2] == '=':
            try:
                if elem.subject == filter_tup[1]:
                    return True
            except AttributeError:
                pass
            hits = [item for item in candidates
                    if (isinstance(item, dict)
                        and item.subject == filter_tup[1])
                    or item == filter_tup[1]]
            return bool(hits)
        return None

    def select_from_list(items, key, filter_tup):
        """ picks the 'key' values out of a list of elements and applies
            the optional filter """
        selected = items
        if key:
            picked = []
            for elem in items:
                reduced = unwrap_single(elem)
                if isinstance(reduced, dict) and reduced.get(key):
                    picked.append(unwrap_single(reduced[key]))
            selected = flatten_unique(picked)
        if not filter_tup:
            return selected
        return [elem for elem in selected if matches(elem, filter_tup)]

    def select_from_dict(mapping, key, filter_tup):
        """ picks the 'key' entry out of a dictionary and applies the
            optional '=' / '!=' filter to its elements """
        source = mapping.get(key, []) if key else mapping
        if not filter_tup:
            return flatten_unique(source)
        found = []
        for elem in source:
            operator = filter_tup[2]
            if operator == "=":
                if filter_tup[1] in elem.get(filter_tup[0], []):
                    if isinstance(elem, list):
                        found += elem
                    else:
                        found.append(elem)
            elif operator == "!=":
                if filter_tup[1] not in elem.get(filter_tup[0], []):
                    found.append(elem)
        return found

    def get_dataset_vals(ds, key, filter_tup=tuple()):
        """ dispatches on the container type; anything that is neither a
            list nor a dict gives None """
        if isinstance(ds, list):
            return select_from_list(ds, key, filter_tup)
        if isinstance(ds, dict):
            return select_from_dict(ds, key, filter_tup)
        return None

    if param == "*":
        return dataset
    try:
        if param.startswith("="):
            # an empty dataset is considered a false match
            return [pyrdf(param[1:])] if dataset else []
    except AttributeError:
        # param is not a string; continue with the parsed forms
        pass
    param = getattr(param, 'parsed_tree', param)

    if hasattr(param, 'selector'):
        key = None if no_key else get_element(param.selector)
        rtn_obj = None
        if hasattr(param, 'ident'):
            if key:
                rtn_obj = get_dataset_vals(
                    dataset, key, ('rdf_type', param.ident, "="))
            elif param.ident in dataset.get('rdf_type', []):
                rtn_obj = dataset
            else:
                rtn_obj = [value for value in dataset.values()
                           if param.ident in value.get('rdf_type', [])]
        elif hasattr(param, 'attrib'):
            rtn_obj = get_dataset_vals(
                dataset, key, (param.attrib, param.value, param.operator))
        if rtn_obj is None:
            return dataset[key] if key else dataset
        if hasattr(param.selector, 'selector') and rtn_obj:
            # nested selector: continue on the subset, the key is used up
            rtn_obj = get_json_qry_item(rtn_obj, param.selector, True)
        return rtn_obj
    if hasattr(param, 'element'):
        return get_dataset_vals(dataset, param.element)
    return None