Example #1
    def get_document_meta_mappings(self):
        """
        Depending on the configuration of the required location and campaign columns,
        query the source object map table and find the equivalent master_object_ids
        needed for the remainder of the ETL process.
        """

        # during the DocTransform process we associate new AND existing mappings
        # between the metadata associated with this document.

        # sm_ids = DocumentSourceObjectMap.objects.filter(document_id =\
        #     self.document_id).values_list('source_object_map_id',flat=True)

        # build a dict keyed by (content_type, source_object_code) tuples,
        # e.g. {('location', 'PAK'): 3}
        source_map_dict = DataFrame(
            list(
                SourceObjectMap.objects.filter(master_object_id__gt=0)
                # id__in = sm_ids)\
                .values_list(*["master_object_id"])
            ),
            columns=["master_object_id"],
            index=SourceObjectMap.objects.filter(master_object_id__gt=0)
            # ,id__in = sm_ids)\
            .values_list(*["content_type", "source_object_code"]),
        )

        source_map_dict = source_map_dict.to_dict()["master_object_id"]

        return source_map_dict
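
The trick above just turns two parallel value lists into a tuple-keyed lookup via DataFrame.to_dict(). A minimal, self-contained sketch of the same idea, with made-up content types and codes standing in for the Django querysets:

from pandas import DataFrame

# hypothetical rows standing in for the SourceObjectMap values_list() results
keys = [("location", "PAK"), ("campaign", "PAK-2014")]  # (content_type, source_object_code)
ids = [3, 7]                                            # master_object_id

source_map_dict = DataFrame(ids, columns=["master_object_id"], index=keys).to_dict()["master_object_id"]
print(source_map_dict[("location", "PAK")])  # 3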
Example #2
    def post(self, slug):
        mc = memcache.Client(["127.0.0.1:11211"], debug=0)

        columns = json.loads(MyBucket.get("{}-columns".format(slug)).data)
        fields = columns
        if self.get_argument("fields", None):
            fields = self.get_argument("fields").split(",")

        filters = [i[0] for i in self.request.arguments.iteritems() if len(i[0].split("filter__")) > 1]

        fields_json = json.dumps(fields)
        filters_json = json.dumps({f: self.get_argument(f) for f in filters})
        if (
            mc.get(str(slug))
            and mc.get("{}-columns".format(slug)) == fields_json
            and mc.get("{}-filters".format(slug)) == filters_json
        ):
            # cache hit: serve the stored payload and stop here
            self.write(mc.get(str(slug)))
            self.finish()
            return

        mc.set("{}-columns".format(slug), fields_json)
        mc.set("{}-filters".format(slug), filters_json)

        df = DataFrame(MyBucket.get(slug).data, columns=fields)
        if len(filters) >= 1:
            for f in filters:
                df = df.query(df_generate(df, self.get_argument, f))
        convert = df.to_dict(outtype="records")

        write = json.dumps({"columns": fields, "json": convert})
        mc.set(str(slug), write)
        self.write(write)
        self.finish()
Example #3
def run(cube_slug=None):
    mc = memcache.Client(["127.0.0.1:11211"], debug=0)
    for cube in MyAdminBucket.get("cube").data:
        try:
            slug = cube["slug"]

            if cube_slug and cube_slug != slug:
                continue

            sql = """SELECT * FROM ({}) AS CUBE;""".format(cube["sql"])
            for c in MyAdminBucket.get("connection").data:
                if c["slug"] == cube["connection"]:
                    connection = c["connection"]

            print "\n# CLEAN MEMCACHE/RIAK: {}".format(slug)
            mc.delete(str(slug))
            mc.delete(str("{}-columns".format(slug)))

            MyBucket.new(slug, data="").store()
            MyBucket.new(u"{}-columns".format(slug), data="").store()
            MyBucket.new(u"{}-connect".format(slug), data="").store()
            MyBucket.new(u"{}-sql".format(slug), data="").store()

            print "# CONNECT IN RELATION DATA BASE: {}".format(slug)
            e = create_engine(connection)
            connection = e.connect()

            resoverall = connection.execute(text(sql))

            print "# LOAD DATA ON DATAWAREHOUSE: {}".format(slug)
            df = DataFrame(resoverall.fetchall())
            if df.empty:
                print "[warnning]Empty cube: {}!!".format(cube)
                return
            df.columns = resoverall.keys()
            df.head()

            pdict = map(fix_render, df.to_dict(outtype="records"))

            print "# SAVE DATA (JSON) ON RIAK: {}".format(slug)
            MyBucket.new(slug, data=pdict).store()

            print "# SAVE COLUMNS ON RIAK: {}".format(slug)
            MyBucket.new(u"{}-columns".format(slug), data=json.dumps([c for c in df.columns])).store()

            print "# SAVE CONNECT ON RIAK: {}".format(slug)
            MyBucket.new(u"{}-connect".format(slug), data=c).store()

            print "# SAVE SQL ON RIAK: {}".format(slug)
            MyBucket.new(u"{}-sql".format(slug), data=sql).store()

            print "# CLEAN MEMORY: {}\n".format(slug)
            del pdict, df
            gc.collect()
        except:
            pass

    print "## FINISH"
    return True
Example #4
    def test_to_dict_timestamp(self):

        # GH11247
        # split/records producing np.datetime64 rather than Timestamps
        # on datetime64[ns] dtypes only

        tsmp = Timestamp("20130101")
        test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]})
        test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]})

        expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}]
        expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}]

        tm.assert_almost_equal(test_data.to_dict(orient="records"), expected_records)
        tm.assert_almost_equal(test_data_mixed.to_dict(orient="records"), expected_records_mixed)

        expected_series = {"A": Series([tsmp, tsmp]), "B": Series([tsmp, tsmp])}
        expected_series_mixed = {"A": Series([tsmp, tsmp]), "B": Series([1, 2])}

        tm.assert_almost_equal(test_data.to_dict(orient="series"), expected_series)
        tm.assert_almost_equal(test_data_mixed.to_dict(orient="series"), expected_series_mixed)

        expected_split = {"index": [0, 1], "data": [[tsmp, tsmp], [tsmp, tsmp]], "columns": ["A", "B"]}
        expected_split_mixed = {"index": [0, 1], "data": [[tsmp, 1], [tsmp, 2]], "columns": ["A", "B"]}

        tm.assert_almost_equal(test_data.to_dict(orient="split"), expected_split)
        tm.assert_almost_equal(test_data_mixed.to_dict(orient="split"), expected_split_mixed)
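
The behaviour this test pins down (GH11247) is easy to check by hand; a small sketch of the three orients on a throwaway mixed-dtype frame:

from pandas import DataFrame, Timestamp

ts = Timestamp("20130101")
frame = DataFrame({"A": [ts], "B": [1]})

# 'records': one dict per row; the datetime cell comes back as a Timestamp, not np.datetime64
print(frame.to_dict(orient="records"))

# 'split': separate 'index', 'columns' and 'data' entries, again holding Timestamp values
print(frame.to_dict(orient="split"))

# 'series': one Series per column, so dtypes are preserved as-is
print(frame.to_dict(orient="series"))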
Example #5
    def open(self, slug):
        columns = json.loads(MyBucket.get("{}-columns".format(slug)).data)
        fields = columns
        if self.get_argument("fields", None):
            fields = self.get_argument("fields").split(",")

        self.write_message({"type": "columns", "data": fields})

        filters = [i[0] for i in self.request.arguments.iteritems() if len(i[0].split("filter__")) > 1]

        df = DataFrame(MyBucket.get(slug).data, columns=fields)
        if len(filters) >= 1:
            for f in filters:
                df = df.query(df_generate(df, self.get_argument, f))

        for i in df.to_dict(outtype="records"):
            self.write_message({"type": "data", "data": i})

        self.close()
Example #6
    def open(self, slug):
        columns = json.loads(MyBucket.get("{}-columns".format(slug)).data)
        fields = columns
        if self.get_argument("fields", None):
            fields = self.get_argument("fields").split(",")

        self.write_message({"type": "columns", "data": fields})

        filters = [i[0] for i in self.request.arguments.iteritems() if len(i[0].split("filter__")) > 1]

        df = DataFrame(MyBucket.get(slug).data, columns=fields)
        if len(filters) >= 1:
            for f in filters:
                df = df.query(df_generate(df, self.get_argument(f), f))

        # CLEAN MEMORY
        del filters, fields, columns
        gc.collect()

        ca = None
        for e in MyAdminBucket.get("element").data:
            if e["slug"] == slug:
                ca = e["categories"]

        categories = []
        for i in df.to_dict(outtype="records"):
            if ca:
                categories.append(i[ca])
            self.write_message({"type": "data", "data": i})

        # CLEAN MEMORY
        del df
        gc.collect()

        self.write_message({"type": "categories", "data": categories})
        self.write_message({"type": "close"})

        # CLEAN MEMORY
        del categories
        gc.collect()
Example #7
class CubeProcess(object):
    def __init__(self, _cube):

        log_it("START: {}".format(_cube["slug"]), "bin-mining")

        self.mongo = MongoPlugin(uri=conf("mongodb")["uri"], db=conf("mongodb")["db"], json_mongo=True).get_mongo()

        del _cube["_id"]
        self.cube = _cube
        self.slug = self.cube["slug"]

    def load(self):
        self.cube["run"] = "run"
        self.mongo["cube"].update({"slug": self.slug}, self.cube)

        self.cube["start_process"] = datetime.now()

        _sql = self.cube["sql"]
        if _sql[-1] == ";":
            _sql = _sql[:-1]
        self.sql = u"""SELECT * FROM ({}) AS CUBE;""".format(_sql)

        self.connection = self.mongo["connection"].find_one({"slug": self.cube["connection"]})["connection"]

        log_it("CONNECT IN RELATION DATA BASE: {}".format(self.slug), "bin-mining")
        e = create_engine(self.connection, **conf("openmining")["sql_conn_params"])
        Session = sessionmaker(bind=e)
        session = Session()

        resoverall = session.execute(text(self.sql))
        self.data = resoverall.fetchall()
        self.keys = resoverall.keys()

    def environment(self, t):
        if t not in ["relational"]:
            self.sql = t

    def _data(self, data):
        self.data = data

    def _keys(self, keys):
        if type(keys) == list:
            self.keys = keys
            return
        self.keys = list(keys)

    def frame(self):
        log_it("LOAD DATA ON DATAWAREHOUSE: {}".format(self.slug), "bin-mining")
        self.df = DataFrame(self.data)
        if self.df.empty:
            log_it("[warning]Empty cube: {}!!".format(self.cube), "bin-mining")
            return
        self.df.columns = self.keys
        self.df.head()

        self.pdict = map(fix_render, self.df.to_dict(outtype="records"))

    def save(self):
        log_it("SAVE DATA (JSON) ON DATA WAREHOUSE: {}".format(self.slug), "bin-mining")
        data = {"data": self.pdict, "columns": self.keys}
        DW = DataWarehouse()
        DW.save(self.slug, data)

        self.cube["status"] = True
        self.cube["lastupdate"] = datetime.now()
        self.cube["run"] = True
        self.mongo["cube"].update({"slug": self.cube["slug"]}, self.cube)

        log_it("CLEAN MEMORY: {}".format(self.slug), "bin-mining")
        gc.collect()
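
The heart of frame() above is just rows plus keys in, a list of record dicts out. A self-contained sketch of that step, with fix_render stubbed out since its real definition lives elsewhere in the project:

from pandas import DataFrame

def fix_render(record):
    # stand-in for the project's fix_render, which normally cleans values up for JSON
    return record

data = [(1, "north", 10.5), (2, "south", 7.25)]  # what resoverall.fetchall() might return
keys = ["id", "region", "total"]                 # what resoverall.keys() might return

df = DataFrame(data)
df.columns = keys
pdict = list(map(fix_render, df.to_dict("records")))
print(pdict)  # [{'id': 1, 'region': 'north', 'total': 10.5}, {'id': 2, 'region': 'south', 'total': 7.25}]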
Example #8
def data(ws, mongodb, slug):
    if not ws:
        abort(400, "Expected WebSocket request.")

    DW = DataWarehouse()

    element = mongodb["element"].find_one({"slug": slug})

    element["page_limit"] = 50
    if request.GET.get("limit", True) is False:
        element["page_limit"] = 9999999999

    if element["type"] == "grid":
        page = int(request.GET.get("page", 1))
        page_start = 0
        page_end = element["page_limit"]
        if page >= 2:
            page_end = element["page_limit"] * page
            page_start = page_end - element["page_limit"]
    else:
        page = 1
        page_start = None
        page_end = None

    filters = [i[0] for i in request.GET.iteritems() if len(i[0].split("filter__")) > 1]

    if not DW.search:
        data = DW.get(element.get("cube"), page=page)
    else:
        data = DW.get(element.get("cube"), filters=filters, page=page)

    columns = data.get("columns") or []

    fields = columns
    if request.GET.get("fields", None):
        fields = request.GET.get("fields").split(",")

    cube_last_update = mongodb["cube"].find_one({"slug": element.get("cube")})
    ws.send(json.dumps({"type": "last_update", "data": str(cube_last_update.get("lastupdate", ""))}))

    ws.send(json.dumps({"type": "columns", "data": fields}))

    df = DataFrame(data.get("data") or {}, columns=fields)
    if len(filters) >= 1:
        for f in filters:
            s = f.split("__")
            field = s[1]
            operator = s[2]
            value = request.GET.get(f)
            if operator == "like":
                df = df[df[field].str.contains(value)]
            elif operator == "regex":
                df = DataFrameSearchColumn(df, field, value, operator)
            else:
                df = df.query(df_generate(df, value, f))

    groupby = []
    if request.GET.get("groupby", None):
        groupby = request.GET.get("groupby", "").split(",")
    if len(groupby) >= 1:
        df = DataFrame(df.groupby(groupby).grouper.get_group_levels())

    if (
        request.GET.get("orderby", element.get("orderby", None))
        and request.GET.get("orderby", element.get("orderby", None)) in fields
    ):

        orderby = request.GET.get("orderby", element.get("orderby", ""))
        if type(orderby) == str:
            orderby = orderby.split(",")
        orderby__order = request.GET.get("orderby__order", element.get("orderby__order", ""))
        if type(orderby__order) == str:
            orderby__order = orderby__order.split(",")
        ind = 0
        for orde in orderby__order:
            if orde == "0":
                orderby__order[ind] = False
            else:
                orderby__order[ind] = True
            ind += 1
        df = df.sort(orderby, ascending=orderby__order)

    ws.send(json.dumps({"type": "max_page", "data": data.get("count", len(df))}))

    # CLEAN MEMORY
    del filters, fields, columns
    gc.collect()
    categories = []

    records = df.to_dict(orient="records")
    if not DW.search:
        records = records[page_start:page_end]
    for i in records:
        if element.get("categories", None):
            categories.append(i[element.get("categories")])
        ws.send(json.dumps({"type": "data", "data": i}))

    # CLEAN MEMORY
    del df
    gc.collect()

    ws.send(json.dumps({"type": "categories", "data": categories}))
    ws.send(json.dumps({"type": "close"}))

    # CLEAN MEMORY
    del categories
    gc.collect()
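
The filter__ convention used above (a query-string key of the form filter__<field>__<operator>) is easiest to see on a toy frame. A sketch of just the 'like' branch, with made-up field names:

from pandas import DataFrame

df = DataFrame({"name": ["alice", "bob", "carol"], "age": [30, 25, 41]})

# e.g. the request carried filter__name__like=a
f = "filter__name__like"
value = "a"

_, field, operator = f.split("__")
if operator == "like":
    df = df[df[field].str.contains(value)]

print(df)  # keeps the rows whose 'name' contains "a": alice and carol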
Example #9
def data(ws, mongodb, slug):
    if not ws:
        abort(400, "Expected WebSocket request.")

    MyClient = riak.RiakClient(
        protocol=conf("riak")["protocol"], http_port=conf("riak")["http_port"], host=conf("riak")["host"]
    )

    MyBucket = MyClient.bucket(conf("riak")["bucket"])

    element = mongodb["element"].find_one({"slug": slug})

    element["page_limit"] = 50
    if request.GET.get("limit", True) is False:
        element["page_limit"] = 9999999999

    coll = MyBucket.get("{}-columns".format(element.get("cube"))).data
    columns = json.loads(coll) if coll else []

    fields = columns
    if request.GET.get("fields", None):
        fields = request.GET.get("fields").split(",")

    cube_last_update = mongodb["cube"].find_one({"slug": element.get("cube")})
    ws.send(json.dumps({"type": "last_update", "data": str(cube_last_update.get("lastupdate", ""))}))

    ws.send(json.dumps({"type": "columns", "data": fields}))

    filters = [i[0] for i in request.GET.iteritems() if len(i[0].split("filter__")) > 1]

    if element["type"] == "grid":
        page = int(request.GET.get("page", 1))
        page_start = 0
        page_end = element["page_limit"]
        if page >= 2:
            page_end = element["page_limit"] * page
            page_start = page_end - element["page_limit"]
    else:
        page_start = None
        page_end = None

    df = DataFrame(MyBucket.get(element.get("cube")).data, columns=fields)
    if len(filters) >= 1:
        for f in filters:
            s = f.split("__")
            field = s[1]
            operator = s[2]
            value = request.GET.get(f)
            if operator == "like":
                df = df[df[field].str.contains(value)]
            elif operator == "regex":
                df = DataFrameSearchColumn(df, field, value, operator)
            else:
                df = df.query(df_generate(df, value, f))

    groupby = []
    if request.GET.get("groupby", None):
        groupby = request.GET.get("groupby").split(",")
    if len(groupby) >= 1:
        df = DataFrame(df.groupby(groupby).grouper.get_group_levels())

    if (
        request.GET.get("orderby", element.get("orderby", None))
        and request.GET.get("orderby", element.get("orderby", None)) in fields
    ):

        orderby = request.GET.get("orderby", element.get("orderby", ""))
        if type(orderby) == str:
            orderby = orderby.split(",")
        orderby__order = request.GET.get("orderby__order", element.get("orderby__order", ""))
        if type(orderby__order) == str:
            orderby__order = orderby__order.split(",")
        ind = 0
        for orde in orderby__order:
            if orde == "0":
                orderby__order[ind] = False
            else:
                orderby__order[ind] = True
            ind += 1
        df = df.sort(orderby, ascending=orderby__order)

    ws.send(json.dumps({"type": "max_page", "data": len(df)}))

    # CLEAN MEMORY
    del filters, fields, columns
    gc.collect()
    categories = []
    for i in df.to_dict(outtype="records")[page_start:page_end]:
        if element.get("categories", None):
            categories.append(i[element.get("categories")])
        ws.send(json.dumps({"type": "data", "data": i}))

    # CLEAN MEMORY
    del df
    gc.collect()

    ws.send(json.dumps({"type": "categories", "data": categories}))
    ws.send(json.dumps({"type": "close"}))

    # CLEAN MEMORY
    del categories
    gc.collect()
Example #10
class CubeProcess(object):
    def __init__(self, _cube):

        log_it("START: {}".format(_cube["slug"]), "bin-mining")

        self.mongo = MongoPlugin(uri=conf("mongodb")["uri"], db=conf("mongodb")["db"], json_mongo=True).get_mongo()

        MyClient = riak.RiakClient(
            protocol=conf("riak")["protocol"], http_port=conf("riak")["http_port"], host=conf("riak")["host"]
        )

        self.MyBucket = MyClient.bucket(conf("riak")["bucket"])
        self.MyBucket.enable_search()
        del _cube["_id"]
        self.cube = _cube
        self.slug = self.cube["slug"]

    def load(self):
        self.cube["run"] = "run"
        self.mongo["cube"].update({"slug": self.slug}, self.cube)

        self.cube["start_process"] = datetime.now()

        _sql = self.cube["sql"]
        if _sql[-1] == ";":
            _sql = _sql[:-1]
        self.sql = u"""SELECT * FROM ({}) AS CUBE;""".format(_sql)

        self.connection = self.mongo["connection"].find_one({"slug": self.cube["connection"]})["connection"]

        log_it("CONNECT IN RELATION DATA BASE: {}".format(self.slug), "bin-mining")
        e = create_engine(self.connection, **conf("openmining")["sql_conn_params"])
        Session = sessionmaker(bind=e)
        session = Session()

        resoverall = session.execute(text(self.sql))
        self.data = resoverall.fetchall()
        self.keys = resoverall.keys()

    def environment(self, t):
        if t not in ["relational"]:
            self.sql = t

    def _data(self, data):
        self.data = data

    def _keys(self, keys):
        self.keys = keys

    def frame(self):
        log_it("LOAD DATA ON DATAWAREHOUSE: {}".format(self.slug), "bin-mining")
        self.df = DataFrame(self.data)
        if self.df.empty:
            log_it("[warning]Empty cube: {}!!".format(self.cube), "bin-mining")
            return
        self.df.columns = self.keys
        self.df.head()

        self.pdict = map(fix_render, self.df.to_dict(outtype="records"))

    def clean(self):
        log_it("CLEAN DATA (JSON) ON RIAK: {}".format(self.slug), "bin-mining")

        self.MyBucket.new(self.slug, data="").store()
        self.MyBucket.new(u"{}-columns".format(self.slug), data="").store()
        self.MyBucket.new(u"{}-connect".format(self.slug), data="").store()
        self.MyBucket.new(u"{}-sql".format(self.slug), data="").store()

    def save(self):
        self.clean()

        log_it("SAVE DATA (JSON) ON RIAK: {}".format(self.slug), "bin-mining")
        self.MyBucket.new(self.slug, data=self.pdict, content_type="application/json").store()

        log_it("SAVE COLUMNS ON RIAK: {}".format(self.slug), "bin-mining")
        self.MyBucket.new(u"{}-columns".format(self.slug), data=json.dumps(self.keys)).store()

        log_it("SAVE CONNECT ON RIAK: {}".format(self.slug), "bin-mining")
        self.MyBucket.new(u"{}-connect".format(self.slug), data=self.connection).store()

        log_it("SAVE SQL ON RIAK: {}".format(self.slug), "bin-mining")
        self.MyBucket.new(u"{}-sql".format(self.slug), data=self.sql).store()

        self.cube["status"] = True
        self.cube["lastupdate"] = datetime.now()
        self.cube["run"] = True
        self.mongo["cube"].update({"slug": self.cube["slug"]}, self.cube)

        log_it("CLEAN MEMORY: {}".format(self.slug), "bin-mining")
        gc.collect()
Example #11
def data(mongodb, slug):
    # choose the transport protocol (websocket or plain http) for this request
    ws = request.environ.get("wsgi.websocket")
    protocol = "websocket"
    if not ws:
        response.content_type = "application/json"
        protocol = "http"
    DataManager = __from__("mining.controllers.data.{}.DataManager".format(protocol))

    # instantiate the DataManager for the chosen protocol
    DM = DataManager(ws)

    # instantiate data warehouse
    DW = DataWarehouse()

    element = mongodb["element"].find_one({"slug": slug})

    element["page_limit"] = 50
    if request.GET.get("limit", True) is False:
        element["page_limit"] = 9999999999

    if element["type"] == "grid" and "download" not in request.GET.keys():
        page = int(request.GET.get("page", 1))
        page_start = 0
        page_end = element["page_limit"]
        if page >= 2:
            page_end = element["page_limit"] * page
            page_start = page_end - element["page_limit"]
    else:
        page = 1
        page_start = None
        page_end = None

    filters = [i[0] for i in request.GET.iteritems() if len(i[0].split("filter__")) > 1]

    if not DW.search:
        data = DW.get(element.get("cube"), page=page)
    else:
        data = DW.get(element.get("cube"), filters=filters, page=page)

    columns = data.get("columns") or []

    fields = columns
    if request.GET.get("fields", None):
        fields = request.GET.get("fields").split(",")

    cube_last_update = mongodb["cube"].find_one({"slug": element.get("cube")})
    DM.send(json.dumps({"type": "last_update", "data": str(cube_last_update.get("lastupdate", ""))}))

    DM.send(json.dumps({"type": "columns", "data": fields}))

    df = DataFrame(data.get("data") or {}, columns=fields)
    if len(filters) >= 1:
        for f in filters:
            s = f.split("__")
            field = s[1]
            operator = s[2]
            value = request.GET.get(f)
            if operator == "like":
                df = df[df[field].str.contains(value)]
            elif operator == "regex":
                df = DataFrameSearchColumn(df, field, value, operator)
            else:
                df = df.query(df_generate(df, value, f))

    groupby = []
    if request.GET.get("groupby", None):
        groupby = request.GET.get("groupby", "").split(",")
    if len(groupby) >= 1:
        df = DataFrame(df.groupby(groupby).grouper.get_group_levels())

    if (
        request.GET.get("orderby", element.get("orderby", None))
        and request.GET.get("orderby", element.get("orderby", None)) in fields
    ):

        orderby = request.GET.get("orderby", element.get("orderby", ""))
        if type(orderby) == str:
            orderby = orderby.split(",")
        orderby__order = request.GET.get("orderby__order", element.get("orderby__order", ""))
        if type(orderby__order) == str:
            orderby__order = orderby__order.split(",")
        ind = 0
        for orde in orderby__order:
            if orde == "0":
                orderby__order[ind] = False
            else:
                orderby__order[ind] = True
            ind += 1
        df = df.sort(orderby, ascending=orderby__order)

    DM.send(json.dumps({"type": "max_page", "data": data.get("count", len(df))}))

    # CLEAN MEMORY
    del filters, fields, columns
    gc.collect()
    categories = []

    # TODO: loop over aggregates (apply multiple aggregations)
    aggregate = [i[0] for i in request.GET.iteritems() if len(i[0].split("aggregate__")) > 1]
    if len(aggregate) >= 1:
        agg = aggregate[0].split("__")
        _agg = getattr(df.groupby(agg[1]), request.GET.get(aggregate[0]))()
        DF_A = DataFrame(_agg[_agg.keys()[0]]).to_dict().get(_agg.keys()[0])
        DM.send(json.dumps({"type": "aggregate", "data": DF_A}))

    records = df.to_dict(orient="records")
    if not DW.search:
        records = records[page_start:page_end]
    for i in records:
        if element.get("categories", None):
            categories.append(i[element.get("categories")])
        DM.send(json.dumps({"type": "data", "data": i}))

    DM.send(json.dumps({"type": "categories", "data": categories}))
    DM.send(json.dumps({"type": "close"}))

    # CLEAN MEMORY
    del categories
    gc.collect()

    if not ws:
        if "download" in request.GET.keys():

            ext = request.GET.get("download", "xls")
            if ext == "":
                ext = "xls"

            file_name = "{}/frontend/assets/exports/openmining-{}.{}".format(PROJECT_PATH, element.get("cube"), ext)
            if ext == "csv":
                df.to_csv(file_name, sep=";")
                contenttype = "text/csv"
            else:
                df.to_excel(file_name)
                contenttype = "application/vnd.ms-excel"

            response.set_header("charset", "utf-8")
            response.set_header("Content-disposition", "attachment; " "filename={}.{}".format(element.get("cube"), ext))
            response.content_type = contenttype

            ifile = open(file_name, "rb")
            o = ifile.read()
            ifile.close()

            return o

        return json.dumps(DM.data)
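
The aggregate__ branch above turns a query-string key of the form aggregate__<column>=<function> into a pandas groupby call. A self-contained sketch with made-up data:

from pandas import DataFrame

df = DataFrame({"region": ["north", "north", "south"], "total": [10, 5, 7]})

# e.g. the request carried aggregate__region=sum
agg_key = "aggregate__region"
agg_func = "sum"

column = agg_key.split("__")[1]
_agg = getattr(df.groupby(column), agg_func)()  # same as df.groupby("region").sum()
print(_agg.to_dict())  # {'total': {'north': 15, 'south': 7}}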
Example #12
class Cube(object):
    def __init__(self, _cube):

        log_it("START: {}".format(_cube["slug"]), "bin-mining")

        self.mongo = MongoPlugin(uri=conf("mongodb")["uri"], db=conf("mongodb")["db"], json_mongo=True).get_mongo()

        try:
            del _cube["_id"]
        except KeyError:
            pass
        self.cube = _cube
        self.slug = self.cube["slug"]

    def load(self):
        self.cube["run"] = "run"
        self.mongo["cube"].update({"slug": self.slug}, self.cube)

        self.cube["start_process"] = datetime.now()

        _sql = self.cube["sql"]
        if _sql[-1] == ";":
            _sql = _sql[:-1]
        self.sql = u"""SELECT * FROM ({}) AS CUBE;""".format(_sql)

        self.connection = self.mongo["connection"].find_one({"slug": self.cube["connection"]})["connection"]

        log_it("CONNECT IN RELATION DATA BASE: {}".format(self.slug), "bin-mining")
        if "sqlite" in self.connection:
            e = create_engine(self.connection)
        else:
            e = create_engine(self.connection, **conf("openmining")["sql_conn_params"])
        Session = sessionmaker(bind=e)
        session = Session()

        resoverall = session.execute(text(self.sql))
        self.data = resoverall.fetchall()
        self.keys = resoverall.keys()

    def environment(self, t):
        if t not in ["relational"]:
            self.sql = t

    def _data(self, data):
        self.data = data

    def _keys(self, keys):
        if type(keys) == list:
            self.keys = keys
            return
        self.keys = list(keys)

    def frame(self, data_type=None):
        log_it("LOAD DATA ON DATAWAREHOUSE via {}: {}".format(data_type or "dict", self.slug), "bin-mining")
        if data_type:
            self.df = getattr(pandas, "read_{}".format(data_type))(self.data)
        else:
            self.df = DataFrame(self.data)

        if self.df.empty:
            self.pdict = {}
            log_it("[warning]Empty cube: {}!!".format(self.cube), "bin-mining")
            return

        try:
            self.df.columns = self.keys
        except AttributeError:
            self._keys(self.df.columns.tolist())

        # If OML is enabled and the cube defines an OML script, run it over the records
        if conf("oml").get("on") and self.cube.get("oml"):
            from oml import RunTime

            self.df.columns = self.keys
            df = RunTime(
                conf("oml").get("language", "lua"),
                self.df.to_dict(orient="records"),
                self.cube.get("oml"),
                conf("oml").get("class", {"OML": "oml.base.OMLBase"}),
            )
            self.df = DataFrame(df)
            self._keys(self.df.columns.tolist())

        self.df.head()
        self.pdict = map(fix_render, self.df.to_dict(orient="records"))

    def save(self):
        log_it("SAVE DATA (JSON) ON DATA WAREHOUSE: {}".format(self.slug), "bin-mining")
        data = {"data": self.pdict, "columns": self.keys}
        DW = DataWarehouse()
        DW.save(self.slug, data)

        self.cube["status"] = True
        self.cube["lastupdate"] = datetime.now()
        self.cube["run"] = True
        self.mongo["cube"].update({"slug": self.cube["slug"]}, self.cube)

        log_it("CLEAN MEMORY: {}".format(self.slug), "bin-mining")
        gc.collect()
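
frame(data_type=...) above picks its reader with getattr(pandas, "read_<type>"). A minimal sketch of that dispatch for a synthetic JSON payload, without the OML branch:

import pandas
from io import StringIO

payload = StringIO(u'[{"id": 1, "total": 10.5}, {"id": 2, "total": 7.25}]')

data_type = "json"
reader = getattr(pandas, "read_{}".format(data_type))  # resolves to pandas.read_json
df = reader(payload)
print(df.to_dict(orient="records"))  # [{'id': 1, 'total': 10.5}, {'id': 2, 'total': 7.25}]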
Example #13
class EphysSweepFeatureExtractor:
    """Feature calculation for a sweep (voltage and/or current time series)."""

    def __init__(
        self,
        t=None,
        v=None,
        i=None,
        start=None,
        end=None,
        filter=10.0,
        dv_cutoff=20.0,
        max_interval=0.005,
        min_height=2.0,
        min_peak=-30.0,
        thresh_frac=0.05,
        baseline_interval=0.1,
        baseline_detect_thresh=0.3,
        id=None,
    ):
        """Initialize SweepFeatures object.

        Parameters
        ----------
        t : ndarray of times (seconds)
        v : ndarray of voltages (mV)
        i : ndarray of currents (pA)
        start : start of time window for feature analysis (optional)
        end : end of time window for feature analysis (optional)
        filter : cutoff frequency for 4-pole low-pass Bessel filter in kHz (optional, default 10)
        dv_cutoff : minimum dV/dt to qualify as a spike in V/s (optional, default 20)
        max_interval : maximum acceptable time between start of spike and time of peak in sec (optional, default 0.005)
        min_height : minimum acceptable height from threshold to peak in mV (optional, default 2)
        min_peak : minimum acceptable absolute peak level in mV (optional, default -30)
        thresh_frac : fraction of average upstroke for threshold calculation (optional, default 0.05)
        baseline_interval : interval length for baseline voltage calculation (before start if start is defined, default 0.1)
        baseline_detect_thresh : dV/dt threshold for evaluating flatness of baseline region (optional, default 0.3)
        """
        self.id = id
        self.t = t
        self.v = v
        self.i = i
        self.start = start
        self.end = end
        self.filter = filter
        self.dv_cutoff = dv_cutoff
        self.max_interval = max_interval
        self.min_height = min_height
        self.min_peak = min_peak
        self.thresh_frac = thresh_frac
        self.baseline_interval = baseline_interval
        self.baseline_detect_thresh = baseline_detect_thresh
        self.stimulus_amplitude_calculator = None

        self._sweep_features = {}

    def process_spikes(self):
        """Perform spike-related feature analysis"""
        self._process_individual_spikes()
        self._process_spike_related_features()

    def _process_individual_spikes(self):
        v = self.v
        t = self.t
        dvdt = ft.calculate_dvdt(v, t, self.filter)

        # Basic features of spikes
        putative_spikes = ft.detect_putative_spikes(v, t, self.start, self.end, self.filter, self.dv_cutoff)
        peaks = ft.find_peak_indexes(v, t, putative_spikes, self.end)
        putative_spikes, peaks = ft.filter_putative_spikes(v, t, putative_spikes, peaks, self.min_height, self.min_peak)

        if not putative_spikes.size:
            # Save time if no spikes detected
            self._spikes_df = DataFrame()
            return

        upstrokes = ft.find_upstroke_indexes(v, t, putative_spikes, peaks, self.filter, dvdt)
        thresholds = ft.refine_threshold_indexes(v, t, upstrokes, self.thresh_frac, self.filter, dvdt)
        thresholds, peaks, upstrokes = ft.check_thresholds_and_peaks(
            v, t, thresholds, peaks, upstrokes, self.max_interval
        )

        if not thresholds.size:
            # Save time if no spikes detected
            self._spikes_df = DataFrame()
            return

        # Spike list and thresholds have been refined - now find other features
        upstrokes = ft.find_upstroke_indexes(v, t, thresholds, peaks, self.filter, dvdt)
        troughs = ft.find_trough_indexes(v, t, thresholds, peaks, self.end)
        downstrokes = ft.find_downstroke_indexes(v, t, peaks, troughs, self.filter, dvdt)
        trough_details = ft.analyze_trough_details(v, t, thresholds, peaks, self.end, self.filter, dvdt=dvdt)
        widths = ft.find_widths(v, t, thresholds, peaks, trough_details[1])

        # Points where we care about t, v, and i if available
        vit_data_indexes = {"threshold": thresholds, "peak": peaks, "trough": troughs}

        # Points where we care about t and dv/dt
        dvdt_data_indexes = {"upstroke": upstrokes, "downstroke": downstrokes}

        # Trough details
        isi_types = trough_details[0]
        trough_detail_indexes = dict(zip(["fast_trough", "adp", "slow_trough"], trough_details[1:]))

        # Redundant, but ensures that DataFrame has right number of rows
        # Any better way to do it?
        spikes_df = DataFrame(data=thresholds, columns=["threshold_index"])

        for k, vals in vit_data_indexes.iteritems():
            spikes_df[k + "_index"] = np.nan
            spikes_df[k + "_t"] = np.nan
            spikes_df[k + "_v"] = np.nan

            if len(vals) > 0:
                spikes_df.ix[: len(vals) - 1, k + "_index"] = vals
                spikes_df.ix[: len(vals) - 1, k + "_t"] = t[vals]
                spikes_df.ix[: len(vals) - 1, k + "_v"] = v[vals]

            if self.i is not None:
                spikes_df[k + "_i"] = np.nan
                if len(vals) > 0:
                    spikes_df.ix[: len(vals) - 1, k + "_i"] = self.i[vals]

        for k, vals in dvdt_data_indexes.iteritems():
            spikes_df[k + "_index"] = np.nan
            spikes_df[k] = np.nan
            if len(vals) > 0:
                spikes_df.ix[: len(vals) - 1, k + "_index"] = vals
                spikes_df.ix[: len(vals) - 1, k + "_t"] = t[vals]
                spikes_df.ix[: len(vals) - 1, k + "_v"] = v[vals]
                spikes_df.ix[: len(vals) - 1, k] = dvdt[vals]

        spikes_df["isi_type"] = isi_types

        for k, vals in trough_detail_indexes.iteritems():
            spikes_df[k + "_index"] = np.nan
            if np.any(~np.isnan(vals)):
                spikes_df.ix[~np.isnan(vals), k + "_index"] = vals[~np.isnan(vals)]

            spikes_df[k + "_t"] = np.nan
            if np.any(~np.isnan(vals)):
                spikes_df.ix[~np.isnan(vals), k + "_t"] = t[vals[~np.isnan(vals)].astype(int)]

            spikes_df[k + "_v"] = np.nan
            if np.any(~np.isnan(vals)):
                spikes_df.ix[~np.isnan(vals), k + "_v"] = v[vals[~np.isnan(vals)].astype(int)]

            if self.i is not None:
                spikes_df[k + "_i"] = np.nan
                if np.any(~np.isnan(vals)):
                    spikes_df.ix[~np.isnan(vals), k + "_i"] = self.i[vals[~np.isnan(vals)].astype(int)]

        spikes_df["width"] = np.nan
        spikes_df.ix[: len(widths) - 1, "width"] = widths

        spikes_df["upstroke_downstroke_ratio"] = spikes_df["upstroke"] / -spikes_df["downstroke"]

        self._spikes_df = spikes_df

    def _process_spike_related_features(self):
        t = self.t

        if len(self._spikes_df) == 0:
            self._sweep_features["avg_rate"] = 0
            return

        thresholds = self._spikes_df["threshold_index"].values.astype(int)
        isis = ft.get_isis(t, thresholds)
        with warnings.catch_warnings():
            # ignore mean of empty slice warnings here
            warnings.filterwarnings("ignore", category=RuntimeWarning, module="numpy")

            sweep_level_features = {
                "adapt": ft.adaptation_index(isis),
                "latency": ft.latency(t, thresholds, self.start),
                "isi_cv": (isis.std() / isis.mean()) if len(isis) >= 1 else np.nan,
                "mean_isi": isis.mean(),
                "median_isi": np.median(isis),
                "first_isi": isis[0] if len(isis) >= 1 else np.nan,
                "avg_rate": ft.average_rate(t, thresholds, self.start, self.end),
            }

        for k, v in sweep_level_features.iteritems():
            self._sweep_features[k] = v

    def _process_pauses(self, cost_weight=1.0):
        # Pauses are unusually long ISIs with a "detour reset" among delay resets
        thresholds = self._spikes_df["threshold_index"].values.astype(int)
        isis = ft.get_isis(self.t, thresholds)
        isi_types = self._spikes_df["isi_type"][:-1].values

        return ft.detect_pauses(isis, isi_types, cost_weight)

    def pause_metrics(self):
        """Estimate average number of pauses and average fraction of time spent in a pause

        Attempts to detect pauses with a variety of conditions and averages results together.

        Pauses that are consistently detected contribute more to estimates.

        Returns
        -------
        avg_n_pauses : average number of pauses detected across conditions
        avg_pause_frac : average fraction of interval (between start and end) spent in a pause
        max_reliability : max fraction of times most reliable pause was detected given weights tested
        n_max_rel_pauses : number of pauses detected with `max_reliability`
        """

        thresholds = self._spikes_df["threshold_index"].values.astype(int)
        isis = ft.get_isis(self.t, thresholds)

        weight = 1.0
        pause_list = self._process_pauses(weight)

        if len(pause_list) == 0:
            return 0, 0.0

        n_pauses = len(pause_list)
        pause_frac = isis[pause_list].sum()
        pause_frac /= self.end - self.start

        return n_pauses, pause_frac

    def _process_bursts(self, tol=0.5, pause_cost=1.0):
        thresholds = self._spikes_df["threshold_index"].values.astype(int)
        isis = ft.get_isis(self.t, thresholds)

        isi_types = self._spikes_df["isi_type"][:-1].values

        fast_tr_v = self._spikes_df["fast_trough_v"].values
        fast_tr_t = self._spikes_df["fast_trough_t"].values
        slow_tr_v = self._spikes_df["slow_trough_v"].values
        slow_tr_t = self._spikes_df["slow_trough_t"].values
        thr_v = self._spikes_df["threshold_v"].values

        bursts = ft.detect_bursts(isis, isi_types, fast_tr_v, fast_tr_t, slow_tr_v, slow_tr_t, thr_v, tol, pause_cost)

        return np.array(bursts)

    def burst_metrics(self):
        """Find bursts and return max "burstiness" index (normalized max rate in burst vs out).

        Returns
        -------
        max_burstiness_index : max "burstiness" index across detected bursts
        num_bursts : number of bursts detected
        """

        burst_info = self._process_bursts()

        if burst_info.shape[0] > 0:
            return burst_info[:, 0].max(), burst_info.shape[0]
        else:
            return 0.0, 0

    def delay_metrics(self):
        """Calculates ratio of latency to dominant time constant of rise before spike

        Returns
        -------
        delay_ratio : ratio of latency to tau (higher means more delay)
        tau : dominant time constant of rise before spike
        """

        if len(self._spikes_df) == 0:
            logging.info("No spikes available for delay calculation")
            return 0.0, 0.0
        start = self.start
        spike_time = self._spikes_df["threshold_t"].values[0]

        tau = ft.fit_prespike_time_constant(self.v, self.t, start, spike_time)
        latency = spike_time - start

        delay_ratio = latency / tau
        return delay_ratio, tau

    def _get_baseline_voltage(self):
        v = self.v
        t = self.t
        filter_frequency = 1.0  # in kHz

        # Look at baseline interval before start if start is defined
        if self.start is not None:
            return ft.average_voltage(v, t, self.start - self.baseline_interval, self.start)

        # Otherwise try to find an interval where things are pretty flat
        dv = ft.calculate_dvdt(v, t, filter_frequency)
        non_flat_points = np.flatnonzero(np.abs(dv) >= self.baseline_detect_thresh)
        flat_intervals = t[non_flat_points[1:]] - t[non_flat_points[:-1]]
        long_flat_intervals = np.flatnonzero(flat_intervals >= self.baseline_interval)
        if long_flat_intervals.size > 0:
            interval_index = long_flat_intervals[0] + 1
            baseline_end_time = t[non_flat_points[interval_index]]
            return ft.average_voltage(v, t, baseline_end_time - self.baseline_interval, baseline_end_time)
        else:
            logging.info("Could not find sufficiently flat interval for automatic baseline voltage", RuntimeWarning)
            return np.nan

    def voltage_deflection(self, deflect_type=None):
        """Measure deflection (min or max, between start and end if specified).

        Parameters
        ----------
        deflect_type : measure minimal ('min') or maximal ('max') voltage deflection
            If not specified, it will check to see if the current (i) is positive or negative
            between start and end, then choose 'max' or 'min', respectively
            If the current is not defined, it will default to 'min'.

        Returns
        -------
        deflect_v : peak
        deflect_index : index of peak deflection
        """

        deflect_dispatch = {"min": np.argmin, "max": np.argmax}

        start = self.start
        if not start:
            start = 0
        start_index = ft.find_time_index(self.t, start)

        end = self.end
        if not end:
            end = self.t[-1]
        end_index = ft.find_time_index(self.t, end)

        if deflect_type is None:
            if self.i is not None:
                halfway_index = ft.find_time_index(self.t, (end - start) / 2.0 + start)
                if self.i[halfway_index] >= 0:
                    deflect_type = "max"
                else:
                    deflect_type = "min"
            else:
                deflect_type = "min"

        deflect_func = deflect_dispatch[deflect_type]

        v_window = self.v[start_index:end_index]
        deflect_index = deflect_func(v_window) + start_index

        return self.v[deflect_index], deflect_index

    def stimulus_amplitude(self):
        """ """
        if self.stimulus_amplitude_calculator is not None:
            return self.stimulus_amplitude_calculator(self)
        else:
            return np.nan

    def estimate_time_constant(self):
        """Calculate the membrane time constant by fitting the voltage response with a
        single exponential.

        Returns
        -------
        tau : membrane time constant in seconds
        """

        # Assumes this is being done on a hyperpolarizing step
        v_peak, peak_index = self.voltage_deflection("min")
        v_baseline = self.sweep_feature("v_baseline")

        if self.start:
            start_index = ft.find_time_index(self.t, self.start)
        else:
            start_index = 0

        frac = 0.1
        search_result = np.flatnonzero(self.v[start_index:] <= frac * (v_peak - v_baseline) + v_baseline)
        if not search_result.size:
            raise ft.FeatureError("could not find interval for time constant estimate")
        fit_start = self.t[search_result[0] + start_index]
        fit_end = self.t[peak_index]

        a, inv_tau, y0 = ft.fit_membrane_time_constant(self.v, self.t, fit_start, fit_end)

        return 1.0 / inv_tau

    def estimate_sag(self, peak_width=0.005):
        """Calculate the sag in a hyperpolarizing voltage response.

        Parameters
        ----------
        peak_width : window width to get more robust peak estimate in sec (default 0.005)

        Returns
        -------
        sag : fraction that membrane potential relaxes back to baseline
        """

        t = self.t
        v = self.v

        start = self.start
        if not start:
            start = 0

        end = self.end
        if not end:
            end = self.t[-1]

        v_peak, peak_index = self.voltage_deflection("min")
        v_peak_avg = ft.average_voltage(
            v, t, start=t[peak_index] - peak_width / 2.0, end=t[peak_index] + peak_width / 2.0
        )
        v_baseline = self.sweep_feature("v_baseline")
        v_steady = ft.average_voltage(v, t, start=end - self.baseline_interval, end=end)
        sag = (v_peak_avg - v_steady) / (v_peak_avg - v_baseline)
        return sag

    def spikes(self):
        """Get all features for each spike as a list of records."""
        return self._spikes_df.to_dict("records")

    def spike_feature(self, key):
        """Get specified feature for every spike.

        Parameters
        ----------
        key : feature name

        Returns
        -------
        spike_feature_values : ndarray of features for each spike
        """

        if len(self._spikes_df) == 0:
            return np.array([])

        if key not in self._spikes_df.columns:
            raise KeyError("requested feature '{:s}' not available".format(key))

        return self._spikes_df[key].values

    def spike_feature_keys(self):
        """Get list of every available spike feature."""
        return self._spikes_df.columns.values.tolist()

    def sweep_feature(self, key, allow_missing=False):
        """Get sweep-level feature (`key`).

        Parameters
        ----------
        key : name of sweep-level feature
        allow_missing : return np.nan if key is missing for sweep (default False)

        Returns
        -------
        sweep_feature : sweep-level feature value
        """

        on_request_dispatch = {
            "v_baseline": self._get_baseline_voltage,
            "tau": self.estimate_time_constant,
            "sag": self.estimate_sag,
            "peak_deflect": self.voltage_deflection,
            "stim_amp": self.stimulus_amplitude,
        }

        if allow_missing and key not in self._sweep_features and key not in on_request_dispatch:
            return np.nan
        elif key not in self._sweep_features and key not in on_request_dispatch:
            raise KeyError("requested feature '{:s}' not available".format(key))

        if key not in self._sweep_features and key in on_request_dispatch:
            fn = on_request_dispatch[key]
            if fn is not None:
                self._sweep_features[key] = fn()
            else:
                raise KeyError("requested feature '{:s}' not defined".format(key))

        return self._sweep_features[key]

    def process_new_spike_feature(self, feature_name, feature_func):
        """Add new spike-level feature calculation function

           The function should take this sweep extractor as its argument. Its results
           can be accessed by calling the method spike_feature(<feature_name>).
        """

        if feature_name in self._spikes_df.columns:
            raise KeyError("Feature {:s} already exists for sweep".format(feature_name))

        features = feature_func(self)
        self._spikes_df[feature_name] = np.nan
        self._spikes_df.ix[: len(features) - 1, feature_name] = features

    def process_new_sweep_feature(self, feature_name, feature_func):
        """Add new sweep-level feature calculation function

           The function should take this sweep extractor as its argument. Its results
           can be accessed by calling the method sweep_feature(<feature_name>).
        """

        if feature_name in self._sweep_features:
            raise KeyError("Feature {:s} already exists for sweep".format(feature_name))

        self._sweep_features[feature_name] = feature_func(self)

    def set_stimulus_amplitude_calculator(self, function):
        self.stimulus_amplitude_calculator = function

    def sweep_feature_keys(self):
        """Get list of every available sweep-level feature."""
        return self._sweep_features.keys()

    def as_dict(self):
        """Create dict of features and spikes."""
        output_dict = self._sweep_features.copy()
        output_dict["spikes"] = self.spikes()
        if self.id is not None:
            output_dict["id"] = self.id
        return output_dict
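
A hedged usage sketch for the extractor above, assuming it is the AllenSDK class importable from allensdk.ephys.ephys_extractor (adjust the import if it lives elsewhere in your project). The flat synthetic trace simply yields zero spikes, which keeps the example deterministic:

import numpy as np
from allensdk.ephys.ephys_extractor import EphysSweepFeatureExtractor  # assumed import path

# synthetic 1 s sweep sampled at 50 kHz: a flat -70 mV trace, i.e. no spikes
t = np.arange(0, 1.0, 2e-5)
v = np.full_like(t, -70.0)

sweep = EphysSweepFeatureExtractor(t=t, v=v, start=0.1, end=0.9)
sweep.process_spikes()

print(sweep.sweep_feature("avg_rate"))    # 0 for a spike-free sweep
print(sweep.spikes())                     # [] -- per-spike feature records would appear here
print(sweep.sweep_feature("v_baseline"))  # mean voltage just before start, about -70.0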