Example #1
    def get_document_meta_mappings(self):
        """
        Depending on the configuration of the required location and campaign columns,
        query the source object map table and find the equivalent master_object_ids
        needed for the remainder of the ETL process.
        """

        # during the DocTransform process we associate new AND existing mappings
        # between the metadata associated with this document.

        # sm_ids = DocumentSourceObjectMap.objects.filter(document_id =\
        #     self.document_id).values_list('source_object_map_id',flat=True)

        # build a dict keyed by (content_type, source_object_code) tuples,
        # e.g. {('location', 'PAK'): 3}
        source_map_dict = DataFrame(
            list(
                SourceObjectMap.objects.filter(master_object_id__gt=0)
                # id__in = sm_ids)\
                .values_list(*["master_object_id"])
            ),
            columns=["master_object_id"],
            index=SourceObjectMap.objects.filter(master_object_id__gt=0)
            # ,id__in = sm_ids)\
            .values_list(*["content_type", "source_object_code"]),
        )

        source_map_dict = source_map_dict.to_dict()["master_object_id"]

        return source_map_dict
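
The trick above just turns two parallel value lists into a tuple-keyed lookup via DataFrame.to_dict(). A minimal, self-contained sketch of the same idea, with made-up content types and codes standing in for the Django querysets:

from pandas import DataFrame

# hypothetical rows standing in for the SourceObjectMap values_list() results
keys = [("location", "PAK"), ("campaign", "PAK-2014")]  # (content_type, source_object_code)
ids = [3, 7]                                            # master_object_id

source_map_dict = DataFrame(ids, columns=["master_object_id"], index=keys).to_dict()["master_object_id"]
print(source_map_dict[("location", "PAK")])  # 3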
Example #2
    def post(self, slug):
        mc = memcache.Client(["127.0.0.1:11211"], debug=0)

        columns = json.loads(MyBucket.get("{}-columns".format(slug)).data)
        fields = columns
        if self.get_argument("fields", None):
            fields = self.get_argument("fields").split(",")

        filters = [i[0] for i in self.request.arguments.iteritems() if len(i[0].split("filter__")) > 1]

        fields_json = json.dumps(fields)
        filters_json = json.dumps({f: self.get_argument(f) for f in filters})
        if (
            mc.get(str(slug))
            and mc.get("{}-columns".format(slug)) == fields_json
            and mc.get("{}-filters".format(slug)) == filters_json
        ):
            # cache hit: serve the stored payload and stop here
            self.write(mc.get(str(slug)))
            self.finish()
            return

        mc.set("{}-columns".format(slug), fields_json)
        mc.set("{}-filters".format(slug), filters_json)

        df = DataFrame(MyBucket.get(slug).data, columns=fields)
        if len(filters) >= 1:
            for f in filters:
                df = df.query(df_generate(df, self.get_argument, f))
        convert = df.to_dict(outtype="records")

        write = json.dumps({"columns": fields, "json": convert})
        mc.set(str(slug), write)
        self.write(write)
        self.finish()
Example #3
def run(cube_slug=None):
    mc = memcache.Client(["127.0.0.1:11211"], debug=0)
    for cube in MyAdminBucket.get("cube").data:
        try:
            slug = cube["slug"]

            if cube_slug and cube_slug != slug:
                continue

            sql = """SELECT * FROM ({}) AS CUBE;""".format(cube["sql"])
            for c in MyAdminBucket.get("connection").data:
                if c["slug"] == cube["connection"]:
                    connection = c["connection"]

            print "\n# CLEAN MEMCACHE/RIAK: {}".format(slug)
            mc.delete(str(slug))
            mc.delete(str("{}-columns".format(slug)))

            MyBucket.new(slug, data="").store()
            MyBucket.new(u"{}-columns".format(slug), data="").store()
            MyBucket.new(u"{}-connect".format(slug), data="").store()
            MyBucket.new(u"{}-sql".format(slug), data="").store()

            print "# CONNECT IN RELATION DATA BASE: {}".format(slug)
            e = create_engine(connection)
            connection = e.connect()

            resoverall = connection.execute(text(sql))

            print "# LOAD DATA ON DATAWAREHOUSE: {}".format(slug)
            df = DataFrame(resoverall.fetchall())
            if df.empty:
                print "[warnning]Empty cube: {}!!".format(cube)
                return
            df.columns = resoverall.keys()
            df.head()

            pdict = map(fix_render, df.to_dict(outtype="records"))

            print "# SAVE DATA (JSON) ON RIAK: {}".format(slug)
            MyBucket.new(slug, data=pdict).store()

            print "# SAVE COLUMNS ON RIAK: {}".format(slug)
            MyBucket.new(u"{}-columns".format(slug), data=json.dumps([c for c in df.columns])).store()

            print "# SAVE CONNECT ON RIAK: {}".format(slug)
            MyBucket.new(u"{}-connect".format(slug), data=c).store()

            print "# SAVE SQL ON RIAK: {}".format(slug)
            MyBucket.new(u"{}-sql".format(slug), data=sql).store()

            print "# CLEAN MEMORY: {}\n".format(slug)
            del pdict, df
            gc.collect()
        except:
            pass

    print "## FINISH"
    return True
Example #4
    def test_to_dict_timestamp(self):

        # GH11247
        # split/records producing np.datetime64 rather than Timestamps
        # on datetime64[ns] dtypes only

        tsmp = Timestamp("20130101")
        test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]})
        test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]})

        expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}]
        expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}]

        tm.assert_almost_equal(test_data.to_dict(orient="records"), expected_records)
        tm.assert_almost_equal(test_data_mixed.to_dict(orient="records"), expected_records_mixed)

        expected_series = {"A": Series([tsmp, tsmp]), "B": Series([tsmp, tsmp])}
        expected_series_mixed = {"A": Series([tsmp, tsmp]), "B": Series([1, 2])}

        tm.assert_almost_equal(test_data.to_dict(orient="series"), expected_series)
        tm.assert_almost_equal(test_data_mixed.to_dict(orient="series"), expected_series_mixed)

        expected_split = {"index": [0, 1], "data": [[tsmp, tsmp], [tsmp, tsmp]], "columns": ["A", "B"]}
        expected_split_mixed = {"index": [0, 1], "data": [[tsmp, 1], [tsmp, 2]], "columns": ["A", "B"]}

        tm.assert_almost_equal(test_data.to_dict(orient="split"), expected_split)
        tm.assert_almost_equal(test_data_mixed.to_dict(orient="split"), expected_split_mixed)
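
The behaviour this test pins down (GH11247) is easy to check by hand; a small sketch of the three orients on a throwaway mixed-dtype frame:

from pandas import DataFrame, Timestamp

ts = Timestamp("20130101")
frame = DataFrame({"A": [ts], "B": [1]})

# 'records': one dict per row; the datetime cell comes back as a Timestamp, not np.datetime64
print(frame.to_dict(orient="records"))

# 'split': separate 'index', 'columns' and 'data' entries, again holding Timestamp values
print(frame.to_dict(orient="split"))

# 'series': one Series per column, so dtypes are preserved as-is
print(frame.to_dict(orient="series"))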
Example #5
    def open(self, slug):
        columns = json.loads(MyBucket.get("{}-columns".format(slug)).data)
        fields = columns
        if self.get_argument("fields", None):
            fields = self.get_argument("fields").split(",")

        self.write_message({"type": "columns", "data": fields})

        filters = [i[0] for i in self.request.arguments.iteritems() if len(i[0].split("filter__")) > 1]

        df = DataFrame(MyBucket.get(slug).data, columns=fields)
        if len(filters) >= 1:
            for f in filters:
                df = df.query(df_generate(df, self.get_argument, f))

        for i in df.to_dict(outtype="records"):
            self.write_message({"type": "data", "data": i})

        self.close()
Example #6
    def open(self, slug):
        columns = json.loads(MyBucket.get("{}-columns".format(slug)).data)
        fields = columns
        if self.get_argument("fields", None):
            fields = self.get_argument("fields").split(",")

        self.write_message({"type": "columns", "data": fields})

        filters = [i[0] for i in self.request.arguments.iteritems() if len(i[0].split("filter__")) > 1]

        df = DataFrame(MyBucket.get(slug).data, columns=fields)
        if len(filters) >= 1:
            for f in filters:
                df = df.query(df_generate(df, self.get_argument(f), f))

        # CLEAN MEMORY
        del filters, fields, columns
        gc.collect()

        ca = None
        for e in MyAdminBucket.get("element").data:
            if e["slug"] == slug:
                ca = e["categories"]

        categories = []
        for i in df.to_dict(outtype="records"):
            if ca:
                categories.append(i[ca])
            self.write_message({"type": "data", "data": i})

        # CLEAN MEMORY
        del df
        gc.collect()

        self.write_message({"type": "categories", "data": categories})
        self.write_message({"type": "close"})

        # CLEAN MEMORY
        del categories
        gc.collect()
Example #7
class CubeProcess(object):
    def __init__(self, _cube):

        log_it("START: {}".format(_cube["slug"]), "bin-mining")

        self.mongo = MongoPlugin(uri=conf("mongodb")["uri"], db=conf("mongodb")["db"], json_mongo=True).get_mongo()

        del _cube["_id"]
        self.cube = _cube
        self.slug = self.cube["slug"]

    def load(self):
        self.cube["run"] = "run"
        self.mongo["cube"].update({"slug": self.slug}, self.cube)

        self.cube["start_process"] = datetime.now()

        _sql = self.cube["sql"]
        if _sql[-1] == ";":
            _sql = _sql[:-1]
        self.sql = u"""SELECT * FROM ({}) AS CUBE;""".format(_sql)

        self.connection = self.mongo["connection"].find_one({"slug": self.cube["connection"]})["connection"]

        log_it("CONNECT IN RELATION DATA BASE: {}".format(self.slug), "bin-mining")
        e = create_engine(self.connection, **conf("openmining")["sql_conn_params"])
        Session = sessionmaker(bind=e)
        session = Session()

        resoverall = session.execute(text(self.sql))
        self.data = resoverall.fetchall()
        self.keys = resoverall.keys()

    def environment(self, t):
        if t not in ["relational"]:
            self.sql = t

    def _data(self, data):
        self.data = data

    def _keys(self, keys):
        if type(keys) == list:
            self.keys = keys
            return
        self.keys = list(keys)

    def frame(self):
        log_it("LOAD DATA ON DATAWAREHOUSE: {}".format(self.slug), "bin-mining")
        self.df = DataFrame(self.data)
        if self.df.empty:
            log_it("[warning]Empty cube: {}!!".format(self.cube), "bin-mining")
            return
        self.df.columns = self.keys
        self.df.head()

        self.pdict = map(fix_render, self.df.to_dict(outtype="records"))

    def save(self):
        log_it("SAVE DATA (JSON) ON DATA WAREHOUSE: {}".format(self.slug), "bin-mining")
        data = {"data": self.pdict, "columns": self.keys}
        DW = DataWarehouse()
        DW.save(self.slug, data)

        self.cube["status"] = True
        self.cube["lastupdate"] = datetime.now()
        self.cube["run"] = True
        self.mongo["cube"].update({"slug": self.cube["slug"]}, self.cube)

        log_it("CLEAN MEMORY: {}".format(self.slug), "bin-mining")
        gc.collect()
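
The heart of frame() above is just rows plus keys in, a list of record dicts out. A self-contained sketch of that step, with fix_render stubbed out since its real definition lives elsewhere in the project:

from pandas import DataFrame

def fix_render(record):
    # stand-in for the project's fix_render, which normally cleans values up for JSON
    return record

data = [(1, "north", 10.5), (2, "south", 7.25)]  # what resoverall.fetchall() might return
keys = ["id", "region", "total"]                 # what resoverall.keys() might return

df = DataFrame(data)
df.columns = keys
pdict = list(map(fix_render, df.to_dict("records")))
print(pdict)  # [{'id': 1, 'region': 'north', 'total': 10.5}, {'id': 2, 'region': 'south', 'total': 7.25}]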
Example #8
def data(ws, mongodb, slug):
    if not ws:
        abort(400, "Expected WebSocket request.")

    DW = DataWarehouse()

    element = mongodb["element"].find_one({"slug": slug})

    element["page_limit"] = 50
    if request.GET.get("limit", True) is False:
        element["page_limit"] = 9999999999

    if element["type"] == "grid":
        page = int(request.GET.get("page", 1))
        page_start = 0
        page_end = element["page_limit"]
        if page >= 2:
            page_end = element["page_limit"] * page
            page_start = page_end - element["page_limit"]
    else:
        page = 1
        page_start = None
        page_end = None

    filters = [i[0] for i in request.GET.iteritems() if len(i[0].split("filter__")) > 1]

    if not DW.search:
        data = DW.get(element.get("cube"), page=page)
    else:
        data = DW.get(element.get("cube"), filters=filters, page=page)

    columns = data.get("columns") or []

    fields = columns
    if request.GET.get("fields", None):
        fields = request.GET.get("fields").split(",")

    cube_last_update = mongodb["cube"].find_one({"slug": element.get("cube")})
    ws.send(json.dumps({"type": "last_update", "data": str(cube_last_update.get("lastupdate", ""))}))

    ws.send(json.dumps({"type": "columns", "data": fields}))

    df = DataFrame(data.get("data") or {}, columns=fields)
    if len(filters) >= 1:
        for f in filters:
            s = f.split("__")
            field = s[1]
            operator = s[2]
            value = request.GET.get(f)
            if operator == "like":
                df = df[df[field].str.contains(value)]
            elif operator == "regex":
                df = DataFrameSearchColumn(df, field, value, operator)
            else:
                df = df.query(df_generate(df, value, f))

    groupby = []
    if request.GET.get("groupby", None):
        groupby = request.GET.get("groupby", "").split(",")
    if len(groupby) >= 1:
        df = DataFrame(df.groupby(groupby).grouper.get_group_levels())

    if (
        request.GET.get("orderby", element.get("orderby", None))
        and request.GET.get("orderby", element.get("orderby", None)) in fields
    ):

        orderby = request.GET.get("orderby", element.get("orderby", ""))
        if type(orderby) == str:
            orderby = orderby.split(",")
        orderby__order = request.GET.get("orderby__order", element.get("orderby__order", ""))
        if type(orderby__order) == str:
            orderby__order = orderby__order.split(",")
        ind = 0
        for orde in orderby__order:
            if orde == "0":
                orderby__order[ind] = False
            else:
                orderby__order[ind] = True
            ind += 1
        df = df.sort(orderby, ascending=orderby__order)

    ws.send(json.dumps({"type": "max_page", "data": data.get("count", len(df))}))

    # CLEAN MEMORY
    del filters, fields, columns
    gc.collect()
    categories = []

    records = df.to_dict(orient="records")
    if not DW.search:
        records = records[page_start:page_end]
    for i in records:
        if element.get("categories", None):
            categories.append(i[element.get("categories")])
        ws.send(json.dumps({"type": "data", "data": i}))

    # CLEAN MEMORY
    del df
    gc.collect()

    ws.send(json.dumps({"type": "categories", "data": categories}))
    ws.send(json.dumps({"type": "close"}))

    # CLEAN MEMORY
    del categories
    gc.collect()
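
The filter__ convention used above (a query-string key of the form filter__<field>__<operator>) is easiest to see on a toy frame. A sketch of just the 'like' branch, with made-up field names:

from pandas import DataFrame

df = DataFrame({"name": ["alice", "bob", "carol"], "age": [30, 25, 41]})

# e.g. the request carried filter__name__like=a
f = "filter__name__like"
value = "a"

_, field, operator = f.split("__")
if operator == "like":
    df = df[df[field].str.contains(value)]

print(df)  # keeps the rows whose 'name' contains "a": alice and carol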
Example #9
def data(ws, mongodb, slug):
    if not ws:
        abort(400, "Expected WebSocket request.")

    MyClient = riak.RiakClient(
        protocol=conf("riak")["protocol"], http_port=conf("riak")["http_port"], host=conf("riak")["host"]
    )

    MyBucket = MyClient.bucket(conf("riak")["bucket"])

    element = mongodb["element"].find_one({"slug": slug})

    element["page_limit"] = 50
    if request.GET.get("limit", True) is False:
        element["page_limit"] = 9999999999

    coll = MyBucket.get("{}-columns".format(element.get("cube"))).data
    columns = json.loads(coll) if coll else []

    fields = columns
    if request.GET.get("fields", None):
        fields = request.GET.get("fields").split(",")

    cube_last_update = mongodb["cube"].find_one({"slug": element.get("cube")})
    ws.send(json.dumps({"type": "last_update", "data": str(cube_last_update.get("lastupdate", ""))}))

    ws.send(json.dumps({"type": "columns", "data": fields}))

    filters = [i[0] for i in request.GET.iteritems() if len(i[0].split("filter__")) > 1]

    if element["type"] == "grid":
        page = int(request.GET.get("page", 1))
        page_start = 0
        page_end = element["page_limit"]
        if page >= 2:
            page_end = element["page_limit"] * page
            page_start = page_end - element["page_limit"]
    else:
        page_start = None
        page_end = None

    df = DataFrame(MyBucket.get(element.get("cube")).data, columns=fields)
    if len(filters) >= 1:
        for f in filters:
            s = f.split("__")
            field = s[1]
            operator = s[2]
            value = request.GET.get(f)
            if operator == "like":
                df = df[df[field].str.contains(value)]
            elif operator == "regex":
                df = DataFrameSearchColumn(df, field, value, operator)
            else:
                df = df.query(df_generate(df, value, f))

    groupby = []
    if request.GET.get("groupby", None):
        groupby = request.GET.get("groupby").split(",")
    if len(groupby) >= 1:
        df = DataFrame(df.groupby(groupby).grouper.get_group_levels())

    if (
        request.GET.get("orderby", element.get("orderby", None))
        and request.GET.get("orderby", element.get("orderby", None)) in fields
    ):

        orderby = request.GET.get("orderby", element.get("orderby", ""))
        if type(orderby) == str:
            orderby = orderby.split(",")
        orderby__order = request.GET.get("orderby__order", element.get("orderby__order", ""))
        if type(orderby__order) == str:
            orderby__order = orderby__order.split(",")
        ind = 0
        for orde in orderby__order:
            if orde == "0":
                orderby__order[ind] = False
            else:
                orderby__order[ind] = True
            ind += 1
        df = df.sort(orderby, ascending=orderby__order)

    ws.send(json.dumps({"type": "max_page", "data": len(df)}))

    # CLEAN MEMORY
    del filters, fields, columns
    gc.collect()
    categories = []
    for i in df.to_dict(outtype="records")[page_start:page_end]:
        if element.get("categories", None):
            categories.append(i[element.get("categories")])
        ws.send(json.dumps({"type": "data", "data": i}))

    # CLEAN MEMORY
    del df
    gc.collect()

    ws.send(json.dumps({"type": "categories", "data": categories}))
    ws.send(json.dumps({"type": "close"}))

    # CLEAN MEMORY
    del categories
    gc.collect()
Example #10
class CubeProcess(object):
    def __init__(self, _cube):

        log_it("START: {}".format(_cube["slug"]), "bin-mining")

        self.mongo = MongoPlugin(uri=conf("mongodb")["uri"], db=conf("mongodb")["db"], json_mongo=True).get_mongo()

        MyClient = riak.RiakClient(
            protocol=conf("riak")["protocol"], http_port=conf("riak")["http_port"], host=conf("riak")["host"]
        )

        self.MyBucket = MyClient.bucket(conf("riak")["bucket"])
        self.MyBucket.enable_search()
        del _cube["_id"]
        self.cube = _cube
        self.slug = self.cube["slug"]

    def load(self):
        self.cube["run"] = "run"
        self.mongo["cube"].update({"slug": self.slug}, self.cube)

        self.cube["start_process"] = datetime.now()

        _sql = self.cube["sql"]
        if _sql[-1] == ";":
            _sql = _sql[:-1]
        self.sql = u"""SELECT * FROM ({}) AS CUBE;""".format(_sql)

        self.connection = self.mongo["connection"].find_one({"slug": self.cube["connection"]})["connection"]

        log_it("CONNECT IN RELATION DATA BASE: {}".format(self.slug), "bin-mining")
        e = create_engine(self.connection, **conf("openmining")["sql_conn_params"])
        Session = sessionmaker(bind=e)
        session = Session()

        resoverall = session.execute(text(self.sql))
        self.data = resoverall.fetchall()
        self.keys = resoverall.keys()

    def environment(self, t):
        if t not in ["relational"]:
            self.sql = t

    def _data(self, data):
        self.data = data

    def _keys(self, keys):
        self.keys = keys

    def frame(self):
        log_it("LOAD DATA ON DATAWAREHOUSE: {}".format(self.slug), "bin-mining")
        self.df = DataFrame(self.data)
        if self.df.empty:
            log_it("[warning]Empty cube: {}!!".format(self.cube), "bin-mining")
            return
        self.df.columns = self.keys
        self.df.head()

        self.pdict = map(fix_render, self.df.to_dict(outtype="records"))

    def clean(self):
        log_it("CLEAN DATA (JSON) ON RIAK: {}".format(self.slug), "bin-mining")

        self.MyBucket.new(self.slug, data="").store()
        self.MyBucket.new(u"{}-columns".format(self.slug), data="").store()
        self.MyBucket.new(u"{}-connect".format(self.slug), data="").store()
        self.MyBucket.new(u"{}-sql".format(self.slug), data="").store()

    def save(self):
        self.clean()

        log_it("SAVE DATA (JSON) ON RIAK: {}".format(self.slug), "bin-mining")
        self.MyBucket.new(self.slug, data=self.pdict, content_type="application/json").store()

        log_it("SAVE COLUMNS ON RIAK: {}".format(self.slug), "bin-mining")
        self.MyBucket.new(u"{}-columns".format(self.slug), data=json.dumps(self.keys)).store()

        log_it("SAVE CONNECT ON RIAK: {}".format(self.slug), "bin-mining")
        self.MyBucket.new(u"{}-connect".format(self.slug), data=self.connection).store()

        log_it("SAVE SQL ON RIAK: {}".format(self.slug), "bin-mining")
        self.MyBucket.new(u"{}-sql".format(self.slug), data=self.sql).store()

        self.cube["status"] = True
        self.cube["lastupdate"] = datetime.now()
        self.cube["run"] = True
        self.mongo["cube"].update({"slug": self.cube["slug"]}, self.cube)

        log_it("CLEAN MEMORY: {}".format(self.slug), "bin-mining")
        gc.collect()
Example #11
def data(mongodb, slug):
    # choose the transport protocol (websocket or plain http) for this request
    ws = request.environ.get("wsgi.websocket")
    protocol = "websocket"
    if not ws:
        response.content_type = "application/json"
        protocol = "http"
    DataManager = __from__("mining.controllers.data.{}.DataManager".format(protocol))

    # instantiate the DataManager for the chosen protocol
    DM = DataManager(ws)

    # instantiate data warehouse
    DW = DataWarehouse()

    element = mongodb["element"].find_one({"slug": slug})

    element["page_limit"] = 50
    if request.GET.get("limit", True) is False:
        element["page_limit"] = 9999999999

    if element["type"] == "grid" and "download" not in request.GET.keys():
        page = int(request.GET.get("page", 1))
        page_start = 0
        page_end = element["page_limit"]
        if page >= 2:
            page_end = element["page_limit"] * page
            page_start = page_end - element["page_limit"]
    else:
        page = 1
        page_start = None
        page_end = None

    filters = [i[0] for i in request.GET.iteritems() if len(i[0].split("filter__")) > 1]

    if not DW.search:
        data = DW.get(element.get("cube"), page=page)
    else:
        data = DW.get(element.get("cube"), filters=filters, page=page)

    columns = data.get("columns") or []

    fields = columns
    if request.GET.get("fields", None):
        fields = request.GET.get("fields").split(",")

    cube_last_update = mongodb["cube"].find_one({"slug": element.get("cube")})
    DM.send(json.dumps({"type": "last_update", "data": str(cube_last_update.get("lastupdate", ""))}))

    DM.send(json.dumps({"type": "columns", "data": fields}))

    df = DataFrame(data.get("data") or {}, columns=fields)
    if len(filters) >= 1:
        for f in filters:
            s = f.split("__")
            field = s[1]
            operator = s[2]
            value = request.GET.get(f)
            if operator == "like":
                df = df[df[field].str.contains(value)]
            elif operator == "regex":
                df = DataFrameSearchColumn(df, field, value, operator)
            else:
                df = df.query(df_generate(df, value, f))

    groupby = []
    if request.GET.get("groupby", None):
        groupby = request.GET.get("groupby", "").split(",")
    if len(groupby) >= 1:
        df = DataFrame(df.groupby(groupby).grouper.get_group_levels())

    if (
        request.GET.get("orderby", element.get("orderby", None))
        and request.GET.get("orderby", element.get("orderby", None)) in fields
    ):

        orderby = request.GET.get("orderby", element.get("orderby", ""))
        if type(orderby) == str:
            orderby = orderby.split(",")
        orderby__order = request.GET.get("orderby__order", element.get("orderby__order", ""))
        if type(orderby__order) == str:
            orderby__order = orderby__order.split(",")
        ind = 0
        for orde in orderby__order:
            if orde == "0":
                orderby__order[ind] = False
            else:
                orderby__order[ind] = True
            ind += 1
        df = df.sort(orderby, ascending=orderby__order)

    DM.send(json.dumps({"type": "max_page", "data": data.get("count", len(df))}))

    # CLEAN MEMORY
    del filters, fields, columns
    gc.collect()
    categories = []

    # TODO: loop over aggregates (apply multiple aggregations)
    aggregate = [i[0] for i in request.GET.iteritems() if len(i[0].split("aggregate__")) > 1]
    if len(aggregate) >= 1:
        agg = aggregate[0].split("__")
        _agg = getattr(df.groupby(agg[1]), request.GET.get(aggregate[0]))()
        DF_A = DataFrame(_agg[_agg.keys()[0]]).to_dict().get(_agg.keys()[0])
        DM.send(json.dumps({"type": "aggregate", "data": DF_A}))

    records = df.to_dict(orient="records")
    if not DW.search:
        records = records[page_start:page_end]
    for i in records:
        if element.get("categories", None):
            categories.append(i[element.get("categories")])
        DM.send(json.dumps({"type": "data", "data": i}))

    DM.send(json.dumps({"type": "categories", "data": categories}))
    DM.send(json.dumps({"type": "close"}))

    # CLEAN MEMORY
    del categories
    gc.collect()

    if not ws:
        if "download" in request.GET.keys():

            ext = request.GET.get("download", "xls")
            if ext == "":
                ext = "xls"

            file_name = "{}/frontend/assets/exports/openmining-{}.{}".format(PROJECT_PATH, element.get("cube"), ext)
            if ext == "csv":
                df.to_csv(file_name, sep=";")
                contenttype = "text/csv"
            else:
                df.to_excel(file_name)
                contenttype = "application/vnd.ms-excel"

            response.set_header("charset", "utf-8")
            response.set_header("Content-disposition", "attachment; " "filename={}.{}".format(element.get("cube"), ext))
            response.content_type = contenttype

            ifile = open(file_name, "rb")
            o = ifile.read()
            ifile.close()

            return o

        return json.dumps(DM.data)
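
The aggregate__ branch above turns a query-string key of the form aggregate__<column>=<function> into a pandas groupby call. A self-contained sketch with made-up data:

from pandas import DataFrame

df = DataFrame({"region": ["north", "north", "south"], "total": [10, 5, 7]})

# e.g. the request carried aggregate__region=sum
agg_key = "aggregate__region"
agg_func = "sum"

column = agg_key.split("__")[1]
_agg = getattr(df.groupby(column), agg_func)()  # same as df.groupby("region").sum()
print(_agg.to_dict())  # {'total': {'north': 15, 'south': 7}}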
Example #12
class Cube(object):
    def __init__(self, _cube):

        log_it("START: {}".format(_cube["slug"]), "bin-mining")

        self.mongo = MongoPlugin(uri=conf("mongodb")["uri"], db=conf("mongodb")["db"], json_mongo=True).get_mongo()

        try:
            del _cube["_id"]
        except KeyError:
            pass
        self.cube = _cube
        self.slug = self.cube["slug"]

    def load(self):
        self.cube["run"] = "run"
        self.mongo["cube"].update({"slug": self.slug}, self.cube)

        self.cube["start_process"] = datetime.now()

        _sql = self.cube["sql"]
        if _sql[-1] == ";":
            _sql = _sql[:-1]
        self.sql = u"""SELECT * FROM ({}) AS CUBE;""".format(_sql)

        self.connection = self.mongo["connection"].find_one({"slug": self.cube["connection"]})["connection"]

        log_it("CONNECT IN RELATION DATA BASE: {}".format(self.slug), "bin-mining")
        if "sqlite" in self.connection:
            e = create_engine(self.connection)
        else:
            e = create_engine(self.connection, **conf("openmining")["sql_conn_params"])
        Session = sessionmaker(bind=e)
        session = Session()

        resoverall = session.execute(text(self.sql))
        self.data = resoverall.fetchall()
        self.keys = resoverall.keys()

    def environment(self, t):
        if t not in ["relational"]:
            self.sql = t

    def _data(self, data):
        self.data = data

    def _keys(self, keys):
        if type(keys) == list:
            self.keys = keys
            return
        self.keys = list(keys)

    def frame(self, data_type=None):
        log_it("LOAD DATA ON DATAWAREHOUSE via {}: {}".format(data_type or "dict", self.slug), "bin-mining")
        if data_type:
            self.df = getattr(pandas, "read_{}".format(data_type))(self.data)
        else:
            self.df = DataFrame(self.data)

        if self.df.empty:
            self.pdict = {}
            log_it("[warning]Empty cube: {}!!".format(self.cube), "bin-mining")
            return

        try:
            self.df.columns = self.keys
        except AttributeError:
            self._keys(self.df.columns.tolist())

        # If OML is enabled and the cube defines an OML script, run it over the records
        if conf("oml").get("on") and self.cube.get("oml"):
            from oml import RunTime

            self.df.columns = self.keys
            df = RunTime(
                conf("oml").get("language", "lua"),
                self.df.to_dict(orient="records"),
                self.cube.get("oml"),
                conf("oml").get("class", {"OML": "oml.base.OMLBase"}),
            )
            self.df = DataFrame(df)
            self._keys(self.df.columns.tolist())

        self.df.head()
        self.pdict = map(fix_render, self.df.to_dict(orient="records"))

    def save(self):
        log_it("SAVE DATA (JSON) ON DATA WAREHOUSE: {}".format(self.slug), "bin-mining")
        data = {"data": self.pdict, "columns": self.keys}
        DW = DataWarehouse()
        DW.save(self.slug, data)

        self.cube["status"] = True
        self.cube["lastupdate"] = datetime.now()
        self.cube["run"] = True
        self.mongo["cube"].update({"slug": self.cube["slug"]}, self.cube)

        log_it("CLEAN MEMORY: {}".format(self.slug), "bin-mining")
        gc.collect()
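
frame(data_type=...) above picks its reader with getattr(pandas, "read_<type>"). A minimal sketch of that dispatch for a synthetic JSON payload, without the OML branch:

import pandas
from io import StringIO

payload = StringIO(u'[{"id": 1, "total": 10.5}, {"id": 2, "total": 7.25}]')

data_type = "json"
reader = getattr(pandas, "read_{}".format(data_type))  # resolves to pandas.read_json
df = reader(payload)
print(df.to_dict(orient="records"))  # [{'id': 1, 'total': 10.5}, {'id': 2, 'total': 7.25}]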
Example #13
class EphysSweepFeatureExtractor:
    """Feature calculation for a sweep (voltage and/or current time series)."""

    def __init__(
        self,
        t=None,
        v=None,
        i=None,
        start=None,
        end=None,
        filter=10.0,
        dv_cutoff=20.0,
        max_interval=0.005,
        min_height=2.0,
        min_peak=-30.0,
        thresh_frac=0.05,
        baseline_interval=0.1,
        baseline_detect_thresh=0.3,
        id=None,
    ):
        """Initialize SweepFeatures object.

        Parameters
        ----------
        t : ndarray of times (seconds)
        v : ndarray of voltages (mV)
        i : ndarray of currents (pA)
        start : start of time window for feature analysis (optional)
        end : end of time window for feature analysis (optional)
        filter : cutoff frequency for 4-pole low-pass Bessel filter in kHz (optional, default 10)
        dv_cutoff : minimum dV/dt to qualify as a spike in V/s (optional, default 20)
        max_interval : maximum acceptable time between start of spike and time of peak in sec (optional, default 0.005)
        min_height : minimum acceptable height from threshold to peak in mV (optional, default 2)
        min_peak : minimum acceptable absolute peak level in mV (optional, default -30)
        thresh_frac : fraction of average upstroke for threshold calculation (optional, default 0.05)
        baseline_interval : interval length for baseline voltage calculation (before start if start is defined, default 0.1)
        baseline_detect_thresh : dV/dt threshold for evaluating flatness of baseline region (optional, default 0.3)
        """
        self.id = id
        self.t = t
        self.v = v
        self.i = i
        self.start = start
        self.end = end
        self.filter = filter
        self.dv_cutoff = dv_cutoff
        self.max_interval = max_interval
        self.min_height = min_height
        self.min_peak = min_peak
        self.thresh_frac = thresh_frac
        self.baseline_interval = baseline_interval
        self.baseline_detect_thresh = baseline_detect_thresh
        self.stimulus_amplitude_calculator = None

        self._sweep_features = {}

    def process_spikes(self):
        """Perform spike-related feature analysis"""
        self._process_individual_spikes()
        self._process_spike_related_features()

    def _process_individual_spikes(self):
        v = self.v
        t = self.t
        dvdt = ft.calculate_dvdt(v, t, self.filter)

        # Basic features of spikes
        putative_spikes = ft.detect_putative_spikes(v, t, self.start, self.end, self.filter, self.dv_cutoff)
        peaks = ft.find_peak_indexes(v, t, putative_spikes, self.end)
        putative_spikes, peaks = ft.filter_putative_spikes(v, t, putative_spikes, peaks, self.min_height, self.min_peak)

        if not putative_spikes.size:
            # Save time if no spikes detected
            self._spikes_df = DataFrame()
            return

        upstrokes = ft.find_upstroke_indexes(v, t, putative_spikes, peaks, self.filter, dvdt)
        thresholds = ft.refine_threshold_indexes(v, t, upstrokes, self.thresh_frac, self.filter, dvdt)
        thresholds, peaks, upstrokes = ft.check_thresholds_and_peaks(
            v, t, thresholds, peaks, upstrokes, self.max_interval
        )

        if not thresholds.size:
            # Save time if no spikes detected
            self._spikes_df = DataFrame()
            return

        # Spike list and thresholds have been refined - now find other features
        upstrokes = ft.find_upstroke_indexes(v, t, thresholds, peaks, self.filter, dvdt)
        troughs = ft.find_trough_indexes(v, t, thresholds, peaks, self.end)
        downstrokes = ft.find_downstroke_indexes(v, t, peaks, troughs, self.filter, dvdt)
        trough_details = ft.analyze_trough_details(v, t, thresholds, peaks, self.end, self.filter, dvdt=dvdt)
        widths = ft.find_widths(v, t, thresholds, peaks, trough_details[1])

        # Points where we care about t, v, and i if available
        vit_data_indexes = {"threshold": thresholds, "peak": peaks, "trough": troughs}

        # Points where we care about t and dv/dt
        dvdt_data_indexes = {"upstroke": upstrokes, "downstroke": downstrokes}

        # Trough details
        isi_types = trough_details[0]
        trough_detail_indexes = dict(zip(["fast_trough", "adp", "slow_trough"], trough_details[1:]))

        # Redundant, but ensures that DataFrame has right number of rows
        # Any better way to do it?
        spikes_df = DataFrame(data=thresholds, columns=["threshold_index"])

        for k, vals in vit_data_indexes.iteritems():
            spikes_df[k + "_index"] = np.nan
            spikes_df[k + "_t"] = np.nan
            spikes_df[k + "_v"] = np.nan

            if len(vals) > 0:
                spikes_df.ix[: len(vals) - 1, k + "_index"] = vals
                spikes_df.ix[: len(vals) - 1, k + "_t"] = t[vals]
                spikes_df.ix[: len(vals) - 1, k + "_v"] = v[vals]

            if self.i is not None:
                spikes_df[k + "_i"] = np.nan
                if len(vals) > 0:
                    spikes_df.ix[: len(vals) - 1, k + "_i"] = self.i[vals]

        for k, vals in dvdt_data_indexes.iteritems():
            spikes_df[k + "_index"] = np.nan
            spikes_df[k] = np.nan
            if len(vals) > 0:
                spikes_df.ix[: len(vals) - 1, k + "_index"] = vals
                spikes_df.ix[: len(vals) - 1, k + "_t"] = t[vals]
                spikes_df.ix[: len(vals) - 1, k + "_v"] = v[vals]
                spikes_df.ix[: len(vals) - 1, k] = dvdt[vals]

        spikes_df["isi_type"] = isi_types

        for k, vals in trough_detail_indexes.iteritems():
            spikes_df[k + "_index"] = np.nan
            if np.any(~np.isnan(vals)):
                spikes_df.ix[~np.isnan(vals), k + "_index"] = vals[~np.isnan(vals)]

            spikes_df[k + "_t"] = np.nan
            if np.any(~np.isnan(vals)):
                spikes_df.ix[~np.isnan(vals), k + "_t"] = t[vals[~np.isnan(vals)].astype(int)]

            spikes_df[k + "_v"] = np.nan
            if np.any(~np.isnan(vals)):
                spikes_df.ix[~np.isnan(vals), k + "_v"] = v[vals[~np.isnan(vals)].astype(int)]

            if self.i is not None:
                spikes_df[k + "_i"] = np.nan
                if np.any(~np.isnan(vals)):
                    spikes_df.ix[~np.isnan(vals), k + "_i"] = self.i[vals[~np.isnan(vals)].astype(int)]

        spikes_df["width"] = np.nan
        spikes_df.ix[: len(widths) - 1, "width"] = widths

        spikes_df["upstroke_downstroke_ratio"] = spikes_df["upstroke"] / -spikes_df["downstroke"]

        self._spikes_df = spikes_df

    def _process_spike_related_features(self):
        t = self.t

        if len(self._spikes_df) == 0:
            self._sweep_features["avg_rate"] = 0
            return

        thresholds = self._spikes_df["threshold_index"].values.astype(int)
        isis = ft.get_isis(t, thresholds)
        with warnings.catch_warnings():
            # ignore mean of empty slice warnings here
            warnings.filterwarnings("ignore", category=RuntimeWarning, module="numpy")

            sweep_level_features = {
                "adapt": ft.adaptation_index(isis),
                "latency": ft.latency(t, thresholds, self.start),
                "isi_cv": (isis.std() / isis.mean()) if len(isis) >= 1 else np.nan,
                "mean_isi": isis.mean(),
                "median_isi": np.median(isis),
                "first_isi": isis[0] if len(isis) >= 1 else np.nan,
                "avg_rate": ft.average_rate(t, thresholds, self.start, self.end),
            }

        for k, v in sweep_level_features.iteritems():
            self._sweep_features[k] = v

    def _process_pauses(self, cost_weight=1.0):
        # Pauses are unusually long ISIs with a "detour reset" among delay resets
        thresholds = self._spikes_df["threshold_index"].values.astype(int)
        isis = ft.get_isis(self.t, thresholds)
        isi_types = self._spikes_df["isi_type"][:-1].values

        return ft.detect_pauses(isis, isi_types, cost_weight)

    def pause_metrics(self):
        """Estimate average number of pauses and average fraction of time spent in a pause

        Attempts to detect pauses with a variety of conditions and averages results together.

        Pauses that are consistently detected contribute more to estimates.

        Returns
        -------
        avg_n_pauses : average number of pauses detected across conditions
        avg_pause_frac : average fraction of interval (between start and end) spent in a pause
        max_reliability : max fraction of times most reliable pause was detected given weights tested
        n_max_rel_pauses : number of pauses detected with `max_reliability`
        """

        thresholds = self._spikes_df["threshold_index"].values.astype(int)
        isis = ft.get_isis(self.t, thresholds)

        weight = 1.0
        pause_list = self._process_pauses(weight)

        if len(pause_list) == 0:
            return 0, 0.0

        n_pauses = len(pause_list)
        pause_frac = isis[pause_list].sum()
        pause_frac /= self.end - self.start

        return n_pauses, pause_frac

    def _process_bursts(self, tol=0.5, pause_cost=1.0):
        thresholds = self._spikes_df["threshold_index"].values.astype(int)
        isis = ft.get_isis(self.t, thresholds)

        isi_types = self._spikes_df["isi_type"][:-1].values

        fast_tr_v = self._spikes_df["fast_trough_v"].values
        fast_tr_t = self._spikes_df["fast_trough_t"].values
        slow_tr_v = self._spikes_df["slow_trough_v"].values
        slow_tr_t = self._spikes_df["slow_trough_t"].values
        thr_v = self._spikes_df["threshold_v"].values

        bursts = ft.detect_bursts(isis, isi_types, fast_tr_v, fast_tr_t, slow_tr_v, slow_tr_t, thr_v, tol, pause_cost)

        return np.array(bursts)

    def burst_metrics(self):
        """Find bursts and return max "burstiness" index (normalized max rate in burst vs out).

        Returns
        -------
        max_burstiness_index : max "burstiness" index across detected bursts
        num_bursts : number of bursts detected
        """

        burst_info = self._process_bursts()

        if burst_info.shape[0] > 0:
            return burst_info[:, 0].max(), burst_info.shape[0]
        else:
            return 0.0, 0

    def delay_metrics(self):
        """Calculates ratio of latency to dominant time constant of rise before spike

        Returns
        -------
        delay_ratio : ratio of latency to tau (higher means more delay)
        tau : dominant time constant of rise before spike
        """

        if len(self._spikes_df) == 0:
            logging.info("No spikes available for delay calculation")
            return 0.0, 0.0
        start = self.start
        spike_time = self._spikes_df["threshold_t"].values[0]

        tau = ft.fit_prespike_time_constant(self.v, self.t, start, spike_time)
        latency = spike_time - start

        delay_ratio = latency / tau
        return delay_ratio, tau

    def _get_baseline_voltage(self):
        v = self.v
        t = self.t
        filter_frequency = 1.0  # in kHz

        # Look at baseline interval before start if start is defined
        if self.start is not None:
            return ft.average_voltage(v, t, self.start - self.baseline_interval, self.start)

        # Otherwise try to find an interval where things are pretty flat
        dv = ft.calculate_dvdt(v, t, filter_frequency)
        non_flat_points = np.flatnonzero(np.abs(dv) >= self.baseline_detect_thresh)
        flat_intervals = t[non_flat_points[1:]] - t[non_flat_points[:-1]]
        long_flat_intervals = np.flatnonzero(flat_intervals >= self.baseline_interval)
        if long_flat_intervals.size > 0:
            interval_index = long_flat_intervals[0] + 1
            baseline_end_time = t[non_flat_points[interval_index]]
            return ft.average_voltage(v, t, baseline_end_time - self.baseline_interval, baseline_end_time)
        else:
            logging.info("Could not find sufficiently flat interval for automatic baseline voltage", RuntimeWarning)
            return np.nan

    def voltage_deflection(self, deflect_type=None):
        """Measure deflection (min or max, between start and end if specified).

        Parameters
        ----------
        deflect_type : measure minimal ('min') or maximal ('max') voltage deflection
            If not specified, it will check to see if the current (i) is positive or negative
            between start and end, then choose 'max' or 'min', respectively
            If the current is not defined, it will default to 'min'.

        Returns
        -------
        deflect_v : peak
        deflect_index : index of peak deflection
        """

        deflect_dispatch = {"min": np.argmin, "max": np.argmax}

        start = self.start
        if not start:
            start = 0
        start_index = ft.find_time_index(self.t, start)

        end = self.end
        if not end:
            end = self.t[-1]
        end_index = ft.find_time_index(self.t, end)

        if deflect_type is None:
            if self.i is not None:
                halfway_index = ft.find_time_index(self.t, (end - start) / 2.0 + start)
                if self.i[halfway_index] >= 0:
                    deflect_type = "max"
                else:
                    deflect_type = "min"
            else:
                deflect_type = "min"

        deflect_func = deflect_dispatch[deflect_type]

        v_window = self.v[start_index:end_index]
        deflect_index = deflect_func(v_window) + start_index

        return self.v[deflect_index], deflect_index

    def stimulus_amplitude(self):
        """ """
        if self.stimulus_amplitude_calculator is not None:
            return self.stimulus_amplitude_calculator(self)
        else:
            return np.nan

    def estimate_time_constant(self):
        """Calculate the membrane time constant by fitting the voltage response with a
        single exponential.

        Returns
        -------
        tau : membrane time constant in seconds
        """

        # Assumes this is being done on a hyperpolarizing step
        v_peak, peak_index = self.voltage_deflection("min")
        v_baseline = self.sweep_feature("v_baseline")

        if self.start:
            start_index = ft.find_time_index(self.t, self.start)
        else:
            start_index = 0

        frac = 0.1
        search_result = np.flatnonzero(self.v[start_index:] <= frac * (v_peak - v_baseline) + v_baseline)
        if not search_result.size:
            raise ft.FeatureError("could not find interval for time constant estimate")
        fit_start = self.t[search_result[0] + start_index]
        fit_end = self.t[peak_index]

        a, inv_tau, y0 = ft.fit_membrane_time_constant(self.v, self.t, fit_start, fit_end)

        return 1.0 / inv_tau

    def estimate_sag(self, peak_width=0.005):
        """Calculate the sag in a hyperpolarizing voltage response.

        Parameters
        ----------
        peak_width : window width to get more robust peak estimate in sec (default 0.005)

        Returns
        -------
        sag : fraction that membrane potential relaxes back to baseline
        """

        t = self.t
        v = self.v

        start = self.start
        if not start:
            start = 0

        end = self.end
        if not end:
            end = self.t[-1]

        v_peak, peak_index = self.voltage_deflection("min")
        v_peak_avg = ft.average_voltage(
            v, t, start=t[peak_index] - peak_width / 2.0, end=t[peak_index] + peak_width / 2.0
        )
        v_baseline = self.sweep_feature("v_baseline")
        v_steady = ft.average_voltage(v, t, start=end - self.baseline_interval, end=end)
        sag = (v_peak_avg - v_steady) / (v_peak_avg - v_baseline)
        return sag

    def spikes(self):
        """Get all features for each spike as a list of records."""
        return self._spikes_df.to_dict("records")

    def spike_feature(self, key):
        """Get specified feature for every spike.

        Parameters
        ----------
        key : feature name

        Returns
        -------
        spike_feature_values : ndarray of features for each spike
        """

        if len(self._spikes_df) == 0:
            return np.array([])

        if key not in self._spikes_df.columns:
            raise KeyError("requested feature '{:s}' not available".format(key))

        return self._spikes_df[key].values

    def spike_feature_keys(self):
        """Get list of every available spike feature."""
        return self._spikes_df.columns.values.tolist()

    def sweep_feature(self, key, allow_missing=False):
        """Get sweep-level feature (`key`).

        Parameters
        ----------
        key : name of sweep-level feature
        allow_missing : return np.nan if key is missing for sweep (default False)

        Returns
        -------
        sweep_feature : sweep-level feature value
        """

        on_request_dispatch = {
            "v_baseline": self._get_baseline_voltage,
            "tau": self.estimate_time_constant,
            "sag": self.estimate_sag,
            "peak_deflect": self.voltage_deflection,
            "stim_amp": self.stimulus_amplitude,
        }

        if allow_missing and key not in self._sweep_features and key not in on_request_dispatch:
            return np.nan
        elif key not in self._sweep_features and key not in on_request_dispatch:
            raise KeyError("requested feature '{:s}' not available".format(key))

        if key not in self._sweep_features and key in on_request_dispatch:
            fn = on_request_dispatch[key]
            if fn is not None:
                self._sweep_features[key] = fn()
            else:
                raise KeyError("requested feature '{:s}' not defined".format(key))

        return self._sweep_features[key]

    def process_new_spike_feature(self, feature_name, feature_func):
        """Add new spike-level feature calculation function

           The function should take this sweep extractor as its argument. Its results
           can be accessed by calling the method spike_feature(<feature_name>).
        """

        if feature_name in self._spikes_df.columns:
            raise KeyError("Feature {:s} already exists for sweep".format(feature_name))

        features = feature_func(self)
        self._spikes_df[feature_name] = np.nan
        self._spikes_df.ix[: len(features) - 1, feature_name] = features

    def process_new_sweep_feature(self, feature_name, feature_func):
        """Add new sweep-level feature calculation function

           The function should take this sweep extractor as its argument. Its results
           can be accessed by calling the method sweep_feature(<feature_name>).
        """

        if feature_name in self._sweep_features:
            raise KeyError("Feature {:s} already exists for sweep".format(feature_name))

        self._sweep_features[feature_name] = feature_func(self)

    def set_stimulus_amplitude_calculator(self, function):
        self.stimulus_amplitude_calculator = function

    def sweep_feature_keys(self):
        """Get list of every available sweep-level feature."""
        return self._sweep_features.keys()

    def as_dict(self):
        """Create dict of features and spikes."""
        output_dict = self._sweep_features.copy()
        output_dict["spikes"] = self.spikes()
        if self.id is not None:
            output_dict["id"] = self.id
        return output_dict
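
A hedged usage sketch for the extractor above, assuming it is the AllenSDK class importable from allensdk.ephys.ephys_extractor (adjust the import if it lives elsewhere in your project). The flat synthetic trace simply yields zero spikes, which keeps the example deterministic:

import numpy as np
from allensdk.ephys.ephys_extractor import EphysSweepFeatureExtractor  # assumed import path

# synthetic 1 s sweep sampled at 50 kHz: a flat -70 mV trace, i.e. no spikes
t = np.arange(0, 1.0, 2e-5)
v = np.full_like(t, -70.0)

sweep = EphysSweepFeatureExtractor(t=t, v=v, start=0.1, end=0.9)
sweep.process_spikes()

print(sweep.sweep_feature("avg_rate"))    # 0 for a spike-free sweep
print(sweep.spikes())                     # [] -- per-spike feature records would appear here
print(sweep.sweep_feature("v_baseline"))  # mean voltage just before start, about -70.0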