Code example #1
File: frameparseval.py Project: chenguandan/semafor
 def __add__(self, that):
     # ensure all rows are present for both tables, filling in 0 if necessary
     # (otherwise the empty rows will be treated as if they contain NaN when adding)
     
     me = self._df
     you = that._df
     for row in me.index:
         if row not in that._df.index:
             you = you.append(DataFrame.from_items([(e, {row: '' if me[e][row]=='' else 0}) for e in PRCounter.COLUMNS]))
     for row in you.index:
         if row not in self._df.index:
             me = me.append(DataFrame.from_items([(e, {row: '' if you[e][row]=='' else 0}) for e in PRCounter.COLUMNS]))
     
     # add counts
     new_df = me + you
     
     # recompute ratios
     new_df['P'] = new_df['Numer'] / new_df['PDenom']
     new_df['R'] = new_df['Numer'] / new_df['RDenom']
     denom = (new_df['P'] + new_df['R'])
     new_df['F'] = 2 * new_df['P'] * new_df['R'] / denom[denom>0]
     new_df['Acc'] = new_df['T'] / new_df['N']
     result = PRCounter()
     result._df = new_df
     return result
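Note: the snippet above targets an older pandas API; DataFrame.from_items was removed in pandas 1.0 and DataFrame.append in pandas 2.0. A minimal sketch of the same row-padding step on current pandas, reusing the names from the snippet and ignoring its special-casing of empty-string cells, could look like this:

def pad_missing_rows(me, you):
    # reindex both frames to the union of their indices, filling new cells with 0,
    # so that adding them does not turn rows present in only one frame into NaN
    all_rows = me.index.union(you.index)
    return me.reindex(all_rows, fill_value=0), you.reindex(all_rows, fill_value=0)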
Code example #2
    def get_contributions(self):
        data = []
        columns = ['mp-id', 'contribution', 'efermi', 'ehull', 'bandgap']

        docs = self.query_contributions(criteria=self.tam_perovskites_query,
                                        projection={
                                            '_id': 1,
                                            'mp_cat_id': 1,
                                            'content.efermi': 1,
                                            'content.ehull': 1,
                                            'content.bandgap': 1
                                        })
        if not docs:
            raise Exception(
                'No contributions found for TamPerovskites Explorer!')

        for doc in docs:
            mpfile = MPFile.from_contribution(doc)
            mp_id = mpfile.ids[0]
            contrib = mpfile.hdata[mp_id]
            cid_url = '/'.join([
                self.preamble.rsplit('/', 1)[0], 'explorer', 'materials',
                doc['_id']
            ])
            row = [
                mp_id, cid_url, contrib['efermi'], contrib['ehull'],
                contrib['bandgap']
            ]
            data.append((mp_id, row))
        return DataFrame.from_items(data, orient='index', columns=columns)
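Note: DataFrame.from_items(data, orient='index', columns=columns) no longer exists in current pandas. Assuming the (mp_id, row) pairs built above have unique keys, a hedged equivalent of the return line is:

# dicts preserve insertion order on Python 3.7+, so the row order is kept
return DataFrame.from_dict(dict(data), orient='index', columns=columns)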
Code example #3
def TreeMatrix(D, desc, L, Env=None, DshapeLarge=True):
    """
    Applying tree information (desc,L) on a given count matrix (D) and columns grouping (Env) to obtain a matrix of count over the tree
    """
    #I assume that D has correct columns name
    if not DshapeLarge:
        Z = (D[["Sample", "Taxon"]]).values
        Z = MultiIndex.from_tuples(map(tuple, tuple(Z)),
                                   names=["Sample", "Taxon"])
        D.index = Z
        Dlarge = D.Count.unstack(level=0)
        Dlarge.fillna(value=0, inplace=True)
        #I assume that Environment has correct index and columns names
        ExperimentalDesignColumns = MultiIndex.from_tuples(
            map(tuple, tuple(Env.ix[Dlarge.columns].values)),
            names=["Sample", "Group"])
    else:
        # if D is already Large Environment information is already included
        Dlarge = D
        ExperimentalDesignColumns = Dlarge.columns
    #if taxon only present in tree but not in table, access mode .ix correctly report NA for that line, that later will be converted to zero.
    NodeTableLarge = [[x[0], Dlarge.ix[x[-1]].sum()] for x in desc]
    Dtree = DataFrame.from_items(NodeTableLarge).transpose()
    NodeAndLeafNamesIndex = MultiIndex.from_tuples(map(
        tuple, tuple(L.loc[:, ["Name", "Is_Leaf"]].ix[Dtree.index].values)),
                                                   names=["Name", "Is_Leaf"])
    Dtree.index = NodeAndLeafNamesIndex
    Dtree.columns = ExperimentalDesignColumns
    Dtree.columns = Dtree.columns.reorder_levels(["Group", "Sample"])
    return Dtree
Code example #4
    def _update_margins(self):
        for variable in self.margins_by_variable:
            survey_scenario = self.survey_scenario
            simulation = survey_scenario.simulation
            column_by_name = survey_scenario.tax_benefit_system.column_by_name

            assert variable in column_by_name
            column = survey_scenario.tax_benefit_system.column_by_name[variable]
            weight = self.weight
            filter_by = self.filter_by
            initial_weight = self.initial_weight

            value = simulation.calculate_add(variable)
            margin_items = [
                ('actual', weight[filter_by]),
                ('initial', initial_weight[filter_by]),
                ]

            if column.__class__ in [AgeCol, BoolCol, EnumCol]:
                margin_items.append(('category', value[filter_by]))
                # TODO: should not use DataFrame for that ...
                margins_data_frame = DataFrame.from_items(margin_items)
                margins_data_frame = margins_data_frame.groupby('category', sort = True).sum()
                margin_by_type = margins_data_frame.to_dict()
            else:
                margin_by_type = dict(
                    actual = (weight[filter_by] * value[filter_by]).sum(),
                    initial = (initial_weight[filter_by] * value[filter_by]).sum(),
                    )
            self.margins_by_variable[variable].update(margin_by_type)
Code example #5
def StormSums(Stormslist,Data,offset=0):
    eventlist = []
    index =[]
    for storm_index,storm in Stormslist.iterrows():
        #print storm
        start = storm['start']-timedelta(minutes=offset) ##if Storms are defined by stream response you have to grab the preceding precip data
        end= storm['end']
        data = True ## Innocent until proven guilty
        try:
            event = Data.ix[start:end] ### slice list of Data for event
        except KeyError:
            start = start+timedelta(minutes=15) ## if the start time falls between 2 30minute periods
        try:
            event = Data.ix[start:end]
        except KeyError:
            end = end+timedelta(minutes=15)
        try:
            event = Data.ix[start:end]
        except KeyError:
            print('no precip data available for storm')
            data = False
            pass
        if data != False:
            eventcount = event.count()
            eventsum = event.sum()
            eventmax = event.max()
            eventlist.append((storm['start'],[storm['start']-timedelta(minutes=offset),storm['end'],eventcount,eventsum,eventmax])) 
    Events=DataFrame.from_items(eventlist,orient='index',columns=['start','end','count','sum','max'])
    return Events
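Note: the .ix indexer used above was removed from pandas. Assuming Data is indexed by timestamps as in the snippet, a hedged equivalent of the event slice is:

event = Data.loc[start:end]  # label-based slice on the datetime index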
Code example #6
def TreeMatrix(D,desc,L, Env=None,DshapeLarge=True):
    """
    Applying tree information (desc,L) on a given count matrix (D) and columns grouping (Env) to obtain a matrix of count over the tree
    """
    #I assume that D has correct columns name
    if not DshapeLarge:
        Z=(D[["Sample","Taxon"]]).values
        Z=MultiIndex.from_tuples(map(tuple,tuple(Z)), names=["Sample","Taxon"])
        D.index=Z
        Dlarge=D.Count.unstack(level=0)
        Dlarge.fillna(value=0,inplace=True)
        #I assume that Environment has correct index and columns names
        ExperimentalDesignColumns=MultiIndex.from_tuples(
        map(tuple,tuple(Env.ix[Dlarge.columns].values))
        , names=["Sample","Group"])
    else:
        # if D is already Large Environment information is already included
        Dlarge=D
        ExperimentalDesignColumns=Dlarge.columns
    #if taxon only present in tree but not in table, access mode .ix correctly report NA for that line, that later will be converted to zero.
    NodeTableLarge=[[x[0],Dlarge.ix[x[-1]].sum()] for x in desc]
    Dtree=DataFrame.from_items(NodeTableLarge).transpose()
    NodeAndLeafNamesIndex=MultiIndex.from_tuples(
        map(tuple,tuple(L.loc[:,["Name","Is_Leaf"]].ix[Dtree.index].values))
        , names=["Name","Is_Leaf"])
    Dtree.index=NodeAndLeafNamesIndex
    Dtree.columns=ExperimentalDesignColumns
    Dtree.columns=Dtree.columns.reorder_levels(["Group", "Sample"])
    return Dtree
Code example #7
    def update_margins(self):
        for variable in self.margins_by_name:
            survey_scenario = self.survey_scenario
            simulation = survey_scenario.simulation
            column_by_name = survey_scenario.tax_benefit_system.column_by_name

            assert variable in column_by_name
            column = survey_scenario.tax_benefit_system.column_by_name[
                variable]
            weight = self.weight
            filter_by = self.filter_by
            initial_weight = self.initial_weight

            value = simulation.calculate(variable)
            margin_items = [
                ('actual', weight[filter_by]),
                ('initial', initial_weight[filter_by]),
            ]

            if column.__class__ in [AgeCol, BoolCol, EnumCol]:
                margin_items.append(('category', value[filter_by]))
                margins_data_frame = DataFrame.from_items(margin_items)
                margins_data_frame = margins_data_frame.groupby(
                    'category', sort=True).sum()
                margin_by_type = margins_data_frame.to_dict()
            else:
                margin_by_type = dict(
                    actual=(weight[filter_by] * value[filter_by]).sum(),
                    initial=(initial_weight[filter_by] *
                             value[filter_by]).sum(),
                )
            self.margins_by_name[variable].update(margin_by_type)

            if self.total_population is not None:
                target = self.margins_by_name[variable].get('target', False)
Code example #8
File: common.py Project: flamingbear/pandas
 def test_scientific_no_exponent(self):
     # see gh-12215
     df = DataFrame.from_items([("w", ["2e"]), ("x", ["3E"]), ("y", ["42e"]), ("z", ["632E"])])
     data = df.to_csv(index=False)
     for prec in self.float_precision_choices:
         df_roundtrip = self.read_csv(StringIO(data), float_precision=prec)
         tm.assert_frame_equal(df_roundtrip, df)
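Note: with DataFrame.from_items gone from current pandas, a frame like the one in this test is normally built straight from a dict, which preserves column order on Python 3.7+:

df = DataFrame({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]})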
Code example #9
File: test_reshape.py Project: vcuplov/pandas
    def test_get_dummies_dont_sparsify_all_columns(self, sparse):
        # GH18914
        df = DataFrame.from_items([('GDP', [1, 2]), ('Nation', ['AB', 'CD'])])
        df = get_dummies(df, columns=['Nation'], sparse=sparse)
        df2 = df.reindex(columns=['GDP'])

        tm.assert_frame_equal(df[['GDP']], df2)
Code example #10
    def get_contributions(self, phase=None):
        data = []
        phase_query_key = {'$exists': 1} if phase is None else phase
        columns = ['mp-id', 'contribution', 'formula']
        if phase is None:
            columns.append('phase')
        columns += ['dH (formation)', 'dH (hydration)', 'GS?', 'CIF']

        for doc in self.query_contributions(criteria={
                'project': {
                    '$in': ['LBNL', 'MIT']
                },
                'content.info.Phase':
                phase_query_key
        },
                                            projection={
                                                '_id': 1,
                                                'mp_cat_id': 1,
                                                'content': 1
                                            }):
            mpfile = MPFile.from_contribution(doc)
            mp_id = mpfile.ids[0]
            info = mpfile.hdata[mp_id]['info']
            row = [mp_id, get_short_object_id(doc['_id']), info['Formula']]
            if phase is None:
                row.append(info['Phase'])
            row += [info['dHf'], info['dHh'], info['GS'], 'TODO']
            # TODO URLs for mp_id and cid
            data.append((mp_id, row))

        return DataFrame.from_items(data, orient='index', columns=columns)
Code example #11
File: frameparseval.py Project: chenguandan/semafor
 def __setitem__(self, k, v):
     if isinstance(v[0], int):
         N, gold_set, pred_set = v
         if gold_set or pred_set:
             assert N>0,(N,gold_set,pred_set)
     else:
         gold_set, pred_set = v
         N = ''
     entry = {
         'Numer': len(gold_set & pred_set),
         'PDenom': len(pred_set),
         'RDenom': len(gold_set),
         'N': N
     }
     entry['P'] = entry['Numer'] / entry['PDenom'] if entry['PDenom'] else float('nan')
     entry['R'] = entry['Numer'] / entry['RDenom'] if entry['RDenom'] else float('nan')
     entry['F'] = 2 * entry['P'] * entry['R'] / (entry['P'] + entry['R']) if (entry['P'] + entry['R']) else float('nan')
     if N=='':
         entry['T'] = None
         entry['Acc'] = None
     else:
         if len(gold_set)==len(pred_set)==N:
             entry['T'] = entry['Numer']
         else:
             tp = entry['Numer']
             fp = len(pred_set-gold_set)
             fn = len(gold_set-pred_set)
             entry['T'] = N-fp-fn
         assert entry['T']>=0,(entry,gold_set,pred_set)
         entry['Acc'] = float('nan') if N==0 else entry['T'] / N
     df = DataFrame.from_items([(e, {k: entry[e]}) for e in PRCounter.COLUMNS])
     self._df = self._df.append(df)
Code example #12
def into(a, b, columns=None, schema=None, **kwargs):
    if not columns and schema:
        columns = dshape(schema)[0].names
    return DataFrame.from_items(((column, b[column][:]) for column in
                                    sorted(b.names)),
                                orient='columns',
                                columns=columns)
Code example #13
File: bcolz.py Project: leolujuyi/blaze
def into(a, b, columns=None, schema=None, **kwargs):
    if not columns and schema:
        columns = dshape(schema)[0].names
    return DataFrame.from_items(((column, b[column][:]) for column in
                                    sorted(b.names)),
                                orient='columns',
                                columns=columns)
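Note: a hedged sketch of the same construction without from_items, assuming b behaves like the bcolz table in the snippets above (column access by name plus a .names attribute):

data = {column: b[column][:] for column in sorted(b.names)}
df = DataFrame(data)
if columns is not None:
    df = df[list(columns)]  # select/reorder columns when an explicit order was requested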
Code example #14
File: runtest.py Project: ATom93/phyloH-1
def Report(FullCounts, t, otu, db):
    print "Unequal Effort"
    Halpha, Hgamma, Hbeta, HE, tots = UltraTreeTest(FullCounts, t, otu)
    H = db.GetEntropiesPandas(q="1", Pairwise=1, EqualEffort=0)
    subFull = FullCounts.iloc[:, [0, 1]]
    subFull.columns.set_labels([0, 1], level=0, inplace=True)
    Halpha_k, Hgamma_k, Hbeta_k, HE_k, tots_k = UltraTreeTest(subFull, t, otu)
    result = DataFrame.from_items(
        [["Hgamma", [Hgamma, H["Hgamma"]]],
         ["Halpha", [Halpha, H["HalphaByEnvironment"]]],
         ["Hbeta", [Hbeta, H["MI_treeAndEnvironment"]]],
         ["DistTurnover", [Hbeta / HE, H["DistTurnover"].iloc[1, 0]]],
         [
             "DistTurnoverbySample",
             [Hbeta_k / HE_k, H["DistTurnoverBySample"].iloc[1, 0]]
         ]],
        columns=["Test", "RegularRoutine"],
        orient="index")
    result["Dif"] = result.Test - result.RegularRoutine
    print(result)

    print "Equal Effort"
    H = db.GetEntropiesPandas(q="1", Pairwise=1, EqualEffort=1)
    #Halpha,Hgamma,Hbeta,HE=UltraTreeTest(countsA,countsB,[0.5,0.5])
    Halpha, Hgamma, Hbeta, HE, tots = UltraTreeTest(FullCounts,
                                                    t,
                                                    otu,
                                                    Equal=True)
    Halpha_k, Hgamma_k, Hbeta_k, HE_k, tots_k = UltraTreeTest(subFull,
                                                              t,
                                                              otu,
                                                              Equal=True)
    result = DataFrame.from_items(
        [["Hgamma", [Hgamma, H["Hgamma"]]],
         ["Halpha", [Halpha, H["HalphaByEnvironment"]]],
         ["Hbeta", [Hbeta, H["MI_treeAndEnvironment"]]],
         ["DistTurnover", [Hbeta / HE, H["DistTurnover"].iloc[1, 0]]],
         [
             "DistTurnoverbySample",
             [Hbeta_k / HE_k, H["DistTurnoverBySample"].iloc[1, 0]]
         ]],
        columns=["Test", "RegularRoutine"],
        orient="index")
    result["Dif"] = result.Test - result.RegularRoutine
    print(result)
    return None
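Note: MultiIndex.set_labels, called on subFull.columns above, was renamed to set_codes in later pandas releases and its inplace flag was eventually dropped, so that line would now read roughly:

subFull.columns = subFull.columns.set_codes([0, 1], level=0)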
Code example #15
File: common.py Project: luwenlong123/pandas
 def test_scientific_no_exponent(self):
     # see gh-12215
     df = DataFrame.from_items([('w', ['2e']), ('x', ['3E']),
                                ('y', ['42e']), ('z', ['632E'])])
     data = df.to_csv(index=False)
     for prec in self.float_precision_choices:
         df_roundtrip = self.read_csv(StringIO(data), float_precision=prec)
         tm.assert_frame_equal(df_roundtrip, df)
Code example #16
	def setUp(self):
		self.season_period = 2
		self.values = range(1, self.season_period * 2+1)
		self.dataframe = DataFrame.from_items([('values', self.values)])
		self.model = HoltWinters(self.dataframe,
		                         season_period=self.season_period)
		self.model._init_starting_arrays()
		self.model.coefs = [0.5, 0.5, 0.5]
Code example #17
File: ka_bnet_pandas.py Project: irr/python-labs
def simulate(adj, theta, num_samples):
    data = DataFrame.from_items( [(node, Series(np.zeros(num_samples, int))) for node in adj.columns] )
    for node in adj.columns:
        P = parents(node, adj)
        for n in range(num_samples):
            key = ','.join( [str(data.ix[n,parent]) for parent in P] )
            pdt = theta[node][key]
            data.ix[n,node] = draw(pdt)
    return data
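Note: with the .ix indexer removed from pandas, the per-cell reads and writes in simulate() would typically go through .at on the default integer index. A hedged sketch of the two affected lines:

key = ','.join(str(data.at[n, parent]) for parent in P)  # scalar read by row label and column name
data.at[n, node] = draw(pdt)  # scalar write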
Code example #18
File: common.py Project: aFraley/pandas
 def test_scientific_no_exponent(self):
     # see gh-12215
     df = DataFrame.from_items([('w', ['2e']), ('x', ['3E']),
                                ('y', ['42e']), ('z', ['632E'])])
     data = df.to_csv(index=False)
     for prec in self.float_precision_choices:
         df_roundtrip = self.read_csv(
             StringIO(data), float_precision=prec)
         tm.assert_frame_equal(df_roundtrip, df)
Code example #19
File: gfa.py Project: nvrivera/genome_runner
def fetch_genes(taxon_id):
    c.execute("""
    SELECT id, symbol, name 
    FROM gene 
    WHERE taxon_id=%s 
    ORDER BY id""", (taxon_id,))
    return DataFrame.from_items([(row[0], row) for row in c], 
                                columns=["id", "symbol", "name"],
                                orient="index")
Code example #20
File: test_excel.py Project: thorwhalen/pandas
    def test_reader_seconds(self):
        # Test reading times with and without milliseconds. GH5945.
        _skip_if_no_xlrd()
        import xlrd

        if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"):
            # Xlrd >= 0.9.3 can handle Excel milliseconds.
            expected = DataFrame.from_items([("Time", [
                time(1, 2, 3),
                time(2, 45, 56, 100000),
                time(4, 29, 49, 200000),
                time(6, 13, 42, 300000),
                time(7, 57, 35, 400000),
                time(9, 41, 28, 500000),
                time(11, 25, 21, 600000),
                time(13, 9, 14, 700000),
                time(14, 53, 7, 800000),
                time(16, 37, 0, 900000),
                time(18, 20, 54)
            ])])
        else:
            # Xlrd < 0.9.3 rounds Excel milliseconds.
            expected = DataFrame.from_items([("Time", [
                time(1, 2, 3),
                time(2, 45, 56),
                time(4, 29, 49),
                time(6, 13, 42),
                time(7, 57, 35),
                time(9, 41, 29),
                time(11, 25, 22),
                time(13, 9, 15),
                time(14, 53, 8),
                time(16, 37, 1),
                time(18, 20, 54)
            ])])

        epoch_1900 = os.path.join(self.dirpath, 'times_1900.xls')
        epoch_1904 = os.path.join(self.dirpath, 'times_1904.xls')

        actual = read_excel(epoch_1900, 'Sheet1')
        tm.assert_frame_equal(actual, expected)

        actual = read_excel(epoch_1904, 'Sheet1')
        tm.assert_frame_equal(actual, expected)
Code example #21
File: CSACS_v4.0.py Project: Space-at-VT/CSACS
def save_data(): 
    ts = time.time()
    base_dir = filedialog.askdirectory()
    filename_time = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d-%H%M')
    filename_base = os.path.join(base_dir, filename_time)
    filename = '%s.csv' % filename_base
    print("Saving Data...")
    df1 = DataFrame.from_items([('Pitch',pdx), ('Roll',pdy),('Yaw',pdz)]) 
    df1.stack(level=0, dropna = False) # stack pivots the columns into rows; the result is not assigned here
    df1.to_csv(filename) # outputs to csv file
Code example #22
	def setUp(self):
		self.season_period = 2
		self.values = [np.array([[i+1], [i]]) for i in
							range(1, self.season_period * 2+1)]
		self.dataframe = DataFrame.from_items([('values', self.values)])
		self.hwi = HoltWintersI(self.dataframe,
		                        season_period=self.season_period)
		self.hwi._init_starting_arrays()
		self.coefs = [0.5] * 12
		self.A, self.B, self.G = flats_to_matrix(self.coefs)
Code example #23
 def setUp(self):
     self.periods = 2
     self.values = [
         np.array([[i + 1], [i]]) for i in range(1, self.periods + 1)
     ]
     self.dataframe = DataFrame.from_items([('values', self.values)])
     self.model = HoltI(self.dataframe)
     self.coefs = [0.5] * 8
     self.A, self.B = flats_to_matrix(self.coefs)
     self.model._init_starting_arrays()
Code example #24
def simulate(adj, theta, num_samples):
    data = DataFrame.from_items([(node, Series(np.zeros(num_samples, int)))
                                 for node in adj.columns])
    for node in adj.columns:
        P = parents(node, adj)
        for n in range(num_samples):
            key = ','.join([str(data.ix[n, parent]) for parent in P])
            pdt = theta[node][key]
            data.ix[n, node] = draw(pdt)
    return data
Code example #25
    def calibrate(self):
        """
        Calibrate according to margins found in frame
        """

        df = self.frame
        inputs = self.simulation.survey
        output_table = self.simulation.output_table
        margins = {}

        if df is not None:
            df = df.reset_index(drop=True)
            df.set_index(['var','mod'], inplace = True)
            for var, mod in df.index:
                # Dealing with non categorical vars ...
                if df.get_value((var,mod), u"modalités") == 'total':
                    margins[var] =  df.get_value((var,mod), 'cible')
                #  ... and categorical vars
                else:
                    if not margins.has_key(var):
                        margins[var] = {}
                    margins[var][mod] =  df.get_value((var,mod), 'cible')

        param = self.get_param()

        if self.totalpop is not None:
            margins['totalpop'] = self.totalpop
        adjusted_margins = self.update_weights(margins, param=param)

        if 'totalpop' in margins.keys():
            del margins['totalpop']

        w = self.weights
        for var in margins.keys():
            if var in inputs.column_by_name:
                value = inputs.get_value(var, self.entity)
            else:
                entity = self.entity
                enum = output_table._inputs.column_by_name.get('qui'+self.entity).enum
                people = [x[1] for x in enum]
                value = output_table.get_value(var, entity=entity, opt=people, sum_=True)

            if isinstance(margins[var], dict):
                items = [('marge', w  ),('mod', value)]
                updated_margins = DataFrame.from_items(items).groupby('mod', sort= True).sum()
                for mod in margins[var].keys():
                    df.set_value((var,mod), u"cible ajustée", adjusted_margins[var][mod])
                    df.set_value((var,mod), u"marge", updated_margins['marge'][mod])
            else:
                updated_margin = (w*value).sum()
                df.set_value((var,0), u"cible ajustée", adjusted_margins[var])
                df.set_value((var,0), u"marge", updated_margin)

        if self.frame is not None:
            self.frame = df.reset_index()
Code example #27
    def test_reader_special_dtypes(self):
        _skip_if_no_xlrd()

        expected = DataFrame.from_items([
            ("IntCol", [1, 2, -3, 4, 0]),
            ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]),
            ("BoolCol", [True, False, True, True, False]),
            ("StrCol", [1, 2, 3, 4, 5]),
            # GH5394 - this is why convert_float isn't vectorized
            ("Str2Col", ["a", 3, "c", "d", "e"]),
            ("DateCol", [
                datetime(2013, 10, 30),
                datetime(2013, 10, 31),
                datetime(1905, 1, 1),
                datetime(2013, 12, 14),
                datetime(2015, 3, 14)
            ])
        ])

        xlsx_path = os.path.join(self.dirpath, 'test_types.xlsx')
        xls_path = os.path.join(self.dirpath, 'test_types.xls')

        # should read in correctly and infer types
        for path in (xls_path, xlsx_path):
            actual = read_excel(path, 'Sheet1')
            tm.assert_frame_equal(actual, expected)

        # if not coercing number, then int comes in as float
        float_expected = expected.copy()
        float_expected["IntCol"] = float_expected["IntCol"].astype(float)
        float_expected.loc[1, "Str2Col"] = 3.0
        for path in (xls_path, xlsx_path):
            actual = read_excel(path, 'Sheet1', convert_float=False)
            tm.assert_frame_equal(actual, float_expected)

        # check setting Index (assuming xls and xlsx are the same here)
        for icol, name in enumerate(expected.columns):
            actual = read_excel(xlsx_path, 'Sheet1', index_col=icol)
            actual2 = read_excel(xlsx_path, 'Sheet1', index_col=name)
            exp = expected.set_index(name)
            tm.assert_frame_equal(actual, exp)
            tm.assert_frame_equal(actual2, exp)

        # convert_float and converters should be different but both accepted
        expected["StrCol"] = expected["StrCol"].apply(str)
        actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str})
        tm.assert_frame_equal(actual, expected)

        no_convert_float = float_expected.copy()
        no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str)
        actual = read_excel(xlsx_path,
                            'Sheet1',
                            converters={"StrCol": str},
                            convert_float=False)
        tm.assert_frame_equal(actual, no_convert_float)
Code example #28
File: test_excel.py Project: 5i7788/pandas
    def test_reader_seconds(self):
        # Test reading times with and without milliseconds. GH5945.
        _skip_if_no_xlrd()
        import xlrd

        if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"):
            # Xlrd >= 0.9.3 can handle Excel milliseconds.
            expected = DataFrame.from_items([("Time",
                                              [time(1, 2, 3),
                                               time(2, 45, 56, 100000),
                                               time(4, 29, 49, 200000),
                                               time(6, 13, 42, 300000),
                                               time(7, 57, 35, 400000),
                                               time(9, 41, 28, 500000),
                                               time(11, 25, 21, 600000),
                                               time(13, 9, 14, 700000),
                                               time(14, 53, 7, 800000),
                                               time(16, 37, 0, 900000),
                                               time(18, 20, 54)])])
        else:
            # Xlrd < 0.9.3 rounds Excel milliseconds.
            expected = DataFrame.from_items([("Time",
                                              [time(1, 2, 3),
                                               time(2, 45, 56),
                                               time(4, 29, 49),
                                               time(6, 13, 42),
                                               time(7, 57, 35),
                                               time(9, 41, 29),
                                               time(11, 25, 22),
                                               time(13, 9, 15),
                                               time(14, 53, 8),
                                               time(16, 37, 1),
                                               time(18, 20, 54)])])

        epoch_1900 = os.path.join(self.dirpath, 'times_1900.xls')
        epoch_1904 = os.path.join(self.dirpath, 'times_1904.xls')

        actual = read_excel(epoch_1900, 'Sheet1')
        tm.assert_frame_equal(actual, expected)

        actual = read_excel(epoch_1904, 'Sheet1')
        tm.assert_frame_equal(actual, expected)
Code example #29
def save_data():
    ts = time.time()
    base_dir = filedialog.askdirectory()
    filename_time = datetime.datetime.fromtimestamp(ts).strftime(
        '%Y-%m-%d-%H%M')
    filename_base = os.path.join(base_dir, filename_time)
    filename = '%s.csv' % filename_base
    print("Saving Data...")
    df1 = DataFrame.from_items([('Pitch', pdx), ('Roll', pdy), ('Yaw', pdz)])
    df1.stack(level=0, dropna=False)  # stack pivots the columns into rows; the result is not assigned here
    df1.to_csv(filename)  # outputs to csv file
Code example #30
def main():
    conf = SparkConf()
    # conf.set("spark.executor.memory","3g")
    # conf.set("spark.speculation", "True")
    sc = SparkContext(conf=conf)
    v = sc.textFile('hdfs:///data/ad_f.csv') \
        .map(lambda line: line.split(",")) \
        .filter(lambda line: line[5] != 'NULL') \
        .map(lambda line: (int(line[0]), line[1], [line[3], line[4], line[5]]))  # .collect()
    vl = v.map(lambda i: DenseVector(i[2]))
    index_vl = v.map(lambda i: (i[0], DenseVector(i[2])))
    index_vl.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
    max_iterations = 33
    k = 10
    clusters = [
    ]  # contains the dense vectors chosen to be centroids
    met = np.array([[1.39120240528e-06, -7.11964361751e-08, 1.68554275438e-07],
                    [-7.1196436173e-08, 4.18367212413e-06, -2.45888145316e-07],
                    [1.6855427544e-07, -2.45888145311e-07, 1.43614586304e-06]])
    row_num = vl.count()
    b_met = sc.broadcast(met)  # broadcast met,access value by b_met.value
    b_row = sc.broadcast(row_num)  # broadcast row numbers
    populate_initial_centers()
    # compute_distances()
    d = compute_distances()
    a = get_clusters(d)
    counter = 0
    while True:
        counter += 1
        previous_clusters = clusters
        print("counter: ", counter)
        print("Previous ", previous_clusters)
        clusters = compute_new_centers(a)
        print(clusters)
        d = compute_distances()
        a = get_clusters(d)
        if counter >= max_iterations:
            break

    sse = d.map(lambda x: x[1]**2).sum()

    x = a.map(lambda x: (x[0][0], float(x[0][1])))
    p = v.join(x)
    q = p.map(lambda x: x[1])
    sqlContext = SQLContext(sc)
    df = sqlContext.createDataFrame(q, ['ad_id', 'cluster'])
    df.repartition(1).write.csv(path="/data/op_clu", header=True)
    # c = sc.parallelize(clusters).map(lambda x: str(x)[1:-1])
    clusters.append(np.array([sse, 0, 0]))
    t = zip(range(k + 1), clusters)
    cen = DataFrame.from_items(t, range(k), 'index')
    df_cen = sqlContext.createDataFrame(cen,
                                        ['campaign_id', 'customer', 'brand'])
    df_cen.repartition(1).write.csv(path="/data/op_cen", header=True)
Code example #31
File: frameparseval.py Project: Noahs-ARK/semafor
 def __add__(self, that):
     # ensure all rows are present for both tables, filling in 0 if necessary
     # (otherwise the empty rows will be treated as if they contain NaN when adding)
     
     me = self._df
     you = that._df
     for row in me.index:
         if row not in that._df.index:
             you = you.append(DataFrame.from_items([(e, {row: '' if me[e][row]=='' else 0}) for e in PRCounter.COLUMNS]))
     for row in you.index:
         if row not in self._df.index:
             me = me.append(DataFrame.from_items([(e, {row: '' if you[e][row]=='' else 0}) for e in PRCounter.COLUMNS]))
     
     # add counts
     new_df = me + you
     
     result = PRCounter()
     result._df = new_df
     if self.COMPUTE_RATIOS_ON_ADD: # recompute ratios
         self.compute_ratios()
     return result
Code example #32
    def calibrate_old(self):
        """
        Calibrate according to margins found in frame
        """

        df = self.frame
        margins = {}

        if df is not None:
            df.reset_index(drop=True, inplace=True)
            df.set_index(['var', 'mod'], inplace=True)
            for var, mod in df.index:
                # Dealing with non categorical vars ...
                if df.get_value((var, mod), u"modalités") == 'total':
                    margins[var] = df.get_value((var, mod), 'cible')
                #  ... and categorical vars
                else:
                    if var not in margins:
                        margins[var] = {}
                    margins[var][mod] = df.get_value((var, mod), 'cible')

        parameters = self.get_parameters()

        if self.total_population is not None:
            margins['total_population'] = self.total_population

        adjusted_margins = self.update_weights(margins, parameters=parameters)

        if 'total_population' in margins.keys():
            del margins['total_population']

        w = self.weight
        for var in margins.keys():
            if var in self.survey_scenario.tax_benefit_system.column_by_name:
                value = self.survey_scenario.simulation.calculate(
                    var)  # TODO sum over menage

            if isinstance(margins[var], dict):
                items = [('marge', w), ('mod', value)]
                updated_margins = DataFrame.from_items(items).groupby(
                    'mod', sort=True).sum()
                for mod in margins[var].keys():
                    df.set_value((var, mod), u"cible ajustée",
                                 adjusted_margins[var][mod])
                    df.set_value((var, mod), u"marge",
                                 updated_margins['marge'][mod])
            else:
                updated_margin = (w * value).sum()
                df.set_value((var, 0), u"cible ajustée", adjusted_margins[var])
                df.set_value((var, 0), u"marge", updated_margin)

        if self.frame is not None:
            self.frame = df.reset_index()
Code example #33
    def compute_aggregates(self, filter_by=None):
        """
        Compute aggregate amounts
        """
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        V = []
        M = {'data': [], 'default': []}
        B = {'data': [], 'default': []}
        U = []

        M_label = {
            'data': self.labels['dep'],
            'default': self.labels['dep_default']
        }
        B_label = {
            'data': self.labels['benef'],
            'default': self.labels['benef_default']
        }

        for var in self.varlist:
            # amounts and beneficiaries from current data and default data if exists
            montant_benef = self.get_aggregate(var, filter_by)
            V.append(column_by_name[var].label)
            entity = column_by_name[var].entity_key_plural

            U.append(entity)
            for dataname in montant_benef:
                M[dataname].append(montant_benef[dataname][0])
                B[dataname].append(montant_benef[dataname][1])

        # build items list
        items = [(self.labels['var'], V)]

        for dataname in M:
            if M[dataname]:
                items.append((M_label[dataname], M[dataname]))
                items.append((B_label[dataname], B[dataname]))

        items.append((self.labels['entity'], U))
        aggr_frame = DataFrame.from_items(items)

        self.aggr_frame = None
        for code, label in self.labels.iteritems():
            try:
                col = aggr_frame[label]
                if self.aggr_frame is None:
                    self.aggr_frame = DataFrame(col)
                else:
                    self.aggr_frame = self.aggr_frame.join(col, how="outer")
            except:
                pass
Code example #34
File: test_other.py Project: zheewang/pandas
def test_agg_period_index():
    prng = period_range('2012-1-1', freq='M', periods=3)
    df = DataFrame(np.random.randn(3, 2), index=prng)
    rs = df.groupby(level=0).sum()
    assert isinstance(rs.index, PeriodIndex)

    # GH 3579
    index = period_range(start='1999-01', periods=5, freq='M')
    s1 = Series(np.random.rand(len(index)), index=index)
    s2 = Series(np.random.rand(len(index)), index=index)
    series = [('s1', s1), ('s2', s2)]
    df = DataFrame.from_items(series)
    grouped = df.groupby(df.index.month)
    list(grouped)
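Note: the list of ('s1', s1) pairs above only exists to feed from_items; on current pandas the same frame is built from a dict of Series, which aligns them on the shared PeriodIndex:

df = DataFrame({'s1': s1, 's2': s2})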
Code example #35
File: test_excel.py Project: 5i7788/pandas
    def test_reader_special_dtypes(self):
        _skip_if_no_xlrd()

        expected = DataFrame.from_items([
            ("IntCol", [1, 2, -3, 4, 0]),
            ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]),
            ("BoolCol", [True, False, True, True, False]),
            ("StrCol", [1, 2, 3, 4, 5]),
            # GH5394 - this is why convert_float isn't vectorized
            ("Str2Col", ["a", 3, "c", "d", "e"]),
            ("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31),
                         datetime(1905, 1, 1), datetime(2013, 12, 14),
                         datetime(2015, 3, 14)])
        ])

        xlsx_path = os.path.join(self.dirpath, 'test_types.xlsx')
        xls_path = os.path.join(self.dirpath, 'test_types.xls')

        # should read in correctly and infer types
        for path in (xls_path, xlsx_path):
            actual = read_excel(path, 'Sheet1')
            tm.assert_frame_equal(actual, expected)

        # if not coercing number, then int comes in as float
        float_expected = expected.copy()
        float_expected["IntCol"] = float_expected["IntCol"].astype(float)
        float_expected.loc[1, "Str2Col"] = 3.0
        for path in (xls_path, xlsx_path):
            actual = read_excel(path, 'Sheet1', convert_float=False)
            tm.assert_frame_equal(actual, float_expected)

        # check setting Index (assuming xls and xlsx are the same here)
        for icol, name in enumerate(expected.columns):
            actual = read_excel(xlsx_path, 'Sheet1', index_col=icol)
            actual2 = read_excel(xlsx_path, 'Sheet1', index_col=name)
            exp = expected.set_index(name)
            tm.assert_frame_equal(actual, exp)
            tm.assert_frame_equal(actual2, exp)

        # convert_float and converters should be different but both accepted
        expected["StrCol"] = expected["StrCol"].apply(str)
        actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str})
        tm.assert_frame_equal(actual, expected)

        no_convert_float = float_expected.copy()
        no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str)
        actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str},
                           convert_float=False)
        tm.assert_frame_equal(actual, no_convert_float)
Code example #36
def task_cohen(dataOne, dataTwo):

    labelOne = 'column1'
    labelTwo = 'column2'

    # Create input
    df = DataFrame.from_items([(labelOne, dataOne), (labelTwo, dataTwo)])

    meanx = mean(df[labelOne])
    meany = mean(df[labelTwo])
    sdx = std(df[labelOne])
    sdy = std(df[labelTwo])
    s = sqrt((sdx**2 + sdy**2) / 2)
    d = round(fabs(meanx - meany) / s, 4)

    return d
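For reference, the statistic computed above is Cohen's d with the two standard deviations pooled with equal weight:

d = |mean(x) - mean(y)| / sqrt((sd_x**2 + sd_y**2) / 2)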
Code example #37
File: utils.py Project: hugadams/npsurfacecounter
def to_dataframe(iterable, *attrfields):
    ''' If user specifies fields, only those fields, in that order, will be cast into a data frame.  Otherwise,
    fields are taken from first element in iterable.  Fields must be a list of strings.'''
    if attrfields:
        columns=attrfields #Empty dataframe of fixed column/row size

    ### The autoassign below only works for namedtuples (uses _fields attribute) and takes from the iterable[0] entry ###
    else:
        try:
            columns=iterable[0]._fields  #FOR NOW ONLY WORKS FOR NAMETUPLE attribute
        except AttributeError:
            raise AttributeError('to_dataframe requires a list of attributes')
        else:
            fget=attrgetter(*columns)
            items=[(idx, fget(v)) for idx, v in enumerate(iterable)]  #Key value pairs, key=index position, value =array of results
            return DataFrame.from_items(items, columns, orient='index') #Orient lets it know keys are for row indexing not 
Code example #38
def task_cohen(dataOne, dataTwo):

    labelOne = 'column1'
    labelTwo = 'column2'

    # Create input
    df = DataFrame.from_items([(labelOne, dataOne), (labelTwo, dataTwo)])

    meanx = mean(df[labelOne])
    meany = mean(df[labelTwo])
    sdx = std(df[labelOne])
    sdy = std(df[labelTwo])
    s = sqrt((sdx**2 + sdy**2)/2)
    d = round(fabs(meanx-meany)/s, 4)

    return d
Code example #39
    def compute_aggregates(self, filter_by = None):
        """
        Compute aggregate amounts
        """
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        V = []
        M = {'data': [], 'default': []}
        B = {'data': [], 'default': []}
        U = []

        M_label = {'data': self.labels['dep'],
                   'default': self.labels['dep_default']}
        B_label = {'data': self.labels['benef'],
                   'default': self.labels['benef_default']}

        for var in self.varlist:
            # amounts and beneficiaries from current data and default data if exists
            montant_benef = self.get_aggregate(var, filter_by)
            V.append(column_by_name[var].label)
            entity = column_by_name[var].entity_key_plural

            U.append(entity)
            for dataname in montant_benef:
                M[dataname].append(montant_benef[dataname][0])
                B[dataname].append(montant_benef[dataname][1])

        # build items list
        items = [(self.labels['var'], V)]

        for dataname in M:
            if M[dataname]:
                items.append((M_label[dataname], M[dataname]))
                items.append((B_label[dataname], B[dataname]))

        items.append((self.labels['entity'], U))
        aggr_frame = DataFrame.from_items(items)

        self.aggr_frame = None
        for code, label in self.labels.iteritems():
            try:
                col = aggr_frame[label]
                if self.aggr_frame is None:
                    self.aggr_frame = DataFrame(col)
                else:
                    self.aggr_frame = self.aggr_frame.join(col, how="outer")
            except:
                pass
Code example #40
    def update_output(self, output_data, description = None):
        QApplication.setOverrideCursor(QCursor(Qt.WaitCursor))

        if output_data is None:
            return
        self.set_data(output_data)        
        
        if description is not None:  
            self.set_distribution_choices(description)
            
        if not hasattr(self, 'distribution_by_var'):
            self.distribution_by_var = 'typmen15'
        
        by_var = self.distribution_by_var
        

        V = []
        M = []
        B = []
        for var in self.varlist:
            montant, benef = self.get_aggregate(var)
            V.append(var)
            M.append(montant)
            B.append(benef)
        
        items = [(u'Mesure', V), 
                 (u"Dépense\n(millions d'€)", M), 
                 (u"Bénéficiaires\n(milliers de ménages)", B)]
        aggr_frame = DataFrame.from_items(items)
        self.aggregate_view.set_dataframe(aggr_frame)

        dist_frame = self.group_by(['revdisp', 'nivvie'], by_var)
        by_var_label = self.var2label[by_var]
        dist_frame.insert(0,by_var_label,u"") 
        enum = self.var2enum[by_var]
        dist_frame[by_var_label] = dist_frame[by_var].apply(lambda x: enum._vars[x])
        
        dist_frame.pop(by_var)
                
        self.distribution_view.set_dataframe(dist_frame)
        self.distribution_view.reset()
        self.calculated()
        QApplication.restoreOverrideCursor()
Code example #41
File: frameparseval.py Project: Noahs-ARK/semafor
 def __setitem__(self, k, v):
     points = {}
     if isinstance(v[0], int):
         N, gold_set, pred_set = v
         if gold_set or pred_set:
             assert N>0,(N,gold_set,pred_set)
     else:
         N = ''
         gold, pred = v
         pred_set = set(pred.keys()) if isinstance(pred, dict) else pred
         gold_set = set(gold.keys()) if isinstance(gold, dict) else gold
         if isinstance(gold, dict):
             points.update(gold)
             if isinstance(pred, dict):
                 for elt in gold_set & pred_set:
                     assert gold[elt]==pred[elt],(elt,gold[elt],pred[elt])
         if isinstance(pred, dict):
             points.update(pred)
         
     entry = {
         'Numer': sum(points.get(elt,1) for elt in gold_set & pred_set),
         'PDenom': sum(points.get(elt,1) for elt in pred_set),
         'RDenom': sum(points.get(elt,1) for elt in gold_set),
         'N': N
     }
     entry['P'] = entry['Numer'] / entry['PDenom'] if entry['PDenom'] else float('nan')
     entry['R'] = entry['Numer'] / entry['RDenom'] if entry['RDenom'] else float('nan')
     entry['F'] = 2 * entry['P'] * entry['R'] / (entry['P'] + entry['R']) if (entry['P'] + entry['R']) else float('nan')
     if N=='':
         entry['T'] = None
         entry['Acc'] = None
     else:
         if len(gold_set)==len(pred_set)==N:
             entry['T'] = entry['Numer']
         else:
             tp = entry['Numer']
             fp = len(pred_set-gold_set)
             fn = len(gold_set-pred_set)
             entry['T'] = N-fp-fn
         assert entry['T']>=0,(entry,gold_set,pred_set)
         entry['Acc'] = float('nan') if N==0 else entry['T'] / N
     df = DataFrame.from_items([(e, {k: entry[e]}) for e in PRCounter.COLUMNS])
     self._df = self._df.append(df)
Code example #42
    def get_contributions(self, phase=None):
        data = []
        phase_query_key = {'$exists': 1} if phase is None else phase
        columns = ['mp-id', 'contribution', 'formula']
        if phase is None:
            columns.append('phase')
        columns += ['dH (formation)', 'dH (hydration)', 'GS?', 'CIF']

        docs = self.query_contributions(
            criteria={
                'content.doi': '10.1021/jacs.6b11301',
                'content.data.Phase': phase_query_key
            },
            projection={
                '_id': 1, 'mp_cat_id': 1, 'content.data': 1,
                'content.{}'.format(mp_level01_titles[3]): 1
            }
        )
        if not docs:
            raise Exception('No contributions found for MnO2 Phase Selection Explorer!')

        for doc in docs:
            mpfile = MPFile.from_contribution(doc)
            mp_id = mpfile.ids[0]
            contrib = mpfile.hdata[mp_id]['data']
            cid_url = '/'.join([
                self.preamble.rsplit('/', 1)[0], 'explorer', 'materials', doc['_id']
            ])
            row = [mp_id, cid_url, contrib['Formula']]
            if phase is None:
                row.append(contrib['Phase'])
            row += [contrib['dHf'], contrib['dHh'], contrib['GS']]
            cif_url = ''
            structures = mpfile.sdata.get(mp_id)
            if structures:
                cif_url = '/'.join([
                    self.preamble.rsplit('/', 1)[0], 'explorer', 'materials',
                    doc['_id'], 'cif', structures.keys()[0]
                ])
            row.append(cif_url)
            data.append((mp_id, row))

        return DataFrame.from_items(data, orient='index', columns=columns)
Code example #43
    def update_output(self, output_data, description=None):
        QApplication.setOverrideCursor(QCursor(Qt.WaitCursor))

        if output_data is None:
            return
        self.set_data(output_data)

        if description is not None:
            self.set_distribution_choices(description)

        if not hasattr(self, 'distribution_by_var'):
            self.distribution_by_var = 'typmen15'

        by_var = self.distribution_by_var

        V = []
        M = []
        B = []
        for var in self.varlist:
            montant, benef = self.get_aggregate(var)
            V.append(var)
            M.append(montant)
            B.append(benef)

        items = [(u'Mesure', V), (u"Dépense\n(millions d'€)", M),
                 (u"Bénéficiaires\n(milliers de ménages)", B)]
        aggr_frame = DataFrame.from_items(items)
        self.aggregate_view.set_dataframe(aggr_frame)

        dist_frame = self.group_by(['revdisp', 'nivvie'], by_var)
        by_var_label = self.var2label[by_var]
        dist_frame.insert(0, by_var_label, u"")
        enum = self.var2enum[by_var]
        dist_frame[by_var_label] = dist_frame[by_var].apply(
            lambda x: enum._vars[x])

        dist_frame.pop(by_var)

        self.distribution_view.set_dataframe(dist_frame)
        self.distribution_view.reset()
        self.calculated()
        QApplication.restoreOverrideCursor()
Code example #44
    def get_contributions(self):
        data = []
        columns = [
            'mp-id', 'contribution', 'kohn-sham_bandgap(indirect)',
            'kohn-sham_bandgap(direct)', 'derivative_discontinuity',
            'quasi-particle_bandgap(indirect)',
            'quasi-particle_bandgap(direct)'
        ]

        docs = self.query_contributions(
            criteria=self.dtu_query,
            projection={
                '_id': 1,
                'mp_cat_id': 1,
                'content.kohn-sham_bandgap.indirect': 1,
                'content.kohn-sham_bandgap.direct': 1,
                'content.derivative_discontinuity': 1,
                'content.quasi-particle_bandgap.indirect': 1,
                'content.quasi-particle_bandgap.direct': 1
            })
        if not docs:
            raise Exception('No contributions found for DTU Explorer!')

        for doc in docs:
            mpfile = MPFile.from_contribution(doc)
            mp_id = mpfile.ids[0]
            contrib = mpfile.hdata[mp_id]
            cid_url = '/'.join([
                self.preamble.rsplit('/', 1)[0], 'explorer', 'materials',
                doc['_id']
            ])
            row = [
                mp_id, cid_url, contrib['kohn-sham_bandgap']['indirect'],
                contrib['kohn-sham_bandgap']['direct'],
                contrib['derivative_discontinuity'],
                contrib['quasi-particle_bandgap']['indirect'],
                contrib['quasi-particle_bandgap']['direct']
            ]
            data.append((mp_id, row))
        return DataFrame.from_items(data, orient='index', columns=columns)
Code example #45
def display_restriction_sites(sequence):
    """This function takes in a sequence, reads it using the read_fasta fucntion from last week's assignment
    it then uses a regular expression to define a restriction sequence. 
    using the data drame and other statistical tools from panda, this function will then fill up a 4 by 7 frequency column
    with the distribution of nucleotides in all the sequences found in a given file of dna sequence 
    the parameter sequence: any fasta file that contains a single string of DNA. """ # find the hits
    sequence1 = read_fasta(sequence)
    recognition_sequence=r'[AG]GG[AGTC]CC[CT]'
    found = re.findall( recognition_sequence , sequence1 )
    from pandas import Series, DataFrame
    import numpy 
    s= Series(found) #convert the data type of found from a list to Series, which is a one-dimensional array for the next step 
    frequency_matrix = numpy.zeros((4, len(s[0])), dtype=numpy.int) # we create an empty array of 4 rows, each with 7 spots. We will fill this array by counting the number of A, G, T and C nucleotides at each position across all the instances where the DraII sequence has occurred
    base2index = {'A': 0, 'C': 1, 'G': 2, 'T': 3} # a dictionary mapping A, C, G and T to rows 0, 1, 2 and 3 of the array
    for sequence in s:  # a for loop iterating over all the individual incidences of the draII restriction site in a given DNA sequence 
        for index, base in enumerate(sequence): #Enumerate() method adds a counter to an iterable and returns it in a form of enumerate object. 
            frequency_matrix[base2index[base]][index] += 1  # for each restriction site found, all the nucleotides will be distributed across the four rows and the empty frequency table will be filled
    from pandas import Series, DataFrame
    Data=DataFrame.from_items([('A', frequency_matrix[0,]), ('C',frequency_matrix[1,] ), 
                     ('G',frequency_matrix[2,]), ('T',frequency_matrix[3,] )],
                     orient='index', columns=['one', 'two', 'three', 'four', 'five', 'six', 'seven'])
    Data_new = Data.loc[:,"one":"seven"].div(Data.sum( axis= 0,skipna=True))  # converts the frequency matrix into a position weight matrix by dividing each element of a column by the sum of that column 
    return Data_new
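Note: numpy.int, used when allocating frequency_matrix above, was removed in NumPy 1.24; on current NumPy the allocation needs a concrete dtype, e.g.:

frequency_matrix = numpy.zeros((4, len(s[0])), dtype=int)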
Code example #46
                                    data[9],
                                    data[10],
                                    data[11],
                                    data[12],
                                    data[13],
                                ],
                            )
                        )  ##append tuple to list
                        print dt
            except:
                print "skipped day"
                pass
        else:
            print "passed"
            pass
    frame = DataFrame.from_items(datalist, orient="index", columns=columns)
    frame.columns = columns
    frame = frame.applymap(lambda x: np.nan if x == "-9999" else x)
    datafile = frame.to_csv("C:/Users/Alex/Desktop/samoa/WATERSHED_ANALYSIS/BarometricData/NSTU/NSTU-current_10_28.csv")

#### Append all
##files = os.listdir('C:/Users/Alex/Desktop/samoa/WATERSHED_ANALYSIS/BarometricData/NSTP6/')
##alldata = open('C:/Users/Alex/Desktop/samoa/WATERSHED_ANALYSIS/BarometricData/NSTP6/'+'2013.txt','w')
##for f in files:
##    if f.endswith('.csv')==True:
##        print f
##        with open(f,'wb') as csvfile:
##            data=csv.reader(csvfile,dialect='excel')
##            for row in data:
##                alldatata.write(row)
Code example #47
import pandas
from pandas import DataFrame

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

stoplist = stopwords.words('english')

seasons, episode_ids = [], []
with open("data/import/episodes.csv", "r") as episodesfile:
    reader = csv.reader(episodesfile, delimiter = ",")
    reader.next()
    for row in reader:
        seasons.append(int(row[3]))
        episode_ids.append(int(row[0]))

df = DataFrame.from_items([('Season', seasons), ('EpisodeId', episode_ids)])
last_episode_in_season = list(df.groupby("Season").max()["EpisodeId"])

print "process sentences"

episodes = defaultdict(list)
with open("data/import/sentences.csv", "r") as sentencesfile:
    reader = csv.reader(sentencesfile, delimiter = ",")
    reader.next()
    for row in reader:
        episodes[row[1]].append([ word
                                    for word in nltk.word_tokenize(row[4].lower())
                                    if word not in string.punctuation and
                                       word not in stoplist ] )

texts = []
Code example #48
0
    def add_var2(self, varname, target=None, source = 'free'):
        """
        Add a variable to the calibration dataframe

        Parameters
        ----------

        varname : str
                  name of the variable
        target : dict, optional
                 margins for the variable, keyed by modality (or by the
                 variable name itself for non-categorical variables)
        source : str, default 'free'
                 database source
        """
        w_init = self.weights_init*self.champm
        w = self.weights*self.champm
        inputs = self.simulation.survey
        output_table = self.simulation.output_table

        varcol = self.simulation.get_col(varname)
        entity = self.entity
        enum = inputs.column_by_name.get('qui'+self.entity).enum
        people = [x[1] for x in enum]

        if varname in inputs.column_by_name:
            value = inputs.get_value(varname, index = idx)
        elif output_table is not None and varname in output_table.column_by_name:
            value = output_table.get_value(varname, index = idx, opt = people, sum_ = True)

        label = varcol.label
        # TODO: rewrite this using pivot table
        items = [ ('marge'    , w[self.champm]  ), ('marge initiale' , w_init[self.champm] )]
        if varcol.__class__  in MODCOLS:
            items.append(('mod',   value[self.champm]))
            df = DataFrame.from_items(items)
            res = df.groupby('mod', sort= True).sum()
        else:
            res = DataFrame(index = ['total'],
                            data = {'marge' : (value*w).sum(),
                                    'marge initiale' : (value*w_init).sum()  } )
        res.insert(0, u"modalités",u"")
        res.insert(2, "cible", 0)
        res.insert(2, u"cible ajustée", 0)
        res.insert(4, "source", source)
        mods = res.index

        if target is not None:
            if len(mods) != len(target.keys()):
                drop_indices = [ (varname, mod) for mod in target.keys()]
                if source == 'input':
                    self.input_margins_df.drop(drop_indices, inplace=True)
                    self.input_margins_df.index.names = ['var','mod']
                if source == 'output':
                    self.output_margins_df.drop(drop_indices, inplace=True)
                    self.output_margins_df.index.names = ['var','mod']
                return

        if isinstance(varcol, EnumCol):
            if varcol.enum:
                enum = varcol.enum
                res[u'modalités'] = [enum._vars[mod] for mod in mods]
                res['mod'] = mods
            else:
                res[u'modalités'] = [mod for mod in mods]
                res['mod'] = mods
        elif isinstance(varcol, BoolCol):
            res[u'modalités'] = bool(mods)
            res['mod']        = mods
        elif isinstance(varcol, IntCol):
            res[u'modalités'] = mods
            res['mod']        = mods
        elif isinstance(varcol, AgeCol):
            res[u'modalités'] = mods
            res['mod'] = mods
        else:
            res[u'modalités'] = "total"
            res['mod']  = 0

        if label is not None:
            res['variable'] = label
        else:
            res['variable'] = varname
        res['var'] = varname

        if target is not None:
            for mod, margin in target.iteritems():
                if mod == varname:    # dirty way to deal with non-categorical data
                    res['cible'][0] = margin
                else:
                    res['cible'][mod] = margin

        if self.frame is None:
            self.frame = res
        else:
            self.frame = concat([self.frame, res])

        self.frame = self.frame.reset_index(drop=True)
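
A hedged usage sketch: `calib` stands for an instance of the calibration class this method belongs to (its construction is not shown in this snippet), and the variable names and margin values are purely illustrative. As the code above shows, the target dict is keyed by modality for categorical variables and by the variable name itself for non-categorical ones.

# calib.add_var2('sali', target={'sali': 350e9}, source='output')  # non-categorical: single margin keyed by the variable name
# calib.add_var2('so', target={1: 9e6, 2: 7e6}, source='input')    # enum variable: one margin per modality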
Code example #49
0
File: ka_bnet_pandas.py Project: irr/python-labs
        theta = compute_theta(data)    # M-step

        print("Run %d produced theta of:" % i)
        print_theta(theta)
        #log_likelihood(data, theta)

#===============================================

# TODO: infer variables and state sizes from data
nodes = ['T', 'E1', 'E2', 'E3', 'E4']
N  = len(nodes)

# create a blank adjacency matrix, then
# set the directed edges.  each row (node)
# should have a 1 in the column of each parent.
adj = DataFrame.from_items( [(node, Series(np.zeros(N, int))) for node in nodes] )
adj.index = nodes
adj.ix['E1', 'T'] = 1
adj.ix['E2', 'T'] = 1
adj.ix['E3', 'T'] = 1
adj.ix['E4', 'T'] = 1
print(adj)

# specify the TRUE joint distribution, theta.  specified as a
# dict of node -> cpt, where each cpt is a dict
# of comma-separated values of the ordered parents -> prob
theta = {}
theta['T'] = {'': {0: 0.75, 1: 0.25}}
theta['E1'] = {'0': {0: 0.45, 1: 0.55}, '1': {0: 0.05, 1: 0.95}, }
theta['E2'] = {'0': {0: 0.40, 1: 0.60}, '1': {0: 0.05, 1: 0.95}, }
theta['E3'] = {'0': {0: 0.50, 1: 0.50}, '1': {0: 0.10, 1: 0.90}, }
Code example #50
0
    def test_column_dups_operations(self):
        def check(result, expected=None):
            if expected is not None:
                assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # assignment
        # GH 3687
        arr = np.random.randn(3, 2)
        idx = lrange(2)
        df = DataFrame(arr, columns=['A', 'A'])
        df.columns = idx
        expected = DataFrame(arr, columns=idx)
        check(df, expected)

        idx = date_range('20130101', periods=4, freq='Q-NOV')
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['a', 'a', 'a', 'a'])
        df.columns = idx
        expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                             columns=idx)
        check(df, expected)

        # insert
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        df['string'] = 'bah'
        expected = DataFrame(
            [[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'], [2, 1, 3, 5, 'bah']],
            columns=['foo', 'bar', 'foo', 'hello', 'string'])
        check(df, expected)
        with assertRaisesRegexp(ValueError, 'Length of value'):
            df.insert(0, 'AnotherColumn', range(len(df.index) - 1))

        # insert same dtype
        df['foo2'] = 3
        expected = DataFrame(
            [[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
             [2, 1, 3, 5, 'bah', 3]],
            columns=['foo', 'bar', 'foo', 'hello', 'string', 'foo2'])
        check(df, expected)

        # set (non-dup)
        df['foo2'] = 4
        expected = DataFrame(
            [[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
             [2, 1, 3, 5, 'bah', 4]],
            columns=['foo', 'bar', 'foo', 'hello', 'string', 'foo2'])
        check(df, expected)
        df['foo2'] = 3

        # delete (non dup)
        del df['bar']
        expected = DataFrame(
            [[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3], [2, 3, 5, 'bah', 3]],
            columns=['foo', 'foo', 'hello', 'string', 'foo2'])
        check(df, expected)

        # try to delete again (it's not consolidated)
        del df['hello']
        expected = DataFrame(
            [[1, 1, 'bah', 3], [1, 2, 'bah', 3], [2, 3, 'bah', 3]],
            columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # consolidate
        df = df.consolidate()
        expected = DataFrame(
            [[1, 1, 'bah', 3], [1, 2, 'bah', 3], [2, 3, 'bah', 3]],
            columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # insert
        df.insert(2, 'new_col', 5.)
        expected = DataFrame(
            [[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3], [2, 3, 5., 'bah', 3]],
            columns=['foo', 'foo', 'new_col', 'string', 'foo2'])
        check(df, expected)

        # insert a dup
        assertRaisesRegexp(ValueError, 'cannot insert', df.insert, 2,
                           'new_col', 4.)
        df.insert(2, 'new_col', 4., allow_duplicates=True)
        expected = DataFrame(
            [[1, 1, 4., 5., 'bah', 3], [1, 2, 4., 5., 'bah', 3],
             [2, 3, 4., 5., 'bah', 3]],
            columns=['foo', 'foo', 'new_col', 'new_col', 'string', 'foo2'])
        check(df, expected)

        # delete (dup)
        del df['foo']
        expected = DataFrame(
            [[4., 5., 'bah', 3], [4., 5., 'bah', 3], [4., 5., 'bah', 3]],
            columns=['new_col', 'new_col', 'string', 'foo2'])
        assert_frame_equal(df, expected)

        # dup across dtypes
        df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        check(df)

        df['foo2'] = 7.
        expected = DataFrame(
            [[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.], [2, 1, 3., 5, 7.]],
            columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        result = df['foo']
        expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
                             columns=['foo', 'foo'])
        check(result, expected)

        # multiple replacements
        df['foo'] = 'string'
        expected = DataFrame(
            [['string', 1, 'string', 5, 7.], ['string', 1, 'string', 5, 7.],
             ['string', 1, 'string', 5, 7.]],
            columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        del df['foo']
        expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                             columns=['bar', 'hello', 'foo2'])
        check(df, expected)

        # values
        df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
        result = df.values
        expected = np.array([[1, 2.5], [3, 4.5]])
        self.assertTrue((result == expected).all().all())

        # rename, GH 4403
        df4 = DataFrame(
            {
                'TClose': [22.02],
                'RT': [0.0454],
                'TExg': [0.0422]
            },
            index=MultiIndex.from_tuples([(600809, 20130331)],
                                         names=['STK_ID', 'RPT_Date']))

        df5 = DataFrame(
            {
                'STK_ID': [600809] * 3,
                'RPT_Date': [20120930, 20121231, 20130331],
                'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')],
                'TClose': [38.05, 41.66, 30.01]
            },
            index=MultiIndex.from_tuples([(600809, 20120930),
                                          (600809, 20121231),
                                          (600809, 20130331)],
                                         names=['STK_ID', 'RPT_Date']))

        k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
        result = k.rename(columns={
            'TClose_x': 'TClose',
            'TClose_y': 'QT_Close'
        })
        str(result)
        result.dtypes

        expected = (DataFrame(
            [[0.0454, 22.02, 0.0422, 20130331, 600809,
              u('饡驦'), 30.01]],
            columns=[
                'RT', 'TClose', 'TExg', 'RPT_Date', 'STK_ID', 'STK_Name',
                'QT_Close'
            ]).set_index(['STK_ID', 'RPT_Date'], drop=False))
        assert_frame_equal(result, expected)

        # reindex is invalid!
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        self.assertRaises(ValueError, df.reindex, columns=['bar'])
        self.assertRaises(ValueError, df.reindex, columns=['bar', 'foo'])

        # drop
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        result = df.drop(['a'], axis=1)
        expected = DataFrame([[1], [1], [1]], columns=['bar'])
        check(result, expected)
        result = df.drop('a', axis=1)
        check(result, expected)

        # describe
        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                       columns=['bar', 'a', 'a'],
                       dtype='float64')
        result = df.describe()
        s = df.iloc[:, 0].describe()
        expected = pd.concat([s, s, s], keys=df.columns, axis=1)
        check(result, expected)

        # check column dups with index equal and not equal to df's index
        df = DataFrame(np.random.randn(5, 3),
                       index=['a', 'b', 'c', 'd', 'e'],
                       columns=['A', 'B', 'A'])
        for index in [df.index, pd.Index(list('edcba'))]:
            this_df = df.copy()
            expected_ser = pd.Series(index.values, index=this_df.index)
            expected_df = DataFrame.from_items([('A', expected_ser),
                                                ('B', this_df['B']),
                                                ('A', expected_ser)])
            this_df['A'] = index
            check(this_df, expected_df)

        # operations
        for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
            df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
            expected = getattr(df, op)(df)
            expected.columns = ['A', 'A']
            df.columns = ['A', 'A']
            result = getattr(df, op)(df)
            check(result, expected)

        # multiple assignments that change dtypes
        # the location indexer is a slice
        # GH 6120
        df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
        expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])

        df['that'] = 1.0
        check(df, expected)

        df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
        expected = DataFrame(1, index=range(5), columns=['that', 'that'])

        df['that'] = 1
        check(df, expected)
Code example #51
0
    def compute(self):
        """
        Compute inequality dataframe
        """
        output = self.simulation.output_table
        final_df = None

        WEIGHT = model.WEIGHT
        FILTERING_VARS = model.FILTERING_VARS
        for varname, entities in self.vars.iteritems():
            for entity in entities:
                #idx =  output.index[entity]

                val  = output.get_value(varname, entity)
                weights = output._inputs.get_value(WEIGHT, entity)
                filter_var_name = FILTERING_VARS[0]
                filter_var= output._inputs.get_value(filter_var_name, entity)

            items = []
            # Compute mean
            moy = (weights*filter_var*val).sum()/(weights*filter_var).sum()
            items.append( ("Moyenne",  [moy]))

            # Compute deciles
            labels = range(1,11)
            method = 2
            decile, values = mark_weighted_percentiles(val, labels, weights*filter_var, method, return_quantiles=True)

            labels = [ 'D'+str(d) for d in range(1,11)]
            del decile
            for l, v in zip(labels[:-1],values[1:-1]):
                items.append( (l, [v]))

            # Compute Gini
            gini_coeff = gini(val, weights*filter_var)
            items.append( ( _("Gini index"), [gini_coeff]))

            df = DataFrame.from_items(items, orient = 'index', columns = [varname])
            df = df.reset_index()
            if final_df is None:
                final_df = df
            else:
                final_df = final_df.merge(df, on='index')

        final_df[u"Initial à net"] = (final_df['nivvie_net']-final_df['nivvie_ini'])/final_df['nivvie_ini']
        final_df[u"Net à disponible"] = (final_df['nivvie']-final_df['nivvie_net'])/final_df['nivvie_net']
        final_df = final_df[['index','nivvie_ini', u"Initial à net", 'nivvie_net',u"Net à disponible",'nivvie']]
        self.inequality_dataframe = final_df

        # poverty
        poverty = dict()
        entity = "men"
        varname = "nivvie"
        for percentage in [ 40, 50, 60]:
#            idx =  output.index[entity]
            varname = "pauvre" + str(percentage)
            val = output.get_value(varname, entity)
            weights = output._inputs.get_value(WEIGHT, entity)
            filter_var_name = FILTERING_VARS[0]
            filter_var= output._inputs.get_value(filter_var_name, entity)
            poverty[percentage] =  (weights*filter_var*val).sum()/(weights*filter_var).sum()

        self.poverty = poverty
Code example #52
0
File: CompCOB.py Project: TrinaKuriger/Camoco
    def _run_comparison(self, random_trials=100,
                        sig_threshold=0.05, debug=False):
        self.log(
            'Running ' + str(random_trials) +
            ' random sets for each term and comparing them.')
        dens = dict()
        significant_terms = 0
        n = 0

        # Use only 25 terms for testing purposes
        if debug:
            ont_loci = list(self.ont_loci.items())[:25]
        else:
            ont_loci = list(self.ont_loci.items())

        # Iterate through all terms
        for term, loci in ont_loci:
            # Log how many terms are done, to reassure the user that it hasn't crashed
            if n % 100 == 0:
                self.log('Compared {}/{} terms so far.', n, len(ont_loci))

            real_density = self.cob.density(loci)
            dens[term] = [real_density]

            # Run the random samples
            loci_count = len(loci)
            loci_tot_list = self.cob.refgen.random_genes(
                loci_count*random_trials
            )
            scores = []
            aboves = 0
            for x in range(random_trials):
                # Get the random genes
                loci_list = [loci_tot_list.pop() for x in range(loci_count)]

                # Find the density and save it
                score = self.cob.density(loci_list)
                if score >= real_density:
                    aboves += 1
                scores.append(score)

            # Add the stats from the scores (mean, std, number of random scores >= the real density)
            dens[term].append(np.mean(scores))
            dens[term].append(np.std(scores))
            dens[term].append(aboves)

            # Figure out if that makes it significant
            if dens[term][-1] <= (random_trials*sig_threshold):
                dens[term].append(1)
                significant_terms += 1
            else:
                dens[term].append(0)
            n += 1
        self.log('Compared all {} terms.', n)

        # Convert the dict to a DataFrame
        ans = DataFrame.from_items(
            dens.items(),
            columns=[
                self.cob.name+' Density', 'Random Density Mean',
                'Random STD', 'Items >= '+self.cob.name, 'Significant'
            ],
            orient='index'
        )

        self.log('Number of Significant Terms: ' + str(significant_terms))
        self.log('Number Random Significants Expected: '+str(len(dens)*0.05))
        return ans
Code example #53
0
    def test_column_dups_operations(self):

        def check(result, expected=None):
            if expected is not None:
                assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # assignment
        # GH 3687
        arr = np.random.randn(3, 2)
        idx = lrange(2)
        df = DataFrame(arr, columns=['A', 'A'])
        df.columns = idx
        expected = DataFrame(arr, columns=idx)
        check(df, expected)

        idx = date_range('20130101', periods=4, freq='Q-NOV')
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['a', 'a', 'a', 'a'])
        df.columns = idx
        expected = DataFrame(
            [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
        check(df, expected)

        # insert
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        df['string'] = 'bah'
        expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'],
                              [2, 1, 3, 5, 'bah']],
                             columns=['foo', 'bar', 'foo', 'hello', 'string'])
        check(df, expected)
        with assertRaisesRegexp(ValueError, 'Length of value'):
            df.insert(0, 'AnotherColumn', range(len(df.index) - 1))

        # insert same dtype
        df['foo2'] = 3
        expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
                              [2, 1, 3, 5, 'bah', 3]],
                             columns=['foo', 'bar', 'foo', 'hello',
                                      'string', 'foo2'])
        check(df, expected)

        # set (non-dup)
        df['foo2'] = 4
        expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
                              [2, 1, 3, 5, 'bah', 4]],
                             columns=['foo', 'bar', 'foo', 'hello',
                                      'string', 'foo2'])
        check(df, expected)
        df['foo2'] = 3

        # delete (non dup)
        del df['bar']
        expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3],
                              [2, 3, 5, 'bah', 3]],
                             columns=['foo', 'foo', 'hello', 'string', 'foo2'])
        check(df, expected)

        # try to delete again (it's not consolidated)
        del df['hello']
        expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                              [2, 3, 'bah', 3]],
                             columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # consolidate
        df = df.consolidate()
        expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                              [2, 3, 'bah', 3]],
                             columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # insert
        df.insert(2, 'new_col', 5.)
        expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3],
                              [2, 3, 5., 'bah', 3]],
                             columns=['foo', 'foo', 'new_col', 'string',
                                      'foo2'])
        check(df, expected)

        # insert a dup
        assertRaisesRegexp(ValueError, 'cannot insert',
                           df.insert, 2, 'new_col', 4.)
        df.insert(2, 'new_col', 4., allow_duplicates=True)
        expected = DataFrame([[1, 1, 4., 5., 'bah', 3],
                              [1, 2, 4., 5., 'bah', 3],
                              [2, 3, 4., 5., 'bah', 3]],
                             columns=['foo', 'foo', 'new_col',
                                      'new_col', 'string', 'foo2'])
        check(df, expected)

        # delete (dup)
        del df['foo']
        expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3],
                              [4., 5., 'bah', 3]],
                             columns=['new_col', 'new_col', 'string', 'foo2'])
        assert_frame_equal(df, expected)

        # dup across dtypes
        df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        check(df)

        df['foo2'] = 7.
        expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.],
                              [2, 1, 3., 5, 7.]],
                             columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        result = df['foo']
        expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
                             columns=['foo', 'foo'])
        check(result, expected)

        # multiple replacements
        df['foo'] = 'string'
        expected = DataFrame([['string', 1, 'string', 5, 7.],
                              ['string', 1, 'string', 5, 7.],
                              ['string', 1, 'string', 5, 7.]],
                             columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        del df['foo']
        expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=[
                             'bar', 'hello', 'foo2'])
        check(df, expected)

        # values
        df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
        result = df.values
        expected = np.array([[1, 2.5], [3, 4.5]])
        self.assertTrue((result == expected).all().all())

        # rename, GH 4403
        df4 = DataFrame(
            {'TClose': [22.02],
             'RT': [0.0454],
             'TExg': [0.0422]},
            index=MultiIndex.from_tuples([(600809, 20130331)],
                                         names=['STK_ID', 'RPT_Date']))

        df5 = DataFrame({'STK_ID': [600809] * 3,
                         'RPT_Date': [20120930, 20121231, 20130331],
                         'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')],
                         'TClose': [38.05, 41.66, 30.01]},
                        index=MultiIndex.from_tuples(
                            [(600809, 20120930),
                             (600809, 20121231),
                             (600809, 20130331)],
                            names=['STK_ID', 'RPT_Date']))

        k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
        result = k.rename(
            columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'})
        str(result)
        result.dtypes

        expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809,
                                u('饡驦'), 30.01]],
                              columns=['RT', 'TClose', 'TExg',
                                       'RPT_Date', 'STK_ID', 'STK_Name',
                                       'QT_Close'])
                    .set_index(['STK_ID', 'RPT_Date'], drop=False))
        assert_frame_equal(result, expected)

        # reindex is invalid!
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        self.assertRaises(ValueError, df.reindex, columns=['bar'])
        self.assertRaises(ValueError, df.reindex, columns=['bar', 'foo'])

        # drop
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        result = df.drop(['a'], axis=1)
        expected = DataFrame([[1], [1], [1]], columns=['bar'])
        check(result, expected)
        result = df.drop('a', axis=1)
        check(result, expected)

        # describe
        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                       columns=['bar', 'a', 'a'], dtype='float64')
        result = df.describe()
        s = df.iloc[:, 0].describe()
        expected = pd.concat([s, s, s], keys=df.columns, axis=1)
        check(result, expected)

        # check column dups with index equal and not equal to df's index
        df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'],
                       columns=['A', 'B', 'A'])
        for index in [df.index, pd.Index(list('edcba'))]:
            this_df = df.copy()
            expected_ser = pd.Series(index.values, index=this_df.index)
            expected_df = DataFrame.from_items([('A', expected_ser),
                                                ('B', this_df['B']),
                                                ('A', expected_ser)])
            this_df['A'] = index
            check(this_df, expected_df)

        # operations
        for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
            df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
            expected = getattr(df, op)(df)
            expected.columns = ['A', 'A']
            df.columns = ['A', 'A']
            result = getattr(df, op)(df)
            check(result, expected)

        # multiple assignments that change dtypes
        # the location indexer is a slice
        # GH 6120
        df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
        expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])

        df['that'] = 1.0
        check(df, expected)

        df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
        expected = DataFrame(1, index=range(5), columns=['that', 'that'])

        df['that'] = 1
        check(df, expected)
Code example #54
0
from pandas import DataFrame

def is_popular(row):
    if row['fb_likes'] >= 2000:
        val = 1
    else:
        val = 0
    return val

data = DataFrame.from_items(
    [('category', ['Entertainment', 'Lifestyle', 'Technology']),
    ('fb_likes', [2349, 1299, 6589])
    ])

data['is_popular'] = data.apply(is_popular, axis=1)

train=data.sample(frac=0.8,random_state=100)
test=data.drop(train.index)

x_train = train.ix[:,:-1]
x_train_target = train.ix[:,-1:]

y_test = test.ix[:,:-1]
y_test_target = test.ix[:,-1:]

print x_train
print x_train_target

#Just use x_train.values and x_train_target.values (same with test data) before fitting
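
A brief sketch of what that last note means in practice. scikit-learn is assumed here purely for illustration (the snippet does not name a library), and only the numeric fb_likes column is used because the string category column would need encoding before fitting.

from sklearn.tree import DecisionTreeClassifier  # assumed library, not part of the original snippet

clf = DecisionTreeClassifier()
clf.fit(x_train[['fb_likes']].values, x_train_target.values.ravel())  # plain NumPy arrays, as the note advises
print(clf.predict(y_test[['fb_likes']].values))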
Code example #55
0
File: system.py Project: ywuywu/ml_monorepo
def run_system(model, system, group, intraday=False, quantity=1):
    r"""Run a system for a given group, creating a trades frame.

    Parameters
    ----------
    model : alphapy.Model
        The model object with specifications.
    system : alphapy.System
        The system to run.
    group : alphapy.Group
        The group of symbols to trade.
    intraday : bool, optional
        If true, this is an intraday system.
    quantity : float, optional
        The amount to trade for each symbol, e.g., number of shares

    Returns
    -------
    tf : pandas.DataFrame
        All of the trades for this ``group``.

    """

    system_name = system.name
    logger.info("Generating Trades for System %s", system_name)

    # Unpack the model data.

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Extract the group information.

    gname = group.name
    gmembers = group.members
    gspace = group.space

    # Run the system for each member of the group

    gtlist = []
    for symbol in gmembers:
        # generate the trades for this member
        tlist = trade_system(model, system, gspace, intraday, symbol, quantity)
        if tlist:
            # add trades to global trade list
            for item in tlist:
                gtlist.append(item)
        else:
            logger.info("No trades for symbol %s", symbol)

    # Create group trades frame

    tf = None
    if gtlist:
        tspace = Space(system_name, "trades", group.space.fractal)
        gtlist = sorted(gtlist, key=lambda x: x[0])
        tf = DataFrame.from_items(gtlist, orient='index', columns=Trade.states)
        tfname = frame_name(gname, tspace)
        system_dir = SSEP.join([directory, 'systems'])
        labels = ['date']
        if intraday:
            labels.append('time')
        write_frame(tf,
                    system_dir,
                    tfname,
                    extension,
                    separator,
                    index=True,
                    index_label=labels)
        del tspace
    else:
        logger.info("No trades were found")

    # Return trades frame
    return tf
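
A hedged call sketch: the model, system, and group objects are assumed to have been built elsewhere with alphapy (their constructors are not shown in this snippet), so only the call signature documented above is illustrated.

# trades = run_system(model, system, group, intraday=False, quantity=100)
# if trades is not None:
#     print(trades.head())  # one row per trade, columns given by Trade.states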
Code example #56
0
File: analyze.py Project: hoffmann/appnetstats
import json
import sys
from pandas import DataFrame

filename = sys.argv[1]

data = json.load(open(filename))
frame_data = [(k, (len(v['following']), len(v['followers']))) for k,v in data.items()]
frame = DataFrame.from_items(frame_data, orient='index', columns=['following', 'followers'])

print frame

print '## Following'
print frame['following'].describe()
print frame.sort('following', ascending=False)[:10]

print '## Followers'
print frame['followers'].describe()
print frame.sort('followers', ascending=False)[:10]
Code example #57
0
    def compute(self):
        """
        Compute inequality dataframe
        """
        final_df = None
        simulation = self.survey_scenario.new_simulation()
        column_by_name = simulation.tax_benefit_system.column_by_name

        # amounts and beneficiaries from current data, and from default data if it exists
        # Build weights for each entity

        from openfisca_france_data import FILTERING_VARS
        for varname, entities in self.vars.iteritems():
            for entity_key_plural in entities:
                column = column_by_name[varname]
                weight_name = self.survey_scenario.weight_column_name_by_entity_key_plural[column.entity_key_plural]
                filter_by = FILTERING_VARS[0]
                filter_by_name = FILTERING_VARS[0]
                if column.entity_key_plural != 'menages':
                    filter_by_name = "{}_{}".format(filter_by, column.entity_key_plural)
                val = simulation.calculate(varname)
                weights = simulation.calculate(weight_name)
                filter_var = simulation.calculate(filter_by_name)

            items = []
            # Compute mean
            moy = (weights * filter_var * val).sum() / (weights * filter_var).sum()
            items.append(("Moyenne", [moy]))
            # Compute deciles
            labels = range(1, 11)
            method = 2
            decile, values = mark_weighted_percentiles(val, labels, weights * filter_var,
                                                       method, return_quantiles = True)
            labels = ['D' + str(d) for d in range(1, 11)]
            del decile
            for l, v in zip(labels[:-1], values[1:-1]):
                items.append((l, [v]))

            # Compute Gini
            gini_coeff = gini(val, weights * filter_var)
            items.append((_("Gini index"), [gini_coeff]))
            df = DataFrame.from_items(items, orient = 'index', columns = [varname])
            df = df.reset_index()
            if final_df is None:
                final_df = df
            else:
                final_df = final_df.merge(df, on='index')

        final_df[u"Initial à net"] = (final_df['nivvie_net'] - final_df['nivvie_ini']) / final_df['nivvie_ini']
        final_df[u"Net à disponible"] = (final_df['nivvie'] - final_df['nivvie_net']) / final_df['nivvie_net']
        final_df = final_df[['index', 'nivvie_ini', u"Initial à net", 'nivvie_net', u"Net à disponible", 'nivvie']]
        self.inequality_data_frame = final_df

        # Poverty
        poverty = dict()
        varname = "nivvie"
        for percentage in [40, 50, 60]:
            varname = "pauvre{}".format(percentage)
            column = column_by_name[varname]
            weight_name = self.survey_scenario.weight_column_name_by_entity_key_plural[column.entity_key_plural]
            filter_by_name = FILTERING_VARS[0]
            if column.entity_key_plural != 'menages':
                filter_by_name = "{}_{}".format(filter_by_name, column.entity_key_plural)
            val = simulation.calculate(varname)
            weights = simulation.calculate(weight_name)
            filter_var = simulation.calculate(filter_by_name)
            poverty[percentage] = (weights * filter_var * val).sum() / (weights * filter_var).sum()

        self.poverty = poverty