Example #1
 def __add__(self, that):
     # ensure all rows are present for both tables, filling in 0 if necessary
     # (otherwise the empty rows will be treated as if they contain NaN when adding)
     
     me = self._df
     you = that._df
     for row in me.index:
         if row not in that._df.index:
             you = you.append(DataFrame.from_items([(e, {row: '' if me[e][row]=='' else 0}) for e in PRCounter.COLUMNS]))
     for row in you.index:
         if row not in self._df.index:
             me = me.append(DataFrame.from_items([(e, {row: '' if you[e][row]=='' else 0}) for e in PRCounter.COLUMNS]))
     
     # add counts
     new_df = me + you
     
     # recompute ratios
     new_df['P'] = new_df['Numer'] / new_df['PDenom']
     new_df['R'] = new_df['Numer'] / new_df['RDenom']
     denom = (new_df['P'] + new_df['R'])
     new_df['F'] = 2 * new_df['P'] * new_df['R'] / denom[denom>0]
     new_df['Acc'] = new_df['T'] / new_df['N']
     result = PRCounter()
     result._df = new_df
     return result
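Note: DataFrame.from_items and DataFrame.append are gone from modern pandas (removed in 1.0 and 2.0 respectively). A minimal sketch of the same row alignment, assuming purely numeric count columns (the original also carries '' placeholder cells, which this sketch ignores): arithmetic operators already align on the union of the indexes, and fill_value=0 keeps rows missing from one side from turning into NaN.

# sketch for modern pandas, assuming numeric columns only
new_df = me.add(you, fill_value=0)  # aligns on the union of row labels; a missing side counts as 0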
Example #2
def TreeMatrix(D,desc,L, Env=None,DshapeLarge=True):
    """
    Applying tree information (desc,L) on a given count matrix (D) and columns grouping (Env) to obtain a matrix of count over the tree
    """
    #I assume that D has correct columns name
    if not DshapeLarge:
        Z=(D[["Sample","Taxon"]]).values
        Z=MultiIndex.from_tuples(map(tuple,tuple(Z)), names=["Sample","Taxon"])
        D.index=Z
        Dlarge=D.Count.unstack(level=0)
        Dlarge.fillna(value=0,inplace=True)
        # assume Env has the correct index and column names
        ExperimentalDesignColumns = MultiIndex.from_tuples(
            map(tuple, tuple(Env.ix[Dlarge.columns].values)),
            names=["Sample", "Group"])
    else:
        # if D is already Large Environment information is already included
        Dlarge=D
        ExperimentalDesignColumns=Dlarge.columns
    # if a taxon is present in the tree but not in the table, .ix correctly reports NaN for that row, which is later converted to zero
    NodeTableLarge=[[x[0],Dlarge.ix[x[-1]].sum()] for x in desc]
    Dtree=DataFrame.from_items(NodeTableLarge).transpose()
    NodeAndLeafNamesIndex=MultiIndex.from_tuples(
        map(tuple,tuple(L.loc[:,["Name","Is_Leaf"]].ix[Dtree.index].values))
        , names=["Name","Is_Leaf"])
    Dtree.index=NodeAndLeafNamesIndex
    Dtree.columns=ExperimentalDesignColumns
    Dtree.columns=Dtree.columns.reorder_levels(["Group", "Sample"])
    return Dtree
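.ix was removed in pandas 1.0. Assuming x[-1] is a list of taxon labels, reindex reproduces the NaN-for-missing lookup, and sum() then skips the NaN rows, so absent taxa contribute zero:

# sketch of the .ix row lookup on pandas >= 1.0
NodeTableLarge = [[x[0], Dlarge.reindex(x[-1]).sum()] for x in desc]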
Example #3
def StormSums(Stormslist,Data,offset=0):
    eventlist = []
    index =[]
    for storm_index,storm in Stormslist.iterrows():
        #print storm
        start = storm['start']-timedelta(minutes=offset) ## if storms are defined by stream response, grab the preceding precip data
        end= storm['end']
        data = True ## Innocent until proven guilty
        try:
            event = Data.ix[start:end] ### slice Data for the event window
        except KeyError:
            start = start+timedelta(minutes=15) ## if the start time falls between two 30-minute periods
        try:
            event = Data.ix[start:end] ## retry with the nudged start
        except KeyError:
            end = end+timedelta(minutes=15) ## nudge the end time as well
        try:
            event = Data.ix[start:end]
        except KeyError:
            print 'no precip data available for storm'
            data = False
        if data != False:
            eventcount = event.count()
            eventsum = event.sum()
            eventmax = event.max()
            eventlist.append((storm['start'],[storm['start']-timedelta(minutes=offset),storm['end'],eventcount,eventsum,eventmax])) 
    Events=DataFrame.from_items(eventlist,orient='index',columns=['start','end','count','sum','max'])
    return Events
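DataFrame.from_items was deprecated in pandas 0.23 and removed in 1.0; for the orient='index' case, DataFrame.from_dict is the documented replacement (dicts preserve insertion order on Python 3.7+):

# sketch of the final construction on pandas >= 0.23
Events = DataFrame.from_dict(dict(eventlist), orient='index',
                             columns=['start', 'end', 'count', 'sum', 'max'])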
Example #4
 def test_scientific_no_exponent(self):
     # see gh-12215
     df = DataFrame.from_items([("w", ["2e"]), ("x", ["3E"]), ("y", ["42e"]), ("z", ["632E"])])
     data = df.to_csv(index=False)
     for prec in self.float_precision_choices:
         df_roundtrip = self.read_csv(StringIO(data), float_precision=prec)
         tm.assert_frame_equal(df_roundtrip, df)
Example #5
def into(a, b, columns=None, schema=None, **kwargs):
    if not columns and schema:
        columns = dshape(schema)[0].names
    return DataFrame.from_items(((column, b[column][:]) for column in
                                    sorted(b.names)),
                                orient='columns',
                                columns=columns)
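On modern pandas the same frame can be sketched with a dict comprehension (insertion-ordered on Python 3.7+); passing columns still selects and orders the final columns:

# sketch of an equivalent construction without from_items
df = DataFrame({column: b[column][:] for column in sorted(b.names)},
               columns=columns)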
Example #6
 def __setitem__(self, k, v):
     if isinstance(v[0], int):
         N, gold_set, pred_set = v
         if gold_set or pred_set:
             assert N>0,(N,gold_set,pred_set)
     else:
         gold_set, pred_set = v
         N = ''
     entry = {
         'Numer': len(gold_set & pred_set),
         'PDenom': len(pred_set),
         'RDenom': len(gold_set),
         'N': N
     }
     entry['P'] = entry['Numer'] / entry['PDenom'] if entry['PDenom'] else float('nan')
     entry['R'] = entry['Numer'] / entry['RDenom'] if entry['RDenom'] else float('nan')
     entry['F'] = 2 * entry['P'] * entry['R'] / (entry['P'] + entry['R']) if (entry['P'] + entry['R']) else float('nan')
     if N=='':
         entry['T'] = None
         entry['Acc'] = None
     else:
         if len(gold_set)==len(pred_set)==N:
             entry['T'] = entry['Numer']
         else:
             tp = entry['Numer']
             fp = len(pred_set-gold_set)
             fn = len(gold_set-pred_set)
             entry['T'] = N-fp-fn
         assert entry['T']>=0,(entry,gold_set,pred_set)
         entry['Acc'] = float('nan') if N==0 else entry['T'] / N
     df = DataFrame.from_items([(e, {k: entry[e]}) for e in PRCounter.COLUMNS])
     self._df = self._df.append(df)
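A concrete illustration of the entry arithmetic, with hypothetical sets (not from the source; Python 3 division):

gold_set = {'a', 'b', 'c'}
pred_set = {'b', 'c', 'd'}
numer = len(gold_set & pred_set)  # 2 true positives
p = numer / len(pred_set)         # precision = 2/3
r = numer / len(gold_set)         # recall    = 2/3
f = 2 * p * r / (p + r)           # F1 (harmonic mean) = 2/3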
Example #7
    def _update_margins(self):
        for variable in self.margins_by_variable:
            survey_scenario = self.survey_scenario
            simulation = survey_scenario.simulation
            column_by_name = survey_scenario.tax_benefit_system.column_by_name

            assert variable in column_by_name
            column = survey_scenario.tax_benefit_system.column_by_name[variable]
            weight = self.weight
            filter_by = self.filter_by
            initial_weight = self.initial_weight

            value = simulation.calculate_add(variable)
            margin_items = [
                ('actual', weight[filter_by]),
                ('initial', initial_weight[filter_by]),
                ]

            if column.__class__ in [AgeCol, BoolCol, EnumCol]:
                margin_items.append(('category', value[filter_by]))
                # TODO: should not use DataFrame for that ...
                margins_data_frame = DataFrame.from_items(margin_items)
                margins_data_frame = margins_data_frame.groupby('category', sort = True).sum()
                margin_by_type = margins_data_frame.to_dict()
            else:
                margin_by_type = dict(
                    actual = (weight[filter_by] * value[filter_by]).sum(),
                    initial = (initial_weight[filter_by] * value[filter_by]).sum(),
                    )
            self.margins_by_variable[variable].update(margin_by_type)
Example #8
def simulate(adj, theta, num_samples):
    data = DataFrame.from_items( [(node, Series(np.zeros(num_samples, int))) for node in adj.columns] )
    for node in adj.columns:
        P = parents(node, adj)
        for n in range(num_samples):
            key = ','.join( [str(data.ix[n,parent]) for parent in P] )
            pdt = theta[node][key]
            data.ix[n,node] = draw(pdt)
    return data
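data.ix was removed in pandas 1.0; since data has a default RangeIndex here, the scalar accessor .at is a drop-in replacement for the two lookups (a sketch of the inner loop):

# inner loop on pandas >= 1.0
key = ','.join(str(data.at[n, parent]) for parent in P)
pdt = theta[node][key]
data.at[n, node] = draw(pdt)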
Example #9
def fetch_genes(taxon_id):
    c.execute("""
    SELECT id, symbol, name 
    FROM gene 
    WHERE taxon_id=%s 
    ORDER BY id""", (taxon_id,))
    return DataFrame.from_items([(row[0], row) for row in c], 
                                columns=["id", "symbol", "name"],
                                orient="index")
Example #11
def save_data(): 
    ts = time.time()
    base_dir = filedialog.askdirectory()
    filename_time = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d-%H%M')
    filename_base = os.path.join(base_dir, filename_time)
    filename = '%s.csv' % filename_base
    print("Saving Data...")
    df1 = DataFrame.from_items([('Pitch',pdx), ('Roll',pdy),('Yaw',pdz)]) 
    df1 = df1.stack(level=0, dropna=False) # stack pivots the columns into rows; it returns a new object, so keep the result
    df1.to_csv(filename) # outputs to csv file
Example #12
    def calibrate(self):
        """
        Calibrate according to margins found in frame
        """

        df = self.frame
        inputs = self.simulation.survey
        output_table = self.simulation.output_table
        margins = {}

        if df is not None:
            df = df.reset_index(drop=True)
            df.set_index(['var','mod'], inplace = True)  # set_index with inplace=True returns None, so don't rebind df
            for var, mod in df.index:
                # Dealing with non categorical vars ...
                if df.get_value((var,mod), u"modalités") == 'total':
                    margins[var] =  df.get_value((var,mod), 'cible')
                #  ... and categorical vars
                else:
                    if var not in margins:
                        margins[var] = {}
                    margins[var][mod] =  df.get_value((var,mod), 'cible')

        param = self.get_param()

        if self.totalpop is not None:
            margins['totalpop'] = self.totalpop
        adjusted_margins = self.update_weights(margins, param=param)

        if 'totalpop' in margins.keys():
            del margins['totalpop']

        w = self.weights
        for var in margins.keys():
            if var in inputs.column_by_name:
                value = inputs.get_value(var, self.entity)
            else:
                entity = self.entity
                enum = output_table._inputs.column_by_name.get('qui'+self.entity).enum
                people = [x[1] for x in enum]
                value = output_table.get_value(var, entity=entity, opt=people, sum_=True)

            if isinstance(margins[var], dict):
                items = [('marge', w), ('mod', value)]
                updated_margins = DataFrame.from_items(items).groupby('mod', sort= True).sum()
                for mod in margins[var].keys():
                    df.set_value((var,mod), u"cible ajustée", adjusted_margins[var][mod])
                    df.set_value((var,mod), u"marge", updated_margins['marge'][mod])
            else:
                updated_margin = (w*value).sum()
                df.set_value((var,0), u"cible ajustée", adjusted_margins[var])
                df.set_value((var,0), u"marge", updated_margin)

        if self.frame is not None:
            self.frame = df.reset_index()
Example #13
    def test_reader_seconds(self):
        # Test reading times with and without milliseconds. GH5945.
        _skip_if_no_xlrd()
        import xlrd

        if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"):
            # Xlrd >= 0.9.3 can handle Excel milliseconds.
            expected = DataFrame.from_items([("Time",
                                              [time(1, 2, 3),
                                               time(2, 45, 56, 100000),
                                               time(4, 29, 49, 200000),
                                               time(6, 13, 42, 300000),
                                               time(7, 57, 35, 400000),
                                               time(9, 41, 28, 500000),
                                               time(11, 25, 21, 600000),
                                               time(13, 9, 14, 700000),
                                               time(14, 53, 7, 800000),
                                               time(16, 37, 0, 900000),
                                               time(18, 20, 54)])])
        else:
            # Xlrd < 0.9.3 rounds Excel milliseconds.
            expected = DataFrame.from_items([("Time",
                                              [time(1, 2, 3),
                                               time(2, 45, 56),
                                               time(4, 29, 49),
                                               time(6, 13, 42),
                                               time(7, 57, 35),
                                               time(9, 41, 29),
                                               time(11, 25, 22),
                                               time(13, 9, 15),
                                               time(14, 53, 8),
                                               time(16, 37, 1),
                                               time(18, 20, 54)])])

        epoch_1900 = os.path.join(self.dirpath, 'times_1900.xls')
        epoch_1904 = os.path.join(self.dirpath, 'times_1904.xls')

        actual = read_excel(epoch_1900, 'Sheet1')
        tm.assert_frame_equal(actual, expected)

        actual = read_excel(epoch_1904, 'Sheet1')
        tm.assert_frame_equal(actual, expected)
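distutils.version.LooseVersion is deprecated on current Python; the same version gate can be written with the third-party packaging library (a sketch):

from packaging.version import Version

if Version(xlrd.__VERSION__) >= Version("0.9.3"):
    ...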
Example #14
 def __add__(self, that):
     # ensure all rows are present for both tables, filling in 0 if necessary
     # (otherwise the empty rows will be treated as if they contain NaN when adding)
     
     me = self._df
     you = that._df
     for row in me.index:
         if row not in that._df.index:
             you = you.append(DataFrame.from_items([(e, {row: '' if me[e][row]=='' else 0}) for e in PRCounter.COLUMNS]))
     for row in you.index:
         if row not in self._df.index:
             me = me.append(DataFrame.from_items([(e, {row: '' if you[e][row]=='' else 0}) for e in PRCounter.COLUMNS]))
     
     # add counts
     new_df = me + you
     
     result = PRCounter()
     result._df = new_df
     if self.COMPUTE_RATIOS_ON_ADD: # recompute ratios
         result.compute_ratios()  # recompute on the result, not on self
     return result
Example #15
def test_agg_period_index():
    prng = period_range('2012-1-1', freq='M', periods=3)
    df = DataFrame(np.random.randn(3, 2), index=prng)
    rs = df.groupby(level=0).sum()
    assert isinstance(rs.index, PeriodIndex)

    # GH 3579
    index = period_range(start='1999-01', periods=5, freq='M')
    s1 = Series(np.random.rand(len(index)), index=index)
    s2 = Series(np.random.rand(len(index)), index=index)
    series = [('s1', s1), ('s2', s2)]
    df = DataFrame.from_items(series)
    grouped = df.groupby(df.index.month)
    list(grouped)
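On modern pandas the from_items call can be replaced by a plain dict, which preserves insertion order on Python 3.7+ and so keeps the 's1'/'s2' column order:

# sketch of the same construction without from_items
df = DataFrame({'s1': s1, 's2': s2})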
Example #16
    def test_reader_special_dtypes(self):
        _skip_if_no_xlrd()

        expected = DataFrame.from_items([
            ("IntCol", [1, 2, -3, 4, 0]),
            ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]),
            ("BoolCol", [True, False, True, True, False]),
            ("StrCol", [1, 2, 3, 4, 5]),
            # GH5394 - this is why convert_float isn't vectorized
            ("Str2Col", ["a", 3, "c", "d", "e"]),
            ("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31),
                         datetime(1905, 1, 1), datetime(2013, 12, 14),
                         datetime(2015, 3, 14)])
        ])

        xlsx_path = os.path.join(self.dirpath, 'test_types.xlsx')
        xls_path = os.path.join(self.dirpath, 'test_types.xls')

        # should read in correctly and infer types
        for path in (xls_path, xlsx_path):
            actual = read_excel(path, 'Sheet1')
            tm.assert_frame_equal(actual, expected)

        # if not coercing number, then int comes in as float
        float_expected = expected.copy()
        float_expected["IntCol"] = float_expected["IntCol"].astype(float)
        float_expected.loc[1, "Str2Col"] = 3.0
        for path in (xls_path, xlsx_path):
            actual = read_excel(path, 'Sheet1', convert_float=False)
            tm.assert_frame_equal(actual, float_expected)

        # check setting Index (assuming xls and xlsx are the same here)
        for icol, name in enumerate(expected.columns):
            actual = read_excel(xlsx_path, 'Sheet1', index_col=icol)
            actual2 = read_excel(xlsx_path, 'Sheet1', index_col=name)
            exp = expected.set_index(name)
            tm.assert_frame_equal(actual, exp)
            tm.assert_frame_equal(actual2, exp)

        # convert_float and converters should be different but both accepted
        expected["StrCol"] = expected["StrCol"].apply(str)
        actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str})
        tm.assert_frame_equal(actual, expected)

        no_convert_float = float_expected.copy()
        no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str)
        actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str},
                           convert_float=False)
        tm.assert_frame_equal(actual, no_convert_float)
Example #17
def task_cohen(dataOne, dataTwo):

    labelOne = 'column1'
    labelTwo = 'column2'

    # Create input
    df = DataFrame.from_items([(labelOne, dataOne), (labelTwo, dataTwo)])

    meanx = mean(df[labelOne])
    meany = mean(df[labelTwo])
    sdx = std(df[labelOne])
    sdy = std(df[labelTwo])
    s = sqrt((sdx**2 + sdy**2)/2)
    d = round(fabs(meanx-meany)/s, 4)

    return d
Example #18
def to_dataframe(iterable, *attrfields):
    ''' If the user specifies fields, only those fields, in that order, are cast into the
    data frame.  Otherwise, fields are taken from the first element of the iterable.
    Fields must be a list of strings.'''
    if attrfields:
        columns = attrfields
    else:
        ### The auto-assignment below only works for namedtuples (uses the _fields attribute) and takes from the iterable[0] entry ###
        try:
            columns = iterable[0]._fields  # FOR NOW ONLY WORKS FOR NAMEDTUPLE attributes
        except AttributeError:
            raise AttributeError('to_dataframe requires a list of attributes')
    fget = attrgetter(*columns)
    items = [(idx, fget(v)) for idx, v in enumerate(iterable)]  # key/value pairs: key = index position, value = tuple of results
    return DataFrame.from_items(items, columns, orient='index')  # orient='index' tells pandas the keys are row labels, not column labels
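A hypothetical usage of the corrected function, assuming namedtuple records:

from collections import namedtuple

Point = namedtuple('Point', ['x', 'y'])
pts = [Point(1, 2), Point(3, 4)]
df = to_dataframe(pts)             # columns inferred from Point._fields
df2 = to_dataframe(pts, 'y', 'x')  # explicit fields, in the given order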
Example #19
    def compute_aggregates(self, filter_by = None):
        """
        Compute aggregate amounts
        """
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        V = []
        M = {'data': [], 'default': []}
        B = {'data': [], 'default': []}
        U = []

        M_label = {'data': self.labels['dep'],
                   'default': self.labels['dep_default']}
        B_label = {'data': self.labels['benef'],
                   'default': self.labels['benef_default']}

        for var in self.varlist:
            # amounts and beneficiaries from current data, and from default data if it exists
            montant_benef = self.get_aggregate(var, filter_by)
            V.append(column_by_name[var].label)
            entity = column_by_name[var].entity_key_plural

            U.append(entity)
            for dataname in montant_benef:
                M[dataname].append(montant_benef[dataname][0])
                B[dataname].append(montant_benef[dataname][1])

        # build items list
        items = [(self.labels['var'], V)]

        for dataname in M:
            if M[dataname]:
                items.append((M_label[dataname], M[dataname]))
                items.append((B_label[dataname], B[dataname]))

        items.append((self.labels['entity'], U))
        aggr_frame = DataFrame.from_items(items)

        self.aggr_frame = None
        for code, label in self.labels.iteritems():
            try:
                col = aggr_frame[label]
                if self.aggr_frame is None:
                    self.aggr_frame = DataFrame(col)
                else:
                    self.aggr_frame = self.aggr_frame.join(col, how="outer")
            except:
                pass
Example #20
    def update_output(self, output_data, description = None):
        QApplication.setOverrideCursor(QCursor(Qt.WaitCursor))

        if output_data is None:
            return
        self.set_data(output_data)        
        
        if description is not None:  
            self.set_distribution_choices(description)
            
        if not hasattr(self, 'distribution_by_var'):
            self.distribution_by_var = 'typmen15'
        
        by_var = self.distribution_by_var
        

        V = []
        M = []
        B = []
        for var in self.varlist:
            montant, benef = self.get_aggregate(var)
            V.append(var)
            M.append(montant)
            B.append(benef)
        
        items = [(u'Mesure', V), 
                 (u"Dépense\n(millions d'€)", M), 
                 (u"Bénéficiaires\n(milliers de ménages)", B)]
        aggr_frame = DataFrame.from_items(items)
        self.aggregate_view.set_dataframe(aggr_frame)

        dist_frame = self.group_by(['revdisp', 'nivvie'], by_var)
        by_var_label = self.var2label[by_var]
        dist_frame.insert(0,by_var_label,u"") 
        enum = self.var2enum[by_var]
        dist_frame[by_var_label] = dist_frame[by_var].apply(lambda x: enum._vars[x])
        
        dist_frame.pop(by_var)
                
        self.distribution_view.set_dataframe(dist_frame)
        self.distribution_view.reset()
        self.calculated()
        QApplication.restoreOverrideCursor()
Example #21
 def __setitem__(self, k, v):
     points = {}
     if isinstance(v[0], int):
         N, gold_set, pred_set = v
         if gold_set or pred_set:
             assert N>0,(N,gold_set,pred_set)
     else:
         N = ''
         gold, pred = v
         pred_set = set(pred.keys()) if isinstance(pred, dict) else pred
         gold_set = set(gold.keys()) if isinstance(gold, dict) else gold
         if isinstance(gold, dict):
             points.update(gold)
             if isinstance(pred, dict):
                 for elt in gold_set & pred_set:
                     assert gold[elt]==pred[elt],(elt,gold[elt],pred[elt])
         if isinstance(pred, dict):
             points.update(pred)
         
     entry = {
         'Numer': sum(points.get(elt,1) for elt in gold_set & pred_set),
         'PDenom': sum(points.get(elt,1) for elt in pred_set),
         'RDenom': sum(points.get(elt,1) for elt in gold_set),
         'N': N
     }
     entry['P'] = entry['Numer'] / entry['PDenom'] if entry['PDenom'] else float('nan')
     entry['R'] = entry['Numer'] / entry['RDenom'] if entry['RDenom'] else float('nan')
     entry['F'] = 2 * entry['P'] * entry['R'] / (entry['P'] + entry['R']) if (entry['P'] + entry['R']) else float('nan')
     if N=='':
         entry['T'] = None
         entry['Acc'] = None
     else:
         if len(gold_set)==len(pred_set)==N:
             entry['T'] = entry['Numer']
         else:
             tp = entry['Numer']
             fp = len(pred_set-gold_set)
             fn = len(gold_set-pred_set)
             entry['T'] = N-fp-fn
         assert entry['T']>=0,(entry,gold_set,pred_set)
         entry['Acc'] = float('nan') if N==0 else entry['T'] / N
     df = DataFrame.from_items([(e, {k: entry[e]}) for e in PRCounter.COLUMNS])
     self._df = self._df.append(df)
Example #22
                                    data[9],
                                    data[10],
                                    data[11],
                                    data[12],
                                    data[13],
                                ],
                            )
                        )  ##append tuple to list
                        print dt
            except:
                print "skipped day"
                pass
        else:
            print "passed"
            pass
    frame = DataFrame.from_items(datalist, orient="index", columns=columns)
    frame.columns = columns
    frame = frame.applymap(lambda x: np.nan if x == "-9999" else x)
    datafile = frame.to_csv("C:/Users/Alex/Desktop/samoa/WATERSHED_ANALYSIS/BarometricData/NSTU/NSTU-current_10_28.csv")

#### Append all
##files = os.listdir('C:/Users/Alex/Desktop/samoa/WATERSHED_ANALYSIS/BarometricData/NSTP6/')
##alldata = open('C:/Users/Alex/Desktop/samoa/WATERSHED_ANALYSIS/BarometricData/NSTP6/'+'2013.txt','w')
##for f in files:
##    if f.endswith('.csv')==True:
##        print f
##        with open(f,'wb') as csvfile:
##            data=csv.reader(csvfile,dialect='excel')
##            for row in data:
##                alldatata.write(row)
Example #23
import csv
import logging
import string
from collections import defaultdict

import nltk
from nltk.corpus import stopwords

import pandas
from pandas import DataFrame

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

stoplist = stopwords.words('english')

seasons, episode_ids = [], []
with open("data/import/episodes.csv", "r") as episodesfile:
    reader = csv.reader(episodesfile, delimiter = ",")
    reader.next()
    for row in reader:
        seasons.append(int(row[3]))
        episode_ids.append(int(row[0]))

df = DataFrame.from_items([('Season', seasons), ('EpisodeId', episode_ids)])
last_episode_in_season = list(df.groupby("Season").max()["EpisodeId"])

print "process sentences"

episodes = defaultdict(list)
with open("data/import/sentences.csv", "r") as sentencesfile:
    reader = csv.reader(sentencesfile, delimiter = ",")
    reader.next()
    for row in reader:
        episodes[row[1]].append([ word
                                    for word in nltk.word_tokenize(row[4].lower())
                                    if word not in string.punctuation and
                                       word not in stoplist ] )

texts = []
Example #24
    def test_column_dups_operations(self):

        def check(result, expected=None):
            if expected is not None:
                assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # assignment
        # GH 3687
        arr = np.random.randn(3, 2)
        idx = lrange(2)
        df = DataFrame(arr, columns=['A', 'A'])
        df.columns = idx
        expected = DataFrame(arr, columns=idx)
        check(df, expected)

        idx = date_range('20130101', periods=4, freq='Q-NOV')
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['a', 'a', 'a', 'a'])
        df.columns = idx
        expected = DataFrame(
            [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
        check(df, expected)

        # insert
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        df['string'] = 'bah'
        expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'],
                              [2, 1, 3, 5, 'bah']],
                             columns=['foo', 'bar', 'foo', 'hello', 'string'])
        check(df, expected)
        with assertRaisesRegexp(ValueError, 'Length of value'):
            df.insert(0, 'AnotherColumn', range(len(df.index) - 1))

        # insert same dtype
        df['foo2'] = 3
        expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
                              [2, 1, 3, 5, 'bah', 3]],
                             columns=['foo', 'bar', 'foo', 'hello',
                                      'string', 'foo2'])
        check(df, expected)

        # set (non-dup)
        df['foo2'] = 4
        expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
                              [2, 1, 3, 5, 'bah', 4]],
                             columns=['foo', 'bar', 'foo', 'hello',
                                      'string', 'foo2'])
        check(df, expected)
        df['foo2'] = 3

        # delete (non dup)
        del df['bar']
        expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3],
                              [2, 3, 5, 'bah', 3]],
                             columns=['foo', 'foo', 'hello', 'string', 'foo2'])
        check(df, expected)

        # try to delete again (it's not consolidated)
        del df['hello']
        expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                              [2, 3, 'bah', 3]],
                             columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # consolidate
        df = df.consolidate()
        expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                              [2, 3, 'bah', 3]],
                             columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # insert
        df.insert(2, 'new_col', 5.)
        expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3],
                              [2, 3, 5., 'bah', 3]],
                             columns=['foo', 'foo', 'new_col', 'string',
                                      'foo2'])
        check(df, expected)

        # insert a dup
        assertRaisesRegexp(ValueError, 'cannot insert',
                           df.insert, 2, 'new_col', 4.)
        df.insert(2, 'new_col', 4., allow_duplicates=True)
        expected = DataFrame([[1, 1, 4., 5., 'bah', 3],
                              [1, 2, 4., 5., 'bah', 3],
                              [2, 3, 4., 5., 'bah', 3]],
                             columns=['foo', 'foo', 'new_col',
                                      'new_col', 'string', 'foo2'])
        check(df, expected)

        # delete (dup)
        del df['foo']
        expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3],
                              [4., 5., 'bah', 3]],
                             columns=['new_col', 'new_col', 'string', 'foo2'])
        assert_frame_equal(df, expected)

        # dup across dtypes
        df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        check(df)

        df['foo2'] = 7.
        expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.],
                              [2, 1, 3., 5, 7.]],
                             columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        result = df['foo']
        expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
                             columns=['foo', 'foo'])
        check(result, expected)

        # multiple replacements
        df['foo'] = 'string'
        expected = DataFrame([['string', 1, 'string', 5, 7.],
                              ['string', 1, 'string', 5, 7.],
                              ['string', 1, 'string', 5, 7.]],
                             columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        del df['foo']
        expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=[
                             'bar', 'hello', 'foo2'])
        check(df, expected)

        # values
        df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
        result = df.values
        expected = np.array([[1, 2.5], [3, 4.5]])
        self.assertTrue((result == expected).all().all())

        # rename, GH 4403
        df4 = DataFrame(
            {'TClose': [22.02],
             'RT': [0.0454],
             'TExg': [0.0422]},
            index=MultiIndex.from_tuples([(600809, 20130331)],
                                         names=['STK_ID', 'RPT_Date']))

        df5 = DataFrame({'STK_ID': [600809] * 3,
                         'RPT_Date': [20120930, 20121231, 20130331],
                         'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')],
                         'TClose': [38.05, 41.66, 30.01]},
                        index=MultiIndex.from_tuples(
                            [(600809, 20120930),
                             (600809, 20121231),
                             (600809, 20130331)],
                            names=['STK_ID', 'RPT_Date']))

        k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
        result = k.rename(
            columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'})
        str(result)
        result.dtypes

        expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809,
                                u('饡驦'), 30.01]],
                              columns=['RT', 'TClose', 'TExg',
                                       'RPT_Date', 'STK_ID', 'STK_Name',
                                       'QT_Close'])
                    .set_index(['STK_ID', 'RPT_Date'], drop=False))
        assert_frame_equal(result, expected)

        # reindex is invalid!
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        self.assertRaises(ValueError, df.reindex, columns=['bar'])
        self.assertRaises(ValueError, df.reindex, columns=['bar', 'foo'])

        # drop
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        result = df.drop(['a'], axis=1)
        expected = DataFrame([[1], [1], [1]], columns=['bar'])
        check(result, expected)
        result = df.drop('a', axis=1)
        check(result, expected)

        # describe
        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                       columns=['bar', 'a', 'a'], dtype='float64')
        result = df.describe()
        s = df.iloc[:, 0].describe()
        expected = pd.concat([s, s, s], keys=df.columns, axis=1)
        check(result, expected)

        # check column dups with index equal and not equal to df's index
        df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'],
                       columns=['A', 'B', 'A'])
        for index in [df.index, pd.Index(list('edcba'))]:
            this_df = df.copy()
            expected_ser = pd.Series(index.values, index=this_df.index)
            expected_df = DataFrame.from_items([('A', expected_ser),
                                                ('B', this_df['B']),
                                                ('A', expected_ser)])
            this_df['A'] = index
            check(this_df, expected_df)

        # operations
        for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
            df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
            expected = getattr(df, op)(df)
            expected.columns = ['A', 'A']
            df.columns = ['A', 'A']
            result = getattr(df, op)(df)
            check(result, expected)

        # multiple assignments that change dtypes
        # the location indexer is a slice
        # GH 6120
        df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
        expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])

        df['that'] = 1.0
        check(df, expected)

        df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
        expected = DataFrame(1, index=range(5), columns=['that', 'that'])

        df['that'] = 1
        check(df, expected)
Example #25
import json
import sys
from pandas import DataFrame

filename = sys.argv[1]

data = json.load(open(filename))
frame_data = [(k, (len(v['following']), len(v['followers']))) for k,v in data.items()]
frame = DataFrame.from_items(frame_data, orient='index', columns=['following', 'followers'])

print frame

print '## Following'
print frame['following'].describe()
print frame.sort('following', ascending=False)[:10]

print '## Followers'
print frame['followers'].describe()
print frame.sort('followers', ascending=False)[:10]
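DataFrame.sort was removed in pandas 0.20; sort_values is the modern spelling:

# sketch on pandas >= 0.20 (and Python 3 print)
print(frame.sort_values('following', ascending=False)[:10])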
Example #26
    def _run_comparison(self, random_trials=100,
                        sig_threshold=0.05, debug=False):
        self.log(
            'Running ' + str(random_trials) +
            ' random sets for each term and comparing them.')
        dens = dict()
        significant_terms = 0
        n = 0

        # Use only 25 terms for testing purposes
        if debug:
            ont_loci = list(self.ont_loci.items())[:25]
        else:
            ont_loci = list(self.ont_loci.items())

        # Iterate through all terms
        for term, loci in ont_loci:
            # Log how many terms are done, to reassure people it hasn't crashed
            if n % 100 == 0:
                self.log('Compared {}/{} terms so far.', n, len(ont_loci))

            real_density = self.cob.density(loci)
            dens[term] = [real_density]

            # Run the random samples
            loci_count = len(loci)
            loci_tot_list = self.cob.refgen.random_genes(
                loci_count*random_trials
            )
            scores = []
            aboves = 0
            for x in range(random_trials):
                # Get the random genes
                loci_list = [loci_tot_list.pop() for x in range(loci_count)]

                # Find the density and save it
                score = self.cob.density(loci_list)
                if score >= real_density:
                    aboves += 1
                scores.append(score)

            # Add on the stats from the scores
            dens[term].append(np.mean(scores))
            dens[term].append(np.std(scores))
            dens[term].append(aboves)

            # Figure out if that makes it significant
            if dens[term][-1] <= (random_trials*sig_threshold):
                dens[term].append(1)
                significant_terms += 1
            else:
                dens[term].append(0)
            n += 1
        self.log('Compared all {} terms.', n)

        # Convert the dict to a DataFrame
        ans = DataFrame.from_items(
            dens.items(),
            columns=[
                self.cob.name+' Density', 'Random Density Mean',
                'Random STD', 'Items >= '+self.cob.name, 'Significant'
            ],
            orient='index'
        )

        self.log('Number of Significant Terms: ' + str(significant_terms))
        self.log('Number Random Significants Expected: '+str(len(dens)*0.05))
        return ans
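The significance rule above is an empirical permutation-style test: aboves counts random trials whose density meets or exceeds the real density, so aboves <= random_trials * sig_threshold is the same as an empirical p-value at or below the threshold. Equivalently (a sketch):

p_value = aboves / random_trials       # empirical p-value of the observed density
significant = p_value <= sig_threshold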
Example #27
    def compute(self):
        """
        Compute inequality dataframe
        """
        final_df = None
        simulation = self.survey_scenario.new_simulation()
        column_by_name = simulation.tax_benefit_system.column_by_name

        # amounts and beneficiaries from current data and default data if exists
        # Build weights for each entity

        from openfisca_france_data import FILTERING_VARS
        for varname, entities in self.vars.iteritems():
            for entity_key_plural in entities:
                column = column_by_name[varname]
                weight_name = self.survey_scenario.weight_column_name_by_entity_key_plural[column.entity_key_plural]
                filter_by = FILTERING_VARS[0]
                filter_by_name = FILTERING_VARS[0]
                if column.entity_key_plural != 'menages':
                    filter_by_name = "{}_{}".format(filter_by, column.entity_key_plural)
                val = simulation.calculate(varname)
                weights = simulation.calculate(weight_name)
                filter_var = simulation.calculate(filter_by_name)

            items = []
            # Compute mean
            moy = (weights * filter_var * val).sum() / (weights * filter_var).sum()
            items.append(("Moyenne", [moy]))
            # Compute deciles
            labels = range(1, 11)
            method = 2
            decile, values = mark_weighted_percentiles(val, labels, weights * filter_var,
                                                       method, return_quantiles = True)
            labels = ['D' + str(d) for d in range(1, 11)]
            del decile
            for l, v in zip(labels[:-1], values[1:-1]):
                items.append((l, [v]))

            # Compute Gini
            gini_coeff = gini(val, weights * filter_var)
            items.append((_("Gini index"), [gini_coeff]))
            df = DataFrame.from_items(items, orient = 'index', columns = [varname])
            df = df.reset_index()
            if final_df is None:
                final_df = df
            else:
                final_df = final_df.merge(df, on='index')

        final_df[u"Initial à net"] = (final_df['nivvie_net'] - final_df['nivvie_ini']) / final_df['nivvie_ini']
        final_df[u"Net à disponible"] = (final_df['nivvie'] - final_df['nivvie_net']) / final_df['nivvie_net']
        final_df = final_df[['index', 'nivvie_ini', u"Initial à net", 'nivvie_net', u"Net à disponible", 'nivvie']]
        self.inequality_data_frame = final_df

        # Poverty
        poverty = dict()
        varname = "nivvie"
        for percentage in [40, 50, 60]:
            varname = "pauvre{}".format(percentage)
            column = column_by_name[varname]
            weight_name = self.survey_scenario.weight_column_name_by_entity_key_plural[column.entity_key_plural]
            filter_by_name = FILTERING_VARS[0]
            if column.entity_key_plural != 'menages':
                filter_by_name = "{}_{}".format(filter_by_name, column.entity_key_plural)
            val = simulation.calculate(varname)
            weights = simulation.calculate(weight_name)
            filter_var = simulation.calculate(filter_by_name)
            poverty[percentage] = (weights * filter_var * val).sum() / (weights * filter_var).sum()

        self.poverty = poverty
Example #28
        theta = compute_theta(data)    # M-step

        print("Run %d produced theta of:" % i)
        print_theta(theta)
        #log_likelihood(data, theta)

#===============================================

#TODO: infer variables and state sizes from data
nodes = ['T', 'E1', 'E2', 'E3', 'E4']
N  = len(nodes)

# create a blank adjacency matrix, then
# set the directed edges.  each row (node)
# should have a 1 in the column of each parent.
adj = DataFrame.from_items( [(node, Series(np.zeros(N, int))) for node in nodes] )
adj.index = nodes
adj.ix['E1', 'T'] = 1
adj.ix['E2', 'T'] = 1
adj.ix['E3', 'T'] = 1
adj.ix['E4', 'T'] = 1
print(adj)

# specify the TRUE joint distribution, theta.  specified as a
# dict of node -> cpt, where each cpt is a dict
# of comma-separated values of the ordered parents -> prob
theta = {}
theta['T'] = {'': {0: 0.75, 1: 0.25}}
theta['E1'] = {'0': {0: 0.45, 1: 0.55}, '1': {0: 0.05, 1: 0.95}, }
theta['E2'] = {'0': {0: 0.40, 1: 0.60}, '1': {0: 0.05, 1: 0.95}, }
theta['E3'] = {'0': {0: 0.50, 1: 0.50}, '1': {0: 0.10, 1: 0.90}, }
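draw is used in Example #8 but never shown; a minimal sketch consistent with the cpt format above (a dict mapping value -> probability) might be:

import random

def draw(pdt):
    # inverse-CDF sampling over a {value: probability} dict
    r = random.random()
    cumulative = 0.0
    for value, prob in pdt.items():
        cumulative += prob
        if r < cumulative:
            return value
    return value  # guard against floating-point shortfall in the probabilities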
Example #29
    def add_var2(self, varname, target=None, source = 'free'):
        """
        Add a variable in the dataframe

        Parameters
        ----------

        varname : str
                  name of the variable
        target : float
                 target for the margin of the variable
        source : str, default 'free'
                 database source
        """
        w_init = self.weights_init*self.champm
        w = self.weights*self.champm
        inputs = self.simulation.survey
        output_table = self.simulation.output_table

        varcol = self.simulation.get_col(varname)
        entity = self.entity
        enum = inputs.column_by_name.get('qui'+self.entity).enum
        people = [x[1] for x in enum]

        if varname in inputs.column_by_name:
            value = inputs.get_value(varname, index = idx)
        elif output_table is not None and varname in output_table.column_by_name:
            value = output_table.get_value(varname, index = idx, opt = people, sum_ = True)

        label = varcol.label
        # TODO: rewrite this using pivot table
        items = [('marge', w[self.champm]), ('marge initiale', w_init[self.champm])]
        if varcol.__class__  in MODCOLS:
            items.append(('mod',   value[self.champm]))
            df = DataFrame.from_items(items)
            res = df.groupby('mod', sort= True).sum()
        else:
            res = DataFrame(index = ['total'],
                            data = {'marge' : (value*w).sum(),
                                    'marge initiale' : (value*w_init).sum()  } )
        res.insert(0, u"modalités",u"")
        res.insert(2, "cible", 0)
        res.insert(2, u"cible ajustée", 0)
        res.insert(4, "source", source)
        mods = res.index

        if target is not None:
            if len(mods) != len(target.keys()):
                drop_indices = [ (varname, mod) for mod in target.keys()]
                if source == 'input':
                    self.input_margins_df.drop(drop_indices, inplace=True)
                    self.input_margins_df.index.names = ['var','mod']
                if source == 'output':
                    self.output_margins_df.drop(drop_indices, inplace=True)
                    self.output_margins_df.index.names = ['var','mod']
                return

        if isinstance(varcol, EnumCol):
            if varcol.enum:
                enum = varcol.enum
                res[u'modalités'] = [enum._vars[mod] for mod in mods]
                res['mod'] = mods
            else:
                res[u'modalités'] = [mod for mod in mods]
                res['mod'] = mods
        elif isinstance(varcol, BoolCol):
            res[u'modalités'] = bool(mods)
            res['mod']        = mods
        elif isinstance(varcol, IntCol):
            res[u'modalités'] = mods
            res['mod']        = mods
        elif isinstance(varcol, AgeCol):
            res[u'modalités'] = mods
            res['mod'] = mods
        else:
            res[u'modalités'] = "total"
            res['mod']  = 0

        if label is not None:
            res['variable'] = label
        else:
            res['variable'] = varname
        res['var'] = varname

        if target is not None:
            for mod, margin in target.iteritems():
                if mod == varname:    # dirty, to deal with non-categorical data
                    res['cible'][0] = margin
                else:
                    res['cible'][mod] = margin

        if self.frame is None:
            self.frame = res
        else:
            self.frame = concat([self.frame, res])

        self.frame = self.frame.reset_index(drop=True)
Example #30
    def compute(self):
        """
        Compute inequality dataframe
        """
        output = self.simulation.output_table
        final_df = None

        WEIGHT = model.WEIGHT
        FILTERING_VARS = model.FILTERING_VARS
        for varname, entities in self.vars.iteritems():
            for entity in entities:
                #idx =  output.index[entity]

                val  = output.get_value(varname, entity)
                weights = output._inputs.get_value(WEIGHT, entity)
                filter_var_name = FILTERING_VARS[0]
                filter_var= output._inputs.get_value(filter_var_name, entity)

            items = []
            # Compute mean
            moy = (weights*filter_var*val).sum()/(weights*filter_var).sum()
            items.append( ("Moyenne",  [moy]))

            # Compute deciles
            labels = range(1,11)
            method = 2
            decile, values = mark_weighted_percentiles(val, labels, weights*filter_var, method, return_quantiles=True)

            labels = [ 'D'+str(d) for d in range(1,11)]
            del decile
            for l, v in zip(labels[:-1],values[1:-1]):
                items.append( (l, [v]))

            # Compute Gini
            gini_coeff = gini(val, weights*filter_var)
            items.append( ( _("Gini index"), [gini_coeff]))

            df = DataFrame.from_items(items, orient = 'index', columns = [varname])
            df = df.reset_index()
            if final_df is None:
                final_df = df
            else:
                final_df = final_df.merge(df, on='index')

        final_df[u"Initial à net"] = (final_df['nivvie_net']-final_df['nivvie_ini'])/final_df['nivvie_ini']
        final_df[u"Net à disponible"] = (final_df['nivvie']-final_df['nivvie_net'])/final_df['nivvie_net']
        final_df = final_df[['index','nivvie_ini', u"Initial à net", 'nivvie_net',u"Net à disponible",'nivvie']]
        self.inequality_dataframe = final_df

        # poverty
        poverty = dict()
        entity = "men"
        varname = "nivvie"
        for percentage in [ 40, 50, 60]:
#            idx =  output.index[entity]
            varname = "pauvre" + str(percentage)
            val = output.get_value(varname, entity)
            weights = output._inputs.get_value(WEIGHT, entity)
            filter_var_name = FILTERING_VARS[0]
            filter_var= output._inputs.get_value(filter_var_name, entity)
            poverty[percentage] =  (weights*filter_var*val).sum()/(weights*filter_var).sum()

        self.poverty = poverty