Example #1
    def testDataFrame(self):
        df = DataFrame([[1, 2, 3], [4, 5, 6]], index=["a", "b"], columns=["x", "y", "z"])

        # column indexed
        outp = DataFrame(ujson.decode(ujson.encode(df)))
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)
        assert_array_equal(df.index, outp.index)

        outp = DataFrame(**ujson.decode(ujson.encode(df, orient="split")))
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)
        assert_array_equal(df.index, outp.index)

        outp = DataFrame(ujson.decode(ujson.encode(df, orient="records")))
        outp.index = df.index
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)

        outp = DataFrame(ujson.decode(ujson.encode(df, orient="values")))
        outp.index = df.index
        self.assertTrue((df.values == outp.values).all())

        outp = DataFrame(ujson.decode(ujson.encode(df, orient="index")))
        self.assertTrue((df.transpose() == outp).values.all())
        assert_array_equal(df.transpose().columns, outp.columns)
        assert_array_equal(df.transpose().index, outp.index)
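
A note on the orient="index" case above: decoding index-oriented JSON with the bare DataFrame constructor yields the transpose of the original frame, because the outer JSON keys (the row labels) become columns. A minimal sketch using only the public pandas API and the standard json module (rather than the ujson wrapper exercised by the test):

import json
from pandas import DataFrame

df = DataFrame([[1, 2, 3], [4, 5, 6]], index=["a", "b"], columns=["x", "y", "z"])

# orient="index" nests the values under the row labels, so rebuilding a frame
# from the raw dict swaps rows and columns.
decoded = json.loads(df.to_json(orient="index"))
outp = DataFrame(decoded)
assert (df.transpose() == outp).values.all()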
Example #2
    def testDataFrame(self):
        df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z'])

        # column indexed
        outp = DataFrame(ujson.decode(ujson.encode(df)))
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)
        assert_array_equal(df.index, outp.index)

        dec = _clean_dict(ujson.decode(ujson.encode(df, orient="split")))
        outp = DataFrame(**dec)
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)
        assert_array_equal(df.index, outp.index)

        outp = DataFrame(ujson.decode(ujson.encode(df, orient="records")))
        outp.index = df.index
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)

        outp = DataFrame(ujson.decode(ujson.encode(df, orient="values")))
        outp.index = df.index
        self.assertTrue((df.values == outp.values).all())

        outp = DataFrame(ujson.decode(ujson.encode(df, orient="index")))
        self.assertTrue((df.transpose() == outp).values.all())
        assert_array_equal(df.transpose().columns, outp.columns)
        assert_array_equal(df.transpose().index, outp.index)
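
The _clean_dict helper is not shown in this snippet. In the pandas test suite it only has to make the decoded keys usable as keyword arguments for DataFrame(**dec); a plausible sketch, assuming that is all it does:

def _clean_dict(d):
    # Coerce keys to plain str so the dict can be unpacked as keyword
    # arguments, e.g. DataFrame(**_clean_dict(decoded)).
    return {str(k): v for k, v in d.items()}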
Example #3
def network_perf(systems, unique_id, group_number, detail_options,
                 rampup_value=0, current_dir=""):
    have_net_data = False
    sets = search_item(systems, unique_id, "network", r"(.*)", [], [])
    modes = ['bandwidth', 'requests_per_sec']
    for mode in sorted(modes):
        results = {}
        for system in sets:
            net = []
            series = []
            global_perf = 0.0
            for perf in sets[system]:
                if perf[1] == mode:
                    if not perf[1] in net:
                        net.append(perf[1])
                    global_perf = global_perf + float(perf[3])

            series.append(global_perf)
            results[system] = Series(series, index=net)

        df = DataFrame(results)
        details = []
        matched_category = []
        for net in df.transpose().columns:
            if have_net_data is False:
                print()
                print("Group %d : Checking network disks perf" % group_number)
                have_net_data = True
            consistent = []
            curious = []
            unstable = []
            # How far the variance may be from the average (in %)
            tolerance_max = 15
            tolerance_min = 2

            print_perf(tolerance_min, tolerance_max, df.transpose()[net], df,
                       mode, net, consistent, curious, unstable, "",
                       rampup_value, current_dir)
            if mode == 'bandwidth':
                unit = "MB/sec"
            else:
                unit = "RRQ/sec"
            prepare_detail(detail_options, group_number, mode, net, details,
                           matched_category)
            print_summary("%-30s %s" % (mode, net), consistent, "consistent",
                          unit, df)
            print_summary("%-30s %s" % (mode, net), curious, "curious", unit,
                          df)
            print_summary("%-30s %s" % (mode, net), unstable, "unstable",
                          unit, df)

        print_detail(detail_options, details, df, matched_category)
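
Iterating over df.transpose().columns, as the loop above does, is the same as iterating over df.index: after the transpose the benchmark metrics become the columns and the systems become the rows. A minimal sketch of that relationship (host names and values are illustrative):

from pandas import DataFrame, Series

results = {"host-a": Series([120.5], index=["bandwidth"]),
           "host-b": Series([118.9], index=["bandwidth"])}
df = DataFrame(results)

# The columns of the transposed frame are the row labels of the original.
assert list(df.transpose().columns) == list(df.index)
# df.transpose()["bandwidth"] is the per-host series for that metric.
print(df.transpose()["bandwidth"])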
Example #4
    def unMap(self,X, Y, ycat):
        newX = []
        newY = []

        for i in range(X.shape[0]):
            # x = self.cats[0].categories[X[i]]
            # y = ycat.categories[Y[i]]
            # x = "ISIN%d" % (X[i])
            y = "Stripe %d" % (Y[i])
            # newX.append(x)
            newY.append(y)

        dataFrame = DataFrame([X, newY])

        dataFrame.transpose().to_csv("output.csv", index=False, header=["ISIN", "Risk_Stripe"])
Example #5
 def diagnostic_table(self, num_years=5, base_calc=None):
     table = []
     row_years = []
     calc = copy.deepcopy(self)
     base_calc = copy.deepcopy(base_calc)
     for i in range(0, num_years):
         if calc.behavior.has_response():
             base_calc.calc_all()
             behavior_calc = Behavior.response(base_calc, calc)
             behavior_calc.diagnostic_table_items(table)
         else:
             calc.calc_all()
             calc.diagnostic_table_items(table)
         row_years.append(calc.policy.current_year)
         if i < num_years - 1:
             calc.increment_year()
             if base_calc is not None:
                 base_calc.increment_year()
     df = DataFrame(table, row_years,
                    ['Returns (#m)', 'AGI ($b)', 'Itemizers (#m)',
                     'Itemized Deduction ($b)',
                     'Standard Deduction Filers (#m)',
                     'Standard Deduction ($b)', 'Personal Exemption ($b)',
                     'Taxable income ($b)', 'Regular Tax ($b)',
                     'AMT income ($b)', 'AMT amount ($b)',
                     'AMT number (#m)', 'Tax before credits ($b)',
                     'refundable credits ($b)',
                     'nonrefundable credits ($b)',
                     'Misc. Surtax ($b)',
                     'Ind inc tax ($b)', 'Payroll tax ($b)',
                     'Combined liability ($b)'])
     df = df.transpose()
     pd.options.display.float_format = '{:8,.1f}'.format
     return df
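
The table built above is a list of per-year rows; passing it to DataFrame with the years as the index and the item names as the columns, then transposing, gives one column per year and one row per item, which is the usual layout for a diagnostic table. A minimal sketch with illustrative numbers and only two items:

import pandas as pd
from pandas import DataFrame

table = [[151.0, 9000.0], [153.2, 9400.0]]   # one row per year (illustrative)
row_years = [2015, 2016]
df = DataFrame(table, row_years, ['Returns (#m)', 'AGI ($b)'])
df = df.transpose()                          # items as rows, years as columns
pd.options.display.float_format = '{:8,.1f}'.format
print(df)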
Example #6
def plot_phonemes(path):
    phoneme_embeddings = dict()
    for line in codecs.open(path,"r"):
        line = line.split(",")
        key= line[0][1:-1]
        emb = line[1:]
        emb[-1] = emb[-1][:-1]
        emb = np.array([float(e) for e in emb])
        phoneme_embeddings[key] = emb
    
    phoneme_embeddings = DataFrame(phoneme_embeddings,columns=phoneme_embeddings.keys())
    print(phoneme_embeddings.columns)
    
    m = TSNE()
    phoneme_embeddings_tsne = m.fit_transform(phoneme_embeddings.transpose())
    print(len(phoneme_embeddings_tsne))
    for p,emb in zip(phoneme_embeddings.columns, phoneme_embeddings_tsne):
        c = "black"
        if regex.search("^[aeiou3E][*]?$", p):
            c = "red"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*w~$", p):
            c = "blue"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*y~$", p):
            c = "yellow"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*h~$", p):
            c = "brown"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*\"$", p):
            c = "green"
            plt.annotate(p,(emb[0],emb[1]),color=c)
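
In the frame above each phoneme is a column and each row is one embedding dimension, so the transpose is needed before t-SNE: fit_transform expects one sample per row. A minimal sketch of just that step, assuming scikit-learn is available (the perplexity is lowered because the toy frame has only five phonemes):

import numpy as np
from pandas import DataFrame
from sklearn.manifold import TSNE

rng = np.random.default_rng(0)
# columns = phonemes, rows = embedding dimensions (illustrative data)
phoneme_embeddings = DataFrame(rng.normal(size=(16, 5)),
                               columns=["a", "e", "i", "o", "u"])

m = TSNE(n_components=2, perplexity=2, random_state=0)
points = m.fit_transform(phoneme_embeddings.transpose())
# One 2-D point per phoneme, i.e. per column of the original frame.
assert points.shape == (5, 2)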
Example #7
 def diagnostic_table(self, num_years=5, base_calc=None):
     table = []
     row_years = []
     calc = copy.deepcopy(self)
     base_calc = copy.deepcopy(base_calc)
     for i in range(0, num_years):
         has_behavior = (calc.behavior.BE_sub or calc.behavior.BE_inc or
                         calc.behavior.BE_CG_per)
         if has_behavior:
             base_calc.calc_all()
             behavior_calc = behavior(base_calc, calc)
             behavior_calc.diagnostic_table_items(table)
         else:
             calc.calc_all()
             calc.diagnostic_table_items(table)
         row_years.append(calc.policy.current_year)
         if i < num_years - 1:
             calc.increment_year()
             if base_calc is not None:
                 base_calc.increment_year()
     df = DataFrame(table, row_years,
                    ["Returns (#m)", "AGI ($b)", "Itemizers (#m)",
                     "Itemized Deduction ($b)",
                     "Standard Deduction Filers (#m)",
                     "Standard Deduction ($b)", "Personal Exemption ($b)",
                     "Taxable income ($b)", "Regular Tax ($b)",
                     "AMT income ($b)", "AMT amount ($b)",
                     "AMT number (#m)", "Tax before credits ($b)",
                     "refundable credits ($b)",
                     "nonrefundable credits ($b)",
                     "Misc. Surtax ($b)",
                     "Ind inc tax ($b)", "Payroll tax ($b)"])
     df = df.transpose()
     pd.options.display.float_format = '{:8,.1f}'.format
     return df
Example #8
def from_json_to_dataframe():
    results = json.load(open('./networks/first_level_analysis.json','r'))
    df = DataFrame(results)
    df.to_csv("panels.csv")
    dft = df.transpose()
    dft.to_csv("panels_trans.csv")
    return df
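
When the loaded JSON is a dict of dicts, the outer keys become the DataFrame columns, so the transposed copy written to panels_trans.csv has one row per outer key instead. A minimal sketch of the same reshaping on an in-memory dict (keys and numbers are illustrative):

from pandas import DataFrame

results = {"panel_a": {"nodes": 10, "edges": 21},
           "panel_b": {"nodes": 8, "edges": 14}}
df = DataFrame(results)        # columns: panel_a, panel_b; rows: nodes, edges
dft = df.transpose()           # rows: panel_a, panel_b; columns: nodes, edges
dft.to_csv("panels_trans.csv")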
Example #9
    def test_dataframe(self, orient, numpy):
        if orient == "records" and numpy:
            pytest.skip("Not idiomatic pandas")

        df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[
            "a", "b"], columns=["x", "y", "z"])
        encode_kwargs = {} if orient is None else dict(orient=orient)
        decode_kwargs = {} if numpy is None else dict(numpy=numpy)

        output = ujson.decode(ujson.encode(df, **encode_kwargs),
                              **decode_kwargs)

        # Ensure proper DataFrame initialization.
        if orient == "split":
            dec = _clean_dict(output)
            output = DataFrame(**dec)
        else:
            output = DataFrame(output)

        # Corrections to enable DataFrame comparison.
        if orient == "values":
            df.columns = [0, 1, 2]
            df.index = [0, 1]
        elif orient == "records":
            df.index = [0, 1]
        elif orient == "index":
            df = df.transpose()

        tm.assert_frame_equal(output, df, check_dtype=False)
Example #10
def analyze_first_level_panels():
    results = {}
    
    for d in first_level_topic_list:
        print("\n*********DESCRIPTOR: " + first_level_topic_list[d] + "(" + str(d) + ")")
        G = build_panel_network_by_descriptor(d)
        print("\nDESCRIPTOR: " + first_level_topic_list[d] + "(" + str(d) + ")")
        print("Nodes:", G.number_of_nodes())
        print("Edges:", G.number_of_edges())
        res_clique = analize_cliques(G)
        res_degree = analize_degrees(G)
        res_weight = analize_edges(G)
        d_final = dict(res_clique)
        d_final.update(res_degree)
        d_final.update(res_weight)
        d_final['id'] = d
        d_final['avg_clustering'] = nx.average_clustering(G)
        results[first_level_topic_list[d]] = d_final
        
    print("Writing json...")
    json.dump(results, open('./networks/first_level_panels_analysis.json','w'), indent=2)
    print("Writing csvs...")
    df = DataFrame(results)
    df.to_csv('./networks/first_level_panels_analysis.csv')
    dfinv = df.transpose()
    dfinv.to_csv('./networks/first_level_panels_analysis_inv.csv')
Example #11
 def send_to_db(self):
     conn = sqlite3.connect('data2.sqlite', timeout=30)
     c = conn.cursor()
     df = DataFrame(self.__dict__.items(), index=self.__dict__.keys())
     df = df.drop(0,1)
     df = df.transpose()
     df = df.sort(axis=1)
     df.to_sql('earnings_calendar', conn, if_exists='append', index=False)
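
Two calls above rely on APIs that are gone from current pandas: DataFrame.sort was removed in 0.20 (sort_index(axis=1) is the equivalent for sorting columns) and drop(0, 1) now needs an explicit axis keyword. A hedged sketch of the same idea on current pandas, building the one-row frame straight from the instance attributes (the class and its fields are illustrative):

import sqlite3
from pandas import DataFrame

class Earnings:
    def __init__(self):
        self.ticker = "ABC"
        self.eps = 1.25

    def send_to_db(self):
        conn = sqlite3.connect('data2.sqlite', timeout=30)
        # One row whose columns are the attribute names, sorted alphabetically.
        df = DataFrame([vars(self)]).sort_index(axis=1)
        df.to_sql('earnings_calendar', conn, if_exists='append', index=False)
        conn.close()

Earnings().send_to_db()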
Example #12
def make_league_df():
    from pandas import DataFrame
    from members import import_teams
    div_teams = import_teams()
    div_bins = load_all_substitute_bins()
    bins = sum(div_bins, [])
    targets = ['Jessica', 'Lexie']
    results = {}
    for target in targets:
        for bin_num, bin in enumerate(bins):
            for person in bin:
                if target in person:
                    break
        if target in results:
            break
    from pprint import pprint
    people = {}
    def find_team (name, teams):
        for team_idx, team in enumerate(teams):
            for person in team:
                if name in person:
                    return team_idx
        return -1
    for div_idx, (bins, teams) in enumerate(zip(div_bins, div_teams)):
        for bin_idx, bin in enumerate(bins):
            for person in bin:
                team_idx = find_team(person, teams)
                int_team = find_team(person, div_teams[1])
                people[person] = {'div': div_idx, 'bin': bin_idx,
                                  'team': team_idx, 'int_team': int_team}
    df = DataFrame(people)
    df = df.transpose()
    comp_busy = [4, 6, 7]  # really, 5, 7, 8
    int_busy = [11, 9, 1]  # really, 12, 10, 2

    print('\n\nbusy')
    brian_int = df.loc[df['bin'] == 0].loc[df['div'] == 2].loc[
        df['int_team'].isin(int_busy)]
    lexies = df.loc[df['bin'] == 1].loc[df['div'] == 2].loc[
        df['team'].isin(comp_busy)]
    jeses = df.loc[df['bin'] == 0].loc[df['div'] == 2].loc[
        df['team'].isin(comp_busy)]
    pprint(", ".join(brian_int.index.values))
    pprint(", ".join(lexies.index.values))
    pprint(", ".join(jeses.index.values))

    print('\n\nyes')
    brian_int = df.loc[df['bin']==0].loc[df['div']==2].loc[~df['int_team'].isin(int_busy)]
    lexies = df.loc[df['bin']==1].loc[df['div']==2].loc[~df['team'].isin(comp_busy)]
    jeses = df.loc[df['bin']==0].loc[df['div']==2].loc[~df['team'].isin(comp_busy)]
    pprint(len(brian_int))
    pprint(len(lexies))
    pprint(len(jeses))
    print(", ".join(brian_int.index.values))
    print(", ".join(lexies.index.values))
    print(", ".join(jeses.index.values))
Example #13
    def testDataFrameNumpy(self):
        df = DataFrame([[1, 2, 3], [4, 5, 6]], index=["a", "b"], columns=["x", "y", "z"])

        # column indexed
        outp = DataFrame(ujson.decode(ujson.encode(df), numpy=True))
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)
        assert_array_equal(df.index, outp.index)

        dec = _clean_dict(ujson.decode(ujson.encode(df, orient="split"), numpy=True))
        outp = DataFrame(**dec)
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)
        assert_array_equal(df.index, outp.index)

        outp = DataFrame(ujson.decode(ujson.encode(df, orient="index"), numpy=True))
        self.assertTrue((df.transpose() == outp).values.all())
        assert_array_equal(df.transpose().columns, outp.columns)
        assert_array_equal(df.transpose().index, outp.index)
Example #14
 def parse_data_model(self, full_df):
     data_model = {}
     levels = ['specimens', 'samples', 'sites', 'locations',
               'ages', 'measurements', 'criteria', 'contribution']
     for level in levels:
         df = DataFrame(full_df['tables'][level]['columns'])
         data_model[level] = df.transpose()
     # replace np.nan with None
     data_model[level] = data_model[level].where((pd.notnull(data_model[level])), None)
     return data_model
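
full_df['tables'][level]['columns'] maps each column name to its definition, so the frame built from it has one DataFrame column per data-model column; the transpose turns every definition into a row that can be looked up by name. A minimal sketch with an illustrative nested dict:

from pandas import DataFrame

columns = {"specimen": {"type": "String", "group": "Names"},
           "azimuth":  {"type": "Number", "group": "Orientation"}}
df = DataFrame(columns)
data_model = df.transpose()
# One row per column definition, indexed by the column name.
print(data_model.loc["azimuth", "type"])   # -> Number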
Example #15
def annotate(ann, ccols, ocols, clust, c):
    to_add = open(ann, 'r')
    head = next(to_add)
    head = head.rstrip('\n')
    bids = head.split('\t')
    # SHOULD HAVE ROW HEADERS
    Cols = bids[1:]
    maps = ('Reds', 'Reds', 'Greys', 'Greens')
    k = 0
    annot = []
    for line in to_add:
        line = line.rstrip('\n')
        data = line.split('\t')
        to_map = data[1:]
        rmap = []
        newCols = []
        # reorg data to match cluster

        for i in ccols:
            rmap.append(to_map[Cols.index(ocols[i])])
            newCols.append(ocols[i])
        rmap = np.asarray(rmap)
        Rows = []
        Rows.append(data[0])
        # flag if qualitative
        q = 0
        if isint(rmap[0]):
            rmap = rmap.astype(float)
        else:
            q = 1
            qdict = {}
            j = 0
            for i in range(0, len(rmap), 1):
                if rmap[i] not in qdict:
                    qdict[rmap[i]] = j
                    sys.stderr.write(str(j) + ' ' + rmap[i] + '\n')
                    j += 1
                rmap[i] = qdict[rmap[i]]
            rmap = rmap.astype(float)
        df = DataFrame(rmap, index=ocols, columns=Rows)

        df = df.transpose()
        new, cur = plt.subplots()
        cur = sns.heatmap(df, cmap=maps[k], rasterized=True)

        new.set_figheight(2)
        new.set_figwidth(c)
        new.set_dpi(600)
        cur.set_xticklabels(newCols, rotation=90)
        new.savefig('test' + str(k) + '.pdf')
        annot.append(new)
        k += 1
    return annot
Example #16
def append_village_areas(divname):
    im_vil = pd.read_csv('../data/%s_village_images.csv' % divname.lower())
    shape_helper = ShapeHelper('../data/shapefiles/fixed_village_shapefiles/%s/%s.shp' % (divname.lower(), divname.lower()),
                               lat_offset, lon_offset)
    areas = shape_helper.get_shape_areas('village')
    areas_df = DataFrame(areas, index=['area'])
    areas_df = areas_df.transpose()
    areas_df.reset_index(inplace=True)
    areas_df.rename(columns={'index': 'village'}, inplace=True)
    im_vil_areas = pd.merge(im_vil, areas_df, how='left')
    im_vil_areas.set_index('image', inplace=True)
    im_vil_areas.to_csv('../data/%s_village_areas_images.csv' % divname.lower())
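
get_shape_areas presumably returns a plain {village: area} dict; building the frame with index=['area'] makes the villages the columns, and the transpose plus reset_index turns them back into a mergeable 'village' column. A minimal sketch of that reshaping on an illustrative dict:

from pandas import DataFrame

areas = {"village_1": 2.4, "village_2": 5.1}
areas_df = DataFrame(areas, index=['area'])   # one row, one column per village
areas_df = areas_df.transpose()               # villages become the index
areas_df.reset_index(inplace=True)
areas_df.rename(columns={'index': 'village'}, inplace=True)
print(areas_df)                               # columns: village, area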
Example #17
 def parse_data_model(self, full_df):
     """
     Format the data model into a dictionary of DataFrames.
     """
     data_model = {}
     levels = ['specimens', 'samples', 'sites', 'locations',
               'ages', 'measurements', 'criteria', 'contribution',
               'images']
     criteria_map = DataFrame(full_df['criteria_map'])
     for level in levels:
         df = DataFrame(full_df['tables'][level]['columns'])
         data_model[level] = df.transpose()
     # replace np.nan with None
     data_model[level] = data_model[level].where((pd.notnull(data_model[level])), None)
     return data_model, criteria_map
Example #18
def file_prep(file):
    df = DataFrame(read_csv(file, sep='\t'))
    df.drop(df[df.apply(allele_count, axis=1) != 2].index, inplace=True)
    major_freqs = df.apply(major_prop_find, axis=1)
    major_alleles = df.apply(major_find, axis=1)
    df.insert(3, 'major_freqs', major_freqs)
    df.insert(3, 'major_alleles', major_alleles)
    df = df.transpose()
    
    
    chrom, chrom_idx = np.unique(df.loc['chrom'], return_index=True)
    
    super_missing_df = df == '.'
    
    chromosome_dict = {}
    for number in np.unique(df.loc['chrom']):
        chromosome_dict[number] = df.loc['chrom'][df.loc['chrom'] == number].index
    return df, super_missing_df, chromosome_dict
Example #19
def main():
    train_set = create_dataset(N)
    test_set = create_dataset(N)
    df_ws = DataFrame()

    fig = plt.figure()
    for c, m in enumerate(M):
        f, ws = resolve(train_set, m)
        df_ws = df_ws.append(Series(ws, name="M=%d" % m))

        subplot = fig.add_subplot(2, 2, c + 1)
        subplot.set_xlim(-0.05, 1.05)
        subplot.set_ylim(-1.5, 1.5)
        subplot.set_title("M=%d" % m)

        subplot.scatter(train_set.x, train_set.y, marker='o', color='blue')

        linex = np.linspace(0, 1, 101)
        liney = np.sin(2 * np.pi * linex)
        subplot.plot(linex, liney, color='green', linestyle='--')

        linex = np.linspace(0,1,101)
        # like probability density function
        liney = f(linex)
        label = "E(RMS)=%.2f" % rmse(train_set, f)
        subplot.plot(linex, liney, color='red', label=label)
        subplot.legend(loc=1)

    print("Table of the coefficients")
    print(df_ws.transpose())
    fig.show()

    df = DataFrame()
    for m in range(0, 10):
        f, ws = resolve(train_set, m)
        train_error = rmse(train_set, f)
        test_error = rmse(test_set, f)
        df = df.append(
                Series([train_error, test_error], index=['Training set', 'Test set']),
                ignore_index=True)

    df.plot(title='RMS Error', style=['-', '--'], grid=True, ylim=(0, 0.9))
    plt.show()
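
DataFrame.append, used twice above, was removed in pandas 2.0; collecting the Series and concatenating them once is the modern equivalent and yields the same transposed coefficient table. A hedged sketch of that part only (the resolve() fit is assumed and replaced by illustrative coefficients):

import pandas as pd
from pandas import Series

rows = []
for m, ws in [(0, [0.1]), (1, [0.2, -1.3]), (3, [0.3, 7.9, -25.4, 17.3])]:
    # one named row of fitted coefficients per model order (illustrative values)
    rows.append(Series(ws, name="M=%d" % m))

# Equivalent of the repeated df_ws.append(...): stack the Series as rows in one go.
df_ws = pd.concat(rows, axis=1).transpose()
print("Table of the coefficients")
print(df_ws.transpose())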
Example #20
    def apply(self, transforms, axis=0):
        if isinstance(transforms, Transform) \
        or \
        (isinstance(transforms, type) and issubclass(transforms, Transform)):
            transform = transforms #only a single object passed (not a list)
            return transform.__eapply__(self)

        elif isinstance(transforms, (types.FunctionType, types.BuiltinFunctionType, functools.partial)):
            func = transforms #only a single object passed (not a list)
            transformed_data_df = DataFrame(self.data_df.apply(func, axis=axis))

            # transpose to return the samples as column names rather than row names
            if axis == 0 : transformed_data_df = transformed_data_df.transpose()

            return self.with_data_df(transformed_data_df)

        elif isinstance(transforms, list):
            transformed_exp = self
            for transform in transforms:
                transformed_exp = transform.__eapply__(transformed_exp)
            return transformed_exp

        else:
            raise NotImplementedError
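
When the applied function aggregates each column to a single value (the common case here with axis=0), the result is a one-column frame indexed by sample name; the transpose restores the samples as column names, matching the layout of self.data_df. A minimal sketch of just that reshaping (the surrounding Transform machinery is assumed):

import numpy as np
from pandas import DataFrame

# rows = features, columns = samples (illustrative)
data_df = DataFrame([[1.0, 2.0], [3.0, 4.0]],
                    index=["gene1", "gene2"], columns=["sample1", "sample2"])

transformed_data_df = DataFrame(data_df.apply(np.mean, axis=0))
# One row, with the samples back as column names.
transformed_data_df = transformed_data_df.transpose()
print(transformed_data_df)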
Example #21
    def test_frame_from_json_to_json(self):

        def _check_orient(df, orient, dtype=None, numpy=True):
            df = df.sort()
            dfjson = df.to_json(orient=orient)
            unser = DataFrame.from_json(dfjson, orient=orient, dtype=dtype,
                                        numpy=numpy)
            unser = unser.sort()
            if df.index.dtype.type == np.datetime64:
                unser.index = DatetimeIndex(unser.index.values.astype('i8'))
            if orient == "records":
                # index is not captured in this orientation
                assert_almost_equal(df.values, unser.values)
                self.assert_(df.columns.equals(unser.columns))
            elif orient == "values":
                # index and cols are not captured in this orientation
                assert_almost_equal(df.values, unser.values)
            elif orient == "split":
                # index and col labels might not be strings
                unser.index = [str(i) for i in unser.index]
                unser.columns = [str(i) for i in unser.columns]
                unser = unser.sort()
                assert_almost_equal(df.values, unser.values)
            else:
                assert_frame_equal(df, unser)

        def _check_all_orients(df, dtype=None):
            _check_orient(df, "columns", dtype=dtype)
            _check_orient(df, "records", dtype=dtype)
            _check_orient(df, "split", dtype=dtype)
            _check_orient(df, "index", dtype=dtype)
            _check_orient(df, "values", dtype=dtype)

            _check_orient(df, "columns", dtype=dtype, numpy=False)
            _check_orient(df, "records", dtype=dtype, numpy=False)
            _check_orient(df, "split", dtype=dtype, numpy=False)
            _check_orient(df, "index", dtype=dtype, numpy=False)
            _check_orient(df, "values", dtype=dtype, numpy=False)

        # basic
        _check_all_orients(self.frame)
        self.assertEqual(self.frame.to_json(),
                         self.frame.to_json(orient="columns"))

        _check_all_orients(self.intframe, dtype=self.intframe.values.dtype)

        # big one
        # index and columns are strings as all unserialised JSON object keys
        # are assumed to be strings
        biggie = DataFrame(np.zeros((200, 4)),
                           columns=[str(i) for i in range(4)],
                           index=[str(i) for i in range(200)])
        _check_all_orients(biggie)

        # dtypes
        _check_all_orients(DataFrame(biggie, dtype=np.float64),
                           dtype=np.float64)
        _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int)
        _check_all_orients(DataFrame(biggie, dtype='<U3'), dtype='<U3')

        # empty
        _check_all_orients(self.empty_frame)

        # time series data
        _check_all_orients(self.tsframe)

        # mixed data
        index = pd.Index(['a', 'b', 'c', 'd', 'e'])
        data = {
            'A': [0., 1., 2., 3., 4.],
            'B': [0., 1., 0., 1., 0.],
            'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
            'D': [True, False, True, False, True]
        }
        df = DataFrame(data=data, index=index)
        _check_orient(df, "split")
        _check_orient(df, "records")
        _check_orient(df, "values")
        _check_orient(df, "columns")
        # index oriented is problematic as it is read back in in a transposed
        # state, so the columns are interpreted as having mixed data and
        # given object dtypes.
        # force everything to have object dtype beforehand
        _check_orient(df.transpose().transpose(), "index")
Example #22
def memory_perf(systems, unique_id, group_number, detail_options,
                rampup_value=0, current_dir=""):
    have_memory_data = False
    modes = ['1K', '4K', '1M', '16M', '128M', '256M', '1G', '2G']
    sets = search_item(systems, unique_id, "cpu", "(.*)", [], modes)
    for mode in sorted(modes):
        real_mode = "Memory benchmark %s" % mode
        results = {}
        threaded_perf = dict()
        forked_perf = dict()
        for system in sets:
            memory = []
            series = []
            found_data = ""
            threaded_perf[system] = 0
            forked_perf[system] = 0
            for perf in sets[system]:
                if mode in perf[2]:
                    # We shall split individual cpu benchmarking from
                    # the global one
                    if ("logical_" in perf[1] and
                            ("bandwidth_%s" % mode) in perf[2]):
                        if not perf[1] in memory:
                            memory.append(perf[1])
                        series.append(float(perf[3]))
                    elif "threaded_bandwidth_%s" % mode in perf[2]:
                        threaded_perf[system] = float(perf[3])
                        found_data = float(perf[3])
                    elif "forked_bandwidth_%s" % mode in perf[2]:
                        forked_perf[system] = float(perf[3])
                        found_data = float(perf[3])

            if found_data:
                # If no series are populated, it means that a single "All CPU"
                # run was done
                # If so, let's create a single run value
                if not series:
                    series.append(found_data)
                    memory.append("logical")

            results[system] = Series(series, index=memory)

        # No need to continue if no Memory data in this benchmark
        if not results:
            continue

        consistent = []
        curious = []
        unstable = []
        details = []
        matched_category = ''

        df = DataFrame(results)
        for memory in df.transpose().columns:
            if have_memory_data is False:
                print()
                print("Group %d : Checking Memory perf" % group_number)
                have_memory_data = True

            print_perf(1, 7, df.transpose()[memory], df, real_mode, memory,
                       consistent, curious, unstable, "", rampup_value,
                       current_dir)
            matched_category = []
            prepare_detail(detail_options, group_number, mode, memory,
                           details, matched_category)

        print_detail(detail_options, details, df, matched_category)
        print_summary(mode, consistent, "consistent", "MB/s", df)
        print_summary(mode, curious, "curious", "MB/s", df)
        print_summary(mode, unstable, "unstable", "MB/s", df)

        for bench_type in ["threaded", "forked"]:
            efficiency = {}
            have_forked_or_threaded = False
            if "threaded" in bench_type:
                mode_text = "Thread effi."
            else:
                mode_text = "Forked Effi."
            for system in sets:
                host_efficiency_full_load = []
                host_perf = df[system].sum()
                if (host_perf > 0 and threaded_perf[system] > 0 and
                        forked_perf[system] > 0):
                    have_forked_or_threaded = True
                    if "threaded" in bench_type:
                        host_efficiency_full_load.append(
                            threaded_perf[system] / host_perf * 100)
                    else:
                        host_efficiency_full_load.append(
                            forked_perf[system] / host_perf * 100)

                    efficiency[system] = Series(host_efficiency_full_load,
                                                index=[mode_text])

            details = []
            memory_eff = DataFrame(efficiency)
            if have_forked_or_threaded is True:
                consistent = []
                curious = []
                unstable = []

                for memory in memory_eff.transpose().columns:
                    print_perf(2, 10, memory_eff.transpose()[memory],
                               memory_eff, real_mode, memory, consistent,
                               curious, unstable)
                    matched_category = []
                    prepare_detail(detail_options, group_number, mode,
                                   memory, details, matched_category)

                # Let's pad if it's a thread or forked effi. in addition
                # to the block size
                if matched_category:
                    matched_category[0] += " " + mode_text

                print_detail(detail_options, details, memory_eff,
                             matched_category)
                print_summary(mode + " " + mode_text, consistent,
                              "consistent", "%", memory_eff)
                print_summary(mode + " " + mode_text, curious,
                              "curious", "%", memory_eff)
                print_summary(mode + " " + mode_text, unstable,
                              "unstable", "%", memory_eff)
            else:
                utils.do_print(real_mode, utils.Levels.WARNING,
                               "%-12s : Benchmark not run on this group",
                               mode_text)
Example #23
def cpu_perf(systems, unique_id, group_number, detail_options,
             rampup_value=0, current_dir=""):
    have_cpu_data = False
    host_cpu_list = search_item(systems, unique_id, "cpu", "(.*)", [],
                                ['product'])
    host_cpu_number = search_item(systems, unique_id, "cpu",
                                  "(.*logical.*)", [], ['number'])
    core_counts = 1
    for host in host_cpu_number:
        for item in host_cpu_number[host]:
            core_counts = item[3]
            break

    cpu_type = ''
    for host in host_cpu_list:
        for item in host_cpu_list[host]:
            cpu_type = item[3]
            break

    modes = ['bogomips', 'loops_per_sec']
    sets = search_item(systems, unique_id, "cpu", "(.*)", [], modes)
    global_perf = dict()
    for mode in sorted(modes):
        results = {}
        for system in sets:
            cpu = []
            series = []
            found_data = False
            for perf in sets[system]:
                if perf[2] == mode:
                    # We shall split individual cpu benchmarking from
                    # the global one
                    if "_" in perf[1]:
                        if not perf[1] in cpu:
                            cpu.append(perf[1])
                        series.append(float(perf[3]))
                        found_data = True
                    elif "loops_per_sec" in mode:
                        global_perf[system] = float(perf[3])
                        found_data = True

            if found_data is True:
                # If no series are populated, it means that a single
                # "All CPU" run was done
                # If so, let's create a single run value
                if not series:
                    series.append(global_perf[system])
                    cpu.append("logical")

                results[system] = Series(series, index=cpu)

        # No need to continue if no CPU data in this benchmark
        if not results:
            continue

        df = DataFrame(results)
        consistent = []
        curious = []
        unstable = []
        details = []
        matched_category = []

        for cpu in df.transpose().columns:
            if have_cpu_data is False:
                print()
                print("Group %d : Checking CPU perf" % group_number)
                have_cpu_data = True
            print_perf(2, 7, df.transpose()[cpu], df, mode, cpu, consistent,
                       curious, unstable, "", rampup_value, current_dir)
            prepare_detail(detail_options, group_number, mode, cpu, details,
                           matched_category)

        print_detail(detail_options, details, df, matched_category)

        print_summary(mode, consistent, "consistent", "", df, cpu_type)
        print_summary(mode, curious, "curious", "", df)
        print_summary(mode, unstable, "unstable", "", df)

        if mode == "loops_per_sec":
            efficiency = {}
            mode_text = 'CPU Effi.'
            consistent = []
            curious = []
            unstable = []
            details = []
            matched_category = []

            for system in sets:
                host_efficiency_full_load = []
                host_perf = (df[system].sum() *
                             (int(core_counts) / df[system].count()))
                host_efficiency_full_load.append(
                    global_perf[system] / host_perf * 100)
                efficiency[system] = Series(host_efficiency_full_load,
                                            index=[mode_text])

            cpu_eff = DataFrame(efficiency)
            print_perf(1, 2, cpu_eff.transpose()[mode_text], cpu_eff, mode,
                       mode_text, consistent, curious, unstable)
            prepare_detail(detail_options, group_number, mode, mode_text,
                           details, matched_category)

            print_detail(detail_options, details, cpu_eff, matched_category)
            print_summary("CPU Efficiency", consistent, "consistent", '%',
                          cpu_eff)
            print_summary("CPU Efficiency", curious, "curious", '%', cpu_eff)
            print_summary("CPU Efficiency", unstable, "unstable", '%', cpu_eff)
Example #24
def logical_disks_perf(systems, unique_id, group_number, detail_options,
                       perf_unit, rampup_value=0, current_dir=""):
    have_disk_data = False
    sets = search_item(systems, unique_id, "disk", r"[a-z]d(\S+)", [],
                       ['simultaneous', 'standalone'])
    modes = []

    # Searching for modes ran in this benchmark
    for system in sets:
        for perf in sets[system]:
            if perf[2] not in modes and perf_unit in perf[2]:
                modes.append(perf[2])

    if len(modes) == 0:
        return

    for mode in sorted(modes):
        results = {}
        for system in sets:
            disks = []
            series = []
            for perf in sets[system]:
                if perf[2] == mode:
                    if not perf[1] in disks:
                        disks.append(perf[1])
                    series.append(int(perf[3]))
            results[system] = Series(series, index=disks)

        df = DataFrame(results)
        details = []
        matched_category = []
        for disk in df.transpose().columns:
            if have_disk_data is False:
                print()
                print("Group %d : Checking logical disks perf" % group_number)
                have_disk_data = True
            consistent = []
            curious = []
            unstable = []
            # How far the variance may be from the average (in %)
            tolerance_max = 10
            tolerance_min = 2
            # In random mode, the variance could be higher as
            # we cannot ensure the distribution pattern was similar
            if "rand" in mode:
                tolerance_min = 5
                tolerance_max = 15

            print_perf(tolerance_min, tolerance_max, df.transpose()[disk], df,
                       mode, disk, consistent, curious, unstable, "-%s" % perf_unit,
                       rampup_value, current_dir)

            prepare_detail(detail_options, group_number, mode, disk, details,
                           matched_category)
            print_summary("%-30s %s" % (mode, disk), consistent, "consistent",
                          perf_unit, df)
            print_summary("%-30s %s" % (mode, disk), curious, "curious",
                          perf_unit, df)
            print_summary("%-30s %s" % (mode, disk), unstable, "unstable",
                          perf_unit, df)

        print_detail(detail_options, details, df, matched_category)
Example #25
    def test_frame_from_json_to_json(self):
        def _check_orient(df, orient, dtype=None, numpy=False,
                          convert_axes=True, check_dtype=True, raise_ok=None):
            df = df.sort()
            dfjson = df.to_json(orient=orient)

            try:
                unser = read_json(dfjson, orient=orient, dtype=dtype,
                                  numpy=numpy, convert_axes=convert_axes)
            except Exception as detail:
                if raise_ok is not None:
                    if isinstance(detail, raise_ok):
                        return
                    raise

            unser = unser.sort()

            if dtype is False:
                check_dtype=False

            if not convert_axes and df.index.dtype.type == np.datetime64:
                unser.index = DatetimeIndex(
                    unser.index.values.astype('i8') * 1e6)
            if orient == "records":
                # index is not captured in this orientation
                assert_almost_equal(df.values, unser.values)
                self.assertTrue(df.columns.equals(unser.columns))
            elif orient == "values":
                # index and cols are not captured in this orientation
                assert_almost_equal(df.values, unser.values)
            elif orient == "split":
                # index and col labels might not be strings
                unser.index = [str(i) for i in unser.index]
                unser.columns = [str(i) for i in unser.columns]
                unser = unser.sort()
                assert_almost_equal(df.values, unser.values)
            else:
                if convert_axes:
                    assert_frame_equal(df, unser, check_dtype=check_dtype)
                else:
                    assert_frame_equal(df, unser, check_less_precise=False,
                                       check_dtype=check_dtype)

        def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None):

            # numpy=False
            if convert_axes:
                _check_orient(df, "columns", dtype=dtype)
                _check_orient(df, "records", dtype=dtype)
                _check_orient(df, "split", dtype=dtype)
                _check_orient(df, "index", dtype=dtype)
                _check_orient(df, "values", dtype=dtype)

            _check_orient(df, "columns", dtype=dtype, convert_axes=False)
            _check_orient(df, "records", dtype=dtype, convert_axes=False)
            _check_orient(df, "split", dtype=dtype, convert_axes=False)
            _check_orient(df, "index", dtype=dtype, convert_axes=False)
            _check_orient(df, "values", dtype=dtype, convert_axes=False)

            # numpy=True and raise_ok might be not None, so ignore the error
            if convert_axes:
                _check_orient(df, "columns", dtype=dtype, numpy=True,
                              raise_ok=raise_ok)
                _check_orient(df, "records", dtype=dtype, numpy=True,
                              raise_ok=raise_ok)
                _check_orient(df, "split", dtype=dtype, numpy=True,
                              raise_ok=raise_ok)
                _check_orient(df, "index", dtype=dtype, numpy=True,
                              raise_ok=raise_ok)
                _check_orient(df, "values", dtype=dtype, numpy=True,
                              raise_ok=raise_ok)

            _check_orient(df, "columns", dtype=dtype, numpy=True,
                          convert_axes=False, raise_ok=raise_ok)
            _check_orient(df, "records", dtype=dtype, numpy=True,
                          convert_axes=False, raise_ok=raise_ok)
            _check_orient(df, "split", dtype=dtype, numpy=True,
                          convert_axes=False, raise_ok=raise_ok)
            _check_orient(df, "index", dtype=dtype, numpy=True,
                          convert_axes=False, raise_ok=raise_ok)
            _check_orient(df, "values", dtype=dtype, numpy=True,
                          convert_axes=False, raise_ok=raise_ok)

        # basic
        _check_all_orients(self.frame)
        self.assertEqual(self.frame.to_json(),
                         self.frame.to_json(orient="columns"))

        _check_all_orients(self.intframe, dtype=self.intframe.values.dtype)
        _check_all_orients(self.intframe, dtype=False)

        # big one
        # index and columns are strings as all unserialised JSON object keys
        # are assumed to be strings
        biggie = DataFrame(np.zeros((200, 4)),
                           columns=[str(i) for i in range(4)],
                           index=[str(i) for i in range(200)])
        _check_all_orients(biggie, dtype=False, convert_axes=False)

        # dtypes
        _check_all_orients(DataFrame(biggie, dtype=np.float64),
                           dtype=np.float64, convert_axes=False)
        _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int,
                           convert_axes=False)
        _check_all_orients(DataFrame(biggie, dtype='U3'), dtype='U3',
                           convert_axes=False, raise_ok=ValueError)

        # empty
        _check_all_orients(self.empty_frame)

        # time series data
        _check_all_orients(self.tsframe)

        # mixed data
        index = pd.Index(['a', 'b', 'c', 'd', 'e'])
        data = {
            'A': [0., 1., 2., 3., 4.],
            'B': [0., 1., 0., 1., 0.],
            'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
            'D': [True, False, True, False, True]
        }
        df = DataFrame(data=data, index=index)
        _check_orient(df, "split", check_dtype=False)
        _check_orient(df, "records", check_dtype=False)
        _check_orient(df, "values", check_dtype=False)
        _check_orient(df, "columns", check_dtype=False)
        # index oriented is problematic as it is read back in in a transposed
        # state, so the columns are interpreted as having mixed data and
        # given object dtypes.
        # force everything to have object dtype beforehand
        _check_orient(df.transpose().transpose(), "index", dtype=False)
Example #26
def read_lmw(admFile, datFile, kwaFile):
    with open(admFile) as f:
        administration = f.readlines()
    with open(datFile) as f:
        data = f.readlines()
    with open(kwaFile) as f:
        data_quality = f.readlines()

    if len(administration) != len(data):
        raise Exception("Input data is not of same length.")

    # LMW interval in minutes
    interval = 10
    val_series = []
    timestamp_series = []
    remoteid_series = []
    quality_series = []
    zom_win = []

    for i in range(len(administration)):
        values = administration[i].split(",")
        # Get the id of the time series
        timeseriesId = values[0].strip() +\
            "_" + values[1].strip() + "_" + values[3].strip()
        # Get the time of the first value
        if values[7].find('MET') == -1:
            zom_win = 'summer'

        values[7] = values[7].replace("JAN", "01")
        values[7] = values[7].replace("FEB", "02")
        values[7] = values[7].replace("MRT", "03")
        values[7] = values[7].replace("APR", "04")
        values[7] = values[7].replace("MEI", "05")
        values[7] = values[7].replace("JUN", "06")
        values[7] = values[7].replace("JUL", "07")
        values[7] = values[7].replace("AUG", "08")
        values[7] = values[7].replace("SEP", "09")
        values[7] = values[7].replace("OKT", "10")
        values[7] = values[7].replace("NOV", "11")
        values[7] = values[7].replace("DEC", "12")
        values[7] = values[7].replace("Z03", "")
        values[7] = values[7].replace("MET", "")
        values[7] = values[7].strip()

        if zom_win == 'summer':
            timeFirstValue = datetime.strptime(values[7], "%d-%m-%y %H:%M") -\
                timedelta(0, 0, 0, 0, 120)
        else:
            timeFirstValue = datetime.strptime(values[7], "%d-%m-%y %H:%M") -\
                timedelta(0, 0, 0, 0, 60)
        # Get all the measurements
        measurements = data[i].split(",")
        quality = data_quality[i].split(",")

        if len(measurements) != 7:
            raise Exception("Invalid number of measurements for time series.")

        if len(quality) != 7:
            raise Exception("Invalid number of quality flags for time series.")

        counter = 0
        for j in range(6):
            value = measurements[j].strip()
            value_flag = int(quality[j])
            if value != "f" and value != "n":
                TimeForValue = timeFirstValue +\
                    timedelta(0, 0, 0, 0, interval * j)
                val_series.append(float(value))
                timestamp_series.append(TimeForValue)
                remoteid_series.append(timeseriesId)
                counter += 1
                if value_flag in [10, 30, 50, 70]:
                    quality_series.append('0')
                elif value_flag in [2, 22, 24, 28, 42, 44, 48, 62, 68]:
                    quality_series.append('3')
                else:
                    quality_series.append('6')

    tsobj = DataFrame([remoteid_series,
                       val_series, quality_series])
    tsobj = tsobj.transpose()
    tsobj.columns = ['SensorID', 'value', 'flag']
    tstamp = DataFrame(timestamp_series, columns=['ts'])
    tsobj_indexed = tsobj.set_index(tstamp['ts'])
    return tsobj_indexed
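
Because the three series are passed as rows of the frame, the transpose is what turns them into the three columns that then get their names and the timestamp index. A minimal sketch of that final reshaping with illustrative values:

from datetime import datetime
from pandas import DataFrame

remoteid_series = ["LMW_01_H10", "LMW_01_H10"]
val_series = [1.2, 1.4]
quality_series = ['0', '0']
timestamp_series = [datetime(2014, 1, 1, 0, 0), datetime(2014, 1, 1, 0, 10)]

tsobj = DataFrame([remoteid_series, val_series, quality_series])  # 3 rows
tsobj = tsobj.transpose()                                         # 3 columns
tsobj.columns = ['SensorID', 'value', 'flag']
tstamp = DataFrame(timestamp_series, columns=['ts'])
tsobj_indexed = tsobj.set_index(tstamp['ts'])
print(tsobj_indexed)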
Example #27
    def test_frame_from_json_to_json(self):
        def _check_orient(
            df,
            orient,
            dtype=None,
            numpy=False,
            convert_axes=True,
            check_dtype=True,
            raise_ok=None,
            sort=None,
            check_index_type=True,
            check_column_type=True,
        ):
            if sort is not None:
                df = df.sort_values(sort)
            else:
                df = df.sort_index()

            # if we are not unique, then check that we are raising ValueError
            # for the appropriate orients
            if not df.index.is_unique and orient in ["index", "columns"]:
                self.assertRaises(ValueError, lambda: df.to_json(orient=orient))
                return
            if not df.columns.is_unique and orient in ["index", "columns", "records"]:
                self.assertRaises(ValueError, lambda: df.to_json(orient=orient))
                return

            dfjson = df.to_json(orient=orient)

            try:
                unser = read_json(dfjson, orient=orient, dtype=dtype, numpy=numpy, convert_axes=convert_axes)
            except Exception as detail:
                if raise_ok is not None:
                    if isinstance(detail, raise_ok):
                        return
                    raise

            if sort is not None and sort in unser.columns:
                unser = unser.sort_values(sort)
            else:
                unser = unser.sort_index()

            if dtype is False:
                check_dtype = False

            if not convert_axes and df.index.dtype.type == np.datetime64:
                unser.index = DatetimeIndex(unser.index.values.astype("i8") * 1e6)
            if orient == "records":
                # index is not captured in this orientation
                assert_almost_equal(df.values, unser.values)
                self.assertTrue(df.columns.equals(unser.columns))
            elif orient == "values":
                # index and cols are not captured in this orientation
                if numpy is True and df.shape == (0, 0):
                    assert unser.shape[0] == 0
                else:
                    assert_almost_equal(df.values, unser.values)
            elif orient == "split":
                # index and col labels might not be strings
                unser.index = [str(i) for i in unser.index]
                unser.columns = [str(i) for i in unser.columns]

                if sort is None:
                    unser = unser.sort_index()
                assert_almost_equal(df.values, unser.values)
            else:
                if convert_axes:
                    assert_frame_equal(
                        df,
                        unser,
                        check_dtype=check_dtype,
                        check_index_type=check_index_type,
                        check_column_type=check_column_type,
                    )
                else:
                    assert_frame_equal(df, unser, check_less_precise=False, check_dtype=check_dtype)

        def _check_all_orients(
            df, dtype=None, convert_axes=True, raise_ok=None, sort=None, check_index_type=True, check_column_type=True
        ):

            # numpy=False
            if convert_axes:
                _check_orient(df, "columns", dtype=dtype, sort=sort, check_index_type=False, check_column_type=False)
                _check_orient(df, "records", dtype=dtype, sort=sort, check_index_type=False, check_column_type=False)
                _check_orient(df, "split", dtype=dtype, sort=sort, check_index_type=False, check_column_type=False)
                _check_orient(df, "index", dtype=dtype, sort=sort, check_index_type=False, check_column_type=False)
                _check_orient(df, "values", dtype=dtype, sort=sort, check_index_type=False, check_column_type=False)

            _check_orient(df, "columns", dtype=dtype, convert_axes=False, sort=sort)
            _check_orient(df, "records", dtype=dtype, convert_axes=False, sort=sort)
            _check_orient(df, "split", dtype=dtype, convert_axes=False, sort=sort)
            _check_orient(df, "index", dtype=dtype, convert_axes=False, sort=sort)
            _check_orient(df, "values", dtype=dtype, convert_axes=False, sort=sort)

            # numpy=True and raise_ok might be not None, so ignore the error
            if convert_axes:
                _check_orient(
                    df,
                    "columns",
                    dtype=dtype,
                    numpy=True,
                    raise_ok=raise_ok,
                    sort=sort,
                    check_index_type=False,
                    check_column_type=False,
                )
                _check_orient(
                    df,
                    "records",
                    dtype=dtype,
                    numpy=True,
                    raise_ok=raise_ok,
                    sort=sort,
                    check_index_type=False,
                    check_column_type=False,
                )
                _check_orient(
                    df,
                    "split",
                    dtype=dtype,
                    numpy=True,
                    raise_ok=raise_ok,
                    sort=sort,
                    check_index_type=False,
                    check_column_type=False,
                )
                _check_orient(
                    df,
                    "index",
                    dtype=dtype,
                    numpy=True,
                    raise_ok=raise_ok,
                    sort=sort,
                    check_index_type=False,
                    check_column_type=False,
                )
                _check_orient(
                    df,
                    "values",
                    dtype=dtype,
                    numpy=True,
                    raise_ok=raise_ok,
                    sort=sort,
                    check_index_type=False,
                    check_column_type=False,
                )

            _check_orient(df, "columns", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok, sort=sort)
            _check_orient(df, "records", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok, sort=sort)
            _check_orient(df, "split", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok, sort=sort)
            _check_orient(df, "index", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok, sort=sort)
            _check_orient(df, "values", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok, sort=sort)

        # basic
        _check_all_orients(self.frame)
        self.assertEqual(self.frame.to_json(), self.frame.to_json(orient="columns"))

        _check_all_orients(self.intframe, dtype=self.intframe.values.dtype)
        _check_all_orients(self.intframe, dtype=False)

        # big one
        # index and columns are strings as all unserialised JSON object keys
        # are assumed to be strings
        biggie = DataFrame(np.zeros((200, 4)), columns=[str(i) for i in range(4)], index=[str(i) for i in range(200)])
        _check_all_orients(biggie, dtype=False, convert_axes=False)

        # dtypes
        _check_all_orients(DataFrame(biggie, dtype=np.float64), dtype=np.float64, convert_axes=False)
        _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int, convert_axes=False)
        _check_all_orients(DataFrame(biggie, dtype="U3"), dtype="U3", convert_axes=False, raise_ok=ValueError)

        # categorical
        _check_all_orients(self.categorical, sort="sort", raise_ok=ValueError)

        # empty
        _check_all_orients(self.empty_frame, check_index_type=False, check_column_type=False)

        # time series data
        _check_all_orients(self.tsframe)

        # mixed data
        index = pd.Index(["a", "b", "c", "d", "e"])
        data = {
            "A": [0.0, 1.0, 2.0, 3.0, 4.0],
            "B": [0.0, 1.0, 0.0, 1.0, 0.0],
            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
            "D": [True, False, True, False, True],
        }
        df = DataFrame(data=data, index=index)
        _check_orient(df, "split", check_dtype=False)
        _check_orient(df, "records", check_dtype=False)
        _check_orient(df, "values", check_dtype=False)
        _check_orient(df, "columns", check_dtype=False)
        # index oriented is problematic as it is read back in in a transposed
        # state, so the columns are interpreted as having mixed data and
        # given object dtypes.
        # force everything to have object dtype beforehand
        _check_orient(df.transpose().transpose(), "index", dtype=False)
Example #28
    def __init__(self, workbench, data_path = "/home/moritz/people/MoreData/genomes/TOBG/", clean = False):
        Database.__init__(self,workbench = workbench, data_path = data_path)

        wb = load_workbook("metadata/Table3_GenomeStats.xlsx")
        t_metadata = DataFrame([l for i,l in enumerate(wb['Sheet1'].values) if i >1], columns=[l for l in wb['Sheet1'].values][1])
        corrected = { u'\xc2Gemmatimonadetes': 'Gemmatimonadetes' ,
        'marinegroup': 'Puniceicoccaceae',
        'Urania1B19': 'Phycisphaerae',
        'Thalassopira' : 'Thalassospira',
        'SM1A02': 'Phycisphaerae',
        'SAR324cluster': 'SAR324 cluster',
        'unclassifiedAlphaproteobacteria': 'Alphaproteobacteria',
        'SAR202-2': 'SAR202 cluster',
        'SAR202-1': 'SAR202 cluster',
        'SAR116cluster' : 'SAR116 cluster',
        'OPB35soil': 'unidentified Verrucomicrobium group OPB35',
        'Pla3': 'Planctomycetes',
        'OM190': 'Planctomycetes',
        'NovelClass_B': 'Ignavibacteriae',
        'Nitropelagicus': 'Candidatus Nitrosopelagicus' ,
        'Nanoarchaoeta': 'Nanoarchaeota',
        'Methylobacterum': 'Methylobacterium',
        'JL-ENTP-F27': 'Phycisphaerae',
        'FS140-16B-02marinegroup': 'Phycisphaerae',
        'Epsilonbacteraeota': 'Bacteria',
        'DEV007': 'Verrucomicrobiales',
        'CandidatusPuniceispirillum': 'Candidatus Puniceispirillum',
        'CandidatePhylaRadiation' : 'Bacteria candidate phyla',
        'CaThioglobus': 'Candidatus Thioglobus',
        'CaAtelocyanobacterium' : 'Candidatus Atelocyanobacterium',
        '0319-6G20': 'Bdellovibrionales',
        'Euryarcheota' : 'Euryarchaeota' ,
        'SBR1093' : 'Bacteria',
        'Euryarcheoata' : 'Euryarchaeota'
        }

        regions = { 'NP' : 'North_Pacific',
        'NAT' : 'North_Atlantic',
        'MED' : 'Mediterranean',
        'ARS' : 'Arabian_Sea',
        'RS'  : 'Red_Sea',
        'IN'  : 'Indian_Ocean',
        'EAC' : 'East_Africa_Coastal',
        'SAT' : 'South_Atlantic',
        'CPC' : 'Chile_Peru_Coastal',
        'SP'  : 'South_Pacific'
        }

        wb2 = load_workbook("metadata/Table4_Phylogeny.xlsx")
        taxos = { l[0] : [v for v in l[:-1] if v != 'null' and not v[0:4] == "nove" ][-1] for l in wb2['Hug set'].values}
        taxos = {k : corrected[v] if v in corrected else v for k, v in taxos.items()}

        tax_2_id = self.taxDb.get_name_translator(taxos.values())
        tax_ids = {g : tax_2_id.get(taxos[g])[0]  for g in t_metadata['Genome ID'] if g in taxos }
        t_metadata['species_taxid'] = [ tax_ids[g] if g in tax_ids else 131567 for g in t_metadata['Genome ID']]
        t_metadata.index = Index(t_metadata['Genome ID'])
        t_metadata['region'] = [regions[g.split("_")[1].split("-")[0]] for g in t_metadata['Genome ID']]
        self.metadata = t_metadata.transpose().to_dict()

        print("Loading genomes")
        if os.path.exists(pjoin(self.data_path , 'TOBGGENOMES.tar.gz')):
            os.system("tar xzvf " + pjoin(self.data_path , 'TOBGGENOMES.tar.gz'))
            os.remove(pjoin(self.data_path , 'TOBGGENOMES.tar.gz'))

        for k,v in tqdm(self.metadata.items()):
            genome_path = pjoin(self.data_path, v['region'], k)
            genome_file = pjoin(genome_path, k + ".fna")
            if not os.path.exists(genome_file):
                os.makedirs(pjoin(genome_path, 'original_files'))
                shutil.move(self.data_path + k + ".fna", pjoin(genome_path, 'original_files'))
            self.genomes += [Genome(k, genome_path, ref=pjoin(genome_path, 'original_files', k + ".fna"), manual_metadata = v, taxDb = self.taxDb, workbench = self.workbench)]
Example #29
0
    def test_frame_from_json_to_json(self):
        def _check_orient(df, orient, dtype=None, numpy=False,
                          convert_axes=True, check_dtype=True, raise_ok=None,
                          sort=None, check_index_type=True,
                          check_column_type=True, check_numpy_dtype=False):
            if sort is not None:
                df = df.sort_values(sort)
            else:
                df = df.sort_index()

            # if we are not unique, then check that we are raising ValueError
            # for the appropriate orients
            if not df.index.is_unique and orient in ['index', 'columns']:
                pytest.raises(
                    ValueError, lambda: df.to_json(orient=orient))
                return
            if (not df.columns.is_unique and
                    orient in ['index', 'columns', 'records']):
                pytest.raises(
                    ValueError, lambda: df.to_json(orient=orient))
                return

            dfjson = df.to_json(orient=orient)

            try:
                unser = read_json(dfjson, orient=orient, dtype=dtype,
                                  numpy=numpy, convert_axes=convert_axes)
            except Exception as detail:
                if raise_ok is not None:
                    if isinstance(detail, raise_ok):
                        return
                raise

            if sort is not None and sort in unser.columns:
                unser = unser.sort_values(sort)
            else:
                unser = unser.sort_index()

            if dtype is False:
                check_dtype = False

            if not convert_axes and df.index.dtype.type == np.datetime64:
                unser.index = DatetimeIndex(
                    unser.index.values.astype('i8') * 1e6)
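                # (to_json emits epoch timestamps in milliseconds by default,
                # so astype('i8') * 1e6 rescales them to the nanoseconds that
                # DatetimeIndex expects)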
            if orient == "records":
                # index is not captured in this orientation
                tm.assert_almost_equal(df.values, unser.values,
                                       check_dtype=check_numpy_dtype)
                tm.assert_index_equal(df.columns, unser.columns,
                                      exact=check_column_type)
            elif orient == "values":
                # index and cols are not captured in this orientation
                if numpy is True and df.shape == (0, 0):
                    assert unser.shape[0] == 0
                else:
                    tm.assert_almost_equal(df.values, unser.values,
                                           check_dtype=check_numpy_dtype)
            elif orient == "split":
                # index and col labels might not be strings
                unser.index = [str(i) for i in unser.index]
                unser.columns = [str(i) for i in unser.columns]

                if sort is None:
                    unser = unser.sort_index()
                tm.assert_almost_equal(df.values, unser.values,
                                       check_dtype=check_numpy_dtype)
            else:
                if convert_axes:
                    tm.assert_frame_equal(df, unser, check_dtype=check_dtype,
                                          check_index_type=check_index_type,
                                          check_column_type=check_column_type)
                else:
                    tm.assert_frame_equal(df, unser, check_less_precise=False,
                                          check_dtype=check_dtype)

        def _check_all_orients(df, dtype=None, convert_axes=True,
                               raise_ok=None, sort=None, check_index_type=True,
                               check_column_type=True):

            # numpy=False
            if convert_axes:
                _check_orient(df, "columns", dtype=dtype, sort=sort,
                              check_index_type=False, check_column_type=False)
                _check_orient(df, "records", dtype=dtype, sort=sort,
                              check_index_type=False, check_column_type=False)
                _check_orient(df, "split", dtype=dtype, sort=sort,
                              check_index_type=False, check_column_type=False)
                _check_orient(df, "index", dtype=dtype, sort=sort,
                              check_index_type=False, check_column_type=False)
                _check_orient(df, "values", dtype=dtype, sort=sort,
                              check_index_type=False, check_column_type=False)

            _check_orient(df, "columns", dtype=dtype,
                          convert_axes=False, sort=sort)
            _check_orient(df, "records", dtype=dtype,
                          convert_axes=False, sort=sort)
            _check_orient(df, "split", dtype=dtype,
                          convert_axes=False, sort=sort)
            _check_orient(df, "index", dtype=dtype,
                          convert_axes=False, sort=sort)
            _check_orient(df, "values", dtype=dtype,
                          convert_axes=False, sort=sort)

            # numpy=True; raise_ok may be non-None, so errors of that type are tolerated
            if convert_axes:
                _check_orient(df, "columns", dtype=dtype, numpy=True,
                              raise_ok=raise_ok, sort=sort,
                              check_index_type=False, check_column_type=False)
                _check_orient(df, "records", dtype=dtype, numpy=True,
                              raise_ok=raise_ok, sort=sort,
                              check_index_type=False, check_column_type=False)
                _check_orient(df, "split", dtype=dtype, numpy=True,
                              raise_ok=raise_ok, sort=sort,
                              check_index_type=False, check_column_type=False)
                _check_orient(df, "index", dtype=dtype, numpy=True,
                              raise_ok=raise_ok, sort=sort,
                              check_index_type=False, check_column_type=False)
                _check_orient(df, "values", dtype=dtype, numpy=True,
                              raise_ok=raise_ok, sort=sort,
                              check_index_type=False, check_column_type=False)

            _check_orient(df, "columns", dtype=dtype, numpy=True,
                          convert_axes=False, raise_ok=raise_ok, sort=sort)
            _check_orient(df, "records", dtype=dtype, numpy=True,
                          convert_axes=False, raise_ok=raise_ok, sort=sort)
            _check_orient(df, "split", dtype=dtype, numpy=True,
                          convert_axes=False, raise_ok=raise_ok, sort=sort)
            _check_orient(df, "index", dtype=dtype, numpy=True,
                          convert_axes=False, raise_ok=raise_ok, sort=sort)
            _check_orient(df, "values", dtype=dtype, numpy=True,
                          convert_axes=False, raise_ok=raise_ok, sort=sort)

        # basic
        _check_all_orients(self.frame)
        assert self.frame.to_json() == self.frame.to_json(orient="columns")

        _check_all_orients(self.intframe, dtype=self.intframe.values.dtype)
        _check_all_orients(self.intframe, dtype=False)

        # big one
        # index and columns are strings as all unserialised JSON object keys
        # are assumed to be strings
        biggie = DataFrame(np.zeros((200, 4)),
                           columns=[str(i) for i in range(4)],
                           index=[str(i) for i in range(200)])
        _check_all_orients(biggie, dtype=False, convert_axes=False)
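        # JSON object keys are always strings, hence the str(i) labels above;
        # a small hedged illustration with a throwaway frame (not a fixture):
        tiny = DataFrame([[1]], index=[7], columns=[5]).to_json()
        assert list(read_json(tiny, convert_axes=False).index) == ["7"]
        assert list(read_json(tiny).index) == [7]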

        # dtypes
        _check_all_orients(DataFrame(biggie, dtype=np.float64),
                           dtype=np.float64, convert_axes=False)
        _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int,
                           convert_axes=False)
        _check_all_orients(DataFrame(biggie, dtype='U3'), dtype='U3',
                           convert_axes=False, raise_ok=ValueError)

        # categorical
        _check_all_orients(self.categorical, sort='sort', raise_ok=ValueError)

        # empty
        _check_all_orients(self.empty_frame, check_index_type=False,
                           check_column_type=False)

        # time series data
        _check_all_orients(self.tsframe)

        # mixed data
        index = pd.Index(['a', 'b', 'c', 'd', 'e'])
        data = {'A': [0., 1., 2., 3., 4.],
                'B': [0., 1., 0., 1., 0.],
                'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
                'D': [True, False, True, False, True]}
        df = DataFrame(data=data, index=index)
        _check_orient(df, "split", check_dtype=False)
        _check_orient(df, "records", check_dtype=False)
        _check_orient(df, "values", check_dtype=False)
        _check_orient(df, "columns", check_dtype=False)
        # index oriented is problematic as it is read back in in a transposed
        # state, so the columns are interpreted as having mixed data and
        # given object dtypes.
        # force everything to have object dtype beforehand
        _check_orient(df.transpose().transpose(), "index", dtype=False)
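        # the uniqueness guard in _check_orient mirrors to_json itself: orients
        # that key on labels refuse duplicates, while "records" never emits the
        # index at all (sketch with a throwaway frame, not a fixture)
        dup = DataFrame([[1], [2]], index=["a", "a"], columns=["x"])
        pytest.raises(ValueError, lambda: dup.to_json(orient="index"))
        dup.to_json(orient="records")  # fine: no index in this orientation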
Example #30
0
def ComputeMetrics1(stats, filename):
    """
    Compute per-article maintainer-vs-others metrics and write them to CSV.

    :Parameters:
        stats : list of dict
            Per-article statistics (actions, token counts, maintenance tags)
            as collected upstream.
        filename : str
            Path of the CSV file the resulting table is written to.

    :Return:
        DataFrame with one row per article title and one column per metric.
    """

    data = {}

    for article in stats:
        metrics = {}
        temp = {}
        
        title = article['article-title']
        
        # get metrics from data
        allActions = GetMetric(article, 'total-actions')
        number_tokens = GetMetric(article, 'number-tokens')
        maintainanceTag = GetMetric(article, 'tag-maintained')

        # split metrics between maintainer and others
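        # ('maintainers' and 'index' are assumed to be defined at module level;
        # they are not parameters of this function)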
        addsMaintainer, addsOthers = SplitMO(article, maintainers[index], 'tokens-added')
        deletesMaintainer, deletesOthers = SplitMO(article, maintainers[index], 'tokens-deleted')
        revertsMaintainer, revertsOthers = SplitMO(article, maintainers[index], 'tokens-reverted')
        antActionsMaintainer, antActionsOthers = SplitMO(article, maintainers[index], 'antagonistic-actions')
        reintroMaintainer, reintroOthers = SplitMO(article, maintainers[index], 'tokens-reintroduced')
        selfreintroMaintainer, selfreintroOthers = SplitMO(article, maintainers[index], 'tokens-self-reintroduced')
        talkpageMaintainer, talkpageOthers = SplitMO(article, maintainers[index], 'talkpage-edits')

        # token ownership of the maintainer over time (absolute and relative counts)
        ownershipMaintainerAbs = GetOwnership(article, maintainers_id[index], 'tokens-absolute')
        ownershipMaintainerRel = GetOwnership(article, maintainers_id[index], 'tokens-relative')
        
        # get properties of article
        metrics['firstMaintRev'] = GetFirstMaintainedRev(maintainanceTag)
        metrics['maintainer-name'] = article['maintainer-name']
        metrics['maintainer-id'] = article['maintainer-id']
        metrics['all-actions'] = sum(allActions)
        metrics['edits-maintainer'] = len(addsMaintainer)
        metrics['edits-others'] = len(addsOthers)
        metrics['number-revisions'] = metrics['edits-maintainer'] + metrics['edits-others']
        
        # temporal comparison
        TempCompare()


        # relativizing by edit counts is just an assumption, made to have some basis for comparison
        if talkpageOthers:
            #metrics['talkPageRatio'] = sum(talkpageMaintainer) / float(metrics['edits-maintainer']) / float( sum(talkpageOthers) / float(metrics['edits-others']) )
            metrics['talkPageRatio'] = sum(talkpageMaintainer) / float(sum(talkpageOthers))
        else:
            metrics['talkPageRatio'] = 0

        # if metrics['all-actions'] is 0:
        #     metrics['addsMaintainerAvg'] = 0
        #     metrics['addsOthersAvg'] = 0
        #     metrics['addsRatio'] = 0
            
        #     metrics['deletesMaintainerRel'] = 0
        #     metrics['deletesOthersRel'] = 0
        #     metrics['deletesRatio'] = 0
            
        #     metrics['revertsMaintainerRel'] = 0
        #     metrics['revertsOthersRel'] = 0
        #     metrics['revertsRatio'] = 0

        #     metrics['reintroMaintainerAvg'] = 0
        #     metrics['reintroOthersAvg'] = 0
        #     metrics['selfreintroMaintainerAvg'] = 0
        #     metrics['selfreintroOthersAvg'] = 0
        #     metrics['selfreintroRatio'] = 0

        #     metrics['antActionsMaintainerAvg'] = 0
        #     metrics['antActionsOthersAvg'] = 0
        #     metrics['negActionsRatio'] = 0

        #     metrics['targetedIntroRatio'] = 0
        # metrics['addsMaintainerRel'] = sum(addsMaintainer)/float(metrics['all-actions'])
        # metrics['addsOthersRel'] = sum(addsOthers)/float(metrics['all-actions'])
        # metrics['addsRatio'] = metrics['addsMaintainerRel'] / float(metrics['addsOthersRel'])
        # metrics['deletesMaintainerRel'] = sum(deletesMaintainer)/float(metrics['all-actions'])
        # metrics['deletesOthersRel'] = sum(deletesOthers)/float(metrics['all-actions'])
        # metrics['deletesRatio'] = metrics['deletesMaintainerRel'] / float(metrics['deletesOthersRel'])
        # metrics['revertsMaintainerRel'] = sum(revertsMaintainer)/float(metrics['all-actions'])
        # metrics['revertsOthersRel'] = sum(revertsOthers)/float(metrics['all-actions'])
        # metrics['revertsRatio'] = metrics['revertsMaintainerRel'] / float(metrics['revertsOthersRel'])
        # metrics['reintroMaintainerRel'] = sum(reintroMaintainer)/float(metrics['all-actions'])
        # metrics['reintroOthersRel'] = sum(reintroOthers)/float(metrics['all-actions']) 
        # metrics['selfreintroMaintainerRel'] = sum(selfreintroMaintainer)/float(metrics['all-actions'])
        # metrics['selfreintroOthersRel'] = sum(selfreintroOthers)/float(metrics['all-actions']) 
        # if metrics['selfreintroOthersAvg'] == 0:
        #     metrics['selfreintroRatio'] = 0
        # else:
        #     metrics['selfreintroRatio'] = metrics['selfreintroMaintainerAvg'] / float(metrics['selfreintroOthersAvg'])
        # if metrics['antActionsOthersAvg'] == 0:
        #     metrics['antActionsRatio'] = 0
        # else:
        #     metrics['antActionsRatio'] = metrics['antActionsMaintainerAvg'] / float(metrics['antActionsOthersAvg'])
        
        # if metrics['reintroMaintainerAvg'] == 0 or metrics['selfreintroOthersAvg'] == 0 or metrics['reintroOthersAvg'] == 0:
        #     metrics['targetedIntroRatio'] = 0
        #     metrics['targetedIntroRatio2Ownership'] = 0
        # else:
        #     metrics['targetedIntroRatio'] = (metrics['selfreintroMaintainerAvg'] / float(metrics['reintroMaintainerAvg'])) \
        #         / float((metrics['selfreintroOthersAvg'] / float(metrics['reintroOthersAvg'])))
        #     #metrics['targetedIntroRatio2Ownership'] = (metrics['selfreintroMaintainerRel'] / float(metrics['reintroMaintainerRel'])) \
        #     #    / float((metrics['selfreintroOthersRel'] / float(metrics['reintroOthersRel'])))

        metrics['addsMaintainerAvg'] = sum(addsMaintainer)/float(metrics['edits-maintainer'])
        metrics['addsOthersAvg'] = sum(addsOthers)/float(metrics['edits-others'])
        metrics['addsRatio'] = metrics['addsMaintainerAvg'] / float(metrics['addsOthersAvg'])
        metrics['reintroMaintainerAvg'] = sum(reintroMaintainer) / float(metrics['edits-maintainer'])
        metrics['reintroOthersAvg'] = sum(reintroOthers) / float(metrics['edits-others']) 
        metrics['reintroRatio'] = metrics['reintroMaintainerAvg'] / float(metrics['reintroOthersAvg'])
        metrics['selfreintroMaintainerAvg'] = sum(selfreintroMaintainer) / float(metrics['edits-maintainer'])
        metrics['selfreintroOthersAvg'] = sum(selfreintroOthers) / float(metrics['edits-others']) 
        metrics['selfreintroRatio'] = metrics['selfreintroMaintainerAvg'] / float(metrics['selfreintroOthersAvg'])
        metrics['antActionsMaintainerAvg'] = sum(antActionsMaintainer)/float(metrics['edits-maintainer']) 
        metrics['antActionsOthersAvg'] = sum(antActionsOthers)/float(metrics['edits-others'])
        
        # metrics['deletesMaintainerAvg'] = sum(deletesMaintainer)/float(metrics['edits-maintainer'])
        # metrics['deletesOthersAvg'] = sum(deletesOthers)/float(metrics['edits-others'])
        # metrics['deletesRatio'] = sum(metrics['deletesMaintainerAvg']) / float(metrics['edits-maintainer']) / float(sum(temp['deletesOthersAvg']) / float(metrics['edits-others']))
        
        # metrics['revertsMaintainerAvg'] = sum(revertsMaintainer)/float(metrics['edits-maintainer'])
        # metrics['revertsOthersAvg'] = sum(revertsOthers)/float(metrics['edits-others'])
        # metrics['revertsRatio'] = sum(metrics['revertsMaintainerAvg']) / float(metrics['edits-maintainer']) / float(sum(metrics['revertsOthersAvg']) / float(metrics['edits-others']))
        # metrics['revertsMaintainerPot'] = sum(revertsMaintainer)/float(metrics['edits-maintainer'])
        # metrics['revertsOthersPot'] = sum(revertsOthers)/float(metrics['edits-others'])
        # metrics['revertsPotRatio'] = sum(metrics['revertsMaintainerAvg']) / float(metrics['edits-maintainer']) / float(sum(metrics['revertsOthersAvg']) / float(metrics['edits-others']))
        
        # metrics['reintroMaintainerAvg'] = sum(reintroMaintainer)/float(metrics['edits-maintainer'])
        # metrics['reintroOthersAvg'] = sum(reintroOthers)/float(metrics['edits-others'])
        # metrics['reintroRatio'] = sum(metrics['reintroMaintainerAvg']) / float(metrics['edits-maintainer']) / float(sum(metrics['reintroOthersAvg']) / float(metrics['edits-others']))

        # metrics['selfreintroMaintainerAvg'] = sum(selfreintroMaintainer)/float(metrics['edits-maintainer'])
        # metrics['selfreintroOthersAvg'] = sum(selfreintroOthers)/float(metrics['edits-others'])
        # metrics['selfreintroRatio'] = sum(metrics['selfreintroMaintainerAvg']) / float(metrics['edits-maintainer']) / float(sum(metrics['selfreintroOthersAvg']) / float(metrics['edits-others']))

        # share of selfreintroductions of potential own tokens
        # temp['selfreintroMaintainerPot'] = [(b/float(a)) for a,b in zip(ownershipMaintainerAbs[:len(ownershipMaintainerAbs)-2], selfreintroMaintainer[1:len(selfreintroMaintainer)-1])]
        # temp['selfreintroOthersPot'] = [(b/float(c-a)) for a,b in zip(ownershipMaintainerAbs[:len(ownershipMaintainerAbs)-2], selfreintroOthers[1:len(selfreintroOthers)-1], number_tokens[:len(number_tokens)-2) if a is not 0]
        # metrics['selfreintroPotRatio'] = sum(temp['selfreintroMaintainerPot']) / float(metrics['edits-maintainer']) / float(sum(temp['selfreintroOthersPot']) / float(metrics['edits-others']))
        
        # temp['antActionsMaintainerPot'] = [(b/float(a)) for a,b in zip(ownershipMaintainerAbs[:len(ownershipMaintainerAbs)-2], antActionsMaintainer[1:len(antActionsMaintainer)-1]) if a is not 0]
        # temp['antActionsOthersPot'] = [(b/float(c-a)) for a,b in zip(ownershipMaintainerAbs[:len(ownershipMaintainerAbs)-2], antActionsOthers[1:len(antActionsOthers)-1], number_tokens[:len(number_tokens)-2]) if a is not 0]
        # metrics['antActionsRatio'] = sum(temp['antActionsMaintainerPot']) / float(metrics['edits-maintainer']) / float(sum(temp['antActionsOthersPot']) / float(metrics['edits-others']))
            
        data[title] = metrics

    data = DataFrame(data)
    data = data.transpose()
    save2CSV(data, filename)

    return data
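# DataFrame(dict_of_dicts) treats the outer keys (the article titles above) as
# columns, so the final transpose turns each article into a row before the CSV
# is written; a minimal sketch with made-up numbers (save2CSV is assumed to be
# a thin wrapper around DataFrame.to_csv):
def _demo_metrics_layout():
    demo = {"Article A": {"addsRatio": 1.2, "edits-maintainer": 10},
            "Article B": {"addsRatio": 0.8, "edits-maintainer": 4}}
    frame = DataFrame(demo).transpose()  # rows = articles, columns = metrics
    assert list(frame.index) == ["Article A", "Article B"]
    return frame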