Example #1
 def send_to_db(self):
     conn = sqlite3.connect('data2.sqlite', timeout=30)
     c = conn.cursor()
     df = DataFrame(self.__dict__.items(), index=self.__dict__.keys())
     df = df.drop(0,1)
     df = df.transpose()
     df = df.sort(axis=1)
     df.to_sql('earnings_calendar', conn, if_exists='append', index=False)
Example #2
def test():
    """DataFrame editor test"""
    from numpy import nan

    df1 = DataFrame([
                     [True, "bool"],
                     [1+1j, "complex"],
                     ['test', "string"],
                     [1.11, "float"],
                     [1, "int"],
                     [np.random.rand(3, 3), "Unkown type"],
                     ["Large value", 100],
                     ["áéí", "unicode"]
                    ],
                    index=['a', 'b', nan, nan, nan, 'c',
                           "Test global max", 'd'],
                    columns=[nan, 'Type'])
    out = test_edit(df1)
    print("out:", out)
    out = test_edit(df1.iloc[0])
    print("out:", out)
    df1 = DataFrame(np.random.rand(100001, 10))
    # Sorting large DataFrame takes time
    df1.sort(columns=[0, 1], inplace=True)
    out = test_edit(df1)
    print("out:", out)
    out = test_edit(TimeSeries(np.arange(10)))
    print("out:", out)
    return out
Example #3
def test():
    """DataFrame editor test"""
    from numpy import nan

    df1 = DataFrame([
                     [True, "bool"],
                     [1+1j, "complex"],
                     ['test', "string"],
                     [1.11, "float"],
                     [1, "int"],
                     [np.random.rand(3, 3), "Unkown type"],
                     ["Large value", 100],
                     ["áéí", "unicode"]
                    ],
                    index=['a', 'b', nan, nan, nan, 'c',
                           "Test global max", 'd'],
                    columns=[nan, 'Type'])
    out = test_edit(df1)
    print("out:", out)
    out = test_edit(df1.iloc[0])
    print("out:", out)
    df1 = DataFrame(np.random.rand(100001, 10))
    # Sorting large DataFrame takes time
    df1.sort(columns=[0, 1], inplace=True)
    out = test_edit(df1)
    print("out:", out)
    out = test_edit(TimeSeries(np.arange(10)))
    print("out:", out)
    return out
Example #4
    def calculate_top_10_solutions(self):
        '''calculate all schemes and select top 10 solutions'''
        
        columns = ['name','rate','money']

        if isfile( learning_progres_csv ):
            scheme_profit = read_csv(learning_progres_csv)
        else:
            scheme_profit = DataFrame(columns = columns)            
        scheme_profit.set_index('name',inplace = True)

        with open(learning_progres_csv, 'w+') as csvfile:
            writer = csv.DictWriter(csvfile,delimiter=',',fieldnames = columns)
            writer.writeheader()
            csvfile.flush()
            for sc in self.generate_all_schemes():
                if sc.name not in scheme_profit.index:
                    e = evaluator(sc)
                    rate, money = e.calculate()
                    scheme_profit.ix[sc.name] = rate, money
                    writer.writerow({'name':sc.name,'rate':rate,'money':money})
                    csvfile.flush()
                    if self.log:
                        print(sc.name + ' - ' + str(money) + ' \t rate = ' + str(rate))
                else:
                    writer.writerow({'name':sc.name,'rate':scheme_profit.rate[sc.name],'money':scheme_profit.money[sc.name]})
                    if self.log:
                        print(sc.name + ' - ' + str(scheme_profit.money[sc.name]) + ' \t rate = ' + str(scheme_profit.rate[sc.name]))
                    csvfile.flush()

        #TODO:write into scheme
        # DataFrame.sort is not in-place by default, so keep the returned result
        scheme_profit = scheme_profit.sort(['money'], ascending=False)
        return scheme_profit[:10].to_dict()
Example #5
def test():
    """DataFrame editor test"""
    from numpy import nan
    from pandas.util.testing import assert_frame_equal, assert_series_equal

    df1 = DataFrame(
        [[True, "bool"], [1 + 1j, "complex"], ['test', "string"],
         [1.11, "float"], [1, "int"], [np.random.rand(3, 3), "Unkown type"],
         ["Large value", 100], ["áéí", "unicode"]],
        index=['a', 'b', nan, nan, nan, 'c', "Test global max", 'd'],
        columns=[nan, 'Type'])
    out = test_edit(df1)
    assert_frame_equal(df1, out)

    result = Series([True, "bool"], index=[nan, 'Type'], name='a')
    out = test_edit(df1.iloc[0])
    assert_series_equal(result, out)

    # Sorting large DataFrame takes time
    df1 = DataFrame(np.random.rand(100100, 10))
    df1.sort(columns=[0, 1], inplace=True)
    out = test_edit(df1)
    assert_frame_equal(out, df1)

    series = Series(np.arange(10), name=0)
    out = test_edit(series)
    assert_series_equal(series, out)
Example #6
def test():
    """DataFrame editor test"""
    from numpy import nan
    from pandas.util.testing import assert_frame_equal, assert_series_equal

    df1 = DataFrame([
                     [True, "bool"],
                     [1+1j, "complex"],
                     ['test', "string"],
                     [1.11, "float"],
                     [1, "int"],
                     [np.random.rand(3, 3), "Unkown type"],
                     ["Large value", 100],
                     ["áéí", "unicode"]
                    ],
                    index=['a', 'b', nan, nan, nan, 'c',
                           "Test global max", 'd'],
                    columns=[nan, 'Type'])
    out = test_edit(df1)
    assert_frame_equal(df1, out)

    result = Series([True, "bool"], index=[nan, 'Type'], name='a')
    out = test_edit(df1.iloc[0])
    assert_series_equal(result, out)

    # Sorting large DataFrame takes time
    df1 = DataFrame(np.random.rand(100100, 10))
    df1.sort(columns=[0, 1], inplace=True)
    out = test_edit(df1)
    assert_frame_equal(out, df1)

    series = Series(np.arange(10), name=0)
    out = test_edit(series)
    assert_series_equal(series, out)
Example #7
def analyze():
    data = DataFrame()
    for architecture in performer.ARCHITECTURES:
        performer.initialize('test', architecture, 8, 112)
        print performer.DATASET_TEST
        data1 = read_csv(performer.DATASET_TEST, sep='\t')
        data = concat([data, data1], ignore_index=True)
    data.sort('benchmark', inplace=True)
    print 'analyze:', data.columns.values
    # results = data.ix[data.groupby(['architecture', 'benchmark'])['latency'].idxmin()]
    results = data
    results['graph'] = [
        performer.string_to_graph(t) for t in results['topology']
    ]
    results['average_hop_count'] = [
        average_shortest_path_length(g) for g in results['graph']
    ]
    results['link_lengths'] = [
        get_edge_attributes(g, 'length').values() for g in results['graph']
    ]
    mask = (results['architecture']
            == 'small_world') | (results['architecture'] == 'optimum')
    print data[mask][['architecture', 'benchmark', 'latency']]
    performer.plot_figures(results)
    # for normalized_attribute, attribute in zip(performer.NORMALIZED_ATTRIBUTES, performer.ATTRIBUTES):
    #     normlized_values = []
    #     for index, row in results.iterrows():
    #         mesh_index = (results['architecture'] == 'mesh') & (results['benchmark'] == row['benchmark'])
    #         normlized_values.append(row[attribute]/squeeze(results[mesh_index][attribute]))
    #     results[normalized_attribute] = normlized_values
    return
Example #8
    def __parallel_evaluation(self, X):

        from mpi4py import MPI
        from pandas import DataFrame

        comm = MPI.COMM_WORLD

        n_procs, _ = X.shape

        # Spawning processes to test kriging mixture
        comm = MPI.COMM_SELF.Spawn(sys.executable,
                                   args=['ego_evaluation.py'],
                                   maxprocs=n_procs)

        # scatter the models and data
        comm.bcast(self.fitness, root=MPI.ROOT)
        comm.scatter([(k, X[k, :]) \
            for k in range(n_procs)], root=MPI.ROOT)

        # Synchronization while the child processes are performing
        # heavy computations...
        comm.Barrier()

        # Gather the fitted models from the child processes
        # Note that 'None' is only valid in master-slave working mode
        result = comm.gather(None, root=MPI.ROOT)

        comm.Disconnect()

        # register the measures
        data = DataFrame([[d['index'], d['y']] \
            for d in result], columns=['index', 'y'])
        data.sort('index', inplace=True)

        return array(data['y'])
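The snippet above shows only the master side of the exchange; the spawned worker script (ego_evaluation.py) is not included in the example. A minimal sketch of what such a worker might look like, assuming the fitness object is picklable and each child receives one (index, row) pair as scattered above:

# Hypothetical worker-side counterpart of the Spawn/bcast/scatter/Barrier/gather
# sequence above; the real ego_evaluation.py is not shown in the example.
from mpi4py import MPI

comm = MPI.Comm.Get_parent()              # intercommunicator back to the master

fitness = comm.bcast(None, root=0)        # receive the broadcast fitness function
index, x = comm.scatter(None, root=0)     # receive this worker's (index, row) pair

y = fitness(x)                            # the heavy computation

comm.Barrier()                            # matches the master's Barrier()
comm.gather({'index': index, 'y': y}, root=0)   # keys the master's DataFrame expects
comm.Disconnect()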
Example #9
    def test_sort(self):
        frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"])

        # 9816 deprecated
        with tm.assert_produces_warning(FutureWarning):
            frame.sort(columns="A")
        with tm.assert_produces_warning(FutureWarning):
            frame.sort()
Example #10
    def transform_dataframe(self, df):
        """
        Use matplotlib to compute boxplot statistics on timeseries data.
        """
        from pandas import DataFrame
        group = self.get_grouping(len(df.columns))
        serializer = self.get_serializer()
        value_col = serializer.value_field
        series_col = serializer.key_fields[0]
        param_cols = serializer.parameter_fields
        ncols = 1 + len(param_cols)

        if "index" in group:
            # Separate stats for each column in dataset
            groups = {col: df[col] for col in df.columns}
        else:
            # Stats for entire dataset
            df = df.stack().stack().stack()
            df.reset_index(inplace=True)
            index = serializer.get_index(df)
            df.set_index(index[0], inplace=True)
            groups = {(value_col, ) + ('all', ) * ncols: df.value}

        # Compute stats for each column, potentially grouped by year
        all_stats = []
        for g, series in groups.items():
            if g[0] != serializer.value_field:
                continue
            series_info = g[-1]
            param_info = list(reversed(g[1:-1]))
            if "year" in group or "month" in group:
                groupby = "year" if "year" in group else "month"
                dstats = self.compute_boxplots(series, groupby)
                for s in dstats:
                    s[series_col] = series_info
                    for pname, pval in zip(param_cols, param_info):
                        s[pname] = pval
            else:
                stats = self.compute_boxplot(series)
                stats[series_col] = series_info
                for pname, pval in zip(param_cols, param_info):
                    stats[pname] = pval
                dstats = [stats]
            all_stats += dstats

        df = DataFrame(all_stats)
        index = [series_col] + param_cols
        if "year" in group:
            index = ['year'] + index
        elif "month" in group:
            index = ['month'] + index
        df.sort(index, inplace=True)
        df.set_index(index, inplace=True)
        df.columns.name = ""
        df = df.unstack().unstack()
        if "year" in group or "month" in group:
            df = df.unstack()
        return df
Example #11
    def test_sort(self):
        frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
                          columns=['A', 'B', 'C', 'D'])

        # 9816 deprecated
        with tm.assert_produces_warning(FutureWarning):
            frame.sort(columns='A')
        with tm.assert_produces_warning(FutureWarning):
            frame.sort()
Example #12
    def test_sort(self):
        frame = DataFrame(np.arange(16).reshape(4, 4),
                          index=[1, 2, 3, 4],
                          columns=['A', 'B', 'C', 'D'])

        # 9816 deprecated
        with tm.assert_produces_warning(FutureWarning):
            frame.sort(columns='A')
        with tm.assert_produces_warning(FutureWarning):
            frame.sort()
Example #13
File: ddb_runs.py Project: yz-/ut
 def get_runs_df(self):
     """
     Returns the whole table as a dataframe, sorted with the most recent entry at the bottom (ascending order)
     """
     df = DataFrame([{k: v for k, v in r.items()} for r in self.table.scan()])
     if df.empty:
         return df
     else:
         df.sort(columns=['dt'], ascending=True, inplace=True)
         # force df to have columns in this order
         return df[['dt', 'start', 'end']]
Example #14
 def get_runs_df(self):
     """
     Returns the whole table as a dataframe, sorted with the most recent entry at the bottom (ascending order)
     """
     df = DataFrame([{k: v
                      for k, v in r.items()} for r in self.table.scan()])
     if df.empty:
         return df
     else:
         df.sort(columns=['dt'], ascending=True, inplace=True)
         # force df to have columns in this order
         return df[['dt', 'start', 'end']]
Example #15
def filter_tags(tag_pickle='results/material_tags.pickle', exclude_tags='results/exclude.csv', n=50):
    exclude_words, duplicate_sets = load_filter_tags(exclude_tags)
    with open(tag_pickle, 'r') as f:
        t = DataFrame(pickle.load(f)['result']).set_index('_id')
    for setn in duplicate_sets:
        t.ix[setn[0]] += sum(map(lambda x: t.ix[x] , setn[1:]))
        for tag in setn[1:]:
            t.drop(tag, inplace=True)
    for tag in exclude_words:
        t.drop(tag, inplace=True)
    t = t.sort(ascending=False)  # sort() is not in-place; keep the result
    return t[:n].index
Example #16
def calculate_accuracy_by_category(y_test, predicted):
    """Calculates the accuracy of each by category. This is used for the outcome of a classifier.
    Parameters:
        y_test (array): the y_test
        predicted (array): the predicted values
    Returns:
        A series with the accuracy percentage for each category"""

    df = DataFrame({'Target': y_test, 'Predicted': predicted})
    df['Score'] = df.Target == df.Predicted
    df = df.groupby('Target').apply(lambda x: 100.0 * sum(x.Score) / len(x))
    df.sort()
    return df
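A quick usage sketch with made-up labels shows the shape of the result: one accuracy percentage per target class, sorted ascending by the in-place Series.sort() call above.

# Hypothetical usage of calculate_accuracy_by_category
y_test    = ['cat', 'cat', 'dog', 'dog', 'dog', 'bird']
predicted = ['cat', 'dog', 'dog', 'dog', 'cat', 'bird']
acc = calculate_accuracy_by_category(y_test, predicted)
# acc is a Series indexed by class, lowest accuracy first:
#   cat      50.000000
#   dog      66.666667
#   bird    100.000000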
Example #17
def test_outer_join():
    left = [(1, 'Alice', 100),
            (2, 'Bob', 200),
            (4, 'Dennis', 400)]
    left = DataFrame(left, columns=['id', 'name', 'amount'])

    right = [('NYC', 1),
             ('Boston', 1),
             ('LA', 3),
             ('Moscow', 4)]
    right = DataFrame(right, columns=['city', 'id'])

    lsym = symbol('lsym', 'var * {id: int, name: string, amount: real}')
    rsym = symbol('rsym', 'var * {city: string, id: int}')

    convert = lambda df: set(df.to_records(index=False).tolist())

    assert (convert(compute(join(lsym, rsym), {lsym: left, rsym: right})) ==
            set([(1, 'Alice', 100, 'NYC'),
                 (1, 'Alice', 100, 'Boston'),
                 (4, 'Dennis', 400, 'Moscow')]))

    assert (convert(compute(join(lsym, rsym, how='left'),
                            {lsym: left, rsym: right})) ==
            set([(1, 'Alice', 100, 'NYC'),
                 (1, 'Alice', 100, 'Boston'),
                 (2, 'Bob', 200, np.nan),
                 (4, 'Dennis', 400, 'Moscow')]))

    df = compute(join(lsym, rsym, how='right'), {lsym: left, rsym: right})
    expected = DataFrame([(1., 'Alice', 100., 'NYC'),
                          (1., 'Alice', 100., 'Boston'),
                          (3., np.nan, np.nan, 'lsymA'),
                          (4., 'Dennis', 400., 'Moscow')],
                         columns=['id', 'name', 'amount', 'city'])

    result = df.sort('id').to_records(index=False)
    expected = expected.sort('id').to_records(index=False)
    np.array_equal(result, expected)

    df = compute(join(lsym, rsym, how='outer'), {lsym: left, rsym: right})
    expected = DataFrame([(1., 'Alice', 100., 'NYC'),
                          (1., 'Alice', 100., 'Boston'),
                          (2., 'Bob', 200., np.nan),
                          (3., np.nan, np.nan, 'LA'),
                          (4., 'Dennis', 400., 'Moscow')],
                         columns=['id', 'name', 'amount', 'city'])

    result = df.sort('id').to_records(index=False)
    expected = expected.sort('id').to_records(index=False)
    np.array_equal(result, expected)
Example #18
def test_outer_join():
    left = [(1, 'Alice', 100),
            (2, 'Bob', 200),
            (4, 'Dennis', 400)]
    left = DataFrame(left, columns=['id', 'name', 'amount'])

    right = [('NYC', 1),
             ('Boston', 1),
             ('LA', 3),
             ('Moscow', 4)]
    right = DataFrame(right, columns=['city', 'id'])

    lsym = symbol('lsym', 'var * {id: int, name: string, amount: real}')
    rsym = symbol('rsym', 'var * {city: string, id: int}')

    convert = lambda df: set(df.to_records(index=False).tolist())

    assert (convert(compute(join(lsym, rsym), {lsym: left, rsym: right})) ==
            set([(1, 'Alice', 100, 'NYC'),
                 (1, 'Alice', 100, 'Boston'),
                 (4, 'Dennis', 400, 'Moscow')]))

    assert (convert(compute(join(lsym, rsym, how='left'),
                            {lsym: left, rsym: right})) ==
            set([(1, 'Alice', 100, 'NYC'),
                 (1, 'Alice', 100, 'Boston'),
                 (2, 'Bob', 200, np.nan),
                 (4, 'Dennis', 400, 'Moscow')]))

    df = compute(join(lsym, rsym, how='right'), {lsym: left, rsym: right})
    expected = DataFrame([(1., 'Alice', 100., 'NYC'),
                          (1., 'Alice', 100., 'Boston'),
                          (3., np.nan, np.nan, 'lsymA'),
                          (4., 'Dennis', 400., 'Moscow')],
                         columns=['id', 'name', 'amount', 'city'])

    result = df.sort('id').to_records(index=False)
    expected = expected.sort('id').to_records(index=False)
    np.array_equal(result, expected)

    df = compute(join(lsym, rsym, how='outer'), {lsym: left, rsym: right})
    expected = DataFrame([(1., 'Alice', 100., 'NYC'),
                          (1., 'Alice', 100., 'Boston'),
                          (2., 'Bob', 200., np.nan),
                          (3., np.nan, np.nan, 'LA'),
                          (4., 'Dennis', 400., 'Moscow')],
                         columns=['id', 'name', 'amount', 'city'])

    result = df.sort('id').to_records(index=False)
    expected = expected.sort('id').to_records(index=False)
    np.array_equal(result, expected)
Example #19
def test_outer_join():
    left = [(1, 'Alice', 100),
            (2, 'Bob', 200),
            (4, 'Dennis', 400)]
    left = DataFrame(left, columns=['id', 'name', 'amount'])

    right = [('NYC', 1),
             ('Boston', 1),
             ('LA', 3),
             ('Moscow', 4)]
    right = DataFrame(right, columns=['city', 'id'])

    L = symbol('L', 'var * {id: int, name: string, amount: real}')
    R = symbol('R', 'var * {city: string, id: int}')

    convert = lambda df: set(df.to_records(index=False).tolist())

    assert convert(compute(join(L, R), {L: left, R: right})) == set(
            [(1, 'Alice', 100, 'NYC'),
             (1, 'Alice', 100, 'Boston'),
             (4, 'Dennis', 400, 'Moscow')])

    assert convert(compute(join(L, R, how='left'), {L: left, R: right})) == set(
            [(1, 'Alice', 100, 'NYC'),
             (1, 'Alice', 100, 'Boston'),
             (2, 'Bob', 200, np.nan),
             (4, 'Dennis', 400, 'Moscow')])

    df = compute(join(L, R, how='right'), {L: left, R: right})
    expected = DataFrame(
            [(1., 'Alice', 100., 'NYC'),
             (1., 'Alice', 100., 'Boston'),
             (3., np.nan, np.nan, 'LA'),
             (4., 'Dennis', 400., 'Moscow')],
            columns=['id', 'name', 'amount', 'city'])

    assert str(df.sort('id').to_records(index=False)) ==\
            str(expected.sort('id').to_records(index=False))

    df = compute(join(L, R, how='outer'), {L: left, R: right})
    expected = DataFrame(
            [(1., 'Alice', 100., 'NYC'),
             (1., 'Alice', 100., 'Boston'),
             (2., 'Bob', 200., np.nan),
             (3., np.nan, np.nan, 'LA'),
             (4., 'Dennis', 400., 'Moscow')],
            columns=['id', 'name', 'amount', 'city'])

    assert str(df.sort('id').to_records(index=False)) ==\
            str(expected.sort('id').to_records(index=False))
Example #20
def plot_bic_ranks(df, group_by, analysis_col, percentage=True, **kwargs):
    COL_NAMES = [
        'First', 'Second', 'Third', 'Fourth', 'Fifth', 'Sixth', 'Seventh'
    ]

    bic_df = _create_bic_df(df, group_by, analysis_col, **kwargs)
    rank = bic_df.rank(axis=1).filter(regex='_BIC$')

    bic_cols = rank.filter(regex='_BIC$').columns
    rank_counts = {col: rank[col].value_counts() for col in bic_cols}

    rank_counts = DataFrame(rank_counts).transpose().fillna(value=0)
    rank_counts.columns = COL_NAMES[:len(rank_counts)]
    rank_counts = rank_counts.sort(columns=COL_NAMES[:len(rank_counts)],
                                   ascending=False)

    if percentage:
        rank_counts = (rank_counts / rank_counts.sum()) * 100
        ylabel = 'Percentage'
    else:
        ylabel = 'Count'

    ax = rank_counts['First'].plot(kind='bar',
                                   title='Distribution BIC First Place',
                                   rot=-30)
    ax.set_ylabel(ylabel)
    plt.show()
    ax = rank_counts.plot(kind='bar', title='Distribution BIC Ranks', rot=-30)
    ax.set_ylabel(ylabel)

    return rank_counts
Example #21
 def json_to_df(json_file):
     page = ''
     pic = ''
     text = ''
     point = ''
     point_list = []
     with open(json_file) as data_file:
         data = json.load(data_file)
         for i in range(0, len(data)):
             if data[i]['type'] == 'PageTurn':
                 page = data[i]
             elif data[i]['type'] == 'Picture':
                 pic = data[i]
             elif data[i]['type'] == 'Text':
                 text = data[i]
             elif data[i]['type'] == 'SampleGaze':
                 point = data[i]
                 point.update(page)
                 point.update(pic)
                 point.update(text)
                 point['type'] = u'SampleGaze'
                 point_list.append(point)
             elif data[i]['type'] == 'SampleFixation':
                 point = data[i]
                 point.update(page)
                 point.update(pic)
                 point.update(text)
                 point['type'] = u'SampleFixation'
                 point_list.append(point)
         df = DataFrame(point_list)
         start_time = df['timestamp'].min()
         df['timestamp'] = df['timestamp'] - start_time
         df = df.sort('timestamp')
         return df
Example #22
def foreach_dataframe(self, func, force_dict=False, *args, **kwargs):
    """
        Really just does a foreach over the DataFrames in a panel.
    """
    d = {}
    for key, df in self.items():
        d[key] = func(df, *args, **kwargs)
    container = PanelDict
    for key, result in list(d.items()):
        if isinstance(result, Series):
            container = DataFrame
            break
        if isinstance(result, DataFrame):
            container = Panel
            break

    index = []
    for key, result in list(d.items()):
        if not isinstance(result, (DataFrame, Series)):
            continue
        result.name = key
        ind = result.index
        index = set(index).union(ind) 

    if force_dict:
        return PanelDict(d)

    res = DataFrame(None, index=index)
    for key, result in list(d.items()):
        res = res.join(result)

    res = res.sort()
    return res
Example #23
    def predict(self, tree):
        """
        TODO Should take an array and predict every item. A score can be stored.
        It would follow the guidelines set by scikit-learn.
        """
        tree_rules = self.extract_rules(tree)
        df = DataFrame(columns=['label', 'prob'])
        gb = self.posteriori.groupby('label')


        for key, indexes in gb.groups.items():
            apriori_prob = self.apriori[self.apriori.label == key]['freq'].values[0]
            prob = apriori_prob

            group_df, missing_prob = self.apply_smoothing(self.posteriori.ix[indexes], tree_rules)

            for rule in tree_rules:
                prob_evidence = group_df[group_df.rule == rule]['freq']
                if len(prob_evidence) == 0:
                    prob_evidence = missing_prob
                else:
                    prob_evidence = prob_evidence.values[0]
                prob *= prob_evidence
            
            post = DataFrame({'label':[key], 'prob':[prob]})
            df = df.append(post)

        df.index = np.arange(df.index.size)
        df = df.sort(columns='prob', ascending=False)
        return df.ix[df['prob'].idxmax()]
Example #24
    def homer_to_narrow_peaks(self, data, output_file):
        '''
        Given a Homer peak dataframe, extract necessary columns and convert
        to a narrowPeak file. From the IDR package description:
        
            NarrowPeak files are in BED6+4 format. It consists of 10 tab-delimited columns
    
            1.chrom     string     Name of the chromosome
            2.chromStart     int     The starting position of the feature in the chromosome. The first base in a chromosome is numbered 0.
            3.chromEnd     int     The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the   feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
            4.name     string     Name given to a region (preferably unique). Use '.' if no name is assigned
            5.score     int     Indicates how dark the peak will be displayed in the browser (1-1000). If '0', the DCC will assign this based on signal value.         Ideally average signalValue per base spread between 100-1000.
            6.strand     char     +/- to denote strand or orientation (whenever applicable). Use '.' if no orientation is assigned.
            7.signalValue     float     Measurement of overall (usually, average) enrichment for the region.
            8.pValue     float     Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
            9.qValue     float     Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
            10.peak     int     Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.
        
        '''

        columns = OrderedDict((
            ('chrom', self.get_first_column(data, ['chr','chrom', 'chromosome'])),
            ('chromStart', self.get_first_column(data, ['chromStart','start'])),
            ('chromEnd', self.get_first_column(data, ['chromEnd','end'])),
            ('name', self.get_first_column(data, ['#PeakID','PeakID','ID','name'])),
            ('score', Series([0]*data.shape[0])), # Leave zero so that signalValue column is used
            ('strand', self.get_first_column(data, ['strand'])),       
            ('signalValue', self.get_first_column(data, self.tag_count_columns)),
            ('pValue', -np.log10(self.get_first_column(data, self.p_value_columns))),
            ('qValue', Series([-1]*data.shape[0])), # Leave -1 as no individual FDR is called for each peak
            ('peak', Series([-1]*data.shape[0])), # Leave -1 as no point-source is called for each peak
            ))
        df = DataFrame(columns)
        df = df.sort(['signalValue','pValue'], ascending=False)
        df.to_csv(output_file, sep='\t', header=False, index=False)
Example #25
 def _search_by_inchi_fuzzy(self, inchi):
     # TODO: use openbabel if available
     matches = difflib.get_close_matches(inchi, self.data_frame.InChI.dropna(), n=5, cutoff=.8)
     ranks = dict([(match, i) for i, match in enumerate(matches)])
     selection = DataFrame(self.data_frame[self.data_frame.InChI.isin(matches)])
     selection['search_rank'] = selection.name.map(ranks)
     return selection.sort('search_rank')
Example #26
def upsert_unique_indices(apps, schema_editor):
    datapoint_values_list = ['id','created_at','indicator_id','location_id','campaign_id','data_date']
    historical_dps = DataFrame(list(DataPoint.objects.filter(unique_index = -1)\
        .values_list('id','created_at','indicator_id','location_id','campaign_id','data_date')), columns=datapoint_values_list)
    # create the unique index
    historical_dps = historical_dps.apply(add_unique_index, axis=1)

    # group by and max on created at, get the most recent upload
    historical_dps = historical_dps.sort("created_at", ascending=False).groupby("unique_index", as_index=False).first()

    # get the ids into a list and select them
    dps_to_update = DataPoint.objects.filter(id__in=list(historical_dps['id']))
    print 'dps to update'
    print len(dps_to_update)
    # then run a query and update each
    for dp in dps_to_update:
        unique_index = historical_dps[historical_dps['id'] == dp.id].iloc[0]['unique_index']
        dp.unique_index = unique_index
        dp.save()
    
    # delete all the other duplicates
    dps_to_delete = DataPoint.objects.filter(unique_index=-1)
    print 'dps_to_delete'
    print len(dps_to_delete)
    dps_to_delete.delete()
Example #27
def sorted_plot(f, set1, k):
    temp_avg = np.array([])
    for i in range(len(set1[f].unique())):
        temp_avg = np.append(
            temp_avg, np.mean(set1['click'][set1[f] == (set1[f].unique())[i]]))
    if (k == 2):
        f1 = figure()
        plt.plot(range(len(set1[f].unique())), temp_avg, 'bo',
                 range(len(set1[f].unique())), temp_avg, 'k')
        plt.grid()
        plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    df_temp = DataFrame({f: set1[f].unique(), 'Avg_click': temp_avg})
    df_temp = df_temp.sort(columns='Avg_click')
    plt.plot(range(len(set1[f].unique())), df_temp['Avg_click'], 'bo',
             range(len(set1[f].unique())), df_temp['Avg_click'], 'k')
    for x, y in zip(range(len(set1[f].unique())), df_temp['Avg_click']):  # <--
        #ax.annotate('(%s, %s)' % xy, xy=xy, textcoords='offset points')
        plt.text(x - 0.004, y + 0.007, df_temp[f][x], fontsize=12)
    plt.grid()
    plt.show()
    print 'total count =', len(df_temp)
    print 'maximum_value =', df_temp['Avg_click'].max(), 'at', df_temp[f][
        df_temp['Avg_click'] == df_temp['Avg_click'].max()]
    print 'minimum_value =', df_temp['Avg_click'].min(), 'at', df_temp[f][
        df_temp['Avg_click'] == df_temp['Avg_click'].min()]
    print 'number of zeroes =', len(df_temp[df_temp['Avg_click'] == 0])
    print 'number of ones =', len(df_temp[df_temp['Avg_click'] == 1])
    print 'Distribution:(including the starting value)'
    for i in np.arange(0, 1.1, 0.1):
        print i, '<->', (i + 0.1), ' = ', len(
            df_temp[(df_temp['Avg_click'] >= i)
                    & (df_temp['Avg_click'] < (i + 0.1))])
Example #28
def analyze():
    signals = read_csv(FILE_SIGNALS)
    devices = signals["id"].unique()
    
    print("got %d signals from %d devices" % (len(signals), len(devices)))

    signals = signals.groupby(["frequency", "id"]).size()
    signals = signals.reindex(MultiIndex.from_product([SPECTRUM, devices],
                                                      names=signals.index.names),
                              fill_value=0)
    signals = signals.unstack("id")
    
    # let's only keep frequencies with all signals present
    candidates = signals.dropna()
    # suggest frequency where the weakest sensor has the most
    # received signals, and then the frequency with most total
    # received signals for all sensors
    candidates = DataFrame({"total":   candidates.sum(axis=1),
                            "weakest": candidates.min(axis=1)})
    appropriate_freq = candidates.sort(["weakest", "total"],
                                       ascending=False).index[0]
    print("suggesting frequency %s" % mhz(appropriate_freq))

    signals.to_csv("spectrum.csv")
    
    import matplotlib.pyplot as plt
    from matplotlib.ticker import EngFormatter

    p=signals.plot(kind="Area")
    p.xaxis.set_major_formatter(EngFormatter(unit='Hz', places=2))
    plt.savefig(FILE_SPECTRUM, dpi=300)
    print("saved spectrum as %s" % FILE_SPECTRUM)
Example #29
def adjust(data, adjustments: pd.DataFrame):
    """
    IMPORTANT !!! This method supports single index df
    :param data: dataframe with data.
    :param adjustments: list of adjustments in the form of [(date, split_factor/dividend_amount, 'split'/'dividend'), ...]
    :return adjusted data
    """
    adjustments = adjustments.sort_index(ascending=False)  # most recent first; DataFrame.sort() has no key/reverse arguments

    for (_, row) in adjustments.iterrows():
        if row.name[2] == 'split':
            adjust_split(data=data, split_date=row.name[0], split_factor=row[0])
        elif row.name[2] == 'dividend':
            adjust_dividend(data=data, dividend_date=row.name[0], dividend_amount=row[0])

    return data
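Despite the docstring, the loop above reads row.name[0] (the date), row.name[2] ('split' or 'dividend') and row[0] (the factor or amount), so it expects a DataFrame with a three-level index rather than a plain list of tuples. A small sketch of such a frame; the middle index level (a ticker symbol here) and the column name are assumptions, not something the example defines:

# Hypothetical adjustments frame matching the access pattern used in adjust()
import pandas as pd

idx = pd.MultiIndex.from_tuples(
    [('2016-06-01', 'AAPL', 'split'),        # name[0] = date, name[2] = type
     ('2016-03-10', 'AAPL', 'dividend')],
    names=['date', 'symbol', 'type'])        # the 'symbol' level name is a guess
adjustments = pd.DataFrame({'value': [7.0, 0.57]}, index=idx)  # row[0] = factor/amount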
Example #30
def foreach_dataframe(self, func, force_dict=False, *args, **kwargs):
    """
        Really just does a foreach over the DataFrames in a panel.
    """
    d = {}
    for key, df in self.items():
        d[key] = func(df, *args, **kwargs)
    container = PanelDict
    for key, result in list(d.items()):
        if isinstance(result, Series):
            container = DataFrame
            break
        if isinstance(result, DataFrame):
            container = Panel
            break

    index = []
    for key, result in list(d.items()):
        if not isinstance(result, (DataFrame, Series)):
            continue
        result.name = key
        ind = result.index
        index = set(index).union(ind)

    if force_dict:
        return PanelDict(d)

    res = DataFrame(None, index=index)
    for key, result in list(d.items()):
        res = res.join(result)

    res = res.sort()
    return res
Example #31
	    def json_to_df(j_file):
	        page = ''
	        pic = ''
	        text = ''
	        point = ''
	        point_list = []
	        with open(j_file) as data_file:    
	            data = json.load(data_file)
	            for i in range(0, len(data)):
	                if data[i]['type'] == 'PageTurn':
	                    page = data[i]
	                elif data[i]['type'] == 'Picture':
	                    pic = data[i]
	                elif data[i]['type'] == 'Text':
	                    text = data[i]
	                elif data[i]['type'] == 'SampleGaze':
	                    point = data[i]
	                    point.update(page)
	                    point.update(pic)
	                    point.update(text)
	                    point['type'] = u'SampleGaze'
	                    point_list.append(point)
	                elif data[i]['type'] == 'SampleFixation':
	                    point = data[i]
	                    point.update(page)
	                    point.update(pic)
	                    point.update(text)
	                    point['type'] = u'SampleFixation'
	                    point_list.append(point)
	            df = DataFrame(point_list)
	            start_time = df['timestamp'].min()
	            df['timestamp'] = df['timestamp'] - start_time
	            df = df.sort('timestamp')
	            return df
Example #32
def Main():
    import matplotlib.pyplot as pl
    companies = [
        'AAPL', 'ADSK', 'GOOG', 'MSFT', 'AUY', 'TWTR', 'YHOO', 'CAT', 'GE',
        'CSCO', 'F'
    ]
    url = 'http://finance.yahoo.com/d/quotes.csv?s=' + '+'.join(
        companies) + '&f=nabp'
    response = urllib2.urlopen(url)
    data = list(csv.reader(response))
    columns = ['Name', 'Pricing - Ask', 'Pricing - Bid', 'Previous close']
    data = DataFrame(data, columns=columns)
    data = data.replace(['N/A'], [0])
    data['Previous close'] = data['Previous close'].astype(float)
    data = data.sort(columns=['Previous close'], ascending=False, axis=0)

    pl.plot(data['Pricing - Ask'], label='Pricing Ask')
    pl.plot(data['Pricing - Bid'], label='Pricing Bid')
    pl.plot(data['Previous close'], label='Previous Close')
    pl.title("Stock values for some companies (Sorted by Ask)")
    pl.xticks(np.arange(len(data['Name'])), data['Name'].tolist(), rotation=45)
    pl.legend()
    pl.tight_layout()
    apis_helpers.save_fig(pl, 'yahoo-finance', 'basic')
    pl.close()
Example #33
    def test_sort_values(self):
        # API for 9816

        # sort_index
        frame = DataFrame(np.arange(16).reshape(4, 4),
                          index=[1, 2, 3, 4],
                          columns=['A', 'B', 'C', 'D'])

        # 9816 deprecated
        with tm.assert_produces_warning(FutureWarning):
            frame.sort(columns='A')
        with tm.assert_produces_warning(FutureWarning):
            frame.sort()

        unordered = frame.ix[[3, 2, 4, 1]]
        expected = unordered.sort_index()

        result = unordered.sort_index(axis=0)
        assert_frame_equal(result, expected)

        unordered = frame.ix[:, [2, 1, 3, 0]]
        expected = unordered.sort_index(axis=1)

        result = unordered.sort_index(axis=1)
        assert_frame_equal(result, expected)
        assert_frame_equal(result, expected)

        # sortlevel
        mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
        df = DataFrame([[1, 2], [3, 4]], mi)

        result = df.sort_index(level='A', sort_remaining=False)
        expected = df.sortlevel('A', sort_remaining=False)
        assert_frame_equal(result, expected)

        df = df.T
        result = df.sort_index(level='A', axis=1, sort_remaining=False)
        expected = df.sortlevel('A', axis=1, sort_remaining=False)
        assert_frame_equal(result, expected)

        # MI sort, but no by
        mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
        df = DataFrame([[1, 2], [3, 4]], mi)
        result = df.sort_index(sort_remaining=False)
        expected = df.sort_index()
        assert_frame_equal(result, expected)
Example #34
def estrutura_contagem(contagem_votos, contagem_votacoes):
    resultado = DataFrame({"bancada":list(contagem_votos.keys()),"votos_com_governo":list(contagem_votos.values()),"votacoes":list(contagem_votacoes.values())})
    # compute party loyalty (pro-government votes divided by the number of valid roll-call votes)
    resultado["fidelidade"] = 100*resultado["votos_com_governo"]/resultado["votacoes"]
    # drop the government bloc itself from the dataframe
    resultado = resultado[resultado.bancada != "GOV."] 
    resultado = resultado.sort("fidelidade",ascending=False)
    return resultado    
Example #35
    def __init__(self, column, baseline, adjustments=None):
        self.column = column
        self.baseline = baseline.values
        self.dates = baseline.index
        self.assets = baseline.columns

        if adjustments is None:
            adjustments = DataFrame(index=DatetimeIndex([]), columns=ADJUSTMENT_COLUMNS)
        else:
            # Ensure that columns are in the correct order.
            adjustments = adjustments.reindex_axis(ADJUSTMENT_COLUMNS, axis=1)
            adjustments.sort(["apply_date", "sid"], inplace=True)

        self.adjustments = adjustments
        self.adjustment_apply_dates = DatetimeIndex(adjustments.apply_date)
        self.adjustment_end_dates = DatetimeIndex(adjustments.end_date)
        self.adjustment_sids = Int64Index(adjustments.sid)
Example #36
 def merge_variables(self, to_merge):
     """Merges time series variables into new time series variables.
     :param to_merge: dictionary mapping new variable name to list of variables to be merged.
     :return:
     """
     dold = self._data.copy()
     s = Series(data=np.zeros((dold.shape[0],)), index=dold.index).replace(0, np.nan)
     dnew = DataFrame(dict([ (k, s) for k in to_merge.keys() if len(set(to_merge[k]).intersection(dold.columns))>0 ]))
     for newvar in dnew.columns:
         for oldvar in to_merge[newvar]:
             if oldvar in dold.columns:
                 dnew[newvar][dold[oldvar].notnull()] = dold[oldvar][dold[oldvar].notnull()]
                 del dold[oldvar]
     dnew = dnew.join(dold, how='outer')
     dnew.sort(axis=1, inplace=True)
     dnew.sort(axis=0, inplace=True)
     self._data = dnew
Example #37
def word_freq(file_name, suffix='_wordfreq', sep='\t', threshold=.5):
    print "start word_freq"
    # start = datetime.datetime.now()
    # print start
    reviews = pd.read_csv(file_name, error_bad_lines=False, sep=sep)
    cb = reviews['stopword_body']
    rate = reviews['Rating']
    # label all words with the rating
    cb_temp = []
    for i, c in enumerate(cb):
        cb_temp.append([(w, rate[i]) for w in ast.literal_eval(c)])
    reviews['stopword_body'] = cb_temp
    # calculate_time(start)
    # get the corpus of all reviews, lists of all words with label
    '''--------------------------------------------------------'''
    cop_wl = []
    for b in cb_temp:
        # change the unicode data to the raw string
        # cop_wl += [(unicodedata.normalize('NFKD', w[0]).encode('utf-8','replace'), w[1]) for w in b if type(w[0])==unicode]
        cop_wl += b
    '''--------------------------------------------------------'''
    # calculate_time(start)
    # word frequency of the corpus with label
    wfq = nltk.FreqDist(cop_wl)
    # calculate_time(start)
    # get the word list of all reviews without label
    cop = [w[0] for w in cop_wl]
    cop = set(cop)
    cop_len = len(cop)
    # calculate_time(start)
    # get freq of all words in one list
    wfq_l = []
    for w in cop:
        for i in range(1, 6):
            wfq_l.append(wfq[(w, i)])

    # calculate_time(start)
    # reshape the list to a matrix
    wfq_mx = DataFrame(np.array(wfq_l).reshape((cop_len, 5)),
                       index=pd.Index(cop),
                       columns=pd.Index([1, 2, 3, 4, 5]))
    # calculate_time(start)
    # calculate the prob of each rating
    w_s = []
    w_sum = []
    for i, r in wfq_mx.iterrows():
        word_sum = wfq_mx.ix[i].sum()
        # wfq_mx.ix[i] = wfq_mx.ix[i]/word_sum
        w_s.append(word_useful_score(list(wfq_mx.ix[i]), word_sum))
        w_sum.append(word_sum)

    wfq_mx['score'] = w_s
    wfq_mx['sum'] = w_sum
    wfq_mx = wfq_mx.sort(columns='sum').ix[-int(len(w_s) * threshold):, :]
    print wfq_mx
    wfq_mx.to_csv(file_name.split('.')[0] + suffix + '.' +
                  file_name.split('.')[1],
                  sep='\t')
Example #38
    def test_sort_values(self):
        # API for 9816

        # sort_index
        frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
                          columns=['A', 'B', 'C', 'D'])

        # 9816 deprecated
        with tm.assert_produces_warning(FutureWarning):
            frame.sort(columns='A')
        with tm.assert_produces_warning(FutureWarning):
            frame.sort()

        unordered = frame.ix[[3, 2, 4, 1]]
        expected = unordered.sort_index()

        result = unordered.sort_index(axis=0)
        assert_frame_equal(result, expected)

        unordered = frame.ix[:, [2, 1, 3, 0]]
        expected = unordered.sort_index(axis=1)

        result = unordered.sort_index(axis=1)
        assert_frame_equal(result, expected)
        assert_frame_equal(result, expected)

        # sortlevel
        mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
        df = DataFrame([[1, 2], [3, 4]], mi)

        result = df.sort_index(level='A', sort_remaining=False)
        expected = df.sortlevel('A', sort_remaining=False)
        assert_frame_equal(result, expected)

        df = df.T
        result = df.sort_index(level='A', axis=1, sort_remaining=False)
        expected = df.sortlevel('A', axis=1, sort_remaining=False)
        assert_frame_equal(result, expected)

        # MI sort, but no by
        mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
        df = DataFrame([[1, 2], [3, 4]], mi)
        result = df.sort_index(sort_remaining=False)
        expected = df.sort_index()
        assert_frame_equal(result, expected)
Example #39
def set2df(sets, column_names, index=None, sort=True):
    df = DataFrame(list(sets), columns=column_names, index=index)
    if sort:
        df = df.sort(column_names)
        if index:
            df.index = index
        else:
            df.index = range(len(df))
    return df
Example #40
    def _parse_fits(filepath):
        """Parses a GOES FITS file from
        http://umbra.nascom.nasa.gov/goes/fits/"""
        fits = pyfits.open(filepath)
        header = fits[0].header
        if len(fits) == 4:
            if is_time_in_given_format(fits[0].header['DATE-OBS'], '%d/%m/%Y'):
                start_time = datetime.datetime.strptime(
                    fits[0].header['DATE-OBS'], '%d/%m/%Y')
            elif is_time_in_given_format(fits[0].header['DATE-OBS'],
                                         '%d/%m/%y'):
                start_time = datetime.datetime.strptime(
                    fits[0].header['DATE-OBS'], '%d/%m/%y')
            else:
                raise ValueError("Date not recognized")
            xrsb = fits[2].data['FLUX'][0][:, 0]
            xrsa = fits[2].data['FLUX'][0][:, 1]
            seconds_from_start = fits[2].data['TIME'][0]
        elif 1 <= len(fits) <= 3:
            start_time = parse_time(header['TIMEZERO'])
            seconds_from_start = fits[0].data[0]
            xrsb = fits[0].data[1]
            xrsa = fits[0].data[2]
        else:
            raise ValueError("Don't know how to parse this file")

        times = [
            start_time + datetime.timedelta(seconds=int(floor(s)),
                                            microseconds=int(
                                                (s - floor(s)) * 1e6))
            for s in seconds_from_start
        ]

        # remove bad values as defined in header comments
        xrsb[xrsb == -99999] = nan
        xrsa[xrsa == -99999] = nan

        # fix byte ordering
        newxrsa = xrsa.byteswap().newbyteorder()
        newxrsb = xrsb.byteswap().newbyteorder()

        data = DataFrame({'xrsa': newxrsa, 'xrsb': newxrsb}, index=times)
        data.sort(inplace=True)
        return header, data
Example #41
def compile_predictions(pred):
	'''
	groups predictions made on patches of an image into a set of labels and confidences

	Args:
		pred (array-like):
			output from call to [some sklearn model].predict

	Returns:
		DataFrame: compiled predictions
	'''
	data = DataFrame()
	data['yhat'] = pred
	data['confidence'] = 1.0
	data = data.groupby('yhat').agg(lambda x: x.sum() / data.shape[0])
	data.sort('confidence', ascending=False, inplace=True)
	data['label'] = data.index
	data.reset_index(drop=True, inplace=True)
	return data
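A short usage sketch with hypothetical patch predictions shows the shape of the result:

# Hypothetical usage: predicted labels for four patches of one image
pred = ['cat', 'cat', 'cat', 'dog']
print(compile_predictions(pred))
#    confidence label
# 0        0.75   cat
# 1        0.25   dog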
Example #42
 def _search_by_name_fuzzy(self, name):
     matches = difflib.get_close_matches(name,
                                         self.data_frame.name.dropna(),
                                         n=5,
                                         cutoff=.8)
     ranks = dict([(match, i) for i, match in enumerate(matches)])
     selection = DataFrame(
         self.data_frame[self.data_frame.name.isin(matches)])
     selection['search_rank'] = selection.name.map(ranks)
     return selection.sort('search_rank')
Example #43
class WaferRun:

    def __init__(self, run_id, wafer_id, label, measurements):
        self.run_id = int(run_id)
        self.wafer_id = int(wafer_id)
        self.label = int(label)
        self.measurements = DataFrame(measurements)
        self.measurements.sort(axis=1, inplace=True)
        self.measurements.sort_index(inplace=True)
    
    @staticmethod
    def from_files(path, run_id, wafer_id):
        fn_base = os.path.join(path, '{0}_{1:02}'.format(run_id, wafer_id))
        
        try:
            df = DataFrame({11: DataFrame.from_csv(fn_base + '.11', header=None, sep='\t', index_col=None, parse_dates=False)[1],
                            12: DataFrame.from_csv(fn_base + '.12', header=None, sep='\t', index_col=None, parse_dates=False)[1],
                            15: DataFrame.from_csv(fn_base + '.15', header=None, sep='\t', index_col=None, parse_dates=False)[1],
                            6: DataFrame.from_csv(fn_base + '.6', header=None, sep='\t', index_col=None, parse_dates=False)[1],
                            7: DataFrame.from_csv(fn_base + '.7', header=None, sep='\t', index_col=None, parse_dates=False)[1],
                            8: DataFrame.from_csv(fn_base + '.8', header=None, sep='\t', index_col=None, parse_dates=False)[1]})
        except:
            return None
        
        m = re.search('/(normal|abnormal)', path)
        if m is None:
            return None
    
        label = 1 if m.group(1) == 'abnormal' else -1
        
        return WaferRun(run_id, wafer_id, label, df)
    
    def as_nparray(self):
        """Spits out data as a T x D numpy.array (T=# samples, D=# variables)

        Notes:
        Notice what we do here: we start with a pandas.DataFrame where each channel
        is a column (so you can think of it as a T x D matrix). We first rename the
        columns to channel numbers, then sort the columns, then sort the index, then
        transform to numpy.array.
        """
        return self.measurements.sort(axis=1).sort_index().reset_index().as_matrix().astype(float)
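The note in the docstring above (rename the columns to channel numbers, sort the columns, sort the index, convert to an array) is easy to reproduce on a toy frame. A sketch using the modern sort_index spelling in place of the deprecated sort(axis=1):

# Toy illustration of the rename -> sort columns -> sort index -> array pipeline
# described in as_nparray(); sort_index replaces the deprecated sort() here.
import pandas as pd

df = pd.DataFrame({'ch8': [0.10, 0.20], 'ch6': [1.0, 2.0]}, index=[1, 0])
df.columns = [8, 6]                        # rename columns to channel numbers
arr = (df.sort_index(axis=1)               # sort the columns
         .sort_index()                     # sort the index (rows)
         .reset_index()
         .values.astype(float))            # T x (1 + D) numpy array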
Example #44
def ip_requests(log_list, count_dict) :
    '''
    Pandas applications
    '''
    df = DataFrame(log_list)
    ips = df.groupby('clientip').size()
    ips.sort()
    ips_fd = DataFrame({'Number of requests':ips[-10:]})
    ips_fd = ips_fd.sort(columns='Number of requests',  ascending=False)
    count_dict['ips_fd'] = ips_fd
    return count_dict
Example #45
    def __init__(self, column, baseline, adjustments=None):
        self.column = column
        self.baseline = baseline.values
        self.dates = baseline.index
        self.assets = baseline.columns

        if adjustments is None:
            adjustments = DataFrame(
                index=DatetimeIndex([]),
                columns=ADJUSTMENT_COLUMNS,
            )
        else:
            # Ensure that columns are in the correct order.
            adjustments = adjustments.reindex_axis(ADJUSTMENT_COLUMNS, axis=1)
            adjustments.sort(['apply_date', 'sid'], inplace=True)

        self.adjustments = adjustments
        self.adjustment_apply_dates = DatetimeIndex(adjustments.apply_date)
        self.adjustment_end_dates = DatetimeIndex(adjustments.end_date)
        self.adjustment_sids = Int64Index(adjustments.sid)
Example #46
 def _search_by_inchi_fuzzy(self, inchi):
     # TODO: use openbabel if available
     matches = difflib.get_close_matches(inchi,
                                         self.data_frame.InChI.dropna(),
                                         n=5,
                                         cutoff=.8)
     ranks = dict([(match, i) for i, match in enumerate(matches)])
     selection = DataFrame(
         self.data_frame[self.data_frame.InChI.isin(matches)])
     selection['search_rank'] = selection.name.map(ranks)
     return selection.sort('search_rank')
Example #47
def word_freq(file_name, suffix='_wordfreq', sep='\t', threshold=.5):
	print "start word_freq"
	# start = datetime.datetime.now()
	# print start
	reviews = pd.read_csv(file_name, error_bad_lines=False, sep=sep)
	cb = reviews['stopword_body']
	rate = reviews['Rating']
	# label all words with the rating
	cb_temp = []
	for i, c in enumerate(cb):
		cb_temp.append([(w, rate[i]) for w in ast.literal_eval(c)])
	reviews['stopword_body'] = cb_temp
	# calculate_time(start)
	# get the corpus of all reviews, lists of all words with label
	'''--------------------------------------------------------'''
	cop_wl = []
	for b in cb_temp:
		# change the unicode data to the raw string
		# cop_wl += [(unicodedata.normalize('NFKD', w[0]).encode('utf-8','replace'), w[1]) for w in b if type(w[0])==unicode]
		cop_wl += b
	'''--------------------------------------------------------'''
	# calculate_time(start)
	# word frequency of the corpus with label
	wfq = nltk.FreqDist(cop_wl)
	# calculate_time(start)
	# get the word list of all reviews without label
	cop = [w[0] for w in cop_wl]
	cop = set(cop)
	cop_len = len(cop)
	# calculate_time(start)
	# get freq of all words in one list
	wfq_l = []
	for w in cop:
		for i in range(1, 6):
			wfq_l.append(wfq[(w, i)])

	# calculate_time(start)
	# reshape the list to a matrix
	wfq_mx = DataFrame(np.array(wfq_l).reshape((cop_len,5)), index=pd.Index(cop), columns=pd.Index([1,2,3,4,5]))
	# calculate_time(start)
	# calculate the prob of each rating
	w_s = []
	w_sum = []
	for i, r in wfq_mx.iterrows():
		word_sum = wfq_mx.ix[i].sum()
		# wfq_mx.ix[i] = wfq_mx.ix[i]/word_sum
		w_s.append(word_useful_score(list(wfq_mx.ix[i]), word_sum))
		w_sum.append(word_sum)

	wfq_mx['score'] = w_s
	wfq_mx['sum'] = w_sum
	wfq_mx = wfq_mx.sort(columns='sum').ix[-int(len(w_s) * threshold):,:]
	print wfq_mx
	wfq_mx.to_csv(file_name.split('.')[0] + suffix + '.' + file_name.split('.')[1], sep='\t')
Example #48
 def list_stock_info(self, stock_list):
     keys = stock_list.keys()
     data = DataFrame(self.foundmental_data, index=keys)
     data = data.sort(columns='earn_ratio', ascending=False)   
     for code in data.index:
         gross_profit_rate = None   # default value
         if code in self.profit_data.index:
             gross_profit_rate = self.profit_data.ix[code]['gross_profit_rate']
         # 2. filter out stocks whose gross profit margin is too low
         if gross_profit_rate and gross_profit_rate < 15:
             continue
         self.print_stock_info(code, data.ix[code],gross_profit_rate)
Example #49
    def homer_to_narrow_peaks(self, data, output_file):
        '''
        Given a Homer peak dataframe, extract necessary columns and convert
        to a narrowPeak file. From the IDR package description:
        
            NarrowPeak files are in BED6+4 format. It consists of 10 tab-delimited columns
    
            1.chrom     string     Name of the chromosome
            2.chromStart     int     The starting position of the feature in the chromosome. The first base in a chromosome is numbered 0.
            3.chromEnd     int     The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the   feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
            4.name     string     Name given to a region (preferably unique). Use '.' if no name is assigned
            5.score     int     Indicates how dark the peak will be displayed in the browser (1-1000). If '0', the DCC will assign this based on signal value.         Ideally average signalValue per base spread between 100-1000.
            6.strand     char     +/- to denote strand or orientation (whenever applicable). Use '.' if no orientation is assigned.
            7.signalValue     float     Measurement of overall (usually, average) enrichment for the region.
            8.pValue     float     Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
            9.qValue     float     Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
            10.peak     int     Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.
        
        '''

        # We don't want to require p-value, as Homer doesn't always output it.
        # Prep it here if it exists, or substitute tag count.
        pval_col = self.get_first_column(data,
                                         self.p_value_columns,
                                         required=False)
        if pval_col is not None:
            pvals = -np.log10(pval_col)
        else:
            pvals = [-1] * data.shape[0]

        columns = OrderedDict((
            ('chrom',
             self.get_first_column(data, ['chr', 'chrom', 'chromosome'])),
            ('chromStart', self.get_first_column(data,
                                                 ['chromStart', 'start'])),
            ('chromEnd', self.get_first_column(data, ['chromEnd', 'end'])),
            ('name',
             self.get_first_column(data, ['#PeakID', 'PeakID', 'ID', 'name'])),
            ('score', Series([0] * data.shape[0])
             ),  # Leave zero so that signalValue column is used
            ('strand', self.get_first_column(data, ['strand'])),
            ('signalValue', self.get_first_column(data,
                                                  self.tag_count_columns)),
            ('pValue', pvals),  # P-value if it exists, or tag count
            ('qValue', Series([-1] * data.shape[0])
             ),  # Leave -1 as no individual FDR is called for each peak
            ('peak', Series([-1] * data.shape[0])
             ),  # Leave -1 as no point-source is called for each peak
        ))
        df = DataFrame(columns)
        df = df.sort(['signalValue', 'pValue'], ascending=False)
        df.to_csv(output_file, sep='\t', header=False, index=False)
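
The method above depends on helpers from its class (get_first_column and the p_value_columns / tag_count_columns lists), so it is not runnable in isolation. A minimal standalone sketch of the same conversion, using a toy Homer-style table with made-up column names, and sort_values in place of the deprecated DataFrame.sort:

from collections import OrderedDict

import numpy as np
from pandas import DataFrame, Series

# Toy Homer-style peak table; the column names here are only illustrative.
homer = DataFrame({'chr': ['chr1', 'chr1', 'chr2'],
                   'start': [100, 500, 42],
                   'end': [200, 650, 142],
                   'PeakID': ['peak-1', 'peak-2', 'peak-3'],
                   'strand': ['+', '-', '+'],
                   'tag_count': [35.0, 80.0, 12.0],
                   'p_value': [1e-5, 1e-9, 1e-2]})

n = homer.shape[0]
narrow = DataFrame(OrderedDict((
    ('chrom', homer['chr']),
    ('chromStart', homer['start']),
    ('chromEnd', homer['end']),
    ('name', homer['PeakID']),
    ('score', Series([0] * n)),        # 0 so the browser derives display from signalValue
    ('strand', homer['strand']),
    ('signalValue', homer['tag_count']),
    ('pValue', -np.log10(homer['p_value'])),
    ('qValue', Series([-1] * n)),      # -1: no per-peak FDR is reported
    ('peak', Series([-1] * n)),        # -1: no point-source call
)))
# sort_values replaces the deprecated DataFrame.sort used above
narrow = narrow.sort_values(['signalValue', 'pValue'], ascending=False)
narrow.to_csv('peaks.narrowPeak', sep='\t', header=False, index=False)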
Esempio n. 50
0
    def merge_variables(self, to_merge):
        """Merges time series variables into new time series variables.

        :param to_merge: dictionary mapping new variable name to list of variables to be merged.
        :return:
        """
        dold = self._data.copy()
        s = Series(data=np.zeros((dold.shape[0], )),
                   index=dold.index).replace(0, np.nan)
        dnew = DataFrame(
            dict([(k, s) for k in to_merge.keys()
                  if len(set(to_merge[k]).intersection(dold.columns)) > 0]))
        for newvar in dnew.columns:
            for oldvar in to_merge[newvar]:
                if oldvar in dold.columns:
                    dnew[newvar][dold[oldvar].notnull()] = dold[oldvar][
                        dold[oldvar].notnull()]
                    del dold[oldvar]
        dnew = dnew.join(dold, how='outer')
        dnew.sort(axis=1, inplace=True)
        dnew.sort(axis=0, inplace=True)
        self._data = dnew
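
merge_variables coalesces several source columns into one new column, filling the new column wherever each source has data, dropping the sources, and re-sorting rows and columns. A rough standalone sketch of the same idea on a hypothetical frame, with sort_index standing in for the deprecated sort(axis=...) calls:

import numpy as np
from pandas import DataFrame

# Hypothetical frame: 'temp_a' and 'temp_b' record the same quantity from two sources.
old = DataFrame({'temp_a': [1.0, np.nan, np.nan, 4.0],
                 'temp_b': [np.nan, 2.0, 3.0, np.nan],
                 'other': [10, 20, 30, 40]})
to_merge = {'temp': ['temp_a', 'temp_b']}

new = old.copy()
for newvar, oldvars in to_merge.items():
    new[newvar] = np.nan                               # start from an all-NaN target column
    for oldvar in oldvars:
        mask = new[oldvar].notnull()
        new.loc[mask, newvar] = new.loc[mask, oldvar]  # fill from each source where it has data
        new = new.drop(columns=oldvar)
new = new.sort_index(axis=1).sort_index(axis=0)        # modern stand-in for sort(axis=...)
print(new['temp'].tolist())                            # [1.0, 2.0, 3.0, 4.0]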
Esempio n. 51
0
 def list_stock_info(self, stock_list):
     keys = stock_list.keys()
     data = DataFrame(self.foundmental_data, index=keys)
     data = data.sort(columns='earn_ratio', ascending=False)
     for code in data.index:
         gross_profit_rate = None  # default value
         if code in self.profit_data.index:
             gross_profit_rate = self.profit_data.ix[code][
                 'gross_profit_rate']
         # 2. skip stocks whose gross profit rate is too low
         if gross_profit_rate and gross_profit_rate < 15:
             continue
         self.print_stock_info(code, data.ix[code], gross_profit_rate)
Esempio n. 52
0
def _assert_matrix_of_thesaurus_c_is_as_expected(matrix, rows, cols):
    # rows may come in any order
    assert set(rows) == set(['g/N', 'a/N', 'd/J', 'b/V', 'a/J_b/N'])
    # columns must be in alphabetical order
    assert cols == ['a/N', 'b/V', 'd/J', 'g/N', 'x/X']
    # test the vectors for each entry
    expected_matrix = np.array([
        [0.1, 0., 0.2, 0.8, 0.],  # ab
        [0., 0.1, 0.5, 0.3, 0.],  # a
        [0.1, 0., 0.3, 0.6, 0.],  # b
        [0.5, 0.3, 0., 0.7, 0.],  # d
        [0.3, 0.6, 0.7, 0., 0.9]  # g
    ])
    # put the rows in the matrix in the order in which they are in expected_matrix
    matrix_ordered_by_rows = matrix[np.argsort(np.array(rows)), :]
    assert_array_equal(matrix_ordered_by_rows, expected_matrix)

    vec_df = DataFrame(matrix, columns=cols, index=rows)
    from pandas.util.testing import assert_frame_equal

    expected_frame = DataFrame(expected_matrix, index=['a/J_b/N', 'a/N', 'b/V', 'd/J', 'g/N'], columns=cols)
    assert_frame_equal(vec_df.sort(axis=0), expected_frame.sort(axis=0))
Esempio n. 53
0
    def expandVocab(self, docs):
        print 'expanding vocabulary...'
        freqCounts = self.countTokens(docs)

        tokenList = []
        freqCountList = []
        for token in freqCounts:
            tokenList.append(token)
            freqCountList.append(freqCounts[token])

        expTokenDf = DataFrame({
            'tokens': tokenList,
            'freqCounts': freqCountList
        })
        expTokenDf = expTokenDf.sort('freqCounts', ascending=False)
        expandableTokensFiltered = set(
            expTokenDf['tokens'][2000:3000]).difference(ENGLISH_STOP_WORDS)
        batchSize = 10000
        print "%d filtered tokens chosen" % len(expandableTokensFiltered)
        print "Expandable tokens: "
        print expandableTokensFiltered
        newDocs = []
        for i in xrange(0, len(docs)):
            doc = docs[i]
            newDocSplit = doc.split()
            tokenList = doc.split(' ')
            start = 0
            newTokens = set()
            while start < len(tokenList):
                stop = start + batchSize
                tokens = set(tokenList[start:stop])
                start = start + batchSize / 2
                tokensToExpand = tokens.intersection(expandableTokensFiltered)
                newTokens = newTokens.union(
                    self.expandVocabFromSet(tokensToExpand))

            newDocSplit.extend(list(newTokens))
            newDoc = ''
            for token in newDocSplit:
                newDoc += ' ' + token + ' '
            newDocs.append(newDoc)

            if i % 500 == 0:
                print '\nprocessed %d docs' % i
                print '%d new tokens added to document' % len(newTokens)
                print 'new tokens:'
                print newTokens
                print len(tokens)

        return newDocs
Esempio n. 54
0
    def _parse_fits(filepath):
        """Parses a GOES FITS file from
        http://umbra.nascom.nasa.gov/goes/fits/"""
        fits = pyfits.open(filepath)
        header = fits[0].header
        if len(fits) == 4:
            if is_time_in_given_format(fits[0].header['DATE-OBS'], '%d/%m/%Y'):
                start_time = datetime.datetime.strptime(fits[0].header['DATE-OBS'], '%d/%m/%Y')
            elif is_time_in_given_format(fits[0].header['DATE-OBS'], '%d/%m/%y'):
                start_time = datetime.datetime.strptime(fits[0].header['DATE-OBS'], '%d/%m/%y')
            else:
                raise ValueError("Date not recognized")
            xrsb = fits[2].data['FLUX'][0][:, 0]
            xrsa = fits[2].data['FLUX'][0][:, 1]
            seconds_from_start = fits[2].data['TIME'][0]
        elif 1 <= len(fits) <= 3:
            start_time = parse_time(header['TIMEZERO'])
            seconds_from_start = fits[0].data[0]
            xrsb = fits[0].data[1]
            xrsa = fits[0].data[2]
        else:
            raise ValueError("Don't know how to parse this file")

        times = [start_time + datetime.timedelta(seconds=int(floor(s)),
                                                 microseconds=int((s - floor(s)) * 1e6)) for s in seconds_from_start]

        # remove bad values as defined in header comments
        xrsb[xrsb == -99999] = nan
        xrsa[xrsa == -99999] = nan

        # fix byte ordering
        newxrsa = xrsa.byteswap().newbyteorder()
        newxrsb = xrsb.byteswap().newbyteorder()

        data = DataFrame({'xrsa': newxrsa, 'xrsb': newxrsb}, index=times)
        data.sort(inplace=True)
        return header, data
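
One step worth noting is the byte-order fix: FITS stores numeric data big-endian, while downstream NumPy/pandas code expects native byte order. A minimal sketch of that conversion, written without ndarray.newbyteorder (which newer NumPy drops); the final data.sort(inplace=True) corresponds to data.sort_index(inplace=True) on modern pandas:

import numpy as np

# FITS numeric data arrives big-endian ('>f4'); convert to native byte order
# before building the DataFrame, mirroring xrsa.byteswap().newbyteorder() above.
big_endian = np.array([1.0, 2.0, 3.0], dtype='>f4')
native = big_endian.byteswap().view(big_endian.dtype.newbyteorder())
print(native, native.dtype)   # same values, little-endian float32 on typical hardware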
Esempio n. 55
0
def p_adjust(p, method):
	if method == 'bonferroni':
		return np.minimum(p*len(p), 1)
	if method == 'holm':
		temp = DataFrame({'p': p})
		temp.sort(columns='p', inplace=True)
		temp['newID'] = range(1, len(temp)+1)
		temp['p_adj'] = np.minimum(temp['p'] * (1 + len(temp) - temp['newID']), 1)
		temp.sort(inplace=True)
		return temp['p_adj']
	if method == 'fdr':
		temp = DataFrame({'p': p})
		temp.sort(columns='p', inplace=True, ascending=False)
		temp['newID'] = range(1, len(temp)+1)
		temp['p_adj'] = np.minimum(1, len(temp)/temp['newID'] * temp['p'])
		temp.sort(inplace=True)
		return np.round(temp['p_adj'], 3)
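
Assuming the p_adjust function above is in scope, the Bonferroni branch is easy to verify by hand: each p-value is multiplied by the number of tests and capped at 1 (the Holm and FDR branches additionally need each p-value's rank, which is what the temporary DataFrame and its sort calls compute).

import numpy as np

p = np.array([0.01, 0.04, 0.03, 0.2])
print(p_adjust(p, 'bonferroni'))   # -> 0.04, 0.16, 0.12, 0.8 (each value * 4, capped at 1)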
Esempio n. 56
0
def split_big_dframe(finalhhframe, hhcat):
    a = finalhhframe[['weight', 'reg02'
                      ]].groupby('reg02').apply(lambda x: x['weight'].count())
    b = DataFrame(a, columns=['count'])
    c = b.sort(columns=['count'], ascending=False)
    bool1 = c.cumsum() < c.cumsum()['count'].iloc[-1] / 2
    list_reg = list(c[bool1].dropna().index)
    finalhhframe1 = finalhhframe.ix[finalhhframe['reg02'].isin(list_reg), :]
    finalhhframe2 = finalhhframe.ix[~finalhhframe['reg02'].isin(list_reg), :]
    int_columns = ['children', 'old', 'decile'] + [
        'cat{}workers'.format(thecat) for thecat in hhcat['hhcat'].unique()
    ]
    finalhhframe1 = merges_rows_bis(int_columns, finalhhframe1)
    finalhhframe2 = merges_rows_bis(int_columns, finalhhframe2)
    return finalhhframe1, finalhhframe2
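
The splitting criterion is easier to see on toy numbers: regions are sorted by household count, and those whose running total stays below half of the grand total form the first split. A sketch with hypothetical region names:

from pandas import DataFrame

# Hypothetical per-region counts, already sorted in descending order.
c = DataFrame({'count': [40, 30, 20, 10]}, index=['north', 'east', 'south', 'west'])
bool1 = c.cumsum() < c.cumsum()['count'].iloc[-1] / 2   # cumulative counts 40, 70, 90, 100 vs 50
list_reg = list(c[bool1].dropna().index)
print(list_reg)   # ['north'] -- the regions holding just under half of the households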
Esempio n. 57
0
def draw_feature_importance(train_x,clf):
    feature_names = train_x.columns
    feature_importance = clf.feature_importances_
    df = DataFrame({'feature_names':feature_names,'feature_importances':feature_importance})
    df1 = df.sort(columns='feature_importances',ascending=False)
    df1.index = [i for i in range(len(df1))]
    fig = plt.figure(num=random.randint(1,10000))
    ax = fig.add_subplot(111) 
        
    ax.set_xticks([i for i in range(len(df.feature_names))]) 
    ax.set_xticklabels(df1.feature_names,rotation=-90)
    ax.grid()
    ax.plot(df1.feature_importances,label='feature_importance')
    plt.subplots_adjust(bottom=0.2)
    return df1
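
The core of the function is just pairing train_x.columns with clf.feature_importances_ and sorting descending. A standalone sketch with modern pandas (sort_values instead of the removed sort) and scikit-learn, both assumed to be available:

from pandas import DataFrame
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

iris = load_iris()
train_x = DataFrame(iris.data, columns=iris.feature_names)
clf = RandomForestClassifier(n_estimators=50, random_state=0).fit(train_x, iris.target)

imp = DataFrame({'feature_names': train_x.columns,
                 'feature_importances': clf.feature_importances_})
imp = imp.sort_values('feature_importances', ascending=False).reset_index(drop=True)
print(imp)   # one row per feature, most important first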