Example #1
def test():
    """DataFrame editor test"""
    from numpy import nan

    df1 = DataFrame([
                     [True, "bool"],
                     [1+1j, "complex"],
                     ['test', "string"],
                     [1.11, "float"],
                     [1, "int"],
                     [np.random.rand(3, 3), "Unknown type"],
                     ["Large value", 100],
                     ["áéí", "unicode"]
                    ],
                    index=['a', 'b', nan, nan, nan, 'c',
                           "Test global max", 'd'],
                    columns=[nan, 'Type'])
    out = test_edit(df1)
    print("out:", out)
    out = test_edit(df1.iloc[0])
    print("out:", out)
    df1 = DataFrame(np.random.rand(100001, 10))
    # Sorting large DataFrame takes time
    df1.sort(columns=[0, 1], inplace=True)
    out = test_edit(df1)
    print("out:", out)
    out = test_edit(TimeSeries(np.arange(10)))
    print("out:", out)
    return out
Example #2
def test():
    """DataFrame editor test"""
    from numpy import nan
    from pandas.util.testing import assert_frame_equal, assert_series_equal

    df1 = DataFrame([
                     [True, "bool"],
                     [1+1j, "complex"],
                     ['test', "string"],
                     [1.11, "float"],
                     [1, "int"],
                     [np.random.rand(3, 3), "Unknown type"],
                     ["Large value", 100],
                     ["áéí", "unicode"]
                    ],
                    index=['a', 'b', nan, nan, nan, 'c',
                           "Test global max", 'd'],
                    columns=[nan, 'Type'])
    out = test_edit(df1)
    assert_frame_equal(df1, out)

    result = Series([True, "bool"], index=[nan, 'Type'], name='a')
    out = test_edit(df1.iloc[0])
    assert_series_equal(result, out)

    # Sorting large DataFrame takes time
    df1 = DataFrame(np.random.rand(100100, 10))
    df1.sort(columns=[0, 1], inplace=True)
    out = test_edit(df1)
    assert_frame_equal(out, df1)

    series = Series(np.arange(10), name=0)
    out = test_edit(series)
    assert_series_equal(series, out)
Example #3
    def calculate_top_10_solutions(self):
        '''calculate all schemes and select the top 10 solutions'''
        
        columns = ['name','rate','money']

        if isfile(learning_progres_csv):
            scheme_profit = read_csv(learning_progres_csv)
        else:
            scheme_profit = DataFrame(columns=columns)
        scheme_profit.set_index('name', inplace=True)

        with open(learning_progres_csv, 'w+') as csvfile:
            writer = csv.DictWriter(csvfile, delimiter=',', fieldnames=columns)
            writer.writeheader()
            csvfile.flush()
            for sc in self.generate_all_schemes():
                if sc.name not in scheme_profit.index:
                    e = evaluator(sc)
                    rate, money = e.calculate()
                    scheme_profit.ix[sc.name] = rate, money
                    writer.writerow({'name': sc.name, 'rate': rate, 'money': money})
                    csvfile.flush()
                    if self.log:
                        print(sc.name + ' - ' + str(money) + ' \t rate = ' + str(rate))
                else:
                    writer.writerow({'name':sc.name,'rate':scheme_profit.rate[sc.name],'money':scheme_profit.money[sc.name]})
                    if self.log:
                        print(sc.name + ' - ' + str(scheme_profit.money[sc.name]) + ' \t rate = ' + str(scheme_profit.rate[sc.name]))
                    csvfile.flush()

        # TODO: write into scheme
        scheme_profit = scheme_profit.sort(['money'], ascending=False)
        return scheme_profit[:10].to_dict()
Example #4
    def test_sort(self):
        frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"])

        # 9816 deprecated
        with tm.assert_produces_warning(FutureWarning):
            frame.sort(columns="A")
        with tm.assert_produces_warning(FutureWarning):
            frame.sort()
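The FutureWarning asserted above reflects the pandas 0.17 deprecation of DataFrame.sort (the "9816" in the comment is the pandas issue number). As a minimal sketch, assuming pandas >= 0.17, the old method splits into two modern equivalents:

import numpy as np
from pandas import DataFrame

frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
                  columns=["A", "B", "C", "D"])
by_column = frame.sort_values(by="A")  # replaces frame.sort(columns="A")
by_index = frame.sort_index()          # replaces frame.sort()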
Example #5
    def test_sort(self):
        frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
                          columns=['A', 'B', 'C', 'D'])

        # 9816 deprecated
        with tm.assert_produces_warning(FutureWarning):
            frame.sort(columns='A')
        with tm.assert_produces_warning(FutureWarning):
            frame.sort()
Example #6
 def get_runs_df(self):
     """
     Return the whole table as a DataFrame, sorted with the most recent entry at the bottom (ascending order).
     """
     df = DataFrame([{k: v for k, v in r.items()} for r in self.table.scan()])
     if df.empty:
         return df
     else:
         df.sort(columns=['dt'], ascending=True, inplace=True)
         # force df to have columns in this order
         return df[['dt', 'start', 'end']]
Example #7
def filter_tags(tag_pickle='results/material_tags.pickle', exclude_tags='results/exclude.csv', n=50):
    exclude_words, duplicate_sets = load_filter_tags(exclude_tags)
    with open(tag_pickle, 'r') as f:
        t = DataFrame(pickle.load(f)['result']).set_index('_id')
    for setn in duplicate_sets:
        t.ix[setn[0]] += sum(map(lambda x: t.ix[x], setn[1:]))
        for tag in setn[1:]:
            t.drop(tag, inplace=True)
    for tag in exclude_words:
        t.drop(tag, inplace=True)
    t = t.sort(ascending=False)
    return t[:n].index
Example #8
def calculate_accuracy_by_category(y_test, predicted):
    """Calculates the per-category accuracy of a classifier's predictions.
    Parameters:
        y_test (array): the true labels
        predicted (array): the predicted labels
    Returns:
        A Series with the accuracy (in percent) for each category"""

    df = DataFrame({'Target': y_test, 'Predicted': predicted})
    df['Score'] = df.Target == df.Predicted
    df = df.groupby('Target').apply(lambda x: 100.0 * sum(x.Score) / len(x))
    df.sort()
    return df
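A toy call to the helper above, with made-up labels, behaves like this:

# One of the two 'cat' rows is predicted correctly and both 'dog' rows are,
# so the result maps cat -> 50.0 and dog -> 100.0.
y_test = ['cat', 'cat', 'dog', 'dog']
predicted = ['cat', 'dog', 'dog', 'dog']
print(calculate_accuracy_by_category(y_test, predicted))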
Example #9
def test_outer_join():
    left = [(1, 'Alice', 100),
            (2, 'Bob', 200),
            (4, 'Dennis', 400)]
    left = DataFrame(left, columns=['id', 'name', 'amount'])

    right = [('NYC', 1),
             ('Boston', 1),
             ('LA', 3),
             ('Moscow', 4)]
    right = DataFrame(right, columns=['city', 'id'])

    lsym = symbol('lsym', 'var * {id: int, name: string, amount: real}')
    rsym = symbol('rsym', 'var * {city: string, id: int}')

    convert = lambda df: set(df.to_records(index=False).tolist())

    assert (convert(compute(join(lsym, rsym), {lsym: left, rsym: right})) ==
            set([(1, 'Alice', 100, 'NYC'),
                 (1, 'Alice', 100, 'Boston'),
                 (4, 'Dennis', 400, 'Moscow')]))

    assert (convert(compute(join(lsym, rsym, how='left'),
                            {lsym: left, rsym: right})) ==
            set([(1, 'Alice', 100, 'NYC'),
                 (1, 'Alice', 100, 'Boston'),
                 (2, 'Bob', 200, np.nan),
                 (4, 'Dennis', 400, 'Moscow')]))

    df = compute(join(lsym, rsym, how='right'), {lsym: left, rsym: right})
    expected = DataFrame([(1., 'Alice', 100., 'NYC'),
                          (1., 'Alice', 100., 'Boston'),
                          (3., np.nan, np.nan, 'LA'),
                          (4., 'Dennis', 400., 'Moscow')],
                         columns=['id', 'name', 'amount', 'city'])

    result = df.sort('id').to_records(index=False)
    expected = expected.sort('id').to_records(index=False)
    assert np.array_equal(result, expected)

    df = compute(join(lsym, rsym, how='outer'), {lsym: left, rsym: right})
    expected = DataFrame([(1., 'Alice', 100., 'NYC'),
                          (1., 'Alice', 100., 'Boston'),
                          (2., 'Bob', 200., np.nan),
                          (3., np.nan, np.nan, 'LA'),
                          (4., 'Dennis', 400., 'Moscow')],
                         columns=['id', 'name', 'amount', 'city'])

    result = df.sort('id').to_records(index=False)
    expected = expected.sort('id').to_records(index=False)
    assert np.array_equal(result, expected)
Example #10
def test_outer_join():
    left = [(1, 'Alice', 100),
            (2, 'Bob', 200),
            (4, 'Dennis', 400)]
    left = DataFrame(left, columns=['id', 'name', 'amount'])

    right = [('NYC', 1),
             ('Boston', 1),
             ('LA', 3),
             ('Moscow', 4)]
    right = DataFrame(right, columns=['city', 'id'])

    L = symbol('L', 'var * {id: int, name: string, amount: real}')
    R = symbol('R', 'var * {city: string, id: int}')

    convert = lambda df: set(df.to_records(index=False).tolist())

    assert convert(compute(join(L, R), {L: left, R: right})) == set(
            [(1, 'Alice', 100, 'NYC'),
             (1, 'Alice', 100, 'Boston'),
             (4, 'Dennis', 400, 'Moscow')])

    assert convert(compute(join(L, R, how='left'), {L: left, R: right})) == set(
            [(1, 'Alice', 100, 'NYC'),
             (1, 'Alice', 100, 'Boston'),
             (2, 'Bob', 200, np.nan),
             (4, 'Dennis', 400, 'Moscow')])

    df = compute(join(L, R, how='right'), {L: left, R: right})
    expected = DataFrame(
            [(1., 'Alice', 100., 'NYC'),
             (1., 'Alice', 100., 'Boston'),
             (3., np.nan, np.nan, 'LA'),
             (4., 'Dennis', 400., 'Moscow')],
            columns=['id', 'name', 'amount', 'city'])

    assert str(df.sort('id').to_records(index=False)) ==\
            str(expected.sort('id').to_records(index=False))

    df = compute(join(L, R, how='outer'), {L: left, R: right})
    expected = DataFrame(
            [(1., 'Alice', 100., 'NYC'),
             (1., 'Alice', 100., 'Boston'),
             (2., 'Bob', 200., np.nan),
             (3., np.nan, np.nan, 'LA'),
             (4., 'Dennis', 400., 'Moscow')],
            columns=['id', 'name', 'amount', 'city'])

    assert str(df.sort('id').to_records(index=False)) ==\
            str(expected.sort('id').to_records(index=False))
Example #11
    def homer_to_narrow_peaks(self, data, output_file):
        '''
        Given a Homer peak dataframe, extract necessary columns and convert
        to a broadPeak file. From the IDR package description:
        
            NarrowPeak files are in BED6+4 format. It consists of 10 tab-delimited columns
    
            1.chrom     string     Name of the chromosome
            2.chromStart     int     The starting position of the feature in the chromosome. The first base in a chromosome is numbered 0.
            3.chromEnd     int     The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the   feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
            4.name     string     Name given to a region (preferably unique). Use '.' if no name is assigned
            5.score     int     Indicates how dark the peak will be displayed in the browser (1-1000). If '0', the DCC will assign this based on signal value.         Ideally average signalValue per base spread between 100-1000.
            6.strand     char     +/- to denote strand or orientation (whenever applicable). Use '.' if no orientation is assigned.
            7.signalValue     float     Measurement of overall (usually, average) enrichment for the region.
            8.pValue     float     Measurement of statistical signficance (-log10). Use -1 if no pValue is assigned.
            9.qValue     float     Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
            10.peak     int     Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.
        
        '''

        columns = OrderedDict((
            ('chrom', self.get_first_column(data, ['chr','chrom', 'chromosome'])),
            ('chromStart', self.get_first_column(data, ['chromStart','start'])),
            ('chromEnd', self.get_first_column(data, ['chromEnd','end'])),
            ('name', self.get_first_column(data, ['#PeakID','PeakID','ID','name'])),
            ('score', Series([0]*data.shape[0])), # Leave zero so that signalValue column is used
            ('strand', self.get_first_column(data, ['strand'])),       
            ('signalValue', self.get_first_column(data, self.tag_count_columns)),
            ('pValue', -np.log10(self.get_first_column(data, self.p_value_columns))),
            ('qValue', Series([-1]*data.shape[0])), # Leave -1 as no individual FDR is called for each peak
            ('peak', Series([-1]*data.shape[0])), # Leave -1 as no point-source is called for each peak
            ))
        df = DataFrame(columns)
        df = df.sort(['signalValue','pValue'], ascending=False)
        df.to_csv(output_file, sep='\t', header=False, index=False)
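For orientation, a single record in the ten-column narrowPeak layout described in the docstring could look like the following tab-separated line (all values are illustrative, not from the original source):

# chrom  chromStart  chromEnd  name   score  strand  signalValue  pValue  qValue  peak
chr1     9356548     9356648   peak1  0      .       182.4        5.07    -1      -1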
Example #12
def upsert_unique_indices(apps, schema_editor):
    datapoint_values_list = ['id','created_at','indicator_id','location_id','campaign_id','data_date']
    historical_dps = DataFrame(list(DataPoint.objects.filter(unique_index = -1)\
        .values_list('id','created_at','indicator_id','location_id','campaign_id','data_date')), columns=datapoint_values_list)
    # create the unique index
    historical_dps = historical_dps.apply(add_unique_index, axis=1)

    # group by and max on created at, get the most recent upload
    historical_dps = historical_dps.sort("created_at", ascending=False).groupby("unique_index", as_index=False).first()

    # get the ids into a list and select them
    dps_to_update = DataPoint.objects.filter(id__in=list(historical_dps['id']))
    print('dps to update')
    print(len(dps_to_update))
    # then run a query and update each
    for dp in dps_to_update:
        unique_index = historical_dps[historical_dps['id'] == dp.id].iloc[0]['unique_index']
        dp.unique_index = unique_index
        dp.save()
    
    # delete all the other duplicates
    dps_to_delete = DataPoint.objects.filter(unique_index=-1)
    print('dps_to_delete')
    print(len(dps_to_delete))
    dps_to_delete.delete()
Example #13
    def predict(self, tree):
        """
        TODO Should take an array and predict every item. A score can be stored.
        It would follow the guidelines set by scikit-learn.
        """
        tree_rules = self.extract_rules(tree)
        df = DataFrame(columns=['label', 'prob'])
        gb = self.posteriori.groupby('label')


        for key, indexes in gb.groups.items():
            apriori_prob = self.apriori[self.apriori.label == key]['freq'].values[0]
            prob = apriori_prob

            group_df, missing_prob = self.apply_smoothing(self.posteriori.ix[indexes], tree_rules)

            for rule in tree_rules:
                prob_evidence = group_df[group_df.rule == rule]['freq']
                if len(prob_evidence) == 0:
                    prob_evidence = missing_prob
                else:
                    prob_evidence = prob_evidence.values[0]
                prob *= prob_evidence
            
            post = DataFrame({'label':[key], 'prob':[prob]})
            df = df.append(post)

        df.index = np.arange(df.index.size)
        df = df.sort(columns='prob', ascending=False)
        return df.ix[df['prob'].idxmax()]
Example #14
def foreach_dataframe(self, func, force_dict=False, *args, **kwargs):
    """
        Really just does a foreach with each being dfs in a panel. 
    """
    d = {}
    for key, df in self.items():
        d[key] = func(df, *args, **kwargs)
    container = PanelDict
    for key, result in list(d.items()):
        if isinstance(result, Series):
            container = DataFrame
            break
        if isinstance(result, DataFrame):
            container = Panel
            break

    index = []
    for key, result in list(d.items()):
        if not isinstance(result, (DataFrame, Series)):
            continue
        result.name = key
        ind = result.index
        index = set(index).union(ind) 

    if force_dict:
        return PanelDict(d)

    res = DataFrame(None, index=index)
    for key, result in list(d.items()):
        res = res.join(result)

    res = res.sort()
    return res
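A hedged usage sketch for foreach_dataframe, assuming self is a PanelDict-like mapping of names to DataFrames (panel and the reduction below are hypothetical):

# Reduce each DataFrame to its column means (one Series per key); the method
# then joins the Series into a single DataFrame over the union of indexes.
means = panel.foreach_dataframe(lambda df: df.mean())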
Example #15
def analyze():
    signals = read_csv(FILE_SIGNALS)
    devices = signals["id"].unique()
    
    print("got %d signals from %d devices" % (len(signals), len(devices)))

    signals = signals.groupby(["frequency", "id"]).size()
    signals = signals.reindex(MultiIndex.from_product([SPECTRUM, devices],
                                                      names=signals.index.names),
                              fill_value=0)
    signals = signals.unstack("id")
    
    # let's only keep frequencies with all signals present
    candidates = signals.dropna()
    # suggest frequency where the weakest sensor has the most
    # received signals, and then the frequency with most total
    # received signals for all sensors
    candidates = DataFrame({"total":   candidates.sum(axis=1),
                            "weakest": candidates.min(axis=1)})
    appropriate_freq = candidates.sort(["weakest", "total"],
                                       ascending=False).index[0]
    print("suggesting frequency %s" % mhz(appropriate_freq))

    signals.to_csv("spectrum.csv")
    
    import matplotlib.pyplot as plt
    from matplotlib.ticker import EngFormatter

    p = signals.plot(kind="area")
    p.xaxis.set_major_formatter(EngFormatter(unit='Hz', places=2))
    plt.savefig(FILE_SPECTRUM, dpi=300)
    print("saved spectrum as %s" % FILE_SPECTRUM)
Example #16
 def _search_by_inchi_fuzzy(self, inchi):
     # TODO: use openbabel if available
     matches = difflib.get_close_matches(inchi, self.data_frame.InChI.dropna(), n=5, cutoff=.8)
     ranks = dict([(match, i) for i, match in enumerate(matches)])
     selection = DataFrame(self.data_frame[self.data_frame.InChI.isin(matches)])
     selection['search_rank'] = selection.name.map(ranks)
     return selection.sort('search_rank')
Example #17
def json_to_df(j_file):
    page = ''
    pic = ''
    text = ''
    point = ''
    point_list = []
    with open(j_file) as data_file:
        data = json.load(data_file)
        for i in range(0, len(data)):
            if data[i]['type'] == 'PageTurn':
                page = data[i]
            elif data[i]['type'] == 'Picture':
                pic = data[i]
            elif data[i]['type'] == 'Text':
                text = data[i]
            elif data[i]['type'] == 'SampleGaze':
                point = data[i]
                point.update(page)
                point.update(pic)
                point.update(text)
                point['type'] = u'SampleGaze'
                point_list.append(point)
            elif data[i]['type'] == 'SampleFixation':
                point = data[i]
                point.update(page)
                point.update(pic)
                point.update(text)
                point['type'] = u'SampleFixation'
                point_list.append(point)
        df = DataFrame(point_list)
        start_time = df['timestamp'].min()
        df['timestamp'] = df['timestamp'] - start_time
        df = df.sort('timestamp')
        return df
Example #18
    def __init__(self, column, baseline, adjustments=None):
        self.column = column
        self.baseline = baseline.values
        self.dates = baseline.index
        self.assets = baseline.columns

        if adjustments is None:
            adjustments = DataFrame(index=DatetimeIndex([]), columns=ADJUSTMENT_COLUMNS)
        else:
            # Ensure that columns are in the correct order.
            adjustments = adjustments.reindex_axis(ADJUSTMENT_COLUMNS, axis=1)
            adjustments.sort(["apply_date", "sid"], inplace=True)

        self.adjustments = adjustments
        self.adjustment_apply_dates = DatetimeIndex(adjustments.apply_date)
        self.adjustment_end_dates = DatetimeIndex(adjustments.end_date)
        self.adjustment_sids = Int64Index(adjustments.sid)
Example #19
 def merge_variables(self, to_merge):
     """Merges time series variables into new time series variables.
     :param to_merge: dictionary mapping new variable name to list of variables to be merged.
     :return:
     """
     dold = self._data.copy()
     s = Series(data=np.zeros((dold.shape[0],)), index=dold.index).replace(0, np.nan)
     dnew = DataFrame(dict([ (k, s) for k in to_merge.keys() if len(set(to_merge[k]).intersection(dold.columns))>0 ]))
     for newvar in dnew.columns:
         for oldvar in to_merge[newvar]:
             if oldvar in dold.columns:
                 dnew[newvar][dold[oldvar].notnull()] = dold[oldvar][dold[oldvar].notnull()]
                 del dold[oldvar]
     dnew = dnew.join(dold, how='outer')
     dnew.sort(axis=1, inplace=True)
     dnew.sort(axis=0, inplace=True)
     self._data = dnew
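A sketch of the to_merge mapping the docstring describes (the variable names here are hypothetical):

# Fold two legacy column names into a single new 'temperature' variable;
# obj is whatever object holds the time series in self._data.
to_merge = {'temperature': ['temp_c', 'temp_celsius']}
obj.merge_variables(to_merge)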
Example #20
def estrutura_contagem(contagem_votos, contagem_votacoes):
    resultado = DataFrame({"bancada": list(contagem_votos.keys()),
                           "votos_com_governo": list(contagem_votos.values()),
                           "votacoes": list(contagem_votacoes.values())})
    # compute party loyalty (pro-government votes divided by the number of valid roll calls)
    resultado["fidelidade"] = 100*resultado["votos_com_governo"]/resultado["votacoes"]
    # drop the government itself from the dataframe
    resultado = resultado[resultado.bancada != "GOV."]
    resultado = resultado.sort("fidelidade", ascending=False)
    return resultado
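A toy invocation with invented vote counts:

contagem_votos = {"PT": 80, "PSDB": 30, "GOV.": 10}       # pro-government votes
contagem_votacoes = {"PT": 100, "PSDB": 100, "GOV.": 10}  # valid roll calls
print(estrutura_contagem(contagem_votos, contagem_votacoes))
# PT comes out at 80% loyalty, PSDB at 30%; the GOV. row itself is dropped.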
Example #21
 def send_to_db(self):
     conn = sqlite3.connect('data2.sqlite', timeout=30)
     c = conn.cursor()
     df = DataFrame(self.__dict__.items(), index=self.__dict__.keys())
     df = df.drop(0,1)
     df = df.transpose()
     df = df.sort(axis=1)
     df.to_sql('earnings_calendar', conn, if_exists='append', index=False)
Example #22
    def test_sort_values(self):
        # API for 9816

        # sort_index
        frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
                          columns=['A', 'B', 'C', 'D'])

        # 9816 deprecated
        with tm.assert_produces_warning(FutureWarning):
            frame.sort(columns='A')
        with tm.assert_produces_warning(FutureWarning):
            frame.sort()

        unordered = frame.ix[[3, 2, 4, 1]]
        expected = unordered.sort_index()

        result = unordered.sort_index(axis=0)
        assert_frame_equal(result, expected)

        unordered = frame.ix[:, [2, 1, 3, 0]]
        expected = unordered.sort_index(axis=1)

        result = unordered.sort_index(axis=1)
        assert_frame_equal(result, expected)

        # sortlevel
        mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
        df = DataFrame([[1, 2], [3, 4]], mi)

        result = df.sort_index(level='A', sort_remaining=False)
        expected = df.sortlevel('A', sort_remaining=False)
        assert_frame_equal(result, expected)

        df = df.T
        result = df.sort_index(level='A', axis=1, sort_remaining=False)
        expected = df.sortlevel('A', axis=1, sort_remaining=False)
        assert_frame_equal(result, expected)

        # MI sort, but no by
        mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
        df = DataFrame([[1, 2], [3, 4]], mi)
        result = df.sort_index(sort_remaining=False)
        expected = df.sort_index()
        assert_frame_equal(result, expected)
Example #23
def set2df(sets, column_names, index=None, sort=True):
    df = DataFrame(list(sets), columns=column_names, index=index)
    if sort:
        df = df.sort(column_names)
        if index:
            df.index = index
        else:
            df.index = range(len(df))
    return df
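A small usage example for set2df (data is illustrative):

rows = {(2, 'b'), (3, 'c'), (1, 'a')}
df = set2df(rows, ['num', 'letter'])  # rows sorted by 'num', then 'letter'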
Example #24
def compile_predictions(pred):
	'''
	groups predictions made on patches of an image into a set of labels and confidences

	Args:
		pred (array-like):
			output from call to [some sklearn model].predict

	Returns:
		DataFrame: compiled predictions
	'''
	data = DataFrame()
	data['yhat'] = pred
	data['confidence'] = 1.0
	data = data.groupby('yhat').agg(lambda x: x.sum() / data.shape[0])
	data.sort('confidence', ascending=False, inplace=True)
	data['label'] = data.index
	data.reset_index(drop=True, inplace=True)
	return data
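A toy run of compile_predictions with made-up patch labels:

# Three of the four patches vote 'cat', so the compiled frame puts the
# label 'cat' first with confidence 0.75, then 'dog' with 0.25.
pred = ['cat', 'cat', 'dog', 'cat']
print(compile_predictions(pred))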
Example #25
class WaferRun:

    def __init__(self, run_id, wafer_id, label, measurements):
        self.run_id = int(run_id)
        self.wafer_id = int(wafer_id)
        self.label = int(label)
        self.measurements = DataFrame(measurements)
        self.measurements.sort(axis=1, inplace=True)
        self.measurements.sort_index(inplace=True)
    
    @staticmethod
    def from_files(path, run_id, wafer_id):
        fn_base = os.path.join(path, '{0}_{1:02}'.format(run_id, wafer_id))
        
        try:
            df = DataFrame({11: DataFrame.from_csv(fn_base + '.11', header=None, sep='\t', index_col=None, parse_dates=False)[1],
                            12: DataFrame.from_csv(fn_base + '.12', header=None, sep='\t', index_col=None, parse_dates=False)[1],
                            15: DataFrame.from_csv(fn_base + '.15', header=None, sep='\t', index_col=None, parse_dates=False)[1],
                            6: DataFrame.from_csv(fn_base + '.6', header=None, sep='\t', index_col=None, parse_dates=False)[1],
                            7: DataFrame.from_csv(fn_base + '.7', header=None, sep='\t', index_col=None, parse_dates=False)[1],
                            8: DataFrame.from_csv(fn_base + '.8', header=None, sep='\t', index_col=None, parse_dates=False)[1]})
        except Exception:
            return None
        
        m = re.search('/(normal|abnormal)', path)
        if m is None:
            return None
    
        label = 1 if m.group(1) == 'abnormal' else -1
        
        return WaferRun(run_id, wafer_id, label, df)
    
    def as_nparray(self):
        """Spits out data as a T x D numpy.array (T=# samples, D=# variables)

        Notes:
        Notice what we do here: we start with a pandas.DataFrame where each channel
        is a column (so you can think of it as a T x D matrix). We first rename the
        columns to channel numbers, then sort the columns, then sort the index, then
        transform to numpy.array.
        """
        return self.measurements.sort(axis=1).sort_index().reset_index().as_matrix().astype(float)
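On pandas versions where .sort has been removed, the chain in the docstring would read roughly as follows, as a sketch assuming pandas >= 0.17 (m stands in for self.measurements):

# sort columns by label, then rows by index, then drop to a float ndarray;
# .values replaces the older .as_matrix()
m.sort_index(axis=1).sort_index().reset_index().values.astype(float)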
Example #26
def ip_requests(log_list, count_dict):
    '''
    Count the requests per client IP and keep the ten busiest, using pandas.
    '''
    df = DataFrame(log_list)
    ips = df.groupby('clientip').size()
    ips.sort()
    ips_fd = DataFrame({'Number of requests':ips[-10:]})
    ips_fd = ips_fd.sort(columns='Number of requests',  ascending=False)
    count_dict['ips_fd'] = ips_fd
    return count_dict
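On newer pandas, the sort-ascending, slice-the-tail, re-sort-descending dance above collapses into Series.nlargest; a sketch, assuming pandas >= 0.14:

ips = df.groupby('clientip').size()
ips_fd = DataFrame({'Number of requests': ips.nlargest(10)})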
Example #27
def word_freq(file_name, suffix='_wordfreq', sep='\t', threshold=.5):
	print("start word_freq")
	# start = datetime.datetime.now()
	# print start
	reviews = pd.read_csv(file_name, error_bad_lines=False, sep=sep)
	cb = reviews['stopword_body']
	rate = reviews['Rating']
	# label all words with the rating
	cb_temp = []
	for i, c in enumerate(cb):
		cb_temp.append([(w, rate[i]) for w in ast.literal_eval(c)])
	reviews['stopword_body'] = cb_temp
	# calculate_time(start)
	# get the corpus of all reviews, lists of all words with label
	'''--------------------------------------------------------'''
	cop_wl = []
	for b in cb_temp:
		# change the unicode data to the raw string
		# cop_wl += [(unicodedata.normalize('NFKD', w[0]).encode('utf-8','replace'), w[1]) for w in b if type(w[0])==unicode]
		cop_wl += b
	'''--------------------------------------------------------'''
	# calculate_time(start)
	# word frequency of the corpus with label
	wfq = nltk.FreqDist(cop_wl)
	# calculate_time(start)
	# get the word list of all reviews without label
	cop = [w[0] for w in cop_wl]
	cop = set(cop)
	cop_len = len(cop)
	# calculate_time(start)
	# get freq of all words in one list
	wfq_l = []
	for w in cop:
		for i in range(1, 6):
			wfq_l.append(wfq[(w, i)])

	# calculate_time(start)
	# reshape the list to a matrix
	wfq_mx = DataFrame(np.array(wfq_l).reshape((cop_len,5)), index=pd.Index(cop), columns=pd.Index([1,2,3,4,5]))
	# calculate_time(start)
	# calculate the prob of each rating
	w_s = []
	w_sum = []
	for i, r in wfq_mx.iterrows():
		word_sum = wfq_mx.ix[i].sum()
		# wfq_mx.ix[i] = wfq_mx.ix[i]/word_sum
		w_s.append(word_useful_score(list(wfq_mx.ix[i]), word_sum))
		w_sum.append(word_sum)

	wfq_mx['score'] = w_s
	wfq_mx['sum'] = w_sum
	wfq_mx = wfq_mx.sort(columns='sum').ix[-int(len(w_s) * threshold):,:]
	print(wfq_mx)
	wfq_mx.to_csv(file_name.split('.')[0] + suffix + '.' + file_name.split('.')[1], sep='\t')
Example #28
 def list_stock_info(self, stock_list):
     keys = stock_list.keys()
     data = DataFrame(self.foundmental_data, index=keys)
     data = data.sort(columns='earn_ratio', ascending=False)
     for code in data.index:
         gross_profit_rate = None  # default value
         if code in self.profit_data.index:
             gross_profit_rate = self.profit_data.ix[code]['gross_profit_rate']
         # 2. Skip stocks whose gross profit margin is too low
         if gross_profit_rate and gross_profit_rate < 15:
             continue
         self.print_stock_info(code, data.ix[code], gross_profit_rate)
Example #29
def _assert_matrix_of_thesaurus_c_is_as_expected(matrix, rows, cols):
    # rows may come in any order
    assert set(rows) == set(['g/N', 'a/N', 'd/J', 'b/V', 'a/J_b/N'])
    # columns must be in alphabetical order
    assert cols == ['a/N', 'b/V', 'd/J', 'g/N', 'x/X']
    # test the vectors for each entry
    expected_matrix = np.array([
        [0.1, 0., 0.2, 0.8, 0.],  # ab
        [0., 0.1, 0.5, 0.3, 0.],  # a
        [0.1, 0., 0.3, 0.6, 0.],  # b
        [0.5, 0.3, 0., 0.7, 0.],  # d
        [0.3, 0.6, 0.7, 0., 0.9]  # g
    ])
    # put the rows in the matrix in the order in which they are in expected_matrix
    matrix_ordered_by_rows = matrix[np.argsort(np.array(rows)), :]
    assert_array_equal(matrix_ordered_by_rows, expected_matrix)

    vec_df = DataFrame(matrix, columns=cols, index=rows)
    from pandas.util.testing import assert_frame_equal

    expected_frame = DataFrame(expected_matrix, index=['a/J_b/N', 'a/N', 'b/V', 'd/J', 'g/N'], columns=cols)
    assert_frame_equal(vec_df.sort(axis=0), expected_frame.sort(axis=0))
Example #30
    def _parse_fits(filepath):
        """Parses a GOES FITS file from
        http://umbra.nascom.nasa.gov/goes/fits/"""
        fits = pyfits.open(filepath)
        header = fits[0].header
        if len(fits) == 4:
            if is_time_in_given_format(fits[0].header['DATE-OBS'], '%d/%m/%Y'):
                start_time = datetime.datetime.strptime(fits[0].header['DATE-OBS'], '%d/%m/%Y')
            elif is_time_in_given_format(fits[0].header['DATE-OBS'], '%d/%m/%y'):
                start_time = datetime.datetime.strptime(fits[0].header['DATE-OBS'], '%d/%m/%y')
            else:
                raise ValueError("Date not recognized")
            xrsb = fits[2].data['FLUX'][0][:, 0]
            xrsa = fits[2].data['FLUX'][0][:, 1]
            seconds_from_start = fits[2].data['TIME'][0]
        elif 1 <= len(fits) <= 3:
            start_time = parse_time(header['TIMEZERO'])
            seconds_from_start = fits[0].data[0]
            xrsb = fits[0].data[1]
            xrsa = fits[0].data[2]
        else:
            raise ValueError("Don't know how to parse this file")

        times = [start_time + datetime.timedelta(seconds=int(floor(s)),
                                                 microseconds=int((s - floor(s)) * 1e6)) for s in seconds_from_start]

        # remove bad values as defined in header comments
        xrsb[xrsb == -99999] = nan
        xrsa[xrsa == -99999] = nan

        # fix byte ordering
        newxrsa = xrsa.byteswap().newbyteorder()
        newxrsb = xrsb.byteswap().newbyteorder()

        data = DataFrame({'xrsa': newxrsa, 'xrsb': newxrsb}, index=times)
        data.sort(inplace=True)
        return header, data