Example #1
0
    def test_fillna_dtype_conversion(self):
        # make sure that fillna on an empty frame works
        df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
        result = df.get_dtype_counts().sort_values()
        expected = Series({'object': 5})
        assert_series_equal(result, expected)

        # filling the all-NaN object frame with an int upcasts every
        # column to int64 (removed a dead `expected = DataFrame(1, ...)`
        # assignment that was overwritten before ever being used)
        result = df.fillna(1)
        result = result.get_dtype_counts().sort_values()
        expected = Series({'int64': 5})
        assert_series_equal(result, expected)

        # empty block
        df = DataFrame(index=lrange(3), columns=['A', 'B'], dtype='float64')
        result = df.fillna('nan')
        expected = DataFrame('nan', index=lrange(3), columns=['A', 'B'])
        assert_frame_equal(result, expected)

        # equiv of replace: fillna(v) must match replace(np.nan, v)
        df = DataFrame(dict(A=[1, np.nan], B=[1., 2.]))
        for v in ['', 1, np.nan, 1.0]:
            expected = df.replace(np.nan, v)
            result = df.fillna(v)
            assert_frame_equal(result, expected)
Example #2
0
    def test_merge_na_keys(self):
        # Outer-merging on key columns that contain NaN should align the
        # NaN rows exactly as a merge with an explicit sentinel would.
        data = [[1950, "A", 1.5],
                [1950, "B", 1.5],
                [1955, "B", 1.5],
                [1960, "B", np.nan],
                [1970, "B", 4.],
                [1950, "C", 4.],
                [1960, "C", np.nan],
                [1965, "C", 3.],
                [1970, "C", 4.]]

        frame = DataFrame(data, columns=["year", "panel", "data"])

        other_data = [[1960, 'A', np.nan],
                      [1970, 'A', np.nan],
                      [1955, 'A', np.nan],
                      [1965, 'A', np.nan],
                      [1965, 'B', np.nan],
                      [1955, 'C', np.nan]]
        other = DataFrame(other_data, columns=['year', 'panel', 'data'])

        result = frame.merge(other, how='outer')

        # Expected: replace NaN with a -999 sentinel, merge, restore NaN.
        expected = frame.fillna(-999).merge(other.fillna(-999), how='outer')
        expected = expected.replace(-999, np.nan)

        tm.assert_frame_equal(result, expected)
Example #3
0
    def test_fillna_categorical_nan(self):
        # GH 14021
        # np.nan should always be a valid filler
        cat = Categorical([np.nan, 2, np.nan])
        val = Categorical([np.nan, np.nan, np.nan])
        df = DataFrame({"cats": cat, "vals": val})
        # median of the numeric categorical is 2; the all-NaN column's
        # median is NaN, so "vals" stays all-NaN
        res = df.fillna(df.median())
        v_exp = [np.nan, np.nan, np.nan]
        df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
                           dtype='category')
        tm.assert_frame_equal(res, df_exp)

        # filling with NaN itself must be a no-op on categoricals
        result = df.cats.fillna(np.nan)
        tm.assert_series_equal(result, df.cats)
        result = df.vals.fillna(np.nan)
        tm.assert_series_equal(result, df.vals)

        # NaT is likewise a valid no-op filler for datetime-, period- and
        # timedelta-backed categoricals
        idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
                                '2011-01-01 09:00', pd.NaT, pd.NaT])
        df = DataFrame({'a': Categorical(idx)})
        tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

        idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01',
                              pd.NaT, pd.NaT], freq='M')
        df = DataFrame({'a': Categorical(idx)})
        tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

        idx = pd.TimedeltaIndex(['1 days', '2 days',
                                 '1 days', pd.NaT, pd.NaT])
        df = DataFrame({'a': Categorical(idx)})
        tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
Example #4
0
def timeseries_to_supervised(data, lag=1):
    """Frame a time series as a supervised-learning table.

    Produces one column per lag step (shift 1..lag) followed by the
    original series; leading NaNs introduced by shifting become 0.
    """
    frame = DataFrame(data)
    lagged = [frame.shift(step) for step in range(1, lag + 1)]
    lagged.append(frame)
    supervised = concat(lagged, axis=1)
    supervised.fillna(0, inplace=True)
    return supervised
Example #5
0
    def test_fillna_skip_certain_blocks(self):
        # don't try to fill boolean, int blocks

        df = DataFrame(np.random.randn(10, 4).astype(int))

        # it works!
        df.fillna(np.nan)
def getPostData(fbGraph, entry):
    global CHART_LIMIT
    retrieved = False
    i=0
    while retrieved == False:
        i += 1
        try:
            posts = fbGraph.get_object(entry['page'] + '/posts',
                                       limit=CHART_LIMIT*15)['data']
            retrieved = True
        except facebook.GraphAPIError:
            print "Failed retrieving Graph object from facebook, retrying..."
            pass
        if i > 14:
            print "Giving up"
            return None
        
    frame = DataFrame(posts)
    ##Later, maybe output this frame for further study
    
    postData = DataFrame(columns=('Date', 'Likes', 'Shares'))
    postData['Shares'] = frame['shares'].map(fmtShares)
    postData['Likes']  = frame['id'].map(fmtLikes)
    postData['Date']   = frame['created_time'].map(fmtDate)
    
    postData = postData.groupby(by='Date', sort=False).mean()
    postData = postData.head(n=CHART_LIMIT)
    postData.fillna(value=0)
    return postData
Example #7
0
    def test_operators_none_as_na(self):
        # Arithmetic on an object-dtype frame should treat None like NaN.
        df = DataFrame({"col1": [2, 5.0, 123, None],
                        "col2": [1, 2, 3, 4]}, dtype=object)

        ops = [operator.add, operator.sub, operator.mul, operator.truediv]

        # since filling converts dtypes from object, changed expected to be
        # object
        for op in ops:
            filled = df.fillna(np.nan)
            result = op(df, 3)
            expected = op(filled, 3).astype(object)
            # restore None where the op produced NaN so values line up
            expected[com.isnull(expected)] = None
            assert_frame_equal(result, expected)

            result = op(df, df)
            expected = op(filled, filled).astype(object)
            expected[com.isnull(expected)] = None
            assert_frame_equal(result, expected)

            # when only one operand has its NaNs filled, positions missing
            # in the other operand still propagate as missing
            result = op(df, df.fillna(7))
            assert_frame_equal(result, expected)

            result = op(df.fillna(7), df)
            assert_frame_equal(result, expected, check_dtype=False)
def pad_smooth(sample, window_len):
    # Extend the frame range past the sample's last frame (+40) so the
    # smoothing window has room at the end.
    nr_frames = sample.frame.max() + 40
    out = DataFrame({'frame': np.arange(nr_frames)})
    # assumes `sample` holds exactly one sample_id -- TODO confirm caller
    out['sample_id'] = sample.sample_id.unique()[0]

    out = pd.merge(out, sample, how='outer', on=['sample_id', 'frame'])
    # Fill short gaps (<=2 frames) forward first, then back-fill leading
    # rows and forward-fill whatever remains at the tail.
    out.fillna(method='ffill', inplace=True, limit=2)
    out.fillna(method='bfill', inplace=True)
    out.fillna(method='ffill', inplace=True)
    # NOTE(review): columns 0..20 presumably hold per-gesture scores;
    # `smooth` is an external helper -- confirm against its definition.
    for gesture in range(21):
        out[gesture] = smooth(out[gesture], window_len=window_len, window='hanning')
    return out
Example #9
0
class Fillna(object):
    """ASV-style benchmark: DataFrame.fillna with pad/bfill, in and out
    of place, on a 10000x100 frame with every other row set to NaN."""

    params = ([True, False], ['pad', 'bfill'])
    param_names = ['inplace', 'method']

    def setup(self, inplace, method):
        # every even-indexed row becomes all-NaN
        data = np.random.randn(10000, 100)
        data[::2] = np.nan
        self.df = DataFrame(data)

    def time_frame_fillna(self, inplace, method):
        self.df.fillna(inplace=inplace, method=method)
Example #10
0
    def test_fillna_columns(self):
        df = DataFrame(np.random.randn(10, 10))
        df.values[:, ::2] = np.nan

        # axis=1 ffill must equal padding the transpose row-wise
        result = df.fillna(method='ffill', axis=1)
        expected = df.T.fillna(method='pad').T
        assert_frame_equal(result, expected)

        # with a mixed int/float frame, axis=1 ffill should behave as if
        # the frame had been upcast to float first
        df.insert(6, 'foo', 5)
        result = df.fillna(method='ffill', axis=1)
        expected = df.astype(float).fillna(method='ffill', axis=1)
        assert_frame_equal(result, expected)
Example #11
0
def makeguesses(morceaux,
                collectionname='grenoble',
                databasename='local'):
    """For each title in `morceaux`, guess the closest stored title in the
    given MongoDB collection and return [title, guess, guessed_doc_id] rows.

    Closeness is the L1 distance between vectors from the module-level
    `titlevector` helper.
    """
    with MongoClient() as client:
        coll = client[databasename][collectionname]
        # one column per stored title, NaN-padded to a common index
        dbvecs = DataFrame({d['T']: titlevector(d['T'])
                            for d in coll.find()
                            if d['T'] is not None})
        dbvecs.fillna(0, inplace=True)
        dbvecs.sort_index(inplace=True)
        # title -> Mongo _id, for the final lookup
        dbkeys = {d['T']: d['_id']
                  for d in coll.find() if d['T'] is not None}
        # get the best fit
        # NOTE(review): this generator is consumed after the client closes;
        # it only touches the in-memory dbvecs, so that is safe here.
        guesses = (np.abs(dbvecs.subtract(titlevector(m), axis=0)).sum(axis=0).idxmin() for m in morceaux)
    return [[m, g, dbkeys[g]] for m, g in zip(morceaux, guesses)]
Example #12
0
def twitter_daily_aggregate(retrievaldate):
	"""Aggregate daily Twitter mention and follower counts from the day
	before `retrievaldate` up to now, write them to twitter_trend.csv and
	return the combined frame.

	NOTE(review): `mentions` and `users` are module-level globals; the
	twitter_count/twitter_follower helpers are defined elsewhere.
	"""
	#Date Retrieval
	d=[]
	dt = parser.parse(retrievaldate) + timedelta(days=-1)
	d.append(dt)
	d.append(d[-1] + timedelta(days=1))

	#DataFrame Init
	ctrend = DataFrame()
	while d[-1] < datetime.utcnow():
		print 'processing ', d[-1], ' ..........'
		#Daily Mention Count
		mnts = twitter_count(d, mentions)

		#User Follower Count
		usrs =  twitter_follower(d,users)
		#Join
		trend = mnts.join(usrs)
		trend['Date'] = Period(d[-1],'D')
		#Append to DataFrame
		ctrend = concat([ctrend,trend])
		#Extend Dates
		d.append(d[-1] + timedelta(days=1))
	#Join DataFrames and Fill NAs
	ctrend =  ctrend.fillna(0)
	#Save
	print 'printing the file'
	ctrend.to_csv('twitter_trend.csv')
	return ctrend
Example #13
0
    def get_date_trend(self, mode_date):
        """
        Aggregate per-record values into a date-bucketed series for charting.

        :param mode_date: date granularity to merge to (the shortest time
            unit). 0-day, 1-week, 2-month, 3-Quarter. (default 2)
        """
        axisLabels = self.oriDate[:]
        # one {value: 1} counter per original record
        pointVals = [{copy.deepcopy(oriValue): 1} for oriValue in self.oriValues]

        rule_mode = {'0': 'D', '1': 'W', '2': 'M', '3': 'Q'}

        df = DataFrame(pointVals, index=axisLabels)
        # NOTE(review): resample(..., how='sum') is the legacy pandas API;
        # modern pandas requires .resample(...).sum() -- confirm the pin.
        df = df.resample(rule_mode[str(mode_date)], how='sum')
        df = df.fillna(0)

        """各项总和"""
        # cols_name = []
        # for name, col in df.iteritems():
        #     cols_name.append(name)
        # df['SUM'] = 0
        # for i in xrange(len(cols_name)):
        #     df['SUM'] += df[cols_name[i]]

        """宿舍比重"""
        # df['PER_DORM'] = df['dorm']/df['SUM'] if 'dorm' in df else 0  # only compute the dorm share when a dorm column exists, else 0

        axisLabels = map(lambda x: x.strftime('%Y-%m-%d'), df.index.tolist())  # pull the index dates back out as 'YYYY-MM-DD' labels
        seriesData = []
        legendLabels = []
        for colName, col in df.iteritems():
            legendLabels.append(colName)
            # NaN cells become 0.0 so the JSON payload stays numeric
            data = map(lambda x: 0.0 if isnan(x) else float(x), col.tolist())
            seriesData.append({'name': colName, 'data': data})

        json_dateTrend = {'axisLabels': axisLabels, 'legendLabels': legendLabels, 'seriesData': seriesData}
        return json_dateTrend
Example #14
0
def pandas_fillna(df: pd.DataFrame,
                  value: float = None,
                  method: str = None,
                  limit: int = None,
                  **kwargs) -> pd.DataFrame:
    """
    Return a new dataframe with NaN values filled according to the given value
    or method.

    This is a wrapper for the ``pandas.fillna()`` function For additional
    keyword arguments and information refer to pandas documentation at
    http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html

    :param df: The dataframe to fill
    :param value: Value to fill
    :param method: Method according to which to fill NaN. ffill/pad will
           propagate the last valid observation to the next valid observation.
           backfill/bfill will propagate the next valid observation back to the last
           valid observation.
    :param limit: Maximum number of NaN values to forward/backward fill.
    :return: A dataframe with nan values filled with the given value or according to the given method.
    """
    # The following code is needed, because Pandas treats any kw given in
    # kwargs as being set, even if just None -- so only forward arguments
    # the caller actually supplied.
    # BUG FIX: compare against None explicitly; plain truthiness silently
    # dropped legitimate falsy arguments such as value=0 or value=0.0.
    kwargs = dict(kwargs)
    if value is not None:
        kwargs.update(value=value)
    if method is not None:
        kwargs.update(method=method)
    if limit is not None:
        kwargs.update(limit=limit)

    return df.fillna(**kwargs)
Example #15
0
def calc_distance_matrix(G, max_distance=None):
    """Returns a matrix containing the shortest distance
    between all nodes in a network

    Parameters
    ----------
    G : graph
       A NetworkX graph

    max_distance : float or None, optional (default='None')
       The maximum possible distance value in the network.
       If None, max_distance is the longest shortest path between
       two nodes of the network (the graph eccentricity)

    Returns
    -------
    dist_matrix : NumPy array
      An NxN numpy array.

    Notes
    -----
    Along the diagonal, the values are all 0.
    Unconnected nodes have a distance of max_distance to other nodes.
    """

    # Network (collaborator) Distance
    # NOTE(review): networkx >= 2.0 returns an iterator of (node, dists)
    # pairs here, not a dict-of-dicts; this code assumes the pre-2.0 API
    # -- confirm the pinned networkx version.
    dist_matrix = nx.all_pairs_shortest_path_length(G)
    # unreachable pairs come out as NaN in the frame
    dist_matrix = DataFrame(dist_matrix, index=G.nodes(), columns=G.nodes())
    if max_distance is None:
        max_distance = float(dist_matrix.max().max())
    dist_matrix = dist_matrix.fillna(max_distance)
    # The unconnected ones are infinitely far from the rest
    diag_idx = np.diag_indices(len(dist_matrix), ndim=2)
    dist_matrix.values[diag_idx] = 0
    return dist_matrix
Example #16
0
def main():
    """
    Handling of not applicable values
    """

    string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
    print string_data
    print string_data.isnull()
    # assigning None also registers as missing
    string_data[0] = None
    print string_data.isnull()
    print None is np.nan, None == np.nan # not same

    # Exclude N/A
    print '',''
    NA = np.nan
    data = Series([1, NA, 3.5, NA, 7])
    print data.dropna()
    print data[data.notnull()]  # boolean-mask equivalent of dropna

    data = DataFrame([
        [1., 6.5, 3.],
        [1., NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.]
    ])
    cleaned = data.dropna() # row that all value is not NA
    print data
    print cleaned
    print data.dropna(how='all')  # drop only rows that are entirely NA
    data[4] = None  # add an all-NA column
    print data.dropna(axis=1, how='all')  # drop all-NA columns
    print data.dropna(thresh=2) # non NA is more 2

    # Fill NA
    print '',''
    print data.fillna(0)
    print data.fillna({1: 0.5, 2: -1})  # per-column fill values
    _ = data.fillna(0, inplace=True)  # inplace fill returns None
    print data
    print '',''
    df = DataFrame(np.arange(18).reshape((6, 3)))
    df.ix[2:, 1] = NA; df.ix[4:, 2] = NA  # NOTE(review): .ix was removed in pandas 1.0
    print df
    print df.fillna(method='ffill')
    print df.fillna(method='ffill', limit=2)
    data = Series([1., NA, 3.5, NA, 7])
    print data.fillna(data.mean())  # fill gaps with the series mean
Example #17
0
    def parallel_cumulative_blame(self, branch='master', limit=None, skip=None, num_datapoints=None, committer=True,
                                  workers=1, ignore_globs=None, include_globs=None):
        """
        Returns the blame at every revision of interest. Index is a datetime, column per committer, with number of lines
        blamed to each committer at each timestamp as data.

        :param branch: (optional, default 'master') the branch to work in
        :param limit: (optional, default None), the maximum number of revisions to return, None for no limit
        :param skip: (optional, default None), the number of revisions to skip. Ex: skip=2 returns every other revision, None for no skipping.
        :param num_datapoints: (optional, default=None) if limit and skip are none, and this isn't, then num_datapoints evenly spaced revs will be used
        :param committer: (optional, default=True) true if committer should be reported, false if author
        :param ignore_globs: (optional, default=None) a list of globs to ignore, default none excludes nothing
        :param include_globs: (optional, default=None) a list of globs to include, default of None includes everything.
        :param workers: (optional, default=1) integer, the number of workers to use in the threadpool, -1 for one per core.
        :return: DataFrame

        """

        if not _has_joblib:
            raise ImportError('''Must have joblib installed to use parallel_cumulative_blame(), please use
            cumulative_blame() instead.''')

        revs = self.revs(branch=branch, limit=limit, skip=skip, num_datapoints=num_datapoints)

        if self.verbose:
            print('Beginning processing for cumulative blame:')

        # round-trip through JSON to get one plain dict per revision row
        revisions = json.loads(revs.to_json(orient='index'))
        revisions = [revisions[key] for key in revisions]

        # fan the per-revision blame computation out over a thread pool
        ds = Parallel(n_jobs=workers, backend='threading', verbose=5)(
            delayed(_parallel_cumulative_blame_func)
            (self, x, committer, ignore_globs, include_globs) for x in revisions
        )

        revs = DataFrame(ds)
        del revs['rev']

        revs['date'] = to_datetime(revs['date'].map(datetime.datetime.fromtimestamp))
        revs.set_index(keys=['date'], drop=True, inplace=True)
        # committers absent at a revision show up as NaN -> zero lines blamed
        revs = revs.fillna(0.0)

        # drop 0 cols
        for col in revs.columns.values:
            if col != 'col':
                if revs[col].sum() == 0:
                    del revs[col]

        # drop 0 rows
        keep_idx = []
        committers = [x for x in revs.columns.values if x != 'date']
        for idx, row in revs.iterrows():
            if sum([row[x] for x in committers]) > 0:
                keep_idx.append(idx)

        # NOTE(review): .ix was removed in pandas 1.0; .loc is the replacement
        revs = revs.ix[keep_idx]
        revs.sort_index(ascending=False, inplace=True)

        return revs
def prepare_pop_data(population_data: pd.DataFrame, cols=None) -> pd.DataFrame:
    """Fill missing counts with 0, cast the count columns to int and derive
    a 'lutheran' column as total population minus the other denominations.

    :param population_data: raw census frame
    :param cols: columns to cast to int; falsy -> the default census columns
    :return: a new frame (the input is not modified)
    """
    pop_data = population_data.fillna(value=0)
    if not cols:
        cols = ['plot.number', 'total.men', 'total.women',
                'orthodox', 'other.christian', 'other.religion']
    pop_data.loc[:, cols] = pop_data.loc[:, cols].astype(int)
    total = pop_data['total.men'] + pop_data['total.women']
    non_lutheran = (pop_data['orthodox']
                    + pop_data['other.christian']
                    + pop_data['other.religion'])
    pop_data['lutheran'] = total - non_lutheran
    return pop_data
Example #19
0
    def test_fill_value_when_combine_const(self):
        # GH12723
        # add(2, fill_value=0) must equal filling NaN with 0 first, then
        # adding 2
        dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float')
        df = DataFrame({'foo': dat}, index=range(6))

        exp = df.fillna(0).add(2)
        res = df.add(2, fill_value=0)
        assert_frame_equal(res, exp)
Example #20
0
    def test_fillna_nat(self):
        # NaT in a datetime64[us] series can be filled both by padding and
        # by an explicit value, at Series and DataFrame level alike.
        series = Series([0, 1, 2, NaT], dtype="M8[us]")

        filled = series.fillna(method="pad")
        filled2 = series.fillna(value=series[2])

        # both fills should put the third timestamp into the NaT slot
        expected = series.copy()
        expected[3] = expected[2]

        assert_series_equal(filled, expected)
        assert_series_equal(filled2, expected)

        df = DataFrame({"A": series})
        filled = df.fillna(method="pad")
        filled2 = df.fillna(value=series[2])
        expected = DataFrame({"A": expected})
        assert_frame_equal(filled, expected)
        assert_frame_equal(filled2, expected)
Example #21
0
    def test_fillna_nat(self):
        # Same scenario as the M8[us] variant, but at nanosecond resolution
        # and indexing through the underlying .values array.
        series = Series([0, 1, 2, NaT], dtype='M8[ns]')

        filled = series.fillna(method='pad')
        filled2 = series.fillna(value=series.values[2])

        # both fills should put the third timestamp into the NaT slot
        expected = series.copy()
        expected.values[3] = expected.values[2]

        assert_series_equal(filled, expected)
        assert_series_equal(filled2, expected)

        df = DataFrame({'A': series})
        filled = df.fillna(method='pad')
        filled2 = df.fillna(value=series.values[2])
        expected = DataFrame({'A': expected})
        assert_frame_equal(filled, expected)
        assert_frame_equal(filled2, expected)
Example #22
0
 def calculate_ground_truth(self):
     """Build the appliance status table from self.events_consuming:
     unknown cells become -1, then each column inherits the previous
     row's state so gaps between events carry the last known status.
     Stores the result on self.appliances_status.
     """
     gt = DataFrame(copy.deepcopy(self.events_consuming))
     gt = gt.fillna(-1)
     for x in gt.columns:
         l = len(gt[x])
         # forward-propagate the last known state over -1 placeholders
         # NOTE(review): genuine -1 values in the source data would also
         # be overwritten, and gt[x][y] = ... is chained assignment
         # (SettingWithCopy risk in newer pandas) -- confirm intent.
         for y in range(1,l):
             if (gt[x][y] == -1):
                 gt[x][y] = gt[x][y-1]
     self.appliances_status = gt
Example #23
0
    def fillna_dict(cls, prop):
        """
        Convert `prop` into records, treating '', 'DEBIT' and 'CREDIT' as
        missing and forward-filling every gap from the row above.
        """
        frame = (DataFrame(prop)
                 .replace(['', 'DEBIT', 'CREDIT'], numpy.nan)
                 .fillna(method='ffill'))

        return [row.to_dict() for _, row in frame.iterrows()]
Example #24
0
def seriesData_fitTimeFrame(dfList, colName, ts):
    """Align column `colName` of every frame in `dfList` onto the index
    `ts` and return one numpy array per frame.

    Gap handling depends on the column: identifier-like columns are
    back-filled then padded with -99, price/position columns are
    forward-filled then padded with 0, and realized-PNL columns are
    simply zero-filled.
    """
    bfill_cols = {'TradeID', 'Symbol', 'Year', 'Buy', 'Sell',
                  'Strat', 'StartTrading', 'EndTrading'}
    ffill_cols = {'Close', 'IsPeriod', 'PosSize', 'PosDir',
                  'DynamicPNL', 'DynamicDollarPNL'}
    zero_cols = {'RealizedPNL', 'RealizedDollarPNL', 'DailyPNLChange'}

    dataList = []
    for frame in dfList:
        tf = DataFrame(index=ts)
        tf[colName] = frame[colName]
        if colName in bfill_cols:
            tf = tf.fillna(method='bfill')
            tf = tf.fillna(-99)
        elif colName in ffill_cols:
            tf = tf.fillna(method='ffill')
            tf = tf.fillna(0)
        elif colName in zero_cols:
            tf = tf.fillna(0)
        dataList.append(np.asarray(tf[colName]))
    return dataList
Example #25
0
    def test_replace_datetimetz(self):

        # GH 11326
        # behaving poorly when presented with a datetime64[ns, tz]
        df = DataFrame({'A': date_range('20130101', periods=3,
                                        tz='US/Eastern'),
                        'B': [0, np.nan, 2]})
        # replacing NaN in the float column must leave the tz column intact
        result = df.replace(np.nan, 1)
        expected = DataFrame({'A': date_range('20130101', periods=3,
                                              tz='US/Eastern'),
                              'B': Series([0, 1, 2], dtype='float64')})
        assert_frame_equal(result, expected)

        # fillna(1) should behave exactly like replace(np.nan, 1) here
        result = df.fillna(1)
        assert_frame_equal(result, expected)

        result = df.replace(0, np.nan)
        expected = DataFrame({'A': date_range('20130101', periods=3,
                                              tz='US/Eastern'),
                              'B': [np.nan, np.nan, 2]})
        assert_frame_equal(result, expected)

        # same-tz timestamp replacement keeps the datetime64[ns, tz] dtype
        result = df.replace(Timestamp('20130102', tz='US/Eastern'),
                            Timestamp('20130104', tz='US/Eastern'))
        expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                    Timestamp('20130104', tz='US/Eastern'),
                                    Timestamp('20130103', tz='US/Eastern')],
                              'B': [0, np.nan, 2]})
        assert_frame_equal(result, expected)

        # replacing NaT via a per-column dict gives the same result
        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace(
            {'A': pd.NaT}, Timestamp('20130104', tz='US/Eastern'))
        assert_frame_equal(result, expected)

        # coerce to object
        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace(
            {'A': pd.NaT}, Timestamp('20130104', tz='US/Pacific'))
        # mixed timezones cannot stay a single tz-aware dtype
        expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                    Timestamp('20130104', tz='US/Pacific'),
                                    Timestamp('20130103', tz='US/Eastern')],
                              'B': [0, np.nan, 2]})
        assert_frame_equal(result, expected)

        # a naive replacement timestamp likewise coerces to object
        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace({'A': np.nan}, Timestamp('20130104'))
        expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                    Timestamp('20130104'),
                                    Timestamp('20130103', tz='US/Eastern')],
                              'B': [0, np.nan, 2]})
        assert_frame_equal(result, expected)
Example #26
0
    def test_na_actions_categorical(self):
        # fillna/dropna on frames holding Categorical columns
        cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
        vals = ["a", "b", np.nan, "d"]
        df = DataFrame({"cats": cat, "vals": vals})
        cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
        vals2 = ["a", "b", "b", "d"]
        df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
        cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
        vals3 = ["a", "b", np.nan]
        df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
        cat4 = Categorical([1, 2], categories=[1, 2, 3])
        vals4 = ["a", "b"]
        df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})

        # fillna
        res = df.fillna(value={"cats": 3, "vals": "b"})
        tm.assert_frame_equal(res, df_exp_fill)

        # a fill value outside the declared categories must raise
        with pytest.raises(ValueError, match=("fill value must "
                                              "be in categories")):
            df.fillna(value={"cats": 4, "vals": "c"})

        # padding fills from the previous row, same expected frame
        res = df.fillna(method='pad')
        tm.assert_frame_equal(res, df_exp_fill)

        # dropna
        res = df.dropna(subset=["cats"])
        tm.assert_frame_equal(res, df_exp_drop_cats)

        res = df.dropna()
        tm.assert_frame_equal(res, df_exp_drop_all)

        # make sure that fillna takes missing values into account
        c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
        df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})

        cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
        df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})

        res = df.fillna("a")
        tm.assert_frame_equal(res, df_exp)
Example #27
0
File: text.py Project: Afey/ramp
 def _apply(self, data, fitted_feature):
     """Turn `data` (presumably a column of token lists -- doc2bow implies
     bag-of-words input) into a TF-IDF DataFrame: one column per dictionary
     term, absent terms filled with 0.
     """
     docs = list(data)
     dct = self.dictionary.get_dict(self.context, docs)
     tfidf = self.dictionary.get_tfidf(self.context, docs)
     docs = [dct.doc2bow(d) for d in docs]
     vecs = tfidf[docs]
     # each sparse row becomes {term_id: weight}; missing terms -> NaN
     df = DataFrame([dict(row) for row in vecs], index=data.index)
     df.columns = ['%s_%s' % (dct[i], data.name) for i in df.columns]
     df = df.fillna(0)
     logging.debug(df)
     return df
Example #28
0
 def _create(self, data):
     """Build a document-term count matrix from `data`; when self.bool_ is
     set, binarize it to presence/absence indicators.
     """
     dct = self.get_prep_data(data)
     data = get_single_column(data)
     docs = [dct.doc2bow(d) for d in data]
     ids = dct.keys()
     # one column per dictionary term id; absent terms come out as NaN
     df = DataFrame([dict(row) for row in docs], columns=ids, index=data.index)
     df.columns = ["%s_%s" % (dct[i], data.name) for i in ids]
     df = df.fillna(0)
     if self.bool_:
         # presence/absence instead of raw counts
         df = df.astype(bool).astype(int)
     return df
Example #29
0
def merge_dataset(files):
    """Read every per-label CSV in `files` (from the module-level `textdir`),
    concatenate them, and fill each file's NaN cells with the numeric label
    parsed from its filename (e.g. '1.csv' -> 1.0).

    :param files: iterable of CSV filenames ('.DS_Store' entries are skipped)
    :return: the combined frame with a fixed column order
    """
    test = DataFrame(columns=['file'])
    for csvfile in files:
        if csvfile != '.DS_Store':  # skip macOS folder metadata
            dataframe = pd.read_csv(textdir + csvfile)
            test = pd.concat([test, dataframe], ignore_index=True)
            # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in
            # 1.24; the builtin float is the documented replacement.
            test = test.fillna(float(re.match(r'(.+)\.csv', csvfile).group(1)))
    test = test[
        ['file', 'position', 'number_of_stopwords', 'length', 'highlight_marker', 'sum_of_word_weigth', 'scoreline_s1',
         'scoreline_s2', 'scoreline_s3', 'specific_timestamp', 'similarity1', 'similarity2', 'f_score']]
    return test
Example #30
0
 def _create(self, data):
     """Build a TF-IDF feature matrix from `data` (presumably a column of
     token lists) and debug-print the resulting frame before returning it.
     """
     docs = list(data)
     dct = self.dictionary.get_dict(self.context, docs)
     tfidf = self.dictionary.get_tfidf(self.context, docs)
     docs = [dct.doc2bow(d) for d in docs]
     vecs = tfidf[docs]
     # each sparse row becomes {term_id: weight}; missing terms -> NaN
     df = DataFrame([dict(row) for row in vecs], index=data.index)
     df.columns = ['%s_%s' % (dct[i], data.name) for i in df.columns]
     df = df.fillna(0)
     print df
     return df