Example #1
    def test_basic(self):

        # array or list of dates
        N = 50
        rng = date_range('1/1/1990', periods=N, freq='53s')
        ts = Series(np.random.randn(N), index=rng)
        ts[15:30] = np.nan
        dates = date_range('1/1/1990', periods=N * 3, freq='25s')

        result = ts.asof(dates)
        assert notna(result).all()
        lb = ts.index[14]
        ub = ts.index[30]

        result = ts.asof(list(dates))
        assert notna(result).all()
        lb = ts.index[14]
        ub = ts.index[30]

        mask = (result.index >= lb) & (result.index < ub)
        rs = result[mask]
        assert (rs == ts[lb]).all()

        val = result[result.index[result.index >= ub][0]]
        assert ts[ub] == val
Example #2
def map_state(company_list):
    states = geo_list[(geo_list['Level ID'] == 0) | (geo_list['Level ID'] == 1)]
    cities = geo_list[(geo_list['Level ID'] == 0) | (geo_list['Level ID'] == 2)]
    company_list['State_Abbr'] = None
    for index, company in company_list.iterrows():
        # Has state
        if pd.notna(company['State']):
            if not states[states['Name'] == company['State']].empty:
                company_list.loc[index, 'State_Abbr'] = states.loc[states['Name'] == company['State'], 'PingYin2'].values[0].upper()
            elif not states[states['Full Name'] == company['State']].empty:
                company_list.loc[index, 'State_Abbr'] = states.loc[states['Full Name'] == company['State'], 'PingYin2'].values[0].upper()
                company_list.loc[index, 'State'] = states.loc[states['Full Name'] == company['State'], 'Name'].values[0]
        # Only has city
        elif pd.notna(company['City']):
            if not cities[cities['Name'] == company['City']].empty:
                if (cities.loc[cities['Name'] == company['City'], 'Level ID'] == 0).any():
                    company_list.loc[index, 'State_Abbr'] = cities.loc[cities['Name'] == company['City'], 'PingYin2'].values[0].upper()
                    company_list.loc[index, 'State'] = cities.loc[cities['Name'] == company['City'], 'Name'].values[0]
                else:
                    city_pid = cities.loc[cities['Name'] == company['City'], 'PID'].values[0]
                    if not states[states['ID'] == city_pid].empty:
                        company_list.loc[index, 'State_Abbr'] = states.loc[states['ID'] == city_pid, 'PingYin2'].values[0].upper()
                        company_list.loc[index, 'State'] = states.loc[states['ID'] == city_pid, 'Name'].values[0]
            elif not cities[cities['Full Name'] == company['City']].empty:
                if (cities.loc[cities['Full Name'] == company['City'], 'Level ID'] == 0).any():
                    company_list.loc[index, 'State_Abbr'] = cities.loc[cities['Full Name'] == company['City'], 'PingYin2'].values[0].upper()
                    company_list.loc[index, 'State'] = cities.loc[cities['Full Name'] == company['City'], 'Name'].values[0]
                else:
                    city_pid = cities.loc[cities['Full Name'] == company['City'], 'PID'].values[0]
                    if not states[states['ID'] == city_pid].empty:
                        company_list.loc[index, 'State_Abbr'] = states.loc[states['ID'] == city_pid, 'PingYin2'].values[0].upper()
                        company_list.loc[index, 'State'] = states.loc[states['ID'] == city_pid, 'Name'].values[0]

    return company_list
Example #3
def convert_bitwarden(bitwarden_df):
    '''
    # Lastpass
    Documentation on expected CSV format: https://helpdesk.lastpass.com/importing-from-other-password-managers/

    ## Select notes from lastpass docs
    Lastpass valid columns: url, username, password, extra, name, grouping, type, hostname
    To import Secure Note data, enter the values as follows: “url” = http://sn, “extra” = the contents of the note. Give the note a “name”, and then consider adding “group”. It is important to leave the username and password columns blank.
    ----
    # Bitwarden
    Bitwarden exported columns: folder,favorite,type,name,notes,fields,login_uri,login_username,login_password,login_totp
    '''
    lp_sn_url_format = "http://sn"

    # Other columns aren't used in lastpass
    for col in ['favorite', 'login_totp', 'type']:
        del bitwarden_df[col]

    rename_dict = {
        'login_uri' : 'url',
        'login_username' : 'username',
        'login_password' : 'password',
        'notes' : 'extra',
        'folder' : 'grouping',
    }
    bitwarden_df.rename(columns=rename_dict, inplace=True)

    has_fields_df = bitwarden_df[pd.notna(bitwarden_df['fields'])]
    print("{} entries have a non-empty 'fields' entry".format(len(has_fields_df)))
    assert len(has_fields_df[has_fields_df['url'] == lp_sn_url_format]) == 0 # ensure none of these are lp formatted
    assert len(has_fields_df[pd.notna(has_fields_df['extra'])]) == 0 # ensure none of these have 'extra' entries as well
    bitwarden_df.loc[has_fields_df.index,'extra'] = has_fields_df['fields']
    del bitwarden_df['fields']

    has_extra_df = bitwarden_df[pd.notna(bitwarden_df['extra'])]
    print("{} entries have notes".format(len(has_extra_df)))
    lp_secure_notes_df = has_extra_df[has_extra_df['url'] == lp_sn_url_format]
    print("\t{} entries are lastpass formatted secure notes".format(len(lp_secure_notes_df)))
    assert lp_secure_notes_df[['username', 'password']].isnull().all().all() # make sure all usernames/passwords are empty

    bw_extra_df = has_extra_df[has_extra_df['url'] != lp_sn_url_format]
    bw_secure_notes_df = bw_extra_df[(bw_extra_df['url'].isnull()) |
                                     ((bw_extra_df['password'].isnull()) & (bw_extra_df['username'].isnull()))]
    print("\t{} entries are bitwarden secure notes".format(len(bw_secure_notes_df)))
    assert bw_secure_notes_df[['username', 'password']].isnull().all().all() # make sure all usernames/passwords are empty
    bitwarden_df.loc[bw_secure_notes_df.index,'url'] = lp_sn_url_format

    has_extra_df = bitwarden_df[pd.notna(bitwarden_df['extra'])]
    bw_extra_df = has_extra_df[has_extra_df['url'] != lp_sn_url_format]
    assert pd.notna(bw_extra_df[['password','url']]).all().all()
    print("\t{} entries are logins with notes".format(len(bw_extra_df)))

    no_url_mask = bitwarden_df['url'].isnull()
    bitwarden_df.loc[no_url_mask,'url'] = lp_sn_url_format
    print("{} entries have no url, setting as secure notes".format(no_url_mask.sum()))
Example #4
    def test_where_tz(self):
        i = pd.date_range('20130101', periods=3, tz='US/Eastern')
        result = i.where(notna(i))
        expected = i
        tm.assert_index_equal(result, expected)

        i2 = i.copy()
        i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist())
        result = i.where(notna(i2))
        expected = i2
        tm.assert_index_equal(result, expected)
Example #5
    def test_where(self):
        i = self.create_index()
        result = i.where(notna(i))
        expected = i
        tm.assert_index_equal(result, expected)

        i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(),
                            freq='D')
        result = i.where(notna(i2))
        expected = i2
        tm.assert_index_equal(result, expected)
Example #6
    def test_where(self):
        i = self.create_index()
        result = i.where(notna(i))
        expected = i
        tm.assert_index_equal(result, expected)

        i2 = pd.CategoricalIndex([np.nan, np.nan] + i[2:].tolist(),
                                 categories=i.categories)
        result = i.where(notna(i2))
        expected = i2
        tm.assert_index_equal(result, expected)
Example #7
    def test_series_setitem(
            self, multiindex_year_month_day_dataframe_random_data):
        ymd = multiindex_year_month_day_dataframe_random_data
        s = ymd['A']

        s[2000, 3] = np.nan
        assert isna(s.values[42:65]).all()
        assert notna(s.values[:42]).all()
        assert notna(s.values[65:]).all()

        s[2000, 3, 10] = np.nan
        assert isna(s[49])
Example #8
    def test_properties(self, closed):
        index = self.create_index(closed=closed)
        assert len(index) == 10
        assert index.size == 10
        assert index.shape == (10, )

        tm.assert_index_equal(index.left, Index(np.arange(10)))
        tm.assert_index_equal(index.right, Index(np.arange(1, 11)))
        tm.assert_index_equal(index.mid, Index(np.arange(0.5, 10.5)))

        assert index.closed == closed

        ivs = [Interval(l, r, closed) for l, r in zip(range(10), range(1, 11))]
        expected = np.array(ivs, dtype=object)
        tm.assert_numpy_array_equal(np.asarray(index), expected)

        # with nans
        index = self.create_index_with_nan(closed=closed)
        assert len(index) == 10
        assert index.size == 10
        assert index.shape == (10, )

        expected_left = Index([0, np.nan, 2, 3, 4, 5, 6, 7, 8, 9])
        expected_right = expected_left + 1
        expected_mid = expected_left + 0.5
        tm.assert_index_equal(index.left, expected_left)
        tm.assert_index_equal(index.right, expected_right)
        tm.assert_index_equal(index.mid, expected_mid)

        assert index.closed == closed

        ivs = [Interval(l, r, closed) if notna(l) else np.nan
               for l, r in zip(expected_left, expected_right)]
        expected = np.array(ivs, dtype=object)
        tm.assert_numpy_array_equal(np.asarray(index), expected)
Example #9
def dedup_company(company_common_list, contact_common_list):
    company_common_list['ComName_temp'] = None
    company_common_list['vc_Deduplicate'] = None
    # company_common_list['Load'] = None
    company_common_list['vc_Master_ID'] = None
    for index, company in company_common_list.iterrows():
        if pd.notna(company['Company_Name_CN']):
            company_common_list.loc[index, 'ComName_temp'] = extract_keyword(company['Company_Name_CN'])
        else:
            company_common_list.loc[index, 'ComName_temp'] = format_space(str(company['Company_Name']).strip().lower())
    company_common_list['vc_Deduplicate'] = company_common_list.duplicated(subset=['ComName_temp'], keep=False)
    company_common_list['vc_Deduplicate'] = company_common_list['vc_Deduplicate'].apply(lambda x: False if x else True)
    # Duplicate list needs review
    company_duplicate_list = company_common_list[company_common_list['vc_Deduplicate'] == False]
    company_duplicate_list['Load'] = False
    # Full duplicate list
    company_duplicate_full = company_duplicate_list
    company_duplications = list(company_duplicate_list.groupby(['ComName_temp']).count().index)
    for dup in company_duplications:
        company_dup_group = company_duplicate_list[company_duplicate_list['ComName_temp'] == dup]
        company_masterid, company_common_list, company_dup_group = dedup_get_master(company_common_list, company_dup_group)

        if company_masterid is None:
            continue
        else:
            # Merge similar companies, set master company load as TRUE
            company_duplicate_full.loc[company_duplicate_full['Source_ID'] == company_masterid, 'Load'] = True
            company_common_list, contact_common_list = dedup_fix(company_common_list, contact_common_list, company_dup_group)
            company_duplicate_list = company_duplicate_list[company_duplicate_list['ComName_temp'] != dup]

    company_common_list.loc[company_common_list['vc_Deduplicate'] == False, 'Load'] = False
    return company_duplicate_list, company_duplicate_full, company_common_list, contact_common_list
Example #10
    def _check_stat_op(self, name, alternate, string_series_,
                       check_objects=False, check_allna=False):

        with pd.option_context('use_bottleneck', False):
            f = getattr(Series, name)

            # add some NaNs
            string_series_[5:15] = np.NaN

            # mean, idxmax, idxmin, min, and max are valid for dates
            if name not in ['max', 'min', 'mean']:
                ds = Series(pd.date_range('1/1/2001', periods=10))
                with pytest.raises(TypeError):
                    f(ds)

            # skipna or no
            assert pd.notna(f(string_series_))
            assert pd.isna(f(string_series_, skipna=False))

            # check the result is correct
            nona = string_series_.dropna()
            tm.assert_almost_equal(f(nona), alternate(nona.values))
            tm.assert_almost_equal(f(string_series_), alternate(nona.values))

            allna = string_series_ * np.nan

            if check_allna:
                assert np.isnan(f(allna))

            # dtype=object with None, it works!
            s = Series([1, 2, 3, None, 5])
            f(s)

            # GH#2888
            items = [0]
            items.extend(lrange(2 ** 40, 2 ** 40 + 1000))
            s = Series(items, dtype='int64')
            tm.assert_almost_equal(float(f(s)), float(alternate(s.values)))

            # check date range
            if check_objects:
                s = Series(pd.bdate_range('1/1/2000', periods=10))
                res = f(s)
                exp = alternate(s)
                assert res == exp

            # check on string data
            if name not in ['sum', 'min', 'max']:
                with pytest.raises(TypeError):
                    f(Series(list('abc')))

            # Invalid axis.
            with pytest.raises(ValueError):
                f(string_series_, axis=1)

            # Unimplemented numeric_only parameter.
            if 'numeric_only' in compat.signature(f).args:
                with pytest.raises(NotImplementedError, match=name):
                    f(string_series_, numeric_only=True)
Example #11
    def test_valid(self):
        ts = self.ts.copy()
        ts[::2] = np.NaN

        result = ts.valid()
        assert len(result) == ts.count()
        tm.assert_series_equal(result, ts[1::2])
        tm.assert_series_equal(result, ts[pd.notna(ts)])
Example #12
def init_list(raw_list, colnames, **kwargs):
    for col in colnames:
        for i in null_list:
            if col not in list(raw_list) or pd.isnull(raw_list[col]).all():
                break
            else:
                if col in ['Source_ID', 'Source_Company_ID']:
                    continue
                else:
                    raw_list[col] = raw_list[col].astype(object).str.lower().replace(i, np.nan, regex=True)
                    raw_list[col] = raw_list[col].astype(object).str.title()

    if kwargs['mode'] == 'Company':
        raw_list['db_New'] = True
        raw_list['Load'] = True
        raw_list['Company_Name_CN'] = raw_list.loc[pd.notnull(raw_list['Company_Name_CN']), 'Company_Name_CN'].apply(lambda x: x.replace(' ', ''))
        for index, company in raw_list.iterrows():
            address_list = dict()
            if pd.notna(company['Billing_Address_CN']):
                address_list = enrich_address(company['Billing_Address_CN'])
            elif pd.notna(company['Billing_Address']):
                address_list = enrich_address(company['Billing_Address'])
            for key in address_list.keys():
                raw_list.loc[index, key] = address_list[key]
        # if len(args) > 2:
        #     raw_list['Source_ID'] = raw_list['Source_ID'].apply(lambda x: args[1] + '_' + args[2] + '_' + 'Company' + '_' + str(x))
    if kwargs['mode'] == 'Contact':
        raw_list['db_New'] = True
        raw_list['Load'] = True
        raw_list['Source_ID'] = list(range(1, (len(raw_list) + 1)))
        raw_list['Source_ID'] = raw_list['Source_ID'].apply(lambda x: kwargs['sourcename'] + '_' + kwargs['timestamp'] + '_' + 'Contact' + '_' + str(x))
        if 'company' in kwargs.keys():
            company_list = kwargs['company']
            raw_list['Billing_Address'] = company_list['Billing_Address']
            raw_list['Billing_Address_CN'] = company_list['Billing_Address_CN']
            raw_list['District'] = company_list['District']
            raw_list['District_CN'] = company_list['District_CN']
            raw_list['City'] = company_list['City']
            raw_list['City_CN'] = company_list['City_CN']
            raw_list['State'] = company_list['State']
            raw_list['State_CN'] = company_list['State_CN']
            raw_list['Postal_Code'] = company_list['Postal_Code']
            raw_list['Country'] = company_list['Country']
        # if len(args) > 2 and raw_list:
        #     raw_list['Source_Company_ID'] = raw_list['Source_Company_ID'].apply(lambda x: args[1] + '_' + args[2] + '_' + 'Company' + '_' + str(x))
    return raw_list
Example #13
    def test_where_other(self):
        # other is ndarray or Index
        i = pd.date_range('20130101', periods=3, tz='US/Eastern')

        for arr in [np.nan, pd.NaT]:
            result = i.where(notna(i), other=np.nan)
            expected = i
            tm.assert_index_equal(result, expected)

        i2 = i.copy()
        i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist())
        result = i.where(notna(i2), i2)
        tm.assert_index_equal(result, i2)

        i2 = i.copy()
        i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist())
        result = i.where(notna(i2), i2.values)
        tm.assert_index_equal(result, i2)
Example #14
    def test_where_other(self):
        i = period_range('20130101', periods=5, freq='D')
        for arr in [np.nan, pd.NaT]:
            result = i.where(notna(i), other=np.nan)
            expected = i
            tm.assert_index_equal(result, expected)

        i2 = i.copy()
        i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(),
                            freq='D')
        result = i.where(notna(i2), i2)
        tm.assert_index_equal(result, i2)

        i2 = i.copy()
        i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(),
                            freq='D')
        result = i.where(notna(i2), i2.values)
        tm.assert_index_equal(result, i2)
Example #15
 def test_override_inferred_closed(self, constructor, data, closed):
     # GH 19370
     if isinstance(data, IntervalIndex):
         tuples = data.to_tuples()
     else:
         tuples = [(iv.left, iv.right) if notna(iv) else iv for iv in data]
     expected = IntervalIndex.from_tuples(tuples, closed=closed)
     result = constructor(data, closed=closed)
     tm.assert_index_equal(result, expected)
Example #16
    def test_periodindex(self):
        from pandas import period_range, PeriodIndex
        # array or list of dates
        N = 50
        rng = period_range('1/1/1990', periods=N, freq='H')
        ts = Series(np.random.randn(N), index=rng)
        ts[15:30] = np.nan
        dates = date_range('1/1/1990', periods=N * 3, freq='37min')

        result = ts.asof(dates)
        assert notna(result).all()
        lb = ts.index[14]
        ub = ts.index[30]

        result = ts.asof(list(dates))
        assert notna(result).all()
        lb = ts.index[14]
        ub = ts.index[30]

        pix = PeriodIndex(result.index.values, freq='H')
        mask = (pix >= lb) & (pix < ub)
        rs = result[mask]
        assert (rs == ts[lb]).all()

        ts[5:10] = np.nan
        ts[15:20] = np.nan

        val1 = ts.asof(ts.index[7])
        val2 = ts.asof(ts.index[19])

        assert val1 == ts[4]
        assert val2 == ts[14]

        # accepts strings
        val1 = ts.asof(str(ts.index[7]))
        assert val1 == ts[4]

        # in there
        assert ts.asof(ts.index[3]) == ts[3]

        # no as of value
        d = ts.index[0].to_timestamp() - offsets.BDay()
        assert isna(ts.asof(d))
Example #17
def validate_name(contact):
    nfirst = True
    nlast = False
    nspace = False

    # Remove more than two space and starting/ending space, format Last_Name
    if pd.notna(contact['Last_Name']):
        contact['Last_Name'] = format_space(contact['Last_Name'].lower().capitalize())
    if pd.notna(contact['First_Name']):
        contact['First_Name'] = format_space(contact['First_Name'])
    if pd.isna(contact['Reject_Reason']):
        contact['Reject_Reason'] = ''
    # Check First_Name and Last_Name misplace

    for lan in lastname_list.iloc[:, 1:]:
        lastnames = list(lastname_list[lan])
        if contact['Last_Name'] in lastnames:
            contact['vn_Lastname_CN'] = lastname_list['简体中文'].iloc[lastnames.index(contact['Last_Name'])]
            nlast = True
            break
        elif contact['First_Name'] in lastnames:
            nfirst = False
            break
    if not (nlast or nfirst):

        contact['Reject_Reason'] = contact['Reject_Reason'] + 'First_Name_CN and Last_Name_CN misplace;  '

    # Check name contains space
    if pd.notna(contact['First_Name']) and pd.notna(contact['Last_Name']):
        if ' ' in contact['First_Name'] or ' ' in contact['Last_Name']:
            contact['Reject_Reason'] = contact['Reject_Reason'] + 'Name contains space;  '
        else:
            nspace = True
    else:
        nspace = True

    # Name check
    ncheck = (nlast or nfirst) and nspace
    contact['vn_Name_Swap'] = (nlast or nfirst)
    contact['vn_Name_Space'] = nspace
    contact['vn_Name_Check'] = ncheck

    return contact
Example #18
def NanCleanerApply(x):
    #@param x is a column of the dataset
    maskNan = pd.isna(x)
    maskNotNan = pd.notna(x)
    notNan = x[maskNotNan]
    nan = x[maskNan]
    avg = int(np.average(notNan))
    for i in range (0, len(x)):
        if(pd.isna(x[i])):
            x[i] = avg
    return x
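
A sketch of how this column-wise imputer might be applied, assuming a DataFrame with a default RangeIndex (the function indexes positions 0..len-1); the sample data is invented.

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [4.0, 5.0, np.nan]})
df = df.apply(NanCleanerApply)   # each NaN becomes the column's truncated mean

The same result could likely be had with df.fillna(df.mean().astype(int)), which avoids the Python-level loop.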
Example #19
    def test_length(self, closed, breaks):
        # GH 18789
        index = IntervalIndex.from_breaks(breaks, closed=closed)
        result = index.length
        expected = Index(iv.length for iv in index)
        tm.assert_index_equal(result, expected)

        # with NA
        index = index.insert(1, np.nan)
        result = index.length
        expected = Index(iv.length if notna(iv) else iv for iv in index)
        tm.assert_index_equal(result, expected)
Example #20
    def test_where(self):
        i = self.create_index()
        result = i.where(notna(i))
        expected = i
        tm.assert_index_equal(result, expected)

        _nan = i._na_value
        cond = [False] + [True] * len(i[1:])
        expected = pd.Index([_nan] + i[1:].tolist(), dtype=i.dtype)

        result = i.where(cond)
        tm.assert_index_equal(result, expected)
Example #21
    def get_kwargs_from_breaks(self, breaks, closed='right'):
        """
        converts intervals in breaks format to a dictionary of kwargs
        specific to the format expected by the IntervalIndex/Index constructors
        """
        if len(breaks) == 0:
            return {'data': breaks}

        ivs = [Interval(l, r, closed) if notna(l) else l
               for l, r in zip(breaks[:-1], breaks[1:])]

        if isinstance(breaks, list):
            return {'data': ivs}
        return {'data': np.array(ivs, dtype=object)}
Example #22
def enrich_no_address(company_load_list, company_address_review):
    company_address_review = company_address_review[company_address_review['Load'] == True]
    for index, company in company_address_review.iterrows():
        sourceid = company['Source_ID']
        if pd.notna(company['Billing_Address_CN']):
            address_list = enrich_address(company['Billing_Address_CN'])
        elif pd.notna(company['Billing_Address']):
            address_list = enrich_address(company['Billing_Address'])
        for key in address_list.keys():
            company_load_list.loc[company_load_list['Source_ID'] == sourceid, key] = address_list[key]
        # if pd.isna(company_load_list.loc[index, 'State']):
        #     company_load_list.loc[index, 'State'] = state
        # if pd.isna(company_load_list.loc[index, 'City']):
        #     company_load_list.loc[index, 'City'] = city
        # if pd.isna(company_load_list.loc[index, 'District']):
        #     company_load_list.loc[index, 'District'] = district
        # if pd.isna(company_load_list.loc[index, 'Postal_Code']):
        #     company_load_list.loc[index, 'Postal_Code'] = zipcode

    # company_load_list.loc[pd.notnull(company_load_list['District']), 'Full_Address'] = company_load_list['District'] + company_load_list['Billing_Address']
    # company_load_list.loc[pd.isnull(company_load_list['District']), 'Full_Address'] = company_load_list['Billing_Address']

    return company_load_list
Example #23
def dedup_comany_db(company_dedup_list, company_db_return):
    if company_db_return.empty:
        return company_db_return

    company_merge_list = company_dedup_list.merge(company_db_return, on=['ComName_temp'], suffixes=['', '_db'], how='left')
    company_existing_list = company_merge_list[pd.notna(company_merge_list['Source_ID_db'])]
    company_existing_list['db_New'] = False
    company_existing_list['Load'] = False
    existing_company = company_existing_list['Source_ID'].tolist()
    existing_company = pd.Series(company_dedup_list['Source_ID'].isin(existing_company))
    company_dedup_list.loc[existing_company, 'db_New'] = False
    company_dedup_list.loc[existing_company, 'Load'] = False

    return company_dedup_list, company_existing_list
Example #24
def enrich_scrapy(company, scrapy):

    if scrapy.empty:
        return company

    else:
        if pd.notna(scrapy['英文名']).any():
            company['Company_Name'] = scrapy['英文名'].values[0]
        company['Company_Name_CN'] = scrapy['公司名称'].values[0]
        if scrapy['境外公司'].values[0]:
            company['Country'] = ''
        else:
            company['Country'] = 'China'

        if pd.isna(company['Billing_Address_CN']):
            address_list = enrich_address(scrapy['地址'].values[0])
            for key in address_list.keys():
                company[key] = address_list[key]

            # Set state as '所属地区'
            if pd.notna(scrapy['所属地区']).all():
                state = scrapy['所属地区'].values[0]
                states = geo_list[geo_list['Level ID'] == 1]
                for index, s in states.iterrows():
                    if s['Full Name'] == state or s['Name'] == state or s['PingYin3']  == state.lower():
                        company['State_CN'] = s['Name']
                        company['State'] = s['PingYin3'].capitalize()
                        break

        # company['Company_Type'] = scrapy['公司类型'].values[0]
        company['Phone'] = scrapy['电话'].values[0]
        company['Website'] = scrapy['网址'].values[0]
        company['Email'] = scrapy['邮箱'].values[0]
        company['Industry'] = scrapy['所属行业'].values[0]
        company['Employee'] = scrapy['参保人数'].values[0]

    return company
Example #25
def test_custom_grouper(index):

    dti = index
    s = Series(np.array([1] * len(dti)), index=dti, dtype='int64')

    b = TimeGrouper(Minute(5))
    g = s.groupby(b)

    # check all cython functions work
    funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
    for f in funcs:
        g._cython_agg_general(f)

    b = TimeGrouper(Minute(5), closed='right', label='right')
    g = s.groupby(b)
    # check all cython functions work
    funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
    for f in funcs:
        g._cython_agg_general(f)

    assert g.ngroups == 2593
    assert notna(g.mean()).all()

    # construct expected val
    arr = [1] + [5] * 2592
    idx = dti[0:-1:5]
    idx = idx.append(dti[-1:])
    expect = Series(arr, index=idx)

    # GH2763 - return input dtype if we can
    result = g.agg(np.sum)
    assert_series_equal(result, expect)

    df = DataFrame(np.random.rand(len(dti), 10),
                   index=dti, dtype='float64')
    r = df.groupby(b).agg(np.sum)

    assert len(r.columns) == 10
    assert len(r.index) == 2593
Example #26
def validate_contacts(contact_dedup_list, contact_colnames, company_scrapy_list):
    contact_validate_list = pd.DataFrame(columns=contact_colnames)

    for index, contact in contact_dedup_list.iterrows():
        sourceid = contact['Source_Company_ID']
        company = company_scrapy_list.loc[company_scrapy_list['Source_ID'] == sourceid]
        contact = validate_name(contact)
        contact = validate_email(contact, company)
        if not company.empty:
            contact['Company_Name'] = company['Company_Name'].values[0]
            contact['Company_Name_CN'] = company['Company_Name_CN'].values[0]
            if pd.isna(contact['Billing_Address']):
                contact['Billing_Address'] = company['Full_Address'].values[0]
            if pd.isna(contact['City']):
                contact['City'] = company['City'].values[0]
            if pd.isna(contact['State']):
                contact['State'] = company['State'].values[0]
            if pd.isna(contact['Postal_Code']):
                contact['Postal_Code'] = company['Postal_Code'].values[0]
            if pd.isna(contact['Country']):
                contact['Country'] = company['Country'].values[0]
        if pd.isna(contact['Mobile']) and pd.isna(contact['Phone']) and pd.isna(contact['Email']):
            contact['Reject_Reason'] = contact['Reject_Reason'] + 'No communication method;  '
        contact['Load'] = contact['vn_Name_Check'] and (contact['ve_Email_Check'] or pd.notna(contact['Mobile']) or pd.notna(contact['Phone'])) and contact['db_New']

        contact_validate_list = contact_validate_list.append(contact, ignore_index=True)
    # Deduplicate by name and email
    contact_validate_list['Fname_temp'] = contact_validate_list['First_Name'].apply(lambda x: x if x is np.nan else x.lower())
    contact_validate_list['Lname_temp'] = contact_validate_list['Last_Name'].apply(lambda x: x if x is np.nan else x.lower())
    # TODO: keep only letters in email as Email_temp
    # Switch True and False
    contact_validate_list['vc_Deduplicate'] = contact_validate_list.duplicated(subset=['Fname_temp', 'Lname_temp', 'Email'], keep=False)
    contact_validate_list['vc_Deduplicate'] = contact_validate_list['vc_Deduplicate'].apply(lambda x: False if x else True)
    contact_validate_list.loc[contact_validate_list['vc_Deduplicate'] == False, 'Reject_Reason'] = contact_validate_list['Reject_Reason'].astype(str) + 'Duplicates in source data; '
    contact_validate_list['Load'] = contact_validate_list['Load'] & contact_validate_list['vc_Deduplicate']
    return contact_validate_list
Example #27
 def transform2(row):
     if (notna(row['C']) and row['C'].startswith('shin') and
             row['A'] == 'foo'):
         row['D'] = 7
     return row
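
A row-wise apply is the natural way to exercise a transform like this; a minimal sketch with invented data (transform2 assumes pandas' notna is in scope).

import numpy as np
import pandas as pd
from pandas import notna   # assumed import for the transform above

df = pd.DataFrame({'A': ['foo', 'foo'], 'C': ['shiny', np.nan], 'D': [0, 0]})
result = df.apply(transform2, axis=1)   # D becomes 7 only where C starts with 'shin' and A == 'foo'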
Example #28
                            op='intersects')
dcasindexparcel['facindex'] = pd.qcut(
    dcasindexparcel['facilitypa'], 50, labels=False) + 1
dcasindexparcel['spdindex'] = 50 - pd.qcut(
    dcasindexparcel['avgspeed'], 50, labels=False)
dcasindexparcel['dcasindex'] = (dcasindexparcel['facindex'] +
                                dcasindexparcel['spdindex'])
dcasindexparcel = dcasindexparcel[[
    'facindex', 'spdindex', 'dcasindex', 'geometry'
]].reset_index(drop=True)
dcasindexparcel.to_file(path + 'OUTPUT/dcasindexparcel.shp')

# DCAS Index by NTA
dcasindexparcel = gpd.read_file(path + 'OUTPUT/dcasindexparcel.shp')
dcasindexparcel.crs = {'init': 'epsg:4326'}
dcasindexparcel = dcasindexparcel[pd.notna(
    dcasindexparcel['dcasindex'])].reset_index(drop=True)
ntaclippedadj = gpd.read_file(path + 'SHP/ntaclippedadj.shp')
ntaclippedadj.crs = {'init': 'epsg:4326'}
dcasindexnta = gpd.sjoin(dcasindexparcel,
                         ntaclippedadj,
                         how='inner',
                         op='intersects')
dcasindexnta = dcasindexnta.groupby(['NTACode', 'NTAName'],
                                    as_index=False).agg({
                                        'facindex': 'mean',
                                        'spdindex': 'mean',
                                        'dcasindex': 'mean'
                                    }).reset_index(drop=True)
facilityntact = gpd.read_file(path + 'OUTPUT/facilityparcelwgs.shp')
facilityntact.crs = {'init': 'epsg:4326'}
facilityntact = gpd.sjoin(facilityntact,
Example #29
def categorical_impute(X):
    for i in range(len(X)):
        for j in range(len(X[0])):
            if not (pd.notna(X[i][j])):
                X[i][j] = 'No'
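
The imputer above expects a 2-D, row-major container that supports item assignment and fills every missing categorical cell with 'No' in place; a small sketch with made-up data (it also assumes pandas is imported as pd in its module).

import numpy as np

X = [['Yes', np.nan], [np.nan, 'Yes']]
categorical_impute(X)   # mutates X in place
print(X)                # [['Yes', 'No'], ['No', 'Yes']]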
Example #30
                      sep=";")

terc_voivodeships = {
    row[1][0]: row[1][1]
    for row in terc_df[pd.isna(terc_df['POW'])][["WOJ", "NAZWA"]].iterrows()
}

terc_counties = {
    row[1][0] + row[1][1]: row[1][2]
    for row in terc_df[terc_df['POW'].notna()][pd.isna(terc_df['GMI'])]
    [["WOJ", "POW", "NAZWA"]].iterrows()
}

terc_communes = {
    row[1][0] + row[1][1] + row[1][2]: row[1][3]
    for row in terc_df[pd.notna(terc_df['RODZ'])]
    [["WOJ", "POW", "GMI", "NAZWA"]].iterrows()
}

terc_communetypes = {
    row[1][0] + row[1][1] + row[1][2] + row[1][3]: row[1][4]
    for row in terc_df[pd.notna(terc_df['RODZ'])]
    [["WOJ", "POW", "GMI", "RODZ", "NAZWA_DOD"]].iterrows()
}

# Populacje w gminach

terc_populations = {}

for i, population in pd.read_excel('data/original/tabela17.xls',
                                   usecols="B:C",
Example #31
    def run(self):
        # ESP_MSVP stopped at Actualización nº 116 on 25.05.2020
        start = 116
        stop = (date.today() - date(2020, 5, 25)).days + 117
        if self.sliding_window_days:
            start = max(start, stop - self.sliding_window_days)

        for actualizacion in range(start, stop):
            parsed = self.fetch(actualizacion)
            time.sleep(5)  # crawl delay
            if parsed is None:
                continue
            content = unicodedata.normalize('NFKC', parsed['content'])
            fecha = datetime.strptime(get_fecha(content),
                                      '%d.%m.%Y').strftime('%Y-%m-%d')
            tabs = get_ccaa_tables(content,
                                   ['Tabla 1. Casos', 'Tabla 2. Casos'])

            if 'Acrobat Distiller' in parsed['metadata'][
                    'producer']:  # fragile
                tabs[0] = [[col for col in row if col != '']
                           for row in tabs[0]]
                tabs[1] = [[col for col in row if col != '']
                           for row in tabs[1]]

            df1 = pd.DataFrame([row[0:2] for row in tabs[0]],
                               columns=['ccaa', 'confirmed'])
            df2 = pd.DataFrame(
                [[row[i] for i in (0, 1, 3, 5)] for row in tabs[1]],
                columns=['ccaa', 'hospitalised', 'hospitalised_icu', 'dead'])
            data = pd.merge(df1, df2, on='ccaa')

            for index, record in data.iterrows():
                # ccaa,confirmed,hospitalised,hospitalised_icu,dead
                ccaa = record[0]
                confirmed = int(record[1]) if pd.notna(record[1]) else None
                hospitalised = int(record[2]) if pd.notna(record[2]) else None
                hospitalised_icu = int(record[3]) if pd.notna(
                    record[3]) else None
                dead = int(record[4]) if pd.notna(record[4]) else None

                success, adm_area_1, adm_area_2, adm_area_3, gid = self.adm_translator.tr(
                    country_code='ESP',
                    input_adm_area_1=ccaa,
                    input_adm_area_2=None,
                    input_adm_area_3=None,
                    return_original_if_failure=True)

                upsert_obj = {
                    'source': self.SOURCE,
                    'date': fecha,
                    'country': 'Spain',
                    'countrycode': 'ESP',
                    'adm_area_1': adm_area_1,
                    'adm_area_2': adm_area_2,
                    'adm_area_3': adm_area_3,
                    'confirmed': confirmed,
                    'dead': dead,
                    'hospitalised': hospitalised,
                    'hospitalised_icu': hospitalised_icu,
                    'gid': gid
                }
                self.upsert_data(**upsert_obj)
Example #32
def fill_location_for_tours_df(tdf, network_df, data_content):
    tid = tdf[pd.isna(tdf['latitude'])]
    pbar = tqdm(total=tid.shape[0],
                bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}')
    pbar.set_description('Step 4 of 4')

    for idx, _ in tid.iterrows():
        cat = ['going', 'maybe', 'invited', 'not_going']
        bik = data_content.tour_convoy_df[
            data_content.tour_convoy_df['tour_id'] == tdf.loc[idx, 'tour_id']]
        coll = []
        for c in cat:
            if not pd.isna(bik[c].tolist()[0]):
                coll += bik[c].tolist()[0].split()
        g = network_df[network_df['biker_id'].isin(coll)]
        if g.shape[0] > 0:
            m, _ = mode(g[['latitude']], axis=0)
            if not np.isnan(m[0, 0]):
                index = g[g['latitude'] == m[0, 0]].index.tolist()[0]
                lat, long = g.loc[index, 'latitude'], g.loc[index, 'longitude']
                tdf.loc[idx, 'latitude'] = lat
                tdf.loc[idx, 'longitude'] = long
        pbar.update(1)
    pbar.close()

    bid = tdf[pd.isna(tdf['latitude'])]['biker_id'].drop_duplicates().tolist()
    chi = data_content.tours_df[data_content.tours_df['biker_id'].isin(bid)]
    chi = chi[pd.notna(chi['latitude'])].groupby('biker_id')[[
        'latitude', 'longitude'
    ]].agg(lambda x: x.value_counts().index[0])
    chi = chi.reset_index()

    for idx, _ in tdf[pd.isna(tdf['latitude'])].iterrows():
        m = chi[chi['biker_id'] == tdf.loc[idx, 'biker_id']]
        if m.shape[0] != 0:
            tdf.loc[idx, 'latitude'] = m['latitude'].tolist()[0]
            tdf.loc[idx, 'longitude'] = m['longitude'].tolist()[0]

    # Using tour_convoy_df to find tours attended by biker organizing this tour
    # and fill location from based on that information.
    coll = []
    tid = tdf[pd.isna(tdf['latitude'])]
    sdf = data_content.convoy_df[data_content.convoy_df['biker_id'].isin(
        tid['biker_id'].tolist())]

    for idx, _ in tid.iterrows():
        cat = ['going', 'maybe', 'invited', 'not_going']
        bik = sdf[sdf['biker_id'] == tid.loc[idx, 'biker_id']]
        if bik.shape[0] > 0:
            for c in cat:
                if not pd.isna(bik[c].tolist()[0]):
                    coll += bik[c].tolist()[0].split()

    small_df = data_content.tours_df[data_content.tours_df['tour_id'].isin(
        coll)]
    for idx, _ in tid.iterrows():
        cat = ['going', 'maybe', 'invited', 'not_going']
        bik = sdf[sdf['biker_id'] == tdf.loc[idx, 'biker_id']]
        if bik.shape[0] > 0:
            coll = []
            for c in cat:
                if not pd.isna(bik[c].tolist()[0]):
                    coll += bik[c].tolist()[0].split()
            g = small_df[small_df['tour_id'].isin(coll)]
            if g.shape[0] > 0:
                m, _ = mode(g[['latitude']], axis=0)
                if not np.isnan(m[0, 0]):
                    index = g[g['latitude'] == m[0, 0]].index.tolist()[0]
                    lat, long = g.loc[index, 'latitude'], g.loc[index,
                                                                'longitude']
                    tdf.loc[idx, 'latitude'] = lat
                    tdf.loc[idx, 'longitude'] = long

    return tdf
        print("No trades")
        equity_arr.append(equity)
        drawdown_arr.append(equity / high_eq)
        prevdate = date
    else:
        print("Long:")
        print(longs)
        print("Short:")
        print(shorts)
        long_returns = return_df.loc[return_df['asset'].isin(longs),
                                     'return_1d']
        short_returns = return_df.loc[return_df['asset'].isin(shorts),
                                      'return_1d']

        long_returns = [
            val - 1 for val in long_returns.to_numpy() if pd.notna(val)
        ]
        short_returns = [
            val - 1 for val in short_returns.to_numpy() if pd.notna(val)
        ]

        if len(long_returns) == 0:
            long_returns = [0]
        if len(short_returns) == 0:
            short_returns = [0]

        long_return = LEVERAGE * (INV_VAR * np.mean(long_returns))
        short_return = LEVERAGE * (INV_VAR * np.mean(short_returns))

        equity = equity * (long_return - short_return + 1 - FEE)
Example #34
 def transform2(row):
     if notna(row["C"]) and row["C"].startswith("shin") and row["A"] == "foo":
         row["D"] = 7
     return row
Example #35
 def make_properties(row):
     return [
         api.models.ModelProperty(key[2:],
                                  api.models.PropertyValue(row[key]))
         for key in prop_keys if pd.notna(row[key])
     ]
Example #36
 def count(s):
     return notna(s).sum()
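
A helper like this is typically handed to an aggregation; a small illustrative sketch (the data is invented, and notna is assumed to be pandas' notna).

import numpy as np
import pandas as pd
from pandas import notna   # assumed import for the helper above

s = pd.Series([1.0, np.nan, 3.0, np.nan], index=['a', 'a', 'b', 'b'])
per_group = s.groupby(level=0).agg(count)   # non-null count per group: a -> 1, b -> 1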
Example #37
def condense_census(df_in):
    df_out = condense_record(df_in, CENSUS_COLUMNS)
    # Drop data from state censuses & where year is missing
    df_out.year = df_out.year.apply(process_year)
    df_out = df_out[pd.notna(df_out.year) & (df_out.year % 10 == 0)]
    return df_out
Example #38
	def validate_dataTempo(self, df, column):
		size = pd.notna(df).sum()
		if size == 0:
			return False

		validated = 0
		# datetime
		for value in df:
			value = str(value).strip()
			# mask: date ^(0?[1-9]|[12][0-9]|3[01])(-|\/)((0?[1-9])|(1[0-2]))(-|\/)(\d{2}|\d{4})$
			# 1-1-12 - 01/01/1999
			if len(value) >=6 and len(value) <= 10:
				if re.fullmatch('^(0?[1-9]|[12][0-9]|3[01])(-|\/)((0?[1-9])|(1[0-2]))(-|\/)(\d{2}|\d{4})$', str(value)):
					validated = validated + 1
			# mask: time ^((0|1)?[0-9]|[2][0-3]):([0-5][0-9])(:([0-5][0-9]))?$
			# 1:00 - 23:59:59
			if len(value) >=4 and len(value) <= 8:
				if re.fullmatch('^((0|1)?[0-9]|[2][0-3]):([0-5][0-9])(:([0-5][0-9]))?$', str(value)):
					validated = validated + 1
			# mask: datetime ^(0?[1-9]|[12][0-9]|3[01])(-|\/)((0?[1-9])|(1[0-2]))(-|\/)(\d{2}|\d{4})\s((0|1)?[0-9]|[2][0-3]):([0-5][0-9])(:([0-5][0-9]))?$
			#1-1-12 1:00 - 01/01/1999 23:59:59
			if len(value) >=11 and len(value) <= 20:
				if re.fullmatch('^(0?[1-9]|[12][0-9]|3[01])(-|\/)((0?[1-9])|(1[0-2]))(-|\/)(\d{2}|\d{4})\s((0|1)?[0-9]|[2][0-3]):([0-5][0-9])(:([0-5][0-9]))?$', str(value)):
					validated = validated + 1

		if validated/size >= self.threshold:
			return True

		# checking month
		if "mes" == column or column.startswith("mes_"):
			# integer values
			months = set()
			for x in df:
				try:
					value = int(x)
				except:
					continue
				months.add(value)
			
			if len(months) > 0:
				if min(months) >= 1 and max(months) <= 12:
					return True

			# string
			validated = 0
			for x in df:
				if str(x).lower() in ['janeiro', 'fevereiro', 'março', 'maro', 'marco', 'abril', 'maio', 'junho', 'julho', 'agosto', 'setembro', 'outubro', 'novembro', 'dezembro', 'jan', 'fev', 'mar', 'abr', 'maio', 'jun', 'jul', 'ago', 'set', 'out', 'nov', 'dez']:
					validated = validated + 1

			if validated/size >= self.threshold:
				return True
	
		# checking year
		if "ano" == column or column.startswith("ano_"):
			# integer values
			validated = 0
			for value in df:
				try:
					value = int(value)
				except:
					continue
				if value >= 1900 and value <= 2099:
					validated = validated + 1

			if validated/size >= self.threshold:
				return True
			
		# checking month-year
		if "mes" == column or column.startswith("mes_") or "ano" == column or column.startswith("ano_"):
			# mask jan/19 dez-2020
			validated = 0
			for value in df:
				value = str(value).strip().lower()
				if len(value) >=6 and len(value) <= 9:
					if re.fullmatch('^(jan|fev|mar|abr|mai|maio|jun|jul|ago|set|out|nov|dez)(\/|-)(19|20)(\d{2})?$', value):
						validated = validated + 1

			if validated/size >= self.threshold:
				return True

			# mask 19/jan 2020-dez
			validated = 0
			for value in df:
				value = str(value).strip().lower()
				if len(value) >=6 and len(value) <= 9:
					if re.fullmatch('^(19|20)(\d{2})?(\/|-)(jan|fev|mar|abr|mai|maio|jun|jul|ago|set|out|nov|dez)$', value):
						validated = validated + 1

			

			if validated/size >= self.threshold:
				return True

		# Special case: month with year
		if ("ano" in column and "mes" in column) or column.startswith("anoems") or column.startswith("anoms") or ("mes" in column and "refer" in column):
			# case 1: mask = YearMonth 2020/1 201406
			validated = 0
			for value in df:
				value = str(value).strip()
				if len(value) ==6 or len(value) ==7:
					if re.fullmatch('^(19|20)\d{2}(-|\/)?(0[1-9]|1[0-2])$', value):
						validated = validated + 1

			if validated/size >= self.threshold:
				return True

			# case 2: mask = MonthYear 07-2014 062016
			validated = 0
			for value in df:
				value = str(value).strip()
				if len(value) ==6 or len(value) ==7:
					if re.fullmatch('^(0[1-9]|1[0-2])(-|\/)?(19|20)\d{2}$', value):
						validated = validated + 1

			if validated/size >= self.threshold:
				return True

			# case 3: mask = Month/Year abr/16
			validated = 0
			for value in df:
				value = str(value).strip()
				if len(value) ==6 or len(value) ==7:
					if re.fullmatch('^(jan|fev|mar|abr|mai|maio|jun|jul|ago|set|out|nov|dez)(\/|-)\d{2}$', value):
						validated = validated + 1

			if validated/size >= self.threshold:
				return True

		# special cases without format
		if column.startswith("dta_") or column.startswith("data_") or column.startswith("dt_"):
			# mask yearMonthDay - 20170927
			validated = 0
			for value in df:
				value = str(value).strip()
				if len(value) ==8:
					if re.fullmatch('^(19|20)\d{2}(0[1-9]|1[0-2])((0[1-9])|((1|2)[0-9]))$', value):
						validated = validated + 1

			if validated/size >= self.threshold:
				return True

		return False
Example #39
            if high_se is not None and low_se is not None:
                detection_df.loc[index, 'SE (D)'] = (low_se + high_se) / 2

            elif high_se is not None:
                detection_df.loc[index, 'SE (D)'] = high_se

            elif low_se is not None:
                detection_df.loc[index, 'SE (D)'] = low_se

    # Maximum Likelihood Spatial Capture-Recapture
    if pd.isnull(row['SE (D).1']):
        if pd.notnull(row['Estimated D.1']):
            high_se = None
            low_se = None
            if pd.notna(row['Upper CI (D).1']):
                high_se = high_ci_to_se(row['Upper CI (D).1'],
                                        row['Estimated D.1'])

            if pd.notnull(row['Lower CI (D).1']):
                low_se = low_ci_to_se(row['Lower CI (D).1'],
                                      row['Estimated D.1'])

            if high_se is not None and low_se is not None:
                detection_df.loc[index, 'SE (D)'] = (low_se + high_se) / 2

            elif high_se is not None:
                detection_df.loc[index, 'SE (D)'] = high_se

            elif low_se is not None:
                detection_df.loc[index, 'SE (D)'] = low_se
Example #40
def notna(obj):
    if isinstance(obj, BasePandasDataset):
        return obj.notna()
    else:
        return pandas.notna(obj)
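
This wrapper keeps the pandas.notna signature but dispatches to the wrapping library's own .notna() when handed one of its BasePandasDataset objects (the pattern used by pandas-compatible libraries); everything else falls through to plain pandas. A rough sketch of the fall-through behaviour, assuming the wrapper's module has pandas and BasePandasDataset imported:

import numpy as np

notna(np.nan)           # False: scalars fall through to pandas.notna
notna([1.0, np.nan])    # array([ True, False]): array-likes likewise
# An instance of the library's own DataFrame/Series (a BasePandasDataset subclass)
# would instead have its .notna() method called, returning the library's object.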
Example #41
def interpolate_predict(method="index"):
    start = datetime.datetime.now()
    data = pd.read_hdf(data_path)

    final_result = pd.DataFrame()
    score_df = pd.DataFrame()
    score_df["var"] = var_col
    for i in tqdm(range(1, 34)):
        sub = data[data["wtid"] == i]
        score_temp = []
        for var in var_col:
            sub1 = sub[pd.notna(sub[var])].reset_index(drop=True)
            index = 0
            for index, t in enumerate(tool.types):
                if var in t:
                    break
            col_name = str(index) + "_test"
            sub2 = sub1[[var]].copy()
            sub1.loc[sub1[col_name] == 1, var] = np.nan
            sub1[var] = sub1[var].interpolate(method=method)

            true_value = sub2[sub1[col_name] == 1][var]
            predict_value = sub1[sub1[col_name] == 1][var]
            if_round = False
            if var in category_col:
                predict_value = np.array(predict_value).astype(int)
                true_value = np.array(true_value).astype(int)
                score = tool.label_score(true_value, predict_value)
            else:
                score = tool.regression_score(true_value, predict_value)
                predict_value2 = np.round(predict_value, 2)
                score2 = tool.regression_score(true_value, predict_value2)
                if score < score2 - threshold:
                    score = score2
                    if_round = 2
                predict_value2 = np.round(predict_value, 1)
                score2 = tool.regression_score(true_value, predict_value2)
                if score < score2 - threshold:
                    score = score2
                    if_round = 1
            score_temp.append(score)

            # Prediction results
            sub[var] = sub[var].interpolate(method=method)
            if if_round:
                sub[var] = np.round(sub[var], if_round)

        final_result = pd.concat((final_result, sub),
                                 axis=0,
                                 ignore_index=True)
        score_df[str(i)] = score_temp

    score_df.set_index("var", inplace=True)
    score_df = score_df.T
    score_df.reset_index(inplace=True)
    score_df.rename(columns={"index": "wtid"}, inplace=True)
    score_df.to_csv("./result/{}_score.csv".format(method),
                    encoding="utf8",
                    index=False,
                    float_format='%.4f')

    final_result = final_result[final_result["count_miss"] > 0]
    final_result = final_result[head_col]
    final_result.sort_values(["wtid", "ts"], inplace=True)
    for var in category_col:
        final_result[var] = final_result[var].astype(int)
    final_result.to_csv("./result/{}_result.csv".format(method),
                        encoding="utf8",
                        index=False,
                        float_format='%.2f')
    end = datetime.datetime.now()
    print("finish", method, "interpolate_predict time: ", end - start)
Example #42
 def make_identifiers(row):
     return {
         identifier: api.models.InstrumentIdValue(row[identifier])
         for identifier in identifiers if pd.notna(row[identifier])
     }
Example #43
def qichacha(company_input_list, path, sheetname):

    company_count = len(company_input_list)
    company_progress = 0

    # Find existing file
    try:
        company_scrapy_result = pd.read_excel(path, sheet_name=sheetname)
        # Remove breakpoint record
        # company_keyword_break = company_scrapy_result[company_scrapy_result['ID'] == 'breakpoint']['搜索词']
        company_sourceid_break = company_scrapy_result[company_scrapy_result['ID'] == 'breakpoint']['Source_ID'].values[0]
        company_progress = len(company_scrapy_result['Source_ID'].unique().tolist())
        company_scrapy_result = company_scrapy_result[company_scrapy_result['Source_ID'] != company_sourceid_break]
        company_done = company_scrapy_result['Source_ID'].unique().tolist()
        # if company_input_list[company_input_list['Company_Name_CN'] == company_keyword_break].empty == False:
        #     company_input_break = np.array(
        #         company_input_list[company_input_list['Company_Name_CN'] == company_keyword_break].index).tolist()[0]
        # else:
        #     company_input_break = np.array(
        #         company_input_list[company_input_list['Company_Name'] == company_keyword_break].index).tolist()[0]
        company_progress = len(company_input_list[company_input_list['Source_ID'].isin(company_done)])
        company_input_list = company_input_list[~company_input_list['Source_ID'].isin(company_done)]
        print('Restart from breakpoint.')
    # First time running
    except:
        company_scrapy_result = pd.DataFrame()  # columns = columnname)

    for index, row in company_input_list.iterrows():
        company_progress = int(company_progress) + 1
        if pd.notna(row['Company_Name_CN']):
            company_keyword = row['Company_Name_CN']
        else:
            company_keyword = row['Company_Name']
        company_sourceid = row['Source_ID']

        # Search filter
        search_base = 'https://www.qichacha.com/search?key={}#'
        # Keyword
        print('---------', company_keyword, '----------')
        search_key = urllib.parse.quote(company_keyword)
        # Organization Type: 0:Company 1:Organization 3:HK Company 5:TW Company
        search_type = '&searchType='
        # Searching Index: 2:Company_Name 4:Representative/Share holder  6:Management 8:Brand/Product 10:Connection(Address)
        search_index = '&index:2'
        # Province
        search_province = '&province:'
        # Fuzzy search for keyword
        time.sleep(random.randint(1, 2))
        if pd.notna(row['State_Abbr']):
            search_province = search_province + row['State_Abbr']
            search_url_keyword = search_base.format(search_key) + search_index + search_province + '&'
        else:
            search_url_keyword = search_base.format(search_key) + search_index + '&'
        # print(search_url_keyword)
        respond_keyword = requests.get(search_url_keyword, headers=search_headers)
        soup_keyword = BeautifulSoup(respond_keyword.text, 'lxml')
        company_info_list_flag = soup_keyword.find('span', attrs={'id': 'countOld'})

        # Company details
        if company_info_list_flag != None and company_info_list_flag.span.text.strip() != '0':
            try:
                search_companys = soup_keyword.find('table', attrs={'class': 'm_srchList'}).tbody.find_all('td')
                step = 0
                for company in search_companys:
                    if step % 3 == 1:
                        company_href = company.a['href']
                        search_url_company = 'https://www.qichacha.com' + company_href
                        time.sleep(random.randint(0, 1))
                        respond_company = requests.get(search_url_company, headers=search_headers)
                        soup_company = BeautifulSoup(respond_company.text, 'lxml')

                        company_isforeign = False
                        if (soup_company.find('div', attrs={'class': 'row title'}).h1 == None):  # HongKong Company
                            soup_company.find('div', attrs={'class': 'row title'}).span.extract()
                            company_name = soup_company.find('div', attrs={'class': 'row title'}).text
                            company_isforeign = True
                        else:
                            company_name = soup_company.find('div', attrs={'class': 'row title'}).h1.text
                        company_id = re.findall(r'/firm_(.*).html', str(company_href))[0]
                        # print(company_id, company_name)
                        company_phone = ''
                        company_website = ''
                        company_email = ''
                        company_address = ''
                        for i in soup_company.find_all('span', attrs={'class': "cdes"}):
                            if i.text == '电话:':
                                if (i.next_sibling.span != None):
                                    company_phone = i.next_sibling.span.text
                        # if (soup_company.find('span', attrs={'class': "cdes"}).next_sibling.span != None):
                        #     company_phone = soup_company.find('span', attrs={'class': "cdes"}).next_sibling.span.text
                        if (soup_company.find('a', attrs={'onclick': "zhugeTrack('企业主页-企业头部-官网')"}) != None):
                            company_website = soup_company.find('a', attrs={'onclick': "zhugeTrack('企业主页-企业头部-官网')"})[
                                'href']
                        if (soup_company.find('a', attrs={'title': '发送邮件'}) != None):
                            company_email = soup_company.find('a', attrs={'title': '发送邮件'}).text
                        if (soup_company.find('a', attrs={'title': "查看地址"}) != None):
                            company_address = soup_company.find('a', attrs={'title': "查看地址"}).text
                        search_id = str(company_id)  # str(company_sourceid) + '_' +
                        print(company_name)
                        # print('---------', company_name, '----------')
                        # print('---------', company_id, '----------')
                        if company_isforeign:
                            company_info_data = [search_id, company_sourceid, company_keyword, company_name, company_id,
                                                 company_phone, company_website, company_email, company_address, company_isforeign, '', '',
                                                 '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
                            company_info_data = dict(zip(columnname, company_info_data))
                            company_scrapy_result = company_scrapy_result.append(company_info_data, ignore_index=True)
                            step += 1
                            continue
                        company_info_list = soup_company.find_all('table', attrs={'class': 'ntable'})[1].find_all('tr')
                        company_info_data = []
                        company_info_data.append(search_id)
                        company_info_data.append(company_sourceid)
                        company_info_data.append(company_keyword)
                        company_info_data.append(company_name)
                        company_info_data.append(company_id)
                        company_info_data.append(company_phone)
                        company_info_data.append(company_website)
                        company_info_data.append(company_email)
                        company_info_data.append(company_address)
                        company_info_data.append(company_isforeign)
                        for business_info in company_info_list[:-2]:
                            company_info_data.append(business_info.find_all('td')[1].text.replace('\n', '').strip())
                            company_info_data.append(business_info.find_all('td')[3].text.replace('\n', '').strip())
                        # Business scope
                        company_info_data.append(company_info_list[-1].find_all('td')[1].text.replace('\n', '').strip())

                        # Lawsuit count
                        company_lawsuit = soup_company.find('a', attrs={'id': 'susong_title'}).span.text
                        company_info_data.append(company_lawsuit)

                        # Risk
                        company_risk_info = soup_company.find('div', attrs={'class': 'risk-panel b-a'})
                        if company_risk_info is not None:
                            company_risk_details = company_risk_info.find_all('span', attrs={'class': 'text-danger'})
                            company_risk_operation = soup_company.find('a', attrs={'id': 'fengxian_title'}).span.text
                            company_info_data.append(company_risk_details[0].text)
                            company_info_data.append(company_risk_details[1].text)
                            company_info_data.append(company_risk_operation)

                        # # Finance
                        # company_name_encode = urllib.parse.quote(company_name)
                        # search_url_finance = 'http://www.qichacha.com/company_getinfos?unique=' + company_id + '&companyname=' + company_name_encode + '&tab=run'
                        # company_info_data.append(search_url_finance)
                        # time.sleep(random.randint(2, 4))
                        # respond_finance = requests.get(search_url_finance,headers = search_headers)
                        # soup_finance = BeautifulSoup(respond_finance.text,'lxml')
                        # finance_info_list_flag = soup_finance.find('section',attrs = {'id':'V3_cwzl'})
                        # if finance_info_list_flag:
                        #     finance_info_list = finance_info_list_flag.find_all('td')
                        #     company_info_data.append(finance_info_list[1].text)
                        #     company_info_data.append(finance_info_list[3].text)
                        #     company_info_data.append(finance_info_list[5].text)
                        #     company_info_data.append(finance_info_list[7].text)
                        # else:
                        #     company_info_data.append('')
                        #     company_info_data.append('')
                        #     company_info_data.append('')
                        #     company_info_data.append('')

                        # # Annual report
                        # search_url_report = 'http://www.qichacha.com/company_getinfos?unique=' + company_id + '&companyname=' + company_name_encode + '&tab=report'
                        # company_info_data.append(search_url_report)
                        # time.sleep(random.randint(2, 4))
                        # respond_report = requests.get(search_url_report,headers = search_headers)
                        # soup_report = BeautifulSoup(respond_report.text,'lxml')
                        # report_info_list = soup_report.find('div',attrs = {'class':'tab-pane fade in active'})
                        # print(report_info_list)
                        # report_info_list = report_info_list.find_all('td')
                        # report_info_list_flag = 'N'
                        #
                        # for report in report_info_list:
                        #     if report.text == '城镇职工基本养老保险':
                        #         report_info_list_flag = 'Y'
                        #
                        # if report_info_list_flag == 'Y':
                        #     for report in report_info_list:
                        #         if report.text == '城镇职工基本养老保险':
                        #             company_info_data.append(report_info_list[report_info_list.index(report)+1].text)
                        #             print(report_info_list[report_info_list.index(report)+1].text)
                        #         if report.text == '职工基本医疗保险':
                        #             company_info_data.append(report_info_list[report_info_list.index(report)+1].text)
                        #         if report.text == '生育保险':
                        #             company_info_data.append(report_info_list[report_info_list.index(report)+1].text)
                        #         if report.text == '失业保险':
                        #             company_info_data.append(report_info_list[report_info_list.index(report)+1].text)
                        #         if report.text == '工伤保险':
                        #             company_info_data.append(report_info_list[report_info_list.index(report)+1].text)
                        # else:
                        #     company_info_data.append('')
                        #     company_info_data.append('')
                        #     company_info_data.append('')
                        #     company_info_data.append('')
                        #     company_info_data.append('')

                        company_info_data = dict(zip(columnname, company_info_data))
                        company_scrapy_result = company_scrapy_result.append(company_info_data, ignore_index=True)
                    step += 1

            except Exception:  # Need verification, set ID as 'breakpoint'
                company_info_data = ['breakpoint', company_sourceid, company_keyword, '', '', '', '', '', '', '', '',
                                     '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
                company_info_data = dict(zip(columnname, company_info_data))
                company_scrapy_result = company_scrapy_result.append(company_info_data, ignore_index=True)
                print('Need Verification case 1!')
                print('Progress: {} %'.format(company_progress / company_count * 100))
                break
        # Need verification, set ID as 'breakpoint'
        elif company_info_list_flag is None:
            company_info_data = ['breakpoint', company_sourceid, company_keyword, '', '', '', '', '', '', '', '',
                                 '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
            company_info_data = dict(zip(columnname, company_info_data))
            company_scrapy_result = company_scrapy_result.append(company_info_data, ignore_index=True)

            print('Need Verification case 2!')
            print('Progress: {} %'.format(company_progress / company_count * 100))
            break
        # No result return
        elif company_info_list_flag.span.text.strip() == '0':
            search_id = str(company_sourceid)
            # Column count 32
            company_info_data = [search_id, company_sourceid, company_keyword, '', '', '', '', '', '', '', '', '', '',
                                 '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
            company_info_data = dict(zip(columnname, company_info_data))
            company_scrapy_result = company_scrapy_result.append(company_info_data, ignore_index=True)

    return company_scrapy_result
import sys
sys.path.append('./libraries')
import pyfpgrowth
import pandas as pd
import numpy as np

data = pd.read_csv('./datasets/sepet.csv', header=None)

transactions = []
for d in data.values:
    tmp = []
    #print(pd.notna(d))
    for i in d:
        if pd.notna(i):
            tmp.append(i)
    transactions.append(tmp)

patterns = pyfpgrowth.find_frequent_patterns(transactions, 10)
rules = pyfpgrowth.generate_association_rules(patterns, 0.8)
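# A hedged follow-up sketch (not in the original script): pyfpgrowth returns plain
# dicts, so the mined patterns and rules can be inspected directly.
# find_frequent_patterns maps each itemset tuple to its support count;
# generate_association_rules maps an antecedent tuple to (consequent, confidence).
for itemset, support in sorted(patterns.items(), key=lambda kv: -kv[1])[:10]:
    print(support, itemset)
for antecedent, (consequent, confidence) in rules.items():
    print(antecedent, '=>', consequent, 'confidence:', round(confidence, 2))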
Exemple #45
0
    def test_count(self):
        f = lambda s: notna(s).sum()
        self._check_stat_op('count', f, obj=self.panel4d, has_skipna=False)
Exemple #46
0
    def _try_convert_data(self,
                          name,
                          data,
                          use_dtypes=True,
                          convert_dates=True):
        """
        Try to parse an ndarray-like into a column by inferring dtype.
        """
        # don't try to coerce, unless a force conversion
        if use_dtypes:
            if not self.dtype:
                if all(notna(data)):
                    return data, False
                return data.fillna(np.nan), True

            elif self.dtype is True:
                pass
            else:
                # dtype to force
                dtype = (self.dtype.get(name)
                         if isinstance(self.dtype, dict) else self.dtype)
                if dtype is not None:
                    try:
                        dtype = np.dtype(dtype)
                        return data.astype(dtype), True
                    except (TypeError, ValueError):
                        return data, False

        if convert_dates:
            new_data, result = self._try_convert_to_date(data)
            if result:
                return new_data, True

        result = False

        if data.dtype == "object":

            # try float
            try:
                data = data.astype("float64")
                result = True
            except (TypeError, ValueError):
                pass

        if data.dtype.kind == "f":

            if data.dtype != "float64":

                # coerce floats to 64
                try:
                    data = data.astype("float64")
                    result = True
                except (TypeError, ValueError):
                    pass

        # don't coerce 0-len data
        if len(data) and (data.dtype == "float" or data.dtype == "object"):

            # coerce ints if we can
            try:
                new_data = data.astype("int64")
                if (new_data == data).all():
                    data = new_data
                    result = True
            except (TypeError, ValueError, OverflowError):
                pass

        # coerce ints to 64
        if data.dtype == "int":

            # widen smaller int dtypes to int64
            try:
                data = data.astype("int64")
                result = True
            except (TypeError, ValueError):
                pass

        return data, result
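# Standalone illustration of the coercion cascade above (object -> float64 -> int64),
# using a plain Series rather than the JSON parser internals; a minimal sketch only.
import numpy as np
import pandas as pd

s = pd.Series(["1", "2", "3"], dtype="object")
as_float = s.astype("float64")        # object column of numeric strings parses as float64
as_int = as_float.astype("int64")     # values survive the int64 round trip unchanged...
print((as_int == as_float).all())     # ...so the int64 version would be kept (prints True)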
        )[0] in genome_ids:  # extracts the genome id and confirms it is a training sample
            spgene_files.append(filename)
            df = pd.read_csv(os.path.join(args.feature_folder, filename),
                             dtype=str,
                             sep='\t')
            df = df.loc[df['property'].isin(args.properties)]
            if feature_df is None:
                feature_df = df
            else:
                feature_df = pd.concat([feature_df, df], ignore_index=True)

    # getting the set of classifications that will serve as a feature vector
    classifications = list(
        set([
            feat for feat in list(feature_df['classification'])
            if pd.notna(feat)
        ]))
    num_classifications = len(classifications)
    print('Number of gene family classifications: ' + str(num_classifications))

    # building our list of genomes and associated resistance label vector, feature vector
    col_names = ['ID', 'Antibiotics', 'Phenotype', 'Annotations', 'Features']
    samples = list()
    for gen_id in genome_ids:
        sample = [
            gen_id, args.antibiotics,
            [0 for _ in range(len(args.antibiotics))],
            [False for _ in range(len(args.antibiotics))],
            [0.0 for _ in range(num_classifications)]
        ]
        # filling the label vector, noting if actually annotated in the data or default-filled
                                     'Deaths': ['sum'],
                                     'Recovered': ['sum'],
                                 })
        grouped.columns = ['Confirmed', 'Deaths', 'Recovered']
        grouped = grouped.reset_index()

        df_raw = grouped

    # extract information from CSV
    data = {
        'reportdate':
        date(year, month, day),
        'region':
        df_raw[key_region].str.strip(),
        'subregion':
        df_raw.where(pd.notna(df_raw[key_subregion]),
                     df_raw[key_region],
                     axis=0)[key_subregion].str.strip(),
        'lat':
        df_raw['Lat'] if 'Lat' in cols else None,
        'lng':
        df_raw['Long_'] if 'Long_' in cols else None,
        'confirmed':
        df_raw['Confirmed'].fillna(0).astype('int32'),
        'deaths':
        df_raw['Deaths'].fillna(0).astype('int32'),
        'recovered':
        df_raw['Recovered'].fillna(0).astype('int32')
    }

    # append the content of this file
Exemple #49
0
def dataSetCheck():
    def isfloat(value):
        try:
            float(value)
            return True
        except (TypeError, ValueError):
            return False

    badCords = []
    emptyRows = []
    inconsistent = []

    myDBChk = pd.read_excel(myFile,
                            converters={
                                'PS_NETWORK_KEY-Spatial': str,
                                'POWER_SUPPLY_NAME': str,
                                'Continuity PS Name': str,
                                'Mac Address': str,
                                'Good Latitude': str,
                                'Good Longitude': str,
                                'Status': str,
                                'Comment': str
                            })
    myDBChk = myDBChk.replace(r'^\s*$', np.nan, regex=True)

    badCords = np.where(((pd.isna(myDBChk['Good Latitude']))
                         ^ (pd.isna(myDBChk['Good Longitude'])))
                        | ~(myDBChk['Good Latitude'].apply(isfloat))
                        | ~(myDBChk['Good Longitude'].apply(isfloat)))[0]

    filledRows = np.where((pd.notna(myDBChk['Status']))
                          | (pd.notna(myDBChk['Good Latitude'])
                             & pd.isna(myDBChk['Good Longitude'])))[0]
    if len(filledRows) > 0:
        mask = np.full(len(myDBChk['Status']), False)
        for i in range(0, filledRows[len(filledRows) - 1]):
            mask[i] = True
        emptyRows = np.where(mask & (pd.isna(myDBChk['Status']))
                             & (pd.isna(myDBChk['Good Latitude'])
                                | pd.isna(myDBChk['Good Longitude'])))[0]
        if len(emptyRows) > 0:
            print('Nothing filled in at rows:', emptyRows + 2)

    if checkInputFile:
        inputCSV = pd.read_csv('input.csv',
                               converters={
                                   'Id_Info': str,
                                   'Comment': str,
                                   'New Name': str,
                                   'New Mac': str
                               })
        inputCSV = inputCSV.replace(r'^\s*$', np.nan, regex=True)

        filledRows = np.where((pd.notna(inputCSV['Case']))
                              | (pd.notna(inputCSV['New Lat'])
                                 & pd.isna(inputCSV['New Long'])))[0]
        if len(filledRows) > 0:
            mask = np.full(len(inputCSV['Case']), False)
            for i in range(0, filledRows[len(filledRows) - 1]):
                mask[i] = True
            emptyRows = np.where(mask & (pd.isna(inputCSV['Case']))
                                 & (pd.isna(inputCSV['New Lat'])
                                    | pd.isna(inputCSV['New Long'])))[0]
            if len(emptyRows) > 0:
                print('Nothing filled in input file at rows:', emptyRows + 2)

    if len(badCords) == 0:
        if checkInputFile:
            myDB = pd.read_excel(myFile,
                                 converters={
                                     'PS_NETWORK_KEY-Spatial': str,
                                     'POWER_SUPPLY_NAME': str,
                                     'Continuity PS Name': str,
                                     'Mac Address': str,
                                     'Good Latitude': float,
                                     'Good Longitude': float,
                                     'Status': str,
                                     'Comment': str
                                 })
            myDB = myDB.replace(r'^\s*$', np.nan, regex=True)
            inputCSV = pd.read_csv('input.csv',
                                   converters={
                                       'Id_Info': str,
                                       'Comment': str,
                                       'New Name': str,
                                       'New Mac': str
                                   })
            inputCSV = inputCSV.replace(r'^\s*$', np.nan, regex=True)
            inconsistent = np.where(
                (abs(myDB['Good Latitude'] - inputCSV['New Lat']) > 0.0001)
                | ((pd.isna(myDB['Good Latitude']))
                   ^ (pd.isna(inputCSV['New Lat'])))
                | (abs(myDB['Good Longitude'] - inputCSV['New Long']) > 0.0001)
                | ((pd.isna(myDB['Good Longitude']))
                   ^ (pd.isna(inputCSV['New Long'])))
                | ((myDB['Continuity PS Name'] != inputCSV['New Name'])
                   & (myDB['Continuity PS Name'].notna()
                      | inputCSV['New Name'].notna()))
                | ((myDB['Mac Address'] != inputCSV['New Mac'])
                   & (myDB['Mac Address'].notna()
                      | inputCSV['New Mac'].notna())))[0]
            if len(inconsistent) > 0:
                print('Inconsistent with input file at rows:', inconsistent + 2)
    else:
        print('Bad coordinate at rows:', badCords + 2)

    if len(badCords) == 0 and len(emptyRows) == 0 and len(inconsistent) == 0:
        print('Datasheet filled correctly')
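# A hedged usage sketch: dataSetCheck() reads the module-level names myFile and
# checkInputFile, so both must exist before the call. The values below are
# placeholders, not taken from the original script.
myFile = 'power_supply_sheet.xlsx'   # hypothetical workbook path
checkInputFile = True                # also cross-check against input.csv
dataSetCheck()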
Exemple #50
0
plt.xlabel("hour")
plt.legend(loc='upper center',
           bbox_to_anchor=(0.5, 1.15),
           ncol=5,
           fancybox=True,
           shadow=True)
plt.xticks(np.arange(0, 24, step=1))
plt.show()

#############

##############

import geopandas as gpd

indx = pd.notna(df['District'])
df_map = df.loc[indx].groupby(['District']).count()['ID']

data1 = []
for i in district:
    data1 = data1 + [[str(np.int64(i)), df_map[i]]]

map_df = pd.DataFrame(np.array(data1), columns=['Distict', 'CrimeRatio'])
map_df["CrimeRatio"] = pd.to_numeric(map_df["CrimeRatio"])
map_df['Distict'] = map_df['Distict'].astype(str)

import folium
import json

with open('Boundaries - Police Districts (current).geojson', 'r') as jasonfile:
Exemple #51
0
def get_terms(version: str) -> Iterable[Term]:
    """Get ComplexPortal terms."""
    df = get_df(version=version)
    df.rename(inplace=True,
              columns={
                  'Aliases for complex': 'aliases',
                  'Identifiers (and stoichiometry) of molecules in complex':
                  'members',
                  'Taxonomy identifier': 'taxonomy_id',
                  'Cross references': 'xrefs',
                  'Description': 'definition',
                  'Recommended name': 'name',
                  '#Complex ac': 'complexportal_id',
              })

    df['aliases'] = df['aliases'].map(lambda s: s.split('|')
                                      if pd.notna(s) else [])
    df['members'] = df['members'].map(_parse_members)
    df['xrefs'] = df['xrefs'].map(_parse_xrefs)

    taxonomy_id_to_name = get_id_name_mapping('ncbitaxon')
    df['taxonomy_name'] = df['taxonomy_id'].map(taxonomy_id_to_name.get)

    slim_df = df[[
        'complexportal_id',
        'name',
        'definition',
        'aliases',
        'xrefs',
        'taxonomy_id',
        'taxonomy_name',
        'members',
    ]]
    it = tqdm(slim_df.values,
              total=len(slim_df.index),
              desc=f'mapping {PREFIX}')
    unhandled_xref_type = set()
    for complexportal_id, name, definition, aliases, xrefs, taxonomy_id, taxonomy_name, members in it:
        synonyms = [Synonym(name=alias) for alias in aliases]
        _xrefs = []
        provenance = []
        for reference, note in xrefs:
            if note == 'identity':
                _xrefs.append(reference)
            elif note == 'see-also' and reference.prefix == 'pubmed':
                provenance.append(reference)
            elif (note, reference.prefix) not in unhandled_xref_type:
                logger.debug(
                    f'unhandled xref type: {note} / {reference.prefix}')
                unhandled_xref_type.add((note, reference.prefix))

        term = Term(
            reference=Reference(prefix=PREFIX,
                                identifier=complexportal_id,
                                name=name),
            definition=definition.strip() if pd.notna(definition) else None,
            synonyms=synonyms,
            xrefs=_xrefs,
            provenance=provenance,
        )
        term.set_species(identifier=taxonomy_id, name=taxonomy_name)

        for reference, _count in members:
            term.append_relationship(has_part, reference)

        yield term
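# A hedged usage sketch: iterate the generated terms and print their identifiers.
# The version string is an assumption here; pass whichever ComplexPortal release
# string get_df() expects in this module.
for term in get_terms(version='current'):
    print(term.reference.identifier, term.reference.name)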
Exemple #52
0
def validate_email(contact, company):
    eformat = False
    esuffix = False
    epersonal = False
    edomain = False
    edup = False
    suffix = [r'\.com$', r'\.cn$', r'\.org$', r'\.net$', r'\.cc$', r'\.uk$', r'\.fr$', r'\.hk$', r'\.tw$', r'\.au$', r'\.jp$', r'\.sg$']
    personal = ['@gmail.com', '@hotmail.com', '@yahoo.com', '@sina.com', '@vip.sina.com', '@163.com', '@126.com', '@qq.com', '@vip.qq.com', '@139.com']

    if pd.notna(contact['Email']):
        # Lower and no space
        email = contact['Email'].lower().replace(' ', '')
    else:
        echeck = eformat and esuffix and (epersonal or edomain)
        contact['ve_Email_Format'] = eformat
        contact['ve_Email_Suffix'] = esuffix
        contact['ve_Email_Domain'] = epersonal or edomain
        contact['ve_Email_Check'] = echeck
        contact['Reject_Reason'] = contact['Reject_Reason'] + 'No Email;  '
        return contact
    # TODO: Email format check

    # Email must contain @
    if '@' in email:
        eformat = True
    else:
        contact['Reject_Reason'] = contact['Reject_Reason'] + 'Email without @;  '

    # Email suffix check
    for s in suffix:
        if re.search(re.compile(s, re.I), email) is not None:
            esuffix = True
            break
    if not esuffix:
        contact['Reject_Reason'] = contact['Reject_Reason'] + 'Email invalid suffix;  '

    # Email personal check
    for p in personal:
        if p in email:
            epersonal = True
            break

    # Email domain check
    domain = None
    if not company.empty:
        if pd.notna(company['Website']).bool():
            company_website = company['Website'].values[0]
            domain = company_website.split('.')[1]
        elif pd.notna(company['Email']).bool():
            company_email = company['Email'].values[0]
            domain = company_email.split('@')[1].split('.')[0]
            for p in personal:
                if p in company_email:
                    domain = None
                    break
        if domain is not None:
            if domain in email:
                edomain = True
            else:
                contact['Reject_Reason'] = contact['Reject_Reason'] + 'Email domain not match;  '
        else:
            edomain = True

    else:
        contact['Reject_Reason'] = contact['Reject_Reason'] + 'Company under review;  '

    # Email check
    echeck = eformat and esuffix and (epersonal or edomain)
    contact['ve_Email_Format'] = eformat
    contact['ve_Email_Suffix'] = esuffix
    contact['ve_Email_Domain'] = epersonal or edomain
    contact['ve_Email_Check'] = echeck
    return contact
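# A hedged sketch of the shapes this function appears to expect (inferred from the
# .values[0] and .bool() calls above): `contact` is a Series carrying 'Email' and
# 'Reject_Reason', and `company` is a one-row DataFrame. Values are illustrative only.
import pandas as pd  # already imported in the original module

contact = pd.Series({'Email': 'Alice@Example.com', 'Reject_Reason': ''})
company = pd.DataFrame([{'Website': 'www.example.com', 'Email': 'info@example.com'}])
checked = validate_email(contact, company)
print(checked['ve_Email_Check'], checked['Reject_Reason'])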
def label_duplicate_links(in_file, node1_idx_reverse = [1,0],\
                            src_equals_dest_idx = 8, experiment_str='all_links_duplicate_clustered'):
        '''
        Finds every symmetric (also called 'duplicate' here) link B-A for a link A-B
        and assigns the pair a shared label; fully identical links are deleted.

        Input:
        -----
        in_file: CSV file.
        node1_idx_reverse: column indices that give B-A for A-B

        Output:
        ------
        out_file_path: path of the output CSV in which duplicate links share the
        same label and identical links have been removed

        '''
        df = pd.read_csv(in_file)
        df_mat = df.values
        new_df = []
        i = 0
        label = 0
        node1_idx = node1_idx_reverse + list(range(len(node1_idx_reverse),\
                                                            len(df.columns)))
        # Instead of all the columns, focus just on the relevant indices
        # node1_idx = node1_idx_reverse

        for idx1 in range(df_mat.shape[0]):
            found_duplicate = False
            found_identical = False

            if pd.notna(df_mat[idx1,node1_idx_reverse[0]]) or\
                pd.notna(df_mat[idx1,node1_idx_reverse[int(len(node1_idx_reverse)/2)]]):

                for idx2 in range(idx1 + 1, df_mat.shape[0]):

                    # Delete Identical links
                    if (df_mat[idx1,:] == df_mat[idx2,:]).all():
                        df_mat[idx2,:] = [np.nan]*len(df_mat[idx2,:])
                        found_identical = True

                    # Keep one copy of identically symmetric links and remove all others

                    # If All the columns need to be considered
                    # if (df_mat[idx1,node1_idx] == df_mat[idx2,:]).all():

                    # If only the ABIDE columns need to be considered
                    if (df_mat[idx1,node1_idx_reverse] == df_mat[idx2,sorted(node1_idx_reverse)]).all():
                        i = i+1
                        if not found_duplicate:
                            label = label + 1
                            new_df.append(np.append(df_mat[idx1,:], label))
                            print('-----------------------------------------------')
                            print(i,':',np.append(df_mat[idx1,:], label))

                            if experiment_str != 'no_duplicates_others_clustered':
                                i = i+1
                                new_df.append(np.append(df_mat[idx2,:], label))
                                print(i,':',np.append(df_mat[idx2,:], label))

                            print('-----------------------------------------------')
                            df_mat[idx2,:] = [np.nan]*len(df_mat[idx2,:])
                            found_duplicate = True
                        else:
                            # So that the symmetric link is not appended repeatedly
                            if experiment_str != 'no_duplicates_others_clustered':
                                i = i+1
                                new_df.append(np.append(df_mat[idx2,:], label))
                                print(i,':',np.append(df_mat[idx2,:], label))

                            df_mat[idx2,:] = [np.nan]*len(df_mat[idx2,:])

                # If no symmetric links are found
                if (not found_duplicate) or found_identical:
                    _label = None
                    # Assign a label and append to df
                    if experiment_str == 'all_links_duplicate_clustered':
                        label = label + 1
                        _label = label
                    elif experiment_str == 'all_links_duplicate_clustered_others_clustered' or\
                         experiment_str == 'no_duplicates_others_clustered':
                        _label = 'Single'

                    new_df.append(np.append(df_mat[idx1,:], _label))
                    i = i + 1
                    print(i,': Single',np.append(df_mat[idx1,:], _label))


        in_file_name = os.path.splitext(in_file)[0]
        out_file_path = in_file_name + '_' + experiment_str + '.csv'
        new_df = np.array(new_df)
        new_df = pd.DataFrame(data=new_df, columns=np.append(df.columns, 'Link_Label'))
        new_df.to_csv(out_file_path,index=False)

        return out_file_path, new_df
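# A hedged usage sketch (the CSV name is a placeholder): with the defaults above,
# columns 0 and 1 of the input are taken as the two link endpoints, and the output
# is written next to the input as '<name>_all_links_duplicate_clustered.csv'.
out_path, labelled_df = label_duplicate_links('all_links.csv')
print(out_path, labelled_df.shape)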
Exemple #54
0
    key_journal_id, 'Rank', 'NormalizedName', 'DisplayName', key_issn,
    'Publisher', 'Webpage', 'PaperCount', 'CitationCount', 'CreatedDate'
]

client = MongoClient('localhost', 27017)

db = client['mag']
collection = db['jour']

cnt = 0

chunksize = 10**6
for chunk in pd.read_csv(filename, names=header, sep='\t',
                         chunksize=chunksize):

    data = []

    for key, val in chunk.iterrows():

        journal_id = val[key_journal_id]
        issn = val[key_issn]

        if pd.notna(issn):

            current = {key_journal_id: journal_id, key_issn: issn}

            data.append(current)

    if data:  # insert_many raises InvalidOperation on an empty list
        collection.insert_many(data)
    print(cnt)
    cnt += 1
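# A hedged follow-up: once every chunk is inserted, journal ids can be looked up by
# ISSN straight from the collection (the ISSN value below is illustrative only).
doc = collection.find_one({key_issn: '0028-0836'})
print(doc)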
        {
            True: 0,
            False: 1
        })
    #print(stock_plus_tweet)

    stock_plus_tweet['Output'] = stock_plus_tweet['Output'].fillna(
        method='backfill')
    stock_plus_tweet['EMA5'] = stock_plus_tweet['EMA5'].fillna(
        method='backfill')
    stock_plus_tweet['EMA10'] = stock_plus_tweet['EMA10'].fillna(
        method='backfill')
    stock_plus_tweet['EMA20'] = stock_plus_tweet['EMA20'].fillna(
        method='backfill')

    stock_plus_tweet = stock_plus_tweet[pd.notna(stock_plus_tweet['Output'])]
    stock_plus_tweet = stock_plus_tweet[pd.notna(stock_plus_tweet['text'])]

    number_of_tweets = stock_plus_tweet.groupby('date').count()

    number_of_tweets['numTweets'] = number_of_tweets['text']
    number_of_tweets = number_of_tweets['numTweets']

    stock_plus_tweet = pd.merge(stock_plus_tweet,
                                number_of_tweets,
                                how='left',
                                on='date')

    stock_plus_tweet = stock_plus_tweet[[
        'date', 'time', 'retweet_count', 'neg', 'neu', 'pos', 'cmpd',
        'IsTradingDay', 'is_retweet', 'numTweets', 'EMA5', 'EMA10', 'EMA20',
def process_args(api, args):
    aliases = {
        "CINT": "ClientInternal",
        "FIGI": "Figi",
        "RIC": "P:Instrument/default/RIC",
        "TICKER": "P:Instrument/default/Ticker",
        "ISIN": "P:Instrument/default/Isin",
    }

    if args.input:
        df = pd.concat(
            [
                lpt.read_input(input_file, dtype=str)
                for input_file in args.input
            ],
            ignore_index=True,
            sort=False,
        )

        if args.mappings:
            df.rename(
                columns=dict([(s[1], aliases.get(s[0], s[0]))
                              for s in [m.split("=") for m in args.mappings]]),
                inplace=True,
            )

        prop_keys = [col for col in df.columns.values if col.startswith("P:")]

        identifiers = [
            col for col in df.columns.values if col in args.identifiers
        ]

        # Identifiers have to be unique
        df = df.drop_duplicates(identifiers)

        def make_identifiers(row):
            return {
                identifier: api.models.InstrumentIdValue(row[identifier])
                for identifier in identifiers if pd.notna(row[identifier])
            }

        def make_properties(row):
            return [
                api.models.ModelProperty(key[2:],
                                         api.models.PropertyValue(row[key]))
                for key in prop_keys if pd.notna(row[key])
            ]

        def success(r):
            df = lpt.to_df([err[1] for err in r.content.failed.items()],
                           ["id", "detail"])
            df.columns = ["FAILED-INSTRUMENT", "ERROR"]
            return lpt.trim_df(df, args.limit, sort="FAILED-INSTRUMENT")

        has_lookthrough = LT_SCOPE in df.columns.values

        requests = [
            api.models.InstrumentDefinition(
                row["name"],
                make_identifiers(row),
                make_properties(row),
                api.models.ResourceId(row[LT_SCOPE], row[LT_CODE]) if
                (has_lookthrough and pd.notna(row[LT_SCOPE])) else None,
            ) for idx, row in df.iterrows()
        ]

        # Convert valid requests to dictionary
        def make_key(r):
            sec_id = list(r.identifiers.items())[0]
            return "{}:{}".format(sec_id[0], sec_id[1].value)

        requests = {
            make_key(r): r
            for r in requests if len(r.identifiers.keys()) > 0
        }

        if args.test:
            lpt.display_df(df[identifiers + prop_keys + ["name"]])
            print(requests)
            exit()

        return api.call.upsert_instruments(instruments=requests).bind(success)
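# A hedged illustration of the mappings format consumed above: each entry is
# "<ALIAS>=<csv column>", and the alias is expanded through the aliases dict, so
# "ISIN=isin_code" renames the CSV column isin_code to P:Instrument/default/Isin.
# The names below are examples, not taken from the original tool.
example_aliases = {"ISIN": "P:Instrument/default/Isin", "TICKER": "P:Instrument/default/Ticker"}
example_mappings = ["ISIN=isin_code", "TICKER=symbol"]
renames = {col: example_aliases.get(key, key)
           for key, col in (m.split("=") for m in example_mappings)}
print(renames)  # {'isin_code': 'P:Instrument/default/Isin', 'symbol': 'P:Instrument/default/Ticker'}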
# import semantic type frequencies
with open('semanticTypes.pickle', 'rb') as handle:
    sty = pickle.load(handle)

# Extract defined relations among semantic types
srstr_sty = srstr.loc[:542]
srstr_sty.columns = ["STY1", "RL", "STY2", "LS"]
srstr_sty_d = srstr_sty[srstr_sty.LS == "D"]
srstr_sty_b = srstr_sty[srstr_sty.LS == "B"]
srstr_sty_dni = srstr_sty[srstr_sty.LS == "DNI"]

# Create multi-directed-graphs with full SRSTR relations and only isa relations
srstr_trees = nx.MultiDiGraph(name='SRSTR defined tree (only isa relation)')
for index, row in srstr_sty_d.iterrows():
    if pd.notna(row['STY2']):  # Disconnect topmost nodes from ''
        if row['RL'] == 'isa':
            srstr_trees.add_edge(row['STY1'], row['STY2'], relation=row['RL'])

color_dict = dict((el, None) for el in sty)
current_level = ['Event']
current_color = [255, 0, 0, 1.0]
while len(current_level) != 0:
    next_level = []
    for node in current_level:  # 'node' avoids shadowing the sty frequency dict loaded above
        color_dict[node] = 'rgba' + str(tuple(current_color))
        next_level += list(srstr_trees.predecessors(node))
    current_color[1] += 45
    current_level = next_level

current_level = ['Entity']
Exemple #58
0
def top_predict():
    data = pd.read_hdf(data_path)

    score_df = pd.DataFrame()
    score_df["var"] = [i for i in var_col]
    final_result = pd.DataFrame()
    start = datetime.datetime.now()

    for wtid in tqdm(range(1, 34)):
        use_data = data[data["wtid"] == wtid].copy()  # copy so the .loc fills below do not hit SettingWithCopyWarning
        test_scores = []

        for var in var_col:
            train_data = use_data[pd.notna(use_data[var])]
            predict_data = use_data[pd.isna(use_data[var])]

            index = 0
            for index, t in enumerate(tool.types):
                if var in t:
                    break
            test_label_col = str(index) + "_test"

            train_feature = train_data[train_data[test_label_col] == 0]
            top_values = train_feature[var].value_counts().index
            test_feature = train_data[train_data[test_label_col] == 1]
            test_y = np.array(test_feature[var])

            # Use the most frequent value
            test_pred = np.array([top_values[0]] * len(test_y))
            predict_y = np.array([top_values[0]] * len(predict_data))
            if var in category_col:
                test_score = tool.label_score(test_y, test_pred)
            else:
                test_score = tool.regression_score(test_y, test_pred)

            # Try the second most frequent value
            if test_score > 0.1 and len(top_values) > 1:
                test_pred2 = [top_values[1]] * len(test_y)
                if var in category_col:
                    test_score2 = tool.label_score(test_y, test_pred2)
                else:
                    test_score2 = tool.regression_score(test_y, test_pred2)
                if test_score2 > test_score:
                    test_score = test_score2
                    predict_y = np.array([top_values[1]] * len(predict_data))

            test_scores.append(test_score)
            use_data.loc[predict_data.index, var] = predict_y

        score_df[str(wtid)] = test_scores
        final_result = pd.concat(
            (final_result, use_data[use_data["count_miss"] > 0]),
            axis=0,
            ignore_index=True)

    final_result = final_result[head_col]
    final_result.sort_values(["wtid", "ts"], inplace=True)
    final_result.to_csv("./result/top_result.csv",
                        encoding="utf8",
                        index=False,
                        float_format='%.2f')

    score_df.set_index("var", inplace=True)
    score_df = score_df.T
    score_df.reset_index(inplace=True)
    score_df.rename(columns={"index": "wtid"}, inplace=True)
    score_df.to_csv("./result/top_score.csv",
                    encoding="utf8",
                    index=False,
                    float_format='%.4f')
    end = datetime.datetime.now()
    print("finish top_predict time: ", end - start, "\n")
Exemple #59
0
    def __init__(self, row):
        self.top_30ds_quantity = float(row["top_30ds_quantity"])
        self.top_90ds_quantity = float(row["top_90ds_quantity"])
        self.edlp_unit_price = float(row["edlp_unit_price"])
        self.bsd_unit_price = float(row["bsd_unit_price"])
        self.hd_unit_price = float(row["hd_unit_price"])
        self.edlp_fixed_price = float(row["edlp_fixed_price"])
        self.bsd_fixed_price = float(row["bsd_fixed_price"])
        self.hd_fixed_price = float(row["hd_fixed_price"])
        self.bh01_mac_price = float(row["bh01_mac_price"])
        self.bh02_mac_price = float(row["bh02_mac_price"])
        self.bh03_mac_price = float(row["bh03_mac_price"])
        self.wmt_mac_price = float(row["wmt_mac_price"])
        self.hd_syr_mac_price = float(row["hd_syr_mac_price"])
        self.bh01_dispensing_fee = float(row["bh01_dispensing_fee"])
        self.bh02_dispensing_fee = float(row["bh02_dispensing_fee"])
        self.bh03_dispensing_fee = float(row["bh03_dispensing_fee"])
        self.wmt_dispensing_fee = float(row["wmt_dispensing_fee"])
        self.hd_syr_dispensing_fee = float(row["hd_syr_dispensing_fee"])
        self.wmt_2018_11_28_qty1 = float(row["wmt_2018_11_28_qty1"])
        self.wmt_2018_11_28_qty2 = float(row["wmt_2018_11_28_qty2"])
        self.wmt_2018_11_28_price1 = float(row["wmt_2018_11_28_price1"])
        self.wmt_2018_11_28_price2 = float(row["wmt_2018_11_28_price2"])
        self.wmt_2018_11_28_flg = float(row["wmt_2018_11_28_flg"])
        self.last_30ds_qty = float(row["last_30ds_qty"])
        self.last_90ds_qty = float(row["last_90ds_qty"])
        # min_grx_30ds = float(row["min_grx_30ds"])
        # min_grx_90ds = float(row["min_grx_90ds"])
        self.min_major_retail_grx_30ds = float(
            row["min_major_retail_grx_30ds"])
        self.min_major_retail_grx_90ds = float(
            row["min_major_retail_grx_90ds"])
        # min_retail_grx_30ds = float(row["min_retail_grx_30ds"])
        # min_retail_grx_90ds = float(row["min_retail_grx_90ds"])
        # ltd_30_day_scripts = float(row["ltd_30_day_scripts"])
        # ltd_90_day_scripts = float(row["ltd_90_day_scripts"])
        # ltd_30_day_scripts_pct = float(row["ltd_30_day_scripts_pct"])
        # ltd_90_day_scripts_pct = float(row["ltd_90_day_scripts_pct"])
        # r30_30_day_scripts = float(row["r30_30_day_scripts"])
        # r30_90_day_scripts = float(row["r30_90_day_scripts"])
        # r30_30_day_script_pct = float(row["r30_30_day_script_pct"])
        # r30_90_day_script_pct = float(row["r30_90_day_script_pct"])
        self.fills = float(row["fills"])
        self.margin = float(row["margin"])
        self.orders = float(row["orders"])
        self.revenue = float(row["revenue"])
        self.users = float(row["users"])
        self.default_quantity = float(row["default_quantity"])

        self.pharmacy_network_id = float(row["pharmacy_network_id"])

        if self.pharmacy_network_id == 1:
            self.mac_unit_price = 0.45 * self.bh01_mac_price + 0.3 * self.wmt_mac_price + 0.175 * self.bh03_mac_price + 0.075 * self.bh02_mac_price
            self.mac_fixed_price = 0.45 * self.bh01_dispensing_fee + 0.3 * self.wmt_dispensing_fee + 0.175 * self.bh03_dispensing_fee + 0.075 * self.bh02_dispensing_fee
            self.sale_fixed_price = self.edlp_fixed_price
            self.sale_unit_price = self.edlp_unit_price
        if self.pharmacy_network_id == 2:
            self.mac_unit_price = self.bh01_mac_price
            self.mac_fixed_price = self.bh01_dispensing_fee
            self.sale_fixed_price = self.bsd_fixed_price
            self.sale_unit_price = self.bsd_unit_price
        if self.pharmacy_network_id == 3:
            self.mac_unit_price = self.hd_syr_mac_price
            self.mac_fixed_price = self.hd_syr_dispensing_fee
            self.sale_fixed_price = self.hd_fixed_price
            self.sale_unit_price = self.hd_unit_price

        # For Default Quantity
        self.sale_price_30ds_qty = self.sale_unit_price * self.last_30ds_qty + self.sale_fixed_price
        self.mac_price_30ds_qty = self.mac_unit_price * self.last_30ds_qty + self.mac_fixed_price

        self.sale_price_wmt_qty1 = self.sale_unit_price * self.wmt_2018_11_28_qty1 + self.sale_fixed_price
        self.sale_price_wmt_qty2 = self.sale_unit_price * self.wmt_2018_11_28_qty2 + self.sale_fixed_price
        self.mac_price_wmt_qty1 = self.mac_unit_price * self.wmt_2018_11_28_qty1 + self.mac_fixed_price
        self.mac_price_wmt_qty2 = self.mac_unit_price * self.wmt_2018_11_28_qty2 + self.mac_fixed_price

        self.inConsideration = row['in_consideration']
        self.isMarginPositive = self.mac_price_30ds_qty < self.sale_price_30ds_qty
        self.isCompetitive = self.sale_price_30ds_qty < self.min_major_retail_grx_30ds if pd.notna(
            self.last_30ds_qty) else False
        self.walmartDrugGroup = True if pd.notna(
            self.wmt_2018_11_28_flg) else False

        self.pricesChanged = False
        self.newer_unit_price = self.sale_unit_price
        self.newer_fixed_price = self.sale_fixed_price
        self.comment = 'No Change'
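# Sanity-check sketch for the network-1 blend above: the four MAC weights
# (0.45, 0.3, 0.175, 0.075) sum to 1, so mac_unit_price is a true weighted average.
weights = [0.45, 0.3, 0.175, 0.075]
assert abs(sum(weights) - 1.0) < 1e-9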
Exemple #60
0
    def _check_moment_func(
        self,
        static_comp,
        name,
        raw,
        has_min_periods=True,
        has_center=True,
        has_time_rule=True,
        fill_value=None,
        zero_min_periods_equal=True,
        **kwargs,
    ):

        # inject raw
        if name == "apply":
            kwargs = copy.copy(kwargs)
            kwargs["raw"] = raw

        def get_result(obj, window, min_periods=None, center=False):
            r = obj.rolling(window=window, min_periods=min_periods, center=center)
            return getattr(r, name)(**kwargs)

        series_result = get_result(self.series, window=50)
        assert isinstance(series_result, Series)
        tm.assert_almost_equal(series_result.iloc[-1], static_comp(self.series[-50:]))

        frame_result = get_result(self.frame, window=50)
        assert isinstance(frame_result, DataFrame)
        tm.assert_series_equal(
            frame_result.iloc[-1, :],
            self.frame.iloc[-50:, :].apply(static_comp, axis=0, raw=raw),
            check_names=False,
        )

        # check time_rule works
        if has_time_rule:
            win = 25
            minp = 10
            series = self.series[::2].resample("B").mean()
            frame = self.frame[::2].resample("B").mean()

            if has_min_periods:
                series_result = get_result(series, window=win, min_periods=minp)
                frame_result = get_result(frame, window=win, min_periods=minp)
            else:
                series_result = get_result(series, window=win)
                frame_result = get_result(frame, window=win)

            last_date = series_result.index[-1]
            prev_date = last_date - 24 * offsets.BDay()

            trunc_series = self.series[::2].truncate(prev_date, last_date)
            trunc_frame = self.frame[::2].truncate(prev_date, last_date)

            tm.assert_almost_equal(series_result[-1], static_comp(trunc_series))

            tm.assert_series_equal(
                frame_result.xs(last_date),
                trunc_frame.apply(static_comp, raw=raw),
                check_names=False,
            )

        # excluding NaNs correctly
        obj = Series(randn(50))
        obj[:10] = np.NaN
        obj[-10:] = np.NaN
        if has_min_periods:
            result = get_result(obj, 50, min_periods=30)
            tm.assert_almost_equal(result.iloc[-1], static_comp(obj[10:-10]))

            # min_periods is working correctly
            result = get_result(obj, 20, min_periods=15)
            assert isna(result.iloc[23])
            assert not isna(result.iloc[24])

            assert not isna(result.iloc[-6])
            assert isna(result.iloc[-5])

            obj2 = Series(randn(20))
            result = get_result(obj2, 10, min_periods=5)
            assert isna(result.iloc[3])
            assert notna(result.iloc[4])

            if zero_min_periods_equal:
                # min_periods=0 may be equivalent to min_periods=1
                result0 = get_result(obj, 20, min_periods=0)
                result1 = get_result(obj, 20, min_periods=1)
                tm.assert_almost_equal(result0, result1)
        else:
            result = get_result(obj, 50)
            tm.assert_almost_equal(result.iloc[-1], static_comp(obj[10:-10]))

        # window larger than series length (#7297)
        if has_min_periods:
            for minp in (0, len(self.series) - 1, len(self.series)):
                result = get_result(self.series, len(self.series) + 1, min_periods=minp)
                expected = get_result(self.series, len(self.series), min_periods=minp)
                nan_mask = isna(result)
                tm.assert_series_equal(nan_mask, isna(expected))

                nan_mask = ~nan_mask
                tm.assert_almost_equal(result[nan_mask], expected[nan_mask])
        else:
            result = get_result(self.series, len(self.series) + 1)
            expected = get_result(self.series, len(self.series))
            nan_mask = isna(result)
            tm.assert_series_equal(nan_mask, isna(expected))

            nan_mask = ~nan_mask
            tm.assert_almost_equal(result[nan_mask], expected[nan_mask])

        # check center=True
        if has_center:
            if has_min_periods:
                result = get_result(obj, 20, min_periods=15, center=True)
                expected = get_result(
                    pd.concat([obj, Series([np.NaN] * 9)]), 20, min_periods=15
                )[9:].reset_index(drop=True)
            else:
                result = get_result(obj, 20, center=True)
                expected = get_result(pd.concat([obj, Series([np.NaN] * 9)]), 20)[
                    9:
                ].reset_index(drop=True)

            tm.assert_series_equal(result, expected)

            # shifter index
            s = ["x{x:d}".format(x=x) for x in range(12)]

            if has_min_periods:
                minp = 10

                series_xp = (
                    get_result(
                        self.series.reindex(list(self.series.index) + s),
                        window=25,
                        min_periods=minp,
                    )
                    .shift(-12)
                    .reindex(self.series.index)
                )
                frame_xp = (
                    get_result(
                        self.frame.reindex(list(self.frame.index) + s),
                        window=25,
                        min_periods=minp,
                    )
                    .shift(-12)
                    .reindex(self.frame.index)
                )

                series_rs = get_result(
                    self.series, window=25, min_periods=minp, center=True
                )
                frame_rs = get_result(
                    self.frame, window=25, min_periods=minp, center=True
                )

            else:
                series_xp = (
                    get_result(
                        self.series.reindex(list(self.series.index) + s), window=25
                    )
                    .shift(-12)
                    .reindex(self.series.index)
                )
                frame_xp = (
                    get_result(
                        self.frame.reindex(list(self.frame.index) + s), window=25
                    )
                    .shift(-12)
                    .reindex(self.frame.index)
                )

                series_rs = get_result(self.series, window=25, center=True)
                frame_rs = get_result(self.frame, window=25, center=True)

            if fill_value is not None:
                series_xp = series_xp.fillna(fill_value)
                frame_xp = frame_xp.fillna(fill_value)
            tm.assert_series_equal(series_xp, series_rs)
            tm.assert_frame_equal(frame_xp, frame_rs)