def test_basic(self):
    # array or list or dates
    N = 50
    rng = date_range('1/1/1990', periods=N, freq='53s')
    ts = Series(np.random.randn(N), index=rng)
    ts[15:30] = np.nan
    dates = date_range('1/1/1990', periods=N * 3, freq='25s')

    result = ts.asof(dates)
    assert notna(result).all()
    lb = ts.index[14]
    ub = ts.index[30]

    result = ts.asof(list(dates))
    assert notna(result).all()
    lb = ts.index[14]
    ub = ts.index[30]

    mask = (result.index >= lb) & (result.index < ub)
    rs = result[mask]
    assert (rs == ts[lb]).all()

    val = result[result.index[result.index >= ub][0]]
    assert ts[ub] == val
def map_state(company_list):
    states = geo_list[(geo_list['Level ID'] == 0) | (geo_list['Level ID'] == 1)]
    cities = geo_list[(geo_list['Level ID'] == 0) | (geo_list['Level ID'] == 2)]
    company_list['State_Abbr'] = None
    for index, company in company_list.iterrows():
        # Has state
        if pd.notna(company['State']):
            if not states[states['Name'] == company['State']].empty:
                company_list.loc[index, 'State_Abbr'] = states.loc[states['Name'] == company['State'], 'PingYin2'].values[0].upper()
            elif not states[states['Full Name'] == company['State']].empty:
                company_list.loc[index, 'State_Abbr'] = states.loc[states['Full Name'] == company['State'], 'PingYin2'].values[0].upper()
                company_list.loc[index, 'State'] = states.loc[states['Full Name'] == company['State'], 'Name'].values[0]
        # Only has city
        elif pd.notna(company['City']):
            if not cities[cities['Name'] == company['City']].empty:
                if (cities.loc[cities['Name'] == company['City'], 'Level ID'] == 0).any():
                    company_list.loc[index, 'State_Abbr'] = cities.loc[cities['Name'] == company['City'], 'PingYin2'].values[0].upper()
                    company_list.loc[index, 'State'] = cities.loc[cities['Name'] == company['City'], 'Name'].values[0]
                else:
                    city_pid = cities.loc[cities['Name'] == company['City'], 'PID'].values[0]
                    if not states[states['ID'] == city_pid].empty:
                        company_list.loc[index, 'State_Abbr'] = states.loc[states['ID'] == city_pid, 'PingYin2'].values[0].upper()
                        company_list.loc[index, 'State'] = states.loc[states['ID'] == city_pid, 'Name'].values[0]
            elif not cities[cities['Full Name'] == company['City']].empty:
                if (cities.loc[cities['Full Name'] == company['City'], 'Level ID'] == 0).any():
                    company_list.loc[index, 'State_Abbr'] = cities.loc[cities['Full Name'] == company['City'], 'PingYin2'].values[0].upper()
                    company_list.loc[index, 'State'] = cities.loc[cities['Full Name'] == company['City'], 'Name'].values[0]
                else:
                    city_pid = cities.loc[cities['Full Name'] == company['City'], 'PID'].values[0]
                    if not states[states['ID'] == city_pid].empty:
                        company_list.loc[index, 'State_Abbr'] = states.loc[states['ID'] == city_pid, 'PingYin2'].values[0].upper()
                        company_list.loc[index, 'State'] = states.loc[states['ID'] == city_pid, 'Name'].values[0]
    return company_list
def convert_bitwarden(bitwarden_df):
    '''
    # Lastpass
    Documentation on expected CSV format:
    https://helpdesk.lastpass.com/importing-from-other-password-managers/

    ## Select notes from lastpass docs
    Lastpass valid columns: url, username, password, extra, name, grouping, type, hostname

    To import Secure Note data, enter the values as follows:
    “url” = http://sn, “extra” = the contents of the note.
    Give the note a “name”, and then consider adding “group”.
    It is important to leave the username and password columns blank.

    ----

    # Bitwarden
    Bitwarden exported columns:
    folder,favorite,type,name,notes,fields,login_uri,login_username,login_password,login_totp
    '''
    lp_sn_url_format = "http://sn"

    # Other columns aren't used in lastpass
    for col in ['favorite', 'login_totp', 'type']:
        del bitwarden_df[col]

    rename_dict = {
        'login_uri': 'url',
        'login_username': 'username',
        'login_password': 'password',
        'notes': 'extra',
        'folder': 'grouping',
    }
    bitwarden_df.rename(columns=rename_dict, inplace=True)

    has_fields_df = bitwarden_df[pd.notna(bitwarden_df['fields'])]
    print("{} entries have a non-empty 'fields' entry".format(len(has_fields_df)))
    assert len(has_fields_df[has_fields_df['url'] == lp_sn_url_format]) == 0  # ensure none of these are lp formatted
    assert len(has_fields_df[pd.notna(has_fields_df['extra'])]) == 0  # ensure none of these have 'extra' entries as well
    bitwarden_df.loc[has_fields_df.index, 'extra'] = has_fields_df['fields']
    del bitwarden_df['fields']

    has_extra_df = bitwarden_df[pd.notna(bitwarden_df['extra'])]
    print("{} entries have notes".format(len(has_extra_df)))
    lp_secure_notes_df = has_extra_df[has_extra_df['url'] == lp_sn_url_format]
    print("\t{} entries are lastpass formatted secure notes".format(len(lp_secure_notes_df)))
    assert lp_secure_notes_df[['username', 'password']].isnull().all().all()  # make sure all usernames/passwords are empty

    bw_extra_df = has_extra_df[has_extra_df['url'] != lp_sn_url_format]
    bw_secure_notes_df = bw_extra_df[(bw_extra_df['url'].isnull()) | ((bw_extra_df['password'].isnull()) & (bw_extra_df['username'].isnull()))]
    print("\t{} entries are bitwarden secure notes".format(len(bw_secure_notes_df)))
    assert bw_secure_notes_df[['username', 'password']].isnull().all().all()  # make sure all usernames/passwords are empty
    bitwarden_df.loc[bw_secure_notes_df.index, 'url'] = lp_sn_url_format

    has_extra_df = bitwarden_df[pd.notna(bitwarden_df['extra'])]
    bw_extra_df = has_extra_df[has_extra_df['url'] != lp_sn_url_format]
    assert pd.notna(bw_extra_df[['password', 'url']]).all().all()
    print("\t{} entries are logins with notes".format(len(bw_extra_df)))

    no_url_mask = bitwarden_df['url'].isnull()
    bitwarden_df.loc[no_url_mask, 'url'] = lp_sn_url_format
    print("{} entries have no url, setting as secure notes".format(no_url_mask.sum()))
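A minimal sketch (not from the source) of the row shape this conversion aims for, based only on the LastPass columns and secure-note rules quoted in the docstring; the field values are made up for illustration:

# Assumed example: a LastPass-style secure-note record after conversion.
example_secure_note = {
    'url': 'http://sn',               # marker URL that flags a secure note
    'username': '',                   # must stay blank per the LastPass docs
    'password': '',                   # must stay blank per the LastPass docs
    'extra': 'Contents of the note',  # the note body
    'name': 'My secure note',
    'grouping': 'Personal',
}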
def test_where_tz(self):
    i = pd.date_range('20130101', periods=3, tz='US/Eastern')
    result = i.where(notna(i))
    expected = i
    tm.assert_index_equal(result, expected)

    i2 = i.copy()
    i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist())
    result = i.where(notna(i2))
    expected = i2
    tm.assert_index_equal(result, expected)
def test_where(self):
    i = self.create_index()
    result = i.where(notna(i))
    expected = i
    tm.assert_index_equal(result, expected)

    i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), freq='D')
    result = i.where(notna(i2))
    expected = i2
    tm.assert_index_equal(result, expected)
def test_where(self):
    i = self.create_index()
    result = i.where(notna(i))
    expected = i
    tm.assert_index_equal(result, expected)

    i2 = pd.CategoricalIndex([np.nan, np.nan] + i[2:].tolist(),
                             categories=i.categories)
    result = i.where(notna(i2))
    expected = i2
    tm.assert_index_equal(result, expected)
def test_series_setitem(
        self, multiindex_year_month_day_dataframe_random_data):
    ymd = multiindex_year_month_day_dataframe_random_data
    s = ymd['A']

    s[2000, 3] = np.nan
    assert isna(s.values[42:65]).all()
    assert notna(s.values[:42]).all()
    assert notna(s.values[65:]).all()

    s[2000, 3, 10] = np.nan
    assert isna(s[49])
def test_properties(self, closed):
    index = self.create_index(closed=closed)
    assert len(index) == 10
    assert index.size == 10
    assert index.shape == (10, )

    tm.assert_index_equal(index.left, Index(np.arange(10)))
    tm.assert_index_equal(index.right, Index(np.arange(1, 11)))
    tm.assert_index_equal(index.mid, Index(np.arange(0.5, 10.5)))

    assert index.closed == closed

    ivs = [Interval(l, r, closed) for l, r in zip(range(10), range(1, 11))]
    expected = np.array(ivs, dtype=object)
    tm.assert_numpy_array_equal(np.asarray(index), expected)

    # with nans
    index = self.create_index_with_nan(closed=closed)
    assert len(index) == 10
    assert index.size == 10
    assert index.shape == (10, )

    expected_left = Index([0, np.nan, 2, 3, 4, 5, 6, 7, 8, 9])
    expected_right = expected_left + 1
    expected_mid = expected_left + 0.5
    tm.assert_index_equal(index.left, expected_left)
    tm.assert_index_equal(index.right, expected_right)
    tm.assert_index_equal(index.mid, expected_mid)

    assert index.closed == closed

    ivs = [Interval(l, r, closed) if notna(l) else np.nan
           for l, r in zip(expected_left, expected_right)]
    expected = np.array(ivs, dtype=object)
    tm.assert_numpy_array_equal(np.asarray(index), expected)
def dedup_company(company_common_list, contact_common_list):
    company_common_list['ComName_temp'] = None
    company_common_list['vc_Deduplicate'] = None
    # company_common_list['Load'] = None
    company_common_list['vc_Master_ID'] = None
    for index, company in company_common_list.iterrows():
        if pd.notna(company['Company_Name_CN']):
            company_common_list.ix[index, 'ComName_temp'] = extract_keyword(company['Company_Name_CN'])
        else:
            company_common_list.ix[index, 'ComName_temp'] = format_space(str(company['Company_Name']).strip().lower())
    company_common_list['vc_Deduplicate'] = company_common_list.duplicated(subset=['ComName_temp'], keep=False)
    company_common_list['vc_Deduplicate'] = company_common_list['vc_Deduplicate'].apply(lambda x: False if x else True)

    # Duplicate list needs review
    company_duplicate_list = company_common_list[company_common_list['vc_Deduplicate'] == False]
    company_duplicate_list['Load'] = False
    # Full duplicate list
    company_duplicate_full = company_duplicate_list
    company_duplications = list(company_duplicate_list.groupby(['ComName_temp']).count().index)
    for dup in company_duplications:
        company_dup_group = company_duplicate_list[company_duplicate_list['ComName_temp'] == dup]
        company_masterid, company_common_list, company_dup_group = dedup_get_master(company_common_list, company_dup_group)
        if company_masterid is None:
            continue
        else:
            # Merge similar companies, set master company load as TRUE
            company_duplicate_full.loc[company_duplicate_full['Source_ID'] == company_masterid, 'Load'] = True
            company_common_list, contact_common_list = dedup_fix(company_common_list, contact_common_list, company_dup_group)
            company_duplicate_list = company_duplicate_list[company_duplicate_list['ComName_temp'] != dup]
    company_common_list.loc[company_common_list['vc_Deduplicate'] == False, 'Load'] = False
    return company_duplicate_list, company_duplicate_full, company_common_list, contact_common_list
def _check_stat_op(self, name, alternate, string_series_, check_objects=False,
                   check_allna=False):
    with pd.option_context('use_bottleneck', False):
        f = getattr(Series, name)

        # add some NaNs
        string_series_[5:15] = np.NaN

        # mean, idxmax, idxmin, min, and max are valid for dates
        if name not in ['max', 'min', 'mean']:
            ds = Series(pd.date_range('1/1/2001', periods=10))
            with pytest.raises(TypeError):
                f(ds)

        # skipna or no
        assert pd.notna(f(string_series_))
        assert pd.isna(f(string_series_, skipna=False))

        # check the result is correct
        nona = string_series_.dropna()
        tm.assert_almost_equal(f(nona), alternate(nona.values))
        tm.assert_almost_equal(f(string_series_), alternate(nona.values))

        allna = string_series_ * np.nan
        if check_allna:
            assert np.isnan(f(allna))

        # dtype=object with None, it works!
        s = Series([1, 2, 3, None, 5])
        f(s)

        # GH#2888
        items = [0]
        items.extend(lrange(2 ** 40, 2 ** 40 + 1000))
        s = Series(items, dtype='int64')
        tm.assert_almost_equal(float(f(s)), float(alternate(s.values)))

        # check date range
        if check_objects:
            s = Series(pd.bdate_range('1/1/2000', periods=10))
            res = f(s)
            exp = alternate(s)
            assert res == exp

        # check on string data
        if name not in ['sum', 'min', 'max']:
            with pytest.raises(TypeError):
                f(Series(list('abc')))

        # Invalid axis.
        with pytest.raises(ValueError):
            f(string_series_, axis=1)

        # Unimplemented numeric_only parameter.
        if 'numeric_only' in compat.signature(f).args:
            with pytest.raises(NotImplementedError, match=name):
                f(string_series_, numeric_only=True)
def test_valid(self):
    ts = self.ts.copy()
    ts[::2] = np.NaN

    result = ts.valid()
    assert len(result) == ts.count()
    tm.assert_series_equal(result, ts[1::2])
    tm.assert_series_equal(result, ts[pd.notna(ts)])
def init_list(raw_list, colnames, **kwargs): for col in colnames: for i in null_list: if col not in list(raw_list) or pd.isnull(raw_list[col]).all(): break else: if col in ['Source_ID', 'Source_Company_ID']: continue else: raw_list[col] = raw_list[col].astype(object).str.lower().replace(i, np.nan, regex=True) raw_list[col] = raw_list[col].astype(object).str.title() if kwargs['mode'] == 'Company': raw_list['db_New'] = True raw_list['Load'] = True raw_list['Company_Name_CN'] = raw_list.loc[pd.notnull(raw_list['Company_Name_CN']), 'Company_Name_CN'].apply(lambda x: x.replace(' ', '')) for index, company in raw_list.iterrows(): address_list = dict() if pd.notna(company['Billing_Address_CN']): address_list = enrich_address(company['Billing_Address_CN']) elif pd.notna(company['Billing_Address']): address_list = enrich_address(company['Billing_Address']) for key in address_list.keys(): raw_list.loc[index, key] = address_list[key] # if len(args) > 2: # raw_list['Source_ID'] = raw_list['Source_ID'].apply(lambda x: args[1] + '_' + args[2] + '_' + 'Company' + '_' + str(x)) if kwargs['mode'] == 'Contact': raw_list['db_New'] = True raw_list['Load'] = True raw_list['Source_ID'] = list(range(1, (len(raw_list) + 1))) raw_list['Source_ID'] = raw_list['Source_ID'].apply(lambda x: kwargs['sourcename'] + '_' + kwargs['timestamp'] + '_' + 'Contact' + '_' + str(x)) if 'company' in kwargs.keys(): company_list = kwargs['company'] raw_list['Billing_Address'] = company_list['Billing_Address'] raw_list['Billing_Address_CN'] = company_list['Billing_Address_CN'] raw_list['District'] = company_list['District'] raw_list['District_CN'] = company_list['District_CN'] raw_list['City'] = company_list['City'] raw_list['City_CN'] = company_list['City_CN'] raw_list['State'] = company_list['State'] raw_list['State_CN'] = company_list['State_CN'] raw_list['Postal_Code'] = company_list['Postal_Code'] raw_list['Country'] = company_list['Country'] # if len(args) > 2 and raw_list: # raw_list['Source_Company_ID'] = raw_list['Source_Company_ID'].apply(lambda x: args[1] + '_' + args[2] + '_' + 'Company' + '_' + str(x)) return raw_list
def test_where_other(self):
    # other is ndarray or Index
    i = pd.date_range('20130101', periods=3, tz='US/Eastern')

    for arr in [np.nan, pd.NaT]:
        result = i.where(notna(i), other=np.nan)
        expected = i
        tm.assert_index_equal(result, expected)

    i2 = i.copy()
    i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist())
    result = i.where(notna(i2), i2)
    tm.assert_index_equal(result, i2)

    i2 = i.copy()
    i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist())
    result = i.where(notna(i2), i2.values)
    tm.assert_index_equal(result, i2)
def test_where_other(self):
    i = period_range('20130101', periods=5, freq='D')
    for arr in [np.nan, pd.NaT]:
        result = i.where(notna(i), other=np.nan)
        expected = i
        tm.assert_index_equal(result, expected)

    i2 = i.copy()
    i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), freq='D')
    result = i.where(notna(i2), i2)
    tm.assert_index_equal(result, i2)

    i2 = i.copy()
    i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), freq='D')
    result = i.where(notna(i2), i2.values)
    tm.assert_index_equal(result, i2)
def test_override_inferred_closed(self, constructor, data, closed):
    # GH 19370
    if isinstance(data, IntervalIndex):
        tuples = data.to_tuples()
    else:
        tuples = [(iv.left, iv.right) if notna(iv) else iv for iv in data]
    expected = IntervalIndex.from_tuples(tuples, closed=closed)
    result = constructor(data, closed=closed)
    tm.assert_index_equal(result, expected)
def test_periodindex(self):
    from pandas import period_range, PeriodIndex
    # array or list or dates
    N = 50
    rng = period_range('1/1/1990', periods=N, freq='H')
    ts = Series(np.random.randn(N), index=rng)
    ts[15:30] = np.nan
    dates = date_range('1/1/1990', periods=N * 3, freq='37min')

    result = ts.asof(dates)
    assert notna(result).all()
    lb = ts.index[14]
    ub = ts.index[30]

    result = ts.asof(list(dates))
    assert notna(result).all()
    lb = ts.index[14]
    ub = ts.index[30]

    pix = PeriodIndex(result.index.values, freq='H')
    mask = (pix >= lb) & (pix < ub)
    rs = result[mask]
    assert (rs == ts[lb]).all()

    ts[5:10] = np.nan
    ts[15:20] = np.nan

    val1 = ts.asof(ts.index[7])
    val2 = ts.asof(ts.index[19])

    assert val1 == ts[4]
    assert val2 == ts[14]

    # accepts strings
    val1 = ts.asof(str(ts.index[7]))
    assert val1 == ts[4]

    # in there
    assert ts.asof(ts.index[3]) == ts[3]

    # no as of value
    d = ts.index[0].to_timestamp() - offsets.BDay()
    assert isna(ts.asof(d))
def validate_name(contact):
    nfirst = True
    nlast = False
    nspace = False

    # Remove more than two space and starting/ending space, format Last_Name
    if pd.notna(contact['Last_Name']):
        contact['Last_Name'] = format_space(contact['Last_Name'].lower().capitalize())
    if pd.notna(contact['First_Name']):
        contact['First_Name'] = format_space(contact['First_Name'])
    if pd.isna(contact['Reject_Reason']):
        contact['Reject_Reason'] = ''

    # Check First_Name and Last_Name misplace
    for lan in lastname_list.iloc[:, 1:]:
        lastnames = list(lastname_list[lan])
        if contact['Last_Name'] in lastnames:
            contact['vn_Lastname_CN'] = lastname_list.ix[lastnames.index(contact['Last_Name']), '简体中文']
            nlast = True
            break
        elif contact['First_Name'] in lastnames:
            nfirst = False
            break
    if not (nlast or nfirst):
        contact['Reject_Reason'] = contact['Reject_Reason'] + 'First_Name_CN and Last_Name_CN misplace; '

    # Check name contains space
    if pd.notna(contact['First_Name']) and pd.notna(contact['Last_Name']):
        if ' ' in contact['First_Name'] or ' ' in contact['Last_Name']:
            contact['Reject_Reason'] = contact['Reject_Reason'] + 'Name contains space; '
        else:
            nspace = True
    else:
        nspace = True

    # Name check
    ncheck = (nlast or nfirst) and nspace
    contact['vn_Name_Swap'] = (nlast or nfirst)
    contact['vn_Name_Space'] = nspace
    contact['vn_Name_Check'] = ncheck
    return contact
def NanCleanerApply(x):
    # @param x is a column of the dataset
    maskNan = pd.isna(x)
    maskNotNan = pd.notna(x)
    notNan = x[maskNotNan]
    nan = x[maskNan]
    avg = int(np.average(notNan))
    for i in range(0, len(x)):
        if (pd.isna(x[i])):
            x[i] = avg
    return x
def test_length(self, closed, breaks):
    # GH 18789
    index = IntervalIndex.from_breaks(breaks, closed=closed)
    result = index.length
    expected = Index(iv.length for iv in index)
    tm.assert_index_equal(result, expected)

    # with NA
    index = index.insert(1, np.nan)
    result = index.length
    expected = Index(iv.length if notna(iv) else iv for iv in index)
    tm.assert_index_equal(result, expected)
def test_where(self):
    i = self.create_index()
    result = i.where(notna(i))
    expected = i
    tm.assert_index_equal(result, expected)

    _nan = i._na_value
    cond = [False] + [True] * len(i[1:])
    expected = pd.Index([_nan] + i[1:].tolist(), dtype=i.dtype)

    result = i.where(cond)
    tm.assert_index_equal(result, expected)
def get_kwargs_from_breaks(self, breaks, closed='right'):
    """
    converts intervals in breaks format to a dictionary of kwargs to
    specific to the format expected by the IntervalIndex/Index constructors
    """
    if len(breaks) == 0:
        return {'data': breaks}

    ivs = [Interval(l, r, closed) if notna(l) else l
           for l, r in zip(breaks[:-1], breaks[1:])]

    if isinstance(breaks, list):
        return {'data': ivs}
    return {'data': np.array(ivs, dtype=object)}
def enrich_no_address(company_load_list, company_address_review):
    company_address_review = company_address_review[company_address_review['Load'] == True]
    for index, company in company_address_review.iterrows():
        sourceid = company['Source_ID']
        if pd.notna(company['Billing_Address_CN']):
            address_list = enrich_address(company['Billing_Address_CN'])
        elif pd.notna(company['Billing_Address']):
            address_list = enrich_address(company['Billing_Address'])
        for key in address_list.keys():
            company_load_list.loc[company_load_list['Source_ID'] == sourceid, key] = address_list[key]
        # if pd.isna(company_load_list.loc[index, 'State']):
        #     company_load_list.loc[index, 'State'] = state
        # if pd.isna(company_load_list.loc[index, 'City']):
        #     company_load_list.loc[index, 'City'] = city
        # if pd.isna(company_load_list.loc[index, 'District']):
        #     company_load_list.loc[index, 'District'] = district
        # if pd.isna(company_load_list.loc[index, 'Postal_Code']):
        #     company_load_list.loc[index, 'Postal_Code'] = zipcode
    # company_load_list.loc[pd.notnull(company_load_list['District']), 'Full_Address'] = company_load_list['District'] + company_load_list['Billing_Address']
    # company_load_list.loc[pd.isnull(company_load_list['District']), 'Full_Address'] = company_load_list['Billing_Address']
    return company_load_list
def dedup_comany_db(company_dedup_list, company_db_return):
    if company_db_return.empty:
        return company_db_return
    company_merge_list = company_dedup_list.merge(company_db_return, on=['ComName_temp'], suffixes=['', '_db'], how='left')
    company_existing_list = company_merge_list[pd.notna(company_merge_list['Source_ID_db'])]
    company_existing_list['db_New'] = False
    company_existing_list['Load'] = False
    existing_company = company_existing_list['Source_ID'].tolist()
    existing_company = pd.Series(company_dedup_list['Source_ID'].isin(existing_company))
    company_dedup_list.loc[existing_company, 'db_New'] = False
    company_dedup_list.loc[existing_company, 'Load'] = False
    return company_dedup_list, company_existing_list
def enrich_scrapy(company, scrapy):
    if scrapy.empty:
        return company
    else:
        if pd.notna(scrapy['英文名']).any():
            company['Company_Name'] = scrapy['英文名'].values[0]
        company['Company_Name_CN'] = scrapy['公司名称'].values[0]
        if scrapy['境外公司'] is True:
            company['Country'] = ''
        else:
            company['Country'] = 'China'
        if pd.isna(company['Billing_Address_CN']):
            address_list = enrich_address(scrapy['地址'].values[0])
            for key in address_list.keys():
                company[key] = address_list[key]
        # Set state as '所属地区'
        if pd.notna(scrapy['所属地区']).all():
            state = scrapy['所属地区'].values[0]
            states = geo_list[geo_list['Level ID'] == 1]
            for index, s in states.iterrows():
                if s['Full Name'] == state or s['Name'] == state or s['PingYin3'] == state.lower():
                    company['State_CN'] = s['Name']
                    company['State'] = s['PingYin3'].capitalize()
                    break
        # company['Company_Type'] = scrapy['公司类型'].values[0]
        company['Phone'] = scrapy['电话'].values[0]
        company['Website'] = scrapy['网址'].values[0]
        company['Email'] = scrapy['邮箱'].values[0]
        company['Industry'] = scrapy['所属行业'].values[0]
        company['Employee'] = scrapy['参保人数'].values[0]
        return company
def test_custom_grouper(index):
    dti = index
    s = Series(np.array([1] * len(dti)), index=dti, dtype='int64')

    b = TimeGrouper(Minute(5))
    g = s.groupby(b)

    # check all cython functions work
    funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
    for f in funcs:
        g._cython_agg_general(f)

    b = TimeGrouper(Minute(5), closed='right', label='right')
    g = s.groupby(b)

    # check all cython functions work
    funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
    for f in funcs:
        g._cython_agg_general(f)

    assert g.ngroups == 2593
    assert notna(g.mean()).all()

    # construct expected val
    arr = [1] + [5] * 2592
    idx = dti[0:-1:5]
    idx = idx.append(dti[-1:])
    expect = Series(arr, index=idx)

    # GH2763 - return input dtype if we can
    result = g.agg(np.sum)
    assert_series_equal(result, expect)

    df = DataFrame(np.random.rand(len(dti), 10), index=dti, dtype='float64')
    r = df.groupby(b).agg(np.sum)

    assert len(r.columns) == 10
    assert len(r.index) == 2593
def validate_contacts(contact_dedup_list, contact_colnames, company_scrapy_list):
    contact_validate_list = pd.DataFrame(columns=contact_colnames)
    for index, contact in contact_dedup_list.iterrows():
        sourceid = contact['Source_Company_ID']
        company = company_scrapy_list.loc[company_scrapy_list['Source_ID'] == sourceid]
        contact = validate_name(contact)
        contact = validate_email(contact, company)
        if not company.empty:
            contact['Company_Name'] = company['Company_Name'].values[0]
            contact['Company_Name_CN'] = company['Company_Name_CN'].values[0]
            if pd.isna(contact['Billing_Address']):
                contact['Billing_Address'] = company['Full_Address'].values[0]
            if pd.isna(contact['City']):
                contact['City'] = company['City'].values[0]
            if pd.isna(contact['State']):
                contact['State'] = company['State'].values[0]
            if pd.isna(contact['Postal_Code']):
                contact['Postal_Code'] = company['Postal_Code'].values[0]
            if pd.isna(contact['Country']):
                contact['Country'] = company['Country'].values[0]
        if pd.isna(contact['Mobile']) and pd.isna(contact['Phone']) and pd.isna(contact['Email']):
            contact['Reject_Reason'] = contact['Reject_Reason'] + 'No communication method; '
        contact['Load'] = contact['vn_Name_Check'] and (contact['ve_Email_Check'] or pd.notna(contact['Mobile']) or pd.notna(contact['Phone'])) and contact['db_New']
        contact_validate_list = contact_validate_list.append(contact, ignore_index=True)

    # Deduplicate by name and email
    contact_validate_list['Fname_temp'] = contact_validate_list['First_Name'].apply(lambda x: x if x is np.nan else x.lower())
    contact_validate_list['Lname_temp'] = contact_validate_list['Last_Name'].apply(lambda x: x if x is np.nan else x.lower())
    # TODO: keep only letters in email as Email_temp
    # Switch True and False
    contact_validate_list['vc_Deduplicate'] = contact_validate_list.duplicated(subset=['Fname_temp', 'Lname_temp', 'Email'], keep=False)
    contact_validate_list['vc_Deduplicate'] = contact_validate_list['vc_Deduplicate'].apply(lambda x: False if x else True)
    contact_validate_list.loc[contact_validate_list['vc_Deduplicate'] == False, 'Reject_Reason'] = contact_validate_list['Reject_Reason'].astype(str) + 'Duplicates in source data; '
    contact_validate_list['Load'] = contact_validate_list['Load'] & contact_validate_list['vc_Deduplicate']
    return contact_validate_list
def transform2(row):
    if (notna(row['C']) and row['C'].startswith('shin') and
            row['A'] == 'foo'):
        row['D'] = 7
    return row
                         op='intersects')
dcasindexparcel['facindex'] = pd.qcut(
    dcasindexparcel['facilitypa'], 50, labels=False) + 1
dcasindexparcel['spdindex'] = 50 - pd.qcut(
    dcasindexparcel['avgspeed'], 50, labels=False)
dcasindexparcel['dcasindex'] = (dcasindexparcel['facindex'] +
                                dcasindexparcel['spdindex'])
dcasindexparcel = dcasindexparcel[[
    'facindex', 'spdindex', 'dcasindex', 'geometry'
]].reset_index(drop=True)
dcasindexparcel.to_file(path + 'OUTPUT/dcasindexparcel.shp')

# DCAS Index by NTA
dcasindexparcel = gpd.read_file(path + 'OUTPUT/dcasindexparcel.shp')
dcasindexparcel.crs = {'init': 'epsg:4326'}
dcasindexparcel = dcasindexparcel[pd.notna(
    dcasindexparcel['dcasindex'])].reset_index(drop=True)
ntaclippedadj = gpd.read_file(path + 'SHP/ntaclippedadj.shp')
ntaclippedadj.crs = {'init': 'epsg:4326'}
dcasindexnta = gpd.sjoin(dcasindexparcel, ntaclippedadj, how='inner',
                         op='intersects')
dcasindexnta = dcasindexnta.groupby(['NTACode', 'NTAName'],
                                    as_index=False).agg({
                                        'facindex': 'mean',
                                        'spdindex': 'mean',
                                        'dcasindex': 'mean'
                                    }).reset_index(drop=True)
facilityntact = gpd.read_file(path + 'OUTPUT/facilityparcelwgs.shp')
facilityntact.crs = {'init': 'epsg:4326'}
facilityntact = gpd.sjoin(facilityntact,
def categorical_impute(X):
    for i in range(len(X)):
        for j in range(len(X[0])):
            if not (pd.notna(X[i][j])):
                X[i][j] = 'No'
sep=";") terc_voivodeships = { row[1][0]: row[1][1] for row in terc_df[pd.isna(terc_df['POW'])][["WOJ", "NAZWA"]].iterrows() } terc_counties = { row[1][0] + row[1][1]: row[1][2] for row in terc_df[terc_df['POW'].notna()][pd.isna(terc_df['GMI'])] [["WOJ", "POW", "NAZWA"]].iterrows() } terc_communes = { row[1][0] + row[1][1] + row[1][2]: row[1][3] for row in terc_df[pd.notna(terc_df['RODZ'])] [["WOJ", "POW", "GMI", "NAZWA"]].iterrows() } terc_communetypes = { row[1][0] + row[1][1] + row[1][2] + row[1][3]: row[1][4] for row in terc_df[pd.notna(terc_df['RODZ'])] [["WOJ", "POW", "GMI", "RODZ", "NAZWA_DOD"]].iterrows() } # Populacje w gminach terc_populations = {} for i, population in pd.read_excel('data/original/tabela17.xls', usecols="B:C",
def run(self):
    # ESP_MSVP stopped at Actualización nº 116 on 25.05.2020
    start = 116
    stop = (date.today() - date(2020, 5, 25)).days + 117
    if self.sliding_window_days:
        start = max(start, stop - self.sliding_window_days)

    for actualizacion in range(start, stop):
        parsed = self.fetch(actualizacion)
        time.sleep(5)  # crawl delay
        if parsed is None:
            continue
        content = unicodedata.normalize('NFKC', parsed['content'])
        fecha = datetime.strptime(get_fecha(content), '%d.%m.%Y').strftime('%Y-%m-%d')
        tabs = get_ccaa_tables(content, ['Tabla 1. Casos', 'Tabla 2. Casos'])
        if 'Acrobat Distiller' in parsed['metadata']['producer']:  # fragile
            tabs[0] = [[col for col in row if col != ''] for row in tabs[0]]
            tabs[1] = [[col for col in row if col != ''] for row in tabs[1]]
        df1 = pd.DataFrame([row[0:2] for row in tabs[0]],
                           columns=['ccaa', 'confirmed'])
        df2 = pd.DataFrame(
            [[row[i] for i in (0, 1, 3, 5)] for row in tabs[1]],
            columns=['ccaa', 'hospitalised', 'hospitalised_icu', 'dead'])
        data = pd.merge(df1, df2, on='ccaa')

        for index, record in data.iterrows():
            # ccaa,confirmed,hospitalised,hospitalised_icu,dead
            ccaa = record[0]
            confirmed = int(record[1]) if pd.notna(record[1]) else None
            hospitalised = int(record[2]) if pd.notna(record[2]) else None
            hospitalised_icu = int(record[3]) if pd.notna(record[3]) else None
            dead = int(record[4]) if pd.notna(record[4]) else None

            success, adm_area_1, adm_area_2, adm_area_3, gid = self.adm_translator.tr(
                country_code='ESP',
                input_adm_area_1=ccaa,
                input_adm_area_2=None,
                input_adm_area_3=None,
                return_original_if_failure=True)

            upsert_obj = {
                'source': self.SOURCE,
                'date': fecha,
                'country': 'Spain',
                'countrycode': 'ESP',
                'adm_area_1': adm_area_1,
                'adm_area_2': adm_area_2,
                'adm_area_3': adm_area_3,
                'confirmed': confirmed,
                'dead': dead,
                'hospitalised': hospitalised,
                'hospitalised_icu': hospitalised_icu,
                'gid': gid
            }
            self.upsert_data(**upsert_obj)
def fill_location_for_tours_df(tdf, network_df, data_content): tid = tdf[pd.isna(tdf['latitude'])] pbar = tqdm(total=tid.shape[0], bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') pbar.set_description('Step 4 of 4') for idx, _ in tid.iterrows(): cat = ['going', 'maybe', 'invited', 'not_going'] bik = data_content.tour_convoy_df[ data_content.tour_convoy_df['tour_id'] == tdf.loc[idx, 'tour_id']] coll = [] for c in cat: if not pd.isna(bik[c].tolist()[0]): coll += bik[c].tolist()[0].split() g = network_df[network_df['biker_id'].isin(coll)] if g.shape[0] > 0: m, _ = mode(g[['latitude']], axis=0) if not np.isnan(m[0, 0]): index = g[g['latitude'] == m[0, 0]].index.tolist()[0] lat, long = g.loc[index, 'latitude'], g.loc[index, 'longitude'] tdf.loc[idx, 'latitude'] = lat tdf.loc[idx, 'longitude'] = long pbar.update(1) pbar.close() bid = tdf[pd.isna(tdf['latitude'])]['biker_id'].drop_duplicates().tolist() chi = data_content.tours_df[data_content.tours_df['biker_id'].isin(bid)] chi = chi[pd.notna(chi['latitude'])].groupby('biker_id')[[ 'latitude', 'longitude' ]].agg(lambda x: x.value_counts().index[0]) chi = chi.reset_index() for idx, _ in tdf[pd.isna(tdf['latitude'])].iterrows(): m = chi[chi['biker_id'] == tdf.loc[idx, 'biker_id']] if m.shape[0] != 0: tdf.loc[idx, 'latitude'] = m['latitude'].tolist()[0] tdf.loc[idx, 'longitude'] = m['longitude'].tolist()[0] # Using tour_convoy_df to find tours attended by biker organizing this tour # and fill location from based on that information. coll = [] tid = tdf[pd.isna(tdf['latitude'])] sdf = data_content.convoy_df[data_content.convoy_df['biker_id'].isin( tid['biker_id'].tolist())] for idx, _ in tid.iterrows(): cat = ['going', 'maybe', 'invited', 'not_going'] bik = sdf[sdf['biker_id'] == tid.loc[idx, 'biker_id']] if bik.shape[0] > 0: for c in cat: if not pd.isna(bik[c].tolist()[0]): coll += bik[c].tolist()[0].split() small_df = data_content.tours_df[data_content.tours_df['tour_id'].isin( coll)] for idx, _ in tid.iterrows(): cat = ['going', 'maybe', 'invited', 'not_going'] bik = sdf[sdf['biker_id'] == tdf.loc[idx, 'biker_id']] if bik.shape[0] > 0: coll = [] for c in cat: if not pd.isna(bik[c].tolist()[0]): coll += bik[c].tolist()[0].split() g = small_df[small_df['tour_id'].isin(coll)] if g.shape[0] > 0: m, _ = mode(g[['latitude']], axis=0) if not np.isnan(m[0, 0]): index = g[g['latitude'] == m[0, 0]].index.tolist()[0] lat, long = g.loc[index, 'latitude'], g.loc[index, 'longitude'] tdf.loc[idx, 'latitude'] = lat tdf.loc[idx, 'longitude'] = long return tdf
print("No trades") equity_arr.append(equity) drawdown_arr.append(equity / high_eq) prevdate = date else: print("Long:") print(longs) print("Short:") print(shorts) long_returns = return_df.loc[return_df['asset'].isin(longs), 'return_1d'] short_returns = return_df.loc[return_df['asset'].isin(shorts), 'return_1d'] long_returns = [ val - 1 for val in long_returns.to_numpy() if pd.notna(val) ] short_returns = [ val - 1 for val in short_returns.to_numpy() if pd.notna(val) ] if len(long_returns) == 0: long_returns = [0] if len(short_returns) == 0: short_returns = [0] long_return = LEVERAGE * (INV_VAR * np.mean(long_returns)) short_return = LEVERAGE * (INV_VAR * np.mean(short_returns)) equity = equity * (long_return - short_return + 1 - FEE)
def transform2(row):
    if notna(row["C"]) and row["C"].startswith("shin") and row["A"] == "foo":
        row["D"] = 7
    return row
def make_properties(row):
    return [
        api.models.ModelProperty(key[2:], api.models.PropertyValue(row[key]))
        for key in prop_keys if pd.notna(row[key])
    ]
def count(s): return notna(s).sum()
def condense_census(df_in):
    df_out = condense_record(df_in, CENSUS_COLUMNS)
    # Drop data from state censuses & where year is missing
    df_out.year = df_out.year.apply(process_year)
    df_out = df_out[pd.notna(df_out.year) & (df_out.year % 10 == 0)]
    return df_out
def validate_dataTempo(self, df, column):
    size = pd.notna(df).sum()
    if size == 0:
        return False

    validated = 0
    # datetime
    for value in df:
        value = str(value).strip()
        # mask: date ^(0?[1-9]|[12][0-9]|3[01])(-|\/)((0?[1-9])|(1[0-2]))(-|\/)(\d{2}|\d{4})$
        # 1-1-12 - 01/01/1999
        if len(value) >= 6 and len(value) <= 10:
            if re.fullmatch('^(0?[1-9]|[12][0-9]|3[01])(-|\/)((0?[1-9])|(1[0-2]))(-|\/)(\d{2}|\d{4})$', str(value)):
                validated = validated + 1
        # mask: time ^((0|1)?[0-9]|[2][0-3]):([0-5][0-9])(:([0-5][0-9]))?$
        # 1:00 - 23:59:59
        if len(value) >= 4 and len(value) <= 8:
            if re.fullmatch('^((0|1)?[0-9]|[2][0-3]):([0-5][0-9])(:([0-5][0-9]))?$', str(value)):
                validated = validated + 1
        # mask: datetime ^(0?[1-9]|[12][0-9]|3[01])(-|\/)((0?[1-9])|(1[0-2]))(-|\/)(\d{2}|\d{4})\s((0|1)?[0-9]|[2][0-3]):([0-5][0-9])(:([0-5][0-9]))?$
        # 1-1-12 1:00 - 01/01/1999 23:59:59
        if len(value) >= 11 and len(value) <= 20:
            if re.fullmatch('^(0?[1-9]|[12][0-9]|3[01])(-|\/)((0?[1-9])|(1[0-2]))(-|\/)(\d{2}|\d{4})\s((0|1)?[0-9]|[2][0-3]):([0-5][0-9])(:([0-5][0-9]))?$', str(value)):
                validated = validated + 1
    if validated / size >= self.threshold:
        return True

    # checking month
    if "mes" == column or column.startswith("mes_"):
        # integer values
        months = set()
        for x in df:
            try:
                value = int(x)
            except:
                continue
            months.add(value)
        if len(months) > 0:
            if min(months) >= 1 and max(months) <= 12:
                return True
        # string
        validated = 0
        for x in df:
            if str(x).lower() in ['janeiro', 'fevereiro', 'março', 'maro', 'marco', 'abril', 'maio', 'junho', 'julho',
                                  'agosto', 'setembro', 'outubro', 'novembro', 'dezembro',
                                  'jan', 'fev', 'mar', 'abr', 'maio', 'jun', 'jul', 'ago', 'set', 'out', 'nov', 'dez']:
                validated = validated + 1
        if validated / size >= self.threshold:
            return True

    # checking year
    if "ano" == column or column.startswith("ano_"):
        # integer values
        validated = 0
        for value in df:
            try:
                value = int(value)
            except:
                continue
            if value >= 1900 and value <= 2099:
                validated = validated + 1
        if validated / size >= self.threshold:
            return True

    # checking month-year
    if "mes" == column or column.startswith("mes_") or "ano" == column or column.startswith("ano_"):
        # mask jan/19 dez-2020
        validated = 0
        for value in df:
            value = str(value).strip().lower()
            if len(value) >= 6 or len(value) <= 9:
                if re.fullmatch('^(jan|fev|mar|abr|mai|maio|jun|jul|ago|set|out|nov|dez)(\/|-)(19|20)(\d{2})?$', value):
                    validated = validated + 1
        if validated / size >= self.threshold:
            return True
        # mask 19/jan 2020-dez
        validated = 0
        for value in df:
            value = str(value).strip().lower()
            if len(value) >= 6 or len(value) <= 9:
                if re.fullmatch('^(19|20)(\d{2})?(\/|-)(jan|fev|mar|abr|mai|maio|jun|jul|ago|set|out|nov|dez)$', value):
                    validated = validated + 1
        if validated / size >= self.threshold:
            return True

    # Special case: month with year
    if ("ano" in column and "mes" in column) or column.startswith("anoems") or column.startswith("anoms") or ("mes" in column and "refer" in column):
        # case 1: mask = YearMonth 2020/1 201406
        validated = 0
        for value in df:
            value = str(value).strip()
            if len(value) == 6 or len(value) == 7:
                if re.fullmatch('^(19|20)\d{2}(-|\/)?(0[1-9]|1[0-2])$', value):
                    validated = validated + 1
        if validated / size >= self.threshold:
            return True
        # case 2: mask = MonthYear 07-2014 062016
        validated = 0
        for value in df:
            value = str(value).strip()
            if len(value) == 6 or len(value) == 7:
                if re.fullmatch('^(0[1-9]|1[0-2])(-|\/)?(19|20)\d{2}$', value):
                    validated = validated + 1
        if validated / size >= self.threshold:
            return True
        # case 3: mask = Month/Year abr/16
        validated = 0
        for value in df:
            value = str(value).strip()
            if len(value) == 6 or len(value) == 7:
                if re.fullmatch('^(jan|fev|mar|abr|mai|maio|jun|jul|ago|set|out|nov|dez)(\/|-)\d{2}$', value):
                    validated = validated + 1
        if validated / size >= self.threshold:
            return True

    # special cases without format
    if column.startswith("dta_") or column.startswith("data_") or column.startswith("dt_"):
        # mask yearMonthDay - 20170927
        validated = 0
        for value in df:
            value = str(value).strip()
            if len(value) == 8:
                if re.fullmatch('^(19|20)\d{2}(0[1-9]|1[0-2])((0[1-9])|((1|2)[0-9]))$', value):
                    validated = validated + 1
        if validated / size >= self.threshold:
            return True

    return False
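A quick, self-contained sanity check of the date mask used above (illustrative only, not part of the original module); the positive samples come from the comments in the snippet, the negative case is an assumed extra:

import re

# date mask copied verbatim from the validator above
DATE_MASK = r'^(0?[1-9]|[12][0-9]|3[01])(-|\/)((0?[1-9])|(1[0-2]))(-|\/)(\d{2}|\d{4})$'

assert re.fullmatch(DATE_MASK, '1-1-12')         # short form from the comment
assert re.fullmatch(DATE_MASK, '01/01/1999')     # long form from the comment
assert re.fullmatch(DATE_MASK, '32/01/1999') is None  # impossible day is rejected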
            if high_se is not None and low_se is not None:
                detection_df.loc[index, 'SE (D)'] = (low_se + high_se) / 2
            elif high_se is not None:
                detection_df.loc[index, 'SE (D)'] = high_se
            elif low_se is not None:
                detection_df.loc[index, 'SE (D)'] = low_se

    # Maximum Likelihood Spatial Capture-Recapture
    if pd.isnull(row['SE (D).1']):
        if pd.notnull(row['Estimated D.1']):
            high_se = None
            low_se = None
            if pd.notna(row['Upper CI (D).1']):
                high_se = high_ci_to_se(row['Upper CI (D).1'], row['Estimated D.1'])
            if pd.notnull(row['Lower CI (D).1']):
                low_se = low_ci_to_se(row['Lower CI (D).1'], row['Estimated D.1'])
            if high_se is not None and low_se is not None:
                detection_df.loc[index, 'SE (D)'] = (low_se + high_se) / 2
            elif high_se is not None:
                detection_df.loc[index, 'SE (D)'] = high_se
            elif low_se is not None:
                detection_df.loc[index, 'SE (D)'] = low_se
def notna(obj):
    if isinstance(obj, BasePandasDataset):
        return obj.notna()
    else:
        return pandas.notna(obj)
def interpolate_predict(method="index"): start = datetime.datetime.now() data = pd.read_hdf(data_path) final_result = pd.DataFrame() score_df = pd.DataFrame() score_df["var"] = var_col for i in tqdm(range(1, 34)): sub = data[data["wtid"] == i] score_temp = [] for var in var_col: sub1 = sub[pd.notna(sub[var])].reset_index(drop=True) index = 0 for index, t in enumerate(tool.types): if var in t: break col_name = str(index) + "_test" sub2 = sub1[[var]].copy() sub1.loc[sub1[col_name] == 1, var] = np.nan sub1[var] = sub1[var].interpolate(method=method) true_value = sub2[sub1[col_name] == 1][var] predict_value = sub1[sub1[col_name] == 1][var] if_round = False if var in category_col: predict_value = np.array(predict_value).astype(int) true_value = np.array(true_value).astype(int) score = tool.label_score(true_value, predict_value) else: score = tool.regression_score(true_value, predict_value) predict_value2 = np.round(predict_value, 2) score2 = tool.regression_score(true_value, predict_value2) if score < score2 - threshold: score = score2 if_round = 2 predict_value2 = np.round(predict_value, 1) score2 = tool.regression_score(true_value, predict_value2) if score < score2 - threshold: score = score2 if_round = 1 score_temp.append(score) # 预测结果 sub[var] = sub[var].interpolate(method=method) if if_round: sub[var] = np.round(sub[var], if_round) final_result = pd.concat((final_result, sub), axis=0, ignore_index=True) score_df[str(i)] = score_temp score_df.set_index("var", inplace=True) score_df = score_df.T score_df.reset_index(inplace=True) score_df.rename(columns={"index": "wtid"}, inplace=True) score_df.to_csv("./result/{}_score.csv".format(method), encoding="utf8", index=False, float_format='%.4f') final_result = final_result[final_result["count_miss"] > 0] final_result = final_result[head_col] final_result.sort_values(["wtid", "ts"], inplace=True) for var in category_col: final_result[var] = final_result[var].astype(int) final_result.to_csv("./result/{}_result.csv".format(method), encoding="utf8", index=False, float_format='%.2f') end = datetime.datetime.now() print("finish", method, "interpolate_predict time: ", end - start)
def make_identifiers(row):
    return {
        identifier: api.models.InstrumentIdValue(row[identifier])
        for identifier in identifiers if pd.notna(row[identifier])
    }
def qichacha(company_input_list, path, sheetname): company_count = len(company_input_list) company_progress = 0 # Find existing file try: company_scrapy_result = pd.read_excel(path, sheet_name=sheetname) # Remove breakpoint record # company_keyword_break = company_scrapy_result[company_scrapy_result['ID'] == 'breakpoint']['搜索词'] company_sourceid_break = company_scrapy_result[company_scrapy_result['ID'] == 'breakpoint']['Source_ID'].values[0] company_progress = len(company_scrapy_result['Source_ID'].unique().tolist()) company_scrapy_result = company_scrapy_result[company_scrapy_result['Source_ID'] != company_sourceid_break] company_done = company_scrapy_result['Source_ID'].unique().tolist() # if company_input_list[company_input_list['Company_Name_CN'] == company_keyword_break].empty == False: # company_input_break = np.array( # company_input_list[company_input_list['Company_Name_CN'] == company_keyword_break].index).tolist()[0] # else: # company_input_break = np.array( # company_input_list[company_input_list['Company_Name'] == company_keyword_break].index).tolist()[0] company_progress = len(company_input_list[company_input_list['Source_ID'].isin(company_done)]) company_input_list = company_input_list[~company_input_list['Source_ID'].isin(company_done)] print('Restart from breakpoint.') # First time running except: company_scrapy_result = pd.DataFrame() # columns = columnname) for index, row in company_input_list.iterrows(): company_progress = int(company_progress) + 1 if pd.notna(row['Company_Name_CN']): company_keyword = row['Company_Name_CN'] else: company_keyword = row['Company_Name'] company_sourceid = row['Source_ID'] # Search filter search_base = 'https://www.qichacha.com/search?key={}#' # Keyword print('---------', company_keyword, '----------') search_key = urllib.parse.quote(company_keyword) # Organization Type: 0:Company 1:Organization 3:HK Company 5:TW Company search_type = '&searchType=' # Searching Index: 2:Company_Name 4:Representative/Share holder 6:Management 8:Brand/Product 10:Connection(Address) search_index = '&index:2' # Province search_province = '&province:' # Fuzzy search for keyword time.sleep(random.randint(1, 2)) if pd.notna(row['State_Abbr']): search_province = search_province + row['State_Abbr'] search_url_keyword = search_base.format(search_key) + search_index + search_province + '&' else: search_url_keyword = search_base.format(search_key) + search_index + '&' # print(search_url_keyword) respond_keyword = requests.get(search_url_keyword, headers=search_headers) soup_keyword = BeautifulSoup(respond_keyword.text, 'lxml') company_info_list_flag = soup_keyword.find('span', attrs={'id': 'countOld'}) # Company details if company_info_list_flag != None and company_info_list_flag.span.text.strip() != '0': try: search_companys = soup_keyword.find('table', attrs={'class': 'm_srchList'}).tbody.find_all('td') step = 0 for company in search_companys: if step % 3 == 1: company_href = company.a['href'] search_url_company = 'https://www.qichacha.com' + company_href time.sleep(random.randint(0, 1)) respond_company = requests.get(search_url_company, headers=search_headers) soup_company = BeautifulSoup(respond_company.text, 'lxml') company_isforeign = False if (soup_company.find('div', attrs={'class': 'row title'}).h1 == None): # HongKong Company soup_company.find('div', attrs={'class': 'row title'}).span.extract() company_name = soup_company.find('div', attrs={'class': 'row title'}).text company_isforeign = True else: company_name = soup_company.find('div', attrs={'class': 'row 
title'}).h1.text company_id = re.findall(r'/firm_(.*).html', str(company_href))[0] # print(company_id, company_name) company_phone = '' company_website = '' company_email = '' company_address = '' for i in soup_company.find_all('span', attrs={'class': "cdes"}): if i.text == '电话:': if (i.next_sibling.span != None): company_phone = i.next_sibling.span.text # if (soup_company.find('span', attrs={'class': "cdes"}).next_sibling.span != None): # company_phone = soup_company.find('span', attrs={'class': "cdes"}).next_sibling.span.text if (soup_company.find('a', attrs={'onclick': "zhugeTrack('企业主页-企业头部-官网')"}) != None): company_website = soup_company.find('a', attrs={'onclick': "zhugeTrack('企业主页-企业头部-官网')"})[ 'href'] if (soup_company.find('a', attrs={'title': '发送邮件'}) != None): company_email = soup_company.find('a', attrs={'title': '发送邮件'}).text if (soup_company.find('a', attrs={'title': "查看地址"}) != None): company_address = soup_company.find('a', attrs={'title': "查看地址"}).text search_id = str(company_id) # str(company_sourceid) + '_' + print(company_name) # print('---------', company_name, '----------') # print('---------', company_id, '----------') if company_isforeign: company_info_data = [search_id, company_sourceid, company_keyword, company_name, company_id, company_phone, company_website, company_email, company_address, company_isforeign, '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''] company_info_data = dict(zip(columnname, company_info_data)) company_scrapy_result = company_scrapy_result.append(company_info_data, ignore_index=True) step += 1 continue company_info_list = soup_company.find_all('table', attrs={'class': 'ntable'})[1].find_all('tr') company_info_data = [] company_info_data.append(search_id) company_info_data.append(company_sourceid) company_info_data.append(company_keyword) company_info_data.append(company_name) company_info_data.append(company_id) company_info_data.append(company_phone) company_info_data.append(company_website) company_info_data.append(company_email) company_info_data.append(company_address) company_info_data.append(company_isforeign) for business_info in company_info_list[:-2]: company_info_data.append(business_info.find_all('td')[1].text.replace('\n', '').strip()) company_info_data.append(business_info.find_all('td')[3].text.replace('\n', '').strip()) # Business scope company_info_data.append(company_info_list[-1].find_all('td')[1].text.replace('\n', '').strip()) # Lawsuit count company_lawsuit = soup_company.find('a', attrs={'id': 'susong_title'}).span.text company_info_data.append(company_lawsuit) # Risk company_risk_info = soup_company.find('div', attrs={'class': 'risk-panel b-a'}) if (company_risk_info != None): company_risk_details = company_risk_info.find_all('span', attrs={'class': 'text-danger'}) company_risk_operation = soup_company.find('a', attrs={'id': 'fengxian_title'}).span.text company_info_data.append(company_risk_details[0].text) company_info_data.append(company_risk_details[1].text) company_info_data.append(company_risk_operation) # # Finance # company_name_encode = urllib.parse.quote(company_name) # search_url_finance = 'http://www.qichacha.com/company_getinfos?unique=' + company_id + '&companyname=' + company_name_encode + '&tab=run' # company_info_data.append(search_url_finance) # time.sleep(random.randint(2, 4)) # respond_finance = requests.get(search_url_finance,headers = search_headers) # soup_finance = BeautifulSoup(respond_finance.text,'lxml') # finance_info_list_flag = 
soup_finance.find('section',attrs = {'id':'V3_cwzl'}) # if finance_info_list_flag: # finance_info_list = finance_info_list_flag.find_all('td') # company_info_data.append(finance_info_list[1].text) # company_info_data.append(finance_info_list[3].text) # company_info_data.append(finance_info_list[5].text) # company_info_data.append(finance_info_list[7].text) # else: # company_info_data.append('') # company_info_data.append('') # company_info_data.append('') # company_info_data.append('') # # Anual_report # search_url_report = 'http://www.qichacha.com/company_getinfos?unique=' + company_id + '&companyname=' + company_name_encode + '&tab=report' # company_info_data.append(search_url_report) # time.sleep(random.randint(2, 4)) # respond_report = requests.get(search_url_report,headers = search_headers) # soup_report = BeautifulSoup(respond_report.text,'lxml') # report_info_list = soup_report.find('div',attrs = {'class':'tab-pane fade in active'}) # print(report_info_list) # report_info_list = report_info_list.find_all('td') # report_info_list_flag = 'N' # # for report in report_info_list: # if report.text == '城镇职工基本养老保险': # report_info_list_flag = 'Y' # # if report_info_list_flag == 'Y': # for report in report_info_list: # if report.text == '城镇职工基本养老保险': # company_info_data.append(report_info_list[report_info_list.index(report)+1].text) # print(report_info_list[report_info_list.index(report)+1].text) # if report.text == '职工基本医疗保险': # company_info_data.append(report_info_list[report_info_list.index(report)+1].text) # if report.text == '生育保险': # company_info_data.append(report_info_list[report_info_list.index(report)+1].text) # if report.text == '失业保险': # company_info_data.append(report_info_list[report_info_list.index(report)+1].text) # if report.text == '工伤保险': # company_info_data.append(report_info_list[report_info_list.index(report)+1].text) # else: # company_info_data.append('') # company_info_data.append('') # company_info_data.append('') # company_info_data.append('') # company_info_data.append('') company_info_data = dict(zip(columnname, company_info_data)) company_scrapy_result = company_scrapy_result.append(company_info_data, ignore_index=True) step += 1 except: # Need verification, set ID as 'breakpoint' company_info_data = ['breakpoint', company_sourceid, company_keyword, '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''] company_info_data = dict(zip(columnname, company_info_data)) company_scrapy_result = company_scrapy_result.append(company_info_data, ignore_index=True) print('Need Verification case 1!') print('Progress: {} %'.format(company_progress / company_count * 100)) break # Need verification, set ID as 'breakpoint' elif company_info_list_flag == None: company_info_data = ['breakpoint', company_sourceid, company_keyword, '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''] company_info_data = dict(zip(columnname, company_info_data)) company_scrapy_result = company_scrapy_result.append(company_info_data, ignore_index=True) print('Need Verification case 2!') print('Progress: {} %'.format(company_progress / company_count * 100)) break # No result return elif company_info_list_flag.span.text.strip() == '0': search_id = str(company_sourceid) # Column count 32 company_info_data = [search_id, company_sourceid, company_keyword, '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''] 
company_info_data = dict(zip(columnname, company_info_data)) company_scrapy_result = company_scrapy_result.append(company_info_data, ignore_index=True) return company_scrapy_result
import sys
sys.path.append('./libraries')

import pyfpgrowth
import pandas as pd
import numpy as np

data = pd.read_csv('./datasets/sepet.csv', header=None)

transactions = []
for d in data.values:
    tmp = []
    # print(pd.notna(d))
    for i in d:
        if pd.notna(i):
            tmp.append(i)
    transactions.append(tmp)

patterns = pyfpgrowth.find_frequent_patterns(transactions, 10)
rules = pyfpgrowth.generate_association_rules(patterns, 0.8)
def test_count(self):
    f = lambda s: notna(s).sum()
    self._check_stat_op('count', f, obj=self.panel4d, has_skipna=False)
def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True):
    """
    Try to parse a ndarray like into a column by inferring dtype.
    """
    # don't try to coerce, unless a force conversion
    if use_dtypes:
        if not self.dtype:
            if all(notna(data)):
                return data, False
            return data.fillna(np.nan), True
        elif self.dtype is True:
            pass
        else:
            # dtype to force
            dtype = (self.dtype.get(name) if isinstance(self.dtype, dict)
                     else self.dtype)
            if dtype is not None:
                try:
                    dtype = np.dtype(dtype)
                    return data.astype(dtype), True
                except (TypeError, ValueError):
                    return data, False

    if convert_dates:
        new_data, result = self._try_convert_to_date(data)
        if result:
            return new_data, True

    result = False

    if data.dtype == "object":
        # try float
        try:
            data = data.astype("float64")
            result = True
        except (TypeError, ValueError):
            pass

    if data.dtype.kind == "f":
        if data.dtype != "float64":
            # coerce floats to 64
            try:
                data = data.astype("float64")
                result = True
            except (TypeError, ValueError):
                pass

    # don't coerce 0-len data
    if len(data) and (data.dtype == "float" or data.dtype == "object"):
        # coerce ints if we can
        try:
            new_data = data.astype("int64")
            if (new_data == data).all():
                data = new_data
                result = True
        except (TypeError, ValueError, OverflowError):
            pass

    # coerce ints to 64
    if data.dtype == "int":
        # coerce floats to 64
        try:
            data = data.astype("int64")
            result = True
        except (TypeError, ValueError):
            pass

    return data, result
    )[0] in genome_ids:  # extracts the genome id and confirms it is a training sample
        spgene_files.append(filename)
        df = pd.read_csv(os.path.join(args.feature_folder, filename), dtype=str, sep='\t')
        df = df.loc[df['property'].isin(args.properties)]
        if feature_df is None:
            feature_df = df
        else:
            feature_df = pd.concat([feature_df, df], ignore_index=True)

# getting the set of classifications that will serve as a feature vector
classifications = list(
    set([
        feat for feat in list(feature_df['classification'])
        if pd.notna(feat)
    ]))
num_classifications = len(classifications)
print('Number of gene family classifications: ' + str(num_classifications))

# building our list of genomes and associated resistance label vector, feature vector
col_names = ['ID', 'Antibiotics', 'Phenotype', 'Annotations', 'Features']
samples = list()
for gen_id in genome_ids:
    sample = [
        gen_id, args.antibiotics, [0 for _ in range(len(args.antibiotics))],
        [False for _ in range(len(args.antibiotics))],
        [0.0 for _ in range(num_classifications)]
    ]
    # filling the label vector, noting if actually annotated in the data or default-filled
    'Deaths': ['sum'],
    'Recovered': ['sum'],
})
grouped.columns = ['Confirmed', 'Deaths', 'Recovered']
grouped = grouped.reset_index()
df_raw = grouped

# extract information from CSV
data = {
    'reportdate': date(year, month, day),
    'region': df_raw[key_region].str.strip(),
    'subregion': df_raw.where(pd.notna(df_raw[key_subregion]),
                              df_raw[key_region], axis=0)[key_subregion].str.strip(),
    'lat': df_raw['Lat'] if 'Lat' in cols else None,
    'lng': df_raw['Long_'] if 'Long_' in cols else None,
    'confirmed': df_raw['Confirmed'].fillna(0).astype('int32'),
    'deaths': df_raw['Deaths'].fillna(0).astype('int32'),
    'recovered': df_raw['Recovered'].fillna(0).astype('int32')
}

# append the content of this file
def dataSetCheck(): def isfloat(value): try: float(value) return True except: return False badCords = [] emptyRows = [] inconsitent = [] myDBChk = pd.read_excel(myFile, converters={ 'PS_NETWORK_KEY-Spatial': str, 'POWER_SUPPLY_NAME': str, 'Continuity PS Name': str, 'Mac Address': str, 'Good Latitude': str, 'Good Longitude': str, 'Status': str, 'Comment': str }) myDBChk = myDBChk.replace(r'^\s*$', np.nan, regex=True) badCords = np.where(((pd.isna(myDBChk['Good Latitude'])) ^ (pd.isna(myDBChk['Good Longitude']))) | ~(myDBChk['Good Latitude'].apply(isfloat)) | ~(myDBChk['Good Longitude'].apply(isfloat)))[0] filledRows = np.where((pd.notna(myDBChk['Status'])) | (pd.notna(myDBChk['Good Latitude']) & pd.isna(myDBChk['Good Longitude'])))[0] if len(filledRows) > 0: mask = np.full(len(myDBChk['Status']), False) for i in range(0, filledRows[len(filledRows) - 1]): mask[i] = True emptyRows = np.where(mask & (pd.isna(myDBChk['Status'])) & (pd.isna(myDBChk['Good Latitude']) | pd.isna(myDBChk['Good Longitude'])))[0] if len(emptyRows) > 0: print('Nothing filled in at rows:', emptyRows + 2) if checkInputFile: inputCSV = pd.read_csv('input.csv', converters={ 'Id_Info': str, 'Comment': str, 'New Name': str, 'New Mac': str }) inputCSV = inputCSV.replace(r'^\s*$', np.nan, regex=True) filledRows = np.where((pd.notna(inputCSV['Case'])) | (pd.notna(inputCSV['New Lat']) & pd.isna(inputCSV['New Long'])))[0] if len(filledRows) > 0: mask = np.full(len(inputCSV['Case']), False) for i in range(0, filledRows[len(filledRows) - 1]): mask[i] = True emptyRows = np.where(mask & (pd.isna(inputCSV['Case'])) & (pd.isna(inputCSV['New Lat']) | pd.isna(inputCSV['New Long'])))[0] if len(emptyRows) > 0: print('Nothing filled in input file at rows:', emptyRows + 2) if len(badCords) == 0: if checkInputFile: myDB = pd.read_excel(myFile, converters={ 'PS_NETWORK_KEY-Spatial': str, 'POWER_SUPPLY_NAME': str, 'Continuity PS Name': str, 'Mac Address': str, 'Good Latitude': float, 'Good Longitude': float, 'Status': str, 'Comment': str }) myDB = myDB.replace(r'^\s*$', np.nan, regex=True) inputCSV = pd.read_csv('input.csv', converters={ 'Id_Info': str, 'Comment': str, 'New Name': str, 'New Mac': str }) inputCSV = inputCSV.replace(r'^\s*$', np.nan, regex=True) inconsitent = np.where( (abs(myDB['Good Latitude'] - inputCSV['New Lat']) > 0.0001) | ((pd.isna(myDB['Good Latitude'])) ^ (pd.isna(inputCSV['New Lat']))) | (abs(myDB['Good Longitude'] - inputCSV['New Long']) > 0.0001) | ((pd.isna(myDB['Good Longitude'])) ^ (pd.isna(inputCSV['New Long']))) | ((myDB['Continuity PS Name'] != inputCSV['New Name']) & (myDB['Continuity PS Name'].notna() | inputCSV['New Name'].notna())) | ((myDB['Mac Address'] != inputCSV['New Mac']) & (myDB['Mac Address'].notna() | inputCSV['New Mac'].notna())))[0] if len(inconsitent) > 0: print('Inconsitent with input file at rows:', inconsitent + 2) else: print('Bad coordinate at rows:', shifted + 2) if len(badCords) == 0 and len(emptyRows) == 0 and len(inconsitent) == 0: print('Datasheet filled correctly')
plt.xlabel("hour") plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), ncol=5, fancybox=True, shadow=True) plt.xticks(np.arange(0, 24, step=1)) plt.show() ############# ############## import geopandas as gpd indx = pd.notna(df['District']) df_map = df.loc[indx].groupby(['District']).count()['ID'] data1 = [] for i in district: data1 = data1 + [[str(np.int64(i)), df_map[i]]] map_df = pd.DataFrame(np.array(data1), columns=['Distict', 'CrimeRatio']) map_df.Distict.apply(str) map_df["CrimeRatio"] = pd.to_numeric(map_df["CrimeRatio"]) map_df['Distict'] = map_df['Distict'].astype(str) import folium import json with open('Boundaries - Police Districts (current).geojson', 'r') as jasonfile:
def get_terms(version: str) -> Iterable[Term]: """Get ComplexPortal terms.""" df = get_df(version=version) df.rename(inplace=True, columns={ 'Aliases for complex': 'aliases', 'Identifiers (and stoichiometry) of molecules in complex': 'members', 'Taxonomy identifier': 'taxonomy_id', 'Cross references': 'xrefs', 'Description': 'definition', 'Recommended name': 'name', '#Complex ac': 'complexportal_id', }) df['aliases'] = df['aliases'].map(lambda s: s.split('|') if pd.notna(s) else []) df['members'] = df['members'].map(_parse_members) df['xrefs'] = df['xrefs'].map(_parse_xrefs) taxnomy_id_to_name = get_id_name_mapping('ncbitaxon') df['taxonomy_name'] = df['taxonomy_id'].map(taxnomy_id_to_name.get) slim_df = df[[ 'complexportal_id', 'name', 'definition', 'aliases', 'xrefs', 'taxonomy_id', 'taxonomy_name', 'members', ]] it = tqdm(slim_df.values, total=len(slim_df.index), desc=f'mapping {PREFIX}') unhandled_xref_type = set() for complexportal_id, name, definition, aliases, xrefs, taxonomy_id, taxonomy_name, members in it: synonyms = [Synonym(name=alias) for alias in aliases] _xrefs = [] provenance = [] for reference, note in xrefs: if note == 'identity': _xrefs.append(reference) elif note == 'see-also' and reference.prefix == 'pubmed': provenance.append(reference) elif (note, reference.prefix) not in unhandled_xref_type: logger.debug( f'unhandled xref type: {note} / {reference.prefix}') unhandled_xref_type.add((note, reference.prefix)) term = Term( reference=Reference(prefix=PREFIX, identifier=complexportal_id, name=name), definition=definition.strip() if pd.notna(definition) else None, synonyms=synonyms, xrefs=_xrefs, provenance=provenance, ) term.set_species(identifier=taxonomy_id, name=taxonomy_name) for reference, _count in members: term.append_relationship(has_part, reference) yield term
def validate_email(contact, company): eformat = False esuffix = False epersonal = False edomain = False edup = False suffix = [r'\.com$', r'\.cn$', r'\.org$', r'\.net$', r'\.cc$', r'\.uk$', r'\.fr$', r'\.hk$', r'\.tw$', r'\.au$', r'\.jp$', r'\.sg$'] personal = ['@gmail.com', '@hotmail.com', '@yahoo.com', '@sina.com', '@vip.sina.com', '@163.com', '@126.com', '@qq.com', '@vip.qq.com', '@139.com'] if pd.notna(contact['Email']): # Lower and no space email = contact['Email'].lower().replace(' ', '') else: echeck = eformat and esuffix and (epersonal or edomain) contact['ve_Email_Format'] = eformat contact['ve_Email_Suffix'] = esuffix contact['ve_Email_Domain'] = epersonal or edomain contact['ve_Email_Check'] = echeck contact['Reject_Reason'] = contact['Reject_Reason'] + 'No Email; ' return contact # TODO: Email format check # Email must contain @ if '@' in email: eformat = True else: contact['Reject_Reason'] = contact['Reject_Reason'] + 'Email without @; ' # Email suffix check for s in suffix: if re.search(re.compile(s, re.I), email) is not None: esuffix = True break if not esuffix: contact['Reject_Reason'] = contact['Reject_Reason'] + 'Email invalid suffix; ' # Email personal check for p in personal: if p in email: epersonal = True break # Email domain check domain = None if not company.empty: if pd.notna(company['Website']).bool(): company_website = company['Website'].values[0] domain = company_website.split('.')[1] elif pd.notna(company['Email']).bool(): company_email = company['Email'].values[0] domain = company_email.split('@')[1].split('.')[0] for p in personal: if p in company_email: domain = None break if domain is not None: if domain in email: edomain = True else: contact['Reject_Reason'] = contact['Reject_Reason'] + 'Email domain not match; ' else: edomain = True else: contact['Reject_Reason'] = contact['Reject_Reason'] + 'Company under review; ' # Email check echeck = eformat and esuffix and (epersonal or edomain) contact['ve_Email_Format'] = eformat contact['ve_Email_Suffix'] = esuffix contact['ve_Email_Domain'] = epersonal or edomain contact['ve_Email_Check'] = echeck return contact
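An illustrative sketch (helper name and sample addresses are assumptions) of the domain comparison above: the expected domain token comes from the company website (the segment after 'www.') or from the company e-mail (the part between '@' and the first dot), and is then looked for in the contact address.

def expected_domain(website=None, email=None):
    # 'www.example.com' -> 'example'; assumes a 'www.'-style prefix, as the snippet does
    if website:
        return website.split('.')[1]
    # 'sales@example.com' -> 'example'
    if email:
        return email.split('@')[1].split('.')[0]
    return None

print(expected_domain(website='www.example.com'))   # example
print(expected_domain(email='sales@example.com'))   # example
print('example' in 'j.doe@example.com')             # True -> edomain would be set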
def label_duplicate_links(in_file, node1_idx_reverse = [1,0],\ src_equals_dest_idx = 8, experiment_str='all_links_duplicate_clustered'): ''' This function finds all the symmetric(also called 'duplicate' here) links B-A for the link A-B. Deletes all the identical links Input: ----- in_file: CSV file. node1_idx_reverse: index B-A for A-B Output: ------ out_file_path: is the path of the csv with the duplicates assigned same label. And Identical links removed ''' df = pd.read_csv(in_file) df_mat = df.values new_df = [] i = 0 label = 0 node1_idx = node1_idx_reverse + list(range(len(node1_idx_reverse),\ len(df.columns))) # Instead of all the columns, focus just on the relevant indices # node1_idx = node1_idx_reverse for idx1 in range(df_mat.shape[0]): found_duplicate = False found_identical = False if pd.notna(df_mat[idx1,node1_idx_reverse[0]]) or\ pd.notna(df_mat[idx1,node1_idx_reverse[int(len(node1_idx_reverse)/2)]]): for idx2 in range(idx1 + 1, df_mat.shape[0]): # Delete Identical links if (df_mat[idx1,:] == df_mat[idx2,:]).all(): # and \ df_mat[idx2,:] = [np.nan]*len(df_mat[idx2,:]) found_identical = True # Keep one copy of indentically symmetrical links and remove all others # If All the columns need to be considered # if (df_mat[idx1,node1_idx] == df_mat[idx2,:]).all(): # If only the ABIDE columns need to be considered if (df_mat[idx1,node1_idx_reverse] == df_mat[idx2,sorted(node1_idx_reverse)]).all(): i = i+1 if not found_duplicate: label = label + 1 new_df.append(np.append(df_mat[idx1,:], label)) print('-----------------------------------------------') print(i,':',np.append(df_mat[idx1,:], label)) if experiment_str != 'no_duplicates_others_clustered': i = i+1 new_df.append(np.append(df_mat[idx2,:], label)) print(i,':',np.append(df_mat[idx2,:], label)) print('-----------------------------------------------') df_mat[idx2,:] = [np.nan]*len(df_mat[idx2,:]) found_duplicate = True else: # So that the symmetric links does not come again and again if experiment_str != 'no_duplicates_others_clustered': i = i+1 new_df.append(np.append(df_mat[idx2,:], label)) print(i,':',np.append(df_mat[idx2,:], label)) df_mat[idx2,:] = [np.nan]*len(df_mat[idx2,:]) # If no symmetric links are found if (not found_duplicate) or found_identical: _label = None # Assign a label and append to df if experiment_str == 'all_links_duplicate_clustered': label = label + 1 _label = label elif experiment_str == 'all_links_duplicate_clustered_others_clustered' or\ experiment_str == 'no_duplicates_others_clustered': _label = 'Single' new_df.append(np.append(df_mat[idx1,:], _label)) i = i + 1 print(i,': Single',np.append(df_mat[idx1,:], _label)) in_file_name = os.path.splitext(in_file)[0] out_file_path = in_file_name + '_' + experiment_str + '.csv' new_df = np.array(new_df) new_df = pd.DataFrame(data=new_df, columns=np.append(df.columns, 'Link_Label')) new_df.to_csv(out_file_path,index=False) return out_file_path, new_df
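A simplified, standalone sketch of the symmetric-link idea above (not the full labelling routine): a link (B, A) counts as the duplicate of (A, B) when the node columns of one row equal the reversed node columns of another. Column names here are illustrative.

import pandas as pd

links = pd.DataFrame({'node1': ['A', 'B', 'C'],
                      'node2': ['B', 'A', 'D']})
pairs = []
for i in range(len(links)):
    for j in range(i + 1, len(links)):
        # Reverse the node columns of row j and compare with row i.
        if (links.loc[i, ['node1', 'node2']].values ==
                links.loc[j, ['node2', 'node1']].values).all():
            pairs.append((i, j))
print(pairs)  # [(0, 1)] -> rows 0 and 1 are symmetric duplicates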
key_journal_id, 'Rank', 'NormalizedName', 'DisplayName', key_issn, 'Publisher', 'Webpage', 'PaperCount', 'CitationCount', 'CreatedDate' ] client = MongoClient('localhost', 27017) db = client['mag'] collection = db['jour'] cnt = 0 chunksize = 10**6 for chunk in pd.read_csv(filename, names=header, sep='\t', chunksize=chunksize): data = [] for key, val in chunk.iterrows(): journal_id = val[key_journal_id] issn = val[key_issn] if pd.notna(issn): current = {key_journal_id: journal_id, key_issn: issn} data.append(current) if data: collection.insert_many(data) print(cnt) cnt += 1
{ True: 0, False: 1 }) #print(stock_plus_tweet) stock_plus_tweet['Output'] = stock_plus_tweet['Output'].fillna( method='backfill') stock_plus_tweet['EMA5'] = stock_plus_tweet['EMA5'].fillna( method='backfill') stock_plus_tweet['EMA10'] = stock_plus_tweet['EMA10'].fillna( method='backfill') stock_plus_tweet['EMA20'] = stock_plus_tweet['EMA20'].fillna( method='backfill') stock_plus_tweet = stock_plus_tweet[pd.notna(stock_plus_tweet['Output'])] stock_plus_tweet = stock_plus_tweet[pd.notna(stock_plus_tweet['text'])] number_of_tweets = stock_plus_tweet.groupby('date').count() number_of_tweets['numTweets'] = number_of_tweets['text'] number_of_tweets = number_of_tweets['numTweets'] stock_plus_tweet = pd.merge(stock_plus_tweet, number_of_tweets, how='left', on='date') stock_plus_tweet = stock_plus_tweet[[ 'date', 'time', 'retweet_count', 'neg', 'neu', 'pos', 'cmpd', 'IsTradingDay', 'is_retweet', 'numTweets', 'EMA5', 'EMA10', 'EMA20',
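A minimal sketch of the per-date tweet count that is merged back above; the column names are assumptions. After the merge, every tweet row carries the total number of tweets for its day.

import pandas as pd

tweets = pd.DataFrame({'date': ['2020-01-01', '2020-01-01', '2020-01-02'],
                       'text': ['a', 'b', 'c']})
# count rows per day, then merge the count back so every tweet row has numTweets
counts = tweets.groupby('date')['text'].count().rename('numTweets').reset_index()
tweets = pd.merge(tweets, counts, how='left', on='date')
print(tweets)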
def process_args(api, args): aliases = { "CINT": "ClientInternal", "FIGI": "Figi", "RIC": "P:Instrument/default/RIC", "TICKER": "P:Instrument/default/Ticker", "ISIN": "P:Instrument/default/Isin", } if args.input: df = pd.concat( [ lpt.read_input(input_file, dtype=str) for input_file in args.input ], ignore_index=True, sort=False, ) if args.mappings: df.rename( columns=dict([(s[1], aliases.get(s[0], s[0])) for s in [m.split("=") for m in args.mappings]]), inplace=True, ) prop_keys = [col for col in df.columns.values if col.startswith("P:")] identifiers = [ col for col in df.columns.values if col in args.identifiers ] # Identifiers have to be unique df = df.drop_duplicates(identifiers) def make_identifiers(row): return { identifier: api.models.InstrumentIdValue(row[identifier]) for identifier in identifiers if pd.notna(row[identifier]) } def make_properties(row): return [ api.models.ModelProperty(key[2:], api.models.PropertyValue(row[key])) for key in prop_keys if pd.notna(row[key]) ] def success(r): df = lpt.to_df([err[1] for err in r.content.failed.items()], ["id", "detail"]) df.columns = ["FAILED-INSTRUMENT", "ERROR"] return lpt.trim_df(df, args.limit, sort="FAILED-INSTRUMENT") has_lookthrough = LT_SCOPE in df.columns.values requests = [ api.models.InstrumentDefinition( row["name"], make_identifiers(row), make_properties(row), api.models.ResourceId(row[LT_SCOPE], row[LT_CODE]) if (has_lookthrough and pd.notna(row[LT_SCOPE])) else None, ) for idx, row in df.iterrows() ] # Convert valid requests to dictionary def make_key(r): sec_id = list(r.identifiers.items())[0] return "{}:{}".format(sec_id[0], sec_id[1].value) requests = { make_key(r): r for r in requests if len(r.identifiers.keys()) > 0 } if args.test: lpt.display_df(df[identifiers + prop_keys + ["name"]]) print(requests) exit() return api.call.upsert_instruments(instruments=requests).bind(success)
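A hedged, standalone sketch of the pattern used by make_identifiers / make_properties above: build a dict from only those columns of a row that actually hold a value, using pd.notna. The column names are assumptions and the SDK model wrappers are omitted.

import pandas as pd

row = pd.Series({'ClientInternal': 'ABC123', 'Figi': None,
                 'P:Instrument/default/Ticker': 'ACME'})
identifiers = ['ClientInternal', 'Figi']
# keep only the identifiers that are populated on this row
ids = {identifier: row[identifier]
       for identifier in identifiers
       if pd.notna(row[identifier])}
print(ids)  # {'ClientInternal': 'ABC123'}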
# import semantic type frequencies with open('semanticTypes.pickle', 'rb') as handle: sty = pickle.load(handle) # Extract defined relations among semantic types srstr_sty = srstr.loc[:542] srstr_sty.columns = ["STY1", "RL", "STY2", "LS"] srstr_sty_d = srstr_sty[srstr_sty.LS == "D"] srstr_sty_b = srstr_sty[srstr_sty.LS == "B"] srstr_sty_dni = srstr_sty[srstr_sty.LS == "DNI"] # Create multi-directed-graphs with full SRSTR relations and only isa relations srstr_trees = nx.MultiDiGraph(name='SRSTR defined tree (only isa relation)') for index, row in srstr_sty_d.iterrows(): if pd.notna(row['STY2']): # Disconnect topmost nodes from '' if row['RL'] == 'isa': srstr_trees.add_edge(row['STY1'], row['STY2'], relation=row['RL']) color_dict = dict((el, None) for el in sty) current_level = ['Event'] current_color = [255, 0, 0, 1.0] while len(current_level) != 0: next_level = [] for node in current_level: color_dict[node] = 'rgba' + str(tuple(current_color)) next_level += list(srstr_trees.predecessors(node)) current_color[1] += 45 current_level = next_level current_level = ['Entity']
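A standalone sketch of the level-by-level colouring above: starting from a root, the 'isa' tree is walked towards its predecessors, assigning one colour per level. The node names and the colour step are illustrative assumptions.

import networkx as nx

tree = nx.MultiDiGraph()
tree.add_edge('Physical Object', 'Entity', relation='isa')
tree.add_edge('Conceptual Entity', 'Entity', relation='isa')

color = {}
level, rgba = ['Entity'], [255, 0, 0, 1.0]
while level:
    nxt = []
    for node in level:
        color[node] = 'rgba' + str(tuple(rgba))
        nxt += list(tree.predecessors(node))
    rgba[1] += 45   # shift the green channel for the next level down
    level = nxt
print(color)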
def top_predict(): data = pd.read_hdf(data_path) score_df = pd.DataFrame() score_df["var"] = [i for i in var_col] final_result = pd.DataFrame() start = datetime.datetime.now() for wtid in tqdm(range(1, 34)): use_data = data[data["wtid"] == wtid].copy() test_scores = [] for var in var_col: train_data = use_data[pd.notna(use_data[var])] predict_data = use_data[pd.isna(use_data[var])] index = 0 for index, t in enumerate(tool.types): if var in t: break test_label_col = str(index) + "_test" train_feature = train_data[train_data[test_label_col] == 0] top_values = train_feature[var].value_counts().index test_feature = train_data[train_data[test_label_col] == 1] test_y = np.array(test_feature[var]) # fill with the most frequent value test_pred = np.array([top_values[0]] * len(test_y)) predict_y = np.array([top_values[0]] * len(predict_data)) if var in category_col: test_score = tool.label_score(test_y, test_pred) else: test_score = tool.regression_score(test_y, test_pred) # also check the second most frequent value if test_score > 0.1 and len(top_values) > 1: test_pred2 = [top_values[1]] * len(test_y) if var in category_col: test_score2 = tool.label_score(test_y, test_pred2) else: test_score2 = tool.regression_score(test_y, test_pred2) if test_score2 > test_score: test_score = test_score2 predict_y = np.array([top_values[1]] * len(predict_data)) test_scores.append(test_score) use_data.loc[predict_data.index, var] = predict_y score_df[str(wtid)] = test_scores final_result = pd.concat( (final_result, use_data[use_data["count_miss"] > 0]), axis=0, ignore_index=True) final_result = final_result[head_col] final_result.sort_values(["wtid", "ts"], inplace=True) final_result.to_csv("./result/top_result.csv", encoding="utf8", index=False, float_format='%.2f') score_df.set_index("var", inplace=True) score_df = score_df.T score_df.reset_index(inplace=True) score_df.rename(columns={"index": "wtid"}, inplace=True) score_df.to_csv("./result/top_score.csv", encoding="utf8", index=False, float_format='%.4f') end = datetime.datetime.now() print("finish top_predict time: ", end - start, "\n")
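A minimal sketch of the fallback imputation above: missing values in a column are filled with the most frequent observed value; value_counts() is sorted by frequency, so index[0] is the mode and index[1] is the second candidate checked in the snippet. The data is illustrative.

import pandas as pd

s = pd.Series([7.0, 7.0, 7.0, 3.0, None, None])
top_values = s.value_counts().index
print(top_values[0])                    # 7.0, the most frequent value
print(s.fillna(top_values[0]).tolist()) # [7.0, 7.0, 7.0, 3.0, 7.0, 7.0]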
def __init__(self, row): self.top_30ds_quantity = float(row["top_30ds_quantity"]) self.top_90ds_quantity = float(row["top_90ds_quantity"]) self.edlp_unit_price = float(row["edlp_unit_price"]) self.bsd_unit_price = float(row["bsd_unit_price"]) self.hd_unit_price = float(row["hd_unit_price"]) self.edlp_fixed_price = float(row["edlp_fixed_price"]) self.bsd_fixed_price = float(row["bsd_fixed_price"]) self.hd_fixed_price = float(row["hd_fixed_price"]) self.bh01_mac_price = float(row["bh01_mac_price"]) self.bh02_mac_price = float(row["bh02_mac_price"]) self.bh03_mac_price = float(row["bh03_mac_price"]) self.wmt_mac_price = float(row["wmt_mac_price"]) self.hd_syr_mac_price = float(row["hd_syr_mac_price"]) self.bh01_dispensing_fee = float(row["bh01_dispensing_fee"]) self.bh02_dispensing_fee = float(row["bh02_dispensing_fee"]) self.bh03_dispensing_fee = float(row["bh03_dispensing_fee"]) self.wmt_dispensing_fee = float(row["wmt_dispensing_fee"]) self.hd_syr_dispensing_fee = float(row["hd_syr_dispensing_fee"]) self.wmt_2018_11_28_qty1 = float(row["wmt_2018_11_28_qty1"]) self.wmt_2018_11_28_qty2 = float(row["wmt_2018_11_28_qty2"]) self.wmt_2018_11_28_price1 = float(row["wmt_2018_11_28_price1"]) self.wmt_2018_11_28_price2 = float(row["wmt_2018_11_28_price2"]) self.wmt_2018_11_28_flg = float(row["wmt_2018_11_28_flg"]) self.last_30ds_qty = float(row["last_30ds_qty"]) self.last_90ds_qty = float(row["last_90ds_qty"]) # min_grx_30ds = float(row["min_grx_30ds"]) # min_grx_90ds = float(row["min_grx_90ds"]) self.min_major_retail_grx_30ds = float( row["min_major_retail_grx_30ds"]) self.min_major_retail_grx_90ds = float( row["min_major_retail_grx_90ds"]) # min_retail_grx_30ds = float(row["min_retail_grx_30ds"]) # min_retail_grx_90ds = float(row["min_retail_grx_90ds"]) # ltd_30_day_scripts = float(row["ltd_30_day_scripts"]) # ltd_90_day_scripts = float(row["ltd_90_day_scripts"]) # ltd_30_day_scripts_pct = float(row["ltd_30_day_scripts_pct"]) # ltd_90_day_scripts_pct = float(row["ltd_90_day_scripts_pct"]) # r30_30_day_scripts = float(row["r30_30_day_scripts"]) # r30_90_day_scripts = float(row["r30_90_day_scripts"]) # r30_30_day_script_pct = float(row["r30_30_day_script_pct"]) # r30_90_day_script_pct = float(row["r30_90_day_script_pct"]) self.fills = float(row["fills"]) self.margin = float(row["margin"]) self.orders = float(row["orders"]) self.revenue = float(row["revenue"]) self.users = float(row["users"]) self.default_quantity = float(row["default_quantity"]) self.pharmacy_network_id = float(row["pharmacy_network_id"]) if self.pharmacy_network_id == 1: self.mac_unit_price = 0.45 * self.bh01_mac_price + 0.3 * self.wmt_mac_price + 0.175 * self.bh03_mac_price + 0.075 * self.bh02_mac_price self.mac_fixed_price = 0.45 * self.bh01_dispensing_fee + 0.3 * self.wmt_dispensing_fee + 0.175 * self.bh03_dispensing_fee + 0.075 * self.bh02_dispensing_fee self.sale_fixed_price = self.edlp_fixed_price self.sale_unit_price = self.edlp_unit_price if self.pharmacy_network_id == 2: self.mac_unit_price = self.bh01_mac_price self.mac_fixed_price = self.bh01_dispensing_fee self.sale_fixed_price = self.bsd_fixed_price self.sale_unit_price = self.bsd_unit_price if self.pharmacy_network_id == 3: self.mac_unit_price = self.hd_syr_mac_price self.mac_fixed_price = self.hd_syr_dispensing_fee self.sale_fixed_price = self.hd_fixed_price self.sale_unit_price = self.hd_unit_price # For Default Quantity self.sale_price_30ds_qty = self.sale_unit_price * self.last_30ds_qty + self.sale_fixed_price self.mac_price_30ds_qty = self.mac_unit_price * 
self.last_30ds_qty + self.mac_fixed_price self.sale_price_wmt_qty1 = self.sale_unit_price * self.wmt_2018_11_28_qty1 + self.sale_fixed_price self.sale_price_wmt_qty2 = self.sale_unit_price * self.wmt_2018_11_28_qty2 + self.sale_fixed_price self.mac_price_wmt_qty1 = self.mac_unit_price * self.wmt_2018_11_28_qty1 + self.mac_fixed_price self.mac_price_wmt_qty2 = self.mac_unit_price * self.wmt_2018_11_28_qty2 + self.mac_fixed_price self.inConsideration = row['in_consideration'] self.isMarginPositive = self.mac_price_30ds_qty < self.sale_price_30ds_qty self.isCompetitive = self.sale_price_30ds_qty < self.min_major_retail_grx_30ds if pd.notna( self.last_30ds_qty) else False self.walmartDrugGroup = True if pd.notna( self.wmt_2018_11_28_flg) else False self.pricesChanged = False self.newer_unit_price = self.sale_unit_price self.newer_fixed_price = self.sale_fixed_price self.comment = 'No Change'
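A worked example of the network-1 blend above: the MAC unit price is a weighted average of four pharmacy MAC prices with weights 0.45 + 0.30 + 0.175 + 0.075 = 1.0. The price values here are made up for illustration.

weights = {'bh01': 0.45, 'wmt': 0.30, 'bh03': 0.175, 'bh02': 0.075}
prices = {'bh01': 0.50, 'wmt': 0.42, 'bh03': 0.55, 'bh02': 0.60}   # illustrative MAC prices
blended = sum(weights[k] * prices[k] for k in weights)
print(blended)  # ~0.492, a convex combination of the four prices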
def _check_moment_func( self, static_comp, name, raw, has_min_periods=True, has_center=True, has_time_rule=True, fill_value=None, zero_min_periods_equal=True, **kwargs, ): # inject raw if name == "apply": kwargs = copy.copy(kwargs) kwargs["raw"] = raw def get_result(obj, window, min_periods=None, center=False): r = obj.rolling(window=window, min_periods=min_periods, center=center) return getattr(r, name)(**kwargs) series_result = get_result(self.series, window=50) assert isinstance(series_result, Series) tm.assert_almost_equal(series_result.iloc[-1], static_comp(self.series[-50:])) frame_result = get_result(self.frame, window=50) assert isinstance(frame_result, DataFrame) tm.assert_series_equal( frame_result.iloc[-1, :], self.frame.iloc[-50:, :].apply(static_comp, axis=0, raw=raw), check_names=False, ) # check time_rule works if has_time_rule: win = 25 minp = 10 series = self.series[::2].resample("B").mean() frame = self.frame[::2].resample("B").mean() if has_min_periods: series_result = get_result(series, window=win, min_periods=minp) frame_result = get_result(frame, window=win, min_periods=minp) else: series_result = get_result(series, window=win) frame_result = get_result(frame, window=win) last_date = series_result.index[-1] prev_date = last_date - 24 * offsets.BDay() trunc_series = self.series[::2].truncate(prev_date, last_date) trunc_frame = self.frame[::2].truncate(prev_date, last_date) tm.assert_almost_equal(series_result[-1], static_comp(trunc_series)) tm.assert_series_equal( frame_result.xs(last_date), trunc_frame.apply(static_comp, raw=raw), check_names=False, ) # excluding NaNs correctly obj = Series(randn(50)) obj[:10] = np.NaN obj[-10:] = np.NaN if has_min_periods: result = get_result(obj, 50, min_periods=30) tm.assert_almost_equal(result.iloc[-1], static_comp(obj[10:-10])) # min_periods is working correctly result = get_result(obj, 20, min_periods=15) assert isna(result.iloc[23]) assert not isna(result.iloc[24]) assert not isna(result.iloc[-6]) assert isna(result.iloc[-5]) obj2 = Series(randn(20)) result = get_result(obj2, 10, min_periods=5) assert isna(result.iloc[3]) assert notna(result.iloc[4]) if zero_min_periods_equal: # min_periods=0 may be equivalent to min_periods=1 result0 = get_result(obj, 20, min_periods=0) result1 = get_result(obj, 20, min_periods=1) tm.assert_almost_equal(result0, result1) else: result = get_result(obj, 50) tm.assert_almost_equal(result.iloc[-1], static_comp(obj[10:-10])) # window larger than series length (#7297) if has_min_periods: for minp in (0, len(self.series) - 1, len(self.series)): result = get_result(self.series, len(self.series) + 1, min_periods=minp) expected = get_result(self.series, len(self.series), min_periods=minp) nan_mask = isna(result) tm.assert_series_equal(nan_mask, isna(expected)) nan_mask = ~nan_mask tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) else: result = get_result(self.series, len(self.series) + 1) expected = get_result(self.series, len(self.series)) nan_mask = isna(result) tm.assert_series_equal(nan_mask, isna(expected)) nan_mask = ~nan_mask tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) # check center=True if has_center: if has_min_periods: result = get_result(obj, 20, min_periods=15, center=True) expected = get_result( pd.concat([obj, Series([np.NaN] * 9)]), 20, min_periods=15 )[9:].reset_index(drop=True) else: result = get_result(obj, 20, center=True) expected = get_result(pd.concat([obj, Series([np.NaN] * 9)]), 20)[ 9: ].reset_index(drop=True) tm.assert_series_equal(result, 
expected) # shifter index s = ["x{x:d}".format(x=x) for x in range(12)] if has_min_periods: minp = 10 series_xp = ( get_result( self.series.reindex(list(self.series.index) + s), window=25, min_periods=minp, ) .shift(-12) .reindex(self.series.index) ) frame_xp = ( get_result( self.frame.reindex(list(self.frame.index) + s), window=25, min_periods=minp, ) .shift(-12) .reindex(self.frame.index) ) series_rs = get_result( self.series, window=25, min_periods=minp, center=True ) frame_rs = get_result( self.frame, window=25, min_periods=minp, center=True ) else: series_xp = ( get_result( self.series.reindex(list(self.series.index) + s), window=25 ) .shift(-12) .reindex(self.series.index) ) frame_xp = ( get_result( self.frame.reindex(list(self.frame.index) + s), window=25 ) .shift(-12) .reindex(self.frame.index) ) series_rs = get_result(self.series, window=25, center=True) frame_rs = get_result(self.frame, window=25, center=True) if fill_value is not None: series_xp = series_xp.fillna(fill_value) frame_xp = frame_xp.fillna(fill_value) tm.assert_series_equal(series_xp, series_rs) tm.assert_frame_equal(frame_xp, frame_rs)
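A hedged sketch of the centring check in the test above, instantiated for a rolling mean: a centred window of 20 should match the un-centred result computed on the series padded with (window - 1) // 2 = 9 trailing NaNs and then re-aligned, which is exactly what the test asserts.

import numpy as np
import pandas as pd

obj = pd.Series(np.random.randn(50))
result = obj.rolling(window=20, min_periods=15, center=True).mean()
expected = (
    pd.concat([obj, pd.Series([np.nan] * 9)])
    .rolling(window=20, min_periods=15)
    .mean()
    .iloc[9:]
    .reset_index(drop=True)
)
pd.testing.assert_series_equal(result, expected)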