import collections

import numpy as np
import pandas as pd

# Project-local dependencies assumed present in this module's namespace
# (their imports were not shown in the original): pstr_trans, aw_manip,
# daf_dup_diag, oc, and the helpers to_unicode_or_bust, toascii, strip_kw,
# lower_series, has_columns, assert_dependencies,
# get_first_item_contained_in_intersection_of.


def process_text_for_word_count(text):
    """
    Preprocesses the text before it is fed to the tokenizer.

    This is where we should put things like lower-casing the text, casting
    letters to "simple" ("ascii", "non-accentuated") letters, replacing some
    common strings (such as "bed and breakfast", "New York") by single-token
    representatives (such as "b&b", "new_york"), and whatever else needs to
    be done before tokens are retrieved from the text.
    """
    return toascii(to_unicode_or_bust(text)).lower()
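# Illustrative usage (not part of the original module), assuming
# to_unicode_or_bust() decodes bytes to unicode and toascii() transliterates
# accented characters:
#
#   >>> process_text_for_word_count(u'Bed and Breakfast in Z\xfcrich')
#   u'bed and breakfast in zurich'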
def kw_str(keyword):
    """
    Produces a kw_str version of the input keyword (or list of keywords),
    i.e. lower-casing, ascii-casting, and strip_kw are applied.
    """
    if isinstance(keyword, basestring):
        return str(
            strip_kw(
                pstr_trans.lower(
                    pstr_trans.toascii(
                        pstr_trans.to_unicode_or_bust(keyword)))))
    else:
        return map(
            lambda x: str(
                strip_kw(
                    pstr_trans.lower(
                        pstr_trans.toascii(
                            pstr_trans.to_unicode_or_bust(x))))),
            keyword)
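# Illustrative usage (not part of the original module); the exact output
# depends on what strip_kw() removes (assumed here to strip match-type
# punctuation such as brackets and plus signs):
#
#   >>> kw_str(u'[New York Hotel]')
#   'new york hotel'
#   >>> kw_str([u'+cheap +flights', u'Z\xfcrich B&B'])
#   ['cheap flights', 'zurich b&b']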
def kw_dup_diagnosis(df,
                     grp_keys=['match_type'],  # or ['match_type', 'ad_group', 'campaign']?
                     grp_fun_dict={'dups': lambda x: len(x)},
                     grp_id_name='grp_id',
                     grp_id_type='int',
                     output_nondup_df=False):
    """
    Diagnoses keyword duplication: flags groups of keywords that collide once
    progressively normalized (stripped, lower-cased, ascii-cast, and, for
    Broad match types, word-order normalized).
    """
    dup_df_dict = dict()
    grp_keys = oc.intersect(df.columns, grp_keys) + ['kw_representative']
    # work on a copy so the input df is not mutated
    # (can be handled differently if we need to spare memory)
    df = df.copy()
    # change all keyword strings to unicode
    df.keyword = df.keyword.apply(lambda x: to_unicode_or_bust(x))

    # util function (returns a dataframe containing grp_id and dups of a df)
    def _get_grp_id_and_dups(df):
        """
        Makes grp_id and dups duplication-info columns and returns only those
        rows with dups > 1.
        NOTE: Not meant to be used externally, only by kw_dup_diagnosis().
        """
        df = daf_dup_diag.ad_group_info_cols(df,
                                             grp_keys=grp_keys,
                                             grp_fun_dict=grp_fun_dict,
                                             grp_id_name=grp_id_name,
                                             grp_id_type=grp_id_type)
        if len(df) > 0:
            return df[['grp_id', 'dups']][df.dups > 1]
        else:
            # return an empty dataframe, but with the usual columns
            # (necessary for the joins further down)
            return pd.DataFrame(columns=['grp_id', 'dups'])

    # make a kw_representative column where the successive
    # "group representatives" will be placed
    df['kw_representative'] = df['keyword']
    # get the kw_stripped duplicates
    df['kw_representative'] = aw_manip.strip_kw(df['kw_representative'])
    dup_df_dict['strip'] = _get_grp_id_and_dups(df)
    # get the kw_lower duplicates
    df['kw_representative'] = pstr_trans.lower(df['kw_representative'])
    dup_df_dict['lower'] = _get_grp_id_and_dups(df)
    # get the ascii duplicates
    df['kw_representative'] = pstr_trans.toascii(df['kw_representative'])
    dup_df_dict['ascii'] = _get_grp_id_and_dups(df)
    # get the word-order duplicates (only for Broads)
    d = df[df.match_type == 'Broad'].copy()  # copy to avoid writing to a view
    d['kw_representative'] = aw_manip.order_words(d['kw_representative'])
    dup_df_dict['order'] = _get_grp_id_and_dups(d)
    # join all of this together
    d = dup_df_dict['strip'].join(dup_df_dict['lower'],
                                  how='outer', lsuffix='_strip').fillna(0)
    d = d.join(dup_df_dict['ascii'], how='outer', lsuffix='_lower').fillna(0)
    d = d.join(dup_df_dict['order'],
               how='outer', lsuffix='_ascii', rsuffix='_order').fillna(0)
    del df['kw_representative']
    d = d.join(df)
    if not output_nondup_df:
        return d
    else:
        named_tuple = collections.namedtuple('dup_stats',
                                             ['dup_diag_df', 'non_dup_df'])
        return named_tuple(dup_diag_df=d,
                           non_dup_df=df.ix[list(set(df.index) - set(d.index))])
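# Illustrative usage (not part of the original module); this depends on the
# project-local daf_dup_diag, aw_manip, pstr_trans and oc helpers, so it is
# only a sketch of the intended call pattern:
#
#   >>> df = pd.DataFrame({'keyword': [u'[Hotel Paris]', u'hotel paris',
#   ...                                u'paris hotel'],
#   ...                    'match_type': ['Broad', 'Broad', 'Broad']})
#   >>> dup_diag = kw_dup_diagnosis(df)  # rows whose normalized forms collide
#   >>> stats = kw_dup_diagnosis(df, output_nondup_df=True)
#   >>> stats.dup_diag_df; stats.non_dup_df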
def add_col(df, colname=None, overwrite=True, **kwargs):
    """
    Adds one or several requested columns (colname) to df, usually computed
    from other columns of the df. Details of what each colname computes are
    in the code below.

    The overwrite flag (defaulted to True) specifies whether an already
    existing column should be recomputed (True) or left as is (False).
    """
    if colname is None:
        print "colname choices: "
        print "%s" % str([
            'pos_impressions', 'pos', 'day_of_week_num', 'day_of_week',
            'week_of_year', 'cvr', 'ctr', 'cpc', 'spc', 'kw_lower',
            'kw_lower_ascii', 'kw_lower_ascii_ordered', 'destination'
        ])
        return None
    df_columns = df.columns
    if isinstance(colname, basestring):
        if overwrite is False and has_columns(df, colname):
            return df  # just return the df as is
        else:
            if colname == 'pos_impressions':
                df['pos_impressions'] = df['avg_position'] * df['impressions']
            elif colname == 'pos':
                # impression-weighted average position
                df['pos'] = df['pos_impressions'] / df['impressions']
            elif colname == 'day_of_week_num':
                if 'day' in df.columns:
                    df['day_of_week_num'] = df['day'].apply(pd.datetime.weekday)
                elif 'date' in df.columns:
                    df['day_of_week_num'] = df['date'].apply(pd.datetime.weekday)
                else:
                    # no date column: derive the number from a day_of_week
                    # column (or index level) instead
                    days = [u'Monday', u'Tuesday', u'Wednesday', u'Thursday',
                            u'Friday', u'Saturday', u'Sunday']
                    key_col = 'day_of_week'
                    day_2_num = pd.DataFrame({
                        'day_of_week': days,
                        'day_of_week_num': np.arange(len(days))
                    })
                    if key_col in df.index.names:
                        index_names = df.index.names
                        df = df.reset_index(drop=False)
                        df = df.merge(day_2_num)
                        if kwargs.get('rm_key_cols', False):
                            df.drop(key_col, axis=1, inplace=True)
                            index_names = list(
                                set(index_names).difference([key_col]))
                        df = df.set_index(index_names)
                    else:
                        df = df.merge(day_2_num)
                        if kwargs.get('rm_key_cols', False):
                            df.drop(key_col, axis=1, inplace=True)
            elif colname == 'day_of_week':
                # derive the day name from a day_of_week_num column
                # (or index level)
                days = [u'Monday', u'Tuesday', u'Wednesday', u'Thursday',
                        u'Friday', u'Saturday', u'Sunday']
                key_col = 'day_of_week_num'
                day_2_num = pd.DataFrame({
                    'day_of_week': days,
                    'day_of_week_num': np.arange(len(days))
                })
                if key_col in df.index.names:
                    index_names = df.index.names
                    df = df.reset_index(drop=False)
                    df = df.merge(day_2_num)
                    if kwargs.get('rm_key_cols', False):
                        df.drop(key_col, axis=1, inplace=True)
                        index_names = list(
                            set(index_names).difference([key_col]))
                    df = df.set_index(index_names)
                else:
                    df = df.merge(day_2_num)
                    if kwargs.get('rm_key_cols', False):
                        df.drop(key_col, axis=1, inplace=True)
            elif colname == 'week_of_year':
                date_col = kwargs.get('date_col', None)
                if date_col is None:
                    date_col = get_first_item_contained_in_intersection_of(
                        ['day', 'date'], df.columns, None)
                if date_col is None:
                    raise KeyError(
                        "Couldn't find a date_col to work with: "
                        "tell me what it is")
                if isinstance(date_col, basestring):
                    date_col = df[date_col]
                try:  # datetime-like values
                    df['week_of_year'] = map(lambda t: t.isocalendar()[1],
                                             date_col)
                except AttributeError:  # pandas Timestamp values
                    df['week_of_year'] = map(lambda t: t.weekofyear, date_col)
            elif colname == 'cvr':
                df['cvr'] = df['conversions'] / df['clicks']
            elif colname == 'ctr':
                df['ctr'] = df['clicks'] / df['impressions']
            elif colname == 'cpc':
                df['cpc'] = df['cost'] / df['clicks']
            elif colname == 'spc':
                # smoothed conversions-per-click: shrink towards the global
                # mean cvr, with a prior worth prior_clicks clicks
                mean_cvr = sum(df['conversions']) / sum(df['clicks'])
                prior_clicks = kwargs.get('prior_clicks', 300)
                df['spc'] = (df['conversions'] + mean_cvr * prior_clicks) \
                    / (df['clicks'] + prior_clicks)
            elif colname == 'kw_lower':
                assert_dependencies(df, 'keyword', "to get {}".format(colname))
                df[colname] = lower_series(df['keyword'])
            elif colname == 'kw_lower_ascii':
                assert_dependencies(df, 'keyword', "to get {}".format(colname))
                df[colname] = pstr_trans.toascii(lower_series(df['keyword']))
            elif colname == 'kw_lower_ascii_ordered':
                assert_dependencies(df, 'keyword', "to get {}".format(colname))
                df[colname] = [
                    ' '.join(np.sort(x.split(' ')))
                    for x in pstr_trans.toascii(lower_series(df['keyword']))
                ]
            elif colname == 'destination':
                if 'ad_group' in df_columns:
                    # ad_group is expected to be an "x|y|z" triad; the
                    # destination is "z|x" in kw_str form
                    ag_triad = map(lambda x: x.split('|'), df['ad_group'])
                    ag_triad_0 = kw_str([x[0] for x in ag_triad])
                    ag_triad_2 = kw_str([x[2] for x in ag_triad])
                    df[colname] = map(lambda x2, x0: '|'.join([x2, x0]),
                                      ag_triad_2, ag_triad_0)
                elif 'campaign' in df_columns:
                    df[colname] = kw_str(df['campaign'])
                else:
                    raise ValueError(
                        'You need ad_group or campaign to get a destination')
            else:
                raise RuntimeError("unknown colname requested")
        # remove columns?
        if 'remove_cols' in kwargs.keys():
            # drop only those requested columns that are actually present
            df.drop(list(set(kwargs['remove_cols']).intersection(df.columns)),
                    axis=1, inplace=True)
    else:
        try:
            for c in colname:
                df = add_col(df, c, overwrite=overwrite, **kwargs)
        except TypeError:
            raise RuntimeError("colname must be a string or a list of strings")
    return df
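# Illustrative usage (not part of the original module); column names follow
# what add_col() expects, and floats are used to avoid Python 2 integer
# division:
#
#   >>> df = pd.DataFrame({'impressions': [100., 200.],
#   ...                    'clicks': [10., 30.],
#   ...                    'avg_position': [1.5, 2.0]})
#   >>> df = add_col(df, 'ctr')  # adds clicks / impressions
#   >>> df = add_col(df, ['pos_impressions', 'pos'])  # list form also works
#   >>> add_col(df)  # with no colname, prints the available choices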